natural-pdf 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +3 -0
- natural_pdf/analyzers/layout/base.py +1 -5
- natural_pdf/analyzers/layout/gemini.py +61 -51
- natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
- natural_pdf/analyzers/layout/layout_manager.py +26 -84
- natural_pdf/analyzers/layout/layout_options.py +7 -0
- natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
- natural_pdf/analyzers/layout/surya.py +46 -123
- natural_pdf/analyzers/layout/tatr.py +51 -4
- natural_pdf/analyzers/text_structure.py +3 -5
- natural_pdf/analyzers/utils.py +3 -3
- natural_pdf/classification/manager.py +422 -0
- natural_pdf/classification/mixin.py +163 -0
- natural_pdf/classification/results.py +80 -0
- natural_pdf/collections/mixins.py +111 -0
- natural_pdf/collections/pdf_collection.py +434 -15
- natural_pdf/core/element_manager.py +83 -0
- natural_pdf/core/highlighting_service.py +13 -22
- natural_pdf/core/page.py +578 -93
- natural_pdf/core/pdf.py +912 -460
- natural_pdf/elements/base.py +134 -40
- natural_pdf/elements/collections.py +712 -109
- natural_pdf/elements/region.py +722 -69
- natural_pdf/elements/text.py +4 -1
- natural_pdf/export/mixin.py +137 -0
- natural_pdf/exporters/base.py +3 -3
- natural_pdf/exporters/paddleocr.py +5 -4
- natural_pdf/extraction/manager.py +135 -0
- natural_pdf/extraction/mixin.py +279 -0
- natural_pdf/extraction/result.py +23 -0
- natural_pdf/ocr/__init__.py +5 -5
- natural_pdf/ocr/engine_doctr.py +346 -0
- natural_pdf/ocr/engine_easyocr.py +6 -3
- natural_pdf/ocr/ocr_factory.py +24 -4
- natural_pdf/ocr/ocr_manager.py +122 -26
- natural_pdf/ocr/ocr_options.py +94 -11
- natural_pdf/ocr/utils.py +19 -6
- natural_pdf/qa/document_qa.py +0 -4
- natural_pdf/search/__init__.py +20 -34
- natural_pdf/search/haystack_search_service.py +309 -265
- natural_pdf/search/haystack_utils.py +99 -75
- natural_pdf/search/search_service_protocol.py +11 -12
- natural_pdf/selectors/parser.py +431 -230
- natural_pdf/utils/debug.py +3 -3
- natural_pdf/utils/identifiers.py +1 -1
- natural_pdf/utils/locks.py +8 -0
- natural_pdf/utils/packaging.py +8 -6
- natural_pdf/utils/text_extraction.py +60 -1
- natural_pdf/utils/tqdm_utils.py +51 -0
- natural_pdf/utils/visualization.py +18 -0
- natural_pdf/widgets/viewer.py +4 -25
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +17 -3
- natural_pdf-0.1.9.dist-info/RECORD +80 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
- docs/api/index.md +0 -386
- docs/assets/favicon.png +0 -3
- docs/assets/favicon.svg +0 -3
- docs/assets/javascripts/custom.js +0 -17
- docs/assets/logo.svg +0 -3
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +0 -17
- docs/assets/social-preview.svg +0 -17
- docs/assets/stylesheets/custom.css +0 -65
- docs/document-qa/index.ipynb +0 -435
- docs/document-qa/index.md +0 -79
- docs/element-selection/index.ipynb +0 -915
- docs/element-selection/index.md +0 -229
- docs/finetuning/index.md +0 -176
- docs/index.md +0 -170
- docs/installation/index.md +0 -69
- docs/interactive-widget/index.ipynb +0 -962
- docs/interactive-widget/index.md +0 -12
- docs/layout-analysis/index.ipynb +0 -818
- docs/layout-analysis/index.md +0 -185
- docs/ocr/index.md +0 -209
- docs/pdf-navigation/index.ipynb +0 -314
- docs/pdf-navigation/index.md +0 -97
- docs/regions/index.ipynb +0 -816
- docs/regions/index.md +0 -294
- docs/tables/index.ipynb +0 -658
- docs/tables/index.md +0 -144
- docs/text-analysis/index.ipynb +0 -370
- docs/text-analysis/index.md +0 -105
- docs/text-extraction/index.ipynb +0 -1478
- docs/text-extraction/index.md +0 -292
- docs/tutorials/01-loading-and-extraction.ipynb +0 -194
- docs/tutorials/01-loading-and-extraction.md +0 -95
- docs/tutorials/02-finding-elements.ipynb +0 -340
- docs/tutorials/02-finding-elements.md +0 -149
- docs/tutorials/03-extracting-blocks.ipynb +0 -147
- docs/tutorials/03-extracting-blocks.md +0 -48
- docs/tutorials/04-table-extraction.ipynb +0 -114
- docs/tutorials/04-table-extraction.md +0 -50
- docs/tutorials/05-excluding-content.ipynb +0 -270
- docs/tutorials/05-excluding-content.md +0 -109
- docs/tutorials/06-document-qa.ipynb +0 -332
- docs/tutorials/06-document-qa.md +0 -91
- docs/tutorials/07-layout-analysis.ipynb +0 -288
- docs/tutorials/07-layout-analysis.md +0 -66
- docs/tutorials/07-working-with-regions.ipynb +0 -413
- docs/tutorials/07-working-with-regions.md +0 -151
- docs/tutorials/08-spatial-navigation.ipynb +0 -508
- docs/tutorials/08-spatial-navigation.md +0 -190
- docs/tutorials/09-section-extraction.ipynb +0 -2434
- docs/tutorials/09-section-extraction.md +0 -256
- docs/tutorials/10-form-field-extraction.ipynb +0 -512
- docs/tutorials/10-form-field-extraction.md +0 -201
- docs/tutorials/11-enhanced-table-processing.ipynb +0 -54
- docs/tutorials/11-enhanced-table-processing.md +0 -9
- docs/tutorials/12-ocr-integration.ipynb +0 -604
- docs/tutorials/12-ocr-integration.md +0 -175
- docs/tutorials/13-semantic-search.ipynb +0 -1328
- docs/tutorials/13-semantic-search.md +0 -77
- docs/visual-debugging/index.ipynb +0 -2970
- docs/visual-debugging/index.md +0 -157
- docs/visual-debugging/region.png +0 -0
- natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -415
- natural_pdf/templates/spa/css/style.css +0 -334
- natural_pdf/templates/spa/index.html +0 -31
- natural_pdf/templates/spa/js/app.js +0 -472
- natural_pdf/templates/spa/words.txt +0 -235976
- natural_pdf/widgets/frontend/viewer.js +0 -88
- natural_pdf-0.1.7.dist-info/RECORD +0 -145
- notebooks/Examples.ipynb +0 -1293
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +0 -543
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0
natural_pdf/utils/debug.py
CHANGED
@@ -3,13 +3,13 @@ OCR debug utilities for natural-pdf.
|
|
3
3
|
"""
|
4
4
|
|
5
5
|
import base64
|
6
|
+
import importlib.resources
|
7
|
+
import importlib.util
|
6
8
|
import io
|
7
9
|
import json
|
8
10
|
import os
|
9
|
-
import importlib.util
|
10
|
-
import importlib.resources
|
11
11
|
import webbrowser
|
12
|
-
from typing import Dict, List,
|
12
|
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
13
13
|
|
14
14
|
from PIL import Image
|
15
15
|
|
natural_pdf/utils/identifiers.py
CHANGED
natural_pdf/utils/packaging.py
CHANGED
@@ -2,23 +2,25 @@
|
|
2
2
|
Utilities for packaging data for external processes, like correction tasks.
|
3
3
|
"""
|
4
4
|
|
5
|
-
import os
|
6
5
|
import base64
|
7
6
|
import io
|
8
7
|
import json
|
9
|
-
import zipfile
|
10
|
-
import tempfile
|
11
8
|
import logging
|
9
|
+
import os
|
12
10
|
import shutil
|
13
|
-
|
11
|
+
import tempfile
|
12
|
+
import zipfile
|
13
|
+
from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Union
|
14
|
+
|
14
15
|
from tqdm import tqdm
|
16
|
+
|
15
17
|
from natural_pdf.elements.text import TextElement
|
16
18
|
|
17
19
|
# Import the specific PDF/Page types if possible, otherwise use Any
|
18
20
|
if TYPE_CHECKING:
|
19
|
-
from natural_pdf.core.pdf import PDF
|
20
|
-
from natural_pdf.core.page import Page
|
21
21
|
from natural_pdf.collections.pdf_collection import PDFCollection
|
22
|
+
from natural_pdf.core.page import Page
|
23
|
+
from natural_pdf.core.pdf import PDF
|
22
24
|
else:
|
23
25
|
PDF = Any
|
24
26
|
Page = Any
|
@@ -2,7 +2,12 @@
|
|
2
2
|
import logging
|
3
3
|
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
|
4
4
|
|
5
|
-
from pdfplumber.utils.geometry import
|
5
|
+
from pdfplumber.utils.geometry import (
|
6
|
+
cluster_objects,
|
7
|
+
get_bbox_overlap,
|
8
|
+
merge_bboxes,
|
9
|
+
objects_to_bbox,
|
10
|
+
)
|
6
11
|
from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_to_textmap
|
7
12
|
|
8
13
|
if TYPE_CHECKING:
|
@@ -11,6 +16,60 @@ if TYPE_CHECKING:
|
|
11
16
|
logger = logging.getLogger(__name__)
|
12
17
|
|
13
18
|
|
19
|
+
def _get_layout_kwargs(
    layout_context_bbox: Optional[Tuple[float, float, float, float]] = None,
    user_kwargs: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """
    Build the keyword arguments handed to pdfplumber's chars_to_textmap.

    The result is assembled in three layers: fixed defaults, geometry derived
    from the optional context bbox, and finally any user overrides whose keys
    are recognised.

    Args:
        layout_context_bbox: Optional (x0, top, x1, bottom) tuple. When given,
            it supplies the layout width/height, the x/y shift and the
            "layout_bbox" entry.
        user_kwargs: Optional overrides. Only keys found in TEXTMAP_KWARGS or
            WORD_EXTRACTOR_KWARGS, plus the "layout" flag, are applied; any
            other key is dropped with a logged warning.

    Returns:
        A new dict of layout keyword arguments. It always contains a "layout"
        entry, which is True unless the caller overrode it.
    """
    # "layout" is always accepted, so fold it into the permitted set up front.
    permitted = set(TEXTMAP_KWARGS) | set(WORD_EXTRACTOR_KWARGS)
    permitted.add("layout")

    # Baseline defaults.
    resolved: Dict[str, Any] = {
        "x_tolerance": 5,
        "y_tolerance": 5,
        "x_density": 7.25,
        "y_density": 13,
        "mode": "box",
        "min_words_vertical": 1,
        "min_words_horizontal": 1,
    }

    # Geometry taken from the context bbox, when one was supplied.
    if layout_context_bbox:
        left, upper, right, lower = layout_context_bbox
        resolved.update(
            layout_width=right - left,
            layout_height=lower - upper,
            x_shift=left,
            y_shift=upper,
            layout_bbox=layout_context_bbox,
        )

    # User overrides win over everything above, but only for permitted keys.
    for key, value in (user_kwargs or {}).items():
        if key not in permitted:
            logger.warning(f"Ignoring unsupported layout keyword argument: '{key}'")
            continue
        resolved[key] = value

    # The layout flag defaults to True when nobody set it.
    resolved.setdefault("layout", True)
    return resolved
|
71
|
+
|
72
|
+
|
14
73
|
def filter_chars_spatially(
|
15
74
|
char_dicts: List[Dict[str, Any]],
|
16
75
|
exclusion_regions: List["Region"],
|
@@ -0,0 +1,51 @@
|
|
1
|
+
import os
import sys

# Default to standard tqdm
try:
    from tqdm.std import tqdm as selected_tqdm
except ImportError:
    # Basic fallback if even std is missing (though unlikely)
    def selected_tqdm(*args, **kwargs):
        """Pass-through stand-in used when tqdm cannot be imported.

        Returns the iterable unchanged, taken from the first positional
        argument or, failing that, the ``iterable`` keyword. Returns None when
        no iterable was supplied. All other options are ignored.
        """
        # Do not test the iterable for truthiness: an empty list is a valid
        # iterable and must be returned as-is, otherwise a caller looping over
        # the result would hit "TypeError: 'NoneType' object is not iterable".
        if args:
            return args[0]
        return kwargs.get("iterable")


# Try to detect notebook environment
try:
    # Check 1: Are we running in an IPython kernel?
    from IPython import get_ipython

    ipython = get_ipython()
    if ipython and "IPKernelApp" in ipython.config:
        # Check 2: Is it likely a notebook UI (Jupyter Notebook/Lab, VSCode, etc.)?
        # This checks for common indicators. Might not be foolproof.
        if "VSCODE_PID" in os.environ or (
            "ipykernel" in sys.modules and "spyder" not in sys.modules
        ):
            # Check 3: Can we import notebook version?
            try:
                from tqdm.notebook import tqdm as notebook_tqdm

                selected_tqdm = notebook_tqdm  # Use notebook version
            except ImportError:
                pass  # Stick with std if notebook version missing
except ImportError:
    pass  # Stick with std if IPython not available


def get_tqdm():
    """Returns the tqdm class best suited for the detected environment.

    The choice is made once, at import time of this module: the notebook
    variant when an IPython kernel with a notebook-like front end is detected
    and ``tqdm.notebook`` imports, otherwise ``tqdm.std.tqdm``, otherwise the
    pass-through fallback defined above.
    """
    return selected_tqdm


# Example usage (for testing):
if __name__ == "__main__":
    import time

    tqdm_instance = get_tqdm()
    print(f"Using tqdm class: {tqdm_instance}")
    for i in tqdm_instance(range(10), desc="Testing tqdm"):
        time.sleep(0.1)
|
@@ -8,6 +8,7 @@ import math
|
|
8
8
|
import random
|
9
9
|
from typing import Any, Dict, List, Optional, Set, Tuple, Union
|
10
10
|
|
11
|
+
import pypdfium2
|
11
12
|
from PIL import Image, ImageDraw, ImageFont
|
12
13
|
|
13
14
|
# Define a base list of visually distinct colors for highlighting
|
@@ -193,6 +194,7 @@ def merge_images_with_legend(
|
|
193
194
|
return image # Return original image if legend is None or empty
|
194
195
|
|
195
196
|
bg_color = (255, 255, 255, 255) # Always use white for the merged background
|
197
|
+
bg_color = (255, 255, 255, 255) # Always use white for the merged background
|
196
198
|
|
197
199
|
if position == "right":
|
198
200
|
# Create a new image with extra width for the legend
|
@@ -230,3 +232,19 @@ def merge_images_with_legend(
|
|
230
232
|
merged = image
|
231
233
|
|
232
234
|
return merged
|
235
|
+
|
236
|
+
|
237
|
+
def render_plain_page(page, resolution):
    """Render a single page to an RGB PIL image using pypdfium2.

    Args:
        page: Page-like object. The document data is read from
            ``page._page.pdf.stream`` and the page position from ``page.index``.
            (presumably a natural_pdf Page wrapping a pdfplumber page; confirm
            against callers)
        resolution: Target resolution; it is divided by 72 to obtain the
            pypdfium2 render scale.

    Returns:
        The rendered page converted to an "RGB" mode PIL image.
    """
    doc = pypdfium2.PdfDocument(page._page.pdf.stream)
    try:
        pdf_page = doc[page.index]
        try:
            bitmap = pdf_page.render(
                scale=resolution / 72,
            )
            # The return expression is evaluated before the finally clauses
            # run, so the image is produced while page and document are open.
            return bitmap.to_pil().convert("RGB")
        finally:
            # Close the page even if rendering or conversion failed.
            pdf_page.close()
    finally:
        # Close the document last, after the page obtained from it.
        doc.close()
|
natural_pdf/widgets/viewer.py
CHANGED
@@ -3,6 +3,8 @@
|
|
3
3
|
import logging
|
4
4
|
import os
|
5
5
|
|
6
|
+
from natural_pdf.utils.visualization import render_plain_page
|
7
|
+
|
6
8
|
logger = logging.getLogger(__name__)
|
7
9
|
|
8
10
|
# Initialize flag and module/class variables to None
|
@@ -615,31 +617,7 @@ try:
|
|
615
617
|
|
616
618
|
from PIL import Image # Ensure Image is imported
|
617
619
|
|
618
|
-
|
619
|
-
scale = 1.0 # Define scale factor used for rendering
|
620
|
-
try:
|
621
|
-
img_object = page.to_image(resolution=int(72 * scale)) # Call to_image
|
622
|
-
# Check if .original attribute exists, otherwise assume img_object is the PIL Image
|
623
|
-
if hasattr(img_object, "original") and isinstance(img_object.original, Image.Image):
|
624
|
-
img = img_object.original
|
625
|
-
elif isinstance(img_object, Image.Image):
|
626
|
-
img = img_object
|
627
|
-
else:
|
628
|
-
# If it's neither, maybe it's the raw bytes? Try opening it.
|
629
|
-
try:
|
630
|
-
img = Image.open(BytesIO(img_object)).convert("RGB")
|
631
|
-
except Exception:
|
632
|
-
raise TypeError(
|
633
|
-
f"page.to_image() returned unexpected type: {type(img_object)}"
|
634
|
-
)
|
635
|
-
logger.debug(f"Successfully rendered page {page.index} using to_image()")
|
636
|
-
except Exception as render_err:
|
637
|
-
logger.error(
|
638
|
-
f"Error rendering page {page.index} image for widget: {render_err}",
|
639
|
-
exc_info=True,
|
640
|
-
)
|
641
|
-
# Return None or raise the error? Let's raise for now to make it clear.
|
642
|
-
raise ValueError(f"Failed to render page image: {render_err}") from render_err
|
620
|
+
img = render_plain_page(page, resolution=72)
|
643
621
|
|
644
622
|
buffered = BytesIO()
|
645
623
|
img.save(buffered, format="PNG")
|
@@ -687,6 +665,7 @@ try:
|
|
687
665
|
original_y1 = element.bottom
|
688
666
|
width = element.width
|
689
667
|
height = element.height
|
668
|
+
scale = 1.0
|
690
669
|
|
691
670
|
# Base element dict with required info
|
692
671
|
elem_dict = {
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: natural-pdf
|
3
|
-
Version: 0.1.7
|
3
|
+
Version: 0.1.9
|
4
4
|
Summary: A more intuitive interface for working with PDFs
|
5
5
|
Author-email: Jonathan Soma <jonathan.soma@gmail.com>
|
6
6
|
License-Expression: MIT
|
@@ -17,11 +17,13 @@ Requires-Dist: colour
|
|
17
17
|
Requires-Dist: numpy
|
18
18
|
Requires-Dist: urllib3
|
19
19
|
Requires-Dist: tqdm
|
20
|
+
Requires-Dist: pydantic
|
20
21
|
Provides-Extra: interactive
|
21
22
|
Requires-Dist: ipywidgets<9.0.0,>=7.0.0; extra == "interactive"
|
22
23
|
Provides-Extra: haystack
|
23
24
|
Requires-Dist: haystack-ai; extra == "haystack"
|
24
|
-
Requires-Dist:
|
25
|
+
Requires-Dist: lancedb-haystack; extra == "haystack"
|
26
|
+
Requires-Dist: lancedb; extra == "haystack"
|
25
27
|
Requires-Dist: sentence-transformers; extra == "haystack"
|
26
28
|
Requires-Dist: natural-pdf[core-ml]; extra == "haystack"
|
27
29
|
Provides-Extra: easyocr
|
@@ -36,6 +38,9 @@ Requires-Dist: natural-pdf[core-ml]; extra == "layout-yolo"
|
|
36
38
|
Provides-Extra: surya
|
37
39
|
Requires-Dist: surya-ocr; extra == "surya"
|
38
40
|
Requires-Dist: natural-pdf[core-ml]; extra == "surya"
|
41
|
+
Provides-Extra: doctr
|
42
|
+
Requires-Dist: python-doctr[torch]; extra == "doctr"
|
43
|
+
Requires-Dist: natural-pdf[core-ml]; extra == "doctr"
|
39
44
|
Provides-Extra: qa
|
40
45
|
Requires-Dist: natural-pdf[core-ml]; extra == "qa"
|
41
46
|
Provides-Extra: docling
|
@@ -43,7 +48,10 @@ Requires-Dist: docling; extra == "docling"
|
|
43
48
|
Requires-Dist: natural-pdf[core-ml]; extra == "docling"
|
44
49
|
Provides-Extra: llm
|
45
50
|
Requires-Dist: openai>=1.0; extra == "llm"
|
46
|
-
|
51
|
+
Provides-Extra: classification
|
52
|
+
Requires-Dist: sentence-transformers; extra == "classification"
|
53
|
+
Requires-Dist: timm; extra == "classification"
|
54
|
+
Requires-Dist: natural-pdf[core-ml]; extra == "classification"
|
47
55
|
Provides-Extra: test
|
48
56
|
Requires-Dist: pytest; extra == "test"
|
49
57
|
Provides-Extra: dev
|
@@ -59,6 +67,9 @@ Requires-Dist: pipdeptree; extra == "dev"
|
|
59
67
|
Requires-Dist: nbformat; extra == "dev"
|
60
68
|
Requires-Dist: jupytext; extra == "dev"
|
61
69
|
Requires-Dist: nbclient; extra == "dev"
|
70
|
+
Provides-Extra: deskew
|
71
|
+
Requires-Dist: deskew>=1.5; extra == "deskew"
|
72
|
+
Requires-Dist: img2pdf; extra == "deskew"
|
62
73
|
Provides-Extra: all
|
63
74
|
Requires-Dist: natural-pdf[interactive]; extra == "all"
|
64
75
|
Requires-Dist: natural-pdf[haystack]; extra == "all"
|
@@ -66,10 +77,13 @@ Requires-Dist: natural-pdf[easyocr]; extra == "all"
|
|
66
77
|
Requires-Dist: natural-pdf[paddle]; extra == "all"
|
67
78
|
Requires-Dist: natural-pdf[layout_yolo]; extra == "all"
|
68
79
|
Requires-Dist: natural-pdf[surya]; extra == "all"
|
80
|
+
Requires-Dist: natural-pdf[doctr]; extra == "all"
|
69
81
|
Requires-Dist: natural-pdf[qa]; extra == "all"
|
70
82
|
Requires-Dist: natural-pdf[ocr-export]; extra == "all"
|
71
83
|
Requires-Dist: natural-pdf[docling]; extra == "all"
|
72
84
|
Requires-Dist: natural-pdf[llm]; extra == "all"
|
85
|
+
Requires-Dist: natural-pdf[classification]; extra == "all"
|
86
|
+
Requires-Dist: natural-pdf[deskew]; extra == "all"
|
73
87
|
Requires-Dist: natural-pdf[test]; extra == "all"
|
74
88
|
Provides-Extra: core-ml
|
75
89
|
Requires-Dist: torch; extra == "core-ml"
|
@@ -0,0 +1,80 @@
|
|
1
|
+
natural_pdf/__init__.py,sha256=LBrQcFOGooaUsTSAk6zrPCQqu0IM-ClvJLasexEk64k,2728
|
2
|
+
natural_pdf/analyzers/__init__.py,sha256=dIXjsMqoxKmd9OOnSBzn12wvdIz7D7YNQRAnXslpJSM,142
|
3
|
+
natural_pdf/analyzers/text_options.py,sha256=nE2E1pp4psDPpxmtarvNtEQsgozPkyFRjv0TVP2HTyU,2865
|
4
|
+
natural_pdf/analyzers/text_structure.py,sha256=Uhxc7aYB1jddkiwRTEPOg_Te2HfOua4z_OtgP1m3org,12794
|
5
|
+
natural_pdf/analyzers/utils.py,sha256=PYbzJzSAHZ7JsMes84WIrSbA0zkjJGs0CLvIeINsf_k,2100
|
6
|
+
natural_pdf/analyzers/layout/__init__.py,sha256=oq1uJ5UkGGMbBKGirV1aRKK3hxAUyjTLywYkPCQH1f0,33
|
7
|
+
natural_pdf/analyzers/layout/base.py,sha256=bYawhmc_0xqKG-xbxUSiazIU1om-aBox5Jh8qDqv-eM,6451
|
8
|
+
natural_pdf/analyzers/layout/docling.py,sha256=4BJYyNVR6VegZGxyisvNIBBRvVk6YKPyDVs7ZdVfzEU,12676
|
9
|
+
natural_pdf/analyzers/layout/gemini.py,sha256=iuq-zZYkTS7fdAjD3ULRhqYTP9Ky2NgVHaXSLppDidw,11751
|
10
|
+
natural_pdf/analyzers/layout/layout_analyzer.py,sha256=n327Zjuf7aSzKQKChPHeiCVHinzeDGaWNyKiwQ-DkJk,15571
|
11
|
+
natural_pdf/analyzers/layout/layout_manager.py,sha256=RiVq6gUA8t9OLj-HojdzQkJtabM32iBWEBoLtS7_TjY,8115
|
12
|
+
natural_pdf/analyzers/layout/layout_options.py,sha256=Jsm4MfD_vedXvS7NCpVmuIRsIuyNyKOjvdgoRYOKZpI,4133
|
13
|
+
natural_pdf/analyzers/layout/paddle.py,sha256=gTI9ZqNd5-t4H5IByGfL32WgcE6JrdchW6jRiGI6ulM,13375
|
14
|
+
natural_pdf/analyzers/layout/pdfplumber_table_finder.py,sha256=Tk0Q7wv7nGYPo69lh6RoezjdepTnMl90SaNIrP29Pwc,5902
|
15
|
+
natural_pdf/analyzers/layout/surya.py,sha256=4RdnhRxSS3i3Ns5mFhOA9-P0xd7Ms19uZuKvUGQfEBI,9789
|
16
|
+
natural_pdf/analyzers/layout/tatr.py,sha256=cVr0ZyhY2mNLAKZ4DGMm-b7XNJpILKh8x8ZpyDeUhLk,15032
|
17
|
+
natural_pdf/analyzers/layout/yolo.py,sha256=ANo2U4EZgeN2eYKM1bZIuysiuJLgwl4JeQchrRxOKwA,8388
|
18
|
+
natural_pdf/classification/manager.py,sha256=CvZd3-lN3fEhcaLXr8gYfrdBGoBgzkIeE14EqjrOAzU,17730
|
19
|
+
natural_pdf/classification/mixin.py,sha256=llari9AIMNGy9sTaR7y1g5vtVNUwuCutbKnjbJRMYx4,6903
|
20
|
+
natural_pdf/classification/results.py,sha256=Ia26BQxObL5sURpFmg66bfjFPCxjcO_jeP2G-S9wRgo,2289
|
21
|
+
natural_pdf/collections/mixins.py,sha256=ufetdzHmd2_WLGBPW4eBQrzZTFpjXyVsVwBquIE47zw,4476
|
22
|
+
natural_pdf/collections/pdf_collection.py,sha256=JnsJugE-vxYsW1ZJWmMlVv_jbyG37X-9rZK1RQyKWAY,30020
|
23
|
+
natural_pdf/core/__init__.py,sha256=QC8H4M3KbXwMFiQORZ0pdPlzx1Ix6oKKQSS7Ib2KEaA,38
|
24
|
+
natural_pdf/core/element_manager.py,sha256=knRN6qXxV-6KZCj2GUOyiqRi83DjJzL77TmKGeiD08Y,25144
|
25
|
+
natural_pdf/core/highlighting_service.py,sha256=wINdRxq63_CYYA81EwuCRqhNKimn0dNKyoKWuzkirc0,31959
|
26
|
+
natural_pdf/core/page.py,sha256=icJLu6jRbkD3iOE8r60XPkQZ8FN3ZcKo5TT5MVGkGl0,105122
|
27
|
+
natural_pdf/core/pdf.py,sha256=Vw-L5149wO6RSfvb9sAfPDLqd9M1TdYoPHNEePh65y8,61201
|
28
|
+
natural_pdf/elements/__init__.py,sha256=S8XeiNWJ1WcgnyYKdYV1yxQlAxCCO3FfITT8MQwNbyk,41
|
29
|
+
natural_pdf/elements/base.py,sha256=7vVCPQyEHifh4LyBuv0kLTqr_gNbbEMc4SoiJmLfEUQ,37585
|
30
|
+
natural_pdf/elements/collections.py,sha256=YRaJxNbJrBjgwzwuSoOtEotOKh6RaTi7NRCqKiGl514,92955
|
31
|
+
natural_pdf/elements/line.py,sha256=7cow3xMUKhAj7zoQz7OaB1eIH2_a8B__LB7iGJ4Mb0o,4612
|
32
|
+
natural_pdf/elements/rect.py,sha256=kiVa3e377ZnqIOXc89d9ZSY4EcmDxtccdtUw-HOQzpw,3796
|
33
|
+
natural_pdf/elements/region.py,sha256=LfyB_9DCw5Tzn_G9xsjFz2FfKBOHRqGIND4DQWoA7KM,97324
|
34
|
+
natural_pdf/elements/text.py,sha256=13HvVZGinj2Vm_fFCAnqi7hohtoKvnpCp3VCfkpeAbc,11146
|
35
|
+
natural_pdf/export/mixin.py,sha256=L1q3MIEFWuvie4j4_EmW7GT3NerbZ1as0XMUoqTS7gM,5083
|
36
|
+
natural_pdf/exporters/__init__.py,sha256=7MnvRLLQdwtg-ULu-8uK8C84GsKiJamyhRw_GgWhw7k,151
|
37
|
+
natural_pdf/exporters/base.py,sha256=XhR1xlkHOh7suOuX7mWbsj1h2o1pZNet-OAS5YCJyeI,2115
|
38
|
+
natural_pdf/exporters/paddleocr.py,sha256=BYpdtJI7S8rBkI2dkRESx2epVAZOTfzqU-rjJnUQ5jQ,16249
|
39
|
+
natural_pdf/exporters/searchable_pdf.py,sha256=qsaPsnbOOaZHA_aplfZbwQnBoK9KghWm-wzbyRRomeY,16859
|
40
|
+
natural_pdf/extraction/manager.py,sha256=mUBbfgLG5Pl31wmajXwyipdEJb_dZ5I-y8GnWw7IzGo,4969
|
41
|
+
natural_pdf/extraction/mixin.py,sha256=eKbr70VibpbtfjvCE80lTFuYHzq_BoVtOHjznL_GMRA,11719
|
42
|
+
natural_pdf/extraction/result.py,sha256=c1vLguCR6l95cvg-BJJmZvL_MPg2McJaczge55bKZMg,934
|
43
|
+
natural_pdf/ocr/__init__.py,sha256=VY8hhvDPf7Gh2lB-d2QRmghLLyTy6ydxlgo1cS4dOSk,2482
|
44
|
+
natural_pdf/ocr/engine.py,sha256=ZBC1tZNM5EDbGDJJmZI9mNHr4nCMLEZvUFhiJq8GdF4,8741
|
45
|
+
natural_pdf/ocr/engine_doctr.py,sha256=519WpvSHgwP6Hv24tci_YHFX7XPlaxOnlREN_YG-Yys,16331
|
46
|
+
natural_pdf/ocr/engine_easyocr.py,sha256=9TbxJjmhWFrzM8mcNnZjoRtIDr6gwpuwKm4-Zfub2-8,9281
|
47
|
+
natural_pdf/ocr/engine_paddle.py,sha256=2nIrvLBBAiZG1BxVo3eFVJulA6YGoOTXw_RN98p_BUk,6184
|
48
|
+
natural_pdf/ocr/engine_surya.py,sha256=iySjG-Dahgh0cLICfbMtOcwUpRFcZjo-5Ed5Zwz-o5Y,4805
|
49
|
+
natural_pdf/ocr/ocr_factory.py,sha256=gBFXdFs7E4aCynHz06sQsAhaO3s8yhgoFgN5nyxtg9c,5221
|
50
|
+
natural_pdf/ocr/ocr_manager.py,sha256=f0q68ynGYVPkF4D3WnufxmHWD5R1jW5Z_1czTEi9JVU,13931
|
51
|
+
natural_pdf/ocr/ocr_options.py,sha256=ZvtnFn1kPkFEoWveQ13uy6B-ofquP0gHEi4tBHrjqCE,6438
|
52
|
+
natural_pdf/ocr/utils.py,sha256=OxuHwDbHWj6setvnC0QYwMHrAjxGkhmLzWHpMqqGupA,4397
|
53
|
+
natural_pdf/qa/__init__.py,sha256=Pjo62JTnUNEjGNsC437mvsS5KQ5m7X_BibGvavR9AW0,108
|
54
|
+
natural_pdf/qa/document_qa.py,sha256=Jw4yyq3Vifn57D0ANmOfUlZeG8CJjBkItZBV-8ZAmos,15111
|
55
|
+
natural_pdf/search/__init__.py,sha256=gdGlW3kTCw87iXMwcIesbLkUsnv5UKJmF-_1ZMR0pfQ,3339
|
56
|
+
natural_pdf/search/haystack_search_service.py,sha256=UHr2UWNBetG3MZ1n_1LnV9oUe5fC-rY9p-V0j00JjQM,30339
|
57
|
+
natural_pdf/search/haystack_utils.py,sha256=6Hv5DeLSF4AVDrB_aFJZGB3XpSCLQ45dXLKEd4yG2tU,18978
|
58
|
+
natural_pdf/search/search_options.py,sha256=sq_e8_jSROicD94b_xtDtLnjEr_Zsy4icjzPkK0a8QA,3566
|
59
|
+
natural_pdf/search/search_service_protocol.py,sha256=Dl-Q-CrutkhZwI69scbW9EWPeYM63qxB60_EA7YqIYo,6699
|
60
|
+
natural_pdf/search/searchable_mixin.py,sha256=M2a6FaFVM0vcfh7FgjDH6BLhS-7ggeVpcfft4OOBDxY,26390
|
61
|
+
natural_pdf/selectors/__init__.py,sha256=oZGeqSv53EqmIZOhcnawuaGGlRg1h79vArXuZCWKm4A,123
|
62
|
+
natural_pdf/selectors/parser.py,sha256=oI3ezkB6sIyrq_nLJrbaBaBZktXwEp_HG_gKQlVSVcs,24447
|
63
|
+
natural_pdf/templates/__init__.py,sha256=jYBxzfi73vew0f6yhIh1MlRxw4F_TVN2hKQR0YXOFe0,20
|
64
|
+
natural_pdf/utils/__init__.py,sha256=s3M8FggaK1P3EBYn6R_-HgSDjNc9C73gyKe1hihtNWg,43
|
65
|
+
natural_pdf/utils/debug.py,sha256=RN7H3E6ph-GtxubCW6psW7TO8o2BxcNLiEzByTVR9fk,995
|
66
|
+
natural_pdf/utils/highlighting.py,sha256=EIY6ihVGtUTS_DjWyxpnr_UXpcR4btC1KhSGQ9VUfKg,698
|
67
|
+
natural_pdf/utils/identifiers.py,sha256=P7n6owcubnF8oAMa_UfYtENmIaJQdH_AMC9Jbs2bWXo,1117
|
68
|
+
natural_pdf/utils/locks.py,sha256=7HJqV0VsNcOfISnbw8goCKWP5ck11uSJo6T_x9XIPKI,215
|
69
|
+
natural_pdf/utils/packaging.py,sha256=Jshxp6S1zfcqoZmFhdd7WOpL--b6rBSz-Y9mYqELXIY,21581
|
70
|
+
natural_pdf/utils/reading_order.py,sha256=s3DsYq_3g_1YA07qhd4BGEjeIRTeyGtnwc_hNtSzwBY,7290
|
71
|
+
natural_pdf/utils/text_extraction.py,sha256=z6Jhy11pakYCsEpkvh8ldw6DkUFsYF1hCL9YDmfXWL4,9605
|
72
|
+
natural_pdf/utils/tqdm_utils.py,sha256=wV3RXvqog26eWEFEqjt2LkGnLswmO1GXaVGSqgS7tAY,1601
|
73
|
+
natural_pdf/utils/visualization.py,sha256=30pRWQdsRJh2pSObh-brKVsFgC1n8tHmSrta_UDnVPw,8989
|
74
|
+
natural_pdf/widgets/__init__.py,sha256=O2fSDo604wDAP6UwUkmBq3eT91RSqHwBpAOQXq92S8s,214
|
75
|
+
natural_pdf/widgets/viewer.py,sha256=dC_hlPlosc08gsDc3bdAa8chOKtAoH9QFU6mrGOG9vE,39532
|
76
|
+
natural_pdf-0.1.9.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
|
77
|
+
natural_pdf-0.1.9.dist-info/METADATA,sha256=10GX2Qesem-n8sPem4lls2EEQen4KyJVdcmQf1mt9mI,7400
|
78
|
+
natural_pdf-0.1.9.dist-info/WHEEL,sha256=wXxTzcEDnjrTwFYjLPcsW_7_XihufBwmpiBeiXNBGEA,91
|
79
|
+
natural_pdf-0.1.9.dist-info/top_level.txt,sha256=Cyw1zmNDlUZfb5moU-WUWGprrwH7ln_8LDGdmMHF1xI,17
|
80
|
+
natural_pdf-0.1.9.dist-info/RECORD,,
|