natural-pdf 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +3 -0
- natural_pdf/analyzers/layout/base.py +1 -5
- natural_pdf/analyzers/layout/gemini.py +61 -51
- natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
- natural_pdf/analyzers/layout/layout_manager.py +26 -84
- natural_pdf/analyzers/layout/layout_options.py +7 -0
- natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
- natural_pdf/analyzers/layout/surya.py +46 -123
- natural_pdf/analyzers/layout/tatr.py +51 -4
- natural_pdf/analyzers/text_structure.py +3 -5
- natural_pdf/analyzers/utils.py +3 -3
- natural_pdf/classification/manager.py +422 -0
- natural_pdf/classification/mixin.py +163 -0
- natural_pdf/classification/results.py +80 -0
- natural_pdf/collections/mixins.py +111 -0
- natural_pdf/collections/pdf_collection.py +434 -15
- natural_pdf/core/element_manager.py +83 -0
- natural_pdf/core/highlighting_service.py +13 -22
- natural_pdf/core/page.py +578 -93
- natural_pdf/core/pdf.py +912 -460
- natural_pdf/elements/base.py +134 -40
- natural_pdf/elements/collections.py +712 -109
- natural_pdf/elements/region.py +722 -69
- natural_pdf/elements/text.py +4 -1
- natural_pdf/export/mixin.py +137 -0
- natural_pdf/exporters/base.py +3 -3
- natural_pdf/exporters/paddleocr.py +5 -4
- natural_pdf/extraction/manager.py +135 -0
- natural_pdf/extraction/mixin.py +279 -0
- natural_pdf/extraction/result.py +23 -0
- natural_pdf/ocr/__init__.py +5 -5
- natural_pdf/ocr/engine_doctr.py +346 -0
- natural_pdf/ocr/engine_easyocr.py +6 -3
- natural_pdf/ocr/ocr_factory.py +24 -4
- natural_pdf/ocr/ocr_manager.py +122 -26
- natural_pdf/ocr/ocr_options.py +94 -11
- natural_pdf/ocr/utils.py +19 -6
- natural_pdf/qa/document_qa.py +0 -4
- natural_pdf/search/__init__.py +20 -34
- natural_pdf/search/haystack_search_service.py +309 -265
- natural_pdf/search/haystack_utils.py +99 -75
- natural_pdf/search/search_service_protocol.py +11 -12
- natural_pdf/selectors/parser.py +431 -230
- natural_pdf/utils/debug.py +3 -3
- natural_pdf/utils/identifiers.py +1 -1
- natural_pdf/utils/locks.py +8 -0
- natural_pdf/utils/packaging.py +8 -6
- natural_pdf/utils/text_extraction.py +60 -1
- natural_pdf/utils/tqdm_utils.py +51 -0
- natural_pdf/utils/visualization.py +18 -0
- natural_pdf/widgets/viewer.py +4 -25
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +17 -3
- natural_pdf-0.1.9.dist-info/RECORD +80 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
- docs/api/index.md +0 -386
- docs/assets/favicon.png +0 -3
- docs/assets/favicon.svg +0 -3
- docs/assets/javascripts/custom.js +0 -17
- docs/assets/logo.svg +0 -3
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +0 -17
- docs/assets/social-preview.svg +0 -17
- docs/assets/stylesheets/custom.css +0 -65
- docs/document-qa/index.ipynb +0 -435
- docs/document-qa/index.md +0 -79
- docs/element-selection/index.ipynb +0 -915
- docs/element-selection/index.md +0 -229
- docs/finetuning/index.md +0 -176
- docs/index.md +0 -170
- docs/installation/index.md +0 -69
- docs/interactive-widget/index.ipynb +0 -962
- docs/interactive-widget/index.md +0 -12
- docs/layout-analysis/index.ipynb +0 -818
- docs/layout-analysis/index.md +0 -185
- docs/ocr/index.md +0 -209
- docs/pdf-navigation/index.ipynb +0 -314
- docs/pdf-navigation/index.md +0 -97
- docs/regions/index.ipynb +0 -816
- docs/regions/index.md +0 -294
- docs/tables/index.ipynb +0 -658
- docs/tables/index.md +0 -144
- docs/text-analysis/index.ipynb +0 -370
- docs/text-analysis/index.md +0 -105
- docs/text-extraction/index.ipynb +0 -1478
- docs/text-extraction/index.md +0 -292
- docs/tutorials/01-loading-and-extraction.ipynb +0 -194
- docs/tutorials/01-loading-and-extraction.md +0 -95
- docs/tutorials/02-finding-elements.ipynb +0 -340
- docs/tutorials/02-finding-elements.md +0 -149
- docs/tutorials/03-extracting-blocks.ipynb +0 -147
- docs/tutorials/03-extracting-blocks.md +0 -48
- docs/tutorials/04-table-extraction.ipynb +0 -114
- docs/tutorials/04-table-extraction.md +0 -50
- docs/tutorials/05-excluding-content.ipynb +0 -270
- docs/tutorials/05-excluding-content.md +0 -109
- docs/tutorials/06-document-qa.ipynb +0 -332
- docs/tutorials/06-document-qa.md +0 -91
- docs/tutorials/07-layout-analysis.ipynb +0 -288
- docs/tutorials/07-layout-analysis.md +0 -66
- docs/tutorials/07-working-with-regions.ipynb +0 -413
- docs/tutorials/07-working-with-regions.md +0 -151
- docs/tutorials/08-spatial-navigation.ipynb +0 -508
- docs/tutorials/08-spatial-navigation.md +0 -190
- docs/tutorials/09-section-extraction.ipynb +0 -2434
- docs/tutorials/09-section-extraction.md +0 -256
- docs/tutorials/10-form-field-extraction.ipynb +0 -512
- docs/tutorials/10-form-field-extraction.md +0 -201
- docs/tutorials/11-enhanced-table-processing.ipynb +0 -54
- docs/tutorials/11-enhanced-table-processing.md +0 -9
- docs/tutorials/12-ocr-integration.ipynb +0 -604
- docs/tutorials/12-ocr-integration.md +0 -175
- docs/tutorials/13-semantic-search.ipynb +0 -1328
- docs/tutorials/13-semantic-search.md +0 -77
- docs/visual-debugging/index.ipynb +0 -2970
- docs/visual-debugging/index.md +0 -157
- docs/visual-debugging/region.png +0 -0
- natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -415
- natural_pdf/templates/spa/css/style.css +0 -334
- natural_pdf/templates/spa/index.html +0 -31
- natural_pdf/templates/spa/js/app.js +0 -472
- natural_pdf/templates/spa/words.txt +0 -235976
- natural_pdf/widgets/frontend/viewer.js +0 -88
- natural_pdf-0.1.7.dist-info/RECORD +0 -145
- notebooks/Examples.ipynb +0 -1293
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +0 -543
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0
natural_pdf/ocr/ocr_options.py
CHANGED
@@ -67,9 +67,10 @@ class EasyOCROptions(BaseOCROptions):
|
|
67
67
|
class PaddleOCROptions(BaseOCROptions):
|
68
68
|
"""Specific options for the PaddleOCR engine."""
|
69
69
|
|
70
|
-
|
70
|
+
# General
|
71
71
|
use_gpu: Optional[bool] = None
|
72
|
-
gpu_mem: int =
|
72
|
+
gpu_mem: int = 8000 # Default from Paddle documentation
|
73
|
+
gpu_mem: int = 8000 # Default from Paddle documentation
|
73
74
|
ir_optim: bool = True
|
74
75
|
use_tensorrt: bool = False
|
75
76
|
min_subgraph_size: int = 15
|
@@ -77,19 +78,73 @@ class PaddleOCROptions(BaseOCROptions):
|
|
77
78
|
enable_mkldnn: bool = False
|
78
79
|
cpu_threads: int = 10
|
79
80
|
use_fp16: bool = False
|
81
|
+
show_log: bool = False
|
82
|
+
use_onnx: bool = False
|
83
|
+
use_zero_copy_run: bool = False
|
84
|
+
|
85
|
+
# Detection
|
86
|
+
det: bool = True
|
87
|
+
det_algorithm: str = "DB"
|
88
|
+
show_log: bool = False
|
89
|
+
use_onnx: bool = False
|
90
|
+
use_zero_copy_run: bool = False
|
91
|
+
|
92
|
+
# Detection
|
93
|
+
det: bool = True
|
94
|
+
det_algorithm: str = "DB"
|
80
95
|
det_model_dir: Optional[str] = None
|
96
|
+
det_limit_side_len: int = 960 # Corresponds to det_max_side_len
|
97
|
+
# DB specific
|
98
|
+
det_db_thresh: float = 0.3
|
99
|
+
det_db_box_thresh: float = 0.5
|
100
|
+
det_db_unclip_ratio: float = 2.0
|
101
|
+
# EAST specific
|
102
|
+
det_east_score_thresh: float = 0.8
|
103
|
+
det_east_cover_thresh: float = 0.1
|
104
|
+
det_east_nms_thresh: float = 0.2
|
105
|
+
|
106
|
+
# Recognition
|
107
|
+
rec: bool = True
|
108
|
+
rec_algorithm: str = "CRNN"
|
109
|
+
det_limit_side_len: int = 960 # Corresponds to det_max_side_len
|
110
|
+
# DB specific
|
111
|
+
det_db_thresh: float = 0.3
|
112
|
+
det_db_box_thresh: float = 0.5
|
113
|
+
det_db_unclip_ratio: float = 2.0
|
114
|
+
# EAST specific
|
115
|
+
det_east_score_thresh: float = 0.8
|
116
|
+
det_east_cover_thresh: float = 0.1
|
117
|
+
det_east_nms_thresh: float = 0.2
|
118
|
+
|
119
|
+
# Recognition
|
120
|
+
rec: bool = True
|
121
|
+
rec_algorithm: str = "CRNN"
|
81
122
|
rec_model_dir: Optional[str] = None
|
82
|
-
|
83
|
-
|
84
|
-
|
123
|
+
rec_image_shape: str = "3, 32, 320" # Kept as string per Paddle examples
|
124
|
+
rec_batch_num: int = 30 # Default from Paddle documentation
|
125
|
+
rec_image_shape: str = "3, 32, 320" # Kept as string per Paddle examples
|
126
|
+
rec_batch_num: int = 30 # Default from Paddle documentation
|
85
127
|
max_text_length: int = 25
|
128
|
+
rec_char_dict_path: Optional[str] = None # Path to char dictionary file
|
129
|
+
rec_char_dict_path: Optional[str] = None # Path to char dictionary file
|
86
130
|
use_space_char: bool = True
|
87
131
|
drop_score: float = 0.5
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
132
|
+
|
133
|
+
# Classification
|
134
|
+
cls: Optional[bool] = None # Often inferred from use_angle_cls
|
135
|
+
use_angle_cls: bool = False # Default from Paddle documentation
|
136
|
+
cls_model_dir: Optional[str] = None
|
137
|
+
cls_image_shape: str = "3, 48, 192" # Kept as string per Paddle examples
|
138
|
+
label_list: List[str] = field(default_factory=lambda: ["0", "180"]) # Default from Paddle doc
|
139
|
+
cls_batch_num: int = 30
|
140
|
+
|
141
|
+
# Classification
|
142
|
+
cls: Optional[bool] = None # Often inferred from use_angle_cls
|
143
|
+
use_angle_cls: bool = False # Default from Paddle documentation
|
144
|
+
cls_model_dir: Optional[str] = None
|
145
|
+
cls_image_shape: str = "3, 48, 192" # Kept as string per Paddle examples
|
146
|
+
label_list: List[str] = field(default_factory=lambda: ["0", "180"]) # Default from Paddle doc
|
147
|
+
cls_batch_num: int = 30
|
93
148
|
|
94
149
|
def __post_init__(self):
|
95
150
|
pass
|
@@ -111,5 +166,33 @@ class SuryaOCROptions(BaseOCROptions):
|
|
111
166
|
pass
|
112
167
|
|
113
168
|
|
169
|
+
# --- Doctr Specific Options ---
|
170
|
+
@dataclass
|
171
|
+
class DoctrOCROptions(BaseOCROptions):
|
172
|
+
"""Specific options for the doctr engine."""
|
173
|
+
|
174
|
+
# OCR predictor options
|
175
|
+
det_arch: str = "db_resnet50"
|
176
|
+
reco_arch: str = "crnn_vgg16_bn"
|
177
|
+
pretrained: bool = True
|
178
|
+
assume_straight_pages: bool = True # Faster if pages are straight
|
179
|
+
export_as_straight_boxes: bool = False # Output straight boxes even if rotated text is detected
|
180
|
+
|
181
|
+
# Additional options from standalone predictors
|
182
|
+
# Detection predictor options
|
183
|
+
symmetric_pad: bool = True
|
184
|
+
preserve_aspect_ratio: bool = True
|
185
|
+
batch_size: int = 1
|
186
|
+
|
187
|
+
# Postprocessing parameters
|
188
|
+
bin_thresh: Optional[float] = None # Default is usually 0.3
|
189
|
+
box_thresh: Optional[float] = None # Default is usually 0.1
|
190
|
+
|
191
|
+
# Options for orientation predictors
|
192
|
+
use_orientation_predictor: bool = False # Whether to use page orientation predictor
|
193
|
+
|
194
|
+
|
114
195
|
# --- Union type for type hinting ---
|
115
|
-
OCROptions = Union[
|
196
|
+
OCROptions = Union[
|
197
|
+
EasyOCROptions, PaddleOCROptions, SuryaOCROptions, DoctrOCROptions, BaseOCROptions
|
198
|
+
]
|
natural_pdf/ocr/utils.py
CHANGED
@@ -1,13 +1,18 @@
|
|
1
|
-
import io
|
2
1
|
import base64
|
2
|
+
import io
|
3
3
|
import logging
|
4
|
-
from typing import TYPE_CHECKING, Callable, Iterable, Optional
|
5
|
-
|
4
|
+
from typing import TYPE_CHECKING, Any, Callable, Iterable, Optional
|
5
|
+
|
6
6
|
from tqdm.auto import tqdm
|
7
7
|
|
8
|
+
from natural_pdf.elements.text import TextElement
|
9
|
+
|
8
10
|
if TYPE_CHECKING:
|
9
11
|
from natural_pdf.elements.base import Element
|
10
12
|
|
13
|
+
# Import the global PDF render lock from dedicated locks module
|
14
|
+
from natural_pdf.utils.locks import pdf_render_lock
|
15
|
+
|
11
16
|
logger = logging.getLogger(__name__)
|
12
17
|
|
13
18
|
|
@@ -72,7 +77,7 @@ def direct_ocr_llm(
|
|
72
77
|
client,
|
73
78
|
model="",
|
74
79
|
resolution=150,
|
75
|
-
prompt="OCR this image. Return only the exact text from the image. Include misspellings, punctuation, etc.",
|
80
|
+
prompt="OCR this image. Return only the exact text from the image. Include misspellings, punctuation, etc. If you cannot see any text, return an empty string.",
|
76
81
|
padding=2,
|
77
82
|
) -> str:
|
78
83
|
"""Convenience method to directly OCR a region of the page."""
|
@@ -83,7 +88,15 @@ def direct_ocr_llm(
|
|
83
88
|
region = element
|
84
89
|
|
85
90
|
buffered = io.BytesIO()
|
86
|
-
|
91
|
+
# Use the global PDF render lock when rendering images
|
92
|
+
with pdf_render_lock:
|
93
|
+
region_img = region.to_image(resolution=resolution, include_highlights=False)
|
94
|
+
|
95
|
+
# Handle cases where image creation might fail (e.g., zero-dim region)
|
96
|
+
if region_img is None:
|
97
|
+
logger.warning(f"Could not generate image for region {region.bbox}, skipping OCR.")
|
98
|
+
return "" # Return empty string if image creation failed
|
99
|
+
|
87
100
|
region_img.save(buffered, format="PNG")
|
88
101
|
base64_image = base64.b64encode(buffered.getvalue()).decode("utf-8")
|
89
102
|
|
@@ -107,7 +120,7 @@ def direct_ocr_llm(
|
|
107
120
|
],
|
108
121
|
)
|
109
122
|
|
110
|
-
corrected = response.choices[0].message.content
|
123
|
+
corrected = response.choices[0].message.content.strip()
|
111
124
|
logger.debug(f"Corrected {region.extract_text()} to {corrected}")
|
112
125
|
|
113
126
|
return corrected
|
natural_pdf/qa/document_qa.py
CHANGED
@@ -58,10 +58,6 @@ class DocumentQA:
|
|
58
58
|
import torch
|
59
59
|
from transformers import pipeline
|
60
60
|
|
61
|
-
# Determine device
|
62
|
-
if device is None:
|
63
|
-
device = "cuda" if torch.cuda.is_available() else "cpu"
|
64
|
-
|
65
61
|
logger.info(f"Initializing DocumentQA with model {model_name} on {device}")
|
66
62
|
|
67
63
|
# Initialize the pipeline
|
natural_pdf/search/__init__.py
CHANGED
@@ -24,63 +24,54 @@ from .search_service_protocol import Indexable, IndexConfigurationError, SearchS
|
|
24
24
|
|
25
25
|
logger = logging.getLogger(__name__)
|
26
26
|
|
27
|
-
# --- Factory Function ---
|
28
|
-
|
29
27
|
|
28
|
+
# Factory Function
|
30
29
|
def get_search_service(
|
31
|
-
collection_name: str,
|
32
|
-
persist: bool = False,
|
33
|
-
|
34
|
-
default_persist_path: Optional[str] = None,
|
30
|
+
collection_name: str,
|
31
|
+
persist: bool = False,
|
32
|
+
uri: Optional[str] = None,
|
35
33
|
default_embedding_model: Optional[str] = None,
|
36
|
-
# Potential future args: cache_services=True? service_type='haystack'?
|
37
34
|
) -> SearchServiceProtocol:
|
38
35
|
"""
|
39
36
|
Factory function to get an instance of the configured search service.
|
40
37
|
|
41
|
-
A service instance is tied to a specific
|
38
|
+
A service instance is tied to a specific index name (collection/table).
|
42
39
|
|
43
40
|
Currently, only returns HaystackSearchService but is structured for future extension.
|
44
41
|
|
45
42
|
Args:
|
46
|
-
collection_name: The name
|
43
|
+
collection_name: The logical name for the index this service instance manages
|
44
|
+
(used as table_name for LanceDB).
|
47
45
|
persist: If True, creates a service instance configured for persistent
|
48
|
-
storage (
|
49
|
-
|
46
|
+
storage (currently LanceDB). If False (default), uses InMemory.
|
47
|
+
uri: Override the default path/URI for persistent storage.
|
50
48
|
default_embedding_model: Override the default embedding model used by the service.
|
51
49
|
**kwargs: Reserved for future configuration options.
|
52
50
|
|
53
51
|
Returns:
|
54
|
-
An instance conforming to the SearchServiceProtocol for the specified collection.
|
52
|
+
An instance conforming to the SearchServiceProtocol for the specified collection/table.
|
55
53
|
"""
|
56
54
|
logger.debug(
|
57
|
-
f"Calling get_search_service factory for
|
55
|
+
f"Calling get_search_service factory for index '{collection_name}' (persist={persist}, uri={uri})..."
|
58
56
|
)
|
59
57
|
|
60
|
-
# For now, we only have one implementation
|
61
58
|
# Collect arguments relevant to HaystackSearchService.__init__
|
62
59
|
service_args = {}
|
63
|
-
service_args["
|
64
|
-
service_args["persist"] = persist
|
65
|
-
if
|
66
|
-
service_args["
|
60
|
+
service_args["table_name"] = collection_name
|
61
|
+
service_args["persist"] = persist
|
62
|
+
if uri is not None:
|
63
|
+
service_args["uri"] = uri
|
67
64
|
if default_embedding_model is not None:
|
68
|
-
service_args["
|
65
|
+
service_args["embedding_model"] = default_embedding_model
|
69
66
|
|
70
|
-
#
|
71
|
-
# for the same configuration instead of always creating a new one.
|
72
|
-
# cache_key = tuple(sorted(service_args.items()))
|
73
|
-
# if cache_key in _service_instance_cache:
|
74
|
-
# return _service_instance_cache[cache_key]
|
67
|
+
# Cache logic commented out as before
|
75
68
|
|
76
69
|
try:
|
77
70
|
service_instance = HaystackSearchService(**service_args)
|
78
|
-
|
79
|
-
logger.info(
|
80
|
-
f"Created new HaystackSearchService instance for collection '{collection_name}'."
|
81
|
-
)
|
71
|
+
logger.info(f"Created new HaystackSearchService instance for index '{collection_name}'.")
|
82
72
|
return service_instance
|
83
73
|
except ImportError as e:
|
74
|
+
# Error message remains valid
|
84
75
|
logger.error(
|
85
76
|
f"Failed to instantiate Search Service due to missing dependencies: {e}", exc_info=True
|
86
77
|
)
|
@@ -92,9 +83,4 @@ def get_search_service(
|
|
92
83
|
raise RuntimeError("Could not create Search Service instance.") from e
|
93
84
|
|
94
85
|
|
95
|
-
#
|
96
|
-
# try:
|
97
|
-
# default_search_service = get_search_service()
|
98
|
-
# except Exception:
|
99
|
-
# default_search_service = None
|
100
|
-
# logger.warning("Could not create default search service instance on import.")
|
86
|
+
# Default instance commented out as before
|