natural-pdf 0.1.15__py3-none-any.whl → 0.1.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +31 -0
- natural_pdf/analyzers/layout/gemini.py +137 -162
- natural_pdf/analyzers/layout/layout_manager.py +9 -5
- natural_pdf/analyzers/layout/layout_options.py +77 -7
- natural_pdf/analyzers/layout/paddle.py +318 -165
- natural_pdf/analyzers/layout/table_structure_utils.py +78 -0
- natural_pdf/analyzers/shape_detection_mixin.py +770 -405
- natural_pdf/classification/mixin.py +2 -8
- natural_pdf/collections/pdf_collection.py +25 -30
- natural_pdf/core/highlighting_service.py +47 -32
- natural_pdf/core/page.py +119 -76
- natural_pdf/core/pdf.py +19 -22
- natural_pdf/describe/__init__.py +21 -0
- natural_pdf/describe/base.py +457 -0
- natural_pdf/describe/elements.py +411 -0
- natural_pdf/describe/mixin.py +84 -0
- natural_pdf/describe/summary.py +186 -0
- natural_pdf/elements/base.py +11 -10
- natural_pdf/elements/collections.py +116 -51
- natural_pdf/elements/region.py +204 -127
- natural_pdf/exporters/paddleocr.py +38 -13
- natural_pdf/flows/__init__.py +3 -3
- natural_pdf/flows/collections.py +303 -132
- natural_pdf/flows/element.py +277 -132
- natural_pdf/flows/flow.py +33 -16
- natural_pdf/flows/region.py +142 -79
- natural_pdf/ocr/engine_doctr.py +37 -4
- natural_pdf/ocr/engine_easyocr.py +23 -3
- natural_pdf/ocr/engine_paddle.py +281 -30
- natural_pdf/ocr/engine_surya.py +8 -3
- natural_pdf/ocr/ocr_manager.py +75 -76
- natural_pdf/ocr/ocr_options.py +52 -87
- natural_pdf/search/__init__.py +25 -12
- natural_pdf/search/lancedb_search_service.py +91 -54
- natural_pdf/search/numpy_search_service.py +86 -65
- natural_pdf/search/searchable_mixin.py +2 -2
- natural_pdf/selectors/parser.py +125 -81
- natural_pdf/widgets/__init__.py +1 -1
- natural_pdf/widgets/viewer.py +205 -449
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/METADATA +27 -45
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/RECORD +44 -38
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/top_level.txt +0 -0
@@ -1,9 +1,9 @@
|
|
1
|
+
import collections
|
1
2
|
import logging
|
2
3
|
import os
|
3
4
|
import random
|
4
5
|
import shutil
|
5
6
|
from typing import TYPE_CHECKING, List, Optional, Set, Tuple, Union
|
6
|
-
import collections
|
7
7
|
|
8
8
|
from tqdm.auto import tqdm
|
9
9
|
|
@@ -144,16 +144,34 @@ class PaddleOCRRecognitionExporter(FinetuneExporter):
|
|
144
144
|
)
|
145
145
|
continue
|
146
146
|
|
147
|
-
elements = pdf.find_all(
|
148
|
-
self.selector, apply_exclusions=False
|
149
|
-
) # Usually want all text, even if excluded
|
147
|
+
elements = pdf.find_all(self.selector, apply_exclusions=False)
|
150
148
|
if not elements:
|
151
149
|
logger.debug(f"No elements matching '{self.selector}' found in {pdf.path}")
|
152
150
|
continue
|
153
151
|
|
152
|
+
# --- FILTER BASED ON CHARACTER FREQUENCY BEFORE EXPORT ---
|
153
|
+
filtered_elements = []
|
154
|
+
if self.min_char_freq > 1:
|
155
|
+
# First, count all characters in all elements
|
156
|
+
char_counts = collections.Counter()
|
157
|
+
for element in elements:
|
158
|
+
if hasattr(element, "text") and isinstance(element.text, str):
|
159
|
+
char_counts.update(element.text)
|
160
|
+
rare_chars = {
|
161
|
+
char for char, count in char_counts.items() if count < self.min_char_freq
|
162
|
+
}
|
163
|
+
for element in elements:
|
164
|
+
if hasattr(element, "text") and isinstance(element.text, str):
|
165
|
+
if any(char in rare_chars for char in element.text):
|
166
|
+
elements_skipped += 1
|
167
|
+
continue
|
168
|
+
filtered_elements.append(element)
|
169
|
+
else:
|
170
|
+
filtered_elements = elements
|
171
|
+
|
154
172
|
for i, element in enumerate(
|
155
173
|
tqdm(
|
156
|
-
|
174
|
+
filtered_elements,
|
157
175
|
desc=f"Exporting '{os.path.basename(pdf.path)}'",
|
158
176
|
leave=False,
|
159
177
|
position=1,
|
@@ -243,16 +261,20 @@ class PaddleOCRRecognitionExporter(FinetuneExporter):
|
|
243
261
|
filtered_labels = []
|
244
262
|
for img_path, text in labels:
|
245
263
|
if any(char in rare_chars for char in text):
|
246
|
-
elements_skipped += 1
|
247
|
-
elements_processed -= 1
|
264
|
+
elements_skipped += 1 # Count these as skipped due to rare chars
|
265
|
+
elements_processed -= (
|
266
|
+
1 # Decrement from processed as it's now being skipped
|
267
|
+
)
|
248
268
|
else:
|
249
269
|
filtered_labels.append((img_path, text))
|
250
|
-
|
270
|
+
|
251
271
|
labels_removed_count = original_label_count - len(filtered_labels)
|
252
272
|
if labels_removed_count > 0:
|
253
|
-
logger.info(f"Removed {labels_removed_count} elements containing rare characters.")
|
273
|
+
logger.info(
|
274
|
+
f"Removed {labels_removed_count} elements containing rare characters."
|
275
|
+
)
|
254
276
|
labels = filtered_labels
|
255
|
-
|
277
|
+
|
256
278
|
# Recalculate char_counts based on filtered_labels to update the dictionary
|
257
279
|
char_counts.clear()
|
258
280
|
for _, text in labels:
|
@@ -266,15 +288,18 @@ class PaddleOCRRecognitionExporter(FinetuneExporter):
|
|
266
288
|
else:
|
267
289
|
logger.info("No rare characters found below the frequency threshold.")
|
268
290
|
|
269
|
-
|
270
291
|
# --- 3. Generate Dictionary File (`dict.txt`) ---
|
271
292
|
dict_path = os.path.join(output_dir, "dict.txt")
|
272
293
|
try:
|
273
294
|
# Log the character set before sorting/writing
|
274
|
-
final_chars_for_dict = set(char_counts.keys())
|
295
|
+
final_chars_for_dict = set(
|
296
|
+
char_counts.keys()
|
297
|
+
) # Use keys from potentially filtered char_counts
|
275
298
|
logger.debug(f"Exporter final char_set for dict: {repr(final_chars_for_dict)}")
|
276
299
|
|
277
|
-
sorted_chars = sorted(list(final_chars_for_dict))
|
300
|
+
sorted_chars = sorted(
|
301
|
+
list(final_chars_for_dict)
|
302
|
+
) # No specific sorting order needed, just make it consistent
|
278
303
|
with open(dict_path, "w", encoding="utf-8") as f_dict:
|
279
304
|
for char in sorted_chars:
|
280
305
|
# Ensure we don't write empty strings or just newlines as dictionary entries
|
natural_pdf/flows/__init__.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1
|
-
from .flow import Flow
|
1
|
+
from .collections import FlowElementCollection, FlowRegionCollection
|
2
2
|
from .element import FlowElement
|
3
|
+
from .flow import Flow
|
3
4
|
from .region import FlowRegion
|
4
|
-
from .collections import FlowElementCollection, FlowRegionCollection
|
5
5
|
|
6
6
|
__all__ = [
|
7
7
|
"Flow",
|
@@ -9,4 +9,4 @@ __all__ = [
|
|
9
9
|
"FlowRegion",
|
10
10
|
"FlowElementCollection",
|
11
11
|
"FlowRegionCollection",
|
12
|
-
]
|
12
|
+
]
|