natural-pdf 0.1.12__py3-none-any.whl → 0.1.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/analyzers/shape_detection_mixin.py +1092 -0
- natural_pdf/classification/manager.py +2 -3
- natural_pdf/collections/pdf_collection.py +19 -39
- natural_pdf/core/highlighting_service.py +29 -38
- natural_pdf/core/page.py +283 -186
- natural_pdf/core/pdf.py +4 -4
- natural_pdf/elements/base.py +34 -0
- natural_pdf/elements/collections.py +160 -9
- natural_pdf/elements/line.py +5 -0
- natural_pdf/elements/region.py +353 -12
- natural_pdf/exporters/paddleocr.py +51 -11
- natural_pdf/flows/__init__.py +12 -0
- natural_pdf/flows/collections.py +533 -0
- natural_pdf/flows/element.py +382 -0
- natural_pdf/flows/flow.py +216 -0
- natural_pdf/flows/region.py +458 -0
- natural_pdf/selectors/parser.py +163 -8
- {natural_pdf-0.1.12.dist-info → natural_pdf-0.1.13.dist-info}/METADATA +2 -1
- {natural_pdf-0.1.12.dist-info → natural_pdf-0.1.13.dist-info}/RECORD +22 -17
- {natural_pdf-0.1.12.dist-info → natural_pdf-0.1.13.dist-info}/WHEEL +1 -1
- natural_pdf/utils/tqdm_utils.py +0 -51
- {natural_pdf-0.1.12.dist-info → natural_pdf-0.1.13.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.12.dist-info → natural_pdf-0.1.13.dist-info}/top_level.txt +0 -0
@@ -25,7 +25,7 @@ except ImportError:
|
|
25
25
|
AutoModelForSequenceClassification = object
|
26
26
|
torch = None
|
27
27
|
|
28
|
-
from
|
28
|
+
from tqdm.auto import tqdm
|
29
29
|
|
30
30
|
# Import result classes
|
31
31
|
from .results import CategoryScore, ClassificationResult
|
@@ -343,8 +343,7 @@ class ClassificationManager:
|
|
343
343
|
total_items = len(item_contents)
|
344
344
|
if progress_bar:
|
345
345
|
# Get the appropriate tqdm class
|
346
|
-
|
347
|
-
results_iterator = tqdm_class(
|
346
|
+
results_iterator = tqdm(
|
348
347
|
results_iterator,
|
349
348
|
total=total_items,
|
350
349
|
desc=f"Classifying batch ({model_id})",
|
@@ -25,14 +25,12 @@ from typing import (
|
|
25
25
|
)
|
26
26
|
|
27
27
|
from PIL import Image
|
28
|
-
from tqdm import tqdm
|
29
|
-
from tqdm.auto import tqdm as auto_tqdm
|
30
|
-
from tqdm.notebook import tqdm as notebook_tqdm
|
28
|
+
from tqdm.auto import tqdm
|
31
29
|
|
32
|
-
from natural_pdf.
|
30
|
+
from natural_pdf.exporters.base import FinetuneExporter
|
33
31
|
|
34
|
-
#
|
35
|
-
|
32
|
+
# Need to import this utility
|
33
|
+
from natural_pdf.utils.identifiers import generate_short_path_hash
|
36
34
|
|
37
35
|
# Set up logger early
|
38
36
|
# Configure logging to include thread information
|
@@ -67,8 +65,10 @@ except ImportError as e:
|
|
67
65
|
from natural_pdf.collections.mixins import ApplyMixin
|
68
66
|
from natural_pdf.search.searchable_mixin import SearchableMixin # Import the new mixin
|
69
67
|
|
68
|
+
from natural_pdf.analyzers.shape_detection_mixin import ShapeDetectionMixin
|
70
69
|
|
71
|
-
|
70
|
+
|
71
|
+
class PDFCollection(SearchableMixin, ApplyMixin, ExportMixin, ShapeDetectionMixin): # Add ExportMixin and ShapeDetectionMixin
|
72
72
|
def __init__(
|
73
73
|
self,
|
74
74
|
source: Union[str, Iterable[Union[str, "PDF"]]],
|
@@ -119,16 +119,8 @@ class PDFCollection(SearchableMixin, ApplyMixin, ExportMixin): # Add ExportMixi
|
|
119
119
|
@staticmethod
|
120
120
|
def _get_pdf_class():
|
121
121
|
"""Helper method to dynamically import the PDF class."""
|
122
|
-
|
123
|
-
|
124
|
-
from natural_pdf.core.pdf import PDF
|
125
|
-
|
126
|
-
return PDF
|
127
|
-
except ImportError as e:
|
128
|
-
logger.error(
|
129
|
-
"Could not import PDF class from natural_pdf.core.pdf. Ensure it exists and there are no circular imports at runtime."
|
130
|
-
)
|
131
|
-
raise ImportError("PDF class is required but could not be imported.") from e
|
122
|
+
from natural_pdf.core.pdf import PDF
|
123
|
+
return PDF
|
132
124
|
|
133
125
|
# --- Internal Helpers ---
|
134
126
|
|
@@ -141,16 +133,13 @@ class PDFCollection(SearchableMixin, ApplyMixin, ExportMixin): # Add ExportMixi
|
|
141
133
|
def _execute_glob(self, pattern: str) -> Set[str]:
|
142
134
|
"""Glob for paths and return a set of valid PDF paths."""
|
143
135
|
found_paths = set()
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
for
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
found_paths.add(str(p.resolve())) # Store resolved absolute path
|
152
|
-
except Exception as e:
|
153
|
-
logger.error(f"Error processing glob pattern '{pattern}': {e}")
|
136
|
+
# Use iglob for potentially large directories/matches
|
137
|
+
paths_iter = py_glob.iglob(pattern, recursive=self._recursive)
|
138
|
+
for path_str in paths_iter:
|
139
|
+
# Use Path object for easier checking
|
140
|
+
p = Path(path_str)
|
141
|
+
if p.is_file() and p.suffix.lower() == ".pdf":
|
142
|
+
found_paths.add(str(p.resolve())) # Store resolved absolute path
|
154
143
|
return found_paths
|
155
144
|
|
156
145
|
def _resolve_sources_to_paths(self, source: Union[str, Iterable[str]]) -> List[str]:
|
@@ -534,19 +523,10 @@ class PDFCollection(SearchableMixin, ApplyMixin, ExportMixin): # Add ExportMixi
|
|
534
523
|
**kwargs: Additional arguments passed to create_correction_task_package
|
535
524
|
(e.g., image_render_scale, overwrite).
|
536
525
|
"""
|
537
|
-
|
538
|
-
from natural_pdf.utils.packaging import create_correction_task_package
|
526
|
+
from natural_pdf.utils.packaging import create_correction_task_package
|
539
527
|
|
540
|
-
|
541
|
-
|
542
|
-
except ImportError:
|
543
|
-
logger.error(
|
544
|
-
"Failed to import 'create_correction_task_package'. Packaging utility might be missing."
|
545
|
-
)
|
546
|
-
# Or raise
|
547
|
-
except Exception as e:
|
548
|
-
logger.error(f"Failed to export correction task for collection: {e}", exc_info=True)
|
549
|
-
raise # Re-raise the exception from the utility function
|
528
|
+
# Pass the collection itself (self) as the source
|
529
|
+
create_correction_task_package(source=self, output_zip_path=output_zip_path, **kwargs)
|
550
530
|
|
551
531
|
# --- Mixin Required Implementation ---
|
552
532
|
def get_indexable_items(self) -> Iterable[Indexable]:
|
@@ -215,21 +215,14 @@ class HighlightRenderer:
|
|
215
215
|
def _render_ocr_text(self):
|
216
216
|
"""Renders OCR text onto the image. (Adapted from old HighlightManager)"""
|
217
217
|
# Use the page reference to get OCR elements
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
if
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
# Alternative: self.page.extract_ocr_elements() - but might be slow
|
227
|
-
|
228
|
-
except Exception as e:
|
229
|
-
logger.warning(
|
230
|
-
f"Could not get OCR elements for page {self.page.number}: {e}", exc_info=True
|
231
|
-
)
|
232
|
-
return # Don't modify image if OCR elements aren't available
|
218
|
+
# Try finding first, then extracting if necessary
|
219
|
+
ocr_elements = self.page.find_all("text[source=ocr]")
|
220
|
+
if not ocr_elements:
|
221
|
+
# Don't run full OCR here, just extract if already run
|
222
|
+
ocr_elements = [
|
223
|
+
el for el in self.page.words if getattr(el, "source", None) == "ocr"
|
224
|
+
]
|
225
|
+
# Alternative: self.page.extract_ocr_elements() - but might be slow
|
233
226
|
|
234
227
|
if not ocr_elements:
|
235
228
|
logger.debug(f"No OCR elements found for page {self.page.number} to render.")
|
@@ -293,20 +286,15 @@ class HighlightRenderer:
|
|
293
286
|
)
|
294
287
|
|
295
288
|
# Calculate text position (centered vertically, slightly offset from left)
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
text_x = x0_s + padding # Start near left edge with padding
|
306
|
-
|
307
|
-
except Exception:
|
308
|
-
# Fallback positioning
|
309
|
-
text_x, text_y = x0_s + padding, top_s + padding
|
289
|
+
if hasattr(sized_font, "getbbox"): # Modern PIL
|
290
|
+
_, text_top_offset, _, text_bottom_offset = sized_font.getbbox(element.text)
|
291
|
+
text_h = text_bottom_offset - text_top_offset
|
292
|
+
else: # Older PIL approximation
|
293
|
+
text_h = font_size
|
294
|
+
text_y = top_s + (box_h - text_h) / 2
|
295
|
+
# Adjust for vertical offset in some fonts
|
296
|
+
text_y -= text_top_offset if hasattr(sized_font, "getbbox") else 0
|
297
|
+
text_x = x0_s + padding # Start near left edge with padding
|
310
298
|
|
311
299
|
draw.text((text_x, text_y), element.text, fill=(0, 0, 0, 255), font=sized_font)
|
312
300
|
|
@@ -392,9 +380,6 @@ class HighlightingService:
|
|
392
380
|
except ValueError:
|
393
381
|
logger.warning(f"Invalid color string: '{color_input}'")
|
394
382
|
return None
|
395
|
-
except Exception as e:
|
396
|
-
logger.error(f"Error processing color string '{color_input}': {e}")
|
397
|
-
return None
|
398
383
|
else:
|
399
384
|
logger.warning(f"Invalid color input type: {type(color_input)}")
|
400
385
|
return None
|
@@ -677,9 +662,12 @@ class HighlightingService:
|
|
677
662
|
actual_scale_y = scale # Fallback
|
678
663
|
logger.debug(f"Calculated actual scales for page {page_index}: x={actual_scale_x:.2f}, y={actual_scale_y:.2f}")
|
679
664
|
|
680
|
-
except
|
681
|
-
logger.error(f"
|
682
|
-
|
665
|
+
except IOError as e:
|
666
|
+
logger.error(f"IOError creating base image for page {page_index}: {e}")
|
667
|
+
raise
|
668
|
+
except AttributeError as e:
|
669
|
+
logger.error(f"AttributeError creating base image for page {page_index}: {e}")
|
670
|
+
raise
|
683
671
|
|
684
672
|
renderer_scale = actual_scale_x # Assuming aspect ratio maintained, use x_scale
|
685
673
|
|
@@ -865,8 +853,11 @@ class HighlightingService:
|
|
865
853
|
else:
|
866
854
|
final_image = rendered_image
|
867
855
|
|
868
|
-
except
|
869
|
-
logger.error(f"
|
870
|
-
|
856
|
+
except IOError as e:
|
857
|
+
logger.error(f"IOError rendering preview for page {page_index}: {e}")
|
858
|
+
raise
|
859
|
+
except AttributeError as e:
|
860
|
+
logger.error(f"AttributeError rendering preview for page {page_index}: {e}")
|
861
|
+
raise
|
871
862
|
|
872
863
|
return final_image
|