natural-pdf 0.1.15__py3-none-any.whl → 0.1.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. natural_pdf/__init__.py +31 -0
  2. natural_pdf/analyzers/layout/gemini.py +137 -162
  3. natural_pdf/analyzers/layout/layout_manager.py +9 -5
  4. natural_pdf/analyzers/layout/layout_options.py +77 -7
  5. natural_pdf/analyzers/layout/paddle.py +318 -165
  6. natural_pdf/analyzers/layout/table_structure_utils.py +78 -0
  7. natural_pdf/analyzers/shape_detection_mixin.py +770 -405
  8. natural_pdf/classification/mixin.py +2 -8
  9. natural_pdf/collections/pdf_collection.py +25 -30
  10. natural_pdf/core/highlighting_service.py +47 -32
  11. natural_pdf/core/page.py +117 -75
  12. natural_pdf/core/pdf.py +19 -22
  13. natural_pdf/elements/base.py +9 -9
  14. natural_pdf/elements/collections.py +105 -50
  15. natural_pdf/elements/region.py +200 -126
  16. natural_pdf/exporters/paddleocr.py +38 -13
  17. natural_pdf/flows/__init__.py +3 -3
  18. natural_pdf/flows/collections.py +303 -132
  19. natural_pdf/flows/element.py +277 -132
  20. natural_pdf/flows/flow.py +33 -16
  21. natural_pdf/flows/region.py +142 -79
  22. natural_pdf/ocr/engine_doctr.py +37 -4
  23. natural_pdf/ocr/engine_easyocr.py +23 -3
  24. natural_pdf/ocr/engine_paddle.py +281 -30
  25. natural_pdf/ocr/engine_surya.py +8 -3
  26. natural_pdf/ocr/ocr_manager.py +75 -76
  27. natural_pdf/ocr/ocr_options.py +52 -87
  28. natural_pdf/search/__init__.py +25 -12
  29. natural_pdf/search/lancedb_search_service.py +91 -54
  30. natural_pdf/search/numpy_search_service.py +86 -65
  31. natural_pdf/search/searchable_mixin.py +2 -2
  32. natural_pdf/selectors/parser.py +125 -81
  33. natural_pdf/widgets/__init__.py +1 -1
  34. natural_pdf/widgets/viewer.py +205 -449
  35. {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.16.dist-info}/METADATA +27 -45
  36. {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.16.dist-info}/RECORD +39 -38
  37. {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.16.dist-info}/WHEEL +0 -0
  38. {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.16.dist-info}/licenses/LICENSE +0 -0
  39. {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.16.dist-info}/top_level.txt +0 -0
@@ -1,9 +1,9 @@
1
+ import collections
1
2
  import logging
2
3
  import os
3
4
  import random
4
5
  import shutil
5
6
  from typing import TYPE_CHECKING, List, Optional, Set, Tuple, Union
6
- import collections
7
7
 
8
8
  from tqdm.auto import tqdm
9
9
 
@@ -144,16 +144,34 @@ class PaddleOCRRecognitionExporter(FinetuneExporter):
144
144
  )
145
145
  continue
146
146
 
147
- elements = pdf.find_all(
148
- self.selector, apply_exclusions=False
149
- ) # Usually want all text, even if excluded
147
+ elements = pdf.find_all(self.selector, apply_exclusions=False)
150
148
  if not elements:
151
149
  logger.debug(f"No elements matching '{self.selector}' found in {pdf.path}")
152
150
  continue
153
151
 
152
+ # --- FILTER BASED ON CHARACTER FREQUENCY BEFORE EXPORT ---
153
+ filtered_elements = []
154
+ if self.min_char_freq > 1:
155
+ # First, count all characters in all elements
156
+ char_counts = collections.Counter()
157
+ for element in elements:
158
+ if hasattr(element, "text") and isinstance(element.text, str):
159
+ char_counts.update(element.text)
160
+ rare_chars = {
161
+ char for char, count in char_counts.items() if count < self.min_char_freq
162
+ }
163
+ for element in elements:
164
+ if hasattr(element, "text") and isinstance(element.text, str):
165
+ if any(char in rare_chars for char in element.text):
166
+ elements_skipped += 1
167
+ continue
168
+ filtered_elements.append(element)
169
+ else:
170
+ filtered_elements = elements
171
+
154
172
  for i, element in enumerate(
155
173
  tqdm(
156
- elements,
174
+ filtered_elements,
157
175
  desc=f"Exporting '{os.path.basename(pdf.path)}'",
158
176
  leave=False,
159
177
  position=1,
@@ -243,16 +261,20 @@ class PaddleOCRRecognitionExporter(FinetuneExporter):
243
261
  filtered_labels = []
244
262
  for img_path, text in labels:
245
263
  if any(char in rare_chars for char in text):
246
- elements_skipped += 1 # Count these as skipped due to rare chars
247
- elements_processed -=1 # Decrement from processed as it's now being skipped
264
+ elements_skipped += 1 # Count these as skipped due to rare chars
265
+ elements_processed -= (
266
+ 1 # Decrement from processed as it's now being skipped
267
+ )
248
268
  else:
249
269
  filtered_labels.append((img_path, text))
250
-
270
+
251
271
  labels_removed_count = original_label_count - len(filtered_labels)
252
272
  if labels_removed_count > 0:
253
- logger.info(f"Removed {labels_removed_count} elements containing rare characters.")
273
+ logger.info(
274
+ f"Removed {labels_removed_count} elements containing rare characters."
275
+ )
254
276
  labels = filtered_labels
255
-
277
+
256
278
  # Recalculate char_counts based on filtered_labels to update the dictionary
257
279
  char_counts.clear()
258
280
  for _, text in labels:
@@ -266,15 +288,18 @@ class PaddleOCRRecognitionExporter(FinetuneExporter):
266
288
  else:
267
289
  logger.info("No rare characters found below the frequency threshold.")
268
290
 
269
-
270
291
  # --- 3. Generate Dictionary File (`dict.txt`) ---
271
292
  dict_path = os.path.join(output_dir, "dict.txt")
272
293
  try:
273
294
  # Log the character set before sorting/writing
274
- final_chars_for_dict = set(char_counts.keys()) # Use keys from potentially filtered char_counts
295
+ final_chars_for_dict = set(
296
+ char_counts.keys()
297
+ ) # Use keys from potentially filtered char_counts
275
298
  logger.debug(f"Exporter final char_set for dict: {repr(final_chars_for_dict)}")
276
299
 
277
- sorted_chars = sorted(list(final_chars_for_dict)) # No specific sorting order needed, just make it consistent
300
+ sorted_chars = sorted(
301
+ list(final_chars_for_dict)
302
+ ) # No specific sorting order needed, just make it consistent
278
303
  with open(dict_path, "w", encoding="utf-8") as f_dict:
279
304
  for char in sorted_chars:
280
305
  # Ensure we don't write empty strings or just newlines as dictionary entries
@@ -1,7 +1,7 @@
1
- from .flow import Flow
1
+ from .collections import FlowElementCollection, FlowRegionCollection
2
2
  from .element import FlowElement
3
+ from .flow import Flow
3
4
  from .region import FlowRegion
4
- from .collections import FlowElementCollection, FlowRegionCollection
5
5
 
6
6
  __all__ = [
7
7
  "Flow",
@@ -9,4 +9,4 @@ __all__ = [
9
9
  "FlowRegion",
10
10
  "FlowElementCollection",
11
11
  "FlowRegionCollection",
12
- ]
12
+ ]