natural-pdf 0.2.18__py3-none-any.whl → 0.2.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. natural_pdf/__init__.py +8 -0
  2. natural_pdf/analyzers/checkbox/__init__.py +6 -0
  3. natural_pdf/analyzers/checkbox/base.py +265 -0
  4. natural_pdf/analyzers/checkbox/checkbox_analyzer.py +329 -0
  5. natural_pdf/analyzers/checkbox/checkbox_manager.py +166 -0
  6. natural_pdf/analyzers/checkbox/checkbox_options.py +60 -0
  7. natural_pdf/analyzers/checkbox/mixin.py +95 -0
  8. natural_pdf/analyzers/checkbox/rtdetr.py +201 -0
  9. natural_pdf/collections/mixins.py +14 -5
  10. natural_pdf/core/element_manager.py +5 -1
  11. natural_pdf/core/page.py +61 -0
  12. natural_pdf/core/page_collection.py +41 -1
  13. natural_pdf/core/pdf.py +24 -1
  14. natural_pdf/describe/base.py +20 -0
  15. natural_pdf/elements/base.py +152 -10
  16. natural_pdf/elements/element_collection.py +41 -2
  17. natural_pdf/elements/region.py +115 -2
  18. natural_pdf/judge.py +1509 -0
  19. natural_pdf/selectors/parser.py +42 -1
  20. {natural_pdf-0.2.18.dist-info → natural_pdf-0.2.19.dist-info}/METADATA +1 -1
  21. {natural_pdf-0.2.18.dist-info → natural_pdf-0.2.19.dist-info}/RECORD +41 -17
  22. temp/check_model.py +49 -0
  23. temp/check_pdf_content.py +9 -0
  24. temp/checkbox_checks.py +590 -0
  25. temp/checkbox_simple.py +117 -0
  26. temp/checkbox_ux_ideas.py +400 -0
  27. temp/context_manager_prototype.py +177 -0
  28. temp/convert_to_hf.py +60 -0
  29. temp/demo_text_closest.py +66 -0
  30. temp/inspect_model.py +43 -0
  31. temp/rtdetr_dinov2_test.py +49 -0
  32. temp/test_closest_debug.py +26 -0
  33. temp/test_closest_debug2.py +22 -0
  34. temp/test_context_exploration.py +85 -0
  35. temp/test_durham.py +30 -0
  36. temp/test_empty_string.py +16 -0
  37. temp/test_similarity.py +15 -0
  38. {natural_pdf-0.2.18.dist-info → natural_pdf-0.2.19.dist-info}/WHEEL +0 -0
  39. {natural_pdf-0.2.18.dist-info → natural_pdf-0.2.19.dist-info}/entry_points.txt +0 -0
  40. {natural_pdf-0.2.18.dist-info → natural_pdf-0.2.19.dist-info}/licenses/LICENSE +0 -0
  41. {natural_pdf-0.2.18.dist-info → natural_pdf-0.2.19.dist-info}/top_level.txt +0 -0
@@ -19,6 +19,7 @@ from pdfplumber.utils.geometry import get_bbox_overlap, merge_bboxes, objects_to
19
19
  from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_to_textmap
20
20
  from tqdm.auto import tqdm
21
21
 
22
+ from natural_pdf.analyzers.checkbox.mixin import CheckboxDetectionMixin
22
23
  from natural_pdf.analyzers.layout.pdfplumber_table_finder import find_text_based_tables
23
24
 
24
25
  # --- Shape Detection Mixin --- #
@@ -75,12 +76,41 @@ except ImportError:
75
76
  logger = logging.getLogger(__name__)
76
77
 
77
78
 
79
class RegionContext:
    """Scope guard that pins directional operations to a single region.

    While the context is active, the global
    ``natural_pdf.options.layout.directional_within`` option holds the wrapped
    region; the value that was stored there beforehand is put back on exit.
    """

    def __init__(self, region: "Region"):
        """Store the constraining region and reserve a slot for the old option.

        Args:
            region: The Region that directional operations should be limited to.
        """
        # Populated by __enter__; stays None until the context is entered.
        self.previous_within = None
        self.region = region

    def __enter__(self):
        """Install ``self.region`` as the global constraint and hand it back.

        Returns:
            The region passed to the constructor, so it can be bound with ``as``.
        """
        # Imported at call time rather than module load time
        # (presumably to avoid a circular import -- TODO confirm).
        import natural_pdf

        layout_options = natural_pdf.options.layout
        # Save whatever was configured before so __exit__ can restore it.
        self.previous_within = layout_options.directional_within
        layout_options.directional_within = self.region
        return self.region

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Restore the option value that ``__enter__`` saved.

        Returns:
            False, so an exception raised inside the ``with`` body propagates.
        """
        import natural_pdf

        natural_pdf.options.layout.directional_within = self.previous_within
        return False
105
+
106
+
78
107
  class Region(
79
108
  TextMixin,
80
109
  DirectionalMixin,
81
110
  ClassificationMixin,
82
111
  ExtractionMixin,
83
112
  ShapeDetectionMixin,
113
+ CheckboxDetectionMixin,
84
114
  DescribeMixin,
85
115
  VisualSearchMixin,
86
116
  Visualizable,
@@ -574,6 +604,16 @@ class Region(
574
604
  (self.x0, self.bottom), # bottom-left
575
605
  ]
576
606
 
607
@property
def origin(self) -> Optional[Union["Element", "Region"]]:
    """Element or region this region was created from, if one was recorded.

    Reads the ``source_element`` attribute (described in the original note as
    being set when the region comes from a directional method).

    Returns:
        The stored ``source_element``, or ``None`` when the attribute is absent.
    """
    try:
        return self.source_element
    except AttributeError:
        # Attribute was never set on this instance.
        return None
611
+
612
@property
def endpoint(self) -> Optional["Element"]:
    """Element at which this region stopped, if one was recorded.

    Reads the ``boundary_element`` attribute (described in the original note
    as being set when the region was created with an ``until`` parameter).

    Returns:
        The stored ``boundary_element``, or ``None`` when the attribute is absent.
    """
    try:
        return self.boundary_element
    except AttributeError:
        # Attribute was never set on this instance.
        return None
616
+
577
617
  def _is_point_in_polygon(self, x: float, y: float) -> bool:
578
618
  """
579
619
  Check if a point is inside the polygon using ray casting algorithm.
@@ -1297,9 +1337,11 @@ class Region(
1297
1337
 
1298
1338
  def extract_text(
1299
1339
  self,
1340
+ granularity: str = "chars",
1300
1341
  apply_exclusions: bool = True,
1301
1342
  debug: bool = False,
1302
1343
  *,
1344
+ overlap: str = "center",
1303
1345
  newlines: Union[bool, str] = True,
1304
1346
  content_filter=None,
1305
1347
  **kwargs,
@@ -1309,8 +1351,15 @@ class Region(
1309
1351
  layout engine (chars_to_textmap).
1310
1352
 
1311
1353
  Args:
1354
+ granularity: Level of text extraction - 'chars' (default) or 'words'.
1355
+ - 'chars': Character-by-character extraction (current behavior)
1356
+ - 'words': Word-level extraction with configurable overlap
1312
1357
  apply_exclusions: Whether to apply exclusion regions defined on the parent page.
1313
1358
  debug: Enable verbose debugging output for filtering steps.
1359
+ overlap: How to determine if words overlap with the region (only used when granularity='words'):
1360
+ - 'center': Word center point must be inside (default)
1361
+ - 'full': Word must be fully inside the region
1362
+ - 'partial': Any overlap includes the word
1314
1363
  newlines: Whether to strip newline characters from the extracted text.
1315
1364
  content_filter: Optional content filter to exclude specific text patterns. Can be:
1316
1365
  - A regex pattern string (characters matching the pattern are EXCLUDED)
@@ -1323,10 +1372,41 @@ class Region(
1323
1372
  Returns:
1324
1373
  Extracted text as string, potentially with layout-based spacing.
1325
1374
  """
1375
+ # Validate granularity parameter
1376
+ if granularity not in ("chars", "words"):
1377
+ raise ValueError(f"granularity must be 'chars' or 'words', got '{granularity}'")
1378
+
1326
1379
  # Allow 'debug_exclusions' for backward compatibility
1327
1380
  debug = kwargs.get("debug", debug or kwargs.get("debug_exclusions", False))
1328
- logger.debug(f"Region {self.bbox}: extract_text called with kwargs: {kwargs}")
1381
+ logger.debug(
1382
+ f"Region {self.bbox}: extract_text called with granularity='{granularity}', overlap='{overlap}', kwargs: {kwargs}"
1383
+ )
1329
1384
 
1385
+ # Handle word-level extraction
1386
+ if granularity == "words":
1387
+ # Use find_all to get words with proper overlap and exclusion handling
1388
+ word_elements = self.find_all(
1389
+ "text", overlap=overlap, apply_exclusions=apply_exclusions
1390
+ )
1391
+
1392
+ # Join the text from all matching words
1393
+ text_parts = []
1394
+ for word in word_elements:
1395
+ word_text = word.extract_text()
1396
+ if word_text: # Skip empty strings
1397
+ text_parts.append(word_text)
1398
+
1399
+ result = " ".join(text_parts)
1400
+
1401
+ # Apply newlines processing if requested
1402
+ if newlines is False:
1403
+ result = result.replace("\n", " ").replace("\r", " ")
1404
+ elif isinstance(newlines, str):
1405
+ result = result.replace("\n", newlines).replace("\r", newlines)
1406
+
1407
+ return result
1408
+
1409
+ # Original character-level extraction logic follows...
1330
1410
  # 1. Get Word Elements potentially within this region (initial broad phase)
1331
1411
  # Optimization: Could use spatial query if page elements were indexed
1332
1412
  page_words = self.page.words # Get all words from the page
@@ -3309,7 +3389,14 @@ class Region(
3309
3389
  name_info = f" name='{self.name}'" if self.name else ""
3310
3390
  type_info = f" type='{self.region_type}'" if self.region_type else ""
3311
3391
  source_info = f" source='{self.source}'" if self.source else ""
3312
- return f"<Region{name_info}{type_info}{source_info} bbox={self.bbox}{poly_info}>"
3392
+
3393
+ # Add checkbox state if this is a checkbox
3394
+ checkbox_info = ""
3395
+ if self.region_type == "checkbox" and hasattr(self, "is_checked"):
3396
+ state = "checked" if self.is_checked else "unchecked"
3397
+ checkbox_info = f" [{state}]"
3398
+
3399
+ return f"<Region{name_info}{type_info}{source_info}{checkbox_info} bbox={self.bbox}{poly_info}>"
3313
3400
 
3314
3401
  def update_text(
3315
3402
  self,
@@ -4038,3 +4125,29 @@ class Region(
4038
4125
  except Exception as e:
4039
4126
  logger.error(f"Error creating viewer for region {self.bbox}: {e}", exc_info=True)
4040
4127
  return None
4128
+
4129
def within(self):
    """Build a context manager that limits directional operations to this region.

    Inside the ``with`` block, directional navigation (above, below, left,
    right) is constrained to this region's bounds; the constraint is lifted
    again when the block ends.

    Returns:
        RegionContext: A context manager that yields this region.

    Examples:
        ```python
        # Create a column region
        left_col = page.region(right=page.width/2)

        # All directional operations are constrained to left_col
        with left_col.within() as col:
            header = col.find("text[size>14]")
            content = header.below(until="text[size>14]")
            # content will only include elements within left_col

        # Operations outside the context are not constrained
        full_page_below = header.below()  # Searches full page
        ```
    """
    region_scope = RegionContext(self)
    return region_scope