natural-pdf 0.1.10__py3-none-any.whl → 0.1.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. natural_pdf/__init__.py +29 -40
  2. natural_pdf/analyzers/text_options.py +9 -1
  3. natural_pdf/analyzers/text_structure.py +371 -58
  4. natural_pdf/classification/manager.py +1 -1
  5. natural_pdf/core/element_manager.py +11 -1
  6. natural_pdf/core/highlighting_service.py +120 -40
  7. natural_pdf/core/page.py +20 -18
  8. natural_pdf/core/pdf.py +146 -13
  9. natural_pdf/elements/base.py +17 -0
  10. natural_pdf/elements/collections.py +374 -30
  11. natural_pdf/elements/region.py +45 -14
  12. natural_pdf/exporters/data/__init__.py +0 -0
  13. natural_pdf/exporters/data/pdf.ttf +0 -0
  14. natural_pdf/exporters/data/sRGB.icc +0 -0
  15. natural_pdf/exporters/hocr.py +519 -0
  16. natural_pdf/exporters/hocr_font.py +136 -0
  17. natural_pdf/exporters/original_pdf.py +127 -0
  18. natural_pdf/exporters/searchable_pdf.py +2 -12
  19. natural_pdf/ocr/engine_surya.py +1 -1
  20. natural_pdf/search/__init__.py +65 -52
  21. natural_pdf/search/lancedb_search_service.py +325 -0
  22. natural_pdf/search/numpy_search_service.py +255 -0
  23. natural_pdf/search/searchable_mixin.py +25 -71
  24. natural_pdf/widgets/viewer.py +22 -31
  25. {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.12.dist-info}/METADATA +54 -50
  26. {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.12.dist-info}/RECORD +29 -23
  27. {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.12.dist-info}/WHEEL +1 -1
  28. natural_pdf/search/haystack_search_service.py +0 -687
  29. natural_pdf/search/haystack_utils.py +0 -474
  30. {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.12.dist-info}/licenses/LICENSE +0 -0
  31. {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.12.dist-info}/top_level.txt +0 -0
@@ -55,6 +55,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
55
55
  bbox: Tuple[float, float, float, float],
56
56
  polygon: List[Tuple[float, float]] = None,
57
57
  parent=None,
58
+ label: Optional[str] = None,
58
59
  ):
59
60
  """
60
61
  Initialize a region.
@@ -74,11 +75,8 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
74
75
  self.start_element = None
75
76
  self.end_element = None
76
77
 
77
- # --- ADDED --- Metadata store for mixins
78
78
  self.metadata: Dict[str, Any] = {}
79
- # --- NEW --- Central registry for analysis results
80
79
  self.analyses: Dict[str, Any] = {}
81
- # --- END ADDED ---
82
80
 
83
81
  # Standard attributes for all elements
84
82
  self.object_type = "region" # For selector compatibility
@@ -91,6 +89,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
91
89
 
92
90
  # Region management attributes
93
91
  self.name = None
92
+ self.label = label
94
93
  self.source = None # Will be set by creation methods
95
94
 
96
95
  # Hierarchy support for nested document structure
@@ -773,6 +772,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
773
772
  # Add a default color for standalone show
774
773
  color: Optional[Union[Tuple, str]] = "blue",
775
774
  label: Optional[str] = None,
775
+ width: Optional[int] = None, # Add width parameter
776
776
  ) -> "Image.Image":
777
777
  """
778
778
  Show the page with just this region highlighted temporarily.
@@ -783,6 +783,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
783
783
  legend_position: Position of the legend
784
784
  color: Color to highlight this region (default: blue)
785
785
  label: Optional label for this region in the legend
786
+ width: Optional width for the output image in pixels
786
787
 
787
788
  Returns:
788
789
  PIL Image of the page with only this region highlighted
@@ -813,6 +814,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
813
814
  page_index=self._page.index,
814
815
  temporary_highlights=[temp_highlight_data],
815
816
  scale=scale,
817
+ width=width, # Pass the width parameter
816
818
  labels=labels,
817
819
  legend_position=legend_position,
818
820
  )
@@ -1334,6 +1336,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
1334
1336
  self,
1335
1337
  *,
1336
1338
  text: str,
1339
+ contains: str = "all",
1337
1340
  apply_exclusions: bool = True,
1338
1341
  regex: bool = False,
1339
1342
  case: bool = True,
@@ -1345,6 +1348,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
1345
1348
  self,
1346
1349
  selector: str,
1347
1350
  *,
1351
+ contains: str = "all",
1348
1352
  apply_exclusions: bool = True,
1349
1353
  regex: bool = False,
1350
1354
  case: bool = True,
@@ -1356,6 +1360,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
1356
1360
  selector: Optional[str] = None, # Now optional
1357
1361
  *,
1358
1362
  text: Optional[str] = None, # New text parameter
1363
+ contains: str = "all", # New parameter for containment behavior
1359
1364
  apply_exclusions: bool = True,
1360
1365
  regex: bool = False,
1361
1366
  case: bool = True,
@@ -1369,6 +1374,9 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
1369
1374
  Args:
1370
1375
  selector: CSS-like selector string.
1371
1376
  text: Text content to search for (equivalent to 'text:contains(...)').
1377
+ contains: How to determine if elements are inside: 'all' (fully inside),
1378
+ 'any' (any overlap), or 'center' (center point inside).
1379
+ (default: "all")
1372
1380
  apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
1373
1381
  regex: Whether to use regex for text search (`selector` or `text`) (default: False).
1374
1382
  case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
@@ -1381,6 +1389,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
1381
1389
  elements = self.find_all(
1382
1390
  selector=selector,
1383
1391
  text=text,
1392
+ contains=contains,
1384
1393
  apply_exclusions=apply_exclusions,
1385
1394
  regex=regex,
1386
1395
  case=case,
@@ -1393,6 +1402,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
1393
1402
  self,
1394
1403
  *,
1395
1404
  text: str,
1405
+ contains: str = "all",
1396
1406
  apply_exclusions: bool = True,
1397
1407
  regex: bool = False,
1398
1408
  case: bool = True,
@@ -1404,6 +1414,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
1404
1414
  self,
1405
1415
  selector: str,
1406
1416
  *,
1417
+ contains: str = "all",
1407
1418
  apply_exclusions: bool = True,
1408
1419
  regex: bool = False,
1409
1420
  case: bool = True,
@@ -1415,6 +1426,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
1415
1426
  selector: Optional[str] = None, # Now optional
1416
1427
  *,
1417
1428
  text: Optional[str] = None, # New text parameter
1429
+ contains: str = "all", # New parameter to control inside/overlap behavior
1418
1430
  apply_exclusions: bool = True,
1419
1431
  regex: bool = False,
1420
1432
  case: bool = True,
@@ -1428,6 +1440,9 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
1428
1440
  Args:
1429
1441
  selector: CSS-like selector string.
1430
1442
  text: Text content to search for (equivalent to 'text:contains(...)').
1443
+ contains: How to determine if elements are inside: 'all' (fully inside),
1444
+ 'any' (any overlap), or 'center' (center point inside).
1445
+ (default: "all")
1431
1446
  apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
1432
1447
  regex: Whether to use regex for text search (`selector` or `text`) (default: False).
1433
1448
  case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
@@ -1443,6 +1458,10 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
1443
1458
  if selector is None and text is None:
1444
1459
  raise ValueError("Provide either 'selector' or 'text'.")
1445
1460
 
1461
+ # Validate contains parameter
1462
+ if contains not in ["all", "any", "center"]:
1463
+ raise ValueError(f"Invalid contains value: {contains}. Must be 'all', 'any', or 'center'")
1464
+
1446
1465
  # Construct selector if 'text' is provided
1447
1466
  effective_selector = ""
1448
1467
  if text is not None:
@@ -1482,22 +1501,34 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
1482
1501
  # Let the page handle its exclusion logic if needed
1483
1502
  potential_elements = self.page.find_all(
1484
1503
  selector=effective_selector,
1485
- apply_exclusions=False, # Apply exclusions LATER based on region bbox
1504
+ apply_exclusions=apply_exclusions,
1486
1505
  regex=regex,
1487
1506
  case=case,
1488
1507
  **kwargs,
1489
1508
  )
1490
1509
 
1491
- # Filter these elements to those strictly within the region's bounds
1510
+ # Filter these elements based on the specified containment method
1492
1511
  region_bbox = self.bbox
1493
- matching_elements = [
1494
- el
1495
- for el in potential_elements
1496
- if el.x0 >= region_bbox[0]
1497
- and el.top >= region_bbox[1]
1498
- and el.x1 <= region_bbox[2]
1499
- and el.bottom <= region_bbox[3]
1500
- ]
1512
+ matching_elements = []
1513
+
1514
+ if contains == "all": # Fully inside (strict)
1515
+ matching_elements = [
1516
+ el for el in potential_elements
1517
+ if el.x0 >= region_bbox[0]
1518
+ and el.top >= region_bbox[1]
1519
+ and el.x1 <= region_bbox[2]
1520
+ and el.bottom <= region_bbox[3]
1521
+ ]
1522
+ elif contains == "any": # Any overlap
1523
+ matching_elements = [
1524
+ el for el in potential_elements
1525
+ if self.intersects(el)
1526
+ ]
1527
+ elif contains == "center": # Center point inside
1528
+ matching_elements = [
1529
+ el for el in potential_elements
1530
+ if self.is_element_center_inside(el)
1531
+ ]
1501
1532
 
1502
1533
  return ElementCollection(matching_elements)
1503
1534
 
@@ -1989,7 +2020,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
1989
2020
  from natural_pdf.qa.document_qa import get_qa_engine
1990
2021
  except ImportError:
1991
2022
  logger.error(
1992
- "Question answering requires optional dependencies. Install with `pip install natural-pdf[qa]`"
2023
+ "Question answering requires optional dependencies. Install with `pip install natural-pdf[core-ml]`"
1993
2024
  )
1994
2025
  return {
1995
2026
  "answer": None,
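natural_pdf/exporters/data/__init__.py: File without changes
natural_pdf/exporters/data/pdf.ttf: Binary file
natural_pdf/exporters/data/sRGB.icc: Binary file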