natural-pdf 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff shows the differences between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
@@ -761,8 +761,6 @@ class Region(DirectionalMixin):
761
761
  exclusion_regions = self._page._get_exclusion_regions(include_callable=True)
762
762
 
763
763
  if debug:
764
- import logging
765
- logger = logging.getLogger("natural_pdf.elements.region")
766
764
  logger.debug(f"Region {self.bbox} with {len(exclusion_regions)} exclusion regions")
767
765
 
768
766
  # IMPROVEMENT 1: Check if the region intersects with any exclusion zone
@@ -777,16 +775,12 @@ class Region(DirectionalMixin):
777
775
  if overlap:
778
776
  has_intersection = True
779
777
  if debug:
780
- import logging
781
- logger = logging.getLogger("natural_pdf.elements.region")
782
778
  logger.debug(f" Region intersects with exclusion {i}: {exclusion.bbox}")
783
779
  break
784
780
 
785
781
  # If no intersection, process without exclusions
786
782
  if not has_intersection:
787
783
  if debug:
788
- import logging
789
- logger = logging.getLogger("natural_pdf.elements.region")
790
784
  logger.debug(f" No intersection with any exclusion, ignoring exclusions")
791
785
  apply_exclusions = False
792
786
  exclusion_regions = []
@@ -809,8 +803,6 @@ class Region(DirectionalMixin):
809
803
  abs(exclusion.x1 - self.page.width) < 5)
810
804
 
811
805
  if debug:
812
- import logging
813
- logger = logging.getLogger("natural_pdf.elements.region")
814
806
  logger.debug(f" Exclusion {i}: {exclusion.bbox}, full width: {full_width}")
815
807
 
816
808
  if full_width:
@@ -827,8 +819,6 @@ class Region(DirectionalMixin):
827
819
  bottom_bound = self.bottom
828
820
 
829
821
  if debug:
830
- import logging
831
- logger = logging.getLogger("natural_pdf.elements.region")
832
822
  logger.debug(f" Using cropping approach, initial bounds: ({self.x0}, {top_bound}, {self.x1}, {bottom_bound})")
833
823
 
834
824
  # Process only header/footer exclusions for cropping
@@ -838,8 +828,6 @@ class Region(DirectionalMixin):
838
828
  # Move top bound to exclude the header
839
829
  top_bound = max(top_bound, exclusion.bottom)
840
830
  if debug:
841
- import logging
842
- logger = logging.getLogger("natural_pdf.elements.region")
843
831
  logger.debug(f" Adjusted top bound to {top_bound} due to header exclusion")
844
832
 
845
833
  # If exclusion is at the bottom of our region
@@ -847,14 +835,10 @@ class Region(DirectionalMixin):
847
835
  # Move bottom bound to exclude the footer
848
836
  bottom_bound = min(bottom_bound, exclusion.top)
849
837
  if debug:
850
- import logging
851
- logger = logging.getLogger("natural_pdf.elements.region")
852
838
  logger.debug(f" Adjusted bottom bound to {bottom_bound} due to footer exclusion")
853
839
 
854
840
 
855
841
  if debug:
856
- import logging
857
- logger = logging.getLogger("natural_pdf.elements.region")
858
842
  logger.debug(f" Final bounds after exclusion adjustment: ({self.x0}, {top_bound}, {self.x1}, {bottom_bound})")
859
843
 
860
844
  # If we still have a valid region after exclusions
@@ -865,8 +849,6 @@ class Region(DirectionalMixin):
865
849
  result = cropped.extract_text(keep_blank_chars=keep_blank_chars, **kwargs)
866
850
 
867
851
  if debug:
868
- import logging
869
- logger = logging.getLogger("natural_pdf.elements.region")
870
852
  logger.debug(f" Successfully extracted text using crop, got {len(result)} characters")
871
853
 
872
854
  # Skip the complex filtering approach
@@ -874,16 +856,12 @@ class Region(DirectionalMixin):
874
856
  else:
875
857
  # This would only happen if the region is entirely inside an exclusion zone
876
858
  # or if both top and bottom of the region are excluded leaving no valid area
877
- import logging
878
- logger = logging.getLogger("natural_pdf.elements.region")
879
859
  logger.debug(f"Region {self.bbox} completely covered by exclusions, returning empty string")
880
860
  return ""
881
861
  # We have exclusions, but not all are headers/footers,
882
862
  # or we have a non-rectangular region
883
863
  else:
884
864
  if debug:
885
- import logging
886
- logger = logging.getLogger("natural_pdf.elements.region")
887
865
  logger.debug(f" Mixed exclusion types or non-rectangular region, switching to filtering")
888
866
 
889
867
  # Don't use crop for mixed exclusion types
@@ -902,16 +880,13 @@ class Region(DirectionalMixin):
902
880
  return result
903
881
 
904
882
  # For all other cases (complex exclusions, polygons), we use element filtering
905
- import warnings
906
- import logging
907
- logger = logging.getLogger("natural_pdf.elements.region")
908
-
909
883
  if debug:
910
884
  logger.debug(f"Using element filtering approach for region {self.bbox}")
911
885
 
912
- # Get all elements in this region first
913
- all_elements = self.get_elements(apply_exclusions=False)
914
-
886
+ # Get only word elements in this region first (instead of ALL elements)
887
+ # This prevents duplication from joining both char and word text
888
+ all_elements = [e for e in self.page.words if self._is_element_in_region(e)]
889
+
915
890
  if apply_exclusions and exclusion_regions:
916
891
  if debug:
917
892
  logger.debug(f"Filtering with {len(exclusion_regions)} exclusion zones")
@@ -1325,83 +1300,6 @@ class Region(DirectionalMixin):
1325
1300
 
1326
1301
  return elements
1327
1302
 
1328
- def expand(self,
1329
- left: float = 0,
1330
- right: float = 0,
1331
- top_expand: float = 0, # Renamed to avoid conflict
1332
- bottom_expand: float = 0, # Renamed to avoid conflict
1333
- width_factor: float = 1.0,
1334
- height_factor: float = 1.0,
1335
- # Keep original parameter names for backward compatibility
1336
- top: float = None,
1337
- bottom: float = None) -> 'Region':
1338
- """
1339
- Create a new region expanded from this one.
1340
-
1341
- Args:
1342
- left: Amount to expand left edge
1343
- right: Amount to expand right edge
1344
- top_expand: Amount to expand top edge (upward)
1345
- bottom_expand: Amount to expand bottom edge (downward)
1346
- width_factor: Factor to multiply width by
1347
- height_factor: Factor to multiply height by
1348
- top: (DEPRECATED, use top_expand) Amount to expand top edge (upward)
1349
- bottom: (DEPRECATED, use bottom_expand) Amount to expand bottom edge (downward)
1350
-
1351
- Returns:
1352
- New expanded Region
1353
- """
1354
- # Start with current coordinates
1355
- new_x0 = self.x0
1356
- new_x1 = self.x1
1357
- new_top = self.top
1358
- new_bottom = self.bottom
1359
-
1360
- # Handle the deprecated parameter names for backward compatibility
1361
- if top is not None:
1362
- top_expand = top
1363
- if bottom is not None:
1364
- bottom_expand = bottom
1365
-
1366
- # Apply absolute expansions first
1367
- new_x0 -= left
1368
- new_x1 += right
1369
- new_top -= top_expand # Expand upward (decrease top coordinate)
1370
- new_bottom += bottom_expand # Expand downward (increase bottom coordinate)
1371
-
1372
- # Apply percentage factors if provided
1373
- if width_factor != 1.0 or height_factor != 1.0:
1374
- # Current width and height
1375
- current_width = new_x1 - new_x0
1376
- current_height = new_bottom - new_top
1377
-
1378
- # Calculate new width and height
1379
- new_width = current_width * width_factor
1380
- new_height = current_height * height_factor
1381
-
1382
- # Calculate width and height differences
1383
- width_diff = new_width - current_width
1384
- height_diff = new_height - current_height
1385
-
1386
- # Adjust coordinates to maintain center point
1387
- new_x0 -= width_diff / 2
1388
- new_x1 += width_diff / 2
1389
- new_top -= height_diff / 2
1390
- new_bottom += height_diff / 2
1391
-
1392
- # Create new region with expanded bbox
1393
- new_region = Region(self.page, (new_x0, new_top, new_x1, new_bottom))
1394
-
1395
- # Copy multi-page properties if present
1396
- if self._spans_pages:
1397
- new_region._spans_pages = True
1398
- new_region._multi_page_elements = self._multi_page_elements
1399
- new_region._page_range = self._page_range
1400
- new_region.start_element = self.start_element
1401
- new_region.end_element = self.end_element
1402
-
1403
- return new_region
1404
-
1405
1303
  def get_section_between(self, start_element=None, end_element=None, boundary_inclusion='both'):
1406
1304
  """
1407
1305
  Get a section between two elements within this region.
@@ -1616,48 +1514,75 @@ class Region(DirectionalMixin):
1616
1514
 
1617
1515
  def create_cells(self):
1618
1516
  """
1619
- Create cell regions for a TATR-detected table.
1517
+ Create cell regions for a detected table by intersecting its
1518
+ row and column regions, and add them to the page.
1620
1519
 
1520
+ Assumes child row and column regions are already present on the page.
1521
+
1621
1522
  Returns:
1622
- List of cell regions
1523
+ Self for method chaining.
1623
1524
  """
1624
- if not (self.region_type == 'table' and self.model == 'tatr'):
1625
- raise ValueError("Only works for TATR-detected table regions")
1525
+ # Ensure this is called on a table region
1526
+ if self.region_type not in ('table', 'tableofcontents'): # Allow for ToC which might have structure
1527
+ raise ValueError(f"create_cells should be called on a 'table' or 'tableofcontents' region, not '{self.region_type}'")
1626
1528
 
1627
- # Find rows and columns that belong to this table
1628
- rows = self.page.find_all(f'region[type=table-row][model=tatr]')
1629
- columns = self.page.find_all(f'region[type=table-column][model=tatr]')
1529
+ # Find rows and columns associated with this page
1530
+ # Remove the model-specific filter
1531
+ rows = self.page.find_all('region[type=table-row]')
1532
+ columns = self.page.find_all('region[type=table-column]')
1630
1533
 
1631
- # Filter to only include those that overlap with this table
1534
+ # Filter to only include those that overlap with this table region
1632
1535
  def is_in_table(element):
1633
- element_center_x = (element.x0 + element.x1) / 2
1634
- element_center_y = (element.top + element.bottom) / 2
1635
- return (self.x0 <= element_center_x <= self.x1 and
1636
- self.top <= element_center_y <= self.bottom)
1536
+ # Use a simple overlap check (more robust than just center point)
1537
+ # Check if element's bbox overlaps with self.bbox
1538
+ return (element.x0 < self.x1 and element.x1 > self.x0 and
1539
+ element.top < self.bottom and element.bottom > self.top)
1637
1540
 
1638
1541
  table_rows = [r for r in rows if is_in_table(r)]
1639
1542
  table_columns = [c for c in columns if is_in_table(c)]
1640
1543
 
1544
+ if not table_rows or not table_columns:
1545
+ self._page.logger.warning(f"Region {self.bbox}: Cannot create cells. No overlapping row or column regions found.")
1546
+ return self # Return self even if no cells created
1547
+
1641
1548
  # Sort rows and columns
1642
1549
  table_rows.sort(key=lambda r: r.top)
1643
1550
  table_columns.sort(key=lambda c: c.x0)
1644
1551
 
1645
- # Create cells
1646
- cells = []
1552
+ # Create cells and add them to the page's element manager
1553
+ created_count = 0
1647
1554
  for row in table_rows:
1648
1555
  for column in table_columns:
1649
- # Create cell region at the intersection
1650
- cell = self.page.create_region(
1651
- column.x0, row.top, column.x1, row.bottom
1652
- )
1653
- # Set minimal metadata
1654
- cell.source = 'derived'
1655
- cell.region_type = 'table-cell'
1656
- cell.model = 'tatr'
1657
-
1658
- cells.append(cell)
1556
+ # Calculate intersection bbox for the cell
1557
+ cell_x0 = max(row.x0, column.x0)
1558
+ cell_y0 = max(row.top, column.top)
1559
+ cell_x1 = min(row.x1, column.x1)
1560
+ cell_y1 = min(row.bottom, column.bottom)
1561
+
1562
+ # Only create a cell if the intersection is valid (positive width/height)
1563
+ if cell_x1 > cell_x0 and cell_y1 > cell_y0:
1564
+ # Create cell region at the intersection
1565
+ cell = self.page.create_region(
1566
+ cell_x0, cell_y0, cell_x1, cell_y1
1567
+ )
1568
+ # Set metadata
1569
+ cell.source = 'derived'
1570
+ cell.region_type = 'table-cell' # Explicitly set type
1571
+ cell.normalized_type = 'table-cell' # And normalized type
1572
+ # Inherit model from the parent table region
1573
+ cell.model = self.model
1574
+ cell.parent_region = self # Link cell to parent table region
1575
+
1576
+ # Add the cell region to the page's element manager
1577
+ self.page._element_mgr.add_region(cell)
1578
+ created_count += 1
1659
1579
 
1660
- return cells
1580
+ # Optional: Add created cells to the table region's children
1581
+ # self.child_regions.extend(cells_created_in_this_call) # Needs list management
1582
+
1583
+ self._page.logger.info(f"Region {self.bbox} (Model: {self.model}): Created and added {created_count} cell regions.")
1584
+
1585
+ return self # Return self for chaining
1661
1586
 
1662
1587
  def ask(self, question: str, min_confidence: float = 0.1, model: str = None, debug: bool = False, **kwargs) -> Dict[str, Any]:
1663
1588
  """
@@ -5,6 +5,7 @@ from PIL import Image, ImageDraw
5
5
  import os
6
6
  import tempfile
7
7
  import json
8
+ from natural_pdf.elements.collections import ElementCollection
8
9
 
9
10
  logger = logging.getLogger("natural_pdf.qa.document_qa")
10
11
 
@@ -304,8 +305,8 @@ class DocumentQA:
304
305
  # Remove from matched texts to avoid duplicates
305
306
  if element.text in matched_texts:
306
307
  matched_texts.remove(element.text)
307
-
308
- result["source_elements"] = source_elements
308
+
309
+ result["source_elements"] = ElementCollection(source_elements)
309
310
 
310
311
  return result
311
312
 
@@ -386,7 +387,7 @@ class DocumentQA:
386
387
  if element.text in matched_texts:
387
388
  matched_texts.remove(element.text)
388
389
 
389
- result["source_elements"] = source_elements
390
+ result["source_elements"] = ElementCollection(source_elements)
390
391
 
391
392
  return result
392
393
 
@@ -351,4 +351,218 @@ def _is_approximate_match(value1, value2, tolerance: float = 0.1) -> bool:
351
351
  return abs(value1 - value2) <= tolerance
352
352
 
353
353
  # Default to exact match for other types
354
- return value1 == value2
354
+ return value1 == value2
355
+
356
+
357
+ PSEUDO_CLASS_FUNCTIONS = {
358
+ 'bold': lambda el: hasattr(el, 'bold') and el.bold,
359
+ 'italic': lambda el: hasattr(el, 'italic') and el.italic,
360
+ 'first-child': lambda el: hasattr(el, 'parent') and el.parent and el.parent.children[0] == el, # Example placeholder
361
+ 'last-child': lambda el: hasattr(el, 'parent') and el.parent and el.parent.children[-1] == el, # Example placeholder
362
+ # Add the new pseudo-classes for negation
363
+ 'not-bold': lambda el: hasattr(el, 'bold') and not el.bold,
364
+ 'not-italic': lambda el: hasattr(el, 'italic') and not el.italic,
365
+ }
366
+
367
+
368
+ def selector_to_filter_func(selector: Dict[str, Any], **kwargs) -> callable:
369
+ """
370
+ Convert a parsed selector to a filter function.
371
+
372
+ Args:
373
+ selector: Parsed selector dictionary
374
+ **kwargs: Additional filter parameters including:
375
+ - regex: Whether to use regex for text search
376
+ - case: Whether to do case-sensitive text search
377
+
378
+ Returns:
379
+ Function that takes an element and returns True if it matches
380
+ """
381
+ def filter_func(element):
382
+ # Check element type
383
+ if selector['type'] != 'any':
384
+ # Special handling for 'text' type to match both 'text', 'char', and 'word'
385
+ if selector['type'] == 'text':
386
+ if element.type not in ['text', 'char', 'word']:
387
+ return False
388
+ # Special handling for 'region' type to check for detected layout regions
389
+ elif selector['type'] == 'region':
390
+ # Check if this is a Region with region_type property
391
+ if not hasattr(element, 'region_type'):
392
+ return False
393
+
394
+ # If 'type' attribute specified, it will be checked in the attributes section
395
+ # Check for Docling-specific types (section-header, etc.)
396
+ elif hasattr(element, 'normalized_type') and element.normalized_type == selector['type']:
397
+ # This is a direct match with a Docling region type
398
+ pass
399
+ # Otherwise, require exact match with the element's type attribute
400
+ elif not hasattr(element, 'type') or element.type != selector['type']:
401
+ return False
402
+
403
+ # Check attributes
404
+ for name, attr_info in selector['attributes'].items():
405
+ op = attr_info['op']
406
+ value = attr_info['value']
407
+
408
+ # Special case for fontname attribute - allow matching part of the name
409
+ if name == 'fontname' and op == '*=':
410
+ element_value = getattr(element, name, None)
411
+ if element_value is None or value.lower() not in element_value.lower():
412
+ return False
413
+ continue
414
+
415
+ # Convert hyphenated attribute names to underscore for Python properties
416
+ python_name = name.replace('-', '_')
417
+
418
+ # Special case for region attributes
419
+ if selector['type'] == 'region':
420
+ if name == 'type':
421
+ # Use normalized_type for comparison if available
422
+ if hasattr(element, 'normalized_type') and element.normalized_type:
423
+ element_value = element.normalized_type
424
+ else:
425
+ # Convert spaces to hyphens for consistency with the normalized format
426
+ element_value = getattr(element, 'region_type', '').lower().replace(' ', '-')
427
+ elif name == 'model':
428
+ # Special handling for model attribute in regions
429
+ element_value = getattr(element, 'model', None)
430
+ else:
431
+ # Get the attribute value from the element normally
432
+ element_value = getattr(element, python_name, None)
433
+ else:
434
+ # Get the attribute value from the element normally for non-region elements
435
+ element_value = getattr(element, python_name, None)
436
+
437
+ if element_value is None:
438
+ return False
439
+
440
+ # Apply operator
441
+ if op == '=':
442
+ if element_value != value:
443
+ return False
444
+ elif op == '~=':
445
+ # Approximate match (e.g., for colors)
446
+ if not _is_approximate_match(element_value, value):
447
+ return False
448
+ elif op == '>=':
449
+ # Greater than or equal (element value must be >= specified value)
450
+ if not (isinstance(element_value, (int, float)) and
451
+ isinstance(value, (int, float)) and
452
+ element_value >= value):
453
+ return False
454
+ elif op == '<=':
455
+ # Less than or equal (element value must be <= specified value)
456
+ if not (isinstance(element_value, (int, float)) and
457
+ isinstance(value, (int, float)) and
458
+ element_value <= value):
459
+ return False
460
+ elif op == '>':
461
+ # Greater than (element value must be > specified value)
462
+ if not (isinstance(element_value, (int, float)) and
463
+ isinstance(value, (int, float)) and
464
+ element_value > value):
465
+ return False
466
+ elif op == '<':
467
+ # Less than (element value must be < specified value)
468
+ if not (isinstance(element_value, (int, float)) and
469
+ isinstance(value, (int, float)) and
470
+ element_value < value):
471
+ return False
472
+
473
+ # Check pseudo-classes
474
+ for pseudo in selector['pseudo_classes']:
475
+ name = pseudo['name']
476
+ args = pseudo['args']
477
+
478
+ # Handle various pseudo-classes
479
+ if name == 'contains' and hasattr(element, 'text'):
480
+ use_regex = kwargs.get('regex', False)
481
+ ignore_case = not kwargs.get('case', True)
482
+
483
+ if use_regex:
484
+ import re
485
+ if not element.text:
486
+ return False
487
+ try:
488
+ pattern = re.compile(args, re.IGNORECASE if ignore_case else 0)
489
+ if not pattern.search(element.text):
490
+ return False
491
+ except re.error:
492
+ # If regex is invalid, fall back to literal text search
493
+ element_text = element.text
494
+ search_text = args
495
+
496
+ if ignore_case:
497
+ element_text = element_text.lower()
498
+ search_text = search_text.lower()
499
+
500
+ if search_text not in element_text:
501
+ return False
502
+ else:
503
+ # String comparison with case sensitivity option
504
+ if not element.text:
505
+ return False
506
+
507
+ element_text = element.text
508
+ search_text = args
509
+
510
+ if ignore_case:
511
+ element_text = element_text.lower()
512
+ search_text = search_text.lower()
513
+
514
+ if search_text not in element_text:
515
+ return False
516
+ elif name == 'starts-with' and hasattr(element, 'text'):
517
+ if not element.text or not element.text.startswith(args):
518
+ return False
519
+ elif name == 'ends-with' and hasattr(element, 'text'):
520
+ if not element.text or not element.text.endswith(args):
521
+ return False
522
+ elif name == 'bold':
523
+ if not (hasattr(element, 'bold') and element.bold):
524
+ return False
525
+ elif name == 'italic':
526
+ if not (hasattr(element, 'italic') and element.italic):
527
+ return False
528
+ elif name == 'horizontal':
529
+ if not (hasattr(element, 'is_horizontal') and element.is_horizontal):
530
+ return False
531
+ elif name == 'vertical':
532
+ if not (hasattr(element, 'is_vertical') and element.is_vertical):
533
+ return False
534
+ else:
535
+ # Check pseudo-classes (basic ones like :bold, :italic)
536
+ if name in PSEUDO_CLASS_FUNCTIONS:
537
+ if not PSEUDO_CLASS_FUNCTIONS[name](element):
538
+ return False
539
+ elif name == 'contains':
540
+ if not hasattr(element, 'text') or not element.text:
541
+ return False
542
+ text_to_check = element.text
543
+ search_term = args
544
+ if not kwargs.get('case', True): # Check case flag from kwargs
545
+ text_to_check = text_to_check.lower()
546
+ search_term = search_term.lower()
547
+
548
+ if kwargs.get('regex', False): # Check regex flag from kwargs
549
+ try:
550
+ if not re.search(search_term, text_to_check):
551
+ return False
552
+ except re.error as e:
553
+ logger.warning(f"Invalid regex in :contains selector '{search_term}': {e}")
554
+ return False # Invalid regex cannot match
555
+ else:
556
+ if search_term not in text_to_check:
557
+ return False
558
+ # Skip complex pseudo-classes like :near, :above here, handled later
559
+ elif name in ('above', 'below', 'near', 'left-of', 'right-of'):
560
+ pass # Handled separately after initial filtering
561
+ else:
562
+ # Optionally log unknown pseudo-classes
563
+ # logger.warning(f"Unknown pseudo-class: {name}")
564
+ pass
565
+
566
+ return True # Element passes all attribute and simple pseudo-class filters
567
+
568
+ return filter_func
@@ -127,10 +127,10 @@ def create_legend(labels_colors: Dict[str, Tuple[int, int, int, int]],
127
127
  # Try to load a font, use default if not available
128
128
  try:
129
129
  # Use a commonly available font, adjust size
130
- font = ImageFont.truetype("DejaVuSans.ttf", 12)
130
+ font = ImageFont.truetype("DejaVuSans.ttf", 14)
131
131
  except IOError:
132
132
  try:
133
- font = ImageFont.truetype("Arial.ttf", 12)
133
+ font = ImageFont.truetype("Arial.ttf", 14)
134
134
  except IOError:
135
135
  font = ImageFont.load_default()
136
136