natural-pdf: natural_pdf-0.1.0-py3-none-any.whl → natural_pdf-0.1.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +1 -1
- natural_pdf/analyzers/layout/layout_analyzer.py +133 -44
- natural_pdf/analyzers/layout/layout_manager.py +9 -6
- natural_pdf/analyzers/layout/layout_options.py +2 -4
- natural_pdf/analyzers/layout/surya.py +199 -91
- natural_pdf/core/highlighting_service.py +48 -17
- natural_pdf/core/page.py +92 -27
- natural_pdf/core/pdf.py +11 -0
- natural_pdf/elements/base.py +99 -14
- natural_pdf/elements/collections.py +56 -0
- natural_pdf/elements/region.py +56 -131
- natural_pdf/qa/document_qa.py +4 -3
- natural_pdf/selectors/parser.py +215 -1
- natural_pdf/utils/visualization.py +2 -2
- natural_pdf-0.1.2.dist-info/METADATA +124 -0
- {natural_pdf-0.1.0.dist-info → natural_pdf-0.1.2.dist-info}/RECORD +19 -19
- natural_pdf-0.1.0.dist-info/METADATA +0 -295
- {natural_pdf-0.1.0.dist-info → natural_pdf-0.1.2.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.0.dist-info → natural_pdf-0.1.2.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.0.dist-info → natural_pdf-0.1.2.dist-info}/top_level.txt +0 -0
natural_pdf/elements/region.py
CHANGED
@@ -761,8 +761,6 @@ class Region(DirectionalMixin):
|
|
761
761
|
exclusion_regions = self._page._get_exclusion_regions(include_callable=True)
|
762
762
|
|
763
763
|
if debug:
|
764
|
-
import logging
|
765
|
-
logger = logging.getLogger("natural_pdf.elements.region")
|
766
764
|
logger.debug(f"Region {self.bbox} with {len(exclusion_regions)} exclusion regions")
|
767
765
|
|
768
766
|
# IMPROVEMENT 1: Check if the region intersects with any exclusion zone
|
@@ -777,16 +775,12 @@ class Region(DirectionalMixin):
|
|
777
775
|
if overlap:
|
778
776
|
has_intersection = True
|
779
777
|
if debug:
|
780
|
-
import logging
|
781
|
-
logger = logging.getLogger("natural_pdf.elements.region")
|
782
778
|
logger.debug(f" Region intersects with exclusion {i}: {exclusion.bbox}")
|
783
779
|
break
|
784
780
|
|
785
781
|
# If no intersection, process without exclusions
|
786
782
|
if not has_intersection:
|
787
783
|
if debug:
|
788
|
-
import logging
|
789
|
-
logger = logging.getLogger("natural_pdf.elements.region")
|
790
784
|
logger.debug(f" No intersection with any exclusion, ignoring exclusions")
|
791
785
|
apply_exclusions = False
|
792
786
|
exclusion_regions = []
|
@@ -809,8 +803,6 @@ class Region(DirectionalMixin):
|
|
809
803
|
abs(exclusion.x1 - self.page.width) < 5)
|
810
804
|
|
811
805
|
if debug:
|
812
|
-
import logging
|
813
|
-
logger = logging.getLogger("natural_pdf.elements.region")
|
814
806
|
logger.debug(f" Exclusion {i}: {exclusion.bbox}, full width: {full_width}")
|
815
807
|
|
816
808
|
if full_width:
|
@@ -827,8 +819,6 @@ class Region(DirectionalMixin):
|
|
827
819
|
bottom_bound = self.bottom
|
828
820
|
|
829
821
|
if debug:
|
830
|
-
import logging
|
831
|
-
logger = logging.getLogger("natural_pdf.elements.region")
|
832
822
|
logger.debug(f" Using cropping approach, initial bounds: ({self.x0}, {top_bound}, {self.x1}, {bottom_bound})")
|
833
823
|
|
834
824
|
# Process only header/footer exclusions for cropping
|
@@ -838,8 +828,6 @@ class Region(DirectionalMixin):
|
|
838
828
|
# Move top bound to exclude the header
|
839
829
|
top_bound = max(top_bound, exclusion.bottom)
|
840
830
|
if debug:
|
841
|
-
import logging
|
842
|
-
logger = logging.getLogger("natural_pdf.elements.region")
|
843
831
|
logger.debug(f" Adjusted top bound to {top_bound} due to header exclusion")
|
844
832
|
|
845
833
|
# If exclusion is at the bottom of our region
|
@@ -847,14 +835,10 @@ class Region(DirectionalMixin):
|
|
847
835
|
# Move bottom bound to exclude the footer
|
848
836
|
bottom_bound = min(bottom_bound, exclusion.top)
|
849
837
|
if debug:
|
850
|
-
import logging
|
851
|
-
logger = logging.getLogger("natural_pdf.elements.region")
|
852
838
|
logger.debug(f" Adjusted bottom bound to {bottom_bound} due to footer exclusion")
|
853
839
|
|
854
840
|
|
855
841
|
if debug:
|
856
|
-
import logging
|
857
|
-
logger = logging.getLogger("natural_pdf.elements.region")
|
858
842
|
logger.debug(f" Final bounds after exclusion adjustment: ({self.x0}, {top_bound}, {self.x1}, {bottom_bound})")
|
859
843
|
|
860
844
|
# If we still have a valid region after exclusions
|
@@ -865,8 +849,6 @@ class Region(DirectionalMixin):
|
|
865
849
|
result = cropped.extract_text(keep_blank_chars=keep_blank_chars, **kwargs)
|
866
850
|
|
867
851
|
if debug:
|
868
|
-
import logging
|
869
|
-
logger = logging.getLogger("natural_pdf.elements.region")
|
870
852
|
logger.debug(f" Successfully extracted text using crop, got {len(result)} characters")
|
871
853
|
|
872
854
|
# Skip the complex filtering approach
|
@@ -874,16 +856,12 @@ class Region(DirectionalMixin):
|
|
874
856
|
else:
|
875
857
|
# This would only happen if the region is entirely inside an exclusion zone
|
876
858
|
# or if both top and bottom of the region are excluded leaving no valid area
|
877
|
-
import logging
|
878
|
-
logger = logging.getLogger("natural_pdf.elements.region")
|
879
859
|
logger.debug(f"Region {self.bbox} completely covered by exclusions, returning empty string")
|
880
860
|
return ""
|
881
861
|
# We have exclusions, but not all are headers/footers,
|
882
862
|
# or we have a non-rectangular region
|
883
863
|
else:
|
884
864
|
if debug:
|
885
|
-
import logging
|
886
|
-
logger = logging.getLogger("natural_pdf.elements.region")
|
887
865
|
logger.debug(f" Mixed exclusion types or non-rectangular region, switching to filtering")
|
888
866
|
|
889
867
|
# Don't use crop for mixed exclusion types
|
@@ -902,16 +880,13 @@ class Region(DirectionalMixin):
|
|
902
880
|
return result
|
903
881
|
|
904
882
|
# For all other cases (complex exclusions, polygons), we use element filtering
|
905
|
-
import warnings
|
906
|
-
import logging
|
907
|
-
logger = logging.getLogger("natural_pdf.elements.region")
|
908
|
-
|
909
883
|
if debug:
|
910
884
|
logger.debug(f"Using element filtering approach for region {self.bbox}")
|
911
885
|
|
912
|
-
# Get
|
913
|
-
|
914
|
-
|
886
|
+
# Get only word elements in this region first (instead of ALL elements)
|
887
|
+
# This prevents duplication from joining both char and word text
|
888
|
+
all_elements = [e for e in self.page.words if self._is_element_in_region(e)]
|
889
|
+
|
915
890
|
if apply_exclusions and exclusion_regions:
|
916
891
|
if debug:
|
917
892
|
logger.debug(f"Filtering with {len(exclusion_regions)} exclusion zones")
|
@@ -1325,83 +1300,6 @@ class Region(DirectionalMixin):
|
|
1325
1300
|
|
1326
1301
|
return elements
|
1327
1302
|
|
1328
|
-
def expand(self,
|
1329
|
-
left: float = 0,
|
1330
|
-
right: float = 0,
|
1331
|
-
top_expand: float = 0, # Renamed to avoid conflict
|
1332
|
-
bottom_expand: float = 0, # Renamed to avoid conflict
|
1333
|
-
width_factor: float = 1.0,
|
1334
|
-
height_factor: float = 1.0,
|
1335
|
-
# Keep original parameter names for backward compatibility
|
1336
|
-
top: float = None,
|
1337
|
-
bottom: float = None) -> 'Region':
|
1338
|
-
"""
|
1339
|
-
Create a new region expanded from this one.
|
1340
|
-
|
1341
|
-
Args:
|
1342
|
-
left: Amount to expand left edge
|
1343
|
-
right: Amount to expand right edge
|
1344
|
-
top_expand: Amount to expand top edge (upward)
|
1345
|
-
bottom_expand: Amount to expand bottom edge (downward)
|
1346
|
-
width_factor: Factor to multiply width by
|
1347
|
-
height_factor: Factor to multiply height by
|
1348
|
-
top: (DEPRECATED, use top_expand) Amount to expand top edge (upward)
|
1349
|
-
bottom: (DEPRECATED, use bottom_expand) Amount to expand bottom edge (downward)
|
1350
|
-
|
1351
|
-
Returns:
|
1352
|
-
New expanded Region
|
1353
|
-
"""
|
1354
|
-
# Start with current coordinates
|
1355
|
-
new_x0 = self.x0
|
1356
|
-
new_x1 = self.x1
|
1357
|
-
new_top = self.top
|
1358
|
-
new_bottom = self.bottom
|
1359
|
-
|
1360
|
-
# Handle the deprecated parameter names for backward compatibility
|
1361
|
-
if top is not None:
|
1362
|
-
top_expand = top
|
1363
|
-
if bottom is not None:
|
1364
|
-
bottom_expand = bottom
|
1365
|
-
|
1366
|
-
# Apply absolute expansions first
|
1367
|
-
new_x0 -= left
|
1368
|
-
new_x1 += right
|
1369
|
-
new_top -= top_expand # Expand upward (decrease top coordinate)
|
1370
|
-
new_bottom += bottom_expand # Expand downward (increase bottom coordinate)
|
1371
|
-
|
1372
|
-
# Apply percentage factors if provided
|
1373
|
-
if width_factor != 1.0 or height_factor != 1.0:
|
1374
|
-
# Current width and height
|
1375
|
-
current_width = new_x1 - new_x0
|
1376
|
-
current_height = new_bottom - new_top
|
1377
|
-
|
1378
|
-
# Calculate new width and height
|
1379
|
-
new_width = current_width * width_factor
|
1380
|
-
new_height = current_height * height_factor
|
1381
|
-
|
1382
|
-
# Calculate width and height differences
|
1383
|
-
width_diff = new_width - current_width
|
1384
|
-
height_diff = new_height - current_height
|
1385
|
-
|
1386
|
-
# Adjust coordinates to maintain center point
|
1387
|
-
new_x0 -= width_diff / 2
|
1388
|
-
new_x1 += width_diff / 2
|
1389
|
-
new_top -= height_diff / 2
|
1390
|
-
new_bottom += height_diff / 2
|
1391
|
-
|
1392
|
-
# Create new region with expanded bbox
|
1393
|
-
new_region = Region(self.page, (new_x0, new_top, new_x1, new_bottom))
|
1394
|
-
|
1395
|
-
# Copy multi-page properties if present
|
1396
|
-
if self._spans_pages:
|
1397
|
-
new_region._spans_pages = True
|
1398
|
-
new_region._multi_page_elements = self._multi_page_elements
|
1399
|
-
new_region._page_range = self._page_range
|
1400
|
-
new_region.start_element = self.start_element
|
1401
|
-
new_region.end_element = self.end_element
|
1402
|
-
|
1403
|
-
return new_region
|
1404
|
-
|
1405
1303
|
def get_section_between(self, start_element=None, end_element=None, boundary_inclusion='both'):
|
1406
1304
|
"""
|
1407
1305
|
Get a section between two elements within this region.
|
@@ -1616,48 +1514,75 @@ class Region(DirectionalMixin):
|
|
1616
1514
|
|
1617
1515
|
def create_cells(self):
|
1618
1516
|
"""
|
1619
|
-
Create cell regions for a
|
1517
|
+
Create cell regions for a detected table by intersecting its
|
1518
|
+
row and column regions, and add them to the page.
|
1620
1519
|
|
1520
|
+
Assumes child row and column regions are already present on the page.
|
1521
|
+
|
1621
1522
|
Returns:
|
1622
|
-
|
1523
|
+
Self for method chaining.
|
1623
1524
|
"""
|
1624
|
-
|
1625
|
-
|
1525
|
+
# Ensure this is called on a table region
|
1526
|
+
if self.region_type not in ('table', 'tableofcontents'): # Allow for ToC which might have structure
|
1527
|
+
raise ValueError(f"create_cells should be called on a 'table' or 'tableofcontents' region, not '{self.region_type}'")
|
1626
1528
|
|
1627
|
-
# Find rows and columns
|
1628
|
-
|
1629
|
-
|
1529
|
+
# Find rows and columns associated with this page
|
1530
|
+
# Remove the model-specific filter
|
1531
|
+
rows = self.page.find_all('region[type=table-row]')
|
1532
|
+
columns = self.page.find_all('region[type=table-column]')
|
1630
1533
|
|
1631
|
-
# Filter to only include those that overlap with this table
|
1534
|
+
# Filter to only include those that overlap with this table region
|
1632
1535
|
def is_in_table(element):
|
1633
|
-
|
1634
|
-
|
1635
|
-
return (
|
1636
|
-
|
1536
|
+
# Use a simple overlap check (more robust than just center point)
|
1537
|
+
# Check if element's bbox overlaps with self.bbox
|
1538
|
+
return (element.x0 < self.x1 and element.x1 > self.x0 and
|
1539
|
+
element.top < self.bottom and element.bottom > self.top)
|
1637
1540
|
|
1638
1541
|
table_rows = [r for r in rows if is_in_table(r)]
|
1639
1542
|
table_columns = [c for c in columns if is_in_table(c)]
|
1640
1543
|
|
1544
|
+
if not table_rows or not table_columns:
|
1545
|
+
self._page.logger.warning(f"Region {self.bbox}: Cannot create cells. No overlapping row or column regions found.")
|
1546
|
+
return self # Return self even if no cells created
|
1547
|
+
|
1641
1548
|
# Sort rows and columns
|
1642
1549
|
table_rows.sort(key=lambda r: r.top)
|
1643
1550
|
table_columns.sort(key=lambda c: c.x0)
|
1644
1551
|
|
1645
|
-
# Create cells
|
1646
|
-
|
1552
|
+
# Create cells and add them to the page's element manager
|
1553
|
+
created_count = 0
|
1647
1554
|
for row in table_rows:
|
1648
1555
|
for column in table_columns:
|
1649
|
-
#
|
1650
|
-
|
1651
|
-
|
1652
|
-
)
|
1653
|
-
|
1654
|
-
|
1655
|
-
cell
|
1656
|
-
|
1657
|
-
|
1658
|
-
|
1556
|
+
# Calculate intersection bbox for the cell
|
1557
|
+
cell_x0 = max(row.x0, column.x0)
|
1558
|
+
cell_y0 = max(row.top, column.top)
|
1559
|
+
cell_x1 = min(row.x1, column.x1)
|
1560
|
+
cell_y1 = min(row.bottom, column.bottom)
|
1561
|
+
|
1562
|
+
# Only create a cell if the intersection is valid (positive width/height)
|
1563
|
+
if cell_x1 > cell_x0 and cell_y1 > cell_y0:
|
1564
|
+
# Create cell region at the intersection
|
1565
|
+
cell = self.page.create_region(
|
1566
|
+
cell_x0, cell_y0, cell_x1, cell_y1
|
1567
|
+
)
|
1568
|
+
# Set metadata
|
1569
|
+
cell.source = 'derived'
|
1570
|
+
cell.region_type = 'table-cell' # Explicitly set type
|
1571
|
+
cell.normalized_type = 'table-cell' # And normalized type
|
1572
|
+
# Inherit model from the parent table region
|
1573
|
+
cell.model = self.model
|
1574
|
+
cell.parent_region = self # Link cell to parent table region
|
1575
|
+
|
1576
|
+
# Add the cell region to the page's element manager
|
1577
|
+
self.page._element_mgr.add_region(cell)
|
1578
|
+
created_count += 1
|
1659
1579
|
|
1660
|
-
|
1580
|
+
# Optional: Add created cells to the table region's children
|
1581
|
+
# self.child_regions.extend(cells_created_in_this_call) # Needs list management
|
1582
|
+
|
1583
|
+
self._page.logger.info(f"Region {self.bbox} (Model: {self.model}): Created and added {created_count} cell regions.")
|
1584
|
+
|
1585
|
+
return self # Return self for chaining
|
1661
1586
|
|
1662
1587
|
def ask(self, question: str, min_confidence: float = 0.1, model: str = None, debug: bool = False, **kwargs) -> Dict[str, Any]:
|
1663
1588
|
"""
|
natural_pdf/qa/document_qa.py
CHANGED
@@ -5,6 +5,7 @@ from PIL import Image, ImageDraw
|
|
5
5
|
import os
|
6
6
|
import tempfile
|
7
7
|
import json
|
8
|
+
from natural_pdf.elements.collections import ElementCollection
|
8
9
|
|
9
10
|
logger = logging.getLogger("natural_pdf.qa.document_qa")
|
10
11
|
|
@@ -304,8 +305,8 @@ class DocumentQA:
|
|
304
305
|
# Remove from matched texts to avoid duplicates
|
305
306
|
if element.text in matched_texts:
|
306
307
|
matched_texts.remove(element.text)
|
307
|
-
|
308
|
-
result["source_elements"] = source_elements
|
308
|
+
|
309
|
+
result["source_elements"] = ElementCollection(source_elements)
|
309
310
|
|
310
311
|
return result
|
311
312
|
|
@@ -386,7 +387,7 @@ class DocumentQA:
|
|
386
387
|
if element.text in matched_texts:
|
387
388
|
matched_texts.remove(element.text)
|
388
389
|
|
389
|
-
result["source_elements"] = source_elements
|
390
|
+
result["source_elements"] = ElementCollection(source_elements)
|
390
391
|
|
391
392
|
return result
|
392
393
|
|
natural_pdf/selectors/parser.py
CHANGED
@@ -351,4 +351,218 @@ def _is_approximate_match(value1, value2, tolerance: float = 0.1) -> bool:
|
|
351
351
|
return abs(value1 - value2) <= tolerance
|
352
352
|
|
353
353
|
# Default to exact match for other types
|
354
|
-
return value1 == value2
|
354
|
+
return value1 == value2
|
355
|
+
|
356
|
+
|
357
|
+
PSEUDO_CLASS_FUNCTIONS = {
|
358
|
+
'bold': lambda el: hasattr(el, 'bold') and el.bold,
|
359
|
+
'italic': lambda el: hasattr(el, 'italic') and el.italic,
|
360
|
+
'first-child': lambda el: hasattr(el, 'parent') and el.parent and el.parent.children[0] == el, # Example placeholder
|
361
|
+
'last-child': lambda el: hasattr(el, 'parent') and el.parent and el.parent.children[-1] == el, # Example placeholder
|
362
|
+
# Add the new pseudo-classes for negation
|
363
|
+
'not-bold': lambda el: hasattr(el, 'bold') and not el.bold,
|
364
|
+
'not-italic': lambda el: hasattr(el, 'italic') and not el.italic,
|
365
|
+
}
|
366
|
+
|
367
|
+
|
368
|
+
def selector_to_filter_func(selector: Dict[str, Any], **kwargs) -> callable:
|
369
|
+
"""
|
370
|
+
Convert a parsed selector to a filter function.
|
371
|
+
|
372
|
+
Args:
|
373
|
+
selector: Parsed selector dictionary
|
374
|
+
**kwargs: Additional filter parameters including:
|
375
|
+
- regex: Whether to use regex for text search
|
376
|
+
- case: Whether to do case-sensitive text search
|
377
|
+
|
378
|
+
Returns:
|
379
|
+
Function that takes an element and returns True if it matches
|
380
|
+
"""
|
381
|
+
def filter_func(element):
|
382
|
+
# Check element type
|
383
|
+
if selector['type'] != 'any':
|
384
|
+
# Special handling for 'text' type to match both 'text', 'char', and 'word'
|
385
|
+
if selector['type'] == 'text':
|
386
|
+
if element.type not in ['text', 'char', 'word']:
|
387
|
+
return False
|
388
|
+
# Special handling for 'region' type to check for detected layout regions
|
389
|
+
elif selector['type'] == 'region':
|
390
|
+
# Check if this is a Region with region_type property
|
391
|
+
if not hasattr(element, 'region_type'):
|
392
|
+
return False
|
393
|
+
|
394
|
+
# If 'type' attribute specified, it will be checked in the attributes section
|
395
|
+
# Check for Docling-specific types (section-header, etc.)
|
396
|
+
elif hasattr(element, 'normalized_type') and element.normalized_type == selector['type']:
|
397
|
+
# This is a direct match with a Docling region type
|
398
|
+
pass
|
399
|
+
# Otherwise, require exact match with the element's type attribute
|
400
|
+
elif not hasattr(element, 'type') or element.type != selector['type']:
|
401
|
+
return False
|
402
|
+
|
403
|
+
# Check attributes
|
404
|
+
for name, attr_info in selector['attributes'].items():
|
405
|
+
op = attr_info['op']
|
406
|
+
value = attr_info['value']
|
407
|
+
|
408
|
+
# Special case for fontname attribute - allow matching part of the name
|
409
|
+
if name == 'fontname' and op == '*=':
|
410
|
+
element_value = getattr(element, name, None)
|
411
|
+
if element_value is None or value.lower() not in element_value.lower():
|
412
|
+
return False
|
413
|
+
continue
|
414
|
+
|
415
|
+
# Convert hyphenated attribute names to underscore for Python properties
|
416
|
+
python_name = name.replace('-', '_')
|
417
|
+
|
418
|
+
# Special case for region attributes
|
419
|
+
if selector['type'] == 'region':
|
420
|
+
if name == 'type':
|
421
|
+
# Use normalized_type for comparison if available
|
422
|
+
if hasattr(element, 'normalized_type') and element.normalized_type:
|
423
|
+
element_value = element.normalized_type
|
424
|
+
else:
|
425
|
+
# Convert spaces to hyphens for consistency with the normalized format
|
426
|
+
element_value = getattr(element, 'region_type', '').lower().replace(' ', '-')
|
427
|
+
elif name == 'model':
|
428
|
+
# Special handling for model attribute in regions
|
429
|
+
element_value = getattr(element, 'model', None)
|
430
|
+
else:
|
431
|
+
# Get the attribute value from the element normally
|
432
|
+
element_value = getattr(element, python_name, None)
|
433
|
+
else:
|
434
|
+
# Get the attribute value from the element normally for non-region elements
|
435
|
+
element_value = getattr(element, python_name, None)
|
436
|
+
|
437
|
+
if element_value is None:
|
438
|
+
return False
|
439
|
+
|
440
|
+
# Apply operator
|
441
|
+
if op == '=':
|
442
|
+
if element_value != value:
|
443
|
+
return False
|
444
|
+
elif op == '~=':
|
445
|
+
# Approximate match (e.g., for colors)
|
446
|
+
if not _is_approximate_match(element_value, value):
|
447
|
+
return False
|
448
|
+
elif op == '>=':
|
449
|
+
# Greater than or equal (element value must be >= specified value)
|
450
|
+
if not (isinstance(element_value, (int, float)) and
|
451
|
+
isinstance(value, (int, float)) and
|
452
|
+
element_value >= value):
|
453
|
+
return False
|
454
|
+
elif op == '<=':
|
455
|
+
# Less than or equal (element value must be <= specified value)
|
456
|
+
if not (isinstance(element_value, (int, float)) and
|
457
|
+
isinstance(value, (int, float)) and
|
458
|
+
element_value <= value):
|
459
|
+
return False
|
460
|
+
elif op == '>':
|
461
|
+
# Greater than (element value must be > specified value)
|
462
|
+
if not (isinstance(element_value, (int, float)) and
|
463
|
+
isinstance(value, (int, float)) and
|
464
|
+
element_value > value):
|
465
|
+
return False
|
466
|
+
elif op == '<':
|
467
|
+
# Less than (element value must be < specified value)
|
468
|
+
if not (isinstance(element_value, (int, float)) and
|
469
|
+
isinstance(value, (int, float)) and
|
470
|
+
element_value < value):
|
471
|
+
return False
|
472
|
+
|
473
|
+
# Check pseudo-classes
|
474
|
+
for pseudo in selector['pseudo_classes']:
|
475
|
+
name = pseudo['name']
|
476
|
+
args = pseudo['args']
|
477
|
+
|
478
|
+
# Handle various pseudo-classes
|
479
|
+
if name == 'contains' and hasattr(element, 'text'):
|
480
|
+
use_regex = kwargs.get('regex', False)
|
481
|
+
ignore_case = not kwargs.get('case', True)
|
482
|
+
|
483
|
+
if use_regex:
|
484
|
+
import re
|
485
|
+
if not element.text:
|
486
|
+
return False
|
487
|
+
try:
|
488
|
+
pattern = re.compile(args, re.IGNORECASE if ignore_case else 0)
|
489
|
+
if not pattern.search(element.text):
|
490
|
+
return False
|
491
|
+
except re.error:
|
492
|
+
# If regex is invalid, fall back to literal text search
|
493
|
+
element_text = element.text
|
494
|
+
search_text = args
|
495
|
+
|
496
|
+
if ignore_case:
|
497
|
+
element_text = element_text.lower()
|
498
|
+
search_text = search_text.lower()
|
499
|
+
|
500
|
+
if search_text not in element_text:
|
501
|
+
return False
|
502
|
+
else:
|
503
|
+
# String comparison with case sensitivity option
|
504
|
+
if not element.text:
|
505
|
+
return False
|
506
|
+
|
507
|
+
element_text = element.text
|
508
|
+
search_text = args
|
509
|
+
|
510
|
+
if ignore_case:
|
511
|
+
element_text = element_text.lower()
|
512
|
+
search_text = search_text.lower()
|
513
|
+
|
514
|
+
if search_text not in element_text:
|
515
|
+
return False
|
516
|
+
elif name == 'starts-with' and hasattr(element, 'text'):
|
517
|
+
if not element.text or not element.text.startswith(args):
|
518
|
+
return False
|
519
|
+
elif name == 'ends-with' and hasattr(element, 'text'):
|
520
|
+
if not element.text or not element.text.endswith(args):
|
521
|
+
return False
|
522
|
+
elif name == 'bold':
|
523
|
+
if not (hasattr(element, 'bold') and element.bold):
|
524
|
+
return False
|
525
|
+
elif name == 'italic':
|
526
|
+
if not (hasattr(element, 'italic') and element.italic):
|
527
|
+
return False
|
528
|
+
elif name == 'horizontal':
|
529
|
+
if not (hasattr(element, 'is_horizontal') and element.is_horizontal):
|
530
|
+
return False
|
531
|
+
elif name == 'vertical':
|
532
|
+
if not (hasattr(element, 'is_vertical') and element.is_vertical):
|
533
|
+
return False
|
534
|
+
else:
|
535
|
+
# Check pseudo-classes (basic ones like :bold, :italic)
|
536
|
+
if name in PSEUDO_CLASS_FUNCTIONS:
|
537
|
+
if not PSEUDO_CLASS_FUNCTIONS[name](element):
|
538
|
+
return False
|
539
|
+
elif name == 'contains':
|
540
|
+
if not hasattr(element, 'text') or not element.text:
|
541
|
+
return False
|
542
|
+
text_to_check = element.text
|
543
|
+
search_term = args
|
544
|
+
if not kwargs.get('case', True): # Check case flag from kwargs
|
545
|
+
text_to_check = text_to_check.lower()
|
546
|
+
search_term = search_term.lower()
|
547
|
+
|
548
|
+
if kwargs.get('regex', False): # Check regex flag from kwargs
|
549
|
+
try:
|
550
|
+
if not re.search(search_term, text_to_check):
|
551
|
+
return False
|
552
|
+
except re.error as e:
|
553
|
+
logger.warning(f"Invalid regex in :contains selector '{search_term}': {e}")
|
554
|
+
return False # Invalid regex cannot match
|
555
|
+
else:
|
556
|
+
if search_term not in text_to_check:
|
557
|
+
return False
|
558
|
+
# Skip complex pseudo-classes like :near, :above here, handled later
|
559
|
+
elif name in ('above', 'below', 'near', 'left-of', 'right-of'):
|
560
|
+
pass # Handled separately after initial filtering
|
561
|
+
else:
|
562
|
+
# Optionally log unknown pseudo-classes
|
563
|
+
# logger.warning(f"Unknown pseudo-class: {name}")
|
564
|
+
pass
|
565
|
+
|
566
|
+
return True # Element passes all attribute and simple pseudo-class filters
|
567
|
+
|
568
|
+
return filter_func
|
natural_pdf/utils/visualization.py
CHANGED
@@ -127,10 +127,10 @@ def create_legend(labels_colors: Dict[str, Tuple[int, int, int, int]],
|
|
127
127
|
# Try to load a font, use default if not available
|
128
128
|
try:
|
129
129
|
# Use a commonly available font, adjust size
|
130
|
-
font = ImageFont.truetype("DejaVuSans.ttf",
|
130
|
+
font = ImageFont.truetype("DejaVuSans.ttf", 14)
|
131
131
|
except IOError:
|
132
132
|
try:
|
133
|
-
font = ImageFont.truetype("Arial.ttf",
|
133
|
+
font = ImageFont.truetype("Arial.ttf", 14)
|
134
134
|
except IOError:
|
135
135
|
font = ImageFont.load_default()
|
136
136
|
|