natural-pdf 0.1.37__py3-none-any.whl → 0.1.40__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -631,6 +631,12 @@ class FlowRegion:
631
631
  text_options: Optional[Dict] = None,
632
632
  cell_extraction_func: Optional[Callable[["PhysicalRegion"], Optional[str]]] = None,
633
633
  show_progress: bool = False,
634
+ # Optional row-level merge predicate. If provided, it decides whether
635
+ # the current row (first row of a segment/page) should be merged with
636
+ # the previous one (to handle multi-page spill-overs).
637
+ stitch_rows: Optional[
638
+ Callable[[List[Optional[str]], List[Optional[str]], int, "PhysicalRegion"], bool]
639
+ ] = None,
634
640
  **kwargs,
635
641
  ) -> TableResult:
636
642
  """Extracts a single logical table from the FlowRegion.
@@ -651,6 +657,15 @@ class FlowRegion:
651
657
  A TableResult object containing the aggregated table data. Rows returned from
652
658
  consecutive constituent regions are appended in document order. If
653
659
  no tables are detected in any region, an empty TableResult is returned.
660
+
661
+ stitch_rows parameter:
662
+ Controls whether the first rows of subsequent segments/regions should be merged
663
+ into the previous row (to handle spill-over across page breaks).
664
+
665
+ • None (default) – no merging (behaviour identical to previous versions).
666
+ • Callable – custom predicate taking
667
+ (prev_row, cur_row, row_idx_in_segment, segment_object) → bool.
668
+ Return True to merge `cur_row` into `prev_row` (default column-wise merge is used).
654
669
  """
655
670
 
656
671
  if table_settings is None:
@@ -661,9 +676,26 @@ class FlowRegion:
661
676
  if not self.constituent_regions:
662
677
  return TableResult([])
663
678
 
679
+ # Resolve stitch_rows predicate -------------------------------------------------------
680
+ predicate: Optional[
681
+ Callable[[List[Optional[str]], List[Optional[str]], int, "PhysicalRegion"], bool]
682
+ ] = stitch_rows if callable(stitch_rows) else None
683
+
684
+ def _default_merge(prev_row: List[Optional[str]], cur_row: List[Optional[str]]) -> List[Optional[str]]:
685
+ """Column-wise merge – concatenates non-empty strings with a space."""
686
+ from itertools import zip_longest
687
+
688
+ merged: List[Optional[str]] = []
689
+ for p, c in zip_longest(prev_row, cur_row, fillvalue=""):
690
+ if (p or "").strip() and (c or "").strip():
691
+ merged.append(f"{p} {c}".strip())
692
+ else:
693
+ merged.append((p or "") + (c or ""))
694
+ return merged
695
+
664
696
  aggregated_rows: List[List[Optional[str]]] = []
665
697
 
666
- for region in self.constituent_regions:
698
+ for region_idx, region in enumerate(self.constituent_regions):
667
699
  try:
668
700
  region_result = region.extract_table(
669
701
  method=method,
@@ -676,9 +708,25 @@ class FlowRegion:
676
708
  **kwargs,
677
709
  )
678
710
 
679
- # region_result is now a TableResult object, extract the rows
680
- if region_result:
681
- aggregated_rows.extend(region_result)
711
+ # Convert result to list of rows
712
+ if not region_result:
713
+ continue
714
+
715
+ if isinstance(region_result, TableResult):
716
+ segment_rows = list(region_result)
717
+ else:
718
+ segment_rows = list(region_result)
719
+
720
+ for row_idx, row in enumerate(segment_rows):
721
+ if (
722
+ predicate is not None
723
+ and aggregated_rows
724
+ and predicate(aggregated_rows[-1], row, row_idx, region)
725
+ ):
726
+ # Merge with previous row
727
+ aggregated_rows[-1] = _default_merge(aggregated_rows[-1], row)
728
+ else:
729
+ aggregated_rows.append(row)
682
730
  except Exception as e:
683
731
  logger.error(
684
732
  f"FlowRegion.extract_table: Error extracting table from constituent region {region}: {e}",
@@ -100,6 +100,12 @@ def safe_parse_color(value_str: str) -> tuple:
100
100
  ValueError: If the color cannot be parsed
101
101
  """
102
102
  value_str = value_str.strip()
103
+
104
+ # Strip quotes first if it's a quoted string (same logic as safe_parse_value)
105
+ if (value_str.startswith('"') and value_str.endswith('"')) or (
106
+ value_str.startswith("'") and value_str.endswith("'")
107
+ ):
108
+ value_str = value_str[1:-1]
103
109
 
104
110
  # Try parsing as a Python literal (for RGB tuples)
105
111
  try:
@@ -504,6 +510,21 @@ def _is_approximate_match(value1, value2) -> bool:
504
510
  return value1 == value2
505
511
 
506
512
 
513
def _is_exact_color_match(value1, value2) -> bool:
    """Return True when two attribute values are considered an exact match.

    When both values are recognised as colors by ``_is_color_value``, they
    match if ``_color_distance`` reports a difference of at most 2.0 (a
    strict tolerance that absorbs tiny color variations). Any other pair of
    values is compared with plain ``==``.
    """
    both_are_colors = _is_color_value(value1) and _is_color_value(value2)
    if not both_are_colors:
        # Non-color values: fall back to ordinary equality.
        return value1 == value2

    # Color values: allow a small distance instead of demanding identity.
    return _color_distance(value1, value2) <= 2.0
526
+
527
+
507
528
  PSEUDO_CLASS_FUNCTIONS = {
508
529
  "bold": lambda el: hasattr(el, "bold") and el.bold,
509
530
  "italic": lambda el: hasattr(el, "italic") and el.italic,
@@ -603,7 +624,19 @@ def _build_filter_list(selector: Dict[str, Any], **kwargs) -> List[Dict[str, Any
603
624
 
604
625
  # Determine compare_func based on op (reuse existing logic)
605
626
  if op == "=":
606
- compare_func = lambda el_val, sel_val: el_val == sel_val
627
+ # For color attributes, use exact color matching with small tolerance
628
+ if name in [
629
+ "color",
630
+ "non_stroking_color",
631
+ "fill",
632
+ "stroke",
633
+ "strokeColor",
634
+ "fillColor",
635
+ ]:
636
+ op_desc = f"= {value!r} (exact color)"
637
+ compare_func = lambda el_val, sel_val: _is_exact_color_match(el_val, sel_val)
638
+ else:
639
+ compare_func = lambda el_val, sel_val: el_val == sel_val
607
640
  elif op == "!=":
608
641
  compare_func = lambda el_val, sel_val: el_val != sel_val
609
642
  elif op == "~=":
@@ -0,0 +1,97 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ from typing import Any, Callable, Optional
5
+
6
+ logger = logging.getLogger(__name__)
7
+
8
+
9
class TextMixin:  # pylint: disable=too-few-public-methods
    """Mixin that adds general text-replacement capabilities.

    Two public entry points are exposed to any class that inherits this mix-in:

    1. ``update_text`` (preferred) – iterate over text elements selected via the
       ``selector`` argument (default: ``"text"``) and apply a *transform*
       callback which optionally returns replacement text. If the callback
       returns a string that differs from the current value, the element's
       ``text`` attribute is updated in-place.

    2. ``correct_ocr`` – legacy name kept for backward compatibility. It simply
       forwards to :py:meth:`update_text` while defaulting
       ``selector="text[source=ocr]"`` so that the historic behaviour (acting
       only on OCR-generated elements) is preserved.
    """

    # ---------------------------------------------------------------------
    # Back-compat shim
    # ---------------------------------------------------------------------
    def correct_ocr(self, *args, selector: str = "text[source=ocr]", **kwargs):  # type: ignore[override]
        """Backward-compatibility wrapper that forwards to *update_text*.

        Parameters
        ----------
        *args, **kwargs
            Forwarded verbatim to :py:meth:`update_text` (after injecting the
            ``selector`` default shown above).

        Returns
        -------
        Whatever :py:meth:`update_text` returns.
        """

        # Delegate – subclasses may have overridden *update_text* with a richer
        # signature so we pass everything through untouched.
        return self.update_text(*args, selector=selector, **kwargs)  # type: ignore[arg-type]

    # ------------------------------------------------------------------
    # Generic fallback implementation
    # ------------------------------------------------------------------
    def update_text(  # type: ignore[override]
        self,
        transform: Callable[[Any], Optional[str]],
        *,
        selector: str = "text",
        apply_exclusions: bool = False,
        **_,
    ):
        """Generic implementation that works for any object exposing *find_all*.

        Classes that require more sophisticated behaviour (parallelism, page
        delegation, etc.) are expected to *override* this method while keeping
        the same public contract.

        Parameters
        ----------
        transform
            Callable invoked once per selected element that has a ``text``
            attribute. Returning a string different from ``element.text``
            replaces the text; returning ``None`` (or any non-string) leaves
            the element untouched.
        selector
            Selector passed to ``self.find_all``.
        apply_exclusions
            Passed through to ``self.find_all``.
        **_
            Extra keyword arguments are accepted and ignored.

        Returns
        -------
        ``self``, to allow call chaining.

        Raises
        ------
        TypeError
            If ``transform`` is not callable.
        NotImplementedError
            If the object has no ``find_all`` method.
        RuntimeError
            If ``find_all`` itself raises.
        """

        if not callable(transform):
            raise TypeError("transform must be callable")

        # We rely on the presence of *find_all* to obtain elements. If the
        # subclass does not implement it then it *must* override update_text.
        if not hasattr(self, "find_all"):
            raise NotImplementedError(
                f"{self.__class__.__name__} must implement `update_text` explicitly "
                "(no `find_all` method found)."
            )

        try:
            elements_collection = self.find_all(selector=selector, apply_exclusions=apply_exclusions)
        except Exception as exc:  # pragma: no cover – defensive
            raise RuntimeError(f"Failed to gather elements with selector '{selector}': {exc}") from exc

        # `find_all` returns an ElementCollection; fall back gracefully otherwise.
        elements_iter = getattr(elements_collection, "elements", elements_collection)

        # Count while iterating instead of calling len() afterwards: the
        # fallback iterable may be a generator or another unsized iterable, and
        # len() on it would raise TypeError after the elements were mutated.
        processed = 0
        updated = 0

        for element in elements_iter:
            processed += 1
            if not hasattr(element, "text"):
                continue

            new_text = transform(element)
            # isinstance() already rejects None, so no separate None check.
            if isinstance(new_text, str) and new_text != element.text:
                element.text = new_text
                updated += 1

        logger.info(
            "%s.update_text – processed %d element(s); updated %d.",
            self.__class__.__name__,
            processed,
            updated,
        )

        return self
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: natural-pdf
3
- Version: 0.1.37
3
+ Version: 0.1.40
4
4
  Summary: A more intuitive interface for working with PDFs
5
5
  Author-email: Jonathan Soma <jonathan.soma@gmail.com>
6
6
  License-Expression: MIT
@@ -1,5 +1,6 @@
1
- natural_pdf/__init__.py,sha256=qDFJNF8sbEDO-2WSFAxoWEM8updOUP6dB-ckya0kxfs,3275
1
+ natural_pdf/__init__.py,sha256=ACwgLmWAgifAO7NtatP5c57u60g3j6YfYZd-uSL9Ly4,3455
2
2
  natural_pdf/cli.py,sha256=SkPwhhMM-GhLsj3O1n1Agxz4KOxcZ08sj8hVQSFJB5c,4064
3
+ natural_pdf/text_mixin.py,sha256=O-RECzpjcmbf5HF7LSqL5VmyexVNF8GIwj9LyJNqLgQ,3952
3
4
  natural_pdf/analyzers/__init__.py,sha256=M5oD4oEsIBWrzgTaeg4uBYrC4OPd7Mp7tz8ootKN_l8,1134
4
5
  natural_pdf/analyzers/guides.py,sha256=D55ov45PE7mhqvEnarn82y5hG6gmDzk7tYw233LnluA,141896
5
6
  natural_pdf/analyzers/shape_detection_mixin.py,sha256=Ef1o73QYVXQ2QcQMM_W9XRwY6vaIQHgxzD7etJ6LbiM,62820
@@ -27,8 +28,8 @@ natural_pdf/collections/pdf_collection.py,sha256=sDVEbFMNME_2OaHIsCoR_W7V1cAATNw
27
28
  natural_pdf/core/__init__.py,sha256=QC8H4M3KbXwMFiQORZ0pdPlzx1Ix6oKKQSS7Ib2KEaA,38
28
29
  natural_pdf/core/element_manager.py,sha256=DRZvntd99wjXy6KeDjCq5uRhjMftZop9QklOZqlUH8M,55349
29
30
  natural_pdf/core/highlighting_service.py,sha256=2tBrrEq6d6hz5f6Yf7z5TysJdlTyuHTURBnQxokJnDM,40645
30
- natural_pdf/core/page.py,sha256=MwIENkMjEKStC6RlD3SBrqmyZt_MKzrIY7vLBFIvrwY,142529
31
- natural_pdf/core/pdf.py,sha256=2hK3yRVRxEQMVy1v4w6P26VGoDpCu_3FNkYgN-LO4hA,93221
31
+ natural_pdf/core/page.py,sha256=6QaeOZqh57dWNs9-JlJl-lQWkV1cQ7QqnUxUHfpvKXE,145377
32
+ natural_pdf/core/pdf.py,sha256=-ezlEPm-D_8U2UJe_Msb0v2wCr7I2nf88bVA3im8Spo,98945
32
33
  natural_pdf/describe/__init__.py,sha256=kIV7ORmWWB1SAur7nK2aAwR-wHqSedhKfUsaUl4hG0A,586
33
34
  natural_pdf/describe/base.py,sha256=CLhZXYQO6SOPUVWLt6VwZ7MK48t_6wgPMyFMLtTCKRc,18166
34
35
  natural_pdf/describe/elements.py,sha256=JicXC9SJmmasqxalpCXA47-kVwv-6JnR3Xiu778aNHM,12634
@@ -36,11 +37,11 @@ natural_pdf/describe/mixin.py,sha256=rkX14aGrSz7Jvxx8Rbxv3eSfbO-_29DipwpstrV2pDQ
36
37
  natural_pdf/describe/summary.py,sha256=cfT4ZQkeatCDAOwWPwhtEVXisNgk6E57fAXAnoRysSU,7645
37
38
  natural_pdf/elements/__init__.py,sha256=S8XeiNWJ1WcgnyYKdYV1yxQlAxCCO3FfITT8MQwNbyk,41
38
39
  natural_pdf/elements/base.py,sha256=-ZAcc8lb2aSWTKcprwKTvnR6hsDGDm7T8a1Y9V38E_A,52042
39
- natural_pdf/elements/collections.py,sha256=_B03lJA1n147alE4xvn6qQ9uZWI8kb8VGxpchghqxqg,131834
40
+ natural_pdf/elements/collections.py,sha256=ko9tzZanbHsDtn0RP6p1Ah1kWH35BBxLOyrRjTokYLU,138948
40
41
  natural_pdf/elements/image.py,sha256=zu-P2Y8fRoEXf6IeZU0EYRWsgZ6I_a5vy1FA3VXTGkQ,1424
41
42
  natural_pdf/elements/line.py,sha256=mHSeV-ZABY-Cc_K_NpFL53OGtTWlexYDlMvZc8_Vrx8,3845
42
43
  natural_pdf/elements/rect.py,sha256=QuQg0Qo7XYQKBac-3Ss0n0ELV6icdPcrygWM2VWzeX8,3325
43
- natural_pdf/elements/region.py,sha256=ewY9HmV_VN6tN_VKtHj7dtk6nh7hrot-pW5Soz5iMg0,148150
44
+ natural_pdf/elements/region.py,sha256=wpCqK7zLXiwKgH2SQ1-VKjQgcNQsc3Nhmy-bRmJ6Ds4,153201
44
45
  natural_pdf/elements/text.py,sha256=giPJQaXuOBCviQ7QKVx_ZMrKFVpgQAsaCS2-kn-8mp0,20530
45
46
  natural_pdf/export/mixin.py,sha256=L1q3MIEFWuvie4j4_EmW7GT3NerbZ1as0XMUoqTS7gM,5083
46
47
  natural_pdf/exporters/__init__.py,sha256=QffoARekR6WzXEd05oxOytly4qPdBizuIF-SUkeFpig,643
@@ -58,9 +59,9 @@ natural_pdf/extraction/mixin.py,sha256=z0HNRs4x4RoioNjzg3slDeqoHbiPug0HB37bUHehq
58
59
  natural_pdf/extraction/result.py,sha256=PDaCCN2LQBbHsZy0_lrQ0ROeMsnmH1WRoXWOjk9M2o4,1825
59
60
  natural_pdf/flows/__init__.py,sha256=cUN4A8hTDLZSRr4PO2W_lR4z6hWpbNG8Seox-IIcrLU,277
60
61
  natural_pdf/flows/collections.py,sha256=iF8SsfKKb-YVIGi3m-yMRnfKgo_0n_EGhojnYK24h-Q,28493
61
- natural_pdf/flows/element.py,sha256=mKzk3B7A7sWNvu4CDvAjLr3_ZFLt--ktrSNoLfLpFxU,23940
62
- natural_pdf/flows/flow.py,sha256=ukkUqXsZmEw-QJEiVqEBLC8ktfBG2Bw56_RR1OEsd24,12802
63
- natural_pdf/flows/region.py,sha256=jRenBFh2ZmFNklNnGkzCsAM0OfMjbP8fo0p7BiVCl_k,31795
62
+ natural_pdf/flows/element.py,sha256=Tlxjhe3-4i3V6ondGTPWHHE3B3z7A3_CSamLe4X45SE,24932
63
+ natural_pdf/flows/flow.py,sha256=h_tNE0bUQPIC554rprLYoJ5ct1hYPz82FYViNZIGmMw,44068
64
+ natural_pdf/flows/region.py,sha256=QwJ5E0LplIzPf9TbdwivQZqouyVlozITzBkC2SI9gDM,34201
64
65
  natural_pdf/ocr/__init__.py,sha256=VY8hhvDPf7Gh2lB-d2QRmghLLyTy6ydxlgo1cS4dOSk,2482
65
66
  natural_pdf/ocr/engine.py,sha256=SwNlWydtHbrIghV5JD_j5B4-rnjCMYIWUIEARag-zHw,11839
66
67
  natural_pdf/ocr/engine_doctr.py,sha256=ptKrupMWoulZb-R93zr9btoe94JPWU7vlJuN7OBJEIM,17740
@@ -81,7 +82,7 @@ natural_pdf/search/search_options.py,sha256=sq_e8_jSROicD94b_xtDtLnjEr_Zsy4icjzP
81
82
  natural_pdf/search/search_service_protocol.py,sha256=Dl-Q-CrutkhZwI69scbW9EWPeYM63qxB60_EA7YqIYo,6699
82
83
  natural_pdf/search/searchable_mixin.py,sha256=hqQ_AuID5eTGRCtKYdFLZ1zF35y73uk3x1M1VW9Il8U,23514
83
84
  natural_pdf/selectors/__init__.py,sha256=oZGeqSv53EqmIZOhcnawuaGGlRg1h79vArXuZCWKm4A,123
84
- natural_pdf/selectors/parser.py,sha256=W1gZuBhGy2uHqCoExzCAFbsiVMKYSgUfCc9cr4rO1V0,37540
85
+ natural_pdf/selectors/parser.py,sha256=oa5tzX8UnTwxKbhVrH46lzdILfDObWWRWlQgy2tuwFQ,38828
85
86
  natural_pdf/tables/__init__.py,sha256=sCvCGbGsL6BiqlNxAYfVv003bIDLI11FmjHhaWfcU6w,104
86
87
  natural_pdf/tables/result.py,sha256=hrGIWDkImpdxsGzugcQKU-qrTgHwwfOigJDFdYl8aUc,3994
87
88
  natural_pdf/templates/__init__.py,sha256=jYBxzfi73vew0f6yhIh1MlRxw4F_TVN2hKQR0YXOFe0,20
@@ -98,7 +99,7 @@ natural_pdf/utils/text_extraction.py,sha256=HYWlYGPfafwzsuMyfL5oQhvcD4NobbvC_aCp
98
99
  natural_pdf/utils/visualization.py,sha256=olDkWtuVzP0NxRg0CP0DL-eXNCY7Bs-SH-2Xn-cjbo0,9370
99
100
  natural_pdf/widgets/__init__.py,sha256=QTVaUmsw__FCweFYZebwPssQxxUFUMd0wpm_cUbGZJY,181
100
101
  natural_pdf/widgets/viewer.py,sha256=KW3JogdR2TMg2ECUMYp8hwd060hfg8EsYBWxb5IEzBY,24942
101
- natural_pdf-0.1.37.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
102
+ natural_pdf-0.1.40.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
102
103
  optimization/memory_comparison.py,sha256=F90D_5WhliSGAct_lyx93xd4q4F-jeo8QpGyDr8tmNw,6543
103
104
  optimization/pdf_analyzer.py,sha256=xf6h-FNlqCpsm8NriXcs_bQZOB8eQkxgGGKVRL_jgCM,19347
104
105
  optimization/performance_analysis.py,sha256=RjAqeE3YS1r_7qTWkY6Ng5YMbb6MXJXfXX6LoVjg_xQ,13035
@@ -115,8 +116,8 @@ tools/bad_pdf_eval/llm_enrich.py,sha256=mCh4KGi1HmIkzGjj5rrHz1Osd7sEX1IZ_FW08H1t
115
116
  tools/bad_pdf_eval/llm_enrich_with_retry.py,sha256=XUtPF1hUvqd3frDXT0wDTXoonuAivhjM5vgFdZ-tm0A,9373
116
117
  tools/bad_pdf_eval/reporter.py,sha256=e1g__mkSB4q02p3mGWOwMhvFs7F2HJosNBxup0-LkyU,400
117
118
  tools/bad_pdf_eval/utils.py,sha256=hR95XQ7qf7Cu6BdyX0L7ggGVx-ah5sK0jHWblTJUUic,4896
118
- natural_pdf-0.1.37.dist-info/METADATA,sha256=1POawL7Edgjod2Qt1TO-2DhUkVesip-OnB0KkQCgGQ0,6739
119
- natural_pdf-0.1.37.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
120
- natural_pdf-0.1.37.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
121
- natural_pdf-0.1.37.dist-info/top_level.txt,sha256=oZlRzSc3nZ9sV3L6kD_Di734Pp62ANrm46imFVa51qQ,58
122
- natural_pdf-0.1.37.dist-info/RECORD,,
119
+ natural_pdf-0.1.40.dist-info/METADATA,sha256=igAI5dRWEeocwR0_BpNgtA8Rw5dOVuoGZ0kJwh08q3Y,6739
120
+ natural_pdf-0.1.40.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
121
+ natural_pdf-0.1.40.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
122
+ natural_pdf-0.1.40.dist-info/top_level.txt,sha256=oZlRzSc3nZ9sV3L6kD_Di734Pp62ANrm46imFVa51qQ,58
123
+ natural_pdf-0.1.40.dist-info/RECORD,,