natural-pdf 0.1.23__py3-none-any.whl → 0.1.26.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1210,6 +1210,24 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
1210
1210
  # Try lattice first, then fall back to stream if no meaningful results
1211
1211
  logger.debug(f"Region {self.bbox}: Auto-detecting table extraction method...")
1212
1212
 
1213
+ # --- NEW: Prefer already-created table_cell regions if they exist --- #
1214
+ try:
1215
+ cell_regions_in_table = [
1216
+ c
1217
+ for c in self.page.find_all("region[type=table_cell]", apply_exclusions=False)
1218
+ if self.intersects(c)
1219
+ ]
1220
+ except Exception as _cells_err:
1221
+ cell_regions_in_table = [] # Fallback silently
1222
+
1223
+ if cell_regions_in_table:
1224
+ logger.debug(
1225
+ f"Region {self.bbox}: Found {len(cell_regions_in_table)} pre-computed table_cell regions – using 'cells' method."
1226
+ )
1227
+ return self._extract_table_from_cells(cell_regions_in_table)
1228
+
1229
+ # --------------------------------------------------------------- #
1230
+
1213
1231
  try:
1214
1232
  logger.debug(f"Region {self.bbox}: Trying 'lattice' method first...")
1215
1233
  lattice_result = self.extract_table(
@@ -1905,19 +1923,55 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
1905
1923
  logger.info(
1906
1924
  f"Region {self.bbox}: Removing existing OCR elements before applying new OCR."
1907
1925
  )
1908
- # Find all OCR elements in this region
1909
- ocr_selector = "text[source=ocr]"
1910
- ocr_elements = self.find_all(ocr_selector)
1911
1926
 
1912
- if ocr_elements:
1913
- logger.info(
1914
- f"Region {self.bbox}: Found {len(ocr_elements)} existing OCR elements to remove."
1915
- )
1916
- # Remove these elements from their page
1917
- removed_count = ocr_elements.remove()
1918
- logger.info(f"Region {self.bbox}: Removed {removed_count} OCR elements.")
1919
- else:
1920
- logger.info(f"Region {self.bbox}: No existing OCR elements found to remove.")
1927
+ # --- Robust removal: iterate through all OCR elements on the page and
1928
+ # remove those that overlap this region. This avoids reliance on
1929
+ # identity‐based look-ups that can break if the ElementManager
1930
+ # rebuilt its internal lists.
1931
+
1932
+ removed_count = 0
1933
+
1934
+ # Helper to remove a single element safely
1935
+ def _safe_remove(elem):
1936
+ nonlocal removed_count
1937
+ success = False
1938
+ if hasattr(elem, "page") and hasattr(elem.page, "_element_mgr"):
1939
+ etype = getattr(elem, "object_type", "word")
1940
+ if etype == "word":
1941
+ etype_key = "words"
1942
+ elif etype == "char":
1943
+ etype_key = "chars"
1944
+ else:
1945
+ etype_key = etype + "s" if not etype.endswith("s") else etype
1946
+ try:
1947
+ success = elem.page._element_mgr.remove_element(elem, etype_key)
1948
+ except Exception:
1949
+ success = False
1950
+ if success:
1951
+ removed_count += 1
1952
+
1953
+ # Remove OCR WORD elements overlapping region
1954
+ for word in list(self.page._element_mgr.words):
1955
+ if getattr(word, "source", None) == "ocr" and self.intersects(word):
1956
+ _safe_remove(word)
1957
+
1958
+ # Remove OCR CHAR dicts overlapping region
1959
+ for char in list(self.page._element_mgr.chars):
1960
+ # char can be dict or TextElement; normalise
1961
+ char_src = char.get("source") if isinstance(char, dict) else getattr(char, "source", None)
1962
+ if char_src == "ocr":
1963
+ # Rough bbox for dicts
1964
+ if isinstance(char, dict):
1965
+ cx0, ctop, cx1, cbottom = char.get("x0", 0), char.get("top", 0), char.get("x1", 0), char.get("bottom", 0)
1966
+ else:
1967
+ cx0, ctop, cx1, cbottom = char.x0, char.top, char.x1, char.bottom
1968
+ # Quick overlap check
1969
+ if not (cx1 < self.x0 or cx0 > self.x1 or cbottom < self.top or ctop > self.bottom):
1970
+ _safe_remove(char)
1971
+
1972
+ logger.info(
1973
+ f"Region {self.bbox}: Removed {removed_count} existing OCR elements (words & chars) before re-applying OCR."
1974
+ )
1921
1975
 
1922
1976
  ocr_mgr = self.page._parent._ocr_manager
1923
1977
 
@@ -1978,8 +2032,17 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
1978
2032
  page_top = self.top + (img_top * scale_y)
1979
2033
  page_x1 = self.x0 + (img_x1 * scale_x)
1980
2034
  page_bottom = self.top + (img_bottom * scale_y)
2035
+ raw_conf = result.get("confidence")
2036
+ # Convert confidence to float unless it is None/invalid
2037
+ try:
2038
+ confidence_val = float(raw_conf) if raw_conf is not None else None
2039
+ except (TypeError, ValueError):
2040
+ confidence_val = None
2041
+
2042
+ text_val = result.get("text") # May legitimately be None in detect_only mode
2043
+
1981
2044
  element_data = {
1982
- "text": result["text"],
2045
+ "text": text_val,
1983
2046
  "x0": page_x0,
1984
2047
  "top": page_top,
1985
2048
  "x1": page_x1,
@@ -1988,7 +2051,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
1988
2051
  "height": page_bottom - page_top,
1989
2052
  "object_type": "word",
1990
2053
  "source": "ocr",
1991
- "confidence": float(result.get("confidence", 0.0)),
2054
+ "confidence": confidence_val,
1992
2055
  "fontname": "OCR",
1993
2056
  "size": round(pdf_height) if pdf_height > 0 else 10.0,
1994
2057
  "page_number": self.page.number,
@@ -2324,12 +2387,12 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
2324
2387
 
2325
2388
  def ask(
2326
2389
  self,
2327
- question: str,
2390
+ question: Union[str, List[str], Tuple[str, ...]],
2328
2391
  min_confidence: float = 0.1,
2329
2392
  model: str = None,
2330
2393
  debug: bool = False,
2331
2394
  **kwargs,
2332
- ) -> Dict[str, Any]:
2395
+ ) -> Union[Dict[str, Any], List[Dict[str, Any]]]:
2333
2396
  """
2334
2397
  Ask a question about the region content using document QA.
2335
2398
 
@@ -2870,4 +2933,98 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
2870
2933
  self.metadata = {}
2871
2934
  self.metadata["analysis"] = value
2872
2935
 
2936
+ # ------------------------------------------------------------------
2937
+ # New helper: build table from pre-computed table_cell regions
2938
+ # ------------------------------------------------------------------
2939
+
2940
def _extract_table_from_cells(
    self, cell_regions: List["Region"], cluster_tolerance: float = 1.0
) -> List[List[Optional[str]]]:
    """Build a table (list of rows) from pre-computed ``table_cell`` regions.

    Each cell is placed into a grid using ``metadata["row_index"]`` and
    ``metadata["col_index"]`` when *every* cell carries valid, non-negative
    integer indices. Otherwise the grid is derived purely from geometry:
    cell centres are clustered into rows (by Y) and columns (by X) and each
    cell is assigned to the nearest row/column centre.

    Args:
        cell_regions: Cell regions to assemble. Each must expose ``x0``,
            ``x1``, ``top``, ``bottom`` and ``extract_text(...)``; ``metadata``
            is optional.
        cluster_tolerance: Maximum gap between consecutive sorted centre
            coordinates for them to be grouped into the same row/column in
            the geometric fallback.

    Returns:
        A rectangular grid of stripped cell texts. Slots with no cell, empty
        text, or a cell whose text could not be extracted are ``None``. An
        empty list is returned when ``cell_regions`` is empty.
    """
    if not cell_regions:
        return []

    def _explicit_position(cell):
        """Return ``(row, col)`` from cell metadata, or None if unusable."""
        metadata = getattr(cell, "metadata", None)
        try:
            row = int(metadata.get("row_index"))
            col = int(metadata.get("col_index"))
        except (AttributeError, TypeError, ValueError):
            return None
        # Negative indices would silently wrap around when indexing the grid.
        if row < 0 or col < 0:
            return None
        return row, col

    def _cluster(values):
        """Group sorted values whose consecutive gap is within tolerance; return group means."""
        ordered = sorted(values)
        groups = [[ordered[0]]]
        for value in ordered[1:]:
            if abs(value - groups[-1][-1]) <= cluster_tolerance:
                groups[-1].append(value)
            else:
                groups.append([value])
        return [sum(group) / len(group) for group in groups]

    def _nearest(centers, value):
        """Index of the centre closest to ``value`` (first one wins on ties)."""
        return min(range(len(centers)), key=lambda i: abs(value - centers[i]))

    def _fill(num_rows, num_cols, placements):
        """Create the grid and write each cell's stripped text into its slot."""
        grid: List[List[Optional[str]]] = [[None] * num_cols for _ in range(num_rows)]
        for cell, row, col in placements:
            try:
                raw = cell.extract_text(layout=False, apply_exclusions=False)
            except Exception as cell_err:
                # Best effort: one bad cell must not discard the whole table.
                logger.debug(
                    f"Region {getattr(self, 'bbox', None)}: Skipping table cell "
                    f"at row {row}, col {col}: {cell_err}"
                )
                continue
            if not isinstance(raw, str):
                continue
            # If two cells map to the same slot, the later one wins.
            grid[row][col] = raw.strip() or None
        return grid

    # Preferred path: explicit indices, only when every cell provides them.
    positions = [_explicit_position(cell) for cell in cell_regions]
    if all(pos is not None for pos in positions):
        num_rows = max(row for row, _ in positions) + 1
        num_cols = max(col for _, col in positions) + 1
        placements = [(cell, row, col) for cell, (row, col) in zip(cell_regions, positions)]
        return _fill(num_rows, num_cols, placements)

    # Fallback: derive ordering purely from geometry (cell centres).
    centers = [
        ((cell.x0 + cell.x1) / 2.0, (cell.top + cell.bottom) / 2.0) for cell in cell_regions
    ]
    row_centers = _cluster([cy for _, cy in centers])
    col_centers = _cluster([cx for cx, _ in centers])
    placements = [
        (cell, _nearest(row_centers, cy), _nearest(col_centers, cx))
        for cell, (cx, cy) in zip(cell_regions, centers)
    ]
    return _fill(len(row_centers), len(col_centers), placements)
3029
+
2873
3030
 
@@ -43,9 +43,58 @@ class TextElement(Element):
43
43
 
44
44
@text.setter
def text(self, value: str):
    """Set the text content and synchronise underlying char dictionaries (if any).

    The new value is always written to ``self._obj["text"]``. If the element
    also has a non-empty ``_char_dicts`` list, it is updated on a best-effort
    basis so utilities reading the raw character dictionaries see the new text:

    * one char dict: its ``"text"`` is set to the whole value;
    * several char dicts: texts are rewritten character by character, surplus
      dicts get ``""``, and if the new text is longer, copies of the last char
      dict are appended, each shifted one further character width to the right.

    Any failure while synchronising is logged at debug level and never raised.

    Args:
        value: The new text. ``None`` is stored as-is on the element and
            treated as an empty string when rewriting multiple char dicts.
    """
    # Update the primary text value stored on the object itself.
    self._obj["text"] = value

    try:
        char_dicts = getattr(self, "_char_dicts", None)
        if not isinstance(char_dicts, list) or not char_dicts:
            return  # Nothing to synchronise.

        if len(char_dicts) == 1:
            # Simple case: a single char dict represents the whole text.
            char_dicts[0]["text"] = value
            return

        chars = "" if value is None else value
        # Capture the count *before* appending; the list grows below and the
        # shift must be measured from the original last character.
        original_count = len(char_dicts)

        for idx, char_dict in enumerate(char_dicts):
            char_dict["text"] = chars[idx] if idx < len(chars) else ""

        if len(chars) <= original_count:
            return

        last_dict = char_dicts[-1]
        # NOTE(review): the width/len(text) fallback is kept from the original;
        # confirm whether "width" here is a per-character or per-word width.
        char_width = last_dict.get("adv") or (
            last_dict.get("width", 0) / max(len(self.text), 1)
        )
        can_shift = isinstance(char_width, (int, float)) and char_width > 0

        for offset, extra_char in enumerate(chars[original_count:], start=1):
            new_dict = last_dict.copy()
            new_dict["text"] = extra_char
            if can_shift:
                # Approximate geometry: the n-th extra char sits n widths
                # to the right of the last real character.
                shift = char_width * offset
                new_dict["x0"] = last_dict.get("x0", 0) + shift
                new_dict["x1"] = last_dict.get("x1", 0) + shift
            char_dicts.append(new_dict)
    except Exception as sync_err:  # pragma: no cover
        # Keep failures silent but logged; better to have outdated chars than crash.
        import logging

        logger = logging.getLogger(__name__)
        logger.debug(f"TextElement: Failed to sync _char_dicts after text update: {sync_err}")
97
+
49
98
  @property
50
99
  def source(self) -> str:
51
100
  """Get the source of this text element (pdf or ocr)."""
@@ -151,20 +200,28 @@ class TextElement(Element):
151
200
  # Default to black
152
201
  return (0, 0, 0)
153
202
 
154
def extract_text(self, keep_blank_chars=True, strip: Optional[bool] = True, **kwargs) -> str:
    """
    Return this element's text, stripped of surrounding whitespace by default.

    Args:
        keep_blank_chars: Accepted for API compatibility; not used here.
        strip: When truthy (the default), leading and trailing whitespace is
            removed. Pass ``strip=False`` to get the text exactly as stored.
        **kwargs: Accepted and ignored.

    Returns:
        The element's text, or an empty string when the text is missing/empty.
    """
    content = self.text
    if not content:
        # Missing or empty text is normalised to an empty string.
        content = ""

    return content.strip() if strip else content
168
225
 
169
226
  def contains(self, substring: str, case_sensitive: bool = True) -> bool:
170
227
  """
@@ -1,5 +1,5 @@
1
1
  import logging
2
- from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
2
+ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union, Callable
3
3
 
4
4
  from pdfplumber.utils.geometry import objects_to_bbox # For calculating combined bbox
5
5
 
@@ -519,3 +519,118 @@ class FlowRegion:
519
519
  )
520
520
  except Exception:
521
521
  return True # If error during check, assume empty to be safe
522
+
523
+ # ------------------------------------------------------------------
524
+ # Table extraction helpers (delegates to underlying physical regions)
525
+ # ------------------------------------------------------------------
526
+
527
def extract_table(
    self,
    method: Optional[str] = None,
    table_settings: Optional[dict] = None,
    use_ocr: bool = False,
    ocr_config: Optional[dict] = None,
    text_options: Optional[Dict] = None,
    cell_extraction_func: Optional[Callable[["PhysicalRegion"], Optional[str]]] = None,
    show_progress: bool = False,
    **kwargs,
) -> List[List[Optional[str]]]:
    """Extract a single logical table from the FlowRegion.

    Iterates over ``self.constituent_regions`` in their stored order, calls
    each region's ``extract_table`` with the same arguments, and concatenates
    the returned rows. The signature mirrors
    :py:meth:`natural_pdf.elements.region.Region.extract_table`.

    Args:
        method: Forwarded unchanged to each region's ``extract_table``.
        table_settings: Forwarded as a fresh shallow copy per region.
        use_ocr: Forwarded unchanged.
        ocr_config: Forwarded as a fresh shallow copy per region (``None``
            stays ``None``), so a region that mutates it cannot affect later
            regions or the caller's dict.
        text_options: Forwarded as a fresh shallow copy per region.
        cell_extraction_func: Forwarded unchanged.
        show_progress: Forwarded unchanged.
        **kwargs: Additional keyword arguments forwarded to each region.

    Returns:
        The rows of all constituent regions, appended in iteration order.
        Regions that return nothing, or whose extraction raises, contribute
        no rows (errors are logged). Returns ``[]`` when there are no
        constituent regions.
    """
    if table_settings is None:
        table_settings = {}
    if text_options is None:
        text_options = {}

    if not self.constituent_regions:
        return []

    aggregated_rows: List[List[Optional[str]]] = []

    for region in self.constituent_regions:
        try:
            region_rows = region.extract_table(
                method=method,
                # Per-region copies so one region's mutations cannot leak
                # into the next region or back to the caller.
                table_settings=table_settings.copy(),
                use_ocr=use_ocr,
                ocr_config=None if ocr_config is None else dict(ocr_config),
                text_options=text_options.copy(),
                cell_extraction_func=cell_extraction_func,
                show_progress=show_progress,
                **kwargs,
            )

            # ``region_rows`` can legitimately be [] if no table was found.
            if region_rows:
                aggregated_rows.extend(region_rows)
        except Exception as e:
            # Best effort: a failing region is logged and skipped.
            logger.error(
                f"FlowRegion.extract_table: Error extracting table from constituent region {region}: {e}",
                exc_info=True,
            )

    return aggregated_rows
591
+
592
def extract_tables(
    self,
    method: Optional[str] = None,
    table_settings: Optional[dict] = None,
    **kwargs,
) -> List[List[List[Optional[str]]]]:
    """Collect every table found in the FlowRegion's constituent regions.

    Calls ``extract_tables`` on each region in ``self.constituent_regions``
    and concatenates the returned lists, keeping the regions' order.

    Args:
        method: Forwarded unchanged to each region's ``extract_tables``.
        table_settings: Forwarded as a fresh shallow copy per region;
            ``None`` is treated as an empty dict.
        **kwargs: Additional keyword arguments forwarded to each region.

    Returns:
        A list of tables (each a list of rows). Regions that return nothing,
        or whose extraction raises, contribute no tables (errors are logged).
        Returns ``[]`` when there are no constituent regions.
    """
    base_settings = {} if table_settings is None else table_settings

    regions = self.constituent_regions
    if not regions:
        return []

    all_tables: List[List[List[Optional[str]]]] = []
    for region in regions:
        try:
            found = region.extract_tables(
                method=method,
                table_settings=base_settings.copy(),
                **kwargs,
            )
            # An empty (or falsy) result simply adds nothing.
            if found:
                all_tables += found
        except Exception as e:
            logger.error(
                f"FlowRegion.extract_tables: Error extracting tables from constituent region {region}: {e}",
                exc_info=True,
            )

    return all_tables