natural-pdf 0.1.23__py3-none-any.whl → 0.1.26.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/analyzers/shape_detection_mixin.py +40 -0
- natural_pdf/core/highlighting_service.py +4 -4
- natural_pdf/core/page.py +82 -9
- natural_pdf/describe/base.py +11 -1
- natural_pdf/describe/summary.py +28 -2
- natural_pdf/elements/base.py +2 -2
- natural_pdf/elements/collections.py +139 -100
- natural_pdf/elements/line.py +9 -4
- natural_pdf/elements/region.py +173 -16
- natural_pdf/elements/text.py +65 -8
- natural_pdf/flows/region.py +116 -1
- natural_pdf/qa/document_qa.py +224 -113
- natural_pdf/utils/packaging.py +23 -9
- natural_pdf/utils/text_extraction.py +34 -14
- {natural_pdf-0.1.23.dist-info → natural_pdf-0.1.26.dev0.dist-info}/METADATA +2 -1
- {natural_pdf-0.1.23.dist-info → natural_pdf-0.1.26.dev0.dist-info}/RECORD +20 -20
- {natural_pdf-0.1.23.dist-info → natural_pdf-0.1.26.dev0.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.23.dist-info → natural_pdf-0.1.26.dev0.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.23.dist-info → natural_pdf-0.1.26.dev0.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.23.dist-info → natural_pdf-0.1.26.dev0.dist-info}/top_level.txt +0 -0
natural_pdf/elements/region.py
CHANGED
@@ -1210,6 +1210,24 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
1210
1210
|
# Try lattice first, then fall back to stream if no meaningful results
|
1211
1211
|
logger.debug(f"Region {self.bbox}: Auto-detecting table extraction method...")
|
1212
1212
|
|
1213
|
+
# --- NEW: Prefer already-created table_cell regions if they exist --- #
|
1214
|
+
try:
|
1215
|
+
cell_regions_in_table = [
|
1216
|
+
c
|
1217
|
+
for c in self.page.find_all("region[type=table_cell]", apply_exclusions=False)
|
1218
|
+
if self.intersects(c)
|
1219
|
+
]
|
1220
|
+
except Exception as _cells_err:
|
1221
|
+
cell_regions_in_table = [] # Fallback silently
|
1222
|
+
|
1223
|
+
if cell_regions_in_table:
|
1224
|
+
logger.debug(
|
1225
|
+
f"Region {self.bbox}: Found {len(cell_regions_in_table)} pre-computed table_cell regions – using 'cells' method."
|
1226
|
+
)
|
1227
|
+
return self._extract_table_from_cells(cell_regions_in_table)
|
1228
|
+
|
1229
|
+
# --------------------------------------------------------------- #
|
1230
|
+
|
1213
1231
|
try:
|
1214
1232
|
logger.debug(f"Region {self.bbox}: Trying 'lattice' method first...")
|
1215
1233
|
lattice_result = self.extract_table(
|
@@ -1905,19 +1923,55 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
1905
1923
|
logger.info(
|
1906
1924
|
f"Region {self.bbox}: Removing existing OCR elements before applying new OCR."
|
1907
1925
|
)
|
1908
|
-
# Find all OCR elements in this region
|
1909
|
-
ocr_selector = "text[source=ocr]"
|
1910
|
-
ocr_elements = self.find_all(ocr_selector)
|
1911
1926
|
|
1912
|
-
|
1913
|
-
|
1914
|
-
|
1915
|
-
|
1916
|
-
|
1917
|
-
|
1918
|
-
|
1919
|
-
|
1920
|
-
|
1927
|
+
# --- Robust removal: iterate through all OCR elements on the page and
|
1928
|
+
# remove those that overlap this region. This avoids reliance on
|
1929
|
+
# identity‐based look-ups that can break if the ElementManager
|
1930
|
+
# rebuilt its internal lists.
|
1931
|
+
|
1932
|
+
removed_count = 0
|
1933
|
+
|
1934
|
+
# Helper to remove a single element safely
|
1935
|
+
def _safe_remove(elem):
|
1936
|
+
nonlocal removed_count
|
1937
|
+
success = False
|
1938
|
+
if hasattr(elem, "page") and hasattr(elem.page, "_element_mgr"):
|
1939
|
+
etype = getattr(elem, "object_type", "word")
|
1940
|
+
if etype == "word":
|
1941
|
+
etype_key = "words"
|
1942
|
+
elif etype == "char":
|
1943
|
+
etype_key = "chars"
|
1944
|
+
else:
|
1945
|
+
etype_key = etype + "s" if not etype.endswith("s") else etype
|
1946
|
+
try:
|
1947
|
+
success = elem.page._element_mgr.remove_element(elem, etype_key)
|
1948
|
+
except Exception:
|
1949
|
+
success = False
|
1950
|
+
if success:
|
1951
|
+
removed_count += 1
|
1952
|
+
|
1953
|
+
# Remove OCR WORD elements overlapping region
|
1954
|
+
for word in list(self.page._element_mgr.words):
|
1955
|
+
if getattr(word, "source", None) == "ocr" and self.intersects(word):
|
1956
|
+
_safe_remove(word)
|
1957
|
+
|
1958
|
+
# Remove OCR CHAR dicts overlapping region
|
1959
|
+
for char in list(self.page._element_mgr.chars):
|
1960
|
+
# char can be dict or TextElement; normalise
|
1961
|
+
char_src = char.get("source") if isinstance(char, dict) else getattr(char, "source", None)
|
1962
|
+
if char_src == "ocr":
|
1963
|
+
# Rough bbox for dicts
|
1964
|
+
if isinstance(char, dict):
|
1965
|
+
cx0, ctop, cx1, cbottom = char.get("x0", 0), char.get("top", 0), char.get("x1", 0), char.get("bottom", 0)
|
1966
|
+
else:
|
1967
|
+
cx0, ctop, cx1, cbottom = char.x0, char.top, char.x1, char.bottom
|
1968
|
+
# Quick overlap check
|
1969
|
+
if not (cx1 < self.x0 or cx0 > self.x1 or cbottom < self.top or ctop > self.bottom):
|
1970
|
+
_safe_remove(char)
|
1971
|
+
|
1972
|
+
logger.info(
|
1973
|
+
f"Region {self.bbox}: Removed {removed_count} existing OCR elements (words & chars) before re-applying OCR."
|
1974
|
+
)
|
1921
1975
|
|
1922
1976
|
ocr_mgr = self.page._parent._ocr_manager
|
1923
1977
|
|
@@ -1978,8 +2032,17 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
1978
2032
|
page_top = self.top + (img_top * scale_y)
|
1979
2033
|
page_x1 = self.x0 + (img_x1 * scale_x)
|
1980
2034
|
page_bottom = self.top + (img_bottom * scale_y)
|
2035
|
+
raw_conf = result.get("confidence")
|
2036
|
+
# Convert confidence to float unless it is None/invalid
|
2037
|
+
try:
|
2038
|
+
confidence_val = float(raw_conf) if raw_conf is not None else None
|
2039
|
+
except (TypeError, ValueError):
|
2040
|
+
confidence_val = None
|
2041
|
+
|
2042
|
+
text_val = result.get("text") # May legitimately be None in detect_only mode
|
2043
|
+
|
1981
2044
|
element_data = {
|
1982
|
-
"text":
|
2045
|
+
"text": text_val,
|
1983
2046
|
"x0": page_x0,
|
1984
2047
|
"top": page_top,
|
1985
2048
|
"x1": page_x1,
|
@@ -1988,7 +2051,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
1988
2051
|
"height": page_bottom - page_top,
|
1989
2052
|
"object_type": "word",
|
1990
2053
|
"source": "ocr",
|
1991
|
-
"confidence":
|
2054
|
+
"confidence": confidence_val,
|
1992
2055
|
"fontname": "OCR",
|
1993
2056
|
"size": round(pdf_height) if pdf_height > 0 else 10.0,
|
1994
2057
|
"page_number": self.page.number,
|
@@ -2324,12 +2387,12 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
2324
2387
|
|
2325
2388
|
def ask(
|
2326
2389
|
self,
|
2327
|
-
question: str,
|
2390
|
+
question: Union[str, List[str], Tuple[str, ...]],
|
2328
2391
|
min_confidence: float = 0.1,
|
2329
2392
|
model: str = None,
|
2330
2393
|
debug: bool = False,
|
2331
2394
|
**kwargs,
|
2332
|
-
) -> Dict[str, Any]:
|
2395
|
+
) -> Union[Dict[str, Any], List[Dict[str, Any]]]:
|
2333
2396
|
"""
|
2334
2397
|
Ask a question about the region content using document QA.
|
2335
2398
|
|
@@ -2870,4 +2933,98 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
2870
2933
|
self.metadata = {}
|
2871
2934
|
self.metadata["analysis"] = value
|
2872
2935
|
|
2936
|
+
# ------------------------------------------------------------------
|
2937
|
+
# New helper: build table from pre-computed table_cell regions
|
2938
|
+
# ------------------------------------------------------------------
|
2939
|
+
|
2940
|
+
def _extract_table_from_cells(self, cell_regions: List["Region"]) -> List[List[Optional[str]]]:
    """Construct a table (list of rows) from ``table_cell`` regions.

    Placement strategy:

    1. If *every* cell carries usable ``metadata["row_index"]`` and
       ``metadata["col_index"]`` values (as written by
       ``detect_table_structure_from_lines()``), those indices are used
       directly.  "Usable" means convertible to a non-negative ``int``;
       negative values are rejected because they would silently wrap around
       through Python's negative indexing and overwrite other cells.
    2. Otherwise the layout is derived purely from geometry: cell centres are
       clustered (1.0 unit tolerance) into rows and columns and each cell is
       assigned to the nearest row/column centre.

    Args:
        cell_regions: Cell regions to arrange.  Each must provide
            ``extract_text(layout=..., apply_exclusions=...)`` and, for the
            geometric fallback, ``x0`` / ``x1`` / ``top`` / ``bottom``.

    Returns:
        A rectangular grid of stripped cell texts.  ``None`` marks positions
        that have no cell, whose text is empty, or whose text could not be
        extracted.  An empty list is returned when ``cell_regions`` is empty.
    """
    if not cell_regions:
        return []

    def _explicit_position(cell):
        """Return ``(row, col)`` from the cell's metadata, or None if unusable."""
        metadata = getattr(cell, "metadata", None)
        try:
            row = int(metadata.get("row_index"))
            col = int(metadata.get("col_index"))
        except (AttributeError, TypeError, ValueError, OverflowError):
            return None
        if row < 0 or col < 0:
            return None
        return row, col

    def _place(grid, row, col, cell) -> None:
        """Write the cell's stripped text into ``grid``; skip the cell on failure."""
        try:
            raw = cell.extract_text(layout=False, apply_exclusions=False)
        except Exception as err:
            # Best effort: one unreadable cell must not discard the whole table.
            logger.debug("Skipping table cell %r during table assembly: %s", cell, err)
            return
        # extract_text may return None; treat that like empty text.
        grid[row][col] = (raw or "").strip() or None

    # ------------------------------------------------------------------
    # Preferred path: explicit indices on every cell
    # ------------------------------------------------------------------
    positions = []
    for cell in cell_regions:
        position = _explicit_position(cell)
        if position is None:
            # Not all cells have usable indices - switch to geometric ordering.
            positions = []
            break
        positions.append(position)

    if positions:
        num_rows = max(row for row, _ in positions) + 1
        num_cols = max(col for _, col in positions) + 1
        table_grid: List[List[Optional[str]]] = [[None] * num_cols for _ in range(num_rows)]
        for cell, (row, col) in zip(cell_regions, positions):
            _place(table_grid, row, col, cell)
        return table_grid

    # ------------------------------------------------------------------
    # Fallback: derive order purely from geometry if indices are absent
    # ------------------------------------------------------------------
    centers = [
        ((c.x0 + c.x1) / 2.0, (c.top + c.bottom) / 2.0) for c in cell_regions
    ]

    def _cluster(vals, tol=1.0):
        """Group sorted values whose neighbours differ by <= tol; return group means."""
        ordered = sorted(vals)
        groups = [[ordered[0]]]
        for v in ordered[1:]:
            if abs(v - groups[-1][-1]) <= tol:
                groups[-1].append(v)
            else:
                groups.append([v])
        return [sum(g) / len(g) for g in groups]

    def _nearest(value, anchors):
        """Index of the anchor closest to ``value`` (first one wins on ties)."""
        return min(range(len(anchors)), key=lambda i: abs(value - anchors[i]))

    row_centers = _cluster([cy for _, cy in centers])
    col_centers = _cluster([cx for cx, _ in centers])

    table_grid = [[None] * len(col_centers) for _ in range(len(row_centers))]

    # Assign each cell to nearest row & col center
    for cell, (cx, cy) in zip(cell_regions, centers):
        _place(table_grid, _nearest(cy, row_centers), _nearest(cx, col_centers), cell)

    return table_grid
|
3029
|
+
|
2873
3030
|
|
natural_pdf/elements/text.py
CHANGED
@@ -43,9 +43,58 @@ class TextElement(Element):
|
|
43
43
|
|
44
44
|
@text.setter
def text(self, value: str):
    """Set the text content and synchronise underlying char dictionaries (if any).

    The new value is always stored in ``self._obj["text"]``.  If the element
    also has a non-empty ``_char_dicts`` list, that list is updated so that
    utilities reading the raw character dictionaries see the new text:

    * one char dict: its ``"text"`` becomes the whole new value;
    * several char dicts: texts are overwritten character by character,
      surplus dicts get ``""``, and if the new value is longer, copies of the
      last dict are appended with ``x0``/``x1`` advanced by an approximate
      character width per extra character.

    Any failure while synchronising is logged at debug level and otherwise
    ignored; the primary text value has already been updated by then.

    Args:
        value: The new text content.
    """
    # Update the primary text value stored on the object itself
    self._obj["text"] = value

    try:
        char_dicts = getattr(self, "_char_dicts", None)
        if not isinstance(char_dicts, list) or not char_dicts:
            return  # Nothing to update

        if len(char_dicts) == 1:
            # Simple case - a single char dict represents the whole text
            char_dicts[0]["text"] = value
            return

        # Overwrite existing dicts character by character; dicts beyond the
        # end of the new text are blanked rather than removed.
        for idx, char_dict in enumerate(char_dicts):
            char_dict["text"] = value[idx] if idx < len(value) else ""

        # Capture the length *before* appending.  Measuring the list inside
        # the loop (as it grows) would make every extra character receive the
        # same one-width shift and stack them on top of each other.
        original_count = len(char_dicts)
        if len(value) > original_count:
            last_dict = char_dicts[-1]
            # Approximate advance per extra character (best-effort fallback).
            char_width = last_dict.get("adv") or (
                last_dict.get("width", 0) / max(len(self.text), 1)
            )
            can_shift = isinstance(char_width, (int, float)) and char_width > 0
            for extra_idx in range(original_count, len(value)):
                new_dict = last_dict.copy()
                new_dict["text"] = value[extra_idx]
                if can_shift:
                    shift = char_width * (extra_idx - original_count + 1)
                    new_dict["x0"] = last_dict.get("x0", 0) + shift
                    new_dict["x1"] = last_dict.get("x1", 0) + shift
                char_dicts.append(new_dict)
    except Exception as sync_err:  # pragma: no cover
        # Keep failures silent but logged; better to have outdated chars than crash.
        import logging

        logger = logging.getLogger(__name__)
        logger.debug(f"TextElement: Failed to sync _char_dicts after text update: {sync_err}")
|
97
|
+
|
49
98
|
@property
|
50
99
|
def source(self) -> str:
|
51
100
|
"""Get the source of this text element (pdf or ocr)."""
|
@@ -151,20 +200,28 @@ class TextElement(Element):
|
|
151
200
|
# Default to black
|
152
201
|
return (0, 0, 0)
|
153
202
|
|
154
|
-
def extract_text(self, keep_blank_chars=True, strip: Optional[bool] = True, **kwargs) -> str:
    """
    Return the text held by this element.

    Args:
        keep_blank_chars: Accepted only for API compatibility; not used.
        strip: When truthy (the default) leading and trailing whitespace is
            removed.  Pass ``strip=False`` to get the stored text unchanged.
        **kwargs: Accepted and ignored.

    Returns:
        The element's text, or an empty string if the element has no text.
    """
    content = self.text
    if not content:
        # Missing or empty text is always reported as an empty string.
        return ""
    return content.strip() if strip else content
|
168
225
|
|
169
226
|
def contains(self, substring: str, case_sensitive: bool = True) -> bool:
|
170
227
|
"""
|
natural_pdf/flows/region.py
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
import logging
|
2
|
-
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
|
2
|
+
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union, Callable
|
3
3
|
|
4
4
|
from pdfplumber.utils.geometry import objects_to_bbox # For calculating combined bbox
|
5
5
|
|
@@ -519,3 +519,118 @@ class FlowRegion:
|
|
519
519
|
)
|
520
520
|
except Exception:
|
521
521
|
return True # If error during check, assume empty to be safe
|
522
|
+
|
523
|
+
# ------------------------------------------------------------------
|
524
|
+
# Table extraction helpers (delegates to underlying physical regions)
|
525
|
+
# ------------------------------------------------------------------
|
526
|
+
|
527
|
+
def extract_table(
    self,
    method: Optional[str] = None,
    table_settings: Optional[dict] = None,
    use_ocr: bool = False,
    ocr_config: Optional[dict] = None,
    text_options: Optional[Dict] = None,
    cell_extraction_func: Optional[Callable[["PhysicalRegion"], Optional[str]]] = None,
    show_progress: bool = False,
    **kwargs,
) -> List[List[Optional[str]]]:
    """Extract one logical table spanning the whole FlowRegion.

    Each entry of ``self.constituent_regions`` is asked for its table via its
    own ``extract_table`` method, and the rows that come back are joined
    into a single list in the order the regions are stored.

    Args:
        method, use_ocr, ocr_config, cell_extraction_func, show_progress:
            Passed unchanged to every constituent region.
        table_settings: Passed to every region as a fresh shallow copy
            (an empty dict when omitted).
        text_options: Passed to every region as a fresh shallow copy
            (an empty dict when omitted).
        **kwargs: Extra keyword arguments passed through to each region.

    Returns:
        All rows from all constituent regions.  Regions that return nothing,
        or whose extraction raises, contribute no rows (errors are logged).
        An empty list is returned when there are no constituent regions.
    """
    settings_template = {} if table_settings is None else table_settings
    text_template = {} if text_options is None else text_options

    if not self.constituent_regions:
        return []

    collected: List[List[Optional[str]]] = []

    for part in self.constituent_regions:
        try:
            part_rows = part.extract_table(
                method=method,
                # Copies keep one region's call from affecting the next.
                table_settings=settings_template.copy(),
                use_ocr=use_ocr,
                ocr_config=ocr_config,
                text_options=text_template.copy(),
                cell_extraction_func=cell_extraction_func,
                show_progress=show_progress,
                **kwargs,
            )
            # An empty result simply means no table in this region.
            if part_rows:
                collected += part_rows
        except Exception as exc:
            logger.error(
                f"FlowRegion.extract_table: Error extracting table from constituent region {part}: {exc}",
                exc_info=True,
            )

    return collected
|
591
|
+
|
592
|
+
def extract_tables(
    self,
    method: Optional[str] = None,
    table_settings: Optional[dict] = None,
    **kwargs,
) -> List[List[List[Optional[str]]]]:
    """Extract every table found in the FlowRegion.

    Each entry of ``self.constituent_regions`` is asked for its tables via
    its own ``extract_tables`` method; the per-region lists are joined into
    one list in the order the regions are stored.

    Args:
        method: Passed unchanged to every constituent region.
        table_settings: Passed to every region as a fresh shallow copy
            (an empty dict when omitted).
        **kwargs: Extra keyword arguments passed through to each region.

    Returns:
        A list of tables, each table being a list of rows.  Regions that
        return nothing, or whose extraction raises, contribute no tables
        (errors are logged).  An empty list is returned when there are no
        constituent regions.
    """
    settings_template = {} if table_settings is None else table_settings

    if not self.constituent_regions:
        return []

    collected: List[List[List[Optional[str]]]] = []

    for part in self.constituent_regions:
        try:
            part_tables = part.extract_tables(
                method=method,
                table_settings=settings_template.copy(),
                **kwargs,
            )
            # The per-region result is a list and may be empty.
            if part_tables:
                collected += part_tables
        except Exception as exc:
            logger.error(
                f"FlowRegion.extract_tables: Error extracting tables from constituent region {part}: {exc}",
                exc_info=True,
            )

    return collected
|