natural-pdf 0.1.22__py3-none-any.whl → 0.1.24__py3-none-any.whl

This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (31)
  1. natural_pdf/analyzers/shape_detection_mixin.py +43 -3
  2. natural_pdf/classification/manager.py +1 -1
  3. natural_pdf/classification/mixin.py +35 -14
  4. natural_pdf/classification/results.py +16 -1
  5. natural_pdf/cli.py +1 -0
  6. natural_pdf/core/highlighting_service.py +23 -0
  7. natural_pdf/core/page.py +32 -2
  8. natural_pdf/core/pdf.py +24 -4
  9. natural_pdf/describe/base.py +11 -1
  10. natural_pdf/describe/summary.py +26 -0
  11. natural_pdf/elements/base.py +81 -3
  12. natural_pdf/elements/collections.py +162 -101
  13. natural_pdf/elements/region.py +187 -160
  14. natural_pdf/elements/text.py +15 -7
  15. natural_pdf/exporters/paddleocr.py +1 -1
  16. natural_pdf/extraction/manager.py +2 -2
  17. natural_pdf/extraction/mixin.py +295 -11
  18. natural_pdf/extraction/result.py +28 -1
  19. natural_pdf/flows/region.py +117 -2
  20. natural_pdf/ocr/engine_surya.py +25 -5
  21. natural_pdf/qa/__init__.py +2 -1
  22. natural_pdf/qa/document_qa.py +166 -113
  23. natural_pdf/qa/qa_result.py +55 -0
  24. natural_pdf/selectors/parser.py +22 -0
  25. natural_pdf/utils/text_extraction.py +34 -14
  26. {natural_pdf-0.1.22.dist-info → natural_pdf-0.1.24.dist-info}/METADATA +22 -13
  27. {natural_pdf-0.1.22.dist-info → natural_pdf-0.1.24.dist-info}/RECORD +31 -30
  28. {natural_pdf-0.1.22.dist-info → natural_pdf-0.1.24.dist-info}/WHEEL +0 -0
  29. {natural_pdf-0.1.22.dist-info → natural_pdf-0.1.24.dist-info}/entry_points.txt +0 -0
  30. {natural_pdf-0.1.22.dist-info → natural_pdf-0.1.24.dist-info}/licenses/LICENSE +0 -0
  31. {natural_pdf-0.1.22.dist-info → natural_pdf-0.1.24.dist-info}/top_level.txt +0 -0
@@ -82,7 +82,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
82
82
  self.end_element = None
83
83
 
84
84
  self.metadata: Dict[str, Any] = {}
85
- self.analyses: Dict[str, Any] = {}
85
+ # Analysis results live under self.metadata['analysis'] via property
86
86
 
87
87
  # Standard attributes for all elements
88
88
  self.object_type = "region" # For selector compatibility
@@ -115,146 +115,28 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
115
115
  **kwargs,
116
116
  ) -> "Region":
117
117
  """
118
- Protected helper method to create a region in a specified direction relative to this region.
118
+ Region-specific wrapper around :py:meth:`DirectionalMixin._direction`.
119
119
 
120
- Args:
121
- direction: 'left', 'right', 'above', or 'below'
122
- size: Size in the primary direction (width for horizontal, height for vertical)
123
- cross_size: Size in the cross direction ('full' or 'element')
124
- include_source: Whether to include this region's area in the result
125
- until: Optional selector string to specify a boundary element
126
- include_endpoint: Whether to include the boundary element found by 'until'
127
- **kwargs: Additional parameters for the 'until' selector search
128
-
129
- Returns:
130
- Region object
131
- """
132
- import math # Use math.inf for infinity
133
-
134
- is_horizontal = direction in ("left", "right")
135
- is_positive = direction in ("right", "below") # right/below are positive directions
136
- pixel_offset = 1 # Offset for excluding elements/endpoints
137
-
138
- # 1. Determine initial boundaries based on direction and include_source
139
- if is_horizontal:
140
- # Initial cross-boundaries (vertical)
141
- y0 = 0 if cross_size == "full" else self.top
142
- y1 = self.page.height if cross_size == "full" else self.bottom
143
-
144
- # Initial primary boundaries (horizontal)
145
- if is_positive: # right
146
- x0_initial = self.x0 if include_source else self.x1 + pixel_offset
147
- x1_initial = self.x1 # This edge moves
148
- else: # left
149
- x0_initial = self.x0 # This edge moves
150
- x1_initial = self.x1 if include_source else self.x0 - pixel_offset
151
- else: # Vertical
152
- # Initial cross-boundaries (horizontal)
153
- x0 = 0 if cross_size == "full" else self.x0
154
- x1 = self.page.width if cross_size == "full" else self.x1
155
-
156
- # Initial primary boundaries (vertical)
157
- if is_positive: # below
158
- y0_initial = self.top if include_source else self.bottom + pixel_offset
159
- y1_initial = self.bottom # This edge moves
160
- else: # above
161
- y0_initial = self.top # This edge moves
162
- y1_initial = self.bottom if include_source else self.top - pixel_offset
163
-
164
- # 2. Calculate the final primary boundary, considering 'size' or page limits
165
- if is_horizontal:
166
- if is_positive: # right
167
- x1_final = min(
168
- self.page.width,
169
- x1_initial + (size if size is not None else (self.page.width - x1_initial)),
170
- )
171
- x0_final = x0_initial
172
- else: # left
173
- x0_final = max(0, x0_initial - (size if size is not None else x0_initial))
174
- x1_final = x1_initial
175
- else: # Vertical
176
- if is_positive: # below
177
- y1_final = min(
178
- self.page.height,
179
- y1_initial + (size if size is not None else (self.page.height - y1_initial)),
180
- )
181
- y0_final = y0_initial
182
- else: # above
183
- y0_final = max(0, y0_initial - (size if size is not None else y0_initial))
184
- y1_final = y1_initial
185
-
186
- # 3. Handle 'until' selector if provided
187
- target = None
188
- if until:
189
- all_matches = self.page.find_all(until, **kwargs)
190
- matches_in_direction = []
191
-
192
- # Filter and sort matches based on direction
193
- if direction == "above":
194
- matches_in_direction = [m for m in all_matches if m.bottom <= self.top]
195
- matches_in_direction.sort(key=lambda e: e.bottom, reverse=True)
196
- elif direction == "below":
197
- matches_in_direction = [m for m in all_matches if m.top >= self.bottom]
198
- matches_in_direction.sort(key=lambda e: e.top)
199
- elif direction == "left":
200
- matches_in_direction = [m for m in all_matches if m.x1 <= self.x0]
201
- matches_in_direction.sort(key=lambda e: e.x1, reverse=True)
202
- elif direction == "right":
203
- matches_in_direction = [m for m in all_matches if m.x0 >= self.x1]
204
- matches_in_direction.sort(key=lambda e: e.x0)
205
-
206
- if matches_in_direction:
207
- target = matches_in_direction[0]
208
-
209
- # Adjust the primary boundary based on the target
210
- if is_horizontal:
211
- if is_positive: # right
212
- x1_final = target.x1 if include_endpoint else target.x0 - pixel_offset
213
- else: # left
214
- x0_final = target.x0 if include_endpoint else target.x1 + pixel_offset
215
- else: # Vertical
216
- if is_positive: # below
217
- y1_final = target.bottom if include_endpoint else target.top - pixel_offset
218
- else: # above
219
- y0_final = target.top if include_endpoint else target.bottom + pixel_offset
220
-
221
- # Adjust cross boundaries if cross_size is 'element'
222
- if cross_size == "element":
223
- if is_horizontal: # Adjust y0, y1
224
- target_y0 = (
225
- target.top if include_endpoint else target.bottom
226
- ) # Use opposite boundary if excluding
227
- target_y1 = target.bottom if include_endpoint else target.top
228
- y0 = min(y0, target_y0)
229
- y1 = max(y1, target_y1)
230
- else: # Adjust x0, x1
231
- target_x0 = (
232
- target.x0 if include_endpoint else target.x1
233
- ) # Use opposite boundary if excluding
234
- target_x1 = target.x1 if include_endpoint else target.x0
235
- x0 = min(x0, target_x0)
236
- x1 = max(x1, target_x1)
237
-
238
- # 4. Finalize bbox coordinates
239
- if is_horizontal:
240
- bbox = (x0_final, y0, x1_final, y1)
241
- else:
242
- bbox = (x0, y0_final, x1, y1_final)
120
+ It performs any pre-processing required by *Region* (none currently),
121
+ delegates the core geometry work to the mix-in implementation via
122
+ ``super()``, then attaches region-level metadata before returning the
123
+ new :class:`Region` instance.
124
+ """
243
125
 
244
- # Ensure valid coordinates (x0 <= x1, y0 <= y1)
245
- final_x0 = min(bbox[0], bbox[2])
246
- final_y0 = min(bbox[1], bbox[3])
247
- final_x1 = max(bbox[0], bbox[2])
248
- final_y1 = max(bbox[1], bbox[3])
249
- final_bbox = (final_x0, final_y0, final_x1, final_y1)
126
+ # Delegate to the shared implementation on DirectionalMixin
127
+ region = super()._direction(
128
+ direction=direction,
129
+ size=size,
130
+ cross_size=cross_size,
131
+ include_source=include_source,
132
+ until=until,
133
+ include_endpoint=include_endpoint,
134
+ **kwargs,
135
+ )
250
136
 
251
- # 5. Create and return Region
252
- region = Region(self.page, final_bbox)
137
+ # Post-process: make sure callers can trace lineage and flags
253
138
  region.source_element = self
254
139
  region.includes_source = include_source
255
- # Optionally store the boundary element if found
256
- if target:
257
- region.boundary_element = target
258
140
 
259
141
  return region
260
142
 
@@ -710,7 +592,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
710
592
  self,
711
593
  scale: float = 2.0,
712
594
  resolution: float = 150,
713
- crop_only: bool = False,
595
+ crop: bool = False,
714
596
  include_highlights: bool = True,
715
597
  **kwargs,
716
598
  ) -> "Image.Image":
@@ -719,7 +601,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
719
601
 
720
602
  Args:
721
603
  resolution: Resolution in DPI for rendering (default: 150)
722
- crop_only: If True, only crop the region without highlighting its boundaries
604
+ crop: If True, only crop the region without highlighting its boundaries
723
605
  include_highlights: Whether to include existing highlights (default: True)
724
606
  **kwargs: Additional parameters for page.to_image()
725
607
 
@@ -730,7 +612,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
730
612
  page_kwargs = kwargs.copy()
731
613
  effective_resolution = resolution # Start with the provided resolution
732
614
 
733
- if crop_only and "width" in kwargs:
615
+ if crop and "width" in kwargs:
734
616
  target_width = kwargs["width"]
735
617
  # Calculate what resolution is needed to make the region crop have target_width
736
618
  region_width_points = self.width # Region width in PDF points
@@ -785,8 +667,8 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
785
667
  # Crop the image to just this region
786
668
  region_image = page_image.crop((x0, top, x1, bottom))
787
669
 
788
- # If not crop_only, add a border to highlight the region boundaries
789
- if not crop_only:
670
+ # If not crop, add a border to highlight the region boundaries
671
+ if not crop:
790
672
  from PIL import ImageDraw
791
673
 
792
674
  # Create a 1px border around the region
@@ -808,6 +690,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
808
690
  color: Optional[Union[Tuple, str]] = "blue",
809
691
  label: Optional[str] = None,
810
692
  width: Optional[int] = None, # Add width parameter
693
+ crop: bool = False, # NEW: Crop output to region bounds before legend
811
694
  ) -> "Image.Image":
812
695
  """
813
696
  Show the page with just this region highlighted temporarily.
@@ -819,6 +702,9 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
819
702
  color: Color to highlight this region (default: blue)
820
703
  label: Optional label for this region in the legend
821
704
  width: Optional width for the output image in pixels
705
+ crop: If True, crop the rendered image to this region's
706
+ bounding box (with a small margin handled inside
707
+ HighlightingService) before legends/overlays are added.
822
708
 
823
709
  Returns:
824
710
  PIL Image of the page with only this region highlighted
@@ -844,6 +730,9 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
844
730
  "use_color_cycling": False, # Explicitly false for single preview
845
731
  }
846
732
 
733
+ # Determine crop bbox if requested
734
+ crop_bbox = self.bbox if crop else None
735
+
847
736
  # Use render_preview to show only this highlight
848
737
  return service.render_preview(
849
738
  page_index=self._page.index,
@@ -852,6 +741,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
852
741
  width=width, # Pass the width parameter
853
742
  labels=labels,
854
743
  legend_position=legend_position,
744
+ crop_bbox=crop_bbox,
855
745
  )
856
746
 
857
747
  def save(
@@ -880,7 +770,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
880
770
  self,
881
771
  filename: str,
882
772
  resolution: float = 150,
883
- crop_only: bool = False,
773
+ crop: bool = False,
884
774
  include_highlights: bool = True,
885
775
  **kwargs,
886
776
  ) -> "Region":
@@ -890,7 +780,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
890
780
  Args:
891
781
  filename: Path to save the image to
892
782
  resolution: Resolution in DPI for rendering (default: 150)
893
- crop_only: If True, only crop the region without highlighting its boundaries
783
+ crop: If True, only crop the region without highlighting its boundaries
894
784
  include_highlights: Whether to include existing highlights (default: True)
895
785
  **kwargs: Additional parameters for page.to_image()
896
786
 
@@ -900,7 +790,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
900
790
  # Get the region image
901
791
  image = self.to_image(
902
792
  resolution=resolution,
903
- crop_only=crop_only,
793
+ crop=crop,
904
794
  include_highlights=include_highlights,
905
795
  **kwargs,
906
796
  )
@@ -953,7 +843,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
953
843
 
954
844
  # Get the region image
955
845
  image = work_region.to_image(
956
- resolution=resolution, crop_only=True, include_highlights=False
846
+ resolution=resolution, crop=True, include_highlights=False
957
847
  )
958
848
 
959
849
  if image is None:
@@ -1320,6 +1210,24 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
1320
1210
  # Try lattice first, then fall back to stream if no meaningful results
1321
1211
  logger.debug(f"Region {self.bbox}: Auto-detecting table extraction method...")
1322
1212
 
1213
+ # --- NEW: Prefer already-created table_cell regions if they exist --- #
1214
+ try:
1215
+ cell_regions_in_table = [
1216
+ c
1217
+ for c in self.page.find_all("region[type=table_cell]", apply_exclusions=False)
1218
+ if self.intersects(c)
1219
+ ]
1220
+ except Exception as _cells_err:
1221
+ cell_regions_in_table = [] # Fallback silently
1222
+
1223
+ if cell_regions_in_table:
1224
+ logger.debug(
1225
+ f"Region {self.bbox}: Found {len(cell_regions_in_table)} pre-computed table_cell regions – using 'cells' method."
1226
+ )
1227
+ return self._extract_table_from_cells(cell_regions_in_table)
1228
+
1229
+ # --------------------------------------------------------------- #
1230
+
1323
1231
  try:
1324
1232
  logger.debug(f"Region {self.bbox}: Trying 'lattice' method first...")
1325
1233
  lattice_result = self.extract_table(
@@ -2015,19 +1923,19 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
2015
1923
  logger.info(
2016
1924
  f"Region {self.bbox}: Removing existing OCR elements before applying new OCR."
2017
1925
  )
2018
- # Find all OCR elements in this region
2019
- ocr_selector = "text[source=ocr]"
2020
- ocr_elements = self.find_all(ocr_selector)
2021
1926
 
1927
+ # Remove existing OCR word elements strictly inside this region
1928
+ ocr_selector = "text[source=ocr]"
1929
+ ocr_elements = self.find_all(ocr_selector, apply_exclusions=False)
2022
1930
  if ocr_elements:
1931
+ removed_count = ocr_elements.remove()
2023
1932
  logger.info(
2024
- f"Region {self.bbox}: Found {len(ocr_elements)} existing OCR elements to remove."
1933
+ f"Region {self.bbox}: Removed {removed_count} existing OCR word elements in region before re-applying OCR."
2025
1934
  )
2026
- # Remove these elements from their page
2027
- removed_count = ocr_elements.remove()
2028
- logger.info(f"Region {self.bbox}: Removed {removed_count} OCR elements.")
2029
1935
  else:
2030
- logger.info(f"Region {self.bbox}: No existing OCR elements found to remove.")
1936
+ logger.info(
1937
+ f"Region {self.bbox}: No existing OCR word elements found within region to remove."
1938
+ )
2031
1939
 
2032
1940
  ocr_mgr = self.page._parent._ocr_manager
2033
1941
 
@@ -2044,7 +1952,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
2044
1952
  # Render the page region to an image using the determined resolution
2045
1953
  try:
2046
1954
  region_image = self.to_image(
2047
- resolution=final_resolution, include_highlights=False, crop_only=True
1955
+ resolution=final_resolution, include_highlights=False, crop=True
2048
1956
  )
2049
1957
  if not region_image:
2050
1958
  logger.error("Failed to render region to image for OCR.")
@@ -2088,8 +1996,17 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
2088
1996
  page_top = self.top + (img_top * scale_y)
2089
1997
  page_x1 = self.x0 + (img_x1 * scale_x)
2090
1998
  page_bottom = self.top + (img_bottom * scale_y)
1999
+ raw_conf = result.get("confidence")
2000
+ # Convert confidence to float unless it is None/invalid
2001
+ try:
2002
+ confidence_val = float(raw_conf) if raw_conf is not None else None
2003
+ except (TypeError, ValueError):
2004
+ confidence_val = None
2005
+
2006
+ text_val = result.get("text") # May legitimately be None in detect_only mode
2007
+
2091
2008
  element_data = {
2092
- "text": result["text"],
2009
+ "text": text_val,
2093
2010
  "x0": page_x0,
2094
2011
  "top": page_top,
2095
2012
  "x1": page_x1,
@@ -2098,7 +2015,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
2098
2015
  "height": page_bottom - page_top,
2099
2016
  "object_type": "word",
2100
2017
  "source": "ocr",
2101
- "confidence": float(result.get("confidence", 0.0)),
2018
+ "confidence": confidence_val,
2102
2019
  "fontname": "OCR",
2103
2020
  "size": round(pdf_height) if pdf_height > 0 else 10.0,
2104
2021
  "page_number": self.page.number,
@@ -2434,12 +2351,12 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
2434
2351
 
2435
2352
  def ask(
2436
2353
  self,
2437
- question: str,
2354
+ question: Union[str, List[str], Tuple[str, ...]],
2438
2355
  min_confidence: float = 0.1,
2439
2356
  model: str = None,
2440
2357
  debug: bool = False,
2441
2358
  **kwargs,
2442
- ) -> Dict[str, Any]:
2359
+ ) -> Union[Dict[str, Any], List[Dict[str, Any]]]:
2443
2360
  """
2444
2361
  Ask a question about the region content using document QA.
2445
2362
 
@@ -2466,7 +2383,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
2466
2383
  from natural_pdf.qa.document_qa import get_qa_engine
2467
2384
  except ImportError:
2468
2385
  logger.error(
2469
- "Question answering requires optional dependencies. Install with `pip install natural-pdf[core-ml]`"
2386
+ "Question answering requires optional dependencies. Install with `pip install natural-pdf[ai]`"
2470
2387
  )
2471
2388
  return {
2472
2389
  "answer": None,
@@ -2684,7 +2601,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
2684
2601
  img = self.to_image(
2685
2602
  resolution=resolution,
2686
2603
  include_highlights=False, # No highlights for classification input
2687
- crop_only=True, # Just the region content
2604
+ crop=True, # Just the region content
2688
2605
  )
2689
2606
  if img is None:
2690
2607
  raise ValueError(
@@ -2964,4 +2881,114 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
2964
2881
 
2965
2882
  return text_element
2966
2883
 
2884
+ # ------------------------------------------------------------------
2885
+ # Unified analysis storage (maps to metadata["analysis"])
2886
+ # ------------------------------------------------------------------
2887
+
2888
+ @property
2889
+ def analyses(self) -> Dict[str, Any]:
2890
+ if not hasattr(self, "metadata") or self.metadata is None:
2891
+ self.metadata = {}
2892
+ return self.metadata.setdefault("analysis", {})
2893
+
2894
+ @analyses.setter
2895
+ def analyses(self, value: Dict[str, Any]):
2896
+ if not hasattr(self, "metadata") or self.metadata is None:
2897
+ self.metadata = {}
2898
+ self.metadata["analysis"] = value
2899
+
2900
+ # ------------------------------------------------------------------
2901
+ # New helper: build table from pre-computed table_cell regions
2902
+ # ------------------------------------------------------------------
2903
+
2904
+ def _extract_table_from_cells(self, cell_regions: List["Region"]) -> List[List[Optional[str]]]:
2905
+ """Construct a table (list-of-lists) from table_cell regions.
2906
+
2907
+ This assumes each cell Region has metadata.row_index / col_index as written by
2908
+ detect_table_structure_from_lines(). If these keys are missing we will
2909
+ fall back to sorting by geometry.
2910
+ """
2911
+ if not cell_regions:
2912
+ return []
2913
+
2914
+ # Attempt to use explicit indices first
2915
+ all_row_idxs = []
2916
+ all_col_idxs = []
2917
+ for cell in cell_regions:
2918
+ try:
2919
+ r_idx = int(cell.metadata.get("row_index"))
2920
+ c_idx = int(cell.metadata.get("col_index"))
2921
+ all_row_idxs.append(r_idx)
2922
+ all_col_idxs.append(c_idx)
2923
+ except Exception:
2924
+ # Not all cells have indices – clear the lists so we switch to geometric sorting
2925
+ all_row_idxs = []
2926
+ all_col_idxs = []
2927
+ break
2928
+
2929
+ if all_row_idxs and all_col_idxs:
2930
+ num_rows = max(all_row_idxs) + 1
2931
+ num_cols = max(all_col_idxs) + 1
2932
+
2933
+ # Initialise blank grid
2934
+ table_grid: List[List[Optional[str]]] = [[None] * num_cols for _ in range(num_rows)]
2935
+
2936
+ for cell in cell_regions:
2937
+ try:
2938
+ r_idx = int(cell.metadata.get("row_index"))
2939
+ c_idx = int(cell.metadata.get("col_index"))
2940
+ text_val = cell.extract_text(layout=False, apply_exclusions=False).strip()
2941
+ table_grid[r_idx][c_idx] = text_val if text_val else None
2942
+ except Exception as _err:
2943
+ # Skip problematic cell
2944
+ continue
2945
+
2946
+ return table_grid
2947
+
2948
+ # ------------------------------------------------------------------
2949
+ # Fallback: derive order purely from geometry if indices are absent
2950
+ # ------------------------------------------------------------------
2951
+ # Sort unique centers to define ordering
2952
+ try:
2953
+ import numpy as np
2954
+ except ImportError:
2955
+ logger.warning("NumPy required for geometric cell ordering; returning empty result.")
2956
+ return []
2957
+
2958
+ # Build arrays of centers
2959
+ centers = np.array([
2960
+ [(c.x0 + c.x1) / 2.0, (c.top + c.bottom) / 2.0] for c in cell_regions
2961
+ ])
2962
+ xs = centers[:, 0]
2963
+ ys = centers[:, 1]
2964
+
2965
+ # Cluster unique row Y positions and column X positions with a tolerance
2966
+ def _cluster(vals, tol=1.0):
2967
+ sorted_vals = np.sort(vals)
2968
+ groups = [[sorted_vals[0]]]
2969
+ for v in sorted_vals[1:]:
2970
+ if abs(v - groups[-1][-1]) <= tol:
2971
+ groups[-1].append(v)
2972
+ else:
2973
+ groups.append([v])
2974
+ return [np.mean(g) for g in groups]
2975
+
2976
+ row_centers = _cluster(ys)
2977
+ col_centers = _cluster(xs)
2978
+
2979
+ num_rows = len(row_centers)
2980
+ num_cols = len(col_centers)
2981
+
2982
+ table_grid: List[List[Optional[str]]] = [[None] * num_cols for _ in range(num_rows)]
2983
+
2984
+ # Assign each cell to nearest row & col center
2985
+ for cell, (cx, cy) in zip(cell_regions, centers):
2986
+ row_idx = int(np.argmin([abs(cy - rc) for rc in row_centers]))
2987
+ col_idx = int(np.argmin([abs(cx - cc) for cc in col_centers]))
2988
+
2989
+ text_val = cell.extract_text(layout=False, apply_exclusions=False).strip()
2990
+ table_grid[row_idx][col_idx] = text_val if text_val else None
2991
+
2992
+ return table_grid
2993
+
2967
2994
 
@@ -151,20 +151,28 @@ class TextElement(Element):
151
151
  # Default to black
152
152
  return (0, 0, 0)
153
153
 
154
- def extract_text(self, keep_blank_chars=True, **kwargs) -> str:
154
+ def extract_text(self, keep_blank_chars=True, strip: Optional[bool] = True, **kwargs) -> str:
155
155
  """
156
156
  Extract text from this element.
157
157
 
158
158
  Args:
159
- keep_blank_chars: Whether to keep blank characters (default: True)
160
- **kwargs: Additional extraction parameters
159
+ keep_blank_chars: Retained for API compatibility (unused).
160
+ strip: If True (default) remove leading/trailing whitespace. Users may
161
+ pass ``strip=False`` to preserve whitespace exactly as stored.
162
+ **kwargs: Accepted for forward-compatibility and ignored here.
161
163
 
162
164
  Returns:
163
- Text content
165
+ The text content, optionally stripped.
164
166
  """
165
- # For text elements, keep_blank_chars doesn't affect anything as we're
166
- # simply returning the text property. Included for API consistency.
167
- return self.text
167
+ # Basic retrieval
168
+ result = self.text or ""
169
+
170
+ # Apply optional stripping – align with global convention where simple
171
+ # element extraction is stripped by default.
172
+ if strip:
173
+ result = result.strip()
174
+
175
+ return result
168
176
 
169
177
  def contains(self, substring: str, case_sensitive: bool = True) -> bool:
170
178
  """
@@ -217,7 +217,7 @@ class PaddleOCRRecognitionExporter(FinetuneExporter):
217
217
  # Expand region, render, and save image
218
218
  region = element.expand(self.padding)
219
219
  img = region.to_image(
220
- resolution=self.resolution, crop_only=True, include_highlights=False
220
+ resolution=self.resolution, crop=True, include_highlights=False
221
221
  )
222
222
  img.save(absolute_image_path, "PNG")
223
223
 
@@ -126,10 +126,10 @@ class StructuredDataManager:
126
126
  )
127
127
  parsed_data = completion.choices[0].message.parsed
128
128
  return StructuredDataResult(
129
- data=parsed_data, success=True, error_message=None, model=selected_model
129
+ data=parsed_data, success=True, error_message=None, model_used=selected_model
130
130
  )
131
131
  except Exception as e:
132
132
  logger.error(f"Extraction failed: {str(e)}")
133
133
  return StructuredDataResult(
134
- data=None, success=False, error_message=str(e), model=selected_model
134
+ data=None, success=False, error_message=str(e), model_used=selected_model
135
135
  )