natural-pdf 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (134)
  1. natural_pdf/__init__.py +3 -0
  2. natural_pdf/analyzers/layout/base.py +1 -5
  3. natural_pdf/analyzers/layout/gemini.py +61 -51
  4. natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
  5. natural_pdf/analyzers/layout/layout_manager.py +26 -84
  6. natural_pdf/analyzers/layout/layout_options.py +7 -0
  7. natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
  8. natural_pdf/analyzers/layout/surya.py +46 -123
  9. natural_pdf/analyzers/layout/tatr.py +51 -4
  10. natural_pdf/analyzers/text_structure.py +3 -5
  11. natural_pdf/analyzers/utils.py +3 -3
  12. natural_pdf/classification/manager.py +422 -0
  13. natural_pdf/classification/mixin.py +163 -0
  14. natural_pdf/classification/results.py +80 -0
  15. natural_pdf/collections/mixins.py +111 -0
  16. natural_pdf/collections/pdf_collection.py +434 -15
  17. natural_pdf/core/element_manager.py +83 -0
  18. natural_pdf/core/highlighting_service.py +13 -22
  19. natural_pdf/core/page.py +578 -93
  20. natural_pdf/core/pdf.py +912 -460
  21. natural_pdf/elements/base.py +134 -40
  22. natural_pdf/elements/collections.py +712 -109
  23. natural_pdf/elements/region.py +722 -69
  24. natural_pdf/elements/text.py +4 -1
  25. natural_pdf/export/mixin.py +137 -0
  26. natural_pdf/exporters/base.py +3 -3
  27. natural_pdf/exporters/paddleocr.py +5 -4
  28. natural_pdf/extraction/manager.py +135 -0
  29. natural_pdf/extraction/mixin.py +279 -0
  30. natural_pdf/extraction/result.py +23 -0
  31. natural_pdf/ocr/__init__.py +5 -5
  32. natural_pdf/ocr/engine_doctr.py +346 -0
  33. natural_pdf/ocr/engine_easyocr.py +6 -3
  34. natural_pdf/ocr/ocr_factory.py +24 -4
  35. natural_pdf/ocr/ocr_manager.py +122 -26
  36. natural_pdf/ocr/ocr_options.py +94 -11
  37. natural_pdf/ocr/utils.py +19 -6
  38. natural_pdf/qa/document_qa.py +0 -4
  39. natural_pdf/search/__init__.py +20 -34
  40. natural_pdf/search/haystack_search_service.py +309 -265
  41. natural_pdf/search/haystack_utils.py +99 -75
  42. natural_pdf/search/search_service_protocol.py +11 -12
  43. natural_pdf/selectors/parser.py +431 -230
  44. natural_pdf/utils/debug.py +3 -3
  45. natural_pdf/utils/identifiers.py +1 -1
  46. natural_pdf/utils/locks.py +8 -0
  47. natural_pdf/utils/packaging.py +8 -6
  48. natural_pdf/utils/text_extraction.py +60 -1
  49. natural_pdf/utils/tqdm_utils.py +51 -0
  50. natural_pdf/utils/visualization.py +18 -0
  51. natural_pdf/widgets/viewer.py +4 -25
  52. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +17 -3
  53. natural_pdf-0.1.9.dist-info/RECORD +80 -0
  54. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
  55. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
  56. docs/api/index.md +0 -386
  57. docs/assets/favicon.png +0 -3
  58. docs/assets/favicon.svg +0 -3
  59. docs/assets/javascripts/custom.js +0 -17
  60. docs/assets/logo.svg +0 -3
  61. docs/assets/sample-screen.png +0 -0
  62. docs/assets/social-preview.png +0 -17
  63. docs/assets/social-preview.svg +0 -17
  64. docs/assets/stylesheets/custom.css +0 -65
  65. docs/document-qa/index.ipynb +0 -435
  66. docs/document-qa/index.md +0 -79
  67. docs/element-selection/index.ipynb +0 -915
  68. docs/element-selection/index.md +0 -229
  69. docs/finetuning/index.md +0 -176
  70. docs/index.md +0 -170
  71. docs/installation/index.md +0 -69
  72. docs/interactive-widget/index.ipynb +0 -962
  73. docs/interactive-widget/index.md +0 -12
  74. docs/layout-analysis/index.ipynb +0 -818
  75. docs/layout-analysis/index.md +0 -185
  76. docs/ocr/index.md +0 -209
  77. docs/pdf-navigation/index.ipynb +0 -314
  78. docs/pdf-navigation/index.md +0 -97
  79. docs/regions/index.ipynb +0 -816
  80. docs/regions/index.md +0 -294
  81. docs/tables/index.ipynb +0 -658
  82. docs/tables/index.md +0 -144
  83. docs/text-analysis/index.ipynb +0 -370
  84. docs/text-analysis/index.md +0 -105
  85. docs/text-extraction/index.ipynb +0 -1478
  86. docs/text-extraction/index.md +0 -292
  87. docs/tutorials/01-loading-and-extraction.ipynb +0 -194
  88. docs/tutorials/01-loading-and-extraction.md +0 -95
  89. docs/tutorials/02-finding-elements.ipynb +0 -340
  90. docs/tutorials/02-finding-elements.md +0 -149
  91. docs/tutorials/03-extracting-blocks.ipynb +0 -147
  92. docs/tutorials/03-extracting-blocks.md +0 -48
  93. docs/tutorials/04-table-extraction.ipynb +0 -114
  94. docs/tutorials/04-table-extraction.md +0 -50
  95. docs/tutorials/05-excluding-content.ipynb +0 -270
  96. docs/tutorials/05-excluding-content.md +0 -109
  97. docs/tutorials/06-document-qa.ipynb +0 -332
  98. docs/tutorials/06-document-qa.md +0 -91
  99. docs/tutorials/07-layout-analysis.ipynb +0 -288
  100. docs/tutorials/07-layout-analysis.md +0 -66
  101. docs/tutorials/07-working-with-regions.ipynb +0 -413
  102. docs/tutorials/07-working-with-regions.md +0 -151
  103. docs/tutorials/08-spatial-navigation.ipynb +0 -508
  104. docs/tutorials/08-spatial-navigation.md +0 -190
  105. docs/tutorials/09-section-extraction.ipynb +0 -2434
  106. docs/tutorials/09-section-extraction.md +0 -256
  107. docs/tutorials/10-form-field-extraction.ipynb +0 -512
  108. docs/tutorials/10-form-field-extraction.md +0 -201
  109. docs/tutorials/11-enhanced-table-processing.ipynb +0 -54
  110. docs/tutorials/11-enhanced-table-processing.md +0 -9
  111. docs/tutorials/12-ocr-integration.ipynb +0 -604
  112. docs/tutorials/12-ocr-integration.md +0 -175
  113. docs/tutorials/13-semantic-search.ipynb +0 -1328
  114. docs/tutorials/13-semantic-search.md +0 -77
  115. docs/visual-debugging/index.ipynb +0 -2970
  116. docs/visual-debugging/index.md +0 -157
  117. docs/visual-debugging/region.png +0 -0
  118. natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -415
  119. natural_pdf/templates/spa/css/style.css +0 -334
  120. natural_pdf/templates/spa/index.html +0 -31
  121. natural_pdf/templates/spa/js/app.js +0 -472
  122. natural_pdf/templates/spa/words.txt +0 -235976
  123. natural_pdf/widgets/frontend/viewer.js +0 -88
  124. natural_pdf-0.1.7.dist-info/RECORD +0 -145
  125. notebooks/Examples.ipynb +0 -1293
  126. pdfs/.gitkeep +0 -0
  127. pdfs/01-practice.pdf +0 -543
  128. pdfs/0500000US42001.pdf +0 -0
  129. pdfs/0500000US42007.pdf +0 -0
  130. pdfs/2014 Statistics.pdf +0 -0
  131. pdfs/2019 Statistics.pdf +0 -0
  132. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  133. pdfs/needs-ocr.pdf +0 -0
  134. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0
natural_pdf/analyzers/layout/pdfplumber_table_finder.py
@@ -0,0 +1,142 @@
+ from typing import Any, Dict, List, Optional, Tuple, Union
+
+ # Attempt to import pdfplumber modules
+ import pdfplumber.table as pdfplumber_table
+
+ # Type Definitions
+ T_num = Union[int, float]
+ T_bbox = Tuple[T_num, T_num, T_num, T_num]
+ T_obj = Dict[str, Any]
+ T_obj_list = List[T_obj]
+ T_intersections = Dict[Tuple[T_num, T_num], Dict[str, T_obj_list]]
+ T_cell_dict = Dict[str, T_num]
+
+ # Use defaults directly from pdfplumber.table (or placeholders if not installed)
+ DEFAULT_SNAP_TOLERANCE = pdfplumber_table.DEFAULT_SNAP_TOLERANCE
+ DEFAULT_JOIN_TOLERANCE = pdfplumber_table.DEFAULT_JOIN_TOLERANCE
+ DEFAULT_MIN_WORDS_VERTICAL = pdfplumber_table.DEFAULT_MIN_WORDS_VERTICAL
+ DEFAULT_MIN_WORDS_HORIZONTAL = pdfplumber_table.DEFAULT_MIN_WORDS_HORIZONTAL
+
+ # --- Main Function ---
+
+
+ def find_text_based_tables(
+ bboxes: List[T_bbox],
+ snap_tolerance: T_num = DEFAULT_SNAP_TOLERANCE,
+ join_tolerance: T_num = DEFAULT_JOIN_TOLERANCE,
+ min_words_vertical: int = DEFAULT_MIN_WORDS_VERTICAL,
+ min_words_horizontal: int = DEFAULT_MIN_WORDS_HORIZONTAL,
+ intersection_tolerance: T_num = 3,
+ snap_x_tolerance: Optional[T_num] = None,
+ snap_y_tolerance: Optional[T_num] = None,
+ join_x_tolerance: Optional[T_num] = None,
+ join_y_tolerance: Optional[T_num] = None,
+ intersection_x_tolerance: Optional[T_num] = None,
+ intersection_y_tolerance: Optional[T_num] = None,
+ ) -> Dict[str, Union[T_obj_list, List[T_cell_dict], T_intersections]]:
+ """
+ Finds table structures based on text element alignment using imported
+ pdfplumber functions. Accepts a list of bounding box tuples.
+
+ Args:
+ bboxes: A list of bounding box tuples (x0, top, x1, bottom).
+ snap_tolerance: General tolerance for snapping edges.
+ join_tolerance: General tolerance for joining nearby edges.
+ min_words_vertical: Minimum words to form a vertical edge.
+ min_words_horizontal: Minimum words to form a horizontal edge.
+ intersection_tolerance: General tolerance for intersections.
+ snap_x_tolerance: Specific horizontal snap tolerance (overrides general).
+ snap_y_tolerance: Specific vertical snap tolerance (overrides general).
+ join_x_tolerance: Specific horizontal join tolerance (overrides general).
+ join_y_tolerance: Specific vertical join tolerance (overrides general).
+ intersection_x_tolerance: Specific horizontal intersection tolerance.
+ intersection_y_tolerance: Specific vertical intersection tolerance.
+
+
+ Returns:
+ A dictionary containing:
+ - 'horizontal_edges': List of merged horizontal edge dictionaries.
+ - 'vertical_edges': List of merged vertical edge dictionaries.
+ - 'cells': List of dictionaries [{'left': x0, 'top': top, 'right': x1, 'bottom': bottom}, ...]
+ representing detected cells, ready for page.region().
+ - 'intersections': Dictionary of intersection points and the edges forming them.
+
+ Raises:
+ ImportError: If the 'pdfplumber' library is not installed when this function is called.
+ """
+
+ if not bboxes:
+ return {"horizontal_edges": [], "vertical_edges": [], "cells": [], "intersections": {}}
+
+ # Convert BBoxes to Dictionaries required by pdfplumber functions
+ text_elements = []
+ for i, bbox in enumerate(bboxes):
+ x0, top, x1, bottom = bbox
+ # Basic structure needed for words_to_edges_h/v
+ text_elements.append(
+ {
+ "x0": x0,
+ "top": top,
+ "x1": x1,
+ "bottom": bottom,
+ "width": x1 - x0,
+ "height": bottom - top,
+ "text": f"elem_{i}", # Placeholder text
+ "object_type": "char", # Mimic word/char structure loosely
+ }
+ )
+
+ # Resolve tolerances
+ sx = snap_x_tolerance if snap_x_tolerance is not None else snap_tolerance
+ sy = snap_y_tolerance if snap_y_tolerance is not None else snap_tolerance
+ jx = join_x_tolerance if join_x_tolerance is not None else join_tolerance
+ jy = join_y_tolerance if join_y_tolerance is not None else join_tolerance
+ ix = (
+ intersection_x_tolerance if intersection_x_tolerance is not None else intersection_tolerance
+ )
+ iy = (
+ intersection_y_tolerance if intersection_y_tolerance is not None else intersection_tolerance
+ )
+
+ # --- pdfplumber Pipeline ---
+ h_edges = pdfplumber_table.words_to_edges_h(text_elements, word_threshold=min_words_horizontal)
+ v_edges = pdfplumber_table.words_to_edges_v(text_elements, word_threshold=min_words_vertical)
+ initial_edges = h_edges + v_edges
+
+ if not initial_edges:
+ return {"horizontal_edges": [], "vertical_edges": [], "cells": [], "intersections": {}}
+
+ merged_edges = pdfplumber_table.merge_edges(initial_edges, sx, sy, jx, jy)
+ merged_h = [e for e in merged_edges if e["orientation"] == "h"]
+ merged_v = [e for e in merged_edges if e["orientation"] == "v"]
+
+ if not merged_edges:
+ return {
+ "horizontal_edges": merged_h,
+ "vertical_edges": merged_v,
+ "cells": [],
+ "intersections": {},
+ }
+
+ intersections = pdfplumber_table.edges_to_intersections(merged_edges, ix, iy)
+ if not intersections:
+ return {
+ "horizontal_edges": merged_h,
+ "vertical_edges": merged_v,
+ "cells": [],
+ "intersections": intersections,
+ }
+
+ cell_tuples = pdfplumber_table.intersections_to_cells(intersections)
+
+ # Convert cell tuples to dictionaries for page.region()
+ cell_dicts = []
+ for x0, top, x1, bottom in cell_tuples:
+ cell_dicts.append({"left": x0, "top": top, "right": x1, "bottom": bottom})
+
+ return {
+ "horizontal_edges": merged_h,
+ "vertical_edges": merged_v,
+ "cells": cell_dicts,
+ "intersections": intersections,
+ }
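For orientation, here is a minimal usage sketch of the new helper, not taken from the package docs. It assumes a natural-pdf `Page` whose word elements expose pdfplumber-style bounding boxes; the `page.words`, `w.bbox`, and `page.region(left=..., top=..., right=..., bottom=...)` calls are assumptions based on the docstring above, not verified against this release.

```python
# Minimal sketch (assumed API): feed word bounding boxes into the new
# text-based table finder, then turn the detected cells into regions.
from natural_pdf import PDF
from natural_pdf.analyzers.layout.pdfplumber_table_finder import find_text_based_tables

pdf = PDF("example.pdf")  # hypothetical input file
page = pdf.pages[0]

# (x0, top, x1, bottom) tuples for each word; `page.words` / `.bbox` are assumed here
bboxes = [w.bbox for w in page.words]

result = find_text_based_tables(bboxes, snap_tolerance=3, min_words_horizontal=2)

# Each cell dict uses the left/top/right/bottom keys described in the docstring,
# so it can be splatted into page.region() (assumed signature).
cell_regions = [page.region(**cell) for cell in result["cells"]]
print(f"{len(result['cells'])} candidate table cells found")
```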
natural_pdf/analyzers/layout/surya.py
@@ -20,6 +20,7 @@ TableRecPredictor = None

  if surya_spec:
  try:
+ from surya.common.util import expand_bbox, rescale_bbox
  from surya.layout import LayoutPredictor
  from surya.table_rec import TableRecPredictor
  except ImportError as e:
@@ -74,25 +75,10 @@ class SuryaLayoutDetector(LayoutDetector):
  raise TypeError("Incorrect options type provided for Surya model loading.")
  self.logger.info(f"Loading Surya models (device={options.device})...")
  models = {}
- try:
- models["layout"] = LayoutPredictor()
- models["table_rec"] = TableRecPredictor()
- self.logger.info("Surya LayoutPredictor and TableRecPredictor loaded.")
- return models
- except Exception as e:
- self.logger.error(f"Failed to load Surya models: {e}", exc_info=True)
- raise
-
- def _expand_bbox(
- self, bbox: Tuple[float, float, float, float], padding: int, max_width: int, max_height: int
- ) -> Tuple[int, int, int, int]:
- """Expand bbox by padding, clamping to max dimensions."""
- x0, y0, x1, y1 = bbox
- x0 = max(0, int(x0 - padding))
- y0 = max(0, int(y0 - padding))
- x1 = min(max_width, int(x1 + padding))
- y1 = min(max_height, int(y1 + padding))
- return x0, y0, x1, y1
+ models["layout"] = LayoutPredictor()
+ models["table_rec"] = TableRecPredictor()
+ self.logger.info("Surya LayoutPredictor and TableRecPredictor loaded.")
+ return models

  def detect(self, image: Image.Image, options: BaseLayoutOptions) -> List[Dict[str, Any]]:
  """Detect layout elements and optionally table structure in an image using Surya."""
@@ -114,19 +100,12 @@ class SuryaLayoutDetector(LayoutDetector):

  # Extract page reference and scaling factors from extra_args (passed by LayoutAnalyzer)
  self._page_ref = options.extra_args.get("_page_ref")
- img_scale_x = options.extra_args.get("_img_scale_x")
- img_scale_y = options.extra_args.get("_img_scale_y")

  # We still need this check, otherwise later steps that need these vars will fail
- can_do_table_rec = (
- options.recognize_table_structure
- and self._page_ref
- and img_scale_x is not None
- and img_scale_y is not None
- )
+ can_do_table_rec = options.recognize_table_structure
  if options.recognize_table_structure and not can_do_table_rec:
  logger.warning(
- "Surya table recognition cannot proceed without page reference and scaling factors. Disabling."
+ "Surya table recognition cannot proceed without page reference. Disabling."
  )
  options.recognize_table_structure = False

@@ -141,14 +120,12 @@ class SuryaLayoutDetector(LayoutDetector):
  table_rec_predictor = models["table_rec"]

  input_image = image.convert("RGB")
- input_image_list = [input_image]

- initial_layout_detections = [] # Detections relative to input_image
+ initial_layout_detections = []
  tables_to_process = []

- # --- Initial Layout Detection ---
  self.logger.debug("Running Surya layout prediction...")
- layout_predictions = layout_predictor(input_image_list)
+ layout_predictions = layout_predictor([input_image])
  self.logger.debug(f"Surya prediction returned {len(layout_predictions)} results.")
  if not layout_predictions:
  return []
@@ -164,6 +141,7 @@ class SuryaLayoutDetector(LayoutDetector):
  )

  for layout_box in prediction.bboxes:
+
  class_name_orig = layout_box.label
  normalized_class = self._normalize_class_name(class_name_orig)
  score = float(layout_box.confidence)
@@ -196,7 +174,6 @@ class SuryaLayoutDetector(LayoutDetector):
  f"Surya initially detected {len(initial_layout_detections)} layout elements matching criteria."
  )

- # --- Table Structure Recognition (Optional) ---
  if not options.recognize_table_structure or not tables_to_process:
  self.logger.debug(
  "Skipping Surya table structure recognition (disabled or no tables found)."
@@ -207,59 +184,29 @@ class SuryaLayoutDetector(LayoutDetector):
  f"Attempting Surya table structure recognition for {len(tables_to_process)} tables..."
  )
  high_res_crops = []
- pdf_offsets = [] # Store (pdf_x0, pdf_y0) for each crop

  high_res_dpi = getattr(self._page_ref._parent, "_config", {}).get(
  "surya_table_rec_dpi", 192
  )
- bbox_padding = getattr(self._page_ref._parent, "_config", {}).get(
- "surya_table_bbox_padding", 10
+ high_res_page_image = self._page_ref.to_image(
+ resolution=high_res_dpi, include_highlights=False, scale=1.0
  )
- pdf_to_highres_scale = high_res_dpi / 72.0

  # Render high-res page ONCE
  self.logger.debug(
- f"Rendering page {self._page_ref.number} at {high_res_dpi} DPI for table recognition..."
- )
- high_res_page_image = self._page_ref.to_image(
- resolution=high_res_dpi, include_highlights=False
- )
- if not high_res_page_image:
- raise RuntimeError(f"Failed to render page {self._page_ref.number} at high resolution.")
- self.logger.debug(
- f" High-res image size: {high_res_page_image.width}x{high_res_page_image.height}"
+ f"Rendering page {self._page_ref.number} at {high_res_dpi} DPI for table recognition, size {high_res_page_image.width}x{high_res_page_image.height}."
  )

+ source_tables = []
  for i, table_detection in enumerate(tables_to_process):
- img_x0, img_y0, img_x1, img_y1 = table_detection["bbox"]
-
- # PDF coords
- pdf_x0 = img_x0 * img_scale_x
- pdf_y0 = img_y0 * img_scale_y
- pdf_x1 = img_x1 * img_scale_x
- pdf_y1 = img_y1 * img_scale_y
- pdf_x0 = max(0, pdf_x0)
- pdf_y0 = max(0, pdf_y0)
- pdf_x1 = min(self._page_ref.width, pdf_x1)
- pdf_y1 = min(self._page_ref.height, pdf_y1)
-
- # High-res image coords
- hr_x0 = pdf_x0 * pdf_to_highres_scale
- hr_y0 = pdf_y0 * pdf_to_highres_scale
- hr_x1 = pdf_x1 * pdf_to_highres_scale
- hr_y1 = pdf_y1 * pdf_to_highres_scale
-
- # Expand high-res bbox
- hr_x0_exp, hr_y0_exp, hr_x1_exp, hr_y1_exp = self._expand_bbox(
- (hr_x0, hr_y0, hr_x1, hr_y1),
- padding=bbox_padding,
- max_width=high_res_page_image.width,
- max_height=high_res_page_image.height,
+ highres_bbox = rescale_bbox(
+ list(table_detection["bbox"]), image.size, high_res_page_image.size
  )
+ highres_bbox = expand_bbox(highres_bbox)

- crop = high_res_page_image.crop((hr_x0_exp, hr_y0_exp, hr_x1_exp, hr_y1_exp))
+ crop = high_res_page_image.crop(highres_bbox)
  high_res_crops.append(crop)
- pdf_offsets.append((pdf_x0, pdf_y0))
+ source_tables.append(highres_bbox)

  if not high_res_crops:
  self.logger.info("No valid high-resolution table crops generated.")
@@ -267,64 +214,40 @@ class SuryaLayoutDetector(LayoutDetector):

  structure_detections = [] # Detections relative to std_res input_image

- # --- Run Table Recognition (will raise error on failure) ---
  self.logger.debug(
  f"Running Surya table recognition on {len(high_res_crops)} high-res images..."
  )
  table_predictions = table_rec_predictor(high_res_crops)
  self.logger.debug(f"Surya table recognition returned {len(table_predictions)} results.")

- # --- Process Results ---
- if len(table_predictions) != len(pdf_offsets):
- # This case is less likely if predictor didn't error, but good sanity check
- raise RuntimeError(
- f"Mismatch between table inputs ({len(pdf_offsets)}) and predictions ({len(table_predictions)})."
- )
+ def build_row_item(element, source_table_bbox, label):
+ adjusted_bbox = [
+ float(element.bbox[0] + source_table_bbox[0]),
+ float(element.bbox[1] + source_table_bbox[1]),
+ float(element.bbox[2] + source_table_bbox[0]),
+ float(element.bbox[3] + source_table_bbox[1]),
+ ]
+
+ adjusted_bbox = rescale_bbox(adjusted_bbox, high_res_page_image.size, image.size)
+
+ return {
+ "bbox": adjusted_bbox,
+ "class": label,
+ "confidence": 1.0,
+ "normalized_class": label,
+ "source": "layout",
+ "model": "surya",
+ }
+
+ for table_pred, source_table_bbox in zip(table_predictions, source_tables):
+ for box in table_pred.rows:
+ structure_detections.append(build_row_item(box, source_table_bbox, "table-row"))
+
+ for box in table_pred.cols:
+ structure_detections.append(build_row_item(box, source_table_bbox, "table-column"))

- for table_pred, (offset_pdf_x0, offset_pdf_y0) in zip(table_predictions, pdf_offsets):
- # Process Rows
- for row_box in table_pred.rows:
- crop_rx0, crop_ry0, crop_rx1, crop_ry1 = map(float, row_box.bbox)
- pdf_row_x0 = offset_pdf_x0 + crop_rx0 / pdf_to_highres_scale
- pdf_row_y0 = offset_pdf_y0 + crop_ry0 / pdf_to_highres_scale
- pdf_row_x1 = offset_pdf_x0 + crop_rx1 / pdf_to_highres_scale
- pdf_row_y1 = offset_pdf_y0 + crop_ry1 / pdf_to_highres_scale
- img_row_x0 = pdf_row_x0 / img_scale_x
- img_row_y0 = pdf_row_y0 / img_scale_y
- img_row_x1 = pdf_row_x1 / img_scale_x
- img_row_y1 = pdf_row_y1 / img_scale_y
- structure_detections.append(
- {
- "bbox": (img_row_x0, img_row_y0, img_row_x1, img_row_y1),
- "class": "table-row",
- "confidence": 1.0,
- "normalized_class": "table-row",
- "source": "layout",
- "model": "surya",
- }
- )
-
- # Process Columns
- for col_box in table_pred.cols:
- crop_cx0, crop_cy0, crop_cx1, crop_cy1 = map(float, col_box.bbox)
- pdf_col_x0 = offset_pdf_x0 + crop_cx0 / pdf_to_highres_scale
- pdf_col_y0 = offset_pdf_y0 + crop_cy0 / pdf_to_highres_scale
- pdf_col_x1 = offset_pdf_x0 + crop_cx1 / pdf_to_highres_scale
- pdf_col_y1 = offset_pdf_y0 + crop_cy1 / pdf_to_highres_scale
- img_col_x0 = pdf_col_x0 / img_scale_x
- img_col_y0 = pdf_col_y0 / img_scale_y
- img_col_x1 = pdf_col_x1 / img_scale_x
- img_col_y1 = pdf_col_y1 / img_scale_y
- structure_detections.append(
- {
- "bbox": (img_col_x0, img_col_y0, img_col_x1, img_col_y1),
- "class": "table-column",
- "confidence": 1.0,
- "normalized_class": "table-column",
- "source": "layout",
- "model": "surya",
- }
- )
+ for box in table_pred.cells:
+ structure_detections.append(build_row_item(box, source_table_bbox, "table-cell"))

  self.logger.info(f"Added {len(structure_detections)} table structure elements.")
natural_pdf/analyzers/layout/tatr.py
@@ -5,7 +5,7 @@ import os
  import tempfile
  from typing import Any, Dict, List, Optional, Tuple

- from PIL import Image
+ from PIL import Image, ImageEnhance

  # Assuming base class and options are importable
  from .base import LayoutDetector
@@ -150,6 +150,26 @@ class TableTransformerDetector(LayoutDetector):
  )
  return objects

+ def preprocess_image(self, image: Image.Image, enhance_contrast: float = 1.5) -> Image.Image:
+ """Enhance the image to improve table structure detection.
+
+ Args:
+ image: The input PIL image
+ enhance_contrast: Contrast enhancement factor (1.0 = no change)
+
+ Returns:
+ Enhanced PIL image
+ """
+ # Convert to grayscale and back to RGB for better structure detection
+ if image.mode != "L": # If not already grayscale
+ grayscale = image.convert("L")
+ enhanced = ImageEnhance.Contrast(grayscale).enhance(enhance_contrast)
+ return enhanced.convert("RGB") # Convert back to RGB for model input
+ else:
+ # Just enhance contrast if already grayscale
+ enhanced = ImageEnhance.Contrast(image).enhance(enhance_contrast)
+ return enhanced.convert("RGB")
+
  # --- End Helper Methods ---

  def detect(self, image: Image.Image, options: BaseLayoutOptions) -> List[Dict[str, Any]]:
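The new `preprocess_image` helper is plain Pillow: grayscale the input, boost contrast, and hand the model an RGB image again. A standalone sketch of the same idea (the file name is hypothetical):

```python
from PIL import Image, ImageEnhance

def enhance_for_tables(image: Image.Image, factor: float = 1.5) -> Image.Image:
    """Grayscale + contrast boost, returned as RGB for a vision model's input."""
    grayscale = image if image.mode == "L" else image.convert("L")
    enhanced = ImageEnhance.Contrast(grayscale).enhance(factor)
    return enhanced.convert("RGB")

# Example: sharpen a scanned table crop before structure detection
page_image = Image.open("table_crop.png")  # hypothetical crop of a table region
model_input = enhance_for_tables(page_image, factor=1.5)
```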
@@ -196,9 +216,17 @@ class TableTransformerDetector(LayoutDetector):
  ]
  )

+ # Use image preprocessing for better structure detection
+ enhance_contrast = (
+ options.enhance_contrast
+ if hasattr(options, "enhance_contrast")
+ else options.extra_args.get("enhance_contrast", 1.5)
+ )
+ processed_image = self.preprocess_image(image, enhance_contrast)
+
  # --- Detect Tables ---
  self.logger.debug("Running TATR table detection...")
- pixel_values = detection_transform(image.convert("RGB")).unsqueeze(0).to(device)
+ pixel_values = detection_transform(processed_image).unsqueeze(0).to(device)
  with torch.no_grad():
  outputs = detection_model(pixel_values)

@@ -271,19 +299,38 @@ class TableTransformerDetector(LayoutDetector):
  if x_max <= x_min or y_max <= y_min:
  continue # Skip invalid crop

+ # Process the cropped table for better structure detection
  cropped_table = image.crop((x_min, y_min, x_max, y_max))
  if cropped_table.width == 0 or cropped_table.height == 0:
  continue # Skip empty crop

- pixel_values_struct = structure_transform(cropped_table).unsqueeze(0).to(device)
+ processed_crop = self.preprocess_image(cropped_table, enhance_contrast)
+ pixel_values_struct = structure_transform(processed_crop).unsqueeze(0).to(device)
+
  with torch.no_grad():
  outputs_struct = structure_model(pixel_values_struct)

  structure_elements = self.outputs_to_objects(
  outputs_struct, cropped_table.size, id2label_struct
  )
+
+ # Reduce confidence threshold specifically for columns to catch more
+ column_threshold = None
+ if hasattr(options, "column_threshold") and options.column_threshold is not None:
+ column_threshold = options.column_threshold
+ else:
+ column_threshold = options.extra_args.get(
+ "column_threshold", options.confidence * 0.8
+ )
+
  structure_elements = [
- e for e in structure_elements if e["score"] >= options.confidence
+ e
+ for e in structure_elements
+ if (
+ e["score"] >= column_threshold
+ if "column" in e["label"]
+ else e["score"] >= options.confidence
+ )
  ]

  for element in structure_elements:
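The new filter keeps the general `confidence` cutoff for most structure elements but lets anything whose label contains "column" pass at a lower `column_threshold` (defaulting to 80% of the main cutoff when no option is set). In isolation the rule behaves like this (values are illustrative):

```python
# Sketch of the per-class cutoff introduced above.
confidence = 0.5
column_threshold = confidence * 0.8  # the fallback used when no option is provided

detections = [
    {"label": "table column", "score": 0.45},
    {"label": "table row", "score": 0.45},
    {"label": "table column header", "score": 0.60},
]

kept = [
    d
    for d in detections
    if (d["score"] >= column_threshold if "column" in d["label"] else d["score"] >= confidence)
]
# The 0.45 column passes (0.45 >= 0.4), the 0.45 row is dropped, and the header stays.
print([d["label"] for d in kept])
```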
natural_pdf/analyzers/text_structure.py
@@ -9,14 +9,10 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union

  from natural_pdf.analyzers.text_options import TextStyleOptions

- # Import ElementCollection and TextStyleOptions
- from natural_pdf.elements.collections import ElementCollection
-
  if TYPE_CHECKING:
  from natural_pdf.core.page import Page
  from natural_pdf.elements.base import Element
-
- # Remove ElementCollection from here if imported above
+ from natural_pdf.elements.collections import ElementCollection

  logger = logging.getLogger(__name__)

@@ -68,6 +64,8 @@ class TextStyleAnalyzer:
  ElementCollection containing all processed text elements (typically words)
  with added 'style_label', 'style_key', and 'style_properties' attributes.
  """
+ from natural_pdf.elements.collections import ElementCollection
+
  current_options = options or self.options
  logger.info(
  f"Starting text style analysis for page {page.number} with options: {current_options}"
natural_pdf/analyzers/utils.py
@@ -1,12 +1,10 @@
  import logging
  from typing import Any, Dict, List

- from ..elements.region import Region
-

  def convert_to_regions(
  page: Any, detections: List[Dict[str, Any]], scale_factor: float = 1.0
- ) -> List[Region]:
+ ) -> List["Region"]:
  """
  Convert layout detections to Region objects.

@@ -18,6 +16,8 @@ def convert_to_regions(
  Returns:
  List of Region objects with layout metadata
  """
+ from natural_pdf.elements.region import Region
+
  conversion_logger = logging.getLogger("natural_pdf.analyzers.layout.convert")
  conversion_logger.debug(
  f"Converting {len(detections)} detections to regions with scale {scale_factor}"