natural-pdf 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +3 -0
- natural_pdf/analyzers/layout/base.py +1 -5
- natural_pdf/analyzers/layout/gemini.py +61 -51
- natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
- natural_pdf/analyzers/layout/layout_manager.py +26 -84
- natural_pdf/analyzers/layout/layout_options.py +7 -0
- natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
- natural_pdf/analyzers/layout/surya.py +46 -123
- natural_pdf/analyzers/layout/tatr.py +51 -4
- natural_pdf/analyzers/text_structure.py +3 -5
- natural_pdf/analyzers/utils.py +3 -3
- natural_pdf/classification/manager.py +422 -0
- natural_pdf/classification/mixin.py +163 -0
- natural_pdf/classification/results.py +80 -0
- natural_pdf/collections/mixins.py +111 -0
- natural_pdf/collections/pdf_collection.py +434 -15
- natural_pdf/core/element_manager.py +83 -0
- natural_pdf/core/highlighting_service.py +13 -22
- natural_pdf/core/page.py +578 -93
- natural_pdf/core/pdf.py +912 -460
- natural_pdf/elements/base.py +134 -40
- natural_pdf/elements/collections.py +712 -109
- natural_pdf/elements/region.py +722 -69
- natural_pdf/elements/text.py +4 -1
- natural_pdf/export/mixin.py +137 -0
- natural_pdf/exporters/base.py +3 -3
- natural_pdf/exporters/paddleocr.py +5 -4
- natural_pdf/extraction/manager.py +135 -0
- natural_pdf/extraction/mixin.py +279 -0
- natural_pdf/extraction/result.py +23 -0
- natural_pdf/ocr/__init__.py +5 -5
- natural_pdf/ocr/engine_doctr.py +346 -0
- natural_pdf/ocr/engine_easyocr.py +6 -3
- natural_pdf/ocr/ocr_factory.py +24 -4
- natural_pdf/ocr/ocr_manager.py +122 -26
- natural_pdf/ocr/ocr_options.py +94 -11
- natural_pdf/ocr/utils.py +19 -6
- natural_pdf/qa/document_qa.py +0 -4
- natural_pdf/search/__init__.py +20 -34
- natural_pdf/search/haystack_search_service.py +309 -265
- natural_pdf/search/haystack_utils.py +99 -75
- natural_pdf/search/search_service_protocol.py +11 -12
- natural_pdf/selectors/parser.py +431 -230
- natural_pdf/utils/debug.py +3 -3
- natural_pdf/utils/identifiers.py +1 -1
- natural_pdf/utils/locks.py +8 -0
- natural_pdf/utils/packaging.py +8 -6
- natural_pdf/utils/text_extraction.py +60 -1
- natural_pdf/utils/tqdm_utils.py +51 -0
- natural_pdf/utils/visualization.py +18 -0
- natural_pdf/widgets/viewer.py +4 -25
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +17 -3
- natural_pdf-0.1.9.dist-info/RECORD +80 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
- docs/api/index.md +0 -386
- docs/assets/favicon.png +0 -3
- docs/assets/favicon.svg +0 -3
- docs/assets/javascripts/custom.js +0 -17
- docs/assets/logo.svg +0 -3
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +0 -17
- docs/assets/social-preview.svg +0 -17
- docs/assets/stylesheets/custom.css +0 -65
- docs/document-qa/index.ipynb +0 -435
- docs/document-qa/index.md +0 -79
- docs/element-selection/index.ipynb +0 -915
- docs/element-selection/index.md +0 -229
- docs/finetuning/index.md +0 -176
- docs/index.md +0 -170
- docs/installation/index.md +0 -69
- docs/interactive-widget/index.ipynb +0 -962
- docs/interactive-widget/index.md +0 -12
- docs/layout-analysis/index.ipynb +0 -818
- docs/layout-analysis/index.md +0 -185
- docs/ocr/index.md +0 -209
- docs/pdf-navigation/index.ipynb +0 -314
- docs/pdf-navigation/index.md +0 -97
- docs/regions/index.ipynb +0 -816
- docs/regions/index.md +0 -294
- docs/tables/index.ipynb +0 -658
- docs/tables/index.md +0 -144
- docs/text-analysis/index.ipynb +0 -370
- docs/text-analysis/index.md +0 -105
- docs/text-extraction/index.ipynb +0 -1478
- docs/text-extraction/index.md +0 -292
- docs/tutorials/01-loading-and-extraction.ipynb +0 -194
- docs/tutorials/01-loading-and-extraction.md +0 -95
- docs/tutorials/02-finding-elements.ipynb +0 -340
- docs/tutorials/02-finding-elements.md +0 -149
- docs/tutorials/03-extracting-blocks.ipynb +0 -147
- docs/tutorials/03-extracting-blocks.md +0 -48
- docs/tutorials/04-table-extraction.ipynb +0 -114
- docs/tutorials/04-table-extraction.md +0 -50
- docs/tutorials/05-excluding-content.ipynb +0 -270
- docs/tutorials/05-excluding-content.md +0 -109
- docs/tutorials/06-document-qa.ipynb +0 -332
- docs/tutorials/06-document-qa.md +0 -91
- docs/tutorials/07-layout-analysis.ipynb +0 -288
- docs/tutorials/07-layout-analysis.md +0 -66
- docs/tutorials/07-working-with-regions.ipynb +0 -413
- docs/tutorials/07-working-with-regions.md +0 -151
- docs/tutorials/08-spatial-navigation.ipynb +0 -508
- docs/tutorials/08-spatial-navigation.md +0 -190
- docs/tutorials/09-section-extraction.ipynb +0 -2434
- docs/tutorials/09-section-extraction.md +0 -256
- docs/tutorials/10-form-field-extraction.ipynb +0 -512
- docs/tutorials/10-form-field-extraction.md +0 -201
- docs/tutorials/11-enhanced-table-processing.ipynb +0 -54
- docs/tutorials/11-enhanced-table-processing.md +0 -9
- docs/tutorials/12-ocr-integration.ipynb +0 -604
- docs/tutorials/12-ocr-integration.md +0 -175
- docs/tutorials/13-semantic-search.ipynb +0 -1328
- docs/tutorials/13-semantic-search.md +0 -77
- docs/visual-debugging/index.ipynb +0 -2970
- docs/visual-debugging/index.md +0 -157
- docs/visual-debugging/region.png +0 -0
- natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -415
- natural_pdf/templates/spa/css/style.css +0 -334
- natural_pdf/templates/spa/index.html +0 -31
- natural_pdf/templates/spa/js/app.js +0 -472
- natural_pdf/templates/spa/words.txt +0 -235976
- natural_pdf/widgets/frontend/viewer.js +0 -88
- natural_pdf-0.1.7.dist-info/RECORD +0 -145
- notebooks/Examples.ipynb +0 -1293
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +0 -543
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,142 @@
|
|
1
|
from typing import Any, Dict, List, Optional, Tuple, Union

# Attempt to import pdfplumber modules.
# NOTE: this import is unconditional — if pdfplumber is missing, the module
# itself fails to import (ImportError at import time, not at call time).
import pdfplumber.table as pdfplumber_table

# Type Definitions
T_num = Union[int, float]
# Bounding box as (x0, top, x1, bottom) in PDF coordinate space.
T_bbox = Tuple[T_num, T_num, T_num, T_num]
T_obj = Dict[str, Any]
T_obj_list = List[T_obj]
# Maps an (x, y) intersection point to the edges that form it, keyed by orientation.
T_intersections = Dict[Tuple[T_num, T_num], Dict[str, T_obj_list]]
# Cell dict with 'left'/'top'/'right'/'bottom' keys, ready for page.region().
T_cell_dict = Dict[str, T_num]

# Re-export pdfplumber's default tolerances so callers can use them as
# keyword defaults without importing pdfplumber.table themselves.
DEFAULT_SNAP_TOLERANCE = pdfplumber_table.DEFAULT_SNAP_TOLERANCE
DEFAULT_JOIN_TOLERANCE = pdfplumber_table.DEFAULT_JOIN_TOLERANCE
DEFAULT_MIN_WORDS_VERTICAL = pdfplumber_table.DEFAULT_MIN_WORDS_VERTICAL
DEFAULT_MIN_WORDS_HORIZONTAL = pdfplumber_table.DEFAULT_MIN_WORDS_HORIZONTAL

# --- Main Function ---
+
def find_text_based_tables(
    bboxes: List[T_bbox],
    snap_tolerance: T_num = DEFAULT_SNAP_TOLERANCE,
    join_tolerance: T_num = DEFAULT_JOIN_TOLERANCE,
    min_words_vertical: int = DEFAULT_MIN_WORDS_VERTICAL,
    min_words_horizontal: int = DEFAULT_MIN_WORDS_HORIZONTAL,
    intersection_tolerance: T_num = 3,
    snap_x_tolerance: Optional[T_num] = None,
    snap_y_tolerance: Optional[T_num] = None,
    join_x_tolerance: Optional[T_num] = None,
    join_y_tolerance: Optional[T_num] = None,
    intersection_x_tolerance: Optional[T_num] = None,
    intersection_y_tolerance: Optional[T_num] = None,
) -> Dict[str, Union[T_obj_list, List[T_cell_dict], T_intersections]]:
    """
    Find table structures based on text element alignment using imported
    pdfplumber functions. Accepts a list of bounding box tuples.

    Args:
        bboxes: A list of bounding box tuples (x0, top, x1, bottom).
        snap_tolerance: General tolerance for snapping edges.
        join_tolerance: General tolerance for joining nearby edges.
        min_words_vertical: Minimum words to form a vertical edge.
        min_words_horizontal: Minimum words to form a horizontal edge.
        intersection_tolerance: General tolerance for intersections.
        snap_x_tolerance: Specific horizontal snap tolerance (overrides general).
        snap_y_tolerance: Specific vertical snap tolerance (overrides general).
        join_x_tolerance: Specific horizontal join tolerance (overrides general).
        join_y_tolerance: Specific vertical join tolerance (overrides general).
        intersection_x_tolerance: Specific horizontal intersection tolerance.
        intersection_y_tolerance: Specific vertical intersection tolerance.

    Returns:
        A dictionary containing:
        - 'horizontal_edges': List of merged horizontal edge dictionaries.
        - 'vertical_edges': List of merged vertical edge dictionaries.
        - 'cells': List of dictionaries [{'left': x0, 'top': top, 'right': x1, 'bottom': bottom}, ...]
          representing detected cells, ready for page.region().
        - 'intersections': Dictionary of intersection points and the edges forming them.

    Note:
        Requires the 'pdfplumber' library. If it is not installed, the
        *module* import fails with ImportError; this function itself never
        raises ImportError (the original docstring claimed otherwise).
    """
    # Shared empty payload for the early-exit paths; built fresh per call, so
    # callers may safely mutate the returned containers.
    empty_result: Dict[str, Union[T_obj_list, List[T_cell_dict], T_intersections]] = {
        "horizontal_edges": [],
        "vertical_edges": [],
        "cells": [],
        "intersections": {},
    }
    if not bboxes:
        return empty_result

    # Convert bare bbox tuples into the minimal word-like dicts that
    # pdfplumber's words_to_edges_h/v helpers expect.
    text_elements: T_obj_list = []
    for i, (x0, top, x1, bottom) in enumerate(bboxes):
        text_elements.append(
            {
                "x0": x0,
                "top": top,
                "x1": x1,
                "bottom": bottom,
                "width": x1 - x0,
                "height": bottom - top,
                "text": f"elem_{i}",  # Placeholder text
                "object_type": "char",  # Mimic word/char structure loosely
            }
        )

    # Resolve axis-specific tolerances, falling back to the general values.
    sx = snap_x_tolerance if snap_x_tolerance is not None else snap_tolerance
    sy = snap_y_tolerance if snap_y_tolerance is not None else snap_tolerance
    jx = join_x_tolerance if join_x_tolerance is not None else join_tolerance
    jy = join_y_tolerance if join_y_tolerance is not None else join_tolerance
    ix = (
        intersection_x_tolerance if intersection_x_tolerance is not None else intersection_tolerance
    )
    iy = (
        intersection_y_tolerance if intersection_y_tolerance is not None else intersection_tolerance
    )

    # --- pdfplumber Pipeline ---
    h_edges = pdfplumber_table.words_to_edges_h(text_elements, word_threshold=min_words_horizontal)
    v_edges = pdfplumber_table.words_to_edges_v(text_elements, word_threshold=min_words_vertical)
    initial_edges = h_edges + v_edges

    if not initial_edges:
        return empty_result

    merged_edges = pdfplumber_table.merge_edges(initial_edges, sx, sy, jx, jy)
    # Guard hoisted above the orientation split: if merging produced nothing,
    # both orientation lists would be empty anyway — return immediately.
    if not merged_edges:
        return empty_result

    merged_h = [e for e in merged_edges if e["orientation"] == "h"]
    merged_v = [e for e in merged_edges if e["orientation"] == "v"]

    intersections = pdfplumber_table.edges_to_intersections(merged_edges, ix, iy)
    if not intersections:
        # Edges were found but never cross, so no cells can exist.
        return {
            "horizontal_edges": merged_h,
            "vertical_edges": merged_v,
            "cells": [],
            "intersections": intersections,
        }

    cell_tuples = pdfplumber_table.intersections_to_cells(intersections)

    # Convert cell tuples to dictionaries for page.region()
    cell_dicts: List[T_cell_dict] = [
        {"left": x0, "top": top, "right": x1, "bottom": bottom}
        for x0, top, x1, bottom in cell_tuples
    ]

    return {
        "horizontal_edges": merged_h,
        "vertical_edges": merged_v,
        "cells": cell_dicts,
        "intersections": intersections,
    }
|
@@ -20,6 +20,7 @@ TableRecPredictor = None
|
|
20
20
|
|
21
21
|
if surya_spec:
|
22
22
|
try:
|
23
|
+
from surya.common.util import expand_bbox, rescale_bbox
|
23
24
|
from surya.layout import LayoutPredictor
|
24
25
|
from surya.table_rec import TableRecPredictor
|
25
26
|
except ImportError as e:
|
@@ -74,25 +75,10 @@ class SuryaLayoutDetector(LayoutDetector):
|
|
74
75
|
raise TypeError("Incorrect options type provided for Surya model loading.")
|
75
76
|
self.logger.info(f"Loading Surya models (device={options.device})...")
|
76
77
|
models = {}
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
return models
|
82
|
-
except Exception as e:
|
83
|
-
self.logger.error(f"Failed to load Surya models: {e}", exc_info=True)
|
84
|
-
raise
|
85
|
-
|
86
|
-
def _expand_bbox(
|
87
|
-
self, bbox: Tuple[float, float, float, float], padding: int, max_width: int, max_height: int
|
88
|
-
) -> Tuple[int, int, int, int]:
|
89
|
-
"""Expand bbox by padding, clamping to max dimensions."""
|
90
|
-
x0, y0, x1, y1 = bbox
|
91
|
-
x0 = max(0, int(x0 - padding))
|
92
|
-
y0 = max(0, int(y0 - padding))
|
93
|
-
x1 = min(max_width, int(x1 + padding))
|
94
|
-
y1 = min(max_height, int(y1 + padding))
|
95
|
-
return x0, y0, x1, y1
|
78
|
+
models["layout"] = LayoutPredictor()
|
79
|
+
models["table_rec"] = TableRecPredictor()
|
80
|
+
self.logger.info("Surya LayoutPredictor and TableRecPredictor loaded.")
|
81
|
+
return models
|
96
82
|
|
97
83
|
def detect(self, image: Image.Image, options: BaseLayoutOptions) -> List[Dict[str, Any]]:
|
98
84
|
"""Detect layout elements and optionally table structure in an image using Surya."""
|
@@ -114,19 +100,12 @@ class SuryaLayoutDetector(LayoutDetector):
|
|
114
100
|
|
115
101
|
# Extract page reference and scaling factors from extra_args (passed by LayoutAnalyzer)
|
116
102
|
self._page_ref = options.extra_args.get("_page_ref")
|
117
|
-
img_scale_x = options.extra_args.get("_img_scale_x")
|
118
|
-
img_scale_y = options.extra_args.get("_img_scale_y")
|
119
103
|
|
120
104
|
# We still need this check, otherwise later steps that need these vars will fail
|
121
|
-
can_do_table_rec =
|
122
|
-
options.recognize_table_structure
|
123
|
-
and self._page_ref
|
124
|
-
and img_scale_x is not None
|
125
|
-
and img_scale_y is not None
|
126
|
-
)
|
105
|
+
can_do_table_rec = options.recognize_table_structure
|
127
106
|
if options.recognize_table_structure and not can_do_table_rec:
|
128
107
|
logger.warning(
|
129
|
-
"Surya table recognition cannot proceed without page reference
|
108
|
+
"Surya table recognition cannot proceed without page reference. Disabling."
|
130
109
|
)
|
131
110
|
options.recognize_table_structure = False
|
132
111
|
|
@@ -141,14 +120,12 @@ class SuryaLayoutDetector(LayoutDetector):
|
|
141
120
|
table_rec_predictor = models["table_rec"]
|
142
121
|
|
143
122
|
input_image = image.convert("RGB")
|
144
|
-
input_image_list = [input_image]
|
145
123
|
|
146
|
-
initial_layout_detections = []
|
124
|
+
initial_layout_detections = []
|
147
125
|
tables_to_process = []
|
148
126
|
|
149
|
-
# --- Initial Layout Detection ---
|
150
127
|
self.logger.debug("Running Surya layout prediction...")
|
151
|
-
layout_predictions = layout_predictor(
|
128
|
+
layout_predictions = layout_predictor([input_image])
|
152
129
|
self.logger.debug(f"Surya prediction returned {len(layout_predictions)} results.")
|
153
130
|
if not layout_predictions:
|
154
131
|
return []
|
@@ -164,6 +141,7 @@ class SuryaLayoutDetector(LayoutDetector):
|
|
164
141
|
)
|
165
142
|
|
166
143
|
for layout_box in prediction.bboxes:
|
144
|
+
|
167
145
|
class_name_orig = layout_box.label
|
168
146
|
normalized_class = self._normalize_class_name(class_name_orig)
|
169
147
|
score = float(layout_box.confidence)
|
@@ -196,7 +174,6 @@ class SuryaLayoutDetector(LayoutDetector):
|
|
196
174
|
f"Surya initially detected {len(initial_layout_detections)} layout elements matching criteria."
|
197
175
|
)
|
198
176
|
|
199
|
-
# --- Table Structure Recognition (Optional) ---
|
200
177
|
if not options.recognize_table_structure or not tables_to_process:
|
201
178
|
self.logger.debug(
|
202
179
|
"Skipping Surya table structure recognition (disabled or no tables found)."
|
@@ -207,59 +184,29 @@ class SuryaLayoutDetector(LayoutDetector):
|
|
207
184
|
f"Attempting Surya table structure recognition for {len(tables_to_process)} tables..."
|
208
185
|
)
|
209
186
|
high_res_crops = []
|
210
|
-
pdf_offsets = [] # Store (pdf_x0, pdf_y0) for each crop
|
211
187
|
|
212
188
|
high_res_dpi = getattr(self._page_ref._parent, "_config", {}).get(
|
213
189
|
"surya_table_rec_dpi", 192
|
214
190
|
)
|
215
|
-
|
216
|
-
|
191
|
+
high_res_page_image = self._page_ref.to_image(
|
192
|
+
resolution=high_res_dpi, include_highlights=False, scale=1.0
|
217
193
|
)
|
218
|
-
pdf_to_highres_scale = high_res_dpi / 72.0
|
219
194
|
|
220
195
|
# Render high-res page ONCE
|
221
196
|
self.logger.debug(
|
222
|
-
f"Rendering page {self._page_ref.number} at {high_res_dpi} DPI for table recognition
|
223
|
-
)
|
224
|
-
high_res_page_image = self._page_ref.to_image(
|
225
|
-
resolution=high_res_dpi, include_highlights=False
|
226
|
-
)
|
227
|
-
if not high_res_page_image:
|
228
|
-
raise RuntimeError(f"Failed to render page {self._page_ref.number} at high resolution.")
|
229
|
-
self.logger.debug(
|
230
|
-
f" High-res image size: {high_res_page_image.width}x{high_res_page_image.height}"
|
197
|
+
f"Rendering page {self._page_ref.number} at {high_res_dpi} DPI for table recognition, size {high_res_page_image.width}x{high_res_page_image.height}."
|
231
198
|
)
|
232
199
|
|
200
|
+
source_tables = []
|
233
201
|
for i, table_detection in enumerate(tables_to_process):
|
234
|
-
|
235
|
-
|
236
|
-
# PDF coords
|
237
|
-
pdf_x0 = img_x0 * img_scale_x
|
238
|
-
pdf_y0 = img_y0 * img_scale_y
|
239
|
-
pdf_x1 = img_x1 * img_scale_x
|
240
|
-
pdf_y1 = img_y1 * img_scale_y
|
241
|
-
pdf_x0 = max(0, pdf_x0)
|
242
|
-
pdf_y0 = max(0, pdf_y0)
|
243
|
-
pdf_x1 = min(self._page_ref.width, pdf_x1)
|
244
|
-
pdf_y1 = min(self._page_ref.height, pdf_y1)
|
245
|
-
|
246
|
-
# High-res image coords
|
247
|
-
hr_x0 = pdf_x0 * pdf_to_highres_scale
|
248
|
-
hr_y0 = pdf_y0 * pdf_to_highres_scale
|
249
|
-
hr_x1 = pdf_x1 * pdf_to_highres_scale
|
250
|
-
hr_y1 = pdf_y1 * pdf_to_highres_scale
|
251
|
-
|
252
|
-
# Expand high-res bbox
|
253
|
-
hr_x0_exp, hr_y0_exp, hr_x1_exp, hr_y1_exp = self._expand_bbox(
|
254
|
-
(hr_x0, hr_y0, hr_x1, hr_y1),
|
255
|
-
padding=bbox_padding,
|
256
|
-
max_width=high_res_page_image.width,
|
257
|
-
max_height=high_res_page_image.height,
|
202
|
+
highres_bbox = rescale_bbox(
|
203
|
+
list(table_detection["bbox"]), image.size, high_res_page_image.size
|
258
204
|
)
|
205
|
+
highres_bbox = expand_bbox(highres_bbox)
|
259
206
|
|
260
|
-
crop = high_res_page_image.crop(
|
207
|
+
crop = high_res_page_image.crop(highres_bbox)
|
261
208
|
high_res_crops.append(crop)
|
262
|
-
|
209
|
+
source_tables.append(highres_bbox)
|
263
210
|
|
264
211
|
if not high_res_crops:
|
265
212
|
self.logger.info("No valid high-resolution table crops generated.")
|
@@ -267,64 +214,40 @@ class SuryaLayoutDetector(LayoutDetector):
|
|
267
214
|
|
268
215
|
structure_detections = [] # Detections relative to std_res input_image
|
269
216
|
|
270
|
-
# --- Run Table Recognition (will raise error on failure) ---
|
271
217
|
self.logger.debug(
|
272
218
|
f"Running Surya table recognition on {len(high_res_crops)} high-res images..."
|
273
219
|
)
|
274
220
|
table_predictions = table_rec_predictor(high_res_crops)
|
275
221
|
self.logger.debug(f"Surya table recognition returned {len(table_predictions)} results.")
|
276
222
|
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
223
|
+
def build_row_item(element, source_table_bbox, label):
|
224
|
+
adjusted_bbox = [
|
225
|
+
float(element.bbox[0] + source_table_bbox[0]),
|
226
|
+
float(element.bbox[1] + source_table_bbox[1]),
|
227
|
+
float(element.bbox[2] + source_table_bbox[0]),
|
228
|
+
float(element.bbox[3] + source_table_bbox[1]),
|
229
|
+
]
|
230
|
+
|
231
|
+
adjusted_bbox = rescale_bbox(adjusted_bbox, high_res_page_image.size, image.size)
|
232
|
+
|
233
|
+
return {
|
234
|
+
"bbox": adjusted_bbox,
|
235
|
+
"class": label,
|
236
|
+
"confidence": 1.0,
|
237
|
+
"normalized_class": label,
|
238
|
+
"source": "layout",
|
239
|
+
"model": "surya",
|
240
|
+
}
|
241
|
+
|
242
|
+
for table_pred, source_table_bbox in zip(table_predictions, source_tables):
|
243
|
+
for box in table_pred.rows:
|
244
|
+
structure_detections.append(build_row_item(box, source_table_bbox, "table-row"))
|
245
|
+
|
246
|
+
for box in table_pred.cols:
|
247
|
+
structure_detections.append(build_row_item(box, source_table_bbox, "table-column"))
|
283
248
|
|
284
|
-
|
285
|
-
|
286
|
-
for row_box in table_pred.rows:
|
287
|
-
crop_rx0, crop_ry0, crop_rx1, crop_ry1 = map(float, row_box.bbox)
|
288
|
-
pdf_row_x0 = offset_pdf_x0 + crop_rx0 / pdf_to_highres_scale
|
289
|
-
pdf_row_y0 = offset_pdf_y0 + crop_ry0 / pdf_to_highres_scale
|
290
|
-
pdf_row_x1 = offset_pdf_x0 + crop_rx1 / pdf_to_highres_scale
|
291
|
-
pdf_row_y1 = offset_pdf_y0 + crop_ry1 / pdf_to_highres_scale
|
292
|
-
img_row_x0 = pdf_row_x0 / img_scale_x
|
293
|
-
img_row_y0 = pdf_row_y0 / img_scale_y
|
294
|
-
img_row_x1 = pdf_row_x1 / img_scale_x
|
295
|
-
img_row_y1 = pdf_row_y1 / img_scale_y
|
296
|
-
structure_detections.append(
|
297
|
-
{
|
298
|
-
"bbox": (img_row_x0, img_row_y0, img_row_x1, img_row_y1),
|
299
|
-
"class": "table-row",
|
300
|
-
"confidence": 1.0,
|
301
|
-
"normalized_class": "table-row",
|
302
|
-
"source": "layout",
|
303
|
-
"model": "surya",
|
304
|
-
}
|
305
|
-
)
|
306
|
-
|
307
|
-
# Process Columns
|
308
|
-
for col_box in table_pred.cols:
|
309
|
-
crop_cx0, crop_cy0, crop_cx1, crop_cy1 = map(float, col_box.bbox)
|
310
|
-
pdf_col_x0 = offset_pdf_x0 + crop_cx0 / pdf_to_highres_scale
|
311
|
-
pdf_col_y0 = offset_pdf_y0 + crop_cy0 / pdf_to_highres_scale
|
312
|
-
pdf_col_x1 = offset_pdf_x0 + crop_cx1 / pdf_to_highres_scale
|
313
|
-
pdf_col_y1 = offset_pdf_y0 + crop_cy1 / pdf_to_highres_scale
|
314
|
-
img_col_x0 = pdf_col_x0 / img_scale_x
|
315
|
-
img_col_y0 = pdf_col_y0 / img_scale_y
|
316
|
-
img_col_x1 = pdf_col_x1 / img_scale_x
|
317
|
-
img_col_y1 = pdf_col_y1 / img_scale_y
|
318
|
-
structure_detections.append(
|
319
|
-
{
|
320
|
-
"bbox": (img_col_x0, img_col_y0, img_col_x1, img_col_y1),
|
321
|
-
"class": "table-column",
|
322
|
-
"confidence": 1.0,
|
323
|
-
"normalized_class": "table-column",
|
324
|
-
"source": "layout",
|
325
|
-
"model": "surya",
|
326
|
-
}
|
327
|
-
)
|
249
|
+
for box in table_pred.cells:
|
250
|
+
structure_detections.append(build_row_item(box, source_table_bbox, "table-cell"))
|
328
251
|
|
329
252
|
self.logger.info(f"Added {len(structure_detections)} table structure elements.")
|
330
253
|
|
@@ -5,7 +5,7 @@ import os
|
|
5
5
|
import tempfile
|
6
6
|
from typing import Any, Dict, List, Optional, Tuple
|
7
7
|
|
8
|
-
from PIL import Image
|
8
|
+
from PIL import Image, ImageEnhance
|
9
9
|
|
10
10
|
# Assuming base class and options are importable
|
11
11
|
from .base import LayoutDetector
|
@@ -150,6 +150,26 @@ class TableTransformerDetector(LayoutDetector):
|
|
150
150
|
)
|
151
151
|
return objects
|
152
152
|
|
153
|
+
def preprocess_image(self, image: Image.Image, enhance_contrast: float = 1.5) -> Image.Image:
|
154
|
+
"""Enhance the image to improve table structure detection.
|
155
|
+
|
156
|
+
Args:
|
157
|
+
image: The input PIL image
|
158
|
+
enhance_contrast: Contrast enhancement factor (1.0 = no change)
|
159
|
+
|
160
|
+
Returns:
|
161
|
+
Enhanced PIL image
|
162
|
+
"""
|
163
|
+
# Convert to grayscale and back to RGB for better structure detection
|
164
|
+
if image.mode != "L": # If not already grayscale
|
165
|
+
grayscale = image.convert("L")
|
166
|
+
enhanced = ImageEnhance.Contrast(grayscale).enhance(enhance_contrast)
|
167
|
+
return enhanced.convert("RGB") # Convert back to RGB for model input
|
168
|
+
else:
|
169
|
+
# Just enhance contrast if already grayscale
|
170
|
+
enhanced = ImageEnhance.Contrast(image).enhance(enhance_contrast)
|
171
|
+
return enhanced.convert("RGB")
|
172
|
+
|
153
173
|
# --- End Helper Methods ---
|
154
174
|
|
155
175
|
def detect(self, image: Image.Image, options: BaseLayoutOptions) -> List[Dict[str, Any]]:
|
@@ -196,9 +216,17 @@ class TableTransformerDetector(LayoutDetector):
|
|
196
216
|
]
|
197
217
|
)
|
198
218
|
|
219
|
+
# Use image preprocessing for better structure detection
|
220
|
+
enhance_contrast = (
|
221
|
+
options.enhance_contrast
|
222
|
+
if hasattr(options, "enhance_contrast")
|
223
|
+
else options.extra_args.get("enhance_contrast", 1.5)
|
224
|
+
)
|
225
|
+
processed_image = self.preprocess_image(image, enhance_contrast)
|
226
|
+
|
199
227
|
# --- Detect Tables ---
|
200
228
|
self.logger.debug("Running TATR table detection...")
|
201
|
-
pixel_values = detection_transform(
|
229
|
+
pixel_values = detection_transform(processed_image).unsqueeze(0).to(device)
|
202
230
|
with torch.no_grad():
|
203
231
|
outputs = detection_model(pixel_values)
|
204
232
|
|
@@ -271,19 +299,38 @@ class TableTransformerDetector(LayoutDetector):
|
|
271
299
|
if x_max <= x_min or y_max <= y_min:
|
272
300
|
continue # Skip invalid crop
|
273
301
|
|
302
|
+
# Process the cropped table for better structure detection
|
274
303
|
cropped_table = image.crop((x_min, y_min, x_max, y_max))
|
275
304
|
if cropped_table.width == 0 or cropped_table.height == 0:
|
276
305
|
continue # Skip empty crop
|
277
306
|
|
278
|
-
|
307
|
+
processed_crop = self.preprocess_image(cropped_table, enhance_contrast)
|
308
|
+
pixel_values_struct = structure_transform(processed_crop).unsqueeze(0).to(device)
|
309
|
+
|
279
310
|
with torch.no_grad():
|
280
311
|
outputs_struct = structure_model(pixel_values_struct)
|
281
312
|
|
282
313
|
structure_elements = self.outputs_to_objects(
|
283
314
|
outputs_struct, cropped_table.size, id2label_struct
|
284
315
|
)
|
316
|
+
|
317
|
+
# Reduce confidence threshold specifically for columns to catch more
|
318
|
+
column_threshold = None
|
319
|
+
if hasattr(options, "column_threshold") and options.column_threshold is not None:
|
320
|
+
column_threshold = options.column_threshold
|
321
|
+
else:
|
322
|
+
column_threshold = options.extra_args.get(
|
323
|
+
"column_threshold", options.confidence * 0.8
|
324
|
+
)
|
325
|
+
|
285
326
|
structure_elements = [
|
286
|
-
e
|
327
|
+
e
|
328
|
+
for e in structure_elements
|
329
|
+
if (
|
330
|
+
e["score"] >= column_threshold
|
331
|
+
if "column" in e["label"]
|
332
|
+
else e["score"] >= options.confidence
|
333
|
+
)
|
287
334
|
]
|
288
335
|
|
289
336
|
for element in structure_elements:
|
@@ -9,14 +9,10 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
|
|
9
9
|
|
10
10
|
from natural_pdf.analyzers.text_options import TextStyleOptions
|
11
11
|
|
12
|
-
# Import ElementCollection and TextStyleOptions
|
13
|
-
from natural_pdf.elements.collections import ElementCollection
|
14
|
-
|
15
12
|
if TYPE_CHECKING:
|
16
13
|
from natural_pdf.core.page import Page
|
17
14
|
from natural_pdf.elements.base import Element
|
18
|
-
|
19
|
-
# Remove ElementCollection from here if imported above
|
15
|
+
from natural_pdf.elements.collections import ElementCollection
|
20
16
|
|
21
17
|
logger = logging.getLogger(__name__)
|
22
18
|
|
@@ -68,6 +64,8 @@ class TextStyleAnalyzer:
|
|
68
64
|
ElementCollection containing all processed text elements (typically words)
|
69
65
|
with added 'style_label', 'style_key', and 'style_properties' attributes.
|
70
66
|
"""
|
67
|
+
from natural_pdf.elements.collections import ElementCollection
|
68
|
+
|
71
69
|
current_options = options or self.options
|
72
70
|
logger.info(
|
73
71
|
f"Starting text style analysis for page {page.number} with options: {current_options}"
|
natural_pdf/analyzers/utils.py
CHANGED
@@ -1,12 +1,10 @@
|
|
1
1
|
import logging
|
2
2
|
from typing import Any, Dict, List
|
3
3
|
|
4
|
-
from ..elements.region import Region
|
5
|
-
|
6
4
|
|
7
5
|
def convert_to_regions(
|
8
6
|
page: Any, detections: List[Dict[str, Any]], scale_factor: float = 1.0
|
9
|
-
) -> List[Region]:
|
7
|
+
) -> List["Region"]:
|
10
8
|
"""
|
11
9
|
Convert layout detections to Region objects.
|
12
10
|
|
@@ -18,6 +16,8 @@ def convert_to_regions(
|
|
18
16
|
Returns:
|
19
17
|
List of Region objects with layout metadata
|
20
18
|
"""
|
19
|
+
from natural_pdf.elements.region import Region
|
20
|
+
|
21
21
|
conversion_logger = logging.getLogger("natural_pdf.analyzers.layout.convert")
|
22
22
|
conversion_logger.debug(
|
23
23
|
f"Converting {len(detections)} detections to regions with scale {scale_factor}"
|