natural-pdf 0.1.32__py3-none-any.whl → 0.1.34__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/analyzers/__init__.py +2 -2
- natural_pdf/analyzers/guides.py +670 -595
- natural_pdf/analyzers/layout/base.py +53 -6
- natural_pdf/analyzers/layout/layout_analyzer.py +3 -1
- natural_pdf/analyzers/layout/layout_manager.py +18 -14
- natural_pdf/analyzers/layout/layout_options.py +1 -0
- natural_pdf/analyzers/layout/paddle.py +102 -64
- natural_pdf/analyzers/layout/table_structure_utils.py +3 -1
- natural_pdf/analyzers/layout/yolo.py +2 -6
- natural_pdf/analyzers/shape_detection_mixin.py +15 -6
- natural_pdf/classification/manager.py +92 -77
- natural_pdf/classification/mixin.py +49 -5
- natural_pdf/classification/results.py +1 -1
- natural_pdf/cli.py +7 -3
- natural_pdf/collections/pdf_collection.py +96 -101
- natural_pdf/core/element_manager.py +188 -82
- natural_pdf/core/highlighting_service.py +5 -6
- natural_pdf/core/page.py +132 -16
- natural_pdf/core/pdf.py +486 -71
- natural_pdf/describe/__init__.py +18 -12
- natural_pdf/describe/base.py +179 -172
- natural_pdf/describe/elements.py +155 -155
- natural_pdf/describe/mixin.py +27 -19
- natural_pdf/describe/summary.py +44 -55
- natural_pdf/elements/base.py +134 -18
- natural_pdf/elements/collections.py +90 -18
- natural_pdf/elements/image.py +2 -1
- natural_pdf/elements/line.py +0 -31
- natural_pdf/elements/rect.py +0 -14
- natural_pdf/elements/region.py +238 -111
- natural_pdf/elements/text.py +18 -12
- natural_pdf/exporters/__init__.py +4 -1
- natural_pdf/exporters/original_pdf.py +12 -4
- natural_pdf/extraction/mixin.py +66 -10
- natural_pdf/extraction/result.py +1 -1
- natural_pdf/flows/flow.py +63 -4
- natural_pdf/flows/region.py +4 -4
- natural_pdf/ocr/engine.py +83 -2
- natural_pdf/ocr/engine_paddle.py +5 -5
- natural_pdf/ocr/ocr_factory.py +2 -1
- natural_pdf/ocr/ocr_manager.py +24 -13
- natural_pdf/ocr/ocr_options.py +3 -10
- natural_pdf/qa/document_qa.py +21 -8
- natural_pdf/qa/qa_result.py +3 -7
- natural_pdf/search/__init__.py +3 -2
- natural_pdf/search/lancedb_search_service.py +5 -6
- natural_pdf/search/numpy_search_service.py +5 -2
- natural_pdf/selectors/parser.py +51 -6
- natural_pdf/tables/__init__.py +2 -2
- natural_pdf/tables/result.py +7 -6
- natural_pdf/utils/bidi_mirror.py +2 -1
- natural_pdf/utils/reading_order.py +3 -2
- natural_pdf/utils/visualization.py +3 -3
- natural_pdf/widgets/viewer.py +0 -1
- {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/METADATA +1 -1
- natural_pdf-0.1.34.dist-info/RECORD +121 -0
- optimization/memory_comparison.py +73 -58
- optimization/pdf_analyzer.py +141 -96
- optimization/performance_analysis.py +111 -110
- optimization/test_cleanup_methods.py +47 -36
- optimization/test_memory_fix.py +40 -39
- tools/bad_pdf_eval/__init__.py +0 -1
- tools/bad_pdf_eval/analyser.py +35 -18
- tools/bad_pdf_eval/collate_summaries.py +22 -18
- tools/bad_pdf_eval/compile_attempts_markdown.py +127 -0
- tools/bad_pdf_eval/eval_suite.py +21 -9
- tools/bad_pdf_eval/evaluate_quality.py +198 -0
- tools/bad_pdf_eval/export_enrichment_csv.py +12 -8
- tools/bad_pdf_eval/llm_enrich.py +71 -39
- tools/bad_pdf_eval/llm_enrich_with_retry.py +289 -0
- tools/bad_pdf_eval/reporter.py +1 -1
- tools/bad_pdf_eval/utils.py +7 -4
- natural_pdf-0.1.32.dist-info/RECORD +0 -118
- {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/top_level.txt +0 -0
natural_pdf/describe/base.py
CHANGED
@@ -26,41 +26,41 @@ logger = logging.getLogger(__name__)
|
|
26
26
|
def describe_page(page: "Page") -> ElementSummary:
|
27
27
|
"""
|
28
28
|
Describe what's on a page with high-level summary.
|
29
|
-
|
29
|
+
|
30
30
|
Args:
|
31
31
|
page: Page to describe
|
32
|
-
|
32
|
+
|
33
33
|
Returns:
|
34
34
|
ElementSummary with page overview
|
35
35
|
"""
|
36
36
|
data = {}
|
37
|
-
|
37
|
+
|
38
38
|
# Get all elements
|
39
39
|
all_elements = page.get_elements()
|
40
|
-
|
40
|
+
|
41
41
|
if not all_elements:
|
42
42
|
data["message"] = "No elements found on page"
|
43
43
|
return ElementSummary(data, f"Page {page.number} Summary")
|
44
|
-
|
44
|
+
|
45
45
|
# Element counts by type (exclude chars - too granular)
|
46
46
|
type_counts = Counter()
|
47
47
|
for element in all_elements:
|
48
|
-
element_type = getattr(element,
|
49
|
-
if element_type !=
|
48
|
+
element_type = getattr(element, "type", "unknown")
|
49
|
+
if element_type != "char": # Skip character elements
|
50
50
|
type_counts[element_type] += 1
|
51
|
-
|
51
|
+
|
52
52
|
# Format element counts as dictionary for proper list formatting
|
53
53
|
element_summary = {}
|
54
54
|
for element_type, count in type_counts.most_common():
|
55
|
-
type_display = element_type.replace(
|
56
|
-
if element_type ==
|
55
|
+
type_display = element_type.replace("_", " ").title()
|
56
|
+
if element_type == "word":
|
57
57
|
# Add source breakdown for text
|
58
|
-
text_elements = [e for e in all_elements if getattr(e,
|
58
|
+
text_elements = [e for e in all_elements if getattr(e, "type", "") == "word"]
|
59
59
|
sources = Counter()
|
60
60
|
for elem in text_elements:
|
61
|
-
source = getattr(elem,
|
61
|
+
source = getattr(elem, "source", "unknown")
|
62
62
|
sources[source] += 1
|
63
|
-
|
63
|
+
|
64
64
|
if len(sources) > 1:
|
65
65
|
source_parts = []
|
66
66
|
for source, source_count in sources.most_common():
|
@@ -70,86 +70,83 @@ def describe_page(page: "Page") -> ElementSummary:
|
|
70
70
|
element_summary["text"] = f"{count} elements"
|
71
71
|
else:
|
72
72
|
element_summary[element_type] = f"{count} elements"
|
73
|
-
|
73
|
+
|
74
74
|
data["elements"] = element_summary
|
75
|
-
|
75
|
+
|
76
76
|
# Text analysis if we have text elements (exclude chars - too granular)
|
77
|
-
text_elements = [e for e in all_elements if getattr(e,
|
77
|
+
text_elements = [e for e in all_elements if getattr(e, "type", "") == "word"]
|
78
78
|
if text_elements:
|
79
79
|
text_analysis = describe_text_elements(text_elements)
|
80
|
-
if text_analysis and
|
80
|
+
if text_analysis and "message" not in text_analysis:
|
81
81
|
data["text_analysis"] = text_analysis
|
82
|
-
|
82
|
+
|
83
83
|
return ElementSummary(data, f"Page {page.number} Summary")
|
84
84
|
|
85
85
|
|
86
86
|
def describe_collection(collection: "ElementCollection") -> ElementSummary:
|
87
87
|
"""
|
88
88
|
Describe an element collection with type-specific analysis.
|
89
|
-
|
89
|
+
|
90
90
|
Args:
|
91
91
|
collection: ElementCollection to describe
|
92
|
-
|
92
|
+
|
93
93
|
Returns:
|
94
94
|
ElementSummary with collection analysis
|
95
95
|
"""
|
96
96
|
elements = list(collection)
|
97
|
-
|
97
|
+
|
98
98
|
if not elements:
|
99
99
|
data = {"message": "Empty collection"}
|
100
100
|
return ElementSummary(data, "Collection Summary")
|
101
|
-
|
101
|
+
|
102
102
|
data = {}
|
103
|
-
|
103
|
+
|
104
104
|
# Group elements by type
|
105
105
|
by_type = {}
|
106
106
|
for element in elements:
|
107
|
-
element_type = getattr(element,
|
107
|
+
element_type = getattr(element, "type", "unknown")
|
108
108
|
by_type.setdefault(element_type, []).append(element)
|
109
|
-
|
109
|
+
|
110
110
|
# Overall summary for mixed collections (exclude chars from overview)
|
111
111
|
if len(by_type) > 1:
|
112
|
-
type_counts = {k: len(v) for k, v in by_type.items() if k !=
|
112
|
+
type_counts = {k: len(v) for k, v in by_type.items() if k != "char"}
|
113
113
|
total = sum(type_counts.values())
|
114
|
-
|
114
|
+
|
115
115
|
summary_parts = []
|
116
116
|
for element_type, count in sorted(type_counts.items(), key=lambda x: x[1], reverse=True):
|
117
|
-
type_display = element_type.replace(
|
117
|
+
type_display = element_type.replace("_", " ").title()
|
118
118
|
summary_parts.append(f"**{type_display}**: {count}")
|
119
|
-
|
119
|
+
|
120
120
|
if summary_parts: # Only add overview if we have non-char elements
|
121
|
-
data["overview"] = {
|
122
|
-
|
123
|
-
"type_breakdown": summary_parts
|
124
|
-
}
|
125
|
-
|
121
|
+
data["overview"] = {"total_elements": total, "type_breakdown": summary_parts}
|
122
|
+
|
126
123
|
# Type-specific analysis (exclude chars - too granular)
|
127
124
|
for element_type, type_elements in by_type.items():
|
128
|
-
if element_type ==
|
125
|
+
if element_type == "char":
|
129
126
|
# Skip character elements - too granular for useful analysis
|
130
127
|
continue
|
131
|
-
elif element_type ==
|
128
|
+
elif element_type == "word":
|
132
129
|
analysis = describe_text_elements(type_elements)
|
133
|
-
elif element_type ==
|
130
|
+
elif element_type == "rect":
|
134
131
|
analysis = describe_rect_elements(type_elements)
|
135
|
-
elif element_type ==
|
132
|
+
elif element_type == "line":
|
136
133
|
analysis = describe_line_elements(type_elements)
|
137
|
-
elif element_type ==
|
134
|
+
elif element_type == "region":
|
138
135
|
analysis = describe_region_elements(type_elements)
|
139
136
|
else:
|
140
137
|
analysis = {"count": len(type_elements)}
|
141
|
-
|
142
|
-
if analysis and
|
143
|
-
section_name = element_type.replace(
|
138
|
+
|
139
|
+
if analysis and "message" not in analysis:
|
140
|
+
section_name = element_type.replace("_", " ").title()
|
144
141
|
if len(by_type) == 1:
|
145
142
|
# Single type collection - flatten the structure
|
146
143
|
data.update(analysis)
|
147
144
|
else:
|
148
145
|
# Mixed collection - keep sections separate
|
149
146
|
data[section_name] = analysis
|
150
|
-
|
147
|
+
|
151
148
|
# Count non-char elements for title
|
152
|
-
non_char_count = len([e for e in elements if getattr(e,
|
149
|
+
non_char_count = len([e for e in elements if getattr(e, "type", "unknown") != "char"])
|
153
150
|
title = f"Collection Summary ({non_char_count} elements)"
|
154
151
|
return ElementSummary(data, title)
|
155
152
|
|
@@ -157,29 +154,29 @@ def describe_collection(collection: "ElementCollection") -> ElementSummary:
|
|
157
154
|
def describe_region(region: "Region") -> ElementSummary:
|
158
155
|
"""
|
159
156
|
Describe a region with its properties and contents.
|
160
|
-
|
157
|
+
|
161
158
|
Args:
|
162
159
|
region: Region to describe
|
163
|
-
|
160
|
+
|
164
161
|
Returns:
|
165
162
|
ElementSummary with region analysis
|
166
163
|
"""
|
167
164
|
data = {}
|
168
|
-
|
165
|
+
|
169
166
|
# Region info
|
170
167
|
region_info = {
|
171
168
|
"page": region.page.number,
|
172
169
|
"dimensions": f"{region.width:.0f}×{region.height:.0f} pts",
|
173
170
|
"area": f"{region.width * region.height:.0f} sq pts",
|
174
|
-
"position": f"({region.x0:.0f}, {region.top:.0f}) to ({region.x1:.0f}, {region.bottom:.0f})"
|
171
|
+
"position": f"({region.x0:.0f}, {region.top:.0f}) to ({region.x1:.0f}, {region.bottom:.0f})",
|
175
172
|
}
|
176
|
-
|
173
|
+
|
177
174
|
# Add metadata if available
|
178
|
-
if hasattr(region,
|
175
|
+
if hasattr(region, "metadata") and region.metadata:
|
179
176
|
region_info["metadata"] = region.metadata
|
180
|
-
|
177
|
+
|
181
178
|
data["region_info"] = region_info
|
182
|
-
|
179
|
+
|
183
180
|
# Content analysis
|
184
181
|
content_elements = region.find_all("*")
|
185
182
|
if content_elements:
|
@@ -188,54 +185,54 @@ def describe_region(region: "Region") -> ElementSummary:
|
|
188
185
|
data["content"] = content_analysis.to_dict()
|
189
186
|
else:
|
190
187
|
data["content"] = {"message": "No elements found in region"}
|
191
|
-
|
188
|
+
|
192
189
|
return ElementSummary(data, "Region Summary")
|
193
190
|
|
194
191
|
|
195
192
|
def inspect_collection(collection: "ElementCollection", limit: int = 30) -> InspectionSummary:
|
196
193
|
"""
|
197
194
|
Inspect elements in a collection with detailed tabular view.
|
198
|
-
|
195
|
+
|
199
196
|
Args:
|
200
197
|
collection: ElementCollection to inspect
|
201
198
|
limit: Maximum elements per type to show (default: 30)
|
202
|
-
|
199
|
+
|
203
200
|
Returns:
|
204
201
|
InspectionSummary with element tables
|
205
202
|
"""
|
206
203
|
elements = list(collection)
|
207
|
-
|
204
|
+
|
208
205
|
if not elements:
|
209
206
|
data = {"message": "Empty collection"}
|
210
207
|
return InspectionSummary(data, "Collection Inspection")
|
211
|
-
|
208
|
+
|
212
209
|
data = {}
|
213
|
-
|
210
|
+
|
214
211
|
# Check if multi-page
|
215
212
|
pages = set()
|
216
213
|
for element in elements:
|
217
|
-
if hasattr(element,
|
214
|
+
if hasattr(element, "page") and hasattr(element.page, "number"):
|
218
215
|
pages.add(element.page.number)
|
219
216
|
show_page_column = len(pages) > 1
|
220
|
-
|
217
|
+
|
221
218
|
# Group by type
|
222
219
|
by_type = {}
|
223
220
|
for element in elements:
|
224
|
-
element_type = getattr(element,
|
221
|
+
element_type = getattr(element, "type", "unknown")
|
225
222
|
by_type.setdefault(element_type, []).append(element)
|
226
|
-
|
223
|
+
|
227
224
|
# Create tables for each type (exclude chars - too granular)
|
228
225
|
for element_type, type_elements in by_type.items():
|
229
|
-
if element_type ==
|
226
|
+
if element_type == "char":
|
230
227
|
# Skip character elements - too granular for useful inspection
|
231
228
|
continue
|
232
|
-
|
229
|
+
|
233
230
|
# Limit elements shown
|
234
231
|
display_elements = type_elements[:limit]
|
235
|
-
|
232
|
+
|
236
233
|
# Get appropriate columns for this type
|
237
234
|
columns = _get_columns_for_type(element_type, show_page_column)
|
238
|
-
|
235
|
+
|
239
236
|
# Extract data for each element
|
240
237
|
element_data = []
|
241
238
|
for element in display_elements:
|
@@ -244,110 +241,113 @@ def inspect_collection(collection: "ElementCollection", limit: int = 30) -> Insp
|
|
244
241
|
value = _extract_element_value(element, col)
|
245
242
|
row[col] = value
|
246
243
|
element_data.append(row)
|
247
|
-
|
244
|
+
|
248
245
|
# Create section
|
249
246
|
section_name = f"{element_type}_elements"
|
250
|
-
section_data = {
|
251
|
-
|
252
|
-
"columns": columns
|
253
|
-
}
|
254
|
-
|
247
|
+
section_data = {"elements": element_data, "columns": columns}
|
248
|
+
|
255
249
|
# Add note if truncated
|
256
250
|
if len(type_elements) > limit:
|
257
|
-
section_data["note"] =
|
258
|
-
|
251
|
+
section_data["note"] = (
|
252
|
+
f"Showing {limit} of {len(type_elements)} elements (pass limit= to see more)"
|
253
|
+
)
|
254
|
+
|
259
255
|
data[section_name] = section_data
|
260
|
-
|
256
|
+
|
261
257
|
# Count non-char elements for title
|
262
|
-
non_char_count = len([e for e in elements if getattr(e,
|
258
|
+
non_char_count = len([e for e in elements if getattr(e, "type", "unknown") != "char"])
|
263
259
|
title = f"Collection Inspection ({non_char_count} elements)"
|
264
260
|
return InspectionSummary(data, title)
|
265
261
|
|
266
262
|
|
267
263
|
def _get_columns_for_type(element_type: str, show_page_column: bool) -> List[str]:
|
268
264
|
"""Get appropriate columns for element type."""
|
269
|
-
base_columns = [
|
270
|
-
|
271
|
-
if element_type ==
|
272
|
-
columns =
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
265
|
+
base_columns = ["x0", "top", "x1", "bottom"]
|
266
|
+
|
267
|
+
if element_type == "word":
|
268
|
+
columns = (
|
269
|
+
["text"]
|
270
|
+
+ base_columns
|
271
|
+
+ [
|
272
|
+
"font_family",
|
273
|
+
"font_variant",
|
274
|
+
"size",
|
275
|
+
"bold",
|
276
|
+
"italic",
|
277
|
+
"strike",
|
278
|
+
"underline",
|
279
|
+
"highlight",
|
280
|
+
"source",
|
281
|
+
"confidence",
|
282
|
+
]
|
283
|
+
)
|
284
284
|
# Add foreground text colour too
|
285
|
-
columns.append(
|
286
|
-
elif element_type ==
|
287
|
-
columns = base_columns + [
|
288
|
-
elif element_type ==
|
289
|
-
columns = base_columns + [
|
290
|
-
elif element_type ==
|
291
|
-
columns = base_columns + [
|
292
|
-
elif element_type ==
|
293
|
-
columns = base_columns + [
|
285
|
+
columns.append("color")
|
286
|
+
elif element_type == "rect":
|
287
|
+
columns = base_columns + ["width", "height", "stroke", "fill", "stroke_width"]
|
288
|
+
elif element_type == "line":
|
289
|
+
columns = base_columns + ["width", "is_horizontal", "is_vertical"] # LineElement properties
|
290
|
+
elif element_type == "region":
|
291
|
+
columns = base_columns + ["width", "height", "type", "color"]
|
292
|
+
elif element_type == "blob":
|
293
|
+
columns = base_columns + ["width", "height", "color"]
|
294
294
|
else:
|
295
|
-
columns = base_columns + [
|
296
|
-
|
295
|
+
columns = base_columns + ["type"]
|
296
|
+
|
297
297
|
if show_page_column:
|
298
|
-
columns.append(
|
299
|
-
|
298
|
+
columns.append("page")
|
299
|
+
|
300
300
|
return columns
|
301
301
|
|
302
302
|
|
303
303
|
def _extract_element_value(element: "Element", column: str) -> Any:
|
304
304
|
"""Extract value for a column from an element."""
|
305
305
|
try:
|
306
|
-
if column ==
|
307
|
-
text = getattr(element,
|
306
|
+
if column == "text":
|
307
|
+
text = getattr(element, "text", "")
|
308
308
|
if text and len(text) > 60:
|
309
309
|
return text[:60] + "..."
|
310
310
|
return text or ""
|
311
|
-
|
312
|
-
elif column ==
|
313
|
-
if hasattr(element,
|
311
|
+
|
312
|
+
elif column == "page":
|
313
|
+
if hasattr(element, "page") and hasattr(element.page, "number"):
|
314
314
|
return element.page.number
|
315
315
|
return ""
|
316
|
-
|
317
|
-
elif column ==
|
318
|
-
confidence = getattr(element,
|
316
|
+
|
317
|
+
elif column == "confidence":
|
318
|
+
confidence = getattr(element, "confidence", None)
|
319
319
|
if confidence is not None and isinstance(confidence, (int, float)):
|
320
320
|
return f"{confidence:.2f}"
|
321
321
|
return ""
|
322
|
-
|
323
|
-
elif column ==
|
322
|
+
|
323
|
+
elif column == "font_family":
|
324
324
|
# Use the cleaner font_family property from TextElement
|
325
|
-
font_family = getattr(element,
|
325
|
+
font_family = getattr(element, "font_family", None)
|
326
326
|
if font_family:
|
327
327
|
return font_family
|
328
328
|
# Fallback to fontname
|
329
|
-
return getattr(element,
|
330
|
-
|
331
|
-
elif column ==
|
332
|
-
variant = getattr(element,
|
329
|
+
return getattr(element, "fontname", "")
|
330
|
+
|
331
|
+
elif column == "font_variant":
|
332
|
+
variant = getattr(element, "font_variant", None)
|
333
333
|
if variant:
|
334
334
|
return variant
|
335
335
|
# Fallback – try to derive from fontname if property missing
|
336
|
-
fontname = getattr(element,
|
336
|
+
fontname = getattr(element, "fontname", "")
|
337
337
|
if "+" in fontname:
|
338
338
|
return fontname.split("+", 1)[0]
|
339
|
-
return
|
340
|
-
|
341
|
-
elif column in [
|
339
|
+
return ""
|
340
|
+
|
341
|
+
elif column in ["bold", "italic", "strike", "underline"]:
|
342
342
|
value = getattr(element, column, False)
|
343
343
|
return value if isinstance(value, bool) else False
|
344
|
-
|
345
|
-
elif column ==
|
344
|
+
|
345
|
+
elif column == "highlight":
|
346
346
|
# If element is highlighted, return its colour; otherwise blank
|
347
|
-
if getattr(element,
|
348
|
-
col_val = getattr(element,
|
347
|
+
if getattr(element, "highlight", False):
|
348
|
+
col_val = getattr(element, "highlight_color", None)
|
349
349
|
if col_val is None:
|
350
|
-
return
|
350
|
+
return "True" # fallback if colour missing
|
351
351
|
# Convert tuple to hex
|
352
352
|
if isinstance(col_val, (tuple, list)) and len(col_val) >= 3:
|
353
353
|
try:
|
@@ -356,9 +356,9 @@ def _extract_element_value(element: "Element", column: str) -> Any:
|
|
356
356
|
except Exception:
|
357
357
|
return str(col_val)
|
358
358
|
return str(col_val)
|
359
|
-
return
|
360
|
-
|
361
|
-
elif column in [
|
359
|
+
return ""
|
360
|
+
|
361
|
+
elif column in ["stroke", "fill", "color"]:
|
362
362
|
value = getattr(element, column, None)
|
363
363
|
# If already a string (e.g. '#ff00aa' or 'red') return as is
|
364
364
|
if isinstance(value, str):
|
@@ -371,24 +371,24 @@ def _extract_element_value(element: "Element", column: str) -> Any:
|
|
371
371
|
except Exception:
|
372
372
|
return str(value)
|
373
373
|
return ""
|
374
|
-
|
375
|
-
elif column in [
|
374
|
+
|
375
|
+
elif column in ["x0", "top", "x1", "bottom", "width", "height", "size", "stroke_width"]:
|
376
376
|
value = getattr(element, column, 0)
|
377
377
|
if isinstance(value, (int, float)) and not isinstance(value, bool):
|
378
378
|
return int(round(value))
|
379
379
|
return 0
|
380
|
-
|
381
|
-
elif column in [
|
380
|
+
|
381
|
+
elif column in ["is_horizontal", "is_vertical"]:
|
382
382
|
value = getattr(element, column, False)
|
383
383
|
return value if isinstance(value, bool) else False
|
384
|
-
|
384
|
+
|
385
385
|
else:
|
386
386
|
# Generic attribute access
|
387
|
-
value = getattr(element, column,
|
387
|
+
value = getattr(element, column, "")
|
388
388
|
if value is None:
|
389
389
|
return ""
|
390
390
|
return str(value)
|
391
|
-
|
391
|
+
|
392
392
|
except Exception as e:
|
393
393
|
# Fallback for any unexpected errors
|
394
394
|
logger.warning(f"Error extracting {column} from element: {e}")
|
@@ -398,64 +398,71 @@ def _extract_element_value(element: "Element", column: str) -> Any:
|
|
398
398
|
def describe_element(element: "Element") -> "ElementSummary":
|
399
399
|
"""
|
400
400
|
Describe an individual element with its properties and attributes.
|
401
|
-
|
401
|
+
|
402
402
|
Args:
|
403
403
|
element: The element to describe
|
404
|
-
|
404
|
+
|
405
405
|
Returns:
|
406
406
|
ElementSummary with formatted element properties
|
407
407
|
"""
|
408
408
|
from natural_pdf.describe.summary import ElementSummary
|
409
|
-
|
409
|
+
|
410
410
|
# Get basic element info
|
411
|
-
element_type = getattr(element,
|
412
|
-
|
411
|
+
element_type = getattr(element, "type", element.__class__.__name__)
|
412
|
+
|
413
413
|
# Build the description data - use dict structure for proper list formatting
|
414
414
|
data = {
|
415
415
|
"info": {
|
416
416
|
"object_type": "element",
|
417
417
|
"element_type": element_type,
|
418
|
-
"class_name": element.__class__.__name__
|
418
|
+
"class_name": element.__class__.__name__,
|
419
419
|
}
|
420
420
|
}
|
421
|
-
|
421
|
+
|
422
422
|
# Add geometric properties - use dict structure for proper list formatting
|
423
|
-
if hasattr(element,
|
423
|
+
if hasattr(element, "bbox"):
|
424
424
|
data["geometry"] = {
|
425
425
|
"position": f"({round(element.x0, 1)}, {round(element.top, 1)}, {round(element.x1, 1)}, {round(element.bottom, 1)})",
|
426
|
-
"size": f"({round(element.width, 1)}, {round(element.height, 1)})"
|
426
|
+
"size": f"({round(element.width, 1)}, {round(element.height, 1)})",
|
427
427
|
}
|
428
|
-
|
428
|
+
|
429
429
|
# Add text content if available - use dict structure for proper list formatting
|
430
|
-
if hasattr(element,
|
430
|
+
if hasattr(element, "text") and element.text:
|
431
431
|
text = str(element.text).strip()
|
432
432
|
display_text = text[:50] + "..." if len(text) > 50 else text
|
433
|
-
data["content"] = {
|
434
|
-
|
435
|
-
"length": f"{len(text)} chars"
|
436
|
-
}
|
437
|
-
|
433
|
+
data["content"] = {"text": f"'{display_text}'", "length": f"{len(text)} chars"}
|
434
|
+
|
438
435
|
# Add common text properties - use dict structure for proper list formatting
|
439
436
|
text_props = {}
|
440
|
-
for prop in [
|
437
|
+
for prop in [
|
438
|
+
"font_family",
|
439
|
+
"size",
|
440
|
+
"bold",
|
441
|
+
"italic",
|
442
|
+
"strike",
|
443
|
+
"underline",
|
444
|
+
"highlight",
|
445
|
+
"source",
|
446
|
+
"confidence",
|
447
|
+
]:
|
441
448
|
if hasattr(element, prop):
|
442
449
|
value = getattr(element, prop)
|
443
450
|
if value is not None:
|
444
|
-
if prop ==
|
451
|
+
if prop == "confidence" and isinstance(value, (int, float)):
|
445
452
|
text_props[prop] = round(value, 3)
|
446
|
-
elif prop ==
|
453
|
+
elif prop == "size" and isinstance(value, (int, float)):
|
447
454
|
text_props[prop] = round(value, 1)
|
448
|
-
elif prop in [
|
455
|
+
elif prop in ["bold", "italic", "strike", "underline"]:
|
449
456
|
text_props[prop] = value
|
450
457
|
else:
|
451
458
|
text_props[prop] = value
|
452
|
-
|
459
|
+
|
453
460
|
if text_props:
|
454
461
|
data["properties"] = text_props
|
455
|
-
|
462
|
+
|
456
463
|
# Add color information - use dict structure for proper list formatting
|
457
464
|
color_info = {}
|
458
|
-
for prop in [
|
465
|
+
for prop in ["color", "fill", "stroke"]:
|
459
466
|
if hasattr(element, prop):
|
460
467
|
value = getattr(element, prop)
|
461
468
|
if value is not None:
|
@@ -471,28 +478,28 @@ def describe_element(element: "Element") -> "ElementSummary":
|
|
471
478
|
color_info[prop] = str(value)
|
472
479
|
else:
|
473
480
|
color_info[prop] = str(value)
|
474
|
-
|
481
|
+
|
475
482
|
if color_info:
|
476
483
|
data["colors"] = color_info
|
477
|
-
|
484
|
+
|
478
485
|
# Add page information - use dict structure for proper list formatting
|
479
|
-
if hasattr(element,
|
480
|
-
page_num = getattr(element.page,
|
486
|
+
if hasattr(element, "page") and element.page:
|
487
|
+
page_num = getattr(element.page, "number", None)
|
481
488
|
if page_num is not None:
|
482
489
|
data["page"] = {"number": page_num}
|
483
|
-
|
490
|
+
|
484
491
|
# Add polygon information if available - use dict structure for proper list formatting
|
485
|
-
if hasattr(element,
|
486
|
-
if hasattr(element,
|
492
|
+
if hasattr(element, "has_polygon") and element.has_polygon:
|
493
|
+
if hasattr(element, "polygon"):
|
487
494
|
polygon = element.polygon
|
488
495
|
if polygon and len(polygon) > 0:
|
489
496
|
data["shape"] = {"polygon_points": len(polygon)}
|
490
|
-
|
497
|
+
|
491
498
|
# Create title
|
492
499
|
title = f"{element_type.title()} Element"
|
493
|
-
if hasattr(element,
|
500
|
+
if hasattr(element, "text") and element.text:
|
494
501
|
preview = str(element.text).strip()[:30]
|
495
502
|
if preview:
|
496
503
|
title += f": '{preview}'"
|
497
|
-
|
498
|
-
return ElementSummary(data, title)
|
504
|
+
|
505
|
+
return ElementSummary(data, title)
|