natural-pdf 0.1.32__py3-none-any.whl → 0.1.34__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/analyzers/__init__.py +2 -2
- natural_pdf/analyzers/guides.py +670 -595
- natural_pdf/analyzers/layout/base.py +53 -6
- natural_pdf/analyzers/layout/layout_analyzer.py +3 -1
- natural_pdf/analyzers/layout/layout_manager.py +18 -14
- natural_pdf/analyzers/layout/layout_options.py +1 -0
- natural_pdf/analyzers/layout/paddle.py +102 -64
- natural_pdf/analyzers/layout/table_structure_utils.py +3 -1
- natural_pdf/analyzers/layout/yolo.py +2 -6
- natural_pdf/analyzers/shape_detection_mixin.py +15 -6
- natural_pdf/classification/manager.py +92 -77
- natural_pdf/classification/mixin.py +49 -5
- natural_pdf/classification/results.py +1 -1
- natural_pdf/cli.py +7 -3
- natural_pdf/collections/pdf_collection.py +96 -101
- natural_pdf/core/element_manager.py +188 -82
- natural_pdf/core/highlighting_service.py +5 -6
- natural_pdf/core/page.py +132 -16
- natural_pdf/core/pdf.py +486 -71
- natural_pdf/describe/__init__.py +18 -12
- natural_pdf/describe/base.py +179 -172
- natural_pdf/describe/elements.py +155 -155
- natural_pdf/describe/mixin.py +27 -19
- natural_pdf/describe/summary.py +44 -55
- natural_pdf/elements/base.py +134 -18
- natural_pdf/elements/collections.py +90 -18
- natural_pdf/elements/image.py +2 -1
- natural_pdf/elements/line.py +0 -31
- natural_pdf/elements/rect.py +0 -14
- natural_pdf/elements/region.py +238 -111
- natural_pdf/elements/text.py +18 -12
- natural_pdf/exporters/__init__.py +4 -1
- natural_pdf/exporters/original_pdf.py +12 -4
- natural_pdf/extraction/mixin.py +66 -10
- natural_pdf/extraction/result.py +1 -1
- natural_pdf/flows/flow.py +63 -4
- natural_pdf/flows/region.py +4 -4
- natural_pdf/ocr/engine.py +83 -2
- natural_pdf/ocr/engine_paddle.py +5 -5
- natural_pdf/ocr/ocr_factory.py +2 -1
- natural_pdf/ocr/ocr_manager.py +24 -13
- natural_pdf/ocr/ocr_options.py +3 -10
- natural_pdf/qa/document_qa.py +21 -8
- natural_pdf/qa/qa_result.py +3 -7
- natural_pdf/search/__init__.py +3 -2
- natural_pdf/search/lancedb_search_service.py +5 -6
- natural_pdf/search/numpy_search_service.py +5 -2
- natural_pdf/selectors/parser.py +51 -6
- natural_pdf/tables/__init__.py +2 -2
- natural_pdf/tables/result.py +7 -6
- natural_pdf/utils/bidi_mirror.py +2 -1
- natural_pdf/utils/reading_order.py +3 -2
- natural_pdf/utils/visualization.py +3 -3
- natural_pdf/widgets/viewer.py +0 -1
- {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/METADATA +1 -1
- natural_pdf-0.1.34.dist-info/RECORD +121 -0
- optimization/memory_comparison.py +73 -58
- optimization/pdf_analyzer.py +141 -96
- optimization/performance_analysis.py +111 -110
- optimization/test_cleanup_methods.py +47 -36
- optimization/test_memory_fix.py +40 -39
- tools/bad_pdf_eval/__init__.py +0 -1
- tools/bad_pdf_eval/analyser.py +35 -18
- tools/bad_pdf_eval/collate_summaries.py +22 -18
- tools/bad_pdf_eval/compile_attempts_markdown.py +127 -0
- tools/bad_pdf_eval/eval_suite.py +21 -9
- tools/bad_pdf_eval/evaluate_quality.py +198 -0
- tools/bad_pdf_eval/export_enrichment_csv.py +12 -8
- tools/bad_pdf_eval/llm_enrich.py +71 -39
- tools/bad_pdf_eval/llm_enrich_with_retry.py +289 -0
- tools/bad_pdf_eval/reporter.py +1 -1
- tools/bad_pdf_eval/utils.py +7 -4
- natural_pdf-0.1.32.dist-info/RECORD +0 -118
- {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/top_level.txt +0 -0
natural_pdf/describe/elements.py
CHANGED
@@ -15,263 +15,263 @@ logger = logging.getLogger(__name__)
|
|
15
15
|
def describe_text_elements(elements: List["Element"]) -> Dict[str, Any]:
|
16
16
|
"""
|
17
17
|
Describe text elements with typography and OCR analysis.
|
18
|
-
|
18
|
+
|
19
19
|
Args:
|
20
20
|
elements: List of text elements
|
21
|
-
|
21
|
+
|
22
22
|
Returns:
|
23
23
|
Dictionary with text analysis sections
|
24
24
|
"""
|
25
25
|
if not elements:
|
26
26
|
return {"message": "No text elements found"}
|
27
|
-
|
27
|
+
|
28
28
|
result = {}
|
29
|
-
|
29
|
+
|
30
30
|
# Source breakdown
|
31
31
|
sources = Counter()
|
32
32
|
ocr_elements = []
|
33
|
-
|
33
|
+
|
34
34
|
for element in elements:
|
35
|
-
source = getattr(element,
|
35
|
+
source = getattr(element, "source", "unknown")
|
36
36
|
sources[source] += 1
|
37
|
-
if source ==
|
37
|
+
if source == "ocr":
|
38
38
|
ocr_elements.append(element)
|
39
|
-
|
39
|
+
|
40
40
|
if len(sources) > 1:
|
41
|
-
result[
|
42
|
-
|
41
|
+
result["sources"] = dict(sources)
|
42
|
+
|
43
43
|
# Typography analysis
|
44
44
|
typography = _analyze_typography(elements)
|
45
45
|
if typography:
|
46
|
-
result[
|
47
|
-
|
46
|
+
result["typography"] = typography
|
47
|
+
|
48
48
|
# OCR quality analysis
|
49
49
|
if ocr_elements:
|
50
50
|
ocr_quality = _analyze_ocr_quality(ocr_elements)
|
51
51
|
if ocr_quality:
|
52
|
-
result[
|
53
|
-
|
52
|
+
result["ocr_quality"] = ocr_quality
|
53
|
+
|
54
54
|
return result
|
55
55
|
|
56
56
|
|
57
57
|
def describe_rect_elements(elements: List["Element"]) -> Dict[str, Any]:
|
58
58
|
"""
|
59
59
|
Describe rectangle elements with size and style analysis.
|
60
|
-
|
60
|
+
|
61
61
|
Args:
|
62
62
|
elements: List of rectangle elements
|
63
|
-
|
63
|
+
|
64
64
|
Returns:
|
65
65
|
Dictionary with rectangle analysis
|
66
66
|
"""
|
67
67
|
if not elements:
|
68
68
|
return {"message": "No rectangle elements found"}
|
69
|
-
|
69
|
+
|
70
70
|
result = {}
|
71
|
-
|
71
|
+
|
72
72
|
# Size analysis
|
73
73
|
sizes = []
|
74
74
|
stroke_count = 0
|
75
75
|
fill_count = 0
|
76
76
|
colors = Counter()
|
77
77
|
stroke_widths = []
|
78
|
-
|
78
|
+
|
79
79
|
for element in elements:
|
80
80
|
# Size
|
81
|
-
width = getattr(element,
|
82
|
-
height = getattr(element,
|
81
|
+
width = getattr(element, "width", 0)
|
82
|
+
height = getattr(element, "height", 0)
|
83
83
|
if width and height:
|
84
84
|
sizes.append((width, height))
|
85
|
-
|
85
|
+
|
86
86
|
# Style properties - use RectangleElement properties
|
87
|
-
stroke = getattr(element,
|
87
|
+
stroke = getattr(element, "stroke", None)
|
88
88
|
if stroke and stroke != (0, 0, 0): # Check if stroke color exists and isn't black
|
89
89
|
stroke_count += 1
|
90
|
-
fill = getattr(element,
|
90
|
+
fill = getattr(element, "fill", None)
|
91
91
|
if fill and fill != (0, 0, 0): # Check if fill color exists and isn't black
|
92
92
|
fill_count += 1
|
93
|
-
|
93
|
+
|
94
94
|
# Stroke width
|
95
|
-
stroke_width = getattr(element,
|
95
|
+
stroke_width = getattr(element, "stroke_width", 0)
|
96
96
|
if stroke_width > 0:
|
97
97
|
stroke_widths.append(stroke_width)
|
98
|
-
|
98
|
+
|
99
99
|
# Color - use the element's stroke/fill properties
|
100
100
|
color = stroke or fill
|
101
101
|
if color:
|
102
102
|
if isinstance(color, (tuple, list)):
|
103
103
|
if color == (0, 0, 0) or color == (0.0, 0.0, 0.0):
|
104
|
-
colors[
|
104
|
+
colors["black"] += 1
|
105
105
|
elif color == (1, 1, 1) or color == (1.0, 1.0, 1.0):
|
106
|
-
colors[
|
106
|
+
colors["white"] += 1
|
107
107
|
else:
|
108
108
|
colors[str(color)] += 1
|
109
109
|
else:
|
110
110
|
colors[str(color)] += 1
|
111
|
-
|
111
|
+
|
112
112
|
# Size statistics
|
113
113
|
if sizes:
|
114
114
|
widths = [s[0] for s in sizes]
|
115
115
|
heights = [s[1] for s in sizes]
|
116
|
-
result[
|
117
|
-
|
118
|
-
|
119
|
-
|
116
|
+
result["size_stats"] = {
|
117
|
+
"width_range": f"{min(widths):.0f}-{max(widths):.0f}",
|
118
|
+
"height_range": f"{min(heights):.0f}-{max(heights):.0f}",
|
119
|
+
"avg_area": f"{sum(w*h for w,h in sizes)/len(sizes):.0f} sq pts",
|
120
120
|
}
|
121
|
-
|
121
|
+
|
122
122
|
# Style breakdown
|
123
123
|
style_info = {}
|
124
124
|
if stroke_count:
|
125
|
-
style_info[
|
125
|
+
style_info["stroke"] = stroke_count
|
126
126
|
if fill_count:
|
127
|
-
style_info[
|
127
|
+
style_info["fill"] = fill_count
|
128
128
|
if stroke_widths:
|
129
129
|
stroke_width_counts = Counter(stroke_widths)
|
130
130
|
# Convert float keys to strings to avoid formatting issues
|
131
131
|
stroke_width_dict = {str(k): v for k, v in stroke_width_counts.most_common()}
|
132
|
-
style_info[
|
132
|
+
style_info["stroke_widths"] = stroke_width_dict
|
133
133
|
if colors:
|
134
|
-
style_info[
|
135
|
-
|
134
|
+
style_info["colors"] = dict(colors.most_common(5))
|
135
|
+
|
136
136
|
if style_info:
|
137
|
-
result[
|
138
|
-
|
137
|
+
result["styles"] = style_info
|
138
|
+
|
139
139
|
return result
|
140
140
|
|
141
141
|
|
142
142
|
def describe_line_elements(elements: List["Element"]) -> Dict[str, Any]:
|
143
143
|
"""
|
144
144
|
Describe line elements with length and style analysis.
|
145
|
-
|
145
|
+
|
146
146
|
Args:
|
147
147
|
elements: List of line elements
|
148
|
-
|
148
|
+
|
149
149
|
Returns:
|
150
150
|
Dictionary with line analysis
|
151
151
|
"""
|
152
152
|
if not elements:
|
153
153
|
return {"message": "No line elements found"}
|
154
|
-
|
154
|
+
|
155
155
|
result = {}
|
156
|
-
|
156
|
+
|
157
157
|
lengths = []
|
158
158
|
widths = []
|
159
159
|
colors = Counter()
|
160
|
-
|
160
|
+
|
161
161
|
for element in elements:
|
162
162
|
# Calculate length
|
163
|
-
x0 = getattr(element,
|
164
|
-
y0 = getattr(element,
|
165
|
-
x1 = getattr(element,
|
166
|
-
y1 = getattr(element,
|
167
|
-
|
163
|
+
x0 = getattr(element, "x0", 0)
|
164
|
+
y0 = getattr(element, "top", 0)
|
165
|
+
x1 = getattr(element, "x1", 0)
|
166
|
+
y1 = getattr(element, "bottom", 0)
|
167
|
+
|
168
168
|
length = ((x1 - x0) ** 2 + (y1 - y0) ** 2) ** 0.5
|
169
169
|
if length > 0:
|
170
170
|
lengths.append(length)
|
171
|
-
|
171
|
+
|
172
172
|
# Line width - use the element's width property
|
173
|
-
width = getattr(element,
|
173
|
+
width = getattr(element, "width", 0) # LineElement has a width property
|
174
174
|
if width:
|
175
175
|
widths.append(width)
|
176
|
-
|
176
|
+
|
177
177
|
# Color - use the element's color property
|
178
|
-
color = getattr(element,
|
178
|
+
color = getattr(element, "color", None) # LineElement has a color property
|
179
179
|
if color:
|
180
180
|
if isinstance(color, (tuple, list)):
|
181
181
|
if color == (0, 0, 0) or color == (0.0, 0.0, 0.0):
|
182
|
-
colors[
|
182
|
+
colors["black"] += 1
|
183
183
|
else:
|
184
184
|
colors[str(color)] += 1
|
185
185
|
else:
|
186
186
|
colors[str(color)] += 1
|
187
|
-
|
187
|
+
|
188
188
|
# Length statistics
|
189
189
|
if lengths:
|
190
|
-
result[
|
191
|
-
|
192
|
-
|
193
|
-
|
190
|
+
result["length_stats"] = {
|
191
|
+
"min": f"{min(lengths):.0f}",
|
192
|
+
"max": f"{max(lengths):.0f}",
|
193
|
+
"avg": f"{sum(lengths)/len(lengths):.0f}",
|
194
194
|
}
|
195
|
-
|
195
|
+
|
196
196
|
# Width statistics
|
197
197
|
if widths:
|
198
198
|
width_counts = Counter(widths)
|
199
199
|
# Convert float keys to strings to avoid formatting issues
|
200
|
-
result[
|
201
|
-
|
200
|
+
result["line_widths"] = {str(k): v for k, v in width_counts.most_common()}
|
201
|
+
|
202
202
|
# Orientation analysis
|
203
|
-
horizontal_count = sum(1 for el in elements if getattr(el,
|
204
|
-
vertical_count = sum(1 for el in elements if getattr(el,
|
203
|
+
horizontal_count = sum(1 for el in elements if getattr(el, "is_horizontal", False))
|
204
|
+
vertical_count = sum(1 for el in elements if getattr(el, "is_vertical", False))
|
205
205
|
diagonal_count = len(elements) - horizontal_count - vertical_count
|
206
|
-
|
206
|
+
|
207
207
|
if horizontal_count or vertical_count or diagonal_count:
|
208
208
|
orientation_info = {}
|
209
209
|
if horizontal_count:
|
210
|
-
orientation_info[
|
210
|
+
orientation_info["horizontal"] = horizontal_count
|
211
211
|
if vertical_count:
|
212
|
-
orientation_info[
|
212
|
+
orientation_info["vertical"] = vertical_count
|
213
213
|
if diagonal_count:
|
214
|
-
orientation_info[
|
215
|
-
result[
|
216
|
-
|
214
|
+
orientation_info["diagonal"] = diagonal_count
|
215
|
+
result["orientations"] = orientation_info
|
216
|
+
|
217
217
|
# Colors
|
218
218
|
if colors:
|
219
|
-
result[
|
220
|
-
|
219
|
+
result["colors"] = dict(colors.most_common())
|
220
|
+
|
221
221
|
return result
|
222
222
|
|
223
223
|
|
224
224
|
def describe_region_elements(elements: List["Element"]) -> Dict[str, Any]:
|
225
225
|
"""
|
226
226
|
Describe region elements with type and metadata analysis.
|
227
|
-
|
227
|
+
|
228
228
|
Args:
|
229
229
|
elements: List of region elements
|
230
|
-
|
230
|
+
|
231
231
|
Returns:
|
232
232
|
Dictionary with region analysis
|
233
233
|
"""
|
234
234
|
if not elements:
|
235
235
|
return {"message": "No region elements found"}
|
236
|
-
|
236
|
+
|
237
237
|
result = {}
|
238
|
-
|
238
|
+
|
239
239
|
# Region types
|
240
240
|
types = Counter()
|
241
241
|
sizes = []
|
242
242
|
metadata_keys = set()
|
243
|
-
|
243
|
+
|
244
244
|
for element in elements:
|
245
245
|
# Type
|
246
|
-
region_type = getattr(element,
|
246
|
+
region_type = getattr(element, "type", "unknown")
|
247
247
|
types[region_type] += 1
|
248
|
-
|
248
|
+
|
249
249
|
# Size
|
250
|
-
width = getattr(element,
|
251
|
-
height = getattr(element,
|
250
|
+
width = getattr(element, "width", 0)
|
251
|
+
height = getattr(element, "height", 0)
|
252
252
|
if width and height:
|
253
253
|
sizes.append(width * height)
|
254
|
-
|
254
|
+
|
255
255
|
# Metadata keys
|
256
|
-
if hasattr(element,
|
256
|
+
if hasattr(element, "metadata") and element.metadata:
|
257
257
|
metadata_keys.update(element.metadata.keys())
|
258
|
-
|
258
|
+
|
259
259
|
# Type breakdown
|
260
260
|
if types:
|
261
|
-
result[
|
262
|
-
|
261
|
+
result["types"] = dict(types.most_common())
|
262
|
+
|
263
263
|
# Size statistics
|
264
264
|
if sizes:
|
265
|
-
result[
|
266
|
-
|
267
|
-
|
268
|
-
|
265
|
+
result["size_stats"] = {
|
266
|
+
"min_area": f"{min(sizes):.0f} sq pts",
|
267
|
+
"max_area": f"{max(sizes):.0f} sq pts",
|
268
|
+
"avg_area": f"{sum(sizes)/len(sizes):.0f} sq pts",
|
269
269
|
}
|
270
|
-
|
270
|
+
|
271
271
|
# Metadata
|
272
272
|
if metadata_keys:
|
273
|
-
result[
|
274
|
-
|
273
|
+
result["metadata_keys"] = sorted(list(metadata_keys))
|
274
|
+
|
275
275
|
return result
|
276
276
|
|
277
277
|
|
@@ -279,131 +279,131 @@ def _analyze_typography(elements: List["Element"]) -> Dict[str, Any]:
|
|
279
279
|
"""Analyze typography patterns in text elements."""
|
280
280
|
fonts = Counter()
|
281
281
|
sizes = Counter()
|
282
|
-
styles = {
|
282
|
+
styles = {"bold": 0, "italic": 0, "strikeout": 0, "underline": 0, "highlight": 0}
|
283
283
|
colors = Counter()
|
284
|
-
|
284
|
+
|
285
285
|
for element in elements:
|
286
286
|
# Font family - use TextElement's font_family property for cleaner names
|
287
|
-
font_family = getattr(element,
|
288
|
-
fontname = getattr(element,
|
287
|
+
font_family = getattr(element, "font_family", None)
|
288
|
+
fontname = getattr(element, "fontname", "Unknown")
|
289
289
|
display_font = font_family if font_family and font_family != fontname else fontname
|
290
290
|
if display_font:
|
291
291
|
fonts[display_font] += 1
|
292
|
-
|
292
|
+
|
293
293
|
# Size
|
294
|
-
size = getattr(element,
|
294
|
+
size = getattr(element, "size", None)
|
295
295
|
if size:
|
296
296
|
# Round to nearest 0.5
|
297
297
|
rounded_size = round(size * 2) / 2
|
298
298
|
sizes[f"{rounded_size}pt"] += 1
|
299
|
-
|
299
|
+
|
300
300
|
# Styles
|
301
|
-
if getattr(element,
|
302
|
-
styles[
|
303
|
-
if getattr(element,
|
304
|
-
styles[
|
305
|
-
if getattr(element,
|
306
|
-
styles[
|
307
|
-
if getattr(element,
|
308
|
-
styles[
|
309
|
-
if getattr(element,
|
310
|
-
styles[
|
311
|
-
|
301
|
+
if getattr(element, "bold", False):
|
302
|
+
styles["bold"] += 1
|
303
|
+
if getattr(element, "italic", False):
|
304
|
+
styles["italic"] += 1
|
305
|
+
if getattr(element, "strikeout", False):
|
306
|
+
styles["strikeout"] += 1
|
307
|
+
if getattr(element, "underline", False):
|
308
|
+
styles["underline"] += 1
|
309
|
+
if getattr(element, "highlight", False):
|
310
|
+
styles["highlight"] += 1
|
311
|
+
|
312
312
|
# Color - use TextElement's color property
|
313
|
-
color = getattr(element,
|
313
|
+
color = getattr(element, "color", None)
|
314
314
|
if color:
|
315
315
|
if isinstance(color, (tuple, list)):
|
316
316
|
if color == (0, 0, 0) or color == (0.0, 0.0, 0.0):
|
317
|
-
colors[
|
317
|
+
colors["black"] += 1
|
318
318
|
elif color == (1, 1, 1) or color == (1.0, 1.0, 1.0):
|
319
|
-
colors[
|
319
|
+
colors["white"] += 1
|
320
320
|
else:
|
321
|
-
colors[
|
321
|
+
colors["other"] += 1
|
322
322
|
else:
|
323
323
|
colors[str(color)] += 1
|
324
|
-
|
324
|
+
|
325
325
|
result = {}
|
326
|
-
|
326
|
+
|
327
327
|
# Fonts
|
328
328
|
if fonts:
|
329
|
-
result[
|
330
|
-
|
329
|
+
result["fonts"] = dict(fonts.most_common(10))
|
330
|
+
|
331
331
|
# Sizes (as horizontal table)
|
332
332
|
if sizes:
|
333
|
-
result[
|
334
|
-
|
333
|
+
result["sizes"] = dict(sizes.most_common())
|
334
|
+
|
335
335
|
# Styles
|
336
336
|
style_list = []
|
337
337
|
for style, count in styles.items():
|
338
338
|
if count > 0:
|
339
339
|
style_list.append(f"{count} {style}")
|
340
340
|
if style_list:
|
341
|
-
result[
|
342
|
-
|
341
|
+
result["styles"] = ", ".join(style_list)
|
342
|
+
|
343
343
|
# Colors
|
344
344
|
if colors and len(colors) > 1: # Only show if there are multiple colors
|
345
|
-
result[
|
346
|
-
|
345
|
+
result["colors"] = dict(colors.most_common())
|
346
|
+
|
347
347
|
return result
|
348
348
|
|
349
349
|
|
350
350
|
def _analyze_ocr_quality(elements: List["Element"]) -> Dict[str, Any]:
|
351
351
|
"""Analyze OCR quality metrics."""
|
352
352
|
confidences = []
|
353
|
-
|
353
|
+
|
354
354
|
for element in elements:
|
355
|
-
confidence = getattr(element,
|
355
|
+
confidence = getattr(element, "confidence", None)
|
356
356
|
if confidence is not None:
|
357
357
|
confidences.append(confidence)
|
358
|
-
|
358
|
+
|
359
359
|
if not confidences:
|
360
360
|
return {}
|
361
|
-
|
361
|
+
|
362
362
|
result = {}
|
363
|
-
|
363
|
+
|
364
364
|
# Basic stats
|
365
|
-
result[
|
366
|
-
|
367
|
-
|
368
|
-
|
365
|
+
result["confidence_stats"] = {
|
366
|
+
"mean": f"{sum(confidences)/len(confidences):.2f}",
|
367
|
+
"min": f"{min(confidences):.2f}",
|
368
|
+
"max": f"{max(confidences):.2f}",
|
369
369
|
}
|
370
|
-
|
370
|
+
|
371
371
|
# Threshold analysis with ASCII bars
|
372
372
|
thresholds = [
|
373
|
-
(
|
374
|
-
(
|
375
|
-
(
|
373
|
+
("99%+", 0.99),
|
374
|
+
("95%+", 0.95),
|
375
|
+
("90%+", 0.90),
|
376
376
|
]
|
377
|
-
|
377
|
+
|
378
378
|
element_count = len(elements)
|
379
379
|
threshold_bars = {}
|
380
|
-
|
380
|
+
|
381
381
|
for label, threshold in thresholds:
|
382
382
|
count = sum(1 for c in confidences if c >= threshold)
|
383
383
|
percentage = count / element_count
|
384
|
-
|
384
|
+
|
385
385
|
# Create ASCII bar (40 characters wide)
|
386
386
|
filled_chars = int(percentage * 40)
|
387
387
|
empty_chars = 40 - filled_chars
|
388
|
-
bar =
|
389
|
-
|
388
|
+
bar = "█" * filled_chars + "░" * empty_chars
|
389
|
+
|
390
390
|
# Format: "95%+ (32/43) 74%: `████████████████████████████████░░░░░░░░`"
|
391
391
|
threshold_bars[f"{label} ({count}/{element_count}) {percentage:.0%}"] = f"`{bar}`"
|
392
|
-
|
393
|
-
result[
|
394
|
-
|
392
|
+
|
393
|
+
result["quality_distribution"] = threshold_bars
|
394
|
+
|
395
395
|
# Show lowest quality items
|
396
396
|
element_confidences = []
|
397
397
|
for element in elements:
|
398
|
-
confidence = getattr(element,
|
398
|
+
confidence = getattr(element, "confidence", None)
|
399
399
|
if confidence is not None:
|
400
400
|
# Get text content for display
|
401
|
-
text = getattr(element,
|
401
|
+
text = getattr(element, "text", "").strip()
|
402
402
|
if text:
|
403
403
|
# Truncate long text
|
404
404
|
display_text = text[:60] + "..." if len(text) > 60 else text
|
405
405
|
element_confidences.append((confidence, display_text))
|
406
|
-
|
406
|
+
|
407
407
|
if element_confidences:
|
408
408
|
# Sort by confidence (lowest first) and take bottom 10
|
409
409
|
lowest_quality = sorted(element_confidences, key=lambda x: x[0])[:10]
|
@@ -411,6 +411,6 @@ def _analyze_ocr_quality(elements: List["Element"]) -> Dict[str, Any]:
|
|
411
411
|
lowest_items = {}
|
412
412
|
for i, (confidence, text) in enumerate(lowest_quality, 1):
|
413
413
|
lowest_items[f"#{i}"] = f"**{confidence:.2f}**: {text}"
|
414
|
-
result[
|
415
|
-
|
416
|
-
return result
|
414
|
+
result["lowest_scoring"] = lowest_items
|
415
|
+
|
416
|
+
return result
|
natural_pdf/describe/mixin.py
CHANGED
@@ -11,52 +11,59 @@ if TYPE_CHECKING:
|
|
11
11
|
class DescribeMixin:
|
12
12
|
"""
|
13
13
|
Mixin providing describe functionality for pages, collections, and regions.
|
14
|
-
|
14
|
+
|
15
15
|
Classes that inherit from this mixin get:
|
16
16
|
- .describe() method for high-level summaries
|
17
17
|
- .inspect() method for detailed tabular views (collections only)
|
18
18
|
"""
|
19
|
-
|
19
|
+
|
20
20
|
def describe(self) -> "ElementSummary":
|
21
21
|
"""
|
22
22
|
Describe this object with type-specific analysis.
|
23
|
-
|
23
|
+
|
24
24
|
Returns:
|
25
25
|
ElementSummary with analysis appropriate for the object type
|
26
26
|
"""
|
27
|
-
from natural_pdf.describe import
|
28
|
-
|
27
|
+
from natural_pdf.describe import (
|
28
|
+
describe_collection,
|
29
|
+
describe_element,
|
30
|
+
describe_page,
|
31
|
+
describe_region,
|
32
|
+
)
|
33
|
+
|
29
34
|
# Determine the appropriate describe function based on class type
|
30
35
|
class_name = self.__class__.__name__
|
31
|
-
|
32
|
-
if class_name ==
|
36
|
+
|
37
|
+
if class_name == "Page":
|
33
38
|
return describe_page(self)
|
34
|
-
elif class_name ==
|
39
|
+
elif class_name == "ElementCollection":
|
35
40
|
return describe_collection(self)
|
36
|
-
elif class_name ==
|
41
|
+
elif class_name == "Region":
|
37
42
|
return describe_region(self)
|
38
43
|
else:
|
39
44
|
# Check if it's an individual element (inherits from Element base class)
|
40
45
|
from natural_pdf.elements.base import Element
|
46
|
+
|
41
47
|
if isinstance(self, Element):
|
42
48
|
return describe_element(self)
|
43
|
-
|
49
|
+
|
44
50
|
# Fallback - try to determine based on available methods/attributes
|
45
|
-
if hasattr(self,
|
51
|
+
if hasattr(self, "get_elements") and hasattr(self, "width") and hasattr(self, "height"):
|
46
52
|
# Looks like a page or region
|
47
|
-
if hasattr(self,
|
53
|
+
if hasattr(self, "number"):
|
48
54
|
return describe_page(self) # Page
|
49
55
|
else:
|
50
56
|
return describe_region(self) # Region
|
51
|
-
elif hasattr(self,
|
57
|
+
elif hasattr(self, "__iter__") and hasattr(self, "__len__"):
|
52
58
|
# Looks like a collection
|
53
59
|
return describe_collection(self)
|
54
60
|
else:
|
55
61
|
# Unknown type - create a basic summary
|
56
62
|
from natural_pdf.describe.summary import ElementSummary
|
63
|
+
|
57
64
|
data = {
|
58
65
|
"object_type": class_name,
|
59
|
-
"message": f"Describe not fully implemented for {class_name}"
|
66
|
+
"message": f"Describe not fully implemented for {class_name}",
|
60
67
|
}
|
61
68
|
return ElementSummary(data, f"{class_name} Summary")
|
62
69
|
|
@@ -64,21 +71,22 @@ class DescribeMixin:
|
|
64
71
|
class InspectMixin:
|
65
72
|
"""
|
66
73
|
Mixin providing inspect functionality for collections.
|
67
|
-
|
74
|
+
|
68
75
|
Classes that inherit from this mixin get:
|
69
76
|
- .inspect() method for detailed tabular element views
|
70
77
|
"""
|
71
|
-
|
78
|
+
|
72
79
|
def inspect(self, limit: int = 30) -> "InspectionSummary":
|
73
80
|
"""
|
74
81
|
Inspect elements with detailed tabular view.
|
75
|
-
|
82
|
+
|
76
83
|
Args:
|
77
84
|
limit: Maximum elements per type to show (default: 30)
|
78
|
-
|
85
|
+
|
79
86
|
Returns:
|
80
87
|
InspectionSummary with element tables showing coordinates,
|
81
88
|
properties, and other details for each element
|
82
89
|
"""
|
83
90
|
from natural_pdf.describe import inspect_collection
|
84
|
-
|
91
|
+
|
92
|
+
return inspect_collection(self, limit=limit)
|