natural-pdf 0.1.32__py3-none-any.whl → 0.1.34__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. natural_pdf/analyzers/__init__.py +2 -2
  2. natural_pdf/analyzers/guides.py +670 -595
  3. natural_pdf/analyzers/layout/base.py +53 -6
  4. natural_pdf/analyzers/layout/layout_analyzer.py +3 -1
  5. natural_pdf/analyzers/layout/layout_manager.py +18 -14
  6. natural_pdf/analyzers/layout/layout_options.py +1 -0
  7. natural_pdf/analyzers/layout/paddle.py +102 -64
  8. natural_pdf/analyzers/layout/table_structure_utils.py +3 -1
  9. natural_pdf/analyzers/layout/yolo.py +2 -6
  10. natural_pdf/analyzers/shape_detection_mixin.py +15 -6
  11. natural_pdf/classification/manager.py +92 -77
  12. natural_pdf/classification/mixin.py +49 -5
  13. natural_pdf/classification/results.py +1 -1
  14. natural_pdf/cli.py +7 -3
  15. natural_pdf/collections/pdf_collection.py +96 -101
  16. natural_pdf/core/element_manager.py +188 -82
  17. natural_pdf/core/highlighting_service.py +5 -6
  18. natural_pdf/core/page.py +132 -16
  19. natural_pdf/core/pdf.py +486 -71
  20. natural_pdf/describe/__init__.py +18 -12
  21. natural_pdf/describe/base.py +179 -172
  22. natural_pdf/describe/elements.py +155 -155
  23. natural_pdf/describe/mixin.py +27 -19
  24. natural_pdf/describe/summary.py +44 -55
  25. natural_pdf/elements/base.py +134 -18
  26. natural_pdf/elements/collections.py +90 -18
  27. natural_pdf/elements/image.py +2 -1
  28. natural_pdf/elements/line.py +0 -31
  29. natural_pdf/elements/rect.py +0 -14
  30. natural_pdf/elements/region.py +238 -111
  31. natural_pdf/elements/text.py +18 -12
  32. natural_pdf/exporters/__init__.py +4 -1
  33. natural_pdf/exporters/original_pdf.py +12 -4
  34. natural_pdf/extraction/mixin.py +66 -10
  35. natural_pdf/extraction/result.py +1 -1
  36. natural_pdf/flows/flow.py +63 -4
  37. natural_pdf/flows/region.py +4 -4
  38. natural_pdf/ocr/engine.py +83 -2
  39. natural_pdf/ocr/engine_paddle.py +5 -5
  40. natural_pdf/ocr/ocr_factory.py +2 -1
  41. natural_pdf/ocr/ocr_manager.py +24 -13
  42. natural_pdf/ocr/ocr_options.py +3 -10
  43. natural_pdf/qa/document_qa.py +21 -8
  44. natural_pdf/qa/qa_result.py +3 -7
  45. natural_pdf/search/__init__.py +3 -2
  46. natural_pdf/search/lancedb_search_service.py +5 -6
  47. natural_pdf/search/numpy_search_service.py +5 -2
  48. natural_pdf/selectors/parser.py +51 -6
  49. natural_pdf/tables/__init__.py +2 -2
  50. natural_pdf/tables/result.py +7 -6
  51. natural_pdf/utils/bidi_mirror.py +2 -1
  52. natural_pdf/utils/reading_order.py +3 -2
  53. natural_pdf/utils/visualization.py +3 -3
  54. natural_pdf/widgets/viewer.py +0 -1
  55. {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/METADATA +1 -1
  56. natural_pdf-0.1.34.dist-info/RECORD +121 -0
  57. optimization/memory_comparison.py +73 -58
  58. optimization/pdf_analyzer.py +141 -96
  59. optimization/performance_analysis.py +111 -110
  60. optimization/test_cleanup_methods.py +47 -36
  61. optimization/test_memory_fix.py +40 -39
  62. tools/bad_pdf_eval/__init__.py +0 -1
  63. tools/bad_pdf_eval/analyser.py +35 -18
  64. tools/bad_pdf_eval/collate_summaries.py +22 -18
  65. tools/bad_pdf_eval/compile_attempts_markdown.py +127 -0
  66. tools/bad_pdf_eval/eval_suite.py +21 -9
  67. tools/bad_pdf_eval/evaluate_quality.py +198 -0
  68. tools/bad_pdf_eval/export_enrichment_csv.py +12 -8
  69. tools/bad_pdf_eval/llm_enrich.py +71 -39
  70. tools/bad_pdf_eval/llm_enrich_with_retry.py +289 -0
  71. tools/bad_pdf_eval/reporter.py +1 -1
  72. tools/bad_pdf_eval/utils.py +7 -4
  73. natural_pdf-0.1.32.dist-info/RECORD +0 -118
  74. {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/WHEEL +0 -0
  75. {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/entry_points.txt +0 -0
  76. {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/licenses/LICENSE +0 -0
  77. {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/top_level.txt +0 -0
@@ -15,263 +15,263 @@ logger = logging.getLogger(__name__)
15
15
  def describe_text_elements(elements: List["Element"]) -> Dict[str, Any]:
16
16
  """
17
17
  Describe text elements with typography and OCR analysis.
18
-
18
+
19
19
  Args:
20
20
  elements: List of text elements
21
-
21
+
22
22
  Returns:
23
23
  Dictionary with text analysis sections
24
24
  """
25
25
  if not elements:
26
26
  return {"message": "No text elements found"}
27
-
27
+
28
28
  result = {}
29
-
29
+
30
30
  # Source breakdown
31
31
  sources = Counter()
32
32
  ocr_elements = []
33
-
33
+
34
34
  for element in elements:
35
- source = getattr(element, 'source', 'unknown')
35
+ source = getattr(element, "source", "unknown")
36
36
  sources[source] += 1
37
- if source == 'ocr':
37
+ if source == "ocr":
38
38
  ocr_elements.append(element)
39
-
39
+
40
40
  if len(sources) > 1:
41
- result['sources'] = dict(sources)
42
-
41
+ result["sources"] = dict(sources)
42
+
43
43
  # Typography analysis
44
44
  typography = _analyze_typography(elements)
45
45
  if typography:
46
- result['typography'] = typography
47
-
46
+ result["typography"] = typography
47
+
48
48
  # OCR quality analysis
49
49
  if ocr_elements:
50
50
  ocr_quality = _analyze_ocr_quality(ocr_elements)
51
51
  if ocr_quality:
52
- result['ocr_quality'] = ocr_quality
53
-
52
+ result["ocr_quality"] = ocr_quality
53
+
54
54
  return result
55
55
 
56
56
 
57
57
  def describe_rect_elements(elements: List["Element"]) -> Dict[str, Any]:
58
58
  """
59
59
  Describe rectangle elements with size and style analysis.
60
-
60
+
61
61
  Args:
62
62
  elements: List of rectangle elements
63
-
63
+
64
64
  Returns:
65
65
  Dictionary with rectangle analysis
66
66
  """
67
67
  if not elements:
68
68
  return {"message": "No rectangle elements found"}
69
-
69
+
70
70
  result = {}
71
-
71
+
72
72
  # Size analysis
73
73
  sizes = []
74
74
  stroke_count = 0
75
75
  fill_count = 0
76
76
  colors = Counter()
77
77
  stroke_widths = []
78
-
78
+
79
79
  for element in elements:
80
80
  # Size
81
- width = getattr(element, 'width', 0)
82
- height = getattr(element, 'height', 0)
81
+ width = getattr(element, "width", 0)
82
+ height = getattr(element, "height", 0)
83
83
  if width and height:
84
84
  sizes.append((width, height))
85
-
85
+
86
86
  # Style properties - use RectangleElement properties
87
- stroke = getattr(element, 'stroke', None)
87
+ stroke = getattr(element, "stroke", None)
88
88
  if stroke and stroke != (0, 0, 0): # Check if stroke color exists and isn't black
89
89
  stroke_count += 1
90
- fill = getattr(element, 'fill', None)
90
+ fill = getattr(element, "fill", None)
91
91
  if fill and fill != (0, 0, 0): # Check if fill color exists and isn't black
92
92
  fill_count += 1
93
-
93
+
94
94
  # Stroke width
95
- stroke_width = getattr(element, 'stroke_width', 0)
95
+ stroke_width = getattr(element, "stroke_width", 0)
96
96
  if stroke_width > 0:
97
97
  stroke_widths.append(stroke_width)
98
-
98
+
99
99
  # Color - use the element's stroke/fill properties
100
100
  color = stroke or fill
101
101
  if color:
102
102
  if isinstance(color, (tuple, list)):
103
103
  if color == (0, 0, 0) or color == (0.0, 0.0, 0.0):
104
- colors['black'] += 1
104
+ colors["black"] += 1
105
105
  elif color == (1, 1, 1) or color == (1.0, 1.0, 1.0):
106
- colors['white'] += 1
106
+ colors["white"] += 1
107
107
  else:
108
108
  colors[str(color)] += 1
109
109
  else:
110
110
  colors[str(color)] += 1
111
-
111
+
112
112
  # Size statistics
113
113
  if sizes:
114
114
  widths = [s[0] for s in sizes]
115
115
  heights = [s[1] for s in sizes]
116
- result['size_stats'] = {
117
- 'width_range': f"{min(widths):.0f}-{max(widths):.0f}",
118
- 'height_range': f"{min(heights):.0f}-{max(heights):.0f}",
119
- 'avg_area': f"{sum(w*h for w,h in sizes)/len(sizes):.0f} sq pts"
116
+ result["size_stats"] = {
117
+ "width_range": f"{min(widths):.0f}-{max(widths):.0f}",
118
+ "height_range": f"{min(heights):.0f}-{max(heights):.0f}",
119
+ "avg_area": f"{sum(w*h for w,h in sizes)/len(sizes):.0f} sq pts",
120
120
  }
121
-
121
+
122
122
  # Style breakdown
123
123
  style_info = {}
124
124
  if stroke_count:
125
- style_info['stroke'] = stroke_count
125
+ style_info["stroke"] = stroke_count
126
126
  if fill_count:
127
- style_info['fill'] = fill_count
127
+ style_info["fill"] = fill_count
128
128
  if stroke_widths:
129
129
  stroke_width_counts = Counter(stroke_widths)
130
130
  # Convert float keys to strings to avoid formatting issues
131
131
  stroke_width_dict = {str(k): v for k, v in stroke_width_counts.most_common()}
132
- style_info['stroke_widths'] = stroke_width_dict
132
+ style_info["stroke_widths"] = stroke_width_dict
133
133
  if colors:
134
- style_info['colors'] = dict(colors.most_common(5))
135
-
134
+ style_info["colors"] = dict(colors.most_common(5))
135
+
136
136
  if style_info:
137
- result['styles'] = style_info
138
-
137
+ result["styles"] = style_info
138
+
139
139
  return result
140
140
 
141
141
 
142
142
  def describe_line_elements(elements: List["Element"]) -> Dict[str, Any]:
143
143
  """
144
144
  Describe line elements with length and style analysis.
145
-
145
+
146
146
  Args:
147
147
  elements: List of line elements
148
-
148
+
149
149
  Returns:
150
150
  Dictionary with line analysis
151
151
  """
152
152
  if not elements:
153
153
  return {"message": "No line elements found"}
154
-
154
+
155
155
  result = {}
156
-
156
+
157
157
  lengths = []
158
158
  widths = []
159
159
  colors = Counter()
160
-
160
+
161
161
  for element in elements:
162
162
  # Calculate length
163
- x0 = getattr(element, 'x0', 0)
164
- y0 = getattr(element, 'top', 0)
165
- x1 = getattr(element, 'x1', 0)
166
- y1 = getattr(element, 'bottom', 0)
167
-
163
+ x0 = getattr(element, "x0", 0)
164
+ y0 = getattr(element, "top", 0)
165
+ x1 = getattr(element, "x1", 0)
166
+ y1 = getattr(element, "bottom", 0)
167
+
168
168
  length = ((x1 - x0) ** 2 + (y1 - y0) ** 2) ** 0.5
169
169
  if length > 0:
170
170
  lengths.append(length)
171
-
171
+
172
172
  # Line width - use the element's width property
173
- width = getattr(element, 'width', 0) # LineElement has a width property
173
+ width = getattr(element, "width", 0) # LineElement has a width property
174
174
  if width:
175
175
  widths.append(width)
176
-
176
+
177
177
  # Color - use the element's color property
178
- color = getattr(element, 'color', None) # LineElement has a color property
178
+ color = getattr(element, "color", None) # LineElement has a color property
179
179
  if color:
180
180
  if isinstance(color, (tuple, list)):
181
181
  if color == (0, 0, 0) or color == (0.0, 0.0, 0.0):
182
- colors['black'] += 1
182
+ colors["black"] += 1
183
183
  else:
184
184
  colors[str(color)] += 1
185
185
  else:
186
186
  colors[str(color)] += 1
187
-
187
+
188
188
  # Length statistics
189
189
  if lengths:
190
- result['length_stats'] = {
191
- 'min': f"{min(lengths):.0f}",
192
- 'max': f"{max(lengths):.0f}",
193
- 'avg': f"{sum(lengths)/len(lengths):.0f}"
190
+ result["length_stats"] = {
191
+ "min": f"{min(lengths):.0f}",
192
+ "max": f"{max(lengths):.0f}",
193
+ "avg": f"{sum(lengths)/len(lengths):.0f}",
194
194
  }
195
-
195
+
196
196
  # Width statistics
197
197
  if widths:
198
198
  width_counts = Counter(widths)
199
199
  # Convert float keys to strings to avoid formatting issues
200
- result['line_widths'] = {str(k): v for k, v in width_counts.most_common()}
201
-
200
+ result["line_widths"] = {str(k): v for k, v in width_counts.most_common()}
201
+
202
202
  # Orientation analysis
203
- horizontal_count = sum(1 for el in elements if getattr(el, 'is_horizontal', False))
204
- vertical_count = sum(1 for el in elements if getattr(el, 'is_vertical', False))
203
+ horizontal_count = sum(1 for el in elements if getattr(el, "is_horizontal", False))
204
+ vertical_count = sum(1 for el in elements if getattr(el, "is_vertical", False))
205
205
  diagonal_count = len(elements) - horizontal_count - vertical_count
206
-
206
+
207
207
  if horizontal_count or vertical_count or diagonal_count:
208
208
  orientation_info = {}
209
209
  if horizontal_count:
210
- orientation_info['horizontal'] = horizontal_count
210
+ orientation_info["horizontal"] = horizontal_count
211
211
  if vertical_count:
212
- orientation_info['vertical'] = vertical_count
212
+ orientation_info["vertical"] = vertical_count
213
213
  if diagonal_count:
214
- orientation_info['diagonal'] = diagonal_count
215
- result['orientations'] = orientation_info
216
-
214
+ orientation_info["diagonal"] = diagonal_count
215
+ result["orientations"] = orientation_info
216
+
217
217
  # Colors
218
218
  if colors:
219
- result['colors'] = dict(colors.most_common())
220
-
219
+ result["colors"] = dict(colors.most_common())
220
+
221
221
  return result
222
222
 
223
223
 
224
224
  def describe_region_elements(elements: List["Element"]) -> Dict[str, Any]:
225
225
  """
226
226
  Describe region elements with type and metadata analysis.
227
-
227
+
228
228
  Args:
229
229
  elements: List of region elements
230
-
230
+
231
231
  Returns:
232
232
  Dictionary with region analysis
233
233
  """
234
234
  if not elements:
235
235
  return {"message": "No region elements found"}
236
-
236
+
237
237
  result = {}
238
-
238
+
239
239
  # Region types
240
240
  types = Counter()
241
241
  sizes = []
242
242
  metadata_keys = set()
243
-
243
+
244
244
  for element in elements:
245
245
  # Type
246
- region_type = getattr(element, 'type', 'unknown')
246
+ region_type = getattr(element, "type", "unknown")
247
247
  types[region_type] += 1
248
-
248
+
249
249
  # Size
250
- width = getattr(element, 'width', 0)
251
- height = getattr(element, 'height', 0)
250
+ width = getattr(element, "width", 0)
251
+ height = getattr(element, "height", 0)
252
252
  if width and height:
253
253
  sizes.append(width * height)
254
-
254
+
255
255
  # Metadata keys
256
- if hasattr(element, 'metadata') and element.metadata:
256
+ if hasattr(element, "metadata") and element.metadata:
257
257
  metadata_keys.update(element.metadata.keys())
258
-
258
+
259
259
  # Type breakdown
260
260
  if types:
261
- result['types'] = dict(types.most_common())
262
-
261
+ result["types"] = dict(types.most_common())
262
+
263
263
  # Size statistics
264
264
  if sizes:
265
- result['size_stats'] = {
266
- 'min_area': f"{min(sizes):.0f} sq pts",
267
- 'max_area': f"{max(sizes):.0f} sq pts",
268
- 'avg_area': f"{sum(sizes)/len(sizes):.0f} sq pts"
265
+ result["size_stats"] = {
266
+ "min_area": f"{min(sizes):.0f} sq pts",
267
+ "max_area": f"{max(sizes):.0f} sq pts",
268
+ "avg_area": f"{sum(sizes)/len(sizes):.0f} sq pts",
269
269
  }
270
-
270
+
271
271
  # Metadata
272
272
  if metadata_keys:
273
- result['metadata_keys'] = sorted(list(metadata_keys))
274
-
273
+ result["metadata_keys"] = sorted(list(metadata_keys))
274
+
275
275
  return result
276
276
 
277
277
 
@@ -279,131 +279,131 @@ def _analyze_typography(elements: List["Element"]) -> Dict[str, Any]:
279
279
  """Analyze typography patterns in text elements."""
280
280
  fonts = Counter()
281
281
  sizes = Counter()
282
- styles = {'bold': 0, 'italic': 0, 'strikeout': 0, 'underline': 0, 'highlight': 0}
282
+ styles = {"bold": 0, "italic": 0, "strikeout": 0, "underline": 0, "highlight": 0}
283
283
  colors = Counter()
284
-
284
+
285
285
  for element in elements:
286
286
  # Font family - use TextElement's font_family property for cleaner names
287
- font_family = getattr(element, 'font_family', None)
288
- fontname = getattr(element, 'fontname', 'Unknown')
287
+ font_family = getattr(element, "font_family", None)
288
+ fontname = getattr(element, "fontname", "Unknown")
289
289
  display_font = font_family if font_family and font_family != fontname else fontname
290
290
  if display_font:
291
291
  fonts[display_font] += 1
292
-
292
+
293
293
  # Size
294
- size = getattr(element, 'size', None)
294
+ size = getattr(element, "size", None)
295
295
  if size:
296
296
  # Round to nearest 0.5
297
297
  rounded_size = round(size * 2) / 2
298
298
  sizes[f"{rounded_size}pt"] += 1
299
-
299
+
300
300
  # Styles
301
- if getattr(element, 'bold', False):
302
- styles['bold'] += 1
303
- if getattr(element, 'italic', False):
304
- styles['italic'] += 1
305
- if getattr(element, 'strikeout', False):
306
- styles['strikeout'] += 1
307
- if getattr(element, 'underline', False):
308
- styles['underline'] += 1
309
- if getattr(element, 'highlight', False):
310
- styles['highlight'] += 1
311
-
301
+ if getattr(element, "bold", False):
302
+ styles["bold"] += 1
303
+ if getattr(element, "italic", False):
304
+ styles["italic"] += 1
305
+ if getattr(element, "strikeout", False):
306
+ styles["strikeout"] += 1
307
+ if getattr(element, "underline", False):
308
+ styles["underline"] += 1
309
+ if getattr(element, "highlight", False):
310
+ styles["highlight"] += 1
311
+
312
312
  # Color - use TextElement's color property
313
- color = getattr(element, 'color', None)
313
+ color = getattr(element, "color", None)
314
314
  if color:
315
315
  if isinstance(color, (tuple, list)):
316
316
  if color == (0, 0, 0) or color == (0.0, 0.0, 0.0):
317
- colors['black'] += 1
317
+ colors["black"] += 1
318
318
  elif color == (1, 1, 1) or color == (1.0, 1.0, 1.0):
319
- colors['white'] += 1
319
+ colors["white"] += 1
320
320
  else:
321
- colors['other'] += 1
321
+ colors["other"] += 1
322
322
  else:
323
323
  colors[str(color)] += 1
324
-
324
+
325
325
  result = {}
326
-
326
+
327
327
  # Fonts
328
328
  if fonts:
329
- result['fonts'] = dict(fonts.most_common(10))
330
-
329
+ result["fonts"] = dict(fonts.most_common(10))
330
+
331
331
  # Sizes (as horizontal table)
332
332
  if sizes:
333
- result['sizes'] = dict(sizes.most_common())
334
-
333
+ result["sizes"] = dict(sizes.most_common())
334
+
335
335
  # Styles
336
336
  style_list = []
337
337
  for style, count in styles.items():
338
338
  if count > 0:
339
339
  style_list.append(f"{count} {style}")
340
340
  if style_list:
341
- result['styles'] = ", ".join(style_list)
342
-
341
+ result["styles"] = ", ".join(style_list)
342
+
343
343
  # Colors
344
344
  if colors and len(colors) > 1: # Only show if there are multiple colors
345
- result['colors'] = dict(colors.most_common())
346
-
345
+ result["colors"] = dict(colors.most_common())
346
+
347
347
  return result
348
348
 
349
349
 
350
350
  def _analyze_ocr_quality(elements: List["Element"]) -> Dict[str, Any]:
351
351
  """Analyze OCR quality metrics."""
352
352
  confidences = []
353
-
353
+
354
354
  for element in elements:
355
- confidence = getattr(element, 'confidence', None)
355
+ confidence = getattr(element, "confidence", None)
356
356
  if confidence is not None:
357
357
  confidences.append(confidence)
358
-
358
+
359
359
  if not confidences:
360
360
  return {}
361
-
361
+
362
362
  result = {}
363
-
363
+
364
364
  # Basic stats
365
- result['confidence_stats'] = {
366
- 'mean': f"{sum(confidences)/len(confidences):.2f}",
367
- 'min': f"{min(confidences):.2f}",
368
- 'max': f"{max(confidences):.2f}"
365
+ result["confidence_stats"] = {
366
+ "mean": f"{sum(confidences)/len(confidences):.2f}",
367
+ "min": f"{min(confidences):.2f}",
368
+ "max": f"{max(confidences):.2f}",
369
369
  }
370
-
370
+
371
371
  # Threshold analysis with ASCII bars
372
372
  thresholds = [
373
- ('99%+', 0.99),
374
- ('95%+', 0.95),
375
- ('90%+', 0.90),
373
+ ("99%+", 0.99),
374
+ ("95%+", 0.95),
375
+ ("90%+", 0.90),
376
376
  ]
377
-
377
+
378
378
  element_count = len(elements)
379
379
  threshold_bars = {}
380
-
380
+
381
381
  for label, threshold in thresholds:
382
382
  count = sum(1 for c in confidences if c >= threshold)
383
383
  percentage = count / element_count
384
-
384
+
385
385
  # Create ASCII bar (40 characters wide)
386
386
  filled_chars = int(percentage * 40)
387
387
  empty_chars = 40 - filled_chars
388
- bar = '█' * filled_chars + '░' * empty_chars
389
-
388
+ bar = "█" * filled_chars + "░" * empty_chars
389
+
390
390
  # Format: "95%+ (32/43) 74%: `████████████████████████████████░░░░░░░░`"
391
391
  threshold_bars[f"{label} ({count}/{element_count}) {percentage:.0%}"] = f"`{bar}`"
392
-
393
- result['quality_distribution'] = threshold_bars
394
-
392
+
393
+ result["quality_distribution"] = threshold_bars
394
+
395
395
  # Show lowest quality items
396
396
  element_confidences = []
397
397
  for element in elements:
398
- confidence = getattr(element, 'confidence', None)
398
+ confidence = getattr(element, "confidence", None)
399
399
  if confidence is not None:
400
400
  # Get text content for display
401
- text = getattr(element, 'text', '').strip()
401
+ text = getattr(element, "text", "").strip()
402
402
  if text:
403
403
  # Truncate long text
404
404
  display_text = text[:60] + "..." if len(text) > 60 else text
405
405
  element_confidences.append((confidence, display_text))
406
-
406
+
407
407
  if element_confidences:
408
408
  # Sort by confidence (lowest first) and take bottom 10
409
409
  lowest_quality = sorted(element_confidences, key=lambda x: x[0])[:10]
@@ -411,6 +411,6 @@ def _analyze_ocr_quality(elements: List["Element"]) -> Dict[str, Any]:
411
411
  lowest_items = {}
412
412
  for i, (confidence, text) in enumerate(lowest_quality, 1):
413
413
  lowest_items[f"#{i}"] = f"**{confidence:.2f}**: {text}"
414
- result['lowest_scoring'] = lowest_items
415
-
416
- return result
414
+ result["lowest_scoring"] = lowest_items
415
+
416
+ return result
@@ -11,52 +11,59 @@ if TYPE_CHECKING:
11
11
  class DescribeMixin:
12
12
  """
13
13
  Mixin providing describe functionality for pages, collections, and regions.
14
-
14
+
15
15
  Classes that inherit from this mixin get:
16
16
  - .describe() method for high-level summaries
17
17
  - .inspect() method for detailed tabular views (collections only)
18
18
  """
19
-
19
+
20
20
  def describe(self) -> "ElementSummary":
21
21
  """
22
22
  Describe this object with type-specific analysis.
23
-
23
+
24
24
  Returns:
25
25
  ElementSummary with analysis appropriate for the object type
26
26
  """
27
- from natural_pdf.describe import describe_page, describe_collection, describe_region, describe_element
28
-
27
+ from natural_pdf.describe import (
28
+ describe_collection,
29
+ describe_element,
30
+ describe_page,
31
+ describe_region,
32
+ )
33
+
29
34
  # Determine the appropriate describe function based on class type
30
35
  class_name = self.__class__.__name__
31
-
32
- if class_name == 'Page':
36
+
37
+ if class_name == "Page":
33
38
  return describe_page(self)
34
- elif class_name == 'ElementCollection':
39
+ elif class_name == "ElementCollection":
35
40
  return describe_collection(self)
36
- elif class_name == 'Region':
41
+ elif class_name == "Region":
37
42
  return describe_region(self)
38
43
  else:
39
44
  # Check if it's an individual element (inherits from Element base class)
40
45
  from natural_pdf.elements.base import Element
46
+
41
47
  if isinstance(self, Element):
42
48
  return describe_element(self)
43
-
49
+
44
50
  # Fallback - try to determine based on available methods/attributes
45
- if hasattr(self, 'get_elements') and hasattr(self, 'width') and hasattr(self, 'height'):
51
+ if hasattr(self, "get_elements") and hasattr(self, "width") and hasattr(self, "height"):
46
52
  # Looks like a page or region
47
- if hasattr(self, 'number'):
53
+ if hasattr(self, "number"):
48
54
  return describe_page(self) # Page
49
55
  else:
50
56
  return describe_region(self) # Region
51
- elif hasattr(self, '__iter__') and hasattr(self, '__len__'):
57
+ elif hasattr(self, "__iter__") and hasattr(self, "__len__"):
52
58
  # Looks like a collection
53
59
  return describe_collection(self)
54
60
  else:
55
61
  # Unknown type - create a basic summary
56
62
  from natural_pdf.describe.summary import ElementSummary
63
+
57
64
  data = {
58
65
  "object_type": class_name,
59
- "message": f"Describe not fully implemented for {class_name}"
66
+ "message": f"Describe not fully implemented for {class_name}",
60
67
  }
61
68
  return ElementSummary(data, f"{class_name} Summary")
62
69
 
@@ -64,21 +71,22 @@ class DescribeMixin:
64
71
  class InspectMixin:
65
72
  """
66
73
  Mixin providing inspect functionality for collections.
67
-
74
+
68
75
  Classes that inherit from this mixin get:
69
76
  - .inspect() method for detailed tabular element views
70
77
  """
71
-
78
+
72
79
  def inspect(self, limit: int = 30) -> "InspectionSummary":
73
80
  """
74
81
  Inspect elements with detailed tabular view.
75
-
82
+
76
83
  Args:
77
84
  limit: Maximum elements per type to show (default: 30)
78
-
85
+
79
86
  Returns:
80
87
  InspectionSummary with element tables showing coordinates,
81
88
  properties, and other details for each element
82
89
  """
83
90
  from natural_pdf.describe import inspect_collection
84
- return inspect_collection(self, limit=limit)
91
+
92
+ return inspect_collection(self, limit=limit)