natural-pdf 0.1.33__py3-none-any.whl → 0.1.35__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77) hide show
  1. natural_pdf/analyzers/__init__.py +2 -2
  2. natural_pdf/analyzers/guides.py +751 -607
  3. natural_pdf/analyzers/layout/base.py +53 -6
  4. natural_pdf/analyzers/layout/layout_analyzer.py +3 -1
  5. natural_pdf/analyzers/layout/layout_manager.py +18 -14
  6. natural_pdf/analyzers/layout/layout_options.py +1 -0
  7. natural_pdf/analyzers/layout/paddle.py +102 -64
  8. natural_pdf/analyzers/layout/table_structure_utils.py +3 -1
  9. natural_pdf/analyzers/layout/yolo.py +2 -6
  10. natural_pdf/analyzers/shape_detection_mixin.py +15 -6
  11. natural_pdf/classification/manager.py +92 -77
  12. natural_pdf/classification/mixin.py +49 -5
  13. natural_pdf/classification/results.py +1 -1
  14. natural_pdf/cli.py +7 -3
  15. natural_pdf/collections/pdf_collection.py +96 -101
  16. natural_pdf/core/element_manager.py +131 -45
  17. natural_pdf/core/highlighting_service.py +5 -6
  18. natural_pdf/core/page.py +120 -23
  19. natural_pdf/core/pdf.py +477 -75
  20. natural_pdf/describe/__init__.py +18 -12
  21. natural_pdf/describe/base.py +179 -172
  22. natural_pdf/describe/elements.py +155 -155
  23. natural_pdf/describe/mixin.py +27 -19
  24. natural_pdf/describe/summary.py +44 -55
  25. natural_pdf/elements/base.py +134 -18
  26. natural_pdf/elements/collections.py +90 -18
  27. natural_pdf/elements/image.py +2 -1
  28. natural_pdf/elements/line.py +0 -31
  29. natural_pdf/elements/rect.py +0 -14
  30. natural_pdf/elements/region.py +222 -108
  31. natural_pdf/elements/text.py +18 -12
  32. natural_pdf/exporters/__init__.py +4 -1
  33. natural_pdf/exporters/original_pdf.py +12 -4
  34. natural_pdf/extraction/mixin.py +66 -10
  35. natural_pdf/extraction/result.py +1 -1
  36. natural_pdf/flows/flow.py +63 -4
  37. natural_pdf/flows/region.py +4 -4
  38. natural_pdf/ocr/engine.py +83 -2
  39. natural_pdf/ocr/engine_paddle.py +5 -5
  40. natural_pdf/ocr/ocr_factory.py +2 -1
  41. natural_pdf/ocr/ocr_manager.py +24 -13
  42. natural_pdf/ocr/ocr_options.py +3 -10
  43. natural_pdf/qa/document_qa.py +21 -8
  44. natural_pdf/qa/qa_result.py +3 -7
  45. natural_pdf/search/__init__.py +3 -2
  46. natural_pdf/search/lancedb_search_service.py +5 -6
  47. natural_pdf/search/numpy_search_service.py +5 -2
  48. natural_pdf/selectors/parser.py +51 -6
  49. natural_pdf/tables/__init__.py +2 -2
  50. natural_pdf/tables/result.py +7 -6
  51. natural_pdf/utils/bidi_mirror.py +2 -1
  52. natural_pdf/utils/reading_order.py +3 -2
  53. natural_pdf/utils/visualization.py +3 -3
  54. natural_pdf/widgets/viewer.py +0 -1
  55. {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/METADATA +1 -1
  56. natural_pdf-0.1.35.dist-info/RECORD +121 -0
  57. optimization/memory_comparison.py +73 -58
  58. optimization/pdf_analyzer.py +141 -96
  59. optimization/performance_analysis.py +111 -110
  60. optimization/test_cleanup_methods.py +47 -36
  61. optimization/test_memory_fix.py +40 -39
  62. tools/bad_pdf_eval/__init__.py +0 -1
  63. tools/bad_pdf_eval/analyser.py +35 -18
  64. tools/bad_pdf_eval/collate_summaries.py +22 -18
  65. tools/bad_pdf_eval/compile_attempts_markdown.py +127 -0
  66. tools/bad_pdf_eval/eval_suite.py +21 -9
  67. tools/bad_pdf_eval/evaluate_quality.py +198 -0
  68. tools/bad_pdf_eval/export_enrichment_csv.py +12 -8
  69. tools/bad_pdf_eval/llm_enrich.py +71 -39
  70. tools/bad_pdf_eval/llm_enrich_with_retry.py +289 -0
  71. tools/bad_pdf_eval/reporter.py +1 -1
  72. tools/bad_pdf_eval/utils.py +7 -4
  73. natural_pdf-0.1.33.dist-info/RECORD +0 -118
  74. {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/WHEEL +0 -0
  75. {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/entry_points.txt +0 -0
  76. {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/licenses/LICENSE +0 -0
  77. {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/top_level.txt +0 -0
@@ -26,41 +26,41 @@ logger = logging.getLogger(__name__)
26
26
  def describe_page(page: "Page") -> ElementSummary:
27
27
  """
28
28
  Describe what's on a page with high-level summary.
29
-
29
+
30
30
  Args:
31
31
  page: Page to describe
32
-
32
+
33
33
  Returns:
34
34
  ElementSummary with page overview
35
35
  """
36
36
  data = {}
37
-
37
+
38
38
  # Get all elements
39
39
  all_elements = page.get_elements()
40
-
40
+
41
41
  if not all_elements:
42
42
  data["message"] = "No elements found on page"
43
43
  return ElementSummary(data, f"Page {page.number} Summary")
44
-
44
+
45
45
  # Element counts by type (exclude chars - too granular)
46
46
  type_counts = Counter()
47
47
  for element in all_elements:
48
- element_type = getattr(element, 'type', 'unknown')
49
- if element_type != 'char': # Skip character elements
48
+ element_type = getattr(element, "type", "unknown")
49
+ if element_type != "char": # Skip character elements
50
50
  type_counts[element_type] += 1
51
-
51
+
52
52
  # Format element counts as dictionary for proper list formatting
53
53
  element_summary = {}
54
54
  for element_type, count in type_counts.most_common():
55
- type_display = element_type.replace('_', ' ').title()
56
- if element_type == 'word':
55
+ type_display = element_type.replace("_", " ").title()
56
+ if element_type == "word":
57
57
  # Add source breakdown for text
58
- text_elements = [e for e in all_elements if getattr(e, 'type', '') == 'word']
58
+ text_elements = [e for e in all_elements if getattr(e, "type", "") == "word"]
59
59
  sources = Counter()
60
60
  for elem in text_elements:
61
- source = getattr(elem, 'source', 'unknown')
61
+ source = getattr(elem, "source", "unknown")
62
62
  sources[source] += 1
63
-
63
+
64
64
  if len(sources) > 1:
65
65
  source_parts = []
66
66
  for source, source_count in sources.most_common():
@@ -70,86 +70,83 @@ def describe_page(page: "Page") -> ElementSummary:
70
70
  element_summary["text"] = f"{count} elements"
71
71
  else:
72
72
  element_summary[element_type] = f"{count} elements"
73
-
73
+
74
74
  data["elements"] = element_summary
75
-
75
+
76
76
  # Text analysis if we have text elements (exclude chars - too granular)
77
- text_elements = [e for e in all_elements if getattr(e, 'type', '') == 'word']
77
+ text_elements = [e for e in all_elements if getattr(e, "type", "") == "word"]
78
78
  if text_elements:
79
79
  text_analysis = describe_text_elements(text_elements)
80
- if text_analysis and 'message' not in text_analysis:
80
+ if text_analysis and "message" not in text_analysis:
81
81
  data["text_analysis"] = text_analysis
82
-
82
+
83
83
  return ElementSummary(data, f"Page {page.number} Summary")
84
84
 
85
85
 
86
86
  def describe_collection(collection: "ElementCollection") -> ElementSummary:
87
87
  """
88
88
  Describe an element collection with type-specific analysis.
89
-
89
+
90
90
  Args:
91
91
  collection: ElementCollection to describe
92
-
92
+
93
93
  Returns:
94
94
  ElementSummary with collection analysis
95
95
  """
96
96
  elements = list(collection)
97
-
97
+
98
98
  if not elements:
99
99
  data = {"message": "Empty collection"}
100
100
  return ElementSummary(data, "Collection Summary")
101
-
101
+
102
102
  data = {}
103
-
103
+
104
104
  # Group elements by type
105
105
  by_type = {}
106
106
  for element in elements:
107
- element_type = getattr(element, 'type', 'unknown')
107
+ element_type = getattr(element, "type", "unknown")
108
108
  by_type.setdefault(element_type, []).append(element)
109
-
109
+
110
110
  # Overall summary for mixed collections (exclude chars from overview)
111
111
  if len(by_type) > 1:
112
- type_counts = {k: len(v) for k, v in by_type.items() if k != 'char'}
112
+ type_counts = {k: len(v) for k, v in by_type.items() if k != "char"}
113
113
  total = sum(type_counts.values())
114
-
114
+
115
115
  summary_parts = []
116
116
  for element_type, count in sorted(type_counts.items(), key=lambda x: x[1], reverse=True):
117
- type_display = element_type.replace('_', ' ').title()
117
+ type_display = element_type.replace("_", " ").title()
118
118
  summary_parts.append(f"**{type_display}**: {count}")
119
-
119
+
120
120
  if summary_parts: # Only add overview if we have non-char elements
121
- data["overview"] = {
122
- "total_elements": total,
123
- "type_breakdown": summary_parts
124
- }
125
-
121
+ data["overview"] = {"total_elements": total, "type_breakdown": summary_parts}
122
+
126
123
  # Type-specific analysis (exclude chars - too granular)
127
124
  for element_type, type_elements in by_type.items():
128
- if element_type == 'char':
125
+ if element_type == "char":
129
126
  # Skip character elements - too granular for useful analysis
130
127
  continue
131
- elif element_type == 'word':
128
+ elif element_type == "word":
132
129
  analysis = describe_text_elements(type_elements)
133
- elif element_type == 'rect':
130
+ elif element_type == "rect":
134
131
  analysis = describe_rect_elements(type_elements)
135
- elif element_type == 'line':
132
+ elif element_type == "line":
136
133
  analysis = describe_line_elements(type_elements)
137
- elif element_type == 'region':
134
+ elif element_type == "region":
138
135
  analysis = describe_region_elements(type_elements)
139
136
  else:
140
137
  analysis = {"count": len(type_elements)}
141
-
142
- if analysis and 'message' not in analysis:
143
- section_name = element_type.replace('_', ' ').title()
138
+
139
+ if analysis and "message" not in analysis:
140
+ section_name = element_type.replace("_", " ").title()
144
141
  if len(by_type) == 1:
145
142
  # Single type collection - flatten the structure
146
143
  data.update(analysis)
147
144
  else:
148
145
  # Mixed collection - keep sections separate
149
146
  data[section_name] = analysis
150
-
147
+
151
148
  # Count non-char elements for title
152
- non_char_count = len([e for e in elements if getattr(e, 'type', 'unknown') != 'char'])
149
+ non_char_count = len([e for e in elements if getattr(e, "type", "unknown") != "char"])
153
150
  title = f"Collection Summary ({non_char_count} elements)"
154
151
  return ElementSummary(data, title)
155
152
 
@@ -157,29 +154,29 @@ def describe_collection(collection: "ElementCollection") -> ElementSummary:
157
154
  def describe_region(region: "Region") -> ElementSummary:
158
155
  """
159
156
  Describe a region with its properties and contents.
160
-
157
+
161
158
  Args:
162
159
  region: Region to describe
163
-
160
+
164
161
  Returns:
165
162
  ElementSummary with region analysis
166
163
  """
167
164
  data = {}
168
-
165
+
169
166
  # Region info
170
167
  region_info = {
171
168
  "page": region.page.number,
172
169
  "dimensions": f"{region.width:.0f}×{region.height:.0f} pts",
173
170
  "area": f"{region.width * region.height:.0f} sq pts",
174
- "position": f"({region.x0:.0f}, {region.top:.0f}) to ({region.x1:.0f}, {region.bottom:.0f})"
171
+ "position": f"({region.x0:.0f}, {region.top:.0f}) to ({region.x1:.0f}, {region.bottom:.0f})",
175
172
  }
176
-
173
+
177
174
  # Add metadata if available
178
- if hasattr(region, 'metadata') and region.metadata:
175
+ if hasattr(region, "metadata") and region.metadata:
179
176
  region_info["metadata"] = region.metadata
180
-
177
+
181
178
  data["region_info"] = region_info
182
-
179
+
183
180
  # Content analysis
184
181
  content_elements = region.find_all("*")
185
182
  if content_elements:
@@ -188,54 +185,54 @@ def describe_region(region: "Region") -> ElementSummary:
188
185
  data["content"] = content_analysis.to_dict()
189
186
  else:
190
187
  data["content"] = {"message": "No elements found in region"}
191
-
188
+
192
189
  return ElementSummary(data, "Region Summary")
193
190
 
194
191
 
195
192
  def inspect_collection(collection: "ElementCollection", limit: int = 30) -> InspectionSummary:
196
193
  """
197
194
  Inspect elements in a collection with detailed tabular view.
198
-
195
+
199
196
  Args:
200
197
  collection: ElementCollection to inspect
201
198
  limit: Maximum elements per type to show (default: 30)
202
-
199
+
203
200
  Returns:
204
201
  InspectionSummary with element tables
205
202
  """
206
203
  elements = list(collection)
207
-
204
+
208
205
  if not elements:
209
206
  data = {"message": "Empty collection"}
210
207
  return InspectionSummary(data, "Collection Inspection")
211
-
208
+
212
209
  data = {}
213
-
210
+
214
211
  # Check if multi-page
215
212
  pages = set()
216
213
  for element in elements:
217
- if hasattr(element, 'page') and hasattr(element.page, 'number'):
214
+ if hasattr(element, "page") and hasattr(element.page, "number"):
218
215
  pages.add(element.page.number)
219
216
  show_page_column = len(pages) > 1
220
-
217
+
221
218
  # Group by type
222
219
  by_type = {}
223
220
  for element in elements:
224
- element_type = getattr(element, 'type', 'unknown')
221
+ element_type = getattr(element, "type", "unknown")
225
222
  by_type.setdefault(element_type, []).append(element)
226
-
223
+
227
224
  # Create tables for each type (exclude chars - too granular)
228
225
  for element_type, type_elements in by_type.items():
229
- if element_type == 'char':
226
+ if element_type == "char":
230
227
  # Skip character elements - too granular for useful inspection
231
228
  continue
232
-
229
+
233
230
  # Limit elements shown
234
231
  display_elements = type_elements[:limit]
235
-
232
+
236
233
  # Get appropriate columns for this type
237
234
  columns = _get_columns_for_type(element_type, show_page_column)
238
-
235
+
239
236
  # Extract data for each element
240
237
  element_data = []
241
238
  for element in display_elements:
@@ -244,110 +241,113 @@ def inspect_collection(collection: "ElementCollection", limit: int = 30) -> Insp
244
241
  value = _extract_element_value(element, col)
245
242
  row[col] = value
246
243
  element_data.append(row)
247
-
244
+
248
245
  # Create section
249
246
  section_name = f"{element_type}_elements"
250
- section_data = {
251
- "elements": element_data,
252
- "columns": columns
253
- }
254
-
247
+ section_data = {"elements": element_data, "columns": columns}
248
+
255
249
  # Add note if truncated
256
250
  if len(type_elements) > limit:
257
- section_data["note"] = f"Showing {limit} of {len(type_elements)} elements (pass limit= to see more)"
258
-
251
+ section_data["note"] = (
252
+ f"Showing {limit} of {len(type_elements)} elements (pass limit= to see more)"
253
+ )
254
+
259
255
  data[section_name] = section_data
260
-
256
+
261
257
  # Count non-char elements for title
262
- non_char_count = len([e for e in elements if getattr(e, 'type', 'unknown') != 'char'])
258
+ non_char_count = len([e for e in elements if getattr(e, "type", "unknown") != "char"])
263
259
  title = f"Collection Inspection ({non_char_count} elements)"
264
260
  return InspectionSummary(data, title)
265
261
 
266
262
 
267
263
  def _get_columns_for_type(element_type: str, show_page_column: bool) -> List[str]:
268
264
  """Get appropriate columns for element type."""
269
- base_columns = ['x0', 'top', 'x1', 'bottom']
270
-
271
- if element_type == 'word':
272
- columns = ['text'] + base_columns + [
273
- 'font_family',
274
- 'font_variant',
275
- 'size',
276
- 'bold',
277
- 'italic',
278
- 'strike',
279
- 'underline',
280
- 'highlight',
281
- 'source',
282
- 'confidence',
283
- ]
265
+ base_columns = ["x0", "top", "x1", "bottom"]
266
+
267
+ if element_type == "word":
268
+ columns = (
269
+ ["text"]
270
+ + base_columns
271
+ + [
272
+ "font_family",
273
+ "font_variant",
274
+ "size",
275
+ "bold",
276
+ "italic",
277
+ "strike",
278
+ "underline",
279
+ "highlight",
280
+ "source",
281
+ "confidence",
282
+ ]
283
+ )
284
284
  # Add foreground text colour too
285
- columns.append('color')
286
- elif element_type == 'rect':
287
- columns = base_columns + ['width', 'height', 'stroke', 'fill', 'stroke_width']
288
- elif element_type == 'line':
289
- columns = base_columns + ['width', 'is_horizontal', 'is_vertical'] # LineElement properties
290
- elif element_type == 'region':
291
- columns = base_columns + ['width', 'height', 'type', 'color']
292
- elif element_type == 'blob':
293
- columns = base_columns + ['width', 'height', 'color']
285
+ columns.append("color")
286
+ elif element_type == "rect":
287
+ columns = base_columns + ["width", "height", "stroke", "fill", "stroke_width"]
288
+ elif element_type == "line":
289
+ columns = base_columns + ["width", "is_horizontal", "is_vertical"] # LineElement properties
290
+ elif element_type == "region":
291
+ columns = base_columns + ["width", "height", "type", "color"]
292
+ elif element_type == "blob":
293
+ columns = base_columns + ["width", "height", "color"]
294
294
  else:
295
- columns = base_columns + ['type']
296
-
295
+ columns = base_columns + ["type"]
296
+
297
297
  if show_page_column:
298
- columns.append('page')
299
-
298
+ columns.append("page")
299
+
300
300
  return columns
301
301
 
302
302
 
303
303
  def _extract_element_value(element: "Element", column: str) -> Any:
304
304
  """Extract value for a column from an element."""
305
305
  try:
306
- if column == 'text':
307
- text = getattr(element, 'text', '')
306
+ if column == "text":
307
+ text = getattr(element, "text", "")
308
308
  if text and len(text) > 60:
309
309
  return text[:60] + "..."
310
310
  return text or ""
311
-
312
- elif column == 'page':
313
- if hasattr(element, 'page') and hasattr(element.page, 'number'):
311
+
312
+ elif column == "page":
313
+ if hasattr(element, "page") and hasattr(element.page, "number"):
314
314
  return element.page.number
315
315
  return ""
316
-
317
- elif column == 'confidence':
318
- confidence = getattr(element, 'confidence', None)
316
+
317
+ elif column == "confidence":
318
+ confidence = getattr(element, "confidence", None)
319
319
  if confidence is not None and isinstance(confidence, (int, float)):
320
320
  return f"{confidence:.2f}"
321
321
  return ""
322
-
323
- elif column == 'font_family':
322
+
323
+ elif column == "font_family":
324
324
  # Use the cleaner font_family property from TextElement
325
- font_family = getattr(element, 'font_family', None)
325
+ font_family = getattr(element, "font_family", None)
326
326
  if font_family:
327
327
  return font_family
328
328
  # Fallback to fontname
329
- return getattr(element, 'fontname', '')
330
-
331
- elif column == 'font_variant':
332
- variant = getattr(element, 'font_variant', None)
329
+ return getattr(element, "fontname", "")
330
+
331
+ elif column == "font_variant":
332
+ variant = getattr(element, "font_variant", None)
333
333
  if variant:
334
334
  return variant
335
335
  # Fallback – try to derive from fontname if property missing
336
- fontname = getattr(element, 'fontname', '')
336
+ fontname = getattr(element, "fontname", "")
337
337
  if "+" in fontname:
338
338
  return fontname.split("+", 1)[0]
339
- return ''
340
-
341
- elif column in ['bold', 'italic', 'strike', 'underline']:
339
+ return ""
340
+
341
+ elif column in ["bold", "italic", "strike", "underline"]:
342
342
  value = getattr(element, column, False)
343
343
  return value if isinstance(value, bool) else False
344
-
345
- elif column == 'highlight':
344
+
345
+ elif column == "highlight":
346
346
  # If element is highlighted, return its colour; otherwise blank
347
- if getattr(element, 'highlight', False):
348
- col_val = getattr(element, 'highlight_color', None)
347
+ if getattr(element, "highlight", False):
348
+ col_val = getattr(element, "highlight_color", None)
349
349
  if col_val is None:
350
- return 'True' # fallback if colour missing
350
+ return "True" # fallback if colour missing
351
351
  # Convert tuple to hex
352
352
  if isinstance(col_val, (tuple, list)) and len(col_val) >= 3:
353
353
  try:
@@ -356,9 +356,9 @@ def _extract_element_value(element: "Element", column: str) -> Any:
356
356
  except Exception:
357
357
  return str(col_val)
358
358
  return str(col_val)
359
- return ''
360
-
361
- elif column in ['stroke', 'fill', 'color']:
359
+ return ""
360
+
361
+ elif column in ["stroke", "fill", "color"]:
362
362
  value = getattr(element, column, None)
363
363
  # If already a string (e.g. '#ff00aa' or 'red') return as is
364
364
  if isinstance(value, str):
@@ -371,24 +371,24 @@ def _extract_element_value(element: "Element", column: str) -> Any:
371
371
  except Exception:
372
372
  return str(value)
373
373
  return ""
374
-
375
- elif column in ['x0', 'top', 'x1', 'bottom', 'width', 'height', 'size', 'stroke_width']:
374
+
375
+ elif column in ["x0", "top", "x1", "bottom", "width", "height", "size", "stroke_width"]:
376
376
  value = getattr(element, column, 0)
377
377
  if isinstance(value, (int, float)) and not isinstance(value, bool):
378
378
  return int(round(value))
379
379
  return 0
380
-
381
- elif column in ['is_horizontal', 'is_vertical']:
380
+
381
+ elif column in ["is_horizontal", "is_vertical"]:
382
382
  value = getattr(element, column, False)
383
383
  return value if isinstance(value, bool) else False
384
-
384
+
385
385
  else:
386
386
  # Generic attribute access
387
- value = getattr(element, column, '')
387
+ value = getattr(element, column, "")
388
388
  if value is None:
389
389
  return ""
390
390
  return str(value)
391
-
391
+
392
392
  except Exception as e:
393
393
  # Fallback for any unexpected errors
394
394
  logger.warning(f"Error extracting {column} from element: {e}")
@@ -398,64 +398,71 @@ def _extract_element_value(element: "Element", column: str) -> Any:
398
398
  def describe_element(element: "Element") -> "ElementSummary":
399
399
  """
400
400
  Describe an individual element with its properties and attributes.
401
-
401
+
402
402
  Args:
403
403
  element: The element to describe
404
-
404
+
405
405
  Returns:
406
406
  ElementSummary with formatted element properties
407
407
  """
408
408
  from natural_pdf.describe.summary import ElementSummary
409
-
409
+
410
410
  # Get basic element info
411
- element_type = getattr(element, 'type', element.__class__.__name__)
412
-
411
+ element_type = getattr(element, "type", element.__class__.__name__)
412
+
413
413
  # Build the description data - use dict structure for proper list formatting
414
414
  data = {
415
415
  "info": {
416
416
  "object_type": "element",
417
417
  "element_type": element_type,
418
- "class_name": element.__class__.__name__
418
+ "class_name": element.__class__.__name__,
419
419
  }
420
420
  }
421
-
421
+
422
422
  # Add geometric properties - use dict structure for proper list formatting
423
- if hasattr(element, 'bbox'):
423
+ if hasattr(element, "bbox"):
424
424
  data["geometry"] = {
425
425
  "position": f"({round(element.x0, 1)}, {round(element.top, 1)}, {round(element.x1, 1)}, {round(element.bottom, 1)})",
426
- "size": f"({round(element.width, 1)}, {round(element.height, 1)})"
426
+ "size": f"({round(element.width, 1)}, {round(element.height, 1)})",
427
427
  }
428
-
428
+
429
429
  # Add text content if available - use dict structure for proper list formatting
430
- if hasattr(element, 'text') and element.text:
430
+ if hasattr(element, "text") and element.text:
431
431
  text = str(element.text).strip()
432
432
  display_text = text[:50] + "..." if len(text) > 50 else text
433
- data["content"] = {
434
- "text": f"'{display_text}'",
435
- "length": f"{len(text)} chars"
436
- }
437
-
433
+ data["content"] = {"text": f"'{display_text}'", "length": f"{len(text)} chars"}
434
+
438
435
  # Add common text properties - use dict structure for proper list formatting
439
436
  text_props = {}
440
- for prop in ['font_family', 'size', 'bold', 'italic', 'strike', 'underline', 'highlight', 'source', 'confidence']:
437
+ for prop in [
438
+ "font_family",
439
+ "size",
440
+ "bold",
441
+ "italic",
442
+ "strike",
443
+ "underline",
444
+ "highlight",
445
+ "source",
446
+ "confidence",
447
+ ]:
441
448
  if hasattr(element, prop):
442
449
  value = getattr(element, prop)
443
450
  if value is not None:
444
- if prop == 'confidence' and isinstance(value, (int, float)):
451
+ if prop == "confidence" and isinstance(value, (int, float)):
445
452
  text_props[prop] = round(value, 3)
446
- elif prop == 'size' and isinstance(value, (int, float)):
453
+ elif prop == "size" and isinstance(value, (int, float)):
447
454
  text_props[prop] = round(value, 1)
448
- elif prop in ['bold', 'italic', 'strike', 'underline']:
455
+ elif prop in ["bold", "italic", "strike", "underline"]:
449
456
  text_props[prop] = value
450
457
  else:
451
458
  text_props[prop] = value
452
-
459
+
453
460
  if text_props:
454
461
  data["properties"] = text_props
455
-
462
+
456
463
  # Add color information - use dict structure for proper list formatting
457
464
  color_info = {}
458
- for prop in ['color', 'fill', 'stroke']:
465
+ for prop in ["color", "fill", "stroke"]:
459
466
  if hasattr(element, prop):
460
467
  value = getattr(element, prop)
461
468
  if value is not None:
@@ -471,28 +478,28 @@ def describe_element(element: "Element") -> "ElementSummary":
471
478
  color_info[prop] = str(value)
472
479
  else:
473
480
  color_info[prop] = str(value)
474
-
481
+
475
482
  if color_info:
476
483
  data["colors"] = color_info
477
-
484
+
478
485
  # Add page information - use dict structure for proper list formatting
479
- if hasattr(element, 'page') and element.page:
480
- page_num = getattr(element.page, 'number', None)
486
+ if hasattr(element, "page") and element.page:
487
+ page_num = getattr(element.page, "number", None)
481
488
  if page_num is not None:
482
489
  data["page"] = {"number": page_num}
483
-
490
+
484
491
  # Add polygon information if available - use dict structure for proper list formatting
485
- if hasattr(element, 'has_polygon') and element.has_polygon:
486
- if hasattr(element, 'polygon'):
492
+ if hasattr(element, "has_polygon") and element.has_polygon:
493
+ if hasattr(element, "polygon"):
487
494
  polygon = element.polygon
488
495
  if polygon and len(polygon) > 0:
489
496
  data["shape"] = {"polygon_points": len(polygon)}
490
-
497
+
491
498
  # Create title
492
499
  title = f"{element_type.title()} Element"
493
- if hasattr(element, 'text') and element.text:
500
+ if hasattr(element, "text") and element.text:
494
501
  preview = str(element.text).strip()[:30]
495
502
  if preview:
496
503
  title += f": '{preview}'"
497
-
498
- return ElementSummary(data, title)
504
+
505
+ return ElementSummary(data, title)