natural-pdf 0.1.15__py3-none-any.whl → 0.1.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. natural_pdf/__init__.py +31 -0
  2. natural_pdf/analyzers/layout/gemini.py +137 -162
  3. natural_pdf/analyzers/layout/layout_manager.py +9 -5
  4. natural_pdf/analyzers/layout/layout_options.py +77 -7
  5. natural_pdf/analyzers/layout/paddle.py +318 -165
  6. natural_pdf/analyzers/layout/table_structure_utils.py +78 -0
  7. natural_pdf/analyzers/shape_detection_mixin.py +770 -405
  8. natural_pdf/classification/mixin.py +2 -8
  9. natural_pdf/collections/pdf_collection.py +25 -30
  10. natural_pdf/core/highlighting_service.py +47 -32
  11. natural_pdf/core/page.py +119 -76
  12. natural_pdf/core/pdf.py +19 -22
  13. natural_pdf/describe/__init__.py +21 -0
  14. natural_pdf/describe/base.py +457 -0
  15. natural_pdf/describe/elements.py +411 -0
  16. natural_pdf/describe/mixin.py +84 -0
  17. natural_pdf/describe/summary.py +186 -0
  18. natural_pdf/elements/base.py +11 -10
  19. natural_pdf/elements/collections.py +116 -51
  20. natural_pdf/elements/region.py +204 -127
  21. natural_pdf/exporters/paddleocr.py +38 -13
  22. natural_pdf/flows/__init__.py +3 -3
  23. natural_pdf/flows/collections.py +303 -132
  24. natural_pdf/flows/element.py +277 -132
  25. natural_pdf/flows/flow.py +33 -16
  26. natural_pdf/flows/region.py +142 -79
  27. natural_pdf/ocr/engine_doctr.py +37 -4
  28. natural_pdf/ocr/engine_easyocr.py +23 -3
  29. natural_pdf/ocr/engine_paddle.py +281 -30
  30. natural_pdf/ocr/engine_surya.py +8 -3
  31. natural_pdf/ocr/ocr_manager.py +75 -76
  32. natural_pdf/ocr/ocr_options.py +52 -87
  33. natural_pdf/search/__init__.py +25 -12
  34. natural_pdf/search/lancedb_search_service.py +91 -54
  35. natural_pdf/search/numpy_search_service.py +86 -65
  36. natural_pdf/search/searchable_mixin.py +2 -2
  37. natural_pdf/selectors/parser.py +125 -81
  38. natural_pdf/widgets/__init__.py +1 -1
  39. natural_pdf/widgets/viewer.py +205 -449
  40. {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/METADATA +27 -45
  41. {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/RECORD +44 -38
  42. {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/WHEEL +0 -0
  43. {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/licenses/LICENSE +0 -0
  44. {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,457 @@
1
+ """
2
+ Main describe functions for pages, collections, and regions.
3
+ """
4
+
5
+ import logging
6
+ from collections import Counter
7
+ from typing import TYPE_CHECKING, Any, Dict, List, Optional
8
+
9
+ from .elements import (
10
+ describe_line_elements,
11
+ describe_rect_elements,
12
+ describe_region_elements,
13
+ describe_text_elements,
14
+ )
15
+ from .summary import ElementSummary, InspectionSummary
16
+
17
+ if TYPE_CHECKING:
18
+ from natural_pdf.core.page import Page
19
+ from natural_pdf.elements.base import Element
20
+ from natural_pdf.elements.collections import ElementCollection
21
+ from natural_pdf.elements.region import Region
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
+
26
def describe_page(page: "Page") -> ElementSummary:
    """
    Describe what's on a page with a high-level summary.

    Character elements are left out of the counts because they are too
    granular to be useful. Elements of type ``'word'`` are reported under the
    ``"text"`` key and, when they come from more than one source, are broken
    down per source.

    Args:
        page: Page to describe

    Returns:
        ElementSummary with page overview. When the page has no elements the
        summary only carries a ``"message"`` entry.
    """
    all_elements = page.get_elements()

    if not all_elements:
        data = {"message": "No elements found on page"}
        return ElementSummary(data, f"Page {page.number} Summary")

    # Element counts by type (exclude chars - too granular)
    type_counts = Counter(
        element_type
        for element_type in (getattr(e, 'type', 'unknown') for e in all_elements)
        if element_type != 'char'
    )

    # Collected once: used both for the source breakdown and the text analysis
    word_elements = [e for e in all_elements if getattr(e, 'type', '') == 'word']

    # Dictionary (rather than a list of strings) for proper list formatting
    element_summary = {}
    for element_type, count in type_counts.most_common():
        if element_type != 'word':
            element_summary[element_type] = f"{count} elements"
            continue

        # Only show a per-source breakdown when there is something to compare
        sources = Counter(getattr(e, 'source', 'unknown') for e in word_elements)
        if len(sources) > 1:
            breakdown = ', '.join(
                f"{source_count} {source}"
                for source, source_count in sources.most_common()
            )
            element_summary["text"] = f"{count} elements ({breakdown})"
        else:
            element_summary["text"] = f"{count} elements"

    data = {"elements": element_summary}

    # Text analysis if we have text elements; a result carrying a 'message'
    # key is not included in the summary.
    if word_elements:
        text_analysis = describe_text_elements(word_elements)
        if text_analysis and 'message' not in text_analysis:
            data["text_analysis"] = text_analysis

    return ElementSummary(data, f"Page {page.number} Summary")
84
+
85
+
86
def describe_collection(collection: "ElementCollection") -> ElementSummary:
    """
    Summarise an element collection, analysing each element type separately.

    Character elements are excluded from the overview, the per-type analysis
    and the element count shown in the title.

    Args:
        collection: ElementCollection to describe

    Returns:
        ElementSummary with collection analysis. A collection holding a single
        element type gets its analysis flattened into the top level; a mixed
        collection gets an ``"overview"`` plus one section per type.
    """
    elements = list(collection)
    if not elements:
        return ElementSummary({"message": "Empty collection"}, "Collection Summary")

    # Bucket the elements by their ``type`` attribute, keeping first-seen order
    by_type = {}
    for item in elements:
        by_type.setdefault(getattr(item, 'type', 'unknown'), []).append(item)

    is_mixed = len(by_type) > 1
    data = {}

    if is_mixed:
        # Overview of the non-char types, most frequent first
        counts_by_type = {
            kind: len(members) for kind, members in by_type.items() if kind != 'char'
        }
        ranked = sorted(counts_by_type.items(), key=lambda pair: pair[1], reverse=True)
        breakdown = [
            f"**{kind.replace('_', ' ').title()}**: {how_many}"
            for kind, how_many in ranked
        ]
        if breakdown:  # nothing to report when only chars were present
            data["overview"] = {
                "total_elements": sum(counts_by_type.values()),
                "type_breakdown": breakdown,
            }

    # Analyzer to use for each known element type
    analyzers = {
        'word': describe_text_elements,
        'rect': describe_rect_elements,
        'line': describe_line_elements,
        'region': describe_region_elements,
    }

    for kind, members in by_type.items():
        if kind == 'char':
            continue  # too granular for useful analysis

        analyzer = analyzers.get(kind)
        if analyzer is None:
            analysis = {"count": len(members)}
        else:
            analysis = analyzer(members)

        if not analysis or 'message' in analysis:
            continue

        if is_mixed:
            # Mixed collection - one titled section per type
            data[kind.replace('_', ' ').title()] = analysis
        else:
            # Single type collection - flatten the structure
            data.update(analysis)

    visible_total = sum(1 for item in elements if getattr(item, 'type', 'unknown') != 'char')
    return ElementSummary(data, f"Collection Summary ({visible_total} elements)")
155
+
156
+
157
def describe_region(region: "Region") -> ElementSummary:
    """
    Summarise a region: its geometry, optional metadata and its contents.

    Args:
        region: Region to describe

    Returns:
        ElementSummary with a ``"region_info"`` section and a ``"content"``
        section (the collection summary of everything found in the region, or
        a message when nothing was found).
    """
    width = region.width
    height = region.height

    info = {
        "page": region.page.number,
        "dimensions": f"{width:.0f}×{height:.0f} pts",
        "area": f"{width * height:.0f} sq pts",
        "position": f"({region.x0:.0f}, {region.top:.0f}) to ({region.x1:.0f}, {region.bottom:.0f})",
    }

    # Metadata is optional and only reported when non-empty
    metadata = getattr(region, 'metadata', None)
    if metadata:
        info["metadata"] = metadata

    # Everything inside the region, summarised like any other collection
    found = region.find_all("*")
    if found:
        content = describe_collection(found).to_dict()
    else:
        content = {"message": "No elements found in region"}

    return ElementSummary({"region_info": info, "content": content}, "Region Summary")
193
+
194
+
195
def inspect_collection(collection: "ElementCollection", limit: int = 30) -> InspectionSummary:
    """
    Build a detailed, table-like inspection of the elements in a collection.

    One section named ``"<type>_elements"`` is produced per element type.
    Character elements are skipped, and a ``page`` column is added only when
    the elements span more than one page.

    Args:
        collection: ElementCollection to inspect
        limit: Maximum elements per type to show (default: 30)

    Returns:
        InspectionSummary with element tables
    """
    elements = list(collection)
    if not elements:
        return InspectionSummary({"message": "Empty collection"}, "Collection Inspection")

    # The page column is only worth showing for multi-page collections
    page_numbers = {
        item.page.number
        for item in elements
        if hasattr(item, 'page') and hasattr(item.page, 'number')
    }
    multi_page = len(page_numbers) > 1

    # Bucket by element type, keeping first-seen order
    grouped = {}
    for item in elements:
        grouped.setdefault(getattr(item, 'type', 'unknown'), []).append(item)

    data = {}
    for kind, members in grouped.items():
        if kind == 'char':
            continue  # too granular for useful inspection

        columns = _get_columns_for_type(kind, multi_page)

        # One row per displayed element, one cell per column
        rows = [
            {col: _extract_element_value(item, col) for col in columns}
            for item in members[:limit]
        ]

        section = {"elements": rows, "columns": columns}
        if len(members) > limit:
            # Tell the reader the table was truncated
            section["note"] = f"Showing {limit} of {len(members)} elements (pass limit= to see more)"

        data[f"{kind}_elements"] = section

    shown_total = sum(1 for item in elements if getattr(item, 'type', 'unknown') != 'char')
    return InspectionSummary(data, f"Collection Inspection ({shown_total} elements)")
265
+
266
+
267
+ def _get_columns_for_type(element_type: str, show_page_column: bool) -> List[str]:
268
+ """Get appropriate columns for element type."""
269
+ base_columns = ['x0', 'top', 'x1', 'bottom']
270
+
271
+ if element_type == 'word':
272
+ columns = ['text'] + base_columns + ['font_family', 'size', 'bold', 'italic', 'source', 'confidence']
273
+ # Add color for text elements
274
+ columns.append('color')
275
+ elif element_type == 'rect':
276
+ columns = base_columns + ['width', 'height', 'stroke', 'fill', 'stroke_width']
277
+ elif element_type == 'line':
278
+ columns = base_columns + ['width', 'is_horizontal', 'is_vertical'] # LineElement properties
279
+ elif element_type == 'region':
280
+ columns = base_columns + ['width', 'height', 'type']
281
+ else:
282
+ columns = base_columns + ['type']
283
+
284
+ if show_page_column:
285
+ columns.append('page')
286
+
287
+ return columns
288
+
289
+
290
+ def _extract_element_value(element: "Element", column: str) -> Any:
291
+ """Extract value for a column from an element."""
292
+ try:
293
+ if column == 'text':
294
+ text = getattr(element, 'text', '')
295
+ if text and len(text) > 50:
296
+ return text[:50] + "..."
297
+ return text or ""
298
+
299
+ elif column == 'page':
300
+ if hasattr(element, 'page') and hasattr(element.page, 'number'):
301
+ return element.page.number
302
+ return ""
303
+
304
+ elif column == 'confidence':
305
+ confidence = getattr(element, 'confidence', None)
306
+ if confidence is not None and isinstance(confidence, (int, float)):
307
+ return f"{confidence:.2f}"
308
+ return ""
309
+
310
+ elif column == 'font_family':
311
+ # Use the cleaner font_family property from TextElement
312
+ font_family = getattr(element, 'font_family', None)
313
+ if font_family:
314
+ return font_family
315
+ # Fallback to fontname
316
+ return getattr(element, 'fontname', '')
317
+
318
+ elif column in ['bold', 'italic']:
319
+ value = getattr(element, column, False)
320
+ return value if isinstance(value, bool) else False
321
+
322
+ elif column in ['stroke', 'fill', 'color']:
323
+ # For rectangles and text, these return color tuples
324
+ value = getattr(element, column, None)
325
+ if value and isinstance(value, (tuple, list)) and len(value) >= 3:
326
+ # Convert to hex color for display
327
+ try:
328
+ r, g, b = [int(v * 255) if v <= 1 else int(v) for v in value[:3]]
329
+ return f"#{r:02x}{g:02x}{b:02x}"
330
+ except:
331
+ return str(value)
332
+ return ""
333
+
334
+ elif column in ['x0', 'top', 'x1', 'bottom', 'width', 'height', 'size', 'stroke_width']:
335
+ value = getattr(element, column, 0)
336
+ if isinstance(value, (int, float)) and not isinstance(value, bool):
337
+ return int(round(value))
338
+ return 0
339
+
340
+ elif column in ['is_horizontal', 'is_vertical']:
341
+ value = getattr(element, column, False)
342
+ return value if isinstance(value, bool) else False
343
+
344
+ else:
345
+ # Generic attribute access
346
+ value = getattr(element, column, '')
347
+ if value is None:
348
+ return ""
349
+ return str(value)
350
+
351
+ except Exception as e:
352
+ # Fallback for any unexpected errors
353
+ logger.warning(f"Error extracting {column} from element: {e}")
354
+ return ""
355
+
356
+
357
def describe_element(element: "Element") -> "ElementSummary":
    """
    Describe an individual element with its properties and attributes.

    Sections are only added when the element exposes the corresponding
    attributes: ``geometry`` (needs ``bbox``), ``content`` (non-empty text),
    ``properties`` (font/style/OCR attributes), ``colors``, ``page`` and
    ``shape`` (polygon point count).

    Args:
        element: The element to describe

    Returns:
        ElementSummary with formatted element properties
    """
    from natural_pdf.describe.summary import ElementSummary

    # Fall back to the class name when the element has no ``type`` attribute
    element_type = getattr(element, 'type', element.__class__.__name__)

    # Nested dicts (rather than flat strings) for proper list formatting
    data = {
        "info": {
            "object_type": "element",
            "element_type": element_type,
            "class_name": element.__class__.__name__,
        }
    }

    # Geometric properties
    if hasattr(element, 'bbox'):
        data["geometry"] = {
            "position": f"({round(element.x0, 1)}, {round(element.top, 1)}, {round(element.x1, 1)}, {round(element.bottom, 1)})",
            "size": f"({round(element.width, 1)}, {round(element.height, 1)})",
        }

    # Text content, previewed at 50 characters
    if hasattr(element, 'text') and element.text:
        text = str(element.text).strip()
        display_text = text[:50] + "..." if len(text) > 50 else text
        data["content"] = {
            "text": f"'{display_text}'",
            "length": f"{len(text)} chars",
        }

    # Common text properties; numeric ones are rounded for display
    precision = {'confidence': 3, 'size': 1}
    text_props = {}
    for prop in ['font_family', 'size', 'bold', 'italic', 'source', 'confidence']:
        value = getattr(element, prop, None)
        if value is None:
            continue
        if prop in precision and isinstance(value, (int, float)):
            text_props[prop] = round(value, precision[prop])
        else:
            text_props[prop] = value

    if text_props:
        data["properties"] = text_props

    # Color information
    color_info = {}
    for prop in ['color', 'fill', 'stroke']:
        value = getattr(element, prop, None)
        if value is None:
            continue

        is_rgb_like = (
            isinstance(value, (tuple, list))
            and len(value) >= 3
            and all(isinstance(v, (int, float)) for v in value[:3])
        )
        if not is_rgb_like:
            color_info[prop] = str(value)
            continue

        try:
            # Components <= 1 are treated as 0..1 fractions, larger ones as
            # 0..255 values.
            channels = [int(v * 255) if v <= 1 else int(v) for v in value[:3]]
        except (TypeError, ValueError, OverflowError):
            # NaN or infinite components cannot be converted: show them raw
            color_info[prop] = str(value)
            continue

        # Clamp so out-of-range components still give a valid #rrggbb
        r, g, b = (max(0, min(255, c)) for c in channels)
        color_info[prop] = f"#{r:02x}{g:02x}{b:02x}"

    if color_info:
        data["colors"] = color_info

    # Page information
    if hasattr(element, 'page') and element.page:
        page_num = getattr(element.page, 'number', None)
        if page_num is not None:
            data["page"] = {"number": page_num}

    # Polygon information, if the element says it has one
    if hasattr(element, 'has_polygon') and element.has_polygon:
        if hasattr(element, 'polygon'):
            polygon = element.polygon
            if polygon and len(polygon) > 0:
                data["shape"] = {"polygon_points": len(polygon)}

    # Title: element type plus a short text preview when available
    title = f"{element_type.title()} Element"
    if hasattr(element, 'text') and element.text:
        preview = str(element.text).strip()[:30]
        if preview:
            title += f": '{preview}'"

    return ElementSummary(data, title)