natural-pdf 0.1.15__py3-none-any.whl → 0.1.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +31 -0
- natural_pdf/analyzers/layout/gemini.py +137 -162
- natural_pdf/analyzers/layout/layout_manager.py +9 -5
- natural_pdf/analyzers/layout/layout_options.py +77 -7
- natural_pdf/analyzers/layout/paddle.py +318 -165
- natural_pdf/analyzers/layout/table_structure_utils.py +78 -0
- natural_pdf/analyzers/shape_detection_mixin.py +770 -405
- natural_pdf/classification/mixin.py +2 -8
- natural_pdf/collections/pdf_collection.py +25 -30
- natural_pdf/core/highlighting_service.py +47 -32
- natural_pdf/core/page.py +119 -76
- natural_pdf/core/pdf.py +19 -22
- natural_pdf/describe/__init__.py +21 -0
- natural_pdf/describe/base.py +457 -0
- natural_pdf/describe/elements.py +411 -0
- natural_pdf/describe/mixin.py +84 -0
- natural_pdf/describe/summary.py +186 -0
- natural_pdf/elements/base.py +11 -10
- natural_pdf/elements/collections.py +116 -51
- natural_pdf/elements/region.py +204 -127
- natural_pdf/exporters/paddleocr.py +38 -13
- natural_pdf/flows/__init__.py +3 -3
- natural_pdf/flows/collections.py +303 -132
- natural_pdf/flows/element.py +277 -132
- natural_pdf/flows/flow.py +33 -16
- natural_pdf/flows/region.py +142 -79
- natural_pdf/ocr/engine_doctr.py +37 -4
- natural_pdf/ocr/engine_easyocr.py +23 -3
- natural_pdf/ocr/engine_paddle.py +281 -30
- natural_pdf/ocr/engine_surya.py +8 -3
- natural_pdf/ocr/ocr_manager.py +75 -76
- natural_pdf/ocr/ocr_options.py +52 -87
- natural_pdf/search/__init__.py +25 -12
- natural_pdf/search/lancedb_search_service.py +91 -54
- natural_pdf/search/numpy_search_service.py +86 -65
- natural_pdf/search/searchable_mixin.py +2 -2
- natural_pdf/selectors/parser.py +125 -81
- natural_pdf/widgets/__init__.py +1 -1
- natural_pdf/widgets/viewer.py +205 -449
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/METADATA +27 -45
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/RECORD +44 -38
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,457 @@
|
|
1
|
+
"""
|
2
|
+
Main describe functions for pages, collections, and regions.
|
3
|
+
"""
|
4
|
+
|
5
|
+
import logging
|
6
|
+
from collections import Counter
|
7
|
+
from typing import TYPE_CHECKING, Any, Dict, List, Optional
|
8
|
+
|
9
|
+
from .elements import (
|
10
|
+
describe_line_elements,
|
11
|
+
describe_rect_elements,
|
12
|
+
describe_region_elements,
|
13
|
+
describe_text_elements,
|
14
|
+
)
|
15
|
+
from .summary import ElementSummary, InspectionSummary
|
16
|
+
|
17
|
+
if TYPE_CHECKING:
|
18
|
+
from natural_pdf.core.page import Page
|
19
|
+
from natural_pdf.elements.base import Element
|
20
|
+
from natural_pdf.elements.collections import ElementCollection
|
21
|
+
from natural_pdf.elements.region import Region
|
22
|
+
|
23
|
+
logger = logging.getLogger(__name__)
|
24
|
+
|
25
|
+
|
26
|
+
def describe_page(page: "Page") -> ElementSummary:
    """
    Describe what's on a page with a high-level summary.

    The summary contains a per-type element count (character elements are
    left out as too granular). Word elements are reported under the key
    "text", with a per-source breakdown when more than one source is present,
    and are additionally passed to ``describe_text_elements``; its result is
    attached as "text_analysis" unless it is empty or carries a "message" key.

    Args:
        page: Page to describe

    Returns:
        ElementSummary with page overview. When the page yields no elements
        the summary only holds a "message" entry.
    """
    data = {}

    # Get all elements
    all_elements = page.get_elements()
    title = f"Page {page.number} Summary"

    if not all_elements:
        data["message"] = "No elements found on page"
        return ElementSummary(data, title)

    # Word elements feed both the source breakdown and the text analysis,
    # so gather them a single time instead of re-filtering the page.
    text_elements = [e for e in all_elements if getattr(e, 'type', '') == 'word']

    # Element counts by type (exclude chars - too granular)
    type_counts = Counter(
        element_type
        for element_type in (getattr(e, 'type', 'unknown') for e in all_elements)
        if element_type != 'char'
    )

    # Format element counts as dictionary for proper list formatting
    element_summary = {}
    for element_type, count in type_counts.most_common():
        if element_type != 'word':
            element_summary[element_type] = f"{count} elements"
            continue

        # Add source breakdown for text
        sources = Counter(getattr(elem, 'source', 'unknown') for elem in text_elements)
        if len(sources) > 1:
            source_parts = [
                f"{source_count} {source}"
                for source, source_count in sources.most_common()
            ]
            element_summary["text"] = f"{count} elements ({', '.join(source_parts)})"
        else:
            element_summary["text"] = f"{count} elements"

    data["elements"] = element_summary

    # Text analysis if we have text elements (exclude chars - too granular)
    if text_elements:
        text_analysis = describe_text_elements(text_elements)
        if text_analysis and 'message' not in text_analysis:
            data["text_analysis"] = text_analysis

    return ElementSummary(data, title)
|
84
|
+
|
85
|
+
|
86
|
+
def describe_collection(collection: "ElementCollection") -> ElementSummary:
    """
    Summarize an element collection, analysing each element type separately.

    Elements are grouped by their ``type`` attribute ('unknown' when absent).
    Character elements are ignored everywhere. Collections holding more than
    one type get an "overview" section plus one section per type; a
    single-type collection has its analysis merged directly into the top
    level of the summary.

    Args:
        collection: ElementCollection to describe

    Returns:
        ElementSummary with collection analysis
    """
    members = list(collection)
    if not members:
        return ElementSummary({"message": "Empty collection"}, "Collection Summary")

    summary = {}

    # Bucket the elements by their type
    grouped = {}
    for member in members:
        grouped.setdefault(getattr(member, 'type', 'unknown'), []).append(member)

    is_mixed = len(grouped) > 1

    # Mixed collections get an overview, largest group first (chars excluded)
    if is_mixed:
        counts = {kind: len(items) for kind, items in grouped.items() if kind != 'char'}
        ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
        breakdown = [
            f"**{kind.replace('_', ' ').title()}**: {n}" for kind, n in ranked
        ]
        if breakdown:  # nothing to show when only chars remain
            summary["overview"] = {
                "total_elements": sum(counts.values()),
                "type_breakdown": breakdown,
            }

    # Type-specific analysers; any other type just reports its count
    analyzers = {
        'word': describe_text_elements,
        'rect': describe_rect_elements,
        'line': describe_line_elements,
        'region': describe_region_elements,
    }

    for kind, items in grouped.items():
        if kind == 'char':
            # Character elements are too granular for useful analysis
            continue

        analyzer = analyzers.get(kind)
        report = analyzer(items) if analyzer is not None else {"count": len(items)}
        if not report or 'message' in report:
            continue

        heading = kind.replace('_', ' ').title()
        if is_mixed:
            # Keep each type in its own section
            summary[heading] = report
        else:
            # Only one type present - flatten into the top level
            summary.update(report)

    # The title counts everything except character elements
    visible = sum(1 for m in members if getattr(m, 'type', 'unknown') != 'char')
    return ElementSummary(summary, f"Collection Summary ({visible} elements)")
|
155
|
+
|
156
|
+
|
157
|
+
def describe_region(region: "Region") -> ElementSummary:
    """
    Summarize a region: its page, geometry, optional metadata and contents.

    The contents are whatever ``region.find_all("*")`` returns, summarized
    through ``describe_collection`` and stored as a plain dict.

    Args:
        region: Region to describe

    Returns:
        ElementSummary with "region_info" and "content" sections
    """
    # Geometry and location of the region itself
    info = {"page": region.page.number}
    info["dimensions"] = f"{region.width:.0f}×{region.height:.0f} pts"
    info["area"] = f"{region.width * region.height:.0f} sq pts"
    info["position"] = (
        f"({region.x0:.0f}, {region.top:.0f}) to ({region.x1:.0f}, {region.bottom:.0f})"
    )

    # Metadata is optional; only include it when present and non-empty
    metadata = getattr(region, 'metadata', None)
    if metadata:
        info["metadata"] = metadata

    # Summarize whatever elements the region contains
    found = region.find_all("*")
    if found:
        contents = describe_collection(found).to_dict()
    else:
        contents = {"message": "No elements found in region"}

    return ElementSummary({"region_info": info, "content": contents}, "Region Summary")
|
193
|
+
|
194
|
+
|
195
|
+
def inspect_collection(collection: "ElementCollection", limit: int = 30) -> InspectionSummary:
    """
    Build a per-type tabular view of the elements in a collection.

    Each element type (character elements excluded) becomes a section named
    "<type>_elements" holding the row dicts and the column list. A "page"
    column is included only when the elements come from more than one page
    number.

    Args:
        collection: ElementCollection to inspect
        limit: Maximum elements per type to show (default: 30)

    Returns:
        InspectionSummary with element tables
    """
    items = list(collection)
    if not items:
        return InspectionSummary({"message": "Empty collection"}, "Collection Inspection")

    # Distinct page numbers decide whether a page column is worth showing
    page_numbers = {
        item.page.number
        for item in items
        if hasattr(item, 'page') and hasattr(item.page, 'number')
    }
    multi_page = len(page_numbers) > 1

    # Bucket the elements by their type
    grouped = {}
    for item in items:
        grouped.setdefault(getattr(item, 'type', 'unknown'), []).append(item)

    sections = {}
    for kind, members in grouped.items():
        if kind == 'char':
            # Character elements are too granular for useful inspection
            continue

        shown = members[:limit]
        columns = _get_columns_for_type(kind, multi_page)

        # One row per displayed element, one cell per column
        rows = [
            {col: _extract_element_value(member, col) for col in columns}
            for member in shown
        ]

        section = {"elements": rows, "columns": columns}
        if len(members) > limit:
            # Tell the reader that the table was cut short
            section["note"] = f"Showing {limit} of {len(members)} elements (pass limit= to see more)"

        sections[f"{kind}_elements"] = section

    # The title counts everything except character elements
    visible = sum(1 for item in items if getattr(item, 'type', 'unknown') != 'char')
    return InspectionSummary(sections, f"Collection Inspection ({visible} elements)")
|
265
|
+
|
266
|
+
|
267
|
+
def _get_columns_for_type(element_type: str, show_page_column: bool) -> List[str]:
|
268
|
+
"""Get appropriate columns for element type."""
|
269
|
+
base_columns = ['x0', 'top', 'x1', 'bottom']
|
270
|
+
|
271
|
+
if element_type == 'word':
|
272
|
+
columns = ['text'] + base_columns + ['font_family', 'size', 'bold', 'italic', 'source', 'confidence']
|
273
|
+
# Add color for text elements
|
274
|
+
columns.append('color')
|
275
|
+
elif element_type == 'rect':
|
276
|
+
columns = base_columns + ['width', 'height', 'stroke', 'fill', 'stroke_width']
|
277
|
+
elif element_type == 'line':
|
278
|
+
columns = base_columns + ['width', 'is_horizontal', 'is_vertical'] # LineElement properties
|
279
|
+
elif element_type == 'region':
|
280
|
+
columns = base_columns + ['width', 'height', 'type']
|
281
|
+
else:
|
282
|
+
columns = base_columns + ['type']
|
283
|
+
|
284
|
+
if show_page_column:
|
285
|
+
columns.append('page')
|
286
|
+
|
287
|
+
return columns
|
288
|
+
|
289
|
+
|
290
|
+
def _extract_element_value(element: "Element", column: str) -> Any:
|
291
|
+
"""Extract value for a column from an element."""
|
292
|
+
try:
|
293
|
+
if column == 'text':
|
294
|
+
text = getattr(element, 'text', '')
|
295
|
+
if text and len(text) > 50:
|
296
|
+
return text[:50] + "..."
|
297
|
+
return text or ""
|
298
|
+
|
299
|
+
elif column == 'page':
|
300
|
+
if hasattr(element, 'page') and hasattr(element.page, 'number'):
|
301
|
+
return element.page.number
|
302
|
+
return ""
|
303
|
+
|
304
|
+
elif column == 'confidence':
|
305
|
+
confidence = getattr(element, 'confidence', None)
|
306
|
+
if confidence is not None and isinstance(confidence, (int, float)):
|
307
|
+
return f"{confidence:.2f}"
|
308
|
+
return ""
|
309
|
+
|
310
|
+
elif column == 'font_family':
|
311
|
+
# Use the cleaner font_family property from TextElement
|
312
|
+
font_family = getattr(element, 'font_family', None)
|
313
|
+
if font_family:
|
314
|
+
return font_family
|
315
|
+
# Fallback to fontname
|
316
|
+
return getattr(element, 'fontname', '')
|
317
|
+
|
318
|
+
elif column in ['bold', 'italic']:
|
319
|
+
value = getattr(element, column, False)
|
320
|
+
return value if isinstance(value, bool) else False
|
321
|
+
|
322
|
+
elif column in ['stroke', 'fill', 'color']:
|
323
|
+
# For rectangles and text, these return color tuples
|
324
|
+
value = getattr(element, column, None)
|
325
|
+
if value and isinstance(value, (tuple, list)) and len(value) >= 3:
|
326
|
+
# Convert to hex color for display
|
327
|
+
try:
|
328
|
+
r, g, b = [int(v * 255) if v <= 1 else int(v) for v in value[:3]]
|
329
|
+
return f"#{r:02x}{g:02x}{b:02x}"
|
330
|
+
except:
|
331
|
+
return str(value)
|
332
|
+
return ""
|
333
|
+
|
334
|
+
elif column in ['x0', 'top', 'x1', 'bottom', 'width', 'height', 'size', 'stroke_width']:
|
335
|
+
value = getattr(element, column, 0)
|
336
|
+
if isinstance(value, (int, float)) and not isinstance(value, bool):
|
337
|
+
return int(round(value))
|
338
|
+
return 0
|
339
|
+
|
340
|
+
elif column in ['is_horizontal', 'is_vertical']:
|
341
|
+
value = getattr(element, column, False)
|
342
|
+
return value if isinstance(value, bool) else False
|
343
|
+
|
344
|
+
else:
|
345
|
+
# Generic attribute access
|
346
|
+
value = getattr(element, column, '')
|
347
|
+
if value is None:
|
348
|
+
return ""
|
349
|
+
return str(value)
|
350
|
+
|
351
|
+
except Exception as e:
|
352
|
+
# Fallback for any unexpected errors
|
353
|
+
logger.warning(f"Error extracting {column} from element: {e}")
|
354
|
+
return ""
|
355
|
+
|
356
|
+
|
357
|
+
def describe_element(element: "Element") -> "ElementSummary":
    """
    Describe an individual element with its properties and attributes.

    Sections are added only when the element exposes the matching
    attributes: "geometry" (requires ``bbox``), "content" (non-empty
    ``text``), "properties" (font/size/style/source/confidence), "colors"
    (color/fill/stroke, shown as "#rrggbb" when given as a numeric
    tuple/list of at least three components), "page" and "shape".

    Args:
        element: The element to describe

    Returns:
        ElementSummary with formatted element properties
    """
    from natural_pdf.describe.summary import ElementSummary

    # Get basic element info; fall back to the class name when no type is set
    element_type = getattr(element, 'type', element.__class__.__name__)

    # Build the description data - use dict structure for proper list formatting
    data = {
        "info": {
            "object_type": "element",
            "element_type": element_type,
            "class_name": element.__class__.__name__,
        }
    }

    # Add geometric properties
    if hasattr(element, 'bbox'):
        data["geometry"] = {
            "position": f"({round(element.x0, 1)}, {round(element.top, 1)}, {round(element.x1, 1)}, {round(element.bottom, 1)})",
            "size": f"({round(element.width, 1)}, {round(element.height, 1)})",
        }

    # Add text content if available, truncated to 50 characters for display
    if hasattr(element, 'text') and element.text:
        text = str(element.text).strip()
        display_text = text[:50] + "..." if len(text) > 50 else text
        data["content"] = {
            "text": f"'{display_text}'",
            "length": f"{len(text)} chars",
        }

    # Add common text properties; only numeric confidence/size are rounded
    text_props = {}
    for prop in ['font_family', 'size', 'bold', 'italic', 'source', 'confidence']:
        value = getattr(element, prop, None)
        if value is None:
            continue
        if prop == 'confidence' and isinstance(value, (int, float)):
            text_props[prop] = round(value, 3)
        elif prop == 'size' and isinstance(value, (int, float)):
            text_props[prop] = round(value, 1)
        else:
            text_props[prop] = value

    if text_props:
        data["properties"] = text_props

    # Add color information
    color_info = {}
    for prop in ['color', 'fill', 'stroke']:
        value = getattr(element, prop, None)
        if value is None:
            continue

        is_numeric_triplet = (
            isinstance(value, (tuple, list))
            and len(value) >= 3
            and all(isinstance(v, (int, float)) for v in value[:3])
        )
        if not is_numeric_triplet:
            color_info[prop] = str(value)
            continue

        # Convert RGB to hex; components <= 1 are scaled by 255,
        # larger ones are used as-is.
        try:
            r, g, b = [int(v * 255) if v <= 1 else int(v) for v in value[:3]]
            color_info[prop] = f"#{r:02x}{g:02x}{b:02x}"
        except (TypeError, ValueError, OverflowError):
            # NaN or infinite component: show the raw value instead. A bare
            # except here would also swallow KeyboardInterrupt/SystemExit.
            color_info[prop] = str(value)

    if color_info:
        data["colors"] = color_info

    # Add page information
    if hasattr(element, 'page') and element.page:
        page_num = getattr(element.page, 'number', None)
        if page_num is not None:
            data["page"] = {"number": page_num}

    # Add polygon information if available
    if hasattr(element, 'has_polygon') and element.has_polygon:
        if hasattr(element, 'polygon'):
            polygon = element.polygon
            if polygon and len(polygon) > 0:
                data["shape"] = {"polygon_points": len(polygon)}

    # Create title, with a short text preview when the element has text
    title = f"{element_type.title()} Element"
    if hasattr(element, 'text') and element.text:
        preview = str(element.text).strip()[:30]
        if preview:
            title += f": '{preview}'"

    return ElementSummary(data, title)
|