natural-pdf 0.1.16__py3-none-any.whl → 0.1.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/core/page.py +2 -1
- natural_pdf/describe/__init__.py +21 -0
- natural_pdf/describe/base.py +457 -0
- natural_pdf/describe/elements.py +411 -0
- natural_pdf/describe/mixin.py +84 -0
- natural_pdf/describe/summary.py +186 -0
- natural_pdf/elements/base.py +2 -1
- natural_pdf/elements/collections.py +11 -1
- natural_pdf/elements/region.py +4 -1
- {natural_pdf-0.1.16.dist-info → natural_pdf-0.1.17.dist-info}/METADATA +1 -1
- {natural_pdf-0.1.16.dist-info → natural_pdf-0.1.17.dist-info}/RECORD +14 -9
- {natural_pdf-0.1.16.dist-info → natural_pdf-0.1.17.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.16.dist-info → natural_pdf-0.1.17.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.16.dist-info → natural_pdf-0.1.17.dist-info}/top_level.txt +0 -0
natural_pdf/core/page.py
CHANGED
@@ -61,6 +61,7 @@ from natural_pdf.classification.manager import ClassificationManager # For type
|
|
61
61
|
# # --- Classification Imports --- #
|
62
62
|
from natural_pdf.classification.mixin import ClassificationMixin # Import classification mixin
|
63
63
|
from natural_pdf.core.element_manager import ElementManager
|
64
|
+
from natural_pdf.describe.mixin import DescribeMixin # Import describe mixin
|
64
65
|
from natural_pdf.elements.base import Element # Import base element
|
65
66
|
from natural_pdf.elements.text import TextElement
|
66
67
|
from natural_pdf.extraction.mixin import ExtractionMixin # Import extraction mixin
|
@@ -92,7 +93,7 @@ except ImportError:
|
|
92
93
|
logger = logging.getLogger(__name__)
|
93
94
|
|
94
95
|
|
95
|
-
class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
|
96
|
+
class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMixin):
|
96
97
|
"""
|
97
98
|
Enhanced Page wrapper built on top of pdfplumber.Page.
|
98
99
|
|
@@ -0,0 +1,21 @@
|
|
1
|
+
"""
|
2
|
+
Describe functionality for natural-pdf.
|
3
|
+
|
4
|
+
Provides summary and inspection methods for pages, collections, and regions.
|
5
|
+
"""
|
6
|
+
|
7
|
+
from .base import describe_page, describe_collection, inspect_collection, describe_region, describe_element
|
8
|
+
from .summary import ElementSummary, InspectionSummary
|
9
|
+
from .mixin import DescribeMixin, InspectMixin
|
10
|
+
|
11
|
+
__all__ = [
|
12
|
+
'describe_page',
|
13
|
+
'describe_collection',
|
14
|
+
'inspect_collection',
|
15
|
+
'describe_region',
|
16
|
+
'describe_element',
|
17
|
+
'ElementSummary',
|
18
|
+
'InspectionSummary',
|
19
|
+
'DescribeMixin',
|
20
|
+
'InspectMixin'
|
21
|
+
]
|
@@ -0,0 +1,457 @@
|
|
1
|
+
"""
|
2
|
+
Main describe functions for pages, collections, and regions.
|
3
|
+
"""
|
4
|
+
|
5
|
+
import logging
|
6
|
+
from collections import Counter
|
7
|
+
from typing import TYPE_CHECKING, Any, Dict, List, Optional
|
8
|
+
|
9
|
+
from .elements import (
|
10
|
+
describe_line_elements,
|
11
|
+
describe_rect_elements,
|
12
|
+
describe_region_elements,
|
13
|
+
describe_text_elements,
|
14
|
+
)
|
15
|
+
from .summary import ElementSummary, InspectionSummary
|
16
|
+
|
17
|
+
if TYPE_CHECKING:
|
18
|
+
from natural_pdf.core.page import Page
|
19
|
+
from natural_pdf.elements.base import Element
|
20
|
+
from natural_pdf.elements.collections import ElementCollection
|
21
|
+
from natural_pdf.elements.region import Region
|
22
|
+
|
23
|
+
logger = logging.getLogger(__name__)
|
24
|
+
|
25
|
+
|
26
|
+
def describe_page(page: "Page") -> ElementSummary:
    """
    Describe what's on a page with high-level summary.

    Character elements are ignored throughout because they are too granular
    to be useful in an overview.

    Args:
        page: Page to describe

    Returns:
        ElementSummary with page overview. Contains a single "message" entry
        when the page has no elements; otherwise an "elements" section with
        per-type counts and, when word elements yield an analysis, a
        "text_analysis" section.
    """
    all_elements = page.get_elements()

    if not all_elements:
        data = {"message": "No elements found on page"}
        return ElementSummary(data, f"Page {page.number} Summary")

    # Element counts by type (exclude chars - too granular)
    type_counts = Counter(
        element_type
        for element_type in (getattr(e, 'type', 'unknown') for e in all_elements)
        if element_type != 'char'
    )

    # Word elements are collected once and reused for both the source
    # breakdown and the text analysis below.
    text_elements = [e for e in all_elements if getattr(e, 'type', '') == 'word']

    # Format element counts as dictionary for proper list formatting
    element_summary = {}
    for element_type, count in type_counts.most_common():
        if element_type != 'word':
            element_summary[element_type] = f"{count} elements"
            continue

        # Words are reported under "text", with a per-source breakdown
        # only when more than one source is present.
        sources = Counter(getattr(elem, 'source', 'unknown') for elem in text_elements)
        if len(sources) > 1:
            source_parts = [
                f"{source_count} {source}" for source, source_count in sources.most_common()
            ]
            element_summary["text"] = f"{count} elements ({', '.join(source_parts)})"
        else:
            element_summary["text"] = f"{count} elements"

    data = {"elements": element_summary}

    # Text analysis if we have text elements (exclude chars - too granular)
    if text_elements:
        text_analysis = describe_text_elements(text_elements)
        if text_analysis and 'message' not in text_analysis:
            data["text_analysis"] = text_analysis

    return ElementSummary(data, f"Page {page.number} Summary")
|
84
|
+
|
85
|
+
|
86
|
+
def describe_collection(collection: "ElementCollection") -> ElementSummary:
    """
    Summarise an element collection, analysing each element type separately.

    Character elements are left out of the overview, the per-type analysis
    and the element count shown in the title.

    Args:
        collection: ElementCollection to describe

    Returns:
        ElementSummary with collection analysis. A collection holding a
        single element type has its analysis flattened into the top level;
        a mixed collection gets an "overview" plus one section per type.
    """
    elements = list(collection)
    if not elements:
        return ElementSummary({"message": "Empty collection"}, "Collection Summary")

    # Bucket the elements by type, keeping first-seen order of the types
    by_type = {}
    for item in elements:
        by_type.setdefault(getattr(item, 'type', 'unknown'), []).append(item)

    is_mixed = len(by_type) > 1
    data = {}

    if is_mixed:
        # Overview of the non-char types, largest group first
        counts = {kind: len(members) for kind, members in by_type.items() if kind != 'char'}
        ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
        breakdown = [
            f"**{kind.replace('_', ' ').title()}**: {amount}" for kind, amount in ranked
        ]
        if breakdown:  # nothing to report when only chars are left
            data["overview"] = {
                "total_elements": sum(counts.values()),
                "type_breakdown": breakdown
            }

    # Analyzer used for each known type; any other type only gets a count
    analyzers = {
        'word': describe_text_elements,
        'rect': describe_rect_elements,
        'line': describe_line_elements,
        'region': describe_region_elements,
    }

    for kind, members in by_type.items():
        if kind == 'char':
            continue

        analyzer = analyzers.get(kind)
        analysis = analyzer(members) if analyzer is not None else {"count": len(members)}
        if not analysis or 'message' in analysis:
            continue

        section_name = kind.replace('_', ' ').title()
        if is_mixed:
            data[section_name] = analysis
        else:
            data.update(analysis)

    visible_total = sum(1 for item in elements if getattr(item, 'type', 'unknown') != 'char')
    return ElementSummary(data, f"Collection Summary ({visible_total} elements)")
|
155
|
+
|
156
|
+
|
157
|
+
def describe_region(region: "Region") -> ElementSummary:
    """
    Summarise a region: its placement on the page and what it contains.

    Args:
        region: Region to describe

    Returns:
        ElementSummary with a "region_info" section (page number, dimensions,
        area, position and, when present and non-empty, metadata) and a
        "content" section describing the elements found inside the region.
    """
    region_info = {"page": region.page.number}
    region_info["dimensions"] = f"{region.width:.0f}×{region.height:.0f} pts"
    region_info["area"] = f"{region.width * region.height:.0f} sq pts"
    region_info["position"] = (
        f"({region.x0:.0f}, {region.top:.0f}) to ({region.x1:.0f}, {region.bottom:.0f})"
    )

    # Metadata is optional; include it only when present and non-empty
    metadata = getattr(region, 'metadata', None)
    if metadata:
        region_info["metadata"] = metadata

    # Describe everything matched by the wildcard selector inside the region
    found = region.find_all("*")
    if found:
        content = describe_collection(found).to_dict()
    else:
        content = {"message": "No elements found in region"}

    return ElementSummary({"region_info": region_info, "content": content}, "Region Summary")
|
193
|
+
|
194
|
+
|
195
|
+
def inspect_collection(collection: "ElementCollection", limit: int = 30) -> InspectionSummary:
    """
    Build a per-type tabular view of the elements in a collection.

    Character elements are skipped. A "page" column is added only when the
    elements come from more than one page.

    Args:
        collection: ElementCollection to inspect
        limit: Maximum elements per type to show (default: 30)

    Returns:
        InspectionSummary with one "<type>_elements" section per element
        type, each holding the row data, the column names and, when rows
        were cut off by ``limit``, a note saying so.
    """
    elements = list(collection)
    if not elements:
        return InspectionSummary({"message": "Empty collection"}, "Collection Inspection")

    # Distinct page numbers decide whether a page column is worth showing
    page_numbers = {
        item.page.number
        for item in elements
        if hasattr(item, 'page') and hasattr(item.page, 'number')
    }
    show_page_column = len(page_numbers) > 1

    # Bucket the elements by type, keeping first-seen order of the types
    by_type = {}
    for item in elements:
        by_type.setdefault(getattr(item, 'type', 'unknown'), []).append(item)

    data = {}
    for kind, members in by_type.items():
        if kind == 'char':
            continue

        columns = _get_columns_for_type(kind, show_page_column)
        rows = [
            {col: _extract_element_value(item, col) for col in columns}
            for item in members[:limit]
        ]

        section = {"elements": rows, "columns": columns}
        if len(members) > limit:
            section["note"] = f"Showing {limit} of {len(members)} elements (pass limit= to see more)"

        data[f"{kind}_elements"] = section

    visible_total = sum(1 for item in elements if getattr(item, 'type', 'unknown') != 'char')
    return InspectionSummary(data, f"Collection Inspection ({visible_total} elements)")
|
265
|
+
|
266
|
+
|
267
|
+
def _get_columns_for_type(element_type: str, show_page_column: bool) -> List[str]:
|
268
|
+
"""Get appropriate columns for element type."""
|
269
|
+
base_columns = ['x0', 'top', 'x1', 'bottom']
|
270
|
+
|
271
|
+
if element_type == 'word':
|
272
|
+
columns = ['text'] + base_columns + ['font_family', 'size', 'bold', 'italic', 'source', 'confidence']
|
273
|
+
# Add color for text elements
|
274
|
+
columns.append('color')
|
275
|
+
elif element_type == 'rect':
|
276
|
+
columns = base_columns + ['width', 'height', 'stroke', 'fill', 'stroke_width']
|
277
|
+
elif element_type == 'line':
|
278
|
+
columns = base_columns + ['width', 'is_horizontal', 'is_vertical'] # LineElement properties
|
279
|
+
elif element_type == 'region':
|
280
|
+
columns = base_columns + ['width', 'height', 'type']
|
281
|
+
else:
|
282
|
+
columns = base_columns + ['type']
|
283
|
+
|
284
|
+
if show_page_column:
|
285
|
+
columns.append('page')
|
286
|
+
|
287
|
+
return columns
|
288
|
+
|
289
|
+
|
290
|
+
def _extract_element_value(element: "Element", column: str) -> Any:
|
291
|
+
"""Extract value for a column from an element."""
|
292
|
+
try:
|
293
|
+
if column == 'text':
|
294
|
+
text = getattr(element, 'text', '')
|
295
|
+
if text and len(text) > 50:
|
296
|
+
return text[:50] + "..."
|
297
|
+
return text or ""
|
298
|
+
|
299
|
+
elif column == 'page':
|
300
|
+
if hasattr(element, 'page') and hasattr(element.page, 'number'):
|
301
|
+
return element.page.number
|
302
|
+
return ""
|
303
|
+
|
304
|
+
elif column == 'confidence':
|
305
|
+
confidence = getattr(element, 'confidence', None)
|
306
|
+
if confidence is not None and isinstance(confidence, (int, float)):
|
307
|
+
return f"{confidence:.2f}"
|
308
|
+
return ""
|
309
|
+
|
310
|
+
elif column == 'font_family':
|
311
|
+
# Use the cleaner font_family property from TextElement
|
312
|
+
font_family = getattr(element, 'font_family', None)
|
313
|
+
if font_family:
|
314
|
+
return font_family
|
315
|
+
# Fallback to fontname
|
316
|
+
return getattr(element, 'fontname', '')
|
317
|
+
|
318
|
+
elif column in ['bold', 'italic']:
|
319
|
+
value = getattr(element, column, False)
|
320
|
+
return value if isinstance(value, bool) else False
|
321
|
+
|
322
|
+
elif column in ['stroke', 'fill', 'color']:
|
323
|
+
# For rectangles and text, these return color tuples
|
324
|
+
value = getattr(element, column, None)
|
325
|
+
if value and isinstance(value, (tuple, list)) and len(value) >= 3:
|
326
|
+
# Convert to hex color for display
|
327
|
+
try:
|
328
|
+
r, g, b = [int(v * 255) if v <= 1 else int(v) for v in value[:3]]
|
329
|
+
return f"#{r:02x}{g:02x}{b:02x}"
|
330
|
+
except:
|
331
|
+
return str(value)
|
332
|
+
return ""
|
333
|
+
|
334
|
+
elif column in ['x0', 'top', 'x1', 'bottom', 'width', 'height', 'size', 'stroke_width']:
|
335
|
+
value = getattr(element, column, 0)
|
336
|
+
if isinstance(value, (int, float)) and not isinstance(value, bool):
|
337
|
+
return int(round(value))
|
338
|
+
return 0
|
339
|
+
|
340
|
+
elif column in ['is_horizontal', 'is_vertical']:
|
341
|
+
value = getattr(element, column, False)
|
342
|
+
return value if isinstance(value, bool) else False
|
343
|
+
|
344
|
+
else:
|
345
|
+
# Generic attribute access
|
346
|
+
value = getattr(element, column, '')
|
347
|
+
if value is None:
|
348
|
+
return ""
|
349
|
+
return str(value)
|
350
|
+
|
351
|
+
except Exception as e:
|
352
|
+
# Fallback for any unexpected errors
|
353
|
+
logger.warning(f"Error extracting {column} from element: {e}")
|
354
|
+
return ""
|
355
|
+
|
356
|
+
|
357
|
+
def describe_element(element: "Element") -> "ElementSummary":
    """
    Describe an individual element with its properties and attributes.

    Sections are added only when the element exposes the relevant
    attributes: "geometry" (needs ``bbox``), "content" (non-empty text),
    "properties" (font/size/style/source/confidence), "colors", "page" and
    "shape" (polygon point count).

    Args:
        element: The element to describe

    Returns:
        ElementSummary with formatted element properties. The title is the
        element type followed by "Element" and, when the element has text,
        a preview of up to 30 characters.
    """
    from natural_pdf.describe.summary import ElementSummary

    # Fall back to the class name when the element has no explicit type
    element_type = getattr(element, 'type', element.__class__.__name__)

    # Build the description data - use dict structure for proper list formatting
    data = {
        "info": {
            "object_type": "element",
            "element_type": element_type,
            "class_name": element.__class__.__name__
        }
    }

    # Geometric properties
    if hasattr(element, 'bbox'):
        data["geometry"] = {
            "position": f"({round(element.x0, 1)}, {round(element.top, 1)}, {round(element.x1, 1)}, {round(element.bottom, 1)})",
            "size": f"({round(element.width, 1)}, {round(element.height, 1)})"
        }

    # Text content, truncated to 50 characters for display
    if hasattr(element, 'text') and element.text:
        text = str(element.text).strip()
        display_text = text[:50] + "..." if len(text) > 50 else text
        data["content"] = {
            "text": f"'{display_text}'",
            "length": f"{len(text)} chars"
        }

    # Common text properties; numeric confidence and size are rounded
    text_props = {}
    for prop in ['font_family', 'size', 'bold', 'italic', 'source', 'confidence']:
        value = getattr(element, prop, None)
        if value is None:
            continue
        if prop == 'confidence' and isinstance(value, (int, float)):
            text_props[prop] = round(value, 3)
        elif prop == 'size' and isinstance(value, (int, float)):
            text_props[prop] = round(value, 1)
        else:
            text_props[prop] = value

    if text_props:
        data["properties"] = text_props

    # Color information: numeric sequences of 3+ components become hex
    # strings, everything else is shown via str()
    color_info = {}
    for prop in ['color', 'fill', 'stroke']:
        value = getattr(element, prop, None)
        if value is None:
            continue
        color_info[prop] = str(value)
        if isinstance(value, (tuple, list)) and len(value) >= 3:
            try:
                if all(isinstance(v, (int, float)) for v in value[:3]):
                    # Components <= 1 are treated as 0-1 fractions
                    r, g, b = [int(v * 255) if v <= 1 else int(v) for v in value[:3]]
                    color_info[prop] = f"#{r:02x}{g:02x}{b:02x}"
            except (TypeError, ValueError, ArithmeticError):
                # Non-finite components (nan/inf): keep the str() fallback
                pass

    if color_info:
        data["colors"] = color_info

    # Page information
    if hasattr(element, 'page') and element.page:
        page_num = getattr(element.page, 'number', None)
        if page_num is not None:
            data["page"] = {"number": page_num}

    # Polygon information, only for elements that report having a polygon
    if hasattr(element, 'has_polygon') and element.has_polygon:
        if hasattr(element, 'polygon'):
            polygon = element.polygon
            if polygon and len(polygon) > 0:
                data["shape"] = {"polygon_points": len(polygon)}

    # str() guards against a non-string `type` attribute
    title = f"{str(element_type).title()} Element"
    if hasattr(element, 'text') and element.text:
        preview = str(element.text).strip()[:30]
        if preview:
            title += f": '{preview}'"

    return ElementSummary(data, title)
|
@@ -0,0 +1,411 @@
|
|
1
|
+
"""
|
2
|
+
Element-specific describe functions.
|
3
|
+
"""
|
4
|
+
|
5
|
+
import logging
|
6
|
+
from collections import Counter
|
7
|
+
from typing import TYPE_CHECKING, Any, Dict, List
|
8
|
+
|
9
|
+
if TYPE_CHECKING:
|
10
|
+
from natural_pdf.elements.base import Element
|
11
|
+
|
12
|
+
logger = logging.getLogger(__name__)
|
13
|
+
|
14
|
+
|
15
|
+
def describe_text_elements(elements: List["Element"]) -> Dict[str, Any]:
    """
    Analyse text elements: where they came from, typography and OCR quality.

    Args:
        elements: List of text elements

    Returns:
        Dictionary with text analysis sections. "sources" appears only when
        more than one source is present; "typography" and "ocr_quality"
        appear only when their analysis is non-empty. An empty input yields
        a single "message" entry.
    """
    if not elements:
        return {"message": "No text elements found"}

    # Tally the sources and keep the OCR-derived elements for quality checks
    sources = Counter(getattr(item, 'source', 'unknown') for item in elements)
    ocr_elements = [item for item in elements if getattr(item, 'source', 'unknown') == 'ocr']

    result = {}
    if len(sources) > 1:
        result['sources'] = dict(sources)

    typography = _analyze_typography(elements)
    if typography:
        result['typography'] = typography

    if ocr_elements:
        ocr_quality = _analyze_ocr_quality(ocr_elements)
        if ocr_quality:
            result['ocr_quality'] = ocr_quality

    return result
|
55
|
+
|
56
|
+
|
57
|
+
def describe_rect_elements(elements: List["Element"]) -> Dict[str, Any]:
    """
    Describe rectangle elements with size and style analysis.

    Args:
        elements: List of rectangle elements

    Returns:
        Dictionary with rectangle analysis: "size_stats" (width/height
        ranges and average area, for elements with non-zero width and
        height) and "styles" (stroke/fill counts, stroke-width histogram
        and the five most common colors). An empty input yields a single
        "message" entry.
    """
    if not elements:
        return {"message": "No rectangle elements found"}

    result = {}

    sizes = []
    stroke_count = 0
    fill_count = 0
    colors = Counter()
    stroke_widths = []

    for element in elements:
        # Size
        width = getattr(element, 'width', 0)
        height = getattr(element, 'height', 0)
        if width and height:
            sizes.append((width, height))

        # Colors given as lists are normalised to tuples so that they
        # compare equal to the black/white constants below.
        stroke = getattr(element, 'stroke', None)
        if isinstance(stroke, list):
            stroke = tuple(stroke)
        fill = getattr(element, 'fill', None)
        if isinstance(fill, list):
            fill = tuple(fill)

        # A stroke/fill only counts when it is set and is not black
        if stroke and stroke != (0, 0, 0):
            stroke_count += 1
        if fill and fill != (0, 0, 0):
            fill_count += 1

        # Stroke width; a missing/None width is simply skipped
        stroke_width = getattr(element, 'stroke_width', 0)
        if stroke_width and stroke_width > 0:
            stroke_widths.append(stroke_width)

        # Representative color: stroke if set, otherwise fill
        color = stroke or fill
        if color:
            if isinstance(color, tuple) and color == (0, 0, 0):
                colors['black'] += 1
            elif isinstance(color, tuple) and color == (1, 1, 1):
                colors['white'] += 1
            else:
                colors[str(color)] += 1

    # Size statistics
    if sizes:
        widths = [w for w, _ in sizes]
        heights = [h for _, h in sizes]
        result['size_stats'] = {
            'width_range': f"{min(widths):.0f}-{max(widths):.0f}",
            'height_range': f"{min(heights):.0f}-{max(heights):.0f}",
            'avg_area': f"{sum(w*h for w,h in sizes)/len(sizes):.0f} sq pts"
        }

    # Style breakdown
    style_info = {}
    if stroke_count:
        style_info['stroke'] = stroke_count
    if fill_count:
        style_info['fill'] = fill_count
    if stroke_widths:
        # Convert float keys to strings to avoid formatting issues
        style_info['stroke_widths'] = {
            str(k): v for k, v in Counter(stroke_widths).most_common()
        }
    if colors:
        style_info['colors'] = dict(colors.most_common(5))

    if style_info:
        result['styles'] = style_info

    return result
|
140
|
+
|
141
|
+
|
142
|
+
def describe_line_elements(elements: List["Element"]) -> Dict[str, Any]:
    """
    Describe line elements with length and style analysis.

    Args:
        elements: List of line elements

    Returns:
        Dictionary with line analysis: "length_stats" (min/max/avg of the
        bounding-box diagonal, zero-length lines excluded), "line_widths"
        (histogram of non-zero ``width`` values), "orientations"
        (horizontal/vertical/diagonal counts) and "colors". An empty input
        yields a single "message" entry.
    """
    if not elements:
        return {"message": "No line elements found"}

    result = {}

    lengths = []
    widths = []
    colors = Counter()

    for element in elements:
        # Length is the diagonal of the element's bounding box
        x0 = getattr(element, 'x0', 0)
        y0 = getattr(element, 'top', 0)
        x1 = getattr(element, 'x1', 0)
        y1 = getattr(element, 'bottom', 0)

        length = ((x1 - x0) ** 2 + (y1 - y0) ** 2) ** 0.5
        if length > 0:
            lengths.append(length)

        # Line width - use the element's width property
        width = getattr(element, 'width', 0)
        if width:
            widths.append(width)

        # Color; lists are normalised to tuples so black given as a list
        # is recognised as well.
        color = getattr(element, 'color', None)
        if isinstance(color, list):
            color = tuple(color)
        if color:
            if isinstance(color, tuple) and color == (0, 0, 0):
                colors['black'] += 1
            else:
                colors[str(color)] += 1

    # Length statistics
    if lengths:
        result['length_stats'] = {
            'min': f"{min(lengths):.0f}",
            'max': f"{max(lengths):.0f}",
            'avg': f"{sum(lengths)/len(lengths):.0f}"
        }

    # Width statistics
    if widths:
        # Convert float keys to strings to avoid formatting issues
        result['line_widths'] = {str(k): v for k, v in Counter(widths).most_common()}

    # Orientation analysis. An element flagged as both horizontal and
    # vertical is counted in both buckets, so clamp the remainder at zero
    # to avoid reporting a negative diagonal count.
    horizontal_count = sum(1 for el in elements if getattr(el, 'is_horizontal', False))
    vertical_count = sum(1 for el in elements if getattr(el, 'is_vertical', False))
    diagonal_count = max(0, len(elements) - horizontal_count - vertical_count)

    orientation_info = {}
    if horizontal_count:
        orientation_info['horizontal'] = horizontal_count
    if vertical_count:
        orientation_info['vertical'] = vertical_count
    if diagonal_count:
        orientation_info['diagonal'] = diagonal_count
    if orientation_info:
        result['orientations'] = orientation_info

    # Colors
    if colors:
        result['colors'] = dict(colors.most_common())

    return result
|
222
|
+
|
223
|
+
|
224
|
+
def describe_region_elements(elements: List["Element"]) -> Dict[str, Any]:
    """
    Summarise region elements by type, area and metadata keys.

    Args:
        elements: List of region elements

    Returns:
        Dictionary with region analysis: "types" (count per ``type``
        value), "size_stats" (min/max/avg area over regions with non-zero
        width and height) and "metadata_keys" (sorted union of all metadata
        keys). An empty input yields a single "message" entry.
    """
    if not elements:
        return {"message": "No region elements found"}

    type_tally = Counter(getattr(item, 'type', 'unknown') for item in elements)

    areas = []
    metadata_keys = set()
    for item in elements:
        w = getattr(item, 'width', 0)
        h = getattr(item, 'height', 0)
        if w and h:
            areas.append(w * h)

        # Metadata is optional; gather keys only from non-empty mappings
        meta = getattr(item, 'metadata', None)
        if meta:
            metadata_keys.update(meta.keys())

    result = {}
    if type_tally:
        result['types'] = dict(type_tally.most_common())

    if areas:
        result['size_stats'] = {
            'min_area': f"{min(areas):.0f} sq pts",
            'max_area': f"{max(areas):.0f} sq pts",
            'avg_area': f"{sum(areas)/len(areas):.0f} sq pts"
        }

    if metadata_keys:
        result['metadata_keys'] = sorted(metadata_keys)

    return result
|
276
|
+
|
277
|
+
|
278
|
+
def _analyze_typography(elements: List["Element"]) -> Dict[str, Any]:
|
279
|
+
"""Analyze typography patterns in text elements."""
|
280
|
+
fonts = Counter()
|
281
|
+
sizes = Counter()
|
282
|
+
styles = {'bold': 0, 'italic': 0}
|
283
|
+
colors = Counter()
|
284
|
+
|
285
|
+
for element in elements:
|
286
|
+
# Font family - use TextElement's font_family property for cleaner names
|
287
|
+
font_family = getattr(element, 'font_family', None)
|
288
|
+
fontname = getattr(element, 'fontname', 'Unknown')
|
289
|
+
display_font = font_family if font_family and font_family != fontname else fontname
|
290
|
+
if display_font:
|
291
|
+
fonts[display_font] += 1
|
292
|
+
|
293
|
+
# Size
|
294
|
+
size = getattr(element, 'size', None)
|
295
|
+
if size:
|
296
|
+
# Round to nearest 0.5
|
297
|
+
rounded_size = round(size * 2) / 2
|
298
|
+
sizes[f"{rounded_size}pt"] += 1
|
299
|
+
|
300
|
+
# Styles
|
301
|
+
if getattr(element, 'bold', False):
|
302
|
+
styles['bold'] += 1
|
303
|
+
if getattr(element, 'italic', False):
|
304
|
+
styles['italic'] += 1
|
305
|
+
|
306
|
+
# Color - use TextElement's color property
|
307
|
+
color = getattr(element, 'color', None)
|
308
|
+
if color:
|
309
|
+
if isinstance(color, (tuple, list)):
|
310
|
+
if color == (0, 0, 0) or color == (0.0, 0.0, 0.0):
|
311
|
+
colors['black'] += 1
|
312
|
+
elif color == (1, 1, 1) or color == (1.0, 1.0, 1.0):
|
313
|
+
colors['white'] += 1
|
314
|
+
else:
|
315
|
+
colors['other'] += 1
|
316
|
+
else:
|
317
|
+
colors[str(color)] += 1
|
318
|
+
|
319
|
+
result = {}
|
320
|
+
|
321
|
+
# Fonts
|
322
|
+
if fonts:
|
323
|
+
result['fonts'] = dict(fonts.most_common(10))
|
324
|
+
|
325
|
+
# Sizes (as horizontal table)
|
326
|
+
if sizes:
|
327
|
+
result['sizes'] = dict(sizes.most_common())
|
328
|
+
|
329
|
+
# Styles
|
330
|
+
style_list = []
|
331
|
+
if styles['bold']:
|
332
|
+
style_list.append(f"{styles['bold']} bold")
|
333
|
+
if styles['italic']:
|
334
|
+
style_list.append(f"{styles['italic']} italic")
|
335
|
+
if style_list:
|
336
|
+
result['styles'] = ", ".join(style_list)
|
337
|
+
|
338
|
+
# Colors
|
339
|
+
if colors and len(colors) > 1: # Only show if there are multiple colors
|
340
|
+
result['colors'] = dict(colors.most_common())
|
341
|
+
|
342
|
+
return result
|
343
|
+
|
344
|
+
|
345
|
+
def _analyze_ocr_quality(elements: List["Element"]) -> Dict[str, Any]:
    """
    Summarize OCR confidence values found on a list of elements.

    Elements whose ``confidence`` attribute is missing or ``None`` are left
    out of the statistics, but they still count toward the denominator of the
    threshold percentages (which use ``len(elements)``).

    Args:
        elements: Elements to analyze.

    Returns:
        An empty dict when no element carries a confidence value. Otherwise a
        dict with:
          - 'confidence_stats': mean/min/max formatted to two decimals
          - 'quality_distribution': label -> 40-character ASCII bar for the
            99%+, 95%+ and 90%+ thresholds
          - 'lowest_scoring': up to ten lowest-confidence elements that have
            non-blank text (only present when at least one such element exists)
    """
    bar_width = 40

    confidences = []
    scored_texts = []  # (confidence, display_text) for elements with visible text

    # Single pass: collect confidences and, alongside, the text to display.
    for element in elements:
        confidence = getattr(element, 'confidence', None)
        if confidence is None:
            continue
        confidences.append(confidence)

        # The getattr default only covers a *missing* attribute; an element
        # whose text is explicitly None must not crash the summary.
        raw_text = getattr(element, 'text', None)
        text = str(raw_text).strip() if raw_text is not None else ''
        if text:
            # Truncate long text
            display_text = text[:50] + "..." if len(text) > 50 else text
            scored_texts.append((confidence, display_text))

    if not confidences:
        return {}

    result = {}

    # Basic stats
    result['confidence_stats'] = {
        'mean': f"{sum(confidences)/len(confidences):.2f}",
        'min': f"{min(confidences):.2f}",
        'max': f"{max(confidences):.2f}"
    }

    # Threshold analysis with ASCII bars
    thresholds = [
        ('99%+', 0.99),
        ('95%+', 0.95),
        ('90%+', 0.90),
    ]

    # Non-zero here: at least one element contributed a confidence.
    element_count = len(elements)
    threshold_bars = {}

    for label, threshold in thresholds:
        count = sum(1 for c in confidences if c >= threshold)
        percentage = count / element_count

        filled_chars = int(percentage * bar_width)
        bar = '█' * filled_chars + '░' * (bar_width - filled_chars)

        # Format: "95%+ (32/43) 74%: `████████████████████████████████░░░░░░░░`"
        threshold_bars[f"{label} ({count}/{element_count}) {percentage:.0%}"] = f"`{bar}`"

    result['quality_distribution'] = threshold_bars

    # Show lowest quality items (stable sort keeps input order among ties)
    if scored_texts:
        lowest_quality = sorted(scored_texts, key=lambda x: x[0])[:10]
        result['lowest_scoring'] = {
            f"#{i}": f"**{confidence:.2f}**: {text}"
            for i, (confidence, text) in enumerate(lowest_quality, 1)
        }

    return result
|
@@ -0,0 +1,84 @@
|
|
1
|
+
"""
|
2
|
+
Mixin for describe functionality.
|
3
|
+
"""
|
4
|
+
|
5
|
+
from typing import TYPE_CHECKING
|
6
|
+
|
7
|
+
if TYPE_CHECKING:
|
8
|
+
from natural_pdf.describe.summary import ElementSummary, InspectionSummary
|
9
|
+
|
10
|
+
|
11
|
+
class DescribeMixin:
    """
    Mixin that gives pages, collections, regions and elements a
    ``.describe()`` method returning a high-level summary.

    Only ``describe()`` is defined on this class.
    """

    def describe(self) -> "ElementSummary":
        """
        Build a summary of this object, choosing the analysis by object type.

        The concrete class name is checked first ('Page', 'ElementCollection',
        'Region'), then ``Element`` inheritance, then duck-typed attributes.
        Objects matching none of these get a placeholder summary.

        Returns:
            ElementSummary with analysis appropriate for the object type
        """
        from natural_pdf.describe import describe_page, describe_collection, describe_region, describe_element

        class_name = self.__class__.__name__

        # Exact class-name matches take priority over every other check.
        by_class_name = {
            'Page': describe_page,
            'ElementCollection': describe_collection,
            'Region': describe_region,
        }
        handler = by_class_name.get(class_name)
        if handler is not None:
            return handler(self)

        # Individual elements: anything inheriting from the Element base class.
        from natural_pdf.elements.base import Element
        if isinstance(self, Element):
            return describe_element(self)

        # Duck-typed fallback: page-like / region-like objects expose
        # get_elements, width and height; a 'number' attribute marks a page.
        if hasattr(self, 'get_elements') and hasattr(self, 'width') and hasattr(self, 'height'):
            if hasattr(self, 'number'):
                return describe_page(self)
            return describe_region(self)

        # Iterable and sized: treat as a collection.
        if hasattr(self, '__iter__') and hasattr(self, '__len__'):
            return describe_collection(self)

        # Unknown type - hand back a minimal placeholder summary.
        from natural_pdf.describe.summary import ElementSummary
        placeholder = {
            "object_type": class_name,
            "message": f"Describe not fully implemented for {class_name}"
        }
        return ElementSummary(placeholder, f"{class_name} Summary")
|
62
|
+
|
63
|
+
|
64
|
+
class InspectMixin:
    """
    Mixin that adds an ``.inspect()`` method to collections, producing a
    detailed tabular view of their elements.
    """

    def inspect(self, limit: int = 30) -> "InspectionSummary":
        """
        Produce a detailed, table-oriented view of the elements.

        Delegates to ``natural_pdf.describe.inspect_collection``.

        Args:
            limit: Maximum elements per type to show (default: 30)

        Returns:
            InspectionSummary as returned by ``inspect_collection``
        """
        # Imported at call time rather than at module import.
        from natural_pdf.describe import inspect_collection

        summary = inspect_collection(self, limit=limit)
        return summary
|
@@ -0,0 +1,186 @@
|
|
1
|
+
"""
|
2
|
+
Summary objects for describe functionality.
|
3
|
+
"""
|
4
|
+
|
5
|
+
from typing import Any, Dict, List, Union
|
6
|
+
|
7
|
+
|
8
|
+
class ElementSummary:
    """
    Container for element summary data with markdown rendering.

    ``str()``, ``repr()`` and the Jupyter ``_repr_markdown_`` hook all return
    the same markdown text; the underlying data is available via ``to_dict()``.
    """

    def __init__(self, data: Dict[str, Any], title: str = "Summary"):
        """
        Initialize summary with data and optional title.

        Args:
            data: Dictionary containing summary sections
            title: Title for the summary display
        """
        self.data = data
        self.title = title

    def __str__(self) -> str:
        """String representation as markdown."""
        return self._to_markdown()

    def __repr__(self) -> str:
        """Repr as markdown for better display."""
        return self._to_markdown()

    def _repr_markdown_(self) -> str:
        """Jupyter notebook markdown rendering."""
        return self._to_markdown()

    def to_dict(self) -> Dict[str, Any]:
        """Return a shallow copy of the underlying data (nested values are shared)."""
        return self.data.copy()

    @staticmethod
    def _label(key: Any) -> str:
        """Turn a dict key into display text; non-string keys are stringified first."""
        return str(key).replace('_', ' ')

    def _to_markdown(self) -> str:
        """Render the title followed by every section, separated by blank lines."""
        lines = [f"## {self.title}", ""]

        for section_name, section_data in self.data.items():
            lines.extend(self._format_section(section_name, section_data))
            lines.append("")  # Empty line between sections

        return "\n".join(lines).rstrip()

    def _format_section(self, name: str, data: Any) -> List[str]:
        """
        Format a single section as markdown lines.

        Dicts become a bold title followed by a bullet list, lists become a
        comma-separated line, anything else is rendered inline.
        """
        # Use bold text instead of headers for more compact display
        section_title = self._label(name).title()

        if isinstance(data, dict):
            lines = [f"**{section_title}**:"]
            lines.extend(self._format_dict(data, indent=" "))
        elif isinstance(data, list):
            lines = [f"**{section_title}**: {', '.join(str(item) for item in data)}"]
        else:
            lines = [f"**{section_title}**: {data}"]

        return lines

    def _format_dict(self, data: Dict[str, Any], indent: str = "") -> List[str]:
        """
        Format a dictionary as markdown list items.

        Handles up to three levels of dict nesting; anything nested deeper is
        rendered with its plain ``str()`` form. Lists longer than five items
        are summarized as "N items".
        """
        lines = []

        for key, value in data.items():
            key_display = self._label(key)

            if isinstance(value, dict):
                # Nested dict - always format as list items
                lines.append(f"{indent}- **{key_display}**:")
                for subkey, subvalue in value.items():
                    subkey_display = self._label(subkey)
                    if isinstance(subvalue, dict):
                        # Another level of nesting
                        lines.append(f"{indent} - **{subkey_display}**:")
                        for subsubkey, subsubvalue in subvalue.items():
                            subsubkey_display = self._label(subsubkey)
                            lines.append(f"{indent} - {subsubkey_display}: {subsubvalue}")
                    else:
                        lines.append(f"{indent} - {subkey_display}: {subvalue}")
            elif isinstance(value, list):
                if len(value) <= 5:
                    value_str = ", ".join(str(v) for v in value)
                    lines.append(f"{indent}- **{key_display}**: {value_str}")
                else:
                    lines.append(f"{indent}- **{key_display}**: {len(value)} items")
            else:
                lines.append(f"{indent}- **{key_display}**: {value}")

        return lines

    def _format_list(self, data: List[Any]) -> List[str]:
        """Format each list item as a markdown bullet using its ``str()`` form."""
        return [f"- {item}" for item in data]

    def _format_horizontal_table(self, title: str, data: Dict[str, Any]) -> List[str]:
        """
        Format a dict as a one-row markdown table: keys are headers, values the row.

        Returns the bullet title, a blank line, the three table rows and a
        trailing blank line.
        """
        # Stringify headers so numeric keys do not break str.join.
        headers = [str(header) for header in data.keys()]
        values = list(data.values())

        # Create table
        header_row = "| " + " | ".join(headers) + " |"
        separator = "|" + "|".join("------" for _ in headers) + "|"
        value_row = "| " + " | ".join(str(v) for v in values) + " |"

        return [
            f"- **{title}**:",
            "",
            header_row,
            separator,
            value_row,
            ""
        ]
|
130
|
+
|
131
|
+
|
132
|
+
class InspectionSummary(ElementSummary):
    """
    Summary for element inspection with tabular data.

    Sections shaped like ``{'elements': [...], 'columns': [...], 'note': ...}``
    are rendered as markdown tables; every other section is rendered inline.
    """

    def _format_section(self, name: str, data: Any) -> List[str]:
        """
        Format an inspection section.

        Args:
            name: Section key; underscores become spaces and the result is title-cased.
            data: Section payload. A dict containing an 'elements' key is
                rendered as a table under a ``###`` header; anything else is
                rendered as ``**Title**: value``.

        Returns:
            Markdown lines for the section.
        """
        section_title = str(name).replace('_', ' ').title()

        if isinstance(data, dict) and 'elements' in data:
            # This is an element table section - use ### header for inspect
            elements = data['elements']
            lines = [f"### {section_title}"]
            if elements:
                lines.extend(self._format_element_table(elements, data.get('columns', [])))
                # Add note if truncated
                if 'note' in data:
                    lines.append(f"_{data['note']}_")
            else:
                lines.append("No elements found.")
        else:
            # Regular section formatting
            lines = [f"**{section_title}**: {data}"]

        return lines

    @staticmethod
    def _format_cell(value: Any) -> str:
        """
        Convert one cell value to text that is safe inside a markdown table row.

        None becomes an empty cell, floats are rounded to whole numbers,
        strings longer than 50 characters are truncated with "...".
        """
        if value is None:
            return ""
        if isinstance(value, float):
            try:
                return str(int(round(value)))
            except (ValueError, OverflowError):
                # NaN / infinity cannot be rounded to an int; show them as-is.
                return str(value)
        if isinstance(value, str) and len(value) > 50:
            value = value[:50] + "..."
        text = str(value)
        # A raw pipe or line break would split the cell / end the row.
        text = text.replace("\r\n", " ").replace("\n", " ").replace("\r", " ")
        return text.replace("|", "\\|")

    def _format_element_table(self, elements: List[Dict[str, Any]], columns: List[str]) -> List[str]:
        """
        Format elements as a markdown table.

        Args:
            elements: One dict per row; missing columns render as empty cells.
            columns: Column keys, used both as headers and as lookup keys.

        Returns:
            A leading blank line, the header and separator rows, then one row
            per element; or a single placeholder line when there is nothing
            to display.
        """
        if not elements or not columns:
            return ["No elements to display."]

        lines = [""]  # Empty line before table

        # Table header
        header_row = "| " + " | ".join(self._format_cell(col) for col in columns) + " |"
        separator = "|" + "|".join("------" for _ in columns) + "|"
        lines.extend([header_row, separator])

        # Table rows
        for element in elements:
            row_values = [self._format_cell(element.get(col, "")) for col in columns]
            lines.append("| " + " | ".join(row_values) + " |")

        return lines
|
natural_pdf/elements/base.py
CHANGED
@@ -8,6 +8,7 @@ from PIL import Image
|
|
8
8
|
|
9
9
|
# Import selector parsing functions
|
10
10
|
from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
|
11
|
+
from natural_pdf.describe.mixin import DescribeMixin
|
11
12
|
|
12
13
|
if TYPE_CHECKING:
|
13
14
|
from natural_pdf.core.page import Page
|
@@ -412,7 +413,7 @@ class DirectionalMixin:
|
|
412
413
|
return new_region
|
413
414
|
|
414
415
|
|
415
|
-
class Element(DirectionalMixin):
|
416
|
+
class Element(DirectionalMixin, DescribeMixin):
|
416
417
|
"""
|
417
418
|
Base class for all PDF elements.
|
418
419
|
|
@@ -30,6 +30,7 @@ from tqdm.auto import tqdm
|
|
30
30
|
from natural_pdf.analyzers.shape_detection_mixin import ShapeDetectionMixin
|
31
31
|
from natural_pdf.classification.manager import ClassificationManager
|
32
32
|
from natural_pdf.classification.mixin import ClassificationMixin
|
33
|
+
from natural_pdf.describe.mixin import DescribeMixin, InspectMixin
|
33
34
|
from natural_pdf.collections.mixins import ApplyMixin, DirectionalCollectionMixin
|
34
35
|
from natural_pdf.core.pdf import PDF
|
35
36
|
from natural_pdf.elements.base import Element
|
@@ -71,7 +72,14 @@ P = TypeVar("P", bound="Page")
|
|
71
72
|
|
72
73
|
|
73
74
|
class ElementCollection(
|
74
|
-
Generic[T],
|
75
|
+
Generic[T],
|
76
|
+
ApplyMixin,
|
77
|
+
ExportMixin,
|
78
|
+
ClassificationMixin,
|
79
|
+
DirectionalCollectionMixin,
|
80
|
+
DescribeMixin,
|
81
|
+
InspectMixin,
|
82
|
+
MutableSequence,
|
75
83
|
):
|
76
84
|
"""
|
77
85
|
Collection of PDF elements with batch operations.
|
@@ -1795,6 +1803,8 @@ class ElementCollection(
|
|
1795
1803
|
)
|
1796
1804
|
|
1797
1805
|
|
1806
|
+
|
1807
|
+
|
1798
1808
|
class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
|
1799
1809
|
"""
|
1800
1810
|
Represents a collection of Page objects, often from a single PDF document.
|
natural_pdf/elements/region.py
CHANGED
@@ -15,6 +15,7 @@ from natural_pdf.classification.manager import ClassificationManager # Keep for
|
|
15
15
|
|
16
16
|
# --- Classification Imports --- #
|
17
17
|
from natural_pdf.classification.mixin import ClassificationMixin
|
18
|
+
from natural_pdf.describe.mixin import DescribeMixin
|
18
19
|
from natural_pdf.elements.base import DirectionalMixin
|
19
20
|
from natural_pdf.elements.text import TextElement # ADDED IMPORT
|
20
21
|
from natural_pdf.extraction.mixin import ExtractionMixin # Import extraction mixin
|
@@ -49,7 +50,7 @@ except ImportError:
|
|
49
50
|
logger = logging.getLogger(__name__)
|
50
51
|
|
51
52
|
|
52
|
-
class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
|
53
|
+
class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMixin):
|
53
54
|
"""
|
54
55
|
Represents a rectangular region on a page.
|
55
56
|
"""
|
@@ -2962,3 +2963,5 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
2962
2963
|
)
|
2963
2964
|
|
2964
2965
|
return text_element
|
2966
|
+
|
2967
|
+
|
@@ -25,14 +25,19 @@ natural_pdf/collections/pdf_collection.py,sha256=HLlyakM--23ZOeHDPucoM6Tw3yUyMXm
|
|
25
25
|
natural_pdf/core/__init__.py,sha256=QC8H4M3KbXwMFiQORZ0pdPlzx1Ix6oKKQSS7Ib2KEaA,38
|
26
26
|
natural_pdf/core/element_manager.py,sha256=_UdXu51sLi6STzc8Pj4k8R721G3yJixXDLuRHn3hmr8,25731
|
27
27
|
natural_pdf/core/highlighting_service.py,sha256=_kQUS6_BBvsLBuSZloFrVag6jN90KzHa0ULyGBjufSs,36955
|
28
|
-
natural_pdf/core/page.py,sha256=
|
28
|
+
natural_pdf/core/page.py,sha256=i3DriIQwoO4RuSrkrCXv44Dz8OL9KXPa2y4GhsD1y18,118324
|
29
29
|
natural_pdf/core/pdf.py,sha256=bAoGPiKIrFaebLwULMT-9VkHQ_wkE_zNl4hlbMLk-2w,69325
|
30
|
+
natural_pdf/describe/__init__.py,sha256=B3zjuHjFI_dFuBLgXR1Q4v7c72fVDyk84d2hs0H4KV8,561
|
31
|
+
natural_pdf/describe/base.py,sha256=7USCFIl4mI5b15LTVkwvhAn_mngMwhwxCnVYaZz5Vdc,16842
|
32
|
+
natural_pdf/describe/elements.py,sha256=BOkz2wDhGh6P8NOm6pSNxitgmVokLTISztaFhrxMcdw,12717
|
33
|
+
natural_pdf/describe/mixin.py,sha256=U0x6v8r57KQb8qC3VVo64hvhfXQWsti8vdKBM7AXnMo,3116
|
34
|
+
natural_pdf/describe/summary.py,sha256=dPtjrn6fQ8nL0F74RITX2vXlDX7ZgaX9JQPnJB-S_XQ,6735
|
30
35
|
natural_pdf/elements/__init__.py,sha256=S8XeiNWJ1WcgnyYKdYV1yxQlAxCCO3FfITT8MQwNbyk,41
|
31
|
-
natural_pdf/elements/base.py,sha256=
|
32
|
-
natural_pdf/elements/collections.py,sha256=
|
36
|
+
natural_pdf/elements/base.py,sha256=IlAeyzV66xMrxVx9U3ocGPekzGUBJgKkAiJ5kpvCSAg,39675
|
37
|
+
natural_pdf/elements/collections.py,sha256=vgVZsVC3xxRF2S5KW7L0JKa-NSUFnqURk50NtvlwbcM,122113
|
33
38
|
natural_pdf/elements/line.py,sha256=300kSFBDUBIudfeQtH_tzW9gTYRgRKUDPiTABw6J-BE,4782
|
34
39
|
natural_pdf/elements/rect.py,sha256=kiVa3e377ZnqIOXc89d9ZSY4EcmDxtccdtUw-HOQzpw,3796
|
35
|
-
natural_pdf/elements/region.py,sha256=
|
40
|
+
natural_pdf/elements/region.py,sha256=hBklYKcXJWyxayu9todYQOZ-d9KVDtqeV-CIt9IcSn8,123400
|
36
41
|
natural_pdf/elements/text.py,sha256=13HvVZGinj2Vm_fFCAnqi7hohtoKvnpCp3VCfkpeAbc,11146
|
37
42
|
natural_pdf/export/mixin.py,sha256=L1q3MIEFWuvie4j4_EmW7GT3NerbZ1as0XMUoqTS7gM,5083
|
38
43
|
natural_pdf/exporters/__init__.py,sha256=7MnvRLLQdwtg-ULu-8uK8C84GsKiJamyhRw_GgWhw7k,151
|
@@ -85,8 +90,8 @@ natural_pdf/utils/text_extraction.py,sha256=z6Jhy11pakYCsEpkvh8ldw6DkUFsYF1hCL9Y
|
|
85
90
|
natural_pdf/utils/visualization.py,sha256=30pRWQdsRJh2pSObh-brKVsFgC1n8tHmSrta_UDnVPw,8989
|
86
91
|
natural_pdf/widgets/__init__.py,sha256=QTVaUmsw__FCweFYZebwPssQxxUFUMd0wpm_cUbGZJY,181
|
87
92
|
natural_pdf/widgets/viewer.py,sha256=2VUY1TzWMDe9I-IVNOosKZ2LaqpjLB62ftMAdk-s6_8,24952
|
88
|
-
natural_pdf-0.1.
|
89
|
-
natural_pdf-0.1.
|
90
|
-
natural_pdf-0.1.
|
91
|
-
natural_pdf-0.1.
|
92
|
-
natural_pdf-0.1.
|
93
|
+
natural_pdf-0.1.17.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
|
94
|
+
natural_pdf-0.1.17.dist-info/METADATA,sha256=yGeusUaYx_R_aRl0lUnAHVfBav9Zw43MXDYcB3b6BcA,6753
|
95
|
+
natural_pdf-0.1.17.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
96
|
+
natural_pdf-0.1.17.dist-info/top_level.txt,sha256=Cyw1zmNDlUZfb5moU-WUWGprrwH7ln_8LDGdmMHF1xI,17
|
97
|
+
natural_pdf-0.1.17.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|