natural-pdf 0.1.16__py3-none-any.whl → 0.1.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
natural_pdf/core/page.py CHANGED
@@ -61,6 +61,7 @@ from natural_pdf.classification.manager import ClassificationManager # For type
61
61
  # # --- Classification Imports --- #
62
62
  from natural_pdf.classification.mixin import ClassificationMixin # Import classification mixin
63
63
  from natural_pdf.core.element_manager import ElementManager
64
+ from natural_pdf.describe.mixin import DescribeMixin # Import describe mixin
64
65
  from natural_pdf.elements.base import Element # Import base element
65
66
  from natural_pdf.elements.text import TextElement
66
67
  from natural_pdf.extraction.mixin import ExtractionMixin # Import extraction mixin
@@ -92,7 +93,7 @@ except ImportError:
92
93
  logger = logging.getLogger(__name__)
93
94
 
94
95
 
95
- class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
96
+ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMixin):
96
97
  """
97
98
  Enhanced Page wrapper built on top of pdfplumber.Page.
98
99
 
@@ -0,0 +1,21 @@
1
+ """
2
+ Describe functionality for natural-pdf.
3
+
4
+ Provides summary and inspection methods for pages, collections, and regions.
5
+ """
6
+
7
+ from .base import describe_page, describe_collection, inspect_collection, describe_region, describe_element
8
+ from .summary import ElementSummary, InspectionSummary
9
+ from .mixin import DescribeMixin, InspectMixin
10
+
11
+ __all__ = [
12
+ 'describe_page',
13
+ 'describe_collection',
14
+ 'inspect_collection',
15
+ 'describe_region',
16
+ 'describe_element',
17
+ 'ElementSummary',
18
+ 'InspectionSummary',
19
+ 'DescribeMixin',
20
+ 'InspectMixin'
21
+ ]
@@ -0,0 +1,457 @@
1
+ """
2
+ Main describe functions for pages, collections, and regions.
3
+ """
4
+
5
+ import logging
6
+ from collections import Counter
7
+ from typing import TYPE_CHECKING, Any, Dict, List, Optional
8
+
9
+ from .elements import (
10
+ describe_line_elements,
11
+ describe_rect_elements,
12
+ describe_region_elements,
13
+ describe_text_elements,
14
+ )
15
+ from .summary import ElementSummary, InspectionSummary
16
+
17
+ if TYPE_CHECKING:
18
+ from natural_pdf.core.page import Page
19
+ from natural_pdf.elements.base import Element
20
+ from natural_pdf.elements.collections import ElementCollection
21
+ from natural_pdf.elements.region import Region
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
+
26
def describe_page(page: "Page") -> ElementSummary:
    """
    Describe what's on a page with a high-level summary.

    Character elements (``type == 'char'``) are excluded from the counts
    because they are too granular to be useful in an overview.

    Args:
        page: Page to describe. This function reads ``page.get_elements()``
            and ``page.number``.

    Returns:
        ElementSummary titled ``"Page <number> Summary"``. When the page has
        no elements it carries only a ``"message"`` entry. Otherwise it holds
        an ``"elements"`` section (count per element type; words are reported
        under the key ``"text"``, with a per-source breakdown when more than
        one source is present) and, when the text analysis yields results, a
        ``"text_analysis"`` section.
    """
    all_elements = page.get_elements()
    title = f"Page {page.number} Summary"

    if not all_elements:
        return ElementSummary({"message": "No elements found on page"}, title)

    # Element counts by type (exclude chars - too granular)
    type_counts = Counter()
    for element in all_elements:
        element_type = getattr(element, 'type', 'unknown')
        if element_type != 'char':
            type_counts[element_type] += 1

    # Word elements are needed both for the source breakdown and for the
    # text analysis below, so filter them once.
    text_elements = [e for e in all_elements if getattr(e, 'type', '') == 'word']

    # One entry per element type, most frequent first.
    element_summary = {}
    for element_type, count in type_counts.most_common():
        if element_type != 'word':
            element_summary[element_type] = f"{count} elements"
            continue

        # Words are shown as "text"; add a source breakdown only when it is
        # informative (more than one distinct source).
        sources = Counter(getattr(elem, 'source', 'unknown') for elem in text_elements)
        if len(sources) > 1:
            source_parts = [
                f"{source_count} {source}" for source, source_count in sources.most_common()
            ]
            element_summary["text"] = f"{count} elements ({', '.join(source_parts)})"
        else:
            element_summary["text"] = f"{count} elements"

    data = {"elements": element_summary}

    # Text analysis if we have text elements (chars are excluded by the filter)
    if text_elements:
        text_analysis = describe_text_elements(text_elements)
        if text_analysis and 'message' not in text_analysis:
            data["text_analysis"] = text_analysis

    return ElementSummary(data, title)
84
+
85
+
86
def describe_collection(collection: "ElementCollection") -> ElementSummary:
    """
    Summarise an element collection, analysing each element type separately.

    Character elements are left out of the overview, the per-type analysis
    and the element count shown in the title.

    Args:
        collection: ElementCollection to describe (any iterable of elements)

    Returns:
        ElementSummary. For an empty collection it only carries a "message".
        For a collection with a single element type the analysis is placed
        at the top level; for mixed collections an "overview" section is
        followed by one section per element type.
    """
    members = list(collection)
    if not members:
        return ElementSummary({"message": "Empty collection"}, "Collection Summary")

    # Bucket the members by their ``type`` attribute, keeping first-seen order.
    grouped = {}
    for member in members:
        grouped.setdefault(getattr(member, 'type', 'unknown'), []).append(member)

    is_mixed = len(grouped) > 1
    data = {}

    # Mixed collections get an overview ranked by how common each type is.
    if is_mixed:
        counts = {kind: len(group) for kind, group in grouped.items() if kind != 'char'}
        ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
        breakdown = [
            f"**{kind.replace('_', ' ').title()}**: {count}" for kind, count in ranked
        ]
        if breakdown:  # nothing to report when only chars were present
            data["overview"] = {
                "total_elements": sum(counts.values()),
                "type_breakdown": breakdown,
            }

    # Dedicated analysers; any other type just reports its element count.
    analyzers = {
        'word': describe_text_elements,
        'rect': describe_rect_elements,
        'line': describe_line_elements,
        'region': describe_region_elements,
    }

    for kind, group in grouped.items():
        if kind == 'char':
            continue

        analyzer = analyzers.get(kind)
        analysis = analyzer(group) if analyzer is not None else {"count": len(group)}
        if not analysis or 'message' in analysis:
            continue

        section_name = kind.replace('_', ' ').title()
        if is_mixed:
            # Mixed collection - keep sections separate
            data[section_name] = analysis
        else:
            # Single type collection - flatten the structure
            data.update(analysis)

    visible_count = sum(1 for m in members if getattr(m, 'type', 'unknown') != 'char')
    return ElementSummary(data, f"Collection Summary ({visible_count} elements)")
155
+
156
+
157
def describe_region(region: "Region") -> ElementSummary:
    """
    Summarise a region: its placement on the page plus what it contains.

    Args:
        region: Region to describe

    Returns:
        ElementSummary with a "region_info" section (page number, dimensions,
        area, position and, when non-empty, metadata) and a "content" section
        produced by describing every element found inside the region.
    """
    page_number = region.page.number
    width, height = region.width, region.height

    region_info = {
        "page": page_number,
        "dimensions": f"{width:.0f}×{height:.0f} pts",
        "area": f"{width * height:.0f} sq pts",
        "position": f"({region.x0:.0f}, {region.top:.0f}) to ({region.x1:.0f}, {region.bottom:.0f})",
    }

    # Metadata is optional; only show it when present and non-empty.
    metadata = getattr(region, 'metadata', None)
    if metadata:
        region_info["metadata"] = metadata

    # Everything inside the region is summarised like any other collection.
    contained = region.find_all("*")
    if contained:
        content = describe_collection(contained).to_dict()
    else:
        content = {"message": "No elements found in region"}

    return ElementSummary({"region_info": region_info, "content": content}, "Region Summary")
193
+
194
+
195
def inspect_collection(collection: "ElementCollection", limit: int = 30) -> InspectionSummary:
    """
    Build a per-type tabular inspection of the elements in a collection.

    Character elements are skipped. A "page" column is added only when the
    elements come from more than one page.

    Args:
        collection: ElementCollection to inspect
        limit: Maximum elements per type to show (default: 30)

    Returns:
        InspectionSummary with one "<type>_elements" section per element
        type, each holding the row data, the column names and, when rows
        were cut off by ``limit``, a note saying so.
    """
    members = list(collection)
    if not members:
        return InspectionSummary({"message": "Empty collection"}, "Collection Inspection")

    # Distinct page numbers decide whether a page column is worth showing.
    page_numbers = {
        member.page.number
        for member in members
        if hasattr(member, 'page') and hasattr(member.page, 'number')
    }
    show_page_column = len(page_numbers) > 1

    # Group by type, preserving first-seen order.
    grouped = {}
    for member in members:
        grouped.setdefault(getattr(member, 'type', 'unknown'), []).append(member)

    data = {}
    for kind, group in grouped.items():
        if kind == 'char':
            # Too granular for useful inspection
            continue

        columns = _get_columns_for_type(kind, show_page_column)
        rows = [
            {col: _extract_element_value(member, col) for col in columns}
            for member in group[:limit]
        ]

        section = {"elements": rows, "columns": columns}
        if len(group) > limit:
            section["note"] = f"Showing {limit} of {len(group)} elements (pass limit= to see more)"

        data[f"{kind}_elements"] = section

    visible_count = sum(1 for m in members if getattr(m, 'type', 'unknown') != 'char')
    return InspectionSummary(data, f"Collection Inspection ({visible_count} elements)")
265
+
266
+
267
+ def _get_columns_for_type(element_type: str, show_page_column: bool) -> List[str]:
268
+ """Get appropriate columns for element type."""
269
+ base_columns = ['x0', 'top', 'x1', 'bottom']
270
+
271
+ if element_type == 'word':
272
+ columns = ['text'] + base_columns + ['font_family', 'size', 'bold', 'italic', 'source', 'confidence']
273
+ # Add color for text elements
274
+ columns.append('color')
275
+ elif element_type == 'rect':
276
+ columns = base_columns + ['width', 'height', 'stroke', 'fill', 'stroke_width']
277
+ elif element_type == 'line':
278
+ columns = base_columns + ['width', 'is_horizontal', 'is_vertical'] # LineElement properties
279
+ elif element_type == 'region':
280
+ columns = base_columns + ['width', 'height', 'type']
281
+ else:
282
+ columns = base_columns + ['type']
283
+
284
+ if show_page_column:
285
+ columns.append('page')
286
+
287
+ return columns
288
+
289
+
290
+ def _extract_element_value(element: "Element", column: str) -> Any:
291
+ """Extract value for a column from an element."""
292
+ try:
293
+ if column == 'text':
294
+ text = getattr(element, 'text', '')
295
+ if text and len(text) > 50:
296
+ return text[:50] + "..."
297
+ return text or ""
298
+
299
+ elif column == 'page':
300
+ if hasattr(element, 'page') and hasattr(element.page, 'number'):
301
+ return element.page.number
302
+ return ""
303
+
304
+ elif column == 'confidence':
305
+ confidence = getattr(element, 'confidence', None)
306
+ if confidence is not None and isinstance(confidence, (int, float)):
307
+ return f"{confidence:.2f}"
308
+ return ""
309
+
310
+ elif column == 'font_family':
311
+ # Use the cleaner font_family property from TextElement
312
+ font_family = getattr(element, 'font_family', None)
313
+ if font_family:
314
+ return font_family
315
+ # Fallback to fontname
316
+ return getattr(element, 'fontname', '')
317
+
318
+ elif column in ['bold', 'italic']:
319
+ value = getattr(element, column, False)
320
+ return value if isinstance(value, bool) else False
321
+
322
+ elif column in ['stroke', 'fill', 'color']:
323
+ # For rectangles and text, these return color tuples
324
+ value = getattr(element, column, None)
325
+ if value and isinstance(value, (tuple, list)) and len(value) >= 3:
326
+ # Convert to hex color for display
327
+ try:
328
+ r, g, b = [int(v * 255) if v <= 1 else int(v) for v in value[:3]]
329
+ return f"#{r:02x}{g:02x}{b:02x}"
330
+ except:
331
+ return str(value)
332
+ return ""
333
+
334
+ elif column in ['x0', 'top', 'x1', 'bottom', 'width', 'height', 'size', 'stroke_width']:
335
+ value = getattr(element, column, 0)
336
+ if isinstance(value, (int, float)) and not isinstance(value, bool):
337
+ return int(round(value))
338
+ return 0
339
+
340
+ elif column in ['is_horizontal', 'is_vertical']:
341
+ value = getattr(element, column, False)
342
+ return value if isinstance(value, bool) else False
343
+
344
+ else:
345
+ # Generic attribute access
346
+ value = getattr(element, column, '')
347
+ if value is None:
348
+ return ""
349
+ return str(value)
350
+
351
+ except Exception as e:
352
+ # Fallback for any unexpected errors
353
+ logger.warning(f"Error extracting {column} from element: {e}")
354
+ return ""
355
+
356
+
357
def describe_element(element: "Element") -> "ElementSummary":
    """
    Describe an individual element with its properties and attributes.

    Sections are added only when the element exposes the corresponding
    attributes: "geometry" (requires ``bbox``), "content" (non-empty
    ``text``), "properties" (font/size/style/source/confidence), "colors"
    (``color``/``fill``/``stroke``), "page" and "shape" (polygon points).

    Args:
        element: The element to describe

    Returns:
        ElementSummary with formatted element properties, titled
        ``"<Type> Element"`` plus a 30-character text preview when the
        element has text.
    """
    # Falls back to the class name when the element has no ``type`` attribute.
    element_type = getattr(element, 'type', element.__class__.__name__)

    data = {
        "info": {
            "object_type": "element",
            "element_type": element_type,
            "class_name": element.__class__.__name__,
        }
    }

    # Geometric properties
    if hasattr(element, 'bbox'):
        data["geometry"] = {
            "position": f"({round(element.x0, 1)}, {round(element.top, 1)}, {round(element.x1, 1)}, {round(element.bottom, 1)})",
            "size": f"({round(element.width, 1)}, {round(element.height, 1)})",
        }

    # Text content, truncated for display
    raw_text = getattr(element, 'text', None)
    if raw_text:
        text = str(raw_text).strip()
        display_text = text[:50] + "..." if len(text) > 50 else text
        data["content"] = {
            "text": f"'{display_text}'",
            "length": f"{len(text)} chars",
        }

    # Common text properties; numeric ones are rounded for readability.
    text_props = {}
    for prop in ['font_family', 'size', 'bold', 'italic', 'source', 'confidence']:
        value = getattr(element, prop, None)
        if value is None:
            continue
        if prop == 'confidence' and isinstance(value, (int, float)):
            text_props[prop] = round(value, 3)
        elif prop == 'size' and isinstance(value, (int, float)):
            text_props[prop] = round(value, 1)
        else:
            text_props[prop] = value

    if text_props:
        data["properties"] = text_props

    # Colour information: numeric RGB-like sequences become hex strings,
    # anything else is shown via str().
    color_info = {}
    for prop in ['color', 'fill', 'stroke']:
        value = getattr(element, prop, None)
        if value is None:
            continue
        is_numeric_rgb = (
            isinstance(value, (tuple, list))
            and len(value) >= 3
            and all(isinstance(v, (int, float)) for v in value[:3])
        )
        if is_numeric_rgb:
            try:
                # Components <= 1 are treated as 0-1 fractions.
                r, g, b = [int(v * 255) if v <= 1 else int(v) for v in value[:3]]
                color_info[prop] = f"#{r:02x}{g:02x}{b:02x}"
            except (TypeError, ValueError, OverflowError):
                # e.g. NaN or infinite components cannot be converted to int
                color_info[prop] = str(value)
        else:
            color_info[prop] = str(value)

    if color_info:
        data["colors"] = color_info

    # Page information
    if hasattr(element, 'page') and element.page:
        page_num = getattr(element.page, 'number', None)
        if page_num is not None:
            data["page"] = {"number": page_num}

    # Polygon information if available
    if hasattr(element, 'has_polygon') and element.has_polygon:
        if hasattr(element, 'polygon'):
            polygon = element.polygon
            if polygon and len(polygon) > 0:
                data["shape"] = {"polygon_points": len(polygon)}

    # str() guards against a non-string ``type`` attribute.
    title = f"{str(element_type).title()} Element"
    if raw_text:
        preview = str(raw_text).strip()[:30]
        if preview:
            title += f": '{preview}'"

    return ElementSummary(data, title)
@@ -0,0 +1,411 @@
1
+ """
2
+ Element-specific describe functions.
3
+ """
4
+
5
+ import logging
6
+ from collections import Counter
7
+ from typing import TYPE_CHECKING, Any, Dict, List
8
+
9
+ if TYPE_CHECKING:
10
+ from natural_pdf.elements.base import Element
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
def describe_text_elements(elements: List["Element"]) -> Dict[str, Any]:
    """
    Analyse text elements: where they came from, their typography and,
    for OCR-derived elements, their recognition quality.

    Args:
        elements: List of text elements

    Returns:
        Dictionary with optional "sources" (only when more than one source
        is present), "typography" and "ocr_quality" sections, or a single
        "message" entry when ``elements`` is empty.
    """
    if not elements:
        return {"message": "No text elements found"}

    # Tally sources and set aside the OCR elements for the quality analysis.
    source_counts = Counter(getattr(el, 'source', 'unknown') for el in elements)
    from_ocr = [el for el in elements if getattr(el, 'source', 'unknown') == 'ocr']

    report = {}

    if len(source_counts) > 1:
        report['sources'] = dict(source_counts)

    typography = _analyze_typography(elements)
    if typography:
        report['typography'] = typography

    ocr_quality = _analyze_ocr_quality(from_ocr) if from_ocr else None
    if ocr_quality:
        report['ocr_quality'] = ocr_quality

    return report
55
+
56
+
57
def describe_rect_elements(elements: List["Element"]) -> Dict[str, Any]:
    """
    Describe rectangle elements with size and style analysis.

    Args:
        elements: List of rectangle elements. ``width``, ``height``,
            ``stroke``, ``fill`` and ``stroke_width`` are read with
            ``getattr``, so elements missing any of them are tolerated.

    Returns:
        Dictionary with an optional "size_stats" section (width/height
        ranges and average area) and an optional "styles" section (counts of
        non-black strokes and fills, stroke width frequencies and the five
        most common colours), or a single "message" entry when ``elements``
        is empty.
    """
    if not elements:
        return {"message": "No rectangle elements found"}

    def _as_tuple(color: Any) -> Any:
        # A list never compares equal to a tuple, so normalise lists before
        # checking against the black/white constants below.
        return tuple(color) if isinstance(color, list) else color

    black = (0, 0, 0)  # also equals (0.0, 0.0, 0.0)
    white = (1, 1, 1)  # also equals (1.0, 1.0, 1.0)

    sizes = []
    stroke_count = 0
    fill_count = 0
    colors = Counter()
    stroke_widths = []

    for element in elements:
        # Size - only rectangles with a non-zero width and height count
        width = getattr(element, 'width', 0)
        height = getattr(element, 'height', 0)
        if width and height:
            sizes.append((width, height))

        # Style - a stroke/fill is counted only if set and not black
        stroke = _as_tuple(getattr(element, 'stroke', None))
        if stroke and stroke != black:
            stroke_count += 1
        fill = _as_tuple(getattr(element, 'fill', None))
        if fill and fill != black:
            fill_count += 1

        # Stroke width - the attribute may exist but be None
        stroke_width = getattr(element, 'stroke_width', 0)
        if stroke_width is not None and stroke_width > 0:
            stroke_widths.append(stroke_width)

        # Colour - the stroke colour wins over the fill colour
        color = stroke or fill
        if color:
            if color == black:
                colors['black'] += 1
            elif color == white:
                colors['white'] += 1
            else:
                colors[str(color)] += 1

    result = {}

    # Size statistics
    if sizes:
        widths = [w for w, _ in sizes]
        heights = [h for _, h in sizes]
        result['size_stats'] = {
            'width_range': f"{min(widths):.0f}-{max(widths):.0f}",
            'height_range': f"{min(heights):.0f}-{max(heights):.0f}",
            'avg_area': f"{sum(w * h for w, h in sizes) / len(sizes):.0f} sq pts",
        }

    # Style breakdown
    style_info = {}
    if stroke_count:
        style_info['stroke'] = stroke_count
    if fill_count:
        style_info['fill'] = fill_count
    if stroke_widths:
        # Convert numeric keys to strings to avoid formatting issues
        style_info['stroke_widths'] = {
            str(k): v for k, v in Counter(stroke_widths).most_common()
        }
    if colors:
        style_info['colors'] = dict(colors.most_common(5))

    if style_info:
        result['styles'] = style_info

    return result
140
+
141
+
142
def describe_line_elements(elements: List["Element"]) -> Dict[str, Any]:
    """
    Describe line elements with length and style analysis.

    Args:
        elements: List of line elements. Coordinates, ``width``, ``color``,
            ``is_horizontal`` and ``is_vertical`` are read with ``getattr``,
            so elements missing any of them are tolerated.

    Returns:
        Dictionary with optional "length_stats", "line_widths",
        "orientations" and "colors" sections, or a single "message" entry
        when ``elements`` is empty.
    """
    if not elements:
        return {"message": "No line elements found"}

    black = (0, 0, 0)  # also equals (0.0, 0.0, 0.0)

    lengths = []
    widths = []
    colors = Counter()
    horizontal_count = 0
    vertical_count = 0
    diagonal_count = 0

    for element in elements:
        # Length of the bounding-box diagonal; zero-length lines are skipped
        x0 = getattr(element, 'x0', 0)
        y0 = getattr(element, 'top', 0)
        x1 = getattr(element, 'x1', 0)
        y1 = getattr(element, 'bottom', 0)
        length = ((x1 - x0) ** 2 + (y1 - y0) ** 2) ** 0.5
        if length > 0:
            lengths.append(length)

        # Line width
        width = getattr(element, 'width', 0)
        if width:
            widths.append(width)

        # Colour - lists are normalised to tuples because a list never
        # compares equal to the tuple constant.
        color = getattr(element, 'color', None)
        if isinstance(color, list):
            color = tuple(color)
        if color:
            if color == black:
                colors['black'] += 1
            else:
                colors[str(color)] += 1

        # Orientation - an element is diagonal only when it is neither
        # horizontal nor vertical. Counting this per element (rather than by
        # subtraction) keeps the figure non-negative when an element reports
        # both flags.
        is_horizontal = bool(getattr(element, 'is_horizontal', False))
        is_vertical = bool(getattr(element, 'is_vertical', False))
        horizontal_count += is_horizontal
        vertical_count += is_vertical
        if not (is_horizontal or is_vertical):
            diagonal_count += 1

    result = {}

    # Length statistics
    if lengths:
        result['length_stats'] = {
            'min': f"{min(lengths):.0f}",
            'max': f"{max(lengths):.0f}",
            'avg': f"{sum(lengths) / len(lengths):.0f}",
        }

    # Width statistics
    if widths:
        # Convert numeric keys to strings to avoid formatting issues
        result['line_widths'] = {str(k): v for k, v in Counter(widths).most_common()}

    # Orientation analysis - only non-zero counts are reported
    orientation_info = {}
    if horizontal_count:
        orientation_info['horizontal'] = horizontal_count
    if vertical_count:
        orientation_info['vertical'] = vertical_count
    if diagonal_count:
        orientation_info['diagonal'] = diagonal_count
    if orientation_info:
        result['orientations'] = orientation_info

    # Colors
    if colors:
        result['colors'] = dict(colors.most_common())

    return result
222
+
223
+
224
def describe_region_elements(elements: List["Element"]) -> Dict[str, Any]:
    """
    Summarise region elements by type, area and the metadata keys they carry.

    Args:
        elements: List of region elements

    Returns:
        Dictionary with "types" (count per ``type`` value), an optional
        "size_stats" section (min/max/average area) and an optional sorted
        "metadata_keys" list, or a single "message" entry when ``elements``
        is empty.
    """
    if not elements:
        return {"message": "No region elements found"}

    type_counts = Counter(getattr(el, 'type', 'unknown') for el in elements)

    # Areas of regions that have both a non-zero width and height.
    areas = []
    for el in elements:
        w = getattr(el, 'width', 0)
        h = getattr(el, 'height', 0)
        if w and h:
            areas.append(w * h)

    # Union of metadata keys across all regions that have metadata.
    seen_keys = set()
    for el in elements:
        metadata = getattr(el, 'metadata', None)
        if metadata:
            seen_keys.update(metadata.keys())

    summary = {}

    if type_counts:
        summary['types'] = dict(type_counts.most_common())

    if areas:
        summary['size_stats'] = {
            'min_area': f"{min(areas):.0f} sq pts",
            'max_area': f"{max(areas):.0f} sq pts",
            'avg_area': f"{sum(areas) / len(areas):.0f} sq pts",
        }

    if seen_keys:
        summary['metadata_keys'] = sorted(seen_keys)

    return summary
276
+
277
+
278
+ def _analyze_typography(elements: List["Element"]) -> Dict[str, Any]:
279
+ """Analyze typography patterns in text elements."""
280
+ fonts = Counter()
281
+ sizes = Counter()
282
+ styles = {'bold': 0, 'italic': 0}
283
+ colors = Counter()
284
+
285
+ for element in elements:
286
+ # Font family - use TextElement's font_family property for cleaner names
287
+ font_family = getattr(element, 'font_family', None)
288
+ fontname = getattr(element, 'fontname', 'Unknown')
289
+ display_font = font_family if font_family and font_family != fontname else fontname
290
+ if display_font:
291
+ fonts[display_font] += 1
292
+
293
+ # Size
294
+ size = getattr(element, 'size', None)
295
+ if size:
296
+ # Round to nearest 0.5
297
+ rounded_size = round(size * 2) / 2
298
+ sizes[f"{rounded_size}pt"] += 1
299
+
300
+ # Styles
301
+ if getattr(element, 'bold', False):
302
+ styles['bold'] += 1
303
+ if getattr(element, 'italic', False):
304
+ styles['italic'] += 1
305
+
306
+ # Color - use TextElement's color property
307
+ color = getattr(element, 'color', None)
308
+ if color:
309
+ if isinstance(color, (tuple, list)):
310
+ if color == (0, 0, 0) or color == (0.0, 0.0, 0.0):
311
+ colors['black'] += 1
312
+ elif color == (1, 1, 1) or color == (1.0, 1.0, 1.0):
313
+ colors['white'] += 1
314
+ else:
315
+ colors['other'] += 1
316
+ else:
317
+ colors[str(color)] += 1
318
+
319
+ result = {}
320
+
321
+ # Fonts
322
+ if fonts:
323
+ result['fonts'] = dict(fonts.most_common(10))
324
+
325
+ # Sizes (as horizontal table)
326
+ if sizes:
327
+ result['sizes'] = dict(sizes.most_common())
328
+
329
+ # Styles
330
+ style_list = []
331
+ if styles['bold']:
332
+ style_list.append(f"{styles['bold']} bold")
333
+ if styles['italic']:
334
+ style_list.append(f"{styles['italic']} italic")
335
+ if style_list:
336
+ result['styles'] = ", ".join(style_list)
337
+
338
+ # Colors
339
+ if colors and len(colors) > 1: # Only show if there are multiple colors
340
+ result['colors'] = dict(colors.most_common())
341
+
342
+ return result
343
+
344
+
345
+ def _analyze_ocr_quality(elements: List["Element"]) -> Dict[str, Any]:
346
+ """Analyze OCR quality metrics."""
347
+ confidences = []
348
+
349
+ for element in elements:
350
+ confidence = getattr(element, 'confidence', None)
351
+ if confidence is not None:
352
+ confidences.append(confidence)
353
+
354
+ if not confidences:
355
+ return {}
356
+
357
+ result = {}
358
+
359
+ # Basic stats
360
+ result['confidence_stats'] = {
361
+ 'mean': f"{sum(confidences)/len(confidences):.2f}",
362
+ 'min': f"{min(confidences):.2f}",
363
+ 'max': f"{max(confidences):.2f}"
364
+ }
365
+
366
+ # Threshold analysis with ASCII bars
367
+ thresholds = [
368
+ ('99%+', 0.99),
369
+ ('95%+', 0.95),
370
+ ('90%+', 0.90),
371
+ ]
372
+
373
+ element_count = len(elements)
374
+ threshold_bars = {}
375
+
376
+ for label, threshold in thresholds:
377
+ count = sum(1 for c in confidences if c >= threshold)
378
+ percentage = count / element_count
379
+
380
+ # Create ASCII bar (40 characters wide)
381
+ filled_chars = int(percentage * 40)
382
+ empty_chars = 40 - filled_chars
383
+ bar = '█' * filled_chars + '░' * empty_chars
384
+
385
+ # Format: "95%+ (32/43) 74%: `████████████████████████████████░░░░░░░░`"
386
+ threshold_bars[f"{label} ({count}/{element_count}) {percentage:.0%}"] = f"`{bar}`"
387
+
388
+ result['quality_distribution'] = threshold_bars
389
+
390
+ # Show lowest quality items
391
+ element_confidences = []
392
+ for element in elements:
393
+ confidence = getattr(element, 'confidence', None)
394
+ if confidence is not None:
395
+ # Get text content for display
396
+ text = getattr(element, 'text', '').strip()
397
+ if text:
398
+ # Truncate long text
399
+ display_text = text[:50] + "..." if len(text) > 50 else text
400
+ element_confidences.append((confidence, display_text))
401
+
402
+ if element_confidences:
403
+ # Sort by confidence (lowest first) and take bottom 10
404
+ lowest_quality = sorted(element_confidences, key=lambda x: x[0])[:10]
405
+ if lowest_quality:
406
+ lowest_items = {}
407
+ for i, (confidence, text) in enumerate(lowest_quality, 1):
408
+ lowest_items[f"#{i}"] = f"**{confidence:.2f}**: {text}"
409
+ result['lowest_scoring'] = lowest_items
410
+
411
+ return result
@@ -0,0 +1,84 @@
1
+ """
2
+ Mixin for describe functionality.
3
+ """
4
+
5
+ from typing import TYPE_CHECKING
6
+
7
+ if TYPE_CHECKING:
8
+ from natural_pdf.describe.summary import ElementSummary, InspectionSummary
9
+
10
+
11
class DescribeMixin:
    """
    Mixin that adds a ``.describe()`` method to pages, collections, regions
    and individual elements.

    The method picks the matching describe function from
    ``natural_pdf.describe`` based on what kind of object ``self`` is.
    """

    def describe(self) -> "ElementSummary":
        """
        Produce a summary suited to this object's kind.

        Dispatch order: exact class name ('Page', 'ElementCollection',
        'Region'), then ``Element`` instances, then duck-typed fallbacks
        (page-like, region-like, collection-like). Anything else gets a
        placeholder summary.

        Returns:
            ElementSummary with analysis appropriate for the object type
        """
        # Imported lazily, inside the method, rather than at module import time.
        from natural_pdf.describe import (
            describe_collection,
            describe_element,
            describe_page,
            describe_region,
        )

        class_name = self.__class__.__name__

        # 1. Well-known classes, matched by name.
        known = {
            'Page': describe_page,
            'ElementCollection': describe_collection,
            'Region': describe_region,
        }
        handler = known.get(class_name)
        if handler is not None:
            return handler(self)

        # 2. Individual elements (anything inheriting from the Element base).
        from natural_pdf.elements.base import Element
        if isinstance(self, Element):
            return describe_element(self)

        # 3. Duck typing: something with elements and dimensions is a page
        #    when it also has a number, otherwise a region.
        if hasattr(self, 'get_elements') and hasattr(self, 'width') and hasattr(self, 'height'):
            return describe_page(self) if hasattr(self, 'number') else describe_region(self)

        # 4. Duck typing: sized iterables are treated as collections.
        if hasattr(self, '__iter__') and hasattr(self, '__len__'):
            return describe_collection(self)

        # 5. Unknown type - create a basic summary.
        from natural_pdf.describe.summary import ElementSummary
        placeholder = {
            "object_type": class_name,
            "message": f"Describe not fully implemented for {class_name}",
        }
        return ElementSummary(placeholder, f"{class_name} Summary")
62
+
63
+
64
class InspectMixin:
    """
    Mixin that adds an ``.inspect()`` method to collections.

    ``.inspect()`` returns detailed, tabular views of the contained elements.
    """

    def inspect(self, limit: int = 30) -> "InspectionSummary":
        """
        Build a detailed tabular view of this object's elements.

        Args:
            limit: Maximum elements per type to show (default: 30)

        Returns:
            InspectionSummary produced by
            ``natural_pdf.describe.inspect_collection`` for this object
        """
        # Imported lazily, inside the method, rather than at module import time.
        from natural_pdf.describe import inspect_collection

        summary = inspect_collection(self, limit=limit)
        return summary
@@ -0,0 +1,186 @@
1
+ """
2
+ Summary objects for describe functionality.
3
+ """
4
+
5
+ from typing import Any, Dict, List, Union
6
+
7
+
8
class ElementSummary:
    """
    Container for element summary data with markdown rendering.

    Automatically renders as markdown in Jupyter notebooks (through
    ``_repr_markdown_``) and provides access to the underlying data as a
    dictionary (through ``to_dict``).
    """

    def __init__(self, data: Dict[str, Any], title: str = "Summary"):
        """
        Initialize summary with data and optional title.

        Args:
            data: Dictionary containing summary sections. Each key becomes a
                section heading and each value the section body.
            title: Title for the summary display.
        """
        self.data = data
        self.title = title

    def __str__(self) -> str:
        """String representation as markdown."""
        return self._to_markdown()

    def __repr__(self) -> str:
        """Repr as markdown for better display."""
        return self._to_markdown()

    def _repr_markdown_(self) -> str:
        """Jupyter notebook markdown rendering."""
        return self._to_markdown()

    def to_dict(self) -> Dict[str, Any]:
        """
        Return the underlying data as a dictionary.

        The copy is shallow: adding or removing top-level keys on the result
        does not affect this summary, but nested containers are shared.
        """
        return self.data.copy()

    def _to_markdown(self) -> str:
        """Convert data to markdown: a ``##`` title followed by one block per section."""
        lines = [f"## {self.title}", ""]

        for section_name, section_data in self.data.items():
            lines.extend(self._format_section(section_name, section_data))
            lines.append("")  # Empty line between sections

        # Drop the trailing blank line(s) left by the last section.
        return "\n".join(lines).rstrip()

    def _format_section(self, name: str, data: Any) -> List[str]:
        """
        Format a single section as markdown.

        Args:
            name: Section key; underscores become spaces and the result is
                title-cased.
            data: Section value. Dicts become a bullet list, lists a
                comma-separated line, anything else is interpolated as-is.

        Returns:
            The markdown lines for the section (without a trailing blank line).
        """
        # Use bold text instead of headers for more compact display.
        # str() keeps non-string keys from raising AttributeError.
        section_title = str(name).replace("_", " ").title()

        if isinstance(data, dict):
            lines = [f"**{section_title}**:"]
            lines.extend(self._format_dict(data, indent=" "))
        elif isinstance(data, list):
            lines = [f"**{section_title}**: {', '.join(str(item) for item in data)}"]
        else:
            lines = [f"**{section_title}**: {data}"]

        return lines

    def _format_dict(self, data: Dict[str, Any], indent: str = "") -> List[str]:
        """
        Format a dictionary as a markdown bullet list.

        Top-level keys are rendered in bold. Dict values are expanded into
        nested bullet lists of arbitrary depth; list values are joined with
        commas when they hold at most five items and summarized as
        ``"N items"`` otherwise.

        Args:
            data: Mapping to render.
            indent: Prefix placed before every top-level bullet.

        Returns:
            The markdown lines for the mapping.
        """
        lines = []

        for key, value in data.items():
            key_display = str(key).replace("_", " ")

            if isinstance(value, dict):
                lines.append(f"{indent}- **{key_display}**:")
                # Children are indented two further columns so they line up
                # with the parent's content and render as a nested list
                # rather than as siblings.
                lines.extend(self._format_nested_dict(value, indent + "  "))
            elif isinstance(value, list):
                if len(value) <= 5:
                    value_str = ", ".join(str(v) for v in value)
                    lines.append(f"{indent}- **{key_display}**: {value_str}")
                else:
                    lines.append(f"{indent}- **{key_display}**: {len(value)} items")
            else:
                lines.append(f"{indent}- **{key_display}**: {value}")

        return lines

    def _format_nested_dict(self, data: Dict[str, Any], indent: str) -> List[str]:
        """Render a nested mapping: dict-valued keys in bold and expanded recursively, other values as plain ``key: value``."""
        lines = []

        for key, value in data.items():
            key_display = str(key).replace("_", " ")

            if isinstance(value, dict):
                lines.append(f"{indent}- **{key_display}**:")
                lines.extend(self._format_nested_dict(value, indent + "  "))
            else:
                lines.append(f"{indent}- {key_display}: {value}")

        return lines

    def _format_list(self, data: List[Any]) -> List[str]:
        """Format a list as markdown bullets, one ``- item`` line per entry."""
        # Dict items (possible table rows) are rendered with their plain
        # string form, exactly like every other item.
        return [f"- {item}" for item in data]

    def _format_horizontal_table(self, title: str, data: Dict[str, Any]) -> List[str]:
        """
        Format a dict as a one-row horizontal markdown table.

        Args:
            title: Bold label placed above the table.
            data: Mapping whose keys become the header cells and whose
                values become the single data row.

        Returns:
            Markdown lines: the label, a blank line, the three table rows and
            a closing blank line. For an empty mapping only the label and a
            blank line are returned, since a table without columns is not
            valid markdown.
        """
        if not data:
            return [f"- **{title}**:", ""]

        # Keys are stringified so non-string keys do not break str.join.
        headers = [str(header) for header in data.keys()]
        values = list(data.values())

        # Create table
        header_row = "| " + " | ".join(headers) + " |"
        separator = "|" + "|".join("------" for _ in headers) + "|"
        value_row = "| " + " | ".join(str(v) for v in values) + " |"

        return [
            f"- **{title}**:",
            "",
            header_row,
            separator,
            value_row,
            "",
        ]
130
+
131
+
132
class InspectionSummary(ElementSummary):
    """
    Summary for element inspection with tabular data.

    Sections whose value is a dict containing an ``'elements'`` key are
    rendered as markdown tables; every other section uses the regular
    ``ElementSummary`` section formatting.
    """

    def _format_section(self, name: str, data: Any) -> List[str]:
        """
        Format inspection section with element tables.

        Args:
            name: Section key; underscores become spaces and the result is
                title-cased.
            data: Section value. A dict with an ``'elements'`` key is treated
                as a table section and may also carry ``'columns'`` (column
                names, default empty) and ``'note'`` (shown in italics under
                a non-empty table).

        Returns:
            The markdown lines for the section.
        """
        if isinstance(data, dict) and "elements" in data:
            # This is an element table section - use ### header for inspect
            section_title = str(name).replace("_", " ").title()
            elements = data["elements"]
            lines = [f"### {section_title}"]
            if elements:
                lines.extend(self._format_element_table(elements, data.get("columns", [])))
                # Add note if truncated
                if "note" in data:
                    lines.append(f"_{data['note']}_")
            else:
                lines.append("No elements found.")
            return lines

        # Regular section: reuse the parent's formatting so dict and list
        # values render as markdown instead of a raw Python repr.
        return super()._format_section(name, data)

    def _format_element_table(self, elements: List[Dict[str, Any]], columns: List[str]) -> List[str]:
        """
        Format elements as markdown table.

        Args:
            elements: One dict per row; values are looked up by column name
                and missing keys render as empty cells.
            columns: Column names, in display order.

        Returns:
            A leading blank line, the header row, the separator row and one
            row per element -- or a single "No elements to display." line
            when there are no elements or no columns.
        """
        if not elements or not columns:
            return ["No elements to display."]

        lines = [""]  # Empty line before table

        # Table header
        header_row = "| " + " | ".join(str(col) for col in columns) + " |"
        separator = "|" + "|".join("------" for _ in columns) + "|"
        lines.extend([header_row, separator])

        # Table rows
        for element in elements:
            row_values = [self._cell_text(element.get(col, "")) for col in columns]
            lines.append("| " + " | ".join(row_values) + " |")

        return lines

    @staticmethod
    def _cell_text(value: Any) -> str:
        """Convert one value to text that is safe inside a markdown table cell."""
        if value is None:
            return ""

        if isinstance(value, float):
            try:
                text = str(int(round(value)))
            except (ValueError, OverflowError):
                # NaN and +/-inf cannot be rounded to an int; show them as-is.
                text = str(value)
        elif isinstance(value, str) and len(value) > 50:
            text = value[:50] + "..."
        else:
            text = str(value)

        # A raw line break would end the table row and an unescaped pipe
        # would start a new column, so neutralize both.
        text = text.replace("\r\n", " ").replace("\n", " ").replace("\r", " ")
        return text.replace("|", "\\|")
@@ -8,6 +8,7 @@ from PIL import Image
8
8
 
9
9
  # Import selector parsing functions
10
10
  from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
11
+ from natural_pdf.describe.mixin import DescribeMixin
11
12
 
12
13
  if TYPE_CHECKING:
13
14
  from natural_pdf.core.page import Page
@@ -412,7 +413,7 @@ class DirectionalMixin:
412
413
  return new_region
413
414
 
414
415
 
415
- class Element(DirectionalMixin):
416
+ class Element(DirectionalMixin, DescribeMixin):
416
417
  """
417
418
  Base class for all PDF elements.
418
419
 
@@ -30,6 +30,7 @@ from tqdm.auto import tqdm
30
30
  from natural_pdf.analyzers.shape_detection_mixin import ShapeDetectionMixin
31
31
  from natural_pdf.classification.manager import ClassificationManager
32
32
  from natural_pdf.classification.mixin import ClassificationMixin
33
+ from natural_pdf.describe.mixin import DescribeMixin, InspectMixin
33
34
  from natural_pdf.collections.mixins import ApplyMixin, DirectionalCollectionMixin
34
35
  from natural_pdf.core.pdf import PDF
35
36
  from natural_pdf.elements.base import Element
@@ -71,7 +72,14 @@ P = TypeVar("P", bound="Page")
71
72
 
72
73
 
73
74
  class ElementCollection(
74
- Generic[T], ApplyMixin, ExportMixin, DirectionalCollectionMixin, MutableSequence
75
+ Generic[T],
76
+ ApplyMixin,
77
+ ExportMixin,
78
+ ClassificationMixin,
79
+ DirectionalCollectionMixin,
80
+ DescribeMixin,
81
+ InspectMixin,
82
+ MutableSequence,
75
83
  ):
76
84
  """
77
85
  Collection of PDF elements with batch operations.
@@ -1795,6 +1803,8 @@ class ElementCollection(
1795
1803
  )
1796
1804
 
1797
1805
 
1806
+
1807
+
1798
1808
  class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
1799
1809
  """
1800
1810
  Represents a collection of Page objects, often from a single PDF document.
@@ -15,6 +15,7 @@ from natural_pdf.classification.manager import ClassificationManager # Keep for
15
15
 
16
16
  # --- Classification Imports --- #
17
17
  from natural_pdf.classification.mixin import ClassificationMixin
18
+ from natural_pdf.describe.mixin import DescribeMixin
18
19
  from natural_pdf.elements.base import DirectionalMixin
19
20
  from natural_pdf.elements.text import TextElement # ADDED IMPORT
20
21
  from natural_pdf.extraction.mixin import ExtractionMixin # Import extraction mixin
@@ -49,7 +50,7 @@ except ImportError:
49
50
  logger = logging.getLogger(__name__)
50
51
 
51
52
 
52
- class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
53
+ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMixin):
53
54
  """
54
55
  Represents a rectangular region on a page.
55
56
  """
@@ -2962,3 +2963,5 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
2962
2963
  )
2963
2964
 
2964
2965
  return text_element
2966
+
2967
+
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: natural-pdf
3
- Version: 0.1.16
3
+ Version: 0.1.17
4
4
  Summary: A more intuitive interface for working with PDFs
5
5
  Author-email: Jonathan Soma <jonathan.soma@gmail.com>
6
6
  License-Expression: MIT
@@ -25,14 +25,19 @@ natural_pdf/collections/pdf_collection.py,sha256=HLlyakM--23ZOeHDPucoM6Tw3yUyMXm
25
25
  natural_pdf/core/__init__.py,sha256=QC8H4M3KbXwMFiQORZ0pdPlzx1Ix6oKKQSS7Ib2KEaA,38
26
26
  natural_pdf/core/element_manager.py,sha256=_UdXu51sLi6STzc8Pj4k8R721G3yJixXDLuRHn3hmr8,25731
27
27
  natural_pdf/core/highlighting_service.py,sha256=_kQUS6_BBvsLBuSZloFrVag6jN90KzHa0ULyGBjufSs,36955
28
- natural_pdf/core/page.py,sha256=ciwBf-SoI431SJjp2VRfLxdtqgO2L6p044kXXjlNtjo,118231
28
+ natural_pdf/core/page.py,sha256=i3DriIQwoO4RuSrkrCXv44Dz8OL9KXPa2y4GhsD1y18,118324
29
29
  natural_pdf/core/pdf.py,sha256=bAoGPiKIrFaebLwULMT-9VkHQ_wkE_zNl4hlbMLk-2w,69325
30
+ natural_pdf/describe/__init__.py,sha256=B3zjuHjFI_dFuBLgXR1Q4v7c72fVDyk84d2hs0H4KV8,561
31
+ natural_pdf/describe/base.py,sha256=7USCFIl4mI5b15LTVkwvhAn_mngMwhwxCnVYaZz5Vdc,16842
32
+ natural_pdf/describe/elements.py,sha256=BOkz2wDhGh6P8NOm6pSNxitgmVokLTISztaFhrxMcdw,12717
33
+ natural_pdf/describe/mixin.py,sha256=U0x6v8r57KQb8qC3VVo64hvhfXQWsti8vdKBM7AXnMo,3116
34
+ natural_pdf/describe/summary.py,sha256=dPtjrn6fQ8nL0F74RITX2vXlDX7ZgaX9JQPnJB-S_XQ,6735
30
35
  natural_pdf/elements/__init__.py,sha256=S8XeiNWJ1WcgnyYKdYV1yxQlAxCCO3FfITT8MQwNbyk,41
31
- natural_pdf/elements/base.py,sha256=tEyCInUc6wxbUtnXVaBa21Qpr591Sgu4yi7tKxWb-3U,39607
32
- natural_pdf/elements/collections.py,sha256=_lWL-W-RKlYikkGJU66dskGCZ8-7WfMyUx2G0IgjhlQ,121965
36
+ natural_pdf/elements/base.py,sha256=IlAeyzV66xMrxVx9U3ocGPekzGUBJgKkAiJ5kpvCSAg,39675
37
+ natural_pdf/elements/collections.py,sha256=vgVZsVC3xxRF2S5KW7L0JKa-NSUFnqURk50NtvlwbcM,122113
33
38
  natural_pdf/elements/line.py,sha256=300kSFBDUBIudfeQtH_tzW9gTYRgRKUDPiTABw6J-BE,4782
34
39
  natural_pdf/elements/rect.py,sha256=kiVa3e377ZnqIOXc89d9ZSY4EcmDxtccdtUw-HOQzpw,3796
35
- natural_pdf/elements/region.py,sha256=nCXyI0vq9-MIQ4Zk90q5Nn-U6gDGv22NY6ime6qG1MY,123330
40
+ natural_pdf/elements/region.py,sha256=hBklYKcXJWyxayu9todYQOZ-d9KVDtqeV-CIt9IcSn8,123400
36
41
  natural_pdf/elements/text.py,sha256=13HvVZGinj2Vm_fFCAnqi7hohtoKvnpCp3VCfkpeAbc,11146
37
42
  natural_pdf/export/mixin.py,sha256=L1q3MIEFWuvie4j4_EmW7GT3NerbZ1as0XMUoqTS7gM,5083
38
43
  natural_pdf/exporters/__init__.py,sha256=7MnvRLLQdwtg-ULu-8uK8C84GsKiJamyhRw_GgWhw7k,151
@@ -85,8 +90,8 @@ natural_pdf/utils/text_extraction.py,sha256=z6Jhy11pakYCsEpkvh8ldw6DkUFsYF1hCL9Y
85
90
  natural_pdf/utils/visualization.py,sha256=30pRWQdsRJh2pSObh-brKVsFgC1n8tHmSrta_UDnVPw,8989
86
91
  natural_pdf/widgets/__init__.py,sha256=QTVaUmsw__FCweFYZebwPssQxxUFUMd0wpm_cUbGZJY,181
87
92
  natural_pdf/widgets/viewer.py,sha256=2VUY1TzWMDe9I-IVNOosKZ2LaqpjLB62ftMAdk-s6_8,24952
88
- natural_pdf-0.1.16.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
89
- natural_pdf-0.1.16.dist-info/METADATA,sha256=ncvnNI_PubS4q4v29OKp5UXyanEZNVWqsCanu-xGCOA,6753
90
- natural_pdf-0.1.16.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
91
- natural_pdf-0.1.16.dist-info/top_level.txt,sha256=Cyw1zmNDlUZfb5moU-WUWGprrwH7ln_8LDGdmMHF1xI,17
92
- natural_pdf-0.1.16.dist-info/RECORD,,
93
+ natural_pdf-0.1.17.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
94
+ natural_pdf-0.1.17.dist-info/METADATA,sha256=yGeusUaYx_R_aRl0lUnAHVfBav9Zw43MXDYcB3b6BcA,6753
95
+ natural_pdf-0.1.17.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
96
+ natural_pdf-0.1.17.dist-info/top_level.txt,sha256=Cyw1zmNDlUZfb5moU-WUWGprrwH7ln_8LDGdmMHF1xI,17
97
+ natural_pdf-0.1.17.dist-info/RECORD,,