natural-pdf 25.3.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- examples/__init__.py +3 -0
- examples/another_exclusion_example.py +20 -0
- examples/basic_usage.py +190 -0
- examples/boundary_exclusion_test.py +137 -0
- examples/boundary_inclusion_fix_test.py +157 -0
- examples/chainable_layout_example.py +70 -0
- examples/color_basic_test.py +49 -0
- examples/color_name_example.py +71 -0
- examples/color_test.py +62 -0
- examples/debug_ocr.py +91 -0
- examples/direct_ocr_test.py +148 -0
- examples/direct_paddle_test.py +99 -0
- examples/direct_qa_example.py +165 -0
- examples/document_layout_analysis.py +123 -0
- examples/document_qa_example.py +185 -0
- examples/exclusion_count_debug.py +128 -0
- examples/exclusion_debug.py +107 -0
- examples/exclusion_example.py +150 -0
- examples/exclusion_optimization_example.py +190 -0
- examples/extract_text_test.py +128 -0
- examples/font_aware_example.py +101 -0
- examples/font_variant_example.py +124 -0
- examples/footer_overlap_test.py +124 -0
- examples/highlight_all_example.py +82 -0
- examples/highlight_attributes_test.py +114 -0
- examples/highlight_confidence_display.py +122 -0
- examples/highlight_demo.py +110 -0
- examples/highlight_float_test.py +71 -0
- examples/highlight_test.py +147 -0
- examples/highlighting_example.py +123 -0
- examples/image_width_example.py +84 -0
- examples/improved_api_example.py +128 -0
- examples/layout_confidence_display_test.py +65 -0
- examples/layout_confidence_test.py +82 -0
- examples/layout_coordinate_debug.py +258 -0
- examples/layout_highlight_test.py +77 -0
- examples/logging_example.py +70 -0
- examples/ocr_comprehensive.py +193 -0
- examples/ocr_debug_example.py +87 -0
- examples/ocr_default_test.py +97 -0
- examples/ocr_engine_comparison.py +235 -0
- examples/ocr_example.py +89 -0
- examples/ocr_simplified_params.py +79 -0
- examples/ocr_visualization.py +102 -0
- examples/ocr_visualization_test.py +121 -0
- examples/paddle_layout_example.py +315 -0
- examples/paddle_layout_simple.py +74 -0
- examples/paddleocr_example.py +224 -0
- examples/page_collection_example.py +103 -0
- examples/polygon_highlight_example.py +83 -0
- examples/position_methods_example.py +134 -0
- examples/region_boundary_test.py +73 -0
- examples/region_exclusion_test.py +149 -0
- examples/region_expand_example.py +109 -0
- examples/region_image_example.py +116 -0
- examples/region_ocr_test.py +119 -0
- examples/region_sections_example.py +115 -0
- examples/school_books.py +49 -0
- examples/school_books_all.py +52 -0
- examples/scouring.py +36 -0
- examples/section_extraction_example.py +232 -0
- examples/simple_document_qa.py +97 -0
- examples/spatial_navigation_example.py +108 -0
- examples/table_extraction_example.py +135 -0
- examples/table_structure_detection.py +155 -0
- examples/tatr_cells_test.py +56 -0
- examples/tatr_ocr_table_test.py +94 -0
- examples/text_search_example.py +122 -0
- examples/text_style_example.py +110 -0
- examples/tiny-text.py +61 -0
- examples/until_boundaries_example.py +156 -0
- examples/until_example.py +112 -0
- examples/very_basics.py +15 -0
- natural_pdf/__init__.py +55 -0
- natural_pdf/analyzers/__init__.py +9 -0
- natural_pdf/analyzers/document_layout.py +736 -0
- natural_pdf/analyzers/text_structure.py +153 -0
- natural_pdf/core/__init__.py +3 -0
- natural_pdf/core/page.py +2376 -0
- natural_pdf/core/pdf.py +572 -0
- natural_pdf/elements/__init__.py +3 -0
- natural_pdf/elements/base.py +553 -0
- natural_pdf/elements/collections.py +770 -0
- natural_pdf/elements/line.py +124 -0
- natural_pdf/elements/rect.py +122 -0
- natural_pdf/elements/region.py +1366 -0
- natural_pdf/elements/text.py +304 -0
- natural_pdf/ocr/__init__.py +62 -0
- natural_pdf/ocr/easyocr_engine.py +254 -0
- natural_pdf/ocr/engine.py +158 -0
- natural_pdf/ocr/paddleocr_engine.py +263 -0
- natural_pdf/qa/__init__.py +3 -0
- natural_pdf/qa/document_qa.py +405 -0
- natural_pdf/selectors/__init__.py +4 -0
- natural_pdf/selectors/parser.py +360 -0
- natural_pdf/templates/__init__.py +1 -0
- natural_pdf/templates/ocr_debug.html +517 -0
- natural_pdf/utils/__init__.py +4 -0
- natural_pdf/utils/highlighting.py +605 -0
- natural_pdf/utils/ocr.py +515 -0
- natural_pdf/utils/reading_order.py +227 -0
- natural_pdf/utils/visualization.py +151 -0
- natural_pdf-25.3.16.dist-info/LICENSE +21 -0
- natural_pdf-25.3.16.dist-info/METADATA +268 -0
- natural_pdf-25.3.16.dist-info/RECORD +109 -0
- natural_pdf-25.3.16.dist-info/WHEEL +5 -0
- natural_pdf-25.3.16.dist-info/top_level.txt +3 -0
- tests/__init__.py +3 -0
- tests/test_pdf.py +39 -0
@@ -0,0 +1,153 @@
|
|
1
|
+
"""
|
2
|
+
Text structure analyzer for natural-pdf.
|
3
|
+
"""
|
4
|
+
from typing import List, Dict, Any, Optional, Tuple, Union, TYPE_CHECKING
|
5
|
+
from collections import defaultdict
|
6
|
+
|
7
|
+
if TYPE_CHECKING:
|
8
|
+
from natural_pdf.core.page import Page
|
9
|
+
from natural_pdf.elements.base import Element
|
10
|
+
from natural_pdf.elements.collections import ElementCollection
|
11
|
+
|
12
|
+
class TextStyleAnalyzer:
    """
    Analyzes and groups text elements by their style properties.

    Two elements share a style when their rounded font size, font name,
    bold/italic flags (derived from the font name) and text color
    (``non_stroking_color``) are all equal. Each distinct style is given
    a label of the form ``"Text Style N"``, numbered in the order the
    styles are first encountered.
    """

    def __init__(self):
        """Initialize the text style analyzer (it keeps no state)."""
        pass

    def analyze(self, page: 'Page') -> Dict[str, 'ElementCollection']:
        """
        Analyze the text styles on a page.

        Args:
            page: Page to analyze; its ``find_all('text')`` result is grouped

        Returns:
            Dictionary mapping style labels to element collections.
            An empty dictionary is returned when the page has no text.
        """
        text_elements = page.find_all('text')

        # Nothing to group on a page without text
        if not text_elements:
            return {}

        return self._group_by_style(text_elements)

    def _group_by_style(self, elements: 'ElementCollection') -> Dict[str, 'ElementCollection']:
        """
        Group text elements by their style properties.

        Args:
            elements: Text elements to group

        Returns:
            Dictionary mapping style labels to element collections
        """
        # Imported at call time rather than at module import
        from natural_pdf.elements.collections import ElementCollection

        style_numbers = {}  # Maps style tuple to zero-based style number
        grouped = defaultdict(list)

        for element in elements:
            style = self._extract_style_properties(element)

            # First time this style is seen: give it the next free number
            if style not in style_numbers:
                style_numbers[style] = len(style_numbers)

            # Labels shown to callers are one-based
            grouped[f"Text Style {style_numbers[style] + 1}"].append(element)

        # Convert the plain lists to ElementCollections
        return {
            label: ElementCollection(members)
            for label, members in grouped.items()
        }

    @staticmethod
    def _to_hashable(value: Any) -> Any:
        """
        Return a hashable stand-in for a style property value.

        Lists and tuples are converted to tuples recursively, so nested
        sequences become nested tuples. Any other value that cannot be
        hashed is replaced by its ``repr()`` string. Hashable values are
        returned unchanged.

        Args:
            value: Value to convert

        Returns:
            A hashable equivalent of ``value``
        """
        if isinstance(value, (list, tuple)):
            return tuple(TextStyleAnalyzer._to_hashable(item) for item in value)
        try:
            hash(value)
        except TypeError:
            # Last resort so the style tuple can always be used as a dict key
            return repr(value)
        return value

    def _extract_style_properties(self, element: 'Element') -> Tuple:
        """
        Extract style properties from a text element.

        Args:
            element: Text element

        Returns:
            Hashable tuple ``(size, fontname, is_bold, is_italic, color)``.
            ``size``, ``fontname`` and ``color`` are ``None`` when the
            element lacks the attribute or the attribute is ``None``.
        """
        size = getattr(element, 'size', None)
        fontname = getattr(element, 'fontname', None)
        color = getattr(element, 'non_stroking_color', None)

        # Round to the nearest 0.5 so small size variations share a style
        font_size = round(size * 2) / 2 if size is not None else None

        # Font characteristics are derived from the font name
        is_bold = False
        is_italic = False
        if fontname is not None:
            font_lower = fontname.lower()
            is_bold = ('bold' in font_lower or 'black' in font_lower
                       or fontname.endswith('-B'))
            is_italic = ('italic' in font_lower or 'oblique' in font_lower
                         or fontname.endswith('-I'))

        # The color may be a (possibly nested) sequence; make it hashable
        if color is not None:
            color = self._to_hashable(color)

        return (font_size, fontname, is_bold, is_italic, color)

    def analyze_and_label(self, page: 'Page') -> 'Page':
        """
        Analyze the page text styles and add style labels to elements.

        Each grouped element gets its label stored in ``_style_label``,
        and the full label-to-collection mapping is stored on the page
        as ``_text_styles``.

        Args:
            page: Page to analyze

        Returns:
            The same page object, with style labels added
        """
        styles = self.analyze(page)

        # Tag every element with the label of the group it belongs to
        for label, members in styles.items():
            for element in members:
                element._style_label = label

        # Store the styles on the page
        page._text_styles = styles

        return page
|