natural-pdf 25.3.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109):
  1. examples/__init__.py +3 -0
  2. examples/another_exclusion_example.py +20 -0
  3. examples/basic_usage.py +190 -0
  4. examples/boundary_exclusion_test.py +137 -0
  5. examples/boundary_inclusion_fix_test.py +157 -0
  6. examples/chainable_layout_example.py +70 -0
  7. examples/color_basic_test.py +49 -0
  8. examples/color_name_example.py +71 -0
  9. examples/color_test.py +62 -0
  10. examples/debug_ocr.py +91 -0
  11. examples/direct_ocr_test.py +148 -0
  12. examples/direct_paddle_test.py +99 -0
  13. examples/direct_qa_example.py +165 -0
  14. examples/document_layout_analysis.py +123 -0
  15. examples/document_qa_example.py +185 -0
  16. examples/exclusion_count_debug.py +128 -0
  17. examples/exclusion_debug.py +107 -0
  18. examples/exclusion_example.py +150 -0
  19. examples/exclusion_optimization_example.py +190 -0
  20. examples/extract_text_test.py +128 -0
  21. examples/font_aware_example.py +101 -0
  22. examples/font_variant_example.py +124 -0
  23. examples/footer_overlap_test.py +124 -0
  24. examples/highlight_all_example.py +82 -0
  25. examples/highlight_attributes_test.py +114 -0
  26. examples/highlight_confidence_display.py +122 -0
  27. examples/highlight_demo.py +110 -0
  28. examples/highlight_float_test.py +71 -0
  29. examples/highlight_test.py +147 -0
  30. examples/highlighting_example.py +123 -0
  31. examples/image_width_example.py +84 -0
  32. examples/improved_api_example.py +128 -0
  33. examples/layout_confidence_display_test.py +65 -0
  34. examples/layout_confidence_test.py +82 -0
  35. examples/layout_coordinate_debug.py +258 -0
  36. examples/layout_highlight_test.py +77 -0
  37. examples/logging_example.py +70 -0
  38. examples/ocr_comprehensive.py +193 -0
  39. examples/ocr_debug_example.py +87 -0
  40. examples/ocr_default_test.py +97 -0
  41. examples/ocr_engine_comparison.py +235 -0
  42. examples/ocr_example.py +89 -0
  43. examples/ocr_simplified_params.py +79 -0
  44. examples/ocr_visualization.py +102 -0
  45. examples/ocr_visualization_test.py +121 -0
  46. examples/paddle_layout_example.py +315 -0
  47. examples/paddle_layout_simple.py +74 -0
  48. examples/paddleocr_example.py +224 -0
  49. examples/page_collection_example.py +103 -0
  50. examples/polygon_highlight_example.py +83 -0
  51. examples/position_methods_example.py +134 -0
  52. examples/region_boundary_test.py +73 -0
  53. examples/region_exclusion_test.py +149 -0
  54. examples/region_expand_example.py +109 -0
  55. examples/region_image_example.py +116 -0
  56. examples/region_ocr_test.py +119 -0
  57. examples/region_sections_example.py +115 -0
  58. examples/school_books.py +49 -0
  59. examples/school_books_all.py +52 -0
  60. examples/scouring.py +36 -0
  61. examples/section_extraction_example.py +232 -0
  62. examples/simple_document_qa.py +97 -0
  63. examples/spatial_navigation_example.py +108 -0
  64. examples/table_extraction_example.py +135 -0
  65. examples/table_structure_detection.py +155 -0
  66. examples/tatr_cells_test.py +56 -0
  67. examples/tatr_ocr_table_test.py +94 -0
  68. examples/text_search_example.py +122 -0
  69. examples/text_style_example.py +110 -0
  70. examples/tiny-text.py +61 -0
  71. examples/until_boundaries_example.py +156 -0
  72. examples/until_example.py +112 -0
  73. examples/very_basics.py +15 -0
  74. natural_pdf/__init__.py +55 -0
  75. natural_pdf/analyzers/__init__.py +9 -0
  76. natural_pdf/analyzers/document_layout.py +736 -0
  77. natural_pdf/analyzers/text_structure.py +153 -0
  78. natural_pdf/core/__init__.py +3 -0
  79. natural_pdf/core/page.py +2376 -0
  80. natural_pdf/core/pdf.py +572 -0
  81. natural_pdf/elements/__init__.py +3 -0
  82. natural_pdf/elements/base.py +553 -0
  83. natural_pdf/elements/collections.py +770 -0
  84. natural_pdf/elements/line.py +124 -0
  85. natural_pdf/elements/rect.py +122 -0
  86. natural_pdf/elements/region.py +1366 -0
  87. natural_pdf/elements/text.py +304 -0
  88. natural_pdf/ocr/__init__.py +62 -0
  89. natural_pdf/ocr/easyocr_engine.py +254 -0
  90. natural_pdf/ocr/engine.py +158 -0
  91. natural_pdf/ocr/paddleocr_engine.py +263 -0
  92. natural_pdf/qa/__init__.py +3 -0
  93. natural_pdf/qa/document_qa.py +405 -0
  94. natural_pdf/selectors/__init__.py +4 -0
  95. natural_pdf/selectors/parser.py +360 -0
  96. natural_pdf/templates/__init__.py +1 -0
  97. natural_pdf/templates/ocr_debug.html +517 -0
  98. natural_pdf/utils/__init__.py +4 -0
  99. natural_pdf/utils/highlighting.py +605 -0
  100. natural_pdf/utils/ocr.py +515 -0
  101. natural_pdf/utils/reading_order.py +227 -0
  102. natural_pdf/utils/visualization.py +151 -0
  103. natural_pdf-25.3.16.dist-info/LICENSE +21 -0
  104. natural_pdf-25.3.16.dist-info/METADATA +268 -0
  105. natural_pdf-25.3.16.dist-info/RECORD +109 -0
  106. natural_pdf-25.3.16.dist-info/WHEEL +5 -0
  107. natural_pdf-25.3.16.dist-info/top_level.txt +3 -0
  108. tests/__init__.py +3 -0
  109. tests/test_pdf.py +39 -0
@@ -0,0 +1,153 @@
1
+ """
2
+ Text structure analyzer for natural-pdf.
3
+ """
4
+ from typing import List, Dict, Any, Optional, Tuple, Union, TYPE_CHECKING
5
+ from collections import defaultdict
6
+
7
+ if TYPE_CHECKING:
8
+ from natural_pdf.core.page import Page
9
+ from natural_pdf.elements.base import Element
10
+ from natural_pdf.elements.collections import ElementCollection
11
+
12
class TextStyleAnalyzer:
    """
    Analyzes and groups text elements by their style properties.

    Elements are grouped by a style key built from their font size
    (rounded to the nearest 0.5), font name, bold/italic hints derived
    from the font name, and fill color (``non_stroking_color``). Each
    distinct key is labelled ``"Text Style N"``, numbered in order of first
    appearance.
    """

    def __init__(self):
        """Initialize the text style analyzer."""
        pass

    def analyze(self, page: 'Page') -> Dict[str, 'ElementCollection']:
        """
        Analyze the text styles on a page.

        Args:
            page: Page to analyze; only its ``find_all('text')`` result is used.

        Returns:
            Dictionary mapping style labels to element collections, or an
            empty dictionary when the page yields no text elements.
        """
        text_elements = page.find_all('text')

        # Skip empty pages
        if not text_elements:
            return {}

        return self._group_by_style(text_elements)

    def _group_by_style(self, elements: 'ElementCollection') -> Dict[str, 'ElementCollection']:
        """
        Group text elements by their style properties.

        Args:
            elements: Text elements to group

        Returns:
            Dictionary mapping style labels ("Text Style 1", "Text Style 2",
            ...) to element collections. Labels are numbered in the order in
            which each distinct style is first encountered.
        """
        # Imported here rather than at module level; the module only imports
        # this name under TYPE_CHECKING.
        from natural_pdf.elements.collections import ElementCollection

        style_groups = defaultdict(list)
        style_mapping = {}  # Maps style tuple to zero-based style number

        for element in elements:
            style = self._extract_style_properties(element)
            # len(style_mapping) is evaluated before insertion, so a new
            # style receives the next unused number.
            style_num = style_mapping.setdefault(style, len(style_mapping))
            style_groups[f"Text Style {style_num+1}"].append(element)

        # Convert to ElementCollections
        return {
            label: ElementCollection(members)
            for label, members in style_groups.items()
        }

    @staticmethod
    def _to_hashable(value):
        """
        Return a hashable stand-in for a color value.

        Lists and tuples are converted to tuples recursively so nested
        sequences cannot make the style key unhashable. Any other value that
        cannot be hashed is replaced by its ``repr``.
        """
        if isinstance(value, (list, tuple)):
            return tuple(TextStyleAnalyzer._to_hashable(item) for item in value)
        try:
            hash(value)
        except TypeError:
            return repr(value)
        return value

    def _extract_style_properties(self, element: 'Element') -> Tuple:
        """
        Extract style properties from a text element.

        Args:
            element: Text element. The attributes ``size``, ``fontname`` and
                ``non_stroking_color`` are read if present; a missing or
                ``None`` attribute contributes ``None`` to the key.

        Returns:
            Hashable tuple ``(font_size, fontname, is_bold, is_italic, color)``
        """
        # Font size (rounded to nearest 0.5 to handle small variations)
        size = getattr(element, 'size', None)
        font_size = round(size * 2) / 2 if size is not None else None

        # Font name and characteristics derived from the name
        fontname = getattr(element, 'fontname', None)
        is_bold = False
        is_italic = False
        if fontname is not None:
            font_lower = fontname.lower()
            # The '-B' / '-I' suffix checks are deliberately case-sensitive,
            # unlike the substring checks on the lower-cased name.
            is_bold = (
                'bold' in font_lower
                or 'black' in font_lower
                or fontname.endswith('-B')
            )
            is_italic = (
                'italic' in font_lower
                or 'oblique' in font_lower
                or fontname.endswith('-I')
            )

        # Text color, normalised so the whole key can be used as a dict key
        color = getattr(element, 'non_stroking_color', None)
        if color is not None:
            color = self._to_hashable(color)

        return (font_size, fontname, is_bold, is_italic, color)

    def analyze_and_label(self, page: 'Page') -> 'Page':
        """
        Analyze the page text styles and add style labels to elements.

        Each grouped element gets a ``_style_label`` attribute holding its
        style label, and the full label-to-collection mapping is stored on
        the page as ``_text_styles``.

        Args:
            page: Page to analyze

        Returns:
            The same page object, with style labels added
        """
        styles = self.analyze(page)

        # Add style as an attribute to each element
        for label, members in styles.items():
            for element in members:
                element._style_label = label

        # Store the styles on the page
        page._text_styles = styles

        return page
@@ -0,0 +1,3 @@
1
+ """
2
+ Core classes for Natural PDF.
3
+ """