natural-pdf 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (132) hide show
  1. docs/api/index.md +386 -0
  2. docs/assets/favicon.png +3 -0
  3. docs/assets/favicon.svg +3 -0
  4. docs/assets/javascripts/custom.js +17 -0
  5. docs/assets/logo.svg +3 -0
  6. docs/assets/sample-screen.png +0 -0
  7. docs/assets/social-preview.png +17 -0
  8. docs/assets/social-preview.svg +17 -0
  9. docs/assets/stylesheets/custom.css +65 -0
  10. docs/document-qa/index.ipynb +435 -0
  11. docs/document-qa/index.md +79 -0
  12. docs/element-selection/index.ipynb +915 -0
  13. docs/element-selection/index.md +229 -0
  14. docs/index.md +170 -0
  15. docs/installation/index.md +69 -0
  16. docs/interactive-widget/index.ipynb +962 -0
  17. docs/interactive-widget/index.md +12 -0
  18. docs/layout-analysis/index.ipynb +818 -0
  19. docs/layout-analysis/index.md +185 -0
  20. docs/ocr/index.md +222 -0
  21. docs/pdf-navigation/index.ipynb +314 -0
  22. docs/pdf-navigation/index.md +97 -0
  23. docs/regions/index.ipynb +816 -0
  24. docs/regions/index.md +294 -0
  25. docs/tables/index.ipynb +658 -0
  26. docs/tables/index.md +144 -0
  27. docs/text-analysis/index.ipynb +370 -0
  28. docs/text-analysis/index.md +105 -0
  29. docs/text-extraction/index.ipynb +1478 -0
  30. docs/text-extraction/index.md +292 -0
  31. docs/tutorials/01-loading-and-extraction.ipynb +1696 -0
  32. docs/tutorials/01-loading-and-extraction.md +95 -0
  33. docs/tutorials/02-finding-elements.ipynb +340 -0
  34. docs/tutorials/02-finding-elements.md +149 -0
  35. docs/tutorials/03-extracting-blocks.ipynb +147 -0
  36. docs/tutorials/03-extracting-blocks.md +48 -0
  37. docs/tutorials/04-table-extraction.ipynb +114 -0
  38. docs/tutorials/04-table-extraction.md +50 -0
  39. docs/tutorials/05-excluding-content.ipynb +270 -0
  40. docs/tutorials/05-excluding-content.md +109 -0
  41. docs/tutorials/06-document-qa.ipynb +332 -0
  42. docs/tutorials/06-document-qa.md +91 -0
  43. docs/tutorials/07-layout-analysis.ipynb +260 -0
  44. docs/tutorials/07-layout-analysis.md +66 -0
  45. docs/tutorials/07-working-with-regions.ipynb +409 -0
  46. docs/tutorials/07-working-with-regions.md +151 -0
  47. docs/tutorials/08-spatial-navigation.ipynb +508 -0
  48. docs/tutorials/08-spatial-navigation.md +190 -0
  49. docs/tutorials/09-section-extraction.ipynb +2434 -0
  50. docs/tutorials/09-section-extraction.md +256 -0
  51. docs/tutorials/10-form-field-extraction.ipynb +484 -0
  52. docs/tutorials/10-form-field-extraction.md +201 -0
  53. docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
  54. docs/tutorials/11-enhanced-table-processing.md +9 -0
  55. docs/tutorials/12-ocr-integration.ipynb +586 -0
  56. docs/tutorials/12-ocr-integration.md +188 -0
  57. docs/tutorials/13-semantic-search.ipynb +1888 -0
  58. docs/tutorials/13-semantic-search.md +77 -0
  59. docs/visual-debugging/index.ipynb +2970 -0
  60. docs/visual-debugging/index.md +157 -0
  61. docs/visual-debugging/region.png +0 -0
  62. natural_pdf/__init__.py +39 -20
  63. natural_pdf/analyzers/__init__.py +2 -1
  64. natural_pdf/analyzers/layout/base.py +32 -24
  65. natural_pdf/analyzers/layout/docling.py +131 -72
  66. natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
  67. natural_pdf/analyzers/layout/layout_manager.py +98 -58
  68. natural_pdf/analyzers/layout/layout_options.py +32 -17
  69. natural_pdf/analyzers/layout/paddle.py +152 -95
  70. natural_pdf/analyzers/layout/surya.py +164 -92
  71. natural_pdf/analyzers/layout/tatr.py +149 -84
  72. natural_pdf/analyzers/layout/yolo.py +84 -44
  73. natural_pdf/analyzers/text_options.py +22 -15
  74. natural_pdf/analyzers/text_structure.py +131 -85
  75. natural_pdf/analyzers/utils.py +30 -23
  76. natural_pdf/collections/pdf_collection.py +126 -98
  77. natural_pdf/core/__init__.py +1 -1
  78. natural_pdf/core/element_manager.py +416 -337
  79. natural_pdf/core/highlighting_service.py +268 -196
  80. natural_pdf/core/page.py +910 -516
  81. natural_pdf/core/pdf.py +387 -289
  82. natural_pdf/elements/__init__.py +1 -1
  83. natural_pdf/elements/base.py +302 -214
  84. natural_pdf/elements/collections.py +714 -514
  85. natural_pdf/elements/line.py +39 -36
  86. natural_pdf/elements/rect.py +32 -30
  87. natural_pdf/elements/region.py +854 -883
  88. natural_pdf/elements/text.py +122 -99
  89. natural_pdf/exporters/__init__.py +0 -1
  90. natural_pdf/exporters/searchable_pdf.py +261 -102
  91. natural_pdf/ocr/__init__.py +23 -14
  92. natural_pdf/ocr/engine.py +17 -8
  93. natural_pdf/ocr/engine_easyocr.py +63 -47
  94. natural_pdf/ocr/engine_paddle.py +97 -68
  95. natural_pdf/ocr/engine_surya.py +54 -44
  96. natural_pdf/ocr/ocr_manager.py +88 -62
  97. natural_pdf/ocr/ocr_options.py +16 -10
  98. natural_pdf/qa/__init__.py +1 -1
  99. natural_pdf/qa/document_qa.py +119 -111
  100. natural_pdf/search/__init__.py +37 -31
  101. natural_pdf/search/haystack_search_service.py +312 -189
  102. natural_pdf/search/haystack_utils.py +186 -122
  103. natural_pdf/search/search_options.py +25 -14
  104. natural_pdf/search/search_service_protocol.py +12 -6
  105. natural_pdf/search/searchable_mixin.py +261 -176
  106. natural_pdf/selectors/__init__.py +2 -1
  107. natural_pdf/selectors/parser.py +159 -316
  108. natural_pdf/templates/__init__.py +1 -1
  109. natural_pdf/utils/highlighting.py +8 -2
  110. natural_pdf/utils/reading_order.py +65 -63
  111. natural_pdf/utils/text_extraction.py +195 -0
  112. natural_pdf/utils/visualization.py +70 -61
  113. natural_pdf/widgets/__init__.py +2 -3
  114. natural_pdf/widgets/viewer.py +749 -718
  115. {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/METADATA +29 -15
  116. natural_pdf-0.1.5.dist-info/RECORD +134 -0
  117. natural_pdf-0.1.5.dist-info/top_level.txt +5 -0
  118. notebooks/Examples.ipynb +1293 -0
  119. pdfs/.gitkeep +0 -0
  120. pdfs/01-practice.pdf +543 -0
  121. pdfs/0500000US42001.pdf +0 -0
  122. pdfs/0500000US42007.pdf +0 -0
  123. pdfs/2014 Statistics.pdf +0 -0
  124. pdfs/2019 Statistics.pdf +0 -0
  125. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  126. pdfs/needs-ocr.pdf +0 -0
  127. tests/test_loading.py +50 -0
  128. tests/test_optional_deps.py +298 -0
  129. natural_pdf-0.1.3.dist-info/RECORD +0 -61
  130. natural_pdf-0.1.3.dist-info/top_level.txt +0 -1
  131. {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/WHEEL +0 -0
  132. {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/licenses/LICENSE +0 -0
@@ -4,12 +4,15 @@ from typing import List, Optional
4
4
 
5
5
  logger = logging.getLogger(__name__)
6
6
 
7
+
7
8
  @dataclass
8
9
  class TextStyleOptions:
9
10
  """Options for configuring text style analysis."""
10
11
 
11
12
  # Properties to consider when grouping elements by style
12
- group_by: List[str] = field(default_factory=lambda: ['size', 'fontname', 'is_bold', 'is_italic', 'color'])
13
+ group_by: List[str] = field(
14
+ default_factory=lambda: ["size", "fontname", "is_bold", "is_italic", "color"]
15
+ )
13
16
 
14
17
  # Tolerance for comparing font sizes (e.g., 0.5 rounds to nearest 0.5 point)
15
18
  size_tolerance: float = 0.5
@@ -30,31 +33,35 @@ class TextStyleOptions:
30
33
  # Format string for descriptive labels. Placeholders match keys in style_properties dict.
31
34
  # Example: "{size}pt {weight}{style} {family} ({color})"
32
35
  # Available keys: size, fontname, is_bold, is_italic, color, weight, style, family
33
- label_format: str = "{size}pt {weight}{style} {family}" # Default format without color
34
-
36
+ label_format: str = "{size}pt {weight}{style} {family}" # Default format without color
35
37
 
36
38
  def __post_init__(self):
37
39
  # Validate size_tolerance
38
40
  if self.size_tolerance <= 0:
39
- logger.warning(f"size_tolerance must be positive, setting to 0.1. Original value: {self.size_tolerance}")
41
+ logger.warning(
42
+ f"size_tolerance must be positive, setting to 0.1. Original value: {self.size_tolerance}"
43
+ )
40
44
  self.size_tolerance = 0.1
41
45
 
42
46
  # Ensure 'size' is always considered if tolerance is relevant
43
- if 'size' not in self.group_by and self.size_tolerance > 0:
47
+ if "size" not in self.group_by and self.size_tolerance > 0:
44
48
  logger.debug("Adding 'size' to group_by keys because size_tolerance is set.")
45
- if 'size' not in self.group_by: self.group_by.append('size')
49
+ if "size" not in self.group_by:
50
+ self.group_by.append("size")
46
51
 
47
- if self.ignore_color and 'color' in self.group_by:
52
+ if self.ignore_color and "color" in self.group_by:
48
53
  logger.debug("Removing 'color' from group_by keys because ignore_color is True.")
49
- self.group_by = [key for key in self.group_by if key != 'color']
50
- elif not self.ignore_color and 'color' not in self.group_by:
51
- # If color isn't ignored, ensure it's included if requested in label format?
52
- # For now, just rely on explicit group_by setting.
53
- pass
54
+ self.group_by = [key for key in self.group_by if key != "color"]
55
+ elif not self.ignore_color and "color" not in self.group_by:
56
+ # If color isn't ignored, ensure it's included if requested in label format?
57
+ # For now, just rely on explicit group_by setting.
58
+ pass
54
59
 
55
60
  # Basic validation for group_by keys
56
- allowed_keys = {'size', 'fontname', 'is_bold', 'is_italic', 'color'}
61
+ allowed_keys = {"size", "fontname", "is_bold", "is_italic", "color"}
57
62
  invalid_keys = set(self.group_by) - allowed_keys
58
63
  if invalid_keys:
59
- logger.warning(f"Invalid keys found in group_by: {invalid_keys}. Allowed keys: {allowed_keys}. Ignoring invalid keys.")
60
- self.group_by = [key for key in self.group_by if key in allowed_keys]
64
+ logger.warning(
65
+ f"Invalid keys found in group_by: {invalid_keys}. Allowed keys: {allowed_keys}. Ignoring invalid keys."
66
+ )
67
+ self.group_by = [key for key in self.group_by if key in allowed_keys]
@@ -1,18 +1,21 @@
1
1
  """
2
2
  Text structure analyzer for natural-pdf.
3
3
  """
4
+
4
5
  import logging
5
6
  import re
6
- from typing import List, Dict, Any, Optional, Tuple, Union, TYPE_CHECKING
7
7
  from collections import defaultdict
8
+ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
9
+
10
+ from natural_pdf.analyzers.text_options import TextStyleOptions
8
11
 
9
12
  # Import ElementCollection and TextStyleOptions
10
13
  from natural_pdf.elements.collections import ElementCollection
11
- from natural_pdf.analyzers.text_options import TextStyleOptions
12
14
 
13
15
  if TYPE_CHECKING:
14
16
  from natural_pdf.core.page import Page
15
17
  from natural_pdf.elements.base import Element
18
+
16
19
  # Remove ElementCollection from here if imported above
17
20
 
18
21
  logger = logging.getLogger(__name__)
@@ -21,63 +24,77 @@ logger = logging.getLogger(__name__)
21
24
  FONT_PREFIX_RE = re.compile(r"^[A-Z]{6}\+")
22
25
 
23
26
  # Common font weight/style keywords
24
- FONT_WEIGHTS = {"bold": "Bold", "black": "Bold", "heavy": "Bold", "medium": "", "light": "Light", "thin": "Thin"}
27
+ FONT_WEIGHTS = {
28
+ "bold": "Bold",
29
+ "black": "Bold",
30
+ "heavy": "Bold",
31
+ "medium": "",
32
+ "light": "Light",
33
+ "thin": "Thin",
34
+ }
25
35
  FONT_STYLES = {"italic": "Italic", "oblique": "Italic"}
26
36
 
37
+
27
38
  class TextStyleAnalyzer:
28
39
  """
29
40
  Analyzes and groups text elements by their style properties based on configuration.
30
-
41
+
31
42
  This analyzer groups text elements based on specified font properties
32
43
  (controlled by TextStyleOptions) and adds 'style_label', 'style_key',
33
44
  and 'style_properties' attributes to each processed text element.
34
45
  """
35
-
46
+
36
47
  def __init__(self, options: Optional[TextStyleOptions] = None):
37
48
  """
38
49
  Initialize the text style analyzer.
39
-
50
+
40
51
  Args:
41
52
  options: Configuration options for the analysis. Uses default if None.
42
53
  """
43
54
  self.options = options or TextStyleOptions()
44
55
  logger.debug(f"Initialized TextStyleAnalyzer with options: {self.options}")
45
56
 
46
- def analyze(self, page: 'Page', options: Optional[TextStyleOptions] = None) -> 'ElementCollection':
57
+ def analyze(
58
+ self, page: "Page", options: Optional[TextStyleOptions] = None
59
+ ) -> "ElementCollection":
47
60
  """
48
61
  Analyze text styles on a page, group elements, and add style attributes.
49
-
62
+
50
63
  Args:
51
64
  page: The Page object to analyze.
52
65
  options: Override the analyzer's default TextStyleOptions for this run.
53
-
66
+
54
67
  Returns:
55
68
  ElementCollection containing all processed text elements (typically words)
56
69
  with added 'style_label', 'style_key', and 'style_properties' attributes.
57
70
  """
58
71
  current_options = options or self.options
59
- logger.info(f"Starting text style analysis for page {page.number} with options: {current_options}")
72
+ logger.info(
73
+ f"Starting text style analysis for page {page.number} with options: {current_options}"
74
+ )
60
75
 
61
76
  # Use page.words for better granularity
62
77
  text_elements = page.words
63
78
  # Fallback if words are somehow empty/not generated
64
79
  if not text_elements:
65
- text_elements = page.find_all('text').elements # Get list from collection
80
+ text_elements = page.find_all("text").elements # Get list from collection
66
81
 
67
82
  # Skip empty pages or pages with no text elements
68
83
  if not text_elements:
69
84
  logger.warning(f"Page {page.number} has no text elements to analyze.")
70
85
  return ElementCollection([])
71
86
 
72
- style_cache: Dict[Tuple, Dict[str, Any]] = {} # Maps style_key_tuple -> {'label': str, 'properties': dict}
73
- processed_elements: List['Element'] = []
87
+ style_cache: Dict[Tuple, Dict[str, Any]] = (
88
+ {}
89
+ ) # Maps style_key_tuple -> {'label': str, 'properties': dict}
90
+ processed_elements: List["Element"] = []
74
91
 
75
92
  # Ensure consistent ordering for style key creation
76
93
  group_by_keys = sorted(current_options.group_by)
77
94
 
78
95
  for element in text_elements:
79
96
  # Skip elements without necessary attributes (e.g., non-text elements if find_all was used)
80
- if not hasattr(element, 'text') or not hasattr(element, 'size'):
97
+ if not hasattr(element, "text") or not hasattr(element, "size"):
81
98
  logger.debug(f"Skipping element without text/size: {element}")
82
99
  continue
83
100
 
@@ -86,37 +103,47 @@ class TextStyleAnalyzer:
86
103
  style_key = self._create_style_key(style_properties, group_by_keys)
87
104
 
88
105
  if style_key not in style_cache:
89
- label = self._generate_style_label(style_properties, current_options, len(style_cache) + 1)
90
- style_cache[style_key] = {'label': label, 'properties': style_properties}
91
- logger.debug(f"New style detected (Key: {style_key}): Label='{label}', Props={style_properties}")
106
+ label = self._generate_style_label(
107
+ style_properties, current_options, len(style_cache) + 1
108
+ )
109
+ style_cache[style_key] = {"label": label, "properties": style_properties}
110
+ logger.debug(
111
+ f"New style detected (Key: {style_key}): Label='{label}', Props={style_properties}"
112
+ )
92
113
 
93
114
  # Add attributes to the element
94
- element.style_label = style_cache[style_key]['label']
115
+ element.style_label = style_cache[style_key]["label"]
95
116
  element.style_key = style_key
96
117
  # Add the full properties dict for potential detailed inspection
97
- element.style_properties = style_cache[style_key]['properties']
118
+ element.style_properties = style_cache[style_key]["properties"]
98
119
 
99
120
  processed_elements.append(element)
100
121
 
101
122
  except Exception as e:
102
- logger.warning(f"Error processing element {element} for text style: {e}", exc_info=True)
103
- # Optionally add element without style info or skip it
104
- # processed_elements.append(element) # Add anyway?
123
+ logger.warning(
124
+ f"Error processing element {element} for text style: {e}", exc_info=True
125
+ )
126
+ # Optionally add element without style info or skip it
127
+ # processed_elements.append(element) # Add anyway?
105
128
 
106
129
  # Optionally store a summary on the page
107
130
  page._text_styles_summary = style_cache
108
- logger.info(f"Finished text style analysis for page {page.number}. Found {len(style_cache)} unique styles.")
131
+ logger.info(
132
+ f"Finished text style analysis for page {page.number}. Found {len(style_cache)} unique styles."
133
+ )
109
134
 
110
135
  return ElementCollection(processed_elements)
111
-
112
- def _extract_style_properties(self, element: 'Element', options: TextStyleOptions) -> Dict[str, Any]:
136
+
137
+ def _extract_style_properties(
138
+ self, element: "Element", options: TextStyleOptions
139
+ ) -> Dict[str, Any]:
113
140
  """
114
141
  Extract style properties from a text element based on options.
115
-
142
+
116
143
  Args:
117
144
  element: Text element.
118
145
  options: TextStyleOptions driving the extraction.
119
-
146
+
120
147
  Returns:
121
148
  Dictionary of extracted style properties.
122
149
  """
@@ -124,68 +151,81 @@ class TextStyleAnalyzer:
124
151
 
125
152
  # Font size
126
153
  font_size = None
127
- if hasattr(element, 'size') and element.size is not None:
154
+ if hasattr(element, "size") and element.size is not None:
128
155
  # Round based on tolerance
129
156
  rounding_factor = 1.0 / options.size_tolerance
130
157
  font_size = round(element.size * rounding_factor) / rounding_factor
131
- properties['size'] = font_size
158
+ properties["size"] = font_size
132
159
 
133
160
  # Font name
134
161
  font_name = None
135
162
  normalized_font_name = None
136
- if hasattr(element, 'fontname') and element.fontname is not None:
163
+ if hasattr(element, "fontname") and element.fontname is not None:
137
164
  font_name = element.fontname
138
165
  normalized_font_name = self._normalize_font_name(font_name, options)
139
- properties['fontname'] = normalized_font_name if options.normalize_fontname else font_name
166
+ properties["fontname"] = normalized_font_name if options.normalize_fontname else font_name
140
167
 
141
168
  # Font characteristics (derived from normalized name if available)
142
169
  name_to_check = normalized_font_name or font_name or ""
143
170
  name_lower = name_to_check.lower()
144
- is_bold = ('bold' in name_lower or 'black' in name_lower or 'heavy' in name_lower or name_to_check.endswith('-B'))
145
- is_italic = ('italic' in name_lower or 'oblique' in name_lower or name_to_check.endswith('-I'))
146
-
147
- properties['is_bold'] = is_bold
148
- properties['is_italic'] = is_italic
171
+ is_bold = (
172
+ "bold" in name_lower
173
+ or "black" in name_lower
174
+ or "heavy" in name_lower
175
+ or name_to_check.endswith("-B")
176
+ )
177
+ is_italic = (
178
+ "italic" in name_lower or "oblique" in name_lower or name_to_check.endswith("-I")
179
+ )
180
+
181
+ properties["is_bold"] = is_bold
182
+ properties["is_italic"] = is_italic
149
183
 
150
184
  # Text color
151
185
  color = None
152
- if not options.ignore_color and hasattr(element, 'non_stroking_color') and element.non_stroking_color is not None:
186
+ if (
187
+ not options.ignore_color
188
+ and hasattr(element, "non_stroking_color")
189
+ and element.non_stroking_color is not None
190
+ ):
153
191
  raw_color = element.non_stroking_color
154
192
  # Convert color to a hashable form (tuple)
155
193
  if isinstance(raw_color, (list, tuple)):
156
- color = tuple(round(c, 3) for c in raw_color) # Round color components
194
+ color = tuple(round(c, 3) for c in raw_color) # Round color components
157
195
  else:
158
196
  # Handle simple grayscale or other non-list representations if needed
159
- try:
160
- color = round(float(raw_color), 3)
161
- except (ValueError, TypeError):
162
- color = str(raw_color) # Fallback to string if cannot convert
197
+ try:
198
+ color = round(float(raw_color), 3)
199
+ except (ValueError, TypeError):
200
+ color = str(raw_color) # Fallback to string if cannot convert
163
201
  # Normalize common colors (optional, could be complex)
164
202
  # Example: (0.0, 0.0, 0.0) -> 'black', (1.0, 1.0, 1.0) -> 'white'
165
- if color == (0.0, 0.0, 0.0) or color == 0.0: color = 'black'
166
- if color == (1.0, 1.0, 1.0) or color == 1.0: color = 'white'
167
- properties['color'] = color
203
+ if color == (0.0, 0.0, 0.0) or color == 0.0:
204
+ color = "black"
205
+ if color == (1.0, 1.0, 1.0) or color == 1.0:
206
+ color = "white"
207
+ properties["color"] = color
168
208
 
169
209
  return properties
170
-
210
+
171
211
  def _normalize_font_name(self, fontname: str, options: TextStyleOptions) -> str:
172
- """ Basic normalization of font names. """
212
+ """Basic normalization of font names."""
173
213
  if not options.normalize_fontname:
174
214
  return fontname
175
215
  # Remove common subset prefixes like "ABCDEF+"
176
216
  name = FONT_PREFIX_RE.sub("", fontname)
177
217
  # Could add more rules here, e.g., removing version numbers, standardizing separators
178
218
  return name
179
-
219
+
180
220
  def _parse_font_name(self, normalized_fontname: str) -> Dict[str, str]:
181
- """ Attempt to parse family, weight, and style from a font name. Very heuristic. """
221
+ """Attempt to parse family, weight, and style from a font name. Very heuristic."""
182
222
  if not normalized_fontname:
183
- return {'family': 'Unknown', 'weight': '', 'style': ''}
223
+ return {"family": "Unknown", "weight": "", "style": ""}
184
224
 
185
- parts = re.split(r'[-,_ ]', normalized_fontname)
225
+ parts = re.split(r"[-,_ ]", normalized_fontname)
186
226
  family_parts = []
187
- weight = ''
188
- style = ''
227
+ weight = ""
228
+ style = ""
189
229
 
190
230
  for part in parts:
191
231
  part_lower = part.lower()
@@ -196,7 +236,8 @@ class TextStyleAnalyzer:
196
236
  weight = val
197
237
  found = True
198
238
  break
199
- if found: continue # Skip part if it was a weight
239
+ if found:
240
+ continue # Skip part if it was a weight
200
241
 
201
242
  # Check styles
202
243
  for key, val in FONT_STYLES.items():
@@ -204,67 +245,72 @@ class TextStyleAnalyzer:
204
245
  style = val
205
246
  found = True
206
247
  break
207
- if found: continue # Skip part if it was a style
248
+ if found:
249
+ continue # Skip part if it was a style
208
250
 
209
251
  # If not weight or style, assume it's part of the family name
210
- if part: # Avoid empty strings from multiple delimiters
211
- family_parts.append(part)
252
+ if part: # Avoid empty strings from multiple delimiters
253
+ family_parts.append(part)
212
254
 
213
- family = "".join(family_parts) or "Unknown" # Join remaining parts
255
+ family = "".join(family_parts) or "Unknown" # Join remaining parts
214
256
  # Simple cleanup: Remove "MT" often appended? Maybe too aggressive.
215
257
  # if family.endswith("MT"): family = family[:-2]
216
258
 
217
- return {'family': family, 'weight': weight, 'style': style}
218
-
259
+ return {"family": family, "weight": weight, "style": style}
260
+
219
261
  def _create_style_key(self, properties: Dict[str, Any], group_by_keys: List[str]) -> Tuple:
220
- """ Create a hashable tuple key based on selected properties. """
262
+ """Create a hashable tuple key based on selected properties."""
221
263
  key_parts = []
222
- for key in group_by_keys: # Use the pre-sorted list
264
+ for key in group_by_keys: # Use the pre-sorted list
223
265
  value = properties.get(key)
224
266
  # Ensure hashable - colors should already be tuples or basic types
225
- if isinstance(value, list): # Should not happen if _extract handled color correctly
267
+ if isinstance(value, list): # Should not happen if _extract handled color correctly
226
268
  value = tuple(value)
227
269
  key_parts.append(value)
228
270
  return tuple(key_parts)
229
-
230
- def _generate_style_label(self, properties: Dict[str, Any], options: TextStyleOptions, style_index: int) -> str:
231
- """ Generate a style label based on properties and options. """
271
+
272
+ def _generate_style_label(
273
+ self, properties: Dict[str, Any], options: TextStyleOptions, style_index: int
274
+ ) -> str:
275
+ """Generate a style label based on properties and options."""
232
276
  if not options.descriptive_labels:
233
277
  return f"{options.label_prefix} {style_index}"
234
278
 
235
279
  try:
236
- font_details = self._parse_font_name(properties.get('fontname', ''))
280
+ font_details = self._parse_font_name(properties.get("fontname", ""))
237
281
 
238
282
  label_data = {
239
- 'size': properties.get('size', '?'),
240
- 'fontname': properties.get('fontname', 'Unknown'),
241
- 'is_bold': properties.get('is_bold', False),
242
- 'is_italic': properties.get('is_italic', False),
243
- 'color': properties.get('color', ''),
244
- 'family': font_details['family'],
283
+ "size": properties.get("size", "?"),
284
+ "fontname": properties.get("fontname", "Unknown"),
285
+ "is_bold": properties.get("is_bold", False),
286
+ "is_italic": properties.get("is_italic", False),
287
+ "color": properties.get("color", ""),
288
+ "family": font_details["family"],
245
289
  # Use parsed weight/style if available, otherwise fallback to is_bold/is_italic flags
246
- 'weight': font_details['weight'] or ('Bold' if properties.get('is_bold') else ''),
247
- 'style': font_details['style'] or ('Italic' if properties.get('is_italic') else ''),
290
+ "weight": font_details["weight"] or ("Bold" if properties.get("is_bold") else ""),
291
+ "style": font_details["style"] or ("Italic" if properties.get("is_italic") else ""),
248
292
  }
249
293
  # Ensure style has a space separator if both weight and style exist
250
- if label_data['weight'] and label_data['style']:
251
- label_data['style'] = " " + label_data['style']
294
+ if label_data["weight"] and label_data["style"]:
295
+ label_data["style"] = " " + label_data["style"]
252
296
 
253
297
  # Handle color formatting for label
254
- color_val = label_data['color']
298
+ color_val = label_data["color"]
255
299
  if isinstance(color_val, tuple):
256
- color_str = f"rgb{color_val}" # Basic tuple representation
300
+ color_str = f"rgb{color_val}" # Basic tuple representation
257
301
  elif isinstance(color_val, str):
258
- color_str = color_val # Already string ('black', 'white', or fallback)
302
+ color_str = color_val # Already string ('black', 'white', or fallback)
259
303
  else:
260
- color_str = str(color_val) # Other types
261
- label_data['color_str'] = color_str
304
+ color_str = str(color_val) # Other types
305
+ label_data["color_str"] = color_str
262
306
 
263
307
  # Format the label, handle potential missing keys in format string gracefully
264
308
  label = options.label_format.format_map(defaultdict(str, label_data))
265
- return label.strip().replace(" ", " ") # Cleanup extra spaces
309
+ return label.strip().replace(" ", " ") # Cleanup extra spaces
266
310
 
267
311
  except Exception as e:
268
- logger.warning(f"Error generating descriptive label for style {properties}: {e}. Falling back to numeric label.")
312
+ logger.warning(
313
+ f"Error generating descriptive label for style {properties}: {e}. Falling back to numeric label."
314
+ )
269
315
  # Fallback to numeric label on error
270
- return f"{options.label_prefix} {style_index}"
316
+ return f"{options.label_prefix} {style_index}"
@@ -1,57 +1,64 @@
1
1
  import logging
2
- from typing import List, Dict, Any
2
+ from typing import Any, Dict, List
3
+
3
4
  from ..elements.region import Region
4
5
 
5
- def convert_to_regions(page: Any, detections: List[Dict[str, Any]],
6
- scale_factor: float = 1.0) -> List[Region]:
6
+
7
+ def convert_to_regions(
8
+ page: Any, detections: List[Dict[str, Any]], scale_factor: float = 1.0
9
+ ) -> List[Region]:
7
10
  """
8
11
  Convert layout detections to Region objects.
9
-
12
+
10
13
  Args:
11
14
  page: Page object to create regions for
12
15
  detections: List of detection dictionaries
13
16
  scale_factor: Factor to scale coordinates from image to PDF space
14
-
17
+
15
18
  Returns:
16
19
  List of Region objects with layout metadata
17
20
  """
18
21
  conversion_logger = logging.getLogger("natural_pdf.analyzers.layout.convert")
19
- conversion_logger.debug(f"Converting {len(detections)} detections to regions with scale {scale_factor}")
22
+ conversion_logger.debug(
23
+ f"Converting {len(detections)} detections to regions with scale {scale_factor}"
24
+ )
20
25
  regions = []
21
-
26
+
22
27
  for det in detections:
23
28
  # Extract detection info
24
- x_min, y_min, x_max, y_max = det['bbox']
25
-
29
+ x_min, y_min, x_max, y_max = det["bbox"]
30
+
26
31
  # Ensure coordinates are in proper order (min values are smaller)
27
32
  if x_min > x_max:
28
33
  x_min, x_max = x_max, x_min
29
34
  if y_min > y_max:
30
35
  y_min, y_max = y_max, y_min
31
-
36
+
32
37
  # Scale coordinates from image to PDF space
33
38
  if scale_factor != 1.0:
34
39
  x_min *= scale_factor
35
40
  y_min *= scale_factor
36
41
  x_max *= scale_factor
37
42
  y_max *= scale_factor
38
-
43
+
39
44
  # Create region with metadata
40
45
  region = Region(page, (x_min, y_min, x_max, y_max))
41
- region.region_type = det['class']
42
- region.confidence = det['confidence']
43
- region.normalized_type = det['normalized_class']
44
-
46
+ region.region_type = det["class"]
47
+ region.confidence = det["confidence"]
48
+ region.normalized_type = det["normalized_class"]
49
+
45
50
  # Add source info - important for filtering
46
- region.source = det.get('source', 'detected')
47
- region.model = det.get('model', 'unknown')
48
-
51
+ region.source = det.get("source", "detected")
52
+ region.model = det.get("model", "unknown")
53
+
49
54
  # Add additional metadata if available
50
55
  for key, value in det.items():
51
- if key not in ('bbox', 'class', 'confidence', 'normalized_class', 'source', 'model'):
56
+ if key not in ("bbox", "class", "confidence", "normalized_class", "source", "model"):
52
57
  setattr(region, key, value)
53
-
58
+
54
59
  regions.append(region)
55
-
56
- conversion_logger.debug(f"Created {len(regions)} region objects from {len(detections)} detections")
57
- return regions
60
+
61
+ conversion_logger.debug(
62
+ f"Created {len(regions)} region objects from {len(detections)} detections"
63
+ )
64
+ return regions