natural-pdf 0.1.4__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (132) hide show
  1. docs/api/index.md +386 -0
  2. docs/assets/favicon.png +3 -0
  3. docs/assets/favicon.svg +3 -0
  4. docs/assets/javascripts/custom.js +17 -0
  5. docs/assets/logo.svg +3 -0
  6. docs/assets/sample-screen.png +0 -0
  7. docs/assets/social-preview.png +17 -0
  8. docs/assets/social-preview.svg +17 -0
  9. docs/assets/stylesheets/custom.css +65 -0
  10. docs/document-qa/index.ipynb +435 -0
  11. docs/document-qa/index.md +79 -0
  12. docs/element-selection/index.ipynb +915 -0
  13. docs/element-selection/index.md +229 -0
  14. docs/index.md +170 -0
  15. docs/installation/index.md +69 -0
  16. docs/interactive-widget/index.ipynb +962 -0
  17. docs/interactive-widget/index.md +12 -0
  18. docs/layout-analysis/index.ipynb +818 -0
  19. docs/layout-analysis/index.md +185 -0
  20. docs/ocr/index.md +222 -0
  21. docs/pdf-navigation/index.ipynb +314 -0
  22. docs/pdf-navigation/index.md +97 -0
  23. docs/regions/index.ipynb +816 -0
  24. docs/regions/index.md +294 -0
  25. docs/tables/index.ipynb +658 -0
  26. docs/tables/index.md +144 -0
  27. docs/text-analysis/index.ipynb +370 -0
  28. docs/text-analysis/index.md +105 -0
  29. docs/text-extraction/index.ipynb +1478 -0
  30. docs/text-extraction/index.md +292 -0
  31. docs/tutorials/01-loading-and-extraction.ipynb +1696 -0
  32. docs/tutorials/01-loading-and-extraction.md +95 -0
  33. docs/tutorials/02-finding-elements.ipynb +340 -0
  34. docs/tutorials/02-finding-elements.md +149 -0
  35. docs/tutorials/03-extracting-blocks.ipynb +147 -0
  36. docs/tutorials/03-extracting-blocks.md +48 -0
  37. docs/tutorials/04-table-extraction.ipynb +114 -0
  38. docs/tutorials/04-table-extraction.md +50 -0
  39. docs/tutorials/05-excluding-content.ipynb +270 -0
  40. docs/tutorials/05-excluding-content.md +109 -0
  41. docs/tutorials/06-document-qa.ipynb +332 -0
  42. docs/tutorials/06-document-qa.md +91 -0
  43. docs/tutorials/07-layout-analysis.ipynb +260 -0
  44. docs/tutorials/07-layout-analysis.md +66 -0
  45. docs/tutorials/07-working-with-regions.ipynb +409 -0
  46. docs/tutorials/07-working-with-regions.md +151 -0
  47. docs/tutorials/08-spatial-navigation.ipynb +508 -0
  48. docs/tutorials/08-spatial-navigation.md +190 -0
  49. docs/tutorials/09-section-extraction.ipynb +2434 -0
  50. docs/tutorials/09-section-extraction.md +256 -0
  51. docs/tutorials/10-form-field-extraction.ipynb +484 -0
  52. docs/tutorials/10-form-field-extraction.md +201 -0
  53. docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
  54. docs/tutorials/11-enhanced-table-processing.md +9 -0
  55. docs/tutorials/12-ocr-integration.ipynb +586 -0
  56. docs/tutorials/12-ocr-integration.md +188 -0
  57. docs/tutorials/13-semantic-search.ipynb +1888 -0
  58. docs/tutorials/13-semantic-search.md +77 -0
  59. docs/visual-debugging/index.ipynb +2970 -0
  60. docs/visual-debugging/index.md +157 -0
  61. docs/visual-debugging/region.png +0 -0
  62. natural_pdf/__init__.py +39 -20
  63. natural_pdf/analyzers/__init__.py +2 -1
  64. natural_pdf/analyzers/layout/base.py +32 -24
  65. natural_pdf/analyzers/layout/docling.py +131 -72
  66. natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
  67. natural_pdf/analyzers/layout/layout_manager.py +98 -58
  68. natural_pdf/analyzers/layout/layout_options.py +32 -17
  69. natural_pdf/analyzers/layout/paddle.py +152 -95
  70. natural_pdf/analyzers/layout/surya.py +164 -92
  71. natural_pdf/analyzers/layout/tatr.py +149 -84
  72. natural_pdf/analyzers/layout/yolo.py +84 -44
  73. natural_pdf/analyzers/text_options.py +22 -15
  74. natural_pdf/analyzers/text_structure.py +131 -85
  75. natural_pdf/analyzers/utils.py +30 -23
  76. natural_pdf/collections/pdf_collection.py +125 -97
  77. natural_pdf/core/__init__.py +1 -1
  78. natural_pdf/core/element_manager.py +416 -337
  79. natural_pdf/core/highlighting_service.py +268 -196
  80. natural_pdf/core/page.py +907 -513
  81. natural_pdf/core/pdf.py +385 -287
  82. natural_pdf/elements/__init__.py +1 -1
  83. natural_pdf/elements/base.py +302 -214
  84. natural_pdf/elements/collections.py +708 -508
  85. natural_pdf/elements/line.py +39 -36
  86. natural_pdf/elements/rect.py +32 -30
  87. natural_pdf/elements/region.py +854 -883
  88. natural_pdf/elements/text.py +122 -99
  89. natural_pdf/exporters/__init__.py +0 -1
  90. natural_pdf/exporters/searchable_pdf.py +261 -102
  91. natural_pdf/ocr/__init__.py +23 -14
  92. natural_pdf/ocr/engine.py +17 -8
  93. natural_pdf/ocr/engine_easyocr.py +63 -47
  94. natural_pdf/ocr/engine_paddle.py +97 -68
  95. natural_pdf/ocr/engine_surya.py +54 -44
  96. natural_pdf/ocr/ocr_manager.py +88 -62
  97. natural_pdf/ocr/ocr_options.py +16 -10
  98. natural_pdf/qa/__init__.py +1 -1
  99. natural_pdf/qa/document_qa.py +119 -111
  100. natural_pdf/search/__init__.py +37 -31
  101. natural_pdf/search/haystack_search_service.py +312 -189
  102. natural_pdf/search/haystack_utils.py +186 -122
  103. natural_pdf/search/search_options.py +25 -14
  104. natural_pdf/search/search_service_protocol.py +12 -6
  105. natural_pdf/search/searchable_mixin.py +261 -176
  106. natural_pdf/selectors/__init__.py +2 -1
  107. natural_pdf/selectors/parser.py +159 -316
  108. natural_pdf/templates/__init__.py +1 -1
  109. natural_pdf/utils/highlighting.py +8 -2
  110. natural_pdf/utils/reading_order.py +65 -63
  111. natural_pdf/utils/text_extraction.py +195 -0
  112. natural_pdf/utils/visualization.py +70 -61
  113. natural_pdf/widgets/__init__.py +2 -3
  114. natural_pdf/widgets/viewer.py +749 -718
  115. {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.5.dist-info}/METADATA +15 -1
  116. natural_pdf-0.1.5.dist-info/RECORD +134 -0
  117. natural_pdf-0.1.5.dist-info/top_level.txt +5 -0
  118. notebooks/Examples.ipynb +1293 -0
  119. pdfs/.gitkeep +0 -0
  120. pdfs/01-practice.pdf +543 -0
  121. pdfs/0500000US42001.pdf +0 -0
  122. pdfs/0500000US42007.pdf +0 -0
  123. pdfs/2014 Statistics.pdf +0 -0
  124. pdfs/2019 Statistics.pdf +0 -0
  125. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  126. pdfs/needs-ocr.pdf +0 -0
  127. tests/test_loading.py +50 -0
  128. tests/test_optional_deps.py +298 -0
  129. natural_pdf-0.1.4.dist-info/RECORD +0 -61
  130. natural_pdf-0.1.4.dist-info/top_level.txt +0 -1
  131. {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.5.dist-info}/WHEEL +0 -0
  132. {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.5.dist-info}/licenses/LICENSE +0 -0
@@ -4,454 +4,533 @@ Element Manager for natural-pdf.
4
4
  This class handles the loading, creation, and management of PDF elements like
5
5
  characters, words, rectangles, and lines extracted from a page.
6
6
  """
7
+
7
8
  import logging
8
- from typing import List, Dict, Any, Optional, Union, Tuple
9
- from itertools import groupby
10
9
  import re
10
+ from itertools import groupby
11
+ from typing import Any, Dict, List, Optional, Tuple, Union
12
+
13
+ from pdfplumber.utils.text import WordExtractor
11
14
 
12
- from natural_pdf.elements.text import TextElement
13
- from natural_pdf.elements.rect import RectangleElement
14
15
  from natural_pdf.elements.line import LineElement
16
+ from natural_pdf.elements.rect import RectangleElement
17
+ from natural_pdf.elements.text import TextElement
15
18
 
16
19
  logger = logging.getLogger(__name__)
17
20
 
21
+
22
+ class NaturalWordExtractor(WordExtractor):
23
+ """
24
+ Custom WordExtractor that splits words based on specified character attributes
25
+ in addition to pdfplumber's default spatial logic.
26
+ """
27
+
28
+ def __init__(self, word_split_attributes: List[str], extra_attrs: List[str], *args, **kwargs):
29
+ """
30
+ Initialize the extractor.
31
+
32
+ Args:
33
+ word_split_attributes: List of character attributes (keys in char dict)
34
+ that should trigger a word split if they differ
35
+ between adjacent characters.
36
+ extra_attrs: List of character attributes (keys in char dict)
37
+ to copy from the first char of a word into the
38
+ resulting word dictionary.
39
+ *args: Positional arguments passed to WordExtractor parent.
40
+ **kwargs: Keyword arguments passed to WordExtractor parent.
41
+ """
42
+ self.word_split_attributes = word_split_attributes or []
43
+ # Remove our custom arg before passing to parent
44
+ # (Though WordExtractor likely ignores unknown kwargs)
45
+ # Ensure it's removed if it exists in kwargs
46
+ if "word_split_attributes" in kwargs:
47
+ del kwargs["word_split_attributes"]
48
+ # Pass extra_attrs to the parent constructor
49
+ kwargs["extra_attrs"] = extra_attrs
50
+ super().__init__(*args, **kwargs)
51
+
52
+ def char_begins_new_word(
53
+ self,
54
+ prev_char: Dict[str, Any],
55
+ curr_char: Dict[str, Any],
56
+ direction: str,
57
+ x_tolerance: float,
58
+ y_tolerance: float,
59
+ ) -> bool:
60
+ """
61
+ Determine if curr_char begins a new word, considering spatial and
62
+ attribute differences.
63
+ """
64
+ # 1. Check pdfplumber's spatial logic first
65
+ spatial_split = super().char_begins_new_word(
66
+ prev_char, curr_char, direction, x_tolerance, y_tolerance
67
+ )
68
+ if spatial_split:
69
+ return True
70
+
71
+ # 2. Check for differences in specified attributes
72
+ if self.word_split_attributes:
73
+ for attr in self.word_split_attributes:
74
+ # Use .get() for safety, although _prepare_char_dicts should ensure presence
75
+ if prev_char.get(attr) != curr_char.get(attr):
76
+ logger.debug(
77
+ f"Splitting word due to attribute mismatch on '{attr}': {prev_char.get(attr)} != {curr_char.get(attr)}"
78
+ )
79
+ return True # Attribute mismatch forces a new word
80
+
81
+ # If both spatial and attribute checks pass, it's the same word
82
+ return False
83
+
84
+
18
85
  class ElementManager:
19
86
  """
20
87
  Manages the loading, creation, and retrieval of elements from a PDF page.
21
-
88
+
22
89
  This class centralizes the element management functionality previously
23
90
  contained in the Page class, providing better separation of concerns.
24
91
  """
25
-
92
+
26
93
  def __init__(self, page, font_attrs=None):
27
94
  """
28
95
  Initialize the ElementManager.
29
-
96
+
30
97
  Args:
31
98
  page: The parent Page object
32
99
  font_attrs: Font attributes to consider when grouping characters into words.
33
- Default: ['fontname', 'size'] (Group by font name and size)
100
+ Default: ['fontname', 'size', 'bold', 'italic']
34
101
  None: Only consider spatial relationships
35
- List: Custom attributes to consider (e.g., ['fontname', 'size', 'color'])
102
+ List: Custom attributes to consider
36
103
  """
37
104
  self._page = page
38
105
  self._elements = None # Lazy-loaded
39
- # Default to grouping by fontname and size if not specified
40
- self._font_attrs = ['fontname', 'size'] if font_attrs is None else font_attrs
41
-
106
+ # Default to splitting by fontname, size, bold, italic if not specified
107
+ # Renamed internal variable for clarity
108
+ self._word_split_attributes = (
109
+ ["fontname", "size", "bold", "italic"] if font_attrs is None else font_attrs
110
+ )
111
+
42
112
  def load_elements(self):
43
113
  """
44
114
  Load all elements from the page (lazy loading).
115
+ Uses NaturalWordExtractor for word grouping.
45
116
  """
46
- if self._elements is None:
47
- # Create character elements with font information
48
- chars = self._create_char_elements()
49
-
50
- # Get keep_spaces setting from PDF config or default to True
51
- keep_spaces = self._page._parent._config.get('keep_spaces', True)
52
-
53
- # Group characters into words
54
- words = self._group_chars_into_words(keep_spaces, self._font_attrs)
55
-
56
- # Create the elements dictionary with all element types
57
- self._elements = {
58
- 'chars': chars,
59
- 'words': words,
60
- 'rects': [RectangleElement(r, self._page) for r in self._page._page.rects],
61
- 'lines': [LineElement(l, self._page) for l in self._page._page.lines],
62
- # Add other element types as needed
63
- }
64
-
65
- # Add regions if they exist
66
- if hasattr(self._page, '_regions') and ('detected' in self._page._regions or 'named' in self._page._regions):
67
- regions = []
68
- if 'detected' in self._page._regions:
69
- regions.extend(self._page._regions['detected'])
70
- if 'named' in self._page._regions:
71
- regions.extend(self._page._regions['named'].values())
72
- self._elements['regions'] = regions
73
-
74
- def _create_char_elements(self):
75
- """
76
- Create TextElement objects from page characters with enhanced font information.
77
-
78
- Returns:
79
- List of TextElement objects for characters
80
- """
81
- chars = []
82
- for c in self._page._page.chars:
83
- # Process font reference information
84
- self._process_font_information(c)
85
-
86
- # Add source attribute for native text elements
87
- c['source'] = 'native'
88
- chars.append(TextElement(c, self._page))
89
-
90
- return chars
91
-
92
- def _process_font_information(self, char_dict):
93
- """
94
- Process font information for a character dict, adding real_fontname when possible.
95
-
96
- Args:
97
- char_dict: Character dictionary to process
98
- """
99
- # Check for font references (F0, F1, etc.) and map to actual fonts
100
- if char_dict.get('fontname', '').startswith('F') and len(char_dict['fontname']) <= 3:
101
- # Access the PDF resource info to get actual font name
102
- font_ref = char_dict['fontname']
103
- try:
104
- # Try to get font info from resources
105
- if self._page._page.page_obj.get('Resources', {}).get('Font', {}):
106
- fonts = self._page._page.page_obj['Resources']['Font']
107
- if font_ref in fonts:
108
- font_obj = fonts[font_ref]
109
- if font_obj.get('BaseFont'):
110
- char_dict['real_fontname'] = font_obj['BaseFont']
111
- except (KeyError, AttributeError, TypeError):
112
- pass
113
-
114
- def _group_chars_into_words(self, keep_spaces=True, font_attrs=None):
115
- """
116
- Group characters into words based on font attributes and spatial relationships.
117
-
118
- Args:
119
- keep_spaces: Whether to keep spaces in words or use them as word separators
120
- font_attrs: Font attributes to consider when grouping characters
121
-
122
- Returns:
123
- List of TextElement word objects
124
- """
125
- # Sort chars by y-position (line) and then x-position
126
- sorted_chars = sorted(self._page._page.chars, key=lambda c: (round(c['top']), c['x0']))
127
-
128
- # Group chars by line (similar y-position)
129
- line_groups = []
130
- for _, line_chars in groupby(sorted_chars, key=lambda c: round(c['top'])):
131
- line_chars = list(line_chars)
132
-
133
- # Process each line of characters into words
134
- words = self._process_line_into_words(line_chars, keep_spaces, font_attrs)
135
- line_groups.extend(words)
136
-
137
- return line_groups
138
-
139
- def _process_line_into_words(self, line_chars, keep_spaces, font_attrs):
140
- """
141
- Process a single line of characters into words.
142
-
143
- Args:
144
- line_chars: List of characters in the line
145
- keep_spaces: Whether to keep spaces in words
146
- font_attrs: Font attributes to consider for word breaks
147
-
148
- Returns:
149
- List of TextElement word objects for this line
150
- """
151
- words = []
152
- current_word = []
153
-
154
- for i, char in enumerate(line_chars):
155
- # Handle whitespace characters differently based on keep_spaces setting
156
- if char['text'].isspace():
157
- if keep_spaces:
158
- # Include spaces in words when keep_spaces is enabled
159
- if current_word:
160
- current_word.append(char)
161
- else:
162
- # Skip leading spaces at the start of a line
163
- continue
164
- else:
165
- # Original behavior: Skip whitespace and close current word
166
- if current_word:
167
- # Create word and add to words list
168
- word = self._create_word_element(current_word, font_attrs)
169
- words.append(word)
170
- current_word = []
171
- continue
172
-
173
- # If this is a new word, start it
174
- if not current_word:
175
- current_word.append(char)
176
- else:
177
- # Check if this char is part of the current word or a new word
178
- prev_char = current_word[-1]
179
-
180
- # Check if font attributes match for this character
181
- font_attrs_match = self._check_font_attributes_match(char, prev_char, font_attrs)
182
-
183
- # If font attributes don't match, it's a new word
184
- if not font_attrs_match:
185
- # Complete current word
186
- word = self._create_word_element(current_word, font_attrs)
187
- words.append(word)
188
- current_word = [char]
189
- # If the gap between chars is larger than a threshold, it's a new word
190
- # Use a wider threshold when keep_spaces is enabled to allow for natural spaces
191
- elif char['x0'] - prev_char['x1'] > prev_char['width'] * (1.5 if keep_spaces else 0.5):
192
- # Complete current word
193
- word = self._create_word_element(current_word, font_attrs)
194
- words.append(word)
195
- current_word = [char]
196
- else:
197
- # Continue current word
198
- current_word.append(char)
199
-
200
- # Handle the last word if there is one
201
- if current_word:
202
- word = self._create_word_element(current_word, font_attrs)
203
- words.append(word)
204
-
205
- return words
206
-
207
- def _check_font_attributes_match(self, char, prev_char, font_attrs):
117
+ if self._elements is not None:
118
+ return
119
+
120
+ logger.debug(f"Page {self._page.number}: Loading elements...")
121
+
122
+ # 1. Prepare character dictionaries (native + OCR) with necessary attributes
123
+ prepared_char_dicts = self._prepare_char_dicts()
124
+ logger.debug(
125
+ f"Page {self._page.number}: Prepared {len(prepared_char_dicts)} character dictionaries."
126
+ )
127
+
128
+ # 2. Instantiate the custom word extractor
129
+ # Get config settings from the parent PDF or use defaults
130
+ pdf_config = getattr(self._page._parent, "_config", {})
131
+ xt = pdf_config.get("x_tolerance", 3)
132
+ yt = pdf_config.get("y_tolerance", 3)
133
+ use_flow = pdf_config.get("use_text_flow", False)
134
+
135
+ # Define which attributes to preserve on the merged word object
136
+ # Should include split attributes + any others needed for filtering (like color)
137
+ attributes_to_preserve = list(set(self._word_split_attributes + ["non_stroking_color"]))
138
+
139
+ # Pass our configured attributes for splitting
140
+ extractor = NaturalWordExtractor(
141
+ word_split_attributes=self._word_split_attributes,
142
+ extra_attrs=attributes_to_preserve,
143
+ x_tolerance=xt,
144
+ y_tolerance=yt,
145
+ keep_blank_chars=True,
146
+ use_text_flow=use_flow,
147
+ # Assuming default directions are okay, configure if needed
148
+ # line_dir=..., char_dir=...
149
+ )
150
+
151
+ # 3. Generate words using the extractor
152
+ generated_words = []
153
+ if prepared_char_dicts:
154
+ # Sort chars primarily by upright status, then page reading order
155
+ # Grouping by upright is crucial for WordExtractor's direction logic
156
+ sorted_chars_for_extraction = sorted(
157
+ prepared_char_dicts,
158
+ key=lambda c: (c.get("upright", True), round(c.get("top", 0)), c.get("x0", 0)),
159
+ )
160
+
161
+ word_tuples = extractor.iter_extract_tuples(sorted_chars_for_extraction)
162
+ for word_dict, char_list in word_tuples:
163
+ # Convert the generated word_dict to a TextElement
164
+ word_dict["_char_dicts"] = char_list
165
+ word_element = self._create_word_element(word_dict)
166
+ generated_words.append(word_element)
167
+ logger.debug(
168
+ f"Page {self._page.number}: Generated {len(generated_words)} words using NaturalWordExtractor."
169
+ )
170
+
171
+ # 4. Load other elements (rects, lines)
172
+ rect_elements = [RectangleElement(r, self._page) for r in self._page._page.rects]
173
+ line_elements = [LineElement(l, self._page) for l in self._page._page.lines]
174
+ logger.debug(
175
+ f"Page {self._page.number}: Loaded {len(rect_elements)} rects, {len(line_elements)} lines."
176
+ )
177
+
178
+ # 5. Create the final elements dictionary
179
+ self._elements = {
180
+ # Store original char elements if needed (e.g., for visualization/debugging)
181
+ # We re-create them here from the prepared dicts
182
+ "chars": [TextElement(c_dict, self._page) for c_dict in prepared_char_dicts],
183
+ "words": generated_words,
184
+ "rects": rect_elements,
185
+ "lines": line_elements,
186
+ }
187
+
188
+ # Add regions if they exist
189
+ if hasattr(self._page, "_regions") and (
190
+ "detected" in self._page._regions or "named" in self._page._regions
191
+ ):
192
+ regions = []
193
+ if "detected" in self._page._regions:
194
+ regions.extend(self._page._regions["detected"])
195
+ if "named" in self._page._regions:
196
+ regions.extend(self._page._regions["named"].values())
197
+ self._elements["regions"] = regions
198
+ logger.debug(f"Page {self._page.number}: Added {len(regions)} regions.")
199
+ else:
200
+ self._elements["regions"] = [] # Ensure key exists
201
+
202
+ logger.debug(f"Page {self._page.number}: Element loading complete.")
203
+
204
+ def _prepare_char_dicts(self) -> List[Dict[str, Any]]:
208
205
  """
209
- Check if two characters have matching font attributes.
210
-
211
- Args:
212
- char: Current character
213
- prev_char: Previous character
214
- font_attrs: List of font attributes to check
215
-
206
+ Prepares a list of character dictionaries from native PDF characters,
207
+ augmenting them with necessary attributes like bold/italic flags.
208
+ This method focuses ONLY on native characters. OCR results are
209
+ handled separately by create_text_elements_from_ocr.
210
+
216
211
  Returns:
217
- Boolean indicating whether font attributes match
212
+ List of augmented native character dictionaries.
218
213
  """
219
- # Default to match if no font attributes specified
220
- if not font_attrs:
221
- return True
222
-
223
- # Check each font attribute
224
- for attr in font_attrs:
225
- # If attribute doesn't match or isn't present in both chars, they don't match
226
- if attr not in char or attr not in prev_char or char[attr] != prev_char[attr]:
227
- return False
228
-
229
- return True
230
-
231
- def _create_word_element(self, chars, font_attrs):
214
+ prepared_dicts = []
215
+ processed_native_ids = set() # To track processed native chars
216
+
217
+ # 1. Process Native PDF Characters
218
+ native_chars = self._page._page.chars or []
219
+ logger.debug(f"Page {self._page.number}: Preparing {len(native_chars)} native char dicts.")
220
+ for i, char_dict in enumerate(native_chars):
221
+ # Create a temporary TextElement for analysis ONLY
222
+ # We need to ensure the char_dict has necessary keys first
223
+ if not all(k in char_dict for k in ["x0", "top", "x1", "bottom", "text"]):
224
+ logger.warning(f"Skipping native char dict due to missing keys: {char_dict}")
225
+ continue
226
+
227
+ temp_element = TextElement(char_dict, self._page)
228
+
229
+ # Augment the original dictionary
230
+ augmented_dict = char_dict.copy() # Work on a copy
231
+ augmented_dict["bold"] = temp_element.bold
232
+ augmented_dict["italic"] = temp_element.italic
233
+ augmented_dict["source"] = "native"
234
+ # Copy color if it exists
235
+ if "non_stroking_color" in char_dict:
236
+ augmented_dict["non_stroking_color"] = char_dict["non_stroking_color"]
237
+ # Ensure basic required keys are present
238
+ augmented_dict.setdefault("upright", True)
239
+ augmented_dict.setdefault("fontname", "Unknown")
240
+ augmented_dict.setdefault("size", 0)
241
+
242
+ prepared_dicts.append(augmented_dict)
243
+ # Use a unique identifier if available (e.g., tuple of key properties)
244
+ # Simple approach: use index for now, assuming list order is stable here
245
+ processed_native_ids.add(i)
246
+
247
+ # 2. Remove OCR Processing from this method
248
+ # OCR results will be added later via create_text_elements_from_ocr
249
+
250
+ logger.debug(
251
+ f"Page {self._page.number}: Total prepared native char dicts: {len(prepared_dicts)}"
252
+ )
253
+ return prepared_dicts
254
+
255
+ def _create_word_element(self, word_dict: Dict[str, Any]) -> TextElement:
232
256
  """
233
- Create a word element from a list of character dictionaries.
234
-
257
+ Create a TextElement (type 'word') from a word dictionary generated
258
+ by NaturalWordExtractor/pdfplumber.
259
+
235
260
  Args:
236
- chars: List of character dictionaries
237
- font_attrs: Font attributes to copy to the word
238
-
261
+ word_dict: Dictionary representing the word, including geometry,
262
+ text, and attributes copied from the first char
263
+ (e.g., fontname, size, bold, italic).
264
+
239
265
  Returns:
240
- TextElement representing the word
266
+ TextElement representing the word.
241
267
  """
242
- # Combine text from characters and normalize spaces
243
- text = ''.join(c['text'] for c in chars)
244
-
245
- # Collapse multiple consecutive spaces into a single space
246
- text = re.sub(r'\s+', ' ', text)
247
-
248
- # Create a combined word object
249
- word_obj = {
250
- 'text': text,
251
- 'x0': min(c['x0'] for c in chars),
252
- 'x1': max(c['x1'] for c in chars),
253
- 'top': min(c['top'] for c in chars),
254
- 'bottom': max(c['bottom'] for c in chars),
255
- 'fontname': chars[0].get('fontname', ''),
256
- 'size': chars[0].get('size', 0),
257
- 'object_type': 'word',
258
- 'page_number': chars[0]['page_number']
259
- }
260
-
261
- # Handle real fontname if available
262
- if 'real_fontname' in chars[0]:
263
- word_obj['real_fontname'] = chars[0]['real_fontname']
264
-
265
- # Handle color - use the first char's color
266
- if 'non_stroking_color' in chars[0]:
267
- word_obj['non_stroking_color'] = chars[0]['non_stroking_color']
268
-
269
- # Copy any additional font attributes
270
- if font_attrs:
271
- for attr in font_attrs:
272
- if attr in chars[0]:
273
- word_obj[attr] = chars[0][attr]
274
-
275
- # Add source attribute for native text elements
276
- word_obj['source'] = 'native'
277
-
278
- return TextElement(word_obj, self._page)
279
-
280
- def create_text_elements_from_ocr(self, ocr_results, image_width=None, image_height=None):
268
+ # word_dict already contains calculated geometry (x0, top, x1, bottom, etc.)
269
+ # and text content. We just need to ensure our required fields exist
270
+ # and potentially set the source.
271
+
272
+ # Start with a copy of the word_dict
273
+ element_data = word_dict.copy()
274
+
275
+ # Ensure required TextElement fields are present or add defaults
276
+ element_data.setdefault("object_type", "word") # Set type to 'word'
277
+ element_data.setdefault("page_number", self._page.number)
278
+ # Determine source based on attributes present (e.g., if 'confidence' exists, it's likely OCR)
279
+ # This assumes the word_dict carries over some hint from its chars.
280
+ # A simpler approach: assume 'native' unless fontname is 'OCR'.
281
+ element_data.setdefault(
282
+ "source", "ocr" if element_data.get("fontname") == "OCR" else "native"
283
+ )
284
+ element_data.setdefault(
285
+ "confidence", 1.0 if element_data["source"] == "native" else 0.0
286
+ ) # Default confidence
287
+
288
+ # Bold/italic should already be in word_dict if they were split attributes,
289
+ # copied from the first (representative) char by pdfplumber's merge_chars.
290
+ # Ensure they exist for TextElement initialization.
291
+ element_data.setdefault("bold", False)
292
+ element_data.setdefault("italic", False)
293
+
294
+ # Ensure fontname and size exist
295
+ element_data.setdefault("fontname", "Unknown")
296
+ element_data.setdefault("size", 0)
297
+
298
+ # Store the constituent char dicts (passed alongside word_dict from extractor)
299
+ # We need to modify the caller (load_elements) to pass this.
300
+ # For now, assume it might be passed in word_dict for placeholder.
301
+ element_data["_char_dicts"] = word_dict.get("_char_dicts", []) # Store char list
302
+
303
+ return TextElement(element_data, self._page)
304
+
305
+ def create_text_elements_from_ocr(self, ocr_results, scale_x=None, scale_y=None):
281
306
  """
282
- Convert OCR results to TextElement objects.
283
-
307
+ Convert OCR results to TextElement objects AND adds them to the manager's
308
+ 'words' and 'chars' lists.
309
+
310
+ This method should be called AFTER initial elements (native) might have
311
+ been loaded, as it appends to the existing lists.
312
+
284
313
  Args:
285
- ocr_results: List of OCR results with text, bbox, and confidence
286
- image_width: Width of the source image (for coordinate scaling)
287
- image_height: Height of the source image (for coordinate scaling)
288
-
314
+ ocr_results: List of OCR results dictionaries with 'text', 'bbox', 'confidence'.
315
+ scale_x: Factor to convert image x-coordinates to PDF coordinates.
316
+ scale_y: Factor to convert image y-coordinates to PDF coordinates.
317
+
289
318
  Returns:
290
- List of created TextElement objects
319
+ List of created TextElement word objects that were added.
291
320
  """
292
- elements = []
293
-
294
- # Calculate scale factors to convert from image coordinates to PDF coordinates
295
- # Default to 1.0 if not provided (assume coordinates are already in PDF space)
296
- scale_x = 1.0
297
- scale_y = 1.0
298
-
299
- if image_width and image_height:
300
- scale_x = self._page.width / image_width
301
- scale_y = self._page.height / image_height
302
-
321
+ added_word_elements = []
322
+ if self._elements is None:
323
+ # Trigger loading of native elements if not already done
324
+ logger.debug(
325
+ f"Page {self._page.number}: create_text_elements_from_ocr triggering initial load_elements."
326
+ )
327
+ self.load_elements()
328
+
329
+ # Ensure scales are valid numbers
330
+ scale_x = float(scale_x) if scale_x is not None else 1.0
331
+ scale_y = float(scale_y) if scale_y is not None else 1.0
332
+
333
+ logger.debug(
334
+ f"Page {self._page.number}: Adding {len(ocr_results)} OCR results as elements. Scale: x={scale_x:.2f}, y={scale_y:.2f}"
335
+ )
336
+
337
+ # Ensure the target lists exist in the _elements dict
338
+ if self._elements is None:
339
+ logger.error(
340
+ f"Page {self._page.number}: _elements dictionary is None after load_elements call in create_text_elements_from_ocr. Cannot add OCR elements."
341
+ )
342
+ return [] # Cannot proceed
343
+
344
+ if "words" not in self._elements:
345
+ self._elements["words"] = []
346
+ if "chars" not in self._elements:
347
+ self._elements["chars"] = []
348
+
303
349
  for result in ocr_results:
304
- # Convert numpy int32 to float if needed and scale to PDF coordinates
305
- x0 = float(result['bbox'][0]) * scale_x
306
- top = float(result['bbox'][1]) * scale_y
307
- x1 = float(result['bbox'][2]) * scale_x
308
- bottom = float(result['bbox'][3]) * scale_y
309
-
310
- # Create a TextElement object with additional required fields for highlighting
311
- element_data = {
312
- 'text': result['text'],
313
- 'x0': x0,
314
- 'top': top,
315
- 'x1': x1,
316
- 'bottom': bottom,
317
- 'width': x1 - x0,
318
- 'height': bottom - top,
319
- 'object_type': 'text',
320
- 'source': 'ocr',
321
- 'confidence': result['confidence'],
322
- # Add default font information to work with existing expectations
323
- 'fontname': 'OCR-detected',
324
- 'size': 10.0,
325
- 'page_number': self._page.number
326
- }
327
-
328
- elem = TextElement(element_data, self._page)
329
- elements.append(elem)
330
-
331
- # Add to page's elements
332
- if self._elements is not None:
333
- # Add to words list to make it accessible via standard API
334
- if 'words' in self._elements:
335
- self._elements['words'].append(elem)
336
- else:
337
- self._elements['words'] = [elem]
338
-
339
- return elements
340
-
341
- def add_element(self, element, element_type='words'):
350
+ try:
351
+ x0_img, top_img, x1_img, bottom_img = map(float, result["bbox"])
352
+ height_img = bottom_img - top_img
353
+ pdf_x0 = x0_img * scale_x
354
+ pdf_top = top_img * scale_y
355
+ pdf_x1 = x1_img * scale_x
356
+ pdf_bottom = bottom_img * scale_y
357
+ pdf_height = (bottom_img - top_img) * scale_y
358
+
359
+ # Create the TextElement for the word
360
+ word_element_data = {
361
+ "text": result["text"],
362
+ "x0": pdf_x0,
363
+ "top": pdf_top,
364
+ "x1": pdf_x1,
365
+ "bottom": pdf_bottom,
366
+ "width": (x1_img - x0_img) * scale_x,
367
+ "height": pdf_height,
368
+ "object_type": "word", # Treat OCR results as whole words
369
+ "source": "ocr",
370
+ "confidence": float(result.get("confidence", 0.0)),
371
+ "fontname": "OCR", # Use consistent OCR fontname
372
+ "size": (
373
+ round(pdf_height) if pdf_height > 0 else 10.0
374
+ ), # Use calculated PDF height for size
375
+ "page_number": self._page.number,
376
+ "bold": False,
377
+ "italic": False,
378
+ "upright": True,
379
+ "doctop": pdf_top + self._page._page.initial_doctop,
380
+ }
381
+
382
+ # Create the representative char dict for this OCR word
383
+ ocr_char_dict = word_element_data.copy()
384
+ ocr_char_dict["object_type"] = "char"
385
+ ocr_char_dict.setdefault("adv", ocr_char_dict.get("width", 0))
386
+
387
+ # Add the char dict list to the word data before creating TextElement
388
+ word_element_data["_char_dicts"] = [ocr_char_dict]
389
+
390
+ word_elem = TextElement(word_element_data, self._page)
391
+ added_word_elements.append(word_elem)
392
+
393
+ # Append the word element to the manager's list
394
+ self._elements["words"].append(word_elem)
395
+
396
+ # Also create and append a representative character dictionary
397
+ # for consistency if someone iterates through manager.chars later.
398
+ # This char dict represents the entire OCR word as a single 'char'.
399
+ char_dict_data = ocr_char_dict # Use the one we already created
400
+ char_dict_data["object_type"] = "char" # Mark as char type
401
+ # pdfplumber char dicts don't typically have width/height/doctop,
402
+ # but keeping them won't hurt WordExtractor if it encounters them.
403
+ char_dict_data.setdefault("adv", char_dict_data.get("width", 0))
404
+
405
+ self._elements["chars"].append(char_dict_data) # Append the dictionary
406
+
407
+ except (KeyError, ValueError, TypeError) as e:
408
+ logger.error(f"Failed to process OCR result: {result}. Error: {e}", exc_info=True)
409
+ continue
410
+
411
+ logger.info(
412
+ f"Page {self._page.number}: Appended {len(added_word_elements)} TextElements (words) and corresponding char dicts from OCR results."
413
+ )
414
+ return added_word_elements
415
+
416
+ def add_element(self, element, element_type="words"):
342
417
  """
343
418
  Add an element to the managed elements.
344
-
419
+
345
420
  Args:
346
421
  element: The element to add
347
422
  element_type: The type of element ('words', 'chars', etc.)
348
-
423
+
349
424
  Returns:
350
425
  True if added successfully, False otherwise
351
426
  """
352
427
  # Load elements if not already loaded
353
428
  self.load_elements()
354
-
429
+
355
430
  # Add to the appropriate list
356
431
  if element_type in self._elements:
357
- self._elements[element_type].append(element)
358
- return True
359
-
432
+ # Avoid adding duplicates
433
+ if element not in self._elements[element_type]:
434
+ self._elements[element_type].append(element)
435
+ return True
436
+ else:
437
+ # logger.debug(f"Element already exists in {element_type}: {element}")
438
+ return False # Indicate it wasn't newly added
439
+
360
440
  return False
361
-
441
+
362
442
  def add_region(self, region, name=None):
363
443
  """
364
444
  Add a region to the managed elements.
365
-
445
+
366
446
  Args:
367
447
  region: The region to add
368
448
  name: Optional name for the region
369
-
449
+
370
450
  Returns:
371
451
  True if added successfully, False otherwise
372
452
  """
373
453
  # Load elements if not already loaded
374
454
  self.load_elements()
375
-
455
+
376
456
  # Make sure regions is in _elements
377
- if 'regions' not in self._elements:
378
- self._elements['regions'] = []
379
-
457
+ if "regions" not in self._elements:
458
+ self._elements["regions"] = []
459
+
380
460
  # Add to elements for selector queries
381
- if region not in self._elements['regions']:
382
- self._elements['regions'].append(region)
461
+ if region not in self._elements["regions"]:
462
+ self._elements["regions"].append(region)
383
463
  return True
384
-
464
+
385
465
  return False
386
-
466
+
387
467
  def get_elements(self, element_type=None):
388
468
  """
389
469
  Get all elements of the specified type, or all elements if type is None.
390
-
470
+
391
471
  Args:
392
- element_type: Optional element type ('words', 'chars', 'rects', 'lines', etc.)
393
-
472
+ element_type: Optional element type ('words', 'chars', 'rects', 'lines', 'regions' etc.)
473
+
394
474
  Returns:
395
475
  List of elements
396
476
  """
397
477
  # Load elements if not already loaded
398
478
  self.load_elements()
399
-
479
+
400
480
  if element_type:
401
481
  return self._elements.get(element_type, [])
402
-
482
+
403
483
  # Combine all element types
404
484
  all_elements = []
405
485
  for elements in self._elements.values():
406
486
  all_elements.extend(elements)
407
-
487
+
408
488
  return all_elements
409
-
489
+
410
490
  def get_all_elements(self):
411
491
  """
412
492
  Get all elements from all types.
413
-
493
+
414
494
  Returns:
415
495
  List of all elements
416
496
  """
417
497
  # Load elements if not already loaded
418
498
  self.load_elements()
419
-
499
+
420
500
  # Combine all element types
421
501
  all_elements = []
422
- for elements in self._elements.values():
423
- all_elements.extend(elements)
424
-
502
+ if self._elements: # Ensure _elements is not None
503
+ for elements in self._elements.values():
504
+ if isinstance(elements, list): # Ensure we only extend lists
505
+ all_elements.extend(elements)
425
506
  return all_elements
426
-
507
+
427
508
  @property
428
509
  def chars(self):
429
510
  """Get all character elements."""
430
511
  self.load_elements()
431
- return self._elements['chars']
432
-
512
+ return self._elements.get("chars", [])
513
+
433
514
  @property
434
515
  def words(self):
435
516
  """Get all word elements."""
436
517
  self.load_elements()
437
- return self._elements['words']
438
-
518
+ return self._elements.get("words", [])
519
+
439
520
  @property
440
521
  def rects(self):
441
522
  """Get all rectangle elements."""
442
523
  self.load_elements()
443
- return self._elements['rects']
444
-
524
+ return self._elements.get("rects", [])
525
+
445
526
  @property
446
527
  def lines(self):
447
528
  """Get all line elements."""
448
529
  self.load_elements()
449
- return self._elements['lines']
450
-
530
+ return self._elements.get("lines", [])
531
+
451
532
  @property
452
533
  def regions(self):
453
534
  """Get all region elements."""
454
535
  self.load_elements()
455
- if 'regions' not in self._elements:
456
- self._elements['regions'] = []
457
- return self._elements['regions']
536
+ return self._elements.get("regions", [])