natural-pdf 0.1.5__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. docs/ocr/index.md +34 -47
  2. docs/tutorials/01-loading-and-extraction.ipynb +60 -46
  3. docs/tutorials/02-finding-elements.ipynb +42 -42
  4. docs/tutorials/03-extracting-blocks.ipynb +17 -17
  5. docs/tutorials/04-table-extraction.ipynb +12 -12
  6. docs/tutorials/05-excluding-content.ipynb +30 -30
  7. docs/tutorials/06-document-qa.ipynb +28 -28
  8. docs/tutorials/07-layout-analysis.ipynb +63 -35
  9. docs/tutorials/07-working-with-regions.ipynb +55 -51
  10. docs/tutorials/07-working-with-regions.md +2 -2
  11. docs/tutorials/08-spatial-navigation.ipynb +60 -60
  12. docs/tutorials/09-section-extraction.ipynb +113 -113
  13. docs/tutorials/10-form-field-extraction.ipynb +78 -50
  14. docs/tutorials/11-enhanced-table-processing.ipynb +6 -6
  15. docs/tutorials/12-ocr-integration.ipynb +149 -131
  16. docs/tutorials/12-ocr-integration.md +0 -13
  17. docs/tutorials/13-semantic-search.ipynb +313 -873
  18. natural_pdf/__init__.py +21 -23
  19. natural_pdf/analyzers/layout/gemini.py +264 -0
  20. natural_pdf/analyzers/layout/layout_manager.py +28 -1
  21. natural_pdf/analyzers/layout/layout_options.py +11 -0
  22. natural_pdf/analyzers/layout/yolo.py +6 -2
  23. natural_pdf/collections/pdf_collection.py +21 -0
  24. natural_pdf/core/element_manager.py +16 -13
  25. natural_pdf/core/page.py +165 -36
  26. natural_pdf/core/pdf.py +146 -41
  27. natural_pdf/elements/base.py +11 -17
  28. natural_pdf/elements/collections.py +100 -38
  29. natural_pdf/elements/region.py +77 -38
  30. natural_pdf/elements/text.py +5 -0
  31. natural_pdf/ocr/__init__.py +49 -36
  32. natural_pdf/ocr/engine.py +146 -51
  33. natural_pdf/ocr/engine_easyocr.py +141 -161
  34. natural_pdf/ocr/engine_paddle.py +107 -193
  35. natural_pdf/ocr/engine_surya.py +75 -148
  36. natural_pdf/ocr/ocr_factory.py +114 -0
  37. natural_pdf/ocr/ocr_manager.py +65 -93
  38. natural_pdf/ocr/ocr_options.py +7 -17
  39. natural_pdf/ocr/utils.py +98 -0
  40. natural_pdf/templates/spa/css/style.css +334 -0
  41. natural_pdf/templates/spa/index.html +31 -0
  42. natural_pdf/templates/spa/js/app.js +472 -0
  43. natural_pdf/templates/spa/words.txt +235976 -0
  44. natural_pdf/utils/debug.py +32 -0
  45. natural_pdf/utils/identifiers.py +29 -0
  46. natural_pdf/utils/packaging.py +418 -0
  47. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.6.dist-info}/METADATA +41 -19
  48. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.6.dist-info}/RECORD +51 -44
  49. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.6.dist-info}/WHEEL +1 -1
  50. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.6.dist-info}/top_level.txt +0 -1
  51. natural_pdf/templates/ocr_debug.html +0 -517
  52. tests/test_loading.py +0 -50
  53. tests/test_optional_deps.py +0 -298
  54. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.6.dist-info}/licenses/LICENSE +0 -0
natural_pdf/__init__.py CHANGED
@@ -12,17 +12,16 @@ logger = logging.getLogger("natural_pdf")
  logger.addHandler(logging.NullHandler())
 
 
- # Utility function for users to easily configure logging
  def configure_logging(level=logging.INFO, handler=None):
-     """Configure Natural PDF's logging.
+     """Configure logging for the natural_pdf package.
 
      Args:
-         level: The logging level (e.g., logging.INFO, logging.DEBUG)
-         handler: A custom handler, or None to use StreamHandler
+         level: Logging level (e.g., logging.INFO, logging.DEBUG)
+         handler: Optional custom handler. Defaults to a StreamHandler.
      """
-     # Remove NullHandler if present
-     if logger.handlers and isinstance(logger.handlers[0], logging.NullHandler):
-         logger.removeHandler(logger.handlers[0])
+     # Avoid adding duplicate handlers
+     if any(isinstance(h, logging.StreamHandler) for h in logger.handlers):
+         return
 
      if handler is None:
          handler = logging.StreamHandler()
@@ -32,11 +31,7 @@ def configure_logging(level=logging.INFO, handler=None):
      logger.addHandler(handler)
      logger.setLevel(level)
 
-     # Propagate level to all child loggers
-     for name in logging.root.manager.loggerDict:
-         if name.startswith("natural_pdf."):
-             logging.getLogger(name).setLevel(level)
-
+     logger.propagate = False
 
  from natural_pdf.core.page import Page
  from natural_pdf.core.pdf import PDF
@@ -53,18 +48,21 @@ except ImportError:
 
  __version__ = "0.1.1"
 
+ __all__ = [
+     "PDF",
+     "PDFCollection",
+     "Page",
+     "Region",
+     "ElementCollection",
+     "TextSearchOptions",
+     "MultiModalSearchOptions",
+     "BaseSearchOptions",
+     "configure_logging",
+ ]
+
  if HAS_QA:
-     __all__ = [
-         "PDF",
-         "Page",
-         "Region",
-         "ElementCollection",
-         "configure_logging",
-         "DocumentQA",
-         "get_qa_engine",
-     ]
- else:
-     __all__ = ["PDF", "Page", "Region", "ElementCollection", "configure_logging"]
+     __all__.extend(["DocumentQA", "get_qa_engine"])
+
 
  from .collections.pdf_collection import PDFCollection
 
natural_pdf/analyzers/layout/gemini.py ADDED
@@ -0,0 +1,264 @@
+ # layout_detector_gemini.py
+ import importlib.util
+ import logging
+ import os
+ from typing import Any, Dict, List, Optional
+ import base64
+ import io
+
+ from pydantic import BaseModel, Field
+ from PIL import Image
+
+ # Use OpenAI library for interaction
+ try:
+     from openai import OpenAI
+     from openai.types.chat import ChatCompletion
+     # Import OpenAIError for exception handling if needed
+ except ImportError:
+     OpenAI = None
+     ChatCompletion = None
+
+ try:
+     from .base import LayoutDetector
+     from .layout_options import BaseLayoutOptions, GeminiLayoutOptions
+ except ImportError:
+     # Placeholders if run standalone or imports fail
+     class BaseLayoutOptions:
+         pass
+
+     class GeminiLayoutOptions(BaseLayoutOptions):
+         pass
+
+     class LayoutDetector:
+         def __init__(self):
+             self.logger = logging.getLogger()
+             self.supported_classes = set() # Will be dynamic based on user request
+
+         def _get_model(self, options):
+             raise NotImplementedError
+
+         def _normalize_class_name(self, n):
+             return n.lower().replace("_", "-").replace(" ", "-")
+
+         def validate_classes(self, c):
+             pass # Less strict validation needed for LLM
+
+     logging.basicConfig()
+
+ logger = logging.getLogger(__name__)
+
+ # Define Pydantic model for the expected output structure
+ # This is used by the openai library's `response_format`
+ class DetectedRegion(BaseModel):
+     label: str = Field(description="The identified class name.")
+     bbox: List[float] = Field(description="Bounding box coordinates [xmin, ymin, xmax, ymax].", min_items=4, max_items=4)
+     confidence: float = Field(description="Confidence score [0.0, 1.0].", ge=0.0, le=1.0)
+
+
+ class GeminiLayoutDetector(LayoutDetector):
+     """Document layout detector using Google's Gemini models via OpenAI compatibility layer."""
+
+     # Base URL for the Gemini OpenAI-compatible endpoint
+     GEMINI_BASE_URL = "https://generativelanguage.googleapis.com/v1beta/openai/"
+
+     def __init__(self):
+         super().__init__()
+         self.supported_classes = set() # Indicate dynamic nature
+
+     def is_available(self) -> bool:
+         """Check if openai library is installed and GOOGLE_API_KEY is available."""
+         api_key = os.environ.get("GOOGLE_API_KEY")
+         if not api_key:
+             logger.warning("GOOGLE_API_KEY environment variable not set. Gemini detector (via OpenAI lib) will not be available.")
+             return False
+         if OpenAI is None:
+             logger.warning("openai package not found. Gemini detector (via OpenAI lib) will not be available.")
+             return False
+         return True
+
+     def _get_cache_key(self, options: GeminiLayoutOptions) -> str:
+         """Generate cache key based on model name."""
+         if not isinstance(options, GeminiLayoutOptions):
+             options = GeminiLayoutOptions() # Use defaults
+
+         model_key = options.model_name
+         # Prompt is built dynamically, so not part of cache key based on options
+         return f"{self.__class__.__name__}_{model_key}"
+
+     def _load_model_from_options(self, options: GeminiLayoutOptions) -> Any:
+         """Validate options and return the model name."""
+         if not self.is_available():
+             raise RuntimeError(
+                 "OpenAI library not installed or GOOGLE_API_KEY not set. Please run: pip install openai"
+             )
+
+         if not isinstance(options, GeminiLayoutOptions):
+             raise TypeError("Incorrect options type provided for Gemini model loading.")
+
+         # Simply return the model name, client is created in detect()
+         return options.model_name
+
+     def detect(self, image: Image.Image, options: BaseLayoutOptions) -> List[Dict[str, Any]]:
+         """Detect layout elements in an image using Gemini via OpenAI library."""
+         if not self.is_available():
+             raise RuntimeError(
+                 "OpenAI library not installed or GOOGLE_API_KEY not set."
+             )
+
+         # Ensure options are the correct type
+         if not isinstance(options, GeminiLayoutOptions):
+             self.logger.warning(
+                 "Received BaseLayoutOptions, expected GeminiLayoutOptions. Using defaults."
+             )
+             options = GeminiLayoutOptions(
+                 confidence=options.confidence,
+                 classes=options.classes,
+                 exclude_classes=options.exclude_classes,
+                 device=options.device,
+                 extra_args=options.extra_args,
+             )
+
+         model_name = self._get_model(options)
+         api_key = os.environ.get("GOOGLE_API_KEY")
+
+         detections = []
+         try:
+             # --- 1. Initialize OpenAI Client for Gemini ---
+             client = OpenAI(
+                 api_key=api_key,
+                 base_url=self.GEMINI_BASE_URL
+             )
+
+             # --- 2. Prepare Input for OpenAI API ---
+             if not options.classes:
+                 logger.error("Gemini layout detection requires a list of classes to find.")
+                 return []
+
+             width, height = image.size
+
+             # Convert image to base64
+             buffered = io.BytesIO()
+             image.save(buffered, format="PNG")
+             img_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
+             image_url = f"data:image/png;base64,{img_base64}"
+
+             # Construct the prompt text
+             class_list_str = ", ".join(f'`{c}`' for c in options.classes)
+             prompt_text = (
+                 f"Analyze the provided image of a document page ({width}x{height}). "
+                 f"Identify all regions corresponding to the following types: {class_list_str}. "
+                 f"Return ONLY the structured data requested."
+             )
+
+             # Prepare messages for chat completions endpoint
+             messages = [
+                 {
+                     "role": "user",
+                     "content": [
+                         {"type": "text", "text": prompt_text},
+                         {
+                             "type": "image_url",
+                             "image_url": {"url": image_url},
+                         },
+                     ],
+                 }
+             ]
+
+             # --- 3. Call OpenAI API using .parse for structured output ---
+             logger.debug(f"Running Gemini detection via OpenAI lib (Model: {model_name}). Asking for classes: {options.classes}")
+
+             # Extract relevant generation parameters from extra_args if provided
+             # Mapping common names: temperature, top_p, max_tokens
+             completion_kwargs = {
+                 "temperature": options.extra_args.get("temperature", 0.2), # Default to low temp
+                 "top_p": options.extra_args.get("top_p"),
+                 "max_tokens": options.extra_args.get("max_tokens", 4096), # Map from max_output_tokens
+             }
+             # Filter out None values
+             completion_kwargs = {k: v for k, v in completion_kwargs.items() if v is not None}
+
+             completion: ChatCompletion = client.beta.chat.completions.parse(
+                 model=model_name,
+                 messages=messages,
+                 response_format=List[DetectedRegion], # Pass the Pydantic model list
+                 **completion_kwargs
+             )
+
+             logger.debug(f"Gemini response received via OpenAI lib.")
+
+             # --- 4. Process Parsed Response ---
+             if not completion.choices:
+                 logger.error("Gemini response (via OpenAI lib) contained no choices.")
+                 return []
+
+             # Get the parsed Pydantic objects
+             parsed_results = completion.choices[0].message.parsed
+             if not parsed_results or not isinstance(parsed_results, list):
+                 logger.error(f"Gemini response (via OpenAI lib) did not contain a valid list of parsed regions. Found: {type(parsed_results)}")
+                 return []
+
+             # --- 5. Convert to Detections & Filter ---
+             normalized_classes_req = {
+                 self._normalize_class_name(c) for c in options.classes
+             }
+             normalized_classes_excl = {
+                 self._normalize_class_name(c) for c in options.exclude_classes
+             } if options.exclude_classes else set()
+
+             for item in parsed_results:
+                 # The item is already a validated DetectedRegion Pydantic object
+                 # Access fields directly
+                 label = item.label
+                 bbox_raw = item.bbox
+                 confidence_score = item.confidence
+
+                 # Coordinates should already be floats, but ensure tuple format
+                 xmin, ymin, xmax, ymax = tuple(bbox_raw)
+
+                 # --- Apply Filtering ---
+                 normalized_class = self._normalize_class_name(label)
+
+                 # Check against requested classes (Should be guaranteed by schema, but doesn't hurt)
+                 if normalized_class not in normalized_classes_req:
+                     logger.warning(f"Gemini (via OpenAI) returned unexpected class '{label}' despite schema. Skipping.")
+                     continue
+
+                 # Check against excluded classes
+                 if normalized_class in normalized_classes_excl:
+                     logger.debug(f"Skipping excluded class '{label}' (normalized: {normalized_class}).")
+                     continue
+
+                 # Check against base confidence threshold from options
+                 if confidence_score < options.confidence:
+                     logger.debug(f"Skipping item with confidence {confidence_score:.3f} below threshold {options.confidence}.")
+                     continue
+
+                 # Add detection
+                 detections.append({
+                     "bbox": (xmin, ymin, xmax, ymax),
+                     "class": label, # Use original label from LLM
+                     "confidence": confidence_score,
+                     "normalized_class": normalized_class,
+                     "source": "layout",
+                     "model": "gemini", # Keep model name generic as gemini
+                 })
+
+             self.logger.info(
+                 f"Gemini (via OpenAI lib) processed response. Detected {len(detections)} layout elements matching criteria."
+             )
+
+         except Exception as e:
+             # Catch potential OpenAI API errors or other issues
+             self.logger.error(f"Error during Gemini detection (via OpenAI lib): {e}", exc_info=True)
+             return []
+
+         return detections
+
+     def _normalize_class_name(self, name: str) -> str:
+         """Normalizes class names for filtering (lowercase, hyphenated)."""
+         return super()._normalize_class_name(name)
+
+     def validate_classes(self, classes: List[str]):
+         """Validation is less critical as we pass requested classes to the LLM."""
+         pass # Override base validation if needed, but likely not necessary
+
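The new detector can also be exercised directly, outside the LayoutManager. A rough sketch, assuming GOOGLE_API_KEY is set, the openai package is installed, and a pre-rendered page image is on disk (the file name and class list below are placeholders):

    from PIL import Image

    from natural_pdf.analyzers.layout.gemini import GeminiLayoutDetector
    from natural_pdf.analyzers.layout.layout_options import GeminiLayoutOptions

    detector = GeminiLayoutDetector()
    options = GeminiLayoutOptions(
        classes=["title", "table", "text"],  # detect() requires a non-empty class list
        confidence=0.3,
        extra_args={"temperature": 0.1},     # forwarded to the chat completion call
    )
    regions = detector.detect(Image.open("page.png"), options)  # placeholder image
    for region in regions:
        print(region["class"], region["bbox"], region["confidence"])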
natural_pdf/analyzers/layout/layout_manager.py CHANGED
@@ -37,9 +37,15 @@ try:
  except ImportError:
      DoclingLayoutDetector = None
 
+ try:
+     from .gemini import GeminiLayoutDetector
+ except ImportError:
+     GeminiLayoutDetector = None
+
  from .layout_options import (
      BaseLayoutOptions,
      DoclingLayoutOptions,
+     GeminiLayoutOptions,
      LayoutOptions,
      PaddleLayoutOptions,
      SuryaLayoutOptions,
@@ -83,6 +89,13 @@ class LayoutManager:
              "options_class": DoclingLayoutOptions,
          }
 
+     # Add Gemini entry if available
+     if GeminiLayoutDetector:
+         ENGINE_REGISTRY["gemini"] = {
+             "class": GeminiLayoutDetector,
+             "options_class": GeminiLayoutOptions,
+         }
+
      # Define the limited set of kwargs allowed for the simple analyze_layout call
      SIMPLE_MODE_ALLOWED_KWARGS = {"engine", "confidence", "classes", "exclude_classes", "device"}
 
@@ -108,8 +121,22 @@ class LayoutManager:
              detector_instance = engine_class() # Instantiate
              if not detector_instance.is_available():
                  # Check availability before storing
+                 # Construct helpful error message with install hint
+                 install_hint = ""
+                 if engine_name == "yolo":
+                     install_hint = "pip install 'natural-pdf[layout_yolo]'"
+                 elif engine_name == "tatr":
+                     install_hint = "pip install 'natural-pdf[core-ml]'"
+                 elif engine_name == "paddle":
+                     install_hint = "pip install 'natural-pdf[paddle]'"
+                 elif engine_name == "surya":
+                     install_hint = "pip install 'natural-pdf[surya]'"
+                 # Add other engines like docling if they become optional extras
+                 else:
+                     install_hint = f"(Check installation requirements for {engine_name})"
+
                  raise RuntimeError(
-                     f"Layout engine '{engine_name}' is not available. Please check dependencies."
+                     f"Layout engine '{engine_name}' is not available. Please install the required dependencies: {install_hint}"
                  )
              self._detector_instances[engine_name] = detector_instance # Store if available
 
natural_pdf/analyzers/layout/layout_options.py CHANGED
@@ -80,6 +80,16 @@ class DoclingLayoutOptions(BaseLayoutOptions):
      # Other kwargs like 'device', 'batch_size' can go in extra_args
 
 
+ # --- Gemini Specific Options ---
+ @dataclass
+ class GeminiLayoutOptions(BaseLayoutOptions):
+     """Options specific to Gemini-based layout detection (using OpenAI compatibility)."""
+
+     model_name: str = "gemini-2.0-flash"
+     # Removed: prompt_template, temperature, top_p, max_output_tokens
+     # These are typically passed directly to the chat completion call or via extra_args
+
+
  # --- Union Type ---
  LayoutOptions = Union[
      YOLOLayoutOptions,
@@ -87,5 +97,6 @@ LayoutOptions = Union[
      PaddleLayoutOptions,
      SuryaLayoutOptions,
      DoclingLayoutOptions,
+     GeminiLayoutOptions,
      BaseLayoutOptions, # Include base for typing flexibility
  ]
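With the detector registered in LayoutManager and GeminiLayoutOptions added to the LayoutOptions union, the engine should be reachable through the simple layout-analysis path. A hedged sketch, assuming the page-level analyze_layout call forwards the simple-mode kwargs listed in SIMPLE_MODE_ALLOWED_KWARGS (engine, confidence, classes, exclude_classes, device) and that "example.pdf" is a placeholder document:

    from natural_pdf import PDF

    pdf = PDF("example.pdf")  # placeholder path
    page = pdf.pages[0]

    # Only simple-mode kwargs are used here; engine-specific settings
    # would go through GeminiLayoutOptions instead.
    regions = page.analyze_layout(
        engine="gemini",
        classes=["title", "table", "figure"],
        confidence=0.3,
    )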
natural_pdf/analyzers/layout/yolo.py CHANGED
@@ -91,7 +91,9 @@ class YOLODocLayoutDetector(LayoutDetector):
      def _load_model_from_options(self, options: YOLOLayoutOptions) -> Any:
          """Load the YOLOv10 model based on options."""
          if not self.is_available():
-             raise RuntimeError("YOLO dependencies (doclayout_yolo, huggingface_hub) not installed.")
+             raise RuntimeError(
+                 "YOLO dependencies not installed. Please run: pip install 'natural-pdf[layout_yolo]'"
+             )
          self.logger.info(f"Loading YOLO model: {options.model_repo}/{options.model_file}")
          try:
              model_path = hf_hub_download(repo_id=options.model_repo, filename=options.model_file)
@@ -105,7 +107,9 @@ class YOLODocLayoutDetector(LayoutDetector):
      def detect(self, image: Image.Image, options: BaseLayoutOptions) -> List[Dict[str, Any]]:
          """Detect layout elements in an image using YOLO."""
          if not self.is_available():
-             raise RuntimeError("YOLO dependencies (doclayout_yolo, huggingface_hub) not installed.")
+             raise RuntimeError(
+                 "YOLO dependencies not installed. Please run: pip install 'natural-pdf[layout_yolo]'"
+             )
 
          # Ensure options are the correct type, falling back to defaults if base type passed
          if not isinstance(options, YOLOLayoutOptions):
natural_pdf/collections/pdf_collection.py CHANGED
@@ -267,6 +267,27 @@ class PDFCollection(SearchableMixin): # Inherit from the mixin
          # Implementation requires integrating with classification models or logic
          raise NotImplementedError("categorize requires classification implementation.")
 
+     def export_ocr_correction_task(self, output_zip_path: str, **kwargs):
+         """
+         Exports OCR results from all PDFs in this collection into a single
+         correction task package (zip file).
+
+         Args:
+             output_zip_path: The path to save the output zip file.
+             **kwargs: Additional arguments passed to create_correction_task_package
+                       (e.g., image_render_scale, overwrite).
+         """
+         try:
+             from natural_pdf.utils.packaging import create_correction_task_package
+             # Pass the collection itself (self) as the source
+             create_correction_task_package(source=self, output_zip_path=output_zip_path, **kwargs)
+         except ImportError:
+             logger.error("Failed to import 'create_correction_task_package'. Packaging utility might be missing.")
+             # Or raise
+         except Exception as e:
+             logger.error(f"Failed to export correction task for collection: {e}", exc_info=True)
+             raise # Re-raise the exception from the utility function
+
      # --- Mixin Required Implementation ---
      def get_indexable_items(self) -> Iterable[Indexable]:
          """Yields Page objects from the collection, conforming to Indexable."""
natural_pdf/core/element_manager.py CHANGED
@@ -312,6 +312,7 @@ class ElementManager:
 
          Args:
              ocr_results: List of OCR results dictionaries with 'text', 'bbox', 'confidence'.
+                          Confidence can be None for detection-only results.
              scale_x: Factor to convert image x-coordinates to PDF coordinates.
              scale_y: Factor to convert image y-coordinates to PDF coordinates.
 
@@ -356,9 +357,14 @@ class ElementManager:
                  pdf_bottom = bottom_img * scale_y
                  pdf_height = (bottom_img - top_img) * scale_y
 
+                 # Handle potential None confidence
+                 raw_confidence = result.get("confidence")
+                 confidence_value = float(raw_confidence) if raw_confidence is not None else None # Keep None if it was None
+                 ocr_text = result.get("text") # Get text, will be None if detect_only
+
                  # Create the TextElement for the word
                  word_element_data = {
-                     "text": result["text"],
+                     "text": ocr_text,
                      "x0": pdf_x0,
                      "top": pdf_top,
                      "x1": pdf_x1,
@@ -367,7 +373,7 @@ class ElementManager:
                      "height": pdf_height,
                      "object_type": "word", # Treat OCR results as whole words
                      "source": "ocr",
-                     "confidence": float(result.get("confidence", 0.0)),
+                     "confidence": confidence_value, # Use the handled confidence
                      "fontname": "OCR", # Use consistent OCR fontname
                      "size": (
                          round(pdf_height) if pdf_height > 0 else 10.0
@@ -385,7 +391,7 @@ class ElementManager:
                  ocr_char_dict.setdefault("adv", ocr_char_dict.get("width", 0))
 
                  # Add the char dict list to the word data before creating TextElement
-                 word_element_data["_char_dicts"] = [ocr_char_dict]
+                 word_element_data["_char_dicts"] = [ocr_char_dict] # Store itself as its only char
                  word_elem = TextElement(word_element_data, self._page)
                  added_word_elements.append(word_elem)
 
@@ -393,16 +399,13 @@ class ElementManager:
                  # Append the word element to the manager's list
                  self._elements["words"].append(word_elem)
 
-                 # Also create and append a representative character dictionary
-                 # for consistency if someone iterates through manager.chars later.
-                 # This char dict represents the entire OCR word as a single 'char'.
-                 char_dict_data = ocr_char_dict # Use the one we already created
-                 char_dict_data["object_type"] = "char" # Mark as char type
-                 # pdfplumber char dicts don't typically have width/height/doctop,
-                 # but keeping them won't hurt WordExtractor if it encounters them.
-                 char_dict_data.setdefault("adv", char_dict_data.get("width", 0))
-
-                 self._elements["chars"].append(char_dict_data) # Append the dictionary
+                 # Only add a representative char dict if text actually exists
+                 if ocr_text is not None:
+                     # This char dict represents the entire OCR word as a single 'char'.
+                     char_dict_data = ocr_char_dict # Use the one we already created
+                     char_dict_data["object_type"] = "char" # Mark as char type
+                     char_dict_data.setdefault("adv", char_dict_data.get("width", 0))
+                     self._elements["chars"].append(char_dict_data) # Append the dictionary
 
              except (KeyError, ValueError, TypeError) as e:
                  logger.error(f"Failed to process OCR result: {result}. Error: {e}", exc_info=True)