natural-pdf 25.3.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109) hide show
  1. examples/__init__.py +3 -0
  2. examples/another_exclusion_example.py +20 -0
  3. examples/basic_usage.py +190 -0
  4. examples/boundary_exclusion_test.py +137 -0
  5. examples/boundary_inclusion_fix_test.py +157 -0
  6. examples/chainable_layout_example.py +70 -0
  7. examples/color_basic_test.py +49 -0
  8. examples/color_name_example.py +71 -0
  9. examples/color_test.py +62 -0
  10. examples/debug_ocr.py +91 -0
  11. examples/direct_ocr_test.py +148 -0
  12. examples/direct_paddle_test.py +99 -0
  13. examples/direct_qa_example.py +165 -0
  14. examples/document_layout_analysis.py +123 -0
  15. examples/document_qa_example.py +185 -0
  16. examples/exclusion_count_debug.py +128 -0
  17. examples/exclusion_debug.py +107 -0
  18. examples/exclusion_example.py +150 -0
  19. examples/exclusion_optimization_example.py +190 -0
  20. examples/extract_text_test.py +128 -0
  21. examples/font_aware_example.py +101 -0
  22. examples/font_variant_example.py +124 -0
  23. examples/footer_overlap_test.py +124 -0
  24. examples/highlight_all_example.py +82 -0
  25. examples/highlight_attributes_test.py +114 -0
  26. examples/highlight_confidence_display.py +122 -0
  27. examples/highlight_demo.py +110 -0
  28. examples/highlight_float_test.py +71 -0
  29. examples/highlight_test.py +147 -0
  30. examples/highlighting_example.py +123 -0
  31. examples/image_width_example.py +84 -0
  32. examples/improved_api_example.py +128 -0
  33. examples/layout_confidence_display_test.py +65 -0
  34. examples/layout_confidence_test.py +82 -0
  35. examples/layout_coordinate_debug.py +258 -0
  36. examples/layout_highlight_test.py +77 -0
  37. examples/logging_example.py +70 -0
  38. examples/ocr_comprehensive.py +193 -0
  39. examples/ocr_debug_example.py +87 -0
  40. examples/ocr_default_test.py +97 -0
  41. examples/ocr_engine_comparison.py +235 -0
  42. examples/ocr_example.py +89 -0
  43. examples/ocr_simplified_params.py +79 -0
  44. examples/ocr_visualization.py +102 -0
  45. examples/ocr_visualization_test.py +121 -0
  46. examples/paddle_layout_example.py +315 -0
  47. examples/paddle_layout_simple.py +74 -0
  48. examples/paddleocr_example.py +224 -0
  49. examples/page_collection_example.py +103 -0
  50. examples/polygon_highlight_example.py +83 -0
  51. examples/position_methods_example.py +134 -0
  52. examples/region_boundary_test.py +73 -0
  53. examples/region_exclusion_test.py +149 -0
  54. examples/region_expand_example.py +109 -0
  55. examples/region_image_example.py +116 -0
  56. examples/region_ocr_test.py +119 -0
  57. examples/region_sections_example.py +115 -0
  58. examples/school_books.py +49 -0
  59. examples/school_books_all.py +52 -0
  60. examples/scouring.py +36 -0
  61. examples/section_extraction_example.py +232 -0
  62. examples/simple_document_qa.py +97 -0
  63. examples/spatial_navigation_example.py +108 -0
  64. examples/table_extraction_example.py +135 -0
  65. examples/table_structure_detection.py +155 -0
  66. examples/tatr_cells_test.py +56 -0
  67. examples/tatr_ocr_table_test.py +94 -0
  68. examples/text_search_example.py +122 -0
  69. examples/text_style_example.py +110 -0
  70. examples/tiny-text.py +61 -0
  71. examples/until_boundaries_example.py +156 -0
  72. examples/until_example.py +112 -0
  73. examples/very_basics.py +15 -0
  74. natural_pdf/__init__.py +55 -0
  75. natural_pdf/analyzers/__init__.py +9 -0
  76. natural_pdf/analyzers/document_layout.py +736 -0
  77. natural_pdf/analyzers/text_structure.py +153 -0
  78. natural_pdf/core/__init__.py +3 -0
  79. natural_pdf/core/page.py +2376 -0
  80. natural_pdf/core/pdf.py +572 -0
  81. natural_pdf/elements/__init__.py +3 -0
  82. natural_pdf/elements/base.py +553 -0
  83. natural_pdf/elements/collections.py +770 -0
  84. natural_pdf/elements/line.py +124 -0
  85. natural_pdf/elements/rect.py +122 -0
  86. natural_pdf/elements/region.py +1366 -0
  87. natural_pdf/elements/text.py +304 -0
  88. natural_pdf/ocr/__init__.py +62 -0
  89. natural_pdf/ocr/easyocr_engine.py +254 -0
  90. natural_pdf/ocr/engine.py +158 -0
  91. natural_pdf/ocr/paddleocr_engine.py +263 -0
  92. natural_pdf/qa/__init__.py +3 -0
  93. natural_pdf/qa/document_qa.py +405 -0
  94. natural_pdf/selectors/__init__.py +4 -0
  95. natural_pdf/selectors/parser.py +360 -0
  96. natural_pdf/templates/__init__.py +1 -0
  97. natural_pdf/templates/ocr_debug.html +517 -0
  98. natural_pdf/utils/__init__.py +4 -0
  99. natural_pdf/utils/highlighting.py +605 -0
  100. natural_pdf/utils/ocr.py +515 -0
  101. natural_pdf/utils/reading_order.py +227 -0
  102. natural_pdf/utils/visualization.py +151 -0
  103. natural_pdf-25.3.16.dist-info/LICENSE +21 -0
  104. natural_pdf-25.3.16.dist-info/METADATA +268 -0
  105. natural_pdf-25.3.16.dist-info/RECORD +109 -0
  106. natural_pdf-25.3.16.dist-info/WHEEL +5 -0
  107. natural_pdf-25.3.16.dist-info/top_level.txt +3 -0
  108. tests/__init__.py +3 -0
  109. tests/test_pdf.py +39 -0
@@ -0,0 +1,515 @@
1
+ """
2
+ OCR utilities for natural-pdf.
3
+ """
4
+ import base64
5
+ import io
6
+ import json
7
+ import os
8
+ import importlib.util
9
+ import importlib.resources
10
+ import webbrowser
11
+ from typing import Dict, List, Any, Optional, Union, Tuple
12
+ import numpy as np
13
+ from PIL import Image
14
+
15
+
16
class OCRManager:
    """
    Manager class for OCR operations.

    This singleton class handles:
    - OCR engine initialization and caching
    - OCR parameter normalization
    - Detection and recognition operations

    Only the "easyocr" engine is implemented by this manager; any other
    engine name raises ValueError when a reader or an OCR run is requested.
    """
    _instance = None

    # Keyword arguments forwarded to EasyOCR's readtext(), with the fallback
    # values used when the configuration does not supply them.
    _EASYOCR_DETECTION_DEFAULTS = {
        "text_threshold": 0.7,
        "low_text": 0.4,
        "link_threshold": 0.4,
        "canvas_size": 2560,
        "mag_ratio": 1.0,
        "slope_ths": 0.1,
        "ycenter_ths": 0.5,
        "height_ths": 0.5,
        "width_ths": 0.5,
        "add_margin": 0.1,
    }
    _EASYOCR_RECOGNITION_DEFAULTS = {
        "decoder": "greedy",
        "beamWidth": 5,
        "batch_size": 1,
        "workers": 0,
        "allowlist": None,
        "blocklist": None,
        "detail": 1,
        "paragraph": False,
        "min_size": 10,
        "contrast_ths": 0.1,
        "adjust_contrast": 0.5,
    }

    @classmethod
    def get_instance(cls):
        """Get the singleton instance of OCRManager."""
        if cls._instance is None:
            cls._instance = cls()
        return cls._instance

    def __init__(self):
        """Initialize the OCR manager."""
        self._readers = {}  # Cache for initialized OCR engines
        self._default_config = {
            "engine": "paddleocr",  # Default to PaddleOCR
            "languages": ["en"],
            "min_confidence": 0.5
            # Engine-specific parameters can be passed directly
        }

    def _fresh_defaults(self) -> Dict[str, Any]:
        """Return a copy of the default config that is safe to hand out.

        List values are copied so that callers mutating a returned
        configuration cannot alter the manager's defaults.
        """
        return {
            key: (list(value) if isinstance(value, list) else value)
            for key, value in self._default_config.items()
        }

    def normalize_config(self, config: Optional[Union[bool, str, List, Dict]] = None) -> Dict[str, Any]:
        """
        Normalize OCR configuration from various formats.

        Args:
            config: OCR configuration in various formats:
                - None: OCR disabled
                - True: OCR enabled with defaults
                - "auto": Auto OCR mode
                - "easyocr": Use EasyOCR with defaults
                - ["en", "fr"]: Use default engine with these languages
                - {"languages": ["en"]}: Detailed configuration

        Returns:
            Normalized configuration dictionary. Values supplied by the
            caller always take precedence over the defaults. Unsupported
            types (including False) yield {"enabled": False}.
        """
        if config is None:
            return {"enabled": False}

        if config is True:
            return {"enabled": True, **self._fresh_defaults()}

        if isinstance(config, str):
            if config.lower() == "auto":
                return {"enabled": "auto", **self._fresh_defaults()}
            # Any other string is an engine name. It is placed after the
            # defaults so it is not overwritten by the default engine.
            return {"enabled": True, **self._fresh_defaults(), "engine": config.lower()}

        if isinstance(config, list):
            # A list is a list of languages; it must override the default.
            return {"enabled": True, **self._fresh_defaults(), "languages": list(config)}

        if isinstance(config, dict):
            # Start with enabled=True and defaults, then apply the overrides
            result = {"enabled": True, **self._fresh_defaults()}
            result.update(config)
            return result

        # Fallback for unknown types
        return {"enabled": False}

    def merge_configs(self, base_config: Dict[str, Any], override_config: Dict[str, Any]) -> Dict[str, Any]:
        """
        Merge OCR configurations, with override_config taking precedence.

        Nested dictionaries present in both configurations are merged one
        level deep; every other value is replaced. Neither input is modified.

        Args:
            base_config: Base configuration
            override_config: Configuration to override base with

        Returns:
            Merged configuration
        """
        result = base_config.copy()

        for key, value in override_config.items():
            current = result.get(key)
            if isinstance(current, dict) and isinstance(value, dict):
                # Build a new dict instead of updating in place: the shallow
                # copy above still shares nested dicts with base_config.
                result[key] = {**current, **value}
            else:
                result[key] = value

        return result

    def get_reader(self, config: Dict[str, Any]) -> Any:
        """
        Get or initialize an OCR reader based on configuration.

        Readers are cached per engine, language set and GPU flag.

        Args:
            config: OCR configuration. "gpu" is honored when set explicitly;
                when absent the reader runs on CPU.

        Returns:
            OCR reader instance

        Raises:
            ImportError: If the requested engine's package is not installed
            ValueError: If the engine is not supported
        """
        engine = config.get("engine", "easyocr")
        languages = config.get("languages", ["en"])
        use_gpu = bool(config.get("gpu", False))

        # CPU readers keep the plain engine/languages key; GPU readers get a
        # distinct key so the two are never confused in the cache.
        cache_key = f"{engine}_{'-'.join(languages)}"
        if use_gpu:
            cache_key += "_gpu"

        # Return cached reader if available
        if cache_key in self._readers:
            return self._readers[cache_key]

        # Initialize new reader based on engine
        if engine == "easyocr":
            if importlib.util.find_spec("easyocr") is None:
                raise ImportError(
                    "EasyOCR is not installed. Please install it with: pip install easyocr"
                )

            # Imported lazily so the package is only required when used
            import easyocr

            reader = easyocr.Reader(
                languages,
                gpu=use_gpu,
                download_enabled=config.get("download_enabled", True),
                model_storage_directory=config.get("model_storage_directory", None),
                user_network_directory=config.get("user_network_directory", None),
                recog_network=config.get("recog_network", "standard"),
                detector=config.get("detector", True),
                recognizer=config.get("recognizer", True)
            )

            self._readers[cache_key] = reader
            return reader

        # Add other OCR engines here (tesseract, etc.)

        raise ValueError(f"Unsupported OCR engine: {engine}")

    def detect_and_recognize(self, image: "Image.Image", config: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Run full OCR pipeline on an image (detection + recognition).

        Args:
            image: PIL Image to process
            config: OCR configuration

        Returns:
            List of OCR results with text, bbox and confidence

        Raises:
            ValueError: If the configured engine is not supported
        """
        engine = config.get("engine", "easyocr")

        if engine == "easyocr":
            return self._easyocr_detect_and_recognize(image, config)

        # Add other engines here

        raise ValueError(f"Unsupported OCR engine: {engine}")

    @staticmethod
    def _collect_params(config: Dict[str, Any], defaults: Dict[str, Any], nested_key: str) -> Dict[str, Any]:
        """Resolve one group of engine parameters from the configuration.

        Lookup order per parameter: the nested dict under ``nested_key``
        (kept for backward compatibility), then the flat top-level key, then
        the default.
        """
        nested = config.get(nested_key) or {}
        return {
            name: nested.get(name, config.get(name, default))
            for name, default in defaults.items()
        }

    @staticmethod
    def _bounding_rect(points) -> Tuple[Any, Any, Any, Any]:
        """Return the axis-aligned (x0, y0, x1, y1) box enclosing the points."""
        x_coords = [point[0] for point in points]
        y_coords = [point[1] for point in points]
        return (min(x_coords), min(y_coords), max(x_coords), max(y_coords))

    @staticmethod
    def _is_polygon(candidate) -> bool:
        """Tell whether the value is a list of at least four [x, y] lists."""
        return (
            isinstance(candidate, list)
            and len(candidate) >= 4
            and all(isinstance(pt, list) and len(pt) == 2 for pt in candidate)
        )

    @classmethod
    def _standardize_detection(cls, detection) -> Optional[Dict[str, Any]]:
        """Convert one raw EasyOCR result into the standardized dict.

        Returns None when the result cannot be interpreted.
        """
        if isinstance(detection, list) and len(detection) >= 3:
            # Detailed list format: [bbox, text, confidence]
            return {
                'bbox': cls._bounding_rect(detection[0]),
                'text': detection[1],
                'confidence': detection[2]
            }

        if isinstance(detection, str):
            # Simple format (detail=0): no bbox or confidence is available
            return {
                'bbox': (0, 0, 1, 1),  # Dummy bbox
                'text': detection,
                'confidence': 1.0  # Default confidence
            }

        if not hasattr(detection, '__getitem__') or len(detection) < 2:
            print(f"Warning: Unexpected OCR result format: {detection}")
            return None

        text = detection[1] if isinstance(detection[1], str) else str(detection[1])
        # Two-item results carry no confidence, so a neutral 0.5 is used.
        confidence = float(detection[2]) if len(detection) > 2 else 0.5

        polygon = detection[0]
        if cls._is_polygon(polygon):
            # Keep the real geometry even when the confidence is missing,
            # rather than falling back to a dummy bbox.
            return {
                'bbox': cls._bounding_rect(polygon),
                'text': text,
                'confidence': confidence,
                # Store the original polygon points as float tuples
                'polygon': [(float(point[0]), float(point[1])) for point in polygon]
            }

        if isinstance(polygon, list) and all(isinstance(x, (int, float)) for x in polygon):
            # Just a note, not an error
            print(f"Note: Using non-standard OCR format: {detection}")

        return {
            'bbox': (0, 0, 1, 1),  # Dummy bbox
            'text': text,
            'confidence': confidence
        }

    def _easyocr_detect_and_recognize(self, image: "Image.Image", config: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Run EasyOCR on an image.

        Engine parameters are read from the top level of ``config``; the
        nested "detection_params" / "recognition_params" dicts are still
        honored and take precedence when present.

        Args:
            image: PIL Image to process (any other value is passed to
                EasyOCR unchanged)
            config: OCR configuration

        Returns:
            List of OCR results in the form
            {'bbox': (x0, y0, x1, y1), 'text': text, 'confidence': conf},
            with an extra 'polygon' entry when the engine returned one
        """
        reader = self.get_reader(config)

        # Convert PIL Image to numpy array if needed
        img_array = np.array(image) if isinstance(image, Image.Image) else image

        detection_kwargs = self._collect_params(
            config, self._EASYOCR_DETECTION_DEFAULTS, "detection_params"
        )
        recognition_kwargs = self._collect_params(
            config, self._EASYOCR_RECOGNITION_DEFAULTS, "recognition_params"
        )

        # Run OCR
        result = reader.readtext(img_array, **recognition_kwargs, **detection_kwargs)

        # Convert to standardized format, skipping uninterpretable entries
        standardized_results = []
        for detection in result:
            entry = self._standardize_detection(detection)
            if entry is not None:
                standardized_results.append(entry)

        return standardized_results

    def recognize_region(self, image: "Image.Image", config: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Run OCR recognition on a specific region.

        Args:
            image: PIL Image of the region to process
            config: OCR configuration

        Returns:
            List of OCR results with text, bbox and confidence
        """
        # The region is already cropped, so the full pipeline simply
        # detects and recognizes within it.
        return self.detect_and_recognize(image, config)
401
+
402
+
403
+ # Function to load the OCR debug HTML template
404
def get_ocr_debug_template():
    """
    Read the OCR debug HTML template shipped with the package.

    The template is looked up as a package resource first; if that fails
    the file is read from the ``templates`` directory that sits beside
    this module's parent directory (development layout).

    Returns:
        str: The HTML template as a string

    Raises:
        FileNotFoundError: If the template cannot be located by either route
    """
    try:
        try:
            # Preferred API (Python 3.9+)
            resource = importlib.resources.files('natural_pdf.templates').joinpath('ocr_debug.html')
            with resource.open('r', encoding='utf-8') as handle:
                return handle.read()
        except (AttributeError, TypeError):
            # Older interpreters without importlib.resources.files
            return importlib.resources.read_text('natural_pdf.templates', 'ocr_debug.html')
    except (ImportError, FileNotFoundError):
        # Direct file access relative to this module (development)
        module_path = os.path.abspath(__file__)
        package_dir = os.path.dirname(os.path.dirname(module_path))
        template_path = os.path.join(package_dir, 'templates', 'ocr_debug.html')

        if not os.path.exists(template_path):
            raise FileNotFoundError(f"OCR debug template not found at {template_path}")

        with open(template_path, 'r', encoding='utf-8') as handle:
            return handle.read()
431
+
432
+
433
def debug_ocr_to_html(pages, output_path=None):
    """
    Generate an HTML debug report for OCR results.

    Pages whose OCR extraction raises, or that yield no OCR elements, are
    skipped. When ``output_path`` is given the report is written there and
    an attempt is made to open it in the default browser.

    Args:
        pages: List of Page objects or a PageCollection
        output_path: Path to save the HTML report (optional)

    Returns:
        Path to the generated HTML file, or HTML string if no path provided
    """
    pages_data = {"pages": []}

    for i, page in enumerate(pages):
        # Extract OCR elements, running OCR only if none exist yet
        try:
            ocr_elements = page.find_all('text[source=ocr]')
            if not ocr_elements:
                ocr_elements = page.extract_ocr_elements()
        except Exception as e:
            # Best effort: one failing page must not abort the whole report
            print(f"Error extracting OCR from page {i}: {e}")
            continue

        if not ocr_elements:
            continue

        # Get page image as base64
        img_data = _get_page_image_base64(page)

        page_data = {
            "page_number": page.number,
            "image": img_data,
            "regions": []
        }

        for j, elem in enumerate(ocr_elements):
            page_data["regions"].append({
                "id": f"region_{j}",
                "bbox": [elem.x0, elem.top, elem.x1, elem.bottom],
                "ocr_text": elem.text,
                # Starts equal to the OCR text; the report lets it be edited
                "corrected_text": elem.text,
                "confidence": getattr(elem, 'confidence', 0.0),
                "modified": False
            })

        pages_data["pages"].append(page_data)

    # OCR text comes from arbitrary documents. Escaping "</" keeps the JSON
    # value identical for any JSON/JavaScript parser while preventing a
    # literal "</script>" in the text from terminating an inline script
    # element, in case the template embeds the data that way.
    payload = json.dumps(pages_data).replace("</", "<\\/")

    template = get_ocr_debug_template()
    html = template.format(pages_data=payload)

    if output_path:
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(html)

        # Opening a browser is a convenience only; failures are ignored.
        try:
            from pathlib import Path

            # as_uri() yields a well-formed, percent-encoded file URL on
            # every platform, unlike plain string concatenation.
            webbrowser.open(Path(os.path.abspath(output_path)).as_uri())
        except Exception:
            pass
        return output_path

    # Return as string otherwise
    return html
505
+
506
+
507
+ def _get_page_image_base64(page):
508
+ """Generate a base64 encoded image of the page."""
509
+ # Create a clean image of the page
510
+ img = page.show(scale=2.0)
511
+
512
+ # Convert to base64
513
+ buffered = io.BytesIO()
514
+ img.save(buffered, format="PNG")
515
+ return f"data:image/png;base64,{base64.b64encode(buffered.getvalue()).decode('utf-8')}"