anymd 0.0.11 → 0.0.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "anymd",
3
- "version": "0.0.11",
3
+ "version": "0.0.12",
4
4
  "description": "Convert any document (PDF, DOC, DOCX) to clean Markdown for RAG",
5
5
  "keywords": [
6
6
  "markdown",
@@ -2,12 +2,13 @@ import gc
2
2
  import json
3
3
  import sys
4
4
  import time
5
+ from concurrent.futures import ThreadPoolExecutor
5
6
  from dataclasses import dataclass, field
6
7
  from pathlib import Path
7
8
 
8
9
  import pypdfium2 as pdfium
9
10
  import pypdfium2.raw as pdfium_c
10
- from PIL import Image
11
+ from PIL import Image, ImageChops, ImageOps
11
12
 
12
13
 
13
14
  def _get_arg(flag: str, default: str) -> str:
@@ -27,6 +28,16 @@ LOG_FILE = Path(_get_arg('--log-file', 'output/ocr-log.txt'))
27
28
  MODEL_ID = 'mlx-community/chandra-4bit'
28
29
  IMAGE_DPI = 150
29
30
  MAX_TOKENS = 8192
31
+ CROP_PADDING = 20
32
+ CROP_THRESHOLD = 10
33
+ VISION_MAX_PIXELS = 300_000
34
+
35
+ DIACRITICS = frozenset(
36
+ 'àáảãạăắằẳẵặâấầẩẫậèéẻẽẹêếềểễệìíỉĩịòóỏõọôốồổỗộơớờởỡợùúủũụưứừửữựỳýỷỹỵđ'
37
+ 'ÀÁẢÃẠĂẮẰẲẴẶÂẤẦẨẪẬÈÉẺẼẸÊẾỀỂỄỆÌÍỈĨỊÒÓỎÕỌÔỐỒỔỖỘƠỚỜỞỠỢÙÚỦŨỤƯỨỪỬỮỰỲÝỶỸỴĐ'
38
+ )
39
+ MIN_DIACRITICS_RATIO = 0.15
40
+ MIN_CHARS_FOR_PAGE = 50
30
41
 
31
42
 
32
43
  def _to_output_name(pdf_path: str) -> str:
@@ -37,12 +48,7 @@ def _to_output_name(pdf_path: str) -> str:
37
48
  return str(rel.with_suffix('')).replace('/', '--')
38
49
 
39
50
 
40
- OCR_PROMPT = """
41
- OCR this image to markdown.
42
- Output clean, readable markdown preserving the document structure.
43
- Use proper headings, lists, tables, and formatting.
44
- For math expressions, use LaTeX notation with $ delimiters.
45
- """.strip()
51
+ OCR_PROMPT = 'OCR this image to markdown. Preserve structure: headings, lists, tables. LaTeX $ for math.'
46
52
 
47
53
 
48
54
  def _log(msg: str) -> None:
@@ -109,6 +115,42 @@ def _flatten_page(page: object) -> None:
109
115
  NATIVE_TEXT_THRESHOLD = 50
110
116
 
111
117
 
118
+ def _crop_margins(img: Image.Image) -> Image.Image:
119
+ bg = Image.new('RGB', img.size, (255, 255, 255))
120
+ diff = ImageChops.difference(img, bg)
121
+ gray = ImageOps.grayscale(diff)
122
+ bbox = gray.point(lambda x: 255 if x > CROP_THRESHOLD else 0).getbbox()
123
+ bg.close()
124
+ diff.close()
125
+ gray.close()
126
+ if not bbox:
127
+ return img
128
+ x0 = max(0, bbox[0] - CROP_PADDING)
129
+ y0 = max(0, bbox[1] - CROP_PADDING)
130
+ x1 = min(img.size[0], bbox[2] + CROP_PADDING)
131
+ y1 = min(img.size[1], bbox[3] + CROP_PADDING)
132
+ cropped = img.crop((x0, y0, x1, y1))
133
+ img.close()
134
+ return cropped
135
+
136
+
137
+ def _tesseract_ocr(image: Image.Image) -> str:
138
+ import pytesseract # noqa: PLC0415
139
+
140
+ return pytesseract.image_to_string(image, lang='vie')
141
+
142
+
143
+ def _tesseract_quality_ok(text: str) -> bool:
144
+ alpha_count = sum(1 for c in text if c.isalpha())
145
+ if alpha_count < MIN_CHARS_FOR_PAGE:
146
+ return False
147
+ diacritics_count = 0
148
+ for ch in text:
149
+ if ch in DIACRITICS:
150
+ diacritics_count += 1
151
+ return diacritics_count / alpha_count >= MIN_DIACRITICS_RATIO
152
+
153
+
112
154
  def _render_page(doc: pdfium.PdfDocument, page_idx: int) -> tuple[Image.Image | None, str]:
113
155
  page_obj = doc[page_idx]
114
156
  text = page_obj.get_textpage().get_text_range()
@@ -118,27 +160,25 @@ def _render_page(doc: pdfium.PdfDocument, page_idx: int) -> tuple[Image.Image |
118
160
  return None, text.strip()
119
161
 
120
162
  min_dim = min(page_obj.get_width(), page_obj.get_height())
121
- scale_dpi = max((1024 / min_dim) * 72, IMAGE_DPI)
163
+ scale_dpi = max((768 / min_dim) * 72, IMAGE_DPI)
122
164
  _flatten_page(page_obj)
123
165
  page_obj = doc[page_idx]
124
166
  pil_image: Image.Image = page_obj.render(scale=scale_dpi / 72).to_pil().convert('RGB')
125
- return pil_image, ''
167
+ return _crop_margins(pil_image), ''
126
168
 
127
169
 
128
- def _ocr_page(
170
+ def _chandra_ocr(
129
171
  model: object,
130
172
  processor: object,
131
- config: dict[str, object],
173
+ formatted_prompt: str,
132
174
  image: Image.Image,
133
175
  ) -> str:
134
176
  from mlx_vlm import generate # noqa: PLC0415
135
- from mlx_vlm.prompt_utils import apply_chat_template # noqa: PLC0415
136
177
 
137
- formatted = apply_chat_template(processor, config, OCR_PROMPT, num_images=1)
138
178
  result = generate(
139
179
  model, # type: ignore[arg-type]
140
180
  processor, # type: ignore[arg-type]
141
- formatted, # type: ignore[arg-type]
181
+ formatted_prompt, # type: ignore[arg-type]
142
182
  image, # type: ignore[arg-type]
143
183
  max_tokens=MAX_TOKENS,
144
184
  temperature=0.0,
@@ -147,11 +187,44 @@ def _ocr_page(
147
187
  return result.text
148
188
 
149
189
 
150
- def _ocr_one_file( # noqa: PLR0913, PLR0917
190
+ @dataclass
191
+ class ChandraState:
192
+ model: object = None
193
+ processor: object = None
194
+ formatted_prompt: str = ''
195
+ loaded: bool = False
196
+
197
+
198
+ def _load_chandra(state: ChandraState) -> None:
199
+ if state.loaded:
200
+ return
201
+ _log(f'Loading MLX model {MODEL_ID}...')
202
+ t0 = time.time()
203
+ from mlx_vlm import load # noqa: PLC0415
204
+ from mlx_vlm.prompt_utils import apply_chat_template # noqa: PLC0415
205
+ from mlx_vlm.utils import load_config # noqa: PLC0415
206
+
207
+ state.model, state.processor = load(MODEL_ID)
208
+ config = load_config(MODEL_ID)
209
+ state.formatted_prompt = apply_chat_template(state.processor, config, OCR_PROMPT, num_images=1)
210
+ ip = state.processor.image_processor
211
+ ip.max_pixels = VISION_MAX_PIXELS
212
+ ip.min_pixels = VISION_MAX_PIXELS // 4
213
+ state.loaded = True
214
+ _log(f'Model loaded in {time.time() - t0:.0f}s')
215
+
216
+
217
+ @dataclass
218
+ class PageCounts:
219
+ tess: int = 0
220
+ vlm: int = 0
221
+ txt: int = 0
222
+
223
+
224
+ def _ocr_one_file( # noqa: PLR0913, PLR0915, PLR0917
151
225
  pdf: str,
152
- model: object,
153
- processor: object,
154
- config: dict[str, object],
226
+ chandra: ChandraState,
227
+ counts: PageCounts,
155
228
  done_count: int,
156
229
  idx: int,
157
230
  pending_len: int,
@@ -164,52 +237,76 @@ def _ocr_one_file( # noqa: PLR0913, PLR0917
164
237
  unique_name = _to_output_name(pdf)
165
238
  display_name = Path(pdf).stem
166
239
  file_start = time.time()
240
+ md_path = OUTPUT_BASE / f'{unique_name}.md'
241
+ tmp_path = OUTPUT_BASE / f'{unique_name}.md.tmp'
167
242
  doc = pdfium.PdfDocument(pdf)
168
243
  try:
169
244
  doc.init_forms()
170
245
  num_pages = len(doc)
171
246
  _log(f'[{idx}/{pending_len}] ({done_count + idx}/{total}) OCR {display_name} ({num_pages}p)')
172
- all_markdown: list[str] = []
173
-
174
- for p_idx in range(num_pages):
175
- _save_progress(
176
- Progress(
177
- done=done_count + idx - 1,
178
- total=total,
179
- errors=errors,
180
- current_file=display_name,
181
- current_page=f'{p_idx + 1}/{num_pages}',
182
- current_pages_total=num_pages,
183
- current_file_started=file_start,
184
- elapsed=time.time() - pipeline_start,
185
- avg_per_file=avg,
186
- recent_files=recent_files,
187
- )
188
- )
189
-
190
- page_t = time.time()
191
- image, native_text = _render_page(doc, p_idx)
192
-
193
- if image is not None:
194
- md = _ocr_page(model, processor, config, image)
195
- image.close()
196
- tag = 'ocr'
197
- else:
198
- md = native_text
199
- tag = 'txt'
200
247
 
201
- all_markdown.append(md)
202
- _log(f' p{p_idx + 1}/{num_pages} [{tag}] {time.time() - page_t:.0f}s ({len(md)} chars)')
203
-
204
- if p_idx % 5 == 0:
205
- _free_memory()
248
+ executor = ThreadPoolExecutor(max_workers=1)
249
+ with tmp_path.open('w', encoding='utf-8') as out:
250
+ next_future = executor.submit(_render_page, doc, 0)
206
251
 
207
- md_path = OUTPUT_BASE / f'{unique_name}.md'
208
- md_path.write_text('\n\n'.join(all_markdown), encoding='utf-8')
252
+ for p_idx in range(num_pages):
253
+ _save_progress(
254
+ Progress(
255
+ done=done_count + idx - 1,
256
+ total=total,
257
+ errors=errors,
258
+ current_file=display_name,
259
+ current_page=f'{p_idx + 1}/{num_pages}',
260
+ current_pages_total=num_pages,
261
+ current_file_started=file_start,
262
+ elapsed=time.time() - pipeline_start,
263
+ avg_per_file=avg,
264
+ recent_files=recent_files,
265
+ )
266
+ )
209
267
 
210
- del all_markdown
268
+ page_t = time.time()
269
+ image, native_text = next_future.result()
270
+
271
+ if p_idx + 1 < num_pages:
272
+ next_future = executor.submit(_render_page, doc, p_idx + 1)
273
+
274
+ if image is not None:
275
+ tess_text = _tesseract_ocr(image)
276
+ if _tesseract_quality_ok(tess_text):
277
+ md = tess_text
278
+ tag = 'tess'
279
+ counts.tess += 1
280
+ else:
281
+ _load_chandra(chandra)
282
+ md = _chandra_ocr(chandra.model, chandra.processor, chandra.formatted_prompt, image)
283
+ tag = 'vlm'
284
+ counts.vlm += 1
285
+ image.close()
286
+ else:
287
+ md = native_text
288
+ tag = 'txt'
289
+ counts.txt += 1
290
+
291
+ if p_idx > 0:
292
+ out.write('\n\n')
293
+ out.write(md)
294
+ out.flush()
295
+ _log(f' p{p_idx + 1}/{num_pages} [{tag}] {time.time() - page_t:.0f}s ({len(md)} chars)')
296
+
297
+ if p_idx % 10 == 9: # noqa: PLR2004
298
+ _free_memory()
299
+
300
+ executor.shutdown(wait=False)
301
+
302
+ tmp_path.rename(md_path)
211
303
  _free_memory()
212
- return num_pages
304
+ return num_pages # noqa: TRY300
305
+ except Exception:
306
+ for p in [tmp_path, md_path]:
307
+ if p.exists():
308
+ p.unlink()
309
+ raise
213
310
  finally:
214
311
  doc.close()
215
312
 
@@ -238,14 +335,9 @@ def _run_ocr() -> None: # noqa: PLR0914
238
335
  _save_progress(Progress(total, total, 0, '-', '-', 0, 0, 0, 0, []))
239
336
  return
240
337
 
241
- _log(f'Loading MLX model {MODEL_ID}...')
242
- t0 = time.time()
243
- from mlx_vlm import load # noqa: PLC0415
244
- from mlx_vlm.utils import load_config # noqa: PLC0415
245
-
246
- model, processor = load(MODEL_ID)
247
- config = load_config(MODEL_ID)
248
- _log(f'Model loaded in {time.time() - t0:.0f}s')
338
+ chandra = ChandraState()
339
+ counts = PageCounts()
340
+ _log('Hybrid OCR: Tesseract fast-pass, chandra VLM fallback')
249
341
 
250
342
  errors = 0
251
343
  file_times: list[float] = []
@@ -263,7 +355,17 @@ def _run_ocr() -> None: # noqa: PLR0914
263
355
 
264
356
  try:
265
357
  num_pages = _ocr_one_file(
266
- pdf, model, processor, config, done_count, i, len(pending), total, errors, avg, pipeline_start, recent_files
358
+ pdf,
359
+ chandra,
360
+ counts,
361
+ done_count,
362
+ i,
363
+ len(pending),
364
+ total,
365
+ errors,
366
+ avg,
367
+ pipeline_start,
368
+ recent_files,
267
369
  )
268
370
  elapsed = time.time() - t1
269
371
  file_times.append(elapsed)
@@ -297,6 +399,7 @@ def _run_ocr() -> None: # noqa: PLR0914
297
399
 
298
400
  total_time = _format_duration(time.time() - pipeline_start)
299
401
  _log(f'OCR complete. Done: {done_count + len(pending)}, Errors: {errors}, Time: {total_time}')
402
+ _log(f'Engine stats: tess={counts.tess} vlm={counts.vlm} txt={counts.txt}')
300
403
 
301
404
 
302
405
  def main() -> None:
@@ -0,0 +1,266 @@
1
+ import json
2
+ import os
3
+ import sys
4
+ import time
5
+ from pathlib import Path
6
+
7
+ os.environ['PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK'] = 'True'
8
+
9
+ import pypdfium2 as pdfium
10
+ import pypdfium2.raw as pdfium_c
11
+ from PIL import Image, ImageChops, ImageOps
12
+
13
+ CROP_PADDING = 20
14
+ CROP_THRESHOLD = 10
15
+ IMAGE_DPI = 150
16
+
17
+ DIACRITICS = frozenset(
18
+ 'àáảãạăắằẳẵặâấầẩẫậèéẻẽẹêếềểễệìíỉĩịòóỏõọôốồổỗộơớờởỡợùúủũụưứừửữựỳýỷỹỵđ'
19
+ 'ÀÁẢÃẠĂẮẰẲẴẶÂẤẦẨẪẬÈÉẺẼẸÊẾỀỂỄỆÌÍỈĨỊÒÓỎÕỌÔỐỒỔỖỘƠỚỜỞỠỢÙÚỦŨỤƯỨỪỬỮỰỲÝỶỸỴĐ'
20
+ )
21
+
22
+
23
+ def _get_arg(flag: str, default: str) -> str:
24
+ try:
25
+ idx = sys.argv.index(flag)
26
+ return sys.argv[idx + 1]
27
+ except (ValueError, IndexError):
28
+ return default
29
+
30
+
31
+ PDF_PATH = _get_arg('--pdf', '')
32
+ PAGES_STR = _get_arg('--pages', '0,1,2')
33
+ ENGINES_STR = _get_arg('--engines', 'tesseract,paddleocr,easyocr,surya,vietocr')
34
+
35
+ if not PDF_PATH:
36
+ print(
37
+ 'Usage: python benchmark-ocr.py --pdf <path> [--pages 0,1,2] [--engines tesseract,paddleocr,easyocr,surya,vietocr]'
38
+ )
39
+ sys.exit(1)
40
+
41
+ PAGE_INDICES = [int(p.strip()) for p in PAGES_STR.split(',')]
42
+ ENGINES = [e.strip() for e in ENGINES_STR.split(',')]
43
+
44
+
45
+ def _crop_margins(img: Image.Image) -> Image.Image:
46
+ bg = Image.new('RGB', img.size, (255, 255, 255))
47
+ diff = ImageChops.difference(img, bg)
48
+ gray = ImageOps.grayscale(diff)
49
+ bbox = gray.point(lambda x: 255 if x > CROP_THRESHOLD else 0).getbbox()
50
+ bg.close()
51
+ diff.close()
52
+ gray.close()
53
+ if not bbox:
54
+ return img
55
+ x0 = max(0, bbox[0] - CROP_PADDING)
56
+ y0 = max(0, bbox[1] - CROP_PADDING)
57
+ x1 = min(img.size[0], bbox[2] + CROP_PADDING)
58
+ y1 = min(img.size[1], bbox[3] + CROP_PADDING)
59
+ return img.crop((x0, y0, x1, y1))
60
+
61
+
62
+ def render_page(doc: pdfium.PdfDocument, page_idx: int) -> Image.Image:
63
+ page_obj = doc[page_idx]
64
+ pdfium_c.FPDFPage_Flatten(page_obj, pdfium_c.FLAT_NORMALDISPLAY)
65
+ page_obj = doc[page_idx]
66
+ min_dim = min(page_obj.get_width(), page_obj.get_height())
67
+ scale_dpi = max((768 / min_dim) * 72, IMAGE_DPI)
68
+ pil_image: Image.Image = page_obj.render(scale=scale_dpi / 72).to_pil().convert('RGB')
69
+ return _crop_margins(pil_image)
70
+
71
+
72
+ def _count_vietnamese_diacritics(text: str) -> int:
73
+ count = 0
74
+ for ch in text:
75
+ if ch in DIACRITICS:
76
+ count += 1
77
+ return count
78
+
79
+
80
+ def run_tesseract(image: Image.Image, _state: dict) -> str:
81
+ import pytesseract # noqa: PLC0415
82
+
83
+ return pytesseract.image_to_string(image, lang='vie')
84
+
85
+
86
+ def run_paddleocr(image: Image.Image, _state: dict) -> str:
87
+ import numpy as np # noqa: PLC0415
88
+
89
+ if 'engine' not in _state:
90
+ from paddleocr import PaddleOCR # noqa: PLC0415
91
+
92
+ _state['engine'] = PaddleOCR(lang='vi')
93
+
94
+ img_array = np.array(image)
95
+ lines: list[str] = []
96
+ for page_result in _state['engine'].predict(img_array):
97
+ j = page_result.json
98
+ res = j.get('res', {})
99
+ if 'rec_texts' in res:
100
+ lines.extend(res['rec_texts'])
101
+ return '\n'.join(lines)
102
+
103
+
104
+ def run_easyocr(image: Image.Image, _state: dict) -> str:
105
+ import numpy as np # noqa: PLC0415
106
+
107
+ if 'reader' not in _state:
108
+ import easyocr # noqa: PLC0415
109
+
110
+ _state['reader'] = easyocr.Reader(['vi'], gpu=False)
111
+
112
+ img_array = np.array(image)
113
+ results = _state['reader'].readtext(img_array)
114
+ lines = []
115
+ for _bbox, text, _conf in results:
116
+ lines.append(text)
117
+ return '\n'.join(lines)
118
+
119
+
120
+ def run_surya(image: Image.Image, _state: dict) -> str:
121
+ if 'rec' not in _state:
122
+ from surya.detection import DetectionPredictor # noqa: PLC0415
123
+ from surya.foundation import FoundationPredictor # noqa: PLC0415
124
+ from surya.recognition import RecognitionPredictor # noqa: PLC0415
125
+
126
+ foundation = FoundationPredictor()
127
+ _state['det'] = DetectionPredictor()
128
+ _state['rec'] = RecognitionPredictor(foundation)
129
+
130
+ predictions = _state['rec']([image], det_predictor=_state['det'])
131
+ return '\n'.join(tl.text for tl in predictions[0].text_lines)
132
+
133
+
134
+ def run_vietocr(image: Image.Image, _state: dict) -> str:
135
+ if 'detector' not in _state:
136
+ import torch # noqa: PLC0415
137
+ from vietocr.tool.config import Cfg # noqa: PLC0415
138
+ from vietocr.tool.predictor import Predictor # noqa: PLC0415
139
+
140
+ config = Cfg.load_config_from_name('vgg_transformer')
141
+ config['cnn']['pretrained'] = False
142
+ if torch.backends.mps.is_available():
143
+ config['device'] = 'mps'
144
+ else:
145
+ config['device'] = 'cpu'
146
+ _state['detector'] = Predictor(config)
147
+
148
+ return _state['detector'].predict(image)
149
+
150
+
151
+ ENGINE_RUNNERS = {
152
+ 'tesseract': run_tesseract,
153
+ 'paddleocr': run_paddleocr,
154
+ 'easyocr': run_easyocr,
155
+ 'surya': run_surya,
156
+ 'vietocr': run_vietocr,
157
+ }
158
+
159
+
160
+ def main() -> None: # noqa: C901, PLR0912, PLR0914, PLR0915
161
+ doc = pdfium.PdfDocument(PDF_PATH)
162
+ num_pages = len(doc)
163
+ print(f'PDF: {PDF_PATH} ({num_pages} pages)')
164
+ print(f'Testing pages: {PAGE_INDICES}')
165
+ print(f'Engines: {ENGINES}')
166
+ print('=' * 80)
167
+
168
+ images: dict[int, Image.Image] = {}
169
+ for p_idx in PAGE_INDICES:
170
+ if p_idx >= num_pages:
171
+ print(f' SKIP page {p_idx} (only {num_pages} pages)')
172
+ else:
173
+ t0 = time.time()
174
+ images[p_idx] = render_page(doc, p_idx)
175
+ render_time = time.time() - t0
176
+ img = images[p_idx]
177
+ print(f' Rendered page {p_idx}: {img.size[0]}x{img.size[1]} in {render_time:.1f}s')
178
+
179
+ doc.close()
180
+ print()
181
+
182
+ states: dict[str, dict] = {}
183
+ results: list[dict] = []
184
+
185
+ for engine_name in ENGINES:
186
+ if engine_name not in ENGINE_RUNNERS:
187
+ print(f'Unknown engine: {engine_name}')
188
+ continue
189
+
190
+ states.setdefault(engine_name, {})
191
+ runner = ENGINE_RUNNERS[engine_name]
192
+ print(f'=== {engine_name.upper()} ===')
193
+
194
+ for p_idx in sorted(images.keys()):
195
+ img = images[p_idx]
196
+ print(f' page {p_idx} ... ', end='', flush=True)
197
+
198
+ t0 = time.time()
199
+ try:
200
+ text = runner(img, states[engine_name])
201
+ elapsed = time.time() - t0
202
+ char_count = len(text)
203
+ alpha_count = sum(1 for c in text if c.isalpha())
204
+ diacritics_count = _count_vietnamese_diacritics(text)
205
+ print(f'{elapsed:.1f}s | {char_count} chars | {alpha_count} alpha | {diacritics_count} diacritics')
206
+
207
+ results.append({
208
+ 'engine': engine_name,
209
+ 'page': p_idx,
210
+ 'time_s': round(elapsed, 2),
211
+ 'chars': char_count,
212
+ 'alpha': alpha_count,
213
+ 'diacritics': diacritics_count,
214
+ 'text_preview': text[:300].replace('\n', ' '),
215
+ 'text_full': text,
216
+ })
217
+ except Exception as run_err: # noqa: BLE001
218
+ elapsed = time.time() - t0
219
+ print(f'ERROR {elapsed:.1f}s: {run_err}')
220
+ results.append({
221
+ 'engine': engine_name,
222
+ 'page': p_idx,
223
+ 'time_s': round(elapsed, 2),
224
+ 'error': str(run_err),
225
+ })
226
+
227
+ print()
228
+ print('=' * 80)
229
+ print('SUMMARY')
230
+ print('=' * 80)
231
+ header = f'{"Engine":<12} {"Page":<6} {"Time":>8} {"Chars":>8} {"Alpha":>8} {"Diacritics":>10} Preview'
232
+ print(header)
233
+ print('-' * 100)
234
+ for r in results:
235
+ if 'error' in r:
236
+ print(f'{r["engine"]:<12} {r["page"]:<6} {r["time_s"]:>7.1f}s {"ERROR":>8}')
237
+ else:
238
+ preview = r['text_preview'][:60]
239
+ print(
240
+ f'{r["engine"]:<12} {r["page"]:<6} {r["time_s"]:>7.1f}s'
241
+ f' {r["chars"]:>8} {r["alpha"]:>8} {r["diacritics"]:>10}'
242
+ f' {preview}'
243
+ )
244
+
245
+ out_path = Path(PDF_PATH).parent / 'benchmark-results.json'
246
+ summary = []
247
+ for r in results:
248
+ s = dict(r)
249
+ s.pop('text_full', None)
250
+ summary.append(s)
251
+ out_path.write_text(json.dumps(summary, indent=2, ensure_ascii=False) + '\n', encoding='utf-8')
252
+ print(f'\nResults saved to {out_path}')
253
+
254
+ print('\n\nFULL TEXT OUTPUT PER ENGINE/PAGE:')
255
+ print('=' * 80)
256
+ for r in results:
257
+ if 'text_full' in r:
258
+ print(f'\n--- [{r["engine"]}] page {r["page"]} ({r["time_s"]}s, {r["chars"]} chars) ---')
259
+ full = r['text_full']
260
+ print(full[:2000])
261
+ if len(full) > 2000: # noqa: PLR2004
262
+ print(f'... (truncated, {len(full)} total chars)')
263
+
264
+
265
+ if __name__ == '__main__':
266
+ main()