npm - anymd - Versions diffs - 0.0.11 → 0.0.12 - Mend

anymd 0.0.11 → 0.0.12

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.

Files changed (3) hide show

package/package.json +1 -1
package/scripts/batch-ocr.py +167 -64
package/scripts/benchmark-ocr.py +266 -0

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "anymd",
-  "version": "0.0.11",
+  "version": "0.0.12",
   "description": "Convert any document (PDF, DOC, DOCX) to clean Markdown for RAG",
   "keywords": [
     "markdown",

package/scripts/batch-ocr.py CHANGED Viewed

@@ -2,12 +2,13 @@ import gc
 import json
 import sys
 import time
+from concurrent.futures import ThreadPoolExecutor
 from dataclasses import dataclass, field
 from pathlib import Path
 import pypdfium2 as pdfium
 import pypdfium2.raw as pdfium_c
-from PIL import Image
+from PIL import Image, ImageChops, ImageOps
 def _get_arg(flag: str, default: str) -> str:
@@ -27,6 +28,16 @@ LOG_FILE = Path(_get_arg('--log-file', 'output/ocr-log.txt'))
 MODEL_ID = 'mlx-community/chandra-4bit'
 IMAGE_DPI = 150
 MAX_TOKENS = 8192
+CROP_PADDING = 20
+CROP_THRESHOLD = 10
+VISION_MAX_PIXELS = 300_000
+DIACRITICS = frozenset(
+  'àáảãạăắằẳẵặâấầẩẫậèéẻẽẹêếềểễệìíỉĩịòóỏõọôốồổỗộơớờởỡợùúủũụưứừửữựỳýỷỹỵđ'
+  'ÀÁẢÃẠĂẮẰẲẴẶÂẤẦẨẪẬÈÉẺẼẸÊẾỀỂỄỆÌÍỈĨỊÒÓỎÕỌÔỐỒỔỖỘƠỚỜỞỠỢÙÚỦŨỤƯỨỪỬỮỰỲÝỶỸỴĐ'
+)
+MIN_DIACRITICS_RATIO = 0.15
+MIN_CHARS_FOR_PAGE = 50
 def _to_output_name(pdf_path: str) -> str:
@@ -37,12 +48,7 @@ def _to_output_name(pdf_path: str) -> str:
   return str(rel.with_suffix('')).replace('/', '--')
-OCR_PROMPT = """
-OCR this image to markdown.
-Output clean, readable markdown preserving the document structure.
-Use proper headings, lists, tables, and formatting.
-For math expressions, use LaTeX notation with $ delimiters.
-""".strip()
+OCR_PROMPT = 'OCR this image to markdown. Preserve structure: headings, lists, tables. LaTeX $ for math.'
 def _log(msg: str) -> None:
@@ -109,6 +115,42 @@ def _flatten_page(page: object) -> None:
 NATIVE_TEXT_THRESHOLD = 50
+def _crop_margins(img: Image.Image) -> Image.Image:
+  bg = Image.new('RGB', img.size, (255, 255, 255))
+  diff = ImageChops.difference(img, bg)
+  gray = ImageOps.grayscale(diff)
+  bbox = gray.point(lambda x: 255 if x > CROP_THRESHOLD else 0).getbbox()
+  bg.close()
+  diff.close()
+  gray.close()
+  if not bbox:
+    return img
+  x0 = max(0, bbox[0] - CROP_PADDING)
+  y0 = max(0, bbox[1] - CROP_PADDING)
+  x1 = min(img.size[0], bbox[2] + CROP_PADDING)
+  y1 = min(img.size[1], bbox[3] + CROP_PADDING)
+  cropped = img.crop((x0, y0, x1, y1))
+  img.close()
+  return cropped
+def _tesseract_ocr(image: Image.Image) -> str:
+  import pytesseract  # noqa: PLC0415
+  return pytesseract.image_to_string(image, lang='vie')
+def _tesseract_quality_ok(text: str) -> bool:
+  alpha_count = sum(1 for c in text if c.isalpha())
+  if alpha_count < MIN_CHARS_FOR_PAGE:
+    return False
+  diacritics_count = 0
+  for ch in text:
+    if ch in DIACRITICS:
+      diacritics_count += 1
+  return diacritics_count / alpha_count >= MIN_DIACRITICS_RATIO
 def _render_page(doc: pdfium.PdfDocument, page_idx: int) -> tuple[Image.Image | None, str]:
   page_obj = doc[page_idx]
   text = page_obj.get_textpage().get_text_range()
@@ -118,27 +160,25 @@ def _render_page(doc: pdfium.PdfDocument, page_idx: int) -> tuple[Image.Image |
     return None, text.strip()
   min_dim = min(page_obj.get_width(), page_obj.get_height())
-  scale_dpi = max((1024 / min_dim) * 72, IMAGE_DPI)
+  scale_dpi = max((768 / min_dim) * 72, IMAGE_DPI)
   _flatten_page(page_obj)
   page_obj = doc[page_idx]
   pil_image: Image.Image = page_obj.render(scale=scale_dpi / 72).to_pil().convert('RGB')
-  return pil_image, ''
+  return _crop_margins(pil_image), ''
-def _ocr_page(
+def _chandra_ocr(
   model: object,
   processor: object,
-  config: dict[str, object],
+  formatted_prompt: str,
   image: Image.Image,
 ) -> str:
   from mlx_vlm import generate  # noqa: PLC0415
-  from mlx_vlm.prompt_utils import apply_chat_template  # noqa: PLC0415
-  formatted = apply_chat_template(processor, config, OCR_PROMPT, num_images=1)
   result = generate(
     model,  # type: ignore[arg-type]
     processor,  # type: ignore[arg-type]
-    formatted,  # type: ignore[arg-type]
+    formatted_prompt,  # type: ignore[arg-type]
     image,  # type: ignore[arg-type]
     max_tokens=MAX_TOKENS,
     temperature=0.0,
@@ -147,11 +187,44 @@ def _ocr_page(
   return result.text
-def _ocr_one_file(  # noqa: PLR0913, PLR0917
+@dataclass
+class ChandraState:
+  model: object = None
+  processor: object = None
+  formatted_prompt: str = ''
+  loaded: bool = False
+def _load_chandra(state: ChandraState) -> None:
+  if state.loaded:
+    return
+  _log(f'Loading MLX model {MODEL_ID}...')
+  t0 = time.time()
+  from mlx_vlm import load  # noqa: PLC0415
+  from mlx_vlm.prompt_utils import apply_chat_template  # noqa: PLC0415
+  from mlx_vlm.utils import load_config  # noqa: PLC0415
+  state.model, state.processor = load(MODEL_ID)
+  config = load_config(MODEL_ID)
+  state.formatted_prompt = apply_chat_template(state.processor, config, OCR_PROMPT, num_images=1)
+  ip = state.processor.image_processor
+  ip.max_pixels = VISION_MAX_PIXELS
+  ip.min_pixels = VISION_MAX_PIXELS // 4
+  state.loaded = True
+  _log(f'Model loaded in {time.time() - t0:.0f}s')
+@dataclass
+class PageCounts:
+  tess: int = 0
+  vlm: int = 0
+  txt: int = 0
+def _ocr_one_file(  # noqa: PLR0913, PLR0915, PLR0917
   pdf: str,
-  model: object,
-  processor: object,
-  config: dict[str, object],
+  chandra: ChandraState,
+  counts: PageCounts,
   done_count: int,
   idx: int,
   pending_len: int,
@@ -164,52 +237,76 @@ def _ocr_one_file(  # noqa: PLR0913, PLR0917
   unique_name = _to_output_name(pdf)
   display_name = Path(pdf).stem
   file_start = time.time()
+  md_path = OUTPUT_BASE / f'{unique_name}.md'
+  tmp_path = OUTPUT_BASE / f'{unique_name}.md.tmp'
   doc = pdfium.PdfDocument(pdf)
   try:
     doc.init_forms()
     num_pages = len(doc)
     _log(f'[{idx}/{pending_len}] ({done_count + idx}/{total}) OCR {display_name} ({num_pages}p)')
-    all_markdown: list[str] = []
-    for p_idx in range(num_pages):
-      _save_progress(
-        Progress(
-          done=done_count + idx - 1,
-          total=total,
-          errors=errors,
-          current_file=display_name,
-          current_page=f'{p_idx + 1}/{num_pages}',
-          current_pages_total=num_pages,
-          current_file_started=file_start,
-          elapsed=time.time() - pipeline_start,
-          avg_per_file=avg,
-          recent_files=recent_files,
-        )
-      )
-      page_t = time.time()
-      image, native_text = _render_page(doc, p_idx)
-      if image is not None:
-        md = _ocr_page(model, processor, config, image)
-        image.close()
-        tag = 'ocr'
-      else:
-        md = native_text
-        tag = 'txt'
-      all_markdown.append(md)
-      _log(f'  p{p_idx + 1}/{num_pages} [{tag}] {time.time() - page_t:.0f}s ({len(md)} chars)')
-      if p_idx % 5 == 0:
-        _free_memory()
+    executor = ThreadPoolExecutor(max_workers=1)
+    with tmp_path.open('w', encoding='utf-8') as out:
+      next_future = executor.submit(_render_page, doc, 0)
-    md_path = OUTPUT_BASE / f'{unique_name}.md'
-    md_path.write_text('\n\n'.join(all_markdown), encoding='utf-8')
+      for p_idx in range(num_pages):
+        _save_progress(
+          Progress(
+            done=done_count + idx - 1,
+            total=total,
+            errors=errors,
+            current_file=display_name,
+            current_page=f'{p_idx + 1}/{num_pages}',
+            current_pages_total=num_pages,
+            current_file_started=file_start,
+            elapsed=time.time() - pipeline_start,
+            avg_per_file=avg,
+            recent_files=recent_files,
+          )
+        )
-    del all_markdown
+        page_t = time.time()
+        image, native_text = next_future.result()
+        if p_idx + 1 < num_pages:
+          next_future = executor.submit(_render_page, doc, p_idx + 1)
+        if image is not None:
+          tess_text = _tesseract_ocr(image)
+          if _tesseract_quality_ok(tess_text):
+            md = tess_text
+            tag = 'tess'
+            counts.tess += 1
+          else:
+            _load_chandra(chandra)
+            md = _chandra_ocr(chandra.model, chandra.processor, chandra.formatted_prompt, image)
+            tag = 'vlm'
+            counts.vlm += 1
+          image.close()
+        else:
+          md = native_text
+          tag = 'txt'
+          counts.txt += 1
+        if p_idx > 0:
+          out.write('\n\n')
+        out.write(md)
+        out.flush()
+        _log(f'  p{p_idx + 1}/{num_pages} [{tag}] {time.time() - page_t:.0f}s ({len(md)} chars)')
+        if p_idx % 10 == 9:  # noqa: PLR2004
+          _free_memory()
+    executor.shutdown(wait=False)
+    tmp_path.rename(md_path)
     _free_memory()
-    return num_pages
+    return num_pages  # noqa: TRY300
+  except Exception:
+    for p in [tmp_path, md_path]:
+      if p.exists():
+        p.unlink()
+    raise
   finally:
     doc.close()
@@ -238,14 +335,9 @@ def _run_ocr() -> None:  # noqa: PLR0914
     _save_progress(Progress(total, total, 0, '-', '-', 0, 0, 0, 0, []))
     return
-  _log(f'Loading MLX model {MODEL_ID}...')
-  t0 = time.time()
-  from mlx_vlm import load  # noqa: PLC0415
-  from mlx_vlm.utils import load_config  # noqa: PLC0415
-  model, processor = load(MODEL_ID)
-  config = load_config(MODEL_ID)
-  _log(f'Model loaded in {time.time() - t0:.0f}s')
+  chandra = ChandraState()
+  counts = PageCounts()
+  _log('Hybrid OCR: Tesseract fast-pass, chandra VLM fallback')
   errors = 0
   file_times: list[float] = []
@@ -263,7 +355,17 @@ def _run_ocr() -> None:  # noqa: PLR0914
       try:
         num_pages = _ocr_one_file(
-          pdf, model, processor, config, done_count, i, len(pending), total, errors, avg, pipeline_start, recent_files
+          pdf,
+          chandra,
+          counts,
+          done_count,
+          i,
+          len(pending),
+          total,
+          errors,
+          avg,
+          pipeline_start,
+          recent_files,
         )
         elapsed = time.time() - t1
         file_times.append(elapsed)
@@ -297,6 +399,7 @@ def _run_ocr() -> None:  # noqa: PLR0914
   total_time = _format_duration(time.time() - pipeline_start)
   _log(f'OCR complete. Done: {done_count + len(pending)}, Errors: {errors}, Time: {total_time}')
+  _log(f'Engine stats: tess={counts.tess} vlm={counts.vlm} txt={counts.txt}')
 def main() -> None:

package/scripts/benchmark-ocr.py ADDED Viewed

@@ -0,0 +1,266 @@
+import json
+import os
+import sys
+import time
+from pathlib import Path
+os.environ['PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK'] = 'True'
+import pypdfium2 as pdfium
+import pypdfium2.raw as pdfium_c
+from PIL import Image, ImageChops, ImageOps
+CROP_PADDING = 20
+CROP_THRESHOLD = 10
+IMAGE_DPI = 150
+DIACRITICS = frozenset(
+  'àáảãạăắằẳẵặâấầẩẫậèéẻẽẹêếềểễệìíỉĩịòóỏõọôốồổỗộơớờởỡợùúủũụưứừửữựỳýỷỹỵđ'
+  'ÀÁẢÃẠĂẮẰẲẴẶÂẤẦẨẪẬÈÉẺẼẸÊẾỀỂỄỆÌÍỈĨỊÒÓỎÕỌÔỐỒỔỖỘƠỚỜỞỠỢÙÚỦŨỤƯỨỪỬỮỰỲÝỶỸỴĐ'
+)
+def _get_arg(flag: str, default: str) -> str:
+  try:
+    idx = sys.argv.index(flag)
+    return sys.argv[idx + 1]
+  except (ValueError, IndexError):
+    return default
+PDF_PATH = _get_arg('--pdf', '')
+PAGES_STR = _get_arg('--pages', '0,1,2')
+ENGINES_STR = _get_arg('--engines', 'tesseract,paddleocr,easyocr,surya,vietocr')
+if not PDF_PATH:
+  print(
+    'Usage: python benchmark-ocr.py --pdf <path> [--pages 0,1,2] [--engines tesseract,paddleocr,easyocr,surya,vietocr]'
+  )
+  sys.exit(1)
+PAGE_INDICES = [int(p.strip()) for p in PAGES_STR.split(',')]
+ENGINES = [e.strip() for e in ENGINES_STR.split(',')]
+def _crop_margins(img: Image.Image) -> Image.Image:
+  bg = Image.new('RGB', img.size, (255, 255, 255))
+  diff = ImageChops.difference(img, bg)
+  gray = ImageOps.grayscale(diff)
+  bbox = gray.point(lambda x: 255 if x > CROP_THRESHOLD else 0).getbbox()
+  bg.close()
+  diff.close()
+  gray.close()
+  if not bbox:
+    return img
+  x0 = max(0, bbox[0] - CROP_PADDING)
+  y0 = max(0, bbox[1] - CROP_PADDING)
+  x1 = min(img.size[0], bbox[2] + CROP_PADDING)
+  y1 = min(img.size[1], bbox[3] + CROP_PADDING)
+  return img.crop((x0, y0, x1, y1))
+def render_page(doc: pdfium.PdfDocument, page_idx: int) -> Image.Image:
+  page_obj = doc[page_idx]
+  pdfium_c.FPDFPage_Flatten(page_obj, pdfium_c.FLAT_NORMALDISPLAY)
+  page_obj = doc[page_idx]
+  min_dim = min(page_obj.get_width(), page_obj.get_height())
+  scale_dpi = max((768 / min_dim) * 72, IMAGE_DPI)
+  pil_image: Image.Image = page_obj.render(scale=scale_dpi / 72).to_pil().convert('RGB')
+  return _crop_margins(pil_image)
+def _count_vietnamese_diacritics(text: str) -> int:
+  count = 0
+  for ch in text:
+    if ch in DIACRITICS:
+      count += 1
+  return count
+def run_tesseract(image: Image.Image, _state: dict) -> str:
+  import pytesseract  # noqa: PLC0415
+  return pytesseract.image_to_string(image, lang='vie')
+def run_paddleocr(image: Image.Image, _state: dict) -> str:
+  import numpy as np  # noqa: PLC0415
+  if 'engine' not in _state:
+    from paddleocr import PaddleOCR  # noqa: PLC0415
+    _state['engine'] = PaddleOCR(lang='vi')
+  img_array = np.array(image)
+  lines: list[str] = []
+  for page_result in _state['engine'].predict(img_array):
+    j = page_result.json
+    res = j.get('res', {})
+    if 'rec_texts' in res:
+      lines.extend(res['rec_texts'])
+  return '\n'.join(lines)
+def run_easyocr(image: Image.Image, _state: dict) -> str:
+  import numpy as np  # noqa: PLC0415
+  if 'reader' not in _state:
+    import easyocr  # noqa: PLC0415
+    _state['reader'] = easyocr.Reader(['vi'], gpu=False)
+  img_array = np.array(image)
+  results = _state['reader'].readtext(img_array)
+  lines = []
+  for _bbox, text, _conf in results:
+    lines.append(text)
+  return '\n'.join(lines)
+def run_surya(image: Image.Image, _state: dict) -> str:
+  if 'rec' not in _state:
+    from surya.detection import DetectionPredictor  # noqa: PLC0415
+    from surya.foundation import FoundationPredictor  # noqa: PLC0415
+    from surya.recognition import RecognitionPredictor  # noqa: PLC0415
+    foundation = FoundationPredictor()
+    _state['det'] = DetectionPredictor()
+    _state['rec'] = RecognitionPredictor(foundation)
+  predictions = _state['rec']([image], det_predictor=_state['det'])
+  return '\n'.join(tl.text for tl in predictions[0].text_lines)
+def run_vietocr(image: Image.Image, _state: dict) -> str:
+  if 'detector' not in _state:
+    import torch  # noqa: PLC0415
+    from vietocr.tool.config import Cfg  # noqa: PLC0415
+    from vietocr.tool.predictor import Predictor  # noqa: PLC0415
+    config = Cfg.load_config_from_name('vgg_transformer')
+    config['cnn']['pretrained'] = False
+    if torch.backends.mps.is_available():
+      config['device'] = 'mps'
+    else:
+      config['device'] = 'cpu'
+    _state['detector'] = Predictor(config)
+  return _state['detector'].predict(image)
+ENGINE_RUNNERS = {
+  'tesseract': run_tesseract,
+  'paddleocr': run_paddleocr,
+  'easyocr': run_easyocr,
+  'surya': run_surya,
+  'vietocr': run_vietocr,
+}
+def main() -> None:  # noqa: C901, PLR0912, PLR0914, PLR0915
+  doc = pdfium.PdfDocument(PDF_PATH)
+  num_pages = len(doc)
+  print(f'PDF: {PDF_PATH} ({num_pages} pages)')
+  print(f'Testing pages: {PAGE_INDICES}')
+  print(f'Engines: {ENGINES}')
+  print('=' * 80)
+  images: dict[int, Image.Image] = {}
+  for p_idx in PAGE_INDICES:
+    if p_idx >= num_pages:
+      print(f'  SKIP page {p_idx} (only {num_pages} pages)')
+    else:
+      t0 = time.time()
+      images[p_idx] = render_page(doc, p_idx)
+      render_time = time.time() - t0
+      img = images[p_idx]
+      print(f'  Rendered page {p_idx}: {img.size[0]}x{img.size[1]} in {render_time:.1f}s')
+  doc.close()
+  print()
+  states: dict[str, dict] = {}
+  results: list[dict] = []
+  for engine_name in ENGINES:
+    if engine_name not in ENGINE_RUNNERS:
+      print(f'Unknown engine: {engine_name}')
+      continue
+    states.setdefault(engine_name, {})
+    runner = ENGINE_RUNNERS[engine_name]
+    print(f'=== {engine_name.upper()} ===')
+    for p_idx in sorted(images.keys()):
+      img = images[p_idx]
+      print(f'  page {p_idx} ... ', end='', flush=True)
+      t0 = time.time()
+      try:
+        text = runner(img, states[engine_name])
+        elapsed = time.time() - t0
+        char_count = len(text)
+        alpha_count = sum(1 for c in text if c.isalpha())
+        diacritics_count = _count_vietnamese_diacritics(text)
+        print(f'{elapsed:.1f}s | {char_count} chars | {alpha_count} alpha | {diacritics_count} diacritics')
+        results.append({
+          'engine': engine_name,
+          'page': p_idx,
+          'time_s': round(elapsed, 2),
+          'chars': char_count,
+          'alpha': alpha_count,
+          'diacritics': diacritics_count,
+          'text_preview': text[:300].replace('\n', ' '),
+          'text_full': text,
+        })
+      except Exception as run_err:  # noqa: BLE001
+        elapsed = time.time() - t0
+        print(f'ERROR {elapsed:.1f}s: {run_err}')
+        results.append({
+          'engine': engine_name,
+          'page': p_idx,
+          'time_s': round(elapsed, 2),
+          'error': str(run_err),
+        })
+  print()
+  print('=' * 80)
+  print('SUMMARY')
+  print('=' * 80)
+  header = f'{"Engine":<12} {"Page":<6} {"Time":>8} {"Chars":>8} {"Alpha":>8} {"Diacritics":>10}  Preview'
+  print(header)
+  print('-' * 100)
+  for r in results:
+    if 'error' in r:
+      print(f'{r["engine"]:<12} {r["page"]:<6} {r["time_s"]:>7.1f}s {"ERROR":>8}')
+    else:
+      preview = r['text_preview'][:60]
+      print(
+        f'{r["engine"]:<12} {r["page"]:<6} {r["time_s"]:>7.1f}s'
+        f' {r["chars"]:>8} {r["alpha"]:>8} {r["diacritics"]:>10}'
+        f'  {preview}'
+      )
+  out_path = Path(PDF_PATH).parent / 'benchmark-results.json'
+  summary = []
+  for r in results:
+    s = dict(r)
+    s.pop('text_full', None)
+    summary.append(s)
+  out_path.write_text(json.dumps(summary, indent=2, ensure_ascii=False) + '\n', encoding='utf-8')
+  print(f'\nResults saved to {out_path}')
+  print('\n\nFULL TEXT OUTPUT PER ENGINE/PAGE:')
+  print('=' * 80)
+  for r in results:
+    if 'text_full' in r:
+      print(f'\n--- [{r["engine"]}] page {r["page"]} ({r["time_s"]}s, {r["chars"]} chars) ---')
+      full = r['text_full']
+      print(full[:2000])
+      if len(full) > 2000:  # noqa: PLR2004
+        print(f'... (truncated, {len(full)} total chars)')
+if __name__ == '__main__':
+  main()