npm - anymd - Versions diffs - 0.0.7 → 0.0.9 - Mend

anymd 0.0.7 → 0.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8)

package/README.md CHANGED Viewed

@@ -22,29 +22,17 @@ bunx anymd --input-dir <path> [--output-dir <path>] [--config <path>]
 | `--output-dir` | No | `./output` | Where to write all output files |
 | `--config` | No | `./config.json` | Path to configuration file |
-Launches a TUI that auto-runs all 5 pipeline steps with live per-file progress. On completion: rings terminal bell, sends macOS notification, and updates the terminal title. Safe to Ctrl+C — progress is saved, re-run to resume.
+Runs a 3-step pipeline with verbose per-file progress printed to stdout. A progress summary prints every 5 seconds during long-running steps. On completion: rings terminal bell and sends macOS notification. Safe to Ctrl+C — progress is saved, re-run to resume.
 ## Pipeline Steps
 | Step | What | Tool | Output |
 |------|------|------|--------|
 | 1. Classify | Detect native/scanned/mixed PDFs | pdftotext (TypeScript) | `<output-dir>/classification.json` |
-| 2. Convert | Doc/docx/native PDF → raw markdown | soffice + markitdown + marker | `<output-dir>/raw-md/` |
-| 3. OCR | Scanned/mixed PDF → markdown | mlx-vlm chandra-8bit | `<output-dir>/ocr-raw/` |
-| 4. Enhance | Heading detection + cleanup | TypeScript | `<output-dir>/markdown/` |
-| 5. Dataset | Collect all markdown → JSONL | TypeScript | `<output-dir>/dataset/dataset.jsonl` |
+| 2. Convert + OCR + Enhance | Doc/docx/native PDF → markdown, scanned/mixed PDF → OCR markdown, heading detection + cleanup | marker + markitdown + mlx-vlm + TypeScript | `<output-dir>/markdown/` |
+| 3. Dataset | Collect all markdown → JSONL | TypeScript | `<output-dir>/dataset/dataset.jsonl` |
-Steps 2 and 3 run in parallel when both are needed.
-## TUI Hotkeys
-| Key | Action |
-|-----|--------|
-| R | Retry failed step |
-| S | Skip failed step |
-| L | Toggle log overlay (pipeline + OCR, scrollable) |
-| ↑↓ | Scroll log overlay |
-| Q / ESC | Quit |
+Convert and OCR run in parallel. Enhancement runs incrementally as files are ready — no separate step needed.
 ## System Requirements
@@ -54,17 +42,18 @@ Steps 2 and 3 run in parallel when both are needed.
 - [poppler](https://poppler.freedesktop.org/) (`pdftotext`) — `brew install poppler`
 - [LibreOffice](https://www.libreoffice.org/) (`soffice`) — optional, only for `.doc` files
-The TUI preflight check tells you exactly what's missing.
+A preflight check runs at startup and tells you exactly what's missing.
 ## Auto-Bootstrap
 On first run, `anymd` uses `uv` to create `~/.cache/anymd/.venv` (Python 3.13) and installs:
 - `marker-pdf` — PDF to markdown conversion
-- `markitdown` — DOCX to markdown conversion
+- `markitdown[docx,pdf]` — DOCX and PDF fallback conversion
 - `mlx-vlm` — Apple Silicon MLX inference for OCR
 - `pypdfium2` — PDF page rendering
+- `torchvision` — image processing for OCR
-This takes ~2 minutes. Progress is shown in the TUI. Subsequent runs detect the existing venv and skip setup.
+This takes ~2 minutes. Progress is printed to stdout. Subsequent runs detect the existing venv and skip setup.
 ## Configuration
@@ -82,13 +71,6 @@ Create a `config.json` in your working directory (or pass `--config <path>`):
 All fields are optional — omitted fields use defaults. No config file = all defaults.
-## Notifications
-- **Terminal title** updates with current step, progress percentage, and parallel status
-- **Terminal bell** rings on pipeline completion
-- **macOS notification** fires on completion via `osascript`
-- **Error log** written to `<output-dir>/errors.log` with timestamps — persists across all steps
 ## Output Structure
 ```
@@ -97,6 +79,7 @@ All fields are optional — omitted fields use defaults. No config file = all de
 ├── ocr-raw/                     OCR markdown from scanned PDFs (step 3)
 ├── markdown/                    Final enhanced markdown (step 4)
 ├── dataset/dataset.jsonl        JSONL dataset for RAG (step 5)
+├── classification.json          PDF classification results (step 1)
 ├── pipeline-log.txt             Pipeline conversion log
 ├── ocr-log.txt                  OCR processing log
 └── errors.log                   Timestamped error log (all steps)
@@ -108,9 +91,13 @@ All fields are optional — omitted fields use defaults. No config file = all de
 - Supports `.doc`, `.docx`, `.pdf` (native, scanned, mixed)
 - Output files use flat naming with `--` separator: `docs/foo/bar/doc.pdf` → `foo--bar--doc.md`
+## PDF Fallback
+When marker-pdf fails on a PDF (e.g. index out of bounds), `anymd` automatically falls back to markitdown (pdfminer-six) for text extraction.
 ## Dataset Deduplication
-Step 5 deduplicates entries by content hash. If two source files produce identical markdown, only one entry appears in the JSONL. The TUI completion summary shows the dedup count.
+Step 3 deduplicates entries by content hash. If two source files produce identical markdown, only one entry appears in the JSONL. The completion summary shows the dedup count.
 ## Resume Support

package/cli.tsx CHANGED Viewed

@@ -38,5 +38,5 @@ const resolvedOutput = resolve(args.outputDir)
 initPaths(resolvedInput, resolvedOutput)
 loadConfig(args.configPath)
-const { start } = await import('./tui')
-await start()
+const { run } = await import('./runner')
+await run()

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "anymd",
-  "version": "0.0.7",
+  "version": "0.0.9",
   "description": "Convert any document (PDF, DOC, DOCX) to clean Markdown for RAG",
   "keywords": [
     "markdown",
@@ -27,7 +27,7 @@
   "files": [
     "cli.tsx",
     "main.ts",
-    "tui.tsx",
+    "runner.ts",
     "src/",
     "!src/__tests__/",
     "scripts/",
@@ -44,17 +44,11 @@
     "with-env": "dotenv -e ../../.env --"
   },
   "dependencies": {
-    "@opentui/core": "^0.1.77",
-    "@opentui/react": "^0.1.77",
     "markdownlint": "^0.40.0",
     "p-map": "^7.0.4",
-    "react": "^19.2.4",
     "yoctocolors": "^2.1.2",
     "zod": "^4.3.6"
   },
-  "devDependencies": {
-    "@types/react": "^19.2.13"
-  },
   "engines": {
     "bun": ">=1.0.0"
   },

package/runner.ts ADDED Viewed

@@ -0,0 +1,308 @@
+/* eslint-disable max-statements */
+import { bootstrapPython } from '~/bootstrap'
+import { getPaths } from '~/paths'
+import { runPreflight } from '~/preflight'
+import {
+  appendErrorLog,
+  appendPipelineLog,
+  buildDataset,
+  clearErrorLog,
+  clearPipelineLog,
+  fetchStepData,
+  getOcrStats,
+  runClassify,
+  runEnhancePass,
+  spawnCommand,
+  writeNativeFileList
+} from '~/tui-data'
+const stripAnsi = (s: string): string => s.replaceAll(new RegExp(`${String.fromCodePoint(0x1b)}\\[[0-9;]*m`, 'gu'), '')
+const ts = (): string => new Date().toISOString().slice(11, 19)
+const log = (msg: string): void => {
+  process.stdout.write(`${ts()} ${msg}\n`)
+}
+const formatDuration = (seconds: number): string => {
+  const h = Math.floor(seconds / 3600)
+  const m = Math.floor((seconds % 3600) / 60)
+  const s = Math.floor(seconds % 60)
+  if (h > 0) return `${h}h${m.toString().padStart(2, '0')}m`
+  if (m > 0) return `${m}m${s.toString().padStart(2, '0')}s`
+  return `${s}s`
+}
+const LINE_SPLIT = /\r?\n|\r/u
+const ERROR_PATTERN = /\b(?:ERROR|Error:|Failed:|failed|FAILED|\u2716|exception|traceback)/iu
+// oxlint-disable-next-line promise/prefer-await-to-then
+const noop = (): Promise<void> => Promise.resolve() // eslint-disable-line @typescript-eslint/promise-function-async
+const readStream = async (stream: ReadableStream<Uint8Array>, onLine: (line: string) => void): Promise<void> => {
+  const reader = stream.getReader()
+  const decoder = new TextDecoder()
+  let buffer = ''
+  try {
+    for (;;) {
+      /** biome-ignore lint/performance/noAwaitInLoops: streaming reads */
+      const { done, value } = await reader.read() // eslint-disable-line no-await-in-loop
+      if (done) break
+      buffer += decoder.decode(value, { stream: true })
+      const parts = buffer.split(LINE_SPLIT)
+      buffer = parts.pop() ?? ''
+      for (const part of parts) if (part.trim() !== '') onLine(part)
+    }
+    if (buffer.trim() !== '') onLine(buffer)
+  } finally {
+    reader.releaseLock()
+  }
+}
+const PROGRESS_INTERVAL_MS = 5000
+const runConvertStep = async (prefix: string): Promise<number> => {
+  await clearPipelineLog()
+  await writeNativeFileList()
+  const spawned = spawnCommand('pipeline')
+  if (!spawned) return -1
+  const onLine = (line: string): void => {
+    const clean = stripAnsi(line)
+    log(`${prefix}${clean}`)
+    appendPipelineLog(clean)
+    if (ERROR_PATTERN.test(clean)) appendErrorLog('pipeline', clean)
+  }
+  const { stderr, stdout } = spawned.proc
+  const stdoutP = stdout instanceof ReadableStream ? readStream(stdout, onLine) : noop()
+  const stderrP = stderr instanceof ReadableStream ? readStream(stderr, onLine) : noop()
+  await Promise.all([stdoutP, stderrP])
+  return spawned.proc.exited
+}
+const runOcrStep = async (prefix: string): Promise<number> => {
+  const stats = await getOcrStats()
+  if (stats.total === 0) {
+    log(`${prefix}No scanned/mixed PDFs to OCR.`)
+    return 0
+  }
+  if (stats.remaining === 0) {
+    log(`${prefix}All ${stats.total} files already OCR'd.`)
+    return 0
+  }
+  log(`${prefix}OCR ${stats.remaining} remaining of ${stats.total} total`)
+  const spawned = spawnCommand('ocr')
+  if (!spawned) return -1
+  const onLine = (line: string): void => {
+    const clean = stripAnsi(line)
+    log(`${prefix}${clean}`)
+    if (ERROR_PATTERN.test(clean)) appendErrorLog('ocr', clean)
+  }
+  const { stderr, stdout } = spawned.proc
+  const stdoutP = stdout instanceof ReadableStream ? readStream(stdout, onLine) : noop()
+  const stderrP = stderr instanceof ReadableStream ? readStream(stderr, onLine) : noop()
+  await Promise.all([stdoutP, stderrP])
+  return spawned.proc.exited
+}
+const ENHANCE_POLL_MS = 2000
+const logProgress = async (startTime: number): Promise<void> => {
+  const d = await fetchStepData()
+  const parts: string[] = []
+  if (d.pipeline.total > 0) parts.push(`Convert ${d.pipeline.done}/${d.pipeline.total}`)
+  if (d.ocr.total > 0) parts.push(`OCR ${d.ocr.done}/${d.ocr.total}`)
+  if (d.enhance.total > 0) parts.push(`Enhance ${d.enhance.done}/${d.enhance.total}`)
+  if (parts.length > 0) {
+    const elapsed = formatDuration((Date.now() - startTime) / 1000)
+    log(`── ${parts.join(' · ')} · ${elapsed} elapsed ──`)
+  }
+}
+const startEnhancePoller = (
+  onFile: (name: string) => void
+): { stop: () => Promise<{ enhanced: number; failed: number }> } => {
+  const done = new Set<string>()
+  const interval = setInterval(() => {
+    // oxlint-disable-next-line promise/prefer-await-to-then
+    runEnhancePass(done, onFile).catch(noop)
+  }, ENHANCE_POLL_MS)
+  const stop = async (): Promise<{ enhanced: number; failed: number }> => {
+    clearInterval(interval)
+    return runEnhancePass(done, onFile)
+  }
+  return { stop }
+}
+const startProgressTicker = (startTime: number): ReturnType<typeof setInterval> =>
+  // eslint-disable-line @typescript-eslint/promise-function-async
+  setInterval(() => {
+    // oxlint-disable-next-line promise/prefer-await-to-then
+    logProgress(startTime).catch(noop)
+  }, PROGRESS_INTERVAL_MS)
+const runBootstrap = async (): Promise<void> => {
+  log('Checking Python environment...')
+  const ok = await bootstrapPython({
+    onDone: () => log('Python environment ready.'),
+    onStep: (msg: string) => log(`  ${msg}`)
+  })
+  if (!ok) {
+    log('FATAL: Python bootstrap failed. Install uv and try again.')
+    process.exit(1)
+  }
+}
+const runPreflightCheck = async (): Promise<void> => {
+  const preflight = await runPreflight()
+  if (preflight.errors.length > 0) {
+    for (const e of preflight.errors) log(`ERROR: ${e}`)
+    log('Fix the errors above and restart.')
+    process.exit(1)
+  }
+  for (const w of preflight.warnings) log(`WARN: ${w}`)
+}
+const runClassifyStep = async (): Promise<void> => {
+  const data = await fetchStepData()
+  const done = data.classify.done >= data.classify.total && data.classify.total > 0
+  if (done) {
+    log('Step 1/3: Classify \u2014 already done')
+    if (data.classify.details) for (const d of data.classify.details) log(`  ${d}`)
+    return
+  }
+  log('Step 1/3: Classify PDFs')
+  const t = Date.now()
+  await runClassify(p => {
+    log(`  ${p.done}/${p.total} ${p.file} → ${p.category}`)
+  })
+  const d = await fetchStepData()
+  if (d.classify.details) for (const det of d.classify.details) log(`  ${det}`)
+  log(`  Done in ${formatDuration((Date.now() - t) / 1000)}`)
+}
+const runParallelConvertOcr = async (): Promise<void> => {
+  log('  Convert + OCR (parallel)')
+  const ocrPromise = runOcrStep('[OCR] ')
+  const pipelineCode = await runConvertStep('[CONVERT] ')
+  if (pipelineCode !== 0) log(`  Convert exited with code ${pipelineCode}`)
+  log('  Convert done, waiting for OCR...')
+  const ocrCode = await ocrPromise
+  if (ocrCode !== 0) log(`  OCR exited with code ${ocrCode}`)
+}
+const runSequentialOcr = async (ocrNeeded: boolean, ocrDone: boolean): Promise<void> => {
+  if (ocrNeeded && !ocrDone) {
+    log('  OCR scanned PDFs')
+    const code = await runOcrStep('  ')
+    if (code !== 0) log(`  OCR exited with code ${code}`)
+  } else if (ocrNeeded) log('  OCR \u2014 already done')
+  else log('  OCR \u2014 no scanned PDFs')
+}
+// eslint-disable-next-line max-statements
+const runConvertOcrEnhance = async (startTime: number): Promise<void> => {
+  const data = await fetchStepData()
+  const pipelineDone = data.pipeline.done >= data.pipeline.total && data.pipeline.total > 0
+  const ocrNeeded = data.ocr.total > 0
+  const ocrDone = data.ocr.done >= data.ocr.total && data.ocr.total > 0
+  const allDone = pipelineDone && (!ocrNeeded || ocrDone)
+  if (allDone && data.enhance.done >= data.enhance.total && data.enhance.total > 0) {
+    log('Step 2/3: Convert + OCR + Enhance \u2014 already done')
+    return
+  }
+  log('Step 2/3: Convert + OCR + Enhance')
+  const ticker = startProgressTicker(startTime)
+  const enhancer = startEnhancePoller(name => log(`[ENHANCE] \u2713 ${name}`))
+  const t = Date.now()
+  if (!allDone) {
+    const parallel = !pipelineDone && ocrNeeded && !ocrDone
+    if (parallel) await runParallelConvertOcr()
+    else {
+      if (pipelineDone) log('  Convert \u2014 already done')
+      else {
+        log('  Convert to Markdown')
+        const code = await runConvertStep('  ')
+        if (code !== 0) log(`  Convert exited with code ${code}`)
+      }
+      await runSequentialOcr(ocrNeeded, ocrDone)
+    }
+  }
+  clearInterval(ticker)
+  const enhanceResult = await enhancer.stop()
+  log(`  Enhanced: ${enhanceResult.enhanced}, Failed: ${enhanceResult.failed}`)
+  log(`  Done in ${formatDuration((Date.now() - t) / 1000)}`)
+}
+const runDatasetStep = async (): Promise<{ duplicates: number; entries: number; skipped: number; totalChars: number }> => {
+  log('Step 3/3: Build Dataset')
+  const t = Date.now()
+  const result = await buildDataset({
+    onFileResult: p => {
+      const icon = p.status === 'added' ? '\u2713' : p.status === 'duplicate' ? '\u2261' : '\u2192'
+      const charStr = p.chars >= 1000 ? `${(p.chars / 1000).toFixed(1)}K` : `${p.chars}`
+      log(`  ${p.done}/${p.total} ${icon} ${p.file} → ${p.status} (${charStr} chars)`)
+    },
+    onReadProgress: (done, total) => {
+      if (done % 100 === 0 || done === total) log(`  Reading ${done}/${total} files...`)
+    }
+  })
+  log(`  Entries: ${result.entries}, Skipped: ${result.skipped}, Duplicates: ${result.duplicates}`)
+  log(`  Total chars: ${result.totalChars.toLocaleString()}`)
+  log(`  Done in ${formatDuration((Date.now() - t) / 1000)}`)
+  return result
+}
+const printSummary = async (
+  startTime: number,
+  dsResult: { duplicates: number; entries: number; skipped: number; totalChars: number }
+): Promise<void> => {
+  const data = await fetchStepData()
+  const elapsed = formatDuration((Date.now() - startTime) / 1000)
+  const sep = '\u2550'.repeat(45)
+  log('')
+  log(sep)
+  log('  Pipeline Complete')
+  log(sep)
+  log(`  Classified:   ${data.classify.done} PDFs`)
+  if (data.classify.details) for (const d of data.classify.details) log(`                ${d}`)
+  log(`  Converted:    ${data.pipeline.done} files`)
+  log(`  OCR:          ${data.ocr.done} files`)
+  log(`  Enhanced:     ${data.enhance.done} files`)
+  log(`  Dataset:      ${dsResult.entries} entries, ${dsResult.totalChars.toLocaleString()} chars`)
+  if (dsResult.duplicates > 0) log(`  Deduplicated: ${dsResult.duplicates}`)
+  if (dsResult.skipped > 0) log(`  Skipped:      ${dsResult.skipped} (below min length)`)
+  log(`  Duration:     ${elapsed}`)
+  log(`  Output:       ${getPaths().outputDir}`)
+  log(sep)
+}
+const run = async (): Promise<void> => {
+  const startTime = Date.now()
+  await runBootstrap()
+  await runPreflightCheck()
+  await clearErrorLog()
+  log('')
+  await runClassifyStep()
+  log('')
+  await runConvertOcrEnhance(startTime)
+  log('')
+  const dsResult = await runDatasetStep()
+  await printSummary(startTime, dsResult)
+  process.stdout.write('\u0007')
+  Bun.spawn(['osascript', '-e', 'display notification "Pipeline complete" with title "anymd"'])
+}
+export { run }

package/scripts/pdf-to-md.py CHANGED Viewed

@@ -6,14 +6,30 @@ from pathlib import Path
 from marker.converters.pdf import PdfConverter
 from marker.models import create_model_dict
 from marker.output import text_from_rendered
+from markitdown import MarkItDown
 MIN_ARGS = 2
+MIN_FALLBACK_CHARS = 10
+_mid = MarkItDown()
 def _emit(data: dict[str, object]) -> None:
   print(json.dumps(data), flush=True)
+def _markitdown_fallback(pdf_path: str) -> str | None:
+  try:
+    result = _mid.convert(pdf_path)
+    text = result.text_content.strip()
+  except Exception:  # noqa: BLE001
+    return None
+  else:
+    if len(text) < MIN_FALLBACK_CHARS:
+      return None
+    return text
 def _convert_one(converter: PdfConverter, pdf_path: str, out_path: str, index: int, total: int) -> None:
   t1 = time.time()
   try:
@@ -29,15 +45,28 @@ def _convert_one(converter: PdfConverter, pdf_path: str, out_path: str, index: i
       'seconds': round(time.time() - t1, 1),
       'chars': len(md),
     })
-  except Exception as exc:  # noqa: BLE001
-    _emit({
-      'type': 'error',
-      'index': index,
-      'total': total,
-      'file': Path(pdf_path).name,
-      'seconds': round(time.time() - t1, 1),
-      'error': str(exc),
-    })
+  except Exception as marker_exc:  # noqa: BLE001
+    fallback_md = _markitdown_fallback(pdf_path)
+    if fallback_md:
+      Path(out_path).parent.mkdir(parents=True, exist_ok=True)
+      Path(out_path).write_text(fallback_md, encoding='utf-8')
+      _emit({
+        'type': 'converted',
+        'index': index,
+        'total': total,
+        'file': Path(pdf_path).name,
+        'seconds': round(time.time() - t1, 1),
+        'chars': len(fallback_md),
+      })
+    else:
+      _emit({
+        'type': 'error',
+        'index': index,
+        'total': total,
+        'file': Path(pdf_path).name,
+        'seconds': round(time.time() - t1, 1),
+        'error': str(marker_exc),
+      })
 def main() -> None:

package/src/bootstrap.ts CHANGED Viewed

@@ -10,7 +10,7 @@ interface BootstrapCallbacks {
 }
 const REQUIRED_PACKAGES = ['marker', 'markitdown', 'mammoth', 'mlx_vlm', 'pypdfium2', 'torchvision']
-const PIP_PACKAGES = ['marker-pdf', 'markitdown[docx]', 'mlx-vlm', 'pypdfium2', 'torchvision']
+const PIP_PACKAGES = ['marker-pdf', 'markitdown[docx,pdf]', 'mlx-vlm', 'pypdfium2', 'torchvision']
 const CHANDRA_MODEL_ID = 'mlx-community/chandra-8bit'
 const checkImportable = async (py: string, pkg: string): Promise<boolean> => {

package/src/tui-data.ts CHANGED Viewed

@@ -63,19 +63,6 @@ const countFiles = async (dir: string, ext: string): Promise<number> => {
   }
 }
-const countOverlap = async (dirA: string, dirB: string, ext: string): Promise<number> => {
-  try {
-    const [entriesA, entriesB] = await Promise.all([readdir(dirA), readdir(dirB)])
-    const setB = new Set<string>()
-    for (const e of entriesB) if (e.endsWith(ext)) setB.add(e)
-    let count = 0
-    for (const e of entriesA) if (e.endsWith(ext) && setB.has(e)) count += 1
-    return count
-  } catch {
-    return 0
-  }
-}
 const readJson = async <T>(path: string): Promise<null | T> => {
   try {
     const text = await readFile(path, 'utf8')
@@ -382,6 +369,49 @@ const runEnhanceOcr = async (
   return { enhanced, failed, skipped: skippedFiles.length }
 }
+const gatherEnhanceCandidates = async (exclude: Set<string>): Promise<{ name: string; srcPath: string }[]> => {
+  const p = getPaths()
+  const candidates: { name: string; srcPath: string }[] = []
+  for (const srcDir of [p.rawMd, p.ocrRaw])
+    try {
+      /** biome-ignore lint/performance/noAwaitInLoops: iterating 2 dirs */
+      const entries = await readdir(srcDir) // eslint-disable-line no-await-in-loop
+      for (const f of entries)
+        if (f.endsWith('.md') && !exclude.has(f)) candidates.push({ name: f, srcPath: join(srcDir, f) })
+    } catch {
+      /* Empty */
+    }
+  return candidates
+}
+// eslint-disable-next-line max-statements
+const runEnhancePass = async (
+  alreadyDone: Set<string>,
+  onFile?: (file: string) => void
+): Promise<{ enhanced: number; failed: number }> => {
+  const { enhanceMarkdown } = await import('~/md-enhancer')
+  mkdirSync(getPaths().markdown, { recursive: true })
+  const candidates = await gatherEnhanceCandidates(alreadyDone)
+  let enhanced = 0
+  let failed = 0
+  for (const { name, srcPath } of candidates)
+    try {
+      /** biome-ignore lint/performance/noAwaitInLoops: sequential enhance */
+      const content = await readFile(srcPath, 'utf8') // eslint-disable-line no-await-in-loop
+      const result = enhanceMarkdown(content)
+      /** biome-ignore lint/performance/noAwaitInLoops: sequential enhance */
+      await writeFile(join(getPaths().markdown, name), result, 'utf8') // eslint-disable-line no-await-in-loop
+      alreadyDone.add(name)
+      enhanced += 1
+      onFile?.(name)
+    } catch {
+      failed += 1
+    }
+  return { enhanced, failed }
+}
 const spawnCommand = (key: CommandKey): null | { args: string[]; label: string; proc: ReturnType<typeof Bun.spawn> } => {
   const p = getPaths()
   const packageRoot = join(p.scriptsDir, '..')
@@ -466,7 +496,6 @@ interface StepCounts {
   docCount: number
   finalMdCount: number
   ocrDone: number
-  ocrEnhancedCount: number
   ocrProgress: null | OcrProgress
   ocrTotal: number
   pdfCount: number
@@ -491,9 +520,8 @@ const buildStepResults = (c: StepCounts): AllStepsData => {
       total: c.finalMdCount
     },
     enhance: {
-      done: c.ocrEnhancedCount,
-      requires: c.ocrTotal > 0 && c.ocrDone === 0 ? 'OCR output' : undefined,
-      total: c.ocrDone
+      done: c.finalMdCount,
+      total: c.rawMdCount + c.ocrDone
     },
     ocr: {
       done: c.ocrDone,
@@ -512,17 +540,15 @@ const buildStepResults = (c: StepCounts): AllStepsData => {
 const fetchStepData = async (): Promise<AllStepsData> => {
   const p = getPaths()
-  const [classification, ocrProgress, rawMdCount, ocrDone, finalMdCount, ocrEnhancedCount, dataCounts, datasetEntries] =
-    await Promise.all([
-      readJson<Classification>(p.classification),
-      readJson<OcrProgress>(p.ocrProgress),
-      countFiles(p.rawMd, '.md'),
-      countFiles(p.ocrRaw, '.md'),
-      countFiles(p.markdown, '.md'),
-      countOverlap(p.ocrRaw, p.markdown, '.md'),
-      countDataFiles(),
-      countDatasetEntries()
-    ])
+  const [classification, ocrProgress, rawMdCount, ocrDone, finalMdCount, dataCounts, datasetEntries] = await Promise.all([
+    readJson<Classification>(p.classification),
+    readJson<OcrProgress>(p.ocrProgress),
+    countFiles(p.rawMd, '.md'),
+    countFiles(p.ocrRaw, '.md'),
+    countFiles(p.markdown, '.md'),
+    countDataFiles(),
+    countDatasetEntries()
+  ])
   const ocrTotal = classification ? classification.scanned + classification.mixed : 0
@@ -532,7 +558,6 @@ const fetchStepData = async (): Promise<AllStepsData> => {
     docCount: dataCounts.docs,
     finalMdCount,
     ocrDone,
-    ocrEnhancedCount,
     ocrProgress,
     ocrTotal,
     pdfCount: dataCounts.pdfs,
@@ -584,6 +609,7 @@ export {
   readLogTail,
   runClassify,
   runEnhanceOcr,
+  runEnhancePass,
   spawnCommand,
   writeNativeFileList
 }