anymd 0.0.8 → 0.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -22,29 +22,17 @@ bunx anymd --input-dir <path> [--output-dir <path>] [--config <path>]
22
22
  | `--output-dir` | No | `./output` | Where to write all output files |
23
23
  | `--config` | No | `./config.json` | Path to configuration file |
24
24
 
25
- Launches a TUI that auto-runs all 5 pipeline steps with live per-file progress. On completion: rings terminal bell, sends macOS notification, and updates the terminal title. Safe to Ctrl+C — progress is saved, re-run to resume.
25
+ Runs a 3-step pipeline with verbose per-file progress printed to stdout. A progress summary prints every 5 seconds during long-running steps. On completion: rings terminal bell and sends macOS notification. Safe to Ctrl+C — progress is saved, re-run to resume.
26
26
 
27
27
  ## Pipeline Steps
28
28
 
29
29
  | Step | What | Tool | Output |
30
30
  |------|------|------|--------|
31
31
  | 1. Classify | Detect native/scanned/mixed PDFs | pdftotext (TypeScript) | `<output-dir>/classification.json` |
32
- | 2. Convert | Doc/docx/native PDF → raw markdown | soffice + markitdown + marker | `<output-dir>/raw-md/` |
33
- | 3. OCR | Scanned/mixed PDF → markdown | mlx-vlm chandra-8bit | `<output-dir>/ocr-raw/` |
34
- | 4. Enhance | Heading detection + cleanup | TypeScript | `<output-dir>/markdown/` |
35
- | 5. Dataset | Collect all markdown → JSONL | TypeScript | `<output-dir>/dataset/dataset.jsonl` |
32
+ | 2. Convert + OCR + Enhance | Doc/docx/native PDF → markdown, scanned/mixed PDF → OCR markdown, heading detection + cleanup | marker + markitdown + mlx-vlm + TypeScript | `<output-dir>/markdown/` |
33
+ | 3. Dataset | Collect all markdown → JSONL | TypeScript | `<output-dir>/dataset/dataset.jsonl` |
36
34
 
37
- Steps 2 and 3 run in parallel when both are needed.
38
-
39
- ## TUI Hotkeys
40
-
41
- | Key | Action |
42
- |-----|--------|
43
- | R | Retry failed step |
44
- | S | Skip failed step |
45
- | L | Toggle log overlay (pipeline + OCR, scrollable) |
46
- | ↑↓ | Scroll log overlay |
47
- | Q / ESC | Quit |
35
+ Convert and OCR run in parallel. Enhancement runs incrementally as files are ready — no separate step needed.
48
36
 
49
37
  ## System Requirements
50
38
 
@@ -54,17 +42,18 @@ Steps 2 and 3 run in parallel when both are needed.
54
42
  - [poppler](https://poppler.freedesktop.org/) (`pdftotext`) — `brew install poppler`
55
43
  - [LibreOffice](https://www.libreoffice.org/) (`soffice`) — optional, only for `.doc` files
56
44
 
57
- The TUI preflight check tells you exactly what's missing.
45
+ A preflight check runs at startup and tells you exactly what's missing.
58
46
 
59
47
  ## Auto-Bootstrap
60
48
 
61
49
  On first run, `anymd` uses `uv` to create `~/.cache/anymd/.venv` (Python 3.13) and installs:
62
50
  - `marker-pdf` — PDF to markdown conversion
63
- - `markitdown` — DOCX to markdown conversion
51
+ - `markitdown[docx,pdf]` — DOCX and PDF fallback conversion
64
52
  - `mlx-vlm` — Apple Silicon MLX inference for OCR
65
53
  - `pypdfium2` — PDF page rendering
54
+ - `torchvision` — image processing for OCR
66
55
 
67
- This takes ~2 minutes. Progress is shown in the TUI. Subsequent runs detect the existing venv and skip setup.
56
+ This takes ~2 minutes. Progress is printed to stdout. Subsequent runs detect the existing venv and skip setup.
68
57
 
69
58
  ## Configuration
70
59
 
@@ -82,13 +71,6 @@ Create a `config.json` in your working directory (or pass `--config <path>`):
82
71
 
83
72
  All fields are optional — omitted fields use defaults. No config file = all defaults.
84
73
 
85
- ## Notifications
86
-
87
- - **Terminal title** updates with current step, progress percentage, and parallel status
88
- - **Terminal bell** rings on pipeline completion
89
- - **macOS notification** fires on completion via `osascript`
90
- - **Error log** written to `<output-dir>/errors.log` with timestamps — persists across all steps
91
-
92
74
  ## Output Structure
93
75
 
94
76
  ```
@@ -97,6 +79,7 @@ All fields are optional — omitted fields use defaults. No config file = all de
97
79
  ├── ocr-raw/ OCR markdown from scanned PDFs (step 3)
98
80
  ├── markdown/ Final enhanced markdown (step 4)
99
81
  ├── dataset/dataset.jsonl JSONL dataset for RAG (step 5)
82
+ ├── classification.json PDF classification results (step 1)
100
83
  ├── pipeline-log.txt Pipeline conversion log
101
84
  ├── ocr-log.txt OCR processing log
102
85
  └── errors.log Timestamped error log (all steps)
@@ -108,9 +91,13 @@ All fields are optional — omitted fields use defaults. No config file = all de
108
91
  - Supports `.doc`, `.docx`, `.pdf` (native, scanned, mixed)
109
92
  - Output files use flat naming with `--` separator: `docs/foo/bar/doc.pdf` → `foo--bar--doc.md`
110
93
 
94
+ ## PDF Fallback
95
+
96
+ When marker-pdf fails on a PDF (e.g. index out of bounds), `anymd` automatically falls back to markitdown (pdfminer-six) for text extraction.
97
+
111
98
  ## Dataset Deduplication
112
99
 
113
- Step 5 deduplicates entries by content hash. If two source files produce identical markdown, only one entry appears in the JSONL. The TUI completion summary shows the dedup count.
100
+ Step 3 deduplicates entries by content hash. If two source files produce identical markdown, only one entry appears in the JSONL. The completion summary shows the dedup count.
114
101
 
115
102
  ## Resume Support
116
103
 
package/cli.tsx CHANGED
@@ -38,5 +38,5 @@ const resolvedOutput = resolve(args.outputDir)
38
38
  initPaths(resolvedInput, resolvedOutput)
39
39
  loadConfig(args.configPath)
40
40
 
41
- const { start } = await import('./tui')
42
- await start()
41
+ const { run } = await import('./runner')
42
+ await run()
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "anymd",
3
- "version": "0.0.8",
3
+ "version": "0.0.9",
4
4
  "description": "Convert any document (PDF, DOC, DOCX) to clean Markdown for RAG",
5
5
  "keywords": [
6
6
  "markdown",
@@ -27,7 +27,7 @@
27
27
  "files": [
28
28
  "cli.tsx",
29
29
  "main.ts",
30
- "tui.tsx",
30
+ "runner.ts",
31
31
  "src/",
32
32
  "!src/__tests__/",
33
33
  "scripts/",
@@ -44,17 +44,11 @@
44
44
  "with-env": "dotenv -e ../../.env --"
45
45
  },
46
46
  "dependencies": {
47
- "@opentui/core": "^0.1.77",
48
- "@opentui/react": "^0.1.77",
49
47
  "markdownlint": "^0.40.0",
50
48
  "p-map": "^7.0.4",
51
- "react": "^19.2.4",
52
49
  "yoctocolors": "^2.1.2",
53
50
  "zod": "^4.3.6"
54
51
  },
55
- "devDependencies": {
56
- "@types/react": "^19.2.13"
57
- },
58
52
  "engines": {
59
53
  "bun": ">=1.0.0"
60
54
  },
package/runner.ts ADDED
@@ -0,0 +1,308 @@
1
+ /* eslint-disable max-statements */
2
+ import { bootstrapPython } from '~/bootstrap'
3
+ import { getPaths } from '~/paths'
4
+ import { runPreflight } from '~/preflight'
5
+ import {
6
+ appendErrorLog,
7
+ appendPipelineLog,
8
+ buildDataset,
9
+ clearErrorLog,
10
+ clearPipelineLog,
11
+ fetchStepData,
12
+ getOcrStats,
13
+ runClassify,
14
+ runEnhancePass,
15
+ spawnCommand,
16
+ writeNativeFileList
17
+ } from '~/tui-data'
18
+
19
+ const stripAnsi = (s: string): string => s.replaceAll(new RegExp(`${String.fromCodePoint(0x1b)}\\[[0-9;]*m`, 'gu'), '')
20
+
21
+ const ts = (): string => new Date().toISOString().slice(11, 19)
22
+
23
+ const log = (msg: string): void => {
24
+ process.stdout.write(`${ts()} ${msg}\n`)
25
+ }
26
+
27
+ const formatDuration = (seconds: number): string => {
28
+ const h = Math.floor(seconds / 3600)
29
+ const m = Math.floor((seconds % 3600) / 60)
30
+ const s = Math.floor(seconds % 60)
31
+ if (h > 0) return `${h}h${m.toString().padStart(2, '0')}m`
32
+ if (m > 0) return `${m}m${s.toString().padStart(2, '0')}s`
33
+ return `${s}s`
34
+ }
35
+
36
+ const LINE_SPLIT = /\r?\n|\r/u
37
+ const ERROR_PATTERN = /\b(?:ERROR|Error:|Failed:|failed|FAILED|\u2716|exception|traceback)/iu
38
+ // oxlint-disable-next-line promise/prefer-await-to-then
39
+ const noop = (): Promise<void> => Promise.resolve() // eslint-disable-line @typescript-eslint/promise-function-async
40
+
41
+ const readStream = async (stream: ReadableStream<Uint8Array>, onLine: (line: string) => void): Promise<void> => {
42
+ const reader = stream.getReader()
43
+ const decoder = new TextDecoder()
44
+ let buffer = ''
45
+ try {
46
+ for (;;) {
47
+ /** biome-ignore lint/performance/noAwaitInLoops: streaming reads */
48
+ const { done, value } = await reader.read() // eslint-disable-line no-await-in-loop
49
+ if (done) break
50
+ buffer += decoder.decode(value, { stream: true })
51
+ const parts = buffer.split(LINE_SPLIT)
52
+ buffer = parts.pop() ?? ''
53
+ for (const part of parts) if (part.trim() !== '') onLine(part)
54
+ }
55
+ if (buffer.trim() !== '') onLine(buffer)
56
+ } finally {
57
+ reader.releaseLock()
58
+ }
59
+ }
60
+
61
+ const PROGRESS_INTERVAL_MS = 5000
62
+
63
+ const runConvertStep = async (prefix: string): Promise<number> => {
64
+ await clearPipelineLog()
65
+ await writeNativeFileList()
66
+ const spawned = spawnCommand('pipeline')
67
+ if (!spawned) return -1
68
+
69
+ const onLine = (line: string): void => {
70
+ const clean = stripAnsi(line)
71
+ log(`${prefix}${clean}`)
72
+ appendPipelineLog(clean)
73
+ if (ERROR_PATTERN.test(clean)) appendErrorLog('pipeline', clean)
74
+ }
75
+
76
+ const { stderr, stdout } = spawned.proc
77
+ const stdoutP = stdout instanceof ReadableStream ? readStream(stdout, onLine) : noop()
78
+ const stderrP = stderr instanceof ReadableStream ? readStream(stderr, onLine) : noop()
79
+ await Promise.all([stdoutP, stderrP])
80
+ return spawned.proc.exited
81
+ }
82
+
83
+ const runOcrStep = async (prefix: string): Promise<number> => {
84
+ const stats = await getOcrStats()
85
+ if (stats.total === 0) {
86
+ log(`${prefix}No scanned/mixed PDFs to OCR.`)
87
+ return 0
88
+ }
89
+ if (stats.remaining === 0) {
90
+ log(`${prefix}All ${stats.total} files already OCR'd.`)
91
+ return 0
92
+ }
93
+ log(`${prefix}OCR ${stats.remaining} remaining of ${stats.total} total`)
94
+
95
+ const spawned = spawnCommand('ocr')
96
+ if (!spawned) return -1
97
+
98
+ const onLine = (line: string): void => {
99
+ const clean = stripAnsi(line)
100
+ log(`${prefix}${clean}`)
101
+ if (ERROR_PATTERN.test(clean)) appendErrorLog('ocr', clean)
102
+ }
103
+
104
+ const { stderr, stdout } = spawned.proc
105
+ const stdoutP = stdout instanceof ReadableStream ? readStream(stdout, onLine) : noop()
106
+ const stderrP = stderr instanceof ReadableStream ? readStream(stderr, onLine) : noop()
107
+ await Promise.all([stdoutP, stderrP])
108
+ return spawned.proc.exited
109
+ }
110
+
111
+ const ENHANCE_POLL_MS = 2000
112
+
113
+ const logProgress = async (startTime: number): Promise<void> => {
114
+ const d = await fetchStepData()
115
+ const parts: string[] = []
116
+ if (d.pipeline.total > 0) parts.push(`Convert ${d.pipeline.done}/${d.pipeline.total}`)
117
+ if (d.ocr.total > 0) parts.push(`OCR ${d.ocr.done}/${d.ocr.total}`)
118
+ if (d.enhance.total > 0) parts.push(`Enhance ${d.enhance.done}/${d.enhance.total}`)
119
+ if (parts.length > 0) {
120
+ const elapsed = formatDuration((Date.now() - startTime) / 1000)
121
+ log(`── ${parts.join(' · ')} · ${elapsed} elapsed ──`)
122
+ }
123
+ }
124
+
125
+ const startEnhancePoller = (
126
+ onFile: (name: string) => void
127
+ ): { stop: () => Promise<{ enhanced: number; failed: number }> } => {
128
+ const done = new Set<string>()
129
+ const interval = setInterval(() => {
130
+ // oxlint-disable-next-line promise/prefer-await-to-then
131
+ runEnhancePass(done, onFile).catch(noop)
132
+ }, ENHANCE_POLL_MS)
133
+
134
+ const stop = async (): Promise<{ enhanced: number; failed: number }> => {
135
+ clearInterval(interval)
136
+ return runEnhancePass(done, onFile)
137
+ }
138
+ return { stop }
139
+ }
140
+
141
+ const startProgressTicker = (startTime: number): ReturnType<typeof setInterval> =>
142
+ // eslint-disable-line @typescript-eslint/promise-function-async
143
+ setInterval(() => {
144
+ // oxlint-disable-next-line promise/prefer-await-to-then
145
+ logProgress(startTime).catch(noop)
146
+ }, PROGRESS_INTERVAL_MS)
147
+
148
+ const runBootstrap = async (): Promise<void> => {
149
+ log('Checking Python environment...')
150
+ const ok = await bootstrapPython({
151
+ onDone: () => log('Python environment ready.'),
152
+ onStep: (msg: string) => log(` ${msg}`)
153
+ })
154
+ if (!ok) {
155
+ log('FATAL: Python bootstrap failed. Install uv and try again.')
156
+ process.exit(1)
157
+ }
158
+ }
159
+
160
+ const runPreflightCheck = async (): Promise<void> => {
161
+ const preflight = await runPreflight()
162
+ if (preflight.errors.length > 0) {
163
+ for (const e of preflight.errors) log(`ERROR: ${e}`)
164
+ log('Fix the errors above and restart.')
165
+ process.exit(1)
166
+ }
167
+ for (const w of preflight.warnings) log(`WARN: ${w}`)
168
+ }
169
+
170
+ const runClassifyStep = async (): Promise<void> => {
171
+ const data = await fetchStepData()
172
+ const done = data.classify.done >= data.classify.total && data.classify.total > 0
173
+ if (done) {
174
+ log('Step 1/3: Classify \u2014 already done')
175
+ if (data.classify.details) for (const d of data.classify.details) log(` ${d}`)
176
+ return
177
+ }
178
+ log('Step 1/3: Classify PDFs')
179
+ const t = Date.now()
180
+ await runClassify(p => {
181
+ log(` ${p.done}/${p.total} ${p.file} → ${p.category}`)
182
+ })
183
+ const d = await fetchStepData()
184
+ if (d.classify.details) for (const det of d.classify.details) log(` ${det}`)
185
+ log(` Done in ${formatDuration((Date.now() - t) / 1000)}`)
186
+ }
187
+
188
+ const runParallelConvertOcr = async (): Promise<void> => {
189
+ log(' Convert + OCR (parallel)')
190
+ const ocrPromise = runOcrStep('[OCR] ')
191
+ const pipelineCode = await runConvertStep('[CONVERT] ')
192
+ if (pipelineCode !== 0) log(` Convert exited with code ${pipelineCode}`)
193
+ log(' Convert done, waiting for OCR...')
194
+ const ocrCode = await ocrPromise
195
+ if (ocrCode !== 0) log(` OCR exited with code ${ocrCode}`)
196
+ }
197
+
198
+ const runSequentialOcr = async (ocrNeeded: boolean, ocrDone: boolean): Promise<void> => {
199
+ if (ocrNeeded && !ocrDone) {
200
+ log(' OCR scanned PDFs')
201
+ const code = await runOcrStep(' ')
202
+ if (code !== 0) log(` OCR exited with code ${code}`)
203
+ } else if (ocrNeeded) log(' OCR \u2014 already done')
204
+ else log(' OCR \u2014 no scanned PDFs')
205
+ }
206
+
207
+ // eslint-disable-next-line max-statements
208
+ const runConvertOcrEnhance = async (startTime: number): Promise<void> => {
209
+ const data = await fetchStepData()
210
+ const pipelineDone = data.pipeline.done >= data.pipeline.total && data.pipeline.total > 0
211
+ const ocrNeeded = data.ocr.total > 0
212
+ const ocrDone = data.ocr.done >= data.ocr.total && data.ocr.total > 0
213
+ const allDone = pipelineDone && (!ocrNeeded || ocrDone)
214
+
215
+ if (allDone && data.enhance.done >= data.enhance.total && data.enhance.total > 0) {
216
+ log('Step 2/3: Convert + OCR + Enhance \u2014 already done')
217
+ return
218
+ }
219
+
220
+ log('Step 2/3: Convert + OCR + Enhance')
221
+ const ticker = startProgressTicker(startTime)
222
+ const enhancer = startEnhancePoller(name => log(`[ENHANCE] \u2713 ${name}`))
223
+ const t = Date.now()
224
+
225
+ if (!allDone) {
226
+ const parallel = !pipelineDone && ocrNeeded && !ocrDone
227
+ if (parallel) await runParallelConvertOcr()
228
+ else {
229
+ if (pipelineDone) log(' Convert \u2014 already done')
230
+ else {
231
+ log(' Convert to Markdown')
232
+ const code = await runConvertStep(' ')
233
+ if (code !== 0) log(` Convert exited with code ${code}`)
234
+ }
235
+ await runSequentialOcr(ocrNeeded, ocrDone)
236
+ }
237
+ }
238
+
239
+ clearInterval(ticker)
240
+ const enhanceResult = await enhancer.stop()
241
+ log(` Enhanced: ${enhanceResult.enhanced}, Failed: ${enhanceResult.failed}`)
242
+ log(` Done in ${formatDuration((Date.now() - t) / 1000)}`)
243
+ }
244
+
245
+ const runDatasetStep = async (): Promise<{ duplicates: number; entries: number; skipped: number; totalChars: number }> => {
246
+ log('Step 3/3: Build Dataset')
247
+ const t = Date.now()
248
+ const result = await buildDataset({
249
+ onFileResult: p => {
250
+ const icon = p.status === 'added' ? '\u2713' : p.status === 'duplicate' ? '\u2261' : '\u2192'
251
+ const charStr = p.chars >= 1000 ? `${(p.chars / 1000).toFixed(1)}K` : `${p.chars}`
252
+ log(` ${p.done}/${p.total} ${icon} ${p.file} → ${p.status} (${charStr} chars)`)
253
+ },
254
+ onReadProgress: (done, total) => {
255
+ if (done % 100 === 0 || done === total) log(` Reading ${done}/${total} files...`)
256
+ }
257
+ })
258
+ log(` Entries: ${result.entries}, Skipped: ${result.skipped}, Duplicates: ${result.duplicates}`)
259
+ log(` Total chars: ${result.totalChars.toLocaleString()}`)
260
+ log(` Done in ${formatDuration((Date.now() - t) / 1000)}`)
261
+ return result
262
+ }
263
+
264
+ const printSummary = async (
265
+ startTime: number,
266
+ dsResult: { duplicates: number; entries: number; skipped: number; totalChars: number }
267
+ ): Promise<void> => {
268
+ const data = await fetchStepData()
269
+ const elapsed = formatDuration((Date.now() - startTime) / 1000)
270
+ const sep = '\u2550'.repeat(45)
271
+ log('')
272
+ log(sep)
273
+ log(' Pipeline Complete')
274
+ log(sep)
275
+ log(` Classified: ${data.classify.done} PDFs`)
276
+ if (data.classify.details) for (const d of data.classify.details) log(` ${d}`)
277
+ log(` Converted: ${data.pipeline.done} files`)
278
+ log(` OCR: ${data.ocr.done} files`)
279
+ log(` Enhanced: ${data.enhance.done} files`)
280
+ log(` Dataset: ${dsResult.entries} entries, ${dsResult.totalChars.toLocaleString()} chars`)
281
+ if (dsResult.duplicates > 0) log(` Deduplicated: ${dsResult.duplicates}`)
282
+ if (dsResult.skipped > 0) log(` Skipped: ${dsResult.skipped} (below min length)`)
283
+ log(` Duration: ${elapsed}`)
284
+ log(` Output: ${getPaths().outputDir}`)
285
+ log(sep)
286
+ }
287
+
288
+ const run = async (): Promise<void> => {
289
+ const startTime = Date.now()
290
+
291
+ await runBootstrap()
292
+ await runPreflightCheck()
293
+ await clearErrorLog()
294
+
295
+ log('')
296
+ await runClassifyStep()
297
+ log('')
298
+ await runConvertOcrEnhance(startTime)
299
+ log('')
300
+ const dsResult = await runDatasetStep()
301
+
302
+ await printSummary(startTime, dsResult)
303
+
304
+ process.stdout.write('\u0007')
305
+ Bun.spawn(['osascript', '-e', 'display notification "Pipeline complete" with title "anymd"'])
306
+ }
307
+
308
+ export { run }
package/src/tui-data.ts CHANGED
@@ -63,19 +63,6 @@ const countFiles = async (dir: string, ext: string): Promise<number> => {
63
63
  }
64
64
  }
65
65
 
66
- const countOverlap = async (dirA: string, dirB: string, ext: string): Promise<number> => {
67
- try {
68
- const [entriesA, entriesB] = await Promise.all([readdir(dirA), readdir(dirB)])
69
- const setB = new Set<string>()
70
- for (const e of entriesB) if (e.endsWith(ext)) setB.add(e)
71
- let count = 0
72
- for (const e of entriesA) if (e.endsWith(ext) && setB.has(e)) count += 1
73
- return count
74
- } catch {
75
- return 0
76
- }
77
- }
78
-
79
66
  const readJson = async <T>(path: string): Promise<null | T> => {
80
67
  try {
81
68
  const text = await readFile(path, 'utf8')
@@ -382,6 +369,49 @@ const runEnhanceOcr = async (
382
369
  return { enhanced, failed, skipped: skippedFiles.length }
383
370
  }
384
371
 
372
+ const gatherEnhanceCandidates = async (exclude: Set<string>): Promise<{ name: string; srcPath: string }[]> => {
373
+ const p = getPaths()
374
+ const candidates: { name: string; srcPath: string }[] = []
375
+ for (const srcDir of [p.rawMd, p.ocrRaw])
376
+ try {
377
+ /** biome-ignore lint/performance/noAwaitInLoops: iterating 2 dirs */
378
+ const entries = await readdir(srcDir) // eslint-disable-line no-await-in-loop
379
+ for (const f of entries)
380
+ if (f.endsWith('.md') && !exclude.has(f)) candidates.push({ name: f, srcPath: join(srcDir, f) })
381
+ } catch {
382
+ /* Empty */
383
+ }
384
+
385
+ return candidates
386
+ }
387
+
388
+ // eslint-disable-next-line max-statements
389
+ const runEnhancePass = async (
390
+ alreadyDone: Set<string>,
391
+ onFile?: (file: string) => void
392
+ ): Promise<{ enhanced: number; failed: number }> => {
393
+ const { enhanceMarkdown } = await import('~/md-enhancer')
394
+ mkdirSync(getPaths().markdown, { recursive: true })
395
+ const candidates = await gatherEnhanceCandidates(alreadyDone)
396
+ let enhanced = 0
397
+ let failed = 0
398
+ for (const { name, srcPath } of candidates)
399
+ try {
400
+ /** biome-ignore lint/performance/noAwaitInLoops: sequential enhance */
401
+ const content = await readFile(srcPath, 'utf8') // eslint-disable-line no-await-in-loop
402
+ const result = enhanceMarkdown(content)
403
+ /** biome-ignore lint/performance/noAwaitInLoops: sequential enhance */
404
+ await writeFile(join(getPaths().markdown, name), result, 'utf8') // eslint-disable-line no-await-in-loop
405
+ alreadyDone.add(name)
406
+ enhanced += 1
407
+ onFile?.(name)
408
+ } catch {
409
+ failed += 1
410
+ }
411
+
412
+ return { enhanced, failed }
413
+ }
414
+
385
415
  const spawnCommand = (key: CommandKey): null | { args: string[]; label: string; proc: ReturnType<typeof Bun.spawn> } => {
386
416
  const p = getPaths()
387
417
  const packageRoot = join(p.scriptsDir, '..')
@@ -466,7 +496,6 @@ interface StepCounts {
466
496
  docCount: number
467
497
  finalMdCount: number
468
498
  ocrDone: number
469
- ocrEnhancedCount: number
470
499
  ocrProgress: null | OcrProgress
471
500
  ocrTotal: number
472
501
  pdfCount: number
@@ -491,9 +520,8 @@ const buildStepResults = (c: StepCounts): AllStepsData => {
491
520
  total: c.finalMdCount
492
521
  },
493
522
  enhance: {
494
- done: c.ocrEnhancedCount,
495
- requires: c.ocrTotal > 0 && c.ocrDone === 0 ? 'OCR output' : undefined,
496
- total: c.ocrDone
523
+ done: c.finalMdCount,
524
+ total: c.rawMdCount + c.ocrDone
497
525
  },
498
526
  ocr: {
499
527
  done: c.ocrDone,
@@ -512,17 +540,15 @@ const buildStepResults = (c: StepCounts): AllStepsData => {
512
540
 
513
541
  const fetchStepData = async (): Promise<AllStepsData> => {
514
542
  const p = getPaths()
515
- const [classification, ocrProgress, rawMdCount, ocrDone, finalMdCount, ocrEnhancedCount, dataCounts, datasetEntries] =
516
- await Promise.all([
517
- readJson<Classification>(p.classification),
518
- readJson<OcrProgress>(p.ocrProgress),
519
- countFiles(p.rawMd, '.md'),
520
- countFiles(p.ocrRaw, '.md'),
521
- countFiles(p.markdown, '.md'),
522
- countOverlap(p.ocrRaw, p.markdown, '.md'),
523
- countDataFiles(),
524
- countDatasetEntries()
525
- ])
543
+ const [classification, ocrProgress, rawMdCount, ocrDone, finalMdCount, dataCounts, datasetEntries] = await Promise.all([
544
+ readJson<Classification>(p.classification),
545
+ readJson<OcrProgress>(p.ocrProgress),
546
+ countFiles(p.rawMd, '.md'),
547
+ countFiles(p.ocrRaw, '.md'),
548
+ countFiles(p.markdown, '.md'),
549
+ countDataFiles(),
550
+ countDatasetEntries()
551
+ ])
526
552
 
527
553
  const ocrTotal = classification ? classification.scanned + classification.mixed : 0
528
554
 
@@ -532,7 +558,6 @@ const fetchStepData = async (): Promise<AllStepsData> => {
532
558
  docCount: dataCounts.docs,
533
559
  finalMdCount,
534
560
  ocrDone,
535
- ocrEnhancedCount,
536
561
  ocrProgress,
537
562
  ocrTotal,
538
563
  pdfCount: dataCounts.pdfs,
@@ -584,6 +609,7 @@ export {
584
609
  readLogTail,
585
610
  runClassify,
586
611
  runEnhanceOcr,
612
+ runEnhancePass,
587
613
  spawnCommand,
588
614
  writeNativeFileList
589
615
  }