anymd 0.0.8 → 0.0.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +26 -115
- package/cli.tsx +2 -2
- package/package.json +2 -8
- package/runner.ts +308 -0
- package/src/tui-data.ts +55 -29
- package/tui.tsx +0 -1107
package/README.md
CHANGED
|
@@ -2,138 +2,49 @@
|
|
|
2
2
|
|
|
3
3
|
Convert any document (PDF, DOC, DOCX) to clean Markdown for RAG. macOS Apple Silicon only.
|
|
4
4
|
|
|
5
|
-
## Install
|
|
6
|
-
|
|
7
5
|
```bash
|
|
8
6
|
bunx anymd --input-dir ./my-documents
|
|
9
7
|
```
|
|
10
8
|
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
## Usage
|
|
14
|
-
|
|
15
|
-
```bash
|
|
16
|
-
bunx anymd --input-dir <path> [--output-dir <path>] [--config <path>]
|
|
17
|
-
```
|
|
18
|
-
|
|
19
|
-
| Flag | Required | Default | Description |
|
|
20
|
-
|------|----------|---------|-------------|
|
|
21
|
-
| `--input-dir` | Yes | — | Directory containing documents (any nested structure) |
|
|
22
|
-
| `--output-dir` | No | `./output` | Where to write all output files |
|
|
23
|
-
| `--config` | No | `./config.json` | Path to configuration file |
|
|
24
|
-
|
|
25
|
-
Launches a TUI that auto-runs all 5 pipeline steps with live per-file progress. On completion: rings terminal bell, sends macOS notification, and updates the terminal title. Safe to Ctrl+C — progress is saved, re-run to resume.
|
|
26
|
-
|
|
27
|
-
## Pipeline Steps
|
|
28
|
-
|
|
29
|
-
| Step | What | Tool | Output |
|
|
30
|
-
|------|------|------|--------|
|
|
31
|
-
| 1. Classify | Detect native/scanned/mixed PDFs | pdftotext (TypeScript) | `<output-dir>/classification.json` |
|
|
32
|
-
| 2. Convert | Doc/docx/native PDF → raw markdown | soffice + markitdown + marker | `<output-dir>/raw-md/` |
|
|
33
|
-
| 3. OCR | Scanned/mixed PDF → markdown | mlx-vlm chandra-8bit | `<output-dir>/ocr-raw/` |
|
|
34
|
-
| 4. Enhance | Heading detection + cleanup | TypeScript | `<output-dir>/markdown/` |
|
|
35
|
-
| 5. Dataset | Collect all markdown → JSONL | TypeScript | `<output-dir>/dataset/dataset.jsonl` |
|
|
36
|
-
|
|
37
|
-
Steps 2 and 3 run in parallel when both are needed.
|
|
38
|
-
|
|
39
|
-
## TUI Hotkeys
|
|
40
|
-
|
|
41
|
-
| Key | Action |
|
|
42
|
-
|-----|--------|
|
|
43
|
-
| R | Retry failed step |
|
|
44
|
-
| S | Skip failed step |
|
|
45
|
-
| L | Toggle log overlay (pipeline + OCR, scrollable) |
|
|
46
|
-
| ↑↓ | Scroll log overlay |
|
|
47
|
-
| Q / ESC | Quit |
|
|
48
|
-
|
|
49
|
-
## System Requirements
|
|
50
|
-
|
|
51
|
-
- macOS with Apple Silicon (64GB recommended for OCR)
|
|
52
|
-
- [Bun](https://bun.sh) runtime
|
|
53
|
-
- [uv](https://docs.astral.sh/uv/) — `curl -LsSf https://astral.sh/uv/install.sh | sh`
|
|
54
|
-
- [poppler](https://poppler.freedesktop.org/) (`pdftotext`) — `brew install poppler`
|
|
55
|
-
- [LibreOffice](https://www.libreoffice.org/) (`soffice`) — optional, only for `.doc` files
|
|
56
|
-
|
|
57
|
-
The TUI preflight check tells you exactly what's missing.
|
|
58
|
-
|
|
59
|
-
## Auto-Bootstrap
|
|
9
|
+
## Pipeline
|
|
60
10
|
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
-
|
|
65
|
-
|
|
11
|
+
| Step | Description |
|
|
12
|
+
|------|-------------|
|
|
13
|
+
| Classify | Detect native/scanned/mixed PDFs via `pdftotext` |
|
|
14
|
+
| Convert + OCR + Enhance | Parallel conversion (marker-pdf, markitdown) and OCR (mlx-vlm chandra-8bit), with incremental enhancement |
|
|
15
|
+
| Dataset | Deduplicated JSONL from enhanced markdown |
|
|
66
16
|
|
|
67
|
-
|
|
17
|
+
Convert and OCR run in parallel. Enhancement runs incrementally as files land.
|
|
68
18
|
|
|
69
|
-
##
|
|
19
|
+
## Requirements
|
|
70
20
|
|
|
71
|
-
|
|
21
|
+
- macOS Apple Silicon (64GB recommended for OCR)
|
|
22
|
+
- [Bun](https://bun.sh), [uv](https://docs.astral.sh/uv/), [poppler](https://poppler.freedesktop.org/) (`brew install poppler`)
|
|
23
|
+
- [LibreOffice](https://www.libreoffice.org/) — optional, for `.doc` files
|
|
72
24
|
|
|
73
|
-
|
|
74
|
-
|-----|---------|-------------|
|
|
75
|
-
| `classifyBatchSize` | 20 | PDFs classified per batch |
|
|
76
|
-
| `datasetConcurrency` | 50 | Parallel file reads during dataset build |
|
|
77
|
-
| `enhanceConcurrency` | 10 | Parallel markdown enhancement workers |
|
|
78
|
-
| `markerWorkers` | 3 | Parallel marker-pdf processes for PDF → markdown |
|
|
79
|
-
| `minTextLength` | 50 | Minimum characters for a document to be included in dataset |
|
|
80
|
-
| `nativeThreshold` | 200 | Alpha chars above which a PDF is classified as native |
|
|
81
|
-
| `scannedThreshold` | 50 | Alpha chars below which a PDF is classified as scanned |
|
|
25
|
+
On first run, a Python 3.13 venv is auto-created at `~/.cache/anymd/` with all ML dependencies (~2 min).
|
|
82
26
|
|
|
83
|
-
|
|
27
|
+
## Options
|
|
84
28
|
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
- **Error log** written to `<output-dir>/errors.log` with timestamps — persists across all steps
|
|
29
|
+
```
|
|
30
|
+
--input-dir <path> Input directory (required)
|
|
31
|
+
--output-dir <path> Output directory (default: ./output)
|
|
32
|
+
--config <path> Config file (default: ./config.json)
|
|
33
|
+
```
|
|
91
34
|
|
|
92
|
-
## Output
|
|
35
|
+
## Output
|
|
93
36
|
|
|
94
37
|
```
|
|
95
38
|
<output-dir>/
|
|
96
|
-
├──
|
|
97
|
-
├──
|
|
98
|
-
├──
|
|
99
|
-
├──
|
|
100
|
-
├──
|
|
101
|
-
|
|
102
|
-
└── errors.log Timestamped error log (all steps)
|
|
39
|
+
├── markdown/ Enhanced markdown
|
|
40
|
+
├── dataset/dataset.jsonl JSONL dataset for RAG
|
|
41
|
+
├── classification.json PDF classification
|
|
42
|
+
├── raw-md/ Raw converted markdown
|
|
43
|
+
├── ocr-raw/ OCR markdown
|
|
44
|
+
└── errors.log Error log
|
|
103
45
|
```
|
|
104
46
|
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
- Put documents anywhere inside `--input-dir` in any folder structure
|
|
108
|
-
- Supports `.doc`, `.docx`, `.pdf` (native, scanned, mixed)
|
|
109
|
-
- Output files use flat naming with `--` separator: `docs/foo/bar/doc.pdf` → `foo--bar--doc.md`
|
|
110
|
-
|
|
111
|
-
## Dataset Deduplication
|
|
112
|
-
|
|
113
|
-
Step 5 deduplicates entries by content hash. If two source files produce identical markdown, only one entry appears in the JSONL. The TUI completion summary shows the dedup count.
|
|
114
|
-
|
|
115
|
-
## Resume Support
|
|
116
|
-
|
|
117
|
-
All steps support resume:
|
|
118
|
-
|
|
119
|
-
- Classify: re-runs if `classification.json` missing
|
|
120
|
-
- Convert: skips files already in `raw-md/`
|
|
121
|
-
- OCR: skips files already in `ocr-raw/`
|
|
122
|
-
- Enhance: skips files already in `markdown/`
|
|
123
|
-
- Dataset: always regenerates from `markdown/`
|
|
124
|
-
|
|
125
|
-
## OCR Details
|
|
126
|
-
|
|
127
|
-
Native PDFs use marker-pdf for structured markdown extraction (headings, bold, lists). Scanned/mixed PDFs use `mlx-community/chandra-8bit` via mlx-vlm at 150 DPI (~32s per page on Apple Silicon). Chandra was chosen for superior Vietnamese diacritical accuracy over marker's surya OCR. Mixed PDFs use native text for text-heavy pages and OCR for scanned pages.
|
|
128
|
-
|
|
129
|
-
## Development
|
|
130
|
-
|
|
131
|
-
```bash
|
|
132
|
-
bun run doc # Run locally (uses ./data as input)
|
|
133
|
-
bun test # 73 unit tests
|
|
134
|
-
bun fix # TypeScript linting (oxlint + eslint + biome + tsc)
|
|
135
|
-
ruff format && ruff check --fix # Python linting
|
|
136
|
-
```
|
|
47
|
+
Safe to Ctrl+C — re-run to resume. When marker-pdf fails, falls back to markitdown automatically.
|
|
137
48
|
|
|
138
49
|
## License
|
|
139
50
|
|
package/cli.tsx
CHANGED
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "anymd",
|
|
3
|
-
"version": "0.0.8",
|
|
3
|
+
"version": "0.0.10",
|
|
4
4
|
"description": "Convert any document (PDF, DOC, DOCX) to clean Markdown for RAG",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"markdown",
|
|
@@ -27,7 +27,7 @@
|
|
|
27
27
|
"files": [
|
|
28
28
|
"cli.tsx",
|
|
29
29
|
"main.ts",
|
|
30
|
-
"tui.tsx",
|
|
30
|
+
"runner.ts",
|
|
31
31
|
"src/",
|
|
32
32
|
"!src/__tests__/",
|
|
33
33
|
"scripts/",
|
|
@@ -44,17 +44,11 @@
|
|
|
44
44
|
"with-env": "dotenv -e ../../.env --"
|
|
45
45
|
},
|
|
46
46
|
"dependencies": {
|
|
47
|
-
"@opentui/core": "^0.1.77",
|
|
48
|
-
"@opentui/react": "^0.1.77",
|
|
49
47
|
"markdownlint": "^0.40.0",
|
|
50
48
|
"p-map": "^7.0.4",
|
|
51
|
-
"react": "^19.2.4",
|
|
52
49
|
"yoctocolors": "^2.1.2",
|
|
53
50
|
"zod": "^4.3.6"
|
|
54
51
|
},
|
|
55
|
-
"devDependencies": {
|
|
56
|
-
"@types/react": "^19.2.13"
|
|
57
|
-
},
|
|
58
52
|
"engines": {
|
|
59
53
|
"bun": ">=1.0.0"
|
|
60
54
|
},
|
package/runner.ts
ADDED
|
@@ -0,0 +1,308 @@
|
|
|
1
|
+
/* eslint-disable max-statements */
|
|
2
|
+
import { bootstrapPython } from '~/bootstrap'
|
|
3
|
+
import { getPaths } from '~/paths'
|
|
4
|
+
import { runPreflight } from '~/preflight'
|
|
5
|
+
import {
|
|
6
|
+
appendErrorLog,
|
|
7
|
+
appendPipelineLog,
|
|
8
|
+
buildDataset,
|
|
9
|
+
clearErrorLog,
|
|
10
|
+
clearPipelineLog,
|
|
11
|
+
fetchStepData,
|
|
12
|
+
getOcrStats,
|
|
13
|
+
runClassify,
|
|
14
|
+
runEnhancePass,
|
|
15
|
+
spawnCommand,
|
|
16
|
+
writeNativeFileList
|
|
17
|
+
} from '~/tui-data'
|
|
18
|
+
|
|
19
|
+
/**
 * Matches any ANSI CSI escape sequence: ESC `[`, optional parameter bytes (0x30–0x3F),
 * optional intermediate bytes (0x20–0x2F) and one final byte (0x40–0x7E).
 * Built once via `String.fromCodePoint` so no literal control character appears in the pattern.
 */
const ANSI_CSI = new RegExp(`${String.fromCodePoint(0x1b)}\\[[0-?]*[ -/]*[@-~]`, 'gu')

/**
 * Removes ANSI CSI escape sequences from `s`.
 *
 * Covers colour/style codes (`ESC[…m`) as well as cursor-movement and erase codes
 * (`ESC[2K`, `ESC[1A`, `ESC[?25l`, …) so that they do not leak into log output.
 *
 * @param s - Raw text that may contain terminal escape sequences.
 * @returns `s` with every CSI sequence removed.
 */
const stripAnsi = (s: string): string => s.replace(ANSI_CSI, '')
|
|
20
|
+
|
|
21
|
+
/**
 * Returns the current time of day as `HH:MM:SS`.
 * The value is cut out of the ISO-8601 timestamp, so it is expressed in UTC.
 */
const ts = (): string => {
  const iso = new Date().toISOString()
  return iso.substring(11, 19)
}
|
|
22
|
+
|
|
23
|
+
/**
 * Writes one line to stdout, prefixed with the timestamp produced by `ts()`.
 *
 * @param msg - Text to print; a trailing newline is appended.
 */
const log = (msg: string): void => {
  const stamped = `${ts()} ${msg}\n`
  process.stdout.write(stamped)
}
|
|
26
|
+
|
|
27
|
+
/**
 * Formats an elapsed time as a short human-readable string.
 *
 * - one hour or more: `<h>h<mm>m` (seconds are dropped)
 * - one minute or more: `<m>m<ss>s`
 * - otherwise: `<s>s`
 *
 * Callers derive `seconds` from `Date.now()` differences, which are not guaranteed to be
 * monotonic, so negative and non-finite inputs are clamped to zero instead of being
 * rendered as `-1s` or `NaNs`.
 *
 * @param seconds - Elapsed time in seconds; fractional parts are truncated.
 * @returns The formatted duration.
 */
const formatDuration = (seconds: number): string => {
  const total = Number.isFinite(seconds) && seconds > 0 ? Math.floor(seconds) : 0
  const h = Math.floor(total / 3600)
  const m = Math.floor((total % 3600) / 60)
  const s = total % 60
  const pad = (n: number): string => n.toString().padStart(2, '0')
  if (h > 0) return `${h}h${pad(m)}m`
  if (m > 0) return `${m}m${pad(s)}s`
  return `${s}s`
}
|
|
35
|
+
|
|
36
|
+
// Line separator used when chunking child-process output: LF, CRLF, or a bare CR.
// NOTE(review): the bare-CR branch presumably turns carriage-return progress updates into separate lines — confirm.
const LINE_SPLIT = /\r?\n|\r/u
|
|
37
|
+
/**
 * Heuristic for output lines that should be copied to the error log.
 *
 * Matches (case-insensitively) a word starting with `error`, `failed`, `exception` or
 * `traceback`, or the heavy cross mark "✖" (U+2716) anywhere in the line. The cross mark
 * sits outside the `\b` group: it is not a word character, so a preceding word boundary
 * would only match when "✖" directly follows a letter or digit.
 */
const ERROR_PATTERN = /\u2716|\b(?:error|failed|exception|traceback)/iu
|
|
38
|
+
// oxlint-disable-next-line promise/prefer-await-to-then
|
|
39
|
+
/**
 * Does nothing and yields an already-resolved promise.
 * Used as the stand-in when a child process exposes no readable stream and as a
 * rejection handler for fire-and-forget calls.
 */
const noop = async (): Promise<void> => {
  /* intentionally empty */
}
|
|
40
|
+
|
|
41
|
+
/**
 * Reads a byte stream to completion, decoding it as UTF-8 and invoking `onLine` for every
 * non-blank line (lines are delimited by `LINE_SPLIT`).
 *
 * Text that has not yet been terminated by a line break is buffered across chunks. When the
 * stream ends the decoder is flushed, so bytes of an incomplete trailing multi-byte sequence
 * are surfaced (as U+FFFD) instead of being silently discarded, and the remaining buffer is
 * emitted as the last line.
 *
 * @param stream - Source of raw bytes, e.g. a child process's stdout.
 * @param onLine - Callback receiving each non-blank line without its line terminator.
 * @returns A promise that resolves once the stream is exhausted; the reader lock is always released.
 */
const readStream = async (stream: ReadableStream<Uint8Array>, onLine: (line: string) => void): Promise<void> => {
  const reader = stream.getReader()
  const decoder = new TextDecoder()
  let buffer = ''
  // Whitespace-only lines are dropped.
  const emit = (line: string): void => {
    if (line.trim() !== '') onLine(line)
  }
  try {
    for (;;) {
      /** biome-ignore lint/performance/noAwaitInLoops: streaming reads */
      const { done, value } = await reader.read() // eslint-disable-line no-await-in-loop
      if (done) break
      // `stream: true` keeps a partially received multi-byte character inside the decoder.
      buffer += decoder.decode(value, { stream: true })
      const parts = buffer.split(LINE_SPLIT)
      // The last piece has no terminator yet; keep it for the next chunk.
      buffer = parts.pop() ?? ''
      for (const part of parts) emit(part)
    }
    // Flush whatever the decoder is still holding before emitting the final line.
    buffer += decoder.decode()
    emit(buffer)
  } finally {
    reader.releaseLock()
  }
}
|
|
60
|
+
|
|
61
|
+
// Period, in milliseconds, of the progress-summary timer started by startProgressTicker.
const PROGRESS_INTERVAL_MS = 5000
|
|
62
|
+
|
|
63
|
+
/**
 * Feeds a spawned child's stdout and stderr through `onLine` and resolves with the child's
 * exit code once both streams have been fully read.
 * A stream that is not a `ReadableStream` is skipped.
 */
const drainProcess = async (proc: ReturnType<typeof Bun.spawn>, onLine: (line: string) => void): Promise<number> => {
  const { stderr, stdout } = proc
  const stdoutP = stdout instanceof ReadableStream ? readStream(stdout, onLine) : noop()
  const stderrP = stderr instanceof ReadableStream ? readStream(stderr, onLine) : noop()
  await Promise.all([stdoutP, stderrP])
  return proc.exited
}

/**
 * Runs the `pipeline` command (document → raw markdown conversion).
 *
 * Clears the pipeline log and writes the native file list first. Every output line is
 * stripped of ANSI codes, echoed with `prefix`, appended to the pipeline log and — when it
 * matches `ERROR_PATTERN` — to the error log as well.
 *
 * @param prefix - Text prepended to each echoed output line.
 * @returns The child's exit code, or `-1` when `spawnCommand` returns nothing.
 */
const runConvertStep = async (prefix: string): Promise<number> => {
  await clearPipelineLog()
  await writeNativeFileList()
  const spawned = spawnCommand('pipeline')
  if (!spawned) return -1

  return drainProcess(spawned.proc, line => {
    const clean = stripAnsi(line)
    log(`${prefix}${clean}`)
    // NOTE(review): the append* results are not awaited here — confirm they cannot reject.
    appendPipelineLog(clean)
    if (ERROR_PATTERN.test(clean)) appendErrorLog('pipeline', clean)
  })
}

/**
 * Runs the `ocr` command for scanned/mixed PDFs.
 *
 * Returns early with `0` when there is nothing to OCR or every file is already done.
 * Otherwise each output line is stripped of ANSI codes, echoed with `prefix`, and copied to
 * the error log when it matches `ERROR_PATTERN`.
 *
 * @param prefix - Text prepended to each echoed output line.
 * @returns The child's exit code, `0` when no work was needed, or `-1` when `spawnCommand` returns nothing.
 */
const runOcrStep = async (prefix: string): Promise<number> => {
  const stats = await getOcrStats()
  if (stats.total === 0) {
    log(`${prefix}No scanned/mixed PDFs to OCR.`)
    return 0
  }
  if (stats.remaining === 0) {
    log(`${prefix}All ${stats.total} files already OCR'd.`)
    return 0
  }
  log(`${prefix}OCR ${stats.remaining} remaining of ${stats.total} total`)

  const spawned = spawnCommand('ocr')
  if (!spawned) return -1

  return drainProcess(spawned.proc, line => {
    const clean = stripAnsi(line)
    log(`${prefix}${clean}`)
    if (ERROR_PATTERN.test(clean)) appendErrorLog('ocr', clean)
  })
}
|
|
110
|
+
|
|
111
|
+
// Period, in milliseconds, at which startEnhancePoller triggers an incremental enhance pass.
const ENHANCE_POLL_MS = 2000
|
|
112
|
+
|
|
113
|
+
/**
 * Logs a one-line progress summary (`Convert x/y · OCR x/y · Enhance x/y · <elapsed> elapsed`).
 * Stages whose total is zero are left out; nothing is logged when every stage is empty.
 *
 * @param startTime - Epoch milliseconds at which the run started, used for the elapsed time.
 */
const logProgress = async (startTime: number): Promise<void> => {
  const { enhance, ocr, pipeline } = await fetchStepData()
  const stages = [
    { label: 'Convert', step: pipeline },
    { label: 'OCR', step: ocr },
    { label: 'Enhance', step: enhance }
  ]
  const parts = stages
    .filter(({ step }) => step.total > 0)
    .map(({ label, step }) => `${label} ${step.done}/${step.total}`)
  if (parts.length === 0) return

  const elapsed = formatDuration((Date.now() - startTime) / 1000)
  log(`── ${parts.join(' · ')} · ${elapsed} elapsed ──`)
}
|
|
124
|
+
|
|
125
|
+
/**
 * Starts a background timer that runs an incremental enhance pass every `ENHANCE_POLL_MS`.
 *
 * Passes never overlap: a tick that fires while the previous pass is still running is
 * skipped, so no file is enhanced (or reported through `onFile`) twice by concurrent passes.
 * Errors from a background pass are ignored; the next tick, or `stop()`, retries.
 *
 * @param onFile - Invoked with the file name each time a file has been enhanced.
 * @returns An object whose `stop()` cancels the timer, waits for a pass that is still in
 *   flight, runs one final pass and resolves with `enhanced` — the number of files enhanced
 *   across all passes of this poller — and `failed` — the failures of the final pass.
 *   `stop()` rejects if the final pass rejects.
 */
const startEnhancePoller = (
  onFile: (name: string) => void
): { stop: () => Promise<{ enhanced: number; failed: number }> } => {
  const done = new Set<string>()
  let enhancedSoFar = 0
  let running = false
  let current: Promise<void> = Promise.resolve()

  const runBackgroundPass = async (): Promise<void> => {
    try {
      const { enhanced } = await runEnhancePass(done, onFile)
      enhancedSoFar += enhanced
    } catch {
      /* Best-effort: a failed background pass is retried by the next tick or by stop(). */
    }
    running = false
  }

  const interval = setInterval(() => {
    // Skip this tick while the previous pass is still working through its candidates.
    if (running) return
    running = true
    current = runBackgroundPass()
  }, ENHANCE_POLL_MS)

  const stop = async (): Promise<{ enhanced: number; failed: number }> => {
    clearInterval(interval)
    // Let an in-flight pass finish so the final pass does not race it on the same files.
    await current
    const last = await runEnhancePass(done, onFile)
    return { enhanced: enhancedSoFar + last.enhanced, failed: last.failed }
  }
  return { stop }
}
|
|
140
|
+
|
|
141
|
+
/**
 * Starts a timer that logs a progress summary every `PROGRESS_INTERVAL_MS`.
 * Failures of an individual summary are ignored.
 *
 * @param startTime - Epoch milliseconds at which the run started.
 * @returns The timer handle; pass it to `clearInterval` to stop the ticker.
 */
const startProgressTicker = (startTime: number): ReturnType<typeof setInterval> => {
  const tick = (): void => {
    // oxlint-disable-next-line promise/prefer-await-to-then
    logProgress(startTime).catch(noop)
  }
  return setInterval(tick, PROGRESS_INTERVAL_MS)
}
|
|
147
|
+
|
|
148
|
+
/**
 * Ensures the Python environment is available, logging each bootstrap step.
 * Terminates the process with exit code 1 when `bootstrapPython` reports failure.
 */
const runBootstrap = async (): Promise<void> => {
  log('Checking Python environment...')
  const succeeded = await bootstrapPython({
    onDone: () => {
      log('Python environment ready.')
    },
    onStep: (msg: string) => {
      log(` ${msg}`)
    }
  })
  if (succeeded) return

  log('FATAL: Python bootstrap failed. Install uv and try again.')
  process.exit(1)
}
|
|
159
|
+
|
|
160
|
+
/**
 * Runs the preflight checks. Any reported error is logged and the process exits with
 * code 1; warnings are logged and the run continues.
 */
const runPreflightCheck = async (): Promise<void> => {
  const { errors, warnings } = await runPreflight()
  if (errors.length > 0) {
    for (const problem of errors) log(`ERROR: ${problem}`)
    log('Fix the errors above and restart.')
    process.exit(1)
  }
  for (const warning of warnings) log(`WARN: ${warning}`)
}
|
|
169
|
+
|
|
170
|
+
/**
 * Step 1/3: classifies PDFs.
 *
 * Skips the work when the classify counters show a non-empty, fully processed set;
 * otherwise runs `runClassify`, logging one line per classified file. In both cases the
 * classification detail lines (when present) are logged.
 */
const runClassifyStep = async (): Promise<void> => {
  type StepData = Awaited<ReturnType<typeof fetchStepData>>
  const logDetails = ({ classify }: StepData): void => {
    if (classify.details) for (const line of classify.details) log(` ${line}`)
  }

  const before = await fetchStepData()
  const { done: classified, total } = before.classify
  if (total > 0 && classified >= total) {
    log('Step 1/3: Classify \u2014 already done')
    logDetails(before)
    return
  }

  log('Step 1/3: Classify PDFs')
  const startedAt = Date.now()
  await runClassify(progress => {
    log(` ${progress.done}/${progress.total} ${progress.file} → ${progress.category}`)
  })
  const after = await fetchStepData()
  logDetails(after)
  log(` Done in ${formatDuration((Date.now() - startedAt) / 1000)}`)
}
|
|
187
|
+
|
|
188
|
+
/**
 * Runs the convert and OCR steps concurrently and logs a non-zero exit code of either.
 *
 * Both promises are awaited together through `Promise.all`, so each has a rejection handler
 * from the moment it is created. Awaiting convert first and OCR afterwards would leave an
 * early OCR failure unhandled while convert is still running.
 *
 * @returns A promise that resolves when both steps have finished and rejects as soon as
 *   either step rejects.
 */
const runParallelConvertOcr = async (): Promise<void> => {
  log(' Convert + OCR (parallel)')
  // OCR is started first, then convert; neither waits for the other.
  const ocrPromise = runOcrStep('[OCR] ')
  const convertPromise = (async (): Promise<void> => {
    const pipelineCode = await runConvertStep('[CONVERT] ')
    if (pipelineCode !== 0) log(` Convert exited with code ${pipelineCode}`)
    log(' Convert done, waiting for OCR...')
  })()

  const [, ocrCode] = await Promise.all([convertPromise, ocrPromise])
  if (ocrCode !== 0) log(` OCR exited with code ${ocrCode}`)
}
|
|
197
|
+
|
|
198
|
+
/**
 * Runs the OCR step on its own (not in parallel with convert), or logs why it is skipped.
 *
 * @param ocrNeeded - Whether any PDFs require OCR at all.
 * @param ocrDone - Whether every PDF requiring OCR has already been processed.
 */
const runSequentialOcr = async (ocrNeeded: boolean, ocrDone: boolean): Promise<void> => {
  if (!ocrNeeded) {
    log(' OCR \u2014 no scanned PDFs')
    return
  }
  if (ocrDone) {
    log(' OCR \u2014 already done')
    return
  }
  log(' OCR scanned PDFs')
  const exitCode = await runOcrStep(' ')
  if (exitCode !== 0) log(` OCR exited with code ${exitCode}`)
}
|
|
206
|
+
|
|
207
|
+
/**
 * Step 2/3: converts documents, OCRs scanned PDFs and enhances the resulting markdown.
 *
 * Convert and OCR run in parallel when both still have work; otherwise whichever is
 * outstanding runs on its own. While they run, a progress ticker logs periodic summaries
 * and an enhance poller picks up newly produced markdown. The whole step is skipped when
 * the counters show convert, OCR (if needed) and enhance as complete.
 *
 * The ticker and the poller are always shut down, including when convert or OCR throws,
 * so that their interval timers cannot outlive a failed step.
 *
 * @param startTime - Epoch milliseconds at which the whole run started (used by the ticker).
 */
// eslint-disable-next-line max-statements
const runConvertOcrEnhance = async (startTime: number): Promise<void> => {
  const data = await fetchStepData()
  const pipelineDone = data.pipeline.done >= data.pipeline.total && data.pipeline.total > 0
  const ocrNeeded = data.ocr.total > 0
  const ocrDone = data.ocr.done >= data.ocr.total && data.ocr.total > 0
  const allDone = pipelineDone && (!ocrNeeded || ocrDone)

  if (allDone && data.enhance.done >= data.enhance.total && data.enhance.total > 0) {
    log('Step 2/3: Convert + OCR + Enhance \u2014 already done')
    return
  }

  log('Step 2/3: Convert + OCR + Enhance')
  const ticker = startProgressTicker(startTime)
  const enhancer = startEnhancePoller(name => log(`[ENHANCE] \u2713 ${name}`))
  const t = Date.now()

  try {
    if (!allDone) {
      const parallel = !pipelineDone && ocrNeeded && !ocrDone
      if (parallel) await runParallelConvertOcr()
      else {
        if (pipelineDone) log(' Convert \u2014 already done')
        else {
          log(' Convert to Markdown')
          const code = await runConvertStep(' ')
          if (code !== 0) log(` Convert exited with code ${code}`)
        }
        await runSequentialOcr(ocrNeeded, ocrDone)
      }
    }
  } catch (error) {
    // Stop both timers before propagating, otherwise they keep firing after the failure.
    clearInterval(ticker)
    try {
      await enhancer.stop()
    } catch {
      /* The original failure is the one worth reporting. */
    }
    throw error
  }

  clearInterval(ticker)
  const enhanceResult = await enhancer.stop()
  log(` Enhanced: ${enhanceResult.enhanced}, Failed: ${enhanceResult.failed}`)
  log(` Done in ${formatDuration((Date.now() - t) / 1000)}`)
}
|
|
244
|
+
|
|
245
|
+
/**
 * Step 3/3: builds the dataset, logging one line per processed file, periodic read
 * progress (every 100 files and at the end) and a closing summary.
 *
 * @returns The counters reported by `buildDataset`.
 */
const runDatasetStep = async (): Promise<{ duplicates: number; entries: number; skipped: number; totalChars: number }> => {
  const iconFor = (status: string): string => {
    if (status === 'added') return '\u2713'
    if (status === 'duplicate') return '\u2261'
    return '\u2192'
  }
  const formatChars = (chars: number): string => (chars >= 1000 ? `${(chars / 1000).toFixed(1)}K` : `${chars}`)

  log('Step 3/3: Build Dataset')
  const startedAt = Date.now()
  const result = await buildDataset({
    onFileResult: p => {
      log(` ${p.done}/${p.total} ${iconFor(p.status)} ${p.file} → ${p.status} (${formatChars(p.chars)} chars)`)
    },
    onReadProgress: (done, total) => {
      const milestone = done % 100 === 0 || done === total
      if (milestone) log(` Reading ${done}/${total} files...`)
    }
  })
  const { duplicates, entries, skipped, totalChars } = result
  log(` Entries: ${entries}, Skipped: ${skipped}, Duplicates: ${duplicates}`)
  log(` Total chars: ${totalChars.toLocaleString()}`)
  log(` Done in ${formatDuration((Date.now() - startedAt) / 1000)}`)
  return result
}
|
|
263
|
+
|
|
264
|
+
/**
 * Logs the closing summary: per-step counts, dataset statistics, total duration and the
 * output directory. Deduplicated and skipped counts are only shown when non-zero.
 *
 * @param startTime - Epoch milliseconds at which the run started.
 * @param dsResult - Counters returned by the dataset step.
 */
const printSummary = async (
  startTime: number,
  dsResult: { duplicates: number; entries: number; skipped: number; totalChars: number }
): Promise<void> => {
  const { classify, enhance, ocr, pipeline } = await fetchStepData()
  const elapsed = formatDuration((Date.now() - startTime) / 1000)
  const sep = '\u2550'.repeat(45)

  const lines: string[] = ['', sep, ' Pipeline Complete', sep, ` Classified: ${classify.done} PDFs`]
  if (classify.details) for (const detail of classify.details) lines.push(` ${detail}`)
  lines.push(
    ` Converted: ${pipeline.done} files`,
    ` OCR: ${ocr.done} files`,
    ` Enhanced: ${enhance.done} files`,
    ` Dataset: ${dsResult.entries} entries, ${dsResult.totalChars.toLocaleString()} chars`
  )
  if (dsResult.duplicates > 0) lines.push(` Deduplicated: ${dsResult.duplicates}`)
  if (dsResult.skipped > 0) lines.push(` Skipped: ${dsResult.skipped} (below min length)`)
  lines.push(` Duration: ${elapsed}`, ` Output: ${getPaths().outputDir}`, sep)

  for (const line of lines) log(line)
}
|
|
287
|
+
|
|
288
|
+
/**
 * Entry point of the non-interactive runner.
 *
 * Bootstraps Python, runs the preflight checks, clears the error log, then executes the
 * three pipeline steps in order and prints the summary. On completion it rings the terminal
 * bell and posts a desktop notification through `osascript`.
 */
const run = async (): Promise<void> => {
  const startTime = Date.now()
  const blankLine = (): void => {
    log('')
  }

  await runBootstrap()
  await runPreflightCheck()
  await clearErrorLog()

  blankLine()
  await runClassifyStep()
  blankLine()
  await runConvertOcrEnhance(startTime)
  blankLine()
  const dsResult = await runDatasetStep()

  await printSummary(startTime, dsResult)

  // BEL character: audible/visual terminal bell.
  process.stdout.write('\u0007')
  const notifyCommand = ['osascript', '-e', 'display notification "Pipeline complete" with title "anymd"']
  Bun.spawn(notifyCommand)
}
|
|
307
|
+
|
|
308
|
+
export { run }
|
package/src/tui-data.ts
CHANGED
|
@@ -63,19 +63,6 @@ const countFiles = async (dir: string, ext: string): Promise<number> => {
|
|
|
63
63
|
}
|
|
64
64
|
}
|
|
65
65
|
|
|
66
|
-
const countOverlap = async (dirA: string, dirB: string, ext: string): Promise<number> => {
|
|
67
|
-
try {
|
|
68
|
-
const [entriesA, entriesB] = await Promise.all([readdir(dirA), readdir(dirB)])
|
|
69
|
-
const setB = new Set<string>()
|
|
70
|
-
for (const e of entriesB) if (e.endsWith(ext)) setB.add(e)
|
|
71
|
-
let count = 0
|
|
72
|
-
for (const e of entriesA) if (e.endsWith(ext) && setB.has(e)) count += 1
|
|
73
|
-
return count
|
|
74
|
-
} catch {
|
|
75
|
-
return 0
|
|
76
|
-
}
|
|
77
|
-
}
|
|
78
|
-
|
|
79
66
|
const readJson = async <T>(path: string): Promise<null | T> => {
|
|
80
67
|
try {
|
|
81
68
|
const text = await readFile(path, 'utf8')
|
|
@@ -382,6 +369,49 @@ const runEnhanceOcr = async (
|
|
|
382
369
|
return { enhanced, failed, skipped: skippedFiles.length }
|
|
383
370
|
}
|
|
384
371
|
|
|
372
|
+
/**
 * Lists the `.md` files in the raw-markdown and OCR output directories that are not in
 * `exclude`. A directory that cannot be read contributes no candidates.
 *
 * @param exclude - File names to leave out (typically those already enhanced).
 * @returns One `{ name, srcPath }` entry per candidate, raw-markdown directory first.
 */
const gatherEnhanceCandidates = async (exclude: Set<string>): Promise<{ name: string; srcPath: string }[]> => {
  const { ocrRaw, rawMd } = getPaths()
  const found: { name: string; srcPath: string }[] = []
  for (const dir of [rawMd, ocrRaw])
    try {
      /** biome-ignore lint/performance/noAwaitInLoops: iterating 2 dirs */
      const listing = await readdir(dir) // eslint-disable-line no-await-in-loop
      for (const file of listing) {
        const wanted = file.endsWith('.md') && !exclude.has(file)
        if (wanted) found.push({ name: file, srcPath: join(dir, file) })
      }
    } catch {
      /* Directory missing or unreadable: nothing to collect from it. */
    }

  return found
}
|
|
387
|
+
|
|
388
|
+
/**
 * Runs one enhancement pass: every candidate markdown file not yet in `alreadyDone` is read,
 * passed through `enhanceMarkdown` and written to the enhanced-markdown directory under the
 * same file name. Files are processed one after another.
 *
 * @param alreadyDone - Names of files enhanced so far; successfully processed names are added to it.
 * @param onFile - Optional callback invoked with the file name after each successful write.
 * @returns How many files were enhanced and how many failed during this pass.
 */
// eslint-disable-next-line max-statements
const runEnhancePass = async (
  alreadyDone: Set<string>,
  onFile?: (file: string) => void
): Promise<{ enhanced: number; failed: number }> => {
  const { enhanceMarkdown } = await import('~/md-enhancer')
  mkdirSync(getPaths().markdown, { recursive: true })
  const tally = { enhanced: 0, failed: 0 }

  const enhanceOne = async (name: string, srcPath: string): Promise<void> => {
    const raw = await readFile(srcPath, 'utf8')
    const improved = enhanceMarkdown(raw)
    await writeFile(join(getPaths().markdown, name), improved, 'utf8')
    alreadyDone.add(name)
    tally.enhanced += 1
    onFile?.(name)
  }

  const pending = await gatherEnhanceCandidates(alreadyDone)
  for (const { name, srcPath } of pending)
    try {
      /** biome-ignore lint/performance/noAwaitInLoops: sequential enhance */
      await enhanceOne(name, srcPath) // eslint-disable-line no-await-in-loop
    } catch {
      // A file that fails is not added to alreadyDone, so a later pass retries it.
      tally.failed += 1
    }

  return tally
}
|
|
414
|
+
|
|
385
415
|
const spawnCommand = (key: CommandKey): null | { args: string[]; label: string; proc: ReturnType<typeof Bun.spawn> } => {
|
|
386
416
|
const p = getPaths()
|
|
387
417
|
const packageRoot = join(p.scriptsDir, '..')
|
|
@@ -466,7 +496,6 @@ interface StepCounts {
|
|
|
466
496
|
docCount: number
|
|
467
497
|
finalMdCount: number
|
|
468
498
|
ocrDone: number
|
|
469
|
-
ocrEnhancedCount: number
|
|
470
499
|
ocrProgress: null | OcrProgress
|
|
471
500
|
ocrTotal: number
|
|
472
501
|
pdfCount: number
|
|
@@ -491,9 +520,8 @@ const buildStepResults = (c: StepCounts): AllStepsData => {
|
|
|
491
520
|
total: c.finalMdCount
|
|
492
521
|
},
|
|
493
522
|
enhance: {
|
|
494
|
-
done: c.
|
|
495
|
-
|
|
496
|
-
total: c.ocrDone
|
|
523
|
+
done: c.finalMdCount,
|
|
524
|
+
total: c.rawMdCount + c.ocrDone
|
|
497
525
|
},
|
|
498
526
|
ocr: {
|
|
499
527
|
done: c.ocrDone,
|
|
@@ -512,17 +540,15 @@ const buildStepResults = (c: StepCounts): AllStepsData => {
|
|
|
512
540
|
|
|
513
541
|
const fetchStepData = async (): Promise<AllStepsData> => {
|
|
514
542
|
const p = getPaths()
|
|
515
|
-
const [classification, ocrProgress, rawMdCount, ocrDone, finalMdCount,
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
countDatasetEntries()
|
|
525
|
-
])
|
|
543
|
+
const [classification, ocrProgress, rawMdCount, ocrDone, finalMdCount, dataCounts, datasetEntries] = await Promise.all([
|
|
544
|
+
readJson<Classification>(p.classification),
|
|
545
|
+
readJson<OcrProgress>(p.ocrProgress),
|
|
546
|
+
countFiles(p.rawMd, '.md'),
|
|
547
|
+
countFiles(p.ocrRaw, '.md'),
|
|
548
|
+
countFiles(p.markdown, '.md'),
|
|
549
|
+
countDataFiles(),
|
|
550
|
+
countDatasetEntries()
|
|
551
|
+
])
|
|
526
552
|
|
|
527
553
|
const ocrTotal = classification ? classification.scanned + classification.mixed : 0
|
|
528
554
|
|
|
@@ -532,7 +558,6 @@ const fetchStepData = async (): Promise<AllStepsData> => {
|
|
|
532
558
|
docCount: dataCounts.docs,
|
|
533
559
|
finalMdCount,
|
|
534
560
|
ocrDone,
|
|
535
|
-
ocrEnhancedCount,
|
|
536
561
|
ocrProgress,
|
|
537
562
|
ocrTotal,
|
|
538
563
|
pdfCount: dataCounts.pdfs,
|
|
@@ -584,6 +609,7 @@ export {
|
|
|
584
609
|
readLogTail,
|
|
585
610
|
runClassify,
|
|
586
611
|
runEnhanceOcr,
|
|
612
|
+
runEnhancePass,
|
|
587
613
|
spawnCommand,
|
|
588
614
|
writeNativeFileList
|
|
589
615
|
}
|