anymd 0.0.7 → 0.0.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +14 -27
- package/cli.tsx +2 -2
- package/package.json +2 -8
- package/runner.ts +308 -0
- package/scripts/pdf-to-md.py +38 -9
- package/src/bootstrap.ts +1 -1
- package/src/tui-data.ts +55 -29
- package/tui.tsx +0 -1105
package/README.md
CHANGED
|
@@ -22,29 +22,17 @@ bunx anymd --input-dir <path> [--output-dir <path>] [--config <path>]
|
|
|
22
22
|
| `--output-dir` | No | `./output` | Where to write all output files |
|
|
23
23
|
| `--config` | No | `./config.json` | Path to configuration file |
|
|
24
24
|
|
|
25
|
-
|
|
25
|
+
Runs a 3-step pipeline with verbose per-file progress printed to stdout. A progress summary prints every 5 seconds during long-running steps. On completion: rings terminal bell and sends macOS notification. Safe to Ctrl+C — progress is saved, re-run to resume.
|
|
26
26
|
|
|
27
27
|
## Pipeline Steps
|
|
28
28
|
|
|
29
29
|
| Step | What | Tool | Output |
|
|
30
30
|
|------|------|------|--------|
|
|
31
31
|
| 1. Classify | Detect native/scanned/mixed PDFs | pdftotext (TypeScript) | `<output-dir>/classification.json` |
|
|
32
|
-
| 2. Convert | Doc/docx/native PDF →
|
|
33
|
-
| 3.
|
|
34
|
-
| 4. Enhance | Heading detection + cleanup | TypeScript | `<output-dir>/markdown/` |
|
|
35
|
-
| 5. Dataset | Collect all markdown → JSONL | TypeScript | `<output-dir>/dataset/dataset.jsonl` |
|
|
32
|
+
| 2. Convert + OCR + Enhance | Doc/docx/native PDF → markdown, scanned/mixed PDF → OCR markdown, heading detection + cleanup | marker + markitdown + mlx-vlm + TypeScript | `<output-dir>/markdown/` |
|
|
33
|
+
| 3. Dataset | Collect all markdown → JSONL | TypeScript | `<output-dir>/dataset/dataset.jsonl` |
|
|
36
34
|
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
## TUI Hotkeys
|
|
40
|
-
|
|
41
|
-
| Key | Action |
|
|
42
|
-
|-----|--------|
|
|
43
|
-
| R | Retry failed step |
|
|
44
|
-
| S | Skip failed step |
|
|
45
|
-
| L | Toggle log overlay (pipeline + OCR, scrollable) |
|
|
46
|
-
| ↑↓ | Scroll log overlay |
|
|
47
|
-
| Q / ESC | Quit |
|
|
35
|
+
Convert and OCR run in parallel. Enhancement runs incrementally as files are ready — no separate step needed.
|
|
48
36
|
|
|
49
37
|
## System Requirements
|
|
50
38
|
|
|
@@ -54,17 +42,18 @@ Steps 2 and 3 run in parallel when both are needed.
|
|
|
54
42
|
- [poppler](https://poppler.freedesktop.org/) (`pdftotext`) — `brew install poppler`
|
|
55
43
|
- [LibreOffice](https://www.libreoffice.org/) (`soffice`) — optional, only for `.doc` files
|
|
56
44
|
|
|
57
|
-
|
|
45
|
+
A preflight check runs at startup and tells you exactly what's missing.
|
|
58
46
|
|
|
59
47
|
## Auto-Bootstrap
|
|
60
48
|
|
|
61
49
|
On first run, `anymd` uses `uv` to create `~/.cache/anymd/.venv` (Python 3.13) and installs:
|
|
62
50
|
- `marker-pdf` — PDF to markdown conversion
|
|
63
|
-
- `markitdown` — DOCX
|
|
51
|
+
- `markitdown[docx,pdf]` — DOCX and PDF fallback conversion
|
|
64
52
|
- `mlx-vlm` — Apple Silicon MLX inference for OCR
|
|
65
53
|
- `pypdfium2` — PDF page rendering
|
|
54
|
+
- `torchvision` — image processing for OCR
|
|
66
55
|
|
|
67
|
-
This takes ~2 minutes. Progress is
|
|
56
|
+
This takes ~2 minutes. Progress is printed to stdout. Subsequent runs detect the existing venv and skip setup.
|
|
68
57
|
|
|
69
58
|
## Configuration
|
|
70
59
|
|
|
@@ -82,13 +71,6 @@ Create a `config.json` in your working directory (or pass `--config <path>`):
|
|
|
82
71
|
|
|
83
72
|
All fields are optional — omitted fields use defaults. No config file = all defaults.
|
|
84
73
|
|
|
85
|
-
## Notifications
|
|
86
|
-
|
|
87
|
-
- **Terminal title** updates with current step, progress percentage, and parallel status
|
|
88
|
-
- **Terminal bell** rings on pipeline completion
|
|
89
|
-
- **macOS notification** fires on completion via `osascript`
|
|
90
|
-
- **Error log** written to `<output-dir>/errors.log` with timestamps — persists across all steps
|
|
91
|
-
|
|
92
74
|
## Output Structure
|
|
93
75
|
|
|
94
76
|
```
|
|
@@ -97,6 +79,7 @@ All fields are optional — omitted fields use defaults. No config file = all de
|
|
|
97
79
|
├── ocr-raw/ OCR markdown from scanned PDFs (step 2)
|
|
98
80
|
├── markdown/ Final enhanced markdown (step 2)
|
|
99
81
|
├── dataset/dataset.jsonl JSONL dataset for RAG (step 3)
|
|
82
|
+
├── classification.json PDF classification results (step 1)
|
|
100
83
|
├── pipeline-log.txt Pipeline conversion log
|
|
101
84
|
├── ocr-log.txt OCR processing log
|
|
102
85
|
└── errors.log Timestamped error log (all steps)
|
|
@@ -108,9 +91,13 @@ All fields are optional — omitted fields use defaults. No config file = all de
|
|
|
108
91
|
- Supports `.doc`, `.docx`, `.pdf` (native, scanned, mixed)
|
|
109
92
|
- Output files use flat naming with `--` separator: `docs/foo/bar/doc.pdf` → `foo--bar--doc.md`
|
|
110
93
|
|
|
94
|
+
## PDF Fallback
|
|
95
|
+
|
|
96
|
+
When marker-pdf fails on a PDF (e.g. index out of bounds), `anymd` automatically falls back to markitdown (pdfminer-six) for text extraction.
|
|
97
|
+
|
|
111
98
|
## Dataset Deduplication
|
|
112
99
|
|
|
113
|
-
Step
|
|
100
|
+
Step 3 deduplicates entries by content hash. If two source files produce identical markdown, only one entry appears in the JSONL. The completion summary shows the dedup count.
|
|
114
101
|
|
|
115
102
|
## Resume Support
|
|
116
103
|
|
package/cli.tsx
CHANGED
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "anymd",
|
|
3
|
-
"version": "0.0.7",
|
|
3
|
+
"version": "0.0.9",
|
|
4
4
|
"description": "Convert any document (PDF, DOC, DOCX) to clean Markdown for RAG",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"markdown",
|
|
@@ -27,7 +27,7 @@
|
|
|
27
27
|
"files": [
|
|
28
28
|
"cli.tsx",
|
|
29
29
|
"main.ts",
|
|
30
|
-
"tui.tsx",
|
|
30
|
+
"runner.ts",
|
|
31
31
|
"src/",
|
|
32
32
|
"!src/__tests__/",
|
|
33
33
|
"scripts/",
|
|
@@ -44,17 +44,11 @@
|
|
|
44
44
|
"with-env": "dotenv -e ../../.env --"
|
|
45
45
|
},
|
|
46
46
|
"dependencies": {
|
|
47
|
-
"@opentui/core": "^0.1.77",
|
|
48
|
-
"@opentui/react": "^0.1.77",
|
|
49
47
|
"markdownlint": "^0.40.0",
|
|
50
48
|
"p-map": "^7.0.4",
|
|
51
|
-
"react": "^19.2.4",
|
|
52
49
|
"yoctocolors": "^2.1.2",
|
|
53
50
|
"zod": "^4.3.6"
|
|
54
51
|
},
|
|
55
|
-
"devDependencies": {
|
|
56
|
-
"@types/react": "^19.2.13"
|
|
57
|
-
},
|
|
58
52
|
"engines": {
|
|
59
53
|
"bun": ">=1.0.0"
|
|
60
54
|
},
|
package/runner.ts
ADDED
|
@@ -0,0 +1,308 @@
|
|
|
1
|
+
/* eslint-disable max-statements */
|
|
2
|
+
import { bootstrapPython } from '~/bootstrap'
|
|
3
|
+
import { getPaths } from '~/paths'
|
|
4
|
+
import { runPreflight } from '~/preflight'
|
|
5
|
+
import {
|
|
6
|
+
appendErrorLog,
|
|
7
|
+
appendPipelineLog,
|
|
8
|
+
buildDataset,
|
|
9
|
+
clearErrorLog,
|
|
10
|
+
clearPipelineLog,
|
|
11
|
+
fetchStepData,
|
|
12
|
+
getOcrStats,
|
|
13
|
+
runClassify,
|
|
14
|
+
runEnhancePass,
|
|
15
|
+
spawnCommand,
|
|
16
|
+
writeNativeFileList
|
|
17
|
+
} from '~/tui-data'
|
|
18
|
+
|
|
19
|
+
const stripAnsi = (s: string): string => s.replaceAll(new RegExp(`${String.fromCodePoint(0x1b)}\\[[0-9;]*m`, 'gu'), '')
|
|
20
|
+
|
|
21
|
+
const ts = (): string => new Date().toISOString().slice(11, 19)
|
|
22
|
+
|
|
23
|
+
const log = (msg: string): void => {
|
|
24
|
+
process.stdout.write(`${ts()} ${msg}\n`)
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
const formatDuration = (seconds: number): string => {
|
|
28
|
+
const h = Math.floor(seconds / 3600)
|
|
29
|
+
const m = Math.floor((seconds % 3600) / 60)
|
|
30
|
+
const s = Math.floor(seconds % 60)
|
|
31
|
+
if (h > 0) return `${h}h${m.toString().padStart(2, '0')}m`
|
|
32
|
+
if (m > 0) return `${m}m${s.toString().padStart(2, '0')}s`
|
|
33
|
+
return `${s}s`
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
// Line terminator: CRLF, LF, or a bare CR.
const LINE_SPLIT = /\r?\n|\r/u
|
|
37
|
+
const ERROR_PATTERN = /\b(?:ERROR|Error:|Failed:|failed|FAILED|\u2716|exception|traceback)/iu
|
|
38
|
+
// oxlint-disable-next-line promise/prefer-await-to-then
|
|
39
|
+
const noop = (): Promise<void> => Promise.resolve() // eslint-disable-line @typescript-eslint/promise-function-async
|
|
40
|
+
|
|
41
|
+
const readStream = async (stream: ReadableStream<Uint8Array>, onLine: (line: string) => void): Promise<void> => {
|
|
42
|
+
const reader = stream.getReader()
|
|
43
|
+
const decoder = new TextDecoder()
|
|
44
|
+
let buffer = ''
|
|
45
|
+
try {
|
|
46
|
+
for (;;) {
|
|
47
|
+
/** biome-ignore lint/performance/noAwaitInLoops: streaming reads */
|
|
48
|
+
const { done, value } = await reader.read() // eslint-disable-line no-await-in-loop
|
|
49
|
+
if (done) break
|
|
50
|
+
buffer += decoder.decode(value, { stream: true })
|
|
51
|
+
const parts = buffer.split(LINE_SPLIT)
|
|
52
|
+
buffer = parts.pop() ?? ''
|
|
53
|
+
for (const part of parts) if (part.trim() !== '') onLine(part)
|
|
54
|
+
}
|
|
55
|
+
if (buffer.trim() !== '') onLine(buffer)
|
|
56
|
+
} finally {
|
|
57
|
+
reader.releaseLock()
|
|
58
|
+
}
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
// Delay in milliseconds between progress summary lines printed by the progress ticker.
const PROGRESS_INTERVAL_MS = 5000
|
|
62
|
+
|
|
63
|
+
const runConvertStep = async (prefix: string): Promise<number> => {
|
|
64
|
+
await clearPipelineLog()
|
|
65
|
+
await writeNativeFileList()
|
|
66
|
+
const spawned = spawnCommand('pipeline')
|
|
67
|
+
if (!spawned) return -1
|
|
68
|
+
|
|
69
|
+
const onLine = (line: string): void => {
|
|
70
|
+
const clean = stripAnsi(line)
|
|
71
|
+
log(`${prefix}${clean}`)
|
|
72
|
+
appendPipelineLog(clean)
|
|
73
|
+
if (ERROR_PATTERN.test(clean)) appendErrorLog('pipeline', clean)
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
const { stderr, stdout } = spawned.proc
|
|
77
|
+
const stdoutP = stdout instanceof ReadableStream ? readStream(stdout, onLine) : noop()
|
|
78
|
+
const stderrP = stderr instanceof ReadableStream ? readStream(stderr, onLine) : noop()
|
|
79
|
+
await Promise.all([stdoutP, stderrP])
|
|
80
|
+
return spawned.proc.exited
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
const runOcrStep = async (prefix: string): Promise<number> => {
|
|
84
|
+
const stats = await getOcrStats()
|
|
85
|
+
if (stats.total === 0) {
|
|
86
|
+
log(`${prefix}No scanned/mixed PDFs to OCR.`)
|
|
87
|
+
return 0
|
|
88
|
+
}
|
|
89
|
+
if (stats.remaining === 0) {
|
|
90
|
+
log(`${prefix}All ${stats.total} files already OCR'd.`)
|
|
91
|
+
return 0
|
|
92
|
+
}
|
|
93
|
+
log(`${prefix}OCR ${stats.remaining} remaining of ${stats.total} total`)
|
|
94
|
+
|
|
95
|
+
const spawned = spawnCommand('ocr')
|
|
96
|
+
if (!spawned) return -1
|
|
97
|
+
|
|
98
|
+
const onLine = (line: string): void => {
|
|
99
|
+
const clean = stripAnsi(line)
|
|
100
|
+
log(`${prefix}${clean}`)
|
|
101
|
+
if (ERROR_PATTERN.test(clean)) appendErrorLog('ocr', clean)
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
const { stderr, stdout } = spawned.proc
|
|
105
|
+
const stdoutP = stdout instanceof ReadableStream ? readStream(stdout, onLine) : noop()
|
|
106
|
+
const stderrP = stderr instanceof ReadableStream ? readStream(stderr, onLine) : noop()
|
|
107
|
+
await Promise.all([stdoutP, stderrP])
|
|
108
|
+
return spawned.proc.exited
|
|
109
|
+
}
|
|
110
|
+
|
|
111
|
+
// Delay in milliseconds between background enhance passes started by the enhance poller.
const ENHANCE_POLL_MS = 2000
|
|
112
|
+
|
|
113
|
+
const logProgress = async (startTime: number): Promise<void> => {
|
|
114
|
+
const d = await fetchStepData()
|
|
115
|
+
const parts: string[] = []
|
|
116
|
+
if (d.pipeline.total > 0) parts.push(`Convert ${d.pipeline.done}/${d.pipeline.total}`)
|
|
117
|
+
if (d.ocr.total > 0) parts.push(`OCR ${d.ocr.done}/${d.ocr.total}`)
|
|
118
|
+
if (d.enhance.total > 0) parts.push(`Enhance ${d.enhance.done}/${d.enhance.total}`)
|
|
119
|
+
if (parts.length > 0) {
|
|
120
|
+
const elapsed = formatDuration((Date.now() - startTime) / 1000)
|
|
121
|
+
log(`── ${parts.join(' · ')} · ${elapsed} elapsed ──`)
|
|
122
|
+
}
|
|
123
|
+
}
|
|
124
|
+
|
|
125
|
+
const startEnhancePoller = (
|
|
126
|
+
onFile: (name: string) => void
|
|
127
|
+
): { stop: () => Promise<{ enhanced: number; failed: number }> } => {
|
|
128
|
+
const done = new Set<string>()
|
|
129
|
+
const interval = setInterval(() => {
|
|
130
|
+
// oxlint-disable-next-line promise/prefer-await-to-then
|
|
131
|
+
runEnhancePass(done, onFile).catch(noop)
|
|
132
|
+
}, ENHANCE_POLL_MS)
|
|
133
|
+
|
|
134
|
+
const stop = async (): Promise<{ enhanced: number; failed: number }> => {
|
|
135
|
+
clearInterval(interval)
|
|
136
|
+
return runEnhancePass(done, onFile)
|
|
137
|
+
}
|
|
138
|
+
return { stop }
|
|
139
|
+
}
|
|
140
|
+
|
|
141
|
+
const startProgressTicker = (startTime: number): ReturnType<typeof setInterval> =>
|
|
142
|
+
// eslint-disable-line @typescript-eslint/promise-function-async
|
|
143
|
+
setInterval(() => {
|
|
144
|
+
// oxlint-disable-next-line promise/prefer-await-to-then
|
|
145
|
+
logProgress(startTime).catch(noop)
|
|
146
|
+
}, PROGRESS_INTERVAL_MS)
|
|
147
|
+
|
|
148
|
+
const runBootstrap = async (): Promise<void> => {
|
|
149
|
+
log('Checking Python environment...')
|
|
150
|
+
const ok = await bootstrapPython({
|
|
151
|
+
onDone: () => log('Python environment ready.'),
|
|
152
|
+
onStep: (msg: string) => log(` ${msg}`)
|
|
153
|
+
})
|
|
154
|
+
if (!ok) {
|
|
155
|
+
log('FATAL: Python bootstrap failed. Install uv and try again.')
|
|
156
|
+
process.exit(1)
|
|
157
|
+
}
|
|
158
|
+
}
|
|
159
|
+
|
|
160
|
+
const runPreflightCheck = async (): Promise<void> => {
|
|
161
|
+
const preflight = await runPreflight()
|
|
162
|
+
if (preflight.errors.length > 0) {
|
|
163
|
+
for (const e of preflight.errors) log(`ERROR: ${e}`)
|
|
164
|
+
log('Fix the errors above and restart.')
|
|
165
|
+
process.exit(1)
|
|
166
|
+
}
|
|
167
|
+
for (const w of preflight.warnings) log(`WARN: ${w}`)
|
|
168
|
+
}
|
|
169
|
+
|
|
170
|
+
const runClassifyStep = async (): Promise<void> => {
|
|
171
|
+
const data = await fetchStepData()
|
|
172
|
+
const done = data.classify.done >= data.classify.total && data.classify.total > 0
|
|
173
|
+
if (done) {
|
|
174
|
+
log('Step 1/3: Classify \u2014 already done')
|
|
175
|
+
if (data.classify.details) for (const d of data.classify.details) log(` ${d}`)
|
|
176
|
+
return
|
|
177
|
+
}
|
|
178
|
+
log('Step 1/3: Classify PDFs')
|
|
179
|
+
const t = Date.now()
|
|
180
|
+
await runClassify(p => {
|
|
181
|
+
log(` ${p.done}/${p.total} ${p.file} → ${p.category}`)
|
|
182
|
+
})
|
|
183
|
+
const d = await fetchStepData()
|
|
184
|
+
if (d.classify.details) for (const det of d.classify.details) log(` ${det}`)
|
|
185
|
+
log(` Done in ${formatDuration((Date.now() - t) / 1000)}`)
|
|
186
|
+
}
|
|
187
|
+
|
|
188
|
+
const runParallelConvertOcr = async (): Promise<void> => {
|
|
189
|
+
log(' Convert + OCR (parallel)')
|
|
190
|
+
const ocrPromise = runOcrStep('[OCR] ')
|
|
191
|
+
const pipelineCode = await runConvertStep('[CONVERT] ')
|
|
192
|
+
if (pipelineCode !== 0) log(` Convert exited with code ${pipelineCode}`)
|
|
193
|
+
log(' Convert done, waiting for OCR...')
|
|
194
|
+
const ocrCode = await ocrPromise
|
|
195
|
+
if (ocrCode !== 0) log(` OCR exited with code ${ocrCode}`)
|
|
196
|
+
}
|
|
197
|
+
|
|
198
|
+
const runSequentialOcr = async (ocrNeeded: boolean, ocrDone: boolean): Promise<void> => {
|
|
199
|
+
if (ocrNeeded && !ocrDone) {
|
|
200
|
+
log(' OCR scanned PDFs')
|
|
201
|
+
const code = await runOcrStep(' ')
|
|
202
|
+
if (code !== 0) log(` OCR exited with code ${code}`)
|
|
203
|
+
} else if (ocrNeeded) log(' OCR \u2014 already done')
|
|
204
|
+
else log(' OCR \u2014 no scanned PDFs')
|
|
205
|
+
}
|
|
206
|
+
|
|
207
|
+
// eslint-disable-next-line max-statements
|
|
208
|
+
const runConvertOcrEnhance = async (startTime: number): Promise<void> => {
|
|
209
|
+
const data = await fetchStepData()
|
|
210
|
+
const pipelineDone = data.pipeline.done >= data.pipeline.total && data.pipeline.total > 0
|
|
211
|
+
const ocrNeeded = data.ocr.total > 0
|
|
212
|
+
const ocrDone = data.ocr.done >= data.ocr.total && data.ocr.total > 0
|
|
213
|
+
const allDone = pipelineDone && (!ocrNeeded || ocrDone)
|
|
214
|
+
|
|
215
|
+
if (allDone && data.enhance.done >= data.enhance.total && data.enhance.total > 0) {
|
|
216
|
+
log('Step 2/3: Convert + OCR + Enhance \u2014 already done')
|
|
217
|
+
return
|
|
218
|
+
}
|
|
219
|
+
|
|
220
|
+
log('Step 2/3: Convert + OCR + Enhance')
|
|
221
|
+
const ticker = startProgressTicker(startTime)
|
|
222
|
+
const enhancer = startEnhancePoller(name => log(`[ENHANCE] \u2713 ${name}`))
|
|
223
|
+
const t = Date.now()
|
|
224
|
+
|
|
225
|
+
if (!allDone) {
|
|
226
|
+
const parallel = !pipelineDone && ocrNeeded && !ocrDone
|
|
227
|
+
if (parallel) await runParallelConvertOcr()
|
|
228
|
+
else {
|
|
229
|
+
if (pipelineDone) log(' Convert \u2014 already done')
|
|
230
|
+
else {
|
|
231
|
+
log(' Convert to Markdown')
|
|
232
|
+
const code = await runConvertStep(' ')
|
|
233
|
+
if (code !== 0) log(` Convert exited with code ${code}`)
|
|
234
|
+
}
|
|
235
|
+
await runSequentialOcr(ocrNeeded, ocrDone)
|
|
236
|
+
}
|
|
237
|
+
}
|
|
238
|
+
|
|
239
|
+
clearInterval(ticker)
|
|
240
|
+
const enhanceResult = await enhancer.stop()
|
|
241
|
+
log(` Enhanced: ${enhanceResult.enhanced}, Failed: ${enhanceResult.failed}`)
|
|
242
|
+
log(` Done in ${formatDuration((Date.now() - t) / 1000)}`)
|
|
243
|
+
}
|
|
244
|
+
|
|
245
|
+
const runDatasetStep = async (): Promise<{ duplicates: number; entries: number; skipped: number; totalChars: number }> => {
|
|
246
|
+
log('Step 3/3: Build Dataset')
|
|
247
|
+
const t = Date.now()
|
|
248
|
+
const result = await buildDataset({
|
|
249
|
+
onFileResult: p => {
|
|
250
|
+
const icon = p.status === 'added' ? '\u2713' : p.status === 'duplicate' ? '\u2261' : '\u2192'
|
|
251
|
+
const charStr = p.chars >= 1000 ? `${(p.chars / 1000).toFixed(1)}K` : `${p.chars}`
|
|
252
|
+
log(` ${p.done}/${p.total} ${icon} ${p.file} → ${p.status} (${charStr} chars)`)
|
|
253
|
+
},
|
|
254
|
+
onReadProgress: (done, total) => {
|
|
255
|
+
if (done % 100 === 0 || done === total) log(` Reading ${done}/${total} files...`)
|
|
256
|
+
}
|
|
257
|
+
})
|
|
258
|
+
log(` Entries: ${result.entries}, Skipped: ${result.skipped}, Duplicates: ${result.duplicates}`)
|
|
259
|
+
log(` Total chars: ${result.totalChars.toLocaleString()}`)
|
|
260
|
+
log(` Done in ${formatDuration((Date.now() - t) / 1000)}`)
|
|
261
|
+
return result
|
|
262
|
+
}
|
|
263
|
+
|
|
264
|
+
const printSummary = async (
|
|
265
|
+
startTime: number,
|
|
266
|
+
dsResult: { duplicates: number; entries: number; skipped: number; totalChars: number }
|
|
267
|
+
): Promise<void> => {
|
|
268
|
+
const data = await fetchStepData()
|
|
269
|
+
const elapsed = formatDuration((Date.now() - startTime) / 1000)
|
|
270
|
+
const sep = '\u2550'.repeat(45)
|
|
271
|
+
log('')
|
|
272
|
+
log(sep)
|
|
273
|
+
log(' Pipeline Complete')
|
|
274
|
+
log(sep)
|
|
275
|
+
log(` Classified: ${data.classify.done} PDFs`)
|
|
276
|
+
if (data.classify.details) for (const d of data.classify.details) log(` ${d}`)
|
|
277
|
+
log(` Converted: ${data.pipeline.done} files`)
|
|
278
|
+
log(` OCR: ${data.ocr.done} files`)
|
|
279
|
+
log(` Enhanced: ${data.enhance.done} files`)
|
|
280
|
+
log(` Dataset: ${dsResult.entries} entries, ${dsResult.totalChars.toLocaleString()} chars`)
|
|
281
|
+
if (dsResult.duplicates > 0) log(` Deduplicated: ${dsResult.duplicates}`)
|
|
282
|
+
if (dsResult.skipped > 0) log(` Skipped: ${dsResult.skipped} (below min length)`)
|
|
283
|
+
log(` Duration: ${elapsed}`)
|
|
284
|
+
log(` Output: ${getPaths().outputDir}`)
|
|
285
|
+
log(sep)
|
|
286
|
+
}
|
|
287
|
+
|
|
288
|
+
const run = async (): Promise<void> => {
|
|
289
|
+
const startTime = Date.now()
|
|
290
|
+
|
|
291
|
+
await runBootstrap()
|
|
292
|
+
await runPreflightCheck()
|
|
293
|
+
await clearErrorLog()
|
|
294
|
+
|
|
295
|
+
log('')
|
|
296
|
+
await runClassifyStep()
|
|
297
|
+
log('')
|
|
298
|
+
await runConvertOcrEnhance(startTime)
|
|
299
|
+
log('')
|
|
300
|
+
const dsResult = await runDatasetStep()
|
|
301
|
+
|
|
302
|
+
await printSummary(startTime, dsResult)
|
|
303
|
+
|
|
304
|
+
process.stdout.write('\u0007')
|
|
305
|
+
Bun.spawn(['osascript', '-e', 'display notification "Pipeline complete" with title "anymd"'])
|
|
306
|
+
}
|
|
307
|
+
|
|
308
|
+
export { run }
|
package/scripts/pdf-to-md.py
CHANGED
|
@@ -6,14 +6,30 @@ from pathlib import Path
|
|
|
6
6
|
from marker.converters.pdf import PdfConverter
|
|
7
7
|
from marker.models import create_model_dict
|
|
8
8
|
from marker.output import text_from_rendered
|
|
9
|
+
from markitdown import MarkItDown
|
|
9
10
|
|
|
10
11
|
MIN_ARGS = 2
|
|
12
|
+
MIN_FALLBACK_CHARS = 10
|
|
13
|
+
|
|
14
|
+
_mid = MarkItDown()
|
|
11
15
|
|
|
12
16
|
|
|
13
17
|
def _emit(data: dict[str, object]) -> None:
|
|
14
18
|
print(json.dumps(data), flush=True)
|
|
15
19
|
|
|
16
20
|
|
|
21
|
+
def _markitdown_fallback(pdf_path: str) -> str | None:
|
|
22
|
+
try:
|
|
23
|
+
result = _mid.convert(pdf_path)
|
|
24
|
+
text = result.text_content.strip()
|
|
25
|
+
except Exception: # noqa: BLE001
|
|
26
|
+
return None
|
|
27
|
+
else:
|
|
28
|
+
if len(text) < MIN_FALLBACK_CHARS:
|
|
29
|
+
return None
|
|
30
|
+
return text
|
|
31
|
+
|
|
32
|
+
|
|
17
33
|
def _convert_one(converter: PdfConverter, pdf_path: str, out_path: str, index: int, total: int) -> None:
|
|
18
34
|
t1 = time.time()
|
|
19
35
|
try:
|
|
@@ -29,15 +45,28 @@ def _convert_one(converter: PdfConverter, pdf_path: str, out_path: str, index: i
|
|
|
29
45
|
'seconds': round(time.time() - t1, 1),
|
|
30
46
|
'chars': len(md),
|
|
31
47
|
})
|
|
32
|
-
except Exception as
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
'
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
48
|
+
except Exception as marker_exc: # noqa: BLE001
|
|
49
|
+
fallback_md = _markitdown_fallback(pdf_path)
|
|
50
|
+
if fallback_md:
|
|
51
|
+
Path(out_path).parent.mkdir(parents=True, exist_ok=True)
|
|
52
|
+
Path(out_path).write_text(fallback_md, encoding='utf-8')
|
|
53
|
+
_emit({
|
|
54
|
+
'type': 'converted',
|
|
55
|
+
'index': index,
|
|
56
|
+
'total': total,
|
|
57
|
+
'file': Path(pdf_path).name,
|
|
58
|
+
'seconds': round(time.time() - t1, 1),
|
|
59
|
+
'chars': len(fallback_md),
|
|
60
|
+
})
|
|
61
|
+
else:
|
|
62
|
+
_emit({
|
|
63
|
+
'type': 'error',
|
|
64
|
+
'index': index,
|
|
65
|
+
'total': total,
|
|
66
|
+
'file': Path(pdf_path).name,
|
|
67
|
+
'seconds': round(time.time() - t1, 1),
|
|
68
|
+
'error': str(marker_exc),
|
|
69
|
+
})
|
|
41
70
|
|
|
42
71
|
|
|
43
72
|
def main() -> None:
|
package/src/bootstrap.ts
CHANGED
|
@@ -10,7 +10,7 @@ interface BootstrapCallbacks {
|
|
|
10
10
|
}
|
|
11
11
|
|
|
12
12
|
const REQUIRED_PACKAGES = ['marker', 'markitdown', 'mammoth', 'mlx_vlm', 'pypdfium2', 'torchvision']
|
|
13
|
-
const PIP_PACKAGES = ['marker-pdf', 'markitdown[docx]', 'mlx-vlm', 'pypdfium2', 'torchvision']
|
|
13
|
+
const PIP_PACKAGES = ['marker-pdf', 'markitdown[docx,pdf]', 'mlx-vlm', 'pypdfium2', 'torchvision']
|
|
14
14
|
const CHANDRA_MODEL_ID = 'mlx-community/chandra-8bit'
|
|
15
15
|
|
|
16
16
|
const checkImportable = async (py: string, pkg: string): Promise<boolean> => {
|
package/src/tui-data.ts
CHANGED
|
@@ -63,19 +63,6 @@ const countFiles = async (dir: string, ext: string): Promise<number> => {
|
|
|
63
63
|
}
|
|
64
64
|
}
|
|
65
65
|
|
|
66
|
-
const countOverlap = async (dirA: string, dirB: string, ext: string): Promise<number> => {
|
|
67
|
-
try {
|
|
68
|
-
const [entriesA, entriesB] = await Promise.all([readdir(dirA), readdir(dirB)])
|
|
69
|
-
const setB = new Set<string>()
|
|
70
|
-
for (const e of entriesB) if (e.endsWith(ext)) setB.add(e)
|
|
71
|
-
let count = 0
|
|
72
|
-
for (const e of entriesA) if (e.endsWith(ext) && setB.has(e)) count += 1
|
|
73
|
-
return count
|
|
74
|
-
} catch {
|
|
75
|
-
return 0
|
|
76
|
-
}
|
|
77
|
-
}
|
|
78
|
-
|
|
79
66
|
const readJson = async <T>(path: string): Promise<null | T> => {
|
|
80
67
|
try {
|
|
81
68
|
const text = await readFile(path, 'utf8')
|
|
@@ -382,6 +369,49 @@ const runEnhanceOcr = async (
|
|
|
382
369
|
return { enhanced, failed, skipped: skippedFiles.length }
|
|
383
370
|
}
|
|
384
371
|
|
|
372
|
+
const gatherEnhanceCandidates = async (exclude: Set<string>): Promise<{ name: string; srcPath: string }[]> => {
|
|
373
|
+
const p = getPaths()
|
|
374
|
+
const candidates: { name: string; srcPath: string }[] = []
|
|
375
|
+
for (const srcDir of [p.rawMd, p.ocrRaw])
|
|
376
|
+
try {
|
|
377
|
+
/** biome-ignore lint/performance/noAwaitInLoops: iterating 2 dirs */
|
|
378
|
+
const entries = await readdir(srcDir) // eslint-disable-line no-await-in-loop
|
|
379
|
+
for (const f of entries)
|
|
380
|
+
if (f.endsWith('.md') && !exclude.has(f)) candidates.push({ name: f, srcPath: join(srcDir, f) })
|
|
381
|
+
} catch {
|
|
382
|
+
/* Empty */
|
|
383
|
+
}
|
|
384
|
+
|
|
385
|
+
return candidates
|
|
386
|
+
}
|
|
387
|
+
|
|
388
|
+
// eslint-disable-next-line max-statements
|
|
389
|
+
const runEnhancePass = async (
|
|
390
|
+
alreadyDone: Set<string>,
|
|
391
|
+
onFile?: (file: string) => void
|
|
392
|
+
): Promise<{ enhanced: number; failed: number }> => {
|
|
393
|
+
const { enhanceMarkdown } = await import('~/md-enhancer')
|
|
394
|
+
mkdirSync(getPaths().markdown, { recursive: true })
|
|
395
|
+
const candidates = await gatherEnhanceCandidates(alreadyDone)
|
|
396
|
+
let enhanced = 0
|
|
397
|
+
let failed = 0
|
|
398
|
+
for (const { name, srcPath } of candidates)
|
|
399
|
+
try {
|
|
400
|
+
/** biome-ignore lint/performance/noAwaitInLoops: sequential enhance */
|
|
401
|
+
const content = await readFile(srcPath, 'utf8') // eslint-disable-line no-await-in-loop
|
|
402
|
+
const result = enhanceMarkdown(content)
|
|
403
|
+
/** biome-ignore lint/performance/noAwaitInLoops: sequential enhance */
|
|
404
|
+
await writeFile(join(getPaths().markdown, name), result, 'utf8') // eslint-disable-line no-await-in-loop
|
|
405
|
+
alreadyDone.add(name)
|
|
406
|
+
enhanced += 1
|
|
407
|
+
onFile?.(name)
|
|
408
|
+
} catch {
|
|
409
|
+
failed += 1
|
|
410
|
+
}
|
|
411
|
+
|
|
412
|
+
return { enhanced, failed }
|
|
413
|
+
}
|
|
414
|
+
|
|
385
415
|
const spawnCommand = (key: CommandKey): null | { args: string[]; label: string; proc: ReturnType<typeof Bun.spawn> } => {
|
|
386
416
|
const p = getPaths()
|
|
387
417
|
const packageRoot = join(p.scriptsDir, '..')
|
|
@@ -466,7 +496,6 @@ interface StepCounts {
|
|
|
466
496
|
docCount: number
|
|
467
497
|
finalMdCount: number
|
|
468
498
|
ocrDone: number
|
|
469
|
-
ocrEnhancedCount: number
|
|
470
499
|
ocrProgress: null | OcrProgress
|
|
471
500
|
ocrTotal: number
|
|
472
501
|
pdfCount: number
|
|
@@ -491,9 +520,8 @@ const buildStepResults = (c: StepCounts): AllStepsData => {
|
|
|
491
520
|
total: c.finalMdCount
|
|
492
521
|
},
|
|
493
522
|
enhance: {
|
|
494
|
-
done: c.
|
|
495
|
-
|
|
496
|
-
total: c.ocrDone
|
|
523
|
+
done: c.finalMdCount,
|
|
524
|
+
total: c.rawMdCount + c.ocrDone
|
|
497
525
|
},
|
|
498
526
|
ocr: {
|
|
499
527
|
done: c.ocrDone,
|
|
@@ -512,17 +540,15 @@ const buildStepResults = (c: StepCounts): AllStepsData => {
|
|
|
512
540
|
|
|
513
541
|
const fetchStepData = async (): Promise<AllStepsData> => {
|
|
514
542
|
const p = getPaths()
|
|
515
|
-
const [classification, ocrProgress, rawMdCount, ocrDone, finalMdCount,
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
countDatasetEntries()
|
|
525
|
-
])
|
|
543
|
+
const [classification, ocrProgress, rawMdCount, ocrDone, finalMdCount, dataCounts, datasetEntries] = await Promise.all([
|
|
544
|
+
readJson<Classification>(p.classification),
|
|
545
|
+
readJson<OcrProgress>(p.ocrProgress),
|
|
546
|
+
countFiles(p.rawMd, '.md'),
|
|
547
|
+
countFiles(p.ocrRaw, '.md'),
|
|
548
|
+
countFiles(p.markdown, '.md'),
|
|
549
|
+
countDataFiles(),
|
|
550
|
+
countDatasetEntries()
|
|
551
|
+
])
|
|
526
552
|
|
|
527
553
|
const ocrTotal = classification ? classification.scanned + classification.mixed : 0
|
|
528
554
|
|
|
@@ -532,7 +558,6 @@ const fetchStepData = async (): Promise<AllStepsData> => {
|
|
|
532
558
|
docCount: dataCounts.docs,
|
|
533
559
|
finalMdCount,
|
|
534
560
|
ocrDone,
|
|
535
|
-
ocrEnhancedCount,
|
|
536
561
|
ocrProgress,
|
|
537
562
|
ocrTotal,
|
|
538
563
|
pdfCount: dataCounts.pdfs,
|
|
@@ -584,6 +609,7 @@ export {
|
|
|
584
609
|
readLogTail,
|
|
585
610
|
runClassify,
|
|
586
611
|
runEnhanceOcr,
|
|
612
|
+
runEnhancePass,
|
|
587
613
|
spawnCommand,
|
|
588
614
|
writeNativeFileList
|
|
589
615
|
}
|