anymd 0.0.0 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -2
- package/main.ts +17 -17
- package/package.json +1 -1
- package/scripts/batch-ocr.py +15 -5
- package/src/bootstrap.ts +54 -24
- package/src/preflight.ts +3 -5
- package/src/tui-data.ts +28 -2
package/README.md
CHANGED
|
@@ -50,16 +50,17 @@ Steps 2 and 3 run in parallel when both are needed.
|
|
|
50
50
|
|
|
51
51
|
- macOS with Apple Silicon (64GB recommended for OCR)
|
|
52
52
|
- [Bun](https://bun.sh) runtime
|
|
53
|
+
- [uv](https://docs.astral.sh/uv/) — `curl -LsSf https://astral.sh/uv/install.sh | sh`
|
|
53
54
|
- [poppler](https://poppler.freedesktop.org/) (`pdftotext`) — `brew install poppler`
|
|
54
55
|
- [LibreOffice](https://www.libreoffice.org/) (`soffice`) — optional, only for `.doc` files
|
|
55
|
-
- Python 3.13+ — venv and packages are auto-installed on first run
|
|
56
56
|
|
|
57
57
|
The TUI preflight check tells you exactly what's missing.
|
|
58
58
|
|
|
59
59
|
## Auto-Bootstrap
|
|
60
60
|
|
|
61
|
-
On first run, `anymd`
|
|
61
|
+
On first run, `anymd` uses `uv` to create `~/.cache/anymd/.venv` (Python 3.13) and installs:
|
|
62
62
|
- `marker-pdf` — PDF to markdown conversion
|
|
63
|
+
- `markitdown` — DOCX to markdown conversion
|
|
63
64
|
- `mlx-vlm` — Apple Silicon MLX inference for OCR
|
|
64
65
|
- `pypdfium2` — PDF page rendering
|
|
65
66
|
|
package/main.ts
CHANGED
|
@@ -6,30 +6,30 @@ import { getPaths, initPaths } from '~/paths'
|
|
|
6
6
|
import { runPipeline } from '~/pipeline'
|
|
7
7
|
import { logger } from '~/utils'
|
|
8
8
|
|
|
9
|
-
const
|
|
10
|
-
|
|
11
|
-
|
|
9
|
+
const getArg = (flag: string): string | undefined => {
|
|
10
|
+
const idx = process.argv.indexOf(flag)
|
|
11
|
+
return idx === -1 ? undefined : process.argv[idx + 1]
|
|
12
|
+
}
|
|
13
|
+
|
|
14
|
+
const fileListPath = getArg('--file-list')
|
|
15
|
+
const baseDir = getArg('--base-dir') ?? process.argv[2] ?? resolve('data')
|
|
16
|
+
const outputDir = getArg('--output-dir') ?? resolve('./output')
|
|
17
|
+
let inputDir = baseDir
|
|
12
18
|
let inputFiles: string[] | undefined
|
|
13
|
-
let baseDir = baseDirIdx === -1 ? '' : (process.argv[baseDirIdx + 1] ?? resolve('data'))
|
|
14
19
|
|
|
15
|
-
if (
|
|
16
|
-
const
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
.filter(l => l.length > 0)
|
|
23
|
-
inputDir = '.'
|
|
24
|
-
}
|
|
20
|
+
if (fileListPath) {
|
|
21
|
+
const content = await readFile(fileListPath, 'utf8')
|
|
22
|
+
inputFiles = content
|
|
23
|
+
.trim()
|
|
24
|
+
.split('\n')
|
|
25
|
+
.filter(l => l.length > 0)
|
|
26
|
+
inputDir = '.'
|
|
25
27
|
}
|
|
26
28
|
|
|
27
|
-
if (!baseDir) baseDir = inputDir
|
|
28
|
-
|
|
29
29
|
try {
|
|
30
30
|
getPaths()
|
|
31
31
|
} catch {
|
|
32
|
-
initPaths(resolve(baseDir), resolve(
|
|
32
|
+
initPaths(resolve(baseDir), resolve(outputDir))
|
|
33
33
|
}
|
|
34
34
|
|
|
35
35
|
const config = {
|
package/package.json
CHANGED
package/scripts/batch-ocr.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import gc
|
|
2
2
|
import json
|
|
3
|
+
import sys
|
|
3
4
|
import time
|
|
4
5
|
from dataclasses import dataclass, field
|
|
5
6
|
from pathlib import Path
|
|
@@ -8,11 +9,20 @@ import pypdfium2 as pdfium
|
|
|
8
9
|
import pypdfium2.raw as pdfium_c
|
|
9
10
|
from PIL import Image
|
|
10
11
|
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
12
|
+
|
|
13
|
+
def _get_arg(flag: str, default: str) -> str:
|
|
14
|
+
try:
|
|
15
|
+
idx = sys.argv.index(flag)
|
|
16
|
+
return sys.argv[idx + 1]
|
|
17
|
+
except (ValueError, IndexError):
|
|
18
|
+
return default
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
DATA_DIR = Path(_get_arg('--data-dir', 'data')).resolve()
|
|
22
|
+
DATA_FILE = Path(_get_arg('--classification', 'data/classification.json'))
|
|
23
|
+
OUTPUT_BASE = Path(_get_arg('--output-base', 'output/ocr-raw'))
|
|
24
|
+
STATUS_FILE = Path(_get_arg('--status-file', 'output/ocr-progress.json'))
|
|
25
|
+
LOG_FILE = Path(_get_arg('--log-file', 'output/ocr-log.txt'))
|
|
16
26
|
|
|
17
27
|
MODEL_ID = 'mlx-community/chandra-8bit'
|
|
18
28
|
IMAGE_DPI = 150
|
package/src/bootstrap.ts
CHANGED
|
@@ -1,4 +1,6 @@
|
|
|
1
1
|
import { existsSync } from 'node:fs'
|
|
2
|
+
import { homedir } from 'node:os'
|
|
3
|
+
import { join } from 'node:path'
|
|
2
4
|
|
|
3
5
|
import { getPaths } from '~/paths'
|
|
4
6
|
|
|
@@ -7,7 +9,8 @@ interface BootstrapCallbacks {
|
|
|
7
9
|
onStep: (message: string) => void
|
|
8
10
|
}
|
|
9
11
|
|
|
10
|
-
const REQUIRED_PACKAGES = ['marker', 'mlx_vlm', 'pypdfium2']
|
|
12
|
+
const REQUIRED_PACKAGES = ['marker', 'markitdown', 'mlx_vlm', 'pypdfium2']
|
|
13
|
+
const PIP_PACKAGES = ['marker-pdf', 'markitdown', 'mlx-vlm', 'pypdfium2']
|
|
11
14
|
|
|
12
15
|
const checkPackageImportable = async (pythonPath: string, pkg: string): Promise<boolean> => {
|
|
13
16
|
try {
|
|
@@ -32,32 +35,51 @@ const runCommand = async (args: string[]): Promise<{ ok: boolean; stderr: string
|
|
|
32
35
|
return { ok: code === 0, stderr: stderrText }
|
|
33
36
|
}
|
|
34
37
|
|
|
35
|
-
const
|
|
36
|
-
|
|
37
|
-
|
|
38
|
+
const UV_CANDIDATES = ['uv', '/opt/homebrew/bin/uv', join(homedir(), '.local', 'bin', 'uv')]
|
|
39
|
+
|
|
40
|
+
const findUv = async (): Promise<string | undefined> => {
|
|
41
|
+
const checks = await Promise.all(
|
|
42
|
+
UV_CANDIDATES.map(async bin => ({ bin, ok: (await runCommand([bin, '--version'])).ok }))
|
|
43
|
+
)
|
|
44
|
+
const found = checks.find(c => c.ok)
|
|
45
|
+
return found?.bin
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
interface VenvOpts {
|
|
49
|
+
cacheDir: string
|
|
50
|
+
cbs: BootstrapCallbacks
|
|
51
|
+
uv: string
|
|
52
|
+
venvDir: string
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
const createVenv = async (opts: VenvOpts): Promise<boolean> => {
|
|
56
|
+
opts.cbs.onStep('Creating Python virtual environment via uv...')
|
|
57
|
+
const mkdirResult = await runCommand(['mkdir', '-p', opts.cacheDir])
|
|
38
58
|
if (!mkdirResult.ok) return false
|
|
39
|
-
const venvResult = await runCommand(['
|
|
59
|
+
const venvResult = await runCommand([opts.uv, 'venv', '--python', '3.13', opts.venvDir])
|
|
40
60
|
if (!venvResult.ok) {
|
|
41
|
-
cbs.onStep(`Failed to create venv: ${venvResult.stderr}`)
|
|
61
|
+
opts.cbs.onStep(`Failed to create venv: ${venvResult.stderr}`)
|
|
42
62
|
return false
|
|
43
63
|
}
|
|
44
|
-
cbs.onStep('Virtual environment created.')
|
|
64
|
+
opts.cbs.onStep('Virtual environment created (Python 3.13).')
|
|
45
65
|
return true
|
|
46
66
|
}
|
|
47
67
|
|
|
48
|
-
const installPackages = async (venvDir: string, cbs: BootstrapCallbacks): Promise<boolean> => {
|
|
49
|
-
cbs.onStep(
|
|
50
|
-
const
|
|
51
|
-
const result = await runCommand([pipPath, 'install', 'marker-pdf', 'mlx-vlm', 'pypdfium2'])
|
|
68
|
+
const installPackages = async (venvDir: string, uv: string, cbs: BootstrapCallbacks): Promise<boolean> => {
|
|
69
|
+
cbs.onStep(`Installing ${PIP_PACKAGES.join(', ')}...`)
|
|
70
|
+
const result = await runCommand([uv, 'pip', 'install', '--python', `${venvDir}/bin/python`, ...PIP_PACKAGES])
|
|
52
71
|
if (!result.ok) {
|
|
53
|
-
cbs.onStep(`pip install failed: ${result.stderr}`)
|
|
72
|
+
cbs.onStep(`uv pip install failed: ${result.stderr}`)
|
|
54
73
|
return false
|
|
55
74
|
}
|
|
56
75
|
cbs.onStep('Packages installed successfully.')
|
|
57
76
|
return true
|
|
58
77
|
}
|
|
59
78
|
|
|
60
|
-
const ensureVenvExists = async (
|
|
79
|
+
const ensureVenvExists = async (
|
|
80
|
+
uv: string,
|
|
81
|
+
cbs: BootstrapCallbacks
|
|
82
|
+
): Promise<null | { skip: boolean; venvDir: string }> => {
|
|
61
83
|
const { cacheDir, venvPython } = getPaths()
|
|
62
84
|
const venvDir = `${cacheDir}/.venv`
|
|
63
85
|
if (existsSync(venvPython)) {
|
|
@@ -66,21 +88,29 @@ const ensureVenvExists = async (cbs: BootstrapCallbacks): Promise<null | { skip:
|
|
|
66
88
|
cbs.onStep('Some packages missing, installing...')
|
|
67
89
|
return { skip: false, venvDir }
|
|
68
90
|
}
|
|
69
|
-
const created = await createVenv(cacheDir,
|
|
91
|
+
const created = await createVenv({ cacheDir, cbs, uv, venvDir })
|
|
70
92
|
return created ? { skip: false, venvDir } : null
|
|
71
93
|
}
|
|
72
94
|
|
|
73
|
-
const
|
|
74
|
-
const
|
|
95
|
+
const requireUv = async (cbs: BootstrapCallbacks): Promise<string | undefined> => {
|
|
96
|
+
const uv = await findUv()
|
|
97
|
+
if (!uv) cbs.onStep('uv not found. Install it: curl -LsSf https://astral.sh/uv/install.sh | sh')
|
|
98
|
+
return uv
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
const setupAndInstall = async (uv: string, cbs: BootstrapCallbacks): Promise<boolean> => {
|
|
102
|
+
const result = await ensureVenvExists(uv, cbs)
|
|
75
103
|
if (!result) return false
|
|
76
|
-
if (result.skip)
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
104
|
+
if (result.skip) return true
|
|
105
|
+
return installPackages(result.venvDir, uv, cbs)
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
const bootstrapPython = async (cbs: BootstrapCallbacks): Promise<boolean> => {
|
|
109
|
+
const uv = await requireUv(cbs)
|
|
110
|
+
if (!uv) return false
|
|
111
|
+
const ok = await setupAndInstall(uv, cbs)
|
|
112
|
+
if (ok) cbs.onDone()
|
|
113
|
+
return ok
|
|
84
114
|
}
|
|
85
115
|
|
|
86
116
|
export { bootstrapPython }
|
package/src/preflight.ts
CHANGED
|
@@ -37,7 +37,7 @@ const checkPythonPackage = async (pythonPath: string, pkg: string): Promise<bool
|
|
|
37
37
|
}
|
|
38
38
|
|
|
39
39
|
const checkVenvPackages = async (venvPython: string): Promise<string[]> => {
|
|
40
|
-
const requiredPackages = ['marker', 'mlx_vlm', 'pypdfium2']
|
|
40
|
+
const requiredPackages = ['marker', 'markitdown', 'mlx_vlm', 'pypdfium2']
|
|
41
41
|
const results = await Promise.all(
|
|
42
42
|
requiredPackages.map(async pkg => ({ missing: !(await checkPythonPackage(venvPython, pkg)), pkg }))
|
|
43
43
|
)
|
|
@@ -67,7 +67,7 @@ const collectToolIssues = (
|
|
|
67
67
|
if (!tools.hasSoffice) warnings.push('soffice not found \u2014 install LibreOffice for .doc/.docx support')
|
|
68
68
|
if (!tools.hasVenv)
|
|
69
69
|
errors.push(
|
|
70
|
-
`Python venv not found at ${venvPython} \u2014
|
|
70
|
+
`Python venv not found at ${venvPython} \u2014 install uv (curl -LsSf https://astral.sh/uv/install.sh | sh) and re-run`
|
|
71
71
|
)
|
|
72
72
|
if (!checkFileExists(getPaths().dataDir)) warnings.push('No data directory found \u2014 create it and add documents')
|
|
73
73
|
return { errors, warnings }
|
|
@@ -81,9 +81,7 @@ const runPreflight = async (): Promise<PreflightResult> => {
|
|
|
81
81
|
if (tools.hasVenv) {
|
|
82
82
|
const missing = await checkVenvPackages(venvPython)
|
|
83
83
|
if (missing.length > 0)
|
|
84
|
-
warnings.push(
|
|
85
|
-
`Python packages missing: ${missing.join(', ')} \u2014 run: .venv/bin/pip install ${missing.join(' ')}`
|
|
86
|
-
)
|
|
84
|
+
warnings.push(`Python packages missing: ${missing.join(', ')} \u2014 re-run anymd to auto-install`)
|
|
87
85
|
}
|
|
88
86
|
|
|
89
87
|
return { errors, warnings }
|
package/src/tui-data.ts
CHANGED
|
@@ -387,9 +387,35 @@ const spawnCommand = (key: CommandKey): null | { args: string[]; label: string;
|
|
|
387
387
|
classify: null,
|
|
388
388
|
dataset: null,
|
|
389
389
|
enhance: null,
|
|
390
|
-
ocr: {
|
|
390
|
+
ocr: {
|
|
391
|
+
args: [
|
|
392
|
+
p.venvPython,
|
|
393
|
+
join(p.scriptsDir, 'batch-ocr.py'),
|
|
394
|
+
'--data-dir',
|
|
395
|
+
p.dataDir,
|
|
396
|
+
'--classification',
|
|
397
|
+
p.classification,
|
|
398
|
+
'--output-base',
|
|
399
|
+
p.ocrRaw,
|
|
400
|
+
'--status-file',
|
|
401
|
+
p.ocrProgress,
|
|
402
|
+
'--log-file',
|
|
403
|
+
p.ocrLog
|
|
404
|
+
],
|
|
405
|
+
label: 'Chandra OCR'
|
|
406
|
+
},
|
|
391
407
|
pipeline: {
|
|
392
|
-
args: [
|
|
408
|
+
args: [
|
|
409
|
+
'bun',
|
|
410
|
+
'run',
|
|
411
|
+
join(packageRoot, 'main.ts'),
|
|
412
|
+
'--file-list',
|
|
413
|
+
p.nativeFileList,
|
|
414
|
+
'--base-dir',
|
|
415
|
+
p.dataDir,
|
|
416
|
+
'--output-dir',
|
|
417
|
+
p.outputDir
|
|
418
|
+
],
|
|
393
419
|
label: 'Doc/PDF \u2192 Markdown pipeline'
|
|
394
420
|
}
|
|
395
421
|
}
|