anymd 0.0.0 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -50,16 +50,17 @@ Steps 2 and 3 run in parallel when both are needed.
50
50
 
51
51
  - macOS with Apple Silicon (64GB recommended for OCR)
52
52
  - [Bun](https://bun.sh) runtime
53
+ - [uv](https://docs.astral.sh/uv/) — `curl -LsSf https://astral.sh/uv/install.sh | sh`
53
54
  - [poppler](https://poppler.freedesktop.org/) (`pdftotext`) — `brew install poppler`
54
55
  - [LibreOffice](https://www.libreoffice.org/) (`soffice`) — optional, only for `.doc` files
55
- - Python 3.13+ — venv and packages are auto-installed on first run
56
56
 
57
57
  The TUI preflight check tells you exactly what's missing.
58
58
 
59
59
  ## Auto-Bootstrap
60
60
 
61
- On first run, `anymd` creates `~/.cache/anymd/.venv` and installs:
61
+ On first run, `anymd` uses `uv` to create `~/.cache/anymd/.venv` (Python 3.13) and installs:
62
62
  - `marker-pdf` — PDF to markdown conversion
63
+ - `markitdown` — DOCX to markdown conversion
63
64
  - `mlx-vlm` — Apple Silicon MLX inference for OCR
64
65
  - `pypdfium2` — PDF page rendering
65
66
 
package/main.ts CHANGED
@@ -6,30 +6,30 @@ import { getPaths, initPaths } from '~/paths'
6
6
  import { runPipeline } from '~/pipeline'
7
7
  import { logger } from '~/utils'
8
8
 
9
- const fileListIdx = process.argv.indexOf('--file-list'),
10
- baseDirIdx = process.argv.indexOf('--base-dir')
11
- let inputDir = process.argv[2] ?? './data'
9
+ const getArg = (flag: string): string | undefined => {
10
+ const idx = process.argv.indexOf(flag)
11
+ return idx === -1 ? undefined : process.argv[idx + 1]
12
+ }
13
+
14
+ const fileListPath = getArg('--file-list')
15
+ const baseDir = getArg('--base-dir') ?? process.argv[2] ?? resolve('data')
16
+ const outputDir = getArg('--output-dir') ?? resolve('./output')
17
+ let inputDir = baseDir
12
18
  let inputFiles: string[] | undefined
13
- let baseDir = baseDirIdx === -1 ? '' : (process.argv[baseDirIdx + 1] ?? resolve('data'))
14
19
 
15
- if (fileListIdx !== -1) {
16
- const listPath = process.argv[fileListIdx + 1]
17
- if (listPath) {
18
- const content = await readFile(listPath, 'utf8')
19
- inputFiles = content
20
- .trim()
21
- .split('\n')
22
- .filter(l => l.length > 0)
23
- inputDir = '.'
24
- }
20
+ if (fileListPath) {
21
+ const content = await readFile(fileListPath, 'utf8')
22
+ inputFiles = content
23
+ .trim()
24
+ .split('\n')
25
+ .filter(l => l.length > 0)
26
+ inputDir = '.'
25
27
  }
26
28
 
27
- if (!baseDir) baseDir = inputDir
28
-
29
29
  try {
30
30
  getPaths()
31
31
  } catch {
32
- initPaths(resolve(baseDir), resolve('./output'))
32
+ initPaths(resolve(baseDir), resolve(outputDir))
33
33
  }
34
34
 
35
35
  const config = {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "anymd",
3
- "version": "0.0.0",
3
+ "version": "0.0.2",
4
4
  "description": "Convert any document (PDF, DOC, DOCX) to clean Markdown for RAG",
5
5
  "keywords": [
6
6
  "markdown",
@@ -1,5 +1,6 @@
1
1
  import gc
2
2
  import json
3
+ import sys
3
4
  import time
4
5
  from dataclasses import dataclass, field
5
6
  from pathlib import Path
@@ -8,11 +9,20 @@ import pypdfium2 as pdfium
8
9
  import pypdfium2.raw as pdfium_c
9
10
  from PIL import Image
10
11
 
11
- DATA_DIR = Path('data').resolve()
12
- DATA_FILE = Path('data/classification.json')
13
- OUTPUT_BASE = Path('output/ocr-raw')
14
- STATUS_FILE = Path('output/ocr-progress.json')
15
- LOG_FILE = Path('output/ocr-log.txt')
12
+
13
+ def _get_arg(flag: str, default: str) -> str:
14
+ try:
15
+ idx = sys.argv.index(flag)
16
+ return sys.argv[idx + 1]
17
+ except (ValueError, IndexError):
18
+ return default
19
+
20
+
21
+ DATA_DIR = Path(_get_arg('--data-dir', 'data')).resolve()
22
+ DATA_FILE = Path(_get_arg('--classification', 'data/classification.json'))
23
+ OUTPUT_BASE = Path(_get_arg('--output-base', 'output/ocr-raw'))
24
+ STATUS_FILE = Path(_get_arg('--status-file', 'output/ocr-progress.json'))
25
+ LOG_FILE = Path(_get_arg('--log-file', 'output/ocr-log.txt'))
16
26
 
17
27
  MODEL_ID = 'mlx-community/chandra-8bit'
18
28
  IMAGE_DPI = 150
package/src/bootstrap.ts CHANGED
@@ -1,4 +1,6 @@
1
1
  import { existsSync } from 'node:fs'
2
+ import { homedir } from 'node:os'
3
+ import { join } from 'node:path'
2
4
 
3
5
  import { getPaths } from '~/paths'
4
6
 
@@ -7,7 +9,8 @@ interface BootstrapCallbacks {
7
9
  onStep: (message: string) => void
8
10
  }
9
11
 
10
- const REQUIRED_PACKAGES = ['marker', 'mlx_vlm', 'pypdfium2']
12
+ const REQUIRED_PACKAGES = ['marker', 'markitdown', 'mlx_vlm', 'pypdfium2']
13
+ const PIP_PACKAGES = ['marker-pdf', 'markitdown', 'mlx-vlm', 'pypdfium2']
11
14
 
12
15
  const checkPackageImportable = async (pythonPath: string, pkg: string): Promise<boolean> => {
13
16
  try {
@@ -32,32 +35,51 @@ const runCommand = async (args: string[]): Promise<{ ok: boolean; stderr: string
32
35
  return { ok: code === 0, stderr: stderrText }
33
36
  }
34
37
 
35
- const createVenv = async (cacheDir: string, venvDir: string, cbs: BootstrapCallbacks): Promise<boolean> => {
36
- cbs.onStep('Creating Python virtual environment...')
37
- const mkdirResult = await runCommand(['mkdir', '-p', cacheDir])
38
+ const UV_CANDIDATES = ['uv', '/opt/homebrew/bin/uv', join(homedir(), '.local', 'bin', 'uv')]
39
+
40
+ const findUv = async (): Promise<string | undefined> => {
41
+ const checks = await Promise.all(
42
+ UV_CANDIDATES.map(async bin => ({ bin, ok: (await runCommand([bin, '--version'])).ok }))
43
+ )
44
+ const found = checks.find(c => c.ok)
45
+ return found?.bin
46
+ }
47
+
48
+ interface VenvOpts {
49
+ cacheDir: string
50
+ cbs: BootstrapCallbacks
51
+ uv: string
52
+ venvDir: string
53
+ }
54
+
55
+ const createVenv = async (opts: VenvOpts): Promise<boolean> => {
56
+ opts.cbs.onStep('Creating Python virtual environment via uv...')
57
+ const mkdirResult = await runCommand(['mkdir', '-p', opts.cacheDir])
38
58
  if (!mkdirResult.ok) return false
39
- const venvResult = await runCommand(['python3', '-m', 'venv', venvDir])
59
+ const venvResult = await runCommand([opts.uv, 'venv', '--python', '3.13', opts.venvDir])
40
60
  if (!venvResult.ok) {
41
- cbs.onStep(`Failed to create venv: ${venvResult.stderr}`)
61
+ opts.cbs.onStep(`Failed to create venv: ${venvResult.stderr}`)
42
62
  return false
43
63
  }
44
- cbs.onStep('Virtual environment created.')
64
+ opts.cbs.onStep('Virtual environment created (Python 3.13).')
45
65
  return true
46
66
  }
47
67
 
48
- const installPackages = async (venvDir: string, cbs: BootstrapCallbacks): Promise<boolean> => {
49
- cbs.onStep('Installing marker-pdf, mlx-vlm, pypdfium2...')
50
- const pipPath = `${venvDir}/bin/pip`
51
- const result = await runCommand([pipPath, 'install', 'marker-pdf', 'mlx-vlm', 'pypdfium2'])
68
+ const installPackages = async (venvDir: string, uv: string, cbs: BootstrapCallbacks): Promise<boolean> => {
69
+ cbs.onStep(`Installing ${PIP_PACKAGES.join(', ')}...`)
70
+ const result = await runCommand([uv, 'pip', 'install', '--python', `${venvDir}/bin/python`, ...PIP_PACKAGES])
52
71
  if (!result.ok) {
53
- cbs.onStep(`pip install failed: ${result.stderr}`)
72
+ cbs.onStep(`uv pip install failed: ${result.stderr}`)
54
73
  return false
55
74
  }
56
75
  cbs.onStep('Packages installed successfully.')
57
76
  return true
58
77
  }
59
78
 
60
- const ensureVenvExists = async (cbs: BootstrapCallbacks): Promise<null | { skip: boolean; venvDir: string }> => {
79
+ const ensureVenvExists = async (
80
+ uv: string,
81
+ cbs: BootstrapCallbacks
82
+ ): Promise<null | { skip: boolean; venvDir: string }> => {
61
83
  const { cacheDir, venvPython } = getPaths()
62
84
  const venvDir = `${cacheDir}/.venv`
63
85
  if (existsSync(venvPython)) {
@@ -66,21 +88,29 @@ const ensureVenvExists = async (cbs: BootstrapCallbacks): Promise<null | { skip:
66
88
  cbs.onStep('Some packages missing, installing...')
67
89
  return { skip: false, venvDir }
68
90
  }
69
- const created = await createVenv(cacheDir, venvDir, cbs)
91
+ const created = await createVenv({ cacheDir, cbs, uv, venvDir })
70
92
  return created ? { skip: false, venvDir } : null
71
93
  }
72
94
 
73
- const bootstrapPython = async (cbs: BootstrapCallbacks): Promise<boolean> => {
74
- const result = await ensureVenvExists(cbs)
95
+ const requireUv = async (cbs: BootstrapCallbacks): Promise<string | undefined> => {
96
+ const uv = await findUv()
97
+ if (!uv) cbs.onStep('uv not found. Install it: curl -LsSf https://astral.sh/uv/install.sh | sh')
98
+ return uv
99
+ }
100
+
101
+ const setupAndInstall = async (uv: string, cbs: BootstrapCallbacks): Promise<boolean> => {
102
+ const result = await ensureVenvExists(uv, cbs)
75
103
  if (!result) return false
76
- if (result.skip) {
77
- cbs.onDone()
78
- return true
79
- }
80
- const ok = await installPackages(result.venvDir, cbs)
81
- if (!ok) return false
82
- cbs.onDone()
83
- return true
104
+ if (result.skip) return true
105
+ return installPackages(result.venvDir, uv, cbs)
106
+ }
107
+
108
+ const bootstrapPython = async (cbs: BootstrapCallbacks): Promise<boolean> => {
109
+ const uv = await requireUv(cbs)
110
+ if (!uv) return false
111
+ const ok = await setupAndInstall(uv, cbs)
112
+ if (ok) cbs.onDone()
113
+ return ok
84
114
  }
85
115
 
86
116
  export { bootstrapPython }
package/src/preflight.ts CHANGED
@@ -37,7 +37,7 @@ const checkPythonPackage = async (pythonPath: string, pkg: string): Promise<bool
37
37
  }
38
38
 
39
39
  const checkVenvPackages = async (venvPython: string): Promise<string[]> => {
40
- const requiredPackages = ['marker', 'mlx_vlm', 'pypdfium2']
40
+ const requiredPackages = ['marker', 'markitdown', 'mlx_vlm', 'pypdfium2']
41
41
  const results = await Promise.all(
42
42
  requiredPackages.map(async pkg => ({ missing: !(await checkPythonPackage(venvPython, pkg)), pkg }))
43
43
  )
@@ -67,7 +67,7 @@ const collectToolIssues = (
67
67
  if (!tools.hasSoffice) warnings.push('soffice not found \u2014 install LibreOffice for .doc/.docx support')
68
68
  if (!tools.hasVenv)
69
69
  errors.push(
70
- `Python venv not found at ${venvPython} \u2014 run: python3 -m venv .venv && .venv/bin/pip install marker-pdf mlx-vlm pypdfium2`
70
+ `Python venv not found at ${venvPython} \u2014 install uv (curl -LsSf https://astral.sh/uv/install.sh | sh) and re-run`
71
71
  )
72
72
  if (!checkFileExists(getPaths().dataDir)) warnings.push('No data directory found \u2014 create it and add documents')
73
73
  return { errors, warnings }
@@ -81,9 +81,7 @@ const runPreflight = async (): Promise<PreflightResult> => {
81
81
  if (tools.hasVenv) {
82
82
  const missing = await checkVenvPackages(venvPython)
83
83
  if (missing.length > 0)
84
- warnings.push(
85
- `Python packages missing: ${missing.join(', ')} \u2014 run: .venv/bin/pip install ${missing.join(' ')}`
86
- )
84
+ warnings.push(`Python packages missing: ${missing.join(', ')} \u2014 re-run anymd to auto-install`)
87
85
  }
88
86
 
89
87
  return { errors, warnings }
package/src/tui-data.ts CHANGED
@@ -387,9 +387,35 @@ const spawnCommand = (key: CommandKey): null | { args: string[]; label: string;
387
387
  classify: null,
388
388
  dataset: null,
389
389
  enhance: null,
390
- ocr: { args: [p.venvPython, join(p.scriptsDir, 'batch-ocr.py')], label: 'Chandra OCR' },
390
+ ocr: {
391
+ args: [
392
+ p.venvPython,
393
+ join(p.scriptsDir, 'batch-ocr.py'),
394
+ '--data-dir',
395
+ p.dataDir,
396
+ '--classification',
397
+ p.classification,
398
+ '--output-base',
399
+ p.ocrRaw,
400
+ '--status-file',
401
+ p.ocrProgress,
402
+ '--log-file',
403
+ p.ocrLog
404
+ ],
405
+ label: 'Chandra OCR'
406
+ },
391
407
  pipeline: {
392
- args: ['bun', 'run', join(packageRoot, 'main.ts'), '--file-list', p.nativeFileList, '--base-dir', p.dataDir],
408
+ args: [
409
+ 'bun',
410
+ 'run',
411
+ join(packageRoot, 'main.ts'),
412
+ '--file-list',
413
+ p.nativeFileList,
414
+ '--base-dir',
415
+ p.dataDir,
416
+ '--output-dir',
417
+ p.outputDir
418
+ ],
393
419
  label: 'Doc/PDF \u2192 Markdown pipeline'
394
420
  }
395
421
  }