anymd 0.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +139 -0
- package/cli.tsx +42 -0
- package/main.ts +62 -0
- package/package.json +67 -0
- package/scripts/batch-ocr.py +300 -0
- package/scripts/docx-to-md.py +66 -0
- package/scripts/pdf-to-md.py +68 -0
- package/src/bootstrap.ts +87 -0
- package/src/config.ts +39 -0
- package/src/constants.ts +65 -0
- package/src/content-quality-validator.ts +138 -0
- package/src/doc-to-md.ts +333 -0
- package/src/md-enhancer.ts +158 -0
- package/src/md-validator.ts +83 -0
- package/src/paths.ts +58 -0
- package/src/pipeline.ts +77 -0
- package/src/preflight.ts +92 -0
- package/src/tui-data.ts +562 -0
- package/src/types.ts +96 -0
- package/src/utils.ts +85 -0
- package/tsconfig.json +30 -0
- package/tui.tsx +1204 -0
package/README.md
ADDED
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
# anymd
|
|
2
|
+
|
|
3
|
+
Convert any document (PDF, DOC, DOCX) to clean Markdown for RAG. macOS Apple Silicon only.
|
|
4
|
+
|
|
5
|
+
## Install
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
bunx anymd --input-dir ./my-documents
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
On first run, `anymd` automatically sets up a Python virtual environment in `~/.cache/anymd/` and installs all required Python packages (marker-pdf, mlx-vlm, pypdfium2). Subsequent runs start instantly.
|
|
12
|
+
|
|
13
|
+
## Usage
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
bunx anymd --input-dir <path> [--output-dir <path>] [--config <path>]
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
| Flag | Required | Default | Description |
|
|
20
|
+
|------|----------|---------|-------------|
|
|
21
|
+
| `--input-dir` | Yes | — | Directory containing documents (any nested structure) |
|
|
22
|
+
| `--output-dir` | No | `./output` | Where to write all output files |
|
|
23
|
+
| `--config` | No | `./config.json` | Path to configuration file |
|
|
24
|
+
|
|
25
|
+
Launches a TUI that auto-runs all 5 pipeline steps with live per-file progress. On completion: rings terminal bell, sends macOS notification, and updates the terminal title. Safe to Ctrl+C — progress is saved, re-run to resume.
|
|
26
|
+
|
|
27
|
+
## Pipeline Steps
|
|
28
|
+
|
|
29
|
+
| Step | What | Tool | Output |
|
|
30
|
+
|------|------|------|--------|
|
|
31
|
+
| 1. Classify | Detect native/scanned/mixed PDFs | pdftotext (TypeScript) | `<input-dir>/classification.json` |
|
|
32
|
+
| 2. Convert | Doc/docx/native PDF → raw markdown | soffice + markitdown + marker | `<output-dir>/raw-md/` |
|
|
33
|
+
| 3. OCR | Scanned/mixed PDF → markdown | mlx-vlm chandra-8bit | `<output-dir>/ocr-raw/` |
|
|
34
|
+
| 4. Enhance | Heading detection + cleanup | TypeScript | `<output-dir>/markdown/` |
|
|
35
|
+
| 5. Dataset | Collect all markdown → JSONL | TypeScript | `<output-dir>/dataset/dataset.jsonl` |
|
|
36
|
+
|
|
37
|
+
Steps 2 and 3 run in parallel when both are needed.
|
|
38
|
+
|
|
39
|
+
## TUI Hotkeys
|
|
40
|
+
|
|
41
|
+
| Key | Action |
|
|
42
|
+
|-----|--------|
|
|
43
|
+
| R | Retry failed step |
|
|
44
|
+
| S | Skip failed step |
|
|
45
|
+
| L | Toggle log overlay (pipeline + OCR, scrollable) |
|
|
46
|
+
| ↑↓ | Scroll log overlay |
|
|
47
|
+
| Q / ESC | Quit |
|
|
48
|
+
|
|
49
|
+
## System Requirements
|
|
50
|
+
|
|
51
|
+
- macOS with Apple Silicon (64GB recommended for OCR)
|
|
52
|
+
- [Bun](https://bun.sh) runtime
|
|
53
|
+
- [poppler](https://poppler.freedesktop.org/) (`pdftotext`) — `brew install poppler`
|
|
54
|
+
- [LibreOffice](https://www.libreoffice.org/) (`soffice`) — optional, only for `.doc` files
|
|
55
|
+
- Python 3.13+ — venv and packages are auto-installed on first run
|
|
56
|
+
|
|
57
|
+
The TUI preflight check tells you exactly what's missing.
|
|
58
|
+
|
|
59
|
+
## Auto-Bootstrap
|
|
60
|
+
|
|
61
|
+
On first run, `anymd` creates `~/.cache/anymd/.venv` and installs:
|
|
62
|
+
- `marker-pdf` — PDF to markdown conversion
|
|
63
|
+
- `mlx-vlm` — Apple Silicon MLX inference for OCR
|
|
64
|
+
- `pypdfium2` — PDF page rendering
|
|
65
|
+
|
|
66
|
+
This takes ~2 minutes. Progress is shown in the TUI. Subsequent runs detect the existing venv and skip setup.
|
|
67
|
+
|
|
68
|
+
## Configuration
|
|
69
|
+
|
|
70
|
+
Create a `config.json` in your working directory (or pass `--config <path>`):
|
|
71
|
+
|
|
72
|
+
| Key | Default | Description |
|
|
73
|
+
|-----|---------|-------------|
|
|
74
|
+
| `classifyBatchSize` | 20 | PDFs classified per batch |
|
|
75
|
+
| `datasetConcurrency` | 50 | Parallel file reads during dataset build |
|
|
76
|
+
| `enhanceConcurrency` | 10 | Parallel markdown enhancement workers |
|
|
77
|
+
| `markerWorkers` | 3 | Parallel marker-pdf processes for PDF → markdown |
|
|
78
|
+
| `minTextLength` | 50 | Minimum characters for a document to be included in dataset |
|
|
79
|
+
| `nativeThreshold` | 200 | Alpha chars above which a PDF is classified as native |
|
|
80
|
+
| `scannedThreshold` | 50 | Alpha chars below which a PDF is classified as scanned |
|
|
81
|
+
|
|
82
|
+
All fields are optional — omitted fields use defaults. No config file = all defaults.
|
|
83
|
+
|
|
84
|
+
## Notifications
|
|
85
|
+
|
|
86
|
+
- **Terminal title** updates with current step, progress percentage, and parallel status
|
|
87
|
+
- **Terminal bell** rings on pipeline completion
|
|
88
|
+
- **macOS notification** fires on completion via `osascript`
|
|
89
|
+
- **Error log** written to `<output-dir>/errors.log` with timestamps — persists across all steps
|
|
90
|
+
|
|
91
|
+
## Output Structure
|
|
92
|
+
|
|
93
|
+
```
|
|
94
|
+
<output-dir>/
|
|
95
|
+
├── raw-md/ Raw markdown from conversion (step 2)
|
|
96
|
+
├── ocr-raw/ OCR markdown from scanned PDFs (step 3)
|
|
97
|
+
├── markdown/ Final enhanced markdown (step 4)
|
|
98
|
+
├── dataset/dataset.jsonl JSONL dataset for RAG (step 5)
|
|
99
|
+
├── pipeline-log.txt Pipeline conversion log
|
|
100
|
+
├── ocr-log.txt OCR processing log
|
|
101
|
+
└── errors.log Timestamped error log (all steps)
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
## Input Requirements
|
|
105
|
+
|
|
106
|
+
- Put documents anywhere inside `--input-dir` in any folder structure
|
|
107
|
+
- Supports `.doc`, `.docx`, `.pdf` (native, scanned, mixed)
|
|
108
|
+
- Output files use flat naming with `--` separator: `docs/foo/bar/doc.pdf` → `foo--bar--doc.md`
|
|
109
|
+
|
|
110
|
+
## Dataset Deduplication
|
|
111
|
+
|
|
112
|
+
Step 5 deduplicates entries by content hash. If two source files produce identical markdown, only one entry appears in the JSONL. The TUI completion summary shows the dedup count.
|
|
113
|
+
|
|
114
|
+
## Resume Support
|
|
115
|
+
|
|
116
|
+
All steps support resume:
|
|
117
|
+
|
|
118
|
+
- Classify: re-runs if `classification.json` missing
|
|
119
|
+
- Convert: skips files already in `raw-md/`
|
|
120
|
+
- OCR: skips files already in `ocr-raw/`
|
|
121
|
+
- Enhance: skips files already in `markdown/`
|
|
122
|
+
- Dataset: always regenerates from `markdown/`
|
|
123
|
+
|
|
124
|
+
## OCR Details
|
|
125
|
+
|
|
126
|
+
Native PDFs use marker-pdf for structured markdown extraction (headings, bold, lists). Scanned/mixed PDFs use `mlx-community/chandra-8bit` via mlx-vlm at 150 DPI (~32s per page on Apple Silicon). Chandra was chosen for superior Vietnamese diacritical accuracy over marker's surya OCR. Mixed PDFs use native text for text-heavy pages and OCR for scanned pages.
|
|
127
|
+
|
|
128
|
+
## Development
|
|
129
|
+
|
|
130
|
+
```bash
|
|
131
|
+
bun run doc # Run locally (uses ./data as input)
|
|
132
|
+
bun test # 73 unit tests
|
|
133
|
+
bun fix # TypeScript linting (oxlint + eslint + biome + tsc)
|
|
134
|
+
ruff format && ruff check --fix # Python linting
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
## License
|
|
138
|
+
|
|
139
|
+
MIT
|
package/cli.tsx
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
#!/usr/bin/env bun
|
|
2
|
+
import { resolve } from 'node:path'
|
|
3
|
+
|
|
4
|
+
import { loadConfig } from '~/config'
|
|
5
|
+
import { initPaths } from '~/paths'
|
|
6
|
+
|
|
7
|
+
const printUsage = (): void => {
|
|
8
|
+
const text = `Usage: anymd --input-dir <path> [--output-dir <path>] [--config <path>]
|
|
9
|
+
|
|
10
|
+
Options:
|
|
11
|
+
--input-dir Input directory containing documents (required)
|
|
12
|
+
--output-dir Output directory (default: ./output)
|
|
13
|
+
--config Path to config.json (default: ./config.json)`
|
|
14
|
+
process.stdout.write(`${text}\n`)
|
|
15
|
+
}
|
|
16
|
+
|
|
17
|
+
const getArgValue = (argv: string[], flag: string): string | undefined => {
|
|
18
|
+
const idx = argv.indexOf(flag)
|
|
19
|
+
return idx === -1 ? undefined : argv[idx + 1]
|
|
20
|
+
}
|
|
21
|
+
|
|
22
|
+
const parseArgs = (argv: string[]): { configPath?: string; inputDir?: string; outputDir: string } => ({
|
|
23
|
+
configPath: getArgValue(argv, '--config'),
|
|
24
|
+
inputDir: getArgValue(argv, '--input-dir'),
|
|
25
|
+
outputDir: getArgValue(argv, '--output-dir') ?? './output'
|
|
26
|
+
})
|
|
27
|
+
|
|
28
|
+
// --- CLI entry point -------------------------------------------------------
const args = parseArgs(process.argv)

// --input-dir is mandatory; bail out with usage help otherwise.
if (!args.inputDir) {
  printUsage()
  process.exit(1)
}

const resolvedInput = resolve(args.inputDir)
const resolvedOutput = resolve(args.outputDir)

// Initialize global path/config state before the TUI module is loaded.
initPaths(resolvedInput, resolvedOutput)
loadConfig(args.configPath)

// Lazy import: load the TUI module only after paths/config are ready.
const { start } = await import('./tui')
await start()
|
package/main.ts
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
import { error, log } from 'node:console'
|
|
2
|
+
import { readFile } from 'node:fs/promises'
|
|
3
|
+
import { resolve } from 'node:path'
|
|
4
|
+
|
|
5
|
+
import { getPaths, initPaths } from '~/paths'
|
|
6
|
+
import { runPipeline } from '~/pipeline'
|
|
7
|
+
import { logger } from '~/utils'
|
|
8
|
+
|
|
9
|
+
// CLI argument scan: optional --file-list (explicit file manifest) and
// --base-dir (root used to derive relative output names).
const fileListIdx = process.argv.indexOf('--file-list'),
  baseDirIdx = process.argv.indexOf('--base-dir')
let inputDir = process.argv[2] ?? './data'
let inputFiles: string[] | undefined
let baseDir = baseDirIdx === -1 ? '' : (process.argv[baseDirIdx + 1] ?? resolve('data'))

// When a file list is given, read one path per line (blanks dropped) and
// collapse inputDir to '.' — the list already carries usable paths.
if (fileListIdx !== -1) {
  const listPath = process.argv[fileListIdx + 1]
  if (listPath) {
    const content = await readFile(listPath, 'utf8')
    inputFiles = content
      .trim()
      .split('\n')
      .filter(l => l.length > 0)
    inputDir = '.'
  }
}

if (!baseDir) baseDir = inputDir

// getPaths() throws until initPaths() has run — initialize lazily so a
// caller that already initialized paths is left untouched.
try {
  getPaths()
} catch {
  initPaths(resolve(baseDir), resolve('./output'))
}

const config = {
  baseDir,
  concurrency: 10,
  inputDir,
  inputFiles,
  outputDir: getPaths().outputDir
}

try {
  const stats = await runPipeline(config)
  logger.summary({
    'Converted to MD': stats.converted,
    'Convert Failed': stats.convertFailed,
    Enhanced: stats.enhanced,
    'Total Characters': stats.totalCharacters.toLocaleString(),
    'Total Input Files': stats.totalFiles
  })
  // Print at most 10 errors inline; summarize the remainder.
  if (stats.errors.length > 0) {
    log('\n[!] Errors encountered:')
    for (const pipelineError of stats.errors.slice(0, 10)) log(` - ${pipelineError}`)
    if (stats.errors.length > 10) log(` ... and ${stats.errors.length - 10} more`)
  }
  log('\n[DONE] Pipeline complete!')
  log(`[>] Markdown files saved to: ${getPaths().markdown}`)
} catch (pipelineError) {
  error('Pipeline failed:', pipelineError)
  process.exit(1)
}
|
package/package.json
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "anymd",
|
|
3
|
+
"version": "0.0.0",
|
|
4
|
+
"description": "Convert any document (PDF, DOC, DOCX) to clean Markdown for RAG",
|
|
5
|
+
"keywords": [
|
|
6
|
+
"markdown",
|
|
7
|
+
"pdf",
|
|
8
|
+
"ocr",
|
|
9
|
+
"document",
|
|
10
|
+
"converter",
|
|
11
|
+
"rag",
|
|
12
|
+
"mlx",
|
|
13
|
+
"apple-silicon"
|
|
14
|
+
],
|
|
15
|
+
"homepage": "https://github.com/1qh/anymd#readme",
|
|
16
|
+
"bugs": {
|
|
17
|
+
"url": "https://github.com/1qh/anymd/issues"
|
|
18
|
+
},
|
|
19
|
+
"repository": {
|
|
20
|
+
"type": "git",
|
|
21
|
+
"url": "git+https://github.com/1qh/anymd.git"
|
|
22
|
+
},
|
|
23
|
+
"license": "MIT",
|
|
24
|
+
"bin": {
|
|
25
|
+
"anymd": "cli.tsx"
|
|
26
|
+
},
|
|
27
|
+
"files": [
|
|
28
|
+
"cli.tsx",
|
|
29
|
+
"main.ts",
|
|
30
|
+
"tui.tsx",
|
|
31
|
+
"src/",
|
|
32
|
+
"!src/__tests__/",
|
|
33
|
+
"scripts/",
|
|
34
|
+
"tsconfig.json"
|
|
35
|
+
],
|
|
36
|
+
"scripts": {
|
|
37
|
+
"build": "tsc",
|
|
38
|
+
"clean": "git clean -xdf .cache .turbo dist node_modules",
|
|
39
|
+
"doc": "bun cli.tsx --input-dir ./data --output-dir ./output",
|
|
40
|
+
"lint": "eslint",
|
|
41
|
+
"start": "bun run main.ts",
|
|
42
|
+
"test": "bun test",
|
|
43
|
+
"typecheck": "tsc --noEmit",
|
|
44
|
+
"with-env": "dotenv -e ../../.env --"
|
|
45
|
+
},
|
|
46
|
+
"dependencies": {
|
|
47
|
+
"@opentui/core": "^0.1.77",
|
|
48
|
+
"@opentui/react": "^0.1.77",
|
|
49
|
+
"markdownlint": "^0.40.0",
|
|
50
|
+
"p-map": "^7.0.4",
|
|
51
|
+
"react": "^19.2.4",
|
|
52
|
+
"yoctocolors": "^2.1.2",
|
|
53
|
+
"zod": "^4.3.6"
|
|
54
|
+
},
|
|
55
|
+
"devDependencies": {
|
|
56
|
+
"@types/react": "^19.2.13"
|
|
57
|
+
},
|
|
58
|
+
"engines": {
|
|
59
|
+
"bun": ">=1.0.0"
|
|
60
|
+
},
|
|
61
|
+
"os": [
|
|
62
|
+
"darwin"
|
|
63
|
+
],
|
|
64
|
+
"cpu": [
|
|
65
|
+
"arm64"
|
|
66
|
+
]
|
|
67
|
+
}
|
|
@@ -0,0 +1,300 @@
|
|
|
1
|
+
import gc
|
|
2
|
+
import json
|
|
3
|
+
import time
|
|
4
|
+
from dataclasses import dataclass, field
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
import pypdfium2 as pdfium
|
|
8
|
+
import pypdfium2.raw as pdfium_c
|
|
9
|
+
from PIL import Image
|
|
10
|
+
|
|
11
|
+
# Input/output locations, relative to the working directory.
DATA_DIR = Path('data').resolve()
DATA_FILE = Path('data/classification.json')
OUTPUT_BASE = Path('output/ocr-raw')
STATUS_FILE = Path('output/ocr-progress.json')
LOG_FILE = Path('output/ocr-log.txt')

# OCR model and rendering settings.
MODEL_ID = 'mlx-community/chandra-8bit'
IMAGE_DPI = 150
MAX_TOKENS = 8192
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _to_output_name(pdf_path: str) -> str:
    """Map an input PDF path to a flat output stem ('a/b/c.pdf' -> 'a--b--c')."""
    resolved = Path(pdf_path).resolve()
    try:
        rel = resolved.relative_to(DATA_DIR)
    except ValueError:
        # Path lies outside the data dir: fall back to the bare stem.
        return Path(pdf_path).stem
    return '--'.join(rel.with_suffix('').parts)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
OCR_PROMPT = """
|
|
31
|
+
OCR this image to markdown.
|
|
32
|
+
Output clean, readable markdown preserving the document structure.
|
|
33
|
+
Use proper headings, lists, tables, and formatting.
|
|
34
|
+
For math expressions, use LaTeX notation with $ delimiters.
|
|
35
|
+
""".strip()
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _log(msg: str) -> None:
    """Print *msg* with a timestamp and append the same line to the log file."""
    stamped = f'[{time.strftime("%Y-%m-%d %H:%M:%S")}] {msg}'
    print(stamped, flush=True)
    with LOG_FILE.open('a', encoding='utf-8') as log_fh:
        log_fh.write(f'{stamped}\n')
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _format_duration(seconds: float) -> str:
|
|
46
|
+
h = int(seconds // 3600)
|
|
47
|
+
m = int((seconds % 3600) // 60)
|
|
48
|
+
if h > 0:
|
|
49
|
+
return f'{h}h{m:02d}m'
|
|
50
|
+
return f'{m}m{int(seconds % 60):02d}s'
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
@dataclass
class Progress:
    """Snapshot of OCR progress, serialized to the status JSON for the TUI."""

    done: int  # files completed so far
    total: int  # total files scheduled for OCR
    errors: int  # files that raised during OCR
    current_file: str  # display name of the file in flight ('-' when idle)
    current_page: str  # 'page/total' string for the current file
    current_pages_total: int  # page count of the current file
    current_file_started: float  # epoch seconds when the current file began
    elapsed: float  # seconds since the OCR run started
    avg_per_file: float  # rolling average seconds per completed file
    recent_files: list[dict[str, object]] = field(default_factory=list)  # last results
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def _save_progress(p: Progress) -> None:
    """Serialize *p* plus derived ETA fields to STATUS_FILE as pretty JSON."""
    eta_seconds = (p.total - p.done) * p.avg_per_file if p.avg_per_file > 0 else 0

    payload = {
        'done': p.done,
        'total': p.total,
        'errors': p.errors,
        'pct': round(p.done / max(p.total, 1) * 100, 1),
        'current_file': p.current_file,
        'current_page': p.current_page,
        'current_pages_total': p.current_pages_total,
        'current_file_started': p.current_file_started,
        'elapsed': _format_duration(p.elapsed),
        'avg_per_file': f'{p.avg_per_file:.0f}s',
        'eta': _format_duration(eta_seconds),
        'eta_hours': round(eta_seconds / 3600, 1),
        'updated': time.strftime('%Y-%m-%d %H:%M:%S'),
        'recent_files': p.recent_files[-10:],
    }

    STATUS_FILE.write_text(f'{json.dumps(payload, indent=2)}\n', encoding='utf-8')
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def _free_memory() -> None:
    """Force a garbage-collection pass between pages/files to cap peak memory."""
    gc.collect()
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def _flatten_page(page: object) -> None:
    """Flatten the page via the raw pdfium API (FPDFPage_Flatten) so form
    fields/annotations become part of the page content before rendering."""
    pdfium_c.FPDFPage_Flatten(page, pdfium_c.FLAT_NORMALDISPLAY)
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
# Pages with at least this many alphabetic chars keep their native text (no OCR).
NATIVE_TEXT_THRESHOLD = 50
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def _render_page(doc: pdfium.PdfDocument, page_idx: int) -> tuple[Image.Image | None, str]:
    """Return (image, '') for a page needing OCR, or (None, text) for a native page.

    A page with >= NATIVE_TEXT_THRESHOLD alphabetic characters of embedded text
    skips rendering entirely and its extracted text is returned as-is.
    """
    page_obj = doc[page_idx]
    text = page_obj.get_textpage().get_text_range()
    alpha_count = sum(1 for c in text if c.isalpha())

    if alpha_count >= NATIVE_TEXT_THRESHOLD:
        return None, text.strip()

    # Scale so the short side reaches ~1024 px, but never drop below IMAGE_DPI.
    min_dim = min(page_obj.get_width(), page_obj.get_height())
    scale_dpi = max((1024 / min_dim) * 72, IMAGE_DPI)
    _flatten_page(page_obj)
    # Re-fetch the page after flattening — presumably the old handle is stale;
    # TODO confirm against pypdfium2 docs.
    page_obj = doc[page_idx]
    pil_image: Image.Image = page_obj.render(scale=scale_dpi / 72).to_pil().convert('RGB')
    return pil_image, ''
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def _ocr_page(
    model: object,
    processor: object,
    config: dict[str, object],
    image: Image.Image,
) -> str:
    """Run one page image through the VLM and return the generated markdown."""
    # Deferred imports: mlx_vlm is heavy and only needed on the OCR path.
    from mlx_vlm import generate  # noqa: PLC0415
    from mlx_vlm.prompt_utils import apply_chat_template  # noqa: PLC0415

    # Wrap the fixed OCR prompt in the model's chat template (one image slot).
    formatted = apply_chat_template(processor, config, OCR_PROMPT, num_images=1)
    result = generate(
        model,  # type: ignore[arg-type]
        processor,  # type: ignore[arg-type]
        formatted,  # type: ignore[arg-type]
        image,  # type: ignore[arg-type]
        max_tokens=MAX_TOKENS,
        temperature=0.0,  # greedy decoding for reproducible OCR output
        verbose=False,
    )
    return result.text
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def _ocr_one_file(  # noqa: PLR0913, PLR0917
    pdf: str,
    model: object,
    processor: object,
    config: dict[str, object],
    done_count: int,
    idx: int,
    pending_len: int,
    total: int,
    errors: int,
    avg: float,
    pipeline_start: float,
    recent_files: list[dict[str, object]],
) -> int:
    """OCR one PDF into OUTPUT_BASE/<flat-name>.md and return its page count.

    Progress is re-saved before each page so the TUI shows live per-page
    status; memory is compacted every 5 pages and again after the file.
    Raises whatever pdfium/mlx raise — the caller handles errors.
    """
    unique_name = _to_output_name(pdf)
    display_name = Path(pdf).stem
    file_start = time.time()
    doc = pdfium.PdfDocument(pdf)
    try:
        doc.init_forms()
        num_pages = len(doc)
        _log(f'[{idx}/{pending_len}] ({done_count + idx}/{total}) OCR {display_name} ({num_pages}p)')
        all_markdown: list[str] = []

        for p_idx in range(num_pages):
            # Publish progress before the page starts ('done' excludes this file).
            _save_progress(
                Progress(
                    done=done_count + idx - 1,
                    total=total,
                    errors=errors,
                    current_file=display_name,
                    current_page=f'{p_idx + 1}/{num_pages}',
                    current_pages_total=num_pages,
                    current_file_started=file_start,
                    elapsed=time.time() - pipeline_start,
                    avg_per_file=avg,
                    recent_files=recent_files,
                )
            )

            page_t = time.time()
            image, native_text = _render_page(doc, p_idx)

            # image is None when the page had enough embedded text to skip OCR.
            if image is not None:
                md = _ocr_page(model, processor, config, image)
                image.close()
                tag = 'ocr'
            else:
                md = native_text
                tag = 'txt'

            all_markdown.append(md)
            _log(f' p{p_idx + 1}/{num_pages} [{tag}] {time.time() - page_t:.0f}s ({len(md)} chars)')

            # Periodic GC keeps peak memory bounded on long documents.
            if p_idx % 5 == 0:
                _free_memory()

        md_path = OUTPUT_BASE / f'{unique_name}.md'
        md_path.write_text('\n\n'.join(all_markdown), encoding='utf-8')

        del all_markdown
        _free_memory()
        return num_pages
    finally:
        # Always release the pdfium document, even on error.
        doc.close()
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
def _run_ocr() -> None:  # noqa: PLR0914
    """Drive the OCR batch: read classification, skip done files, OCR the rest."""
    with DATA_FILE.open(encoding='utf-8') as f:
        data = json.load(f)

    # Only scanned and mixed PDFs need OCR here.
    files = data['files']['scanned'] + data['files']['mixed']
    OUTPUT_BASE.mkdir(parents=True, exist_ok=True)

    # Resume support: an existing .md output means the file is already done.
    done_count = 0
    pending = []
    for pdf in files:
        unique_name = _to_output_name(pdf)
        if (OUTPUT_BASE / f'{unique_name}.md').exists():
            done_count += 1
        else:
            pending.append(pdf)

    total = len(files)
    _log(f'Total: {total}, Already done: {done_count}, Pending: {len(pending)}')

    if not pending:
        _log('Nothing to OCR.')
        _save_progress(Progress(total, total, 0, '-', '-', 0, 0, 0, 0, []))
        return

    # Model import is deferred so the resume/no-op path never loads MLX.
    _log(f'Loading MLX model {MODEL_ID}...')
    t0 = time.time()
    from mlx_vlm import load  # noqa: PLC0415
    from mlx_vlm.utils import load_config  # noqa: PLC0415

    model, processor = load(MODEL_ID)
    config = load_config(MODEL_ID)
    _log(f'Model loaded in {time.time() - t0:.0f}s')

    errors = 0
    file_times: list[float] = []
    recent_files: list[dict[str, object]] = []
    pipeline_start = time.time()

    for i, pdf in enumerate(pending, 1):
        unique_name = _to_output_name(pdf)

        # Re-check existence: output may have appeared since pending was built.
        # NOTE(review): a file counted here is also in len(pending), so the
        # final 'Done' total below can overcount in that case — confirm.
        if (OUTPUT_BASE / f'{unique_name}.md').exists():
            done_count += 1
        else:
            t1 = time.time()
            # Seed the ETA average with 60s until real timings exist.
            avg = sum(file_times) / len(file_times) if file_times else 60

            try:
                num_pages = _ocr_one_file(
                    pdf, model, processor, config, done_count, i, len(pending), total, errors, avg, pipeline_start, recent_files
                )
                elapsed = time.time() - t1
                file_times.append(elapsed)
                avg = sum(file_times) / len(file_times)
                display_name = Path(pdf).stem
                recent_files.append({
                    'name': display_name,
                    'pages': num_pages,
                    'duration': round(elapsed, 1),
                    'per_page': round(elapsed / max(num_pages, 1), 1),
                })
                _log(f' done {elapsed:.0f}s ({elapsed / max(num_pages, 1):.0f}s/p) avg={avg:.0f}s/file')
                _save_progress(
                    Progress(
                        done_count + i,
                        total,
                        errors,
                        '-',
                        '-',
                        0,
                        0,
                        time.time() - pipeline_start,
                        avg,
                        recent_files,
                    )
                )
            except Exception as ocr_err:  # noqa: BLE001
                # One bad PDF must not abort the whole batch; count and move on.
                errors += 1
                _log(f' ERROR {Path(pdf).stem}: {ocr_err}')
                _free_memory()

    total_time = _format_duration(time.time() - pipeline_start)
    _log(f'OCR complete. Done: {done_count + len(pending)}, Errors: {errors}, Time: {total_time}')
|
|
290
|
+
|
|
291
|
+
|
|
292
|
+
def main() -> None:
    """Entry point: ensure the output dir exists, then run the OCR batch."""
    # OUTPUT_BASE is under output/, so this also creates the log file's parent.
    OUTPUT_BASE.mkdir(parents=True, exist_ok=True)
    _log('=== Batch OCR Start (MLX) ===')
    _run_ocr()
    _log('=== OCR DONE ===')


if __name__ == '__main__':
    main()
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import sys
|
|
3
|
+
import time
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
from markitdown import MarkItDown
|
|
7
|
+
|
|
8
|
+
# Expected argv length: script name + manifest path.
MIN_ARGS = 2
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _emit(data: dict[str, object]) -> None:
|
|
12
|
+
print(json.dumps(data), flush=True)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _convert_one(converter: MarkItDown, input_path: str, out_path: str, index: int, total: int) -> None:
    """Convert one document to markdown, emitting a 'converted' or 'error' event.

    Never raises: any conversion failure is reported as an error event so the
    driving process can keep the batch going.
    """
    t1 = time.time()
    try:
        result = converter.convert(input_path)
        md = result.text_content
        Path(out_path).parent.mkdir(parents=True, exist_ok=True)
        Path(out_path).write_text(md, encoding='utf-8')
        _emit({
            'type': 'converted',
            'index': index,
            'total': total,
            'file': Path(input_path).name,
            'seconds': round(time.time() - t1, 1),
            'chars': len(md),
        })
    except Exception as exc:  # noqa: BLE001
        _emit({
            'type': 'error',
            'index': index,
            'total': total,
            'file': Path(input_path).name,
            'seconds': round(time.time() - t1, 1),
            'error': str(exc),
        })
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def main() -> None:
    """Read a manifest of {input, output} pairs and convert each via MarkItDown.

    Emits JSON progress events on stdout ('loading'/'loaded'/'converted'/
    'error'/'done') for the driving process to consume.
    """
    if len(sys.argv) < MIN_ARGS:
        print('Usage: docx-to-md.py <manifest.json>', file=sys.stderr)
        sys.exit(1)

    manifest_path = sys.argv[1]
    manifest: list[dict[str, str]] = json.loads(Path(manifest_path).read_text('utf-8'))

    if not manifest:
        _emit({'type': 'done', 'total': 0})
        return

    # Constructing MarkItDown is the slow part; report load timing as events.
    t0 = time.time()
    _emit({'type': 'loading'})
    converter = MarkItDown()
    _emit({'type': 'loaded', 'seconds': round(time.time() - t0, 1)})

    total = len(manifest)
    for i, entry in enumerate(manifest):
        _convert_one(converter, entry['input'], entry['output'], i, total)

    _emit({'type': 'done', 'total': total})


if __name__ == '__main__':
    main()
|