@lenne.tech/cli 1.17.0 → 1.19.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/lt +6 -2
- package/build/commands/tools/ocr.js +166 -0
- package/build/extensions/frontend-helper.js +72 -0
- package/build/lib/marker.js +218 -0
- package/docs/commands.md +47 -0
- package/package.json +1 -1
package/bin/lt
CHANGED
|
@@ -155,8 +155,12 @@ function runCLI() {
|
|
|
155
155
|
require(`${__dirname}/../build/cli`).run(process.argv);
|
|
156
156
|
} else {
|
|
157
157
|
// this runs from the typescript source (for dev only)
|
|
158
|
-
// hook into ts-node so we can run typescript on the fly
|
|
159
|
-
|
|
158
|
+
// hook into ts-node so we can run typescript on the fly. Use
|
|
159
|
+
// `transpileOnly` to skip ts-node's runtime type-checking pass —
|
|
160
|
+
// it triples startup time (~4 s → ~1.2 s) without catching
|
|
161
|
+
// anything new, since `npm run lint` (eslint) and `npm run
|
|
162
|
+
// compile` (tsc) already type-check the project before publish.
|
|
163
|
+
require('ts-node').register({ project: `${__dirname}/../tsconfig.json`, transpileOnly: true });
|
|
160
164
|
// run the CLI with the current process arguments
|
|
161
165
|
require(`${__dirname}/../src/cli`).run(process.argv);
|
|
162
166
|
}
|
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
// TypeScript down-level async helper: drives a generator function as if it
// were an `async` function and returns a promise for its final value.
//   thisArg / _arguments - receiver and argument list applied to `generator`
//   P                    - promise constructor to use (defaults to Promise)
//   generator            - generator function whose `yield`s act as `await`s
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
    if (!P) {
        P = Promise;
    }
    // Wrap plain values so every yielded value can be chained with `.then`.
    function adopt(candidate) {
        if (candidate instanceof P) {
            return candidate;
        }
        return new P(function (settle) { settle(candidate); });
    }
    return new P(function (resolve, reject) {
        // Feed one iterator result forward: finish, or wait for the yielded value.
        function advance(outcome) {
            if (outcome.done) {
                resolve(outcome.value);
                return;
            }
            adopt(outcome.value).then(onFulfilled, onRejected);
        }
        // Resume the generator with the awaited value.
        function onFulfilled(value) {
            try {
                advance(generator.next(value));
            }
            catch (err) {
                reject(err);
            }
        }
        // Re-throw the rejection inside the generator so its try/catch can see it.
        function onRejected(reason) {
            try {
                advance(generator["throw"](reason));
            }
            catch (err) {
                reject(err);
            }
        }
        generator = generator.apply(thisArg, _arguments || []);
        advance(generator.next());
    });
};
|
|
11
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
12
|
+
const fs_1 = require("fs");
|
|
13
|
+
const path_1 = require("path");
|
|
14
|
+
const marker_1 = require("../../lib/marker");
|
|
15
|
+
/**
 * OCR command: convert a PDF (or a directory of PDFs) to Markdown via
 * marker-pdf, using MPS acceleration on Apple Silicon when available.
 *
 * Marker lives in `~/.lt/marker/.venv/`; it is installed automatically on
 * the first conversion (~3 GB model download) and reused afterwards.
 *
 * Modes:
 *   --status   print tool availability and exit
 *   --install  install marker-pdf and exit
 *   <path>     convert the given file or directory
 *
 * Examples:
 *   lt tools ocr ./report.pdf
 *   lt tools ocr ./pdfs --output-dir ./md --workers 4
 *   lt tools ocr --install
 *   lt tools ocr --status
 *
 * The command always resolves to 'ocr'. Outside the interactive gluegun
 * menu it terminates the process with exit code 0 on success and a
 * non-zero code on failure.
 */
const NewCommand = {
    alias: ['ocr', 'pdf2md'],
    description: 'OCR PDFs to Markdown via marker-pdf (MPS-accelerated on Apple Silicon)',
    hidden: false,
    name: 'ocr',
    run: (toolbox) => __awaiter(void 0, void 0, void 0, function* () {
        const { parameters, print: { error, info, spin, warning }, } = toolbox;
        const { options } = parameters;
        const DEFAULT_WORKERS = 3;
        // Exit the process unless we were started from the interactive menu,
        // in which case control must return to the menu.
        const finish = (exitCode) => {
            if (!options.fromGluegunMenu)
                process.exit(exitCode);
            return 'ocr';
        };
        // Thrown values are not guaranteed to be Error instances.
        const describe = (err) => (err instanceof Error ? err.message : String(err));
        const firstDefined = (...candidates) => candidates.find((candidate) => candidate != null);
        // Runs the installer behind a spinner; resolves to true on success.
        const install = (spinnerText) => __awaiter(void 0, void 0, void 0, function* () {
            const installSpinner = spin(spinnerText);
            try {
                yield (0, marker_1.installMarker)({
                    onProgress: (msg) => {
                        installSpinner.text = msg;
                    },
                });
                installSpinner.succeed('marker-pdf installed');
                return true;
            }
            catch (err) {
                installSpinner.fail('Installation failed');
                error(describe(err));
                return false;
            }
        });
        // Status mode
        if (options.status) {
            const status = yield (0, marker_1.getMarkerStatus)();
            const detected = (0, marker_1.resolveDevice)('auto');
            info('marker-pdf status:');
            info(`  installed:  ${status.installed ? 'yes' : 'no'}`);
            info(`  python3:    ${status.pythonAvailable ? 'yes' : 'no'}`);
            info(`  uv:         ${status.uvAvailable ? 'yes' : 'no'}`);
            info(`  venv:       ${status.venvPath}`);
            info(`  device:     ${detected} (auto-detected)`);
            return finish(0);
        }
        // Install-only mode
        if (options.install) {
            const installed = yield install('Installing marker-pdf …');
            return finish(installed ? 0 : 1);
        }
        // Normal run: need an input path
        const inputArg = parameters.first;
        if (!inputArg) {
            error('Missing input path. Usage:');
            info('  lt tools ocr <file.pdf|directory>     Convert PDFs to Markdown');
            info('  lt tools ocr --install                Install marker-pdf locally');
            info('  lt tools ocr --status                 Show installation status');
            info('');
            info('Options:');
            info('  --output-dir <dir>     Output directory (default: <input>-MD/)');
            info('  --workers <n>          Parallel workers for batch mode (default: 3)');
            info('  --device <auto|mps|cuda|cpu>   Override TORCH_DEVICE (default: auto)');
            info('  --skip-existing        Skip already-converted files (batch mode)');
            info('  --keep-images          Extract embedded images (default: off)');
            info('  --format <markdown|json|html|chunks>   Output format (default: markdown)');
            return finish(1);
        }
        const inputPath = (0, path_1.resolve)(process.cwd(), inputArg);
        if (!(0, fs_1.existsSync)(inputPath)) {
            error(`Input not found: ${inputPath}`);
            return finish(1);
        }
        // Auto-install if needed
        const status = yield (0, marker_1.getMarkerStatus)();
        if (!status.installed) {
            warning('marker-pdf not yet installed — running first-time setup …');
            const installed = yield install('Installing marker-pdf (one-time, ~3 GB model download) …');
            if (!installed)
                return finish(1);
        }
        // Resolve options
        const isDir = (0, fs_1.statSync)(inputPath).isDirectory();
        const defaultOutput = isDir ? `${inputPath}-MD` : `${inputPath}.md-out`;
        const outputDir = (0, path_1.resolve)(process.cwd(), String(firstDefined(options['output-dir'], options.outputDir, defaultOutput)));
        let workers = options.workers != null ? Number(options.workers) : DEFAULT_WORKERS;
        if (!Number.isInteger(workers) || workers < 1) {
            // Previously an invalid value silently became NaN and was dropped.
            warning(`Ignoring invalid --workers value "${options.workers}"; using ${DEFAULT_WORKERS}`);
            workers = DEFAULT_WORKERS;
        }
        const skipExisting = options['skip-existing'] !== false; // default: true
        const keepImages = !!options['keep-images'];
        const format = String(firstDefined(options.format, 'markdown'));
        const device = firstDefined(options.device, 'auto');
        info(`OCR ${isDir ? 'batch' : 'single'} → ${outputDir}`);
        info(`  device:     ${(0, marker_1.resolveDevice)(device)}`);
        if (isDir)
            info(`  workers:    ${workers}, skip-existing: ${skipExisting}`);
        info('');
        const runSpinner = spin('Converting (may take a while on first run while models load)…');
        let lastLine = '';
        let result;
        try {
            result = yield (0, marker_1.runMarker)(inputPath, {
                device,
                disableImages: !keepImages,
                onLine: (line) => {
                    // Forward marker output to spinner text (last line) so the user sees progress
                    if (line.trim()) {
                        lastLine = line.replace(/\s+/g, ' ').trim().slice(-160);
                        runSpinner.text = lastLine;
                    }
                },
                outputDir,
                outputFormat: format,
                skipExisting,
                workers,
            });
        }
        catch (err) {
            // runMarker rejects when the binary cannot be spawned or marker is
            // missing; without this the spinner would keep running forever.
            runSpinner.fail(`marker failed to run: ${describe(err)}`);
            return finish(1);
        }
        if (result.exitCode !== 0) {
            runSpinner.fail(`marker exited with code ${result.exitCode}: ${lastLine}`);
            return finish(result.exitCode);
        }
        runSpinner.succeed(`Done — output in ${outputDir}`);
        return finish(0);
    }),
};
|
|
166
|
+
exports.default = NewCommand;
|
|
@@ -63,6 +63,70 @@ class FrontendHelper {
|
|
|
63
63
|
content = content.replace(/^(NUXT_PUBLIC_STORAGE_PREFIX=).*$/m, `$1${projectName}-local`);
|
|
64
64
|
filesystem.write(envPath, content);
|
|
65
65
|
}
|
|
66
|
+
/**
|
|
67
|
+
* Flatten the cloned nuxt-base-starter wrapper layout so the project's
|
|
68
|
+
* `projects/app/` directory IS the Nuxt app.
|
|
69
|
+
*
|
|
70
|
+
* `lenneTech/nuxt-base-starter` ships a wrapper repo: the root
|
|
71
|
+
* `package.json` is the `create-nuxt-base` scaffolder (a separate npm
|
|
72
|
+
* package — `bin/create-nuxt-base` lives at `index.js`), and the
|
|
73
|
+
* actual Nuxt app lives one level deeper at `nuxt-base-template/`.
|
|
74
|
+
* Without this flatten, the generated monorepo's `pnpm-workspace.yaml`
|
|
75
|
+
* and the README's `cd projects/app && pnpm install && pnpm dev`
|
|
76
|
+
* point at the wrapper, not the app, so `pnpm install` resolves the
|
|
77
|
+
* wrong dependencies and `pnpm dev` has nothing to run
|
|
78
|
+
* (LLM-test 2026-05-03 friction #3 entry 20:30).
|
|
79
|
+
*
|
|
80
|
+
* Defense-in-depth: only mutate the layout if extraction succeeds.
|
|
81
|
+
* If `nuxt-base-template/` is missing or isn't a directory (corrupt
|
|
82
|
+
* clone, future repo reshape that drops the wrapper), we return
|
|
83
|
+
* `{ flattened: false, reason }` and leave the original tree alone.
|
|
84
|
+
* The pre-flatten layout is annoying but functional — better than
|
|
85
|
+
* wiping a user's clone over an unexpected layout.
|
|
86
|
+
*
|
|
87
|
+
* @param dest - The cloned `projects/app/` directory.
|
|
88
|
+
* @returns Whether the flatten ran, plus a reason if it didn't.
|
|
89
|
+
*/
|
|
90
|
+
flattenNuxtBaseTemplate(dest) {
|
|
91
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
92
|
+
const { filesystem } = this.toolbox;
|
|
93
|
+
const subdir = filesystem.path(dest, 'nuxt-base-template');
|
|
94
|
+
if (!filesystem.exists(subdir)) {
|
|
95
|
+
return { flattened: false, reason: 'no nuxt-base-template subdirectory' };
|
|
96
|
+
}
|
|
97
|
+
if (!filesystem.isDirectory(subdir)) {
|
|
98
|
+
// Stray file at the path we'd flatten — abort to avoid clobbering
|
|
99
|
+
// the user's tree on a corrupt clone.
|
|
100
|
+
return { flattened: false, reason: 'nuxt-base-template path exists but is not a directory' };
|
|
101
|
+
}
|
|
102
|
+
// Stage the template into a sibling directory before touching `dest`,
|
|
103
|
+
// so a copy failure leaves the original layout intact.
|
|
104
|
+
const parent = filesystem.path(dest, '..');
|
|
105
|
+
const stage = filesystem.path(parent, `.nuxt-base-template-staging-${Date.now()}-${process.pid}`);
|
|
106
|
+
try {
|
|
107
|
+
filesystem.copy(subdir, stage, { overwrite: true });
|
|
108
|
+
}
|
|
109
|
+
catch (err) {
|
|
110
|
+
// Couldn't stage — leave `dest` untouched and bubble the reason up.
|
|
111
|
+
filesystem.remove(stage);
|
|
112
|
+
return { flattened: false, reason: `failed to stage template: ${err.message}` };
|
|
113
|
+
}
|
|
114
|
+
try {
|
|
115
|
+
// Wipe the cloned root (wrapper package.json, index.js, lock file,
|
|
116
|
+
// README, etc.) and replace it with the staged template contents.
|
|
117
|
+
// gluegun's `filesystem.remove(dest)` removes the directory, so
|
|
118
|
+
// we re-create it before copying back so dotfiles land at the
|
|
119
|
+
// right level.
|
|
120
|
+
filesystem.remove(dest);
|
|
121
|
+
filesystem.dir(dest);
|
|
122
|
+
filesystem.copy(stage, dest, { overwrite: true });
|
|
123
|
+
}
|
|
124
|
+
finally {
|
|
125
|
+
filesystem.remove(stage);
|
|
126
|
+
}
|
|
127
|
+
return { flattened: true };
|
|
128
|
+
});
|
|
129
|
+
}
|
|
66
130
|
/**
|
|
67
131
|
* Setup Nuxt frontend
|
|
68
132
|
* Handles template setup (link/copy/clone) and optional npm install
|
|
@@ -87,6 +151,14 @@ class FrontendHelper {
|
|
|
87
151
|
if (!result.success) {
|
|
88
152
|
return { method: result.method, path: result.path, success: false };
|
|
89
153
|
}
|
|
154
|
+
// After a clone, flatten the wrapper layout so `projects/app/`
|
|
155
|
+
// IS the Nuxt app (the cloned root is the `create-nuxt-base`
|
|
156
|
+
// scaffolder, not the app — see flattenNuxtBaseTemplate).
|
|
157
|
+
// Skip on link mode: a symlink points at the user's local
|
|
158
|
+
// checkout and must not have its template subdir torn out.
|
|
159
|
+
if (result.method === 'clone') {
|
|
160
|
+
yield this.flattenNuxtBaseTemplate(dest);
|
|
161
|
+
}
|
|
90
162
|
// Run install if not skipped and not a symlink
|
|
91
163
|
if (!skipInstall && result.method !== 'link') {
|
|
92
164
|
try {
|
|
@@ -0,0 +1,218 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
// TypeScript interop helper: re-export property `k` of module `m` on object
// `o` (optionally under the name `k2`). Where property descriptors are
// available the export is a live getter unless the source property is already
// a suitable read-only/accessor descriptor, in which case that descriptor is
// reused as-is.
var __createBinding = (this && this.__createBinding) || (function () {
    if (!Object.create) {
        // Legacy engines: plain value copy (no live binding).
        return function (o, m, k, k2) {
            var exportName = k2 === undefined ? k : k2;
            o[exportName] = m[k];
        };
    }
    return function (o, m, k, k2) {
        var exportName = k2 === undefined ? k : k2;
        var descriptor = Object.getOwnPropertyDescriptor(m, k);
        var needsGetter;
        if (!descriptor) {
            needsGetter = true;
        }
        else if ("get" in descriptor) {
            needsGetter = !m.__esModule;
        }
        else {
            needsGetter = descriptor.writable || descriptor.configurable;
        }
        if (needsGetter) {
            descriptor = { enumerable: true, get: function () { return m[k]; } };
        }
        Object.defineProperty(o, exportName, descriptor);
    };
})();
|
|
13
|
+
// TypeScript interop helper: attach `v` as the `default` export of namespace
// object `o`. Uses a property definition (enumerable, read-only) when the
// engine supports it and a plain assignment otherwise.
var __setModuleDefault = (this && this.__setModuleDefault) || (function () {
    if (Object.create) {
        return function (o, v) {
            var descriptor = { enumerable: true, value: v };
            Object.defineProperty(o, "default", descriptor);
        };
    }
    return function (o, v) {
        o["default"] = v;
    };
})();
|
|
18
|
+
// TypeScript interop helper for `import * as ns` / dynamic `import()` of a
// CommonJS module: ES modules are returned unchanged; anything else is
// wrapped in a fresh namespace object that re-exports every own property
// except "default" and exposes the module itself as `default`.
var __importStar = (this && this.__importStar) || (function () {
    // Key lister is chosen lazily on first use, then cached.
    var listOwnKeys = null;
    var ownKeys = function (target) {
        if (listOwnKeys === null) {
            listOwnKeys = Object.getOwnPropertyNames || function (o) {
                var names = [];
                for (var name in o) {
                    if (Object.prototype.hasOwnProperty.call(o, name)) {
                        names.push(name);
                    }
                }
                return names;
            };
        }
        return listOwnKeys(target);
    };
    return function (mod) {
        if (mod && mod.__esModule) {
            return mod;
        }
        var namespace = {};
        if (mod != null) {
            var keys = ownKeys(mod);
            for (var index = 0; index < keys.length; index++) {
                var key = keys[index];
                if (key === "default") {
                    continue;
                }
                __createBinding(namespace, mod, key);
            }
        }
        __setModuleDefault(namespace, mod);
        return namespace;
    };
})();
|
|
35
|
+
// TypeScript down-level async helper: drives a generator function as if it
// were an `async` function and returns a promise for its final value.
//   thisArg / _arguments - receiver and argument list applied to `generator`
//   P                    - promise constructor to use (defaults to Promise)
//   generator            - generator function whose `yield`s act as `await`s
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
    if (!P) {
        P = Promise;
    }
    // Wrap plain values so every yielded value can be chained with `.then`.
    function adopt(candidate) {
        if (candidate instanceof P) {
            return candidate;
        }
        return new P(function (settle) { settle(candidate); });
    }
    return new P(function (resolve, reject) {
        // Feed one iterator result forward: finish, or wait for the yielded value.
        function advance(outcome) {
            if (outcome.done) {
                resolve(outcome.value);
                return;
            }
            adopt(outcome.value).then(onFulfilled, onRejected);
        }
        // Resume the generator with the awaited value.
        function onFulfilled(value) {
            try {
                advance(generator.next(value));
            }
            catch (err) {
                reject(err);
            }
        }
        // Re-throw the rejection inside the generator so its try/catch can see it.
        function onRejected(reason) {
            try {
                advance(generator["throw"](reason));
            }
            catch (err) {
                reject(err);
            }
        }
        generator = generator.apply(thisArg, _arguments || []);
        advance(generator.next());
    });
};
|
|
44
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
45
|
+
exports.getMarkerStatus = getMarkerStatus;
|
|
46
|
+
exports.installMarker = installMarker;
|
|
47
|
+
exports.resolveDevice = resolveDevice;
|
|
48
|
+
exports.runMarker = runMarker;
|
|
49
|
+
/**
|
|
50
|
+
* marker-pdf integration for the lt CLI.
|
|
51
|
+
*
|
|
52
|
+
* Marker (https://github.com/datalab-to/marker) is a PyTorch-based
|
|
53
|
+
* PDF → Markdown converter with first-class layout, table and equation
|
|
54
|
+
* support. On Apple Silicon it leverages Metal Performance Shaders
|
|
55
|
+
* (MPS) for GPU-accelerated inference.
|
|
56
|
+
*
|
|
57
|
+
* The CLI keeps marker in an isolated Python virtualenv under
|
|
58
|
+
* `~/.lt/marker/.venv/` so that:
|
|
59
|
+
* - we do not pollute the user's global Python environment
|
|
60
|
+
* - the ~3 GB of model weights are downloaded only once
|
|
61
|
+
* - subsequent runs start instantly (cached models)
|
|
62
|
+
*/
|
|
63
|
+
const child_process_1 = require("child_process");
|
|
64
|
+
const fs_1 = require("fs");
|
|
65
|
+
const promises_1 = require("fs/promises");
|
|
66
|
+
const os_1 = require("os");
|
|
67
|
+
const path_1 = require("path");
|
|
68
|
+
const util_1 = require("util");
|
|
69
|
+
// Promise-returning variant of child_process.exec, used for short shell probes
// and install commands.
const { promisify: toPromiseApi } = util_1;
const execAsync = toPromiseApi(child_process_1.exec);
// Everything marker-related lives under ~/.lt/marker so the user's global
// Python environment stays untouched.
const { join: joinPath } = path_1;
const MARKER_HOME = joinPath((0, os_1.homedir)(), '.lt', 'marker');
const VENV_DIR = joinPath(MARKER_HOME, '.venv');
const VENV_BIN = joinPath(VENV_DIR, 'bin');
// Executables inside the virtualenv: the interpreter, the single-file
// converter and the batch (directory) converter.
const [VENV_PYTHON, VENV_MARKER_SINGLE, VENV_MARKER_BATCH] = ['python3', 'marker_single', 'marker']
    .map((executable) => joinPath(VENV_BIN, executable));
|
|
76
|
+
/**
 * Report which pieces of the marker toolchain are present.
 *
 * Probes `python3` and `uv` by running their `--version` commands and checks
 * the virtualenv for both marker executables (single-file and batch).
 *
 * @returns `{ installed, pythonAvailable, uvAvailable, venvPath }`; never
 *          rejects because a probe failed — a failed probe just reads `false`.
 */
function getMarkerStatus() {
    return __awaiter(this, void 0, void 0, function* () {
        let pythonAvailable = true;
        try {
            yield execAsync('python3 --version');
        }
        catch (_pythonMissing) {
            // python3 is not on PATH
            pythonAvailable = false;
        }
        let uvAvailable = true;
        try {
            yield execAsync('uv --version');
        }
        catch (_uvMissing) {
            // uv is optional — installation falls back to python -m venv + pip
            uvAvailable = false;
        }
        // "Installed" means both entry points exist inside the venv.
        const installed = [VENV_MARKER_SINGLE, VENV_MARKER_BATCH]
            .every((executable) => (0, fs_1.existsSync)(executable));
        return {
            installed,
            pythonAvailable,
            uvAvailable,
            venvPath: VENV_DIR,
        };
    });
}
|
|
105
|
+
/**
 * Install marker-pdf into ~/.lt/marker/.venv.
 *
 * Preferred path: `uv venv --python 3.12` + `uv pip install marker-pdf psutil`.
 * Fallback: `python3 -m venv` + `pip install`.
 *
 * No-op when marker is already installed.
 *
 * @param opts - Optional settings.
 * @param opts.onProgress - Callback receiving human-readable progress messages.
 * @returns Resolves once both marker executables exist in the virtualenv.
 * @throws Error when python3 is missing, when a setup command fails, or when
 *         the expected executables are absent after installation.
 */
function installMarker() {
    return __awaiter(this, arguments, void 0, function* (opts = {}) {
        const log = opts.onProgress != null ? opts.onProgress : (() => { });
        const status = yield getMarkerStatus();
        if (status.installed) {
            log('marker already installed');
            return;
        }
        if (!status.pythonAvailable) {
            throw new Error('python3 is required but not found in PATH. Install Python 3.10+ (e.g. via Homebrew: `brew install python@3.12`)');
        }
        yield (0, promises_1.mkdir)(MARKER_HOME, { recursive: true });
        const useUv = status.uvAvailable;
        // 1. Create virtualenv
        if (useUv) {
            log('Creating venv with uv (Python 3.12)…');
            yield execAsync(`uv venv --python 3.12 "${VENV_DIR}"`, { cwd: MARKER_HOME });
        }
        else {
            log('Creating venv with python3 (uv not found, falling back)…');
            yield execAsync(`python3 -m venv "${VENV_DIR}"`, { cwd: MARKER_HOME });
        }
        // 2. Install marker-pdf + psutil
        // psutil is needed by the marker batch CLI; it is a soft dep on some
        // marker-pdf releases, so we install it explicitly.
        // We use shell quoting to handle macOS "Library" / spaces in paths.
        const cmd = useUv
            ? `uv pip install --python "${VENV_PYTHON}" marker-pdf psutil`
            : `"${VENV_BIN}/pip" install marker-pdf psutil`;
        log('Installing marker-pdf + dependencies (~3 GB models will download on first run)…');
        // Increase maxBuffer because pip output is large
        yield execAsync(cmd, { cwd: MARKER_HOME, maxBuffer: 100 * 1024 * 1024 });
        // Verify the same two executables getMarkerStatus() requires; checking
        // only one would let install "succeed" while the next run still
        // reports marker as not installed.
        const missing = [VENV_MARKER_SINGLE, VENV_MARKER_BATCH]
            .filter((executable) => !(0, fs_1.existsSync)(executable));
        if (missing.length > 0) {
            throw new Error(`marker installation finished but ${missing.join(', ')} not found`);
        }
        log('marker installed successfully');
    });
}
|
|
150
|
+
/**
 * Pick the TORCH_DEVICE value for this machine.
 *
 * @param requested - Explicit device name, or 'auto' (the default).
 * @returns `requested` unchanged when it is not 'auto'; otherwise 'mps' on
 *          Apple Silicon (darwin/arm64) and 'cpu' everywhere else.
 */
function resolveDevice(requested = 'auto') {
    const wantsAutoDetect = requested === 'auto';
    if (!wantsAutoDetect) {
        return requested;
    }
    const isAppleSilicon = process.platform === 'darwin' && process.arch === 'arm64';
    // No nvidia-smi probe is performed: auto-detection never selects CUDA,
    // so NVIDIA users have to request it explicitly.
    return isAppleSilicon ? 'mps' : 'cpu';
}
|
|
161
|
+
/**
 * Run marker on a single PDF or a directory of PDFs.
 *
 * A directory input uses the batch executable (`marker`) and honours
 * `skipExisting` / `workers`; anything else uses `marker_single`.
 * The child's stdout and stderr are split into lines and forwarded to
 * `opts.onLine` (default: echo to stdout).
 *
 * @param inputPath - PDF file or directory to convert.
 * @param opts - `outputDir` (required), `outputFormat` (default 'markdown'),
 *               `disableImages`, `skipExisting`, `workers`, `device`, `onLine`.
 * @returns Resolves to `{ exitCode }` when the child exits. A child that was
 *          terminated by a signal is reported with exit code 1.
 * @throws Error when marker is not installed; the promise also rejects when
 *         the executable cannot be spawned.
 */
function runMarker(inputPath, opts) {
    return __awaiter(this, void 0, void 0, function* () {
        const status = yield getMarkerStatus();
        if (!status.installed) {
            throw new Error('marker is not installed. Run `lt tools ocr --install` first.');
        }
        const isDir = (0, fs_1.existsSync)(inputPath) && (0, fs_1.statSync)(inputPath).isDirectory();
        const bin = isDir ? VENV_MARKER_BATCH : VENV_MARKER_SINGLE;
        const args = [
            inputPath,
            '--output_dir', opts.outputDir,
            '--output_format', opts.outputFormat != null ? opts.outputFormat : 'markdown',
        ];
        if (opts.disableImages)
            args.push('--disable_image_extraction');
        if (isDir) {
            // Batch-only flags
            if (opts.skipExisting)
                args.push('--skip_existing');
            if (opts.workers && opts.workers > 0)
                args.push('--workers', String(opts.workers));
        }
        const device = resolveDevice(opts.device);
        return new Promise((resolve, reject) => {
            const proc = (0, child_process_1.spawn)(bin, args, {
                env: Object.assign(Object.assign({}, process.env), { TORCH_DEVICE: device }),
                stdio: ['ignore', 'pipe', 'pipe'],
            });
            const onLine = opts.onLine != null ? opts.onLine : ((l) => process.stdout.write(`${l}\n`));
            const handleStream = (stream) => {
                // Decode in the stream so multi-byte characters split across
                // chunks are not corrupted.
                stream.setEncoding('utf8');
                let buf = '';
                stream.on('data', (chunk) => {
                    buf += chunk;
                    // Bare "\r" is a line break too: progress bars redraw with
                    // carriage returns and would otherwise never be emitted.
                    const lines = buf.split(/\r\n|\r|\n/);
                    buf = lines.pop();
                    for (const line of lines)
                        if (line)
                            onLine(line);
                });
                stream.on('end', () => {
                    // Flush output that was not newline-terminated (often the
                    // final error message).
                    if (buf)
                        onLine(buf);
                    buf = '';
                });
            };
            handleStream(proc.stdout);
            handleStream(proc.stderr);
            proc.on('close', (code, signal) => {
                if (code === null || code === undefined) {
                    // Killed by a signal (e.g. out-of-memory kill): this is a
                    // failure, not a clean exit.
                    if (signal)
                        onLine(`marker terminated by signal ${signal}`);
                    resolve({ exitCode: 1 });
                    return;
                }
                resolve({ exitCode: code });
            });
            proc.on('error', reject);
        });
    });
}
|
package/docs/commands.md
CHANGED
|
@@ -1463,6 +1463,53 @@ lt tools crawl https://lenne.tech --all --noConfirm
|
|
|
1463
1463
|
lt tools crawl https://example.com --all --no-render --no-prune --noConfirm
|
|
1464
1464
|
```
|
|
1465
1465
|
|
|
1466
|
+
### `lt tools ocr`
|
|
1467
|
+
|
|
1468
|
+
Converts PDFs to clean Markdown using [marker-pdf](https://github.com/datalab-to/marker) — a PyTorch-based, layout-aware OCR engine that produces real Markdown tables, headings and lists. On Apple Silicon (M-series) inference runs on the GPU via Metal Performance Shaders (MPS) and is typically 5–15× faster than CPU-only PDF text extractors. Marker is auto-installed into an isolated virtualenv at `~/.lt/marker/.venv/` on first use; subsequent runs reuse the cached environment and ~3 GB of model weights.
|
|
1469
|
+
|
|
1470
|
+
**Aliases:** `ocr`, `pdf2md`
|
|
1471
|
+
|
|
1472
|
+
**Usage:**
|
|
1473
|
+
```bash
|
|
1474
|
+
lt tools ocr <file.pdf|directory> [options]
|
|
1475
|
+
lt tools ocr --status # Show installation status
|
|
1476
|
+
lt tools ocr --install # Install marker-pdf without converting anything
|
|
1477
|
+
```
|
|
1478
|
+
|
|
1479
|
+
**Options:**
|
|
1480
|
+
- `--output-dir <dir>` — Output directory (default: `<input>-MD/` for batch, `<input>.md-out/` for single).
|
|
1481
|
+
- `--workers <n>` — Parallel worker processes for batch mode (default `3`).
|
|
1482
|
+
- `--device <auto|mps|cuda|cpu>` — Override `TORCH_DEVICE`. Default `auto` picks `mps` on Apple Silicon, `cpu` elsewhere. Set `cuda` if running on a Linux machine with an NVIDIA GPU and the appropriate PyTorch CUDA build.
|
|
1483
|
+
- `--skip-existing` / `--no-skip-existing` — Skip already-converted files in batch mode (default **on**).
|
|
1484
|
+
- `--keep-images` — Extract embedded images alongside the Markdown (default **off** — Markdown only).
|
|
1485
|
+
- `--format <markdown|json|html|chunks>` — Output format (default `markdown`).
|
|
1486
|
+
|
|
1487
|
+
**Setup notes:**
|
|
1488
|
+
- Requires `python3` (≥ 3.10) on PATH.
|
|
1489
|
+
- Uses `uv` if available (fastest install path); falls back to `python3 -m venv` + `pip` otherwise.
|
|
1490
|
+
- The first conversion is slower because the model weights download (~3 GB). Subsequent runs start instantly.
|
|
1491
|
+
- Apple Silicon: `device: mps` is auto-selected. Linux/CUDA: pass `--device cuda`.
|
|
1492
|
+
|
|
1493
|
+
**Examples:**
|
|
1494
|
+
```bash
|
|
1495
|
+
# Inspect tooling status (python3, uv, venv path, auto-detected device)
|
|
1496
|
+
lt tools ocr --status
|
|
1497
|
+
|
|
1498
|
+
# One-time install (skip if you just want to convert and let auto-install handle it)
|
|
1499
|
+
lt tools ocr --install
|
|
1500
|
+
|
|
1501
|
+
# Convert a single PDF (creates ./report.pdf.md-out/report/report.md)
|
|
1502
|
+
lt tools ocr ./report.pdf
|
|
1503
|
+
|
|
1504
|
+
# Batch a directory with 4 parallel workers
|
|
1505
|
+
lt tools ocr ./pdfs --output-dir ./md --workers 4
|
|
1506
|
+
|
|
1507
|
+
# Force CPU mode (e.g. when MPS-related crashes occur on Sonoma)
|
|
1508
|
+
lt tools ocr ./report.pdf --device cpu
|
|
1509
|
+
```
|
|
1510
|
+
|
|
1511
|
+
**When to reach for this command vs. the lt-knowledge ingest pipeline:** `lt tools ocr` is for **local developer workflows** — quick PDF → Markdown for research, demos, validation sets, sanity checks. For production ingestion (Vector / Graph / Wiki layers, confidence-based fallback, Whisper for audio, archive-aware processing) use the lt-knowledge stack with its Docling + LightOnOCR sidecars. Marker is intentionally **not** added there because its MPS advantage doesn't apply in Linux containers and Docling already covers the same use cases with native confidence scoring (see `lt-knowledge/docs/OCR-COMPARISON-MARKER.md`).
|
|
1512
|
+
|
|
1466
1513
|
---
|
|
1467
1514
|
|
|
1468
1515
|
## Configuration Priority
|