@lenne.tech/cli 1.17.0 → 1.19.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/bin/lt CHANGED
@@ -155,8 +155,12 @@ function runCLI() {
155
155
  require(`${__dirname}/../build/cli`).run(process.argv);
156
156
  } else {
157
157
  // this runs from the typescript source (for dev only)
158
- // hook into ts-node so we can run typescript on the fly
159
- require('ts-node').register({ project: `${__dirname}/../tsconfig.json` });
158
+ // hook into ts-node so we can run typescript on the fly. Use
159
+ // `transpileOnly` to skip ts-node's runtime type-checking pass —
160
+ // that pass roughly triples startup time (~1.2 s → ~4 s) without catching
161
+ // anything new, since `npm run lint` (eslint) and `npm run
162
+ // compile` (tsc) already type-check the project before publish.
163
+ require('ts-node').register({ project: `${__dirname}/../tsconfig.json`, transpileOnly: true });
160
164
  // run the CLI with the current process arguments
161
165
  require(`${__dirname}/../src/cli`).run(process.argv);
162
166
  }
@@ -0,0 +1,166 @@
1
+ "use strict";
2
// Generator-to-promise driver used by the `function*` bodies in this file.
// Reuses `this.__awaiter` when one is already defined; otherwise installs the
// implementation below. Looks like the TypeScript compiler's standard
// downlevel async helper — confirm before hand-editing.
//
// thisArg / _arguments: receiver and argument list the generator is invoked with.
// P: promise constructor to use; falls back to the global Promise when falsy.
// generator: generator function whose yielded values are awaited.
//
// Returns a P that resolves with the generator's final value and rejects when
// the generator throws. Each yielded value is wrapped in a P (`adopt`); its
// fulfilment is fed back through `generator.next`, its rejection through
// `generator.throw`.
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
3
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
4
+ return new (P || (P = Promise))(function (resolve, reject) {
5
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
6
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
7
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
8
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
9
+ });
10
+ };
11
+ Object.defineProperty(exports, "__esModule", { value: true });
12
+ const fs_1 = require("fs");
13
+ const path_1 = require("path");
14
+ const marker_1 = require("../../lib/marker");
15
/**
 * OCR command: convert a PDF (or a directory of PDFs) to Markdown using
 * marker-pdf.
 *
 * Marker is kept in `~/.lt/marker/.venv/`; it is installed automatically
 * the first time a conversion is requested (~3 GB model download).
 *
 * Modes, checked in this order:
 *   --status   print the result of `getMarkerStatus()` and the auto-detected device
 *   --install  run `installMarker()` and stop
 *   <path>     convert the file/directory, installing marker first if needed
 *
 * Every exit path goes through `finish(code)`: when the command was not
 * started from the gluegun menu (`options.fromGluegunMenu` unset) the
 * process exits with `code`; otherwise the command name `'ocr'` is
 * returned to the menu.
 *
 * Examples:
 *   lt tools ocr ./report.pdf
 *   lt tools ocr ./pdfs --output-dir ./md --workers 4
 *   lt tools ocr --install
 *   lt tools ocr --status
 */
const NewCommand = {
    alias: ['ocr', 'pdf2md'],
    description: 'OCR PDFs to Markdown via marker-pdf (MPS-accelerated on Apple Silicon)',
    hidden: false,
    name: 'ocr',
    run: (toolbox) => __awaiter(void 0, void 0, void 0, function* () {
        const { parameters, print: { error, info, spin, warning }, } = toolbox;
        const { options } = parameters;
        const DEFAULT_WORKERS = 3;
        // Single exit point: terminate the process in direct CLI use, hand
        // control back to the menu otherwise.
        const finish = (exitCode) => {
            if (!options.fromGluegunMenu)
                process.exit(exitCode);
            return 'ocr';
        };
        // Thrown values are not guaranteed to be Error instances; fall back
        // to the value itself instead of printing "undefined".
        const describeError = (err) => String(err && err.message !== undefined ? err.message : err);
        // First option (by name) that is neither null nor undefined.
        const pickOption = (...names) => {
            for (const name of names) {
                if (options[name] !== null && options[name] !== undefined)
                    return options[name];
            }
            return undefined;
        };
        // Runs the installer behind a spinner; resolves to true on success.
        const install = (label) => __awaiter(void 0, void 0, void 0, function* () {
            const installSpinner = spin(label);
            try {
                yield (0, marker_1.installMarker)({
                    onProgress: (msg) => {
                        installSpinner.text = msg;
                    },
                });
                installSpinner.succeed('marker-pdf installed');
                return true;
            }
            catch (err) {
                installSpinner.fail('Installation failed');
                error(describeError(err));
                return false;
            }
        });
        // Status mode
        if (options.status) {
            const status = yield (0, marker_1.getMarkerStatus)();
            const device = (0, marker_1.resolveDevice)('auto');
            info('marker-pdf status:');
            info(` installed: ${status.installed ? 'yes' : 'no'}`);
            info(` python3: ${status.pythonAvailable ? 'yes' : 'no'}`);
            info(` uv: ${status.uvAvailable ? 'yes' : 'no'}`);
            info(` venv: ${status.venvPath}`);
            info(` device: ${device} (auto-detected)`);
            return finish(0);
        }
        // Install-only mode
        if (options.install) {
            const installed = yield install('Installing marker-pdf …');
            return finish(installed ? 0 : 1);
        }
        // Normal run: need an input path
        const inputArg = parameters.first;
        if (!inputArg) {
            error('Missing input path. Usage:');
            info(' lt tools ocr <file.pdf|directory> Convert PDFs to Markdown');
            info(' lt tools ocr --install Install marker-pdf locally');
            info(' lt tools ocr --status Show installation status');
            info('');
            info('Options:');
            info(' --output-dir <dir> Output directory (default: <input>-MD/)');
            info(' --workers <n> Parallel workers for batch mode (default: 3)');
            info(' --device <auto|mps|cuda|cpu> Override TORCH_DEVICE (default: auto)');
            info(' --skip-existing Skip already-converted files (batch mode)');
            info(' --keep-images Extract embedded images (default: off)');
            info(' --format <markdown|json|html|chunks> Output format (default: markdown)');
            return finish(1);
        }
        const inputPath = (0, path_1.resolve)(process.cwd(), inputArg);
        if (!(0, fs_1.existsSync)(inputPath)) {
            error(`Input not found: ${inputPath}`);
            return finish(1);
        }
        // Auto-install if needed
        const status = yield (0, marker_1.getMarkerStatus)();
        if (!status.installed) {
            warning('marker-pdf not yet installed — running first-time setup …');
            const installed = yield install('Installing marker-pdf (one-time, ~3 GB model download) …');
            if (!installed)
                return finish(1);
        }
        // Resolve options. Dashed and camelCase spellings are both accepted.
        const isDir = (0, fs_1.statSync)(inputPath).isDirectory();
        const defaultOutput = isDir ? `${inputPath}-MD` : `${inputPath}.md-out`;
        const requestedOutput = pickOption('output-dir', 'outputDir');
        const outputDir = (0, path_1.resolve)(process.cwd(), String(requestedOutput !== undefined ? requestedOutput : defaultOutput));
        const requestedWorkers = pickOption('workers');
        let workers = requestedWorkers === undefined ? DEFAULT_WORKERS : Number(requestedWorkers);
        if (!Number.isInteger(workers) || workers < 1) {
            // Previously an unparsable value surfaced as "workers: NaN".
            warning(`Ignoring invalid --workers value "${requestedWorkers}"; using ${DEFAULT_WORKERS}`);
            workers = DEFAULT_WORKERS;
        }
        const skipExisting = pickOption('skip-existing', 'skipExisting') !== false; // default: true
        const keepImages = !!pickOption('keep-images', 'keepImages');
        const requestedFormat = pickOption('format');
        const format = String(requestedFormat !== undefined ? requestedFormat : 'markdown');
        // Only a string can name a device; anything else (e.g. a flag given
        // without a value) falls back to auto-detection.
        const requestedDevice = pickOption('device');
        const device = typeof requestedDevice === 'string' ? requestedDevice : 'auto';
        info(`OCR ${isDir ? 'batch' : 'single'} → ${outputDir}`);
        info(` device: ${(0, marker_1.resolveDevice)(device)}`);
        if (isDir)
            info(` workers: ${workers}, skip-existing: ${skipExisting}`);
        info('');
        const runSpinner = spin('Converting (may take a while on first run while models load)…');
        let lastLine = '';
        let result;
        try {
            result = yield (0, marker_1.runMarker)(inputPath, {
                device,
                disableImages: !keepImages,
                onLine: (line) => {
                    // Forward marker output to spinner text (last line) so the user sees progress
                    if (line.trim()) {
                        lastLine = line.replace(/\s+/g, ' ').trim().slice(-160);
                        runSpinner.text = lastLine;
                    }
                },
                outputDir,
                outputFormat: format,
                skipExisting,
                workers,
            });
        }
        catch (err) {
            // runMarker rejects when marker is missing or the process cannot
            // be spawned; stop the spinner instead of leaking the rejection.
            runSpinner.fail(`Failed to run marker: ${describeError(err)}`);
            return finish(1);
        }
        if (result.exitCode !== 0) {
            runSpinner.fail(`marker exited with code ${result.exitCode}: ${lastLine}`);
            return finish(result.exitCode);
        }
        runSpinner.succeed(`Done — output in ${outputDir}`);
        return finish(0);
    }),
};
exports.default = NewCommand;
@@ -63,6 +63,70 @@ class FrontendHelper {
63
63
  content = content.replace(/^(NUXT_PUBLIC_STORAGE_PREFIX=).*$/m, `$1${projectName}-local`);
64
64
  filesystem.write(envPath, content);
65
65
  }
66
+ /**
67
+ * Flatten the cloned nuxt-base-starter wrapper layout so the project's
68
+ * `projects/app/` directory IS the Nuxt app.
69
+ *
70
+ * `lenneTech/nuxt-base-starter` ships a wrapper repo: the root
71
+ * `package.json` is the `create-nuxt-base` scaffolder (a separate npm
72
+ * package — `bin/create-nuxt-base` lives at `index.js`), and the
73
+ * actual Nuxt app lives one level deeper at `nuxt-base-template/`.
74
+ * Without this flatten, the generated monorepo's `pnpm-workspace.yaml`
75
+ * and the README's `cd projects/app && pnpm install && pnpm dev`
76
+ * point at the wrapper, not the app, so `pnpm install` resolves the
77
+ * wrong dependencies and `pnpm dev` has nothing to run
78
+ * (LLM-test 2026-05-03 friction #3 entry 20:30).
79
+ *
80
+ * Defense-in-depth: only mutate the layout if extraction succeeds.
81
+ * If `nuxt-base-template/` is missing or isn't a directory (corrupt
82
+ * clone, future repo reshape that drops the wrapper), we return
83
+ * `{ flattened: false, reason }` and leave the original tree alone.
84
+ * The pre-flatten layout is annoying but functional — better than
85
+ * wiping a user's clone over an unexpected layout.
86
+ *
87
+ * @param dest - The cloned `projects/app/` directory.
88
+ * @returns Whether the flatten ran, plus a reason if it didn't.
89
+ */
90
+ flattenNuxtBaseTemplate(dest) {
91
+ return __awaiter(this, void 0, void 0, function* () {
92
+ const { filesystem } = this.toolbox;
93
+ const subdir = filesystem.path(dest, 'nuxt-base-template');
94
+ if (!filesystem.exists(subdir)) {
95
+ return { flattened: false, reason: 'no nuxt-base-template subdirectory' };
96
+ }
97
+ if (!filesystem.isDirectory(subdir)) {
98
+ // Stray file at the path we'd flatten — abort to avoid clobbering
99
+ // the user's tree on a corrupt clone.
100
+ return { flattened: false, reason: 'nuxt-base-template path exists but is not a directory' };
101
+ }
102
+ // Stage the template into a sibling directory before touching `dest`,
103
+ // so a copy failure leaves the original layout intact.
104
+ const parent = filesystem.path(dest, '..');
105
+ const stage = filesystem.path(parent, `.nuxt-base-template-staging-${Date.now()}-${process.pid}`);
106
+ try {
107
+ filesystem.copy(subdir, stage, { overwrite: true });
108
+ }
109
+ catch (err) {
110
+ // Couldn't stage — leave `dest` untouched and bubble the reason up.
111
+ filesystem.remove(stage);
112
+ return { flattened: false, reason: `failed to stage template: ${err.message}` };
113
+ }
114
+ try {
115
+ // Wipe the cloned root (wrapper package.json, index.js, lock file,
116
+ // README, etc.) and replace it with the staged template contents.
117
+ // gluegun's `filesystem.remove(dest)` removes the directory, so
118
+ // we re-create it before copying back so dotfiles land at the
119
+ // right level.
120
+ filesystem.remove(dest);
121
+ filesystem.dir(dest);
122
+ filesystem.copy(stage, dest, { overwrite: true });
123
+ }
124
+ finally {
125
+ filesystem.remove(stage);
126
+ }
127
+ return { flattened: true };
128
+ });
129
+ }
66
130
  /**
67
131
  * Setup Nuxt frontend
68
132
  * Handles template setup (link/copy/clone) and optional npm install
@@ -87,6 +151,14 @@ class FrontendHelper {
87
151
  if (!result.success) {
88
152
  return { method: result.method, path: result.path, success: false };
89
153
  }
154
+ // After a clone, flatten the wrapper layout so `projects/app/`
155
+ // IS the Nuxt app (the cloned root is the `create-nuxt-base`
156
+ // scaffolder, not the app — see flattenNuxtBaseTemplate).
157
+ // Skip on link mode: a symlink points at the user's local
158
+ // checkout and must not have its template subdir torn out.
159
+ if (result.method === 'clone') {
160
+ yield this.flattenNuxtBaseTemplate(dest);
161
+ }
90
162
  // Run install if not skipped and not a symlink
91
163
  if (!skipInstall && result.method !== 'link') {
92
164
  try {
@@ -0,0 +1,218 @@
1
+ "use strict";
2
// Module-interop helper: exposes property `k` of module object `m` on object
// `o` under the name `k2` (`k2` defaults to `k`). Reuses
// `this.__createBinding` when one is already defined.
//
// When `Object.create` exists, the property is defined with `m`'s own
// descriptor for `k`, except that the descriptor is replaced by an enumerable
// getter returning `m[k]` when: there is no own descriptor; or it is an
// accessor and `m.__esModule` is falsy; or it is a data property that is
// writable or configurable. Without `Object.create`, the value is copied once
// with a plain assignment.
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
3
+ if (k2 === undefined) k2 = k;
4
+ var desc = Object.getOwnPropertyDescriptor(m, k);
5
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
6
+ desc = { enumerable: true, get: function() { return m[k]; } };
7
+ }
8
+ Object.defineProperty(o, k2, desc);
9
+ }) : (function(o, m, k, k2) {
10
+ if (k2 === undefined) k2 = k;
11
+ o[k2] = m[k];
12
+ }));
13
// Module-interop helper: stores `v` as the `default` property of `o`.
// Reuses `this.__setModuleDefault` when one is already defined. With
// `Object.create` available the property is defined as enumerable via
// `Object.defineProperty`; otherwise it is assigned directly.
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
14
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
15
+ }) : function(o, v) {
16
+ o["default"] = v;
17
+ });
18
// Module-interop helper: turns a `require`d value into a namespace-style
// object. Reuses `this.__importStar` when one is already defined.
//
// The returned function gives back `mod` unchanged when `mod.__esModule` is
// truthy. Otherwise it builds a fresh object, binds every own property of
// `mod` except "default" onto it through `__createBinding`, and sets `mod`
// itself as the `default` property through `__setModuleDefault`.
//
// `ownKeys` replaces itself on first call with `Object.getOwnPropertyNames`,
// or with a `for...in` + `hasOwnProperty` fallback when that is unavailable.
+ var __importStar = (this && this.__importStar) || (function () {
19
+ var ownKeys = function(o) {
20
+ ownKeys = Object.getOwnPropertyNames || function (o) {
21
+ var ar = [];
22
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
23
+ return ar;
24
+ };
25
+ return ownKeys(o);
26
+ };
27
+ return function (mod) {
28
+ if (mod && mod.__esModule) return mod;
29
+ var result = {};
30
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
31
+ __setModuleDefault(result, mod);
32
+ return result;
33
+ };
34
+ })();
35
// Generator-to-promise driver used by the `function*` bodies in this module.
// Reuses `this.__awaiter` when one is already defined; otherwise installs the
// implementation below. Looks like the TypeScript compiler's standard
// downlevel async helper — confirm before hand-editing.
//
// thisArg / _arguments: receiver and argument list the generator is invoked with.
// P: promise constructor to use; falls back to the global Promise when falsy.
// generator: generator function whose yielded values are awaited.
//
// Returns a P that resolves with the generator's final value and rejects when
// the generator throws. Each yielded value is wrapped in a P (`adopt`); its
// fulfilment is fed back through `generator.next`, its rejection through
// `generator.throw`.
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
36
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
37
+ return new (P || (P = Promise))(function (resolve, reject) {
38
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
39
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
40
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
41
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
42
+ });
43
+ };
44
+ Object.defineProperty(exports, "__esModule", { value: true });
45
+ exports.getMarkerStatus = getMarkerStatus;
46
+ exports.installMarker = installMarker;
47
+ exports.resolveDevice = resolveDevice;
48
+ exports.runMarker = runMarker;
49
+ /**
50
+ * marker-pdf integration for the lt CLI.
51
+ *
52
+ * Marker (https://github.com/datalab-to/marker) is a PyTorch-based
53
+ * PDF → Markdown converter with first-class layout, table and equation
54
+ * support. On Apple Silicon it leverages Metal Performance Shaders
55
+ * (MPS) for GPU-accelerated inference.
56
+ *
57
+ * The CLI keeps marker in an isolated Python virtualenv under
58
+ * `~/.lt/marker/.venv/` so that:
59
+ * - we do not pollute the user's global Python environment
60
+ * - the ~3 GB of model weights are downloaded only once
61
+ * - subsequent runs start instantly (cached models)
62
+ */
63
+ const child_process_1 = require("child_process");
64
+ const fs_1 = require("fs");
65
+ const promises_1 = require("fs/promises");
66
+ const os_1 = require("os");
67
+ const path_1 = require("path");
68
+ const util_1 = require("util");
69
// Promise-returning variant of `child_process.exec`.
const execAsync = util_1.promisify(child_process_1.exec);
// Everything marker-related lives under ~/.lt/marker.
const MARKER_HOME = path_1.join(os_1.homedir(), '.lt', 'marker');
// Isolated virtualenv and its executables directory.
const VENV_DIR = path_1.join(MARKER_HOME, '.venv');
const VENV_BIN = path_1.join(VENV_DIR, 'bin');
// Interpreter plus the two marker entry points (single file / batch).
const [VENV_PYTHON, VENV_MARKER_SINGLE, VENV_MARKER_BATCH] = ['python3', 'marker_single', 'marker']
    .map((executable) => path_1.join(VENV_BIN, executable));
76
/**
 * Detect tool availability.
 *
 * Probes `python3 --version` and `uv --version` concurrently (the two
 * checks are independent) and reports marker as installed only when both
 * the single-file and the batch entry point exist in the virtualenv.
 *
 * Never rejects because a tool is missing: a failing probe simply yields
 * `false` for that tool.
 *
 * @returns `{ installed, pythonAvailable, uvAvailable, venvPath }`
 */
function getMarkerStatus() {
    return __awaiter(this, void 0, void 0, function* () {
        // Resolves to true when `<command> --version` exits successfully.
        const isAvailable = (command) => execAsync(`${command} --version`).then(() => true, () => false);
        const [pythonAvailable, uvAvailable] = yield Promise.all([
            isAvailable('python3'),
            // uv is optional — installMarker falls back to python -m venv + pip
            isAvailable('uv'),
        ]);
        return {
            installed: (0, fs_1.existsSync)(VENV_MARKER_SINGLE) && (0, fs_1.existsSync)(VENV_MARKER_BATCH),
            pythonAvailable,
            uvAvailable,
            venvPath: VENV_DIR,
        };
    });
}
105
/**
 * Install marker-pdf into ~/.lt/marker/.venv.
 *
 * Preferred path: `uv venv --python 3.12` + `uv pip install marker-pdf psutil`.
 * Fallback: `python3 -m venv` + `pip install`.
 *
 * Does nothing (beyond a progress message) when `getMarkerStatus()`
 * already reports marker as installed.
 *
 * @param opts.onProgress - Optional callback receiving human-readable
 *   progress messages.
 * @throws Error when python3 is not on PATH, when a setup command fails,
 *   or when any marker entry point is still missing afterwards.
 */
function installMarker() {
    return __awaiter(this, arguments, void 0, function* (opts = {}) {
        const log = opts.onProgress !== null && opts.onProgress !== undefined ? opts.onProgress : (() => { });
        const status = yield getMarkerStatus();
        if (status.installed) {
            log('marker already installed');
            return;
        }
        if (!status.pythonAvailable) {
            throw new Error('python3 is required but not found in PATH. Install Python 3.10+ (e.g. via Homebrew: `brew install python@3.12`)');
        }
        yield (0, promises_1.mkdir)(MARKER_HOME, { recursive: true });
        const useUv = status.uvAvailable;
        // 1. Create virtualenv
        if (useUv) {
            log('Creating venv with uv (Python 3.12)…');
            yield execAsync(`uv venv --python 3.12 "${VENV_DIR}"`, { cwd: MARKER_HOME });
        }
        else {
            log('Creating venv with python3 (uv not found, falling back)…');
            yield execAsync(`python3 -m venv "${VENV_DIR}"`, { cwd: MARKER_HOME });
        }
        // 2. Install marker-pdf + psutil
        // psutil is needed by the marker batch CLI; it is a soft dep on some
        // marker-pdf releases, so we install it explicitly.
        // Paths are double-quoted so directories containing spaces survive
        // the shell.
        const cmd = useUv
            ? `uv pip install --python "${VENV_PYTHON}" marker-pdf psutil`
            : `"${VENV_BIN}/pip" install marker-pdf psutil`;
        log('Installing marker-pdf + dependencies (~3 GB models will download on first run)…');
        // Increase maxBuffer because pip output is large
        yield execAsync(cmd, { cwd: MARKER_HOME, maxBuffer: 100 * 1024 * 1024 });
        // Verify the same two entry points getMarkerStatus() requires, so a
        // "successful" install can never still count as "not installed".
        const missing = [VENV_MARKER_SINGLE, VENV_MARKER_BATCH].filter((entryPoint) => !(0, fs_1.existsSync)(entryPoint));
        if (missing.length > 0) {
            throw new Error(`marker installation finished but ${missing.join(', ')} not found`);
        }
        log('marker installed successfully');
    });
}
150
/**
 * Decide the TORCH_DEVICE value for this machine.
 *
 * An explicit device name is returned trimmed and lower-cased. `'auto'`,
 * an empty string, or any non-string value (for example `null`, or a
 * boolean coming from a flag passed without a value) triggers
 * auto-detection: `'mps'` on darwin/arm64, `'cpu'` everywhere else.
 *
 * @param requested - Device name or `'auto'` (default).
 * @returns The device string to export as TORCH_DEVICE.
 */
function resolveDevice(requested = 'auto') {
    // Anything that is not a usable name must not leak into TORCH_DEVICE
    // as "null" / "true" / "".
    const explicit = typeof requested === 'string' ? requested.trim().toLowerCase() : '';
    if (explicit !== '' && explicit !== 'auto')
        return explicit;
    if (process.platform === 'darwin' && process.arch === 'arm64')
        return 'mps';
    // No nvidia-smi probe here: CUDA is never auto-selected, callers opt in
    // by requesting 'cuda' explicitly.
    return 'cpu';
}
161
/**
 * Run marker on a single PDF or a directory of PDFs.
 *
 * A directory is handed to the batch entry point (`marker`), anything
 * else to `marker_single`. Output from both stdout and stderr is split
 * into lines and forwarded to `opts.onLine` (default: echo to stdout).
 *
 * @param inputPath - PDF file or directory of PDFs.
 * @param opts.outputDir - Passed as `--output_dir`.
 * @param opts.outputFormat - Passed as `--output_format` (default `markdown`).
 * @param opts.disableImages - Adds `--disable_image_extraction`.
 * @param opts.skipExisting - Batch only: adds `--skip_existing`.
 * @param opts.workers - Batch only: adds `--workers <n>` when positive.
 * @param opts.device - Forwarded to `resolveDevice()` for TORCH_DEVICE.
 * @param opts.onLine - Callback invoked once per non-empty output line.
 * @returns `{ exitCode }`; non-zero when the process was killed by a signal
 *   (128 + signal number when the number is known, otherwise 1).
 * @throws Error when marker is not installed; rejects when the process
 *   cannot be spawned.
 */
function runMarker(inputPath, opts) {
    return __awaiter(this, void 0, void 0, function* () {
        const status = yield getMarkerStatus();
        if (!status.installed) {
            throw new Error('marker is not installed. Run `lt tools ocr --install` first.');
        }
        const isDir = (0, fs_1.existsSync)(inputPath) && (0, fs_1.statSync)(inputPath).isDirectory();
        const bin = isDir ? VENV_MARKER_BATCH : VENV_MARKER_SINGLE;
        const outputFormat = opts.outputFormat !== null && opts.outputFormat !== undefined ? opts.outputFormat : 'markdown';
        const args = [inputPath, '--output_dir', opts.outputDir, '--output_format', outputFormat];
        if (opts.disableImages)
            args.push('--disable_image_extraction');
        if (isDir) {
            if (opts.skipExisting)
                args.push('--skip_existing');
            if (opts.workers && opts.workers > 0)
                args.push('--workers', String(opts.workers));
        }
        const device = resolveDevice(opts.device);
        const onLine = opts.onLine !== null && opts.onLine !== undefined
            ? opts.onLine
            : ((l) => process.stdout.write(`${l}\n`));
        return new Promise((resolve, reject) => {
            const proc = (0, child_process_1.spawn)(bin, args, {
                env: Object.assign(Object.assign({}, process.env), { TORCH_DEVICE: device }),
                stdio: ['ignore', 'pipe', 'pipe'],
            });
            const forwardLines = (stream) => {
                let pending = '';
                // Decode inside the stream so a multi-byte character split
                // across two chunks is not corrupted.
                stream.setEncoding('utf8');
                stream.on('data', (chunk) => {
                    pending += chunk;
                    // A bare "\r" also ends a line, so carriage-return
                    // progress updates are forwarded as they arrive instead
                    // of piling up until the next newline.
                    const lines = pending.split(/\r\n|\n|\r/);
                    pending = lines.pop();
                    for (const line of lines)
                        if (line)
                            onLine(line);
                });
                // Flush a final line that was not newline-terminated.
                stream.on('end', () => {
                    if (pending)
                        onLine(pending);
                    pending = '';
                });
            };
            forwardLines(proc.stdout);
            forwardLines(proc.stderr);
            proc.on('close', (code, signal) => {
                if (code !== null && code !== undefined) {
                    resolve({ exitCode: code });
                    return;
                }
                // `code` is null when the child was terminated by a signal;
                // that must not be reported as success.
                const signalNumber = signal ? os_1.constants.signals[signal] : undefined;
                resolve({ exitCode: signalNumber ? 128 + signalNumber : 1 });
            });
            proc.on('error', reject);
        });
    });
}
package/docs/commands.md CHANGED
@@ -1463,6 +1463,53 @@ lt tools crawl https://lenne.tech --all --noConfirm
1463
1463
  lt tools crawl https://example.com --all --no-render --no-prune --noConfirm
1464
1464
  ```
1465
1465
 
1466
+ ### `lt tools ocr`
1467
+
1468
+ Converts PDFs to clean Markdown using [marker-pdf](https://github.com/datalab-to/marker) — a PyTorch-based, layout-aware OCR engine that produces real Markdown tables, headings and lists. On Apple Silicon (M-series) inference runs on the GPU via Metal Performance Shaders (MPS) and is typically 5–15× faster than CPU-only PDF text extractors. Marker is auto-installed into an isolated virtualenv at `~/.lt/marker/.venv/` on first use; subsequent runs reuse the cached environment and ~3 GB of model weights.
1469
+
1470
+ **Aliases:** `ocr`, `pdf2md`
1471
+
1472
+ **Usage:**
1473
+ ```bash
1474
+ lt tools ocr <file.pdf|directory> [options]
1475
+ lt tools ocr --status # Show installation status
1476
+ lt tools ocr --install # Install marker-pdf without converting anything
1477
+ ```
1478
+
1479
+ **Options:**
1480
+ - `--output-dir <dir>` — Output directory (default: `<input>-MD/` for batch, `<input>.md-out/` for single).
1481
+ - `--workers <n>` — Parallel worker processes for batch mode (default `3`).
1482
+ - `--device <auto|mps|cuda|cpu>` — Override `TORCH_DEVICE`. Default `auto` picks `mps` on Apple Silicon, `cpu` elsewhere. Set `cuda` if running on a Linux machine with an NVIDIA GPU and the appropriate PyTorch CUDA build.
1483
+ - `--skip-existing` / `--no-skip-existing` — Skip already-converted files in batch mode (default **on**).
1484
+ - `--keep-images` — Extract embedded images alongside the Markdown (default **off** — Markdown only).
1485
+ - `--format <markdown|json|html|chunks>` — Output format (default `markdown`).
1486
+
1487
+ **Setup notes:**
1488
+ - Requires `python3` (≥ 3.10) on PATH.
1489
+ - Uses `uv` if available (fastest install path); falls back to `python3 -m venv` + `pip` otherwise.
1490
+ - The first conversion is slower because the model weights download (~3 GB). Subsequent runs start instantly.
1491
+ - Apple Silicon: `device: mps` is auto-selected. Linux/CUDA: pass `--device cuda`.
1492
+
1493
+ **Examples:**
1494
+ ```bash
1495
+ # Inspect tooling status (python3, uv, venv path, auto-detected device)
1496
+ lt tools ocr --status
1497
+
1498
+ # One-time install (skip if you just want to convert and let auto-install handle it)
1499
+ lt tools ocr --install
1500
+
1501
+ # Convert a single PDF (creates ./report.pdf.md-out/report/report.md)
1502
+ lt tools ocr ./report.pdf
1503
+
1504
+ # Batch a directory with 4 parallel workers
1505
+ lt tools ocr ./pdfs --output-dir ./md --workers 4
1506
+
1507
+ # Force CPU mode (e.g. when MPS-related crashes occur on Sonoma)
1508
+ lt tools ocr ./report.pdf --device cpu
1509
+ ```
1510
+
1511
+ **When to reach for this command vs. the lt-knowledge ingest pipeline:** `lt tools ocr` is for **local developer workflows** — quick PDF → Markdown for research, demos, validation sets, sanity checks. For production ingestion (Vector / Graph / Wiki layers, confidence-based fallback, Whisper for audio, archive-aware processing) use the lt-knowledge stack with its Docling + LightOnOCR sidecars. Marker is intentionally **not** added there because its MPS advantage doesn't apply in Linux containers and Docling already covers the same use cases with native confidence scoring (see `lt-knowledge/docs/OCR-COMPARISON-MARKER.md`).
1512
+
1466
1513
  ---
1467
1514
 
1468
1515
  ## Configuration Priority
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@lenne.tech/cli",
3
- "version": "1.17.0",
3
+ "version": "1.19.0",
4
4
  "description": "lenne.Tech CLI: lt",
5
5
  "keywords": [
6
6
  "lenne.Tech",