@okrapdf/cli 0.3.7 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,857 @@
1
+ /**
2
+ * Consolidated element management commands
3
+ *
4
+ * Combines entities, tables, toc, and review into a single top-level command.
5
+ * All extracted content from PDFs (tables, figures, footnotes, etc.) lives here.
6
+ */
7
+ import { Command } from 'commander';
8
+ import { writeFileSync, mkdirSync, existsSync, readFileSync } from 'fs';
9
+ import { join, resolve } from 'path';
10
+ import chalk from 'chalk';
11
+ import { get, post, patch, OkraApiError, EXIT_CODES } from '../lib/client.js';
12
+ import { formatOutput, formatDate, formatStatus as formatOutputStatus, success, error, info, warn, } from '../lib/output.js';
13
+ import { withSpinner } from '../lib/progress.js';
14
+ import { getDefaultFormat, shouldUseJsonOutput } from '../lib/config.js';
15
+ import { openInBrowser, getJobWebUrl } from '../lib/browser.js';
16
+ import { CacheManager } from '../lib/cache.js';
17
+ import { validateJobReady, throwValidationError } from '../lib/validator.js';
18
+ // ─── Table columns ───────────────────────────────────────────────────────────
19
/**
 * Column layout used when `elements list` renders entities as a table.
 * Each tuple is [row key, header label, column width].
 */
const ENTITY_COLUMNS = [
    ['id', 'Entity ID', 25],
    ['type', 'Type', 12],
    ['page', 'Page', 6],
    ['title_fmt', 'Title', 40],
    ['has_bbox', 'BBox', 6],
].map(([key, header, width]) => ({ key, header, width }));
26
/**
 * Column layout used when `elements review pages` renders the page list as a table.
 * Each tuple is [row key, header label, column width].
 */
const REVIEW_PAGE_COLUMNS = [
    ['page', 'Page', 6],
    ['status', 'Status', 12],
    ['resolution', 'Resolution', 12],
    ['ocrLineCount', 'OCR Lines', 10],
    ['hasOcr', 'Has OCR', 8],
].map(([key, header, width]) => ({ key, header, width }));
33
+ // ─── Helpers ─────────────────────────────────────────────────────────────────
34
/**
 * Shorten a string to at most `maxLength` characters, marking the cut with '...'.
 *
 * @param {string} str - Text to shorten; null/undefined are treated as an empty string.
 * @param {number} maxLength - Maximum length of the returned string.
 * @returns {string} The input unchanged when it already fits, otherwise a shortened copy.
 */
function truncate(str, maxLength) {
    const text = String(str ?? '');
    if (text.length <= maxLength) {
        return text;
    }
    // With fewer than 3 characters available there is no room for the ellipsis;
    // a negative slice index would otherwise return MORE text than allowed.
    if (maxLength < 3) {
        return text.slice(0, Math.max(0, maxLength));
    }
    return `${text.slice(0, maxLength - 3)}...`;
}
39
/**
 * Colorize a page review status for terminal output.
 *
 * @param {string} status - Review status value.
 * @returns {string} The status wrapped in its chalk color, or the value unchanged
 *   when it is not one of the known statuses.
 */
function formatReviewStatus(status) {
    // Known status -> name of the chalk style applied to it.
    const styleByStatus = new Map([
        ['complete', 'green'],
        ['partial', 'yellow'],
        ['flagged', 'red'],
        ['pending', 'yellow'],
        ['gap', 'magenta'],
        ['empty', 'dim'],
    ]);
    const style = styleByStatus.get(status);
    if (style === undefined) {
        return status;
    }
    return chalk[style](status);
}
50
/**
 * Convert a Markdown pipe table into CSV text.
 *
 * Only lines that start and end with `|` (after trimming) are treated as rows;
 * the header/body separator row (`|---|:-:|`) is skipped. Cells containing a
 * comma, a double quote or a newline are quoted, with embedded quotes doubled.
 *
 * @param {string} markdown - Markdown table source. Non-string or blank input yields ''.
 * @returns {string} CSV rows joined with '\n'.
 */
function markdownTableToCsv(markdown) {
    if (typeof markdown !== 'string') {
        return '';
    }
    const separatorRow = /^\|[\s\-:|]+\|$/;
    const csvLines = [];
    for (const rawLine of markdown.trim().split('\n')) {
        // Trim each line so CRLF line endings and stray indentation do not
        // make the row fail the leading/trailing pipe check below.
        const line = rawLine.trim();
        if (separatorRow.test(line)) {
            continue;
        }
        if (!line.startsWith('|') || !line.endsWith('|')) {
            continue;
        }
        const cells = line
            .slice(1, -1)
            // Split on unescaped pipes only; `\|` is a literal pipe inside a cell.
            .split(/(?<!\\)\|/)
            .map((cell) => cell.trim().replace(/\\\|/g, '|'));
        const escapedCells = cells.map((cell) => {
            const needsQuoting = cell.includes(',') || cell.includes('"') || cell.includes('\n');
            return needsQuoting ? `"${cell.replace(/"/g, '""')}"` : cell;
        });
        csvLines.push(escapedCells.join(','));
    }
    return csvLines.join('\n');
}
72
/**
 * Render a table-of-contents result as an indented Markdown bullet list.
 *
 * @param {{ toc: Array<{ level: number, title: string, page: number }>, total_entries: number }} toc
 *   TOC result; a missing `toc` array is rendered as an empty list.
 * @returns {string} Heading, one bullet per entry (indented by `level - 1`),
 *   and a trailing total line, joined with '\n'.
 */
function formatTocAsMarkdown(toc) {
    const entries = Array.isArray(toc?.toc) ? toc.toc : [];
    const lines = [chalk.bold('Table of Contents'), ''];
    for (const entry of entries) {
        // Clamp the depth: String.prototype.repeat throws a RangeError on a
        // negative count, which a level of 0 (or less) would otherwise produce.
        const level = Number(entry.level);
        const depth = Number.isFinite(level) ? Math.max(0, Math.trunc(level) - 1) : 0;
        const indent = ' '.repeat(depth);
        const pageNum = chalk.dim(`(p. ${entry.page})`);
        lines.push(`${indent}- ${entry.title} ${pageNum}`);
    }
    lines.push('');
    lines.push(chalk.dim(`Total entries: ${toc?.total_entries ?? entries.length}`));
    return lines.join('\n');
}
85
+ // ─── Main command ────────────────────────────────────────────────────────────
86
/**
 * Build the `elements` command tree.
 *
 * Registers `list`, `get`, `export`, `toc`, `count` and the nested `review`
 * group (`status`, `pages`, `page`, `resolve`, `history`, `save`, `versions`,
 * `version`, `diff`) on a new Commander command.
 *
 * @returns {Command} The configured `elements` command, ready to be attached to a program.
 */
export function createElementsCommand() {
    // Explicit radix; also used as a Commander option parser. Passing bare
    // `parseInt` there would receive the previous option value as the radix.
    const toInt = (value) => Number.parseInt(value, 10);
    const elements = new Command('elements')
        .description('Manage extracted elements (tables, figures, footnotes, TOC, review)');
    // ── elements list ──────────────────────────────────────────────────────────
    elements
        .command('list <jobId>')
        .alias('ls')
        .description('List elements from a job')
        .option('-o, --output <format>', 'Output format (table, json, csv)', getDefaultFormat())
        .option('-t, --type <type>', 'Filter by type (tables, figures, footnotes, summaries, signatures, all)', 'all')
        .option('-p, --page <n>', 'Filter by page number')
        .option('--with-bbox', 'Only show elements with bounding boxes')
        .action(async (jobId, options) => {
            const params = {
                type: options.type,
            };
            const response = await withSpinner('Fetching elements', () => get(`api/ocr/jobs/${jobId}/entities`, params));
            let entities = response.entities;
            // Page and bbox filters are applied client-side on the fetched list.
            if (options.page) {
                const pageNum = toInt(options.page);
                entities = entities.filter((e) => e.page === pageNum);
            }
            if (options.withBbox) {
                entities = entities.filter((e) => e.bbox != null);
            }
            if (entities.length === 0) {
                console.log(chalk.dim('No elements found'));
                return;
            }
            const formatted = entities.map((entity) => ({
                ...entity,
                title_fmt: truncate(entity.title || '-', 40),
                has_bbox: entity.bbox ? chalk.green('Yes') : chalk.dim('No'),
            }));
            console.log(formatOutput(formatted, options.output, ENTITY_COLUMNS));
            if (!shouldUseJsonOutput(options.output)) {
                console.log(chalk.dim(`\nTotal: ${entities.length} elements`));
                console.log(chalk.dim(`Status: ${formatOutputStatus(response.extractionStatus)}`));
            }
        });
    // ── elements get ───────────────────────────────────────────────────────────
    elements
        .command('get <elementId>')
        .description('Get a single table/element by ID')
        .option('-o, --output <format>', 'Output format (markdown, json)', 'markdown')
        .action(async (elementId, options) => {
            try {
                const table = await withSpinner('Fetching element', () => get(`api/extractions/tables/${elementId}`));
                if (options.output === 'json') {
                    console.log(formatOutput(table, 'json'));
                }
                else {
                    console.log(chalk.bold(`Table (Page ${table.page_number})`));
                    console.log(chalk.dim('-'.repeat(50)));
                    console.log(table.content_markdown);
                    console.log();
                    console.log(chalk.dim(`ID: ${table.id}`));
                    console.log(chalk.dim(`Source: ${table.processor_type}`));
                    // `!= null` also skips an absent field, which would print "NaN%".
                    if (table.confidence != null) {
                        console.log(chalk.dim(`Confidence: ${(table.confidence * 100).toFixed(1)}%`));
                    }
                }
            }
            catch (err) {
                if (err instanceof OkraApiError && err.statusCode === 404) {
                    error(`Element not found: ${elementId}`);
                    process.exit(EXIT_CODES.NOT_FOUND);
                }
                throw err;
            }
        });
    // ── elements export ────────────────────────────────────────────────────────
    elements
        .command('export <jobId>')
        .description('Export elements to disk (tables as CSV/JSON/MD, images as PNG/JPG)')
        .option('-t, --type <type>', 'Element type to export (tables, figures, all)', 'all')
        .option('-f, --format <fmt>', 'Export format (csv, json, md, png, jpg)', 'csv')
        .option('-d, --output-dir <dir>', 'Output directory', './element-export')
        .option('-q, --quality <n>', 'Image quality for JPEG (1-100)', '90')
        .option('-s, --scale <n>', 'Image scale factor (1-4)', '2')
        .option('--padding <n>', 'Padding around crop region in pixels', '10')
        .option('-o, --output <format>', 'Result output format (table, json)', 'table')
        .action(async (jobId, options) => {
            const useJson = shouldUseJsonOutput(options.output);
            const format = options.format.toLowerCase();
            const isImageFormat = format === 'png' || format === 'jpg';
            if (isImageFormat) {
                // Image export path: crops element regions out of the PDF.
                await exportImages(jobId, { ...options, format }, useJson);
            }
            else {
                // Table data export path: writes table content as text files.
                await exportTableData(jobId, { ...options, format }, useJson);
            }
        });
    // ── elements toc ───────────────────────────────────────────────────────────
    elements
        .command('toc <jobId>')
        .description('Extract table of contents from PDF')
        .option('-f, --format <format>', 'Output format (markdown, json)', 'markdown')
        .option('--max-depth <n>', 'Maximum TOC depth level', (value) => toInt(value))
        .option('--refresh', 'Force refresh from API (ignore cache)')
        .action(async (jobId, options) => {
            try {
                const cache = new CacheManager();
                const validation = await validateJobReady(jobId, cache);
                if (!validation.valid) {
                    throwValidationError(validation.error);
                }
                // Read the cache once; --refresh bypasses it entirely.
                const cachedToc = options.refresh ? null : cache.getToc(jobId);
                if (cachedToc) {
                    info('Using cached TOC data');
                    if (options.format === 'json') {
                        console.log(JSON.stringify(cachedToc, null, 2));
                    }
                    else {
                        console.log(formatTocAsMarkdown(cachedToc));
                    }
                    return;
                }
                const params = {};
                if (options.maxDepth) {
                    params.max_depth = options.maxDepth.toString();
                }
                info('Extracting TOC from PDF...');
                const apiResponse = await withSpinner('Running TOC extraction (this may take ~30s)', () => get(`api/steps/table-of-content/${jobId}`, params));
                if (!apiResponse.success) {
                    error('TOC extraction failed');
                    process.exit(EXIT_CODES.GENERAL_ERROR);
                }
                // Keep only the fields the cache and the formatter use.
                const tocResult = {
                    toc: apiResponse.toc.map((entry) => ({
                        level: entry.level,
                        title: entry.title,
                        page: entry.page,
                    })),
                    total_entries: apiResponse.total_entries,
                };
                cache.setToc(jobId, tocResult);
                success('TOC cached locally');
                cache.logCommand('toc', jobId, {
                    format: options.format,
                    maxDepth: options.maxDepth,
                    refresh: options.refresh,
                });
                cache.logResult(true, {
                    total_entries: tocResult.total_entries,
                    strategy: apiResponse.strategy,
                });
                if (options.format === 'json') {
                    console.log(JSON.stringify(tocResult, null, 2));
                }
                else {
                    console.log(formatTocAsMarkdown(tocResult));
                }
                console.log(chalk.dim(`\nExtraction: ${apiResponse.strategy} strategy, ${apiResponse.total_elapsed_ms}ms`));
            }
            catch (err) {
                const cache = new CacheManager();
                const errorMessage = err instanceof Error ? err.message : String(err);
                cache.logResult(false, {}, errorMessage);
                if (err instanceof OkraApiError) {
                    error(err.message);
                    process.exit(err.exitCode);
                }
                error(`TOC extraction failed: ${errorMessage}`);
                process.exit(EXIT_CODES.GENERAL_ERROR);
            }
        });
    // ── elements count ─────────────────────────────────────────────────────────
    elements
        .command('count <jobId>')
        .description('Get element counts by type')
        .option('-o, --output <format>', 'Output format (table, json)', getDefaultFormat())
        .action(async (jobId, options) => {
            const response = await withSpinner('Fetching element counts', () => get(`api/ocr/jobs/${jobId}/entities`));
            if (options.output === 'json') {
                console.log(formatOutput({
                    job_id: jobId,
                    counts: response.counts,
                    extraction_status: response.extractionStatus,
                    total_pages: response.totalPages,
                }, 'json'));
            }
            else {
                console.log(chalk.bold('\nElement Counts'));
                console.log(chalk.dim('-'.repeat(30)));
                console.log(chalk.bold('Tables:'), response.counts.tables);
                console.log(chalk.bold('Figures:'), response.counts.figures);
                console.log(chalk.bold('Footnotes:'), response.counts.footnotes);
                console.log(chalk.bold('Summaries:'), response.counts.summaries);
                console.log(chalk.bold('Signatures:'), response.counts.signatures);
                console.log();
                console.log(chalk.dim(`Status: ${response.extractionStatus}`));
                if (response.totalPages) {
                    console.log(chalk.dim(`Total pages: ${response.totalPages}`));
                }
            }
        });
    // ── elements review ────────────────────────────────────────────────────────
    const review = elements
        .command('review')
        .description('Review job verification status and page content');
    // review status
    review
        .command('status <jobId>')
        .description('Get verification status summary for a job')
        .option('-o, --output <format>', 'Output format (table, json)', getDefaultFormat())
        .option('-w, --web', 'Open job review page in browser')
        .action(async (jobId, options) => {
            if (options.web) {
                const url = `${getJobWebUrl(jobId)}/review`;
                console.error(`Opening ${url} in your browser.`);
                await openInBrowser(url);
                return;
            }
            const tree = await withSpinner('Fetching verification status', () => get(`api/ocr/jobs/${jobId}/verification-tree`));
            if (options.output === 'json') {
                console.log(formatOutput(tree, 'json'));
                return;
            }
            console.log(chalk.bold('Verification Status'));
            console.log(chalk.dim('-'.repeat(50)));
            console.log(chalk.bold('Job:'), jobId);
            console.log(chalk.bold('Total Pages:'), tree.totalPages);
            console.log();
            console.log(chalk.bold('Summary:'));
            console.log(` ${chalk.green('Complete:')} ${tree.summary.complete}`);
            console.log(` ${chalk.yellow('Pending:')} ${tree.summary.pending}`);
            console.log(` ${chalk.red('Flagged:')} ${tree.summary.flagged}`);
            console.log(` ${chalk.magenta('Gap:')} ${tree.summary.gap}`);
            console.log(` ${chalk.blue('Resolved:')} ${tree.summary.resolved}`);
            if (tree.summary.stale > 0) {
                console.log(` ${chalk.dim('Stale:')} ${tree.summary.stale}`);
            }
        });
    // review pages
    review
        .command('pages <jobId>')
        .description('List pages with verification status')
        .option('-o, --output <format>', 'Output format (table, json)', getDefaultFormat())
        .option('-s, --status <status>', 'Filter by status (complete, pending, flagged, gap)')
        .action(async (jobId, options) => {
            const tree = await withSpinner('Fetching pages', () => get(`api/ocr/jobs/${jobId}/verification-tree`));
            let pages = tree.pages;
            if (options.status) {
                pages = pages.filter((p) => p.status === options.status);
            }
            if (pages.length === 0) {
                console.log(chalk.dim('No pages found'));
                return;
            }
            if (options.output === 'json') {
                console.log(formatOutput(pages, 'json'));
                return;
            }
            const formatted = pages.map((p) => ({
                ...p,
                status: formatReviewStatus(p.status),
                resolution: p.resolution || chalk.dim('-'),
                hasOcr: p.hasOcr ? chalk.green('\u2713') : chalk.dim('\u2717'),
            }));
            console.log(formatOutput(formatted, 'table', REVIEW_PAGE_COLUMNS));
            console.log(chalk.dim(`\n${pages.length} pages`));
        });
    // review page
    review
        .command('page <jobId> <pageNum>')
        .description('Get page content (markdown and OCR blocks)')
        .option('-o, --output <format>', 'Output format (markdown, json)', 'markdown')
        .option('--ocr', 'Show OCR blocks instead of markdown')
        .option('--raw', 'Output raw content without formatting')
        .action(async (jobId, pageNum, options) => {
            const page = await withSpinner('Fetching page content', () => get(`api/ocr/jobs/${jobId}/pages/${pageNum}`));
            if (options.output === 'json') {
                console.log(formatOutput(page, 'json'));
                return;
            }
            if (options.ocr) {
                if (!page.blocks || page.blocks.length === 0) {
                    console.log(chalk.dim('No OCR blocks available'));
                    return;
                }
                if (options.raw) {
                    for (const block of page.blocks) {
                        console.log(block.text);
                    }
                    return;
                }
                console.log(chalk.bold(`OCR Blocks - Page ${pageNum}`));
                console.log(chalk.dim('-'.repeat(50)));
                page.blocks.forEach((block, i) => {
                    const conf = block.confidence !== undefined ? ` (${(block.confidence * 100).toFixed(0)}%)` : '';
                    console.log(chalk.cyan(`[${i + 1}]${conf}`), block.text);
                });
                console.log(chalk.dim(`\n${page.blocks.length} blocks`));
                return;
            }
            if (options.raw) {
                console.log(page.content);
                return;
            }
            console.log(chalk.bold(`Page ${pageNum} Content`));
            console.log(chalk.dim('-'.repeat(50)));
            console.log(page.content);
            console.log();
            console.log(chalk.dim(`Version: ${page.version}`));
            if (page.dimension) {
                console.log(chalk.dim(`Dimension: ${page.dimension.width}x${page.dimension.height}`));
            }
            if (page.blocks) {
                console.log(chalk.dim(`OCR Blocks: ${page.blocks.length}`));
            }
        });
    // review resolve
    review
        .command('resolve <jobId> <pageNum>')
        .description('Mark a page as reviewed')
        .option('-r, --resolution <type>', 'Resolution type (reviewed, skipped, flagged)', 'reviewed')
        .option('-n, --note <text>', 'Add a note to the resolution')
        .option('-o, --output <format>', 'Output format (table, json)', 'table')
        .action(async (jobId, pageNum, options) => {
            const useJson = shouldUseJsonOutput(options.output);
            try {
                await withSpinner(`Resolving page ${pageNum}`, () => post(`api/ocr/jobs/${jobId}/pages/${pageNum}/resolve`, {
                    resolution: options.resolution,
                    note: options.note,
                }));
                if (useJson) {
                    console.log(formatOutput({
                        success: true,
                        job_id: jobId,
                        page: toInt(pageNum),
                        resolution: options.resolution,
                    }, 'json'));
                }
                else {
                    success(`Page ${pageNum} marked as ${options.resolution}`);
                }
            }
            catch (err) {
                if (err instanceof OkraApiError) {
                    if (useJson) {
                        console.log(formatOutput({ success: false, error: err.message }, 'json'));
                    }
                    else {
                        error(err.message);
                    }
                    process.exit(err.exitCode);
                }
                throw err;
            }
        });
    // review history
    review
        .command('history <jobId>')
        .description('Get verification audit trail')
        .option('-o, --output <format>', 'Output format (table, json)', getDefaultFormat())
        .option('-l, --limit <n>', 'Limit results', '20')
        .option('-p, --page <n>', 'Filter by page number')
        .action(async (jobId, options) => {
            const params = { limit: options.limit };
            if (options.page) {
                params.page = options.page;
            }
            const response = await withSpinner('Fetching history', () => get(`api/ocr/jobs/${jobId}/history`, params));
            if (response.history.length === 0) {
                console.log(chalk.dim('No history found'));
                return;
            }
            if (options.output === 'json') {
                console.log(formatOutput(response.history, 'json'));
                return;
            }
            console.log(chalk.bold('Verification History'));
            console.log(chalk.dim('-'.repeat(60)));
            for (const entry of response.history) {
                const page = entry.pageNumber ? `Page ${entry.pageNumber}` : '';
                const entity = entry.entityType ? `${entry.entityType}` : '';
                // Entries with neither a page nor an entity type are labelled 'Job'.
                const target = [page, entity].filter(Boolean).join(' - ') || 'Job';
                console.log(chalk.dim(formatDate(entry.createdAt)), chalk.cyan(entry.action), chalk.white(target), chalk.dim(`by ${entry.triggeredBy}`));
            }
        });
    // review save
    review
        .command('save <jobId> <pageNum>')
        .description('Save/update page markdown content')
        .option('-f, --file <path>', 'Read content from file')
        .option('-c, --content <text>', 'Content to save (use - for stdin)')
        .option('-o, --output <format>', 'Output format (table, json)', 'table')
        .action(async (jobId, pageNum, options) => {
            const useJson = shouldUseJsonOutput(options.output);
            let content;
            if (options.file) {
                if (!existsSync(options.file)) {
                    if (useJson) {
                        console.log(formatOutput({ success: false, error: `File not found: ${options.file}` }, 'json'));
                    }
                    else {
                        error(`File not found: ${options.file}`);
                    }
                    process.exit(EXIT_CODES.INVALID_ARGS);
                }
                content = readFileSync(options.file, 'utf-8');
            }
            else if (options.content !== undefined) {
                // Checked against undefined so an explicit empty string (-c "")
                // is accepted as content rather than reported as missing.
                if (options.content === '-') {
                    const chunks = [];
                    for await (const chunk of process.stdin) {
                        chunks.push(chunk);
                    }
                    content = Buffer.concat(chunks).toString('utf-8');
                }
                else {
                    content = options.content;
                }
            }
            else {
                if (useJson) {
                    console.log(formatOutput({ success: false, error: 'Either --file or --content is required' }, 'json'));
                }
                else {
                    error('Either --file or --content is required');
                }
                process.exit(EXIT_CODES.INVALID_ARGS);
            }
            try {
                const result = await withSpinner(`Saving page ${pageNum}`, () => patch(`api/ocr/jobs/${jobId}/pages/${pageNum}`, { content }));
                if (useJson) {
                    console.log(formatOutput({
                        success: true,
                        job_id: jobId,
                        page: toInt(pageNum),
                        version: result.version,
                    }, 'json'));
                }
                else {
                    success(`Page ${pageNum} saved (version ${result.version})`);
                }
            }
            catch (err) {
                if (err instanceof OkraApiError) {
                    if (useJson) {
                        console.log(formatOutput({ success: false, error: err.message }, 'json'));
                    }
                    else {
                        error(err.message);
                    }
                    process.exit(err.exitCode);
                }
                throw err;
            }
        });
    // review versions
    review
        .command('versions <jobId> <pageNum>')
        .description('List page content versions')
        .option('-o, --output <format>', 'Output format (table, json)', getDefaultFormat())
        .action(async (jobId, pageNum, options) => {
            const response = await withSpinner('Fetching versions', () => get(`api/ocr/jobs/${jobId}/pages/${pageNum}/versions`));
            if (response.versions.length === 0) {
                console.log(chalk.dim('No versions found'));
                return;
            }
            if (options.output === 'json') {
                console.log(formatOutput(response.versions, 'json'));
                return;
            }
            console.log(chalk.bold(`Page ${pageNum} Versions`));
            console.log(chalk.dim('-'.repeat(50)));
            for (const v of response.versions) {
                console.log(` v${v.version} - ${formatDate(v.createdAt)} by ${v.createdBy}`);
            }
        });
    // review version
    review
        .command('version <jobId> <pageNum> <version>')
        .description('Get specific version of page content')
        .option('-o, --output <format>', 'Output format (markdown, json)', 'markdown')
        .option('--raw', 'Output raw content without formatting')
        .action(async (jobId, pageNum, version, options) => {
            const page = await withSpinner(`Fetching version ${version}`, () => get(`api/ocr/jobs/${jobId}/pages/${pageNum}/versions/${version}`));
            if (options.output === 'json') {
                console.log(formatOutput(page, 'json'));
                return;
            }
            if (options.raw) {
                console.log(page.content);
                return;
            }
            console.log(chalk.bold(`Page ${pageNum} - Version ${version}`));
            console.log(chalk.dim('-'.repeat(50)));
            console.log(page.content);
        });
    // review diff
    review
        .command('diff <jobId> <pageNum>')
        .description('Show diff between current and previous version')
        .option('--from <v>', 'Compare from version')
        .option('--to <v>', 'Compare to version')
        .option('-o, --output <format>', 'Output format (table, json)', 'table')
        .action(async (jobId, pageNum, options) => {
            const useJson = shouldUseJsonOutput(options.output);
            const latest = await get(`api/ocr/jobs/${jobId}/pages/${pageNum}`);
            // "to" side: the latest page content unless --to selects another version.
            let toVersion = latest.version;
            if (options.to !== undefined) {
                toVersion = toInt(options.to);
            }
            // "from" side: the version just before the "to" side unless --from is given.
            let fromVersion = (toVersion || 1) - 1;
            if (options.from) {
                fromVersion = toInt(options.from);
            }
            if (Number.isNaN(fromVersion) || Number.isNaN(toVersion)) {
                const message = '--from and --to must be version numbers';
                if (useJson) {
                    console.log(formatOutput({ success: false, error: message }, 'json'));
                }
                else {
                    error(message);
                }
                process.exit(EXIT_CODES.INVALID_ARGS);
            }
            if (fromVersion < 1) {
                if (useJson) {
                    console.log(formatOutput({
                        job_id: jobId,
                        page: toInt(pageNum),
                        current_version: toVersion,
                        previous_version: null,
                        message: 'No previous version to compare',
                        changes: [],
                    }, 'json'));
                }
                else {
                    console.log(chalk.dim('No previous version to compare'));
                }
                return;
            }
            let target = latest;
            if (options.to !== undefined && toVersion !== latest.version) {
                target = await get(`api/ocr/jobs/${jobId}/pages/${pageNum}/versions/${toVersion}`);
            }
            const previous = await get(`api/ocr/jobs/${jobId}/pages/${pageNum}/versions/${fromVersion}`);
            const currentLines = target.content.split('\n');
            const previousLines = previous.content.split('\n');
            // Positional line-by-line comparison (not an LCS diff): line i of
            // one side is compared with line i of the other.
            const changes = [];
            const lineCount = Math.max(currentLines.length, previousLines.length);
            for (let i = 0; i < lineCount; i++) {
                const curr = currentLines[i] ?? '';
                const prev = previousLines[i] ?? '';
                if (curr === prev) {
                    continue;
                }
                if (prev) {
                    changes.push({ line: i + 1, type: 'removed', content: prev });
                }
                if (curr) {
                    changes.push({ line: i + 1, type: 'added', content: curr });
                }
            }
            if (useJson) {
                console.log(formatOutput({
                    job_id: jobId,
                    page: toInt(pageNum),
                    from_version: fromVersion,
                    to_version: toVersion,
                    changes,
                }, 'json'));
                return;
            }
            console.log(chalk.bold(`Diff: v${fromVersion} \u2192 v${toVersion}`));
            console.log(chalk.dim('-'.repeat(50)));
            for (const change of changes) {
                if (change.type === 'removed') {
                    console.log(chalk.red(`- ${change.content}`));
                }
                else {
                    console.log(chalk.green(`+ ${change.content}`));
                }
            }
        });
    return elements;
}
648
+ // ─── Export helpers ──────────────────────────────────────────────────────────
649
/**
 * Export elements that have a bounding box as cropped images.
 *
 * Fetches the job's entities, downloads the source PDF through the document's
 * signed URL, renders each element's region and writes one image file per
 * element into `options.outputDir`.
 *
 * @param {string} jobId - Job whose elements are exported.
 * @param {object} options - Command options; reads `format` ('png' | 'jpg'),
 *   `type`, `page`, `outputDir`, `quality`, `scale` and `padding`.
 * @param {boolean} useJson - When true, results are reported as a single JSON
 *   document instead of per-file progress lines.
 * @returns {Promise<void>}
 */
async function exportImages(jobId, options, useJson) {
    const format = options.format;
    if (format !== 'png' && format !== 'jpg') {
        if (useJson) {
            console.log(formatOutput({ success: false, error: 'Invalid image format. Use: png or jpg' }, 'json'));
        }
        else {
            error('Invalid image format. Use: png or jpg');
        }
        process.exit(EXIT_CODES.INVALID_ARGS);
    }
    const entitiesResponse = await withSpinner('Fetching elements', () => get(`api/ocr/jobs/${jobId}/entities`, { type: options.type }));
    // Only elements with a bounding box can be cropped out of the page.
    let entities = entitiesResponse.entities.filter((e) => e.bbox != null);
    if (options.page) {
        const pageNum = Number.parseInt(options.page, 10);
        entities = entities.filter((e) => e.page === pageNum);
    }
    if (entities.length === 0) {
        if (useJson) {
            console.log(formatOutput({
                success: true,
                job_id: jobId,
                message: 'No elements with bounding boxes found',
                exported: 0,
            }, 'json'));
        }
        else {
            warn('No elements with bounding boxes found');
        }
        return;
    }
    info(`Found ${entities.length} elements with bounding boxes`);
    const jobData = await withSpinner('Fetching job info', () => get(`api/v1/jobs/${jobId}`));
    if (!jobData.document_uuid) {
        if (useJson) {
            console.log(formatOutput({ success: false, error: 'Job has no associated document' }, 'json'));
        }
        else {
            error('Job has no associated document');
        }
        process.exit(EXIT_CODES.GENERAL_ERROR);
    }
    const docInfo = await withSpinner('Getting document URL', () => get(`api/documents/${jobData.document_uuid}/download`));
    // `got` and the renderer are loaded lazily, only on the image export path.
    const got = (await import('got')).default;
    info('Downloading PDF...');
    const pdfResponse = await got(docInfo.signed_url, { responseType: 'buffer' });
    const pdfBuffer = pdfResponse.body;
    const { PdfImageRenderer } = await import('../lib/pdf-image.js');
    const outDir = resolve(options.outputDir);
    if (!existsSync(outDir)) {
        mkdirSync(outDir, { recursive: true });
    }
    const renderer = await PdfImageRenderer.fromBuffer(pdfBuffer);
    const exported = [];
    const errors = [];
    // Filenames already written in this run, so two elements that share a
    // type, page and title slug do not overwrite each other.
    const usedNames = new Set();
    const renderOptions = {
        format,
        quality: Number.parseInt(options.quality, 10),
        scale: Number.parseFloat(options.scale),
        padding: Number.parseInt(options.padding, 10),
    };
    // Progress goes to stdout, so it is suppressed in JSON mode to keep the
    // output a single parseable document.
    if (!useJson) {
        console.log(chalk.dim(`Exporting ${entities.length} elements to ${outDir}...`));
    }
    try {
        for (const entity of entities) {
            try {
                const result = await renderer.renderRegion(entity.page, entity.bbox, renderOptions);
                const safeTitle = (entity.title || 'untitled')
                    .toLowerCase()
                    .replace(/[^a-z0-9]+/g, '-')
                    .slice(0, 30);
                const baseName = `${entity.type}-p${entity.page}-${safeTitle}`;
                let filename = `${baseName}.${format}`;
                for (let suffix = 2; usedNames.has(filename); suffix++) {
                    filename = `${baseName}-${suffix}.${format}`;
                }
                usedNames.add(filename);
                const filepath = join(outDir, filename);
                writeFileSync(filepath, result.buffer);
                exported.push({
                    entity_id: entity.id,
                    file: filepath,
                    page: entity.page,
                    type: entity.type,
                });
                if (!useJson) {
                    console.log(chalk.green(' \u2713'), chalk.dim(`${entity.type} p${entity.page}:`), filename);
                }
            }
            catch (err) {
                // A failing element is recorded and the export continues.
                const errorMsg = err instanceof Error ? err.message : 'Unknown error';
                errors.push({ entity_id: entity.id, error: errorMsg });
                if (!useJson) {
                    console.log(chalk.red(' \u2717'), chalk.dim(`${entity.type} p${entity.page}:`), errorMsg);
                }
            }
        }
    }
    finally {
        renderer.close();
    }
    if (useJson) {
        console.log(formatOutput({
            success: errors.length === 0,
            job_id: jobId,
            output_dir: outDir,
            format,
            exported: exported.length,
            errors: errors.length,
            files: exported,
            error_details: errors.length > 0 ? errors : undefined,
        }, 'json'));
    }
    else {
        console.log();
        if (exported.length > 0) {
            success(`Exported ${exported.length} images to: ${outDir}`);
        }
        if (errors.length > 0) {
            warn(`${errors.length} elements failed to export`);
        }
    }
}
763
/**
 * Export a job's tables as CSV, JSON or Markdown files.
 *
 * Fetches the job's table entities, loads each table by ID and writes one
 * file per table into `options.outputDir`.
 *
 * @param {string} jobId - Job whose tables are exported.
 * @param {object} options - Command options; reads `format`
 *   ('csv' | 'json' | 'md' | 'markdown') and `outputDir`.
 * @param {boolean} useJson - When true, results are reported as a single JSON
 *   document instead of per-file progress lines.
 * @returns {Promise<void>}
 */
async function exportTableData(jobId, options, useJson) {
    const format = options.format;
    if (!['csv', 'json', 'md', 'markdown'].includes(format)) {
        if (useJson) {
            console.log(formatOutput({ success: false, error: `Invalid format: ${format}. Use: csv, json, md, png, jpg` }, 'json'));
        }
        else {
            error(`Invalid format: ${format}. Use: csv, json, md, png, jpg`);
        }
        process.exit(EXIT_CODES.INVALID_ARGS);
    }
    // Fetch entities to get table IDs
    const entitiesResponse = await withSpinner('Fetching elements', () => get(`api/ocr/jobs/${jobId}/entities`, { type: 'tables' }));
    const tableEntities = entitiesResponse.entities.filter((e) => e.type === 'table');
    if (tableEntities.length === 0) {
        if (useJson) {
            console.log(formatOutput({ success: true, job_id: jobId, message: 'No tables found', exported: 0 }, 'json'));
        }
        else {
            warn('No tables found to export');
        }
        return;
    }
    const outDir = resolve(options.outputDir);
    if (!existsSync(outDir)) {
        mkdirSync(outDir, { recursive: true });
    }
    info(`Exporting ${tableEntities.length} tables to ${outDir}...`);
    // 'markdown' is accepted as an alias of 'md'; both write .md files.
    const ext = format === 'markdown' ? 'md' : format;
    const exported = [];
    const errors = [];
    // Filenames already written in this run, so tables that share a page and
    // title slug (e.g. several untitled tables) do not overwrite each other.
    const usedNames = new Set();
    for (const entity of tableEntities) {
        try {
            const table = await get(`api/extractions/tables/${entity.id}`);
            let content;
            if (format === 'json') {
                content = JSON.stringify(table, null, 2);
            }
            else if (format === 'csv') {
                content = markdownTableToCsv(table.content_markdown);
            }
            else {
                content = table.content_markdown;
            }
            const safeTitle = (entity.title || 'untitled')
                .toLowerCase()
                .replace(/[^a-z0-9]+/g, '-')
                .slice(0, 30);
            const baseName = `table-p${entity.page}-${safeTitle}`;
            let filename = `${baseName}.${ext}`;
            for (let suffix = 2; usedNames.has(filename); suffix++) {
                filename = `${baseName}-${suffix}.${ext}`;
            }
            usedNames.add(filename);
            const filepath = join(outDir, filename);
            writeFileSync(filepath, content, 'utf-8');
            exported.push({ table_id: entity.id, file: filepath, page: entity.page });
            if (!useJson) {
                console.log(chalk.green(' \u2713'), chalk.dim(`p${entity.page}:`), filename);
            }
        }
        catch (err) {
            // A failing table is recorded and the export continues.
            const errorMsg = err instanceof Error ? err.message : 'Unknown error';
            errors.push({ table_id: entity.id, error: errorMsg });
            if (!useJson) {
                console.log(chalk.red(' \u2717'), chalk.dim(`p${entity.page}:`), errorMsg);
            }
        }
    }
    if (useJson) {
        console.log(formatOutput({
            success: errors.length === 0,
            job_id: jobId,
            output_dir: outDir,
            format,
            exported: exported.length,
            errors: errors.length,
            files: exported,
            error_details: errors.length > 0 ? errors : undefined,
        }, 'json'));
    }
    else {
        console.log();
        if (exported.length > 0) {
            success(`Exported ${exported.length} tables to: ${outDir}`);
        }
        if (errors.length > 0) {
            warn(`${errors.length} tables failed to export`);
        }
    }
}
857
+ //# sourceMappingURL=elements.js.map