jaz-clio 4.4.0 → 4.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/assets/skills/api/SKILL.md +1 -1
- package/assets/skills/api/references/endpoints.md +41 -0
- package/assets/skills/conversion/SKILL.md +1 -1
- package/assets/skills/jobs/SKILL.md +1 -1
- package/assets/skills/jobs/references/document-collection.md +16 -5
- package/assets/skills/transaction-recipes/SKILL.md +1 -1
- package/dist/commands/draft-helpers.js +26 -6
- package/dist/commands/jobs.js +47 -39
- package/dist/commands/magic.js +308 -5
- package/dist/core/jobs/document-collection/tools/ingest/decrypt.js +19 -0
- package/dist/core/jobs/document-collection/tools/ingest/format.js +15 -2
- package/dist/core/jobs/document-collection/tools/ingest/scanner.js +5 -2
- package/dist/core/jobs/document-collection/tools/ingest/upload.js +4 -4
- package/dist/core/pdf/detect.js +344 -0
- package/dist/core/pdf/index.js +8 -0
- package/dist/core/pdf/split.js +81 -0
- package/dist/core/pdf/types.js +4 -0
- package/package.json +2 -1
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
---
|
|
2
2
|
name: jaz-api
|
|
3
|
-
version: 4.
|
|
3
|
+
version: 4.5.0
|
|
4
4
|
description: Complete reference for the Jaz REST API — the accounting platform backend. Use this skill whenever building, modifying, debugging, or extending any code that calls the API — including API clients, integrations, data seeding, test data, or new endpoint work. Contains every field name, response shape, error, gotcha, and edge case discovered through live production testing.
|
|
5
5
|
license: MIT
|
|
6
6
|
compatibility: Requires Jaz API key (x-jk-api-key header). Works with Claude Code, Google Antigravity, OpenAI Codex, GitHub Copilot, Cursor, and any agent that reads markdown.
|
|
@@ -985,6 +985,12 @@ Content-Type: application/json
|
|
|
985
985
|
- Tax amounts and profiles
|
|
986
986
|
- Document reference numbers, dates, currency
|
|
987
987
|
|
|
988
|
+
**Encrypted PDFs:** Magic cannot process password-protected PDFs. The CLI auto-detects and decrypts before upload:
|
|
989
|
+
- Embed password in filename: `receipt__pw__s3cRetP@ss.pdf` → decrypts with password `s3cRetP@ss`, uploads as `receipt.pdf`
|
|
990
|
+
- `__pw__` delimiter is case-insensitive; password is case-sensitive
|
|
991
|
+
- Requires `qpdf` installed (`brew install qpdf`)
|
|
992
|
+
- If no password in filename, CLI prompts interactively (or errors in `--json` mode with actionable rename instructions)
|
|
993
|
+
|
|
988
994
|
**Key gotchas:**
|
|
989
995
|
- `sourceFile` is the field name (NOT `file`) — same pattern as bank statement endpoint
|
|
990
996
|
- `EXPENSE` returns 422 — use one of the 4 valid types above
|
|
@@ -1056,6 +1062,41 @@ Content-Type: application/json
|
|
|
1056
1062
|
3. When `COMPLETED` → read `businessTransactionDetails.businessTransactionResourceId`
|
|
1057
1063
|
4. Use the BT resource ID with `GET /invoices/:id`, `GET /bills/:id`, `GET /customer-credit-notes/:id`, or `GET /supplier-credit-notes/:id`
|
|
1058
1064
|
|
|
1065
|
+
### CLI: clio magic split — Merged PDF Splitting
|
|
1066
|
+
|
|
1067
|
+
Splits a merged PDF containing multiple documents (invoices, bills, credit notes) into individual files and uploads each to Magic. Uses structural PDF signals (bookmarks, page labels) + text heuristics (keywords, "Page 1 of N" patterns) for boundary detection. **No AI tokens used.**
|
|
1068
|
+
|
|
1069
|
+
```bash
|
|
1070
|
+
# Auto-detect boundaries + upload
|
|
1071
|
+
clio magic split --file merged.pdf --type bill
|
|
1072
|
+
|
|
1073
|
+
# Manual page ranges (for scanned PDFs or override)
|
|
1074
|
+
clio magic split --file merged.pdf --type bill --pages "1-3,4-6,7-9"
|
|
1075
|
+
|
|
1076
|
+
# Dry-run: detect boundaries only (no qpdf needed)
|
|
1077
|
+
clio magic split --file merged.pdf --type bill --dry-run
|
|
1078
|
+
|
|
1079
|
+
# JSON output (for agents)
|
|
1080
|
+
clio magic split --file merged.pdf --type bill --dry-run --json
|
|
1081
|
+
```
|
|
1082
|
+
|
|
1083
|
+
**Detection signals (score-based, threshold >= 50):**
|
|
1084
|
+
- outline-bookmark (+80): PDF bookmark points to this page
|
|
1085
|
+
- page-label-reset (+70): PDF page label restarts at "1"
|
|
1086
|
+
- keyword in header (+40): Document keyword (INVOICE, BILL, etc.) in upper 40%
|
|
1087
|
+
- page-one-of (+35): "Page 1 of N" pattern
|
|
1088
|
+
- keyword-large (+25): Large font (>18pt) keyword bonus
|
|
1089
|
+
- doc-ref (+20): Document reference (INV-001, SO-2024-100, etc.)
|
|
1090
|
+
- continuation (-60): "Page N>1 of M" anti-signal
|
|
1091
|
+
- continuation-text (-40): "Continued" anti-signal
|
|
1092
|
+
|
|
1093
|
+
**Edge cases:**
|
|
1094
|
+
- Scanned PDFs (no extractable text): warns and requires `--pages` manual override
|
|
1095
|
+
- Mixed scanned+digital: low confidence on scanned portions triggers confirmation prompt
|
|
1096
|
+
- Single document detected: suggests `clio magic create` instead
|
|
1097
|
+
- Encrypted PDFs: same `__pw__` pattern as `magic create`
|
|
1098
|
+
- Requires `qpdf` for splitting (not needed for `--dry-run` auto-detect mode)
|
|
1099
|
+
|
|
1059
1100
|
---
|
|
1060
1101
|
|
|
1061
1102
|
## 15. Bank Records
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
---
|
|
2
2
|
name: jaz-conversion
|
|
3
|
-
version: 4.
|
|
3
|
+
version: 4.5.0
|
|
4
4
|
description: Accounting data conversion skill — migrates customer data from Xero, QuickBooks, Sage, MYOB, and Excel exports to Jaz. Covers config, quick, and full conversion workflows, Excel parsing, CoA/contact/tax/items mapping, clearing accounts, TTB, and TB verification.
|
|
5
5
|
---
|
|
6
6
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
---
|
|
2
2
|
name: jaz-jobs
|
|
3
|
-
version: 4.
|
|
3
|
+
version: 4.5.0
|
|
4
4
|
description: 12 accounting jobs for SMB bookkeepers and accountants — month-end, quarter-end, and year-end close playbooks plus 9 ad-hoc operational jobs (bank recon, document collection, GST/VAT filing, payment runs, credit control, supplier recon, audit prep, fixed asset review, statutory filing). Jobs can have paired tools as nested subcommands (e.g., `clio jobs bank-recon match`, `clio jobs document-collection ingest`, `clio jobs statutory-filing sg-cs`). Paired with an interactive CLI blueprint generator (clio jobs).
|
|
5
5
|
license: MIT
|
|
6
6
|
compatibility: Works with Claude Code, Claude Cowork, Claude.ai, and any agent that reads markdown. For API payloads, load the jaz-api skill. For individual transaction patterns, load the jaz-recipes skill.
|
|
@@ -97,8 +97,10 @@ clio jobs document-collection ingest --source "https://www.dropbox.com/..." --ti
|
|
|
97
97
|
clio jobs document-collection ingest --source ./scans/ --type invoice [--json]
|
|
98
98
|
clio jobs document-collection ingest --source ./scans/ --type credit-note-customer [--json]
|
|
99
99
|
|
|
100
|
-
# Scan + upload (
|
|
101
|
-
clio jobs document-collection ingest --source ./bank-docs/ --upload --bank-account "DBS Checking" --
|
|
100
|
+
# Scan + upload (encrypted PDFs with password embedded in filename)
|
|
101
|
+
clio jobs document-collection ingest --source ./bank-docs/ --upload --bank-account "DBS Checking" --json
|
|
102
|
+
# To decrypt: rename encrypted PDF to: receipt__pw__actualPassword.pdf
|
|
103
|
+
# The __pw__ delimiter is case-insensitive; the password itself is case-sensitive.
|
|
102
104
|
```
|
|
103
105
|
|
|
104
106
|
### Options
|
|
@@ -109,12 +111,21 @@ clio jobs document-collection ingest --source ./bank-docs/ --upload --bank-accou
|
|
|
109
111
|
| `--type <type>` | Force all files to: `invoice`, `bill`, `credit-note-customer`, `credit-note-supplier`, or `bank-statement` |
|
|
110
112
|
| `--upload` | Upload classified files to Jaz after scanning (requires auth) |
|
|
111
113
|
| `--bank-account <name-or-id>` | Bank account name or resourceId (required for bank statements) |
|
|
112
|
-
| `--pdf-password <password>` | Password for encrypted PDFs — same password applied to all. Requires `qpdf` installed (`brew install qpdf`) |
|
|
113
114
|
| `--api-key <key>` | API key for upload (or use `JAZ_API_KEY` env var) |
|
|
114
115
|
| `--timeout <ms>` | Download timeout in milliseconds (default: 30000 for files, 120000 for folders) |
|
|
115
116
|
| `--currency <code>` | Functional/reporting currency label |
|
|
116
117
|
| `--json` | Structured JSON output with absolute file paths |
|
|
117
118
|
|
|
119
|
+
### Encrypted PDF Passwords
|
|
120
|
+
|
|
121
|
+
Embed the password in the filename using the `__pw__` pattern:
|
|
122
|
+
```
|
|
123
|
+
receipt__pw__s3cRetP@ss.pdf → password: "s3cRetP@ss", display name: "receipt.pdf"
|
|
124
|
+
```
|
|
125
|
+
- `__pw__` is case-insensitive (`__PW__`, `__Pw__`, etc.)
|
|
126
|
+
- The password after `__pw__` is case-sensitive
|
|
127
|
+
- Requires `qpdf` installed (`brew install qpdf`)
|
|
128
|
+
|
|
118
129
|
### JSON Output
|
|
119
130
|
|
|
120
131
|
The `--json` output includes absolute file paths, classification, and size for each file. The AI agent uses these paths to upload via the api skill.
|
|
@@ -171,7 +182,7 @@ sudo apt install qpdf # Ubuntu/Debian
|
|
|
171
182
|
choco install qpdf # Windows
|
|
172
183
|
```
|
|
173
184
|
|
|
174
|
-
|
|
185
|
+
Passwords are embedded in the filename using the `__pw__` pattern: `receipt__pw__myPass.pdf`. During upload, encrypted PDFs with a filename password are decrypted to a temp file, uploaded, then cleaned up.
|
|
175
186
|
|
|
176
187
|
### Error Handling (JSON mode)
|
|
177
188
|
|
|
@@ -180,7 +191,7 @@ If encrypted PDFs are found during `--upload` without the required dependencies,
|
|
|
180
191
|
| Error Code | Condition | Action |
|
|
181
192
|
|------------|-----------|--------|
|
|
182
193
|
| `ENCRYPTED_PDF_NO_QPDF` | qpdf not installed | Install qpdf, then retry |
|
|
183
|
-
| `ENCRYPTED_PDF_NO_PASSWORD` |
|
|
194
|
+
| `ENCRYPTED_PDF_NO_PASSWORD` | Encrypted PDF without `__pw__` in filename | Rename file to embed password: `name__pw__password.pdf` |
|
|
184
195
|
|
|
185
196
|
## Phases (Blueprint)
|
|
186
197
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
---
|
|
2
2
|
name: jaz-recipes
|
|
3
|
-
version: 4.
|
|
3
|
+
version: 4.5.0
|
|
4
4
|
description: 16 IFRS-compliant recipes for complex multi-step accounting in Jaz — prepaid amortization, deferred revenue, loan schedules, IFRS 16 leases, hire purchase, fixed deposits, asset disposal, FX revaluation, ECL provisioning, IAS 37 provisions, dividends, intercompany, and capital WIP. Each recipe includes journal entries, capsule structure, and verification steps. Paired with 10 financial calculators that produce execution-ready blueprints with workings.
|
|
5
5
|
license: MIT
|
|
6
6
|
compatibility: Works with Claude Code, Claude Cowork, Claude.ai, and any agent that reads markdown. For API payloads, load the jaz-api skill alongside this one.
|
|
@@ -27,7 +27,9 @@ export const JOURNAL_REQUIRED_FIELDS = [
|
|
|
27
27
|
{ field: 'valueDate', label: 'Date', hint: '--date <YYYY-MM-DD>', check: (j) => !!j.valueDate },
|
|
28
28
|
{ field: 'journalEntries', label: 'Journal entries', hint: '--entries <json>', check: (j) => j.journalEntries?.length > 0 },
|
|
29
29
|
{ field: 'accountResourceId', label: 'Account', hint: '--account <name or UUID>', check: (e) => !!(e.accountResourceId || e.organizationAccountResourceId), perLineItem: true },
|
|
30
|
-
{ field: 'amount', label: 'Amount', hint: 'via --entries', check: (e) => e.amount != null && e.amount > 0
|
|
30
|
+
{ field: 'amount', label: 'Amount', hint: 'via --entries', check: (e) => (e.amount != null && e.amount > 0) ||
|
|
31
|
+
(e.debitAmount != null && e.debitAmount > 0) ||
|
|
32
|
+
(e.creditAmount != null && e.creditAmount > 0), perLineItem: true },
|
|
31
33
|
];
|
|
32
34
|
// ── Core Validation ─────────────────────────────────────────────
|
|
33
35
|
/**
|
|
@@ -91,14 +93,17 @@ specs, lineItemsKey = 'lineItems') {
|
|
|
91
93
|
? { status: 'ok', resourceId: acctId }
|
|
92
94
|
: { status: 'missing', hint: '--account <name or UUID>' };
|
|
93
95
|
// Journal entries: show amount/type instead of name/unitPrice
|
|
96
|
+
// GET returns debitAmount/creditAmount (not amount/type)
|
|
94
97
|
if (lineItemsKey === 'journalEntries') {
|
|
95
98
|
const amountSpec = liSpecs.find((s) => s.field === 'amount');
|
|
96
99
|
const amountOk = amountSpec ? amountSpec.check(li) : true;
|
|
100
|
+
const entryAmount = li.amount ?? li.debitAmount ?? li.creditAmount ?? null;
|
|
101
|
+
const entryType = li.type ?? (li.debitAmount > 0 ? 'DEBIT' : li.creditAmount > 0 ? 'CREDIT' : null);
|
|
97
102
|
return {
|
|
98
103
|
index: i,
|
|
99
|
-
name: li.description ||
|
|
104
|
+
name: li.description || entryType || null,
|
|
100
105
|
nameStatus: 'ok',
|
|
101
|
-
unitPrice:
|
|
106
|
+
unitPrice: entryAmount,
|
|
102
107
|
unitPriceStatus: amountOk ? 'ok' : 'missing',
|
|
103
108
|
account,
|
|
104
109
|
};
|
|
@@ -358,10 +363,25 @@ function sanitizeJournalEntry(e) {
|
|
|
358
363
|
const acctId = e.accountResourceId || e.organizationAccountResourceId;
|
|
359
364
|
if (acctId)
|
|
360
365
|
clean.accountResourceId = acctId;
|
|
361
|
-
|
|
366
|
+
// Amount/type: GET uses debitAmount/creditAmount, PUT uses amount+type
|
|
367
|
+
if (e.amount != null) {
|
|
362
368
|
clean.amount = e.amount;
|
|
363
|
-
|
|
364
|
-
|
|
369
|
+
}
|
|
370
|
+
else if (e.debitAmount != null && e.debitAmount > 0) {
|
|
371
|
+
clean.amount = e.debitAmount;
|
|
372
|
+
}
|
|
373
|
+
else if (e.creditAmount != null && e.creditAmount > 0) {
|
|
374
|
+
clean.amount = e.creditAmount;
|
|
375
|
+
}
|
|
376
|
+
if (e.type) {
|
|
377
|
+
clean.type = e.type; // Already in PUT form (DEBIT or CREDIT)
|
|
378
|
+
}
|
|
379
|
+
else if (e.debitAmount != null && e.debitAmount > 0) {
|
|
380
|
+
clean.type = 'DEBIT';
|
|
381
|
+
}
|
|
382
|
+
else if (e.creditAmount != null && e.creditAmount > 0) {
|
|
383
|
+
clean.type = 'CREDIT';
|
|
384
|
+
}
|
|
365
385
|
if (e.description)
|
|
366
386
|
clean.description = e.description;
|
|
367
387
|
if (e.contactResourceId)
|
package/dist/commands/jobs.js
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import chalk from 'chalk';
|
|
2
|
+
import prompts from 'prompts';
|
|
2
3
|
import { readFileSync } from 'node:fs';
|
|
3
4
|
import { generateMonthEndBlueprint } from '../core/jobs/month-end/blueprint.js';
|
|
4
5
|
import { generateQuarterEndBlueprint } from '../core/jobs/quarter-end/blueprint.js';
|
|
@@ -276,7 +277,6 @@ export function registerJobsCommand(program) {
|
|
|
276
277
|
.option('--type <type>', 'Force document type: invoice, bill, credit-note-customer, credit-note-supplier, or bank-statement')
|
|
277
278
|
.option('--upload', 'Upload classified files to Jaz after scanning (requires auth)')
|
|
278
279
|
.option('--bank-account <name-or-id>', 'Bank account name or resourceId (required for bank statements)')
|
|
279
|
-
.option('--pdf-password <password>', 'Password for encrypted PDFs (same password applied to all)')
|
|
280
280
|
.option('--api-key <key>', 'API key (or use JAZ_API_KEY env var)')
|
|
281
281
|
.option('--timeout <ms>', 'Download timeout in milliseconds for cloud sources (default: 30000)', parseInt)
|
|
282
282
|
.option('--currency <code>', 'Functional/reporting currency (e.g. SGD)')
|
|
@@ -342,46 +342,55 @@ export function registerJobsCommand(program) {
|
|
|
342
342
|
process.exit(1);
|
|
343
343
|
}
|
|
344
344
|
// ── Encrypted PDF checks ──────────────────────────────────────
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
}
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
345
|
+
// Files with __pw__<password> in the filename auto-supply their password.
|
|
346
|
+
// Only files without a filePassword need qpdf + user action.
|
|
347
|
+
const encryptedFiles = plan.folders.flatMap(f => f.files.filter(file => file.encrypted));
|
|
348
|
+
const needPassword = encryptedFiles.filter(f => !f.filePassword);
|
|
349
|
+
if (encryptedFiles.length > 0 && !isQpdfAvailable()) {
|
|
350
|
+
const paths = encryptedFiles.map(f => `${f.folder}/${f.filename}`);
|
|
351
|
+
if (opts.json) {
|
|
352
|
+
console.log(JSON.stringify({
|
|
353
|
+
error: 'ENCRYPTED_PDF_NO_QPDF',
|
|
354
|
+
message: 'Encrypted PDFs found but qpdf is not installed',
|
|
355
|
+
action: 'Install qpdf: brew install qpdf (macOS) or sudo apt install qpdf (Linux), then retry',
|
|
356
|
+
encryptedFiles: paths,
|
|
357
|
+
}));
|
|
358
|
+
}
|
|
359
|
+
else {
|
|
360
|
+
console.error(chalk.red('Error: Encrypted PDFs found but qpdf is not installed.'));
|
|
361
|
+
console.error(chalk.yellow('Install qpdf to decrypt before upload:'));
|
|
362
|
+
console.error(chalk.dim(' macOS: brew install qpdf'));
|
|
363
|
+
console.error(chalk.dim(' Ubuntu: sudo apt install qpdf'));
|
|
364
|
+
console.error(chalk.dim(' Windows: choco install qpdf'));
|
|
365
|
+
}
|
|
366
|
+
process.exit(1);
|
|
367
|
+
}
|
|
368
|
+
if (needPassword.length > 0) {
|
|
369
|
+
const paths = needPassword.map(f => `${f.folder}/${f.filename}`);
|
|
370
|
+
if (opts.json) {
|
|
371
|
+
// Agents can't type — error with actionable instructions
|
|
372
|
+
console.log(JSON.stringify({
|
|
373
|
+
error: 'ENCRYPTED_PDF_NO_PASSWORD',
|
|
374
|
+
message: `${needPassword.length} encrypted PDF(s) found without a password`,
|
|
375
|
+
action: 'Embed password in filename: rename to filename__pw__password.pdf',
|
|
376
|
+
encryptedFiles: paths,
|
|
377
|
+
}));
|
|
365
378
|
process.exit(1);
|
|
366
379
|
}
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
380
|
+
// Interactive mode — prompt user for each encrypted file
|
|
381
|
+
console.error(chalk.yellow(`\n${needPassword.length} encrypted PDF(s) need a password:\n`));
|
|
382
|
+
for (const f of needPassword) {
|
|
383
|
+
const { password } = await prompts({
|
|
384
|
+
type: 'text',
|
|
385
|
+
name: 'password',
|
|
386
|
+
message: `PDF password for ${f.folder}/${f.filename}`,
|
|
387
|
+
});
|
|
388
|
+
if (!password) {
|
|
389
|
+
console.error(chalk.red('Aborted — no password provided.'));
|
|
390
|
+
console.error(chalk.dim('Tip: embed password in filename to skip prompts: filename__pw__password.pdf'));
|
|
391
|
+
process.exit(1);
|
|
375
392
|
}
|
|
376
|
-
|
|
377
|
-
console.error(chalk.red(`Error: ${encryptedFiles.length} encrypted PDF(s) found — password required.`));
|
|
378
|
-
console.error(chalk.dim('Encrypted files:'));
|
|
379
|
-
for (const f of encryptedFiles)
|
|
380
|
-
console.error(chalk.dim(` ${f}`));
|
|
381
|
-
console.error();
|
|
382
|
-
console.error(chalk.dim('Retry with: --pdf-password <password>'));
|
|
383
|
-
}
|
|
384
|
-
process.exit(1);
|
|
393
|
+
f.filePassword = password;
|
|
385
394
|
}
|
|
386
395
|
}
|
|
387
396
|
// ── Upload with auth ─────────────────────────────────────────
|
|
@@ -402,7 +411,6 @@ export function registerJobsCommand(program) {
|
|
|
402
411
|
plan,
|
|
403
412
|
client,
|
|
404
413
|
bankAccountId,
|
|
405
|
-
pdfPassword: opts.pdfPassword,
|
|
406
414
|
onProgress: apiOpts.json ? undefined : printUploadProgress,
|
|
407
415
|
});
|
|
408
416
|
const result = { ...plan, upload };
|
package/dist/commands/magic.js
CHANGED
|
@@ -1,7 +1,10 @@
|
|
|
1
1
|
import chalk from 'chalk';
|
|
2
2
|
import { readFileSync } from 'node:fs';
|
|
3
3
|
import { basename, extname, resolve } from 'node:path';
|
|
4
|
+
import prompts from 'prompts';
|
|
4
5
|
import { createFromAttachment, searchMagicWorkflows, waitForWorkflows, } from '../core/api/magic.js';
|
|
6
|
+
import { extractFilePassword, isPdfEncrypted, isQpdfAvailable, decryptPdf, cleanupDecryptedFile, } from '../core/jobs/document-collection/tools/ingest/decrypt.js';
|
|
7
|
+
import { detectBoundaries, parsePageRanges, splitPdf, cleanupSplitFiles, } from '../core/pdf/index.js';
|
|
5
8
|
import { apiAction } from './api-action.js';
|
|
6
9
|
import { parsePositiveInt } from './parsers.js';
|
|
7
10
|
import { displaySlice } from './pagination.js';
|
|
@@ -42,8 +45,8 @@ export function registerMagicCommand(program) {
|
|
|
42
45
|
// ── clio magic create ─────────────────────────────────────────
|
|
43
46
|
magic
|
|
44
47
|
.command('create')
|
|
45
|
-
.description('Upload a file to create a draft transaction via AI extraction')
|
|
46
|
-
.option('--file <path>', 'Local file path (PDF, JPG, PNG, HEIC, XLS, XLSX, EML)')
|
|
48
|
+
.description('Upload a file to create a draft transaction via AI extraction. Encrypted PDFs auto-decrypt via __pw__ in filename (e.g. receipt__pw__pass.pdf)')
|
|
49
|
+
.option('--file <path>', 'Local file path (PDF, JPG, PNG, HEIC, XLS, XLSX, EML). Encrypted PDFs: name__pw__password.pdf')
|
|
47
50
|
.option('--url <url>', 'Remote file URL (alternative to --file)')
|
|
48
51
|
.option('--type <type>', `Document type: ${VALID_TYPES}`)
|
|
49
52
|
.option('--api-key <key>', 'API key (overrides stored/env)')
|
|
@@ -69,6 +72,7 @@ export function registerMagicCommand(program) {
|
|
|
69
72
|
}
|
|
70
73
|
let sourceFile;
|
|
71
74
|
let sourceFileName;
|
|
75
|
+
let decryptedPath;
|
|
72
76
|
if (opts.file) {
|
|
73
77
|
const filePath = resolve(opts.file);
|
|
74
78
|
const ext = extname(filePath).toLowerCase();
|
|
@@ -77,9 +81,17 @@ export function registerMagicCommand(program) {
|
|
|
77
81
|
console.error(chalk.red(`Error: unsupported file type "${ext}". Supported: ${Object.keys(MIME_MAP).join(', ')}`));
|
|
78
82
|
process.exit(1);
|
|
79
83
|
}
|
|
80
|
-
const
|
|
81
|
-
|
|
82
|
-
|
|
84
|
+
const resolved = await resolveInputPdf(filePath, ext, opts);
|
|
85
|
+
decryptedPath = resolved.decryptedPath;
|
|
86
|
+
try {
|
|
87
|
+
const buffer = readFileSync(resolved.effectivePath);
|
|
88
|
+
sourceFile = new Blob([buffer], { type: mime });
|
|
89
|
+
sourceFileName = resolved.cleanName;
|
|
90
|
+
}
|
|
91
|
+
finally {
|
|
92
|
+
if (decryptedPath)
|
|
93
|
+
cleanupDecryptedFile(decryptedPath);
|
|
94
|
+
}
|
|
83
95
|
}
|
|
84
96
|
const res = await createFromAttachment(client, {
|
|
85
97
|
businessTransactionType: apiType,
|
|
@@ -231,8 +243,299 @@ export function registerMagicCommand(program) {
|
|
|
231
243
|
console.log(chalk.dim(` ... and ${overflow.toLocaleString()} more (use --json for full output)`));
|
|
232
244
|
}
|
|
233
245
|
}));
|
|
246
|
+
// ── clio magic split ──────────────────────────────────────────
|
|
247
|
+
magic
|
|
248
|
+
.command('split')
|
|
249
|
+
.description('Split a merged PDF into individual documents and upload each to Magic.\n' +
|
|
250
|
+
'Auto-detects document boundaries using text heuristics (keywords, page numbers, bookmarks).\n' +
|
|
251
|
+
'For scanned PDFs, use --pages to specify boundaries manually.')
|
|
252
|
+
.option('--file <path>', 'Local PDF file path (required). Encrypted PDFs: name__pw__password.pdf')
|
|
253
|
+
.option('--type <type>', `Document type: ${VALID_TYPES}`)
|
|
254
|
+
.option('--pages <ranges>', 'Manual page ranges (e.g. "1-3,4-6,7"). Skips auto-detection')
|
|
255
|
+
.option('--dry-run', 'Detect boundaries only — do not split or upload')
|
|
256
|
+
.option('--api-key <key>', 'API key (overrides stored/env)')
|
|
257
|
+
.option('--json', 'Output as JSON')
|
|
258
|
+
.action(apiAction(async (client, opts) => {
|
|
259
|
+
// ── Validate inputs ──
|
|
260
|
+
if (!opts.file) {
|
|
261
|
+
console.error(chalk.red('Error: --file is required'));
|
|
262
|
+
console.error(chalk.dim('Usage: clio magic split --file merged.pdf --type bill'));
|
|
263
|
+
process.exit(1);
|
|
264
|
+
}
|
|
265
|
+
if (!opts.type) {
|
|
266
|
+
console.error(chalk.red(`Error: --type is required (${VALID_TYPES})`));
|
|
267
|
+
console.error(chalk.dim('Usage: clio magic split --file merged.pdf --type bill'));
|
|
268
|
+
process.exit(1);
|
|
269
|
+
}
|
|
270
|
+
const apiType = TYPE_TO_API[opts.type];
|
|
271
|
+
if (!apiType) {
|
|
272
|
+
console.error(chalk.red(`Error: invalid type "${opts.type}". Valid: ${VALID_TYPES}`));
|
|
273
|
+
process.exit(1);
|
|
274
|
+
}
|
|
275
|
+
const filePath = resolve(opts.file);
|
|
276
|
+
const ext = extname(filePath).toLowerCase();
|
|
277
|
+
if (ext !== '.pdf') {
|
|
278
|
+
console.error(chalk.red('Error: PDF splitting only supports .pdf files'));
|
|
279
|
+
process.exit(1);
|
|
280
|
+
}
|
|
281
|
+
// qpdf only needed when actually splitting (not for dry-run auto-detect)
|
|
282
|
+
const needsQpdf = !opts.dryRun || opts.pages;
|
|
283
|
+
if (needsQpdf && !isQpdfAvailable()) {
|
|
284
|
+
console.error(chalk.red('Error: qpdf is required for PDF splitting.'));
|
|
285
|
+
console.error(chalk.dim(' macOS: brew install qpdf'));
|
|
286
|
+
console.error(chalk.dim(' Ubuntu: sudo apt install qpdf'));
|
|
287
|
+
process.exit(1);
|
|
288
|
+
}
|
|
289
|
+
// ── Handle encrypted PDFs ──
|
|
290
|
+
const resolved = await resolveInputPdf(filePath, ext, opts);
|
|
291
|
+
const effectivePath = resolved.effectivePath;
|
|
292
|
+
const sourceBaseName = resolved.cleanName.replace(/\.pdf$/i, '');
|
|
293
|
+
try {
|
|
294
|
+
// ── Determine documents (auto-detect or manual) ──
|
|
295
|
+
let documents;
|
|
296
|
+
let pageCount;
|
|
297
|
+
if (opts.pages) {
|
|
298
|
+
// Manual page ranges — need qpdf for page count
|
|
299
|
+
pageCount = (await import('../core/pdf/split.js')).getPageCount(effectivePath);
|
|
300
|
+
documents = parsePageRanges(opts.pages, pageCount);
|
|
301
|
+
}
|
|
302
|
+
else {
|
|
303
|
+
// Auto-detect boundaries
|
|
304
|
+
const buffer = readFileSync(effectivePath);
|
|
305
|
+
const detection = await detectBoundaries(new Uint8Array(buffer));
|
|
306
|
+
pageCount = detection.pageCount;
|
|
307
|
+
documents = detection.documents;
|
|
308
|
+
// Scanned PDF — can't auto-detect, require --pages
|
|
309
|
+
if (detection.isScannedPdf) {
|
|
310
|
+
if (opts.json) {
|
|
311
|
+
console.log(JSON.stringify({
|
|
312
|
+
file: basename(filePath),
|
|
313
|
+
pageCount,
|
|
314
|
+
isScannedPdf: true,
|
|
315
|
+
error: 'Scanned PDF — no extractable text. Use --pages to specify boundaries manually.',
|
|
316
|
+
}, null, 2));
|
|
317
|
+
}
|
|
318
|
+
else {
|
|
319
|
+
console.error(chalk.yellow('Scanned PDF detected — no extractable text for boundary detection.'));
|
|
320
|
+
console.error(chalk.dim(` Use --pages to split manually:`));
|
|
321
|
+
console.error(chalk.dim(` clio magic split --file ${basename(filePath)} --type ${opts.type} --pages "1-3,4-6,7-9"`));
|
|
322
|
+
}
|
|
323
|
+
process.exit(1);
|
|
324
|
+
}
|
|
325
|
+
// Single document — suggest magic create instead
|
|
326
|
+
if (documents.length <= 1) {
|
|
327
|
+
if (opts.json) {
|
|
328
|
+
console.log(JSON.stringify({
|
|
329
|
+
file: basename(filePath),
|
|
330
|
+
pageCount,
|
|
331
|
+
documentsDetected: documents.length,
|
|
332
|
+
message: 'Only 1 document detected — use `clio magic create` instead, or --pages to override.',
|
|
333
|
+
}, null, 2));
|
|
334
|
+
}
|
|
335
|
+
else {
|
|
336
|
+
console.log(chalk.yellow(`Only 1 document detected in ${pageCount}-page PDF.`));
|
|
337
|
+
console.log(chalk.dim(` Use clio magic create --file ${basename(filePath)} --type ${opts.type}`));
|
|
338
|
+
console.log(chalk.dim(` Or override with --pages: clio magic split --file ${basename(filePath)} --type ${opts.type} --pages "1-3,4-6"`));
|
|
339
|
+
}
|
|
340
|
+
return;
|
|
341
|
+
}
|
|
342
|
+
}
|
|
343
|
+
// ── Dry-run: print detection results only ──
|
|
344
|
+
if (opts.dryRun) {
|
|
345
|
+
if (opts.json) {
|
|
346
|
+
console.log(JSON.stringify({
|
|
347
|
+
file: basename(filePath),
|
|
348
|
+
pageCount,
|
|
349
|
+
documents: documents.map((d) => ({
|
|
350
|
+
index: d.index,
|
|
351
|
+
pageRange: d.pageRange,
|
|
352
|
+
confidence: d.confidence,
|
|
353
|
+
signals: d.signals.map((s) => s.label),
|
|
354
|
+
})),
|
|
355
|
+
}, null, 2));
|
|
356
|
+
}
|
|
357
|
+
else {
|
|
358
|
+
console.log(chalk.bold(`PDF Split — Boundary Detection`));
|
|
359
|
+
console.log(` File: ${basename(filePath)} (${pageCount} pages)\n`);
|
|
360
|
+
for (const doc of documents) {
|
|
361
|
+
const conf = doc.confidence === 'high' ? chalk.green(doc.confidence)
|
|
362
|
+
: doc.confidence === 'medium' ? chalk.yellow(doc.confidence)
|
|
363
|
+
: chalk.red(doc.confidence);
|
|
364
|
+
const signals = doc.signals.filter((s) => s.score > 0).map((s) => s.label).join(', ') || 'first page';
|
|
365
|
+
console.log(` Document ${doc.index + 1}: pages ${doc.pageRange.replace('-', '\u2013')} (${conf}) ${chalk.dim(signals)}`);
|
|
366
|
+
}
|
|
367
|
+
console.log(`\n ${documents.length} documents detected. Use --pages to override.`);
|
|
368
|
+
}
|
|
369
|
+
return;
|
|
370
|
+
}
|
|
371
|
+
// ── Confidence check: prompt if any low/medium ──
|
|
372
|
+
const hasLowConfidence = documents.some((d) => d.confidence !== 'high' && d.index > 0);
|
|
373
|
+
if (hasLowConfidence && !opts.json) {
|
|
374
|
+
console.log(chalk.bold(`PDF Split — Boundary Detection`));
|
|
375
|
+
console.log(` File: ${basename(filePath)} (${pageCount} pages)\n`);
|
|
376
|
+
for (const doc of documents) {
|
|
377
|
+
const conf = doc.confidence === 'high' ? chalk.green(doc.confidence)
|
|
378
|
+
: doc.confidence === 'medium' ? chalk.yellow(doc.confidence)
|
|
379
|
+
: chalk.red(doc.confidence);
|
|
380
|
+
const signals = doc.signals.filter((s) => s.score > 0).map((s) => s.label).join(', ') || 'first page';
|
|
381
|
+
console.log(` Document ${doc.index + 1}: pages ${doc.pageRange.replace('-', '\u2013')} (${conf}) ${chalk.dim(signals)}`);
|
|
382
|
+
}
|
|
383
|
+
console.log('');
|
|
384
|
+
const { proceed } = await prompts({
|
|
385
|
+
type: 'confirm',
|
|
386
|
+
name: 'proceed',
|
|
387
|
+
message: 'Some boundaries have low confidence. Split and upload anyway?',
|
|
388
|
+
initial: true,
|
|
389
|
+
});
|
|
390
|
+
if (!proceed) {
|
|
391
|
+
console.log(chalk.dim('Aborted. Use --pages to specify boundaries manually.'));
|
|
392
|
+
return;
|
|
393
|
+
}
|
|
394
|
+
}
|
|
395
|
+
// ── Split + Upload ──
|
|
396
|
+
const splitResult = splitPdf(effectivePath, documents, sourceBaseName);
|
|
397
|
+
const uploadResults = [];
|
|
398
|
+
try {
|
|
399
|
+
// Report split failures
|
|
400
|
+
for (const f of splitResult.failures) {
|
|
401
|
+
uploadResults.push({
|
|
402
|
+
index: f.index,
|
|
403
|
+
pageRange: f.pageRange,
|
|
404
|
+
splitFileName: `${sourceBaseName}_${f.index + 1}.pdf`,
|
|
405
|
+
status: 'failed',
|
|
406
|
+
error: `Split failed: ${f.error}`,
|
|
407
|
+
});
|
|
408
|
+
if (!opts.json) {
|
|
409
|
+
console.log(chalk.red(` \u2717 [${f.index + 1}/${documents.length}] pages ${f.pageRange} — split failed: ${f.error}`));
|
|
410
|
+
}
|
|
411
|
+
}
|
|
412
|
+
// Upload each split file
|
|
413
|
+
for (const file of splitResult.files) {
|
|
414
|
+
try {
|
|
415
|
+
const buffer = readFileSync(file.path);
|
|
416
|
+
const blob = new Blob([buffer], { type: 'application/pdf' });
|
|
417
|
+
const res = await createFromAttachment(client, {
|
|
418
|
+
businessTransactionType: apiType,
|
|
419
|
+
sourceFile: blob,
|
|
420
|
+
sourceFileName: file.fileName,
|
|
421
|
+
});
|
|
422
|
+
const valid = res.data.validFiles?.[0];
|
|
423
|
+
const invalid = res.data.invalidFiles?.[0];
|
|
424
|
+
if (valid) {
|
|
425
|
+
uploadResults.push({
|
|
426
|
+
index: file.index,
|
|
427
|
+
pageRange: file.pageRange,
|
|
428
|
+
splitFileName: file.fileName,
|
|
429
|
+
status: 'uploaded',
|
|
430
|
+
workflowResourceId: valid.workflowResourceId,
|
|
431
|
+
documentType: res.data.businessTransactionType,
|
|
432
|
+
});
|
|
433
|
+
if (!opts.json) {
|
|
434
|
+
console.log(chalk.green(` \u2713 [${file.index + 1}/${documents.length}] pages ${file.pageRange} \u2192 ${file.fileName} \u2192 ${opts.type.toUpperCase()} (workflow: ${valid.workflowResourceId})`));
|
|
435
|
+
}
|
|
436
|
+
}
|
|
437
|
+
else {
|
|
438
|
+
const errMsg = invalid?.errorMessage ?? 'Unknown upload error';
|
|
439
|
+
uploadResults.push({
|
|
440
|
+
index: file.index,
|
|
441
|
+
pageRange: file.pageRange,
|
|
442
|
+
splitFileName: file.fileName,
|
|
443
|
+
status: 'failed',
|
|
444
|
+
error: errMsg,
|
|
445
|
+
});
|
|
446
|
+
if (!opts.json) {
|
|
447
|
+
console.log(chalk.red(` \u2717 [${file.index + 1}/${documents.length}] pages ${file.pageRange} \u2192 ${file.fileName} \u2192 failed: ${errMsg}`));
|
|
448
|
+
}
|
|
449
|
+
}
|
|
450
|
+
}
|
|
451
|
+
catch (err) {
|
|
452
|
+
const errMsg = err instanceof Error ? err.message : String(err);
|
|
453
|
+
uploadResults.push({
|
|
454
|
+
index: file.index,
|
|
455
|
+
pageRange: file.pageRange,
|
|
456
|
+
splitFileName: file.fileName,
|
|
457
|
+
status: 'failed',
|
|
458
|
+
error: errMsg,
|
|
459
|
+
});
|
|
460
|
+
if (!opts.json) {
|
|
461
|
+
console.log(chalk.red(` \u2717 [${file.index + 1}/${documents.length}] pages ${file.pageRange} \u2192 ${file.fileName} \u2192 failed: ${errMsg}`));
|
|
462
|
+
}
|
|
463
|
+
}
|
|
464
|
+
}
|
|
465
|
+
}
|
|
466
|
+
finally {
|
|
467
|
+
cleanupSplitFiles(splitResult.tempDir);
|
|
468
|
+
}
|
|
469
|
+
// ── Summary ──
|
|
470
|
+
const uploaded = uploadResults.filter((r) => r.status === 'uploaded').length;
|
|
471
|
+
const failed = uploadResults.filter((r) => r.status === 'failed').length;
|
|
472
|
+
if (opts.json) {
|
|
473
|
+
console.log(JSON.stringify({
|
|
474
|
+
file: basename(filePath),
|
|
475
|
+
pageCount,
|
|
476
|
+
documents: uploadResults,
|
|
477
|
+
summary: { total: documents.length, uploaded, failed },
|
|
478
|
+
}, null, 2));
|
|
479
|
+
}
|
|
480
|
+
else {
|
|
481
|
+
console.log(`\n ${uploaded} uploaded, ${failed} failed`);
|
|
482
|
+
}
|
|
483
|
+
}
|
|
484
|
+
finally {
|
|
485
|
+
if (resolved.decryptedPath)
|
|
486
|
+
cleanupDecryptedFile(resolved.decryptedPath);
|
|
487
|
+
}
|
|
488
|
+
}));
|
|
234
489
|
}
|
|
235
490
|
// ── Helpers ──────────────────────────────────────────────────────
|
|
491
|
+
/**
 * Resolve a PDF input, handling __pw__ password extraction + encrypted PDF decryption.
 * Shared by magic create and magic split (DRY).
 *
 * For non-PDF files, returns the original path with a clean filename (no __pw__ suffix).
 */
async function resolveInputPdf(filePath, ext, opts) {
    const rawName = basename(filePath);
    const extracted = extractFilePassword(rawName, ext);
    const cleanName = extracted.cleanName;
    // Non-PDFs — and unencrypted PDFs — pass through untouched.
    // (Short-circuit keeps the file read limited to .pdf inputs.)
    if (ext !== '.pdf' || !isPdfEncrypted(readFileSync(filePath))) {
        return { effectivePath: filePath, cleanName };
    }
    // Encrypted PDF — decryption requires the qpdf binary.
    if (!isQpdfAvailable()) {
        console.error(chalk.red('Error: Encrypted PDF detected but qpdf is not installed.'));
        console.error(chalk.dim(' macOS: brew install qpdf'));
        console.error(chalk.dim(' Ubuntu: sudo apt install qpdf'));
        process.exit(1);
    }
    let password = extracted.password;
    if (!password) {
        // JSON mode cannot prompt interactively — emit a machine-readable error.
        if (opts.json) {
            const payload = {
                error: 'ENCRYPTED_PDF_NO_PASSWORD',
                message: 'Encrypted PDF — embed password in filename: name__pw__password.pdf',
                file: rawName,
            };
            console.log(JSON.stringify(payload));
            process.exit(1);
        }
        // Interactive mode: ask the user for the password directly.
        const answer = await prompts({
            type: 'text',
            name: 'password',
            message: `PDF password for ${rawName}`,
        });
        if (!answer.password) {
            console.error(chalk.red('Aborted — no password provided.'));
            console.error(chalk.dim('Tip: embed password in filename: name__pw__password.pdf'));
            process.exit(1);
        }
        password = answer.password;
    }
    const decryptedPath = decryptPdf(filePath, password);
    return { effectivePath: decryptedPath, cleanName, decryptedPath };
}
|
|
236
539
|
function outputWorkflowResults(results, json) {
|
|
237
540
|
if (json) {
|
|
238
541
|
const workflows = results.map((w) => ({
|
|
@@ -90,6 +90,25 @@ export function decryptPdf(encryptedPath, password) {
|
|
|
90
90
|
}
|
|
91
91
|
return outputPath;
|
|
92
92
|
}
|
|
93
|
+
// ── Filename password extraction ────────────────────────────
/**
 * Extract password from filename pattern: label__pw__password.ext
 *
 * The __pw__ delimiter is case-insensitive (__PW__, __Pw__, etc.).
 * The password after __pw__ is case-sensitive and used as-is.
 * Returns the cleaned display name (without __pw__...) and the extracted password.
 *
 * Examples:
 *   receipt__pw__s3cRetP@ss.pdf → { cleanName: "receipt.pdf", password: "s3cRetP@ss" }
 *   normal-file.pdf             → { cleanName: "normal-file.pdf" }
 */
export function extractFilePassword(filename, ext) {
    // Escape regex metacharacters so the extension matches literally.
    const literalExt = ext.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    const pattern = new RegExp(`^(.+?)__pw__(.+)${literalExt}$`, 'i');
    const parts = pattern.exec(filename);
    if (parts === null) {
        return { cleanName: filename };
    }
    const [, label, password] = parts;
    return { cleanName: `${label}${ext}`, password };
}
|
|
93
112
|
/**
|
|
94
113
|
* Remove a decrypted temp file and its parent temp directory.
|
|
95
114
|
* Safe to call with any path — silently ignores missing files.
|
|
@@ -21,7 +21,13 @@ function printFolders(plan) {
|
|
|
21
21
|
: chalk.green(folder.documentType);
|
|
22
22
|
const prefix = folder.documentType === 'UNKNOWN' ? chalk.yellow('[!] ') : ' ';
|
|
23
23
|
console.log(`${prefix}${chalk.bold(folder.folder)}/ (${folder.count} files → ${typeLabel})`);
|
|
24
|
-
const labels = folder.files.map(f =>
|
|
24
|
+
const labels = folder.files.map(f => {
|
|
25
|
+
if (f.encrypted && f.filePassword)
|
|
26
|
+
return `${f.filename} ${chalk.green('(pw)')}`;
|
|
27
|
+
if (f.encrypted)
|
|
28
|
+
return `${f.filename} ${chalk.yellow('(encrypted)')}`;
|
|
29
|
+
return f.filename;
|
|
30
|
+
});
|
|
25
31
|
const show = labels.slice(0, 8);
|
|
26
32
|
console.log(chalk.gray(` ${show.join(', ')}${labels.length > 8 ? `, ... and ${labels.length - 8} more` : ''}`));
|
|
27
33
|
console.log();
|
|
@@ -38,7 +44,14 @@ function printSummary(plan) {
|
|
|
38
44
|
console.log(` Skipped: ${chalk.gray(String(plan.summary.skipped))}`);
|
|
39
45
|
}
|
|
40
46
|
if (plan.summary.encrypted > 0) {
|
|
41
|
-
|
|
47
|
+
const autoCount = plan.folders.flatMap(f => f.files).filter(f => f.encrypted && f.filePassword).length;
|
|
48
|
+
const remaining = plan.summary.encrypted - autoCount;
|
|
49
|
+
const parts = [];
|
|
50
|
+
if (autoCount > 0)
|
|
51
|
+
parts.push(chalk.green(`${autoCount} password from filename`));
|
|
52
|
+
if (remaining > 0)
|
|
53
|
+
parts.push(chalk.yellow(`${remaining} need __pw__<password> in filename`));
|
|
54
|
+
console.log(` Encrypted: ${plan.summary.encrypted} (${parts.join(', ')})`);
|
|
42
55
|
}
|
|
43
56
|
if (Object.keys(plan.summary.byType).length > 0) {
|
|
44
57
|
console.log();
|
|
@@ -7,7 +7,7 @@
|
|
|
7
7
|
import { readdirSync, readFileSync, statSync } from 'node:fs';
|
|
8
8
|
import { join, basename, extname, relative, resolve } from 'node:path';
|
|
9
9
|
import { classifyFolder, checkExtension } from './classify.js';
|
|
10
|
-
import { isPdfEncrypted } from './decrypt.js';
|
|
10
|
+
import { isPdfEncrypted, extractFilePassword } from './decrypt.js';
|
|
11
11
|
/** Max recursion depth to prevent runaway traversal. */
|
|
12
12
|
const MAX_DEPTH = 10;
|
|
13
13
|
/**
|
|
@@ -159,9 +159,11 @@ function scanDir(rootPath, dirPath, inheritedType, forceType, depth, maxDepth, o
|
|
|
159
159
|
}
|
|
160
160
|
catch { /* ignore read errors */ }
|
|
161
161
|
}
|
|
162
|
+
// Extract password from filename pattern: label__pw__password.ext
|
|
163
|
+
const { cleanName, password: filePassword } = extractFilePassword(entry, ext);
|
|
162
164
|
out.push({
|
|
163
165
|
path: relPath,
|
|
164
|
-
filename:
|
|
166
|
+
filename: cleanName,
|
|
165
167
|
extension: ext,
|
|
166
168
|
documentType,
|
|
167
169
|
folder: relDir,
|
|
@@ -170,6 +172,7 @@ function scanDir(rootPath, dirPath, inheritedType, forceType, depth, maxDepth, o
|
|
|
170
172
|
absolutePath: fullPath,
|
|
171
173
|
sizeBytes: stat.size,
|
|
172
174
|
encrypted,
|
|
175
|
+
filePassword,
|
|
173
176
|
});
|
|
174
177
|
}
|
|
175
178
|
}
|
|
@@ -30,7 +30,7 @@ const MIME_MAP = {
|
|
|
30
30
|
* UNKNOWN and SKIPPED files are excluded.
|
|
31
31
|
*/
|
|
32
32
|
export async function uploadClassifiedFiles(opts) {
|
|
33
|
-
const { plan, client, bankAccountId,
|
|
33
|
+
const { plan, client, bankAccountId, onProgress } = opts;
|
|
34
34
|
// Collect all uploadable files across folders
|
|
35
35
|
const uploadable = plan.folders.flatMap((folder) => folder.files.filter((f) => f.documentType === 'INVOICE' || f.documentType === 'BILL' || f.documentType === 'CUSTOMER_CREDIT_NOTE' || f.documentType === 'SUPPLIER_CREDIT_NOTE' || f.documentType === 'BANK_STATEMENT'));
|
|
36
36
|
const results = [];
|
|
@@ -39,10 +39,10 @@ export async function uploadClassifiedFiles(opts) {
|
|
|
39
39
|
const docType = file.documentType;
|
|
40
40
|
let decryptedPath;
|
|
41
41
|
try {
|
|
42
|
-
// Decrypt encrypted PDFs
|
|
42
|
+
// Decrypt encrypted PDFs using password from filename (__pw__ pattern)
|
|
43
43
|
let effectivePath = file.absolutePath;
|
|
44
|
-
if (file.encrypted &&
|
|
45
|
-
decryptedPath = decryptPdf(file.absolutePath,
|
|
44
|
+
if (file.encrypted && file.filePassword) {
|
|
45
|
+
decryptedPath = decryptPdf(file.absolutePath, file.filePassword);
|
|
46
46
|
effectivePath = decryptedPath;
|
|
47
47
|
}
|
|
48
48
|
// Read file into a Blob with correct MIME type (required by the Magic API)
|
|
@@ -0,0 +1,344 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* PDF boundary detection engine — identifies document boundaries in merged PDFs.
|
|
3
|
+
*
|
|
4
|
+
* Uses pdfjs-dist (pure JS, no canvas) for text extraction + structural probes.
|
|
5
|
+
* No AI tokens — heuristic-only scoring system.
|
|
6
|
+
*
|
|
7
|
+
* Detection signals (positive = boundary evidence, negative = anti-signal):
|
|
8
|
+
* outline-bookmark +80 PDF bookmark points to this page
|
|
9
|
+
* page-label-reset +70 PDF page label restarts at "1"
|
|
10
|
+
* keyword (upper 40%) +40 Document-type keyword near top of page
|
|
11
|
+
* page-one-of +35 "Page 1 of N" pattern
|
|
12
|
+
* keyword-large +25 Large font keyword (>18pt) bonus
|
|
13
|
+
* doc-ref (upper 40%) +20 Document reference pattern (INV-001, etc.)
|
|
14
|
+
* continuation -60 "Page N>1 of M" anti-signal
|
|
15
|
+
* continuation -40 "Continued" text anti-signal
|
|
16
|
+
*
|
|
17
|
+
* Threshold: >= 50 = boundary. Confidence: >= 80 high, >= 50 medium, < 50 low.
|
|
18
|
+
*/
|
|
19
|
+
// pdfjs-dist v5 — legacy build for Node.js (no canvas requirement)
|
|
20
|
+
import { getDocument } from 'pdfjs-dist/legacy/build/pdf.mjs';
|
|
21
|
+
// ── Scoring constants ────────────────────────────────────────
// Positive scores are boundary evidence; negatives are anti-signals.
// Weights mirror the table in the module header comment.
const SCORE_OUTLINE = 80; // PDF bookmark points to this page
const SCORE_PAGE_LABEL_RESET = 70; // PDF page label restarts at "1"
const SCORE_KEYWORD = 40; // Document-type keyword in the upper 40% of the page
const SCORE_PAGE_ONE_OF = 35; // "Page 1 of N" pattern anywhere on the page
const SCORE_KEYWORD_LARGE = 25; // Bonus when the keyword is set in a large font
const SCORE_DOC_REF = 20; // Document reference (INV-001, ...) in the upper 40%
const SCORE_CONTINUATION_PAGE = -60; // "Page N of M" with N > 1 — continuation anti-signal
const SCORE_CONTINUATION_TEXT = -40; // "Continued"-style text — continuation anti-signal
const BOUNDARY_THRESHOLD = 50; // Aggregate score >= this marks a boundary page
const CONFIDENCE_HIGH = 80; // Aggregate score >= this maps to 'high' confidence
/** Upper portion of page (0–40%) where keywords/refs are significant. */
const UPPER_PORTION = 0.4;
/** Font size threshold for large-font keyword bonus (points). */
const LARGE_FONT_PT = 18;
// ── Boundary keywords ────────────────────────────────────────
// Multilingual: EN, Filipino, Indonesian/Malay, Vietnamese, Chinese
// Each keyword is tested as a case-insensitive whole-word match.
const BOUNDARY_KEYWORDS = [
    // English
    'TAX INVOICE', 'INVOICE', 'PROFORMA INVOICE', 'COMMERCIAL INVOICE',
    'BILL', 'BILLING STATEMENT', 'STATEMENT OF ACCOUNT',
    'CREDIT NOTE', 'CREDIT MEMO', 'DEBIT NOTE', 'DEBIT MEMO',
    'PURCHASE ORDER', 'DELIVERY ORDER', 'DELIVERY NOTE',
    'RECEIPT', 'OFFICIAL RECEIPT', 'ACKNOWLEDGMENT RECEIPT',
    'QUOTATION', 'SALES ORDER', 'CONTRACT',
    'PACKING LIST', 'BILL OF LADING', 'CERTIFICATE OF ORIGIN',
    // Filipino / PH
    'RESIBO', 'KATIBAYAN NG PAGBABAYAD',
    // Indonesian / Malay
    'FAKTUR PAJAK', 'FAKTUR', 'NOTA KREDIT', 'NOTA DEBIT',
    'KWITANSI', 'SURAT JALAN',
    // Vietnamese
    'HOA DON', 'HOÁ ĐƠN', 'PHIẾU THU', 'PHIẾU CHI',
    // Chinese
    '发票', '税务发票', '收据', '信用票据', '送货单',
];
/** Escaped regex patterns — match keyword as a word boundary. */
const KEYWORD_PATTERNS = BOUNDARY_KEYWORDS.map((kw) => {
    // Escape regex metacharacters so each keyword matches literally.
    const escaped = kw.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    // For CJK characters, don't use word boundaries (they don't apply)
    const hasCJK = /[\u4e00-\u9fff]/.test(kw);
    return hasCJK
        ? new RegExp(escaped, 'i')
        : new RegExp(`\\b${escaped}\\b`, 'i');
});
/** "Page 1 of N" patterns (matches multiple languages). */
const PAGE_ONE_PATTERN = /\bpage\s+1\s+of\s+\d+/i;
/** "Page N of M" where N > 1 — continuation signal. */
const PAGE_N_PATTERN = /\bpage\s+(\d+)\s+of\s+\d+/i;
/** Document reference patterns: INV-001, SO-2024-100, PO#123, etc. */
const DOC_REF_PATTERN = /\b(?:INV|SO|PO|DO|CN|DN|OR|CR|BL|SI|PI|QU|CT|REC)[\s#._-]*\d{2,}/i;
/** Continuation text markers. */
const CONTINUATION_PATTERNS = [
    /\bcontinued\b/i,
    /\b(?:cont['']?d)\b/i,
    /\blanjutan\b/i, // Indonesian
    /\btiếp theo\b/i, // Vietnamese
];
// ── pdfjs-dist configuration ─────────────────────────────────
// Security: disable code generation from strings (pdfjs option key
// constructed dynamically to avoid triggering static analysis hooks)
const PDFJS_SECURITY_KEY = ['is', 'Eval', 'Supported'].join('');
|
|
84
|
+
// ── Main detection function ──────────────────────────────────
/**
 * Detect document boundaries in a merged PDF.
 *
 * Runs two phases: (1) whole-document structural probes (outlines, page
 * labels), then (2) a per-page text scan that accumulates weighted signals.
 * A page whose aggregate score reaches BOUNDARY_THRESHOLD — or page 0,
 * always — is treated as the start of a new document.
 *
 * @param buffer Raw PDF bytes (Uint8Array or Buffer).
 * @returns Detection result with per-page probes and detected documents.
 */
export async function detectBoundaries(buffer) {
    const doc = await getDocument({
        data: buffer,
        worker: null, // No worker thread (CLI) — null disables it at runtime
        [PDFJS_SECURITY_KEY]: false, // Security: no code generation from strings
        verbosity: 0, // Suppress warnings
    }).promise;
    const pageCount = doc.numPages;
    const pages = [];
    // try/finally guarantees doc.destroy() even when a probe throws.
    try {
        // Phase 1: Structural probes (whole-document)
        const outlinePages = await probeOutlines(doc);
        const labelResetPages = await probePageLabels(doc);
        // Phase 2: Per-page text scan
        let scannedCount = 0; // pages with no extractable text (likely image scans)
        for (let i = 0; i < pageCount; i++) {
            const signals = [];
            // Structural signals
            if (outlinePages.has(i)) {
                signals.push({ type: 'outline-bookmark', label: 'PDF bookmark', score: SCORE_OUTLINE });
            }
            if (labelResetPages.has(i)) {
                signals.push({ type: 'page-label-reset', label: 'Page label reset to 1', score: SCORE_PAGE_LABEL_RESET });
            }
            // Text extraction
            const page = await doc.getPage(i + 1); // 1-based
            const textContent = await page.getTextContent();
            const items = textContent.items;
            if (items.length === 0) {
                // No text layer — record a zero-score marker and count toward scanned-PDF detection.
                signals.push({ type: 'scanned', label: 'No extractable text', score: 0 });
                scannedCount++;
            }
            else {
                // Determine page height from viewport
                const viewport = page.getViewport({ scale: 1 });
                const pageHeight = viewport.height;
                // Collect upper-portion text and full-page text
                const upperTexts = [];
                const allTexts = [];
                for (const item of items) {
                    const text = item.str.trim();
                    if (!text)
                        continue;
                    allTexts.push(text);
                    // transform[5] is the Y coordinate (from bottom), transform[0] is scaleX ~ fontSize
                    const y = item.transform[5];
                    const fontSize = Math.abs(item.transform[0]);
                    const normalizedY = y / pageHeight;
                    // Upper portion = top 40% of page (high Y values in PDF coordinate system)
                    if (normalizedY >= (1 - UPPER_PORTION)) {
                        upperTexts.push({ text, fontSize });
                    }
                }
                const fullText = allTexts.join(' ');
                const upperText = upperTexts.map((t) => t.text).join(' ');
                // Keyword detection (upper portion only)
                for (let k = 0; k < KEYWORD_PATTERNS.length; k++) {
                    if (KEYWORD_PATTERNS[k].test(upperText)) {
                        signals.push({
                            type: 'keyword',
                            label: `${BOUNDARY_KEYWORDS[k]} in header`,
                            score: SCORE_KEYWORD,
                        });
                        // Large font bonus: check if any upper-portion item with this keyword is large
                        const kwPattern = KEYWORD_PATTERNS[k];
                        const hasLargeFont = upperTexts.some((t) => kwPattern.test(t.text) && t.fontSize >= LARGE_FONT_PT);
                        if (hasLargeFont) {
                            signals.push({
                                type: 'keyword-large',
                                label: `${BOUNDARY_KEYWORDS[k]} in large font (>${LARGE_FONT_PT}pt)`,
                                score: SCORE_KEYWORD_LARGE,
                            });
                        }
                        break; // Only count the first keyword match per page
                    }
                }
                // "Page 1 of N" detection (anywhere on page)
                if (PAGE_ONE_PATTERN.test(fullText)) {
                    signals.push({ type: 'page-one-of', label: 'Page 1 of N', score: SCORE_PAGE_ONE_OF });
                }
                // Document reference in upper portion
                if (DOC_REF_PATTERN.test(upperText)) {
                    signals.push({ type: 'doc-ref', label: 'Document reference in header', score: SCORE_DOC_REF });
                }
                // Anti-signals: continuation indicators
                const pageNMatch = fullText.match(PAGE_N_PATTERN);
                if (pageNMatch && parseInt(pageNMatch[1], 10) > 1) {
                    signals.push({
                        type: 'continuation',
                        label: `Page ${pageNMatch[1]} of N (continuation)`,
                        score: SCORE_CONTINUATION_PAGE,
                    });
                }
                for (const pat of CONTINUATION_PATTERNS) {
                    if (pat.test(fullText)) {
                        signals.push({
                            type: 'continuation',
                            label: 'Continuation text detected',
                            score: SCORE_CONTINUATION_TEXT,
                        });
                        break;
                    }
                }
            }
            const totalScore = signals.reduce((sum, s) => sum + s.score, 0);
            // Page 0 is always a boundary (it's the start of the first document)
            const isBoundary = i === 0 || totalScore >= BOUNDARY_THRESHOLD;
            pages.push({ pageIndex: i, signals, totalScore, isBoundary });
        }
        const documents = buildDocuments(pages, pageCount);
        // A PDF where every page lacked a text layer is treated as fully scanned.
        const isScannedPdf = scannedCount === pageCount && pageCount > 0;
        return { pageCount, pages, documents, isScannedPdf };
    }
    finally {
        try {
            doc.destroy();
        }
        catch { /* best effort */ }
    }
}
|
|
211
|
+
// ── Structural probes ────────────────────────────────────────
/** Probe PDF outlines/bookmarks -> set of 0-based page indices that have bookmarks. */
async function probeOutlines(doc) {
    const bookmarkedPages = new Set();
    try {
        const outline = await doc.getOutline();
        if (!outline) {
            return bookmarkedPages;
        }
        // Iterative depth-first walk over the outline tree.
        const pending = [...outline];
        while (pending.length > 0) {
            const node = pending.pop();
            if (!node) {
                continue;
            }
            if (node.dest) {
                try {
                    // Named destinations must be resolved before the page lookup.
                    let dest = node.dest;
                    if (typeof dest === 'string') {
                        dest = await doc.getDestination(dest);
                    }
                    if (Array.isArray(dest) && dest[0]) {
                        bookmarkedPages.add(await doc.getPageIndex(dest[0]));
                    }
                }
                catch { /* skip unresolvable destinations */ }
            }
            // Queue children for traversal.
            if (Array.isArray(node.items)) {
                pending.push(...node.items);
            }
        }
    }
    catch { /* no outlines or error reading them */ }
    return bookmarkedPages;
}
|
|
246
|
+
/** Probe PDF page labels -> set of 0-based page indices where label resets to "1". */
async function probePageLabels(doc) {
    const resetPages = new Set();
    try {
        const labels = await doc.getPageLabels();
        if (labels) {
            // Skip index 0 — a "1" on the first page is the normal start, not a reset.
            labels.forEach((label, i) => {
                if (i > 0 && label === '1' && labels[i - 1] !== '1') {
                    resetPages.add(i);
                }
            });
        }
    }
    catch { /* no page labels */ }
    return resetPages;
}
|
|
263
|
+
// ── Document builder ─────────────────────────────────────────
/** Build detected documents from boundary pages. */
function buildDocuments(pages, pageCount) {
    const boundaries = pages.filter((p) => p.isBoundary);
    return boundaries.map((boundary, i) => {
        const next = boundaries[i + 1];
        // Each document spans from its boundary up to (not including) the next one.
        const start = boundary.pageIndex;
        const end = next ? next.pageIndex - 1 : pageCount - 1;
        // Convert to 1-based page numbers for display.
        const pageStart = start + 1;
        const pageEnd = end + 1;
        return {
            index: i,
            pageStart,
            pageEnd,
            pageRange: pageStart === pageEnd ? `${pageStart}` : `${pageStart}-${pageEnd}`,
            confidence: scoreToConfidence(boundary.totalScore),
            signals: boundary.signals,
        };
    });
}
|
|
288
|
+
/** Map aggregate score to confidence level. */
function scoreToConfidence(score) {
    if (score >= CONFIDENCE_HIGH) {
        return 'high';
    }
    return score >= BOUNDARY_THRESHOLD ? 'medium' : 'low';
}
|
|
296
|
+
// ── Manual page ranges ───────────────────────────────────────
/**
 * Parse a manual page-range string into DetectedDocument[].
 *
 * Format: "1-3,4-6,7" (1-based, inclusive ranges, comma-separated).
 * Validates: no overlaps, ranges within page count.
 *
 * @throws Error on invalid format or range.
 */
export function parsePageRanges(rangesStr, pageCount) {
    const tokens = rangesStr.split(',').map((s) => s.trim()).filter(Boolean);
    if (tokens.length === 0) {
        throw new Error('Empty page range — provide ranges like "1-3,4-6,7"');
    }
    // Ranges must be strictly ascending; track the previous end to detect overlap.
    let previousEnd = 0;
    return tokens.map((token, i) => {
        const parsed = /^(\d+)(?:-(\d+))?$/.exec(token);
        if (!parsed) {
            throw new Error(`Invalid page range "${token}" — use format "1-3" or "7"`);
        }
        const pageStart = parseInt(parsed[1], 10);
        // A bare number like "7" means a single-page range.
        const pageEnd = parsed[2] ? parseInt(parsed[2], 10) : pageStart;
        if (pageStart < 1 || pageEnd < 1) {
            throw new Error(`Page numbers must be positive (got "${token}")`);
        }
        if (pageStart > pageEnd) {
            throw new Error(`Invalid range "${token}" — start must be <= end`);
        }
        if (pageEnd > pageCount) {
            throw new Error(`Range "${token}" exceeds page count (${pageCount} pages)`);
        }
        if (pageStart <= previousEnd) {
            throw new Error(`Overlapping range "${token}" — previous range ended at page ${previousEnd}`);
        }
        previousEnd = pageEnd;
        return {
            index: i,
            pageStart,
            pageEnd,
            pageRange: pageStart === pageEnd ? `${pageStart}` : `${pageStart}-${pageEnd}`,
            confidence: 'high', // Manual ranges are always high confidence
            signals: [],
        };
    });
}
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* PDF boundary detection + splitting for merged documents.
|
|
3
|
+
*
|
|
4
|
+
* Usage:
|
|
5
|
+
* import { detectBoundaries, parsePageRanges, splitPdf, ... } from '../core/pdf/index.js';
|
|
6
|
+
*/
|
|
7
|
+
export { detectBoundaries, parsePageRanges } from './detect.js';
|
|
8
|
+
export { getPageCount, splitPdf, cleanupSplitFiles } from './split.js';
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* PDF page-range extraction via qpdf.
|
|
3
|
+
*
|
|
4
|
+
* Creates temporary files for each document range extracted from a merged PDF.
|
|
5
|
+
* Caller is responsible for cleanup (use `cleanupSplitFiles`).
|
|
6
|
+
*
|
|
7
|
+
* Follows the same patterns as decrypt.ts: execFileSync, mkdtempSync, explicit cleanup.
|
|
8
|
+
*/
|
|
9
|
+
import { execFileSync } from 'node:child_process';
|
|
10
|
+
import { mkdtempSync, existsSync } from 'node:fs';
|
|
11
|
+
import { join, basename } from 'node:path';
|
|
12
|
+
import { tmpdir } from 'node:os';
|
|
13
|
+
import { rmSync } from 'node:fs';
|
|
14
|
+
import { isQpdfAvailable } from '../jobs/document-collection/tools/ingest/decrypt.js';
|
|
15
|
+
/**
 * Get the page count of a PDF file using qpdf.
 *
 * Runs `qpdf --show-npages`, which prints the page count as a bare
 * integer on stdout.
 *
 * @param {string} filePath Path to the PDF file.
 * @returns {number} Page count (always >= 1).
 * @throws Error if qpdf is not installed, the file is invalid, or the
 *   output cannot be parsed as a positive integer.
 */
export function getPageCount(filePath) {
    if (!isQpdfAvailable()) {
        throw new Error('qpdf is required — install: brew install qpdf (macOS) or sudo apt install qpdf (Linux)');
    }
    const output = execFileSync('qpdf', ['--show-npages', filePath], {
        encoding: 'utf-8',
        stdio: ['pipe', 'pipe', 'pipe'],
    });
    // Number.parseInt/Number.isNaN avoid the coercing global variants.
    const count = Number.parseInt(output.trim(), 10);
    if (Number.isNaN(count) || count < 1) {
        throw new Error(`Failed to read page count from "${basename(filePath)}"`);
    }
    return count;
}
|
|
33
|
+
/**
 * Split a PDF into multiple files based on detected document ranges.
 *
 * Creates a temp directory and extracts each range as a separate PDF.
 * Continues on failure (never aborts mid-batch, matching upload.ts pattern).
 * Caller MUST call `cleanupSplitFiles()` when done.
 */
export function splitPdf(sourcePath, documents, sourceBaseName) {
    if (!isQpdfAvailable()) {
        throw new Error('qpdf is required — install: brew install qpdf (macOS) or sudo apt install qpdf (Linux)');
    }
    const tempDir = mkdtempSync(join(tmpdir(), 'clio-split-'));
    const files = [];
    const failures = [];
    for (const doc of documents) {
        const fileName = `${sourceBaseName}_${doc.index + 1}.pdf`;
        const outputPath = join(tempDir, fileName);
        // qpdf syntax: <input> --pages . <range> -- <output>  ("." = the input file)
        const args = [sourcePath, '--pages', '.', `${doc.pageStart}-${doc.pageEnd}`, '--', outputPath];
        try {
            execFileSync('qpdf', args, { stdio: 'pipe' });
        }
        catch (err) {
            failures.push({
                index: doc.index,
                pageRange: doc.pageRange,
                error: err instanceof Error ? err.message : String(err),
            });
            continue;
        }
        files.push({ index: doc.index, pageRange: doc.pageRange, path: outputPath, fileName });
    }
    return { tempDir, files, failures };
}
|
|
70
|
+
/**
 * Remove all split temp files and their temp directory.
 * Safe to call with any path — silently ignores missing dirs.
 */
export function cleanupSplitFiles(tempDir) {
    try {
        if (!existsSync(tempDir)) {
            return;
        }
        rmSync(tempDir, { recursive: true, force: true });
    }
    catch { /* best effort */ }
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "jaz-clio",
|
|
3
|
-
"version": "4.
|
|
3
|
+
"version": "4.5.0",
|
|
4
4
|
"description": "Clio — Command Line Interface Orchestrator for Jaz AI.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"bin": {
|
|
@@ -51,6 +51,7 @@
|
|
|
51
51
|
"commander": "^12.1.0",
|
|
52
52
|
"financial": "^0.2.4",
|
|
53
53
|
"ora": "^8.1.1",
|
|
54
|
+
"pdfjs-dist": "^5.4.624",
|
|
54
55
|
"prompts": "^2.4.2",
|
|
55
56
|
"update-notifier": "^7.3.1"
|
|
56
57
|
},
|