@claritylabs/cl-sdk 0.18.0 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +8 -1
- package/dist/index.d.mts +9 -0
- package/dist/index.d.ts +9 -0
- package/dist/index.js +308 -102
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +308 -102
- package/dist/index.mjs.map +1 -1
- package/dist/storage-sqlite.js +2 -1
- package/dist/storage-sqlite.js.map +1 -1
- package/dist/storage-sqlite.mjs +2 -1
- package/dist/storage-sqlite.mjs.map +1 -1
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -369,10 +369,11 @@ async function withRetry(fn, log) {
|
|
|
369
369
|
|
|
370
370
|
// src/core/concurrency.ts
|
|
371
371
|
function pLimit(concurrency) {
|
|
372
|
+
const maxConcurrency = Number.isFinite(concurrency) ? Math.max(1, Math.floor(concurrency)) : 1;
|
|
372
373
|
let active = 0;
|
|
373
374
|
const queue = [];
|
|
374
375
|
function next() {
|
|
375
|
-
if (queue.length > 0 && active <
|
|
376
|
+
if (queue.length > 0 && active < maxConcurrency) {
|
|
376
377
|
active++;
|
|
377
378
|
queue.shift()();
|
|
378
379
|
}
|
|
@@ -2663,6 +2664,35 @@ async function extractPageRange(input, startPage, endPage) {
|
|
|
2663
2664
|
const bytes = await newDoc.save();
|
|
2664
2665
|
return bytesToBase64(new Uint8Array(bytes));
|
|
2665
2666
|
}
|
|
2667
|
+
async function createPdfPageSlicer(input) {
|
|
2668
|
+
if (isFileIdRef(input)) {
|
|
2669
|
+
throw new Error(
|
|
2670
|
+
"Cannot create a page slicer from a fileId reference. Pass the full PDF as base64/bytes, or provide pre-rendered page assets."
|
|
2671
|
+
);
|
|
2672
|
+
}
|
|
2673
|
+
const srcBytes = await pdfInputToBytes(input);
|
|
2674
|
+
const srcDoc = await import_pdf_lib.PDFDocument.load(srcBytes, { ignoreEncryption: true });
|
|
2675
|
+
const totalPages = srcDoc.getPageCount();
|
|
2676
|
+
const originalBase64 = isBytes(input) ? bytesToBase64(input) : typeof input === "string" ? input : bytesToBase64(srcBytes);
|
|
2677
|
+
return {
|
|
2678
|
+
getPageCount() {
|
|
2679
|
+
return totalPages;
|
|
2680
|
+
},
|
|
2681
|
+
async extractPageRange(startPage, endPage) {
|
|
2682
|
+
const start = Math.max(startPage - 1, 0);
|
|
2683
|
+
const end = Math.min(endPage, totalPages) - 1;
|
|
2684
|
+
if (start === 0 && end >= totalPages - 1) {
|
|
2685
|
+
return originalBase64;
|
|
2686
|
+
}
|
|
2687
|
+
const newDoc = await import_pdf_lib.PDFDocument.create();
|
|
2688
|
+
const indices = Array.from({ length: end - start + 1 }, (_, i) => start + i);
|
|
2689
|
+
const pages = await newDoc.copyPages(srcDoc, indices);
|
|
2690
|
+
pages.forEach((page) => newDoc.addPage(page));
|
|
2691
|
+
const bytes = await newDoc.save();
|
|
2692
|
+
return bytesToBase64(new Uint8Array(bytes));
|
|
2693
|
+
}
|
|
2694
|
+
};
|
|
2695
|
+
}
|
|
2666
2696
|
async function buildPdfProviderOptions(input, existingOptions) {
|
|
2667
2697
|
const options = { ...existingOptions };
|
|
2668
2698
|
if (isFileIdRef(input)) {
|
|
@@ -2810,20 +2840,19 @@ async function runExtractor(params) {
|
|
|
2810
2840
|
} = params;
|
|
2811
2841
|
const extractorProviderOptions = { ...providerOptions };
|
|
2812
2842
|
let fullPrompt;
|
|
2813
|
-
const
|
|
2843
|
+
const needsPdfBase64 = convertPdfToImages && !params.getPageImages || !convertPdfToImages && !params.getPageRangePdf;
|
|
2844
|
+
const pdfBase64 = needsPdfBase64 ? await pdfInputToBase64(pdfInput) : void 0;
|
|
2814
2845
|
if (convertPdfToImages) {
|
|
2815
|
-
const images = await convertPdfToImages(pdfBase64, startPage, endPage);
|
|
2846
|
+
const images = params.getPageImages ? await params.getPageImages(startPage, endPage) : await convertPdfToImages(pdfBase64, startPage, endPage);
|
|
2816
2847
|
extractorProviderOptions.images = images;
|
|
2817
2848
|
fullPrompt = `${prompt}
|
|
2818
2849
|
|
|
2819
2850
|
[Document pages ${startPage}-${endPage} are provided as images.]`;
|
|
2820
2851
|
} else {
|
|
2821
2852
|
const cacheKey = `${startPage}-${endPage}`;
|
|
2822
|
-
|
|
2823
|
-
|
|
2824
|
-
|
|
2825
|
-
pageRangeCache?.set(cacheKey, pagesPdf);
|
|
2826
|
-
}
|
|
2853
|
+
const cachedPagesPdf = pageRangeCache?.get(cacheKey);
|
|
2854
|
+
const pagesPdf = cachedPagesPdf ?? (params.getPageRangePdf ? await params.getPageRangePdf(startPage, endPage) : await extractPageRange(pdfBase64, startPage, endPage));
|
|
2855
|
+
if (!cachedPagesPdf) pageRangeCache?.set(cacheKey, pagesPdf);
|
|
2827
2856
|
extractorProviderOptions.pdfBase64 = pagesPdf;
|
|
2828
2857
|
fullPrompt = `${prompt}
|
|
2829
2858
|
|
|
@@ -3809,33 +3838,40 @@ async function formatDocumentContent(doc, generateText, options) {
|
|
|
3809
3838
|
for (let i = 0; i < entries.length; i += MAX_ENTRIES_PER_BATCH) {
|
|
3810
3839
|
batches.push(entries.slice(i, i + MAX_ENTRIES_PER_BATCH));
|
|
3811
3840
|
}
|
|
3812
|
-
|
|
3813
|
-
|
|
3814
|
-
|
|
3815
|
-
|
|
3816
|
-
|
|
3817
|
-
|
|
3818
|
-
|
|
3819
|
-
|
|
3820
|
-
|
|
3821
|
-
|
|
3822
|
-
|
|
3823
|
-
|
|
3824
|
-
|
|
3825
|
-
|
|
3826
|
-
|
|
3827
|
-
|
|
3828
|
-
|
|
3841
|
+
const limit = pLimit(options?.concurrency ?? 2);
|
|
3842
|
+
const batchResults = await Promise.all(batches.map(
|
|
3843
|
+
(batch, batchIdx) => limit(async () => {
|
|
3844
|
+
try {
|
|
3845
|
+
const prompt = buildFormatPrompt(batch.map((e) => ({ id: e.id, text: e.text })));
|
|
3846
|
+
const result = await withRetry(
|
|
3847
|
+
() => generateText({
|
|
3848
|
+
prompt,
|
|
3849
|
+
maxTokens: options?.maxTokens ?? 16384,
|
|
3850
|
+
providerOptions: options?.providerOptions
|
|
3851
|
+
})
|
|
3852
|
+
);
|
|
3853
|
+
const formatted = parseFormatResponse(result.text);
|
|
3854
|
+
if (formatted.size < batch.length) {
|
|
3855
|
+
await options?.log?.(
|
|
3856
|
+
`Format batch ${batchIdx + 1}/${batches.length}: model returned ${formatted.size}/${batch.length} entries \u2014 unformatted entries will keep original content`
|
|
3857
|
+
);
|
|
3858
|
+
}
|
|
3859
|
+
return { batch, formatted, usage: result.usage };
|
|
3860
|
+
} catch (error) {
|
|
3829
3861
|
await options?.log?.(
|
|
3830
|
-
`Format batch ${batchIdx + 1}/${batches.length}
|
|
3862
|
+
`Format batch ${batchIdx + 1}/${batches.length} failed, keeping original content: ${error instanceof Error ? error.message : String(error)}`
|
|
3831
3863
|
);
|
|
3864
|
+
return void 0;
|
|
3832
3865
|
}
|
|
3833
|
-
|
|
3834
|
-
|
|
3835
|
-
|
|
3836
|
-
|
|
3837
|
-
|
|
3866
|
+
})
|
|
3867
|
+
));
|
|
3868
|
+
for (const result of batchResults) {
|
|
3869
|
+
if (!result) continue;
|
|
3870
|
+
if (result.usage) {
|
|
3871
|
+
totalUsage.inputTokens += result.usage.inputTokens;
|
|
3872
|
+
totalUsage.outputTokens += result.usage.outputTokens;
|
|
3838
3873
|
}
|
|
3874
|
+
applyFormattedContent(doc, result.batch, result.formatted);
|
|
3839
3875
|
}
|
|
3840
3876
|
return { document: doc, usage: totalUsage };
|
|
3841
3877
|
}
|
|
@@ -7106,6 +7142,8 @@ async function resolveReferentialCoverages(params) {
|
|
|
7106
7142
|
pageCount,
|
|
7107
7143
|
generateObject,
|
|
7108
7144
|
convertPdfToImages,
|
|
7145
|
+
getPageRangePdf,
|
|
7146
|
+
getPageImages,
|
|
7109
7147
|
concurrency = 2,
|
|
7110
7148
|
providerOptions,
|
|
7111
7149
|
modelCapabilities,
|
|
@@ -7215,6 +7253,8 @@ async function resolveReferentialCoverages(params) {
|
|
|
7215
7253
|
endPage: pageRange.endPage,
|
|
7216
7254
|
generateObject,
|
|
7217
7255
|
convertPdfToImages,
|
|
7256
|
+
getPageRangePdf,
|
|
7257
|
+
getPageImages,
|
|
7218
7258
|
maxTokens: budget.maxTokens,
|
|
7219
7259
|
providerOptions
|
|
7220
7260
|
});
|
|
@@ -7306,6 +7346,8 @@ async function runFocusedExtractorWithFallback(params) {
|
|
|
7306
7346
|
convertPdfToImages,
|
|
7307
7347
|
providerOptions,
|
|
7308
7348
|
pageRangeCache,
|
|
7349
|
+
getPageRangePdf,
|
|
7350
|
+
getPageImages,
|
|
7309
7351
|
trackUsage,
|
|
7310
7352
|
resolveBudget,
|
|
7311
7353
|
log
|
|
@@ -7319,6 +7361,7 @@ async function runFocusedExtractorWithFallback(params) {
|
|
|
7319
7361
|
const hintTokens = ext.maxTokens ?? 4096;
|
|
7320
7362
|
const taskKind = hintTokens >= 8192 ? "extraction_long_list" : "extraction_focused";
|
|
7321
7363
|
const budget = resolveBudget(taskKind, hintTokens);
|
|
7364
|
+
const startedAt = Date.now();
|
|
7322
7365
|
const result = await runExtractor({
|
|
7323
7366
|
name: task.extractorName,
|
|
7324
7367
|
prompt: ext.buildPrompt(),
|
|
@@ -7330,12 +7373,15 @@ async function runFocusedExtractorWithFallback(params) {
|
|
|
7330
7373
|
convertPdfToImages,
|
|
7331
7374
|
maxTokens: budget.maxTokens,
|
|
7332
7375
|
providerOptions,
|
|
7333
|
-
pageRangeCache
|
|
7376
|
+
pageRangeCache,
|
|
7377
|
+
getPageRangePdf,
|
|
7378
|
+
getPageImages
|
|
7334
7379
|
});
|
|
7335
7380
|
trackUsage(result.usage, {
|
|
7336
7381
|
taskKind,
|
|
7337
7382
|
label: task.extractorName,
|
|
7338
|
-
maxTokens: budget.maxTokens
|
|
7383
|
+
maxTokens: budget.maxTokens,
|
|
7384
|
+
durationMs: Date.now() - startedAt
|
|
7339
7385
|
});
|
|
7340
7386
|
if (!ext.fallback?.isEmpty(result.data)) {
|
|
7341
7387
|
return result;
|
|
@@ -7358,6 +7404,7 @@ async function runFocusedExtractorWithFallback(params) {
|
|
|
7358
7404
|
const hintTokens = fallbackExt.maxTokens ?? 4096;
|
|
7359
7405
|
const taskKind = hintTokens >= 8192 ? "extraction_long_list" : "extraction_focused";
|
|
7360
7406
|
const budget = resolveBudget(taskKind, hintTokens);
|
|
7407
|
+
const startedAt = Date.now();
|
|
7361
7408
|
const fallbackResult = await runExtractor({
|
|
7362
7409
|
name: ext.fallback.extractorName,
|
|
7363
7410
|
prompt: fallbackExt.buildPrompt(),
|
|
@@ -7369,12 +7416,15 @@ async function runFocusedExtractorWithFallback(params) {
|
|
|
7369
7416
|
convertPdfToImages,
|
|
7370
7417
|
maxTokens: budget.maxTokens,
|
|
7371
7418
|
providerOptions,
|
|
7372
|
-
pageRangeCache
|
|
7419
|
+
pageRangeCache,
|
|
7420
|
+
getPageRangePdf,
|
|
7421
|
+
getPageImages
|
|
7373
7422
|
});
|
|
7374
7423
|
trackUsage(fallbackResult.usage, {
|
|
7375
7424
|
taskKind,
|
|
7376
7425
|
label: ext.fallback.extractorName,
|
|
7377
|
-
maxTokens: budget.maxTokens
|
|
7426
|
+
maxTokens: budget.maxTokens,
|
|
7427
|
+
durationMs: Date.now() - startedAt
|
|
7378
7428
|
});
|
|
7379
7429
|
const focusedData = ext.fallback.deriveFocusedResult(fallbackResult.data);
|
|
7380
7430
|
return focusedData ? [
|
|
@@ -8035,7 +8085,11 @@ function createExtractor(config) {
|
|
|
8035
8085
|
generateObject,
|
|
8036
8086
|
convertPdfToImages,
|
|
8037
8087
|
concurrency = 2,
|
|
8088
|
+
pageMapConcurrency,
|
|
8089
|
+
extractorConcurrency,
|
|
8090
|
+
formatConcurrency,
|
|
8038
8091
|
maxReviewRounds = 2,
|
|
8092
|
+
reviewMode = "auto",
|
|
8039
8093
|
onTokenUsage,
|
|
8040
8094
|
onProgress,
|
|
8041
8095
|
log,
|
|
@@ -8046,7 +8100,8 @@ function createExtractor(config) {
|
|
|
8046
8100
|
modelBudgetConstraints,
|
|
8047
8101
|
onCheckpointSave
|
|
8048
8102
|
} = config;
|
|
8049
|
-
const
|
|
8103
|
+
const pageMapLimit = pLimit(pageMapConcurrency ?? concurrency);
|
|
8104
|
+
const extractorLimit = pLimit(extractorConcurrency ?? concurrency);
|
|
8050
8105
|
const extractorCatalog = formatExtractorCatalogForPrompt();
|
|
8051
8106
|
let totalUsage = { inputTokens: 0, outputTokens: 0 };
|
|
8052
8107
|
let modelCalls = 0;
|
|
@@ -8081,7 +8136,7 @@ function createExtractor(config) {
|
|
|
8081
8136
|
usage,
|
|
8082
8137
|
usageReported: !!usage
|
|
8083
8138
|
});
|
|
8084
|
-
if (report.durationMs) {
|
|
8139
|
+
if (report.durationMs != null) {
|
|
8085
8140
|
performanceReport.totalModelCallDurationMs += report.durationMs;
|
|
8086
8141
|
}
|
|
8087
8142
|
}
|
|
@@ -8138,6 +8193,46 @@ function createExtractor(config) {
|
|
|
8138
8193
|
return textIncludesSupplementarySignal(JSON.stringify(value));
|
|
8139
8194
|
});
|
|
8140
8195
|
}
|
|
8196
|
+
function getSupplementaryPageRanges(pageAssignments, formInventory) {
|
|
8197
|
+
const pages = /* @__PURE__ */ new Set();
|
|
8198
|
+
for (const assignment of pageAssignments) {
|
|
8199
|
+
if (assignment.pageRole === "supplementary" || assignment.extractorNames.includes("supplementary") || textIncludesSupplementarySignal(assignment.notes)) {
|
|
8200
|
+
pages.add(assignment.localPageNumber);
|
|
8201
|
+
}
|
|
8202
|
+
}
|
|
8203
|
+
for (const form of formInventory?.forms ?? []) {
|
|
8204
|
+
if (form.formType === "notice" || textIncludesSupplementarySignal(form.title) || textIncludesSupplementarySignal(form.formNumber)) {
|
|
8205
|
+
const startPage2 = form.pageStart;
|
|
8206
|
+
const endPage = form.pageEnd ?? form.pageStart;
|
|
8207
|
+
if (typeof startPage2 !== "number" || typeof endPage !== "number") continue;
|
|
8208
|
+
for (let page = startPage2; page <= endPage; page += 1) {
|
|
8209
|
+
pages.add(page);
|
|
8210
|
+
}
|
|
8211
|
+
}
|
|
8212
|
+
}
|
|
8213
|
+
const sortedPages = [...pages].sort((a, b) => a - b);
|
|
8214
|
+
if (sortedPages.length === 0) return [];
|
|
8215
|
+
const ranges = [];
|
|
8216
|
+
let startPage = sortedPages[0];
|
|
8217
|
+
let previousPage = sortedPages[0];
|
|
8218
|
+
for (const page of sortedPages.slice(1)) {
|
|
8219
|
+
if (page === previousPage + 1) {
|
|
8220
|
+
previousPage = page;
|
|
8221
|
+
continue;
|
|
8222
|
+
}
|
|
8223
|
+
ranges.push({ startPage, endPage: previousPage });
|
|
8224
|
+
startPage = page;
|
|
8225
|
+
previousPage = page;
|
|
8226
|
+
}
|
|
8227
|
+
ranges.push({ startPage, endPage: previousPage });
|
|
8228
|
+
return ranges;
|
|
8229
|
+
}
|
|
8230
|
+
function shouldRunLlmReview(mode, report, sourceSpansAvailable) {
|
|
8231
|
+
if (mode === "skip" || maxReviewRounds <= 0) return false;
|
|
8232
|
+
if (mode === "always") return true;
|
|
8233
|
+
if (!sourceSpansAvailable) return true;
|
|
8234
|
+
return report.qualityGateStatus !== "passed" || report.issues.length > 0;
|
|
8235
|
+
}
|
|
8141
8236
|
function buildAlreadyExtractedSummary(memory) {
|
|
8142
8237
|
const lines = [];
|
|
8143
8238
|
const declarationResult = readMemoryRecord(memory, "declarations");
|
|
@@ -8167,10 +8262,11 @@ function createExtractor(config) {
|
|
|
8167
8262
|
}
|
|
8168
8263
|
return lines.length > 0 ? lines.join("\n") : "";
|
|
8169
8264
|
}
|
|
8170
|
-
async function runFocusedExtractorTask(task, pdfInput, memory, pageRangeCache) {
|
|
8265
|
+
async function runFocusedExtractorTask(task, pdfInput, memory, pageRangeCache, getPageRangePdf, getPageImages) {
|
|
8171
8266
|
if (task.extractorName === "supplementary") {
|
|
8172
8267
|
const alreadyExtractedSummary = buildAlreadyExtractedSummary(memory);
|
|
8173
8268
|
const budget = resolveBudget("extraction_focused", 4096);
|
|
8269
|
+
const startedAt = Date.now();
|
|
8174
8270
|
const result = await runExtractor({
|
|
8175
8271
|
name: "supplementary",
|
|
8176
8272
|
prompt: buildSupplementaryPrompt(alreadyExtractedSummary),
|
|
@@ -8182,12 +8278,15 @@ function createExtractor(config) {
|
|
|
8182
8278
|
convertPdfToImages,
|
|
8183
8279
|
maxTokens: budget.maxTokens,
|
|
8184
8280
|
providerOptions: activeProviderOptions,
|
|
8185
|
-
pageRangeCache
|
|
8281
|
+
pageRangeCache,
|
|
8282
|
+
getPageRangePdf,
|
|
8283
|
+
getPageImages
|
|
8186
8284
|
});
|
|
8187
8285
|
trackUsage(result.usage, {
|
|
8188
8286
|
taskKind: "extraction_focused",
|
|
8189
8287
|
label: "supplementary",
|
|
8190
|
-
maxTokens: budget.maxTokens
|
|
8288
|
+
maxTokens: budget.maxTokens,
|
|
8289
|
+
durationMs: Date.now() - startedAt
|
|
8191
8290
|
});
|
|
8192
8291
|
return result;
|
|
8193
8292
|
}
|
|
@@ -8198,6 +8297,8 @@ function createExtractor(config) {
|
|
|
8198
8297
|
convertPdfToImages,
|
|
8199
8298
|
providerOptions: activeProviderOptions,
|
|
8200
8299
|
pageRangeCache,
|
|
8300
|
+
getPageRangePdf,
|
|
8301
|
+
getPageImages,
|
|
8201
8302
|
trackUsage,
|
|
8202
8303
|
resolveBudget,
|
|
8203
8304
|
log
|
|
@@ -8246,20 +8347,68 @@ function createExtractor(config) {
|
|
|
8246
8347
|
}
|
|
8247
8348
|
}
|
|
8248
8349
|
let pdfBase64Cache;
|
|
8350
|
+
const completedPageRangePdfCache = /* @__PURE__ */ new Map();
|
|
8249
8351
|
const pageRangePdfCache = /* @__PURE__ */ new Map();
|
|
8352
|
+
const pageRangeImageCache = /* @__PURE__ */ new Map();
|
|
8353
|
+
let pdfSlicerPromise;
|
|
8354
|
+
let fullPdfProviderOptionsPromise;
|
|
8355
|
+
let pageCountPromise;
|
|
8250
8356
|
async function getPdfBase64ForExtraction() {
|
|
8251
8357
|
if (pdfBase64Cache === void 0) {
|
|
8252
8358
|
pdfBase64Cache = await pdfInputToBase64(pdfInput);
|
|
8253
8359
|
}
|
|
8254
8360
|
return pdfBase64Cache;
|
|
8255
8361
|
}
|
|
8362
|
+
async function getCachedPageCount() {
|
|
8363
|
+
if (!pageCountPromise) {
|
|
8364
|
+
pageCountPromise = getPdfSlicer().then((slicer) => slicer.getPageCount()).catch(() => getPdfPageCount(pdfInput));
|
|
8365
|
+
}
|
|
8366
|
+
return pageCountPromise;
|
|
8367
|
+
}
|
|
8368
|
+
async function getFullPdfProviderOptions() {
|
|
8369
|
+
if (!fullPdfProviderOptionsPromise) {
|
|
8370
|
+
fullPdfProviderOptionsPromise = buildPdfProviderOptions(pdfInput, activeProviderOptions);
|
|
8371
|
+
}
|
|
8372
|
+
return fullPdfProviderOptionsPromise;
|
|
8373
|
+
}
|
|
8374
|
+
async function getPdfSlicer() {
|
|
8375
|
+
if (!pdfSlicerPromise) {
|
|
8376
|
+
pdfSlicerPromise = createPdfPageSlicer(pdfInput);
|
|
8377
|
+
}
|
|
8378
|
+
return pdfSlicerPromise;
|
|
8379
|
+
}
|
|
8256
8380
|
async function getPageRangePdf(startPage, endPage) {
|
|
8257
8381
|
const cacheKey = `${startPage}-${endPage}`;
|
|
8258
|
-
const cached =
|
|
8382
|
+
const cached = completedPageRangePdfCache.get(cacheKey);
|
|
8383
|
+
if (cached) return cached;
|
|
8384
|
+
const pending = pageRangePdfCache.get(cacheKey);
|
|
8385
|
+
if (pending) return pending;
|
|
8386
|
+
const promise = (async () => {
|
|
8387
|
+
const slicer = await getPdfSlicer();
|
|
8388
|
+
const pagesPdf = await slicer.extractPageRange(startPage, endPage);
|
|
8389
|
+
completedPageRangePdfCache.set(cacheKey, pagesPdf);
|
|
8390
|
+
return pagesPdf;
|
|
8391
|
+
})().catch((error) => {
|
|
8392
|
+
pageRangePdfCache.delete(cacheKey);
|
|
8393
|
+
throw error;
|
|
8394
|
+
});
|
|
8395
|
+
pageRangePdfCache.set(cacheKey, promise);
|
|
8396
|
+
return promise;
|
|
8397
|
+
}
|
|
8398
|
+
async function getPageImages(startPage, endPage) {
|
|
8399
|
+
if (!convertPdfToImages) return [];
|
|
8400
|
+
const cacheKey = `${startPage}-${endPage}`;
|
|
8401
|
+
const cached = pageRangeImageCache.get(cacheKey);
|
|
8259
8402
|
if (cached) return cached;
|
|
8260
|
-
const
|
|
8261
|
-
|
|
8262
|
-
|
|
8403
|
+
const promise = (async () => {
|
|
8404
|
+
const pdfBase64 = await getPdfBase64ForExtraction();
|
|
8405
|
+
return convertPdfToImages(pdfBase64, startPage, endPage);
|
|
8406
|
+
})().catch((error) => {
|
|
8407
|
+
pageRangeImageCache.delete(cacheKey);
|
|
8408
|
+
throw error;
|
|
8409
|
+
});
|
|
8410
|
+
pageRangeImageCache.set(cacheKey, promise);
|
|
8411
|
+
return promise;
|
|
8263
8412
|
}
|
|
8264
8413
|
let classifyResult;
|
|
8265
8414
|
if (resumed?.classifyResult && pipelineCtx.isPhaseComplete("classify")) {
|
|
@@ -8267,15 +8416,16 @@ function createExtractor(config) {
|
|
|
8267
8416
|
onProgress?.("Resuming from checkpoint (classify complete)...");
|
|
8268
8417
|
} else {
|
|
8269
8418
|
onProgress?.("Classifying document...");
|
|
8270
|
-
const pageCount2 = await
|
|
8419
|
+
const pageCount2 = await getCachedPageCount();
|
|
8271
8420
|
const budget = resolveBudget("extraction_classify", 512);
|
|
8421
|
+
const startedAt = Date.now();
|
|
8272
8422
|
const classifyResponse = await safeGenerateObject(
|
|
8273
8423
|
generateObject,
|
|
8274
8424
|
{
|
|
8275
8425
|
prompt: buildClassifyPrompt(),
|
|
8276
8426
|
schema: ClassifyResultSchema,
|
|
8277
8427
|
maxTokens: budget.maxTokens,
|
|
8278
|
-
providerOptions: await
|
|
8428
|
+
providerOptions: await getFullPdfProviderOptions()
|
|
8279
8429
|
},
|
|
8280
8430
|
{
|
|
8281
8431
|
fallback: { documentType: "policy", policyTypes: ["other"], confidence: 0 },
|
|
@@ -8287,7 +8437,8 @@ function createExtractor(config) {
|
|
|
8287
8437
|
trackUsage(classifyResponse.usage, {
|
|
8288
8438
|
taskKind: "extraction_classify",
|
|
8289
8439
|
label: "classify",
|
|
8290
|
-
maxTokens: budget.maxTokens
|
|
8440
|
+
maxTokens: budget.maxTokens,
|
|
8441
|
+
durationMs: Date.now() - startedAt
|
|
8291
8442
|
});
|
|
8292
8443
|
classifyResult = classifyResponse.object;
|
|
8293
8444
|
if (classifyResult.confidence === 0) {
|
|
@@ -8305,7 +8456,7 @@ function createExtractor(config) {
|
|
|
8305
8456
|
const policyTypes = classifyResult.policyTypes ?? [];
|
|
8306
8457
|
const primaryType = policyTypes[0] ?? "other";
|
|
8307
8458
|
const template = getTemplate(primaryType);
|
|
8308
|
-
const pageCount = resumed?.pageCount ?? await
|
|
8459
|
+
const pageCount = resumed?.pageCount ?? await getCachedPageCount();
|
|
8309
8460
|
const templateHints = buildTemplateHints(primaryType, documentType, pageCount, template);
|
|
8310
8461
|
let formInventory;
|
|
8311
8462
|
if (resumed?.formInventory && pipelineCtx.isPhaseComplete("form_inventory")) {
|
|
@@ -8315,13 +8466,14 @@ function createExtractor(config) {
|
|
|
8315
8466
|
} else {
|
|
8316
8467
|
onProgress?.(`Building form inventory for ${primaryType} ${documentType}...`);
|
|
8317
8468
|
const budget = resolveBudget("extraction_form_inventory", 2048);
|
|
8469
|
+
const startedAt = Date.now();
|
|
8318
8470
|
const formInventoryResponse = await safeGenerateObject(
|
|
8319
8471
|
generateObject,
|
|
8320
8472
|
{
|
|
8321
8473
|
prompt: buildFormInventoryPrompt(templateHints),
|
|
8322
8474
|
schema: FormInventorySchema,
|
|
8323
8475
|
maxTokens: budget.maxTokens,
|
|
8324
|
-
providerOptions: await
|
|
8476
|
+
providerOptions: await getFullPdfProviderOptions()
|
|
8325
8477
|
},
|
|
8326
8478
|
{
|
|
8327
8479
|
fallback: { forms: [] },
|
|
@@ -8332,7 +8484,8 @@ function createExtractor(config) {
|
|
|
8332
8484
|
trackUsage(formInventoryResponse.usage, {
|
|
8333
8485
|
taskKind: "extraction_form_inventory",
|
|
8334
8486
|
label: "form_inventory",
|
|
8335
|
-
maxTokens: budget.maxTokens
|
|
8487
|
+
maxTokens: budget.maxTokens,
|
|
8488
|
+
durationMs: Date.now() - startedAt
|
|
8336
8489
|
});
|
|
8337
8490
|
formInventory = formInventoryResponse.object;
|
|
8338
8491
|
memory.set("form_inventory", formInventory);
|
|
@@ -8362,9 +8515,10 @@ function createExtractor(config) {
|
|
|
8362
8515
|
);
|
|
8363
8516
|
const pageMapResults = await Promise.all(
|
|
8364
8517
|
pageMapChunks.map(
|
|
8365
|
-
({ startPage, endPage }) =>
|
|
8518
|
+
({ startPage, endPage }) => pageMapLimit(async () => {
|
|
8366
8519
|
const pagesPdf = await getPageRangePdf(startPage, endPage);
|
|
8367
8520
|
const budget = resolveBudget("extraction_page_map", 2048);
|
|
8521
|
+
const startedAt = Date.now();
|
|
8368
8522
|
const mapResponse = await safeGenerateObject(
|
|
8369
8523
|
generateObject,
|
|
8370
8524
|
{
|
|
@@ -8389,7 +8543,8 @@ function createExtractor(config) {
|
|
|
8389
8543
|
trackUsage(mapResponse.usage, {
|
|
8390
8544
|
taskKind: "extraction_page_map",
|
|
8391
8545
|
label: `page_map:${startPage}-${endPage}`,
|
|
8392
|
-
maxTokens: budget.maxTokens
|
|
8546
|
+
maxTokens: budget.maxTokens,
|
|
8547
|
+
durationMs: Date.now() - startedAt
|
|
8393
8548
|
});
|
|
8394
8549
|
return mapResponse.object.pages.map((assignment) => ({
|
|
8395
8550
|
...assignment,
|
|
@@ -8435,14 +8590,32 @@ function createExtractor(config) {
|
|
|
8435
8590
|
});
|
|
8436
8591
|
}
|
|
8437
8592
|
if (!pipelineCtx.isPhaseComplete("extract")) {
|
|
8438
|
-
const
|
|
8593
|
+
const supplementaryRanges = getSupplementaryPageRanges(pageAssignments, formInventory);
|
|
8594
|
+
const baseTasks = plan.tasks;
|
|
8595
|
+
const hasPlannedSupplementary = baseTasks.some((task) => task.extractorName === "supplementary");
|
|
8596
|
+
const tasks = hasPlannedSupplementary || supplementaryRanges.length === 0 ? baseTasks : [
|
|
8597
|
+
...baseTasks,
|
|
8598
|
+
...supplementaryRanges.map((range) => ({
|
|
8599
|
+
extractorName: "supplementary",
|
|
8600
|
+
startPage: range.startPage,
|
|
8601
|
+
endPage: range.endPage,
|
|
8602
|
+
description: `Page-signaled supplementary extraction for pages ${range.startPage}-${range.endPage}`
|
|
8603
|
+
}))
|
|
8604
|
+
];
|
|
8439
8605
|
onProgress?.(`Dispatching ${tasks.length} extractors...`);
|
|
8440
8606
|
const extractionPdfInput = await getPdfBase64ForExtraction();
|
|
8441
8607
|
const extractorResults = await Promise.all(
|
|
8442
8608
|
tasks.map(
|
|
8443
|
-
(task) =>
|
|
8609
|
+
(task) => extractorLimit(async () => {
|
|
8444
8610
|
onProgress?.(`Extracting ${task.extractorName} (pages ${task.startPage}-${task.endPage})...`);
|
|
8445
|
-
return runFocusedExtractorTask(
|
|
8611
|
+
return runFocusedExtractorTask(
|
|
8612
|
+
task,
|
|
8613
|
+
extractionPdfInput,
|
|
8614
|
+
memory,
|
|
8615
|
+
completedPageRangePdfCache,
|
|
8616
|
+
getPageRangePdf,
|
|
8617
|
+
convertPdfToImages ? getPageImages : void 0
|
|
8618
|
+
);
|
|
8446
8619
|
})
|
|
8447
8620
|
)
|
|
8448
8621
|
);
|
|
@@ -8457,6 +8630,7 @@ function createExtractor(config) {
|
|
|
8457
8630
|
try {
|
|
8458
8631
|
const alreadyExtractedSummary = buildAlreadyExtractedSummary(memory);
|
|
8459
8632
|
const budget = resolveBudget("extraction_focused", 4096);
|
|
8633
|
+
const startedAt = Date.now();
|
|
8460
8634
|
const supplementaryResult = await runExtractor({
|
|
8461
8635
|
name: "supplementary",
|
|
8462
8636
|
prompt: buildSupplementaryPrompt(alreadyExtractedSummary),
|
|
@@ -8468,12 +8642,15 @@ function createExtractor(config) {
|
|
|
8468
8642
|
convertPdfToImages,
|
|
8469
8643
|
maxTokens: budget.maxTokens,
|
|
8470
8644
|
providerOptions: activeProviderOptions,
|
|
8471
|
-
pageRangeCache:
|
|
8645
|
+
pageRangeCache: completedPageRangePdfCache,
|
|
8646
|
+
getPageRangePdf,
|
|
8647
|
+
getPageImages: convertPdfToImages ? getPageImages : void 0
|
|
8472
8648
|
});
|
|
8473
8649
|
trackUsage(supplementaryResult.usage, {
|
|
8474
8650
|
taskKind: "extraction_focused",
|
|
8475
8651
|
label: "supplementary",
|
|
8476
|
-
maxTokens: budget.maxTokens
|
|
8652
|
+
maxTokens: budget.maxTokens,
|
|
8653
|
+
durationMs: Date.now() - startedAt
|
|
8477
8654
|
});
|
|
8478
8655
|
mergeMemoryResult(supplementaryResult.name, supplementaryResult.data, memory);
|
|
8479
8656
|
} catch (error) {
|
|
@@ -8493,6 +8670,7 @@ function createExtractor(config) {
|
|
|
8493
8670
|
if (!pipelineCtx.isPhaseComplete("resolve_referential")) {
|
|
8494
8671
|
onProgress?.("Resolving referential coverage limits...");
|
|
8495
8672
|
try {
|
|
8673
|
+
const startedAt = Date.now();
|
|
8496
8674
|
const resolution = await resolveReferentialCoverages({
|
|
8497
8675
|
memory,
|
|
8498
8676
|
pdfInput,
|
|
@@ -8500,6 +8678,8 @@ function createExtractor(config) {
|
|
|
8500
8678
|
generateObject,
|
|
8501
8679
|
convertPdfToImages,
|
|
8502
8680
|
concurrency,
|
|
8681
|
+
getPageRangePdf,
|
|
8682
|
+
getPageImages: convertPdfToImages ? getPageImages : void 0,
|
|
8503
8683
|
providerOptions: activeProviderOptions,
|
|
8504
8684
|
modelCapabilities,
|
|
8505
8685
|
modelBudgetConstraints,
|
|
@@ -8508,7 +8688,8 @@ function createExtractor(config) {
|
|
|
8508
8688
|
});
|
|
8509
8689
|
trackUsage(resolution.usage, {
|
|
8510
8690
|
taskKind: "extraction_referential_lookup",
|
|
8511
|
-
label: "referential_resolution"
|
|
8691
|
+
label: "referential_resolution",
|
|
8692
|
+
durationMs: Date.now() - startedAt
|
|
8512
8693
|
});
|
|
8513
8694
|
if (resolution.attempts > 0) {
|
|
8514
8695
|
await log?.(`Referential resolution: ${resolution.resolved}/${resolution.attempts} resolved, ${resolution.unresolved} unresolved`);
|
|
@@ -8530,52 +8711,72 @@ function createExtractor(config) {
|
|
|
8530
8711
|
let reviewReport = resumed?.reviewReport;
|
|
8531
8712
|
if (!pipelineCtx.isPhaseComplete("review")) {
|
|
8532
8713
|
reviewRounds = [];
|
|
8533
|
-
|
|
8534
|
-
|
|
8535
|
-
|
|
8536
|
-
|
|
8537
|
-
|
|
8538
|
-
|
|
8539
|
-
|
|
8540
|
-
|
|
8541
|
-
|
|
8542
|
-
|
|
8714
|
+
groundExtractionMemoryWithSourceSpans(memory, sourceSpans);
|
|
8715
|
+
const preReviewReport = buildExtractionReviewReport({
|
|
8716
|
+
memory,
|
|
8717
|
+
pageAssignments,
|
|
8718
|
+
reviewRounds,
|
|
8719
|
+
sourceSpansAvailable: sourceSpans.length > 0
|
|
8720
|
+
});
|
|
8721
|
+
if (shouldRunLlmReview(reviewMode, preReviewReport, sourceSpans.length > 0)) {
|
|
8722
|
+
for (let round = 0; round < maxReviewRounds; round++) {
|
|
8723
|
+
const extractedKeys = [...memory.keys()].filter((k) => k !== "classify");
|
|
8724
|
+
const extractionSummary = summarizeExtraction(memory);
|
|
8725
|
+
const pageMapSummary = formatPageMapSummary(pageAssignments);
|
|
8726
|
+
const budget = resolveBudget("extraction_review", 1536);
|
|
8727
|
+
const startedAt = Date.now();
|
|
8728
|
+
const reviewResponse = await safeGenerateObject(
|
|
8729
|
+
generateObject,
|
|
8730
|
+
{
|
|
8731
|
+
prompt: buildReviewPrompt(template.required, extractedKeys, extractionSummary, pageMapSummary, extractorCatalog),
|
|
8732
|
+
schema: ReviewResultSchema,
|
|
8733
|
+
maxTokens: budget.maxTokens,
|
|
8734
|
+
providerOptions: await getFullPdfProviderOptions()
|
|
8735
|
+
},
|
|
8736
|
+
{
|
|
8737
|
+
fallback: { complete: true, missingFields: [], qualityIssues: [], additionalTasks: [] },
|
|
8738
|
+
log,
|
|
8739
|
+
onError: (err, attempt) => log?.(`Review round ${round + 1} attempt ${attempt + 1} failed: ${err}`)
|
|
8740
|
+
}
|
|
8741
|
+
);
|
|
8742
|
+
trackUsage(reviewResponse.usage, {
|
|
8743
|
+
taskKind: "extraction_review",
|
|
8744
|
+
label: `review:${round + 1}`,
|
|
8543
8745
|
maxTokens: budget.maxTokens,
|
|
8544
|
-
|
|
8545
|
-
}
|
|
8546
|
-
|
|
8547
|
-
|
|
8548
|
-
log
|
|
8549
|
-
onError: (err, attempt) => log?.(`Review round ${round + 1} attempt ${attempt + 1} failed: ${err}`)
|
|
8746
|
+
durationMs: Date.now() - startedAt
|
|
8747
|
+
});
|
|
8748
|
+
reviewRounds.push(toReviewRoundRecord(round + 1, reviewResponse.object));
|
|
8749
|
+
if (reviewResponse.object.qualityIssues?.length) {
|
|
8750
|
+
await log?.(`Review round ${round + 1} quality issues: ${reviewResponse.object.qualityIssues.join("; ")}`);
|
|
8550
8751
|
}
|
|
8551
|
-
|
|
8552
|
-
|
|
8553
|
-
|
|
8554
|
-
|
|
8555
|
-
|
|
8556
|
-
|
|
8557
|
-
|
|
8558
|
-
|
|
8559
|
-
|
|
8560
|
-
|
|
8561
|
-
|
|
8562
|
-
|
|
8563
|
-
|
|
8564
|
-
|
|
8565
|
-
|
|
8566
|
-
|
|
8567
|
-
|
|
8568
|
-
|
|
8569
|
-
|
|
8570
|
-
|
|
8571
|
-
|
|
8572
|
-
|
|
8573
|
-
|
|
8574
|
-
|
|
8575
|
-
if (result) {
|
|
8576
|
-
mergeMemoryResult(result.name, result.data, memory);
|
|
8752
|
+
if (reviewResponse.object.complete || reviewResponse.object.additionalTasks.length === 0) {
|
|
8753
|
+
onProgress?.("Extraction complete.");
|
|
8754
|
+
break;
|
|
8755
|
+
}
|
|
8756
|
+
onProgress?.(`Review round ${round + 1}: dispatching ${reviewResponse.object.additionalTasks.length} follow-up extractors...`);
|
|
8757
|
+
const extractionPdfInput = await getPdfBase64ForExtraction();
|
|
8758
|
+
const followUpResults = await Promise.all(
|
|
8759
|
+
reviewResponse.object.additionalTasks.map(
|
|
8760
|
+
(task) => extractorLimit(async () => {
|
|
8761
|
+
return runFocusedExtractorTask(
|
|
8762
|
+
task,
|
|
8763
|
+
extractionPdfInput,
|
|
8764
|
+
memory,
|
|
8765
|
+
completedPageRangePdfCache,
|
|
8766
|
+
getPageRangePdf,
|
|
8767
|
+
convertPdfToImages ? getPageImages : void 0
|
|
8768
|
+
);
|
|
8769
|
+
})
|
|
8770
|
+
)
|
|
8771
|
+
);
|
|
8772
|
+
for (const result of followUpResults.flatMap((item) => Array.isArray(item) ? item : item ? [item] : [])) {
|
|
8773
|
+
if (result) {
|
|
8774
|
+
mergeMemoryResult(result.name, result.data, memory);
|
|
8775
|
+
}
|
|
8577
8776
|
}
|
|
8578
8777
|
}
|
|
8778
|
+
} else {
|
|
8779
|
+
onProgress?.("Skipping LLM extraction review; deterministic checks passed.");
|
|
8579
8780
|
}
|
|
8580
8781
|
groundExtractionMemoryWithSourceSpans(memory, sourceSpans);
|
|
8581
8782
|
reviewReport = buildExtractionReviewReport({
|
|
@@ -8627,6 +8828,7 @@ function createExtractor(config) {
|
|
|
8627
8828
|
onProgress?.("Generating document summary...");
|
|
8628
8829
|
try {
|
|
8629
8830
|
const budget = resolveBudget("extraction_summary", 512);
|
|
8831
|
+
const startedAt = Date.now();
|
|
8630
8832
|
const summaryResponse = await safeGenerateObject(
|
|
8631
8833
|
generateObject,
|
|
8632
8834
|
{
|
|
@@ -8644,7 +8846,8 @@ function createExtractor(config) {
|
|
|
8644
8846
|
trackUsage(summaryResponse.usage, {
|
|
8645
8847
|
taskKind: "extraction_summary",
|
|
8646
8848
|
label: "summary",
|
|
8647
|
-
maxTokens: budget.maxTokens
|
|
8849
|
+
maxTokens: budget.maxTokens,
|
|
8850
|
+
durationMs: Date.now() - startedAt
|
|
8648
8851
|
});
|
|
8649
8852
|
if (summaryResponse.object.summary) {
|
|
8650
8853
|
document.summary = summaryResponse.object.summary;
|
|
@@ -8655,16 +8858,19 @@ function createExtractor(config) {
|
|
|
8655
8858
|
}
|
|
8656
8859
|
onProgress?.("Formatting extracted content...");
|
|
8657
8860
|
const formatBudget = resolveBudget("extraction_format", 16384);
|
|
8861
|
+
const formatStartedAt = Date.now();
|
|
8658
8862
|
const formatResult = await formatDocumentContent(document, generateText, {
|
|
8659
8863
|
providerOptions: activeProviderOptions,
|
|
8660
8864
|
maxTokens: formatBudget.maxTokens,
|
|
8865
|
+
concurrency: formatConcurrency ?? concurrency,
|
|
8661
8866
|
onProgress,
|
|
8662
8867
|
log
|
|
8663
8868
|
});
|
|
8664
8869
|
trackUsage(formatResult.usage, {
|
|
8665
8870
|
taskKind: "extraction_format",
|
|
8666
8871
|
label: "format",
|
|
8667
|
-
maxTokens: formatBudget.maxTokens
|
|
8872
|
+
maxTokens: formatBudget.maxTokens,
|
|
8873
|
+
durationMs: Date.now() - formatStartedAt
|
|
8668
8874
|
});
|
|
8669
8875
|
const chunks = chunkDocument(formatResult.document);
|
|
8670
8876
|
const finalCheckpoint = pipelineCtx.getCheckpoint();
|