@claritylabs/cl-sdk 1.0.1 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +8 -1
- package/dist/index.d.mts +9 -0
- package/dist/index.d.ts +9 -0
- package/dist/index.js +308 -102
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +308 -102
- package/dist/index.mjs.map +1 -1
- package/dist/storage-sqlite.js +2 -1
- package/dist/storage-sqlite.js.map +1 -1
- package/dist/storage-sqlite.mjs +2 -1
- package/dist/storage-sqlite.mjs.map +1 -1
- package/package.json +1 -1
package/dist/index.mjs
CHANGED
|
@@ -38,10 +38,11 @@ async function withRetry(fn, log) {
|
|
|
38
38
|
|
|
39
39
|
// src/core/concurrency.ts
|
|
40
40
|
function pLimit(concurrency) {
|
|
41
|
+
const maxConcurrency = Number.isFinite(concurrency) ? Math.max(1, Math.floor(concurrency)) : 1;
|
|
41
42
|
let active = 0;
|
|
42
43
|
const queue = [];
|
|
43
44
|
function next() {
|
|
44
|
-
if (queue.length > 0 && active <
|
|
45
|
+
if (queue.length > 0 && active < maxConcurrency) {
|
|
45
46
|
active++;
|
|
46
47
|
queue.shift()();
|
|
47
48
|
}
|
|
@@ -2340,6 +2341,35 @@ async function extractPageRange(input, startPage, endPage) {
|
|
|
2340
2341
|
const bytes = await newDoc.save();
|
|
2341
2342
|
return bytesToBase64(new Uint8Array(bytes));
|
|
2342
2343
|
}
|
|
2344
|
+
async function createPdfPageSlicer(input) {
|
|
2345
|
+
if (isFileIdRef(input)) {
|
|
2346
|
+
throw new Error(
|
|
2347
|
+
"Cannot create a page slicer from a fileId reference. Pass the full PDF as base64/bytes, or provide pre-rendered page assets."
|
|
2348
|
+
);
|
|
2349
|
+
}
|
|
2350
|
+
const srcBytes = await pdfInputToBytes(input);
|
|
2351
|
+
const srcDoc = await PDFDocument.load(srcBytes, { ignoreEncryption: true });
|
|
2352
|
+
const totalPages = srcDoc.getPageCount();
|
|
2353
|
+
const originalBase64 = isBytes(input) ? bytesToBase64(input) : typeof input === "string" ? input : bytesToBase64(srcBytes);
|
|
2354
|
+
return {
|
|
2355
|
+
getPageCount() {
|
|
2356
|
+
return totalPages;
|
|
2357
|
+
},
|
|
2358
|
+
async extractPageRange(startPage, endPage) {
|
|
2359
|
+
const start = Math.max(startPage - 1, 0);
|
|
2360
|
+
const end = Math.min(endPage, totalPages) - 1;
|
|
2361
|
+
if (start === 0 && end >= totalPages - 1) {
|
|
2362
|
+
return originalBase64;
|
|
2363
|
+
}
|
|
2364
|
+
const newDoc = await PDFDocument.create();
|
|
2365
|
+
const indices = Array.from({ length: end - start + 1 }, (_, i) => start + i);
|
|
2366
|
+
const pages = await newDoc.copyPages(srcDoc, indices);
|
|
2367
|
+
pages.forEach((page) => newDoc.addPage(page));
|
|
2368
|
+
const bytes = await newDoc.save();
|
|
2369
|
+
return bytesToBase64(new Uint8Array(bytes));
|
|
2370
|
+
}
|
|
2371
|
+
};
|
|
2372
|
+
}
|
|
2343
2373
|
async function buildPdfProviderOptions(input, existingOptions) {
|
|
2344
2374
|
const options = { ...existingOptions };
|
|
2345
2375
|
if (isFileIdRef(input)) {
|
|
@@ -2487,20 +2517,19 @@ async function runExtractor(params) {
|
|
|
2487
2517
|
} = params;
|
|
2488
2518
|
const extractorProviderOptions = { ...providerOptions };
|
|
2489
2519
|
let fullPrompt;
|
|
2490
|
-
const
|
|
2520
|
+
const needsPdfBase64 = convertPdfToImages && !params.getPageImages || !convertPdfToImages && !params.getPageRangePdf;
|
|
2521
|
+
const pdfBase64 = needsPdfBase64 ? await pdfInputToBase64(pdfInput) : void 0;
|
|
2491
2522
|
if (convertPdfToImages) {
|
|
2492
|
-
const images = await convertPdfToImages(pdfBase64, startPage, endPage);
|
|
2523
|
+
const images = params.getPageImages ? await params.getPageImages(startPage, endPage) : await convertPdfToImages(pdfBase64, startPage, endPage);
|
|
2493
2524
|
extractorProviderOptions.images = images;
|
|
2494
2525
|
fullPrompt = `${prompt}
|
|
2495
2526
|
|
|
2496
2527
|
[Document pages ${startPage}-${endPage} are provided as images.]`;
|
|
2497
2528
|
} else {
|
|
2498
2529
|
const cacheKey = `${startPage}-${endPage}`;
|
|
2499
|
-
|
|
2500
|
-
|
|
2501
|
-
|
|
2502
|
-
pageRangeCache?.set(cacheKey, pagesPdf);
|
|
2503
|
-
}
|
|
2530
|
+
const cachedPagesPdf = pageRangeCache?.get(cacheKey);
|
|
2531
|
+
const pagesPdf = cachedPagesPdf ?? (params.getPageRangePdf ? await params.getPageRangePdf(startPage, endPage) : await extractPageRange(pdfBase64, startPage, endPage));
|
|
2532
|
+
if (!cachedPagesPdf) pageRangeCache?.set(cacheKey, pagesPdf);
|
|
2504
2533
|
extractorProviderOptions.pdfBase64 = pagesPdf;
|
|
2505
2534
|
fullPrompt = `${prompt}
|
|
2506
2535
|
|
|
@@ -3486,33 +3515,40 @@ async function formatDocumentContent(doc, generateText, options) {
|
|
|
3486
3515
|
for (let i = 0; i < entries.length; i += MAX_ENTRIES_PER_BATCH) {
|
|
3487
3516
|
batches.push(entries.slice(i, i + MAX_ENTRIES_PER_BATCH));
|
|
3488
3517
|
}
|
|
3489
|
-
|
|
3490
|
-
|
|
3491
|
-
|
|
3492
|
-
|
|
3493
|
-
|
|
3494
|
-
|
|
3495
|
-
|
|
3496
|
-
|
|
3497
|
-
|
|
3498
|
-
|
|
3499
|
-
|
|
3500
|
-
|
|
3501
|
-
|
|
3502
|
-
|
|
3503
|
-
|
|
3504
|
-
|
|
3505
|
-
|
|
3518
|
+
const limit = pLimit(options?.concurrency ?? 2);
|
|
3519
|
+
const batchResults = await Promise.all(batches.map(
|
|
3520
|
+
(batch, batchIdx) => limit(async () => {
|
|
3521
|
+
try {
|
|
3522
|
+
const prompt = buildFormatPrompt(batch.map((e) => ({ id: e.id, text: e.text })));
|
|
3523
|
+
const result = await withRetry(
|
|
3524
|
+
() => generateText({
|
|
3525
|
+
prompt,
|
|
3526
|
+
maxTokens: options?.maxTokens ?? 16384,
|
|
3527
|
+
providerOptions: options?.providerOptions
|
|
3528
|
+
})
|
|
3529
|
+
);
|
|
3530
|
+
const formatted = parseFormatResponse(result.text);
|
|
3531
|
+
if (formatted.size < batch.length) {
|
|
3532
|
+
await options?.log?.(
|
|
3533
|
+
`Format batch ${batchIdx + 1}/${batches.length}: model returned ${formatted.size}/${batch.length} entries \u2014 unformatted entries will keep original content`
|
|
3534
|
+
);
|
|
3535
|
+
}
|
|
3536
|
+
return { batch, formatted, usage: result.usage };
|
|
3537
|
+
} catch (error) {
|
|
3506
3538
|
await options?.log?.(
|
|
3507
|
-
`Format batch ${batchIdx + 1}/${batches.length}
|
|
3539
|
+
`Format batch ${batchIdx + 1}/${batches.length} failed, keeping original content: ${error instanceof Error ? error.message : String(error)}`
|
|
3508
3540
|
);
|
|
3541
|
+
return void 0;
|
|
3509
3542
|
}
|
|
3510
|
-
|
|
3511
|
-
|
|
3512
|
-
|
|
3513
|
-
|
|
3514
|
-
|
|
3543
|
+
})
|
|
3544
|
+
));
|
|
3545
|
+
for (const result of batchResults) {
|
|
3546
|
+
if (!result) continue;
|
|
3547
|
+
if (result.usage) {
|
|
3548
|
+
totalUsage.inputTokens += result.usage.inputTokens;
|
|
3549
|
+
totalUsage.outputTokens += result.usage.outputTokens;
|
|
3515
3550
|
}
|
|
3551
|
+
applyFormattedContent(doc, result.batch, result.formatted);
|
|
3516
3552
|
}
|
|
3517
3553
|
return { document: doc, usage: totalUsage };
|
|
3518
3554
|
}
|
|
@@ -6783,6 +6819,8 @@ async function resolveReferentialCoverages(params) {
|
|
|
6783
6819
|
pageCount,
|
|
6784
6820
|
generateObject,
|
|
6785
6821
|
convertPdfToImages,
|
|
6822
|
+
getPageRangePdf,
|
|
6823
|
+
getPageImages,
|
|
6786
6824
|
concurrency = 2,
|
|
6787
6825
|
providerOptions,
|
|
6788
6826
|
modelCapabilities,
|
|
@@ -6892,6 +6930,8 @@ async function resolveReferentialCoverages(params) {
|
|
|
6892
6930
|
endPage: pageRange.endPage,
|
|
6893
6931
|
generateObject,
|
|
6894
6932
|
convertPdfToImages,
|
|
6933
|
+
getPageRangePdf,
|
|
6934
|
+
getPageImages,
|
|
6895
6935
|
maxTokens: budget.maxTokens,
|
|
6896
6936
|
providerOptions
|
|
6897
6937
|
});
|
|
@@ -6983,6 +7023,8 @@ async function runFocusedExtractorWithFallback(params) {
|
|
|
6983
7023
|
convertPdfToImages,
|
|
6984
7024
|
providerOptions,
|
|
6985
7025
|
pageRangeCache,
|
|
7026
|
+
getPageRangePdf,
|
|
7027
|
+
getPageImages,
|
|
6986
7028
|
trackUsage,
|
|
6987
7029
|
resolveBudget,
|
|
6988
7030
|
log
|
|
@@ -6996,6 +7038,7 @@ async function runFocusedExtractorWithFallback(params) {
|
|
|
6996
7038
|
const hintTokens = ext.maxTokens ?? 4096;
|
|
6997
7039
|
const taskKind = hintTokens >= 8192 ? "extraction_long_list" : "extraction_focused";
|
|
6998
7040
|
const budget = resolveBudget(taskKind, hintTokens);
|
|
7041
|
+
const startedAt = Date.now();
|
|
6999
7042
|
const result = await runExtractor({
|
|
7000
7043
|
name: task.extractorName,
|
|
7001
7044
|
prompt: ext.buildPrompt(),
|
|
@@ -7007,12 +7050,15 @@ async function runFocusedExtractorWithFallback(params) {
|
|
|
7007
7050
|
convertPdfToImages,
|
|
7008
7051
|
maxTokens: budget.maxTokens,
|
|
7009
7052
|
providerOptions,
|
|
7010
|
-
pageRangeCache
|
|
7053
|
+
pageRangeCache,
|
|
7054
|
+
getPageRangePdf,
|
|
7055
|
+
getPageImages
|
|
7011
7056
|
});
|
|
7012
7057
|
trackUsage(result.usage, {
|
|
7013
7058
|
taskKind,
|
|
7014
7059
|
label: task.extractorName,
|
|
7015
|
-
maxTokens: budget.maxTokens
|
|
7060
|
+
maxTokens: budget.maxTokens,
|
|
7061
|
+
durationMs: Date.now() - startedAt
|
|
7016
7062
|
});
|
|
7017
7063
|
if (!ext.fallback?.isEmpty(result.data)) {
|
|
7018
7064
|
return result;
|
|
@@ -7035,6 +7081,7 @@ async function runFocusedExtractorWithFallback(params) {
|
|
|
7035
7081
|
const hintTokens = fallbackExt.maxTokens ?? 4096;
|
|
7036
7082
|
const taskKind = hintTokens >= 8192 ? "extraction_long_list" : "extraction_focused";
|
|
7037
7083
|
const budget = resolveBudget(taskKind, hintTokens);
|
|
7084
|
+
const startedAt = Date.now();
|
|
7038
7085
|
const fallbackResult = await runExtractor({
|
|
7039
7086
|
name: ext.fallback.extractorName,
|
|
7040
7087
|
prompt: fallbackExt.buildPrompt(),
|
|
@@ -7046,12 +7093,15 @@ async function runFocusedExtractorWithFallback(params) {
|
|
|
7046
7093
|
convertPdfToImages,
|
|
7047
7094
|
maxTokens: budget.maxTokens,
|
|
7048
7095
|
providerOptions,
|
|
7049
|
-
pageRangeCache
|
|
7096
|
+
pageRangeCache,
|
|
7097
|
+
getPageRangePdf,
|
|
7098
|
+
getPageImages
|
|
7050
7099
|
});
|
|
7051
7100
|
trackUsage(fallbackResult.usage, {
|
|
7052
7101
|
taskKind,
|
|
7053
7102
|
label: ext.fallback.extractorName,
|
|
7054
|
-
maxTokens: budget.maxTokens
|
|
7103
|
+
maxTokens: budget.maxTokens,
|
|
7104
|
+
durationMs: Date.now() - startedAt
|
|
7055
7105
|
});
|
|
7056
7106
|
const focusedData = ext.fallback.deriveFocusedResult(fallbackResult.data);
|
|
7057
7107
|
return focusedData ? [
|
|
@@ -7712,7 +7762,11 @@ function createExtractor(config) {
|
|
|
7712
7762
|
generateObject,
|
|
7713
7763
|
convertPdfToImages,
|
|
7714
7764
|
concurrency = 2,
|
|
7765
|
+
pageMapConcurrency,
|
|
7766
|
+
extractorConcurrency,
|
|
7767
|
+
formatConcurrency,
|
|
7715
7768
|
maxReviewRounds = 2,
|
|
7769
|
+
reviewMode = "auto",
|
|
7716
7770
|
onTokenUsage,
|
|
7717
7771
|
onProgress,
|
|
7718
7772
|
log,
|
|
@@ -7723,7 +7777,8 @@ function createExtractor(config) {
|
|
|
7723
7777
|
modelBudgetConstraints,
|
|
7724
7778
|
onCheckpointSave
|
|
7725
7779
|
} = config;
|
|
7726
|
-
const
|
|
7780
|
+
const pageMapLimit = pLimit(pageMapConcurrency ?? concurrency);
|
|
7781
|
+
const extractorLimit = pLimit(extractorConcurrency ?? concurrency);
|
|
7727
7782
|
const extractorCatalog = formatExtractorCatalogForPrompt();
|
|
7728
7783
|
let totalUsage = { inputTokens: 0, outputTokens: 0 };
|
|
7729
7784
|
let modelCalls = 0;
|
|
@@ -7758,7 +7813,7 @@ function createExtractor(config) {
|
|
|
7758
7813
|
usage,
|
|
7759
7814
|
usageReported: !!usage
|
|
7760
7815
|
});
|
|
7761
|
-
if (report.durationMs) {
|
|
7816
|
+
if (report.durationMs != null) {
|
|
7762
7817
|
performanceReport.totalModelCallDurationMs += report.durationMs;
|
|
7763
7818
|
}
|
|
7764
7819
|
}
|
|
@@ -7815,6 +7870,46 @@ function createExtractor(config) {
|
|
|
7815
7870
|
return textIncludesSupplementarySignal(JSON.stringify(value));
|
|
7816
7871
|
});
|
|
7817
7872
|
}
|
|
7873
|
+
function getSupplementaryPageRanges(pageAssignments, formInventory) {
|
|
7874
|
+
const pages = /* @__PURE__ */ new Set();
|
|
7875
|
+
for (const assignment of pageAssignments) {
|
|
7876
|
+
if (assignment.pageRole === "supplementary" || assignment.extractorNames.includes("supplementary") || textIncludesSupplementarySignal(assignment.notes)) {
|
|
7877
|
+
pages.add(assignment.localPageNumber);
|
|
7878
|
+
}
|
|
7879
|
+
}
|
|
7880
|
+
for (const form of formInventory?.forms ?? []) {
|
|
7881
|
+
if (form.formType === "notice" || textIncludesSupplementarySignal(form.title) || textIncludesSupplementarySignal(form.formNumber)) {
|
|
7882
|
+
const startPage2 = form.pageStart;
|
|
7883
|
+
const endPage = form.pageEnd ?? form.pageStart;
|
|
7884
|
+
if (typeof startPage2 !== "number" || typeof endPage !== "number") continue;
|
|
7885
|
+
for (let page = startPage2; page <= endPage; page += 1) {
|
|
7886
|
+
pages.add(page);
|
|
7887
|
+
}
|
|
7888
|
+
}
|
|
7889
|
+
}
|
|
7890
|
+
const sortedPages = [...pages].sort((a, b) => a - b);
|
|
7891
|
+
if (sortedPages.length === 0) return [];
|
|
7892
|
+
const ranges = [];
|
|
7893
|
+
let startPage = sortedPages[0];
|
|
7894
|
+
let previousPage = sortedPages[0];
|
|
7895
|
+
for (const page of sortedPages.slice(1)) {
|
|
7896
|
+
if (page === previousPage + 1) {
|
|
7897
|
+
previousPage = page;
|
|
7898
|
+
continue;
|
|
7899
|
+
}
|
|
7900
|
+
ranges.push({ startPage, endPage: previousPage });
|
|
7901
|
+
startPage = page;
|
|
7902
|
+
previousPage = page;
|
|
7903
|
+
}
|
|
7904
|
+
ranges.push({ startPage, endPage: previousPage });
|
|
7905
|
+
return ranges;
|
|
7906
|
+
}
|
|
7907
|
+
function shouldRunLlmReview(mode, report, sourceSpansAvailable) {
|
|
7908
|
+
if (mode === "skip" || maxReviewRounds <= 0) return false;
|
|
7909
|
+
if (mode === "always") return true;
|
|
7910
|
+
if (!sourceSpansAvailable) return true;
|
|
7911
|
+
return report.qualityGateStatus !== "passed" || report.issues.length > 0;
|
|
7912
|
+
}
|
|
7818
7913
|
function buildAlreadyExtractedSummary(memory) {
|
|
7819
7914
|
const lines = [];
|
|
7820
7915
|
const declarationResult = readMemoryRecord(memory, "declarations");
|
|
@@ -7844,10 +7939,11 @@ function createExtractor(config) {
|
|
|
7844
7939
|
}
|
|
7845
7940
|
return lines.length > 0 ? lines.join("\n") : "";
|
|
7846
7941
|
}
|
|
7847
|
-
async function runFocusedExtractorTask(task, pdfInput, memory, pageRangeCache) {
|
|
7942
|
+
async function runFocusedExtractorTask(task, pdfInput, memory, pageRangeCache, getPageRangePdf, getPageImages) {
|
|
7848
7943
|
if (task.extractorName === "supplementary") {
|
|
7849
7944
|
const alreadyExtractedSummary = buildAlreadyExtractedSummary(memory);
|
|
7850
7945
|
const budget = resolveBudget("extraction_focused", 4096);
|
|
7946
|
+
const startedAt = Date.now();
|
|
7851
7947
|
const result = await runExtractor({
|
|
7852
7948
|
name: "supplementary",
|
|
7853
7949
|
prompt: buildSupplementaryPrompt(alreadyExtractedSummary),
|
|
@@ -7859,12 +7955,15 @@ function createExtractor(config) {
|
|
|
7859
7955
|
convertPdfToImages,
|
|
7860
7956
|
maxTokens: budget.maxTokens,
|
|
7861
7957
|
providerOptions: activeProviderOptions,
|
|
7862
|
-
pageRangeCache
|
|
7958
|
+
pageRangeCache,
|
|
7959
|
+
getPageRangePdf,
|
|
7960
|
+
getPageImages
|
|
7863
7961
|
});
|
|
7864
7962
|
trackUsage(result.usage, {
|
|
7865
7963
|
taskKind: "extraction_focused",
|
|
7866
7964
|
label: "supplementary",
|
|
7867
|
-
maxTokens: budget.maxTokens
|
|
7965
|
+
maxTokens: budget.maxTokens,
|
|
7966
|
+
durationMs: Date.now() - startedAt
|
|
7868
7967
|
});
|
|
7869
7968
|
return result;
|
|
7870
7969
|
}
|
|
@@ -7875,6 +7974,8 @@ function createExtractor(config) {
|
|
|
7875
7974
|
convertPdfToImages,
|
|
7876
7975
|
providerOptions: activeProviderOptions,
|
|
7877
7976
|
pageRangeCache,
|
|
7977
|
+
getPageRangePdf,
|
|
7978
|
+
getPageImages,
|
|
7878
7979
|
trackUsage,
|
|
7879
7980
|
resolveBudget,
|
|
7880
7981
|
log
|
|
@@ -7923,20 +8024,68 @@ function createExtractor(config) {
|
|
|
7923
8024
|
}
|
|
7924
8025
|
}
|
|
7925
8026
|
let pdfBase64Cache;
|
|
8027
|
+
const completedPageRangePdfCache = /* @__PURE__ */ new Map();
|
|
7926
8028
|
const pageRangePdfCache = /* @__PURE__ */ new Map();
|
|
8029
|
+
const pageRangeImageCache = /* @__PURE__ */ new Map();
|
|
8030
|
+
let pdfSlicerPromise;
|
|
8031
|
+
let fullPdfProviderOptionsPromise;
|
|
8032
|
+
let pageCountPromise;
|
|
7927
8033
|
async function getPdfBase64ForExtraction() {
|
|
7928
8034
|
if (pdfBase64Cache === void 0) {
|
|
7929
8035
|
pdfBase64Cache = await pdfInputToBase64(pdfInput);
|
|
7930
8036
|
}
|
|
7931
8037
|
return pdfBase64Cache;
|
|
7932
8038
|
}
|
|
8039
|
+
async function getCachedPageCount() {
|
|
8040
|
+
if (!pageCountPromise) {
|
|
8041
|
+
pageCountPromise = getPdfSlicer().then((slicer) => slicer.getPageCount()).catch(() => getPdfPageCount(pdfInput));
|
|
8042
|
+
}
|
|
8043
|
+
return pageCountPromise;
|
|
8044
|
+
}
|
|
8045
|
+
async function getFullPdfProviderOptions() {
|
|
8046
|
+
if (!fullPdfProviderOptionsPromise) {
|
|
8047
|
+
fullPdfProviderOptionsPromise = buildPdfProviderOptions(pdfInput, activeProviderOptions);
|
|
8048
|
+
}
|
|
8049
|
+
return fullPdfProviderOptionsPromise;
|
|
8050
|
+
}
|
|
8051
|
+
async function getPdfSlicer() {
|
|
8052
|
+
if (!pdfSlicerPromise) {
|
|
8053
|
+
pdfSlicerPromise = createPdfPageSlicer(pdfInput);
|
|
8054
|
+
}
|
|
8055
|
+
return pdfSlicerPromise;
|
|
8056
|
+
}
|
|
7933
8057
|
async function getPageRangePdf(startPage, endPage) {
|
|
7934
8058
|
const cacheKey = `${startPage}-${endPage}`;
|
|
7935
|
-
const cached =
|
|
8059
|
+
const cached = completedPageRangePdfCache.get(cacheKey);
|
|
8060
|
+
if (cached) return cached;
|
|
8061
|
+
const pending = pageRangePdfCache.get(cacheKey);
|
|
8062
|
+
if (pending) return pending;
|
|
8063
|
+
const promise = (async () => {
|
|
8064
|
+
const slicer = await getPdfSlicer();
|
|
8065
|
+
const pagesPdf = await slicer.extractPageRange(startPage, endPage);
|
|
8066
|
+
completedPageRangePdfCache.set(cacheKey, pagesPdf);
|
|
8067
|
+
return pagesPdf;
|
|
8068
|
+
})().catch((error) => {
|
|
8069
|
+
pageRangePdfCache.delete(cacheKey);
|
|
8070
|
+
throw error;
|
|
8071
|
+
});
|
|
8072
|
+
pageRangePdfCache.set(cacheKey, promise);
|
|
8073
|
+
return promise;
|
|
8074
|
+
}
|
|
8075
|
+
async function getPageImages(startPage, endPage) {
|
|
8076
|
+
if (!convertPdfToImages) return [];
|
|
8077
|
+
const cacheKey = `${startPage}-${endPage}`;
|
|
8078
|
+
const cached = pageRangeImageCache.get(cacheKey);
|
|
7936
8079
|
if (cached) return cached;
|
|
7937
|
-
const
|
|
7938
|
-
|
|
7939
|
-
|
|
8080
|
+
const promise = (async () => {
|
|
8081
|
+
const pdfBase64 = await getPdfBase64ForExtraction();
|
|
8082
|
+
return convertPdfToImages(pdfBase64, startPage, endPage);
|
|
8083
|
+
})().catch((error) => {
|
|
8084
|
+
pageRangeImageCache.delete(cacheKey);
|
|
8085
|
+
throw error;
|
|
8086
|
+
});
|
|
8087
|
+
pageRangeImageCache.set(cacheKey, promise);
|
|
8088
|
+
return promise;
|
|
7940
8089
|
}
|
|
7941
8090
|
let classifyResult;
|
|
7942
8091
|
if (resumed?.classifyResult && pipelineCtx.isPhaseComplete("classify")) {
|
|
@@ -7944,15 +8093,16 @@ function createExtractor(config) {
|
|
|
7944
8093
|
onProgress?.("Resuming from checkpoint (classify complete)...");
|
|
7945
8094
|
} else {
|
|
7946
8095
|
onProgress?.("Classifying document...");
|
|
7947
|
-
const pageCount2 = await
|
|
8096
|
+
const pageCount2 = await getCachedPageCount();
|
|
7948
8097
|
const budget = resolveBudget("extraction_classify", 512);
|
|
8098
|
+
const startedAt = Date.now();
|
|
7949
8099
|
const classifyResponse = await safeGenerateObject(
|
|
7950
8100
|
generateObject,
|
|
7951
8101
|
{
|
|
7952
8102
|
prompt: buildClassifyPrompt(),
|
|
7953
8103
|
schema: ClassifyResultSchema,
|
|
7954
8104
|
maxTokens: budget.maxTokens,
|
|
7955
|
-
providerOptions: await
|
|
8105
|
+
providerOptions: await getFullPdfProviderOptions()
|
|
7956
8106
|
},
|
|
7957
8107
|
{
|
|
7958
8108
|
fallback: { documentType: "policy", policyTypes: ["other"], confidence: 0 },
|
|
@@ -7964,7 +8114,8 @@ function createExtractor(config) {
|
|
|
7964
8114
|
trackUsage(classifyResponse.usage, {
|
|
7965
8115
|
taskKind: "extraction_classify",
|
|
7966
8116
|
label: "classify",
|
|
7967
|
-
maxTokens: budget.maxTokens
|
|
8117
|
+
maxTokens: budget.maxTokens,
|
|
8118
|
+
durationMs: Date.now() - startedAt
|
|
7968
8119
|
});
|
|
7969
8120
|
classifyResult = classifyResponse.object;
|
|
7970
8121
|
if (classifyResult.confidence === 0) {
|
|
@@ -7982,7 +8133,7 @@ function createExtractor(config) {
|
|
|
7982
8133
|
const policyTypes = classifyResult.policyTypes ?? [];
|
|
7983
8134
|
const primaryType = policyTypes[0] ?? "other";
|
|
7984
8135
|
const template = getTemplate(primaryType);
|
|
7985
|
-
const pageCount = resumed?.pageCount ?? await
|
|
8136
|
+
const pageCount = resumed?.pageCount ?? await getCachedPageCount();
|
|
7986
8137
|
const templateHints = buildTemplateHints(primaryType, documentType, pageCount, template);
|
|
7987
8138
|
let formInventory;
|
|
7988
8139
|
if (resumed?.formInventory && pipelineCtx.isPhaseComplete("form_inventory")) {
|
|
@@ -7992,13 +8143,14 @@ function createExtractor(config) {
|
|
|
7992
8143
|
} else {
|
|
7993
8144
|
onProgress?.(`Building form inventory for ${primaryType} ${documentType}...`);
|
|
7994
8145
|
const budget = resolveBudget("extraction_form_inventory", 2048);
|
|
8146
|
+
const startedAt = Date.now();
|
|
7995
8147
|
const formInventoryResponse = await safeGenerateObject(
|
|
7996
8148
|
generateObject,
|
|
7997
8149
|
{
|
|
7998
8150
|
prompt: buildFormInventoryPrompt(templateHints),
|
|
7999
8151
|
schema: FormInventorySchema,
|
|
8000
8152
|
maxTokens: budget.maxTokens,
|
|
8001
|
-
providerOptions: await
|
|
8153
|
+
providerOptions: await getFullPdfProviderOptions()
|
|
8002
8154
|
},
|
|
8003
8155
|
{
|
|
8004
8156
|
fallback: { forms: [] },
|
|
@@ -8009,7 +8161,8 @@ function createExtractor(config) {
|
|
|
8009
8161
|
trackUsage(formInventoryResponse.usage, {
|
|
8010
8162
|
taskKind: "extraction_form_inventory",
|
|
8011
8163
|
label: "form_inventory",
|
|
8012
|
-
maxTokens: budget.maxTokens
|
|
8164
|
+
maxTokens: budget.maxTokens,
|
|
8165
|
+
durationMs: Date.now() - startedAt
|
|
8013
8166
|
});
|
|
8014
8167
|
formInventory = formInventoryResponse.object;
|
|
8015
8168
|
memory.set("form_inventory", formInventory);
|
|
@@ -8039,9 +8192,10 @@ function createExtractor(config) {
|
|
|
8039
8192
|
);
|
|
8040
8193
|
const pageMapResults = await Promise.all(
|
|
8041
8194
|
pageMapChunks.map(
|
|
8042
|
-
({ startPage, endPage }) =>
|
|
8195
|
+
({ startPage, endPage }) => pageMapLimit(async () => {
|
|
8043
8196
|
const pagesPdf = await getPageRangePdf(startPage, endPage);
|
|
8044
8197
|
const budget = resolveBudget("extraction_page_map", 2048);
|
|
8198
|
+
const startedAt = Date.now();
|
|
8045
8199
|
const mapResponse = await safeGenerateObject(
|
|
8046
8200
|
generateObject,
|
|
8047
8201
|
{
|
|
@@ -8066,7 +8220,8 @@ function createExtractor(config) {
|
|
|
8066
8220
|
trackUsage(mapResponse.usage, {
|
|
8067
8221
|
taskKind: "extraction_page_map",
|
|
8068
8222
|
label: `page_map:${startPage}-${endPage}`,
|
|
8069
|
-
maxTokens: budget.maxTokens
|
|
8223
|
+
maxTokens: budget.maxTokens,
|
|
8224
|
+
durationMs: Date.now() - startedAt
|
|
8070
8225
|
});
|
|
8071
8226
|
return mapResponse.object.pages.map((assignment) => ({
|
|
8072
8227
|
...assignment,
|
|
@@ -8112,14 +8267,32 @@ function createExtractor(config) {
|
|
|
8112
8267
|
});
|
|
8113
8268
|
}
|
|
8114
8269
|
if (!pipelineCtx.isPhaseComplete("extract")) {
|
|
8115
|
-
const
|
|
8270
|
+
const supplementaryRanges = getSupplementaryPageRanges(pageAssignments, formInventory);
|
|
8271
|
+
const baseTasks = plan.tasks;
|
|
8272
|
+
const hasPlannedSupplementary = baseTasks.some((task) => task.extractorName === "supplementary");
|
|
8273
|
+
const tasks = hasPlannedSupplementary || supplementaryRanges.length === 0 ? baseTasks : [
|
|
8274
|
+
...baseTasks,
|
|
8275
|
+
...supplementaryRanges.map((range) => ({
|
|
8276
|
+
extractorName: "supplementary",
|
|
8277
|
+
startPage: range.startPage,
|
|
8278
|
+
endPage: range.endPage,
|
|
8279
|
+
description: `Page-signaled supplementary extraction for pages ${range.startPage}-${range.endPage}`
|
|
8280
|
+
}))
|
|
8281
|
+
];
|
|
8116
8282
|
onProgress?.(`Dispatching ${tasks.length} extractors...`);
|
|
8117
8283
|
const extractionPdfInput = await getPdfBase64ForExtraction();
|
|
8118
8284
|
const extractorResults = await Promise.all(
|
|
8119
8285
|
tasks.map(
|
|
8120
|
-
(task) =>
|
|
8286
|
+
(task) => extractorLimit(async () => {
|
|
8121
8287
|
onProgress?.(`Extracting ${task.extractorName} (pages ${task.startPage}-${task.endPage})...`);
|
|
8122
|
-
return runFocusedExtractorTask(
|
|
8288
|
+
return runFocusedExtractorTask(
|
|
8289
|
+
task,
|
|
8290
|
+
extractionPdfInput,
|
|
8291
|
+
memory,
|
|
8292
|
+
completedPageRangePdfCache,
|
|
8293
|
+
getPageRangePdf,
|
|
8294
|
+
convertPdfToImages ? getPageImages : void 0
|
|
8295
|
+
);
|
|
8123
8296
|
})
|
|
8124
8297
|
)
|
|
8125
8298
|
);
|
|
@@ -8134,6 +8307,7 @@ function createExtractor(config) {
|
|
|
8134
8307
|
try {
|
|
8135
8308
|
const alreadyExtractedSummary = buildAlreadyExtractedSummary(memory);
|
|
8136
8309
|
const budget = resolveBudget("extraction_focused", 4096);
|
|
8310
|
+
const startedAt = Date.now();
|
|
8137
8311
|
const supplementaryResult = await runExtractor({
|
|
8138
8312
|
name: "supplementary",
|
|
8139
8313
|
prompt: buildSupplementaryPrompt(alreadyExtractedSummary),
|
|
@@ -8145,12 +8319,15 @@ function createExtractor(config) {
|
|
|
8145
8319
|
convertPdfToImages,
|
|
8146
8320
|
maxTokens: budget.maxTokens,
|
|
8147
8321
|
providerOptions: activeProviderOptions,
|
|
8148
|
-
pageRangeCache:
|
|
8322
|
+
pageRangeCache: completedPageRangePdfCache,
|
|
8323
|
+
getPageRangePdf,
|
|
8324
|
+
getPageImages: convertPdfToImages ? getPageImages : void 0
|
|
8149
8325
|
});
|
|
8150
8326
|
trackUsage(supplementaryResult.usage, {
|
|
8151
8327
|
taskKind: "extraction_focused",
|
|
8152
8328
|
label: "supplementary",
|
|
8153
|
-
maxTokens: budget.maxTokens
|
|
8329
|
+
maxTokens: budget.maxTokens,
|
|
8330
|
+
durationMs: Date.now() - startedAt
|
|
8154
8331
|
});
|
|
8155
8332
|
mergeMemoryResult(supplementaryResult.name, supplementaryResult.data, memory);
|
|
8156
8333
|
} catch (error) {
|
|
@@ -8170,6 +8347,7 @@ function createExtractor(config) {
|
|
|
8170
8347
|
if (!pipelineCtx.isPhaseComplete("resolve_referential")) {
|
|
8171
8348
|
onProgress?.("Resolving referential coverage limits...");
|
|
8172
8349
|
try {
|
|
8350
|
+
const startedAt = Date.now();
|
|
8173
8351
|
const resolution = await resolveReferentialCoverages({
|
|
8174
8352
|
memory,
|
|
8175
8353
|
pdfInput,
|
|
@@ -8177,6 +8355,8 @@ function createExtractor(config) {
|
|
|
8177
8355
|
generateObject,
|
|
8178
8356
|
convertPdfToImages,
|
|
8179
8357
|
concurrency,
|
|
8358
|
+
getPageRangePdf,
|
|
8359
|
+
getPageImages: convertPdfToImages ? getPageImages : void 0,
|
|
8180
8360
|
providerOptions: activeProviderOptions,
|
|
8181
8361
|
modelCapabilities,
|
|
8182
8362
|
modelBudgetConstraints,
|
|
@@ -8185,7 +8365,8 @@ function createExtractor(config) {
|
|
|
8185
8365
|
});
|
|
8186
8366
|
trackUsage(resolution.usage, {
|
|
8187
8367
|
taskKind: "extraction_referential_lookup",
|
|
8188
|
-
label: "referential_resolution"
|
|
8368
|
+
label: "referential_resolution",
|
|
8369
|
+
durationMs: Date.now() - startedAt
|
|
8189
8370
|
});
|
|
8190
8371
|
if (resolution.attempts > 0) {
|
|
8191
8372
|
await log?.(`Referential resolution: ${resolution.resolved}/${resolution.attempts} resolved, ${resolution.unresolved} unresolved`);
|
|
@@ -8207,52 +8388,72 @@ function createExtractor(config) {
|
|
|
8207
8388
|
let reviewReport = resumed?.reviewReport;
|
|
8208
8389
|
if (!pipelineCtx.isPhaseComplete("review")) {
|
|
8209
8390
|
reviewRounds = [];
|
|
8210
|
-
|
|
8211
|
-
|
|
8212
|
-
|
|
8213
|
-
|
|
8214
|
-
|
|
8215
|
-
|
|
8216
|
-
|
|
8217
|
-
|
|
8218
|
-
|
|
8219
|
-
|
|
8391
|
+
groundExtractionMemoryWithSourceSpans(memory, sourceSpans);
|
|
8392
|
+
const preReviewReport = buildExtractionReviewReport({
|
|
8393
|
+
memory,
|
|
8394
|
+
pageAssignments,
|
|
8395
|
+
reviewRounds,
|
|
8396
|
+
sourceSpansAvailable: sourceSpans.length > 0
|
|
8397
|
+
});
|
|
8398
|
+
if (shouldRunLlmReview(reviewMode, preReviewReport, sourceSpans.length > 0)) {
|
|
8399
|
+
for (let round = 0; round < maxReviewRounds; round++) {
|
|
8400
|
+
const extractedKeys = [...memory.keys()].filter((k) => k !== "classify");
|
|
8401
|
+
const extractionSummary = summarizeExtraction(memory);
|
|
8402
|
+
const pageMapSummary = formatPageMapSummary(pageAssignments);
|
|
8403
|
+
const budget = resolveBudget("extraction_review", 1536);
|
|
8404
|
+
const startedAt = Date.now();
|
|
8405
|
+
const reviewResponse = await safeGenerateObject(
|
|
8406
|
+
generateObject,
|
|
8407
|
+
{
|
|
8408
|
+
prompt: buildReviewPrompt(template.required, extractedKeys, extractionSummary, pageMapSummary, extractorCatalog),
|
|
8409
|
+
schema: ReviewResultSchema,
|
|
8410
|
+
maxTokens: budget.maxTokens,
|
|
8411
|
+
providerOptions: await getFullPdfProviderOptions()
|
|
8412
|
+
},
|
|
8413
|
+
{
|
|
8414
|
+
fallback: { complete: true, missingFields: [], qualityIssues: [], additionalTasks: [] },
|
|
8415
|
+
log,
|
|
8416
|
+
onError: (err, attempt) => log?.(`Review round ${round + 1} attempt ${attempt + 1} failed: ${err}`)
|
|
8417
|
+
}
|
|
8418
|
+
);
|
|
8419
|
+
trackUsage(reviewResponse.usage, {
|
|
8420
|
+
taskKind: "extraction_review",
|
|
8421
|
+
label: `review:${round + 1}`,
|
|
8220
8422
|
maxTokens: budget.maxTokens,
|
|
8221
|
-
|
|
8222
|
-
}
|
|
8223
|
-
|
|
8224
|
-
|
|
8225
|
-
log
|
|
8226
|
-
onError: (err, attempt) => log?.(`Review round ${round + 1} attempt ${attempt + 1} failed: ${err}`)
|
|
8423
|
+
durationMs: Date.now() - startedAt
|
|
8424
|
+
});
|
|
8425
|
+
reviewRounds.push(toReviewRoundRecord(round + 1, reviewResponse.object));
|
|
8426
|
+
if (reviewResponse.object.qualityIssues?.length) {
|
|
8427
|
+
await log?.(`Review round ${round + 1} quality issues: ${reviewResponse.object.qualityIssues.join("; ")}`);
|
|
8227
8428
|
}
|
|
8228
|
-
|
|
8229
|
-
|
|
8230
|
-
|
|
8231
|
-
|
|
8232
|
-
|
|
8233
|
-
|
|
8234
|
-
|
|
8235
|
-
|
|
8236
|
-
|
|
8237
|
-
|
|
8238
|
-
|
|
8239
|
-
|
|
8240
|
-
|
|
8241
|
-
|
|
8242
|
-
|
|
8243
|
-
|
|
8244
|
-
|
|
8245
|
-
|
|
8246
|
-
|
|
8247
|
-
|
|
8248
|
-
|
|
8249
|
-
|
|
8250
|
-
|
|
8251
|
-
|
|
8252
|
-
if (result) {
|
|
8253
|
-
mergeMemoryResult(result.name, result.data, memory);
|
|
8429
|
+
if (reviewResponse.object.complete || reviewResponse.object.additionalTasks.length === 0) {
|
|
8430
|
+
onProgress?.("Extraction complete.");
|
|
8431
|
+
break;
|
|
8432
|
+
}
|
|
8433
|
+
onProgress?.(`Review round ${round + 1}: dispatching ${reviewResponse.object.additionalTasks.length} follow-up extractors...`);
|
|
8434
|
+
const extractionPdfInput = await getPdfBase64ForExtraction();
|
|
8435
|
+
const followUpResults = await Promise.all(
|
|
8436
|
+
reviewResponse.object.additionalTasks.map(
|
|
8437
|
+
(task) => extractorLimit(async () => {
|
|
8438
|
+
return runFocusedExtractorTask(
|
|
8439
|
+
task,
|
|
8440
|
+
extractionPdfInput,
|
|
8441
|
+
memory,
|
|
8442
|
+
completedPageRangePdfCache,
|
|
8443
|
+
getPageRangePdf,
|
|
8444
|
+
convertPdfToImages ? getPageImages : void 0
|
|
8445
|
+
);
|
|
8446
|
+
})
|
|
8447
|
+
)
|
|
8448
|
+
);
|
|
8449
|
+
for (const result of followUpResults.flatMap((item) => Array.isArray(item) ? item : item ? [item] : [])) {
|
|
8450
|
+
if (result) {
|
|
8451
|
+
mergeMemoryResult(result.name, result.data, memory);
|
|
8452
|
+
}
|
|
8254
8453
|
}
|
|
8255
8454
|
}
|
|
8455
|
+
} else {
|
|
8456
|
+
onProgress?.("Skipping LLM extraction review; deterministic checks passed.");
|
|
8256
8457
|
}
|
|
8257
8458
|
groundExtractionMemoryWithSourceSpans(memory, sourceSpans);
|
|
8258
8459
|
reviewReport = buildExtractionReviewReport({
|
|
@@ -8304,6 +8505,7 @@ function createExtractor(config) {
|
|
|
8304
8505
|
onProgress?.("Generating document summary...");
|
|
8305
8506
|
try {
|
|
8306
8507
|
const budget = resolveBudget("extraction_summary", 512);
|
|
8508
|
+
const startedAt = Date.now();
|
|
8307
8509
|
const summaryResponse = await safeGenerateObject(
|
|
8308
8510
|
generateObject,
|
|
8309
8511
|
{
|
|
@@ -8321,7 +8523,8 @@ function createExtractor(config) {
|
|
|
8321
8523
|
trackUsage(summaryResponse.usage, {
|
|
8322
8524
|
taskKind: "extraction_summary",
|
|
8323
8525
|
label: "summary",
|
|
8324
|
-
maxTokens: budget.maxTokens
|
|
8526
|
+
maxTokens: budget.maxTokens,
|
|
8527
|
+
durationMs: Date.now() - startedAt
|
|
8325
8528
|
});
|
|
8326
8529
|
if (summaryResponse.object.summary) {
|
|
8327
8530
|
document.summary = summaryResponse.object.summary;
|
|
@@ -8332,16 +8535,19 @@ function createExtractor(config) {
|
|
|
8332
8535
|
}
|
|
8333
8536
|
onProgress?.("Formatting extracted content...");
|
|
8334
8537
|
const formatBudget = resolveBudget("extraction_format", 16384);
|
|
8538
|
+
const formatStartedAt = Date.now();
|
|
8335
8539
|
const formatResult = await formatDocumentContent(document, generateText, {
|
|
8336
8540
|
providerOptions: activeProviderOptions,
|
|
8337
8541
|
maxTokens: formatBudget.maxTokens,
|
|
8542
|
+
concurrency: formatConcurrency ?? concurrency,
|
|
8338
8543
|
onProgress,
|
|
8339
8544
|
log
|
|
8340
8545
|
});
|
|
8341
8546
|
trackUsage(formatResult.usage, {
|
|
8342
8547
|
taskKind: "extraction_format",
|
|
8343
8548
|
label: "format",
|
|
8344
|
-
maxTokens: formatBudget.maxTokens
|
|
8549
|
+
maxTokens: formatBudget.maxTokens,
|
|
8550
|
+
durationMs: Date.now() - formatStartedAt
|
|
8345
8551
|
});
|
|
8346
8552
|
const chunks = chunkDocument(formatResult.document);
|
|
8347
8553
|
const finalCheckpoint = pipelineCtx.getCheckpoint();
|