@claritylabs/cl-sdk 1.0.3 → 1.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +22 -5
- package/dist/index.d.mts +82 -2
- package/dist/index.d.ts +82 -2
- package/dist/index.js +457 -51
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +452 -51
- package/dist/index.mjs.map +1 -1
- package/package.json +1 -1
package/dist/index.mjs
CHANGED
|
@@ -2471,6 +2471,254 @@ async function overlayTextOnPdf(pdfBytes, overlays) {
|
|
|
2471
2471
|
return await pdfDoc.save();
|
|
2472
2472
|
}
|
|
2473
2473
|
|
|
2474
|
+
// src/extraction/docling.ts
|
|
2475
|
+
/**
 * Type guard for Docling-based extraction input.
 *
 * @param {unknown} input - Candidate extraction input.
 * @returns {boolean} True only when `input` is an object tagged
 *   `kind: "docling_document"` that carries an object-valued `document`.
 */
function isDoclingExtractionInput(input) {
  if (!input || typeof input !== "object") return false;
  if (input.kind !== "docling_document") return false;
  const { document } = input;
  return Boolean(document) && typeof document === "object";
}
|
|
2480
|
+
/**
 * Flatten a Docling document into ordered text units, per-page text and
 * source spans.
 *
 * Items are taken in body reading order when the body yields any refs;
 * otherwise the fallback ordering of the document's item arrays is used.
 * Units without a page are filed under page 1.
 *
 * @param {object} document - Docling document object.
 * @param {object} options - Reads `documentId` and optional `sourceKind`
 *   (defaults to "policy_pdf").
 * @returns {{pageCount: number, fullText: string, pageTexts: Map, units: object[], sourceSpans: object[]}}
 */
function normalizeDoclingDocument(document, options) {
  const itemMap = buildItemMap(document);
  const bodyRefs = getOrderedBodyRefs(document, itemMap);
  const orderedItems = bodyRefs.length === 0
    ? getFallbackOrderedItems(document, itemMap)
    : bodyRefs.map((ref) => itemMap.get(ref)).filter((entry) => Boolean(entry));

  // Keep only items that normalize to non-blank text.
  const units = [];
  for (const { ref, item } of orderedItems) {
    const unit = normalizeItem(ref, item);
    if (unit && unit.text.trim()) units.push(unit);
  }

  const pageCount = inferPageCount(document, units);

  // Accumulate unit text per (clamped) start page.
  const pageTexts = /* @__PURE__ */ new Map();
  units.forEach((unit) => {
    const page = clampPage(unit.pageStart ?? 1, pageCount);
    pageTexts.set(page, appendText(pageTexts.get(page), unit.text));
  });

  const renderPage = (_, index) => {
    const pageNumber = index + 1;
    const text = pageTexts.get(pageNumber)?.trim();
    if (!text) return "";
    return `Page ${pageNumber}\n${text}`;
  };
  const fullText = Array.from({ length: pageCount }, renderPage).filter(Boolean).join("\n\n");

  const sourceKind = options.sourceKind ?? "policy_pdf";
  const sourceSpans = units.map((unit, index) => {
    const metadata = {
      sourceSystem: "docling",
      sourceUnit: "docling_item",
      doclingRef: unit.ref
    };
    if (unit.label) metadata.doclingLabel = unit.label;
    const span = buildSourceSpan(
      {
        documentId: options.documentId,
        sourceKind,
        text: unit.text,
        pageStart: unit.pageStart,
        pageEnd: unit.pageEnd,
        sectionId: unit.label,
        metadata
      },
      index
    );
    return {
      ...span,
      kind: "plain_text",
      bbox: unit.bboxes?.length ? unit.bboxes : void 0
    };
  });

  return { pageCount, fullText, pageTexts, units, sourceSpans };
}
|
|
2530
|
+
/**
 * Render the text of an inclusive page range from a normalized document.
 *
 * Both bounds are clamped with `clampPage`; pages without text are skipped.
 *
 * @param {{pageCount: number, pageTexts: Map}} normalized - Normalized document.
 * @param {number} startPage - First page of the range.
 * @param {number} endPage - Last page of the range.
 * @returns {string} "Page N" sections joined by blank lines, or "" when none.
 */
function getDoclingPageRangeText(normalized, startPage, endPage) {
  const firstPage = clampPage(startPage, normalized.pageCount);
  const lastPage = clampPage(endPage, normalized.pageCount);
  const sections = [];
  let page = firstPage;
  while (page <= lastPage) {
    const body = normalized.pageTexts.get(page)?.trim();
    if (body) sections.push(`Page ${page}\n${body}`);
    page++;
  }
  return sections.join("\n\n");
}
|
|
2543
|
+
/**
 * Extend provider options with the Docling full text and page count.
 *
 * @param {{fullText: string, pageCount: number}} normalized - Normalized document.
 * @param {object} [existingOptions] - Options to copy; same-named keys are overridden.
 * @returns {object} New options object with `doclingText` and `doclingPageCount`.
 */
function buildDoclingProviderOptions(normalized, existingOptions) {
  const doclingFields = {
    doclingText: normalized.fullText,
    doclingPageCount: normalized.pageCount
  };
  return { ...existingOptions, ...doclingFields };
}
|
|
2550
|
+
/**
 * De-duplicate source spans, keeping the first occurrence of each.
 *
 * Two spans are duplicates when they share document id, start page, end page,
 * section (or field path) and text hash; missing parts fall back to "na".
 *
 * @param {Iterable<object>} spans - Source spans in priority order.
 * @returns {object[]} Unique spans in first-seen order.
 */
function mergeSourceSpans(spans) {
  const firstByKey = /* @__PURE__ */ new Map();
  for (const span of spans) {
    const location = span.location;
    const startPart = span.pageStart ?? location?.startPage ?? location?.page ?? "na";
    const endPart = span.pageEnd ?? location?.endPage ?? span.pageStart ?? "na";
    const sectionPart = span.sectionId ?? location?.fieldPath ?? "na";
    // Hash is only computed when the span does not already carry one.
    const hashPart = span.textHash ?? sourceSpanTextHash(span.text);
    const key = [span.documentId, startPart, endPart, sectionPart, hashPart].join(":");
    if (!firstByKey.has(key)) {
      firstByKey.set(key, span);
    }
  }
  return [...firstByKey.values()];
}
|
|
2567
|
+
/**
 * Index a Docling document's texts, tables, key-value items and pictures
 * by reference.
 *
 * @param {object} document - Docling document; missing collections are treated as empty.
 * @returns {Map<string, {ref: string, item: object}>} Entries keyed by ref.
 */
function buildItemMap(document) {
  const itemsByRef = /* @__PURE__ */ new Map();
  const collections = [
    ["#/texts", document.texts],
    ["#/tables", document.tables],
    ["#/key_value_items", document.key_value_items ?? document.keyValueItems],
    ["#/pictures", document.pictures]
  ];
  for (const [baseRef, items] of collections) {
    addItems(itemsByRef, baseRef, items ?? []);
  }
  return itemsByRef;
}
|
|
2575
|
+
/**
 * Register every item of one collection in the ref map.
 *
 * @param {Map<string, {ref: string, item: object}>} map - Map to fill (mutated).
 * @param {string} baseRef - Collection prefix used to synthesize `${baseRef}/${index}`
 *   when an item has no self reference.
 * @param {object[]} items - Items of the collection.
 */
function addItems(map, baseRef, items) {
  items.forEach((item, position) => {
    const explicitRef = getSelfRef(item);
    const ref = explicitRef ?? `${baseRef}/${position}`;
    map.set(ref, { ref, item });
  });
}
|
|
2581
|
+
/**
 * Item ordering used when the document body yields no refs: all texts, then
 * tables, then key-value items, each in array order.
 *
 * @param {object} document - Docling document.
 * @param {Map<string, {ref: string, item: object}>} itemMap - Ref index.
 * @returns {{ref: string, item: object}[]} Map entries found for those refs.
 */
function getFallbackOrderedItems(document, itemMap) {
  const refsOf = (items, baseRef) => (items ?? []).map((item, index) => getSelfRef(item) ?? `${baseRef}/${index}`);
  const orderedRefs = [
    ...refsOf(document.texts, "#/texts"),
    ...refsOf(document.tables, "#/tables"),
    ...refsOf(document.key_value_items ?? document.keyValueItems, "#/key_value_items")
  ];
  const entries = [];
  for (const ref of orderedRefs) {
    const entry = itemMap.get(ref);
    if (entry) entries.push(entry);
  }
  return entries;
}
|
|
2589
|
+
/**
 * Collect item refs in reading order by walking `document.body` depth-first.
 *
 * Children may point at items (texts, tables, ...) or at groups; groups are
 * expanded in place and items may themselves have children. Every ref, item
 * or group, is expanded at most once, so a malformed document whose
 * `children` form a cycle terminates instead of overflowing the call stack.
 * For acyclic documents the result is the same as re-expanding repeated refs,
 * because a second expansion can only reach items that were already emitted.
 *
 * @param {object} document - Docling document (`body`, optional `groups`).
 * @param {Map<string, {ref: string, item: object}>} itemMap - Ref index of items.
 * @returns {string[]} Unique item refs in first-visit order.
 */
function getOrderedBodyRefs(document, itemMap) {
  const groupMap = /* @__PURE__ */ new Map();
  (document.groups ?? []).forEach((group, index) => {
    groupMap.set(getSelfRef(group) ?? `#/groups/${index}`, group);
  });
  const refs = [];
  const visited = /* @__PURE__ */ new Set();
  const visitRef = (ref) => {
    // Guard items AND groups: this is what breaks reference cycles.
    if (visited.has(ref)) return;
    visited.add(ref);
    const itemEntry = itemMap.get(ref);
    if (itemEntry) {
      refs.push(ref);
      visitNode(itemEntry.item);
      return;
    }
    // Unknown refs resolve to undefined, which visitNode treats as childless.
    visitNode(groupMap.get(ref));
  };
  const visitNode = (node) => {
    for (const child of node?.children ?? []) {
      const ref = getRef(child);
      if (!ref) continue;
      visitRef(ref);
    }
  };
  visitNode(document.body);
  return refs;
}
|
|
2618
|
+
/**
 * Convert one Docling item into a text unit.
 *
 * @param {string} ref - Reference under which the item is indexed.
 * @param {object} item - Docling item; `prov` entries supply pages and boxes.
 * @returns {{ref: string, label: (string|undefined), text: string, pageStart: (number|undefined), pageEnd: (number|undefined), bboxes: (object[]|undefined)}|undefined}
 *   The unit, or undefined when the item has no text.
 */
function normalizeItem(ref, item) {
  const text = getItemText(item).trim();
  if (!text) return void 0;

  const pages = [];
  const bboxes = [];
  (item.prov ?? []).forEach((prov) => {
    const page = getPageNumber(prov);
    // Only positive numeric page numbers count towards the page span.
    if (typeof page === "number" && page > 0) pages.push(page);
    const bbox = toSourceSpanBBox(prov);
    if (bbox) bboxes.push(bbox);
  });

  let pageStart;
  let pageEnd;
  if (pages.length > 0) {
    pageStart = Math.min(...pages);
    pageEnd = Math.max(...pages);
  }

  const label = typeof item.label === "string" ? item.label : void 0;
  return {
    ref,
    label,
    text,
    pageStart,
    pageEnd,
    bboxes: bboxes.length > 0 ? bboxes : void 0
  };
}
|
|
2634
|
+
/**
 * Pick the text of a Docling item.
 *
 * Preference order: non-blank `text`, non-blank `orig`, then the table
 * rendering of `data`.
 *
 * @param {object} item - Docling item.
 * @returns {string} The chosen text (untrimmed), or "" when none is available.
 */
function getItemText(item) {
  for (const candidate of [item.text, item.orig]) {
    if (typeof candidate === "string" && candidate.trim()) return candidate;
  }
  return tableToMarkdown(item.data) || "";
}
|
|
2641
|
+
/**
 * Render Docling table data as text.
 *
 * Cells are read from `table_cells` (or `tableCells`) and placed on a grid by
 * their row/column index. A single-row grid is rendered as its non-empty cells
 * joined by " | "; otherwise a Markdown table is produced with the first row
 * as header.
 *
 * Changes relative to the previous implementation:
 * - Cells whose row/column index is not a non-negative integer are skipped.
 *   Previously such an index addressed a row that does not exist and threw.
 * - The offset keys `start_row_offset_idx` / `start_col_offset_idx` (and their
 *   camelCase forms) are accepted as additional fallbacks.
 *   NOTE(review): these are believed to be the names docling-core serializes
 *   for `TableCell`; confirm against the docling-core reference. Existing keys
 *   keep their precedence.
 * - Grid extents are computed with a loop rather than `Math.max(...array)`, so
 *   very large tables cannot exceed the engine's argument limit.
 *
 * @param {unknown} data - The item's `data` payload.
 * @returns {string|undefined} Rendered table, or undefined when there are no usable cells.
 */
function tableToMarkdown(data) {
  const record = asRecord(data);
  const cells = Array.isArray(record?.table_cells) ? record.table_cells : Array.isArray(record?.tableCells) ? record.tableCells : void 0;
  if (!cells) return void 0;

  const isGridIndex = (value) => Number.isInteger(value) && value >= 0;
  const parsedCells = cells
    .map((cell) => asRecord(cell))
    .filter((cell) => Boolean(cell))
    .map((cell) => ({
      // `row_header` is kept for backward compatibility; non-numeric values are ignored.
      row: firstNumber2([
        cell.start_row_offset,
        cell.row_header,
        cell.row,
        cell.rowIndex,
        cell.start_row_offset_idx,
        cell.startRowOffsetIdx
      ]) ?? 0,
      col: firstNumber2([
        cell.start_col_offset,
        cell.col,
        cell.colIndex,
        cell.start_col_offset_idx,
        cell.startColOffsetIdx
      ]) ?? 0,
      text: firstString([cell.text, cell.orig, cell.content])
    }))
    .filter((cell) => cell.text && isGridIndex(cell.row) && isGridIndex(cell.col));
  if (parsedCells.length === 0) return void 0;

  let maxRow = 0;
  let maxCol = 0;
  for (const cell of parsedCells) {
    if (cell.row > maxRow) maxRow = cell.row;
    if (cell.col > maxCol) maxCol = cell.col;
  }

  const rows = Array.from({ length: maxRow + 1 }, () => Array.from({ length: maxCol + 1 }, () => ""));
  for (const cell of parsedCells) {
    rows[cell.row][cell.col] = cell.text;
  }

  if (rows.length === 1) return rows[0].filter(Boolean).join(" | ");
  const header = rows[0];
  const separator = header.map(() => "---");
  return [header, separator, ...rows.slice(1)].map((row) => `| ${row.map((value) => value.trim()).join(" | ")} |`).join("\n");
}
|
|
2662
|
+
/**
 * Determine how many pages the document has (always at least 1).
 *
 * Sources, in order:
 * 1. `document.pages` as an array: its length.
 * 2. `document.pages` as an object: the largest finite numeric key, or the
 *    number of keys when no positive numeric key exists.
 * 3. Otherwise the largest `pageStart` / `pageEnd` found on the units.
 *
 * Maxima are folded in loops rather than spread into `Math.max(...)`. The
 * spread form passes two arguments per unit and throws a RangeError once a
 * document has more units than the engine's argument limit allows.
 *
 * @param {object} document - Docling document.
 * @param {{pageStart?: number, pageEnd?: number}[]} units - Normalized units.
 * @returns {number} Page count, never below 1.
 */
function inferPageCount(document, units) {
  const pages = document.pages;
  if (Array.isArray(pages)) return Math.max(1, pages.length);
  if (pages && typeof pages === "object") {
    const keys = Object.keys(pages);
    let numericMax = 0;
    for (const key of keys) {
      const value = Number(key);
      if (Number.isFinite(value) && value > numericMax) numericMax = value;
    }
    return Math.max(1, numericMax || keys.length);
  }
  let highest = 1;
  for (const unit of units) {
    highest = Math.max(highest, unit.pageStart ?? 0, unit.pageEnd ?? 0);
  }
  return highest;
}
|
|
2672
|
+
/**
 * Read a node's own reference, accepting snake_case or camelCase.
 *
 * @param {object} value - Docling item or group.
 * @returns {*} `self_ref` when not nullish, otherwise `selfRef`.
 */
function getSelfRef(value) {
  const { self_ref: snakeRef, selfRef: camelRef } = value;
  return snakeRef ?? camelRef;
}
|
|
2675
|
+
/**
 * Resolve the reference a child entry points to.
 *
 * A child may be a bare ref string or an object carrying `$ref` / `ref`.
 * Nullish children yield undefined instead of throwing, so one malformed
 * `children` entry does not abort traversal of the whole document.
 *
 * @param {string|object|null|undefined} value - Child entry.
 * @returns {*} The reference, or undefined when none is present.
 */
function getRef(value) {
  if (typeof value === "string") return value;
  return value?.$ref ?? value?.ref;
}
|
|
2679
|
+
/**
 * Read the page number of a provenance entry.
 *
 * @param {object} prov - Provenance entry.
 * @returns {*} First non-nullish of `page_no`, `pageNo`, `page`.
 */
function getPageNumber(prov) {
  const { page_no: snakePage, pageNo: camelPage, page } = prov;
  return snakePage ?? camelPage ?? page;
}
|
|
2682
|
+
/**
 * Convert a provenance bounding box into `{ page, x, y, width, height }`.
 *
 * The box may be given as x/y/width/height or as edges
 * (l/t/r/b or left/top/right/bottom). When width/height are derived from
 * edges they are taken as absolute distances: with a bottom-left coordinate
 * origin the top edge has the larger y value, so `bottom - top` would
 * otherwise produce a negative height.
 *
 * NOTE(review): x/y are passed through unchanged; no conversion between
 * coordinate origins is attempted here.
 *
 * @param {object} prov - Provenance entry with a page number and `bbox`.
 * @returns {{page: number, x: number, y: number, width: number, height: number}|undefined}
 *   The box, or undefined when page, position or size cannot be determined.
 */
function toSourceSpanBBox(prov) {
  const page = getPageNumber(prov);
  const bbox = asRecord(prov.bbox);
  if (!page || !bbox) return void 0;

  const x = firstNumber2([bbox.x, bbox.l, bbox.left]);
  const y = firstNumber2([bbox.y, bbox.t, bbox.top]);
  if (x == null || y == null) return void 0;

  const right = firstNumber2([bbox.r, bbox.right]);
  const bottom = firstNumber2([bbox.b, bbox.bottom]);
  // Explicit width/height win; otherwise derive a non-negative extent from the edges.
  const width = firstNumber2([bbox.width]) ?? (right != null ? Math.abs(right - x) : void 0);
  const height = firstNumber2([bbox.height]) ?? (bottom != null ? Math.abs(bottom - y) : void 0);
  if (width == null || height == null) return void 0;

  return { page, x, y, width, height };
}
|
|
2698
|
+
/**
 * Clamp a page number into the range 1..pageCount.
 *
 * @param {number} page - Requested page.
 * @param {number} pageCount - Highest valid page.
 * @returns {number} The clamped page; the lower bound of 1 wins when pageCount < 1.
 */
function clampPage(page, pageCount) {
  const capped = Math.min(pageCount, page);
  return Math.max(1, capped);
}
|
|
2701
|
+
/**
 * Append a text block to accumulated text, separated by one blank line.
 *
 * @param {string|undefined} existing - Text gathered so far.
 * @param {string} next - Text to add.
 * @returns {string} `next` alone when nothing was gathered yet, otherwise both
 *   joined by "\n\n".
 */
function appendText(existing, next) {
  if (!existing) return next;
  return `${existing}\n\n${next}`;
}
|
|
2706
|
+
/**
 * Narrow a value to a non-array object.
 *
 * @param {unknown} value - Any value.
 * @returns {object|undefined} `value` itself when it is a truthy, non-array
 *   object; otherwise undefined.
 */
function asRecord(value) {
  const isRecord = Boolean(value) && typeof value === "object" && !Array.isArray(value);
  return isRecord ? value : void 0;
}
|
|
2709
|
+
/**
 * Return the first candidate that is a non-blank string, trimmed.
 *
 * @param {Iterable<unknown>} values - Candidates in priority order.
 * @returns {string} The trimmed string, or "" when no candidate qualifies.
 */
function firstString(values) {
  for (const candidate of values) {
    if (typeof candidate !== "string") continue;
    const trimmed = candidate.trim();
    if (trimmed) return trimmed;
  }
  return "";
}
|
|
2715
|
+
/**
 * Return the first candidate that is a finite number.
 *
 * @param {Iterable<unknown>} values - Candidates in priority order.
 * @returns {number|undefined} The number, or undefined when none qualifies.
 */
function firstNumber2(values) {
  for (const candidate of values) {
    // Number.isFinite is false for every non-number, so no separate typeof check is needed.
    if (Number.isFinite(candidate)) return candidate;
  }
  return void 0;
}
|
|
2721
|
+
|
|
2474
2722
|
// src/extraction/extractor.ts
|
|
2475
2723
|
function sourceSpansForPageRange(providerOptions, startPage, endPage) {
|
|
2476
2724
|
const sourceSpans = providerOptions?.sourceSpans;
|
|
@@ -2512,20 +2760,38 @@ async function runExtractor(params) {
|
|
|
2512
2760
|
generateObject,
|
|
2513
2761
|
convertPdfToImages,
|
|
2514
2762
|
maxTokens = 4096,
|
|
2763
|
+
taskKind,
|
|
2764
|
+
budgetDiagnostics,
|
|
2515
2765
|
providerOptions,
|
|
2516
2766
|
pageRangeCache
|
|
2517
2767
|
} = params;
|
|
2518
2768
|
const extractorProviderOptions = { ...providerOptions };
|
|
2519
2769
|
let fullPrompt;
|
|
2520
|
-
|
|
2521
|
-
|
|
2522
|
-
|
|
2770
|
+
if (params.getPageRangeText) {
|
|
2771
|
+
const pageText = await params.getPageRangeText(startPage, endPage);
|
|
2772
|
+
extractorProviderOptions.doclingText = pageText;
|
|
2773
|
+
extractorProviderOptions.doclingPageRange = { startPage, endPage };
|
|
2774
|
+
fullPrompt = `${prompt}
|
|
2775
|
+
|
|
2776
|
+
[Document pages ${startPage}-${endPage} are provided below as Docling-extracted text.]
|
|
2777
|
+
|
|
2778
|
+
${pageText || "(No Docling text was available for this page range.)"}`;
|
|
2779
|
+
} else if (convertPdfToImages) {
|
|
2780
|
+
if (!pdfInput) {
|
|
2781
|
+
throw new Error("pdfInput is required when extracting page images.");
|
|
2782
|
+
}
|
|
2783
|
+
const needsPdfBase64 = !params.getPageImages;
|
|
2784
|
+
const pdfBase64 = needsPdfBase64 ? await pdfInputToBase64(pdfInput) : void 0;
|
|
2523
2785
|
const images = params.getPageImages ? await params.getPageImages(startPage, endPage) : await convertPdfToImages(pdfBase64, startPage, endPage);
|
|
2524
2786
|
extractorProviderOptions.images = images;
|
|
2525
2787
|
fullPrompt = `${prompt}
|
|
2526
2788
|
|
|
2527
2789
|
[Document pages ${startPage}-${endPage} are provided as images.]`;
|
|
2528
2790
|
} else {
|
|
2791
|
+
if (!pdfInput) {
|
|
2792
|
+
throw new Error("pdfInput is required when extracting page PDFs.");
|
|
2793
|
+
}
|
|
2794
|
+
const pdfBase64 = params.getPageRangePdf ? void 0 : await pdfInputToBase64(pdfInput);
|
|
2529
2795
|
const cacheKey = `${startPage}-${endPage}`;
|
|
2530
2796
|
const cachedPagesPdf = pageRangeCache?.get(cacheKey);
|
|
2531
2797
|
const pagesPdf = cachedPagesPdf ?? (params.getPageRangePdf ? await params.getPageRangePdf(startPage, endPage) : await extractPageRange(pdfBase64, startPage, endPage));
|
|
@@ -2545,6 +2811,8 @@ async function runExtractor(params) {
|
|
|
2545
2811
|
prompt: fullPrompt,
|
|
2546
2812
|
schema: strictSchema,
|
|
2547
2813
|
maxTokens,
|
|
2814
|
+
taskKind,
|
|
2815
|
+
budgetDiagnostics,
|
|
2548
2816
|
providerOptions: extractorProviderOptions
|
|
2549
2817
|
})
|
|
2550
2818
|
);
|
|
@@ -3524,6 +3792,8 @@ async function formatDocumentContent(doc, generateText, options) {
|
|
|
3524
3792
|
() => generateText({
|
|
3525
3793
|
prompt,
|
|
3526
3794
|
maxTokens: options?.maxTokens ?? 16384,
|
|
3795
|
+
taskKind: options?.taskKind,
|
|
3796
|
+
budgetDiagnostics: options?.budgetDiagnostics,
|
|
3527
3797
|
providerOptions: options?.providerOptions
|
|
3528
3798
|
})
|
|
3529
3799
|
);
|
|
@@ -3561,7 +3831,7 @@ function formatAddress(addr) {
|
|
|
3561
3831
|
function asRecordArray(value) {
|
|
3562
3832
|
return Array.isArray(value) ? value.filter((item) => Boolean(item) && typeof item === "object" && !Array.isArray(item)) : [];
|
|
3563
3833
|
}
|
|
3564
|
-
function
|
|
3834
|
+
function firstString2(item, keys) {
|
|
3565
3835
|
for (const key of keys) {
|
|
3566
3836
|
const value = item[key];
|
|
3567
3837
|
if (typeof value === "string" && value.trim()) return value;
|
|
@@ -3918,32 +4188,32 @@ ${exc.content}`.trim(), {
|
|
|
3918
4188
|
);
|
|
3919
4189
|
});
|
|
3920
4190
|
asRecordArray(extendedDoc.definitions).forEach((definition, i) => {
|
|
3921
|
-
const term =
|
|
3922
|
-
const body =
|
|
4191
|
+
const term = firstString2(definition, ["term", "name", "title"]) ?? `Definition ${i + 1}`;
|
|
4192
|
+
const body = firstString2(definition, ["definition", "content", "text", "meaning"]);
|
|
3923
4193
|
pushChunk(
|
|
3924
4194
|
`definition:${i}`,
|
|
3925
4195
|
"definition",
|
|
3926
4196
|
lines([
|
|
3927
4197
|
`Definition: ${term}`,
|
|
3928
4198
|
body,
|
|
3929
|
-
|
|
4199
|
+
firstString2(definition, ["originalContent", "source"]) ? `Source: ${firstString2(definition, ["originalContent", "source"])}` : null
|
|
3930
4200
|
]),
|
|
3931
4201
|
{
|
|
3932
4202
|
term,
|
|
3933
|
-
formNumber:
|
|
3934
|
-
formTitle:
|
|
4203
|
+
formNumber: firstString2(definition, ["formNumber"]),
|
|
4204
|
+
formTitle: firstString2(definition, ["formTitle"]),
|
|
3935
4205
|
pageNumber: typeof definition.pageNumber === "number" ? definition.pageNumber : void 0,
|
|
3936
|
-
sectionRef:
|
|
4206
|
+
sectionRef: firstString2(definition, ["sectionRef", "sectionTitle"]),
|
|
3937
4207
|
documentType: doc.type
|
|
3938
4208
|
}
|
|
3939
4209
|
);
|
|
3940
4210
|
});
|
|
3941
4211
|
const coveredReasons = asRecordArray(extendedDoc.coveredReasons ?? extendedDoc.covered_reasons);
|
|
3942
4212
|
coveredReasons.forEach((coveredReason, i) => {
|
|
3943
|
-
const title =
|
|
3944
|
-
const coverageName =
|
|
3945
|
-
const reasonNumber =
|
|
3946
|
-
const body =
|
|
4213
|
+
const title = firstString2(coveredReason, ["title", "name", "reason", "peril", "cause"]) ?? `Covered Reason ${i + 1}`;
|
|
4214
|
+
const coverageName = firstString2(coveredReason, ["coverageName", "coverage", "coveragePart"]);
|
|
4215
|
+
const reasonNumber = firstString2(coveredReason, ["reasonNumber", "number"]);
|
|
4216
|
+
const body = firstString2(coveredReason, ["content", "description", "text", "coverageGrant"]);
|
|
3947
4217
|
pushChunk(
|
|
3948
4218
|
`covered_reason:${i}`,
|
|
3949
4219
|
"covered_reason",
|
|
@@ -3952,16 +4222,16 @@ ${exc.content}`.trim(), {
|
|
|
3952
4222
|
reasonNumber ? `Reason Number: ${reasonNumber}` : null,
|
|
3953
4223
|
`Covered Reason: ${title}`,
|
|
3954
4224
|
body,
|
|
3955
|
-
|
|
4225
|
+
firstString2(coveredReason, ["originalContent", "source"]) ? `Source: ${firstString2(coveredReason, ["originalContent", "source"])}` : null
|
|
3956
4226
|
]),
|
|
3957
4227
|
{
|
|
3958
4228
|
coverageName,
|
|
3959
4229
|
reasonNumber,
|
|
3960
4230
|
title,
|
|
3961
|
-
formNumber:
|
|
3962
|
-
formTitle:
|
|
4231
|
+
formNumber: firstString2(coveredReason, ["formNumber"]),
|
|
4232
|
+
formTitle: firstString2(coveredReason, ["formTitle"]),
|
|
3963
4233
|
pageNumber: typeof coveredReason.pageNumber === "number" ? coveredReason.pageNumber : void 0,
|
|
3964
|
-
sectionRef:
|
|
4234
|
+
sectionRef: firstString2(coveredReason, ["sectionRef", "sectionTitle"]),
|
|
3965
4235
|
documentType: doc.type
|
|
3966
4236
|
}
|
|
3967
4237
|
);
|
|
@@ -3981,10 +4251,10 @@ ${exc.content}`.trim(), {
|
|
|
3981
4251
|
reasonNumber,
|
|
3982
4252
|
title,
|
|
3983
4253
|
conditionIndex,
|
|
3984
|
-
formNumber:
|
|
3985
|
-
formTitle:
|
|
4254
|
+
formNumber: firstString2(coveredReason, ["formNumber"]),
|
|
4255
|
+
formTitle: firstString2(coveredReason, ["formTitle"]),
|
|
3986
4256
|
pageNumber: typeof coveredReason.pageNumber === "number" ? coveredReason.pageNumber : void 0,
|
|
3987
|
-
sectionRef:
|
|
4257
|
+
sectionRef: firstString2(coveredReason, ["sectionRef", "sectionTitle"]),
|
|
3988
4258
|
documentType: doc.type
|
|
3989
4259
|
}
|
|
3990
4260
|
);
|
|
@@ -6455,21 +6725,21 @@ Return JSON only.`;
|
|
|
6455
6725
|
}
|
|
6456
6726
|
|
|
6457
6727
|
// src/prompts/extractors/index.ts
|
|
6458
|
-
function
|
|
6728
|
+
function asRecord2(data) {
|
|
6459
6729
|
return data && typeof data === "object" ? data : void 0;
|
|
6460
6730
|
}
|
|
6461
6731
|
function getSections2(data) {
|
|
6462
|
-
const sections =
|
|
6732
|
+
const sections = asRecord2(data)?.sections;
|
|
6463
6733
|
return Array.isArray(sections) ? sections : [];
|
|
6464
6734
|
}
|
|
6465
6735
|
function isCoveredReasonsEmpty(data) {
|
|
6466
|
-
const record =
|
|
6736
|
+
const record = asRecord2(data);
|
|
6467
6737
|
if (!record) return true;
|
|
6468
6738
|
const coveredReasons = Array.isArray(record.coveredReasons) ? record.coveredReasons : Array.isArray(record.covered_reasons) ? record.covered_reasons : [];
|
|
6469
6739
|
return coveredReasons.length === 0;
|
|
6470
6740
|
}
|
|
6471
6741
|
function isDefinitionsEmpty(data) {
|
|
6472
|
-
const definitions =
|
|
6742
|
+
const definitions = asRecord2(data)?.definitions;
|
|
6473
6743
|
return !Array.isArray(definitions) || definitions.length === 0;
|
|
6474
6744
|
}
|
|
6475
6745
|
function sectionLooksLikeCoveredReason(section) {
|
|
@@ -6703,6 +6973,14 @@ function decideReferentialResolutionAction(params) {
|
|
|
6703
6973
|
}
|
|
6704
6974
|
|
|
6705
6975
|
// src/extraction/resolve-referential.ts
|
|
6976
|
+
/**
 * Build the prompt suffix that carries Docling text.
 *
 * @param {object} [providerOptions] - Provider options; only `doclingText` is read.
 * @returns {string} A blank line, the "DOCLING DOCUMENT TEXT:" heading and the
 *   text, or "" when `doclingText` is not a non-blank string.
 */
function formatDoclingTextContext(providerOptions) {
  const doclingText = providerOptions?.doclingText;
  const hasText = typeof doclingText === "string" && doclingText.trim() !== "";
  return hasText ? `\n\nDOCLING DOCUMENT TEXT:\n${doclingText}` : "";
}
|
|
6706
6984
|
function parseReferenceTarget(text) {
|
|
6707
6985
|
if (typeof text !== "string") return void 0;
|
|
6708
6986
|
const normalized = text.trim();
|
|
@@ -6784,10 +7062,12 @@ Return the page range (1-indexed) where this section is located. If the section
|
|
|
6784
7062
|
|
|
6785
7063
|
If you cannot find the section, return startPage: 0 and endPage: 0.
|
|
6786
7064
|
|
|
6787
|
-
Return JSON only
|
|
7065
|
+
Return JSON only.${formatDoclingTextContext(providerOptions)}`,
|
|
6788
7066
|
schema: PageLocationSchema,
|
|
6789
7067
|
maxTokens: budget.maxTokens,
|
|
6790
|
-
|
|
7068
|
+
taskKind: "extraction_referential_lookup",
|
|
7069
|
+
budgetDiagnostics: budget,
|
|
7070
|
+
providerOptions: pdfInput ? await buildPdfProviderOptions(pdfInput, providerOptions) : providerOptions
|
|
6791
7071
|
},
|
|
6792
7072
|
{
|
|
6793
7073
|
fallback: { startPage: 0, endPage: 0 },
|
|
@@ -6821,6 +7101,7 @@ async function resolveReferentialCoverages(params) {
|
|
|
6821
7101
|
convertPdfToImages,
|
|
6822
7102
|
getPageRangePdf,
|
|
6823
7103
|
getPageImages,
|
|
7104
|
+
getPageRangeText,
|
|
6824
7105
|
concurrency = 2,
|
|
6825
7106
|
providerOptions,
|
|
6826
7107
|
modelCapabilities,
|
|
@@ -6932,7 +7213,10 @@ async function resolveReferentialCoverages(params) {
|
|
|
6932
7213
|
convertPdfToImages,
|
|
6933
7214
|
getPageRangePdf,
|
|
6934
7215
|
getPageImages,
|
|
7216
|
+
getPageRangeText,
|
|
6935
7217
|
maxTokens: budget.maxTokens,
|
|
7218
|
+
taskKind: "extraction_referential_lookup",
|
|
7219
|
+
budgetDiagnostics: budget,
|
|
6936
7220
|
providerOptions
|
|
6937
7221
|
});
|
|
6938
7222
|
trackUsage(result.usage);
|
|
@@ -7025,6 +7309,7 @@ async function runFocusedExtractorWithFallback(params) {
|
|
|
7025
7309
|
pageRangeCache,
|
|
7026
7310
|
getPageRangePdf,
|
|
7027
7311
|
getPageImages,
|
|
7312
|
+
getPageRangeText,
|
|
7028
7313
|
trackUsage,
|
|
7029
7314
|
resolveBudget,
|
|
7030
7315
|
log
|
|
@@ -7049,10 +7334,13 @@ async function runFocusedExtractorWithFallback(params) {
|
|
|
7049
7334
|
generateObject,
|
|
7050
7335
|
convertPdfToImages,
|
|
7051
7336
|
maxTokens: budget.maxTokens,
|
|
7337
|
+
taskKind,
|
|
7338
|
+
budgetDiagnostics: budget,
|
|
7052
7339
|
providerOptions,
|
|
7053
7340
|
pageRangeCache,
|
|
7054
7341
|
getPageRangePdf,
|
|
7055
|
-
getPageImages
|
|
7342
|
+
getPageImages,
|
|
7343
|
+
getPageRangeText
|
|
7056
7344
|
});
|
|
7057
7345
|
trackUsage(result.usage, {
|
|
7058
7346
|
taskKind,
|
|
@@ -7092,10 +7380,13 @@ async function runFocusedExtractorWithFallback(params) {
|
|
|
7092
7380
|
generateObject,
|
|
7093
7381
|
convertPdfToImages,
|
|
7094
7382
|
maxTokens: budget.maxTokens,
|
|
7383
|
+
taskKind,
|
|
7384
|
+
budgetDiagnostics: budget,
|
|
7095
7385
|
providerOptions,
|
|
7096
7386
|
pageRangeCache,
|
|
7097
7387
|
getPageRangePdf,
|
|
7098
|
-
getPageImages
|
|
7388
|
+
getPageImages,
|
|
7389
|
+
getPageRangeText
|
|
7099
7390
|
});
|
|
7100
7391
|
trackUsage(fallbackResult.usage, {
|
|
7101
7392
|
taskKind,
|
|
@@ -7939,7 +8230,7 @@ function createExtractor(config) {
|
|
|
7939
8230
|
}
|
|
7940
8231
|
return lines.length > 0 ? lines.join("\n") : "";
|
|
7941
8232
|
}
|
|
7942
|
-
async function runFocusedExtractorTask(task, pdfInput, memory, pageRangeCache, getPageRangePdf, getPageImages) {
|
|
8233
|
+
async function runFocusedExtractorTask(task, pdfInput, memory, pageRangeCache, getPageRangePdf, getPageImages, getPageRangeText) {
|
|
7943
8234
|
if (task.extractorName === "supplementary") {
|
|
7944
8235
|
const alreadyExtractedSummary = buildAlreadyExtractedSummary(memory);
|
|
7945
8236
|
const budget = resolveBudget("extraction_focused", 4096);
|
|
@@ -7954,10 +8245,13 @@ function createExtractor(config) {
|
|
|
7954
8245
|
generateObject,
|
|
7955
8246
|
convertPdfToImages,
|
|
7956
8247
|
maxTokens: budget.maxTokens,
|
|
8248
|
+
taskKind: "extraction_focused",
|
|
8249
|
+
budgetDiagnostics: budget,
|
|
7957
8250
|
providerOptions: activeProviderOptions,
|
|
7958
8251
|
pageRangeCache,
|
|
7959
8252
|
getPageRangePdf,
|
|
7960
|
-
getPageImages
|
|
8253
|
+
getPageImages,
|
|
8254
|
+
getPageRangeText
|
|
7961
8255
|
});
|
|
7962
8256
|
trackUsage(result.usage, {
|
|
7963
8257
|
taskKind: "extraction_focused",
|
|
@@ -7976,6 +8270,7 @@ function createExtractor(config) {
|
|
|
7976
8270
|
pageRangeCache,
|
|
7977
8271
|
getPageRangePdf,
|
|
7978
8272
|
getPageImages,
|
|
8273
|
+
getPageRangeText,
|
|
7979
8274
|
trackUsage,
|
|
7980
8275
|
resolveBudget,
|
|
7981
8276
|
log
|
|
@@ -7991,8 +8286,14 @@ function createExtractor(config) {
|
|
|
7991
8286
|
if (extractorPages.size === 0) return "No page assignments available.";
|
|
7992
8287
|
return [...extractorPages.entries()].map(([extractorName, pages]) => `${extractorName}: ${pages.length} page(s), pages ${pages.join(", ")}`).join("\n");
|
|
7993
8288
|
}
|
|
7994
|
-
async function extract(
|
|
8289
|
+
async function extract(input, documentId, options) {
|
|
7995
8290
|
const id = documentId ?? `doc-${Date.now()}`;
|
|
8291
|
+
const isDoclingInput = isDoclingExtractionInput(input);
|
|
8292
|
+
const pdfInput = isDoclingInput ? void 0 : input;
|
|
8293
|
+
const doclingDocument = isDoclingInput ? normalizeDoclingDocument(input.document, {
|
|
8294
|
+
documentId: id,
|
|
8295
|
+
sourceKind: input.sourceKind
|
|
8296
|
+
}) : void 0;
|
|
7996
8297
|
const memory = /* @__PURE__ */ new Map();
|
|
7997
8298
|
totalUsage = { inputTokens: 0, outputTokens: 0 };
|
|
7998
8299
|
modelCalls = 0;
|
|
@@ -8002,7 +8303,10 @@ function createExtractor(config) {
|
|
|
8002
8303
|
modelCalls: [],
|
|
8003
8304
|
totalModelCallDurationMs: 0
|
|
8004
8305
|
};
|
|
8005
|
-
const sourceSpans =
|
|
8306
|
+
const sourceSpans = mergeSourceSpans([
|
|
8307
|
+
...doclingDocument?.sourceSpans ?? [],
|
|
8308
|
+
...options?.sourceSpans ?? []
|
|
8309
|
+
]);
|
|
8006
8310
|
const sourceChunks = sourceSpans.length ? chunkSourceSpans(sourceSpans) : [];
|
|
8007
8311
|
activeProviderOptions = sourceSpans.length ? { ...providerOptions, sourceSpans, sourceChunks } : providerOptions;
|
|
8008
8312
|
if (sourceStore && sourceSpans.length > 0) {
|
|
@@ -8031,24 +8335,40 @@ function createExtractor(config) {
|
|
|
8031
8335
|
let fullPdfProviderOptionsPromise;
|
|
8032
8336
|
let pageCountPromise;
|
|
8033
8337
|
async function getPdfBase64ForExtraction() {
|
|
8338
|
+
if (!pdfInput) {
|
|
8339
|
+
throw new Error("PDF input is not available for Docling extraction.");
|
|
8340
|
+
}
|
|
8034
8341
|
if (pdfBase64Cache === void 0) {
|
|
8035
8342
|
pdfBase64Cache = await pdfInputToBase64(pdfInput);
|
|
8036
8343
|
}
|
|
8037
8344
|
return pdfBase64Cache;
|
|
8038
8345
|
}
|
|
8039
8346
|
async function getCachedPageCount() {
|
|
8347
|
+
if (doclingDocument) return doclingDocument.pageCount;
|
|
8348
|
+
if (!pdfInput) {
|
|
8349
|
+
throw new Error("PDF input is required to read page count.");
|
|
8350
|
+
}
|
|
8040
8351
|
if (!pageCountPromise) {
|
|
8041
8352
|
pageCountPromise = getPdfSlicer().then((slicer) => slicer.getPageCount()).catch(() => getPdfPageCount(pdfInput));
|
|
8042
8353
|
}
|
|
8043
8354
|
return pageCountPromise;
|
|
8044
8355
|
}
|
|
8045
|
-
async function
|
|
8356
|
+
async function getFullDocumentProviderOptions() {
|
|
8357
|
+
if (doclingDocument) {
|
|
8358
|
+
return buildDoclingProviderOptions(doclingDocument, activeProviderOptions);
|
|
8359
|
+
}
|
|
8360
|
+
if (!pdfInput) {
|
|
8361
|
+
return activeProviderOptions ?? {};
|
|
8362
|
+
}
|
|
8046
8363
|
if (!fullPdfProviderOptionsPromise) {
|
|
8047
8364
|
fullPdfProviderOptionsPromise = buildPdfProviderOptions(pdfInput, activeProviderOptions);
|
|
8048
8365
|
}
|
|
8049
8366
|
return fullPdfProviderOptionsPromise;
|
|
8050
8367
|
}
|
|
8051
8368
|
async function getPdfSlicer() {
|
|
8369
|
+
if (!pdfInput) {
|
|
8370
|
+
throw new Error("PDF input is not available for Docling extraction.");
|
|
8371
|
+
}
|
|
8052
8372
|
if (!pdfSlicerPromise) {
|
|
8053
8373
|
pdfSlicerPromise = createPdfPageSlicer(pdfInput);
|
|
8054
8374
|
}
|
|
@@ -8087,6 +8407,23 @@ function createExtractor(config) {
|
|
|
8087
8407
|
pageRangeImageCache.set(cacheKey, promise);
|
|
8088
8408
|
return promise;
|
|
8089
8409
|
}
|
|
8410
|
+
async function getPageRangeText(startPage, endPage) {
|
|
8411
|
+
return doclingDocument ? getDoclingPageRangeText(doclingDocument, startPage, endPage) : "";
|
|
8412
|
+
}
|
|
8413
|
+
function withFullDocumentTextContext(prompt) {
|
|
8414
|
+
if (!doclingDocument) return prompt;
|
|
8415
|
+
return `${prompt}
|
|
8416
|
+
|
|
8417
|
+
DOCLING DOCUMENT TEXT:
|
|
8418
|
+
${doclingDocument.fullText}`;
|
|
8419
|
+
}
|
|
8420
|
+
function withPageRangeTextContext(prompt, startPage, endPage, pageText) {
|
|
8421
|
+
if (!doclingDocument) return prompt;
|
|
8422
|
+
return `${prompt}
|
|
8423
|
+
|
|
8424
|
+
DOCLING DOCUMENT PAGES ${startPage}-${endPage}:
|
|
8425
|
+
${pageText || "(No Docling text was available for this page range.)"}`;
|
|
8426
|
+
}
|
|
8090
8427
|
let classifyResult;
|
|
8091
8428
|
if (resumed?.classifyResult && pipelineCtx.isPhaseComplete("classify")) {
|
|
8092
8429
|
classifyResult = resumed.classifyResult;
|
|
@@ -8099,10 +8436,12 @@ function createExtractor(config) {
|
|
|
8099
8436
|
const classifyResponse = await safeGenerateObject(
|
|
8100
8437
|
generateObject,
|
|
8101
8438
|
{
|
|
8102
|
-
prompt: buildClassifyPrompt(),
|
|
8439
|
+
prompt: withFullDocumentTextContext(buildClassifyPrompt()),
|
|
8103
8440
|
schema: ClassifyResultSchema,
|
|
8104
8441
|
maxTokens: budget.maxTokens,
|
|
8105
|
-
|
|
8442
|
+
taskKind: "extraction_classify",
|
|
8443
|
+
budgetDiagnostics: budget,
|
|
8444
|
+
providerOptions: await getFullDocumentProviderOptions()
|
|
8106
8445
|
},
|
|
8107
8446
|
{
|
|
8108
8447
|
fallback: { documentType: "policy", policyTypes: ["other"], confidence: 0 },
|
|
@@ -8147,10 +8486,12 @@ function createExtractor(config) {
|
|
|
8147
8486
|
const formInventoryResponse = await safeGenerateObject(
|
|
8148
8487
|
generateObject,
|
|
8149
8488
|
{
|
|
8150
|
-
prompt: buildFormInventoryPrompt(templateHints),
|
|
8489
|
+
prompt: withFullDocumentTextContext(buildFormInventoryPrompt(templateHints)),
|
|
8151
8490
|
schema: FormInventorySchema,
|
|
8152
8491
|
maxTokens: budget.maxTokens,
|
|
8153
|
-
|
|
8492
|
+
taskKind: "extraction_form_inventory",
|
|
8493
|
+
budgetDiagnostics: budget,
|
|
8494
|
+
providerOptions: await getFullDocumentProviderOptions()
|
|
8154
8495
|
},
|
|
8155
8496
|
{
|
|
8156
8497
|
fallback: { forms: [] },
|
|
@@ -8193,16 +8534,24 @@ function createExtractor(config) {
|
|
|
8193
8534
|
const pageMapResults = await Promise.all(
|
|
8194
8535
|
pageMapChunks.map(
|
|
8195
8536
|
({ startPage, endPage }) => pageMapLimit(async () => {
|
|
8196
|
-
const pagesPdf = await getPageRangePdf(startPage, endPage);
|
|
8537
|
+
const pagesPdf = doclingDocument ? void 0 : await getPageRangePdf(startPage, endPage);
|
|
8538
|
+
const pagesText = doclingDocument ? await getPageRangeText(startPage, endPage) : "";
|
|
8197
8539
|
const budget = resolveBudget("extraction_page_map", 2048);
|
|
8198
8540
|
const startedAt = Date.now();
|
|
8199
8541
|
const mapResponse = await safeGenerateObject(
|
|
8200
8542
|
generateObject,
|
|
8201
8543
|
{
|
|
8202
|
-
prompt:
|
|
8544
|
+
prompt: withPageRangeTextContext(
|
|
8545
|
+
buildPageMapPrompt(templateHints, startPage, endPage, formInventoryHint),
|
|
8546
|
+
startPage,
|
|
8547
|
+
endPage,
|
|
8548
|
+
pagesText
|
|
8549
|
+
),
|
|
8203
8550
|
schema: PageMapChunkSchema,
|
|
8204
8551
|
maxTokens: budget.maxTokens,
|
|
8205
|
-
|
|
8552
|
+
taskKind: "extraction_page_map",
|
|
8553
|
+
budgetDiagnostics: budget,
|
|
8554
|
+
providerOptions: doclingDocument ? { ...activeProviderOptions, doclingText: pagesText, doclingPageRange: { startPage, endPage } } : { ...activeProviderOptions, pdfBase64: pagesPdf }
|
|
8206
8555
|
},
|
|
8207
8556
|
{
|
|
8208
8557
|
fallback: {
|
|
@@ -8280,7 +8629,7 @@ function createExtractor(config) {
|
|
|
8280
8629
|
}))
|
|
8281
8630
|
];
|
|
8282
8631
|
onProgress?.(`Dispatching ${tasks.length} extractors...`);
|
|
8283
|
-
const extractionPdfInput = await getPdfBase64ForExtraction();
|
|
8632
|
+
const extractionPdfInput = doclingDocument ? void 0 : await getPdfBase64ForExtraction();
|
|
8284
8633
|
const extractorResults = await Promise.all(
|
|
8285
8634
|
tasks.map(
|
|
8286
8635
|
(task) => extractorLimit(async () => {
|
|
@@ -8291,7 +8640,8 @@ function createExtractor(config) {
|
|
|
8291
8640
|
memory,
|
|
8292
8641
|
completedPageRangePdfCache,
|
|
8293
8642
|
getPageRangePdf,
|
|
8294
|
-
convertPdfToImages ? getPageImages : void 0
|
|
8643
|
+
convertPdfToImages ? getPageImages : void 0,
|
|
8644
|
+
doclingDocument ? getPageRangeText : void 0
|
|
8295
8645
|
);
|
|
8296
8646
|
})
|
|
8297
8647
|
)
|
|
@@ -8318,10 +8668,13 @@ function createExtractor(config) {
|
|
|
8318
8668
|
generateObject,
|
|
8319
8669
|
convertPdfToImages,
|
|
8320
8670
|
maxTokens: budget.maxTokens,
|
|
8671
|
+
taskKind: "extraction_focused",
|
|
8672
|
+
budgetDiagnostics: budget,
|
|
8321
8673
|
providerOptions: activeProviderOptions,
|
|
8322
8674
|
pageRangeCache: completedPageRangePdfCache,
|
|
8323
8675
|
getPageRangePdf,
|
|
8324
|
-
getPageImages: convertPdfToImages ? getPageImages : void 0
|
|
8676
|
+
getPageImages: convertPdfToImages ? getPageImages : void 0,
|
|
8677
|
+
getPageRangeText: doclingDocument ? getPageRangeText : void 0
|
|
8325
8678
|
});
|
|
8326
8679
|
trackUsage(supplementaryResult.usage, {
|
|
8327
8680
|
taskKind: "extraction_focused",
|
|
@@ -8357,6 +8710,7 @@ function createExtractor(config) {
|
|
|
8357
8710
|
concurrency,
|
|
8358
8711
|
getPageRangePdf,
|
|
8359
8712
|
getPageImages: convertPdfToImages ? getPageImages : void 0,
|
|
8713
|
+
getPageRangeText: doclingDocument ? getPageRangeText : void 0,
|
|
8360
8714
|
providerOptions: activeProviderOptions,
|
|
8361
8715
|
modelCapabilities,
|
|
8362
8716
|
modelBudgetConstraints,
|
|
@@ -8405,13 +8759,22 @@ function createExtractor(config) {
|
|
|
8405
8759
|
const reviewResponse = await safeGenerateObject(
|
|
8406
8760
|
generateObject,
|
|
8407
8761
|
{
|
|
8408
|
-
prompt: buildReviewPrompt(template.required, extractedKeys, extractionSummary, pageMapSummary, extractorCatalog),
|
|
8762
|
+
prompt: withFullDocumentTextContext(buildReviewPrompt(template.required, extractedKeys, extractionSummary, pageMapSummary, extractorCatalog)),
|
|
8409
8763
|
schema: ReviewResultSchema,
|
|
8410
8764
|
maxTokens: budget.maxTokens,
|
|
8411
|
-
|
|
8765
|
+
taskKind: "extraction_review",
|
|
8766
|
+
budgetDiagnostics: budget,
|
|
8767
|
+
providerOptions: await getFullDocumentProviderOptions()
|
|
8412
8768
|
},
|
|
8413
8769
|
{
|
|
8414
|
-
fallback: {
|
|
8770
|
+
fallback: {
|
|
8771
|
+
complete: false,
|
|
8772
|
+
missingFields: ["llm_review_unavailable"],
|
|
8773
|
+
qualityIssues: [
|
|
8774
|
+
"LLM extraction review failed; deterministic review was used and the result needs review."
|
|
8775
|
+
],
|
|
8776
|
+
additionalTasks: []
|
|
8777
|
+
},
|
|
8415
8778
|
log,
|
|
8416
8779
|
onError: (err, attempt) => log?.(`Review round ${round + 1} attempt ${attempt + 1} failed: ${err}`)
|
|
8417
8780
|
}
|
|
@@ -8431,7 +8794,7 @@ function createExtractor(config) {
|
|
|
8431
8794
|
break;
|
|
8432
8795
|
}
|
|
8433
8796
|
onProgress?.(`Review round ${round + 1}: dispatching ${reviewResponse.object.additionalTasks.length} follow-up extractors...`);
|
|
8434
|
-
const extractionPdfInput = await getPdfBase64ForExtraction();
|
|
8797
|
+
const extractionPdfInput = doclingDocument ? void 0 : await getPdfBase64ForExtraction();
|
|
8435
8798
|
const followUpResults = await Promise.all(
|
|
8436
8799
|
reviewResponse.object.additionalTasks.map(
|
|
8437
8800
|
(task) => extractorLimit(async () => {
|
|
@@ -8441,7 +8804,8 @@ function createExtractor(config) {
|
|
|
8441
8804
|
memory,
|
|
8442
8805
|
completedPageRangePdfCache,
|
|
8443
8806
|
getPageRangePdf,
|
|
8444
|
-
convertPdfToImages ? getPageImages : void 0
|
|
8807
|
+
convertPdfToImages ? getPageImages : void 0,
|
|
8808
|
+
doclingDocument ? getPageRangeText : void 0
|
|
8445
8809
|
);
|
|
8446
8810
|
})
|
|
8447
8811
|
)
|
|
@@ -8512,6 +8876,8 @@ function createExtractor(config) {
|
|
|
8512
8876
|
prompt: buildSummaryPrompt(document),
|
|
8513
8877
|
schema: SummaryResultSchema,
|
|
8514
8878
|
maxTokens: budget.maxTokens,
|
|
8879
|
+
taskKind: "extraction_summary",
|
|
8880
|
+
budgetDiagnostics: budget,
|
|
8515
8881
|
providerOptions: activeProviderOptions
|
|
8516
8882
|
},
|
|
8517
8883
|
{
|
|
@@ -8539,6 +8905,8 @@ function createExtractor(config) {
|
|
|
8539
8905
|
const formatResult = await formatDocumentContent(document, generateText, {
|
|
8540
8906
|
providerOptions: activeProviderOptions,
|
|
8541
8907
|
maxTokens: formatBudget.maxTokens,
|
|
8908
|
+
taskKind: "extraction_format",
|
|
8909
|
+
budgetDiagnostics: formatBudget,
|
|
8542
8910
|
concurrency: formatConcurrency ?? concurrency,
|
|
8543
8911
|
onProgress,
|
|
8544
8912
|
log
|
|
@@ -8941,6 +9309,7 @@ async function classifyApplication(pdfContent, generateObject, providerOptions,
|
|
|
8941
9309
|
Analyze the attached insurance document. If text source units are provided in provider options, use them as supporting context. Do not infer from base64 text.`,
|
|
8942
9310
|
schema: ApplicationClassifyResultSchema,
|
|
8943
9311
|
maxTokens,
|
|
9312
|
+
taskKind: "application_classify",
|
|
8944
9313
|
providerOptions: {
|
|
8945
9314
|
...providerOptions,
|
|
8946
9315
|
pdfBase64: providerOptions?.pdfBase64 ?? pdfContent
|
|
@@ -9043,6 +9412,7 @@ Extract fields from the attached application PDF. Use provider-supplied source u
|
|
|
9043
9412
|
prompt,
|
|
9044
9413
|
schema: FieldExtractionResultSchema,
|
|
9045
9414
|
maxTokens,
|
|
9415
|
+
taskKind: "application_extract_fields",
|
|
9046
9416
|
providerOptions: {
|
|
9047
9417
|
...providerOptions,
|
|
9048
9418
|
pdfBase64: providerOptions?.pdfBase64 ?? pdfContent
|
|
@@ -9096,6 +9466,7 @@ async function autoFillFromContext(fields, orgContext, generateObject, providerO
|
|
|
9096
9466
|
prompt,
|
|
9097
9467
|
schema: AutoFillResultSchema,
|
|
9098
9468
|
maxTokens,
|
|
9469
|
+
taskKind: "application_auto_fill",
|
|
9099
9470
|
providerOptions
|
|
9100
9471
|
})
|
|
9101
9472
|
);
|
|
@@ -9166,6 +9537,7 @@ async function batchQuestions(unfilledFields, generateObject, providerOptions, m
|
|
|
9166
9537
|
prompt,
|
|
9167
9538
|
schema: QuestionBatchResultSchema,
|
|
9168
9539
|
maxTokens,
|
|
9540
|
+
taskKind: "application_batch",
|
|
9169
9541
|
providerOptions
|
|
9170
9542
|
})
|
|
9171
9543
|
);
|
|
@@ -9217,6 +9589,7 @@ async function classifyReplyIntent(fields, replyText, generateObject, providerOp
|
|
|
9217
9589
|
prompt,
|
|
9218
9590
|
schema: ReplyIntentSchema,
|
|
9219
9591
|
maxTokens,
|
|
9592
|
+
taskKind: "application_classify",
|
|
9220
9593
|
providerOptions
|
|
9221
9594
|
})
|
|
9222
9595
|
);
|
|
@@ -9276,6 +9649,7 @@ async function parseAnswers(fields, replyText, generateObject, providerOptions,
|
|
|
9276
9649
|
prompt,
|
|
9277
9650
|
schema: AnswerParsingResultSchema,
|
|
9278
9651
|
maxTokens,
|
|
9652
|
+
taskKind: "application_parse_answers",
|
|
9279
9653
|
providerOptions
|
|
9280
9654
|
})
|
|
9281
9655
|
);
|
|
@@ -9405,6 +9779,7 @@ async function fillFromLookup(requests, targetFields, availableData, generateObj
|
|
|
9405
9779
|
prompt,
|
|
9406
9780
|
schema: LookupFillResultSchema,
|
|
9407
9781
|
maxTokens,
|
|
9782
|
+
taskKind: "application_lookup",
|
|
9408
9783
|
providerOptions
|
|
9409
9784
|
})
|
|
9410
9785
|
);
|
|
@@ -9487,6 +9862,7 @@ async function generateBatchEmail(batchFields, batchIndex, totalBatches, opts, g
|
|
|
9487
9862
|
() => generateText({
|
|
9488
9863
|
prompt,
|
|
9489
9864
|
maxTokens,
|
|
9865
|
+
taskKind: "application_email",
|
|
9490
9866
|
providerOptions
|
|
9491
9867
|
})
|
|
9492
9868
|
);
|
|
@@ -10009,11 +10385,14 @@ function createApplicationPipeline(config) {
|
|
|
10009
10385
|
}
|
|
10010
10386
|
if (replyPlan.answerQuestion && intent.questionText) {
|
|
10011
10387
|
try {
|
|
10388
|
+
const budget = resolveBudget("application_email", 512);
|
|
10012
10389
|
const { text, usage } = await generateText({
|
|
10013
10390
|
prompt: `The user is filling out an insurance application and asked: "${intent.questionText}"
|
|
10014
10391
|
|
|
10015
10392
|
Provide a brief, helpful explanation (2-3 sentences). End with "Just reply with the answer when you're ready and I'll fill it in."`,
|
|
10016
|
-
maxTokens:
|
|
10393
|
+
maxTokens: budget.maxTokens,
|
|
10394
|
+
taskKind: "application_email",
|
|
10395
|
+
budgetDiagnostics: budget,
|
|
10017
10396
|
providerOptions
|
|
10018
10397
|
});
|
|
10019
10398
|
trackUsage(usage);
|
|
@@ -10138,6 +10517,7 @@ ${emailText}`;
|
|
|
10138
10517
|
if (!state) throw new Error(`Application ${applicationId} not found`);
|
|
10139
10518
|
const filledFields = state.fields.filter((f) => f.value);
|
|
10140
10519
|
const fieldSummary = filledFields.map((f) => `${f.section} > ${f.label}: ${f.value} (source: ${f.source ?? "unknown"})`).join("\n");
|
|
10520
|
+
const budget = resolveBudget("application_email", 4096);
|
|
10141
10521
|
const { text, usage } = await generateText({
|
|
10142
10522
|
prompt: `Format these filled insurance application fields as a clean confirmation summary for the user to review. Group by section, show each field as "Label: Value". End with a note asking them to confirm or request changes.
|
|
10143
10523
|
|
|
@@ -10145,7 +10525,9 @@ Application: ${state.title ?? "Insurance Application"}
|
|
|
10145
10525
|
|
|
10146
10526
|
Fields:
|
|
10147
10527
|
${fieldSummary}`,
|
|
10148
|
-
maxTokens:
|
|
10528
|
+
maxTokens: budget.maxTokens,
|
|
10529
|
+
taskKind: "application_email",
|
|
10530
|
+
budgetDiagnostics: budget,
|
|
10149
10531
|
providerOptions
|
|
10150
10532
|
});
|
|
10151
10533
|
trackUsage(usage);
|
|
@@ -10619,6 +11001,8 @@ ${e.text}`;
|
|
|
10619
11001
|
prompt,
|
|
10620
11002
|
schema: SubAnswerSchema,
|
|
10621
11003
|
maxTokens: budget.maxTokens,
|
|
11004
|
+
taskKind: "query_reason",
|
|
11005
|
+
budgetDiagnostics: budget,
|
|
10622
11006
|
providerOptions
|
|
10623
11007
|
})
|
|
10624
11008
|
);
|
|
@@ -10842,6 +11226,8 @@ async function verify(originalQuestion, subAnswers, allEvidence, config) {
|
|
|
10842
11226
|
prompt,
|
|
10843
11227
|
schema: VerifyResultSchema,
|
|
10844
11228
|
maxTokens: budget.maxTokens,
|
|
11229
|
+
taskKind: "query_verify",
|
|
11230
|
+
budgetDiagnostics: budget,
|
|
10845
11231
|
providerOptions
|
|
10846
11232
|
})
|
|
10847
11233
|
);
|
|
@@ -10984,6 +11370,8 @@ async function interpretAttachments(params) {
|
|
|
10984
11370
|
prompt,
|
|
10985
11371
|
schema: AttachmentInterpretationSchema,
|
|
10986
11372
|
maxTokens: budget.maxTokens,
|
|
11373
|
+
taskKind: "query_attachment",
|
|
11374
|
+
budgetDiagnostics: budget,
|
|
10987
11375
|
providerOptions: buildAttachmentProviderOptions(attachment, providerOptions)
|
|
10988
11376
|
},
|
|
10989
11377
|
{
|
|
@@ -11321,6 +11709,8 @@ function createQueryAgent(config) {
|
|
|
11321
11709
|
prompt,
|
|
11322
11710
|
schema: QueryClassifyResultSchema,
|
|
11323
11711
|
maxTokens: budget.maxTokens,
|
|
11712
|
+
taskKind: "query_classify",
|
|
11713
|
+
budgetDiagnostics: budget,
|
|
11324
11714
|
providerOptions
|
|
11325
11715
|
},
|
|
11326
11716
|
{
|
|
@@ -11372,6 +11762,8 @@ function createQueryAgent(config) {
|
|
|
11372
11762
|
prompt,
|
|
11373
11763
|
schema: QueryResultSchema,
|
|
11374
11764
|
maxTokens: budget.maxTokens,
|
|
11765
|
+
taskKind: "query_respond",
|
|
11766
|
+
budgetDiagnostics: budget,
|
|
11375
11767
|
providerOptions
|
|
11376
11768
|
},
|
|
11377
11769
|
{
|
|
@@ -11467,6 +11859,8 @@ function createPceAgent(config = {}) {
|
|
|
11467
11859
|
prompt: buildPceNormalizePrompt({ requestText: input.requestText, evidenceSources }),
|
|
11468
11860
|
schema: PceNormalizationResultSchema,
|
|
11469
11861
|
maxTokens: budget.maxTokens,
|
|
11862
|
+
taskKind: "pce_impact_analysis",
|
|
11863
|
+
budgetDiagnostics: budget,
|
|
11470
11864
|
providerOptions: config.providerOptions
|
|
11471
11865
|
},
|
|
11472
11866
|
{ fallback, maxRetries: 1, log: config.log }
|
|
@@ -11528,6 +11922,8 @@ function createPceAgent(config = {}) {
|
|
|
11528
11922
|
}),
|
|
11529
11923
|
schema: ReplyAnswersSchema,
|
|
11530
11924
|
maxTokens: budget.maxTokens,
|
|
11925
|
+
taskKind: "pce_reply_parse",
|
|
11926
|
+
budgetDiagnostics: budget,
|
|
11531
11927
|
providerOptions: config.providerOptions
|
|
11532
11928
|
},
|
|
11533
11929
|
{ fallback: { answers }, maxRetries: 1, log: config.log }
|
|
@@ -12408,6 +12804,7 @@ export {
|
|
|
12408
12804
|
buildConfirmationSummaryPrompt,
|
|
12409
12805
|
buildConversationMemoryGuidance,
|
|
12410
12806
|
buildCoverageGapPrompt,
|
|
12807
|
+
buildDoclingProviderOptions,
|
|
12411
12808
|
buildFieldExplanationPrompt,
|
|
12412
12809
|
buildFieldExtractionPrompt,
|
|
12413
12810
|
buildFlatPdfMappingPrompt,
|
|
@@ -12449,12 +12846,16 @@ export {
|
|
|
12449
12846
|
fillAcroForm,
|
|
12450
12847
|
generateNextMessage,
|
|
12451
12848
|
getAcroFormFields,
|
|
12849
|
+
getDoclingPageRangeText,
|
|
12452
12850
|
getExtractor,
|
|
12453
12851
|
getFileIdentifier,
|
|
12454
12852
|
getPdfPageCount,
|
|
12455
12853
|
getTemplate,
|
|
12854
|
+
isDoclingExtractionInput,
|
|
12456
12855
|
isFileReference,
|
|
12457
12856
|
mergeQuestionAnswers,
|
|
12857
|
+
mergeSourceSpans,
|
|
12858
|
+
normalizeDoclingDocument,
|
|
12458
12859
|
normalizeForMatch,
|
|
12459
12860
|
orderSourceEvidence,
|
|
12460
12861
|
overlayTextOnPdf,
|