@claritylabs/cl-sdk 1.0.3 → 1.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +22 -5
- package/dist/index.d.mts +82 -2
- package/dist/index.d.ts +82 -2
- package/dist/index.js +457 -51
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +452 -51
- package/dist/index.mjs.map +1 -1
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -256,6 +256,7 @@ __export(index_exports, {
|
|
|
256
256
|
buildConfirmationSummaryPrompt: () => buildConfirmationSummaryPrompt,
|
|
257
257
|
buildConversationMemoryGuidance: () => buildConversationMemoryGuidance,
|
|
258
258
|
buildCoverageGapPrompt: () => buildCoverageGapPrompt,
|
|
259
|
+
buildDoclingProviderOptions: () => buildDoclingProviderOptions,
|
|
259
260
|
buildFieldExplanationPrompt: () => buildFieldExplanationPrompt,
|
|
260
261
|
buildFieldExtractionPrompt: () => buildFieldExtractionPrompt,
|
|
261
262
|
buildFlatPdfMappingPrompt: () => buildFlatPdfMappingPrompt,
|
|
@@ -297,12 +298,16 @@ __export(index_exports, {
|
|
|
297
298
|
fillAcroForm: () => fillAcroForm,
|
|
298
299
|
generateNextMessage: () => generateNextMessage,
|
|
299
300
|
getAcroFormFields: () => getAcroFormFields,
|
|
301
|
+
getDoclingPageRangeText: () => getDoclingPageRangeText,
|
|
300
302
|
getExtractor: () => getExtractor,
|
|
301
303
|
getFileIdentifier: () => getFileIdentifier,
|
|
302
304
|
getPdfPageCount: () => getPdfPageCount,
|
|
303
305
|
getTemplate: () => getTemplate,
|
|
306
|
+
isDoclingExtractionInput: () => isDoclingExtractionInput,
|
|
304
307
|
isFileReference: () => isFileReference,
|
|
305
308
|
mergeQuestionAnswers: () => mergeQuestionAnswers,
|
|
309
|
+
mergeSourceSpans: () => mergeSourceSpans,
|
|
310
|
+
normalizeDoclingDocument: () => normalizeDoclingDocument,
|
|
306
311
|
normalizeForMatch: () => normalizeForMatch,
|
|
307
312
|
orderSourceEvidence: () => orderSourceEvidence,
|
|
308
313
|
overlayTextOnPdf: () => overlayTextOnPdf,
|
|
@@ -2794,6 +2799,254 @@ async function overlayTextOnPdf(pdfBytes, overlays) {
|
|
|
2794
2799
|
return await pdfDoc.save();
|
|
2795
2800
|
}
|
|
2796
2801
|
|
|
2802
|
+
// src/extraction/docling.ts
|
|
2803
|
+
/**
 * Reports whether `input` is a Docling extraction input: an object tagged
 * `kind: "docling_document"` that carries an object-valued `document`.
 *
 * @param {unknown} input - Candidate extraction input.
 * @returns {boolean} True only when both the tag and the payload check pass.
 */
function isDoclingExtractionInput(input) {
  if (!input || typeof input !== "object") return false;
  if (input.kind !== "docling_document") return false;
  const payload = input.document;
  return Boolean(payload) && typeof payload === "object";
}
|
|
2808
|
+
/**
 * Flattens a Docling document into ordered text units, per-page text, a
 * page-labelled full text, and one source span per unit.
 *
 * Items are taken in body order when the body yields any refs; otherwise the
 * fallback collection order is used. Each unit's text is attributed to its
 * starting page (page 1 when it has no page), clamped to the page count.
 *
 * @param {object} document - Docling document.
 * @param {{ documentId: *, sourceKind?: * }} options - Span identity; `sourceKind`
 *   defaults to "policy_pdf".
 * @returns {{ pageCount: number, fullText: string, pageTexts: Map, units: Array, sourceSpans: Array }}
 */
function normalizeDoclingDocument(document, options) {
  const itemMap = buildItemMap(document);
  const bodyRefs = getOrderedBodyRefs(document, itemMap);

  let entries;
  if (bodyRefs.length > 0) {
    entries = [];
    for (const ref of bodyRefs) {
      const entry = itemMap.get(ref);
      if (entry) entries.push(entry);
    }
  } else {
    entries = getFallbackOrderedItems(document, itemMap);
  }

  // Keep only items that normalize to non-blank text.
  const units = [];
  for (const entry of entries) {
    const unit = normalizeItem(entry.ref, entry.item);
    if (unit && unit.text.trim()) units.push(unit);
  }

  const pageCount = inferPageCount(document, units);
  const pageTexts = /* @__PURE__ */ new Map();
  for (const unit of units) {
    const page = clampPage(unit.pageStart ?? 1, pageCount);
    pageTexts.set(page, appendText(pageTexts.get(page), unit.text));
  }

  // Pages without text are omitted from the full text entirely.
  const pageBlocks = [];
  for (let pageNumber = 1; pageNumber <= pageCount; pageNumber++) {
    const text = pageTexts.get(pageNumber)?.trim();
    if (text) pageBlocks.push(`Page ${pageNumber}\n${text}`);
  }
  const fullText = pageBlocks.join("\n\n");

  const sourceKind = options.sourceKind ?? "policy_pdf";
  const sourceSpans = units.map((unit, index) => {
    const metadata = {
      sourceSystem: "docling",
      sourceUnit: "docling_item",
      doclingRef: unit.ref
    };
    if (unit.label) metadata.doclingLabel = unit.label;
    const span = buildSourceSpan(
      {
        documentId: options.documentId,
        sourceKind,
        text: unit.text,
        pageStart: unit.pageStart,
        pageEnd: unit.pageEnd,
        sectionId: unit.label,
        metadata
      },
      index
    );
    return {
      ...span,
      kind: "plain_text",
      bbox: unit.bboxes?.length ? unit.bboxes : void 0
    };
  });

  return { pageCount, fullText, pageTexts, units, sourceSpans };
}
|
|
2858
|
+
/**
 * Builds the page-labelled text for an inclusive page range of a normalized
 * Docling document. Both bounds are clamped to the document's page count, and
 * pages with no text are left out.
 *
 * @param {{ pageCount: number, pageTexts: Map<number, string> }} normalized
 * @param {number} startPage - First page of the range.
 * @param {number} endPage - Last page of the range.
 * @returns {string} "Page N" blocks joined by blank lines; "" when nothing matches.
 */
function getDoclingPageRangeText(normalized, startPage, endPage) {
  const { pageCount, pageTexts } = normalized;
  const firstPage = clampPage(startPage, pageCount);
  const lastPage = clampPage(endPage, pageCount);
  const sections = [];
  let page = firstPage;
  while (page <= lastPage) {
    const text = pageTexts.get(page)?.trim();
    if (text) {
      sections.push(`Page ${page}\n${text}`);
    }
    page++;
  }
  return sections.join("\n\n");
}
|
|
2871
|
+
/**
 * Returns a copy of `existingOptions` with the normalized document's full
 * text and page count attached as `doclingText` / `doclingPageCount`
 * (overriding any values already present under those keys).
 *
 * @param {{ fullText: string, pageCount: number }} normalized
 * @param {object} [existingOptions] - Options to extend; not mutated.
 * @returns {object} New options object.
 */
function buildDoclingProviderOptions(normalized, existingOptions) {
  const doclingFields = {
    doclingText: normalized.fullText,
    doclingPageCount: normalized.pageCount
  };
  return Object.assign({}, existingOptions, doclingFields);
}
|
|
2878
|
+
/**
 * Removes duplicate source spans, keeping the first occurrence of each and
 * preserving input order.
 *
 * Two spans are duplicates when they share document id, start page, end page,
 * section (or field path), and text hash. Missing page/section parts fall back
 * to the span's `location` and finally to the placeholder "na".
 *
 * @param {Iterable<object>} spans - Spans to de-duplicate.
 * @returns {object[]} The unique spans (same object references as the input).
 */
function mergeSourceSpans(spans) {
  const firstByKey = /* @__PURE__ */ new Map();
  for (const span of spans) {
    const location = span.location;
    const startPage = span.pageStart ?? location?.startPage ?? location?.page ?? "na";
    const endPage = span.pageEnd ?? location?.endPage ?? span.pageStart ?? "na";
    const section = span.sectionId ?? location?.fieldPath ?? "na";
    // Only hash the text when the span does not already carry a hash.
    const hash = span.textHash ?? sourceSpanTextHash(span.text);
    const dedupeKey = [span.documentId, startPage, endPage, section, hash].join(":");
    if (!firstByKey.has(dedupeKey)) {
      firstByKey.set(dedupeKey, span);
    }
  }
  return [...firstByKey.values()];
}
|
|
2895
|
+
/**
 * Indexes the document's texts, tables, key/value items, and pictures by ref.
 * Key/value items are read from `key_value_items`, falling back to
 * `keyValueItems`; absent collections are treated as empty.
 *
 * @param {object} document - Docling document.
 * @returns {Map<string, { ref: string, item: object }>} Map populated via `addItems`.
 */
function buildItemMap(document) {
  const collections = [
    ["#/texts", document.texts],
    ["#/tables", document.tables],
    ["#/key_value_items", document.key_value_items ?? document.keyValueItems],
    ["#/pictures", document.pictures]
  ];
  const itemsByRef = /* @__PURE__ */ new Map();
  for (const [baseRef, items] of collections) {
    addItems(itemsByRef, baseRef, items ?? []);
  }
  return itemsByRef;
}
|
|
2903
|
+
/**
 * Registers each item of one collection in `map`, keyed by the item's own ref
 * or, when it has none, by `${baseRef}/${index}`.
 *
 * `null`/`undefined` entries are skipped instead of throwing while their ref
 * is read; the index-based fallback still uses the entry's original array
 * position, so refs of the remaining items do not shift.
 *
 * @param {Map<string, { ref: string, item: object }>} map - Map to populate (mutated).
 * @param {string} baseRef - Collection prefix such as "#/texts".
 * @param {Array} items - Collection entries.
 * @returns {void}
 */
function addItems(map, baseRef, items) {
  items.forEach((item, index) => {
    if (item == null) return;
    const ref = getSelfRef(item) ?? `${baseRef}/${index}`;
    map.set(ref, { ref, item });
  });
}
|
|
2909
|
+
/**
 * Fallback ordering used when the document body yields no refs: all texts,
 * then all tables, then all key/value items, each in array order. Pictures
 * are not part of this fallback. Refs missing from `itemMap` are dropped.
 *
 * @param {object} document - Docling document.
 * @param {Map<string, { ref: string, item: object }>} itemMap - Ref index.
 * @returns {Array<{ ref: string, item: object }>} Map entries in fallback order.
 */
function getFallbackOrderedItems(document, itemMap) {
  const collections = [
    ["#/texts", document.texts ?? []],
    ["#/tables", document.tables ?? []],
    ["#/key_value_items", document.key_value_items ?? document.keyValueItems ?? []]
  ];
  const ordered = [];
  for (const [baseRef, items] of collections) {
    items.forEach((item, index) => {
      const entry = itemMap.get(getSelfRef(item) ?? `${baseRef}/${index}`);
      if (entry) ordered.push(entry);
    });
  }
  return ordered;
}
|
|
2917
|
+
/**
 * Walks `document.body` depth-first through `children` references and returns
 * the refs of all reachable items in first-visit (reading) order.
 *
 * A child ref may point at an item (present in `itemMap`) or at a group (from
 * `document.groups`); groups contribute no ref of their own, only their
 * descendants. Every ref — item, group, or unknown — is expanded at most
 * once, so cyclic or repeated `children` references terminate instead of
 * recursing without bound. For acyclic input the result is unchanged, because
 * re-walking an already expanded subtree can never produce a new ref.
 *
 * @param {object} document - Docling document (`body`, optional `groups`).
 * @param {Map<string, { ref: string, item: object }>} itemMap - Ref index.
 * @returns {string[]} Item refs in traversal order, without duplicates.
 */
function getOrderedBodyRefs(document, itemMap) {
  const groupMap = /* @__PURE__ */ new Map();
  (document.groups ?? []).forEach((group, index) => {
    groupMap.set(getSelfRef(group) ?? `#/groups/${index}`, group);
  });
  const refs = [];
  const visited = /* @__PURE__ */ new Set();
  const visitRef = (ref) => {
    // Mark before descending so a cycle back to this ref is cut off.
    if (visited.has(ref)) return;
    visited.add(ref);
    const itemEntry = itemMap.get(ref);
    if (itemEntry) {
      refs.push(ref);
      visitNode(itemEntry.item);
      return;
    }
    // Not an item: treat as a group; an unknown ref yields an undefined node.
    visitNode(groupMap.get(ref));
  };
  const visitNode = (node) => {
    for (const child of node?.children ?? []) {
      const ref = getRef(child);
      if (!ref) continue;
      visitRef(ref);
    }
  };
  visitNode(document.body);
  return refs;
}
|
|
2946
|
+
/**
 * Converts one Docling item into a normalized text unit.
 *
 * @param {string} ref - Ref under which the item is indexed.
 * @param {object} item - Docling item; `prov` entries supply pages and boxes.
 * @returns {{ ref: string, label: (string|undefined), text: string,
 *   pageStart: (number|undefined), pageEnd: (number|undefined),
 *   bboxes: (Array|undefined) } | undefined} The unit, or undefined when the
 *   item has no text after trimming.
 */
function normalizeItem(ref, item) {
  const text = getItemText(item).trim();
  if (!text) return void 0;

  const pages = [];
  const boxes = [];
  (item.prov ?? []).forEach((prov) => {
    // Only positive numeric page numbers count towards the page range.
    const page = getPageNumber(prov);
    if (typeof page === "number" && page > 0) pages.push(page);
    const box = toSourceSpanBBox(prov);
    if (box) boxes.push(box);
  });

  let pageStart = void 0;
  let pageEnd = void 0;
  if (pages.length > 0) {
    pageStart = Math.min(...pages);
    pageEnd = Math.max(...pages);
  }

  return {
    ref,
    label: typeof item.label === "string" ? item.label : void 0,
    text,
    pageStart,
    pageEnd,
    bboxes: boxes.length > 0 ? boxes : void 0
  };
}
|
|
2962
|
+
/**
 * Picks the text for an item: non-blank `text` first, then non-blank `orig`,
 * then a table rendering of `item.data`; "" when none of these is available.
 * The chosen string is returned untrimmed.
 *
 * @param {object} item - Docling item.
 * @returns {string} Item text or "".
 */
function getItemText(item) {
  for (const candidate of [item.text, item.orig]) {
    if (typeof candidate === "string" && candidate.trim()) return candidate;
  }
  return tableToMarkdown(item.data) || "";
}
|
|
2969
|
+
/**
 * Renders table data (`table_cells` or `tableCells`) as text.
 *
 * Cells are placed on a grid by row/column index (each defaulting to 0 when
 * no numeric index is found). Cells without text are ignored, as are cells
 * whose row or column is not a non-negative integer — such an index has no
 * slot in the grid and previously caused a TypeError (row) or a silently lost
 * cell (column).
 *
 * A single-row grid is returned as its non-empty cells joined by " | ";
 * otherwise the first row is used as the header of a pipe-delimited table.
 *
 * @param {unknown} data - Table payload of an item.
 * @returns {string | undefined} Rendered table, or undefined when there are
 *   no usable cells.
 */
function tableToMarkdown(data) {
  const record = asRecord(data);
  const cells = Array.isArray(record?.table_cells) ? record.table_cells : Array.isArray(record?.tableCells) ? record.tableCells : void 0;
  if (!cells) return void 0;
  const isGridIndex = (value) => Number.isInteger(value) && value >= 0;
  const parsedCells = cells
    .map((cell) => asRecord(cell))
    .filter((cell) => Boolean(cell))
    .map((cell) => ({
      row: firstNumber2([cell.start_row_offset, cell.row_header, cell.row, cell.rowIndex]) ?? 0,
      col: firstNumber2([cell.start_col_offset, cell.col, cell.colIndex]) ?? 0,
      text: firstString([cell.text, cell.orig, cell.content])
    }))
    .filter((cell) => cell.text && isGridIndex(cell.row) && isGridIndex(cell.col));
  if (parsedCells.length === 0) return void 0;
  // Fold instead of spreading into Math.max so large tables cannot hit the
  // engine's argument-count limit; indices are >= 0, so 0 is a safe seed.
  const maxRow = parsedCells.reduce((max, cell) => Math.max(max, cell.row), 0);
  const maxCol = parsedCells.reduce((max, cell) => Math.max(max, cell.col), 0);
  const rows = Array.from({ length: maxRow + 1 }, () => Array.from({ length: maxCol + 1 }, () => ""));
  for (const cell of parsedCells) {
    rows[cell.row][cell.col] = cell.text;
  }
  if (rows.length === 1) return rows[0].filter(Boolean).join(" | ");
  const header = rows[0];
  const separator = header.map(() => "---");
  return [header, separator, ...rows.slice(1)].map((row) => `| ${row.map((value) => value.trim()).join(" | ")} |`).join("\n");
}
|
|
2990
|
+
/**
 * Determines the page count of a Docling document (always at least 1).
 *
 * - `document.pages` as an array: its length.
 * - `document.pages` as an object: the largest finite numeric key, or the
 *   number of keys when no key yields a positive number.
 * - Otherwise: the largest page number referenced by any unit.
 *
 * Maxima are folded with `reduce` rather than spread into `Math.max`, so a
 * document with a very large number of units or pages cannot exceed the
 * engine's argument-count limit.
 *
 * @param {object} document - Docling document.
 * @param {Array<{ pageStart?: number, pageEnd?: number }>} units - Normalized units.
 * @returns {number} Page count.
 */
function inferPageCount(document, units) {
  const pages = document.pages;
  if (Array.isArray(pages)) return Math.max(1, pages.length);
  if (pages && typeof pages === "object") {
    const keys = Object.keys(pages);
    const numericMax = keys
      .map((key) => Number(key))
      .filter((value) => Number.isFinite(value))
      .reduce((max, value) => Math.max(max, value), 0);
    return Math.max(1, numericMax || keys.length);
  }
  return units.reduce(
    (max, unit) => Math.max(max, unit.pageStart ?? 0, unit.pageEnd ?? 0),
    1
  );
}
|
|
3000
|
+
/**
 * Reads a node's own reference, accepting both `self_ref` and `selfRef`.
 * Null-safe: a `null`/`undefined` node yields undefined instead of throwing,
 * so callers can fall back to an index-based ref.
 *
 * @param {object | null | undefined} value - Item or group node.
 * @returns {string | undefined} The node's ref, if it has one.
 */
function getSelfRef(value) {
  return value?.self_ref ?? value?.selfRef;
}
|
|
3003
|
+
/**
 * Resolves a child reference to its ref string. A string is returned as-is;
 * an object is read via `$ref`, then `ref`. Null-safe: a `null`/`undefined`
 * child yields undefined instead of throwing.
 *
 * @param {string | object | null | undefined} value - Child reference.
 * @returns {string | undefined} The referenced ref, if any.
 */
function getRef(value) {
  if (typeof value === "string") return value;
  return value?.$ref ?? value?.ref;
}
|
|
3007
|
+
/**
 * Reads the page number of a provenance entry, trying `page_no`, then
 * `pageNo`, then `page`. A key is skipped only when it is null or undefined.
 *
 * @param {object} prov - Provenance entry.
 * @returns {*} The first present page value (not validated), or undefined.
 */
function getPageNumber(prov) {
  const snakeCase = prov.page_no;
  if (snakeCase != null) return snakeCase;
  const camelCase = prov.pageNo;
  if (camelCase != null) return camelCase;
  return prov.page;
}
|
|
3010
|
+
/**
 * Converts a provenance entry's bounding box into `{ page, x, y, width, height }`.
 *
 * `x` is read from `x`/`l`/`left` and `y` from `y`/`t`/`top`. Width and height
 * are taken from the box when given explicitly; otherwise they are derived
 * from the right/bottom edges as absolute extents. The absolute value matters
 * when the top edge is numerically larger than the bottom edge, where
 * `bottom - y` would otherwise yield a negative height.
 * NOTE(review): this looks like it applies to boxes using a bottom-left
 * coordinate origin — confirm against the producer. `y` is passed through
 * unchanged; whether consumers expect it converted to a top-left origin
 * cannot be determined here.
 *
 * @param {object} prov - Provenance entry with a page number and `bbox`.
 * @returns {{ page: *, x: number, y: number, width: number, height: number } | undefined}
 *   The box, or undefined when page, bbox, origin, or size is unavailable.
 */
function toSourceSpanBBox(prov) {
  const page = getPageNumber(prov);
  const bbox = asRecord(prov.bbox);
  if (!page || !bbox) return void 0;
  const x = firstNumber2([bbox.x, bbox.l, bbox.left]);
  const y = firstNumber2([bbox.y, bbox.t, bbox.top]);
  if (x == null || y == null) return void 0;
  const width = firstNumber2([bbox.width]);
  const height = firstNumber2([bbox.height]);
  const right = firstNumber2([bbox.r, bbox.right]);
  const bottom = firstNumber2([bbox.b, bbox.bottom]);
  const resolvedWidth = width ?? (right != null ? Math.abs(right - x) : void 0);
  const resolvedHeight = height ?? (bottom != null ? Math.abs(bottom - y) : void 0);
  if (resolvedWidth == null || resolvedHeight == null) return void 0;
  return { page, x, y, width: resolvedWidth, height: resolvedHeight };
}
|
|
3026
|
+
/**
 * Clamps a page number into the range 1..pageCount. The lower bound wins:
 * the result is 1 even when `pageCount` is below 1.
 *
 * @param {number} page - Requested page.
 * @param {number} pageCount - Number of pages in the document.
 * @returns {number} Clamped page number.
 */
function clampPage(page, pageCount) {
  const cappedAtLast = Math.min(pageCount, page);
  return Math.max(1, cappedAtLast);
}
|
|
3029
|
+
/**
 * Appends `next` to `existing`, separated by a blank line. When `existing`
 * is empty or missing, `next` is returned on its own.
 *
 * @param {string | undefined} existing - Text accumulated so far.
 * @param {string} next - Text to append.
 * @returns {string} Combined text.
 */
function appendText(existing, next) {
  if (!existing) return next;
  return "".concat(existing, "\n\n", next);
}
|
|
3034
|
+
/**
 * Narrows a value to a non-array object.
 *
 * @param {unknown} value - Value to inspect.
 * @returns {object | undefined} `value` itself when it is a non-null,
 *   non-array object; otherwise undefined.
 */
function asRecord(value) {
  if (!value || typeof value !== "object" || Array.isArray(value)) return void 0;
  return value;
}
|
|
3037
|
+
/**
 * Returns the first candidate that is a non-blank string, trimmed.
 *
 * @param {Array} values - Candidates in priority order.
 * @returns {string} Trimmed match, or "" when no candidate qualifies.
 */
function firstString(values) {
  const match = values.find((candidate) => typeof candidate === "string" && candidate.trim());
  return match ? match.trim() : "";
}
|
|
3043
|
+
/**
 * Returns the first candidate that is a finite number (0 included).
 * Numeric strings, booleans, NaN, and infinities are not accepted.
 *
 * @param {Array} values - Candidates in priority order.
 * @returns {number | undefined} First finite number, or undefined.
 */
function firstNumber2(values) {
  return values.find((candidate) => typeof candidate === "number" && Number.isFinite(candidate));
}
|
|
3049
|
+
|
|
2797
3050
|
// src/extraction/extractor.ts
|
|
2798
3051
|
function sourceSpansForPageRange(providerOptions, startPage, endPage) {
|
|
2799
3052
|
const sourceSpans = providerOptions?.sourceSpans;
|
|
@@ -2835,20 +3088,38 @@ async function runExtractor(params) {
|
|
|
2835
3088
|
generateObject,
|
|
2836
3089
|
convertPdfToImages,
|
|
2837
3090
|
maxTokens = 4096,
|
|
3091
|
+
taskKind,
|
|
3092
|
+
budgetDiagnostics,
|
|
2838
3093
|
providerOptions,
|
|
2839
3094
|
pageRangeCache
|
|
2840
3095
|
} = params;
|
|
2841
3096
|
const extractorProviderOptions = { ...providerOptions };
|
|
2842
3097
|
let fullPrompt;
|
|
2843
|
-
|
|
2844
|
-
|
|
2845
|
-
|
|
3098
|
+
if (params.getPageRangeText) {
|
|
3099
|
+
const pageText = await params.getPageRangeText(startPage, endPage);
|
|
3100
|
+
extractorProviderOptions.doclingText = pageText;
|
|
3101
|
+
extractorProviderOptions.doclingPageRange = { startPage, endPage };
|
|
3102
|
+
fullPrompt = `${prompt}
|
|
3103
|
+
|
|
3104
|
+
[Document pages ${startPage}-${endPage} are provided below as Docling-extracted text.]
|
|
3105
|
+
|
|
3106
|
+
${pageText || "(No Docling text was available for this page range.)"}`;
|
|
3107
|
+
} else if (convertPdfToImages) {
|
|
3108
|
+
if (!pdfInput) {
|
|
3109
|
+
throw new Error("pdfInput is required when extracting page images.");
|
|
3110
|
+
}
|
|
3111
|
+
const needsPdfBase64 = !params.getPageImages;
|
|
3112
|
+
const pdfBase64 = needsPdfBase64 ? await pdfInputToBase64(pdfInput) : void 0;
|
|
2846
3113
|
const images = params.getPageImages ? await params.getPageImages(startPage, endPage) : await convertPdfToImages(pdfBase64, startPage, endPage);
|
|
2847
3114
|
extractorProviderOptions.images = images;
|
|
2848
3115
|
fullPrompt = `${prompt}
|
|
2849
3116
|
|
|
2850
3117
|
[Document pages ${startPage}-${endPage} are provided as images.]`;
|
|
2851
3118
|
} else {
|
|
3119
|
+
if (!pdfInput) {
|
|
3120
|
+
throw new Error("pdfInput is required when extracting page PDFs.");
|
|
3121
|
+
}
|
|
3122
|
+
const pdfBase64 = params.getPageRangePdf ? void 0 : await pdfInputToBase64(pdfInput);
|
|
2852
3123
|
const cacheKey = `${startPage}-${endPage}`;
|
|
2853
3124
|
const cachedPagesPdf = pageRangeCache?.get(cacheKey);
|
|
2854
3125
|
const pagesPdf = cachedPagesPdf ?? (params.getPageRangePdf ? await params.getPageRangePdf(startPage, endPage) : await extractPageRange(pdfBase64, startPage, endPage));
|
|
@@ -2868,6 +3139,8 @@ async function runExtractor(params) {
|
|
|
2868
3139
|
prompt: fullPrompt,
|
|
2869
3140
|
schema: strictSchema,
|
|
2870
3141
|
maxTokens,
|
|
3142
|
+
taskKind,
|
|
3143
|
+
budgetDiagnostics,
|
|
2871
3144
|
providerOptions: extractorProviderOptions
|
|
2872
3145
|
})
|
|
2873
3146
|
);
|
|
@@ -3847,6 +4120,8 @@ async function formatDocumentContent(doc, generateText, options) {
|
|
|
3847
4120
|
() => generateText({
|
|
3848
4121
|
prompt,
|
|
3849
4122
|
maxTokens: options?.maxTokens ?? 16384,
|
|
4123
|
+
taskKind: options?.taskKind,
|
|
4124
|
+
budgetDiagnostics: options?.budgetDiagnostics,
|
|
3850
4125
|
providerOptions: options?.providerOptions
|
|
3851
4126
|
})
|
|
3852
4127
|
);
|
|
@@ -3884,7 +4159,7 @@ function formatAddress(addr) {
|
|
|
3884
4159
|
/**
 * Keeps only the non-array object elements of an array.
 *
 * @param {unknown} value - Value expected to be an array.
 * @returns {object[]} Filtered elements; [] when `value` is not an array.
 */
function asRecordArray(value) {
  if (!Array.isArray(value)) return [];
  return value.filter((entry) => entry !== null && typeof entry === "object" && !Array.isArray(entry));
}
|
|
3887
|
-
function
|
|
4162
|
+
function firstString2(item, keys) {
|
|
3888
4163
|
for (const key of keys) {
|
|
3889
4164
|
const value = item[key];
|
|
3890
4165
|
if (typeof value === "string" && value.trim()) return value;
|
|
@@ -4241,32 +4516,32 @@ ${exc.content}`.trim(), {
|
|
|
4241
4516
|
);
|
|
4242
4517
|
});
|
|
4243
4518
|
asRecordArray(extendedDoc.definitions).forEach((definition, i) => {
|
|
4244
|
-
const term =
|
|
4245
|
-
const body =
|
|
4519
|
+
const term = firstString2(definition, ["term", "name", "title"]) ?? `Definition ${i + 1}`;
|
|
4520
|
+
const body = firstString2(definition, ["definition", "content", "text", "meaning"]);
|
|
4246
4521
|
pushChunk(
|
|
4247
4522
|
`definition:${i}`,
|
|
4248
4523
|
"definition",
|
|
4249
4524
|
lines([
|
|
4250
4525
|
`Definition: ${term}`,
|
|
4251
4526
|
body,
|
|
4252
|
-
|
|
4527
|
+
firstString2(definition, ["originalContent", "source"]) ? `Source: ${firstString2(definition, ["originalContent", "source"])}` : null
|
|
4253
4528
|
]),
|
|
4254
4529
|
{
|
|
4255
4530
|
term,
|
|
4256
|
-
formNumber:
|
|
4257
|
-
formTitle:
|
|
4531
|
+
formNumber: firstString2(definition, ["formNumber"]),
|
|
4532
|
+
formTitle: firstString2(definition, ["formTitle"]),
|
|
4258
4533
|
pageNumber: typeof definition.pageNumber === "number" ? definition.pageNumber : void 0,
|
|
4259
|
-
sectionRef:
|
|
4534
|
+
sectionRef: firstString2(definition, ["sectionRef", "sectionTitle"]),
|
|
4260
4535
|
documentType: doc.type
|
|
4261
4536
|
}
|
|
4262
4537
|
);
|
|
4263
4538
|
});
|
|
4264
4539
|
const coveredReasons = asRecordArray(extendedDoc.coveredReasons ?? extendedDoc.covered_reasons);
|
|
4265
4540
|
coveredReasons.forEach((coveredReason, i) => {
|
|
4266
|
-
const title =
|
|
4267
|
-
const coverageName =
|
|
4268
|
-
const reasonNumber =
|
|
4269
|
-
const body =
|
|
4541
|
+
const title = firstString2(coveredReason, ["title", "name", "reason", "peril", "cause"]) ?? `Covered Reason ${i + 1}`;
|
|
4542
|
+
const coverageName = firstString2(coveredReason, ["coverageName", "coverage", "coveragePart"]);
|
|
4543
|
+
const reasonNumber = firstString2(coveredReason, ["reasonNumber", "number"]);
|
|
4544
|
+
const body = firstString2(coveredReason, ["content", "description", "text", "coverageGrant"]);
|
|
4270
4545
|
pushChunk(
|
|
4271
4546
|
`covered_reason:${i}`,
|
|
4272
4547
|
"covered_reason",
|
|
@@ -4275,16 +4550,16 @@ ${exc.content}`.trim(), {
|
|
|
4275
4550
|
reasonNumber ? `Reason Number: ${reasonNumber}` : null,
|
|
4276
4551
|
`Covered Reason: ${title}`,
|
|
4277
4552
|
body,
|
|
4278
|
-
|
|
4553
|
+
firstString2(coveredReason, ["originalContent", "source"]) ? `Source: ${firstString2(coveredReason, ["originalContent", "source"])}` : null
|
|
4279
4554
|
]),
|
|
4280
4555
|
{
|
|
4281
4556
|
coverageName,
|
|
4282
4557
|
reasonNumber,
|
|
4283
4558
|
title,
|
|
4284
|
-
formNumber:
|
|
4285
|
-
formTitle:
|
|
4559
|
+
formNumber: firstString2(coveredReason, ["formNumber"]),
|
|
4560
|
+
formTitle: firstString2(coveredReason, ["formTitle"]),
|
|
4286
4561
|
pageNumber: typeof coveredReason.pageNumber === "number" ? coveredReason.pageNumber : void 0,
|
|
4287
|
-
sectionRef:
|
|
4562
|
+
sectionRef: firstString2(coveredReason, ["sectionRef", "sectionTitle"]),
|
|
4288
4563
|
documentType: doc.type
|
|
4289
4564
|
}
|
|
4290
4565
|
);
|
|
@@ -4304,10 +4579,10 @@ ${exc.content}`.trim(), {
|
|
|
4304
4579
|
reasonNumber,
|
|
4305
4580
|
title,
|
|
4306
4581
|
conditionIndex,
|
|
4307
|
-
formNumber:
|
|
4308
|
-
formTitle:
|
|
4582
|
+
formNumber: firstString2(coveredReason, ["formNumber"]),
|
|
4583
|
+
formTitle: firstString2(coveredReason, ["formTitle"]),
|
|
4309
4584
|
pageNumber: typeof coveredReason.pageNumber === "number" ? coveredReason.pageNumber : void 0,
|
|
4310
|
-
sectionRef:
|
|
4585
|
+
sectionRef: firstString2(coveredReason, ["sectionRef", "sectionTitle"]),
|
|
4311
4586
|
documentType: doc.type
|
|
4312
4587
|
}
|
|
4313
4588
|
);
|
|
@@ -6778,21 +7053,21 @@ Return JSON only.`;
|
|
|
6778
7053
|
}
|
|
6779
7054
|
|
|
6780
7055
|
// src/prompts/extractors/index.ts
|
|
6781
|
-
function
|
|
7056
|
+
/**
 * Narrows a value to an object. Unlike a stricter record check, arrays are
 * accepted here because only `typeof` is tested.
 *
 * @param {unknown} data - Value to inspect.
 * @returns {object | undefined} `data` when it is a non-null object; otherwise undefined.
 */
function asRecord2(data) {
  if (!data || typeof data !== "object") return void 0;
  return data;
}
|
|
6784
7059
|
/**
 * Reads the `sections` array from extractor output.
 *
 * @param {unknown} data - Extractor output.
 * @returns {Array} `data.sections` when it is an array; otherwise [].
 */
function getSections2(data) {
  const record = asRecord2(data);
  const sections = record == null ? void 0 : record.sections;
  if (!Array.isArray(sections)) return [];
  return sections;
}
|
|
6788
7063
|
/**
 * Reports whether extractor output contains no covered reasons. The list is
 * read from `coveredReasons`, or from `covered_reasons` when the former is
 * not an array.
 *
 * @param {unknown} data - Extractor output.
 * @returns {boolean} True when `data` is not an object, neither key holds an
 *   array, or the selected array is empty.
 */
function isCoveredReasonsEmpty(data) {
  const record = asRecord2(data);
  if (!record) return true;
  for (const key of ["coveredReasons", "covered_reasons"]) {
    const candidate = record[key];
    // The first key that holds an array decides the result.
    if (Array.isArray(candidate)) return candidate.length === 0;
  }
  return true;
}
|
|
6794
7069
|
/**
 * Reports whether extractor output contains no definitions.
 *
 * @param {unknown} data - Extractor output.
 * @returns {boolean} False only when `data.definitions` is a non-empty array.
 */
function isDefinitionsEmpty(data) {
  const definitions = asRecord2(data)?.definitions;
  if (Array.isArray(definitions)) {
    return definitions.length === 0;
  }
  return true;
}
|
|
6798
7073
|
function sectionLooksLikeCoveredReason(section) {
|
|
@@ -7026,6 +7301,14 @@ function decideReferentialResolutionAction(params) {
|
|
|
7026
7301
|
}
|
|
7027
7302
|
|
|
7028
7303
|
// src/extraction/resolve-referential.ts
|
|
7304
|
+
/**
 * Builds the prompt suffix that carries the Docling text found in the
 * provider options.
 *
 * @param {{ doclingText?: unknown } | undefined} providerOptions
 * @returns {string} A blank line, the "DOCLING DOCUMENT TEXT:" heading, and
 *   the text; "" when `doclingText` is missing, not a string, or blank.
 */
function formatDoclingTextContext(providerOptions) {
  const doclingText = providerOptions?.doclingText;
  const hasText = typeof doclingText === "string" && doclingText.trim() !== "";
  if (!hasText) return "";
  return `\n\nDOCLING DOCUMENT TEXT:\n${doclingText}`;
}
|
|
7029
7312
|
function parseReferenceTarget(text) {
|
|
7030
7313
|
if (typeof text !== "string") return void 0;
|
|
7031
7314
|
const normalized = text.trim();
|
|
@@ -7107,10 +7390,12 @@ Return the page range (1-indexed) where this section is located. If the section
|
|
|
7107
7390
|
|
|
7108
7391
|
If you cannot find the section, return startPage: 0 and endPage: 0.
|
|
7109
7392
|
|
|
7110
|
-
Return JSON only
|
|
7393
|
+
Return JSON only.${formatDoclingTextContext(providerOptions)}`,
|
|
7111
7394
|
schema: PageLocationSchema,
|
|
7112
7395
|
maxTokens: budget.maxTokens,
|
|
7113
|
-
|
|
7396
|
+
taskKind: "extraction_referential_lookup",
|
|
7397
|
+
budgetDiagnostics: budget,
|
|
7398
|
+
providerOptions: pdfInput ? await buildPdfProviderOptions(pdfInput, providerOptions) : providerOptions
|
|
7114
7399
|
},
|
|
7115
7400
|
{
|
|
7116
7401
|
fallback: { startPage: 0, endPage: 0 },
|
|
@@ -7144,6 +7429,7 @@ async function resolveReferentialCoverages(params) {
|
|
|
7144
7429
|
convertPdfToImages,
|
|
7145
7430
|
getPageRangePdf,
|
|
7146
7431
|
getPageImages,
|
|
7432
|
+
getPageRangeText,
|
|
7147
7433
|
concurrency = 2,
|
|
7148
7434
|
providerOptions,
|
|
7149
7435
|
modelCapabilities,
|
|
@@ -7255,7 +7541,10 @@ async function resolveReferentialCoverages(params) {
|
|
|
7255
7541
|
convertPdfToImages,
|
|
7256
7542
|
getPageRangePdf,
|
|
7257
7543
|
getPageImages,
|
|
7544
|
+
getPageRangeText,
|
|
7258
7545
|
maxTokens: budget.maxTokens,
|
|
7546
|
+
taskKind: "extraction_referential_lookup",
|
|
7547
|
+
budgetDiagnostics: budget,
|
|
7259
7548
|
providerOptions
|
|
7260
7549
|
});
|
|
7261
7550
|
trackUsage(result.usage);
|
|
@@ -7348,6 +7637,7 @@ async function runFocusedExtractorWithFallback(params) {
|
|
|
7348
7637
|
pageRangeCache,
|
|
7349
7638
|
getPageRangePdf,
|
|
7350
7639
|
getPageImages,
|
|
7640
|
+
getPageRangeText,
|
|
7351
7641
|
trackUsage,
|
|
7352
7642
|
resolveBudget,
|
|
7353
7643
|
log
|
|
@@ -7372,10 +7662,13 @@ async function runFocusedExtractorWithFallback(params) {
|
|
|
7372
7662
|
generateObject,
|
|
7373
7663
|
convertPdfToImages,
|
|
7374
7664
|
maxTokens: budget.maxTokens,
|
|
7665
|
+
taskKind,
|
|
7666
|
+
budgetDiagnostics: budget,
|
|
7375
7667
|
providerOptions,
|
|
7376
7668
|
pageRangeCache,
|
|
7377
7669
|
getPageRangePdf,
|
|
7378
|
-
getPageImages
|
|
7670
|
+
getPageImages,
|
|
7671
|
+
getPageRangeText
|
|
7379
7672
|
});
|
|
7380
7673
|
trackUsage(result.usage, {
|
|
7381
7674
|
taskKind,
|
|
@@ -7415,10 +7708,13 @@ async function runFocusedExtractorWithFallback(params) {
|
|
|
7415
7708
|
generateObject,
|
|
7416
7709
|
convertPdfToImages,
|
|
7417
7710
|
maxTokens: budget.maxTokens,
|
|
7711
|
+
taskKind,
|
|
7712
|
+
budgetDiagnostics: budget,
|
|
7418
7713
|
providerOptions,
|
|
7419
7714
|
pageRangeCache,
|
|
7420
7715
|
getPageRangePdf,
|
|
7421
|
-
getPageImages
|
|
7716
|
+
getPageImages,
|
|
7717
|
+
getPageRangeText
|
|
7422
7718
|
});
|
|
7423
7719
|
trackUsage(fallbackResult.usage, {
|
|
7424
7720
|
taskKind,
|
|
@@ -8262,7 +8558,7 @@ function createExtractor(config) {
|
|
|
8262
8558
|
}
|
|
8263
8559
|
return lines.length > 0 ? lines.join("\n") : "";
|
|
8264
8560
|
}
|
|
8265
|
-
async function runFocusedExtractorTask(task, pdfInput, memory, pageRangeCache, getPageRangePdf, getPageImages) {
|
|
8561
|
+
async function runFocusedExtractorTask(task, pdfInput, memory, pageRangeCache, getPageRangePdf, getPageImages, getPageRangeText) {
|
|
8266
8562
|
if (task.extractorName === "supplementary") {
|
|
8267
8563
|
const alreadyExtractedSummary = buildAlreadyExtractedSummary(memory);
|
|
8268
8564
|
const budget = resolveBudget("extraction_focused", 4096);
|
|
@@ -8277,10 +8573,13 @@ function createExtractor(config) {
|
|
|
8277
8573
|
generateObject,
|
|
8278
8574
|
convertPdfToImages,
|
|
8279
8575
|
maxTokens: budget.maxTokens,
|
|
8576
|
+
taskKind: "extraction_focused",
|
|
8577
|
+
budgetDiagnostics: budget,
|
|
8280
8578
|
providerOptions: activeProviderOptions,
|
|
8281
8579
|
pageRangeCache,
|
|
8282
8580
|
getPageRangePdf,
|
|
8283
|
-
getPageImages
|
|
8581
|
+
getPageImages,
|
|
8582
|
+
getPageRangeText
|
|
8284
8583
|
});
|
|
8285
8584
|
trackUsage(result.usage, {
|
|
8286
8585
|
taskKind: "extraction_focused",
|
|
@@ -8299,6 +8598,7 @@ function createExtractor(config) {
|
|
|
8299
8598
|
pageRangeCache,
|
|
8300
8599
|
getPageRangePdf,
|
|
8301
8600
|
getPageImages,
|
|
8601
|
+
getPageRangeText,
|
|
8302
8602
|
trackUsage,
|
|
8303
8603
|
resolveBudget,
|
|
8304
8604
|
log
|
|
@@ -8314,8 +8614,14 @@ function createExtractor(config) {
|
|
|
8314
8614
|
if (extractorPages.size === 0) return "No page assignments available.";
|
|
8315
8615
|
return [...extractorPages.entries()].map(([extractorName, pages]) => `${extractorName}: ${pages.length} page(s), pages ${pages.join(", ")}`).join("\n");
|
|
8316
8616
|
}
|
|
8317
|
-
async function extract(
|
|
8617
|
+
async function extract(input, documentId, options) {
|
|
8318
8618
|
const id = documentId ?? `doc-${Date.now()}`;
|
|
8619
|
+
const isDoclingInput = isDoclingExtractionInput(input);
|
|
8620
|
+
const pdfInput = isDoclingInput ? void 0 : input;
|
|
8621
|
+
const doclingDocument = isDoclingInput ? normalizeDoclingDocument(input.document, {
|
|
8622
|
+
documentId: id,
|
|
8623
|
+
sourceKind: input.sourceKind
|
|
8624
|
+
}) : void 0;
|
|
8319
8625
|
const memory = /* @__PURE__ */ new Map();
|
|
8320
8626
|
totalUsage = { inputTokens: 0, outputTokens: 0 };
|
|
8321
8627
|
modelCalls = 0;
|
|
@@ -8325,7 +8631,10 @@ function createExtractor(config) {
|
|
|
8325
8631
|
modelCalls: [],
|
|
8326
8632
|
totalModelCallDurationMs: 0
|
|
8327
8633
|
};
|
|
8328
|
-
const sourceSpans =
|
|
8634
|
+
const sourceSpans = mergeSourceSpans([
|
|
8635
|
+
...doclingDocument?.sourceSpans ?? [],
|
|
8636
|
+
...options?.sourceSpans ?? []
|
|
8637
|
+
]);
|
|
8329
8638
|
const sourceChunks = sourceSpans.length ? chunkSourceSpans(sourceSpans) : [];
|
|
8330
8639
|
activeProviderOptions = sourceSpans.length ? { ...providerOptions, sourceSpans, sourceChunks } : providerOptions;
|
|
8331
8640
|
if (sourceStore && sourceSpans.length > 0) {
|
|
@@ -8354,24 +8663,40 @@ function createExtractor(config) {
|
|
|
8354
8663
|
let fullPdfProviderOptionsPromise;
|
|
8355
8664
|
let pageCountPromise;
|
|
8356
8665
|
async function getPdfBase64ForExtraction() {
|
|
8666
|
+
if (!pdfInput) {
|
|
8667
|
+
throw new Error("PDF input is not available for Docling extraction.");
|
|
8668
|
+
}
|
|
8357
8669
|
if (pdfBase64Cache === void 0) {
|
|
8358
8670
|
pdfBase64Cache = await pdfInputToBase64(pdfInput);
|
|
8359
8671
|
}
|
|
8360
8672
|
return pdfBase64Cache;
|
|
8361
8673
|
}
|
|
8362
8674
|
async function getCachedPageCount() {
|
|
8675
|
+
if (doclingDocument) return doclingDocument.pageCount;
|
|
8676
|
+
if (!pdfInput) {
|
|
8677
|
+
throw new Error("PDF input is required to read page count.");
|
|
8678
|
+
}
|
|
8363
8679
|
if (!pageCountPromise) {
|
|
8364
8680
|
pageCountPromise = getPdfSlicer().then((slicer) => slicer.getPageCount()).catch(() => getPdfPageCount(pdfInput));
|
|
8365
8681
|
}
|
|
8366
8682
|
return pageCountPromise;
|
|
8367
8683
|
}
|
|
8368
|
-
async function
|
|
8684
|
+
async function getFullDocumentProviderOptions() {
|
|
8685
|
+
if (doclingDocument) {
|
|
8686
|
+
return buildDoclingProviderOptions(doclingDocument, activeProviderOptions);
|
|
8687
|
+
}
|
|
8688
|
+
if (!pdfInput) {
|
|
8689
|
+
return activeProviderOptions ?? {};
|
|
8690
|
+
}
|
|
8369
8691
|
if (!fullPdfProviderOptionsPromise) {
|
|
8370
8692
|
fullPdfProviderOptionsPromise = buildPdfProviderOptions(pdfInput, activeProviderOptions);
|
|
8371
8693
|
}
|
|
8372
8694
|
return fullPdfProviderOptionsPromise;
|
|
8373
8695
|
}
|
|
8374
8696
|
async function getPdfSlicer() {
|
|
8697
|
+
if (!pdfInput) {
|
|
8698
|
+
throw new Error("PDF input is not available for Docling extraction.");
|
|
8699
|
+
}
|
|
8375
8700
|
if (!pdfSlicerPromise) {
|
|
8376
8701
|
pdfSlicerPromise = createPdfPageSlicer(pdfInput);
|
|
8377
8702
|
}
|
|
@@ -8410,6 +8735,23 @@ function createExtractor(config) {
|
|
|
8410
8735
|
pageRangeImageCache.set(cacheKey, promise);
|
|
8411
8736
|
return promise;
|
|
8412
8737
|
}
|
|
8738
|
+
async function getPageRangeText(startPage, endPage) {
|
|
8739
|
+
return doclingDocument ? getDoclingPageRangeText(doclingDocument, startPage, endPage) : "";
|
|
8740
|
+
}
|
|
8741
|
+
function withFullDocumentTextContext(prompt) {
|
|
8742
|
+
if (!doclingDocument) return prompt;
|
|
8743
|
+
return `${prompt}
|
|
8744
|
+
|
|
8745
|
+
DOCLING DOCUMENT TEXT:
|
|
8746
|
+
${doclingDocument.fullText}`;
|
|
8747
|
+
}
|
|
8748
|
+
function withPageRangeTextContext(prompt, startPage, endPage, pageText) {
|
|
8749
|
+
if (!doclingDocument) return prompt;
|
|
8750
|
+
return `${prompt}
|
|
8751
|
+
|
|
8752
|
+
DOCLING DOCUMENT PAGES ${startPage}-${endPage}:
|
|
8753
|
+
${pageText || "(No Docling text was available for this page range.)"}`;
|
|
8754
|
+
}
|
|
8413
8755
|
let classifyResult;
|
|
8414
8756
|
if (resumed?.classifyResult && pipelineCtx.isPhaseComplete("classify")) {
|
|
8415
8757
|
classifyResult = resumed.classifyResult;
|
|
@@ -8422,10 +8764,12 @@ function createExtractor(config) {
|
|
|
8422
8764
|
const classifyResponse = await safeGenerateObject(
|
|
8423
8765
|
generateObject,
|
|
8424
8766
|
{
|
|
8425
|
-
prompt: buildClassifyPrompt(),
|
|
8767
|
+
prompt: withFullDocumentTextContext(buildClassifyPrompt()),
|
|
8426
8768
|
schema: ClassifyResultSchema,
|
|
8427
8769
|
maxTokens: budget.maxTokens,
|
|
8428
|
-
|
|
8770
|
+
taskKind: "extraction_classify",
|
|
8771
|
+
budgetDiagnostics: budget,
|
|
8772
|
+
providerOptions: await getFullDocumentProviderOptions()
|
|
8429
8773
|
},
|
|
8430
8774
|
{
|
|
8431
8775
|
fallback: { documentType: "policy", policyTypes: ["other"], confidence: 0 },
|
|
@@ -8470,10 +8814,12 @@ function createExtractor(config) {
|
|
|
8470
8814
|
const formInventoryResponse = await safeGenerateObject(
|
|
8471
8815
|
generateObject,
|
|
8472
8816
|
{
|
|
8473
|
-
prompt: buildFormInventoryPrompt(templateHints),
|
|
8817
|
+
prompt: withFullDocumentTextContext(buildFormInventoryPrompt(templateHints)),
|
|
8474
8818
|
schema: FormInventorySchema,
|
|
8475
8819
|
maxTokens: budget.maxTokens,
|
|
8476
|
-
|
|
8820
|
+
taskKind: "extraction_form_inventory",
|
|
8821
|
+
budgetDiagnostics: budget,
|
|
8822
|
+
providerOptions: await getFullDocumentProviderOptions()
|
|
8477
8823
|
},
|
|
8478
8824
|
{
|
|
8479
8825
|
fallback: { forms: [] },
|
|
@@ -8516,16 +8862,24 @@ function createExtractor(config) {
|
|
|
8516
8862
|
const pageMapResults = await Promise.all(
|
|
8517
8863
|
pageMapChunks.map(
|
|
8518
8864
|
({ startPage, endPage }) => pageMapLimit(async () => {
|
|
8519
|
-
const pagesPdf = await getPageRangePdf(startPage, endPage);
|
|
8865
|
+
const pagesPdf = doclingDocument ? void 0 : await getPageRangePdf(startPage, endPage);
|
|
8866
|
+
const pagesText = doclingDocument ? await getPageRangeText(startPage, endPage) : "";
|
|
8520
8867
|
const budget = resolveBudget("extraction_page_map", 2048);
|
|
8521
8868
|
const startedAt = Date.now();
|
|
8522
8869
|
const mapResponse = await safeGenerateObject(
|
|
8523
8870
|
generateObject,
|
|
8524
8871
|
{
|
|
8525
|
-
prompt:
|
|
8872
|
+
prompt: withPageRangeTextContext(
|
|
8873
|
+
buildPageMapPrompt(templateHints, startPage, endPage, formInventoryHint),
|
|
8874
|
+
startPage,
|
|
8875
|
+
endPage,
|
|
8876
|
+
pagesText
|
|
8877
|
+
),
|
|
8526
8878
|
schema: PageMapChunkSchema,
|
|
8527
8879
|
maxTokens: budget.maxTokens,
|
|
8528
|
-
|
|
8880
|
+
taskKind: "extraction_page_map",
|
|
8881
|
+
budgetDiagnostics: budget,
|
|
8882
|
+
providerOptions: doclingDocument ? { ...activeProviderOptions, doclingText: pagesText, doclingPageRange: { startPage, endPage } } : { ...activeProviderOptions, pdfBase64: pagesPdf }
|
|
8529
8883
|
},
|
|
8530
8884
|
{
|
|
8531
8885
|
fallback: {
|
|
@@ -8603,7 +8957,7 @@ function createExtractor(config) {
|
|
|
8603
8957
|
}))
|
|
8604
8958
|
];
|
|
8605
8959
|
onProgress?.(`Dispatching ${tasks.length} extractors...`);
|
|
8606
|
-
const extractionPdfInput = await getPdfBase64ForExtraction();
|
|
8960
|
+
const extractionPdfInput = doclingDocument ? void 0 : await getPdfBase64ForExtraction();
|
|
8607
8961
|
const extractorResults = await Promise.all(
|
|
8608
8962
|
tasks.map(
|
|
8609
8963
|
(task) => extractorLimit(async () => {
|
|
@@ -8614,7 +8968,8 @@ function createExtractor(config) {
|
|
|
8614
8968
|
memory,
|
|
8615
8969
|
completedPageRangePdfCache,
|
|
8616
8970
|
getPageRangePdf,
|
|
8617
|
-
convertPdfToImages ? getPageImages : void 0
|
|
8971
|
+
convertPdfToImages ? getPageImages : void 0,
|
|
8972
|
+
doclingDocument ? getPageRangeText : void 0
|
|
8618
8973
|
);
|
|
8619
8974
|
})
|
|
8620
8975
|
)
|
|
@@ -8641,10 +8996,13 @@ function createExtractor(config) {
|
|
|
8641
8996
|
generateObject,
|
|
8642
8997
|
convertPdfToImages,
|
|
8643
8998
|
maxTokens: budget.maxTokens,
|
|
8999
|
+
taskKind: "extraction_focused",
|
|
9000
|
+
budgetDiagnostics: budget,
|
|
8644
9001
|
providerOptions: activeProviderOptions,
|
|
8645
9002
|
pageRangeCache: completedPageRangePdfCache,
|
|
8646
9003
|
getPageRangePdf,
|
|
8647
|
-
getPageImages: convertPdfToImages ? getPageImages : void 0
|
|
9004
|
+
getPageImages: convertPdfToImages ? getPageImages : void 0,
|
|
9005
|
+
getPageRangeText: doclingDocument ? getPageRangeText : void 0
|
|
8648
9006
|
});
|
|
8649
9007
|
trackUsage(supplementaryResult.usage, {
|
|
8650
9008
|
taskKind: "extraction_focused",
|
|
@@ -8680,6 +9038,7 @@ function createExtractor(config) {
|
|
|
8680
9038
|
concurrency,
|
|
8681
9039
|
getPageRangePdf,
|
|
8682
9040
|
getPageImages: convertPdfToImages ? getPageImages : void 0,
|
|
9041
|
+
getPageRangeText: doclingDocument ? getPageRangeText : void 0,
|
|
8683
9042
|
providerOptions: activeProviderOptions,
|
|
8684
9043
|
modelCapabilities,
|
|
8685
9044
|
modelBudgetConstraints,
|
|
@@ -8728,13 +9087,22 @@ function createExtractor(config) {
|
|
|
8728
9087
|
const reviewResponse = await safeGenerateObject(
|
|
8729
9088
|
generateObject,
|
|
8730
9089
|
{
|
|
8731
|
-
prompt: buildReviewPrompt(template.required, extractedKeys, extractionSummary, pageMapSummary, extractorCatalog),
|
|
9090
|
+
prompt: withFullDocumentTextContext(buildReviewPrompt(template.required, extractedKeys, extractionSummary, pageMapSummary, extractorCatalog)),
|
|
8732
9091
|
schema: ReviewResultSchema,
|
|
8733
9092
|
maxTokens: budget.maxTokens,
|
|
8734
|
-
|
|
9093
|
+
taskKind: "extraction_review",
|
|
9094
|
+
budgetDiagnostics: budget,
|
|
9095
|
+
providerOptions: await getFullDocumentProviderOptions()
|
|
8735
9096
|
},
|
|
8736
9097
|
{
|
|
8737
|
-
fallback: {
|
|
9098
|
+
fallback: {
|
|
9099
|
+
complete: false,
|
|
9100
|
+
missingFields: ["llm_review_unavailable"],
|
|
9101
|
+
qualityIssues: [
|
|
9102
|
+
"LLM extraction review failed; deterministic review was used and the result needs review."
|
|
9103
|
+
],
|
|
9104
|
+
additionalTasks: []
|
|
9105
|
+
},
|
|
8738
9106
|
log,
|
|
8739
9107
|
onError: (err, attempt) => log?.(`Review round ${round + 1} attempt ${attempt + 1} failed: ${err}`)
|
|
8740
9108
|
}
|
|
@@ -8754,7 +9122,7 @@ function createExtractor(config) {
|
|
|
8754
9122
|
break;
|
|
8755
9123
|
}
|
|
8756
9124
|
onProgress?.(`Review round ${round + 1}: dispatching ${reviewResponse.object.additionalTasks.length} follow-up extractors...`);
|
|
8757
|
-
const extractionPdfInput = await getPdfBase64ForExtraction();
|
|
9125
|
+
const extractionPdfInput = doclingDocument ? void 0 : await getPdfBase64ForExtraction();
|
|
8758
9126
|
const followUpResults = await Promise.all(
|
|
8759
9127
|
reviewResponse.object.additionalTasks.map(
|
|
8760
9128
|
(task) => extractorLimit(async () => {
|
|
@@ -8764,7 +9132,8 @@ function createExtractor(config) {
|
|
|
8764
9132
|
memory,
|
|
8765
9133
|
completedPageRangePdfCache,
|
|
8766
9134
|
getPageRangePdf,
|
|
8767
|
-
convertPdfToImages ? getPageImages : void 0
|
|
9135
|
+
convertPdfToImages ? getPageImages : void 0,
|
|
9136
|
+
doclingDocument ? getPageRangeText : void 0
|
|
8768
9137
|
);
|
|
8769
9138
|
})
|
|
8770
9139
|
)
|
|
@@ -8835,6 +9204,8 @@ function createExtractor(config) {
|
|
|
8835
9204
|
prompt: buildSummaryPrompt(document),
|
|
8836
9205
|
schema: SummaryResultSchema,
|
|
8837
9206
|
maxTokens: budget.maxTokens,
|
|
9207
|
+
taskKind: "extraction_summary",
|
|
9208
|
+
budgetDiagnostics: budget,
|
|
8838
9209
|
providerOptions: activeProviderOptions
|
|
8839
9210
|
},
|
|
8840
9211
|
{
|
|
@@ -8862,6 +9233,8 @@ function createExtractor(config) {
|
|
|
8862
9233
|
const formatResult = await formatDocumentContent(document, generateText, {
|
|
8863
9234
|
providerOptions: activeProviderOptions,
|
|
8864
9235
|
maxTokens: formatBudget.maxTokens,
|
|
9236
|
+
taskKind: "extraction_format",
|
|
9237
|
+
budgetDiagnostics: formatBudget,
|
|
8865
9238
|
concurrency: formatConcurrency ?? concurrency,
|
|
8866
9239
|
onProgress,
|
|
8867
9240
|
log
|
|
@@ -9264,6 +9637,7 @@ async function classifyApplication(pdfContent, generateObject, providerOptions,
|
|
|
9264
9637
|
Analyze the attached insurance document. If text source units are provided in provider options, use them as supporting context. Do not infer from base64 text.`,
|
|
9265
9638
|
schema: ApplicationClassifyResultSchema,
|
|
9266
9639
|
maxTokens,
|
|
9640
|
+
taskKind: "application_classify",
|
|
9267
9641
|
providerOptions: {
|
|
9268
9642
|
...providerOptions,
|
|
9269
9643
|
pdfBase64: providerOptions?.pdfBase64 ?? pdfContent
|
|
@@ -9366,6 +9740,7 @@ Extract fields from the attached application PDF. Use provider-supplied source u
|
|
|
9366
9740
|
prompt,
|
|
9367
9741
|
schema: FieldExtractionResultSchema,
|
|
9368
9742
|
maxTokens,
|
|
9743
|
+
taskKind: "application_extract_fields",
|
|
9369
9744
|
providerOptions: {
|
|
9370
9745
|
...providerOptions,
|
|
9371
9746
|
pdfBase64: providerOptions?.pdfBase64 ?? pdfContent
|
|
@@ -9419,6 +9794,7 @@ async function autoFillFromContext(fields, orgContext, generateObject, providerO
|
|
|
9419
9794
|
prompt,
|
|
9420
9795
|
schema: AutoFillResultSchema,
|
|
9421
9796
|
maxTokens,
|
|
9797
|
+
taskKind: "application_auto_fill",
|
|
9422
9798
|
providerOptions
|
|
9423
9799
|
})
|
|
9424
9800
|
);
|
|
@@ -9489,6 +9865,7 @@ async function batchQuestions(unfilledFields, generateObject, providerOptions, m
|
|
|
9489
9865
|
prompt,
|
|
9490
9866
|
schema: QuestionBatchResultSchema,
|
|
9491
9867
|
maxTokens,
|
|
9868
|
+
taskKind: "application_batch",
|
|
9492
9869
|
providerOptions
|
|
9493
9870
|
})
|
|
9494
9871
|
);
|
|
@@ -9540,6 +9917,7 @@ async function classifyReplyIntent(fields, replyText, generateObject, providerOp
|
|
|
9540
9917
|
prompt,
|
|
9541
9918
|
schema: ReplyIntentSchema,
|
|
9542
9919
|
maxTokens,
|
|
9920
|
+
taskKind: "application_classify",
|
|
9543
9921
|
providerOptions
|
|
9544
9922
|
})
|
|
9545
9923
|
);
|
|
@@ -9599,6 +9977,7 @@ async function parseAnswers(fields, replyText, generateObject, providerOptions,
|
|
|
9599
9977
|
prompt,
|
|
9600
9978
|
schema: AnswerParsingResultSchema,
|
|
9601
9979
|
maxTokens,
|
|
9980
|
+
taskKind: "application_parse_answers",
|
|
9602
9981
|
providerOptions
|
|
9603
9982
|
})
|
|
9604
9983
|
);
|
|
@@ -9728,6 +10107,7 @@ async function fillFromLookup(requests, targetFields, availableData, generateObj
|
|
|
9728
10107
|
prompt,
|
|
9729
10108
|
schema: LookupFillResultSchema,
|
|
9730
10109
|
maxTokens,
|
|
10110
|
+
taskKind: "application_lookup",
|
|
9731
10111
|
providerOptions
|
|
9732
10112
|
})
|
|
9733
10113
|
);
|
|
@@ -9810,6 +10190,7 @@ async function generateBatchEmail(batchFields, batchIndex, totalBatches, opts, g
|
|
|
9810
10190
|
() => generateText({
|
|
9811
10191
|
prompt,
|
|
9812
10192
|
maxTokens,
|
|
10193
|
+
taskKind: "application_email",
|
|
9813
10194
|
providerOptions
|
|
9814
10195
|
})
|
|
9815
10196
|
);
|
|
@@ -10332,11 +10713,14 @@ function createApplicationPipeline(config) {
|
|
|
10332
10713
|
}
|
|
10333
10714
|
if (replyPlan.answerQuestion && intent.questionText) {
|
|
10334
10715
|
try {
|
|
10716
|
+
const budget = resolveBudget("application_email", 512);
|
|
10335
10717
|
const { text, usage } = await generateText({
|
|
10336
10718
|
prompt: `The user is filling out an insurance application and asked: "${intent.questionText}"
|
|
10337
10719
|
|
|
10338
10720
|
Provide a brief, helpful explanation (2-3 sentences). End with "Just reply with the answer when you're ready and I'll fill it in."`,
|
|
10339
|
-
maxTokens:
|
|
10721
|
+
maxTokens: budget.maxTokens,
|
|
10722
|
+
taskKind: "application_email",
|
|
10723
|
+
budgetDiagnostics: budget,
|
|
10340
10724
|
providerOptions
|
|
10341
10725
|
});
|
|
10342
10726
|
trackUsage(usage);
|
|
@@ -10461,6 +10845,7 @@ ${emailText}`;
|
|
|
10461
10845
|
if (!state) throw new Error(`Application ${applicationId} not found`);
|
|
10462
10846
|
const filledFields = state.fields.filter((f) => f.value);
|
|
10463
10847
|
const fieldSummary = filledFields.map((f) => `${f.section} > ${f.label}: ${f.value} (source: ${f.source ?? "unknown"})`).join("\n");
|
|
10848
|
+
const budget = resolveBudget("application_email", 4096);
|
|
10464
10849
|
const { text, usage } = await generateText({
|
|
10465
10850
|
prompt: `Format these filled insurance application fields as a clean confirmation summary for the user to review. Group by section, show each field as "Label: Value". End with a note asking them to confirm or request changes.
|
|
10466
10851
|
|
|
@@ -10468,7 +10853,9 @@ Application: ${state.title ?? "Insurance Application"}
|
|
|
10468
10853
|
|
|
10469
10854
|
Fields:
|
|
10470
10855
|
${fieldSummary}`,
|
|
10471
|
-
maxTokens:
|
|
10856
|
+
maxTokens: budget.maxTokens,
|
|
10857
|
+
taskKind: "application_email",
|
|
10858
|
+
budgetDiagnostics: budget,
|
|
10472
10859
|
providerOptions
|
|
10473
10860
|
});
|
|
10474
10861
|
trackUsage(usage);
|
|
@@ -10942,6 +11329,8 @@ ${e.text}`;
|
|
|
10942
11329
|
prompt,
|
|
10943
11330
|
schema: SubAnswerSchema,
|
|
10944
11331
|
maxTokens: budget.maxTokens,
|
|
11332
|
+
taskKind: "query_reason",
|
|
11333
|
+
budgetDiagnostics: budget,
|
|
10945
11334
|
providerOptions
|
|
10946
11335
|
})
|
|
10947
11336
|
);
|
|
@@ -11165,6 +11554,8 @@ async function verify(originalQuestion, subAnswers, allEvidence, config) {
|
|
|
11165
11554
|
prompt,
|
|
11166
11555
|
schema: VerifyResultSchema,
|
|
11167
11556
|
maxTokens: budget.maxTokens,
|
|
11557
|
+
taskKind: "query_verify",
|
|
11558
|
+
budgetDiagnostics: budget,
|
|
11168
11559
|
providerOptions
|
|
11169
11560
|
})
|
|
11170
11561
|
);
|
|
@@ -11307,6 +11698,8 @@ async function interpretAttachments(params) {
|
|
|
11307
11698
|
prompt,
|
|
11308
11699
|
schema: AttachmentInterpretationSchema,
|
|
11309
11700
|
maxTokens: budget.maxTokens,
|
|
11701
|
+
taskKind: "query_attachment",
|
|
11702
|
+
budgetDiagnostics: budget,
|
|
11310
11703
|
providerOptions: buildAttachmentProviderOptions(attachment, providerOptions)
|
|
11311
11704
|
},
|
|
11312
11705
|
{
|
|
@@ -11644,6 +12037,8 @@ function createQueryAgent(config) {
|
|
|
11644
12037
|
prompt,
|
|
11645
12038
|
schema: QueryClassifyResultSchema,
|
|
11646
12039
|
maxTokens: budget.maxTokens,
|
|
12040
|
+
taskKind: "query_classify",
|
|
12041
|
+
budgetDiagnostics: budget,
|
|
11647
12042
|
providerOptions
|
|
11648
12043
|
},
|
|
11649
12044
|
{
|
|
@@ -11695,6 +12090,8 @@ function createQueryAgent(config) {
|
|
|
11695
12090
|
prompt,
|
|
11696
12091
|
schema: QueryResultSchema,
|
|
11697
12092
|
maxTokens: budget.maxTokens,
|
|
12093
|
+
taskKind: "query_respond",
|
|
12094
|
+
budgetDiagnostics: budget,
|
|
11698
12095
|
providerOptions
|
|
11699
12096
|
},
|
|
11700
12097
|
{
|
|
@@ -11790,6 +12187,8 @@ function createPceAgent(config = {}) {
|
|
|
11790
12187
|
prompt: buildPceNormalizePrompt({ requestText: input.requestText, evidenceSources }),
|
|
11791
12188
|
schema: PceNormalizationResultSchema,
|
|
11792
12189
|
maxTokens: budget.maxTokens,
|
|
12190
|
+
taskKind: "pce_impact_analysis",
|
|
12191
|
+
budgetDiagnostics: budget,
|
|
11793
12192
|
providerOptions: config.providerOptions
|
|
11794
12193
|
},
|
|
11795
12194
|
{ fallback, maxRetries: 1, log: config.log }
|
|
@@ -11851,6 +12250,8 @@ function createPceAgent(config = {}) {
|
|
|
11851
12250
|
}),
|
|
11852
12251
|
schema: ReplyAnswersSchema,
|
|
11853
12252
|
maxTokens: budget.maxTokens,
|
|
12253
|
+
taskKind: "pce_reply_parse",
|
|
12254
|
+
budgetDiagnostics: budget,
|
|
11854
12255
|
providerOptions: config.providerOptions
|
|
11855
12256
|
},
|
|
11856
12257
|
{ fallback: { answers }, maxRetries: 1, log: config.log }
|
|
@@ -12732,6 +13133,7 @@ var AGENT_TOOLS = [
|
|
|
12732
13133
|
buildConfirmationSummaryPrompt,
|
|
12733
13134
|
buildConversationMemoryGuidance,
|
|
12734
13135
|
buildCoverageGapPrompt,
|
|
13136
|
+
buildDoclingProviderOptions,
|
|
12735
13137
|
buildFieldExplanationPrompt,
|
|
12736
13138
|
buildFieldExtractionPrompt,
|
|
12737
13139
|
buildFlatPdfMappingPrompt,
|
|
@@ -12773,12 +13175,16 @@ var AGENT_TOOLS = [
|
|
|
12773
13175
|
fillAcroForm,
|
|
12774
13176
|
generateNextMessage,
|
|
12775
13177
|
getAcroFormFields,
|
|
13178
|
+
getDoclingPageRangeText,
|
|
12776
13179
|
getExtractor,
|
|
12777
13180
|
getFileIdentifier,
|
|
12778
13181
|
getPdfPageCount,
|
|
12779
13182
|
getTemplate,
|
|
13183
|
+
isDoclingExtractionInput,
|
|
12780
13184
|
isFileReference,
|
|
12781
13185
|
mergeQuestionAnswers,
|
|
13186
|
+
mergeSourceSpans,
|
|
13187
|
+
normalizeDoclingDocument,
|
|
12782
13188
|
normalizeForMatch,
|
|
12783
13189
|
orderSourceEvidence,
|
|
12784
13190
|
overlayTextOnPdf,
|