@claritylabs/cl-sdk 1.1.0 → 1.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +14 -1
- package/dist/index.d.mts +72 -2
- package/dist/index.d.ts +72 -2
- package/dist/index.js +389 -48
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +384 -48
- package/dist/index.mjs.map +1 -1
- package/package.json +1 -1
package/dist/index.mjs
CHANGED
|
@@ -2471,6 +2471,254 @@ async function overlayTextOnPdf(pdfBytes, overlays) {
|
|
|
2471
2471
|
return await pdfDoc.save();
|
|
2472
2472
|
}
|
|
2473
2473
|
|
|
2474
|
+
// src/extraction/docling.ts
|
|
2475
|
+
/**
 * Type guard for the Docling flavour of extraction input.
 *
 * @param {unknown} input - Value handed to the extractor.
 * @returns {boolean} `true` when `input` is an object whose `kind` is
 *   `"docling_document"` and whose `document` property is a non-null object.
 */
function isDoclingExtractionInput(input) {
  if (!input || typeof input !== "object") return false;
  if (input.kind !== "docling_document") return false;
  const { document } = input;
  return Boolean(document) && typeof document === "object";
}
|
|
2480
|
+
/**
 * Flattens a Docling document into ordered text units, per-page text, a
 * page-labelled full text and one source span per unit.
 *
 * Items are taken in body order when the body yields any refs; otherwise the
 * fallback ordering over the document's item arrays is used.
 *
 * @param {object} document - Docling document object.
 * @param {{ documentId: string, sourceKind?: string }} options - Span
 *   metadata; `sourceKind` defaults to `"policy_pdf"`.
 * @returns {{ pageCount: number, fullText: string, pageTexts: Map<number, string>, units: object[], sourceSpans: object[] }}
 */
function normalizeDoclingDocument(document, options) {
  const itemMap = buildItemMap(document);
  const bodyRefs = getOrderedBodyRefs(document, itemMap);
  const entries = bodyRefs.length > 0
    ? bodyRefs.map((ref) => itemMap.get(ref)).filter((entry) => Boolean(entry))
    : getFallbackOrderedItems(document, itemMap);

  // Keep only items that normalize to non-blank text.
  const units = [];
  for (const { ref, item } of entries) {
    const unit = normalizeItem(ref, item);
    if (unit && unit.text.trim()) units.push(unit);
  }

  const pageCount = inferPageCount(document, units);

  // Units without provenance are attributed to page 1.
  const pageTexts = new Map();
  for (const unit of units) {
    const page = clampPage(unit.pageStart ?? 1, pageCount);
    pageTexts.set(page, appendText(pageTexts.get(page), unit.text));
  }

  const pageBlocks = [];
  for (let pageNumber = 1; pageNumber <= pageCount; pageNumber += 1) {
    const text = pageTexts.get(pageNumber)?.trim();
    if (text) pageBlocks.push(`Page ${pageNumber}\n${text}`);
  }
  const fullText = pageBlocks.join("\n\n");

  const sourceKind = options.sourceKind ?? "policy_pdf";
  const sourceSpans = units.map((unit, index) => {
    const metadata = {
      sourceSystem: "docling",
      sourceUnit: "docling_item",
      doclingRef: unit.ref
    };
    if (unit.label) metadata.doclingLabel = unit.label;

    const span = buildSourceSpan(
      {
        documentId: options.documentId,
        sourceKind,
        text: unit.text,
        pageStart: unit.pageStart,
        pageEnd: unit.pageEnd,
        sectionId: unit.label,
        metadata
      },
      index
    );
    const bbox = unit.bboxes?.length ? unit.bboxes : undefined;
    return { ...span, kind: "plain_text", bbox };
  });

  return { pageCount, fullText, pageTexts, units, sourceSpans };
}
|
|
2530
|
+
/**
 * Collects the text of an inclusive page range from a normalized Docling
 * document, each non-empty page prefixed with a `Page N` heading.
 *
 * @param {{ pageCount: number, pageTexts: Map<number, string> }} normalized
 * @param {number} startPage - First page; clamped into `[1, pageCount]`.
 * @param {number} endPage - Last page; clamped into `[1, pageCount]`.
 * @returns {string} Page blocks joined by blank lines, or `""` when no page
 *   in the range has text.
 */
function getDoclingPageRangeText(normalized, startPage, endPage) {
  const firstPage = clampPage(startPage, normalized.pageCount);
  const lastPage = clampPage(endPage, normalized.pageCount);
  const sections = [];
  let page = firstPage;
  while (page <= lastPage) {
    const text = normalized.pageTexts.get(page)?.trim();
    if (text) sections.push(`Page ${page}\n${text}`);
    page += 1;
  }
  return sections.join("\n\n");
}
|
|
2543
|
+
/**
 * Returns a copy of `existingOptions` extended with the Docling full text and
 * page count. The Docling keys override same-named keys already present.
 *
 * @param {{ fullText: string, pageCount: number }} normalized
 * @param {object} [existingOptions] - Provider options to extend.
 * @returns {object} New options object; the input is not mutated.
 */
function buildDoclingProviderOptions(normalized, existingOptions) {
  const { fullText: doclingText, pageCount: doclingPageCount } = normalized;
  return { ...existingOptions, doclingText, doclingPageCount };
}
|
|
2550
|
+
/**
 * Removes duplicate source spans while preserving first-seen order.
 *
 * Two spans are duplicates when document id, start page, end page, section
 * (or field path) and text hash all match. Missing page/section parts are
 * represented as `"na"`; the text hash is computed from `span.text` only when
 * the span carries no `textHash`.
 *
 * @param {Iterable<object>} spans
 * @returns {object[]} The unique spans (same object references).
 */
function mergeSourceSpans(spans) {
  const seenKeys = new Set();
  const unique = [];
  for (const span of spans) {
    const location = span.location;
    const startPage = span.pageStart ?? location?.startPage ?? location?.page ?? "na";
    const endPage = span.pageEnd ?? location?.endPage ?? span.pageStart ?? "na";
    const section = span.sectionId ?? location?.fieldPath ?? "na";
    const hash = span.textHash ?? sourceSpanTextHash(span.text);
    // Array#join renders a nullish documentId/hash as an empty string.
    const key = [span.documentId, startPage, endPage, section, hash].join(":");
    if (!seenKeys.has(key)) {
      seenKeys.add(key);
      unique.push(span);
    }
  }
  return unique;
}
|
|
2567
|
+
/**
 * Indexes the document's texts, tables, key-value items and pictures by ref.
 *
 * @param {object} document - Docling document; missing collections are
 *   treated as empty. Key-value items are read from `key_value_items` or,
 *   failing that, `keyValueItems`.
 * @returns {Map<string, { ref: string, item: object }>} Map populated by
 *   `addItems` in the order texts, tables, key-value items, pictures.
 */
function buildItemMap(document) {
  const index = new Map();
  const collections = [
    ["#/texts", document.texts],
    ["#/tables", document.tables],
    ["#/key_value_items", document.key_value_items ?? document.keyValueItems],
    ["#/pictures", document.pictures]
  ];
  for (const [baseRef, items] of collections) {
    addItems(index, baseRef, items ?? []);
  }
  return index;
}
|
|
2575
|
+
/**
 * Registers every item of one document collection in `map`, keyed by the
 * item's own ref or, when it has none, by `${baseRef}/${index}`.
 *
 * Entries that are not objects (for example `null` holes in parsed JSON) are
 * skipped instead of crashing the ref lookup; their index is still consumed
 * so the generated refs of the following items stay aligned with the array.
 *
 * @param {Map<string, { ref: string, item: object }>} map - Mutated in place.
 * @param {string} baseRef - Collection pointer such as `"#/texts"`.
 * @param {Array<unknown>} items - Collection contents.
 * @returns {void}
 */
function addItems(map, baseRef, items) {
  items.forEach((item, index) => {
    if (!item || typeof item !== "object") return;
    const ref = getSelfRef(item) ?? `${baseRef}/${index}`;
    map.set(ref, { ref, item });
  });
}
|
|
2581
|
+
/**
 * Produces item entries in plain collection order (texts, then tables, then
 * key-value items) for documents whose body yields no ordered refs.
 *
 * @param {object} document - Docling document.
 * @param {Map<string, { ref: string, item: object }>} itemMap - Index built
 *   from the same document.
 * @returns {Array<{ ref: string, item: object }>} Entries found in `itemMap`;
 *   refs without an entry are dropped.
 */
function getFallbackOrderedItems(document, itemMap) {
  const sources = [
    ["#/texts", document.texts ?? []],
    ["#/tables", document.tables ?? []],
    ["#/key_value_items", document.key_value_items ?? document.keyValueItems ?? []]
  ];
  const entries = [];
  for (const [baseRef, items] of sources) {
    items.forEach((item, index) => {
      const entry = itemMap.get(getSelfRef(item) ?? `${baseRef}/${index}`);
      if (entry) entries.push(entry);
    });
  }
  return entries;
}
|
|
2589
|
+
/**
 * Walks the document body depth-first (pre-order) and returns the refs of all
 * reachable items in reading order, each ref at most once.
 *
 * A child ref resolves to an item (via `itemMap`) or to a group (via
 * `document.groups`); groups contribute no ref of their own, only their
 * children. Every ref - item or group - is expanded at most once, so a
 * reference cycle (a group listing itself, or items pointing back at an
 * ancestor) terminates instead of recursing until the stack overflows. An
 * explicit stack is used so deeply nested documents cannot exhaust the call
 * stack either. For acyclic input the result is the same as a plain recursive
 * pre-order walk.
 *
 * @param {object} document - Docling document with optional `body` and
 *   `groups`.
 * @param {Map<string, { ref: string, item: object }>} itemMap - Item index.
 * @returns {string[]} Item refs in first-visit order.
 */
function getOrderedBodyRefs(document, itemMap) {
  const groupMap = new Map();
  (document.groups ?? []).forEach((group, index) => {
    groupMap.set(getSelfRef(group) ?? `#/groups/${index}`, group);
  });

  const refs = [];
  const visited = new Set();
  const pending = [];

  // Push a node's child refs so that the first child is popped first.
  const enqueueChildren = (node) => {
    const childRefs = [];
    for (const child of node?.children ?? []) {
      if (child == null) continue;
      const ref = getRef(child);
      if (ref) childRefs.push(ref);
    }
    for (let i = childRefs.length - 1; i >= 0; i -= 1) {
      pending.push(childRefs[i]);
    }
  };

  enqueueChildren(document.body);
  while (pending.length > 0) {
    const ref = pending.pop();
    if (visited.has(ref)) continue;
    visited.add(ref);
    const itemEntry = itemMap.get(ref);
    if (itemEntry) {
      refs.push(ref);
      enqueueChildren(itemEntry.item);
    } else {
      // Unknown refs resolve to undefined and simply contribute nothing.
      enqueueChildren(groupMap.get(ref));
    }
  }
  return refs;
}
|
|
2618
|
+
/**
 * Converts one Docling item into a text unit.
 *
 * @param {string} ref - Ref under which the item is indexed.
 * @param {object} item - Docling item; `prov` (if present) supplies page
 *   numbers and bounding boxes.
 * @returns {{ ref: string, label: (string|undefined), text: string, pageStart: (number|undefined), pageEnd: (number|undefined), bboxes: (object[]|undefined) } | undefined}
 *   `undefined` when the item has no non-blank text. `pageStart`/`pageEnd`
 *   are the smallest/largest positive page numbers found in `prov`.
 */
function normalizeItem(ref, item) {
  const text = getItemText(item).trim();
  if (!text) return undefined;

  const provenance = item.prov ?? [];
  const pages = provenance
    .map((prov) => getPageNumber(prov))
    .filter((page) => typeof page === "number" && page > 0);

  let pageStart;
  let pageEnd;
  for (const page of pages) {
    if (pageStart === undefined || page < pageStart) pageStart = page;
    if (pageEnd === undefined || page > pageEnd) pageEnd = page;
  }

  const boxes = provenance
    .map((prov) => toSourceSpanBBox(prov))
    .filter((bbox) => Boolean(bbox));

  const label = typeof item.label === "string" ? item.label : undefined;
  const bboxes = boxes.length ? boxes : undefined;
  return { ref, label, text, pageStart, pageEnd, bboxes };
}
|
|
2634
|
+
/**
 * Picks the text of a Docling item: `text` if it is a non-blank string, else
 * `orig` if it is a non-blank string, else the Markdown rendering of the
 * item's table `data`.
 *
 * @param {object} item
 * @returns {string} The chosen text (untrimmed for `text`/`orig`), or `""`.
 */
function getItemText(item) {
  for (const candidate of [item.text, item.orig]) {
    if (typeof candidate === "string" && candidate.trim()) return candidate;
  }
  return tableToMarkdown(item.data) || "";
}
|
|
2641
|
+
/**
 * Renders Docling table data as text.
 *
 * Cells are read from `data.table_cells` (or `data.tableCells`), placed into
 * a dense grid by their row/column offsets and emitted as a Markdown table
 * whose first row is treated as the header. A single-row table is emitted as
 * its non-empty cells joined by `" | "`.
 *
 * Offsets come from untrusted JSON: a missing offset counts as 0, and
 * negative or fractional offsets are normalized to a non-negative integer
 * (truncate, then floor at 0) so they can never index outside the grid, which
 * previously threw a TypeError. The grid size is tracked with a running
 * maximum rather than `Math.max(...array)`, which is subject to the engine's
 * argument-count limit on very large tables.
 *
 * @param {unknown} data - The item's `data` payload.
 * @returns {string | undefined} Rendered table, or `undefined` when there is
 *   no cell array or no cell carries text.
 */
function tableToMarkdown(data) {
  const record = asRecord(data);
  const rawCells = [record?.table_cells, record?.tableCells].find((candidate) => Array.isArray(candidate));
  if (!rawCells) return undefined;

  const toIndex = (candidates) => Math.max(0, Math.trunc(firstNumber2(candidates) ?? 0));

  const cells = [];
  let maxRow = 0;
  let maxCol = 0;
  for (const raw of rawCells) {
    const cell = asRecord(raw);
    if (!cell) continue;
    const text = firstString([cell.text, cell.orig, cell.content]);
    if (!text) continue;
    const row = toIndex([cell.start_row_offset, cell.row_header, cell.row, cell.rowIndex]);
    const col = toIndex([cell.start_col_offset, cell.col, cell.colIndex]);
    if (row > maxRow) maxRow = row;
    if (col > maxCol) maxCol = col;
    cells.push({ row, col, text });
  }
  if (cells.length === 0) return undefined;

  const rows = Array.from({ length: maxRow + 1 }, () => Array.from({ length: maxCol + 1 }, () => ""));
  for (const cell of cells) {
    // Later cells overwrite earlier ones that land on the same position.
    rows[cell.row][cell.col] = cell.text;
  }

  if (rows.length === 1) return rows[0].filter(Boolean).join(" | ");

  const header = rows[0];
  const separator = header.map(() => "---");
  return [header, separator, ...rows.slice(1)]
    .map((row) => `| ${row.map((value) => value.trim()).join(" | ")} |`)
    .join("\n");
}
|
|
2662
|
+
/**
 * Determines how many pages the document has (never less than 1).
 *
 * - `document.pages` is an array: its length.
 * - `document.pages` is an object: the largest finite numeric key, or the
 *   number of keys when no key is a positive number.
 * - otherwise: the largest `pageStart`/`pageEnd` among `units`.
 *
 * @param {object} document - Docling document.
 * @param {Array<{ pageStart?: number, pageEnd?: number }>} units
 * @returns {number}
 */
function inferPageCount(document, units) {
  const { pages } = document;
  if (Array.isArray(pages)) return Math.max(1, pages.length);

  if (pages && typeof pages === "object") {
    const keys = Object.keys(pages);
    let numericMax = 0;
    for (const key of keys) {
      const value = Number(key);
      if (Number.isFinite(value) && value > numericMax) numericMax = value;
    }
    return Math.max(1, numericMax || keys.length);
  }

  let highest = 1;
  for (const unit of units) {
    highest = Math.max(highest, unit.pageStart ?? 0, unit.pageEnd ?? 0);
  }
  return highest;
}
|
|
2672
|
+
/**
 * Reads a node's own ref from `self_ref`, falling back to `selfRef`.
 *
 * Null-safe: a `null`/`undefined` node (for example a hole in a parsed JSON
 * array) yields `undefined` instead of throwing, so callers can fall back to
 * an index-based ref.
 *
 * @param {object | null | undefined} value
 * @returns {string | undefined}
 */
function getSelfRef(value) {
  return value?.self_ref ?? value?.selfRef;
}
|
|
2675
|
+
/**
 * Resolves a child reference to its ref string.
 *
 * Accepts either a bare string or an object carrying `$ref` (preferred) or
 * `ref`. Null-safe: a `null`/`undefined` child yields `undefined` instead of
 * throwing.
 *
 * @param {string | object | null | undefined} value
 * @returns {string | undefined}
 */
function getRef(value) {
  if (typeof value === "string") return value;
  return value?.$ref ?? value?.ref;
}
|
|
2679
|
+
/**
 * Reads the page number of a provenance entry from `page_no`, `pageNo` or
 * `page` (first one that is not nullish).
 *
 * Null-safe: a `null`/`undefined` entry yields `undefined` instead of
 * throwing. The value is returned as found; callers validate its type.
 *
 * @param {object | null | undefined} prov
 * @returns {unknown}
 */
function getPageNumber(prov) {
  return prov?.page_no ?? prov?.pageNo ?? prov?.page;
}
|
|
2682
|
+
/**
 * Converts a provenance entry into a `{ page, x, y, width, height }` box.
 *
 * `x`/`y` are read from `x|l|left` and `y|t|top`. Explicit `width`/`height`
 * are used as given; otherwise the extent is derived from the opposite edge
 * (`r|right`, `b|bottom`) as an absolute distance. The absolute value matters
 * for boxes expressed with a bottom-left origin, where the top coordinate is
 * larger than the bottom one and a plain `bottom - top` would yield a
 * negative height.
 *
 * NOTE(review): `y` is passed through unchanged, so for bottom-left-origin
 * boxes it is still measured from the bottom of the page - converting it
 * would need the page height, which is not available here. Confirm what
 * downstream consumers expect.
 *
 * @param {object | null | undefined} prov - Provenance entry with `bbox`.
 * @returns {{ page: number, x: number, y: number, width: number, height: number } | undefined}
 *   `undefined` when the entry is not an object, has no page or bbox, or the
 *   box is incomplete.
 */
function toSourceSpanBBox(prov) {
  const entry = asRecord(prov);
  if (!entry) return undefined;

  const page = getPageNumber(entry);
  const bbox = asRecord(entry.bbox);
  if (!page || !bbox) return undefined;

  const x = firstNumber2([bbox.x, bbox.l, bbox.left]);
  const y = firstNumber2([bbox.y, bbox.t, bbox.top]);
  if (x == null || y == null) return undefined;

  // Distance between an origin edge and the opposite edge, if the latter exists.
  const extent = (oppositeEdge, originEdge) => (oppositeEdge != null ? Math.abs(oppositeEdge - originEdge) : undefined);

  const width = firstNumber2([bbox.width]) ?? extent(firstNumber2([bbox.r, bbox.right]), x);
  const height = firstNumber2([bbox.height]) ?? extent(firstNumber2([bbox.b, bbox.bottom]), y);
  if (width == null || height == null) return undefined;

  return { page, x, y, width, height };
}
|
|
2698
|
+
/**
 * Restricts a page number to the range `[1, pageCount]`.
 *
 * @param {number} page
 * @param {number} pageCount
 * @returns {number} `page` capped at `pageCount` and raised to at least 1.
 */
function clampPage(page, pageCount) {
  const capped = Math.min(pageCount, page);
  return Math.max(1, capped);
}
|
|
2701
|
+
/**
 * Appends `next` to `existing`, separated by a blank line.
 *
 * @param {string | undefined} existing - Text accumulated so far.
 * @param {string} next - Text to add.
 * @returns {string} `next` alone when `existing` is empty or missing.
 */
function appendText(existing, next) {
  if (!existing) return next;
  return `${existing}\n\n${next}`;
}
|
|
2706
|
+
/**
 * Narrows a value to a plain (non-array) object.
 *
 * @param {unknown} value
 * @returns {object | undefined} `value` itself when it is a non-null,
 *   non-array object; otherwise `undefined`.
 */
function asRecord(value) {
  if (value && typeof value === "object" && !Array.isArray(value)) {
    return value;
  }
  return undefined;
}
|
|
2709
|
+
/**
 * Returns the first candidate that is a non-blank string, trimmed.
 *
 * @param {Iterable<unknown>} values - Candidates in priority order.
 * @returns {string} The trimmed string, or `""` when none qualifies.
 */
function firstString(values) {
  for (const candidate of values) {
    if (typeof candidate !== "string") continue;
    const trimmed = candidate.trim();
    if (trimmed) return trimmed;
  }
  return "";
}
|
|
2715
|
+
/**
 * Returns the first candidate that is a finite number.
 *
 * @param {Iterable<unknown>} values - Candidates in priority order.
 * @returns {number | undefined} `undefined` when none qualifies; numeric
 *   strings, booleans, `NaN` and infinities are not accepted.
 */
function firstNumber2(values) {
  for (const candidate of values) {
    // Number.isFinite does not coerce, so non-numbers are rejected as well.
    if (Number.isFinite(candidate)) return candidate;
  }
  return undefined;
}
|
|
2721
|
+
|
|
2474
2722
|
// src/extraction/extractor.ts
|
|
2475
2723
|
function sourceSpansForPageRange(providerOptions, startPage, endPage) {
|
|
2476
2724
|
const sourceSpans = providerOptions?.sourceSpans;
|
|
@@ -2519,15 +2767,31 @@ async function runExtractor(params) {
|
|
|
2519
2767
|
} = params;
|
|
2520
2768
|
const extractorProviderOptions = { ...providerOptions };
|
|
2521
2769
|
let fullPrompt;
|
|
2522
|
-
|
|
2523
|
-
|
|
2524
|
-
|
|
2770
|
+
if (params.getPageRangeText) {
|
|
2771
|
+
const pageText = await params.getPageRangeText(startPage, endPage);
|
|
2772
|
+
extractorProviderOptions.doclingText = pageText;
|
|
2773
|
+
extractorProviderOptions.doclingPageRange = { startPage, endPage };
|
|
2774
|
+
fullPrompt = `${prompt}
|
|
2775
|
+
|
|
2776
|
+
[Document pages ${startPage}-${endPage} are provided below as Docling-extracted text.]
|
|
2777
|
+
|
|
2778
|
+
${pageText || "(No Docling text was available for this page range.)"}`;
|
|
2779
|
+
} else if (convertPdfToImages) {
|
|
2780
|
+
if (!pdfInput) {
|
|
2781
|
+
throw new Error("pdfInput is required when extracting page images.");
|
|
2782
|
+
}
|
|
2783
|
+
const needsPdfBase64 = !params.getPageImages;
|
|
2784
|
+
const pdfBase64 = needsPdfBase64 ? await pdfInputToBase64(pdfInput) : void 0;
|
|
2525
2785
|
const images = params.getPageImages ? await params.getPageImages(startPage, endPage) : await convertPdfToImages(pdfBase64, startPage, endPage);
|
|
2526
2786
|
extractorProviderOptions.images = images;
|
|
2527
2787
|
fullPrompt = `${prompt}
|
|
2528
2788
|
|
|
2529
2789
|
[Document pages ${startPage}-${endPage} are provided as images.]`;
|
|
2530
2790
|
} else {
|
|
2791
|
+
if (!pdfInput) {
|
|
2792
|
+
throw new Error("pdfInput is required when extracting page PDFs.");
|
|
2793
|
+
}
|
|
2794
|
+
const pdfBase64 = params.getPageRangePdf ? void 0 : await pdfInputToBase64(pdfInput);
|
|
2531
2795
|
const cacheKey = `${startPage}-${endPage}`;
|
|
2532
2796
|
const cachedPagesPdf = pageRangeCache?.get(cacheKey);
|
|
2533
2797
|
const pagesPdf = cachedPagesPdf ?? (params.getPageRangePdf ? await params.getPageRangePdf(startPage, endPage) : await extractPageRange(pdfBase64, startPage, endPage));
|
|
@@ -3567,7 +3831,7 @@ function formatAddress(addr) {
|
|
|
3567
3831
|
/**
 * Narrows a value to an array of plain (non-array) objects.
 *
 * @param {unknown} value
 * @returns {object[]} The object elements of `value`, or `[]` when `value`
 *   is not an array.
 */
function asRecordArray(value) {
  if (!Array.isArray(value)) return [];
  return value.filter((item) => item !== null && typeof item === "object" && !Array.isArray(item));
}
|
|
3570
|
-
function
|
|
3834
|
+
function firstString2(item, keys) {
|
|
3571
3835
|
for (const key of keys) {
|
|
3572
3836
|
const value = item[key];
|
|
3573
3837
|
if (typeof value === "string" && value.trim()) return value;
|
|
@@ -3924,32 +4188,32 @@ ${exc.content}`.trim(), {
|
|
|
3924
4188
|
);
|
|
3925
4189
|
});
|
|
3926
4190
|
asRecordArray(extendedDoc.definitions).forEach((definition, i) => {
|
|
3927
|
-
const term =
|
|
3928
|
-
const body =
|
|
4191
|
+
const term = firstString2(definition, ["term", "name", "title"]) ?? `Definition ${i + 1}`;
|
|
4192
|
+
const body = firstString2(definition, ["definition", "content", "text", "meaning"]);
|
|
3929
4193
|
pushChunk(
|
|
3930
4194
|
`definition:${i}`,
|
|
3931
4195
|
"definition",
|
|
3932
4196
|
lines([
|
|
3933
4197
|
`Definition: ${term}`,
|
|
3934
4198
|
body,
|
|
3935
|
-
|
|
4199
|
+
firstString2(definition, ["originalContent", "source"]) ? `Source: ${firstString2(definition, ["originalContent", "source"])}` : null
|
|
3936
4200
|
]),
|
|
3937
4201
|
{
|
|
3938
4202
|
term,
|
|
3939
|
-
formNumber:
|
|
3940
|
-
formTitle:
|
|
4203
|
+
formNumber: firstString2(definition, ["formNumber"]),
|
|
4204
|
+
formTitle: firstString2(definition, ["formTitle"]),
|
|
3941
4205
|
pageNumber: typeof definition.pageNumber === "number" ? definition.pageNumber : void 0,
|
|
3942
|
-
sectionRef:
|
|
4206
|
+
sectionRef: firstString2(definition, ["sectionRef", "sectionTitle"]),
|
|
3943
4207
|
documentType: doc.type
|
|
3944
4208
|
}
|
|
3945
4209
|
);
|
|
3946
4210
|
});
|
|
3947
4211
|
const coveredReasons = asRecordArray(extendedDoc.coveredReasons ?? extendedDoc.covered_reasons);
|
|
3948
4212
|
coveredReasons.forEach((coveredReason, i) => {
|
|
3949
|
-
const title =
|
|
3950
|
-
const coverageName =
|
|
3951
|
-
const reasonNumber =
|
|
3952
|
-
const body =
|
|
4213
|
+
const title = firstString2(coveredReason, ["title", "name", "reason", "peril", "cause"]) ?? `Covered Reason ${i + 1}`;
|
|
4214
|
+
const coverageName = firstString2(coveredReason, ["coverageName", "coverage", "coveragePart"]);
|
|
4215
|
+
const reasonNumber = firstString2(coveredReason, ["reasonNumber", "number"]);
|
|
4216
|
+
const body = firstString2(coveredReason, ["content", "description", "text", "coverageGrant"]);
|
|
3953
4217
|
pushChunk(
|
|
3954
4218
|
`covered_reason:${i}`,
|
|
3955
4219
|
"covered_reason",
|
|
@@ -3958,16 +4222,16 @@ ${exc.content}`.trim(), {
|
|
|
3958
4222
|
reasonNumber ? `Reason Number: ${reasonNumber}` : null,
|
|
3959
4223
|
`Covered Reason: ${title}`,
|
|
3960
4224
|
body,
|
|
3961
|
-
|
|
4225
|
+
firstString2(coveredReason, ["originalContent", "source"]) ? `Source: ${firstString2(coveredReason, ["originalContent", "source"])}` : null
|
|
3962
4226
|
]),
|
|
3963
4227
|
{
|
|
3964
4228
|
coverageName,
|
|
3965
4229
|
reasonNumber,
|
|
3966
4230
|
title,
|
|
3967
|
-
formNumber:
|
|
3968
|
-
formTitle:
|
|
4231
|
+
formNumber: firstString2(coveredReason, ["formNumber"]),
|
|
4232
|
+
formTitle: firstString2(coveredReason, ["formTitle"]),
|
|
3969
4233
|
pageNumber: typeof coveredReason.pageNumber === "number" ? coveredReason.pageNumber : void 0,
|
|
3970
|
-
sectionRef:
|
|
4234
|
+
sectionRef: firstString2(coveredReason, ["sectionRef", "sectionTitle"]),
|
|
3971
4235
|
documentType: doc.type
|
|
3972
4236
|
}
|
|
3973
4237
|
);
|
|
@@ -3987,10 +4251,10 @@ ${exc.content}`.trim(), {
|
|
|
3987
4251
|
reasonNumber,
|
|
3988
4252
|
title,
|
|
3989
4253
|
conditionIndex,
|
|
3990
|
-
formNumber:
|
|
3991
|
-
formTitle:
|
|
4254
|
+
formNumber: firstString2(coveredReason, ["formNumber"]),
|
|
4255
|
+
formTitle: firstString2(coveredReason, ["formTitle"]),
|
|
3992
4256
|
pageNumber: typeof coveredReason.pageNumber === "number" ? coveredReason.pageNumber : void 0,
|
|
3993
|
-
sectionRef:
|
|
4257
|
+
sectionRef: firstString2(coveredReason, ["sectionRef", "sectionTitle"]),
|
|
3994
4258
|
documentType: doc.type
|
|
3995
4259
|
}
|
|
3996
4260
|
);
|
|
@@ -6461,21 +6725,21 @@ Return JSON only.`;
|
|
|
6461
6725
|
}
|
|
6462
6726
|
|
|
6463
6727
|
// src/prompts/extractors/index.ts
|
|
6464
|
-
function
|
|
6728
|
+
/**
 * Narrows a value to a non-null object. Arrays are objects and pass through
 * unchanged.
 *
 * @param {unknown} data
 * @returns {object | undefined} `data` itself when it is a non-null object;
 *   otherwise `undefined`.
 */
function asRecord2(data) {
  if (!data || typeof data !== "object") return undefined;
  return data;
}
|
|
6467
6731
|
function getSections2(data) {
|
|
6468
|
-
const sections =
|
|
6732
|
+
const sections = asRecord2(data)?.sections;
|
|
6469
6733
|
return Array.isArray(sections) ? sections : [];
|
|
6470
6734
|
}
|
|
6471
6735
|
function isCoveredReasonsEmpty(data) {
|
|
6472
|
-
const record =
|
|
6736
|
+
const record = asRecord2(data);
|
|
6473
6737
|
if (!record) return true;
|
|
6474
6738
|
const coveredReasons = Array.isArray(record.coveredReasons) ? record.coveredReasons : Array.isArray(record.covered_reasons) ? record.covered_reasons : [];
|
|
6475
6739
|
return coveredReasons.length === 0;
|
|
6476
6740
|
}
|
|
6477
6741
|
function isDefinitionsEmpty(data) {
|
|
6478
|
-
const definitions =
|
|
6742
|
+
const definitions = asRecord2(data)?.definitions;
|
|
6479
6743
|
return !Array.isArray(definitions) || definitions.length === 0;
|
|
6480
6744
|
}
|
|
6481
6745
|
function sectionLooksLikeCoveredReason(section) {
|
|
@@ -6709,6 +6973,14 @@ function decideReferentialResolutionAction(params) {
|
|
|
6709
6973
|
}
|
|
6710
6974
|
|
|
6711
6975
|
// src/extraction/resolve-referential.ts
|
|
6976
|
+
/**
 * Builds the prompt suffix that carries the Docling text of a document.
 *
 * @param {{ doclingText?: unknown } | null | undefined} providerOptions
 * @returns {string} A blank line, the `DOCLING DOCUMENT TEXT:` heading and
 *   the text; `""` when `doclingText` is not a non-blank string.
 */
function formatDoclingTextContext(providerOptions) {
  const text = providerOptions?.doclingText;
  const hasText = typeof text === "string" && text.trim() !== "";
  return hasText ? `\n\nDOCLING DOCUMENT TEXT:\n${text}` : "";
}
|
|
6712
6984
|
function parseReferenceTarget(text) {
|
|
6713
6985
|
if (typeof text !== "string") return void 0;
|
|
6714
6986
|
const normalized = text.trim();
|
|
@@ -6790,12 +7062,12 @@ Return the page range (1-indexed) where this section is located. If the section
|
|
|
6790
7062
|
|
|
6791
7063
|
If you cannot find the section, return startPage: 0 and endPage: 0.
|
|
6792
7064
|
|
|
6793
|
-
Return JSON only
|
|
7065
|
+
Return JSON only.${formatDoclingTextContext(providerOptions)}`,
|
|
6794
7066
|
schema: PageLocationSchema,
|
|
6795
7067
|
maxTokens: budget.maxTokens,
|
|
6796
7068
|
taskKind: "extraction_referential_lookup",
|
|
6797
7069
|
budgetDiagnostics: budget,
|
|
6798
|
-
providerOptions: await buildPdfProviderOptions(pdfInput, providerOptions)
|
|
7070
|
+
providerOptions: pdfInput ? await buildPdfProviderOptions(pdfInput, providerOptions) : providerOptions
|
|
6799
7071
|
},
|
|
6800
7072
|
{
|
|
6801
7073
|
fallback: { startPage: 0, endPage: 0 },
|
|
@@ -6829,6 +7101,7 @@ async function resolveReferentialCoverages(params) {
|
|
|
6829
7101
|
convertPdfToImages,
|
|
6830
7102
|
getPageRangePdf,
|
|
6831
7103
|
getPageImages,
|
|
7104
|
+
getPageRangeText,
|
|
6832
7105
|
concurrency = 2,
|
|
6833
7106
|
providerOptions,
|
|
6834
7107
|
modelCapabilities,
|
|
@@ -6940,6 +7213,7 @@ async function resolveReferentialCoverages(params) {
|
|
|
6940
7213
|
convertPdfToImages,
|
|
6941
7214
|
getPageRangePdf,
|
|
6942
7215
|
getPageImages,
|
|
7216
|
+
getPageRangeText,
|
|
6943
7217
|
maxTokens: budget.maxTokens,
|
|
6944
7218
|
taskKind: "extraction_referential_lookup",
|
|
6945
7219
|
budgetDiagnostics: budget,
|
|
@@ -7035,6 +7309,7 @@ async function runFocusedExtractorWithFallback(params) {
|
|
|
7035
7309
|
pageRangeCache,
|
|
7036
7310
|
getPageRangePdf,
|
|
7037
7311
|
getPageImages,
|
|
7312
|
+
getPageRangeText,
|
|
7038
7313
|
trackUsage,
|
|
7039
7314
|
resolveBudget,
|
|
7040
7315
|
log
|
|
@@ -7064,7 +7339,8 @@ async function runFocusedExtractorWithFallback(params) {
|
|
|
7064
7339
|
providerOptions,
|
|
7065
7340
|
pageRangeCache,
|
|
7066
7341
|
getPageRangePdf,
|
|
7067
|
-
getPageImages
|
|
7342
|
+
getPageImages,
|
|
7343
|
+
getPageRangeText
|
|
7068
7344
|
});
|
|
7069
7345
|
trackUsage(result.usage, {
|
|
7070
7346
|
taskKind,
|
|
@@ -7109,7 +7385,8 @@ async function runFocusedExtractorWithFallback(params) {
|
|
|
7109
7385
|
providerOptions,
|
|
7110
7386
|
pageRangeCache,
|
|
7111
7387
|
getPageRangePdf,
|
|
7112
|
-
getPageImages
|
|
7388
|
+
getPageImages,
|
|
7389
|
+
getPageRangeText
|
|
7113
7390
|
});
|
|
7114
7391
|
trackUsage(fallbackResult.usage, {
|
|
7115
7392
|
taskKind,
|
|
@@ -7953,7 +8230,7 @@ function createExtractor(config) {
|
|
|
7953
8230
|
}
|
|
7954
8231
|
return lines.length > 0 ? lines.join("\n") : "";
|
|
7955
8232
|
}
|
|
7956
|
-
async function runFocusedExtractorTask(task, pdfInput, memory, pageRangeCache, getPageRangePdf, getPageImages) {
|
|
8233
|
+
async function runFocusedExtractorTask(task, pdfInput, memory, pageRangeCache, getPageRangePdf, getPageImages, getPageRangeText) {
|
|
7957
8234
|
if (task.extractorName === "supplementary") {
|
|
7958
8235
|
const alreadyExtractedSummary = buildAlreadyExtractedSummary(memory);
|
|
7959
8236
|
const budget = resolveBudget("extraction_focused", 4096);
|
|
@@ -7973,7 +8250,8 @@ function createExtractor(config) {
|
|
|
7973
8250
|
providerOptions: activeProviderOptions,
|
|
7974
8251
|
pageRangeCache,
|
|
7975
8252
|
getPageRangePdf,
|
|
7976
|
-
getPageImages
|
|
8253
|
+
getPageImages,
|
|
8254
|
+
getPageRangeText
|
|
7977
8255
|
});
|
|
7978
8256
|
trackUsage(result.usage, {
|
|
7979
8257
|
taskKind: "extraction_focused",
|
|
@@ -7992,6 +8270,7 @@ function createExtractor(config) {
|
|
|
7992
8270
|
pageRangeCache,
|
|
7993
8271
|
getPageRangePdf,
|
|
7994
8272
|
getPageImages,
|
|
8273
|
+
getPageRangeText,
|
|
7995
8274
|
trackUsage,
|
|
7996
8275
|
resolveBudget,
|
|
7997
8276
|
log
|
|
@@ -8007,8 +8286,14 @@ function createExtractor(config) {
|
|
|
8007
8286
|
if (extractorPages.size === 0) return "No page assignments available.";
|
|
8008
8287
|
return [...extractorPages.entries()].map(([extractorName, pages]) => `${extractorName}: ${pages.length} page(s), pages ${pages.join(", ")}`).join("\n");
|
|
8009
8288
|
}
|
|
8010
|
-
async function extract(
|
|
8289
|
+
async function extract(input, documentId, options) {
|
|
8011
8290
|
const id = documentId ?? `doc-${Date.now()}`;
|
|
8291
|
+
const isDoclingInput = isDoclingExtractionInput(input);
|
|
8292
|
+
const pdfInput = isDoclingInput ? void 0 : input;
|
|
8293
|
+
const doclingDocument = isDoclingInput ? normalizeDoclingDocument(input.document, {
|
|
8294
|
+
documentId: id,
|
|
8295
|
+
sourceKind: input.sourceKind
|
|
8296
|
+
}) : void 0;
|
|
8012
8297
|
const memory = /* @__PURE__ */ new Map();
|
|
8013
8298
|
totalUsage = { inputTokens: 0, outputTokens: 0 };
|
|
8014
8299
|
modelCalls = 0;
|
|
@@ -8018,7 +8303,10 @@ function createExtractor(config) {
|
|
|
8018
8303
|
modelCalls: [],
|
|
8019
8304
|
totalModelCallDurationMs: 0
|
|
8020
8305
|
};
|
|
8021
|
-
const sourceSpans =
|
|
8306
|
+
const sourceSpans = mergeSourceSpans([
|
|
8307
|
+
...doclingDocument?.sourceSpans ?? [],
|
|
8308
|
+
...options?.sourceSpans ?? []
|
|
8309
|
+
]);
|
|
8022
8310
|
const sourceChunks = sourceSpans.length ? chunkSourceSpans(sourceSpans) : [];
|
|
8023
8311
|
activeProviderOptions = sourceSpans.length ? { ...providerOptions, sourceSpans, sourceChunks } : providerOptions;
|
|
8024
8312
|
if (sourceStore && sourceSpans.length > 0) {
|
|
@@ -8047,24 +8335,40 @@ function createExtractor(config) {
|
|
|
8047
8335
|
let fullPdfProviderOptionsPromise;
|
|
8048
8336
|
let pageCountPromise;
|
|
8049
8337
|
/**
 * Returns the base64 form of the PDF input, converting it on first use and
 * reusing the cached value afterwards.
 *
 * @returns {Promise<string>} Result of `pdfInputToBase64(pdfInput)`.
 * @throws {Error} When there is no PDF input (Docling extraction).
 */
async function getPdfBase64ForExtraction() {
  if (!pdfInput) throw new Error("PDF input is not available for Docling extraction.");
  if (pdfBase64Cache !== undefined) return pdfBase64Cache;
  pdfBase64Cache = await pdfInputToBase64(pdfInput);
  return pdfBase64Cache;
}
|
|
8055
8346
|
/**
 * Resolves the page count of the document being extracted.
 *
 * Docling input answers from the normalized document. PDF input asks the PDF
 * slicer once and memoizes the promise; if the slicer cannot be created or
 * fails to report a count, `getPdfPageCount(pdfInput)` is used instead.
 *
 * @returns {Promise<number>}
 * @throws {Error} When neither a Docling document nor a PDF input exists.
 */
async function getCachedPageCount() {
  if (doclingDocument) return doclingDocument.pageCount;
  if (!pdfInput) throw new Error("PDF input is required to read page count.");
  if (!pageCountPromise) {
    pageCountPromise = (async () => {
      try {
        const slicer = await getPdfSlicer();
        return await slicer.getPageCount();
      } catch {
        // Deliberate best-effort: any slicer failure falls back to a direct count.
        return getPdfPageCount(pdfInput);
      }
    })();
  }
  return pageCountPromise;
}
|
|
8061
|
-
async function
|
|
8356
|
+
/**
 * Builds the provider options for calls that see the whole document.
 *
 * - Docling input: the active options extended with the Docling text and
 *   page count (rebuilt on every call).
 * - No PDF input: the active options as they are, or `{}` when unset.
 * - PDF input: the options from `buildPdfProviderOptions`, built once and
 *   memoized as a promise.
 *
 * @returns {Promise<object>}
 */
async function getFullDocumentProviderOptions() {
  if (doclingDocument) return buildDoclingProviderOptions(doclingDocument, activeProviderOptions);
  if (!pdfInput) return activeProviderOptions ?? {};
  if (!fullPdfProviderOptionsPromise) {
    fullPdfProviderOptionsPromise = buildPdfProviderOptions(pdfInput, activeProviderOptions);
  }
  return fullPdfProviderOptionsPromise;
}
|
|
8067
8368
|
async function getPdfSlicer() {
|
|
8369
|
+
if (!pdfInput) {
|
|
8370
|
+
throw new Error("PDF input is not available for Docling extraction.");
|
|
8371
|
+
}
|
|
8068
8372
|
if (!pdfSlicerPromise) {
|
|
8069
8373
|
pdfSlicerPromise = createPdfPageSlicer(pdfInput);
|
|
8070
8374
|
}
|
|
@@ -8103,6 +8407,23 @@ function createExtractor(config) {
|
|
|
8103
8407
|
pageRangeImageCache.set(cacheKey, promise);
|
|
8104
8408
|
return promise;
|
|
8105
8409
|
}
|
|
8410
|
+
/**
 * Returns the Docling text for an inclusive page range.
 *
 * @param {number} startPage
 * @param {number} endPage
 * @returns {Promise<string>} `""` when the input is not a Docling document.
 */
async function getPageRangeText(startPage, endPage) {
  if (!doclingDocument) return "";
  return getDoclingPageRangeText(doclingDocument, startPage, endPage);
}
|
|
8413
|
+
/**
 * Appends the full Docling text to a prompt.
 *
 * @param {string} prompt
 * @returns {string} The prompt followed by a `DOCLING DOCUMENT TEXT:` section,
 *   or the prompt unchanged when the input is not a Docling document.
 */
function withFullDocumentTextContext(prompt) {
  return doclingDocument
    ? `${prompt}\n\nDOCLING DOCUMENT TEXT:\n${doclingDocument.fullText}`
    : prompt;
}
|
|
8420
|
+
/**
 * Appends the Docling text of a page range to a prompt.
 *
 * @param {string} prompt
 * @param {number} startPage - Used in the section heading.
 * @param {number} endPage - Used in the section heading.
 * @param {string} pageText - Text of the range; a placeholder sentence is
 *   used when it is empty.
 * @returns {string} The prompt unchanged when the input is not a Docling
 *   document.
 */
function withPageRangeTextContext(prompt, startPage, endPage, pageText) {
  if (!doclingDocument) return prompt;
  const heading = `DOCLING DOCUMENT PAGES ${startPage}-${endPage}:`;
  const body = pageText || "(No Docling text was available for this page range.)";
  return `${prompt}\n\n${heading}\n${body}`;
}
|
|
8106
8427
|
let classifyResult;
|
|
8107
8428
|
if (resumed?.classifyResult && pipelineCtx.isPhaseComplete("classify")) {
|
|
8108
8429
|
classifyResult = resumed.classifyResult;
|
|
@@ -8115,12 +8436,12 @@ function createExtractor(config) {
|
|
|
8115
8436
|
const classifyResponse = await safeGenerateObject(
|
|
8116
8437
|
generateObject,
|
|
8117
8438
|
{
|
|
8118
|
-
prompt: buildClassifyPrompt(),
|
|
8439
|
+
prompt: withFullDocumentTextContext(buildClassifyPrompt()),
|
|
8119
8440
|
schema: ClassifyResultSchema,
|
|
8120
8441
|
maxTokens: budget.maxTokens,
|
|
8121
8442
|
taskKind: "extraction_classify",
|
|
8122
8443
|
budgetDiagnostics: budget,
|
|
8123
|
-
providerOptions: await
|
|
8444
|
+
providerOptions: await getFullDocumentProviderOptions()
|
|
8124
8445
|
},
|
|
8125
8446
|
{
|
|
8126
8447
|
fallback: { documentType: "policy", policyTypes: ["other"], confidence: 0 },
|
|
@@ -8165,12 +8486,12 @@ function createExtractor(config) {
|
|
|
8165
8486
|
const formInventoryResponse = await safeGenerateObject(
|
|
8166
8487
|
generateObject,
|
|
8167
8488
|
{
|
|
8168
|
-
prompt: buildFormInventoryPrompt(templateHints),
|
|
8489
|
+
prompt: withFullDocumentTextContext(buildFormInventoryPrompt(templateHints)),
|
|
8169
8490
|
schema: FormInventorySchema,
|
|
8170
8491
|
maxTokens: budget.maxTokens,
|
|
8171
8492
|
taskKind: "extraction_form_inventory",
|
|
8172
8493
|
budgetDiagnostics: budget,
|
|
8173
|
-
providerOptions: await
|
|
8494
|
+
providerOptions: await getFullDocumentProviderOptions()
|
|
8174
8495
|
},
|
|
8175
8496
|
{
|
|
8176
8497
|
fallback: { forms: [] },
|
|
@@ -8213,18 +8534,24 @@ function createExtractor(config) {
|
|
|
8213
8534
|
const pageMapResults = await Promise.all(
|
|
8214
8535
|
pageMapChunks.map(
|
|
8215
8536
|
({ startPage, endPage }) => pageMapLimit(async () => {
|
|
8216
|
-
const pagesPdf = await getPageRangePdf(startPage, endPage);
|
|
8537
|
+
const pagesPdf = doclingDocument ? void 0 : await getPageRangePdf(startPage, endPage);
|
|
8538
|
+
const pagesText = doclingDocument ? await getPageRangeText(startPage, endPage) : "";
|
|
8217
8539
|
const budget = resolveBudget("extraction_page_map", 2048);
|
|
8218
8540
|
const startedAt = Date.now();
|
|
8219
8541
|
const mapResponse = await safeGenerateObject(
|
|
8220
8542
|
generateObject,
|
|
8221
8543
|
{
|
|
8222
|
-
prompt:
|
|
8544
|
+
prompt: withPageRangeTextContext(
|
|
8545
|
+
buildPageMapPrompt(templateHints, startPage, endPage, formInventoryHint),
|
|
8546
|
+
startPage,
|
|
8547
|
+
endPage,
|
|
8548
|
+
pagesText
|
|
8549
|
+
),
|
|
8223
8550
|
schema: PageMapChunkSchema,
|
|
8224
8551
|
maxTokens: budget.maxTokens,
|
|
8225
8552
|
taskKind: "extraction_page_map",
|
|
8226
8553
|
budgetDiagnostics: budget,
|
|
8227
|
-
providerOptions: { ...activeProviderOptions, pdfBase64: pagesPdf }
|
|
8554
|
+
providerOptions: doclingDocument ? { ...activeProviderOptions, doclingText: pagesText, doclingPageRange: { startPage, endPage } } : { ...activeProviderOptions, pdfBase64: pagesPdf }
|
|
8228
8555
|
},
|
|
8229
8556
|
{
|
|
8230
8557
|
fallback: {
|
|
@@ -8302,7 +8629,7 @@ function createExtractor(config) {
|
|
|
8302
8629
|
}))
|
|
8303
8630
|
];
|
|
8304
8631
|
onProgress?.(`Dispatching ${tasks.length} extractors...`);
|
|
8305
|
-
const extractionPdfInput = await getPdfBase64ForExtraction();
|
|
8632
|
+
const extractionPdfInput = doclingDocument ? void 0 : await getPdfBase64ForExtraction();
|
|
8306
8633
|
const extractorResults = await Promise.all(
|
|
8307
8634
|
tasks.map(
|
|
8308
8635
|
(task) => extractorLimit(async () => {
|
|
@@ -8313,7 +8640,8 @@ function createExtractor(config) {
|
|
|
8313
8640
|
memory,
|
|
8314
8641
|
completedPageRangePdfCache,
|
|
8315
8642
|
getPageRangePdf,
|
|
8316
|
-
convertPdfToImages ? getPageImages : void 0
|
|
8643
|
+
convertPdfToImages ? getPageImages : void 0,
|
|
8644
|
+
doclingDocument ? getPageRangeText : void 0
|
|
8317
8645
|
);
|
|
8318
8646
|
})
|
|
8319
8647
|
)
|
|
@@ -8345,7 +8673,8 @@ function createExtractor(config) {
|
|
|
8345
8673
|
providerOptions: activeProviderOptions,
|
|
8346
8674
|
pageRangeCache: completedPageRangePdfCache,
|
|
8347
8675
|
getPageRangePdf,
|
|
8348
|
-
getPageImages: convertPdfToImages ? getPageImages : void 0
|
|
8676
|
+
getPageImages: convertPdfToImages ? getPageImages : void 0,
|
|
8677
|
+
getPageRangeText: doclingDocument ? getPageRangeText : void 0
|
|
8349
8678
|
});
|
|
8350
8679
|
trackUsage(supplementaryResult.usage, {
|
|
8351
8680
|
taskKind: "extraction_focused",
|
|
@@ -8381,6 +8710,7 @@ function createExtractor(config) {
|
|
|
8381
8710
|
concurrency,
|
|
8382
8711
|
getPageRangePdf,
|
|
8383
8712
|
getPageImages: convertPdfToImages ? getPageImages : void 0,
|
|
8713
|
+
getPageRangeText: doclingDocument ? getPageRangeText : void 0,
|
|
8384
8714
|
providerOptions: activeProviderOptions,
|
|
8385
8715
|
modelCapabilities,
|
|
8386
8716
|
modelBudgetConstraints,
|
|
@@ -8429,12 +8759,12 @@ function createExtractor(config) {
|
|
|
8429
8759
|
const reviewResponse = await safeGenerateObject(
|
|
8430
8760
|
generateObject,
|
|
8431
8761
|
{
|
|
8432
|
-
prompt: buildReviewPrompt(template.required, extractedKeys, extractionSummary, pageMapSummary, extractorCatalog),
|
|
8762
|
+
prompt: withFullDocumentTextContext(buildReviewPrompt(template.required, extractedKeys, extractionSummary, pageMapSummary, extractorCatalog)),
|
|
8433
8763
|
schema: ReviewResultSchema,
|
|
8434
8764
|
maxTokens: budget.maxTokens,
|
|
8435
8765
|
taskKind: "extraction_review",
|
|
8436
8766
|
budgetDiagnostics: budget,
|
|
8437
|
-
providerOptions: await
|
|
8767
|
+
providerOptions: await getFullDocumentProviderOptions()
|
|
8438
8768
|
},
|
|
8439
8769
|
{
|
|
8440
8770
|
fallback: {
|
|
@@ -8464,7 +8794,7 @@ function createExtractor(config) {
|
|
|
8464
8794
|
break;
|
|
8465
8795
|
}
|
|
8466
8796
|
onProgress?.(`Review round ${round + 1}: dispatching ${reviewResponse.object.additionalTasks.length} follow-up extractors...`);
|
|
8467
|
-
const extractionPdfInput = await getPdfBase64ForExtraction();
|
|
8797
|
+
const extractionPdfInput = doclingDocument ? void 0 : await getPdfBase64ForExtraction();
|
|
8468
8798
|
const followUpResults = await Promise.all(
|
|
8469
8799
|
reviewResponse.object.additionalTasks.map(
|
|
8470
8800
|
(task) => extractorLimit(async () => {
|
|
@@ -8474,7 +8804,8 @@ function createExtractor(config) {
|
|
|
8474
8804
|
memory,
|
|
8475
8805
|
completedPageRangePdfCache,
|
|
8476
8806
|
getPageRangePdf,
|
|
8477
|
-
convertPdfToImages ? getPageImages : void 0
|
|
8807
|
+
convertPdfToImages ? getPageImages : void 0,
|
|
8808
|
+
doclingDocument ? getPageRangeText : void 0
|
|
8478
8809
|
);
|
|
8479
8810
|
})
|
|
8480
8811
|
)
|
|
@@ -12473,6 +12804,7 @@ export {
|
|
|
12473
12804
|
buildConfirmationSummaryPrompt,
|
|
12474
12805
|
buildConversationMemoryGuidance,
|
|
12475
12806
|
buildCoverageGapPrompt,
|
|
12807
|
+
buildDoclingProviderOptions,
|
|
12476
12808
|
buildFieldExplanationPrompt,
|
|
12477
12809
|
buildFieldExtractionPrompt,
|
|
12478
12810
|
buildFlatPdfMappingPrompt,
|
|
@@ -12514,12 +12846,16 @@ export {
|
|
|
12514
12846
|
fillAcroForm,
|
|
12515
12847
|
generateNextMessage,
|
|
12516
12848
|
getAcroFormFields,
|
|
12849
|
+
getDoclingPageRangeText,
|
|
12517
12850
|
getExtractor,
|
|
12518
12851
|
getFileIdentifier,
|
|
12519
12852
|
getPdfPageCount,
|
|
12520
12853
|
getTemplate,
|
|
12854
|
+
isDoclingExtractionInput,
|
|
12521
12855
|
isFileReference,
|
|
12522
12856
|
mergeQuestionAnswers,
|
|
12857
|
+
mergeSourceSpans,
|
|
12858
|
+
normalizeDoclingDocument,
|
|
12523
12859
|
normalizeForMatch,
|
|
12524
12860
|
orderSourceEvidence,
|
|
12525
12861
|
overlayTextOnPdf,
|