@claritylabs/cl-sdk 1.1.0 → 1.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs CHANGED
@@ -2471,6 +2471,254 @@ async function overlayTextOnPdf(pdfBytes, overlays) {
2471
2471
  return await pdfDoc.save();
2472
2472
  }
2473
2473
 
2474
+ // src/extraction/docling.ts
2475
/**
 * Reports whether `input` is a Docling extraction payload: an object tagged
 * `kind: "docling_document"` that carries an object-valued `document`.
 *
 * @param {unknown} input Candidate extraction input.
 * @returns {boolean} `true` only when both the tag and the document are present.
 */
function isDoclingExtractionInput(input) {
  if (!input || typeof input !== "object") return false;
  if (input.kind !== "docling_document") return false;
  const payload = input.document;
  return Boolean(payload) && typeof payload === "object";
}
2480
/**
 * Flattens a Docling document into ordered text units, per-page text, a
 * page-labelled full text, and one source span per unit.
 *
 * Items follow the body ordering when it yields any refs; otherwise the
 * fallback ordering is used. Units whose text is blank are dropped.
 *
 * @param {object} document Docling document JSON.
 * @param {object} options Read for `documentId` (forwarded to the span builder)
 *   and `sourceKind` (defaults to "policy_pdf").
 * @returns {{pageCount: number, fullText: string, pageTexts: Map, units: object[], sourceSpans: object[]}}
 */
function normalizeDoclingDocument(document, options) {
  const itemMap = buildItemMap(document);
  const bodyRefs = getOrderedBodyRefs(document, itemMap);

  let entries;
  if (bodyRefs.length > 0) {
    entries = [];
    for (const ref of bodyRefs) {
      const entry = itemMap.get(ref);
      if (entry) entries.push(entry);
    }
  } else {
    entries = getFallbackOrderedItems(document, itemMap);
  }

  const units = [];
  for (const { ref, item } of entries) {
    const unit = normalizeItem(ref, item);
    if (unit && unit.text.trim()) units.push(unit);
  }

  const pageCount = inferPageCount(document, units);

  // Each unit's text is filed under its (clamped) starting page; units without a page go to page 1.
  const pageTexts = /* @__PURE__ */ new Map();
  for (const unit of units) {
    const page = clampPage(unit.pageStart ?? 1, pageCount);
    pageTexts.set(page, appendText(pageTexts.get(page), unit.text));
  }

  const pageBlocks = [];
  for (let pageNumber = 1; pageNumber <= pageCount; pageNumber++) {
    const text = pageTexts.get(pageNumber)?.trim();
    if (text) pageBlocks.push(`Page ${pageNumber}\n${text}`);
  }
  const fullText = pageBlocks.join("\n\n");

  const sourceKind = options.sourceKind ?? "policy_pdf";
  const sourceSpans = units.map((unit, index) => {
    const metadata = {
      sourceSystem: "docling",
      sourceUnit: "docling_item",
      doclingRef: unit.ref
    };
    if (unit.label) metadata.doclingLabel = unit.label;
    const span = buildSourceSpan(
      {
        documentId: options.documentId,
        sourceKind,
        text: unit.text,
        pageStart: unit.pageStart,
        pageEnd: unit.pageEnd,
        sectionId: unit.label,
        metadata
      },
      index
    );
    const bbox = unit.bboxes?.length ? unit.bboxes : void 0;
    return { ...span, kind: "plain_text", bbox };
  });

  return { pageCount, fullText, pageTexts, units, sourceSpans };
}
2530
/**
 * Joins the stored text of every page in `[startPage, endPage]` (after both
 * bounds are clamped to the document's page count), each prefixed by a
 * "Page N" line. Pages with no text are omitted.
 *
 * @param {{pageCount: number, pageTexts: Map}} normalized Normalized Docling document.
 * @param {number} startPage First page requested (1-indexed).
 * @param {number} endPage Last page requested (inclusive).
 * @returns {string} Page blocks separated by blank lines, or "" when nothing matched.
 */
function getDoclingPageRangeText(normalized, startPage, endPage) {
  const firstPage = clampPage(startPage, normalized.pageCount);
  const lastPage = clampPage(endPage, normalized.pageCount);
  const sections = [];
  let page = firstPage;
  while (page <= lastPage) {
    const text = normalized.pageTexts.get(page)?.trim();
    if (text) sections.push(`Page ${page}\n${text}`);
    page++;
  }
  return sections.join("\n\n");
}
2543
/**
 * Returns a copy of `existingOptions` extended with the document's full text
 * (`doclingText`) and page count (`doclingPageCount`). Docling fields win over
 * same-named keys already present.
 *
 * @param {{fullText: string, pageCount: number}} normalized Normalized Docling document.
 * @param {object} [existingOptions] Provider options to extend; not mutated.
 * @returns {object} New provider-options object.
 */
function buildDoclingProviderOptions(normalized, existingOptions) {
  const doclingFields = {
    doclingText: normalized.fullText,
    doclingPageCount: normalized.pageCount
  };
  return { ...existingOptions, ...doclingFields };
}
2550
/**
 * Removes duplicate source spans while keeping first-seen order.
 *
 * Two spans are duplicates when they agree on document id, start page, end
 * page, section (or field path) and text hash; missing page/section parts are
 * represented as "na". The hash is computed from `span.text` only when the
 * span carries no `textHash`.
 *
 * @param {Iterable<object>} spans Spans to de-duplicate.
 * @returns {object[]} The unique spans (same object references as the input).
 */
function mergeSourceSpans(spans) {
  const identityOf = (span) => {
    const location = span.location;
    const startPage = span.pageStart ?? location?.startPage ?? location?.page ?? "na";
    const endPage = span.pageEnd ?? location?.endPage ?? span.pageStart ?? "na";
    const section = span.sectionId ?? location?.fieldPath ?? "na";
    const hash = span.textHash ?? sourceSpanTextHash(span.text);
    return [span.documentId, startPage, endPage, section, hash].join(":");
  };

  const seenKeys = /* @__PURE__ */ new Set();
  const unique = [];
  for (const span of spans) {
    const key = identityOf(span);
    if (!seenKeys.has(key)) {
      seenKeys.add(key);
      unique.push(span);
    }
  }
  return unique;
}
2567
/**
 * Indexes every text, table, key-value item and picture of a Docling document
 * by reference. Collections are registered in that order, so a later item with
 * the same ref replaces an earlier one.
 *
 * @param {object} document Docling document JSON.
 * @returns {Map<string, {ref: string, item: object}>} Ref-to-entry lookup.
 */
function buildItemMap(document) {
  const itemsByRef = /* @__PURE__ */ new Map();
  const collections = [
    ["#/texts", document.texts],
    ["#/tables", document.tables],
    ["#/key_value_items", document.key_value_items ?? document.keyValueItems],
    ["#/pictures", document.pictures]
  ];
  for (const [baseRef, items] of collections) {
    addItems(itemsByRef, baseRef, items ?? []);
  }
  return itemsByRef;
}
2575
/**
 * Registers each item of one Docling collection in `map`, keyed by the item's
 * own ref or, when it has none, by `${baseRef}/${index}`.
 *
 * Docling JSON is external input, so a non-array collection is ignored and
 * `null`/non-object entries are skipped instead of raising a TypeError. Skipped
 * entries still consume their index, keeping positional refs stable.
 *
 * @param {Map<string, {ref: string, item: object}>} map Lookup to fill (mutated).
 * @param {string} baseRef Ref prefix of the collection, e.g. "#/texts".
 * @param {unknown} items The collection to register.
 * @returns {void}
 */
function addItems(map, baseRef, items) {
  if (!Array.isArray(items)) return;
  items.forEach((item, index) => {
    if (!item || typeof item !== "object") return;
    const ref = getSelfRef(item) ?? `${baseRef}/${index}`;
    map.set(ref, { ref, item });
  });
}
2581
/**
 * Produces the item ordering used when the document body yields no refs:
 * all texts, then all tables, then all key-value items, each in array order.
 * Refs that are absent from `itemMap` are dropped.
 *
 * Collections that are not arrays are treated as empty, and `null`/non-object
 * entries fall back to their positional ref instead of raising a TypeError.
 *
 * @param {object} document Docling document JSON.
 * @param {Map<string, {ref: string, item: object}>} itemMap Lookup built from the same document.
 * @returns {Array<{ref: string, item: object}>} Entries in fallback order.
 */
function getFallbackOrderedItems(document, itemMap) {
  const refsFor = (items, baseRef) => {
    if (!Array.isArray(items)) return [];
    return items.map((item, index) => {
      const selfRef = item && typeof item === "object" ? getSelfRef(item) : void 0;
      return selfRef ?? `${baseRef}/${index}`;
    });
  };
  const refs = [
    ...refsFor(document.texts, "#/texts"),
    ...refsFor(document.tables, "#/tables"),
    ...refsFor(document.key_value_items ?? document.keyValueItems, "#/key_value_items")
  ];
  return refs.map((ref) => itemMap.get(ref)).filter((entry) => Boolean(entry));
}
2589
/**
 * Walks the document body depth-first and returns the refs of all known items
 * in the order they are first reached. Group refs are followed but not emitted;
 * refs that name neither an item nor a group are ignored.
 *
 * Every ref (item or group) is expanded at most once, so malformed input with
 * cyclic `children` references terminates instead of recursing forever. For
 * acyclic input the result matches a walk without that guard, because a
 * repeated subtree can only reach items that were already emitted. `null`
 * children and `null` groups are skipped.
 *
 * @param {object} document Docling document JSON (`body`, optional `groups`).
 * @param {Map<string, {ref: string, item: object}>} itemMap Lookup of known items.
 * @returns {string[]} Item refs in body order, without duplicates.
 */
function getOrderedBodyRefs(document, itemMap) {
  const groupMap = /* @__PURE__ */ new Map();
  (document.groups ?? []).forEach((group, index) => {
    if (!group || typeof group !== "object") return;
    groupMap.set(getSelfRef(group) ?? `#/groups/${index}`, group);
  });

  const refs = [];
  const visited = /* @__PURE__ */ new Set();
  const visitNode = (node) => {
    for (const child of node?.children ?? []) {
      if (child == null) continue;
      const ref = getRef(child);
      // Marking before descending is what breaks reference cycles.
      if (!ref || visited.has(ref)) continue;
      visited.add(ref);
      const itemEntry = itemMap.get(ref);
      if (itemEntry) {
        refs.push(ref);
        visitNode(itemEntry.item);
      } else {
        visitNode(groupMap.get(ref));
      }
    }
  };
  visitNode(document.body);
  return refs;
}
2618
/**
 * Converts one Docling item into a text unit, or returns `undefined` when the
 * item has no text.
 *
 * The page range is the min/max of the positive numeric page numbers found in
 * `item.prov`; both ends are `undefined` when there are none. `bboxes` holds the
 * boxes that could be derived from `item.prov`, or `undefined` when none could.
 *
 * @param {string} ref Reference under which the item is registered.
 * @param {object} item Docling item.
 * @returns {{ref: string, label: (string|undefined), text: string, pageStart: (number|undefined), pageEnd: (number|undefined), bboxes: (object[]|undefined)}|undefined}
 */
function normalizeItem(ref, item) {
  const text = getItemText(item).trim();
  if (!text) return void 0;

  const provenance = item.prov ?? [];
  const isPositivePage = (page) => typeof page === "number" && page > 0;
  const pages = provenance.map((prov) => getPageNumber(prov)).filter(isPositivePage);
  const boxes = provenance.map((prov) => toSourceSpanBBox(prov)).filter((box) => Boolean(box));
  const hasPages = pages.length > 0;

  return {
    ref,
    label: typeof item.label === "string" ? item.label : void 0,
    text,
    pageStart: hasPages ? Math.min(...pages) : void 0,
    pageEnd: hasPages ? Math.max(...pages) : void 0,
    bboxes: boxes.length ? boxes : void 0
  };
}
2634
/**
 * Picks the text of a Docling item: the first non-blank string among `text`
 * and `orig` (returned untrimmed), otherwise the Markdown rendering of
 * `item.data`, otherwise "".
 *
 * @param {object} item Docling item.
 * @returns {string} Item text, possibly empty.
 */
function getItemText(item) {
  for (const candidate of [item.text, item.orig]) {
    if (typeof candidate === "string" && candidate.trim()) return candidate;
  }
  return tableToMarkdown(item.data) || "";
}
2641
/**
 * Renders Docling table data (`table_cells` / `tableCells`) as text.
 *
 * A table whose cells all land on one row becomes "a | b | c" (empty cells
 * dropped); otherwise a Markdown table is produced with row 0 as the header.
 *
 * Cell position is read from the first numeric value among several key
 * spellings. The previously accepted keys keep their priority; Docling's
 * `start_row_offset_idx` / `start_col_offset_idx` (and camelCase variants) are
 * now recognised too, since without them every Docling cell resolved to (0, 0).
 * Cells whose row or column is not a non-negative integer are skipped, because
 * they cannot be placed in the grid.
 *
 * @param {unknown} data The item's `data` payload.
 * @returns {string|undefined} Rendered table, or `undefined` when there are no usable cells.
 */
function tableToMarkdown(data) {
  const record = asRecord(data);
  const cells = Array.isArray(record?.table_cells) ? record.table_cells : Array.isArray(record?.tableCells) ? record.tableCells : void 0;
  if (!cells) return void 0;

  const isGridIndex = (value) => Number.isInteger(value) && value >= 0;
  const parsedCells = [];
  // Tracked in the loop rather than via Math.max(...spread), which can exceed
  // the engine's argument limit on very large tables.
  let maxRow = 0;
  let maxCol = 0;
  for (const rawCell of cells) {
    const cell = asRecord(rawCell);
    if (!cell) continue;
    const text = firstString([cell.text, cell.orig, cell.content]);
    if (!text) continue;
    const row = firstNumber2([
      cell.start_row_offset,
      cell.start_row_offset_idx,
      cell.startRowOffsetIdx,
      cell.row_header,
      cell.row,
      cell.rowIndex
    ]) ?? 0;
    const col = firstNumber2([
      cell.start_col_offset,
      cell.start_col_offset_idx,
      cell.startColOffsetIdx,
      cell.col,
      cell.colIndex
    ]) ?? 0;
    if (!isGridIndex(row) || !isGridIndex(col)) continue;
    maxRow = Math.max(maxRow, row);
    maxCol = Math.max(maxCol, col);
    parsedCells.push({ row, col, text });
  }
  if (parsedCells.length === 0) return void 0;

  const rows = Array.from({ length: maxRow + 1 }, () => Array.from({ length: maxCol + 1 }, () => ""));
  for (const cell of parsedCells) {
    rows[cell.row][cell.col] = cell.text;
  }
  if (rows.length === 1) return rows[0].filter(Boolean).join(" | ");

  const header = rows[0];
  const separator = header.map(() => "---");
  return [header, separator, ...rows.slice(1)].map((row) => `| ${row.map((value) => value.trim()).join(" | ")} |`).join("\n");
}
2662
/**
 * Works out how many pages the document has (never less than 1).
 *
 * - `document.pages` is an array: its length.
 * - `document.pages` is an object: the largest finite numeric key, or the
 *   number of keys when that maximum is 0.
 * - otherwise: the largest `pageStart`/`pageEnd` found on `units`.
 *
 * @param {object} document Docling document JSON.
 * @param {Array<{pageStart?: number, pageEnd?: number}>} units Normalized units.
 * @returns {number} Inferred page count.
 */
function inferPageCount(document, units) {
  const { pages } = document;
  if (Array.isArray(pages)) return Math.max(1, pages.length);

  if (pages && typeof pages === "object") {
    const keys = Object.keys(pages);
    const highestNumericKey = keys.reduce((best, key) => {
      const numeric = Number(key);
      return Number.isFinite(numeric) ? Math.max(best, numeric) : best;
    }, 0);
    return Math.max(1, highestNumericKey || keys.length);
  }

  return units.reduce(
    (best, unit) => Math.max(best, unit.pageStart ?? 0, unit.pageEnd ?? 0),
    1
  );
}
2672
/**
 * Reads a node's own reference, accepting both `self_ref` and `selfRef`
 * spellings (snake_case wins when both are present).
 *
 * Returns `undefined` for `null`/`undefined` input instead of throwing, so a
 * stray `null` entry in untrusted Docling JSON falls back to a positional ref.
 *
 * @param {object|null|undefined} value Docling node.
 * @returns {string|undefined} The node's ref, if it declares one.
 */
function getSelfRef(value) {
  return value?.self_ref ?? value?.selfRef;
}
2675
/**
 * Resolves a child pointer to its target ref. A string is returned as-is; an
 * object yields its `$ref`, falling back to `ref`.
 *
 * Returns `undefined` for `null`/`undefined` input instead of throwing, so a
 * `null` entry in a `children` array is simply skipped by callers.
 *
 * @param {string|object|null|undefined} value Child pointer.
 * @returns {string|undefined} Target ref, if any.
 */
function getRef(value) {
  if (typeof value === "string") return value;
  return value?.$ref ?? value?.ref;
}
2679
/**
 * Reads the page number of a provenance entry, trying `page_no`, then
 * `pageNo`, then `page`.
 *
 * @param {object} prov Provenance entry.
 * @returns {*} The first of those properties that is not null/undefined.
 */
function getPageNumber(prov) {
  const { page_no: snakeCase, pageNo: camelCase, page } = prov;
  return snakeCase ?? camelCase ?? page;
}
2682
/**
 * Derives a `{ page, x, y, width, height }` box from a provenance entry.
 *
 * `x` comes from `x`/`l`/`left`, `y` from `y`/`t`/`top`. Width and height are
 * taken directly when present, otherwise computed as `right - x` and
 * `bottom - y`. Returns `undefined` when the page, the bbox record, the origin
 * or either dimension cannot be resolved.
 *
 * NOTE(review): the subtraction assumes the bottom edge is numerically larger
 * than the top edge; a bbox expressed with a bottom-left origin would produce a
 * negative height. Confirm which coordinate origin callers supply.
 *
 * @param {object} prov Provenance entry.
 * @returns {{page: number, x: number, y: number, width: number, height: number}|undefined}
 */
function toSourceSpanBBox(prov) {
  const page = getPageNumber(prov);
  const box = asRecord(prov.bbox);
  if (!page || !box) return void 0;

  const x = firstNumber2([box.x, box.l, box.left]);
  const y = firstNumber2([box.y, box.t, box.top]);
  if (x == null || y == null) return void 0;

  let width = firstNumber2([box.width]);
  if (width == null) {
    const right = firstNumber2([box.r, box.right]);
    if (right != null) width = right - x;
  }

  let height = firstNumber2([box.height]);
  if (height == null) {
    const bottom = firstNumber2([box.b, box.bottom]);
    if (bottom != null) height = bottom - y;
  }

  if (width == null || height == null) return void 0;
  return { page, x, y, width, height };
}
2698
/**
 * Constrains `page` to `[1, pageCount]`. The lower bound is applied last, so
 * the result is 1 whenever `pageCount` is below 1.
 *
 * @param {number} page Requested page.
 * @param {number} pageCount Number of pages in the document.
 * @returns {number} The clamped page.
 */
function clampPage(page, pageCount) {
  const cappedAtLastPage = Math.min(pageCount, page);
  return Math.max(1, cappedAtLastPage);
}
2701
/**
 * Appends `next` to `existing` with a blank line in between; when `existing`
 * is empty or missing, `next` is returned unchanged.
 *
 * @param {string|undefined} existing Text accumulated so far.
 * @param {string} next Text to add.
 * @returns {string} The combined text.
 */
function appendText(existing, next) {
  if (!existing) return next;
  return `${existing}\n\n${next}`;
}
2706
/**
 * Narrows a value to a non-array object.
 *
 * @param {unknown} value Any value.
 * @returns {object|undefined} `value` itself when it is a truthy, non-array
 *   object; otherwise `undefined`.
 */
function asRecord(value) {
  if (!value || typeof value !== "object") return void 0;
  if (Array.isArray(value)) return void 0;
  return value;
}
2709
/**
 * Returns the first entry of `values` that is a non-blank string, trimmed, or
 * "" when there is none.
 *
 * @param {Array<unknown>} values Candidates in priority order.
 * @returns {string} Trimmed match or the empty string.
 */
function firstString(values) {
  const match = values.find((value) => typeof value === "string" && value.trim());
  return match === void 0 ? "" : match.trim();
}
2715
/**
 * Returns the first entry of `values` that is a finite number, or `undefined`
 * when there is none. Numeric strings, `NaN` and infinities are not accepted.
 *
 * @param {Array<unknown>} values Candidates in priority order.
 * @returns {number|undefined} The first finite number.
 */
function firstNumber2(values) {
  // Number.isFinite is false for every non-number, so no separate typeof check is needed.
  return values.find((value) => Number.isFinite(value));
}
2721
+
2474
2722
  // src/extraction/extractor.ts
2475
2723
  function sourceSpansForPageRange(providerOptions, startPage, endPage) {
2476
2724
  const sourceSpans = providerOptions?.sourceSpans;
@@ -2519,15 +2767,31 @@ async function runExtractor(params) {
2519
2767
  } = params;
2520
2768
  const extractorProviderOptions = { ...providerOptions };
2521
2769
  let fullPrompt;
2522
- const needsPdfBase64 = convertPdfToImages && !params.getPageImages || !convertPdfToImages && !params.getPageRangePdf;
2523
- const pdfBase64 = needsPdfBase64 ? await pdfInputToBase64(pdfInput) : void 0;
2524
- if (convertPdfToImages) {
2770
+ if (params.getPageRangeText) {
2771
+ const pageText = await params.getPageRangeText(startPage, endPage);
2772
+ extractorProviderOptions.doclingText = pageText;
2773
+ extractorProviderOptions.doclingPageRange = { startPage, endPage };
2774
+ fullPrompt = `${prompt}
2775
+
2776
+ [Document pages ${startPage}-${endPage} are provided below as Docling-extracted text.]
2777
+
2778
+ ${pageText || "(No Docling text was available for this page range.)"}`;
2779
+ } else if (convertPdfToImages) {
2780
+ if (!pdfInput) {
2781
+ throw new Error("pdfInput is required when extracting page images.");
2782
+ }
2783
+ const needsPdfBase64 = !params.getPageImages;
2784
+ const pdfBase64 = needsPdfBase64 ? await pdfInputToBase64(pdfInput) : void 0;
2525
2785
  const images = params.getPageImages ? await params.getPageImages(startPage, endPage) : await convertPdfToImages(pdfBase64, startPage, endPage);
2526
2786
  extractorProviderOptions.images = images;
2527
2787
  fullPrompt = `${prompt}
2528
2788
 
2529
2789
  [Document pages ${startPage}-${endPage} are provided as images.]`;
2530
2790
  } else {
2791
+ if (!pdfInput) {
2792
+ throw new Error("pdfInput is required when extracting page PDFs.");
2793
+ }
2794
+ const pdfBase64 = params.getPageRangePdf ? void 0 : await pdfInputToBase64(pdfInput);
2531
2795
  const cacheKey = `${startPage}-${endPage}`;
2532
2796
  const cachedPagesPdf = pageRangeCache?.get(cacheKey);
2533
2797
  const pagesPdf = cachedPagesPdf ?? (params.getPageRangePdf ? await params.getPageRangePdf(startPage, endPage) : await extractPageRange(pdfBase64, startPage, endPage));
@@ -3567,7 +3831,7 @@ function formatAddress(addr) {
3567
3831
/**
 * Keeps only the non-array object entries of `value`; anything that is not an
 * array yields an empty list.
 *
 * @param {unknown} value Candidate array.
 * @returns {object[]} The object entries, in original order.
 */
function asRecordArray(value) {
  if (!Array.isArray(value)) return [];
  return value.filter((item) => item !== null && typeof item === "object" && !Array.isArray(item));
}
3570
- function firstString(item, keys) {
3834
+ function firstString2(item, keys) {
3571
3835
  for (const key of keys) {
3572
3836
  const value = item[key];
3573
3837
  if (typeof value === "string" && value.trim()) return value;
@@ -3924,32 +4188,32 @@ ${exc.content}`.trim(), {
3924
4188
  );
3925
4189
  });
3926
4190
  asRecordArray(extendedDoc.definitions).forEach((definition, i) => {
3927
- const term = firstString(definition, ["term", "name", "title"]) ?? `Definition ${i + 1}`;
3928
- const body = firstString(definition, ["definition", "content", "text", "meaning"]);
4191
+ const term = firstString2(definition, ["term", "name", "title"]) ?? `Definition ${i + 1}`;
4192
+ const body = firstString2(definition, ["definition", "content", "text", "meaning"]);
3929
4193
  pushChunk(
3930
4194
  `definition:${i}`,
3931
4195
  "definition",
3932
4196
  lines([
3933
4197
  `Definition: ${term}`,
3934
4198
  body,
3935
- firstString(definition, ["originalContent", "source"]) ? `Source: ${firstString(definition, ["originalContent", "source"])}` : null
4199
+ firstString2(definition, ["originalContent", "source"]) ? `Source: ${firstString2(definition, ["originalContent", "source"])}` : null
3936
4200
  ]),
3937
4201
  {
3938
4202
  term,
3939
- formNumber: firstString(definition, ["formNumber"]),
3940
- formTitle: firstString(definition, ["formTitle"]),
4203
+ formNumber: firstString2(definition, ["formNumber"]),
4204
+ formTitle: firstString2(definition, ["formTitle"]),
3941
4205
  pageNumber: typeof definition.pageNumber === "number" ? definition.pageNumber : void 0,
3942
- sectionRef: firstString(definition, ["sectionRef", "sectionTitle"]),
4206
+ sectionRef: firstString2(definition, ["sectionRef", "sectionTitle"]),
3943
4207
  documentType: doc.type
3944
4208
  }
3945
4209
  );
3946
4210
  });
3947
4211
  const coveredReasons = asRecordArray(extendedDoc.coveredReasons ?? extendedDoc.covered_reasons);
3948
4212
  coveredReasons.forEach((coveredReason, i) => {
3949
- const title = firstString(coveredReason, ["title", "name", "reason", "peril", "cause"]) ?? `Covered Reason ${i + 1}`;
3950
- const coverageName = firstString(coveredReason, ["coverageName", "coverage", "coveragePart"]);
3951
- const reasonNumber = firstString(coveredReason, ["reasonNumber", "number"]);
3952
- const body = firstString(coveredReason, ["content", "description", "text", "coverageGrant"]);
4213
+ const title = firstString2(coveredReason, ["title", "name", "reason", "peril", "cause"]) ?? `Covered Reason ${i + 1}`;
4214
+ const coverageName = firstString2(coveredReason, ["coverageName", "coverage", "coveragePart"]);
4215
+ const reasonNumber = firstString2(coveredReason, ["reasonNumber", "number"]);
4216
+ const body = firstString2(coveredReason, ["content", "description", "text", "coverageGrant"]);
3953
4217
  pushChunk(
3954
4218
  `covered_reason:${i}`,
3955
4219
  "covered_reason",
@@ -3958,16 +4222,16 @@ ${exc.content}`.trim(), {
3958
4222
  reasonNumber ? `Reason Number: ${reasonNumber}` : null,
3959
4223
  `Covered Reason: ${title}`,
3960
4224
  body,
3961
- firstString(coveredReason, ["originalContent", "source"]) ? `Source: ${firstString(coveredReason, ["originalContent", "source"])}` : null
4225
+ firstString2(coveredReason, ["originalContent", "source"]) ? `Source: ${firstString2(coveredReason, ["originalContent", "source"])}` : null
3962
4226
  ]),
3963
4227
  {
3964
4228
  coverageName,
3965
4229
  reasonNumber,
3966
4230
  title,
3967
- formNumber: firstString(coveredReason, ["formNumber"]),
3968
- formTitle: firstString(coveredReason, ["formTitle"]),
4231
+ formNumber: firstString2(coveredReason, ["formNumber"]),
4232
+ formTitle: firstString2(coveredReason, ["formTitle"]),
3969
4233
  pageNumber: typeof coveredReason.pageNumber === "number" ? coveredReason.pageNumber : void 0,
3970
- sectionRef: firstString(coveredReason, ["sectionRef", "sectionTitle"]),
4234
+ sectionRef: firstString2(coveredReason, ["sectionRef", "sectionTitle"]),
3971
4235
  documentType: doc.type
3972
4236
  }
3973
4237
  );
@@ -3987,10 +4251,10 @@ ${exc.content}`.trim(), {
3987
4251
  reasonNumber,
3988
4252
  title,
3989
4253
  conditionIndex,
3990
- formNumber: firstString(coveredReason, ["formNumber"]),
3991
- formTitle: firstString(coveredReason, ["formTitle"]),
4254
+ formNumber: firstString2(coveredReason, ["formNumber"]),
4255
+ formTitle: firstString2(coveredReason, ["formTitle"]),
3992
4256
  pageNumber: typeof coveredReason.pageNumber === "number" ? coveredReason.pageNumber : void 0,
3993
- sectionRef: firstString(coveredReason, ["sectionRef", "sectionTitle"]),
4257
+ sectionRef: firstString2(coveredReason, ["sectionRef", "sectionTitle"]),
3994
4258
  documentType: doc.type
3995
4259
  }
3996
4260
  );
@@ -6461,21 +6725,21 @@ Return JSON only.`;
6461
6725
  }
6462
6726
 
6463
6727
  // src/prompts/extractors/index.ts
6464
- function asRecord(data) {
6728
/**
 * Returns `data` when it is a truthy object (arrays included), otherwise
 * `undefined`.
 *
 * @param {unknown} data Any value.
 * @returns {object|undefined} `data` itself or `undefined`.
 */
function asRecord2(data) {
  if (!data || typeof data !== "object") return void 0;
  return data;
}
6467
6731
/**
 * Reads the `sections` array of an extraction result.
 *
 * @param {unknown} data Extraction result.
 * @returns {Array} `data.sections` when it is an array, otherwise a new empty array.
 */
function getSections2(data) {
  const candidate = asRecord2(data)?.sections;
  if (!Array.isArray(candidate)) return [];
  return candidate;
}
6471
6735
/**
 * Reports whether an extraction result has no covered reasons. The list is
 * read from `coveredReasons`, or from `covered_reasons` when the former is not
 * an array; a non-object result or a missing list counts as empty.
 *
 * @param {unknown} data Extraction result.
 * @returns {boolean} `true` when no covered reasons are present.
 */
function isCoveredReasonsEmpty(data) {
  const record = asRecord2(data);
  if (!record) return true;
  for (const key of ["coveredReasons", "covered_reasons"]) {
    const list = record[key];
    // The first key that holds an array decides, even if that array is empty.
    if (Array.isArray(list)) return list.length === 0;
  }
  return true;
}
6477
6741
/**
 * Reports whether an extraction result has no definitions, i.e. `definitions`
 * is missing, not an array, or an empty array.
 *
 * @param {unknown} data Extraction result.
 * @returns {boolean} `true` when no definitions are present.
 */
function isDefinitionsEmpty(data) {
  const list = asRecord2(data)?.definitions;
  if (Array.isArray(list)) return list.length === 0;
  return true;
}
6481
6745
  function sectionLooksLikeCoveredReason(section) {
@@ -6709,6 +6973,14 @@ function decideReferentialResolutionAction(params) {
6709
6973
  }
6710
6974
 
6711
6975
  // src/extraction/resolve-referential.ts
6976
/**
 * Builds the prompt suffix that carries Docling text. Returns "" unless
 * `providerOptions.doclingText` is a non-blank string; the text itself is
 * appended untrimmed under a "DOCLING DOCUMENT TEXT:" heading.
 *
 * @param {object} [providerOptions] Provider options that may hold `doclingText`.
 * @returns {string} Suffix to append to a prompt, possibly empty.
 */
function formatDoclingTextContext(providerOptions) {
  const doclingText = providerOptions?.doclingText;
  const hasText = typeof doclingText === "string" && doclingText.trim() !== "";
  if (!hasText) return "";
  return `\n\nDOCLING DOCUMENT TEXT:\n${doclingText}`;
}
6712
6984
  function parseReferenceTarget(text) {
6713
6985
  if (typeof text !== "string") return void 0;
6714
6986
  const normalized = text.trim();
@@ -6790,12 +7062,12 @@ Return the page range (1-indexed) where this section is located. If the section
6790
7062
 
6791
7063
  If you cannot find the section, return startPage: 0 and endPage: 0.
6792
7064
 
6793
- Return JSON only.`,
7065
+ Return JSON only.${formatDoclingTextContext(providerOptions)}`,
6794
7066
  schema: PageLocationSchema,
6795
7067
  maxTokens: budget.maxTokens,
6796
7068
  taskKind: "extraction_referential_lookup",
6797
7069
  budgetDiagnostics: budget,
6798
- providerOptions: await buildPdfProviderOptions(pdfInput, providerOptions)
7070
+ providerOptions: pdfInput ? await buildPdfProviderOptions(pdfInput, providerOptions) : providerOptions
6799
7071
  },
6800
7072
  {
6801
7073
  fallback: { startPage: 0, endPage: 0 },
@@ -6829,6 +7101,7 @@ async function resolveReferentialCoverages(params) {
6829
7101
  convertPdfToImages,
6830
7102
  getPageRangePdf,
6831
7103
  getPageImages,
7104
+ getPageRangeText,
6832
7105
  concurrency = 2,
6833
7106
  providerOptions,
6834
7107
  modelCapabilities,
@@ -6940,6 +7213,7 @@ async function resolveReferentialCoverages(params) {
6940
7213
  convertPdfToImages,
6941
7214
  getPageRangePdf,
6942
7215
  getPageImages,
7216
+ getPageRangeText,
6943
7217
  maxTokens: budget.maxTokens,
6944
7218
  taskKind: "extraction_referential_lookup",
6945
7219
  budgetDiagnostics: budget,
@@ -7035,6 +7309,7 @@ async function runFocusedExtractorWithFallback(params) {
7035
7309
  pageRangeCache,
7036
7310
  getPageRangePdf,
7037
7311
  getPageImages,
7312
+ getPageRangeText,
7038
7313
  trackUsage,
7039
7314
  resolveBudget,
7040
7315
  log
@@ -7064,7 +7339,8 @@ async function runFocusedExtractorWithFallback(params) {
7064
7339
  providerOptions,
7065
7340
  pageRangeCache,
7066
7341
  getPageRangePdf,
7067
- getPageImages
7342
+ getPageImages,
7343
+ getPageRangeText
7068
7344
  });
7069
7345
  trackUsage(result.usage, {
7070
7346
  taskKind,
@@ -7109,7 +7385,8 @@ async function runFocusedExtractorWithFallback(params) {
7109
7385
  providerOptions,
7110
7386
  pageRangeCache,
7111
7387
  getPageRangePdf,
7112
- getPageImages
7388
+ getPageImages,
7389
+ getPageRangeText
7113
7390
  });
7114
7391
  trackUsage(fallbackResult.usage, {
7115
7392
  taskKind,
@@ -7953,7 +8230,7 @@ function createExtractor(config) {
7953
8230
  }
7954
8231
  return lines.length > 0 ? lines.join("\n") : "";
7955
8232
  }
7956
- async function runFocusedExtractorTask(task, pdfInput, memory, pageRangeCache, getPageRangePdf, getPageImages) {
8233
+ async function runFocusedExtractorTask(task, pdfInput, memory, pageRangeCache, getPageRangePdf, getPageImages, getPageRangeText) {
7957
8234
  if (task.extractorName === "supplementary") {
7958
8235
  const alreadyExtractedSummary = buildAlreadyExtractedSummary(memory);
7959
8236
  const budget = resolveBudget("extraction_focused", 4096);
@@ -7973,7 +8250,8 @@ function createExtractor(config) {
7973
8250
  providerOptions: activeProviderOptions,
7974
8251
  pageRangeCache,
7975
8252
  getPageRangePdf,
7976
- getPageImages
8253
+ getPageImages,
8254
+ getPageRangeText
7977
8255
  });
7978
8256
  trackUsage(result.usage, {
7979
8257
  taskKind: "extraction_focused",
@@ -7992,6 +8270,7 @@ function createExtractor(config) {
7992
8270
  pageRangeCache,
7993
8271
  getPageRangePdf,
7994
8272
  getPageImages,
8273
+ getPageRangeText,
7995
8274
  trackUsage,
7996
8275
  resolveBudget,
7997
8276
  log
@@ -8007,8 +8286,14 @@ function createExtractor(config) {
8007
8286
  if (extractorPages.size === 0) return "No page assignments available.";
8008
8287
  return [...extractorPages.entries()].map(([extractorName, pages]) => `${extractorName}: ${pages.length} page(s), pages ${pages.join(", ")}`).join("\n");
8009
8288
  }
8010
- async function extract(pdfInput, documentId, options) {
8289
+ async function extract(input, documentId, options) {
8011
8290
  const id = documentId ?? `doc-${Date.now()}`;
8291
+ const isDoclingInput = isDoclingExtractionInput(input);
8292
+ const pdfInput = isDoclingInput ? void 0 : input;
8293
+ const doclingDocument = isDoclingInput ? normalizeDoclingDocument(input.document, {
8294
+ documentId: id,
8295
+ sourceKind: input.sourceKind
8296
+ }) : void 0;
8012
8297
  const memory = /* @__PURE__ */ new Map();
8013
8298
  totalUsage = { inputTokens: 0, outputTokens: 0 };
8014
8299
  modelCalls = 0;
@@ -8018,7 +8303,10 @@ function createExtractor(config) {
8018
8303
  modelCalls: [],
8019
8304
  totalModelCallDurationMs: 0
8020
8305
  };
8021
- const sourceSpans = options?.sourceSpans ?? [];
8306
+ const sourceSpans = mergeSourceSpans([
8307
+ ...doclingDocument?.sourceSpans ?? [],
8308
+ ...options?.sourceSpans ?? []
8309
+ ]);
8022
8310
  const sourceChunks = sourceSpans.length ? chunkSourceSpans(sourceSpans) : [];
8023
8311
  activeProviderOptions = sourceSpans.length ? { ...providerOptions, sourceSpans, sourceChunks } : providerOptions;
8024
8312
  if (sourceStore && sourceSpans.length > 0) {
@@ -8047,24 +8335,40 @@ function createExtractor(config) {
8047
8335
  let fullPdfProviderOptionsPromise;
8048
8336
  let pageCountPromise;
8049
8337
/**
 * Returns the base64 form of the PDF input, converting it on first use and
 * reusing the value stored in `pdfBase64Cache` afterwards.
 *
 * @returns {Promise<*>} The cached conversion result.
 * @throws {Error} When no PDF input exists for this extraction.
 */
async function getPdfBase64ForExtraction() {
  if (!pdfInput) {
    throw new Error("PDF input is not available for Docling extraction.");
  }
  const alreadyConverted = pdfBase64Cache !== void 0;
  if (!alreadyConverted) {
    pdfBase64Cache = await pdfInputToBase64(pdfInput);
  }
  return pdfBase64Cache;
}
8055
8346
/**
 * Resolves the document's page count. A Docling document answers directly from
 * its normalized `pageCount`; for PDFs the count comes from the page slicer,
 * falling back to `getPdfPageCount` if the slicer path rejects. The PDF lookup
 * is started once and its promise is reused.
 *
 * @returns {Promise<number>} Page count.
 * @throws {Error} When neither a Docling document nor a PDF input is available.
 */
async function getCachedPageCount() {
  if (doclingDocument) return doclingDocument.pageCount;
  if (!pdfInput) {
    throw new Error("PDF input is required to read page count.");
  }
  if (!pageCountPromise) {
    pageCountPromise = (async () => {
      try {
        const slicer = await getPdfSlicer();
        return await slicer.getPageCount();
      } catch {
        // Best-effort fallback: any slicer failure defers to the direct page-count reader.
        return getPdfPageCount(pdfInput);
      }
    })();
  }
  return pageCountPromise;
}
8061
- async function getFullPdfProviderOptions() {
8356
/**
 * Builds the provider options for calls that need the whole document.
 *
 * - Docling document: the active options extended with the Docling text fields.
 * - No PDF input: the active options as they are (or `{}` when unset).
 * - PDF input: the PDF provider options, built once and reused via
 *   `fullPdfProviderOptionsPromise`.
 *
 * @returns {Promise<object>} Provider options for a full-document call.
 */
async function getFullDocumentProviderOptions() {
  if (doclingDocument) {
    return buildDoclingProviderOptions(doclingDocument, activeProviderOptions);
  }
  if (!pdfInput) {
    return activeProviderOptions ?? {};
  }
  fullPdfProviderOptionsPromise = fullPdfProviderOptionsPromise || buildPdfProviderOptions(pdfInput, activeProviderOptions);
  return fullPdfProviderOptionsPromise;
}
8067
8368
  async function getPdfSlicer() {
8369
+ if (!pdfInput) {
8370
+ throw new Error("PDF input is not available for Docling extraction.");
8371
+ }
8068
8372
  if (!pdfSlicerPromise) {
8069
8373
  pdfSlicerPromise = createPdfPageSlicer(pdfInput);
8070
8374
  }
@@ -8103,6 +8407,23 @@ function createExtractor(config) {
8103
8407
  pageRangeImageCache.set(cacheKey, promise);
8104
8408
  return promise;
8105
8409
  }
8410
/**
 * Returns the Docling text for a page range, or "" when this extraction has no
 * Docling document.
 *
 * @param {number} startPage First page (1-indexed).
 * @param {number} endPage Last page (inclusive).
 * @returns {Promise<string>} Page-labelled text for the range.
 */
async function getPageRangeText(startPage, endPage) {
  if (!doclingDocument) return "";
  return getDoclingPageRangeText(doclingDocument, startPage, endPage);
}
8413
/**
 * Appends the full Docling text to a prompt under a "DOCLING DOCUMENT TEXT:"
 * heading. Without a Docling document the prompt is returned unchanged.
 *
 * @param {string} prompt Base prompt.
 * @returns {string} Prompt, possibly with the document text appended.
 */
function withFullDocumentTextContext(prompt) {
  if (!doclingDocument) return prompt;
  const { fullText } = doclingDocument;
  return `${prompt}\n\nDOCLING DOCUMENT TEXT:\n${fullText}`;
}
8420
/**
 * Appends the Docling text of a page range to a prompt under a
 * "DOCLING DOCUMENT PAGES a-b:" heading, substituting a placeholder sentence
 * when the range has no text. Without a Docling document the prompt is
 * returned unchanged.
 *
 * @param {string} prompt Base prompt.
 * @param {number} startPage First page of the range.
 * @param {number} endPage Last page of the range.
 * @param {string} pageText Text extracted for the range; may be empty.
 * @returns {string} Prompt, possibly with the page text appended.
 */
function withPageRangeTextContext(prompt, startPage, endPage, pageText) {
  if (!doclingDocument) return prompt;
  const body = pageText || "(No Docling text was available for this page range.)";
  return `${prompt}\n\nDOCLING DOCUMENT PAGES ${startPage}-${endPage}:\n${body}`;
}
8106
8427
  let classifyResult;
8107
8428
  if (resumed?.classifyResult && pipelineCtx.isPhaseComplete("classify")) {
8108
8429
  classifyResult = resumed.classifyResult;
@@ -8115,12 +8436,12 @@ function createExtractor(config) {
8115
8436
  const classifyResponse = await safeGenerateObject(
8116
8437
  generateObject,
8117
8438
  {
8118
- prompt: buildClassifyPrompt(),
8439
+ prompt: withFullDocumentTextContext(buildClassifyPrompt()),
8119
8440
  schema: ClassifyResultSchema,
8120
8441
  maxTokens: budget.maxTokens,
8121
8442
  taskKind: "extraction_classify",
8122
8443
  budgetDiagnostics: budget,
8123
- providerOptions: await getFullPdfProviderOptions()
8444
+ providerOptions: await getFullDocumentProviderOptions()
8124
8445
  },
8125
8446
  {
8126
8447
  fallback: { documentType: "policy", policyTypes: ["other"], confidence: 0 },
@@ -8165,12 +8486,12 @@ function createExtractor(config) {
8165
8486
  const formInventoryResponse = await safeGenerateObject(
8166
8487
  generateObject,
8167
8488
  {
8168
- prompt: buildFormInventoryPrompt(templateHints),
8489
+ prompt: withFullDocumentTextContext(buildFormInventoryPrompt(templateHints)),
8169
8490
  schema: FormInventorySchema,
8170
8491
  maxTokens: budget.maxTokens,
8171
8492
  taskKind: "extraction_form_inventory",
8172
8493
  budgetDiagnostics: budget,
8173
- providerOptions: await getFullPdfProviderOptions()
8494
+ providerOptions: await getFullDocumentProviderOptions()
8174
8495
  },
8175
8496
  {
8176
8497
  fallback: { forms: [] },
@@ -8213,18 +8534,24 @@ function createExtractor(config) {
8213
8534
  const pageMapResults = await Promise.all(
8214
8535
  pageMapChunks.map(
8215
8536
  ({ startPage, endPage }) => pageMapLimit(async () => {
8216
- const pagesPdf = await getPageRangePdf(startPage, endPage);
8537
+ const pagesPdf = doclingDocument ? void 0 : await getPageRangePdf(startPage, endPage);
8538
+ const pagesText = doclingDocument ? await getPageRangeText(startPage, endPage) : "";
8217
8539
  const budget = resolveBudget("extraction_page_map", 2048);
8218
8540
  const startedAt = Date.now();
8219
8541
  const mapResponse = await safeGenerateObject(
8220
8542
  generateObject,
8221
8543
  {
8222
- prompt: buildPageMapPrompt(templateHints, startPage, endPage, formInventoryHint),
8544
+ prompt: withPageRangeTextContext(
8545
+ buildPageMapPrompt(templateHints, startPage, endPage, formInventoryHint),
8546
+ startPage,
8547
+ endPage,
8548
+ pagesText
8549
+ ),
8223
8550
  schema: PageMapChunkSchema,
8224
8551
  maxTokens: budget.maxTokens,
8225
8552
  taskKind: "extraction_page_map",
8226
8553
  budgetDiagnostics: budget,
8227
- providerOptions: { ...activeProviderOptions, pdfBase64: pagesPdf }
8554
+ providerOptions: doclingDocument ? { ...activeProviderOptions, doclingText: pagesText, doclingPageRange: { startPage, endPage } } : { ...activeProviderOptions, pdfBase64: pagesPdf }
8228
8555
  },
8229
8556
  {
8230
8557
  fallback: {
@@ -8302,7 +8629,7 @@ function createExtractor(config) {
8302
8629
  }))
8303
8630
  ];
8304
8631
  onProgress?.(`Dispatching ${tasks.length} extractors...`);
8305
- const extractionPdfInput = await getPdfBase64ForExtraction();
8632
+ const extractionPdfInput = doclingDocument ? void 0 : await getPdfBase64ForExtraction();
8306
8633
  const extractorResults = await Promise.all(
8307
8634
  tasks.map(
8308
8635
  (task) => extractorLimit(async () => {
@@ -8313,7 +8640,8 @@ function createExtractor(config) {
8313
8640
  memory,
8314
8641
  completedPageRangePdfCache,
8315
8642
  getPageRangePdf,
8316
- convertPdfToImages ? getPageImages : void 0
8643
+ convertPdfToImages ? getPageImages : void 0,
8644
+ doclingDocument ? getPageRangeText : void 0
8317
8645
  );
8318
8646
  })
8319
8647
  )
@@ -8345,7 +8673,8 @@ function createExtractor(config) {
8345
8673
  providerOptions: activeProviderOptions,
8346
8674
  pageRangeCache: completedPageRangePdfCache,
8347
8675
  getPageRangePdf,
8348
- getPageImages: convertPdfToImages ? getPageImages : void 0
8676
+ getPageImages: convertPdfToImages ? getPageImages : void 0,
8677
+ getPageRangeText: doclingDocument ? getPageRangeText : void 0
8349
8678
  });
8350
8679
  trackUsage(supplementaryResult.usage, {
8351
8680
  taskKind: "extraction_focused",
@@ -8381,6 +8710,7 @@ function createExtractor(config) {
8381
8710
  concurrency,
8382
8711
  getPageRangePdf,
8383
8712
  getPageImages: convertPdfToImages ? getPageImages : void 0,
8713
+ getPageRangeText: doclingDocument ? getPageRangeText : void 0,
8384
8714
  providerOptions: activeProviderOptions,
8385
8715
  modelCapabilities,
8386
8716
  modelBudgetConstraints,
@@ -8429,12 +8759,12 @@ function createExtractor(config) {
8429
8759
  const reviewResponse = await safeGenerateObject(
8430
8760
  generateObject,
8431
8761
  {
8432
- prompt: buildReviewPrompt(template.required, extractedKeys, extractionSummary, pageMapSummary, extractorCatalog),
8762
+ prompt: withFullDocumentTextContext(buildReviewPrompt(template.required, extractedKeys, extractionSummary, pageMapSummary, extractorCatalog)),
8433
8763
  schema: ReviewResultSchema,
8434
8764
  maxTokens: budget.maxTokens,
8435
8765
  taskKind: "extraction_review",
8436
8766
  budgetDiagnostics: budget,
8437
- providerOptions: await getFullPdfProviderOptions()
8767
+ providerOptions: await getFullDocumentProviderOptions()
8438
8768
  },
8439
8769
  {
8440
8770
  fallback: {
@@ -8464,7 +8794,7 @@ function createExtractor(config) {
8464
8794
  break;
8465
8795
  }
8466
8796
  onProgress?.(`Review round ${round + 1}: dispatching ${reviewResponse.object.additionalTasks.length} follow-up extractors...`);
8467
- const extractionPdfInput = await getPdfBase64ForExtraction();
8797
+ const extractionPdfInput = doclingDocument ? void 0 : await getPdfBase64ForExtraction();
8468
8798
  const followUpResults = await Promise.all(
8469
8799
  reviewResponse.object.additionalTasks.map(
8470
8800
  (task) => extractorLimit(async () => {
@@ -8474,7 +8804,8 @@ function createExtractor(config) {
8474
8804
  memory,
8475
8805
  completedPageRangePdfCache,
8476
8806
  getPageRangePdf,
8477
- convertPdfToImages ? getPageImages : void 0
8807
+ convertPdfToImages ? getPageImages : void 0,
8808
+ doclingDocument ? getPageRangeText : void 0
8478
8809
  );
8479
8810
  })
8480
8811
  )
@@ -12473,6 +12804,7 @@ export {
12473
12804
  buildConfirmationSummaryPrompt,
12474
12805
  buildConversationMemoryGuidance,
12475
12806
  buildCoverageGapPrompt,
12807
+ buildDoclingProviderOptions,
12476
12808
  buildFieldExplanationPrompt,
12477
12809
  buildFieldExtractionPrompt,
12478
12810
  buildFlatPdfMappingPrompt,
@@ -12514,12 +12846,16 @@ export {
12514
12846
  fillAcroForm,
12515
12847
  generateNextMessage,
12516
12848
  getAcroFormFields,
12849
+ getDoclingPageRangeText,
12517
12850
  getExtractor,
12518
12851
  getFileIdentifier,
12519
12852
  getPdfPageCount,
12520
12853
  getTemplate,
12854
+ isDoclingExtractionInput,
12521
12855
  isFileReference,
12522
12856
  mergeQuestionAnswers,
12857
+ mergeSourceSpans,
12858
+ normalizeDoclingDocument,
12523
12859
  normalizeForMatch,
12524
12860
  orderSourceEvidence,
12525
12861
  overlayTextOnPdf,