@claritylabs/cl-sdk 1.1.0 → 1.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -256,6 +256,7 @@ __export(index_exports, {
256
256
  buildConfirmationSummaryPrompt: () => buildConfirmationSummaryPrompt,
257
257
  buildConversationMemoryGuidance: () => buildConversationMemoryGuidance,
258
258
  buildCoverageGapPrompt: () => buildCoverageGapPrompt,
259
+ buildDoclingProviderOptions: () => buildDoclingProviderOptions,
259
260
  buildFieldExplanationPrompt: () => buildFieldExplanationPrompt,
260
261
  buildFieldExtractionPrompt: () => buildFieldExtractionPrompt,
261
262
  buildFlatPdfMappingPrompt: () => buildFlatPdfMappingPrompt,
@@ -297,12 +298,16 @@ __export(index_exports, {
297
298
  fillAcroForm: () => fillAcroForm,
298
299
  generateNextMessage: () => generateNextMessage,
299
300
  getAcroFormFields: () => getAcroFormFields,
301
+ getDoclingPageRangeText: () => getDoclingPageRangeText,
300
302
  getExtractor: () => getExtractor,
301
303
  getFileIdentifier: () => getFileIdentifier,
302
304
  getPdfPageCount: () => getPdfPageCount,
303
305
  getTemplate: () => getTemplate,
306
+ isDoclingExtractionInput: () => isDoclingExtractionInput,
304
307
  isFileReference: () => isFileReference,
305
308
  mergeQuestionAnswers: () => mergeQuestionAnswers,
309
+ mergeSourceSpans: () => mergeSourceSpans,
310
+ normalizeDoclingDocument: () => normalizeDoclingDocument,
306
311
  normalizeForMatch: () => normalizeForMatch,
307
312
  orderSourceEvidence: () => orderSourceEvidence,
308
313
  overlayTextOnPdf: () => overlayTextOnPdf,
@@ -2794,6 +2799,254 @@ async function overlayTextOnPdf(pdfBytes, overlays) {
2794
2799
  return await pdfDoc.save();
2795
2800
  }
2796
2801
 
2802
+ // src/extraction/docling.ts
2803
/**
 * Type guard for Docling-based extraction input.
 *
 * @param {unknown} input - Candidate value handed to the extractor.
 * @returns {boolean} `true` only when `input` is an object whose `kind` is
 *   "docling_document" and whose `document` is a (truthy) object.
 */
function isDoclingExtractionInput(input) {
  if (!input || typeof input !== "object") return false;
  if (input.kind !== "docling_document") return false;
  const doc = input.document;
  return Boolean(doc) && typeof doc === "object";
}
2808
/**
 * Flattens a Docling document into ordered text units, per-page text and
 * source spans.
 *
 * Items are taken in body-tree order when the body yields any refs, otherwise
 * in the fallback collection order. Units without a page are filed under
 * page 1; each unit's text is appended to the page of its `pageStart` only.
 *
 * @param {object} document - Docling document JSON.
 * @param {{documentId: *, sourceKind?: string}} options - `sourceKind`
 *   defaults to "policy_pdf".
 * @returns {{pageCount: number, fullText: string, pageTexts: Map, units: Array, sourceSpans: Array}}
 */
function normalizeDoclingDocument(document, options) {
  const itemMap = buildItemMap(document);
  const bodyRefs = getOrderedBodyRefs(document, itemMap);

  let orderedEntries;
  if (bodyRefs.length > 0) {
    orderedEntries = bodyRefs.map((ref) => itemMap.get(ref)).filter((entry) => Boolean(entry));
  } else {
    orderedEntries = getFallbackOrderedItems(document, itemMap);
  }

  // Keep only items that normalize to non-blank text.
  const units = [];
  orderedEntries.forEach(({ ref, item }) => {
    const unit = normalizeItem(ref, item);
    if (unit && unit.text.trim()) units.push(unit);
  });

  const pageCount = inferPageCount(document, units);

  const pageTexts = /* @__PURE__ */ new Map();
  units.forEach((unit) => {
    const page = clampPage(unit.pageStart ?? 1, pageCount);
    pageTexts.set(page, appendText(pageTexts.get(page), unit.text));
  });

  // Pages without text are omitted from the concatenated document text.
  const pageSections = [];
  for (let pageNumber = 1; pageNumber <= pageCount; pageNumber += 1) {
    const text = pageTexts.get(pageNumber)?.trim();
    if (text) pageSections.push(`Page ${pageNumber}\n${text}`);
  }
  const fullText = pageSections.join("\n\n");

  const sourceKind = options.sourceKind ?? "policy_pdf";
  const sourceSpans = units.map((unit, index) => {
    const metadata = {
      sourceSystem: "docling",
      sourceUnit: "docling_item",
      doclingRef: unit.ref
    };
    if (unit.label) metadata.doclingLabel = unit.label;
    const span = buildSourceSpan(
      {
        documentId: options.documentId,
        sourceKind,
        text: unit.text,
        pageStart: unit.pageStart,
        pageEnd: unit.pageEnd,
        sectionId: unit.label,
        metadata
      },
      index
    );
    return {
      ...span,
      kind: "plain_text",
      bbox: unit.bboxes?.length ? unit.bboxes : void 0
    };
  });

  return { pageCount, fullText, pageTexts, units, sourceSpans };
}
2858
/**
 * Renders the text of an inclusive page range from a normalized Docling
 * document.
 *
 * Both bounds are clamped into the document's page range first. Pages with no
 * (non-blank) text are skipped; the rest are emitted as "Page N" sections
 * separated by blank lines.
 *
 * @param {{pageCount: number, pageTexts: Map<number, string>}} normalized
 * @param {number} startPage - First page, 1-based.
 * @param {number} endPage - Last page, 1-based.
 * @returns {string} Concatenated page sections, or "" when nothing matched.
 */
function getDoclingPageRangeText(normalized, startPage, endPage) {
  const first = clampPage(startPage, normalized.pageCount);
  const last = clampPage(endPage, normalized.pageCount);
  const sections = [];
  let page = first;
  while (page <= last) {
    const body = normalized.pageTexts.get(page)?.trim();
    if (body) sections.push(`Page ${page}\n${body}`);
    page += 1;
  }
  return sections.join("\n\n");
}
2871
/**
 * Returns a copy of the caller's provider options with the Docling full text
 * and page count attached.
 *
 * @param {{fullText: string, pageCount: number}} normalized
 * @param {object} [existingOptions] - Not mutated; may be omitted.
 * @returns {object} New options object; `doclingText` and `doclingPageCount`
 *   override any same-named keys from `existingOptions`.
 */
function buildDoclingProviderOptions(normalized, existingOptions) {
  const merged = { ...existingOptions };
  merged.doclingText = normalized.fullText;
  merged.doclingPageCount = normalized.pageCount;
  return merged;
}
2878
/**
 * De-duplicates source spans, keeping the first occurrence of each.
 *
 * Two spans are treated as the same when their document id, start page,
 * end page, section (or field path) and text hash all match. A missing
 * `textHash` is computed from the span text via `sourceSpanTextHash`.
 *
 * @param {Iterable<object>} spans
 * @returns {object[]} New array with later duplicates removed, order kept.
 */
function mergeSourceSpans(spans) {
  const identityOf = (span) => {
    const location = span.location;
    const parts = [
      span.documentId,
      span.pageStart ?? location?.startPage ?? location?.page ?? "na",
      span.pageEnd ?? location?.endPage ?? span.pageStart ?? "na",
      span.sectionId ?? location?.fieldPath ?? "na",
      span.textHash ?? sourceSpanTextHash(span.text)
    ];
    return parts.join(":");
  };

  const seenKeys = /* @__PURE__ */ new Set();
  const unique = [];
  for (const span of spans) {
    const key = identityOf(span);
    if (!seenKeys.has(key)) {
      seenKeys.add(key);
      unique.push(span);
    }
  }
  return unique;
}
2895
/**
 * Indexes every addressable item of a Docling document by its reference.
 *
 * Covers texts, tables, key/value items (snake_case or camelCase property)
 * and pictures, in that order.
 *
 * @param {object} document - Docling document JSON.
 * @returns {Map<string, {ref: string, item: object}>}
 */
function buildItemMap(document) {
  const itemsByRef = /* @__PURE__ */ new Map();
  const collections = [
    ["#/texts", document.texts],
    ["#/tables", document.tables],
    ["#/key_value_items", document.key_value_items ?? document.keyValueItems],
    ["#/pictures", document.pictures]
  ];
  for (const [baseRef, items] of collections) {
    addItems(itemsByRef, baseRef, items ?? []);
  }
  return itemsByRef;
}
2903
/**
 * Registers a collection of Docling items in `map`.
 *
 * Each item is keyed by its own self reference when it has one, otherwise by
 * a positional reference `${baseRef}/${index}`.
 *
 * @param {Map<string, {ref: string, item: object}>} map - Mutated in place.
 * @param {string} baseRef - Collection pointer, e.g. "#/texts".
 * @param {object[]} items
 */
function addItems(map, baseRef, items) {
  items.forEach((item, position) => {
    let ref = getSelfRef(item);
    if (ref == null) ref = `${baseRef}/${position}`;
    map.set(ref, { ref, item });
  });
}
2909
/**
 * Item ordering used when the body tree yields no references: all texts,
 * then tables, then key/value items, each in array order. Pictures are not
 * part of this fallback.
 *
 * @param {object} document - Docling document JSON.
 * @param {Map<string, {ref: string, item: object}>} itemMap
 * @returns {{ref: string, item: object}[]} Entries found in `itemMap`.
 */
function getFallbackOrderedItems(document, itemMap) {
  const refsFor = (items, baseRef) => (items ?? []).map((item, index) => getSelfRef(item) ?? `${baseRef}/${index}`);
  const orderedRefs = refsFor(document.texts, "#/texts").concat(
    refsFor(document.tables, "#/tables"),
    refsFor(document.key_value_items ?? document.keyValueItems, "#/key_value_items")
  );
  const entries = [];
  for (const ref of orderedRefs) {
    const entry = itemMap.get(ref);
    if (entry) entries.push(entry);
  }
  return entries;
}
2917
/**
 * Walks the document body tree depth-first and returns the references of all
 * known items in reading order, each at most once.
 *
 * Children may point at items (recorded, then descended into) or at groups
 * (descended into only). Every reference is expanded at most once, so a
 * malformed document whose `children` form a cycle terminates instead of
 * recursing until the stack overflows. Null children and children without a
 * usable reference are skipped.
 *
 * @param {object} document - Docling document JSON (`body`, `groups`).
 * @param {Map<string, {ref: string, item: object}>} itemMap - Items by ref.
 * @returns {string[]} Item refs in first-visit order.
 */
function getOrderedBodyRefs(document, itemMap) {
  const groupMap = /* @__PURE__ */ new Map();
  (document.groups ?? []).forEach((group, index) => {
    groupMap.set(getSelfRef(group) ?? `#/groups/${index}`, group);
  });
  const refs = [];
  // Tracks items AND groups: re-expanding either cannot add new refs, and
  // skipping them is what breaks reference cycles.
  const visited = /* @__PURE__ */ new Set();
  const visitNode = (node) => {
    for (const child of node?.children ?? []) {
      if (child == null) continue;
      const ref = getRef(child);
      if (!ref || visited.has(ref)) continue;
      visited.add(ref);
      const itemEntry = itemMap.get(ref);
      if (itemEntry) refs.push(ref);
      // Items win over groups when a ref is present in both maps.
      visitNode(itemEntry ? itemEntry.item : groupMap.get(ref));
    }
  };
  visitNode(document.body);
  return refs;
}
2946
/**
 * Converts one Docling item into a text unit.
 *
 * @param {string} ref - Reference the item is addressed by.
 * @param {object} item - Docling item (text, table, key/value...).
 * @returns {{ref: string, label: (string|undefined), text: string,
 *   pageStart: (number|undefined), pageEnd: (number|undefined),
 *   bboxes: (object[]|undefined)}|undefined} `undefined` when the item has
 *   no text. Page bounds are the min/max positive page numbers found in the
 *   item's provenance; `bboxes` is omitted when none could be built.
 */
function normalizeItem(ref, item) {
  const text = getItemText(item).trim();
  if (!text) return void 0;

  const provenance = item.prov ?? [];

  const pages = [];
  provenance.forEach((prov) => {
    const page = getPageNumber(prov);
    if (typeof page === "number" && page > 0) pages.push(page);
  });

  const bboxes = [];
  provenance.forEach((prov) => {
    const bbox = toSourceSpanBBox(prov);
    if (bbox) bboxes.push(bbox);
  });

  const hasPages = pages.length > 0;
  const label = typeof item.label === "string" ? item.label : void 0;
  return {
    ref,
    label,
    text,
    pageStart: hasPages ? Math.min(...pages) : void 0,
    pageEnd: hasPages ? Math.max(...pages) : void 0,
    bboxes: bboxes.length ? bboxes : void 0
  };
}
2962
/**
 * Picks the textual content of a Docling item.
 *
 * Preference order: non-blank `text`, non-blank `orig`, then a Markdown
 * rendering of the item's table `data`.
 *
 * @param {object} item
 * @returns {string} The chosen text (untrimmed), or "" when none exists.
 */
function getItemText(item) {
  for (const candidate of [item.text, item.orig]) {
    if (typeof candidate === "string" && candidate.trim()) return candidate;
  }
  return tableToMarkdown(item.data) || "";
}
2969
/**
 * Renders Docling table data as Markdown.
 *
 * Cells are read from `table_cells` (or `tableCells`) and placed on a grid by
 * their starting row/column offsets; a missing offset defaults to 0. Cells
 * whose offsets are not non-negative integers are ignored, because they
 * cannot be placed on the grid (previously they made the function throw).
 * Cells without text are ignored as well.
 *
 * NOTE(review): cell text is emitted verbatim; "|" or line breaks inside a
 * cell are not escaped.
 *
 * @param {unknown} data - The item's `data` payload.
 * @returns {string|undefined} A Markdown table whose first grid row is the
 *   header, a single " | "-joined line for one-row tables, or `undefined`
 *   when there are no usable cells.
 */
function tableToMarkdown(data) {
  const record = asRecord(data);
  const cells = Array.isArray(record?.table_cells) ? record.table_cells : Array.isArray(record?.tableCells) ? record.tableCells : void 0;
  if (!cells) return void 0;
  const isGridIndex = (value) => Number.isInteger(value) && value >= 0;
  const parsedCells = cells.map((cell) => asRecord(cell)).filter((cell) => Boolean(cell)).map((cell) => ({
    // Only numeric candidates are considered; non-numeric values are skipped.
    row: firstNumber2([cell.start_row_offset, cell.row_header, cell.row, cell.rowIndex]) ?? 0,
    col: firstNumber2([cell.start_col_offset, cell.col, cell.colIndex]) ?? 0,
    text: firstString([cell.text, cell.orig, cell.content])
  })).filter((cell) => cell.text && isGridIndex(cell.row) && isGridIndex(cell.col));
  if (parsedCells.length === 0) return void 0;
  const maxRow = Math.max(...parsedCells.map((cell) => cell.row));
  const maxCol = Math.max(...parsedCells.map((cell) => cell.col));
  const rows = Array.from({ length: maxRow + 1 }, () => Array.from({ length: maxCol + 1 }, () => ""));
  for (const cell of parsedCells) {
    rows[cell.row][cell.col] = cell.text;
  }
  if (rows.length === 1) return rows[0].filter(Boolean).join(" | ");
  const header = rows[0];
  const separator = header.map(() => "---");
  return [header, separator, ...rows.slice(1)].map((row) => `| ${row.map((value) => value.trim()).join(" | ")} |`).join("\n");
}
2990
/**
 * Determines how many pages the document has (never less than 1).
 *
 * - `document.pages` as an array: its length.
 * - `document.pages` as an object: the largest numeric key, or the number of
 *   keys when no key is a positive number.
 * - Otherwise: the highest page referenced by any unit.
 *
 * @param {object} document - Docling document JSON.
 * @param {{pageStart?: number, pageEnd?: number}[]} units
 * @returns {number}
 */
function inferPageCount(document, units) {
  const { pages } = document;
  if (Array.isArray(pages)) return Math.max(1, pages.length);

  if (pages && typeof pages === "object") {
    const keys = Object.keys(pages);
    let numericMax = 0;
    for (const key of keys) {
      const value = Number(key);
      if (Number.isFinite(value) && value > numericMax) numericMax = value;
    }
    return Math.max(1, numericMax || keys.length);
  }

  let highest = 1;
  for (const unit of units) {
    highest = Math.max(highest, unit.pageStart ?? 0, unit.pageEnd ?? 0);
  }
  return highest;
}
3000
/**
 * Reads an item's own reference, accepting snake_case or camelCase naming.
 *
 * @param {object} value - Docling item or group.
 * @returns {*} `self_ref` when set, otherwise `selfRef` (possibly undefined).
 */
function getSelfRef(value) {
  const snakeCaseRef = value.self_ref;
  return snakeCaseRef == null ? value.selfRef : snakeCaseRef;
}
3003
/**
 * Extracts the target reference from a child pointer.
 *
 * Accepts a bare string or an object carrying `$ref` / `ref`. A `null` or
 * `undefined` pointer yields `undefined` instead of throwing, so callers can
 * skip it like any other child without a reference.
 *
 * @param {string|object|null|undefined} value
 * @returns {*} The reference, or `undefined` when none is present.
 */
function getRef(value) {
  if (typeof value === "string") return value;
  return value?.$ref ?? value?.ref;
}
3007
/**
 * Reads the page number from a provenance record.
 *
 * @param {object} prov
 * @returns {*} First non-nullish of `page_no`, `pageNo`, `page`.
 */
function getPageNumber(prov) {
  for (const key of ["page_no", "pageNo"]) {
    const candidate = prov[key];
    if (candidate != null) return candidate;
  }
  return prov.page;
}
3010
/**
 * Converts a provenance record's bounding box into `{page, x, y, width, height}`.
 *
 * The box may be given as x/y/width/height or as edges (l/t/r/b or
 * left/top/right/bottom). When the height has to be derived from the top and
 * bottom edges, its absolute value is used: boxes expressed with a
 * bottom-left origin have `t > b`, which previously produced a negative
 * height. `y` stays the top-edge coordinate as supplied.
 *
 * NOTE(review): coordinates are not converted between origins (that would
 * need the page height); confirm what downstream consumers expect.
 *
 * @param {object} prov - Provenance record with a page number and `bbox`.
 * @returns {{page: number, x: number, y: number, width: number, height: number}|undefined}
 *   `undefined` when the page, the box, its origin point or a dimension is
 *   missing.
 */
function toSourceSpanBBox(prov) {
  const page = getPageNumber(prov);
  const bbox = asRecord(prov.bbox);
  if (!page || !bbox) return void 0;
  const x = firstNumber2([bbox.x, bbox.l, bbox.left]);
  const y = firstNumber2([bbox.y, bbox.t, bbox.top]);
  if (x == null || y == null) return void 0;
  const right = firstNumber2([bbox.r, bbox.right]);
  const bottom = firstNumber2([bbox.b, bbox.bottom]);
  const width = firstNumber2([bbox.width]) ?? (right != null ? right - x : void 0);
  // Explicit heights are trusted as-is; only the derived value is normalized.
  const height = firstNumber2([bbox.height]) ?? (bottom != null ? Math.abs(bottom - y) : void 0);
  if (width == null || height == null) return void 0;
  return { page, x, y, width, height };
}
3026
/**
 * Restricts a page number to the range 1..pageCount.
 *
 * @param {number} page
 * @param {number} pageCount
 * @returns {number}
 */
function clampPage(page, pageCount) {
  const cappedAtLastPage = Math.min(pageCount, page);
  return Math.max(1, cappedAtLastPage);
}
3029
/**
 * Appends a text block to previously accumulated text, separated by one
 * blank line.
 *
 * @param {string|undefined} existing - Text collected so far, if any.
 * @param {string} next - Block to add.
 * @returns {string} `next` alone when nothing was collected yet.
 */
function appendText(existing, next) {
  if (!existing) return next;
  return `${existing}\n\n${next}`;
}
3034
/**
 * Narrows a value to a non-array object.
 *
 * @param {unknown} value
 * @returns {object|undefined} `value` itself when it is a truthy, non-array
 *   object; otherwise `undefined`.
 */
function asRecord(value) {
  const isObjectLike = value && typeof value === "object" && !Array.isArray(value);
  return isObjectLike ? value : void 0;
}
3037
/**
 * Returns the first non-blank string among the candidates, trimmed.
 *
 * @param {unknown[]} values - Candidates in priority order.
 * @returns {string} The trimmed match, or "" when there is none.
 */
function firstString(values) {
  const match = values.find((value) => typeof value === "string" && value.trim());
  return match === void 0 ? "" : match.trim();
}
3043
/**
 * Returns the first finite number among the candidates.
 *
 * @param {unknown[]} values - Candidates in priority order.
 * @returns {number|undefined} The match; `undefined` when none is a finite
 *   number (strings, booleans, NaN and infinities are skipped).
 */
function firstNumber2(values) {
  return values.find((value) => typeof value === "number" && Number.isFinite(value));
}
3049
+
2797
3050
  // src/extraction/extractor.ts
2798
3051
  function sourceSpansForPageRange(providerOptions, startPage, endPage) {
2799
3052
  const sourceSpans = providerOptions?.sourceSpans;
@@ -2842,15 +3095,31 @@ async function runExtractor(params) {
2842
3095
  } = params;
2843
3096
  const extractorProviderOptions = { ...providerOptions };
2844
3097
  let fullPrompt;
2845
- const needsPdfBase64 = convertPdfToImages && !params.getPageImages || !convertPdfToImages && !params.getPageRangePdf;
2846
- const pdfBase64 = needsPdfBase64 ? await pdfInputToBase64(pdfInput) : void 0;
2847
- if (convertPdfToImages) {
3098
+ if (params.getPageRangeText) {
3099
+ const pageText = await params.getPageRangeText(startPage, endPage);
3100
+ extractorProviderOptions.doclingText = pageText;
3101
+ extractorProviderOptions.doclingPageRange = { startPage, endPage };
3102
+ fullPrompt = `${prompt}
3103
+
3104
+ [Document pages ${startPage}-${endPage} are provided below as Docling-extracted text.]
3105
+
3106
+ ${pageText || "(No Docling text was available for this page range.)"}`;
3107
+ } else if (convertPdfToImages) {
3108
+ if (!pdfInput) {
3109
+ throw new Error("pdfInput is required when extracting page images.");
3110
+ }
3111
+ const needsPdfBase64 = !params.getPageImages;
3112
+ const pdfBase64 = needsPdfBase64 ? await pdfInputToBase64(pdfInput) : void 0;
2848
3113
  const images = params.getPageImages ? await params.getPageImages(startPage, endPage) : await convertPdfToImages(pdfBase64, startPage, endPage);
2849
3114
  extractorProviderOptions.images = images;
2850
3115
  fullPrompt = `${prompt}
2851
3116
 
2852
3117
  [Document pages ${startPage}-${endPage} are provided as images.]`;
2853
3118
  } else {
3119
+ if (!pdfInput) {
3120
+ throw new Error("pdfInput is required when extracting page PDFs.");
3121
+ }
3122
+ const pdfBase64 = params.getPageRangePdf ? void 0 : await pdfInputToBase64(pdfInput);
2854
3123
  const cacheKey = `${startPage}-${endPage}`;
2855
3124
  const cachedPagesPdf = pageRangeCache?.get(cacheKey);
2856
3125
  const pagesPdf = cachedPagesPdf ?? (params.getPageRangePdf ? await params.getPageRangePdf(startPage, endPage) : await extractPageRange(pdfBase64, startPage, endPage));
@@ -3890,7 +4159,7 @@ function formatAddress(addr) {
3890
4159
  function asRecordArray(value) {
3891
4160
  return Array.isArray(value) ? value.filter((item) => Boolean(item) && typeof item === "object" && !Array.isArray(item)) : [];
3892
4161
  }
3893
- function firstString(item, keys) {
4162
+ function firstString2(item, keys) {
3894
4163
  for (const key of keys) {
3895
4164
  const value = item[key];
3896
4165
  if (typeof value === "string" && value.trim()) return value;
@@ -4247,32 +4516,32 @@ ${exc.content}`.trim(), {
4247
4516
  );
4248
4517
  });
4249
4518
  asRecordArray(extendedDoc.definitions).forEach((definition, i) => {
4250
- const term = firstString(definition, ["term", "name", "title"]) ?? `Definition ${i + 1}`;
4251
- const body = firstString(definition, ["definition", "content", "text", "meaning"]);
4519
+ const term = firstString2(definition, ["term", "name", "title"]) ?? `Definition ${i + 1}`;
4520
+ const body = firstString2(definition, ["definition", "content", "text", "meaning"]);
4252
4521
  pushChunk(
4253
4522
  `definition:${i}`,
4254
4523
  "definition",
4255
4524
  lines([
4256
4525
  `Definition: ${term}`,
4257
4526
  body,
4258
- firstString(definition, ["originalContent", "source"]) ? `Source: ${firstString(definition, ["originalContent", "source"])}` : null
4527
+ firstString2(definition, ["originalContent", "source"]) ? `Source: ${firstString2(definition, ["originalContent", "source"])}` : null
4259
4528
  ]),
4260
4529
  {
4261
4530
  term,
4262
- formNumber: firstString(definition, ["formNumber"]),
4263
- formTitle: firstString(definition, ["formTitle"]),
4531
+ formNumber: firstString2(definition, ["formNumber"]),
4532
+ formTitle: firstString2(definition, ["formTitle"]),
4264
4533
  pageNumber: typeof definition.pageNumber === "number" ? definition.pageNumber : void 0,
4265
- sectionRef: firstString(definition, ["sectionRef", "sectionTitle"]),
4534
+ sectionRef: firstString2(definition, ["sectionRef", "sectionTitle"]),
4266
4535
  documentType: doc.type
4267
4536
  }
4268
4537
  );
4269
4538
  });
4270
4539
  const coveredReasons = asRecordArray(extendedDoc.coveredReasons ?? extendedDoc.covered_reasons);
4271
4540
  coveredReasons.forEach((coveredReason, i) => {
4272
- const title = firstString(coveredReason, ["title", "name", "reason", "peril", "cause"]) ?? `Covered Reason ${i + 1}`;
4273
- const coverageName = firstString(coveredReason, ["coverageName", "coverage", "coveragePart"]);
4274
- const reasonNumber = firstString(coveredReason, ["reasonNumber", "number"]);
4275
- const body = firstString(coveredReason, ["content", "description", "text", "coverageGrant"]);
4541
+ const title = firstString2(coveredReason, ["title", "name", "reason", "peril", "cause"]) ?? `Covered Reason ${i + 1}`;
4542
+ const coverageName = firstString2(coveredReason, ["coverageName", "coverage", "coveragePart"]);
4543
+ const reasonNumber = firstString2(coveredReason, ["reasonNumber", "number"]);
4544
+ const body = firstString2(coveredReason, ["content", "description", "text", "coverageGrant"]);
4276
4545
  pushChunk(
4277
4546
  `covered_reason:${i}`,
4278
4547
  "covered_reason",
@@ -4281,16 +4550,16 @@ ${exc.content}`.trim(), {
4281
4550
  reasonNumber ? `Reason Number: ${reasonNumber}` : null,
4282
4551
  `Covered Reason: ${title}`,
4283
4552
  body,
4284
- firstString(coveredReason, ["originalContent", "source"]) ? `Source: ${firstString(coveredReason, ["originalContent", "source"])}` : null
4553
+ firstString2(coveredReason, ["originalContent", "source"]) ? `Source: ${firstString2(coveredReason, ["originalContent", "source"])}` : null
4285
4554
  ]),
4286
4555
  {
4287
4556
  coverageName,
4288
4557
  reasonNumber,
4289
4558
  title,
4290
- formNumber: firstString(coveredReason, ["formNumber"]),
4291
- formTitle: firstString(coveredReason, ["formTitle"]),
4559
+ formNumber: firstString2(coveredReason, ["formNumber"]),
4560
+ formTitle: firstString2(coveredReason, ["formTitle"]),
4292
4561
  pageNumber: typeof coveredReason.pageNumber === "number" ? coveredReason.pageNumber : void 0,
4293
- sectionRef: firstString(coveredReason, ["sectionRef", "sectionTitle"]),
4562
+ sectionRef: firstString2(coveredReason, ["sectionRef", "sectionTitle"]),
4294
4563
  documentType: doc.type
4295
4564
  }
4296
4565
  );
@@ -4310,10 +4579,10 @@ ${exc.content}`.trim(), {
4310
4579
  reasonNumber,
4311
4580
  title,
4312
4581
  conditionIndex,
4313
- formNumber: firstString(coveredReason, ["formNumber"]),
4314
- formTitle: firstString(coveredReason, ["formTitle"]),
4582
+ formNumber: firstString2(coveredReason, ["formNumber"]),
4583
+ formTitle: firstString2(coveredReason, ["formTitle"]),
4315
4584
  pageNumber: typeof coveredReason.pageNumber === "number" ? coveredReason.pageNumber : void 0,
4316
- sectionRef: firstString(coveredReason, ["sectionRef", "sectionTitle"]),
4585
+ sectionRef: firstString2(coveredReason, ["sectionRef", "sectionTitle"]),
4317
4586
  documentType: doc.type
4318
4587
  }
4319
4588
  );
@@ -6784,21 +7053,21 @@ Return JSON only.`;
6784
7053
  }
6785
7054
 
6786
7055
  // src/prompts/extractors/index.ts
6787
- function asRecord(data) {
7056
+ function asRecord2(data) {
6788
7057
  return data && typeof data === "object" ? data : void 0;
6789
7058
  }
6790
7059
  function getSections2(data) {
6791
- const sections = asRecord(data)?.sections;
7060
+ const sections = asRecord2(data)?.sections;
6792
7061
  return Array.isArray(sections) ? sections : [];
6793
7062
  }
6794
7063
  function isCoveredReasonsEmpty(data) {
6795
- const record = asRecord(data);
7064
+ const record = asRecord2(data);
6796
7065
  if (!record) return true;
6797
7066
  const coveredReasons = Array.isArray(record.coveredReasons) ? record.coveredReasons : Array.isArray(record.covered_reasons) ? record.covered_reasons : [];
6798
7067
  return coveredReasons.length === 0;
6799
7068
  }
6800
7069
  function isDefinitionsEmpty(data) {
6801
- const definitions = asRecord(data)?.definitions;
7070
+ const definitions = asRecord2(data)?.definitions;
6802
7071
  return !Array.isArray(definitions) || definitions.length === 0;
6803
7072
  }
6804
7073
  function sectionLooksLikeCoveredReason(section) {
@@ -7032,6 +7301,14 @@ function decideReferentialResolutionAction(params) {
7032
7301
  }
7033
7302
 
7034
7303
  // src/extraction/resolve-referential.ts
7304
/**
 * Builds the prompt suffix that carries the Docling document text.
 *
 * @param {{doclingText?: unknown}|undefined} providerOptions
 * @returns {string} A blank line followed by a "DOCLING DOCUMENT TEXT:"
 *   section, or "" when `doclingText` is not a non-blank string.
 */
function formatDoclingTextContext(providerOptions) {
  const text = providerOptions?.doclingText;
  const hasText = typeof text === "string" && text.trim() !== "";
  return hasText ? `\n\nDOCLING DOCUMENT TEXT:\n${text}` : "";
}
7035
7312
  function parseReferenceTarget(text) {
7036
7313
  if (typeof text !== "string") return void 0;
7037
7314
  const normalized = text.trim();
@@ -7113,12 +7390,12 @@ Return the page range (1-indexed) where this section is located. If the section
7113
7390
 
7114
7391
  If you cannot find the section, return startPage: 0 and endPage: 0.
7115
7392
 
7116
- Return JSON only.`,
7393
+ Return JSON only.${formatDoclingTextContext(providerOptions)}`,
7117
7394
  schema: PageLocationSchema,
7118
7395
  maxTokens: budget.maxTokens,
7119
7396
  taskKind: "extraction_referential_lookup",
7120
7397
  budgetDiagnostics: budget,
7121
- providerOptions: await buildPdfProviderOptions(pdfInput, providerOptions)
7398
+ providerOptions: pdfInput ? await buildPdfProviderOptions(pdfInput, providerOptions) : providerOptions
7122
7399
  },
7123
7400
  {
7124
7401
  fallback: { startPage: 0, endPage: 0 },
@@ -7152,6 +7429,7 @@ async function resolveReferentialCoverages(params) {
7152
7429
  convertPdfToImages,
7153
7430
  getPageRangePdf,
7154
7431
  getPageImages,
7432
+ getPageRangeText,
7155
7433
  concurrency = 2,
7156
7434
  providerOptions,
7157
7435
  modelCapabilities,
@@ -7263,6 +7541,7 @@ async function resolveReferentialCoverages(params) {
7263
7541
  convertPdfToImages,
7264
7542
  getPageRangePdf,
7265
7543
  getPageImages,
7544
+ getPageRangeText,
7266
7545
  maxTokens: budget.maxTokens,
7267
7546
  taskKind: "extraction_referential_lookup",
7268
7547
  budgetDiagnostics: budget,
@@ -7358,6 +7637,7 @@ async function runFocusedExtractorWithFallback(params) {
7358
7637
  pageRangeCache,
7359
7638
  getPageRangePdf,
7360
7639
  getPageImages,
7640
+ getPageRangeText,
7361
7641
  trackUsage,
7362
7642
  resolveBudget,
7363
7643
  log
@@ -7387,7 +7667,8 @@ async function runFocusedExtractorWithFallback(params) {
7387
7667
  providerOptions,
7388
7668
  pageRangeCache,
7389
7669
  getPageRangePdf,
7390
- getPageImages
7670
+ getPageImages,
7671
+ getPageRangeText
7391
7672
  });
7392
7673
  trackUsage(result.usage, {
7393
7674
  taskKind,
@@ -7432,7 +7713,8 @@ async function runFocusedExtractorWithFallback(params) {
7432
7713
  providerOptions,
7433
7714
  pageRangeCache,
7434
7715
  getPageRangePdf,
7435
- getPageImages
7716
+ getPageImages,
7717
+ getPageRangeText
7436
7718
  });
7437
7719
  trackUsage(fallbackResult.usage, {
7438
7720
  taskKind,
@@ -8276,7 +8558,7 @@ function createExtractor(config) {
8276
8558
  }
8277
8559
  return lines.length > 0 ? lines.join("\n") : "";
8278
8560
  }
8279
- async function runFocusedExtractorTask(task, pdfInput, memory, pageRangeCache, getPageRangePdf, getPageImages) {
8561
+ async function runFocusedExtractorTask(task, pdfInput, memory, pageRangeCache, getPageRangePdf, getPageImages, getPageRangeText) {
8280
8562
  if (task.extractorName === "supplementary") {
8281
8563
  const alreadyExtractedSummary = buildAlreadyExtractedSummary(memory);
8282
8564
  const budget = resolveBudget("extraction_focused", 4096);
@@ -8296,7 +8578,8 @@ function createExtractor(config) {
8296
8578
  providerOptions: activeProviderOptions,
8297
8579
  pageRangeCache,
8298
8580
  getPageRangePdf,
8299
- getPageImages
8581
+ getPageImages,
8582
+ getPageRangeText
8300
8583
  });
8301
8584
  trackUsage(result.usage, {
8302
8585
  taskKind: "extraction_focused",
@@ -8315,6 +8598,7 @@ function createExtractor(config) {
8315
8598
  pageRangeCache,
8316
8599
  getPageRangePdf,
8317
8600
  getPageImages,
8601
+ getPageRangeText,
8318
8602
  trackUsage,
8319
8603
  resolveBudget,
8320
8604
  log
@@ -8330,8 +8614,14 @@ function createExtractor(config) {
8330
8614
  if (extractorPages.size === 0) return "No page assignments available.";
8331
8615
  return [...extractorPages.entries()].map(([extractorName, pages]) => `${extractorName}: ${pages.length} page(s), pages ${pages.join(", ")}`).join("\n");
8332
8616
  }
8333
- async function extract(pdfInput, documentId, options) {
8617
+ async function extract(input, documentId, options) {
8334
8618
  const id = documentId ?? `doc-${Date.now()}`;
8619
+ const isDoclingInput = isDoclingExtractionInput(input);
8620
+ const pdfInput = isDoclingInput ? void 0 : input;
8621
+ const doclingDocument = isDoclingInput ? normalizeDoclingDocument(input.document, {
8622
+ documentId: id,
8623
+ sourceKind: input.sourceKind
8624
+ }) : void 0;
8335
8625
  const memory = /* @__PURE__ */ new Map();
8336
8626
  totalUsage = { inputTokens: 0, outputTokens: 0 };
8337
8627
  modelCalls = 0;
@@ -8341,7 +8631,10 @@ function createExtractor(config) {
8341
8631
  modelCalls: [],
8342
8632
  totalModelCallDurationMs: 0
8343
8633
  };
8344
- const sourceSpans = options?.sourceSpans ?? [];
8634
+ const sourceSpans = mergeSourceSpans([
8635
+ ...doclingDocument?.sourceSpans ?? [],
8636
+ ...options?.sourceSpans ?? []
8637
+ ]);
8345
8638
  const sourceChunks = sourceSpans.length ? chunkSourceSpans(sourceSpans) : [];
8346
8639
  activeProviderOptions = sourceSpans.length ? { ...providerOptions, sourceSpans, sourceChunks } : providerOptions;
8347
8640
  if (sourceStore && sourceSpans.length > 0) {
@@ -8370,24 +8663,40 @@ function createExtractor(config) {
8370
8663
  let fullPdfProviderOptionsPromise;
8371
8664
  let pageCountPromise;
8372
8665
  async function getPdfBase64ForExtraction() {
8666
+ if (!pdfInput) {
8667
+ throw new Error("PDF input is not available for Docling extraction.");
8668
+ }
8373
8669
  if (pdfBase64Cache === void 0) {
8374
8670
  pdfBase64Cache = await pdfInputToBase64(pdfInput);
8375
8671
  }
8376
8672
  return pdfBase64Cache;
8377
8673
  }
8378
8674
  async function getCachedPageCount() {
8675
+ if (doclingDocument) return doclingDocument.pageCount;
8676
+ if (!pdfInput) {
8677
+ throw new Error("PDF input is required to read page count.");
8678
+ }
8379
8679
  if (!pageCountPromise) {
8380
8680
  pageCountPromise = getPdfSlicer().then((slicer) => slicer.getPageCount()).catch(() => getPdfPageCount(pdfInput));
8381
8681
  }
8382
8682
  return pageCountPromise;
8383
8683
  }
8384
- async function getFullPdfProviderOptions() {
8684
+ async function getFullDocumentProviderOptions() {
8685
+ if (doclingDocument) {
8686
+ return buildDoclingProviderOptions(doclingDocument, activeProviderOptions);
8687
+ }
8688
+ if (!pdfInput) {
8689
+ return activeProviderOptions ?? {};
8690
+ }
8385
8691
  if (!fullPdfProviderOptionsPromise) {
8386
8692
  fullPdfProviderOptionsPromise = buildPdfProviderOptions(pdfInput, activeProviderOptions);
8387
8693
  }
8388
8694
  return fullPdfProviderOptionsPromise;
8389
8695
  }
8390
8696
  async function getPdfSlicer() {
8697
+ if (!pdfInput) {
8698
+ throw new Error("PDF input is not available for Docling extraction.");
8699
+ }
8391
8700
  if (!pdfSlicerPromise) {
8392
8701
  pdfSlicerPromise = createPdfPageSlicer(pdfInput);
8393
8702
  }
@@ -8426,6 +8735,23 @@ function createExtractor(config) {
8426
8735
  pageRangeImageCache.set(cacheKey, promise);
8427
8736
  return promise;
8428
8737
  }
8738
/**
 * Text of an inclusive page range taken from the Docling document.
 *
 * @param {number} startPage
 * @param {number} endPage
 * @returns {Promise<string>} "" when no Docling document is in use.
 */
async function getPageRangeText(startPage, endPage) {
  if (!doclingDocument) return "";
  return getDoclingPageRangeText(doclingDocument, startPage, endPage);
}
8741
/**
 * Appends the whole Docling document text to a prompt.
 *
 * @param {string} prompt
 * @returns {string} The prompt unchanged when no Docling document is in use;
 *   otherwise the prompt followed by a "DOCLING DOCUMENT TEXT:" section.
 */
function withFullDocumentTextContext(prompt) {
  if (doclingDocument) {
    return `${prompt}\n\nDOCLING DOCUMENT TEXT:\n${doclingDocument.fullText}`;
  }
  return prompt;
}
8748
/**
 * Appends the Docling text of a page range to a prompt.
 *
 * @param {string} prompt
 * @param {number} startPage
 * @param {number} endPage
 * @param {string} pageText - Text for the range; a placeholder sentence is
 *   used when it is empty.
 * @returns {string} The prompt unchanged when no Docling document is in use.
 */
function withPageRangeTextContext(prompt, startPage, endPage, pageText) {
  if (doclingDocument) {
    const body = pageText || "(No Docling text was available for this page range.)";
    return `${prompt}\n\nDOCLING DOCUMENT PAGES ${startPage}-${endPage}:\n${body}`;
  }
  return prompt;
}
8429
8755
  let classifyResult;
8430
8756
  if (resumed?.classifyResult && pipelineCtx.isPhaseComplete("classify")) {
8431
8757
  classifyResult = resumed.classifyResult;
@@ -8438,12 +8764,12 @@ function createExtractor(config) {
8438
8764
  const classifyResponse = await safeGenerateObject(
8439
8765
  generateObject,
8440
8766
  {
8441
- prompt: buildClassifyPrompt(),
8767
+ prompt: withFullDocumentTextContext(buildClassifyPrompt()),
8442
8768
  schema: ClassifyResultSchema,
8443
8769
  maxTokens: budget.maxTokens,
8444
8770
  taskKind: "extraction_classify",
8445
8771
  budgetDiagnostics: budget,
8446
- providerOptions: await getFullPdfProviderOptions()
8772
+ providerOptions: await getFullDocumentProviderOptions()
8447
8773
  },
8448
8774
  {
8449
8775
  fallback: { documentType: "policy", policyTypes: ["other"], confidence: 0 },
@@ -8488,12 +8814,12 @@ function createExtractor(config) {
8488
8814
  const formInventoryResponse = await safeGenerateObject(
8489
8815
  generateObject,
8490
8816
  {
8491
- prompt: buildFormInventoryPrompt(templateHints),
8817
+ prompt: withFullDocumentTextContext(buildFormInventoryPrompt(templateHints)),
8492
8818
  schema: FormInventorySchema,
8493
8819
  maxTokens: budget.maxTokens,
8494
8820
  taskKind: "extraction_form_inventory",
8495
8821
  budgetDiagnostics: budget,
8496
- providerOptions: await getFullPdfProviderOptions()
8822
+ providerOptions: await getFullDocumentProviderOptions()
8497
8823
  },
8498
8824
  {
8499
8825
  fallback: { forms: [] },
@@ -8536,18 +8862,24 @@ function createExtractor(config) {
8536
8862
  const pageMapResults = await Promise.all(
8537
8863
  pageMapChunks.map(
8538
8864
  ({ startPage, endPage }) => pageMapLimit(async () => {
8539
- const pagesPdf = await getPageRangePdf(startPage, endPage);
8865
+ const pagesPdf = doclingDocument ? void 0 : await getPageRangePdf(startPage, endPage);
8866
+ const pagesText = doclingDocument ? await getPageRangeText(startPage, endPage) : "";
8540
8867
  const budget = resolveBudget("extraction_page_map", 2048);
8541
8868
  const startedAt = Date.now();
8542
8869
  const mapResponse = await safeGenerateObject(
8543
8870
  generateObject,
8544
8871
  {
8545
- prompt: buildPageMapPrompt(templateHints, startPage, endPage, formInventoryHint),
8872
+ prompt: withPageRangeTextContext(
8873
+ buildPageMapPrompt(templateHints, startPage, endPage, formInventoryHint),
8874
+ startPage,
8875
+ endPage,
8876
+ pagesText
8877
+ ),
8546
8878
  schema: PageMapChunkSchema,
8547
8879
  maxTokens: budget.maxTokens,
8548
8880
  taskKind: "extraction_page_map",
8549
8881
  budgetDiagnostics: budget,
8550
- providerOptions: { ...activeProviderOptions, pdfBase64: pagesPdf }
8882
+ providerOptions: doclingDocument ? { ...activeProviderOptions, doclingText: pagesText, doclingPageRange: { startPage, endPage } } : { ...activeProviderOptions, pdfBase64: pagesPdf }
8551
8883
  },
8552
8884
  {
8553
8885
  fallback: {
@@ -8625,7 +8957,7 @@ function createExtractor(config) {
8625
8957
  }))
8626
8958
  ];
8627
8959
  onProgress?.(`Dispatching ${tasks.length} extractors...`);
8628
- const extractionPdfInput = await getPdfBase64ForExtraction();
8960
+ const extractionPdfInput = doclingDocument ? void 0 : await getPdfBase64ForExtraction();
8629
8961
  const extractorResults = await Promise.all(
8630
8962
  tasks.map(
8631
8963
  (task) => extractorLimit(async () => {
@@ -8636,7 +8968,8 @@ function createExtractor(config) {
8636
8968
  memory,
8637
8969
  completedPageRangePdfCache,
8638
8970
  getPageRangePdf,
8639
- convertPdfToImages ? getPageImages : void 0
8971
+ convertPdfToImages ? getPageImages : void 0,
8972
+ doclingDocument ? getPageRangeText : void 0
8640
8973
  );
8641
8974
  })
8642
8975
  )
@@ -8668,7 +9001,8 @@ function createExtractor(config) {
8668
9001
  providerOptions: activeProviderOptions,
8669
9002
  pageRangeCache: completedPageRangePdfCache,
8670
9003
  getPageRangePdf,
8671
- getPageImages: convertPdfToImages ? getPageImages : void 0
9004
+ getPageImages: convertPdfToImages ? getPageImages : void 0,
9005
+ getPageRangeText: doclingDocument ? getPageRangeText : void 0
8672
9006
  });
8673
9007
  trackUsage(supplementaryResult.usage, {
8674
9008
  taskKind: "extraction_focused",
@@ -8704,6 +9038,7 @@ function createExtractor(config) {
8704
9038
  concurrency,
8705
9039
  getPageRangePdf,
8706
9040
  getPageImages: convertPdfToImages ? getPageImages : void 0,
9041
+ getPageRangeText: doclingDocument ? getPageRangeText : void 0,
8707
9042
  providerOptions: activeProviderOptions,
8708
9043
  modelCapabilities,
8709
9044
  modelBudgetConstraints,
@@ -8752,12 +9087,12 @@ function createExtractor(config) {
8752
9087
  const reviewResponse = await safeGenerateObject(
8753
9088
  generateObject,
8754
9089
  {
8755
- prompt: buildReviewPrompt(template.required, extractedKeys, extractionSummary, pageMapSummary, extractorCatalog),
9090
+ prompt: withFullDocumentTextContext(buildReviewPrompt(template.required, extractedKeys, extractionSummary, pageMapSummary, extractorCatalog)),
8756
9091
  schema: ReviewResultSchema,
8757
9092
  maxTokens: budget.maxTokens,
8758
9093
  taskKind: "extraction_review",
8759
9094
  budgetDiagnostics: budget,
8760
- providerOptions: await getFullPdfProviderOptions()
9095
+ providerOptions: await getFullDocumentProviderOptions()
8761
9096
  },
8762
9097
  {
8763
9098
  fallback: {
@@ -8787,7 +9122,7 @@ function createExtractor(config) {
8787
9122
  break;
8788
9123
  }
8789
9124
  onProgress?.(`Review round ${round + 1}: dispatching ${reviewResponse.object.additionalTasks.length} follow-up extractors...`);
8790
- const extractionPdfInput = await getPdfBase64ForExtraction();
9125
+ const extractionPdfInput = doclingDocument ? void 0 : await getPdfBase64ForExtraction();
8791
9126
  const followUpResults = await Promise.all(
8792
9127
  reviewResponse.object.additionalTasks.map(
8793
9128
  (task) => extractorLimit(async () => {
@@ -8797,7 +9132,8 @@ function createExtractor(config) {
8797
9132
  memory,
8798
9133
  completedPageRangePdfCache,
8799
9134
  getPageRangePdf,
8800
- convertPdfToImages ? getPageImages : void 0
9135
+ convertPdfToImages ? getPageImages : void 0,
9136
+ doclingDocument ? getPageRangeText : void 0
8801
9137
  );
8802
9138
  })
8803
9139
  )
@@ -12797,6 +13133,7 @@ var AGENT_TOOLS = [
12797
13133
  buildConfirmationSummaryPrompt,
12798
13134
  buildConversationMemoryGuidance,
12799
13135
  buildCoverageGapPrompt,
13136
+ buildDoclingProviderOptions,
12800
13137
  buildFieldExplanationPrompt,
12801
13138
  buildFieldExtractionPrompt,
12802
13139
  buildFlatPdfMappingPrompt,
@@ -12838,12 +13175,16 @@ var AGENT_TOOLS = [
12838
13175
  fillAcroForm,
12839
13176
  generateNextMessage,
12840
13177
  getAcroFormFields,
13178
+ getDoclingPageRangeText,
12841
13179
  getExtractor,
12842
13180
  getFileIdentifier,
12843
13181
  getPdfPageCount,
12844
13182
  getTemplate,
13183
+ isDoclingExtractionInput,
12845
13184
  isFileReference,
12846
13185
  mergeQuestionAnswers,
13186
+ mergeSourceSpans,
13187
+ normalizeDoclingDocument,
12847
13188
  normalizeForMatch,
12848
13189
  orderSourceEvidence,
12849
13190
  overlayTextOnPdf,