@claritylabs/cl-sdk 0.8.1 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -2138,6 +2138,125 @@ Total Cost: ${doc.totalCost}` : ""}`,
2138
2138
  return chunks;
2139
2139
  }
2140
2140
 
2141
// src/extraction/merge.ts

/**
 * Reports whether an extracted value carries usable content.
 *
 * `undefined`, `null`, blank/whitespace-only strings and empty arrays count as
 * absent; everything else (including `0`, `false` and empty objects) counts as
 * present.
 *
 * @param {unknown} value - Candidate value.
 * @returns {boolean} `true` when the value should be kept over an alternative.
 */
function isPresent(value) {
  if (value === void 0 || value === null) return false;
  if (typeof value === "string") return value.trim().length > 0;
  if (Array.isArray(value)) return value.length > 0;
  return true;
}

/**
 * Removes duplicates from `items`, keeping the first item seen for each key.
 *
 * @param {Iterable<unknown>} items - Items in priority order.
 * @param {(item: unknown) => unknown} keyFn - Produces the identity key of an item.
 * @returns {unknown[]} New array with first occurrences, original order kept.
 */
function dedupeByKey(items, keyFn) {
  const seen = /* @__PURE__ */ new Set();
  const merged = [];
  for (const item of items) {
    const key = keyFn(item);
    if (seen.has(key)) continue;
    seen.add(key);
    merged.push(item);
  }
  return merged;
}

/**
 * Concatenates two lists and de-duplicates the result; on a key collision the
 * item from `existing` wins because it comes first.
 *
 * @param {unknown[]} existing - Items already collected.
 * @param {unknown[]} incoming - Newly extracted items.
 * @param {(item: unknown) => unknown} keyFn - Identity key for an item.
 * @returns {unknown[]} De-duplicated union.
 */
function mergeUniqueObjects(existing, incoming, keyFn) {
  return dedupeByKey([...existing, ...incoming], keyFn);
}

/**
 * True for non-null, non-array objects, i.e. values that are merged key by key.
 *
 * @param {unknown} value - Candidate value.
 * @returns {boolean} Whether the value is merged recursively.
 */
function isMergeableRecord(value) {
  return value !== null && typeof value === "object" && !Array.isArray(value);
}

/**
 * Builds the "|"-joined identity key used to de-duplicate extracted items.
 * Null/undefined items and missing fields contribute empty segments instead of
 * throwing.
 *
 * @param {unknown} item - Extracted item.
 * @param {string[]} lowerCasedFields - Fields compared case-insensitively.
 * @param {string[]} [verbatimFields] - Fields compared as-is (page numbers).
 * @returns {string} Composite key.
 */
function buildCompositeKey(item, lowerCasedFields, verbatimFields = []) {
  return [
    ...lowerCasedFields.map((field) => String(item?.[field] ?? "").toLowerCase()),
    ...verbatimFields.map((field) => String(item?.[field] ?? ""))
  ].join("|");
}

/**
 * Merges `incoming` into a copy of `existing`, preferring values that are
 * already present.
 *
 * - two arrays under the same key are concatenated (no de-duplication here);
 * - two plain objects under the same key are merged recursively;
 * - otherwise the incoming value is used only when the current one is absent
 *   (see `isPresent`) and the incoming one is present.
 *
 * Only own properties of the merged copy are consulted, so an incoming field
 * named like an `Object.prototype` member (e.g. `toString`) is not mistaken
 * for an already-present value. An own `__proto__` key in `incoming` is
 * skipped, because assigning it would replace the result's prototype.
 *
 * @param {object} existing - Previously collected result (not mutated).
 * @param {object} incoming - Newly extracted result (not mutated).
 * @returns {object} New merged object.
 */
function mergeShallowPreferPresent(existing, incoming) {
  const merged = { ...existing };
  for (const [key, value] of Object.entries(incoming)) {
    if (key === "__proto__") continue;
    const current = Object.prototype.hasOwnProperty.call(merged, key) ? merged[key] : void 0;
    if (Array.isArray(current) && Array.isArray(value)) {
      merged[key] = [...current, ...value];
      continue;
    }
    if (isMergeableRecord(current) && isMergeableRecord(value)) {
      merged[key] = mergeShallowPreferPresent(current, value);
      continue;
    }
    if (!isPresent(current) && isPresent(value)) {
      merged[key] = value;
    }
  }
  return merged;
}

/**
 * Merges two payloads whose main content is a list stored under `arrayKey`.
 * All other fields follow `mergeShallowPreferPresent`; the list itself is
 * replaced by the de-duplicated union of both lists. The result always has an
 * array under `arrayKey`, even when neither input did.
 *
 * @param {object} existing - Previously collected payload.
 * @param {object} incoming - Newly extracted payload.
 * @param {string} arrayKey - Property holding the item list.
 * @param {(item: unknown) => unknown} keyFn - Identity key for list items.
 * @returns {object} New merged payload.
 */
function mergeArrayPayload(existing, incoming, arrayKey, keyFn) {
  const merged = mergeShallowPreferPresent(existing, incoming);
  const existingItems = Array.isArray(existing[arrayKey]) ? existing[arrayKey] : [];
  const incomingItems = Array.isArray(incoming[arrayKey]) ? incoming[arrayKey] : [];
  merged[arrayKey] = mergeUniqueObjects(existingItems, incomingItems, keyFn);
  return merged;
}

/**
 * Merges coverage payloads; coverages are identical when name, limit,
 * deductible and form number all match case-insensitively.
 *
 * @param {object} existing - Previously collected payload.
 * @param {object} incoming - Newly extracted payload.
 * @returns {object} New merged payload with a de-duplicated `coverages` list.
 */
function mergeCoverageLimits(existing, incoming) {
  return mergeArrayPayload(
    existing,
    incoming,
    "coverages",
    (coverage) => buildCompositeKey(coverage, ["name", "limit", "deductible", "formNumber"])
  );
}

/**
 * Merges declarations payloads; fields are identical when field name, value
 * and section all match case-insensitively.
 *
 * @param {object} existing - Previously collected payload.
 * @param {object} incoming - Newly extracted payload.
 * @returns {object} New merged payload with a de-duplicated `fields` list.
 */
function mergeDeclarations(existing, incoming) {
  return mergeArrayPayload(
    existing,
    incoming,
    "fields",
    (field) => buildCompositeKey(field, ["field", "value", "section"])
  );
}

/**
 * Combines a new extractor result with the one already stored for the same
 * extractor, so repeated or follow-up runs add to earlier output rather than
 * overwrite it.
 *
 * A missing side yields the other side unchanged. If either side is not an
 * object, or is an array (which cannot be merged key by key), the incoming
 * value replaces the existing one.
 *
 * @param {string} extractorName - Extractor that produced the results.
 * @param {unknown} existing - Result stored so far, if any.
 * @param {unknown} incoming - Newly produced result, if any.
 * @returns {unknown} The merged result.
 */
function mergeExtractorResult(extractorName, existing, incoming) {
  if (!existing) return incoming;
  if (!incoming) return existing;
  if (typeof existing !== "object" || typeof incoming !== "object") return incoming;
  // Spreading an array into an object literal would turn it into an
  // index-keyed record, so arrays are replaced instead of merged.
  if (Array.isArray(existing) || Array.isArray(incoming)) return incoming;
  switch (extractorName) {
    case "coverage_limits":
      return mergeCoverageLimits(existing, incoming);
    case "declarations":
      return mergeDeclarations(existing, incoming);
    case "endorsements":
      return mergeArrayPayload(
        existing,
        incoming,
        "endorsements",
        (item) => buildCompositeKey(item, ["formNumber", "title"], ["pageStart"])
      );
    case "exclusions":
      return mergeArrayPayload(
        existing,
        incoming,
        "exclusions",
        (item) => buildCompositeKey(item, ["name", "formNumber"], ["pageNumber"])
      );
    case "conditions":
      return mergeArrayPayload(
        existing,
        incoming,
        "conditions",
        (item) => buildCompositeKey(item, ["name", "conditionType"], ["pageNumber"])
      );
    case "sections":
      return mergeArrayPayload(
        existing,
        incoming,
        "sections",
        (item) => buildCompositeKey(item, ["title", "type"], ["pageStart", "pageEnd"])
      );
    // Flat record extractors, listed explicitly for documentation; unknown
    // extractors are merged the same way.
    case "carrier_info":
    case "named_insured":
    case "loss_history":
    case "supplementary":
    case "premium_breakdown":
    default:
      return mergeShallowPreferPresent(existing, incoming);
  }
}
2259
+
2141
2260
  // src/prompts/templates/homeowners.ts
2142
2261
  var HOMEOWNERS_TEMPLATE = {
2143
2262
  type: "homeowners",
@@ -2927,57 +3046,74 @@ Return JSON only:
2927
3046
  }`;
2928
3047
  }
2929
3048
 
2930
- // src/prompts/coordinator/plan.ts
3049
+ // src/prompts/coordinator/page-map.ts
2931
3050
  var import_zod19 = require("zod");
2932
- var ExtractionTaskSchema = import_zod19.z.object({
2933
- extractorName: import_zod19.z.string(),
2934
- startPage: import_zod19.z.number(),
2935
- endPage: import_zod19.z.number(),
2936
- description: import_zod19.z.string()
2937
- });
2938
- var PageMapEntrySchema = import_zod19.z.object({
2939
- section: import_zod19.z.string(),
2940
- pages: import_zod19.z.string()
3051
// Closed set of focused extractor names a page may be assigned to.
var PageExtractorSchema = import_zod19.z.enum([
  "carrier_info", "named_insured", "coverage_limits",
  "endorsements", "exclusions", "conditions",
  "premium_breakdown", "declarations", "loss_history",
  "sections", "supplementary"
]);
3064
// One page-map entry as returned by the model for a single page of a chunk.
var PageAssignmentSchema = import_zod19.z.object({
  // Position of the page inside the chunk sent to the model (starts at 1).
  localPageNumber: import_zod19.z
    .number()
    .int()
    .positive()
    .describe("1-based page number within this supplied PDF chunk"),
  // Extractors that should read this page.
  extractorNames: import_zod19.z
    .array(PageExtractorSchema)
    .describe("Focused extractors that should inspect this page"),
  // Optional self-reported certainty in the range [0, 1].
  confidence: import_zod19.z
    .number()
    .min(0)
    .max(1)
    .optional()
    .describe("Confidence in the page assignment"),
  // Optional free-text justification.
  notes: import_zod19.z
    .string()
    .optional()
    .describe("Short explanation of what appears on the page")
});
2942
- var ExtractionPlanSchema = import_zod19.z.object({
2943
- tasks: import_zod19.z.array(ExtractionTaskSchema),
2944
- pageMap: import_zod19.z.array(PageMapEntrySchema).optional()
3070
// Model response for one chunk: a list of per-page assignments.
var PageMapChunkSchema = import_zod19.z.object({
  pages: import_zod19.z.array(PageAssignmentSchema)
});
2946
- function buildPlanPrompt(templateHints) {
2947
- return `You are planning the extraction of an insurance document. You have already classified this document. Now scan the full document and create a page map + extraction plan.
3073
/**
 * Builds the prompt that asks the model to assign every page of one PDF chunk
 * to the focused extractor(s) that should read it.
 *
 * The prompt names the original page range for context but requires answers
 * in chunk-local numbering; the numbering rule spells this out, together with
 * the valid range when it can be computed, so the model does not answer with
 * original document page numbers.
 *
 * @param {string} templateHints - Document-type hints inserted verbatim.
 * @param {number} startPage - Original (1-based) number of the chunk's first page.
 * @param {number} endPage - Original (1-based) number of the chunk's last page.
 * @returns {string} Prompt text.
 */
function buildPageMapPrompt(templateHints, startPage, endPage) {
  const hasValidRange = Number.isInteger(startPage) && Number.isInteger(endPage) && endPage >= startPage;
  const localRange = hasValidRange ? ` (1 to ${endPage - startPage + 1})` : "";
  return `You are mapping insurance document pages to focused extractors.

These supplied pages are ORIGINAL DOCUMENT PAGES ${startPage}-${endPage}.

DOCUMENT TYPE HINTS:
${templateHints}

For each page in this supplied PDF chunk, decide which extractor(s) should inspect it.

Available extractors:
- carrier_info
- named_insured
- coverage_limits
- endorsements
- exclusions
- conditions
- premium_breakdown
- declarations
- loss_history
- sections
- supplementary

Rules:
- Use specific extractors for declarations, schedules, endorsements, exclusions, conditions, premium pages, and loss runs.
- Use "sections" for pages that contain substantive policy text or mixed content that should still be preserved as raw sections.
- Avoid assigning broad ranges mentally; decide page by page.
- A page may map to multiple extractors if it legitimately contains multiple relevant sections.
- Prefer declarations and schedules for numeric limits/deductibles over later generic form wording.
- If a page is mostly generic form language with no declaration-specific values, do not assign "coverage_limits" unless it clearly contains schedule-specific limits.
- "localPageNumber" is the page's position within this supplied chunk${localRange}, counted from 1, NOT the original document page number.
- Return every page in the supplied chunk exactly once.

Return JSON:
{
  "pages": [
    {
      "localPageNumber": 1,
      "extractorNames": ["declarations", "carrier_info", "named_insured", "coverage_limits"],
      "confidence": 0.96,
      "notes": "Declarations page with insured, policy period, and scheduled limits"
    }
  ]
}

Respond with JSON only.`;
}
2983
3119
 
@@ -2986,6 +3122,7 @@ var import_zod20 = require("zod");
2986
3122
  var ReviewResultSchema = import_zod20.z.object({
2987
3123
  complete: import_zod20.z.boolean(),
2988
3124
  missingFields: import_zod20.z.array(import_zod20.z.string()),
3125
+ qualityIssues: import_zod20.z.array(import_zod20.z.string()).optional(),
2989
3126
  additionalTasks: import_zod20.z.array(import_zod20.z.object({
2990
3127
  extractorName: import_zod20.z.string(),
2991
3128
  startPage: import_zod20.z.number(),
@@ -2993,8 +3130,8 @@ var ReviewResultSchema = import_zod20.z.object({
2993
3130
  description: import_zod20.z.string()
2994
3131
  }))
2995
3132
  });
2996
- function buildReviewPrompt(templateExpected, extractedKeys) {
2997
- return `You are reviewing an extraction for completeness. Compare what was expected vs what was found.
3133
+ function buildReviewPrompt(templateExpected, extractedKeys, extractionSummary, pageMapSummary) {
3134
+ return `You are reviewing an extraction for completeness and quality. Compare what was expected vs what was found.
2998
3135
 
2999
3136
  EXPECTED FIELDS (from document type template):
3000
3137
  ${templateExpected.map((f) => `- ${f}`).join("\n")}
@@ -3002,21 +3139,36 @@ ${templateExpected.map((f) => `- ${f}`).join("\n")}
3002
3139
  FIELDS ALREADY EXTRACTED:
3003
3140
  ${extractedKeys.map((f) => `- ${f}`).join("\n")}
3004
3141
 
3142
+ PAGE MAP SUMMARY:
3143
+ ${pageMapSummary}
3144
+
3145
+ CURRENT EXTRACTION SUMMARY:
3146
+ ${extractionSummary}
3147
+
3005
3148
  Determine:
3006
- 1. Is the extraction complete enough? (required fields present = complete)
3149
+ 1. Is the extraction complete enough?
3007
3150
  2. What fields are missing?
3008
- 3. Should any additional extraction tasks be dispatched?
3151
+ 3. What quality issues are present?
3152
+ 4. Should any additional extraction tasks be dispatched?
3153
+
3154
+ Mark the extraction as NOT complete if any of these are true:
3155
+ - required fields are missing
3156
+ - extracted values are generic placeholders like "shown in declarations", "per schedule", "if applicable", "as stated"
3157
+ - coverage limits or deductibles appear to come from generic form language instead of declaration/schedule-specific values
3158
+ - page assignments suggest declaration, schedule, endorsement, exclusion, or condition pages were not actually extracted with the matching focused extractor
3159
+ - a focused extractor exists but returned too little substance for the relevant pages
3009
3160
 
3010
3161
  Return JSON:
3011
3162
  {
3012
3163
  "complete": boolean,
3013
3164
  "missingFields": ["field1", "field2"],
3165
+ "qualityIssues": ["issue 1", "issue 2"],
3014
3166
  "additionalTasks": [
3015
3167
  { "extractorName": "...", "startPage": N, "endPage": N, "description": "..." }
3016
3168
  ]
3017
3169
  }
3018
3170
 
3019
- If all required fields are present, set complete=true even if some optional fields are missing.
3171
+ Use the page map to target follow-up extraction pages precisely. Prefer narrow, declaration/schedule-focused follow-up tasks over broad page ranges.
3020
3172
 
3021
3173
  Respond with JSON only.`;
3022
3174
  }
@@ -3564,17 +3716,125 @@ function createExtractor(config) {
3564
3716
  } = config;
3565
3717
  const limit = pLimit(concurrency);
3566
3718
  let totalUsage = { inputTokens: 0, outputTokens: 0 };
3719
+ let modelCalls = 0;
3720
+ let callsWithUsage = 0;
3721
+ let callsMissingUsage = 0;
3567
3722
  function trackUsage(usage) {
3723
+ modelCalls += 1;
3568
3724
  if (usage) {
3725
+ callsWithUsage += 1;
3569
3726
  totalUsage.inputTokens += usage.inputTokens;
3570
3727
  totalUsage.outputTokens += usage.outputTokens;
3571
3728
  onTokenUsage?.(usage);
3729
+ } else {
3730
+ callsMissingUsage += 1;
3731
+ }
3732
+ }
3733
/**
 * Stores an extractor result in `memory`, merging it with whatever is already
 * stored under the same extractor name instead of overwriting it.
 */
function mergeMemoryResult(name, data, memory) {
  const combined = mergeExtractorResult(name, memory.get(name), data);
  memory.set(name, combined);
}
3737
/**
 * Produces a compact, pretty-printed JSON digest of what has been extracted
 * so far: the stored extractor names (without "classify"), item counts for
 * the list-based extractors, and up to 12 sample coverages reduced to name,
 * limit, deductible and form number.
 */
function summarizeExtraction(memory) {
  // Returns the list stored at memory[extractorName][field], or [] when the
  // result is missing or the field is not an array.
  const listOf = (extractorName, field) => {
    const candidate = memory.get(extractorName)?.[field];
    return Array.isArray(candidate) ? candidate : [];
  };
  const coverages = listOf("coverage_limits", "coverages");
  const digest = {
    extractedKeys: Array.from(memory.keys()).filter((key) => key !== "classify"),
    declarationFieldCount: listOf("declarations", "fields").length,
    coverageCount: coverages.length,
    coverageSamples: coverages.slice(0, 12).map((entry) => {
      return {
        name: entry.name,
        limit: entry.limit,
        deductible: entry.deductible,
        formNumber: entry.formNumber
      };
    }),
    endorsementCount: listOf("endorsements", "endorsements").length,
    exclusionCount: listOf("exclusions", "exclusions").length,
    conditionCount: listOf("conditions", "conditions").length,
    sectionCount: listOf("sections", "sections").length
  };
  return JSON.stringify(digest, null, 2);
}
3761
/**
 * Renders the page map as one line per extractor, e.g.
 * "declarations: pages 1, 3", in the order extractors are first encountered.
 *
 * Pages whose `extractorNames` list is empty are listed under "sections",
 * which is where `buildPlanFromPageAssignments` routes them, so the summary
 * matches the plan that was executed.
 *
 * @param {Array<{localPageNumber: number, extractorNames: string[]}>} pageAssignments
 *   Page assignments to summarise.
 * @returns {string} Multi-line summary, or a fixed notice when there is nothing to list.
 */
function formatPageMapSummary(pageAssignments) {
  const pagesByExtractor = /* @__PURE__ */ new Map();
  for (const assignment of pageAssignments) {
    const extractorNames = assignment.extractorNames.length > 0 ? assignment.extractorNames : ["sections"];
    for (const extractorName of extractorNames) {
      // Append in place; re-spreading the list per page is quadratic.
      const pages = pagesByExtractor.get(extractorName);
      if (pages) {
        pages.push(assignment.localPageNumber);
      } else {
        pagesByExtractor.set(extractorName, [assignment.localPageNumber]);
      }
    }
  }
  if (pagesByExtractor.size === 0) return "No page assignments available.";
  return [...pagesByExtractor.entries()]
    .map(([extractorName, pages]) => `${extractorName}: pages ${pages.join(", ")}`)
    .join("\n");
}
3771
/**
 * Assembles the four-line hint text given to the page-mapping prompt:
 * document type, expected sections, per-section page hints and page total.
 */
function buildTemplateHints(primaryType, documentType, pageCount, template) {
  const expectedSections = template.expectedSections.join(", ");
  const pageHints = Object.entries(template.pageHints)
    .map(([section, hint]) => `${section}: ${hint}`)
    .join("; ");
  return `Document type: ${primaryType} ${documentType}
Expected sections: ${expectedSections}
Page hints: ${pageHints}
Total pages: ${pageCount}`;
}
3779
/**
 * Collapses a list of page numbers into inclusive ranges of consecutive
 * pages. Duplicates are ignored and input order does not matter; ranges are
 * returned in ascending order.
 */
function groupContiguousPages(pages) {
  const ranges = [];
  const ordered = Array.from(new Set(pages)).sort((left, right) => left - right);
  for (const page of ordered) {
    const open = ranges[ranges.length - 1];
    if (open !== void 0 && page === open.endPage + 1) {
      // Page directly follows the open range: extend it.
      open.endPage = page;
    } else {
      ranges.push({ startPage: page, endPage: page });
    }
  }
  return ranges;
}
3798
/**
 * Turns per-page extractor assignments into an extraction plan.
 *
 * - Each page goes to every extractor it was assigned to; pages with an empty
 *   extractor list go to "sections".
 * - Assignments whose page number is not an integer within 1..pageCount are
 *   ignored, so a bad page map cannot create tasks outside the document.
 * - Any page from 1 to `pageCount` that no extractor covers is added to
 *   "sections", so the whole document is always read.
 * - Per extractor, consecutive pages are grouped into one task.
 *
 * @param {Array<{localPageNumber: number, extractorNames: string[]}>} pageAssignments
 *   Page assignments; `localPageNumber` is compared against 1..pageCount here.
 * @param {number} pageCount - Total number of pages in the document.
 * @returns {{tasks: Array<{extractorName: string, startPage: number, endPage: number, description: string}>,
 *            pageMap: Array<{section: string, pages: string}>}}
 *   Tasks sorted by start page, then extractor name, plus a readable page map.
 */
function buildPlanFromPageAssignments(pageAssignments, pageCount) {
  const hasPageLimit = typeof pageCount === "number" && Number.isFinite(pageCount);
  const isUsablePage = (page) => Number.isInteger(page) && page >= 1 && (!hasPageLimit || page <= pageCount);
  const extractorPages = /* @__PURE__ */ new Map();
  const coveredPages = /* @__PURE__ */ new Set();
  // Appends in place; re-spreading the list per page is quadratic.
  const assignPage = (extractorName, page) => {
    const pages = extractorPages.get(extractorName);
    if (pages) {
      pages.push(page);
    } else {
      extractorPages.set(extractorName, [page]);
    }
    coveredPages.add(page);
  };
  for (const assignment of pageAssignments) {
    const page = assignment.localPageNumber;
    if (!isUsablePage(page)) continue;
    const extractors = assignment.extractorNames.length > 0 ? assignment.extractorNames : ["sections"];
    for (const extractorName of extractors) {
      assignPage(extractorName, page);
    }
  }
  for (let page = 1; page <= pageCount; page += 1) {
    if (!coveredPages.has(page)) {
      assignPage("sections", page);
    }
  }
  const tasks = [...extractorPages.entries()].flatMap(
    ([extractorName, pages]) => groupContiguousPages(pages).map(({ startPage, endPage }) => ({
      extractorName,
      startPage,
      endPage,
      description: `Page-mapped ${extractorName} extraction for pages ${startPage}-${endPage}`
    }))
  ).sort((a, b) => a.startPage - b.startPage || a.extractorName.localeCompare(b.extractorName));
  return {
    tasks,
    pageMap: [...extractorPages.entries()].map(([section, pages]) => ({
      section,
      pages: `pages ${[...new Set(pages)].sort((a, b) => a - b).join(", ")}`
    }))
  };
}
3574
3831
  async function extract(pdfBase64, documentId, options) {
3575
3832
  const id = documentId ?? `doc-${Date.now()}`;
3576
3833
  const memory = /* @__PURE__ */ new Map();
3577
3834
  totalUsage = { inputTokens: 0, outputTokens: 0 };
3835
+ modelCalls = 0;
3836
+ callsWithUsage = 0;
3837
+ callsMissingUsage = 0;
3578
3838
  const pipelineCtx = createPipelineContext({
3579
3839
  id,
3580
3840
  onSave: onCheckpointSave,
@@ -3625,40 +3885,73 @@ function createExtractor(config) {
3625
3885
  const primaryType = policyTypes[0] ?? "other";
3626
3886
  const template = getTemplate(primaryType);
3627
3887
  const pageCount = resumed?.pageCount ?? await getPdfPageCount(pdfBase64);
3888
+ const templateHints = buildTemplateHints(primaryType, documentType, pageCount, template);
3889
+ let pageAssignments;
3890
+ if (resumed?.pageAssignments && pipelineCtx.isPhaseComplete("page_map")) {
3891
+ pageAssignments = resumed.pageAssignments;
3892
+ onProgress?.("Resuming from checkpoint (page map complete)...");
3893
+ } else {
3894
+ onProgress?.(`Mapping document pages for ${primaryType} ${documentType}...`);
3895
+ const chunkSize = 8;
3896
+ const collectedAssignments = [];
3897
+ for (let startPage = 1; startPage <= pageCount; startPage += chunkSize) {
3898
+ const endPage = Math.min(pageCount, startPage + chunkSize - 1);
3899
+ const pagesPdf = await extractPageRange(pdfBase64, startPage, endPage);
3900
+ const mapResponse = await safeGenerateObject(
3901
+ generateObject,
3902
+ {
3903
+ prompt: buildPageMapPrompt(templateHints, startPage, endPage),
3904
+ schema: PageMapChunkSchema,
3905
+ maxTokens: 2048,
3906
+ providerOptions: { ...providerOptions, pdfBase64: pagesPdf }
3907
+ },
3908
+ {
3909
+ fallback: {
3910
+ pages: Array.from({ length: endPage - startPage + 1 }, (_, index) => ({
3911
+ localPageNumber: index + 1,
3912
+ extractorNames: index === 0 && startPage === 1 ? ["carrier_info", "named_insured", "declarations", "coverage_limits"] : ["sections"],
3913
+ confidence: 0,
3914
+ notes: "Fallback page assignment"
3915
+ }))
3916
+ },
3917
+ log,
3918
+ onError: (err, attempt) => log?.(`Page map attempt ${attempt + 1} failed for pages ${startPage}-${endPage}: ${err}`)
3919
+ }
3920
+ );
3921
+ trackUsage(mapResponse.usage);
3922
+ for (const assignment of mapResponse.object.pages) {
3923
+ collectedAssignments.push({
3924
+ ...assignment,
3925
+ localPageNumber: startPage + assignment.localPageNumber - 1
3926
+ });
3927
+ }
3928
+ }
3929
+ pageAssignments = collectedAssignments.length > 0 ? collectedAssignments : Array.from({ length: pageCount }, (_, index) => ({
3930
+ localPageNumber: index + 1,
3931
+ extractorNames: index === 0 ? ["carrier_info", "named_insured", "declarations", "coverage_limits"] : ["sections"],
3932
+ confidence: 0,
3933
+ notes: "Full-document fallback page assignment"
3934
+ }));
3935
+ await pipelineCtx.save("page_map", {
3936
+ id,
3937
+ pageCount,
3938
+ classifyResult,
3939
+ pageAssignments,
3940
+ memory: Object.fromEntries(memory)
3941
+ });
3942
+ }
3628
3943
  let plan;
3629
3944
  if (resumed?.plan && pipelineCtx.isPhaseComplete("plan")) {
3630
3945
  plan = resumed.plan;
3631
3946
  onProgress?.("Resuming from checkpoint (plan complete)...");
3632
3947
  } else {
3633
- onProgress?.(`Planning extraction for ${primaryType} ${documentType}...`);
3634
- const templateHints = [
3635
- `Document type: ${primaryType} ${documentType}`,
3636
- `Expected sections: ${template.expectedSections.join(", ")}`,
3637
- `Page hints: ${Object.entries(template.pageHints).map(([k, v]) => `${k}: ${v}`).join("; ")}`,
3638
- `Total pages: ${pageCount}`
3639
- ].join("\n");
3640
- const planResponse = await safeGenerateObject(
3641
- generateObject,
3642
- {
3643
- prompt: buildPlanPrompt(templateHints),
3644
- schema: ExtractionPlanSchema,
3645
- maxTokens: 2048,
3646
- providerOptions: { ...providerOptions, pdfBase64 }
3647
- },
3648
- {
3649
- fallback: {
3650
- tasks: [{ extractorName: "sections", startPage: 1, endPage: pageCount, description: "Full document fallback extraction" }]
3651
- },
3652
- log,
3653
- onError: (err, attempt) => log?.(`Plan attempt ${attempt + 1} failed: ${err}`)
3654
- }
3655
- );
3656
- trackUsage(planResponse.usage);
3657
- plan = planResponse.object;
3948
+ onProgress?.(`Building extraction plan from page map for ${primaryType} ${documentType}...`);
3949
+ plan = buildPlanFromPageAssignments(pageAssignments, pageCount);
3658
3950
  await pipelineCtx.save("plan", {
3659
3951
  id,
3660
3952
  pageCount,
3661
3953
  classifyResult,
3954
+ pageAssignments,
3662
3955
  plan,
3663
3956
  memory: Object.fromEntries(memory)
3664
3957
  });
@@ -3699,13 +3992,14 @@ function createExtractor(config) {
3699
3992
  );
3700
3993
  for (const result of extractorResults) {
3701
3994
  if (result) {
3702
- memory.set(result.name, result.data);
3995
+ mergeMemoryResult(result.name, result.data, memory);
3703
3996
  }
3704
3997
  }
3705
3998
  await pipelineCtx.save("extract", {
3706
3999
  id,
3707
4000
  pageCount,
3708
4001
  classifyResult,
4002
+ pageAssignments,
3709
4003
  plan,
3710
4004
  memory: Object.fromEntries(memory)
3711
4005
  });
@@ -3713,21 +4007,26 @@ function createExtractor(config) {
3713
4007
  if (!pipelineCtx.isPhaseComplete("review")) {
3714
4008
  for (let round = 0; round < maxReviewRounds; round++) {
3715
4009
  const extractedKeys = [...memory.keys()].filter((k) => k !== "classify");
4010
+ const extractionSummary = summarizeExtraction(memory);
4011
+ const pageMapSummary = formatPageMapSummary(pageAssignments);
3716
4012
  const reviewResponse = await safeGenerateObject(
3717
4013
  generateObject,
3718
4014
  {
3719
- prompt: buildReviewPrompt(template.required, extractedKeys),
4015
+ prompt: buildReviewPrompt(template.required, extractedKeys, extractionSummary, pageMapSummary),
3720
4016
  schema: ReviewResultSchema,
3721
- maxTokens: 1024,
3722
- providerOptions
4017
+ maxTokens: 1536,
4018
+ providerOptions: { ...providerOptions, pdfBase64 }
3723
4019
  },
3724
4020
  {
3725
- fallback: { complete: true, missingFields: [], additionalTasks: [] },
4021
+ fallback: { complete: true, missingFields: [], qualityIssues: [], additionalTasks: [] },
3726
4022
  log,
3727
4023
  onError: (err, attempt) => log?.(`Review round ${round + 1} attempt ${attempt + 1} failed: ${err}`)
3728
4024
  }
3729
4025
  );
3730
4026
  trackUsage(reviewResponse.usage);
4027
+ if (reviewResponse.object.qualityIssues?.length) {
4028
+ await log?.(`Review round ${round + 1} quality issues: ${reviewResponse.object.qualityIssues.join("; ")}`);
4029
+ }
3731
4030
  if (reviewResponse.object.complete || reviewResponse.object.additionalTasks.length === 0) {
3732
4031
  onProgress?.("Extraction complete.");
3733
4032
  break;
@@ -3762,7 +4061,7 @@ function createExtractor(config) {
3762
4061
  );
3763
4062
  for (const result of followUpResults) {
3764
4063
  if (result) {
3765
- memory.set(result.name, result.data);
4064
+ mergeMemoryResult(result.name, result.data, memory);
3766
4065
  }
3767
4066
  }
3768
4067
  }
@@ -3770,6 +4069,7 @@ function createExtractor(config) {
3770
4069
  id,
3771
4070
  pageCount,
3772
4071
  classifyResult,
4072
+ pageAssignments,
3773
4073
  plan,
3774
4074
  memory: Object.fromEntries(memory)
3775
4075
  });
@@ -3780,6 +4080,7 @@ function createExtractor(config) {
3780
4080
  id,
3781
4081
  pageCount,
3782
4082
  classifyResult,
4083
+ pageAssignments,
3783
4084
  plan,
3784
4085
  memory: Object.fromEntries(memory),
3785
4086
  document
@@ -3793,10 +4094,19 @@ function createExtractor(config) {
3793
4094
  trackUsage(formatResult.usage);
3794
4095
  const chunks = chunkDocument(formatResult.document);
3795
4096
  const finalCheckpoint = pipelineCtx.getCheckpoint();
4097
+ if (callsMissingUsage > 0) {
4098
+ await log?.(`Token usage was unavailable for ${callsMissingUsage}/${modelCalls} model calls. Check that your provider callbacks return usage.`);
4099
+ onProgress?.(`Token usage unavailable for ${callsMissingUsage}/${modelCalls} model calls.`);
4100
+ }
3796
4101
  return {
3797
4102
  document: formatResult.document,
3798
4103
  chunks,
3799
4104
  tokenUsage: totalUsage,
4105
+ usageReporting: {
4106
+ modelCalls,
4107
+ callsWithUsage,
4108
+ callsMissingUsage
4109
+ },
3800
4110
  checkpoint: finalCheckpoint
3801
4111
  };
3802
4112
  }