npm - @claritylabs/cl-sdk - Versions diffs - 0.8.1 → 0.9.0 - Mend

@claritylabs/cl-sdk 0.8.1 → 0.9.0

This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (8)

package/dist/index.mjs (changed)

@@ -1905,6 +1905,125 @@ Total Cost: ${doc.totalCost}` : ""}`,
   return chunks;
 }
+// src/extraction/merge.ts
+function isPresent(value) {
+  if (value === void 0 || value === null) return false;
+  if (typeof value === "string") return value.trim().length > 0;
+  if (Array.isArray(value)) return value.length > 0;
+  return true;
+}
+function dedupeByKey(items, keyFn) {
+  const seen = /* @__PURE__ */ new Set();
+  const merged = [];
+  for (const item of items) {
+    const key = keyFn(item);
+    if (seen.has(key)) continue;
+    seen.add(key);
+    merged.push(item);
+  }
+  return merged;
+}
+function mergeUniqueObjects(existing, incoming, keyFn) {
+  return dedupeByKey([...existing, ...incoming], keyFn);
+}
+function mergeShallowPreferPresent(existing, incoming) {
+  const merged = { ...existing };
+  for (const [key, value] of Object.entries(incoming)) {
+    const current = merged[key];
+    if (Array.isArray(current) && Array.isArray(value)) {
+      merged[key] = [...current, ...value];
+      continue;
+    }
+    if (current && value && typeof current === "object" && typeof value === "object" && !Array.isArray(current) && !Array.isArray(value)) {
+      merged[key] = mergeShallowPreferPresent(
+        current,
+        value
+      );
+      continue;
+    }
+    if (!isPresent(current) && isPresent(value)) {
+      merged[key] = value;
+    }
+  }
+  return merged;
+}
+function mergeCoverageLimits(existing, incoming) {
+  const merged = mergeShallowPreferPresent(existing, incoming);
+  const existingCoverages = Array.isArray(existing.coverages) ? existing.coverages : [];
+  const incomingCoverages = Array.isArray(incoming.coverages) ? incoming.coverages : [];
+  merged.coverages = mergeUniqueObjects(existingCoverages, incomingCoverages, (coverage) => [
+    String(coverage.name ?? "").toLowerCase(),
+    String(coverage.limit ?? "").toLowerCase(),
+    String(coverage.deductible ?? "").toLowerCase(),
+    String(coverage.formNumber ?? "").toLowerCase()
+  ].join("|"));
+  return merged;
+}
+function mergeDeclarations(existing, incoming) {
+  const merged = mergeShallowPreferPresent(existing, incoming);
+  const existingFields = Array.isArray(existing.fields) ? existing.fields : [];
+  const incomingFields = Array.isArray(incoming.fields) ? incoming.fields : [];
+  merged.fields = mergeUniqueObjects(existingFields, incomingFields, (field) => [
+    String(field.field ?? "").toLowerCase(),
+    String(field.value ?? "").toLowerCase(),
+    String(field.section ?? "").toLowerCase()
+  ].join("|"));
+  return merged;
+}
+function mergeArrayPayload(existing, incoming, arrayKey, keyFn) {
+  const merged = mergeShallowPreferPresent(existing, incoming);
+  const existingItems = Array.isArray(existing[arrayKey]) ? existing[arrayKey] : [];
+  const incomingItems = Array.isArray(incoming[arrayKey]) ? incoming[arrayKey] : [];
+  merged[arrayKey] = mergeUniqueObjects(existingItems, incomingItems, keyFn);
+  return merged;
+}
+function mergeExtractorResult(extractorName, existing, incoming) {
+  if (!existing) return incoming;
+  if (!incoming) return existing;
+  if (typeof existing !== "object" || typeof incoming !== "object") return incoming;
+  const current = existing;
+  const next = incoming;
+  switch (extractorName) {
+    case "carrier_info":
+    case "named_insured":
+    case "loss_history":
+    case "supplementary":
+    case "premium_breakdown":
+      return mergeShallowPreferPresent(current, next);
+    case "coverage_limits":
+      return mergeCoverageLimits(current, next);
+    case "declarations":
+      return mergeDeclarations(current, next);
+    case "endorsements":
+      return mergeArrayPayload(current, next, "endorsements", (item) => [
+        String(item.formNumber ?? "").toLowerCase(),
+        String(item.title ?? "").toLowerCase(),
+        String(item.pageStart ?? "")
+      ].join("|"));
+    case "exclusions":
+      return mergeArrayPayload(current, next, "exclusions", (item) => [
+        String(item.name ?? "").toLowerCase(),
+        String(item.formNumber ?? "").toLowerCase(),
+        String(item.pageNumber ?? "")
+      ].join("|"));
+    case "conditions":
+      return mergeArrayPayload(current, next, "conditions", (item) => [
+        String(item.name ?? "").toLowerCase(),
+        String(item.conditionType ?? "").toLowerCase(),
+        String(item.pageNumber ?? "")
+      ].join("|"));
+    case "sections":
+      return mergeArrayPayload(current, next, "sections", (item) => [
+        String(item.title ?? "").toLowerCase(),
+        String(item.type ?? "").toLowerCase(),
+        String(item.pageStart ?? ""),
+        String(item.pageEnd ?? "")
+      ].join("|"));
+    default:
+      return mergeShallowPreferPresent(current, next);
+  }
+}
 // src/prompts/templates/homeowners.ts
 var HOMEOWNERS_TEMPLATE = {
   type: "homeowners",
@@ -2694,57 +2813,74 @@ Return JSON only:
 }`;
 }
-// src/prompts/coordinator/plan.ts
+// src/prompts/coordinator/page-map.ts
 import { z as z19 } from "zod";
-var ExtractionTaskSchema = z19.object({
-  extractorName: z19.string(),
-  startPage: z19.number(),
-  endPage: z19.number(),
-  description: z19.string()
-});
-var PageMapEntrySchema = z19.object({
-  section: z19.string(),
-  pages: z19.string()
+var PageExtractorSchema = z19.enum([
+  "carrier_info",
+  "named_insured",
+  "coverage_limits",
+  "endorsements",
+  "exclusions",
+  "conditions",
+  "premium_breakdown",
+  "declarations",
+  "loss_history",
+  "sections",
+  "supplementary"
+]);
+var PageAssignmentSchema = z19.object({
+  localPageNumber: z19.number().int().positive().describe("1-based page number within this supplied PDF chunk"),
+  extractorNames: z19.array(PageExtractorSchema).describe("Focused extractors that should inspect this page"),
+  confidence: z19.number().min(0).max(1).optional().describe("Confidence in the page assignment"),
+  notes: z19.string().optional().describe("Short explanation of what appears on the page")
 });
-var ExtractionPlanSchema = z19.object({
-  tasks: z19.array(ExtractionTaskSchema),
-  pageMap: z19.array(PageMapEntrySchema).optional()
+var PageMapChunkSchema = z19.object({
+  pages: z19.array(PageAssignmentSchema)
 });
-function buildPlanPrompt(templateHints) {
-  return `You are planning the extraction of an insurance document. You have already classified this document. Now scan the full document and create a page map + extraction plan.
+function buildPageMapPrompt(templateHints, startPage, endPage) {
+  return `You are mapping insurance document pages to focused extractors.
+These supplied pages are ORIGINAL DOCUMENT PAGES ${startPage}-${endPage}.
 DOCUMENT TYPE HINTS:
 ${templateHints}
-For each section of the document, decide which extractor should handle it and which pages to send.
+For each page in this supplied PDF chunk, decide which extractor(s) should inspect it.
 Available extractors:
-- carrier_info: Carrier name, legal name, NAIC, AM Best rating, admitted status, MGA, underwriter
-- named_insured: Insured name, DBA, address, entity type, FEIN, SIC/NAICS codes, additional named insureds
-- coverage_limits: Coverage names, limits, deductibles, coverage form, triggers
-- endorsements: Endorsement forms, titles, types, content, affected parties
-- exclusions: Exclusion titles, content, applicability
-- conditions: Policy conditions (duties after loss, cancellation, etc.)
-- premium_breakdown: Premium amounts, taxes, fees, payment plans, rating basis
-- declarations: Line-specific structured declarations data (varies by policy type)
-- loss_history: Loss runs, claim records, experience modification
-- sections: Raw section content (for sections that don't fit other extractors)
-- supplementary: Regulatory context, contacts, claims contacts, third-party administrators
+- carrier_info
+- named_insured
+- coverage_limits
+- endorsements
+- exclusions
+- conditions
+- premium_breakdown
+- declarations
+- loss_history
+- sections
+- supplementary
+Rules:
+- Use specific extractors for declarations, schedules, endorsements, exclusions, conditions, premium pages, and loss runs.
+- Use "sections" for pages that contain substantive policy text or mixed content that should still be preserved as raw sections.
+- Avoid assigning broad ranges mentally; decide page by page.
+- A page may map to multiple extractors if it legitimately contains multiple relevant sections.
+- Prefer declarations and schedules for numeric limits/deductibles over later generic form wording.
+- If a page is mostly generic form language with no declaration-specific values, do not assign "coverage_limits" unless it clearly contains schedule-specific limits.
+- Return every page in the supplied chunk exactly once.
 Return JSON:
 {
-  "tasks": [
-    { "extractorName": "carrier_info", "startPage": 1, "endPage": 2, "description": "Extract carrier details from declarations page" },
-    ...
-  ],
-  "pageMap": [
-    { "section": "declarations", "pages": "pages 1-3" },
-    { "section": "endorsements", "pages": "pages 15-22" }
+  "pages": [
+    {
+      "localPageNumber": 1,
+      "extractorNames": ["declarations", "carrier_info", "named_insured", "coverage_limits"],
+      "confidence": 0.96,
+      "notes": "Declarations page with insured, policy period, and scheduled limits"
+    }
   ]
 }
-Create tasks that cover the entire document. Prefer specific extractors over generic "sections" where possible. Keep page ranges tight \u2014 only include pages relevant to each extractor.
 Respond with JSON only.`;
 }
@@ -2753,6 +2889,7 @@ import { z as z20 } from "zod";
 var ReviewResultSchema = z20.object({
   complete: z20.boolean(),
   missingFields: z20.array(z20.string()),
+  qualityIssues: z20.array(z20.string()).optional(),
   additionalTasks: z20.array(z20.object({
     extractorName: z20.string(),
     startPage: z20.number(),
@@ -2760,8 +2897,8 @@ var ReviewResultSchema = z20.object({
     description: z20.string()
   }))
 });
-function buildReviewPrompt(templateExpected, extractedKeys) {
-  return `You are reviewing an extraction for completeness. Compare what was expected vs what was found.
+function buildReviewPrompt(templateExpected, extractedKeys, extractionSummary, pageMapSummary) {
+  return `You are reviewing an extraction for completeness and quality. Compare what was expected vs what was found.
 EXPECTED FIELDS (from document type template):
 ${templateExpected.map((f) => `- ${f}`).join("\n")}
@@ -2769,21 +2906,36 @@ ${templateExpected.map((f) => `- ${f}`).join("\n")}
 FIELDS ALREADY EXTRACTED:
 ${extractedKeys.map((f) => `- ${f}`).join("\n")}
+PAGE MAP SUMMARY:
+${pageMapSummary}
+CURRENT EXTRACTION SUMMARY:
+${extractionSummary}
 Determine:
-1. Is the extraction complete enough? (required fields present = complete)
+1. Is the extraction complete enough?
 2. What fields are missing?
-3. Should any additional extraction tasks be dispatched?
+3. What quality issues are present?
+4. Should any additional extraction tasks be dispatched?
+Mark the extraction as NOT complete if any of these are true:
+- required fields are missing
+- extracted values are generic placeholders like "shown in declarations", "per schedule", "if applicable", "as stated"
+- coverage limits or deductibles appear to come from generic form language instead of declaration/schedule-specific values
+- page assignments suggest declaration, schedule, endorsement, exclusion, or condition pages were not actually extracted with the matching focused extractor
+- a focused extractor exists but returned too little substance for the relevant pages
 Return JSON:
 {
   "complete": boolean,
   "missingFields": ["field1", "field2"],
+  "qualityIssues": ["issue 1", "issue 2"],
   "additionalTasks": [
     { "extractorName": "...", "startPage": N, "endPage": N, "description": "..." }
   ]
 }
-If all required fields are present, set complete=true even if some optional fields are missing.
+Use the page map to target follow-up extraction pages precisely. Prefer narrow, declaration/schedule-focused follow-up tasks over broad page ranges.
 Respond with JSON only.`;
 }
@@ -3331,17 +3483,125 @@ function createExtractor(config) {
   } = config;
   const limit = pLimit(concurrency);
   let totalUsage = { inputTokens: 0, outputTokens: 0 };
+  let modelCalls = 0;
+  let callsWithUsage = 0;
+  let callsMissingUsage = 0;
   function trackUsage(usage) {
+    modelCalls += 1;
     if (usage) {
+      callsWithUsage += 1;
       totalUsage.inputTokens += usage.inputTokens;
       totalUsage.outputTokens += usage.outputTokens;
       onTokenUsage?.(usage);
+    } else {
+      callsMissingUsage += 1;
+    }
+  }
+  function mergeMemoryResult(name, data, memory) {
+    const existing = memory.get(name);
+    memory.set(name, mergeExtractorResult(name, existing, data));
+  }
+  function summarizeExtraction(memory) {
+    const coverageResult = memory.get("coverage_limits");
+    const declarationResult = memory.get("declarations");
+    const endorsementResult = memory.get("endorsements");
+    const exclusionResult = memory.get("exclusions");
+    const conditionResult = memory.get("conditions");
+    const sectionResult = memory.get("sections");
+    const coverageSummary = Array.isArray(coverageResult?.coverages) ? coverageResult.coverages.slice(0, 12).map((coverage) => ({
+      name: coverage.name,
+      limit: coverage.limit,
+      deductible: coverage.deductible,
+      formNumber: coverage.formNumber
+    })) : [];
+    return JSON.stringify({
+      extractedKeys: [...memory.keys()].filter((key) => key !== "classify"),
+      declarationFieldCount: Array.isArray(declarationResult?.fields) ? declarationResult.fields.length : 0,
+      coverageCount: Array.isArray(coverageResult?.coverages) ? coverageResult.coverages.length : 0,
+      coverageSamples: coverageSummary,
+      endorsementCount: Array.isArray(endorsementResult?.endorsements) ? endorsementResult.endorsements.length : 0,
+      exclusionCount: Array.isArray(exclusionResult?.exclusions) ? exclusionResult.exclusions.length : 0,
+      conditionCount: Array.isArray(conditionResult?.conditions) ? conditionResult.conditions.length : 0,
+      sectionCount: Array.isArray(sectionResult?.sections) ? sectionResult.sections.length : 0
+    }, null, 2);
+  }
+  function formatPageMapSummary(pageAssignments) {
+    const extractorPages = /* @__PURE__ */ new Map();
+    for (const assignment of pageAssignments) {
+      for (const extractorName of assignment.extractorNames) {
+        extractorPages.set(extractorName, [...extractorPages.get(extractorName) ?? [], assignment.localPageNumber]);
+      }
+    }
+    if (extractorPages.size === 0) return "No page assignments available.";
+    return [...extractorPages.entries()].map(([extractorName, pages]) => `${extractorName}: pages ${pages.join(", ")}`).join("\n");
+  }
+  function buildTemplateHints(primaryType, documentType, pageCount, template) {
+    return [
+      `Document type: ${primaryType} ${documentType}`,
+      `Expected sections: ${template.expectedSections.join(", ")}`,
+      `Page hints: ${Object.entries(template.pageHints).map(([k, v]) => `${k}: ${v}`).join("; ")}`,
+      `Total pages: ${pageCount}`
+    ].join("\n");
+  }
+  function groupContiguousPages(pages) {
+    if (pages.length === 0) return [];
+    const sorted = [...new Set(pages)].sort((a, b) => a - b);
+    const ranges = [];
+    let start = sorted[0];
+    let previous = sorted[0];
+    for (let i = 1; i < sorted.length; i += 1) {
+      const current = sorted[i];
+      if (current === previous + 1) {
+        previous = current;
+        continue;
+      }
+      ranges.push({ startPage: start, endPage: previous });
+      start = current;
+      previous = current;
     }
+    ranges.push({ startPage: start, endPage: previous });
+    return ranges;
+  }
+  function buildPlanFromPageAssignments(pageAssignments, pageCount) {
+    const extractorPages = /* @__PURE__ */ new Map();
+    for (const assignment of pageAssignments) {
+      const extractors = assignment.extractorNames.length > 0 ? assignment.extractorNames : ["sections"];
+      for (const extractorName of extractors) {
+        extractorPages.set(extractorName, [...extractorPages.get(extractorName) ?? [], assignment.localPageNumber]);
+      }
+    }
+    const coveredPages = /* @__PURE__ */ new Set();
+    for (const pages of extractorPages.values()) {
+      for (const page of pages) coveredPages.add(page);
+    }
+    for (let page = 1; page <= pageCount; page += 1) {
+      if (!coveredPages.has(page)) {
+        extractorPages.set("sections", [...extractorPages.get("sections") ?? [], page]);
+      }
+    }
+    const tasks = [...extractorPages.entries()].flatMap(
+      ([extractorName, pages]) => groupContiguousPages(pages).map(({ startPage, endPage }) => ({
+        extractorName,
+        startPage,
+        endPage,
+        description: `Page-mapped ${extractorName} extraction for pages ${startPage}-${endPage}`
+      }))
+    ).sort((a, b) => a.startPage - b.startPage || a.extractorName.localeCompare(b.extractorName));
+    return {
+      tasks,
+      pageMap: [...extractorPages.entries()].map(([section, pages]) => ({
+        section,
+        pages: `pages ${[...new Set(pages)].sort((a, b) => a - b).join(", ")}`
+      }))
+    };
   }
   async function extract(pdfBase64, documentId, options) {
     const id = documentId ?? `doc-${Date.now()}`;
     const memory = /* @__PURE__ */ new Map();
     totalUsage = { inputTokens: 0, outputTokens: 0 };
+    modelCalls = 0;
+    callsWithUsage = 0;
+    callsMissingUsage = 0;
     const pipelineCtx = createPipelineContext({
       id,
       onSave: onCheckpointSave,
@@ -3392,40 +3652,73 @@ function createExtractor(config) {
     const primaryType = policyTypes[0] ?? "other";
     const template = getTemplate(primaryType);
     const pageCount = resumed?.pageCount ?? await getPdfPageCount(pdfBase64);
+    const templateHints = buildTemplateHints(primaryType, documentType, pageCount, template);
+    let pageAssignments;
+    if (resumed?.pageAssignments && pipelineCtx.isPhaseComplete("page_map")) {
+      pageAssignments = resumed.pageAssignments;
+      onProgress?.("Resuming from checkpoint (page map complete)...");
+    } else {
+      onProgress?.(`Mapping document pages for ${primaryType} ${documentType}...`);
+      const chunkSize = 8;
+      const collectedAssignments = [];
+      for (let startPage = 1; startPage <= pageCount; startPage += chunkSize) {
+        const endPage = Math.min(pageCount, startPage + chunkSize - 1);
+        const pagesPdf = await extractPageRange(pdfBase64, startPage, endPage);
+        const mapResponse = await safeGenerateObject(
+          generateObject,
+          {
+            prompt: buildPageMapPrompt(templateHints, startPage, endPage),
+            schema: PageMapChunkSchema,
+            maxTokens: 2048,
+            providerOptions: { ...providerOptions, pdfBase64: pagesPdf }
+          },
+          {
+            fallback: {
+              pages: Array.from({ length: endPage - startPage + 1 }, (_, index) => ({
+                localPageNumber: index + 1,
+                extractorNames: index === 0 && startPage === 1 ? ["carrier_info", "named_insured", "declarations", "coverage_limits"] : ["sections"],
+                confidence: 0,
+                notes: "Fallback page assignment"
+              }))
+            },
+            log,
+            onError: (err, attempt) => log?.(`Page map attempt ${attempt + 1} failed for pages ${startPage}-${endPage}: ${err}`)
+          }
+        );
+        trackUsage(mapResponse.usage);
+        for (const assignment of mapResponse.object.pages) {
+          collectedAssignments.push({
+            ...assignment,
+            localPageNumber: startPage + assignment.localPageNumber - 1
+          });
+        }
+      }
+      pageAssignments = collectedAssignments.length > 0 ? collectedAssignments : Array.from({ length: pageCount }, (_, index) => ({
+        localPageNumber: index + 1,
+        extractorNames: index === 0 ? ["carrier_info", "named_insured", "declarations", "coverage_limits"] : ["sections"],
+        confidence: 0,
+        notes: "Full-document fallback page assignment"
+      }));
+      await pipelineCtx.save("page_map", {
+        id,
+        pageCount,
+        classifyResult,
+        pageAssignments,
+        memory: Object.fromEntries(memory)
+      });
+    }
     let plan;
     if (resumed?.plan && pipelineCtx.isPhaseComplete("plan")) {
       plan = resumed.plan;
       onProgress?.("Resuming from checkpoint (plan complete)...");
     } else {
-      onProgress?.(`Planning extraction for ${primaryType} ${documentType}...`);
-      const templateHints = [
-        `Document type: ${primaryType} ${documentType}`,
-        `Expected sections: ${template.expectedSections.join(", ")}`,
-        `Page hints: ${Object.entries(template.pageHints).map(([k, v]) => `${k}: ${v}`).join("; ")}`,
-        `Total pages: ${pageCount}`
-      ].join("\n");
-      const planResponse = await safeGenerateObject(
-        generateObject,
-        {
-          prompt: buildPlanPrompt(templateHints),
-          schema: ExtractionPlanSchema,
-          maxTokens: 2048,
-          providerOptions: { ...providerOptions, pdfBase64 }
-        },
-        {
-          fallback: {
-            tasks: [{ extractorName: "sections", startPage: 1, endPage: pageCount, description: "Full document fallback extraction" }]
-          },
-          log,
-          onError: (err, attempt) => log?.(`Plan attempt ${attempt + 1} failed: ${err}`)
-        }
-      );
-      trackUsage(planResponse.usage);
-      plan = planResponse.object;
+      onProgress?.(`Building extraction plan from page map for ${primaryType} ${documentType}...`);
+      plan = buildPlanFromPageAssignments(pageAssignments, pageCount);
       await pipelineCtx.save("plan", {
         id,
         pageCount,
         classifyResult,
+        pageAssignments,
         plan,
         memory: Object.fromEntries(memory)
       });
@@ -3466,13 +3759,14 @@ function createExtractor(config) {
       );
       for (const result of extractorResults) {
         if (result) {
-          memory.set(result.name, result.data);
+          mergeMemoryResult(result.name, result.data, memory);
         }
       }
       await pipelineCtx.save("extract", {
         id,
         pageCount,
         classifyResult,
+        pageAssignments,
         plan,
         memory: Object.fromEntries(memory)
       });
@@ -3480,21 +3774,26 @@ function createExtractor(config) {
     if (!pipelineCtx.isPhaseComplete("review")) {
       for (let round = 0; round < maxReviewRounds; round++) {
         const extractedKeys = [...memory.keys()].filter((k) => k !== "classify");
+        const extractionSummary = summarizeExtraction(memory);
+        const pageMapSummary = formatPageMapSummary(pageAssignments);
         const reviewResponse = await safeGenerateObject(
           generateObject,
           {
-            prompt: buildReviewPrompt(template.required, extractedKeys),
+            prompt: buildReviewPrompt(template.required, extractedKeys, extractionSummary, pageMapSummary),
             schema: ReviewResultSchema,
-            maxTokens: 1024,
-            providerOptions
+            maxTokens: 1536,
+            providerOptions: { ...providerOptions, pdfBase64 }
           },
           {
-            fallback: { complete: true, missingFields: [], additionalTasks: [] },
+            fallback: { complete: true, missingFields: [], qualityIssues: [], additionalTasks: [] },
             log,
             onError: (err, attempt) => log?.(`Review round ${round + 1} attempt ${attempt + 1} failed: ${err}`)
           }
         );
         trackUsage(reviewResponse.usage);
+        if (reviewResponse.object.qualityIssues?.length) {
+          await log?.(`Review round ${round + 1} quality issues: ${reviewResponse.object.qualityIssues.join("; ")}`);
+        }
         if (reviewResponse.object.complete || reviewResponse.object.additionalTasks.length === 0) {
           onProgress?.("Extraction complete.");
           break;
@@ -3529,7 +3828,7 @@ function createExtractor(config) {
         );
         for (const result of followUpResults) {
           if (result) {
-            memory.set(result.name, result.data);
+            mergeMemoryResult(result.name, result.data, memory);
           }
         }
       }
@@ -3537,6 +3836,7 @@ function createExtractor(config) {
         id,
         pageCount,
         classifyResult,
+        pageAssignments,
         plan,
         memory: Object.fromEntries(memory)
       });
@@ -3547,6 +3847,7 @@ function createExtractor(config) {
       id,
       pageCount,
       classifyResult,
+      pageAssignments,
       plan,
       memory: Object.fromEntries(memory),
       document
@@ -3560,10 +3861,19 @@ function createExtractor(config) {
     trackUsage(formatResult.usage);
     const chunks = chunkDocument(formatResult.document);
     const finalCheckpoint = pipelineCtx.getCheckpoint();
+    if (callsMissingUsage > 0) {
+      await log?.(`Token usage was unavailable for ${callsMissingUsage}/${modelCalls} model calls. Check that your provider callbacks return usage.`);
+      onProgress?.(`Token usage unavailable for ${callsMissingUsage}/${modelCalls} model calls.`);
+    }
     return {
       document: formatResult.document,
       chunks,
       tokenUsage: totalUsage,
+      usageReporting: {
+        modelCalls,
+        callsWithUsage,
+        callsMissingUsage
+      },
       checkpoint: finalCheckpoint
     };
   }