@claritylabs/cl-sdk 0.8.1 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs CHANGED
@@ -1905,6 +1905,125 @@ Total Cost: ${doc.totalCost}` : ""}`,
1905
1905
  return chunks;
1906
1906
  }
1907
1907
 
1908
+ // src/extraction/merge.ts
1909
/**
 * Reports whether a value carries usable content.
 *
 * `null`/`undefined`, whitespace-only strings and empty arrays are absent;
 * everything else (including `0`, `false` and empty objects) is present.
 *
 * @param {*} value - Candidate value.
 * @returns {boolean} `true` when the value should be treated as filled in.
 */
function isPresent(value) {
  if (value == null) {
    return false;
  }
  if (Array.isArray(value)) {
    return value.length !== 0;
  }
  return typeof value === "string" ? value.trim() !== "" : true;
}
1915
/**
 * Drops later items whose key was already produced by an earlier item.
 *
 * @param {Iterable<*>} items - Items in priority order; the first item per key wins.
 * @param {(item: *) => *} keyFn - Computes the identity key; called once per item.
 * @returns {Array<*>} The retained items, in their original relative order.
 */
function dedupeByKey(items, keyFn) {
  // A Map keeps insertion order, so its values are the first occurrences in order.
  const firstByKey = new Map();
  for (const item of items) {
    const key = keyFn(item);
    if (!firstByKey.has(key)) {
      firstByKey.set(key, item);
    }
  }
  return [...firstByKey.values()];
}
1926
/**
 * Concatenates two lists and removes duplicates by key, keeping the earliest
 * occurrence (so `existing` entries win over `incoming` ones).
 *
 * @param {Iterable<*>} existing - Items already collected.
 * @param {Iterable<*>} incoming - Newly produced items.
 * @param {(item: *) => *} keyFn - Identity key used for de-duplication.
 * @returns {Array<*>} The combined, de-duplicated list.
 */
function mergeUniqueObjects(existing, incoming, keyFn) {
  const combined = [...existing];
  for (const item of incoming) {
    combined.push(item);
  }
  return dedupeByKey(combined, keyFn);
}
1929
/**
 * Merges `incoming` into a copy of `existing`, never overwriting a value that is
 * already present.
 *
 * Per key of `incoming`:
 * - two arrays are concatenated (no de-duplication happens here);
 * - two non-array objects are merged recursively with the same rules;
 * - otherwise the incoming value is taken only when the current one is absent
 *   (per `isPresent`) and the incoming one is present.
 *
 * Only OWN properties of the merged object count as "current" values, so
 * inherited members such as `constructor` or `toString` cannot mask incoming
 * data. An own `__proto__` key (possible on JSON-parsed input) is skipped,
 * because assigning it would replace the result's prototype instead of storing
 * data.
 *
 * @param {object} existing - Previously collected values; not mutated.
 * @param {object} incoming - New values to fold in; not mutated.
 * @returns {object} A new merged object.
 */
function mergeShallowPreferPresent(existing, incoming) {
  const merged = { ...existing };
  for (const [key, value] of Object.entries(incoming)) {
    if (key === "__proto__") continue;
    const current = Object.prototype.hasOwnProperty.call(merged, key) ? merged[key] : void 0;
    if (Array.isArray(current) && Array.isArray(value)) {
      merged[key] = [...current, ...value];
      continue;
    }
    if (current && value && typeof current === "object" && typeof value === "object" && !Array.isArray(current) && !Array.isArray(value)) {
      merged[key] = mergeShallowPreferPresent(current, value);
      continue;
    }
    if (!isPresent(current) && isPresent(value)) {
      merged[key] = value;
    }
  }
  return merged;
}
1950
/**
 * Merges two `coverage_limits` payloads.
 *
 * Scalar and nested fields follow `mergeShallowPreferPresent`; the `coverages`
 * list is then rebuilt from both inputs and de-duplicated on the
 * case-insensitive tuple (name, limit, deductible, formNumber).
 *
 * @param {object} existing - Payload collected so far.
 * @param {object} incoming - Newly extracted payload.
 * @returns {object} Merged payload whose `coverages` is always an array.
 */
function mergeCoverageLimits(existing, incoming) {
  const listOrEmpty = (candidate) => (Array.isArray(candidate) ? candidate : []);
  const coverageIdentity = (coverage) => {
    const parts = [coverage.name, coverage.limit, coverage.deductible, coverage.formNumber];
    return parts.map((part) => String(part ?? "").toLowerCase()).join("|");
  };

  const merged = mergeShallowPreferPresent(existing, incoming);
  merged.coverages = mergeUniqueObjects(
    listOrEmpty(existing.coverages),
    listOrEmpty(incoming.coverages),
    coverageIdentity
  );
  return merged;
}
1962
/**
 * Merges two `declarations` payloads.
 *
 * Non-list fields follow `mergeShallowPreferPresent`; the `fields` list is
 * rebuilt from both inputs and de-duplicated on the case-insensitive tuple
 * (field, value, section).
 *
 * @param {object} existing - Payload collected so far.
 * @param {object} incoming - Newly extracted payload.
 * @returns {object} Merged payload whose `fields` is always an array.
 */
function mergeDeclarations(existing, incoming) {
  const listOrEmpty = (candidate) => (Array.isArray(candidate) ? candidate : []);
  const fieldIdentity = (entry) => {
    const parts = [entry.field, entry.value, entry.section];
    return parts.map((part) => String(part ?? "").toLowerCase()).join("|");
  };

  const merged = mergeShallowPreferPresent(existing, incoming);
  merged.fields = mergeUniqueObjects(
    listOrEmpty(existing.fields),
    listOrEmpty(incoming.fields),
    fieldIdentity
  );
  return merged;
}
1973
/**
 * Merges two payloads that carry their items in a single list property.
 *
 * Everything is first merged with `mergeShallowPreferPresent`; the list at
 * `arrayKey` is then replaced by the de-duplicated union of both inputs' lists
 * (a non-array value on either side is treated as an empty list).
 *
 * @param {object} existing - Payload collected so far.
 * @param {object} incoming - Newly extracted payload.
 * @param {string} arrayKey - Name of the list property.
 * @param {(item: *) => *} keyFn - Identity key used to de-duplicate list items.
 * @returns {object} Merged payload whose `arrayKey` property is always an array.
 */
function mergeArrayPayload(existing, incoming, arrayKey, keyFn) {
  const listFrom = (payload) => {
    const candidate = payload[arrayKey];
    return Array.isArray(candidate) ? candidate : [];
  };
  const merged = mergeShallowPreferPresent(existing, incoming);
  merged[arrayKey] = mergeUniqueObjects(listFrom(existing), listFrom(incoming), keyFn);
  return merged;
}
1980
/**
 * Combines a new extractor result with the one already stored for that extractor.
 *
 * - A falsy side yields the other side unchanged.
 * - If either side is not an object, `incoming` replaces `existing`.
 * - `coverage_limits` and `declarations` use their dedicated mergers.
 * - `endorsements`, `exclusions`, `conditions` and `sections` merge their
 *   same-named list with a per-extractor identity key.
 * - Every other extractor name falls back to `mergeShallowPreferPresent`.
 *
 * @param {string} extractorName - Name of the extractor that produced the data.
 * @param {*} existing - Result stored so far, if any.
 * @param {*} incoming - Newly produced result.
 * @returns {*} The merged result.
 */
function mergeExtractorResult(extractorName, existing, incoming) {
  if (!existing) return incoming;
  if (!incoming) return existing;
  const bothObjects = typeof existing === "object" && typeof incoming === "object";
  if (!bothObjects) return incoming;

  // Text parts are compared case-insensitively; page numbers are used verbatim.
  const folded = (value) => String(value ?? "").toLowerCase();
  const verbatim = (value) => String(value ?? "");
  const listIdentity = new Map([
    ["endorsements", (item) => `${folded(item.formNumber)}|${folded(item.title)}|${verbatim(item.pageStart)}`],
    ["exclusions", (item) => `${folded(item.name)}|${folded(item.formNumber)}|${verbatim(item.pageNumber)}`],
    ["conditions", (item) => `${folded(item.name)}|${folded(item.conditionType)}|${verbatim(item.pageNumber)}`],
    ["sections", (item) => `${folded(item.title)}|${folded(item.type)}|${verbatim(item.pageStart)}|${verbatim(item.pageEnd)}`]
  ]);

  if (extractorName === "coverage_limits") {
    return mergeCoverageLimits(existing, incoming);
  }
  if (extractorName === "declarations") {
    return mergeDeclarations(existing, incoming);
  }
  const identityOf = listIdentity.get(extractorName);
  if (identityOf) {
    return mergeArrayPayload(existing, incoming, extractorName, identityOf);
  }
  // carrier_info, named_insured, loss_history, supplementary, premium_breakdown
  // and any unknown extractor share the generic merge.
  return mergeShallowPreferPresent(existing, incoming);
}
2026
+
1908
2027
  // src/prompts/templates/homeowners.ts
1909
2028
  var HOMEOWNERS_TEMPLATE = {
1910
2029
  type: "homeowners",
@@ -2694,57 +2813,74 @@ Return JSON only:
2694
2813
  }`;
2695
2814
  }
2696
2815
 
2697
- // src/prompts/coordinator/plan.ts
2816
+ // src/prompts/coordinator/page-map.ts
2698
2817
  import { z as z19 } from "zod";
2699
- var ExtractionTaskSchema = z19.object({
2700
- extractorName: z19.string(),
2701
- startPage: z19.number(),
2702
- endPage: z19.number(),
2703
- description: z19.string()
2704
- });
2705
- var PageMapEntrySchema = z19.object({
2706
- section: z19.string(),
2707
- pages: z19.string()
2818
+ var PageExtractorSchema = z19.enum([
2819
+ "carrier_info",
2820
+ "named_insured",
2821
+ "coverage_limits",
2822
+ "endorsements",
2823
+ "exclusions",
2824
+ "conditions",
2825
+ "premium_breakdown",
2826
+ "declarations",
2827
+ "loss_history",
2828
+ "sections",
2829
+ "supplementary"
2830
+ ]);
2831
+ var PageAssignmentSchema = z19.object({
2832
+ localPageNumber: z19.number().int().positive().describe("1-based page number within this supplied PDF chunk"),
2833
+ extractorNames: z19.array(PageExtractorSchema).describe("Focused extractors that should inspect this page"),
2834
+ confidence: z19.number().min(0).max(1).optional().describe("Confidence in the page assignment"),
2835
+ notes: z19.string().optional().describe("Short explanation of what appears on the page")
2708
2836
  });
2709
- var ExtractionPlanSchema = z19.object({
2710
- tasks: z19.array(ExtractionTaskSchema),
2711
- pageMap: z19.array(PageMapEntrySchema).optional()
2837
+ var PageMapChunkSchema = z19.object({
2838
+ pages: z19.array(PageAssignmentSchema)
2712
2839
  });
2713
- function buildPlanPrompt(templateHints) {
2714
- return `You are planning the extraction of an insurance document. You have already classified this document. Now scan the full document and create a page map + extraction plan.
2840
+ function buildPageMapPrompt(templateHints, startPage, endPage) {
2841
+ return `You are mapping insurance document pages to focused extractors.
2842
+
2843
+ These supplied pages are ORIGINAL DOCUMENT PAGES ${startPage}-${endPage}.
2715
2844
 
2716
2845
  DOCUMENT TYPE HINTS:
2717
2846
  ${templateHints}
2718
2847
 
2719
- For each section of the document, decide which extractor should handle it and which pages to send.
2848
+ For each page in this supplied PDF chunk, decide which extractor(s) should inspect it.
2720
2849
 
2721
2850
  Available extractors:
2722
- - carrier_info: Carrier name, legal name, NAIC, AM Best rating, admitted status, MGA, underwriter
2723
- - named_insured: Insured name, DBA, address, entity type, FEIN, SIC/NAICS codes, additional named insureds
2724
- - coverage_limits: Coverage names, limits, deductibles, coverage form, triggers
2725
- - endorsements: Endorsement forms, titles, types, content, affected parties
2726
- - exclusions: Exclusion titles, content, applicability
2727
- - conditions: Policy conditions (duties after loss, cancellation, etc.)
2728
- - premium_breakdown: Premium amounts, taxes, fees, payment plans, rating basis
2729
- - declarations: Line-specific structured declarations data (varies by policy type)
2730
- - loss_history: Loss runs, claim records, experience modification
2731
- - sections: Raw section content (for sections that don't fit other extractors)
2732
- - supplementary: Regulatory context, contacts, claims contacts, third-party administrators
2851
+ - carrier_info
2852
+ - named_insured
2853
+ - coverage_limits
2854
+ - endorsements
2855
+ - exclusions
2856
+ - conditions
2857
+ - premium_breakdown
2858
+ - declarations
2859
+ - loss_history
2860
+ - sections
2861
+ - supplementary
2862
+
2863
+ Rules:
2864
+ - Use specific extractors for declarations, schedules, endorsements, exclusions, conditions, premium pages, and loss runs.
2865
+ - Use "sections" for pages that contain substantive policy text or mixed content that should still be preserved as raw sections.
2866
+ - Avoid assigning broad ranges mentally; decide page by page.
2867
+ - A page may map to multiple extractors if it legitimately contains multiple relevant sections.
2868
+ - Prefer declarations and schedules for numeric limits/deductibles over later generic form wording.
2869
+ - If a page is mostly generic form language with no declaration-specific values, do not assign "coverage_limits" unless it clearly contains schedule-specific limits.
2870
+ - Return every page in the supplied chunk exactly once.
2733
2871
 
2734
2872
  Return JSON:
2735
2873
  {
2736
- "tasks": [
2737
- { "extractorName": "carrier_info", "startPage": 1, "endPage": 2, "description": "Extract carrier details from declarations page" },
2738
- ...
2739
- ],
2740
- "pageMap": [
2741
- { "section": "declarations", "pages": "pages 1-3" },
2742
- { "section": "endorsements", "pages": "pages 15-22" }
2874
+ "pages": [
2875
+ {
2876
+ "localPageNumber": 1,
2877
+ "extractorNames": ["declarations", "carrier_info", "named_insured", "coverage_limits"],
2878
+ "confidence": 0.96,
2879
+ "notes": "Declarations page with insured, policy period, and scheduled limits"
2880
+ }
2743
2881
  ]
2744
2882
  }
2745
2883
 
2746
- Create tasks that cover the entire document. Prefer specific extractors over generic "sections" where possible. Keep page ranges tight \u2014 only include pages relevant to each extractor.
2747
-
2748
2884
  Respond with JSON only.`;
2749
2885
  }
2750
2886
 
@@ -2753,6 +2889,7 @@ import { z as z20 } from "zod";
2753
2889
  var ReviewResultSchema = z20.object({
2754
2890
  complete: z20.boolean(),
2755
2891
  missingFields: z20.array(z20.string()),
2892
+ qualityIssues: z20.array(z20.string()).optional(),
2756
2893
  additionalTasks: z20.array(z20.object({
2757
2894
  extractorName: z20.string(),
2758
2895
  startPage: z20.number(),
@@ -2760,8 +2897,8 @@ var ReviewResultSchema = z20.object({
2760
2897
  description: z20.string()
2761
2898
  }))
2762
2899
  });
2763
- function buildReviewPrompt(templateExpected, extractedKeys) {
2764
- return `You are reviewing an extraction for completeness. Compare what was expected vs what was found.
2900
+ function buildReviewPrompt(templateExpected, extractedKeys, extractionSummary, pageMapSummary) {
2901
+ return `You are reviewing an extraction for completeness and quality. Compare what was expected vs what was found.
2765
2902
 
2766
2903
  EXPECTED FIELDS (from document type template):
2767
2904
  ${templateExpected.map((f) => `- ${f}`).join("\n")}
@@ -2769,21 +2906,36 @@ ${templateExpected.map((f) => `- ${f}`).join("\n")}
2769
2906
  FIELDS ALREADY EXTRACTED:
2770
2907
  ${extractedKeys.map((f) => `- ${f}`).join("\n")}
2771
2908
 
2909
+ PAGE MAP SUMMARY:
2910
+ ${pageMapSummary}
2911
+
2912
+ CURRENT EXTRACTION SUMMARY:
2913
+ ${extractionSummary}
2914
+
2772
2915
  Determine:
2773
- 1. Is the extraction complete enough? (required fields present = complete)
2916
+ 1. Is the extraction complete enough?
2774
2917
  2. What fields are missing?
2775
- 3. Should any additional extraction tasks be dispatched?
2918
+ 3. What quality issues are present?
2919
+ 4. Should any additional extraction tasks be dispatched?
2920
+
2921
+ Mark the extraction as NOT complete if any of these are true:
2922
+ - required fields are missing
2923
+ - extracted values are generic placeholders like "shown in declarations", "per schedule", "if applicable", "as stated"
2924
+ - coverage limits or deductibles appear to come from generic form language instead of declaration/schedule-specific values
2925
+ - page assignments suggest declaration, schedule, endorsement, exclusion, or condition pages were not actually extracted with the matching focused extractor
2926
+ - a focused extractor exists but returned too little substance for the relevant pages
2776
2927
 
2777
2928
  Return JSON:
2778
2929
  {
2779
2930
  "complete": boolean,
2780
2931
  "missingFields": ["field1", "field2"],
2932
+ "qualityIssues": ["issue 1", "issue 2"],
2781
2933
  "additionalTasks": [
2782
2934
  { "extractorName": "...", "startPage": N, "endPage": N, "description": "..." }
2783
2935
  ]
2784
2936
  }
2785
2937
 
2786
- If all required fields are present, set complete=true even if some optional fields are missing.
2938
+ Use the page map to target follow-up extraction pages precisely. Prefer narrow, declaration/schedule-focused follow-up tasks over broad page ranges.
2787
2939
 
2788
2940
  Respond with JSON only.`;
2789
2941
  }
@@ -3331,17 +3483,125 @@ function createExtractor(config) {
3331
3483
  } = config;
3332
3484
  const limit = pLimit(concurrency);
3333
3485
  let totalUsage = { inputTokens: 0, outputTokens: 0 };
3486
+ let modelCalls = 0;
3487
+ let callsWithUsage = 0;
3488
+ let callsMissingUsage = 0;
3334
3489
  function trackUsage(usage) {
3490
+ modelCalls += 1;
3335
3491
  if (usage) {
3492
+ callsWithUsage += 1;
3336
3493
  totalUsage.inputTokens += usage.inputTokens;
3337
3494
  totalUsage.outputTokens += usage.outputTokens;
3338
3495
  onTokenUsage?.(usage);
3496
+ } else {
3497
+ callsMissingUsage += 1;
3498
+ }
3499
+ }
3500
/**
 * Folds a freshly produced extractor result into the shared memory map instead
 * of overwriting the entry stored under the same extractor name.
 *
 * @param {string} name - Extractor name; also the key in `memory`.
 * @param {*} data - Result returned by the extractor.
 * @param {Map<string, *>} memory - Accumulated results; updated in place.
 * @returns {void}
 */
function mergeMemoryResult(name, data, memory) {
  const combined = mergeExtractorResult(name, memory.get(name), data);
  memory.set(name, combined);
}
3504
/**
 * Builds a compact, pretty-printed JSON summary of what has been extracted so far.
 *
 * The summary lists the extractor keys in memory (excluding "classify"), item
 * counts for the list-style extractors, and up to 12 coverage samples reduced
 * to name / limit / deductible / formNumber.
 *
 * @param {Map<string, *>} memory - Accumulated extractor results.
 * @returns {string} JSON text indented with two spaces.
 */
function summarizeExtraction(memory) {
  // Returns the named list of an extractor result, or [] when it is not an array.
  const listOf = (extractorName, listKey) => {
    const candidate = memory.get(extractorName)?.[listKey];
    return Array.isArray(candidate) ? candidate : [];
  };

  const coverages = listOf("coverage_limits", "coverages");
  const coverageSamples = coverages
    .slice(0, 12)
    .map(({ name, limit, deductible, formNumber }) => ({ name, limit, deductible, formNumber }));

  const summary = {
    extractedKeys: Array.from(memory.keys()).filter((key) => key !== "classify"),
    declarationFieldCount: listOf("declarations", "fields").length,
    coverageCount: coverages.length,
    coverageSamples,
    endorsementCount: listOf("endorsements", "endorsements").length,
    exclusionCount: listOf("exclusions", "exclusions").length,
    conditionCount: listOf("conditions", "conditions").length,
    sectionCount: listOf("sections", "sections").length
  };
  return JSON.stringify(summary, null, 2);
}
3528
/**
 * Renders page assignments as one line per extractor, listing the page numbers
 * assigned to it in the order the assignments were given.
 *
 * @param {Array<{ localPageNumber: number, extractorNames: string[] }>} pageAssignments
 *   Page-to-extractor assignments.
 * @returns {string} Lines of the form "<extractor>: pages 1, 2", or a fixed
 *   notice when no extractor received any page.
 */
function formatPageMapSummary(pageAssignments) {
  const pagesByExtractor = new Map();
  for (const { extractorNames, localPageNumber } of pageAssignments) {
    for (const extractorName of extractorNames) {
      const pages = pagesByExtractor.get(extractorName);
      if (pages) {
        pages.push(localPageNumber);
      } else {
        pagesByExtractor.set(extractorName, [localPageNumber]);
      }
    }
  }
  if (pagesByExtractor.size === 0) {
    return "No page assignments available.";
  }
  const lines = [];
  for (const [extractorName, pages] of pagesByExtractor) {
    lines.push(`${extractorName}: pages ${pages.join(", ")}`);
  }
  return lines.join("\n");
}
3538
/**
 * Formats the document-type hints block that is embedded into prompts.
 *
 * @param {string} primaryType - Primary policy type.
 * @param {string} documentType - Document type label.
 * @param {number} pageCount - Total number of pages in the document.
 * @param {{ expectedSections: string[], pageHints: Object<string, *> }} template
 *   Template supplying the expected sections and per-section page hints.
 * @returns {string} Four newline-separated hint lines.
 */
function buildTemplateHints(primaryType, documentType, pageCount, template) {
  const sectionText = template.expectedSections.join(", ");
  const pageHintText = Object.entries(template.pageHints)
    .map(([section, hint]) => `${section}: ${hint}`)
    .join("; ");
  return `Document type: ${primaryType} ${documentType}\nExpected sections: ${sectionText}\nPage hints: ${pageHintText}\nTotal pages: ${pageCount}`;
}
3546
/**
 * Collapses a list of page numbers into inclusive ranges of consecutive pages.
 *
 * Duplicates are ignored and input order does not matter.
 *
 * @param {number[]} pages - Page numbers, possibly unsorted or repeated.
 * @returns {Array<{ startPage: number, endPage: number }>} Ranges in ascending order.
 */
function groupContiguousPages(pages) {
  if (pages.length === 0) {
    return [];
  }
  const ordered = Array.from(new Set(pages)).sort((a, b) => a - b);
  const ranges = [];
  for (const page of ordered) {
    const openRange = ranges[ranges.length - 1];
    if (openRange !== undefined && page === openRange.endPage + 1) {
      // Extends the run currently being built.
      openRange.endPage = page;
    } else {
      ranges.push({ startPage: page, endPage: page });
    }
  }
  return ranges;
}
3565
/**
 * Converts per-page extractor assignments into an extraction plan.
 *
 * Each assignment adds its page to every extractor it names (or to "sections"
 * when it names none). Pages from 1 to `pageCount` that no assignment covers
 * are also routed to "sections". Each extractor's pages are then grouped into
 * contiguous ranges, one task per range.
 *
 * Assignments whose page number is not an integer >= 1, or exceeds an integer
 * `pageCount`, are ignored: they come from model output and would otherwise
 * produce tasks for pages that do not exist.
 *
 * @param {Array<{ localPageNumber: number, extractorNames: string[] }>} pageAssignments
 *   Page assignments; `localPageNumber` is matched against the range
 *   1..pageCount here.
 * @param {number} pageCount - Total number of pages in the document.
 * @returns {{ tasks: Array<{ extractorName: string, startPage: number, endPage: number, description: string }>,
 *             pageMap: Array<{ section: string, pages: string }> }}
 *   Tasks sorted by start page and then extractor name, plus a per-extractor page listing.
 */
function buildPlanFromPageAssignments(pageAssignments, pageCount) {
  const hasPageLimit = Number.isInteger(pageCount);
  const isUsablePage = (page) => Number.isInteger(page) && page >= 1 && (!hasPageLimit || page <= pageCount);

  const extractorPages = /* @__PURE__ */ new Map();
  const coveredPages = /* @__PURE__ */ new Set();
  // Appends in place rather than re-spreading the list for every page.
  const addPage = (extractorName, page) => {
    const pages = extractorPages.get(extractorName);
    if (pages) {
      pages.push(page);
    } else {
      extractorPages.set(extractorName, [page]);
    }
    coveredPages.add(page);
  };

  for (const assignment of pageAssignments) {
    const page = assignment.localPageNumber;
    if (!isUsablePage(page)) continue;
    const extractors = assignment.extractorNames.length > 0 ? assignment.extractorNames : ["sections"];
    for (const extractorName of extractors) {
      addPage(extractorName, page);
    }
  }
  // Any page nobody claimed is still preserved as raw section content.
  for (let page = 1; page <= pageCount; page += 1) {
    if (!coveredPages.has(page)) {
      addPage("sections", page);
    }
  }

  const tasks = [...extractorPages.entries()].flatMap(
    ([extractorName, pages]) => groupContiguousPages(pages).map(({ startPage, endPage }) => ({
      extractorName,
      startPage,
      endPage,
      description: `Page-mapped ${extractorName} extraction for pages ${startPage}-${endPage}`
    }))
  ).sort((a, b) => a.startPage - b.startPage || a.extractorName.localeCompare(b.extractorName));
  return {
    tasks,
    pageMap: [...extractorPages.entries()].map(([section, pages]) => ({
      section,
      pages: `pages ${[...new Set(pages)].sort((a, b) => a - b).join(", ")}`
    }))
  };
}
3341
3598
  async function extract(pdfBase64, documentId, options) {
3342
3599
  const id = documentId ?? `doc-${Date.now()}`;
3343
3600
  const memory = /* @__PURE__ */ new Map();
3344
3601
  totalUsage = { inputTokens: 0, outputTokens: 0 };
3602
+ modelCalls = 0;
3603
+ callsWithUsage = 0;
3604
+ callsMissingUsage = 0;
3345
3605
  const pipelineCtx = createPipelineContext({
3346
3606
  id,
3347
3607
  onSave: onCheckpointSave,
@@ -3392,40 +3652,73 @@ function createExtractor(config) {
3392
3652
  const primaryType = policyTypes[0] ?? "other";
3393
3653
  const template = getTemplate(primaryType);
3394
3654
  const pageCount = resumed?.pageCount ?? await getPdfPageCount(pdfBase64);
3655
+ const templateHints = buildTemplateHints(primaryType, documentType, pageCount, template);
3656
+ let pageAssignments;
3657
+ if (resumed?.pageAssignments && pipelineCtx.isPhaseComplete("page_map")) {
3658
+ pageAssignments = resumed.pageAssignments;
3659
+ onProgress?.("Resuming from checkpoint (page map complete)...");
3660
+ } else {
3661
+ onProgress?.(`Mapping document pages for ${primaryType} ${documentType}...`);
3662
+ const chunkSize = 8;
3663
+ const collectedAssignments = [];
3664
+ for (let startPage = 1; startPage <= pageCount; startPage += chunkSize) {
3665
+ const endPage = Math.min(pageCount, startPage + chunkSize - 1);
3666
+ const pagesPdf = await extractPageRange(pdfBase64, startPage, endPage);
3667
+ const mapResponse = await safeGenerateObject(
3668
+ generateObject,
3669
+ {
3670
+ prompt: buildPageMapPrompt(templateHints, startPage, endPage),
3671
+ schema: PageMapChunkSchema,
3672
+ maxTokens: 2048,
3673
+ providerOptions: { ...providerOptions, pdfBase64: pagesPdf }
3674
+ },
3675
+ {
3676
+ fallback: {
3677
+ pages: Array.from({ length: endPage - startPage + 1 }, (_, index) => ({
3678
+ localPageNumber: index + 1,
3679
+ extractorNames: index === 0 && startPage === 1 ? ["carrier_info", "named_insured", "declarations", "coverage_limits"] : ["sections"],
3680
+ confidence: 0,
3681
+ notes: "Fallback page assignment"
3682
+ }))
3683
+ },
3684
+ log,
3685
+ onError: (err, attempt) => log?.(`Page map attempt ${attempt + 1} failed for pages ${startPage}-${endPage}: ${err}`)
3686
+ }
3687
+ );
3688
+ trackUsage(mapResponse.usage);
3689
+ for (const assignment of mapResponse.object.pages) {
3690
+ collectedAssignments.push({
3691
+ ...assignment,
3692
+ localPageNumber: startPage + assignment.localPageNumber - 1
3693
+ });
3694
+ }
3695
+ }
3696
+ pageAssignments = collectedAssignments.length > 0 ? collectedAssignments : Array.from({ length: pageCount }, (_, index) => ({
3697
+ localPageNumber: index + 1,
3698
+ extractorNames: index === 0 ? ["carrier_info", "named_insured", "declarations", "coverage_limits"] : ["sections"],
3699
+ confidence: 0,
3700
+ notes: "Full-document fallback page assignment"
3701
+ }));
3702
+ await pipelineCtx.save("page_map", {
3703
+ id,
3704
+ pageCount,
3705
+ classifyResult,
3706
+ pageAssignments,
3707
+ memory: Object.fromEntries(memory)
3708
+ });
3709
+ }
3395
3710
  let plan;
3396
3711
  if (resumed?.plan && pipelineCtx.isPhaseComplete("plan")) {
3397
3712
  plan = resumed.plan;
3398
3713
  onProgress?.("Resuming from checkpoint (plan complete)...");
3399
3714
  } else {
3400
- onProgress?.(`Planning extraction for ${primaryType} ${documentType}...`);
3401
- const templateHints = [
3402
- `Document type: ${primaryType} ${documentType}`,
3403
- `Expected sections: ${template.expectedSections.join(", ")}`,
3404
- `Page hints: ${Object.entries(template.pageHints).map(([k, v]) => `${k}: ${v}`).join("; ")}`,
3405
- `Total pages: ${pageCount}`
3406
- ].join("\n");
3407
- const planResponse = await safeGenerateObject(
3408
- generateObject,
3409
- {
3410
- prompt: buildPlanPrompt(templateHints),
3411
- schema: ExtractionPlanSchema,
3412
- maxTokens: 2048,
3413
- providerOptions: { ...providerOptions, pdfBase64 }
3414
- },
3415
- {
3416
- fallback: {
3417
- tasks: [{ extractorName: "sections", startPage: 1, endPage: pageCount, description: "Full document fallback extraction" }]
3418
- },
3419
- log,
3420
- onError: (err, attempt) => log?.(`Plan attempt ${attempt + 1} failed: ${err}`)
3421
- }
3422
- );
3423
- trackUsage(planResponse.usage);
3424
- plan = planResponse.object;
3715
+ onProgress?.(`Building extraction plan from page map for ${primaryType} ${documentType}...`);
3716
+ plan = buildPlanFromPageAssignments(pageAssignments, pageCount);
3425
3717
  await pipelineCtx.save("plan", {
3426
3718
  id,
3427
3719
  pageCount,
3428
3720
  classifyResult,
3721
+ pageAssignments,
3429
3722
  plan,
3430
3723
  memory: Object.fromEntries(memory)
3431
3724
  });
@@ -3466,13 +3759,14 @@ function createExtractor(config) {
3466
3759
  );
3467
3760
  for (const result of extractorResults) {
3468
3761
  if (result) {
3469
- memory.set(result.name, result.data);
3762
+ mergeMemoryResult(result.name, result.data, memory);
3470
3763
  }
3471
3764
  }
3472
3765
  await pipelineCtx.save("extract", {
3473
3766
  id,
3474
3767
  pageCount,
3475
3768
  classifyResult,
3769
+ pageAssignments,
3476
3770
  plan,
3477
3771
  memory: Object.fromEntries(memory)
3478
3772
  });
@@ -3480,21 +3774,26 @@ function createExtractor(config) {
3480
3774
  if (!pipelineCtx.isPhaseComplete("review")) {
3481
3775
  for (let round = 0; round < maxReviewRounds; round++) {
3482
3776
  const extractedKeys = [...memory.keys()].filter((k) => k !== "classify");
3777
+ const extractionSummary = summarizeExtraction(memory);
3778
+ const pageMapSummary = formatPageMapSummary(pageAssignments);
3483
3779
  const reviewResponse = await safeGenerateObject(
3484
3780
  generateObject,
3485
3781
  {
3486
- prompt: buildReviewPrompt(template.required, extractedKeys),
3782
+ prompt: buildReviewPrompt(template.required, extractedKeys, extractionSummary, pageMapSummary),
3487
3783
  schema: ReviewResultSchema,
3488
- maxTokens: 1024,
3489
- providerOptions
3784
+ maxTokens: 1536,
3785
+ providerOptions: { ...providerOptions, pdfBase64 }
3490
3786
  },
3491
3787
  {
3492
- fallback: { complete: true, missingFields: [], additionalTasks: [] },
3788
+ fallback: { complete: true, missingFields: [], qualityIssues: [], additionalTasks: [] },
3493
3789
  log,
3494
3790
  onError: (err, attempt) => log?.(`Review round ${round + 1} attempt ${attempt + 1} failed: ${err}`)
3495
3791
  }
3496
3792
  );
3497
3793
  trackUsage(reviewResponse.usage);
3794
+ if (reviewResponse.object.qualityIssues?.length) {
3795
+ await log?.(`Review round ${round + 1} quality issues: ${reviewResponse.object.qualityIssues.join("; ")}`);
3796
+ }
3498
3797
  if (reviewResponse.object.complete || reviewResponse.object.additionalTasks.length === 0) {
3499
3798
  onProgress?.("Extraction complete.");
3500
3799
  break;
@@ -3529,7 +3828,7 @@ function createExtractor(config) {
3529
3828
  );
3530
3829
  for (const result of followUpResults) {
3531
3830
  if (result) {
3532
- memory.set(result.name, result.data);
3831
+ mergeMemoryResult(result.name, result.data, memory);
3533
3832
  }
3534
3833
  }
3535
3834
  }
@@ -3537,6 +3836,7 @@ function createExtractor(config) {
3537
3836
  id,
3538
3837
  pageCount,
3539
3838
  classifyResult,
3839
+ pageAssignments,
3540
3840
  plan,
3541
3841
  memory: Object.fromEntries(memory)
3542
3842
  });
@@ -3547,6 +3847,7 @@ function createExtractor(config) {
3547
3847
  id,
3548
3848
  pageCount,
3549
3849
  classifyResult,
3850
+ pageAssignments,
3550
3851
  plan,
3551
3852
  memory: Object.fromEntries(memory),
3552
3853
  document
@@ -3560,10 +3861,19 @@ function createExtractor(config) {
3560
3861
  trackUsage(formatResult.usage);
3561
3862
  const chunks = chunkDocument(formatResult.document);
3562
3863
  const finalCheckpoint = pipelineCtx.getCheckpoint();
3864
+ if (callsMissingUsage > 0) {
3865
+ await log?.(`Token usage was unavailable for ${callsMissingUsage}/${modelCalls} model calls. Check that your provider callbacks return usage.`);
3866
+ onProgress?.(`Token usage unavailable for ${callsMissingUsage}/${modelCalls} model calls.`);
3867
+ }
3563
3868
  return {
3564
3869
  document: formatResult.document,
3565
3870
  chunks,
3566
3871
  tokenUsage: totalUsage,
3872
+ usageReporting: {
3873
+ modelCalls,
3874
+ callsWithUsage,
3875
+ callsMissingUsage
3876
+ },
3567
3877
  checkpoint: finalCheckpoint
3568
3878
  };
3569
3879
  }