@claritylabs/cl-sdk 0.8.1 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +49 -36
- package/dist/index.d.mts +24 -0
- package/dist/index.d.ts +24 -0
- package/dist/index.js +382 -72
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +382 -72
- package/dist/index.mjs.map +1 -1
- package/package.json +1 -1
package/dist/index.mjs
CHANGED
|
@@ -1905,6 +1905,125 @@ Total Cost: ${doc.totalCost}` : ""}`,
|
|
|
1905
1905
|
return chunks;
|
|
1906
1906
|
}
|
|
1907
1907
|
|
|
1908
|
+
// src/extraction/merge.ts
|
|
1909
|
+
/**
 * Reports whether a value carries usable content.
 *
 * `null`/`undefined`, whitespace-only strings and empty arrays count as
 * absent; every other value (including `0`, `false` and `{}`) is present.
 *
 * @param {*} value - Candidate value.
 * @returns {boolean} `true` when the value is considered present.
 */
function isPresent(value) {
  const isNullish = value === null || value === undefined;
  if (isNullish) return false;
  if (Array.isArray(value)) return value.length !== 0;
  return typeof value === "string" ? value.trim() !== "" : true;
}
|
|
1915
|
+
/**
 * Removes duplicates from an iterable, keeping the first item seen for each key.
 *
 * @param {Iterable<*>} items - Items to filter, in priority order.
 * @param {(item: *) => *} keyFn - Produces the identity key for an item.
 * @returns {Array<*>} Unique items in first-occurrence order.
 */
function dedupeByKey(items, keyFn) {
  // A Map preserves insertion order, so its values are the first occurrences in order.
  const firstByKey = new Map();
  for (const entry of items) {
    const entryKey = keyFn(entry);
    if (!firstByKey.has(entryKey)) {
      firstByKey.set(entryKey, entry);
    }
  }
  return [...firstByKey.values()];
}
|
|
1926
|
+
/**
 * Concatenates two lists and drops later items whose key was already seen,
 * so entries from `existing` win over equal-keyed entries from `incoming`.
 *
 * @param {Iterable<*>} existing - Items already collected.
 * @param {Iterable<*>} incoming - Newly extracted items.
 * @param {(item: *) => *} keyFn - Produces the identity key for an item.
 * @returns {Array<*>} De-duplicated combined list.
 */
function mergeUniqueObjects(existing, incoming, keyFn) {
  const combined = [...existing, ...incoming];
  return dedupeByKey(combined, keyFn);
}
|
|
1929
|
+
/**
 * Merges `incoming` into a copy of `existing`, preferring values that are
 * already present.
 *
 * Per key of `incoming`:
 * - two arrays are concatenated (existing items first, no de-duplication);
 * - two plain (non-array) objects are merged recursively with the same rules;
 * - otherwise the incoming value is used only when the current value is
 *   absent and the incoming one is present (per `isPresent`).
 *
 * Only own properties of the merged copy count as "current", so fields that
 * happen to share a name with an `Object.prototype` member (`toString`,
 * `valueOf`, ...) can still be filled. An own `__proto__` key, which
 * `JSON.parse` can produce, is skipped so it cannot replace the result's
 * prototype.
 *
 * @param {object} existing - Previously merged data; not mutated.
 * @param {object} incoming - Newly extracted data; not mutated.
 * @returns {object} A new merged object.
 */
function mergeShallowPreferPresent(existing, incoming) {
  const hasOwn = Object.prototype.hasOwnProperty;
  const isPlainObject = (candidate) =>
    Boolean(candidate) && typeof candidate === "object" && !Array.isArray(candidate);

  const merged = { ...existing };
  for (const [key, value] of Object.entries(incoming)) {
    // Assigning through "__proto__" would swap the prototype instead of storing data.
    if (key === "__proto__") continue;

    // Ignore inherited members so they are not mistaken for extracted values.
    const current = hasOwn.call(merged, key) ? merged[key] : undefined;

    if (Array.isArray(current) && Array.isArray(value)) {
      merged[key] = [...current, ...value];
      continue;
    }
    if (isPlainObject(current) && isPlainObject(value)) {
      merged[key] = mergeShallowPreferPresent(current, value);
      continue;
    }
    if (!isPresent(current) && isPresent(value)) {
      merged[key] = value;
    }
  }
  return merged;
}
|
|
1950
|
+
/**
 * Merges two `coverage_limits` payloads.
 *
 * Top-level fields follow `mergeShallowPreferPresent`; the `coverages` list is
 * then rebuilt as the de-duplicated union of both inputs, where two coverages
 * are the same when name, limit, deductible and form number match
 * case-insensitively.
 *
 * @param {object} existing - Previously merged payload.
 * @param {object} incoming - Newly extracted payload.
 * @returns {object} Merged payload with a `coverages` array.
 */
function mergeCoverageLimits(existing, incoming) {
  const toList = (candidate) => (Array.isArray(candidate) ? candidate : []);
  const normalize = (part) => String(part ?? "").toLowerCase();
  const coverageKey = (coverage) => {
    const parts = [coverage.name, coverage.limit, coverage.deductible, coverage.formNumber];
    return parts.map((part) => normalize(part)).join("|");
  };

  const result = mergeShallowPreferPresent(existing, incoming);
  result.coverages = mergeUniqueObjects(
    toList(existing.coverages),
    toList(incoming.coverages),
    coverageKey
  );
  return result;
}
|
|
1962
|
+
/**
 * Merges two `declarations` payloads.
 *
 * Top-level fields follow `mergeShallowPreferPresent`; the `fields` list is
 * then rebuilt as the de-duplicated union of both inputs, keyed
 * case-insensitively by field name, value and section.
 *
 * @param {object} existing - Previously merged payload.
 * @param {object} incoming - Newly extracted payload.
 * @returns {object} Merged payload with a `fields` array.
 */
function mergeDeclarations(existing, incoming) {
  const toList = (candidate) => (Array.isArray(candidate) ? candidate : []);
  const normalize = (part) => String(part ?? "").toLowerCase();
  const fieldKey = (entry) => {
    const parts = [entry.field, entry.value, entry.section];
    return parts.map((part) => normalize(part)).join("|");
  };

  const result = mergeShallowPreferPresent(existing, incoming);
  result.fields = mergeUniqueObjects(toList(existing.fields), toList(incoming.fields), fieldKey);
  return result;
}
|
|
1973
|
+
/**
 * Merges two payloads whose main content is a single array property.
 *
 * Top-level fields follow `mergeShallowPreferPresent`; the array stored under
 * `arrayKey` is then rebuilt as the de-duplicated union of both inputs.
 *
 * @param {object} existing - Previously merged payload.
 * @param {object} incoming - Newly extracted payload.
 * @param {string} arrayKey - Name of the array property to union.
 * @param {(item: *) => *} keyFn - Produces the identity key for an array item.
 * @returns {object} Merged payload with `arrayKey` set to the unioned array.
 */
function mergeArrayPayload(existing, incoming, arrayKey, keyFn) {
  const listOf = (payload) => {
    const candidate = payload[arrayKey];
    return Array.isArray(candidate) ? candidate : [];
  };

  const result = mergeShallowPreferPresent(existing, incoming);
  result[arrayKey] = mergeUniqueObjects(listOf(existing), listOf(incoming), keyFn);
  return result;
}
|
|
1980
|
+
/**
 * Combines a new extractor result with the one already stored for the same
 * extractor.
 *
 * A falsy side yields the other side unchanged, and if either side is not an
 * object the incoming value replaces the existing one. Otherwise the merge
 * strategy depends on the extractor:
 * - `coverage_limits` -> `mergeCoverageLimits`
 * - `declarations` -> `mergeDeclarations`
 * - `endorsements`, `exclusions`, `conditions`, `sections` -> `mergeArrayPayload`
 *   on the array named after the extractor, with an extractor-specific key
 * - anything else -> `mergeShallowPreferPresent`
 *
 * @param {string} extractorName - Name of the extractor that produced the data.
 * @param {*} existing - Result already stored, if any.
 * @param {*} incoming - Newly produced result.
 * @returns {*} The merged result.
 */
function mergeExtractorResult(extractorName, existing, incoming) {
  if (!existing) return incoming;
  if (!incoming) return existing;
  const bothObjects = typeof existing === "object" && typeof incoming === "object";
  if (!bothObjects) return incoming;

  if (extractorName === "coverage_limits") return mergeCoverageLimits(existing, incoming);
  if (extractorName === "declarations") return mergeDeclarations(existing, incoming);

  // Text parts are compared case-insensitively; page numbers are compared as written.
  const text = (part) => String(part ?? "").toLowerCase();
  const page = (part) => String(part ?? "");
  const identityByArrayKey = new Map([
    ["endorsements", (item) => [text(item.formNumber), text(item.title), page(item.pageStart)].join("|")],
    ["exclusions", (item) => [text(item.name), text(item.formNumber), page(item.pageNumber)].join("|")],
    ["conditions", (item) => [text(item.name), text(item.conditionType), page(item.pageNumber)].join("|")],
    [
      "sections",
      (item) => [text(item.title), text(item.type), page(item.pageStart), page(item.pageEnd)].join("|")
    ]
  ]);

  const identity = identityByArrayKey.get(extractorName);
  if (identity !== undefined) {
    return mergeArrayPayload(existing, incoming, extractorName, identity);
  }
  return mergeShallowPreferPresent(existing, incoming);
}
|
|
2026
|
+
|
|
1908
2027
|
// src/prompts/templates/homeowners.ts
|
|
1909
2028
|
var HOMEOWNERS_TEMPLATE = {
|
|
1910
2029
|
type: "homeowners",
|
|
@@ -2694,57 +2813,74 @@ Return JSON only:
|
|
|
2694
2813
|
}`;
|
|
2695
2814
|
}
|
|
2696
2815
|
|
|
2697
|
-
// src/prompts/coordinator/
|
|
2816
|
+
// src/prompts/coordinator/page-map.ts
|
|
2698
2817
|
import { z as z19 } from "zod";
|
|
2699
|
-
var
|
|
2700
|
-
|
|
2701
|
-
|
|
2702
|
-
|
|
2703
|
-
|
|
2704
|
-
|
|
2705
|
-
|
|
2706
|
-
|
|
2707
|
-
|
|
2818
|
+
var PageExtractorSchema = z19.enum([
|
|
2819
|
+
"carrier_info",
|
|
2820
|
+
"named_insured",
|
|
2821
|
+
"coverage_limits",
|
|
2822
|
+
"endorsements",
|
|
2823
|
+
"exclusions",
|
|
2824
|
+
"conditions",
|
|
2825
|
+
"premium_breakdown",
|
|
2826
|
+
"declarations",
|
|
2827
|
+
"loss_history",
|
|
2828
|
+
"sections",
|
|
2829
|
+
"supplementary"
|
|
2830
|
+
]);
|
|
2831
|
+
var PageAssignmentSchema = z19.object({
|
|
2832
|
+
localPageNumber: z19.number().int().positive().describe("1-based page number within this supplied PDF chunk"),
|
|
2833
|
+
extractorNames: z19.array(PageExtractorSchema).describe("Focused extractors that should inspect this page"),
|
|
2834
|
+
confidence: z19.number().min(0).max(1).optional().describe("Confidence in the page assignment"),
|
|
2835
|
+
notes: z19.string().optional().describe("Short explanation of what appears on the page")
|
|
2708
2836
|
});
|
|
2709
|
-
var
|
|
2710
|
-
|
|
2711
|
-
pageMap: z19.array(PageMapEntrySchema).optional()
|
|
2837
|
+
var PageMapChunkSchema = z19.object({
|
|
2838
|
+
pages: z19.array(PageAssignmentSchema)
|
|
2712
2839
|
});
|
|
2713
|
-
function
|
|
2714
|
-
return `You are
|
|
2840
|
+
function buildPageMapPrompt(templateHints, startPage, endPage) {
|
|
2841
|
+
return `You are mapping insurance document pages to focused extractors.
|
|
2842
|
+
|
|
2843
|
+
These supplied pages are ORIGINAL DOCUMENT PAGES ${startPage}-${endPage}.
|
|
2715
2844
|
|
|
2716
2845
|
DOCUMENT TYPE HINTS:
|
|
2717
2846
|
${templateHints}
|
|
2718
2847
|
|
|
2719
|
-
For each
|
|
2848
|
+
For each page in this supplied PDF chunk, decide which extractor(s) should inspect it.
|
|
2720
2849
|
|
|
2721
2850
|
Available extractors:
|
|
2722
|
-
- carrier_info
|
|
2723
|
-
- named_insured
|
|
2724
|
-
- coverage_limits
|
|
2725
|
-
- endorsements
|
|
2726
|
-
- exclusions
|
|
2727
|
-
- conditions
|
|
2728
|
-
- premium_breakdown
|
|
2729
|
-
- declarations
|
|
2730
|
-
- loss_history
|
|
2731
|
-
- sections
|
|
2732
|
-
- supplementary
|
|
2851
|
+
- carrier_info
|
|
2852
|
+
- named_insured
|
|
2853
|
+
- coverage_limits
|
|
2854
|
+
- endorsements
|
|
2855
|
+
- exclusions
|
|
2856
|
+
- conditions
|
|
2857
|
+
- premium_breakdown
|
|
2858
|
+
- declarations
|
|
2859
|
+
- loss_history
|
|
2860
|
+
- sections
|
|
2861
|
+
- supplementary
|
|
2862
|
+
|
|
2863
|
+
Rules:
|
|
2864
|
+
- Use specific extractors for declarations, schedules, endorsements, exclusions, conditions, premium pages, and loss runs.
|
|
2865
|
+
- Use "sections" for pages that contain substantive policy text or mixed content that should still be preserved as raw sections.
|
|
2866
|
+
- Avoid assigning broad ranges mentally; decide page by page.
|
|
2867
|
+
- A page may map to multiple extractors if it legitimately contains multiple relevant sections.
|
|
2868
|
+
- Prefer declarations and schedules for numeric limits/deductibles over later generic form wording.
|
|
2869
|
+
- If a page is mostly generic form language with no declaration-specific values, do not assign "coverage_limits" unless it clearly contains schedule-specific limits.
|
|
2870
|
+
- Return every page in the supplied chunk exactly once.
|
|
2733
2871
|
|
|
2734
2872
|
Return JSON:
|
|
2735
2873
|
{
|
|
2736
|
-
"
|
|
2737
|
-
{
|
|
2738
|
-
|
|
2739
|
-
|
|
2740
|
-
|
|
2741
|
-
|
|
2742
|
-
|
|
2874
|
+
"pages": [
|
|
2875
|
+
{
|
|
2876
|
+
"localPageNumber": 1,
|
|
2877
|
+
"extractorNames": ["declarations", "carrier_info", "named_insured", "coverage_limits"],
|
|
2878
|
+
"confidence": 0.96,
|
|
2879
|
+
"notes": "Declarations page with insured, policy period, and scheduled limits"
|
|
2880
|
+
}
|
|
2743
2881
|
]
|
|
2744
2882
|
}
|
|
2745
2883
|
|
|
2746
|
-
Create tasks that cover the entire document. Prefer specific extractors over generic "sections" where possible. Keep page ranges tight \u2014 only include pages relevant to each extractor.
|
|
2747
|
-
|
|
2748
2884
|
Respond with JSON only.`;
|
|
2749
2885
|
}
|
|
2750
2886
|
|
|
@@ -2753,6 +2889,7 @@ import { z as z20 } from "zod";
|
|
|
2753
2889
|
var ReviewResultSchema = z20.object({
|
|
2754
2890
|
complete: z20.boolean(),
|
|
2755
2891
|
missingFields: z20.array(z20.string()),
|
|
2892
|
+
qualityIssues: z20.array(z20.string()).optional(),
|
|
2756
2893
|
additionalTasks: z20.array(z20.object({
|
|
2757
2894
|
extractorName: z20.string(),
|
|
2758
2895
|
startPage: z20.number(),
|
|
@@ -2760,8 +2897,8 @@ var ReviewResultSchema = z20.object({
|
|
|
2760
2897
|
description: z20.string()
|
|
2761
2898
|
}))
|
|
2762
2899
|
});
|
|
2763
|
-
function buildReviewPrompt(templateExpected, extractedKeys) {
|
|
2764
|
-
return `You are reviewing an extraction for completeness. Compare what was expected vs what was found.
|
|
2900
|
+
function buildReviewPrompt(templateExpected, extractedKeys, extractionSummary, pageMapSummary) {
|
|
2901
|
+
return `You are reviewing an extraction for completeness and quality. Compare what was expected vs what was found.
|
|
2765
2902
|
|
|
2766
2903
|
EXPECTED FIELDS (from document type template):
|
|
2767
2904
|
${templateExpected.map((f) => `- ${f}`).join("\n")}
|
|
@@ -2769,21 +2906,36 @@ ${templateExpected.map((f) => `- ${f}`).join("\n")}
|
|
|
2769
2906
|
FIELDS ALREADY EXTRACTED:
|
|
2770
2907
|
${extractedKeys.map((f) => `- ${f}`).join("\n")}
|
|
2771
2908
|
|
|
2909
|
+
PAGE MAP SUMMARY:
|
|
2910
|
+
${pageMapSummary}
|
|
2911
|
+
|
|
2912
|
+
CURRENT EXTRACTION SUMMARY:
|
|
2913
|
+
${extractionSummary}
|
|
2914
|
+
|
|
2772
2915
|
Determine:
|
|
2773
|
-
1. Is the extraction complete enough?
|
|
2916
|
+
1. Is the extraction complete enough?
|
|
2774
2917
|
2. What fields are missing?
|
|
2775
|
-
3.
|
|
2918
|
+
3. What quality issues are present?
|
|
2919
|
+
4. Should any additional extraction tasks be dispatched?
|
|
2920
|
+
|
|
2921
|
+
Mark the extraction as NOT complete if any of these are true:
|
|
2922
|
+
- required fields are missing
|
|
2923
|
+
- extracted values are generic placeholders like "shown in declarations", "per schedule", "if applicable", "as stated"
|
|
2924
|
+
- coverage limits or deductibles appear to come from generic form language instead of declaration/schedule-specific values
|
|
2925
|
+
- page assignments suggest declaration, schedule, endorsement, exclusion, or condition pages were not actually extracted with the matching focused extractor
|
|
2926
|
+
- a focused extractor exists but returned too little substance for the relevant pages
|
|
2776
2927
|
|
|
2777
2928
|
Return JSON:
|
|
2778
2929
|
{
|
|
2779
2930
|
"complete": boolean,
|
|
2780
2931
|
"missingFields": ["field1", "field2"],
|
|
2932
|
+
"qualityIssues": ["issue 1", "issue 2"],
|
|
2781
2933
|
"additionalTasks": [
|
|
2782
2934
|
{ "extractorName": "...", "startPage": N, "endPage": N, "description": "..." }
|
|
2783
2935
|
]
|
|
2784
2936
|
}
|
|
2785
2937
|
|
|
2786
|
-
|
|
2938
|
+
Use the page map to target follow-up extraction pages precisely. Prefer narrow, declaration/schedule-focused follow-up tasks over broad page ranges.
|
|
2787
2939
|
|
|
2788
2940
|
Respond with JSON only.`;
|
|
2789
2941
|
}
|
|
@@ -3331,17 +3483,125 @@ function createExtractor(config) {
|
|
|
3331
3483
|
} = config;
|
|
3332
3484
|
const limit = pLimit(concurrency);
|
|
3333
3485
|
let totalUsage = { inputTokens: 0, outputTokens: 0 };
|
|
3486
|
+
let modelCalls = 0;
|
|
3487
|
+
let callsWithUsage = 0;
|
|
3488
|
+
let callsMissingUsage = 0;
|
|
3334
3489
|
/**
 * Records the outcome of one model call in the enclosing usage counters.
 *
 * Every call increments `modelCalls`. When `usage` is provided its token
 * counts are added to `totalUsage`, `callsWithUsage` is incremented and the
 * optional `onTokenUsage` callback is invoked with the original object;
 * otherwise `callsMissingUsage` is incremented.
 *
 * A counter that is missing or not a finite number is treated as 0, so a
 * partially filled usage object cannot turn the running totals into `NaN`.
 *
 * @param {{ inputTokens?: number, outputTokens?: number } | null | undefined} usage
 *   Token usage reported for the call, if any.
 * @returns {void}
 */
function trackUsage(usage) {
  modelCalls += 1;
  if (!usage) {
    callsMissingUsage += 1;
    return;
  }
  const toCount = (tokens) => (Number.isFinite(tokens) ? tokens : 0);
  callsWithUsage += 1;
  totalUsage.inputTokens += toCount(usage.inputTokens);
  totalUsage.outputTokens += toCount(usage.outputTokens);
  onTokenUsage?.(usage);
}
|
|
3500
|
+
/**
 * Stores an extractor result in `memory`, merging it with whatever is already
 * stored under the same name instead of overwriting it.
 *
 * @param {string} name - Extractor name used as the memory key.
 * @param {*} data - Newly produced extractor result.
 * @param {Map<string, *>} memory - Accumulated results; updated in place.
 * @returns {void}
 */
function mergeMemoryResult(name, data, memory) {
  const previous = memory.get(name);
  const combined = mergeExtractorResult(name, previous, data);
  memory.set(name, combined);
}
|
|
3504
|
+
/**
 * Builds a compact, pretty-printed JSON summary of what has been extracted so
 * far.
 *
 * The summary lists the stored extractor keys (excluding `classify`), the
 * item counts of the main array payloads, and up to 12 coverage samples
 * reduced to name, limit, deductible and form number.
 *
 * @param {Map<string, *>} memory - Accumulated extractor results.
 * @returns {string} JSON text indented with two spaces.
 */
function summarizeExtraction(memory) {
  // Returns the array stored at memory[extractorName][arrayKey], or [] when missing or malformed.
  const listFrom = (extractorName, arrayKey) => {
    const candidate = memory.get(extractorName)?.[arrayKey];
    return Array.isArray(candidate) ? candidate : [];
  };

  const coverages = listFrom("coverage_limits", "coverages");
  const coverageSamples = coverages.slice(0, 12).map((coverage) => {
    const { name, limit, deductible, formNumber } = coverage;
    return { name, limit, deductible, formNumber };
  });

  const summary = {
    extractedKeys: Array.from(memory.keys()).filter((key) => key !== "classify"),
    declarationFieldCount: listFrom("declarations", "fields").length,
    coverageCount: coverages.length,
    coverageSamples,
    endorsementCount: listFrom("endorsements", "endorsements").length,
    exclusionCount: listFrom("exclusions", "exclusions").length,
    conditionCount: listFrom("conditions", "conditions").length,
    sectionCount: listFrom("sections", "sections").length
  };
  return JSON.stringify(summary, null, 2);
}
|
|
3528
|
+
/**
 * Renders the page assignments as one line per extractor, for use in the
 * review prompt.
 *
 * Each line has the form `<extractor>: pages 1, 2, 5`. Extractors appear in
 * the order they are first seen; the pages of each extractor are
 * de-duplicated and sorted ascending so repeated or out-of-order assignments
 * do not clutter the summary.
 *
 * @param {Array<{ localPageNumber: number, extractorNames: string[] }>} pageAssignments
 *   Page-to-extractor assignments.
 * @returns {string} Newline-separated summary, or a fixed notice when no page
 *   is assigned to any extractor.
 */
function formatPageMapSummary(pageAssignments) {
  const pagesByExtractor = new Map();
  for (const assignment of pageAssignments) {
    for (const extractorName of assignment.extractorNames) {
      // Accumulate in place; a Set removes repeated pages for the same extractor.
      let pages = pagesByExtractor.get(extractorName);
      if (pages === undefined) {
        pages = new Set();
        pagesByExtractor.set(extractorName, pages);
      }
      pages.add(assignment.localPageNumber);
    }
  }
  if (pagesByExtractor.size === 0) return "No page assignments available.";
  return [...pagesByExtractor.entries()]
    .map(([extractorName, pages]) => {
      const ordered = [...pages].sort((a, b) => a - b);
      return `${extractorName}: pages ${ordered.join(", ")}`;
    })
    .join("\n");
}
|
|
3538
|
+
/**
 * Formats the document-type hints that are embedded in the page-map prompt.
 *
 * @param {string} primaryType - Primary policy type.
 * @param {string} documentType - Document type label.
 * @param {number} pageCount - Total number of pages in the document.
 * @param {{ expectedSections: string[], pageHints: Object<string, *> }} template
 *   Template supplying the expected sections and per-section page hints.
 * @returns {string} Four newline-separated hint lines.
 */
function buildTemplateHints(primaryType, documentType, pageCount, template) {
  const sectionList = template.expectedSections.join(", ");
  const hintPairs = [];
  for (const [hintName, hintText] of Object.entries(template.pageHints)) {
    hintPairs.push(`${hintName}: ${hintText}`);
  }
  const hintList = hintPairs.join("; ");
  return `Document type: ${primaryType} ${documentType}
Expected sections: ${sectionList}
Page hints: ${hintList}
Total pages: ${pageCount}`;
}
|
|
3546
|
+
/**
 * Collapses a list of page numbers into ascending ranges of consecutive pages.
 *
 * Duplicates are ignored and the input order does not matter.
 *
 * @param {number[]} pages - Page numbers in any order.
 * @returns {Array<{ startPage: number, endPage: number }>} Inclusive ranges,
 *   sorted ascending; empty when `pages` is empty.
 */
function groupContiguousPages(pages) {
  if (pages.length === 0) return [];

  const ordered = Array.from(new Set(pages)).sort((left, right) => left - right);
  const ranges = [];
  for (const page of ordered) {
    const open = ranges[ranges.length - 1];
    if (open !== undefined && page === open.endPage + 1) {
      // Page directly follows the open range: extend it.
      open.endPage = page;
    } else {
      ranges.push({ startPage: page, endPage: page });
    }
  }
  return ranges;
}
|
|
3565
|
+
/**
 * Turns per-page extractor assignments into an extraction plan.
 *
 * Pages are grouped per extractor, pages with no extractor (or not mentioned
 * at all) fall back to the `sections` extractor, and each extractor's pages
 * are collapsed into contiguous ranges that become tasks.
 *
 * Page numbers come from model output, so assignments whose page is not an
 * integer within `1..pageCount` are ignored; the fallback still guarantees
 * that every real page is covered. When `pageCount` is not a finite number no
 * upper bound is enforced.
 *
 * @param {Array<{ localPageNumber: number, extractorNames: string[] }>} pageAssignments
 *   Page-to-extractor assignments (document page numbers, 1-based).
 * @param {number} pageCount - Total number of pages in the document.
 * @returns {{
 *   tasks: Array<{ extractorName: string, startPage: number, endPage: number, description: string }>,
 *   pageMap: Array<{ section: string, pages: string }>
 * }} Tasks sorted by start page then extractor name, plus a per-extractor page listing.
 */
function buildPlanFromPageAssignments(pageAssignments, pageCount) {
  const hasUpperBound = Number.isFinite(pageCount);
  const isUsablePage = (page) =>
    Number.isInteger(page) && page >= 1 && (!hasUpperBound || page <= pageCount);

  const extractorPages = new Map();
  const coveredPages = new Set();
  // Appends in place instead of rebuilding the page list on every insert.
  const addPage = (extractorName, page) => {
    const pages = extractorPages.get(extractorName);
    if (pages === undefined) {
      extractorPages.set(extractorName, [page]);
    } else {
      pages.push(page);
    }
    coveredPages.add(page);
  };

  for (const assignment of pageAssignments) {
    const page = assignment.localPageNumber;
    // Skip pages that cannot exist in the document rather than planning tasks for them.
    if (!isUsablePage(page)) continue;
    const extractors = assignment.extractorNames.length > 0 ? assignment.extractorNames : ["sections"];
    for (const extractorName of extractors) {
      addPage(extractorName, page);
    }
  }

  // Any page nobody claimed is still extracted as raw sections.
  for (let page = 1; page <= pageCount; page += 1) {
    if (!coveredPages.has(page)) {
      addPage("sections", page);
    }
  }

  const tasks = [...extractorPages.entries()]
    .flatMap(([extractorName, pages]) =>
      groupContiguousPages(pages).map(({ startPage, endPage }) => ({
        extractorName,
        startPage,
        endPage,
        description: `Page-mapped ${extractorName} extraction for pages ${startPage}-${endPage}`
      }))
    )
    .sort((a, b) => a.startPage - b.startPage || a.extractorName.localeCompare(b.extractorName));

  const pageMap = [...extractorPages.entries()].map(([section, pages]) => ({
    section,
    pages: `pages ${[...new Set(pages)].sort((a, b) => a - b).join(", ")}`
  }));

  return { tasks, pageMap };
}
|
|
3341
3598
|
async function extract(pdfBase64, documentId, options) {
|
|
3342
3599
|
const id = documentId ?? `doc-${Date.now()}`;
|
|
3343
3600
|
const memory = /* @__PURE__ */ new Map();
|
|
3344
3601
|
totalUsage = { inputTokens: 0, outputTokens: 0 };
|
|
3602
|
+
modelCalls = 0;
|
|
3603
|
+
callsWithUsage = 0;
|
|
3604
|
+
callsMissingUsage = 0;
|
|
3345
3605
|
const pipelineCtx = createPipelineContext({
|
|
3346
3606
|
id,
|
|
3347
3607
|
onSave: onCheckpointSave,
|
|
@@ -3392,40 +3652,73 @@ function createExtractor(config) {
|
|
|
3392
3652
|
const primaryType = policyTypes[0] ?? "other";
|
|
3393
3653
|
const template = getTemplate(primaryType);
|
|
3394
3654
|
const pageCount = resumed?.pageCount ?? await getPdfPageCount(pdfBase64);
|
|
3655
|
+
const templateHints = buildTemplateHints(primaryType, documentType, pageCount, template);
|
|
3656
|
+
let pageAssignments;
|
|
3657
|
+
if (resumed?.pageAssignments && pipelineCtx.isPhaseComplete("page_map")) {
|
|
3658
|
+
pageAssignments = resumed.pageAssignments;
|
|
3659
|
+
onProgress?.("Resuming from checkpoint (page map complete)...");
|
|
3660
|
+
} else {
|
|
3661
|
+
onProgress?.(`Mapping document pages for ${primaryType} ${documentType}...`);
|
|
3662
|
+
const chunkSize = 8;
|
|
3663
|
+
const collectedAssignments = [];
|
|
3664
|
+
for (let startPage = 1; startPage <= pageCount; startPage += chunkSize) {
|
|
3665
|
+
const endPage = Math.min(pageCount, startPage + chunkSize - 1);
|
|
3666
|
+
const pagesPdf = await extractPageRange(pdfBase64, startPage, endPage);
|
|
3667
|
+
const mapResponse = await safeGenerateObject(
|
|
3668
|
+
generateObject,
|
|
3669
|
+
{
|
|
3670
|
+
prompt: buildPageMapPrompt(templateHints, startPage, endPage),
|
|
3671
|
+
schema: PageMapChunkSchema,
|
|
3672
|
+
maxTokens: 2048,
|
|
3673
|
+
providerOptions: { ...providerOptions, pdfBase64: pagesPdf }
|
|
3674
|
+
},
|
|
3675
|
+
{
|
|
3676
|
+
fallback: {
|
|
3677
|
+
pages: Array.from({ length: endPage - startPage + 1 }, (_, index) => ({
|
|
3678
|
+
localPageNumber: index + 1,
|
|
3679
|
+
extractorNames: index === 0 && startPage === 1 ? ["carrier_info", "named_insured", "declarations", "coverage_limits"] : ["sections"],
|
|
3680
|
+
confidence: 0,
|
|
3681
|
+
notes: "Fallback page assignment"
|
|
3682
|
+
}))
|
|
3683
|
+
},
|
|
3684
|
+
log,
|
|
3685
|
+
onError: (err, attempt) => log?.(`Page map attempt ${attempt + 1} failed for pages ${startPage}-${endPage}: ${err}`)
|
|
3686
|
+
}
|
|
3687
|
+
);
|
|
3688
|
+
trackUsage(mapResponse.usage);
|
|
3689
|
+
for (const assignment of mapResponse.object.pages) {
|
|
3690
|
+
collectedAssignments.push({
|
|
3691
|
+
...assignment,
|
|
3692
|
+
localPageNumber: startPage + assignment.localPageNumber - 1
|
|
3693
|
+
});
|
|
3694
|
+
}
|
|
3695
|
+
}
|
|
3696
|
+
pageAssignments = collectedAssignments.length > 0 ? collectedAssignments : Array.from({ length: pageCount }, (_, index) => ({
|
|
3697
|
+
localPageNumber: index + 1,
|
|
3698
|
+
extractorNames: index === 0 ? ["carrier_info", "named_insured", "declarations", "coverage_limits"] : ["sections"],
|
|
3699
|
+
confidence: 0,
|
|
3700
|
+
notes: "Full-document fallback page assignment"
|
|
3701
|
+
}));
|
|
3702
|
+
await pipelineCtx.save("page_map", {
|
|
3703
|
+
id,
|
|
3704
|
+
pageCount,
|
|
3705
|
+
classifyResult,
|
|
3706
|
+
pageAssignments,
|
|
3707
|
+
memory: Object.fromEntries(memory)
|
|
3708
|
+
});
|
|
3709
|
+
}
|
|
3395
3710
|
let plan;
|
|
3396
3711
|
if (resumed?.plan && pipelineCtx.isPhaseComplete("plan")) {
|
|
3397
3712
|
plan = resumed.plan;
|
|
3398
3713
|
onProgress?.("Resuming from checkpoint (plan complete)...");
|
|
3399
3714
|
} else {
|
|
3400
|
-
onProgress?.(`
|
|
3401
|
-
|
|
3402
|
-
`Document type: ${primaryType} ${documentType}`,
|
|
3403
|
-
`Expected sections: ${template.expectedSections.join(", ")}`,
|
|
3404
|
-
`Page hints: ${Object.entries(template.pageHints).map(([k, v]) => `${k}: ${v}`).join("; ")}`,
|
|
3405
|
-
`Total pages: ${pageCount}`
|
|
3406
|
-
].join("\n");
|
|
3407
|
-
const planResponse = await safeGenerateObject(
|
|
3408
|
-
generateObject,
|
|
3409
|
-
{
|
|
3410
|
-
prompt: buildPlanPrompt(templateHints),
|
|
3411
|
-
schema: ExtractionPlanSchema,
|
|
3412
|
-
maxTokens: 2048,
|
|
3413
|
-
providerOptions: { ...providerOptions, pdfBase64 }
|
|
3414
|
-
},
|
|
3415
|
-
{
|
|
3416
|
-
fallback: {
|
|
3417
|
-
tasks: [{ extractorName: "sections", startPage: 1, endPage: pageCount, description: "Full document fallback extraction" }]
|
|
3418
|
-
},
|
|
3419
|
-
log,
|
|
3420
|
-
onError: (err, attempt) => log?.(`Plan attempt ${attempt + 1} failed: ${err}`)
|
|
3421
|
-
}
|
|
3422
|
-
);
|
|
3423
|
-
trackUsage(planResponse.usage);
|
|
3424
|
-
plan = planResponse.object;
|
|
3715
|
+
onProgress?.(`Building extraction plan from page map for ${primaryType} ${documentType}...`);
|
|
3716
|
+
plan = buildPlanFromPageAssignments(pageAssignments, pageCount);
|
|
3425
3717
|
await pipelineCtx.save("plan", {
|
|
3426
3718
|
id,
|
|
3427
3719
|
pageCount,
|
|
3428
3720
|
classifyResult,
|
|
3721
|
+
pageAssignments,
|
|
3429
3722
|
plan,
|
|
3430
3723
|
memory: Object.fromEntries(memory)
|
|
3431
3724
|
});
|
|
@@ -3466,13 +3759,14 @@ function createExtractor(config) {
|
|
|
3466
3759
|
);
|
|
3467
3760
|
for (const result of extractorResults) {
|
|
3468
3761
|
if (result) {
|
|
3469
|
-
|
|
3762
|
+
mergeMemoryResult(result.name, result.data, memory);
|
|
3470
3763
|
}
|
|
3471
3764
|
}
|
|
3472
3765
|
await pipelineCtx.save("extract", {
|
|
3473
3766
|
id,
|
|
3474
3767
|
pageCount,
|
|
3475
3768
|
classifyResult,
|
|
3769
|
+
pageAssignments,
|
|
3476
3770
|
plan,
|
|
3477
3771
|
memory: Object.fromEntries(memory)
|
|
3478
3772
|
});
|
|
@@ -3480,21 +3774,26 @@ function createExtractor(config) {
|
|
|
3480
3774
|
if (!pipelineCtx.isPhaseComplete("review")) {
|
|
3481
3775
|
for (let round = 0; round < maxReviewRounds; round++) {
|
|
3482
3776
|
const extractedKeys = [...memory.keys()].filter((k) => k !== "classify");
|
|
3777
|
+
const extractionSummary = summarizeExtraction(memory);
|
|
3778
|
+
const pageMapSummary = formatPageMapSummary(pageAssignments);
|
|
3483
3779
|
const reviewResponse = await safeGenerateObject(
|
|
3484
3780
|
generateObject,
|
|
3485
3781
|
{
|
|
3486
|
-
prompt: buildReviewPrompt(template.required, extractedKeys),
|
|
3782
|
+
prompt: buildReviewPrompt(template.required, extractedKeys, extractionSummary, pageMapSummary),
|
|
3487
3783
|
schema: ReviewResultSchema,
|
|
3488
|
-
maxTokens:
|
|
3489
|
-
providerOptions
|
|
3784
|
+
maxTokens: 1536,
|
|
3785
|
+
providerOptions: { ...providerOptions, pdfBase64 }
|
|
3490
3786
|
},
|
|
3491
3787
|
{
|
|
3492
|
-
fallback: { complete: true, missingFields: [], additionalTasks: [] },
|
|
3788
|
+
fallback: { complete: true, missingFields: [], qualityIssues: [], additionalTasks: [] },
|
|
3493
3789
|
log,
|
|
3494
3790
|
onError: (err, attempt) => log?.(`Review round ${round + 1} attempt ${attempt + 1} failed: ${err}`)
|
|
3495
3791
|
}
|
|
3496
3792
|
);
|
|
3497
3793
|
trackUsage(reviewResponse.usage);
|
|
3794
|
+
if (reviewResponse.object.qualityIssues?.length) {
|
|
3795
|
+
await log?.(`Review round ${round + 1} quality issues: ${reviewResponse.object.qualityIssues.join("; ")}`);
|
|
3796
|
+
}
|
|
3498
3797
|
if (reviewResponse.object.complete || reviewResponse.object.additionalTasks.length === 0) {
|
|
3499
3798
|
onProgress?.("Extraction complete.");
|
|
3500
3799
|
break;
|
|
@@ -3529,7 +3828,7 @@ function createExtractor(config) {
|
|
|
3529
3828
|
);
|
|
3530
3829
|
for (const result of followUpResults) {
|
|
3531
3830
|
if (result) {
|
|
3532
|
-
|
|
3831
|
+
mergeMemoryResult(result.name, result.data, memory);
|
|
3533
3832
|
}
|
|
3534
3833
|
}
|
|
3535
3834
|
}
|
|
@@ -3537,6 +3836,7 @@ function createExtractor(config) {
|
|
|
3537
3836
|
id,
|
|
3538
3837
|
pageCount,
|
|
3539
3838
|
classifyResult,
|
|
3839
|
+
pageAssignments,
|
|
3540
3840
|
plan,
|
|
3541
3841
|
memory: Object.fromEntries(memory)
|
|
3542
3842
|
});
|
|
@@ -3547,6 +3847,7 @@ function createExtractor(config) {
|
|
|
3547
3847
|
id,
|
|
3548
3848
|
pageCount,
|
|
3549
3849
|
classifyResult,
|
|
3850
|
+
pageAssignments,
|
|
3550
3851
|
plan,
|
|
3551
3852
|
memory: Object.fromEntries(memory),
|
|
3552
3853
|
document
|
|
@@ -3560,10 +3861,19 @@ function createExtractor(config) {
|
|
|
3560
3861
|
trackUsage(formatResult.usage);
|
|
3561
3862
|
const chunks = chunkDocument(formatResult.document);
|
|
3562
3863
|
const finalCheckpoint = pipelineCtx.getCheckpoint();
|
|
3864
|
+
if (callsMissingUsage > 0) {
|
|
3865
|
+
await log?.(`Token usage was unavailable for ${callsMissingUsage}/${modelCalls} model calls. Check that your provider callbacks return usage.`);
|
|
3866
|
+
onProgress?.(`Token usage unavailable for ${callsMissingUsage}/${modelCalls} model calls.`);
|
|
3867
|
+
}
|
|
3563
3868
|
return {
|
|
3564
3869
|
document: formatResult.document,
|
|
3565
3870
|
chunks,
|
|
3566
3871
|
tokenUsage: totalUsage,
|
|
3872
|
+
usageReporting: {
|
|
3873
|
+
modelCalls,
|
|
3874
|
+
callsWithUsage,
|
|
3875
|
+
callsMissingUsage
|
|
3876
|
+
},
|
|
3567
3877
|
checkpoint: finalCheckpoint
|
|
3568
3878
|
};
|
|
3569
3879
|
}
|