@claritylabs/cl-sdk 0.8.1 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +49 -36
- package/dist/index.d.mts +24 -0
- package/dist/index.d.ts +24 -0
- package/dist/index.js +382 -72
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +382 -72
- package/dist/index.mjs.map +1 -1
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -2138,6 +2138,125 @@ Total Cost: ${doc.totalCost}` : ""}`,
|
|
|
2138
2138
|
return chunks;
|
|
2139
2139
|
}
|
|
2140
2140
|
|
|
2141
|
+
// src/extraction/merge.ts
|
|
2142
|
+
/**
 * Reports whether a value carries usable content.
 *
 * `undefined`, `null`, whitespace-only strings and empty arrays count as
 * absent. Everything else (numbers, booleans, any object) counts as present.
 *
 * @param {*} value - Value to inspect.
 * @returns {boolean} `true` when the value is considered present.
 */
function isPresent(value) {
  const isMissing = value === undefined || value === null;
  if (isMissing) {
    return false;
  }
  if (typeof value === "string") {
    // Strings made only of whitespace are treated like empty strings.
    return value.trim() !== "";
  }
  return Array.isArray(value) ? value.length !== 0 : true;
}
|
|
2148
|
+
/**
 * Returns the items whose key has not been seen earlier in the sequence.
 *
 * The first item for each key wins and relative order is preserved.
 *
 * @param {Iterable<*>} items - Items to filter.
 * @param {(item: *) => *} keyFn - Produces the identity key for an item.
 * @returns {Array<*>} New array holding the first occurrence of every key.
 */
function dedupeByKey(items, keyFn) {
  const usedKeys = /* @__PURE__ */ new Set();
  return Array.from(items).filter((entry) => {
    const identity = keyFn(entry);
    if (usedKeys.has(identity)) {
      return false;
    }
    usedKeys.add(identity);
    return true;
  });
}
|
|
2159
|
+
/**
 * Concatenates two lists and removes later duplicates by key.
 *
 * Items from `existing` come first, so they win over `incoming` items that
 * share the same key.
 *
 * @param {Iterable<*>} existing - Items already collected.
 * @param {Iterable<*>} incoming - Newly extracted items.
 * @param {(item: *) => *} keyFn - Produces the identity key for an item.
 * @returns {Array<*>} De-duplicated combined list.
 */
function mergeUniqueObjects(existing, incoming, keyFn) {
  const combined = [...existing, ...incoming];
  return dedupeByKey(combined, keyFn);
}
|
|
2162
|
+
/**
 * Merges `incoming` into a copy of `existing`, keeping values that are
 * already present.
 *
 * For every own enumerable key of `incoming`:
 * - when both sides hold arrays, the arrays are concatenated (existing items
 *   first, no de-duplication);
 * - when both sides hold non-array objects, they are merged recursively with
 *   these same rules;
 * - otherwise the incoming value is taken only if the current value is not
 *   present (per `isPresent`) and the incoming value is.
 *
 * Neither argument is mutated.
 *
 * @param {object} existing - Previously accumulated payload.
 * @param {object} incoming - Newly extracted payload.
 * @returns {object} A new merged object.
 */
function mergeShallowPreferPresent(existing, incoming) {
  const merged = { ...existing };
  const isPlainObject = (candidate) =>
    Boolean(candidate) && typeof candidate === "object" && !Array.isArray(candidate);
  for (const [key, value] of Object.entries(incoming)) {
    // An own "__proto__" key (possible after JSON.parse) would go through the
    // __proto__ setter on assignment and replace the result's prototype with
    // data-controlled content, so it is never merged.
    if (key === "__proto__") continue;
    const current = merged[key];
    if (Array.isArray(current) && Array.isArray(value)) {
      merged[key] = [...current, ...value];
      continue;
    }
    if (isPlainObject(current) && isPlainObject(value)) {
      merged[key] = mergeShallowPreferPresent(current, value);
      continue;
    }
    if (!isPresent(current) && isPresent(value)) {
      merged[key] = value;
    }
  }
  return merged;
}
|
|
2183
|
+
/**
 * Merges two coverage-limit payloads.
 *
 * Top-level fields are combined with `mergeShallowPreferPresent`. The
 * `coverages` list is then replaced by the de-duplicated union of both lists,
 * where two coverages are the same when name, limit, deductible and form
 * number match case-insensitively.
 *
 * @param {object} existing - Previously accumulated payload.
 * @param {object} incoming - Newly extracted payload.
 * @returns {object} Merged payload with a de-duplicated `coverages` array.
 */
function mergeCoverageLimits(existing, incoming) {
  const normalize = (part) => String(part ?? "").toLowerCase();
  const coverageKey = ({ name, limit, deductible, formNumber }) =>
    [name, limit, deductible, formNumber].map((part) => normalize(part)).join("|");
  // A missing or non-array `coverages` is treated as an empty list.
  const coveragesOf = (payload) => (Array.isArray(payload.coverages) ? payload.coverages : []);

  const result = mergeShallowPreferPresent(existing, incoming);
  result.coverages = mergeUniqueObjects(coveragesOf(existing), coveragesOf(incoming), coverageKey);
  return result;
}
|
|
2195
|
+
/**
 * Merges two declarations payloads.
 *
 * Top-level fields are combined with `mergeShallowPreferPresent`. The
 * `fields` list is then replaced by the de-duplicated union of both lists,
 * where two entries are the same when field, value and section match
 * case-insensitively.
 *
 * @param {object} existing - Previously accumulated payload.
 * @param {object} incoming - Newly extracted payload.
 * @returns {object} Merged payload with a de-duplicated `fields` array.
 */
function mergeDeclarations(existing, incoming) {
  const normalize = (part) => String(part ?? "").toLowerCase();
  const fieldKey = (entry) =>
    `${normalize(entry.field)}|${normalize(entry.value)}|${normalize(entry.section)}`;
  // A missing or non-array `fields` is treated as an empty list.
  const fieldsOf = (payload) => (Array.isArray(payload.fields) ? payload.fields : []);

  const result = mergeShallowPreferPresent(existing, incoming);
  result.fields = mergeUniqueObjects(fieldsOf(existing), fieldsOf(incoming), fieldKey);
  return result;
}
|
|
2206
|
+
/**
 * Merges two payloads whose main content is a single array property.
 *
 * Top-level fields are combined with `mergeShallowPreferPresent`. The array
 * stored under `arrayKey` is then replaced by the de-duplicated union of both
 * sides (existing items first).
 *
 * @param {object} existing - Previously accumulated payload.
 * @param {object} incoming - Newly extracted payload.
 * @param {string} arrayKey - Property holding the item list.
 * @param {(item: *) => *} keyFn - Produces the identity key for an item.
 * @returns {object} Merged payload.
 */
function mergeArrayPayload(existing, incoming, arrayKey, keyFn) {
  // A missing or non-array value under `arrayKey` is treated as an empty list.
  const itemsOf = (payload) => {
    const candidate = payload[arrayKey];
    return Array.isArray(candidate) ? candidate : [];
  };
  const result = mergeShallowPreferPresent(existing, incoming);
  result[arrayKey] = mergeUniqueObjects(itemsOf(existing), itemsOf(incoming), keyFn);
  return result;
}
|
|
2213
|
+
/**
 * Combines a stored extractor result with a newly produced one.
 *
 * - If either side is falsy, the other side is returned unchanged.
 * - If either side is not an object, `incoming` replaces `existing`.
 * - `coverage_limits` and `declarations` use their dedicated mergers.
 * - `endorsements`, `exclusions`, `conditions` and `sections` merge the array
 *   stored under the extractor's own name, de-duplicated by a composite key.
 * - Every other extractor name falls back to `mergeShallowPreferPresent`.
 *
 * @param {string} extractorName - Name of the extractor that produced the data.
 * @param {*} existing - Result already held for this extractor, if any.
 * @param {*} incoming - Newly produced result.
 * @returns {*} The merged result.
 */
function mergeExtractorResult(extractorName, existing, incoming) {
  if (!existing) return incoming;
  if (!incoming) return existing;
  const bothAreObjects = typeof existing === "object" && typeof incoming === "object";
  if (!bothAreObjects) return incoming;

  if (extractorName === "coverage_limits") {
    return mergeCoverageLimits(existing, incoming);
  }
  if (extractorName === "declarations") {
    return mergeDeclarations(existing, incoming);
  }

  const lower = (part) => String(part ?? "").toLowerCase();
  const raw = (part) => String(part ?? "");
  // Identity keys for the array-based extractors; page numbers are compared
  // as-is, text parts case-insensitively.
  const itemKeys = /* @__PURE__ */ new Map([
    ["endorsements", (item) => [lower(item.formNumber), lower(item.title), raw(item.pageStart)].join("|")],
    ["exclusions", (item) => [lower(item.name), lower(item.formNumber), raw(item.pageNumber)].join("|")],
    ["conditions", (item) => [lower(item.name), lower(item.conditionType), raw(item.pageNumber)].join("|")],
    ["sections", (item) => [lower(item.title), lower(item.type), raw(item.pageStart), raw(item.pageEnd)].join("|")]
  ]);
  const itemKey = itemKeys.get(extractorName);
  if (itemKey !== undefined) {
    return mergeArrayPayload(existing, incoming, extractorName, itemKey);
  }
  return mergeShallowPreferPresent(existing, incoming);
}
|
|
2259
|
+
|
|
2141
2260
|
// src/prompts/templates/homeowners.ts
|
|
2142
2261
|
var HOMEOWNERS_TEMPLATE = {
|
|
2143
2262
|
type: "homeowners",
|
|
@@ -2927,57 +3046,74 @@ Return JSON only:
|
|
|
2927
3046
|
}`;
|
|
2928
3047
|
}
|
|
2929
3048
|
|
|
2930
|
-
// src/prompts/coordinator/
|
|
3049
|
+
// src/prompts/coordinator/page-map.ts
|
|
2931
3050
|
var import_zod19 = require("zod");
|
|
2932
|
-
var
|
|
2933
|
-
|
|
2934
|
-
|
|
2935
|
-
|
|
2936
|
-
|
|
2937
|
-
|
|
2938
|
-
|
|
2939
|
-
|
|
2940
|
-
|
|
3051
|
+
/**
 * Enumerates the extractor names that may appear in a page assignment's
 * `extractorNames` list.
 */
var PageExtractorSchema = import_zod19.z.enum([
  "carrier_info",
  "named_insured",
  "coverage_limits",
  "endorsements",
  "exclusions",
  "conditions",
  "premium_breakdown",
  "declarations",
  "loss_history",
  "sections",
  "supplementary"
]);
|
|
3064
|
+
/**
 * Shape of one page assignment returned by the page-mapping prompt: a
 * positive integer page number, the extractors to run on that page, and
 * optional confidence (0..1) and notes.
 */
var PageAssignmentSchema = import_zod19.z.object({
  localPageNumber: import_zod19.z
    .number()
    .int()
    .positive()
    .describe("1-based page number within this supplied PDF chunk"),
  extractorNames: import_zod19.z
    .array(PageExtractorSchema)
    .describe("Focused extractors that should inspect this page"),
  confidence: import_zod19.z
    .number()
    .min(0)
    .max(1)
    .optional()
    .describe("Confidence in the page assignment"),
  notes: import_zod19.z
    .string()
    .optional()
    .describe("Short explanation of what appears on the page")
});
|
|
2942
|
-
var
|
|
2943
|
-
|
|
2944
|
-
pageMap: import_zod19.z.array(PageMapEntrySchema).optional()
|
|
3070
|
+
/**
 * Shape of the page-mapping response for one PDF chunk: a `pages` array of
 * page assignments.
 */
var PageMapChunkSchema = import_zod19.z.object({
  pages: import_zod19.z.array(PageAssignmentSchema)
});
|
|
2946
|
-
function
|
|
2947
|
-
return `You are
|
|
3073
|
+
function buildPageMapPrompt(templateHints, startPage, endPage) {
|
|
3074
|
+
return `You are mapping insurance document pages to focused extractors.
|
|
3075
|
+
|
|
3076
|
+
These supplied pages are ORIGINAL DOCUMENT PAGES ${startPage}-${endPage}.
|
|
2948
3077
|
|
|
2949
3078
|
DOCUMENT TYPE HINTS:
|
|
2950
3079
|
${templateHints}
|
|
2951
3080
|
|
|
2952
|
-
For each
|
|
3081
|
+
For each page in this supplied PDF chunk, decide which extractor(s) should inspect it.
|
|
2953
3082
|
|
|
2954
3083
|
Available extractors:
|
|
2955
|
-
- carrier_info
|
|
2956
|
-
- named_insured
|
|
2957
|
-
- coverage_limits
|
|
2958
|
-
- endorsements
|
|
2959
|
-
- exclusions
|
|
2960
|
-
- conditions
|
|
2961
|
-
- premium_breakdown
|
|
2962
|
-
- declarations
|
|
2963
|
-
- loss_history
|
|
2964
|
-
- sections
|
|
2965
|
-
- supplementary
|
|
3084
|
+
- carrier_info
|
|
3085
|
+
- named_insured
|
|
3086
|
+
- coverage_limits
|
|
3087
|
+
- endorsements
|
|
3088
|
+
- exclusions
|
|
3089
|
+
- conditions
|
|
3090
|
+
- premium_breakdown
|
|
3091
|
+
- declarations
|
|
3092
|
+
- loss_history
|
|
3093
|
+
- sections
|
|
3094
|
+
- supplementary
|
|
3095
|
+
|
|
3096
|
+
Rules:
|
|
3097
|
+
- Use specific extractors for declarations, schedules, endorsements, exclusions, conditions, premium pages, and loss runs.
|
|
3098
|
+
- Use "sections" for pages that contain substantive policy text or mixed content that should still be preserved as raw sections.
|
|
3099
|
+
- Avoid assigning broad ranges mentally; decide page by page.
|
|
3100
|
+
- A page may map to multiple extractors if it legitimately contains multiple relevant sections.
|
|
3101
|
+
- Prefer declarations and schedules for numeric limits/deductibles over later generic form wording.
|
|
3102
|
+
- If a page is mostly generic form language with no declaration-specific values, do not assign "coverage_limits" unless it clearly contains schedule-specific limits.
|
|
3103
|
+
- Return every page in the supplied chunk exactly once.
|
|
2966
3104
|
|
|
2967
3105
|
Return JSON:
|
|
2968
3106
|
{
|
|
2969
|
-
"
|
|
2970
|
-
{
|
|
2971
|
-
|
|
2972
|
-
|
|
2973
|
-
|
|
2974
|
-
|
|
2975
|
-
|
|
3107
|
+
"pages": [
|
|
3108
|
+
{
|
|
3109
|
+
"localPageNumber": 1,
|
|
3110
|
+
"extractorNames": ["declarations", "carrier_info", "named_insured", "coverage_limits"],
|
|
3111
|
+
"confidence": 0.96,
|
|
3112
|
+
"notes": "Declarations page with insured, policy period, and scheduled limits"
|
|
3113
|
+
}
|
|
2976
3114
|
]
|
|
2977
3115
|
}
|
|
2978
3116
|
|
|
2979
|
-
Create tasks that cover the entire document. Prefer specific extractors over generic "sections" where possible. Keep page ranges tight \u2014 only include pages relevant to each extractor.
|
|
2980
|
-
|
|
2981
3117
|
Respond with JSON only.`;
|
|
2982
3118
|
}
|
|
2983
3119
|
|
|
@@ -2986,6 +3122,7 @@ var import_zod20 = require("zod");
|
|
|
2986
3122
|
var ReviewResultSchema = import_zod20.z.object({
|
|
2987
3123
|
complete: import_zod20.z.boolean(),
|
|
2988
3124
|
missingFields: import_zod20.z.array(import_zod20.z.string()),
|
|
3125
|
+
qualityIssues: import_zod20.z.array(import_zod20.z.string()).optional(),
|
|
2989
3126
|
additionalTasks: import_zod20.z.array(import_zod20.z.object({
|
|
2990
3127
|
extractorName: import_zod20.z.string(),
|
|
2991
3128
|
startPage: import_zod20.z.number(),
|
|
@@ -2993,8 +3130,8 @@ var ReviewResultSchema = import_zod20.z.object({
|
|
|
2993
3130
|
description: import_zod20.z.string()
|
|
2994
3131
|
}))
|
|
2995
3132
|
});
|
|
2996
|
-
function buildReviewPrompt(templateExpected, extractedKeys) {
|
|
2997
|
-
return `You are reviewing an extraction for completeness. Compare what was expected vs what was found.
|
|
3133
|
+
function buildReviewPrompt(templateExpected, extractedKeys, extractionSummary, pageMapSummary) {
|
|
3134
|
+
return `You are reviewing an extraction for completeness and quality. Compare what was expected vs what was found.
|
|
2998
3135
|
|
|
2999
3136
|
EXPECTED FIELDS (from document type template):
|
|
3000
3137
|
${templateExpected.map((f) => `- ${f}`).join("\n")}
|
|
@@ -3002,21 +3139,36 @@ ${templateExpected.map((f) => `- ${f}`).join("\n")}
|
|
|
3002
3139
|
FIELDS ALREADY EXTRACTED:
|
|
3003
3140
|
${extractedKeys.map((f) => `- ${f}`).join("\n")}
|
|
3004
3141
|
|
|
3142
|
+
PAGE MAP SUMMARY:
|
|
3143
|
+
${pageMapSummary}
|
|
3144
|
+
|
|
3145
|
+
CURRENT EXTRACTION SUMMARY:
|
|
3146
|
+
${extractionSummary}
|
|
3147
|
+
|
|
3005
3148
|
Determine:
|
|
3006
|
-
1. Is the extraction complete enough?
|
|
3149
|
+
1. Is the extraction complete enough?
|
|
3007
3150
|
2. What fields are missing?
|
|
3008
|
-
3.
|
|
3151
|
+
3. What quality issues are present?
|
|
3152
|
+
4. Should any additional extraction tasks be dispatched?
|
|
3153
|
+
|
|
3154
|
+
Mark the extraction as NOT complete if any of these are true:
|
|
3155
|
+
- required fields are missing
|
|
3156
|
+
- extracted values are generic placeholders like "shown in declarations", "per schedule", "if applicable", "as stated"
|
|
3157
|
+
- coverage limits or deductibles appear to come from generic form language instead of declaration/schedule-specific values
|
|
3158
|
+
- page assignments suggest declaration, schedule, endorsement, exclusion, or condition pages were not actually extracted with the matching focused extractor
|
|
3159
|
+
- a focused extractor exists but returned too little substance for the relevant pages
|
|
3009
3160
|
|
|
3010
3161
|
Return JSON:
|
|
3011
3162
|
{
|
|
3012
3163
|
"complete": boolean,
|
|
3013
3164
|
"missingFields": ["field1", "field2"],
|
|
3165
|
+
"qualityIssues": ["issue 1", "issue 2"],
|
|
3014
3166
|
"additionalTasks": [
|
|
3015
3167
|
{ "extractorName": "...", "startPage": N, "endPage": N, "description": "..." }
|
|
3016
3168
|
]
|
|
3017
3169
|
}
|
|
3018
3170
|
|
|
3019
|
-
|
|
3171
|
+
Use the page map to target follow-up extraction pages precisely. Prefer narrow, declaration/schedule-focused follow-up tasks over broad page ranges.
|
|
3020
3172
|
|
|
3021
3173
|
Respond with JSON only.`;
|
|
3022
3174
|
}
|
|
@@ -3564,17 +3716,125 @@ function createExtractor(config) {
|
|
|
3564
3716
|
} = config;
|
|
3565
3717
|
const limit = pLimit(concurrency);
|
|
3566
3718
|
let totalUsage = { inputTokens: 0, outputTokens: 0 };
|
|
3719
|
+
let modelCalls = 0;
|
|
3720
|
+
let callsWithUsage = 0;
|
|
3721
|
+
let callsMissingUsage = 0;
|
|
3567
3722
|
/**
 * Records one model call and, when available, its token usage.
 *
 * Every invocation increments `modelCalls`. A call without a usage object is
 * counted in `callsMissingUsage`; otherwise the tokens are added to
 * `totalUsage`, `callsWithUsage` is incremented, and the optional
 * `onTokenUsage` callback is notified.
 *
 * @param {{ inputTokens: number, outputTokens: number } | undefined} usage -
 *   Usage reported for the call, if any.
 * @returns {void}
 */
function trackUsage(usage) {
  modelCalls += 1;
  if (!usage) {
    callsMissingUsage += 1;
    return;
  }
  callsWithUsage += 1;
  const { inputTokens, outputTokens } = usage;
  totalUsage.inputTokens += inputTokens;
  totalUsage.outputTokens += outputTokens;
  onTokenUsage?.(usage);
}
|
|
3733
|
+
/**
 * Folds a new extractor result into the value stored under the same name.
 *
 * @param {string} name - Extractor name, used as the memory key.
 * @param {*} data - Newly produced extractor result.
 * @param {Map<string, *>} memory - Store of accumulated results; updated in place.
 * @returns {void}
 */
function mergeMemoryResult(name, data, memory) {
  const previous = memory.get(name);
  const combined = mergeExtractorResult(name, previous, data);
  memory.set(name, combined);
}
|
|
3737
|
+
/**
 * Produces a pretty-printed JSON summary of what has been extracted so far.
 *
 * The summary lists the memory keys (excluding "classify"), item counts for
 * the array-based extractors, and up to 12 coverage samples reduced to name,
 * limit, deductible and form number.
 *
 * @param {Map<string, *>} memory - Accumulated extractor results by name.
 * @returns {string} JSON text indented with two spaces.
 */
function summarizeExtraction(memory) {
  // Reads `memory.get(extractorName)[field]`, treating anything that is not
  // an array as an empty list.
  const listAt = (extractorName, field) => {
    const candidate = memory.get(extractorName)?.[field];
    return Array.isArray(candidate) ? candidate : [];
  };
  const coverages = listAt("coverage_limits", "coverages");
  const coverageSamples = coverages
    .slice(0, 12)
    .map(({ name, limit, deductible, formNumber }) => ({ name, limit, deductible, formNumber }));

  const summary = {
    extractedKeys: Array.from(memory.keys()).filter((key) => key !== "classify"),
    declarationFieldCount: listAt("declarations", "fields").length,
    coverageCount: coverages.length,
    coverageSamples,
    endorsementCount: listAt("endorsements", "endorsements").length,
    exclusionCount: listAt("exclusions", "exclusions").length,
    conditionCount: listAt("conditions", "conditions").length,
    sectionCount: listAt("sections", "sections").length
  };
  return JSON.stringify(summary, null, 2);
}
|
|
3761
|
+
/**
 * Renders page assignments as one line per extractor, e.g.
 * `declarations: pages 1, 2`.
 *
 * Extractors appear in the order they are first encountered and pages in the
 * order of the assignments.
 *
 * @param {Array<{ localPageNumber: number, extractorNames: string[] }>} pageAssignments
 *   Page assignments to summarise.
 * @returns {string} Newline-separated summary, or a fixed notice when no
 *   extractor was assigned to any page.
 */
function formatPageMapSummary(pageAssignments) {
  const pagesByExtractor = /* @__PURE__ */ new Map();
  for (const assignment of pageAssignments) {
    for (const extractorName of assignment.extractorNames) {
      const known = pagesByExtractor.get(extractorName);
      if (known === undefined) {
        pagesByExtractor.set(extractorName, [assignment.localPageNumber]);
      } else {
        known.push(assignment.localPageNumber);
      }
    }
  }
  if (pagesByExtractor.size === 0) {
    return "No page assignments available.";
  }
  const lines = [];
  for (const [extractorName, pages] of pagesByExtractor) {
    lines.push(`${extractorName}: pages ${pages.join(", ")}`);
  }
  return lines.join("\n");
}
|
|
3771
|
+
/**
 * Builds the four-line hint text describing the document to the model:
 * document type, expected sections, page hints and total page count.
 *
 * @param {string} primaryType - Primary policy type.
 * @param {string} documentType - Document type label.
 * @param {number} pageCount - Total number of pages.
 * @param {{ expectedSections: string[], pageHints: Object<string, *> }} template
 *   Template providing expected sections and per-section page hints.
 * @returns {string} Newline-separated hint text.
 */
function buildTemplateHints(primaryType, documentType, pageCount, template) {
  const expectedSections = template.expectedSections.join(", ");
  const pageHints = Object.entries(template.pageHints)
    .map(([section, hint]) => `${section}: ${hint}`)
    .join("; ");
  return `Document type: ${primaryType} ${documentType}\nExpected sections: ${expectedSections}\nPage hints: ${pageHints}\nTotal pages: ${pageCount}`;
}
|
|
3779
|
+
/**
 * Collapses page numbers into sorted, inclusive ranges of consecutive pages.
 *
 * Duplicates are ignored; e.g. `[3, 1, 2, 5]` becomes
 * `[{ startPage: 1, endPage: 3 }, { startPage: 5, endPage: 5 }]`.
 *
 * @param {number[]} pages - Page numbers in any order.
 * @returns {Array<{ startPage: number, endPage: number }>} Ascending ranges.
 */
function groupContiguousPages(pages) {
  if (pages.length === 0) {
    return [];
  }
  const ordered = Array.from(new Set(pages)).sort((left, right) => left - right);
  const ranges = [];
  for (const page of ordered) {
    const open = ranges[ranges.length - 1];
    if (open !== undefined && page === open.endPage + 1) {
      // Extends the range that is still open.
      open.endPage = page;
    } else {
      ranges.push({ startPage: page, endPage: page });
    }
  }
  return ranges;
}
|
|
3798
|
+
/**
 * Turns per-page extractor assignments into an extraction plan.
 *
 * - A page with no extractors is routed to "sections".
 * - Pages in `1..pageCount` that no assignment covers are also routed to
 *   "sections".
 * - Assignments whose page number is not an integer in `1..pageCount` are
 *   ignored, so the plan never contains tasks for pages that do not exist.
 * - Each extractor's pages are grouped into contiguous ranges, one task per
 *   range, sorted by start page and then extractor name.
 *
 * @param {Array<{ localPageNumber: number, extractorNames: string[] }>} pageAssignments
 *   Page assignments. NOTE(review): the caller visible in this file rewrites
 *   `localPageNumber` to a document-level page number before calling — confirm
 *   for any other callers.
 * @param {number} pageCount - Total number of pages in the document.
 * @returns {{
 *   tasks: Array<{ extractorName: string, startPage: number, endPage: number, description: string }>,
 *   pageMap: Array<{ section: string, pages: string }>
 * }} The plan and a per-extractor page listing.
 */
function buildPlanFromPageAssignments(pageAssignments, pageCount) {
  const extractorPages = /* @__PURE__ */ new Map();
  const coveredPages = /* @__PURE__ */ new Set();
  const addPage = (extractorName, page) => {
    const pages = extractorPages.get(extractorName);
    if (pages === undefined) {
      extractorPages.set(extractorName, [page]);
    } else {
      pages.push(page);
    }
  };

  for (const assignment of pageAssignments) {
    const page = assignment.localPageNumber;
    // Model output can name pages beyond the document; skip them instead of
    // planning extraction for pages that do not exist.
    if (!Number.isInteger(page) || page < 1 || page > pageCount) continue;
    const extractors = assignment.extractorNames.length > 0 ? assignment.extractorNames : ["sections"];
    for (const extractorName of extractors) {
      addPage(extractorName, page);
    }
    coveredPages.add(page);
  }

  // Any page nobody claimed is still preserved as a raw section.
  for (let page = 1; page <= pageCount; page += 1) {
    if (!coveredPages.has(page)) {
      addPage("sections", page);
    }
  }

  const tasks = [...extractorPages.entries()].flatMap(
    ([extractorName, pages]) => groupContiguousPages(pages).map(({ startPage, endPage }) => ({
      extractorName,
      startPage,
      endPage,
      description: `Page-mapped ${extractorName} extraction for pages ${startPage}-${endPage}`
    }))
  ).sort((a, b) => a.startPage - b.startPage || a.extractorName.localeCompare(b.extractorName));

  return {
    tasks,
    pageMap: [...extractorPages.entries()].map(([section, pages]) => ({
      section,
      pages: `pages ${[...new Set(pages)].sort((a, b) => a - b).join(", ")}`
    }))
  };
}
|
|
3574
3831
|
async function extract(pdfBase64, documentId, options) {
|
|
3575
3832
|
const id = documentId ?? `doc-${Date.now()}`;
|
|
3576
3833
|
const memory = /* @__PURE__ */ new Map();
|
|
3577
3834
|
totalUsage = { inputTokens: 0, outputTokens: 0 };
|
|
3835
|
+
modelCalls = 0;
|
|
3836
|
+
callsWithUsage = 0;
|
|
3837
|
+
callsMissingUsage = 0;
|
|
3578
3838
|
const pipelineCtx = createPipelineContext({
|
|
3579
3839
|
id,
|
|
3580
3840
|
onSave: onCheckpointSave,
|
|
@@ -3625,40 +3885,73 @@ function createExtractor(config) {
|
|
|
3625
3885
|
const primaryType = policyTypes[0] ?? "other";
|
|
3626
3886
|
const template = getTemplate(primaryType);
|
|
3627
3887
|
const pageCount = resumed?.pageCount ?? await getPdfPageCount(pdfBase64);
|
|
3888
|
+
const templateHints = buildTemplateHints(primaryType, documentType, pageCount, template);
|
|
3889
|
+
let pageAssignments;
|
|
3890
|
+
if (resumed?.pageAssignments && pipelineCtx.isPhaseComplete("page_map")) {
|
|
3891
|
+
pageAssignments = resumed.pageAssignments;
|
|
3892
|
+
onProgress?.("Resuming from checkpoint (page map complete)...");
|
|
3893
|
+
} else {
|
|
3894
|
+
onProgress?.(`Mapping document pages for ${primaryType} ${documentType}...`);
|
|
3895
|
+
const chunkSize = 8;
|
|
3896
|
+
const collectedAssignments = [];
|
|
3897
|
+
for (let startPage = 1; startPage <= pageCount; startPage += chunkSize) {
|
|
3898
|
+
const endPage = Math.min(pageCount, startPage + chunkSize - 1);
|
|
3899
|
+
const pagesPdf = await extractPageRange(pdfBase64, startPage, endPage);
|
|
3900
|
+
const mapResponse = await safeGenerateObject(
|
|
3901
|
+
generateObject,
|
|
3902
|
+
{
|
|
3903
|
+
prompt: buildPageMapPrompt(templateHints, startPage, endPage),
|
|
3904
|
+
schema: PageMapChunkSchema,
|
|
3905
|
+
maxTokens: 2048,
|
|
3906
|
+
providerOptions: { ...providerOptions, pdfBase64: pagesPdf }
|
|
3907
|
+
},
|
|
3908
|
+
{
|
|
3909
|
+
fallback: {
|
|
3910
|
+
pages: Array.from({ length: endPage - startPage + 1 }, (_, index) => ({
|
|
3911
|
+
localPageNumber: index + 1,
|
|
3912
|
+
extractorNames: index === 0 && startPage === 1 ? ["carrier_info", "named_insured", "declarations", "coverage_limits"] : ["sections"],
|
|
3913
|
+
confidence: 0,
|
|
3914
|
+
notes: "Fallback page assignment"
|
|
3915
|
+
}))
|
|
3916
|
+
},
|
|
3917
|
+
log,
|
|
3918
|
+
onError: (err, attempt) => log?.(`Page map attempt ${attempt + 1} failed for pages ${startPage}-${endPage}: ${err}`)
|
|
3919
|
+
}
|
|
3920
|
+
);
|
|
3921
|
+
trackUsage(mapResponse.usage);
|
|
3922
|
+
for (const assignment of mapResponse.object.pages) {
|
|
3923
|
+
collectedAssignments.push({
|
|
3924
|
+
...assignment,
|
|
3925
|
+
localPageNumber: startPage + assignment.localPageNumber - 1
|
|
3926
|
+
});
|
|
3927
|
+
}
|
|
3928
|
+
}
|
|
3929
|
+
pageAssignments = collectedAssignments.length > 0 ? collectedAssignments : Array.from({ length: pageCount }, (_, index) => ({
|
|
3930
|
+
localPageNumber: index + 1,
|
|
3931
|
+
extractorNames: index === 0 ? ["carrier_info", "named_insured", "declarations", "coverage_limits"] : ["sections"],
|
|
3932
|
+
confidence: 0,
|
|
3933
|
+
notes: "Full-document fallback page assignment"
|
|
3934
|
+
}));
|
|
3935
|
+
await pipelineCtx.save("page_map", {
|
|
3936
|
+
id,
|
|
3937
|
+
pageCount,
|
|
3938
|
+
classifyResult,
|
|
3939
|
+
pageAssignments,
|
|
3940
|
+
memory: Object.fromEntries(memory)
|
|
3941
|
+
});
|
|
3942
|
+
}
|
|
3628
3943
|
let plan;
|
|
3629
3944
|
if (resumed?.plan && pipelineCtx.isPhaseComplete("plan")) {
|
|
3630
3945
|
plan = resumed.plan;
|
|
3631
3946
|
onProgress?.("Resuming from checkpoint (plan complete)...");
|
|
3632
3947
|
} else {
|
|
3633
|
-
onProgress?.(`
|
|
3634
|
-
|
|
3635
|
-
`Document type: ${primaryType} ${documentType}`,
|
|
3636
|
-
`Expected sections: ${template.expectedSections.join(", ")}`,
|
|
3637
|
-
`Page hints: ${Object.entries(template.pageHints).map(([k, v]) => `${k}: ${v}`).join("; ")}`,
|
|
3638
|
-
`Total pages: ${pageCount}`
|
|
3639
|
-
].join("\n");
|
|
3640
|
-
const planResponse = await safeGenerateObject(
|
|
3641
|
-
generateObject,
|
|
3642
|
-
{
|
|
3643
|
-
prompt: buildPlanPrompt(templateHints),
|
|
3644
|
-
schema: ExtractionPlanSchema,
|
|
3645
|
-
maxTokens: 2048,
|
|
3646
|
-
providerOptions: { ...providerOptions, pdfBase64 }
|
|
3647
|
-
},
|
|
3648
|
-
{
|
|
3649
|
-
fallback: {
|
|
3650
|
-
tasks: [{ extractorName: "sections", startPage: 1, endPage: pageCount, description: "Full document fallback extraction" }]
|
|
3651
|
-
},
|
|
3652
|
-
log,
|
|
3653
|
-
onError: (err, attempt) => log?.(`Plan attempt ${attempt + 1} failed: ${err}`)
|
|
3654
|
-
}
|
|
3655
|
-
);
|
|
3656
|
-
trackUsage(planResponse.usage);
|
|
3657
|
-
plan = planResponse.object;
|
|
3948
|
+
onProgress?.(`Building extraction plan from page map for ${primaryType} ${documentType}...`);
|
|
3949
|
+
plan = buildPlanFromPageAssignments(pageAssignments, pageCount);
|
|
3658
3950
|
await pipelineCtx.save("plan", {
|
|
3659
3951
|
id,
|
|
3660
3952
|
pageCount,
|
|
3661
3953
|
classifyResult,
|
|
3954
|
+
pageAssignments,
|
|
3662
3955
|
plan,
|
|
3663
3956
|
memory: Object.fromEntries(memory)
|
|
3664
3957
|
});
|
|
@@ -3699,13 +3992,14 @@ function createExtractor(config) {
|
|
|
3699
3992
|
);
|
|
3700
3993
|
for (const result of extractorResults) {
|
|
3701
3994
|
if (result) {
|
|
3702
|
-
|
|
3995
|
+
mergeMemoryResult(result.name, result.data, memory);
|
|
3703
3996
|
}
|
|
3704
3997
|
}
|
|
3705
3998
|
await pipelineCtx.save("extract", {
|
|
3706
3999
|
id,
|
|
3707
4000
|
pageCount,
|
|
3708
4001
|
classifyResult,
|
|
4002
|
+
pageAssignments,
|
|
3709
4003
|
plan,
|
|
3710
4004
|
memory: Object.fromEntries(memory)
|
|
3711
4005
|
});
|
|
@@ -3713,21 +4007,26 @@ function createExtractor(config) {
|
|
|
3713
4007
|
if (!pipelineCtx.isPhaseComplete("review")) {
|
|
3714
4008
|
for (let round = 0; round < maxReviewRounds; round++) {
|
|
3715
4009
|
const extractedKeys = [...memory.keys()].filter((k) => k !== "classify");
|
|
4010
|
+
const extractionSummary = summarizeExtraction(memory);
|
|
4011
|
+
const pageMapSummary = formatPageMapSummary(pageAssignments);
|
|
3716
4012
|
const reviewResponse = await safeGenerateObject(
|
|
3717
4013
|
generateObject,
|
|
3718
4014
|
{
|
|
3719
|
-
prompt: buildReviewPrompt(template.required, extractedKeys),
|
|
4015
|
+
prompt: buildReviewPrompt(template.required, extractedKeys, extractionSummary, pageMapSummary),
|
|
3720
4016
|
schema: ReviewResultSchema,
|
|
3721
|
-
maxTokens:
|
|
3722
|
-
providerOptions
|
|
4017
|
+
maxTokens: 1536,
|
|
4018
|
+
providerOptions: { ...providerOptions, pdfBase64 }
|
|
3723
4019
|
},
|
|
3724
4020
|
{
|
|
3725
|
-
fallback: { complete: true, missingFields: [], additionalTasks: [] },
|
|
4021
|
+
fallback: { complete: true, missingFields: [], qualityIssues: [], additionalTasks: [] },
|
|
3726
4022
|
log,
|
|
3727
4023
|
onError: (err, attempt) => log?.(`Review round ${round + 1} attempt ${attempt + 1} failed: ${err}`)
|
|
3728
4024
|
}
|
|
3729
4025
|
);
|
|
3730
4026
|
trackUsage(reviewResponse.usage);
|
|
4027
|
+
if (reviewResponse.object.qualityIssues?.length) {
|
|
4028
|
+
await log?.(`Review round ${round + 1} quality issues: ${reviewResponse.object.qualityIssues.join("; ")}`);
|
|
4029
|
+
}
|
|
3731
4030
|
if (reviewResponse.object.complete || reviewResponse.object.additionalTasks.length === 0) {
|
|
3732
4031
|
onProgress?.("Extraction complete.");
|
|
3733
4032
|
break;
|
|
@@ -3762,7 +4061,7 @@ function createExtractor(config) {
|
|
|
3762
4061
|
);
|
|
3763
4062
|
for (const result of followUpResults) {
|
|
3764
4063
|
if (result) {
|
|
3765
|
-
|
|
4064
|
+
mergeMemoryResult(result.name, result.data, memory);
|
|
3766
4065
|
}
|
|
3767
4066
|
}
|
|
3768
4067
|
}
|
|
@@ -3770,6 +4069,7 @@ function createExtractor(config) {
|
|
|
3770
4069
|
id,
|
|
3771
4070
|
pageCount,
|
|
3772
4071
|
classifyResult,
|
|
4072
|
+
pageAssignments,
|
|
3773
4073
|
plan,
|
|
3774
4074
|
memory: Object.fromEntries(memory)
|
|
3775
4075
|
});
|
|
@@ -3780,6 +4080,7 @@ function createExtractor(config) {
|
|
|
3780
4080
|
id,
|
|
3781
4081
|
pageCount,
|
|
3782
4082
|
classifyResult,
|
|
4083
|
+
pageAssignments,
|
|
3783
4084
|
plan,
|
|
3784
4085
|
memory: Object.fromEntries(memory),
|
|
3785
4086
|
document
|
|
@@ -3793,10 +4094,19 @@ function createExtractor(config) {
|
|
|
3793
4094
|
trackUsage(formatResult.usage);
|
|
3794
4095
|
const chunks = chunkDocument(formatResult.document);
|
|
3795
4096
|
const finalCheckpoint = pipelineCtx.getCheckpoint();
|
|
4097
|
+
if (callsMissingUsage > 0) {
|
|
4098
|
+
await log?.(`Token usage was unavailable for ${callsMissingUsage}/${modelCalls} model calls. Check that your provider callbacks return usage.`);
|
|
4099
|
+
onProgress?.(`Token usage unavailable for ${callsMissingUsage}/${modelCalls} model calls.`);
|
|
4100
|
+
}
|
|
3796
4101
|
return {
|
|
3797
4102
|
document: formatResult.document,
|
|
3798
4103
|
chunks,
|
|
3799
4104
|
tokenUsage: totalUsage,
|
|
4105
|
+
usageReporting: {
|
|
4106
|
+
modelCalls,
|
|
4107
|
+
callsWithUsage,
|
|
4108
|
+
callsMissingUsage
|
|
4109
|
+
},
|
|
3800
4110
|
checkpoint: finalCheckpoint
|
|
3801
4111
|
};
|
|
3802
4112
|
}
|