@claritylabs/cl-sdk 0.5.0 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +33 -9
- package/dist/index.d.mts +382 -77
- package/dist/index.d.ts +382 -77
- package/dist/index.js +718 -205
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +715 -205
- package/dist/index.mjs.map +1 -1
- package/dist/storage-sqlite.d.mts +52 -10
- package/dist/storage-sqlite.d.ts +52 -10
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -54,6 +54,7 @@ __export(index_exports, {
|
|
|
54
54
|
CommercialAutoDeclarationsSchema: () => CommercialAutoDeclarationsSchema,
|
|
55
55
|
CommercialPropertyDeclarationsSchema: () => CommercialPropertyDeclarationsSchema,
|
|
56
56
|
CommunicationIntentSchema: () => CommunicationIntentSchema,
|
|
57
|
+
ConditionKeyValueSchema: () => ConditionKeyValueSchema,
|
|
57
58
|
ConditionTypeSchema: () => ConditionTypeSchema,
|
|
58
59
|
ConstructionTypeSchema: () => ConstructionTypeSchema,
|
|
59
60
|
ContactSchema: () => ContactSchema,
|
|
@@ -220,6 +221,7 @@ __export(index_exports, {
|
|
|
220
221
|
chunkDocument: () => chunkDocument,
|
|
221
222
|
createApplicationPipeline: () => createApplicationPipeline,
|
|
222
223
|
createExtractor: () => createExtractor,
|
|
224
|
+
createPipelineContext: () => createPipelineContext,
|
|
223
225
|
createQueryAgent: () => createQueryAgent,
|
|
224
226
|
extractPageRange: () => extractPageRange,
|
|
225
227
|
fillAcroForm: () => fillAcroForm,
|
|
@@ -229,6 +231,7 @@ __export(index_exports, {
|
|
|
229
231
|
getTemplate: () => getTemplate,
|
|
230
232
|
overlayTextOnPdf: () => overlayTextOnPdf,
|
|
231
233
|
pLimit: () => pLimit,
|
|
234
|
+
safeGenerateObject: () => safeGenerateObject,
|
|
232
235
|
sanitizeNulls: () => sanitizeNulls,
|
|
233
236
|
stripFences: () => stripFences,
|
|
234
237
|
withRetry: () => withRetry
|
|
@@ -308,6 +311,69 @@ function sanitizeNulls(obj) {
|
|
|
308
311
|
return obj;
|
|
309
312
|
}
|
|
310
313
|
|
|
314
|
+
// src/core/safe-generate.ts
|
|
315
|
+
/**
 * Runs `generateObject(params)` through `withRetry`, repeats the whole call
 * when it still fails, and can degrade to a caller-supplied fallback instead
 * of throwing.
 *
 * @param {Function} generateObject - Invoked as `generateObject(params)` on every attempt.
 * @param {*} params - Passed through unchanged to `generateObject`.
 * @param {object} [options]
 * @param {number} [options.maxRetries=1] - Extra attempts after the first one.
 *   Negative or NaN values are treated as 0 so at least one attempt is made.
 * @param {*} [options.fallback] - When not `undefined`, returned as
 *   `{ object: fallback }` once every attempt has failed. The fallback result
 *   carries no other members.
 * @param {Function} [options.log] - Awaited with a message after each failed
 *   attempt and before the fallback is returned; also forwarded to `withRetry`
 *   as its second argument.
 * @param {Function} [options.onError] - Awaited with `(error, attempt)` after
 *   each failed attempt. A failure inside the hook is reported through `log`
 *   and otherwise ignored.
 * @returns {Promise<*>} Whatever `withRetry` resolves to, or `{ object: options.fallback }`.
 * @throws The error of the last failed attempt when no fallback is configured.
 */
async function safeGenerateObject(generateObject, params, options) {
  const requestedRetries = options?.maxRetries ?? 1;
  // A negative or NaN count would skip the loop entirely and end in `throw undefined`.
  const maxRetries = requestedRetries >= 0 ? requestedRetries : 0;
  let lastError;
  for (let attempt = 0; attempt <= maxRetries; attempt++) {
    try {
      return await withRetry(
        () => generateObject(params),
        options?.log
      );
    } catch (error) {
      lastError = error;
      try {
        // Hooks may be async (e.g. they return the logger's promise), so await
        // them rather than leaving a floating promise behind.
        await options?.onError?.(error, attempt);
      } catch (hookError) {
        // A broken hook must not replace the real error or defeat the fallback.
        await options?.log?.(
          `safeGenerateObject onError hook failed: ${hookError instanceof Error ? hookError.message : String(hookError)}`
        );
      }
      await options?.log?.(
        `safeGenerateObject attempt ${attempt + 1}/${maxRetries + 1} failed: ${error instanceof Error ? error.message : String(error)}`
      );
      if (attempt < maxRetries) {
        // Fixed one-second pause between attempts.
        await new Promise((resolve) => setTimeout(resolve, 1e3));
      }
    }
  }
  if (options?.fallback !== void 0) {
    await options?.log?.(
      `safeGenerateObject: all retries exhausted, returning fallback`
    );
    return { object: options.fallback };
  }
  throw lastError;
}
|
|
344
|
+
|
|
345
|
+
// src/core/pipeline.ts
|
|
346
|
+
/**
 * Creates an in-memory checkpoint tracker for a multi-phase pipeline run.
 *
 * Every checkpoint records the list of phases completed so far
 * (`completedPhases`). Resuming from such a checkpoint restores all of them,
 * so earlier phases are not reported as incomplete and re-run. Checkpoints
 * without that list still mark only their own `phase` as complete.
 *
 * @param {object} opts
 * @param {*} opts.id - Exposed unchanged as `id` on the returned context.
 * @param {Function} [opts.onSave] - Awaited with each new checkpoint.
 * @param {object} [opts.resumeFrom] - Checkpoint to start from; becomes the
 *   initial value returned by `getCheckpoint()`.
 * @returns {{id: *, save: Function, getCheckpoint: Function, isPhaseComplete: Function, clear: Function}}
 */
function createPipelineContext(opts) {
  let latest = opts.resumeFrom;
  const completedPhases = /* @__PURE__ */ new Set();
  if (opts.resumeFrom) {
    // Restore every phase recorded in the checkpoint, not just the last one.
    if (Array.isArray(opts.resumeFrom.completedPhases)) {
      for (const phase of opts.resumeFrom.completedPhases) {
        completedPhases.add(phase);
      }
    }
    completedPhases.add(opts.resumeFrom.phase);
  }
  return {
    id: opts.id,
    /**
     * Records `phase` as complete, stores the checkpoint as the latest one and
     * then awaits `opts.onSave` with it.
     */
    async save(phase, state) {
      completedPhases.add(phase);
      const checkpoint = {
        phase,
        state,
        timestamp: Date.now(),
        // Snapshot in insertion order so a later resume can restore it.
        completedPhases: [...completedPhases]
      };
      latest = checkpoint;
      await opts.onSave?.(checkpoint);
    },
    /** Returns the most recent checkpoint, or `undefined` when there is none. */
    getCheckpoint() {
      return latest;
    },
    /** Reports whether `phase` was saved in this run or restored on resume. */
    isPhaseComplete(phase) {
      return completedPhases.has(phase);
    },
    /** Forgets the latest checkpoint and all completed phases (in memory only). */
    clear() {
      latest = void 0;
      completedPhases.clear();
    }
  };
}
|
|
376
|
+
|
|
311
377
|
// src/schemas/enums.ts
|
|
312
378
|
var import_zod = require("zod");
|
|
313
379
|
var PolicyTypeSchema = import_zod.z.enum([
|
|
@@ -708,11 +774,15 @@ var ExclusionSchema = import_zod5.z.object({
|
|
|
708
774
|
|
|
709
775
|
// src/schemas/condition.ts
|
|
710
776
|
var import_zod6 = require("zod");
|
|
777
|
+
// Schema for one key/value pair attached to a policy condition; both members are strings.
var ConditionKeyValueSchema = (() => {
  const { z } = import_zod6;
  return z.object({ key: z.string(), value: z.string() });
})();
|
|
711
781
|
var PolicyConditionSchema = import_zod6.z.object({
|
|
712
782
|
name: import_zod6.z.string(),
|
|
713
783
|
conditionType: ConditionTypeSchema,
|
|
714
784
|
content: import_zod6.z.string(),
|
|
715
|
-
keyValues: import_zod6.z.
|
|
785
|
+
keyValues: import_zod6.z.array(ConditionKeyValueSchema).optional(),
|
|
716
786
|
pageNumber: import_zod6.z.number().optional()
|
|
717
787
|
});
|
|
718
788
|
|
|
@@ -1700,6 +1770,218 @@ function assembleDocument(documentId, documentType, memory) {
|
|
|
1700
1770
|
};
|
|
1701
1771
|
}
|
|
1702
1772
|
|
|
1773
|
+
// src/prompts/coordinator/format.ts
|
|
1774
|
+
/**
 * Builds the markdown clean-up prompt for a batch of content entries.
 *
 * Each entry is rendered as a `===ENTRY <id>===` marker line followed by its
 * text; entries are separated by a blank line and appended after the fixed
 * instructions. The instructions ask the model to answer using the same
 * marker format.
 *
 * The "before" examples for indented sub-items and space-aligned tables carry
 * real indentation / column padding so they show the input shape their
 * headings describe.
 *
 * @param {Array<{id: number, text: string}>} entries - Entries to format.
 * @returns {string} The complete prompt.
 */
function buildFormatPrompt(entries) {
  const block = entries.map((e) => `===ENTRY ${e.id}===
${e.text}`).join("\n\n");
  return `You are a markdown formatting specialist for insurance document content. You will receive numbered content entries extracted from insurance policies, quotes, and endorsements. Your job is to clean up the formatting so every entry renders correctly as standard markdown.

## Primary issues to fix

### 1. Pipe-delimited data missing table syntax
The most common issue. Content uses pipe characters as column separators but is missing the separator row required for markdown table rendering.

Before (broken \u2014 won't render as a table):
COVERAGE | FORM # | LIMIT | DEDUCTIBLE
Employee Theft | | $10,000 | $1,000

After (valid markdown table):
| COVERAGE | FORM # | LIMIT | DEDUCTIBLE |
| --- | --- | --- | --- |
| Employee Theft | | $10,000 | $1,000 |

Rules for pipe tables:
- Add leading and trailing pipes to every row
- Add the separator row (| --- | --- |) after the header row
- Every row must have the same number of pipe-separated columns as the header
- Empty cells are fine \u2014 just keep the pipes: | | $10,000 |

### 2. Sub-items indented within pipe tables
Insurance schedules often have indented sub-items that belong to the previous coverage line. These break table column counts.

Before (broken):
COVERAGE | LIMIT | DEDUCTIBLE
Causes Of Loss - Equipment Breakdown | PR650END
  Described Premises Limit | | $350,804 |
  Diagnostic Equipment | | $100,000 |
  Deductible Type - Business Income: Waiting Period - Hours
  Waiting Period (Hours): 24

After: Pull sub-items out of the table. End the table before the sub-items, show them as an indented list, then start a new table if tabular data resumes:
| COVERAGE | LIMIT | DEDUCTIBLE |
| --- | --- | --- |
| Causes Of Loss - Equipment Breakdown | PR650END | |

  - Described Premises Limit: $350,804
  - Diagnostic Equipment: $100,000
  - Deductible Type - Business Income: Waiting Period - Hours
  - Waiting Period (Hours): 24

### 3. Space-aligned tables
Declarations often align columns with spaces instead of pipes. These render as plain monospace text and lose structure.

Before:
Coverage                          Limit of Liability    Retention
A. Network Security Liability     $500,000              $10,000
B. Privacy Liability              $500,000              $10,000

After (convert to proper markdown table):
| Coverage | Limit of Liability | Retention |
| --- | --- | --- |
| A. Network Security Liability | $500,000 | $10,000 |
| B. Privacy Liability | $500,000 | $10,000 |

### 4. Mixed table/prose content
A single entry often contains prose paragraphs followed by tabular data followed by more prose. Handle each segment independently \u2014 don't try to force everything into one table.

### 5. General markdown cleanup
- **Line spacing**: Remove excessive blank lines (3+ consecutive newlines \u2192 2). Ensure one blank line before and after tables and headings.
- **Trailing whitespace**: Remove trailing spaces on all lines.
- **Broken lists**: Ensure list items use consistent markers (-, *, or 1.) with proper nesting indentation.
- **Orphaned formatting**: Close any unclosed bold (**), italic (*), or code (\`) markers.
- **Heading levels**: Ensure heading markers (##) have a space after the hashes.

## Rules
- Do NOT change the meaning or substance of any content. Only fix formatting.
- Do NOT add new information, headers, or commentary.
- Do NOT wrap entries in code fences.
- Preserve all dollar amounts, dates, policy numbers, form numbers, and technical terms exactly as they appear.
- If an entry is already well-formatted, return it unchanged.
- When in doubt about whether something is a table, prefer table formatting for structured data with multiple columns.

Return your output in this exact format \u2014 one block per entry, in the same order:

===ENTRY 0===
(cleaned content for entry 0)

===ENTRY 1===
(cleaned content for entry 1)

...and so on for each entry.

Here are the entries to format:

${block}`;
}
|
|
1866
|
+
|
|
1867
|
+
// src/extraction/formatter.ts
|
|
1868
|
+
/**
 * Gathers the free-text fields of an assembled document that are long enough
 * to be worth reformatting.
 *
 * Visits, in order: `summary`, each section's `content` followed by its
 * subsections' `content`, then the `content` of every endorsement, exclusion
 * and condition. Only string values longer than 20 characters are collected;
 * missing lists, null elements and non-string content are skipped.
 *
 * @param {object} doc - The document to scan; it is not modified.
 * @returns {Array<{id: number, path: string, text: string}>} Entries with
 *   sequential ids starting at 0 and a path such as `sections[2].content`.
 */
function collectContentFields(doc) {
  const entries = [];
  const listOf = (value) => Array.isArray(value) ? value : [];
  const add = (path, text) => {
    // Require a real string: arrays and other objects also expose `length`.
    if (typeof text === "string" && text.length > 20) {
      entries.push({ id: entries.length, path, text });
    }
  };
  add("summary", doc.summary);
  listOf(doc.sections).forEach((section, i) => {
    add(`sections[${i}].content`, section?.content);
    listOf(section?.subsections).forEach((subsection, j) => {
      add(`sections[${i}].subsections[${j}].content`, subsection?.content);
    });
  });
  // These three lists share the same `<list>[i].content` shape.
  for (const field of ["endorsements", "exclusions", "conditions"]) {
    listOf(doc[field]).forEach((item, i) => {
      add(`${field}[${i}].content`, item?.content);
    });
  }
  return entries;
}
|
|
1905
|
+
/**
 * Splits a model response into per-entry content.
 *
 * Every `===ENTRY <n>===` marker starts an entry; its content is the trimmed
 * text up to the next marker (or the end of the response). Text before the
 * first marker is discarded, and a repeated id keeps the last content.
 *
 * @param {string} response - Raw model output.
 * @returns {Map<number, string>} Entry id mapped to its trimmed content.
 */
function parseFormatResponse(response) {
  const contentById = /* @__PURE__ */ new Map();
  const markers = [...response.matchAll(/===ENTRY (\d+)===/g)];
  markers.forEach((marker, position) => {
    const bodyStart = marker.index + marker[0].length;
    const following = markers[position + 1];
    const bodyEnd = following ? following.index : response.length;
    contentById.set(
      Number.parseInt(marker[1], 10),
      response.slice(bodyStart, bodyEnd).trim()
    );
  });
  return contentById;
}
|
|
1917
|
+
/**
 * Writes formatted text back into `doc` at the location named by each entry's
 * path. The document is modified in place.
 *
 * Supported path shapes are `field`, `list[i].field` and
 * `list[i].sublist[j].field`. Entries with no (or empty) formatted text, an
 * unrecognised path, or a target that does not exist are left untouched.
 *
 * @param {object} doc - Document to update.
 * @param {Array<{id: number, path: string}>} entries - Entries naming the targets.
 * @param {Map<number, string>} formatted - Formatted text keyed by entry id.
 * @returns {void}
 */
function applyFormattedContent(doc, entries, formatted) {
  const pathShape = /^(\w+)(?:\[(\d+)\])?(?:\.(\w+)(?:\[(\d+)\])?(?:\.(\w+))?)?$/;
  for (const { id, path } of entries) {
    const replacement = formatted.get(id);
    if (!replacement) continue;
    const parsed = path.match(pathShape);
    if (!parsed) continue;
    const [root, rootIndex, child, childIndex, leaf] = parsed.slice(1);
    if (!child) {
      // Plain top-level field, e.g. "summary".
      doc[root] = replacement;
      continue;
    }
    const parent = doc[root]?.[Number(rootIndex)];
    if (!parent) continue;
    if (!leaf) {
      // One level deep, e.g. "sections[0].content".
      parent[child] = replacement;
      continue;
    }
    // Two levels deep, e.g. "sections[0].subsections[1].content".
    const target = parent[child]?.[Number(childIndex)];
    if (target) {
      target[leaf] = replacement;
    }
  }
}
|
|
1942
|
+
var MAX_ENTRIES_PER_BATCH = 20;
|
|
1943
|
+
/**
 * Sends the document's long text fields to the model for markdown clean-up
 * and writes the formatted text back into `doc`.
 *
 * Fields are collected with `collectContentFields`, processed one batch of
 * `MAX_ENTRIES_PER_BATCH` entries at a time, and applied with
 * `applyFormattedContent`. A batch that fails is logged and its entries keep
 * their original content; other batches are still processed.
 *
 * @param {object} doc - Document to format; modified in place.
 * @param {Function} generateText - Called with `{ prompt, maxTokens, providerOptions }`;
 *   expected to resolve to `{ text, usage? }`.
 * @param {object} [options]
 * @param {*} [options.providerOptions] - Forwarded to `generateText`.
 * @param {Function} [options.onProgress] - Called once with a progress message.
 * @param {Function} [options.log] - Awaited with diagnostic messages.
 * @returns {Promise<{document: object, usage: {inputTokens: number, outputTokens: number}}>}
 *   The same `doc` object and the token usage summed over all batches.
 */
async function formatDocumentContent(doc, generateText, options) {
  const entries = collectContentFields(doc);
  const totalUsage = { inputTokens: 0, outputTokens: 0 };
  if (entries.length === 0) {
    return { document: doc, usage: totalUsage };
  }
  options?.onProgress?.(`Formatting ${entries.length} content fields...`);
  const batches = [];
  for (let i = 0; i < entries.length; i += MAX_ENTRIES_PER_BATCH) {
    batches.push(entries.slice(i, i + MAX_ENTRIES_PER_BATCH));
  }
  for (let batchIdx = 0; batchIdx < batches.length; batchIdx++) {
    const batch = batches[batchIdx];
    try {
      const prompt = buildFormatPrompt(batch.map((e) => ({ id: e.id, text: e.text })));
      const result = await withRetry(
        () => generateText({
          prompt,
          maxTokens: 16384,
          providerOptions: options?.providerOptions
        })
      );
      // A usage object may omit a counter; adding `undefined` would turn the total into NaN.
      totalUsage.inputTokens += result.usage?.inputTokens ?? 0;
      totalUsage.outputTokens += result.usage?.outputTokens ?? 0;
      const formatted = parseFormatResponse(result.text);
      // Count only ids that belong to this batch so stray ids cannot hide a gap.
      const returned = batch.filter((e) => formatted.has(e.id)).length;
      if (returned < batch.length) {
        await options?.log?.(
          `Format batch ${batchIdx + 1}/${batches.length}: model returned ${returned}/${batch.length} entries \u2014 unformatted entries will keep original content`
        );
      }
      applyFormattedContent(doc, batch, formatted);
    } catch (error) {
      // Best effort: a failed batch leaves its entries untouched.
      await options?.log?.(
        `Format batch ${batchIdx + 1}/${batches.length} failed, keeping original content: ${error instanceof Error ? error.message : String(error)}`
      );
    }
  }
  return { document: doc, usage: totalUsage };
}
|
|
1984
|
+
|
|
1703
1985
|
// src/extraction/chunking.ts
|
|
1704
1986
|
function chunkDocument(doc) {
|
|
1705
1987
|
const chunks = [];
|
|
@@ -2536,9 +2818,13 @@ var ExtractionTaskSchema = import_zod18.z.object({
|
|
|
2536
2818
|
endPage: import_zod18.z.number(),
|
|
2537
2819
|
description: import_zod18.z.string()
|
|
2538
2820
|
});
|
|
2821
|
+
// Schema for one page-map row of an extraction plan: a section name and its pages, both strings.
var PageMapEntrySchema = (() => {
  const { z } = import_zod18;
  return z.object({ section: z.string(), pages: z.string() });
})();
|
|
2539
2825
|
var ExtractionPlanSchema = import_zod18.z.object({
|
|
2540
2826
|
tasks: import_zod18.z.array(ExtractionTaskSchema),
|
|
2541
|
-
pageMap: import_zod18.z.
|
|
2827
|
+
pageMap: import_zod18.z.array(PageMapEntrySchema).optional()
|
|
2542
2828
|
});
|
|
2543
2829
|
function buildPlanPrompt(templateHints) {
|
|
2544
2830
|
return `You are planning the extraction of an insurance document. You have already classified this document. Now scan the full document and create a page map + extraction plan.
|
|
@@ -2567,7 +2853,10 @@ Return JSON:
|
|
|
2567
2853
|
{ "extractorName": "carrier_info", "startPage": 1, "endPage": 2, "description": "Extract carrier details from declarations page" },
|
|
2568
2854
|
...
|
|
2569
2855
|
],
|
|
2570
|
-
"pageMap":
|
|
2856
|
+
"pageMap": [
|
|
2857
|
+
{ "section": "declarations", "pages": "pages 1-3" },
|
|
2858
|
+
{ "section": "endorsements", "pages": "pages 15-22" }
|
|
2859
|
+
]
|
|
2571
2860
|
}
|
|
2572
2861
|
|
|
2573
2862
|
Create tasks that cover the entire document. Prefer specific extractors over generic "sections" where possible. Keep page ranges tight \u2014 only include pages relevant to each extractor.
|
|
@@ -3060,7 +3349,8 @@ function createExtractor(config) {
|
|
|
3060
3349
|
onTokenUsage,
|
|
3061
3350
|
onProgress,
|
|
3062
3351
|
log,
|
|
3063
|
-
providerOptions
|
|
3352
|
+
providerOptions,
|
|
3353
|
+
onCheckpointSave
|
|
3064
3354
|
} = config;
|
|
3065
3355
|
const limit = pLimit(concurrency);
|
|
3066
3356
|
let totalUsage = { inputTokens: 0, outputTokens: 0 };
|
|
@@ -3071,100 +3361,106 @@ function createExtractor(config) {
|
|
|
3071
3361
|
onTokenUsage?.(usage);
|
|
3072
3362
|
}
|
|
3073
3363
|
}
|
|
3074
|
-
async function extract(pdfBase64, documentId) {
|
|
3364
|
+
async function extract(pdfBase64, documentId, options) {
|
|
3075
3365
|
const id = documentId ?? `doc-${Date.now()}`;
|
|
3076
3366
|
const memory = /* @__PURE__ */ new Map();
|
|
3077
3367
|
totalUsage = { inputTokens: 0, outputTokens: 0 };
|
|
3078
|
-
|
|
3079
|
-
|
|
3080
|
-
|
|
3081
|
-
|
|
3082
|
-
|
|
3083
|
-
|
|
3084
|
-
|
|
3085
|
-
|
|
3086
|
-
|
|
3087
|
-
);
|
|
3088
|
-
trackUsage(classifyResult.usage);
|
|
3089
|
-
memory.set("classify", classifyResult.object);
|
|
3090
|
-
const { documentType, policyTypes } = classifyResult.object;
|
|
3091
|
-
const primaryType = policyTypes[0] ?? "other";
|
|
3092
|
-
const template = getTemplate(primaryType);
|
|
3093
|
-
onProgress?.(`Planning extraction for ${primaryType} ${documentType}...`);
|
|
3094
|
-
const templateHints = [
|
|
3095
|
-
`Document type: ${primaryType} ${documentType}`,
|
|
3096
|
-
`Expected sections: ${template.expectedSections.join(", ")}`,
|
|
3097
|
-
`Page hints: ${Object.entries(template.pageHints).map(([k, v]) => `${k}: ${v}`).join("; ")}`,
|
|
3098
|
-
`Total pages: ${pageCount}`
|
|
3099
|
-
].join("\n");
|
|
3100
|
-
const planResult = await withRetry(
|
|
3101
|
-
() => generateObject({
|
|
3102
|
-
prompt: buildPlanPrompt(templateHints),
|
|
3103
|
-
schema: ExtractionPlanSchema,
|
|
3104
|
-
maxTokens: 2048,
|
|
3105
|
-
providerOptions
|
|
3106
|
-
})
|
|
3107
|
-
);
|
|
3108
|
-
trackUsage(planResult.usage);
|
|
3109
|
-
const tasks = planResult.object.tasks;
|
|
3110
|
-
onProgress?.(`Dispatching ${tasks.length} extractors...`);
|
|
3111
|
-
const extractorResults = await Promise.all(
|
|
3112
|
-
tasks.map(
|
|
3113
|
-
(task) => limit(async () => {
|
|
3114
|
-
const ext = getExtractor(task.extractorName);
|
|
3115
|
-
if (!ext) {
|
|
3116
|
-
await log?.(`Unknown extractor: ${task.extractorName}, skipping`);
|
|
3117
|
-
return null;
|
|
3118
|
-
}
|
|
3119
|
-
onProgress?.(`Extracting ${task.extractorName} (pages ${task.startPage}-${task.endPage})...`);
|
|
3120
|
-
try {
|
|
3121
|
-
const result = await runExtractor({
|
|
3122
|
-
name: task.extractorName,
|
|
3123
|
-
prompt: ext.buildPrompt(),
|
|
3124
|
-
schema: ext.schema,
|
|
3125
|
-
pdfBase64,
|
|
3126
|
-
startPage: task.startPage,
|
|
3127
|
-
endPage: task.endPage,
|
|
3128
|
-
generateObject,
|
|
3129
|
-
convertPdfToImages,
|
|
3130
|
-
maxTokens: ext.maxTokens ?? 4096,
|
|
3131
|
-
providerOptions
|
|
3132
|
-
});
|
|
3133
|
-
trackUsage(result.usage);
|
|
3134
|
-
return result;
|
|
3135
|
-
} catch (error) {
|
|
3136
|
-
await log?.(`Extractor ${task.extractorName} failed: ${error}`);
|
|
3137
|
-
return null;
|
|
3138
|
-
}
|
|
3139
|
-
})
|
|
3140
|
-
)
|
|
3141
|
-
);
|
|
3142
|
-
for (const result of extractorResults) {
|
|
3143
|
-
if (result) {
|
|
3144
|
-
memory.set(result.name, result.data);
|
|
3368
|
+
const pipelineCtx = createPipelineContext({
|
|
3369
|
+
id,
|
|
3370
|
+
onSave: onCheckpointSave,
|
|
3371
|
+
resumeFrom: options?.resumeFrom
|
|
3372
|
+
});
|
|
3373
|
+
const resumed = pipelineCtx.getCheckpoint()?.state;
|
|
3374
|
+
if (resumed?.memory) {
|
|
3375
|
+
for (const [k, v] of Object.entries(resumed.memory)) {
|
|
3376
|
+
memory.set(k, v);
|
|
3145
3377
|
}
|
|
3146
3378
|
}
|
|
3147
|
-
|
|
3148
|
-
|
|
3149
|
-
|
|
3150
|
-
|
|
3151
|
-
|
|
3152
|
-
|
|
3153
|
-
|
|
3379
|
+
let classifyResult;
|
|
3380
|
+
if (resumed?.classifyResult && pipelineCtx.isPhaseComplete("classify")) {
|
|
3381
|
+
classifyResult = resumed.classifyResult;
|
|
3382
|
+
onProgress?.("Resuming from checkpoint (classify complete)...");
|
|
3383
|
+
} else {
|
|
3384
|
+
onProgress?.("Classifying document...");
|
|
3385
|
+
const pageCount2 = await getPdfPageCount(pdfBase64);
|
|
3386
|
+
const classifyResponse = await safeGenerateObject(
|
|
3387
|
+
generateObject,
|
|
3388
|
+
{
|
|
3389
|
+
prompt: buildClassifyPrompt(),
|
|
3390
|
+
schema: ClassifyResultSchema,
|
|
3391
|
+
maxTokens: 512,
|
|
3154
3392
|
providerOptions
|
|
3155
|
-
}
|
|
3393
|
+
},
|
|
3394
|
+
{
|
|
3395
|
+
fallback: { documentType: "policy", policyTypes: ["other"], confidence: 0 },
|
|
3396
|
+
log,
|
|
3397
|
+
onError: (err, attempt) => log?.(`Classify attempt ${attempt + 1} failed: ${err}`)
|
|
3398
|
+
}
|
|
3156
3399
|
);
|
|
3157
|
-
trackUsage(
|
|
3158
|
-
|
|
3159
|
-
|
|
3160
|
-
|
|
3161
|
-
|
|
3162
|
-
|
|
3163
|
-
|
|
3164
|
-
|
|
3400
|
+
trackUsage(classifyResponse.usage);
|
|
3401
|
+
classifyResult = classifyResponse.object;
|
|
3402
|
+
memory.set("classify", classifyResult);
|
|
3403
|
+
await pipelineCtx.save("classify", {
|
|
3404
|
+
id,
|
|
3405
|
+
pageCount: pageCount2,
|
|
3406
|
+
classifyResult,
|
|
3407
|
+
memory: Object.fromEntries(memory)
|
|
3408
|
+
});
|
|
3409
|
+
}
|
|
3410
|
+
const { documentType, policyTypes } = classifyResult;
|
|
3411
|
+
const primaryType = policyTypes[0] ?? "other";
|
|
3412
|
+
const template = getTemplate(primaryType);
|
|
3413
|
+
const pageCount = resumed?.pageCount ?? await getPdfPageCount(pdfBase64);
|
|
3414
|
+
let plan;
|
|
3415
|
+
if (resumed?.plan && pipelineCtx.isPhaseComplete("plan")) {
|
|
3416
|
+
plan = resumed.plan;
|
|
3417
|
+
onProgress?.("Resuming from checkpoint (plan complete)...");
|
|
3418
|
+
} else {
|
|
3419
|
+
onProgress?.(`Planning extraction for ${primaryType} ${documentType}...`);
|
|
3420
|
+
const templateHints = [
|
|
3421
|
+
`Document type: ${primaryType} ${documentType}`,
|
|
3422
|
+
`Expected sections: ${template.expectedSections.join(", ")}`,
|
|
3423
|
+
`Page hints: ${Object.entries(template.pageHints).map(([k, v]) => `${k}: ${v}`).join("; ")}`,
|
|
3424
|
+
`Total pages: ${pageCount}`
|
|
3425
|
+
].join("\n");
|
|
3426
|
+
const planResponse = await safeGenerateObject(
|
|
3427
|
+
generateObject,
|
|
3428
|
+
{
|
|
3429
|
+
prompt: buildPlanPrompt(templateHints),
|
|
3430
|
+
schema: ExtractionPlanSchema,
|
|
3431
|
+
maxTokens: 2048,
|
|
3432
|
+
providerOptions
|
|
3433
|
+
},
|
|
3434
|
+
{
|
|
3435
|
+
fallback: {
|
|
3436
|
+
tasks: [{ extractorName: "sections", startPage: 1, endPage: pageCount, description: "Full document fallback extraction" }]
|
|
3437
|
+
},
|
|
3438
|
+
log,
|
|
3439
|
+
onError: (err, attempt) => log?.(`Plan attempt ${attempt + 1} failed: ${err}`)
|
|
3440
|
+
}
|
|
3441
|
+
);
|
|
3442
|
+
trackUsage(planResponse.usage);
|
|
3443
|
+
plan = planResponse.object;
|
|
3444
|
+
await pipelineCtx.save("plan", {
|
|
3445
|
+
id,
|
|
3446
|
+
pageCount,
|
|
3447
|
+
classifyResult,
|
|
3448
|
+
plan,
|
|
3449
|
+
memory: Object.fromEntries(memory)
|
|
3450
|
+
});
|
|
3451
|
+
}
|
|
3452
|
+
if (!pipelineCtx.isPhaseComplete("extract")) {
|
|
3453
|
+
const tasks = plan.tasks;
|
|
3454
|
+
onProgress?.(`Dispatching ${tasks.length} extractors...`);
|
|
3455
|
+
const extractorResults = await Promise.all(
|
|
3456
|
+
tasks.map(
|
|
3165
3457
|
(task) => limit(async () => {
|
|
3166
3458
|
const ext = getExtractor(task.extractorName);
|
|
3167
|
-
if (!ext)
|
|
3459
|
+
if (!ext) {
|
|
3460
|
+
await log?.(`Unknown extractor: ${task.extractorName}, skipping`);
|
|
3461
|
+
return null;
|
|
3462
|
+
}
|
|
3463
|
+
onProgress?.(`Extracting ${task.extractorName} (pages ${task.startPage}-${task.endPage})...`);
|
|
3168
3464
|
try {
|
|
3169
3465
|
const result = await runExtractor({
|
|
3170
3466
|
name: task.extractorName,
|
|
@@ -3181,22 +3477,114 @@ function createExtractor(config) {
|
|
|
3181
3477
|
trackUsage(result.usage);
|
|
3182
3478
|
return result;
|
|
3183
3479
|
} catch (error) {
|
|
3184
|
-
await log?.(`
|
|
3480
|
+
await log?.(`Extractor ${task.extractorName} failed: ${error}`);
|
|
3185
3481
|
return null;
|
|
3186
3482
|
}
|
|
3187
3483
|
})
|
|
3188
3484
|
)
|
|
3189
3485
|
);
|
|
3190
|
-
for (const result of
|
|
3486
|
+
for (const result of extractorResults) {
|
|
3191
3487
|
if (result) {
|
|
3192
3488
|
memory.set(result.name, result.data);
|
|
3193
3489
|
}
|
|
3194
3490
|
}
|
|
3491
|
+
await pipelineCtx.save("extract", {
|
|
3492
|
+
id,
|
|
3493
|
+
pageCount,
|
|
3494
|
+
classifyResult,
|
|
3495
|
+
plan,
|
|
3496
|
+
memory: Object.fromEntries(memory)
|
|
3497
|
+
});
|
|
3498
|
+
}
|
|
3499
|
+
if (!pipelineCtx.isPhaseComplete("review")) {
|
|
3500
|
+
for (let round = 0; round < maxReviewRounds; round++) {
|
|
3501
|
+
const extractedKeys = [...memory.keys()].filter((k) => k !== "classify");
|
|
3502
|
+
const reviewResponse = await safeGenerateObject(
|
|
3503
|
+
generateObject,
|
|
3504
|
+
{
|
|
3505
|
+
prompt: buildReviewPrompt(template.required, extractedKeys),
|
|
3506
|
+
schema: ReviewResultSchema,
|
|
3507
|
+
maxTokens: 1024,
|
|
3508
|
+
providerOptions
|
|
3509
|
+
},
|
|
3510
|
+
{
|
|
3511
|
+
fallback: { complete: true, missingFields: [], additionalTasks: [] },
|
|
3512
|
+
log,
|
|
3513
|
+
onError: (err, attempt) => log?.(`Review round ${round + 1} attempt ${attempt + 1} failed: ${err}`)
|
|
3514
|
+
}
|
|
3515
|
+
);
|
|
3516
|
+
trackUsage(reviewResponse.usage);
|
|
3517
|
+
if (reviewResponse.object.complete || reviewResponse.object.additionalTasks.length === 0) {
|
|
3518
|
+
onProgress?.("Extraction complete.");
|
|
3519
|
+
break;
|
|
3520
|
+
}
|
|
3521
|
+
onProgress?.(`Review round ${round + 1}: dispatching ${reviewResponse.object.additionalTasks.length} follow-up extractors...`);
|
|
3522
|
+
const followUpResults = await Promise.all(
|
|
3523
|
+
reviewResponse.object.additionalTasks.map(
|
|
3524
|
+
(task) => limit(async () => {
|
|
3525
|
+
const ext = getExtractor(task.extractorName);
|
|
3526
|
+
if (!ext) return null;
|
|
3527
|
+
try {
|
|
3528
|
+
const result = await runExtractor({
|
|
3529
|
+
name: task.extractorName,
|
|
3530
|
+
prompt: ext.buildPrompt(),
|
|
3531
|
+
schema: ext.schema,
|
|
3532
|
+
pdfBase64,
|
|
3533
|
+
startPage: task.startPage,
|
|
3534
|
+
endPage: task.endPage,
|
|
3535
|
+
generateObject,
|
|
3536
|
+
convertPdfToImages,
|
|
3537
|
+
maxTokens: ext.maxTokens ?? 4096,
|
|
3538
|
+
providerOptions
|
|
3539
|
+
});
|
|
3540
|
+
trackUsage(result.usage);
|
|
3541
|
+
return result;
|
|
3542
|
+
} catch (error) {
|
|
3543
|
+
await log?.(`Follow-up extractor ${task.extractorName} failed: ${error}`);
|
|
3544
|
+
return null;
|
|
3545
|
+
}
|
|
3546
|
+
})
|
|
3547
|
+
)
|
|
3548
|
+
);
|
|
3549
|
+
for (const result of followUpResults) {
|
|
3550
|
+
if (result) {
|
|
3551
|
+
memory.set(result.name, result.data);
|
|
3552
|
+
}
|
|
3553
|
+
}
|
|
3554
|
+
}
|
|
3555
|
+
await pipelineCtx.save("review", {
|
|
3556
|
+
id,
|
|
3557
|
+
pageCount,
|
|
3558
|
+
classifyResult,
|
|
3559
|
+
plan,
|
|
3560
|
+
memory: Object.fromEntries(memory)
|
|
3561
|
+
});
|
|
3195
3562
|
}
|
|
3196
3563
|
onProgress?.("Assembling document...");
|
|
3197
3564
|
const document = assembleDocument(id, documentType, memory);
|
|
3198
|
-
|
|
3199
|
-
|
|
3565
|
+
await pipelineCtx.save("assemble", {
|
|
3566
|
+
id,
|
|
3567
|
+
pageCount,
|
|
3568
|
+
classifyResult,
|
|
3569
|
+
plan,
|
|
3570
|
+
memory: Object.fromEntries(memory),
|
|
3571
|
+
document
|
|
3572
|
+
});
|
|
3573
|
+
onProgress?.("Formatting extracted content...");
|
|
3574
|
+
const formatResult = await formatDocumentContent(document, generateText, {
|
|
3575
|
+
providerOptions,
|
|
3576
|
+
onProgress,
|
|
3577
|
+
log
|
|
3578
|
+
});
|
|
3579
|
+
trackUsage(formatResult.usage);
|
|
3580
|
+
const chunks = chunkDocument(formatResult.document);
|
|
3581
|
+
const finalCheckpoint = pipelineCtx.getCheckpoint();
|
|
3582
|
+
return {
|
|
3583
|
+
document: formatResult.document,
|
|
3584
|
+
chunks,
|
|
3585
|
+
tokenUsage: totalUsage,
|
|
3586
|
+
checkpoint: finalCheckpoint
|
|
3587
|
+
};
|
|
3200
3588
|
}
|
|
3201
3589
|
return { extract };
|
|
3202
3590
|
}
|
|
@@ -4059,7 +4447,6 @@ function createApplicationPipeline(config) {
|
|
|
4059
4447
|
let state = {
|
|
4060
4448
|
id,
|
|
4061
4449
|
pdfBase64: void 0,
|
|
4062
|
-
// Don't persist the full PDF in state
|
|
4063
4450
|
title: void 0,
|
|
4064
4451
|
applicationType: null,
|
|
4065
4452
|
fields: [],
|
|
@@ -4070,13 +4457,20 @@ function createApplicationPipeline(config) {
|
|
|
4070
4457
|
updatedAt: now
|
|
4071
4458
|
};
|
|
4072
4459
|
onProgress?.("Classifying document...");
|
|
4073
|
-
|
|
4074
|
-
|
|
4075
|
-
|
|
4076
|
-
|
|
4077
|
-
|
|
4078
|
-
|
|
4079
|
-
|
|
4460
|
+
await applicationStore?.save(state);
|
|
4461
|
+
let classifyResult;
|
|
4462
|
+
try {
|
|
4463
|
+
const { result, usage: classifyUsage } = await classifyApplication(
|
|
4464
|
+
pdfBase64.slice(0, 2e3),
|
|
4465
|
+
generateObject,
|
|
4466
|
+
providerOptions
|
|
4467
|
+
);
|
|
4468
|
+
trackUsage(classifyUsage);
|
|
4469
|
+
classifyResult = result;
|
|
4470
|
+
} catch (error) {
|
|
4471
|
+
await log?.(`Classification failed, treating as non-application: ${error instanceof Error ? error.message : String(error)}`);
|
|
4472
|
+
classifyResult = { isApplication: false, confidence: 0, applicationType: null };
|
|
4473
|
+
}
|
|
4080
4474
|
if (!classifyResult.isApplication) {
|
|
4081
4475
|
state.status = "complete";
|
|
4082
4476
|
state.updatedAt = Date.now();
|
|
@@ -4086,13 +4480,28 @@ function createApplicationPipeline(config) {
|
|
|
4086
4480
|
state.applicationType = classifyResult.applicationType;
|
|
4087
4481
|
state.status = "extracting";
|
|
4088
4482
|
state.updatedAt = Date.now();
|
|
4483
|
+
await applicationStore?.save(state);
|
|
4089
4484
|
onProgress?.("Extracting form fields...");
|
|
4090
|
-
|
|
4091
|
-
|
|
4092
|
-
|
|
4093
|
-
|
|
4094
|
-
|
|
4095
|
-
|
|
4485
|
+
let fields;
|
|
4486
|
+
try {
|
|
4487
|
+
const { fields: extractedFields, usage: extractUsage } = await extractFields(
|
|
4488
|
+
pdfBase64,
|
|
4489
|
+
generateObject,
|
|
4490
|
+
providerOptions
|
|
4491
|
+
);
|
|
4492
|
+
trackUsage(extractUsage);
|
|
4493
|
+
fields = extractedFields;
|
|
4494
|
+
} catch (error) {
|
|
4495
|
+
await log?.(`Field extraction failed: ${error instanceof Error ? error.message : String(error)}`);
|
|
4496
|
+
fields = [];
|
|
4497
|
+
}
|
|
4498
|
+
if (fields.length === 0) {
|
|
4499
|
+
await log?.("No fields extracted, completing pipeline with empty result");
|
|
4500
|
+
state.status = "complete";
|
|
4501
|
+
state.updatedAt = Date.now();
|
|
4502
|
+
await applicationStore?.save(state);
|
|
4503
|
+
return { state, tokenUsage: totalUsage };
|
|
4504
|
+
}
|
|
4096
4505
|
state.fields = fields;
|
|
4097
4506
|
state.title = classifyResult.applicationType ?? void 0;
|
|
4098
4507
|
state.status = "auto_filling";
|
|
@@ -4124,20 +4533,24 @@ function createApplicationPipeline(config) {
|
|
|
4124
4533
|
limit(async () => {
|
|
4125
4534
|
const unfilledFields2 = state.fields.filter((f) => !f.value);
|
|
4126
4535
|
if (unfilledFields2.length === 0) return;
|
|
4127
|
-
|
|
4128
|
-
|
|
4129
|
-
|
|
4130
|
-
|
|
4131
|
-
|
|
4132
|
-
|
|
4133
|
-
|
|
4134
|
-
|
|
4135
|
-
const
|
|
4136
|
-
|
|
4137
|
-
field
|
|
4138
|
-
|
|
4139
|
-
|
|
4536
|
+
try {
|
|
4537
|
+
const { result: autoFillResult, usage: afUsage } = await autoFillFromContext(
|
|
4538
|
+
unfilledFields2,
|
|
4539
|
+
orgContext,
|
|
4540
|
+
generateObject,
|
|
4541
|
+
providerOptions
|
|
4542
|
+
);
|
|
4543
|
+
trackUsage(afUsage);
|
|
4544
|
+
for (const match of autoFillResult.matches) {
|
|
4545
|
+
const field = state.fields.find((f) => f.id === match.fieldId);
|
|
4546
|
+
if (field && !field.value) {
|
|
4547
|
+
field.value = match.value;
|
|
4548
|
+
field.source = `auto-fill: ${match.contextKey}`;
|
|
4549
|
+
field.confidence = match.confidence;
|
|
4550
|
+
}
|
|
4140
4551
|
}
|
|
4552
|
+
} catch (e) {
|
|
4553
|
+
await log?.(`Auto-fill from context failed: ${e instanceof Error ? e.message : String(e)}`);
|
|
4141
4554
|
}
|
|
4142
4555
|
})
|
|
4143
4556
|
);
|
|
@@ -4170,13 +4583,18 @@ function createApplicationPipeline(config) {
|
|
|
4170
4583
|
if (unfilledFields.length > 0) {
|
|
4171
4584
|
onProgress?.(`Batching ${unfilledFields.length} remaining questions...`);
|
|
4172
4585
|
state.status = "batching";
|
|
4173
|
-
|
|
4174
|
-
|
|
4175
|
-
|
|
4176
|
-
|
|
4177
|
-
|
|
4178
|
-
|
|
4179
|
-
|
|
4586
|
+
try {
|
|
4587
|
+
const { result: batchResult, usage: batchUsage } = await batchQuestions(
|
|
4588
|
+
unfilledFields,
|
|
4589
|
+
generateObject,
|
|
4590
|
+
providerOptions
|
|
4591
|
+
);
|
|
4592
|
+
trackUsage(batchUsage);
|
|
4593
|
+
state.batches = batchResult.batches;
|
|
4594
|
+
} catch (error) {
|
|
4595
|
+
await log?.(`Batching failed, using single-batch fallback: ${error instanceof Error ? error.message : String(error)}`);
|
|
4596
|
+
state.batches = [unfilledFields.map((f) => f.id)];
|
|
4597
|
+
}
|
|
4180
4598
|
state.currentBatchIndex = 0;
|
|
4181
4599
|
state.status = "collecting";
|
|
4182
4600
|
} else {
|
|
@@ -4203,32 +4621,49 @@ function createApplicationPipeline(config) {
|
|
|
4203
4621
|
(f) => currentBatchFieldIds.includes(f.id)
|
|
4204
4622
|
);
|
|
4205
4623
|
onProgress?.("Classifying reply...");
|
|
4206
|
-
|
|
4207
|
-
|
|
4208
|
-
|
|
4209
|
-
generateObject,
|
|
4210
|
-
providerOptions
|
|
4211
|
-
);
|
|
4212
|
-
trackUsage(intentUsage);
|
|
4213
|
-
let fieldsFilled = 0;
|
|
4214
|
-
let responseText;
|
|
4215
|
-
if (intent.hasAnswers) {
|
|
4216
|
-
onProgress?.("Parsing answers...");
|
|
4217
|
-
const { result: parseResult, usage: parseUsage } = await parseAnswers(
|
|
4624
|
+
let intent;
|
|
4625
|
+
try {
|
|
4626
|
+
const { intent: classifiedIntent, usage: intentUsage } = await classifyReplyIntent(
|
|
4218
4627
|
currentBatchFields,
|
|
4219
4628
|
replyText,
|
|
4220
4629
|
generateObject,
|
|
4221
4630
|
providerOptions
|
|
4222
4631
|
);
|
|
4223
|
-
trackUsage(
|
|
4224
|
-
|
|
4225
|
-
|
|
4226
|
-
|
|
4227
|
-
|
|
4228
|
-
|
|
4229
|
-
|
|
4230
|
-
|
|
4632
|
+
trackUsage(intentUsage);
|
|
4633
|
+
intent = classifiedIntent;
|
|
4634
|
+
} catch (error) {
|
|
4635
|
+
await log?.(`Reply intent classification failed, defaulting to answers_only: ${error instanceof Error ? error.message : String(error)}`);
|
|
4636
|
+
intent = {
|
|
4637
|
+
primaryIntent: "answers_only",
|
|
4638
|
+
hasAnswers: true,
|
|
4639
|
+
questionText: void 0,
|
|
4640
|
+
questionFieldIds: void 0,
|
|
4641
|
+
lookupRequests: void 0
|
|
4642
|
+
};
|
|
4643
|
+
}
|
|
4644
|
+
let fieldsFilled = 0;
|
|
4645
|
+
let responseText;
|
|
4646
|
+
if (intent.hasAnswers) {
|
|
4647
|
+
onProgress?.("Parsing answers...");
|
|
4648
|
+
try {
|
|
4649
|
+
const { result: parseResult, usage: parseUsage } = await parseAnswers(
|
|
4650
|
+
currentBatchFields,
|
|
4651
|
+
replyText,
|
|
4652
|
+
generateObject,
|
|
4653
|
+
providerOptions
|
|
4654
|
+
);
|
|
4655
|
+
trackUsage(parseUsage);
|
|
4656
|
+
for (const answer of parseResult.answers) {
|
|
4657
|
+
const field = state.fields.find((f) => f.id === answer.fieldId);
|
|
4658
|
+
if (field) {
|
|
4659
|
+
field.value = answer.value;
|
|
4660
|
+
field.source = "user";
|
|
4661
|
+
field.confidence = "confirmed";
|
|
4662
|
+
fieldsFilled++;
|
|
4663
|
+
}
|
|
4231
4664
|
}
|
|
4665
|
+
} catch (error) {
|
|
4666
|
+
await log?.(`Answer parsing failed: ${error instanceof Error ? error.message : String(error)}`);
|
|
4232
4667
|
}
|
|
4233
4668
|
}
|
|
4234
4669
|
if (intent.lookupRequests?.length) {
|
|
@@ -4249,36 +4684,45 @@ function createApplicationPipeline(config) {
|
|
|
4249
4684
|
const targetFields = state.fields.filter(
|
|
4250
4685
|
(f) => intent.lookupRequests.some((lr) => lr.targetFieldIds.includes(f.id))
|
|
4251
4686
|
);
|
|
4252
|
-
|
|
4253
|
-
|
|
4254
|
-
|
|
4255
|
-
|
|
4256
|
-
|
|
4257
|
-
|
|
4258
|
-
|
|
4259
|
-
|
|
4260
|
-
|
|
4261
|
-
const
|
|
4262
|
-
|
|
4263
|
-
field
|
|
4264
|
-
|
|
4265
|
-
|
|
4266
|
-
|
|
4687
|
+
try {
|
|
4688
|
+
const { result: lookupResult, usage: lookupUsage } = await fillFromLookup(
|
|
4689
|
+
intent.lookupRequests,
|
|
4690
|
+
targetFields,
|
|
4691
|
+
availableData,
|
|
4692
|
+
generateObject,
|
|
4693
|
+
providerOptions
|
|
4694
|
+
);
|
|
4695
|
+
trackUsage(lookupUsage);
|
|
4696
|
+
for (const fill of lookupResult.fills) {
|
|
4697
|
+
const field = state.fields.find((f) => f.id === fill.fieldId);
|
|
4698
|
+
if (field) {
|
|
4699
|
+
field.value = fill.value;
|
|
4700
|
+
field.source = `lookup: ${fill.source}`;
|
|
4701
|
+
field.confidence = "high";
|
|
4702
|
+
fieldsFilled++;
|
|
4703
|
+
}
|
|
4267
4704
|
}
|
|
4705
|
+
} catch (error) {
|
|
4706
|
+
await log?.(`Lookup fill failed: ${error instanceof Error ? error.message : String(error)}`);
|
|
4268
4707
|
}
|
|
4269
4708
|
}
|
|
4270
4709
|
}
|
|
4271
4710
|
if (intent.primaryIntent === "question" || intent.primaryIntent === "mixed") {
|
|
4272
4711
|
if (intent.questionText) {
|
|
4273
|
-
|
|
4274
|
-
|
|
4712
|
+
try {
|
|
4713
|
+
const { text, usage } = await generateText({
|
|
4714
|
+
prompt: `The user is filling out an insurance application and asked: "${intent.questionText}"
|
|
4275
4715
|
|
|
4276
4716
|
Provide a brief, helpful explanation (2-3 sentences). End with "Just reply with the answer when you're ready and I'll fill it in."`,
|
|
4277
|
-
|
|
4278
|
-
|
|
4279
|
-
|
|
4280
|
-
|
|
4281
|
-
|
|
4717
|
+
maxTokens: 512,
|
|
4718
|
+
providerOptions
|
|
4719
|
+
});
|
|
4720
|
+
trackUsage(usage);
|
|
4721
|
+
responseText = text;
|
|
4722
|
+
} catch (error) {
|
|
4723
|
+
await log?.(`Question response generation failed: ${error instanceof Error ? error.message : String(error)}`);
|
|
4724
|
+
responseText = `I wasn't able to generate an explanation for your question. Could you rephrase it, or just provide the answer directly?`;
|
|
4725
|
+
}
|
|
4282
4726
|
}
|
|
4283
4727
|
}
|
|
4284
4728
|
const currentBatchComplete = currentBatchFieldIds.every(
|
|
@@ -4292,26 +4736,30 @@ Provide a brief, helpful explanation (2-3 sentences). End with "Just reply with
|
|
|
4292
4736
|
(f) => nextBatchFieldIds.includes(f.id)
|
|
4293
4737
|
);
|
|
4294
4738
|
const filledCount = state.fields.filter((f) => f.value).length;
|
|
4295
|
-
|
|
4296
|
-
|
|
4297
|
-
|
|
4298
|
-
|
|
4299
|
-
|
|
4300
|
-
|
|
4301
|
-
|
|
4302
|
-
|
|
4303
|
-
|
|
4304
|
-
|
|
4305
|
-
|
|
4306
|
-
|
|
4307
|
-
|
|
4308
|
-
|
|
4309
|
-
|
|
4310
|
-
responseText
|
|
4311
|
-
|
|
4312
|
-
|
|
4739
|
+
try {
|
|
4740
|
+
const { text: emailText, usage: emailUsage } = await generateBatchEmail(
|
|
4741
|
+
nextBatchFields,
|
|
4742
|
+
state.currentBatchIndex,
|
|
4743
|
+
state.batches.length,
|
|
4744
|
+
{
|
|
4745
|
+
appTitle: state.title,
|
|
4746
|
+
totalFieldCount: state.fields.length,
|
|
4747
|
+
filledFieldCount: filledCount,
|
|
4748
|
+
companyName: context?.companyName
|
|
4749
|
+
},
|
|
4750
|
+
generateText,
|
|
4751
|
+
providerOptions
|
|
4752
|
+
);
|
|
4753
|
+
trackUsage(emailUsage);
|
|
4754
|
+
if (!responseText) {
|
|
4755
|
+
responseText = emailText;
|
|
4756
|
+
} else {
|
|
4757
|
+
responseText += `
|
|
4313
4758
|
|
|
4314
4759
|
${emailText}`;
|
|
4760
|
+
}
|
|
4761
|
+
} catch (error) {
|
|
4762
|
+
await log?.(`Batch email generation failed: ${error instanceof Error ? error.message : String(error)}`);
|
|
4315
4763
|
}
|
|
4316
4764
|
} else {
|
|
4317
4765
|
state.status = "confirming";
|
|
@@ -4520,7 +4968,7 @@ var EvidenceItemSchema = import_zod32.z.object({
|
|
|
4520
4968
|
turnId: import_zod32.z.string().optional(),
|
|
4521
4969
|
text: import_zod32.z.string().describe("Text excerpt from the source"),
|
|
4522
4970
|
relevance: import_zod32.z.number().min(0).max(1),
|
|
4523
|
-
metadata: import_zod32.z.
|
|
4971
|
+
metadata: import_zod32.z.array(import_zod32.z.object({ key: import_zod32.z.string(), value: import_zod32.z.string() })).optional()
|
|
4524
4972
|
});
|
|
4525
4973
|
var RetrievalResultSchema = import_zod32.z.object({
|
|
4526
4974
|
subQuestion: import_zod32.z.string(),
|
|
@@ -4556,6 +5004,9 @@ var QueryResultSchema = import_zod32.z.object({
|
|
|
4556
5004
|
});
|
|
4557
5005
|
|
|
4558
5006
|
// src/query/retriever.ts
|
|
5007
|
+
/**
 * Convert a plain key/value record into an array of `{ key, value }` pairs.
 *
 * A missing record (`undefined` or `null`) yields an empty array instead of
 * throwing, because callers pass `chunk.metadata` straight through and the
 * evidence schema treats `metadata` as optional.
 *
 * @param {Record<string, unknown> | null | undefined} record - Source record.
 * @returns {Array<{ key: string, value: unknown }>} One entry per own
 *   enumerable property, in `Object.entries` order; values are passed through
 *   unchanged.
 */
function recordToKVArray(record) {
  // `== null` intentionally matches both null and undefined.
  if (record == null) {
    return [];
  }
  return Object.entries(record).map(([key, value]) => ({ key, value }));
}
|
|
4559
5010
|
async function retrieve(subQuestion, conversationId, config) {
|
|
4560
5011
|
const { documentStore, memoryStore, retrievalLimit, log } = config;
|
|
4561
5012
|
const evidence = [];
|
|
@@ -4582,7 +5033,7 @@ async function retrieve(subQuestion, conversationId, config) {
|
|
|
4582
5033
|
text: chunk.text,
|
|
4583
5034
|
relevance: 0.8,
|
|
4584
5035
|
// Default — store doesn't expose scores directly
|
|
4585
|
-
metadata: chunk.metadata
|
|
5036
|
+
metadata: recordToKVArray(chunk.metadata)
|
|
4586
5037
|
});
|
|
4587
5038
|
}
|
|
4588
5039
|
}
|
|
@@ -4597,7 +5048,7 @@ async function retrieve(subQuestion, conversationId, config) {
|
|
|
4597
5048
|
documentId: chunk.documentId,
|
|
4598
5049
|
text: chunk.text,
|
|
4599
5050
|
relevance: 0.8,
|
|
4600
|
-
metadata: chunk.metadata
|
|
5051
|
+
metadata: recordToKVArray(chunk.metadata)
|
|
4601
5052
|
});
|
|
4602
5053
|
}
|
|
4603
5054
|
}
|
|
@@ -4625,11 +5076,11 @@ async function retrieve(subQuestion, conversationId, config) {
|
|
|
4625
5076
|
text: summary,
|
|
4626
5077
|
relevance: 0.9,
|
|
4627
5078
|
// Direct lookup is high relevance
|
|
4628
|
-
metadata:
|
|
4629
|
-
type: doc.type,
|
|
4630
|
-
carrier: doc.carrier ?? "",
|
|
4631
|
-
insuredName: doc.insuredName ?? ""
|
|
4632
|
-
|
|
5079
|
+
metadata: [
|
|
5080
|
+
{ key: "type", value: doc.type },
|
|
5081
|
+
{ key: "carrier", value: doc.carrier ?? "" },
|
|
5082
|
+
{ key: "insuredName", value: doc.insuredName ?? "" }
|
|
5083
|
+
]
|
|
4633
5084
|
});
|
|
4634
5085
|
}
|
|
4635
5086
|
} catch (e) {
|
|
@@ -4864,8 +5315,12 @@ function createQueryAgent(config) {
|
|
|
4864
5315
|
async function query(input) {
|
|
4865
5316
|
totalUsage = { inputTokens: 0, outputTokens: 0 };
|
|
4866
5317
|
const { question, conversationId, context } = input;
|
|
5318
|
+
const pipelineCtx = createPipelineContext({
|
|
5319
|
+
id: `query-${Date.now()}`
|
|
5320
|
+
});
|
|
4867
5321
|
onProgress?.("Classifying query...");
|
|
4868
5322
|
const classification = await classify(question, conversationId);
|
|
5323
|
+
await pipelineCtx.save("classify", { classification });
|
|
4869
5324
|
onProgress?.(`Retrieving evidence for ${classification.subQuestions.length} sub-question(s)...`);
|
|
4870
5325
|
const retrieverConfig = {
|
|
4871
5326
|
documentStore,
|
|
@@ -4879,9 +5334,10 @@ function createQueryAgent(config) {
|
|
|
4879
5334
|
)
|
|
4880
5335
|
);
|
|
4881
5336
|
const allEvidence = retrievalResults.flatMap((r) => r.evidence);
|
|
5337
|
+
await pipelineCtx.save("retrieve", { classification, evidence: allEvidence });
|
|
4882
5338
|
onProgress?.("Reasoning over evidence...");
|
|
4883
5339
|
const reasonerConfig = { generateObject, providerOptions };
|
|
4884
|
-
|
|
5340
|
+
const reasonResults = await Promise.allSettled(
|
|
4885
5341
|
classification.subQuestions.map(
|
|
4886
5342
|
(sq, i) => limit(async () => {
|
|
4887
5343
|
const { subAnswer, usage } = await reason(
|
|
@@ -4895,10 +5351,27 @@ function createQueryAgent(config) {
|
|
|
4895
5351
|
})
|
|
4896
5352
|
)
|
|
4897
5353
|
);
|
|
5354
|
+
let subAnswers = [];
|
|
5355
|
+
for (let i = 0; i < reasonResults.length; i++) {
|
|
5356
|
+
const result = reasonResults[i];
|
|
5357
|
+
if (result.status === "fulfilled") {
|
|
5358
|
+
subAnswers.push(result.value);
|
|
5359
|
+
} else {
|
|
5360
|
+
await log?.(`Reasoner failed for sub-question "${classification.subQuestions[i].question}": ${result.reason}`);
|
|
5361
|
+
subAnswers.push({
|
|
5362
|
+
subQuestion: classification.subQuestions[i].question,
|
|
5363
|
+
answer: "Unable to answer this part of the question due to a processing error.",
|
|
5364
|
+
citations: [],
|
|
5365
|
+
confidence: 0,
|
|
5366
|
+
needsMoreContext: true
|
|
5367
|
+
});
|
|
5368
|
+
}
|
|
5369
|
+
}
|
|
5370
|
+
await pipelineCtx.save("reason", { classification, evidence: allEvidence, subAnswers });
|
|
4898
5371
|
onProgress?.("Verifying answer grounding...");
|
|
4899
5372
|
const verifierConfig = { generateObject, providerOptions };
|
|
4900
5373
|
for (let round = 0; round < maxVerifyRounds; round++) {
|
|
4901
|
-
const { result: verifyResult, usage } = await
|
|
5374
|
+
const { result: verifyResult, usage } = await safeVerify(
|
|
4902
5375
|
question,
|
|
4903
5376
|
subAnswers,
|
|
4904
5377
|
allEvidence,
|
|
@@ -4922,7 +5395,6 @@ function createQueryAgent(config) {
|
|
|
4922
5395
|
() => retrieve(sq, conversationId, {
|
|
4923
5396
|
...retrieverConfig,
|
|
4924
5397
|
retrievalLimit: retrievalLimit * 2
|
|
4925
|
-
// Broader retrieval on retry
|
|
4926
5398
|
})
|
|
4927
5399
|
)
|
|
4928
5400
|
)
|
|
@@ -4930,7 +5402,7 @@ function createQueryAgent(config) {
|
|
|
4930
5402
|
for (const r of retryRetrievals) {
|
|
4931
5403
|
allEvidence.push(...r.evidence);
|
|
4932
5404
|
}
|
|
4933
|
-
const
|
|
5405
|
+
const retrySettled = await Promise.allSettled(
|
|
4934
5406
|
retryQuestions.map(
|
|
4935
5407
|
(sq, i) => limit(async () => {
|
|
4936
5408
|
const { subAnswer, usage: u } = await reason(
|
|
@@ -4944,6 +5416,7 @@ function createQueryAgent(config) {
|
|
|
4944
5416
|
})
|
|
4945
5417
|
)
|
|
4946
5418
|
);
|
|
5419
|
+
const retrySubAnswers = retrySettled.filter((r) => r.status === "fulfilled").map((r) => r.value);
|
|
4947
5420
|
const retryQSet = new Set(retryQuestions.map((sq) => sq.question));
|
|
4948
5421
|
subAnswers = subAnswers.map((sa) => {
|
|
4949
5422
|
if (retryQSet.has(sa.subQuestion)) {
|
|
@@ -4996,17 +5469,42 @@ function createQueryAgent(config) {
|
|
|
4996
5469
|
}
|
|
4997
5470
|
}
|
|
4998
5471
|
const prompt = buildQueryClassifyPrompt(question, conversationContext);
|
|
4999
|
-
const { object, usage } = await
|
|
5000
|
-
|
|
5472
|
+
const { object, usage } = await safeGenerateObject(
|
|
5473
|
+
generateObject,
|
|
5474
|
+
{
|
|
5001
5475
|
prompt,
|
|
5002
5476
|
schema: QueryClassifyResultSchema,
|
|
5003
5477
|
maxTokens: 2048,
|
|
5004
5478
|
providerOptions
|
|
5005
|
-
}
|
|
5479
|
+
},
|
|
5480
|
+
{
|
|
5481
|
+
fallback: {
|
|
5482
|
+
intent: "general_knowledge",
|
|
5483
|
+
subQuestions: [
|
|
5484
|
+
{
|
|
5485
|
+
question,
|
|
5486
|
+
intent: "general_knowledge"
|
|
5487
|
+
}
|
|
5488
|
+
],
|
|
5489
|
+
requiresDocumentLookup: true,
|
|
5490
|
+
requiresChunkSearch: true,
|
|
5491
|
+
requiresConversationHistory: !!conversationId
|
|
5492
|
+
},
|
|
5493
|
+
log,
|
|
5494
|
+
onError: (err, attempt) => log?.(`Query classify attempt ${attempt + 1} failed: ${err}`)
|
|
5495
|
+
}
|
|
5006
5496
|
);
|
|
5007
5497
|
trackUsage(usage);
|
|
5008
5498
|
return object;
|
|
5009
5499
|
}
|
|
5500
|
+
/**
 * Run `verify` and never reject because verification itself failed.
 *
 * On success the value returned by `verify` is passed through untouched.
 * If `verify` throws, the failure is reported through `log` (when one is
 * configured) and an "approved, no issues" result is returned so the caller
 * can continue.
 *
 * @param {string} originalQuestion - Forwarded to `verify` unchanged.
 * @param {Array<object>} subAnswers - Forwarded to `verify` unchanged.
 * @param {Array<object>} allEvidence - Forwarded to `verify` unchanged.
 * @param {object} verifierConfig - Forwarded to `verify` unchanged.
 * @returns {Promise<{ result: object, usage: object }>} The `verify` outcome,
 *   or the approve-by-default fallback.
 */
async function safeVerify(originalQuestion, subAnswers, allEvidence, verifierConfig) {
  try {
    return await verify(originalQuestion, subAnswers, allEvidence, verifierConfig);
  } catch (error) {
    await log?.(`Verification failed, approving by default: ${error instanceof Error ? error.message : String(error)}`);
    return {
      result: { approved: true, issues: [] },
      // The call site destructures `usage` alongside `result`; presumably it is
      // fed to token accounting (not visible here). Report zero tokens rather
      // than `undefined` so that accumulation stays safe on the fallback path.
      usage: { inputTokens: 0, outputTokens: 0 },
    };
  }
}
|
|
5010
5508
|
async function respond(originalQuestion, subAnswers, classification, platform) {
|
|
5011
5509
|
const subAnswersJson = JSON.stringify(
|
|
5012
5510
|
subAnswers.map((sa) => ({
|
|
@@ -5020,13 +5518,25 @@ function createQueryAgent(config) {
|
|
|
5020
5518
|
2
|
|
5021
5519
|
);
|
|
5022
5520
|
const prompt = buildRespondPrompt(originalQuestion, subAnswersJson, platform);
|
|
5023
|
-
const { object, usage } = await
|
|
5024
|
-
|
|
5521
|
+
const { object, usage } = await safeGenerateObject(
|
|
5522
|
+
generateObject,
|
|
5523
|
+
{
|
|
5025
5524
|
prompt,
|
|
5026
5525
|
schema: QueryResultSchema,
|
|
5027
5526
|
maxTokens: 4096,
|
|
5028
5527
|
providerOptions
|
|
5029
|
-
}
|
|
5528
|
+
},
|
|
5529
|
+
{
|
|
5530
|
+
fallback: {
|
|
5531
|
+
answer: subAnswers.map((sa) => `**${sa.subQuestion}**
|
|
5532
|
+
${sa.answer}`).join("\n\n"),
|
|
5533
|
+
citations: subAnswers.flatMap((sa) => sa.citations),
|
|
5534
|
+
intent: classification.intent,
|
|
5535
|
+
confidence: Math.min(...subAnswers.map((sa) => sa.confidence), 1)
|
|
5536
|
+
},
|
|
5537
|
+
log,
|
|
5538
|
+
onError: (err, attempt) => log?.(`Respond attempt ${attempt + 1} failed: ${err}`)
|
|
5539
|
+
}
|
|
5030
5540
|
);
|
|
5031
5541
|
trackUsage(usage);
|
|
5032
5542
|
const result = object;
|
|
@@ -5192,6 +5702,7 @@ var AGENT_TOOLS = [
|
|
|
5192
5702
|
CommercialAutoDeclarationsSchema,
|
|
5193
5703
|
CommercialPropertyDeclarationsSchema,
|
|
5194
5704
|
CommunicationIntentSchema,
|
|
5705
|
+
ConditionKeyValueSchema,
|
|
5195
5706
|
ConditionTypeSchema,
|
|
5196
5707
|
ConstructionTypeSchema,
|
|
5197
5708
|
ContactSchema,
|
|
@@ -5358,6 +5869,7 @@ var AGENT_TOOLS = [
|
|
|
5358
5869
|
chunkDocument,
|
|
5359
5870
|
createApplicationPipeline,
|
|
5360
5871
|
createExtractor,
|
|
5872
|
+
createPipelineContext,
|
|
5361
5873
|
createQueryAgent,
|
|
5362
5874
|
extractPageRange,
|
|
5363
5875
|
fillAcroForm,
|
|
@@ -5367,6 +5879,7 @@ var AGENT_TOOLS = [
|
|
|
5367
5879
|
getTemplate,
|
|
5368
5880
|
overlayTextOnPdf,
|
|
5369
5881
|
pLimit,
|
|
5882
|
+
safeGenerateObject,
|
|
5370
5883
|
sanitizeNulls,
|
|
5371
5884
|
stripFences,
|
|
5372
5885
|
withRetry
|