@claritylabs/cl-sdk 0.5.0 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +33 -9
- package/dist/index.d.mts +382 -77
- package/dist/index.d.ts +382 -77
- package/dist/index.js +718 -205
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +715 -205
- package/dist/index.mjs.map +1 -1
- package/dist/storage-sqlite.d.mts +52 -10
- package/dist/storage-sqlite.d.ts +52 -10
- package/package.json +1 -1
package/dist/index.mjs
CHANGED
|
@@ -71,6 +71,69 @@ function sanitizeNulls(obj) {
|
|
|
71
71
|
return obj;
|
|
72
72
|
}
|
|
73
73
|
|
|
74
|
+
// src/core/safe-generate.ts
|
|
75
|
+
/**
 * Calls `generateObject(params)` (wrapped in `withRetry`) and retries on failure.
 *
 * Up to `maxRetries + 1` attempts are made, with a one-second pause between
 * failed attempts. When every attempt fails, `{ object: options.fallback }` is
 * returned if a fallback was supplied; otherwise the last error is rethrown.
 *
 * @param {Function} generateObject - Async function invoked with `params`.
 * @param {object} params - Passed through to `generateObject` unchanged.
 * @param {object} [options]
 * @param {number} [options.maxRetries=1] - Extra attempts after the first.
 *   Values that are not greater than 0 (negative, NaN) are treated as 0 so at
 *   least one attempt always runs.
 * @param {*} [options.fallback] - Returned as `{ object: fallback }` when all
 *   attempts fail and this is not `undefined`.
 * @param {Function} [options.log] - Optional logger; also forwarded to `withRetry`.
 * @param {Function} [options.onError] - Called with `(error, attempt)` after each
 *   failed attempt. Its result is awaited.
 * @returns {Promise<object>} The value `generateObject` resolved with, or
 *   `{ object: fallback }`.
 * @throws The last error seen when all attempts fail and no fallback exists.
 */
async function safeGenerateObject(generateObject, params, options) {
  const requestedRetries = options?.maxRetries ?? 1;
  // Without this clamp a negative value skips the loop and ends in `throw undefined`.
  const maxRetries = requestedRetries > 0 ? requestedRetries : 0;
  let lastError;
  for (let attempt = 0; attempt <= maxRetries; attempt++) {
    try {
      const result = await withRetry(
        () => generateObject(params),
        options?.log
      );
      return result;
    } catch (error) {
      lastError = error;
      // Awaited because callers pass async handlers; leaving the promise
      // floating would surface a failure as an unhandled rejection.
      await options?.onError?.(error, attempt);
      await options?.log?.(
        `safeGenerateObject attempt ${attempt + 1}/${maxRetries + 1} failed: ${error instanceof Error ? error.message : String(error)}`
      );
      if (attempt < maxRetries) {
        // Fixed one-second pause before the next attempt.
        await new Promise((resolve) => setTimeout(resolve, 1e3));
      }
    }
  }
  if (options?.fallback !== void 0) {
    await options?.log?.(
      `safeGenerateObject: all retries exhausted, returning fallback`
    );
    return { object: options.fallback };
  }
  throw lastError;
}
|
|
104
|
+
|
|
105
|
+
// src/core/pipeline.ts
|
|
106
|
+
/**
 * Builds an in-memory checkpoint tracker for a multi-phase pipeline run.
 *
 * @param {object} opts
 * @param {*} opts.id - Exposed unchanged as `id` on the returned context.
 * @param {object} [opts.resumeFrom] - Checkpoint to start from. Only its own
 *   `phase` is pre-marked complete; no other phases are inferred from it.
 * @param {Function} [opts.onSave] - Called (and awaited) with every new checkpoint.
 * @returns {object} Context exposing `id`, `save`, `getCheckpoint`,
 *   `isPhaseComplete` and `clear`.
 */
function createPipelineContext(opts) {
  const initialCheckpoint = opts.resumeFrom;
  const donePhases = new Set(initialCheckpoint ? [initialCheckpoint.phase] : []);
  let current = initialCheckpoint;

  return {
    id: opts.id,

    // Records a checkpoint for `phase`, marks it complete, then notifies onSave.
    async save(phase, state) {
      const snapshot = { phase, state, timestamp: Date.now() };
      current = snapshot;
      donePhases.add(phase);
      await opts.onSave?.(snapshot);
    },

    // Most recent checkpoint, or the resume checkpoint if nothing was saved yet.
    getCheckpoint() {
      return current;
    },

    isPhaseComplete(phase) {
      return donePhases.has(phase);
    },

    // Forgets the latest checkpoint and every completed phase.
    clear() {
      donePhases.clear();
      current = void 0;
    }
  };
}
|
|
136
|
+
|
|
74
137
|
// src/schemas/enums.ts
|
|
75
138
|
import { z } from "zod";
|
|
76
139
|
var PolicyTypeSchema = z.enum([
|
|
@@ -471,11 +534,15 @@ var ExclusionSchema = z5.object({
|
|
|
471
534
|
|
|
472
535
|
// src/schemas/condition.ts
|
|
473
536
|
import { z as z6 } from "zod";
|
|
537
|
+
// One key/value pair attached to a policy condition; both sides are strings.
var ConditionKeyValueSchema = z6.object({ key: z6.string(), value: z6.string() });
|
|
474
541
|
var PolicyConditionSchema = z6.object({
|
|
475
542
|
name: z6.string(),
|
|
476
543
|
conditionType: ConditionTypeSchema,
|
|
477
544
|
content: z6.string(),
|
|
478
|
-
keyValues: z6.
|
|
545
|
+
keyValues: z6.array(ConditionKeyValueSchema).optional(),
|
|
479
546
|
pageNumber: z6.number().optional()
|
|
480
547
|
});
|
|
481
548
|
|
|
@@ -1471,6 +1538,218 @@ function assembleDocument(documentId, documentType, memory) {
|
|
|
1471
1538
|
};
|
|
1472
1539
|
}
|
|
1473
1540
|
|
|
1541
|
+
// src/prompts/coordinator/format.ts
|
|
1542
|
+
function buildFormatPrompt(entries) {
|
|
1543
|
+
const block = entries.map((e) => `===ENTRY ${e.id}===
|
|
1544
|
+
${e.text}`).join("\n\n");
|
|
1545
|
+
return `You are a markdown formatting specialist for insurance document content. You will receive numbered content entries extracted from insurance policies, quotes, and endorsements. Your job is to clean up the formatting so every entry renders correctly as standard markdown.
|
|
1546
|
+
|
|
1547
|
+
## Primary issues to fix
|
|
1548
|
+
|
|
1549
|
+
### 1. Pipe-delimited data missing table syntax
|
|
1550
|
+
The most common issue. Content uses pipe characters as column separators but is missing the separator row required for markdown table rendering.
|
|
1551
|
+
|
|
1552
|
+
Before (broken \u2014 won't render as a table):
|
|
1553
|
+
COVERAGE | FORM # | LIMIT | DEDUCTIBLE
|
|
1554
|
+
Employee Theft | | $10,000 | $1,000
|
|
1555
|
+
|
|
1556
|
+
After (valid markdown table):
|
|
1557
|
+
| COVERAGE | FORM # | LIMIT | DEDUCTIBLE |
|
|
1558
|
+
| --- | --- | --- | --- |
|
|
1559
|
+
| Employee Theft | | $10,000 | $1,000 |
|
|
1560
|
+
|
|
1561
|
+
Rules for pipe tables:
|
|
1562
|
+
- Add leading and trailing pipes to every row
|
|
1563
|
+
- Add the separator row (| --- | --- |) after the header row
|
|
1564
|
+
- Every row must have the same number of pipe-separated columns as the header
|
|
1565
|
+
- Empty cells are fine \u2014 just keep the pipes: | | $10,000 |
|
|
1566
|
+
|
|
1567
|
+
### 2. Sub-items indented within pipe tables
|
|
1568
|
+
Insurance schedules often have indented sub-items that belong to the previous coverage line. These break table column counts.
|
|
1569
|
+
|
|
1570
|
+
Before (broken):
|
|
1571
|
+
COVERAGE | LIMIT | DEDUCTIBLE
|
|
1572
|
+
Causes Of Loss - Equipment Breakdown | PR650END
|
|
1573
|
+
Described Premises Limit | | $350,804 |
|
|
1574
|
+
Diagnostic Equipment | | $100,000 |
|
|
1575
|
+
Deductible Type - Business Income: Waiting Period - Hours
|
|
1576
|
+
Waiting Period (Hours): 24
|
|
1577
|
+
|
|
1578
|
+
After: Pull sub-items out of the table. End the table before the sub-items, show them as an indented list, then start a new table if tabular data resumes:
|
|
1579
|
+
| COVERAGE | LIMIT | DEDUCTIBLE |
|
|
1580
|
+
| --- | --- | --- |
|
|
1581
|
+
| Causes Of Loss - Equipment Breakdown | PR650END | |
|
|
1582
|
+
|
|
1583
|
+
- Described Premises Limit: $350,804
|
|
1584
|
+
- Diagnostic Equipment: $100,000
|
|
1585
|
+
- Deductible Type - Business Income: Waiting Period - Hours
|
|
1586
|
+
- Waiting Period (Hours): 24
|
|
1587
|
+
|
|
1588
|
+
### 3. Space-aligned tables
|
|
1589
|
+
Declarations often align columns with spaces instead of pipes. These render as plain monospace text and lose structure.
|
|
1590
|
+
|
|
1591
|
+
Before:
|
|
1592
|
+
Coverage Limit of Liability Retention
|
|
1593
|
+
A. Network Security Liability $500,000 $10,000
|
|
1594
|
+
B. Privacy Liability $500,000 $10,000
|
|
1595
|
+
|
|
1596
|
+
After (convert to proper markdown table):
|
|
1597
|
+
| Coverage | Limit of Liability | Retention |
|
|
1598
|
+
| --- | --- | --- |
|
|
1599
|
+
| A. Network Security Liability | $500,000 | $10,000 |
|
|
1600
|
+
| B. Privacy Liability | $500,000 | $10,000 |
|
|
1601
|
+
|
|
1602
|
+
### 4. Mixed table/prose content
|
|
1603
|
+
A single entry often contains prose paragraphs followed by tabular data followed by more prose. Handle each segment independently \u2014 don't try to force everything into one table.
|
|
1604
|
+
|
|
1605
|
+
### 5. General markdown cleanup
|
|
1606
|
+
- **Line spacing**: Remove excessive blank lines (3+ consecutive newlines \u2192 2). Ensure one blank line before and after tables and headings.
|
|
1607
|
+
- **Trailing whitespace**: Remove trailing spaces on all lines.
|
|
1608
|
+
- **Broken lists**: Ensure list items use consistent markers (-, *, or 1.) with proper nesting indentation.
|
|
1609
|
+
- **Orphaned formatting**: Close any unclosed bold (**), italic (*), or code (\`) markers.
|
|
1610
|
+
- **Heading levels**: Ensure heading markers (##) have a space after the hashes.
|
|
1611
|
+
|
|
1612
|
+
## Rules
|
|
1613
|
+
- Do NOT change the meaning or substance of any content. Only fix formatting.
|
|
1614
|
+
- Do NOT add new information, headers, or commentary.
|
|
1615
|
+
- Do NOT wrap entries in code fences.
|
|
1616
|
+
- Preserve all dollar amounts, dates, policy numbers, form numbers, and technical terms exactly as they appear.
|
|
1617
|
+
- If an entry is already well-formatted, return it unchanged.
|
|
1618
|
+
- When in doubt about whether something is a table, prefer table formatting for structured data with multiple columns.
|
|
1619
|
+
|
|
1620
|
+
Return your output in this exact format \u2014 one block per entry, in the same order:
|
|
1621
|
+
|
|
1622
|
+
===ENTRY 0===
|
|
1623
|
+
(cleaned content for entry 0)
|
|
1624
|
+
|
|
1625
|
+
===ENTRY 1===
|
|
1626
|
+
(cleaned content for entry 1)
|
|
1627
|
+
|
|
1628
|
+
...and so on for each entry.
|
|
1629
|
+
|
|
1630
|
+
Here are the entries to format:
|
|
1631
|
+
|
|
1632
|
+
${block}`;
|
|
1633
|
+
}
|
|
1634
|
+
|
|
1635
|
+
// src/extraction/formatter.ts
|
|
1636
|
+
/**
 * Gathers the free-text fields of an assembled document that are long enough
 * to be worth reformatting.
 *
 * Looks at `doc.summary`, each `sections[i].content`, each
 * `sections[i].subsections[j].content`, and the `content` of every entry in
 * `endorsements`, `exclusions` and `conditions`, in that order. Only strings
 * longer than 20 characters are collected. Missing lists, null/undefined list
 * elements and non-string content are skipped instead of throwing.
 *
 * @param {object} doc - Document to scan; not modified.
 * @returns {Array<{id: number, path: string, text: string}>} Entries with
 *   sequential ids (starting at 0) and a path string locating the field in `doc`.
 */
function collectContentFields(doc) {
  const entries = [];

  // Ids are assigned in push order, so the next id is the current length.
  const add = (path, text) => {
    if (typeof text === "string" && text.length > 20) {
      entries.push({ id: entries.length, path, text });
    }
  };
  // Treat anything that is not an array as "no items".
  const listOf = (value) => (Array.isArray(value) ? value : []);

  add("summary", doc.summary);

  listOf(doc.sections).forEach((section, i) => {
    add(`sections[${i}].content`, section?.content);
    listOf(section?.subsections).forEach((subsection, j) => {
      add(`sections[${i}].subsections[${j}].content`, subsection?.content);
    });
  });

  for (const field of ["endorsements", "exclusions", "conditions"]) {
    listOf(doc[field]).forEach((item, i) => {
      add(`${field}[${i}].content`, item?.content);
    });
  }

  return entries;
}
|
|
1673
|
+
/**
 * Splits a model response into per-entry text using the `===ENTRY n===` markers.
 *
 * Text before the first marker is ignored. Each entry's content runs from the
 * end of its marker to the start of the next one (or the end of the response)
 * and is trimmed. When an id repeats, the later content wins.
 *
 * @param {string} response - Raw text returned by the model.
 * @returns {Map<number, string>} Entry id mapped to its trimmed content.
 */
function parseFormatResponse(response) {
  const results = new Map();
  const markers = [...response.matchAll(/===ENTRY (\d+)===/g)];

  markers.forEach((marker, position) => {
    const entryId = parseInt(marker[1], 10);
    if (Number.isNaN(entryId)) return;
    const bodyStart = marker.index + marker[0].length;
    const following = markers[position + 1];
    const bodyEnd = following ? following.index : response.length;
    results.set(entryId, response.slice(bodyStart, bodyEnd).trim());
  });

  return results;
}
|
|
1685
|
+
/**
 * Writes formatted text back into `doc` at the location named by each entry's path.
 *
 * Supported path shapes are `field`, `field[i].prop` and
 * `field[i].prop[j].prop`. Entries are skipped when there is no (or empty)
 * formatted text for their id, when the path does not match, or when the
 * target element does not exist in `doc`.
 *
 * @param {object} doc - Document to update in place.
 * @param {Array<{id: number, path: string}>} entries - Entries to apply.
 * @param {Map<number, string>} formatted - Formatted text keyed by entry id.
 * @returns {void}
 */
function applyFormattedContent(doc, entries, formatted) {
  const pathPattern = /^(\w+)(?:\[(\d+)\])?(?:\.(\w+)(?:\[(\d+)\])?(?:\.(\w+))?)?$/;

  for (const entry of entries) {
    const cleaned = formatted.get(entry.id);
    if (!cleaned) continue;

    const parsed = entry.path.match(pathPattern);
    if (!parsed) continue;
    const [, field, idx1, sub1, idx2, sub2] = parsed;

    // Plain top-level field, e.g. "summary".
    if (!sub1) {
      doc[field] = cleaned;
      continue;
    }

    const list = doc[field];
    const item = list && list[Number(idx1)];
    if (!item) continue;

    // One level deep, e.g. "sections[0].content".
    if (!sub2) {
      item[sub1] = cleaned;
      continue;
    }

    // Two levels deep, e.g. "sections[0].subsections[1].content".
    const nestedList = item[sub1];
    const nestedItem = nestedList && nestedList[Number(idx2)];
    if (nestedItem) {
      nestedItem[sub2] = cleaned;
    }
  }
}
|
|
1710
|
+
// Upper bound on how many content entries are sent to the model in one prompt.
var MAX_ENTRIES_PER_BATCH = 20;

/**
 * Sends the document's long text fields to the model for markdown cleanup and
 * writes the cleaned text back into `doc`.
 *
 * Entries are processed in batches of `MAX_ENTRIES_PER_BATCH`. A batch that
 * fails is logged and skipped, leaving its fields unchanged; the function
 * itself does not reject because of a failed batch.
 *
 * @param {object} doc - Document to format; updated in place.
 * @param {Function} generateText - Async function called with
 *   `{ prompt, maxTokens, providerOptions }`; its result is read for `text`
 *   and optional `usage`.
 * @param {object} [options]
 * @param {*} [options.providerOptions] - Forwarded to `generateText`.
 * @param {Function} [options.onProgress] - Called once with a progress message.
 * @param {Function} [options.log] - Optional logger; also forwarded to `withRetry`.
 * @returns {Promise<{document: object, usage: {inputTokens: number, outputTokens: number}}>}
 */
async function formatDocumentContent(doc, generateText, options) {
  const entries = collectContentFields(doc);
  const totalUsage = { inputTokens: 0, outputTokens: 0 };
  if (entries.length === 0) {
    return { document: doc, usage: totalUsage };
  }
  options?.onProgress?.(`Formatting ${entries.length} content fields...`);

  const batches = [];
  for (let i = 0; i < entries.length; i += MAX_ENTRIES_PER_BATCH) {
    batches.push(entries.slice(i, i + MAX_ENTRIES_PER_BATCH));
  }

  for (let batchIdx = 0; batchIdx < batches.length; batchIdx++) {
    const batch = batches[batchIdx];
    try {
      const prompt = buildFormatPrompt(batch.map((e) => ({ id: e.id, text: e.text })));
      // The logger is passed along, matching the withRetry call in safeGenerateObject.
      const result = await withRetry(
        () => generateText({
          prompt,
          maxTokens: 16384,
          providerOptions: options?.providerOptions
        }),
        options?.log
      );
      // A usage object may omit a field; default to 0 so totals never become NaN.
      totalUsage.inputTokens += result.usage?.inputTokens ?? 0;
      totalUsage.outputTokens += result.usage?.outputTokens ?? 0;

      const formatted = parseFormatResponse(result.text);
      // Count only ids that belong to this batch: the model may return ids
      // that match nothing here, which must not hide missing entries.
      const matched = batch.filter((e) => formatted.has(e.id)).length;
      if (matched < batch.length) {
        await options?.log?.(
          `Format batch ${batchIdx + 1}/${batches.length}: model returned ${matched}/${batch.length} entries \u2014 unformatted entries will keep original content`
        );
      }
      applyFormattedContent(doc, batch, formatted);
    } catch (error) {
      // Best effort: keep the original text for this batch and move on.
      await options?.log?.(
        `Format batch ${batchIdx + 1}/${batches.length} failed, keeping original content: ${error instanceof Error ? error.message : String(error)}`
      );
    }
  }
  return { document: doc, usage: totalUsage };
}
|
|
1752
|
+
|
|
1474
1753
|
// src/extraction/chunking.ts
|
|
1475
1754
|
function chunkDocument(doc) {
|
|
1476
1755
|
const chunks = [];
|
|
@@ -2307,9 +2586,13 @@ var ExtractionTaskSchema = z18.object({
|
|
|
2307
2586
|
endPage: z18.number(),
|
|
2308
2587
|
description: z18.string()
|
|
2309
2588
|
});
|
|
2589
|
+
// One page-map row: a section name and the pages it covers, both as strings.
var PageMapEntrySchema = z18.object({ section: z18.string(), pages: z18.string() });
|
|
2310
2593
|
var ExtractionPlanSchema = z18.object({
|
|
2311
2594
|
tasks: z18.array(ExtractionTaskSchema),
|
|
2312
|
-
pageMap: z18.
|
|
2595
|
+
pageMap: z18.array(PageMapEntrySchema).optional()
|
|
2313
2596
|
});
|
|
2314
2597
|
function buildPlanPrompt(templateHints) {
|
|
2315
2598
|
return `You are planning the extraction of an insurance document. You have already classified this document. Now scan the full document and create a page map + extraction plan.
|
|
@@ -2338,7 +2621,10 @@ Return JSON:
|
|
|
2338
2621
|
{ "extractorName": "carrier_info", "startPage": 1, "endPage": 2, "description": "Extract carrier details from declarations page" },
|
|
2339
2622
|
...
|
|
2340
2623
|
],
|
|
2341
|
-
"pageMap":
|
|
2624
|
+
"pageMap": [
|
|
2625
|
+
{ "section": "declarations", "pages": "pages 1-3" },
|
|
2626
|
+
{ "section": "endorsements", "pages": "pages 15-22" }
|
|
2627
|
+
]
|
|
2342
2628
|
}
|
|
2343
2629
|
|
|
2344
2630
|
Create tasks that cover the entire document. Prefer specific extractors over generic "sections" where possible. Keep page ranges tight \u2014 only include pages relevant to each extractor.
|
|
@@ -2831,7 +3117,8 @@ function createExtractor(config) {
|
|
|
2831
3117
|
onTokenUsage,
|
|
2832
3118
|
onProgress,
|
|
2833
3119
|
log,
|
|
2834
|
-
providerOptions
|
|
3120
|
+
providerOptions,
|
|
3121
|
+
onCheckpointSave
|
|
2835
3122
|
} = config;
|
|
2836
3123
|
const limit = pLimit(concurrency);
|
|
2837
3124
|
let totalUsage = { inputTokens: 0, outputTokens: 0 };
|
|
@@ -2842,100 +3129,106 @@ function createExtractor(config) {
|
|
|
2842
3129
|
onTokenUsage?.(usage);
|
|
2843
3130
|
}
|
|
2844
3131
|
}
|
|
2845
|
-
async function extract(pdfBase64, documentId) {
|
|
3132
|
+
async function extract(pdfBase64, documentId, options) {
|
|
2846
3133
|
const id = documentId ?? `doc-${Date.now()}`;
|
|
2847
3134
|
const memory = /* @__PURE__ */ new Map();
|
|
2848
3135
|
totalUsage = { inputTokens: 0, outputTokens: 0 };
|
|
2849
|
-
|
|
2850
|
-
|
|
2851
|
-
|
|
2852
|
-
|
|
2853
|
-
|
|
2854
|
-
|
|
2855
|
-
|
|
2856
|
-
|
|
2857
|
-
|
|
2858
|
-
);
|
|
2859
|
-
trackUsage(classifyResult.usage);
|
|
2860
|
-
memory.set("classify", classifyResult.object);
|
|
2861
|
-
const { documentType, policyTypes } = classifyResult.object;
|
|
2862
|
-
const primaryType = policyTypes[0] ?? "other";
|
|
2863
|
-
const template = getTemplate(primaryType);
|
|
2864
|
-
onProgress?.(`Planning extraction for ${primaryType} ${documentType}...`);
|
|
2865
|
-
const templateHints = [
|
|
2866
|
-
`Document type: ${primaryType} ${documentType}`,
|
|
2867
|
-
`Expected sections: ${template.expectedSections.join(", ")}`,
|
|
2868
|
-
`Page hints: ${Object.entries(template.pageHints).map(([k, v]) => `${k}: ${v}`).join("; ")}`,
|
|
2869
|
-
`Total pages: ${pageCount}`
|
|
2870
|
-
].join("\n");
|
|
2871
|
-
const planResult = await withRetry(
|
|
2872
|
-
() => generateObject({
|
|
2873
|
-
prompt: buildPlanPrompt(templateHints),
|
|
2874
|
-
schema: ExtractionPlanSchema,
|
|
2875
|
-
maxTokens: 2048,
|
|
2876
|
-
providerOptions
|
|
2877
|
-
})
|
|
2878
|
-
);
|
|
2879
|
-
trackUsage(planResult.usage);
|
|
2880
|
-
const tasks = planResult.object.tasks;
|
|
2881
|
-
onProgress?.(`Dispatching ${tasks.length} extractors...`);
|
|
2882
|
-
const extractorResults = await Promise.all(
|
|
2883
|
-
tasks.map(
|
|
2884
|
-
(task) => limit(async () => {
|
|
2885
|
-
const ext = getExtractor(task.extractorName);
|
|
2886
|
-
if (!ext) {
|
|
2887
|
-
await log?.(`Unknown extractor: ${task.extractorName}, skipping`);
|
|
2888
|
-
return null;
|
|
2889
|
-
}
|
|
2890
|
-
onProgress?.(`Extracting ${task.extractorName} (pages ${task.startPage}-${task.endPage})...`);
|
|
2891
|
-
try {
|
|
2892
|
-
const result = await runExtractor({
|
|
2893
|
-
name: task.extractorName,
|
|
2894
|
-
prompt: ext.buildPrompt(),
|
|
2895
|
-
schema: ext.schema,
|
|
2896
|
-
pdfBase64,
|
|
2897
|
-
startPage: task.startPage,
|
|
2898
|
-
endPage: task.endPage,
|
|
2899
|
-
generateObject,
|
|
2900
|
-
convertPdfToImages,
|
|
2901
|
-
maxTokens: ext.maxTokens ?? 4096,
|
|
2902
|
-
providerOptions
|
|
2903
|
-
});
|
|
2904
|
-
trackUsage(result.usage);
|
|
2905
|
-
return result;
|
|
2906
|
-
} catch (error) {
|
|
2907
|
-
await log?.(`Extractor ${task.extractorName} failed: ${error}`);
|
|
2908
|
-
return null;
|
|
2909
|
-
}
|
|
2910
|
-
})
|
|
2911
|
-
)
|
|
2912
|
-
);
|
|
2913
|
-
for (const result of extractorResults) {
|
|
2914
|
-
if (result) {
|
|
2915
|
-
memory.set(result.name, result.data);
|
|
3136
|
+
const pipelineCtx = createPipelineContext({
|
|
3137
|
+
id,
|
|
3138
|
+
onSave: onCheckpointSave,
|
|
3139
|
+
resumeFrom: options?.resumeFrom
|
|
3140
|
+
});
|
|
3141
|
+
const resumed = pipelineCtx.getCheckpoint()?.state;
|
|
3142
|
+
if (resumed?.memory) {
|
|
3143
|
+
for (const [k, v] of Object.entries(resumed.memory)) {
|
|
3144
|
+
memory.set(k, v);
|
|
2916
3145
|
}
|
|
2917
3146
|
}
|
|
2918
|
-
|
|
2919
|
-
|
|
2920
|
-
|
|
2921
|
-
|
|
2922
|
-
|
|
2923
|
-
|
|
2924
|
-
|
|
3147
|
+
let classifyResult;
|
|
3148
|
+
if (resumed?.classifyResult && pipelineCtx.isPhaseComplete("classify")) {
|
|
3149
|
+
classifyResult = resumed.classifyResult;
|
|
3150
|
+
onProgress?.("Resuming from checkpoint (classify complete)...");
|
|
3151
|
+
} else {
|
|
3152
|
+
onProgress?.("Classifying document...");
|
|
3153
|
+
const pageCount2 = await getPdfPageCount(pdfBase64);
|
|
3154
|
+
const classifyResponse = await safeGenerateObject(
|
|
3155
|
+
generateObject,
|
|
3156
|
+
{
|
|
3157
|
+
prompt: buildClassifyPrompt(),
|
|
3158
|
+
schema: ClassifyResultSchema,
|
|
3159
|
+
maxTokens: 512,
|
|
2925
3160
|
providerOptions
|
|
2926
|
-
}
|
|
3161
|
+
},
|
|
3162
|
+
{
|
|
3163
|
+
fallback: { documentType: "policy", policyTypes: ["other"], confidence: 0 },
|
|
3164
|
+
log,
|
|
3165
|
+
onError: (err, attempt) => log?.(`Classify attempt ${attempt + 1} failed: ${err}`)
|
|
3166
|
+
}
|
|
2927
3167
|
);
|
|
2928
|
-
trackUsage(
|
|
2929
|
-
|
|
2930
|
-
|
|
2931
|
-
|
|
2932
|
-
|
|
2933
|
-
|
|
2934
|
-
|
|
2935
|
-
|
|
3168
|
+
trackUsage(classifyResponse.usage);
|
|
3169
|
+
classifyResult = classifyResponse.object;
|
|
3170
|
+
memory.set("classify", classifyResult);
|
|
3171
|
+
await pipelineCtx.save("classify", {
|
|
3172
|
+
id,
|
|
3173
|
+
pageCount: pageCount2,
|
|
3174
|
+
classifyResult,
|
|
3175
|
+
memory: Object.fromEntries(memory)
|
|
3176
|
+
});
|
|
3177
|
+
}
|
|
3178
|
+
const { documentType, policyTypes } = classifyResult;
|
|
3179
|
+
const primaryType = policyTypes[0] ?? "other";
|
|
3180
|
+
const template = getTemplate(primaryType);
|
|
3181
|
+
const pageCount = resumed?.pageCount ?? await getPdfPageCount(pdfBase64);
|
|
3182
|
+
let plan;
|
|
3183
|
+
if (resumed?.plan && pipelineCtx.isPhaseComplete("plan")) {
|
|
3184
|
+
plan = resumed.plan;
|
|
3185
|
+
onProgress?.("Resuming from checkpoint (plan complete)...");
|
|
3186
|
+
} else {
|
|
3187
|
+
onProgress?.(`Planning extraction for ${primaryType} ${documentType}...`);
|
|
3188
|
+
const templateHints = [
|
|
3189
|
+
`Document type: ${primaryType} ${documentType}`,
|
|
3190
|
+
`Expected sections: ${template.expectedSections.join(", ")}`,
|
|
3191
|
+
`Page hints: ${Object.entries(template.pageHints).map(([k, v]) => `${k}: ${v}`).join("; ")}`,
|
|
3192
|
+
`Total pages: ${pageCount}`
|
|
3193
|
+
].join("\n");
|
|
3194
|
+
const planResponse = await safeGenerateObject(
|
|
3195
|
+
generateObject,
|
|
3196
|
+
{
|
|
3197
|
+
prompt: buildPlanPrompt(templateHints),
|
|
3198
|
+
schema: ExtractionPlanSchema,
|
|
3199
|
+
maxTokens: 2048,
|
|
3200
|
+
providerOptions
|
|
3201
|
+
},
|
|
3202
|
+
{
|
|
3203
|
+
fallback: {
|
|
3204
|
+
tasks: [{ extractorName: "sections", startPage: 1, endPage: pageCount, description: "Full document fallback extraction" }]
|
|
3205
|
+
},
|
|
3206
|
+
log,
|
|
3207
|
+
onError: (err, attempt) => log?.(`Plan attempt ${attempt + 1} failed: ${err}`)
|
|
3208
|
+
}
|
|
3209
|
+
);
|
|
3210
|
+
trackUsage(planResponse.usage);
|
|
3211
|
+
plan = planResponse.object;
|
|
3212
|
+
await pipelineCtx.save("plan", {
|
|
3213
|
+
id,
|
|
3214
|
+
pageCount,
|
|
3215
|
+
classifyResult,
|
|
3216
|
+
plan,
|
|
3217
|
+
memory: Object.fromEntries(memory)
|
|
3218
|
+
});
|
|
3219
|
+
}
|
|
3220
|
+
if (!pipelineCtx.isPhaseComplete("extract")) {
|
|
3221
|
+
const tasks = plan.tasks;
|
|
3222
|
+
onProgress?.(`Dispatching ${tasks.length} extractors...`);
|
|
3223
|
+
const extractorResults = await Promise.all(
|
|
3224
|
+
tasks.map(
|
|
2936
3225
|
(task) => limit(async () => {
|
|
2937
3226
|
const ext = getExtractor(task.extractorName);
|
|
2938
|
-
if (!ext)
|
|
3227
|
+
if (!ext) {
|
|
3228
|
+
await log?.(`Unknown extractor: ${task.extractorName}, skipping`);
|
|
3229
|
+
return null;
|
|
3230
|
+
}
|
|
3231
|
+
onProgress?.(`Extracting ${task.extractorName} (pages ${task.startPage}-${task.endPage})...`);
|
|
2939
3232
|
try {
|
|
2940
3233
|
const result = await runExtractor({
|
|
2941
3234
|
name: task.extractorName,
|
|
@@ -2952,22 +3245,114 @@ function createExtractor(config) {
|
|
|
2952
3245
|
trackUsage(result.usage);
|
|
2953
3246
|
return result;
|
|
2954
3247
|
} catch (error) {
|
|
2955
|
-
await log?.(`
|
|
3248
|
+
await log?.(`Extractor ${task.extractorName} failed: ${error}`);
|
|
2956
3249
|
return null;
|
|
2957
3250
|
}
|
|
2958
3251
|
})
|
|
2959
3252
|
)
|
|
2960
3253
|
);
|
|
2961
|
-
for (const result of
|
|
3254
|
+
for (const result of extractorResults) {
|
|
2962
3255
|
if (result) {
|
|
2963
3256
|
memory.set(result.name, result.data);
|
|
2964
3257
|
}
|
|
2965
3258
|
}
|
|
3259
|
+
await pipelineCtx.save("extract", {
|
|
3260
|
+
id,
|
|
3261
|
+
pageCount,
|
|
3262
|
+
classifyResult,
|
|
3263
|
+
plan,
|
|
3264
|
+
memory: Object.fromEntries(memory)
|
|
3265
|
+
});
|
|
3266
|
+
}
|
|
3267
|
+
if (!pipelineCtx.isPhaseComplete("review")) {
|
|
3268
|
+
for (let round = 0; round < maxReviewRounds; round++) {
|
|
3269
|
+
const extractedKeys = [...memory.keys()].filter((k) => k !== "classify");
|
|
3270
|
+
const reviewResponse = await safeGenerateObject(
|
|
3271
|
+
generateObject,
|
|
3272
|
+
{
|
|
3273
|
+
prompt: buildReviewPrompt(template.required, extractedKeys),
|
|
3274
|
+
schema: ReviewResultSchema,
|
|
3275
|
+
maxTokens: 1024,
|
|
3276
|
+
providerOptions
|
|
3277
|
+
},
|
|
3278
|
+
{
|
|
3279
|
+
fallback: { complete: true, missingFields: [], additionalTasks: [] },
|
|
3280
|
+
log,
|
|
3281
|
+
onError: (err, attempt) => log?.(`Review round ${round + 1} attempt ${attempt + 1} failed: ${err}`)
|
|
3282
|
+
}
|
|
3283
|
+
);
|
|
3284
|
+
trackUsage(reviewResponse.usage);
|
|
3285
|
+
if (reviewResponse.object.complete || reviewResponse.object.additionalTasks.length === 0) {
|
|
3286
|
+
onProgress?.("Extraction complete.");
|
|
3287
|
+
break;
|
|
3288
|
+
}
|
|
3289
|
+
onProgress?.(`Review round ${round + 1}: dispatching ${reviewResponse.object.additionalTasks.length} follow-up extractors...`);
|
|
3290
|
+
const followUpResults = await Promise.all(
|
|
3291
|
+
reviewResponse.object.additionalTasks.map(
|
|
3292
|
+
(task) => limit(async () => {
|
|
3293
|
+
const ext = getExtractor(task.extractorName);
|
|
3294
|
+
if (!ext) return null;
|
|
3295
|
+
try {
|
|
3296
|
+
const result = await runExtractor({
|
|
3297
|
+
name: task.extractorName,
|
|
3298
|
+
prompt: ext.buildPrompt(),
|
|
3299
|
+
schema: ext.schema,
|
|
3300
|
+
pdfBase64,
|
|
3301
|
+
startPage: task.startPage,
|
|
3302
|
+
endPage: task.endPage,
|
|
3303
|
+
generateObject,
|
|
3304
|
+
convertPdfToImages,
|
|
3305
|
+
maxTokens: ext.maxTokens ?? 4096,
|
|
3306
|
+
providerOptions
|
|
3307
|
+
});
|
|
3308
|
+
trackUsage(result.usage);
|
|
3309
|
+
return result;
|
|
3310
|
+
} catch (error) {
|
|
3311
|
+
await log?.(`Follow-up extractor ${task.extractorName} failed: ${error}`);
|
|
3312
|
+
return null;
|
|
3313
|
+
}
|
|
3314
|
+
})
|
|
3315
|
+
)
|
|
3316
|
+
);
|
|
3317
|
+
for (const result of followUpResults) {
|
|
3318
|
+
if (result) {
|
|
3319
|
+
memory.set(result.name, result.data);
|
|
3320
|
+
}
|
|
3321
|
+
}
|
|
3322
|
+
}
|
|
3323
|
+
await pipelineCtx.save("review", {
|
|
3324
|
+
id,
|
|
3325
|
+
pageCount,
|
|
3326
|
+
classifyResult,
|
|
3327
|
+
plan,
|
|
3328
|
+
memory: Object.fromEntries(memory)
|
|
3329
|
+
});
|
|
2966
3330
|
}
|
|
2967
3331
|
onProgress?.("Assembling document...");
|
|
2968
3332
|
const document = assembleDocument(id, documentType, memory);
|
|
2969
|
-
|
|
2970
|
-
|
|
3333
|
+
await pipelineCtx.save("assemble", {
|
|
3334
|
+
id,
|
|
3335
|
+
pageCount,
|
|
3336
|
+
classifyResult,
|
|
3337
|
+
plan,
|
|
3338
|
+
memory: Object.fromEntries(memory),
|
|
3339
|
+
document
|
|
3340
|
+
});
|
|
3341
|
+
onProgress?.("Formatting extracted content...");
|
|
3342
|
+
const formatResult = await formatDocumentContent(document, generateText, {
|
|
3343
|
+
providerOptions,
|
|
3344
|
+
onProgress,
|
|
3345
|
+
log
|
|
3346
|
+
});
|
|
3347
|
+
trackUsage(formatResult.usage);
|
|
3348
|
+
const chunks = chunkDocument(formatResult.document);
|
|
3349
|
+
const finalCheckpoint = pipelineCtx.getCheckpoint();
|
|
3350
|
+
return {
|
|
3351
|
+
document: formatResult.document,
|
|
3352
|
+
chunks,
|
|
3353
|
+
tokenUsage: totalUsage,
|
|
3354
|
+
checkpoint: finalCheckpoint
|
|
3355
|
+
};
|
|
2971
3356
|
}
|
|
2972
3357
|
return { extract };
|
|
2973
3358
|
}
|
|
@@ -3830,7 +4215,6 @@ function createApplicationPipeline(config) {
|
|
|
3830
4215
|
let state = {
|
|
3831
4216
|
id,
|
|
3832
4217
|
pdfBase64: void 0,
|
|
3833
|
-
// Don't persist the full PDF in state
|
|
3834
4218
|
title: void 0,
|
|
3835
4219
|
applicationType: null,
|
|
3836
4220
|
fields: [],
|
|
@@ -3841,13 +4225,20 @@ function createApplicationPipeline(config) {
|
|
|
3841
4225
|
updatedAt: now
|
|
3842
4226
|
};
|
|
3843
4227
|
onProgress?.("Classifying document...");
|
|
3844
|
-
|
|
3845
|
-
|
|
3846
|
-
|
|
3847
|
-
|
|
3848
|
-
|
|
3849
|
-
|
|
3850
|
-
|
|
4228
|
+
await applicationStore?.save(state);
|
|
4229
|
+
let classifyResult;
|
|
4230
|
+
try {
|
|
4231
|
+
const { result, usage: classifyUsage } = await classifyApplication(
|
|
4232
|
+
pdfBase64.slice(0, 2e3),
|
|
4233
|
+
generateObject,
|
|
4234
|
+
providerOptions
|
|
4235
|
+
);
|
|
4236
|
+
trackUsage(classifyUsage);
|
|
4237
|
+
classifyResult = result;
|
|
4238
|
+
} catch (error) {
|
|
4239
|
+
await log?.(`Classification failed, treating as non-application: ${error instanceof Error ? error.message : String(error)}`);
|
|
4240
|
+
classifyResult = { isApplication: false, confidence: 0, applicationType: null };
|
|
4241
|
+
}
|
|
3851
4242
|
if (!classifyResult.isApplication) {
|
|
3852
4243
|
state.status = "complete";
|
|
3853
4244
|
state.updatedAt = Date.now();
|
|
@@ -3857,13 +4248,28 @@ function createApplicationPipeline(config) {
|
|
|
3857
4248
|
state.applicationType = classifyResult.applicationType;
|
|
3858
4249
|
state.status = "extracting";
|
|
3859
4250
|
state.updatedAt = Date.now();
|
|
4251
|
+
await applicationStore?.save(state);
|
|
3860
4252
|
onProgress?.("Extracting form fields...");
|
|
3861
|
-
|
|
3862
|
-
|
|
3863
|
-
|
|
3864
|
-
|
|
3865
|
-
|
|
3866
|
-
|
|
4253
|
+
let fields;
|
|
4254
|
+
try {
|
|
4255
|
+
const { fields: extractedFields, usage: extractUsage } = await extractFields(
|
|
4256
|
+
pdfBase64,
|
|
4257
|
+
generateObject,
|
|
4258
|
+
providerOptions
|
|
4259
|
+
);
|
|
4260
|
+
trackUsage(extractUsage);
|
|
4261
|
+
fields = extractedFields;
|
|
4262
|
+
} catch (error) {
|
|
4263
|
+
await log?.(`Field extraction failed: ${error instanceof Error ? error.message : String(error)}`);
|
|
4264
|
+
fields = [];
|
|
4265
|
+
}
|
|
4266
|
+
if (fields.length === 0) {
|
|
4267
|
+
await log?.("No fields extracted, completing pipeline with empty result");
|
|
4268
|
+
state.status = "complete";
|
|
4269
|
+
state.updatedAt = Date.now();
|
|
4270
|
+
await applicationStore?.save(state);
|
|
4271
|
+
return { state, tokenUsage: totalUsage };
|
|
4272
|
+
}
|
|
3867
4273
|
state.fields = fields;
|
|
3868
4274
|
state.title = classifyResult.applicationType ?? void 0;
|
|
3869
4275
|
state.status = "auto_filling";
|
|
@@ -3895,20 +4301,24 @@ function createApplicationPipeline(config) {
|
|
|
3895
4301
|
limit(async () => {
|
|
3896
4302
|
const unfilledFields2 = state.fields.filter((f) => !f.value);
|
|
3897
4303
|
if (unfilledFields2.length === 0) return;
|
|
3898
|
-
|
|
3899
|
-
|
|
3900
|
-
|
|
3901
|
-
|
|
3902
|
-
|
|
3903
|
-
|
|
3904
|
-
|
|
3905
|
-
|
|
3906
|
-
const
|
|
3907
|
-
|
|
3908
|
-
field
|
|
3909
|
-
|
|
3910
|
-
|
|
4304
|
+
try {
|
|
4305
|
+
const { result: autoFillResult, usage: afUsage } = await autoFillFromContext(
|
|
4306
|
+
unfilledFields2,
|
|
4307
|
+
orgContext,
|
|
4308
|
+
generateObject,
|
|
4309
|
+
providerOptions
|
|
4310
|
+
);
|
|
4311
|
+
trackUsage(afUsage);
|
|
4312
|
+
for (const match of autoFillResult.matches) {
|
|
4313
|
+
const field = state.fields.find((f) => f.id === match.fieldId);
|
|
4314
|
+
if (field && !field.value) {
|
|
4315
|
+
field.value = match.value;
|
|
4316
|
+
field.source = `auto-fill: ${match.contextKey}`;
|
|
4317
|
+
field.confidence = match.confidence;
|
|
4318
|
+
}
|
|
3911
4319
|
}
|
|
4320
|
+
} catch (e) {
|
|
4321
|
+
await log?.(`Auto-fill from context failed: ${e instanceof Error ? e.message : String(e)}`);
|
|
3912
4322
|
}
|
|
3913
4323
|
})
|
|
3914
4324
|
);
|
|
@@ -3941,13 +4351,18 @@ function createApplicationPipeline(config) {
|
|
|
3941
4351
|
if (unfilledFields.length > 0) {
|
|
3942
4352
|
onProgress?.(`Batching ${unfilledFields.length} remaining questions...`);
|
|
3943
4353
|
state.status = "batching";
|
|
3944
|
-
|
|
3945
|
-
|
|
3946
|
-
|
|
3947
|
-
|
|
3948
|
-
|
|
3949
|
-
|
|
3950
|
-
|
|
4354
|
+
try {
|
|
4355
|
+
const { result: batchResult, usage: batchUsage } = await batchQuestions(
|
|
4356
|
+
unfilledFields,
|
|
4357
|
+
generateObject,
|
|
4358
|
+
providerOptions
|
|
4359
|
+
);
|
|
4360
|
+
trackUsage(batchUsage);
|
|
4361
|
+
state.batches = batchResult.batches;
|
|
4362
|
+
} catch (error) {
|
|
4363
|
+
await log?.(`Batching failed, using single-batch fallback: ${error instanceof Error ? error.message : String(error)}`);
|
|
4364
|
+
state.batches = [unfilledFields.map((f) => f.id)];
|
|
4365
|
+
}
|
|
3951
4366
|
state.currentBatchIndex = 0;
|
|
3952
4367
|
state.status = "collecting";
|
|
3953
4368
|
} else {
|
|
@@ -3974,32 +4389,49 @@ function createApplicationPipeline(config) {
|
|
|
3974
4389
|
(f) => currentBatchFieldIds.includes(f.id)
|
|
3975
4390
|
);
|
|
3976
4391
|
onProgress?.("Classifying reply...");
|
|
3977
|
-
|
|
3978
|
-
|
|
3979
|
-
|
|
3980
|
-
generateObject,
|
|
3981
|
-
providerOptions
|
|
3982
|
-
);
|
|
3983
|
-
trackUsage(intentUsage);
|
|
3984
|
-
let fieldsFilled = 0;
|
|
3985
|
-
let responseText;
|
|
3986
|
-
if (intent.hasAnswers) {
|
|
3987
|
-
onProgress?.("Parsing answers...");
|
|
3988
|
-
const { result: parseResult, usage: parseUsage } = await parseAnswers(
|
|
4392
|
+
let intent;
|
|
4393
|
+
try {
|
|
4394
|
+
const { intent: classifiedIntent, usage: intentUsage } = await classifyReplyIntent(
|
|
3989
4395
|
currentBatchFields,
|
|
3990
4396
|
replyText,
|
|
3991
4397
|
generateObject,
|
|
3992
4398
|
providerOptions
|
|
3993
4399
|
);
|
|
3994
|
-
trackUsage(
|
|
3995
|
-
|
|
3996
|
-
|
|
3997
|
-
|
|
3998
|
-
|
|
3999
|
-
|
|
4000
|
-
|
|
4001
|
-
|
|
4400
|
+
trackUsage(intentUsage);
|
|
4401
|
+
intent = classifiedIntent;
|
|
4402
|
+
} catch (error) {
|
|
4403
|
+
await log?.(`Reply intent classification failed, defaulting to answers_only: ${error instanceof Error ? error.message : String(error)}`);
|
|
4404
|
+
intent = {
|
|
4405
|
+
primaryIntent: "answers_only",
|
|
4406
|
+
hasAnswers: true,
|
|
4407
|
+
questionText: void 0,
|
|
4408
|
+
questionFieldIds: void 0,
|
|
4409
|
+
lookupRequests: void 0
|
|
4410
|
+
};
|
|
4411
|
+
}
|
|
4412
|
+
let fieldsFilled = 0;
|
|
4413
|
+
let responseText;
|
|
4414
|
+
if (intent.hasAnswers) {
|
|
4415
|
+
onProgress?.("Parsing answers...");
|
|
4416
|
+
try {
|
|
4417
|
+
const { result: parseResult, usage: parseUsage } = await parseAnswers(
|
|
4418
|
+
currentBatchFields,
|
|
4419
|
+
replyText,
|
|
4420
|
+
generateObject,
|
|
4421
|
+
providerOptions
|
|
4422
|
+
);
|
|
4423
|
+
trackUsage(parseUsage);
|
|
4424
|
+
for (const answer of parseResult.answers) {
|
|
4425
|
+
const field = state.fields.find((f) => f.id === answer.fieldId);
|
|
4426
|
+
if (field) {
|
|
4427
|
+
field.value = answer.value;
|
|
4428
|
+
field.source = "user";
|
|
4429
|
+
field.confidence = "confirmed";
|
|
4430
|
+
fieldsFilled++;
|
|
4431
|
+
}
|
|
4002
4432
|
}
|
|
4433
|
+
} catch (error) {
|
|
4434
|
+
await log?.(`Answer parsing failed: ${error instanceof Error ? error.message : String(error)}`);
|
|
4003
4435
|
}
|
|
4004
4436
|
}
|
|
4005
4437
|
if (intent.lookupRequests?.length) {
|
|
@@ -4020,36 +4452,45 @@ function createApplicationPipeline(config) {
|
|
|
4020
4452
|
const targetFields = state.fields.filter(
|
|
4021
4453
|
(f) => intent.lookupRequests.some((lr) => lr.targetFieldIds.includes(f.id))
|
|
4022
4454
|
);
|
|
4023
|
-
|
|
4024
|
-
|
|
4025
|
-
|
|
4026
|
-
|
|
4027
|
-
|
|
4028
|
-
|
|
4029
|
-
|
|
4030
|
-
|
|
4031
|
-
|
|
4032
|
-
const
|
|
4033
|
-
|
|
4034
|
-
field
|
|
4035
|
-
|
|
4036
|
-
|
|
4037
|
-
|
|
4455
|
+
try {
|
|
4456
|
+
const { result: lookupResult, usage: lookupUsage } = await fillFromLookup(
|
|
4457
|
+
intent.lookupRequests,
|
|
4458
|
+
targetFields,
|
|
4459
|
+
availableData,
|
|
4460
|
+
generateObject,
|
|
4461
|
+
providerOptions
|
|
4462
|
+
);
|
|
4463
|
+
trackUsage(lookupUsage);
|
|
4464
|
+
for (const fill of lookupResult.fills) {
|
|
4465
|
+
const field = state.fields.find((f) => f.id === fill.fieldId);
|
|
4466
|
+
if (field) {
|
|
4467
|
+
field.value = fill.value;
|
|
4468
|
+
field.source = `lookup: ${fill.source}`;
|
|
4469
|
+
field.confidence = "high";
|
|
4470
|
+
fieldsFilled++;
|
|
4471
|
+
}
|
|
4038
4472
|
}
|
|
4473
|
+
} catch (error) {
|
|
4474
|
+
await log?.(`Lookup fill failed: ${error instanceof Error ? error.message : String(error)}`);
|
|
4039
4475
|
}
|
|
4040
4476
|
}
|
|
4041
4477
|
}
|
|
4042
4478
|
if (intent.primaryIntent === "question" || intent.primaryIntent === "mixed") {
|
|
4043
4479
|
if (intent.questionText) {
|
|
4044
|
-
|
|
4045
|
-
|
|
4480
|
+
try {
|
|
4481
|
+
const { text, usage } = await generateText({
|
|
4482
|
+
prompt: `The user is filling out an insurance application and asked: "${intent.questionText}"
|
|
4046
4483
|
|
|
4047
4484
|
Provide a brief, helpful explanation (2-3 sentences). End with "Just reply with the answer when you're ready and I'll fill it in."`,
|
|
4048
|
-
|
|
4049
|
-
|
|
4050
|
-
|
|
4051
|
-
|
|
4052
|
-
|
|
4485
|
+
maxTokens: 512,
|
|
4486
|
+
providerOptions
|
|
4487
|
+
});
|
|
4488
|
+
trackUsage(usage);
|
|
4489
|
+
responseText = text;
|
|
4490
|
+
} catch (error) {
|
|
4491
|
+
await log?.(`Question response generation failed: ${error instanceof Error ? error.message : String(error)}`);
|
|
4492
|
+
responseText = `I wasn't able to generate an explanation for your question. Could you rephrase it, or just provide the answer directly?`;
|
|
4493
|
+
}
|
|
4053
4494
|
}
|
|
4054
4495
|
}
|
|
4055
4496
|
const currentBatchComplete = currentBatchFieldIds.every(
|
|
@@ -4063,26 +4504,30 @@ Provide a brief, helpful explanation (2-3 sentences). End with "Just reply with
|
|
|
4063
4504
|
(f) => nextBatchFieldIds.includes(f.id)
|
|
4064
4505
|
);
|
|
4065
4506
|
const filledCount = state.fields.filter((f) => f.value).length;
|
|
4066
|
-
|
|
4067
|
-
|
|
4068
|
-
|
|
4069
|
-
|
|
4070
|
-
|
|
4071
|
-
|
|
4072
|
-
|
|
4073
|
-
|
|
4074
|
-
|
|
4075
|
-
|
|
4076
|
-
|
|
4077
|
-
|
|
4078
|
-
|
|
4079
|
-
|
|
4080
|
-
|
|
4081
|
-
responseText
|
|
4082
|
-
|
|
4083
|
-
|
|
4507
|
+
try {
|
|
4508
|
+
const { text: emailText, usage: emailUsage } = await generateBatchEmail(
|
|
4509
|
+
nextBatchFields,
|
|
4510
|
+
state.currentBatchIndex,
|
|
4511
|
+
state.batches.length,
|
|
4512
|
+
{
|
|
4513
|
+
appTitle: state.title,
|
|
4514
|
+
totalFieldCount: state.fields.length,
|
|
4515
|
+
filledFieldCount: filledCount,
|
|
4516
|
+
companyName: context?.companyName
|
|
4517
|
+
},
|
|
4518
|
+
generateText,
|
|
4519
|
+
providerOptions
|
|
4520
|
+
);
|
|
4521
|
+
trackUsage(emailUsage);
|
|
4522
|
+
if (!responseText) {
|
|
4523
|
+
responseText = emailText;
|
|
4524
|
+
} else {
|
|
4525
|
+
responseText += `
|
|
4084
4526
|
|
|
4085
4527
|
${emailText}`;
|
|
4528
|
+
}
|
|
4529
|
+
} catch (error) {
|
|
4530
|
+
await log?.(`Batch email generation failed: ${error instanceof Error ? error.message : String(error)}`);
|
|
4086
4531
|
}
|
|
4087
4532
|
} else {
|
|
4088
4533
|
state.status = "confirming";
|
|
@@ -4291,7 +4736,7 @@ var EvidenceItemSchema = z32.object({
|
|
|
4291
4736
|
turnId: z32.string().optional(),
|
|
4292
4737
|
text: z32.string().describe("Text excerpt from the source"),
|
|
4293
4738
|
relevance: z32.number().min(0).max(1),
|
|
4294
|
-
metadata: z32.
|
|
4739
|
+
metadata: z32.array(z32.object({ key: z32.string(), value: z32.string() })).optional()
|
|
4295
4740
|
});
|
|
4296
4741
|
var RetrievalResultSchema = z32.object({
|
|
4297
4742
|
subQuestion: z32.string(),
|
|
@@ -4327,6 +4772,9 @@ var QueryResultSchema = z32.object({
|
|
|
4327
4772
|
});
|
|
4328
4773
|
|
|
4329
4774
|
// src/query/retriever.ts
|
|
4775
|
+
/**
 * Converts a plain key/value record into a list of `{ key, value }` pairs.
 *
 * @param {Record<string, unknown> | null | undefined} record - Source record;
 *   may be absent.
 * @returns {{ key: string, value: string }[]} One pair per own enumerable
 *   property, in `Object.entries` order. Empty when `record` is null or
 *   undefined.
 */
function recordToKVArray(record) {
  // Object.entries throws a TypeError on null/undefined; a missing record
  // simply contributes no pairs.
  if (record == null) return [];
  return Object.entries(record).map(([key, value]) => ({
    key,
    // Strings pass through untouched. Nullish values become "" and anything
    // else is stringified so every emitted value is a string.
    value: typeof value === "string" ? value : String(value ?? ""),
  }));
}
|
|
4330
4778
|
async function retrieve(subQuestion, conversationId, config) {
|
|
4331
4779
|
const { documentStore, memoryStore, retrievalLimit, log } = config;
|
|
4332
4780
|
const evidence = [];
|
|
@@ -4353,7 +4801,7 @@ async function retrieve(subQuestion, conversationId, config) {
|
|
|
4353
4801
|
text: chunk.text,
|
|
4354
4802
|
relevance: 0.8,
|
|
4355
4803
|
// Default — store doesn't expose scores directly
|
|
4356
|
-
metadata: chunk.metadata
|
|
4804
|
+
metadata: recordToKVArray(chunk.metadata)
|
|
4357
4805
|
});
|
|
4358
4806
|
}
|
|
4359
4807
|
}
|
|
@@ -4368,7 +4816,7 @@ async function retrieve(subQuestion, conversationId, config) {
|
|
|
4368
4816
|
documentId: chunk.documentId,
|
|
4369
4817
|
text: chunk.text,
|
|
4370
4818
|
relevance: 0.8,
|
|
4371
|
-
metadata: chunk.metadata
|
|
4819
|
+
metadata: recordToKVArray(chunk.metadata)
|
|
4372
4820
|
});
|
|
4373
4821
|
}
|
|
4374
4822
|
}
|
|
@@ -4396,11 +4844,11 @@ async function retrieve(subQuestion, conversationId, config) {
|
|
|
4396
4844
|
text: summary,
|
|
4397
4845
|
relevance: 0.9,
|
|
4398
4846
|
// Direct lookup is high relevance
|
|
4399
|
-
metadata:
|
|
4400
|
-
type: doc.type,
|
|
4401
|
-
carrier: doc.carrier ?? "",
|
|
4402
|
-
insuredName: doc.insuredName ?? ""
|
|
4403
|
-
|
|
4847
|
+
metadata: [
|
|
4848
|
+
{ key: "type", value: doc.type },
|
|
4849
|
+
{ key: "carrier", value: doc.carrier ?? "" },
|
|
4850
|
+
{ key: "insuredName", value: doc.insuredName ?? "" }
|
|
4851
|
+
]
|
|
4404
4852
|
});
|
|
4405
4853
|
}
|
|
4406
4854
|
} catch (e) {
|
|
@@ -4635,8 +5083,12 @@ function createQueryAgent(config) {
|
|
|
4635
5083
|
async function query(input) {
|
|
4636
5084
|
totalUsage = { inputTokens: 0, outputTokens: 0 };
|
|
4637
5085
|
const { question, conversationId, context } = input;
|
|
5086
|
+
const pipelineCtx = createPipelineContext({
|
|
5087
|
+
id: `query-${Date.now()}`
|
|
5088
|
+
});
|
|
4638
5089
|
onProgress?.("Classifying query...");
|
|
4639
5090
|
const classification = await classify(question, conversationId);
|
|
5091
|
+
await pipelineCtx.save("classify", { classification });
|
|
4640
5092
|
onProgress?.(`Retrieving evidence for ${classification.subQuestions.length} sub-question(s)...`);
|
|
4641
5093
|
const retrieverConfig = {
|
|
4642
5094
|
documentStore,
|
|
@@ -4650,9 +5102,10 @@ function createQueryAgent(config) {
|
|
|
4650
5102
|
)
|
|
4651
5103
|
);
|
|
4652
5104
|
const allEvidence = retrievalResults.flatMap((r) => r.evidence);
|
|
5105
|
+
await pipelineCtx.save("retrieve", { classification, evidence: allEvidence });
|
|
4653
5106
|
onProgress?.("Reasoning over evidence...");
|
|
4654
5107
|
const reasonerConfig = { generateObject, providerOptions };
|
|
4655
|
-
|
|
5108
|
+
const reasonResults = await Promise.allSettled(
|
|
4656
5109
|
classification.subQuestions.map(
|
|
4657
5110
|
(sq, i) => limit(async () => {
|
|
4658
5111
|
const { subAnswer, usage } = await reason(
|
|
@@ -4666,10 +5119,27 @@ function createQueryAgent(config) {
|
|
|
4666
5119
|
})
|
|
4667
5120
|
)
|
|
4668
5121
|
);
|
|
5122
|
+
let subAnswers = [];
|
|
5123
|
+
for (let i = 0; i < reasonResults.length; i++) {
|
|
5124
|
+
const result = reasonResults[i];
|
|
5125
|
+
if (result.status === "fulfilled") {
|
|
5126
|
+
subAnswers.push(result.value);
|
|
5127
|
+
} else {
|
|
5128
|
+
await log?.(`Reasoner failed for sub-question "${classification.subQuestions[i].question}": ${result.reason}`);
|
|
5129
|
+
subAnswers.push({
|
|
5130
|
+
subQuestion: classification.subQuestions[i].question,
|
|
5131
|
+
answer: "Unable to answer this part of the question due to a processing error.",
|
|
5132
|
+
citations: [],
|
|
5133
|
+
confidence: 0,
|
|
5134
|
+
needsMoreContext: true
|
|
5135
|
+
});
|
|
5136
|
+
}
|
|
5137
|
+
}
|
|
5138
|
+
await pipelineCtx.save("reason", { classification, evidence: allEvidence, subAnswers });
|
|
4669
5139
|
onProgress?.("Verifying answer grounding...");
|
|
4670
5140
|
const verifierConfig = { generateObject, providerOptions };
|
|
4671
5141
|
for (let round = 0; round < maxVerifyRounds; round++) {
|
|
4672
|
-
const { result: verifyResult, usage } = await
|
|
5142
|
+
const { result: verifyResult, usage } = await safeVerify(
|
|
4673
5143
|
question,
|
|
4674
5144
|
subAnswers,
|
|
4675
5145
|
allEvidence,
|
|
@@ -4693,7 +5163,6 @@ function createQueryAgent(config) {
|
|
|
4693
5163
|
() => retrieve(sq, conversationId, {
|
|
4694
5164
|
...retrieverConfig,
|
|
4695
5165
|
retrievalLimit: retrievalLimit * 2
|
|
4696
|
-
// Broader retrieval on retry
|
|
4697
5166
|
})
|
|
4698
5167
|
)
|
|
4699
5168
|
)
|
|
@@ -4701,7 +5170,7 @@ function createQueryAgent(config) {
|
|
|
4701
5170
|
for (const r of retryRetrievals) {
|
|
4702
5171
|
allEvidence.push(...r.evidence);
|
|
4703
5172
|
}
|
|
4704
|
-
const
|
|
5173
|
+
const retrySettled = await Promise.allSettled(
|
|
4705
5174
|
retryQuestions.map(
|
|
4706
5175
|
(sq, i) => limit(async () => {
|
|
4707
5176
|
const { subAnswer, usage: u } = await reason(
|
|
@@ -4715,6 +5184,7 @@ function createQueryAgent(config) {
|
|
|
4715
5184
|
})
|
|
4716
5185
|
)
|
|
4717
5186
|
);
|
|
5187
|
+
const retrySubAnswers = retrySettled.filter((r) => r.status === "fulfilled").map((r) => r.value);
|
|
4718
5188
|
const retryQSet = new Set(retryQuestions.map((sq) => sq.question));
|
|
4719
5189
|
subAnswers = subAnswers.map((sa) => {
|
|
4720
5190
|
if (retryQSet.has(sa.subQuestion)) {
|
|
@@ -4767,17 +5237,42 @@ function createQueryAgent(config) {
|
|
|
4767
5237
|
}
|
|
4768
5238
|
}
|
|
4769
5239
|
const prompt = buildQueryClassifyPrompt(question, conversationContext);
|
|
4770
|
-
const { object, usage } = await
|
|
4771
|
-
|
|
5240
|
+
const { object, usage } = await safeGenerateObject(
|
|
5241
|
+
generateObject,
|
|
5242
|
+
{
|
|
4772
5243
|
prompt,
|
|
4773
5244
|
schema: QueryClassifyResultSchema,
|
|
4774
5245
|
maxTokens: 2048,
|
|
4775
5246
|
providerOptions
|
|
4776
|
-
}
|
|
5247
|
+
},
|
|
5248
|
+
{
|
|
5249
|
+
fallback: {
|
|
5250
|
+
intent: "general_knowledge",
|
|
5251
|
+
subQuestions: [
|
|
5252
|
+
{
|
|
5253
|
+
question,
|
|
5254
|
+
intent: "general_knowledge"
|
|
5255
|
+
}
|
|
5256
|
+
],
|
|
5257
|
+
requiresDocumentLookup: true,
|
|
5258
|
+
requiresChunkSearch: true,
|
|
5259
|
+
requiresConversationHistory: !!conversationId
|
|
5260
|
+
},
|
|
5261
|
+
log,
|
|
5262
|
+
onError: (err, attempt) => log?.(`Query classify attempt ${attempt + 1} failed: ${err}`)
|
|
5263
|
+
}
|
|
4777
5264
|
);
|
|
4778
5265
|
trackUsage(usage);
|
|
4779
5266
|
return object;
|
|
4780
5267
|
}
|
|
5268
|
+
/**
 * Calls `verify` with the given arguments and never rejects because of a
 * verifier failure: if `verify` throws, the failure is logged and an
 * approving result with no issues is returned instead.
 *
 * @param originalQuestion - Forwarded unchanged to `verify`.
 * @param subAnswers - Forwarded unchanged to `verify`.
 * @param allEvidence - Forwarded unchanged to `verify`.
 * @param verifierConfig - Forwarded unchanged to `verify`.
 * @returns Whatever `verify` resolves to, or `{ result: { approved: true, issues: [] } }`
 *   on failure. NOTE(review): the fallback object carries no `usage` field —
 *   confirm callers tolerate `usage` being undefined.
 */
async function safeVerify(originalQuestion, subAnswers, allEvidence, verifierConfig) {
  let outcome;
  try {
    outcome = await verify(originalQuestion, subAnswers, allEvidence, verifierConfig);
  } catch (error) {
    // Fail open: a broken verifier is logged and treated as approval.
    await log?.(`Verification failed, approving by default: ${error instanceof Error ? error.message : String(error)}`);
    outcome = { result: { approved: true, issues: [] } };
  }
  return outcome;
}
|
|
4781
5276
|
async function respond(originalQuestion, subAnswers, classification, platform) {
|
|
4782
5277
|
const subAnswersJson = JSON.stringify(
|
|
4783
5278
|
subAnswers.map((sa) => ({
|
|
@@ -4791,13 +5286,25 @@ function createQueryAgent(config) {
|
|
|
4791
5286
|
2
|
|
4792
5287
|
);
|
|
4793
5288
|
const prompt = buildRespondPrompt(originalQuestion, subAnswersJson, platform);
|
|
4794
|
-
const { object, usage } = await
|
|
4795
|
-
|
|
5289
|
+
const { object, usage } = await safeGenerateObject(
|
|
5290
|
+
generateObject,
|
|
5291
|
+
{
|
|
4796
5292
|
prompt,
|
|
4797
5293
|
schema: QueryResultSchema,
|
|
4798
5294
|
maxTokens: 4096,
|
|
4799
5295
|
providerOptions
|
|
4800
|
-
}
|
|
5296
|
+
},
|
|
5297
|
+
{
|
|
5298
|
+
fallback: {
|
|
5299
|
+
answer: subAnswers.map((sa) => `**${sa.subQuestion}**
|
|
5300
|
+
${sa.answer}`).join("\n\n"),
|
|
5301
|
+
citations: subAnswers.flatMap((sa) => sa.citations),
|
|
5302
|
+
intent: classification.intent,
|
|
5303
|
+
confidence: Math.min(...subAnswers.map((sa) => sa.confidence), 1)
|
|
5304
|
+
},
|
|
5305
|
+
log,
|
|
5306
|
+
onError: (err, attempt) => log?.(`Respond attempt ${attempt + 1} failed: ${err}`)
|
|
5307
|
+
}
|
|
4801
5308
|
);
|
|
4802
5309
|
trackUsage(usage);
|
|
4803
5310
|
const result = object;
|
|
@@ -4962,6 +5469,7 @@ export {
|
|
|
4962
5469
|
CommercialAutoDeclarationsSchema,
|
|
4963
5470
|
CommercialPropertyDeclarationsSchema,
|
|
4964
5471
|
CommunicationIntentSchema,
|
|
5472
|
+
ConditionKeyValueSchema,
|
|
4965
5473
|
ConditionTypeSchema,
|
|
4966
5474
|
ConstructionTypeSchema,
|
|
4967
5475
|
ContactSchema,
|
|
@@ -5128,6 +5636,7 @@ export {
|
|
|
5128
5636
|
chunkDocument,
|
|
5129
5637
|
createApplicationPipeline,
|
|
5130
5638
|
createExtractor,
|
|
5639
|
+
createPipelineContext,
|
|
5131
5640
|
createQueryAgent,
|
|
5132
5641
|
extractPageRange,
|
|
5133
5642
|
fillAcroForm,
|
|
@@ -5137,6 +5646,7 @@ export {
|
|
|
5137
5646
|
getTemplate,
|
|
5138
5647
|
overlayTextOnPdf,
|
|
5139
5648
|
pLimit,
|
|
5649
|
+
safeGenerateObject,
|
|
5140
5650
|
sanitizeNulls,
|
|
5141
5651
|
stripFences,
|
|
5142
5652
|
withRetry
|