@claritylabs/cl-sdk 0.2.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +491 -193
- package/dist/index.d.mts +28735 -1157
- package/dist/index.d.ts +28735 -1157
- package/dist/index.js +4806 -2320
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +4636 -2298
- package/dist/index.mjs.map +1 -1
- package/dist/storage-sqlite.d.mts +10805 -0
- package/dist/storage-sqlite.d.ts +10805 -0
- package/dist/storage-sqlite.js +238 -0
- package/dist/storage-sqlite.js.map +1 -0
- package/dist/storage-sqlite.mjs +218 -0
- package/dist/storage-sqlite.mjs.map +1 -0
- package/package.json +16 -7
package/README.md
CHANGED
|
@@ -1,6 +1,8 @@
|
|
|
1
|
-
|
|
1
|
+
# CL-SDK
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
[Clarity Labs](https://claritylabs.inc) helps insurers understand their clients as well as those clients know themselves. A better understanding of clients means insurers can automate servicing to reduce costs and identify coverage gaps to cross-sell products.
|
|
4
|
+
|
|
5
|
+
CL-SDK is the open infrastructure layer that makes this possible — a pure TypeScript library for extracting, reasoning about, and acting on insurance documents. Provider-agnostic by design: bring any LLM, any embedding model, any storage backend.
|
|
4
6
|
|
|
5
7
|
## Installation
|
|
6
8
|
|
|
@@ -10,276 +12,568 @@ npm install @claritylabs/cl-sdk
|
|
|
10
12
|
|
|
11
13
|
### Peer Dependencies
|
|
12
14
|
|
|
13
|
-
|
|
15
|
+
```bash
|
|
16
|
+
npm install pdf-lib zod
|
|
17
|
+
```
|
|
14
18
|
|
|
19
|
+
Optional (for SQLite storage):
|
|
15
20
|
```bash
|
|
16
|
-
npm install
|
|
21
|
+
npm install better-sqlite3
|
|
17
22
|
```
|
|
18
23
|
|
|
19
|
-
|
|
24
|
+
## Quick Start
|
|
25
|
+
|
|
26
|
+
### Document Extraction
|
|
20
27
|
|
|
21
|
-
|
|
22
|
-
# Anthropic
|
|
23
|
-
npm install @ai-sdk/anthropic
|
|
28
|
+
CL-SDK extracts structured data from insurance PDFs using a multi-agent pipeline. You provide two callback functions — `generateText` and `generateObject` — and the SDK handles the rest:
|
|
24
29
|
|
|
25
|
-
|
|
26
|
-
|
|
30
|
+
```typescript
|
|
31
|
+
import { createExtractor } from "@claritylabs/cl-sdk";
|
|
27
32
|
|
|
28
|
-
|
|
29
|
-
|
|
33
|
+
const extractor = createExtractor({
|
|
34
|
+
generateText: async ({ prompt, system, maxTokens }) => {
|
|
35
|
+
// Wrap your preferred LLM provider
|
|
36
|
+
const result = await yourProvider.generate({ prompt, system, maxTokens });
|
|
37
|
+
return { text: result.text, usage: result.usage };
|
|
38
|
+
},
|
|
39
|
+
generateObject: async ({ prompt, system, schema, maxTokens }) => {
|
|
40
|
+
// schema is a Zod schema — use it for structured output
|
|
41
|
+
const result = await yourProvider.generateStructured({ prompt, system, schema, maxTokens });
|
|
42
|
+
return { object: result.object, usage: result.usage };
|
|
43
|
+
},
|
|
44
|
+
});
|
|
45
|
+
|
|
46
|
+
const pdfBase64 = "..."; // base64-encoded insurance PDF
|
|
47
|
+
const result = await extractor.extract(pdfBase64);
|
|
48
|
+
console.log(result.document); // Typed InsuranceDocument (policy or quote)
|
|
49
|
+
console.log(result.chunks); // DocumentChunk[] ready for vector storage
|
|
30
50
|
```
|
|
31
51
|
|
|
32
|
-
|
|
52
|
+
### With PDF-to-Image Conversion
|
|
53
|
+
|
|
54
|
+
For providers that don't support native PDF input (e.g., OpenAI):
|
|
55
|
+
|
|
56
|
+
```typescript
|
|
57
|
+
const extractor = createExtractor({
|
|
58
|
+
generateText: /* ... */,
|
|
59
|
+
generateObject: /* ... */,
|
|
60
|
+
convertPdfToImages: async (pdfBase64, startPage, endPage) => {
|
|
61
|
+
// Convert PDF pages to images using your preferred library
|
|
62
|
+
return [{ imageBase64: "...", mimeType: "image/png" }]; // one per page
|
|
63
|
+
},
|
|
64
|
+
});
|
|
65
|
+
```
|
|
33
66
|
|
|
34
|
-
|
|
67
|
+
## Architecture
|
|
68
|
+
|
|
69
|
+
### Provider-Agnostic Callbacks
|
|
70
|
+
|
|
71
|
+
CL-SDK has **zero framework dependencies**. All LLM interaction happens through two callback types:
|
|
35
72
|
|
|
36
73
|
```typescript
|
|
37
|
-
|
|
38
|
-
|
|
74
|
+
type GenerateText = (params: {
|
|
75
|
+
prompt: string;
|
|
76
|
+
system?: string;
|
|
77
|
+
maxTokens: number;
|
|
78
|
+
providerOptions?: Record<string, unknown>;
|
|
79
|
+
}) => Promise<{ text: string; usage?: { inputTokens: number; outputTokens: number } }>;
|
|
80
|
+
|
|
81
|
+
type GenerateObject<T> = (params: {
|
|
82
|
+
prompt: string;
|
|
83
|
+
system?: string;
|
|
84
|
+
schema: ZodSchema<T>;
|
|
85
|
+
maxTokens: number;
|
|
86
|
+
providerOptions?: Record<string, unknown>;
|
|
87
|
+
}) => Promise<{ object: T; usage?: { inputTokens: number; outputTokens: number } }>;
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
Works with any provider: Anthropic, OpenAI, Google, Mistral, Bedrock, Azure, Ollama, etc. You write the adapter once; the SDK calls it throughout the pipeline.
|
|
91
|
+
|
|
92
|
+
### Extraction Pipeline
|
|
93
|
+
|
|
94
|
+
The extraction system uses a **coordinator/worker pattern** — a coordinator agent plans the work, specialized extractor agents execute in parallel, and a review loop ensures completeness.
|
|
95
|
+
|
|
96
|
+
```
|
|
97
|
+
┌─────────────┐ ┌─────────────┐ ┌──────────────────────┐
|
|
98
|
+
│ 1. CLASSIFY │────▶│ 2. PLAN │────▶│ 3. EXTRACT (parallel)│
|
|
99
|
+
│ │ │ │ │ │
|
|
100
|
+
│ Document │ │ Select │ │ Run focused │
|
|
101
|
+
│ type, line │ │ template, │ │ extractors against │
|
|
102
|
+
│ of business │ │ assign │ │ assigned page │
|
|
103
|
+
│ │ │ extractors │ │ ranges │
|
|
104
|
+
│ │ │ to pages │ │ │
|
|
105
|
+
└─────────────┘ └─────────────┘ └──────────┬───────────┘
|
|
106
|
+
│
|
|
107
|
+
┌─────────────┐ ┌──────────▼───────────┐
|
|
108
|
+
│ 5. ASSEMBLE │◀────│ 4. REVIEW │
|
|
109
|
+
│ │ │ │
|
|
110
|
+
│ Merge all │ │ Check completeness │
|
|
111
|
+
│ results, │ │ against template, │
|
|
112
|
+
│ validate, │ │ dispatch follow-up │
|
|
113
|
+
│ chunk │ │ extractors for gaps │
|
|
114
|
+
└─────────────┘ └──────────────────────┘
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
#### Phase 1: Classify
|
|
118
|
+
|
|
119
|
+
The coordinator sends the document to `generateObject` with the `ClassifyResultSchema`. It determines:
|
|
120
|
+
- **Document type** — policy or quote
|
|
121
|
+
- **Policy types** — one or more lines of business (e.g., `general_liability`, `workers_comp`)
|
|
122
|
+
- **Confidence score**
|
|
39
123
|
|
|
40
|
-
|
|
41
|
-
const pdfBase64 = "..."; // base64-encoded PDF
|
|
124
|
+
#### Phase 2: Plan
|
|
42
125
|
|
|
43
|
-
|
|
44
|
-
|
|
126
|
+
Based on the classification, the coordinator selects a **line-of-business template** (e.g., `workers_comp`, `cyber`, `homeowners_ho3`) that defines expected sections and page hints. It then generates an **extraction plan** — a list of tasks that map specific extractors to page ranges within the PDF.
|
|
127
|
+
|
|
128
|
+
#### Phase 3: Extract
|
|
129
|
+
|
|
130
|
+
Focused extractor agents are dispatched **in parallel** (concurrency-limited, default 2). Each extractor targets a specific data domain against its assigned page range. The 11 extractor types are:
|
|
131
|
+
|
|
132
|
+
| Extractor | What It Extracts |
|
|
133
|
+
|-----------|-----------------|
|
|
134
|
+
| `carrier_info` | Carrier name, NAIC, AM Best rating, MGA, underwriter, broker |
|
|
135
|
+
| `named_insured` | Insured name, DBA, address, entity type, FEIN, SIC/NAICS |
|
|
136
|
+
| `declarations` | Line-specific structured declarations (varies by policy type) |
|
|
137
|
+
| `coverage_limits` | Coverage names, limits, deductibles, forms, triggers |
|
|
138
|
+
| `endorsements` | Form numbers, titles, types, content, affected parties |
|
|
139
|
+
| `exclusions` | Exclusion titles, content, applicability |
|
|
140
|
+
| `conditions` | Duties after loss, cancellation, other insurance, etc. |
|
|
141
|
+
| `premium_breakdown` | Premium amounts, taxes, fees, payment plans, rating basis |
|
|
142
|
+
| `loss_history` | Loss runs, claim records, experience modification |
|
|
143
|
+
| `supplementary` | Regulatory context, contacts, TPA, claims contacts |
|
|
144
|
+
| `sections` | Raw section content (fallback for unmatched sections) |
|
|
145
|
+
|
|
146
|
+
Each extractor writes its results to an in-memory `Map`. Results accumulate across all extractors.
|
|
147
|
+
|
|
148
|
+
#### Phase 4: Review
|
|
149
|
+
|
|
150
|
+
After initial extraction, a review loop (up to `maxReviewRounds`, default 2) checks completeness against the template's expected sections. If gaps are found, additional extractor tasks are dispatched to fill missing data. This iterative refinement ensures comprehensive extraction.
|
|
151
|
+
|
|
152
|
+
#### Phase 5: Assemble
|
|
153
|
+
|
|
154
|
+
All extractor results are merged into a final validated `InsuranceDocument`, then chunked into `DocumentChunk[]` for vector storage. Each chunk is assigned a deterministic ID of the form `${documentId}:${type}:${index}`.
|
|
155
|
+
|
|
156
|
+
### Configuration
|
|
157
|
+
|
|
158
|
+
```typescript
|
|
159
|
+
const extractor = createExtractor({
|
|
160
|
+
// Required: LLM callbacks
|
|
161
|
+
generateText,
|
|
162
|
+
generateObject,
|
|
163
|
+
|
|
164
|
+
// Optional: PDF vision mode
|
|
165
|
+
convertPdfToImages: async (pdfBase64, startPage, endPage) => [...],
|
|
166
|
+
|
|
167
|
+
// Optional: storage backends
|
|
168
|
+
documentStore, // Persist extracted documents
|
|
169
|
+
memoryStore, // Vector search over chunks + conversation history
|
|
170
|
+
|
|
171
|
+
// Optional: tuning
|
|
172
|
+
concurrency: 2, // Max parallel extractors (default: 2)
|
|
173
|
+
maxReviewRounds: 2, // Review loop iterations (default: 2)
|
|
174
|
+
|
|
175
|
+
// Optional: observability
|
|
176
|
+
onTokenUsage: (usage) => console.log(`${usage.inputTokens} in, ${usage.outputTokens} out`),
|
|
177
|
+
onProgress: (message) => console.log(message),
|
|
178
|
+
log: async (message) => logger.info(message),
|
|
179
|
+
providerOptions: {}, // Passed through to every LLM call
|
|
45
180
|
});
|
|
46
|
-
const fields = applyExtracted(extracted);
|
|
47
181
|
```
|
|
48
182
|
|
|
49
|
-
###
|
|
183
|
+
### Line-of-Business Templates
|
|
184
|
+
|
|
185
|
+
Templates define what the extraction pipeline expects for each policy type. Each template specifies expected sections, page hints, and required vs. optional fields.
|
|
186
|
+
|
|
187
|
+
**Personal lines:** homeowners (HO-3, HO-5), renters (HO-4), condo (HO-6), dwelling fire, personal auto, personal umbrella, personal inland marine, flood (NFIP + private), earthquake, watercraft, recreational vehicle, farm/ranch, mobile home
|
|
188
|
+
|
|
189
|
+
**Commercial lines:** general liability, commercial property, commercial auto, workers' comp, umbrella/excess, professional liability, cyber, directors & officers, crime/fidelity
|
|
190
|
+
|
|
191
|
+
## Storage
|
|
192
|
+
|
|
193
|
+
CL-SDK defines two storage interfaces (`DocumentStore` and `MemoryStore`) and ships a reference SQLite implementation. You can implement these interfaces with any backend.
|
|
194
|
+
|
|
195
|
+
### DocumentStore
|
|
196
|
+
|
|
197
|
+
CRUD for extracted `InsuranceDocument` objects:
|
|
198
|
+
|
|
199
|
+
```typescript
|
|
200
|
+
interface DocumentStore {
|
|
201
|
+
save(doc: InsuranceDocument): Promise<void>;
|
|
202
|
+
get(id: string): Promise<InsuranceDocument | null>;
|
|
203
|
+
query(filters: DocumentFilters): Promise<InsuranceDocument[]>;
|
|
204
|
+
delete(id: string): Promise<void>;
|
|
205
|
+
}
|
|
206
|
+
```
|
|
207
|
+
|
|
208
|
+
Filters support: `type` (policy/quote), `carrier` (fuzzy), `insuredName` (fuzzy), `policyNumber` (exact), `quoteNumber` (exact).
|
|
50
209
|
|
|
51
|
-
|
|
210
|
+
### MemoryStore
|
|
211
|
+
|
|
212
|
+
Vector-searchable storage for document chunks and conversation history. Requires an `EmbedText` callback for generating embeddings:
|
|
52
213
|
|
|
53
214
|
```typescript
|
|
54
|
-
|
|
55
|
-
import { extractFromPdf, createUniformModelConfig } from "@claritylabs/cl-sdk";
|
|
215
|
+
type EmbedText = (text: string) => Promise<number[]>;
|
|
56
216
|
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
})
|
|
217
|
+
interface MemoryStore {
|
|
218
|
+
// Document chunks with embeddings
|
|
219
|
+
addChunks(chunks: DocumentChunk[]): Promise<void>;
|
|
220
|
+
search(query: string, options?: { limit?: number; filter?: ChunkFilter }): Promise<DocumentChunk[]>;
|
|
221
|
+
|
|
222
|
+
// Conversation turns with embeddings
|
|
223
|
+
addTurn(turn: ConversationTurn): Promise<void>;
|
|
224
|
+
getHistory(conversationId: string, options?: { limit?: number }): Promise<ConversationTurn[]>;
|
|
225
|
+
searchHistory(query: string, conversationId?: string): Promise<ConversationTurn[]>;
|
|
226
|
+
}
|
|
61
227
|
```
|
|
62
228
|
|
|
63
|
-
|
|
229
|
+
Search uses **cosine similarity** over embeddings to find semantically relevant chunks or conversation turns. Embedding failures are non-fatal — chunks are still stored, just not searchable by vector.
|
|
230
|
+
|
|
231
|
+
### SQLite Reference Implementation
|
|
64
232
|
|
|
65
233
|
```typescript
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
234
|
+
import { createSqliteStore } from "@claritylabs/cl-sdk/storage/sqlite";
|
|
235
|
+
|
|
236
|
+
const store = createSqliteStore({
|
|
237
|
+
path: "./cl-sdk.db",
|
|
238
|
+
embed: async (text) => {
|
|
239
|
+
// Your embedding function (OpenAI, Cohere, local model, etc.)
|
|
240
|
+
return await yourEmbeddingProvider.embed(text);
|
|
72
241
|
},
|
|
73
242
|
});
|
|
74
|
-
```
|
|
75
243
|
|
|
76
|
-
|
|
244
|
+
// Use with extractor
|
|
245
|
+
const extractor = createExtractor({
|
|
246
|
+
generateText,
|
|
247
|
+
generateObject,
|
|
248
|
+
documentStore: store.documents,
|
|
249
|
+
memoryStore: store.memory,
|
|
250
|
+
});
|
|
77
251
|
|
|
78
|
-
|
|
252
|
+
// Or use standalone
|
|
253
|
+
await store.documents.save(document);
|
|
254
|
+
const results = await store.memory.search("what is the deductible?", { limit: 5 });
|
|
79
255
|
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
import { extractFromPdf, type ModelConfig } from "@claritylabs/cl-sdk";
|
|
83
|
-
|
|
84
|
-
const anthropic = createAnthropic();
|
|
85
|
-
const models: ModelConfig = {
|
|
86
|
-
classification: anthropic("claude-haiku-4-5-20251001"), // fast, cheap
|
|
87
|
-
metadata: anthropic("claude-sonnet-4-6"), // capable
|
|
88
|
-
sections: anthropic("claude-haiku-4-5-20251001"), // fast, cheap
|
|
89
|
-
sectionsFallback: anthropic("claude-sonnet-4-6"), // capable (fallback)
|
|
90
|
-
enrichment: anthropic("claude-haiku-4-5-20251001"), // fast, cheap
|
|
91
|
-
};
|
|
92
|
-
|
|
93
|
-
const { extracted } = await extractFromPdf(pdfBase64, { models });
|
|
256
|
+
// Clean up
|
|
257
|
+
store.close();
|
|
94
258
|
```
|
|
95
259
|
|
|
96
|
-
|
|
260
|
+
## Agent System
|
|
261
|
+
|
|
262
|
+
CL-SDK includes a composable prompt system for building insurance-aware AI agents. The `buildAgentSystemPrompt` function assembles modular prompt segments based on the agent's context:
|
|
97
263
|
|
|
98
264
|
```typescript
|
|
99
|
-
import {
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
classification: openai("gpt-4o-mini"),
|
|
108
|
-
metadata: anthropic("claude-sonnet-4-6"),
|
|
109
|
-
sections: openai("gpt-4o-mini"),
|
|
110
|
-
sectionsFallback: anthropic("claude-sonnet-4-6"),
|
|
111
|
-
enrichment: openai("gpt-4o-mini"),
|
|
112
|
-
};
|
|
113
|
-
|
|
114
|
-
const { extracted } = await extractFromPdf(pdfBase64, { models });
|
|
265
|
+
import { buildAgentSystemPrompt } from "@claritylabs/cl-sdk";
|
|
266
|
+
|
|
267
|
+
const systemPrompt = buildAgentSystemPrompt({
|
|
268
|
+
platform: "email", // email | chat | sms | slack | discord
|
|
269
|
+
intent: "direct", // direct | mediated | observed
|
|
270
|
+
userName: "John",
|
|
271
|
+
companyName: "Acme Insurance",
|
|
272
|
+
});
|
|
115
273
|
```
|
|
116
274
|
|
|
117
|
-
|
|
275
|
+
### Prompt Modules
|
|
118
276
|
|
|
119
|
-
|
|
277
|
+
The system prompt is composed from these modules:
|
|
120
278
|
|
|
121
|
-
|
|
279
|
+
| Module | Purpose |
|
|
280
|
+
|--------|---------|
|
|
281
|
+
| **identity** | Agent role, company context, professional persona |
|
|
282
|
+
| **intent** | Behavioral rules based on platform and interaction mode |
|
|
283
|
+
| **formatting** | Output formatting rules (markdown for chat, plaintext for email/SMS) |
|
|
284
|
+
| **safety** | Security guardrails, prompt injection resistance, data handling |
|
|
285
|
+
| **coverage-gaps** | Coverage gap disclosure rules (only in mediated/observed mode) |
|
|
286
|
+
| **coi-routing** | Certificate of Insurance request handling |
|
|
287
|
+
| **quotes-policies** | Guidance for distinguishing quotes vs. active policies |
|
|
288
|
+
| **conversation-memory** | Context about conversation history and document retrieval |
|
|
122
289
|
|
|
123
|
-
|
|
124
|
-
- **Pass 1 — Metadata Extraction**: Extracts high-level metadata — carrier, policy/quote number, dates, premium, insured name, coverage table with limits and deductibles. Includes an early persistence callback (`onMetadata`) so metadata is saved immediately, surviving downstream failures.
|
|
125
|
-
- **Pass 2 — Section Extraction**: Splits the document into page chunks (starting at 15 pages) and extracts structured sections in parallel (concurrency-limited, default 2). All model calls automatically retry on rate-limit errors with exponential backoff. Adaptive fallback: if a chunk's output is truncated (JSON parse failure), it re-splits into smaller chunks (10, then 5 pages), and escalates to the fallback model. Results are merged across chunks.
|
|
126
|
-
- **Pass 3 — Enrichment**: A non-fatal pass that parses raw text into structured supplementary fields — regulatory context, complaint contacts, costs and fees, claims contacts.
|
|
290
|
+
### Message Intent Classification
|
|
127
291
|
|
|
128
|
-
|
|
292
|
+
Classify incoming messages to route them appropriately:
|
|
129
293
|
|
|
130
|
-
|
|
294
|
+
```typescript
|
|
295
|
+
import { buildClassifyMessagePrompt } from "@claritylabs/cl-sdk";
|
|
131
296
|
|
|
132
|
-
|
|
297
|
+
const prompt = buildClassifyMessagePrompt("email");
|
|
298
|
+
// Returns classification prompt for intents:
|
|
299
|
+
// policy_question, coi_request, renewal_inquiry, claim_report,
|
|
300
|
+
// coverage_shopping, general, unrelated
|
|
301
|
+
```
|
|
133
302
|
|
|
134
|
-
|
|
135
|
-
- **Field extraction** — reads every fillable field as structured data (text, numeric, currency, date, yes/no, table, and declaration fields)
|
|
136
|
-
- **Auto-fill** — matches extracted fields against known business context to pre-populate answers
|
|
137
|
-
- **Question batching** — organizes unfilled fields into topic-based batches for emailing the insured
|
|
138
|
-
- **Answer parsing** — parses free-text replies back into structured field values
|
|
139
|
-
- **PDF filling** — maps answers back onto the original PDF, supporting both AcroForm and flat PDF overlay
|
|
303
|
+
## Application Processing Pipeline
|
|
140
304
|
|
|
141
|
-
|
|
305
|
+
The application pipeline processes insurance applications through an agentic coordinator/worker system — small focused agents handle classification, field extraction, auto-fill, question batching, reply routing, and PDF mapping. The pipeline also supports persistent state and vector-based answer backfill from prior applications.
|
|
142
306
|
|
|
143
|
-
|
|
307
|
+
### Quick Start
|
|
144
308
|
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
309
|
+
```typescript
|
|
310
|
+
import { createApplicationPipeline } from "@claritylabs/cl-sdk";
|
|
311
|
+
|
|
312
|
+
const pipeline = createApplicationPipeline({
|
|
313
|
+
generateText,
|
|
314
|
+
generateObject,
|
|
315
|
+
applicationStore, // persistent state storage
|
|
316
|
+
documentStore, // for policy/quote lookups during auto-fill
|
|
317
|
+
memoryStore, // for vector-based answer backfill
|
|
318
|
+
orgContext: [ // business context for auto-fill
|
|
319
|
+
{ key: "company_name", value: "Acme Corp", category: "company_info" },
|
|
320
|
+
{ key: "company_address", value: "123 Main St", category: "company_info" },
|
|
321
|
+
],
|
|
322
|
+
});
|
|
151
323
|
|
|
152
|
-
|
|
324
|
+
// Process a new application PDF
|
|
325
|
+
const { state } = await pipeline.processApplication({
|
|
326
|
+
pdfBase64: "...",
|
|
327
|
+
applicationId: "app-123",
|
|
328
|
+
});
|
|
329
|
+
// state.fields → extracted fields, some already auto-filled
|
|
330
|
+
// state.batches → question batches ready for user collection
|
|
153
331
|
|
|
154
|
-
|
|
332
|
+
// Generate email for current batch
|
|
333
|
+
const { text: emailBody } = await pipeline.generateCurrentBatchEmail("app-123", {
|
|
334
|
+
companyName: "Acme Corp",
|
|
335
|
+
});
|
|
155
336
|
|
|
156
|
-
|
|
337
|
+
// Process user's reply
|
|
338
|
+
const { state: updated, fieldsFilled, responseText } = await pipeline.processReply({
|
|
339
|
+
applicationId: "app-123",
|
|
340
|
+
replyText: "1. Yes\n2. $1,000,000\n3. Check our website for revenue",
|
|
341
|
+
});
|
|
342
|
+
```
|
|
157
343
|
|
|
158
|
-
|
|
344
|
+
### Pipeline Phases
|
|
159
345
|
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
346
|
+
```
|
|
347
|
+
┌──────────────┐ ┌──────────────┐ ┌─────────────────────┐
|
|
348
|
+
│ 1. CLASSIFY │────>│ 2. EXTRACT │────>│ 3. BACKFILL + │
|
|
349
|
+
│ │ │ FIELDS │ │ AUTO-FILL │
|
|
350
|
+
│ Is this an │ │ │ │ (parallel) │
|
|
351
|
+
│ application? │ │ All fillable │ │ │
|
|
352
|
+
│ │ │ fields as │ │ • vector backfill │
|
|
353
|
+
│ │ │ structured │ │ • context auto-fill │
|
|
354
|
+
│ │ │ data │ │ • document search │
|
|
355
|
+
└──────────────┘ └──────────────┘ └──────────┬──────────┘
|
|
356
|
+
│
|
|
357
|
+
┌──────────────┐ ┌──────────v──────────┐
|
|
358
|
+
│ REPLY LOOP │<────│ 4. BATCH QUESTIONS │
|
|
359
|
+
│ │ │ │
|
|
360
|
+
│ Route intent │ │ Group unfilled │
|
|
361
|
+
│ Parse answers│ │ fields by topic │
|
|
362
|
+
│ Handle lookup│ │ Generate emails │
|
|
363
|
+
│ Explain field│ │ │
|
|
364
|
+
└──────┬───────┘ └─────────────────────┘
|
|
365
|
+
│
|
|
366
|
+
┌──────v───────┐
|
|
367
|
+
│ 5. CONFIRM + │
|
|
368
|
+
│ MAP PDF │
|
|
369
|
+
└──────────────┘
|
|
370
|
+
```
|
|
163
371
|
|
|
164
|
-
|
|
372
|
+
### Focused Agents (8 types)
|
|
165
373
|
|
|
166
|
-
|
|
374
|
+
| Agent | Task | Model Size |
|
|
375
|
+
|-------|------|-----------|
|
|
376
|
+
| `classifier` | Detect if PDF is an application | Tiny |
|
|
377
|
+
| `field-extractor` | Extract all form fields | Medium |
|
|
378
|
+
| `auto-filler` | Match fields to business context | Small |
|
|
379
|
+
| `batcher` | Group fields into topic batches | Small |
|
|
380
|
+
| `reply-router` | Classify reply intent | Tiny |
|
|
381
|
+
| `answer-parser` | Extract answers from replies | Small |
|
|
382
|
+
| `lookup-filler` | Fill from policy/record lookups | Small |
|
|
383
|
+
| `email-generator` | Generate professional batch emails | Small |
|
|
167
384
|
|
|
168
|
-
|
|
169
|
-
|----------|-------------|
|
|
170
|
-
| `classifyDocumentType(pdf, options)` | Classify document as policy or quote |
|
|
171
|
-
| `extractFromPdf(pdf, options)` | Full policy extraction (passes 1-3) |
|
|
172
|
-
| `extractQuoteFromPdf(pdf, options)` | Full quote extraction (passes 1-2) |
|
|
173
|
-
| `extractSectionsOnly(pdf, metadata, options)` | Retry pass 2 using saved metadata |
|
|
174
|
-
| `applyExtracted(extracted)` | Map extraction JSON to persistence fields |
|
|
175
|
-
| `applyExtractedQuote(extracted)` | Map quote extraction JSON to persistence fields |
|
|
385
|
+
### Vector-Based Answer Backfill
|
|
176
386
|
|
|
177
|
-
|
|
387
|
+
The `BackfillProvider` interface enables searching prior application answers and extracted document data to pre-fill new applications:
|
|
178
388
|
|
|
179
389
|
```typescript
|
|
180
|
-
interface
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
fallbackProviderOptions?: ProviderOptions;
|
|
186
|
-
concurrency?: number; // parallel chunk limit (default: 2)
|
|
187
|
-
tokenLimits?: TokenLimits; // override default maxTokens per role
|
|
188
|
-
onTokenUsage?: (usage: TokenUsage) => void;
|
|
189
|
-
pdfContentFormat?: "file" | "image"; // default: "file"
|
|
190
|
-
convertPdfToImages?: ConvertPdfToImagesFn; // required when pdfContentFormat is "image"
|
|
390
|
+
interface BackfillProvider {
|
|
391
|
+
searchPriorAnswers(
|
|
392
|
+
fields: { id: string; label: string; section: string; fieldType: string }[],
|
|
393
|
+
options?: { limit?: number },
|
|
394
|
+
): Promise<PriorAnswer[]>;
|
|
191
395
|
}
|
|
396
|
+
```
|
|
192
397
|
|
|
193
|
-
|
|
194
|
-
log?: LogFn;
|
|
195
|
-
promptBuilder?: PromptBuilder;
|
|
196
|
-
models: ModelConfig; // required — bring your own models
|
|
197
|
-
fallbackProviderOptions?: ProviderOptions;
|
|
198
|
-
concurrency?: number; // parallel chunk limit (default: 2)
|
|
199
|
-
tokenLimits?: TokenLimits; // override default maxTokens per role
|
|
200
|
-
onTokenUsage?: (usage: TokenUsage) => void;
|
|
201
|
-
pdfContentFormat?: "file" | "image"; // default: "file"
|
|
202
|
-
convertPdfToImages?: ConvertPdfToImagesFn; // required when pdfContentFormat is "image"
|
|
203
|
-
}
|
|
398
|
+
Vector backfill runs in parallel with context-based auto-fill, so the pipeline fills as many fields as possible before asking the user anything.
|
|
204
399
|
|
|
205
|
-
|
|
206
|
-
log?: LogFn;
|
|
207
|
-
models: ModelConfig; // required — bring your own models
|
|
208
|
-
tokenLimits?: TokenLimits; // override default maxTokens per role
|
|
209
|
-
onTokenUsage?: (usage: TokenUsage) => void;
|
|
210
|
-
pdfContentFormat?: "file" | "image"; // default: "file"
|
|
211
|
-
convertPdfToImages?: ConvertPdfToImagesFn; // required when pdfContentFormat is "image"
|
|
212
|
-
}
|
|
400
|
+
### Application Prompts (for advanced use)
|
|
213
401
|
|
|
214
|
-
|
|
215
|
-
interface TokenLimits {
|
|
216
|
-
classification?: number; // default: 512
|
|
217
|
-
metadata?: number; // default: 16384
|
|
218
|
-
sections?: number; // default: 8192
|
|
219
|
-
sectionsFallback?: number; // default: 16384
|
|
220
|
-
enrichment?: number; // default: 4096
|
|
221
|
-
}
|
|
402
|
+
The individual prompt functions are still exported for custom pipelines:
|
|
222
403
|
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
404
|
+
```typescript
|
|
405
|
+
import {
|
|
406
|
+
buildFieldExtractionPrompt,
|
|
407
|
+
buildAutoFillPrompt,
|
|
408
|
+
buildQuestionBatchPrompt,
|
|
409
|
+
buildAnswerParsingPrompt,
|
|
410
|
+
buildConfirmationSummaryPrompt,
|
|
411
|
+
buildBatchEmailGenerationPrompt,
|
|
412
|
+
buildReplyIntentClassificationPrompt,
|
|
413
|
+
buildFieldExplanationPrompt,
|
|
414
|
+
buildFlatPdfMappingPrompt,
|
|
415
|
+
buildAcroFormMappingPrompt,
|
|
416
|
+
buildLookupFillPrompt,
|
|
417
|
+
} from "@claritylabs/cl-sdk";
|
|
418
|
+
```
|
|
419
|
+
|
|
420
|
+
## Query Agent Pipeline
|
|
421
|
+
|
|
422
|
+
The query agent answers user questions against stored documents with citation-backed provenance. It mirrors the extraction pipeline's coordinator/worker pattern: a classifier decomposes questions, retrievers pull evidence in parallel, reasoners answer from evidence only, and a verifier checks grounding.
|
|
423
|
+
|
|
424
|
+
### Quick Start
|
|
425
|
+
|
|
426
|
+
```typescript
|
|
427
|
+
import { createQueryAgent } from "@claritylabs/cl-sdk";
|
|
428
|
+
|
|
429
|
+
const agent = createQueryAgent({
|
|
430
|
+
generateText,
|
|
431
|
+
generateObject,
|
|
432
|
+
documentStore, // where extracted documents are stored
|
|
433
|
+
memoryStore, // where document chunks + conversation history live
|
|
434
|
+
});
|
|
227
435
|
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
436
|
+
const result = await agent.query({
|
|
437
|
+
question: "What is the deductible on our GL policy?",
|
|
438
|
+
conversationId: "conv-123",
|
|
439
|
+
});
|
|
440
|
+
|
|
441
|
+
console.log(result.answer); // Natural language answer
|
|
442
|
+
console.log(result.citations); // Source references with exact quotes
|
|
443
|
+
console.log(result.confidence); // 0-1 confidence score
|
|
233
444
|
```
|
|
234
445
|
|
|
235
|
-
###
|
|
446
|
+
### Pipeline Phases
|
|
447
|
+
|
|
448
|
+
```
|
|
449
|
+
┌──────────────┐ ┌──────────────┐ ┌────────────────────┐
|
|
450
|
+
│ 1. CLASSIFY │────>│ 2. RETRIEVE │────>│ 3. REASON │
|
|
451
|
+
│ │ │ (parallel) │ │ (parallel) │
|
|
452
|
+
│ Intent + │ │ │ │ │
|
|
453
|
+
│ sub-question │ │ chunk search │ │ Answer each sub-Q │
|
|
454
|
+
│ decomposition│ │ doc lookup │ │ from evidence only │
|
|
455
|
+
│ │ │ conv history │ │ │
|
|
456
|
+
└──────────────┘ └──────────────┘ └─────────┬──────────┘
|
|
457
|
+
│
|
|
458
|
+
┌──────────────┐ ┌─────────v──────────┐
|
|
459
|
+
│ 5. RESPOND │<────│ 4. VERIFY │
|
|
460
|
+
│ │ │ │
|
|
461
|
+
│ Format with │ │ Grounding check │
|
|
462
|
+
│ citations, │ │ Consistency check │
|
|
463
|
+
│ store turn │ │ Completeness check │
|
|
464
|
+
└──────────────┘ └────────────────────┘
|
|
465
|
+
```
|
|
236
466
|
|
|
237
|
-
|
|
238
|
-
|--------|-------------|-------------|
|
|
239
|
-
| `file` (default) | Send PDF as a native file. Most efficient — no conversion needed. | Most models (Anthropic, Google, OpenAI, Mistral, Bedrock, Azure) |
|
|
240
|
-
| `image` | Convert PDF pages to images via `convertPdfToImages` callback. | Models that don't support native PDF file input |
|
|
467
|
+
**Phase 1 — Classify:** Determines intent (`policy_question`, `coverage_comparison`, `document_search`, `claims_inquiry`, `general_knowledge`) and decomposes complex questions into atomic sub-questions. Each sub-question specifies which chunk types and document filters to use for retrieval.
|
|
241
468
|
|
|
242
|
-
|
|
469
|
+
**Phase 2 — Retrieve (parallel):** For each sub-question, a retriever searches chunk embeddings, does structured document lookups, and pulls conversation history — all in parallel. Each retriever returns ranked evidence items.
|
|
243
470
|
|
|
244
|
-
|
|
471
|
+
**Phase 3 — Reason (parallel):** For each sub-question, a reasoner receives only the retrieved evidence (never the full document) and produces a sub-answer with citations. Intent-specific prompts guide reasoning (e.g., coverage questions get prompts tuned for interpreting limits and endorsements).
|
|
245
472
|
|
|
246
|
-
|
|
473
|
+
**Phase 4 — Verify:** The verifier checks that every claim is grounded in a citation, sub-answers don't contradict each other, and no evidence was overlooked. If issues are found, it can trigger re-retrieval with broader context.
|
|
247
474
|
|
|
248
|
-
|
|
475
|
+
**Phase 5 — Respond:** Merges verified sub-answers into a single natural-language response with inline citations (`[1]`, `[2]`), deduplicates references, and stores the exchange as conversation turns.
|
|
249
476
|
|
|
250
|
-
|
|
477
|
+
### Configuration
|
|
251
478
|
|
|
252
479
|
```typescript
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
480
|
+
const agent = createQueryAgent({
|
|
481
|
+
// Required
|
|
482
|
+
generateText,
|
|
483
|
+
generateObject,
|
|
484
|
+
documentStore,
|
|
485
|
+
memoryStore,
|
|
486
|
+
|
|
487
|
+
// Optional: tuning
|
|
488
|
+
concurrency: 3, // max parallel retrievers/reasoners (default: 3)
|
|
489
|
+
maxVerifyRounds: 1, // verification loop iterations (default: 1)
|
|
490
|
+
retrievalLimit: 10, // max evidence items per sub-question (default: 10)
|
|
491
|
+
|
|
492
|
+
// Optional: observability
|
|
493
|
+
onTokenUsage: (usage) => console.log(`${usage.inputTokens} in, ${usage.outputTokens} out`),
|
|
494
|
+
onProgress: (message) => console.log(message),
|
|
495
|
+
log: async (message) => logger.info(message),
|
|
496
|
+
providerOptions: {},
|
|
263
497
|
});
|
|
498
|
+
```
|
|
499
|
+
|
|
500
|
+
### Citations
|
|
501
|
+
|
|
502
|
+
Every factual claim in the answer references its source:
|
|
503
|
+
|
|
504
|
+
```typescript
|
|
505
|
+
interface Citation {
|
|
506
|
+
index: number; // [1], [2], etc.
|
|
507
|
+
chunkId: string; // e.g. "doc-123:coverage:2"
|
|
508
|
+
documentId: string;
|
|
509
|
+
documentType?: "policy" | "quote";
|
|
510
|
+
field?: string; // e.g. "coverages[0].deductible"
|
|
511
|
+
quote: string; // exact text from source
|
|
512
|
+
relevance: number; // 0-1 similarity score
|
|
513
|
+
}
|
|
514
|
+
```
|
|
515
|
+
|
|
516
|
+
## PDF Operations
|
|
517
|
+
|
|
518
|
+
```typescript
|
|
519
|
+
import {
|
|
520
|
+
extractPageRange, // Extract specific pages from a PDF
|
|
521
|
+
getPdfPageCount, // Get total page count
|
|
522
|
+
getAcroFormFields, // Enumerate form fields (text, checkbox, dropdown, radio)
|
|
523
|
+
fillAcroForm, // Fill and flatten AcroForm fields
|
|
524
|
+
overlayTextOnPdf, // Overlay text at coordinates on flat PDFs
|
|
525
|
+
} from "@claritylabs/cl-sdk";
|
|
526
|
+
```
|
|
527
|
+
|
|
528
|
+
## Tool Definitions
|
|
529
|
+
|
|
530
|
+
Claude `tool_use`-compatible schemas for agent integrations:
|
|
264
531
|
|
|
265
|
-
|
|
532
|
+
```typescript
|
|
533
|
+
import {
|
|
534
|
+
AGENT_TOOLS, // All tools as an array
|
|
535
|
+
DOCUMENT_LOOKUP_TOOL, // Search/retrieve policies and quotes
|
|
536
|
+
COI_GENERATION_TOOL, // Generate Certificates of Insurance
|
|
537
|
+
COVERAGE_COMPARISON_TOOL, // Compare coverages across documents
|
|
538
|
+
} from "@claritylabs/cl-sdk";
|
|
266
539
|
```
|
|
267
540
|
|
|
268
|
-
|
|
541
|
+
These are schema-only definitions (input schemas + descriptions). You provide the implementations that call your storage and PDF layers.
|
|
542
|
+
|
|
543
|
+
## Document Types
|
|
269
544
|
|
|
270
|
-
|
|
271
|
-
|----------|-------------|
|
|
272
|
-
| `buildAgentSystemPrompt(ctx)` | Full system prompt from `AgentContext` |
|
|
273
|
-
| `buildDocumentContext(docs, query)` | Ranked document context for a query |
|
|
274
|
-
| `buildClassifyMessagePrompt(platform)` | Intent classification prompt |
|
|
545
|
+
All types are derived from Zod schemas, providing both runtime validation and TypeScript types:
|
|
275
546
|
|
|
276
|
-
|
|
547
|
+
```typescript
|
|
548
|
+
import type {
|
|
549
|
+
InsuranceDocument, // PolicyDocument | QuoteDocument (discriminated union)
|
|
550
|
+
PolicyDocument, // Extracted policy with all enrichments
|
|
551
|
+
QuoteDocument, // Extracted quote with subjectivities, premium breakdown
|
|
552
|
+
Coverage, // Coverage name, limits, deductibles, form
|
|
553
|
+
EnrichedCoverage, // Coverage + additional metadata
|
|
554
|
+
Endorsement, // Form number, title, type, content
|
|
555
|
+
Exclusion, // Title, content, applicability
|
|
556
|
+
Condition, // Type, title, content
|
|
557
|
+
Declaration, // Line-specific declarations (19 types)
|
|
558
|
+
Platform, // email | chat | sms | slack | discord
|
|
559
|
+
AgentContext, // Platform + intent + user/company context
|
|
560
|
+
} from "@claritylabs/cl-sdk";
|
|
561
|
+
```
|
|
277
562
|
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
563
|
+
### Supported Policy Types
|
|
564
|
+
|
|
565
|
+
42 policy types across personal and commercial lines — including general liability, commercial property, workers' comp, cyber, D&O, homeowners (HO-3/HO-5/HO-4/HO-6), personal auto, flood (NFIP + private), earthquake, and more.
|
|
566
|
+
|
|
567
|
+
## Core Utilities
|
|
568
|
+
|
|
569
|
+
```typescript
|
|
570
|
+
import {
|
|
571
|
+
withRetry, // Exponential backoff with jitter (5 retries, 2–32s) for rate limits
|
|
572
|
+
pLimit, // Concurrency limiter for parallel async tasks
|
|
573
|
+
sanitizeNulls, // Recursively convert null → undefined (for database compatibility)
|
|
574
|
+
stripFences, // Remove markdown code fences from LLM JSON responses
|
|
575
|
+
} from "@claritylabs/cl-sdk";
|
|
576
|
+
```
|
|
283
577
|
|
|
284
578
|
## Development
|
|
285
579
|
|
|
@@ -290,4 +584,8 @@ npm run dev # Watch mode
|
|
|
290
584
|
npm run typecheck # Type check (tsc --noEmit)
|
|
291
585
|
```
|
|
292
586
|
|
|
293
|
-
|
|
587
|
+
Zero framework dependencies. Peer deps: `pdf-lib`, `zod`. Optional: `better-sqlite3`.
|
|
588
|
+
|
|
589
|
+
## License
|
|
590
|
+
|
|
591
|
+
Apache-2.0
|