@claritylabs/cl-sdk 0.8.0 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +73 -39
- package/dist/index.d.mts +24 -0
- package/dist/index.d.ts +24 -0
- package/dist/index.js +382 -72
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +382 -72
- package/dist/index.mjs.map +1 -1
- package/package.json +2 -2
package/README.md
CHANGED
|
@@ -31,14 +31,22 @@ CL-SDK extracts structured data from insurance PDFs using a multi-agent pipeline
|
|
|
31
31
|
import { createExtractor } from "@claritylabs/cl-sdk";
|
|
32
32
|
|
|
33
33
|
const extractor = createExtractor({
|
|
34
|
-
generateText: async ({ prompt, system, maxTokens }) => {
|
|
34
|
+
generateText: async ({ prompt, system, maxTokens, providerOptions }) => {
|
|
35
35
|
// Wrap your preferred LLM provider
|
|
36
|
-
const result = await yourProvider.generate({ prompt, system, maxTokens });
|
|
36
|
+
const result = await yourProvider.generate({ prompt, system, maxTokens, providerOptions });
|
|
37
37
|
return { text: result.text, usage: result.usage };
|
|
38
38
|
},
|
|
39
|
-
generateObject: async ({ prompt, system, schema, maxTokens }) => {
|
|
39
|
+
generateObject: async ({ prompt, system, schema, maxTokens, providerOptions }) => {
|
|
40
40
|
// schema is a Zod schema — use it for structured output
|
|
41
|
-
|
|
41
|
+
// IMPORTANT: pass providerOptions.pdfBase64 and/or providerOptions.images
|
|
42
|
+
// through to your model as file/image message parts.
|
|
43
|
+
const result = await yourProvider.generateStructured({
|
|
44
|
+
prompt,
|
|
45
|
+
system,
|
|
46
|
+
schema,
|
|
47
|
+
maxTokens,
|
|
48
|
+
providerOptions,
|
|
49
|
+
});
|
|
42
50
|
return { object: result.object, usage: result.usage };
|
|
43
51
|
},
|
|
44
52
|
});
|
|
@@ -47,6 +55,8 @@ const pdfBase64 = "..."; // base64-encoded insurance PDF
|
|
|
47
55
|
const result = await extractor.extract(pdfBase64);
|
|
48
56
|
console.log(result.document); // Typed InsuranceDocument (policy or quote)
|
|
49
57
|
console.log(result.chunks); // DocumentChunk[] ready for vector storage
|
|
58
|
+
console.log(result.tokenUsage); // Aggregate input/output tokens when available
|
|
59
|
+
console.log(result.usageReporting); // How many model calls did or did not report usage
|
|
50
60
|
```
|
|
51
61
|
|
|
52
62
|
### With PDF-to-Image Conversion
|
|
@@ -87,40 +97,48 @@ type GenerateObject<T> = (params: {
|
|
|
87
97
|
}) => Promise<{ object: T; usage?: { inputTokens: number; outputTokens: number } }>;
|
|
88
98
|
```
|
|
89
99
|
|
|
100
|
+
For extraction calls, `providerOptions` can carry document content:
|
|
101
|
+
|
|
102
|
+
- `providerOptions.pdfBase64` — the PDF to send as a file part
|
|
103
|
+
- `providerOptions.images` — page images to send as image parts
|
|
104
|
+
|
|
105
|
+
The coordinator passes the full PDF to classify and plan. Worker extractors pass a page-scoped PDF produced by `extractPageRange()` unless `convertPdfToImages` is enabled, in which case they pass page images instead. Your callback must include that content in the actual model request; the prompt text alone is not sufficient.
|
|
106
|
+
|
|
90
107
|
Works with any provider: Anthropic, OpenAI, Google, Mistral, Bedrock, Azure, Ollama, etc. You write the adapter once; the SDK calls it throughout the pipeline.
|
|
91
108
|
|
|
92
109
|
> **Strict structured output compatibility:** The SDK automatically transforms Zod schemas before passing them to `generateObject` — converting `.optional()` fields to `.nullable()` so all properties appear in the JSON Schema `required` array. This ensures compatibility with providers like OpenAI that enforce strict structured output validation. No adapter changes needed on your end.
|
|
93
110
|
|
|
94
111
|
### Extraction Pipeline
|
|
95
112
|
|
|
96
|
-
The extraction system uses a **coordinator/worker pattern**
|
|
113
|
+
The extraction system uses a **coordinator/worker pattern** with page-aware planning, merged worker outputs, and a document-grounded review loop.
|
|
97
114
|
|
|
98
115
|
```
|
|
99
|
-
┌─────────────┐ ┌─────────────┐
|
|
100
|
-
│
|
|
101
|
-
│
|
|
102
|
-
│
|
|
103
|
-
│
|
|
104
|
-
│
|
|
105
|
-
|
|
106
|
-
│
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
│
|
|
111
|
-
|
|
112
|
-
│
|
|
113
|
-
|
|
114
|
-
│
|
|
115
|
-
│
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
│
|
|
120
|
-
│
|
|
121
|
-
│
|
|
122
|
-
│
|
|
123
|
-
│
|
|
116
|
+
┌─────────────┐ ┌──────────────┐ ┌─────────────┐
|
|
117
|
+
│ 1. CLASSIFY │────▶│ 2. PAGE MAP │────▶│ 3. PLAN │
|
|
118
|
+
│ │ │ │ │ │
|
|
119
|
+
│ Document │ │ Assign pages │ │ Build tasks │
|
|
120
|
+
│ type, line │ │ to focused │ │ from page │
|
|
121
|
+
│ of business │ │ extractors │ │ assignments │
|
|
122
|
+
└─────────────┘ └──────┬───────┘ └──────┬──────┘
|
|
123
|
+
│ │
|
|
124
|
+
┌──────▼────────────────────▼──────┐
|
|
125
|
+
│ 4. EXTRACT + MERGE (parallel) │
|
|
126
|
+
│ Run focused extractors, merge │
|
|
127
|
+
│ repeat runs instead of overwrite │
|
|
128
|
+
└──────────────┬───────────────────┘
|
|
129
|
+
│
|
|
130
|
+
┌─────────────┐ ┌─────────────▼───────────┐ ┌─────────────┐
|
|
131
|
+
│ 7. FORMAT │◀────│ 6. ASSEMBLE │◀────│ 5. REVIEW │
|
|
132
|
+
│ │ │ │ │ │
|
|
133
|
+
│ Clean up │ │ Merge all data into │ │ Check │
|
|
134
|
+
│ markdown │ │ final document │ │ completeness│
|
|
135
|
+
└──────┬──────┘ └─────────────────────────┘ │ and quality │
|
|
136
|
+
│ └──────┬──────┘
|
|
137
|
+
┌──────▼──────┐ │
|
|
138
|
+
│ 8. CHUNK │◀───────────────────────────────────────────────┘
|
|
139
|
+
│ Break into │
|
|
140
|
+
│ retrieval- │
|
|
141
|
+
│ ready chunks│
|
|
124
142
|
└─────────────┘
|
|
125
143
|
```
|
|
126
144
|
|
|
@@ -131,11 +149,21 @@ The coordinator sends the document to `generateObject` with the `ClassifyResultS
|
|
|
131
149
|
- **Policy types** — one or more lines of business (e.g., `general_liability`, `workers_comp`)
|
|
132
150
|
- **Confidence score**
|
|
133
151
|
|
|
134
|
-
|
|
152
|
+
The full document is passed through `providerOptions.pdfBase64` for this step, so your callback must attach that PDF to the model request as a real document/file part.
|
|
153
|
+
|
|
154
|
+
#### Phase 2: Page Map
|
|
155
|
+
|
|
156
|
+
Before planning tasks, the coordinator maps each page to one or more focused extractors. This reduces the chance that declaration pages and schedule pages get mixed into large generic ranges dominated by form language.
|
|
157
|
+
|
|
158
|
+
The page-mapping step uses the relevant PDF pages through `providerOptions.pdfBase64`, chunk by chunk, and produces page-level assignments for the downstream plan.
|
|
135
159
|
|
|
136
|
-
|
|
160
|
+
#### Phase 3: Plan
|
|
137
161
|
|
|
138
|
-
|
|
162
|
+
Based on the classification, page map, and **line-of-business template** (e.g., `workers_comp`, `cyber`, `homeowners_ho3`), the coordinator builds an **extraction plan** — a list of focused extractor tasks derived from those page assignments.
|
|
163
|
+
|
|
164
|
+
The old prompt-only `plan.ts` module is a candidate for deprecation and is no longer the active planning path used by the coordinator.
|
|
165
|
+
|
|
166
|
+
#### Phase 4: Extract And Merge
|
|
139
167
|
|
|
140
168
|
Focused extractor agents are dispatched **in parallel** (concurrency-limited, default 2). Each extractor targets a specific data domain against its assigned page range. The 11 extractor types are:
|
|
141
169
|
|
|
@@ -153,17 +181,21 @@ Focused extractor agents are dispatched **in parallel** (concurrency-limited, de
|
|
|
153
181
|
| `supplementary` | Regulatory context, contacts, TPA, claims contacts |
|
|
154
182
|
| `sections` | Raw section content (fallback for unmatched sections) |
|
|
155
183
|
|
|
156
|
-
Each extractor writes its results to an in-memory `Map`.
|
|
184
|
+
Each extractor writes its results to an in-memory `Map`. Repeated extractor runs now **merge** instead of overwriting previous results, which is critical for extractors like `coverage_limits`, `endorsements`, `exclusions`, `conditions`, `sections`, and `declarations`.
|
|
185
|
+
|
|
186
|
+
Before each worker call, the SDK slices the requested page range with `extractPageRange()` and passes that page-scoped PDF through `providerOptions.pdfBase64`. If `convertPdfToImages` is configured, it passes `providerOptions.images` instead. The callback layer is responsible for actually including that content in the model input.
|
|
157
187
|
|
|
158
|
-
#### Phase
|
|
188
|
+
#### Phase 5: Review
|
|
159
189
|
|
|
160
|
-
After initial extraction, a review loop (up to `maxReviewRounds`, default 2) checks completeness
|
|
190
|
+
After initial extraction, a review loop (up to `maxReviewRounds`, default 2) checks both **completeness and quality**. The reviewer sees the full PDF, a page-map summary, and a summary of extracted results. It is expected to catch issues like missing required fields, generic placeholder outputs such as "shown in declarations" or "per schedule", and outputs that appear to come from generic form text instead of declaration/schedule values.
|
|
161
191
|
|
|
162
|
-
|
|
192
|
+
If gaps or quality issues are found, additional focused extractor tasks are dispatched.
|
|
193
|
+
|
|
194
|
+
#### Phase 6: Assemble
|
|
163
195
|
|
|
164
196
|
All extractor results are merged into a final validated `InsuranceDocument`.
|
|
165
197
|
|
|
166
|
-
#### Phase
|
|
198
|
+
#### Phase 7: Format
|
|
167
199
|
|
|
168
200
|
A formatting agent pass cleans up markdown in all content-bearing string fields (sections, subsections, endorsements, exclusions, conditions, summary). It fixes:
|
|
169
201
|
|
|
@@ -175,7 +207,7 @@ A formatting agent pass cleans up markdown in all content-bearing string fields
|
|
|
175
207
|
|
|
176
208
|
Content is batched (up to 20 fields per call) and sent through `generateText` for formatting cleanup. Token usage is tracked the same as other pipeline steps.
|
|
177
209
|
|
|
178
|
-
#### Phase
|
|
210
|
+
#### Phase 8: Chunk
|
|
179
211
|
|
|
180
212
|
The formatted document is chunked into `DocumentChunk[]` for vector storage. Each chunk is assigned a deterministic ID of the form `${documentId}:${type}:${index}`.
|
|
181
213
|
|
|
@@ -206,6 +238,8 @@ const extractor = createExtractor({
|
|
|
206
238
|
});
|
|
207
239
|
```
|
|
208
240
|
|
|
241
|
+
`tokenUsage` aggregates whatever usage your callbacks return. `usageReporting` tells you how many model calls reported usage versus how many omitted it, so a `0 in / 0 out` result is diagnosable instead of silent.
|
|
242
|
+
|
|
209
243
|
### Line-of-Business Templates
|
|
210
244
|
|
|
211
245
|
Templates define what the extraction pipeline expects for each policy type. Each template specifies expected sections, page hints, and required vs. optional fields.
|
package/dist/index.d.mts
CHANGED
|
@@ -27908,11 +27908,30 @@ declare const ExtractionPlanSchema: z.ZodObject<{
|
|
|
27908
27908
|
}>;
|
|
27909
27909
|
type ExtractionPlan = z.infer<typeof ExtractionPlanSchema>;
|
|
27910
27910
|
|
|
27911
|
+
declare const PageAssignmentSchema: z.ZodObject<{
|
|
27912
|
+
localPageNumber: z.ZodNumber;
|
|
27913
|
+
extractorNames: z.ZodArray<z.ZodEnum<["carrier_info", "named_insured", "coverage_limits", "endorsements", "exclusions", "conditions", "premium_breakdown", "declarations", "loss_history", "sections", "supplementary"]>, "many">;
|
|
27914
|
+
confidence: z.ZodOptional<z.ZodNumber>;
|
|
27915
|
+
notes: z.ZodOptional<z.ZodString>;
|
|
27916
|
+
}, "strip", z.ZodTypeAny, {
|
|
27917
|
+
localPageNumber: number;
|
|
27918
|
+
extractorNames: ("declarations" | "supplementary" | "conditions" | "named_insured" | "sections" | "endorsements" | "exclusions" | "loss_history" | "carrier_info" | "coverage_limits" | "premium_breakdown")[];
|
|
27919
|
+
confidence?: number | undefined;
|
|
27920
|
+
notes?: string | undefined;
|
|
27921
|
+
}, {
|
|
27922
|
+
localPageNumber: number;
|
|
27923
|
+
extractorNames: ("declarations" | "supplementary" | "conditions" | "named_insured" | "sections" | "endorsements" | "exclusions" | "loss_history" | "carrier_info" | "coverage_limits" | "premium_breakdown")[];
|
|
27924
|
+
confidence?: number | undefined;
|
|
27925
|
+
notes?: string | undefined;
|
|
27926
|
+
}>;
|
|
27927
|
+
type PageAssignment = z.infer<typeof PageAssignmentSchema>;
|
|
27928
|
+
|
|
27911
27929
|
/** Internal state checkpointed between extraction phases. */
|
|
27912
27930
|
interface ExtractionState {
|
|
27913
27931
|
id: string;
|
|
27914
27932
|
pageCount: number;
|
|
27915
27933
|
classifyResult?: ClassifyResult;
|
|
27934
|
+
pageAssignments?: PageAssignment[];
|
|
27916
27935
|
plan?: ExtractionPlan;
|
|
27917
27936
|
memory: Record<string, unknown>;
|
|
27918
27937
|
document?: InsuranceDocument;
|
|
@@ -27934,6 +27953,11 @@ interface ExtractionResult {
|
|
|
27934
27953
|
document: InsuranceDocument;
|
|
27935
27954
|
chunks: DocumentChunk[];
|
|
27936
27955
|
tokenUsage: TokenUsage;
|
|
27956
|
+
usageReporting: {
|
|
27957
|
+
modelCalls: number;
|
|
27958
|
+
callsWithUsage: number;
|
|
27959
|
+
callsMissingUsage: number;
|
|
27960
|
+
};
|
|
27937
27961
|
/** Last checkpoint — can be passed as `resumeFrom` to retry from a failure point. */
|
|
27938
27962
|
checkpoint?: PipelineCheckpoint<ExtractionState>;
|
|
27939
27963
|
}
|
package/dist/index.d.ts
CHANGED
|
@@ -27908,11 +27908,30 @@ declare const ExtractionPlanSchema: z.ZodObject<{
|
|
|
27908
27908
|
}>;
|
|
27909
27909
|
type ExtractionPlan = z.infer<typeof ExtractionPlanSchema>;
|
|
27910
27910
|
|
|
27911
|
+
declare const PageAssignmentSchema: z.ZodObject<{
|
|
27912
|
+
localPageNumber: z.ZodNumber;
|
|
27913
|
+
extractorNames: z.ZodArray<z.ZodEnum<["carrier_info", "named_insured", "coverage_limits", "endorsements", "exclusions", "conditions", "premium_breakdown", "declarations", "loss_history", "sections", "supplementary"]>, "many">;
|
|
27914
|
+
confidence: z.ZodOptional<z.ZodNumber>;
|
|
27915
|
+
notes: z.ZodOptional<z.ZodString>;
|
|
27916
|
+
}, "strip", z.ZodTypeAny, {
|
|
27917
|
+
localPageNumber: number;
|
|
27918
|
+
extractorNames: ("declarations" | "supplementary" | "conditions" | "named_insured" | "sections" | "endorsements" | "exclusions" | "loss_history" | "carrier_info" | "coverage_limits" | "premium_breakdown")[];
|
|
27919
|
+
confidence?: number | undefined;
|
|
27920
|
+
notes?: string | undefined;
|
|
27921
|
+
}, {
|
|
27922
|
+
localPageNumber: number;
|
|
27923
|
+
extractorNames: ("declarations" | "supplementary" | "conditions" | "named_insured" | "sections" | "endorsements" | "exclusions" | "loss_history" | "carrier_info" | "coverage_limits" | "premium_breakdown")[];
|
|
27924
|
+
confidence?: number | undefined;
|
|
27925
|
+
notes?: string | undefined;
|
|
27926
|
+
}>;
|
|
27927
|
+
type PageAssignment = z.infer<typeof PageAssignmentSchema>;
|
|
27928
|
+
|
|
27911
27929
|
/** Internal state checkpointed between extraction phases. */
|
|
27912
27930
|
interface ExtractionState {
|
|
27913
27931
|
id: string;
|
|
27914
27932
|
pageCount: number;
|
|
27915
27933
|
classifyResult?: ClassifyResult;
|
|
27934
|
+
pageAssignments?: PageAssignment[];
|
|
27916
27935
|
plan?: ExtractionPlan;
|
|
27917
27936
|
memory: Record<string, unknown>;
|
|
27918
27937
|
document?: InsuranceDocument;
|
|
@@ -27934,6 +27953,11 @@ interface ExtractionResult {
|
|
|
27934
27953
|
document: InsuranceDocument;
|
|
27935
27954
|
chunks: DocumentChunk[];
|
|
27936
27955
|
tokenUsage: TokenUsage;
|
|
27956
|
+
usageReporting: {
|
|
27957
|
+
modelCalls: number;
|
|
27958
|
+
callsWithUsage: number;
|
|
27959
|
+
callsMissingUsage: number;
|
|
27960
|
+
};
|
|
27937
27961
|
/** Last checkpoint — can be passed as `resumeFrom` to retry from a failure point. */
|
|
27938
27962
|
checkpoint?: PipelineCheckpoint<ExtractionState>;
|
|
27939
27963
|
}
|