@claritylabs/cl-sdk 0.8.1 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +49 -36
- package/dist/index.d.mts +24 -0
- package/dist/index.d.ts +24 -0
- package/dist/index.js +382 -72
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +382 -72
- package/dist/index.mjs.map +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -55,6 +55,8 @@ const pdfBase64 = "..."; // base64-encoded insurance PDF
|
|
|
55
55
|
const result = await extractor.extract(pdfBase64);
|
|
56
56
|
console.log(result.document); // Typed InsuranceDocument (policy or quote)
|
|
57
57
|
console.log(result.chunks); // DocumentChunk[] ready for vector storage
|
|
58
|
+
console.log(result.tokenUsage); // Aggregate input/output tokens when available
|
|
59
|
+
console.log(result.usageReporting); // How many model calls did or did not report usage
|
|
58
60
|
```
|
|
59
61
|
|
|
60
62
|
### With PDF-to-Image Conversion
|
|
@@ -108,34 +110,35 @@ Works with any provider: Anthropic, OpenAI, Google, Mistral, Bedrock, Azure, Oll
|
|
|
108
110
|
|
|
109
111
|
### Extraction Pipeline
|
|
110
112
|
|
|
111
|
-
The extraction system uses a **coordinator/worker pattern**
|
|
113
|
+
The extraction system uses a **coordinator/worker pattern** with page-aware planning, merged worker outputs, and a document-grounded review loop.
|
|
112
114
|
|
|
113
115
|
```
|
|
114
|
-
┌─────────────┐ ┌─────────────┐
|
|
115
|
-
│
|
|
116
|
-
│
|
|
117
|
-
│
|
|
118
|
-
│
|
|
119
|
-
│
|
|
120
|
-
|
|
121
|
-
│
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
│
|
|
126
|
-
|
|
127
|
-
│
|
|
128
|
-
|
|
129
|
-
│
|
|
130
|
-
│
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
│
|
|
135
|
-
│
|
|
136
|
-
│
|
|
137
|
-
│
|
|
138
|
-
│
|
|
116
|
+
┌─────────────┐ ┌──────────────┐ ┌─────────────┐
|
|
117
|
+
│ 1. CLASSIFY │────▶│ 2. PAGE MAP │────▶│ 3. PLAN │
|
|
118
|
+
│ │ │ │ │ │
|
|
119
|
+
│ Document │ │ Assign pages │ │ Build tasks │
|
|
120
|
+
│ type, line │ │ to focused │ │ from page │
|
|
121
|
+
│ of business │ │ extractors │ │ assignments │
|
|
122
|
+
└─────────────┘ └──────┬───────┘ └──────┬──────┘
|
|
123
|
+
│ │
|
|
124
|
+
┌──────▼────────────────────▼──────┐
|
|
125
|
+
│ 4. EXTRACT + MERGE (parallel) │
|
|
126
|
+
│ Run focused extractors, merge │
|
|
127
|
+
│ repeat runs instead of overwrite │
|
|
128
|
+
└──────────────┬───────────────────┘
|
|
129
|
+
│
|
|
130
|
+
┌─────────────┐ ┌─────────────▼───────────┐ ┌─────────────┐
|
|
131
|
+
│ 7. FORMAT │◀────│ 6. ASSEMBLE │◀────│ 5. REVIEW │
|
|
132
|
+
│ │ │ │ │ │
|
|
133
|
+
│ Clean up │ │ Merge all data into │ │ Check │
|
|
134
|
+
│ markdown │ │ final document │ │ completeness│
|
|
135
|
+
└──────┬──────┘ └─────────────────────────┘ │ and quality │
|
|
136
|
+
│ └──────┬──────┘
|
|
137
|
+
┌──────▼──────┐ │
|
|
138
|
+
│ 8. CHUNK │◀───────────────────────────────────────────────┘
|
|
139
|
+
│ Break into │
|
|
140
|
+
│ retrieval- │
|
|
141
|
+
│ ready chunks│
|
|
139
142
|
└─────────────┘
|
|
140
143
|
```
|
|
141
144
|
|
|
@@ -148,13 +151,19 @@ The coordinator sends the document to `generateObject` with the `ClassifyResultS
|
|
|
148
151
|
|
|
149
152
|
The full document is passed through `providerOptions.pdfBase64` for this step, so your callback must attach that PDF to the model request as a real document/file part.
|
|
150
153
|
|
|
151
|
-
#### Phase 2:
|
|
154
|
+
#### Phase 2: Page Map
|
|
152
155
|
|
|
153
|
-
|
|
156
|
+
Before planning tasks, the coordinator maps each page to one or more focused extractors. This reduces the chance that declaration pages and schedule pages get mixed into large generic ranges dominated by form language.
|
|
154
157
|
|
|
155
|
-
The
|
|
158
|
+
The page-mapping step uses the relevant PDF pages through `providerOptions.pdfBase64`, chunk by chunk, and produces page-level assignments for the downstream plan.
|
|
156
159
|
|
|
157
|
-
#### Phase 3:
|
|
160
|
+
#### Phase 3: Plan
|
|
161
|
+
|
|
162
|
+
Based on the classification, page map, and **line-of-business template** (e.g., `workers_comp`, `cyber`, `homeowners_ho3`), the coordinator builds an **extraction plan** — a list of focused extractor tasks derived from those page assignments.
|
|
163
|
+
|
|
164
|
+
The old prompt-only `plan.ts` module is a deprecation candidate and is no longer the active planning path used by the coordinator.
|
|
165
|
+
|
|
166
|
+
#### Phase 4: Extract And Merge
|
|
158
167
|
|
|
159
168
|
Focused extractor agents are dispatched **in parallel** (concurrency-limited, default 2). Each extractor targets a specific data domain against its assigned page range. The 11 extractor types are:
|
|
160
169
|
|
|
@@ -172,19 +181,21 @@ Focused extractor agents are dispatched **in parallel** (concurrency-limited, de
|
|
|
172
181
|
| `supplementary` | Regulatory context, contacts, TPA, claims contacts |
|
|
173
182
|
| `sections` | Raw section content (fallback for unmatched sections) |
|
|
174
183
|
|
|
175
|
-
Each extractor writes its results to an in-memory `Map`.
|
|
184
|
+
Each extractor writes its results to an in-memory `Map`. Repeated extractor runs now **merge** instead of overwriting previous results, which is critical for extractors like `coverage_limits`, `endorsements`, `exclusions`, `conditions`, `sections`, and `declarations`.
|
|
176
185
|
|
|
177
186
|
Before each worker call, the SDK slices the requested page range with `extractPageRange()` and passes that page-scoped PDF through `providerOptions.pdfBase64`. If `convertPdfToImages` is configured, it passes `providerOptions.images` instead. The callback layer is responsible for actually including that content in the model input.
|
|
178
187
|
|
|
179
|
-
#### Phase
|
|
188
|
+
#### Phase 5: Review
|
|
180
189
|
|
|
181
|
-
After initial extraction, a review loop (up to `maxReviewRounds`, default 2) checks completeness
|
|
190
|
+
After initial extraction, a review loop (up to `maxReviewRounds`, default 2) checks both **completeness and quality**. The reviewer sees the full PDF, a page-map summary, and a summary of extracted results. It is expected to catch issues like missing required fields, generic placeholder outputs such as "shown in declarations" or "per schedule", and outputs that appear to come from generic form text instead of declaration/schedule values.
|
|
182
191
|
|
|
183
|
-
|
|
192
|
+
If gaps or quality issues are found, additional focused extractor tasks are dispatched.
|
|
193
|
+
|
|
194
|
+
#### Phase 6: Assemble
|
|
184
195
|
|
|
185
196
|
All extractor results are merged into a final validated `InsuranceDocument`.
|
|
186
197
|
|
|
187
|
-
#### Phase
|
|
198
|
+
#### Phase 7: Format
|
|
188
199
|
|
|
189
200
|
A formatting agent pass cleans up markdown in all content-bearing string fields (sections, subsections, endorsements, exclusions, conditions, summary). It fixes:
|
|
190
201
|
|
|
@@ -196,7 +207,7 @@ A formatting agent pass cleans up markdown in all content-bearing string fields
|
|
|
196
207
|
|
|
197
208
|
Content is batched (up to 20 fields per call) and sent through `generateText` for formatting cleanup. Token usage is tracked the same as other pipeline steps.
|
|
198
209
|
|
|
199
|
-
#### Phase
|
|
210
|
+
#### Phase 8: Chunk
|
|
200
211
|
|
|
201
212
|
The formatted document is chunked into `DocumentChunk[]` for vector storage. Each chunk is assigned a deterministic ID of the form `${documentId}:${type}:${index}`.
|
|
202
213
|
|
|
@@ -227,6 +238,8 @@ const extractor = createExtractor({
|
|
|
227
238
|
});
|
|
228
239
|
```
|
|
229
240
|
|
|
241
|
+
`tokenUsage` aggregates whatever usage your callbacks return. `usageReporting` tells you how many model calls reported usage versus how many omitted it, so a `0 in / 0 out` result is diagnosable instead of silent.
|
|
242
|
+
|
|
230
243
|
### Line-of-Business Templates
|
|
231
244
|
|
|
232
245
|
Templates define what the extraction pipeline expects for each policy type. Each template specifies expected sections, page hints, and required vs. optional fields.
|
package/dist/index.d.mts
CHANGED
|
@@ -27908,11 +27908,30 @@ declare const ExtractionPlanSchema: z.ZodObject<{
|
|
|
27908
27908
|
}>;
|
|
27909
27909
|
type ExtractionPlan = z.infer<typeof ExtractionPlanSchema>;
|
|
27910
27910
|
|
|
27911
|
+
declare const PageAssignmentSchema: z.ZodObject<{
|
|
27912
|
+
localPageNumber: z.ZodNumber;
|
|
27913
|
+
extractorNames: z.ZodArray<z.ZodEnum<["carrier_info", "named_insured", "coverage_limits", "endorsements", "exclusions", "conditions", "premium_breakdown", "declarations", "loss_history", "sections", "supplementary"]>, "many">;
|
|
27914
|
+
confidence: z.ZodOptional<z.ZodNumber>;
|
|
27915
|
+
notes: z.ZodOptional<z.ZodString>;
|
|
27916
|
+
}, "strip", z.ZodTypeAny, {
|
|
27917
|
+
localPageNumber: number;
|
|
27918
|
+
extractorNames: ("declarations" | "supplementary" | "conditions" | "named_insured" | "sections" | "endorsements" | "exclusions" | "loss_history" | "carrier_info" | "coverage_limits" | "premium_breakdown")[];
|
|
27919
|
+
confidence?: number | undefined;
|
|
27920
|
+
notes?: string | undefined;
|
|
27921
|
+
}, {
|
|
27922
|
+
localPageNumber: number;
|
|
27923
|
+
extractorNames: ("declarations" | "supplementary" | "conditions" | "named_insured" | "sections" | "endorsements" | "exclusions" | "loss_history" | "carrier_info" | "coverage_limits" | "premium_breakdown")[];
|
|
27924
|
+
confidence?: number | undefined;
|
|
27925
|
+
notes?: string | undefined;
|
|
27926
|
+
}>;
|
|
27927
|
+
type PageAssignment = z.infer<typeof PageAssignmentSchema>;
|
|
27928
|
+
|
|
27911
27929
|
/** Internal state checkpointed between extraction phases. */
|
|
27912
27930
|
interface ExtractionState {
|
|
27913
27931
|
id: string;
|
|
27914
27932
|
pageCount: number;
|
|
27915
27933
|
classifyResult?: ClassifyResult;
|
|
27934
|
+
pageAssignments?: PageAssignment[];
|
|
27916
27935
|
plan?: ExtractionPlan;
|
|
27917
27936
|
memory: Record<string, unknown>;
|
|
27918
27937
|
document?: InsuranceDocument;
|
|
@@ -27934,6 +27953,11 @@ interface ExtractionResult {
|
|
|
27934
27953
|
document: InsuranceDocument;
|
|
27935
27954
|
chunks: DocumentChunk[];
|
|
27936
27955
|
tokenUsage: TokenUsage;
|
|
27956
|
+
usageReporting: {
|
|
27957
|
+
modelCalls: number;
|
|
27958
|
+
callsWithUsage: number;
|
|
27959
|
+
callsMissingUsage: number;
|
|
27960
|
+
};
|
|
27937
27961
|
/** Last checkpoint — can be passed as `resumeFrom` to retry from a failure point. */
|
|
27938
27962
|
checkpoint?: PipelineCheckpoint<ExtractionState>;
|
|
27939
27963
|
}
|
package/dist/index.d.ts
CHANGED
|
@@ -27908,11 +27908,30 @@ declare const ExtractionPlanSchema: z.ZodObject<{
|
|
|
27908
27908
|
}>;
|
|
27909
27909
|
type ExtractionPlan = z.infer<typeof ExtractionPlanSchema>;
|
|
27910
27910
|
|
|
27911
|
+
declare const PageAssignmentSchema: z.ZodObject<{
|
|
27912
|
+
localPageNumber: z.ZodNumber;
|
|
27913
|
+
extractorNames: z.ZodArray<z.ZodEnum<["carrier_info", "named_insured", "coverage_limits", "endorsements", "exclusions", "conditions", "premium_breakdown", "declarations", "loss_history", "sections", "supplementary"]>, "many">;
|
|
27914
|
+
confidence: z.ZodOptional<z.ZodNumber>;
|
|
27915
|
+
notes: z.ZodOptional<z.ZodString>;
|
|
27916
|
+
}, "strip", z.ZodTypeAny, {
|
|
27917
|
+
localPageNumber: number;
|
|
27918
|
+
extractorNames: ("declarations" | "supplementary" | "conditions" | "named_insured" | "sections" | "endorsements" | "exclusions" | "loss_history" | "carrier_info" | "coverage_limits" | "premium_breakdown")[];
|
|
27919
|
+
confidence?: number | undefined;
|
|
27920
|
+
notes?: string | undefined;
|
|
27921
|
+
}, {
|
|
27922
|
+
localPageNumber: number;
|
|
27923
|
+
extractorNames: ("declarations" | "supplementary" | "conditions" | "named_insured" | "sections" | "endorsements" | "exclusions" | "loss_history" | "carrier_info" | "coverage_limits" | "premium_breakdown")[];
|
|
27924
|
+
confidence?: number | undefined;
|
|
27925
|
+
notes?: string | undefined;
|
|
27926
|
+
}>;
|
|
27927
|
+
type PageAssignment = z.infer<typeof PageAssignmentSchema>;
|
|
27928
|
+
|
|
27911
27929
|
/** Internal state checkpointed between extraction phases. */
|
|
27912
27930
|
interface ExtractionState {
|
|
27913
27931
|
id: string;
|
|
27914
27932
|
pageCount: number;
|
|
27915
27933
|
classifyResult?: ClassifyResult;
|
|
27934
|
+
pageAssignments?: PageAssignment[];
|
|
27916
27935
|
plan?: ExtractionPlan;
|
|
27917
27936
|
memory: Record<string, unknown>;
|
|
27918
27937
|
document?: InsuranceDocument;
|
|
@@ -27934,6 +27953,11 @@ interface ExtractionResult {
|
|
|
27934
27953
|
document: InsuranceDocument;
|
|
27935
27954
|
chunks: DocumentChunk[];
|
|
27936
27955
|
tokenUsage: TokenUsage;
|
|
27956
|
+
usageReporting: {
|
|
27957
|
+
modelCalls: number;
|
|
27958
|
+
callsWithUsage: number;
|
|
27959
|
+
callsMissingUsage: number;
|
|
27960
|
+
};
|
|
27937
27961
|
/** Last checkpoint — can be passed as `resumeFrom` to retry from a failure point. */
|
|
27938
27962
|
checkpoint?: PipelineCheckpoint<ExtractionState>;
|
|
27939
27963
|
}
|