@claritylabs/cl-sdk 0.8.1 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -55,6 +55,8 @@ const pdfBase64 = "..."; // base64-encoded insurance PDF
55
55
  const result = await extractor.extract(pdfBase64);
56
56
  console.log(result.document); // Typed InsuranceDocument (policy or quote)
57
57
  console.log(result.chunks); // DocumentChunk[] ready for vector storage
58
+ console.log(result.tokenUsage); // Aggregate input/output tokens when available
59
+ console.log(result.usageReporting); // How many model calls did or did not report usage
58
60
  ```
59
61
 
60
62
  ### With PDF-to-Image Conversion
@@ -108,34 +110,35 @@ Works with any provider: Anthropic, OpenAI, Google, Mistral, Bedrock, Azure, Oll
108
110
 
109
111
  ### Extraction Pipeline
110
112
 
111
- The extraction system uses a **coordinator/worker pattern** — a coordinator agent plans the work, specialized extractor agents execute in parallel, and a review loop ensures completeness.
113
+ The extraction system uses a **coordinator/worker pattern** with page-aware planning, merged worker outputs, and a document-grounded review loop.
112
114
 
113
115
  ```
114
- ┌─────────────┐ ┌─────────────┐ ┌──────────────────────┐
115
- 1. CLASSIFY │────▶│ 2. PLAN │────▶│ 3. EXTRACT (parallel)
116
- │ │ │ │
117
- Document │ │ Select │ │ Run focused
118
- type, line │ │ template, │ │ extractors against
119
- of business │ │ assign │ │ assigned page
120
- │ │ │ extractors │ │ ranges │
121
- │ to pages │ │ │
122
- └─────────────┘ └─────────────┘ └──────────┬───────────┘
123
-
124
- ┌─────────────┐ ┌─────────────┐ ┌──────────▼───────────┐
125
- 6. FORMAT │◀────│ 5. ASSEMBLE │◀────│ 4. REVIEW
126
- │ │ │ │ │ │
127
- Clean up │ │ Merge all │ │ Check completeness │
128
- │ markdown │ │ results │ │ against template, │
129
- tables, │ │ into final │ │ dispatch follow-up
130
- spacing │ │ document │ │ extractors for gaps
131
- └──────┬──────┘ └─────────────┘ └──────────────────────┘
132
-
133
- ┌──────▼──────┐
134
- 7. CHUNK │
135
- Break into │
136
- retrieval-
137
- ready
138
- chunks
116
+ ┌─────────────┐ ┌──────────────┐ ┌─────────────┐
117
+ │ 1. CLASSIFY │────▶│ 2. PAGE MAP │────▶│ 3. PLAN │
118
+ │ │ │ │ │ │
119
+ │ Document │ │ Assign pages │ │ Build tasks │
120
+ │ type, line │ │ to focused │ │ from page │
121
+ │ of business │ │ extractors │ │ assignments │
122
+ └─────────────┘ └──────┬───────┘ └──────┬──────┘
123
+ │ │
124
+ ┌──────▼────────────────────▼──────┐
125
+ │ 4. EXTRACT + MERGE (parallel) │
126
+ │ Run focused extractors, merge │
127
+ │ repeat runs instead of overwrite │
128
+ └──────────────┬───────────────────┘
129
+ │
130
+ ┌─────────────┐ ┌─────────────▼───────────┐ ┌─────────────┐
131
+ │ 7. FORMAT │◀────│ 6. ASSEMBLE │◀────│ 5. REVIEW │
132
+ │ │ │ │ │ │
133
+ │ Clean up │ │ Merge all data into │ │ Check │
134
+ │ markdown │ │ final document │ │ completeness│
135
+ └──────┬──────┘ └─────────────────────────┘ │ and quality │
136
+ └──────┬──────┘
137
+ ┌──────▼──────┐
138
+ │ 8. CHUNK │◀───────────────────────────────────────────────┘
139
+ │ Break into │
140
+ │ retrieval- │
141
+ │ ready chunks│
139
142
  └─────────────┘
140
143
  ```
141
144
 
@@ -148,13 +151,19 @@ The coordinator sends the document to `generateObject` with the `ClassifyResultS
148
151
 
149
152
  The full document is passed through `providerOptions.pdfBase64` for this step, so your callback must attach that PDF to the model request as a real document/file part.
150
153
 
151
- #### Phase 2: Plan
154
+ #### Phase 2: Page Map
152
155
 
153
- Based on the classification, the coordinator selects a **line-of-business template** (e.g., `workers_comp`, `cyber`, `homeowners_ho3`) that defines expected sections and page hints. It then generates an **extraction plan** — a list of tasks that map specific extractors to page ranges within the PDF.
156
+ Before planning tasks, the coordinator maps each page to one or more focused extractors. This reduces the chance that declaration pages and schedule pages get mixed into large generic ranges dominated by form language.
154
157
 
155
- The planner also receives the full document through `providerOptions.pdfBase64`, not just prompt text.
158
+ The page-mapping step uses the relevant PDF pages through `providerOptions.pdfBase64`, chunk by chunk, and produces page-level assignments for the downstream plan.
156
159
 
157
- #### Phase 3: Extract
160
+ #### Phase 3: Plan
161
+
162
+ Based on the classification, page map, and **line-of-business template** (e.g., `workers_comp`, `cyber`, `homeowners_ho3`), the coordinator builds an **extraction plan** — a list of focused extractor tasks derived from those page assignments.
163
+
164
+ The old prompt-only `plan.ts` module is a deprecation candidate and is no longer the active planning path used by the coordinator.
165
+
166
+ #### Phase 4: Extract And Merge
158
167
 
159
168
  Focused extractor agents are dispatched **in parallel** (concurrency-limited, default 2). Each extractor targets a specific data domain against its assigned page range. The 11 extractor types are:
160
169
 
@@ -172,19 +181,21 @@ Focused extractor agents are dispatched **in parallel** (concurrency-limited, de
172
181
  | `supplementary` | Regulatory context, contacts, TPA, claims contacts |
173
182
  | `sections` | Raw section content (fallback for unmatched sections) |
174
183
 
175
- Each extractor writes its results to an in-memory `Map`. Results accumulate across all extractors.
184
+ Each extractor writes its results to an in-memory `Map`. Repeated extractor runs now **merge** instead of overwriting previous results, which is critical for extractors like `coverage_limits`, `endorsements`, `exclusions`, `conditions`, `sections`, and `declarations`.
176
185
 
177
186
  Before each worker call, the SDK slices the requested page range with `extractPageRange()` and passes that page-scoped PDF through `providerOptions.pdfBase64`. If `convertPdfToImages` is configured, it passes `providerOptions.images` instead. The callback layer is responsible for actually including that content in the model input.
178
187
 
179
- #### Phase 4: Review
188
+ #### Phase 5: Review
180
189
 
181
- After initial extraction, a review loop (up to `maxReviewRounds`, default 2) checks completeness against the template's expected sections. If gaps are found, additional extractor tasks are dispatched to fill missing data. This iterative refinement ensures comprehensive extraction.
190
+ After initial extraction, a review loop (up to `maxReviewRounds`, default 2) checks both **completeness and quality**. The reviewer sees the full PDF, a page-map summary, and a summary of extracted results. It is expected to catch issues like missing required fields, generic placeholder outputs such as "shown in declarations" or "per schedule", and outputs that appear to come from generic form text instead of declaration/schedule values.
182
191
 
183
- #### Phase 5: Assemble
192
+ If gaps or quality issues are found, additional focused extractor tasks are dispatched.
193
+
194
+ #### Phase 6: Assemble
184
195
 
185
196
  All extractor results are merged into a final validated `InsuranceDocument`.
186
197
 
187
- #### Phase 6: Format
198
+ #### Phase 7: Format
188
199
 
189
200
  A formatting agent pass cleans up markdown in all content-bearing string fields (sections, subsections, endorsements, exclusions, conditions, summary). It fixes:
190
201
 
@@ -196,7 +207,7 @@ A formatting agent pass cleans up markdown in all content-bearing string fields
196
207
 
197
208
  Content is batched (up to 20 fields per call) and sent through `generateText` for formatting cleanup. Token usage is tracked the same as other pipeline steps.
198
209
 
199
- #### Phase 7: Chunk
210
+ #### Phase 8: Chunk
200
211
 
201
212
  The formatted document is chunked into `DocumentChunk[]` for vector storage. Each chunk is assigned a deterministic ID of the form `${documentId}:${type}:${index}`.
202
213
 
@@ -227,6 +238,8 @@ const extractor = createExtractor({
227
238
  });
228
239
  ```
229
240
 
241
+ `tokenUsage` aggregates whatever usage your callbacks return. `usageReporting` tells you how many model calls reported usage versus how many omitted it, so a `0 in / 0 out` result is diagnosable instead of silent.
242
+
230
243
  ### Line-of-Business Templates
231
244
 
232
245
  Templates define what the extraction pipeline expects for each policy type. Each template specifies expected sections, page hints, and required vs. optional fields.
package/dist/index.d.mts CHANGED
@@ -27908,11 +27908,30 @@ declare const ExtractionPlanSchema: z.ZodObject<{
27908
27908
  }>;
27909
27909
  type ExtractionPlan = z.infer<typeof ExtractionPlanSchema>;
27910
27910
 
27911
+ declare const PageAssignmentSchema: z.ZodObject<{
27912
+ localPageNumber: z.ZodNumber;
27913
+ extractorNames: z.ZodArray<z.ZodEnum<["carrier_info", "named_insured", "coverage_limits", "endorsements", "exclusions", "conditions", "premium_breakdown", "declarations", "loss_history", "sections", "supplementary"]>, "many">;
27914
+ confidence: z.ZodOptional<z.ZodNumber>;
27915
+ notes: z.ZodOptional<z.ZodString>;
27916
+ }, "strip", z.ZodTypeAny, {
27917
+ localPageNumber: number;
27918
+ extractorNames: ("declarations" | "supplementary" | "conditions" | "named_insured" | "sections" | "endorsements" | "exclusions" | "loss_history" | "carrier_info" | "coverage_limits" | "premium_breakdown")[];
27919
+ confidence?: number | undefined;
27920
+ notes?: string | undefined;
27921
+ }, {
27922
+ localPageNumber: number;
27923
+ extractorNames: ("declarations" | "supplementary" | "conditions" | "named_insured" | "sections" | "endorsements" | "exclusions" | "loss_history" | "carrier_info" | "coverage_limits" | "premium_breakdown")[];
27924
+ confidence?: number | undefined;
27925
+ notes?: string | undefined;
27926
+ }>;
27927
+ type PageAssignment = z.infer<typeof PageAssignmentSchema>;
27928
+
27911
27929
  /** Internal state checkpointed between extraction phases. */
27912
27930
  interface ExtractionState {
27913
27931
  id: string;
27914
27932
  pageCount: number;
27915
27933
  classifyResult?: ClassifyResult;
27934
+ pageAssignments?: PageAssignment[];
27916
27935
  plan?: ExtractionPlan;
27917
27936
  memory: Record<string, unknown>;
27918
27937
  document?: InsuranceDocument;
@@ -27934,6 +27953,11 @@ interface ExtractionResult {
27934
27953
  document: InsuranceDocument;
27935
27954
  chunks: DocumentChunk[];
27936
27955
  tokenUsage: TokenUsage;
27956
+ usageReporting: {
27957
+ modelCalls: number;
27958
+ callsWithUsage: number;
27959
+ callsMissingUsage: number;
27960
+ };
27937
27961
  /** Last checkpoint — can be passed as `resumeFrom` to retry from a failure point. */
27938
27962
  checkpoint?: PipelineCheckpoint<ExtractionState>;
27939
27963
  }
package/dist/index.d.ts CHANGED
@@ -27908,11 +27908,30 @@ declare const ExtractionPlanSchema: z.ZodObject<{
27908
27908
  }>;
27909
27909
  type ExtractionPlan = z.infer<typeof ExtractionPlanSchema>;
27910
27910
 
27911
+ declare const PageAssignmentSchema: z.ZodObject<{
27912
+ localPageNumber: z.ZodNumber;
27913
+ extractorNames: z.ZodArray<z.ZodEnum<["carrier_info", "named_insured", "coverage_limits", "endorsements", "exclusions", "conditions", "premium_breakdown", "declarations", "loss_history", "sections", "supplementary"]>, "many">;
27914
+ confidence: z.ZodOptional<z.ZodNumber>;
27915
+ notes: z.ZodOptional<z.ZodString>;
27916
+ }, "strip", z.ZodTypeAny, {
27917
+ localPageNumber: number;
27918
+ extractorNames: ("declarations" | "supplementary" | "conditions" | "named_insured" | "sections" | "endorsements" | "exclusions" | "loss_history" | "carrier_info" | "coverage_limits" | "premium_breakdown")[];
27919
+ confidence?: number | undefined;
27920
+ notes?: string | undefined;
27921
+ }, {
27922
+ localPageNumber: number;
27923
+ extractorNames: ("declarations" | "supplementary" | "conditions" | "named_insured" | "sections" | "endorsements" | "exclusions" | "loss_history" | "carrier_info" | "coverage_limits" | "premium_breakdown")[];
27924
+ confidence?: number | undefined;
27925
+ notes?: string | undefined;
27926
+ }>;
27927
+ type PageAssignment = z.infer<typeof PageAssignmentSchema>;
27928
+
27911
27929
  /** Internal state checkpointed between extraction phases. */
27912
27930
  interface ExtractionState {
27913
27931
  id: string;
27914
27932
  pageCount: number;
27915
27933
  classifyResult?: ClassifyResult;
27934
+ pageAssignments?: PageAssignment[];
27916
27935
  plan?: ExtractionPlan;
27917
27936
  memory: Record<string, unknown>;
27918
27937
  document?: InsuranceDocument;
@@ -27934,6 +27953,11 @@ interface ExtractionResult {
27934
27953
  document: InsuranceDocument;
27935
27954
  chunks: DocumentChunk[];
27936
27955
  tokenUsage: TokenUsage;
27956
+ usageReporting: {
27957
+ modelCalls: number;
27958
+ callsWithUsage: number;
27959
+ callsMissingUsage: number;
27960
+ };
27937
27961
  /** Last checkpoint — can be passed as `resumeFrom` to retry from a failure point. */
27938
27962
  checkpoint?: PipelineCheckpoint<ExtractionState>;
27939
27963
  }