@claritylabs/cl-sdk 0.8.0 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -31,14 +31,22 @@ CL-SDK extracts structured data from insurance PDFs using a multi-agent pipeline
31
31
  import { createExtractor } from "@claritylabs/cl-sdk";
32
32
 
33
33
  const extractor = createExtractor({
34
- generateText: async ({ prompt, system, maxTokens }) => {
34
+ generateText: async ({ prompt, system, maxTokens, providerOptions }) => {
35
35
  // Wrap your preferred LLM provider
36
- const result = await yourProvider.generate({ prompt, system, maxTokens });
36
+ const result = await yourProvider.generate({ prompt, system, maxTokens, providerOptions });
37
37
  return { text: result.text, usage: result.usage };
38
38
  },
39
- generateObject: async ({ prompt, system, schema, maxTokens }) => {
39
+ generateObject: async ({ prompt, system, schema, maxTokens, providerOptions }) => {
40
40
  // schema is a Zod schema — use it for structured output
41
- const result = await yourProvider.generateStructured({ prompt, system, schema, maxTokens });
41
+ // IMPORTANT: pass providerOptions.pdfBase64 and/or providerOptions.images
42
+ // through to your model as file/image message parts.
43
+ const result = await yourProvider.generateStructured({
44
+ prompt,
45
+ system,
46
+ schema,
47
+ maxTokens,
48
+ providerOptions,
49
+ });
42
50
  return { object: result.object, usage: result.usage };
43
51
  },
44
52
  });
@@ -47,6 +55,8 @@ const pdfBase64 = "..."; // base64-encoded insurance PDF
47
55
  const result = await extractor.extract(pdfBase64);
48
56
  console.log(result.document); // Typed InsuranceDocument (policy or quote)
49
57
  console.log(result.chunks); // DocumentChunk[] ready for vector storage
58
+ console.log(result.tokenUsage); // Aggregate input/output tokens when available
59
+ console.log(result.usageReporting); // How many model calls did or did not report usage
50
60
  ```
51
61
 
52
62
  ### With PDF-to-Image Conversion
@@ -87,40 +97,48 @@ type GenerateObject<T> = (params: {
87
97
  }) => Promise<{ object: T; usage?: { inputTokens: number; outputTokens: number } }>;
88
98
  ```
89
99
 
100
+ For extraction calls, `providerOptions` can carry document content:
101
+
102
+ - `providerOptions.pdfBase64` — the PDF to send as a file part
103
+ - `providerOptions.images` — page images to send as image parts
104
+
105
+ The coordinator passes the full PDF to classify and plan. Worker extractors pass a page-scoped PDF produced by `extractPageRange()` unless `convertPdfToImages` is enabled, in which case they pass page images instead. Your callback must include that content in the actual model request; the prompt text alone is not sufficient.
106
+
90
107
  Works with any provider: Anthropic, OpenAI, Google, Mistral, Bedrock, Azure, Ollama, etc. You write the adapter once; the SDK calls it throughout the pipeline.
91
108
 
92
109
  > **Strict structured output compatibility:** The SDK automatically transforms Zod schemas before passing them to `generateObject` — converting `.optional()` fields to `.nullable()` so all properties appear in the JSON Schema `required` array. This ensures compatibility with providers like OpenAI that enforce strict structured output validation. No adapter changes needed on your end.
93
110
 
94
111
  ### Extraction Pipeline
95
112
 
96
- The extraction system uses a **coordinator/worker pattern** — a coordinator agent plans the work, specialized extractor agents execute in parallel, and a review loop ensures completeness.
113
+ The extraction system uses a **coordinator/worker pattern** with page-aware planning, merged worker outputs, and a document-grounded review loop.
97
114
 
98
115
  ```
99
- ┌─────────────┐ ┌─────────────┐ ┌──────────────────────┐
100
- 1. CLASSIFY │────▶│ 2. PLAN │────▶│ 3. EXTRACT (parallel)
101
- │ │ │ │
102
- Document │ │ Select │ │ Run focused
103
- type, line │ │ template, │ │ extractors against
104
- of business │ │ assign │ │ assigned page
105
- │ │ │ extractors │ │ ranges │
106
- │ to pages │ │ │
107
- └─────────────┘ └─────────────┘ └──────────┬───────────┘
108
-
109
- ┌─────────────┐ ┌─────────────┐ ┌──────────▼───────────┐
110
- 6. FORMAT │◀────│ 5. ASSEMBLE │◀────│ 4. REVIEW
111
- │ │ │ │ │ │
112
- Clean up │ │ Merge all │ │ Check completeness │
113
- │ markdown │ │ results │ │ against template, │
114
- tables, │ │ into final │ │ dispatch follow-up
115
- spacing │ │ document │ │ extractors for gaps
116
- └──────┬──────┘ └─────────────┘ └──────────────────────┘
117
-
118
- ┌──────▼──────┐
119
- 7. CHUNK │
120
- Break into │
121
- retrieval-
122
- ready
123
- chunks
116
+ ┌─────────────┐ ┌──────────────┐ ┌─────────────┐
117
+ │ 1. CLASSIFY │────▶│ 2. PAGE MAP │────▶│ 3. PLAN │
118
+ │ │ │ │
119
+ │ Document │ │ Assign pages │ │ Build tasks │
120
+ │ type, line │ │ to focused │ │ from page │
121
+ │ of business │ │ extractors │ │ assignments │
122
+ └─────────────┘ └──────┬───────┘ └──────┬──────┘
123
+
124
+ ┌──────▼────────────────────▼──────┐
125
+ │ 4. EXTRACT + MERGE (parallel) │
126
+ │ Run focused extractors, merge │
127
+ │ repeat runs instead of overwrite │
128
+ └──────────────┬───────────────────┘
129
+
130
+ ┌─────────────┐ ┌─────────────▼───────────┐ ┌─────────────┐
131
+ │ 7. FORMAT │◀────│ 6. ASSEMBLE │◀────│ 5. REVIEW │
132
+ │ │ │ │
133
+ │ Clean up │ │ Merge all data into │ │ Check │
134
+ │ markdown │ │ final document │ │ completeness│
135
+ └──────┬──────┘ └─────────────────────────┘ │ and quality │
136
+ └──────┬──────┘
137
+ ┌──────▼──────┐
138
+ │ 8. CHUNK │◀───────────────────────────────────────────────┘
139
+ │ Break into │
140
+ │ retrieval- │
141
+ │ ready chunks│
124
142
  └─────────────┘
125
143
  ```
126
144
 
@@ -131,11 +149,21 @@ The coordinator sends the document to `generateObject` with the `ClassifyResultS
131
149
  - **Policy types** — one or more lines of business (e.g., `general_liability`, `workers_comp`)
132
150
  - **Confidence score**
133
151
 
134
- #### Phase 2: Plan
152
+ The full document is passed through `providerOptions.pdfBase64` for this step, so your callback must attach that PDF to the model request as a real document/file part.
153
+
154
+ #### Phase 2: Page Map
155
+
156
+ Before planning tasks, the coordinator maps each page to one or more focused extractors. This reduces the chance that declaration pages and schedule pages get mixed into large generic ranges dominated by form language.
157
+
158
+ The page-mapping step uses the relevant PDF pages through `providerOptions.pdfBase64`, chunk by chunk, and produces page-level assignments for the downstream plan.
135
159
 
136
- Based on the classification, the coordinator selects a **line-of-business template** (e.g., `workers_comp`, `cyber`, `homeowners_ho3`) that defines expected sections and page hints. It then generates an **extraction plan** — a list of tasks that map specific extractors to page ranges within the PDF.
160
+ #### Phase 3: Plan
137
161
 
138
- #### Phase 3: Extract
162
+ Based on the classification, page map, and **line-of-business template** (e.g., `workers_comp`, `cyber`, `homeowners_ho3`), the coordinator builds an **extraction plan** — a list of focused extractor tasks derived from those page assignments.
163
+
164
+ The old prompt-only `plan.ts` module is a deprecation candidate and is no longer the active planning path used by the coordinator.
165
+
166
+ #### Phase 4: Extract And Merge
139
167
 
140
168
  Focused extractor agents are dispatched **in parallel** (concurrency-limited, default 2). Each extractor targets a specific data domain against its assigned page range. The 11 extractor types are:
141
169
 
@@ -153,17 +181,21 @@ Focused extractor agents are dispatched **in parallel** (concurrency-limited, de
153
181
  | `supplementary` | Regulatory context, contacts, TPA, claims contacts |
154
182
  | `sections` | Raw section content (fallback for unmatched sections) |
155
183
 
156
- Each extractor writes its results to an in-memory `Map`. Results accumulate across all extractors.
184
+ Each extractor writes its results to an in-memory `Map`. Repeated extractor runs now **merge** instead of overwriting previous results, which is critical for extractors like `coverage_limits`, `endorsements`, `exclusions`, `conditions`, `sections`, and `declarations`.
185
+
186
+ Before each worker call, the SDK slices the requested page range with `extractPageRange()` and passes that page-scoped PDF through `providerOptions.pdfBase64`. If `convertPdfToImages` is configured, it passes `providerOptions.images` instead. The callback layer is responsible for actually including that content in the model input.
157
187
 
158
- #### Phase 4: Review
188
+ #### Phase 5: Review
159
189
 
160
- After initial extraction, a review loop (up to `maxReviewRounds`, default 2) checks completeness against the template's expected sections. If gaps are found, additional extractor tasks are dispatched to fill missing data. This iterative refinement ensures comprehensive extraction.
190
+ After initial extraction, a review loop (up to `maxReviewRounds`, default 2) checks both **completeness and quality**. The reviewer sees the full PDF, a page-map summary, and a summary of extracted results. It is expected to catch issues like missing required fields, generic placeholder outputs such as "shown in declarations" or "per schedule", and outputs that appear to come from generic form text instead of declaration/schedule values.
161
191
 
162
- #### Phase 5: Assemble
192
+ If gaps or quality issues are found, additional focused extractor tasks are dispatched.
193
+
194
+ #### Phase 6: Assemble
163
195
 
164
196
  All extractor results are merged into a final validated `InsuranceDocument`.
165
197
 
166
- #### Phase 6: Format
198
+ #### Phase 7: Format
167
199
 
168
200
  A formatting agent pass cleans up markdown in all content-bearing string fields (sections, subsections, endorsements, exclusions, conditions, summary). It fixes:
169
201
 
@@ -175,7 +207,7 @@ A formatting agent pass cleans up markdown in all content-bearing string fields
175
207
 
176
208
  Content is batched (up to 20 fields per call) and sent through `generateText` for formatting cleanup. Token usage is tracked the same as other pipeline steps.
177
209
 
178
- #### Phase 7: Chunk
210
+ #### Phase 8: Chunk
179
211
 
180
212
  The formatted document is chunked into `DocumentChunk[]` for vector storage. Chunks are deterministically IDed as `${documentId}:${type}:${index}`.
181
213
 
@@ -206,6 +238,8 @@ const extractor = createExtractor({
206
238
  });
207
239
  ```
208
240
 
241
+ `tokenUsage` aggregates whatever usage your callbacks return. `usageReporting` tells you how many model calls reported usage versus how many omitted it, so a `0 in / 0 out` result is diagnosable instead of silent.
242
+
209
243
  ### Line-of-Business Templates
210
244
 
211
245
  Templates define what the extraction pipeline expects for each policy type. Each template specifies expected sections, page hints, and required vs. optional fields.
package/dist/index.d.mts CHANGED
@@ -27908,11 +27908,30 @@ declare const ExtractionPlanSchema: z.ZodObject<{
27908
27908
  }>;
27909
27909
  type ExtractionPlan = z.infer<typeof ExtractionPlanSchema>;
27910
27910
 
27911
+ declare const PageAssignmentSchema: z.ZodObject<{
27912
+ localPageNumber: z.ZodNumber;
27913
+ extractorNames: z.ZodArray<z.ZodEnum<["carrier_info", "named_insured", "coverage_limits", "endorsements", "exclusions", "conditions", "premium_breakdown", "declarations", "loss_history", "sections", "supplementary"]>, "many">;
27914
+ confidence: z.ZodOptional<z.ZodNumber>;
27915
+ notes: z.ZodOptional<z.ZodString>;
27916
+ }, "strip", z.ZodTypeAny, {
27917
+ localPageNumber: number;
27918
+ extractorNames: ("declarations" | "supplementary" | "conditions" | "named_insured" | "sections" | "endorsements" | "exclusions" | "loss_history" | "carrier_info" | "coverage_limits" | "premium_breakdown")[];
27919
+ confidence?: number | undefined;
27920
+ notes?: string | undefined;
27921
+ }, {
27922
+ localPageNumber: number;
27923
+ extractorNames: ("declarations" | "supplementary" | "conditions" | "named_insured" | "sections" | "endorsements" | "exclusions" | "loss_history" | "carrier_info" | "coverage_limits" | "premium_breakdown")[];
27924
+ confidence?: number | undefined;
27925
+ notes?: string | undefined;
27926
+ }>;
27927
+ type PageAssignment = z.infer<typeof PageAssignmentSchema>;
27928
+
27911
27929
  /** Internal state checkpointed between extraction phases. */
27912
27930
  interface ExtractionState {
27913
27931
  id: string;
27914
27932
  pageCount: number;
27915
27933
  classifyResult?: ClassifyResult;
27934
+ pageAssignments?: PageAssignment[];
27916
27935
  plan?: ExtractionPlan;
27917
27936
  memory: Record<string, unknown>;
27918
27937
  document?: InsuranceDocument;
@@ -27934,6 +27953,11 @@ interface ExtractionResult {
27934
27953
  document: InsuranceDocument;
27935
27954
  chunks: DocumentChunk[];
27936
27955
  tokenUsage: TokenUsage;
27956
+ usageReporting: {
27957
+ modelCalls: number;
27958
+ callsWithUsage: number;
27959
+ callsMissingUsage: number;
27960
+ };
27937
27961
  /** Last checkpoint — can be passed as `resumeFrom` to retry from a failure point. */
27938
27962
  checkpoint?: PipelineCheckpoint<ExtractionState>;
27939
27963
  }
package/dist/index.d.ts CHANGED
@@ -27908,11 +27908,30 @@ declare const ExtractionPlanSchema: z.ZodObject<{
27908
27908
  }>;
27909
27909
  type ExtractionPlan = z.infer<typeof ExtractionPlanSchema>;
27910
27910
 
27911
+ declare const PageAssignmentSchema: z.ZodObject<{
27912
+ localPageNumber: z.ZodNumber;
27913
+ extractorNames: z.ZodArray<z.ZodEnum<["carrier_info", "named_insured", "coverage_limits", "endorsements", "exclusions", "conditions", "premium_breakdown", "declarations", "loss_history", "sections", "supplementary"]>, "many">;
27914
+ confidence: z.ZodOptional<z.ZodNumber>;
27915
+ notes: z.ZodOptional<z.ZodString>;
27916
+ }, "strip", z.ZodTypeAny, {
27917
+ localPageNumber: number;
27918
+ extractorNames: ("declarations" | "supplementary" | "conditions" | "named_insured" | "sections" | "endorsements" | "exclusions" | "loss_history" | "carrier_info" | "coverage_limits" | "premium_breakdown")[];
27919
+ confidence?: number | undefined;
27920
+ notes?: string | undefined;
27921
+ }, {
27922
+ localPageNumber: number;
27923
+ extractorNames: ("declarations" | "supplementary" | "conditions" | "named_insured" | "sections" | "endorsements" | "exclusions" | "loss_history" | "carrier_info" | "coverage_limits" | "premium_breakdown")[];
27924
+ confidence?: number | undefined;
27925
+ notes?: string | undefined;
27926
+ }>;
27927
+ type PageAssignment = z.infer<typeof PageAssignmentSchema>;
27928
+
27911
27929
  /** Internal state checkpointed between extraction phases. */
27912
27930
  interface ExtractionState {
27913
27931
  id: string;
27914
27932
  pageCount: number;
27915
27933
  classifyResult?: ClassifyResult;
27934
+ pageAssignments?: PageAssignment[];
27916
27935
  plan?: ExtractionPlan;
27917
27936
  memory: Record<string, unknown>;
27918
27937
  document?: InsuranceDocument;
@@ -27934,6 +27953,11 @@ interface ExtractionResult {
27934
27953
  document: InsuranceDocument;
27935
27954
  chunks: DocumentChunk[];
27936
27955
  tokenUsage: TokenUsage;
27956
+ usageReporting: {
27957
+ modelCalls: number;
27958
+ callsWithUsage: number;
27959
+ callsMissingUsage: number;
27960
+ };
27937
27961
  /** Last checkpoint — can be passed as `resumeFrom` to retry from a failure point. */
27938
27962
  checkpoint?: PipelineCheckpoint<ExtractionState>;
27939
27963
  }