@claritylabs/cl-sdk 0.8.0 → 0.8.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +25 -4
- package/package.json +2 -2
package/README.md
CHANGED
|
@@ -31,14 +31,22 @@ CL-SDK extracts structured data from insurance PDFs using a multi-agent pipeline
|
|
|
31
31
|
import { createExtractor } from "@claritylabs/cl-sdk";
|
|
32
32
|
|
|
33
33
|
const extractor = createExtractor({
|
|
34
|
-
generateText: async ({ prompt, system, maxTokens }) => {
|
|
34
|
+
generateText: async ({ prompt, system, maxTokens, providerOptions }) => {
|
|
35
35
|
// Wrap your preferred LLM provider
|
|
36
|
-
const result = await yourProvider.generate({ prompt, system, maxTokens });
|
|
36
|
+
const result = await yourProvider.generate({ prompt, system, maxTokens, providerOptions });
|
|
37
37
|
return { text: result.text, usage: result.usage };
|
|
38
38
|
},
|
|
39
|
-
generateObject: async ({ prompt, system, schema, maxTokens }) => {
|
|
39
|
+
generateObject: async ({ prompt, system, schema, maxTokens, providerOptions }) => {
|
|
40
40
|
// schema is a Zod schema — use it for structured output
|
|
41
|
-
|
|
41
|
+
// IMPORTANT: pass providerOptions.pdfBase64 and/or providerOptions.images
|
|
42
|
+
// through to your model as file/image message parts.
|
|
43
|
+
const result = await yourProvider.generateStructured({
|
|
44
|
+
prompt,
|
|
45
|
+
system,
|
|
46
|
+
schema,
|
|
47
|
+
maxTokens,
|
|
48
|
+
providerOptions,
|
|
49
|
+
});
|
|
42
50
|
return { object: result.object, usage: result.usage };
|
|
43
51
|
},
|
|
44
52
|
});
|
|
@@ -87,6 +95,13 @@ type GenerateObject<T> = (params: {
|
|
|
87
95
|
}) => Promise<{ object: T; usage?: { inputTokens: number; outputTokens: number } }>;
|
|
88
96
|
```
|
|
89
97
|
|
|
98
|
+
For extraction calls, `providerOptions` can carry document content:
|
|
99
|
+
|
|
100
|
+
- `providerOptions.pdfBase64` — the PDF to send as a file part
|
|
101
|
+
- `providerOptions.images` — page images to send as image parts
|
|
102
|
+
|
|
103
|
+
The coordinator passes the full PDF to classify and plan. Worker extractors pass a page-scoped PDF produced by `extractPageRange()` unless `convertPdfToImages` is enabled, in which case they pass page images instead. Your callback must include that content in the actual model request; the prompt text alone is not sufficient.
|
|
104
|
+
|
|
90
105
|
Works with any provider: Anthropic, OpenAI, Google, Mistral, Bedrock, Azure, Ollama, etc. You write the adapter once; the SDK calls it throughout the pipeline.
|
|
91
106
|
|
|
92
107
|
> **Strict structured output compatibility:** The SDK automatically transforms Zod schemas before passing them to `generateObject` — converting `.optional()` fields to `.nullable()` so all properties appear in the JSON Schema `required` array. This ensures compatibility with providers like OpenAI that enforce strict structured output validation. No adapter changes needed on your end.
|
|
@@ -131,10 +146,14 @@ The coordinator sends the document to `generateObject` with the `ClassifyResultS
|
|
|
131
146
|
- **Policy types** — one or more lines of business (e.g., `general_liability`, `workers_comp`)
|
|
132
147
|
- **Confidence score**
|
|
133
148
|
|
|
149
|
+
The full document is passed through `providerOptions.pdfBase64` for this step, so your callback must attach that PDF to the model request as a real document/file part.
|
|
150
|
+
|
|
134
151
|
#### Phase 2: Plan
|
|
135
152
|
|
|
136
153
|
Based on the classification, the coordinator selects a **line-of-business template** (e.g., `workers_comp`, `cyber`, `homeowners_ho3`) that defines expected sections and page hints. It then generates an **extraction plan** — a list of tasks that map specific extractors to page ranges within the PDF.
|
|
137
154
|
|
|
155
|
+
The planner also receives the full document through `providerOptions.pdfBase64`, not just prompt text.
|
|
156
|
+
|
|
138
157
|
#### Phase 3: Extract
|
|
139
158
|
|
|
140
159
|
Focused extractor agents are dispatched **in parallel** (concurrency-limited, default 2). Each extractor targets a specific data domain against its assigned page range. The 11 extractor types are:
|
|
@@ -155,6 +174,8 @@ Focused extractor agents are dispatched **in parallel** (concurrency-limited, de
|
|
|
155
174
|
|
|
156
175
|
Each extractor writes its results to an in-memory `Map`. Results accumulate across all extractors.
|
|
157
176
|
|
|
177
|
+
Before each worker call, the SDK slices the requested page range with `extractPageRange()` and passes that page-scoped PDF through `providerOptions.pdfBase64`. If `convertPdfToImages` is configured, it passes `providerOptions.images` instead. The callback layer is responsible for actually including that content in the model input.
|
|
178
|
+
|
|
158
179
|
#### Phase 4: Review
|
|
159
180
|
|
|
160
181
|
After initial extraction, a review loop (up to `maxReviewRounds`, default 2) checks completeness against the template's expected sections. If gaps are found, additional extractor tasks are dispatched to fill missing data. This iterative refinement ensures comprehensive extraction.
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@claritylabs/cl-sdk",
|
|
3
|
-
"version": "0.8.0",
|
|
3
|
+
"version": "0.8.1",
|
|
4
4
|
"description": "CL-0 SDK — open infrastructure for building AI agents that work with insurance",
|
|
5
5
|
"main": "./dist/index.js",
|
|
6
6
|
"module": "./dist/index.mjs",
|
|
@@ -47,7 +47,7 @@
|
|
|
47
47
|
"semantic-release": "^25.0.3",
|
|
48
48
|
"tsup": "^8.4.0",
|
|
49
49
|
"typescript": "^5.8.0",
|
|
50
|
-
"vitest": "^3.
|
|
50
|
+
"vitest": "^3.2.4",
|
|
51
51
|
"zod": "^3.24.0"
|
|
52
52
|
},
|
|
53
53
|
"repository": {
|