npm - @claritylabs/cl-sdk - Versions diffs - 0.8.0 → 0.9.0 - Mend

@claritylabs/cl-sdk 0.8.0 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

package/README.md CHANGED Viewed

@@ -31,14 +31,22 @@ CL-SDK extracts structured data from insurance PDFs using a multi-agent pipeline
 import { createExtractor } from "@claritylabs/cl-sdk";
 const extractor = createExtractor({
-  generateText: async ({ prompt, system, maxTokens }) => {
+  generateText: async ({ prompt, system, maxTokens, providerOptions }) => {
     // Wrap your preferred LLM provider
-    const result = await yourProvider.generate({ prompt, system, maxTokens });
+    const result = await yourProvider.generate({ prompt, system, maxTokens, providerOptions });
     return { text: result.text, usage: result.usage };
   },
-  generateObject: async ({ prompt, system, schema, maxTokens }) => {
+  generateObject: async ({ prompt, system, schema, maxTokens, providerOptions }) => {
     // schema is a Zod schema — use it for structured output
-    const result = await yourProvider.generateStructured({ prompt, system, schema, maxTokens });
+    // IMPORTANT: pass providerOptions.pdfBase64 and/or providerOptions.images
+    // through to your model as file/image message parts.
+    const result = await yourProvider.generateStructured({
+      prompt,
+      system,
+      schema,
+      maxTokens,
+      providerOptions,
+    });
     return { object: result.object, usage: result.usage };
   },
 });
@@ -47,6 +55,8 @@ const pdfBase64 = "..."; // base64-encoded insurance PDF
 const result = await extractor.extract(pdfBase64);
 console.log(result.document); // Typed InsuranceDocument (policy or quote)
 console.log(result.chunks);   // DocumentChunk[] ready for vector storage
+console.log(result.tokenUsage);      // Aggregate input/output tokens when available
+console.log(result.usageReporting);  // How many model calls did or did not report usage
 ```
 ### With PDF-to-Image Conversion
@@ -87,40 +97,48 @@ type GenerateObject<T> = (params: {
 }) => Promise<{ object: T; usage?: { inputTokens: number; outputTokens: number } }>;
 ```
+For extraction calls, `providerOptions` can carry document content:
+- `providerOptions.pdfBase64` — the PDF to send as a file part
+- `providerOptions.images` — page images to send as image parts
+The coordinator passes the full PDF to classify and plan. Worker extractors pass a page-scoped PDF produced by `extractPageRange()` unless `convertPdfToImages` is enabled, in which case they pass page images instead. Your callback must include that content in the actual model request; the prompt text alone is not sufficient.
 Works with any provider: Anthropic, OpenAI, Google, Mistral, Bedrock, Azure, Ollama, etc. You write the adapter once; the SDK calls it throughout the pipeline.
 > **Strict structured output compatibility:** The SDK automatically transforms Zod schemas before passing them to `generateObject` — converting `.optional()` fields to `.nullable()` so all properties appear in the JSON Schema `required` array. This ensures compatibility with providers like OpenAI that enforce strict structured output validation. No adapter changes needed on your end.
 ### Extraction Pipeline
-The extraction system uses a **coordinator/worker pattern** — a coordinator agent plans the work, specialized extractor agents execute in parallel, and a review loop ensures completeness.
+The extraction system uses a **coordinator/worker pattern** with page-aware planning, merged worker outputs, and a document-grounded review loop.
 ```
-┌─────────────┐     ┌─────────────┐     ┌──────────────────────┐
-│  1. CLASSIFY │────▶│  2. PLAN    │────▶│  3. EXTRACT (parallel)│
-│              │     │             │     │                      │
-│  Document    │     │  Select     │     │  Run focused         │
-│  type, line  │     │  template,  │     │  extractors against  │
-│  of business │     │  assign     │     │  assigned page       │
-│              │     │  extractors │     │  ranges              │
-│              │     │  to pages   │     │                      │
-└─────────────┘     └─────────────┘     └──────────┬───────────┘
-                                                   │
-┌─────────────┐     ┌─────────────┐     ┌──────────▼───────────┐
-│ 6. FORMAT   │◀────│ 5. ASSEMBLE │◀────│  4. REVIEW           │
-│             │     │             │     │                      │
-│  Clean up   │     │  Merge all  │     │  Check completeness  │
-│  markdown   │     │  results    │     │  against template,   │
-│  tables,    │     │  into final │     │  dispatch follow-up  │
-│  spacing    │     │  document   │     │  extractors for gaps │
-└──────┬──────┘     └─────────────┘     └──────────────────────┘
-       │
-┌──────▼──────┐
-│ 7. CHUNK    │
-│  Break into │
-│  retrieval- │
-│  ready      │
-│  chunks     │
+┌─────────────┐     ┌──────────────┐     ┌─────────────┐
+│ 1. CLASSIFY │────▶│ 2. PAGE MAP  │────▶│ 3. PLAN     │
+│             │     │              │     │             │
+│ Document    │     │ Assign pages │     │ Build tasks │
+│ type, line  │     │ to focused   │     │ from page   │
+│ of business │     │ extractors   │     │ assignments │
+└─────────────┘     └──────┬───────┘     └──────┬──────┘
+                           │                    │
+                    ┌──────▼────────────────────▼──────┐
+                    │ 4. EXTRACT + MERGE (parallel)    │
+                    │ Run focused extractors, merge    │
+                    │ repeat runs instead of overwrite ├───┐
+                    └──────────────────────────────────┘   │
+                                                           │
+┌─────────────┐     ┌─────────────────────────┐     ┌──────▼──────┐
+│ 7. FORMAT   │◀────│ 6. ASSEMBLE             │◀────│ 5. REVIEW   │
+│             │     │                         │     │             │
+│ Clean up    │     │ Merge all data into     │     │ Check       │
+│ markdown    │     │ final document          │     │ completeness│
+└──────┬──────┘     └─────────────────────────┘     │ and quality │
+       │                                            └─────────────┘
+┌──────▼──────┐
+│ 8. CHUNK    │
+│ Break into  │
+│ retrieval-  │
+│ ready chunks│
 └─────────────┘
 ```
@@ -131,11 +149,21 @@ The coordinator sends the document to `generateObject` with the `ClassifyResultS
 - **Policy types** — one or more lines of business (e.g., `general_liability`, `workers_comp`)
 - **Confidence score**
-#### Phase 2: Plan
+The full document is passed through `providerOptions.pdfBase64` for this step, so your callback must attach that PDF to the model request as a real document/file part.
+#### Phase 2: Page Map
+Before planning tasks, the coordinator maps each page to one or more focused extractors. This reduces the chance that declaration pages and schedule pages get mixed into large generic ranges dominated by form language.
+The page-mapping step uses the relevant PDF pages through `providerOptions.pdfBase64`, chunk by chunk, and produces page-level assignments for the downstream plan.
-Based on the classification, the coordinator selects a **line-of-business template** (e.g., `workers_comp`, `cyber`, `homeowners_ho3`) that defines expected sections and page hints. It then generates an **extraction plan** — a list of tasks that map specific extractors to page ranges within the PDF.
+#### Phase 3: Plan
-#### Phase 3: Extract
+Based on the classification, page map, and **line-of-business template** (e.g., `workers_comp`, `cyber`, `homeowners_ho3`), the coordinator builds an **extraction plan** — a list of focused extractor tasks derived from those page assignments.
+The old prompt-only `plan.ts` module is a deprecation candidate and is no longer the active planning path used by the coordinator.
+#### Phase 4: Extract And Merge
 Focused extractor agents are dispatched **in parallel** (concurrency-limited, default 2). Each extractor targets a specific data domain against its assigned page range. The 11 extractor types are:
@@ -153,17 +181,21 @@ Focused extractor agents are dispatched **in parallel** (concurrency-limited, de
 | `supplementary` | Regulatory context, contacts, TPA, claims contacts |
 | `sections` | Raw section content (fallback for unmatched sections) |
-Each extractor writes its results to an in-memory `Map`. Results accumulate across all extractors.
+Each extractor writes its results to an in-memory `Map`. Repeated extractor runs now **merge** instead of overwriting previous results, which is critical for extractors like `coverage_limits`, `endorsements`, `exclusions`, `conditions`, `sections`, and `declarations`.
+Before each worker call, the SDK slices the requested page range with `extractPageRange()` and passes that page-scoped PDF through `providerOptions.pdfBase64`. If `convertPdfToImages` is configured, it passes `providerOptions.images` instead. The callback layer is responsible for actually including that content in the model input.
-#### Phase 4: Review
+#### Phase 5: Review
-After initial extraction, a review loop (up to `maxReviewRounds`, default 2) checks completeness against the template's expected sections. If gaps are found, additional extractor tasks are dispatched to fill missing data. This iterative refinement ensures comprehensive extraction.
+After initial extraction, a review loop (up to `maxReviewRounds`, default 2) checks both **completeness and quality**. The reviewer sees the full PDF, a page-map summary, and a summary of extracted results. It is expected to catch issues like missing required fields, generic placeholder outputs such as "shown in declarations" or "per schedule", and outputs that appear to come from generic form text instead of declaration/schedule values.
-#### Phase 5: Assemble
+If gaps or quality issues are found, additional focused extractor tasks are dispatched.
+#### Phase 6: Assemble
 All extractor results are merged into a final validated `InsuranceDocument`.
-#### Phase 6: Format
+#### Phase 7: Format
 A formatting agent pass cleans up markdown in all content-bearing string fields (sections, subsections, endorsements, exclusions, conditions, summary). It fixes:
@@ -175,7 +207,7 @@ A formatting agent pass cleans up markdown in all content-bearing string fields
 Content is batched (up to 20 fields per call) and sent through `generateText` for formatting cleanup. Token usage is tracked the same as other pipeline steps.
-#### Phase 7: Chunk
+#### Phase 8: Chunk
 The formatted document is chunked into `DocumentChunk[]` for vector storage. Each chunk gets a deterministic ID of the form `${documentId}:${type}:${index}`.
@@ -206,6 +238,8 @@ const extractor = createExtractor({
 });
 ```
+`tokenUsage` aggregates whatever usage your callbacks return. `usageReporting` tells you how many model calls reported usage versus how many omitted it, so a `0 in / 0 out` result is diagnosable instead of silent.
 ### Line-of-Business Templates
 Templates define what the extraction pipeline expects for each policy type. Each template specifies expected sections, page hints, and required vs. optional fields.

package/dist/index.d.mts CHANGED Viewed

@@ -27908,11 +27908,30 @@ declare const ExtractionPlanSchema: z.ZodObject<{
 }>;
 type ExtractionPlan = z.infer<typeof ExtractionPlanSchema>;
+declare const PageAssignmentSchema: z.ZodObject<{
+    localPageNumber: z.ZodNumber;
+    extractorNames: z.ZodArray<z.ZodEnum<["carrier_info", "named_insured", "coverage_limits", "endorsements", "exclusions", "conditions", "premium_breakdown", "declarations", "loss_history", "sections", "supplementary"]>, "many">;
+    confidence: z.ZodOptional<z.ZodNumber>;
+    notes: z.ZodOptional<z.ZodString>;
+}, "strip", z.ZodTypeAny, {
+    localPageNumber: number;
+    extractorNames: ("declarations" | "supplementary" | "conditions" | "named_insured" | "sections" | "endorsements" | "exclusions" | "loss_history" | "carrier_info" | "coverage_limits" | "premium_breakdown")[];
+    confidence?: number | undefined;
+    notes?: string | undefined;
+}, {
+    localPageNumber: number;
+    extractorNames: ("declarations" | "supplementary" | "conditions" | "named_insured" | "sections" | "endorsements" | "exclusions" | "loss_history" | "carrier_info" | "coverage_limits" | "premium_breakdown")[];
+    confidence?: number | undefined;
+    notes?: string | undefined;
+}>;
+type PageAssignment = z.infer<typeof PageAssignmentSchema>;
 /** Internal state checkpointed between extraction phases. */
 interface ExtractionState {
     id: string;
     pageCount: number;
     classifyResult?: ClassifyResult;
+    pageAssignments?: PageAssignment[];
     plan?: ExtractionPlan;
     memory: Record<string, unknown>;
     document?: InsuranceDocument;
@@ -27934,6 +27953,11 @@ interface ExtractionResult {
     document: InsuranceDocument;
     chunks: DocumentChunk[];
     tokenUsage: TokenUsage;
+    usageReporting: {
+        modelCalls: number;
+        callsWithUsage: number;
+        callsMissingUsage: number;
+    };
     /** Last checkpoint — can be passed as `resumeFrom` to retry from a failure point. */
     checkpoint?: PipelineCheckpoint<ExtractionState>;
 }

package/dist/index.d.ts CHANGED Viewed

@@ -27908,11 +27908,30 @@ declare const ExtractionPlanSchema: z.ZodObject<{
 }>;
 type ExtractionPlan = z.infer<typeof ExtractionPlanSchema>;
+declare const PageAssignmentSchema: z.ZodObject<{
+    localPageNumber: z.ZodNumber;
+    extractorNames: z.ZodArray<z.ZodEnum<["carrier_info", "named_insured", "coverage_limits", "endorsements", "exclusions", "conditions", "premium_breakdown", "declarations", "loss_history", "sections", "supplementary"]>, "many">;
+    confidence: z.ZodOptional<z.ZodNumber>;
+    notes: z.ZodOptional<z.ZodString>;
+}, "strip", z.ZodTypeAny, {
+    localPageNumber: number;
+    extractorNames: ("declarations" | "supplementary" | "conditions" | "named_insured" | "sections" | "endorsements" | "exclusions" | "loss_history" | "carrier_info" | "coverage_limits" | "premium_breakdown")[];
+    confidence?: number | undefined;
+    notes?: string | undefined;
+}, {
+    localPageNumber: number;
+    extractorNames: ("declarations" | "supplementary" | "conditions" | "named_insured" | "sections" | "endorsements" | "exclusions" | "loss_history" | "carrier_info" | "coverage_limits" | "premium_breakdown")[];
+    confidence?: number | undefined;
+    notes?: string | undefined;
+}>;
+type PageAssignment = z.infer<typeof PageAssignmentSchema>;
 /** Internal state checkpointed between extraction phases. */
 interface ExtractionState {
     id: string;
     pageCount: number;
     classifyResult?: ClassifyResult;
+    pageAssignments?: PageAssignment[];
     plan?: ExtractionPlan;
     memory: Record<string, unknown>;
     document?: InsuranceDocument;
@@ -27934,6 +27953,11 @@ interface ExtractionResult {
     document: InsuranceDocument;
     chunks: DocumentChunk[];
     tokenUsage: TokenUsage;
+    usageReporting: {
+        modelCalls: number;
+        callsWithUsage: number;
+        callsMissingUsage: number;
+    };
     /** Last checkpoint — can be passed as `resumeFrom` to retry from a failure point. */
     checkpoint?: PipelineCheckpoint<ExtractionState>;
 }