npm - @claritylabs/cl-sdk - Versions diffs - 0.5.0 → 0.6.0 - Mend

@claritylabs/cl-sdk 0.5.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

package/README.md CHANGED Viewed

@@ -104,14 +104,22 @@ The extraction system uses a **coordinator/worker pattern** — a coordinator ag
 │              │     │  to pages   │     │                      │
 └─────────────┘     └─────────────┘     └──────────┬───────────┘
                                                    │
-                    ┌─────────────┐     ┌──────────▼───────────┐
-                    │ 5. ASSEMBLE │◀────│  4. REVIEW           │
-                    │             │     │                      │
-                    │  Merge all  │     │  Check completeness  │
-                    │  results,   │     │  against template,   │
-                    │  validate,  │     │  dispatch follow-up  │
-                    │  chunk      │     │  extractors for gaps │
-                    └─────────────┘     └──────────────────────┘
+┌─────────────┐     ┌─────────────┐     ┌──────────▼───────────┐
+│ 6. FORMAT   │◀────│ 5. ASSEMBLE │◀────│  4. REVIEW           │
+│             │     │             │     │                      │
+│  Clean up   │     │  Merge all  │     │  Check completeness  │
+│  markdown   │     │  results    │     │  against template,   │
+│  tables,    │     │  into final │     │  dispatch follow-up  │
+│  spacing    │     │  document   │     │  extractors for gaps │
+└──────┬──────┘     └─────────────┘     └──────────────────────┘
+       │
+┌──────▼──────┐
+│ 7. CHUNK    │
+│  Break into │
+│  retrieval- │
+│  ready      │
+│  chunks     │
+└─────────────┘
 ```
 #### Phase 1: Classify
@@ -151,7 +159,23 @@ After initial extraction, a review loop (up to `maxReviewRounds`, default 2) che
 #### Phase 5: Assemble
-All extractor results are merged into a final validated `InsuranceDocument`, then chunked into `DocumentChunk[]` for vector storage. Chunks are deterministically IDed as `${documentId}:${type}:${index}`.
+All extractor results are merged into a final validated `InsuranceDocument`.
+#### Phase 6: Format
+A formatting agent pass cleans up markdown in all content-bearing string fields (sections, subsections, endorsements, exclusions, conditions, summary). It fixes:
+- **Pipe tables missing separator rows** — adds `| --- | --- |` and leading/trailing pipes
+- **Space-aligned tables** — converts whitespace-padded columns into proper markdown tables
+- **Sub-items mixed into tables** — pulls indented sub-items out of tables into lists
+- **Mixed table/prose content** — handles each segment independently
+- **General cleanup** — excessive blank lines, trailing whitespace, orphaned formatting markers
+Content is batched (up to 20 fields per call) and sent through `generateText` for formatting cleanup. Token usage is tracked the same as other pipeline steps.
+#### Phase 7: Chunk
+The formatted document is chunked into `DocumentChunk[]` for vector storage. Chunks are deterministically IDed as `${documentId}:${type}:${index}`.
 ### Configuration

package/dist/index.d.mts CHANGED Viewed

@@ -28723,16 +28723,16 @@ declare const CitationSchema: z.ZodObject<{
     documentId: string;
     relevance: number;
     index: number;
-    documentType?: "policy" | "quote" | undefined;
     field?: string | undefined;
+    documentType?: "policy" | "quote" | undefined;
 }, {
     quote: string;
     chunkId: string;
     documentId: string;
     relevance: number;
     index: number;
-    documentType?: "policy" | "quote" | undefined;
     field?: string | undefined;
+    documentType?: "policy" | "quote" | undefined;
 }>;
 type Citation = z.infer<typeof CitationSchema>;
 declare const SubAnswerSchema: z.ZodObject<{
@@ -28752,16 +28752,16 @@ declare const SubAnswerSchema: z.ZodObject<{
         documentId: string;
         relevance: number;
         index: number;
-        documentType?: "policy" | "quote" | undefined;
         field?: string | undefined;
+        documentType?: "policy" | "quote" | undefined;
     }, {
         quote: string;
         chunkId: string;
         documentId: string;
         relevance: number;
         index: number;
-        documentType?: "policy" | "quote" | undefined;
         field?: string | undefined;
+        documentType?: "policy" | "quote" | undefined;
     }>, "many">;
     confidence: z.ZodNumber;
     needsMoreContext: z.ZodBoolean;
@@ -28775,8 +28775,8 @@ declare const SubAnswerSchema: z.ZodObject<{
         documentId: string;
         relevance: number;
         index: number;
-        documentType?: "policy" | "quote" | undefined;
         field?: string | undefined;
+        documentType?: "policy" | "quote" | undefined;
     }[];
     needsMoreContext: boolean;
 }, {
@@ -28789,8 +28789,8 @@ declare const SubAnswerSchema: z.ZodObject<{
         documentId: string;
         relevance: number;
         index: number;
-        documentType?: "policy" | "quote" | undefined;
         field?: string | undefined;
+        documentType?: "policy" | "quote" | undefined;
     }[];
     needsMoreContext: boolean;
 }>;
@@ -28825,16 +28825,16 @@ declare const QueryResultSchema: z.ZodObject<{
         documentId: string;
         relevance: number;
         index: number;
-        documentType?: "policy" | "quote" | undefined;
         field?: string | undefined;
+        documentType?: "policy" | "quote" | undefined;
     }, {
         quote: string;
         chunkId: string;
         documentId: string;
         relevance: number;
         index: number;
-        documentType?: "policy" | "quote" | undefined;
         field?: string | undefined;
+        documentType?: "policy" | "quote" | undefined;
     }>, "many">;
     intent: z.ZodEnum<["policy_question", "coverage_comparison", "document_search", "claims_inquiry", "general_knowledge"]>;
     confidence: z.ZodNumber;
@@ -28849,8 +28849,8 @@ declare const QueryResultSchema: z.ZodObject<{
         documentId: string;
         relevance: number;
         index: number;
-        documentType?: "policy" | "quote" | undefined;
         field?: string | undefined;
+        documentType?: "policy" | "quote" | undefined;
     }[];
     followUp?: string | undefined;
 }, {
@@ -28863,8 +28863,8 @@ declare const QueryResultSchema: z.ZodObject<{
         documentId: string;
         relevance: number;
         index: number;
-        documentType?: "policy" | "quote" | undefined;
         field?: string | undefined;
+        documentType?: "policy" | "quote" | undefined;
     }[];
     followUp?: string | undefined;
 }>;

package/dist/index.d.ts CHANGED Viewed

@@ -28723,16 +28723,16 @@ declare const CitationSchema: z.ZodObject<{
     documentId: string;
     relevance: number;
     index: number;
-    documentType?: "policy" | "quote" | undefined;
     field?: string | undefined;
+    documentType?: "policy" | "quote" | undefined;
 }, {
     quote: string;
     chunkId: string;
     documentId: string;
     relevance: number;
     index: number;
-    documentType?: "policy" | "quote" | undefined;
     field?: string | undefined;
+    documentType?: "policy" | "quote" | undefined;
 }>;
 type Citation = z.infer<typeof CitationSchema>;
 declare const SubAnswerSchema: z.ZodObject<{
@@ -28752,16 +28752,16 @@ declare const SubAnswerSchema: z.ZodObject<{
         documentId: string;
         relevance: number;
         index: number;
-        documentType?: "policy" | "quote" | undefined;
         field?: string | undefined;
+        documentType?: "policy" | "quote" | undefined;
     }, {
         quote: string;
         chunkId: string;
         documentId: string;
         relevance: number;
         index: number;
-        documentType?: "policy" | "quote" | undefined;
         field?: string | undefined;
+        documentType?: "policy" | "quote" | undefined;
     }>, "many">;
     confidence: z.ZodNumber;
     needsMoreContext: z.ZodBoolean;
@@ -28775,8 +28775,8 @@ declare const SubAnswerSchema: z.ZodObject<{
         documentId: string;
         relevance: number;
         index: number;
-        documentType?: "policy" | "quote" | undefined;
         field?: string | undefined;
+        documentType?: "policy" | "quote" | undefined;
     }[];
     needsMoreContext: boolean;
 }, {
@@ -28789,8 +28789,8 @@ declare const SubAnswerSchema: z.ZodObject<{
         documentId: string;
         relevance: number;
         index: number;
-        documentType?: "policy" | "quote" | undefined;
         field?: string | undefined;
+        documentType?: "policy" | "quote" | undefined;
     }[];
     needsMoreContext: boolean;
 }>;
@@ -28825,16 +28825,16 @@ declare const QueryResultSchema: z.ZodObject<{
         documentId: string;
         relevance: number;
         index: number;
-        documentType?: "policy" | "quote" | undefined;
         field?: string | undefined;
+        documentType?: "policy" | "quote" | undefined;
     }, {
         quote: string;
         chunkId: string;
         documentId: string;
         relevance: number;
         index: number;
-        documentType?: "policy" | "quote" | undefined;
         field?: string | undefined;
+        documentType?: "policy" | "quote" | undefined;
     }>, "many">;
     intent: z.ZodEnum<["policy_question", "coverage_comparison", "document_search", "claims_inquiry", "general_knowledge"]>;
     confidence: z.ZodNumber;
@@ -28849,8 +28849,8 @@ declare const QueryResultSchema: z.ZodObject<{
         documentId: string;
         relevance: number;
         index: number;
-        documentType?: "policy" | "quote" | undefined;
         field?: string | undefined;
+        documentType?: "policy" | "quote" | undefined;
     }[];
     followUp?: string | undefined;
 }, {
@@ -28863,8 +28863,8 @@ declare const QueryResultSchema: z.ZodObject<{
         documentId: string;
         relevance: number;
         index: number;
-        documentType?: "policy" | "quote" | undefined;
         field?: string | undefined;
+        documentType?: "policy" | "quote" | undefined;
     }[];
     followUp?: string | undefined;
 }>;

package/dist/index.js CHANGED Viewed

@@ -1700,6 +1700,206 @@ function assembleDocument(documentId, documentType, memory) {
   };
 }
+// src/prompts/coordinator/format.ts
+function buildFormatPrompt(entries) { // Builds the prompt text that asks a model to clean up the markdown of the given { id, text } entries.
+  const block = entries.map((e) => `===ENTRY ${e.id}===
+${e.text}`).join("\n\n"); // Each entry becomes an "===ENTRY <id>===" header line followed by its text; entries are separated by a blank line.
+  return `You are a markdown formatting specialist for insurance document content. You will receive numbered content entries extracted from insurance policies, quotes, and endorsements. Your job is to clean up the formatting so every entry renders correctly as standard markdown.
+## Primary issues to fix
+### 1. Pipe-delimited data missing table syntax
+The most common issue. Content uses pipe characters as column separators but is missing the separator row required for markdown table rendering.
+Before (broken \u2014 won't render as a table):
+COVERAGE | FORM # | LIMIT | DEDUCTIBLE
+Employee Theft | | $10,000 | $1,000
+After (valid markdown table):
+| COVERAGE | FORM # | LIMIT | DEDUCTIBLE |
+| --- | --- | --- | --- |
+| Employee Theft | | $10,000 | $1,000 |
+Rules for pipe tables:
+- Add leading and trailing pipes to every row
+- Add the separator row (| --- | --- |) after the header row
+- Every row must have the same number of pipe-separated columns as the header
+- Empty cells are fine \u2014 just keep the pipes: | | $10,000 |
+### 2. Sub-items indented within pipe tables
+Insurance schedules often have indented sub-items that belong to the previous coverage line. These break table column counts.
+Before (broken):
+COVERAGE | LIMIT | DEDUCTIBLE
+Causes Of Loss - Equipment Breakdown | PR650END
+  Described Premises Limit | | $350,804 |
+  Diagnostic Equipment | | $100,000 |
+  Deductible Type - Business Income: Waiting Period - Hours
+  Waiting Period (Hours): 24
+After: Pull sub-items out of the table. End the table before the sub-items, show them as an indented list, then start a new table if tabular data resumes:
+| COVERAGE | LIMIT | DEDUCTIBLE |
+| --- | --- | --- |
+| Causes Of Loss - Equipment Breakdown | PR650END | |
+- Described Premises Limit: $350,804
+- Diagnostic Equipment: $100,000
+- Deductible Type - Business Income: Waiting Period - Hours
+- Waiting Period (Hours): 24
+### 3. Space-aligned tables
+Declarations often align columns with spaces instead of pipes. These render as plain monospace text and lose structure.
+Before:
+Coverage                               Limit of Liability    Retention
+A. Network Security Liability          $500,000              $10,000
+B. Privacy Liability                   $500,000              $10,000
+After (convert to proper markdown table):
+| Coverage | Limit of Liability | Retention |
+| --- | --- | --- |
+| A. Network Security Liability | $500,000 | $10,000 |
+| B. Privacy Liability | $500,000 | $10,000 |
+### 4. Mixed table/prose content
+A single entry often contains prose paragraphs followed by tabular data followed by more prose. Handle each segment independently \u2014 don't try to force everything into one table.
+### 5. General markdown cleanup
+- **Line spacing**: Remove excessive blank lines (3+ consecutive newlines \u2192 2). Ensure one blank line before and after tables and headings.
+- **Trailing whitespace**: Remove trailing spaces on all lines.
+- **Broken lists**: Ensure list items use consistent markers (-, *, or 1.) with proper nesting indentation.
+- **Orphaned formatting**: Close any unclosed bold (**), italic (*), or code (\`) markers.
+- **Heading levels**: Ensure heading markers (##) have a space after the hashes.
+## Rules
+- Do NOT change the meaning or substance of any content. Only fix formatting.
+- Do NOT add new information, headers, or commentary.
+- Do NOT wrap entries in code fences.
+- Preserve all dollar amounts, dates, policy numbers, form numbers, and technical terms exactly as they appear.
+- If an entry is already well-formatted, return it unchanged.
+- When in doubt about whether something is a table, prefer table formatting for structured data with multiple columns.
+Return your output in this exact format \u2014 one block per entry, in the same order:
+===ENTRY 0===
+(cleaned content for entry 0)
+===ENTRY 1===
+(cleaned content for entry 1)
+...and so on for each entry.
+Here are the entries to format:
+${block}`; // The entry block is appended after the instructions; the reply is requested in the same ===ENTRY n=== framing.
+}
+// src/extraction/formatter.ts
+function collectContentFields(doc) { // Gathers the document's content strings into a flat list of { id, path, text } entries.
+  const entries = [];
+  let id = 0; // Ids are assigned sequentially, in collection order, starting at 0.
+  function add(path, text) { // Records one field; path is a string locator such as "sections[0].content".
+    if (text && text.length > 20) { // Missing/empty text and text of 20 characters or fewer is skipped.
+      entries.push({ id: id++, path, text });
+    }
+  }
+  add("summary", doc.summary); // Top-level summary is collected first.
+  if (doc.sections) {
+    for (let i = 0; i < doc.sections.length; i++) {
+      const s = doc.sections[i];
+      add(`sections[${i}].content`, s.content);
+      if (s.subsections) { // Only one level of subsections is visited.
+        for (let j = 0; j < s.subsections.length; j++) {
+          add(`sections[${i}].subsections[${j}].content`, s.subsections[j].content);
+        }
+      }
+    }
+  }
+  if (doc.endorsements) { // endorsements, exclusions and conditions each contribute their items' content field.
+    for (let i = 0; i < doc.endorsements.length; i++) {
+      add(`endorsements[${i}].content`, doc.endorsements[i].content);
+    }
+  }
+  if (doc.exclusions) {
+    for (let i = 0; i < doc.exclusions.length; i++) {
+      add(`exclusions[${i}].content`, doc.exclusions[i].content);
+    }
+  }
+  if (doc.conditions) {
+    for (let i = 0; i < doc.conditions.length; i++) {
+      add(`conditions[${i}].content`, doc.conditions[i].content);
+    }
+  }
+  return entries; // Empty array when no field passed the length check.
+}
+function parseFormatResponse(response) { // Parses a reply framed with ===ENTRY n=== markers into a Map of entry id -> cleaned text.
+  const results = /* @__PURE__ */ new Map();
+  const parts = response.split(/===ENTRY (\d+)===/); // Splitting on a regex with a capture group yields [preamble, id, body, id, body, ...].
+  for (let i = 1; i < parts.length; i += 2) { // Starts at 1 so any text before the first marker is ignored; steps over (id, body) pairs.
+    const entryId = parseInt(parts[i], 10);
+    const content = parts[i + 1]?.trim(); // Surrounding whitespace is dropped. NOTE(review): a reply cut off mid-entry is still accepted as that entry's body - confirm this is acceptable.
+    if (!isNaN(entryId) && content !== void 0) {
+      results.set(entryId, content); // A repeated id overwrites the earlier body.
+    }
+  }
+  return results;
+}
+function applyFormattedContent(doc, entries, formatted) { // Writes cleaned text from the formatted Map back into doc (mutated in place) at each entry's path.
+  for (const entry of entries) {
+    const cleaned = formatted.get(entry.id);
+    if (!cleaned) continue; // Skips entries missing from the Map and entries whose cleaned text is empty, leaving the original value.
+    const segments = entry.path.match(/^(\w+)(?:\[(\d+)\])?(?:\.(\w+)(?:\[(\d+)\])?(?:\.(\w+))?)?$/); // Matches paths shaped like "field", "field[i].prop" or "field[i].prop[j].prop".
+    if (!segments) continue; // A path that does not match is skipped without error.
+    const [, field, idx1, sub1, idx2, sub2] = segments;
+    if (!sub1) { // No dotted part: a top-level field such as "summary".
+      doc[field] = cleaned;
+    } else if (!sub2) { // One level deep: doc[field][idx1][sub1].
+      const arr = doc[field];
+      if (arr && arr[Number(idx1)]) { // Written only if the array and the indexed element exist.
+        arr[Number(idx1)][sub1] = cleaned;
+      }
+    } else { // Two levels deep: doc[field][idx1][sub1][idx2][sub2].
+      const arr = doc[field];
+      if (arr && arr[Number(idx1)]) {
+        const nested = arr[Number(idx1)][sub1];
+        if (nested && nested[Number(idx2)]) {
+          nested[Number(idx2)][sub2] = cleaned;
+        }
+      }
+    }
+  }
+}
+var MAX_ENTRIES_PER_BATCH = 20; // Upper bound on the number of entries placed in a single generateText prompt.
+async function formatDocumentContent(doc, generateText, options) { // Runs the formatting pass over doc's content fields; resolves to { document, usage }.
+  const entries = collectContentFields(doc);
+  const totalUsage = { inputTokens: 0, outputTokens: 0 }; // Accumulated across all batches.
+  if (entries.length === 0) { // Nothing to format: return without calling generateText.
+    return { document: doc, usage: totalUsage };
+  }
+  options?.onProgress?.(`Formatting ${entries.length} content fields...`);
+  const batches = [];
+  for (let i = 0; i < entries.length; i += MAX_ENTRIES_PER_BATCH) { // Consecutive slices of at most MAX_ENTRIES_PER_BATCH entries.
+    batches.push(entries.slice(i, i + MAX_ENTRIES_PER_BATCH));
+  }
+  for (const batch of batches) { // Batches are awaited one after another, not in parallel.
+    const prompt = buildFormatPrompt(batch.map((e) => ({ id: e.id, text: e.text }))); // Only id and text go into the prompt; ids keep their document-wide numbering. NOTE(review): the prompt's sample output starts at ENTRY 0, so batches after the first rely on the model echoing the ids it was given - confirm.
+    const result = await withRetry( // withRetry is defined outside this block; presumably it re-invokes the callback on failure - verify.
+      () => generateText({
+        prompt,
+        maxTokens: 16384,
+        providerOptions: options?.providerOptions
+      })
+    );
+    if (result.usage) { // Usage is added only when the call reports it.
+      totalUsage.inputTokens += result.usage.inputTokens;
+      totalUsage.outputTokens += result.usage.outputTokens;
+    }
+    const formatted = parseFormatResponse(result.text);
+    applyFormattedContent(doc, batch, formatted); // Mutates doc; entries absent from the reply keep their original text.
+  }
+  return { document: doc, usage: totalUsage }; // document is the same object that was passed in.
+}
 // src/extraction/chunking.ts
 function chunkDocument(doc) {
   const chunks = [];
@@ -3195,8 +3395,14 @@ function createExtractor(config) {
     }
     onProgress?.("Assembling document...");
     const document = assembleDocument(id, documentType, memory);
-    const chunks = chunkDocument(document);
-    return { document, chunks, tokenUsage: totalUsage };
+    onProgress?.("Formatting extracted content...");
+    const formatResult = await formatDocumentContent(document, generateText, {
+      providerOptions,
+      onProgress
+    });
+    trackUsage(formatResult.usage);
+    const chunks = chunkDocument(formatResult.document);
+    return { document: formatResult.document, chunks, tokenUsage: totalUsage };
   }
   return { extract };
 }