@claritylabs/cl-sdk 0.5.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -104,14 +104,22 @@ The extraction system uses a **coordinator/worker pattern** — a coordinator ag
104
104
  │ │ │ to pages │ │ │
105
105
  └─────────────┘ └─────────────┘ └──────────┬───────────┘
106
106
 
107
-                     ┌─────────────┐     ┌──────────▼───────────┐
108
-                     │ 5. ASSEMBLE │◀────│      4. REVIEW       │
109
-                     │             │     │                      │
110
-                     │  Merge all  │     │  Check completeness  │
111
-                     │  results,   │     │  against template,   │
112
-                     │  validate,  │     │  dispatch follow-up  │
113
-                     │  chunk      │     │  extractors for gaps │
114
-                     └─────────────┘     └──────────────────────┘
107
+ ┌─────────────┐     ┌─────────────┐     ┌──────────▼───────────┐
108
+ │  6. FORMAT  │◀────│ 5. ASSEMBLE │◀────│      4. REVIEW       │
109
+ │             │     │             │     │                      │
110
+ │  Clean up   │     │  Merge all  │     │  Check completeness  │
111
+ │  markdown   │     │  results    │     │  against template,   │
112
+ │  tables,    │     │  into final │     │  dispatch follow-up  │
113
+ │  spacing    │     │  document   │     │  extractors for gaps │
114
+ └──────┬──────┘     └─────────────┘     └──────────────────────┘
115
+        │
116
+ ┌──────▼──────┐
117
+ │  7. CHUNK   │
118
+ │  Break into │
119
+ │  retrieval- │
120
+ │  ready      │
121
+ │  chunks     │
122
+ └─────────────┘
115
123
  ```
116
124
 
117
125
  #### Phase 1: Classify
@@ -151,7 +159,23 @@ After initial extraction, a review loop (up to `maxReviewRounds`, default 2) che
151
159
 
152
160
  #### Phase 5: Assemble
153
161
 
154
- All extractor results are merged into a final validated `InsuranceDocument`, then chunked into `DocumentChunk[]` for vector storage. Chunks are deterministically IDed as `${documentId}:${type}:${index}`.
162
+ All extractor results are merged into a final validated `InsuranceDocument`.
163
+
164
+ #### Phase 6: Format
165
+
166
+ A formatting agent pass cleans up markdown in all content-bearing string fields (sections, subsections, endorsements, exclusions, conditions, summary). It fixes:
167
+
168
+ - **Pipe tables missing separator rows** — adds `| --- | --- |` and leading/trailing pipes
169
+ - **Space-aligned tables** — converts whitespace-padded columns into proper markdown tables
170
+ - **Sub-items mixed into tables** — pulls indented sub-items out of tables into lists
171
+ - **Mixed table/prose content** — handles each segment independently
172
+ - **General cleanup** — excessive blank lines, trailing whitespace, orphaned formatting markers
173
+
174
+ Content is batched (up to 20 fields per call) and sent through `generateText` for formatting cleanup. Token usage is tracked the same as other pipeline steps.
175
+
176
+ #### Phase 7: Chunk
177
+
178
+ The formatted document is chunked into `DocumentChunk[]` for vector storage. Chunks are deterministically IDed as `${documentId}:${type}:${index}`.
155
179
 
156
180
  ### Configuration
157
181
 
package/dist/index.d.mts CHANGED
@@ -28723,16 +28723,16 @@ declare const CitationSchema: z.ZodObject<{
28723
28723
  documentId: string;
28724
28724
  relevance: number;
28725
28725
  index: number;
28726
- documentType?: "policy" | "quote" | undefined;
28727
28726
  field?: string | undefined;
28727
+ documentType?: "policy" | "quote" | undefined;
28728
28728
  }, {
28729
28729
  quote: string;
28730
28730
  chunkId: string;
28731
28731
  documentId: string;
28732
28732
  relevance: number;
28733
28733
  index: number;
28734
- documentType?: "policy" | "quote" | undefined;
28735
28734
  field?: string | undefined;
28735
+ documentType?: "policy" | "quote" | undefined;
28736
28736
  }>;
28737
28737
  type Citation = z.infer<typeof CitationSchema>;
28738
28738
  declare const SubAnswerSchema: z.ZodObject<{
@@ -28752,16 +28752,16 @@ declare const SubAnswerSchema: z.ZodObject<{
28752
28752
  documentId: string;
28753
28753
  relevance: number;
28754
28754
  index: number;
28755
- documentType?: "policy" | "quote" | undefined;
28756
28755
  field?: string | undefined;
28756
+ documentType?: "policy" | "quote" | undefined;
28757
28757
  }, {
28758
28758
  quote: string;
28759
28759
  chunkId: string;
28760
28760
  documentId: string;
28761
28761
  relevance: number;
28762
28762
  index: number;
28763
- documentType?: "policy" | "quote" | undefined;
28764
28763
  field?: string | undefined;
28764
+ documentType?: "policy" | "quote" | undefined;
28765
28765
  }>, "many">;
28766
28766
  confidence: z.ZodNumber;
28767
28767
  needsMoreContext: z.ZodBoolean;
@@ -28775,8 +28775,8 @@ declare const SubAnswerSchema: z.ZodObject<{
28775
28775
  documentId: string;
28776
28776
  relevance: number;
28777
28777
  index: number;
28778
- documentType?: "policy" | "quote" | undefined;
28779
28778
  field?: string | undefined;
28779
+ documentType?: "policy" | "quote" | undefined;
28780
28780
  }[];
28781
28781
  needsMoreContext: boolean;
28782
28782
  }, {
@@ -28789,8 +28789,8 @@ declare const SubAnswerSchema: z.ZodObject<{
28789
28789
  documentId: string;
28790
28790
  relevance: number;
28791
28791
  index: number;
28792
- documentType?: "policy" | "quote" | undefined;
28793
28792
  field?: string | undefined;
28793
+ documentType?: "policy" | "quote" | undefined;
28794
28794
  }[];
28795
28795
  needsMoreContext: boolean;
28796
28796
  }>;
@@ -28825,16 +28825,16 @@ declare const QueryResultSchema: z.ZodObject<{
28825
28825
  documentId: string;
28826
28826
  relevance: number;
28827
28827
  index: number;
28828
- documentType?: "policy" | "quote" | undefined;
28829
28828
  field?: string | undefined;
28829
+ documentType?: "policy" | "quote" | undefined;
28830
28830
  }, {
28831
28831
  quote: string;
28832
28832
  chunkId: string;
28833
28833
  documentId: string;
28834
28834
  relevance: number;
28835
28835
  index: number;
28836
- documentType?: "policy" | "quote" | undefined;
28837
28836
  field?: string | undefined;
28837
+ documentType?: "policy" | "quote" | undefined;
28838
28838
  }>, "many">;
28839
28839
  intent: z.ZodEnum<["policy_question", "coverage_comparison", "document_search", "claims_inquiry", "general_knowledge"]>;
28840
28840
  confidence: z.ZodNumber;
@@ -28849,8 +28849,8 @@ declare const QueryResultSchema: z.ZodObject<{
28849
28849
  documentId: string;
28850
28850
  relevance: number;
28851
28851
  index: number;
28852
- documentType?: "policy" | "quote" | undefined;
28853
28852
  field?: string | undefined;
28853
+ documentType?: "policy" | "quote" | undefined;
28854
28854
  }[];
28855
28855
  followUp?: string | undefined;
28856
28856
  }, {
@@ -28863,8 +28863,8 @@ declare const QueryResultSchema: z.ZodObject<{
28863
28863
  documentId: string;
28864
28864
  relevance: number;
28865
28865
  index: number;
28866
- documentType?: "policy" | "quote" | undefined;
28867
28866
  field?: string | undefined;
28867
+ documentType?: "policy" | "quote" | undefined;
28868
28868
  }[];
28869
28869
  followUp?: string | undefined;
28870
28870
  }>;
package/dist/index.d.ts CHANGED
@@ -28723,16 +28723,16 @@ declare const CitationSchema: z.ZodObject<{
28723
28723
  documentId: string;
28724
28724
  relevance: number;
28725
28725
  index: number;
28726
- documentType?: "policy" | "quote" | undefined;
28727
28726
  field?: string | undefined;
28727
+ documentType?: "policy" | "quote" | undefined;
28728
28728
  }, {
28729
28729
  quote: string;
28730
28730
  chunkId: string;
28731
28731
  documentId: string;
28732
28732
  relevance: number;
28733
28733
  index: number;
28734
- documentType?: "policy" | "quote" | undefined;
28735
28734
  field?: string | undefined;
28735
+ documentType?: "policy" | "quote" | undefined;
28736
28736
  }>;
28737
28737
  type Citation = z.infer<typeof CitationSchema>;
28738
28738
  declare const SubAnswerSchema: z.ZodObject<{
@@ -28752,16 +28752,16 @@ declare const SubAnswerSchema: z.ZodObject<{
28752
28752
  documentId: string;
28753
28753
  relevance: number;
28754
28754
  index: number;
28755
- documentType?: "policy" | "quote" | undefined;
28756
28755
  field?: string | undefined;
28756
+ documentType?: "policy" | "quote" | undefined;
28757
28757
  }, {
28758
28758
  quote: string;
28759
28759
  chunkId: string;
28760
28760
  documentId: string;
28761
28761
  relevance: number;
28762
28762
  index: number;
28763
- documentType?: "policy" | "quote" | undefined;
28764
28763
  field?: string | undefined;
28764
+ documentType?: "policy" | "quote" | undefined;
28765
28765
  }>, "many">;
28766
28766
  confidence: z.ZodNumber;
28767
28767
  needsMoreContext: z.ZodBoolean;
@@ -28775,8 +28775,8 @@ declare const SubAnswerSchema: z.ZodObject<{
28775
28775
  documentId: string;
28776
28776
  relevance: number;
28777
28777
  index: number;
28778
- documentType?: "policy" | "quote" | undefined;
28779
28778
  field?: string | undefined;
28779
+ documentType?: "policy" | "quote" | undefined;
28780
28780
  }[];
28781
28781
  needsMoreContext: boolean;
28782
28782
  }, {
@@ -28789,8 +28789,8 @@ declare const SubAnswerSchema: z.ZodObject<{
28789
28789
  documentId: string;
28790
28790
  relevance: number;
28791
28791
  index: number;
28792
- documentType?: "policy" | "quote" | undefined;
28793
28792
  field?: string | undefined;
28793
+ documentType?: "policy" | "quote" | undefined;
28794
28794
  }[];
28795
28795
  needsMoreContext: boolean;
28796
28796
  }>;
@@ -28825,16 +28825,16 @@ declare const QueryResultSchema: z.ZodObject<{
28825
28825
  documentId: string;
28826
28826
  relevance: number;
28827
28827
  index: number;
28828
- documentType?: "policy" | "quote" | undefined;
28829
28828
  field?: string | undefined;
28829
+ documentType?: "policy" | "quote" | undefined;
28830
28830
  }, {
28831
28831
  quote: string;
28832
28832
  chunkId: string;
28833
28833
  documentId: string;
28834
28834
  relevance: number;
28835
28835
  index: number;
28836
- documentType?: "policy" | "quote" | undefined;
28837
28836
  field?: string | undefined;
28837
+ documentType?: "policy" | "quote" | undefined;
28838
28838
  }>, "many">;
28839
28839
  intent: z.ZodEnum<["policy_question", "coverage_comparison", "document_search", "claims_inquiry", "general_knowledge"]>;
28840
28840
  confidence: z.ZodNumber;
@@ -28849,8 +28849,8 @@ declare const QueryResultSchema: z.ZodObject<{
28849
28849
  documentId: string;
28850
28850
  relevance: number;
28851
28851
  index: number;
28852
- documentType?: "policy" | "quote" | undefined;
28853
28852
  field?: string | undefined;
28853
+ documentType?: "policy" | "quote" | undefined;
28854
28854
  }[];
28855
28855
  followUp?: string | undefined;
28856
28856
  }, {
@@ -28863,8 +28863,8 @@ declare const QueryResultSchema: z.ZodObject<{
28863
28863
  documentId: string;
28864
28864
  relevance: number;
28865
28865
  index: number;
28866
- documentType?: "policy" | "quote" | undefined;
28867
28866
  field?: string | undefined;
28867
+ documentType?: "policy" | "quote" | undefined;
28868
28868
  }[];
28869
28869
  followUp?: string | undefined;
28870
28870
  }>;
package/dist/index.js CHANGED
@@ -1700,6 +1700,206 @@ function assembleDocument(documentId, documentType, memory) {
1700
1700
  };
1701
1701
  }
1702
1702
 
1703
+ // src/prompts/coordinator/format.ts
1704
+ function buildFormatPrompt(entries) {
1705
+ const block = entries.map((e) => `===ENTRY ${e.id}===
1706
+ ${e.text}`).join("\n\n");
1707
+ return `You are a markdown formatting specialist for insurance document content. You will receive numbered content entries extracted from insurance policies, quotes, and endorsements. Your job is to clean up the formatting so every entry renders correctly as standard markdown.
1708
+
1709
+ ## Primary issues to fix
1710
+
1711
+ ### 1. Pipe-delimited data missing table syntax
1712
+ The most common issue. Content uses pipe characters as column separators but is missing the separator row required for markdown table rendering.
1713
+
1714
+ Before (broken \u2014 won't render as a table):
1715
+ COVERAGE | FORM # | LIMIT | DEDUCTIBLE
1716
+ Employee Theft | | $10,000 | $1,000
1717
+
1718
+ After (valid markdown table):
1719
+ | COVERAGE | FORM # | LIMIT | DEDUCTIBLE |
1720
+ | --- | --- | --- | --- |
1721
+ | Employee Theft | | $10,000 | $1,000 |
1722
+
1723
+ Rules for pipe tables:
1724
+ - Add leading and trailing pipes to every row
1725
+ - Add the separator row (| --- | --- |) after the header row
1726
+ - Every row must have the same number of pipe-separated columns as the header
1727
+ - Empty cells are fine \u2014 just keep the pipes: | | $10,000 |
1728
+
1729
+ ### 2. Sub-items indented within pipe tables
1730
+ Insurance schedules often have indented sub-items that belong to the previous coverage line. These break table column counts.
1731
+
1732
+ Before (broken):
1733
+ COVERAGE | LIMIT | DEDUCTIBLE
1734
+ Causes Of Loss - Equipment Breakdown | PR650END
1735
+ Described Premises Limit | | $350,804 |
1736
+ Diagnostic Equipment | | $100,000 |
1737
+ Deductible Type - Business Income: Waiting Period - Hours
1738
+ Waiting Period (Hours): 24
1739
+
1740
+ After: Pull sub-items out of the table. End the table before the sub-items, show them as an indented list, then start a new table if tabular data resumes:
1741
+ | COVERAGE | LIMIT | DEDUCTIBLE |
1742
+ | --- | --- | --- |
1743
+ | Causes Of Loss - Equipment Breakdown | PR650END | |
1744
+
1745
+ - Described Premises Limit: $350,804
1746
+ - Diagnostic Equipment: $100,000
1747
+ - Deductible Type - Business Income: Waiting Period - Hours
1748
+ - Waiting Period (Hours): 24
1749
+
1750
+ ### 3. Space-aligned tables
1751
+ Declarations often align columns with spaces instead of pipes. These render as plain monospace text and lose structure.
1752
+
1753
+ Before:
1754
+ Coverage Limit of Liability Retention
1755
+ A. Network Security Liability $500,000 $10,000
1756
+ B. Privacy Liability $500,000 $10,000
1757
+
1758
+ After (convert to proper markdown table):
1759
+ | Coverage | Limit of Liability | Retention |
1760
+ | --- | --- | --- |
1761
+ | A. Network Security Liability | $500,000 | $10,000 |
1762
+ | B. Privacy Liability | $500,000 | $10,000 |
1763
+
1764
+ ### 4. Mixed table/prose content
1765
+ A single entry often contains prose paragraphs followed by tabular data followed by more prose. Handle each segment independently \u2014 don't try to force everything into one table.
1766
+
1767
+ ### 5. General markdown cleanup
1768
+ - **Line spacing**: Remove excessive blank lines (3+ consecutive newlines \u2192 2). Ensure one blank line before and after tables and headings.
1769
+ - **Trailing whitespace**: Remove trailing spaces on all lines.
1770
+ - **Broken lists**: Ensure list items use consistent markers (-, *, or 1.) with proper nesting indentation.
1771
+ - **Orphaned formatting**: Close any unclosed bold (**), italic (*), or code (\`) markers.
1772
+ - **Heading levels**: Ensure heading markers (##) have a space after the hashes.
1773
+
1774
+ ## Rules
1775
+ - Do NOT change the meaning or substance of any content. Only fix formatting.
1776
+ - Do NOT add new information, headers, or commentary.
1777
+ - Do NOT wrap entries in code fences.
1778
+ - Preserve all dollar amounts, dates, policy numbers, form numbers, and technical terms exactly as they appear.
1779
+ - If an entry is already well-formatted, return it unchanged.
1780
+ - When in doubt about whether something is a table, prefer table formatting for structured data with multiple columns.
1781
+
1782
+ Return your output in this exact format \u2014 one block per entry, in the same order:
1783
+
1784
+ ===ENTRY 0===
1785
+ (cleaned content for entry 0)
1786
+
1787
+ ===ENTRY 1===
1788
+ (cleaned content for entry 1)
1789
+
1790
+ ...and so on for each entry.
1791
+
1792
+ Here are the entries to format:
1793
+
1794
+ ${block}`;
1795
+ }
1796
+
1797
+ // src/extraction/formatter.ts
1798
+ function collectContentFields(doc) {
1799
+ const entries = [];
1800
+ let id = 0;
1801
+ function add(path, text) {
1802
+ if (text && text.length > 20) {
1803
+ entries.push({ id: id++, path, text });
1804
+ }
1805
+ }
1806
+ add("summary", doc.summary);
1807
+ if (doc.sections) {
1808
+ for (let i = 0; i < doc.sections.length; i++) {
1809
+ const s = doc.sections[i];
1810
+ add(`sections[${i}].content`, s.content);
1811
+ if (s.subsections) {
1812
+ for (let j = 0; j < s.subsections.length; j++) {
1813
+ add(`sections[${i}].subsections[${j}].content`, s.subsections[j].content);
1814
+ }
1815
+ }
1816
+ }
1817
+ }
1818
+ if (doc.endorsements) {
1819
+ for (let i = 0; i < doc.endorsements.length; i++) {
1820
+ add(`endorsements[${i}].content`, doc.endorsements[i].content);
1821
+ }
1822
+ }
1823
+ if (doc.exclusions) {
1824
+ for (let i = 0; i < doc.exclusions.length; i++) {
1825
+ add(`exclusions[${i}].content`, doc.exclusions[i].content);
1826
+ }
1827
+ }
1828
+ if (doc.conditions) {
1829
+ for (let i = 0; i < doc.conditions.length; i++) {
1830
+ add(`conditions[${i}].content`, doc.conditions[i].content);
1831
+ }
1832
+ }
1833
+ return entries;
1834
+ }
1835
+ function parseFormatResponse(response) {
1836
+ const results = /* @__PURE__ */ new Map();
1837
+ const parts = response.split(/===ENTRY (\d+)===/);
1838
+ for (let i = 1; i < parts.length; i += 2) {
1839
+ const entryId = parseInt(parts[i], 10);
1840
+ const content = parts[i + 1]?.trim();
1841
+ if (!isNaN(entryId) && content !== void 0) {
1842
+ results.set(entryId, content);
1843
+ }
1844
+ }
1845
+ return results;
1846
+ }
1847
+ function applyFormattedContent(doc, entries, formatted) {
1848
+ for (const entry of entries) {
1849
+ const cleaned = formatted.get(entry.id);
1850
+ if (!cleaned) continue;
1851
+ const segments = entry.path.match(/^(\w+)(?:\[(\d+)\])?(?:\.(\w+)(?:\[(\d+)\])?(?:\.(\w+))?)?$/);
1852
+ if (!segments) continue;
1853
+ const [, field, idx1, sub1, idx2, sub2] = segments;
1854
+ if (!sub1) {
1855
+ doc[field] = cleaned;
1856
+ } else if (!sub2) {
1857
+ const arr = doc[field];
1858
+ if (arr && arr[Number(idx1)]) {
1859
+ arr[Number(idx1)][sub1] = cleaned;
1860
+ }
1861
+ } else {
1862
+ const arr = doc[field];
1863
+ if (arr && arr[Number(idx1)]) {
1864
+ const nested = arr[Number(idx1)][sub1];
1865
+ if (nested && nested[Number(idx2)]) {
1866
+ nested[Number(idx2)][sub2] = cleaned;
1867
+ }
1868
+ }
1869
+ }
1870
+ }
1871
+ }
1872
+ var MAX_ENTRIES_PER_BATCH = 20;
1873
+ async function formatDocumentContent(doc, generateText, options) {
1874
+ const entries = collectContentFields(doc);
1875
+ const totalUsage = { inputTokens: 0, outputTokens: 0 };
1876
+ if (entries.length === 0) {
1877
+ return { document: doc, usage: totalUsage };
1878
+ }
1879
+ options?.onProgress?.(`Formatting ${entries.length} content fields...`);
1880
+ const batches = [];
1881
+ for (let i = 0; i < entries.length; i += MAX_ENTRIES_PER_BATCH) {
1882
+ batches.push(entries.slice(i, i + MAX_ENTRIES_PER_BATCH));
1883
+ }
1884
+ for (const batch of batches) {
1885
+ const prompt = buildFormatPrompt(batch.map((e) => ({ id: e.id, text: e.text })));
1886
+ const result = await withRetry(
1887
+ () => generateText({
1888
+ prompt,
1889
+ maxTokens: 16384,
1890
+ providerOptions: options?.providerOptions
1891
+ })
1892
+ );
1893
+ if (result.usage) {
1894
+ totalUsage.inputTokens += result.usage.inputTokens;
1895
+ totalUsage.outputTokens += result.usage.outputTokens;
1896
+ }
1897
+ const formatted = parseFormatResponse(result.text);
1898
+ applyFormattedContent(doc, batch, formatted);
1899
+ }
1900
+ return { document: doc, usage: totalUsage };
1901
+ }
1902
+
1703
1903
  // src/extraction/chunking.ts
1704
1904
  function chunkDocument(doc) {
1705
1905
  const chunks = [];
@@ -3195,8 +3395,14 @@ function createExtractor(config) {
3195
3395
  }
3196
3396
  onProgress?.("Assembling document...");
3197
3397
  const document = assembleDocument(id, documentType, memory);
3198
- const chunks = chunkDocument(document);
3199
- return { document, chunks, tokenUsage: totalUsage };
3398
+ onProgress?.("Formatting extracted content...");
3399
+ const formatResult = await formatDocumentContent(document, generateText, {
3400
+ providerOptions,
3401
+ onProgress
3402
+ });
3403
+ trackUsage(formatResult.usage);
3404
+ const chunks = chunkDocument(formatResult.document);
3405
+ return { document: formatResult.document, chunks, tokenUsage: totalUsage };
3200
3406
  }
3201
3407
  return { extract };
3202
3408
  }