@claritylabs/cl-sdk 0.5.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs CHANGED
@@ -1471,6 +1471,206 @@ function assembleDocument(documentId, documentType, memory) {
1471
1471
  };
1472
1472
  }
1473
1473
 
1474
+ // src/prompts/coordinator/format.ts
1475
/**
 * Builds the prompt sent to the model to clean up the markdown formatting of
 * extracted content entries.
 *
 * Each entry is rendered as a header line of the form ===ENTRY <id>=== followed
 * by the entry text on the next line; the rendered entries are joined with a
 * blank line and appended after the fixed instruction text. The instructions
 * ask the model to answer in that same ===ENTRY <id>=== layout, one block per
 * entry, which is the layout parseFormatResponse splits on.
 *
 * NOTE(review): this view is a rendered diff, so runs of whitespace inside the
 * prompt examples (the space-aligned table, the indented sub-items) may have
 * been collapsed. Confirm against the published file before editing the string.
 *
 * @param entries Array of objects exposing an id and a text property.
 * @returns The complete prompt string.
 */
+ function buildFormatPrompt(entries) {
1476
// The entry id is embedded in each header so the reply can be mapped back to its entry.
+ const block = entries.map((e) => `===ENTRY ${e.id}===
1477
+ ${e.text}`).join("\n\n");
1478
+ return `You are a markdown formatting specialist for insurance document content. You will receive numbered content entries extracted from insurance policies, quotes, and endorsements. Your job is to clean up the formatting so every entry renders correctly as standard markdown.
1479
+
1480
+ ## Primary issues to fix
1481
+
1482
+ ### 1. Pipe-delimited data missing table syntax
1483
+ The most common issue. Content uses pipe characters as column separators but is missing the separator row required for markdown table rendering.
1484
+
1485
+ Before (broken \u2014 won't render as a table):
1486
+ COVERAGE | FORM # | LIMIT | DEDUCTIBLE
1487
+ Employee Theft | | $10,000 | $1,000
1488
+
1489
+ After (valid markdown table):
1490
+ | COVERAGE | FORM # | LIMIT | DEDUCTIBLE |
1491
+ | --- | --- | --- | --- |
1492
+ | Employee Theft | | $10,000 | $1,000 |
1493
+
1494
+ Rules for pipe tables:
1495
+ - Add leading and trailing pipes to every row
1496
+ - Add the separator row (| --- | --- |) after the header row
1497
+ - Every row must have the same number of pipe-separated columns as the header
1498
+ - Empty cells are fine \u2014 just keep the pipes: | | $10,000 |
1499
+
1500
+ ### 2. Sub-items indented within pipe tables
1501
+ Insurance schedules often have indented sub-items that belong to the previous coverage line. These break table column counts.
1502
+
1503
+ Before (broken):
1504
+ COVERAGE | LIMIT | DEDUCTIBLE
1505
+ Causes Of Loss - Equipment Breakdown | PR650END
1506
+ Described Premises Limit | | $350,804 |
1507
+ Diagnostic Equipment | | $100,000 |
1508
+ Deductible Type - Business Income: Waiting Period - Hours
1509
+ Waiting Period (Hours): 24
1510
+
1511
+ After: Pull sub-items out of the table. End the table before the sub-items, show them as an indented list, then start a new table if tabular data resumes:
1512
+ | COVERAGE | LIMIT | DEDUCTIBLE |
1513
+ | --- | --- | --- |
1514
+ | Causes Of Loss - Equipment Breakdown | PR650END | |
1515
+
1516
+ - Described Premises Limit: $350,804
1517
+ - Diagnostic Equipment: $100,000
1518
+ - Deductible Type - Business Income: Waiting Period - Hours
1519
+ - Waiting Period (Hours): 24
1520
+
1521
+ ### 3. Space-aligned tables
1522
+ Declarations often align columns with spaces instead of pipes. These render as plain monospace text and lose structure.
1523
+
1524
+ Before:
1525
+ Coverage Limit of Liability Retention
1526
+ A. Network Security Liability $500,000 $10,000
1527
+ B. Privacy Liability $500,000 $10,000
1528
+
1529
+ After (convert to proper markdown table):
1530
+ | Coverage | Limit of Liability | Retention |
1531
+ | --- | --- | --- |
1532
+ | A. Network Security Liability | $500,000 | $10,000 |
1533
+ | B. Privacy Liability | $500,000 | $10,000 |
1534
+
1535
+ ### 4. Mixed table/prose content
1536
+ A single entry often contains prose paragraphs followed by tabular data followed by more prose. Handle each segment independently \u2014 don't try to force everything into one table.
1537
+
1538
+ ### 5. General markdown cleanup
1539
+ - **Line spacing**: Remove excessive blank lines (3+ consecutive newlines \u2192 2). Ensure one blank line before and after tables and headings.
1540
+ - **Trailing whitespace**: Remove trailing spaces on all lines.
1541
+ - **Broken lists**: Ensure list items use consistent markers (-, *, or 1.) with proper nesting indentation.
1542
+ - **Orphaned formatting**: Close any unclosed bold (**), italic (*), or code (\`) markers.
1543
+ - **Heading levels**: Ensure heading markers (##) have a space after the hashes.
1544
+
1545
+ ## Rules
1546
+ - Do NOT change the meaning or substance of any content. Only fix formatting.
1547
+ - Do NOT add new information, headers, or commentary.
1548
+ - Do NOT wrap entries in code fences.
1549
+ - Preserve all dollar amounts, dates, policy numbers, form numbers, and technical terms exactly as they appear.
1550
+ - If an entry is already well-formatted, return it unchanged.
1551
+ - When in doubt about whether something is a table, prefer table formatting for structured data with multiple columns.
1552
+
1553
+ Return your output in this exact format \u2014 one block per entry, in the same order:
1554
+
1555
+ ===ENTRY 0===
1556
+ (cleaned content for entry 0)
1557
+
1558
+ ===ENTRY 1===
1559
+ (cleaned content for entry 1)
1560
+
1561
+ ...and so on for each entry.
1562
+
1563
+ Here are the entries to format:
1564
+
1565
+ ${block}`;
1566
+ }
1567
+
1568
+ // src/extraction/formatter.ts
1569
/**
 * Walks an assembled document and lists every text field that should be sent
 * through the formatting pass.
 *
 * Fields are visited in a fixed order: `summary`, then each section's
 * `content` followed by that section's subsection `content` values, then the
 * `content` of every endorsement, exclusion and condition. A field is kept only
 * when its text is truthy and longer than 20 characters.
 *
 * @param doc The assembled document; all list properties are optional.
 * @returns An array of `{ id, path, text }` records. `id` is the record's
 *   position in the returned array and `path` is the property path of the field
 *   (for example `sections[0].subsections[2].content`).
 */
function collectContentFields(doc) {
  const collected = [];

  // Ids are handed out only to accepted fields, so the next id is always the current length.
  const record = (path, text) => {
    if (!text || !(text.length > 20)) return;
    collected.push({ id: collected.length, path, text });
  };

  // Shared walker for the flat lists whose items each expose a `content` string.
  const recordEach = (key) => {
    const items = doc[key];
    if (!items) return;
    for (let k = 0; k < items.length; k++) {
      record(`${key}[${k}].content`, items[k].content);
    }
  };

  record("summary", doc.summary);

  const sections = doc.sections;
  if (sections) {
    for (let si = 0; si < sections.length; si++) {
      const section = sections[si];
      const base = `sections[${si}]`;
      record(`${base}.content`, section.content);

      const subs = section.subsections;
      if (!subs) continue;
      for (let ti = 0; ti < subs.length; ti++) {
        record(`${base}.subsections[${ti}].content`, subs[ti].content);
      }
    }
  }

  for (const key of ["endorsements", "exclusions", "conditions"]) {
    recordEach(key);
  }

  return collected;
}
1606
/**
 * Splits a formatter reply into its per-entry bodies.
 *
 * The reply is scanned for `===ENTRY <digits>===` markers. The text between one
 * marker and the next (or the end of the reply) is trimmed and stored under the
 * marker's numeric id. Text before the first marker is ignored; if an id occurs
 * more than once the last occurrence wins.
 *
 * @param response Raw text returned by the model.
 * @returns A Map from entry id (number) to trimmed body text (possibly empty).
 */
function parseFormatResponse(response) {
  const cleanedById = /* @__PURE__ */ new Map();
  const hits = Array.from(response.matchAll(/===ENTRY (\d+)===/g));

  hits.forEach((hit, n) => {
    // A body runs from the end of its marker to the start of the next marker.
    const bodyStart = hit.index + hit[0].length;
    const bodyEnd = n + 1 < hits.length ? hits[n + 1].index : response.length;
    cleanedById.set(parseInt(hit[1], 10), response.slice(bodyStart, bodyEnd).trim());
  });

  return cleanedById;
}
1618
+ function applyFormattedContent(doc, entries, formatted) {
1619
+ for (const entry of entries) {
1620
+ const cleaned = formatted.get(entry.id);
1621
+ if (!cleaned) continue;
1622
+ const segments = entry.path.match(/^(\w+)(?:\[(\d+)\])?(?:\.(\w+)(?:\[(\d+)\])?(?:\.(\w+))?)?$/);
1623
+ if (!segments) continue;
1624
+ const [, field, idx1, sub1, idx2, sub2] = segments;
1625
+ if (!sub1) {
1626
+ doc[field] = cleaned;
1627
+ } else if (!sub2) {
1628
+ const arr = doc[field];
1629
+ if (arr && arr[Number(idx1)]) {
1630
+ arr[Number(idx1)][sub1] = cleaned;
1631
+ }
1632
+ } else {
1633
+ const arr = doc[field];
1634
+ if (arr && arr[Number(idx1)]) {
1635
+ const nested = arr[Number(idx1)][sub1];
1636
+ if (nested && nested[Number(idx2)]) {
1637
+ nested[Number(idx2)][sub2] = cleaned;
1638
+ }
1639
+ }
1640
+ }
1641
+ }
1642
+ }
1643
// Upper bound on how many content entries are packed into one formatting prompt.
var MAX_ENTRIES_PER_BATCH = 20;

/**
 * Runs the markdown-formatting pass over a document's text fields.
 *
 * Collects the formattable fields, sends them to the model in consecutive
 * batches of at most MAX_ENTRIES_PER_BATCH entries (one request at a time,
 * each wrapped in `withRetry`), and writes the parsed replies back into `doc`.
 * The document is mutated in place and the same object is returned.
 *
 * @param doc Assembled document to format.
 * @param generateText Async function called with `{ prompt, maxTokens, providerOptions }`;
 *   its result is read for `text` and an optional `usage` object.
 * @param options Optional `{ providerOptions, onProgress }`; `onProgress` receives
 *   one status message before the first batch is sent.
 * @returns `{ document, usage }`, where `usage` sums `inputTokens` and
 *   `outputTokens` over all batches that reported usage (zeros if none did or
 *   if there was nothing to format).
 */
async function formatDocumentContent(doc, generateText, options) {
  const usage = { inputTokens: 0, outputTokens: 0 };
  const pending = collectContentFields(doc);

  if (pending.length === 0) {
    return { document: doc, usage };
  }

  options?.onProgress?.(`Formatting ${pending.length} content fields...`);

  for (let offset = 0; offset < pending.length; offset += MAX_ENTRIES_PER_BATCH) {
    const batch = pending.slice(offset, offset + MAX_ENTRIES_PER_BATCH);
    // The prompt only needs id and text; the path stays local for the write-back.
    const prompt = buildFormatPrompt(batch.map(({ id, text }) => ({ id, text })));

    const request = () => generateText({
      prompt,
      maxTokens: 16384,
      providerOptions: options?.providerOptions
    });
    const reply = await withRetry(request);

    const batchUsage = reply.usage;
    if (batchUsage) {
      usage.inputTokens += batchUsage.inputTokens;
      usage.outputTokens += batchUsage.outputTokens;
    }

    applyFormattedContent(doc, batch, parseFormatResponse(reply.text));
  }

  return { document: doc, usage };
}
1673
+
1474
1674
  // src/extraction/chunking.ts
1475
1675
  function chunkDocument(doc) {
1476
1676
  const chunks = [];
@@ -2966,8 +3166,14 @@ function createExtractor(config) {
2966
3166
  }
2967
3167
  onProgress?.("Assembling document...");
2968
3168
  const document = assembleDocument(id, documentType, memory);
2969
- const chunks = chunkDocument(document);
2970
- return { document, chunks, tokenUsage: totalUsage };
3169
+ onProgress?.("Formatting extracted content...");
3170
+ const formatResult = await formatDocumentContent(document, generateText, {
3171
+ providerOptions,
3172
+ onProgress
3173
+ });
3174
+ trackUsage(formatResult.usage);
3175
+ const chunks = chunkDocument(formatResult.document);
3176
+ return { document: formatResult.document, chunks, tokenUsage: totalUsage };
2971
3177
  }
2972
3178
  return { extract };
2973
3179
  }