@claritylabs/cl-sdk 0.5.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +33 -9
- package/dist/index.d.mts +10 -10
- package/dist/index.d.ts +10 -10
- package/dist/index.js +208 -2
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +208 -2
- package/dist/index.mjs.map +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -104,14 +104,22 @@ The extraction system uses a **coordinator/worker pattern** — a coordinator ag
|
|
|
104
104
|
│ │ │ to pages │ │ │
|
|
105
105
|
└─────────────┘ └─────────────┘ └──────────┬───────────┘
|
|
106
106
|
│
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
107
|
+
┌─────────────┐ ┌─────────────┐ ┌──────────▼───────────┐
|
|
108
|
+
│ 6. FORMAT │◀────│ 5. ASSEMBLE │◀────│ 4. REVIEW │
|
|
109
|
+
│ │ │ │ │ │
|
|
110
|
+
│ Clean up │ │ Merge all │ │ Check completeness │
|
|
111
|
+
│ markdown │ │ results │ │ against template, │
|
|
112
|
+
│ tables, │ │ into final │ │ dispatch follow-up │
|
|
113
|
+
│ spacing │ │ document │ │ extractors for gaps │
|
|
114
|
+
└──────┬──────┘ └─────────────┘ └──────────────────────┘
|
|
115
|
+
│
|
|
116
|
+
┌──────▼──────┐
|
|
117
|
+
│ 7. CHUNK │
|
|
118
|
+
│ Break into │
|
|
119
|
+
│ retrieval- │
|
|
120
|
+
│ ready │
|
|
121
|
+
│ chunks │
|
|
122
|
+
└─────────────┘
|
|
115
123
|
```
|
|
116
124
|
|
|
117
125
|
#### Phase 1: Classify
|
|
@@ -151,7 +159,23 @@ After initial extraction, a review loop (up to `maxReviewRounds`, default 2) che
|
|
|
151
159
|
|
|
152
160
|
#### Phase 5: Assemble
|
|
153
161
|
|
|
154
|
-
All extractor results are merged into a final validated `InsuranceDocument
|
|
162
|
+
All extractor results are merged into a final validated `InsuranceDocument`.
|
|
163
|
+
|
|
164
|
+
#### Phase 6: Format
|
|
165
|
+
|
|
166
|
+
A formatting agent pass cleans up markdown in all content-bearing string fields (sections, subsections, endorsements, exclusions, conditions, summary). It fixes:
|
|
167
|
+
|
|
168
|
+
- **Pipe tables missing separator rows** — adds `| --- | --- |` and leading/trailing pipes
|
|
169
|
+
- **Space-aligned tables** — converts whitespace-padded columns into proper markdown tables
|
|
170
|
+
- **Sub-items mixed into tables** — pulls indented sub-items out of tables into lists
|
|
171
|
+
- **Mixed table/prose content** — handles each segment independently
|
|
172
|
+
- **General cleanup** — excessive blank lines, trailing whitespace, orphaned formatting markers
|
|
173
|
+
|
|
174
|
+
Content is batched (up to 20 fields per call) and sent through `generateText` for formatting cleanup. Token usage is tracked the same as other pipeline steps.
|
|
175
|
+
|
|
176
|
+
#### Phase 7: Chunk
|
|
177
|
+
|
|
178
|
+
The formatted document is chunked into `DocumentChunk[]` for vector storage. Chunks are deterministically IDed as `${documentId}:${type}:${index}`.
|
|
155
179
|
|
|
156
180
|
### Configuration
|
|
157
181
|
|
package/dist/index.d.mts
CHANGED
|
@@ -28723,16 +28723,16 @@ declare const CitationSchema: z.ZodObject<{
|
|
|
28723
28723
|
documentId: string;
|
|
28724
28724
|
relevance: number;
|
|
28725
28725
|
index: number;
|
|
28726
|
-
documentType?: "policy" | "quote" | undefined;
|
|
28727
28726
|
field?: string | undefined;
|
|
28727
|
+
documentType?: "policy" | "quote" | undefined;
|
|
28728
28728
|
}, {
|
|
28729
28729
|
quote: string;
|
|
28730
28730
|
chunkId: string;
|
|
28731
28731
|
documentId: string;
|
|
28732
28732
|
relevance: number;
|
|
28733
28733
|
index: number;
|
|
28734
|
-
documentType?: "policy" | "quote" | undefined;
|
|
28735
28734
|
field?: string | undefined;
|
|
28735
|
+
documentType?: "policy" | "quote" | undefined;
|
|
28736
28736
|
}>;
|
|
28737
28737
|
type Citation = z.infer<typeof CitationSchema>;
|
|
28738
28738
|
declare const SubAnswerSchema: z.ZodObject<{
|
|
@@ -28752,16 +28752,16 @@ declare const SubAnswerSchema: z.ZodObject<{
|
|
|
28752
28752
|
documentId: string;
|
|
28753
28753
|
relevance: number;
|
|
28754
28754
|
index: number;
|
|
28755
|
-
documentType?: "policy" | "quote" | undefined;
|
|
28756
28755
|
field?: string | undefined;
|
|
28756
|
+
documentType?: "policy" | "quote" | undefined;
|
|
28757
28757
|
}, {
|
|
28758
28758
|
quote: string;
|
|
28759
28759
|
chunkId: string;
|
|
28760
28760
|
documentId: string;
|
|
28761
28761
|
relevance: number;
|
|
28762
28762
|
index: number;
|
|
28763
|
-
documentType?: "policy" | "quote" | undefined;
|
|
28764
28763
|
field?: string | undefined;
|
|
28764
|
+
documentType?: "policy" | "quote" | undefined;
|
|
28765
28765
|
}>, "many">;
|
|
28766
28766
|
confidence: z.ZodNumber;
|
|
28767
28767
|
needsMoreContext: z.ZodBoolean;
|
|
@@ -28775,8 +28775,8 @@ declare const SubAnswerSchema: z.ZodObject<{
|
|
|
28775
28775
|
documentId: string;
|
|
28776
28776
|
relevance: number;
|
|
28777
28777
|
index: number;
|
|
28778
|
-
documentType?: "policy" | "quote" | undefined;
|
|
28779
28778
|
field?: string | undefined;
|
|
28779
|
+
documentType?: "policy" | "quote" | undefined;
|
|
28780
28780
|
}[];
|
|
28781
28781
|
needsMoreContext: boolean;
|
|
28782
28782
|
}, {
|
|
@@ -28789,8 +28789,8 @@ declare const SubAnswerSchema: z.ZodObject<{
|
|
|
28789
28789
|
documentId: string;
|
|
28790
28790
|
relevance: number;
|
|
28791
28791
|
index: number;
|
|
28792
|
-
documentType?: "policy" | "quote" | undefined;
|
|
28793
28792
|
field?: string | undefined;
|
|
28793
|
+
documentType?: "policy" | "quote" | undefined;
|
|
28794
28794
|
}[];
|
|
28795
28795
|
needsMoreContext: boolean;
|
|
28796
28796
|
}>;
|
|
@@ -28825,16 +28825,16 @@ declare const QueryResultSchema: z.ZodObject<{
|
|
|
28825
28825
|
documentId: string;
|
|
28826
28826
|
relevance: number;
|
|
28827
28827
|
index: number;
|
|
28828
|
-
documentType?: "policy" | "quote" | undefined;
|
|
28829
28828
|
field?: string | undefined;
|
|
28829
|
+
documentType?: "policy" | "quote" | undefined;
|
|
28830
28830
|
}, {
|
|
28831
28831
|
quote: string;
|
|
28832
28832
|
chunkId: string;
|
|
28833
28833
|
documentId: string;
|
|
28834
28834
|
relevance: number;
|
|
28835
28835
|
index: number;
|
|
28836
|
-
documentType?: "policy" | "quote" | undefined;
|
|
28837
28836
|
field?: string | undefined;
|
|
28837
|
+
documentType?: "policy" | "quote" | undefined;
|
|
28838
28838
|
}>, "many">;
|
|
28839
28839
|
intent: z.ZodEnum<["policy_question", "coverage_comparison", "document_search", "claims_inquiry", "general_knowledge"]>;
|
|
28840
28840
|
confidence: z.ZodNumber;
|
|
@@ -28849,8 +28849,8 @@ declare const QueryResultSchema: z.ZodObject<{
|
|
|
28849
28849
|
documentId: string;
|
|
28850
28850
|
relevance: number;
|
|
28851
28851
|
index: number;
|
|
28852
|
-
documentType?: "policy" | "quote" | undefined;
|
|
28853
28852
|
field?: string | undefined;
|
|
28853
|
+
documentType?: "policy" | "quote" | undefined;
|
|
28854
28854
|
}[];
|
|
28855
28855
|
followUp?: string | undefined;
|
|
28856
28856
|
}, {
|
|
@@ -28863,8 +28863,8 @@ declare const QueryResultSchema: z.ZodObject<{
|
|
|
28863
28863
|
documentId: string;
|
|
28864
28864
|
relevance: number;
|
|
28865
28865
|
index: number;
|
|
28866
|
-
documentType?: "policy" | "quote" | undefined;
|
|
28867
28866
|
field?: string | undefined;
|
|
28867
|
+
documentType?: "policy" | "quote" | undefined;
|
|
28868
28868
|
}[];
|
|
28869
28869
|
followUp?: string | undefined;
|
|
28870
28870
|
}>;
|
package/dist/index.d.ts
CHANGED
|
@@ -28723,16 +28723,16 @@ declare const CitationSchema: z.ZodObject<{
|
|
|
28723
28723
|
documentId: string;
|
|
28724
28724
|
relevance: number;
|
|
28725
28725
|
index: number;
|
|
28726
|
-
documentType?: "policy" | "quote" | undefined;
|
|
28727
28726
|
field?: string | undefined;
|
|
28727
|
+
documentType?: "policy" | "quote" | undefined;
|
|
28728
28728
|
}, {
|
|
28729
28729
|
quote: string;
|
|
28730
28730
|
chunkId: string;
|
|
28731
28731
|
documentId: string;
|
|
28732
28732
|
relevance: number;
|
|
28733
28733
|
index: number;
|
|
28734
|
-
documentType?: "policy" | "quote" | undefined;
|
|
28735
28734
|
field?: string | undefined;
|
|
28735
|
+
documentType?: "policy" | "quote" | undefined;
|
|
28736
28736
|
}>;
|
|
28737
28737
|
type Citation = z.infer<typeof CitationSchema>;
|
|
28738
28738
|
declare const SubAnswerSchema: z.ZodObject<{
|
|
@@ -28752,16 +28752,16 @@ declare const SubAnswerSchema: z.ZodObject<{
|
|
|
28752
28752
|
documentId: string;
|
|
28753
28753
|
relevance: number;
|
|
28754
28754
|
index: number;
|
|
28755
|
-
documentType?: "policy" | "quote" | undefined;
|
|
28756
28755
|
field?: string | undefined;
|
|
28756
|
+
documentType?: "policy" | "quote" | undefined;
|
|
28757
28757
|
}, {
|
|
28758
28758
|
quote: string;
|
|
28759
28759
|
chunkId: string;
|
|
28760
28760
|
documentId: string;
|
|
28761
28761
|
relevance: number;
|
|
28762
28762
|
index: number;
|
|
28763
|
-
documentType?: "policy" | "quote" | undefined;
|
|
28764
28763
|
field?: string | undefined;
|
|
28764
|
+
documentType?: "policy" | "quote" | undefined;
|
|
28765
28765
|
}>, "many">;
|
|
28766
28766
|
confidence: z.ZodNumber;
|
|
28767
28767
|
needsMoreContext: z.ZodBoolean;
|
|
@@ -28775,8 +28775,8 @@ declare const SubAnswerSchema: z.ZodObject<{
|
|
|
28775
28775
|
documentId: string;
|
|
28776
28776
|
relevance: number;
|
|
28777
28777
|
index: number;
|
|
28778
|
-
documentType?: "policy" | "quote" | undefined;
|
|
28779
28778
|
field?: string | undefined;
|
|
28779
|
+
documentType?: "policy" | "quote" | undefined;
|
|
28780
28780
|
}[];
|
|
28781
28781
|
needsMoreContext: boolean;
|
|
28782
28782
|
}, {
|
|
@@ -28789,8 +28789,8 @@ declare const SubAnswerSchema: z.ZodObject<{
|
|
|
28789
28789
|
documentId: string;
|
|
28790
28790
|
relevance: number;
|
|
28791
28791
|
index: number;
|
|
28792
|
-
documentType?: "policy" | "quote" | undefined;
|
|
28793
28792
|
field?: string | undefined;
|
|
28793
|
+
documentType?: "policy" | "quote" | undefined;
|
|
28794
28794
|
}[];
|
|
28795
28795
|
needsMoreContext: boolean;
|
|
28796
28796
|
}>;
|
|
@@ -28825,16 +28825,16 @@ declare const QueryResultSchema: z.ZodObject<{
|
|
|
28825
28825
|
documentId: string;
|
|
28826
28826
|
relevance: number;
|
|
28827
28827
|
index: number;
|
|
28828
|
-
documentType?: "policy" | "quote" | undefined;
|
|
28829
28828
|
field?: string | undefined;
|
|
28829
|
+
documentType?: "policy" | "quote" | undefined;
|
|
28830
28830
|
}, {
|
|
28831
28831
|
quote: string;
|
|
28832
28832
|
chunkId: string;
|
|
28833
28833
|
documentId: string;
|
|
28834
28834
|
relevance: number;
|
|
28835
28835
|
index: number;
|
|
28836
|
-
documentType?: "policy" | "quote" | undefined;
|
|
28837
28836
|
field?: string | undefined;
|
|
28837
|
+
documentType?: "policy" | "quote" | undefined;
|
|
28838
28838
|
}>, "many">;
|
|
28839
28839
|
intent: z.ZodEnum<["policy_question", "coverage_comparison", "document_search", "claims_inquiry", "general_knowledge"]>;
|
|
28840
28840
|
confidence: z.ZodNumber;
|
|
@@ -28849,8 +28849,8 @@ declare const QueryResultSchema: z.ZodObject<{
|
|
|
28849
28849
|
documentId: string;
|
|
28850
28850
|
relevance: number;
|
|
28851
28851
|
index: number;
|
|
28852
|
-
documentType?: "policy" | "quote" | undefined;
|
|
28853
28852
|
field?: string | undefined;
|
|
28853
|
+
documentType?: "policy" | "quote" | undefined;
|
|
28854
28854
|
}[];
|
|
28855
28855
|
followUp?: string | undefined;
|
|
28856
28856
|
}, {
|
|
@@ -28863,8 +28863,8 @@ declare const QueryResultSchema: z.ZodObject<{
|
|
|
28863
28863
|
documentId: string;
|
|
28864
28864
|
relevance: number;
|
|
28865
28865
|
index: number;
|
|
28866
|
-
documentType?: "policy" | "quote" | undefined;
|
|
28867
28866
|
field?: string | undefined;
|
|
28867
|
+
documentType?: "policy" | "quote" | undefined;
|
|
28868
28868
|
}[];
|
|
28869
28869
|
followUp?: string | undefined;
|
|
28870
28870
|
}>;
|
package/dist/index.js
CHANGED
|
@@ -1700,6 +1700,206 @@ function assembleDocument(documentId, documentType, memory) {
|
|
|
1700
1700
|
};
|
|
1701
1701
|
}
|
|
1702
1702
|
|
|
1703
|
+
// src/prompts/coordinator/format.ts
|
|
1704
|
+
function buildFormatPrompt(entries) {
|
|
1705
|
+
const block = entries.map((e) => `===ENTRY ${e.id}===
|
|
1706
|
+
${e.text}`).join("\n\n");
|
|
1707
|
+
return `You are a markdown formatting specialist for insurance document content. You will receive numbered content entries extracted from insurance policies, quotes, and endorsements. Your job is to clean up the formatting so every entry renders correctly as standard markdown.
|
|
1708
|
+
|
|
1709
|
+
## Primary issues to fix
|
|
1710
|
+
|
|
1711
|
+
### 1. Pipe-delimited data missing table syntax
|
|
1712
|
+
The most common issue. Content uses pipe characters as column separators but is missing the separator row required for markdown table rendering.
|
|
1713
|
+
|
|
1714
|
+
Before (broken \u2014 won't render as a table):
|
|
1715
|
+
COVERAGE | FORM # | LIMIT | DEDUCTIBLE
|
|
1716
|
+
Employee Theft | | $10,000 | $1,000
|
|
1717
|
+
|
|
1718
|
+
After (valid markdown table):
|
|
1719
|
+
| COVERAGE | FORM # | LIMIT | DEDUCTIBLE |
|
|
1720
|
+
| --- | --- | --- | --- |
|
|
1721
|
+
| Employee Theft | | $10,000 | $1,000 |
|
|
1722
|
+
|
|
1723
|
+
Rules for pipe tables:
|
|
1724
|
+
- Add leading and trailing pipes to every row
|
|
1725
|
+
- Add the separator row (| --- | --- |) after the header row
|
|
1726
|
+
- Every row must have the same number of pipe-separated columns as the header
|
|
1727
|
+
- Empty cells are fine \u2014 just keep the pipes: | | $10,000 |
|
|
1728
|
+
|
|
1729
|
+
### 2. Sub-items indented within pipe tables
|
|
1730
|
+
Insurance schedules often have indented sub-items that belong to the previous coverage line. These break table column counts.
|
|
1731
|
+
|
|
1732
|
+
Before (broken):
|
|
1733
|
+
COVERAGE | LIMIT | DEDUCTIBLE
|
|
1734
|
+
Causes Of Loss - Equipment Breakdown | PR650END
|
|
1735
|
+
Described Premises Limit | | $350,804 |
|
|
1736
|
+
Diagnostic Equipment | | $100,000 |
|
|
1737
|
+
Deductible Type - Business Income: Waiting Period - Hours
|
|
1738
|
+
Waiting Period (Hours): 24
|
|
1739
|
+
|
|
1740
|
+
After: Pull sub-items out of the table. End the table before the sub-items, show them as an indented list, then start a new table if tabular data resumes:
|
|
1741
|
+
| COVERAGE | LIMIT | DEDUCTIBLE |
|
|
1742
|
+
| --- | --- | --- |
|
|
1743
|
+
| Causes Of Loss - Equipment Breakdown | PR650END | |
|
|
1744
|
+
|
|
1745
|
+
- Described Premises Limit: $350,804
|
|
1746
|
+
- Diagnostic Equipment: $100,000
|
|
1747
|
+
- Deductible Type - Business Income: Waiting Period - Hours
|
|
1748
|
+
- Waiting Period (Hours): 24
|
|
1749
|
+
|
|
1750
|
+
### 3. Space-aligned tables
|
|
1751
|
+
Declarations often align columns with spaces instead of pipes. These render as plain monospace text and lose structure.
|
|
1752
|
+
|
|
1753
|
+
Before:
|
|
1754
|
+
Coverage Limit of Liability Retention
|
|
1755
|
+
A. Network Security Liability $500,000 $10,000
|
|
1756
|
+
B. Privacy Liability $500,000 $10,000
|
|
1757
|
+
|
|
1758
|
+
After (convert to proper markdown table):
|
|
1759
|
+
| Coverage | Limit of Liability | Retention |
|
|
1760
|
+
| --- | --- | --- |
|
|
1761
|
+
| A. Network Security Liability | $500,000 | $10,000 |
|
|
1762
|
+
| B. Privacy Liability | $500,000 | $10,000 |
|
|
1763
|
+
|
|
1764
|
+
### 4. Mixed table/prose content
|
|
1765
|
+
A single entry often contains prose paragraphs followed by tabular data followed by more prose. Handle each segment independently \u2014 don't try to force everything into one table.
|
|
1766
|
+
|
|
1767
|
+
### 5. General markdown cleanup
|
|
1768
|
+
- **Line spacing**: Remove excessive blank lines (3+ consecutive newlines \u2192 2). Ensure one blank line before and after tables and headings.
|
|
1769
|
+
- **Trailing whitespace**: Remove trailing spaces on all lines.
|
|
1770
|
+
- **Broken lists**: Ensure list items use consistent markers (-, *, or 1.) with proper nesting indentation.
|
|
1771
|
+
- **Orphaned formatting**: Close any unclosed bold (**), italic (*), or code (\`) markers.
|
|
1772
|
+
- **Heading levels**: Ensure heading markers (##) have a space after the hashes.
|
|
1773
|
+
|
|
1774
|
+
## Rules
|
|
1775
|
+
- Do NOT change the meaning or substance of any content. Only fix formatting.
|
|
1776
|
+
- Do NOT add new information, headers, or commentary.
|
|
1777
|
+
- Do NOT wrap entries in code fences.
|
|
1778
|
+
- Preserve all dollar amounts, dates, policy numbers, form numbers, and technical terms exactly as they appear.
|
|
1779
|
+
- If an entry is already well-formatted, return it unchanged.
|
|
1780
|
+
- When in doubt about whether something is a table, prefer table formatting for structured data with multiple columns.
|
|
1781
|
+
|
|
1782
|
+
Return your output in this exact format \u2014 one block per entry, in the same order:
|
|
1783
|
+
|
|
1784
|
+
===ENTRY 0===
|
|
1785
|
+
(cleaned content for entry 0)
|
|
1786
|
+
|
|
1787
|
+
===ENTRY 1===
|
|
1788
|
+
(cleaned content for entry 1)
|
|
1789
|
+
|
|
1790
|
+
...and so on for each entry.
|
|
1791
|
+
|
|
1792
|
+
Here are the entries to format:
|
|
1793
|
+
|
|
1794
|
+
${block}`;
|
|
1795
|
+
}
|
|
1796
|
+
|
|
1797
|
+
// src/extraction/formatter.ts
|
|
1798
|
+
function collectContentFields(doc) {
|
|
1799
|
+
const entries = [];
|
|
1800
|
+
let id = 0;
|
|
1801
|
+
function add(path, text) {
|
|
1802
|
+
if (text && text.length > 20) {
|
|
1803
|
+
entries.push({ id: id++, path, text });
|
|
1804
|
+
}
|
|
1805
|
+
}
|
|
1806
|
+
add("summary", doc.summary);
|
|
1807
|
+
if (doc.sections) {
|
|
1808
|
+
for (let i = 0; i < doc.sections.length; i++) {
|
|
1809
|
+
const s = doc.sections[i];
|
|
1810
|
+
add(`sections[${i}].content`, s.content);
|
|
1811
|
+
if (s.subsections) {
|
|
1812
|
+
for (let j = 0; j < s.subsections.length; j++) {
|
|
1813
|
+
add(`sections[${i}].subsections[${j}].content`, s.subsections[j].content);
|
|
1814
|
+
}
|
|
1815
|
+
}
|
|
1816
|
+
}
|
|
1817
|
+
}
|
|
1818
|
+
if (doc.endorsements) {
|
|
1819
|
+
for (let i = 0; i < doc.endorsements.length; i++) {
|
|
1820
|
+
add(`endorsements[${i}].content`, doc.endorsements[i].content);
|
|
1821
|
+
}
|
|
1822
|
+
}
|
|
1823
|
+
if (doc.exclusions) {
|
|
1824
|
+
for (let i = 0; i < doc.exclusions.length; i++) {
|
|
1825
|
+
add(`exclusions[${i}].content`, doc.exclusions[i].content);
|
|
1826
|
+
}
|
|
1827
|
+
}
|
|
1828
|
+
if (doc.conditions) {
|
|
1829
|
+
for (let i = 0; i < doc.conditions.length; i++) {
|
|
1830
|
+
add(`conditions[${i}].content`, doc.conditions[i].content);
|
|
1831
|
+
}
|
|
1832
|
+
}
|
|
1833
|
+
return entries;
|
|
1834
|
+
}
|
|
1835
|
+
function parseFormatResponse(response) {
|
|
1836
|
+
const results = /* @__PURE__ */ new Map();
|
|
1837
|
+
const parts = response.split(/===ENTRY (\d+)===/);
|
|
1838
|
+
for (let i = 1; i < parts.length; i += 2) {
|
|
1839
|
+
const entryId = parseInt(parts[i], 10);
|
|
1840
|
+
const content = parts[i + 1]?.trim();
|
|
1841
|
+
if (!isNaN(entryId) && content !== void 0) {
|
|
1842
|
+
results.set(entryId, content);
|
|
1843
|
+
}
|
|
1844
|
+
}
|
|
1845
|
+
return results;
|
|
1846
|
+
}
|
|
1847
|
+
function applyFormattedContent(doc, entries, formatted) {
|
|
1848
|
+
for (const entry of entries) {
|
|
1849
|
+
const cleaned = formatted.get(entry.id);
|
|
1850
|
+
if (!cleaned) continue;
|
|
1851
|
+
const segments = entry.path.match(/^(\w+)(?:\[(\d+)\])?(?:\.(\w+)(?:\[(\d+)\])?(?:\.(\w+))?)?$/);
|
|
1852
|
+
if (!segments) continue;
|
|
1853
|
+
const [, field, idx1, sub1, idx2, sub2] = segments;
|
|
1854
|
+
if (!sub1) {
|
|
1855
|
+
doc[field] = cleaned;
|
|
1856
|
+
} else if (!sub2) {
|
|
1857
|
+
const arr = doc[field];
|
|
1858
|
+
if (arr && arr[Number(idx1)]) {
|
|
1859
|
+
arr[Number(idx1)][sub1] = cleaned;
|
|
1860
|
+
}
|
|
1861
|
+
} else {
|
|
1862
|
+
const arr = doc[field];
|
|
1863
|
+
if (arr && arr[Number(idx1)]) {
|
|
1864
|
+
const nested = arr[Number(idx1)][sub1];
|
|
1865
|
+
if (nested && nested[Number(idx2)]) {
|
|
1866
|
+
nested[Number(idx2)][sub2] = cleaned;
|
|
1867
|
+
}
|
|
1868
|
+
}
|
|
1869
|
+
}
|
|
1870
|
+
}
|
|
1871
|
+
}
|
|
1872
|
+
var MAX_ENTRIES_PER_BATCH = 20;
|
|
1873
|
+
async function formatDocumentContent(doc, generateText, options) {
|
|
1874
|
+
const entries = collectContentFields(doc);
|
|
1875
|
+
const totalUsage = { inputTokens: 0, outputTokens: 0 };
|
|
1876
|
+
if (entries.length === 0) {
|
|
1877
|
+
return { document: doc, usage: totalUsage };
|
|
1878
|
+
}
|
|
1879
|
+
options?.onProgress?.(`Formatting ${entries.length} content fields...`);
|
|
1880
|
+
const batches = [];
|
|
1881
|
+
for (let i = 0; i < entries.length; i += MAX_ENTRIES_PER_BATCH) {
|
|
1882
|
+
batches.push(entries.slice(i, i + MAX_ENTRIES_PER_BATCH));
|
|
1883
|
+
}
|
|
1884
|
+
for (const batch of batches) {
|
|
1885
|
+
const prompt = buildFormatPrompt(batch.map((e) => ({ id: e.id, text: e.text })));
|
|
1886
|
+
const result = await withRetry(
|
|
1887
|
+
() => generateText({
|
|
1888
|
+
prompt,
|
|
1889
|
+
maxTokens: 16384,
|
|
1890
|
+
providerOptions: options?.providerOptions
|
|
1891
|
+
})
|
|
1892
|
+
);
|
|
1893
|
+
if (result.usage) {
|
|
1894
|
+
totalUsage.inputTokens += result.usage.inputTokens;
|
|
1895
|
+
totalUsage.outputTokens += result.usage.outputTokens;
|
|
1896
|
+
}
|
|
1897
|
+
const formatted = parseFormatResponse(result.text);
|
|
1898
|
+
applyFormattedContent(doc, batch, formatted);
|
|
1899
|
+
}
|
|
1900
|
+
return { document: doc, usage: totalUsage };
|
|
1901
|
+
}
|
|
1902
|
+
|
|
1703
1903
|
// src/extraction/chunking.ts
|
|
1704
1904
|
function chunkDocument(doc) {
|
|
1705
1905
|
const chunks = [];
|
|
@@ -3195,8 +3395,14 @@ function createExtractor(config) {
|
|
|
3195
3395
|
}
|
|
3196
3396
|
onProgress?.("Assembling document...");
|
|
3197
3397
|
const document = assembleDocument(id, documentType, memory);
|
|
3198
|
-
|
|
3199
|
-
|
|
3398
|
+
onProgress?.("Formatting extracted content...");
|
|
3399
|
+
const formatResult = await formatDocumentContent(document, generateText, {
|
|
3400
|
+
providerOptions,
|
|
3401
|
+
onProgress
|
|
3402
|
+
});
|
|
3403
|
+
trackUsage(formatResult.usage);
|
|
3404
|
+
const chunks = chunkDocument(formatResult.document);
|
|
3405
|
+
return { document: formatResult.document, chunks, tokenUsage: totalUsage };
|
|
3200
3406
|
}
|
|
3201
3407
|
return { extract };
|
|
3202
3408
|
}
|