@claritylabs/cl-sdk 1.0.0 → 1.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +41 -1
- package/dist/index.d.mts +17 -2
- package/dist/index.d.ts +17 -2
- package/dist/index.js +151 -70
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +151 -70
- package/dist/index.mjs.map +1 -1
- package/package.json +5 -1
package/README.md
CHANGED
|
@@ -99,7 +99,7 @@ A multi-pass system that turns insurance PDFs into structured, queryable data:
|
|
|
99
99
|
|
|
100
100
|
- **Pass 0 — Classification**: Determines whether a document is a policy or a quote. Returns document type, confidence score, and supporting signals.
|
|
101
101
|
- **Pass 1 — Metadata Extraction**: Extracts high-level metadata — carrier, policy/quote number, dates, premium, insured name, coverage table with limits and deductibles. Includes an early persistence callback (`onMetadata`) so metadata is saved immediately, surviving downstream failures.
|
|
102
|
-
- **Pass 2 — Section Extraction**: Splits the document into page chunks (starting at 15 pages) and extracts structured sections. Adaptive fallback: if a chunk's output is truncated (JSON parse failure), it re-splits into smaller chunks (10, then 5 pages), and escalates to the fallback model. Results are merged across chunks.
|
|
102
|
+
- **Pass 2 — Section Extraction**: Splits the document into page chunks (starting at 15 pages) and extracts structured sections in parallel (concurrency-limited, default 2). All model calls automatically retry on rate-limit errors with exponential backoff. Adaptive fallback: if a chunk's output is truncated (JSON parse failure), it re-splits into smaller chunks (10, then 5 pages), and escalates to the fallback model. Results are merged across chunks.
|
|
103
103
|
- **Pass 3 — Enrichment**: A non-fatal pass that parses raw text into structured supplementary fields — regulatory context, complaint contacts, costs and fees, claims contacts.
|
|
104
104
|
|
|
105
105
|
For quotes specifically, the pipeline also extracts premium breakdowns, subjectivities (conditions that must be met before binding), and underwriting conditions.
|
|
@@ -160,12 +160,52 @@ interface ExtractOptions {
|
|
|
160
160
|
models?: ModelConfig;
|
|
161
161
|
metadataProviderOptions?: ProviderOptions;
|
|
162
162
|
fallbackProviderOptions?: ProviderOptions;
|
|
163
|
+
concurrency?: number; // parallel chunk limit (default: 2)
|
|
164
|
+
onTokenUsage?: (usage: TokenUsage) => void;
|
|
165
|
+
}
|
|
166
|
+
|
|
167
|
+
interface ExtractSectionsOptions {
|
|
168
|
+
log?: LogFn;
|
|
169
|
+
promptBuilder?: PromptBuilder;
|
|
170
|
+
models?: ModelConfig;
|
|
171
|
+
fallbackProviderOptions?: ProviderOptions;
|
|
172
|
+
concurrency?: number; // parallel chunk limit (default: 2)
|
|
173
|
+
onTokenUsage?: (usage: TokenUsage) => void;
|
|
163
174
|
}
|
|
164
175
|
|
|
165
176
|
interface ClassifyOptions {
|
|
166
177
|
log?: LogFn;
|
|
167
178
|
models?: ModelConfig;
|
|
179
|
+
onTokenUsage?: (usage: TokenUsage) => void;
|
|
168
180
|
}
|
|
181
|
+
|
|
182
|
+
interface TokenUsage {
|
|
183
|
+
inputTokens: number;
|
|
184
|
+
outputTokens: number;
|
|
185
|
+
}
|
|
186
|
+
```
|
|
187
|
+
|
|
188
|
+
### Rate-Limit Resilience
|
|
189
|
+
|
|
190
|
+
All model calls automatically retry on rate-limit errors (an HTTP 429 `status`/`statusCode`, or "rate limit", "rate_limit", or "too many requests" in the error message) with exponential backoff — up to 5 retries with base delays of 2s, 4s, 8s, 16s, 32s (plus up to 1s of random jitter each). Non-rate-limit errors are re-thrown immediately.
|
|
191
|
+
|
|
192
|
+
### Parallel Chunk Extraction
|
|
193
|
+
|
|
194
|
+
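Pass 2 section extraction processes page chunks in parallel with a configurable concurrency limit (default: 2). This balances throughput against rate limits. Sub-chunk retries on truncation are also parallelized; each truncated chunk schedules its sub-chunks through its own limiter of the same size, so the number of in-flight model calls can temporarily exceed the configured value while retries are running.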
|
|
195
|
+
|
|
196
|
+
```typescript
|
|
197
|
+
// Track token usage across all passes
|
|
198
|
+
let totalInput = 0, totalOutput = 0;
|
|
199
|
+
|
|
200
|
+
const { extracted } = await extractFromPdf(pdfBase64, {
|
|
201
|
+
concurrency: 3,
|
|
202
|
+
onTokenUsage: ({ inputTokens, outputTokens }) => {
|
|
203
|
+
totalInput += inputTokens;
|
|
204
|
+
totalOutput += outputTokens;
|
|
205
|
+
},
|
|
206
|
+
});
|
|
207
|
+
|
|
208
|
+
console.log(`Total: ${totalInput} input, ${totalOutput} output tokens`);
|
|
169
209
|
```
|
|
170
210
|
|
|
171
211
|
### Agent
|
package/dist/index.d.mts
CHANGED
|
@@ -407,10 +407,12 @@ declare function getPageChunks(totalPages: number, chunkSize?: number): Array<[n
|
|
|
407
407
|
* Pass 3: Enrich supplementary fields with structured data.
|
|
408
408
|
* Text-only enrichment call — non-fatal on failure (returns document unchanged).
|
|
409
409
|
*/
|
|
410
|
-
declare function enrichSupplementaryFields(document: any, models?: ModelConfig, log?: LogFn): Promise<any>;
|
|
410
|
+
declare function enrichSupplementaryFields(document: any, models?: ModelConfig, log?: LogFn, onTokenUsage?: (usage: TokenUsage) => void): Promise<any>;
|
|
411
411
|
interface ClassifyOptions {
|
|
412
412
|
log?: LogFn;
|
|
413
413
|
models?: ModelConfig;
|
|
414
|
+
/** Called after each model call with token usage for tracking. */
|
|
415
|
+
onTokenUsage?: (usage: TokenUsage) => void;
|
|
414
416
|
}
|
|
415
417
|
/**
|
|
416
418
|
* Pass 0: Classify document as policy or quote.
|
|
@@ -454,6 +456,11 @@ declare function applyExtractedQuote(extracted: any): {
|
|
|
454
456
|
*/
|
|
455
457
|
declare function mergeChunkedQuoteSections(metadataResult: any, sectionChunks: any[]): any;
|
|
456
458
|
type PromptBuilder = (pageStart: number, pageEnd: number) => string;
|
|
459
|
+
/** Token usage reported per model call. */
|
|
460
|
+
interface TokenUsage {
|
|
461
|
+
inputTokens: number;
|
|
462
|
+
outputTokens: number;
|
|
463
|
+
}
|
|
457
464
|
interface ExtractOptions {
|
|
458
465
|
log?: LogFn;
|
|
459
466
|
onMetadata?: (raw: string) => Promise<void>;
|
|
@@ -462,6 +469,10 @@ interface ExtractOptions {
|
|
|
462
469
|
metadataProviderOptions?: ProviderOptions;
|
|
463
470
|
/** Provider-specific options for fallback calls. Defaults to Anthropic thinking enabled. */
|
|
464
471
|
fallbackProviderOptions?: ProviderOptions;
|
|
472
|
+
/** Maximum number of chunk extractions to run in parallel (default: 2). */
|
|
473
|
+
concurrency?: number;
|
|
474
|
+
/** Called after each model call with token usage for tracking. */
|
|
475
|
+
onTokenUsage?: (usage: TokenUsage) => void;
|
|
465
476
|
}
|
|
466
477
|
/**
|
|
467
478
|
* Full extraction pipeline for policy documents (passes 1-3).
|
|
@@ -483,6 +494,10 @@ interface ExtractSectionsOptions {
|
|
|
483
494
|
models?: ModelConfig;
|
|
484
495
|
/** Provider-specific options for fallback calls. */
|
|
485
496
|
fallbackProviderOptions?: ProviderOptions;
|
|
497
|
+
/** Maximum number of chunk extractions to run in parallel (default: 2). */
|
|
498
|
+
concurrency?: number;
|
|
499
|
+
/** Called after each model call with token usage for tracking. */
|
|
500
|
+
onTokenUsage?: (usage: TokenUsage) => void;
|
|
486
501
|
}
|
|
487
502
|
/**
|
|
488
503
|
* Sections-only extraction: skip pass 1, use saved metadata.
|
|
@@ -533,4 +548,4 @@ interface TextOverlay {
|
|
|
533
548
|
/** Overlay text on a flat PDF at specified coordinates. */
|
|
534
549
|
declare function overlayTextOnPdf(pdfBytes: Uint8Array, overlays: TextOverlay[]): Promise<Uint8Array>;
|
|
535
550
|
|
|
536
|
-
export { AGENT_TOOLS, APPLICATION_CLASSIFY_PROMPT, type AcroFormFieldInfo, type AgentContext, type BaseDocument, CLASSIFY_DOCUMENT_PROMPT, CLASSIFY_EMAIL_PROMPT, COI_GENERATION_TOOL, COVERAGE_COMPARISON_TOOL, type ClassifyOptions, type CommunicationIntent, type Coverage, DOCUMENT_LOOKUP_TOOL, EXTRACTION_PROMPT, type ExtractOptions, type ExtractSectionsOptions, type FieldMapping, HAIKU_MODEL, type InsuranceDocument, type LogFn, METADATA_PROMPT, MODEL_TOKEN_LIMITS, type ModelConfig, PLATFORM_CONFIGS, type Platform, type PlatformConfig, type PolicyDocument, type PremiumLine, type PromptBuilder, QUOTE_METADATA_PROMPT, type QuoteDocument, SONNET_MODEL, type Section, type Subjectivity, type Subsection, type TextOverlay, type ToolDefinition, type UnderwritingCondition, applyExtracted, applyExtractedQuote, buildAcroFormMappingPrompt, buildAgentSystemPrompt, buildAnswerParsingPrompt, buildAutoFillPrompt, buildBatchEmailGenerationPrompt, buildClassifyMessagePrompt, buildCoiRoutingPrompt, buildConfirmationSummaryPrompt, buildConversationMemoryContext, buildConversationMemoryGuidance, buildCoverageGapPrompt, buildDocumentContext, buildFieldExplanationPrompt, buildFieldExtractionPrompt, buildFlatPdfMappingPrompt, buildFormattingPrompt, buildIdentityPrompt, buildIntentPrompt, buildLookupFillPrompt, buildPolicyContext, buildPolicySectionsPrompt, buildQuestionBatchPrompt, buildQuoteSectionsPrompt, buildQuotesPoliciesPrompt, buildReplyIntentClassificationPrompt, buildSafetyPrompt, buildSectionsPrompt, buildSupplementaryEnrichmentPrompt, buildSystemPrompt, classifyDocumentType, createDefaultModelConfig, createUniformModelConfig, enrichSupplementaryFields, extractFromPdf, extractQuoteFromPdf, extractSectionsOnly, fillAcroForm, getAcroFormFields, getPageChunks, mergeChunkedQuoteSections, mergeChunkedSections, overlayTextOnPdf, sanitizeNulls, stripFences };
|
|
551
|
+
export { AGENT_TOOLS, APPLICATION_CLASSIFY_PROMPT, type AcroFormFieldInfo, type AgentContext, type BaseDocument, CLASSIFY_DOCUMENT_PROMPT, CLASSIFY_EMAIL_PROMPT, COI_GENERATION_TOOL, COVERAGE_COMPARISON_TOOL, type ClassifyOptions, type CommunicationIntent, type Coverage, DOCUMENT_LOOKUP_TOOL, EXTRACTION_PROMPT, type ExtractOptions, type ExtractSectionsOptions, type FieldMapping, HAIKU_MODEL, type InsuranceDocument, type LogFn, METADATA_PROMPT, MODEL_TOKEN_LIMITS, type ModelConfig, PLATFORM_CONFIGS, type Platform, type PlatformConfig, type PolicyDocument, type PremiumLine, type PromptBuilder, QUOTE_METADATA_PROMPT, type QuoteDocument, SONNET_MODEL, type Section, type Subjectivity, type Subsection, type TextOverlay, type TokenUsage, type ToolDefinition, type UnderwritingCondition, applyExtracted, applyExtractedQuote, buildAcroFormMappingPrompt, buildAgentSystemPrompt, buildAnswerParsingPrompt, buildAutoFillPrompt, buildBatchEmailGenerationPrompt, buildClassifyMessagePrompt, buildCoiRoutingPrompt, buildConfirmationSummaryPrompt, buildConversationMemoryContext, buildConversationMemoryGuidance, buildCoverageGapPrompt, buildDocumentContext, buildFieldExplanationPrompt, buildFieldExtractionPrompt, buildFlatPdfMappingPrompt, buildFormattingPrompt, buildIdentityPrompt, buildIntentPrompt, buildLookupFillPrompt, buildPolicyContext, buildPolicySectionsPrompt, buildQuestionBatchPrompt, buildQuoteSectionsPrompt, buildQuotesPoliciesPrompt, buildReplyIntentClassificationPrompt, buildSafetyPrompt, buildSectionsPrompt, buildSupplementaryEnrichmentPrompt, buildSystemPrompt, classifyDocumentType, createDefaultModelConfig, createUniformModelConfig, enrichSupplementaryFields, extractFromPdf, extractQuoteFromPdf, extractSectionsOnly, fillAcroForm, getAcroFormFields, getPageChunks, mergeChunkedQuoteSections, mergeChunkedSections, overlayTextOnPdf, sanitizeNulls, stripFences };
|
package/dist/index.d.ts
CHANGED
|
@@ -407,10 +407,12 @@ declare function getPageChunks(totalPages: number, chunkSize?: number): Array<[n
|
|
|
407
407
|
* Pass 3: Enrich supplementary fields with structured data.
|
|
408
408
|
* Text-only enrichment call — non-fatal on failure (returns document unchanged).
|
|
409
409
|
*/
|
|
410
|
-
declare function enrichSupplementaryFields(document: any, models?: ModelConfig, log?: LogFn): Promise<any>;
|
|
410
|
+
declare function enrichSupplementaryFields(document: any, models?: ModelConfig, log?: LogFn, onTokenUsage?: (usage: TokenUsage) => void): Promise<any>;
|
|
411
411
|
interface ClassifyOptions {
|
|
412
412
|
log?: LogFn;
|
|
413
413
|
models?: ModelConfig;
|
|
414
|
+
/** Called after each model call with token usage for tracking. */
|
|
415
|
+
onTokenUsage?: (usage: TokenUsage) => void;
|
|
414
416
|
}
|
|
415
417
|
/**
|
|
416
418
|
* Pass 0: Classify document as policy or quote.
|
|
@@ -454,6 +456,11 @@ declare function applyExtractedQuote(extracted: any): {
|
|
|
454
456
|
*/
|
|
455
457
|
declare function mergeChunkedQuoteSections(metadataResult: any, sectionChunks: any[]): any;
|
|
456
458
|
type PromptBuilder = (pageStart: number, pageEnd: number) => string;
|
|
459
|
+
/** Token usage reported per model call. */
|
|
460
|
+
interface TokenUsage {
|
|
461
|
+
inputTokens: number;
|
|
462
|
+
outputTokens: number;
|
|
463
|
+
}
|
|
457
464
|
interface ExtractOptions {
|
|
458
465
|
log?: LogFn;
|
|
459
466
|
onMetadata?: (raw: string) => Promise<void>;
|
|
@@ -462,6 +469,10 @@ interface ExtractOptions {
|
|
|
462
469
|
metadataProviderOptions?: ProviderOptions;
|
|
463
470
|
/** Provider-specific options for fallback calls. Defaults to Anthropic thinking enabled. */
|
|
464
471
|
fallbackProviderOptions?: ProviderOptions;
|
|
472
|
+
/** Maximum number of chunk extractions to run in parallel (default: 2). */
|
|
473
|
+
concurrency?: number;
|
|
474
|
+
/** Called after each model call with token usage for tracking. */
|
|
475
|
+
onTokenUsage?: (usage: TokenUsage) => void;
|
|
465
476
|
}
|
|
466
477
|
/**
|
|
467
478
|
* Full extraction pipeline for policy documents (passes 1-3).
|
|
@@ -483,6 +494,10 @@ interface ExtractSectionsOptions {
|
|
|
483
494
|
models?: ModelConfig;
|
|
484
495
|
/** Provider-specific options for fallback calls. */
|
|
485
496
|
fallbackProviderOptions?: ProviderOptions;
|
|
497
|
+
/** Maximum number of chunk extractions to run in parallel (default: 2). */
|
|
498
|
+
concurrency?: number;
|
|
499
|
+
/** Called after each model call with token usage for tracking. */
|
|
500
|
+
onTokenUsage?: (usage: TokenUsage) => void;
|
|
486
501
|
}
|
|
487
502
|
/**
|
|
488
503
|
* Sections-only extraction: skip pass 1, use saved metadata.
|
|
@@ -533,4 +548,4 @@ interface TextOverlay {
|
|
|
533
548
|
/** Overlay text on a flat PDF at specified coordinates. */
|
|
534
549
|
declare function overlayTextOnPdf(pdfBytes: Uint8Array, overlays: TextOverlay[]): Promise<Uint8Array>;
|
|
535
550
|
|
|
536
|
-
export { AGENT_TOOLS, APPLICATION_CLASSIFY_PROMPT, type AcroFormFieldInfo, type AgentContext, type BaseDocument, CLASSIFY_DOCUMENT_PROMPT, CLASSIFY_EMAIL_PROMPT, COI_GENERATION_TOOL, COVERAGE_COMPARISON_TOOL, type ClassifyOptions, type CommunicationIntent, type Coverage, DOCUMENT_LOOKUP_TOOL, EXTRACTION_PROMPT, type ExtractOptions, type ExtractSectionsOptions, type FieldMapping, HAIKU_MODEL, type InsuranceDocument, type LogFn, METADATA_PROMPT, MODEL_TOKEN_LIMITS, type ModelConfig, PLATFORM_CONFIGS, type Platform, type PlatformConfig, type PolicyDocument, type PremiumLine, type PromptBuilder, QUOTE_METADATA_PROMPT, type QuoteDocument, SONNET_MODEL, type Section, type Subjectivity, type Subsection, type TextOverlay, type ToolDefinition, type UnderwritingCondition, applyExtracted, applyExtractedQuote, buildAcroFormMappingPrompt, buildAgentSystemPrompt, buildAnswerParsingPrompt, buildAutoFillPrompt, buildBatchEmailGenerationPrompt, buildClassifyMessagePrompt, buildCoiRoutingPrompt, buildConfirmationSummaryPrompt, buildConversationMemoryContext, buildConversationMemoryGuidance, buildCoverageGapPrompt, buildDocumentContext, buildFieldExplanationPrompt, buildFieldExtractionPrompt, buildFlatPdfMappingPrompt, buildFormattingPrompt, buildIdentityPrompt, buildIntentPrompt, buildLookupFillPrompt, buildPolicyContext, buildPolicySectionsPrompt, buildQuestionBatchPrompt, buildQuoteSectionsPrompt, buildQuotesPoliciesPrompt, buildReplyIntentClassificationPrompt, buildSafetyPrompt, buildSectionsPrompt, buildSupplementaryEnrichmentPrompt, buildSystemPrompt, classifyDocumentType, createDefaultModelConfig, createUniformModelConfig, enrichSupplementaryFields, extractFromPdf, extractQuoteFromPdf, extractSectionsOnly, fillAcroForm, getAcroFormFields, getPageChunks, mergeChunkedQuoteSections, mergeChunkedSections, overlayTextOnPdf, sanitizeNulls, stripFences };
|
|
551
|
+
export { AGENT_TOOLS, APPLICATION_CLASSIFY_PROMPT, type AcroFormFieldInfo, type AgentContext, type BaseDocument, CLASSIFY_DOCUMENT_PROMPT, CLASSIFY_EMAIL_PROMPT, COI_GENERATION_TOOL, COVERAGE_COMPARISON_TOOL, type ClassifyOptions, type CommunicationIntent, type Coverage, DOCUMENT_LOOKUP_TOOL, EXTRACTION_PROMPT, type ExtractOptions, type ExtractSectionsOptions, type FieldMapping, HAIKU_MODEL, type InsuranceDocument, type LogFn, METADATA_PROMPT, MODEL_TOKEN_LIMITS, type ModelConfig, PLATFORM_CONFIGS, type Platform, type PlatformConfig, type PolicyDocument, type PremiumLine, type PromptBuilder, QUOTE_METADATA_PROMPT, type QuoteDocument, SONNET_MODEL, type Section, type Subjectivity, type Subsection, type TextOverlay, type TokenUsage, type ToolDefinition, type UnderwritingCondition, applyExtracted, applyExtractedQuote, buildAcroFormMappingPrompt, buildAgentSystemPrompt, buildAnswerParsingPrompt, buildAutoFillPrompt, buildBatchEmailGenerationPrompt, buildClassifyMessagePrompt, buildCoiRoutingPrompt, buildConfirmationSummaryPrompt, buildConversationMemoryContext, buildConversationMemoryGuidance, buildCoverageGapPrompt, buildDocumentContext, buildFieldExplanationPrompt, buildFieldExtractionPrompt, buildFlatPdfMappingPrompt, buildFormattingPrompt, buildIdentityPrompt, buildIntentPrompt, buildLookupFillPrompt, buildPolicyContext, buildPolicySectionsPrompt, buildQuestionBatchPrompt, buildQuoteSectionsPrompt, buildQuotesPoliciesPrompt, buildReplyIntentClassificationPrompt, buildSafetyPrompt, buildSectionsPrompt, buildSupplementaryEnrichmentPrompt, buildSystemPrompt, classifyDocumentType, createDefaultModelConfig, createUniformModelConfig, enrichSupplementaryFields, extractFromPdf, extractQuoteFromPdf, extractSectionsOnly, fillAcroForm, getAcroFormFields, getPageChunks, mergeChunkedQuoteSections, mergeChunkedSections, overlayTextOnPdf, sanitizeNulls, stripFences };
|
package/dist/index.js
CHANGED
|
@@ -29870,6 +29870,56 @@ var DEFAULT_METADATA_PROVIDER_OPTIONS = {
|
|
|
29870
29870
|
var DEFAULT_FALLBACK_PROVIDER_OPTIONS = {
|
|
29871
29871
|
anthropic: { thinking: { type: "enabled", budgetTokens: 4096 } }
|
|
29872
29872
|
};
|
|
29873
|
+
var MAX_RETRIES = 5;
|
|
29874
|
+
var BASE_DELAY_MS = 2e3;
|
|
29875
|
+
/**
 * Decide whether a thrown value represents a provider rate-limit failure.
 *
 * A value counts as a rate-limit error when it (or an error it wraps) either
 *   - has a string `message` containing "rate limit", "rate_limit" or
 *     "too many requests" (case-insensitive), or
 *   - has a `status` / `statusCode` of 429 (number or numeric string).
 *
 * Wrapped errors are followed through `lastError` (the property retry wrappers
 * such as the AI SDK's RetryError use for the final failure) and `cause`, so a
 * 429 hidden behind a generic "failed after N attempts" error is still seen.
 *
 * @param error Any thrown value; non-objects are never rate-limit errors.
 * @returns `true` when the value should be treated as a rate limit.
 */
function isRateLimitError(error) {
  const phrases = ["rate limit", "rate_limit", "too many requests"];
  // Bound the walk and remember visited objects so cyclic cause chains end.
  const maxDepth = 5;
  const visited = new Set();
  let current = error;
  for (let depth = 0; depth < maxDepth; depth++) {
    if (typeof current !== "object" || current === null || visited.has(current)) {
      return false;
    }
    visited.add(current);
    // Accept any object carrying a string message, not only Error instances:
    // HTTP clients frequently reject with plain `{ message, status }` objects.
    const message = typeof current.message === "string" ? current.message.toLowerCase() : "";
    if (phrases.some((phrase) => message.includes(phrase))) {
      return true;
    }
    const status = current.status ?? current.statusCode;
    if (status === 429 || status === "429") {
      return true;
    }
    current = current.lastError ?? current.cause;
  }
  return false;
}
|
|
29888
|
+
/**
 * Run `fn`, retrying with exponential backoff while it fails with a
 * retryable (by default: rate-limit) error.
 *
 * The wait before retry number `k` (0-based) is
 * `baseDelayMs * 2^k + random(0..maxJitterMs)`. Any error rejected by
 * `shouldRetry`, or raised once `maxRetries` retries have been spent, is
 * re-thrown unchanged.
 *
 * @param fn   Zero-argument function returning the value or a promise of it.
 * @param log  Optional (possibly async) logger, called once before each wait.
 * @param options Optional tuning; every field defaults to the module-level
 *   behaviour, so existing two-argument calls behave exactly as before.
 *   - `maxRetries`  retries after the first attempt (default `MAX_RETRIES`)
 *   - `baseDelayMs` first backoff delay in ms (default `BASE_DELAY_MS`)
 *   - `maxJitterMs` upper bound of the random jitter in ms (default 1000)
 *   - `shouldRetry` predicate selecting retryable errors
 *                   (default `isRateLimitError`)
 * @returns The value produced by the first successful call of `fn`.
 * @throws The last error from `fn` when it is not retryable or retries ran out.
 */
async function withRetry(fn, log, {
  maxRetries = MAX_RETRIES,
  baseDelayMs = BASE_DELAY_MS,
  maxJitterMs = 1e3,
  shouldRetry = isRateLimitError
} = {}) {
  for (let attempt = 0; ; attempt++) {
    try {
      return await fn();
    } catch (error) {
      if (!shouldRetry(error) || attempt >= maxRetries) {
        throw error;
      }
      // Jitter spreads out concurrent callers that were throttled together.
      const jitter = Math.random() * maxJitterMs;
      const delay = baseDelayMs * Math.pow(2, attempt) + jitter;
      await log?.(`Rate limited, retrying in ${(delay / 1e3).toFixed(1)}s (attempt ${attempt + 1}/${maxRetries})...`);
      await new Promise((resolve) => setTimeout(resolve, delay));
    }
  }
}
|
|
29903
|
+
/**
 * Create a scheduler that runs at most `concurrency` tasks at the same time.
 *
 * The returned function takes a task (a zero-argument function, normally
 * returning a promise) and returns a promise for that task's outcome. Tasks
 * beyond the limit wait in FIFO order and start as earlier ones settle.
 *
 * @param concurrency Maximum number of simultaneously running tasks. Values
 *   that could never admit a task (0, negatives, NaN, non-numbers) fall back
 *   to 1 instead of leaving every scheduled task pending forever.
 * @returns `(fn) => Promise` resolving/rejecting with the outcome of `fn`.
 */
function pLimit(concurrency) {
  const maxActive = concurrency > 0 ? concurrency : 1;
  let active = 0;
  const queue = [];
  function next() {
    if (queue.length > 0 && active < maxActive) {
      active++;
      queue.shift()();
    }
  }
  return (fn) => new Promise((resolve, reject) => {
    const run = () => {
      // Normalise the task result: a synchronous throw (or a non-promise
      // return value) must still settle the caller's promise and free the
      // slot, otherwise every task queued behind it would stall.
      let outcome;
      try {
        outcome = Promise.resolve(fn());
      } catch (error) {
        outcome = Promise.reject(error);
      }
      outcome.then(resolve, reject).finally(() => {
        active--;
        next();
      });
    };
    queue.push(run);
    next();
  });
}
|
|
29873
29923
|
/**
 * Remove a Markdown code fence wrapped around a model response.
 *
 * Strips one leading triple-backtick marker (optionally followed by "json",
 * matched case-insensitively, plus trailing whitespace and an optional
 * newline) at the very start of `text`, and one trailing triple-backtick
 * marker (with an optional preceding newline and trailing whitespace) at the
 * very end. Text without fences is returned unchanged.
 *
 * @param text Raw model output.
 * @returns `text` without the surrounding fence markers.
 */
function stripFences(text) {
|
|
29874
29924
|
return text.replace(/^```(?:json)?\s*\n?/i, "").replace(/\n?```\s*$/i, "");
|
|
29875
29925
|
}
|
|
@@ -29948,48 +29998,56 @@ function getPageChunks(totalPages, chunkSize = 30) {
|
|
|
29948
29998
|
}
|
|
29949
29999
|
return chunks;
|
|
29950
30000
|
}
|
|
29951
|
-
async function callModel(model, pdfBase64, prompt, maxTokens, providerOptions, log) {
|
|
30001
|
+
async function callModel(model, pdfBase64, prompt, maxTokens, providerOptions, log, onTokenUsage) {
|
|
29952
30002
|
await log?.(`Calling model (max ${maxTokens} tokens)...`);
|
|
29953
30003
|
const start = Date.now();
|
|
29954
|
-
const { text, usage } = await (
|
|
29955
|
-
|
|
29956
|
-
|
|
29957
|
-
|
|
29958
|
-
|
|
29959
|
-
|
|
29960
|
-
|
|
29961
|
-
|
|
29962
|
-
|
|
29963
|
-
|
|
29964
|
-
|
|
29965
|
-
|
|
30004
|
+
const { text, usage } = await withRetry(
|
|
30005
|
+
() => (0, import_ai.generateText)({
|
|
30006
|
+
model,
|
|
30007
|
+
maxOutputTokens: maxTokens,
|
|
30008
|
+
messages: [{
|
|
30009
|
+
role: "user",
|
|
30010
|
+
content: [
|
|
30011
|
+
{ type: "file", data: pdfBase64, mediaType: "application/pdf" },
|
|
30012
|
+
{ type: "text", text: prompt }
|
|
30013
|
+
]
|
|
30014
|
+
}],
|
|
30015
|
+
...providerOptions ? { providerOptions } : {}
|
|
30016
|
+
}),
|
|
30017
|
+
log
|
|
30018
|
+
);
|
|
29966
30019
|
const elapsed = ((Date.now() - start) / 1e3).toFixed(1);
|
|
29967
30020
|
const inputTokens = usage?.inputTokens ?? 0;
|
|
29968
30021
|
const outputTokens = usage?.outputTokens ?? 0;
|
|
29969
30022
|
await log?.(`${inputTokens} in / ${outputTokens} out tokens (${elapsed}s)`);
|
|
30023
|
+
onTokenUsage?.({ inputTokens, outputTokens });
|
|
29970
30024
|
return text || "{}";
|
|
29971
30025
|
}
|
|
29972
|
-
async function callModelText(model, prompt, maxTokens, log) {
|
|
30026
|
+
async function callModelText(model, prompt, maxTokens, log, onTokenUsage) {
|
|
29973
30027
|
await log?.(`Calling model text-only (max ${maxTokens} tokens)...`);
|
|
29974
30028
|
const start = Date.now();
|
|
29975
|
-
const { text, usage } = await (
|
|
29976
|
-
|
|
29977
|
-
|
|
29978
|
-
|
|
29979
|
-
|
|
29980
|
-
|
|
29981
|
-
|
|
29982
|
-
|
|
30029
|
+
const { text, usage } = await withRetry(
|
|
30030
|
+
() => (0, import_ai.generateText)({
|
|
30031
|
+
model,
|
|
30032
|
+
maxOutputTokens: maxTokens,
|
|
30033
|
+
messages: [{
|
|
30034
|
+
role: "user",
|
|
30035
|
+
content: prompt
|
|
30036
|
+
}]
|
|
30037
|
+
}),
|
|
30038
|
+
log
|
|
30039
|
+
);
|
|
29983
30040
|
const elapsed = ((Date.now() - start) / 1e3).toFixed(1);
|
|
29984
30041
|
const inputTokens = usage?.inputTokens ?? 0;
|
|
29985
30042
|
const outputTokens = usage?.outputTokens ?? 0;
|
|
29986
30043
|
await log?.(`text: ${inputTokens} in / ${outputTokens} out tokens (${elapsed}s)`);
|
|
30044
|
+
onTokenUsage?.({ inputTokens, outputTokens });
|
|
29987
30045
|
return text || "{}";
|
|
29988
30046
|
}
|
|
29989
30047
|
/**
 * Pick the model configuration to use for a pipeline run.
 *
 * @param models Caller-supplied configuration, or null/undefined.
 * @returns `models` when it is neither null nor undefined; otherwise the
 *   result of `createDefaultModelConfig()`.
 */
function resolveModels(models) {
|
|
29990
30048
|
return models ?? createDefaultModelConfig();
|
|
29991
30049
|
}
|
|
29992
|
-
async function enrichSupplementaryFields(document, models, log) {
|
|
30050
|
+
async function enrichSupplementaryFields(document, models, log, onTokenUsage) {
|
|
29993
30051
|
const fields = {};
|
|
29994
30052
|
if (document.regulatoryContext?.content) {
|
|
29995
30053
|
fields.regulatoryContext = document.regulatoryContext.content;
|
|
@@ -30011,7 +30069,7 @@ async function enrichSupplementaryFields(document, models, log) {
|
|
|
30011
30069
|
try {
|
|
30012
30070
|
const resolved = resolveModels(models);
|
|
30013
30071
|
const prompt = buildSupplementaryEnrichmentPrompt(fields);
|
|
30014
|
-
const raw = await callModelText(resolved.enrichment, prompt, MODEL_TOKEN_LIMITS.enrichment, log);
|
|
30072
|
+
const raw = await callModelText(resolved.enrichment, prompt, MODEL_TOKEN_LIMITS.enrichment, log, onTokenUsage);
|
|
30015
30073
|
const parsed = JSON.parse(stripFences(raw));
|
|
30016
30074
|
const enriched = { ...document };
|
|
30017
30075
|
if (parsed.regulatoryContext && enriched.regulatoryContext) {
|
|
@@ -30046,7 +30104,7 @@ async function enrichSupplementaryFields(document, models, log) {
|
|
|
30046
30104
|
}
|
|
30047
30105
|
}
|
|
30048
30106
|
async function classifyDocumentType(pdfBase64, options) {
|
|
30049
|
-
const { log, models } = options ?? {};
|
|
30107
|
+
const { log, models, onTokenUsage } = options ?? {};
|
|
30050
30108
|
const resolved = resolveModels(models);
|
|
30051
30109
|
await log?.("Pass 0: Classifying document type...");
|
|
30052
30110
|
const raw = await callModel(
|
|
@@ -30055,7 +30113,8 @@ async function classifyDocumentType(pdfBase64, options) {
|
|
|
30055
30113
|
CLASSIFY_DOCUMENT_PROMPT,
|
|
30056
30114
|
MODEL_TOKEN_LIMITS.classification,
|
|
30057
30115
|
void 0,
|
|
30058
|
-
log
|
|
30116
|
+
log,
|
|
30117
|
+
onTokenUsage
|
|
30059
30118
|
);
|
|
30060
30119
|
try {
|
|
30061
30120
|
const parsed = JSON.parse(stripFences(raw));
|
|
@@ -30135,7 +30194,7 @@ function mergeChunkedQuoteSections(metadataResult, sectionChunks) {
|
|
|
30135
30194
|
};
|
|
30136
30195
|
}
|
|
30137
30196
|
var CHUNK_SIZES = [15, 10, 5];
|
|
30138
|
-
async function extractChunkWithRetry(models, pdfBase64, start, end, sizeIndex, promptBuilder, fallbackProviderOptions, log) {
|
|
30197
|
+
async function extractChunkWithRetry(models, pdfBase64, start, end, sizeIndex, promptBuilder, fallbackProviderOptions, log, onTokenUsage, concurrency = 2) {
|
|
30139
30198
|
await log?.(`Pass 2: Extracting sections pages ${start}\u2013${end}...`);
|
|
30140
30199
|
const chunkRaw = await callModel(
|
|
30141
30200
|
models.sections,
|
|
@@ -30143,7 +30202,8 @@ async function extractChunkWithRetry(models, pdfBase64, start, end, sizeIndex, p
|
|
|
30143
30202
|
promptBuilder(start, end),
|
|
30144
30203
|
MODEL_TOKEN_LIMITS.sections,
|
|
30145
30204
|
void 0,
|
|
30146
|
-
log
|
|
30205
|
+
log,
|
|
30206
|
+
onTokenUsage
|
|
30147
30207
|
);
|
|
30148
30208
|
try {
|
|
30149
30209
|
return [JSON.parse(stripFences(chunkRaw))];
|
|
@@ -30157,21 +30217,24 @@ async function extractChunkWithRetry(models, pdfBase64, start, end, sizeIndex, p
|
|
|
30157
30217
|
const subChunks = getPageChunks(pageSpan, smallerSize).map(
|
|
30158
30218
|
([s, e]) => [s + start - 1, e + start - 1]
|
|
30159
30219
|
);
|
|
30160
|
-
const
|
|
30161
|
-
|
|
30162
|
-
|
|
30163
|
-
|
|
30164
|
-
|
|
30165
|
-
|
|
30166
|
-
|
|
30167
|
-
|
|
30168
|
-
|
|
30169
|
-
|
|
30170
|
-
|
|
30171
|
-
|
|
30172
|
-
|
|
30173
|
-
|
|
30174
|
-
|
|
30220
|
+
const limit = pLimit(concurrency);
|
|
30221
|
+
const nestedResults = await Promise.all(
|
|
30222
|
+
subChunks.map(
|
|
30223
|
+
([subStart, subEnd]) => limit(() => extractChunkWithRetry(
|
|
30224
|
+
models,
|
|
30225
|
+
pdfBase64,
|
|
30226
|
+
subStart,
|
|
30227
|
+
subEnd,
|
|
30228
|
+
nextSizeIndex,
|
|
30229
|
+
promptBuilder,
|
|
30230
|
+
fallbackProviderOptions,
|
|
30231
|
+
log,
|
|
30232
|
+
onTokenUsage,
|
|
30233
|
+
concurrency
|
|
30234
|
+
))
|
|
30235
|
+
)
|
|
30236
|
+
);
|
|
30237
|
+
return nestedResults.flat();
|
|
30175
30238
|
}
|
|
30176
30239
|
}
|
|
30177
30240
|
await log?.(`Sections model exhausted for pages ${start}\u2013${end}, falling back...`);
|
|
@@ -30181,7 +30244,8 @@ async function extractChunkWithRetry(models, pdfBase64, start, end, sizeIndex, p
|
|
|
30181
30244
|
promptBuilder(start, end),
|
|
30182
30245
|
MODEL_TOKEN_LIMITS.sectionsFallback,
|
|
30183
30246
|
fallbackProviderOptions,
|
|
30184
|
-
log
|
|
30247
|
+
log,
|
|
30248
|
+
onTokenUsage
|
|
30185
30249
|
);
|
|
30186
30250
|
try {
|
|
30187
30251
|
return [JSON.parse(stripFences(fallbackRaw))];
|
|
@@ -30192,23 +30256,26 @@ async function extractChunkWithRetry(models, pdfBase64, start, end, sizeIndex, p
|
|
|
30192
30256
|
}
|
|
30193
30257
|
}
|
|
30194
30258
|
}
|
|
30195
|
-
async function extractSectionChunks(models, pdfBase64, pageCount, promptBuilder = buildSectionsPrompt, fallbackProviderOptions, log) {
|
|
30259
|
+
async function extractSectionChunks(models, pdfBase64, pageCount, promptBuilder = buildSectionsPrompt, fallbackProviderOptions, log, onTokenUsage, concurrency = 2) {
|
|
30196
30260
|
const chunks = getPageChunks(pageCount, CHUNK_SIZES[0]);
|
|
30197
|
-
const
|
|
30198
|
-
|
|
30199
|
-
|
|
30200
|
-
|
|
30201
|
-
|
|
30202
|
-
|
|
30203
|
-
|
|
30204
|
-
|
|
30205
|
-
|
|
30206
|
-
|
|
30207
|
-
|
|
30208
|
-
|
|
30209
|
-
|
|
30210
|
-
|
|
30211
|
-
|
|
30261
|
+
const limit = pLimit(concurrency);
|
|
30262
|
+
const nestedResults = await Promise.all(
|
|
30263
|
+
chunks.map(
|
|
30264
|
+
([start, end]) => limit(() => extractChunkWithRetry(
|
|
30265
|
+
models,
|
|
30266
|
+
pdfBase64,
|
|
30267
|
+
start,
|
|
30268
|
+
end,
|
|
30269
|
+
0,
|
|
30270
|
+
promptBuilder,
|
|
30271
|
+
fallbackProviderOptions,
|
|
30272
|
+
log,
|
|
30273
|
+
onTokenUsage,
|
|
30274
|
+
concurrency
|
|
30275
|
+
))
|
|
30276
|
+
)
|
|
30277
|
+
);
|
|
30278
|
+
return nestedResults.flat();
|
|
30212
30279
|
}
|
|
30213
30280
|
async function extractFromPdf(pdfBase64, options) {
|
|
30214
30281
|
const {
|
|
@@ -30216,7 +30283,9 @@ async function extractFromPdf(pdfBase64, options) {
|
|
|
30216
30283
|
onMetadata,
|
|
30217
30284
|
models,
|
|
30218
30285
|
metadataProviderOptions = DEFAULT_METADATA_PROVIDER_OPTIONS,
|
|
30219
|
-
fallbackProviderOptions = DEFAULT_FALLBACK_PROVIDER_OPTIONS
|
|
30286
|
+
fallbackProviderOptions = DEFAULT_FALLBACK_PROVIDER_OPTIONS,
|
|
30287
|
+
concurrency = 2,
|
|
30288
|
+
onTokenUsage
|
|
30220
30289
|
} = options ?? {};
|
|
30221
30290
|
const resolved = resolveModels(models);
|
|
30222
30291
|
await log?.("Pass 1: Extracting metadata...");
|
|
@@ -30226,7 +30295,8 @@ async function extractFromPdf(pdfBase64, options) {
|
|
|
30226
30295
|
METADATA_PROMPT,
|
|
30227
30296
|
MODEL_TOKEN_LIMITS.metadata,
|
|
30228
30297
|
metadataProviderOptions,
|
|
30229
|
-
log
|
|
30298
|
+
log,
|
|
30299
|
+
onTokenUsage
|
|
30230
30300
|
);
|
|
30231
30301
|
let metadataResult;
|
|
30232
30302
|
try {
|
|
@@ -30245,12 +30315,14 @@ async function extractFromPdf(pdfBase64, options) {
|
|
|
30245
30315
|
pageCount,
|
|
30246
30316
|
buildSectionsPrompt,
|
|
30247
30317
|
fallbackProviderOptions,
|
|
30248
|
-
log
|
|
30318
|
+
log,
|
|
30319
|
+
onTokenUsage,
|
|
30320
|
+
concurrency
|
|
30249
30321
|
);
|
|
30250
30322
|
await log?.("Merging extraction results...");
|
|
30251
30323
|
const merged = mergeChunkedSections(metadataResult, sectionChunks);
|
|
30252
30324
|
if (merged.document) {
|
|
30253
|
-
merged.document = await enrichSupplementaryFields(merged.document, resolved, log);
|
|
30325
|
+
merged.document = await enrichSupplementaryFields(merged.document, resolved, log, onTokenUsage);
|
|
30254
30326
|
}
|
|
30255
30327
|
const mergedRaw = JSON.stringify(merged);
|
|
30256
30328
|
return { rawText: mergedRaw, extracted: merged };
|
|
@@ -30260,7 +30332,9 @@ async function extractSectionsOnly(pdfBase64, metadataRaw, options) {
|
|
|
30260
30332
|
log,
|
|
30261
30333
|
promptBuilder = buildSectionsPrompt,
|
|
30262
30334
|
models,
|
|
30263
|
-
fallbackProviderOptions = DEFAULT_FALLBACK_PROVIDER_OPTIONS
|
|
30335
|
+
fallbackProviderOptions = DEFAULT_FALLBACK_PROVIDER_OPTIONS,
|
|
30336
|
+
concurrency = 2,
|
|
30337
|
+
onTokenUsage
|
|
30264
30338
|
} = options ?? {};
|
|
30265
30339
|
const resolved = resolveModels(models);
|
|
30266
30340
|
await log?.("Using saved metadata, skipping pass 1...");
|
|
@@ -30278,12 +30352,14 @@ async function extractSectionsOnly(pdfBase64, metadataRaw, options) {
|
|
|
30278
30352
|
pageCount,
|
|
30279
30353
|
promptBuilder,
|
|
30280
30354
|
fallbackProviderOptions,
|
|
30281
|
-
log
|
|
30355
|
+
log,
|
|
30356
|
+
onTokenUsage,
|
|
30357
|
+
concurrency
|
|
30282
30358
|
);
|
|
30283
30359
|
await log?.("Merging extraction results...");
|
|
30284
30360
|
const merged = mergeChunkedSections(metadataResult, sectionChunks);
|
|
30285
30361
|
if (merged.document) {
|
|
30286
|
-
merged.document = await enrichSupplementaryFields(merged.document, resolved, log);
|
|
30362
|
+
merged.document = await enrichSupplementaryFields(merged.document, resolved, log, onTokenUsage);
|
|
30287
30363
|
}
|
|
30288
30364
|
const mergedRaw = JSON.stringify(merged);
|
|
30289
30365
|
return { rawText: mergedRaw, extracted: merged };
|
|
@@ -30294,7 +30370,9 @@ async function extractQuoteFromPdf(pdfBase64, options) {
|
|
|
30294
30370
|
onMetadata,
|
|
30295
30371
|
models,
|
|
30296
30372
|
metadataProviderOptions = DEFAULT_METADATA_PROVIDER_OPTIONS,
|
|
30297
|
-
fallbackProviderOptions = DEFAULT_FALLBACK_PROVIDER_OPTIONS
|
|
30373
|
+
fallbackProviderOptions = DEFAULT_FALLBACK_PROVIDER_OPTIONS,
|
|
30374
|
+
concurrency = 2,
|
|
30375
|
+
onTokenUsage
|
|
30298
30376
|
} = options ?? {};
|
|
30299
30377
|
const resolved = resolveModels(models);
|
|
30300
30378
|
await log?.("Pass 1: Extracting quote metadata...");
|
|
@@ -30304,7 +30382,8 @@ async function extractQuoteFromPdf(pdfBase64, options) {
|
|
|
30304
30382
|
QUOTE_METADATA_PROMPT,
|
|
30305
30383
|
MODEL_TOKEN_LIMITS.metadata,
|
|
30306
30384
|
metadataProviderOptions,
|
|
30307
|
-
log
|
|
30385
|
+
log,
|
|
30386
|
+
onTokenUsage
|
|
30308
30387
|
);
|
|
30309
30388
|
let metadataResult;
|
|
30310
30389
|
try {
|
|
@@ -30323,7 +30402,9 @@ async function extractQuoteFromPdf(pdfBase64, options) {
|
|
|
30323
30402
|
pageCount,
|
|
30324
30403
|
buildQuoteSectionsPrompt,
|
|
30325
30404
|
fallbackProviderOptions,
|
|
30326
|
-
log
|
|
30405
|
+
log,
|
|
30406
|
+
onTokenUsage,
|
|
30407
|
+
concurrency
|
|
30327
30408
|
);
|
|
30328
30409
|
await log?.("Merging quote extraction results...");
|
|
30329
30410
|
const merged = mergeChunkedQuoteSections(metadataResult, sectionChunks);
|