@claritylabs/cl-sdk 1.0.0 → 1.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of @claritylabs/cl-sdk might be problematic. Click here for more details.
- package/README.md +41 -1
- package/dist/index.d.mts +17 -2
- package/dist/index.d.ts +17 -2
- package/dist/index.js +151 -70
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +151 -70
- package/dist/index.mjs.map +1 -1
- package/package.json +1 -1
package/dist/index.mjs
CHANGED
|
@@ -29787,6 +29787,56 @@ var DEFAULT_METADATA_PROVIDER_OPTIONS = {
|
|
|
29787
29787
|
// Provider options applied when a chunk falls back to the fallback model:
// enables Anthropic extended thinking with a 4096-token thinking budget.
var DEFAULT_FALLBACK_PROVIDER_OPTIONS = {
  anthropic: { thinking: { type: "enabled", budgetTokens: 4096 } }
};
|
|
29790
|
+
// Retry policy for rate-limited model calls (consumed by withRetry):
// MAX_RETRIES caps the number of additional attempts after the first call.
var MAX_RETRIES = 5;
// Base delay (ms) for exponential backoff; doubled per attempt, plus jitter.
var BASE_DELAY_MS = 2e3;
|
|
29792
|
+
/**
 * Heuristically decides whether a thrown value represents a provider
 * rate-limit error, so callers can retry instead of failing outright.
 *
 * Two signals are checked: a rate-limit phrase in an Error's message, and an
 * HTTP-style `status`/`statusCode` of 429 on any object-shaped error.
 *
 * @param {unknown} error - Value caught from a model call.
 * @returns {boolean} True when the error looks like a rate limit.
 */
function isRateLimitError(error) {
  if (error instanceof Error) {
    const lowered = error.message.toLowerCase();
    const markers = ["rate limit", "rate_limit", "too many requests"];
    if (markers.some((marker) => lowered.includes(marker))) {
      return true;
    }
  }
  if (typeof error === "object" && error !== null) {
    // Accept either spelling; some SDKs use `status`, others `statusCode`.
    const httpStatus = error.status ?? error.statusCode;
    if (httpStatus === 429) {
      return true;
    }
  }
  return false;
}
|
|
29805
|
+
/**
 * Runs `fn` and retries it on rate-limit errors with exponential backoff
 * plus random jitter. Non-rate-limit errors, and rate-limit errors after
 * MAX_RETRIES retries, are rethrown to the caller.
 *
 * @param {() => Promise<any>} fn - Zero-arg async operation to attempt.
 * @param {(msg: string) => (void | Promise<void>)} [log] - Optional logger.
 * @returns {Promise<any>} Resolves with `fn`'s result on first success.
 */
async function withRetry(fn, log) {
  let attempt = 0;
  while (true) {
    try {
      return await fn();
    } catch (error) {
      const retryable = isRateLimitError(error) && attempt < MAX_RETRIES;
      if (!retryable) {
        throw error;
      }
      // Exponential backoff with up to 1s of jitter to avoid thundering herd.
      const jitter = Math.random() * 1e3;
      const delay = BASE_DELAY_MS * 2 ** attempt + jitter;
      await log?.(`Rate limited, retrying in ${(delay / 1e3).toFixed(1)}s (attempt ${attempt + 1}/${MAX_RETRIES})...`);
      await new Promise((resolve) => setTimeout(resolve, delay));
      attempt += 1;
    }
  }
}
|
|
29820
|
+
/**
 * Creates a concurrency limiter: the returned function accepts a zero-arg
 * task (usually async) and resolves/rejects with its result, running at most
 * `concurrency` tasks at a time; excess tasks wait FIFO.
 *
 * Fix over the previous version: `fn` is invoked via
 * `Promise.resolve().then(fn)` so a *synchronous* throw from `fn` becomes a
 * rejection of the returned promise instead of escaping the scheduler —
 * previously that skipped the `.finally` bookkeeping and permanently leaked
 * an `active` slot (and, for queued tasks, surfaced as an uncaught error).
 *
 * @param {number} concurrency - Maximum number of tasks running at once.
 * @returns {(fn: () => any) => Promise<any>} Task-scheduling wrapper.
 */
function pLimit(concurrency) {
  let active = 0;
  const queue = [];
  function next() {
    if (queue.length > 0 && active < concurrency) {
      active++;
      queue.shift()();
    }
  }
  return (fn) => new Promise((resolve, reject) => {
    const run = () => {
      // Promise.resolve().then(fn) routes sync throws into the reject path
      // and guarantees the finally-block releases the slot either way.
      Promise.resolve().then(fn).then(resolve, reject).finally(() => {
        active--;
        next();
      });
    };
    queue.push(run);
    next();
  });
}
|
|
29790
29840
|
/**
 * Removes a leading ``` / ```json fence and a trailing ``` fence from a model
 * response so the remainder can be passed to JSON.parse. Text without fences
 * is returned unchanged.
 *
 * @param {string} text - Raw model output, possibly fenced.
 * @returns {string} The text with surrounding code fences stripped.
 */
function stripFences(text) {
  const withoutOpening = text.replace(/^```(?:json)?\s*\n?/i, "");
  const withoutClosing = withoutOpening.replace(/\n?```\s*$/i, "");
  return withoutClosing;
}
|
|
@@ -29865,48 +29915,56 @@ function getPageChunks(totalPages, chunkSize = 30) {
|
|
|
29865
29915
|
}
|
|
29866
29916
|
return chunks;
|
|
29867
29917
|
}
|
|
29868
|
-
async function callModel(model, pdfBase64, prompt, maxTokens, providerOptions, log) {
|
|
29918
|
+
/**
 * Sends a PDF (base64) together with a text prompt to the given model and
 * returns the raw response text. The call is wrapped in withRetry so
 * rate-limit errors are retried with backoff, and token usage is both logged
 * and reported through the optional onTokenUsage callback.
 *
 * @param {any} model - Model handle passed through to generateText.
 * @param {string} pdfBase64 - PDF content, base64-encoded.
 * @param {string} prompt - Instruction text sent alongside the PDF.
 * @param {number} maxTokens - Output token cap for the call.
 * @param {any} [providerOptions] - Optional provider-specific options.
 * @param {(msg: string) => (void | Promise<void>)} [log] - Optional logger.
 * @param {(u: {inputTokens: number, outputTokens: number}) => void} [onTokenUsage]
 *   - Optional per-call token-usage callback.
 * @returns {Promise<string>} Response text, or "{}" when the model returned none.
 */
async function callModel(model, pdfBase64, prompt, maxTokens, providerOptions, log, onTokenUsage) {
  await log?.(`Calling model (max ${maxTokens} tokens)...`);
  const startedAt = Date.now();
  // Build the request inside the closure so each retry attempt gets a
  // freshly constructed request object.
  const performCall = () => {
    const request = {
      model,
      maxOutputTokens: maxTokens,
      messages: [{
        role: "user",
        content: [
          { type: "file", data: pdfBase64, mediaType: "application/pdf" },
          { type: "text", text: prompt }
        ]
      }]
    };
    if (providerOptions) {
      request.providerOptions = providerOptions;
    }
    return generateText(request);
  };
  const { text, usage } = await withRetry(performCall, log);
  const elapsed = ((Date.now() - startedAt) / 1e3).toFixed(1);
  const inputTokens = usage?.inputTokens ?? 0;
  const outputTokens = usage?.outputTokens ?? 0;
  await log?.(`${inputTokens} in / ${outputTokens} out tokens (${elapsed}s)`);
  onTokenUsage?.({ inputTokens, outputTokens });
  return text || "{}";
}
|
|
29889
|
-
async function callModelText(model, prompt, maxTokens, log) {
|
|
29943
|
+
/**
 * Sends a text-only prompt to the given model and returns the raw response
 * text. Mirrors callModel but without the PDF attachment: retried via
 * withRetry on rate limits, with token usage logged and reported through the
 * optional onTokenUsage callback.
 *
 * @param {any} model - Model handle passed through to generateText.
 * @param {string} prompt - Prompt text sent as the sole user message.
 * @param {number} maxTokens - Output token cap for the call.
 * @param {(msg: string) => (void | Promise<void>)} [log] - Optional logger.
 * @param {(u: {inputTokens: number, outputTokens: number}) => void} [onTokenUsage]
 *   - Optional per-call token-usage callback.
 * @returns {Promise<string>} Response text, or "{}" when the model returned none.
 */
async function callModelText(model, prompt, maxTokens, log, onTokenUsage) {
  await log?.(`Calling model text-only (max ${maxTokens} tokens)...`);
  const startedAt = Date.now();
  const performCall = () => generateText({
    model,
    maxOutputTokens: maxTokens,
    messages: [{ role: "user", content: prompt }]
  });
  const { text, usage } = await withRetry(performCall, log);
  const elapsed = ((Date.now() - startedAt) / 1e3).toFixed(1);
  const inputTokens = usage?.inputTokens ?? 0;
  const outputTokens = usage?.outputTokens ?? 0;
  await log?.(`text: ${inputTokens} in / ${outputTokens} out tokens (${elapsed}s)`);
  onTokenUsage?.({ inputTokens, outputTokens });
  return text || "{}";
}
|
|
29906
29964
|
/**
 * Returns the caller-supplied model configuration, or builds the default
 * configuration when none (null/undefined) was provided.
 *
 * @param {any} [models] - Optional model configuration.
 * @returns {any} The resolved model configuration.
 */
function resolveModels(models) {
  if (models != null) {
    return models;
  }
  return createDefaultModelConfig();
}
|
|
29909
|
-
async function enrichSupplementaryFields(document, models, log) {
|
|
29967
|
+
async function enrichSupplementaryFields(document, models, log, onTokenUsage) {
|
|
29910
29968
|
const fields = {};
|
|
29911
29969
|
if (document.regulatoryContext?.content) {
|
|
29912
29970
|
fields.regulatoryContext = document.regulatoryContext.content;
|
|
@@ -29928,7 +29986,7 @@ async function enrichSupplementaryFields(document, models, log) {
|
|
|
29928
29986
|
try {
|
|
29929
29987
|
const resolved = resolveModels(models);
|
|
29930
29988
|
const prompt = buildSupplementaryEnrichmentPrompt(fields);
|
|
29931
|
-
const raw = await callModelText(resolved.enrichment, prompt, MODEL_TOKEN_LIMITS.enrichment, log);
|
|
29989
|
+
const raw = await callModelText(resolved.enrichment, prompt, MODEL_TOKEN_LIMITS.enrichment, log, onTokenUsage);
|
|
29932
29990
|
const parsed = JSON.parse(stripFences(raw));
|
|
29933
29991
|
const enriched = { ...document };
|
|
29934
29992
|
if (parsed.regulatoryContext && enriched.regulatoryContext) {
|
|
@@ -29963,7 +30021,7 @@ async function enrichSupplementaryFields(document, models, log) {
|
|
|
29963
30021
|
}
|
|
29964
30022
|
}
|
|
29965
30023
|
async function classifyDocumentType(pdfBase64, options) {
|
|
29966
|
-
const { log, models } = options ?? {};
|
|
30024
|
+
const { log, models, onTokenUsage } = options ?? {};
|
|
29967
30025
|
const resolved = resolveModels(models);
|
|
29968
30026
|
await log?.("Pass 0: Classifying document type...");
|
|
29969
30027
|
const raw = await callModel(
|
|
@@ -29972,7 +30030,8 @@ async function classifyDocumentType(pdfBase64, options) {
|
|
|
29972
30030
|
CLASSIFY_DOCUMENT_PROMPT,
|
|
29973
30031
|
MODEL_TOKEN_LIMITS.classification,
|
|
29974
30032
|
void 0,
|
|
29975
|
-
log
|
|
30033
|
+
log,
|
|
30034
|
+
onTokenUsage
|
|
29976
30035
|
);
|
|
29977
30036
|
try {
|
|
29978
30037
|
const parsed = JSON.parse(stripFences(raw));
|
|
@@ -30052,7 +30111,7 @@ function mergeChunkedQuoteSections(metadataResult, sectionChunks) {
|
|
|
30052
30111
|
};
|
|
30053
30112
|
}
|
|
30054
30113
|
var CHUNK_SIZES = [15, 10, 5];
|
|
30055
|
-
async function extractChunkWithRetry(models, pdfBase64, start, end, sizeIndex, promptBuilder, fallbackProviderOptions, log) {
|
|
30114
|
+
async function extractChunkWithRetry(models, pdfBase64, start, end, sizeIndex, promptBuilder, fallbackProviderOptions, log, onTokenUsage, concurrency = 2) {
|
|
30056
30115
|
await log?.(`Pass 2: Extracting sections pages ${start}\u2013${end}...`);
|
|
30057
30116
|
const chunkRaw = await callModel(
|
|
30058
30117
|
models.sections,
|
|
@@ -30060,7 +30119,8 @@ async function extractChunkWithRetry(models, pdfBase64, start, end, sizeIndex, p
|
|
|
30060
30119
|
promptBuilder(start, end),
|
|
30061
30120
|
MODEL_TOKEN_LIMITS.sections,
|
|
30062
30121
|
void 0,
|
|
30063
|
-
log
|
|
30122
|
+
log,
|
|
30123
|
+
onTokenUsage
|
|
30064
30124
|
);
|
|
30065
30125
|
try {
|
|
30066
30126
|
return [JSON.parse(stripFences(chunkRaw))];
|
|
@@ -30074,21 +30134,24 @@ async function extractChunkWithRetry(models, pdfBase64, start, end, sizeIndex, p
|
|
|
30074
30134
|
const subChunks = getPageChunks(pageSpan, smallerSize).map(
|
|
30075
30135
|
([s, e]) => [s + start - 1, e + start - 1]
|
|
30076
30136
|
);
|
|
30077
|
-
const
|
|
30078
|
-
|
|
30079
|
-
|
|
30080
|
-
|
|
30081
|
-
|
|
30082
|
-
|
|
30083
|
-
|
|
30084
|
-
|
|
30085
|
-
|
|
30086
|
-
|
|
30087
|
-
|
|
30088
|
-
|
|
30089
|
-
|
|
30090
|
-
|
|
30091
|
-
|
|
30137
|
+
const limit = pLimit(concurrency);
|
|
30138
|
+
const nestedResults = await Promise.all(
|
|
30139
|
+
subChunks.map(
|
|
30140
|
+
([subStart, subEnd]) => limit(() => extractChunkWithRetry(
|
|
30141
|
+
models,
|
|
30142
|
+
pdfBase64,
|
|
30143
|
+
subStart,
|
|
30144
|
+
subEnd,
|
|
30145
|
+
nextSizeIndex,
|
|
30146
|
+
promptBuilder,
|
|
30147
|
+
fallbackProviderOptions,
|
|
30148
|
+
log,
|
|
30149
|
+
onTokenUsage,
|
|
30150
|
+
concurrency
|
|
30151
|
+
))
|
|
30152
|
+
)
|
|
30153
|
+
);
|
|
30154
|
+
return nestedResults.flat();
|
|
30092
30155
|
}
|
|
30093
30156
|
}
|
|
30094
30157
|
await log?.(`Sections model exhausted for pages ${start}\u2013${end}, falling back...`);
|
|
@@ -30098,7 +30161,8 @@ async function extractChunkWithRetry(models, pdfBase64, start, end, sizeIndex, p
|
|
|
30098
30161
|
promptBuilder(start, end),
|
|
30099
30162
|
MODEL_TOKEN_LIMITS.sectionsFallback,
|
|
30100
30163
|
fallbackProviderOptions,
|
|
30101
|
-
log
|
|
30164
|
+
log,
|
|
30165
|
+
onTokenUsage
|
|
30102
30166
|
);
|
|
30103
30167
|
try {
|
|
30104
30168
|
return [JSON.parse(stripFences(fallbackRaw))];
|
|
@@ -30109,23 +30173,26 @@ async function extractChunkWithRetry(models, pdfBase64, start, end, sizeIndex, p
|
|
|
30109
30173
|
}
|
|
30110
30174
|
}
|
|
30111
30175
|
}
|
|
30112
|
-
async function extractSectionChunks(models, pdfBase64, pageCount, promptBuilder = buildSectionsPrompt, fallbackProviderOptions, log) {
|
|
30176
|
+
/**
 * Splits the document into page ranges of CHUNK_SIZES[0] pages and extracts
 * each range via extractChunkWithRetry, running at most `concurrency` chunk
 * extractions in parallel through a pLimit scheduler.
 *
 * @param {any} models - Resolved model configuration.
 * @param {string} pdfBase64 - PDF content, base64-encoded.
 * @param {number} pageCount - Total number of pages in the document.
 * @param {Function} [promptBuilder] - Builds the per-range extraction prompt.
 * @param {any} [fallbackProviderOptions] - Provider options for fallback calls.
 * @param {(msg: string) => (void | Promise<void>)} [log] - Optional logger.
 * @param {Function} [onTokenUsage] - Optional token-usage callback.
 * @param {number} [concurrency=2] - Max parallel chunk extractions.
 * @returns {Promise<any[]>} Flattened per-chunk extraction results.
 */
async function extractSectionChunks(models, pdfBase64, pageCount, promptBuilder = buildSectionsPrompt, fallbackProviderOptions, log, onTokenUsage, concurrency = 2) {
  const limit = pLimit(concurrency);
  const pageRanges = getPageChunks(pageCount, CHUNK_SIZES[0]);
  const chunkTasks = pageRanges.map(([rangeStart, rangeEnd]) =>
    limit(() => extractChunkWithRetry(
      models,
      pdfBase64,
      rangeStart,
      rangeEnd,
      0,
      promptBuilder,
      fallbackProviderOptions,
      log,
      onTokenUsage,
      concurrency
    ))
  );
  const perChunkResults = await Promise.all(chunkTasks);
  return perChunkResults.flat();
}
|
|
30130
30197
|
async function extractFromPdf(pdfBase64, options) {
|
|
30131
30198
|
const {
|
|
@@ -30133,7 +30200,9 @@ async function extractFromPdf(pdfBase64, options) {
|
|
|
30133
30200
|
onMetadata,
|
|
30134
30201
|
models,
|
|
30135
30202
|
metadataProviderOptions = DEFAULT_METADATA_PROVIDER_OPTIONS,
|
|
30136
|
-
fallbackProviderOptions = DEFAULT_FALLBACK_PROVIDER_OPTIONS
|
|
30203
|
+
fallbackProviderOptions = DEFAULT_FALLBACK_PROVIDER_OPTIONS,
|
|
30204
|
+
concurrency = 2,
|
|
30205
|
+
onTokenUsage
|
|
30137
30206
|
} = options ?? {};
|
|
30138
30207
|
const resolved = resolveModels(models);
|
|
30139
30208
|
await log?.("Pass 1: Extracting metadata...");
|
|
@@ -30143,7 +30212,8 @@ async function extractFromPdf(pdfBase64, options) {
|
|
|
30143
30212
|
METADATA_PROMPT,
|
|
30144
30213
|
MODEL_TOKEN_LIMITS.metadata,
|
|
30145
30214
|
metadataProviderOptions,
|
|
30146
|
-
log
|
|
30215
|
+
log,
|
|
30216
|
+
onTokenUsage
|
|
30147
30217
|
);
|
|
30148
30218
|
let metadataResult;
|
|
30149
30219
|
try {
|
|
@@ -30162,12 +30232,14 @@ async function extractFromPdf(pdfBase64, options) {
|
|
|
30162
30232
|
pageCount,
|
|
30163
30233
|
buildSectionsPrompt,
|
|
30164
30234
|
fallbackProviderOptions,
|
|
30165
|
-
log
|
|
30235
|
+
log,
|
|
30236
|
+
onTokenUsage,
|
|
30237
|
+
concurrency
|
|
30166
30238
|
);
|
|
30167
30239
|
await log?.("Merging extraction results...");
|
|
30168
30240
|
const merged = mergeChunkedSections(metadataResult, sectionChunks);
|
|
30169
30241
|
if (merged.document) {
|
|
30170
|
-
merged.document = await enrichSupplementaryFields(merged.document, resolved, log);
|
|
30242
|
+
merged.document = await enrichSupplementaryFields(merged.document, resolved, log, onTokenUsage);
|
|
30171
30243
|
}
|
|
30172
30244
|
const mergedRaw = JSON.stringify(merged);
|
|
30173
30245
|
return { rawText: mergedRaw, extracted: merged };
|
|
@@ -30177,7 +30249,9 @@ async function extractSectionsOnly(pdfBase64, metadataRaw, options) {
|
|
|
30177
30249
|
log,
|
|
30178
30250
|
promptBuilder = buildSectionsPrompt,
|
|
30179
30251
|
models,
|
|
30180
|
-
fallbackProviderOptions = DEFAULT_FALLBACK_PROVIDER_OPTIONS
|
|
30252
|
+
fallbackProviderOptions = DEFAULT_FALLBACK_PROVIDER_OPTIONS,
|
|
30253
|
+
concurrency = 2,
|
|
30254
|
+
onTokenUsage
|
|
30181
30255
|
} = options ?? {};
|
|
30182
30256
|
const resolved = resolveModels(models);
|
|
30183
30257
|
await log?.("Using saved metadata, skipping pass 1...");
|
|
@@ -30195,12 +30269,14 @@ async function extractSectionsOnly(pdfBase64, metadataRaw, options) {
|
|
|
30195
30269
|
pageCount,
|
|
30196
30270
|
promptBuilder,
|
|
30197
30271
|
fallbackProviderOptions,
|
|
30198
|
-
log
|
|
30272
|
+
log,
|
|
30273
|
+
onTokenUsage,
|
|
30274
|
+
concurrency
|
|
30199
30275
|
);
|
|
30200
30276
|
await log?.("Merging extraction results...");
|
|
30201
30277
|
const merged = mergeChunkedSections(metadataResult, sectionChunks);
|
|
30202
30278
|
if (merged.document) {
|
|
30203
|
-
merged.document = await enrichSupplementaryFields(merged.document, resolved, log);
|
|
30279
|
+
merged.document = await enrichSupplementaryFields(merged.document, resolved, log, onTokenUsage);
|
|
30204
30280
|
}
|
|
30205
30281
|
const mergedRaw = JSON.stringify(merged);
|
|
30206
30282
|
return { rawText: mergedRaw, extracted: merged };
|
|
@@ -30211,7 +30287,9 @@ async function extractQuoteFromPdf(pdfBase64, options) {
|
|
|
30211
30287
|
onMetadata,
|
|
30212
30288
|
models,
|
|
30213
30289
|
metadataProviderOptions = DEFAULT_METADATA_PROVIDER_OPTIONS,
|
|
30214
|
-
fallbackProviderOptions = DEFAULT_FALLBACK_PROVIDER_OPTIONS
|
|
30290
|
+
fallbackProviderOptions = DEFAULT_FALLBACK_PROVIDER_OPTIONS,
|
|
30291
|
+
concurrency = 2,
|
|
30292
|
+
onTokenUsage
|
|
30215
30293
|
} = options ?? {};
|
|
30216
30294
|
const resolved = resolveModels(models);
|
|
30217
30295
|
await log?.("Pass 1: Extracting quote metadata...");
|
|
@@ -30221,7 +30299,8 @@ async function extractQuoteFromPdf(pdfBase64, options) {
|
|
|
30221
30299
|
QUOTE_METADATA_PROMPT,
|
|
30222
30300
|
MODEL_TOKEN_LIMITS.metadata,
|
|
30223
30301
|
metadataProviderOptions,
|
|
30224
|
-
log
|
|
30302
|
+
log,
|
|
30303
|
+
onTokenUsage
|
|
30225
30304
|
);
|
|
30226
30305
|
let metadataResult;
|
|
30227
30306
|
try {
|
|
@@ -30240,7 +30319,9 @@ async function extractQuoteFromPdf(pdfBase64, options) {
|
|
|
30240
30319
|
pageCount,
|
|
30241
30320
|
buildQuoteSectionsPrompt,
|
|
30242
30321
|
fallbackProviderOptions,
|
|
30243
|
-
log
|
|
30322
|
+
log,
|
|
30323
|
+
onTokenUsage,
|
|
30324
|
+
concurrency
|
|
30244
30325
|
);
|
|
30245
30326
|
await log?.("Merging quote extraction results...");
|
|
30246
30327
|
const merged = mergeChunkedQuoteSections(metadataResult, sectionChunks);
|