@claritylabs/cl-sdk 1.1.2 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of @claritylabs/cl-sdk might be problematic. Click here for more details.
- package/dist/index.d.mts +15 -1
- package/dist/index.d.ts +15 -1
- package/dist/index.js +146 -97
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +152 -105
- package/dist/index.mjs.map +1 -1
- package/package.json +5 -1
package/dist/index.mjs
CHANGED
|
@@ -29779,6 +29779,132 @@ var AGENT_TOOLS = [
|
|
|
29779
29779
|
|
|
29780
29780
|
// src/extraction/pipeline.ts
|
|
29781
29781
|
import { generateText } from "ai";
|
|
29782
|
+
|
|
29783
|
+
// src/extraction/pdf.ts
|
|
29784
|
+
import {
|
|
29785
|
+
PDFDocument,
|
|
29786
|
+
PDFTextField,
|
|
29787
|
+
PDFCheckBox,
|
|
29788
|
+
PDFDropdown,
|
|
29789
|
+
PDFRadioGroup,
|
|
29790
|
+
StandardFonts,
|
|
29791
|
+
rgb
|
|
29792
|
+
} from "pdf-lib";
|
|
29793
|
+
/**
 * Builds a new PDF containing only an inclusive, 1-based page range of the
 * given base64-encoded PDF.
 *
 * The range is clamped to the document: a `startPage` below 1 is treated as
 * page 1 and an `endPage` past the last page is treated as the last page.
 * When the clamped range covers the whole document the input string is
 * returned unchanged, without re-serializing the PDF.
 *
 * @param {string} pdfBase64 - Source PDF, base64-encoded.
 * @param {number} startPage - First page to keep (1-based).
 * @param {number} endPage - Last page to keep (1-based, inclusive).
 * @returns {Promise<string>} Base64 of the extracted pages, or `pdfBase64`
 *   itself when every page was requested.
 * @throws {RangeError} When the clamped range selects no pages (inverted
 *   range, start beyond the last page, NaN or fractional bounds).
 */
async function extractPageRange(pdfBase64, startPage, endPage) {
  const hasBuffer = typeof Buffer !== "undefined";
  // Buffer is used when present; otherwise fall back to atob().
  const srcBytes = hasBuffer
    ? Buffer.from(pdfBase64, "base64")
    : Uint8Array.from(atob(pdfBase64), (c) => c.charCodeAt(0));
  const srcDoc = await PDFDocument.load(srcBytes, { ignoreEncryption: true });
  const totalPages = srcDoc.getPageCount();

  // Convert the 1-based inclusive range into clamped 0-based indices.
  const start = Math.max(startPage - 1, 0);
  const end = Math.min(endPage, totalPages) - 1;

  // Whole document requested: nothing to cut.
  if (start === 0 && end >= totalPages - 1) {
    return pdfBase64;
  }

  // Without this guard an inverted/NaN range would silently produce a
  // zero-page PDF, and fractional bounds would yield invalid page indices.
  if (!Number.isInteger(start) || !Number.isInteger(end) || end < start) {
    throw new RangeError(
      `Invalid page range ${startPage}-${endPage} for a ${totalPages}-page PDF`
    );
  }

  const newDoc = await PDFDocument.create();
  const indices = Array.from({ length: end - start + 1 }, (_, i) => start + i);
  const pages = await newDoc.copyPages(srcDoc, indices);
  for (const page of pages) {
    newDoc.addPage(page);
  }
  const bytes = await newDoc.save();

  if (hasBuffer) {
    return Buffer.from(bytes).toString("base64");
  }
  // No Buffer: build a binary string byte by byte so btoa() can encode it.
  let binary = "";
  const uint8 = new Uint8Array(bytes);
  for (let i = 0; i < uint8.length; i++) {
    binary += String.fromCharCode(uint8[i]);
  }
  return btoa(binary);
}
|
|
29817
|
+
/**
 * Reports how many pages a base64-encoded PDF contains.
 *
 * @param {string} pdfBase64 - PDF document, base64-encoded.
 * @returns {Promise<number>} The page count reported by the loaded document.
 */
async function getPdfPageCount(pdfBase64) {
  let rawBytes;
  if (typeof Buffer === "undefined") {
    // No Buffer global available: decode via atob() instead.
    rawBytes = Uint8Array.from(atob(pdfBase64), (ch) => ch.charCodeAt(0));
  } else {
    rawBytes = Buffer.from(pdfBase64, "base64");
  }
  const loaded = await PDFDocument.load(rawBytes, { ignoreEncryption: true });
  return loaded.getPageCount();
}
|
|
29822
|
+
/**
 * Lists the form fields of a loaded PDF document as plain descriptors.
 *
 * Each descriptor carries the field `name` and a `type` of "text",
 * "checkbox", "dropdown" or "radio"; dropdown and radio descriptors also
 * carry the field's `options`. Fields of any other class are reported as
 * "text".
 *
 * @param {object} pdfDoc - Loaded document exposing `getForm()`.
 * @returns {Array<{name: string, type: string, options?: string[]}>}
 *   One descriptor per field, in the order the form returns them.
 */
function getAcroFormFields(pdfDoc) {
  const descriptors = [];
  for (const formField of pdfDoc.getForm().getFields()) {
    const name = formField.getName();
    let descriptor;
    if (formField instanceof PDFTextField) {
      descriptor = { name, type: "text" };
    } else if (formField instanceof PDFCheckBox) {
      descriptor = { name, type: "checkbox" };
    } else if (formField instanceof PDFDropdown) {
      descriptor = { name, type: "dropdown", options: formField.getOptions() };
    } else if (formField instanceof PDFRadioGroup) {
      descriptor = { name, type: "radio", options: formField.getOptions() };
    } else {
      // Unrecognized field classes are reported as plain text fields.
      descriptor = { name, type: "text" };
    }
    descriptors.push(descriptor);
  }
  return descriptors;
}
|
|
29843
|
+
/**
 * Fills the form fields of a PDF from a list of name/value mappings,
 * flattens the form, and returns the saved document.
 *
 * Filling is best-effort: a mapping whose field cannot be found, or whose
 * value the field rejects, is skipped and the remaining mappings are still
 * applied.
 *
 * Value handling:
 * - number / boolean / bigint values are converted to strings first, so
 *   `true` checks a checkbox and `42` fills a text field instead of being
 *   silently dropped.
 * - Checkboxes are checked when the trimmed, lower-cased value is one of
 *   "yes", "true", "x", "checked", "on"; any other string unchecks them.
 *   A non-string value (e.g. `undefined`) leaves the checkbox untouched.
 * - All other values are handed to the field unchanged.
 *
 * @param {Uint8Array|ArrayBuffer|string} pdfBytes - PDF input passed to
 *   `PDFDocument.load`.
 * @param {Iterable<{acroFormName: string, value: *}>} mappings - Field
 *   names and the values to put in them.
 * @returns {Promise<Uint8Array>} Result of `pdfDoc.save()` after flattening.
 */
async function fillAcroForm(pdfBytes, mappings) {
  const checkedValues = ["yes", "true", "x", "checked", "on"];
  // Stringify primitive non-string values; leave everything else as given.
  const asText = (raw) =>
    typeof raw === "number" || typeof raw === "boolean" || typeof raw === "bigint"
      ? String(raw)
      : raw;

  const pdfDoc = await PDFDocument.load(pdfBytes, { ignoreEncryption: true });
  const form = pdfDoc.getForm();

  for (const { acroFormName, value } of mappings) {
    const text = asText(value);
    try {
      const field = form.getField(acroFormName);
      if (field instanceof PDFTextField) {
        field.setText(text);
      } else if (field instanceof PDFCheckBox) {
        // No usable value: leave the checkbox in its current state.
        if (typeof text !== "string") continue;
        if (checkedValues.includes(text.trim().toLowerCase())) {
          field.check();
        } else {
          field.uncheck();
        }
      } else if (field instanceof PDFDropdown || field instanceof PDFRadioGroup) {
        field.select(text);
      }
    } catch {
      // Deliberately best-effort: an unknown field name or a value the
      // field rejects must not prevent the remaining fields from filling.
    }
  }

  form.flatten();
  return await pdfDoc.save();
}
|
|
29875
|
+
/**
 * Draws text (or an "X" check mark) onto pages of a PDF and returns the
 * saved document.
 *
 * Overlay coordinates are percentages of the page size: `x` is measured
 * from the left edge and `y` from the top edge. The drawing position is
 * shifted down by one font size from the given top offset. Text is drawn
 * in black Helvetica at `fontSize` (default 10).
 *
 * Overlays whose `page` is not an integer within `[0, pageCount)` are
 * skipped.
 *
 * @param {Uint8Array|ArrayBuffer|string} pdfBytes - PDF input passed to
 *   `PDFDocument.load`.
 * @param {Iterable<{page: number, x: number, y: number, text?: string,
 *   fontSize?: number, isCheckmark?: boolean}>} overlays - Items to draw;
 *   `page` is a 0-based page index.
 * @returns {Promise<Uint8Array>} Result of `pdfDoc.save()`.
 */
async function overlayTextOnPdf(pdfBytes, overlays) {
  const pdfDoc = await PDFDocument.load(pdfBytes, { ignoreEncryption: true });
  const font = await pdfDoc.embedFont(StandardFonts.Helvetica);
  const pageCount = pdfDoc.getPageCount();

  for (const overlay of overlays) {
    const pageIndex = overlay.page;
    // Number.isInteger also rejects undefined/NaN/fractional indices, which
    // slip past plain `<` / `>=` comparisons and would otherwise reach
    // getPage() instead of being skipped like other out-of-range overlays.
    if (!Number.isInteger(pageIndex) || pageIndex < 0 || pageIndex >= pageCount) {
      continue;
    }

    const page = pdfDoc.getPage(pageIndex);
    const { width, height } = page.getSize();
    const fontSize = overlay.fontSize ?? 10;
    const x = (overlay.x / 100) * width;
    // overlay.y is a top-based percentage, so subtract it (and one font
    // size) from the page height to get the y passed to drawText.
    const y = height - (overlay.y / 100) * height - fontSize;
    const text = overlay.isCheckmark ? "X" : overlay.text;

    page.drawText(text, {
      x,
      y,
      size: fontSize,
      font,
      color: rgb(0, 0, 0)
    });
  }

  return await pdfDoc.save();
}
|
|
29906
|
+
|
|
29907
|
+
// src/extraction/pipeline.ts
|
|
29782
29908
|
var SONNET_MODEL = "claude-sonnet-4-6";
|
|
29783
29909
|
var HAIKU_MODEL = "claude-haiku-4-5-20251001";
|
|
29784
29910
|
var DEFAULT_METADATA_PROVIDER_OPTIONS = {
|
|
@@ -29915,8 +30041,10 @@ function getPageChunks(totalPages, chunkSize = 30) {
|
|
|
29915
30041
|
}
|
|
29916
30042
|
return chunks;
|
|
29917
30043
|
}
|
|
29918
|
-
async function callModel(model, pdfBase64, prompt, maxTokens, providerOptions, log, onTokenUsage) {
|
|
29919
|
-
await
|
|
30044
|
+
async function callModel(model, pdfBase64, prompt, maxTokens, providerOptions, log, onTokenUsage, pageRange) {
|
|
30045
|
+
const pdfToSend = pageRange ? await extractPageRange(pdfBase64, pageRange[0], pageRange[1]) : pdfBase64;
|
|
30046
|
+
const rangeLabel = pageRange ? ` [pages ${pageRange[0]}\u2013${pageRange[1]}]` : "";
|
|
30047
|
+
await log?.(`Calling model (max ${maxTokens} tokens)${rangeLabel}...`);
|
|
29920
30048
|
const start = Date.now();
|
|
29921
30049
|
const { text, usage } = await withRetry(
|
|
29922
30050
|
() => generateText({
|
|
@@ -29925,7 +30053,7 @@ async function callModel(model, pdfBase64, prompt, maxTokens, providerOptions, l
|
|
|
29925
30053
|
messages: [{
|
|
29926
30054
|
role: "user",
|
|
29927
30055
|
content: [
|
|
29928
|
-
{ type: "file", data:
|
|
30056
|
+
{ type: "file", data: pdfToSend, mediaType: "application/pdf" },
|
|
29929
30057
|
{ type: "text", text: prompt }
|
|
29930
30058
|
]
|
|
29931
30059
|
}],
|
|
@@ -30031,7 +30159,9 @@ async function classifyDocumentType(pdfBase64, options) {
|
|
|
30031
30159
|
MODEL_TOKEN_LIMITS.classification,
|
|
30032
30160
|
void 0,
|
|
30033
30161
|
log,
|
|
30034
|
-
onTokenUsage
|
|
30162
|
+
onTokenUsage,
|
|
30163
|
+
[1, 3]
|
|
30164
|
+
// Only need first 3 pages for classification
|
|
30035
30165
|
);
|
|
30036
30166
|
try {
|
|
30037
30167
|
const parsed = JSON.parse(stripFences(raw));
|
|
@@ -30120,7 +30250,9 @@ async function extractChunkWithRetry(models, pdfBase64, start, end, sizeIndex, p
|
|
|
30120
30250
|
MODEL_TOKEN_LIMITS.sections,
|
|
30121
30251
|
void 0,
|
|
30122
30252
|
log,
|
|
30123
|
-
onTokenUsage
|
|
30253
|
+
onTokenUsage,
|
|
30254
|
+
[start, end]
|
|
30255
|
+
// Only send this chunk's pages
|
|
30124
30256
|
);
|
|
30125
30257
|
try {
|
|
30126
30258
|
return [JSON.parse(stripFences(chunkRaw))];
|
|
@@ -30162,7 +30294,9 @@ async function extractChunkWithRetry(models, pdfBase64, start, end, sizeIndex, p
|
|
|
30162
30294
|
MODEL_TOKEN_LIMITS.sectionsFallback,
|
|
30163
30295
|
fallbackProviderOptions,
|
|
30164
30296
|
log,
|
|
30165
|
-
onTokenUsage
|
|
30297
|
+
onTokenUsage,
|
|
30298
|
+
[start, end]
|
|
30299
|
+
// Only send this chunk's pages
|
|
30166
30300
|
);
|
|
30167
30301
|
try {
|
|
30168
30302
|
return [JSON.parse(stripFences(fallbackRaw))];
|
|
@@ -30205,7 +30339,9 @@ async function extractFromPdf(pdfBase64, options) {
|
|
|
30205
30339
|
onTokenUsage
|
|
30206
30340
|
} = options ?? {};
|
|
30207
30341
|
const resolved = resolveModels(models);
|
|
30342
|
+
const actualPageCount = await getPdfPageCount(pdfBase64);
|
|
30208
30343
|
await log?.("Pass 1: Extracting metadata...");
|
|
30344
|
+
const metadataPageRange = [1, Math.min(10, actualPageCount)];
|
|
30209
30345
|
const metadataRaw = await callModel(
|
|
30210
30346
|
resolved.metadata,
|
|
30211
30347
|
pdfBase64,
|
|
@@ -30213,7 +30349,8 @@ async function extractFromPdf(pdfBase64, options) {
|
|
|
30213
30349
|
MODEL_TOKEN_LIMITS.metadata,
|
|
30214
30350
|
metadataProviderOptions,
|
|
30215
30351
|
log,
|
|
30216
|
-
onTokenUsage
|
|
30352
|
+
onTokenUsage,
|
|
30353
|
+
metadataPageRange
|
|
30217
30354
|
);
|
|
30218
30355
|
let metadataResult;
|
|
30219
30356
|
try {
|
|
@@ -30224,7 +30361,7 @@ async function extractFromPdf(pdfBase64, options) {
|
|
|
30224
30361
|
throw new Error(`Metadata JSON parse failed: ${e.message}`);
|
|
30225
30362
|
}
|
|
30226
30363
|
await onMetadata?.(metadataRaw);
|
|
30227
|
-
const pageCount =
|
|
30364
|
+
const pageCount = actualPageCount;
|
|
30228
30365
|
await log?.(`Document: ${pageCount} page(s)`);
|
|
30229
30366
|
const sectionChunks = await extractSectionChunks(
|
|
30230
30367
|
resolved,
|
|
@@ -30292,7 +30429,9 @@ async function extractQuoteFromPdf(pdfBase64, options) {
|
|
|
30292
30429
|
onTokenUsage
|
|
30293
30430
|
} = options ?? {};
|
|
30294
30431
|
const resolved = resolveModels(models);
|
|
30432
|
+
const actualPageCount = await getPdfPageCount(pdfBase64);
|
|
30295
30433
|
await log?.("Pass 1: Extracting quote metadata...");
|
|
30434
|
+
const metadataPageRange = [1, Math.min(10, actualPageCount)];
|
|
30296
30435
|
const metadataRaw = await callModel(
|
|
30297
30436
|
resolved.metadata,
|
|
30298
30437
|
pdfBase64,
|
|
@@ -30300,7 +30439,8 @@ async function extractQuoteFromPdf(pdfBase64, options) {
|
|
|
30300
30439
|
MODEL_TOKEN_LIMITS.metadata,
|
|
30301
30440
|
metadataProviderOptions,
|
|
30302
30441
|
log,
|
|
30303
|
-
onTokenUsage
|
|
30442
|
+
onTokenUsage,
|
|
30443
|
+
metadataPageRange
|
|
30304
30444
|
);
|
|
30305
30445
|
let metadataResult;
|
|
30306
30446
|
try {
|
|
@@ -30311,7 +30451,7 @@ async function extractQuoteFromPdf(pdfBase64, options) {
|
|
|
30311
30451
|
throw new Error(`Quote metadata JSON parse failed: ${e.message}`);
|
|
30312
30452
|
}
|
|
30313
30453
|
await onMetadata?.(metadataRaw);
|
|
30314
|
-
const pageCount =
|
|
30454
|
+
const pageCount = actualPageCount;
|
|
30315
30455
|
await log?.(`Quote document: ${pageCount} page(s)`);
|
|
30316
30456
|
const sectionChunks = await extractSectionChunks(
|
|
30317
30457
|
resolved,
|
|
@@ -30328,101 +30468,6 @@ async function extractQuoteFromPdf(pdfBase64, options) {
|
|
|
30328
30468
|
const mergedRaw = JSON.stringify(merged);
|
|
30329
30469
|
return { rawText: mergedRaw, extracted: merged };
|
|
30330
30470
|
}
|
|
30331
|
-
|
|
30332
|
-
// src/extraction/pdf.ts
|
|
30333
|
-
import {
|
|
30334
|
-
PDFDocument,
|
|
30335
|
-
PDFTextField,
|
|
30336
|
-
PDFCheckBox,
|
|
30337
|
-
PDFDropdown,
|
|
30338
|
-
PDFRadioGroup,
|
|
30339
|
-
StandardFonts,
|
|
30340
|
-
rgb
|
|
30341
|
-
} from "pdf-lib";
|
|
30342
|
-
function getAcroFormFields(pdfDoc) {
|
|
30343
|
-
const form = pdfDoc.getForm();
|
|
30344
|
-
const fields = form.getFields();
|
|
30345
|
-
if (fields.length === 0) return [];
|
|
30346
|
-
return fields.map((field) => {
|
|
30347
|
-
const name = field.getName();
|
|
30348
|
-
if (field instanceof PDFTextField) {
|
|
30349
|
-
return { name, type: "text" };
|
|
30350
|
-
}
|
|
30351
|
-
if (field instanceof PDFCheckBox) {
|
|
30352
|
-
return { name, type: "checkbox" };
|
|
30353
|
-
}
|
|
30354
|
-
if (field instanceof PDFDropdown) {
|
|
30355
|
-
return { name, type: "dropdown", options: field.getOptions() };
|
|
30356
|
-
}
|
|
30357
|
-
if (field instanceof PDFRadioGroup) {
|
|
30358
|
-
return { name, type: "radio", options: field.getOptions() };
|
|
30359
|
-
}
|
|
30360
|
-
return { name, type: "text" };
|
|
30361
|
-
});
|
|
30362
|
-
}
|
|
30363
|
-
async function fillAcroForm(pdfBytes, mappings) {
|
|
30364
|
-
const pdfDoc = await PDFDocument.load(pdfBytes, { ignoreEncryption: true });
|
|
30365
|
-
const form = pdfDoc.getForm();
|
|
30366
|
-
for (const { acroFormName, value } of mappings) {
|
|
30367
|
-
try {
|
|
30368
|
-
const field = form.getField(acroFormName);
|
|
30369
|
-
if (field instanceof PDFTextField) {
|
|
30370
|
-
field.setText(value);
|
|
30371
|
-
} else if (field instanceof PDFCheckBox) {
|
|
30372
|
-
const lower = value.toLowerCase();
|
|
30373
|
-
if (["yes", "true", "x", "checked", "on"].includes(lower)) {
|
|
30374
|
-
field.check();
|
|
30375
|
-
} else {
|
|
30376
|
-
field.uncheck();
|
|
30377
|
-
}
|
|
30378
|
-
} else if (field instanceof PDFDropdown) {
|
|
30379
|
-
try {
|
|
30380
|
-
field.select(value);
|
|
30381
|
-
} catch {
|
|
30382
|
-
}
|
|
30383
|
-
} else if (field instanceof PDFRadioGroup) {
|
|
30384
|
-
try {
|
|
30385
|
-
field.select(value);
|
|
30386
|
-
} catch {
|
|
30387
|
-
}
|
|
30388
|
-
}
|
|
30389
|
-
} catch {
|
|
30390
|
-
}
|
|
30391
|
-
}
|
|
30392
|
-
form.flatten();
|
|
30393
|
-
return await pdfDoc.save();
|
|
30394
|
-
}
|
|
30395
|
-
async function overlayTextOnPdf(pdfBytes, overlays) {
|
|
30396
|
-
const pdfDoc = await PDFDocument.load(pdfBytes, { ignoreEncryption: true });
|
|
30397
|
-
const font = await pdfDoc.embedFont(StandardFonts.Helvetica);
|
|
30398
|
-
const pageCount = pdfDoc.getPageCount();
|
|
30399
|
-
for (const overlay of overlays) {
|
|
30400
|
-
if (overlay.page < 0 || overlay.page >= pageCount) continue;
|
|
30401
|
-
const page = pdfDoc.getPage(overlay.page);
|
|
30402
|
-
const { width, height } = page.getSize();
|
|
30403
|
-
const fontSize = overlay.fontSize ?? 10;
|
|
30404
|
-
const x = overlay.x / 100 * width;
|
|
30405
|
-
const y = height - overlay.y / 100 * height - fontSize;
|
|
30406
|
-
if (overlay.isCheckmark) {
|
|
30407
|
-
page.drawText("X", {
|
|
30408
|
-
x,
|
|
30409
|
-
y,
|
|
30410
|
-
size: fontSize,
|
|
30411
|
-
font,
|
|
30412
|
-
color: rgb(0, 0, 0)
|
|
30413
|
-
});
|
|
30414
|
-
} else {
|
|
30415
|
-
page.drawText(overlay.text, {
|
|
30416
|
-
x,
|
|
30417
|
-
y,
|
|
30418
|
-
size: fontSize,
|
|
30419
|
-
font,
|
|
30420
|
-
color: rgb(0, 0, 0)
|
|
30421
|
-
});
|
|
30422
|
-
}
|
|
30423
|
-
}
|
|
30424
|
-
return await pdfDoc.save();
|
|
30425
|
-
}
|
|
30426
30471
|
export {
|
|
30427
30472
|
AGENT_TOOLS,
|
|
30428
30473
|
APPLICATION_CLASSIFY_PROMPT,
|
|
@@ -30474,11 +30519,13 @@ export {
|
|
|
30474
30519
|
createUniformModelConfig,
|
|
30475
30520
|
enrichSupplementaryFields,
|
|
30476
30521
|
extractFromPdf,
|
|
30522
|
+
extractPageRange,
|
|
30477
30523
|
extractQuoteFromPdf,
|
|
30478
30524
|
extractSectionsOnly,
|
|
30479
30525
|
fillAcroForm,
|
|
30480
30526
|
getAcroFormFields,
|
|
30481
30527
|
getPageChunks,
|
|
30528
|
+
getPdfPageCount,
|
|
30482
30529
|
mergeChunkedQuoteSections,
|
|
30483
30530
|
mergeChunkedSections,
|
|
30484
30531
|
overlayTextOnPdf,
|