@claritylabs/cl-sdk 0.15.0 → 0.16.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +64 -9
- package/dist/index.d.ts +64 -9
- package/dist/index.js +161 -32
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +146 -32
- package/dist/index.mjs.map +1 -1
- package/package.json +1 -1
package/dist/index.mjs
CHANGED
|
@@ -1423,34 +1423,134 @@ import {
|
|
|
1423
1423
|
StandardFonts,
|
|
1424
1424
|
rgb
|
|
1425
1425
|
} from "pdf-lib";
|
|
1426
|
-
|
|
1427
|
-
|
|
1426
|
+
/**
 * Tells whether `input` is a file-ID reference: a non-null object that
 * exposes a `fileId` property (checked with the `in` operator, so inherited
 * properties count as well).
 *
 * @param {unknown} input - Value to inspect.
 * @returns {boolean} `true` when `input` is an object carrying `fileId`.
 */
function isFileIdRef(input) {
  // `typeof null` is "object", so null has to be excluded explicitly before
  // the `in` operator is applied (it throws on non-objects).
  if (input === null || typeof input !== "object") {
    return false;
  }
  return "fileId" in input;
}
|
|
1429
|
+
/**
 * Tells whether `input` is a `URL` instance.
 *
 * @param {unknown} input - Value to inspect.
 * @returns {boolean} `true` only for objects created from the global `URL` class.
 */
function isUrl(input) {
  const isUrlInstance = input instanceof URL;
  return isUrlInstance;
}
|
|
1432
|
+
/**
 * Tells whether `input` is a `Uint8Array` (subclasses of `Uint8Array`
 * satisfy the `instanceof` check too).
 *
 * @param {unknown} input - Value to inspect.
 * @returns {boolean} `true` when `input` is a `Uint8Array` instance.
 */
function isBytes(input) {
  const isByteArray = input instanceof Uint8Array;
  return isByteArray;
}
|
|
1435
|
+
/**
 * Resolves a PDF input into its raw bytes.
 *
 * Handling by input kind:
 * - file-ID reference: rejected, because there are no bytes to read locally;
 * - `file:` URL: read from disk (Node.js only);
 * - any other URL: downloaded with `fetch`;
 * - `Uint8Array`: returned as-is (same instance, no copy);
 * - anything else: decoded as a base64 string.
 *
 * @param {object|URL|Uint8Array|string} input - PDF source.
 * @returns {Promise<Uint8Array>} The PDF bytes.
 * @throws {Error} For file-ID references, for `file:` URLs outside Node.js,
 *   and when a fetch responds with a non-OK status.
 */
async function pdfInputToBytes(input) {
  if (isFileIdRef(input)) {
    throw new Error(
      "Cannot convert fileId reference to bytes. Pass the fileId directly to your provider callback instead."
    );
  }
  if (isUrl(input)) {
    if (input.protocol === "file:") {
      if (typeof process !== "undefined" && process.versions?.node) {
        const fs = await import("fs/promises");
        // Hand the URL object itself to fs: Node decodes percent-escapes
        // (e.g. "%20") and converts Windows drive paths correctly, whereas
        // the raw `pathname` stays encoded and would not match the file on disk.
        const buffer = await fs.readFile(input);
        return new Uint8Array(buffer);
      }
      throw new Error("File URLs not supported in browser environment");
    }
    const response = await fetch(input.toString());
    if (!response.ok) {
      throw new Error(`Failed to fetch PDF: ${response.status} ${response.statusText}`);
    }
    const arrayBuffer = await response.arrayBuffer();
    return new Uint8Array(arrayBuffer);
  }
  if (isBytes(input)) {
    return input;
  }
  // Remaining inputs are treated as base64 text; prefer Buffer when it exists
  // and fall back to atob otherwise.
  if (typeof Buffer !== "undefined") {
    return new Uint8Array(Buffer.from(input, "base64"));
  }
  return Uint8Array.from(atob(input), (c) => c.charCodeAt(0));
}
|
|
1465
|
+
/**
 * Resolves a PDF input into a base64 string.
 *
 * URLs are first loaded into bytes via `pdfInputToBytes` and then encoded;
 * `Uint8Array` inputs are encoded directly; every other input is returned
 * unchanged (it is taken to already be base64 text).
 *
 * @param {object|URL|Uint8Array|string} input - PDF source.
 * @returns {Promise<string>} Base64 representation of the PDF.
 * @throws {Error} When `input` is a file-ID reference.
 */
async function pdfInputToBase64(input) {
  if (isFileIdRef(input)) {
    throw new Error(
      "Cannot convert fileId reference to base64. Pass the fileId directly to your provider callback instead."
    );
  }

  if (isUrl(input)) {
    return bytesToBase64(await pdfInputToBytes(input));
  }

  return isBytes(input) ? bytesToBase64(input) : input;
}
|
|
1480
|
+
/**
 * Encodes a byte array as a base64 string.
 *
 * Uses `Buffer` when that global exists; otherwise builds a binary string
 * one character per byte and encodes it with `btoa`.
 *
 * @param {Uint8Array} bytes - Bytes to encode.
 * @returns {string} Base64 text.
 */
function bytesToBase64(bytes) {
  const hasBuffer = typeof Buffer !== "undefined";
  if (hasBuffer) {
    return Buffer.from(bytes).toString("base64");
  }

  // btoa only accepts a "binary string", so map every byte to the character
  // with the same code unit before encoding.
  const chars = [];
  for (let idx = 0; idx < bytes.length; idx += 1) {
    chars.push(String.fromCharCode(bytes[idx]));
  }
  return btoa(chars.join(""));
}
|
|
1490
|
+
/**
 * Tells whether `input` refers to a file indirectly, i.e. it is either a
 * file-ID reference or a `URL`, rather than carrying the PDF content itself.
 *
 * @param {unknown} input - Value to inspect.
 * @returns {boolean} `true` for file-ID references and URLs.
 */
function isFileReference(input) {
  if (isFileIdRef(input)) {
    return true;
  }
  return isUrl(input);
}
|
|
1493
|
+
/**
 * Produces a small identifier object for reference-style inputs.
 *
 * @param {unknown} input - PDF source.
 * @returns {{fileId: *}|{url: string}|undefined} `{ fileId }` for file-ID
 *   references, `{ url }` (the URL serialized with `toString()`) for URLs,
 *   and `undefined` for every other input.
 */
function getFileIdentifier(input) {
  if (isFileIdRef(input)) {
    const { fileId } = input;
    return { fileId };
  }
  return isUrl(input) ? { url: input.toString() } : undefined;
}
|
|
1502
|
+
/**
 * Counts the pages of a PDF.
 *
 * The input is resolved to bytes with `pdfInputToBytes` and parsed with
 * `PDFDocument.load`, passing `ignoreEncryption: true`.
 *
 * @param {object|URL|Uint8Array|string} input - PDF source.
 * @returns {Promise<number>} Page count reported by the loaded document.
 */
async function getPdfPageCount(input) {
  const pdfBytes = await pdfInputToBytes(input);
  const loadOptions = { ignoreEncryption: true };
  const pdfDocument = await PDFDocument.load(pdfBytes, loadOptions);
  return pdfDocument.getPageCount();
}
|
|
1507
|
+
/**
 * Extracts a page range from a PDF and returns the result as base64.
 *
 * `startPage` and `endPage` are 1-based and inclusive; they are clamped to
 * the first and last page of the document. When the clamped range covers the
 * whole document no new PDF is built: string inputs are returned unchanged
 * and byte inputs are simply base64-encoded.
 *
 * @param {object|URL|Uint8Array|string} input - PDF source. File-ID
 *   references and `http:`/`https:` URLs are rejected.
 * @param {number} startPage - First page to keep (1-based).
 * @param {number} endPage - Last page to keep (1-based, inclusive).
 * @returns {Promise<string>} Base64 of the full or extracted PDF.
 * @throws {Error} For file-ID references and remote URLs.
 */
async function extractPageRange(input, startPage, endPage) {
  if (isFileIdRef(input)) {
    throw new Error(
      "Cannot extract page range from fileId reference. The provider must handle fileId inputs directly or you must pass the full PDF as base64/bytes."
    );
  }

  const isRemoteUrl = isUrl(input) && (input.protocol === "http:" || input.protocol === "https:");
  if (isRemoteUrl) {
    throw new Error(
      "Cannot extract page range from remote URL. Either pass the full PDF as base64/bytes, or download it first."
    );
  }

  const sourceBytes = await pdfInputToBytes(input);
  const sourceDoc = await PDFDocument.load(sourceBytes, { ignoreEncryption: true });
  const pageTotal = sourceDoc.getPageCount();

  // Convert the 1-based inclusive range into clamped 0-based indices.
  const firstIndex = Math.max(startPage - 1, 0);
  const lastIndex = Math.min(endPage, pageTotal) - 1;

  const coversWholeDocument = firstIndex === 0 && lastIndex >= pageTotal - 1;
  if (coversWholeDocument) {
    // Nothing to cut: hand back the original content without re-serializing.
    if (isBytes(input)) {
      return bytesToBase64(input);
    }
    return typeof input === "string" ? input : bytesToBase64(sourceBytes);
  }

  const targetDoc = await PDFDocument.create();
  const wantedIndices = Array.from(
    { length: lastIndex - firstIndex + 1 },
    (_, offset) => firstIndex + offset
  );
  const copiedPages = await targetDoc.copyPages(sourceDoc, wantedIndices);
  for (const copiedPage of copiedPages) {
    targetDoc.addPage(copiedPage);
  }
  const savedBytes = await targetDoc.save();
  return bytesToBase64(new Uint8Array(savedBytes));
}
|
|
1539
|
+
/**
 * Builds provider options describing how the PDF is supplied.
 *
 * Starts from a shallow copy of `existingOptions` (the argument itself is
 * never mutated) and adds exactly one PDF descriptor:
 * - file-ID reference: `fileId`, plus `fileMimeType` when `input.mimeType` is truthy;
 * - URL: `pdfUrl`, set to the `URL` object itself;
 * - anything else: `pdfBase64`, obtained through `pdfInputToBase64`.
 *
 * @param {object|URL|Uint8Array|string} input - PDF source.
 * @param {object} [existingOptions] - Options to extend.
 * @returns {Promise<object>} New options object including the PDF descriptor.
 */
async function buildPdfProviderOptions(input, existingOptions) {
  const merged = { ...existingOptions };

  if (isFileIdRef(input)) {
    merged.fileId = input.fileId;
    if (input.mimeType) {
      merged.fileMimeType = input.mimeType;
    }
  } else if (isUrl(input)) {
    merged.pdfUrl = input;
  } else {
    merged.pdfBase64 = await pdfInputToBase64(input);
  }

  return merged;
}
|
|
1455
1555
|
function getAcroFormFields(pdfDoc) {
|
|
1456
1556
|
const form = pdfDoc.getForm();
|
|
@@ -1543,7 +1643,7 @@ async function runExtractor(params) {
|
|
|
1543
1643
|
name,
|
|
1544
1644
|
prompt,
|
|
1545
1645
|
schema,
|
|
1546
|
-
|
|
1646
|
+
pdfInput,
|
|
1547
1647
|
startPage,
|
|
1548
1648
|
endPage,
|
|
1549
1649
|
generateObject,
|
|
@@ -1553,6 +1653,7 @@ async function runExtractor(params) {
|
|
|
1553
1653
|
} = params;
|
|
1554
1654
|
const extractorProviderOptions = { ...providerOptions };
|
|
1555
1655
|
let fullPrompt;
|
|
1656
|
+
const pdfBase64 = await pdfInputToBase64(pdfInput);
|
|
1556
1657
|
if (convertPdfToImages) {
|
|
1557
1658
|
const images = await convertPdfToImages(pdfBase64, startPage, endPage);
|
|
1558
1659
|
extractorProviderOptions.images = images;
|
|
@@ -5005,7 +5106,7 @@ async function findReferencedPages(params) {
|
|
|
5005
5106
|
referenceTarget,
|
|
5006
5107
|
sections,
|
|
5007
5108
|
formInventory,
|
|
5008
|
-
|
|
5109
|
+
pdfInput,
|
|
5009
5110
|
pageCount,
|
|
5010
5111
|
generateObject,
|
|
5011
5112
|
providerOptions,
|
|
@@ -5045,7 +5146,7 @@ If you cannot find the section, return startPage: 0 and endPage: 0.
|
|
|
5045
5146
|
Return JSON only.`,
|
|
5046
5147
|
schema: PageLocationSchema,
|
|
5047
5148
|
maxTokens: 256,
|
|
5048
|
-
providerOptions:
|
|
5149
|
+
providerOptions: await buildPdfProviderOptions(pdfInput, providerOptions)
|
|
5049
5150
|
},
|
|
5050
5151
|
{
|
|
5051
5152
|
fallback: { startPage: 0, endPage: 0 },
|
|
@@ -5072,7 +5173,7 @@ Return JSON only.`,
|
|
|
5072
5173
|
async function resolveReferentialCoverages(params) {
|
|
5073
5174
|
const {
|
|
5074
5175
|
memory,
|
|
5075
|
-
|
|
5176
|
+
pdfInput,
|
|
5076
5177
|
pageCount,
|
|
5077
5178
|
generateObject,
|
|
5078
5179
|
convertPdfToImages,
|
|
@@ -5133,7 +5234,7 @@ async function resolveReferentialCoverages(params) {
|
|
|
5133
5234
|
referenceTarget: target,
|
|
5134
5235
|
sections,
|
|
5135
5236
|
formInventory,
|
|
5136
|
-
|
|
5237
|
+
pdfInput,
|
|
5137
5238
|
pageCount,
|
|
5138
5239
|
generateObject,
|
|
5139
5240
|
providerOptions,
|
|
@@ -5167,7 +5268,7 @@ async function resolveReferentialCoverages(params) {
|
|
|
5167
5268
|
name: "referential_lookup",
|
|
5168
5269
|
prompt: buildReferentialLookupPrompt(promptCoverages),
|
|
5169
5270
|
schema: ReferentialLookupSchema,
|
|
5170
|
-
|
|
5271
|
+
pdfInput,
|
|
5171
5272
|
startPage: pageRange.startPage,
|
|
5172
5273
|
endPage: pageRange.endPage,
|
|
5173
5274
|
generateObject,
|
|
@@ -5789,7 +5890,7 @@ function createExtractor(config) {
|
|
|
5789
5890
|
}))
|
|
5790
5891
|
};
|
|
5791
5892
|
}
|
|
5792
|
-
async function extract(
|
|
5893
|
+
async function extract(pdfInput, documentId, options) {
|
|
5793
5894
|
const id = documentId ?? `doc-${Date.now()}`;
|
|
5794
5895
|
const memory = /* @__PURE__ */ new Map();
|
|
5795
5896
|
totalUsage = { inputTokens: 0, outputTokens: 0 };
|
|
@@ -5807,20 +5908,27 @@ function createExtractor(config) {
|
|
|
5807
5908
|
memory.set(k, v);
|
|
5808
5909
|
}
|
|
5809
5910
|
}
|
|
5911
|
+
let pdfBase64Cache;
|
|
5912
|
+
async function getPdfBase64ForExtraction() {
|
|
5913
|
+
if (pdfBase64Cache === void 0) {
|
|
5914
|
+
pdfBase64Cache = await pdfInputToBase64(pdfInput);
|
|
5915
|
+
}
|
|
5916
|
+
return pdfBase64Cache;
|
|
5917
|
+
}
|
|
5810
5918
|
let classifyResult;
|
|
5811
5919
|
if (resumed?.classifyResult && pipelineCtx.isPhaseComplete("classify")) {
|
|
5812
5920
|
classifyResult = resumed.classifyResult;
|
|
5813
5921
|
onProgress?.("Resuming from checkpoint (classify complete)...");
|
|
5814
5922
|
} else {
|
|
5815
5923
|
onProgress?.("Classifying document...");
|
|
5816
|
-
const pageCount2 = await getPdfPageCount(
|
|
5924
|
+
const pageCount2 = await getPdfPageCount(pdfInput);
|
|
5817
5925
|
const classifyResponse = await safeGenerateObject(
|
|
5818
5926
|
generateObject,
|
|
5819
5927
|
{
|
|
5820
5928
|
prompt: buildClassifyPrompt(),
|
|
5821
5929
|
schema: ClassifyResultSchema,
|
|
5822
5930
|
maxTokens: 512,
|
|
5823
|
-
providerOptions:
|
|
5931
|
+
providerOptions: await buildPdfProviderOptions(pdfInput, providerOptions)
|
|
5824
5932
|
},
|
|
5825
5933
|
{
|
|
5826
5934
|
fallback: { documentType: "policy", policyTypes: ["other"], confidence: 0 },
|
|
@@ -5845,7 +5953,7 @@ function createExtractor(config) {
|
|
|
5845
5953
|
const { documentType, policyTypes } = classifyResult;
|
|
5846
5954
|
const primaryType = policyTypes[0] ?? "other";
|
|
5847
5955
|
const template = getTemplate(primaryType);
|
|
5848
|
-
const pageCount = resumed?.pageCount ?? await getPdfPageCount(
|
|
5956
|
+
const pageCount = resumed?.pageCount ?? await getPdfPageCount(pdfInput);
|
|
5849
5957
|
const templateHints = buildTemplateHints(primaryType, documentType, pageCount, template);
|
|
5850
5958
|
let formInventory;
|
|
5851
5959
|
if (resumed?.formInventory && pipelineCtx.isPhaseComplete("form_inventory")) {
|
|
@@ -5860,7 +5968,7 @@ function createExtractor(config) {
|
|
|
5860
5968
|
prompt: buildFormInventoryPrompt(templateHints),
|
|
5861
5969
|
schema: FormInventorySchema,
|
|
5862
5970
|
maxTokens: 2048,
|
|
5863
|
-
providerOptions:
|
|
5971
|
+
providerOptions: await buildPdfProviderOptions(pdfInput, providerOptions)
|
|
5864
5972
|
},
|
|
5865
5973
|
{
|
|
5866
5974
|
fallback: { forms: [] },
|
|
@@ -5888,9 +5996,10 @@ function createExtractor(config) {
|
|
|
5888
5996
|
const chunkSize = 8;
|
|
5889
5997
|
const collectedAssignments = [];
|
|
5890
5998
|
const formInventoryHint = formInventory?.forms.length ? formatFormInventoryForPageMap(formInventory.forms) : void 0;
|
|
5999
|
+
const extractionBase64 = await getPdfBase64ForExtraction();
|
|
5891
6000
|
for (let startPage = 1; startPage <= pageCount; startPage += chunkSize) {
|
|
5892
6001
|
const endPage = Math.min(pageCount, startPage + chunkSize - 1);
|
|
5893
|
-
const pagesPdf = await extractPageRange(
|
|
6002
|
+
const pagesPdf = await extractPageRange(extractionBase64, startPage, endPage);
|
|
5894
6003
|
const mapResponse = await safeGenerateObject(
|
|
5895
6004
|
generateObject,
|
|
5896
6005
|
{
|
|
@@ -5970,7 +6079,7 @@ function createExtractor(config) {
|
|
|
5970
6079
|
name: task.extractorName,
|
|
5971
6080
|
prompt: ext.buildPrompt(),
|
|
5972
6081
|
schema: ext.schema,
|
|
5973
|
-
|
|
6082
|
+
pdfInput,
|
|
5974
6083
|
startPage: task.startPage,
|
|
5975
6084
|
endPage: task.endPage,
|
|
5976
6085
|
generateObject,
|
|
@@ -6000,7 +6109,7 @@ function createExtractor(config) {
|
|
|
6000
6109
|
name: "supplementary",
|
|
6001
6110
|
prompt: buildSupplementaryPrompt(alreadyExtractedSummary),
|
|
6002
6111
|
schema: SupplementarySchema,
|
|
6003
|
-
|
|
6112
|
+
pdfInput,
|
|
6004
6113
|
startPage: 1,
|
|
6005
6114
|
endPage: pageCount,
|
|
6006
6115
|
generateObject,
|
|
@@ -6029,7 +6138,7 @@ function createExtractor(config) {
|
|
|
6029
6138
|
try {
|
|
6030
6139
|
const resolution = await resolveReferentialCoverages({
|
|
6031
6140
|
memory,
|
|
6032
|
-
|
|
6141
|
+
pdfInput,
|
|
6033
6142
|
pageCount,
|
|
6034
6143
|
generateObject,
|
|
6035
6144
|
convertPdfToImages,
|
|
@@ -6069,7 +6178,7 @@ function createExtractor(config) {
|
|
|
6069
6178
|
prompt: buildReviewPrompt(template.required, extractedKeys, extractionSummary, pageMapSummary),
|
|
6070
6179
|
schema: ReviewResultSchema,
|
|
6071
6180
|
maxTokens: 1536,
|
|
6072
|
-
providerOptions:
|
|
6181
|
+
providerOptions: await buildPdfProviderOptions(pdfInput, providerOptions)
|
|
6073
6182
|
},
|
|
6074
6183
|
{
|
|
6075
6184
|
fallback: { complete: true, missingFields: [], qualityIssues: [], additionalTasks: [] },
|
|
@@ -6097,7 +6206,7 @@ function createExtractor(config) {
|
|
|
6097
6206
|
name: task.extractorName,
|
|
6098
6207
|
prompt: ext.buildPrompt(),
|
|
6099
6208
|
schema: ext.schema,
|
|
6100
|
-
|
|
6209
|
+
pdfInput,
|
|
6101
6210
|
startPage: task.startPage,
|
|
6102
6211
|
endPage: task.endPage,
|
|
6103
6212
|
generateObject,
|
|
@@ -8942,6 +9051,7 @@ export {
|
|
|
8942
9051
|
buildIntentPrompt,
|
|
8943
9052
|
buildInterpretAttachmentPrompt,
|
|
8944
9053
|
buildLookupFillPrompt,
|
|
9054
|
+
buildPdfProviderOptions,
|
|
8945
9055
|
buildQueryClassifyPrompt,
|
|
8946
9056
|
buildQuestionBatchPrompt,
|
|
8947
9057
|
buildQuotesPoliciesPrompt,
|
|
@@ -8959,10 +9069,14 @@ export {
|
|
|
8959
9069
|
fillAcroForm,
|
|
8960
9070
|
getAcroFormFields,
|
|
8961
9071
|
getExtractor,
|
|
9072
|
+
getFileIdentifier,
|
|
8962
9073
|
getPdfPageCount,
|
|
8963
9074
|
getTemplate,
|
|
9075
|
+
isFileReference,
|
|
8964
9076
|
overlayTextOnPdf,
|
|
8965
9077
|
pLimit,
|
|
9078
|
+
pdfInputToBase64,
|
|
9079
|
+
pdfInputToBytes,
|
|
8966
9080
|
safeGenerateObject,
|
|
8967
9081
|
sanitizeNulls,
|
|
8968
9082
|
stripFences,
|