@claritylabs/cl-sdk 0.15.0 → 0.16.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs CHANGED
@@ -1423,34 +1423,134 @@ import {
1423
1423
  StandardFonts,
1424
1424
  rgb
1425
1425
  } from "pdf-lib";
1426
- async function extractPageRange(pdfBase64, startPage, endPage) {
1427
- const srcBytes = typeof Buffer !== "undefined" ? Buffer.from(pdfBase64, "base64") : Uint8Array.from(atob(pdfBase64), (c) => c.charCodeAt(0));
1426
+ function isFileIdRef(input) {
1427
+ return typeof input === "object" && input !== null && "fileId" in input;
1428
+ }
1429
+ function isUrl(input) {
1430
+ return input instanceof URL;
1431
+ }
1432
+ function isBytes(input) {
1433
+ return input instanceof Uint8Array;
1434
+ }
1435
+ async function pdfInputToBytes(input) {
1436
+ if (isFileIdRef(input)) {
1437
+ throw new Error(
1438
+ "Cannot convert fileId reference to bytes. Pass the fileId directly to your provider callback instead."
1439
+ );
1440
+ }
1441
+ if (isUrl(input)) {
1442
+ if (input.protocol === "file:") {
1443
+ if (typeof process !== "undefined" && process.versions?.node) {
1444
+ const fs = await import("fs/promises");
1445
+ const buffer = await fs.readFile(input.pathname);
1446
+ return new Uint8Array(buffer);
1447
+ }
1448
+ throw new Error("File URLs not supported in browser environment");
1449
+ }
1450
+ const response = await fetch(input.toString());
1451
+ if (!response.ok) {
1452
+ throw new Error(`Failed to fetch PDF: ${response.status} ${response.statusText}`);
1453
+ }
1454
+ const arrayBuffer = await response.arrayBuffer();
1455
+ return new Uint8Array(arrayBuffer);
1456
+ }
1457
+ if (isBytes(input)) {
1458
+ return input;
1459
+ }
1460
+ if (typeof Buffer !== "undefined") {
1461
+ return new Uint8Array(Buffer.from(input, "base64"));
1462
+ }
1463
+ return Uint8Array.from(atob(input), (c) => c.charCodeAt(0));
1464
+ }
1465
+ async function pdfInputToBase64(input) {
1466
+ if (isFileIdRef(input)) {
1467
+ throw new Error(
1468
+ "Cannot convert fileId reference to base64. Pass the fileId directly to your provider callback instead."
1469
+ );
1470
+ }
1471
+ if (isUrl(input)) {
1472
+ const bytes = await pdfInputToBytes(input);
1473
+ return bytesToBase64(bytes);
1474
+ }
1475
+ if (isBytes(input)) {
1476
+ return bytesToBase64(input);
1477
+ }
1478
+ return input;
1479
+ }
1480
+ function bytesToBase64(bytes) {
1481
+ if (typeof Buffer !== "undefined") {
1482
+ return Buffer.from(bytes).toString("base64");
1483
+ }
1484
+ let binary = "";
1485
+ for (let i = 0; i < bytes.length; i++) {
1486
+ binary += String.fromCharCode(bytes[i]);
1487
+ }
1488
+ return btoa(binary);
1489
+ }
1490
+ function isFileReference(input) {
1491
+ return isFileIdRef(input) || isUrl(input);
1492
+ }
1493
+ function getFileIdentifier(input) {
1494
+ if (isFileIdRef(input)) {
1495
+ return { fileId: input.fileId };
1496
+ }
1497
+ if (isUrl(input)) {
1498
+ return { url: input.toString() };
1499
+ }
1500
+ return void 0;
1501
+ }
1502
+ async function getPdfPageCount(input) {
1503
+ const bytes = await pdfInputToBytes(input);
1504
+ const doc = await PDFDocument.load(bytes, { ignoreEncryption: true });
1505
+ return doc.getPageCount();
1506
+ }
1507
+ async function extractPageRange(input, startPage, endPage) {
1508
+ if (isFileIdRef(input)) {
1509
+ throw new Error(
1510
+ "Cannot extract page range from fileId reference. The provider must handle fileId inputs directly or you must pass the full PDF as base64/bytes."
1511
+ );
1512
+ }
1513
+ if (isUrl(input) && (input.protocol === "http:" || input.protocol === "https:")) {
1514
+ throw new Error(
1515
+ "Cannot extract page range from remote URL. Either pass the full PDF as base64/bytes, or download it first."
1516
+ );
1517
+ }
1518
+ const srcBytes = await pdfInputToBytes(input);
1428
1519
  const srcDoc = await PDFDocument.load(srcBytes, { ignoreEncryption: true });
1429
1520
  const totalPages = srcDoc.getPageCount();
1430
1521
  const start = Math.max(startPage - 1, 0);
1431
1522
  const end = Math.min(endPage, totalPages) - 1;
1432
1523
  if (start === 0 && end >= totalPages - 1) {
1433
- return pdfBase64;
1524
+ if (isBytes(input)) {
1525
+ return bytesToBase64(input);
1526
+ }
1527
+ if (typeof input === "string") {
1528
+ return input;
1529
+ }
1530
+ return bytesToBase64(srcBytes);
1434
1531
  }
1435
1532
  const newDoc = await PDFDocument.create();
1436
1533
  const indices = Array.from({ length: end - start + 1 }, (_, i) => start + i);
1437
1534
  const pages = await newDoc.copyPages(srcDoc, indices);
1438
1535
  pages.forEach((page) => newDoc.addPage(page));
1439
1536
  const bytes = await newDoc.save();
1440
- if (typeof Buffer !== "undefined") {
1441
- return Buffer.from(bytes).toString("base64");
1537
+ return bytesToBase64(new Uint8Array(bytes));
1538
+ }
1539
+ async function buildPdfProviderOptions(input, existingOptions) {
1540
+ const options = { ...existingOptions };
1541
+ if (isFileIdRef(input)) {
1542
+ options.fileId = input.fileId;
1543
+ if (input.mimeType) {
1544
+ options.fileMimeType = input.mimeType;
1545
+ }
1546
+ return options;
1442
1547
  }
1443
- let binary = "";
1444
- const uint8 = new Uint8Array(bytes);
1445
- for (let i = 0; i < uint8.length; i++) {
1446
- binary += String.fromCharCode(uint8[i]);
1548
+ if (isUrl(input)) {
1549
+ options.pdfUrl = input;
1550
+ return options;
1447
1551
  }
1448
- return btoa(binary);
1449
- }
1450
- async function getPdfPageCount(pdfBase64) {
1451
- const srcBytes = typeof Buffer !== "undefined" ? Buffer.from(pdfBase64, "base64") : Uint8Array.from(atob(pdfBase64), (c) => c.charCodeAt(0));
1452
- const doc = await PDFDocument.load(srcBytes, { ignoreEncryption: true });
1453
- return doc.getPageCount();
1552
+ options.pdfBase64 = await pdfInputToBase64(input);
1553
+ return options;
1454
1554
  }
1455
1555
  function getAcroFormFields(pdfDoc) {
1456
1556
  const form = pdfDoc.getForm();
@@ -1543,7 +1643,7 @@ async function runExtractor(params) {
1543
1643
  name,
1544
1644
  prompt,
1545
1645
  schema,
1546
- pdfBase64,
1646
+ pdfInput,
1547
1647
  startPage,
1548
1648
  endPage,
1549
1649
  generateObject,
@@ -1553,6 +1653,7 @@ async function runExtractor(params) {
1553
1653
  } = params;
1554
1654
  const extractorProviderOptions = { ...providerOptions };
1555
1655
  let fullPrompt;
1656
+ const pdfBase64 = await pdfInputToBase64(pdfInput);
1556
1657
  if (convertPdfToImages) {
1557
1658
  const images = await convertPdfToImages(pdfBase64, startPage, endPage);
1558
1659
  extractorProviderOptions.images = images;
@@ -5005,7 +5106,7 @@ async function findReferencedPages(params) {
5005
5106
  referenceTarget,
5006
5107
  sections,
5007
5108
  formInventory,
5008
- pdfBase64,
5109
+ pdfInput,
5009
5110
  pageCount,
5010
5111
  generateObject,
5011
5112
  providerOptions,
@@ -5045,7 +5146,7 @@ If you cannot find the section, return startPage: 0 and endPage: 0.
5045
5146
  Return JSON only.`,
5046
5147
  schema: PageLocationSchema,
5047
5148
  maxTokens: 256,
5048
- providerOptions: { ...providerOptions, pdfBase64 }
5149
+ providerOptions: await buildPdfProviderOptions(pdfInput, providerOptions)
5049
5150
  },
5050
5151
  {
5051
5152
  fallback: { startPage: 0, endPage: 0 },
@@ -5072,7 +5173,7 @@ Return JSON only.`,
5072
5173
  async function resolveReferentialCoverages(params) {
5073
5174
  const {
5074
5175
  memory,
5075
- pdfBase64,
5176
+ pdfInput,
5076
5177
  pageCount,
5077
5178
  generateObject,
5078
5179
  convertPdfToImages,
@@ -5133,7 +5234,7 @@ async function resolveReferentialCoverages(params) {
5133
5234
  referenceTarget: target,
5134
5235
  sections,
5135
5236
  formInventory,
5136
- pdfBase64,
5237
+ pdfInput,
5137
5238
  pageCount,
5138
5239
  generateObject,
5139
5240
  providerOptions,
@@ -5167,7 +5268,7 @@ async function resolveReferentialCoverages(params) {
5167
5268
  name: "referential_lookup",
5168
5269
  prompt: buildReferentialLookupPrompt(promptCoverages),
5169
5270
  schema: ReferentialLookupSchema,
5170
- pdfBase64,
5271
+ pdfInput,
5171
5272
  startPage: pageRange.startPage,
5172
5273
  endPage: pageRange.endPage,
5173
5274
  generateObject,
@@ -5789,7 +5890,7 @@ function createExtractor(config) {
5789
5890
  }))
5790
5891
  };
5791
5892
  }
5792
- async function extract(pdfBase64, documentId, options) {
5893
+ async function extract(pdfInput, documentId, options) {
5793
5894
  const id = documentId ?? `doc-${Date.now()}`;
5794
5895
  const memory = /* @__PURE__ */ new Map();
5795
5896
  totalUsage = { inputTokens: 0, outputTokens: 0 };
@@ -5807,20 +5908,27 @@ function createExtractor(config) {
5807
5908
  memory.set(k, v);
5808
5909
  }
5809
5910
  }
5911
+ let pdfBase64Cache;
5912
+ async function getPdfBase64ForExtraction() {
5913
+ if (pdfBase64Cache === void 0) {
5914
+ pdfBase64Cache = await pdfInputToBase64(pdfInput);
5915
+ }
5916
+ return pdfBase64Cache;
5917
+ }
5810
5918
  let classifyResult;
5811
5919
  if (resumed?.classifyResult && pipelineCtx.isPhaseComplete("classify")) {
5812
5920
  classifyResult = resumed.classifyResult;
5813
5921
  onProgress?.("Resuming from checkpoint (classify complete)...");
5814
5922
  } else {
5815
5923
  onProgress?.("Classifying document...");
5816
- const pageCount2 = await getPdfPageCount(pdfBase64);
5924
+ const pageCount2 = await getPdfPageCount(pdfInput);
5817
5925
  const classifyResponse = await safeGenerateObject(
5818
5926
  generateObject,
5819
5927
  {
5820
5928
  prompt: buildClassifyPrompt(),
5821
5929
  schema: ClassifyResultSchema,
5822
5930
  maxTokens: 512,
5823
- providerOptions: { ...providerOptions, pdfBase64 }
5931
+ providerOptions: await buildPdfProviderOptions(pdfInput, providerOptions)
5824
5932
  },
5825
5933
  {
5826
5934
  fallback: { documentType: "policy", policyTypes: ["other"], confidence: 0 },
@@ -5845,7 +5953,7 @@ function createExtractor(config) {
5845
5953
  const { documentType, policyTypes } = classifyResult;
5846
5954
  const primaryType = policyTypes[0] ?? "other";
5847
5955
  const template = getTemplate(primaryType);
5848
- const pageCount = resumed?.pageCount ?? await getPdfPageCount(pdfBase64);
5956
+ const pageCount = resumed?.pageCount ?? await getPdfPageCount(pdfInput);
5849
5957
  const templateHints = buildTemplateHints(primaryType, documentType, pageCount, template);
5850
5958
  let formInventory;
5851
5959
  if (resumed?.formInventory && pipelineCtx.isPhaseComplete("form_inventory")) {
@@ -5860,7 +5968,7 @@ function createExtractor(config) {
5860
5968
  prompt: buildFormInventoryPrompt(templateHints),
5861
5969
  schema: FormInventorySchema,
5862
5970
  maxTokens: 2048,
5863
- providerOptions: { ...providerOptions, pdfBase64 }
5971
+ providerOptions: await buildPdfProviderOptions(pdfInput, providerOptions)
5864
5972
  },
5865
5973
  {
5866
5974
  fallback: { forms: [] },
@@ -5888,9 +5996,10 @@ function createExtractor(config) {
5888
5996
  const chunkSize = 8;
5889
5997
  const collectedAssignments = [];
5890
5998
  const formInventoryHint = formInventory?.forms.length ? formatFormInventoryForPageMap(formInventory.forms) : void 0;
5999
+ const extractionBase64 = await getPdfBase64ForExtraction();
5891
6000
  for (let startPage = 1; startPage <= pageCount; startPage += chunkSize) {
5892
6001
  const endPage = Math.min(pageCount, startPage + chunkSize - 1);
5893
- const pagesPdf = await extractPageRange(pdfBase64, startPage, endPage);
6002
+ const pagesPdf = await extractPageRange(extractionBase64, startPage, endPage);
5894
6003
  const mapResponse = await safeGenerateObject(
5895
6004
  generateObject,
5896
6005
  {
@@ -5970,7 +6079,7 @@ function createExtractor(config) {
5970
6079
  name: task.extractorName,
5971
6080
  prompt: ext.buildPrompt(),
5972
6081
  schema: ext.schema,
5973
- pdfBase64,
6082
+ pdfInput,
5974
6083
  startPage: task.startPage,
5975
6084
  endPage: task.endPage,
5976
6085
  generateObject,
@@ -6000,7 +6109,7 @@ function createExtractor(config) {
6000
6109
  name: "supplementary",
6001
6110
  prompt: buildSupplementaryPrompt(alreadyExtractedSummary),
6002
6111
  schema: SupplementarySchema,
6003
- pdfBase64,
6112
+ pdfInput,
6004
6113
  startPage: 1,
6005
6114
  endPage: pageCount,
6006
6115
  generateObject,
@@ -6029,7 +6138,7 @@ function createExtractor(config) {
6029
6138
  try {
6030
6139
  const resolution = await resolveReferentialCoverages({
6031
6140
  memory,
6032
- pdfBase64,
6141
+ pdfInput,
6033
6142
  pageCount,
6034
6143
  generateObject,
6035
6144
  convertPdfToImages,
@@ -6069,7 +6178,7 @@ function createExtractor(config) {
6069
6178
  prompt: buildReviewPrompt(template.required, extractedKeys, extractionSummary, pageMapSummary),
6070
6179
  schema: ReviewResultSchema,
6071
6180
  maxTokens: 1536,
6072
- providerOptions: { ...providerOptions, pdfBase64 }
6181
+ providerOptions: await buildPdfProviderOptions(pdfInput, providerOptions)
6073
6182
  },
6074
6183
  {
6075
6184
  fallback: { complete: true, missingFields: [], qualityIssues: [], additionalTasks: [] },
@@ -6097,7 +6206,7 @@ function createExtractor(config) {
6097
6206
  name: task.extractorName,
6098
6207
  prompt: ext.buildPrompt(),
6099
6208
  schema: ext.schema,
6100
- pdfBase64,
6209
+ pdfInput,
6101
6210
  startPage: task.startPage,
6102
6211
  endPage: task.endPage,
6103
6212
  generateObject,
@@ -8942,6 +9051,7 @@ export {
8942
9051
  buildIntentPrompt,
8943
9052
  buildInterpretAttachmentPrompt,
8944
9053
  buildLookupFillPrompt,
9054
+ buildPdfProviderOptions,
8945
9055
  buildQueryClassifyPrompt,
8946
9056
  buildQuestionBatchPrompt,
8947
9057
  buildQuotesPoliciesPrompt,
@@ -8959,10 +9069,14 @@ export {
8959
9069
  fillAcroForm,
8960
9070
  getAcroFormFields,
8961
9071
  getExtractor,
9072
+ getFileIdentifier,
8962
9073
  getPdfPageCount,
8963
9074
  getTemplate,
9075
+ isFileReference,
8964
9076
  overlayTextOnPdf,
8965
9077
  pLimit,
9078
+ pdfInputToBase64,
9079
+ pdfInputToBytes,
8966
9080
  safeGenerateObject,
8967
9081
  sanitizeNulls,
8968
9082
  stripFences,