@claritylabs/cl-sdk 0.15.0 → 0.16.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1,7 +1,9 @@
1
1
  "use strict";
2
+ var __create = Object.create;
2
3
  var __defProp = Object.defineProperty;
3
4
  var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
4
5
  var __getOwnPropNames = Object.getOwnPropertyNames;
6
+ var __getProtoOf = Object.getPrototypeOf;
5
7
  var __hasOwnProp = Object.prototype.hasOwnProperty;
6
8
  var __export = (target, all) => {
7
9
  for (var name in all)
@@ -15,6 +17,14 @@ var __copyProps = (to, from, except, desc) => {
15
17
  }
16
18
  return to;
17
19
  };
20
+ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
21
+ // If the importer is in node compatibility mode or this is not an ESM
22
+ // file that has been converted to a CommonJS file using a Babel-
23
+ // compatible transform (i.e. "__esModule" has not been set), then set
24
+ // "default" to the CommonJS "module.exports" for node compatibility.
25
+ isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
26
+ mod
27
+ ));
18
28
  var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
19
29
 
20
30
  // src/index.ts
@@ -221,6 +231,7 @@ __export(index_exports, {
221
231
  buildIntentPrompt: () => buildIntentPrompt,
222
232
  buildInterpretAttachmentPrompt: () => buildInterpretAttachmentPrompt,
223
233
  buildLookupFillPrompt: () => buildLookupFillPrompt,
234
+ buildPdfProviderOptions: () => buildPdfProviderOptions,
224
235
  buildQueryClassifyPrompt: () => buildQueryClassifyPrompt,
225
236
  buildQuestionBatchPrompt: () => buildQuestionBatchPrompt,
226
237
  buildQuotesPoliciesPrompt: () => buildQuotesPoliciesPrompt,
@@ -238,10 +249,14 @@ __export(index_exports, {
238
249
  fillAcroForm: () => fillAcroForm,
239
250
  getAcroFormFields: () => getAcroFormFields,
240
251
  getExtractor: () => getExtractor,
252
+ getFileIdentifier: () => getFileIdentifier,
241
253
  getPdfPageCount: () => getPdfPageCount,
242
254
  getTemplate: () => getTemplate,
255
+ isFileReference: () => isFileReference,
243
256
  overlayTextOnPdf: () => overlayTextOnPdf,
244
257
  pLimit: () => pLimit,
258
+ pdfInputToBase64: () => pdfInputToBase64,
259
+ pdfInputToBytes: () => pdfInputToBytes,
245
260
  safeGenerateObject: () => safeGenerateObject,
246
261
  sanitizeNulls: () => sanitizeNulls,
247
262
  stripFences: () => stripFences,
@@ -1667,34 +1682,134 @@ var CONTEXT_KEY_MAP = [
1667
1682
 
1668
1683
  // src/extraction/pdf.ts
1669
1684
  var import_pdf_lib = require("pdf-lib");
1670
- async function extractPageRange(pdfBase64, startPage, endPage) {
1671
- const srcBytes = typeof Buffer !== "undefined" ? Buffer.from(pdfBase64, "base64") : Uint8Array.from(atob(pdfBase64), (c) => c.charCodeAt(0));
1685
+ function isFileIdRef(input) {
1686
+ return typeof input === "object" && input !== null && "fileId" in input;
1687
+ }
1688
+ function isUrl(input) {
1689
+ return input instanceof URL;
1690
+ }
1691
+ function isBytes(input) {
1692
+ return input instanceof Uint8Array;
1693
+ }
1694
+ async function pdfInputToBytes(input) {
1695
+ if (isFileIdRef(input)) {
1696
+ throw new Error(
1697
+ "Cannot convert fileId reference to bytes. Pass the fileId directly to your provider callback instead."
1698
+ );
1699
+ }
1700
+ if (isUrl(input)) {
1701
+ if (input.protocol === "file:") {
1702
+ if (typeof process !== "undefined" && process.versions?.node) {
1703
+ const fs = await import("fs/promises");
1704
+ const buffer = await fs.readFile(input.pathname);
1705
+ return new Uint8Array(buffer);
1706
+ }
1707
+ throw new Error("File URLs not supported in browser environment");
1708
+ }
1709
+ const response = await fetch(input.toString());
1710
+ if (!response.ok) {
1711
+ throw new Error(`Failed to fetch PDF: ${response.status} ${response.statusText}`);
1712
+ }
1713
+ const arrayBuffer = await response.arrayBuffer();
1714
+ return new Uint8Array(arrayBuffer);
1715
+ }
1716
+ if (isBytes(input)) {
1717
+ return input;
1718
+ }
1719
+ if (typeof Buffer !== "undefined") {
1720
+ return new Uint8Array(Buffer.from(input, "base64"));
1721
+ }
1722
+ return Uint8Array.from(atob(input), (c) => c.charCodeAt(0));
1723
+ }
1724
+ async function pdfInputToBase64(input) {
1725
+ if (isFileIdRef(input)) {
1726
+ throw new Error(
1727
+ "Cannot convert fileId reference to base64. Pass the fileId directly to your provider callback instead."
1728
+ );
1729
+ }
1730
+ if (isUrl(input)) {
1731
+ const bytes = await pdfInputToBytes(input);
1732
+ return bytesToBase64(bytes);
1733
+ }
1734
+ if (isBytes(input)) {
1735
+ return bytesToBase64(input);
1736
+ }
1737
+ return input;
1738
+ }
1739
+ function bytesToBase64(bytes) {
1740
+ if (typeof Buffer !== "undefined") {
1741
+ return Buffer.from(bytes).toString("base64");
1742
+ }
1743
+ let binary = "";
1744
+ for (let i = 0; i < bytes.length; i++) {
1745
+ binary += String.fromCharCode(bytes[i]);
1746
+ }
1747
+ return btoa(binary);
1748
+ }
1749
+ function isFileReference(input) {
1750
+ return isFileIdRef(input) || isUrl(input);
1751
+ }
1752
+ function getFileIdentifier(input) {
1753
+ if (isFileIdRef(input)) {
1754
+ return { fileId: input.fileId };
1755
+ }
1756
+ if (isUrl(input)) {
1757
+ return { url: input.toString() };
1758
+ }
1759
+ return void 0;
1760
+ }
1761
+ async function getPdfPageCount(input) {
1762
+ const bytes = await pdfInputToBytes(input);
1763
+ const doc = await import_pdf_lib.PDFDocument.load(bytes, { ignoreEncryption: true });
1764
+ return doc.getPageCount();
1765
+ }
1766
+ async function extractPageRange(input, startPage, endPage) {
1767
+ if (isFileIdRef(input)) {
1768
+ throw new Error(
1769
+ "Cannot extract page range from fileId reference. The provider must handle fileId inputs directly or you must pass the full PDF as base64/bytes."
1770
+ );
1771
+ }
1772
+ if (isUrl(input) && (input.protocol === "http:" || input.protocol === "https:")) {
1773
+ throw new Error(
1774
+ "Cannot extract page range from remote URL. Either pass the full PDF as base64/bytes, or download it first."
1775
+ );
1776
+ }
1777
+ const srcBytes = await pdfInputToBytes(input);
1672
1778
  const srcDoc = await import_pdf_lib.PDFDocument.load(srcBytes, { ignoreEncryption: true });
1673
1779
  const totalPages = srcDoc.getPageCount();
1674
1780
  const start = Math.max(startPage - 1, 0);
1675
1781
  const end = Math.min(endPage, totalPages) - 1;
1676
1782
  if (start === 0 && end >= totalPages - 1) {
1677
- return pdfBase64;
1783
+ if (isBytes(input)) {
1784
+ return bytesToBase64(input);
1785
+ }
1786
+ if (typeof input === "string") {
1787
+ return input;
1788
+ }
1789
+ return bytesToBase64(srcBytes);
1678
1790
  }
1679
1791
  const newDoc = await import_pdf_lib.PDFDocument.create();
1680
1792
  const indices = Array.from({ length: end - start + 1 }, (_, i) => start + i);
1681
1793
  const pages = await newDoc.copyPages(srcDoc, indices);
1682
1794
  pages.forEach((page) => newDoc.addPage(page));
1683
1795
  const bytes = await newDoc.save();
1684
- if (typeof Buffer !== "undefined") {
1685
- return Buffer.from(bytes).toString("base64");
1796
+ return bytesToBase64(new Uint8Array(bytes));
1797
+ }
1798
+ async function buildPdfProviderOptions(input, existingOptions) {
1799
+ const options = { ...existingOptions };
1800
+ if (isFileIdRef(input)) {
1801
+ options.fileId = input.fileId;
1802
+ if (input.mimeType) {
1803
+ options.fileMimeType = input.mimeType;
1804
+ }
1805
+ return options;
1686
1806
  }
1687
- let binary = "";
1688
- const uint8 = new Uint8Array(bytes);
1689
- for (let i = 0; i < uint8.length; i++) {
1690
- binary += String.fromCharCode(uint8[i]);
1807
+ if (isUrl(input)) {
1808
+ options.pdfUrl = input;
1809
+ return options;
1691
1810
  }
1692
- return btoa(binary);
1693
- }
1694
- async function getPdfPageCount(pdfBase64) {
1695
- const srcBytes = typeof Buffer !== "undefined" ? Buffer.from(pdfBase64, "base64") : Uint8Array.from(atob(pdfBase64), (c) => c.charCodeAt(0));
1696
- const doc = await import_pdf_lib.PDFDocument.load(srcBytes, { ignoreEncryption: true });
1697
- return doc.getPageCount();
1811
+ options.pdfBase64 = await pdfInputToBase64(input);
1812
+ return options;
1698
1813
  }
1699
1814
  function getAcroFormFields(pdfDoc) {
1700
1815
  const form = pdfDoc.getForm();
@@ -1787,7 +1902,7 @@ async function runExtractor(params) {
1787
1902
  name,
1788
1903
  prompt,
1789
1904
  schema,
1790
- pdfBase64,
1905
+ pdfInput,
1791
1906
  startPage,
1792
1907
  endPage,
1793
1908
  generateObject,
@@ -1797,6 +1912,7 @@ async function runExtractor(params) {
1797
1912
  } = params;
1798
1913
  const extractorProviderOptions = { ...providerOptions };
1799
1914
  let fullPrompt;
1915
+ const pdfBase64 = await pdfInputToBase64(pdfInput);
1800
1916
  if (convertPdfToImages) {
1801
1917
  const images = await convertPdfToImages(pdfBase64, startPage, endPage);
1802
1918
  extractorProviderOptions.images = images;
@@ -5249,7 +5365,7 @@ async function findReferencedPages(params) {
5249
5365
  referenceTarget,
5250
5366
  sections,
5251
5367
  formInventory,
5252
- pdfBase64,
5368
+ pdfInput,
5253
5369
  pageCount,
5254
5370
  generateObject,
5255
5371
  providerOptions,
@@ -5289,7 +5405,7 @@ If you cannot find the section, return startPage: 0 and endPage: 0.
5289
5405
  Return JSON only.`,
5290
5406
  schema: PageLocationSchema,
5291
5407
  maxTokens: 256,
5292
- providerOptions: { ...providerOptions, pdfBase64 }
5408
+ providerOptions: await buildPdfProviderOptions(pdfInput, providerOptions)
5293
5409
  },
5294
5410
  {
5295
5411
  fallback: { startPage: 0, endPage: 0 },
@@ -5316,7 +5432,7 @@ Return JSON only.`,
5316
5432
  async function resolveReferentialCoverages(params) {
5317
5433
  const {
5318
5434
  memory,
5319
- pdfBase64,
5435
+ pdfInput,
5320
5436
  pageCount,
5321
5437
  generateObject,
5322
5438
  convertPdfToImages,
@@ -5377,7 +5493,7 @@ async function resolveReferentialCoverages(params) {
5377
5493
  referenceTarget: target,
5378
5494
  sections,
5379
5495
  formInventory,
5380
- pdfBase64,
5496
+ pdfInput,
5381
5497
  pageCount,
5382
5498
  generateObject,
5383
5499
  providerOptions,
@@ -5411,7 +5527,7 @@ async function resolveReferentialCoverages(params) {
5411
5527
  name: "referential_lookup",
5412
5528
  prompt: buildReferentialLookupPrompt(promptCoverages),
5413
5529
  schema: ReferentialLookupSchema,
5414
- pdfBase64,
5530
+ pdfInput,
5415
5531
  startPage: pageRange.startPage,
5416
5532
  endPage: pageRange.endPage,
5417
5533
  generateObject,
@@ -6033,7 +6149,7 @@ function createExtractor(config) {
6033
6149
  }))
6034
6150
  };
6035
6151
  }
6036
- async function extract(pdfBase64, documentId, options) {
6152
+ async function extract(pdfInput, documentId, options) {
6037
6153
  const id = documentId ?? `doc-${Date.now()}`;
6038
6154
  const memory = /* @__PURE__ */ new Map();
6039
6155
  totalUsage = { inputTokens: 0, outputTokens: 0 };
@@ -6051,20 +6167,27 @@ function createExtractor(config) {
6051
6167
  memory.set(k, v);
6052
6168
  }
6053
6169
  }
6170
+ let pdfBase64Cache;
6171
+ async function getPdfBase64ForExtraction() {
6172
+ if (pdfBase64Cache === void 0) {
6173
+ pdfBase64Cache = await pdfInputToBase64(pdfInput);
6174
+ }
6175
+ return pdfBase64Cache;
6176
+ }
6054
6177
  let classifyResult;
6055
6178
  if (resumed?.classifyResult && pipelineCtx.isPhaseComplete("classify")) {
6056
6179
  classifyResult = resumed.classifyResult;
6057
6180
  onProgress?.("Resuming from checkpoint (classify complete)...");
6058
6181
  } else {
6059
6182
  onProgress?.("Classifying document...");
6060
- const pageCount2 = await getPdfPageCount(pdfBase64);
6183
+ const pageCount2 = await getPdfPageCount(pdfInput);
6061
6184
  const classifyResponse = await safeGenerateObject(
6062
6185
  generateObject,
6063
6186
  {
6064
6187
  prompt: buildClassifyPrompt(),
6065
6188
  schema: ClassifyResultSchema,
6066
6189
  maxTokens: 512,
6067
- providerOptions: { ...providerOptions, pdfBase64 }
6190
+ providerOptions: await buildPdfProviderOptions(pdfInput, providerOptions)
6068
6191
  },
6069
6192
  {
6070
6193
  fallback: { documentType: "policy", policyTypes: ["other"], confidence: 0 },
@@ -6089,7 +6212,7 @@ function createExtractor(config) {
6089
6212
  const { documentType, policyTypes } = classifyResult;
6090
6213
  const primaryType = policyTypes[0] ?? "other";
6091
6214
  const template = getTemplate(primaryType);
6092
- const pageCount = resumed?.pageCount ?? await getPdfPageCount(pdfBase64);
6215
+ const pageCount = resumed?.pageCount ?? await getPdfPageCount(pdfInput);
6093
6216
  const templateHints = buildTemplateHints(primaryType, documentType, pageCount, template);
6094
6217
  let formInventory;
6095
6218
  if (resumed?.formInventory && pipelineCtx.isPhaseComplete("form_inventory")) {
@@ -6104,7 +6227,7 @@ function createExtractor(config) {
6104
6227
  prompt: buildFormInventoryPrompt(templateHints),
6105
6228
  schema: FormInventorySchema,
6106
6229
  maxTokens: 2048,
6107
- providerOptions: { ...providerOptions, pdfBase64 }
6230
+ providerOptions: await buildPdfProviderOptions(pdfInput, providerOptions)
6108
6231
  },
6109
6232
  {
6110
6233
  fallback: { forms: [] },
@@ -6132,9 +6255,10 @@ function createExtractor(config) {
6132
6255
  const chunkSize = 8;
6133
6256
  const collectedAssignments = [];
6134
6257
  const formInventoryHint = formInventory?.forms.length ? formatFormInventoryForPageMap(formInventory.forms) : void 0;
6258
+ const extractionBase64 = await getPdfBase64ForExtraction();
6135
6259
  for (let startPage = 1; startPage <= pageCount; startPage += chunkSize) {
6136
6260
  const endPage = Math.min(pageCount, startPage + chunkSize - 1);
6137
- const pagesPdf = await extractPageRange(pdfBase64, startPage, endPage);
6261
+ const pagesPdf = await extractPageRange(extractionBase64, startPage, endPage);
6138
6262
  const mapResponse = await safeGenerateObject(
6139
6263
  generateObject,
6140
6264
  {
@@ -6214,7 +6338,7 @@ function createExtractor(config) {
6214
6338
  name: task.extractorName,
6215
6339
  prompt: ext.buildPrompt(),
6216
6340
  schema: ext.schema,
6217
- pdfBase64,
6341
+ pdfInput,
6218
6342
  startPage: task.startPage,
6219
6343
  endPage: task.endPage,
6220
6344
  generateObject,
@@ -6244,7 +6368,7 @@ function createExtractor(config) {
6244
6368
  name: "supplementary",
6245
6369
  prompt: buildSupplementaryPrompt(alreadyExtractedSummary),
6246
6370
  schema: SupplementarySchema,
6247
- pdfBase64,
6371
+ pdfInput,
6248
6372
  startPage: 1,
6249
6373
  endPage: pageCount,
6250
6374
  generateObject,
@@ -6273,7 +6397,7 @@ function createExtractor(config) {
6273
6397
  try {
6274
6398
  const resolution = await resolveReferentialCoverages({
6275
6399
  memory,
6276
- pdfBase64,
6400
+ pdfInput,
6277
6401
  pageCount,
6278
6402
  generateObject,
6279
6403
  convertPdfToImages,
@@ -6313,7 +6437,7 @@ function createExtractor(config) {
6313
6437
  prompt: buildReviewPrompt(template.required, extractedKeys, extractionSummary, pageMapSummary),
6314
6438
  schema: ReviewResultSchema,
6315
6439
  maxTokens: 1536,
6316
- providerOptions: { ...providerOptions, pdfBase64 }
6440
+ providerOptions: await buildPdfProviderOptions(pdfInput, providerOptions)
6317
6441
  },
6318
6442
  {
6319
6443
  fallback: { complete: true, missingFields: [], qualityIssues: [], additionalTasks: [] },
@@ -6341,7 +6465,7 @@ function createExtractor(config) {
6341
6465
  name: task.extractorName,
6342
6466
  prompt: ext.buildPrompt(),
6343
6467
  schema: ext.schema,
6344
- pdfBase64,
6468
+ pdfInput,
6345
6469
  startPage: task.startPage,
6346
6470
  endPage: task.endPage,
6347
6471
  generateObject,
@@ -9187,6 +9311,7 @@ var AGENT_TOOLS = [
9187
9311
  buildIntentPrompt,
9188
9312
  buildInterpretAttachmentPrompt,
9189
9313
  buildLookupFillPrompt,
9314
+ buildPdfProviderOptions,
9190
9315
  buildQueryClassifyPrompt,
9191
9316
  buildQuestionBatchPrompt,
9192
9317
  buildQuotesPoliciesPrompt,
@@ -9204,10 +9329,14 @@ var AGENT_TOOLS = [
9204
9329
  fillAcroForm,
9205
9330
  getAcroFormFields,
9206
9331
  getExtractor,
9332
+ getFileIdentifier,
9207
9333
  getPdfPageCount,
9208
9334
  getTemplate,
9335
+ isFileReference,
9209
9336
  overlayTextOnPdf,
9210
9337
  pLimit,
9338
+ pdfInputToBase64,
9339
+ pdfInputToBytes,
9211
9340
  safeGenerateObject,
9212
9341
  sanitizeNulls,
9213
9342
  stripFences,