npm - @claritylabs/cl-sdk - Versions diffs - 0.15.0 → 0.16.0 - Mend

@claritylabs/cl-sdk 0.15.0 → 0.16.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7)

package/dist/index.mjs CHANGED

@@ -1423,34 +1423,134 @@ import {
   StandardFonts,
   rgb
 } from "pdf-lib";
-async function extractPageRange(pdfBase64, startPage, endPage) {
-  const srcBytes = typeof Buffer !== "undefined" ? Buffer.from(pdfBase64, "base64") : Uint8Array.from(atob(pdfBase64), (c) => c.charCodeAt(0));
+function isFileIdRef(input) {
+  return typeof input === "object" && input !== null && "fileId" in input;
+}
+function isUrl(input) {
+  return input instanceof URL;
+}
+function isBytes(input) {
+  return input instanceof Uint8Array;
+}
+async function pdfInputToBytes(input) {
+  if (isFileIdRef(input)) {
+    throw new Error(
+      "Cannot convert fileId reference to bytes. Pass the fileId directly to your provider callback instead."
+    );
+  }
+  if (isUrl(input)) {
+    if (input.protocol === "file:") {
+      if (typeof process !== "undefined" && process.versions?.node) {
+        const fs = await import("fs/promises");
+        const buffer = await fs.readFile(input.pathname);
+        return new Uint8Array(buffer);
+      }
+      throw new Error("File URLs not supported in browser environment");
+    }
+    const response = await fetch(input.toString());
+    if (!response.ok) {
+      throw new Error(`Failed to fetch PDF: ${response.status} ${response.statusText}`);
+    }
+    const arrayBuffer = await response.arrayBuffer();
+    return new Uint8Array(arrayBuffer);
+  }
+  if (isBytes(input)) {
+    return input;
+  }
+  if (typeof Buffer !== "undefined") {
+    return new Uint8Array(Buffer.from(input, "base64"));
+  }
+  return Uint8Array.from(atob(input), (c) => c.charCodeAt(0));
+}
+async function pdfInputToBase64(input) {
+  if (isFileIdRef(input)) {
+    throw new Error(
+      "Cannot convert fileId reference to base64. Pass the fileId directly to your provider callback instead."
+    );
+  }
+  if (isUrl(input)) {
+    const bytes = await pdfInputToBytes(input);
+    return bytesToBase64(bytes);
+  }
+  if (isBytes(input)) {
+    return bytesToBase64(input);
+  }
+  return input;
+}
+function bytesToBase64(bytes) {
+  if (typeof Buffer !== "undefined") {
+    return Buffer.from(bytes).toString("base64");
+  }
+  let binary = "";
+  for (let i = 0; i < bytes.length; i++) {
+    binary += String.fromCharCode(bytes[i]);
+  }
+  return btoa(binary);
+}
+function isFileReference(input) {
+  return isFileIdRef(input) || isUrl(input);
+}
+function getFileIdentifier(input) {
+  if (isFileIdRef(input)) {
+    return { fileId: input.fileId };
+  }
+  if (isUrl(input)) {
+    return { url: input.toString() };
+  }
+  return void 0;
+}
+async function getPdfPageCount(input) {
+  const bytes = await pdfInputToBytes(input);
+  const doc = await PDFDocument.load(bytes, { ignoreEncryption: true });
+  return doc.getPageCount();
+}
+async function extractPageRange(input, startPage, endPage) {
+  if (isFileIdRef(input)) {
+    throw new Error(
+      "Cannot extract page range from fileId reference. The provider must handle fileId inputs directly or you must pass the full PDF as base64/bytes."
+    );
+  }
+  if (isUrl(input) && (input.protocol === "http:" || input.protocol === "https:")) {
+    throw new Error(
+      "Cannot extract page range from remote URL. Either pass the full PDF as base64/bytes, or download it first."
+    );
+  }
+  const srcBytes = await pdfInputToBytes(input);
   const srcDoc = await PDFDocument.load(srcBytes, { ignoreEncryption: true });
   const totalPages = srcDoc.getPageCount();
   const start = Math.max(startPage - 1, 0);
   const end = Math.min(endPage, totalPages) - 1;
   if (start === 0 && end >= totalPages - 1) {
-    return pdfBase64;
+    if (isBytes(input)) {
+      return bytesToBase64(input);
+    }
+    if (typeof input === "string") {
+      return input;
+    }
+    return bytesToBase64(srcBytes);
   }
   const newDoc = await PDFDocument.create();
   const indices = Array.from({ length: end - start + 1 }, (_, i) => start + i);
   const pages = await newDoc.copyPages(srcDoc, indices);
   pages.forEach((page) => newDoc.addPage(page));
   const bytes = await newDoc.save();
-  if (typeof Buffer !== "undefined") {
-    return Buffer.from(bytes).toString("base64");
+  return bytesToBase64(new Uint8Array(bytes));
+}
+async function buildPdfProviderOptions(input, existingOptions) {
+  const options = { ...existingOptions };
+  if (isFileIdRef(input)) {
+    options.fileId = input.fileId;
+    if (input.mimeType) {
+      options.fileMimeType = input.mimeType;
+    }
+    return options;
   }
-  let binary = "";
-  const uint8 = new Uint8Array(bytes);
-  for (let i = 0; i < uint8.length; i++) {
-    binary += String.fromCharCode(uint8[i]);
+  if (isUrl(input)) {
+    options.pdfUrl = input;
+    return options;
   }
-  return btoa(binary);
-}
-async function getPdfPageCount(pdfBase64) {
-  const srcBytes = typeof Buffer !== "undefined" ? Buffer.from(pdfBase64, "base64") : Uint8Array.from(atob(pdfBase64), (c) => c.charCodeAt(0));
-  const doc = await PDFDocument.load(srcBytes, { ignoreEncryption: true });
-  return doc.getPageCount();
+  options.pdfBase64 = await pdfInputToBase64(input);
+  return options;
 }
 function getAcroFormFields(pdfDoc) {
   const form = pdfDoc.getForm();
@@ -1543,7 +1643,7 @@ async function runExtractor(params) {
     name,
     prompt,
     schema,
-    pdfBase64,
+    pdfInput,
     startPage,
     endPage,
     generateObject,
@@ -1553,6 +1653,7 @@ async function runExtractor(params) {
   } = params;
   const extractorProviderOptions = { ...providerOptions };
   let fullPrompt;
+  const pdfBase64 = await pdfInputToBase64(pdfInput);
   if (convertPdfToImages) {
     const images = await convertPdfToImages(pdfBase64, startPage, endPage);
     extractorProviderOptions.images = images;
@@ -5005,7 +5106,7 @@ async function findReferencedPages(params) {
     referenceTarget,
     sections,
     formInventory,
-    pdfBase64,
+    pdfInput,
     pageCount,
     generateObject,
     providerOptions,
@@ -5045,7 +5146,7 @@ If you cannot find the section, return startPage: 0 and endPage: 0.
 Return JSON only.`,
         schema: PageLocationSchema,
         maxTokens: 256,
-        providerOptions: { ...providerOptions, pdfBase64 }
+        providerOptions: await buildPdfProviderOptions(pdfInput, providerOptions)
       },
       {
         fallback: { startPage: 0, endPage: 0 },
@@ -5072,7 +5173,7 @@ Return JSON only.`,
 async function resolveReferentialCoverages(params) {
   const {
     memory,
-    pdfBase64,
+    pdfInput,
     pageCount,
     generateObject,
     convertPdfToImages,
@@ -5133,7 +5234,7 @@ async function resolveReferentialCoverages(params) {
           referenceTarget: target,
           sections,
           formInventory,
-          pdfBase64,
+          pdfInput,
           pageCount,
           generateObject,
           providerOptions,
@@ -5167,7 +5268,7 @@ async function resolveReferentialCoverages(params) {
             name: "referential_lookup",
             prompt: buildReferentialLookupPrompt(promptCoverages),
             schema: ReferentialLookupSchema,
-            pdfBase64,
+            pdfInput,
             startPage: pageRange.startPage,
             endPage: pageRange.endPage,
             generateObject,
@@ -5789,7 +5890,7 @@ function createExtractor(config) {
       }))
     };
   }
-  async function extract(pdfBase64, documentId, options) {
+  async function extract(pdfInput, documentId, options) {
     const id = documentId ?? `doc-${Date.now()}`;
     const memory = /* @__PURE__ */ new Map();
     totalUsage = { inputTokens: 0, outputTokens: 0 };
@@ -5807,20 +5908,27 @@ function createExtractor(config) {
         memory.set(k, v);
       }
     }
+    let pdfBase64Cache;
+    async function getPdfBase64ForExtraction() {
+      if (pdfBase64Cache === void 0) {
+        pdfBase64Cache = await pdfInputToBase64(pdfInput);
+      }
+      return pdfBase64Cache;
+    }
     let classifyResult;
     if (resumed?.classifyResult && pipelineCtx.isPhaseComplete("classify")) {
       classifyResult = resumed.classifyResult;
       onProgress?.("Resuming from checkpoint (classify complete)...");
     } else {
       onProgress?.("Classifying document...");
-      const pageCount2 = await getPdfPageCount(pdfBase64);
+      const pageCount2 = await getPdfPageCount(pdfInput);
       const classifyResponse = await safeGenerateObject(
         generateObject,
         {
           prompt: buildClassifyPrompt(),
           schema: ClassifyResultSchema,
           maxTokens: 512,
-          providerOptions: { ...providerOptions, pdfBase64 }
+          providerOptions: await buildPdfProviderOptions(pdfInput, providerOptions)
         },
         {
           fallback: { documentType: "policy", policyTypes: ["other"], confidence: 0 },
@@ -5845,7 +5953,7 @@ function createExtractor(config) {
     const { documentType, policyTypes } = classifyResult;
     const primaryType = policyTypes[0] ?? "other";
     const template = getTemplate(primaryType);
-    const pageCount = resumed?.pageCount ?? await getPdfPageCount(pdfBase64);
+    const pageCount = resumed?.pageCount ?? await getPdfPageCount(pdfInput);
     const templateHints = buildTemplateHints(primaryType, documentType, pageCount, template);
     let formInventory;
     if (resumed?.formInventory && pipelineCtx.isPhaseComplete("form_inventory")) {
@@ -5860,7 +5968,7 @@ function createExtractor(config) {
           prompt: buildFormInventoryPrompt(templateHints),
           schema: FormInventorySchema,
           maxTokens: 2048,
-          providerOptions: { ...providerOptions, pdfBase64 }
+          providerOptions: await buildPdfProviderOptions(pdfInput, providerOptions)
         },
         {
           fallback: { forms: [] },
@@ -5888,9 +5996,10 @@ function createExtractor(config) {
       const chunkSize = 8;
       const collectedAssignments = [];
       const formInventoryHint = formInventory?.forms.length ? formatFormInventoryForPageMap(formInventory.forms) : void 0;
+      const extractionBase64 = await getPdfBase64ForExtraction();
       for (let startPage = 1; startPage <= pageCount; startPage += chunkSize) {
         const endPage = Math.min(pageCount, startPage + chunkSize - 1);
-        const pagesPdf = await extractPageRange(pdfBase64, startPage, endPage);
+        const pagesPdf = await extractPageRange(extractionBase64, startPage, endPage);
         const mapResponse = await safeGenerateObject(
           generateObject,
           {
@@ -5970,7 +6079,7 @@ function createExtractor(config) {
                 name: task.extractorName,
                 prompt: ext.buildPrompt(),
                 schema: ext.schema,
-                pdfBase64,
+                pdfInput,
                 startPage: task.startPage,
                 endPage: task.endPage,
                 generateObject,
@@ -6000,7 +6109,7 @@ function createExtractor(config) {
             name: "supplementary",
             prompt: buildSupplementaryPrompt(alreadyExtractedSummary),
             schema: SupplementarySchema,
-            pdfBase64,
+            pdfInput,
             startPage: 1,
             endPage: pageCount,
             generateObject,
@@ -6029,7 +6138,7 @@ function createExtractor(config) {
       try {
         const resolution = await resolveReferentialCoverages({
           memory,
-          pdfBase64,
+          pdfInput,
           pageCount,
           generateObject,
           convertPdfToImages,
@@ -6069,7 +6178,7 @@ function createExtractor(config) {
             prompt: buildReviewPrompt(template.required, extractedKeys, extractionSummary, pageMapSummary),
             schema: ReviewResultSchema,
             maxTokens: 1536,
-            providerOptions: { ...providerOptions, pdfBase64 }
+            providerOptions: await buildPdfProviderOptions(pdfInput, providerOptions)
           },
           {
             fallback: { complete: true, missingFields: [], qualityIssues: [], additionalTasks: [] },
@@ -6097,7 +6206,7 @@ function createExtractor(config) {
                   name: task.extractorName,
                   prompt: ext.buildPrompt(),
                   schema: ext.schema,
-                  pdfBase64,
+                  pdfInput,
                   startPage: task.startPage,
                   endPage: task.endPage,
                   generateObject,
@@ -8942,6 +9051,7 @@ export {
   buildIntentPrompt,
   buildInterpretAttachmentPrompt,
   buildLookupFillPrompt,
+  buildPdfProviderOptions,
   buildQueryClassifyPrompt,
   buildQuestionBatchPrompt,
   buildQuotesPoliciesPrompt,
@@ -8959,10 +9069,14 @@ export {
   fillAcroForm,
   getAcroFormFields,
   getExtractor,
+  getFileIdentifier,
   getPdfPageCount,
   getTemplate,
+  isFileReference,
   overlayTextOnPdf,
   pLimit,
+  pdfInputToBase64,
+  pdfInputToBytes,
   safeGenerateObject,
   sanitizeNulls,
   stripFences,