npm - @intuned/browser-dev - Versions diffs - 2.2.3-unify-sdks.28 → 2.2.3-unify-sdks.29 - Mend

@intuned/browser-dev 2.2.3-unify-sdks.28 → 2.2.3-unify-sdks.29

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (61) hide show

package/dist/ai/export.d.ts +230 -116
package/dist/ai/extractStructuredData.js +117 -28
package/dist/ai/extractStructuredDataUsingAi.js +4 -4
package/dist/ai/index.d.ts +230 -116
package/dist/ai/prompt.js +2 -2
package/dist/ai/tests/testExtractFromContent.spec.js +372 -0
package/dist/ai/tests/testExtractStructuredData.spec.js +24 -23
package/dist/ai/validators.js +37 -6
package/dist/helpers/export.d.ts +78 -125
package/dist/helpers/index.d.ts +78 -125
package/dist/helpers/saveFileToS3.js +6 -2
package/dist/helpers/tests/testDownloadFile.spec.js +9 -2
package/dist/helpers/uploadFileToS3.js +11 -9
package/dist/helpers/waitForDomSettled.js +3 -2
package/dist/helpers/withNetworkIdleWait.js +1 -1
package/generated-docs/ai/functions/extractStructuredData.mdx +145 -57
package/generated-docs/ai/functions/isPageLoaded.mdx +2 -56
package/generated-docs/ai/interfaces/ImageBufferContentItem.mdx +12 -0
package/generated-docs/ai/interfaces/ImageUrlContentItem.mdx +12 -0
package/generated-docs/ai/interfaces/TextContentItem.mdx +11 -0
package/generated-docs/ai/type-aliases/JsonSchema.mdx +40 -0
package/generated-docs/ai/type-aliases/SUPPORTED_MODELS.mdx +73 -2
package/generated-docs/helpers/functions/resolveUrl.mdx +106 -24
package/generated-docs/helpers/functions/saveFileToS3.mdx +25 -45
package/generated-docs/helpers/functions/uploadFileToS3.mdx +20 -6
package/generated-docs/helpers/functions/{withNetworkIdle.mdx → withNetworkIdleWait.mdx} +6 -6
package/generated-docs/helpers/interfaces/S3Configs.mdx +4 -4
package/generated-docs/optimized-extractors/functions/extractArrayFromLocator.mdx +121 -0
package/generated-docs/optimized-extractors/functions/extractArrayFromPage.mdx +126 -0
package/generated-docs/optimized-extractors/functions/extractObjectFromLocator.mdx +122 -0
package/generated-docs/optimized-extractors/functions/extractObjectFromPage.mdx +122 -0
package/generated-docs/optimized-extractors/interfaces/HtmlStrategy.mdx +39 -0
package/generated-docs/optimized-extractors/interfaces/ImageStrategy.mdx +35 -0
package/generated-docs/optimized-extractors/interfaces/SimpleArrayItemSchema.mdx +15 -0
package/generated-docs/optimized-extractors/interfaces/SimpleArrayStringSchema.mdx +13 -0
package/generated-docs/optimized-extractors/interfaces/SimpleObjectSchema.mdx +15 -0
package/generated-docs/optimized-extractors/interfaces/SimpleObjectStringSchema.mdx +12 -0
package/package.json +2 -1
package/test-docs/functions/downloadFile.mdx +95 -0
package/test-docs/functions/extractMarkdown.mdx +53 -0
package/test-docs/functions/filterEmptyValues.mdx +48 -0
package/test-docs/functions/goToUrl.mdx +97 -0
package/test-docs/functions/processDate.mdx +52 -0
package/test-docs/functions/resolveUrl.mdx +161 -0
package/test-docs/functions/sanitizeHtml.mdx +113 -0
package/test-docs/functions/saveFileToS3.mdx +124 -0
package/test-docs/functions/scrollToLoadContent.mdx +87 -0
package/test-docs/functions/uploadFileToS3.mdx +118 -0
package/test-docs/functions/validateDataUsingSchema.mdx +66 -0
package/test-docs/functions/waitForDomSettled.mdx +95 -0
package/test-docs/functions/withNetworkIdleWait.mdx +93 -0
package/test-docs/interfaces/Attachment.mdx +45 -0
package/test-docs/interfaces/S3Configs.mdx +36 -0
package/test-docs/interfaces/SanitizeHtmlOptions.mdx +22 -0
package/test-docs/type-aliases/AttachmentType.mdx +12 -0
package/test-docs/type-aliases/DataInput.mdx +11 -0
package/test-docs/type-aliases/DataObject.mdx +11 -0
package/test-docs/type-aliases/S3UploadableFile.mdx +10 -0
package/test-docs/type-aliases/Trigger.mdx +13 -0
package/dist/intunedServices/ApiGateway/test.spec.js +0 -1
package/generated-docs/helpers/interfaces/S3UploadOptions.mdx +0 -40

package/dist/ai/export.d.ts CHANGED Viewed

@@ -151,10 +151,40 @@ export interface ObjectSchema extends BasicSchema {
   /** Minimum number of properties required */
   minProperties?: number;
 }
+import { z } from "zod";
+/**
+ * JsonSchema can be a Zod schema, a string schema, a number schema, a boolean schema, an array schema, or an object schema.
+ * @interface JsonSchema
+ * @extends BasicSchema
+ * @example Object Schema
+ * ```typescript
+ * const schema: JsonSchema = {
+ *   type: "object",
+ *   properties: {
+ *     name: { type: "string" },
+ *     age: { type: "number" }
+ *   }
+ * };
+ * ```
+ * @example Zod Schema
+ * ```typescript
+ * const schema: JsonSchema = z.object({
+ *   name: z.string(),
+ *   age: z.number()
+ * });
+ * ```
+ */
+export type JsonSchema =
+  | z.ZodSchema
+  | StringSchema
+  | NumberSchema
+  | BooleanSchema
+  | ArraySchema
+  | ObjectSchema;
 /**
  * Extract structured data from web pages using AI-powered content analysis.
- *
+ * @overload Extract From Page or Locator
  * This function provides intelligent data extraction from web pages using various strategies
  * including HTML parsing, image analysis, and Markdown conversion. It supports extraction
  * from entire pages or specific elements, with built-in caching and retry mechanisms.
@@ -176,7 +206,11 @@ export interface ObjectSchema extends BasicSchema {
  * ```typescript Extract Product Information from Entire Page
  * import { extractStructuredData } from '@intuned/browser/ai';
  *
- * const productSchema = {
+ * const product = await extractStructuredData({
+ *   source: page,
+ *   strategy: "HTML",
+ *   model: "gpt-4o",
+ *   dataSchema: {
  *   type: "object",
  *   properties: {
  *     name: { type: "string" },
@@ -185,13 +219,7 @@ export interface ObjectSchema extends BasicSchema {
  *     inStock: { type: "boolean" }
  *   },
  *   required: ["name", "price"]
- * };
- *
- * const product = await extractStructuredData({
- *   source: page,
- *   strategy: "HTML",
- *   model: "gpt-4o",
- *   dataSchema: productSchema,
+ * },
  *   prompt: "Extract product details from this e-commerce page"
  * });
  *
@@ -202,7 +230,12 @@ export interface ObjectSchema extends BasicSchema {
  * ```typescript Extract Article Data from Specific Element
  * import { extractStructuredData } from '@intuned/browser/ai';
  *
- * const articleSchema = {
+ * const articleContainer = page.locator("article.main-content");
+ * const article = await extractStructuredData({
+ *   source: articleContainer,
+ *   strategy: "MARKDOWN",
+ *   model: "claude-3-5-sonnet-latest",
+ *   dataSchema: {
  *   type: "object",
  *   properties: {
  *     title: { type: "string" },
@@ -212,64 +245,107 @@ export interface ObjectSchema extends BasicSchema {
  *     tags: { type: "array", items: { type: "string" } }
  *   },
  *   required: ["title", "content"]
- * };
- *
- * const articleContainer = page.locator("article.main-content");
- * const article = await extractStructuredData({
- *   source: articleContainer,
- *   strategy: "MARKDOWN",
- *   model: "claude-3",
- *   dataSchema: articleSchema,
+ * },
  *   maxRetries: 5
  * });
  *
  * console.log(`Article: ${article.title} by ${article.author}`);
  * ```
+ */
+export declare function extractStructuredData(options: {
+  source: Page | Locator;
+  dataSchema: JsonSchema;
+  prompt?: string;
+  strategy?: "IMAGE" | "MARKDOWN" | "HTML";
+  model?: SUPPORTED_MODELS;
+  apiKey?: string;
+  enableDomMatching?: boolean;
+  enableCache?: boolean;
+  maxRetries?: number;
+}): Promise<any>;
+/**
+ * Extract structured data from content items (text, images) using AI-powered analysis.
+ * @overload Extract From Content
+ * This overload provides a simplified interface for data extraction from various content types
+ * without requiring a page source or extraction strategy. It accepts text content, image buffers,
+ * or image URLs and extracts structured data according to the provided schema.
  *
- * @example
- * ```typescript Extract Data from Screenshots using Image Strategy
+ * @param {Object} options - Configuration object containing extraction parameters
+ * @param {TextContentItem | ImageBufferContentItem | ImageUrlContentItem | Array<TextContentItem | ImageBufferContentItem | ImageUrlContentItem>} options.content - Content to extract data from - can be a single content item or array of content items
+ * @param {JsonSchema} options.dataSchema - [JsonSchema](../type-aliases/JsonSchema) defining the structure of the data to extract
+ * @param {SUPPORTED_MODELS} options.model - AI model to use for extraction (e.g., "gpt-4o", "claude-3-5-sonnet-latest"); see [SUPPORTED_MODELS](../type-aliases/SUPPORTED_MODELS) for all supported models
+ * @param {string} [options.prompt] - Optional prompt to guide the extraction process and provide more context
+ * @param {string} [options.apiKey] - Optional API key for AI extraction (if provided, will not be billed to your account)
+ * @param {boolean} [options.enableCache=true] - Whether to enable caching of the extracted data. Defaults to true.
+ * @param {number} [options.maxRetries=3] - Maximum number of retry attempts on failures. Failures can be validation errors, API errors, output errors, etc.
+ *
+ * @returns Promise resolving to the extracted structured data matching the provided schema
+ *
+ * @example
+ * ```typescript Extract Data from Text Content
  * import { extractStructuredData } from '@intuned/browser/ai';
  *
- * const chartSchema = {
- *   type: "object",
- *   properties: {
- *     title: { type: "string" },
- *     dataPoints: {
- *       type: "array",
- *       items: {
- *         type: "object",
- *         properties: {
- *           label: { type: "string" },
- *           value: { type: "number" }
- *         }
- *       }
- *     }
- *   }
+ * const textContent: TextContentItem = {
+ *   type: "text",
+ *   data: "John Doe, age 30, works as a Software Engineer at Tech Corp"
  * };
  *
- * const chartElement = page.locator("#data-visualization");
- * const chartData = await extractStructuredData({
- *   source: chartElement,
- *   strategy: "IMAGE",
+ * const person = await extractStructuredData({
+ *   content: textContent,
  *   model: "gpt-4o",
- *   dataSchema: chartSchema,
- *   prompt: "Extract the chart title and all data points with their values"
+ *   dataSchema: {
+ *   type: "object",
+ *   properties: {
+ *     name: { type: "string" },
+ *     age: { type: "number" },
+ *     occupation: { type: "string" },
+ *     company: { type: "string" }
+ *   },
+ *   required: ["name"]
+ * },
+ *   prompt: "Extract person information from the text"
  * });
  *
- * console.log(`Chart: ${chartData.title}`);
- * chartData.dataPoints.forEach(point => {
- *   console.log(`${point.label}: ${point.value}`);
+ * console.log(`Found person: ${person.name}, ${person.age} years old`);
+ * ```
+ *
+ * @example
+ * ```typescript Extract Data from Multiple Content Items
+ * import { extractStructuredData } from '@intuned/browser/ai';
+ *
+ * const mixedContent = [
+ *   { type: "text", data: "Product: iPhone 15" },
+ *   { type: "image-url", image_type: "jpeg", data: "https://mintcdn.com/intuned-7/asXJUUPBWwDlStUB/logo/light.svg?fit=max&auto=format&n=asXJUUPBWwDlStUB&q=85&s=6525c0b299b3226464eba6afa9b7ebe6" }
+ * ];
+ *
+ *
+ * const product = await extractStructuredData({
+ *   content: mixedContent,
+ *   model: "claude-3-5-sonnet-latest",
+ *   dataSchema: {
+ *   type: "object",
+ *   properties: {
+ *     name: { type: "string" },
+ *     price: { type: "string" },
+ *     features: { type: "array", items: { type: "string" } }
+ *   }
+ * },
+ *   maxRetries: 1,
+ *   enableCache: true
  * });
  * ```
  */
 export declare function extractStructuredData(options: {
-  source: Page | Locator;
+  content:
+    | (TextContentItem | ImageBufferContentItem | ImageUrlContentItem)[]
+    | TextContentItem
+    | ImageBufferContentItem
+    | ImageUrlContentItem;
   dataSchema: JsonSchema;
   prompt?: string;
-  strategy?: "IMAGE" | "MARKDOWN" | "HTML";
-  model?: SUPPORTED_MODELS;
+  model: SUPPORTED_MODELS;
   apiKey?: string;
-  enableDomMatching?: boolean;
   enableCache?: boolean;
   maxRetries?: number;
 }): Promise<any>;
@@ -348,72 +424,83 @@ type SUPPORTED_OPENAI_MODELS =
   | "o4-mini-deep-research-2025-06-26";
 /**
  * This type defines the supported AI models for data extraction.
- * It includes models from OpenAI, Anthropic, and Google Gemini.
- * The models are used in the extraction strategies to process and analyze the content of web pages or elements.
+ * It includes models from OpenAI and Anthropic.
+ * **Supported OpenAI Models**
+ * "gpt-3.5-turbo"
+ * "gpt-3.5-turbo-0125"
+ * "gpt-3.5-turbo-0301"
+ * "gpt-3.5-turbo-0613"
+ * "gpt-3.5-turbo-1106"
+ * "gpt-3.5-turbo-16k"
+ * "gpt-3.5-turbo-16k-0613"
+ * "gpt-3.5-turbo-instruct"
+ * "gpt-3.5-turbo-instruct-0914"
+ * "gpt-4"
+ * "gpt-4-0314"
+ * "gpt-4-0613"
+ * "gpt-4-32k"
+ * "gpt-4-32k-0314"
+ * "gpt-4-32k-0613"
+ * "gpt-4-turbo"
+ * "gpt-4-turbo-2024-04-09"
+ * "gpt-4.1"
+ * "gpt-4.1-2025-04-14"
+ * "gpt-4.1-mini"
+ * "gpt-4.1-mini-2025-04-14"
+ * "gpt-4.1-nano"
+ * "gpt-4.1-nano-2025-04-14"
+ * "gpt-4o"
+ * "gpt-4o-2024-05-13"
+ * "gpt-4o-2024-08-06"
+ * "gpt-4o-2024-11-20"
+ * "gpt-4o-mini"
+ * "gpt-4o-mini-2024-07-18"
+ * "gpt-5"
+ * "gpt-5-2025-08-07"
+ * "gpt-5-chat"
+ * "gpt-5-chat-latest"
+ * "gpt-5-mini"
+ * "gpt-5-mini-2025-08-07"
+ * "gpt-5-nano"
+ * "gpt-5-nano-2025-08-07"
+ * "o1"
+ * "o1-2024-12-17"
+ * "o1-mini"
+ * "o1-mini-2024-09-12"
+ * "o1-pro"
+ * "o1-pro-2025-03-19"
+ * "o3"
+ * "o3-2025-04-16"
+ * "o3-deep-research"
+ * "o3-deep-research-2025-06-26"
+ * "o3-mini"
+ * "o3-mini-2025-01-31"
+ * "o3-pro"
+ * "o3-pro-2025-06-10"
+ * "o4-mini"
+ * "o4-mini-2025-04-16"
+ * "o4-mini-deep-research"
+ * "o4-mini-deep-research-2025-06-26"
+ *
+ * **Supported Anthropic Models**
+ * "claude-3-5-haiku-20241022"
+ * "claude-3-5-haiku-latest"
+ * "claude-3-5-sonnet-20240620"
+ * "claude-3-5-sonnet-20241022"
+ * "claude-3-5-sonnet-latest"
+ * "claude-3-7-sonnet-20250219"
+ * "claude-3-7-sonnet-latest"
+ * "claude-3-haiku-20240307"
+ * "claude-4-opus-20250514"
+ * "claude-4-sonnet-20250514"
+ * "claude-opus-4-1"
+ * "claude-opus-4-1-20250805"
+ * "claude-opus-4-20250514"
+ * "claude-sonnet-4-20250514"
  * @type SUPPORTED_MODELS
  */
 type SUPPORTED_MODELS = SUPPORTED_CLAUDE_MODELS | SUPPORTED_OPENAI_MODELS;
-/**
- * Represents a JSON Schema definition for validating data structures.
- * Supports various schema types including string, number, boolean, array, and object schemas
- * with their respective validation rules and constraints.
- *
- * This type is a union of different schema types:
- * - StringSchema: For string validation with length and pattern constraints
- * - NumberSchema: For number/integer validation with range constraints
- * - BooleanSchema: For boolean values
- * - ArraySchema: For array validation with item constraints
- * - ObjectSchema: For object validation with property constraints
- *
- * @type JsonSchema
- * @example
- * ```typescript String Schema
- * const stringSchema: JsonSchema = {
- *   type: "string",
- *   minLength: 3,
- *   maxLength: 50,
- *   pattern: "^[A-Za-z]+$"
- * };
- * ```
- *
- * @example
- * ```typescript Number Schema
- * const numberSchema: JsonSchema = {
- *   type: "number",
- *   minimum: 0,
- *   maximum: 100,
- *   multipleOf: 0.5
- * };
- * ```
- *
- * @example
- * ```typescript Array Schema
- * const arraySchema: JsonSchema = {
- *   type: "array",
- *   items: {
- *     type: "string"
- *   },
- *   minItems: 1,
- *   maxItems: 10,
- *   uniqueItems: true
- * };
- * ```
- *
- * @example
- * ```typescript Object Schema
- * const objectSchema: JsonSchema = {
- *   type: "object",
- *   properties: {
- *     name: { type: "string" },
- *     age: { type: "number", minimum: 0 },
- *     email: { type: "string", pattern: "^[^@]+@[^@]+\\.[^@]+$" }
- *   },
- *   required: ["name", "email"]
- * };
- * ```
- */
 /**
  * Uses AI vision to determine if a webpage has finished loading by analyzing a screenshot.
  * Detects loading spinners, blank content, or incomplete page states.
@@ -465,9 +552,36 @@ export declare function isPageLoaded(input: {
   apiKey?: string;
 }): Promise<boolean>;
-export type JsonSchema =
-  | StringSchema
-  | NumberSchema
-  | BooleanSchema
-  | ArraySchema
-  | ObjectSchema;
+/**
+ * @interface
+ * @property {string} type - The type of the content item, which is always "text".
+ * @property {string} data - The text data.
+ */
+export interface TextContentItem {
+  type: "text";
+  data: string;
+}
+/**
+ * @interface
+ * @property {string} type - The type of the content item, which is always "image-buffer".
+ * @property {string} image_type - The image format (e.g., "png", "jpeg", "gif", "webp").
+ * @property {Buffer} data - The buffer containing the image data.
+ */
+export interface ImageBufferContentItem {
+  type: "image-buffer";
+  image_type: "png" | "jpeg" | "gif" | "webp";
+  data: Buffer;
+}
+/**
+ * @interface
+ * @property {string} type - The type of the content item, which is always "image-url".
+ * @property {string} image_type - The image format (e.g., "png", "jpeg", "gif", "webp").
+ * @property {string} data - The URL of the image.
+ */
+export interface ImageUrlContentItem {
+  type: "image-url";
+  image_type: "png" | "jpeg" | "gif" | "webp";
+  data: string;
+}

package/dist/ai/extractStructuredData.js CHANGED Viewed

@@ -17,6 +17,9 @@ var _Logger = require("../common/Logger");
 var _helpers = require("../helpers");
 var _xpathMapping = require("../common/xpathMapping");
 const extractStructuredData = async options => {
+  if ("content" in options && !("source" in options)) {
+    return await extractStructuredDataFromContent(options);
+  }
   const pageOrLocator = options.source;
   const isPageInput = (0, _locatorHelpers.isPage)(pageOrLocator);
   const {
@@ -83,15 +86,18 @@ const extractStructuredData = async options => {
         return cachedResult;
       }
     }
-    const result = await (0, _extractStructuredDataUsingAi.extractStructuredDataUsingAi)(pageObject, {
-      apiKey: validatedData.apiKey,
-      enableDomMatching: validatedData.enableDomMatching,
-      jsonSchema: validatedData.dataSchema,
-      model: validatedData.model,
-      content: simplifiedHtml,
-      prompt: validatedData.prompt,
-      images: [],
-      maxRetries: validatedData.maxRetries
+    const result = await (0, _extractStructuredDataUsingAi.extractStructuredDataUsingAi)({
+      page: pageObject,
+      options: {
+        apiKey: validatedData.apiKey,
+        enableDomMatching: validatedData.enableDomMatching,
+        jsonSchema: validatedData.dataSchema,
+        model: validatedData.model || "claude-3-5-haiku-latest",
+        content: simplifiedHtml,
+        prompt: validatedData.prompt,
+        images: [],
+        maxRetries: validatedData.maxRetries
+      }
     });
     if (result.isErr()) {
       throw new Error(result.error.context);
@@ -139,15 +145,21 @@ const extractStructuredData = async options => {
     if (images.isErr()) {
       throw new Error(images.error.context);
     }
-    const result = await (0, _extractStructuredDataUsingAi.extractStructuredDataUsingAi)(pageObject, {
-      apiKey: validatedData.apiKey,
-      enableDomMatching: validatedData.enableDomMatching,
-      jsonSchema: validatedData.dataSchema,
-      model: validatedData.model,
-      content: "Extract structured data from the following images.",
-      prompt: validatedData.prompt,
-      images: images.value,
-      maxRetries: validatedData.maxRetries
+    const result = await (0, _extractStructuredDataUsingAi.extractStructuredDataUsingAi)({
+      page: pageObject,
+      options: {
+        apiKey: validatedData.apiKey,
+        enableDomMatching: validatedData.enableDomMatching,
+        jsonSchema: validatedData.dataSchema,
+        model: validatedData.model || "claude-3-5-haiku-latest",
+        content: "Extract structured data from the following images.",
+        prompt: validatedData.prompt,
+        images: images.value.map(i => ({
+          data: i,
+          image_type: "png"
+        })),
+        maxRetries: validatedData.maxRetries
+      }
     });
     if (result.isErr()) {
       throw new Error(result.error.context);
@@ -200,15 +212,18 @@ const extractStructuredData = async options => {
         return cachedResult;
       }
     }
-    const result = await (0, _extractStructuredDataUsingAi.extractStructuredDataUsingAi)(pageObject, {
-      apiKey: validatedData.apiKey,
-      enableDomMatching: validatedData.enableDomMatching,
-      jsonSchema: validatedData.dataSchema,
-      model: validatedData.model,
-      content: markdown,
-      prompt: validatedData.prompt,
-      images: [],
-      maxRetries: validatedData.maxRetries
+    const result = await (0, _extractStructuredDataUsingAi.extractStructuredDataUsingAi)({
+      page: pageObject,
+      options: {
+        apiKey: validatedData.apiKey,
+        enableDomMatching: validatedData.enableDomMatching,
+        jsonSchema: validatedData.dataSchema,
+        model: validatedData.model || "claude-3-5-haiku-latest",
+        content: markdown,
+        prompt: validatedData.prompt,
+        images: [],
+        maxRetries: validatedData.maxRetries
+      }
     });
     if (result.isErr()) {
       throw new Error(result.error.context);
@@ -228,4 +243,78 @@ const extractStructuredData = async options => {
   }
   throw new Error(`Unsupported strategy type: ${validatedData.strategy}. Supported types are: HTML, IMAGE, and MARKDOWN.`);
 };
-exports.extractStructuredData = extractStructuredData;
+exports.extractStructuredData = extractStructuredData;
+const extractStructuredDataFromContent = async options => {
+  const contentValidationResult = _validators.contentValidationSchema.safeParse(options.content);
+  if (!contentValidationResult.success) {
+    const error = contentValidationResult.error;
+    const messages = (0, _formatZodError.formatZodError)(error);
+    throw new Error("extractStructuredDataFromContent content is invalid: \n" + messages.join("\n"));
+  }
+  const {
+    content: _,
+    ...rest
+  } = options;
+  const parsingResult = _validators.genericExtractDataInputSchema.safeParse(rest);
+  if (!parsingResult.success) {
+    const error = parsingResult.error;
+    const messages = (0, _formatZodError.formatZodError)(error);
+    throw new Error("extractStructuredDataFromContent input is invalid: \n" + messages.join("\n"));
+  }
+  const content = Array.isArray(options.content) ? options.content : [options.content];
+  const imagesFromBuffers = content.filter(c => c.type === "image-buffer").map(c => ({
+    image_type: c.image_type,
+    data: c.data
+  }));
+  const imagesFromUrls = content.filter(c => c.type === "image-url").map(c => ({
+    image_type: c.image_type,
+    data: c.data
+  })).map(async c => {
+    try {
+      const response = await fetch(c.data);
+      const buffer = Buffer.from(await response.arrayBuffer());
+      return {
+        image_type: c.image_type,
+        data: buffer
+      };
+    } catch (e) {
+      throw new Error(`fetching image:${c.data} from url Failed: ${e}`);
+    }
+  });
+  const images = [...(await Promise.all(imagesFromUrls)), ...imagesFromBuffers];
+  const texts = content.filter(c => c.type === "text").map(c => c.data);
+  let cacheKey = "";
+  if (options.enableCache != false) {
+    cacheKey = (0, _hashObject.hashObject)({
+      systemMessage: options.prompt,
+      images,
+      jsonSchema: options.dataSchema,
+      model: options.model,
+      text: texts
+    }, false);
+    const cachedResult = await _cache.cache.get(cacheKey);
+    if (cachedResult) {
+      _Logger.logger.info("Results for the extractor found in the cache, returning cached result");
+      return cachedResult;
+    }
+  }
+  const result = await (0, _extractStructuredDataUsingAi.extractStructuredDataUsingAi)({
+    options: {
+      prompt: options.prompt,
+      images,
+      jsonSchema: options.dataSchema,
+      content: texts.join("\n"),
+      enableDomMatching: false,
+      apiKey: options.apiKey,
+      model: options.model || "claude-3-5-haiku-latest",
+      maxRetries: options.maxRetries
+    }
+  });
+  if (result.isErr()) {
+    throw new Error(result.error.context);
+  }
+  if (options.enableCache != false) {
+    await _cache.cache.set(cacheKey, result.value.result);
+  }
+  return result.value.result;
+};

package/dist/ai/extractStructuredDataUsingAi.js CHANGED Viewed

@@ -18,7 +18,7 @@ var _prompt = require("./prompt");
 var _ai = require("ai");
 var _loadRuntime = require("../common/loadRuntime");
 function _interopRequireWildcard(e, t) { if ("function" == typeof WeakMap) var r = new WeakMap(), n = new WeakMap(); return (_interopRequireWildcard = function (e, t) { if (!t && e && e.__esModule) return e; var o, i, f = { __proto__: null, default: e }; if (null === e || "object" != typeof e && "function" != typeof e) return f; if (o = t ? n : r) { if (o.has(e)) return o.get(e); o.set(e, f); } for (const t in e) "default" !== t && {}.hasOwnProperty.call(e, t) && ((i = (o = Object.defineProperty) && Object.getOwnPropertyDescriptor(e, t)) && (i.get || i.set) ? o(f, t, i) : f[t] = e[t]); return f; })(e, t); }
-async function extractStructuredDataUsingAi(page, input) {
+async function extractStructuredDataUsingAi(input) {
   var _getExecutionContext, _getExecutionContext2, _getExecutionContext3;
   const {
     apiKey,
@@ -29,7 +29,7 @@ async function extractStructuredDataUsingAi(page, input) {
     prompt,
     images,
     maxRetries = 3
-  } = input;
+  } = input.options;
   let accumulatedTokens = 0;
   const getExecutionContext = await (0, _loadRuntime.loadRuntime)();
   const toolName = `extract_data`;
@@ -99,7 +99,7 @@ async function extractStructuredDataUsingAi(page, input) {
         currentRetry++;
         continue;
       }
-      if (!enableDomMatching) {
+      if (!enableDomMatching || !input.page) {
         _Logger.logger.info(`Extraction completed, total LLM ${isGateway ? "Cost In Cents" : "Tokens"}: ${accumulatedTokens}`);
         return (0, _neverthrow.ok)({
           result: extractedData,
@@ -123,7 +123,7 @@ async function extractStructuredDataUsingAi(page, input) {
         xpathMapping
       } = await (0, _matching.replaceWithBestMatches)({
         stringsToMatch,
-        pageObject: page
+        pageObject: input.page
       });
       const stringReplacements = {};
       Object.entries(replacements).forEach(([key, value]) => {