@intuned/browser-dev 2.2.3-unify-sdks.28 → 2.2.3-unify-sdks.29

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. package/dist/ai/export.d.ts +230 -116
  2. package/dist/ai/extractStructuredData.js +117 -28
  3. package/dist/ai/extractStructuredDataUsingAi.js +4 -4
  4. package/dist/ai/index.d.ts +230 -116
  5. package/dist/ai/prompt.js +2 -2
  6. package/dist/ai/tests/testExtractFromContent.spec.js +372 -0
  7. package/dist/ai/tests/testExtractStructuredData.spec.js +24 -23
  8. package/dist/ai/validators.js +37 -6
  9. package/dist/helpers/export.d.ts +78 -125
  10. package/dist/helpers/index.d.ts +78 -125
  11. package/dist/helpers/saveFileToS3.js +6 -2
  12. package/dist/helpers/tests/testDownloadFile.spec.js +9 -2
  13. package/dist/helpers/uploadFileToS3.js +11 -9
  14. package/dist/helpers/waitForDomSettled.js +3 -2
  15. package/dist/helpers/withNetworkIdleWait.js +1 -1
  16. package/generated-docs/ai/functions/extractStructuredData.mdx +145 -57
  17. package/generated-docs/ai/functions/isPageLoaded.mdx +2 -56
  18. package/generated-docs/ai/interfaces/ImageBufferContentItem.mdx +12 -0
  19. package/generated-docs/ai/interfaces/ImageUrlContentItem.mdx +12 -0
  20. package/generated-docs/ai/interfaces/TextContentItem.mdx +11 -0
  21. package/generated-docs/ai/type-aliases/JsonSchema.mdx +40 -0
  22. package/generated-docs/ai/type-aliases/SUPPORTED_MODELS.mdx +73 -2
  23. package/generated-docs/helpers/functions/resolveUrl.mdx +106 -24
  24. package/generated-docs/helpers/functions/saveFileToS3.mdx +25 -45
  25. package/generated-docs/helpers/functions/uploadFileToS3.mdx +20 -6
  26. package/generated-docs/helpers/functions/{withNetworkIdle.mdx → withNetworkIdleWait.mdx} +6 -6
  27. package/generated-docs/helpers/interfaces/S3Configs.mdx +4 -4
  28. package/generated-docs/optimized-extractors/functions/extractArrayFromLocator.mdx +121 -0
  29. package/generated-docs/optimized-extractors/functions/extractArrayFromPage.mdx +126 -0
  30. package/generated-docs/optimized-extractors/functions/extractObjectFromLocator.mdx +122 -0
  31. package/generated-docs/optimized-extractors/functions/extractObjectFromPage.mdx +122 -0
  32. package/generated-docs/optimized-extractors/interfaces/HtmlStrategy.mdx +39 -0
  33. package/generated-docs/optimized-extractors/interfaces/ImageStrategy.mdx +35 -0
  34. package/generated-docs/optimized-extractors/interfaces/SimpleArrayItemSchema.mdx +15 -0
  35. package/generated-docs/optimized-extractors/interfaces/SimpleArrayStringSchema.mdx +13 -0
  36. package/generated-docs/optimized-extractors/interfaces/SimpleObjectSchema.mdx +15 -0
  37. package/generated-docs/optimized-extractors/interfaces/SimpleObjectStringSchema.mdx +12 -0
  38. package/package.json +2 -1
  39. package/test-docs/functions/downloadFile.mdx +95 -0
  40. package/test-docs/functions/extractMarkdown.mdx +53 -0
  41. package/test-docs/functions/filterEmptyValues.mdx +48 -0
  42. package/test-docs/functions/goToUrl.mdx +97 -0
  43. package/test-docs/functions/processDate.mdx +52 -0
  44. package/test-docs/functions/resolveUrl.mdx +161 -0
  45. package/test-docs/functions/sanitizeHtml.mdx +113 -0
  46. package/test-docs/functions/saveFileToS3.mdx +124 -0
  47. package/test-docs/functions/scrollToLoadContent.mdx +87 -0
  48. package/test-docs/functions/uploadFileToS3.mdx +118 -0
  49. package/test-docs/functions/validateDataUsingSchema.mdx +66 -0
  50. package/test-docs/functions/waitForDomSettled.mdx +95 -0
  51. package/test-docs/functions/withNetworkIdleWait.mdx +93 -0
  52. package/test-docs/interfaces/Attachment.mdx +45 -0
  53. package/test-docs/interfaces/S3Configs.mdx +36 -0
  54. package/test-docs/interfaces/SanitizeHtmlOptions.mdx +22 -0
  55. package/test-docs/type-aliases/AttachmentType.mdx +12 -0
  56. package/test-docs/type-aliases/DataInput.mdx +11 -0
  57. package/test-docs/type-aliases/DataObject.mdx +11 -0
  58. package/test-docs/type-aliases/S3UploadableFile.mdx +10 -0
  59. package/test-docs/type-aliases/Trigger.mdx +13 -0
  60. package/dist/intunedServices/ApiGateway/test.spec.js +0 -1
  61. package/generated-docs/helpers/interfaces/S3UploadOptions.mdx +0 -40
@@ -151,10 +151,40 @@ export interface ObjectSchema extends BasicSchema {
151
151
  /** Minimum number of properties required */
152
152
  minProperties?: number;
153
153
  }
154
+ import { z } from "zod";
154
155
 
156
+ /**
157
+ * JsonSchema can be a Zod schema, a string schema, a number schema, a boolean schema, an array schema, or an object schema.
158
+ * @interface JsonSchema
159
+ * @extends BasicSchema
160
+ * @example Object Schema
161
+ * ```typescript
162
+ * const schema: JsonSchema = {
163
+ * type: "object",
164
+ * properties: {
165
+ * name: { type: "string" },
166
+ * age: { type: "number" }
167
+ * }
168
+ * };
169
+ * ```
170
+ * @example Zod Schema
171
+ * ```typescript
172
+ * const schema: JsonSchema = z.object({
173
+ * name: z.string(),
174
+ * age: z.number()
175
+ * });
176
+ * ```
177
+ */
178
+ export type JsonSchema =
179
+ | z.ZodSchema
180
+ | StringSchema
181
+ | NumberSchema
182
+ | BooleanSchema
183
+ | ArraySchema
184
+ | ObjectSchema;
155
185
  /**
156
186
  * Extract structured data from web pages using AI-powered content analysis.
157
- *
187
+ * @overload Extract From Page or Locator
158
188
  * This function provides intelligent data extraction from web pages using various strategies
159
189
  * including HTML parsing, image analysis, and Markdown conversion. It supports extraction
160
190
  * from entire pages or specific elements, with built-in caching and retry mechanisms.
@@ -176,7 +206,11 @@ export interface ObjectSchema extends BasicSchema {
176
206
  * ```typescript Extract Product Information from Entire Page
177
207
  * import { extractStructuredData } from '@intuned/browser/ai';
178
208
  *
179
- * const productSchema = {
209
+ * const product = await extractStructuredData({
210
+ * source: page,
211
+ * strategy: "HTML",
212
+ * model: "gpt-4o",
213
+ * dataSchema: {
180
214
  * type: "object",
181
215
  * properties: {
182
216
  * name: { type: "string" },
@@ -185,13 +219,7 @@ export interface ObjectSchema extends BasicSchema {
185
219
  * inStock: { type: "boolean" }
186
220
  * },
187
221
  * required: ["name", "price"]
188
- * };
189
- *
190
- * const product = await extractStructuredData({
191
- * source: page,
192
- * strategy: "HTML",
193
- * model: "gpt-4o",
194
- * dataSchema: productSchema,
222
+ * },
195
223
  * prompt: "Extract product details from this e-commerce page"
196
224
  * });
197
225
  *
@@ -202,7 +230,12 @@ export interface ObjectSchema extends BasicSchema {
202
230
  * ```typescript Extract Article Data from Specific Element
203
231
  * import { extractStructuredData } from '@intuned/browser/ai';
204
232
  *
205
- * const articleSchema = {
233
+ * const articleContainer = page.locator("article.main-content");
234
+ * const article = await extractStructuredData({
235
+ * source: articleContainer,
236
+ * strategy: "MARKDOWN",
237
+ * model: "claude-3",
238
+ * dataSchema: {
206
239
  * type: "object",
207
240
  * properties: {
208
241
  * title: { type: "string" },
@@ -212,64 +245,107 @@ export interface ObjectSchema extends BasicSchema {
212
245
  * tags: { type: "array", items: { type: "string" } }
213
246
  * },
214
247
  * required: ["title", "content"]
215
- * };
216
- *
217
- * const articleContainer = page.locator("article.main-content");
218
- * const article = await extractStructuredData({
219
- * source: articleContainer,
220
- * strategy: "MARKDOWN",
221
- * model: "claude-3",
222
- * dataSchema: articleSchema,
248
+ * },
223
249
  * maxRetries: 5
224
250
  * });
225
251
  *
226
252
  * console.log(`Article: ${article.title} by ${article.author}`);
227
253
  * ```
254
+ */
255
+ export declare function extractStructuredData(options: {
256
+ source: Page | Locator;
257
+ dataSchema: JsonSchema;
258
+ prompt?: string;
259
+ strategy?: "IMAGE" | "MARKDOWN" | "HTML";
260
+ model?: SUPPORTED_MODELS;
261
+ apiKey?: string;
262
+ enableDomMatching?: boolean;
263
+ enableCache?: boolean;
264
+ maxRetries?: number;
265
+ }): Promise<any>;
266
+
267
+ /**
268
+ * Extract structured data from content items (text, images) using AI-powered analysis.
269
+ * @overload Extract From Content
270
+ * This overload provides a simplified interface for data extraction from various content types
271
+ * without requiring a page source or extraction strategy. It accepts text content, image buffers,
272
+ * or image URLs and extracts structured data according to the provided schema.
228
273
  *
229
- * @example
230
- * ```typescript Extract Data from Screenshots using Image Strategy
274
+ * @param {Object} options - Configuration object containing extraction parameters
275
+ * @param {TextContentItem | ImageBufferContentItem | ImageUrlContentItem | Array<TextContentItem | ImageBufferContentItem | ImageUrlContentItem>} options.content - Content to extract data from - can be a single content item or array of content items
276
+ * @param {JsonSchema} options.dataSchema - [JsonSchema](../type-aliases/JsonSchema) defining the structure of the data to extract
277
+ * @param {SUPPORTED_MODELS} options.model - AI model to use for extraction (e.g., "gpt-4o", "claude-3-5-sonnet-latest"), see [SUPPORTED_MODELS](../type-aliases/SUPPORTED_MODELS) for all supported models
278
+ * @param {string} [options.prompt] - Optional prompt to guide the extraction process and provide more context
279
+ * @param {string} [options.apiKey] - Optional API key for AI extraction (if provided, will not be billed to your account)
280
+ * @param {boolean} [options.enableCache=true] - Whether to enable caching of the extracted data. Defaults to true.
281
+ * @param {number} [options.maxRetries=3] - Maximum number of retry attempts on failures. Failures can be validation errors, API errors, output errors, etc.
282
+ *
283
+ * @returns Promise resolving to the extracted structured data matching the provided schema
284
+ *
285
+ * @example
286
+ * ```typescript Extract Data from Text Content
231
287
  * import { extractStructuredData } from '@intuned/browser/ai';
232
288
  *
233
- * const chartSchema = {
234
- * type: "object",
235
- * properties: {
236
- * title: { type: "string" },
237
- * dataPoints: {
238
- * type: "array",
239
- * items: {
240
- * type: "object",
241
- * properties: {
242
- * label: { type: "string" },
243
- * value: { type: "number" }
244
- * }
245
- * }
246
- * }
247
- * }
289
+ * const textContent: TextContentItem = {
290
+ * type: "text",
291
+ * data: "John Doe, age 30, works as a Software Engineer at Tech Corp"
248
292
  * };
249
293
  *
250
- * const chartElement = page.locator("#data-visualization");
251
- * const chartData = await extractStructuredData({
252
- * source: chartElement,
253
- * strategy: "IMAGE",
294
+ * const person = await extractStructuredData({
295
+ * content: textContent,
254
296
  * model: "gpt-4o",
255
- * dataSchema: chartSchema,
256
- * prompt: "Extract the chart title and all data points with their values"
297
+ * dataSchema: {
298
+ * type: "object",
299
+ * properties: {
300
+ * name: { type: "string" },
301
+ * age: { type: "number" },
302
+ * occupation: { type: "string" },
303
+ * company: { type: "string" }
304
+ * },
305
+ * required: ["name"]
306
+ * },
307
+ * prompt: "Extract person information from the text"
257
308
  * });
258
309
  *
259
- * console.log(`Chart: ${chartData.title}`);
260
- * chartData.dataPoints.forEach(point => {
261
- * console.log(`${point.label}: ${point.value}`);
310
+ * console.log(`Found person: ${person.name}, ${person.age} years old`);
311
+ * ```
312
+ *
313
+ * @example
314
+ * ```typescript Extract Data from Multiple Content Items
315
+ * import { extractStructuredData } from '@intuned/browser/ai';
316
+ *
317
+ * const mixedContent = [
318
+ * { type: "text", data: "Product: iPhone 15" },
319
+ * { type: "image-url", image_type: "jpeg", data: "https://mintcdn.com/intuned-7/asXJUUPBWwDlStUB/logo/light.svg?fit=max&auto=format&n=asXJUUPBWwDlStUB&q=85&s=6525c0b299b3226464eba6afa9b7ebe6" }
320
+ * ];
321
+ *
322
+ *
323
+ * const product = await extractStructuredData({
324
+ * content: mixedContent,
325
+ * model: "claude-3-5-sonnet-latest",
326
+ * dataSchema: {
327
+ * type: "object",
328
+ * properties: {
329
+ * name: { type: "string" },
330
+ * price: { type: "string" },
331
+ * features: { type: "array", items: { type: "string" } }
332
+ * }
333
+ * },
334
+ * maxRetries: 1,
335
+ * enableCache: true
262
336
  * });
263
337
  * ```
264
338
  */
265
339
  export declare function extractStructuredData(options: {
266
- source: Page | Locator;
340
+ content:
341
+ | (TextContentItem | ImageBufferContentItem | ImageUrlContentItem)[]
342
+ | TextContentItem
343
+ | ImageBufferContentItem
344
+ | ImageUrlContentItem;
267
345
  dataSchema: JsonSchema;
268
346
  prompt?: string;
269
- strategy?: "IMAGE" | "MARKDOWN" | "HTML";
270
- model?: SUPPORTED_MODELS;
347
+ model: SUPPORTED_MODELS;
271
348
  apiKey?: string;
272
- enableDomMatching?: boolean;
273
349
  enableCache?: boolean;
274
350
  maxRetries?: number;
275
351
  }): Promise<any>;
@@ -348,72 +424,83 @@ type SUPPORTED_OPENAI_MODELS =
348
424
  | "o4-mini-deep-research-2025-06-26";
349
425
  /**
350
426
  * This type defines the supported AI models for data extraction.
351
- * It includes models from OpenAI, Anthropic, and Google Gemini.
352
- * The models are used in the extraction strategies to process and analyze the content of web pages or elements.
427
+ * It includes models from OpenAI and Anthropic.
428
+ * **Supported OPENAI Models**
429
+ * "gpt-3.5-turbo"
430
+ * "gpt-3.5-turbo-0125"
431
+ * "gpt-3.5-turbo-0301"
432
+ * "gpt-3.5-turbo-0613"
433
+ * "gpt-3.5-turbo-1106"
434
+ * "gpt-3.5-turbo-16k"
435
+ * "gpt-3.5-turbo-16k-0613"
436
+ * "gpt-3.5-turbo-instruct"
437
+ * "gpt-3.5-turbo-instruct-0914"
438
+ * "gpt-4"
439
+ * "gpt-4-0314"
440
+ * "gpt-4-0613"
441
+ * "gpt-4-32k"
442
+ * "gpt-4-32k-0314"
443
+ * "gpt-4-32k-0613"
444
+ * "gpt-4-turbo"
445
+ * "gpt-4-turbo-2024-04-09"
446
+ * "gpt-4.1"
447
+ * "gpt-4.1-2025-04-14"
448
+ * "gpt-4.1-mini"
449
+ * "gpt-4.1-mini-2025-04-14"
450
+ * "gpt-4.1-nano"
451
+ * "gpt-4.1-nano-2025-04-14"
452
+ * "gpt-4o"
453
+ * "gpt-4o-2024-05-13"
454
+ * "gpt-4o-2024-08-06"
455
+ * "gpt-4o-2024-11-20"
456
+ * "gpt-4o-mini"
457
+ * "gpt-4o-mini-2024-07-18"
458
+ * "gpt-5"
459
+ * "gpt-5-2025-08-07"
460
+ * "gpt-5-chat"
461
+ * "gpt-5-chat-latest"
462
+ * "gpt-5-mini"
463
+ * "gpt-5-mini-2025-08-07"
464
+ * "gpt-5-nano"
465
+ * "gpt-5-nano-2025-08-07"
466
+ * "o1"
467
+ * "o1-2024-12-17"
468
+ * "o1-mini"
469
+ * "o1-mini-2024-09-12"
470
+ * "o1-pro"
471
+ * "o1-pro-2025-03-19"
472
+ * "o3"
473
+ * "o3-2025-04-16"
474
+ * "o3-deep-research"
475
+ * "o3-deep-research-2025-06-26"
476
+ * "o3-mini"
477
+ * "o3-mini-2025-01-31"
478
+ * "o3-pro"
479
+ * "o3-pro-2025-06-10"
480
+ * "o4-mini"
481
+ * "o4-mini-2025-04-16"
482
+ * "o4-mini-deep-research"
483
+ * "o4-mini-deep-research-2025-06-26"
484
+ *
485
+ * **Supported Anthropic Models**
486
+ * "claude-3-5-haiku-20241022"
487
+ * "claude-3-5-haiku-latest"
488
+ * "claude-3-5-sonnet-20240620"
489
+ * "claude-3-5-sonnet-20241022"
490
+ * "claude-3-5-sonnet-latest"
491
+ * "claude-3-7-sonnet-20250219"
492
+ * "claude-3-7-sonnet-latest"
493
+ * "claude-3-haiku-20240307"
494
+ * "claude-4-opus-20250514"
495
+ * "claude-4-sonnet-20250514"
496
+ * "claude-opus-4-1"
497
+ * "claude-opus-4-1-20250805"
498
+ * "claude-opus-4-20250514"
499
+ * "claude-sonnet-4-20250514"
353
500
  * @type SUPPORTED_MODELS
354
501
  */
355
502
  type SUPPORTED_MODELS = SUPPORTED_CLAUDE_MODELS | SUPPORTED_OPENAI_MODELS;
356
503
 
357
- /**
358
- * Represents a JSON Schema definition for validating data structures.
359
- * Supports various schema types including string, number, boolean, array, and object schemas
360
- * with their respective validation rules and constraints.
361
- *
362
- * This type is a union of different schema types:
363
- * - StringSchema: For string validation with length and pattern constraints
364
- * - NumberSchema: For number/integer validation with range constraints
365
- * - BooleanSchema: For boolean values
366
- * - ArraySchema: For array validation with item constraints
367
- * - ObjectSchema: For object validation with property constraints
368
- *
369
- * @type JsonSchema
370
- * @example
371
- * ```typescript String Schema
372
- * const stringSchema: JsonSchema = {
373
- * type: "string",
374
- * minLength: 3,
375
- * maxLength: 50,
376
- * pattern: "^[A-Za-z]+$"
377
- * };
378
- * ```
379
- *
380
- * @example
381
- * ```typescript Number Schema
382
- * const numberSchema: JsonSchema = {
383
- * type: "number",
384
- * minimum: 0,
385
- * maximum: 100,
386
- * multipleOf: 0.5
387
- * };
388
- * ```
389
- *
390
- * @example
391
- * ```typescript Array Schema
392
- * const arraySchema: JsonSchema = {
393
- * type: "array",
394
- * items: {
395
- * type: "string"
396
- * },
397
- * minItems: 1,
398
- * maxItems: 10,
399
- * uniqueItems: true
400
- * };
401
- * ```
402
- *
403
- * @example
404
- * ```typescript Object Schema
405
- * const objectSchema: JsonSchema = {
406
- * type: "object",
407
- * properties: {
408
- * name: { type: "string" },
409
- * age: { type: "number", minimum: 0 },
410
- * email: { type: "string", pattern: "^[^@]+@[^@]+\\.[^@]+$" }
411
- * },
412
- * required: ["name", "email"]
413
- * };
414
- * ```
415
- */
416
-
417
504
  /**
418
505
  * Uses AI vision to determine if a webpage has finished loading by analyzing a screenshot.
419
506
  * Detects loading spinners, blank content, or incomplete page states.
@@ -465,9 +552,36 @@ export declare function isPageLoaded(input: {
465
552
  apiKey?: string;
466
553
  }): Promise<boolean>;
467
554
 
468
- export type JsonSchema =
469
- | StringSchema
470
- | NumberSchema
471
- | BooleanSchema
472
- | ArraySchema
473
- | ObjectSchema;
555
+ /**
556
+ * @interface
557
+ * @property {string} type - The type of the content item, which is always "text".
558
+ * @property {string} data - The text data.
559
+ */
560
+ export interface TextContentItem {
561
+ type: "text";
562
+ data: string;
563
+ }
564
+
565
+ /**
566
+ * @interface
567
+ * @property {string} type - The type of the content item, which is always "image-buffer".
568
+ * @property {string} image_type - The image format (e.g., "png", "jpeg", "gif", "webp").
569
+ * @property {Buffer} data - The buffer containing the image data.
570
+ */
571
+ export interface ImageBufferContentItem {
572
+ type: "image-buffer";
573
+ image_type: "png" | "jpeg" | "gif" | "webp";
574
+ data: Buffer;
575
+ }
576
+
577
+ /**
578
+ * @interface
579
+ * @property {string} type - The type of the content item, which is always "image-url".
580
+ * @property {string} image_type - The image format (e.g., "png", "jpeg", "gif", "webp").
581
+ * @property {string} data - The URL of the image.
582
+ */
583
+ export interface ImageUrlContentItem {
584
+ type: "image-url";
585
+ image_type: "png" | "jpeg" | "gif" | "webp";
586
+ data: string;
587
+ }
@@ -17,6 +17,9 @@ var _Logger = require("../common/Logger");
17
17
  var _helpers = require("../helpers");
18
18
  var _xpathMapping = require("../common/xpathMapping");
19
19
  const extractStructuredData = async options => {
20
+ if ("content" in options && !("source" in options)) {
21
+ return await extractStructuredDataFromContent(options);
22
+ }
20
23
  const pageOrLocator = options.source;
21
24
  const isPageInput = (0, _locatorHelpers.isPage)(pageOrLocator);
22
25
  const {
@@ -83,15 +86,18 @@ const extractStructuredData = async options => {
83
86
  return cachedResult;
84
87
  }
85
88
  }
86
- const result = await (0, _extractStructuredDataUsingAi.extractStructuredDataUsingAi)(pageObject, {
87
- apiKey: validatedData.apiKey,
88
- enableDomMatching: validatedData.enableDomMatching,
89
- jsonSchema: validatedData.dataSchema,
90
- model: validatedData.model,
91
- content: simplifiedHtml,
92
- prompt: validatedData.prompt,
93
- images: [],
94
- maxRetries: validatedData.maxRetries
89
+ const result = await (0, _extractStructuredDataUsingAi.extractStructuredDataUsingAi)({
90
+ page: pageObject,
91
+ options: {
92
+ apiKey: validatedData.apiKey,
93
+ enableDomMatching: validatedData.enableDomMatching,
94
+ jsonSchema: validatedData.dataSchema,
95
+ model: validatedData.model || "claude-3-5-haiku-latest",
96
+ content: simplifiedHtml,
97
+ prompt: validatedData.prompt,
98
+ images: [],
99
+ maxRetries: validatedData.maxRetries
100
+ }
95
101
  });
96
102
  if (result.isErr()) {
97
103
  throw new Error(result.error.context);
@@ -139,15 +145,21 @@ const extractStructuredData = async options => {
139
145
  if (images.isErr()) {
140
146
  throw new Error(images.error.context);
141
147
  }
142
- const result = await (0, _extractStructuredDataUsingAi.extractStructuredDataUsingAi)(pageObject, {
143
- apiKey: validatedData.apiKey,
144
- enableDomMatching: validatedData.enableDomMatching,
145
- jsonSchema: validatedData.dataSchema,
146
- model: validatedData.model,
147
- content: "Extract structured data from the following images.",
148
- prompt: validatedData.prompt,
149
- images: images.value,
150
- maxRetries: validatedData.maxRetries
148
+ const result = await (0, _extractStructuredDataUsingAi.extractStructuredDataUsingAi)({
149
+ page: pageObject,
150
+ options: {
151
+ apiKey: validatedData.apiKey,
152
+ enableDomMatching: validatedData.enableDomMatching,
153
+ jsonSchema: validatedData.dataSchema,
154
+ model: validatedData.model || "claude-3-5-haiku-latest",
155
+ content: "Extract structured data from the following images.",
156
+ prompt: validatedData.prompt,
157
+ images: images.value.map(i => ({
158
+ data: i,
159
+ image_type: "png"
160
+ })),
161
+ maxRetries: validatedData.maxRetries
162
+ }
151
163
  });
152
164
  if (result.isErr()) {
153
165
  throw new Error(result.error.context);
@@ -200,15 +212,18 @@ const extractStructuredData = async options => {
200
212
  return cachedResult;
201
213
  }
202
214
  }
203
- const result = await (0, _extractStructuredDataUsingAi.extractStructuredDataUsingAi)(pageObject, {
204
- apiKey: validatedData.apiKey,
205
- enableDomMatching: validatedData.enableDomMatching,
206
- jsonSchema: validatedData.dataSchema,
207
- model: validatedData.model,
208
- content: markdown,
209
- prompt: validatedData.prompt,
210
- images: [],
211
- maxRetries: validatedData.maxRetries
215
+ const result = await (0, _extractStructuredDataUsingAi.extractStructuredDataUsingAi)({
216
+ page: pageObject,
217
+ options: {
218
+ apiKey: validatedData.apiKey,
219
+ enableDomMatching: validatedData.enableDomMatching,
220
+ jsonSchema: validatedData.dataSchema,
221
+ model: validatedData.model || "claude-3-5-haiku-latest",
222
+ content: markdown,
223
+ prompt: validatedData.prompt,
224
+ images: [],
225
+ maxRetries: validatedData.maxRetries
226
+ }
212
227
  });
213
228
  if (result.isErr()) {
214
229
  throw new Error(result.error.context);
@@ -228,4 +243,78 @@ const extractStructuredData = async options => {
228
243
  }
229
244
  throw new Error(`Unsupported strategy type: ${validatedData.strategy}. Supported types are: HTML, IMAGE, and MARKDOWN.`);
230
245
  };
231
- exports.extractStructuredData = extractStructuredData;
246
+ exports.extractStructuredData = extractStructuredData;
247
+ const extractStructuredDataFromContent = async options => {
248
+ const contentValidationResult = _validators.contentValidationSchema.safeParse(options.content);
249
+ if (!contentValidationResult.success) {
250
+ const error = contentValidationResult.error;
251
+ const messages = (0, _formatZodError.formatZodError)(error);
252
+ throw new Error("extractStructuredDataFromContent content is invalid: \n" + messages.join("\n"));
253
+ }
254
+ const {
255
+ content: _,
256
+ ...rest
257
+ } = options;
258
+ const parsingResult = _validators.genericExtractDataInputSchema.safeParse(rest);
259
+ if (!parsingResult.success) {
260
+ const error = parsingResult.error;
261
+ const messages = (0, _formatZodError.formatZodError)(error);
262
+ throw new Error("extractStructuredDataFromContent input is invalid: \n" + messages.join("\n"));
263
+ }
264
+ const content = Array.isArray(options.content) ? options.content : [options.content];
265
+ const imagesFromBuffers = content.filter(c => c.type === "image-buffer").map(c => ({
266
+ image_type: c.image_type,
267
+ data: c.data
268
+ }));
269
+ const imagesFromUrls = content.filter(c => c.type === "image-url").map(c => ({
270
+ image_type: c.image_type,
271
+ data: c.data
272
+ })).map(async c => {
273
+ try {
274
+ const response = await fetch(c.data);
275
+ const buffer = Buffer.from(await response.arrayBuffer());
276
+ return {
277
+ image_type: c.image_type,
278
+ data: buffer
279
+ };
280
+ } catch (e) {
281
+ throw new Error(`fetching image:${c.data} from url Failed: ${e}`);
282
+ }
283
+ });
284
+ const images = [...(await Promise.all(imagesFromUrls)), ...imagesFromBuffers];
285
+ const texts = content.filter(c => c.type === "text").map(c => c.data);
286
+ let cacheKey = "";
287
+ if (options.enableCache != false) {
288
+ cacheKey = (0, _hashObject.hashObject)({
289
+ systemMessage: options.prompt,
290
+ images,
291
+ jsonSchema: options.dataSchema,
292
+ model: options.model,
293
+ text: texts
294
+ }, false);
295
+ const cachedResult = await _cache.cache.get(cacheKey);
296
+ if (cachedResult) {
297
+ _Logger.logger.info("Results for the extractor found in the cache, returning cached result");
298
+ return cachedResult;
299
+ }
300
+ }
301
+ const result = await (0, _extractStructuredDataUsingAi.extractStructuredDataUsingAi)({
302
+ options: {
303
+ prompt: options.prompt,
304
+ images,
305
+ jsonSchema: options.dataSchema,
306
+ content: texts.join("\n"),
307
+ enableDomMatching: false,
308
+ apiKey: options.apiKey,
309
+ model: options.model || "claude-3-5-haiku-latest",
310
+ maxRetries: options.maxRetries
311
+ }
312
+ });
313
+ if (result.isErr()) {
314
+ throw new Error(result.error.context);
315
+ }
316
+ if (options.enableCache != false) {
317
+ await _cache.cache.set(cacheKey, result.value.result);
318
+ }
319
+ return result.value.result;
320
+ };
@@ -18,7 +18,7 @@ var _prompt = require("./prompt");
18
18
  var _ai = require("ai");
19
19
  var _loadRuntime = require("../common/loadRuntime");
20
20
  function _interopRequireWildcard(e, t) { if ("function" == typeof WeakMap) var r = new WeakMap(), n = new WeakMap(); return (_interopRequireWildcard = function (e, t) { if (!t && e && e.__esModule) return e; var o, i, f = { __proto__: null, default: e }; if (null === e || "object" != typeof e && "function" != typeof e) return f; if (o = t ? n : r) { if (o.has(e)) return o.get(e); o.set(e, f); } for (const t in e) "default" !== t && {}.hasOwnProperty.call(e, t) && ((i = (o = Object.defineProperty) && Object.getOwnPropertyDescriptor(e, t)) && (i.get || i.set) ? o(f, t, i) : f[t] = e[t]); return f; })(e, t); }
21
- async function extractStructuredDataUsingAi(page, input) {
21
+ async function extractStructuredDataUsingAi(input) {
22
22
  var _getExecutionContext, _getExecutionContext2, _getExecutionContext3;
23
23
  const {
24
24
  apiKey,
@@ -29,7 +29,7 @@ async function extractStructuredDataUsingAi(page, input) {
29
29
  prompt,
30
30
  images,
31
31
  maxRetries = 3
32
- } = input;
32
+ } = input.options;
33
33
  let accumulatedTokens = 0;
34
34
  const getExecutionContext = await (0, _loadRuntime.loadRuntime)();
35
35
  const toolName = `extract_data`;
@@ -99,7 +99,7 @@ async function extractStructuredDataUsingAi(page, input) {
99
99
  currentRetry++;
100
100
  continue;
101
101
  }
102
- if (!enableDomMatching) {
102
+ if (!enableDomMatching || !input.page) {
103
103
  _Logger.logger.info(`Extraction completed, total LLM ${isGateway ? "Cost In Cents" : "Tokens"}: ${accumulatedTokens}`);
104
104
  return (0, _neverthrow.ok)({
105
105
  result: extractedData,
@@ -123,7 +123,7 @@ async function extractStructuredDataUsingAi(page, input) {
123
123
  xpathMapping
124
124
  } = await (0, _matching.replaceWithBestMatches)({
125
125
  stringsToMatch,
126
- pageObject: page
126
+ pageObject: input.page
127
127
  });
128
128
  const stringReplacements = {};
129
129
  Object.entries(replacements).forEach(([key, value]) => {