@intuned/browser-dev 2.2.3-unify-sdks.28 → 2.2.3-unify-sdks.29
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/ai/export.d.ts +230 -116
- package/dist/ai/extractStructuredData.js +117 -28
- package/dist/ai/extractStructuredDataUsingAi.js +4 -4
- package/dist/ai/index.d.ts +230 -116
- package/dist/ai/prompt.js +2 -2
- package/dist/ai/tests/testExtractFromContent.spec.js +372 -0
- package/dist/ai/tests/testExtractStructuredData.spec.js +24 -23
- package/dist/ai/validators.js +37 -6
- package/dist/helpers/export.d.ts +78 -125
- package/dist/helpers/index.d.ts +78 -125
- package/dist/helpers/saveFileToS3.js +6 -2
- package/dist/helpers/tests/testDownloadFile.spec.js +9 -2
- package/dist/helpers/uploadFileToS3.js +11 -9
- package/dist/helpers/waitForDomSettled.js +3 -2
- package/dist/helpers/withNetworkIdleWait.js +1 -1
- package/generated-docs/ai/functions/extractStructuredData.mdx +145 -57
- package/generated-docs/ai/functions/isPageLoaded.mdx +2 -56
- package/generated-docs/ai/interfaces/ImageBufferContentItem.mdx +12 -0
- package/generated-docs/ai/interfaces/ImageUrlContentItem.mdx +12 -0
- package/generated-docs/ai/interfaces/TextContentItem.mdx +11 -0
- package/generated-docs/ai/type-aliases/JsonSchema.mdx +40 -0
- package/generated-docs/ai/type-aliases/SUPPORTED_MODELS.mdx +73 -2
- package/generated-docs/helpers/functions/resolveUrl.mdx +106 -24
- package/generated-docs/helpers/functions/saveFileToS3.mdx +25 -45
- package/generated-docs/helpers/functions/uploadFileToS3.mdx +20 -6
- package/generated-docs/helpers/functions/{withNetworkIdle.mdx → withNetworkIdleWait.mdx} +6 -6
- package/generated-docs/helpers/interfaces/S3Configs.mdx +4 -4
- package/generated-docs/optimized-extractors/functions/extractArrayFromLocator.mdx +121 -0
- package/generated-docs/optimized-extractors/functions/extractArrayFromPage.mdx +126 -0
- package/generated-docs/optimized-extractors/functions/extractObjectFromLocator.mdx +122 -0
- package/generated-docs/optimized-extractors/functions/extractObjectFromPage.mdx +122 -0
- package/generated-docs/optimized-extractors/interfaces/HtmlStrategy.mdx +39 -0
- package/generated-docs/optimized-extractors/interfaces/ImageStrategy.mdx +35 -0
- package/generated-docs/optimized-extractors/interfaces/SimpleArrayItemSchema.mdx +15 -0
- package/generated-docs/optimized-extractors/interfaces/SimpleArrayStringSchema.mdx +13 -0
- package/generated-docs/optimized-extractors/interfaces/SimpleObjectSchema.mdx +15 -0
- package/generated-docs/optimized-extractors/interfaces/SimpleObjectStringSchema.mdx +12 -0
- package/package.json +2 -1
- package/test-docs/functions/downloadFile.mdx +95 -0
- package/test-docs/functions/extractMarkdown.mdx +53 -0
- package/test-docs/functions/filterEmptyValues.mdx +48 -0
- package/test-docs/functions/goToUrl.mdx +97 -0
- package/test-docs/functions/processDate.mdx +52 -0
- package/test-docs/functions/resolveUrl.mdx +161 -0
- package/test-docs/functions/sanitizeHtml.mdx +113 -0
- package/test-docs/functions/saveFileToS3.mdx +124 -0
- package/test-docs/functions/scrollToLoadContent.mdx +87 -0
- package/test-docs/functions/uploadFileToS3.mdx +118 -0
- package/test-docs/functions/validateDataUsingSchema.mdx +66 -0
- package/test-docs/functions/waitForDomSettled.mdx +95 -0
- package/test-docs/functions/withNetworkIdleWait.mdx +93 -0
- package/test-docs/interfaces/Attachment.mdx +45 -0
- package/test-docs/interfaces/S3Configs.mdx +36 -0
- package/test-docs/interfaces/SanitizeHtmlOptions.mdx +22 -0
- package/test-docs/type-aliases/AttachmentType.mdx +12 -0
- package/test-docs/type-aliases/DataInput.mdx +11 -0
- package/test-docs/type-aliases/DataObject.mdx +11 -0
- package/test-docs/type-aliases/S3UploadableFile.mdx +10 -0
- package/test-docs/type-aliases/Trigger.mdx +13 -0
- package/dist/intunedServices/ApiGateway/test.spec.js +0 -1
- package/generated-docs/helpers/interfaces/S3UploadOptions.mdx +0 -40
package/dist/ai/export.d.ts
CHANGED
|
@@ -151,10 +151,40 @@ export interface ObjectSchema extends BasicSchema {
|
|
|
151
151
|
/** Minimum number of properties required */
|
|
152
152
|
minProperties?: number;
|
|
153
153
|
}
|
|
154
|
+
import { z } from "zod";
|
|
154
155
|
|
|
156
|
+
/**
|
|
157
|
+
* JsonSchema can be a Zod schema, a string schema, a number schema, a boolean schema, an array schema, or an object schema.
|
|
158
|
+
* @interface JsonSchema
|
|
159
|
+
* @extends BasicSchema
|
|
160
|
+
* @example Object Schema
|
|
161
|
+
* ```typescript
|
|
162
|
+
* const schema: JsonSchema = {
|
|
163
|
+
* type: "object",
|
|
164
|
+
* properties: {
|
|
165
|
+
* name: { type: "string" },
|
|
166
|
+
* age: { type: "number" }
|
|
167
|
+
* }
|
|
168
|
+
* };
|
|
169
|
+
* ```
|
|
170
|
+
* @example Zod Schema
|
|
171
|
+
* ```typescript
|
|
172
|
+
* const schema: JsonSchema = z.object({
|
|
173
|
+
* name: z.string(),
|
|
174
|
+
* age: z.number()
|
|
175
|
+
* });
|
|
176
|
+
* ```
|
|
177
|
+
*/
|
|
178
|
+
export type JsonSchema =
|
|
179
|
+
| z.ZodSchema
|
|
180
|
+
| StringSchema
|
|
181
|
+
| NumberSchema
|
|
182
|
+
| BooleanSchema
|
|
183
|
+
| ArraySchema
|
|
184
|
+
| ObjectSchema;
|
|
155
185
|
/**
|
|
156
186
|
* Extract structured data from web pages using AI-powered content analysis.
|
|
157
|
-
*
|
|
187
|
+
* @overload Extract From Page or Locator
|
|
158
188
|
* This function provides intelligent data extraction from web pages using various strategies
|
|
159
189
|
* including HTML parsing, image analysis, and Markdown conversion. It supports extraction
|
|
160
190
|
* from entire pages or specific elements, with built-in caching and retry mechanisms.
|
|
@@ -176,7 +206,11 @@ export interface ObjectSchema extends BasicSchema {
|
|
|
176
206
|
* ```typescript Extract Product Information from Entire Page
|
|
177
207
|
* import { extractStructuredData } from '@intuned/browser/ai';
|
|
178
208
|
*
|
|
179
|
-
* const
|
|
209
|
+
* const product = await extractStructuredData({
|
|
210
|
+
* source: page,
|
|
211
|
+
* strategy: "HTML",
|
|
212
|
+
* model: "gpt-4o",
|
|
213
|
+
* dataSchema: {
|
|
180
214
|
* type: "object",
|
|
181
215
|
* properties: {
|
|
182
216
|
* name: { type: "string" },
|
|
@@ -185,13 +219,7 @@ export interface ObjectSchema extends BasicSchema {
|
|
|
185
219
|
* inStock: { type: "boolean" }
|
|
186
220
|
* },
|
|
187
221
|
* required: ["name", "price"]
|
|
188
|
-
* }
|
|
189
|
-
*
|
|
190
|
-
* const product = await extractStructuredData({
|
|
191
|
-
* source: page,
|
|
192
|
-
* strategy: "HTML",
|
|
193
|
-
* model: "gpt-4o",
|
|
194
|
-
* dataSchema: productSchema,
|
|
222
|
+
* },
|
|
195
223
|
* prompt: "Extract product details from this e-commerce page"
|
|
196
224
|
* });
|
|
197
225
|
*
|
|
@@ -202,7 +230,12 @@ export interface ObjectSchema extends BasicSchema {
|
|
|
202
230
|
* ```typescript Extract Article Data from Specific Element
|
|
203
231
|
* import { extractStructuredData } from '@intuned/browser/ai';
|
|
204
232
|
*
|
|
205
|
-
* const
|
|
233
|
+
* const articleContainer = page.locator("article.main-content");
|
|
234
|
+
* const article = await extractStructuredData({
|
|
235
|
+
* source: articleContainer,
|
|
236
|
+
* strategy: "MARKDOWN",
|
|
237
|
+
* model: "claude-3-5-sonnet-latest",
|
|
238
|
+
* dataSchema: {
|
|
206
239
|
* type: "object",
|
|
207
240
|
* properties: {
|
|
208
241
|
* title: { type: "string" },
|
|
@@ -212,64 +245,107 @@ export interface ObjectSchema extends BasicSchema {
|
|
|
212
245
|
* tags: { type: "array", items: { type: "string" } }
|
|
213
246
|
* },
|
|
214
247
|
* required: ["title", "content"]
|
|
215
|
-
* }
|
|
216
|
-
*
|
|
217
|
-
* const articleContainer = page.locator("article.main-content");
|
|
218
|
-
* const article = await extractStructuredData({
|
|
219
|
-
* source: articleContainer,
|
|
220
|
-
* strategy: "MARKDOWN",
|
|
221
|
-
* model: "claude-3",
|
|
222
|
-
* dataSchema: articleSchema,
|
|
248
|
+
* },
|
|
223
249
|
* maxRetries: 5
|
|
224
250
|
* });
|
|
225
251
|
*
|
|
226
252
|
* console.log(`Article: ${article.title} by ${article.author}`);
|
|
227
253
|
* ```
|
|
254
|
+
*/
|
|
255
|
+
export declare function extractStructuredData(options: {
|
|
256
|
+
source: Page | Locator;
|
|
257
|
+
dataSchema: JsonSchema;
|
|
258
|
+
prompt?: string;
|
|
259
|
+
strategy?: "IMAGE" | "MARKDOWN" | "HTML";
|
|
260
|
+
model?: SUPPORTED_MODELS;
|
|
261
|
+
apiKey?: string;
|
|
262
|
+
enableDomMatching?: boolean;
|
|
263
|
+
enableCache?: boolean;
|
|
264
|
+
maxRetries?: number;
|
|
265
|
+
}): Promise<any>;
|
|
266
|
+
|
|
267
|
+
/**
|
|
268
|
+
* Extract structured data from content items (text, images) using AI-powered analysis.
|
|
269
|
+
* @overload Extract From Content
|
|
270
|
+
* This overload provides a simplified interface for data extraction from various content types
|
|
271
|
+
* without requiring a page source or extraction strategy. It accepts text content, image buffers,
|
|
272
|
+
* or image URLs and extracts structured data according to the provided schema.
|
|
228
273
|
*
|
|
229
|
-
* @
|
|
230
|
-
*
|
|
274
|
+
* @param {Object} options - Configuration object containing extraction parameters
|
|
275
|
+
* @param {TextContentItem | ImageBufferContentItem | ImageUrlContentItem | Array<TextContentItem | ImageBufferContentItem | ImageUrlContentItem>} options.content - Content to extract data from - can be a single content item or array of content items
|
|
276
|
+
* @param {JsonSchema} options.dataSchema - [JsonSchema](../type-aliases/JsonSchema) defining the structure of the data to extract
|
|
277
|
+
* @param {SUPPORTED_MODELS} options.model - AI model to use for extraction (e.g., "gpt-4o", "claude-3-5-haiku-latest"), see [SUPPORTED_MODELS](../type-aliases/SUPPORTED_MODELS) for all supported models
|
|
278
|
+
* @param {string} [options.prompt] - Optional prompt to guide the extraction process and provide more context
|
|
279
|
+
* @param {string} [options.apiKey] - Optional API key for AI extraction (if provided, will not be billed to your account)
|
|
280
|
+
* @param {boolean} [options.enableCache=true] - Whether to enable caching of the extracted data. Defaults to true.
|
|
281
|
+
* @param {number} [options.maxRetries=3] - Maximum number of retry attempts on failures. Failures can be validation errors, API errors, output errors, etc.
|
|
282
|
+
*
|
|
283
|
+
* @returns Promise resolving to the extracted structured data matching the provided schema
|
|
284
|
+
*
|
|
285
|
+
* @example
|
|
286
|
+
* ```typescript Extract Data from Text Content
|
|
231
287
|
* import { extractStructuredData } from '@intuned/browser/ai';
|
|
232
288
|
*
|
|
233
|
-
* const
|
|
234
|
-
* type: "
|
|
235
|
-
*
|
|
236
|
-
* title: { type: "string" },
|
|
237
|
-
* dataPoints: {
|
|
238
|
-
* type: "array",
|
|
239
|
-
* items: {
|
|
240
|
-
* type: "object",
|
|
241
|
-
* properties: {
|
|
242
|
-
* label: { type: "string" },
|
|
243
|
-
* value: { type: "number" }
|
|
244
|
-
* }
|
|
245
|
-
* }
|
|
246
|
-
* }
|
|
247
|
-
* }
|
|
289
|
+
* const textContent: TextContentItem = {
|
|
290
|
+
* type: "text",
|
|
291
|
+
* data: "John Doe, age 30, works as a Software Engineer at Tech Corp"
|
|
248
292
|
* };
|
|
249
293
|
*
|
|
250
|
-
* const
|
|
251
|
-
*
|
|
252
|
-
* source: chartElement,
|
|
253
|
-
* strategy: "IMAGE",
|
|
294
|
+
* const person = await extractStructuredData({
|
|
295
|
+
* content: textContent,
|
|
254
296
|
* model: "gpt-4o",
|
|
255
|
-
* dataSchema:
|
|
256
|
-
*
|
|
297
|
+
* dataSchema: {
|
|
298
|
+
* type: "object",
|
|
299
|
+
* properties: {
|
|
300
|
+
* name: { type: "string" },
|
|
301
|
+
* age: { type: "number" },
|
|
302
|
+
* occupation: { type: "string" },
|
|
303
|
+
* company: { type: "string" }
|
|
304
|
+
* },
|
|
305
|
+
* required: ["name"]
|
|
306
|
+
* },
|
|
307
|
+
* prompt: "Extract person information from the text"
|
|
257
308
|
* });
|
|
258
309
|
*
|
|
259
|
-
* console.log(`
|
|
260
|
-
*
|
|
261
|
-
*
|
|
310
|
+
* console.log(`Found person: ${person.name}, ${person.age} years old`);
|
|
311
|
+
* ```
|
|
312
|
+
*
|
|
313
|
+
* @example
|
|
314
|
+
* ```typescript Extract Data from Multiple Content Items
|
|
315
|
+
* import { extractStructuredData } from '@intuned/browser/ai';
|
|
316
|
+
*
|
|
317
|
+
* const mixedContent = [
|
|
318
|
+
* { type: "text", data: "Product: iPhone 15" },
|
|
319
|
+
* { type: "image-url", image_type: "jpeg", data: "https://mintcdn.com/intuned-7/asXJUUPBWwDlStUB/logo/light.svg?fit=max&auto=format&n=asXJUUPBWwDlStUB&q=85&s=6525c0b299b3226464eba6afa9b7ebe6" }
|
|
320
|
+
* ];
|
|
321
|
+
*
|
|
322
|
+
*
|
|
323
|
+
* const product = await extractStructuredData({
|
|
324
|
+
* content: mixedContent,
|
|
325
|
+
* model: "claude-3-5-sonnet-latest",
|
|
326
|
+
* dataSchema: {
|
|
327
|
+
* type: "object",
|
|
328
|
+
* properties: {
|
|
329
|
+
* name: { type: "string" },
|
|
330
|
+
* price: { type: "string" },
|
|
331
|
+
* features: { type: "array", items: { type: "string" } }
|
|
332
|
+
* }
|
|
333
|
+
* },
|
|
334
|
+
* maxRetries: 1,
|
|
335
|
+
* enableCache: true
|
|
262
336
|
* });
|
|
263
337
|
* ```
|
|
264
338
|
*/
|
|
265
339
|
export declare function extractStructuredData(options: {
|
|
266
|
-
|
|
340
|
+
content:
|
|
341
|
+
| (TextContentItem | ImageBufferContentItem | ImageUrlContentItem)[]
|
|
342
|
+
| TextContentItem
|
|
343
|
+
| ImageBufferContentItem
|
|
344
|
+
| ImageUrlContentItem;
|
|
267
345
|
dataSchema: JsonSchema;
|
|
268
346
|
prompt?: string;
|
|
269
|
-
|
|
270
|
-
model?: SUPPORTED_MODELS;
|
|
347
|
+
model: SUPPORTED_MODELS;
|
|
271
348
|
apiKey?: string;
|
|
272
|
-
enableDomMatching?: boolean;
|
|
273
349
|
enableCache?: boolean;
|
|
274
350
|
maxRetries?: number;
|
|
275
351
|
}): Promise<any>;
|
|
@@ -348,72 +424,83 @@ type SUPPORTED_OPENAI_MODELS =
|
|
|
348
424
|
| "o4-mini-deep-research-2025-06-26";
|
|
349
425
|
/**
|
|
350
426
|
* This type defines the supported AI models for data extraction.
|
|
351
|
-
* It includes models from OpenAI
|
|
352
|
-
*
|
|
427
|
+
* It includes models from OpenAI and Anthropic.
|
|
428
|
+
* **Supported OPENAI Models**
|
|
429
|
+
* "gpt-3.5-turbo"
|
|
430
|
+
* "gpt-3.5-turbo-0125"
|
|
431
|
+
* "gpt-3.5-turbo-0301"
|
|
432
|
+
* "gpt-3.5-turbo-0613"
|
|
433
|
+
* "gpt-3.5-turbo-1106"
|
|
434
|
+
* "gpt-3.5-turbo-16k"
|
|
435
|
+
* "gpt-3.5-turbo-16k-0613"
|
|
436
|
+
* "gpt-3.5-turbo-instruct"
|
|
437
|
+
* "gpt-3.5-turbo-instruct-0914"
|
|
438
|
+
* "gpt-4"
|
|
439
|
+
* "gpt-4-0314"
|
|
440
|
+
* "gpt-4-0613"
|
|
441
|
+
* "gpt-4-32k"
|
|
442
|
+
* "gpt-4-32k-0314"
|
|
443
|
+
* "gpt-4-32k-0613"
|
|
444
|
+
* "gpt-4-turbo"
|
|
445
|
+
* "gpt-4-turbo-2024-04-09"
|
|
446
|
+
* "gpt-4.1"
|
|
447
|
+
* "gpt-4.1-2025-04-14"
|
|
448
|
+
* "gpt-4.1-mini"
|
|
449
|
+
* "gpt-4.1-mini-2025-04-14"
|
|
450
|
+
* "gpt-4.1-nano"
|
|
451
|
+
* "gpt-4.1-nano-2025-04-14"
|
|
452
|
+
* "gpt-4o"
|
|
453
|
+
* "gpt-4o-2024-05-13"
|
|
454
|
+
* "gpt-4o-2024-08-06"
|
|
455
|
+
* "gpt-4o-2024-11-20"
|
|
456
|
+
* "gpt-4o-mini"
|
|
457
|
+
* "gpt-4o-mini-2024-07-18"
|
|
458
|
+
* "gpt-5"
|
|
459
|
+
* "gpt-5-2025-08-07"
|
|
460
|
+
* "gpt-5-chat"
|
|
461
|
+
* "gpt-5-chat-latest"
|
|
462
|
+
* "gpt-5-mini"
|
|
463
|
+
* "gpt-5-mini-2025-08-07"
|
|
464
|
+
* "gpt-5-nano"
|
|
465
|
+
* "gpt-5-nano-2025-08-07"
|
|
466
|
+
* "o1"
|
|
467
|
+
* "o1-2024-12-17"
|
|
468
|
+
* "o1-mini"
|
|
469
|
+
* "o1-mini-2024-09-12"
|
|
470
|
+
* "o1-pro"
|
|
471
|
+
* "o1-pro-2025-03-19"
|
|
472
|
+
* "o3"
|
|
473
|
+
* "o3-2025-04-16"
|
|
474
|
+
* "o3-deep-research"
|
|
475
|
+
* "o3-deep-research-2025-06-26"
|
|
476
|
+
* "o3-mini"
|
|
477
|
+
* "o3-mini-2025-01-31"
|
|
478
|
+
* "o3-pro"
|
|
479
|
+
* "o3-pro-2025-06-10"
|
|
480
|
+
* "o4-mini"
|
|
481
|
+
* "o4-mini-2025-04-16"
|
|
482
|
+
* "o4-mini-deep-research"
|
|
483
|
+
* "o4-mini-deep-research-2025-06-26"
|
|
484
|
+
*
|
|
485
|
+
* **Supported Anthropic Models**
|
|
486
|
+
* "claude-3-5-haiku-20241022"
|
|
487
|
+
* "claude-3-5-haiku-latest"
|
|
488
|
+
* "claude-3-5-sonnet-20240620"
|
|
489
|
+
* "claude-3-5-sonnet-20241022"
|
|
490
|
+
* "claude-3-5-sonnet-latest"
|
|
491
|
+
* "claude-3-7-sonnet-20250219"
|
|
492
|
+
* "claude-3-7-sonnet-latest"
|
|
493
|
+
* "claude-3-haiku-20240307"
|
|
494
|
+
* "claude-4-opus-20250514"
|
|
495
|
+
* "claude-4-sonnet-20250514"
|
|
496
|
+
* "claude-opus-4-1"
|
|
497
|
+
* "claude-opus-4-1-20250805"
|
|
498
|
+
* "claude-opus-4-20250514"
|
|
499
|
+
* "claude-sonnet-4-20250514"
|
|
353
500
|
* @type SUPPORTED_MODELS
|
|
354
501
|
*/
|
|
355
502
|
type SUPPORTED_MODELS = SUPPORTED_CLAUDE_MODELS | SUPPORTED_OPENAI_MODELS;
|
|
356
503
|
|
|
357
|
-
/**
|
|
358
|
-
* Represents a JSON Schema definition for validating data structures.
|
|
359
|
-
* Supports various schema types including string, number, boolean, array, and object schemas
|
|
360
|
-
* with their respective validation rules and constraints.
|
|
361
|
-
*
|
|
362
|
-
* This type is a union of different schema types:
|
|
363
|
-
* - StringSchema: For string validation with length and pattern constraints
|
|
364
|
-
* - NumberSchema: For number/integer validation with range constraints
|
|
365
|
-
* - BooleanSchema: For boolean values
|
|
366
|
-
* - ArraySchema: For array validation with item constraints
|
|
367
|
-
* - ObjectSchema: For object validation with property constraints
|
|
368
|
-
*
|
|
369
|
-
* @type JsonSchema
|
|
370
|
-
* @example
|
|
371
|
-
* ```typescript String Schema
|
|
372
|
-
* const stringSchema: JsonSchema = {
|
|
373
|
-
* type: "string",
|
|
374
|
-
* minLength: 3,
|
|
375
|
-
* maxLength: 50,
|
|
376
|
-
* pattern: "^[A-Za-z]+$"
|
|
377
|
-
* };
|
|
378
|
-
* ```
|
|
379
|
-
*
|
|
380
|
-
* @example
|
|
381
|
-
* ```typescript Number Schema
|
|
382
|
-
* const numberSchema: JsonSchema = {
|
|
383
|
-
* type: "number",
|
|
384
|
-
* minimum: 0,
|
|
385
|
-
* maximum: 100,
|
|
386
|
-
* multipleOf: 0.5
|
|
387
|
-
* };
|
|
388
|
-
* ```
|
|
389
|
-
*
|
|
390
|
-
* @example
|
|
391
|
-
* ```typescript Array Schema
|
|
392
|
-
* const arraySchema: JsonSchema = {
|
|
393
|
-
* type: "array",
|
|
394
|
-
* items: {
|
|
395
|
-
* type: "string"
|
|
396
|
-
* },
|
|
397
|
-
* minItems: 1,
|
|
398
|
-
* maxItems: 10,
|
|
399
|
-
* uniqueItems: true
|
|
400
|
-
* };
|
|
401
|
-
* ```
|
|
402
|
-
*
|
|
403
|
-
* @example
|
|
404
|
-
* ```typescript Object Schema
|
|
405
|
-
* const objectSchema: JsonSchema = {
|
|
406
|
-
* type: "object",
|
|
407
|
-
* properties: {
|
|
408
|
-
* name: { type: "string" },
|
|
409
|
-
* age: { type: "number", minimum: 0 },
|
|
410
|
-
* email: { type: "string", pattern: "^[^@]+@[^@]+\\.[^@]+$" }
|
|
411
|
-
* },
|
|
412
|
-
* required: ["name", "email"]
|
|
413
|
-
* };
|
|
414
|
-
* ```
|
|
415
|
-
*/
|
|
416
|
-
|
|
417
504
|
/**
|
|
418
505
|
* Uses AI vision to determine if a webpage has finished loading by analyzing a screenshot.
|
|
419
506
|
* Detects loading spinners, blank content, or incomplete page states.
|
|
@@ -465,9 +552,36 @@ export declare function isPageLoaded(input: {
|
|
|
465
552
|
apiKey?: string;
|
|
466
553
|
}): Promise<boolean>;
|
|
467
554
|
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
555
|
+
/**
|
|
556
|
+
* @interface
|
|
557
|
+
* @property {string} type - The type of the content item, which is always "text".
|
|
558
|
+
* @property {string} data - The text data.
|
|
559
|
+
*/
|
|
560
|
+
export interface TextContentItem {
|
|
561
|
+
type: "text";
|
|
562
|
+
data: string;
|
|
563
|
+
}
|
|
564
|
+
|
|
565
|
+
/**
|
|
566
|
+
* @interface
|
|
567
|
+
* @property {string} type - The type of the content item, which is always "image-buffer".
|
|
568
|
+
* @property {string} image_type - The image format (e.g., "png", "jpeg", "gif", "webp").
|
|
569
|
+
* @property {Buffer} data - The buffer containing the image data.
|
|
570
|
+
*/
|
|
571
|
+
export interface ImageBufferContentItem {
|
|
572
|
+
type: "image-buffer";
|
|
573
|
+
image_type: "png" | "jpeg" | "gif" | "webp";
|
|
574
|
+
data: Buffer;
|
|
575
|
+
}
|
|
576
|
+
|
|
577
|
+
/**
|
|
578
|
+
* @interface
|
|
579
|
+
* @property {string} type - The type of the content item, which is always "image-url".
|
|
580
|
+
* @property {string} image_type - The image format (e.g., "png", "jpeg", "gif", "webp").
|
|
581
|
+
* @property {string} data - The URL of the image.
|
|
582
|
+
*/
|
|
583
|
+
export interface ImageUrlContentItem {
|
|
584
|
+
type: "image-url";
|
|
585
|
+
image_type: "png" | "jpeg" | "gif" | "webp";
|
|
586
|
+
data: string;
|
|
587
|
+
}
|
|
@@ -17,6 +17,9 @@ var _Logger = require("../common/Logger");
|
|
|
17
17
|
var _helpers = require("../helpers");
|
|
18
18
|
var _xpathMapping = require("../common/xpathMapping");
|
|
19
19
|
const extractStructuredData = async options => {
|
|
20
|
+
if ("content" in options && !("source" in options)) {
|
|
21
|
+
return await extractStructuredDataFromContent(options);
|
|
22
|
+
}
|
|
20
23
|
const pageOrLocator = options.source;
|
|
21
24
|
const isPageInput = (0, _locatorHelpers.isPage)(pageOrLocator);
|
|
22
25
|
const {
|
|
@@ -83,15 +86,18 @@ const extractStructuredData = async options => {
|
|
|
83
86
|
return cachedResult;
|
|
84
87
|
}
|
|
85
88
|
}
|
|
86
|
-
const result = await (0, _extractStructuredDataUsingAi.extractStructuredDataUsingAi)(
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
89
|
+
const result = await (0, _extractStructuredDataUsingAi.extractStructuredDataUsingAi)({
|
|
90
|
+
page: pageObject,
|
|
91
|
+
options: {
|
|
92
|
+
apiKey: validatedData.apiKey,
|
|
93
|
+
enableDomMatching: validatedData.enableDomMatching,
|
|
94
|
+
jsonSchema: validatedData.dataSchema,
|
|
95
|
+
model: validatedData.model || "claude-3-5-haiku-latest",
|
|
96
|
+
content: simplifiedHtml,
|
|
97
|
+
prompt: validatedData.prompt,
|
|
98
|
+
images: [],
|
|
99
|
+
maxRetries: validatedData.maxRetries
|
|
100
|
+
}
|
|
95
101
|
});
|
|
96
102
|
if (result.isErr()) {
|
|
97
103
|
throw new Error(result.error.context);
|
|
@@ -139,15 +145,21 @@ const extractStructuredData = async options => {
|
|
|
139
145
|
if (images.isErr()) {
|
|
140
146
|
throw new Error(images.error.context);
|
|
141
147
|
}
|
|
142
|
-
const result = await (0, _extractStructuredDataUsingAi.extractStructuredDataUsingAi)(
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
148
|
+
const result = await (0, _extractStructuredDataUsingAi.extractStructuredDataUsingAi)({
|
|
149
|
+
page: pageObject,
|
|
150
|
+
options: {
|
|
151
|
+
apiKey: validatedData.apiKey,
|
|
152
|
+
enableDomMatching: validatedData.enableDomMatching,
|
|
153
|
+
jsonSchema: validatedData.dataSchema,
|
|
154
|
+
model: validatedData.model || "claude-3-5-haiku-latest",
|
|
155
|
+
content: "Extract structured data from the following images.",
|
|
156
|
+
prompt: validatedData.prompt,
|
|
157
|
+
images: images.value.map(i => ({
|
|
158
|
+
data: i,
|
|
159
|
+
image_type: "png"
|
|
160
|
+
})),
|
|
161
|
+
maxRetries: validatedData.maxRetries
|
|
162
|
+
}
|
|
151
163
|
});
|
|
152
164
|
if (result.isErr()) {
|
|
153
165
|
throw new Error(result.error.context);
|
|
@@ -200,15 +212,18 @@ const extractStructuredData = async options => {
|
|
|
200
212
|
return cachedResult;
|
|
201
213
|
}
|
|
202
214
|
}
|
|
203
|
-
const result = await (0, _extractStructuredDataUsingAi.extractStructuredDataUsingAi)(
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
215
|
+
const result = await (0, _extractStructuredDataUsingAi.extractStructuredDataUsingAi)({
|
|
216
|
+
page: pageObject,
|
|
217
|
+
options: {
|
|
218
|
+
apiKey: validatedData.apiKey,
|
|
219
|
+
enableDomMatching: validatedData.enableDomMatching,
|
|
220
|
+
jsonSchema: validatedData.dataSchema,
|
|
221
|
+
model: validatedData.model || "claude-3-5-haiku-latest",
|
|
222
|
+
content: markdown,
|
|
223
|
+
prompt: validatedData.prompt,
|
|
224
|
+
images: [],
|
|
225
|
+
maxRetries: validatedData.maxRetries
|
|
226
|
+
}
|
|
212
227
|
});
|
|
213
228
|
if (result.isErr()) {
|
|
214
229
|
throw new Error(result.error.context);
|
|
@@ -228,4 +243,78 @@ const extractStructuredData = async options => {
|
|
|
228
243
|
}
|
|
229
244
|
throw new Error(`Unsupported strategy type: ${validatedData.strategy}. Supported types are: HTML, IMAGE, and MARKDOWN.`);
|
|
230
245
|
};
|
|
231
|
-
exports.extractStructuredData = extractStructuredData;
|
|
246
|
+
exports.extractStructuredData = extractStructuredData;
|
|
247
|
+
const extractStructuredDataFromContent = async options => {
|
|
248
|
+
const contentValidationResult = _validators.contentValidationSchema.safeParse(options.content);
|
|
249
|
+
if (!contentValidationResult.success) {
|
|
250
|
+
const error = contentValidationResult.error;
|
|
251
|
+
const messages = (0, _formatZodError.formatZodError)(error);
|
|
252
|
+
throw new Error("extractStructuredDataFromContent content is invalid: \n" + messages.join("\n"));
|
|
253
|
+
}
|
|
254
|
+
const {
|
|
255
|
+
content: _,
|
|
256
|
+
...rest
|
|
257
|
+
} = options;
|
|
258
|
+
const parsingResult = _validators.genericExtractDataInputSchema.safeParse(rest);
|
|
259
|
+
if (!parsingResult.success) {
|
|
260
|
+
const error = parsingResult.error;
|
|
261
|
+
const messages = (0, _formatZodError.formatZodError)(error);
|
|
262
|
+
throw new Error("extractStructuredDataFromContent input is invalid: \n" + messages.join("\n"));
|
|
263
|
+
}
|
|
264
|
+
const content = Array.isArray(options.content) ? options.content : [options.content];
|
|
265
|
+
const imagesFromBuffers = content.filter(c => c.type === "image-buffer").map(c => ({
|
|
266
|
+
image_type: c.image_type,
|
|
267
|
+
data: c.data
|
|
268
|
+
}));
|
|
269
|
+
const imagesFromUrls = content.filter(c => c.type === "image-url").map(c => ({
|
|
270
|
+
image_type: c.image_type,
|
|
271
|
+
data: c.data
|
|
272
|
+
})).map(async c => {
|
|
273
|
+
try {
|
|
274
|
+
const response = await fetch(c.data);
|
|
275
|
+
const buffer = Buffer.from(await response.arrayBuffer());
|
|
276
|
+
return {
|
|
277
|
+
image_type: c.image_type,
|
|
278
|
+
data: buffer
|
|
279
|
+
};
|
|
280
|
+
} catch (e) {
|
|
281
|
+
throw new Error(`fetching image:${c.data} from url Failed: ${e}`);
|
|
282
|
+
}
|
|
283
|
+
});
|
|
284
|
+
const images = [...(await Promise.all(imagesFromUrls)), ...imagesFromBuffers];
|
|
285
|
+
const texts = content.filter(c => c.type === "text").map(c => c.data);
|
|
286
|
+
let cacheKey = "";
|
|
287
|
+
if (options.enableCache != false) {
|
|
288
|
+
cacheKey = (0, _hashObject.hashObject)({
|
|
289
|
+
systemMessage: options.prompt,
|
|
290
|
+
images,
|
|
291
|
+
jsonSchema: options.dataSchema,
|
|
292
|
+
model: options.model,
|
|
293
|
+
text: texts
|
|
294
|
+
}, false);
|
|
295
|
+
const cachedResult = await _cache.cache.get(cacheKey);
|
|
296
|
+
if (cachedResult) {
|
|
297
|
+
_Logger.logger.info("Results for the extractor found in the cache, returning cached result");
|
|
298
|
+
return cachedResult;
|
|
299
|
+
}
|
|
300
|
+
}
|
|
301
|
+
const result = await (0, _extractStructuredDataUsingAi.extractStructuredDataUsingAi)({
|
|
302
|
+
options: {
|
|
303
|
+
prompt: options.prompt,
|
|
304
|
+
images,
|
|
305
|
+
jsonSchema: options.dataSchema,
|
|
306
|
+
content: texts.join("\n"),
|
|
307
|
+
enableDomMatching: false,
|
|
308
|
+
apiKey: options.apiKey,
|
|
309
|
+
model: options.model || "claude-3-5-haiku-latest",
|
|
310
|
+
maxRetries: options.maxRetries
|
|
311
|
+
}
|
|
312
|
+
});
|
|
313
|
+
if (result.isErr()) {
|
|
314
|
+
throw new Error(result.error.context);
|
|
315
|
+
}
|
|
316
|
+
if (options.enableCache != false) {
|
|
317
|
+
await _cache.cache.set(cacheKey, result.value.result);
|
|
318
|
+
}
|
|
319
|
+
return result.value.result;
|
|
320
|
+
};
|
|
@@ -18,7 +18,7 @@ var _prompt = require("./prompt");
|
|
|
18
18
|
var _ai = require("ai");
|
|
19
19
|
var _loadRuntime = require("../common/loadRuntime");
|
|
20
20
|
function _interopRequireWildcard(e, t) { if ("function" == typeof WeakMap) var r = new WeakMap(), n = new WeakMap(); return (_interopRequireWildcard = function (e, t) { if (!t && e && e.__esModule) return e; var o, i, f = { __proto__: null, default: e }; if (null === e || "object" != typeof e && "function" != typeof e) return f; if (o = t ? n : r) { if (o.has(e)) return o.get(e); o.set(e, f); } for (const t in e) "default" !== t && {}.hasOwnProperty.call(e, t) && ((i = (o = Object.defineProperty) && Object.getOwnPropertyDescriptor(e, t)) && (i.get || i.set) ? o(f, t, i) : f[t] = e[t]); return f; })(e, t); }
|
|
21
|
-
async function extractStructuredDataUsingAi(
|
|
21
|
+
async function extractStructuredDataUsingAi(input) {
|
|
22
22
|
var _getExecutionContext, _getExecutionContext2, _getExecutionContext3;
|
|
23
23
|
const {
|
|
24
24
|
apiKey,
|
|
@@ -29,7 +29,7 @@ async function extractStructuredDataUsingAi(page, input) {
|
|
|
29
29
|
prompt,
|
|
30
30
|
images,
|
|
31
31
|
maxRetries = 3
|
|
32
|
-
} = input;
|
|
32
|
+
} = input.options;
|
|
33
33
|
let accumulatedTokens = 0;
|
|
34
34
|
const getExecutionContext = await (0, _loadRuntime.loadRuntime)();
|
|
35
35
|
const toolName = `extract_data`;
|
|
@@ -99,7 +99,7 @@ async function extractStructuredDataUsingAi(page, input) {
|
|
|
99
99
|
currentRetry++;
|
|
100
100
|
continue;
|
|
101
101
|
}
|
|
102
|
-
if (!enableDomMatching) {
|
|
102
|
+
if (!enableDomMatching || !input.page) {
|
|
103
103
|
_Logger.logger.info(`Extraction completed, total LLM ${isGateway ? "Cost In Cents" : "Tokens"}: ${accumulatedTokens}`);
|
|
104
104
|
return (0, _neverthrow.ok)({
|
|
105
105
|
result: extractedData,
|
|
@@ -123,7 +123,7 @@ async function extractStructuredDataUsingAi(page, input) {
|
|
|
123
123
|
xpathMapping
|
|
124
124
|
} = await (0, _matching.replaceWithBestMatches)({
|
|
125
125
|
stringsToMatch,
|
|
126
|
-
pageObject: page
|
|
126
|
+
pageObject: input.page
|
|
127
127
|
});
|
|
128
128
|
const stringReplacements = {};
|
|
129
129
|
Object.entries(replacements).forEach(([key, value]) => {
|