@intuned/browser-dev 0.1.7-dev.0 → 0.1.9-dev.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +85 -143
- package/dist/ai/export.d.ts +292 -144
- package/dist/ai/extractStructuredDataUsingAi.js +24 -1
- package/dist/ai/index.d.ts +292 -144
- package/dist/ai/tests/testExtractStructuredData.spec.js +2 -2
- package/dist/common/Logger/index.js +2 -2
- package/dist/helpers/export.d.ts +703 -577
- package/dist/helpers/gotoUrl.js +50 -51
- package/dist/helpers/index.d.ts +703 -577
- package/dist/helpers/tests/testClickUntilExhausted.spec.js +2 -1
- package/dist/helpers/withNetworkSettledWait.js +2 -7
- package/dist/optimized-extractors/export.d.ts +17 -18
- package/dist/optimized-extractors/index.d.ts +17 -18
- package/how-to-generate-docs.md +40 -28
- package/package.json +2 -3
- package/generated-docs/ai/functions/extractStructuredData.mdx +0 -255
- package/generated-docs/ai/functions/isPageLoaded.mdx +0 -89
- package/generated-docs/ai/interfaces/ArraySchema.mdx +0 -36
- package/generated-docs/ai/interfaces/BasicSchema.mdx +0 -14
- package/generated-docs/ai/interfaces/BooleanSchema.mdx +0 -28
- package/generated-docs/ai/interfaces/ImageBufferContentItem.mdx +0 -16
- package/generated-docs/ai/interfaces/ImageUrlContentItem.mdx +0 -16
- package/generated-docs/ai/interfaces/NumberSchema.mdx +0 -35
- package/generated-docs/ai/interfaces/ObjectSchema.mdx +0 -39
- package/generated-docs/ai/interfaces/StringSchema.mdx +0 -35
- package/generated-docs/ai/interfaces/TextContentItem.mdx +0 -14
- package/generated-docs/ai/type-aliases/ContentItem.mdx +0 -12
- package/generated-docs/ai/type-aliases/JsonSchema.mdx +0 -47
- package/generated-docs/ai/type-aliases/SUPPORTED_MODELS.mdx +0 -85
- package/generated-docs/helpers/functions/clickButtonAndWait.mdx +0 -63
- package/generated-docs/helpers/functions/clickUntilExhausted.mdx +0 -112
- package/generated-docs/helpers/functions/downloadFile.mdx +0 -99
- package/generated-docs/helpers/functions/extractMarkdown.mdx +0 -56
- package/generated-docs/helpers/functions/filterEmptyValues.mdx +0 -51
- package/generated-docs/helpers/functions/goToUrl.mdx +0 -124
- package/generated-docs/helpers/functions/processDate.mdx +0 -55
- package/generated-docs/helpers/functions/resolveUrl.mdx +0 -165
- package/generated-docs/helpers/functions/sanitizeHtml.mdx +0 -113
- package/generated-docs/helpers/functions/saveFileToS3.mdx +0 -127
- package/generated-docs/helpers/functions/scrollToLoadContent.mdx +0 -83
- package/generated-docs/helpers/functions/uploadFileToS3.mdx +0 -121
- package/generated-docs/helpers/functions/validateDataUsingSchema.mdx +0 -90
- package/generated-docs/helpers/functions/waitForDomSettled.mdx +0 -91
- package/generated-docs/helpers/functions/withNetworkSettledWait.mdx +0 -76
- package/generated-docs/helpers/interfaces/Attachment.mdx +0 -56
- package/generated-docs/helpers/interfaces/S3Configs.mdx +0 -52
- package/generated-docs/helpers/interfaces/SanitizeHtmlOptions.mdx +0 -22
- package/generated-docs/helpers/type-aliases/AttachmentType.mdx +0 -10
- package/generated-docs/helpers/type-aliases/FileType.mdx +0 -61
- package/generated-docs/helpers/type-aliases/Trigger.mdx +0 -62
package/dist/ai/export.d.ts
CHANGED
|
@@ -177,6 +177,10 @@ export interface ObjectSchema extends BasicSchema {
|
|
|
177
177
|
* @example
|
|
178
178
|
* ```typescript Object Schema
|
|
179
179
|
* import { JsonSchema } from "@intuned/browser/ai";
|
|
180
|
+
* import { BrowserContext, Page } from "playwright";
|
|
181
|
+
*
|
|
182
|
+
* interface Params {}
|
|
183
|
+
*
|
|
180
184
|
* export default async function handler(params, page, context){
|
|
181
185
|
* const schema: JsonSchema = {
|
|
182
186
|
* type: "object",
|
|
@@ -207,73 +211,122 @@ export type JsonSchema =
|
|
|
207
211
|
| ArraySchema
|
|
208
212
|
| ObjectSchema;
|
|
209
213
|
/**
|
|
210
|
-
*
|
|
211
|
-
*
|
|
214
|
+
* Extracts structured data from web pages using AI-powered content analysis.
|
|
215
|
+
*
|
|
212
216
|
* This function provides intelligent data extraction from web pages using various strategies
|
|
213
|
-
* including HTML parsing, image analysis, and Markdown conversion.
|
|
214
|
-
* from entire pages or specific elements, with built-in caching and retry mechanisms.
|
|
217
|
+
* including HTML parsing, image analysis, and Markdown conversion, or directly from text or image content.
|
|
218
|
+
* It supports extraction from entire pages or specific elements, with built-in caching and retry mechanisms.
|
|
219
|
+
*
|
|
220
|
+
* @overload Extract From Page or Locator
|
|
221
|
+
*
|
|
222
|
+
* Extract data from web pages or specific elements using HTML, IMAGE, or MARKDOWN strategies with DOM matching support.
|
|
223
|
+
*
|
|
224
|
+
* ## Features and limitations
|
|
225
|
+
*
|
|
226
|
+
* **Features:**
|
|
227
|
+
* - **Smart caching:** Hashes inputs and uses [KV Cache](https://docs.intunedhq.com/docs/01-learn/recipes/kv-cache) for persistent storage
|
|
228
|
+
* - **DOM matching:** With `enableDomMatching=true`, values match DOM elements for smart caching
|
|
229
|
+
* - **Multiple strategies:** HTML, IMAGE, or MARKDOWN based on content type
|
|
230
|
+
* - **Flexible models:** Use any up-to-date model from Anthropic, OpenAI, or Google based on your needs
|
|
231
|
+
*
|
|
232
|
+
* **Limitations:**
|
|
233
|
+
* - **Model variability:** Quality varies by model—experiment to find the best fit
|
|
234
|
+
* - **DOM complexity:** Dynamic structures can affect caching and matching
|
|
235
|
+
* - **IMAGE strategy constraints:** Can't capture truncated or off-screen content
|
|
236
|
+
* - **Schema design:** Complex schemas may reduce accuracy
|
|
215
237
|
*
|
|
216
238
|
* @param {Object} options - Configuration object containing extraction parameters
|
|
217
|
-
* @param {Page | Locator} options.source - Playwright Page object to extract data from the entire page or Locator object to extract data from a specific element
|
|
218
|
-
* @param {JsonSchema | z.ZodSchema} options.dataSchema -
|
|
219
|
-
* @param {string} [options.
|
|
220
|
-
* @param {
|
|
221
|
-
*
|
|
222
|
-
*
|
|
223
|
-
*
|
|
224
|
-
* @param {
|
|
225
|
-
* @param {
|
|
226
|
-
*
|
|
227
|
-
* @
|
|
228
|
-
|
|
239
|
+
* @param {Page | Locator} options.source - Playwright Page object to extract data from the entire page or Locator object to extract data from a specific element.
|
|
240
|
+
* @param {JsonSchema | z.ZodSchema} options.dataSchema - Schema defining the structure of the data to extract. Can be a JSON Schema object or a Zod schema.
|
|
241
|
+
* @param {string} [options.prompt] - Optional prompt to guide the extraction process and provide more context. Defaults to undefined.
|
|
242
|
+
* @param {("HTML"|"IMAGE"|"MARKDOWN")} [options.strategy="HTML"] - Type of extraction strategy:
|
|
243
|
+
* - **"HTML"** (default) - Best for text-heavy pages with structured content
|
|
244
|
+
* - **"IMAGE"** - Best for visual content, charts, or complex layouts
|
|
245
|
+
* - **"MARKDOWN"** - Best for article-style content with semantic structure
|
|
246
|
+
* @param {boolean} [options.enableDomMatching=false] - Whether to enable DOM element matching during extraction. You must enable cache for this to work. When enabled, extraction results are mapped to their corresponding DOM elements and returned with matched results. These results are intelligently cached, allowing subsequent extractions with minor DOM changes to utilize the cached data for improved performance. Defaults to false.
|
|
247
|
+
* @param {boolean} [options.enableCache=true] - Whether to enable caching of extraction results. Defaults to true.
|
|
248
|
+
* @param {number} [options.maxRetries=3] - Maximum number of retry attempts on failures. Failures can be validation errors, API errors, output errors, etc. Defaults to 3.
|
|
249
|
+
* @param {string} [options.model="claude-haiku-4-5-20251001"] - AI model to use for extraction. Defaults to "claude-haiku-4-5-20251001".
|
|
250
|
+
* @param {string} [options.apiKey] - Optional API key for AI extraction (if provided, will not be billed to your account). Defaults to undefined.
|
|
251
|
+
*
|
|
252
|
+
* @returns {Promise<any>} The extracted structured data conforming to the provided schema.
|
|
253
|
+
*
|
|
229
254
|
* @example
|
|
230
|
-
* ```typescript
|
|
255
|
+
* ```typescript Extract book details
|
|
231
256
|
* import { extractStructuredData } from '@intuned/browser/ai';
|
|
232
|
-
*
|
|
233
|
-
*
|
|
234
|
-
*
|
|
235
|
-
*
|
|
236
|
-
*
|
|
237
|
-
*
|
|
238
|
-
*
|
|
239
|
-
*
|
|
240
|
-
*
|
|
241
|
-
*
|
|
242
|
-
*
|
|
243
|
-
*
|
|
244
|
-
*
|
|
257
|
+
* import { BrowserContext, Page } from "playwright";
|
|
258
|
+
*
|
|
259
|
+
* interface Params {}
|
|
260
|
+
*
|
|
261
|
+
* export default async function handler(params: Params, page: Page, context: BrowserContext){
|
|
262
|
+
* await page.goto('https://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html');
|
|
263
|
+
* // This will extract the book details from the page, using the HTML strategy with the gpt-4o model.
|
|
264
|
+
* // The dataSchema is a JSON Schema object that defines the structure of the data to extract.
|
|
265
|
+
* // You can also use a Zod schema instead of a JSON Schema object.
|
|
266
|
+
* const book = await extractStructuredData({
|
|
267
|
+
* source: page,
|
|
268
|
+
* strategy: "HTML", // The HTML strategy is the default strategy and will be used if no strategy is provided.
|
|
269
|
+
* model: "gpt-4o",
|
|
270
|
+
* dataSchema: {
|
|
271
|
+
* type: "object",
|
|
272
|
+
* properties: {
|
|
273
|
+
* name: { type: "string" },
|
|
274
|
+
* price: { type: "string" },
|
|
275
|
+
* description: { type: "string" },
|
|
276
|
+
* inStock: { type: "boolean" },
|
|
277
|
+
* rating: { type: "string" }
|
|
278
|
+
* },
|
|
279
|
+
* required: ["name", "price"]
|
|
245
280
|
* },
|
|
246
|
-
*
|
|
247
|
-
*
|
|
248
|
-
*
|
|
249
|
-
*
|
|
250
|
-
*
|
|
281
|
+
* prompt: "Extract book details from this page",
|
|
282
|
+
* enableCache: true, // since this is true, the method calls the AI on the first run; subsequent calls return the cached results as long as the DOM is unchanged.
|
|
283
|
+
* enableDomMatching: true, // since this is true, the method will return the results mapped to the DOM elements; you MUST enable cache for this to work.
|
|
284
|
+
* maxRetries: 3
|
|
285
|
+
* });
|
|
286
|
+
*
|
|
287
|
+
* console.log(`Found book: ${book.name} - ${book.price}`);
|
|
251
288
|
* }
|
|
252
289
|
* ```
|
|
253
290
|
*
|
|
254
291
|
* @example
|
|
255
|
-
* ```typescript
|
|
292
|
+
* ```typescript Extract all book listings
|
|
256
293
|
* import { extractStructuredData } from '@intuned/browser/ai';
|
|
257
|
-
*
|
|
258
|
-
*
|
|
259
|
-
*
|
|
260
|
-
*
|
|
261
|
-
*
|
|
262
|
-
*
|
|
263
|
-
*
|
|
264
|
-
* dataSchema
|
|
265
|
-
*
|
|
266
|
-
*
|
|
267
|
-
*
|
|
268
|
-
*
|
|
269
|
-
*
|
|
270
|
-
*
|
|
271
|
-
*
|
|
272
|
-
*
|
|
273
|
-
*
|
|
274
|
-
*
|
|
275
|
-
*
|
|
276
|
-
*
|
|
294
|
+
* import { BrowserContext, Page } from "playwright";
|
|
295
|
+
*
|
|
296
|
+
* interface Params {}
|
|
297
|
+
*
|
|
298
|
+
* export default async function handler(params: Params, page: Page, context: BrowserContext){
|
|
299
|
+
* await page.goto('https://books.toscrape.com/');
|
|
300
|
+
* // This will extract all the book listings from the page, using the HTML strategy with the claude-3-7-sonnet-latest model.
|
|
301
|
+
* // The dataSchema is a JSON Schema object that defines the structure of the data to extract.
|
|
302
|
+
* // You can also use a Zod schema instead of a JSON Schema object.
|
|
303
|
+
* const books = await extractStructuredData({
|
|
304
|
+
* source: page,
|
|
305
|
+
* strategy: "HTML",
|
|
306
|
+
* model: "claude-3-7-sonnet-latest",
|
|
307
|
+
* dataSchema: {
|
|
308
|
+
* type: "object",
|
|
309
|
+
* properties: {
|
|
310
|
+
* products: {
|
|
311
|
+
* type: "array",
|
|
312
|
+
* items: {
|
|
313
|
+
* type: "object",
|
|
314
|
+
* properties: {
|
|
315
|
+
* title: { type: "string" },
|
|
316
|
+
* price: { type: "string" },
|
|
317
|
+
* availability: { type: "string" }
|
|
318
|
+
* }
|
|
319
|
+
* }
|
|
320
|
+
* }
|
|
321
|
+
* }
|
|
322
|
+
* },
|
|
323
|
+
* prompt: "Extract all book listings",
|
|
324
|
+
* enableCache: false, // In this example, we don't want to cache the extracted data, we want to extract the data every time.
|
|
325
|
+
* });
|
|
326
|
+
*
|
|
327
|
+
* for (const book of books.products) {
|
|
328
|
+
* console.log(`${book.title}: ${book.price}`);
|
|
329
|
+
* }
|
|
277
330
|
* }
|
|
278
331
|
* ```
|
|
279
332
|
*/
|
|
@@ -290,76 +343,110 @@ export declare function extractStructuredData(options: {
|
|
|
290
343
|
}): Promise<any>;
|
|
291
344
|
|
|
292
345
|
/**
|
|
293
|
-
*
|
|
294
|
-
*
|
|
295
|
-
* This
|
|
296
|
-
*
|
|
297
|
-
*
|
|
346
|
+
* Extracts structured data from web pages using AI-powered content analysis.
|
|
347
|
+
*
|
|
348
|
+
* This function provides intelligent data extraction from web pages using various strategies
|
|
349
|
+
* including HTML parsing, image analysis, and Markdown conversion, or directly from text or image content.
|
|
350
|
+
* It supports extraction from entire pages or specific elements, with built-in caching and retry mechanisms.
|
|
351
|
+
*
|
|
352
|
+
* @overload Extract From Content
|
|
353
|
+
*
|
|
354
|
+
* Extract data from text, image buffers, or image URLs without requiring a page source.
|
|
355
|
+
*
|
|
356
|
+
* ## Features and limitations
|
|
357
|
+
*
|
|
358
|
+
* **Features:**
|
|
359
|
+
* - **Smart caching:** Hashes content and uses [KV Cache](https://docs.intunedhq.com/docs/01-learn/recipes/kv-cache) for persistent storage
|
|
360
|
+
* - **Multiple content items:** Combine text, images (buffer or URL) for comprehensive extraction
|
|
361
|
+
* - **Flexible models:** Use any up-to-date model from Anthropic, OpenAI, or Google based on your needs
|
|
362
|
+
*
|
|
363
|
+
* **Limitations:**
|
|
364
|
+
* - **Model variability:** Quality varies by model—experiment to find the best fit
|
|
365
|
+
* - **Schema design:** Complex schemas may reduce accuracy
|
|
366
|
+
* - **Content quality:** Requires meaningful, contextual content for accurate extraction—sparse or ambiguous content produces poor results
|
|
298
367
|
*
|
|
299
368
|
* @param {Object} options - Configuration object containing extraction parameters
|
|
300
|
-
* @param {ContentItem[] | ContentItem} options.content - Content to extract data from - can be a single content item or array of
|
|
301
|
-
* @param {JsonSchema | z.ZodSchema} options.dataSchema -
|
|
302
|
-
* @param {string} [options.prompt] - Optional prompt to guide the extraction process and provide more context
|
|
303
|
-
* @param {
|
|
304
|
-
* @param {
|
|
305
|
-
* @param {string} options.model - AI model to use for extraction
|
|
306
|
-
* @param {string} [options.apiKey] - Optional API key for AI extraction (if provided, will not be billed to your account)
|
|
369
|
+
* @param {ContentItem[] | ContentItem} options.content - Content to extract data from - can be a single content item or an array of [ContentItem](../type-references/ContentItem).
|
|
370
|
+
* @param {JsonSchema | z.ZodSchema} options.dataSchema - Schema defining the expected structure of the extracted data. Can be a JSON Schema object or a Zod schema.
|
|
371
|
+
* @param {string} [options.prompt] - Optional prompt to guide the extraction process and provide more context. Defaults to undefined.
|
|
372
|
+
* @param {number} [options.maxRetries=3] - Maximum number of retry attempts on failures. Failures can be validation errors, API errors, output errors, etc. Defaults to 3.
|
|
373
|
+
* @param {boolean} [options.enableCache=true] - Whether to enable caching of the extracted data. Defaults to true.
|
|
374
|
+
* @param {string} [options.model="claude-haiku-4-5-20251001"] - AI model to use for extraction. Defaults to "claude-haiku-4-5-20251001".
|
|
375
|
+
* @param {string} [options.apiKey] - Optional API key for AI extraction (if provided, will not be billed to your account). Defaults to undefined.
|
|
307
376
|
*
|
|
308
|
-
* @returns Promise
|
|
377
|
+
* @returns {Promise<any>} The extracted structured data conforming to the provided schema.
|
|
309
378
|
*
|
|
310
379
|
* @example
|
|
311
|
-
* ```typescript Text Content
|
|
312
|
-
* import { extractStructuredData } from '@intuned/browser/ai';
|
|
313
|
-
*
|
|
314
|
-
* const textContent: TextContentItem = {
|
|
315
|
-
* type: "text",
|
|
316
|
-
* data: "John Doe, age 30, works as a Software Engineer at Tech Corp"
|
|
317
|
-
* };
|
|
380
|
+
* ```typescript Basic Text Content Extraction
|
|
381
|
+
* import { extractStructuredData, TextContentItem } from '@intuned/browser/ai';
|
|
382
|
+
* import { BrowserContext, Page } from "playwright";
|
|
318
383
|
*
|
|
319
|
-
*
|
|
320
|
-
*
|
|
321
|
-
*
|
|
322
|
-
*
|
|
323
|
-
*
|
|
324
|
-
*
|
|
325
|
-
*
|
|
326
|
-
*
|
|
327
|
-
* occupation: { type: "string" },
|
|
328
|
-
* company: { type: "string" }
|
|
329
|
-
* },
|
|
330
|
-
* required: ["name"]
|
|
331
|
-
* },
|
|
332
|
-
* prompt: "Extract person information from the text"
|
|
333
|
-
* });
|
|
384
|
+
* interface Params {}
|
|
385
|
+
*
|
|
386
|
+
* export default async function handler(params: Params, page: Page, context: BrowserContext){
|
|
387
|
+
* // This will extract the person information from the text, using the gpt-4o model.
|
|
388
|
+
* const textContent: TextContentItem = {
|
|
389
|
+
* type: "text",
|
|
390
|
+
* data: "John Doe, age 30, works as a Software Engineer at Tech Corp"
|
|
391
|
+
* };
|
|
334
392
|
*
|
|
335
|
-
*
|
|
393
|
+
* const person = await extractStructuredData({
|
|
394
|
+
* content: textContent,
|
|
395
|
+
* model: "gpt-4o",
|
|
396
|
+
* dataSchema: {
|
|
397
|
+
* type: "object",
|
|
398
|
+
* properties: {
|
|
399
|
+
* name: { type: "string" },
|
|
400
|
+
* age: { type: "number" },
|
|
401
|
+
* occupation: { type: "string" },
|
|
402
|
+
* company: { type: "string" }
|
|
403
|
+
* },
|
|
404
|
+
* required: ["name"]
|
|
405
|
+
* },
|
|
406
|
+
* prompt: "Extract person information from the text"
|
|
407
|
+
* });
|
|
408
|
+
*
|
|
409
|
+
* console.log(`Found person: ${person.name}, ${person.age} years old`);
|
|
336
410
|
* }
|
|
337
411
|
* ```
|
|
338
412
|
*
|
|
339
413
|
* @example
|
|
340
|
-
* ```typescript
|
|
341
|
-
* import { extractStructuredData } from '@intuned/browser/ai';
|
|
342
|
-
*
|
|
343
|
-
* const mixedContent = [
|
|
344
|
-
* { type: "text", data: "Product: iPhone 15" },
|
|
345
|
-
* { type: "image-url", image_type: "jpeg", data: "https://mintcdn.com/intuned-7/asXJUUPBWwDlStUB/logo/light.svg?fit=max&auto=format&n=asXJUUPBWwDlStUB&q=85&s=6525c0b299b3226464eba6afa9b7ebe6" }
|
|
346
|
-
* ];
|
|
414
|
+
* ```typescript List Extraction from Text Content
|
|
415
|
+
* import { extractStructuredData, TextContentItem } from '@intuned/browser/ai';
|
|
416
|
+
* import { BrowserContext, Page } from "playwright";
|
|
347
417
|
*
|
|
418
|
+
* interface Params {}
|
|
348
419
|
*
|
|
349
|
-
*
|
|
350
|
-
*
|
|
351
|
-
*
|
|
352
|
-
*
|
|
353
|
-
*
|
|
354
|
-
*
|
|
355
|
-
*
|
|
356
|
-
*
|
|
357
|
-
*
|
|
420
|
+
* export default async function handler(params: Params, page: Page, context: BrowserContext){
|
|
421
|
+
* const textContent: TextContentItem = {
|
|
422
|
+
* type: "text",
|
|
423
|
+
* data: "iPhone 15 - $999, Samsung Galaxy - $899, Pixel 8 - $699"
|
|
424
|
+
* };
|
|
425
|
+
*
|
|
426
|
+
* const products = await extractStructuredData({
|
|
427
|
+
* content: textContent,
|
|
428
|
+
* model: "gpt-4o",
|
|
429
|
+
* dataSchema: {
|
|
430
|
+
* type: "object",
|
|
431
|
+
* properties: {
|
|
432
|
+
* products: {
|
|
433
|
+
* type: "array",
|
|
434
|
+
* items: {
|
|
435
|
+
* type: "object",
|
|
436
|
+
* properties: {
|
|
437
|
+
* name: { type: "string" },
|
|
438
|
+
* price: { type: "string" }
|
|
439
|
+
* }
|
|
440
|
+
* }
|
|
441
|
+
* }
|
|
442
|
+
* }
|
|
443
|
+
* },
|
|
444
|
+
* prompt: "Extract all products"
|
|
445
|
+
* });
|
|
446
|
+
*
|
|
447
|
+
* for (const product of products.products) {
|
|
448
|
+
* console.log(`${product.name}: ${product.price}`);
|
|
358
449
|
* }
|
|
359
|
-
* },
|
|
360
|
-
* maxRetries: 1,
|
|
361
|
-
* enableCache: true
|
|
362
|
-
* });
|
|
363
450
|
* }
|
|
364
451
|
* ```
|
|
365
452
|
*/
|
|
@@ -369,7 +456,7 @@ export declare function extractStructuredData(options: {
|
|
|
369
456
|
prompt?: string;
|
|
370
457
|
maxRetries?: number;
|
|
371
458
|
enableCache?: boolean;
|
|
372
|
-
model
|
|
459
|
+
model?: string;
|
|
373
460
|
apiKey?: string;
|
|
374
461
|
}): Promise<any>;
|
|
375
462
|
|
|
@@ -380,21 +467,27 @@ export declare function extractStructuredData(options: {
|
|
|
380
467
|
* @param {Object} input - Input object containing the page to check
|
|
381
468
|
* @param {Page} input.page - The Playwright page to check
|
|
382
469
|
* @param {number} [input.timeoutInMs=10000] - Screenshot timeout in milliseconds. Defaults to 10000
|
|
383
|
-
* @param {string} [input.model="
|
|
384
|
-
* @param {string} [input.apiKey] - Optional API key for the AI
|
|
385
|
-
* @returns {Promise<boolean>} Promise resolving to true if page is loaded, false if still loading
|
|
470
|
+
* @param {string} [input.model="gpt-5-mini-2025-08-07"] - AI model to use for the check. Defaults to "gpt-5-mini-2025-08-07"
|
|
471
|
+
* @param {string} [input.apiKey] - Optional API key for the AI call.
|
|
472
|
+
* @returns {Promise<boolean>} Promise resolving to true if page is loaded, false if still loading.
|
|
386
473
|
* @example
|
|
387
474
|
* ```typescript Check Page Loading
|
|
388
475
|
* import { isPageLoaded } from "@intuned/browser/ai";
|
|
389
|
-
*
|
|
476
|
+
* import { BrowserContext, Page } from "playwright";
|
|
477
|
+
*
|
|
478
|
+
* interface Params {}
|
|
479
|
+
*
|
|
480
|
+
* export default async function handler(params: Params, page: Page, context: BrowserContext){
|
|
390
481
|
* // Wait for page to finish loading
|
|
391
|
-
* await page.goto('https://
|
|
482
|
+
* await page.goto('https://sandbox.intuned.dev/');
|
|
392
483
|
*
|
|
393
484
|
* const pageLoaded = await isPageLoaded({page});
|
|
394
485
|
* if (pageLoaded) {
|
|
395
486
|
* // Continue with scraping or interactions
|
|
487
|
+
* console.log("Page is loaded");
|
|
396
488
|
* } else {
|
|
397
489
|
* // Wait longer or retry
|
|
490
|
+
* await page.waitForTimeout(5000);
|
|
398
491
|
* }
|
|
399
492
|
* }
|
|
400
493
|
* ```
|
|
@@ -402,24 +495,27 @@ export declare function extractStructuredData(options: {
|
|
|
402
495
|
* @example
|
|
403
496
|
* ```typescript Loading Loop
|
|
404
497
|
* import { isPageLoaded } from "@intuned/browser/ai";
|
|
405
|
-
*
|
|
498
|
+
* import { BrowserContext, Page } from "playwright";
|
|
499
|
+
*
|
|
500
|
+
* interface Params {}
|
|
501
|
+
*
|
|
502
|
+
* export default async function handler(params: Params, page: Page, context: BrowserContext){
|
|
406
503
|
* // Keep checking until page loads
|
|
407
504
|
* await page.goto("https://example.com");
|
|
408
505
|
* let attempts = 0;
|
|
409
|
-
* while (attempts < 10) {
|
|
506
|
+
* while (attempts < 10) { // We will retry up to 10 times with a 2-second delay between attempts.
|
|
410
507
|
* const pageLoaded = await isPageLoaded({
|
|
411
508
|
* page,
|
|
412
|
-
* model: "claude-
|
|
509
|
+
* model: "claude-3-7-sonnet-latest",
|
|
413
510
|
* timeoutInMs: 5000
|
|
414
511
|
* });
|
|
415
|
-
* if (pageLoaded) break;
|
|
512
|
+
* if (pageLoaded) break; // If the page is loaded, break the loop.
|
|
416
513
|
*
|
|
417
|
-
* await page.waitForTimeout(2000);
|
|
514
|
+
* await page.waitForTimeout(2000); // Wait for 2 seconds before the next attempt.
|
|
418
515
|
* attempts++;
|
|
419
516
|
* }
|
|
420
517
|
* }
|
|
421
518
|
* ```
|
|
422
|
-
* }
|
|
423
519
|
*/
|
|
424
520
|
export declare function isPageLoaded(input: {
|
|
425
521
|
page: Page;
|
|
@@ -429,56 +525,108 @@ export declare function isPageLoaded(input: {
|
|
|
429
525
|
}): Promise<boolean>;
|
|
430
526
|
|
|
431
527
|
/**
|
|
432
|
-
*
|
|
433
|
-
* Used when passing text data directly to extractStructuredData without a page source.
|
|
528
|
+
* Text content item for content-based extraction.
|
|
434
529
|
*
|
|
435
530
|
* @interface TextContentItem
|
|
436
|
-
* @property {string} type - The type of the content item, which is always "text"
|
|
437
|
-
* @property {string} data - The text content to extract data from
|
|
438
531
|
*/
|
|
439
532
|
export interface TextContentItem {
|
|
533
|
+
/** The type of the content item, which is always "text". */
|
|
440
534
|
type: "text";
|
|
535
|
+
/** The text data to extract from. */
|
|
441
536
|
data: string;
|
|
442
537
|
}
|
|
443
538
|
|
|
444
539
|
/**
|
|
445
|
-
*
|
|
446
|
-
* Used when passing image data directly to extractStructuredData without a page source.
|
|
447
|
-
* The image will be analyzed by AI vision models for data extraction.
|
|
540
|
+
* Image buffer content item for content-based extraction.
|
|
448
541
|
*
|
|
449
542
|
* @interface ImageBufferContentItem
|
|
450
|
-
* @property {string} type - The type of the content item, which is always "image-buffer"
|
|
451
|
-
* @property {string} image_type - The image format (e.g., "png", "jpeg", "gif", "webp")
|
|
452
|
-
* @property {Buffer} data - The Buffer containing the raw image data
|
|
453
543
|
*/
|
|
454
544
|
export interface ImageBufferContentItem {
|
|
545
|
+
/** The type of the content item, which is always "image-buffer". */
|
|
455
546
|
type: "image-buffer";
|
|
547
|
+
/** The image format (e.g., "png", "jpeg", "gif", "webp"). */
|
|
456
548
|
image_type: "png" | "jpeg" | "gif" | "webp";
|
|
549
|
+
/** The Buffer containing the raw image data. */
|
|
457
550
|
data: Buffer;
|
|
458
551
|
}
|
|
459
552
|
|
|
460
553
|
/**
|
|
461
|
-
*
|
|
462
|
-
* Used when passing image URLs directly to extractStructuredData without a page source.
|
|
463
|
-
* The image will be fetched from the URL and analyzed by AI vision models for data extraction.
|
|
554
|
+
* Image URL content item for content-based extraction.
|
|
464
555
|
*
|
|
465
556
|
* @interface ImageUrlContentItem
|
|
466
|
-
* @property {string} type - The type of the content item, which is always "image-url"
|
|
467
|
-
* @property {string} image_type - The image format (e.g., "png", "jpeg", "gif", "webp")
|
|
468
|
-
* @property {string} data - The URL of the image to fetch and analyze
|
|
469
557
|
*/
|
|
470
558
|
export interface ImageUrlContentItem {
|
|
559
|
+
/** The type of the content item, which is always "image-url". */
|
|
471
560
|
type: "image-url";
|
|
561
|
+
/** The image format (e.g., "png", "jpeg", "gif", "webp"). */
|
|
472
562
|
image_type: "png" | "jpeg" | "gif" | "webp";
|
|
563
|
+
/** The URL of the image. */
|
|
473
564
|
data: string;
|
|
474
565
|
}
|
|
475
566
|
|
|
476
567
|
/**
|
|
477
|
-
*
|
|
568
|
+
* A union type representing content items for AI data extraction from various content types.
|
|
569
|
+
*
|
|
570
|
+
* This type alias defines the complete set of content types supported by the content-based
|
|
571
|
+
* extractStructuredData function for extracting data from text, image buffers, or image URLs
|
|
572
|
+
* without requiring a page source.
|
|
573
|
+
*
|
|
574
|
+
* **Type variants:**
|
|
575
|
+
* - `TextContentItem`: [TextContentItem](../type-references/TextContentItem) for text data extraction
|
|
576
|
+
* - `ImageBufferContentItem`: [ImageBufferContentItem](../type-references/ImageBufferContentItem) for image data stored as Buffer
|
|
577
|
+
* - `ImageUrlContentItem`: [ImageUrlContentItem](../type-references/ImageUrlContentItem) for image data accessible via URL
|
|
578
|
+
*
|
|
478
579
|
* @type ContentItem
|
|
479
|
-
*
|
|
480
|
-
* @
|
|
481
|
-
*
|
|
580
|
+
*
|
|
581
|
+
* @example
|
|
582
|
+
* ```typescript Text Content
|
|
583
|
+
* import { TextContentItem } from "@intuned/browser/ai";
|
|
584
|
+
* import { BrowserContext, Page } from "playwright";
|
|
585
|
+
*
|
|
586
|
+
* interface Params {}
|
|
587
|
+
*
|
|
588
|
+
* export default async function handler(params: Params, page: Page, context: BrowserContext){
|
|
589
|
+
* const textContent: TextContentItem = {
|
|
590
|
+
* type: "text",
|
|
591
|
+
* data: "John Doe, age 30, works as a Software Engineer at Tech Corp"
|
|
592
|
+
* };
|
|
593
|
+
* }
|
|
594
|
+
* ```
|
|
595
|
+
*
|
|
596
|
+
* @example
|
|
597
|
+
* ```typescript Image Buffer Content
|
|
598
|
+
* import { ImageBufferContentItem } from "@intuned/browser/ai";
|
|
599
|
+
* import { BrowserContext, Page } from "playwright";
|
|
600
|
+
*
|
|
601
|
+
* interface Params {}
|
|
602
|
+
*
|
|
603
|
+
* export default async function handler(params: Params, page: Page, context: BrowserContext){
|
|
604
|
+
* // Assuming you have image data as Buffer
|
|
605
|
+
* const imageData = fs.readFileSync("image.png");
|
|
606
|
+
*
|
|
607
|
+
* const imageContent: ImageBufferContentItem = {
|
|
608
|
+
* type: "image-buffer",
|
|
609
|
+
* image_type: "png",
|
|
610
|
+
* data: imageData
|
|
611
|
+
* };
|
|
612
|
+
* }
|
|
613
|
+
* ```
|
|
614
|
+
*
|
|
615
|
+
* @example
|
|
616
|
+
* ```typescript Image URL Content
|
|
617
|
+
* import { ImageUrlContentItem } from "@intuned/browser/ai";
|
|
618
|
+
* import { BrowserContext, Page } from "playwright";
|
|
619
|
+
*
|
|
620
|
+
* interface Params {}
|
|
621
|
+
*
|
|
622
|
+
* export default async function handler(params: Params, page: Page, context: BrowserContext){
|
|
623
|
+
* const imageContent: ImageUrlContentItem = {
|
|
624
|
+
* type: "image-url",
|
|
625
|
+
* image_type: "jpeg",
|
|
626
|
+
* data: "https://example.com/image.jpg"
|
|
627
|
+
* };
|
|
628
|
+
* }
|
|
629
|
+
* ```
|
|
482
630
|
*/
|
|
483
631
|
export type ContentItem =
|
|
484
632
|
| TextContentItem
|
|
@@ -154,7 +154,30 @@ async function extractStructuredDataUsingAi(input) {
|
|
|
154
154
|
});
|
|
155
155
|
_Logger.logger.info(`Extraction failed,
|
|
156
156
|
Total LLM ${isGateway ? "Cost In Cents" : "Tokens"}: ${accumulatedTokens}`);
|
|
157
|
-
|
|
157
|
+
let errorMessage = "Unknown error during extraction";
|
|
158
|
+
if (error instanceof Error) {
|
|
159
|
+
errorMessage = error.message;
|
|
160
|
+
const apiError = error;
|
|
161
|
+
if (apiError.responseBody) {
|
|
162
|
+
try {
|
|
163
|
+
const responseBody = JSON.parse(apiError.responseBody);
|
|
164
|
+
if (responseBody.error) {
|
|
165
|
+
if (typeof responseBody.error === "string") {
|
|
166
|
+
errorMessage = responseBody.error;
|
|
167
|
+
} else if (responseBody.error.message) {
|
|
168
|
+
errorMessage = responseBody.error.message;
|
|
169
|
+
} else {
|
|
170
|
+
errorMessage = JSON.stringify(responseBody.error);
|
|
171
|
+
}
|
|
172
|
+
}
|
|
173
|
+
} catch {
|
|
174
|
+
if (typeof apiError.responseBody === "string") {
|
|
175
|
+
errorMessage = apiError.responseBody;
|
|
176
|
+
}
|
|
177
|
+
}
|
|
178
|
+
}
|
|
179
|
+
}
|
|
180
|
+
return (0, _neverthrow.err)(Errors.invalidExtractionResult(errorMessage));
|
|
158
181
|
}
|
|
159
182
|
}
|
|
160
183
|
_Logger.logger.info(`Extraction failed.
|