@intuned/browser-dev 0.1.7-dev.0 → 0.1.8-dev.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/ai/export.d.ts +1 -1
- package/dist/ai/index.d.ts +1 -1
- package/dist/helpers/export.d.ts +2 -3
- package/dist/helpers/gotoUrl.js +50 -51
- package/dist/helpers/index.d.ts +2 -3
- package/dist/helpers/tests/testClickUntilExhausted.spec.js +2 -1
- package/package.json +1 -2
- package/generated-docs/ai/functions/extractStructuredData.mdx +0 -255
- package/generated-docs/ai/functions/isPageLoaded.mdx +0 -89
- package/generated-docs/ai/interfaces/ArraySchema.mdx +0 -36
- package/generated-docs/ai/interfaces/BasicSchema.mdx +0 -14
- package/generated-docs/ai/interfaces/BooleanSchema.mdx +0 -28
- package/generated-docs/ai/interfaces/ImageBufferContentItem.mdx +0 -16
- package/generated-docs/ai/interfaces/ImageUrlContentItem.mdx +0 -16
- package/generated-docs/ai/interfaces/NumberSchema.mdx +0 -35
- package/generated-docs/ai/interfaces/ObjectSchema.mdx +0 -39
- package/generated-docs/ai/interfaces/StringSchema.mdx +0 -35
- package/generated-docs/ai/interfaces/TextContentItem.mdx +0 -14
- package/generated-docs/ai/type-aliases/ContentItem.mdx +0 -12
- package/generated-docs/ai/type-aliases/JsonSchema.mdx +0 -47
- package/generated-docs/ai/type-aliases/SUPPORTED_MODELS.mdx +0 -85
- package/generated-docs/helpers/functions/clickButtonAndWait.mdx +0 -63
- package/generated-docs/helpers/functions/clickUntilExhausted.mdx +0 -112
- package/generated-docs/helpers/functions/downloadFile.mdx +0 -99
- package/generated-docs/helpers/functions/extractMarkdown.mdx +0 -56
- package/generated-docs/helpers/functions/filterEmptyValues.mdx +0 -51
- package/generated-docs/helpers/functions/goToUrl.mdx +0 -124
- package/generated-docs/helpers/functions/processDate.mdx +0 -55
- package/generated-docs/helpers/functions/resolveUrl.mdx +0 -165
- package/generated-docs/helpers/functions/sanitizeHtml.mdx +0 -113
- package/generated-docs/helpers/functions/saveFileToS3.mdx +0 -127
- package/generated-docs/helpers/functions/scrollToLoadContent.mdx +0 -83
- package/generated-docs/helpers/functions/uploadFileToS3.mdx +0 -121
- package/generated-docs/helpers/functions/validateDataUsingSchema.mdx +0 -90
- package/generated-docs/helpers/functions/waitForDomSettled.mdx +0 -91
- package/generated-docs/helpers/functions/withNetworkSettledWait.mdx +0 -76
- package/generated-docs/helpers/interfaces/Attachment.mdx +0 -56
- package/generated-docs/helpers/interfaces/S3Configs.mdx +0 -52
- package/generated-docs/helpers/interfaces/SanitizeHtmlOptions.mdx +0 -22
- package/generated-docs/helpers/type-aliases/AttachmentType.mdx +0 -10
- package/generated-docs/helpers/type-aliases/FileType.mdx +0 -61
- package/generated-docs/helpers/type-aliases/Trigger.mdx +0 -62
package/dist/ai/export.d.ts
CHANGED
|
@@ -380,7 +380,7 @@ export declare function extractStructuredData(options: {
|
|
|
380
380
|
* @param {Object} input - Input object containing the page to check
|
|
381
381
|
* @param {Page} input.page - The Playwright page to check
|
|
382
382
|
* @param {number} [input.timeoutInMs=10000] - Screenshot timeout in milliseconds. Defaults to 10000
|
|
383
|
-
* @param {string} [input.model="
|
|
383
|
+
* @param {string} [input.model="gpt-5-mini-2025-08-07"] - AI model to use for the check. Defaults to "gpt-5-mini-2025-08-07"
|
|
384
384
|
* @param {string} [input.apiKey] - Optional API key for the AI service (if provided, will not be billed to your account)
|
|
385
385
|
* @returns {Promise<boolean>} Promise resolving to true if page is loaded, false if still loading
|
|
386
386
|
* @example
|
package/dist/ai/index.d.ts
CHANGED
|
@@ -380,7 +380,7 @@ export declare function extractStructuredData(options: {
|
|
|
380
380
|
* @param {Object} input - Input object containing the page to check
|
|
381
381
|
* @param {Page} input.page - The Playwright page to check
|
|
382
382
|
* @param {number} [input.timeoutInMs=10000] - Screenshot timeout in milliseconds. Defaults to 10000
|
|
383
|
-
* @param {string} [input.model="
|
|
383
|
+
* @param {string} [input.model="gpt-5-mini-2025-08-07"] - AI model to use for the check. Defaults to "gpt-5-mini-2025-08-07"
|
|
384
384
|
* @param {string} [input.apiKey] - Optional API key for the AI service (if provided, will not be billed to your account)
|
|
385
385
|
* @returns {Promise<boolean>} Promise resolving to true if page is loaded, false if still loading
|
|
386
386
|
* @example
|
package/dist/helpers/export.d.ts
CHANGED
|
@@ -3,7 +3,6 @@
|
|
|
3
3
|
import type { Locator, Page, ElementHandle } from "playwright";
|
|
4
4
|
import type { ReadStream } from "fs";
|
|
5
5
|
import { Download } from "playwright";
|
|
6
|
-
import { SUPPORTED_MODELS } from "../ai/export";
|
|
7
6
|
|
|
8
7
|
/**
|
|
9
8
|
* Configuration options for sanitizing HTML content.
|
|
@@ -204,7 +203,7 @@ export declare function filterEmptyValues<T>(input: { data: T }): T;
|
|
|
204
203
|
* @param {string} [input.waitForLoadState="load"] - When to consider navigation succeeded. Options: "load", "domcontentloaded", "networkidle", "commit". Defaults to "load"
|
|
205
204
|
* @param {boolean} [input.throwOnTimeout=true] - Whether to throw an error if navigation times out. When false, the function returns without throwing, allowing continued execution. Defaults to true.
|
|
206
205
|
* @param {boolean} [input.waitForLoadingStateUsingAi=false] - When true, uses AI vision to verify the page is fully loaded by checking for loading spinners, blank content, or incomplete states. Retries up to 4 times with 5-second delays. Defaults to false
|
|
207
|
-
* @param {
|
|
206
|
+
* @param {string} [input.model="gpt-5-mini-2025-08-07"] - AI model to use for loading verification. Defaults to "gpt-5-mini-2025-08-07"
|
|
208
207
|
* @param {string} [input.apiKey] - Optional API key for the AI service (if provided, will not be billed to your account)
|
|
209
208
|
* @returns {Promise<void>} Promise that resolves when navigation completes successfully. If the operation fails and `throwOnTimeout` is false, resolves without error
|
|
210
209
|
*
|
|
@@ -261,7 +260,7 @@ export declare function goToUrl(input: {
|
|
|
261
260
|
throwOnTimeout?: boolean;
|
|
262
261
|
waitForLoadState?: "load" | "domcontentloaded" | "networkidle";
|
|
263
262
|
waitForLoadingStateUsingAi?: boolean;
|
|
264
|
-
model?:
|
|
263
|
+
model?: string;
|
|
265
264
|
apiKey?: string;
|
|
266
265
|
}): Promise<void>;
|
|
267
266
|
|
package/dist/helpers/gotoUrl.js
CHANGED
|
@@ -9,7 +9,7 @@ var _asyncRetry = _interopRequireDefault(require("async-retry"));
|
|
|
9
9
|
var _playwright = require("playwright");
|
|
10
10
|
var _isPageLoaded = require("../ai/isPageLoaded");
|
|
11
11
|
var _Logger = require("../common/Logger");
|
|
12
|
-
var
|
|
12
|
+
var _withNetworkSettledWait = require("./withNetworkSettledWait");
|
|
13
13
|
function _interopRequireDefault(e) { return e && e.__esModule ? e : { default: e }; }
|
|
14
14
|
const DEFAULT_PLAYWRIGHT_TIMEOUT = 30000;
|
|
15
15
|
const TIMEOUT_PADDING = 3000;
|
|
@@ -17,7 +17,7 @@ const goToUrl = async input => {
|
|
|
17
17
|
const {
|
|
18
18
|
page,
|
|
19
19
|
url,
|
|
20
|
-
throwOnTimeout =
|
|
20
|
+
throwOnTimeout = false,
|
|
21
21
|
waitForLoadingStateUsingAi = false,
|
|
22
22
|
retries = 3,
|
|
23
23
|
model = "gpt-5-mini-2025-08-07",
|
|
@@ -28,63 +28,62 @@ const goToUrl = async input => {
|
|
|
28
28
|
const timeoutInMs = getPageGotoTimeout(page, {
|
|
29
29
|
timeoutInMs: input.timeoutInMs
|
|
30
30
|
});
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
31
|
+
return await (0, _withNetworkSettledWait.withNetworkSettledWait)(async () => {
|
|
32
|
+
let responseOrTimeout;
|
|
33
|
+
try {
|
|
34
|
+
await (0, _asyncRetry.default)(async () => {
|
|
35
|
+
const promises = [page.goto(url, {
|
|
36
|
+
waitUntil: waitForLoadState,
|
|
37
|
+
timeout: timeoutInMs !== undefined ? timeoutInMs : undefined
|
|
38
|
+
})];
|
|
39
|
+
if (timeoutInMs !== undefined) {
|
|
40
|
+
promises.push((0, _promises.setTimeout)(timeoutInMs + TIMEOUT_PADDING, timeoutSymbol));
|
|
41
|
+
}
|
|
42
|
+
responseOrTimeout = await Promise.race(promises);
|
|
43
|
+
if (responseOrTimeout === timeoutSymbol) {
|
|
44
|
+
throw new _playwright.errors.TimeoutError("Page.goto timed out but did not throw an error. Consider using a proxy.\n" + `(URL: ${url}, timeout: ${timeoutInMs}ms)`);
|
|
45
|
+
}
|
|
46
|
+
}, {
|
|
47
|
+
retries,
|
|
48
|
+
factor: 2,
|
|
49
|
+
minTimeout: 1000
|
|
50
|
+
});
|
|
51
|
+
} catch (error) {
|
|
52
|
+
if (!throwOnTimeout) {
|
|
53
|
+
return;
|
|
44
54
|
}
|
|
55
|
+
throw error;
|
|
56
|
+
}
|
|
57
|
+
if (!waitForLoadingStateUsingAi) {
|
|
58
|
+
return responseOrTimeout;
|
|
59
|
+
}
|
|
60
|
+
for (let i = 0; i < retries; i++) {
|
|
61
|
+
let isLoaded = false;
|
|
45
62
|
try {
|
|
46
|
-
await
|
|
47
|
-
|
|
63
|
+
isLoaded = await (0, _isPageLoaded.isPageLoaded)({
|
|
64
|
+
page,
|
|
65
|
+
timeoutInMs,
|
|
66
|
+
model,
|
|
67
|
+
apiKey: apiKey ? apiKey : undefined
|
|
48
68
|
});
|
|
69
|
+
if (isLoaded === true) {
|
|
70
|
+
return;
|
|
71
|
+
}
|
|
49
72
|
} catch (error) {
|
|
50
|
-
_Logger.logger.
|
|
73
|
+
_Logger.logger.debug(`Failed to check if page is loaded: ${url}. Error: ${error}`);
|
|
74
|
+
isLoaded = false;
|
|
51
75
|
}
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
factor: 2,
|
|
55
|
-
minTimeout: 1000
|
|
56
|
-
});
|
|
57
|
-
} catch (error) {
|
|
58
|
-
if (!throwOnTimeout) {
|
|
59
|
-
return;
|
|
60
|
-
}
|
|
61
|
-
throw error;
|
|
62
|
-
}
|
|
63
|
-
if (!waitForLoadingStateUsingAi && !(0, _utils.isGenerateCodeMode)()) {
|
|
64
|
-
return responseOrTimeout;
|
|
65
|
-
}
|
|
66
|
-
for (let i = 0; i < retries; i++) {
|
|
67
|
-
try {
|
|
68
|
-
const isLoaded = await (0, _isPageLoaded.isPageLoaded)({
|
|
69
|
-
page,
|
|
70
|
-
timeoutInMs,
|
|
71
|
-
model,
|
|
72
|
-
apiKey: apiKey ? apiKey : undefined
|
|
73
|
-
});
|
|
74
|
-
if (isLoaded === true) {
|
|
76
|
+
if (i === retries - 1) {
|
|
77
|
+
_Logger.logger.warn("Page never loaded, url: " + url);
|
|
75
78
|
return;
|
|
76
79
|
}
|
|
77
|
-
|
|
78
|
-
_Logger.logger.error(`Error in AI page load detection for URL: ${url}. Error: ${error}`);
|
|
79
|
-
_Logger.logger.warn("AI page load detection failed. Continuing without AI verification.");
|
|
80
|
-
return;
|
|
80
|
+
await (0, _promises.setTimeout)(5000);
|
|
81
81
|
}
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
}
|
|
82
|
+
}, {
|
|
83
|
+
page,
|
|
84
|
+
maxInflightRequests: 0,
|
|
85
|
+
timeoutInMs: 30000
|
|
86
|
+
});
|
|
88
87
|
};
|
|
89
88
|
exports.goToUrl = goToUrl;
|
|
90
89
|
function getPageGotoTimeout(page, options) {
|
package/dist/helpers/index.d.ts
CHANGED
|
@@ -3,7 +3,6 @@
|
|
|
3
3
|
import type { Locator, Page, ElementHandle } from "playwright";
|
|
4
4
|
import type { ReadStream } from "fs";
|
|
5
5
|
import { Download } from "playwright";
|
|
6
|
-
import { SUPPORTED_MODELS } from "../ai/export";
|
|
7
6
|
|
|
8
7
|
/**
|
|
9
8
|
* Configuration options for sanitizing HTML content.
|
|
@@ -204,7 +203,7 @@ export declare function filterEmptyValues<T>(input: { data: T }): T;
|
|
|
204
203
|
* @param {string} [input.waitForLoadState="load"] - When to consider navigation succeeded. Options: "load", "domcontentloaded", "networkidle", "commit". Defaults to "load"
|
|
205
204
|
* @param {boolean} [input.throwOnTimeout=true] - Whether to throw an error if navigation times out. When false, the function returns without throwing, allowing continued execution. Defaults to true.
|
|
206
205
|
* @param {boolean} [input.waitForLoadingStateUsingAi=false] - When true, uses AI vision to verify the page is fully loaded by checking for loading spinners, blank content, or incomplete states. Retries up to 4 times with 5-second delays. Defaults to false
|
|
207
|
-
* @param {
|
|
206
|
+
* @param {string} [input.model="gpt-5-mini-2025-08-07"] - AI model to use for loading verification. Defaults to "gpt-5-mini-2025-08-07"
|
|
208
207
|
* @param {string} [input.apiKey] - Optional API key for the AI service (if provided, will not be billed to your account)
|
|
209
208
|
* @returns {Promise<void>} Promise that resolves when navigation completes successfully. If the operation fails and `throwOnTimeout` is false, resolves without error
|
|
210
209
|
*
|
|
@@ -261,7 +260,7 @@ export declare function goToUrl(input: {
|
|
|
261
260
|
throwOnTimeout?: boolean;
|
|
262
261
|
waitForLoadState?: "load" | "domcontentloaded" | "networkidle";
|
|
263
262
|
waitForLoadingStateUsingAi?: boolean;
|
|
264
|
-
model?:
|
|
263
|
+
model?: string;
|
|
265
264
|
apiKey?: string;
|
|
266
265
|
}): Promise<void>;
|
|
267
266
|
|
|
@@ -231,7 +231,8 @@ const noChangeThresholdHtml = `
|
|
|
231
231
|
const buttonLocator = page.locator("#load-more");
|
|
232
232
|
await (0, _.clickUntilExhausted)({
|
|
233
233
|
page,
|
|
234
|
-
buttonLocator
|
|
234
|
+
buttonLocator,
|
|
235
|
+
maxClicks: 10
|
|
235
236
|
});
|
|
236
237
|
const finalCount = await page.locator(".item").count();
|
|
237
238
|
(0, _extendedTest.expect)(finalCount).toBe(10);
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@intuned/browser-dev",
|
|
3
|
-
"version": "0.1.
|
|
3
|
+
"version": "0.1.8-dev.0",
|
|
4
4
|
"description": "runner package for intuned functions",
|
|
5
5
|
"types": "./dist/index.d.ts",
|
|
6
6
|
"typesVersions": {
|
|
@@ -59,7 +59,6 @@
|
|
|
59
59
|
"@anthropic-ai/sdk": "0.22.0",
|
|
60
60
|
"@aws-sdk/client-s3": "3.821.0",
|
|
61
61
|
"@aws-sdk/s3-request-presigner": "3.821.0",
|
|
62
|
-
"@intuned/runtime": "^1.3.12",
|
|
63
62
|
"ai": "5.0.15",
|
|
64
63
|
"ajv": "8.13.0",
|
|
65
64
|
"ajv-formats": "2.1.1",
|
|
@@ -1,255 +0,0 @@
|
|
|
1
|
-
---
|
|
2
|
-
title: extractStructuredData
|
|
3
|
-
description: ""
|
|
4
|
-
---
|
|
5
|
-
|
|
6
|
-
<Tabs>
|
|
7
|
-
|
|
8
|
-
<Tab title="From Page or Locator">
|
|
9
|
-
|
|
10
|
-
```typescript
|
|
11
|
-
export declare function extractStructuredData(options: {
|
|
12
|
-
source: Page | Locator;
|
|
13
|
-
dataSchema: JsonSchema | z.ZodSchema;
|
|
14
|
-
prompt?: string;
|
|
15
|
-
strategy?: "IMAGE" | "MARKDOWN" | "HTML";
|
|
16
|
-
enableDomMatching?: boolean;
|
|
17
|
-
enableCache?: boolean;
|
|
18
|
-
maxRetries?: number;
|
|
19
|
-
model?: SUPPORTED_MODELS;
|
|
20
|
-
apiKey?: string;
|
|
21
|
-
}): Promise<any>;
|
|
22
|
-
```
|
|
23
|
-
|
|
24
|
-
Extract structured data from web pages using AI-powered content analysis.
|
|
25
|
-
|
|
26
|
-
## Examples
|
|
27
|
-
|
|
28
|
-
<CodeGroup>
|
|
29
|
-
|
|
30
|
-
```typescript Page source
|
|
31
|
-
import { extractStructuredData } from '@intuned/browser/ai';
|
|
32
|
-
export default async function handler(params, page, context){
|
|
33
|
-
await page.goto("https://books.toscrape.com/")
|
|
34
|
-
const product = await extractStructuredData({
|
|
35
|
-
source: page,
|
|
36
|
-
strategy: "HTML",
|
|
37
|
-
model: "gpt-4o",
|
|
38
|
-
dataSchema: {
|
|
39
|
-
type: "object",
|
|
40
|
-
properties: {
|
|
41
|
-
name: { type: "string" },
|
|
42
|
-
price: { type: "string" },
|
|
43
|
-
description: { type: "string" },
|
|
44
|
-
inStock: { type: "boolean" }
|
|
45
|
-
},
|
|
46
|
-
required: ["name", "price"]
|
|
47
|
-
},
|
|
48
|
-
prompt: "Extract product details from this e page"
|
|
49
|
-
});
|
|
50
|
-
console.log(`Found book: ${product.name} - ${product.price}`);
|
|
51
|
-
}
|
|
52
|
-
```
|
|
53
|
-
|
|
54
|
-
```typescript Locator source
|
|
55
|
-
import { extractStructuredData } from '@intuned/browser/ai';
|
|
56
|
-
export default async function handler(params, page, context){
|
|
57
|
-
await page.goto("https://books.toscrape.com/")
|
|
58
|
-
const articleContainer = page.locator("article").first()
|
|
59
|
-
const article = await extractStructuredData({
|
|
60
|
-
source: articleContainer,
|
|
61
|
-
strategy: "MARKDOWN",
|
|
62
|
-
model: "claude-3-5-sonnet-20240620",
|
|
63
|
-
dataSchema: {
|
|
64
|
-
type: "object",
|
|
65
|
-
properties: {
|
|
66
|
-
title: { type: "string" },
|
|
67
|
-
author: { type: "string" },
|
|
68
|
-
publishDate: { type: "string" },
|
|
69
|
-
content: { type: "string" },
|
|
70
|
-
},
|
|
71
|
-
required: ["title"]
|
|
72
|
-
},
|
|
73
|
-
maxRetries: 5
|
|
74
|
-
});
|
|
75
|
-
console.log(`Found book: ${article.title}`);
|
|
76
|
-
}
|
|
77
|
-
```
|
|
78
|
-
|
|
79
|
-
</CodeGroup>
|
|
80
|
-
|
|
81
|
-
## Arguments
|
|
82
|
-
|
|
83
|
-
<ParamField path="options" type="Object" required
|
|
84
|
-
>
|
|
85
|
-
Configuration object containing extraction parameters
|
|
86
|
-
|
|
87
|
-
<Expandable title="options">
|
|
88
|
-
<ParamField path="options.source" type="Page | Locator">
|
|
89
|
-
Playwright Page object to extract data from the entire page or Locator object to extract data from a specific element
|
|
90
|
-
</ParamField>
|
|
91
|
-
|
|
92
|
-
<ParamField path="options.dataSchema" type="JsonSchema | z.ZodSchema">
|
|
93
|
-
JsonSchema defining the structure of the data to extract. This can be a JsonSchema or ZodSchema
|
|
94
|
-
</ParamField>
|
|
95
|
-
|
|
96
|
-
<ParamField path="options.strategy" type="string">
|
|
97
|
-
Type of extraction: "HTML", "IMAGE", or "MARKDOWN". Defaults to "HTML"
|
|
98
|
-
</ParamField>
|
|
99
|
-
|
|
100
|
-
<ParamField path="options.prompt" type="string">
|
|
101
|
-
Optional prompt to guide the extraction process and provide more context
|
|
102
|
-
</ParamField>
|
|
103
|
-
|
|
104
|
-
<ParamField path="options.enableDomMatching" type="boolean">
|
|
105
|
-
Whether to enable DOM element matching during extraction. Defaults to false. When set to true, all types in the schema must be strings to match with the DOM elements. The extracted results will be matched with the DOM elements and returned, then cached in a smart fashion so that the next time the same data is extracted, the result will be returned from the cache even if the DOM has minor changes.
|
|
106
|
-
</ParamField>
|
|
107
|
-
|
|
108
|
-
<ParamField path="options.enableCache" type="boolean">
|
|
109
|
-
Whether to enable caching of the extracted data. Defaults to true
|
|
110
|
-
</ParamField>
|
|
111
|
-
|
|
112
|
-
<ParamField path="options.maxRetries" type="number">
|
|
113
|
-
Maximum number of retry attempts on failures. Failures can be validation errors, API errors, output errors, etc. Defaults to 3
|
|
114
|
-
</ParamField>
|
|
115
|
-
|
|
116
|
-
<ParamField path="options.model" type="SUPPORTED_MODELS">
|
|
117
|
-
AI model to use for extraction. See [SUPPORTED_MODELS](../type-aliases/SUPPORTED_MODELS) for all supported models. Defaults to "claude-3-5-haiku-latest"
|
|
118
|
-
</ParamField>
|
|
119
|
-
|
|
120
|
-
<ParamField path="options.apiKey" type="string">
|
|
121
|
-
Optional API key for AI extraction (if provided, will not be billed to your account)
|
|
122
|
-
</ParamField>
|
|
123
|
-
|
|
124
|
-
</Expandable>
|
|
125
|
-
|
|
126
|
-
</ParamField>
|
|
127
|
-
|
|
128
|
-
## Returns: `any`
|
|
129
|
-
|
|
130
|
-
Promise resolving to the extracted structured data matching the provided schema
|
|
131
|
-
|
|
132
|
-
</Tab>
|
|
133
|
-
|
|
134
|
-
<Tab title="From Content">
|
|
135
|
-
|
|
136
|
-
```typescript
|
|
137
|
-
export declare function extractStructuredData(options: {
|
|
138
|
-
content: ContentItem[] | ContentItem;
|
|
139
|
-
dataSchema: JsonSchema | z.ZodSchema;
|
|
140
|
-
prompt?: string;
|
|
141
|
-
maxRetries?: number;
|
|
142
|
-
enableCache?: boolean;
|
|
143
|
-
model: SUPPORTED_MODELS;
|
|
144
|
-
apiKey?: string;
|
|
145
|
-
}): Promise<any>;
|
|
146
|
-
```
|
|
147
|
-
|
|
148
|
-
Extract structured data from content items (text, images) using AI-powered analysis.
|
|
149
|
-
|
|
150
|
-
## Examples
|
|
151
|
-
|
|
152
|
-
<CodeGroup>
|
|
153
|
-
|
|
154
|
-
```typescript Text Content
|
|
155
|
-
import { extractStructuredData } from '@intuned/browser/ai';
|
|
156
|
-
export default async function handler(params, page, context){
|
|
157
|
-
const textContent: TextContentItem = {
|
|
158
|
-
type: "text",
|
|
159
|
-
data: "John Doe, age 30, works as a Software Engineer at Tech Corp"
|
|
160
|
-
};
|
|
161
|
-
|
|
162
|
-
const person = await extractStructuredData({
|
|
163
|
-
content: textContent,
|
|
164
|
-
model: "gpt-4o",
|
|
165
|
-
dataSchema: {
|
|
166
|
-
type: "object",
|
|
167
|
-
properties: {
|
|
168
|
-
name: { type: "string" },
|
|
169
|
-
age: { type: "number" },
|
|
170
|
-
occupation: { type: "string" },
|
|
171
|
-
company: { type: "string" }
|
|
172
|
-
},
|
|
173
|
-
required: ["name"]
|
|
174
|
-
},
|
|
175
|
-
prompt: "Extract person information from the text"
|
|
176
|
-
});
|
|
177
|
-
|
|
178
|
-
console.log(`Found person: ${person.name}, ${person.age} years old`);
|
|
179
|
-
}
|
|
180
|
-
```
|
|
181
|
-
|
|
182
|
-
```typescript Multiple Content Items
|
|
183
|
-
import { extractStructuredData } from '@intuned/browser/ai';
|
|
184
|
-
export default async function handler(params, page, context){
|
|
185
|
-
const mixedContent = [
|
|
186
|
-
{ type: "text", data: "Product: iPhone 15" },
|
|
187
|
-
{ type: "image-url", image_type: "jpeg", data: "https://mintcdn.com/intuned-7/asXJUUPBWwDlStUB/logo/light.svg?fit=max&auto=format&n=asXJUUPBWwDlStUB&q=85&s=6525c0b299b3226464eba6afa9b7ebe6" }
|
|
188
|
-
];
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
const product = await extractStructuredData({
|
|
192
|
-
content: mixedContent,
|
|
193
|
-
model: "claude-3",
|
|
194
|
-
dataSchema: {
|
|
195
|
-
type: "object",
|
|
196
|
-
properties: {
|
|
197
|
-
name: { type: "string" },
|
|
198
|
-
price: { type: "string" },
|
|
199
|
-
features: { type: "array", items: { type: "string" } }
|
|
200
|
-
}
|
|
201
|
-
},
|
|
202
|
-
maxRetries: 1,
|
|
203
|
-
enableCache: true
|
|
204
|
-
});
|
|
205
|
-
}
|
|
206
|
-
```
|
|
207
|
-
|
|
208
|
-
</CodeGroup>
|
|
209
|
-
|
|
210
|
-
## Arguments
|
|
211
|
-
|
|
212
|
-
<ParamField path="options" type="Object" required
|
|
213
|
-
>
|
|
214
|
-
Configuration object containing extraction parameters
|
|
215
|
-
|
|
216
|
-
<Expandable title="options">
|
|
217
|
-
<ParamField path="options.content" type="Array<ContentItem> | ContentItem">
|
|
218
|
-
Content to extract data from - can be a single content item or array of content items
|
|
219
|
-
</ParamField>
|
|
220
|
-
|
|
221
|
-
<ParamField path="options.dataSchema" type="JsonSchema | z.ZodSchema">
|
|
222
|
-
JsonSchema defining the structure of the data to extract
|
|
223
|
-
</ParamField>
|
|
224
|
-
|
|
225
|
-
<ParamField path="options.prompt" type="string">
|
|
226
|
-
Optional prompt to guide the extraction process and provide more context
|
|
227
|
-
</ParamField>
|
|
228
|
-
|
|
229
|
-
<ParamField path="options.enableCache" type="boolean">
|
|
230
|
-
Whether to enable caching of the extracted data. Defaults to true
|
|
231
|
-
</ParamField>
|
|
232
|
-
|
|
233
|
-
<ParamField path="options.maxRetries" type="number">
|
|
234
|
-
Maximum number of retry attempts on failures. Failures can be validation errors, API errors, output errors, etc. Defaults to 3
|
|
235
|
-
</ParamField>
|
|
236
|
-
|
|
237
|
-
<ParamField path="options.model" type="SUPPORTED_MODELS">
|
|
238
|
-
AI model to use for extraction (e.g., "gpt-4", "claude-3"), see [SUPPORTED_MODELS](../type-aliases/SUPPORTED_MODELS) for all supported models
|
|
239
|
-
</ParamField>
|
|
240
|
-
|
|
241
|
-
<ParamField path="options.apiKey" type="string">
|
|
242
|
-
Optional API key for AI extraction (if provided, will not be billed to your account)
|
|
243
|
-
</ParamField>
|
|
244
|
-
|
|
245
|
-
</Expandable>
|
|
246
|
-
|
|
247
|
-
</ParamField>
|
|
248
|
-
|
|
249
|
-
## Returns: `any`
|
|
250
|
-
|
|
251
|
-
Promise resolving to the extracted structured data matching the provided schema
|
|
252
|
-
|
|
253
|
-
</Tab>
|
|
254
|
-
|
|
255
|
-
</Tabs>
|
|
@@ -1,89 +0,0 @@
|
|
|
1
|
-
---
|
|
2
|
-
title: isPageLoaded
|
|
3
|
-
description: ""
|
|
4
|
-
---
|
|
5
|
-
|
|
6
|
-
```typescript
|
|
7
|
-
export declare function isPageLoaded(input: {
|
|
8
|
-
page: Page;
|
|
9
|
-
timeoutInMs?: number;
|
|
10
|
-
model?: SUPPORTED_MODELS;
|
|
11
|
-
apiKey?: string;
|
|
12
|
-
}): Promise<boolean>;
|
|
13
|
-
```
|
|
14
|
-
|
|
15
|
-
Uses AI vision to determine if a webpage has finished loading by analyzing a screenshot.
|
|
16
|
-
Detects loading spinners, blank content, or incomplete page states.
|
|
17
|
-
|
|
18
|
-
## Examples
|
|
19
|
-
|
|
20
|
-
<CodeGroup>
|
|
21
|
-
|
|
22
|
-
```typescript Check Page Loading
|
|
23
|
-
import { isPageLoaded } from "@intuned/browser/ai";
|
|
24
|
-
export default async function handler(params, page, context){
|
|
25
|
-
// Wait for page to finish loading
|
|
26
|
-
await page.goto('https://example.com');
|
|
27
|
-
|
|
28
|
-
const pageLoaded = await isPageLoaded({page});
|
|
29
|
-
if (pageLoaded) {
|
|
30
|
-
// Continue with scraping or interactions
|
|
31
|
-
} else {
|
|
32
|
-
// Wait longer or retry
|
|
33
|
-
}
|
|
34
|
-
}
|
|
35
|
-
```
|
|
36
|
-
|
|
37
|
-
```typescript Loading Loop
|
|
38
|
-
import { isPageLoaded } from "@intuned/browser/ai";
|
|
39
|
-
export default async function handler(params, page, context){
|
|
40
|
-
// Keep checking until page loads
|
|
41
|
-
await page.goto("https://example.com");
|
|
42
|
-
let attempts = 0;
|
|
43
|
-
while (attempts < 10) {
|
|
44
|
-
const pageLoaded = await isPageLoaded({
|
|
45
|
-
page,
|
|
46
|
-
model: "gpt-4o",
|
|
47
|
-
timeoutInMs: 5000
|
|
48
|
-
});
|
|
49
|
-
if (pageLoaded) break;
|
|
50
|
-
|
|
51
|
-
await page.waitForTimeout(2000);
|
|
52
|
-
attempts++;
|
|
53
|
-
}
|
|
54
|
-
}
|
|
55
|
-
```
|
|
56
|
-
}
|
|
57
|
-
|
|
58
|
-
</CodeGroup>
|
|
59
|
-
|
|
60
|
-
## Arguments
|
|
61
|
-
|
|
62
|
-
<ParamField path="input" type="Object" required
|
|
63
|
-
>
|
|
64
|
-
Input object containing the page to check
|
|
65
|
-
|
|
66
|
-
<Expandable title="input">
|
|
67
|
-
<ParamField path="input.page" type="Page">
|
|
68
|
-
The Playwright page to check
|
|
69
|
-
</ParamField>
|
|
70
|
-
|
|
71
|
-
<ParamField path="input.timeoutInMs" type="number">
|
|
72
|
-
Screenshot timeout in milliseconds. Defaults to 10000
|
|
73
|
-
</ParamField>
|
|
74
|
-
|
|
75
|
-
<ParamField path="input.model" type="SUPPORTED_MODELS">
|
|
76
|
-
AI model to use for the check. See [SUPPORTED_MODELS](../type-aliases/SUPPORTED_MODELS) for all supported models. Defaults to "gpt-4o-2024-08-06"
|
|
77
|
-
</ParamField>
|
|
78
|
-
|
|
79
|
-
<ParamField path="input.apiKey" type="string">
|
|
80
|
-
Optional API key for the AI service (if provided, will not be billed to your account)
|
|
81
|
-
</ParamField>
|
|
82
|
-
|
|
83
|
-
</Expandable>
|
|
84
|
-
|
|
85
|
-
</ParamField>
|
|
86
|
-
|
|
87
|
-
## Returns: `Promise<boolean>`
|
|
88
|
-
|
|
89
|
-
Promise resolving to true if page is loaded, false if still loading
|
|
@@ -1,36 +0,0 @@
|
|
|
1
|
-
---
|
|
2
|
-
title: ArraySchema
|
|
3
|
-
description: ""
|
|
4
|
-
---
|
|
5
|
-
|
|
6
|
-
```typescript
|
|
7
|
-
export interface ArraySchema extends BasicSchema {
|
|
8
|
-
type: "array";
|
|
9
|
-
items: JsonSchema | z.ZodSchema;
|
|
10
|
-
maxItems?: number;
|
|
11
|
-
minItems?: number;
|
|
12
|
-
uniqueItems?: boolean;
|
|
13
|
-
}
|
|
14
|
-
```
|
|
15
|
-
|
|
16
|
-
Schema definition for array values with item validation and constraints.
|
|
17
|
-
|
|
18
|
-
## Examples
|
|
19
|
-
|
|
20
|
-
<CodeGroup>
|
|
21
|
-
|
|
22
|
-
```typescript Array Schema
|
|
23
|
-
import { ArraySchema } from "@intuned/browser/ai";
|
|
24
|
-
export default async function handler(params, page, context){
|
|
25
|
-
const tagsSchema: ArraySchema = {
|
|
26
|
-
type: "array",
|
|
27
|
-
items: { type: "string" },
|
|
28
|
-
minItems: 1,
|
|
29
|
-
maxItems: 10,
|
|
30
|
-
uniqueItems: true,
|
|
31
|
-
description: "List of tags"
|
|
32
|
-
};
|
|
33
|
-
}
|
|
34
|
-
```
|
|
35
|
-
|
|
36
|
-
</CodeGroup>
|
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
---
|
|
2
|
-
title: BasicSchema
|
|
3
|
-
description: ""
|
|
4
|
-
---
|
|
5
|
-
|
|
6
|
-
```typescript
|
|
7
|
-
export interface BasicSchema {
|
|
8
|
-
type: "string" | "number" | "integer" | "boolean" | "array" | "object";
|
|
9
|
-
description?: string;
|
|
10
|
-
}
|
|
11
|
-
```
|
|
12
|
-
|
|
13
|
-
Base schema interface that all JSON schema types extend from.
|
|
14
|
-
Provides common properties like type and description.
|
|
@@ -1,28 +0,0 @@
|
|
|
1
|
-
---
|
|
2
|
-
title: BooleanSchema
|
|
3
|
-
description: ""
|
|
4
|
-
---
|
|
5
|
-
|
|
6
|
-
```typescript
|
|
7
|
-
export interface BooleanSchema extends BasicSchema {
|
|
8
|
-
type: "boolean";
|
|
9
|
-
}
|
|
10
|
-
```
|
|
11
|
-
|
|
12
|
-
Schema definition for boolean values.
|
|
13
|
-
|
|
14
|
-
## Examples
|
|
15
|
-
|
|
16
|
-
<CodeGroup>
|
|
17
|
-
|
|
18
|
-
```typescript Boolean Schema
|
|
19
|
-
import { BooleanSchema } from "@intuned/browser/ai";
|
|
20
|
-
export default async function handler(params, page, context){
|
|
21
|
-
const isActiveSchema: BooleanSchema = {
|
|
22
|
-
type: "boolean",
|
|
23
|
-
description: "Whether the user account is active"
|
|
24
|
-
};
|
|
25
|
-
}
|
|
26
|
-
```
|
|
27
|
-
|
|
28
|
-
</CodeGroup>
|
|
@@ -1,16 +0,0 @@
|
|
|
1
|
-
---
|
|
2
|
-
title: ImageBufferContentItem
|
|
3
|
-
description: ""
|
|
4
|
-
---
|
|
5
|
-
|
|
6
|
-
```typescript
|
|
7
|
-
export interface ImageBufferContentItem {
|
|
8
|
-
type: "image-buffer";
|
|
9
|
-
image_type: "png" | "jpeg" | "gif" | "webp";
|
|
10
|
-
data: Buffer;
|
|
11
|
-
}
|
|
12
|
-
```
|
|
13
|
-
|
|
14
|
-
Represents image content provided as a Buffer for AI extraction.
|
|
15
|
-
Used when passing image data directly to extractStructuredData without a page source.
|
|
16
|
-
The image will be analyzed by AI vision models for data extraction.
|