@intuned/browser-dev 0.1.8-dev.0 → 0.1.10-dev.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +85 -143
- package/dist/ai/export.d.ts +291 -143
- package/dist/ai/extractStructuredData.js +21 -27
- package/dist/ai/extractStructuredDataUsingAi.js +24 -1
- package/dist/ai/index.d.ts +291 -143
- package/dist/ai/tests/testCreateMatchesMapping.spec.js +216 -0
- package/dist/ai/tests/testExtractStructuredData.spec.js +348 -2
- package/dist/ai/tests/testExtractStructuredDataDomMatchingIframes.spec.js +459 -0
- package/dist/ai/tests/testExtractStructuredDataUnit.spec.js +375 -0
- package/dist/ai/tests/testMatching.spec.js +342 -0
- package/dist/ai/tests/testValidateMatchesMapping.spec.js +265 -0
- package/dist/common/Logger/index.js +2 -2
- package/dist/common/extendedTest.js +38 -30
- package/dist/common/frame_utils/frameTree.js +116 -0
- package/dist/common/frame_utils/getContentWithNestedIframes.js +13 -0
- package/dist/common/frame_utils/index.js +95 -0
- package/dist/common/frame_utils/stitchIframe.js +105 -0
- package/dist/{helpers → common}/frame_utils/tests/testFindAllIframes.spec.js +24 -15
- package/dist/common/frame_utils/tests/testGetContentWithNestedIframes.spec.js +241 -0
- package/dist/common/frame_utils/utils.js +91 -0
- package/dist/common/getSimplifiedHtml.js +20 -20
- package/dist/common/matching/matching.js +91 -16
- package/dist/common/tests/matching.test.js +225 -0
- package/dist/common/tests/testGetSimplifiedHtml.spec.js +324 -0
- package/dist/helpers/export.d.ts +702 -575
- package/dist/helpers/extractMarkdown.js +16 -7
- package/dist/helpers/index.d.ts +702 -575
- package/dist/helpers/tests/testExtractMarkdown.spec.js +29 -0
- package/dist/helpers/waitForDomSettled.js +4 -4
- package/dist/helpers/withNetworkSettledWait.js +2 -7
- package/dist/optimized-extractors/export.d.ts +17 -18
- package/dist/optimized-extractors/index.d.ts +17 -18
- package/dist/types/intuned-runtime.d.ts +6 -32
- package/how-to-generate-docs.md +40 -28
- package/package.json +2 -2
- package/dist/helpers/frame_utils/constants.js +0 -8
- package/dist/helpers/frame_utils/findAllIframes.js +0 -82
- package/dist/helpers/frame_utils/index.js +0 -44
- /package/dist/{helpers → common}/frame_utils/checkFrameAllowsAsyncScripts.js +0 -0
- /package/dist/{helpers → common}/frame_utils/getContainerFrame.js +0 -0
|
@@ -287,4 +287,33 @@ var _ = require("..");
|
|
|
287
287
|
(0, _extendedTest.expect)(result).toContain("> Important quote here.");
|
|
288
288
|
(0, _extendedTest.expect)(result).toContain("`inline code`");
|
|
289
289
|
});
|
|
290
|
+
(0, _extendedTest.test)("extract markdown from locator with iframe", async ({
|
|
291
|
+
page
|
|
292
|
+
}) => {
|
|
293
|
+
await page.setContent(`
|
|
294
|
+
<html>
|
|
295
|
+
<body>
|
|
296
|
+
<div id="outside">
|
|
297
|
+
<h1>Outside Content</h1>
|
|
298
|
+
</div>
|
|
299
|
+
<div id="container">
|
|
300
|
+
<h2>Container Header</h2>
|
|
301
|
+
<iframe id="nested-iframe" srcdoc="<html><body><h3>Nested Iframe</h3><p>Nested content</p></body></html>"></iframe>
|
|
302
|
+
<p>After iframe</p>
|
|
303
|
+
</div>
|
|
304
|
+
</body>
|
|
305
|
+
</html>
|
|
306
|
+
`);
|
|
307
|
+
await page.waitForSelector("#nested-iframe");
|
|
308
|
+
await page.frameLocator("#nested-iframe").locator("body").waitFor();
|
|
309
|
+
const locator = page.locator("#container");
|
|
310
|
+
const result = await (0, _.extractMarkdown)({
|
|
311
|
+
source: locator
|
|
312
|
+
});
|
|
313
|
+
(0, _extendedTest.expect)(result).toContain("## Container Header");
|
|
314
|
+
(0, _extendedTest.expect)(result).toContain("### Nested Iframe");
|
|
315
|
+
(0, _extendedTest.expect)(result).toContain("Nested content");
|
|
316
|
+
(0, _extendedTest.expect)(result).toContain("After iframe");
|
|
317
|
+
(0, _extendedTest.expect)(result).not.toContain("Outside Content");
|
|
318
|
+
});
|
|
290
319
|
});
|
|
@@ -6,8 +6,8 @@ Object.defineProperty(exports, "__esModule", {
|
|
|
6
6
|
exports.waitForDomSettled = void 0;
|
|
7
7
|
var _locatorHelpers = require("../common/locatorHelpers");
|
|
8
8
|
var _Logger = require("../common/Logger");
|
|
9
|
-
var
|
|
10
|
-
var _getContainerFrame = require("
|
|
9
|
+
var _frameTree = require("../common/frame_utils/frameTree");
|
|
10
|
+
var _getContainerFrame = require("../common/frame_utils/getContainerFrame");
|
|
11
11
|
const waitForDomSettled = async options => {
|
|
12
12
|
const {
|
|
13
13
|
source,
|
|
@@ -82,9 +82,9 @@ const waitForDomSettled = async options => {
|
|
|
82
82
|
if (!result) {
|
|
83
83
|
return false;
|
|
84
84
|
}
|
|
85
|
-
const
|
|
85
|
+
const iframeTree = await _frameTree.FrameTree.fromRoot(frame, 10.0, true);
|
|
86
86
|
let hasRestrictedIframes = false;
|
|
87
|
-
for (const iframeNode of
|
|
87
|
+
for (const iframeNode of iframeTree.nodes()) {
|
|
88
88
|
if (iframeNode.allowsAsyncScripts) {
|
|
89
89
|
const iframeElementHandle = await iframeNode.frame.evaluateHandle("document.documentElement");
|
|
90
90
|
const iframeResult = await iframeElementHandle.evaluate(jsCode, {
|
|
@@ -11,7 +11,6 @@ const withNetworkSettledWait = async (callback, options) => {
|
|
|
11
11
|
timeoutInMs = 30000,
|
|
12
12
|
maxInflightRequests = 0
|
|
13
13
|
} = options || {};
|
|
14
|
-
_Logger.logger.debug(`Page object: ${page}`);
|
|
15
14
|
let networkSettledResolve = null;
|
|
16
15
|
let networkSettledPromise = new Promise(resolve => {
|
|
17
16
|
networkSettledResolve = resolve;
|
|
@@ -50,7 +49,6 @@ const withNetworkSettledWait = async (callback, options) => {
|
|
|
50
49
|
const timeoutPromise = new Promise(resolve => {
|
|
51
50
|
setTimeout(() => {
|
|
52
51
|
var _networkSettledResolv2;
|
|
53
|
-
_Logger.logger.info("waiting for network to settle timed out");
|
|
54
52
|
isTimeout = true;
|
|
55
53
|
(_networkSettledResolv2 = networkSettledResolve) === null || _networkSettledResolv2 === void 0 || _networkSettledResolv2();
|
|
56
54
|
resolve();
|
|
@@ -61,17 +59,15 @@ const withNetworkSettledWait = async (callback, options) => {
|
|
|
61
59
|
actionDone = true;
|
|
62
60
|
await new Promise(resolve => setTimeout(resolve, 500));
|
|
63
61
|
await maybeSettle();
|
|
64
|
-
_Logger.logger.info("-- Start waiting for network to settle... --");
|
|
65
62
|
let shouldContinue = true;
|
|
66
63
|
while (shouldContinue) {
|
|
67
|
-
_Logger.logger.info(`waiting for network to settle, ${requestCounter} requests pending`);
|
|
68
64
|
await Promise.race([networkSettledPromise, timeoutPromise]);
|
|
69
65
|
await new Promise(resolve => setTimeout(resolve, 500));
|
|
70
66
|
if (actionDone && requestCounter <= maxInflightRequests || isTimeout) {
|
|
71
67
|
if (isTimeout) {
|
|
72
|
-
_Logger.logger.
|
|
68
|
+
_Logger.logger.debug("Network did not settle within timeout.");
|
|
73
69
|
} else {
|
|
74
|
-
_Logger.logger.
|
|
70
|
+
_Logger.logger.debug("Network settled.");
|
|
75
71
|
}
|
|
76
72
|
shouldContinue = false;
|
|
77
73
|
} else {
|
|
@@ -80,7 +76,6 @@ const withNetworkSettledWait = async (callback, options) => {
|
|
|
80
76
|
});
|
|
81
77
|
}
|
|
82
78
|
}
|
|
83
|
-
_Logger.logger.info("-- Finished waiting for network to settle --");
|
|
84
79
|
return result;
|
|
85
80
|
} finally {
|
|
86
81
|
page === null || page === void 0 || page.off("request", onRequest);
|
|
@@ -2,8 +2,8 @@ import { Locator, Page } from "playwright";
|
|
|
2
2
|
import { BasicSchema } from "./types/jsonSchema";
|
|
3
3
|
|
|
4
4
|
/**
|
|
5
|
-
*
|
|
6
|
-
*
|
|
5
|
+
* This strategy uses a screenshot of the page/locator with some processing to extract the needed data.
|
|
6
|
+
* Use when the information you're trying to extract isn't present in the DOM as text but can be identified visually.
|
|
7
7
|
* @interface
|
|
8
8
|
* @property model - the model to use in the extraction process.
|
|
9
9
|
* @property type - the type of the strategy
|
|
@@ -35,8 +35,8 @@ export interface ImageStrategy {
|
|
|
35
35
|
type: "IMAGE";
|
|
36
36
|
}
|
|
37
37
|
/**
|
|
38
|
-
*
|
|
39
|
-
*
|
|
38
|
+
* This strategy uses the HTML of the page/locator to extract the needed data. We filter out some attributes to reduce context.
|
|
39
|
+
* The attributes included are only: `aria-label` `data-name` `name` `type` `placeholder` `value` `role` `title` `href` `id` `alt`.
|
|
40
40
|
*
|
|
41
41
|
* @interface
|
|
42
42
|
* @property model - the model to use in the extraction process
|
|
@@ -73,23 +73,22 @@ export interface HtmlStrategy {
|
|
|
73
73
|
type: "HTML";
|
|
74
74
|
}
|
|
75
75
|
/**
|
|
76
|
-
* Extracts an array of structured data from a web page in an optimized way
|
|
77
|
-
* then it will build reliable selectors in the background to make the process more efficient
|
|
76
|
+
* Extracts an array of structured data from a web page in an optimized way. This function uses AI for the first few extractions until it collects multiple examples, then builds reliable selectors in the background for improved efficiency.
|
|
78
77
|
* @deprecated This function is deprecated and will be removed in the future.
|
|
79
78
|
* @param page - The Playwright Page object from which to extract the data.
|
|
80
79
|
* @param options.label - A label for this extraction process, used for billing and monitoring.
|
|
81
|
-
* @param options.itemEntityName - The name of the entity items being extracted
|
|
80
|
+
* @param options.itemEntityName - The name of the entity items being extracted. Must be 1–50 characters long and can only contain letters, digits, periods, underscores, and hyphens.
|
|
82
81
|
* @param options.itemEntitySchema - The schema of the entity items being extracted.
|
|
83
82
|
* @param options.strategy - Optional. The strategy to use for extraction, if not provided, the html strategy with claude haiku will be used.
|
|
84
83
|
* @param options.prompt - Optional. A prompt to guide the extraction process.
|
|
85
84
|
* @param options.optionalPropertiesInvalidator - Optional. A function to invalidate optional properties.
|
|
86
85
|
* @param options.variantKey - Optional. A variant key for the extraction process, use this when the page has multiple variants/shapes.
|
|
87
|
-
* @param options.apiKey - Optional. An API key
|
|
86
|
+
* @param options.apiKey - Optional. An API key for AI extraction. Extractions made with your API key won't be billed to your account.
|
|
88
87
|
* @returns A promise that resolves to a list of extracted data.
|
|
89
88
|
*
|
|
90
89
|
* @example
|
|
91
90
|
* ```typescript extractArrayFromPage
|
|
92
|
-
* import { extractArrayFromPage } from "@intuned/
|
|
91
|
+
* import { extractArrayFromPage } from "@intuned/browser/optimized-extractors";
|
|
93
92
|
*
|
|
94
93
|
* await page.goto("https://books.toscrape.com/")
|
|
95
94
|
* const books = await extractArrayFromPage(page,
|
|
@@ -149,18 +148,18 @@ export declare function extractArrayFromPage(
|
|
|
149
148
|
* @deprecated This function is deprecated and will be removed in the future.
|
|
150
149
|
* @param locator - The Playwright Locator object from which to extract the data.
|
|
151
150
|
* @param options.label - A label for this extraction process, used for billing and monitoring.
|
|
152
|
-
* @param options.itemEntityName - The name of the entity items being extracted.
|
|
151
|
+
* @param options.itemEntityName - The name of the entity items being extracted. Must be 1–50 characters long and can only contain letters, digits, periods, underscores, and hyphens.
|
|
153
152
|
* @param options.itemEntitySchema - The schema of the entity items being extracted.
|
|
154
153
|
* @param options.strategy - Optional. The strategy to use for extraction, if not provided, the html strategy with claude haiku will be used.
|
|
155
154
|
* @param options.prompt - Optional. A prompt to guide the extraction process.
|
|
156
155
|
* @param options.optionalPropertiesInvalidator - Optional. A function to invalidate optional properties.
|
|
157
156
|
* @param options.variantKey - Optional. A variant key for the extraction process.
|
|
158
|
-
* @param options.apiKey - Optional. An API key
|
|
157
|
+
* @param options.apiKey - Optional. An API key for AI extraction. Extractions made with your API key won't be billed to your account.
|
|
159
158
|
* @returns A promise that resolves to a list of extracted data.
|
|
160
159
|
*
|
|
161
160
|
* @example
|
|
162
161
|
* ```typescript extractArrayFromLocator
|
|
163
|
-
* import { extractArrayFromLocator } from "@intuned/
|
|
162
|
+
* import { extractArrayFromLocator } from "@intuned/browser/optimized-extractors";
|
|
164
163
|
*
|
|
165
164
|
* await page.goto("https://books.toscrape.com/")
|
|
166
165
|
* const books = await extractArrayFromLocator(page.locator("section"),
|
|
@@ -266,17 +265,17 @@ export interface SimpleArrayItemSchema extends BasicSchema {
|
|
|
266
265
|
* @deprecated This function is deprecated and will be removed in the future.
|
|
267
266
|
* @param page - The Playwright Page object from which to extract the data.
|
|
268
267
|
* @param options.label - A label for this extraction process, used for billing and monitoring.
|
|
269
|
-
* @param options.entityName - The name of the entity being extracted.
|
|
268
|
+
* @param options.entityName - The name of the entity being extracted. Must be 1–50 characters long and can only contain letters, digits, periods, underscores, and hyphens.
|
|
270
269
|
* @param options.entitySchema - The schema of the entity being extracted.
|
|
271
270
|
* @param options.strategy - Optional. The strategy to use for extraction, if not provided, the html strategy with claude haiku will be used.
|
|
272
271
|
* @param options.prompt - Optional. A prompt to guide the extraction process.
|
|
273
272
|
* @param options.optionalPropertiesInvalidator - Optional. A function to invalidate optional properties.
|
|
274
273
|
* @param options.variantKey - Optional. A variant key for the extraction process.
|
|
275
|
-
* @param options.apiKey - Optional. An API key
|
|
274
|
+
* @param options.apiKey - Optional. An API key for AI extraction. Extractions made with your API key won't be billed to your account.
|
|
276
275
|
* @returns A promise that resolves to the extracted object.
|
|
277
276
|
* @example
|
|
278
277
|
* ```typescript extractObjectFromPage
|
|
279
|
-
* import { extractObjectFromPage } from "@intuned/
|
|
278
|
+
* import { extractObjectFromPage } from "@intuned/browser/optimized-extractors";
|
|
280
279
|
*
|
|
281
280
|
* await page.goto("https://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html")
|
|
282
281
|
* const book = await extractObjectFromPage(page,
|
|
@@ -333,18 +332,18 @@ export declare function extractObjectFromPage(
|
|
|
333
332
|
* @deprecated This function is deprecated and will be removed in the future.
|
|
334
333
|
* @param locator - The Playwright Locator object from which to extract the data.
|
|
335
334
|
* @param options.label - A label for this extraction process, used for billing and monitoring.
|
|
336
|
-
* @param options.entityName - The name of the entity being extracted.
|
|
335
|
+
* @param options.entityName - The name of the entity being extracted. Must be 1–50 characters long and can only contain letters, digits, periods, underscores, and hyphens.
|
|
337
336
|
* @param options.entitySchema - The schema of the entity being extracted.
|
|
338
337
|
* @param options.strategy - Optional. The strategy to use for extraction, if not provided, the html strategy with claude haiku will be used.
|
|
339
338
|
* @param options.prompt - Optional. A prompt to guide the extraction process.
|
|
340
339
|
* @param options.optionalPropertiesInvalidator - Optional. A function to invalidate optional properties.
|
|
341
340
|
* @param options.variantKey - Optional. A variant key for the extraction process.
|
|
342
|
-
* @param options.apiKey - Optional. An API key
|
|
341
|
+
* @param options.apiKey - Optional. An API key for AI extraction. Extractions made with your API key won't be billed to your account.
|
|
343
342
|
* @returns A promise that resolves to the extracted object.
|
|
344
343
|
*
|
|
345
344
|
* @example
|
|
346
345
|
* ```typescript extractObjectFromLocator
|
|
347
|
-
* import { extractObjectFromLocator } from "@intuned/
|
|
346
|
+
* import { extractObjectFromLocator } from "@intuned/browser/optimized-extractors";
|
|
348
347
|
*
|
|
349
348
|
* await page.goto("https://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html")
|
|
350
349
|
* const book = await extractObjectFromLocator(page.locator(".page_inner"),
|
|
@@ -2,8 +2,8 @@ import { Locator, Page } from "playwright";
|
|
|
2
2
|
import { BasicSchema } from "./types/jsonSchema";
|
|
3
3
|
|
|
4
4
|
/**
|
|
5
|
-
*
|
|
6
|
-
*
|
|
5
|
+
* This strategy uses a screenshot of the page/locator with some processing to extract the needed data.
|
|
6
|
+
* Use when the information you're trying to extract isn't present in the DOM as text but can be identified visually.
|
|
7
7
|
* @interface
|
|
8
8
|
* @property model - the model to use in the extraction process.
|
|
9
9
|
* @property type - the type of the strategy
|
|
@@ -35,8 +35,8 @@ export interface ImageStrategy {
|
|
|
35
35
|
type: "IMAGE";
|
|
36
36
|
}
|
|
37
37
|
/**
|
|
38
|
-
*
|
|
39
|
-
*
|
|
38
|
+
* This strategy uses the HTML of the page/locator to extract the needed data. We filter out some attributes to reduce context.
|
|
39
|
+
* The attributes included are only: `aria-label` `data-name` `name` `type` `placeholder` `value` `role` `title` `href` `id` `alt`.
|
|
40
40
|
*
|
|
41
41
|
* @interface
|
|
42
42
|
* @property model - the model to use in the extraction process
|
|
@@ -73,23 +73,22 @@ export interface HtmlStrategy {
|
|
|
73
73
|
type: "HTML";
|
|
74
74
|
}
|
|
75
75
|
/**
|
|
76
|
-
* Extracts an array of structured data from a web page in an optimized way
|
|
77
|
-
* then it will build reliable selectors in the background to make the process more efficient
|
|
76
|
+
* Extracts an array of structured data from a web page in an optimized way. This function uses AI for the first few extractions until it collects multiple examples, then builds reliable selectors in the background for improved efficiency.
|
|
78
77
|
* @deprecated This function is deprecated and will be removed in the future.
|
|
79
78
|
* @param page - The Playwright Page object from which to extract the data.
|
|
80
79
|
* @param options.label - A label for this extraction process, used for billing and monitoring.
|
|
81
|
-
* @param options.itemEntityName - The name of the entity items being extracted
|
|
80
|
+
* @param options.itemEntityName - The name of the entity items being extracted. Must be 1–50 characters long and can only contain letters, digits, periods, underscores, and hyphens.
|
|
82
81
|
* @param options.itemEntitySchema - The schema of the entity items being extracted.
|
|
83
82
|
* @param options.strategy - Optional. The strategy to use for extraction, if not provided, the html strategy with claude haiku will be used.
|
|
84
83
|
* @param options.prompt - Optional. A prompt to guide the extraction process.
|
|
85
84
|
* @param options.optionalPropertiesInvalidator - Optional. A function to invalidate optional properties.
|
|
86
85
|
* @param options.variantKey - Optional. A variant key for the extraction process, use this when the page has multiple variants/shapes.
|
|
87
|
-
* @param options.apiKey - Optional. An API key
|
|
86
|
+
* @param options.apiKey - Optional. An API key for AI extraction. Extractions made with your API key won't be billed to your account.
|
|
88
87
|
* @returns A promise that resolves to a list of extracted data.
|
|
89
88
|
*
|
|
90
89
|
* @example
|
|
91
90
|
* ```typescript extractArrayFromPage
|
|
92
|
-
* import { extractArrayFromPage } from "@intuned/
|
|
91
|
+
* import { extractArrayFromPage } from "@intuned/browser/optimized-extractors";
|
|
93
92
|
*
|
|
94
93
|
* await page.goto("https://books.toscrape.com/")
|
|
95
94
|
* const books = await extractArrayFromPage(page,
|
|
@@ -149,18 +148,18 @@ export declare function extractArrayFromPage(
|
|
|
149
148
|
* @deprecated This function is deprecated and will be removed in the future.
|
|
150
149
|
* @param locator - The Playwright Locator object from which to extract the data.
|
|
151
150
|
* @param options.label - A label for this extraction process, used for billing and monitoring.
|
|
152
|
-
* @param options.itemEntityName - The name of the entity items being extracted.
|
|
151
|
+
* @param options.itemEntityName - The name of the entity items being extracted. Must be 1–50 characters long and can only contain letters, digits, periods, underscores, and hyphens.
|
|
153
152
|
* @param options.itemEntitySchema - The schema of the entity items being extracted.
|
|
154
153
|
* @param options.strategy - Optional. The strategy to use for extraction, if not provided, the html strategy with claude haiku will be used.
|
|
155
154
|
* @param options.prompt - Optional. A prompt to guide the extraction process.
|
|
156
155
|
* @param options.optionalPropertiesInvalidator - Optional. A function to invalidate optional properties.
|
|
157
156
|
* @param options.variantKey - Optional. A variant key for the extraction process.
|
|
158
|
-
* @param options.apiKey - Optional. An API key
|
|
157
|
+
* @param options.apiKey - Optional. An API key for AI extraction. Extractions made with your API key won't be billed to your account.
|
|
159
158
|
* @returns A promise that resolves to a list of extracted data.
|
|
160
159
|
*
|
|
161
160
|
* @example
|
|
162
161
|
* ```typescript extractArrayFromLocator
|
|
163
|
-
* import { extractArrayFromLocator } from "@intuned/
|
|
162
|
+
* import { extractArrayFromLocator } from "@intuned/browser/optimized-extractors";
|
|
164
163
|
*
|
|
165
164
|
* await page.goto("https://books.toscrape.com/")
|
|
166
165
|
* const books = await extractArrayFromLocator(page.locator("section"),
|
|
@@ -266,17 +265,17 @@ export interface SimpleArrayItemSchema extends BasicSchema {
|
|
|
266
265
|
* @deprecated This function is deprecated and will be removed in the future.
|
|
267
266
|
* @param page - The Playwright Page object from which to extract the data.
|
|
268
267
|
* @param options.label - A label for this extraction process, used for billing and monitoring.
|
|
269
|
-
* @param options.entityName - The name of the entity being extracted.
|
|
268
|
+
* @param options.entityName - The name of the entity being extracted. Must be 1–50 characters long and can only contain letters, digits, periods, underscores, and hyphens.
|
|
270
269
|
* @param options.entitySchema - The schema of the entity being extracted.
|
|
271
270
|
* @param options.strategy - Optional. The strategy to use for extraction, if not provided, the html strategy with claude haiku will be used.
|
|
272
271
|
* @param options.prompt - Optional. A prompt to guide the extraction process.
|
|
273
272
|
* @param options.optionalPropertiesInvalidator - Optional. A function to invalidate optional properties.
|
|
274
273
|
* @param options.variantKey - Optional. A variant key for the extraction process.
|
|
275
|
-
* @param options.apiKey - Optional. An API key
|
|
274
|
+
* @param options.apiKey - Optional. An API key for AI extraction. Extractions made with your API key won't be billed to your account.
|
|
276
275
|
* @returns A promise that resolves to the extracted object.
|
|
277
276
|
* @example
|
|
278
277
|
* ```typescript extractObjectFromPage
|
|
279
|
-
* import { extractObjectFromPage } from "@intuned/
|
|
278
|
+
* import { extractObjectFromPage } from "@intuned/browser/optimized-extractors";
|
|
280
279
|
*
|
|
281
280
|
* await page.goto("https://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html")
|
|
282
281
|
* const book = await extractObjectFromPage(page,
|
|
@@ -333,18 +332,18 @@ export declare function extractObjectFromPage(
|
|
|
333
332
|
* @deprecated This function is deprecated and will be removed in the future.
|
|
334
333
|
* @param locator - The Playwright Locator object from which to extract the data.
|
|
335
334
|
* @param options.label - A label for this extraction process, used for billing and monitoring.
|
|
336
|
-
* @param options.entityName - The name of the entity being extracted.
|
|
335
|
+
* @param options.entityName - The name of the entity being extracted. Must be 1–50 characters long and can only contain letters, digits, periods, underscores, and hyphens.
|
|
337
336
|
* @param options.entitySchema - The schema of the entity being extracted.
|
|
338
337
|
* @param options.strategy - Optional. The strategy to use for extraction, if not provided, the html strategy with claude haiku will be used.
|
|
339
338
|
* @param options.prompt - Optional. A prompt to guide the extraction process.
|
|
340
339
|
* @param options.optionalPropertiesInvalidator - Optional. A function to invalidate optional properties.
|
|
341
340
|
* @param options.variantKey - Optional. A variant key for the extraction process.
|
|
342
|
-
* @param options.apiKey - Optional. An API key
|
|
341
|
+
* @param options.apiKey - Optional. An API key for AI extraction. Extractions made with your API key won't be billed to your account.
|
|
343
342
|
* @returns A promise that resolves to the extracted object.
|
|
344
343
|
*
|
|
345
344
|
* @example
|
|
346
345
|
* ```typescript extractObjectFromLocator
|
|
347
|
-
* import { extractObjectFromLocator } from "@intuned/
|
|
346
|
+
* import { extractObjectFromLocator } from "@intuned/browser/optimized-extractors";
|
|
348
347
|
*
|
|
349
348
|
* await page.goto("https://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html")
|
|
350
349
|
* const book = await extractObjectFromLocator(page.locator(".page_inner"),
|
|
@@ -9,41 +9,15 @@ declare module "@intuned/runtime" {
|
|
|
9
9
|
password: string;
|
|
10
10
|
}
|
|
11
11
|
|
|
12
|
-
export type WithPlaywrightContextWrappedFunction<R> = (
|
|
13
|
-
context: BrowserContext,
|
|
14
|
-
page: Page
|
|
15
|
-
) => Promise<any>;
|
|
16
|
-
|
|
17
12
|
export function getExecutionContext(): any;
|
|
18
13
|
export function extendTimeout(): void;
|
|
19
|
-
export function runWithContext<
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
14
|
+
export function runWithContext<R, TArgs extends any[]>(
|
|
15
|
+
contextData: any,
|
|
16
|
+
callback: (...args: TArgs) => R,
|
|
17
|
+
...args: TArgs
|
|
18
|
+
): R;
|
|
23
19
|
export function getDownloadDirectoryPath(): string;
|
|
24
20
|
|
|
25
|
-
export function withPlaywrightContext<R>(
|
|
26
|
-
options: {
|
|
27
|
-
proxy?: Proxy;
|
|
28
|
-
headless: boolean;
|
|
29
|
-
downloadsPath: string;
|
|
30
|
-
importFunction?: any;
|
|
31
|
-
apiName?: string;
|
|
32
|
-
apiParameters?: any;
|
|
33
|
-
},
|
|
34
|
-
fn: WithPlaywrightContextWrappedFunction<R>
|
|
35
|
-
): Promise<any>;
|
|
36
|
-
|
|
37
|
-
export function withPlaywrightContext<R>(
|
|
38
|
-
options: {
|
|
39
|
-
cdpAddress: string;
|
|
40
|
-
importFunction?: any;
|
|
41
|
-
apiName?: string;
|
|
42
|
-
apiParameters?: any;
|
|
43
|
-
},
|
|
44
|
-
fn: WithPlaywrightContextWrappedFunction<R>
|
|
45
|
-
): Promise<any>;
|
|
46
|
-
|
|
47
21
|
// Add other exports from @intuned/runtime if needed
|
|
48
22
|
}
|
|
49
23
|
|
|
@@ -59,6 +33,6 @@ declare module "@intuned/runtime/dist/common/jwtTokenManager" {
|
|
|
59
33
|
|
|
60
34
|
export function callBackendFunctionWithToken<T = any>(
|
|
61
35
|
functionName: string,
|
|
62
|
-
params?: any
|
|
36
|
+
params?: any,
|
|
63
37
|
): Promise<T>;
|
|
64
38
|
}
|
package/how-to-generate-docs.md
CHANGED
|
@@ -1,50 +1,62 @@
|
|
|
1
1
|
## How to Generate Documentation
|
|
2
2
|
|
|
3
|
-
###
|
|
3
|
+
### Quick Start
|
|
4
4
|
|
|
5
5
|
**Generate all docs (recommended):**
|
|
6
6
|
|
|
7
7
|
```bash
|
|
8
|
-
yarn generate-
|
|
8
|
+
yarn generate-docs
|
|
9
9
|
```
|
|
10
10
|
|
|
11
|
-
This command processes all `export.d.ts` files from the 3 namespaces and
|
|
11
|
+
This command automatically processes all `export.d.ts` files from the 3 namespaces and outputs them to the correct paths in the docs folder to work with Mintlify.
|
|
12
12
|
|
|
13
|
-
|
|
13
|
+
### Output Structure
|
|
14
14
|
|
|
15
|
-
|
|
16
|
-
yarn generate-docs <input.d.ts> [outputdir]
|
|
17
|
-
```
|
|
15
|
+
Docs are automatically generated to:
|
|
18
16
|
|
|
19
|
-
|
|
17
|
+
```
|
|
18
|
+
docs/automation-sdks/intuned-sdk/typescript/
|
|
19
|
+
├── helpers/
|
|
20
|
+
│ ├── functions/
|
|
21
|
+
│ ├── interfaces/
|
|
22
|
+
│ └── type-aliases/
|
|
23
|
+
├── ai/
|
|
24
|
+
│ ├── functions/
|
|
25
|
+
│ ├── interfaces/
|
|
26
|
+
│ └── type-aliases/
|
|
27
|
+
└── optimized-extractors/
|
|
28
|
+
├── functions/
|
|
29
|
+
├── interfaces/
|
|
30
|
+
└── type-aliases/
|
|
31
|
+
```
|
|
20
32
|
|
|
21
|
-
|
|
22
|
-
# Generate docs for helpers (recommended)
|
|
23
|
-
yarn generate-docs ./src/helpers/export.d.ts ./generated-docs/helpers
|
|
33
|
+
### Available Namespaces
|
|
24
34
|
|
|
25
|
-
|
|
26
|
-
yarn generate-docs ./src/ai/export.d.ts ./generated-docs/ai
|
|
35
|
+
The following namespaces are automatically processed:
|
|
27
36
|
|
|
28
|
-
|
|
29
|
-
|
|
37
|
+
| Namespace | Input File | Output Path |
|
|
38
|
+
| -------------------- | -------------------------------------- | ------------------------------------------- |
|
|
39
|
+
| helpers | `src/helpers/export.d.ts` | `docs/.../typescript/helpers/` |
|
|
40
|
+
| ai | `src/ai/export.d.ts` | `docs/.../typescript/ai/` |
|
|
41
|
+
| optimized-extractors | `src/optimized-extractors/export.d.ts` | `docs/.../typescript/optimized-extractors/` |
|
|
30
42
|
|
|
31
|
-
|
|
32
|
-
yarn generate-docs ./src/helpers/export.d.ts
|
|
33
|
-
```
|
|
43
|
+
### Legacy Usage (Single File)
|
|
34
44
|
|
|
35
|
-
|
|
45
|
+
You can still process a single file with a custom output directory:
|
|
36
46
|
|
|
37
|
-
|
|
38
|
-
-
|
|
39
|
-
|
|
47
|
+
```bash
|
|
48
|
+
yarn generate-docs <input.d.ts> [output-dir]
|
|
49
|
+
```
|
|
40
50
|
|
|
41
|
-
|
|
51
|
+
**Examples:**
|
|
42
52
|
|
|
43
|
-
|
|
53
|
+
```bash
|
|
54
|
+
# Generate docs for helpers to a custom directory
|
|
55
|
+
yarn generate-docs ./src/helpers/export.d.ts ./custom-output/helpers
|
|
44
56
|
|
|
45
|
-
|
|
46
|
-
-
|
|
47
|
-
|
|
57
|
+
# Generate docs for AI functions to a custom directory
|
|
58
|
+
yarn generate-docs ./src/ai/export.d.ts ./custom-output/ai
|
|
59
|
+
```
|
|
48
60
|
|
|
49
61
|
### How It Works
|
|
50
62
|
|
|
@@ -59,4 +71,4 @@ The script in `./scripts/generate-docs.ts` reads JSDocs from `export.d.ts` files
|
|
|
59
71
|
- The markdown converters parse JSDocs into Mintlify-compatible format
|
|
60
72
|
- Include `@example` blocks with TypeScript code snippets
|
|
61
73
|
- Use `@param`, `@returns`, and `@interface` tags for proper documentation
|
|
62
|
-
|
|
74
|
+
- Use `@overload` tag to specify tab titles for overloaded functions
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@intuned/browser-dev",
|
|
3
|
-
"version": "0.1.
|
|
3
|
+
"version": "0.1.10-dev.0",
|
|
4
4
|
"description": "runner package for intuned functions",
|
|
5
5
|
"types": "./dist/index.d.ts",
|
|
6
6
|
"typesVersions": {
|
|
@@ -46,7 +46,6 @@
|
|
|
46
46
|
"lint": "eslint .",
|
|
47
47
|
"fix": "eslint . --fix",
|
|
48
48
|
"generate-docs": "npx tsx ./scripts/generate-docs.ts",
|
|
49
|
-
"generate-all-docs": "npx tsx ./scripts/generate-docs.ts ./src/helpers/export.d.ts ./generated-docs/helpers && npx tsx ./scripts/generate-docs.ts ./src/ai/export.d.ts ./generated-docs/ai && npx tsx ./scripts/generate-docs.ts ./src/optimized-extractors/export.d.ts ./generated-docs/optimized-extractors",
|
|
50
49
|
"build-browser-scripts": "rollup -c ./src/common/browserScripts/rollup.config.mjs",
|
|
51
50
|
"copy-dts": "copyfiles -u 1 \"src/**/*.d.ts\" dist",
|
|
52
51
|
"release": "npx tsx ./scripts/release.ts"
|
|
@@ -110,6 +109,7 @@
|
|
|
110
109
|
"prettier": "^2.8.8",
|
|
111
110
|
"rollup": "3.26.2",
|
|
112
111
|
"ts-jest": "^29.4.0",
|
|
112
|
+
"ts-morph": "^27.0.2",
|
|
113
113
|
"typescript": "5.4.4",
|
|
114
114
|
"vite": "^5.4.12",
|
|
115
115
|
"vite-plugin-babel-macros": "^1.0.6",
|
|
@@ -1,8 +0,0 @@
|
|
|
1
|
-
"use strict";
|
|
2
|
-
|
|
3
|
-
Object.defineProperty(exports, "__esModule", {
|
|
4
|
-
value: true
|
|
5
|
-
});
|
|
6
|
-
exports.IFRAME_TAGS = exports.ALL_IFRAMES_CSS_SELECTOR = void 0;
|
|
7
|
-
const IFRAME_TAGS = exports.IFRAME_TAGS = ["iframe", "frame"];
|
|
8
|
-
const ALL_IFRAMES_CSS_SELECTOR = exports.ALL_IFRAMES_CSS_SELECTOR = IFRAME_TAGS.join(", ");
|
|
@@ -1,82 +0,0 @@
|
|
|
1
|
-
"use strict";
|
|
2
|
-
|
|
3
|
-
Object.defineProperty(exports, "__esModule", {
|
|
4
|
-
value: true
|
|
5
|
-
});
|
|
6
|
-
exports.findAllIframes = findAllIframes;
|
|
7
|
-
exports.findAllIframesList = findAllIframesList;
|
|
8
|
-
var _Logger = require("../../common/Logger");
|
|
9
|
-
var _checkFrameAllowsAsyncScripts = require("./checkFrameAllowsAsyncScripts");
|
|
10
|
-
var _constants = require("./constants");
|
|
11
|
-
async function findAllIframes(root, iframeTimeoutMs = 10000) {
|
|
12
|
-
const processed = new Set();
|
|
13
|
-
return await processFrameRecursive(root, processed, iframeTimeoutMs);
|
|
14
|
-
}
|
|
15
|
-
async function findAllIframesList(root, iframeTimeoutMs = 10000) {
|
|
16
|
-
const iframeNodes = await findAllIframes(root, iframeTimeoutMs);
|
|
17
|
-
return flattenIframeTree(iframeNodes);
|
|
18
|
-
}
|
|
19
|
-
async function processFrameRecursive(root, processedRoots, iframeTimeoutMs) {
|
|
20
|
-
if (processedRoots.has(root)) {
|
|
21
|
-
return [];
|
|
22
|
-
}
|
|
23
|
-
processedRoots.add(root);
|
|
24
|
-
const iframeNodes = [];
|
|
25
|
-
try {
|
|
26
|
-
const iframeLocator = root.locator(_constants.ALL_IFRAMES_CSS_SELECTOR);
|
|
27
|
-
let iframeCount;
|
|
28
|
-
try {
|
|
29
|
-
iframeCount = await Promise.race([iframeLocator.count(), new Promise((_, reject) => setTimeout(() => reject(new Error("Timeout")), iframeTimeoutMs))]);
|
|
30
|
-
} catch (error) {
|
|
31
|
-
_Logger.logger.error("Timeout counting iframes in context, skipping");
|
|
32
|
-
return [];
|
|
33
|
-
}
|
|
34
|
-
for (let i = 0; i < iframeCount; i++) {
|
|
35
|
-
try {
|
|
36
|
-
const processSingleIframe = async index => {
|
|
37
|
-
const iframeElementLocator = iframeLocator.nth(index);
|
|
38
|
-
const iframeElement = await iframeElementLocator.elementHandle();
|
|
39
|
-
if (!iframeElement) {
|
|
40
|
-
_Logger.logger.error(`Could not get element handle for iframe: ${iframeElement}`);
|
|
41
|
-
return null;
|
|
42
|
-
}
|
|
43
|
-
const contentFrame = await iframeElement.contentFrame();
|
|
44
|
-
if (!contentFrame) {
|
|
45
|
-
_Logger.logger.error(`Could not access content_frame for iframe: ${iframeElement}`);
|
|
46
|
-
return null;
|
|
47
|
-
}
|
|
48
|
-
const allowsAsyncScripts = await (0, _checkFrameAllowsAsyncScripts.checkFrameAllowsAsyncScripts)(iframeElement);
|
|
49
|
-
const nestedIframes = await processFrameRecursive(contentFrame, processedRoots, iframeTimeoutMs);
|
|
50
|
-
return {
|
|
51
|
-
frame: contentFrame,
|
|
52
|
-
nestedIframes,
|
|
53
|
-
allowsAsyncScripts
|
|
54
|
-
};
|
|
55
|
-
};
|
|
56
|
-
const iframeNode = await Promise.race([processSingleIframe(i), new Promise((_, reject) => setTimeout(() => reject(new Error("Timeout")), iframeTimeoutMs))]);
|
|
57
|
-
if (iframeNode !== null) {
|
|
58
|
-
iframeNodes.push(iframeNode);
|
|
59
|
-
}
|
|
60
|
-
} catch (error) {
|
|
61
|
-
_Logger.logger.error(`Timeout processing iframe ${i} in context, skipping`);
|
|
62
|
-
continue;
|
|
63
|
-
}
|
|
64
|
-
}
|
|
65
|
-
} catch (error) {
|
|
66
|
-
_Logger.logger.error(`Error processing frames in context: ${error}`);
|
|
67
|
-
}
|
|
68
|
-
return iframeNodes;
|
|
69
|
-
}
|
|
70
|
-
function flattenIframeTree(iframeNodes) {
|
|
71
|
-
const flattened = [];
|
|
72
|
-
function flattenRecursive(nodes) {
|
|
73
|
-
for (const node of nodes) {
|
|
74
|
-
flattened.push(node);
|
|
75
|
-
if (node.nestedIframes && node.nestedIframes.length > 0) {
|
|
76
|
-
flattenRecursive(node.nestedIframes);
|
|
77
|
-
}
|
|
78
|
-
}
|
|
79
|
-
}
|
|
80
|
-
flattenRecursive(iframeNodes);
|
|
81
|
-
return flattened;
|
|
82
|
-
}
|
|
@@ -1,44 +0,0 @@
|
|
|
1
|
-
"use strict";
|
|
2
|
-
|
|
3
|
-
Object.defineProperty(exports, "__esModule", {
|
|
4
|
-
value: true
|
|
5
|
-
});
|
|
6
|
-
Object.defineProperty(exports, "ALL_IFRAMES_CSS_SELECTOR", {
|
|
7
|
-
enumerable: true,
|
|
8
|
-
get: function () {
|
|
9
|
-
return _constants.ALL_IFRAMES_CSS_SELECTOR;
|
|
10
|
-
}
|
|
11
|
-
});
|
|
12
|
-
Object.defineProperty(exports, "IFRAME_TAGS", {
|
|
13
|
-
enumerable: true,
|
|
14
|
-
get: function () {
|
|
15
|
-
return _constants.IFRAME_TAGS;
|
|
16
|
-
}
|
|
17
|
-
});
|
|
18
|
-
Object.defineProperty(exports, "IframeNode", {
|
|
19
|
-
enumerable: true,
|
|
20
|
-
get: function () {
|
|
21
|
-
return _findAllIframes.IframeNode;
|
|
22
|
-
}
|
|
23
|
-
});
|
|
24
|
-
Object.defineProperty(exports, "findAllIframes", {
|
|
25
|
-
enumerable: true,
|
|
26
|
-
get: function () {
|
|
27
|
-
return _findAllIframes.findAllIframes;
|
|
28
|
-
}
|
|
29
|
-
});
|
|
30
|
-
Object.defineProperty(exports, "findAllIframesList", {
|
|
31
|
-
enumerable: true,
|
|
32
|
-
get: function () {
|
|
33
|
-
return _findAllIframes.findAllIframesList;
|
|
34
|
-
}
|
|
35
|
-
});
|
|
36
|
-
Object.defineProperty(exports, "getContainerFrame", {
|
|
37
|
-
enumerable: true,
|
|
38
|
-
get: function () {
|
|
39
|
-
return _getContainerFrame.getContainerFrame;
|
|
40
|
-
}
|
|
41
|
-
});
|
|
42
|
-
var _findAllIframes = require("./findAllIframes");
|
|
43
|
-
var _getContainerFrame = require("./getContainerFrame");
|
|
44
|
-
var _constants = require("./constants");
|