@intuned/browser-dev 2.2.3-unify-sdks.21 → 2.2.3-unify-sdks.22
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/ai/export.d.ts +68 -45
- package/dist/ai/extractStructuredDataUsingAi.js +12 -8
- package/dist/ai/index.d.ts +68 -45
- package/dist/ai/isPageLoaded.js +5 -0
- package/dist/helpers/downloadFile.js +2 -2
- package/dist/helpers/export.d.ts +108 -112
- package/dist/helpers/gotoUrl.js +3 -4
- package/dist/helpers/index.d.ts +108 -112
- package/dist/helpers/sanitizeHtml.js +5 -4
- package/dist/helpers/scrollToLoadContent.js +1 -1
- package/dist/helpers/tests/testExtractMarkdown.spec.js +4 -6
- package/dist/playwright/export.d.js +5 -0
- package/dist/playwright/export.d.ts +229 -0
- package/dist/playwright/index.d.ts +229 -0
- package/dist/playwright/index.js +18 -0
- package/dist/playwright/staticExtractors/extractHelpers.js +170 -0
- package/dist/playwright/staticExtractors/getArrayUsingArrayExtractor.js +84 -0
- package/dist/playwright/staticExtractors/getObjectUsingObjectExtractor.js +45 -0
- package/dist/playwright/staticExtractors/index.js +37 -0
- package/dist/playwright/staticExtractors/types.js +26 -0
- package/package.json +7 -2
|
@@ -0,0 +1,229 @@
|
|
|
1
|
+
import type { Locator, Page } from "playwright-core";
|
|
2
|
+
import {
|
|
3
|
+
extractMarkdownFromPage,
|
|
4
|
+
extractStructuredDataFromPage,
|
|
5
|
+
} from "../ai-extractors";
|
|
6
|
+
import {
|
|
7
|
+
extractArrayFromPage,
|
|
8
|
+
extractObjectFromPage,
|
|
9
|
+
} from "../optimized-extractors";
|
|
10
|
+
|
|
11
|
+
/**
|
|
12
|
+
* --
|
|
13
|
+
* @interface
|
|
14
|
+
* @property selector - The selector string for the element.
|
|
15
|
+
* @property [type] - Optional. The type of the selector (`xpath` or `css`). Defaults to `css`.
|
|
16
|
+
*/
|
|
17
|
+
export interface ElementSelector {
|
|
18
|
+
selector: string;
|
|
19
|
+
type?: "xpath" | "css";
|
|
20
|
+
}
|
|
21
|
+
|
|
22
|
+
/**
|
|
23
|
+
* A record that maps each property name to the value selector used to extract that property's value from the page.
|
|
24
|
+
* you can provide a list of `ValueSelector` to provide a backup selector in case the first one fails.
|
|
25
|
+
* the primary selector is the first one in the list.
|
|
26
|
+
*/
|
|
27
|
+
export type ObjectExtractor = Record<string, ValueSelector | ValueSelector[]>;
|
|
28
|
+
|
|
29
|
+
/**
|
|
30
|
+
* --
|
|
31
|
+
* @interface
|
|
32
|
+
* @property selector - The selector string for the element.
|
|
33
|
+
* @property [type] - Optional. The type of the selector (`xpath` or `css`). Defaults to `css`.
|
|
34
|
+
*/
|
|
35
|
+
export interface ElementSelector {
|
|
36
|
+
selector: string;
|
|
37
|
+
type?: "xpath" | "css";
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
/**
|
|
41
|
+
* represents a dom element selector and the method to extract the value from the element.
|
|
42
|
+
*
|
|
43
|
+
* @interface
|
|
44
|
+
* @extends ElementSelector
|
|
45
|
+
* @property [selectionMethod] - Optional. The method for selecting the value. `all-text` selects all text content, `direct-text` selects the direct text content (does not include the text inside nested elements), and `propertyName` selects the value of the element attribute with that name.
|
|
46
|
+
* @property [regex] - Optional. A regex pattern and the index of the match group to extract from the value. When `matchIndex` is omitted, group `1` is used.
|
|
47
|
+
* @property [multiValue] - Optional. Whether the selector extracts multiple values, if set to true the returned value will be array of strings
|
|
48
|
+
*/
|
|
49
|
+
export interface ValueSelector extends ElementSelector {
|
|
50
|
+
selectionMethod?:
|
|
51
|
+
| "direct-text"
|
|
52
|
+
| "all-text"
|
|
53
|
+
| {
|
|
54
|
+
propertyName: string;
|
|
55
|
+
};
|
|
56
|
+
|
|
57
|
+
regex?: {
|
|
58
|
+
pattern: string;
|
|
59
|
+
matchIndex?: number;
|
|
60
|
+
};
|
|
61
|
+
multiValue?: boolean;
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
/**
|
|
65
|
+
* --
|
|
66
|
+
*
|
|
67
|
+
* @interface
|
|
68
|
+
* @property containerSelector - The selector(s) for the container elements of the list, all list items should be direct children of this container.
|
|
69
|
+
* @property propertySelectors - The selectors for the properties to extract. the values of the selector should be relative to the list item.
|
|
70
|
+
*
|
|
71
|
+
* **example:** if the list was:
|
|
72
|
+
*
|
|
73
|
+
* ```html
|
|
74
|
+
* <ul>
|
|
75
|
+
* <li>
|
|
76
|
+
* <div class="title">title 1</div>
|
|
77
|
+
* <div class="price">price 1</div>
|
|
78
|
+
* </li>
|
|
79
|
+
* <li>
|
|
80
|
+
* <div class="title">title 2</div>
|
|
81
|
+
* <div class="price">price 2</div>
|
|
82
|
+
* </li>
|
|
83
|
+
* </ul>
|
|
84
|
+
* ```
|
|
85
|
+
* the css relative selectors should be:
|
|
86
|
+
*
|
|
87
|
+
* title -> `.title`
|
|
88
|
+
*
|
|
89
|
+
* price -> `.price`
|
|
90
|
+
*
|
|
91
|
+
*/
|
|
92
|
+
export interface ListStaticExtractor {
|
|
93
|
+
containerSelector: ElementSelector | ElementSelector[];
|
|
94
|
+
propertySelectors: Record<string, ValueSelector | ValueSelector[]>;
|
|
95
|
+
}
|
|
96
|
+
|
|
97
|
+
export type ExtractObjectFromPageUsingSelectorsReturnType<
|
|
98
|
+
T extends ObjectExtractor
|
|
99
|
+
> = {
|
|
100
|
+
[K in keyof T]: T[K] extends {
|
|
101
|
+
multiValue: true;
|
|
102
|
+
}
|
|
103
|
+
? string[] | null
|
|
104
|
+
: string | null;
|
|
105
|
+
};
|
|
106
|
+
|
|
107
|
+
export type ExtractListObjectsUsingStaticSelectorsReturnType<
|
|
108
|
+
T extends ListStaticExtractor
|
|
109
|
+
> = {
|
|
110
|
+
[K in keyof T["propertySelectors"]]: T["propertySelectors"][K] extends {
|
|
111
|
+
multiValue: true;
|
|
112
|
+
}
|
|
113
|
+
? string[] | null
|
|
114
|
+
: string | null;
|
|
115
|
+
}[];
|
|
116
|
+
|
|
117
|
+
/**
|
|
118
|
+
* Extracts an object from a web page using the specified selectors.
|
|
119
|
+
*
|
|
120
|
+
* @param page - The Playwright Page object from which to extract the data.
|
|
121
|
+
* @param extractor - The object extractor with the selectors to use.
|
|
122
|
+
* @returns A promise that resolves to the extracted object.
|
|
123
|
+
*
|
|
124
|
+
* @example
|
|
125
|
+
* ```typescript extractObjectFromPageUsingSelectors
|
|
126
|
+
* import { extractObjectFromPageUsingSelectors, goto } from "@intuned/sdk/playwright";
|
|
127
|
+
*
|
|
128
|
+
* await goto(page, 'https://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html');
|
|
129
|
+
* const book = await extractObjectFromPageUsingSelectors(page, {
|
|
130
|
+
* name: {
|
|
131
|
+
* selector: "h1",
|
|
132
|
+
* selectionMethod: "all-text"
|
|
133
|
+
* },
|
|
134
|
+
* inStock: {
|
|
135
|
+
* selector: ".price_color",
|
|
136
|
+
* },
|
|
137
|
+
* imgUrl: {
|
|
138
|
+
* selector: "#product_gallery > div > div > div > img",
|
|
139
|
+
* selectionMethod: {
|
|
140
|
+
* propertyName: "src"
|
|
141
|
+
* }
|
|
142
|
+
* }
|
|
143
|
+
* })
|
|
144
|
+
*
|
|
145
|
+
* console.log(book)
|
|
146
|
+
*
|
|
147
|
+
* // output:
|
|
148
|
+
* // {
|
|
149
|
+
* // name: 'A Light in the Attic',
|
|
150
|
+
* // inStock: '£51.77',
|
|
151
|
+
* // imgUrl: '../../media/cache/fe/72/fe72f0532301ec28892ae79a629a293c.jpg'
|
|
152
|
+
* // }
|
|
153
|
+
*
|
|
154
|
+
* ```
|
|
155
|
+
*/
|
|
156
|
+
export declare function extractObjectFromPageUsingSelectors<
|
|
157
|
+
T extends ObjectExtractor
|
|
158
|
+
>(
|
|
159
|
+
page: Page,
|
|
160
|
+
extractor: T
|
|
161
|
+
): Promise<ExtractObjectFromPageUsingSelectorsReturnType<T>>;
|
|
162
|
+
|
|
163
|
+
/**
|
|
164
|
+
* Extracts a list of objects from a web page using the specified static selectors.
|
|
165
|
+
*
|
|
166
|
+
* @param page - The Playwright Page object from which to extract the data.
|
|
167
|
+
* @param listExtractor - The list static extractor with the selectors to use.
|
|
168
|
+
* @returns A promise that resolves to the extracted list of objects.
|
|
169
|
+
* @example
|
|
170
|
+
* ```typescript extractArrayFromPageUsingSelectors
|
|
171
|
+
* import { extractArrayFromPageUsingSelectors, goto } from "@intuned/sdk/playwright";
|
|
172
|
+
*
|
|
173
|
+
* await goto(page, 'https://books.toscrape.com/index.html');
|
|
174
|
+
* const books = await extractArrayFromPageUsingSelectors(page, {
|
|
175
|
+
* containerSelector: {
|
|
176
|
+
* selector: '//*[@id="default"]/div/div/div/div/section/div[2]/ol',
|
|
177
|
+
* type: "xpath"
|
|
178
|
+
* },
|
|
179
|
+
* propertySelectors: {
|
|
180
|
+
* name: {
|
|
181
|
+
* selector: "h3",
|
|
182
|
+
* },
|
|
183
|
+
* inStock: {
|
|
184
|
+
* selector: ".price_color",
|
|
185
|
+
* },
|
|
186
|
+
* imgUrl: {
|
|
187
|
+
* selector: "article > div.image_container > a > img",
|
|
188
|
+
* selectionMethod: {
|
|
189
|
+
* propertyName: "src"
|
|
190
|
+
* }
|
|
191
|
+
* }
|
|
192
|
+
* }
|
|
193
|
+
* })
|
|
194
|
+
*
|
|
195
|
+
* console.log(books)
|
|
196
|
+
*
|
|
197
|
+
* // output:
|
|
198
|
+
* // [
|
|
199
|
+
* // {
|
|
200
|
+
* // name: 'A Light in the ...',
|
|
201
|
+
* // inStock: '£51.77',
|
|
202
|
+
* // imgUrl: 'media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg'
|
|
203
|
+
* // },
|
|
204
|
+
* // {
|
|
205
|
+
* // name: 'Tipping the Velvet',
|
|
206
|
+
* // inStock: '£53.74',
|
|
207
|
+
* // imgUrl: 'media/cache/26/0c/260c6ae16bce31c8f8c95daddd9f4a1c.jpg'
|
|
208
|
+
* // },
|
|
209
|
+
* // {
|
|
210
|
+
* // name: 'Soumission',
|
|
211
|
+
* // inStock: '£50.10',
|
|
212
|
+
* // imgUrl: 'media/cache/3e/ef/3eef99c9d9adef34639f510662022830.jpg'
|
|
213
|
+
* // },
|
|
214
|
+
* // {
|
|
215
|
+
* // name: 'Sharp Objects',
|
|
216
|
+
* // inStock: '£47.82',
|
|
217
|
+
* // imgUrl: 'media/cache/32/51/3251cf3a3412f53f339e42cac2134093.jpg'
|
|
218
|
+
* // },
|
|
219
|
+
* // ...
|
|
220
|
+
* // ]
|
|
221
|
+
*
|
|
222
|
+
* ```
|
|
223
|
+
*/
|
|
224
|
+
export declare function extractArrayFromPageUsingSelectors<
|
|
225
|
+
T extends ListStaticExtractor
|
|
226
|
+
>(
|
|
227
|
+
page: Page,
|
|
228
|
+
listExtractor: T
|
|
229
|
+
): Promise<ExtractListObjectsUsingStaticSelectorsReturnType<T>>;
|
|
@@ -0,0 +1,229 @@
|
|
|
1
|
+
import type { Locator, Page } from "playwright-core";
|
|
2
|
+
import {
|
|
3
|
+
extractMarkdownFromPage,
|
|
4
|
+
extractStructuredDataFromPage,
|
|
5
|
+
} from "../ai-extractors";
|
|
6
|
+
import {
|
|
7
|
+
extractArrayFromPage,
|
|
8
|
+
extractObjectFromPage,
|
|
9
|
+
} from "../optimized-extractors";
|
|
10
|
+
|
|
11
|
+
/**
|
|
12
|
+
* --
|
|
13
|
+
* @interface
|
|
14
|
+
* @property selector - The selector string for the element.
|
|
15
|
+
* @property [type] - Optional. The type of the selector (`xpath` or `css`). Defaults to `css`.
|
|
16
|
+
*/
|
|
17
|
+
export interface ElementSelector {
|
|
18
|
+
selector: string;
|
|
19
|
+
type?: "xpath" | "css";
|
|
20
|
+
}
|
|
21
|
+
|
|
22
|
+
/**
|
|
23
|
+
* A record that maps each property name to the value selector used to extract that property's value from the page.
|
|
24
|
+
* you can provide a list of `ValueSelector` to provide a backup selector in case the first one fails.
|
|
25
|
+
* the primary selector is the first one in the list.
|
|
26
|
+
*/
|
|
27
|
+
export type ObjectExtractor = Record<string, ValueSelector | ValueSelector[]>;
|
|
28
|
+
|
|
29
|
+
/**
|
|
30
|
+
* --
|
|
31
|
+
* @interface
|
|
32
|
+
* @property selector - The selector string for the element.
|
|
33
|
+
* @property [type] - Optional. The type of the selector (`xpath` or `css`). Defaults to `css`.
|
|
34
|
+
*/
|
|
35
|
+
export interface ElementSelector {
|
|
36
|
+
selector: string;
|
|
37
|
+
type?: "xpath" | "css";
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
/**
|
|
41
|
+
* represents a dom element selector and the method to extract the value from the element.
|
|
42
|
+
*
|
|
43
|
+
* @interface
|
|
44
|
+
* @extends ElementSelector
|
|
45
|
+
* @property [selectionMethod] - Optional. The method for selecting the value. `all-text` selects all text content, `direct-text` selects the direct text content (does not include the text inside nested elements), and `propertyName` selects the value of the element attribute with that name.
|
|
46
|
+
* @property [regex] - Optional. A regex pattern and the index of the match group to extract from the value. When `matchIndex` is omitted, group `1` is used.
|
|
47
|
+
* @property [multiValue] - Optional. Whether the selector extracts multiple values, if set to true the returned value will be array of strings
|
|
48
|
+
*/
|
|
49
|
+
export interface ValueSelector extends ElementSelector {
|
|
50
|
+
selectionMethod?:
|
|
51
|
+
| "direct-text"
|
|
52
|
+
| "all-text"
|
|
53
|
+
| {
|
|
54
|
+
propertyName: string;
|
|
55
|
+
};
|
|
56
|
+
|
|
57
|
+
regex?: {
|
|
58
|
+
pattern: string;
|
|
59
|
+
matchIndex?: number;
|
|
60
|
+
};
|
|
61
|
+
multiValue?: boolean;
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
/**
|
|
65
|
+
* --
|
|
66
|
+
*
|
|
67
|
+
* @interface
|
|
68
|
+
* @property containerSelector - The selector(s) for the container elements of the list, all list items should be direct children of this container.
|
|
69
|
+
* @property propertySelectors - The selectors for the properties to extract. the values of the selector should be relative to the list item.
|
|
70
|
+
*
|
|
71
|
+
* **example:** if the list was:
|
|
72
|
+
*
|
|
73
|
+
* ```html
|
|
74
|
+
* <ul>
|
|
75
|
+
* <li>
|
|
76
|
+
* <div class="title">title 1</div>
|
|
77
|
+
* <div class="price">price 1</div>
|
|
78
|
+
* </li>
|
|
79
|
+
* <li>
|
|
80
|
+
* <div class="title">title 2</div>
|
|
81
|
+
* <div class="price">price 2</div>
|
|
82
|
+
* </li>
|
|
83
|
+
* </ul>
|
|
84
|
+
* ```
|
|
85
|
+
* the css relative selectors should be:
|
|
86
|
+
*
|
|
87
|
+
* title -> `.title`
|
|
88
|
+
*
|
|
89
|
+
* price -> `.price`
|
|
90
|
+
*
|
|
91
|
+
*/
|
|
92
|
+
export interface ListStaticExtractor {
|
|
93
|
+
containerSelector: ElementSelector | ElementSelector[];
|
|
94
|
+
propertySelectors: Record<string, ValueSelector | ValueSelector[]>;
|
|
95
|
+
}
|
|
96
|
+
|
|
97
|
+
export type ExtractObjectFromPageUsingSelectorsReturnType<
|
|
98
|
+
T extends ObjectExtractor
|
|
99
|
+
> = {
|
|
100
|
+
[K in keyof T]: T[K] extends {
|
|
101
|
+
multiValue: true;
|
|
102
|
+
}
|
|
103
|
+
? string[] | null
|
|
104
|
+
: string | null;
|
|
105
|
+
};
|
|
106
|
+
|
|
107
|
+
export type ExtractListObjectsUsingStaticSelectorsReturnType<
|
|
108
|
+
T extends ListStaticExtractor
|
|
109
|
+
> = {
|
|
110
|
+
[K in keyof T["propertySelectors"]]: T["propertySelectors"][K] extends {
|
|
111
|
+
multiValue: true;
|
|
112
|
+
}
|
|
113
|
+
? string[] | null
|
|
114
|
+
: string | null;
|
|
115
|
+
}[];
|
|
116
|
+
|
|
117
|
+
/**
|
|
118
|
+
* Extracts an object from a web page using the specified selectors.
|
|
119
|
+
*
|
|
120
|
+
* @param page - The Playwright Page object from which to extract the data.
|
|
121
|
+
* @param extractor - The object extractor with the selectors to use.
|
|
122
|
+
* @returns A promise that resolves to the extracted object.
|
|
123
|
+
*
|
|
124
|
+
* @example
|
|
125
|
+
* ```typescript extractObjectFromPageUsingSelectors
|
|
126
|
+
* import { extractObjectFromPageUsingSelectors, goto } from "@intuned/sdk/playwright";
|
|
127
|
+
*
|
|
128
|
+
* await goto(page, 'https://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html');
|
|
129
|
+
* const book = await extractObjectFromPageUsingSelectors(page, {
|
|
130
|
+
* name: {
|
|
131
|
+
* selector: "h1",
|
|
132
|
+
* selectionMethod: "all-text"
|
|
133
|
+
* },
|
|
134
|
+
* inStock: {
|
|
135
|
+
* selector: ".price_color",
|
|
136
|
+
* },
|
|
137
|
+
* imgUrl: {
|
|
138
|
+
* selector: "#product_gallery > div > div > div > img",
|
|
139
|
+
* selectionMethod: {
|
|
140
|
+
* propertyName: "src"
|
|
141
|
+
* }
|
|
142
|
+
* }
|
|
143
|
+
* })
|
|
144
|
+
*
|
|
145
|
+
* console.log(book)
|
|
146
|
+
*
|
|
147
|
+
* // output:
|
|
148
|
+
* // {
|
|
149
|
+
* // name: 'A Light in the Attic',
|
|
150
|
+
* // inStock: '£51.77',
|
|
151
|
+
* // imgUrl: '../../media/cache/fe/72/fe72f0532301ec28892ae79a629a293c.jpg'
|
|
152
|
+
* // }
|
|
153
|
+
*
|
|
154
|
+
* ```
|
|
155
|
+
*/
|
|
156
|
+
export declare function extractObjectFromPageUsingSelectors<
|
|
157
|
+
T extends ObjectExtractor
|
|
158
|
+
>(
|
|
159
|
+
page: Page,
|
|
160
|
+
extractor: T
|
|
161
|
+
): Promise<ExtractObjectFromPageUsingSelectorsReturnType<T>>;
|
|
162
|
+
|
|
163
|
+
/**
|
|
164
|
+
* Extracts a list of objects from a web page using the specified static selectors.
|
|
165
|
+
*
|
|
166
|
+
* @param page - The Playwright Page object from which to extract the data.
|
|
167
|
+
* @param listExtractor - The list static extractor with the selectors to use.
|
|
168
|
+
* @returns A promise that resolves to the extracted list of objects.
|
|
169
|
+
* @example
|
|
170
|
+
* ```typescript extractArrayFromPageUsingSelectors
|
|
171
|
+
* import { extractArrayFromPageUsingSelectors, goto } from "@intuned/sdk/playwright";
|
|
172
|
+
*
|
|
173
|
+
* await goto(page, 'https://books.toscrape.com/index.html');
|
|
174
|
+
* const books = await extractArrayFromPageUsingSelectors(page, {
|
|
175
|
+
* containerSelector: {
|
|
176
|
+
* selector: '//*[@id="default"]/div/div/div/div/section/div[2]/ol',
|
|
177
|
+
* type: "xpath"
|
|
178
|
+
* },
|
|
179
|
+
* propertySelectors: {
|
|
180
|
+
* name: {
|
|
181
|
+
* selector: "h3",
|
|
182
|
+
* },
|
|
183
|
+
* inStock: {
|
|
184
|
+
* selector: ".price_color",
|
|
185
|
+
* },
|
|
186
|
+
* imgUrl: {
|
|
187
|
+
* selector: "article > div.image_container > a > img",
|
|
188
|
+
* selectionMethod: {
|
|
189
|
+
* propertyName: "src"
|
|
190
|
+
* }
|
|
191
|
+
* }
|
|
192
|
+
* }
|
|
193
|
+
* })
|
|
194
|
+
*
|
|
195
|
+
* console.log(books)
|
|
196
|
+
*
|
|
197
|
+
* // output:
|
|
198
|
+
* // [
|
|
199
|
+
* // {
|
|
200
|
+
* // name: 'A Light in the ...',
|
|
201
|
+
* // inStock: '£51.77',
|
|
202
|
+
* // imgUrl: 'media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg'
|
|
203
|
+
* // },
|
|
204
|
+
* // {
|
|
205
|
+
* // name: 'Tipping the Velvet',
|
|
206
|
+
* // inStock: '£53.74',
|
|
207
|
+
* // imgUrl: 'media/cache/26/0c/260c6ae16bce31c8f8c95daddd9f4a1c.jpg'
|
|
208
|
+
* // },
|
|
209
|
+
* // {
|
|
210
|
+
* // name: 'Soumission',
|
|
211
|
+
* // inStock: '£50.10',
|
|
212
|
+
* // imgUrl: 'media/cache/3e/ef/3eef99c9d9adef34639f510662022830.jpg'
|
|
213
|
+
* // },
|
|
214
|
+
* // {
|
|
215
|
+
* // name: 'Sharp Objects',
|
|
216
|
+
* // inStock: '£47.82',
|
|
217
|
+
* // imgUrl: 'media/cache/32/51/3251cf3a3412f53f339e42cac2134093.jpg'
|
|
218
|
+
* // },
|
|
219
|
+
* // ...
|
|
220
|
+
* // ]
|
|
221
|
+
*
|
|
222
|
+
* ```
|
|
223
|
+
*/
|
|
224
|
+
export declare function extractArrayFromPageUsingSelectors<
|
|
225
|
+
T extends ListStaticExtractor
|
|
226
|
+
>(
|
|
227
|
+
page: Page,
|
|
228
|
+
listExtractor: T
|
|
229
|
+
): Promise<ExtractListObjectsUsingStaticSelectorsReturnType<T>>;
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
|
|
3
|
+
Object.defineProperty(exports, "__esModule", {
|
|
4
|
+
value: true
|
|
5
|
+
});
|
|
6
|
+
Object.defineProperty(exports, "extractArrayFromPageUsingSelectors", {
|
|
7
|
+
enumerable: true,
|
|
8
|
+
get: function () {
|
|
9
|
+
return _staticExtractors.extractArrayFromPageUsingSelectors;
|
|
10
|
+
}
|
|
11
|
+
});
|
|
12
|
+
Object.defineProperty(exports, "extractObjectFromPageUsingSelectors", {
|
|
13
|
+
enumerable: true,
|
|
14
|
+
get: function () {
|
|
15
|
+
return _staticExtractors.extractObjectFromPageUsingSelectors;
|
|
16
|
+
}
|
|
17
|
+
});
|
|
18
|
+
var _staticExtractors = require("./staticExtractors");
|
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
|
|
3
|
+
Object.defineProperty(exports, "__esModule", {
|
|
4
|
+
value: true
|
|
5
|
+
});
|
|
6
|
+
exports.compressStringSpaces = compressStringSpaces;
|
|
7
|
+
exports.injectExtractorsHelperFunctions = injectExtractorsHelperFunctions;
|
|
8
|
+
exports.selectArrayValuesUsingRelativeSelector = selectArrayValuesUsingRelativeSelector;
|
|
9
|
+
exports.selectLocatorsUsingXpath = selectLocatorsUsingXpath;
|
|
10
|
+
exports.selectMultiValueUsingRelativeSelector = selectMultiValueUsingRelativeSelector;
|
|
11
|
+
exports.selectValueUsingRelativeSelector = selectValueUsingRelativeSelector;
|
|
12
|
+
exports.splitContainerIntoListLocators = splitContainerIntoListLocators;
|
|
13
|
+
/**
 * Installs the extraction helpers on the page's `window` object so that later
 * `evaluate` callbacks can call them by name.
 *
 * Helpers installed (all run inside the browser context):
 *  - `window.cleanValue(regex, value)`: collapses whitespace and optionally applies a regex.
 *  - `window.getValueUsingNodeAndSelector({ selector, element })`: reads the raw value of one element.
 *  - `window.getValueUsingSelector({ selector, relativeElement, propertyName })`:
 *    resolves the first matching element and returns `{ value, warnings }`.
 *  - `window.getMultiValueUsingSelector({ selector, relativeElement })`:
 *    returns the cleaned values of every matching element.
 *
 * @param page - Object exposing `evaluate(fn)`; the callback is executed in the page.
 * @returns A promise that settles once `page.evaluate` has completed.
 */
async function injectExtractorsHelperFunctions(page) {
  await page.evaluate(() => {
    const generateMultiValueWarningMessage = ({
      propertyName,
      nodesLength
    }) => {
      return `The selector for the property '${propertyName}' is matching ${nodesLength} elements. Please ensure that the 'multiValue' flag is enabled to retrieve all matching elements. Without this flag, only the first element will be processed.`;
    };

    // Evaluates an XPath into an ordered snapshot. Without a relative element a
    // leading "/" is prepended so the expression is resolved from the document
    // root. The caller's selector object is left untouched.
    const evaluateXpathSnapshot = (xpath, relativeElement) => {
      const expression = !relativeElement && !xpath.startsWith("/") ? `/${xpath}` : xpath;
      return document.evaluate(expression, relativeElement ? relativeElement : document.documentElement, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null);
    };

    // Runs a CSS selector against the relative element when given, otherwise the document.
    const queryCss = (css, relativeElement) => relativeElement ? relativeElement.querySelectorAll(css) : document.querySelectorAll(css);

    window["cleanValue"] = (regex, value) => {
      // A missing raw value (e.g. `getAttribute` returned null) has nothing to
      // clean; report it as null instead of throwing on `.replace`.
      if (value === null || value === undefined) return null;
      const compressStringSpaces = str => str.replace(/\s+/g, " ").trim();
      const text = compressStringSpaces(value);
      if (!regex) return text;
      // Capture group 1 is used unless the caller picked another index.
      const regexMatchIndex = regex.matchIndex ?? 1;
      const match = text.match(new RegExp(regex.pattern));
      const valueFromRegex = match?.[regexMatchIndex];
      // An absent or empty match is reported as null.
      return compressStringSpaces(valueFromRegex ?? "") || null;
    };

    window["getValueUsingNodeAndSelector"] = ({
      selector,
      element
    }) => {
      // `selectionMethod` is optional; an omitted method reads the full text
      // content instead of dereferencing `undefined.propertyName`.
      const selectionMethod = selector.selectionMethod ?? "all-text";
      if (selectionMethod === "all-text") {
        return element.textContent;
      }
      if (selectionMethod === "direct-text") {
        // Only the element's own text nodes, not text inside nested elements.
        return Array.from(element.childNodes).filter(child => child.nodeType === Node.TEXT_NODE).map(child => child.textContent ?? "").join("");
      }
      return element.getAttribute(selectionMethod.propertyName);
    };

    window["getValueUsingSelector"] = ({
      selector,
      relativeElement,
      propertyName
    }) => {
      let node = null;
      const warningMessages = [];
      const noValue = () => ({
        value: null,
        warnings: warningMessages
      });
      if (selector.selector.trim() === "") {
        // An empty selector targets the relative element itself (or the document root).
        node = relativeElement ?? document.documentElement;
      } else if (selector.type === "xpath") {
        const nodes = evaluateXpathSnapshot(selector.selector, relativeElement);
        if (nodes.snapshotLength === 0) return noValue();
        if (nodes.snapshotLength > 1) {
          warningMessages.push(generateMultiValueWarningMessage({
            propertyName,
            nodesLength: nodes.snapshotLength
          }));
        }
        node = nodes.snapshotItem(0);
      } else {
        const nodes = queryCss(selector.selector, relativeElement);
        if (nodes.length === 0) return noValue();
        if (nodes.length > 1) {
          warningMessages.push(generateMultiValueWarningMessage({
            propertyName,
            nodesLength: nodes.length
          }));
        }
        node = nodes[0];
      }
      if (!node) return noValue();
      const result = window["getValueUsingNodeAndSelector"]({
        selector,
        element: node
      });
      // Null and empty raw values are both reported as "no value".
      if (!result) return noValue();
      return {
        value: window["cleanValue"](selector.regex, result),
        warnings: warningMessages
      };
    };

    window["getMultiValueUsingSelector"] = ({
      selector,
      relativeElement
    }) => {
      let nodes = [];
      if (selector.selector.trim() === "") {
        nodes = [relativeElement ?? document.documentElement];
      } else if (selector.type === "xpath") {
        const nodesSnapshot = evaluateXpathSnapshot(selector.selector, relativeElement);
        for (let i = 0; i < nodesSnapshot.snapshotLength; i++) {
          const node = nodesSnapshot.snapshotItem(i);
          if (node) nodes.push(node);
        }
      } else {
        nodes = Array.from(queryCss(selector.selector, relativeElement));
      }
      // Elements that yield no value (null raw value or failed regex) are dropped.
      return nodes.map(node => window["getValueUsingNodeAndSelector"]({
        selector,
        element: node
      })).map(result => window["cleanValue"](selector.regex, result)).filter(v => v !== null && v !== undefined);
    };
  });
}
|
|
131
|
+
/**
 * Resolves the direct children of a container locator.
 *
 * @param locator - Container object exposing `locator(selector)`, whose result exposes `all()`.
 * @returns A promise for whatever `all()` yields for the `:scope > *` selector.
 */
async function splitContainerIntoListLocators(locator) {
  const directChildren = locator.locator(":scope > *");
  return directChildren.all();
}
|
|
135
|
+
/**
 * Collapses every run of whitespace into a single space and strips
 * leading/trailing whitespace.
 *
 * @param str - The string to normalise.
 * @returns The normalised string.
 */
function compressStringSpaces(str) {
  const collapsed = str.replace(/\s+/g, " ");
  return collapsed.trim();
}
|
|
138
|
+
/**
 * Applies one value selector to every locator in a list, concurrently.
 *
 * The multi-value extractor is used when `selectorInfo.multiValue` is truthy,
 * otherwise the single-value extractor.
 *
 * @param locators - The list-item locators to extract from.
 * @param selectorInfo - The value selector applied relative to each locator.
 * @returns The extracted values in locator order when at least one of them is
 *   truthy; otherwise an array of `null` with one entry per locator.
 */
async function selectArrayValuesUsingRelativeSelector(locators, selectorInfo) {
  const extractOne = selectorInfo.multiValue ? selectMultiValueUsingRelativeSelector : selectValueUsingRelativeSelector;
  const extracted = await Promise.all(locators.map(item => extractOne(item, selectorInfo)));
  for (const candidate of extracted) {
    // A single usable value is enough to keep the whole result set.
    if (candidate) return extracted;
  }
  return locators.map(() => null);
}
|
|
148
|
+
/**
 * Extracts every value matched by `valueSelector` relative to the element
 * behind `locator`.
 *
 * The helper functions are (re)installed on the locator's page first, then
 * `window.getMultiValueUsingSelector` is invoked inside the page.
 *
 * @param locator - Locator whose element is used as the relative root.
 * @param valueSelector - The selector passed through to the in-page helper.
 * @returns A promise for the result of the in-page helper.
 */
async function selectMultiValueUsingRelativeSelector(locator, valueSelector) {
  const owningPage = locator.page();
  await injectExtractorsHelperFunctions(owningPage);
  // This callback runs in the page, so it may only use its own arguments and page globals.
  const collectValues = (node, selectorArg) => window["getMultiValueUsingSelector"]({
    relativeElement: node,
    selector: selectorArg
  });
  return locator.evaluate(collectValues, valueSelector);
}
|
|
157
|
+
/**
 * Extracts a single value using `valueSelector` relative to the element
 * behind `locator`.
 *
 * The helper functions are (re)installed on the locator's page first, then
 * `window.getValueUsingSelector` is invoked inside the page. Only the `value`
 * field of its result is returned.
 *
 * @param locator - Locator whose element is used as the relative root.
 * @param valueSelector - The selector passed through to the in-page helper.
 * @returns A promise for the `value` field produced by the in-page helper.
 */
async function selectValueUsingRelativeSelector(locator, valueSelector) {
  await injectExtractorsHelperFunctions(locator.page());
  // This callback runs in the page, so it may only use its own arguments and page globals.
  const readValue = (node, selectorArg) => {
    const outcome = window["getValueUsingSelector"]({
      relativeElement: node,
      selector: selectorArg
    });
    return outcome.value;
  };
  return locator.evaluate(readValue, valueSelector);
}
|
|
167
|
+
/**
 * Resolves all locators matching an XPath expression on a page.
 *
 * Any leading slashes are stripped from `xpath` and replaced by a single `//`
 * prefix before the expression is handed to `page.locator`.
 *
 * @param page - Object exposing `locator(selector)`, whose result exposes `all()`.
 * @param xpath - The XPath expression, with or without leading slashes.
 * @returns A promise for whatever `all()` yields for the normalised expression.
 */
async function selectLocatorsUsingXpath(page, xpath) {
  const withoutLeadingSlashes = xpath.replace(/^(\/+)/, "");
  const normalisedXpath = `//${withoutLeadingSlashes}`;
  const matches = await page.locator(normalisedXpath).all();
  return matches;
}
|