@d-zero/beholder 0.1.28 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +15 -0
- package/README.md +172 -477
- package/dist/debug.d.ts +4 -1
- package/dist/debug.js +5 -2
- package/dist/dom-evaluation.d.ts +72 -14
- package/dist/dom-evaluation.js +169 -43
- package/dist/index.d.ts +20 -3
- package/dist/index.js +15 -3
- package/dist/is-error.d.ts +8 -0
- package/dist/is-error.js +10 -0
- package/dist/keyword-check.d.ts +5 -3
- package/dist/keyword-check.js +5 -3
- package/dist/parse-url.d.ts +14 -0
- package/dist/parse-url.js +23 -0
- package/dist/scraper.d.ts +39 -13
- package/dist/scraper.js +300 -263
- package/dist/types.d.ts +286 -214
- package/dist/types.js +6 -0
- package/package.json +7 -10
- package/src/debug.ts +5 -2
- package/src/dom-evaluation.ts +195 -65
- package/src/index.ts +27 -3
- package/src/is-error.spec.ts +33 -0
- package/src/is-error.ts +10 -0
- package/src/keyword-check.spec.ts +45 -4
- package/src/keyword-check.ts +5 -3
- package/src/parse-url.spec.ts +35 -0
- package/src/parse-url.ts +26 -0
- package/src/scraper.ts +338 -300
- package/src/types.ts +345 -258
- package/tsconfig.tsbuildinfo +1 -1
- package/dist/events.d.ts +0 -32
- package/dist/events.js +0 -15
- package/dist/fetch-destination.d.ts +0 -8
- package/dist/fetch-destination.js +0 -145
- package/dist/net-timeout-error.d.ts +0 -3
- package/dist/net-timeout-error.js +0 -3
- package/dist/sub-process-runner.d.ts +0 -12
- package/dist/sub-process-runner.js +0 -180
- package/dist/sub-process.d.ts +0 -1
- package/dist/sub-process.js +0 -67
- package/dist/utils.d.ts +0 -16
- package/dist/utils.js +0 -69
- package/src/events.ts +0 -21
- package/src/fetch-destination.ts +0 -173
- package/src/net-timeout-error.ts +0 -3
- package/src/sub-process-runner.ts +0 -220
- package/src/sub-process.ts +0 -86
- package/src/utils.ts +0 -89
package/src/types.ts
CHANGED
|
@@ -1,341 +1,428 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
1
|
+
/**
|
|
2
|
+
* Beholder type definitions for the page-level scraper.
|
|
3
|
+
* @see {@link ./scraper.ts} for the Scraper class that produces these types
|
|
4
|
+
* @see {@link ./dom-evaluation.ts} for DOM extraction functions (anchors, images, meta)
|
|
5
|
+
* @module
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
export type { ExURL, ParseURLOptions } from '@d-zero/shared/parse-url';
|
|
9
|
+
export type { CompressType } from '@d-zero/shared/detect-compress';
|
|
10
|
+
export type { CDNType } from '@d-zero/shared/detect-cdn';
|
|
11
|
+
|
|
12
|
+
import type { CDNType } from '@d-zero/shared/detect-cdn';
|
|
13
|
+
import type { CompressType } from '@d-zero/shared/detect-compress';
|
|
14
|
+
import type { ExURL } from '@d-zero/shared/parse-url';
|
|
15
|
+
|
|
16
|
+
/**
|
|
17
|
+
* Scraped page data returned by the scraper after successfully processing a page.
|
|
18
|
+
*/
|
|
19
|
+
export type PageData = {
|
|
20
|
+
/** The parsed URL of the page. */
|
|
5
21
|
url: ExURL;
|
|
6
|
-
};
|
|
7
22
|
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
error: {
|
|
11
|
-
name: string;
|
|
12
|
-
message: string;
|
|
13
|
-
stack?: string;
|
|
14
|
-
};
|
|
15
|
-
};
|
|
23
|
+
/** Chain of redirect URLs traversed to reach the final destination. */
|
|
24
|
+
redirectPaths: string[];
|
|
16
25
|
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
reason: {
|
|
20
|
-
matchedText: string;
|
|
21
|
-
excludeKeywords: string[];
|
|
22
|
-
};
|
|
23
|
-
};
|
|
24
|
-
resourceResponse: ScrapeEvent & {
|
|
25
|
-
log: NetworkLog;
|
|
26
|
-
resource: Omit<Resource, 'uid'>;
|
|
27
|
-
};
|
|
28
|
-
scrapeEnd: ScrapeEvent & {
|
|
29
|
-
timestamp: number;
|
|
30
|
-
result: PageData;
|
|
31
|
-
};
|
|
32
|
-
destroyed: Omit<ScrapeEvent, 'url'>;
|
|
33
|
-
error: ScrapeErrorEvent;
|
|
34
|
-
changePhase: ChangePhaseEvent;
|
|
35
|
-
};
|
|
26
|
+
/** Whether this page is a target page (internal and within the crawl scope). */
|
|
27
|
+
isTarget: boolean;
|
|
36
28
|
|
|
37
|
-
|
|
38
|
-
pid: number;
|
|
39
|
-
name:
|
|
40
|
-
| 'scrapeStart'
|
|
41
|
-
| 'launchBrowser'
|
|
42
|
-
| 'touchHead'
|
|
43
|
-
| 'touchHeadTimeout'
|
|
44
|
-
| 'newPage'
|
|
45
|
-
| 'openPage'
|
|
46
|
-
| 'loadDOMContent'
|
|
47
|
-
| 'waitNetworkIdleZero'
|
|
48
|
-
| 'getHTML'
|
|
49
|
-
| 'setViewport'
|
|
50
|
-
| 'scrollToBottom'
|
|
51
|
-
| 'getImages'
|
|
52
|
-
| 'getAnchors'
|
|
53
|
-
| 'getMeta'
|
|
54
|
-
| 'ignoreAndSkip'
|
|
55
|
-
| 'scrapeEnd'
|
|
56
|
-
| 'beforeDestroy'
|
|
57
|
-
| 'destroyed';
|
|
58
|
-
url: ExURL | null;
|
|
29
|
+
/** Whether this page is external to the crawl scope. */
|
|
59
30
|
isExternal: boolean;
|
|
60
|
-
message: string;
|
|
61
|
-
};
|
|
62
31
|
|
|
63
|
-
|
|
32
|
+
/** HTTP status code of the response. */
|
|
33
|
+
status: number;
|
|
64
34
|
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
url: ExURL;
|
|
68
|
-
isExternal: boolean;
|
|
69
|
-
isGettingImages: boolean;
|
|
70
|
-
excludeKeywords: string[];
|
|
71
|
-
executablePath: string | null;
|
|
72
|
-
isSkip: boolean;
|
|
73
|
-
isTitleOnly: boolean;
|
|
74
|
-
screenshot: string | null;
|
|
75
|
-
} & Required<ParseURLOptions>;
|
|
76
|
-
destroy: void;
|
|
77
|
-
};
|
|
35
|
+
/** HTTP status text of the response. */
|
|
36
|
+
statusText: string;
|
|
78
37
|
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
};
|
|
38
|
+
/** The Content-Type header value, or `null` if unavailable. */
|
|
39
|
+
contentType: string | null;
|
|
82
40
|
|
|
83
|
-
|
|
84
|
-
|
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
41
|
+
/** The Content-Length header value in bytes, or `null` if unavailable. */
|
|
42
|
+
contentLength: number | null;
|
|
43
|
+
|
|
44
|
+
/** Raw HTTP response headers, or `null` if unavailable. */
|
|
45
|
+
responseHeaders: Record<string, string | string[] | undefined> | null;
|
|
46
|
+
|
|
47
|
+
/** Extracted metadata from the page (title, description, OGP, etc.). */
|
|
48
|
+
meta: Meta;
|
|
49
|
+
|
|
50
|
+
/** List of anchor elements found on the page. */
|
|
51
|
+
anchorList: AnchorData[];
|
|
52
|
+
|
|
53
|
+
/** List of image elements found on the page. */
|
|
54
|
+
imageList: ImageElement[];
|
|
55
|
+
|
|
56
|
+
/** HTML snapshot of the rendered DOM. */
|
|
57
|
+
html: string;
|
|
58
|
+
|
|
59
|
+
/** Always `false` for successfully scraped pages. See {@link SkippedPageData} for skipped pages. */
|
|
60
|
+
isSkipped: false;
|
|
98
61
|
};
|
|
99
62
|
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
63
|
+
/**
|
|
64
|
+
* Information about an image element found on a page.
|
|
65
|
+
*/
|
|
66
|
+
export type ImageElement = {
|
|
67
|
+
/** The `src` attribute value of the image element. */
|
|
68
|
+
src: string;
|
|
105
69
|
|
|
106
|
-
/**
|
|
107
|
-
|
|
108
|
-
*/
|
|
109
|
-
_originUrlString: string;
|
|
70
|
+
/** The `currentSrc` property value (the actual URL loaded by the browser). */
|
|
71
|
+
currentSrc: string;
|
|
110
72
|
|
|
111
|
-
/**
|
|
112
|
-
|
|
113
|
-
*/
|
|
114
|
-
withoutHash: string;
|
|
73
|
+
/** The `alt` attribute value of the image element. */
|
|
74
|
+
alt: string;
|
|
115
75
|
|
|
116
|
-
/**
|
|
117
|
-
|
|
118
|
-
*/
|
|
119
|
-
withoutHashAndAuth: string;
|
|
76
|
+
/** The CSS layout width of the image in pixels. */
|
|
77
|
+
width: number;
|
|
120
78
|
|
|
121
|
-
/**
|
|
122
|
-
|
|
123
|
-
* - case-insensitive
|
|
124
|
-
*/
|
|
125
|
-
protocol: string;
|
|
79
|
+
/** The CSS layout height of the image in pixels. */
|
|
80
|
+
height: number;
|
|
126
81
|
|
|
127
|
-
/**
|
|
128
|
-
|
|
129
|
-
*/
|
|
130
|
-
isHTTP: boolean;
|
|
82
|
+
/** The intrinsic width of the image in pixels. */
|
|
83
|
+
naturalWidth: number;
|
|
131
84
|
|
|
132
|
-
/**
|
|
133
|
-
|
|
134
|
-
*/
|
|
135
|
-
isSecure: boolean;
|
|
85
|
+
/** The intrinsic height of the image in pixels. */
|
|
86
|
+
naturalHeight: number;
|
|
136
87
|
|
|
137
|
-
/**
|
|
138
|
-
|
|
139
|
-
*/
|
|
140
|
-
username: string | null;
|
|
88
|
+
/** Whether the image uses lazy loading (`loading="lazy"` or IntersectionObserver). */
|
|
89
|
+
isLazy: boolean;
|
|
141
90
|
|
|
142
|
-
/**
|
|
143
|
-
|
|
144
|
-
*/
|
|
145
|
-
password: string | null;
|
|
91
|
+
/** The viewport width at which this image was captured. */
|
|
92
|
+
viewportWidth: number;
|
|
146
93
|
|
|
147
|
-
/**
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
* - case-insensitive
|
|
151
|
-
* - encode non-ASCII characters
|
|
152
|
-
* - without port number
|
|
153
|
-
*/
|
|
154
|
-
hostname: string;
|
|
94
|
+
/** The outer HTML source code of the image element. */
|
|
95
|
+
sourceCode: string;
|
|
96
|
+
};
|
|
155
97
|
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
98
|
+
/**
|
|
99
|
+
* Data for a page that was skipped during crawling due to keyword or path exclusion.
|
|
100
|
+
*/
|
|
101
|
+
export type SkippedPageData = {
|
|
102
|
+
/** Always `true` for skipped pages. */
|
|
103
|
+
isSkipped: true;
|
|
160
104
|
|
|
161
|
-
/**
|
|
162
|
-
|
|
163
|
-
*
|
|
164
|
-
* It is only `/` if pathname is empty
|
|
165
|
-
*
|
|
166
|
-
* - case-sensitive
|
|
167
|
-
*/
|
|
168
|
-
pathname: string | null;
|
|
105
|
+
/** The URL of the skipped page. */
|
|
106
|
+
url: ExURL;
|
|
169
107
|
|
|
170
|
-
/**
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
108
|
+
/** The reason the page was skipped, with match details. */
|
|
109
|
+
matched:
|
|
110
|
+
| {
|
|
111
|
+
/** Skipped due to a keyword match in the page content. */
|
|
112
|
+
type: 'keyword';
|
|
113
|
+
/** The text that matched the exclusion keyword. */
|
|
114
|
+
text: string;
|
|
115
|
+
/** The exclusion keywords that triggered the skip. */
|
|
116
|
+
excludeKeywords: string[];
|
|
117
|
+
}
|
|
118
|
+
| {
|
|
119
|
+
/** Skipped due to a URL path pattern match. */
|
|
120
|
+
type: 'path';
|
|
121
|
+
/** The exclusion patterns that triggered the skip. */
|
|
122
|
+
excludes: string[];
|
|
123
|
+
};
|
|
124
|
+
};
|
|
174
125
|
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
126
|
+
/**
|
|
127
|
+
* A network resource (CSS, JS, image, etc.) captured during page scraping.
|
|
128
|
+
*/
|
|
129
|
+
export type Resource = {
|
|
130
|
+
/** The URL of the resource. */
|
|
131
|
+
url: ExURL;
|
|
179
132
|
|
|
180
|
-
/**
|
|
181
|
-
|
|
182
|
-
*
|
|
183
|
-
* It is null if it is `/` only
|
|
184
|
-
*/
|
|
185
|
-
dirname: string | null;
|
|
133
|
+
/** Whether the resource is from an external domain. */
|
|
134
|
+
isExternal: boolean;
|
|
186
135
|
|
|
187
|
-
/**
|
|
188
|
-
|
|
189
|
-
*/
|
|
190
|
-
basename: string | null;
|
|
136
|
+
/** Whether the resource request resulted in an error. */
|
|
137
|
+
isError: boolean;
|
|
191
138
|
|
|
192
|
-
/**
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
139
|
+
/** HTTP status code, or `null` if the request failed. */
|
|
140
|
+
status: number | null;
|
|
141
|
+
|
|
142
|
+
/** HTTP status text, or `null` if the request failed. */
|
|
143
|
+
statusText: string | null;
|
|
144
|
+
|
|
145
|
+
/** The Content-Type header value, or `null` if unavailable. */
|
|
146
|
+
contentType: string | null;
|
|
147
|
+
|
|
148
|
+
/** The Content-Length header value in bytes, or `null` if unavailable. */
|
|
149
|
+
contentLength: number | null;
|
|
150
|
+
|
|
151
|
+
/** The compression algorithm used, or `false` if uncompressed. */
|
|
152
|
+
compress: false | CompressType;
|
|
153
|
+
|
|
154
|
+
/** The CDN provider detected from response headers, or `false` if none detected. */
|
|
155
|
+
cdn: false | CDNType;
|
|
156
|
+
|
|
157
|
+
/** Raw HTTP response headers, or `null` if unavailable. */
|
|
158
|
+
headers: Record<string, string | string[] | undefined> | null;
|
|
159
|
+
};
|
|
196
160
|
|
|
161
|
+
/**
|
|
162
|
+
* Data extracted from an anchor element (`<a>` or `<area>`) on a page.
|
|
163
|
+
*/
|
|
164
|
+
export type AnchorData = {
|
|
197
165
|
/**
|
|
198
|
-
*
|
|
166
|
+
* The value of the `href` attribute extracted from the anchor element (`<a>` or `<area>`).
|
|
199
167
|
*/
|
|
200
|
-
|
|
168
|
+
href: ExURL;
|
|
201
169
|
|
|
202
170
|
/**
|
|
203
|
-
*
|
|
204
|
-
*
|
|
205
|
-
* - case-sensitive
|
|
171
|
+
* The accessible name of the anchor element
|
|
206
172
|
*/
|
|
207
|
-
|
|
173
|
+
textContent: string;
|
|
208
174
|
|
|
209
175
|
/**
|
|
210
|
-
*
|
|
211
|
-
*
|
|
212
|
-
* - case-sensitive
|
|
176
|
+
* Whether the anchor points to an external URL.
|
|
177
|
+
* Set by `processAnchors()` in the crawler; not available in the sub-process.
|
|
213
178
|
*/
|
|
214
|
-
|
|
215
|
-
};
|
|
216
|
-
|
|
217
|
-
export type ParseURLOptions = {
|
|
218
|
-
disableQueries?: boolean;
|
|
219
|
-
};
|
|
220
|
-
|
|
221
|
-
export type PageData = {
|
|
222
|
-
url: ExURL;
|
|
223
|
-
redirectPaths: string[];
|
|
224
|
-
isTarget: boolean;
|
|
225
|
-
isExternal: boolean;
|
|
226
|
-
status: number;
|
|
227
|
-
statusText: string;
|
|
228
|
-
contentType: string | null;
|
|
229
|
-
contentLength: number | null;
|
|
230
|
-
responseHeaders: Record<string, string | string[] | undefined> | null;
|
|
231
|
-
meta: Meta;
|
|
232
|
-
anchorList: AnchorData[];
|
|
233
|
-
imageList: ImageElement[];
|
|
234
|
-
html: string;
|
|
235
|
-
isSkipped: false;
|
|
179
|
+
isExternal?: boolean;
|
|
236
180
|
};
|
|
237
181
|
|
|
182
|
+
/**
|
|
183
|
+
* Metadata extracted from a page's `<head>` element.
|
|
184
|
+
*/
|
|
238
185
|
export type Meta = {
|
|
186
|
+
/** The `lang` attribute of the `<html>` element. */
|
|
239
187
|
lang?: string;
|
|
188
|
+
|
|
189
|
+
/** The text content of the `<title>` element. */
|
|
240
190
|
title: string;
|
|
191
|
+
|
|
192
|
+
/** The `content` attribute of `<meta name="description">`. */
|
|
241
193
|
description?: string;
|
|
194
|
+
|
|
195
|
+
/** The `content` attribute of `<meta name="keywords">`. */
|
|
242
196
|
keywords?: string;
|
|
197
|
+
|
|
198
|
+
/** Whether `noindex` is present in the robots meta tag. */
|
|
243
199
|
noindex?: boolean;
|
|
200
|
+
|
|
201
|
+
/** Whether `nofollow` is present in the robots meta tag. */
|
|
244
202
|
nofollow?: boolean;
|
|
203
|
+
|
|
204
|
+
/** Whether `noarchive` is present in the robots meta tag. */
|
|
245
205
|
noarchive?: boolean;
|
|
206
|
+
|
|
207
|
+
/** The canonical URL from `<link rel="canonical">`. */
|
|
246
208
|
canonical?: string;
|
|
209
|
+
|
|
210
|
+
/** The alternate URL from `<link rel="alternate">`. */
|
|
247
211
|
alternate?: string;
|
|
212
|
+
|
|
213
|
+
/** The Open Graph type (`og:type`). */
|
|
248
214
|
'og:type'?: string;
|
|
215
|
+
|
|
216
|
+
/** The Open Graph title (`og:title`). */
|
|
249
217
|
'og:title'?: string;
|
|
218
|
+
|
|
219
|
+
/** The Open Graph site name (`og:site_name`). */
|
|
250
220
|
'og:site_name'?: string;
|
|
221
|
+
|
|
222
|
+
/** The Open Graph description (`og:description`). */
|
|
251
223
|
'og:description'?: string;
|
|
252
|
-
'og:url'?: string;
|
|
253
|
-
'og:image'?: string;
|
|
254
|
-
'twitter:card'?: string;
|
|
255
|
-
};
|
|
256
224
|
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
* Extracts the value of the `href` attribute from anchor element (`<a>` `<area>`)
|
|
260
|
-
*/
|
|
261
|
-
href: ExURL;
|
|
225
|
+
/** The Open Graph URL (`og:url`). */
|
|
226
|
+
'og:url'?: string;
|
|
262
227
|
|
|
263
|
-
/**
|
|
264
|
-
|
|
265
|
-
*/
|
|
266
|
-
textContent: string;
|
|
267
|
-
};
|
|
228
|
+
/** The Open Graph image URL (`og:image`). */
|
|
229
|
+
'og:image'?: string;
|
|
268
230
|
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
currentSrc: string;
|
|
272
|
-
alt: string;
|
|
273
|
-
width: number;
|
|
274
|
-
height: number;
|
|
275
|
-
naturalWidth: number;
|
|
276
|
-
naturalHeight: number;
|
|
277
|
-
isLazy: boolean;
|
|
278
|
-
viewportWidth: number;
|
|
279
|
-
sourceCode: string;
|
|
231
|
+
/** The Twitter Card type (`twitter:card`). */
|
|
232
|
+
'twitter:card'?: string;
|
|
280
233
|
};
|
|
281
234
|
|
|
235
|
+
/**
|
|
236
|
+
* A network request/response log entry captured during page scraping via Puppeteer.
|
|
237
|
+
*/
|
|
282
238
|
export type NetworkLog = {
|
|
239
|
+
/** The URL of the network request. */
|
|
283
240
|
url: ExURL;
|
|
241
|
+
|
|
242
|
+
/** HTTP status code of the response, or `null` if the request failed. */
|
|
284
243
|
status: number | null;
|
|
244
|
+
|
|
245
|
+
/** The Content-Length of the response body in bytes. */
|
|
285
246
|
contentLength: number;
|
|
247
|
+
|
|
248
|
+
/** The Content-Type of the response. */
|
|
286
249
|
contentType: string;
|
|
250
|
+
|
|
251
|
+
/** Whether the request resulted in an error. */
|
|
287
252
|
isError: boolean;
|
|
253
|
+
|
|
254
|
+
/** Details of the outgoing HTTP request. */
|
|
288
255
|
request: {
|
|
256
|
+
/** Timestamp of the request in milliseconds. */
|
|
289
257
|
ts: number;
|
|
258
|
+
/** HTTP request headers. */
|
|
290
259
|
headers: Record<string, string>;
|
|
260
|
+
/** HTTP method used (e.g., "GET", "POST"). */
|
|
291
261
|
method: string;
|
|
292
262
|
};
|
|
263
|
+
|
|
264
|
+
/** Details of the HTTP response, absent if the request failed. */
|
|
293
265
|
response?: {
|
|
266
|
+
/** Timestamp of the response in milliseconds. */
|
|
294
267
|
ts: number;
|
|
268
|
+
/** HTTP status code. */
|
|
295
269
|
status: number;
|
|
270
|
+
/** HTTP status text. */
|
|
296
271
|
statusText: string;
|
|
272
|
+
/** Whether the response was served from cache. */
|
|
297
273
|
fromCache: boolean;
|
|
274
|
+
/** HTTP response headers. */
|
|
298
275
|
headers: Record<string, string>;
|
|
299
276
|
};
|
|
300
277
|
};
|
|
301
278
|
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
279
|
+
/**
|
|
280
|
+
* The result of a single page scrape operation.
|
|
281
|
+
* Encapsulates the outcome and all captured sub-resources.
|
|
282
|
+
*/
|
|
283
|
+
export type ScrapeResult = {
|
|
284
|
+
/**
|
|
285
|
+
* The type of result:
|
|
286
|
+
* - `"success"` - Scraping completed successfully.
|
|
287
|
+
* - `"skipped"` - The page was skipped due to an exclusion rule.
|
|
288
|
+
* - `"error"` - An error occurred during scraping.
|
|
289
|
+
*/
|
|
290
|
+
type: 'success' | 'skipped' | 'error';
|
|
291
|
+
/** The full page data, present when `type` is `"success"`. */
|
|
292
|
+
pageData?: PageData;
|
|
293
|
+
/** All sub-resources captured during the page load. */
|
|
294
|
+
resources: ResourceEntry[];
|
|
295
|
+
/** Details about why the page was ignored, present when `type` is `"skipped"`. */
|
|
296
|
+
ignored?: { url: ExURL; matchedText: string; excludeKeywords: string[] };
|
|
297
|
+
/** Error details, present when `type` is `"error"`. */
|
|
298
|
+
error?: { name: string; message: string; stack?: string; shutdown: boolean };
|
|
313
299
|
};
|
|
314
300
|
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
301
|
+
/**
|
|
302
|
+
* A single sub-resource entry captured during page scraping.
|
|
303
|
+
* Represents one network resource (CSS, JS, image, etc.) loaded by a page.
|
|
304
|
+
*/
|
|
305
|
+
export type ResourceEntry = {
|
|
306
|
+
/** The network log entry containing request/response timing and headers. */
|
|
307
|
+
log: NetworkLog;
|
|
308
|
+
/** The resource metadata (without UID, which is assigned by the archive layer). */
|
|
309
|
+
resource: Omit<Resource, 'uid'>;
|
|
310
|
+
/** The URL (without hash) of the page that triggered this resource load. */
|
|
311
|
+
pageUrl: string;
|
|
312
|
+
};
|
|
323
313
|
|
|
324
|
-
|
|
314
|
+
/**
|
|
315
|
+
* Event payload describing a phase transition in the scraping lifecycle.
|
|
316
|
+
* Phases proceed roughly in order: scrapeStart -> headRequest -> openPage ->
|
|
317
|
+
* loadDOMContent -> waitNetworkIdle -> getHTML -> getAnchors -> getMeta ->
|
|
318
|
+
* getImages -> scrapeEnd.
|
|
319
|
+
*/
|
|
320
|
+
export type ChangePhaseEvent = {
|
|
321
|
+
/** The process ID of the scraper worker. */
|
|
322
|
+
pid: number;
|
|
323
|
+
/**
|
|
324
|
+
* The name of the current scraping phase.
|
|
325
|
+
*
|
|
326
|
+
* - `scrapeStart` - Scraping has begun for a URL.
|
|
327
|
+
* - `launchBrowser` - A browser instance is being launched.
|
|
328
|
+
* - `headRequest` - Performing an HTTP HEAD request to check the destination.
|
|
329
|
+
* - `headRequestTimeout` - The HEAD request timed out.
|
|
330
|
+
* - `newPage` - A new browser page/tab is being created.
|
|
331
|
+
* - `openPage` - Navigating the browser page to the target URL.
|
|
332
|
+
* - `loadDOMContent` - Waiting for the DOM content to finish loading.
|
|
333
|
+
* - `waitNetworkIdle` - Waiting for all network activity to cease.
|
|
334
|
+
* - `getHTML` - Extracting the page HTML content.
|
|
335
|
+
* - `setViewport` - Setting the browser viewport dimensions.
|
|
336
|
+
* - `scrollToBottom` - Scrolling the page to trigger lazy-loaded content.
|
|
337
|
+
* - `extractImages` - Starting the image extraction pipeline.
|
|
338
|
+
* - `waitImageLoad` - Waiting for images to finish loading on the page.
|
|
339
|
+
* - `getImages` - Extracting image element data from the page.
|
|
340
|
+
* - `getAnchors` - Extracting anchor/link data from the page.
|
|
341
|
+
* - `getMeta` - Extracting meta tag information from the page.
|
|
342
|
+
* - `pageSkipped` - The page matched an exclusion rule and is being skipped.
|
|
343
|
+
* - `retryWait` - Waiting before a retry attempt after a transient failure.
|
|
344
|
+
* - `retryExhausted` - All retry attempts exhausted; giving up on this operation.
|
|
345
|
+
* - `scrapeEnd` - Scraping has completed for this URL.
|
|
346
|
+
* - `beforeCleanup` - The scraper is about to clean up resources.
|
|
347
|
+
* - `cleanedUp` - The scraper has finished cleaning up.
|
|
348
|
+
*/
|
|
349
|
+
name:
|
|
350
|
+
| 'scrapeStart'
|
|
351
|
+
| 'launchBrowser'
|
|
352
|
+
| 'headRequest'
|
|
353
|
+
| 'headRequestTimeout'
|
|
354
|
+
| 'newPage'
|
|
355
|
+
| 'openPage'
|
|
356
|
+
| 'loadDOMContent'
|
|
357
|
+
| 'waitNetworkIdle'
|
|
358
|
+
| 'getHTML'
|
|
359
|
+
| 'setViewport'
|
|
360
|
+
| 'scrollToBottom'
|
|
361
|
+
| 'extractImages'
|
|
362
|
+
| 'waitImageLoad'
|
|
363
|
+
| 'getImages'
|
|
364
|
+
| 'getAnchors'
|
|
365
|
+
| 'getMeta'
|
|
366
|
+
| 'pageSkipped'
|
|
367
|
+
| 'retryWait'
|
|
368
|
+
| 'retryExhausted'
|
|
369
|
+
| 'scrapeEnd'
|
|
370
|
+
| 'beforeCleanup'
|
|
371
|
+
| 'cleanedUp';
|
|
372
|
+
/** The URL being scraped, or `null` when the phase is not URL-specific (e.g., setViewport). */
|
|
373
|
+
url: ExURL | null;
|
|
374
|
+
/** Whether the URL being scraped is external to the crawl scope. */
|
|
375
|
+
isExternal: boolean;
|
|
376
|
+
/** A human-readable message providing additional context about the phase. */
|
|
377
|
+
message: string;
|
|
378
|
+
};
|
|
325
379
|
|
|
326
|
-
|
|
380
|
+
/**
|
|
381
|
+
* Streaming event types emitted by the Scraper.
|
|
382
|
+
* Result events (success, skipped, error) are returned as values,
|
|
383
|
+
* not emitted as events.
|
|
384
|
+
*/
|
|
385
|
+
export type ScraperEventTypes = {
|
|
386
|
+
/**
|
|
387
|
+
* Emitted when a sub-resource response is captured during page loading.
|
|
388
|
+
* Only fires for internal (non-external) pages.
|
|
389
|
+
*/
|
|
390
|
+
resourceResponse: {
|
|
391
|
+
/** The process ID of the scraper worker. */
|
|
392
|
+
pid: number;
|
|
393
|
+
/** The URL of the page being scraped. */
|
|
394
|
+
url: ExURL;
|
|
395
|
+
/** Network log entry for the resource request/response. */
|
|
396
|
+
log: NetworkLog;
|
|
397
|
+
/** The resource metadata (without UID, which is assigned later by the archive). */
|
|
398
|
+
resource: Omit<Resource, 'uid'>;
|
|
399
|
+
};
|
|
400
|
+
/**
|
|
401
|
+
* Emitted when the scraper transitions between lifecycle phases.
|
|
402
|
+
*/
|
|
403
|
+
changePhase: ChangePhaseEvent;
|
|
404
|
+
};
|
|
327
405
|
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
406
|
+
/**
|
|
407
|
+
* Configuration options for the Scraper.
|
|
408
|
+
*/
|
|
409
|
+
export type ScraperOptions = {
|
|
410
|
+
/** Whether the URL is external to the crawl scope. */
|
|
411
|
+
isExternal: boolean;
|
|
412
|
+
/** Whether to capture image element data from the page. */
|
|
413
|
+
captureImages: boolean;
|
|
414
|
+
/** Keywords or patterns that, if found in the page HTML, cause the page to be skipped. */
|
|
415
|
+
excludeKeywords: string[];
|
|
416
|
+
/** When `true`, only metadata is fetched (via HEAD request) without full browser scraping. */
|
|
417
|
+
metadataOnly: boolean;
|
|
418
|
+
/** Timeout in ms to wait for lazy-loaded images to finish loading. Defaults to 5000. */
|
|
419
|
+
imageLoadTimeout: number;
|
|
420
|
+
/** When `true`, query parameters are stripped from URLs during parsing. */
|
|
421
|
+
disableQueries: boolean;
|
|
422
|
+
/** Number of retries for network operations. Overrides `@retryable` default. */
|
|
423
|
+
retries?: number;
|
|
424
|
+
/** Pre-fetched HEAD check result. When provided, scrapeStart() skips the HEAD request. */
|
|
425
|
+
headCheckResult?: PageData;
|
|
426
|
+
/** Timeout (ms) for page.goto(). Default: 60_000 (60s). */
|
|
427
|
+
navigationTimeout?: number;
|
|
341
428
|
};
|