@d-zero/beholder 2.0.1 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +6 -0
- package/LICENSE +21 -0
- package/README.md +4 -0
- package/dist/debug.d.ts +9 -0
- package/dist/debug.js +9 -0
- package/dist/dom-evaluation.d.ts +109 -0
- package/dist/dom-evaluation.js +273 -0
- package/dist/index.d.ts +21 -0
- package/dist/index.js +16 -0
- package/dist/is-error.d.ts +8 -0
- package/dist/is-error.js +10 -0
- package/dist/keyword-check.d.ts +8 -0
- package/dist/keyword-check.js +17 -0
- package/dist/network-disconnection.d.ts +28 -0
- package/dist/network-disconnection.js +30 -0
- package/dist/parse-url.d.ts +14 -0
- package/dist/parse-url.js +23 -0
- package/dist/scraper.d.ts +41 -0
- package/dist/scraper.js +712 -0
- package/dist/types.d.ts +348 -0
- package/dist/types.js +7 -0
- package/package.json +5 -4
- package/src/network-disconnection.spec.ts +68 -0
- package/src/network-disconnection.ts +33 -0
- package/src/scraper.ts +72 -13
- package/src/types.ts +4 -2
- package/tsconfig.tsbuildinfo +1 -0
package/dist/types.d.ts
ADDED
|
@@ -0,0 +1,348 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Beholder type definitions for the page-level scraper.
|
|
3
|
+
* @see {@link ./scraper.ts} for the Scraper class that produces these types
|
|
4
|
+
* @see {@link ./dom-evaluation.ts} for DOM extraction functions (anchors, images, meta)
|
|
5
|
+
* @module
|
|
6
|
+
*/
|
|
7
|
+
export type { ExURL, ParseURLOptions } from '@d-zero/shared/parse-url';
|
|
8
|
+
export type { CompressType } from '@d-zero/shared/detect-compress';
|
|
9
|
+
export type { CDNType } from '@d-zero/shared/detect-cdn';
|
|
10
|
+
import type { CDNType } from '@d-zero/shared/detect-cdn';
|
|
11
|
+
import type { CompressType } from '@d-zero/shared/detect-compress';
|
|
12
|
+
import type { ExURL } from '@d-zero/shared/parse-url';
|
|
13
|
+
/**
 * Scraped page data returned by the scraper after successfully processing a page.
 *
 * The literal `isSkipped: false` distinguishes this shape from
 * {@link SkippedPageData}, whose `isSkipped` is always `true`.
 */
export type PageData = {
	/** The parsed URL of the page. */
	url: ExURL;
	/** Chain of redirect URLs traversed to reach the final destination. */
	redirectPaths: string[];
	/** Whether this page is a target page (internal and within the crawl scope). */
	isTarget: boolean;
	/** Whether this page is external to the crawl scope. */
	isExternal: boolean;
	/** HTTP status code of the response. */
	status: number;
	/** HTTP status text of the response. */
	statusText: string;
	/** The Content-Type header value, or `null` if unavailable. */
	contentType: string | null;
	/** The Content-Length header value in bytes, or `null` if unavailable. */
	contentLength: number | null;
	/** Raw HTTP response headers, or `null` if unavailable. */
	responseHeaders: Record<string, string | string[] | undefined> | null;
	/** Extracted metadata from the page (title, description, OGP, etc.). */
	meta: Meta;
	/** List of anchor elements found on the page. */
	anchorList: AnchorData[];
	/** List of image elements found on the page. */
	imageList: ImageElement[];
	/** HTML snapshot of the rendered DOM. */
	html: string;
	/** Always `false` for successfully scraped pages. See {@link SkippedPageData} for skipped pages. */
	isSkipped: false;
};
|
|
46
|
+
/**
 * Information about an image element found on a page.
 *
 * Every field is required; one entry describes one image element as observed
 * at a single viewport width (see `viewportWidth`).
 */
export type ImageElement = {
	/** The `src` attribute value of the image element. */
	src: string;
	/** The `currentSrc` property value (the actual URL loaded by the browser). */
	currentSrc: string;
	/** The `alt` attribute value of the image element. */
	alt: string;
	/** The CSS layout width of the image in pixels. */
	width: number;
	/** The CSS layout height of the image in pixels. */
	height: number;
	/** The intrinsic width of the image in pixels. */
	naturalWidth: number;
	/** The intrinsic height of the image in pixels. */
	naturalHeight: number;
	/** Whether the image uses lazy loading (`loading="lazy"` or IntersectionObserver). */
	isLazy: boolean;
	/** The viewport width at which this image was captured. */
	viewportWidth: number;
	/** The outer HTML source code of the image element. */
	sourceCode: string;
};
|
|
71
|
+
/**
 * Data for a page that was skipped during crawling due to keyword or path exclusion.
 *
 * `matched` is a discriminated union keyed on `type`; narrow on it before
 * reading the variant-specific fields.
 */
export type SkippedPageData = {
	/** Always `true` for skipped pages. */
	isSkipped: true;
	/** The URL of the skipped page. */
	url: ExURL;
	/** The reason the page was skipped, with match details. */
	matched:
		| {
				/** Skipped due to a keyword match in the page content. */
				type: 'keyword';
				/** The text that matched the exclusion keyword. */
				text: string;
				/** The exclusion keywords that triggered the skip. */
				excludeKeywords: string[];
		  }
		| {
				/** Skipped due to a URL path pattern match. */
				type: 'path';
				/** The exclusion patterns that triggered the skip. */
				excludes: string[];
		  };
};
|
|
94
|
+
/**
 * A network resource (CSS, JS, image, etc.) captured during page scraping.
 *
 * Response-derived fields are nullable (or `false`) because a request may fail
 * or a header may be absent.
 */
export type Resource = {
	/** The URL of the resource. */
	url: ExURL;
	/** Whether the resource is from an external domain. */
	isExternal: boolean;
	/** Whether the resource request resulted in an error. */
	isError: boolean;
	/** HTTP status code, or `null` if the request failed. */
	status: number | null;
	/** HTTP status text, or `null` if the request failed. */
	statusText: string | null;
	/** The Content-Type header value, or `null` if unavailable. */
	contentType: string | null;
	/** The Content-Length header value in bytes, or `null` if unavailable. */
	contentLength: number | null;
	/** The compression algorithm used, or `false` if uncompressed. */
	compress: false | CompressType;
	/** The CDN provider detected from response headers, or `false` if none detected. */
	cdn: false | CDNType;
	/** Raw HTTP response headers, or `null` if unavailable. */
	headers: Record<string, string | string[] | undefined> | null;
};
|
|
119
|
+
/**
 * Data extracted from an anchor element (`<a>` or `<area>`) on a page.
 */
export type AnchorData = {
	/**
	 * The parsed value of the `href` attribute of the anchor element (`<a>` or `<area>`).
	 */
	href: ExURL;
	/**
	 * The accessible name of the anchor element.
	 */
	textContent: string;
	/**
	 * Whether the anchor points to an external URL.
	 * Set by `processAnchors()` in the crawler; not available in the sub-process,
	 * hence optional.
	 */
	isExternal?: boolean;
};
|
|
137
|
+
/**
 * Metadata extracted from a page's `<head>` element.
 *
 * Only `title` is required; every other field is optional and is omitted when
 * not set.
 */
export type Meta = {
	/** The `lang` attribute of the `<html>` element. */
	lang?: string;
	/** The text content of the `<title>` element. */
	title: string;
	/** The `content` attribute of `<meta name="description">`. */
	description?: string;
	/** The `content` attribute of `<meta name="keywords">`. */
	keywords?: string;
	/** Whether `noindex` is present in the robots meta tag. */
	noindex?: boolean;
	/** Whether `nofollow` is present in the robots meta tag. */
	nofollow?: boolean;
	/** Whether `noarchive` is present in the robots meta tag. */
	noarchive?: boolean;
	/** The canonical URL from `<link rel="canonical">`. */
	canonical?: string;
	/** The alternate URL from `<link rel="alternate">`. */
	alternate?: string;
	/** The Open Graph type (`og:type`). */
	'og:type'?: string;
	/** The Open Graph title (`og:title`). */
	'og:title'?: string;
	/** The Open Graph site name (`og:site_name`). */
	'og:site_name'?: string;
	/** The Open Graph description (`og:description`). */
	'og:description'?: string;
	/** The Open Graph URL (`og:url`). */
	'og:url'?: string;
	/** The Open Graph image URL (`og:image`). */
	'og:image'?: string;
	/** The Twitter Card type (`twitter:card`). */
	'twitter:card'?: string;
};
|
|
174
|
+
/**
 * A network request/response log entry captured during page scraping via Puppeteer.
 *
 * `request` is always present; `response` is only present once a response has
 * been received.
 */
export type NetworkLog = {
	/** The URL of the network request. */
	url: ExURL;
	/** HTTP status code of the response, or `null` if the request failed. */
	status: number | null;
	/** The Content-Length of the response body in bytes. */
	contentLength: number;
	/** The Content-Type of the response. */
	contentType: string;
	/** Whether the request resulted in an error. */
	isError: boolean;
	/** Details of the outgoing HTTP request. */
	request: {
		/** Timestamp of the request in milliseconds. */
		ts: number;
		/** HTTP request headers. */
		headers: Record<string, string>;
		/** HTTP method used (e.g., "GET", "POST"). */
		method: string;
	};
	/** Details of the HTTP response, absent if the request failed. */
	response?: {
		/** Timestamp of the response in milliseconds. */
		ts: number;
		/** HTTP status code. */
		status: number;
		/** HTTP status text. */
		statusText: string;
		/** Whether the response was served from cache. */
		fromCache: boolean;
		/** HTTP response headers. */
		headers: Record<string, string>;
	};
};
|
|
211
|
+
/**
 * The result of a single page scrape operation.
 * Encapsulates the outcome and all captured sub-resources.
 *
 * `type` tells which of the optional detail fields (`pageData`, `ignored`,
 * `error`) is expected to be populated; `resources` is always present.
 */
export type ScrapeResult = {
	/**
	 * The type of result:
	 * - `"success"` - Scraping completed successfully.
	 * - `"skipped"` - The page was skipped due to an exclusion rule.
	 * - `"error"` - An error occurred during scraping.
	 */
	type: 'success' | 'skipped' | 'error';
	/** The full page data, present when `type` is `"success"`. */
	pageData?: PageData;
	/** All sub-resources captured during the page load. */
	resources: ResourceEntry[];
	/**
	 * Details about why the page was ignored, present when `type` is `"skipped"`.
	 * The shape carries keyword-match details only (matched text and keywords).
	 */
	ignored?: {
		/** The URL of the ignored page. */
		url: ExURL;
		/** The text that matched an exclusion keyword. */
		matchedText: string;
		/** The exclusion keywords that were in effect. */
		excludeKeywords: string[];
	};
	/** Error details, present when `type` is `"error"`. */
	error?: {
		/** The error's `name`. */
		name: string;
		/** The error's `message`. */
		message: string;
		/** The error's stack trace, when available. */
		stack?: string;
		/** NOTE(review): presumably signals that the worker should shut down — confirm against the scraper. */
		shutdown: boolean;
	};
	/** Sub-resource requests that failed during page loading (e.g., due to network disconnection). */
	failedRequests?: ReadonlyArray<{
		/** The URL of the failed request. */
		url: string;
		/** The failure text reported for the request (e.g. `net::ERR_CONNECTION_RESET`). */
		errorText: string;
	}>;
};
|
|
246
|
+
/**
 * A single sub-resource entry captured during page scraping.
 * Represents one network resource (CSS, JS, image, etc.) loaded by a page.
 */
export type ResourceEntry = {
	/** The network log entry containing request/response timing and headers. */
	log: NetworkLog;
	/**
	 * The resource metadata (without UID, which is assigned by the archive layer).
	 *
	 * NOTE(review): `Resource` as declared in this file has no `uid` property, so
	 * the `Omit` currently resolves to the same shape as `Resource`.
	 */
	resource: Omit<Resource, 'uid'>;
	/** The URL (without hash) of the page that triggered this resource load. */
	pageUrl: string;
};
|
|
258
|
+
/**
 * Event payload describing a phase transition in the scraping lifecycle.
 * Phases proceed roughly in order: scrapeStart -> openPage ->
 * loadDOMContent -> waitNetworkIdle -> getHTML -> getAnchors -> getMeta ->
 * extractImages -> getImages -> scrapeEnd.
 */
export type ChangePhaseEvent = {
	/** The process ID of the scraper worker. */
	pid: number;
	/**
	 * The name of the current scraping phase.
	 *
	 * - `scrapeStart` - Scraping has begun for a URL.
	 * - `launchBrowser` - A browser instance is being launched.
	 * - `headRequest` - Performing an HTTP HEAD request to check the destination.
	 * - `headRequestTimeout` - The HEAD request timed out.
	 * - `newPage` - A new browser page/tab is being created.
	 * - `openPage` - Navigating the browser page to the target URL.
	 * - `loadDOMContent` - Waiting for the DOM content to finish loading.
	 * - `waitNetworkIdle` - Waiting for all network activity to cease.
	 * - `getHTML` - Extracting the page HTML content.
	 * - `setViewport` - Setting the browser viewport dimensions.
	 * - `scrollToBottom` - Scrolling the page to trigger lazy-loaded content.
	 * - `extractImages` - Starting the image extraction pipeline.
	 * - `waitImageLoad` - Waiting for images to finish loading on the page.
	 * - `getImages` - Extracting image element data from the page.
	 * - `getAnchors` - Extracting anchor/link data from the page.
	 * - `getMeta` - Extracting meta tag information from the page.
	 * - `pageSkipped` - The page matched an exclusion rule and is being skipped.
	 * - `retryWait` - Waiting before a retry attempt after a transient failure.
	 * - `retryExhausted` - All retry attempts exhausted; giving up on this operation.
	 * - `scrapeEnd` - Scraping has completed for this URL.
	 * - `beforeCleanup` - The scraper is about to clean up resources.
	 * - `cleanedUp` - The scraper has finished cleaning up.
	 */
	name:
		| 'scrapeStart'
		| 'launchBrowser'
		| 'headRequest'
		| 'headRequestTimeout'
		| 'newPage'
		| 'openPage'
		| 'loadDOMContent'
		| 'waitNetworkIdle'
		| 'getHTML'
		| 'setViewport'
		| 'scrollToBottom'
		| 'extractImages'
		| 'waitImageLoad'
		| 'getImages'
		| 'getAnchors'
		| 'getMeta'
		| 'pageSkipped'
		| 'retryWait'
		| 'retryExhausted'
		| 'scrapeEnd'
		| 'beforeCleanup'
		| 'cleanedUp';
	/** The URL being scraped, or `null` when the phase is not URL-specific (e.g., setViewport). */
	url: ExURL | null;
	/** Whether the URL being scraped is external to the crawl scope. */
	isExternal: boolean;
	/**
	 * A human-readable message providing additional context about the phase.
	 * The field is required; pass an empty string when there is nothing to add.
	 */
	message: string;
};
|
|
301
|
+
/**
 * Streaming event types emitted by the Scraper.
 * Result events (success, skipped, error) are returned as values,
 * not emitted as events.
 *
 * Each key is an event name and its value is the payload type for that event.
 */
export type ScraperEventTypes = {
	/**
	 * Emitted when a sub-resource response is captured during page loading.
	 * Only fires for internal (non-external) pages.
	 */
	resourceResponse: {
		/** The process ID of the scraper worker. */
		pid: number;
		/** The URL of the page being scraped. */
		url: ExURL;
		/** Network log entry for the resource request/response. */
		log: NetworkLog;
		/** The resource metadata (without UID, which is assigned later by the archive). */
		resource: Omit<Resource, 'uid'>;
	};
	/**
	 * Emitted when the scraper transitions between lifecycle phases.
	 */
	changePhase: ChangePhaseEvent;
};
|
|
326
|
+
/**
 * Configuration options for the Scraper.
 *
 * The scraper accepts these as `Partial<ScraperOptions>` and applies its own
 * defaults for omitted values.
 */
export type ScraperOptions = {
	/** Whether the URL is external to the crawl scope. */
	isExternal: boolean;
	/** Whether to capture image element data from the page. */
	captureImages: boolean;
	/** Keywords or patterns that, if found in the page HTML, cause the page to be skipped. */
	excludeKeywords: string[];
	/** When `true`, only metadata is fetched (via HEAD request) without full browser scraping. */
	metadataOnly: boolean;
	/** Timeout in ms for waiting lazy-loaded images to finish loading. Defaults to 5000. */
	imageLoadTimeout: number;
	/** When `true`, query parameters are stripped from URLs during parsing. */
	disableQueries: boolean;
	/** Number of retries for network operations. Overrides `@retryable` default. */
	retries?: number;
	/** Pre-fetched HEAD check result. When provided, scrapeStart() skips the HEAD request. */
	headCheckResult?: PageData;
	/** Timeout (ms) for page.goto(). Default: 60_000 (60s). */
	navigationTimeout?: number;
};
|
package/dist/types.js
ADDED
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@d-zero/beholder",
|
|
3
|
-
"version": "2.0.1",
|
|
3
|
+
"version": "2.1.0",
|
|
4
4
|
"description": "Page-level scraper for web crawling and auditing",
|
|
5
5
|
"author": "D-ZERO",
|
|
6
6
|
"license": "MIT",
|
|
@@ -20,12 +20,13 @@
|
|
|
20
20
|
"clean": "tsc --build --clean"
|
|
21
21
|
},
|
|
22
22
|
"dependencies": {
|
|
23
|
-
"@d-zero/puppeteer-page-scan": "4.4.
|
|
24
|
-
"@d-zero/shared": "0.21.
|
|
23
|
+
"@d-zero/puppeteer-page-scan": "4.4.7",
|
|
24
|
+
"@d-zero/shared": "0.21.1",
|
|
25
25
|
"debug": "4.4.3",
|
|
26
26
|
"puppeteer": "24.37.5"
|
|
27
27
|
},
|
|
28
28
|
"devDependencies": {
|
|
29
29
|
"@types/debug": "4.1.12"
|
|
30
|
-
}
|
|
30
|
+
},
|
|
31
|
+
"gitHead": "6f13c72151f3d6fe5ddf72d0cde61b31a7fc96ac"
|
|
31
32
|
}
|
|
package/src/network-disconnection.spec.ts
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
import { describe, it, expect } from 'vitest';
|
|
2
|
+
|
|
3
|
+
import {
|
|
4
|
+
NETWORK_DISCONNECTION_ERRORS,
|
|
5
|
+
findDisconnectionFailures,
|
|
6
|
+
} from './network-disconnection.js';
|
|
7
|
+
|
|
8
|
+
// Pins the exact membership of the retry-triggering error set: every transient
// connectivity code must be present, and permanent or unrelated codes must not be.
describe('NETWORK_DISCONNECTION_ERRORS', () => {
	it('contains the expected error codes', () => {
		expect(NETWORK_DISCONNECTION_ERRORS.has('net::ERR_NETWORK_CHANGED')).toBe(true);
		expect(NETWORK_DISCONNECTION_ERRORS.has('net::ERR_INTERNET_DISCONNECTED')).toBe(true);
		expect(NETWORK_DISCONNECTION_ERRORS.has('net::ERR_CONNECTION_RESET')).toBe(true);
		expect(NETWORK_DISCONNECTION_ERRORS.has('net::ERR_NETWORK_IO_SUSPENDED')).toBe(true);
		expect(NETWORK_DISCONNECTION_ERRORS.has('net::ERR_CONNECTION_TIMED_OUT')).toBe(true);
	});

	it('does not contain unrelated error codes', () => {
		expect(NETWORK_DISCONNECTION_ERRORS.has('net::ERR_NAME_NOT_RESOLVED')).toBe(false);
		expect(NETWORK_DISCONNECTION_ERRORS.has('net::ERR_CERT_COMMON_NAME_INVALID')).toBe(
			false,
		);
		expect(NETWORK_DISCONNECTION_ERRORS.has('net::ERR_ABORTED')).toBe(false);
	});
});
|
|
25
|
+
|
|
26
|
+
// Exercises the filter over empty, non-matching, mixed and all-matching inputs.
describe('findDisconnectionFailures', () => {
	it('returns empty array when no failed requests', () => {
		expect(findDisconnectionFailures([])).toStrictEqual([]);
	});

	it('returns empty array when failures are not network disconnection errors', () => {
		const failures = [
			{ url: 'https://example.com/style.css', errorText: 'net::ERR_ABORTED' },
			{ url: 'https://example.com/script.js', errorText: 'net::ERR_NAME_NOT_RESOLVED' },
		];
		expect(findDisconnectionFailures(failures)).toStrictEqual([]);
	});

	it('returns only disconnection failures from mixed failures', () => {
		const failures = [
			{ url: 'https://example.com/style.css', errorText: 'net::ERR_ABORTED' },
			{
				url: 'https://example.com/font.woff2',
				errorText: 'net::ERR_INTERNET_DISCONNECTED',
			},
			{ url: 'https://example.com/script.js', errorText: 'net::ERR_CONNECTION_RESET' },
			{ url: 'https://example.com/image.png', errorText: 'net::ERR_FAILED' },
		];
		const result = findDisconnectionFailures(failures);
		expect(result).toStrictEqual([
			{
				url: 'https://example.com/font.woff2',
				errorText: 'net::ERR_INTERNET_DISCONNECTED',
			},
			{ url: 'https://example.com/script.js', errorText: 'net::ERR_CONNECTION_RESET' },
		]);
	});

	it('returns all failures when all are disconnection errors', () => {
		const failures = [
			{ url: 'https://example.com/a.js', errorText: 'net::ERR_NETWORK_CHANGED' },
			{ url: 'https://example.com/b.css', errorText: 'net::ERR_CONNECTION_TIMED_OUT' },
			{ url: 'https://example.com/c.png', errorText: 'net::ERR_NETWORK_IO_SUSPENDED' },
		];
		const result = findDisconnectionFailures(failures);
		expect(result).toHaveLength(3);
		// Checking the length alone would let a wrong-but-same-sized result pass;
		// also pin the contents and their order.
		expect(result).toStrictEqual(failures);
	});
});
|
|
package/src/network-disconnection.ts
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
/**
 * Chromium network error codes that indicate a transient network disconnection.
 * When sub-resource requests fail with these errors, the page should be retried.
 *
 * These codes are defined in Chromium's net_error_list.h:
 * https://source.chromium.org/chromium/chromium/src/+/main:net/base/net_error_list.h
 *
 * Selection criteria: errors caused by transient connectivity loss (NIC flip,
 * Wi-Fi roaming, brief ISP outage). Permanent DNS or TLS errors
 * (e.g. ERR_NAME_NOT_RESOLVED, ERR_CERT_*) are excluded because retrying
 * the same page would not recover from them.
 *
 * To update: search net_error_list.h for new transient-connectivity codes
 * when upgrading the Puppeteer / Chromium version.
 */
export const NETWORK_DISCONNECTION_ERRORS: ReadonlySet<string> = new Set([
	'net::ERR_NETWORK_CHANGED',
	'net::ERR_INTERNET_DISCONNECTED',
	'net::ERR_CONNECTION_RESET',
	'net::ERR_NETWORK_IO_SUSPENDED',
	'net::ERR_CONNECTION_TIMED_OUT',
]);

/**
 * Filters the given failed requests down to those caused by a network disconnection,
 * i.e. those whose `errorText` is exactly one of {@link NETWORK_DISCONNECTION_ERRORS}.
 *
 * The input array is not modified; a new array is returned with the matching
 * entries in their original order.
 * @param failedRequests - Array of failed request entries
 * @returns The subset of failed requests caused by network disconnection, empty array if none
 */
export function findDisconnectionFailures(
	failedRequests: ReadonlyArray<{ url: string; errorText: string }>,
): Array<{ url: string; errorText: string }> {
	return failedRequests.filter((r) => NETWORK_DISCONNECTION_ERRORS.has(r.errorText));
}
|
package/src/scraper.ts
CHANGED
|
@@ -13,7 +13,7 @@ import type {
|
|
|
13
13
|
SkippedPageData,
|
|
14
14
|
} from './types.js';
|
|
15
15
|
import type { PageScanPhase } from '@d-zero/puppeteer-page-scan';
|
|
16
|
-
import type { Page } from 'puppeteer';
|
|
16
|
+
import type { Dialog, HTTPRequest, HTTPResponse, Page } from 'puppeteer';
|
|
17
17
|
|
|
18
18
|
import { beforePageScan, devicePresets } from '@d-zero/puppeteer-page-scan';
|
|
19
19
|
import { detectCDN } from '@d-zero/shared/detect-cdn';
|
|
@@ -25,6 +25,7 @@ import { resourceLog, scraperLog } from './debug.js';
|
|
|
25
25
|
import { getAnchorList, getImageList, getMeta } from './dom-evaluation.js';
|
|
26
26
|
import { isError } from './is-error.js';
|
|
27
27
|
import { keywordCheck } from './keyword-check.js';
|
|
28
|
+
import { findDisconnectionFailures } from './network-disconnection.js';
|
|
28
29
|
import { parseUrl } from './parse-url.js';
|
|
29
30
|
|
|
30
31
|
const pid = `${process.pid}`;
|
|
@@ -51,6 +52,8 @@ const rLog = resourceLog.extend(pid);
|
|
|
51
52
|
export default class Scraper extends EventEmitter<ScraperEventTypes> {
|
|
52
53
|
/** Number of retries for `@retryable`-decorated methods. Set per-scrape from options. */
|
|
53
54
|
retries?: number;
|
|
55
|
+
/** Cleanup function to remove page listeners registered by `#fetchData`. */
|
|
56
|
+
#pageListenerCleanup: (() => void) | null = null;
|
|
54
57
|
|
|
55
58
|
/**
|
|
56
59
|
* Begins the scraping process for a given URL on the provided Puppeteer page.
|
|
@@ -81,6 +84,7 @@ export default class Scraper extends EventEmitter<ScraperEventTypes> {
|
|
|
81
84
|
const metadataOnly = options?.metadataOnly ?? false;
|
|
82
85
|
const imageLoadTimeout = options?.imageLoadTimeout ?? 5000;
|
|
83
86
|
const resources: ResourceEntry[] = [];
|
|
87
|
+
const failedRequests: Array<{ url: string; errorText: string }> = [];
|
|
84
88
|
|
|
85
89
|
void this.emit('changePhase', {
|
|
86
90
|
pid: process.pid,
|
|
@@ -169,6 +173,7 @@ export default class Scraper extends EventEmitter<ScraperEventTypes> {
|
|
|
169
173
|
captureImages,
|
|
170
174
|
imageLoadTimeout,
|
|
171
175
|
resources,
|
|
176
|
+
failedRequests,
|
|
172
177
|
options,
|
|
173
178
|
).catch((error) => {
|
|
174
179
|
if (error instanceof Error) {
|
|
@@ -179,10 +184,11 @@ export default class Scraper extends EventEmitter<ScraperEventTypes> {
|
|
|
179
184
|
|
|
180
185
|
if (fetchResult instanceof Error) {
|
|
181
186
|
log('Error(FETCH_DATA): %s', url.href);
|
|
182
|
-
|
|
187
|
+
this.#cleanupPageListeners();
|
|
183
188
|
return {
|
|
184
189
|
type: 'error',
|
|
185
190
|
resources,
|
|
191
|
+
failedRequests: failedRequests.length > 0 ? failedRequests : undefined,
|
|
186
192
|
error: {
|
|
187
193
|
name: fetchResult.name,
|
|
188
194
|
message: fetchResult.message,
|
|
@@ -192,7 +198,7 @@ export default class Scraper extends EventEmitter<ScraperEventTypes> {
|
|
|
192
198
|
};
|
|
193
199
|
}
|
|
194
200
|
|
|
195
|
-
|
|
201
|
+
this.#cleanupPageListeners();
|
|
196
202
|
headResult = fetchResult;
|
|
197
203
|
|
|
198
204
|
if (!headResult.isSkipped) {
|
|
@@ -250,7 +256,18 @@ export default class Scraper extends EventEmitter<ScraperEventTypes> {
|
|
|
250
256
|
message: '',
|
|
251
257
|
});
|
|
252
258
|
|
|
253
|
-
return {
|
|
259
|
+
return {
|
|
260
|
+
type: 'success',
|
|
261
|
+
pageData: headResult,
|
|
262
|
+
resources,
|
|
263
|
+
failedRequests: failedRequests.length > 0 ? failedRequests : undefined,
|
|
264
|
+
};
|
|
265
|
+
}
|
|
266
|
+
#cleanupPageListeners() {
|
|
267
|
+
if (this.#pageListenerCleanup) {
|
|
268
|
+
this.#pageListenerCleanup();
|
|
269
|
+
this.#pageListenerCleanup = null;
|
|
270
|
+
}
|
|
254
271
|
}
|
|
255
272
|
|
|
256
273
|
/**
|
|
@@ -292,17 +309,19 @@ export default class Scraper extends EventEmitter<ScraperEventTypes> {
|
|
|
292
309
|
* emitting `retryWait` / `retryExhausted` phase events for progress monitoring.
|
|
293
310
|
*
|
|
294
311
|
* Flow:
|
|
295
|
-
* 1. Register request/response listeners to capture sub-resources (internal pages only)
|
|
312
|
+
* 1. Register request/response/requestfailed listeners to capture sub-resources (internal pages only)
|
|
296
313
|
* 2. Navigate to URL via `page.goto()` and track redirect chain
|
|
297
314
|
* 3. Wait for DOM content and network idle
|
|
298
|
-
* 4.
|
|
299
|
-
* 5.
|
|
315
|
+
* 4. Check for network disconnection errors and throw to trigger retry
|
|
316
|
+
* 5. Extract anchors, meta, and optionally images
|
|
317
|
+
* 6. Check for keyword exclusion in HTML content
|
|
300
318
|
* @param page - Puppeteer page instance
|
|
301
319
|
* @param url - Target URL to navigate to
|
|
302
320
|
* @param isExternal - Whether the URL is external to the crawl scope
|
|
303
321
|
* @param captureImages - Whether to run the image extraction pipeline
|
|
304
322
|
* @param imageLoadTimeout - Timeout (ms) for waiting lazy-loaded images to complete
|
|
305
323
|
* @param resources - Mutable array to collect captured sub-resources into
|
|
324
|
+
* @param failedRequests - Mutable array to collect failed sub-resource requests into
|
|
306
325
|
* @param options - Additional scraper options (e.g. `disableQueries`, `navigationTimeout`)
|
|
307
326
|
* @returns Full page data or skipped page data if an exclusion rule matched
|
|
308
327
|
*/
|
|
@@ -334,6 +353,7 @@ export default class Scraper extends EventEmitter<ScraperEventTypes> {
|
|
|
334
353
|
captureImages: boolean,
|
|
335
354
|
imageLoadTimeout: number,
|
|
336
355
|
resources: ResourceEntry[],
|
|
356
|
+
failedRequests: Array<{ url: string; errorText: string }>,
|
|
337
357
|
options?: Partial<ScraperOptions>,
|
|
338
358
|
): Promise<PageData | SkippedPageData> {
|
|
339
359
|
const parseOpts: ParseURLOptions | undefined =
|
|
@@ -342,7 +362,14 @@ export default class Scraper extends EventEmitter<ScraperEventTypes> {
|
|
|
342
362
|
: { disableQueries: options.disableQueries };
|
|
343
363
|
const networkLogs: Record<string, NetworkLog> = {};
|
|
344
364
|
|
|
345
|
-
|
|
365
|
+
// Clear stale state from previous retries (@retryable may re-invoke this method
|
|
366
|
+
// with the same page and mutable arrays, so we must reset to avoid accumulation)
|
|
367
|
+
this.#cleanupPageListeners();
|
|
368
|
+
failedRequests.length = 0;
|
|
369
|
+
resources.length = 0;
|
|
370
|
+
|
|
371
|
+
// Define named listeners so they can be individually removed on retry/cleanup
|
|
372
|
+
const onDialog = async (dialog: Dialog) => {
|
|
346
373
|
log(`Appear ${dialog.type()} dialog: ${dialog.message()}`);
|
|
347
374
|
try {
|
|
348
375
|
await dialog.accept();
|
|
@@ -350,10 +377,15 @@ export default class Scraper extends EventEmitter<ScraperEventTypes> {
|
|
|
350
377
|
log(`Error: ${error}`);
|
|
351
378
|
}
|
|
352
379
|
log(`Accept ${dialog.type()} dialog`);
|
|
353
|
-
}
|
|
380
|
+
};
|
|
381
|
+
page.on('dialog', onDialog);
|
|
382
|
+
|
|
383
|
+
let onRequest: ((req: HTTPRequest) => void) | null = null;
|
|
384
|
+
let onResponse: ((res: HTTPResponse) => void) | null = null;
|
|
385
|
+
let onRequestFailed: ((req: HTTPRequest) => void) | null = null;
|
|
354
386
|
|
|
355
387
|
if (!isExternal) {
|
|
356
|
-
|
|
388
|
+
onRequest = (request: HTTPRequest) => {
|
|
357
389
|
const url = parseUrl(request.url(), parseOpts)!;
|
|
358
390
|
networkLogs[request.url()] = {
|
|
359
391
|
url,
|
|
@@ -367,10 +399,10 @@ export default class Scraper extends EventEmitter<ScraperEventTypes> {
|
|
|
367
399
|
method: request.method(),
|
|
368
400
|
},
|
|
369
401
|
};
|
|
370
|
-
}
|
|
402
|
+
};
|
|
371
403
|
|
|
372
404
|
const uniqueRes = new Set<string>();
|
|
373
|
-
|
|
405
|
+
onResponse = (response: HTTPResponse) => {
|
|
374
406
|
const resURL = parseUrl(response.url(), parseOpts)!;
|
|
375
407
|
|
|
376
408
|
if (uniqueRes.has(resURL.withoutHash)) {
|
|
@@ -428,9 +460,27 @@ export default class Scraper extends EventEmitter<ScraperEventTypes> {
|
|
|
428
460
|
log,
|
|
429
461
|
resource: referredLink,
|
|
430
462
|
});
|
|
431
|
-
}
|
|
463
|
+
};
|
|
464
|
+
|
|
465
|
+
onRequestFailed = (request: HTTPRequest) => {
|
|
466
|
+
const errorText = request.failure()?.errorText ?? 'Unknown error';
|
|
467
|
+
rLog('Request failed: %s (%s)', request.url(), errorText);
|
|
468
|
+
failedRequests.push({ url: request.url(), errorText });
|
|
469
|
+
};
|
|
470
|
+
|
|
471
|
+
page.on('request', onRequest);
|
|
472
|
+
page.on('response', onResponse);
|
|
473
|
+
page.on('requestfailed', onRequestFailed);
|
|
432
474
|
}
|
|
433
475
|
|
|
476
|
+
// Store cleanup function for retry/post-fetch removal
|
|
477
|
+
this.#pageListenerCleanup = () => {
|
|
478
|
+
page.off('dialog', onDialog);
|
|
479
|
+
if (onRequest) page.off('request', onRequest);
|
|
480
|
+
if (onResponse) page.off('response', onResponse);
|
|
481
|
+
if (onRequestFailed) page.off('requestfailed', onRequestFailed);
|
|
482
|
+
};
|
|
483
|
+
|
|
434
484
|
const navigationTimeout = options?.navigationTimeout ?? 60_000;
|
|
435
485
|
|
|
436
486
|
void this.emit('changePhase', {
|
|
@@ -560,6 +610,15 @@ export default class Scraper extends EventEmitter<ScraperEventTypes> {
|
|
|
560
610
|
.waitForNavigation({ waitUntil: 'networkidle0', timeout: 5000 })
|
|
561
611
|
.catch(() => {});
|
|
562
612
|
|
|
613
|
+
// Check for network disconnection errors in failed requests
|
|
614
|
+
const disconnectionFailures = findDisconnectionFailures(failedRequests);
|
|
615
|
+
if (disconnectionFailures.length > 0) {
|
|
616
|
+
const errorSummary = disconnectionFailures
|
|
617
|
+
.map((r) => `${r.url} (${r.errorText})`)
|
|
618
|
+
.join(', ');
|
|
619
|
+
throw new Error(`Network disconnection detected during page load: ${errorSummary}`);
|
|
620
|
+
}
|
|
621
|
+
|
|
563
622
|
void this.emit('changePhase', {
|
|
564
623
|
pid: process.pid,
|
|
565
624
|
name: 'getAnchors',
|