@d-zero/beholder 0.1.29 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +11 -0
- package/README.md +172 -477
- package/dist/debug.d.ts +4 -1
- package/dist/debug.js +5 -2
- package/dist/dom-evaluation.d.ts +72 -14
- package/dist/dom-evaluation.js +169 -43
- package/dist/index.d.ts +20 -3
- package/dist/index.js +15 -3
- package/dist/is-error.d.ts +8 -0
- package/dist/is-error.js +10 -0
- package/dist/keyword-check.d.ts +5 -3
- package/dist/keyword-check.js +5 -3
- package/dist/parse-url.d.ts +14 -0
- package/dist/parse-url.js +23 -0
- package/dist/scraper.d.ts +39 -13
- package/dist/scraper.js +300 -263
- package/dist/types.d.ts +286 -214
- package/dist/types.js +6 -0
- package/package.json +7 -10
- package/src/debug.ts +5 -2
- package/src/dom-evaluation.ts +195 -65
- package/src/index.ts +27 -3
- package/src/is-error.spec.ts +33 -0
- package/src/is-error.ts +10 -0
- package/src/keyword-check.spec.ts +45 -4
- package/src/keyword-check.ts +5 -3
- package/src/parse-url.spec.ts +35 -0
- package/src/parse-url.ts +26 -0
- package/src/scraper.ts +338 -300
- package/src/types.ts +345 -258
- package/tsconfig.tsbuildinfo +1 -1
- package/dist/events.d.ts +0 -32
- package/dist/events.js +0 -15
- package/dist/fetch-destination.d.ts +0 -8
- package/dist/fetch-destination.js +0 -145
- package/dist/net-timeout-error.d.ts +0 -3
- package/dist/net-timeout-error.js +0 -3
- package/dist/sub-process-runner.d.ts +0 -12
- package/dist/sub-process-runner.js +0 -180
- package/dist/sub-process.d.ts +0 -1
- package/dist/sub-process.js +0 -67
- package/dist/utils.d.ts +0 -16
- package/dist/utils.js +0 -69
- package/src/events.ts +0 -21
- package/src/fetch-destination.ts +0 -173
- package/src/net-timeout-error.ts +0 -3
- package/src/sub-process-runner.ts +0 -220
- package/src/sub-process.ts +0 -86
- package/src/utils.ts +0 -89
package/src/scraper.ts
CHANGED
|
@@ -1,130 +1,119 @@
|
|
|
1
1
|
import type {
|
|
2
|
-
|
|
2
|
+
ChangePhaseEvent,
|
|
3
|
+
ResourceEntry,
|
|
4
|
+
ScraperEventTypes,
|
|
5
|
+
ScraperOptions,
|
|
6
|
+
ScrapeResult,
|
|
7
|
+
ExURL,
|
|
3
8
|
ImageElement,
|
|
4
9
|
NetworkLog,
|
|
5
10
|
PageData,
|
|
6
11
|
ParseURLOptions,
|
|
7
12
|
Resource,
|
|
8
13
|
SkippedPageData,
|
|
9
|
-
ExURL,
|
|
10
14
|
} from './types.js';
|
|
11
|
-
import type {
|
|
15
|
+
import type { PageScanPhase } from '@d-zero/puppeteer-page-scan';
|
|
16
|
+
import type { Page } from 'puppeteer';
|
|
12
17
|
|
|
13
|
-
import { beforePageScan } from '@d-zero/puppeteer-page-scan';
|
|
14
|
-
import {
|
|
15
|
-
import {
|
|
16
|
-
import {
|
|
17
|
-
import {
|
|
18
|
+
import { beforePageScan, devicePresets } from '@d-zero/puppeteer-page-scan';
|
|
19
|
+
import { detectCDN } from '@d-zero/shared/detect-cdn';
|
|
20
|
+
import { detectCompress } from '@d-zero/shared/detect-compress';
|
|
21
|
+
import { retry as retryable } from '@d-zero/shared/retry';
|
|
22
|
+
import { TypedAwaitEventEmitter as EventEmitter } from '@d-zero/shared/typed-await-event-emitter';
|
|
18
23
|
|
|
19
24
|
import { resourceLog, scraperLog } from './debug.js';
|
|
20
25
|
import { getAnchorList, getImageList, getMeta } from './dom-evaluation.js';
|
|
21
|
-
import {
|
|
26
|
+
import { isError } from './is-error.js';
|
|
22
27
|
import { keywordCheck } from './keyword-check.js';
|
|
23
|
-
import {
|
|
28
|
+
import { parseUrl } from './parse-url.js';
|
|
24
29
|
|
|
25
30
|
const pid = `${process.pid}`;
|
|
26
31
|
const log = scraperLog.extend(pid);
|
|
27
32
|
const rLog = resourceLog.extend(pid);
|
|
28
33
|
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
log('Browser disconnects');
|
|
74
|
-
await this.#browser.disconnect();
|
|
75
|
-
}
|
|
76
|
-
log('Scraper discards browser');
|
|
77
|
-
this.#browser = null;
|
|
78
|
-
void this.emit('destroyed', {
|
|
79
|
-
pid: process.pid,
|
|
80
|
-
});
|
|
81
|
-
void this.emit('changePhase', {
|
|
82
|
-
pid: process.pid,
|
|
83
|
-
name: 'destroyed',
|
|
84
|
-
url: this.#url,
|
|
85
|
-
isExternal,
|
|
86
|
-
message: '',
|
|
87
|
-
});
|
|
88
|
-
}
|
|
89
|
-
|
|
90
|
-
async scrapeStart(url: ExURL, options?: Partial<ScraperOptions>, isSkip = false) {
|
|
34
|
+
/**
|
|
35
|
+
* Page-level scraper that extracts data from a single browser page.
|
|
36
|
+
*
|
|
37
|
+
* The scraper returns results as values from `scrapeStart()` rather than
|
|
38
|
+
* emitting them as events. Only streaming events (changePhase, resourceResponse)
|
|
39
|
+
* are emitted for progress monitoring.
|
|
40
|
+
*
|
|
41
|
+
* The Puppeteer `Page` object is injected externally, and page lifecycle
|
|
42
|
+
* (including `page.close()`) is managed by the caller.
|
|
43
|
+
* @example
|
|
44
|
+
* ```ts
|
|
45
|
+
* const scraper = new Scraper();
|
|
46
|
+
* scraper.on('changePhase', (e) => console.log(e.name));
|
|
47
|
+
* const result = await scraper.scrapeStart(page, url, { isExternal: false });
|
|
48
|
+
* ```
|
|
49
|
+
*/
|
|
50
|
+
// eslint-disable-next-line unicorn/prefer-event-target -- TypedAwaitEventEmitter is a project-specific typed wrapper, not Node.js EventEmitter
|
|
51
|
+
export default class Scraper extends EventEmitter<ScraperEventTypes> {
|
|
52
|
+
/** Number of retries for `@retryable`-decorated methods. Set per-scrape from options. */
|
|
53
|
+
retries?: number;
|
|
54
|
+
|
|
55
|
+
/**
|
|
56
|
+
* Begins the scraping process for a given URL on the provided Puppeteer page.
|
|
57
|
+
*
|
|
58
|
+
* Returns a `ScrapeResult` containing the outcome:
|
|
59
|
+
* - `type: "success"` with `pageData` on success
|
|
60
|
+
* - `type: "skipped"` with `ignored` details when the page is excluded
|
|
61
|
+
* - `type: "error"` with `error` details when scraping fails
|
|
62
|
+
*
|
|
63
|
+
* Sub-resources captured while the page loads are pushed into the returned
|
|
64
|
+
* `ScrapeResult.resources` and are also emitted as `resourceResponse` events.
|
|
65
|
+
* @param page - The Puppeteer page instance to use for navigation and DOM evaluation.
|
|
66
|
+
* @param url - The extended URL to scrape.
|
|
67
|
+
* @param options - Optional scraper configuration overriding defaults.
|
|
68
|
+
* @param isSkip - When `true`, the page is immediately skipped without any network requests.
|
|
69
|
+
* @returns The scrape result containing the outcome and captured resources.
|
|
70
|
+
*/
|
|
71
|
+
async scrapeStart(
|
|
72
|
+
page: Page,
|
|
73
|
+
url: ExURL,
|
|
74
|
+
options?: Partial<ScraperOptions>,
|
|
75
|
+
isSkip = false,
|
|
76
|
+
): Promise<ScrapeResult> {
|
|
77
|
+
this.retries = options?.retries;
|
|
91
78
|
const isExternal = options?.isExternal ?? false;
|
|
92
|
-
const
|
|
79
|
+
const captureImages = options?.captureImages ?? true;
|
|
93
80
|
const excludeKeywords = options?.excludeKeywords ?? [];
|
|
94
|
-
const
|
|
95
|
-
const
|
|
81
|
+
const metadataOnly = options?.metadataOnly ?? false;
|
|
82
|
+
const imageLoadTimeout = options?.imageLoadTimeout ?? 5000;
|
|
83
|
+
const resources: ResourceEntry[] = [];
|
|
96
84
|
|
|
97
|
-
this.#url = url;
|
|
98
85
|
void this.emit('changePhase', {
|
|
99
86
|
pid: process.pid,
|
|
100
87
|
name: 'scrapeStart',
|
|
101
|
-
url
|
|
88
|
+
url,
|
|
102
89
|
isExternal,
|
|
103
90
|
message: '',
|
|
104
91
|
});
|
|
105
92
|
|
|
93
|
+
// Path-excluded: return a result with `type: 'skipped'`
|
|
106
94
|
if (isSkip) {
|
|
107
|
-
void this.emit('ignoreAndSkip', {
|
|
108
|
-
pid: process.pid,
|
|
109
|
-
url: this.#url,
|
|
110
|
-
reason: {
|
|
111
|
-
matchedText: this.#url.pathname || '',
|
|
112
|
-
excludeKeywords,
|
|
113
|
-
},
|
|
114
|
-
});
|
|
115
95
|
void this.emit('changePhase', {
|
|
116
96
|
pid: process.pid,
|
|
117
|
-
name: '
|
|
118
|
-
url
|
|
97
|
+
name: 'pageSkipped',
|
|
98
|
+
url,
|
|
119
99
|
isExternal,
|
|
120
100
|
message: 'Matched: excluded path',
|
|
121
101
|
});
|
|
122
|
-
return
|
|
102
|
+
return {
|
|
103
|
+
type: 'skipped',
|
|
104
|
+
resources,
|
|
105
|
+
ignored: {
|
|
106
|
+
url,
|
|
107
|
+
matchedText: url.pathname || '',
|
|
108
|
+
excludeKeywords,
|
|
109
|
+
},
|
|
110
|
+
};
|
|
123
111
|
}
|
|
124
112
|
|
|
125
|
-
|
|
113
|
+
// Non-HTTP protocol: return minimal PageData
|
|
114
|
+
if (!url.isHTTP) {
|
|
126
115
|
const result: PageData = {
|
|
127
|
-
url
|
|
116
|
+
url,
|
|
128
117
|
isTarget: false,
|
|
129
118
|
isExternal,
|
|
130
119
|
redirectPaths: [],
|
|
@@ -142,69 +131,44 @@ export default class Scraper extends TypedAwaitEventEmitter<ScrapeEventTypes> {
|
|
|
142
131
|
isSkipped: false,
|
|
143
132
|
};
|
|
144
133
|
|
|
145
|
-
void this.emit('scrapeEnd', {
|
|
146
|
-
pid: process.pid,
|
|
147
|
-
url: this.#url,
|
|
148
|
-
timestamp: Date.now(),
|
|
149
|
-
result,
|
|
150
|
-
});
|
|
151
|
-
|
|
152
134
|
void this.emit('changePhase', {
|
|
153
135
|
pid: process.pid,
|
|
154
136
|
name: 'scrapeEnd',
|
|
155
|
-
url
|
|
137
|
+
url,
|
|
156
138
|
isExternal,
|
|
157
139
|
message: '',
|
|
158
140
|
});
|
|
159
|
-
return;
|
|
141
|
+
return { type: 'success', pageData: result, resources };
|
|
160
142
|
}
|
|
161
143
|
|
|
162
|
-
|
|
163
|
-
pid: process.pid,
|
|
164
|
-
name: 'touchHead',
|
|
165
|
-
url: this.#url,
|
|
166
|
-
isExternal,
|
|
167
|
-
message: '',
|
|
168
|
-
});
|
|
144
|
+
let headResult: PageData | SkippedPageData | null = options?.headCheckResult ?? null;
|
|
169
145
|
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
isExternal,
|
|
173
|
-
);
|
|
174
|
-
|
|
175
|
-
if (result instanceof Error) {
|
|
176
|
-
log('Error(FETCH_HEAD): %s', url.href);
|
|
177
|
-
void this.emit('error', {
|
|
146
|
+
if (headResult && metadataOnly) {
|
|
147
|
+
void this.emit('changePhase', {
|
|
178
148
|
pid: process.pid,
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
149
|
+
name: 'scrapeEnd',
|
|
150
|
+
url,
|
|
151
|
+
isExternal,
|
|
152
|
+
message: '',
|
|
182
153
|
});
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
void this.emit('scrapeEnd', {
|
|
188
|
-
pid: process.pid,
|
|
189
|
-
url: this.#url,
|
|
190
|
-
timestamp: Date.now(),
|
|
191
|
-
result: {
|
|
192
|
-
...result,
|
|
154
|
+
return {
|
|
155
|
+
type: 'success',
|
|
156
|
+
pageData: {
|
|
157
|
+
...headResult,
|
|
193
158
|
isTarget: false,
|
|
194
159
|
},
|
|
195
|
-
|
|
196
|
-
|
|
160
|
+
resources,
|
|
161
|
+
};
|
|
197
162
|
}
|
|
198
163
|
|
|
199
|
-
if (
|
|
200
|
-
const
|
|
201
|
-
const page = await this.#createPage(isExternal, executablePath, headlessMode);
|
|
202
|
-
|
|
203
|
-
result = await this.#fetchData(
|
|
164
|
+
if (headResult === null || headResult.contentType === 'text/html') {
|
|
165
|
+
const fetchResult = await this.#fetchData(
|
|
204
166
|
page,
|
|
205
167
|
url,
|
|
206
168
|
isExternal,
|
|
207
|
-
|
|
169
|
+
captureImages,
|
|
170
|
+
imageLoadTimeout,
|
|
171
|
+
resources,
|
|
208
172
|
options,
|
|
209
173
|
).catch((error) => {
|
|
210
174
|
if (error instanceof Error) {
|
|
@@ -213,28 +177,29 @@ export default class Scraper extends TypedAwaitEventEmitter<ScrapeEventTypes> {
|
|
|
213
177
|
return new Error(error);
|
|
214
178
|
});
|
|
215
179
|
|
|
216
|
-
if (
|
|
180
|
+
if (fetchResult instanceof Error) {
|
|
217
181
|
log('Error(FETCH_DATA): %s', url.href);
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
error:
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
182
|
+
page.removeAllListeners();
|
|
183
|
+
return {
|
|
184
|
+
type: 'error',
|
|
185
|
+
resources,
|
|
186
|
+
error: {
|
|
187
|
+
name: fetchResult.name,
|
|
188
|
+
message: fetchResult.message,
|
|
189
|
+
stack: fetchResult.stack,
|
|
190
|
+
shutdown: true,
|
|
191
|
+
},
|
|
192
|
+
};
|
|
226
193
|
}
|
|
227
194
|
|
|
228
195
|
page.removeAllListeners();
|
|
229
|
-
|
|
230
|
-
await page.close();
|
|
231
|
-
}
|
|
196
|
+
headResult = fetchResult;
|
|
232
197
|
|
|
233
|
-
if (!
|
|
234
|
-
const checkedKeyword = keywordCheck(
|
|
198
|
+
if (!headResult.isSkipped) {
|
|
199
|
+
const checkedKeyword = keywordCheck(headResult.html, excludeKeywords);
|
|
235
200
|
|
|
236
201
|
if (checkedKeyword) {
|
|
237
|
-
|
|
202
|
+
headResult = {
|
|
238
203
|
url,
|
|
239
204
|
isSkipped: true,
|
|
240
205
|
matched: {
|
|
@@ -246,136 +211,135 @@ export default class Scraper extends TypedAwaitEventEmitter<ScrapeEventTypes> {
|
|
|
246
211
|
}
|
|
247
212
|
}
|
|
248
213
|
|
|
249
|
-
if (
|
|
250
|
-
if (
|
|
251
|
-
return
|
|
214
|
+
if (headResult.isSkipped) {
|
|
215
|
+
if (headResult.matched.type === 'path') {
|
|
216
|
+
return {
|
|
217
|
+
type: 'skipped',
|
|
218
|
+
resources,
|
|
219
|
+
ignored: {
|
|
220
|
+
url,
|
|
221
|
+
matchedText: url.pathname || '',
|
|
222
|
+
excludeKeywords,
|
|
223
|
+
},
|
|
224
|
+
};
|
|
252
225
|
}
|
|
253
|
-
void this.emit('ignoreAndSkip', {
|
|
254
|
-
pid: process.pid,
|
|
255
|
-
url: this.#url,
|
|
256
|
-
reason: {
|
|
257
|
-
matchedText: result.matched.text,
|
|
258
|
-
excludeKeywords,
|
|
259
|
-
},
|
|
260
|
-
});
|
|
261
226
|
void this.emit('changePhase', {
|
|
262
227
|
pid: process.pid,
|
|
263
|
-
name: '
|
|
264
|
-
url
|
|
228
|
+
name: 'pageSkipped',
|
|
229
|
+
url,
|
|
265
230
|
isExternal,
|
|
266
|
-
message: `Matched: "${
|
|
231
|
+
message: `Matched: "${headResult.matched.text}"`,
|
|
267
232
|
});
|
|
268
|
-
return
|
|
233
|
+
return {
|
|
234
|
+
type: 'skipped',
|
|
235
|
+
resources,
|
|
236
|
+
ignored: {
|
|
237
|
+
url,
|
|
238
|
+
matchedText: headResult.matched.text,
|
|
239
|
+
excludeKeywords,
|
|
240
|
+
},
|
|
241
|
+
};
|
|
269
242
|
}
|
|
270
243
|
}
|
|
271
244
|
|
|
272
|
-
void this.emit('scrapeEnd', {
|
|
273
|
-
pid: process.pid,
|
|
274
|
-
url: this.#url,
|
|
275
|
-
timestamp: Date.now(),
|
|
276
|
-
result,
|
|
277
|
-
});
|
|
278
|
-
|
|
279
245
|
void this.emit('changePhase', {
|
|
280
246
|
pid: process.pid,
|
|
281
247
|
name: 'scrapeEnd',
|
|
282
|
-
url
|
|
248
|
+
url,
|
|
283
249
|
isExternal,
|
|
284
250
|
message: '',
|
|
285
251
|
});
|
|
286
252
|
|
|
287
|
-
return
|
|
253
|
+
return { type: 'success', pageData: headResult, resources };
|
|
288
254
|
}
|
|
289
255
|
|
|
290
|
-
|
|
291
|
-
|
|
256
|
+
/**
|
|
257
|
+
* Creates a callback for `@d-zero/puppeteer-page-scan`'s `beforePageScan` listener.
|
|
258
|
+
*
|
|
259
|
+
* WHY a separate factory: The listener must capture `isExternal` for phase events
|
|
260
|
+
* while conforming to the `beforePageScan` listener signature.
|
|
261
|
+
* Currently only handles the `scroll` phase to report scroll progress.
|
|
262
|
+
* @param isExternal - Whether the current page is external to the crawl scope
|
|
263
|
+
* @returns A listener function compatible with `beforePageScan`'s `listener` option
|
|
264
|
+
*/
|
|
265
|
+
#createPageScanListener(
|
|
292
266
|
isExternal: boolean,
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
267
|
+
): (phase: keyof PageScanPhase, data: PageScanPhase[keyof PageScanPhase]) => void {
|
|
268
|
+
return (phase, data) => {
|
|
269
|
+
switch (phase) {
|
|
270
|
+
case 'scroll': {
|
|
271
|
+
const d = data as PageScanPhase['scroll'];
|
|
272
|
+
const scrollMsg = Number.isNaN(d.scrollHeight)
|
|
273
|
+
? `%propeller% ${d.message}`
|
|
274
|
+
: `%propeller% ${d.scrollY}px/${d.scrollHeight}px (${Math.round((d.scrollY / d.scrollHeight) * 100)}%) ${d.message}`;
|
|
275
|
+
void this.emit('changePhase', {
|
|
276
|
+
pid: process.pid,
|
|
277
|
+
name: 'scrollToBottom',
|
|
278
|
+
url: null,
|
|
279
|
+
isExternal,
|
|
280
|
+
message: scrollMsg,
|
|
281
|
+
} satisfies ChangePhaseEvent);
|
|
282
|
+
break;
|
|
283
|
+
}
|
|
284
|
+
}
|
|
285
|
+
};
|
|
286
|
+
}
|
|
287
|
+
/**
|
|
288
|
+
* Navigates the page to the target URL and extracts full page data.
|
|
289
|
+
*
|
|
290
|
+
* WHY retryable with 3-min timeout: Page navigation can fail due to transient
|
|
291
|
+
* network issues or slow-loading pages. The decorator retries automatically,
|
|
292
|
+
* emitting `retryWait` / `retryExhausted` phase events for progress monitoring.
|
|
293
|
+
*
|
|
294
|
+
* Flow:
|
|
295
|
+
* 1. Register request/response listeners to capture sub-resources (internal pages only)
|
|
296
|
+
* 2. Navigate to URL via `page.goto()` and track redirect chain
|
|
297
|
+
* 3. Wait for DOM content and network idle
|
|
298
|
+
* 4. Extract anchors, meta, and optionally images
|
|
299
|
+
* 5. Check for keyword exclusion in HTML content
|
|
300
|
+
* @param page - Puppeteer page instance
|
|
301
|
+
* @param url - Target URL to navigate to
|
|
302
|
+
* @param isExternal - Whether the URL is external to the crawl scope
|
|
303
|
+
* @param captureImages - Whether to run the image extraction pipeline
|
|
304
|
+
* @param imageLoadTimeout - Timeout (ms) to wait for lazy-loaded images to finish loading
|
|
305
|
+
* @param resources - Mutable array to collect captured sub-resources into
|
|
306
|
+
* @param options - Additional scraper options (e.g. `disableQueries`, `navigationTimeout`)
|
|
307
|
+
* @returns Full page data or skipped page data if an exclusion rule matched
|
|
308
|
+
*/
|
|
309
|
+
@retryable({
|
|
310
|
+
timeout: 3 * 60 * 1000,
|
|
311
|
+
onWait(this: Scraper, determinedInterval, retryCount, methodName, error) {
|
|
297
312
|
void this.emit('changePhase', {
|
|
298
313
|
pid: process.pid,
|
|
299
|
-
name: '
|
|
300
|
-
url:
|
|
301
|
-
isExternal,
|
|
302
|
-
message:
|
|
314
|
+
name: 'retryWait',
|
|
315
|
+
url: null,
|
|
316
|
+
isExternal: false,
|
|
317
|
+
message: `${methodName}: ${error.message} — %countdown(${determinedInterval},${methodName}_${retryCount},s)%s (retry #${retryCount + 1})`,
|
|
303
318
|
});
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
'--no-zygote',
|
|
313
|
-
'--ignore-certificate-errors',
|
|
314
|
-
],
|
|
315
|
-
}).catch((error) => {
|
|
316
|
-
if (error instanceof Error) {
|
|
317
|
-
return error;
|
|
318
|
-
}
|
|
319
|
-
throw error;
|
|
319
|
+
},
|
|
320
|
+
onGiveUp(this: Scraper, retryCount, error, methodName) {
|
|
321
|
+
void this.emit('changePhase', {
|
|
322
|
+
pid: process.pid,
|
|
323
|
+
name: 'retryExhausted',
|
|
324
|
+
url: null,
|
|
325
|
+
isExternal: false,
|
|
326
|
+
message: `${methodName}: gave up after ${retryCount} retries — ${error.message}`,
|
|
320
327
|
});
|
|
321
|
-
|
|
322
|
-
if (browser instanceof Error) {
|
|
323
|
-
void this.emit('error', {
|
|
324
|
-
pid: process.pid,
|
|
325
|
-
url: this.#url!,
|
|
326
|
-
shutdown: false,
|
|
327
|
-
error: browser,
|
|
328
|
-
});
|
|
329
|
-
throw browser;
|
|
330
|
-
}
|
|
331
|
-
|
|
332
|
-
this.#browser = browser;
|
|
333
|
-
} else if (!this.#browser.isConnected()) {
|
|
334
|
-
await this.#browser.close();
|
|
335
|
-
}
|
|
336
|
-
|
|
337
|
-
return this.#browser;
|
|
338
|
-
}
|
|
339
|
-
|
|
340
|
-
@retry()
|
|
341
|
-
async #createPage(
|
|
342
|
-
isExternal: boolean,
|
|
343
|
-
executablePath: string | null,
|
|
344
|
-
headless: boolean | 'shell',
|
|
345
|
-
) {
|
|
346
|
-
const browser = await this.#bootBrowser(isExternal, executablePath, headless);
|
|
347
|
-
|
|
348
|
-
void this.emit('changePhase', {
|
|
349
|
-
pid: process.pid,
|
|
350
|
-
name: 'newPage',
|
|
351
|
-
url: this.#url,
|
|
352
|
-
isExternal,
|
|
353
|
-
message: '',
|
|
354
|
-
});
|
|
355
|
-
|
|
356
|
-
const page = await browser.newPage();
|
|
357
|
-
page.setDefaultNavigationTimeout(0);
|
|
358
|
-
await page.setUserAgent(
|
|
359
|
-
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
|
|
360
|
-
);
|
|
361
|
-
await page.setExtraHTTPHeaders({
|
|
362
|
-
// TODO: Optional lang
|
|
363
|
-
'Accept-Language': 'ja-JP',
|
|
364
|
-
});
|
|
365
|
-
|
|
366
|
-
return page;
|
|
367
|
-
}
|
|
368
|
-
|
|
369
|
-
@retry({
|
|
370
|
-
timeout: 1 * 60 * 1000, // 1sec,
|
|
328
|
+
},
|
|
371
329
|
})
|
|
372
330
|
async #fetchData(
|
|
373
331
|
page: Page,
|
|
374
332
|
url: ExURL,
|
|
375
333
|
isExternal: boolean,
|
|
376
|
-
|
|
377
|
-
|
|
334
|
+
captureImages: boolean,
|
|
335
|
+
imageLoadTimeout: number,
|
|
336
|
+
resources: ResourceEntry[],
|
|
337
|
+
options?: Partial<ScraperOptions>,
|
|
378
338
|
): Promise<PageData | SkippedPageData> {
|
|
339
|
+
const parseOpts: ParseURLOptions | undefined =
|
|
340
|
+
options?.disableQueries == null
|
|
341
|
+
? undefined
|
|
342
|
+
: { disableQueries: options.disableQueries };
|
|
379
343
|
const networkLogs: Record<string, NetworkLog> = {};
|
|
380
344
|
|
|
381
345
|
page.on('dialog', async (dialog) => {
|
|
@@ -390,7 +354,7 @@ export default class Scraper extends TypedAwaitEventEmitter<ScrapeEventTypes> {
|
|
|
390
354
|
|
|
391
355
|
if (!isExternal) {
|
|
392
356
|
page.on('request', (request) => {
|
|
393
|
-
const url = parseUrl(request.url(),
|
|
357
|
+
const url = parseUrl(request.url(), parseOpts)!;
|
|
394
358
|
networkLogs[request.url()] = {
|
|
395
359
|
url,
|
|
396
360
|
status: null,
|
|
@@ -407,7 +371,7 @@ export default class Scraper extends TypedAwaitEventEmitter<ScrapeEventTypes> {
|
|
|
407
371
|
|
|
408
372
|
const uniqueRes = new Set<string>();
|
|
409
373
|
page.on('response', (response) => {
|
|
410
|
-
const resURL = parseUrl(response.url(),
|
|
374
|
+
const resURL = parseUrl(response.url(), parseOpts)!;
|
|
411
375
|
|
|
412
376
|
if (uniqueRes.has(resURL.withoutHash)) {
|
|
413
377
|
return;
|
|
@@ -453,6 +417,11 @@ export default class Scraper extends TypedAwaitEventEmitter<ScrapeEventTypes> {
|
|
|
453
417
|
};
|
|
454
418
|
|
|
455
419
|
rLog('Fetched: %s', resURL.href);
|
|
420
|
+
|
|
421
|
+
// Collect resource into the results array
|
|
422
|
+
resources.push({ log, resource: referredLink, pageUrl: url.withoutHash });
|
|
423
|
+
|
|
424
|
+
// Also emit for streaming consumers
|
|
456
425
|
void this.emit('resourceResponse', {
|
|
457
426
|
pid: process.pid,
|
|
458
427
|
url,
|
|
@@ -462,12 +431,14 @@ export default class Scraper extends TypedAwaitEventEmitter<ScrapeEventTypes> {
|
|
|
462
431
|
});
|
|
463
432
|
}
|
|
464
433
|
|
|
434
|
+
const navigationTimeout = options?.navigationTimeout ?? 60_000;
|
|
435
|
+
|
|
465
436
|
void this.emit('changePhase', {
|
|
466
437
|
pid: process.pid,
|
|
467
438
|
name: 'openPage',
|
|
468
|
-
url
|
|
439
|
+
url,
|
|
469
440
|
isExternal,
|
|
470
|
-
message:
|
|
441
|
+
message: `%countdown(${navigationTimeout},openPage_${url.withoutHash},s)%s`,
|
|
471
442
|
});
|
|
472
443
|
|
|
473
444
|
if (url.username && url.password) {
|
|
@@ -476,19 +447,24 @@ export default class Scraper extends TypedAwaitEventEmitter<ScrapeEventTypes> {
|
|
|
476
447
|
});
|
|
477
448
|
}
|
|
478
449
|
|
|
479
|
-
const res = await page.goto(url.withoutHashAndAuth);
|
|
450
|
+
const res = await page.goto(url.withoutHashAndAuth, { timeout: navigationTimeout });
|
|
480
451
|
|
|
481
452
|
if (!res) {
|
|
482
453
|
throw new Error('The method Page.goto returned null');
|
|
483
454
|
}
|
|
484
455
|
|
|
485
|
-
const destUrl = parseUrl(page.url(),
|
|
486
|
-
const redirectPaths =
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
456
|
+
const destUrl = parseUrl(page.url(), parseOpts)!;
|
|
457
|
+
const redirectPaths = new Set<string>();
|
|
458
|
+
|
|
459
|
+
if (url.withoutHash !== destUrl.withoutHash) {
|
|
460
|
+
const redirectChain = res
|
|
461
|
+
.request()
|
|
462
|
+
.redirectChain()
|
|
463
|
+
.map((req) => req.url());
|
|
464
|
+
for (const redirectPath of redirectChain) {
|
|
465
|
+
redirectPaths.add(redirectPath);
|
|
466
|
+
}
|
|
467
|
+
redirectPaths.add(destUrl.withoutHash);
|
|
492
468
|
}
|
|
493
469
|
|
|
494
470
|
if (destUrl.hostname !== url.hostname) {
|
|
@@ -507,7 +483,7 @@ export default class Scraper extends TypedAwaitEventEmitter<ScrapeEventTypes> {
|
|
|
507
483
|
url,
|
|
508
484
|
isTarget: false,
|
|
509
485
|
isExternal,
|
|
510
|
-
redirectPaths,
|
|
486
|
+
redirectPaths: [...redirectPaths],
|
|
511
487
|
status,
|
|
512
488
|
statusText,
|
|
513
489
|
contentType,
|
|
@@ -526,7 +502,7 @@ export default class Scraper extends TypedAwaitEventEmitter<ScrapeEventTypes> {
|
|
|
526
502
|
void this.emit('changePhase', {
|
|
527
503
|
pid: process.pid,
|
|
528
504
|
name: 'loadDOMContent',
|
|
529
|
-
url
|
|
505
|
+
url,
|
|
530
506
|
isExternal,
|
|
531
507
|
message: '',
|
|
532
508
|
});
|
|
@@ -538,7 +514,7 @@ export default class Scraper extends TypedAwaitEventEmitter<ScrapeEventTypes> {
|
|
|
538
514
|
void this.emit('changePhase', {
|
|
539
515
|
pid: process.pid,
|
|
540
516
|
name: 'getHTML',
|
|
541
|
-
url
|
|
517
|
+
url,
|
|
542
518
|
isExternal,
|
|
543
519
|
message: '',
|
|
544
520
|
});
|
|
@@ -556,7 +532,7 @@ export default class Scraper extends TypedAwaitEventEmitter<ScrapeEventTypes> {
|
|
|
556
532
|
url,
|
|
557
533
|
isTarget: false,
|
|
558
534
|
isExternal,
|
|
559
|
-
redirectPaths,
|
|
535
|
+
redirectPaths: [...redirectPaths],
|
|
560
536
|
status,
|
|
561
537
|
statusText,
|
|
562
538
|
contentType,
|
|
@@ -574,8 +550,8 @@ export default class Scraper extends TypedAwaitEventEmitter<ScrapeEventTypes> {
|
|
|
574
550
|
|
|
575
551
|
void this.emit('changePhase', {
|
|
576
552
|
pid: process.pid,
|
|
577
|
-
name: '
|
|
578
|
-
url
|
|
553
|
+
name: 'waitNetworkIdle',
|
|
554
|
+
url,
|
|
579
555
|
isExternal,
|
|
580
556
|
message: '',
|
|
581
557
|
});
|
|
@@ -587,28 +563,44 @@ export default class Scraper extends TypedAwaitEventEmitter<ScrapeEventTypes> {
|
|
|
587
563
|
void this.emit('changePhase', {
|
|
588
564
|
pid: process.pid,
|
|
589
565
|
name: 'getAnchors',
|
|
590
|
-
url
|
|
566
|
+
url,
|
|
591
567
|
isExternal,
|
|
592
568
|
message: '',
|
|
593
569
|
});
|
|
594
|
-
const anchorList = await getAnchorList(page,
|
|
570
|
+
const anchorList = await getAnchorList(page, parseOpts);
|
|
595
571
|
|
|
596
572
|
void this.emit('changePhase', {
|
|
597
573
|
pid: process.pid,
|
|
598
574
|
name: 'getMeta',
|
|
599
|
-
url
|
|
575
|
+
url,
|
|
600
576
|
isExternal,
|
|
601
577
|
message: '',
|
|
602
578
|
});
|
|
603
579
|
const meta = await getMeta(page);
|
|
604
580
|
|
|
605
|
-
const imageList =
|
|
581
|
+
const imageList = captureImages
|
|
582
|
+
? await (async () => {
|
|
583
|
+
void this.emit('changePhase', {
|
|
584
|
+
pid: process.pid,
|
|
585
|
+
name: 'extractImages',
|
|
586
|
+
url,
|
|
587
|
+
isExternal,
|
|
588
|
+
message: '',
|
|
589
|
+
});
|
|
590
|
+
return this.#fetchImages(
|
|
591
|
+
page,
|
|
592
|
+
url.withoutHashAndAuth,
|
|
593
|
+
isExternal,
|
|
594
|
+
imageLoadTimeout,
|
|
595
|
+
);
|
|
596
|
+
})()
|
|
597
|
+
: [];
|
|
606
598
|
|
|
607
599
|
return {
|
|
608
600
|
url,
|
|
609
601
|
isTarget: true,
|
|
610
602
|
isExternal,
|
|
611
|
-
redirectPaths,
|
|
603
|
+
redirectPaths: [...redirectPaths],
|
|
612
604
|
status,
|
|
613
605
|
statusText,
|
|
614
606
|
contentType,
|
|
@@ -621,50 +613,96 @@ export default class Scraper extends TypedAwaitEventEmitter<ScrapeEventTypes> {
|
|
|
621
613
|
isSkipped: false,
|
|
622
614
|
};
|
|
623
615
|
}
|
|
624
|
-
|
|
625
|
-
|
|
626
|
-
|
|
627
|
-
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
|
|
616
|
+
/**
|
|
617
|
+
* Extracts image data from the page across multiple device presets.
|
|
618
|
+
*
|
|
619
|
+
* WHY multiple device presets: Images may differ between desktop and mobile
|
|
620
|
+
* due to responsive `<picture>` / `srcset`. Capturing both `desktop-compact`
|
|
621
|
+
* and `mobile-small` viewports reveals responsive image issues.
|
|
622
|
+
*
|
|
623
|
+
* WHY retryable with 5-min timeout and `fallback: []`: Image extraction is
|
|
624
|
+
* best-effort. If all retries fail, an empty array is returned rather than
|
|
625
|
+
* failing the entire page scrape.
|
|
626
|
+
* @param page - Puppeteer page instance
|
|
627
|
+
* @param url - The page URL string (without hash and auth)
|
|
628
|
+
* @param isExternal - Whether the page is external
|
|
629
|
+
* @param imageLoadTimeout - Timeout (ms) to wait for images to finish loading
|
|
630
|
+
* @returns Array of image elements from all device presets
|
|
631
|
+
*/
|
|
632
|
+
@retryable({
|
|
633
|
+
timeout: 5 * 60 * 1000,
|
|
632
634
|
fallback: [],
|
|
635
|
+
onWait(this: Scraper, determinedInterval, retryCount, methodName, error) {
|
|
636
|
+
void this.emit('changePhase', {
|
|
637
|
+
pid: process.pid,
|
|
638
|
+
name: 'retryWait',
|
|
639
|
+
url: null,
|
|
640
|
+
isExternal: false,
|
|
641
|
+
message: `${methodName}: ${error.message} — %countdown(${determinedInterval},${methodName}_${retryCount},s)%s (retry #${retryCount + 1} / images)`,
|
|
642
|
+
});
|
|
643
|
+
},
|
|
644
|
+
onGiveUp(this: Scraper, retryCount, error, methodName) {
|
|
645
|
+
void this.emit('changePhase', {
|
|
646
|
+
pid: process.pid,
|
|
647
|
+
name: 'retryExhausted',
|
|
648
|
+
url: null,
|
|
649
|
+
isExternal: false,
|
|
650
|
+
message: `${methodName}: gave up after ${retryCount} retries — ${error.message}`,
|
|
651
|
+
});
|
|
652
|
+
},
|
|
633
653
|
})
|
|
634
|
-
async #fetchImages(
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
|
|
654
|
+
async #fetchImages(
|
|
655
|
+
page: Page,
|
|
656
|
+
url: string,
|
|
657
|
+
isExternal: boolean,
|
|
658
|
+
imageLoadTimeout: number,
|
|
659
|
+
): Promise<ImageElement[]> {
|
|
660
|
+
const listener = this.#createPageScanListener(isExternal);
|
|
661
|
+
const devices: { key: string; preset: { width: number; resolution?: number } }[] = [
|
|
662
|
+
{ key: 'desktop-compact', preset: devicePresets['desktop-compact'] },
|
|
663
|
+
{ key: 'mobile-small', preset: devicePresets['mobile-small'] },
|
|
641
664
|
];
|
|
665
|
+
const imageList: ImageElement[] = [];
|
|
642
666
|
|
|
643
|
-
for (const
|
|
667
|
+
for (const { key, preset } of devices) {
|
|
644
668
|
void this.emit('changePhase', {
|
|
645
669
|
pid: process.pid,
|
|
646
670
|
name: 'setViewport',
|
|
647
|
-
url:
|
|
671
|
+
url: null,
|
|
648
672
|
isExternal,
|
|
649
|
-
message:
|
|
673
|
+
message: `📷 ${key} ↔️ ${preset.width}px`,
|
|
650
674
|
});
|
|
651
675
|
|
|
652
676
|
await beforePageScan(page, url, {
|
|
653
|
-
name:
|
|
654
|
-
width:
|
|
655
|
-
resolution:
|
|
677
|
+
name: key,
|
|
678
|
+
width: preset.width,
|
|
679
|
+
resolution: preset.resolution,
|
|
680
|
+
listener,
|
|
656
681
|
timeout: 5000,
|
|
657
682
|
});
|
|
658
683
|
|
|
659
684
|
void this.emit('changePhase', {
|
|
660
685
|
pid: process.pid,
|
|
661
|
-
name: '
|
|
662
|
-
url:
|
|
686
|
+
name: 'waitImageLoad',
|
|
687
|
+
url: null,
|
|
663
688
|
isExternal,
|
|
664
|
-
message:
|
|
689
|
+
message: `📷 ${key}: Waiting for images%dots%`,
|
|
665
690
|
});
|
|
666
691
|
|
|
667
|
-
|
|
692
|
+
await page
|
|
693
|
+
.waitForFunction(() => [...document.images].every((img) => img.complete), {
|
|
694
|
+
timeout: imageLoadTimeout,
|
|
695
|
+
})
|
|
696
|
+
.catch(() => {});
|
|
697
|
+
|
|
698
|
+
void this.emit('changePhase', {
|
|
699
|
+
pid: process.pid,
|
|
700
|
+
name: 'getImages',
|
|
701
|
+
url: null,
|
|
702
|
+
isExternal,
|
|
703
|
+
message: `📸 ${key}: Extracting images%dots%`,
|
|
704
|
+
});
|
|
705
|
+
const images = await getImageList(page, preset.width);
|
|
668
706
|
imageList.push(...images);
|
|
669
707
|
}
|
|
670
708
|
|