@d-zero/beholder 0.1.29 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. package/CHANGELOG.md +11 -0
  2. package/README.md +172 -477
  3. package/dist/debug.d.ts +4 -1
  4. package/dist/debug.js +5 -2
  5. package/dist/dom-evaluation.d.ts +72 -14
  6. package/dist/dom-evaluation.js +169 -43
  7. package/dist/index.d.ts +20 -3
  8. package/dist/index.js +15 -3
  9. package/dist/is-error.d.ts +8 -0
  10. package/dist/is-error.js +10 -0
  11. package/dist/keyword-check.d.ts +5 -3
  12. package/dist/keyword-check.js +5 -3
  13. package/dist/parse-url.d.ts +14 -0
  14. package/dist/parse-url.js +23 -0
  15. package/dist/scraper.d.ts +39 -13
  16. package/dist/scraper.js +300 -263
  17. package/dist/types.d.ts +286 -214
  18. package/dist/types.js +6 -0
  19. package/package.json +7 -10
  20. package/src/debug.ts +5 -2
  21. package/src/dom-evaluation.ts +195 -65
  22. package/src/index.ts +27 -3
  23. package/src/is-error.spec.ts +33 -0
  24. package/src/is-error.ts +10 -0
  25. package/src/keyword-check.spec.ts +45 -4
  26. package/src/keyword-check.ts +5 -3
  27. package/src/parse-url.spec.ts +35 -0
  28. package/src/parse-url.ts +26 -0
  29. package/src/scraper.ts +338 -300
  30. package/src/types.ts +345 -258
  31. package/tsconfig.tsbuildinfo +1 -1
  32. package/dist/events.d.ts +0 -32
  33. package/dist/events.js +0 -15
  34. package/dist/fetch-destination.d.ts +0 -8
  35. package/dist/fetch-destination.js +0 -145
  36. package/dist/net-timeout-error.d.ts +0 -3
  37. package/dist/net-timeout-error.js +0 -3
  38. package/dist/sub-process-runner.d.ts +0 -12
  39. package/dist/sub-process-runner.js +0 -180
  40. package/dist/sub-process.d.ts +0 -1
  41. package/dist/sub-process.js +0 -67
  42. package/dist/utils.d.ts +0 -16
  43. package/dist/utils.js +0 -69
  44. package/src/events.ts +0 -21
  45. package/src/fetch-destination.ts +0 -173
  46. package/src/net-timeout-error.ts +0 -3
  47. package/src/sub-process-runner.ts +0 -220
  48. package/src/sub-process.ts +0 -86
  49. package/src/utils.ts +0 -89
package/dist/scraper.js CHANGED
@@ -36,106 +36,76 @@ var __setFunctionName = (this && this.__setFunctionName) || function (f, name, p
36
36
  if (typeof name === "symbol") name = name.description ? "[".concat(name.description, "]") : "";
37
37
  return Object.defineProperty(f, "name", { configurable: true, value: prefix ? "".concat(prefix, " ", name) : name });
38
38
  };
39
- import { beforePageScan } from '@d-zero/puppeteer-page-scan';
40
- import { parseUrl } from '@d-zero/shared/parse-url';
41
- import { retry } from '@d-zero/shared/retry';
42
- import { TypedAwaitEventEmitter } from '@d-zero/shared/typed-await-event-emitter';
43
- import { launch } from 'puppeteer';
39
+ import { beforePageScan, devicePresets } from '@d-zero/puppeteer-page-scan';
40
+ import { detectCDN } from '@d-zero/shared/detect-cdn';
41
+ import { detectCompress } from '@d-zero/shared/detect-compress';
42
+ import { retry as retryable } from '@d-zero/shared/retry';
43
+ import { TypedAwaitEventEmitter as EventEmitter } from '@d-zero/shared/typed-await-event-emitter';
44
44
  import { resourceLog, scraperLog } from './debug.js';
45
45
  import { getAnchorList, getImageList, getMeta } from './dom-evaluation.js';
46
- import { fetchDestination } from './fetch-destination.js';
46
+ import { isError } from './is-error.js';
47
47
  import { keywordCheck } from './keyword-check.js';
48
- import { detectCDN, detectCompress, isError } from './utils.js';
48
+ import { parseUrl } from './parse-url.js';
49
49
  const pid = `${process.pid}`;
50
50
  const log = scraperLog.extend(pid);
51
51
  const rLog = resourceLog.extend(pid);
52
- const LAUNCH_BROWSER_TIMEOUT = 1000 * 30;
53
52
  let Scraper = (() => {
54
- let _classSuper = TypedAwaitEventEmitter;
53
+ let _classSuper = EventEmitter;
55
54
  let _instanceExtraInitializers = [];
56
- let _private_bootBrowser_decorators;
57
- let _private_bootBrowser_descriptor;
58
- let _private_createPage_decorators;
59
- let _private_createPage_descriptor;
60
55
  let _private_fetchData_decorators;
61
56
  let _private_fetchData_descriptor;
62
- let _private_fetchHead_decorators;
63
- let _private_fetchHead_descriptor;
64
57
  let _private_fetchImages_decorators;
65
58
  let _private_fetchImages_descriptor;
66
59
  return class Scraper extends _classSuper {
67
60
  static {
68
61
  const _metadata = typeof Symbol === "function" && Symbol.metadata ? Object.create(_classSuper[Symbol.metadata] ?? null) : void 0;
69
- _private_bootBrowser_decorators = [retry()];
70
- _private_createPage_decorators = [retry()];
71
- _private_fetchData_decorators = [retry({
72
- timeout: 1 * 60 * 1000, // 1sec,
62
+ _private_fetchData_decorators = [retryable({
63
+ timeout: 3 * 60 * 1000,
64
+ onWait(determinedInterval, retryCount, methodName, error) {
65
+ void this.emit('changePhase', {
66
+ pid: process.pid,
67
+ name: 'retryWait',
68
+ url: null,
69
+ isExternal: false,
70
+ message: `${methodName}: ${error.message} — %countdown(${determinedInterval},${methodName}_${retryCount},s)%s (retry #${retryCount + 1})`,
71
+ });
72
+ },
73
+ onGiveUp(retryCount, error, methodName) {
74
+ void this.emit('changePhase', {
75
+ pid: process.pid,
76
+ name: 'retryExhausted',
77
+ url: null,
78
+ isExternal: false,
79
+ message: `${methodName}: gave up after ${retryCount} retries — ${error.message}`,
80
+ });
81
+ },
73
82
  })];
74
- _private_fetchHead_decorators = [retry()];
75
- _private_fetchImages_decorators = [retry({
76
- timeout: 5 * 60 * 1000, // 5sec
83
+ _private_fetchImages_decorators = [retryable({
84
+ timeout: 5 * 60 * 1000,
77
85
  fallback: [],
78
- })];
79
- __esDecorate(this, _private_bootBrowser_descriptor = { value: __setFunctionName(async function (isExternal, executablePath, headless) {
80
- if (!this.#browser) {
86
+ onWait(determinedInterval, retryCount, methodName, error) {
81
87
  void this.emit('changePhase', {
82
88
  pid: process.pid,
83
- name: 'launchBrowser',
84
- url: this.#url,
85
- isExternal,
86
- message: executablePath || '(executablePath is default)',
89
+ name: 'retryWait',
90
+ url: null,
91
+ isExternal: false,
92
+ message: `${methodName}: ${error.message} — %countdown(${determinedInterval},${methodName}_${retryCount},s)%s (retry #${retryCount + 1} / images)`,
87
93
  });
88
- const browser = await launch({
89
- headless,
90
- timeout: LAUNCH_BROWSER_TIMEOUT,
91
- executablePath: executablePath ?? undefined,
92
- args: [
93
- // TODO: Optional lang
94
- '--lang=ja',
95
- '--no-zygote',
96
- '--ignore-certificate-errors',
97
- ],
98
- }).catch((error) => {
99
- if (error instanceof Error) {
100
- return error;
101
- }
102
- throw error;
94
+ },
95
+ onGiveUp(retryCount, error, methodName) {
96
+ void this.emit('changePhase', {
97
+ pid: process.pid,
98
+ name: 'retryExhausted',
99
+ url: null,
100
+ isExternal: false,
101
+ message: `${methodName}: gave up after ${retryCount} retries — ${error.message}`,
103
102
  });
104
- if (browser instanceof Error) {
105
- void this.emit('error', {
106
- pid: process.pid,
107
- url: this.#url,
108
- shutdown: false,
109
- error: browser,
110
- });
111
- throw browser;
112
- }
113
- this.#browser = browser;
114
- }
115
- else if (!this.#browser.isConnected()) {
116
- await this.#browser.close();
117
- }
118
- return this.#browser;
119
- }, "#bootBrowser") }, _private_bootBrowser_decorators, { kind: "method", name: "#bootBrowser", static: false, private: true, access: { has: obj => #bootBrowser in obj, get: obj => obj.#bootBrowser }, metadata: _metadata }, null, _instanceExtraInitializers);
120
- __esDecorate(this, _private_createPage_descriptor = { value: __setFunctionName(async function (isExternal, executablePath, headless) {
121
- const browser = await this.#bootBrowser(isExternal, executablePath, headless);
122
- void this.emit('changePhase', {
123
- pid: process.pid,
124
- name: 'newPage',
125
- url: this.#url,
126
- isExternal,
127
- message: '',
128
- });
129
- const page = await browser.newPage();
130
- page.setDefaultNavigationTimeout(0);
131
- await page.setUserAgent('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36');
132
- await page.setExtraHTTPHeaders({
133
- // TODO: Optional lang
134
- 'Accept-Language': 'ja-JP',
135
- });
136
- return page;
137
- }, "#createPage") }, _private_createPage_decorators, { kind: "method", name: "#createPage", static: false, private: true, access: { has: obj => #createPage in obj, get: obj => obj.#createPage }, metadata: _metadata }, null, _instanceExtraInitializers);
138
- __esDecorate(this, _private_fetchData_descriptor = { value: __setFunctionName(async function (page, url, isExternal, isGettingImages, options) {
103
+ },
104
+ })];
105
+ __esDecorate(this, _private_fetchData_descriptor = { value: __setFunctionName(async function (page, url, isExternal, captureImages, imageLoadTimeout, resources, options) {
106
+ const parseOpts = options?.disableQueries == null
107
+ ? undefined
108
+ : { disableQueries: options.disableQueries };
139
109
  const networkLogs = {};
140
110
  page.on('dialog', async (dialog) => {
141
111
  log(`Appear ${dialog.type()} dialog: ${dialog.message()}`);
@@ -149,7 +119,7 @@ let Scraper = (() => {
149
119
  });
150
120
  if (!isExternal) {
151
121
  page.on('request', (request) => {
152
- const url = parseUrl(request.url(), options);
122
+ const url = parseUrl(request.url(), parseOpts);
153
123
  networkLogs[request.url()] = {
154
124
  url,
155
125
  status: null,
@@ -165,7 +135,7 @@ let Scraper = (() => {
165
135
  });
166
136
  const uniqueRes = new Set();
167
137
  page.on('response', (response) => {
168
- const resURL = parseUrl(response.url(), options);
138
+ const resURL = parseUrl(response.url(), parseOpts);
169
139
  if (uniqueRes.has(resURL.withoutHash)) {
170
140
  return;
171
141
  }
@@ -206,6 +176,9 @@ let Scraper = (() => {
206
176
  headers: headers,
207
177
  };
208
178
  rLog('Fetched: %s', resURL.href);
179
+ // Collect resource into the results array
180
+ resources.push({ log, resource: referredLink, pageUrl: url.withoutHash });
181
+ // Also emit for streaming consumers
209
182
  void this.emit('resourceResponse', {
210
183
  pid: process.pid,
211
184
  url,
@@ -214,29 +187,34 @@ let Scraper = (() => {
214
187
  });
215
188
  });
216
189
  }
190
+ const navigationTimeout = options?.navigationTimeout ?? 60_000;
217
191
  void this.emit('changePhase', {
218
192
  pid: process.pid,
219
193
  name: 'openPage',
220
- url: this.#url,
194
+ url,
221
195
  isExternal,
222
- message: '',
196
+ message: `%countdown(${navigationTimeout},openPage_${url.withoutHash},s)%s`,
223
197
  });
224
198
  if (url.username && url.password) {
225
199
  await page.setExtraHTTPHeaders({
226
200
  Authorization: `Basic ${Buffer.from(`${url.username}:${url.password}`).toString('base64')}`,
227
201
  });
228
202
  }
229
- const res = await page.goto(url.withoutHashAndAuth);
203
+ const res = await page.goto(url.withoutHashAndAuth, { timeout: navigationTimeout });
230
204
  if (!res) {
231
205
  throw new Error('The method Page.goto returned null');
232
206
  }
233
- const destUrl = parseUrl(page.url(), options);
234
- const redirectPaths = res
235
- .request()
236
- .redirectChain()
237
- .map((req) => req.url());
238
- if (destUrl.withoutHash !== url.withoutHash) {
239
- redirectPaths.push(destUrl.withoutHash);
207
+ const destUrl = parseUrl(page.url(), parseOpts);
208
+ const redirectPaths = new Set();
209
+ if (url.withoutHash !== destUrl.withoutHash) {
210
+ const redirectChain = res
211
+ .request()
212
+ .redirectChain()
213
+ .map((req) => req.url());
214
+ for (const redirectPath of redirectChain) {
215
+ redirectPaths.add(redirectPath);
216
+ }
217
+ redirectPaths.add(destUrl.withoutHash);
240
218
  }
241
219
  if (destUrl.hostname !== url.hostname) {
242
220
  isExternal = true;
@@ -252,7 +230,7 @@ let Scraper = (() => {
252
230
  url,
253
231
  isTarget: false,
254
232
  isExternal,
255
- redirectPaths,
233
+ redirectPaths: [...redirectPaths],
256
234
  status,
257
235
  statusText,
258
236
  contentType,
@@ -270,7 +248,7 @@ let Scraper = (() => {
270
248
  void this.emit('changePhase', {
271
249
  pid: process.pid,
272
250
  name: 'loadDOMContent',
273
- url: this.#url,
251
+ url,
274
252
  isExternal,
275
253
  message: '',
276
254
  });
@@ -280,7 +258,7 @@ let Scraper = (() => {
280
258
  void this.emit('changePhase', {
281
259
  pid: process.pid,
282
260
  name: 'getHTML',
283
- url: this.#url,
261
+ url,
284
262
  isExternal,
285
263
  message: '',
286
264
  });
@@ -296,7 +274,7 @@ let Scraper = (() => {
296
274
  url,
297
275
  isTarget: false,
298
276
  isExternal,
299
- redirectPaths,
277
+ redirectPaths: [...redirectPaths],
300
278
  status,
301
279
  statusText,
302
280
  contentType,
@@ -313,8 +291,8 @@ let Scraper = (() => {
313
291
  }
314
292
  void this.emit('changePhase', {
315
293
  pid: process.pid,
316
- name: 'waitNetworkIdleZero',
317
- url: this.#url,
294
+ name: 'waitNetworkIdle',
295
+ url,
318
296
  isExternal,
319
297
  message: '',
320
298
  });
@@ -324,25 +302,36 @@ let Scraper = (() => {
324
302
  void this.emit('changePhase', {
325
303
  pid: process.pid,
326
304
  name: 'getAnchors',
327
- url: this.#url,
305
+ url,
328
306
  isExternal,
329
307
  message: '',
330
308
  });
331
- const anchorList = await getAnchorList(page, options);
309
+ const anchorList = await getAnchorList(page, parseOpts);
332
310
  void this.emit('changePhase', {
333
311
  pid: process.pid,
334
312
  name: 'getMeta',
335
- url: this.#url,
313
+ url,
336
314
  isExternal,
337
315
  message: '',
338
316
  });
339
317
  const meta = await getMeta(page);
340
- const imageList = isGettingImages ? await this.#fetchImages(page, isExternal) : [];
318
+ const imageList = captureImages
319
+ ? await (async () => {
320
+ void this.emit('changePhase', {
321
+ pid: process.pid,
322
+ name: 'extractImages',
323
+ url,
324
+ isExternal,
325
+ message: '',
326
+ });
327
+ return this.#fetchImages(page, url.withoutHashAndAuth, isExternal, imageLoadTimeout);
328
+ })()
329
+ : [];
341
330
  return {
342
331
  url,
343
332
  isTarget: true,
344
333
  isExternal,
345
- redirectPaths,
334
+ redirectPaths: [...redirectPaths],
346
335
  status,
347
336
  statusText,
348
337
  contentType,
@@ -355,126 +344,110 @@ let Scraper = (() => {
355
344
  isSkipped: false,
356
345
  };
357
346
  }, "#fetchData") }, _private_fetchData_decorators, { kind: "method", name: "#fetchData", static: false, private: true, access: { has: obj => #fetchData in obj, get: obj => obj.#fetchData }, metadata: _metadata }, null, _instanceExtraInitializers);
358
- __esDecorate(this, _private_fetchHead_descriptor = { value: __setFunctionName(async function (url, isExternal) {
359
- return await fetchDestination(url, isExternal);
360
- }, "#fetchHead") }, _private_fetchHead_decorators, { kind: "method", name: "#fetchHead", static: false, private: true, access: { has: obj => #fetchHead in obj, get: obj => obj.#fetchHead }, metadata: _metadata }, null, _instanceExtraInitializers);
361
- __esDecorate(this, _private_fetchImages_descriptor = { value: __setFunctionName(async function (page, isExternal) {
362
- const url = this.#url.withoutHashAndAuth;
363
- const imageList = [];
347
+ __esDecorate(this, _private_fetchImages_descriptor = { value: __setFunctionName(async function (page, url, isExternal, imageLoadTimeout) {
348
+ const listener = this.#createPageScanListener(isExternal);
364
349
  const devices = [
365
- { name: 'desktop', width: 1280 },
366
- { name: 'mobile', width: 320, resolution: 2 },
350
+ { key: 'desktop-compact', preset: devicePresets['desktop-compact'] },
351
+ { key: 'mobile-small', preset: devicePresets['mobile-small'] },
367
352
  ];
368
- for (const device of devices) {
353
+ const imageList = [];
354
+ for (const { key, preset } of devices) {
369
355
  void this.emit('changePhase', {
370
356
  pid: process.pid,
371
357
  name: 'setViewport',
372
- url: this.#url,
358
+ url: null,
373
359
  isExternal,
374
- message: device.name,
360
+ message: `📷 ${key} ↔️ ${preset.width}px`,
375
361
  });
376
362
  await beforePageScan(page, url, {
377
- name: device.name,
378
- width: device.width,
379
- resolution: device.resolution,
363
+ name: key,
364
+ width: preset.width,
365
+ resolution: preset.resolution,
366
+ listener,
380
367
  timeout: 5000,
381
368
  });
369
+ void this.emit('changePhase', {
370
+ pid: process.pid,
371
+ name: 'waitImageLoad',
372
+ url: null,
373
+ isExternal,
374
+ message: `📷 ${key}: Waiting for images%dots%`,
375
+ });
376
+ await page
377
+ .waitForFunction(() => [...document.images].every((img) => img.complete), {
378
+ timeout: imageLoadTimeout,
379
+ })
380
+ .catch(() => { });
382
381
  void this.emit('changePhase', {
383
382
  pid: process.pid,
384
383
  name: 'getImages',
385
- url: this.#url,
384
+ url: null,
386
385
  isExternal,
387
- message: device.name,
386
+ message: `📸 ${key}: Extracting images%dots%`,
388
387
  });
389
- const images = await getImageList(page, device.width);
388
+ const images = await getImageList(page, preset.width);
390
389
  imageList.push(...images);
391
390
  }
392
391
  return imageList;
393
392
  }, "#fetchImages") }, _private_fetchImages_decorators, { kind: "method", name: "#fetchImages", static: false, private: true, access: { has: obj => #fetchImages in obj, get: obj => obj.#fetchImages }, metadata: _metadata }, null, _instanceExtraInitializers);
394
393
  if (_metadata) Object.defineProperty(this, Symbol.metadata, { enumerable: true, configurable: true, writable: true, value: _metadata });
395
394
  }
396
- #browser = (__runInitializers(this, _instanceExtraInitializers), null);
397
- #url = null;
398
- async destroy(isExternal) {
399
- log('Scraper destroys self');
400
- if (!this.#url) {
401
- throw new Error('The instance is already destroyed.');
402
- }
403
- if (!this.#browser) {
404
- void this.emit('destroyed', {
405
- pid: process.pid,
406
- });
407
- void this.emit('changePhase', {
408
- pid: process.pid,
409
- name: 'destroyed',
410
- url: this.#url,
411
- isExternal,
412
- message: '',
413
- });
414
- return;
415
- }
416
- while (!this.#browser.isConnected()) {
417
- log('Browser closes all pages');
418
- const pages = await this.#browser.pages();
419
- for (const page of pages) {
420
- page.removeAllListeners();
421
- if (!page.isClosed) {
422
- await page.close();
423
- }
424
- }
425
- log('Browser closes self');
426
- await this.#browser.close();
427
- log('Browser disconnects');
428
- await this.#browser.disconnect();
429
- }
430
- log('Scraper discards browser');
431
- this.#browser = null;
432
- void this.emit('destroyed', {
433
- pid: process.pid,
434
- });
435
- void this.emit('changePhase', {
436
- pid: process.pid,
437
- name: 'destroyed',
438
- url: this.#url,
439
- isExternal,
440
- message: '',
441
- });
442
- }
443
- async scrapeStart(url, options, isSkip = false) {
395
+ /** Number of retries for `@retryable`-decorated methods. Set per-scrape from options. */
396
+ retries = __runInitializers(this, _instanceExtraInitializers);
397
+ /**
398
+ * Begins the scraping process for a given URL on the provided Puppeteer page.
399
+ *
400
+ * Returns a `ScrapeResult` containing the outcome:
401
+ * - `type: "success"` with `pageData` on success
402
+ * - `type: "skipped"` with `ignored` details when the page is excluded
403
+ * - `type: "error"` with `error` details when scraping fails
404
+ *
405
+ * Sub-resources are collected via the `resourceResponse` event and
406
+ * included in the returned `ScrapeResult.resources`.
407
+ * @param page - The Puppeteer page instance to use for navigation and DOM evaluation.
408
+ * @param url - The extended URL to scrape.
409
+ * @param options - Optional scraper configuration overriding defaults.
410
+ * @param isSkip - When `true`, the page is immediately skipped without any network requests.
411
+ * @returns The scrape result containing the outcome and captured resources.
412
+ */
413
+ async scrapeStart(page, url, options, isSkip = false) {
414
+ this.retries = options?.retries;
444
415
  const isExternal = options?.isExternal ?? false;
445
- const isGettingImages = options?.isGettingImages ?? true;
416
+ const captureImages = options?.captureImages ?? true;
446
417
  const excludeKeywords = options?.excludeKeywords ?? [];
447
- const executablePath = options?.executablePath ?? null;
448
- const isTitleOnly = options?.isTitleOnly ?? false;
449
- this.#url = url;
418
+ const metadataOnly = options?.metadataOnly ?? false;
419
+ const imageLoadTimeout = options?.imageLoadTimeout ?? 5000;
420
+ const resources = [];
450
421
  void this.emit('changePhase', {
451
422
  pid: process.pid,
452
423
  name: 'scrapeStart',
453
- url: this.#url,
424
+ url,
454
425
  isExternal,
455
426
  message: '',
456
427
  });
428
+ // Path-excluded: return SkippedPageData
457
429
  if (isSkip) {
458
- void this.emit('ignoreAndSkip', {
459
- pid: process.pid,
460
- url: this.#url,
461
- reason: {
462
- matchedText: this.#url.pathname || '',
463
- excludeKeywords,
464
- },
465
- });
466
430
  void this.emit('changePhase', {
467
431
  pid: process.pid,
468
- name: 'ignoreAndSkip',
469
- url: this.#url,
432
+ name: 'pageSkipped',
433
+ url,
470
434
  isExternal,
471
435
  message: 'Matched: excluded path',
472
436
  });
473
- return;
437
+ return {
438
+ type: 'skipped',
439
+ resources,
440
+ ignored: {
441
+ url,
442
+ matchedText: url.pathname || '',
443
+ excludeKeywords,
444
+ },
445
+ };
474
446
  }
475
- if (!this.#url.isHTTP) {
447
+ // Non-HTTP protocol: return minimal PageData
448
+ if (!url.isHTTP) {
476
449
  const result = {
477
- url: this.#url,
450
+ url,
478
451
  isTarget: false,
479
452
  isExternal,
480
453
  redirectPaths: [],
@@ -491,79 +464,60 @@ let Scraper = (() => {
491
464
  html: '',
492
465
  isSkipped: false,
493
466
  };
494
- void this.emit('scrapeEnd', {
495
- pid: process.pid,
496
- url: this.#url,
497
- timestamp: Date.now(),
498
- result,
499
- });
500
467
  void this.emit('changePhase', {
501
468
  pid: process.pid,
502
469
  name: 'scrapeEnd',
503
- url: this.#url,
470
+ url,
504
471
  isExternal,
505
472
  message: '',
506
473
  });
507
- return;
474
+ return { type: 'success', pageData: result, resources };
508
475
  }
509
- void this.emit('changePhase', {
510
- pid: process.pid,
511
- name: 'touchHead',
512
- url: this.#url,
513
- isExternal,
514
- message: '',
515
- });
516
- let result = await this.#fetchHead(url, isExternal);
517
- if (result instanceof Error) {
518
- log('Error(FETCH_HEAD): %s', url.href);
519
- void this.emit('error', {
476
+ let headResult = options?.headCheckResult ?? null;
477
+ if (headResult && metadataOnly) {
478
+ void this.emit('changePhase', {
520
479
  pid: process.pid,
521
- url: this.#url,
522
- shutdown: false,
523
- error: result,
480
+ name: 'scrapeEnd',
481
+ url,
482
+ isExternal,
483
+ message: '',
524
484
  });
525
- result = null;
526
- }
527
- if (result && isTitleOnly) {
528
- void this.emit('scrapeEnd', {
529
- pid: process.pid,
530
- url: this.#url,
531
- timestamp: Date.now(),
532
- result: {
533
- ...result,
485
+ return {
486
+ type: 'success',
487
+ pageData: {
488
+ ...headResult,
534
489
  isTarget: false,
535
490
  },
536
- });
537
- return;
491
+ resources,
492
+ };
538
493
  }
539
- if (result === null || result.contentType === 'text/html') {
540
- const headlessMode = url.isSecure ? true : 'shell';
541
- const page = await this.#createPage(isExternal, executablePath, headlessMode);
542
- result = await this.#fetchData(page, url, isExternal, isGettingImages, options).catch((error) => {
494
+ if (headResult === null || headResult.contentType === 'text/html') {
495
+ const fetchResult = await this.#fetchData(page, url, isExternal, captureImages, imageLoadTimeout, resources, options).catch((error) => {
543
496
  if (error instanceof Error) {
544
497
  return error;
545
498
  }
546
499
  return new Error(error);
547
500
  });
548
- if (result instanceof Error) {
501
+ if (fetchResult instanceof Error) {
549
502
  log('Error(FETCH_DATA): %s', url.href);
550
- void this.emit('error', {
551
- pid: process.pid,
552
- url: this.#url,
553
- shutdown: true,
554
- error: result,
555
- });
556
- await this.destroy(isExternal);
557
- return;
503
+ page.removeAllListeners();
504
+ return {
505
+ type: 'error',
506
+ resources,
507
+ error: {
508
+ name: fetchResult.name,
509
+ message: fetchResult.message,
510
+ stack: fetchResult.stack,
511
+ shutdown: true,
512
+ },
513
+ };
558
514
  }
559
515
  page.removeAllListeners();
560
- if (!page.isClosed) {
561
- await page.close();
562
- }
563
- if (!result.isSkipped) {
564
- const checkedKeyword = keywordCheck(result.html, excludeKeywords);
516
+ headResult = fetchResult;
517
+ if (!headResult.isSkipped) {
518
+ const checkedKeyword = keywordCheck(headResult.html, excludeKeywords);
565
519
  if (checkedKeyword) {
566
- result = {
520
+ headResult = {
567
521
  url,
568
522
  isSkipped: true,
569
523
  matched: {
@@ -574,48 +528,131 @@ let Scraper = (() => {
574
528
  };
575
529
  }
576
530
  }
577
- if (result.isSkipped) {
578
- if (result.matched.type === 'path') {
579
- return;
531
+ if (headResult.isSkipped) {
532
+ if (headResult.matched.type === 'path') {
533
+ return {
534
+ type: 'skipped',
535
+ resources,
536
+ ignored: {
537
+ url,
538
+ matchedText: url.pathname || '',
539
+ excludeKeywords,
540
+ },
541
+ };
580
542
  }
581
- void this.emit('ignoreAndSkip', {
582
- pid: process.pid,
583
- url: this.#url,
584
- reason: {
585
- matchedText: result.matched.text,
586
- excludeKeywords,
587
- },
588
- });
589
543
  void this.emit('changePhase', {
590
544
  pid: process.pid,
591
- name: 'ignoreAndSkip',
592
- url: this.#url,
545
+ name: 'pageSkipped',
546
+ url,
593
547
  isExternal,
594
- message: `Matched: "${result.matched.text}"`,
548
+ message: `Matched: "${headResult.matched.text}"`,
595
549
  });
596
- return;
550
+ return {
551
+ type: 'skipped',
552
+ resources,
553
+ ignored: {
554
+ url,
555
+ matchedText: headResult.matched.text,
556
+ excludeKeywords,
557
+ },
558
+ };
597
559
  }
598
560
  }
599
- void this.emit('scrapeEnd', {
600
- pid: process.pid,
601
- url: this.#url,
602
- timestamp: Date.now(),
603
- result,
604
- });
605
561
  void this.emit('changePhase', {
606
562
  pid: process.pid,
607
563
  name: 'scrapeEnd',
608
- url: this.#url,
564
+ url,
609
565
  isExternal,
610
566
  message: '',
611
567
  });
612
- return result;
568
+ return { type: 'success', pageData: headResult, resources };
569
+ }
570
+ /**
571
+ * Creates a callback for `@d-zero/puppeteer-page-scan`'s `beforePageScan` listener.
572
+ *
573
+ * WHY a separate factory: The listener must capture `isExternal` for phase events
574
+ * while conforming to the `beforePageScan` listener signature.
575
+ * Currently only handles the `scroll` phase to report scroll progress.
576
+ * @param isExternal - Whether the current page is external to the crawl scope
577
+ * @returns A listener function compatible with `beforePageScan`'s `listener` option
578
+ */
579
+ #createPageScanListener(isExternal) {
580
+ return (phase, data) => {
581
+ switch (phase) {
582
+ case 'scroll': {
583
+ const d = data;
584
+ const scrollMsg = Number.isNaN(d.scrollHeight)
585
+ ? `%propeller% ${d.message}`
586
+ : `%propeller% ${d.scrollY}px/${d.scrollHeight}px (${Math.round((d.scrollY / d.scrollHeight) * 100)}%) ${d.message}`;
587
+ void this.emit('changePhase', {
588
+ pid: process.pid,
589
+ name: 'scrollToBottom',
590
+ url: null,
591
+ isExternal,
592
+ message: scrollMsg,
593
+ });
594
+ break;
595
+ }
596
+ }
597
+ };
613
598
  }
614
- get #bootBrowser() { return _private_bootBrowser_descriptor.value; }
615
- get #createPage() { return _private_createPage_descriptor.value; }
599
+ /**
600
+ * Navigates the page to the target URL and extracts full page data.
601
+ *
602
+ * WHY retryable with 3-min timeout: Page navigation can fail due to transient
603
+ * network issues or slow-loading pages. The decorator retries automatically,
604
+ * emitting `retryWait` / `retryExhausted` phase events for progress monitoring.
605
+ *
606
+ * Flow:
607
+ * 1. Register request/response listeners to capture sub-resources (internal pages only)
608
+ * 2. Navigate to URL via `page.goto()` and track redirect chain
609
+ * 3. Wait for DOM content and network idle
610
+ * 4. Extract anchors, meta, and optionally images
611
+ * 5. Check for keyword exclusion in HTML content
612
+ * @param page - Puppeteer page instance
613
+ * @param url - Target URL to navigate to
614
+ * @param isExternal - Whether the URL is external to the crawl scope
615
+ * @param captureImages - Whether to run the image extraction pipeline
616
+ * @param imageLoadTimeout - Timeout (ms) for waiting lazy-loaded images to complete
617
+ * @param resources - Mutable array to collect captured sub-resources into
618
+ * @param options - Additional scraper options (e.g. `disableQueries`, `navigationTimeout`)
619
+ * @returns Full page data or skipped page data if an exclusion rule matched
620
+ */
616
621
  get #fetchData() { return _private_fetchData_descriptor.value; }
617
- get #fetchHead() { return _private_fetchHead_descriptor.value; }
622
+ /**
623
+ * Extracts image data from the page across multiple device presets.
624
+ *
625
+ * WHY multiple device presets: Images may differ between desktop and mobile
626
+ * due to responsive `<picture>` / `srcset`. Capturing both `desktop-compact`
627
+ * and `mobile-small` viewports reveals responsive image issues.
628
+ *
629
+ * WHY retryable with 5-min timeout and `fallback: []`: Image extraction is
630
+ * best-effort. If all retries fail, an empty array is returned rather than
631
+ * failing the entire page scrape.
632
+ * @param page - Puppeteer page instance
633
+ * @param url - The page URL string (without hash and auth)
634
+ * @param isExternal - Whether the page is external
635
+ * @param imageLoadTimeout - Timeout (ms) for waiting images to complete loading
636
+ * @returns Array of image elements from all device presets
637
+ */
618
638
  get #fetchImages() { return _private_fetchImages_descriptor.value; }
619
639
  };
620
640
  })();
641
+ /**
642
+ * Page-level scraper that extracts data from a single browser page.
643
+ *
644
+ * The scraper returns results as values from `scrapeStart()` rather than
645
+ * emitting them as events. Only streaming events (changePhase, resourceResponse)
646
+ * are emitted for progress monitoring.
647
+ *
648
+ * The Puppeteer `Page` object is injected externally, and page lifecycle
649
+ * (including `page.close()`) is managed by the caller.
650
+ * @example
651
+ * ```ts
652
+ * const scraper = new Scraper();
653
+ * scraper.on('changePhase', (e) => console.log(e.name));
654
+ * const result = await scraper.scrapeStart(page, url, { isExternal: false });
655
+ * ```
656
+ */
657
+ // eslint-disable-next-line unicorn/prefer-event-target -- TypedAwaitEventEmitter is a project-specific typed wrapper, not Node.js EventEmitter
621
658
  export default Scraper;