@d-zero/beholder 2.1.5 → 3.0.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
@@ -0,0 +1,12 @@
1
+ /**
2
+ * Type definitions for the `Meta` data extracted from a page's `<head>` and full document.
3
+ *
4
+ * Structure follows the reference table in `frontmatter-keys.md`, with one dot-path
5
+ * field per category. Optional fields are absent when not detected on the page.
6
+ * Array fields are required and default to `[]` so consumers can iterate without
7
+ * null-checks.
8
+ * @see {@link ./classify.ts} for the function that builds `Meta` from raw head entries
9
+ * @see {@link ./parsers.ts} for the value normalizers used by `classify`
10
+ * @module
11
+ */
12
+ export {};
package/dist/scraper.js CHANGED
@@ -42,9 +42,10 @@ import { detectCompress } from '@d-zero/shared/detect-compress';
42
42
  import { retry as retryable } from '@d-zero/shared/retry';
43
43
  import { TypedAwaitEventEmitter as EventEmitter } from '@d-zero/shared/typed-await-event-emitter';
44
44
  import { resourceLog, scraperLog } from './debug.js';
45
- import { getAnchorList, getImageList, getMeta } from './dom-evaluation.js';
45
+ import { DEFAULT_DOM_EVALUATION_TIMEOUT, getAnchorList, getImageList, getMeta, } from './dom-evaluation.js';
46
46
  import { isError } from './is-error.js';
47
47
  import { keywordCheck } from './keyword-check.js';
48
+ import { emptyMeta } from './meta/classify.js';
48
49
  import { findDisconnectionFailures } from './network-disconnection.js';
49
50
  import { parseUrl } from './parse-url.js';
50
51
  const pid = `${process.pid}`;
@@ -107,6 +108,7 @@ let Scraper = (() => {
107
108
  const parseOpts = options?.disableQueries == null
108
109
  ? undefined
109
110
  : { disableQueries: options.disableQueries };
111
+ const domEvaluationTimeout = options?.domEvaluationTimeout ?? DEFAULT_DOM_EVALUATION_TIMEOUT;
110
112
  const networkLogs = {};
111
113
  // Clear stale state from previous retries (@retryable may re-invoke this method
112
114
  // with the same page and mutable arrays, so we must reset to avoid accumulation)
@@ -265,9 +267,7 @@ let Scraper = (() => {
265
267
  contentType,
266
268
  contentLength,
267
269
  responseHeaders,
268
- meta: {
269
- title: '',
270
- },
270
+ meta: emptyMeta(),
271
271
  imageList: [],
272
272
  anchorList: [],
273
273
  html: '',
@@ -299,6 +299,8 @@ let Scraper = (() => {
299
299
  };
300
300
  });
301
301
  if (isExternal) {
302
+ const externalMeta = emptyMeta();
303
+ externalMeta.title = title;
302
304
  return {
303
305
  url,
304
306
  isTarget: false,
@@ -309,9 +311,7 @@ let Scraper = (() => {
309
311
  contentType,
310
312
  contentLength,
311
313
  responseHeaders,
312
- meta: {
313
- title,
314
- },
314
+ meta: externalMeta,
315
315
  imageList: [],
316
316
  anchorList: [],
317
317
  html,
@@ -341,17 +341,22 @@ let Scraper = (() => {
341
341
  name: 'getAnchors',
342
342
  url,
343
343
  isExternal,
344
- message: '',
344
+ message: `%countdown(${domEvaluationTimeout},getAnchors_${url.withoutHash},s)%s`,
345
345
  });
346
- const anchorList = await getAnchorList(page, parseOpts);
346
+ const anchorList = await getAnchorList(page, parseOpts, domEvaluationTimeout);
347
347
  void this.emit('changePhase', {
348
348
  pid: process.pid,
349
349
  name: 'getMeta',
350
350
  url,
351
351
  isExternal,
352
- message: '',
352
+ message: `%countdown(${domEvaluationTimeout},getMeta_${url.withoutHash},s)%s`,
353
353
  });
354
- const meta = await getMeta(page);
354
+ const meta = await getMeta(page, {
355
+ url: url.withoutHashAndAuth,
356
+ html,
357
+ statusCode: status,
358
+ headers: responseHeaders ?? undefined,
359
+ }, domEvaluationTimeout);
355
360
  const imageList = captureImages
356
361
  ? await (async () => {
357
362
  void this.emit('changePhase', {
@@ -359,9 +364,9 @@ let Scraper = (() => {
359
364
  name: 'extractImages',
360
365
  url,
361
366
  isExternal,
362
- message: '',
367
+ message: `%countdown(${domEvaluationTimeout},extractImages_${url.withoutHash},s)%s`,
363
368
  });
364
- return this.#fetchImages(page, url.withoutHashAndAuth, isExternal, imageLoadTimeout);
369
+ return this.#fetchImages(page, url.withoutHashAndAuth, isExternal, imageLoadTimeout, domEvaluationTimeout);
365
370
  })()
366
371
  : [];
367
372
  return {
@@ -381,7 +386,7 @@ let Scraper = (() => {
381
386
  isSkipped: false,
382
387
  };
383
388
  }, "#fetchData") }, _private_fetchData_decorators, { kind: "method", name: "#fetchData", static: false, private: true, access: { has: obj => #fetchData in obj, get: obj => obj.#fetchData }, metadata: _metadata }, null, _instanceExtraInitializers);
384
- __esDecorate(this, _private_fetchImages_descriptor = { value: __setFunctionName(async function (page, url, isExternal, imageLoadTimeout) {
389
+ __esDecorate(this, _private_fetchImages_descriptor = { value: __setFunctionName(async function (page, url, isExternal, imageLoadTimeout, domEvaluationTimeout) {
385
390
  const listener = this.#createPageScanListener(isExternal);
386
391
  const devices = [
387
392
  { key: 'desktop-compact', preset: devicePresets['desktop-compact'] },
@@ -423,7 +428,7 @@ let Scraper = (() => {
423
428
  isExternal,
424
429
  message: `📸 ${key}: Extracting images%dots%`,
425
430
  });
426
- const images = await getImageList(page, preset.width);
431
+ const images = await getImageList(page, preset.width, domEvaluationTimeout);
427
432
  imageList.push(...images);
428
433
  }
429
434
  catch (error) {
@@ -509,9 +514,7 @@ let Scraper = (() => {
509
514
  contentType: null,
510
515
  contentLength: null,
511
516
  responseHeaders: {},
512
- meta: {
513
- title: '',
514
- },
517
+ meta: emptyMeta(),
515
518
  imageList: [],
516
519
  anchorList: [],
517
520
  html: '',
@@ -705,6 +708,7 @@ let Scraper = (() => {
705
708
  * @param url - The page URL string (without hash and auth)
706
709
  * @param isExternal - Whether the page is external
707
710
  * @param imageLoadTimeout - Timeout (ms) for waiting images to complete loading
711
+ * @param domEvaluationTimeout - Timeout (ms) for the in-page image extraction `page.evaluate`
708
712
  * @returns Array of image elements from all device presets (may be partial if some viewports failed)
709
713
  */
710
714
  get #fetchImages() { return _private_fetchImages_descriptor.value; }
package/dist/types.d.ts CHANGED
@@ -7,6 +7,8 @@
7
7
  export type { ExURL, ParseURLOptions } from '@d-zero/shared/parse-url';
8
8
  export type { CompressType } from '@d-zero/shared/detect-compress';
9
9
  export type { CDNType } from '@d-zero/shared/detect-cdn';
10
+ export type { Meta, OpenGraphMeta, OgArticleMeta, OgBookMeta, OgProfileMeta, OgMusicMeta, OgVideoNsMeta, TwitterMeta, FbMeta, FediverseMeta, AppleMeta, MsApplicationMeta, VerificationMeta, GoogleMeta, GeoMeta, CitationMeta, RdfaMeta, MicrodataMeta, AmpMeta, LegacyMeta, MobileMeta, MicroformatsMeta, PinterestMeta, SlackMeta, LinkedInMeta, ExperimentalMeta, WikiMeta, LinkMeta, LinkEntry, JsonLdEntry, OthersBucket, ScriptEntry, IframeEntry, TagsMeta, TagDetail, TagEntry, TagSource, ViewportMeta, RobotsMeta, ReferrerMeta, FormatDetectionMeta, HttpEquivMeta, HttpEquivRefresh, RawHeadEntry, } from './meta/types.js';
11
+ import type { Meta } from './meta/types.js';
10
12
  import type { CDNType } from '@d-zero/shared/detect-cdn';
11
13
  import type { CompressType } from '@d-zero/shared/detect-compress';
12
14
  import type { ExURL } from '@d-zero/shared/parse-url';
@@ -134,43 +136,6 @@ export type AnchorData = {
134
136
  */
135
137
  isExternal?: boolean;
136
138
  };
137
- /**
138
- * Metadata extracted from a page's `<head>` element.
139
- */
140
- export type Meta = {
141
- /** The `lang` attribute of the `<html>` element. */
142
- lang?: string;
143
- /** The text content of the `<title>` element. */
144
- title: string;
145
- /** The `content` attribute of `<meta name="description">`. */
146
- description?: string;
147
- /** The `content` attribute of `<meta name="keywords">`. */
148
- keywords?: string;
149
- /** Whether `noindex` is present in the robots meta tag. */
150
- noindex?: boolean;
151
- /** Whether `nofollow` is present in the robots meta tag. */
152
- nofollow?: boolean;
153
- /** Whether `noarchive` is present in the robots meta tag. */
154
- noarchive?: boolean;
155
- /** The canonical URL from `<link rel="canonical">`. */
156
- canonical?: string;
157
- /** The alternate URL from `<link rel="alternate">`. */
158
- alternate?: string;
159
- /** The Open Graph type (`og:type`). */
160
- 'og:type'?: string;
161
- /** The Open Graph title (`og:title`). */
162
- 'og:title'?: string;
163
- /** The Open Graph site name (`og:site_name`). */
164
- 'og:site_name'?: string;
165
- /** The Open Graph description (`og:description`). */
166
- 'og:description'?: string;
167
- /** The Open Graph URL (`og:url`). */
168
- 'og:url'?: string;
169
- /** The Open Graph image URL (`og:image`). */
170
- 'og:image'?: string;
171
- /** The Twitter Card type (`twitter:card`). */
172
- 'twitter:card'?: string;
173
- };
174
139
  /**
175
140
  * A network request/response log entry captured during page scraping via Puppeteer.
176
141
  */
@@ -345,4 +310,10 @@ export type ScraperOptions = {
345
310
  headCheckResult?: PageData;
346
311
  /** Timeout (ms) for page.goto(). Default: 60_000 (60s). */
347
312
  navigationTimeout?: number;
313
+ /**
314
+ * Timeout (ms) for DOM evaluation operations (meta/image/anchor extraction).
315
+ * Bounds how long extraction may hang on a page with an unresponsive main thread.
316
+ * Default: 180_000 (180s, aligned with the upstream retryable timeout).
317
+ */
318
+ domEvaluationTimeout?: number;
348
319
  };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@d-zero/beholder",
3
- "version": "2.1.5",
3
+ "version": "3.0.0",
4
4
  "description": "Page-level scraper for web crawling and auditing",
5
5
  "author": "D-ZERO",
6
6
  "license": "MIT",
@@ -20,10 +20,11 @@
20
20
  "clean": "tsc --build --clean"
21
21
  },
22
22
  "dependencies": {
23
- "@d-zero/puppeteer-page-scan": "4.5.0",
23
+ "@d-zero/puppeteer-page-scan": "4.5.2",
24
24
  "@d-zero/shared": "0.22.0",
25
25
  "debug": "4.4.3",
26
- "puppeteer": "24.37.5"
26
+ "puppeteer": "24.37.5",
27
+ "simple-wappalyzer": "1.1.99"
27
28
  },
28
29
  "devDependencies": {
29
30
  "@types/debug": "4.1.12"
@@ -33,5 +34,5 @@
33
34
  "url": "https://github.com/d-zero-dev/tools.git",
34
35
  "directory": "packages/@d-zero/beholder"
35
36
  },
36
- "gitHead": "2d24e08c0cb516b7ea9d07a4301eb991193cca11"
37
+ "gitHead": "16c831105a12bb635d49130e7f5add25b6643c40"
37
38
  }