@d-zero/beholder 2.1.5 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +44 -0
- package/README.md +9 -276
- package/dist/dom-evaluation.d.ts +100 -62
- package/dist/dom-evaluation.js +498 -195
- package/dist/index.d.ts +1 -1
- package/dist/meta/classify.d.ts +52 -0
- package/dist/meta/classify.js +731 -0
- package/dist/meta/id-extractors.d.ts +40 -0
- package/dist/meta/id-extractors.js +196 -0
- package/dist/meta/keys.d.ts +41 -0
- package/dist/meta/keys.js +507 -0
- package/dist/meta/parsers.d.ts +74 -0
- package/dist/meta/parsers.js +293 -0
- package/dist/meta/tag-detection.d.ts +59 -0
- package/dist/meta/tag-detection.js +120 -0
- package/dist/meta/types.d.ts +874 -0
- package/dist/meta/types.js +12 -0
- package/dist/scraper.js +22 -18
- package/dist/types.d.ts +8 -37
- package/package.json +5 -4
- package/src/dom-evaluation.spec.ts +521 -0
- package/src/dom-evaluation.ts +655 -227
- package/src/index.ts +43 -0
- package/src/meta/classify.spec.ts +281 -0
- package/src/meta/classify.ts +810 -0
- package/src/meta/id-extractors.spec.ts +69 -0
- package/src/meta/id-extractors.ts +206 -0
- package/src/meta/keys.ts +568 -0
- package/src/meta/parsers.spec.ts +178 -0
- package/src/meta/parsers.ts +304 -0
- package/src/meta/simple-wappalyzer.d.ts +37 -0
- package/src/meta/tag-detection.spec.ts +134 -0
- package/src/meta/tag-detection.ts +161 -0
- package/src/meta/types.ts +949 -0
- package/src/scraper.ts +32 -16
- package/src/types.ts +54 -54
- package/tsconfig.tsbuildinfo +1 -1
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Type definitions for the `Meta` data extracted from a page's `<head>` and full document.
|
|
3
|
+
*
|
|
4
|
+
* Structure follows the reference table in `frontmatter-keys.md`, with one dot-path
|
|
5
|
+
* field per category. Optional fields are absent when not detected on the page.
|
|
6
|
+
* Array fields are required and default to `[]` so consumers can iterate without
|
|
7
|
+
* null-checks.
|
|
8
|
+
* @see {@link ./classify.ts} for the function that builds `Meta` from raw head entries
|
|
9
|
+
* @see {@link ./parsers.ts} for the value normalizers used by `classify`
|
|
10
|
+
* @module
|
|
11
|
+
*/
|
|
12
|
+
export {};
|
package/dist/scraper.js
CHANGED
|
@@ -42,9 +42,10 @@ import { detectCompress } from '@d-zero/shared/detect-compress';
|
|
|
42
42
|
import { retry as retryable } from '@d-zero/shared/retry';
|
|
43
43
|
import { TypedAwaitEventEmitter as EventEmitter } from '@d-zero/shared/typed-await-event-emitter';
|
|
44
44
|
import { resourceLog, scraperLog } from './debug.js';
|
|
45
|
-
import { getAnchorList, getImageList, getMeta } from './dom-evaluation.js';
|
|
45
|
+
import { DEFAULT_DOM_EVALUATION_TIMEOUT, getAnchorList, getImageList, getMeta, } from './dom-evaluation.js';
|
|
46
46
|
import { isError } from './is-error.js';
|
|
47
47
|
import { keywordCheck } from './keyword-check.js';
|
|
48
|
+
import { emptyMeta } from './meta/classify.js';
|
|
48
49
|
import { findDisconnectionFailures } from './network-disconnection.js';
|
|
49
50
|
import { parseUrl } from './parse-url.js';
|
|
50
51
|
const pid = `${process.pid}`;
|
|
@@ -107,6 +108,7 @@ let Scraper = (() => {
|
|
|
107
108
|
const parseOpts = options?.disableQueries == null
|
|
108
109
|
? undefined
|
|
109
110
|
: { disableQueries: options.disableQueries };
|
|
111
|
+
const domEvaluationTimeout = options?.domEvaluationTimeout ?? DEFAULT_DOM_EVALUATION_TIMEOUT;
|
|
110
112
|
const networkLogs = {};
|
|
111
113
|
// Clear stale state from previous retries (@retryable may re-invoke this method
|
|
112
114
|
// with the same page and mutable arrays, so we must reset to avoid accumulation)
|
|
@@ -265,9 +267,7 @@ let Scraper = (() => {
|
|
|
265
267
|
contentType,
|
|
266
268
|
contentLength,
|
|
267
269
|
responseHeaders,
|
|
268
|
-
meta: {
|
|
269
|
-
title: '',
|
|
270
|
-
},
|
|
270
|
+
meta: emptyMeta(),
|
|
271
271
|
imageList: [],
|
|
272
272
|
anchorList: [],
|
|
273
273
|
html: '',
|
|
@@ -299,6 +299,8 @@ let Scraper = (() => {
|
|
|
299
299
|
};
|
|
300
300
|
});
|
|
301
301
|
if (isExternal) {
|
|
302
|
+
const externalMeta = emptyMeta();
|
|
303
|
+
externalMeta.title = title;
|
|
302
304
|
return {
|
|
303
305
|
url,
|
|
304
306
|
isTarget: false,
|
|
@@ -309,9 +311,7 @@ let Scraper = (() => {
|
|
|
309
311
|
contentType,
|
|
310
312
|
contentLength,
|
|
311
313
|
responseHeaders,
|
|
312
|
-
meta: {
|
|
313
|
-
title,
|
|
314
|
-
},
|
|
314
|
+
meta: externalMeta,
|
|
315
315
|
imageList: [],
|
|
316
316
|
anchorList: [],
|
|
317
317
|
html,
|
|
@@ -341,17 +341,22 @@ let Scraper = (() => {
|
|
|
341
341
|
name: 'getAnchors',
|
|
342
342
|
url,
|
|
343
343
|
isExternal,
|
|
344
|
-
message:
|
|
344
|
+
message: `%countdown(${domEvaluationTimeout},getAnchors_${url.withoutHash},s)%s`,
|
|
345
345
|
});
|
|
346
|
-
const anchorList = await getAnchorList(page, parseOpts);
|
|
346
|
+
const anchorList = await getAnchorList(page, parseOpts, domEvaluationTimeout);
|
|
347
347
|
void this.emit('changePhase', {
|
|
348
348
|
pid: process.pid,
|
|
349
349
|
name: 'getMeta',
|
|
350
350
|
url,
|
|
351
351
|
isExternal,
|
|
352
|
-
message:
|
|
352
|
+
message: `%countdown(${domEvaluationTimeout},getMeta_${url.withoutHash},s)%s`,
|
|
353
353
|
});
|
|
354
|
-
const meta = await getMeta(page);
|
|
354
|
+
const meta = await getMeta(page, {
|
|
355
|
+
url: url.withoutHashAndAuth,
|
|
356
|
+
html,
|
|
357
|
+
statusCode: status,
|
|
358
|
+
headers: responseHeaders ?? undefined,
|
|
359
|
+
}, domEvaluationTimeout);
|
|
355
360
|
const imageList = captureImages
|
|
356
361
|
? await (async () => {
|
|
357
362
|
void this.emit('changePhase', {
|
|
@@ -359,9 +364,9 @@ let Scraper = (() => {
|
|
|
359
364
|
name: 'extractImages',
|
|
360
365
|
url,
|
|
361
366
|
isExternal,
|
|
362
|
-
message:
|
|
367
|
+
message: `%countdown(${domEvaluationTimeout},extractImages_${url.withoutHash},s)%s`,
|
|
363
368
|
});
|
|
364
|
-
return this.#fetchImages(page, url.withoutHashAndAuth, isExternal, imageLoadTimeout);
|
|
369
|
+
return this.#fetchImages(page, url.withoutHashAndAuth, isExternal, imageLoadTimeout, domEvaluationTimeout);
|
|
365
370
|
})()
|
|
366
371
|
: [];
|
|
367
372
|
return {
|
|
@@ -381,7 +386,7 @@ let Scraper = (() => {
|
|
|
381
386
|
isSkipped: false,
|
|
382
387
|
};
|
|
383
388
|
}, "#fetchData") }, _private_fetchData_decorators, { kind: "method", name: "#fetchData", static: false, private: true, access: { has: obj => #fetchData in obj, get: obj => obj.#fetchData }, metadata: _metadata }, null, _instanceExtraInitializers);
|
|
384
|
-
__esDecorate(this, _private_fetchImages_descriptor = { value: __setFunctionName(async function (page, url, isExternal, imageLoadTimeout) {
|
|
389
|
+
__esDecorate(this, _private_fetchImages_descriptor = { value: __setFunctionName(async function (page, url, isExternal, imageLoadTimeout, domEvaluationTimeout) {
|
|
385
390
|
const listener = this.#createPageScanListener(isExternal);
|
|
386
391
|
const devices = [
|
|
387
392
|
{ key: 'desktop-compact', preset: devicePresets['desktop-compact'] },
|
|
@@ -423,7 +428,7 @@ let Scraper = (() => {
|
|
|
423
428
|
isExternal,
|
|
424
429
|
message: `📸 ${key}: Extracting images%dots%`,
|
|
425
430
|
});
|
|
426
|
-
const images = await getImageList(page, preset.width);
|
|
431
|
+
const images = await getImageList(page, preset.width, domEvaluationTimeout);
|
|
427
432
|
imageList.push(...images);
|
|
428
433
|
}
|
|
429
434
|
catch (error) {
|
|
@@ -509,9 +514,7 @@ let Scraper = (() => {
|
|
|
509
514
|
contentType: null,
|
|
510
515
|
contentLength: null,
|
|
511
516
|
responseHeaders: {},
|
|
512
|
-
meta: {
|
|
513
|
-
title: '',
|
|
514
|
-
},
|
|
517
|
+
meta: emptyMeta(),
|
|
515
518
|
imageList: [],
|
|
516
519
|
anchorList: [],
|
|
517
520
|
html: '',
|
|
@@ -705,6 +708,7 @@ let Scraper = (() => {
|
|
|
705
708
|
* @param url - The page URL string (without hash and auth)
|
|
706
709
|
* @param isExternal - Whether the page is external
|
|
707
710
|
* @param imageLoadTimeout - Timeout (ms) for waiting images to complete loading
|
|
711
|
+
* @param domEvaluationTimeout - Timeout (ms) for the in-page image extraction `page.evaluate`
|
|
708
712
|
* @returns Array of image elements from all device presets (may be partial if some viewports failed)
|
|
709
713
|
*/
|
|
710
714
|
get #fetchImages() { return _private_fetchImages_descriptor.value; }
|
package/dist/types.d.ts
CHANGED
|
@@ -7,6 +7,8 @@
|
|
|
7
7
|
export type { ExURL, ParseURLOptions } from '@d-zero/shared/parse-url';
|
|
8
8
|
export type { CompressType } from '@d-zero/shared/detect-compress';
|
|
9
9
|
export type { CDNType } from '@d-zero/shared/detect-cdn';
|
|
10
|
+
export type { Meta, OpenGraphMeta, OgArticleMeta, OgBookMeta, OgProfileMeta, OgMusicMeta, OgVideoNsMeta, TwitterMeta, FbMeta, FediverseMeta, AppleMeta, MsApplicationMeta, VerificationMeta, GoogleMeta, GeoMeta, CitationMeta, RdfaMeta, MicrodataMeta, AmpMeta, LegacyMeta, MobileMeta, MicroformatsMeta, PinterestMeta, SlackMeta, LinkedInMeta, ExperimentalMeta, WikiMeta, LinkMeta, LinkEntry, JsonLdEntry, OthersBucket, ScriptEntry, IframeEntry, TagsMeta, TagDetail, TagEntry, TagSource, ViewportMeta, RobotsMeta, ReferrerMeta, FormatDetectionMeta, HttpEquivMeta, HttpEquivRefresh, RawHeadEntry, } from './meta/types.js';
|
|
11
|
+
import type { Meta } from './meta/types.js';
|
|
10
12
|
import type { CDNType } from '@d-zero/shared/detect-cdn';
|
|
11
13
|
import type { CompressType } from '@d-zero/shared/detect-compress';
|
|
12
14
|
import type { ExURL } from '@d-zero/shared/parse-url';
|
|
@@ -134,43 +136,6 @@ export type AnchorData = {
|
|
|
134
136
|
*/
|
|
135
137
|
isExternal?: boolean;
|
|
136
138
|
};
|
|
137
|
-
/**
|
|
138
|
-
* Metadata extracted from a page's `<head>` element.
|
|
139
|
-
*/
|
|
140
|
-
export type Meta = {
|
|
141
|
-
/** The `lang` attribute of the `<html>` element. */
|
|
142
|
-
lang?: string;
|
|
143
|
-
/** The text content of the `<title>` element. */
|
|
144
|
-
title: string;
|
|
145
|
-
/** The `content` attribute of `<meta name="description">`. */
|
|
146
|
-
description?: string;
|
|
147
|
-
/** The `content` attribute of `<meta name="keywords">`. */
|
|
148
|
-
keywords?: string;
|
|
149
|
-
/** Whether `noindex` is present in the robots meta tag. */
|
|
150
|
-
noindex?: boolean;
|
|
151
|
-
/** Whether `nofollow` is present in the robots meta tag. */
|
|
152
|
-
nofollow?: boolean;
|
|
153
|
-
/** Whether `noarchive` is present in the robots meta tag. */
|
|
154
|
-
noarchive?: boolean;
|
|
155
|
-
/** The canonical URL from `<link rel="canonical">`. */
|
|
156
|
-
canonical?: string;
|
|
157
|
-
/** The alternate URL from `<link rel="alternate">`. */
|
|
158
|
-
alternate?: string;
|
|
159
|
-
/** The Open Graph type (`og:type`). */
|
|
160
|
-
'og:type'?: string;
|
|
161
|
-
/** The Open Graph title (`og:title`). */
|
|
162
|
-
'og:title'?: string;
|
|
163
|
-
/** The Open Graph site name (`og:site_name`). */
|
|
164
|
-
'og:site_name'?: string;
|
|
165
|
-
/** The Open Graph description (`og:description`). */
|
|
166
|
-
'og:description'?: string;
|
|
167
|
-
/** The Open Graph URL (`og:url`). */
|
|
168
|
-
'og:url'?: string;
|
|
169
|
-
/** The Open Graph image URL (`og:image`). */
|
|
170
|
-
'og:image'?: string;
|
|
171
|
-
/** The Twitter Card type (`twitter:card`). */
|
|
172
|
-
'twitter:card'?: string;
|
|
173
|
-
};
|
|
174
139
|
/**
|
|
175
140
|
* A network request/response log entry captured during page scraping via Puppeteer.
|
|
176
141
|
*/
|
|
@@ -345,4 +310,10 @@ export type ScraperOptions = {
|
|
|
345
310
|
headCheckResult?: PageData;
|
|
346
311
|
/** Timeout (ms) for page.goto(). Default: 60_000 (60s). */
|
|
347
312
|
navigationTimeout?: number;
|
|
313
|
+
/**
|
|
314
|
+
* Timeout (ms) for DOM evaluation operations (meta/image/anchor extraction).
|
|
315
|
+
* Bounds how long extraction may hang on a page with an unresponsive main thread.
|
|
316
|
+
* Default: 180_000 (180s, aligned with the upstream retryable timeout).
|
|
317
|
+
*/
|
|
318
|
+
domEvaluationTimeout?: number;
|
|
348
319
|
};
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@d-zero/beholder",
|
|
3
|
-
"version": "2.1.5",
|
|
3
|
+
"version": "3.0.0",
|
|
4
4
|
"description": "Page-level scraper for web crawling and auditing",
|
|
5
5
|
"author": "D-ZERO",
|
|
6
6
|
"license": "MIT",
|
|
@@ -20,10 +20,11 @@
|
|
|
20
20
|
"clean": "tsc --build --clean"
|
|
21
21
|
},
|
|
22
22
|
"dependencies": {
|
|
23
|
-
"@d-zero/puppeteer-page-scan": "4.5.
|
|
23
|
+
"@d-zero/puppeteer-page-scan": "4.5.2",
|
|
24
24
|
"@d-zero/shared": "0.22.0",
|
|
25
25
|
"debug": "4.4.3",
|
|
26
|
-
"puppeteer": "24.37.5"
|
|
26
|
+
"puppeteer": "24.37.5",
|
|
27
|
+
"simple-wappalyzer": "1.1.99"
|
|
27
28
|
},
|
|
28
29
|
"devDependencies": {
|
|
29
30
|
"@types/debug": "4.1.12"
|
|
@@ -33,5 +34,5 @@
|
|
|
33
34
|
"url": "https://github.com/d-zero-dev/tools.git",
|
|
34
35
|
"directory": "packages/@d-zero/beholder"
|
|
35
36
|
},
|
|
36
|
-
"gitHead": "
|
|
37
|
+
"gitHead": "16c831105a12bb635d49130e7f5add25b6643c40"
|
|
37
38
|
}
|