@d-zero/beholder 2.0.0 → 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +4 -0
- package/package.json +4 -5
- package/LICENSE +0 -21
- package/dist/debug.d.ts +0 -9
- package/dist/debug.js +0 -9
- package/dist/dom-evaluation.d.ts +0 -109
- package/dist/dom-evaluation.js +0 -273
- package/dist/index.d.ts +0 -21
- package/dist/index.js +0 -16
- package/dist/is-error.d.ts +0 -8
- package/dist/is-error.js +0 -10
- package/dist/keyword-check.d.ts +0 -8
- package/dist/keyword-check.js +0 -17
- package/dist/parse-url.d.ts +0 -14
- package/dist/parse-url.js +0 -23
- package/dist/scraper.d.ts +0 -41
- package/dist/scraper.js +0 -658
- package/dist/types.d.ts +0 -343
- package/dist/types.js +0 -7
- package/tsconfig.tsbuildinfo +0 -1
package/CHANGELOG.md
CHANGED
|
@@ -3,6 +3,10 @@
|
|
|
3
3
|
All notable changes to this project will be documented in this file.
|
|
4
4
|
See [Conventional Commits](https://conventionalcommits.org) for commit guidelines.
|
|
5
5
|
|
|
6
|
+
## [2.0.1](https://github.com/d-zero-dev/tools/compare/@d-zero/beholder@2.0.0...@d-zero/beholder@2.0.1) (2026-03-11)
|
|
7
|
+
|
|
8
|
+
**Note:** Version bump only for package @d-zero/beholder
|
|
9
|
+
|
|
6
10
|
# [2.0.0](https://github.com/d-zero-dev/tools/compare/@d-zero/beholder@0.1.29...@d-zero/beholder@2.0.0) (2026-02-26)
|
|
7
11
|
|
|
8
12
|
- feat(beholder)!: replace SubProcessRunner with in-process Scraper ([eaf2768](https://github.com/d-zero-dev/tools/commit/eaf276898d96dccf6b504b22b7c8f0234162e82e))
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@d-zero/beholder",
|
|
3
|
-
"version": "2.0.0",
|
|
3
|
+
"version": "2.0.1",
|
|
4
4
|
"description": "Page-level scraper for web crawling and auditing",
|
|
5
5
|
"author": "D-ZERO",
|
|
6
6
|
"license": "MIT",
|
|
@@ -20,13 +20,12 @@
|
|
|
20
20
|
"clean": "tsc --build --clean"
|
|
21
21
|
},
|
|
22
22
|
"dependencies": {
|
|
23
|
-
"@d-zero/puppeteer-page-scan": "4.4.
|
|
24
|
-
"@d-zero/shared": "0.
|
|
23
|
+
"@d-zero/puppeteer-page-scan": "4.4.6",
|
|
24
|
+
"@d-zero/shared": "0.21.0",
|
|
25
25
|
"debug": "4.4.3",
|
|
26
26
|
"puppeteer": "24.37.5"
|
|
27
27
|
},
|
|
28
28
|
"devDependencies": {
|
|
29
29
|
"@types/debug": "4.1.12"
|
|
30
|
-
},
|
|
31
|
-
"gitHead": "a6b5eb0a0a327c003053f7c25be4c075ed319c76"
|
|
30
|
+
}
|
|
32
31
|
}
|
package/LICENSE
DELETED
|
@@ -1,21 +0,0 @@
|
|
|
1
|
-
MIT License
|
|
2
|
-
|
|
3
|
-
Copyright (c) 2024 D-ZERO Co., Ltd.
|
|
4
|
-
|
|
5
|
-
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
-
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
-
in the Software without restriction, including without limitation the rights
|
|
8
|
-
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
-
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
-
furnished to do so, subject to the following conditions:
|
|
11
|
-
|
|
12
|
-
The above copyright notice and this permission notice shall be included in all
|
|
13
|
-
copies or substantial portions of the Software.
|
|
14
|
-
|
|
15
|
-
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
-
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
-
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
-
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
-
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
-
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
-
SOFTWARE.
|
package/dist/debug.d.ts
DELETED
|
@@ -1,9 +0,0 @@
|
|
|
1
|
-
import debug from 'debug';
|
|
2
|
-
/** Root debug logger for the beholder package. */
|
|
3
|
-
export declare const scraperLog: debug.Debugger;
|
|
4
|
-
/** Debug logger for resource fetching. */
|
|
5
|
-
export declare const resourceLog: debug.Debugger;
|
|
6
|
-
/** Debug logger for DOM evaluation. */
|
|
7
|
-
export declare const domLog: debug.Debugger;
|
|
8
|
-
/** Debug logger for detailed DOM evaluation output. */
|
|
9
|
-
export declare const domDetailsLog: debug.Debugger;
|
package/dist/debug.js
DELETED
|
@@ -1,9 +0,0 @@
|
|
|
1
|
-
import debug from 'debug';
|
|
2
|
-
/** Root debug logger for the beholder package. */
|
|
3
|
-
export const scraperLog = debug('Beholder');
|
|
4
|
-
/** Debug logger for resource fetching. */
|
|
5
|
-
export const resourceLog = scraperLog.extend('Resource');
|
|
6
|
-
/** Debug logger for DOM evaluation. */
|
|
7
|
-
export const domLog = scraperLog.extend('DOM');
|
|
8
|
-
/** Debug logger for detailed DOM evaluation output. */
|
|
9
|
-
export const domDetailsLog = domLog.extend('Details');
|
package/dist/dom-evaluation.d.ts
DELETED
|
@@ -1,109 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* DOM evaluation functions for extracting structured data from Puppeteer pages.
|
|
3
|
-
*
|
|
4
|
-
* These functions are called by {@link ./scraper.ts | Scraper.#fetchData} to extract
|
|
5
|
-
* anchors, images, and meta information after page navigation completes.
|
|
6
|
-
* @see {@link ./types.ts} for the data types returned by these functions
|
|
7
|
-
*/
|
|
8
|
-
import type { AnchorData, ImageElement, ParseURLOptions } from './types.js';
|
|
9
|
-
import type { ElementHandle, Page } from 'puppeteer';
|
|
10
|
-
/**
|
|
11
|
-
* Parameters for {@link getProp}.
|
|
12
|
-
* @template T - The expected type of the property value.
|
|
13
|
-
*/
|
|
14
|
-
export interface GetPropParams<T> {
|
|
15
|
-
/** The Puppeteer element handle to read the property from. */
|
|
16
|
-
readonly $el: ElementHandle<Element>;
|
|
17
|
-
/** The name of the DOM property to retrieve (e.g., `"href"`, `"textContent"`). */
|
|
18
|
-
readonly propName: string;
|
|
19
|
-
/** The default value to return if the property cannot be read or times out. */
|
|
20
|
-
readonly fallback: T;
|
|
21
|
-
}
|
|
22
|
-
/**
|
|
23
|
-
* Retrieves a DOM property value from a Puppeteer element handle with a timeout.
|
|
24
|
-
*
|
|
25
|
-
* Races the actual property retrieval against a 10-second timeout.
|
|
26
|
-
* If the property cannot be read or the timeout expires, the fallback value is returned.
|
|
27
|
-
* @template T - The expected type of the property value.
|
|
28
|
-
* @param params - Parameters containing the element, property name, and fallback.
|
|
29
|
-
* @returns The property value, or the fallback if retrieval fails.
|
|
30
|
-
*/
|
|
31
|
-
export declare function getProp<T>(params: GetPropParams<T>): Promise<T>;
|
|
32
|
-
/**
|
|
33
|
-
* Parameters for {@link getPropBySelector}.
|
|
34
|
-
* @template T - The expected type of the property value.
|
|
35
|
-
*/
|
|
36
|
-
export interface GetPropBySelectorParams<T> {
|
|
37
|
-
/** The Puppeteer page to query. */
|
|
38
|
-
readonly page: Page;
|
|
39
|
-
/** A CSS selector to find the target element. */
|
|
40
|
-
readonly selector: string;
|
|
41
|
-
/** The DOM property name to read from the matched element. */
|
|
42
|
-
readonly propName: string;
|
|
43
|
-
/** The default value if no element matches or the property cannot be read. */
|
|
44
|
-
readonly fallback: T;
|
|
45
|
-
}
|
|
46
|
-
/**
|
|
47
|
-
* Retrieves a DOM property value from the first element matching a CSS selector.
|
|
48
|
-
*
|
|
49
|
-
* Combines `page.$()` with {@link getProp} for convenient single-element lookups.
|
|
50
|
-
* @template T - The expected type of the property value.
|
|
51
|
-
* @param params - Parameters containing the page, selector, property name, and fallback.
|
|
52
|
-
* @returns The property value, or the fallback if the element is not found or retrieval fails.
|
|
53
|
-
*/
|
|
54
|
-
export declare function getPropBySelector<T>(params: GetPropBySelectorParams<T>): Promise<T>;
|
|
55
|
-
/**
|
|
56
|
-
* Extracts all `<img>` elements from the page and returns their properties.
|
|
57
|
-
*
|
|
58
|
-
* For each image, collects the `src`, `currentSrc`, `alt`, bounding box dimensions,
|
|
59
|
-
* natural dimensions, lazy-loading status, and the outer HTML source code.
|
|
60
|
-
* @param page - The Puppeteer page to extract images from.
|
|
61
|
-
* @param viewportWidth - The current viewport width in pixels, recorded alongside each image entry.
|
|
62
|
-
* @returns An array of {@link ImageElement} objects describing each image on the page.
|
|
63
|
-
*/
|
|
64
|
-
export declare function getImageList(page: Page, viewportWidth: number): Promise<ImageElement[]>;
|
|
65
|
-
/**
|
|
66
|
-
* Extracts all anchor (`<a>` and `<area>`) elements with `href` attributes from the page.
|
|
67
|
-
*
|
|
68
|
-
* For each anchor, resolves the `href` to an `ExURL` via `parseUrl`, retrieves
|
|
69
|
-
* the accessible name (from the accessibility tree, falling back to `textContent`),
|
|
70
|
-
* and filters out non-HTTP links.
|
|
71
|
-
* @param page - The Puppeteer page to extract anchors from.
|
|
72
|
-
* @param options - Optional URL parsing options (e.g., `disableQueries`).
|
|
73
|
-
* @returns An array of {@link AnchorData} objects for all HTTP(S) links found on the page.
|
|
74
|
-
*/
|
|
75
|
-
export declare function getAnchorList(page: Page, options?: ParseURLOptions): Promise<AnchorData[]>;
|
|
76
|
-
/**
|
|
77
|
-
* Extracts comprehensive meta information from the page's `<head>`.
|
|
78
|
-
*
|
|
79
|
-
* Collects the following metadata:
|
|
80
|
-
* - `title` - The document title.
|
|
81
|
-
* - `lang` - The `lang` attribute of the `<html>` element.
|
|
82
|
-
* - `description` - The `<meta name="description">` content.
|
|
83
|
-
* - `keywords` - The `<meta name="keywords">` content.
|
|
84
|
-
* - `noindex` / `nofollow` / `noarchive` - Parsed from the `<meta name="robots">` directives.
|
|
85
|
-
* - `canonical` - The `<link rel="canonical">` content.
|
|
86
|
-
* - `alternate` - The `<link rel="alternate">` content.
|
|
87
|
-
* - Open Graph tags: `og:type`, `og:title`, `og:site_name`, `og:description`, `og:url`, `og:image`.
|
|
88
|
-
* - `twitter:card` - The Twitter Card type.
|
|
89
|
-
* @param page - The Puppeteer page to extract meta information from.
|
|
90
|
-
* @returns An object containing all extracted meta properties.
|
|
91
|
-
*/
|
|
92
|
-
export declare function getMeta(page: Page): Promise<{
|
|
93
|
-
title: string;
|
|
94
|
-
lang: string;
|
|
95
|
-
description: string;
|
|
96
|
-
keywords: string;
|
|
97
|
-
noindex: boolean;
|
|
98
|
-
nofollow: boolean;
|
|
99
|
-
noarchive: boolean;
|
|
100
|
-
canonical: string;
|
|
101
|
-
alternate: string;
|
|
102
|
-
'og:type': string;
|
|
103
|
-
'og:title': string;
|
|
104
|
-
'og:site_name': string;
|
|
105
|
-
'og:description': string;
|
|
106
|
-
'og:url': string;
|
|
107
|
-
'og:image': string;
|
|
108
|
-
'twitter:card': string;
|
|
109
|
-
}>;
|
package/dist/dom-evaluation.js
DELETED
|
@@ -1,273 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* DOM evaluation functions for extracting structured data from Puppeteer pages.
|
|
3
|
-
*
|
|
4
|
-
* These functions are called by {@link ./scraper.ts | Scraper.#fetchData} to extract
|
|
5
|
-
* anchors, images, and meta information after page navigation completes.
|
|
6
|
-
* @see {@link ./types.ts} for the data types returned by these functions
|
|
7
|
-
*/
|
|
8
|
-
import { domDetailsLog, domLog } from './debug.js';
|
|
9
|
-
import { parseUrl } from './parse-url.js';
|
|
10
|
-
const pid = `${process.pid}`;
|
|
11
|
-
const log = domLog.extend(pid);
|
|
12
|
-
const dLog = domDetailsLog.extend(pid);
|
|
13
|
-
/**
|
|
14
|
-
* Retrieves a DOM property value from a Puppeteer element handle with a timeout.
|
|
15
|
-
*
|
|
16
|
-
* Races the actual property retrieval against a 10-second timeout.
|
|
17
|
-
* If the property cannot be read or the timeout expires, the fallback value is returned.
|
|
18
|
-
* @template T - The expected type of the property value.
|
|
19
|
-
* @param params - Parameters containing the element, property name, and fallback.
|
|
20
|
-
* @returns The property value, or the fallback if retrieval fails.
|
|
21
|
-
*/
|
|
22
|
-
export async function getProp(params) {
|
|
23
|
-
const { $el, propName, fallback } = params;
|
|
24
|
-
return Promise.race([
|
|
25
|
-
_getProp($el, propName, fallback),
|
|
26
|
-
new Promise((res) => setTimeout(() => res(fallback), 10 * 1000)),
|
|
27
|
-
]);
|
|
28
|
-
}
|
|
29
|
-
/**
|
|
30
|
-
* Internal implementation of property retrieval without timeout.
|
|
31
|
-
* @template T - The expected type of the property value.
|
|
32
|
-
* @param $el - The Puppeteer element handle.
|
|
33
|
-
* @param propName - The DOM property name.
|
|
34
|
-
* @param fallback - The default value on failure.
|
|
35
|
-
* @returns The property value cast to `T`, or the fallback.
|
|
36
|
-
*/
|
|
37
|
-
async function _getProp($el, propName, fallback) {
|
|
38
|
-
try {
|
|
39
|
-
const prop = await $el.getProperty(propName);
|
|
40
|
-
if (!prop) {
|
|
41
|
-
return fallback;
|
|
42
|
-
}
|
|
43
|
-
const value = (await prop.jsonValue());
|
|
44
|
-
return value;
|
|
45
|
-
}
|
|
46
|
-
catch {
|
|
47
|
-
return fallback;
|
|
48
|
-
}
|
|
49
|
-
}
|
|
50
|
-
/**
|
|
51
|
-
* Retrieves a DOM property value from the first element matching a CSS selector.
|
|
52
|
-
*
|
|
53
|
-
* Combines `page.$()` with {@link getProp} for convenient single-element lookups.
|
|
54
|
-
* @template T - The expected type of the property value.
|
|
55
|
-
* @param params - Parameters containing the page, selector, property name, and fallback.
|
|
56
|
-
* @returns The property value, or the fallback if the element is not found or retrieval fails.
|
|
57
|
-
*/
|
|
58
|
-
export async function getPropBySelector(params) {
|
|
59
|
-
const { page, selector, propName, fallback } = params;
|
|
60
|
-
const $el = await page.$(selector);
|
|
61
|
-
if (!$el) {
|
|
62
|
-
return fallback;
|
|
63
|
-
}
|
|
64
|
-
return getProp({ $el, propName, fallback });
|
|
65
|
-
}
|
|
66
|
-
/**
|
|
67
|
-
* Extracts all `<img>` elements from the page and returns their properties.
|
|
68
|
-
*
|
|
69
|
-
* For each image, collects the `src`, `currentSrc`, `alt`, bounding box dimensions,
|
|
70
|
-
* natural dimensions, lazy-loading status, and the outer HTML source code.
|
|
71
|
-
* @param page - The Puppeteer page to extract images from.
|
|
72
|
-
* @param viewportWidth - The current viewport width in pixels, recorded alongside each image entry.
|
|
73
|
-
* @returns An array of {@link ImageElement} objects describing each image on the page.
|
|
74
|
-
*/
|
|
75
|
-
export async function getImageList(page, viewportWidth) {
|
|
76
|
-
log('Getting images (Viewport: %dpx)', viewportWidth);
|
|
77
|
-
const $images = await page.$$('img');
|
|
78
|
-
const imageList = [];
|
|
79
|
-
for (const $image of $images) {
|
|
80
|
-
const boundingBox = await $image.boundingBox();
|
|
81
|
-
const width = boundingBox?.width || 0;
|
|
82
|
-
const height = boundingBox?.height || 0;
|
|
83
|
-
const src = await getProp({ $el: $image, propName: 'src', fallback: '' });
|
|
84
|
-
const currentSrc = await getProp({
|
|
85
|
-
$el: $image,
|
|
86
|
-
propName: 'currentSrc',
|
|
87
|
-
fallback: '',
|
|
88
|
-
});
|
|
89
|
-
const alt = await getProp({ $el: $image, propName: 'alt', fallback: '' });
|
|
90
|
-
const naturalWidth = await getProp({
|
|
91
|
-
$el: $image,
|
|
92
|
-
propName: 'naturalWidth',
|
|
93
|
-
fallback: 0,
|
|
94
|
-
});
|
|
95
|
-
const naturalHeight = await getProp({
|
|
96
|
-
$el: $image,
|
|
97
|
-
propName: 'naturalHeight',
|
|
98
|
-
fallback: 0,
|
|
99
|
-
});
|
|
100
|
-
const loading = await getProp({ $el: $image, propName: 'loading', fallback: '' });
|
|
101
|
-
const sourceCode = await getProp({
|
|
102
|
-
$el: $image,
|
|
103
|
-
propName: 'outerHTML',
|
|
104
|
-
fallback: '',
|
|
105
|
-
});
|
|
106
|
-
const isLazy = loading.toLowerCase().trim() === 'lazy';
|
|
107
|
-
imageList.push({
|
|
108
|
-
src,
|
|
109
|
-
currentSrc,
|
|
110
|
-
alt,
|
|
111
|
-
width,
|
|
112
|
-
height,
|
|
113
|
-
naturalWidth,
|
|
114
|
-
naturalHeight,
|
|
115
|
-
isLazy,
|
|
116
|
-
viewportWidth,
|
|
117
|
-
sourceCode,
|
|
118
|
-
});
|
|
119
|
-
}
|
|
120
|
-
log('Got %d images (Viewport: %dpx)', imageList.length, viewportWidth);
|
|
121
|
-
dLog('Images are: %O', imageList.map((i) => i.src));
|
|
122
|
-
return imageList;
|
|
123
|
-
}
|
|
124
|
-
/**
|
|
125
|
-
* Extracts all anchor (`<a>` and `<area>`) elements with `href` attributes from the page.
|
|
126
|
-
*
|
|
127
|
-
* For each anchor, resolves the `href` to an `ExURL` via `parseUrl`, retrieves
|
|
128
|
-
* the accessible name (from the accessibility tree, falling back to `textContent`),
|
|
129
|
-
* and filters out non-HTTP links.
|
|
130
|
-
* @param page - The Puppeteer page to extract anchors from.
|
|
131
|
-
* @param options - Optional URL parsing options (e.g., `disableQueries`).
|
|
132
|
-
* @returns An array of {@link AnchorData} objects for all HTTP(S) links found on the page.
|
|
133
|
-
*/
|
|
134
|
-
export async function getAnchorList(page, options) {
|
|
135
|
-
log('Getting anchors');
|
|
136
|
-
const $anchors = await page.$$('a[href], area[href]');
|
|
137
|
-
const anchorList = [];
|
|
138
|
-
for (const $anchor of $anchors) {
|
|
139
|
-
const $href = await getProp({ $el: $anchor, propName: 'href', fallback: '' });
|
|
140
|
-
const hrefVal = $href.toString();
|
|
141
|
-
const href = parseUrl(hrefVal, options);
|
|
142
|
-
if (!href || !href.isHTTP) {
|
|
143
|
-
continue;
|
|
144
|
-
}
|
|
145
|
-
const axNode = await page.accessibility.snapshot({ root: $anchor });
|
|
146
|
-
const textContent = await getProp({
|
|
147
|
-
$el: $anchor,
|
|
148
|
-
propName: 'textContent',
|
|
149
|
-
fallback: '',
|
|
150
|
-
});
|
|
151
|
-
const accessibleName = axNode ? axNode.name || '' : textContent.trim();
|
|
152
|
-
const link = {
|
|
153
|
-
href,
|
|
154
|
-
textContent: accessibleName,
|
|
155
|
-
};
|
|
156
|
-
anchorList.push(link);
|
|
157
|
-
}
|
|
158
|
-
log('Got %d anchors', anchorList.length);
|
|
159
|
-
dLog('Anchors are: %O', anchorList.map((a) => a.href.href));
|
|
160
|
-
return anchorList;
|
|
161
|
-
}
|
|
162
|
-
/**
|
|
163
|
-
* Extracts comprehensive meta information from the page's `<head>`.
|
|
164
|
-
*
|
|
165
|
-
* Collects the following metadata:
|
|
166
|
-
* - `title` - The document title.
|
|
167
|
-
* - `lang` - The `lang` attribute of the `<html>` element.
|
|
168
|
-
* - `description` - The `<meta name="description">` content.
|
|
169
|
-
* - `keywords` - The `<meta name="keywords">` content.
|
|
170
|
-
* - `noindex` / `nofollow` / `noarchive` - Parsed from the `<meta name="robots">` directives.
|
|
171
|
-
* - `canonical` - The `<link rel="canonical">` content.
|
|
172
|
-
* - `alternate` - The `<link rel="alternate">` content.
|
|
173
|
-
* - Open Graph tags: `og:type`, `og:title`, `og:site_name`, `og:description`, `og:url`, `og:image`.
|
|
174
|
-
* - `twitter:card` - The Twitter Card type.
|
|
175
|
-
* @param page - The Puppeteer page to extract meta information from.
|
|
176
|
-
* @returns An object containing all extracted meta properties.
|
|
177
|
-
*/
|
|
178
|
-
export async function getMeta(page) {
|
|
179
|
-
log('Getting Meta');
|
|
180
|
-
const robotsVal = await getPropBySelector({
|
|
181
|
-
page,
|
|
182
|
-
selector: 'meta[name="robots"]',
|
|
183
|
-
propName: 'content',
|
|
184
|
-
fallback: '',
|
|
185
|
-
});
|
|
186
|
-
const robots = new Set(robotsVal.split(',').map((robot) => robot.trim().toLowerCase()));
|
|
187
|
-
const meta = {
|
|
188
|
-
title: await getPropBySelector({
|
|
189
|
-
page,
|
|
190
|
-
selector: 'title',
|
|
191
|
-
propName: 'textContent',
|
|
192
|
-
fallback: '',
|
|
193
|
-
}),
|
|
194
|
-
lang: await getPropBySelector({
|
|
195
|
-
page,
|
|
196
|
-
selector: 'html',
|
|
197
|
-
propName: 'lang',
|
|
198
|
-
fallback: '',
|
|
199
|
-
}),
|
|
200
|
-
description: await getPropBySelector({
|
|
201
|
-
page,
|
|
202
|
-
selector: 'meta[name="description"]',
|
|
203
|
-
propName: 'content',
|
|
204
|
-
fallback: '',
|
|
205
|
-
}),
|
|
206
|
-
keywords: await getPropBySelector({
|
|
207
|
-
page,
|
|
208
|
-
selector: 'meta[name="keywords"]',
|
|
209
|
-
propName: 'content',
|
|
210
|
-
fallback: '',
|
|
211
|
-
}),
|
|
212
|
-
noindex: robots.has('noindex'),
|
|
213
|
-
nofollow: robots.has('nofollow'),
|
|
214
|
-
noarchive: robots.has('noarchive'),
|
|
215
|
-
canonical: await getPropBySelector({
|
|
216
|
-
page,
|
|
217
|
-
selector: 'link[rel="canonical"]',
|
|
218
|
-
propName: 'href',
|
|
219
|
-
fallback: '',
|
|
220
|
-
}),
|
|
221
|
-
alternate: await getPropBySelector({
|
|
222
|
-
page,
|
|
223
|
-
selector: 'link[rel="alternate"]',
|
|
224
|
-
propName: 'href',
|
|
225
|
-
fallback: '',
|
|
226
|
-
}),
|
|
227
|
-
'og:type': await getPropBySelector({
|
|
228
|
-
page,
|
|
229
|
-
selector: 'meta[property="og:type"]',
|
|
230
|
-
propName: 'content',
|
|
231
|
-
fallback: '',
|
|
232
|
-
}),
|
|
233
|
-
'og:title': await getPropBySelector({
|
|
234
|
-
page,
|
|
235
|
-
selector: 'meta[property="og:title"]',
|
|
236
|
-
propName: 'content',
|
|
237
|
-
fallback: '',
|
|
238
|
-
}),
|
|
239
|
-
'og:site_name': await getPropBySelector({
|
|
240
|
-
page,
|
|
241
|
-
selector: 'meta[property="og:site_name"]',
|
|
242
|
-
propName: 'content',
|
|
243
|
-
fallback: '',
|
|
244
|
-
}),
|
|
245
|
-
'og:description': await getPropBySelector({
|
|
246
|
-
page,
|
|
247
|
-
selector: 'meta[property="og:description"]',
|
|
248
|
-
propName: 'content',
|
|
249
|
-
fallback: '',
|
|
250
|
-
}),
|
|
251
|
-
'og:url': await getPropBySelector({
|
|
252
|
-
page,
|
|
253
|
-
selector: 'meta[property="og:url"]',
|
|
254
|
-
propName: 'content',
|
|
255
|
-
fallback: '',
|
|
256
|
-
}),
|
|
257
|
-
'og:image': await getPropBySelector({
|
|
258
|
-
page,
|
|
259
|
-
selector: 'meta[property="og:image"]',
|
|
260
|
-
propName: 'content',
|
|
261
|
-
fallback: '',
|
|
262
|
-
}),
|
|
263
|
-
'twitter:card': await getPropBySelector({
|
|
264
|
-
page,
|
|
265
|
-
selector: 'meta[name="twitter:card"]',
|
|
266
|
-
propName: 'content',
|
|
267
|
-
fallback: '',
|
|
268
|
-
}),
|
|
269
|
-
};
|
|
270
|
-
log('Got meta');
|
|
271
|
-
dLog('Meta data are: %O', meta);
|
|
272
|
-
return meta;
|
|
273
|
-
}
|
package/dist/index.d.ts
DELETED
|
@@ -1,21 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* @module @d-zero/beholder
|
|
3
|
-
*
|
|
4
|
-
* The beholder package provides page-level scraping capabilities for web crawlers.
|
|
5
|
-
* It handles browser page navigation, DOM data extraction (anchors, images, meta tags),
|
|
6
|
-
* network resource monitoring, and keyword-based page exclusion.
|
|
7
|
-
*
|
|
8
|
-
* Results are returned as values from `scrapeStart()`, not emitted as events.
|
|
9
|
-
* Only streaming events (changePhase, resourceResponse) are emitted for progress monitoring.
|
|
10
|
-
*
|
|
11
|
-
* The main entry point is the `Scraper` class (default export).
|
|
12
|
-
*/
|
|
13
|
-
export { default as default } from './scraper.js';
|
|
14
|
-
export { isError } from './is-error.js';
|
|
15
|
-
export { detectCompress } from '@d-zero/shared/detect-compress';
|
|
16
|
-
export type { CompressType } from '@d-zero/shared/detect-compress';
|
|
17
|
-
export { detectCDN } from '@d-zero/shared/detect-cdn';
|
|
18
|
-
export type { CDNType } from '@d-zero/shared/detect-cdn';
|
|
19
|
-
export type { ScrapeResult, ResourceEntry, PageData } from './types.js';
|
|
20
|
-
export type { ScraperOptions, ChangePhaseEvent, ScraperEventTypes } from './types.js';
|
|
21
|
-
export type { Resource, AnchorData, Meta, ImageElement, SkippedPageData, NetworkLog, } from './types.js';
|
package/dist/index.js
DELETED
|
@@ -1,16 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* @module @d-zero/beholder
|
|
3
|
-
*
|
|
4
|
-
* The beholder package provides page-level scraping capabilities for web crawlers.
|
|
5
|
-
* It handles browser page navigation, DOM data extraction (anchors, images, meta tags),
|
|
6
|
-
* network resource monitoring, and keyword-based page exclusion.
|
|
7
|
-
*
|
|
8
|
-
* Results are returned as values from `scrapeStart()`, not emitted as events.
|
|
9
|
-
* Only streaming events (changePhase, resourceResponse) are emitted for progress monitoring.
|
|
10
|
-
*
|
|
11
|
-
* The main entry point is the `Scraper` class (default export).
|
|
12
|
-
*/
|
|
13
|
-
export { default as default } from './scraper.js';
|
|
14
|
-
export { isError } from './is-error.js';
|
|
15
|
-
export { detectCompress } from '@d-zero/shared/detect-compress';
|
|
16
|
-
export { detectCDN } from '@d-zero/shared/detect-cdn';
|
package/dist/is-error.d.ts
DELETED
|
@@ -1,8 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Determines whether an HTTP status code represents an error.
|
|
3
|
-
* Status codes in the range 200-399 (inclusive) are considered successful;
|
|
4
|
-
* all others are considered errors.
|
|
5
|
-
* @param status - HTTP status code to evaluate
|
|
6
|
-
* @returns `true` if the status code indicates an error (< 200 or >= 400)
|
|
7
|
-
*/
|
|
8
|
-
export declare function isError(status: number): boolean;
|
package/dist/is-error.js
DELETED
|
@@ -1,10 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Determines whether an HTTP status code represents an error.
|
|
3
|
-
* Status codes in the range 200-399 (inclusive) are considered successful;
|
|
4
|
-
* all others are considered errors.
|
|
5
|
-
* @param status - HTTP status code to evaluate
|
|
6
|
-
* @returns `true` if the status code indicates an error (< 200 or >= 400)
|
|
7
|
-
*/
|
|
8
|
-
export function isError(status) {
|
|
9
|
-
return !(200 <= status && status < 400);
|
|
10
|
-
}
|
package/dist/keyword-check.d.ts
DELETED
|
@@ -1,8 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Checks whether the given HTML content contains any of the specified exclude keywords.
|
|
3
|
-
* Each keyword is converted to a regular expression via `strToRegex` before testing.
|
|
4
|
-
* @param html - The raw HTML string to search within.
|
|
5
|
-
* @param excludeKeywords - An array of keyword strings or regex patterns to match against the HTML.
|
|
6
|
-
* @returns The first matched keyword string if a match is found, or `false` if none match.
|
|
7
|
-
*/
|
|
8
|
-
export declare function keywordCheck(html: string, excludeKeywords: string[]): string | false;
|
package/dist/keyword-check.js
DELETED
|
@@ -1,17 +0,0 @@
|
|
|
1
|
-
import { strToRegex } from '@d-zero/shared/str-to-regex';
|
|
2
|
-
/**
|
|
3
|
-
* Checks whether the given HTML content contains any of the specified exclude keywords.
|
|
4
|
-
* Each keyword is converted to a regular expression via `strToRegex` before testing.
|
|
5
|
-
* @param html - The raw HTML string to search within.
|
|
6
|
-
* @param excludeKeywords - An array of keyword strings or regex patterns to match against the HTML.
|
|
7
|
-
* @returns The first matched keyword string if a match is found, or `false` if none match.
|
|
8
|
-
*/
|
|
9
|
-
export function keywordCheck(html, excludeKeywords) {
|
|
10
|
-
for (const keyword of excludeKeywords) {
|
|
11
|
-
const pattern = strToRegex(keyword);
|
|
12
|
-
if (pattern.test(html)) {
|
|
13
|
-
return keyword;
|
|
14
|
-
}
|
|
15
|
-
}
|
|
16
|
-
return false;
|
|
17
|
-
}
|
package/dist/parse-url.d.ts
DELETED
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
import type { ExURL, ParseURLOptions } from '@d-zero/shared/parse-url';
|
|
2
|
-
/**
|
|
3
|
-
* Parses a URL string into an ExURL object, filtering out non-HTTP URLs
|
|
4
|
-
* that lack a hostname and protocol. If the input is already an ExURL object,
|
|
5
|
-
* it is returned as-is without re-parsing.
|
|
6
|
-
*
|
|
7
|
-
* WHY null return: Bare fragment-only strings (e.g. `"#section"`) and
|
|
8
|
-
* protocol-relative paths without a host are not meaningful URLs for crawling.
|
|
9
|
-
* @param url - A URL string or an already-parsed ExURL object
|
|
10
|
-
* @param options - URL parsing options (e.g. `disableQueries` to strip query strings)
|
|
11
|
-
* @returns The parsed ExURL, or `null` if the URL is not navigable
|
|
12
|
-
* @see `@d-zero/shared/parse-url` for the underlying parser
|
|
13
|
-
*/
|
|
14
|
-
export declare function parseUrl(url: string | ExURL, options?: ParseURLOptions): ExURL | null;
|
package/dist/parse-url.js
DELETED
|
@@ -1,23 +0,0 @@
|
|
|
1
|
-
import { parseUrl as sharedParseUrl } from '@d-zero/shared/parse-url';
|
|
2
|
-
/**
|
|
3
|
-
* Parses a URL string into an ExURL object, filtering out non-HTTP URLs
|
|
4
|
-
* that lack a hostname and protocol. If the input is already an ExURL object,
|
|
5
|
-
* it is returned as-is without re-parsing.
|
|
6
|
-
*
|
|
7
|
-
* WHY null return: Bare fragment-only strings (e.g. `"#section"`) and
|
|
8
|
-
* protocol-relative paths without a host are not meaningful URLs for crawling.
|
|
9
|
-
* @param url - A URL string or an already-parsed ExURL object
|
|
10
|
-
* @param options - URL parsing options (e.g. `disableQueries` to strip query strings)
|
|
11
|
-
* @returns The parsed ExURL, or `null` if the URL is not navigable
|
|
12
|
-
* @see `@d-zero/shared/parse-url` for the underlying parser
|
|
13
|
-
*/
|
|
14
|
-
export function parseUrl(url, options) {
|
|
15
|
-
if (typeof url !== 'string') {
|
|
16
|
-
return url;
|
|
17
|
-
}
|
|
18
|
-
const result = sharedParseUrl(url, options);
|
|
19
|
-
if (!result.isHTTP && !result.hostname && !result.protocol) {
|
|
20
|
-
return null;
|
|
21
|
-
}
|
|
22
|
-
return result;
|
|
23
|
-
}
|
package/dist/scraper.d.ts
DELETED
|
@@ -1,41 +0,0 @@
|
|
|
1
|
-
import type { ScraperEventTypes, ScraperOptions, ScrapeResult, ExURL } from './types.js';
|
|
2
|
-
import type { Page } from 'puppeteer';
|
|
3
|
-
import { TypedAwaitEventEmitter as EventEmitter } from '@d-zero/shared/typed-await-event-emitter';
|
|
4
|
-
/**
|
|
5
|
-
* Page-level scraper that extracts data from a single browser page.
|
|
6
|
-
*
|
|
7
|
-
* The scraper returns results as values from `scrapeStart()` rather than
|
|
8
|
-
* emitting them as events. Only streaming events (changePhase, resourceResponse)
|
|
9
|
-
* are emitted for progress monitoring.
|
|
10
|
-
*
|
|
11
|
-
* The Puppeteer `Page` object is injected externally, and page lifecycle
|
|
12
|
-
* (including `page.close()`) is managed by the caller.
|
|
13
|
-
* @example
|
|
14
|
-
* ```ts
|
|
15
|
-
* const scraper = new Scraper();
|
|
16
|
-
* scraper.on('changePhase', (e) => console.log(e.name));
|
|
17
|
-
* const result = await scraper.scrapeStart(page, url, { isExternal: false });
|
|
18
|
-
* ```
|
|
19
|
-
*/
|
|
20
|
-
export default class Scraper extends EventEmitter<ScraperEventTypes> {
|
|
21
|
-
#private;
|
|
22
|
-
/** Number of retries for `@retryable`-decorated methods. Set per-scrape from options. */
|
|
23
|
-
retries?: number;
|
|
24
|
-
/**
|
|
25
|
-
* Begins the scraping process for a given URL on the provided Puppeteer page.
|
|
26
|
-
*
|
|
27
|
-
* Returns a `ScrapeResult` containing the outcome:
|
|
28
|
-
* - `type: "success"` with `pageData` on success
|
|
29
|
-
* - `type: "skipped"` with `ignored` details when the page is excluded
|
|
30
|
-
* - `type: "error"` with `error` details when scraping fails
|
|
31
|
-
*
|
|
32
|
-
* Sub-resources are collected via the `resourceResponse` event and
|
|
33
|
-
* included in the returned `ScrapeResult.resources`.
|
|
34
|
-
* @param page - The Puppeteer page instance to use for navigation and DOM evaluation.
|
|
35
|
-
* @param url - The extended URL to scrape.
|
|
36
|
-
* @param options - Optional scraper configuration overriding defaults.
|
|
37
|
-
* @param isSkip - When `true`, the page is immediately skipped without any network requests.
|
|
38
|
-
* @returns The scrape result containing the outcome and captured resources.
|
|
39
|
-
*/
|
|
40
|
-
scrapeStart(page: Page, url: ExURL, options?: Partial<ScraperOptions>, isSkip?: boolean): Promise<ScrapeResult>;
|
|
41
|
-
}
|