@d-zero/beholder 0.1.28 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. package/CHANGELOG.md +15 -0
  2. package/README.md +172 -477
  3. package/dist/debug.d.ts +4 -1
  4. package/dist/debug.js +5 -2
  5. package/dist/dom-evaluation.d.ts +72 -14
  6. package/dist/dom-evaluation.js +169 -43
  7. package/dist/index.d.ts +20 -3
  8. package/dist/index.js +15 -3
  9. package/dist/is-error.d.ts +8 -0
  10. package/dist/is-error.js +10 -0
  11. package/dist/keyword-check.d.ts +5 -3
  12. package/dist/keyword-check.js +5 -3
  13. package/dist/parse-url.d.ts +14 -0
  14. package/dist/parse-url.js +23 -0
  15. package/dist/scraper.d.ts +39 -13
  16. package/dist/scraper.js +300 -263
  17. package/dist/types.d.ts +286 -214
  18. package/dist/types.js +6 -0
  19. package/package.json +7 -10
  20. package/src/debug.ts +5 -2
  21. package/src/dom-evaluation.ts +195 -65
  22. package/src/index.ts +27 -3
  23. package/src/is-error.spec.ts +33 -0
  24. package/src/is-error.ts +10 -0
  25. package/src/keyword-check.spec.ts +45 -4
  26. package/src/keyword-check.ts +5 -3
  27. package/src/parse-url.spec.ts +35 -0
  28. package/src/parse-url.ts +26 -0
  29. package/src/scraper.ts +338 -300
  30. package/src/types.ts +345 -258
  31. package/tsconfig.tsbuildinfo +1 -1
  32. package/dist/events.d.ts +0 -32
  33. package/dist/events.js +0 -15
  34. package/dist/fetch-destination.d.ts +0 -8
  35. package/dist/fetch-destination.js +0 -145
  36. package/dist/net-timeout-error.d.ts +0 -3
  37. package/dist/net-timeout-error.js +0 -3
  38. package/dist/sub-process-runner.d.ts +0 -12
  39. package/dist/sub-process-runner.js +0 -180
  40. package/dist/sub-process.d.ts +0 -1
  41. package/dist/sub-process.js +0 -67
  42. package/dist/utils.d.ts +0 -16
  43. package/dist/utils.js +0 -69
  44. package/src/events.ts +0 -21
  45. package/src/fetch-destination.ts +0 -173
  46. package/src/net-timeout-error.ts +0 -3
  47. package/src/sub-process-runner.ts +0 -220
  48. package/src/sub-process.ts +0 -86
  49. package/src/utils.ts +0 -89
package/dist/types.d.ts CHANGED
@@ -1,271 +1,343 @@
1
- import type { Action } from 'typescript-fsa';
2
- export type ScrapeEvent = {
3
- pid: number | undefined;
4
- url: ExURL;
5
- };
6
- export type ScrapeErrorEvent = ScrapeEvent & {
7
- shutdown: boolean;
8
- error: {
9
- name: string;
10
- message: string;
11
- stack?: string;
12
- };
13
- };
14
- export type ScrapeEventTypes = {
15
- ignoreAndSkip: ScrapeEvent & {
16
- reason: {
17
- matchedText: string;
18
- excludeKeywords: string[];
19
- };
20
- };
21
- resourceResponse: ScrapeEvent & {
22
- log: NetworkLog;
23
- resource: Omit<Resource, 'uid'>;
24
- };
25
- scrapeEnd: ScrapeEvent & {
26
- timestamp: number;
27
- result: PageData;
28
- };
29
- destroyed: Omit<ScrapeEvent, 'url'>;
30
- error: ScrapeErrorEvent;
31
- changePhase: ChangePhaseEvent;
32
- };
33
- export type ChangePhaseEvent = {
34
- pid: number;
35
- name: 'scrapeStart' | 'launchBrowser' | 'touchHead' | 'touchHeadTimeout' | 'newPage' | 'openPage' | 'loadDOMContent' | 'waitNetworkIdleZero' | 'getHTML' | 'setViewport' | 'scrollToBottom' | 'getImages' | 'getAnchors' | 'getMeta' | 'ignoreAndSkip' | 'scrapeEnd' | 'beforeDestroy' | 'destroyed';
36
- url: ExURL | null;
37
- isExternal: boolean;
38
- message: string;
39
- };
40
- export type AnyScrapeEvent = ScrapeEventTypes[keyof ScrapeEventTypes];
41
- export type SubProcessEventTypes = {
42
- start: {
43
- url: ExURL;
44
- isExternal: boolean;
45
- isGettingImages: boolean;
46
- excludeKeywords: string[];
47
- executablePath: string | null;
48
- isSkip: boolean;
49
- isTitleOnly: boolean;
50
- screenshot: string | null;
51
- } & Required<ParseURLOptions>;
52
- destroy: void;
53
- };
54
- export type SubProcessEvent = {
55
- pid: number | undefined;
56
- };
57
- export type SubProcessChangeEvent = ChangePhaseEvent | {
58
- pid: number | undefined;
59
- name: 'reset' | 'boot' | 'disconnect';
60
- url: ExURL | null;
61
- isExternal: boolean;
62
- message: string;
63
- };
64
- export type SubProcessRunnerEventTypes = {
65
- reset: SubProcessEvent;
66
- scrapeEvent: Action<AnyScrapeEvent>;
67
- changePhase: SubProcessChangeEvent;
68
- error: ScrapeErrorEvent;
69
- };
70
- export type ExURL = {
71
- /**
72
- * Full URL (optimized)
73
- */
74
- href: string;
75
- /**
76
- * Full URL that before parse
77
- */
78
- _originUrlString: string;
79
- /**
80
- * Full URL without hash
81
- */
82
- withoutHash: string;
83
- /**
84
- * Full URL without hash and Authentication
85
- */
86
- withoutHashAndAuth: string;
87
- /**
88
- * Protocol or URI scheme (includes ":")
89
- * - case-insensitive
90
- */
91
- protocol: string;
92
- /**
93
- * Whether protocol is HTTP or HTTPS
94
- */
95
- isHTTP: boolean;
96
- /**
97
- * Whether protocol is HTTPS
98
- */
99
- isSecure: boolean;
100
- /**
101
- * User name of authentication
102
- */
103
- username: string | null;
104
- /**
105
- * Password of authentication
106
- */
107
- password: string | null;
108
- /**
109
- * Host name
110
- *
111
- * - case-insensitive
112
- * - encode non-ASCII characters
113
- * - without port number
114
- */
115
- hostname: string;
116
- /**
117
- * Port number
118
- */
119
- port: string | null;
120
- /**
121
- * Path part
122
- *
123
- * It is only `/` if pathname is empty
124
- *
125
- * - case-sensitive
126
- */
127
- pathname: string | null;
128
- /**
129
- * Array of path
130
- */
131
- paths: string[];
132
- /**
133
- * Depth of paths
134
- */
135
- depth: number;
136
- /**
137
- * Directory name of paths
138
- *
139
- * It is null if it is `/` only
140
- */
141
- dirname: string | null;
142
- /**
143
- * Base name of paths (File name without file extension)
144
- */
145
- basename: string | null;
146
- /**
147
- * Whether index page (It's true if basename is null)
148
- */
149
- isIndex: boolean;
150
- /**
151
- * File extension name (inclues ".")
152
- */
153
- extname: string | null;
154
- /**
155
- * Search query (without `?`)
156
- *
157
- * - case-sensitive
158
- */
159
- query: string | null;
160
- /**
161
- * Hash (includes `#`)
162
- *
163
- * - case-sensitive
164
- */
165
- hash: string | null;
166
- };
167
- export type ParseURLOptions = {
168
- disableQueries?: boolean;
169
- };
1
+ /**
2
+ * Beholder type definitions for the page-level scraper.
3
+ * @see {@link ./scraper.ts} for the Scraper class that produces these types
4
+ * @see {@link ./dom-evaluation.ts} for DOM extraction functions (anchors, images, meta)
5
+ * @module
6
+ */
7
+ export type { ExURL, ParseURLOptions } from '@d-zero/shared/parse-url';
8
+ export type { CompressType } from '@d-zero/shared/detect-compress';
9
+ export type { CDNType } from '@d-zero/shared/detect-cdn';
10
+ import type { CDNType } from '@d-zero/shared/detect-cdn';
11
+ import type { CompressType } from '@d-zero/shared/detect-compress';
12
+ import type { ExURL } from '@d-zero/shared/parse-url';
13
+ /**
14
+ * Scraped page data returned by the scraper after successfully processing a page.
15
+ */
170
16
  export type PageData = {
17
+ /** The parsed URL of the page. */
171
18
  url: ExURL;
19
+ /** Chain of redirect URLs traversed to reach the final destination. */
172
20
  redirectPaths: string[];
21
+ /** Whether this page is a target page (internal and within the crawl scope). */
173
22
  isTarget: boolean;
23
+ /** Whether this page is external to the crawl scope. */
174
24
  isExternal: boolean;
25
+ /** HTTP status code of the response. */
175
26
  status: number;
27
+ /** HTTP status text of the response. */
176
28
  statusText: string;
29
+ /** The Content-Type header value, or `null` if unavailable. */
177
30
  contentType: string | null;
31
+ /** The Content-Length header value in bytes, or `null` if unavailable. */
178
32
  contentLength: number | null;
33
+ /** Raw HTTP response headers, or `null` if unavailable. */
179
34
  responseHeaders: Record<string, string | string[] | undefined> | null;
35
+ /** Extracted metadata from the page (title, description, OGP, etc.). */
180
36
  meta: Meta;
37
+ /** List of anchor elements found on the page. */
181
38
  anchorList: AnchorData[];
39
+ /** List of image elements found on the page. */
182
40
  imageList: ImageElement[];
41
+ /** HTML snapshot of the rendered DOM. */
183
42
  html: string;
43
+ /** Always `false` for successfully scraped pages. See {@link SkippedPageData} for skipped pages. */
184
44
  isSkipped: false;
185
45
  };
46
+ /**
47
+ * Information about an image element found on a page.
48
+ */
49
+ export type ImageElement = {
50
+ /** The `src` attribute value of the image element. */
51
+ src: string;
52
+ /** The `currentSrc` property value (the actual URL loaded by the browser). */
53
+ currentSrc: string;
54
+ /** The `alt` attribute value of the image element. */
55
+ alt: string;
56
+ /** The CSS layout width of the image in pixels. */
57
+ width: number;
58
+ /** The CSS layout height of the image in pixels. */
59
+ height: number;
60
+ /** The intrinsic width of the image in pixels. */
61
+ naturalWidth: number;
62
+ /** The intrinsic height of the image in pixels. */
63
+ naturalHeight: number;
64
+ /** Whether the image uses lazy loading (`loading="lazy"` or IntersectionObserver). */
65
+ isLazy: boolean;
66
+ /** The viewport width at which this image was captured. */
67
+ viewportWidth: number;
68
+ /** The outer HTML source code of the image element. */
69
+ sourceCode: string;
70
+ };
71
+ /**
72
+ * Data for a page that was skipped during crawling due to keyword or path exclusion.
73
+ */
74
+ export type SkippedPageData = {
75
+ /** Always `true` for skipped pages. */
76
+ isSkipped: true;
77
+ /** The URL of the skipped page. */
78
+ url: ExURL;
79
+ /** The reason the page was skipped, with match details. */
80
+ matched: {
81
+ /** Skipped due to a keyword match in the page content. */
82
+ type: 'keyword';
83
+ /** The text that matched the exclusion keyword. */
84
+ text: string;
85
+ /** The exclusion keywords that triggered the skip. */
86
+ excludeKeywords: string[];
87
+ } | {
88
+ /** Skipped due to a URL path pattern match. */
89
+ type: 'path';
90
+ /** The exclusion patterns that triggered the skip. */
91
+ excludes: string[];
92
+ };
93
+ };
94
+ /**
95
+ * A network resource (CSS, JS, image, etc.) captured during page scraping.
96
+ */
97
+ export type Resource = {
98
+ /** The URL of the resource. */
99
+ url: ExURL;
100
+ /** Whether the resource is from an external domain. */
101
+ isExternal: boolean;
102
+ /** Whether the resource request resulted in an error. */
103
+ isError: boolean;
104
+ /** HTTP status code, or `null` if the request failed. */
105
+ status: number | null;
106
+ /** HTTP status text, or `null` if the request failed. */
107
+ statusText: string | null;
108
+ /** The Content-Type header value, or `null` if unavailable. */
109
+ contentType: string | null;
110
+ /** The Content-Length header value in bytes, or `null` if unavailable. */
111
+ contentLength: number | null;
112
+ /** The compression algorithm used, or `false` if uncompressed. */
113
+ compress: false | CompressType;
114
+ /** The CDN provider detected from response headers, or `false` if none detected. */
115
+ cdn: false | CDNType;
116
+ /** Raw HTTP response headers, or `null` if unavailable. */
117
+ headers: Record<string, string | string[] | undefined> | null;
118
+ };
119
+ /**
120
+ * Data extracted from an anchor element (`<a>` or `<area>`) on a page.
121
+ */
122
+ export type AnchorData = {
123
+ /**
124
+ * Extracts the value of the `href` attribute from anchor element (`<a>` `<area>`)
125
+ */
126
+ href: ExURL;
127
+ /**
128
+ * The accessible name of the anchor element
129
+ */
130
+ textContent: string;
131
+ /**
132
+ * Whether the anchor points to an external URL.
133
+ * Set by `processAnchors()` in the crawler; not available in the sub-process.
134
+ */
135
+ isExternal?: boolean;
136
+ };
137
+ /**
138
+ * Metadata extracted from a page's `<head>` element.
139
+ */
186
140
  export type Meta = {
141
+ /** The `lang` attribute of the `<html>` element. */
187
142
  lang?: string;
143
+ /** The text content of the `<title>` element. */
188
144
  title: string;
145
+ /** The `content` attribute of `<meta name="description">`. */
189
146
  description?: string;
147
+ /** The `content` attribute of `<meta name="keywords">`. */
190
148
  keywords?: string;
149
+ /** Whether `noindex` is present in the robots meta tag. */
191
150
  noindex?: boolean;
151
+ /** Whether `nofollow` is present in the robots meta tag. */
192
152
  nofollow?: boolean;
153
+ /** Whether `noarchive` is present in the robots meta tag. */
193
154
  noarchive?: boolean;
155
+ /** The canonical URL from `<link rel="canonical">`. */
194
156
  canonical?: string;
157
+ /** The alternate URL from `<link rel="alternate">`. */
195
158
  alternate?: string;
159
+ /** The Open Graph type (`og:type`). */
196
160
  'og:type'?: string;
161
+ /** The Open Graph title (`og:title`). */
197
162
  'og:title'?: string;
163
+ /** The Open Graph site name (`og:site_name`). */
198
164
  'og:site_name'?: string;
165
+ /** The Open Graph description (`og:description`). */
199
166
  'og:description'?: string;
167
+ /** The Open Graph URL (`og:url`). */
200
168
  'og:url'?: string;
169
+ /** The Open Graph image URL (`og:image`). */
201
170
  'og:image'?: string;
171
+ /** The Twitter Card type (`twitter:card`). */
202
172
  'twitter:card'?: string;
203
173
  };
204
- export type AnchorData = {
205
- /**
206
- * Extracts the value of the `href` attribute from anchor element (`<a>` `<area>`)
207
- */
208
- href: ExURL;
209
- /**
210
- * The accessible name of the anchor element
211
- */
212
- textContent: string;
213
- };
214
- export type ImageElement = {
215
- src: string;
216
- currentSrc: string;
217
- alt: string;
218
- width: number;
219
- height: number;
220
- naturalWidth: number;
221
- naturalHeight: number;
222
- isLazy: boolean;
223
- viewportWidth: number;
224
- sourceCode: string;
225
- };
174
+ /**
175
+ * A network request/response log entry captured during page scraping via Puppeteer.
176
+ */
226
177
  export type NetworkLog = {
178
+ /** The URL of the network request. */
227
179
  url: ExURL;
180
+ /** HTTP status code of the response, or `null` if the request failed. */
228
181
  status: number | null;
182
+ /** The Content-Length of the response body in bytes. */
229
183
  contentLength: number;
184
+ /** The Content-Type of the response. */
230
185
  contentType: string;
186
+ /** Whether the request resulted in an error. */
231
187
  isError: boolean;
188
+ /** Details of the outgoing HTTP request. */
232
189
  request: {
190
+ /** Timestamp of the request in milliseconds. */
233
191
  ts: number;
192
+ /** HTTP request headers. */
234
193
  headers: Record<string, string>;
194
+ /** HTTP method used (e.g., "GET", "POST"). */
235
195
  method: string;
236
196
  };
197
+ /** Details of the HTTP response, absent if the request failed. */
237
198
  response?: {
199
+ /** Timestamp of the response in milliseconds. */
238
200
  ts: number;
201
+ /** HTTP status code. */
239
202
  status: number;
203
+ /** HTTP status text. */
240
204
  statusText: string;
205
+ /** Whether the response was served from cache. */
241
206
  fromCache: boolean;
207
+ /** HTTP response headers. */
242
208
  headers: Record<string, string>;
243
209
  };
244
210
  };
245
- export type Resource = {
246
- url: ExURL;
211
+ /**
212
+ * The result of a single page scrape operation.
213
+ * Encapsulates the outcome and all captured sub-resources.
214
+ */
215
+ export type ScrapeResult = {
216
+ /**
217
+ * The type of result:
218
+ * - `"success"` - Scraping completed successfully.
219
+ * - `"skipped"` - The page was skipped due to an exclusion rule.
220
+ * - `"error"` - An error occurred during scraping.
221
+ */
222
+ type: 'success' | 'skipped' | 'error';
223
+ /** The full page data, present when `type` is `"success"`. */
224
+ pageData?: PageData;
225
+ /** All sub-resources captured during the page load. */
226
+ resources: ResourceEntry[];
227
+ /** Details about why the page was ignored, present when `type` is `"skipped"`. */
228
+ ignored?: {
229
+ url: ExURL;
230
+ matchedText: string;
231
+ excludeKeywords: string[];
232
+ };
233
+ /** Error details, present when `type` is `"error"`. */
234
+ error?: {
235
+ name: string;
236
+ message: string;
237
+ stack?: string;
238
+ shutdown: boolean;
239
+ };
240
+ };
241
+ /**
242
+ * A single sub-resource entry captured during page scraping.
243
+ * Represents one network resource (CSS, JS, image, etc.) loaded by a page.
244
+ */
245
+ export type ResourceEntry = {
246
+ /** The network log entry containing request/response timing and headers. */
247
+ log: NetworkLog;
248
+ /** The resource metadata (without UID, which is assigned by the archive layer). */
249
+ resource: Omit<Resource, 'uid'>;
250
+ /** The URL (without hash) of the page that triggered this resource load. */
251
+ pageUrl: string;
252
+ };
253
+ /**
254
+ * Event payload describing a phase transition in the scraping lifecycle.
255
+ * Phases proceed roughly in order: scrapeStart -> headRequest -> openPage ->
256
+ * loadDOMContent -> waitNetworkIdle -> getHTML -> getAnchors -> getMeta ->
257
+ * getImages -> scrapeEnd.
258
+ */
259
+ export type ChangePhaseEvent = {
260
+ /** The process ID of the scraper worker. */
261
+ pid: number;
262
+ /**
263
+ * The name of the current scraping phase.
264
+ *
265
+ * - `scrapeStart` - Scraping has begun for a URL.
266
+ * - `launchBrowser` - A browser instance is being launched.
267
+ * - `headRequest` - Performing an HTTP HEAD request to check the destination.
268
+ * - `headRequestTimeout` - The HEAD request timed out.
269
+ * - `newPage` - A new browser page/tab is being created.
270
+ * - `openPage` - Navigating the browser page to the target URL.
271
+ * - `loadDOMContent` - Waiting for the DOM content to finish loading.
272
+ * - `waitNetworkIdle` - Waiting for all network activity to cease.
273
+ * - `getHTML` - Extracting the page HTML content.
274
+ * - `setViewport` - Setting the browser viewport dimensions.
275
+ * - `scrollToBottom` - Scrolling the page to trigger lazy-loaded content.
276
+ * - `extractImages` - Starting the image extraction pipeline.
277
+ * - `waitImageLoad` - Waiting for images to finish loading on the page.
278
+ * - `getImages` - Extracting image element data from the page.
279
+ * - `getAnchors` - Extracting anchor/link data from the page.
280
+ * - `getMeta` - Extracting meta tag information from the page.
281
+ * - `pageSkipped` - The page matched an exclusion rule and is being skipped.
282
+ * - `retryWait` - Waiting before a retry attempt after a transient failure.
283
+ * - `retryExhausted` - All retry attempts exhausted; giving up on this operation.
284
+ * - `scrapeEnd` - Scraping has completed for this URL.
285
+ * - `beforeCleanup` - The scraper is about to clean up resources.
286
+ * - `cleanedUp` - The scraper has finished cleaning up.
287
+ */
288
+ name: 'scrapeStart' | 'launchBrowser' | 'headRequest' | 'headRequestTimeout' | 'newPage' | 'openPage' | 'loadDOMContent' | 'waitNetworkIdle' | 'getHTML' | 'setViewport' | 'scrollToBottom' | 'extractImages' | 'waitImageLoad' | 'getImages' | 'getAnchors' | 'getMeta' | 'pageSkipped' | 'retryWait' | 'retryExhausted' | 'scrapeEnd' | 'beforeCleanup' | 'cleanedUp';
289
+ /** The URL being scraped, or `null` when the phase is not URL-specific (e.g., setViewport). */
290
+ url: ExURL | null;
291
+ /** Whether the URL being scraped is external to the crawl scope. */
247
292
  isExternal: boolean;
248
- isError: boolean;
249
- status: number | null;
250
- statusText: string | null;
251
- contentType: string | null;
252
- contentLength: number | null;
253
- compress: false | CompressType;
254
- cdn: false | CDNType;
255
- headers: Record<string, string | string[] | undefined> | null;
293
+ /** A human-readable message providing additional context about the phase. */
294
+ message: string;
256
295
  };
257
- export type CompressType = 'gzip' | 'compress' | 'deflate' | 'br' | 'sdch' | 'vcdiff' | 'xdelta';
258
- export type CDNType = 'Amazon S3' | 'Amazon CloudFront' | 'IIJ' | 'Cloudflare' | 'Akamai';
259
- export type HTTPMethod = 'HEAD' | 'GET' | 'POST' | 'PATCH' | 'PUT' | 'DELETE' | 'OPTIONS';
260
- export type SkippedPageData = {
261
- isSkipped: true;
262
- url: ExURL;
263
- matched: {
264
- type: 'keyword';
265
- text: string;
266
- excludeKeywords: string[];
267
- } | {
268
- type: 'path';
269
- excludes: string[];
296
+ /**
297
+ * Streaming event types emitted by the Scraper.
298
+ * Result events (success, skipped, error) are returned as values,
299
+ * not emitted as events.
300
+ */
301
+ export type ScraperEventTypes = {
302
+ /**
303
+ * Emitted when a sub-resource response is captured during page loading.
304
+ * Only fires for internal (non-external) pages.
305
+ */
306
+ resourceResponse: {
307
+ /** The process ID of the scraper worker. */
308
+ pid: number;
309
+ /** The URL of the page being scraped. */
310
+ url: ExURL;
311
+ /** Network log entry for the resource request/response. */
312
+ log: NetworkLog;
313
+ /** The resource metadata (without UID, which is assigned later by the archive). */
314
+ resource: Omit<Resource, 'uid'>;
270
315
  };
316
+ /**
317
+ * Emitted when the scraper transitions between lifecycle phases.
318
+ */
319
+ changePhase: ChangePhaseEvent;
320
+ };
321
+ /**
322
+ * Configuration options for the Scraper.
323
+ */
324
+ export type ScraperOptions = {
325
+ /** Whether the URL is external to the crawl scope. */
326
+ isExternal: boolean;
327
+ /** Whether to capture image element data from the page. */
328
+ captureImages: boolean;
329
+ /** Keywords or patterns that, if found in the page HTML, cause the page to be skipped. */
330
+ excludeKeywords: string[];
331
+ /** When `true`, only metadata is fetched (via HEAD request) without full browser scraping. */
332
+ metadataOnly: boolean;
333
+ /** Timeout in ms for waiting lazy-loaded images to finish loading. Defaults to 5000. */
334
+ imageLoadTimeout: number;
335
+ /** When `true`, query parameters are stripped from URLs during parsing. */
336
+ disableQueries: boolean;
337
+ /** Number of retries for network operations. Overrides `@retryable` default. */
338
+ retries?: number;
339
+ /** Pre-fetched HEAD check result. When provided, scrapeStart() skips the HEAD request. */
340
+ headCheckResult?: PageData;
341
+ /** Timeout (ms) for page.goto(). Default: 60_000 (60s). */
342
+ navigationTimeout?: number;
271
343
  };
package/dist/types.js CHANGED
@@ -1 +1,7 @@
1
+ /**
2
+ * Beholder type definitions for the page-level scraper.
3
+ * @see {@link ./scraper.ts} for the Scraper class that produces these types
4
+ * @see {@link ./dom-evaluation.ts} for DOM extraction functions (anchors, images, meta)
5
+ * @module
6
+ */
1
7
  export {};
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@d-zero/beholder",
3
- "version": "0.1.28",
4
- "description": "The tool for scraping and recording web pages",
3
+ "version": "2.0.0",
4
+ "description": "Page-level scraper for web crawling and auditing",
5
5
  "author": "D-ZERO",
6
6
  "license": "MIT",
7
7
  "publishConfig": {
@@ -20,16 +20,13 @@
20
20
  "clean": "tsc --build --clean"
21
21
  },
22
22
  "dependencies": {
23
- "@d-zero/puppeteer-page-scan": "4.4.3",
24
- "@d-zero/shared": "0.19.0",
23
+ "@d-zero/puppeteer-page-scan": "4.4.5",
24
+ "@d-zero/shared": "0.20.1",
25
25
  "debug": "4.4.3",
26
- "follow-redirects": "1.15.11",
27
- "puppeteer": "24.37.3",
28
- "typescript-fsa": "3.0.0"
26
+ "puppeteer": "24.37.5"
29
27
  },
30
28
  "devDependencies": {
31
- "@types/debug": "4.1.12",
32
- "@types/follow-redirects": "1.14.4"
29
+ "@types/debug": "4.1.12"
33
30
  },
34
- "gitHead": "7f90e8c637c40b9abee652eac927a3ca257ef29f"
31
+ "gitHead": "a6b5eb0a0a327c003053f7c25be4c075ed319c76"
35
32
  }
package/src/debug.ts CHANGED
@@ -1,7 +1,10 @@
1
1
  import debug from 'debug';
2
2
 
3
- export const log = debug('Beholder');
4
- export const scraperLog = log.extend('Scraper');
3
+ /** Root debug logger for the beholder package. */
4
+ export const scraperLog = debug('Beholder');
5
+ /** Debug logger for resource fetching. */
5
6
  export const resourceLog = scraperLog.extend('Resource');
7
+ /** Debug logger for DOM evaluation. */
6
8
  export const domLog = scraperLog.extend('DOM');
9
+ /** Debug logger for detailed DOM evaluation output. */
7
10
  export const domDetailsLog = domLog.extend('Details');