@d-zero/beholder 0.1.29 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. package/CHANGELOG.md +11 -0
  2. package/README.md +172 -477
  3. package/dist/debug.d.ts +4 -1
  4. package/dist/debug.js +5 -2
  5. package/dist/dom-evaluation.d.ts +72 -14
  6. package/dist/dom-evaluation.js +169 -43
  7. package/dist/index.d.ts +20 -3
  8. package/dist/index.js +15 -3
  9. package/dist/is-error.d.ts +8 -0
  10. package/dist/is-error.js +10 -0
  11. package/dist/keyword-check.d.ts +5 -3
  12. package/dist/keyword-check.js +5 -3
  13. package/dist/parse-url.d.ts +14 -0
  14. package/dist/parse-url.js +23 -0
  15. package/dist/scraper.d.ts +39 -13
  16. package/dist/scraper.js +300 -263
  17. package/dist/types.d.ts +286 -214
  18. package/dist/types.js +6 -0
  19. package/package.json +7 -10
  20. package/src/debug.ts +5 -2
  21. package/src/dom-evaluation.ts +195 -65
  22. package/src/index.ts +27 -3
  23. package/src/is-error.spec.ts +33 -0
  24. package/src/is-error.ts +10 -0
  25. package/src/keyword-check.spec.ts +45 -4
  26. package/src/keyword-check.ts +5 -3
  27. package/src/parse-url.spec.ts +35 -0
  28. package/src/parse-url.ts +26 -0
  29. package/src/scraper.ts +338 -300
  30. package/src/types.ts +345 -258
  31. package/tsconfig.tsbuildinfo +1 -1
  32. package/dist/events.d.ts +0 -32
  33. package/dist/events.js +0 -15
  34. package/dist/fetch-destination.d.ts +0 -8
  35. package/dist/fetch-destination.js +0 -145
  36. package/dist/net-timeout-error.d.ts +0 -3
  37. package/dist/net-timeout-error.js +0 -3
  38. package/dist/sub-process-runner.d.ts +0 -12
  39. package/dist/sub-process-runner.js +0 -180
  40. package/dist/sub-process.d.ts +0 -1
  41. package/dist/sub-process.js +0 -67
  42. package/dist/utils.d.ts +0 -16
  43. package/dist/utils.js +0 -69
  44. package/src/events.ts +0 -21
  45. package/src/fetch-destination.ts +0 -173
  46. package/src/net-timeout-error.ts +0 -3
  47. package/src/sub-process-runner.ts +0 -220
  48. package/src/sub-process.ts +0 -86
  49. package/src/utils.ts +0 -89
@@ -1,25 +1,45 @@
1
+ /**
2
+ * DOM evaluation functions for extracting structured data from Puppeteer pages.
3
+ *
4
+ * These functions are called by {@link ./scraper.ts | Scraper.#fetchData} to extract
5
+ * anchors, images, and meta information after page navigation completes.
6
+ * @see {@link ./types.ts} for the data types returned by these functions
7
+ */
8
+
1
9
  import type { AnchorData, ImageElement, ParseURLOptions } from './types.js';
2
10
  import type { ElementHandle, Page } from 'puppeteer';
3
11
 
4
- import { parseUrl } from '@d-zero/shared/parse-url';
5
-
6
12
  import { domDetailsLog, domLog } from './debug.js';
13
+ import { parseUrl } from './parse-url.js';
7
14
 
8
15
  const pid = `${process.pid}`;
9
16
  const log = domLog.extend(pid);
10
17
  const dLog = domDetailsLog.extend(pid);
11
18
 
12
19
  /**
20
+ * Parameters for {@link getProp}.
21
+ * @template T - The expected type of the property value.
22
+ */
23
+ export interface GetPropParams<T> {
24
+ /** The Puppeteer element handle to read the property from. */
25
+ readonly $el: ElementHandle<Element>;
26
+ /** The name of the DOM property to retrieve (e.g., `"href"`, `"textContent"`). */
27
+ readonly propName: string;
28
+ /** The default value to return if the property cannot be read or times out. */
29
+ readonly fallback: T;
30
+ }
31
+
32
+ /**
33
+ * Retrieves a DOM property value from a Puppeteer element handle with a timeout.
13
34
  *
14
- * @param $el
15
- * @param propName
16
- * @param fallback
35
+ * Races the actual property retrieval against a 10-second timeout.
36
+ * If the property cannot be read or the timeout expires, the fallback value is returned.
37
+ * @template T - The expected type of the property value.
38
+ * @param params - Parameters containing the element, property name, and fallback.
39
+ * @returns The property value, or the fallback if retrieval fails.
17
40
  */
18
- export async function getProp<T>(
19
- $el: ElementHandle<Element>,
20
- propName: string,
21
- fallback: T,
22
- ) {
41
+ export async function getProp<T>(params: GetPropParams<T>) {
42
+ const { $el, propName, fallback } = params;
23
43
  return Promise.race([
24
44
  _getProp($el, propName, fallback),
25
45
  new Promise<T>((res) => setTimeout(() => res(fallback), 10 * 1000)),
@@ -27,10 +47,12 @@ export async function getProp<T>(
27
47
  }
28
48
 
29
49
  /**
30
- *
31
- * @param $el
32
- * @param propName
33
- * @param fallback
50
+ * Internal implementation of property retrieval without timeout.
51
+ * @template T - The expected type of the property value.
52
+ * @param $el - The Puppeteer element handle.
53
+ * @param propName - The DOM property name.
54
+ * @param fallback - The default value on failure.
55
+ * @returns The property value cast to `T`, or the fallback.
34
56
  */
35
57
  async function _getProp<T>($el: ElementHandle<Element>, propName: string, fallback: T) {
36
58
  try {
@@ -46,30 +68,46 @@ async function _getProp<T>($el: ElementHandle<Element>, propName: string, fallba
46
68
  }
47
69
 
48
70
  /**
71
+ * Parameters for {@link getPropBySelector}.
72
+ * @template T - The expected type of the property value.
73
+ */
74
+ export interface GetPropBySelectorParams<T> {
75
+ /** The Puppeteer page to query. */
76
+ readonly page: Page;
77
+ /** A CSS selector to find the target element. */
78
+ readonly selector: string;
79
+ /** The DOM property name to read from the matched element. */
80
+ readonly propName: string;
81
+ /** The default value if no element matches or the property cannot be read. */
82
+ readonly fallback: T;
83
+ }
84
+
85
+ /**
86
+ * Retrieves a DOM property value from the first element matching a CSS selector.
49
87
  *
50
- * @param page
51
- * @param selector
52
- * @param propName
53
- * @param fallback
88
+ * Combines `page.$()` with {@link getProp} for convenient single-element lookups.
89
+ * @template T - The expected type of the property value.
90
+ * @param params - Parameters containing the page, selector, property name, and fallback.
91
+ * @returns The property value, or the fallback if the element is not found or retrieval fails.
54
92
  */
55
- export async function getPropBySelector<T>(
56
- page: Page,
57
- selector: string,
58
- propName: string,
59
- fallback: T,
60
- ) {
93
+ export async function getPropBySelector<T>(params: GetPropBySelectorParams<T>) {
94
+ const { page, selector, propName, fallback } = params;
61
95
  const $el = await page.$(selector);
62
96
  if (!$el) {
63
97
  return fallback;
64
98
  }
65
99
 
66
- return getProp($el, propName, fallback);
100
+ return getProp({ $el, propName, fallback });
67
101
  }
68
102
 
69
103
  /**
104
+ * Extracts all `<img>` elements from the page and returns their properties.
70
105
  *
71
- * @param page
72
- * @param viewportWidth
106
+ * For each image, collects the `src`, `currentSrc`, `alt`, bounding box dimensions,
107
+ * natural dimensions, lazy-loading status, and the outer HTML source code.
108
+ * @param page - The Puppeteer page to extract images from.
109
+ * @param viewportWidth - The current viewport width in pixels, recorded alongside each image entry.
110
+ * @returns An array of {@link ImageElement} objects describing each image on the page.
73
111
  */
74
112
  export async function getImageList(
75
113
  page: Page,
@@ -94,13 +132,29 @@ export async function getImageList(
94
132
  const boundingBox = await $image.boundingBox();
95
133
  const width = boundingBox?.width || 0;
96
134
  const height = boundingBox?.height || 0;
97
- const src = await getProp($image, 'src', '');
98
- const currentSrc = await getProp($image, 'currentSrc', '');
99
- const alt = await getProp($image, 'alt', '');
100
- const naturalWidth = await getProp($image, 'naturalWidth', 0);
101
- const naturalHeight = await getProp($image, 'naturalHeight', 0);
102
- const loading = await getProp($image, 'loading', '');
103
- const sourceCode = await getProp($image, 'outerHTML', '');
135
+ const src = await getProp({ $el: $image, propName: 'src', fallback: '' });
136
+ const currentSrc = await getProp({
137
+ $el: $image,
138
+ propName: 'currentSrc',
139
+ fallback: '',
140
+ });
141
+ const alt = await getProp({ $el: $image, propName: 'alt', fallback: '' });
142
+ const naturalWidth = await getProp({
143
+ $el: $image,
144
+ propName: 'naturalWidth',
145
+ fallback: 0,
146
+ });
147
+ const naturalHeight = await getProp({
148
+ $el: $image,
149
+ propName: 'naturalHeight',
150
+ fallback: 0,
151
+ });
152
+ const loading = await getProp({ $el: $image, propName: 'loading', fallback: '' });
153
+ const sourceCode = await getProp({
154
+ $el: $image,
155
+ propName: 'outerHTML',
156
+ fallback: '',
157
+ });
104
158
  const isLazy = loading.toLowerCase().trim() === 'lazy';
105
159
  imageList.push({
106
160
  src,
@@ -125,9 +179,14 @@ export async function getImageList(
125
179
  }
126
180
 
127
181
  /**
182
+ * Extracts all anchor (`<a>` and `<area>`) elements with `href` attributes from the page.
128
183
  *
129
- * @param page
130
- * @param options
184
+ * For each anchor, resolves the `href` to an `ExURL` via `parseUrl`, retrieves
185
+ * the accessible name (from the accessibility tree, falling back to `textContent`),
186
+ * and filters out non-HTTP links.
187
+ * @param page - The Puppeteer page to extract anchors from.
188
+ * @param options - Optional URL parsing options (e.g., `disableQueries`).
189
+ * @returns An array of {@link AnchorData} objects for all HTTP(S) links found on the page.
131
190
  */
132
191
  export async function getAnchorList(page: Page, options?: ParseURLOptions) {
133
192
  log('Getting anchors');
@@ -136,14 +195,18 @@ export async function getAnchorList(page: Page, options?: ParseURLOptions) {
136
195
  const anchorList: AnchorData[] = [];
137
196
 
138
197
  for (const $anchor of $anchors) {
139
- const $href = await getProp($anchor, 'href', '');
198
+ const $href = await getProp({ $el: $anchor, propName: 'href', fallback: '' });
140
199
  const hrefVal = $href.toString();
141
200
  const href = parseUrl(hrefVal, options);
142
201
  if (!href || !href.isHTTP) {
143
202
  continue;
144
203
  }
145
204
  const axNode = await page.accessibility.snapshot({ root: $anchor });
146
- const textContent = await getProp($anchor, 'textContent', '');
205
+ const textContent = await getProp({
206
+ $el: $anchor,
207
+ propName: 'textContent',
208
+ fallback: '',
209
+ });
147
210
  const accessibleName = axNode ? axNode.name || '' : textContent.trim();
148
211
  const link: AnchorData = {
149
212
  href,
@@ -161,46 +224,113 @@ export async function getAnchorList(page: Page, options?: ParseURLOptions) {
161
224
  }
162
225
 
163
226
  /**
227
+ * Extracts comprehensive meta information from the page's `<head>`.
164
228
  *
165
- * @param page
229
+ * Collects the following metadata:
230
+ * - `title` - The document title.
231
+ * - `lang` - The `lang` attribute of the `<html>` element.
232
+ * - `description` - The `<meta name="description">` content.
233
+ * - `keywords` - The `<meta name="keywords">` content.
234
+ * - `noindex` / `nofollow` / `noarchive` - Parsed from the `<meta name="robots">` directives.
235
+ * - `canonical` - The `href` of the `<link rel="canonical">` element.
236
+ * - `alternate` - The `href` of the first `<link rel="alternate">` element.
237
+ * - Open Graph tags: `og:type`, `og:title`, `og:site_name`, `og:description`, `og:url`, `og:image`.
238
+ * - `twitter:card` - The Twitter Card type.
239
+ * @param page - The Puppeteer page to extract meta information from.
240
+ * @returns An object containing all extracted meta properties.
166
241
  */
167
242
  export async function getMeta(page: Page) {
168
243
  log('Getting Meta');
169
244
 
170
- const robotsVal = await getPropBySelector(page, 'meta[name="robots"]', 'content', '');
245
+ const robotsVal = await getPropBySelector({
246
+ page,
247
+ selector: 'meta[name="robots"]',
248
+ propName: 'content',
249
+ fallback: '',
250
+ });
171
251
  const robots = new Set(robotsVal.split(',').map((robot) => robot.trim().toLowerCase()));
172
252
  const meta = {
173
- title: await getPropBySelector(page, 'title', 'textContent', ''),
174
- lang: await getPropBySelector(page, 'html', 'lang', ''),
175
- description: await getPropBySelector(page, 'meta[name="description"]', 'content', ''),
176
- keywords: await getPropBySelector(page, 'meta[name="keywords"]', 'content', ''),
253
+ title: await getPropBySelector({
254
+ page,
255
+ selector: 'title',
256
+ propName: 'textContent',
257
+ fallback: '',
258
+ }),
259
+ lang: await getPropBySelector({
260
+ page,
261
+ selector: 'html',
262
+ propName: 'lang',
263
+ fallback: '',
264
+ }),
265
+ description: await getPropBySelector({
266
+ page,
267
+ selector: 'meta[name="description"]',
268
+ propName: 'content',
269
+ fallback: '',
270
+ }),
271
+ keywords: await getPropBySelector({
272
+ page,
273
+ selector: 'meta[name="keywords"]',
274
+ propName: 'content',
275
+ fallback: '',
276
+ }),
177
277
  noindex: robots.has('noindex'),
178
278
  nofollow: robots.has('nofollow'),
179
279
  noarchive: robots.has('noarchive'),
180
- canonical: await getPropBySelector(page, 'link[rel="canonical"]', 'content', ''),
181
- alternate: await getPropBySelector(page, 'link[rel="alternate"]', 'content', ''),
182
- 'og:type': await getPropBySelector(page, 'meta[property="og:type"]', 'content', ''),
183
- 'og:title': await getPropBySelector(page, 'meta[property="og:title"]', 'content', ''),
184
- 'og:site_name': await getPropBySelector(
280
+ canonical: await getPropBySelector({
281
+ page,
282
+ selector: 'link[rel="canonical"]',
283
+ propName: 'href',
284
+ fallback: '',
285
+ }),
286
+ alternate: await getPropBySelector({
287
+ page,
288
+ selector: 'link[rel="alternate"]',
289
+ propName: 'href',
290
+ fallback: '',
291
+ }),
292
+ 'og:type': await getPropBySelector({
293
+ page,
294
+ selector: 'meta[property="og:type"]',
295
+ propName: 'content',
296
+ fallback: '',
297
+ }),
298
+ 'og:title': await getPropBySelector({
299
+ page,
300
+ selector: 'meta[property="og:title"]',
301
+ propName: 'content',
302
+ fallback: '',
303
+ }),
304
+ 'og:site_name': await getPropBySelector({
305
+ page,
306
+ selector: 'meta[property="og:site_name"]',
307
+ propName: 'content',
308
+ fallback: '',
309
+ }),
310
+ 'og:description': await getPropBySelector({
311
+ page,
312
+ selector: 'meta[property="og:description"]',
313
+ propName: 'content',
314
+ fallback: '',
315
+ }),
316
+ 'og:url': await getPropBySelector({
185
317
  page,
186
- 'meta[property="og:site_name"]',
187
- 'content',
188
- '',
189
- ),
190
- 'og:description': await getPropBySelector(
318
+ selector: 'meta[property="og:url"]',
319
+ propName: 'content',
320
+ fallback: '',
321
+ }),
322
+ 'og:image': await getPropBySelector({
191
323
  page,
192
- 'meta[property="og:description"]',
193
- 'content',
194
- '',
195
- ),
196
- 'og:url': await getPropBySelector(page, 'meta[property="og:url"]', 'content', ''),
197
- 'og:image': await getPropBySelector(page, 'meta[property="og:image"]', 'content', ''),
198
- 'twitter:card': await getPropBySelector(
324
+ selector: 'meta[property="og:image"]',
325
+ propName: 'content',
326
+ fallback: '',
327
+ }),
328
+ 'twitter:card': await getPropBySelector({
199
329
  page,
200
- 'meta[name="twitter:card"]',
201
- 'content',
202
- '',
203
- ),
330
+ selector: 'meta[name="twitter:card"]',
331
+ propName: 'content',
332
+ fallback: '',
333
+ }),
204
334
  };
205
335
 
206
336
  log('Got meta');
package/src/index.ts CHANGED
@@ -1,4 +1,28 @@
1
- export { default as SubProcessRunner } from './sub-process-runner.js';
1
+ /**
2
+ * @module @d-zero/beholder
3
+ *
4
+ * The beholder package provides page-level scraping capabilities for web crawlers.
5
+ * It handles browser page navigation, DOM data extraction (anchors, images, meta tags),
6
+ * network resource monitoring, and keyword-based page exclusion.
7
+ *
8
+ * Results are returned as values from `scrapeStart()`, not emitted as events.
9
+ * Only streaming events (changePhase, resourceResponse) are emitted for progress monitoring.
10
+ *
11
+ * The main entry point is the `Scraper` class (default export).
12
+ */
2
13
  export { default as default } from './scraper.js';
3
- export * from './types.js';
4
- export * from './events.js';
14
+ export { isError } from './is-error.js';
15
+ export { detectCompress } from '@d-zero/shared/detect-compress';
16
+ export type { CompressType } from '@d-zero/shared/detect-compress';
17
+ export { detectCDN } from '@d-zero/shared/detect-cdn';
18
+ export type { CDNType } from '@d-zero/shared/detect-cdn';
19
+ export type { ScrapeResult, ResourceEntry, PageData } from './types.js';
20
+ export type { ScraperOptions, ChangePhaseEvent, ScraperEventTypes } from './types.js';
21
+ export type {
22
+ Resource,
23
+ AnchorData,
24
+ Meta,
25
+ ImageElement,
26
+ SkippedPageData,
27
+ NetworkLog,
28
+ } from './types.js';
@@ -0,0 +1,33 @@
1
+ import { describe, it, expect } from 'vitest';
2
+
3
+ import { isError } from './is-error.js';
4
+
5
+ describe('isError', () => {
6
+ it('returns true for status below 200', () => {
7
+ expect(isError(199)).toBe(true);
8
+ });
9
+
10
+ it('returns false for status 200 (lower boundary)', () => {
11
+ expect(isError(200)).toBe(false);
12
+ });
13
+
14
+ it('returns false for status 399 (upper boundary)', () => {
15
+ expect(isError(399)).toBe(false);
16
+ });
17
+
18
+ it('returns true for status 400', () => {
19
+ expect(isError(400)).toBe(true);
20
+ });
21
+
22
+ it('returns true for status 0', () => {
23
+ expect(isError(0)).toBe(true);
24
+ });
25
+
26
+ it('returns true for negative status', () => {
27
+ expect(isError(-1)).toBe(true);
28
+ });
29
+
30
+ it('returns true for status 500', () => {
31
+ expect(isError(500)).toBe(true);
32
+ });
33
+ });
@@ -0,0 +1,10 @@
1
+ /**
2
+ * Determines whether an HTTP status code represents an error.
3
+ * Status codes in the range 200-399 (inclusive) are considered successful;
4
+ * all others are considered errors.
5
+ * @param status - HTTP status code to evaluate
6
+ * @returns `true` if the status code indicates an error (< 200 or >= 400)
7
+ */
8
+ export function isError(status: number) {
9
+ return !(200 <= status && status < 400);
10
+ }
@@ -1,8 +1,49 @@
1
- import { test, expect } from 'vitest';
1
+ import { describe, it, expect } from 'vitest';
2
2
 
3
3
  import { keywordCheck } from './keyword-check.js';
4
4
 
5
- test('keyword checking', () => {
6
- expect(keywordCheck('abc', ['abc'])).toBe('abc');
7
- expect(keywordCheck('ABC', ['abc', '/abc/i'])).toBe('/abc/i');
5
+ describe('keywordCheck', () => {
6
+ it('returns the matched keyword when found', () => {
7
+ expect(keywordCheck('<html><body>error message</body></html>', ['error'])).toBe(
8
+ 'error',
9
+ );
10
+ });
11
+
12
+ it('returns false when no keyword matches', () => {
13
+ expect(keywordCheck('<html><body>hello world</body></html>', ['error'])).toBe(false);
14
+ });
15
+
16
+ it('returns the first matching keyword when multiple match', () => {
17
+ expect(
18
+ keywordCheck('<html><body>error warning</body></html>', ['warning', 'error']),
19
+ ).toBe('warning');
20
+ });
21
+
22
+ it('returns false for empty keyword array', () => {
23
+ expect(keywordCheck('<html><body>some content</body></html>', [])).toBe(false);
24
+ });
25
+
26
+ it('returns false for empty HTML', () => {
27
+ expect(keywordCheck('', ['error'])).toBe(false);
28
+ });
29
+
30
+ it('supports regex pattern with /pattern/ syntax', () => {
31
+ expect(keywordCheck('<html><body>code 404 found</body></html>', ['/\\d{3}/'])).toBe(
32
+ '/\\d{3}/',
33
+ );
34
+ });
35
+
36
+ it('is case-sensitive by default for plain keywords', () => {
37
+ expect(keywordCheck('<html><body>Error</body></html>', ['error'])).toBe(false);
38
+ });
39
+
40
+ it('supports case-insensitive flag /pattern/i', () => {
41
+ expect(keywordCheck('<html><body>Error</body></html>', ['/error/i'])).toBe(
42
+ '/error/i',
43
+ );
44
+ });
45
+
46
+ it('returns false when regex pattern does not match', () => {
47
+ expect(keywordCheck('<html><body>no numbers</body></html>', ['/\\d+/'])).toBe(false);
48
+ });
8
49
  });
@@ -1,9 +1,11 @@
1
1
  import { strToRegex } from '@d-zero/shared/str-to-regex';
2
2
 
3
3
  /**
4
- *
5
- * @param html
6
- * @param excludeKeywords
4
+ * Checks whether the given HTML content contains any of the specified exclude keywords.
5
+ * Each keyword is converted to a regular expression via `strToRegex` before testing.
6
+ * @param html - The raw HTML string to search within.
7
+ * @param excludeKeywords - An array of keyword strings or regex patterns to match against the HTML.
8
+ * @returns The first matched keyword string if a match is found, or `false` if none match.
7
9
  */
8
10
  export function keywordCheck(html: string, excludeKeywords: string[]) {
9
11
  for (const keyword of excludeKeywords) {
@@ -0,0 +1,35 @@
1
+ import { describe, it, expect } from 'vitest';
2
+
3
+ import { parseUrl } from './parse-url.js';
4
+
5
+ describe('parseUrl', () => {
6
+ it('parses a string URL into ExURL', () => {
7
+ const result = parseUrl('https://example.com/path');
8
+ expect(result).not.toBeNull();
9
+ expect(result!.hostname).toBe('example.com');
10
+ });
11
+
12
+ it('returns ExURL object as-is when passed an ExURL', () => {
13
+ const exUrl = parseUrl('https://example.com')!;
14
+ const result = parseUrl(exUrl);
15
+ expect(result).toBe(exUrl);
16
+ });
17
+
18
+ it('returns null for fragment-only string', () => {
19
+ const result = parseUrl('#fragment');
20
+ expect(result).toBeNull();
21
+ });
22
+
23
+ it('returns non-null for tel: URL (has protocol)', () => {
24
+ const result = parseUrl('tel:000-0000-0000');
25
+ // tel: URL has protocol set, so parseUrl does not filter it out
26
+ expect(result).not.toBeNull();
27
+ expect(result!.protocol).toBe('tel:');
28
+ });
29
+
30
+ it('parses http URL', () => {
31
+ const result = parseUrl('http://example.com');
32
+ expect(result).not.toBeNull();
33
+ expect(result!.protocol).toBe('http:');
34
+ });
35
+ });
@@ -0,0 +1,26 @@
1
+ import type { ExURL, ParseURLOptions } from '@d-zero/shared/parse-url';
2
+
3
+ import { parseUrl as sharedParseUrl } from '@d-zero/shared/parse-url';
4
+
5
+ /**
6
+ * Parses a URL string into an ExURL object, filtering out non-HTTP URLs
7
+ * that lack a hostname and protocol. If the input is already an ExURL object,
8
+ * it is returned as-is without re-parsing.
9
+ *
10
+ * WHY null return: Bare fragment-only strings (e.g. `"#section"`) and
11
+ * protocol-relative paths without a host are not meaningful URLs for crawling.
12
+ * @param url - A URL string or an already-parsed ExURL object
13
+ * @param options - URL parsing options (e.g. `disableQueries` to strip query strings)
14
+ * @returns The parsed ExURL, or `null` if the URL is not navigable
15
+ * @see `@d-zero/shared/parse-url` for the underlying parser
16
+ */
17
+ export function parseUrl(url: string | ExURL, options?: ParseURLOptions): ExURL | null {
18
+ if (typeof url !== 'string') {
19
+ return url;
20
+ }
21
+ const result = sharedParseUrl(url, options);
22
+ if (!result.isHTTP && !result.hostname && !result.protocol) {
23
+ return null;
24
+ }
25
+ return result;
26
+ }