npm - @nitpicker/crawler - Versions diffs - 0.4.2 → 0.4.4 - Mend

@nitpicker/crawler 0.4.2 → 0.4.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (122)

package/lib/archive/archive-accessor.d.ts +6 -1
package/lib/archive/archive-accessor.js +7 -0
package/lib/archive/database.js +2 -1
package/package.json +5 -2
package/CHANGELOG.md +0 -16
package/src/archive/__mock__/.gitignore +0 -3
package/src/archive/__mock__/mock.sqlite +0 -0
package/src/archive/archive-accessor.ts +0 -337
package/src/archive/archive.ts +0 -408
package/src/archive/database.spec.ts +0 -469
package/src/archive/database.ts +0 -1059
package/src/archive/debug.ts +0 -10
package/src/archive/filesystem/append-text.spec.ts +0 -26
package/src/archive/filesystem/append-text.ts +0 -16
package/src/archive/filesystem/copy-dir-sync.spec.ts +0 -27
package/src/archive/filesystem/copy-dir-sync.ts +0 -10
package/src/archive/filesystem/copy-dir.spec.ts +0 -33
package/src/archive/filesystem/copy-dir.ts +0 -14
package/src/archive/filesystem/exists.spec.ts +0 -33
package/src/archive/filesystem/exists.ts +0 -10
package/src/archive/filesystem/get-file-list.spec.ts +0 -37
package/src/archive/filesystem/get-file-list.ts +0 -13
package/src/archive/filesystem/index.ts +0 -17
package/src/archive/filesystem/is-dir.spec.ts +0 -29
package/src/archive/filesystem/is-dir.ts +0 -11
package/src/archive/filesystem/mkdir.spec.ts +0 -37
package/src/archive/filesystem/mkdir.ts +0 -16
package/src/archive/filesystem/output-json.spec.ts +0 -34
package/src/archive/filesystem/output-json.ts +0 -16
package/src/archive/filesystem/output-text.spec.ts +0 -31
package/src/archive/filesystem/output-text.ts +0 -35
package/src/archive/filesystem/read-json.spec.ts +0 -26
package/src/archive/filesystem/read-json.ts +0 -12
package/src/archive/filesystem/read-text.spec.ts +0 -25
package/src/archive/filesystem/read-text.ts +0 -11
package/src/archive/filesystem/readline.spec.ts +0 -29
package/src/archive/filesystem/readline.ts +0 -30
package/src/archive/filesystem/remove.spec.ts +0 -34
package/src/archive/filesystem/remove.ts +0 -11
package/src/archive/filesystem/rename.spec.ts +0 -46
package/src/archive/filesystem/rename.ts +0 -21
package/src/archive/filesystem/tar.spec.ts +0 -33
package/src/archive/filesystem/tar.ts +0 -27
package/src/archive/filesystem/untar.spec.ts +0 -34
package/src/archive/filesystem/untar.ts +0 -36
package/src/archive/index.ts +0 -13
package/src/archive/page.spec.ts +0 -368
package/src/archive/page.ts +0 -420
package/src/archive/resource.spec.ts +0 -101
package/src/archive/resource.ts +0 -73
package/src/archive/safe-path.spec.ts +0 -44
package/src/archive/safe-path.ts +0 -18
package/src/archive/types.ts +0 -227
package/src/crawler/clear-destination-cache.spec.ts +0 -20
package/src/crawler/clear-destination-cache.ts +0 -9
package/src/crawler/crawler.ts +0 -873
package/src/crawler/decompose-url.spec.ts +0 -48
package/src/crawler/decompose-url.ts +0 -90
package/src/crawler/destination-cache.spec.ts +0 -23
package/src/crawler/destination-cache.ts +0 -8
package/src/crawler/detect-pagination-pattern.spec.ts +0 -169
package/src/crawler/detect-pagination-pattern.ts +0 -66
package/src/crawler/fetch-destination.ts +0 -257
package/src/crawler/fetch-robots-txt.spec.ts +0 -83
package/src/crawler/fetch-robots-txt.ts +0 -91
package/src/crawler/find-best-matching-scope.spec.ts +0 -39
package/src/crawler/find-best-matching-scope.ts +0 -57
package/src/crawler/generate-predicted-urls.spec.ts +0 -42
package/src/crawler/generate-predicted-urls.ts +0 -34
package/src/crawler/handle-ignore-and-skip.spec.ts +0 -66
package/src/crawler/handle-ignore-and-skip.ts +0 -30
package/src/crawler/handle-resource-response.spec.ts +0 -45
package/src/crawler/handle-resource-response.ts +0 -21
package/src/crawler/handle-scrape-end.spec.ts +0 -109
package/src/crawler/handle-scrape-end.ts +0 -115
package/src/crawler/handle-scrape-error.spec.ts +0 -105
package/src/crawler/handle-scrape-error.ts +0 -58
package/src/crawler/index.ts +0 -2
package/src/crawler/inject-scope-auth.spec.ts +0 -36
package/src/crawler/inject-scope-auth.ts +0 -27
package/src/crawler/is-external-url.spec.ts +0 -31
package/src/crawler/is-external-url.ts +0 -17
package/src/crawler/is-in-any-lower-layer.spec.ts +0 -31
package/src/crawler/is-in-any-lower-layer.ts +0 -22
package/src/crawler/link-list.spec.ts +0 -355
package/src/crawler/link-list.ts +0 -275
package/src/crawler/link-to-page-data.spec.ts +0 -133
package/src/crawler/link-to-page-data.ts +0 -34
package/src/crawler/net-timeout-error.spec.ts +0 -25
package/src/crawler/net-timeout-error.ts +0 -11
package/src/crawler/protocol-agnostic-key.spec.ts +0 -40
package/src/crawler/protocol-agnostic-key.ts +0 -11
package/src/crawler/reconstruct-url.spec.ts +0 -37
package/src/crawler/reconstruct-url.ts +0 -37
package/src/crawler/robots-checker.spec.ts +0 -104
package/src/crawler/robots-checker.ts +0 -73
package/src/crawler/should-discard-predicted.spec.ts +0 -125
package/src/crawler/should-discard-predicted.ts +0 -33
package/src/crawler/should-skip-url.spec.ts +0 -77
package/src/crawler/should-skip-url.ts +0 -37
package/src/crawler/types.ts +0 -146
package/src/crawler-orchestrator.ts +0 -401
package/src/debug.ts +0 -10
package/src/index.ts +0 -25
package/src/types.ts +0 -30
package/src/utils/array/each-splitted.spec.ts +0 -38
package/src/utils/array/each-splitted.ts +0 -19
package/src/utils/array/index.ts +0 -1
package/src/utils/debug.ts +0 -6
package/src/utils/error/dom-evaluation-error.spec.ts +0 -20
package/src/utils/error/dom-evaluation-error.ts +0 -6
package/src/utils/error/error-emitter.spec.ts +0 -78
package/src/utils/error/error-emitter.ts +0 -44
package/src/utils/error/index.ts +0 -3
package/src/utils/index.ts +0 -5
package/src/utils/object/clean-object.spec.ts +0 -24
package/src/utils/object/clean-object.ts +0 -13
package/src/utils/object/index.ts +0 -1
package/src/utils/types/index.ts +0 -1
package/src/utils/types/types.ts +0 -65
package/tsconfig.json +0 -11
package/tsconfig.tsbuildinfo +0 -1

package/src/crawler/decompose-url.spec.ts DELETED Viewed

@@ -1,48 +0,0 @@
-import { describe, it, expect } from 'vitest';
-import { decomposeUrl } from './decompose-url.js';
-describe('decomposeUrl', () => {
-	/**
-	 * Runs `decomposeUrl`, asserts that the input was accepted, and hands the
-	 * non-null result back so each test can inspect individual fields.
-	 */
-	const parse = (input: string) => {
-		const decomposed = decomposeUrl(input);
-		expect(decomposed).not.toBeNull();
-		return decomposed!;
-	};
-	it('decomposes a full URL with path and query', () => {
-		const { host, pathSegments, queryKeys, queryValues, protocol } = parse(
-			'https://example.com/page/2?sort=name&p=1',
-		);
-		expect(host).toBe('example.com');
-		expect(pathSegments).toEqual(['page', '2']);
-		// Keys come back sorted; values follow the order of their keys.
-		expect(queryKeys).toEqual(['p', 'sort']);
-		expect(queryValues).toEqual(['1', 'name']);
-		expect(protocol).toBe('https:');
-	});
-	it('decomposes a protocol-agnostic URL', () => {
-		const { host, pathSegments, protocol } = parse('//example.com/page/1');
-		expect(host).toBe('example.com');
-		expect(pathSegments).toEqual(['page', '1']);
-		expect(protocol).toBe('');
-	});
-	it('decomposes URL with port', () => {
-		const { host } = parse('//example.com:8080/page/1');
-		expect(host).toBe('example.com:8080');
-	});
-	it('returns null for invalid URL format', () => {
-		const rejected = decomposeUrl('not-a-url');
-		expect(rejected).toBeNull();
-	});
-	it('handles URL with query only (no path)', () => {
-		const { pathSegments, queryKeys, queryValues } = parse('//example.com?offset=0');
-		expect(pathSegments).toEqual([]);
-		expect(queryKeys).toEqual(['offset']);
-		expect(queryValues).toEqual(['0']);
-	});
-	it('handles URL with no path and no query', () => {
-		const { pathSegments, queryKeys } = parse('//example.com');
-		expect(pathSegments).toEqual([]);
-		expect(queryKeys).toEqual([]);
-	});
-});

package/src/crawler/decompose-url.ts DELETED Viewed

@@ -1,90 +0,0 @@
-/**
- * Intermediate representation of a URL split into comparable tokens.
- * Used by pagination detection to identify which token changed between two URLs.
- */
-export interface DecomposedUrl {
-	/** Everything between `//` and the first `/` or `?`, so a port is included (e.g. `"example.com:8080"`). */
-	host: string;
-	/** Path segments split by `/` (e.g. `["page", "2"]` for `/page/2`); empty when there is no path. */
-	pathSegments: string[];
-	/** Query parameter keys, sorted with `localeCompare`. */
-	queryKeys: string[];
-	/** Query parameter values, in the same order as {@link DecomposedUrl.queryKeys}. */
-	queryValues: string[];
-	/** Protocol prefix (`"http:"` or `"https:"`), or an empty string for a protocol-agnostic URL. */
-	protocol: string;
-}
-/**
- * Decomposes a URL string into its constituent tokens for comparison.
- *
- * Accepts full URLs (`https://host/path?q=v`) and protocol-agnostic URLs
- * (`//host/path?q=v`). Values are taken verbatim: nothing is percent-decoded,
- * and a parameter without `=` gets an empty-string value. Query parameters are
- * sorted by key so that two URLs differing only in parameter order compare equal.
- *
- * The query is split off at the first `?` BEFORE the host/path boundary is
- * searched. A slash inside a query value (e.g. `//host?redirect=/next`) must
- * not be mistaken for the start of the path.
- * @param url - The URL string to decompose
- * @returns The decomposed URL, or `null` if it does not start with `//`, `http://` or `https://`
- */
-export function decomposeUrl(url: string): DecomposedUrl | null {
-	const protoMatch = /^(https?:)?\/\//.exec(url);
-	if (!protoMatch) return null;
-	const protocol = protoMatch[1] ?? '';
-	const afterScheme = url.slice(protoMatch[0].length);
-	// Separate the query first; only the remainder may contain the host/path slash.
-	const qmarkIdx = afterScheme.indexOf('?');
-	const hostAndPath = qmarkIdx === -1 ? afterScheme : afterScheme.slice(0, qmarkIdx);
-	const queryPart = qmarkIdx === -1 ? '' : afterScheme.slice(qmarkIdx + 1);
-	const slashIdx = hostAndPath.indexOf('/');
-	const host = slashIdx === -1 ? hostAndPath : hostAndPath.slice(0, slashIdx);
-	const pathPart = slashIdx === -1 ? '' : hostAndPath.slice(slashIdx + 1);
-	const pathSegments = pathPart ? pathPart.split('/') : [];
-	// Parse the query into key/value pairs, then sort them by key.
-	const queryPairs: [string, string][] = [];
-	if (queryPart) {
-		for (const pair of queryPart.split('&')) {
-			const eqIdx = pair.indexOf('=');
-			if (eqIdx === -1) {
-				// A bare key (`?flag`) is recorded with an empty value.
-				queryPairs.push([pair, '']);
-			} else {
-				queryPairs.push([pair.slice(0, eqIdx), pair.slice(eqIdx + 1)]);
-			}
-		}
-	}
-	queryPairs.sort((a, b) => a[0].localeCompare(b[0]));
-	return {
-		host,
-		pathSegments,
-		queryKeys: queryPairs.map(([k]) => k),
-		queryValues: queryPairs.map(([, v]) => v),
-		protocol,
-	};
-}

package/src/crawler/destination-cache.spec.ts DELETED Viewed

@@ -1,23 +0,0 @@
-import { describe, it, expect } from 'vitest';
-import { destinationCache } from './destination-cache.js';
-describe('destinationCache', () => {
-	it('is a Map instance', () => {
-		expect(destinationCache).toBeInstanceOf(Map);
-	});
-	it('supports set and get operations', () => {
-		const key = 'test-key';
-		destinationCache.set(key, new Error('test'));
-		expect(destinationCache.has(key)).toBe(true);
-		expect(destinationCache.get(key)).toBeInstanceOf(Error);
-		// The cache is a shared module-level object, so remove the entry again.
-		destinationCache.delete(key);
-	});
-	it('supports clear operation', () => {
-		const entries = [
-			['key1', 'a'],
-			['key2', 'b'],
-		] as const;
-		for (const [key, message] of entries) {
-			destinationCache.set(key, new Error(message));
-		}
-		destinationCache.clear();
-		expect(destinationCache.size).toBe(0);
-	});
-});

package/src/crawler/destination-cache.ts DELETED Viewed

@@ -1,8 +0,0 @@
-import type { PageData } from '@d-zero/beholder';
-/**
- * Module-level, in-memory cache of destination lookups.
- *
- * Each entry holds either the {@link PageData} of a successful request or the
- * {@link Error} it failed with, so a destination that was already resolved
- * (or already failed) is not requested again.
- */
-export const destinationCache: Map<string, PageData | Error> = new Map();

package/src/crawler/detect-pagination-pattern.spec.ts DELETED Viewed

@@ -1,169 +0,0 @@
-import { describe, expect, it } from 'vitest';
-import { detectPaginationPattern } from './detect-pagination-pattern.js';
-describe('detectPaginationPattern', () => {
-	/** Shape of a successful detection result. */
-	type Expected = { tokenIndex: number; step: number; currentNumber: number };
-	/** Row layout: test title, previous URL, current URL, expected pattern. */
-	type DetectedRow = readonly [string, string, string, Expected];
-	/** Row layout: test title, previous URL, current URL. */
-	type RejectedRow = readonly [string, string, string];
-	/** Registers one test per row asserting that the URL pair yields `expected`. */
-	const expectDetected = (rows: readonly DetectedRow[]): void => {
-		for (const [title, prevUrl, currentUrl, expected] of rows) {
-			it(title, () => {
-				expect(detectPaginationPattern(prevUrl, currentUrl)).toEqual(expected);
-			});
-		}
-	};
-	/** Registers one test per row asserting that the URL pair yields `null`. */
-	const expectRejected = (rows: readonly RejectedRow[]): void => {
-		for (const [title, prevUrl, currentUrl] of rows) {
-			it(title, () => {
-				expect(detectPaginationPattern(prevUrl, currentUrl)).toBeNull();
-			});
-		}
-	};
-	// Cases where a pattern is detected.
-	describe('正常検出ケース', () => {
-		expectDetected([
-			// Numeric difference in a path segment.
-			[
-				'パスセグメントの数値差異を検出する',
-				'//example.com/page/2',
-				'//example.com/page/3',
-				{ tokenIndex: 1, step: 1, currentNumber: 3 },
-			],
-			// Numeric difference in the trailing path segment.
-			[
-				'末尾パスの数値差異を検出する',
-				'//example.com/blog/2',
-				'//example.com/blog/3',
-				{ tokenIndex: 1, step: 1, currentNumber: 3 },
-			],
-			// Numeric difference in the middle of a deeper path.
-			[
-				'深いパスの数値差異を検出する',
-				'//example.com/a/b/2/c',
-				'//example.com/a/b/3/c',
-				{ tokenIndex: 2, step: 1, currentNumber: 3 },
-			],
-			// Numeric difference in a query parameter.
-			[
-				'クエリパラメータの数値差異を検出する',
-				'//example.com/list?p=1&sort=name',
-				'//example.com/list?p=2&sort=name',
-				{ tokenIndex: 1, step: 1, currentNumber: 2 },
-			],
-			// Step larger than one.
-			[
-				'step > 1 を検出する',
-				'//example.com/page/10',
-				'//example.com/page/20',
-				{ tokenIndex: 1, step: 10, currentNumber: 20 },
-			],
-			// Zero-based pagination.
-			[
-				'0始まりページネーションを検出する',
-				'//example.com/page/0',
-				'//example.com/page/1',
-				{ tokenIndex: 1, step: 1, currentNumber: 1 },
-			],
-			// Larger page numbers.
-			[
-				'大きい数値でも検出する',
-				'//example.com/items/100',
-				'//example.com/items/101',
-				{ tokenIndex: 1, step: 1, currentNumber: 101 },
-			],
-			// URL with an explicit port.
-			[
-				'ポート付きURLを検出する',
-				'//example.com:8080/page/1',
-				'//example.com:8080/page/2',
-				{ tokenIndex: 1, step: 1, currentNumber: 2 },
-			],
-			// Query only, no path.
-			[
-				'クエリのみ（パスなし）を検出する',
-				'//example.com?offset=0',
-				'//example.com?offset=10',
-				{ tokenIndex: 0, step: 10, currentNumber: 10 },
-			],
-		]);
-	});
-	// Cases that must return null.
-	describe('null を返すケース', () => {
-		expectRejected([
-			// Different host names.
-			['ホスト名が異なる場合', '//example.com/page/2', '//other.com/page/3'],
-			// Different path lengths.
-			['パスの長さが異なる場合', '//example.com/page/2', '//example.com/page/2/extra'],
-			// The differing token is not numeric.
-			['非数値の差異がある場合', '//example.com/page/a', '//example.com/page/b'],
-			// More than one numeric token differs.
-			[
-				'複数箇所の数値差異がある場合',
-				'//example.com/1/page/2',
-				'//example.com/2/page/3',
-			],
-			// One numeric and one non-numeric token differ.
-			['数値 + 非数値の差異がある場合', '//example.com/a/2', '//example.com/b/3'],
-			// Step of zero (identical URLs).
-			['step が 0（同一URL）の場合', '//example.com/page/3', '//example.com/page/3'],
-			// Negative step (decrement).
-			['step が負（デクリメント）の場合', '//example.com/page/5', '//example.com/page/3'],
-			// Query key sets differ.
-			[
-				'クエリのキーセットが異なる場合',
-				'//example.com/list?p=1&a=x',
-				'//example.com/list?p=2&b=y',
-			],
-			// Query key counts differ.
-			[
-				'クエリのキー数が異なる場合',
-				'//example.com/list?p=1',
-				'//example.com/list?p=2&extra=1',
-			],
-			// Fully identical URLs without any numeric difference.
-			[
-				'プロトコル以外完全一致（数値差異なし）の場合',
-				'//example.com/about',
-				'//example.com/about',
-			],
-			// Both URLs have an empty path.
-			['空パス同士の場合', '//example.com', '//example.com'],
-		]);
-	});
-	// Boundary cases.
-	describe('境界値ケース', () => {
-		expectDetected([
-			// Very large numbers still work.
-			[
-				'非常に大きな数値でも動作する',
-				'//example.com/page/999999',
-				'//example.com/page/1000000',
-				{ tokenIndex: 1, step: 1, currentNumber: 1_000_000 },
-			],
-			// A fixed numeric segment elsewhere in the path is ignored; only the changed token is reported.
-			[
-				'パス内に固定数値セグメントがあっても変化箇所のみ検出する',
-				'//example.com/v2/page/3',
-				'//example.com/v2/page/4',
-				{ tokenIndex: 2, step: 1, currentNumber: 4 },
-			],
-		]);
-	});
-});

package/src/crawler/detect-pagination-pattern.ts DELETED Viewed

@@ -1,66 +0,0 @@
-import type { PaginationPattern } from './types.js';
-import { decomposeUrl } from './decompose-url.js';
-/**
- * Compares two consecutive URL strings and detects a single-token numeric
- * pagination pattern (e.g. `/page/1` → `/page/2`, or `?p=1` → `?p=2`).
- *
- * The algorithm decomposes each URL into tokens (path segments followed by the
- * query values in sorted-key order), then checks that exactly one token differs
- * and that both values are integers with a positive step. Returns `null` when
- * no pattern is detected.
- *
- * WHY single-token constraint: Multi-token differences (e.g. both path and query
- * changing) indicate different routes rather than pagination, so they are rejected.
- *
- * WHY the token is matched against a digit pattern before conversion: `Number()`
- * also accepts strings that are not page numbers in a URL. An empty segment
- * becomes `0`, and forms such as `1e3`, `0x10` or whitespace-padded digits are
- * coerced too. Without the check, `/page/` → `/page/1` would be reported as a step of 1.
- * @param prevUrl - The previously pushed URL (protocol-agnostic, without hash/auth)
- * @param currentUrl - The newly discovered URL
- * @returns The detected pattern, or `null` if no pagination pattern was found
- */
-export function detectPaginationPattern(
-	prevUrl: string,
-	currentUrl: string,
-): PaginationPattern | null {
-	const prev = decomposeUrl(prevUrl);
-	const curr = decomposeUrl(currentUrl);
-	if (!prev || !curr) return null;
-	// Host (including port) must match
-	if (prev.host !== curr.host) return null;
-	// Path segment count must match
-	if (prev.pathSegments.length !== curr.pathSegments.length) return null;
-	// Query key sets must match in count and identity
-	if (prev.queryKeys.length !== curr.queryKeys.length) return null;
-	if (prev.queryKeys.some((key, i) => key !== curr.queryKeys[i])) return null;
-	// Build combined token arrays: path segments + query values (sorted by key).
-	// Both arrays have the same length because the counts were checked above.
-	const prevTokens = [...prev.pathSegments, ...prev.queryValues];
-	const currTokens = [...curr.pathSegments, ...curr.queryValues];
-	let diffIndex = -1;
-	for (const [i, prevToken] of prevTokens.entries()) {
-		if (prevToken !== currTokens[i]) {
-			if (diffIndex !== -1) return null; // more than one difference
-			diffIndex = i;
-		}
-	}
-	if (diffIndex === -1) return null; // identical URLs
-	const prevRaw = prevTokens[diffIndex];
-	const currRaw = currTokens[diffIndex];
-	if (prevRaw === undefined || currRaw === undefined) return null;
-	// Only plain decimal integers count as page numbers.
-	const integerToken = /^-?\d+$/;
-	if (!integerToken.test(prevRaw) || !integerToken.test(currRaw)) return null;
-	const prevNum = Number(prevRaw);
-	const currNum = Number(currRaw);
-	// Beyond the safe-integer range the step can no longer be computed exactly.
-	if (!Number.isSafeInteger(prevNum) || !Number.isSafeInteger(currNum)) return null;
-	const step = currNum - prevNum;
-	if (step <= 0) return null;
-	return {
-		tokenIndex: diffIndex,
-		step,
-		currentNumber: currNum,
-	};
-}

package/src/crawler/fetch-destination.ts DELETED Viewed

@@ -1,257 +0,0 @@
-import type { PageData } from '@d-zero/beholder';
-import type { ExURL } from '@d-zero/shared/parse-url';
-import type { FollowResponse, RedirectableRequest } from 'follow-redirects';
-import type { ClientRequest, IncomingMessage, RequestOptions } from 'node:http';
-import { delay } from '@d-zero/shared/delay';
-import redirects from 'follow-redirects';
-import { destinationCache } from './destination-cache.js';
-import NetTimeoutError from './net-timeout-error.js';
-/**
- * Parameters for {@link fetchDestination}.
- */
-export interface FetchDestinationParams {
-	/** The extended URL to fetch. Its `withoutHash` form is used as the cache key. */
-	readonly url: ExURL;
-	/**
-	 * Whether the URL is external to the crawl scope. Copied to `isExternal` on
-	 * the returned page data; `isTarget` is set to its negation.
-	 */
-	readonly isExternal: boolean;
-	/**
-	 * The HTTP method to use. Defaults to `"HEAD"`. Ignored when
-	 * `options.titleBytesLimit` is set, because GET is used in that case.
-	 */
-	readonly method?: string;
-	/** Additional options. */
-	readonly options?: {
-		/**
-		 * When set, forces a GET request and reads up to this many bytes from
-		 * the response body to extract an HTML `<title>` tag. Results obtained
-		 * this way are cached under a separate key (`:title` suffix).
-		 */
-		titleBytesLimit?: number;
-	};
-	/** User-Agent string to send with the request. No `User-Agent` header is added when omitted or empty. */
-	readonly userAgent?: string;
-}
-/**
- * Fetches the destination metadata for a URL using an HTTP HEAD request (or GET as fallback).
- *
- * Results, failures included, are cached in memory so that repeated calls for
- * the same URL (without hash) return or throw immediately. Lookups made with
- * `options.titleBytesLimit` use a separate cache entry (`:title` suffix) and
- * always use GET.
- *
- * The request races against a 10-second timeout; if the server does not respond
- * in time, a {@link NetTimeoutError} is thrown. The timeout only stops the wait:
- * nothing in this function aborts the underlying request. The timer is cleared
- * as soon as the race settles, so a finished lookup does not leave a pending
- * 10-second timer behind.
- *
- * If the server returns 405 (Method Not Allowed), 501 (Not Implemented), or 503
- * (Service Unavailable) for a HEAD request, the function automatically retries with GET.
- * @param params - Parameters containing URL, external flag, method, options, and optional User-Agent.
- * @returns The page metadata obtained from the HTTP response.
- * @throws {NetTimeoutError} If the request exceeds the 10-second timeout.
- * @throws {Error} If the HTTP request fails for any other reason.
- */
-export async function fetchDestination(
-	params: FetchDestinationParams,
-): Promise<PageData> {
-	const { url, isExternal, method = 'HEAD', options, userAgent } = params;
-	const titleBytesLimit = options?.titleBytesLimit;
-	const cacheKey = titleBytesLimit == null ? url.withoutHash : `${url.withoutHash}:title`;
-	const cached = destinationCache.get(cacheKey);
-	if (cached !== undefined) {
-		// A cached failure is replayed as a throw, a cached success as the return value.
-		if (cached instanceof Error) {
-			throw cached;
-		}
-		return cached;
-	}
-	const timeoutMs = 10 * 1000;
-	// Extracting a title needs the response body, so HEAD cannot be used.
-	const effectiveMethod = titleBytesLimit == null ? method : 'GET';
-	// Failures are converted into values so that they can be cached like successes.
-	const request = _fetchHead(
-		url,
-		isExternal,
-		effectiveMethod,
-		titleBytesLimit,
-		userAgent,
-	).catch((error: unknown) => (error instanceof Error ? error : new Error(String(error))));
-	let timer: ReturnType<typeof setTimeout> | undefined;
-	const timeout = new Promise<NetTimeoutError>((resolve) => {
-		timer = setTimeout(() => {
-			resolve(new NetTimeoutError(url.href));
-		}, timeoutMs);
-	});
-	let result: PageData | Error;
-	try {
-		result = await Promise.race([request, timeout]);
-	} finally {
-		// Cancel the timer whichever side won, so it cannot keep the event loop alive.
-		clearTimeout(timer);
-	}
-	destinationCache.set(cacheKey, result);
-	if (result instanceof Error) {
-		throw result;
-	}
-	return result;
-}
-/**
- * Response statuses that trigger a GET fallback, checked in this order.
- * `label` is the prefix of the error raised when the request already was a GET;
- * `waitMs` is the pause before the fallback request is sent (0 = no pause).
- */
-const GET_FALLBACK_RULES = [
-	{ status: 405, label: 'Method Not Allowed', waitMs: 0 },
-	{ status: 501, label: 'Method Not Implemented', waitMs: 5 * 1000 },
-	{ status: 503, label: 'Retrying failed', waitMs: 5 * 1000 },
-] as const;
-/** Matches the first `<title>` element of an HTML document, capturing its text. */
-const TITLE_PATTERN = /<title[^>]*>([\s\S]*?)<\/title>/i;
-/**
- * Performs the actual HTTP request to retrieve page metadata.
- *
- * Handles both HTTP and HTTPS protocols via `follow-redirects`, tracks redirect chains,
- * and falls back to GET on certain status codes (405, 501, 503). The fallback goes
- * through {@link fetchDestination} and forwards `userAgent`, so the retry is sent
- * with the same User-Agent as the first attempt.
- * @param url - The extended URL to request.
- * @param isExternal - Whether the URL is external to the crawl scope.
- * @param method - The HTTP method (`"HEAD"` or `"GET"`).
- * @param titleBytesLimit - When set, reads up to this many bytes from the response body
- *   to extract a `<title>` tag, then destroys the connection.
- * @param userAgent - Optional User-Agent string to send with the request.
- * @returns A promise resolving to {@link PageData} with response metadata.
- */
-async function _fetchHead(
-	url: ExURL,
-	isExternal: boolean,
-	method: string,
-	titleBytesLimit?: number,
-	userAgent?: string,
-) {
-	return new Promise<PageData>((resolve, reject) => {
-		const hostHeader = url.port ? `${url.hostname}:${url.port}` : url.hostname;
-		const request: RequestOptions = {
-			protocol: url.protocol,
-			hostname: url.hostname,
-			port: url.port || undefined,
-			// NOTE(review): only `pathname` is sent. If ExURL keeps the query string in a
-			// separate property (as the standard URL class does), it is dropped here — confirm.
-			path: url.pathname,
-			method,
-			headers: {
-				host: hostHeader,
-				...(userAgent ? { 'User-Agent': userAgent } : {}),
-				Connection: 'keep-alive',
-				Pragma: 'no-cache',
-				'Cache-Control': 'no-cache',
-				'Upgrade-Insecure-Requests': 1,
-				Accept:
-					'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
-				'Accept-Encoding': 'gzip, deflate',
-				'Accept-Language':
-					'ja,en;q=0.9,zh;q=0.8,en-US;q=0.7,pl;q=0.6,de;q=0.5,zh-CN;q=0.4,zh-TW;q=0.3,th;q=0.2,ko;q=0.1,fr;q=0.1',
-				// Range: url.extname?.toLowerCase() === 'pdf' ? 'bytes=0-0' : undefined,
-			},
-		};
-		// Credentials are only forwarded when both parts are present.
-		if (url.username && url.password) {
-			request.auth = `${url.username}:${url.password}`;
-		}
-		let req: RedirectableRequest<ClientRequest, IncomingMessage>;
-		// Set right before an intentional `req.destroy()` so the resulting error is ignored.
-		let destroyed = false;
-		const response = (res: IncomingMessage & FollowResponse) => {
-			const chunks: Buffer[] = [];
-			let totalBytes = 0;
-			let settled = false;
-			/** Builds the result from the response headers; body-derived fields stay empty. */
-			const buildPageData = (title: string): PageData => {
-				const redirectPaths = res.redirects.map((r) => r.url);
-				const _contentLength = Number.parseInt(res.headers['content-length'] || '', 10);
-				const contentLength = Number.isFinite(_contentLength) ? _contentLength : null;
-				return {
-					url,
-					isTarget: !isExternal,
-					isExternal,
-					redirectPaths,
-					status: res.statusCode || 0,
-					statusText: res.statusMessage || '',
-					contentType: res.headers['content-type']?.split(';')[0] || null,
-					contentLength,
-					responseHeaders: res.headers,
-					meta: { title },
-					imageList: [],
-					anchorList: [],
-					html: '',
-					isSkipped: false,
-				};
-			};
-			if (titleBytesLimit == null) {
-				// The body is not needed; consume it so that `end` fires.
-				res.on('data', () => {});
-				res.on('end', async () => {
-					let rep = buildPageData('');
-					// Each rule is checked against the latest result, so a fallback response
-					// that itself matches a later rule is handled by that rule as well.
-					for (const rule of GET_FALLBACK_RULES) {
-						if (rep.status !== rule.status) continue;
-						if (method === 'GET') {
-							// Already a GET: there is nothing left to fall back to.
-							reject(new Error(`${rule.label}: ${url.href} ${rep.statusText}`));
-							return;
-						}
-						if (rule.waitMs > 0) {
-							await delay(rule.waitMs);
-						}
-						try {
-							rep = await fetchDestination({ url, isExternal, method: 'GET', userAgent });
-						} catch (error) {
-							reject(error);
-							return;
-						}
-					}
-					resolve(rep);
-				});
-			} else {
-				/** Returns the trimmed title found in the data received so far, or `null`. */
-				const findTitle = (): string | null => {
-					const body = Buffer.concat(chunks).toString('utf8');
-					const titleMatch = TITLE_PATTERN.exec(body);
-					return titleMatch ? (titleMatch[1]?.trim() ?? '') : null;
-				};
-				/** Resolves with `title` and tears the connection down; the rest of the body is not needed. */
-				const finishEarly = (title: string): void => {
-					settled = true;
-					resolve(buildPageData(title));
-					destroyed = true;
-					req.destroy();
-				};
-				res.on('data', (chunk: Buffer) => {
-					if (settled) return;
-					chunks.push(chunk);
-					totalBytes += chunk.length;
-					// Check for title in accumulated data so far
-					const title = findTitle();
-					if (title !== null) {
-						finishEarly(title);
-						return;
-					}
-					// Reached byte limit without finding title
-					if (totalBytes >= titleBytesLimit) {
-						finishEarly('');
-					}
-				});
-				res.on('end', () => {
-					if (settled) return;
-					settled = true;
-					// Stream ended before limit — try to extract title from what we have
-					resolve(buildPageData(findTitle() ?? ''));
-				});
-			}
-		};
-		if (url.protocol === 'https:') {
-			req = redirects.https.request(request, response);
-		} else {
-			req = redirects.http.request(request, response);
-		}
-		req.on('error', (error) => {
-			// Ignore errors caused by intentional req.destroy()
-			if (destroyed) return;
-			reject(error);
-		});
-		req.end();
-	});
-}