npm - @nitpicker/crawler - Versions diffs - 0.4.2 → 0.4.3 - Mend

@nitpicker/crawler 0.4.2 → 0.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (119)

package/package.json +5 -2
package/CHANGELOG.md +0 -16
package/src/archive/__mock__/.gitignore +0 -3
package/src/archive/__mock__/mock.sqlite +0 -0
package/src/archive/archive-accessor.ts +0 -337
package/src/archive/archive.ts +0 -408
package/src/archive/database.spec.ts +0 -469
package/src/archive/database.ts +0 -1059
package/src/archive/debug.ts +0 -10
package/src/archive/filesystem/append-text.spec.ts +0 -26
package/src/archive/filesystem/append-text.ts +0 -16
package/src/archive/filesystem/copy-dir-sync.spec.ts +0 -27
package/src/archive/filesystem/copy-dir-sync.ts +0 -10
package/src/archive/filesystem/copy-dir.spec.ts +0 -33
package/src/archive/filesystem/copy-dir.ts +0 -14
package/src/archive/filesystem/exists.spec.ts +0 -33
package/src/archive/filesystem/exists.ts +0 -10
package/src/archive/filesystem/get-file-list.spec.ts +0 -37
package/src/archive/filesystem/get-file-list.ts +0 -13
package/src/archive/filesystem/index.ts +0 -17
package/src/archive/filesystem/is-dir.spec.ts +0 -29
package/src/archive/filesystem/is-dir.ts +0 -11
package/src/archive/filesystem/mkdir.spec.ts +0 -37
package/src/archive/filesystem/mkdir.ts +0 -16
package/src/archive/filesystem/output-json.spec.ts +0 -34
package/src/archive/filesystem/output-json.ts +0 -16
package/src/archive/filesystem/output-text.spec.ts +0 -31
package/src/archive/filesystem/output-text.ts +0 -35
package/src/archive/filesystem/read-json.spec.ts +0 -26
package/src/archive/filesystem/read-json.ts +0 -12
package/src/archive/filesystem/read-text.spec.ts +0 -25
package/src/archive/filesystem/read-text.ts +0 -11
package/src/archive/filesystem/readline.spec.ts +0 -29
package/src/archive/filesystem/readline.ts +0 -30
package/src/archive/filesystem/remove.spec.ts +0 -34
package/src/archive/filesystem/remove.ts +0 -11
package/src/archive/filesystem/rename.spec.ts +0 -46
package/src/archive/filesystem/rename.ts +0 -21
package/src/archive/filesystem/tar.spec.ts +0 -33
package/src/archive/filesystem/tar.ts +0 -27
package/src/archive/filesystem/untar.spec.ts +0 -34
package/src/archive/filesystem/untar.ts +0 -36
package/src/archive/index.ts +0 -13
package/src/archive/page.spec.ts +0 -368
package/src/archive/page.ts +0 -420
package/src/archive/resource.spec.ts +0 -101
package/src/archive/resource.ts +0 -73
package/src/archive/safe-path.spec.ts +0 -44
package/src/archive/safe-path.ts +0 -18
package/src/archive/types.ts +0 -227
package/src/crawler/clear-destination-cache.spec.ts +0 -20
package/src/crawler/clear-destination-cache.ts +0 -9
package/src/crawler/crawler.ts +0 -873
package/src/crawler/decompose-url.spec.ts +0 -48
package/src/crawler/decompose-url.ts +0 -90
package/src/crawler/destination-cache.spec.ts +0 -23
package/src/crawler/destination-cache.ts +0 -8
package/src/crawler/detect-pagination-pattern.spec.ts +0 -169
package/src/crawler/detect-pagination-pattern.ts +0 -66
package/src/crawler/fetch-destination.ts +0 -257
package/src/crawler/fetch-robots-txt.spec.ts +0 -83
package/src/crawler/fetch-robots-txt.ts +0 -91
package/src/crawler/find-best-matching-scope.spec.ts +0 -39
package/src/crawler/find-best-matching-scope.ts +0 -57
package/src/crawler/generate-predicted-urls.spec.ts +0 -42
package/src/crawler/generate-predicted-urls.ts +0 -34
package/src/crawler/handle-ignore-and-skip.spec.ts +0 -66
package/src/crawler/handle-ignore-and-skip.ts +0 -30
package/src/crawler/handle-resource-response.spec.ts +0 -45
package/src/crawler/handle-resource-response.ts +0 -21
package/src/crawler/handle-scrape-end.spec.ts +0 -109
package/src/crawler/handle-scrape-end.ts +0 -115
package/src/crawler/handle-scrape-error.spec.ts +0 -105
package/src/crawler/handle-scrape-error.ts +0 -58
package/src/crawler/index.ts +0 -2
package/src/crawler/inject-scope-auth.spec.ts +0 -36
package/src/crawler/inject-scope-auth.ts +0 -27
package/src/crawler/is-external-url.spec.ts +0 -31
package/src/crawler/is-external-url.ts +0 -17
package/src/crawler/is-in-any-lower-layer.spec.ts +0 -31
package/src/crawler/is-in-any-lower-layer.ts +0 -22
package/src/crawler/link-list.spec.ts +0 -355
package/src/crawler/link-list.ts +0 -275
package/src/crawler/link-to-page-data.spec.ts +0 -133
package/src/crawler/link-to-page-data.ts +0 -34
package/src/crawler/net-timeout-error.spec.ts +0 -25
package/src/crawler/net-timeout-error.ts +0 -11
package/src/crawler/protocol-agnostic-key.spec.ts +0 -40
package/src/crawler/protocol-agnostic-key.ts +0 -11
package/src/crawler/reconstruct-url.spec.ts +0 -37
package/src/crawler/reconstruct-url.ts +0 -37
package/src/crawler/robots-checker.spec.ts +0 -104
package/src/crawler/robots-checker.ts +0 -73
package/src/crawler/should-discard-predicted.spec.ts +0 -125
package/src/crawler/should-discard-predicted.ts +0 -33
package/src/crawler/should-skip-url.spec.ts +0 -77
package/src/crawler/should-skip-url.ts +0 -37
package/src/crawler/types.ts +0 -146
package/src/crawler-orchestrator.ts +0 -401
package/src/debug.ts +0 -10
package/src/index.ts +0 -25
package/src/types.ts +0 -30
package/src/utils/array/each-splitted.spec.ts +0 -38
package/src/utils/array/each-splitted.ts +0 -19
package/src/utils/array/index.ts +0 -1
package/src/utils/debug.ts +0 -6
package/src/utils/error/dom-evaluation-error.spec.ts +0 -20
package/src/utils/error/dom-evaluation-error.ts +0 -6
package/src/utils/error/error-emitter.spec.ts +0 -78
package/src/utils/error/error-emitter.ts +0 -44
package/src/utils/error/index.ts +0 -3
package/src/utils/index.ts +0 -5
package/src/utils/object/clean-object.spec.ts +0 -24
package/src/utils/object/clean-object.ts +0 -13
package/src/utils/object/index.ts +0 -1
package/src/utils/types/index.ts +0 -1
package/src/utils/types/types.ts +0 -65
package/tsconfig.json +0 -11
package/tsconfig.tsbuildinfo +0 -1

package/src/crawler/fetch-robots-txt.spec.ts DELETED

@@ -1,83 +0,0 @@
-import http from 'node:http';
-import { describe, it, expect, beforeAll, afterAll } from 'vitest';
-import { fetchRobotsTxt } from './fetch-robots-txt.js';
-const ROBOTS_TXT = `
-User-agent: *
-Disallow: /secret/
-Allow: /public/
-User-agent: Nitpicker
-Disallow: /admin/
-`;
-let server: http.Server;
-let port: number;
-beforeAll(async () => {
-	server = http.createServer((req, res) => {
-		if (req.url === '/robots.txt') {
-			res.writeHead(200, { 'Content-Type': 'text/plain' });
-			res.end(ROBOTS_TXT);
-		} else {
-			res.writeHead(404);
-			res.end();
-		}
-	});
-	await new Promise<void>((resolve) => {
-		server.listen(0, () => resolve());
-	});
-	const address = server.address();
-	port = typeof address === 'object' && address ? address.port : 0;
-});
-afterAll(async () => {
-	await new Promise<void>((resolve) => {
-		server.close(() => resolve());
-	});
-});
-describe('fetchRobotsTxt', () => {
-	it('returns a parsed Robot object for a valid robots.txt', async () => {
-		const robot = await fetchRobotsTxt(`http://127.0.0.1:${port}`);
-		expect(robot).not.toBeNull();
-		expect(robot!.isAllowed(`http://127.0.0.1:${port}/public/page`, '*')).toBe(true);
-		expect(robot!.isAllowed(`http://127.0.0.1:${port}/secret/page`, '*')).toBe(false);
-	});
-	it('respects user-agent specific rules', async () => {
-		const robot = await fetchRobotsTxt(`http://127.0.0.1:${port}`);
-		expect(robot).not.toBeNull();
-		expect(robot!.isAllowed(`http://127.0.0.1:${port}/admin/page`, 'Nitpicker')).toBe(
-			false,
-		);
-	});
-	it('returns null when robots.txt does not exist', async () => {
-		const noRobotsServer = http.createServer((_req, res) => {
-			res.writeHead(404);
-			res.end();
-		});
-		await new Promise<void>((resolve) => {
-			noRobotsServer.listen(0, () => resolve());
-		});
-		const address = noRobotsServer.address();
-		const noRobotsPort = typeof address === 'object' && address ? address.port : 0;
-		try {
-			const robot = await fetchRobotsTxt(`http://127.0.0.1:${noRobotsPort}`);
-			expect(robot).toBeNull();
-		} finally {
-			await new Promise<void>((resolve) => {
-				noRobotsServer.close(() => resolve());
-			});
-		}
-	});
-	it('returns null when the server is unreachable', async () => {
-		const robot = await fetchRobotsTxt('http://127.0.0.1:1');
-		expect(robot).toBeNull();
-	});
-});

package/src/crawler/fetch-robots-txt.ts DELETED

@@ -1,91 +0,0 @@
-import type { FollowResponse } from 'follow-redirects';
-import type { IncomingMessage } from 'node:http';
-import { createRequire } from 'node:module';
-import redirects from 'follow-redirects';
-/**
- * Result of parsing a robots.txt file.
- */
-interface RobotsResult {
-	/**
-	 * Check if a URL is allowed for a given user-agent.
-	 * @param url - The URL to check.
-	 * @param ua - The user-agent string to match against.
-	 * @returns `true` if allowed, `false` if disallowed, `undefined` if no matching rule.
-	 */
-	isAllowed(url: string, ua?: string): boolean | undefined;
-	/**
-	 * Check if a URL is disallowed for a given user-agent.
-	 * @param url - The URL to check.
-	 * @param ua - The user-agent string to match against.
-	 * @returns `true` if disallowed, `false` if allowed, `undefined` if no matching rule.
-	 */
-	isDisallowed(url: string, ua?: string): boolean | undefined;
-	/**
-	 * Get the crawl delay for a given user-agent.
-	 * @param ua - The user-agent string to match against.
-	 * @returns The crawl delay in seconds, or `undefined` if not specified.
-	 */
-	getCrawlDelay(ua?: string): number | undefined;
-	/**
-	 * Get the sitemaps listed in robots.txt.
-	 * @returns An array of sitemap URLs.
-	 */
-	getSitemaps(): string[];
-}
-const require = createRequire(import.meta.url);
-const robotsParser = require('robots-parser') as (
-	url: string,
-	robotstxt: string,
-) => RobotsResult;
-/**
- * Fetches and parses the robots.txt file for a given origin URL.
- *
- * Sends an HTTP(S) GET request to `{origin}/robots.txt` and parses the
- * response using `robots-parser`. Returns `null` if the server returns
- * a non-200 status code or if the request fails.
- * @param origin - The origin URL (e.g., `https://example.com`).
- * @param userAgent - Optional User-Agent string to send with the request.
- * @returns A parsed RobotsResult instance, or `null` if robots.txt is unavailable.
- */
-export async function fetchRobotsTxt(
-	origin: string,
-	userAgent?: string,
-): Promise<RobotsResult | null> {
-	const robotsUrl = `${origin}/robots.txt`;
-	return new Promise((resolve) => {
-		const protocol = robotsUrl.startsWith('https') ? redirects.https : redirects.http;
-		const req = protocol.get(
-			robotsUrl,
-			{
-				headers: {
-					...(userAgent ? { 'User-Agent': userAgent } : {}),
-				},
-				timeout: 10_000,
-			},
-			(res: IncomingMessage & FollowResponse) => {
-				if (res.statusCode !== 200) {
-					res.resume();
-					resolve(null);
-					return;
-				}
-				const chunks: Buffer[] = [];
-				res.on('data', (chunk: Buffer) => chunks.push(chunk));
-				res.on('end', () => {
-					const body = Buffer.concat(chunks).toString('utf8');
-					resolve(robotsParser(robotsUrl, body));
-				});
-				res.on('error', () => resolve(null));
-			},
-		);
-		req.on('error', () => resolve(null));
-		req.on('timeout', () => {
-			req.destroy();
-			resolve(null);
-		});
-	});
-}

package/src/crawler/find-best-matching-scope.spec.ts DELETED

@@ -1,39 +0,0 @@
-import { tryParseUrl as parseUrl } from '@d-zero/shared/parse-url';
-import { describe, it, expect } from 'vitest';
-import { findBestMatchingScope } from './find-best-matching-scope.js';
-describe('findBestMatchingScope', () => {
-	it('returns the deepest matching scope URL', () => {
-		const url = parseUrl('https://example.com/blog/post/1')!;
-		const scopes = [
-			parseUrl('https://example.com/blog')!,
-			parseUrl('https://example.com/blog/post')!,
-		];
-		const result = findBestMatchingScope(url, scopes);
-		expect(result).not.toBeNull();
-		expect(result!.pathname).toBe('/blog/post');
-	});
-	it('returns null when no scope matches', () => {
-		const url = parseUrl('https://other.com/page')!;
-		const scopes = [parseUrl('https://example.com/')!];
-		const result = findBestMatchingScope(url, scopes);
-		expect(result).toBeNull();
-	});
-	it('returns null for empty scopes array', () => {
-		const url = parseUrl('https://example.com/page')!;
-		const result = findBestMatchingScope(url, []);
-		expect(result).toBeNull();
-	});
-	it('does not match root scope without trailing slash against subpath', () => {
-		// Root URL https://example.com has paths [''], while /page has paths ['page']
-		// isPathMatch(['page'], ['']) fails because 'page' !== ''
-		const url = parseUrl('https://example.com/page')!;
-		const scopes = [parseUrl('https://example.com')!];
-		const result = findBestMatchingScope(url, scopes);
-		expect(result).toBeNull();
-	});
-});

package/src/crawler/find-best-matching-scope.ts DELETED

@@ -1,57 +0,0 @@
-import type { ExURL } from '@d-zero/shared/parse-url';
-/**
- * Find the scope URL with the deepest matching path for a given URL.
- *
- * Among all scope URLs sharing the same hostname, returns the one whose
- * path segments are a prefix of the target URL's path segments and which
- * has the greatest depth. Returns `null` if no scope URL matches.
- * @param url - The parsed URL to match against scope URLs.
- * @param scopes - The list of scope URLs to search.
- * @returns The best-matching scope URL, or `null` if none match.
- */
-export function findBestMatchingScope(
-	url: ExURL,
-	scopes: readonly ExURL[],
-): ExURL | null {
-	let bestMatch: ExURL | null = null;
-	let maxDepth = -1;
-	for (const scope of scopes) {
-		if (url.hostname !== scope.hostname) {
-			continue;
-		}
-		const isMatch = isPathMatch(url.paths, scope.paths);
-		if (isMatch && scope.depth > maxDepth) {
-			bestMatch = scope;
-			maxDepth = scope.depth;
-		}
-	}
-	return bestMatch;
-}
-/**
- * Check whether a target path is equal to or is a descendant of a base path.
- *
- * Compares path segments element by element. The target path matches if
- * all segments of the base path appear in the same positions at the
- * beginning of the target path.
- * @param targetPaths - The path segments of the URL being checked.
- * @param basePaths - The path segments of the scope URL to match against.
- * @returns `true` if the target path starts with or equals the base path.
- */
-function isPathMatch(targetPaths: string[], basePaths: string[]): boolean {
-	if (targetPaths.length < basePaths.length) {
-		return false;
-	}
-	for (const [i, basePath] of basePaths.entries()) {
-		if (targetPaths[i] !== basePath) {
-			return false;
-		}
-	}
-	return true;
-}

package/src/crawler/generate-predicted-urls.spec.ts DELETED

@@ -1,42 +0,0 @@
-import { describe, expect, it } from 'vitest';
-import { generatePredictedUrls } from './generate-predicted-urls.js';
-describe('generatePredictedUrls', () => {
-	it('step=1, count=3 で3つのURLを生成する', () => {
-		const pattern = { tokenIndex: 1, step: 1, currentNumber: 3 };
-		const urls = generatePredictedUrls(pattern, '//example.com/page/3', 3);
-		expect(urls).toEqual([
-			'//example.com/page/4',
-			'//example.com/page/5',
-			'//example.com/page/6',
-		]);
-	});
-	it('step=10, count=2 で2つのURLを生成する', () => {
-		const pattern = { tokenIndex: 1, step: 10, currentNumber: 20 };
-		const urls = generatePredictedUrls(pattern, '//example.com/page/20', 2);
-		expect(urls).toEqual(['//example.com/page/30', '//example.com/page/40']);
-	});
-	it('クエリパターンのURLを生成する', () => {
-		const pattern = { tokenIndex: 1, step: 1, currentNumber: 2 };
-		const urls = generatePredictedUrls(pattern, '//example.com/list?p=2&sort=name', 2);
-		expect(urls).toEqual([
-			'//example.com/list?p=3&sort=name',
-			'//example.com/list?p=4&sort=name',
-		]);
-	});
-	it('count=0 で空配列を返す', () => {
-		const pattern = { tokenIndex: 1, step: 1, currentNumber: 3 };
-		const urls = generatePredictedUrls(pattern, '//example.com/page/3', 0);
-		expect(urls).toEqual([]);
-	});
-	it('深いパスのURLを生成する', () => {
-		const pattern = { tokenIndex: 2, step: 1, currentNumber: 3 };
-		const urls = generatePredictedUrls(pattern, '//example.com/a/b/3/c', 2);
-		expect(urls).toEqual(['//example.com/a/b/4/c', '//example.com/a/b/5/c']);
-	});
-});

package/src/crawler/generate-predicted-urls.ts DELETED

@@ -1,34 +0,0 @@
-import type { PaginationPattern } from './types.js';
-import { decomposeUrl } from './decompose-url.js';
-import { reconstructUrl } from './reconstruct-url.js';
-/**
- * Generates predicted URLs by extrapolating the detected pagination pattern.
- *
- * Starting from `currentUrl`, applies the pattern's step `count` times to produce
- * future page URLs (e.g. if step=1 and currentNumber=2, generates page 3, 4, ...).
- * These URLs are pushed into the crawl queue and discarded later if they 404.
- * @param pattern - The detected pagination pattern from `detectPaginationPattern()`
- * @param currentUrl - The URL to extrapolate from (protocol-agnostic, without hash/auth)
- * @param count - Number of predicted URLs to generate (typically equals concurrency)
- * @returns Array of predicted URL strings
- */
-export function generatePredictedUrls(
-	pattern: PaginationPattern,
-	currentUrl: string,
-	count: number,
-): string[] {
-	if (count <= 0) return [];
-	const decomposed = decomposeUrl(currentUrl);
-	if (!decomposed) return [];
-	const results: string[] = [];
-	for (let i = 1; i <= count; i++) {
-		const nextNum = pattern.currentNumber + pattern.step * i;
-		const url = reconstructUrl(decomposed, pattern.tokenIndex, String(nextNum));
-		results.push(url);
-	}
-	return results;
-}

package/src/crawler/handle-ignore-and-skip.spec.ts DELETED

@@ -1,66 +0,0 @@
-import { tryParseUrl as parseUrl } from '@d-zero/shared/parse-url';
-import { describe, it, expect, vi } from 'vitest';
-import { handleIgnoreAndSkip } from './handle-ignore-and-skip.js';
-describe('handleIgnoreAndSkip', () => {
-	it('calls linkList.done and returns the link when URL is in queue', () => {
-		const url = parseUrl('https://example.com/page')!;
-		const mockLink = { url, isExternal: false, isLowerLayer: false };
-		const linkList = {
-			done: vi.fn().mockReturnValue(mockLink),
-		};
-		const scope = new Map();
-		const options = {
-			interval: 0,
-			parallels: 1,
-			recursive: true,
-			fromList: false,
-			captureImages: false,
-			executablePath: null,
-			fetchExternal: false,
-			scope: [],
-			excludes: [],
-			excludeKeywords: [],
-			excludeUrls: [] as readonly string[],
-			maxExcludedDepth: 0,
-			retry: 0,
-			verbose: false,
-			disableQueries: false,
-		};
-		const result = handleIgnoreAndSkip(url, linkList as never, scope, options);
-		expect(linkList.done).toHaveBeenCalledWith(url, scope, {}, options);
-		expect(result).toBe(mockLink);
-	});
-	it('returns null when URL is not in queue', () => {
-		const url = parseUrl('https://example.com/page')!;
-		const linkList = {
-			done: vi.fn().mockReturnValue(null),
-		};
-		const scope = new Map();
-		const options = {
-			interval: 0,
-			parallels: 1,
-			recursive: true,
-			fromList: false,
-			captureImages: false,
-			executablePath: null,
-			fetchExternal: false,
-			scope: [],
-			excludes: [],
-			excludeKeywords: [],
-			excludeUrls: [] as readonly string[],
-			maxExcludedDepth: 0,
-			retry: 0,
-			verbose: false,
-			disableQueries: false,
-		};
-		const result = handleIgnoreAndSkip(url, linkList as never, scope, options);
-		expect(result).toBeNull();
-	});
-});

package/src/crawler/handle-ignore-and-skip.ts DELETED

@@ -1,30 +0,0 @@
-import type LinkList from './link-list.js';
-import type { CrawlerOptions } from './types.js';
-import type { Link } from '../utils/index.js';
-import type { ExURL } from '@d-zero/shared/parse-url';
-import { crawlerLog } from '../debug.js';
-/**
- * Handle a URL that was ignored or skipped during scraping.
- *
- * Marks the URL as done in the link list without any page data,
- * effectively recording that it was encountered but not scraped.
- * @param url - The URL that was skipped.
- * @param linkList - The link list managing the crawl queue.
- * @param scope - Map of hostnames to their scope URLs.
- * @param options - Crawler configuration options.
- * @returns The constructed {@link Link} object, or `null` if the URL was not in the queue.
- */
-export function handleIgnoreAndSkip(
-	url: ExURL,
-	linkList: LinkList,
-	scope: ReadonlyMap<string, readonly ExURL[]>,
-	options: CrawlerOptions,
-): Link | null {
-	const updated = linkList.done(url, scope, {}, options);
-	if (updated) {
-		crawlerLog('Skipped URL: %s', url.href);
-	}
-	return updated;
-}

package/src/crawler/handle-resource-response.spec.ts DELETED

@@ -1,45 +0,0 @@
-import { tryParseUrl as parseUrl } from '@d-zero/shared/parse-url';
-import { describe, it, expect } from 'vitest';
-import { handleResourceResponse } from './handle-resource-response.js';
-describe('handleResourceResponse', () => {
-	it('returns isNew: true for a newly seen resource', () => {
-		const resources = new Set<string>();
-		const resource = {
-			url: parseUrl('https://example.com/style.css')!,
-			isExternal: false,
-			status: 200,
-			statusText: 'OK',
-			contentType: 'text/css',
-			contentLength: 1024,
-			compress: '' as const,
-			cdn: '' as const,
-			responseHeaders: {},
-		};
-		const result = handleResourceResponse(resource, resources);
-		expect(result.isNew).toBe(true);
-		expect(resources.has('https://example.com/style.css')).toBe(true);
-	});
-	it('returns isNew: false for a duplicate resource', () => {
-		const resources = new Set<string>(['https://example.com/style.css']);
-		const resource = {
-			url: parseUrl('https://example.com/style.css')!,
-			isExternal: false,
-			status: 200,
-			statusText: 'OK',
-			contentType: 'text/css',
-			contentLength: 1024,
-			compress: '' as const,
-			cdn: '' as const,
-			responseHeaders: {},
-		};
-		const result = handleResourceResponse(resource, resources);
-		expect(result.isNew).toBe(false);
-	});
-});

package/src/crawler/handle-resource-response.ts DELETED

@@ -1,21 +0,0 @@
-import type { Resource } from '../utils/index.js';
-/**
- * Track a network resource response and determine if it is newly discovered.
- *
- * Checks whether the resource URL has already been seen. If it is new,
- * adds it to the known resources set.
- * @param resource - The captured network resource data.
- * @param resources - The set of already-known resource URLs (without hash).
- * @returns An object with `isNew` indicating whether this resource was seen for the first time.
- */
-export function handleResourceResponse(
-	resource: Resource,
-	resources: Set<string>,
-): { isNew: boolean } {
-	const isNew = !resources.has(resource.url.withoutHash);
-	if (isNew) {
-		resources.add(resource.url.withoutHash);
-	}
-	return { isNew };
-}

package/src/crawler/handle-scrape-end.spec.ts DELETED

@@ -1,109 +0,0 @@
-import type { CrawlerOptions } from './types.js';
-import type { AnchorData, PageData } from '../utils/index.js';
-import { tryParseUrl as parseUrl } from '@d-zero/shared/parse-url';
-import { describe, it, expect, vi } from 'vitest';
-import { handleScrapeEnd } from './handle-scrape-end.js';
-const defaultOptions: CrawlerOptions = {
-	interval: 0,
-	parallels: 1,
-	recursive: true,
-	fromList: false,
-	captureImages: false,
-	executablePath: null,
-	fetchExternal: false,
-	scope: ['https://example.com/'],
-	excludes: [],
-	excludeKeywords: [],
-	excludeUrls: [],
-	maxExcludedDepth: 0,
-	retry: 0,
-	verbose: false,
-	disableQueries: false,
-};
-/**
- *
- * @param overrides
- */
-function createMockResult(overrides?: Partial<PageData>): PageData {
-	return {
-		url: parseUrl('https://example.com/page')!,
-		isTarget: true,
-		isExternal: false,
-		redirectPaths: [],
-		status: 200,
-		statusText: 'OK',
-		contentType: 'text/html',
-		contentLength: 1000,
-		responseHeaders: {},
-		meta: { title: 'Test' },
-		imageList: [],
-		anchorList: [] as AnchorData[],
-		html: '<html></html>',
-		isSkipped: false,
-		...overrides,
-	};
-}
-describe('handleScrapeEnd', () => {
-	it('marks URL as done in the link list', () => {
-		const result = createMockResult();
-		const mockLink = { url: result.url, isExternal: false, isLowerLayer: false };
-		const linkList = {
-			done: vi.fn().mockReturnValue(mockLink),
-			isMetadataOnly: vi.fn().mockReturnValue(false),
-		};
-		const scope = new Map([['example.com', [parseUrl('https://example.com/')!]]]);
-		const addUrl = vi.fn();
-		const { link, isExternal } = handleScrapeEnd(
-			result,
-			linkList as never,
-			scope,
-			defaultOptions,
-			addUrl,
-		);
-		expect(linkList.done).toHaveBeenCalledOnce();
-		expect(link).toBe(mockLink);
-		expect(isExternal).toBe(false);
-	});
-	it('skips anchor processing in title-only mode', () => {
-		const anchor = { href: parseUrl('https://example.com/other')!, textContent: 'link' };
-		const result = createMockResult({ anchorList: [anchor as AnchorData] });
-		const linkList = {
-			done: vi.fn().mockReturnValue(null),
-			isMetadataOnly: vi.fn().mockReturnValue(true),
-		};
-		const scope = new Map([['example.com', [parseUrl('https://example.com/')!]]]);
-		const addUrl = vi.fn();
-		handleScrapeEnd(result, linkList as never, scope, defaultOptions, addUrl);
-		expect(addUrl).not.toHaveBeenCalled();
-	});
-	it('returns isExternal: true for external pages', () => {
-		const result = createMockResult({ isExternal: true });
-		const linkList = {
-			done: vi.fn().mockReturnValue(null),
-			isMetadataOnly: vi.fn().mockReturnValue(false),
-		};
-		const scope = new Map();
-		const addUrl = vi.fn();
-		const { isExternal } = handleScrapeEnd(
-			result,
-			linkList as never,
-			scope,
-			defaultOptions,
-			addUrl,
-		);
-		expect(isExternal).toBe(true);
-	});
-});