@nitpicker/crawler 0.4.2 → 0.4.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +5 -2
- package/CHANGELOG.md +0 -16
- package/src/archive/__mock__/.gitignore +0 -3
- package/src/archive/__mock__/mock.sqlite +0 -0
- package/src/archive/archive-accessor.ts +0 -337
- package/src/archive/archive.ts +0 -408
- package/src/archive/database.spec.ts +0 -469
- package/src/archive/database.ts +0 -1059
- package/src/archive/debug.ts +0 -10
- package/src/archive/filesystem/append-text.spec.ts +0 -26
- package/src/archive/filesystem/append-text.ts +0 -16
- package/src/archive/filesystem/copy-dir-sync.spec.ts +0 -27
- package/src/archive/filesystem/copy-dir-sync.ts +0 -10
- package/src/archive/filesystem/copy-dir.spec.ts +0 -33
- package/src/archive/filesystem/copy-dir.ts +0 -14
- package/src/archive/filesystem/exists.spec.ts +0 -33
- package/src/archive/filesystem/exists.ts +0 -10
- package/src/archive/filesystem/get-file-list.spec.ts +0 -37
- package/src/archive/filesystem/get-file-list.ts +0 -13
- package/src/archive/filesystem/index.ts +0 -17
- package/src/archive/filesystem/is-dir.spec.ts +0 -29
- package/src/archive/filesystem/is-dir.ts +0 -11
- package/src/archive/filesystem/mkdir.spec.ts +0 -37
- package/src/archive/filesystem/mkdir.ts +0 -16
- package/src/archive/filesystem/output-json.spec.ts +0 -34
- package/src/archive/filesystem/output-json.ts +0 -16
- package/src/archive/filesystem/output-text.spec.ts +0 -31
- package/src/archive/filesystem/output-text.ts +0 -35
- package/src/archive/filesystem/read-json.spec.ts +0 -26
- package/src/archive/filesystem/read-json.ts +0 -12
- package/src/archive/filesystem/read-text.spec.ts +0 -25
- package/src/archive/filesystem/read-text.ts +0 -11
- package/src/archive/filesystem/readline.spec.ts +0 -29
- package/src/archive/filesystem/readline.ts +0 -30
- package/src/archive/filesystem/remove.spec.ts +0 -34
- package/src/archive/filesystem/remove.ts +0 -11
- package/src/archive/filesystem/rename.spec.ts +0 -46
- package/src/archive/filesystem/rename.ts +0 -21
- package/src/archive/filesystem/tar.spec.ts +0 -33
- package/src/archive/filesystem/tar.ts +0 -27
- package/src/archive/filesystem/untar.spec.ts +0 -34
- package/src/archive/filesystem/untar.ts +0 -36
- package/src/archive/index.ts +0 -13
- package/src/archive/page.spec.ts +0 -368
- package/src/archive/page.ts +0 -420
- package/src/archive/resource.spec.ts +0 -101
- package/src/archive/resource.ts +0 -73
- package/src/archive/safe-path.spec.ts +0 -44
- package/src/archive/safe-path.ts +0 -18
- package/src/archive/types.ts +0 -227
- package/src/crawler/clear-destination-cache.spec.ts +0 -20
- package/src/crawler/clear-destination-cache.ts +0 -9
- package/src/crawler/crawler.ts +0 -873
- package/src/crawler/decompose-url.spec.ts +0 -48
- package/src/crawler/decompose-url.ts +0 -90
- package/src/crawler/destination-cache.spec.ts +0 -23
- package/src/crawler/destination-cache.ts +0 -8
- package/src/crawler/detect-pagination-pattern.spec.ts +0 -169
- package/src/crawler/detect-pagination-pattern.ts +0 -66
- package/src/crawler/fetch-destination.ts +0 -257
- package/src/crawler/fetch-robots-txt.spec.ts +0 -83
- package/src/crawler/fetch-robots-txt.ts +0 -91
- package/src/crawler/find-best-matching-scope.spec.ts +0 -39
- package/src/crawler/find-best-matching-scope.ts +0 -57
- package/src/crawler/generate-predicted-urls.spec.ts +0 -42
- package/src/crawler/generate-predicted-urls.ts +0 -34
- package/src/crawler/handle-ignore-and-skip.spec.ts +0 -66
- package/src/crawler/handle-ignore-and-skip.ts +0 -30
- package/src/crawler/handle-resource-response.spec.ts +0 -45
- package/src/crawler/handle-resource-response.ts +0 -21
- package/src/crawler/handle-scrape-end.spec.ts +0 -109
- package/src/crawler/handle-scrape-end.ts +0 -115
- package/src/crawler/handle-scrape-error.spec.ts +0 -105
- package/src/crawler/handle-scrape-error.ts +0 -58
- package/src/crawler/index.ts +0 -2
- package/src/crawler/inject-scope-auth.spec.ts +0 -36
- package/src/crawler/inject-scope-auth.ts +0 -27
- package/src/crawler/is-external-url.spec.ts +0 -31
- package/src/crawler/is-external-url.ts +0 -17
- package/src/crawler/is-in-any-lower-layer.spec.ts +0 -31
- package/src/crawler/is-in-any-lower-layer.ts +0 -22
- package/src/crawler/link-list.spec.ts +0 -355
- package/src/crawler/link-list.ts +0 -275
- package/src/crawler/link-to-page-data.spec.ts +0 -133
- package/src/crawler/link-to-page-data.ts +0 -34
- package/src/crawler/net-timeout-error.spec.ts +0 -25
- package/src/crawler/net-timeout-error.ts +0 -11
- package/src/crawler/protocol-agnostic-key.spec.ts +0 -40
- package/src/crawler/protocol-agnostic-key.ts +0 -11
- package/src/crawler/reconstruct-url.spec.ts +0 -37
- package/src/crawler/reconstruct-url.ts +0 -37
- package/src/crawler/robots-checker.spec.ts +0 -104
- package/src/crawler/robots-checker.ts +0 -73
- package/src/crawler/should-discard-predicted.spec.ts +0 -125
- package/src/crawler/should-discard-predicted.ts +0 -33
- package/src/crawler/should-skip-url.spec.ts +0 -77
- package/src/crawler/should-skip-url.ts +0 -37
- package/src/crawler/types.ts +0 -146
- package/src/crawler-orchestrator.ts +0 -401
- package/src/debug.ts +0 -10
- package/src/index.ts +0 -25
- package/src/types.ts +0 -30
- package/src/utils/array/each-splitted.spec.ts +0 -38
- package/src/utils/array/each-splitted.ts +0 -19
- package/src/utils/array/index.ts +0 -1
- package/src/utils/debug.ts +0 -6
- package/src/utils/error/dom-evaluation-error.spec.ts +0 -20
- package/src/utils/error/dom-evaluation-error.ts +0 -6
- package/src/utils/error/error-emitter.spec.ts +0 -78
- package/src/utils/error/error-emitter.ts +0 -44
- package/src/utils/error/index.ts +0 -3
- package/src/utils/index.ts +0 -5
- package/src/utils/object/clean-object.spec.ts +0 -24
- package/src/utils/object/clean-object.ts +0 -13
- package/src/utils/object/index.ts +0 -1
- package/src/utils/types/index.ts +0 -1
- package/src/utils/types/types.ts +0 -65
- package/tsconfig.json +0 -11
- package/tsconfig.tsbuildinfo +0 -1
|
@@ -1,115 +0,0 @@
|
|
|
1
|
-
import type LinkList from './link-list.js';
|
|
2
|
-
import type { CrawlerOptions } from './types.js';
|
|
3
|
-
import type { AnchorData, Link, PageData } from '../utils/index.js';
|
|
4
|
-
import type { ExURL } from '@d-zero/shared/parse-url';
|
|
5
|
-
|
|
6
|
-
import { crawlerLog } from '../debug.js';
|
|
7
|
-
|
|
8
|
-
import { injectScopeAuth } from './inject-scope-auth.js';
|
|
9
|
-
import { isExternalUrl } from './is-external-url.js';
|
|
10
|
-
import { isInAnyLowerLayer } from './is-in-any-lower-layer.js';
|
|
11
|
-
|
|
12
|
-
/**
|
|
13
|
-
* Process the result of a successful page scrape.
|
|
14
|
-
*
|
|
15
|
-
* Extracts anchors from the page (unless in metadata-only mode), enqueues
|
|
16
|
-
* newly discovered URLs via the `addUrl` callback, and marks the URL
|
|
17
|
-
* as done in the link list.
|
|
18
|
-
* @param result - The scraped page data.
|
|
19
|
-
* @param linkList - The link list managing the crawl queue.
|
|
20
|
-
* @param scope - Map of hostnames to their scope URLs.
|
|
21
|
-
* @param options - Crawler configuration options.
|
|
22
|
-
* @param addUrl - Callback to enqueue a newly discovered URL. Accepts optional
|
|
23
|
-
* `{ metadataOnly: true }` to request metadata-only scraping.
|
|
24
|
-
* @returns An object containing the constructed link and whether the page is external.
|
|
25
|
-
*/
|
|
26
|
-
export function handleScrapeEnd(
|
|
27
|
-
result: PageData,
|
|
28
|
-
linkList: LinkList,
|
|
29
|
-
scope: ReadonlyMap<string, readonly ExURL[]>,
|
|
30
|
-
options: CrawlerOptions,
|
|
31
|
-
addUrl: (url: ExURL, opts?: { metadataOnly?: true }) => void,
|
|
32
|
-
): { link: Link | null; isExternal: boolean } {
|
|
33
|
-
const isMetadataOnly = linkList.isMetadataOnly(result.url.withoutHash);
|
|
34
|
-
if (!isMetadataOnly) {
|
|
35
|
-
processAnchors(result.anchorList, scope, options, addUrl);
|
|
36
|
-
}
|
|
37
|
-
|
|
38
|
-
const link = linkList.done(
|
|
39
|
-
result.url,
|
|
40
|
-
scope,
|
|
41
|
-
{
|
|
42
|
-
page: result,
|
|
43
|
-
},
|
|
44
|
-
options,
|
|
45
|
-
);
|
|
46
|
-
|
|
47
|
-
crawlerLog('Scrape end URL: %s', result.url.href);
|
|
48
|
-
crawlerLog('Scrape end Status: %d', result.status);
|
|
49
|
-
crawlerLog('Scrape end Type: %s', result.contentType);
|
|
50
|
-
if (!result.isExternal) {
|
|
51
|
-
crawlerLog('Scrape end Anchors: %d URLs', result.anchorList.length);
|
|
52
|
-
}
|
|
53
|
-
|
|
54
|
-
return { link, isExternal: result.isExternal };
|
|
55
|
-
}
|
|
56
|
-
|
|
57
|
-
/**
|
|
58
|
-
* Process anchor elements extracted from a scraped page and enqueue new URLs.
|
|
59
|
-
*
|
|
60
|
-
* For each anchor:
|
|
61
|
-
* 1. Determines if it is external (outside the crawl scope)
|
|
62
|
-
* 2. Injects authentication credentials from matching scope URLs
|
|
63
|
-
* 3. Reconstructs the `withoutHash` URL with injected auth
|
|
64
|
-
* 4. In recursive mode: enqueues internal lower-layer URLs for full scraping,
|
|
65
|
-
* and external URLs for metadata-only scraping (if `fetchExternal` is enabled)
|
|
66
|
-
* 5. In non-recursive mode: enqueues all URLs for metadata-only scraping
|
|
67
|
-
* @param anchors - The list of anchor data extracted from the page.
|
|
68
|
-
* @param scope - Map of hostnames to their scope URLs.
|
|
69
|
-
* @param options - Crawler configuration options.
|
|
70
|
-
* @param addUrl - Callback to enqueue a newly discovered URL. Accepts optional
|
|
71
|
-
* `{ metadataOnly: true }` to request metadata-only scraping.
|
|
72
|
-
*/
|
|
73
|
-
function processAnchors(
|
|
74
|
-
anchors: AnchorData[],
|
|
75
|
-
scope: ReadonlyMap<string, readonly ExURL[]>,
|
|
76
|
-
options: CrawlerOptions,
|
|
77
|
-
addUrl: (url: ExURL, opts?: { metadataOnly?: true }) => void,
|
|
78
|
-
): void {
|
|
79
|
-
for (const anchor of anchors) {
|
|
80
|
-
const isExternal = isExternalUrl(anchor.href, scope);
|
|
81
|
-
anchor.isExternal = isExternal;
|
|
82
|
-
|
|
83
|
-
if (!isExternal && (!anchor.href.username || !anchor.href.password)) {
|
|
84
|
-
injectScopeAuth(anchor.href, scope);
|
|
85
|
-
|
|
86
|
-
const auth =
|
|
87
|
-
anchor.href.username && anchor.href.password
|
|
88
|
-
? `${anchor.href.username}:${anchor.href.password}@`
|
|
89
|
-
: '';
|
|
90
|
-
const host =
|
|
91
|
-
anchor.href.hostname + (anchor.href.port ? `:${anchor.href.port}` : '');
|
|
92
|
-
const newSearch = anchor.href.query ? `?${anchor.href.query}` : '';
|
|
93
|
-
const body = anchor.href.dirname
|
|
94
|
-
? `${anchor.href.paths.join('/')}${newSearch}`
|
|
95
|
-
: newSearch
|
|
96
|
-
? `${newSearch}`
|
|
97
|
-
: '';
|
|
98
|
-
const withoutHash = `${anchor.href.protocol}//${auth}${host}${body ? `/${body}` : ''}`;
|
|
99
|
-
|
|
100
|
-
anchor.href.withoutHash = withoutHash;
|
|
101
|
-
}
|
|
102
|
-
|
|
103
|
-
if (options.recursive) {
|
|
104
|
-
const scopes = scope.get(anchor.href.hostname);
|
|
105
|
-
if (scopes && isInAnyLowerLayer(anchor.href, scopes, options)) {
|
|
106
|
-
addUrl(anchor.href);
|
|
107
|
-
} else if (isExternal && options.fetchExternal) {
|
|
108
|
-
addUrl(anchor.href, { metadataOnly: true });
|
|
109
|
-
}
|
|
110
|
-
continue;
|
|
111
|
-
}
|
|
112
|
-
|
|
113
|
-
addUrl(anchor.href, { metadataOnly: true });
|
|
114
|
-
}
|
|
115
|
-
}
|
|
@@ -1,105 +0,0 @@
|
|
|
1
|
-
import type { CrawlerOptions } from './types.js';
|
|
2
|
-
|
|
3
|
-
import { tryParseUrl as parseUrl } from '@d-zero/shared/parse-url';
|
|
4
|
-
import { describe, it, expect } from 'vitest';
|
|
5
|
-
|
|
6
|
-
import { handleScrapeError } from './handle-scrape-error.js';
|
|
7
|
-
import LinkList from './link-list.js';
|
|
8
|
-
|
|
9
|
-
const defaultOptions: CrawlerOptions = {
|
|
10
|
-
interval: 0,
|
|
11
|
-
parallels: 1,
|
|
12
|
-
recursive: true,
|
|
13
|
-
fromList: false,
|
|
14
|
-
captureImages: false,
|
|
15
|
-
executablePath: null,
|
|
16
|
-
fetchExternal: false,
|
|
17
|
-
scope: ['https://example.com/'],
|
|
18
|
-
excludes: [],
|
|
19
|
-
excludeKeywords: [],
|
|
20
|
-
excludeUrls: [],
|
|
21
|
-
maxExcludedDepth: 10,
|
|
22
|
-
retry: 3,
|
|
23
|
-
verbose: false,
|
|
24
|
-
disableQueries: false,
|
|
25
|
-
};
|
|
26
|
-
|
|
27
|
-
/**
|
|
28
|
-
* Create a scope map for testing.
|
|
29
|
-
* @returns A scope map with example.com.
|
|
30
|
-
*/
|
|
31
|
-
function createScope() {
|
|
32
|
-
return new Map([['example.com', [parseUrl('https://example.com/')!]]]);
|
|
33
|
-
}
|
|
34
|
-
|
|
35
|
-
describe('handleScrapeError', () => {
|
|
36
|
-
it('marks the URL as done when shutdown is true', () => {
|
|
37
|
-
const linkList = new LinkList();
|
|
38
|
-
const url = parseUrl('https://example.com/page')!;
|
|
39
|
-
const scope = createScope();
|
|
40
|
-
linkList.add(url);
|
|
41
|
-
linkList.progress(url);
|
|
42
|
-
|
|
43
|
-
const { link, result } = handleScrapeError(
|
|
44
|
-
{
|
|
45
|
-
url,
|
|
46
|
-
error: { name: 'Error', message: 'Browser crashed' },
|
|
47
|
-
shutdown: true,
|
|
48
|
-
pid: 1234,
|
|
49
|
-
},
|
|
50
|
-
linkList,
|
|
51
|
-
scope,
|
|
52
|
-
defaultOptions,
|
|
53
|
-
);
|
|
54
|
-
|
|
55
|
-
expect(link).not.toBeNull();
|
|
56
|
-
expect(link!.url.href).toBe(url.href);
|
|
57
|
-
expect(result).toBeDefined();
|
|
58
|
-
expect(result!.status).toBe(-1);
|
|
59
|
-
});
|
|
60
|
-
|
|
61
|
-
it('marks the URL as done when shutdown is false', () => {
|
|
62
|
-
const linkList = new LinkList();
|
|
63
|
-
const url = parseUrl('https://example.com/page')!;
|
|
64
|
-
const scope = createScope();
|
|
65
|
-
linkList.add(url);
|
|
66
|
-
linkList.progress(url);
|
|
67
|
-
|
|
68
|
-
const { link, result } = handleScrapeError(
|
|
69
|
-
{
|
|
70
|
-
url,
|
|
71
|
-
error: { name: 'Error', message: 'ERR_NAME_NOT_RESOLVED' },
|
|
72
|
-
shutdown: false,
|
|
73
|
-
pid: 5678,
|
|
74
|
-
},
|
|
75
|
-
linkList,
|
|
76
|
-
scope,
|
|
77
|
-
defaultOptions,
|
|
78
|
-
);
|
|
79
|
-
|
|
80
|
-
expect(link).not.toBeNull();
|
|
81
|
-
expect(link!.url.href).toBe(url.href);
|
|
82
|
-
expect(result).toBeDefined();
|
|
83
|
-
expect(result!.status).toBe(-1);
|
|
84
|
-
});
|
|
85
|
-
|
|
86
|
-
it('returns null link when url is null', () => {
|
|
87
|
-
const linkList = new LinkList();
|
|
88
|
-
const scope = createScope();
|
|
89
|
-
|
|
90
|
-
const { link, result } = handleScrapeError(
|
|
91
|
-
{
|
|
92
|
-
url: null,
|
|
93
|
-
error: { name: 'Error', message: 'Unknown error' },
|
|
94
|
-
shutdown: true,
|
|
95
|
-
pid: undefined,
|
|
96
|
-
},
|
|
97
|
-
linkList,
|
|
98
|
-
scope,
|
|
99
|
-
defaultOptions,
|
|
100
|
-
);
|
|
101
|
-
|
|
102
|
-
expect(link).toBeNull();
|
|
103
|
-
expect(result).toBeUndefined();
|
|
104
|
-
});
|
|
105
|
-
});
|
|
@@ -1,58 +0,0 @@
|
|
|
1
|
-
import type LinkList from './link-list.js';
|
|
2
|
-
import type { CrawlerOptions } from './types.js';
|
|
3
|
-
import type { Link, PageData } from '../utils/index.js';
|
|
4
|
-
import type { ExURL } from '@d-zero/shared/parse-url';
|
|
5
|
-
|
|
6
|
-
import { crawlerErrorLog } from '../debug.js';
|
|
7
|
-
|
|
8
|
-
import { linkToPageData } from './link-to-page-data.js';
|
|
9
|
-
|
|
10
|
-
/**
|
|
11
|
-
* Handle an error that occurred during page scraping.
|
|
12
|
-
*
|
|
13
|
-
* Marks the URL as done and creates a fallback {@link PageData} from the
|
|
14
|
-
* link, regardless of whether the error caused a shutdown. This ensures
|
|
15
|
-
* that errored URLs are recorded in the DB (`status = -1, scraped = 1`)
|
|
16
|
-
* and not re-queued on resume.
|
|
17
|
-
* @param payload - The error payload from the scraper.
|
|
18
|
-
* @param payload.url - The URL being scraped when the error occurred, or `null`.
|
|
19
|
-
* @param payload.error - The error details including name, message, and optional stack.
|
|
20
|
-
* @param payload.error.name
|
|
21
|
-
* @param payload.error.message
|
|
22
|
-
* @param payload.error.stack
|
|
23
|
-
* @param payload.shutdown - Whether the error caused the scraper process to shut down.
|
|
24
|
-
* @param payload.pid - The process ID of the scraper, or `undefined`.
|
|
25
|
-
* @param linkList - The link list managing the crawl queue.
|
|
26
|
-
* @param scope - Map of hostnames to their scope URLs.
|
|
27
|
-
* @param options - Crawler configuration options.
|
|
28
|
-
* @returns An object with the link and an optional fallback PageData result.
|
|
29
|
-
*/
|
|
30
|
-
export function handleScrapeError(
|
|
31
|
-
payload: {
|
|
32
|
-
url: ExURL | null;
|
|
33
|
-
error: { name: string; message: string; stack?: string };
|
|
34
|
-
shutdown: boolean;
|
|
35
|
-
pid: number | undefined;
|
|
36
|
-
},
|
|
37
|
-
linkList: LinkList,
|
|
38
|
-
scope: ReadonlyMap<string, readonly ExURL[]>,
|
|
39
|
-
options: CrawlerOptions,
|
|
40
|
-
): { link: Link | null; result?: PageData } {
|
|
41
|
-
const { url, error, shutdown, pid } = payload;
|
|
42
|
-
let link: Link | null = null;
|
|
43
|
-
let result: PageData | undefined;
|
|
44
|
-
|
|
45
|
-
if (url) {
|
|
46
|
-
const updated = linkList.done(url, scope, { error }, options);
|
|
47
|
-
if (updated) {
|
|
48
|
-
link = updated;
|
|
49
|
-
result = linkToPageData(updated);
|
|
50
|
-
}
|
|
51
|
-
}
|
|
52
|
-
|
|
53
|
-
crawlerErrorLog('From %d(%s)', pid, url?.href ?? 'UNKNOWN_URL');
|
|
54
|
-
crawlerErrorLog('Then shutdown?: %s', shutdown ? 'Yes' : 'No');
|
|
55
|
-
crawlerErrorLog('%O', error);
|
|
56
|
-
|
|
57
|
-
return { link, result };
|
|
58
|
-
}
|
package/src/crawler/inject-scope-auth.spec.ts
DELETED
|
@@ -1,36 +0,0 @@
|
|
|
1
|
-
import type { ExURL } from '@d-zero/shared/parse-url';
|
|
2
|
-
|
|
3
|
-
import { tryParseUrl as parseUrl } from '@d-zero/shared/parse-url';
|
|
4
|
-
import { describe, it, expect } from 'vitest';
|
|
5
|
-
|
|
6
|
-
import { injectScopeAuth } from './inject-scope-auth.js';
|
|
7
|
-
|
|
8
|
-
/**
|
|
9
|
-
* Create a scope map from hostname-URL pairs for testing.
|
|
10
|
-
* @param entries - Array of [hostname, urls] tuples.
|
|
11
|
-
* @returns A map from hostname to parsed ExURL arrays.
|
|
12
|
-
*/
|
|
13
|
-
function createScope(entries: [string, string[]][]): Map<string, ExURL[]> {
|
|
14
|
-
return new Map(
|
|
15
|
-
entries.map(([h, urls]) => [h, urls.map((u) => parseUrl(u)!).filter(Boolean)]),
|
|
16
|
-
);
|
|
17
|
-
}
|
|
18
|
-
|
|
19
|
-
describe('injectScopeAuth', () => {
|
|
20
|
-
it('injects auth from matching scope URL', () => {
|
|
21
|
-
const url = parseUrl('https://example.com/blog/post')!;
|
|
22
|
-
const scope = createScope([['example.com', ['https://user:pass@example.com/blog']]]);
|
|
23
|
-
injectScopeAuth(url, scope);
|
|
24
|
-
expect(url.username).toBe('user');
|
|
25
|
-
expect(url.password).toBe('pass');
|
|
26
|
-
});
|
|
27
|
-
|
|
28
|
-
it('does not inject auth when hostname does not match', () => {
|
|
29
|
-
const url = parseUrl('https://other.com/page')!;
|
|
30
|
-
const scope = createScope([['example.com', ['https://user:pass@example.com/']]]);
|
|
31
|
-
injectScopeAuth(url, scope);
|
|
32
|
-
// username/password are null for URLs parsed without auth
|
|
33
|
-
expect(url.username).toBeNull();
|
|
34
|
-
expect(url.password).toBeNull();
|
|
35
|
-
});
|
|
36
|
-
});
|
|
@@ -1,27 +0,0 @@
|
|
|
1
|
-
import type { ExURL } from '@d-zero/shared/parse-url';
|
|
2
|
-
|
|
3
|
-
import { findBestMatchingScope } from './find-best-matching-scope.js';
|
|
4
|
-
|
|
5
|
-
/**
|
|
6
|
-
* Inject authentication credentials from a matching scope URL into the target URL.
|
|
7
|
-
*
|
|
8
|
-
* Finds the best-matching scope URL (deepest path match) for the given URL's
|
|
9
|
-
* hostname and copies its `username` and `password` properties. This mutates
|
|
10
|
-
* the `url` parameter in place.
|
|
11
|
-
* @param url - The parsed URL to receive authentication credentials (mutated in place).
|
|
12
|
-
* @param scope - Map of hostnames to their scope URLs.
|
|
13
|
-
*/
|
|
14
|
-
export function injectScopeAuth(
|
|
15
|
-
url: ExURL,
|
|
16
|
-
scope: ReadonlyMap<string, readonly ExURL[]>,
|
|
17
|
-
): void {
|
|
18
|
-
const scopes = scope.get(url.hostname);
|
|
19
|
-
if (!scopes) {
|
|
20
|
-
return;
|
|
21
|
-
}
|
|
22
|
-
const matchedScope = findBestMatchingScope(url, scopes);
|
|
23
|
-
if (matchedScope) {
|
|
24
|
-
url.username = matchedScope.username;
|
|
25
|
-
url.password = matchedScope.password;
|
|
26
|
-
}
|
|
27
|
-
}
|
|
@@ -1,31 +0,0 @@
|
|
|
1
|
-
import type { ExURL } from '@d-zero/shared/parse-url';
|
|
2
|
-
|
|
3
|
-
import { tryParseUrl as parseUrl } from '@d-zero/shared/parse-url';
|
|
4
|
-
import { describe, it, expect } from 'vitest';
|
|
5
|
-
|
|
6
|
-
import { isExternalUrl } from './is-external-url.js';
|
|
7
|
-
|
|
8
|
-
/**
|
|
9
|
-
* Create a scope map from hostname-URL pairs for testing.
|
|
10
|
-
* @param entries - Array of [hostname, urls] tuples.
|
|
11
|
-
* @returns A map from hostname to parsed ExURL arrays.
|
|
12
|
-
*/
|
|
13
|
-
function createScope(entries: [string, string[]][]): Map<string, ExURL[]> {
|
|
14
|
-
return new Map(
|
|
15
|
-
entries.map(([h, urls]) => [h, urls.map((u) => parseUrl(u)!).filter(Boolean)]),
|
|
16
|
-
);
|
|
17
|
-
}
|
|
18
|
-
|
|
19
|
-
describe('isExternalUrl', () => {
|
|
20
|
-
it('returns false when hostname is in scope', () => {
|
|
21
|
-
const url = parseUrl('https://example.com/page')!;
|
|
22
|
-
const scope = createScope([['example.com', ['https://example.com/']]]);
|
|
23
|
-
expect(isExternalUrl(url, scope)).toBe(false);
|
|
24
|
-
});
|
|
25
|
-
|
|
26
|
-
it('returns true when hostname is not in scope', () => {
|
|
27
|
-
const url = parseUrl('https://other.com/page')!;
|
|
28
|
-
const scope = createScope([['example.com', ['https://example.com/']]]);
|
|
29
|
-
expect(isExternalUrl(url, scope)).toBe(true);
|
|
30
|
-
});
|
|
31
|
-
});
|
|
@@ -1,17 +0,0 @@
|
|
|
1
|
-
import type { ExURL } from '@d-zero/shared/parse-url';
|
|
2
|
-
|
|
3
|
-
/**
|
|
4
|
-
* Determine whether a URL is external to the crawl scope.
|
|
5
|
-
*
|
|
6
|
-
* A URL is considered external if its hostname does not appear
|
|
7
|
-
* as a key in the scope map.
|
|
8
|
-
* @param url - The parsed URL to check.
|
|
9
|
-
* @param scope - Map of hostnames to their scope URLs.
|
|
10
|
-
* @returns `true` if the URL is outside the crawl scope.
|
|
11
|
-
*/
|
|
12
|
-
export function isExternalUrl(
|
|
13
|
-
url: ExURL,
|
|
14
|
-
scope: ReadonlyMap<string, readonly ExURL[]>,
|
|
15
|
-
): boolean {
|
|
16
|
-
return !scope.has(url.hostname);
|
|
17
|
-
}
|
|
@@ -1,31 +0,0 @@
|
|
|
1
|
-
import type { ParseURLOptions } from '@d-zero/shared/parse-url';
|
|
2
|
-
|
|
3
|
-
import { tryParseUrl as parseUrl } from '@d-zero/shared/parse-url';
|
|
4
|
-
import { describe, it, expect } from 'vitest';
|
|
5
|
-
|
|
6
|
-
import { isInAnyLowerLayer } from './is-in-any-lower-layer.js';
|
|
7
|
-
|
|
8
|
-
const defaultOptions: ParseURLOptions = {};
|
|
9
|
-
|
|
10
|
-
describe('isInAnyLowerLayer', () => {
|
|
11
|
-
it('returns true when URL is in a lower layer of a scope', () => {
|
|
12
|
-
const url = parseUrl('https://example.com/blog/post')!;
|
|
13
|
-
const scopes = [parseUrl('https://example.com/blog/')!];
|
|
14
|
-
expect(isInAnyLowerLayer(url, scopes, defaultOptions)).toBe(true);
|
|
15
|
-
});
|
|
16
|
-
|
|
17
|
-
it('returns false when URL is not in a lower layer', () => {
|
|
18
|
-
const url = parseUrl('https://example.com/about')!;
|
|
19
|
-
const scopes = [parseUrl('https://example.com/blog/')!];
|
|
20
|
-
expect(isInAnyLowerLayer(url, scopes, defaultOptions)).toBe(false);
|
|
21
|
-
});
|
|
22
|
-
|
|
23
|
-
it('returns true when URL matches one of multiple scopes', () => {
|
|
24
|
-
const url = parseUrl('https://example.com/docs/api')!;
|
|
25
|
-
const scopes = [
|
|
26
|
-
parseUrl('https://example.com/blog/')!,
|
|
27
|
-
parseUrl('https://example.com/docs/')!,
|
|
28
|
-
];
|
|
29
|
-
expect(isInAnyLowerLayer(url, scopes, defaultOptions)).toBe(true);
|
|
30
|
-
});
|
|
31
|
-
});
|
|
@@ -1,22 +0,0 @@
|
|
|
1
|
-
import type { ExURL, ParseURLOptions } from '@d-zero/shared/parse-url';
|
|
2
|
-
|
|
3
|
-
import { isLowerLayer } from '@d-zero/shared/is-lower-layer';
|
|
4
|
-
|
|
5
|
-
/**
|
|
6
|
-
* Check whether a URL is in a lower layer (subdirectory) of any scope URL.
|
|
7
|
-
*
|
|
8
|
-
* Tests the URL against each scope URL using the `isLowerLayer` utility,
|
|
9
|
-
* which checks if the URL's path is at the same level or deeper than
|
|
10
|
-
* the scope URL's path.
|
|
11
|
-
* @param url - The parsed URL to check.
|
|
12
|
-
* @param scopes - The list of scope URLs to test against.
|
|
13
|
-
* @param options - URL parsing options used for layer comparison.
|
|
14
|
-
* @returns `true` if the URL is in a lower layer of at least one scope URL.
|
|
15
|
-
*/
|
|
16
|
-
export function isInAnyLowerLayer(
|
|
17
|
-
url: ExURL,
|
|
18
|
-
scopes: readonly ExURL[],
|
|
19
|
-
options: ParseURLOptions,
|
|
20
|
-
): boolean {
|
|
21
|
-
return scopes.some((scope) => isLowerLayer(url.href, scope.href, options));
|
|
22
|
-
}
|