@nitpicker/crawler 0.4.2 → 0.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119)
  1. package/package.json +5 -2
  2. package/CHANGELOG.md +0 -16
  3. package/src/archive/__mock__/.gitignore +0 -3
  4. package/src/archive/__mock__/mock.sqlite +0 -0
  5. package/src/archive/archive-accessor.ts +0 -337
  6. package/src/archive/archive.ts +0 -408
  7. package/src/archive/database.spec.ts +0 -469
  8. package/src/archive/database.ts +0 -1059
  9. package/src/archive/debug.ts +0 -10
  10. package/src/archive/filesystem/append-text.spec.ts +0 -26
  11. package/src/archive/filesystem/append-text.ts +0 -16
  12. package/src/archive/filesystem/copy-dir-sync.spec.ts +0 -27
  13. package/src/archive/filesystem/copy-dir-sync.ts +0 -10
  14. package/src/archive/filesystem/copy-dir.spec.ts +0 -33
  15. package/src/archive/filesystem/copy-dir.ts +0 -14
  16. package/src/archive/filesystem/exists.spec.ts +0 -33
  17. package/src/archive/filesystem/exists.ts +0 -10
  18. package/src/archive/filesystem/get-file-list.spec.ts +0 -37
  19. package/src/archive/filesystem/get-file-list.ts +0 -13
  20. package/src/archive/filesystem/index.ts +0 -17
  21. package/src/archive/filesystem/is-dir.spec.ts +0 -29
  22. package/src/archive/filesystem/is-dir.ts +0 -11
  23. package/src/archive/filesystem/mkdir.spec.ts +0 -37
  24. package/src/archive/filesystem/mkdir.ts +0 -16
  25. package/src/archive/filesystem/output-json.spec.ts +0 -34
  26. package/src/archive/filesystem/output-json.ts +0 -16
  27. package/src/archive/filesystem/output-text.spec.ts +0 -31
  28. package/src/archive/filesystem/output-text.ts +0 -35
  29. package/src/archive/filesystem/read-json.spec.ts +0 -26
  30. package/src/archive/filesystem/read-json.ts +0 -12
  31. package/src/archive/filesystem/read-text.spec.ts +0 -25
  32. package/src/archive/filesystem/read-text.ts +0 -11
  33. package/src/archive/filesystem/readline.spec.ts +0 -29
  34. package/src/archive/filesystem/readline.ts +0 -30
  35. package/src/archive/filesystem/remove.spec.ts +0 -34
  36. package/src/archive/filesystem/remove.ts +0 -11
  37. package/src/archive/filesystem/rename.spec.ts +0 -46
  38. package/src/archive/filesystem/rename.ts +0 -21
  39. package/src/archive/filesystem/tar.spec.ts +0 -33
  40. package/src/archive/filesystem/tar.ts +0 -27
  41. package/src/archive/filesystem/untar.spec.ts +0 -34
  42. package/src/archive/filesystem/untar.ts +0 -36
  43. package/src/archive/index.ts +0 -13
  44. package/src/archive/page.spec.ts +0 -368
  45. package/src/archive/page.ts +0 -420
  46. package/src/archive/resource.spec.ts +0 -101
  47. package/src/archive/resource.ts +0 -73
  48. package/src/archive/safe-path.spec.ts +0 -44
  49. package/src/archive/safe-path.ts +0 -18
  50. package/src/archive/types.ts +0 -227
  51. package/src/crawler/clear-destination-cache.spec.ts +0 -20
  52. package/src/crawler/clear-destination-cache.ts +0 -9
  53. package/src/crawler/crawler.ts +0 -873
  54. package/src/crawler/decompose-url.spec.ts +0 -48
  55. package/src/crawler/decompose-url.ts +0 -90
  56. package/src/crawler/destination-cache.spec.ts +0 -23
  57. package/src/crawler/destination-cache.ts +0 -8
  58. package/src/crawler/detect-pagination-pattern.spec.ts +0 -169
  59. package/src/crawler/detect-pagination-pattern.ts +0 -66
  60. package/src/crawler/fetch-destination.ts +0 -257
  61. package/src/crawler/fetch-robots-txt.spec.ts +0 -83
  62. package/src/crawler/fetch-robots-txt.ts +0 -91
  63. package/src/crawler/find-best-matching-scope.spec.ts +0 -39
  64. package/src/crawler/find-best-matching-scope.ts +0 -57
  65. package/src/crawler/generate-predicted-urls.spec.ts +0 -42
  66. package/src/crawler/generate-predicted-urls.ts +0 -34
  67. package/src/crawler/handle-ignore-and-skip.spec.ts +0 -66
  68. package/src/crawler/handle-ignore-and-skip.ts +0 -30
  69. package/src/crawler/handle-resource-response.spec.ts +0 -45
  70. package/src/crawler/handle-resource-response.ts +0 -21
  71. package/src/crawler/handle-scrape-end.spec.ts +0 -109
  72. package/src/crawler/handle-scrape-end.ts +0 -115
  73. package/src/crawler/handle-scrape-error.spec.ts +0 -105
  74. package/src/crawler/handle-scrape-error.ts +0 -58
  75. package/src/crawler/index.ts +0 -2
  76. package/src/crawler/inject-scope-auth.spec.ts +0 -36
  77. package/src/crawler/inject-scope-auth.ts +0 -27
  78. package/src/crawler/is-external-url.spec.ts +0 -31
  79. package/src/crawler/is-external-url.ts +0 -17
  80. package/src/crawler/is-in-any-lower-layer.spec.ts +0 -31
  81. package/src/crawler/is-in-any-lower-layer.ts +0 -22
  82. package/src/crawler/link-list.spec.ts +0 -355
  83. package/src/crawler/link-list.ts +0 -275
  84. package/src/crawler/link-to-page-data.spec.ts +0 -133
  85. package/src/crawler/link-to-page-data.ts +0 -34
  86. package/src/crawler/net-timeout-error.spec.ts +0 -25
  87. package/src/crawler/net-timeout-error.ts +0 -11
  88. package/src/crawler/protocol-agnostic-key.spec.ts +0 -40
  89. package/src/crawler/protocol-agnostic-key.ts +0 -11
  90. package/src/crawler/reconstruct-url.spec.ts +0 -37
  91. package/src/crawler/reconstruct-url.ts +0 -37
  92. package/src/crawler/robots-checker.spec.ts +0 -104
  93. package/src/crawler/robots-checker.ts +0 -73
  94. package/src/crawler/should-discard-predicted.spec.ts +0 -125
  95. package/src/crawler/should-discard-predicted.ts +0 -33
  96. package/src/crawler/should-skip-url.spec.ts +0 -77
  97. package/src/crawler/should-skip-url.ts +0 -37
  98. package/src/crawler/types.ts +0 -146
  99. package/src/crawler-orchestrator.ts +0 -401
  100. package/src/debug.ts +0 -10
  101. package/src/index.ts +0 -25
  102. package/src/types.ts +0 -30
  103. package/src/utils/array/each-splitted.spec.ts +0 -38
  104. package/src/utils/array/each-splitted.ts +0 -19
  105. package/src/utils/array/index.ts +0 -1
  106. package/src/utils/debug.ts +0 -6
  107. package/src/utils/error/dom-evaluation-error.spec.ts +0 -20
  108. package/src/utils/error/dom-evaluation-error.ts +0 -6
  109. package/src/utils/error/error-emitter.spec.ts +0 -78
  110. package/src/utils/error/error-emitter.ts +0 -44
  111. package/src/utils/error/index.ts +0 -3
  112. package/src/utils/index.ts +0 -5
  113. package/src/utils/object/clean-object.spec.ts +0 -24
  114. package/src/utils/object/clean-object.ts +0 -13
  115. package/src/utils/object/index.ts +0 -1
  116. package/src/utils/types/index.ts +0 -1
  117. package/src/utils/types/types.ts +0 -65
  118. package/tsconfig.json +0 -11
  119. package/tsconfig.tsbuildinfo +0 -1
package/src/types.ts DELETED
@@ -1,30 +0,0 @@
1
- import type { CrawlerError } from './utils/index.js';
2
-
3
- /**
4
- * Event map for the `CrawlerOrchestrator` class.
5
- *
6
- * Each key represents an event name and its value is the payload type
7
- * passed to listeners subscribed via `on()` or `once()`.
8
- */
9
- export interface CrawlEvent {
10
- /**
11
- * Emitted when the archive file write operation begins.
12
- */
13
- writeFileStart: {
14
- /** Absolute path of the archive file being written. */
15
- filePath: string;
16
- };
17
-
18
- /**
19
- * Emitted when the archive file write operation completes.
20
- */
21
- writeFileEnd: {
22
- /** Absolute path of the archive file that was written. */
23
- filePath: string;
24
- };
25
-
26
- /**
27
- * Emitted when an error occurs during crawling or archiving.
28
- */
29
- error: CrawlerError;
30
- }
package/src/utils/array/each-splitted.spec.ts DELETED
@@ -1,38 +0,0 @@
1
- import { describe, it, expect } from 'vitest';
2
-
3
- import { eachSplitted } from './each-splitted.js';
4
-
5
- describe('eachSplitted', () => {
6
- it('splits array into chunks and calls callback on each', async () => {
7
- const chunks: number[][] = [];
8
- await eachSplitted([1, 2, 3, 4, 5], 2, (items) => {
9
- chunks.push(items);
10
- });
11
- expect(chunks).toEqual([[1, 2], [3, 4], [5]]);
12
- });
13
-
14
- it('handles empty array', async () => {
15
- const chunks: number[][] = [];
16
- await eachSplitted([], 2, (items) => {
17
- chunks.push(items);
18
- });
19
- expect(chunks).toEqual([]);
20
- });
21
-
22
- it('handles chunk size larger than array', async () => {
23
- const chunks: number[][] = [];
24
- await eachSplitted([1, 2], 10, (items) => {
25
- chunks.push(items);
26
- });
27
- expect(chunks).toEqual([[1, 2]]);
28
- });
29
-
30
- it('supports async callback', async () => {
31
- const results: string[] = [];
32
- await eachSplitted(['a', 'b', 'c'], 1, async (items) => {
33
- await new Promise((resolve) => setTimeout(resolve, 1));
34
- results.push(...items);
35
- });
36
- expect(results.toSorted()).toEqual(['a', 'b', 'c']);
37
- });
38
- });
package/src/utils/array/each-splitted.ts DELETED
@@ -1,19 +0,0 @@
1
- import { splitArray } from '@d-zero/shared/split-array';
2
-
3
- /**
4
- * Splits an array into chunks of the specified size and executes a callback
5
- * on each chunk in parallel using `Promise.all`.
6
- * @template T - The element type of the array.
7
- * @param a - The array to split into chunks.
8
- * @param count - The maximum number of elements per chunk.
9
- * @param callback - A function to invoke on each chunk. May be synchronous or asynchronous.
10
- * @returns A promise that resolves when all chunk callbacks have completed.
11
- */
12
- export async function eachSplitted<T>(
13
- a: T[],
14
- count: number,
15
- callback: (items: T[]) => void | Promise<void>,
16
- ) {
17
- const splitted = splitArray(a, count);
18
- await Promise.all(splitted.map((items) => callback(items)));
19
- }
package/src/utils/array/index.ts DELETED
@@ -1 +0,0 @@
1
- export { eachSplitted } from './each-splitted.js';
package/src/utils/debug.ts DELETED
@@ -1,6 +0,0 @@
1
- import debug from 'debug';
2
-
3
- /** Root debug logger for the Nitpicker application. Namespace: `Nitpicker`. */
4
- export const globalLog = debug('Nitpicker');
5
- /** Debug logger for the utils package. Namespace: `Nitpicker:Utils`. */
6
- export const log = globalLog.extend('Utils');
package/src/utils/error/dom-evaluation-error.spec.ts DELETED
@@ -1,20 +0,0 @@
1
- import { describe, it, expect } from 'vitest';
2
-
3
- import { DOMEvaluationError } from './dom-evaluation-error.js';
4
-
5
- describe('DOMEvaluationError', () => {
6
- it('is an instance of Error', () => {
7
- const error = new DOMEvaluationError('test');
8
- expect(error).toBeInstanceOf(Error);
9
- });
10
-
11
- it('sets the message', () => {
12
- const error = new DOMEvaluationError('test message');
13
- expect(error.message).toBe('test message');
14
- });
15
-
16
- it('has the correct name', () => {
17
- const error = new DOMEvaluationError('test');
18
- expect(error.name).toBe('Error');
19
- });
20
- });
package/src/utils/error/dom-evaluation-error.ts DELETED
@@ -1,6 +0,0 @@
1
- /**
2
- * Error thrown when DOM evaluation (e.g., running scripts within a browser page context)
3
- * fails. This typically occurs during page scraping when JavaScript execution
4
- * in the browser context encounters an error.
5
- */
6
- export class DOMEvaluationError extends Error {}
package/src/utils/error/error-emitter.spec.ts DELETED
@@ -1,78 +0,0 @@
1
- import type { ErrorEvent } from './error-emitter.js';
2
-
3
- import { TypedAwaitEventEmitter } from '@d-zero/shared/typed-await-event-emitter';
4
- import { describe, it, expect, vi } from 'vitest';
5
-
6
- import { ErrorEmitter } from './error-emitter.js';
7
-
8
- /**
9
- * Test class that uses the ErrorEmitter decorator.
10
- */
11
- class TestEmitter extends TypedAwaitEventEmitter<ErrorEvent> {
12
- /**
13
- * A method that throws an Error.
14
- * @param message - Error message.
15
- */
16
- @ErrorEmitter()
17
- failWithError(message: string): Promise<never> {
18
- throw new Error(message);
19
- }
20
- /**
21
- * A method that throws a non-Error value.
22
- */
23
- @ErrorEmitter()
24
- failWithNonError(): Promise<never> {
25
- throw 'string-error';
26
- }
27
- /**
28
- * A method that succeeds.
29
- * @param value - A value to return.
30
- * @returns The input value.
31
- */
32
- @ErrorEmitter()
33
- succeed(value: string): Promise<string> {
34
- return Promise.resolve(value);
35
- }
36
- }
37
-
38
- describe('ErrorEmitter', () => {
39
- it('returns the method result on success', async () => {
40
- const emitter = new TestEmitter();
41
- const result = await emitter.succeed('hello');
42
- expect(result).toBe('hello');
43
- });
44
-
45
- it('emits an error event when the method throws an Error', async () => {
46
- const emitter = new TestEmitter();
47
- const listener = vi.fn();
48
- emitter.on('error', listener);
49
-
50
- await expect(emitter.failWithError('test failure')).rejects.toThrow('test failure');
51
- expect(listener).toHaveBeenCalledOnce();
52
- expect(listener).toHaveBeenCalledWith(expect.any(Error));
53
- expect(listener.mock.calls[0][0].message).toBe('test failure');
54
- });
55
-
56
- it('re-throws the error after emitting', async () => {
57
- const emitter = new TestEmitter();
58
- await expect(emitter.failWithError('boom')).rejects.toThrow('boom');
59
- });
60
-
61
- it('does not emit error event for non-Error throws', async () => {
62
- const emitter = new TestEmitter();
63
- const listener = vi.fn();
64
- emitter.on('error', listener);
65
-
66
- await expect(emitter.failWithNonError()).rejects.toBe('string-error');
67
- expect(listener).not.toHaveBeenCalled();
68
- });
69
-
70
- it('does not emit error event on success', async () => {
71
- const emitter = new TestEmitter();
72
- const listener = vi.fn();
73
- emitter.on('error', listener);
74
-
75
- await emitter.succeed('ok');
76
- expect(listener).not.toHaveBeenCalled();
77
- });
78
- });
package/src/utils/error/error-emitter.ts DELETED
@@ -1,44 +0,0 @@
1
- import type { TypedAwaitEventEmitter as EventEmitter } from '@d-zero/shared/typed-await-event-emitter';
2
-
3
- import { log } from '../debug.js';
4
-
5
- const errorLog = log.extend('ErrorEmitter');
6
-
7
- /**
8
- * Event payload type for error events emitted by classes using the {@link ErrorEmitter} decorator.
9
- * @template E - The specific error type, defaults to `Error`.
10
- */
11
- export type ErrorEvent<E extends Error = Error> = {
12
- /** The error instance that was caught. */
13
- error: E;
14
- };
15
-
16
- /**
17
- * A class method decorator factory that wraps the decorated method with error handling.
18
- * When the method throws an `Error`, it emits an `'error'` event on the class instance
19
- * (which must extend {@link EventEmitter}) with the caught error, then re-throws the error.
20
- * @template C - The class type, which must be an EventEmitter capable of emitting error events.
21
- * @template E - The error event type, defaults to {@link ErrorEvent}.
22
- * @returns A decorator function that wraps the target method with error-emitting behavior.
23
- */
24
- export function ErrorEmitter<
25
- C extends EventEmitter<E>,
26
- E extends ErrorEvent = ErrorEvent,
27
- >() {
28
- // eslint-disable-next-line @typescript-eslint/no-unsafe-function-type
29
- return (method: Function, context: ClassMethodDecoratorContext) => {
30
- return async function (this: C, ...args: unknown[]) {
31
- const constructorName = String(this.constructor?.name || this.constructor || this);
32
- const methodName = `${constructorName}.${String(context.name)}`;
33
- try {
34
- return await method.apply(this, args);
35
- } catch (error: unknown) {
36
- if (error instanceof Error) {
37
- errorLog('%s: %O', methodName, error);
38
- void this.emit('error', error);
39
- }
40
- throw error;
41
- }
42
- };
43
- };
44
- }
package/src/utils/error/index.ts DELETED
@@ -1,3 +0,0 @@
1
- export { DOMEvaluationError } from './dom-evaluation-error.js';
2
- export { ErrorEmitter } from './error-emitter.js';
3
- export type { ErrorEvent } from './error-emitter.js';
package/src/utils/index.ts DELETED
@@ -1,5 +0,0 @@
1
- export * from './types/index.js';
2
- export * from './array/index.js';
3
- export * from './error/index.js';
4
- export * from './object/index.js';
5
- export { globalLog as log } from './debug.js';
package/src/utils/object/clean-object.spec.ts DELETED
@@ -1,24 +0,0 @@
1
- import { describe, it, expect } from 'vitest';
2
-
3
- import { cleanObject } from './clean-object.js';
4
-
5
- describe('cleanObject', () => {
6
- it('removes undefined properties', () => {
7
- const result = cleanObject({ a: 1, b: undefined, c: 'hello' });
8
- expect(result).toEqual({ a: 1, c: 'hello' });
9
- });
10
-
11
- it('keeps null properties', () => {
12
- const result = cleanObject({ a: null, b: 0, c: '' });
13
- expect(result).toEqual({ a: null, b: 0, c: '' });
14
- });
15
-
16
- it('returns empty object for falsy input', () => {
17
- expect(cleanObject()).toEqual({});
18
- });
19
-
20
- it('returns same properties when no undefined values', () => {
21
- const obj = { x: 1, y: 2 };
22
- expect(cleanObject(obj)).toEqual({ x: 1, y: 2 });
23
- });
24
- });
package/src/utils/object/clean-object.ts DELETED
@@ -1,13 +0,0 @@
1
- /**
2
- * Creates a shallow copy of an object with all `undefined`-valued properties removed.
3
- * If the input is falsy (e.g., `undefined` or `null`), returns an empty object.
4
- * @template T - The type of the input object.
5
- * @param obj - The object to clean. If falsy, an empty `Partial<T>` is returned.
6
- * @returns A new object containing only the properties whose values are not `undefined`.
7
- */
8
- export function cleanObject<T extends Record<string, unknown>>(obj?: T) {
9
- if (!obj) {
10
- return {} as Partial<T>;
11
- }
12
- return Object.fromEntries(Object.entries(obj).filter(([, v]) => v !== undefined)) as T;
13
- }
package/src/utils/object/index.ts DELETED
@@ -1 +0,0 @@
1
- export * from './clean-object.js';
package/src/utils/types/index.ts DELETED
@@ -1 +0,0 @@
1
- export * from './types.js';
package/src/utils/types/types.ts DELETED
@@ -1,65 +0,0 @@
1
- import type { ExURL } from '@d-zero/shared/parse-url';
2
-
3
- // beholder is the canonical owner of scraping types — re-export
4
- export type {
5
- PageData,
6
- ImageElement,
7
- SkippedPageData,
8
- Resource,
9
- AnchorData,
10
- Meta,
11
- NetworkLog,
12
- } from '@d-zero/beholder';
13
-
14
- export type { ExURL } from '@d-zero/shared/parse-url';
15
- export type { CompressType } from '@d-zero/shared/detect-compress';
16
- export type { CDNType } from '@d-zero/shared/detect-cdn';
17
-
18
- /**
19
- * Represents a discovered link during crawling, with its metadata from the HEAD request.
20
- */
21
- export interface Link {
22
- /** The parsed URL of the link. */
23
- url: ExURL;
24
-
25
- /** Whether this link points to an external domain. */
26
- isExternal: boolean;
27
-
28
- /** Whether this link is in a lower layer (subdirectory) of a scope URL. */
29
- isLowerLayer: boolean;
30
-
31
- /** Destination data from the HEAD request, present only if the link was fetched. */
32
- dest?: {
33
- /** Chain of redirect URLs traversed. */
34
- redirectPaths: string[];
35
- /** HTTP status code of the final response. */
36
- status: number;
37
- /** HTTP status text of the final response. */
38
- statusText: string;
39
- /** The Content-Type header value, or `null` if unavailable. */
40
- contentType: string | null;
41
- /** The Content-Length header value in bytes, or `null` if unavailable. */
42
- contentLength: number | null;
43
- /** Raw HTTP response headers, or `null` if unavailable. */
44
- responseHeaders: Record<string, string | string[] | undefined> | null;
45
- /** The page title, if available from a title-only scrape. */
46
- title?: string;
47
- };
48
- }
49
-
50
- /**
51
- * An error event emitted during crawling or scraping.
52
- */
53
- export interface CrawlerError {
54
- /** The process ID where the error occurred. */
55
- pid: number;
56
-
57
- /** Whether the error occurred in the main process (as opposed to a sub-process). */
58
- isMainProcess: boolean;
59
-
60
- /** The URL being processed when the error occurred, or `null` if not applicable. */
61
- url: string | null;
62
-
63
- /** The error object. */
64
- error: Error;
65
- }
package/tsconfig.json DELETED
@@ -1,11 +0,0 @@
1
- {
2
- "extends": "../../../tsconfig.json",
3
- "compilerOptions": {
4
- "composite": true,
5
- "outDir": "./lib",
6
- "rootDir": "./src",
7
- "resolveJsonModule": true
8
- },
9
- "include": ["./src/**/*"],
10
- "exclude": ["node_modules", "lib", "./src/**/*.spec.ts"]
11
- }