@nitpicker/crawler 0.4.2 → 0.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119) hide show
  1. package/package.json +5 -2
  2. package/CHANGELOG.md +0 -16
  3. package/src/archive/__mock__/.gitignore +0 -3
  4. package/src/archive/__mock__/mock.sqlite +0 -0
  5. package/src/archive/archive-accessor.ts +0 -337
  6. package/src/archive/archive.ts +0 -408
  7. package/src/archive/database.spec.ts +0 -469
  8. package/src/archive/database.ts +0 -1059
  9. package/src/archive/debug.ts +0 -10
  10. package/src/archive/filesystem/append-text.spec.ts +0 -26
  11. package/src/archive/filesystem/append-text.ts +0 -16
  12. package/src/archive/filesystem/copy-dir-sync.spec.ts +0 -27
  13. package/src/archive/filesystem/copy-dir-sync.ts +0 -10
  14. package/src/archive/filesystem/copy-dir.spec.ts +0 -33
  15. package/src/archive/filesystem/copy-dir.ts +0 -14
  16. package/src/archive/filesystem/exists.spec.ts +0 -33
  17. package/src/archive/filesystem/exists.ts +0 -10
  18. package/src/archive/filesystem/get-file-list.spec.ts +0 -37
  19. package/src/archive/filesystem/get-file-list.ts +0 -13
  20. package/src/archive/filesystem/index.ts +0 -17
  21. package/src/archive/filesystem/is-dir.spec.ts +0 -29
  22. package/src/archive/filesystem/is-dir.ts +0 -11
  23. package/src/archive/filesystem/mkdir.spec.ts +0 -37
  24. package/src/archive/filesystem/mkdir.ts +0 -16
  25. package/src/archive/filesystem/output-json.spec.ts +0 -34
  26. package/src/archive/filesystem/output-json.ts +0 -16
  27. package/src/archive/filesystem/output-text.spec.ts +0 -31
  28. package/src/archive/filesystem/output-text.ts +0 -35
  29. package/src/archive/filesystem/read-json.spec.ts +0 -26
  30. package/src/archive/filesystem/read-json.ts +0 -12
  31. package/src/archive/filesystem/read-text.spec.ts +0 -25
  32. package/src/archive/filesystem/read-text.ts +0 -11
  33. package/src/archive/filesystem/readline.spec.ts +0 -29
  34. package/src/archive/filesystem/readline.ts +0 -30
  35. package/src/archive/filesystem/remove.spec.ts +0 -34
  36. package/src/archive/filesystem/remove.ts +0 -11
  37. package/src/archive/filesystem/rename.spec.ts +0 -46
  38. package/src/archive/filesystem/rename.ts +0 -21
  39. package/src/archive/filesystem/tar.spec.ts +0 -33
  40. package/src/archive/filesystem/tar.ts +0 -27
  41. package/src/archive/filesystem/untar.spec.ts +0 -34
  42. package/src/archive/filesystem/untar.ts +0 -36
  43. package/src/archive/index.ts +0 -13
  44. package/src/archive/page.spec.ts +0 -368
  45. package/src/archive/page.ts +0 -420
  46. package/src/archive/resource.spec.ts +0 -101
  47. package/src/archive/resource.ts +0 -73
  48. package/src/archive/safe-path.spec.ts +0 -44
  49. package/src/archive/safe-path.ts +0 -18
  50. package/src/archive/types.ts +0 -227
  51. package/src/crawler/clear-destination-cache.spec.ts +0 -20
  52. package/src/crawler/clear-destination-cache.ts +0 -9
  53. package/src/crawler/crawler.ts +0 -873
  54. package/src/crawler/decompose-url.spec.ts +0 -48
  55. package/src/crawler/decompose-url.ts +0 -90
  56. package/src/crawler/destination-cache.spec.ts +0 -23
  57. package/src/crawler/destination-cache.ts +0 -8
  58. package/src/crawler/detect-pagination-pattern.spec.ts +0 -169
  59. package/src/crawler/detect-pagination-pattern.ts +0 -66
  60. package/src/crawler/fetch-destination.ts +0 -257
  61. package/src/crawler/fetch-robots-txt.spec.ts +0 -83
  62. package/src/crawler/fetch-robots-txt.ts +0 -91
  63. package/src/crawler/find-best-matching-scope.spec.ts +0 -39
  64. package/src/crawler/find-best-matching-scope.ts +0 -57
  65. package/src/crawler/generate-predicted-urls.spec.ts +0 -42
  66. package/src/crawler/generate-predicted-urls.ts +0 -34
  67. package/src/crawler/handle-ignore-and-skip.spec.ts +0 -66
  68. package/src/crawler/handle-ignore-and-skip.ts +0 -30
  69. package/src/crawler/handle-resource-response.spec.ts +0 -45
  70. package/src/crawler/handle-resource-response.ts +0 -21
  71. package/src/crawler/handle-scrape-end.spec.ts +0 -109
  72. package/src/crawler/handle-scrape-end.ts +0 -115
  73. package/src/crawler/handle-scrape-error.spec.ts +0 -105
  74. package/src/crawler/handle-scrape-error.ts +0 -58
  75. package/src/crawler/index.ts +0 -2
  76. package/src/crawler/inject-scope-auth.spec.ts +0 -36
  77. package/src/crawler/inject-scope-auth.ts +0 -27
  78. package/src/crawler/is-external-url.spec.ts +0 -31
  79. package/src/crawler/is-external-url.ts +0 -17
  80. package/src/crawler/is-in-any-lower-layer.spec.ts +0 -31
  81. package/src/crawler/is-in-any-lower-layer.ts +0 -22
  82. package/src/crawler/link-list.spec.ts +0 -355
  83. package/src/crawler/link-list.ts +0 -275
  84. package/src/crawler/link-to-page-data.spec.ts +0 -133
  85. package/src/crawler/link-to-page-data.ts +0 -34
  86. package/src/crawler/net-timeout-error.spec.ts +0 -25
  87. package/src/crawler/net-timeout-error.ts +0 -11
  88. package/src/crawler/protocol-agnostic-key.spec.ts +0 -40
  89. package/src/crawler/protocol-agnostic-key.ts +0 -11
  90. package/src/crawler/reconstruct-url.spec.ts +0 -37
  91. package/src/crawler/reconstruct-url.ts +0 -37
  92. package/src/crawler/robots-checker.spec.ts +0 -104
  93. package/src/crawler/robots-checker.ts +0 -73
  94. package/src/crawler/should-discard-predicted.spec.ts +0 -125
  95. package/src/crawler/should-discard-predicted.ts +0 -33
  96. package/src/crawler/should-skip-url.spec.ts +0 -77
  97. package/src/crawler/should-skip-url.ts +0 -37
  98. package/src/crawler/types.ts +0 -146
  99. package/src/crawler-orchestrator.ts +0 -401
  100. package/src/debug.ts +0 -10
  101. package/src/index.ts +0 -25
  102. package/src/types.ts +0 -30
  103. package/src/utils/array/each-splitted.spec.ts +0 -38
  104. package/src/utils/array/each-splitted.ts +0 -19
  105. package/src/utils/array/index.ts +0 -1
  106. package/src/utils/debug.ts +0 -6
  107. package/src/utils/error/dom-evaluation-error.spec.ts +0 -20
  108. package/src/utils/error/dom-evaluation-error.ts +0 -6
  109. package/src/utils/error/error-emitter.spec.ts +0 -78
  110. package/src/utils/error/error-emitter.ts +0 -44
  111. package/src/utils/error/index.ts +0 -3
  112. package/src/utils/index.ts +0 -5
  113. package/src/utils/object/clean-object.spec.ts +0 -24
  114. package/src/utils/object/clean-object.ts +0 -13
  115. package/src/utils/object/index.ts +0 -1
  116. package/src/utils/types/index.ts +0 -1
  117. package/src/utils/types/types.ts +0 -65
  118. package/tsconfig.json +0 -11
  119. package/tsconfig.tsbuildinfo +0 -1
@@ -1,77 +0,0 @@
1
- import type { ParseURLOptions } from '@d-zero/shared/parse-url';
2
-
3
- import { tryParseUrl as parseUrl } from '@d-zero/shared/parse-url';
4
- import { describe, it, expect } from 'vitest';
5
-
6
- import { shouldSkipUrl } from './should-skip-url.js';
7
-
8
const defaultOptions: ParseURLOptions = {};

/**
 * Parse `href` and evaluate it with `shouldSkipUrl` using the given
 * exclusion lists and the shared default parse options.
 * @param href - Absolute URL string to evaluate.
 * @param excludes - Glob exclude patterns.
 * @param excludeUrls - URL prefixes to exclude.
 * @returns The value returned by `shouldSkipUrl`.
 */
function evaluate(
  href: string,
  excludes: readonly string[],
  excludeUrls: readonly string[],
): boolean {
  const url = parseUrl(href)!;
  return shouldSkipUrl({ url, excludes, excludeUrls, options: defaultOptions });
}

describe('shouldSkipUrl', () => {
  it('returns true when URL matches a glob exclude pattern', () => {
    const skipped = evaluate('https://example.com/secret/page', ['/secret/**/*'], []);
    expect(skipped).toBe(true);
  });

  it('returns true when URL matches a prefix exclude', () => {
    const skipped = evaluate(
      'https://example.com/admin/settings',
      [],
      ['https://example.com/admin/'],
    );
    expect(skipped).toBe(true);
  });

  it('returns false when URL matches neither', () => {
    const skipped = evaluate(
      'https://example.com/public/page',
      ['/secret/**/*'],
      ['https://example.com/admin/'],
    );
    expect(skipped).toBe(false);
  });

  it('returns false with empty exclude lists', () => {
    const skipped = evaluate('https://example.com/page', [], []);
    expect(skipped).toBe(false);
  });

  it('matches HTTP URL against HTTPS excludeUrls prefix', () => {
    const skipped = evaluate('http://twitter.com/user', [], ['https://twitter.com']);
    expect(skipped).toBe(true);
  });

  it('matches HTTPS URL against HTTP excludeUrls prefix', () => {
    const skipped = evaluate('https://twitter.com/user', [], ['http://twitter.com']);
    expect(skipped).toBe(true);
  });
});
@@ -1,37 +0,0 @@
1
- import type { ExURL, ParseURLOptions } from '@d-zero/shared/parse-url';
2
-
3
- import { pathMatch } from '@d-zero/shared/path-match';
4
-
5
- import { protocolAgnosticKey } from './protocol-agnostic-key.js';
6
-
7
/**
 * Input accepted by {@link shouldSkipUrl}.
 */
export interface ShouldSkipUrlParams {
  /** Parsed URL under evaluation. */
  readonly url: ExURL;

  /** Glob patterns; a URL matching any one of them is excluded. */
  readonly excludes: readonly string[];

  /** URL prefixes; a URL beginning with any one of them is excluded (`startsWith` comparison). */
  readonly excludeUrls: readonly string[];

  /** URL parsing options handed to the pattern matcher. */
  readonly options: ParseURLOptions;
}
20
-
21
/**
 * Determine whether a URL should be skipped during crawling.
 *
 * A URL is skipped when it matches any user-defined exclude glob pattern, or
 * when its protocol-agnostic key starts with the protocol-agnostic key of any
 * excluded URL prefix (so an `http://` URL can match an `https://` prefix and
 * vice versa).
 * @param params - Parameters containing the URL, exclude patterns, and options.
 * @returns `true` if the URL should be skipped.
 */
export function shouldSkipUrl(params: ShouldSkipUrlParams): boolean {
  const { url, excludes, excludeUrls, options } = params;

  if (excludes.some((pattern) => pathMatch(url, pattern, options))) {
    return true;
  }

  if (excludeUrls.length === 0) {
    return false;
  }

  // The URL's own key does not depend on the prefix being tested, so it is
  // derived once here instead of once per prefix inside the callback.
  const urlKey = protocolAgnosticKey(url.href);
  return excludeUrls.some((prefix) => urlKey.startsWith(protocolAgnosticKey(prefix)));
}
@@ -1,146 +0,0 @@
1
- import type { PageData, CrawlerError, Resource } from '../utils/index.js';
2
- import type { ChangePhaseEvent } from '@d-zero/beholder';
3
- import type { ParseURLOptions } from '@d-zero/shared/parse-url';
4
-
5
/**
 * Settings that govern crawler behavior.
 *
 * The result handler functions consult these values to decide how scrape
 * results are processed, which URLs are followed, and how external links
 * are treated.
 * @see {@link ./crawler.ts | Crawler} for the main consumer of this type
 * @see {@link ../crawler-orchestrator.ts | CrawlerOrchestrator} for factory methods that build these options
 */
export interface CrawlerOptions
  extends Required<Pick<ParseURLOptions, 'disableQueries'>> {
  /** Pause between page requests, in milliseconds. */
  interval: number;

  /** Upper bound on concurrent scraping processes; 0 selects the default. */
  parallels: number;

  /** When set, links discovered inside the scope are followed recursively. */
  recursive: boolean;

  /** Set when the crawl was started from a pre-defined URL list. */
  fromList: boolean;

  /** When set, image resources are captured while scraping. */
  captureImages: boolean;

  /** Location of the Chromium/Chrome executable; `null` selects the bundled version. */
  executablePath: string | null;

  /** When set, external (out-of-scope) pages are fetched and scraped as well. */
  fetchExternal: boolean;

  /** Scope URL strings that mark the boundary of the crawl. */
  scope: string[];

  /** Glob patterns of URLs that must not be crawled. */
  excludes: string[];

  /** Keywords whose presence in page content causes the page to be excluded. */
  excludeKeywords: string[];

  /** URL prefixes that must not be crawled (defaults merged with user additions). */
  excludeUrls: readonly string[];

  /** Maximum directory depth for excluded paths. */
  maxExcludedDepth: number;

  /** How many times a URL is retried at most after a scrape failure. */
  retry: number;

  /** Turns verbose logging on. */
  verbose: boolean;

  /** User-Agent string sent with HTTP requests. */
  userAgent: string;

  /** When set, robots.txt restrictions are ignored. */
  ignoreRobots: boolean;
}
64
-
65
/**
 * A pagination pattern detected between two consecutive URLs.
 */
export interface PaginationPattern {
  /**
   * Position of the numeric difference within the combined token array
   * (path segments followed by query values).
   */
  tokenIndex: number;

  /** Numeric increment between the two URLs (always > 0). */
  step: number;

  /** Number located at `tokenIndex` in the "current" URL. */
  currentNumber: number;
}
76
-
77
/**
 * Event map of the `Crawler` class.
 *
 * Keys are event names; each value is the payload type handed to listeners
 * registered through `on()` or `once()`.
 */
export interface CrawlerEventTypes {
  /** Fired after a page inside the crawl scope has been scraped successfully. */
  page: {
    /** Scraped page data, including HTML, metadata, anchors, and images. */
    result: PageData;
  };

  /** Fired after an external page (outside the crawl scope) has been scraped. */
  externalPage: {
    /** Scraped page data of the external page. */
    result: PageData;
  };

  /**
   * Fired when a URL is skipped because of exclusion rules, robots.txt
   * restrictions, or external fetching being disabled.
   */
  skip: {
    /** URL that was skipped. */
    url: string;
    /** Why the URL was skipped (e.g., "excluded", "blocked by robots.txt", or a JSON description). */
    reason: string;
    /** Whether the skipped URL lies outside the crawl scope. */
    isExternal: boolean;
  };

  /** Fired when a network resource (CSS, JS, image, etc.) is captured while scraping a page. */
  response: {
    /** Captured resource data, including URL, status, content type, and headers. */
    resource: Resource;
  };

  /** Fired to record which page references which resource. */
  responseReferrers: {
    /** URL of the page that references the resource. */
    url: string;
    /** URL of the referenced resource (without hash). */
    src: string;
  };

  /** Fired once the whole crawl has completed or has been aborted. */
  crawlEnd: Record<string, unknown>;

  /** Fired when an error occurs during crawling. */
  error: CrawlerError;

  /**
   * Fired when the scraper moves between phases of the page scraping
   * lifecycle (e.g., scrapeStart, headRequest, openPage, success).
   */
  changePhase: ChangePhaseEvent;
}
@@ -1,401 +0,0 @@
1
- import type { Config } from './archive/types.js';
2
- import type { CrawlEvent } from './types.js';
3
- import type { ExURL } from '@d-zero/shared/parse-url';
4
-
5
- import { tryParseUrl as parseUrl } from '@d-zero/shared/parse-url';
6
- import { sortUrl } from '@d-zero/shared/sort-url';
7
- import { TypedAwaitEventEmitter as EventEmitter } from '@d-zero/shared/typed-await-event-emitter';
8
-
9
- import pkg from '../package.json' with { type: 'json' };
10
-
11
- import Archive from './archive/archive.js';
12
- import { clearDestinationCache, Crawler } from './crawler/index.js';
13
- import { crawlerLog, log } from './debug.js';
14
- import { cleanObject } from './utils/index.js';
15
-
16
/**
 * External URL prefixes that are excluded from crawling by default.
 *
 * The entries are social media sharing endpoints and similar destinations
 * that are commonly linked but provide no useful crawl data.
 */
export const DEFAULT_EXCLUDED_EXTERNAL_URLS: string[] = [
  // LINE
  'https://social-plugins.line.me',
  'https://access.line.me',
  'https://lineit.line.me',
  'https://line.me',
  // Google+
  'https://plus.google.com',
  // Twitter / X
  'https://twitter.com',
  'https://x.com',
  // Facebook sharing endpoints
  'https://www.facebook.com/share.php',
  'https://www.facebook.com/share/',
  'https://www.facebook.com/sharer/',
  'https://www.facebook.com/share_channel/',
  // Google
  'https://www.google.com',
];
35
-
36
/**
 * Options understood by the CrawlerOrchestrator.
 *
 * Builds on the archive {@link Config} and adds runtime settings such as
 * the working directory, the browser executable path, and output options.
 */
interface CrawlConfig extends Config {
  /** Working directory for output files. Defaults to `process.cwd()`. */
  cwd: string;

  /** Location of a Chromium/Chrome executable for Puppeteer. */
  executablePath: string;

  /** Output file path of the archive. */
  filePath: string;

  /** When set, image resources are captured during crawling. */
  image: boolean;

  /** Size limit (in bytes); images larger than this are excluded. */
  imageFileSizeThreshold: number;

  /** Pause between page requests, in milliseconds. */
  interval: number;

  /** Set when the input is a pre-defined URL list (non-recursive mode). */
  list: boolean;

  /** How many times a URL is retried at most after a scrape failure. */
  retry: number;

  /** Turns verbose logging output on. */
  verbose: boolean;

  /** Custom User-Agent string for HTTP requests. */
  userAgent: string;

  /** When set, robots.txt restrictions are ignored. */
  ignoreRobots: boolean;
}
76
-
77
/**
 * Hook that runs once the CrawlerOrchestrator instance has been fully
 * initialized, before crawling begins. It may be synchronous or return a
 * promise.
 * @param orchestrator - The initialized CrawlerOrchestrator instance.
 * @param config - The resolved archive configuration.
 */
type CrawlInitializedCallback = (
  orchestrator: CrawlerOrchestrator,
  config: Config,
) => void | Promise<void>;
87
-
88
/**
 * The main entry point for Nitpicker web crawling and archiving.
 *
 * CrawlerOrchestrator orchestrates the full lifecycle of a crawl session: it creates an archive,
 * configures a {@link Crawler}, processes discovered pages and resources, and
 * writes the final archive file. It emits events defined by {@link CrawlEvent}.
 *
 * Instances are created via the static factory methods {@link CrawlerOrchestrator.crawling}
 * or {@link CrawlerOrchestrator.resume}; the constructor is private.
 * @example
 * ```ts
 * const orchestrator = await CrawlerOrchestrator.crawling(['https://example.com'], { recursive: true });
 * await orchestrator.write();
 * ```
 */
export class CrawlerOrchestrator extends EventEmitter<CrawlEvent> {
  /** The archive instance for persisting crawl results to SQLite + tar. */
  readonly #archive: Archive;
  /** The crawler engine that discovers and scrapes pages. */
  readonly #crawler: Crawler;
  /** Whether the crawl was started from a pre-defined URL list (non-recursive mode). */
  readonly #fromList: boolean;

  /**
   * The underlying archive instance used for storing crawl results.
   */
  get archive() {
    return this.#archive;
  }

  /**
   * Wire the archive's error events to this orchestrator and build the crawler.
   *
   * Defaults applied here mirror the ones written to the archived config by
   * the static `crawling` factory, so the crawler and the stored config agree
   * when an option is omitted.
   * @param archive - The archive that receives crawl results.
   * @param options - Optional configuration overrides.
   */
  // eslint-disable-next-line no-restricted-syntax
  private constructor(archive: Archive, options?: Partial<CrawlConfig>) {
    super();

    // NOTE(review): the archived config stores this flag under `fromList`
    // (see `setConfig` in the static `crawling`), so a config restored by
    // `resume` leaves `list` unset unless the caller overrides it — confirm
    // that resuming a list crawl through `start()` is intended.
    this.#fromList = !!options?.list;
    this.#archive = archive;
    this.#archive.on('error', (e) => {
      // An archive failure stops the crawler and is re-emitted as a crawl error.
      this.#crawler.abort();
      void this.emit('error', {
        pid: process.pid,
        isMainProcess: true,
        url: null,
        error: e instanceof Error ? e : new Error(String(e)),
      });
    });

    const defaultUserAgent = `Nitpicker/${pkg.version}`;
    this.#crawler = new Crawler({
      interval: options?.interval || 0,
      parallels: options?.parallels || 0,
      // Same default as the archived `image` setting, instead of `undefined`.
      captureImages: options?.image ?? true,
      executablePath: options?.executablePath || null,
      fetchExternal: options?.fetchExternal ?? true,
      recursive: options?.recursive ?? true,
      scope: options?.scope ?? [],
      excludes: normalizeToArray(options?.excludes),
      excludeKeywords: normalizeToArray(options?.excludeKeywords),
      // A resumed config already contains the defaults, so duplicates are
      // dropped here; first-seen order is kept.
      excludeUrls: [
        ...new Set([
          ...DEFAULT_EXCLUDED_EXTERNAL_URLS,
          ...normalizeToArray(options?.excludeUrls),
        ]),
      ],
      maxExcludedDepth: options?.maxExcludedDepth || 10,
      retry: options?.retry ?? 3,
      // Same default as the archived `disableQueries` setting, instead of `undefined`.
      disableQueries: options?.disableQueries ?? false,
      verbose: options?.verbose ?? false,
      userAgent: options?.userAgent || defaultUserAgent,
      ignoreRobots: options?.ignoreRobots ?? false,
    });
  }

  /**
   * Abort the archive operations.
   *
   * Delegates to the archive's `abort()` method. This method does not call
   * the crawler's `abort()` itself; the crawler is only aborted from the
   * archive `error` listener registered in the constructor.
   * @returns The result of the archive abort operation.
   */
  abort() {
    return this.#archive.abort();
  }

  /**
   * Execute the crawl for the given list of URLs.
   *
   * Sets up event listeners on the crawler, starts crawling, and resolves
   * when the crawler emits `crawlEnd`. Discovered pages, external pages,
   * skipped pages, and resources are forwarded to the archive for storage;
   * if storing any of them fails, the returned promise rejects.
   * @param list - The list of parsed URLs to crawl. The first URL is used as the root.
   * @returns A promise that resolves when crawling is complete.
   * @throws {Error} If the URL list is empty.
   */
  async crawling(list: ExURL[]) {
    const root = list[0];

    if (!root) {
      throw new Error('URL is empty');
    }

    return new Promise<void>((resolve, reject) => {
      this.#crawler.on('error', (error) => {
        crawlerLog('On error: %O', error);
        void this.#archive.addError(error);
        void this.emit('error', error);
      });

      this.#crawler.on('page', async ({ result }) => {
        await this.#archive.setPage(result).catch((error) => reject(error));
      });

      this.#crawler.on('externalPage', ({ result }) => {
        this.#archive.setExternalPage(result).catch((error) => reject(error));
      });

      this.#crawler.on('skip', ({ url, reason, isExternal }) => {
        this.#archive
          .setSkippedPage(url, reason, isExternal)
          .catch((error) => reject(error));
      });

      this.#crawler.on('response', ({ resource }) => {
        this.#archive.setResources(resource).catch((error) => reject(error));
      });

      this.#crawler.on('responseReferrers', (resource) => {
        this.#archive.setResourcesReferrers(resource).catch((error) => reject(error));
      });

      this.#crawler.on('crawlEnd', () => {
        resolve();
      });

      // List mode hands every URL to the crawler; otherwise only the root is used.
      if (this.#fromList) {
        this.#crawler.startMultiple(list);
      } else {
        this.#crawler.start(root);
      }
    });
  }

  /**
   * Kill any Chromium processes that were not properly cleaned up.
   *
   * Retrieves the list of undead process IDs from the crawler and calls
   * `process.kill(pid)` for each one without an explicit signal, so the
   * default signal (SIGTERM) is sent rather than SIGKILL. A failure to kill
   * one process is logged and does not stop the loop.
   */
  garbageCollect() {
    const pidList = this.getUndeadPid();
    log('Undead PIDs: %O', pidList);
    for (const pid of pidList) {
      try {
        log('Garbage collect: kill PID:%d', pid);
        // Chromium becomes a zombie process when it is sent SIGKILL,
        // so the default signal is used instead.
        process.kill(pid);
      } catch (error) {
        log('Garbage collect: Failed killing PID:%d %O', pid, error);
      }
    }
  }

  /**
   * Retrieve the list of process IDs for Chromium instances that are
   * still running after crawling has ended.
   * @returns An array of process IDs that should be terminated.
   */
  getUndeadPid() {
    return this.#crawler.getUndeadPid();
  }

  /**
   * Write the archive to its configured file path.
   *
   * Emits `writeFileStart` before writing and `writeFileEnd` after
   * the write completes successfully.
   */
  async write() {
    void this.emit('writeFileStart', { filePath: this.#archive.filePath });
    await this.#archive.write();
    void this.emit('writeFileEnd', { filePath: this.#archive.filePath });
  }

  /**
   * Create a new CrawlerOrchestrator instance and start crawling the given URLs.
   *
   * This is the primary factory method for starting a fresh crawl. It:
   * 1. Parses and sorts the input URLs
   * 2. Creates an archive file
   * 3. Saves the crawl configuration
   * 4. Runs the optional initialized callback
   * 5. Executes the crawl
   * 6. Sorts the archived URLs in natural order
   * @param url - One or more URL strings to crawl.
   * @param options - Optional configuration overrides for the crawl session.
   * @param initializedCallback - Optional callback invoked after initialization but before crawling starts.
   * @returns A promise that resolves to the CrawlerOrchestrator instance after crawling completes.
   * @throws {Error} If the URL list is empty or contains no valid URLs.
   */
  static async crawling(
    url: string[],
    options?: Partial<CrawlConfig>,
    initializedCallback?: CrawlInitializedCallback,
  ) {
    const list = sortUrl(url, options);
    const urlParsed = list[0];

    if (!urlParsed) {
      throw new Error('URL is empty');
    }

    // The archive file is named after the first URL's hostname plus a timestamp.
    const fileName = `${urlParsed.hostname}-${Archive.timestamp()}`;
    const cwd = options?.cwd ?? process.cwd();
    const filePath = Archive.joinPath(cwd, `${fileName}.${Archive.FILE_EXTENSION}`);
    const disableQueries = options?.disableQueries || false;
    const defaultUserAgent = `Nitpicker/${pkg.version}`;
    const archive = await Archive.create({ filePath, cwd, disableQueries });
    await archive.setConfig({
      version: pkg.version,
      name: fileName,
      baseUrl: urlParsed.withoutHash,
      recursive: options?.recursive ?? true,
      fetchExternal: options?.fetchExternal ?? true,
      image: options?.image ?? true,
      interval: options?.interval || 0,
      parallels: options?.parallels || 0,
      scope: options?.scope ?? [],
      // @ts-expect-error TODO: Fix CLI arguments
      excludes: normalizeToArray(options?.exclude),
      // @ts-expect-error TODO: Fix CLI arguments
      excludeKeywords: normalizeToArray(options?.excludeKeyword),
      excludeUrls: [
        ...DEFAULT_EXCLUDED_EXTERNAL_URLS,
        // @ts-expect-error TODO: Fix CLI arguments
        ...normalizeToArray(options?.excludeUrl),
      ],
      maxExcludedDepth: options?.maxExcludedDepth || 10,
      retry: options?.retry ?? 3,
      fromList: !!options?.list,
      disableQueries,
      userAgent: options?.userAgent || defaultUserAgent,
      ignoreRobots: options?.ignoreRobots ?? false,
    });
    const orchestrator = new CrawlerOrchestrator(archive, options);
    const config = await archive.getConfig();
    if (initializedCallback) {
      await initializedCallback(orchestrator, config);
    }
    log('Start crawling');
    log(
      'URL %O',
      list.map((url) => url.href),
    );
    log('Config %O', config);
    await orchestrator.crawling(list);
    log('Crawling completed');
    clearDestinationCache();
    log('Set order natural URL sort');
    await archive.setUrlOrder();
    log('Sorting done');
    return orchestrator;
  }

  /**
   * Resume a previously interrupted crawl from an existing archive file.
   *
   * Restores the crawl state (pending URLs, scraped URLs, and resources)
   * from the archive, merges any option overrides on top of the archived
   * config, and continues crawling from where it left off.
   * @param stubPath - Path to the existing archive file to resume from.
   * @param options - Optional configuration overrides to apply on top of the archived config.
   * @param initializedCallback - Optional callback invoked after initialization but before crawling resumes.
   * @returns A promise that resolves to the CrawlerOrchestrator instance after crawling completes.
   * @throws {Error} If the archived URL is invalid.
   */
  static async resume(
    stubPath: string,
    options?: Partial<CrawlConfig>,
    initializedCallback?: CrawlInitializedCallback,
  ) {
    const archive = await Archive.resume(stubPath);
    const archivedConfig = await archive.getConfig();
    // Overrides win over archived values; `cleanObject` is applied to the
    // overrides before merging.
    const config = {
      ...archivedConfig,
      ...cleanObject(options),
    };
    const orchestrator = new CrawlerOrchestrator(archive, config);
    const _url = await archive.getUrl();
    const url = parseUrl(_url, config);
    if (!url) {
      throw new Error(`URL (${_url}) is invalid`);
    }
    const { scraped, pending } = await archive.getCrawlingState();
    const resources = await archive.getResourceUrlList();
    orchestrator.#crawler.resume(pending, scraped, resources);
    if (initializedCallback) {
      await initializedCallback(orchestrator, config);
    }
    log('Start resuming');
    log('Data %s', stubPath);
    log('URL %s', url.href);
    log('Config %O', config);
    await orchestrator.crawling([url]);
    return orchestrator;
  }
}
392
-
393
/**
 * Coerce an optional "one or many" parameter into an array.
 *
 * An array is returned as-is (same reference). A single truthy value is
 * wrapped in a one-element array. Anything falsy — `null`, `undefined`, and
 * also values such as `''` or `0` — produces an empty array.
 * @param param - The parameter to normalize.
 * @returns An array containing the parameter value(s), or an empty array if absent.
 */
function normalizeToArray<T>(param: T | T[] | null | undefined) {
  if (Array.isArray(param)) {
    return param;
  }
  if (param) {
    return [param];
  }
  return [];
}
package/src/debug.ts DELETED
@@ -1,10 +0,0 @@
1
- import { log as globalLog } from './utils/debug.js';
2
-
3
/** Root debug logger of the core package. Namespace: `Nitpicker`. */
export const log = globalLog;

/** Logger for the crawler module, derived from the root logger. Namespace: `Nitpicker:Crawler`. */
export const crawlerLog = globalLog.extend('Crawler');

/** Logger for the dealer integration, derived from the crawler logger. Namespace: `Nitpicker:Crawler:Deal`. */
export const dealLog = crawlerLog.extend('Deal');

/** Logger for crawler errors, derived from the crawler logger. Namespace: `Nitpicker:Crawler:Error`. */
export const crawlerErrorLog = crawlerLog.extend('Error');
package/src/index.ts DELETED
@@ -1,25 +0,0 @@
1
- /**
2
- * @module @nitpicker/crawler
3
- *
4
- * Core module of Nitpicker that provides the main crawling engine,
5
- * utility functions, type definitions, and archive storage layer.
6
- */
7
-
8
- // Types + Utils (旧 @nitpicker/types + utils)
9
- export * from './utils/index.js';
10
-
11
- // Archive
12
- export { ArchiveAccessor } from './archive/archive-accessor.js';
13
- export type { Redirect, Referrer, Anchor, StaticPageData } from './archive/page.js';
14
- export { default as Page } from './archive/page.js';
15
- export { default as ArchiveResource } from './archive/resource.js';
16
- export * from './archive/types.js';
17
- export { default as Archive } from './archive/archive.js';
18
-
19
- // Core
20
- export {
21
- DEFAULT_EXCLUDED_EXTERNAL_URLS,
22
- CrawlerOrchestrator,
23
- } from './crawler-orchestrator.js';
24
- export * from './types.js';
25
- export * from './crawler/types.js';