@nitpicker/crawler 0.4.2 → 0.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119)
  1. package/package.json +5 -2
  2. package/CHANGELOG.md +0 -16
  3. package/src/archive/__mock__/.gitignore +0 -3
  4. package/src/archive/__mock__/mock.sqlite +0 -0
  5. package/src/archive/archive-accessor.ts +0 -337
  6. package/src/archive/archive.ts +0 -408
  7. package/src/archive/database.spec.ts +0 -469
  8. package/src/archive/database.ts +0 -1059
  9. package/src/archive/debug.ts +0 -10
  10. package/src/archive/filesystem/append-text.spec.ts +0 -26
  11. package/src/archive/filesystem/append-text.ts +0 -16
  12. package/src/archive/filesystem/copy-dir-sync.spec.ts +0 -27
  13. package/src/archive/filesystem/copy-dir-sync.ts +0 -10
  14. package/src/archive/filesystem/copy-dir.spec.ts +0 -33
  15. package/src/archive/filesystem/copy-dir.ts +0 -14
  16. package/src/archive/filesystem/exists.spec.ts +0 -33
  17. package/src/archive/filesystem/exists.ts +0 -10
  18. package/src/archive/filesystem/get-file-list.spec.ts +0 -37
  19. package/src/archive/filesystem/get-file-list.ts +0 -13
  20. package/src/archive/filesystem/index.ts +0 -17
  21. package/src/archive/filesystem/is-dir.spec.ts +0 -29
  22. package/src/archive/filesystem/is-dir.ts +0 -11
  23. package/src/archive/filesystem/mkdir.spec.ts +0 -37
  24. package/src/archive/filesystem/mkdir.ts +0 -16
  25. package/src/archive/filesystem/output-json.spec.ts +0 -34
  26. package/src/archive/filesystem/output-json.ts +0 -16
  27. package/src/archive/filesystem/output-text.spec.ts +0 -31
  28. package/src/archive/filesystem/output-text.ts +0 -35
  29. package/src/archive/filesystem/read-json.spec.ts +0 -26
  30. package/src/archive/filesystem/read-json.ts +0 -12
  31. package/src/archive/filesystem/read-text.spec.ts +0 -25
  32. package/src/archive/filesystem/read-text.ts +0 -11
  33. package/src/archive/filesystem/readline.spec.ts +0 -29
  34. package/src/archive/filesystem/readline.ts +0 -30
  35. package/src/archive/filesystem/remove.spec.ts +0 -34
  36. package/src/archive/filesystem/remove.ts +0 -11
  37. package/src/archive/filesystem/rename.spec.ts +0 -46
  38. package/src/archive/filesystem/rename.ts +0 -21
  39. package/src/archive/filesystem/tar.spec.ts +0 -33
  40. package/src/archive/filesystem/tar.ts +0 -27
  41. package/src/archive/filesystem/untar.spec.ts +0 -34
  42. package/src/archive/filesystem/untar.ts +0 -36
  43. package/src/archive/index.ts +0 -13
  44. package/src/archive/page.spec.ts +0 -368
  45. package/src/archive/page.ts +0 -420
  46. package/src/archive/resource.spec.ts +0 -101
  47. package/src/archive/resource.ts +0 -73
  48. package/src/archive/safe-path.spec.ts +0 -44
  49. package/src/archive/safe-path.ts +0 -18
  50. package/src/archive/types.ts +0 -227
  51. package/src/crawler/clear-destination-cache.spec.ts +0 -20
  52. package/src/crawler/clear-destination-cache.ts +0 -9
  53. package/src/crawler/crawler.ts +0 -873
  54. package/src/crawler/decompose-url.spec.ts +0 -48
  55. package/src/crawler/decompose-url.ts +0 -90
  56. package/src/crawler/destination-cache.spec.ts +0 -23
  57. package/src/crawler/destination-cache.ts +0 -8
  58. package/src/crawler/detect-pagination-pattern.spec.ts +0 -169
  59. package/src/crawler/detect-pagination-pattern.ts +0 -66
  60. package/src/crawler/fetch-destination.ts +0 -257
  61. package/src/crawler/fetch-robots-txt.spec.ts +0 -83
  62. package/src/crawler/fetch-robots-txt.ts +0 -91
  63. package/src/crawler/find-best-matching-scope.spec.ts +0 -39
  64. package/src/crawler/find-best-matching-scope.ts +0 -57
  65. package/src/crawler/generate-predicted-urls.spec.ts +0 -42
  66. package/src/crawler/generate-predicted-urls.ts +0 -34
  67. package/src/crawler/handle-ignore-and-skip.spec.ts +0 -66
  68. package/src/crawler/handle-ignore-and-skip.ts +0 -30
  69. package/src/crawler/handle-resource-response.spec.ts +0 -45
  70. package/src/crawler/handle-resource-response.ts +0 -21
  71. package/src/crawler/handle-scrape-end.spec.ts +0 -109
  72. package/src/crawler/handle-scrape-end.ts +0 -115
  73. package/src/crawler/handle-scrape-error.spec.ts +0 -105
  74. package/src/crawler/handle-scrape-error.ts +0 -58
  75. package/src/crawler/index.ts +0 -2
  76. package/src/crawler/inject-scope-auth.spec.ts +0 -36
  77. package/src/crawler/inject-scope-auth.ts +0 -27
  78. package/src/crawler/is-external-url.spec.ts +0 -31
  79. package/src/crawler/is-external-url.ts +0 -17
  80. package/src/crawler/is-in-any-lower-layer.spec.ts +0 -31
  81. package/src/crawler/is-in-any-lower-layer.ts +0 -22
  82. package/src/crawler/link-list.spec.ts +0 -355
  83. package/src/crawler/link-list.ts +0 -275
  84. package/src/crawler/link-to-page-data.spec.ts +0 -133
  85. package/src/crawler/link-to-page-data.ts +0 -34
  86. package/src/crawler/net-timeout-error.spec.ts +0 -25
  87. package/src/crawler/net-timeout-error.ts +0 -11
  88. package/src/crawler/protocol-agnostic-key.spec.ts +0 -40
  89. package/src/crawler/protocol-agnostic-key.ts +0 -11
  90. package/src/crawler/reconstruct-url.spec.ts +0 -37
  91. package/src/crawler/reconstruct-url.ts +0 -37
  92. package/src/crawler/robots-checker.spec.ts +0 -104
  93. package/src/crawler/robots-checker.ts +0 -73
  94. package/src/crawler/should-discard-predicted.spec.ts +0 -125
  95. package/src/crawler/should-discard-predicted.ts +0 -33
  96. package/src/crawler/should-skip-url.spec.ts +0 -77
  97. package/src/crawler/should-skip-url.ts +0 -37
  98. package/src/crawler/types.ts +0 -146
  99. package/src/crawler-orchestrator.ts +0 -401
  100. package/src/debug.ts +0 -10
  101. package/src/index.ts +0 -25
  102. package/src/types.ts +0 -30
  103. package/src/utils/array/each-splitted.spec.ts +0 -38
  104. package/src/utils/array/each-splitted.ts +0 -19
  105. package/src/utils/array/index.ts +0 -1
  106. package/src/utils/debug.ts +0 -6
  107. package/src/utils/error/dom-evaluation-error.spec.ts +0 -20
  108. package/src/utils/error/dom-evaluation-error.ts +0 -6
  109. package/src/utils/error/error-emitter.spec.ts +0 -78
  110. package/src/utils/error/error-emitter.ts +0 -44
  111. package/src/utils/error/index.ts +0 -3
  112. package/src/utils/index.ts +0 -5
  113. package/src/utils/object/clean-object.spec.ts +0 -24
  114. package/src/utils/object/clean-object.ts +0 -13
  115. package/src/utils/object/index.ts +0 -1
  116. package/src/utils/types/index.ts +0 -1
  117. package/src/utils/types/types.ts +0 -65
  118. package/tsconfig.json +0 -11
  119. package/tsconfig.tsbuildinfo +0 -1
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@nitpicker/crawler",
3
- "version": "0.4.2",
3
+ "version": "0.4.3",
4
4
  "description": "Web crawler engine with headless browser rendering and archive storage",
5
5
  "author": "D-ZERO",
6
6
  "license": "Apache-2.0",
@@ -12,6 +12,9 @@
12
12
  "publishConfig": {
13
13
  "access": "public"
14
14
  },
15
+ "files": [
16
+ "lib"
17
+ ],
15
18
  "type": "module",
16
19
  "exports": {
17
20
  ".": {
@@ -45,5 +48,5 @@
45
48
  "@types/tar": "7.0.87",
46
49
  "@types/unzipper": "0.10.11"
47
50
  },
48
- "gitHead": "14066d0e7b9e652ddd7a5abb5f20bba682f09c0c"
51
+ "gitHead": "0f4ca55751be2f83dd5b6622c3502503fc7dfb41"
49
52
  }
package/CHANGELOG.md DELETED
@@ -1,16 +0,0 @@
1
- # Change Log
2
-
3
- All notable changes to this project will be documented in this file.
4
- See [Conventional Commits](https://conventionalcommits.org) for commit guidelines.
5
-
6
- ## [0.4.2](https://github.com/d-zero-dev/nitpicker/compare/v0.4.1...v0.4.2) (2026-02-27)
7
-
8
- **Note:** Version bump only for package @nitpicker/crawler
9
-
10
-
11
-
12
-
13
-
14
- ## [0.4.1](https://github.com/d-zero-dev/nitpicker/compare/v0.4.0...v0.4.1) (2026-02-27)
15
-
16
- **Note:** Version bump only for package @nitpicker/crawler
@@ -1,3 +0,0 @@
1
- *
2
- !.gitignore
3
- !mock.sqlite
Binary file
@@ -1,337 +0,0 @@
1
- import type { Database } from './database.js';
2
- import type {
3
- DB_Anchor,
4
- DB_Redirect,
5
- DB_Referrer,
6
- DatabaseEvent,
7
- PageFilter,
8
- } from './types.js';
9
- import type { ParseURLOptions } from '@d-zero/shared/parse-url';
10
-
11
- import path from 'node:path';
12
-
13
- import { TypedAwaitEventEmitter as EventEmitter } from '@d-zero/shared/typed-await-event-emitter';
14
-
15
- import { log } from './debug.js';
16
- import {
17
- exists,
18
- extractZip,
19
- outputJSON,
20
- outputText,
21
- readJSON,
22
- readText,
23
- unzip,
24
- } from './filesystem/index.js';
25
- import Page from './page.js';
26
- import Resource from './resource.js';
27
- import { safePath } from './safe-path.js';
28
-
29
- /**
30
- * Provides read-only access to an archive's database and stored data files.
31
- *
32
- * This class is the base for the `Archive` class and is also returned
33
- * by `Archive.connect` for read-only access to an existing archive.
34
- * It supports querying pages, anchors, referrers, resources, and custom data.
35
- */
36
- export class ArchiveAccessor extends EventEmitter<DatabaseEvent> {
37
- /** The SQLite database instance for querying archived data. */
38
- #db: Database;
39
- /** Namespace prefix for custom data storage (e.g. `"analysis/plugin-name"`). `null` disables `setData`. */
40
- #namespace: string | null = null;
41
- /** Absolute path to the temporary working directory containing the database and files. */
42
- #tmpDir: string;
43
-
44
- /**
45
- * The absolute path to the temporary working directory used by this accessor.
46
- */
47
- get tmpDir() {
48
- return this.#tmpDir;
49
- }
50
-
51
- /**
52
- * Creates a new ArchiveAccessor instance.
53
- * @param tmpDir - The path to the temporary directory containing the archive data.
54
- * @param db - The Database instance for querying the SQLite database.
55
- * @param namespace - An optional namespace for scoping custom data storage.
56
- * When null, `setData` is not available.
57
- */
58
- constructor(tmpDir: string, db: Database, namespace: string | null = null) {
59
- super();
60
- this.#tmpDir = tmpDir;
61
- this.#db = db;
62
- this.#namespace = namespace;
63
-
64
- this.#db.on('error', (e) => {
65
- void this.emit('error', e);
66
- });
67
- }
68
-
69
- /**
70
- * Retrieves anchor (link) data for a specific page by its database ID.
71
- * @param pageId - The database ID of the page whose anchors to retrieve.
72
- * @returns An array of anchor records found on the page.
73
- */
74
- async getAnchorsOnPage(pageId: number) {
75
- const refs = await this.#db.getAnchorsOnPage(pageId);
76
- return refs;
77
- }
78
-
79
- /**
80
- * Reads custom data stored in the archive by name.
81
- * @param name - The base name of the data file (without extension).
82
- * @param format - The file format: `'json'` (default), `'txt'`, or `'html'`.
83
- * @returns The parsed JSON object for `'json'` format, or a string for `'txt'`/`'html'` format.
84
- */
85
- async getData<T>(name: string, format?: 'json'): Promise<T>;
86
- /**
87
- * Reads custom data stored in the archive by name as a string.
88
- * @param name - The base name of the data file (without extension).
89
- * @param format - The file format: `'txt'` or `'html'`.
90
- * @returns The file contents as a string.
91
- */
92
- async getData(name: string, format?: 'txt' | 'html'): Promise<string>;
93
- async getData<T>(name: string, format: 'json' | 'txt' | 'html' = 'json') {
94
- const namespace = this.#namespace || '';
95
- const filePath = safePath(this.#tmpDir, namespace, `${name}.${format}`);
96
- if (format === 'json') {
97
- return await readJSON<T>(filePath);
98
- }
99
- return await readText(filePath);
100
- }
101
-
102
- /**
103
- * Reads the HTML content of a page snapshot from the archive.
104
- * Supports reading from both unzipped directories and zipped snapshot archives.
105
- * @param filePath - The relative file path to the HTML snapshot, or null.
106
- * @param openZipped - Whether to attempt unzipping the snapshot archive. Defaults to `true`.
107
- * @returns The HTML content as a string, or null if the snapshot is not found or filePath is null.
108
- */
109
- async getHtmlOfPage(filePath: string | null, openZipped = true) {
110
- if (!filePath) {
111
- return null;
112
- }
113
- const snapshotDir = safePath(this.#tmpDir, path.dirname(filePath));
114
- const name = path.basename(filePath);
115
-
116
- if (openZipped) {
117
- await unzip(`${snapshotDir}.zip`, snapshotDir);
118
- }
119
-
120
- if (exists(snapshotDir)) {
121
- log('Load %s directly because snapshot dir is unzipped', name);
122
- const html = await readText(path.resolve(snapshotDir, name)).catch(
123
- (error) => error,
124
- );
125
- if (typeof html === 'string') {
126
- log('Loaded: %s ...', html.split('\n')[0]);
127
- return html;
128
- }
129
- log('Failed Loading: %O', html);
130
- return null;
131
- }
132
- log('Extracts %s from zipped snapshots', name);
133
- const zipDir = await extractZip(`${snapshotDir}.zip`);
134
- const file = zipDir.files.find((f) => f.type === 'File' && f.path === name);
135
- if (!file) {
136
- log('Failed: Not found %s from zipped snapshots', name);
137
- return null;
138
- }
139
- const buffer = await file.buffer();
140
- const html = buffer.toString('utf8') || null;
141
- log('Succeeded: Extracts %s from zipped snapshots', name);
142
- return html;
143
- }
144
-
145
- /**
146
- * Retrieves all pages from the archive, optionally filtered by type.
147
- * Eagerly loads redirect relationships (`redirectFrom`) but does NOT load
148
- * anchor or referrer relationships.
149
- * Use {@link getPagesWithRefs} if you need those relationships.
150
- * @param filter - An optional filter to narrow the results (e.g., `'internal-page'`, `'external-page'`).
151
- * @returns An array of {@link Page} instances.
152
- */
153
- async getPages(filter?: PageFilter) {
154
- const pages = await this.#db.getPages(filter);
155
- if (pages.length === 0) return [];
156
-
157
- const pageIds = pages.map((p) => p.id);
158
- const redirects = await this.#db.getRedirectsForPages(pageIds);
159
-
160
- const redirectMap = new Map<number, DB_Redirect[]>();
161
- for (const redirect of redirects) {
162
- const current = redirectMap.get(redirect.pageId);
163
- if (current) {
164
- current.push(redirect);
165
- continue;
166
- }
167
- redirectMap.set(redirect.pageId, [redirect]);
168
- }
169
-
170
- return pages.map((page) => new Page(this, page, redirectMap.get(page.id) || []));
171
- }
172
-
173
- /**
174
- * Retrieves pages with their related data (redirects, anchors, referrers) in batches.
175
- * Processes pages in chunks of `limit` size, calling the callback for each batch.
176
- * @param limit - The maximum number of pages to load per batch.
177
- * @param callback - A function called for each batch of pages with the current offset and total count.
178
- * @param options - Optional URL parsing options and whether to include referrer relationships.
179
- */
180
- async getPagesWithRefs(
181
- limit: number,
182
- callback: (pages: Page[], currentOffset: number, max: number) => void | Promise<void>,
183
- options?: ParseURLOptions & {
184
- withRefs?: boolean;
185
- },
186
- ) {
187
- const max = await this.#getPageCount();
188
- let times = 0;
189
-
190
- while (true) {
191
- const offset = times * limit;
192
- log('%d times loop: %o', times, {
193
- offset,
194
- limit,
195
- max,
196
- });
197
- const pages = await this.#getPagesWithRels(offset, limit, options);
198
- if (pages.length === 0) {
199
- break;
200
- }
201
- await callback(pages, offset, max);
202
- times++;
203
- }
204
- }
205
-
206
- /**
207
- * Retrieves pages that link to the specified page (incoming links).
208
- * @param pageId - The database ID of the target page.
209
- * @returns An array of referrer records.
210
- */
211
- async getReferrersOfPage(pageId: number) {
212
- const refs = await this.#db.getReferrersOfPage(pageId);
213
- return refs;
214
- }
215
-
216
- /**
217
- * Retrieves page URLs that reference the specified resource.
218
- * @param pageId - The database ID of the resource.
219
- * @returns An array of page URL strings that reference this resource.
220
- */
221
- async getReferrersOfResource(pageId: number) {
222
- const refs = await this.#db.getReferrersOfResource(pageId);
223
- return refs;
224
- }
225
-
226
- /**
227
- * Retrieves all sub-resources (CSS, JS, images, etc.) stored in the archive.
228
- * @returns An array of {@link Resource} instances.
229
- */
230
- async getResources() {
231
- const resources = await this.#db.getResources();
232
- return resources.map((r) => new Resource(this, r));
233
- }
234
-
235
- /**
236
- * Retrieves a flat list of all resource URLs stored in the archive.
237
- * @returns An array of resource URL strings.
238
- */
239
- async getResourceUrlList() {
240
- return this.#db.getResourceUrlList();
241
- }
242
-
243
- /**
244
- * Stores custom data in the archive under the configured namespace.
245
- * Requires a namespace to be set on this accessor; throws if namespace is null.
246
- * @param name - The base name of the data file (without extension).
247
- * @param data - The data to store. For JSON format, this will be serialized. For text/HTML, it will be stringified.
248
- * @param format - The file format: `'json'` (default), `'txt'`, or `'html'`.
249
- * @returns The relative file path (from the tmp directory) of the stored data file.
250
- * @throws {Error} If no namespace is set on this accessor.
251
- */
252
- async setData(name: string, data: unknown, format: 'json' | 'txt' | 'html' = 'json') {
253
- if (this.#namespace == null) {
254
- throw new Error('"setData" method of the ArchiveAccessor API must set namespace');
255
- }
256
- const filePath = safePath(this.#tmpDir, this.#namespace, `${name}.${format}`);
257
- if (format === 'json') {
258
- await outputJSON(filePath, data);
259
- } else {
260
- await outputText(filePath, `${data}`);
261
- }
262
- return path.relative(this.#tmpDir, filePath);
263
- }
264
-
265
- /**
266
- * Returns the total number of internal pages in the archive.
267
- */
268
- async #getPageCount() {
269
- return this.#db.getPageCount();
270
- }
271
-
272
- /**
273
- * Loads a batch of pages with their related data (redirects, anchors, referrers).
274
- * When `withRefs` is false, loads only pages without relationships for better performance.
275
- * @param offset - The number of pages to skip
276
- * @param limit - The maximum number of pages to return
277
- * @param options - URL parsing and referrer loading options
278
- */
279
- async #getPagesWithRels(
280
- offset: number,
281
- limit: number,
282
- options?: ParseURLOptions & {
283
- withRefs?: boolean;
284
- },
285
- ) {
286
- if (options?.withRefs === false) {
287
- const pages = await this.#db.getPages('internal-page', offset, limit);
288
- return pages.map((page) => new Page(this, page));
289
- }
290
- const { pages, redirects, anchors, referrers } = await this.#db.getPagesWithRels(
291
- offset,
292
- limit,
293
- );
294
- const redirectMap = new Map<number, DB_Redirect[]>();
295
- const anchorMap = new Map<number, DB_Anchor[]>();
296
- const refersMap = new Map<number, DB_Referrer[]>();
297
- log('Mapping redirects');
298
- for (const redirect of redirects) {
299
- const current = redirectMap.get(redirect.pageId);
300
- if (current) {
301
- current.push(redirect);
302
- continue;
303
- }
304
- redirectMap.set(redirect.pageId, [redirect]);
305
- }
306
- log('Mapping anchors');
307
- for (const anchor of anchors) {
308
- const current = anchorMap.get(anchor.pageId);
309
- if (current) {
310
- current.push(anchor);
311
- continue;
312
- }
313
- anchorMap.set(anchor.pageId, [anchor]);
314
- }
315
- log('Mapping referrers');
316
- for (const referrer of referrers) {
317
- const current = refersMap.get(referrer.pageId);
318
- if (current) {
319
- current.push(referrer);
320
- continue;
321
- }
322
- refersMap.set(referrer.pageId, [referrer]);
323
- }
324
- log('Create Page Data');
325
- const pPages: Page[] = [];
326
- for (const page of pages) {
327
- const pRedirects = redirectMap.get(page.id) || [];
328
- const pAnchors = anchorMap.get(page.id) || [];
329
- const pRefers = refersMap.get(page.id) || [];
330
- pPages.push(
331
- new Page(this, page, pRedirects, pAnchors, pRefers, options?.disableQueries),
332
- );
333
- }
334
- log('Create Page Data: Done');
335
- return pPages;
336
- }
337
- }