@nitpicker/crawler 0.4.2 → 0.4.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. package/lib/archive/archive-accessor.d.ts +6 -1
  2. package/lib/archive/archive-accessor.js +7 -0
  3. package/lib/archive/database.js +2 -1
  4. package/package.json +5 -2
  5. package/CHANGELOG.md +0 -16
  6. package/src/archive/__mock__/.gitignore +0 -3
  7. package/src/archive/__mock__/mock.sqlite +0 -0
  8. package/src/archive/archive-accessor.ts +0 -337
  9. package/src/archive/archive.ts +0 -408
  10. package/src/archive/database.spec.ts +0 -469
  11. package/src/archive/database.ts +0 -1059
  12. package/src/archive/debug.ts +0 -10
  13. package/src/archive/filesystem/append-text.spec.ts +0 -26
  14. package/src/archive/filesystem/append-text.ts +0 -16
  15. package/src/archive/filesystem/copy-dir-sync.spec.ts +0 -27
  16. package/src/archive/filesystem/copy-dir-sync.ts +0 -10
  17. package/src/archive/filesystem/copy-dir.spec.ts +0 -33
  18. package/src/archive/filesystem/copy-dir.ts +0 -14
  19. package/src/archive/filesystem/exists.spec.ts +0 -33
  20. package/src/archive/filesystem/exists.ts +0 -10
  21. package/src/archive/filesystem/get-file-list.spec.ts +0 -37
  22. package/src/archive/filesystem/get-file-list.ts +0 -13
  23. package/src/archive/filesystem/index.ts +0 -17
  24. package/src/archive/filesystem/is-dir.spec.ts +0 -29
  25. package/src/archive/filesystem/is-dir.ts +0 -11
  26. package/src/archive/filesystem/mkdir.spec.ts +0 -37
  27. package/src/archive/filesystem/mkdir.ts +0 -16
  28. package/src/archive/filesystem/output-json.spec.ts +0 -34
  29. package/src/archive/filesystem/output-json.ts +0 -16
  30. package/src/archive/filesystem/output-text.spec.ts +0 -31
  31. package/src/archive/filesystem/output-text.ts +0 -35
  32. package/src/archive/filesystem/read-json.spec.ts +0 -26
  33. package/src/archive/filesystem/read-json.ts +0 -12
  34. package/src/archive/filesystem/read-text.spec.ts +0 -25
  35. package/src/archive/filesystem/read-text.ts +0 -11
  36. package/src/archive/filesystem/readline.spec.ts +0 -29
  37. package/src/archive/filesystem/readline.ts +0 -30
  38. package/src/archive/filesystem/remove.spec.ts +0 -34
  39. package/src/archive/filesystem/remove.ts +0 -11
  40. package/src/archive/filesystem/rename.spec.ts +0 -46
  41. package/src/archive/filesystem/rename.ts +0 -21
  42. package/src/archive/filesystem/tar.spec.ts +0 -33
  43. package/src/archive/filesystem/tar.ts +0 -27
  44. package/src/archive/filesystem/untar.spec.ts +0 -34
  45. package/src/archive/filesystem/untar.ts +0 -36
  46. package/src/archive/index.ts +0 -13
  47. package/src/archive/page.spec.ts +0 -368
  48. package/src/archive/page.ts +0 -420
  49. package/src/archive/resource.spec.ts +0 -101
  50. package/src/archive/resource.ts +0 -73
  51. package/src/archive/safe-path.spec.ts +0 -44
  52. package/src/archive/safe-path.ts +0 -18
  53. package/src/archive/types.ts +0 -227
  54. package/src/crawler/clear-destination-cache.spec.ts +0 -20
  55. package/src/crawler/clear-destination-cache.ts +0 -9
  56. package/src/crawler/crawler.ts +0 -873
  57. package/src/crawler/decompose-url.spec.ts +0 -48
  58. package/src/crawler/decompose-url.ts +0 -90
  59. package/src/crawler/destination-cache.spec.ts +0 -23
  60. package/src/crawler/destination-cache.ts +0 -8
  61. package/src/crawler/detect-pagination-pattern.spec.ts +0 -169
  62. package/src/crawler/detect-pagination-pattern.ts +0 -66
  63. package/src/crawler/fetch-destination.ts +0 -257
  64. package/src/crawler/fetch-robots-txt.spec.ts +0 -83
  65. package/src/crawler/fetch-robots-txt.ts +0 -91
  66. package/src/crawler/find-best-matching-scope.spec.ts +0 -39
  67. package/src/crawler/find-best-matching-scope.ts +0 -57
  68. package/src/crawler/generate-predicted-urls.spec.ts +0 -42
  69. package/src/crawler/generate-predicted-urls.ts +0 -34
  70. package/src/crawler/handle-ignore-and-skip.spec.ts +0 -66
  71. package/src/crawler/handle-ignore-and-skip.ts +0 -30
  72. package/src/crawler/handle-resource-response.spec.ts +0 -45
  73. package/src/crawler/handle-resource-response.ts +0 -21
  74. package/src/crawler/handle-scrape-end.spec.ts +0 -109
  75. package/src/crawler/handle-scrape-end.ts +0 -115
  76. package/src/crawler/handle-scrape-error.spec.ts +0 -105
  77. package/src/crawler/handle-scrape-error.ts +0 -58
  78. package/src/crawler/index.ts +0 -2
  79. package/src/crawler/inject-scope-auth.spec.ts +0 -36
  80. package/src/crawler/inject-scope-auth.ts +0 -27
  81. package/src/crawler/is-external-url.spec.ts +0 -31
  82. package/src/crawler/is-external-url.ts +0 -17
  83. package/src/crawler/is-in-any-lower-layer.spec.ts +0 -31
  84. package/src/crawler/is-in-any-lower-layer.ts +0 -22
  85. package/src/crawler/link-list.spec.ts +0 -355
  86. package/src/crawler/link-list.ts +0 -275
  87. package/src/crawler/link-to-page-data.spec.ts +0 -133
  88. package/src/crawler/link-to-page-data.ts +0 -34
  89. package/src/crawler/net-timeout-error.spec.ts +0 -25
  90. package/src/crawler/net-timeout-error.ts +0 -11
  91. package/src/crawler/protocol-agnostic-key.spec.ts +0 -40
  92. package/src/crawler/protocol-agnostic-key.ts +0 -11
  93. package/src/crawler/reconstruct-url.spec.ts +0 -37
  94. package/src/crawler/reconstruct-url.ts +0 -37
  95. package/src/crawler/robots-checker.spec.ts +0 -104
  96. package/src/crawler/robots-checker.ts +0 -73
  97. package/src/crawler/should-discard-predicted.spec.ts +0 -125
  98. package/src/crawler/should-discard-predicted.ts +0 -33
  99. package/src/crawler/should-skip-url.spec.ts +0 -77
  100. package/src/crawler/should-skip-url.ts +0 -37
  101. package/src/crawler/types.ts +0 -146
  102. package/src/crawler-orchestrator.ts +0 -401
  103. package/src/debug.ts +0 -10
  104. package/src/index.ts +0 -25
  105. package/src/types.ts +0 -30
  106. package/src/utils/array/each-splitted.spec.ts +0 -38
  107. package/src/utils/array/each-splitted.ts +0 -19
  108. package/src/utils/array/index.ts +0 -1
  109. package/src/utils/debug.ts +0 -6
  110. package/src/utils/error/dom-evaluation-error.spec.ts +0 -20
  111. package/src/utils/error/dom-evaluation-error.ts +0 -6
  112. package/src/utils/error/error-emitter.spec.ts +0 -78
  113. package/src/utils/error/error-emitter.ts +0 -44
  114. package/src/utils/error/index.ts +0 -3
  115. package/src/utils/index.ts +0 -5
  116. package/src/utils/object/clean-object.spec.ts +0 -24
  117. package/src/utils/object/clean-object.ts +0 -13
  118. package/src/utils/object/index.ts +0 -1
  119. package/src/utils/types/index.ts +0 -1
  120. package/src/utils/types/types.ts +0 -65
  121. package/tsconfig.json +0 -11
  122. package/tsconfig.tsbuildinfo +0 -1
@@ -1,1059 +0,0 @@
1
- import type {
2
- Config,
3
- DB_Anchor,
4
- DB_Page,
5
- DB_Redirect,
6
- DB_Referrer,
7
- DB_Resource,
8
- DatabaseEvent,
9
- PageFilter,
10
- } from './types.js';
11
- import type { PageData, Resource } from '../utils/index.js';
12
- import type { RetryDecoratorOptions } from '@d-zero/shared/retry';
13
- import type { Knex } from 'knex';
14
-
15
- import path from 'node:path';
16
-
17
- import { tryParseUrl as parseUrl } from '@d-zero/shared/parse-url';
18
- import { retry } from '@d-zero/shared/retry';
19
- import { pathComparator } from '@d-zero/shared/sort/path';
20
- import { TypedAwaitEventEmitter as EventEmitter } from '@d-zero/shared/typed-await-event-emitter';
21
- import knex from 'knex';
22
-
23
- import { ErrorEmitter, eachSplitted } from '../utils/index.js';
24
-
25
- import { dbLog } from './debug.js';
26
- import { mkdir } from './filesystem/index.js';
27
-
28
- const retrySetting: RetryDecoratorOptions = {
29
- interval: 300,
30
- retries: 3,
31
- };
32
-
33
- /**
34
- * Low-level database abstraction layer for the archive's SQLite database.
35
- *
36
- * Manages the `info`, `pages`, `anchors`, `images`, `resources`, and `resources-referrers`
37
- * tables. Most public methods that perform database queries use the `@retry`
38
- * decorator for automatic retry on transient failures, and `@ErrorEmitter` to
39
- * propagate errors as events.
40
- *
41
- * Use the static {@link Database.connect} factory method to create instances.
42
- * The constructor is private.
43
- */
44
- export class Database extends EventEmitter<DatabaseEvent> {
45
- /** The Knex query builder instance connected to the SQLite database. */
46
- #instance: Knex;
47
- /** Absolute path to the working directory, used for resolving relative snapshot paths. */
48
- #workingDir: string;
49
- // eslint-disable-next-line no-restricted-syntax
50
- private constructor(options: DatabaseOption) {
51
- super();
52
- this.#workingDir = options.workingDir;
53
- switch (options.type) {
54
- case 'sqlite3': {
55
- this.#instance = knex({
56
- client: options.type,
57
- connection: {
58
- filename: options.filename,
59
- },
60
- useNullAsDefault: true,
61
- pool: {
62
- acquireTimeoutMillis: 600_000,
63
- },
64
- });
65
- break;
66
- }
67
- case 'mysql': {
68
- throw new Error("Don't support MySQL yet.");
69
- }
70
- }
71
- }
72
-
73
- /**
74
- * Adds the `order` column to the `pages` table for URL sort ordering.
75
- * @deprecated Since v0.1.x. The column is now created during table initialization.
76
- * @returns The result of the schema alteration.
77
- */
78
- async addOrderField() {
79
- return await this.#instance.schema.table('pages', (t) => {
80
- t.integer('order').unsigned().nullable().defaultTo(null);
81
- });
82
- }
83
-
84
- /**
85
- * Forces a WAL checkpoint, writing all pending WAL data back to the main database file.
86
- * Uses TRUNCATE mode to reset the WAL file to zero bytes after checkpointing.
87
- * This ensures the database is fully self-contained in `db.sqlite` before archiving.
88
- */
89
- async checkpoint() {
90
- await this.#instance.raw('PRAGMA wal_checkpoint(TRUNCATE)');
91
- }
92
-
93
- async destroy() {
94
- await this.#instance.destroy();
95
- }
96
- /**
97
- * Retrieves all anchors (outgoing links) on a specific page.
98
- * Joins the `anchors` table with the `pages` table to resolve link destinations.
99
- * @param pageId - The database ID of the page whose anchors to retrieve.
100
- * @returns An array of anchor records with resolved URL, title, status, and content type.
101
- */
102
- @ErrorEmitter()
103
- @retry(retrySetting)
104
- async getAnchorsOnPage(pageId: number) {
105
- const res = await this.#instance
106
- .select(
107
- 'pages.url',
108
- 'pages.title',
109
- 'pages.status',
110
- 'pages.statusText',
111
- 'pages.contentType',
112
- 'anchors.hash',
113
- 'anchors.textContent',
114
- )
115
- .from('anchors')
116
- .join('pages', 'anchors.hrefId', '=', 'pages.id')
117
- .where('anchors.pageId', pageId);
118
- return res;
119
- }
120
-
121
- /**
122
- * Retrieves the base URL of the crawl session from the `info` table.
123
- * @returns The base URL string.
124
- * @throws {Error} If no base URL is found in the database.
125
- */
126
- @ErrorEmitter()
127
- @retry(retrySetting)
128
- async getBaseUrl() {
129
- const selected = await this.#instance.select('baseUrl').from<Config>('info');
130
- if (!selected[0]) {
131
- throw new Error('No baseUrl');
132
- }
133
- const [{ baseUrl }] = selected;
134
- return baseUrl || '';
135
- }
136
-
137
- /**
138
- * Retrieves the full crawl configuration from the `info` table.
139
- * Deserializes JSON-encoded fields (`excludes`, `excludeKeywords`, `excludeUrls`, `scope`).
140
- * @returns The parsed {@link Config} object.
141
- * @throws {Error} If no configuration is found in the database.
142
- */
143
- @ErrorEmitter()
144
- @retry(retrySetting)
145
- async getConfig() {
146
- const [config] = await this.#instance.select('*').from<Config>('info');
147
- if (!config) {
148
- throw new Error('No config');
149
- }
150
- const opt: Config = {
151
- ...config,
152
- excludes: getJSON<string[]>(config.excludes, []),
153
- excludeKeywords: getJSON<string[]>(config.excludeKeywords, []),
154
- excludeUrls: getJSON<string[]>(config.excludeUrls, []),
155
- scope: getJSON<string[]>(config.scope, []),
156
- retry: config.retry ?? 3,
157
- };
158
- // @ts-expect-error
159
- delete opt.id;
160
- dbLog('Table `info`: %O => %O', config, opt);
161
- return opt;
162
- }
163
-
164
- /**
165
- * Retrieves the current crawling state by listing scraped and pending URLs.
166
- * @returns An object with `scraped` (completed URLs) and `pending` (remaining URLs) arrays.
167
- */
168
- @ErrorEmitter()
169
- @retry(retrySetting)
170
- async getCrawlingState() {
171
- const ex = (r: { url: string }) => r.url;
172
- const $scraped = await this.#instance
173
- .select('url')
174
- .from<DB_Page>('pages')
175
- .where('scraped', 1);
176
- const scraped = $scraped.map(ex);
177
- const $pending = await this.#instance
178
- .select('url')
179
- .from<DB_Page>('pages')
180
- .where('scraped', 0);
181
- const pending = $pending.map(ex);
182
- return {
183
- scraped,
184
- pending,
185
- };
186
- }
187
-
188
- /**
189
- * Retrieves the HTML snapshot file path for a specific page.
190
- * @param pageId - The database ID of the page.
191
- * @returns The relative file path to the HTML snapshot, or null if not saved.
192
- */
193
- @ErrorEmitter()
194
- @retry(retrySetting)
195
- async getHtmlPathOnPage(pageId: number) {
196
- return await this.#instance.transaction(async (trx) => {
197
- const [{ html }] = await trx
198
- .select('html')
199
- .from<DB_Page>('pages')
200
- .where('id', pageId);
201
- return html || null;
202
- });
203
- }
204
-
205
- /**
206
- * Retrieves the crawl session name from the `info` table.
207
- * @returns The name string.
208
- * @throws {Error} If no name is found in the database.
209
- */
210
- @ErrorEmitter()
211
- @retry(retrySetting)
212
- async getName() {
213
- const selected = await this.#instance.select('name').from<Config>('info');
214
- if (!selected[0]) {
215
- throw new Error('No name');
216
- }
217
- const [{ name }] = selected;
218
- return name;
219
- }
220
-
221
- /**
222
- * Counts the total number of pages in the database.
223
- * @returns The total page count.
224
- * @throws {Error} If the count query returns no rows.
225
- */
226
- @ErrorEmitter()
227
- @retry(retrySetting)
228
- async getPageCount() {
229
- const selected = await this.#instance.count('id').from<DB_Page>('pages');
230
- if (!selected[0]) {
231
- throw new Error('No count');
232
- }
233
- // @ts-expect-error
234
- const count: number = selected[0]['count(`id`)'];
235
- dbLog('Number of pages: %d', count);
236
- return count;
237
- }
238
-
239
- /**
240
- * Retrieves pages from the database with optional filtering and offset/limit pagination.
241
- * @param filter - An optional {@link PageFilter} to narrow results by content type and origin.
242
- * @param offset - The number of rows to skip. Defaults to `0`.
243
- * @param limit - The maximum number of rows to return. Defaults to `100000`.
244
- * @returns An array of raw {@link DB_Page} rows.
245
- */
246
- @ErrorEmitter()
247
- @retry(retrySetting)
248
- async getPages(filter?: PageFilter, offset = 0, limit = 100_000) {
249
- const q = this.#instance.select('*').from<DB_Page>('pages');
250
- switch (filter) {
251
- case 'page': {
252
- return q
253
- .where({
254
- contentType: 'text/html',
255
- isTarget: 1,
256
- })
257
- .limit(limit)
258
- .offset(offset);
259
- }
260
- case 'page-included-no-target': {
261
- return q
262
- .where({
263
- contentType: 'text/html',
264
- })
265
- .limit(limit)
266
- .offset(offset);
267
- }
268
- case 'external-page': {
269
- return q
270
- .where({
271
- contentType: 'text/html',
272
- isExternal: 1,
273
- })
274
- .limit(limit)
275
- .offset(offset);
276
- }
277
- case 'internal-page': {
278
- return q
279
- .where({
280
- contentType: 'text/html',
281
- isExternal: 0,
282
- })
283
- .limit(limit)
284
- .offset(offset);
285
- }
286
- case 'no-page': {
287
- return q
288
- .whereNull('contentType')
289
- .orWhereNot({
290
- contentType: 'text/html',
291
- })
292
- .limit(limit)
293
- .offset(offset);
294
- }
295
- case 'external-no-page': {
296
- return q
297
- .where((qb) => {
298
- qb.whereNull('contentType').orWhereNot({
299
- contentType: 'text/html',
300
- });
301
- })
302
- .andWhere({
303
- isExternal: 1,
304
- })
305
- .limit(limit)
306
- .offset(offset);
307
- }
308
- case 'internal-no-page': {
309
- return q
310
- .where((qb) => {
311
- qb.whereNull('contentType').orWhereNot({
312
- contentType: 'text/html',
313
- });
314
- })
315
- .andWhere({
316
- isExternal: 0,
317
- })
318
- .limit(limit)
319
- .offset(offset);
320
- }
321
- }
322
- return q.limit(limit).offset(offset);
323
- }
324
-
325
- /**
326
- * Retrieves pages along with their related redirect, anchor, and referrer data.
327
- * Results are ordered by the natural URL sort order. Only non-redirected pages are returned.
328
- * @param offset - The number of rows to skip.
329
- * @param limit - The maximum number of pages to return.
330
- * @returns An object containing `pages`, `redirects`, `anchors`, and `referrers` arrays.
331
- */
332
- @ErrorEmitter()
333
- @retry(retrySetting)
334
- async getPagesWithRels(offset: number, limit: number) {
335
- await this.addOrderField().catch((error) => error);
336
- await this.setUrlOrder();
337
- dbLog('Get Pages');
338
- const pages = await this.#instance
339
- .select('*')
340
- .from<DB_Page>('pages')
341
- .orderByRaw('`order` ASC NULLS LAST')
342
- .whereNull('redirectDestId')
343
- .limit(limit)
344
- .offset(offset);
345
-
346
- // When empty
347
- if (pages.length === 0) {
348
- return {
349
- pages: [],
350
- redirects: [],
351
- referrers: [],
352
- anchors: [],
353
- };
354
- }
355
-
356
- dbLog('Get Pages: Redirects');
357
- const redirects: DB_Redirect[] = await this.#instance
358
- .with('limitedPages', limitedPageIds(limit, offset))
359
- .with('redirect', redirectTable(false))
360
- .select('id as pageId', 'from', 'fromId')
361
- .from('redirect')
362
- // Filter
363
- .join('limitedPages', 'redirect.toId', '=', 'limitedPages.id')
364
- // Sort
365
- .orderBy('id', 'asc');
366
-
367
- dbLog('Get Pages: Anchors');
368
- const anchors: DB_Anchor[] = await this.#instance
369
- .with('limitedPages', limitedPageIds(limit, offset))
370
- .with('redirect', redirectTable())
371
- .select(
372
- 'limitedPages.id as pageId',
373
- 'href.url',
374
- 'redirect.from as href',
375
- 'href.isExternal',
376
- 'href.title',
377
- 'href.status',
378
- 'href.statusText',
379
- 'href.contentType',
380
- 'anchors.hash',
381
- 'anchors.textContent',
382
- )
383
- .from('anchors')
384
- // Filters
385
- .join('limitedPages', 'anchors.pageId', '=', 'limitedPages.id')
386
- // Resolves redirect
387
- .join('redirect', 'anchors.hrefId', '=', 'redirect.fromId')
388
- // Target
389
- .join('pages as href', 'redirect.toId', '=', 'href.id')
390
- // Sort
391
- .orderBy('anchors.id', 'asc');
392
-
393
- dbLog('Get Pages: Referrers');
394
- const referrers: DB_Referrer[] = await this.#instance
395
- .with('limitedPages', limitedPageIds(limit, offset))
396
- .with('redirect', redirectTable())
397
- .select(
398
- 'redirect.toId as pageId',
399
- 'referrer.url',
400
- 'redirect.from as through',
401
- 'redirect.fromId as throughId',
402
- 'anchors.hash',
403
- 'anchors.textContent',
404
- )
405
- .from('anchors')
406
- // Resolves redirect
407
- .join('redirect', 'anchors.hrefId', '=', 'redirect.fromId')
408
- // Referrer
409
- .join('pages as referrer', 'anchors.pageId', '=', 'referrer.id')
410
- // Filters
411
- .join('limitedPages', 'redirect.toId', '=', 'limitedPages.id')
412
- // Sort
413
- .orderBy('anchors.id', 'asc');
414
-
415
- dbLog('Get Pages: Done');
416
- return {
417
- pages,
418
- redirects,
419
- anchors,
420
- referrers,
421
- };
422
- }
423
-
424
- /**
425
- * Retrieves redirect sources for the given page IDs in bulk.
426
- * @param pageIds - The database IDs of the destination pages.
427
- * @returns An array of {@link DB_Redirect} records mapping destination pages to their redirect sources.
428
- */
429
- @ErrorEmitter()
430
- @retry(retrySetting)
431
- async getRedirectsForPages(pageIds: number[]): Promise<DB_Redirect[]> {
432
- if (pageIds.length === 0) return [];
433
- return this.#instance
434
- .select('redirectDestId as pageId', 'url as from', 'id as fromId')
435
- .from('pages')
436
- .whereIn('redirectDestId', pageIds);
437
- }
438
- /**
439
- * Retrieves pages that link to a specific page (incoming links / referrers).
440
- * @param pageId - The database ID of the target page.
441
- * @returns An array of referrer records with URL, hash, and text content.
442
- */
443
- @ErrorEmitter()
444
- @retry(retrySetting)
445
- async getReferrersOfPage(pageId: number) {
446
- const res = await this.#instance
447
- .select('pages.url', 'anchors.hash', 'anchors.textContent')
448
- .from('anchors')
449
- .join('pages', 'anchors.pageId', '=', 'pages.id')
450
- .where('anchors.hrefId', pageId);
451
- return res;
452
- }
453
-
454
- /**
455
- * Retrieves the page URLs that reference a specific resource.
456
- * @param id - The database ID of the resource.
457
- * @returns An array of page URL strings that reference the resource.
458
- */
459
- @ErrorEmitter()
460
- @retry(retrySetting)
461
- async getReferrersOfResource(id: number): Promise<string[]> {
462
- const res = await this.#instance
463
- .select('pages.url')
464
- .from('resources-referrers')
465
- .join('resources', 'resources.id', '=', 'resources-referrers.resourceId')
466
- .join('pages', 'pages.id', '=', 'resources-referrers.pageId')
467
- .where('resources.id', id);
468
- return res.map((r) => r.url);
469
- }
470
-
471
- /**
472
- * Retrieves all sub-resources from the `resources` table.
473
- * @returns An array of raw {@link DB_Resource} rows.
474
- */
475
- @ErrorEmitter()
476
- @retry(retrySetting)
477
- async getResources() {
478
- return this.#instance.select('*').from<DB_Resource>('resources');
479
- }
480
-
481
- /**
482
- * Retrieves a flat list of all resource URLs from the `resources` table.
483
- * @returns An array of resource URL strings.
484
- */
485
- @ErrorEmitter()
486
- @retry(retrySetting)
487
- async getResourceUrlList() {
488
- const res = await this.#instance.select('url').from<DB_Resource>('resources');
489
- return res.map((r) => r.url);
490
- }
491
-
492
- /**
493
- * Inserts a sub-resource into the `resources` table.
494
- * Ignores duplicate URLs (uses `ON CONFLICT IGNORE`).
495
- * @param resource - The resource data to insert.
496
- */
497
- @ErrorEmitter()
498
- @retry(retrySetting)
499
- async insertResource(resource: Resource) {
500
- await this.#instance
501
- .from<DB_Resource>('resources')
502
- .insert({
503
- url: resource.url.href,
504
- isExternal: resource.isExternal ? 1 : 0,
505
- status: resource.status,
506
- statusText: resource.statusText,
507
- contentType: resource.contentType,
508
- contentLength: resource.contentLength,
509
- compress: resource.compress || 0,
510
- cdn: resource.cdn || 0,
511
- responseHeaders: JSON.stringify(resource.headers),
512
- })
513
- .onConflict('url')
514
- .ignore();
515
- }
516
-
517
- /**
518
- * Inserts a referrer relationship between a resource and a page into the
519
- * `resources-referrers` table. Silently skips if the resource is not found.
520
- * @param src - The URL of the resource.
521
- * @param pageUrl - The URL of the page that references the resource.
522
- */
523
- @ErrorEmitter()
524
- @retry(retrySetting)
525
- async insertResourceReferrers(src: string, pageUrl: string) {
526
- const selected = await this.#instance
527
- .select('id')
528
- .from<DB_Resource>('resources')
529
- .where('url', src);
530
- if (!selected[0]) {
531
- // Ignore when the resource is not found
532
- return;
533
- }
534
- const [{ id: resourceId }] = selected;
535
- const pageId = await this.#getIdByUrl(pageUrl);
536
- await this.#instance('resources-referrers').insert({
537
- resourceId,
538
- pageId,
539
- });
540
- }
541
-
542
- /**
543
- * Stores the crawl configuration in the `info` table.
544
- * Serializes array fields (`excludes`, `excludeKeywords`, `excludeUrls`, `scope`) as JSON strings.
545
- * @param config - The {@link Config} object to store.
546
- */
547
- @ErrorEmitter()
548
- @retry(retrySetting)
549
- async setConfig(config: Config) {
550
- return this.#instance.from<Config>('info').insert({
551
- ...config,
552
- // @ts-expect-error
553
- excludes: JSON.stringify(config.excludes),
554
- // @ts-expect-error
555
- excludeKeywords: JSON.stringify(config.excludeKeywords),
556
- // @ts-expect-error
557
- excludeUrls: JSON.stringify(config.excludeUrls),
558
- // @ts-expect-error
559
- scope: JSON.stringify(config.scope),
560
- });
561
- }
562
-
563
- /**
564
- * Marks a page as skipped in the database with the given reason.
565
- * Creates the page row if it does not already exist.
566
- * @param url - The URL of the skipped page.
567
- * @param reason - The reason the page was skipped.
568
- * @param isExternal - Whether the page is on an external domain. Defaults to `false`.
569
- */
570
- @ErrorEmitter()
571
- @retry(retrySetting)
572
- async setSkippedPage(url: string, reason: string, isExternal = false) {
573
- const pageId = await this.#getIdByUrl(url, isExternal ? 1 : 0);
574
- await this.#instance<DB_Page>('pages')
575
- .where('id', pageId)
576
- .update({
577
- scraped: 1,
578
- isExternal: isExternal ? 1 : 0,
579
- isSkipped: 1,
580
- skipReason: reason,
581
- });
582
- }
583
-
584
- /**
585
- * Assigns natural URL sort order values to all internal pages.
586
- * Pages are sorted using {@link pathComparator} and assigned sequential order numbers.
587
- */
588
- async setUrlOrder() {
589
- dbLog('Set URL Order');
590
- const res = await this.#instance
591
- .select('id', 'url')
592
- .from<DB_Page>('pages')
593
- .where('isExternal', '=', 0);
594
- const sorted = res.toSorted((a, b) => pathComparator(a.url, b.url));
595
-
596
- // Batch update using chunked CASE statements to avoid N+1 queries
597
- const BATCH_SIZE = 500;
598
- for (let i = 0; i < sorted.length; i += BATCH_SIZE) {
599
- const batch = sorted.slice(i, i + BATCH_SIZE);
600
- const ids = batch.map((row) => row.id);
601
- const bindings: (string | number)[] = [];
602
- const cases = batch
603
- .map((row, j) => {
604
- bindings.push(row.id, i + j + 1);
605
- return 'WHEN ? THEN ?';
606
- })
607
- .join(' ');
608
- const placeholders = ids.map(() => '?').join(',');
609
- await this.#instance.raw(
610
- `UPDATE pages SET \`order\` = CASE id ${cases} END WHERE id IN (${placeholders})`,
611
- [...bindings, ...ids],
612
- );
613
- }
614
- }
615
-
616
- /**
617
- * Inserts or updates a crawled page in the database, including its redirect chain,
618
- * anchors, and images. Optionally creates an HTML snapshot file path entry.
619
- * @param page - The page data to store.
620
- * @param snapshotDir - The directory for saving HTML snapshots, or null to skip snapshots.
621
- * @param isTarget - Whether this page is a crawl target.
622
- * @returns An object with the optional `html` snapshot file path and the page's database `pageId`.
623
- */
624
- @ErrorEmitter()
625
- @retry(retrySetting)
626
- async updatePage(
627
- page: PageData,
628
- snapshotDir: string | null,
629
- isTarget: boolean,
630
- ): Promise<{
631
- html?: string | undefined;
632
- pageId: number;
633
- }> {
634
- let destUrl = page.url.withoutHashAndAuth;
635
- const redirectPaths = [...page.redirectPaths];
636
- if (redirectPaths.length > 0) {
637
- destUrl = redirectPaths.pop()!;
638
- redirectPaths.unshift(page.url.withoutHashAndAuth);
639
- }
640
-
641
- const destUrlObject = parseUrl(destUrl);
642
-
643
- if (!destUrlObject) {
644
- throw new Error(`Failed to parse URL: ${destUrl}`);
645
- }
646
-
647
- return await this.#instance.transaction(async (trx) => {
648
- const pageId = await this.#insertPage(
649
- {
650
- ...page,
651
- url: destUrlObject,
652
- },
653
- isTarget,
654
- trx,
655
- );
656
-
657
- for (const redirect of redirectPaths) {
658
- dbLog('Set redirected url: %s -> %s', redirect, destUrl);
659
- const redirectId = await this.#getIdByUrl(redirect, undefined, trx);
660
- await trx<DB_Page>('pages')
661
- .where('id', redirectId)
662
- .update({
663
- scraped: 1,
664
- redirectDestId: pageId,
665
- isExternal: page.isExternal ? 1 : 0,
666
- });
667
- }
668
- let snapshot: { html?: string; pageId: number } = { pageId };
669
- if (isTarget && snapshotDir) {
670
- snapshot = await this.#updateSnapshotPath(pageId, snapshotDir, trx);
671
- }
672
- const anchors = await Promise.all(
673
- page.anchorList.map(async (anchor) => {
674
- const hrefId = await this.#getIdByUrl(
675
- anchor.href.withoutHashAndAuth,
676
- anchor.isExternal ? 1 : 0,
677
- trx,
678
- );
679
- return {
680
- pageId,
681
- hrefId,
682
- hash: anchor.href.hash,
683
- textContent: anchor.textContent,
684
- };
685
- }),
686
- );
687
- dbLog('Insert anchors.length: %d', anchors.length);
688
- if (anchors.length > 0) {
689
- await eachSplitted(anchors, 100, async (_anchors) => {
690
- await trx('anchors').insert(_anchors);
691
- });
692
- }
693
- const images = page.imageList.map((image) => ({
694
- pageId,
695
- ...image,
696
- }));
697
- dbLog('Insert images.length: %d', images.length);
698
- if (images.length > 0) {
699
- await eachSplitted(images, 100, async (_images) => {
700
- await trx('images').insert(_images);
701
- });
702
- }
703
- return snapshot;
704
- });
705
- }
706
-
707
- /**
708
- * Returns the database ID for a URL, creating a new page row if needed.
709
- * Uses `ON CONFLICT IGNORE` to handle race conditions in concurrent inserts.
710
- * @param url - The page URL to look up in, or insert into, the `pages` table.
711
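- * @param isExternal - Optional `0`/`1` flag written to a newly inserted row; omitted from the insert when not provided.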
712
- * @param trx - Optional transaction to run the queries in; falls back to the main Knex instance.
713
- */
714
- async #getIdByUrl(url: string, isExternal?: 0 | 1, trx?: Knex.Transaction) {
715
- const qb = trx ?? this.#instance;
716
- const [record] = await qb.select('id').from<DB_Page>('pages').where('url', url);
717
- // Must use `?` because it may be `undefined`
718
- const pageId = record?.id ?? Number.NaN;
719
- if (Number.isFinite(pageId)) {
720
- return pageId;
721
- }
722
- const insertedRows = await qb<DB_Page>('pages')
723
- .insert({
724
- url,
725
- scraped: 0,
726
- isTarget: 0,
727
- ...(isExternal != null && { isExternal }),
728
- })
729
- .onConflict('url')
730
- .ignore();
731
- const [insertedId] = insertedRows;
732
- if (!insertedId) {
733
- // onConflict.ignore() returns 0 on race condition — re-select
734
- const [existing] = await qb.select('id').from<DB_Page>('pages').where('url', url);
735
- if (existing?.id) {
736
- return existing.id;
737
- }
738
- throw new Error(`Failed to insert a new page: ${url}`);
739
- }
740
- return insertedId;
741
- }
742
-
743
/**
 * Prepares the connection and, on first use, creates the database schema.
 *
 * Foreign-key enforcement is switched on every time, because SQLite keeps that
 * setting per connection and leaves it off by default; enabling it only while
 * creating the schema would leave a reopened database without enforcement.
 * When the `info` table already exists the schema is considered initialized
 * and nothing else is done. Otherwise WAL journal mode is enabled and all
 * tables are created (`info`, `pages`, `anchors`, `images`, `resources`,
 * `resources-referrers`).
 *
 * NOTE(review): a PRAGMA affects only the connection that runs it — confirm the
 * Knex pool for this instance is limited to a single connection.
 */
async #init() {
	// Per-connection setting: must be applied for existing databases as well.
	await this.#instance.raw('PRAGMA foreign_keys = ON');

	const isExists = await this.#instance.schema.hasTable('info');
	if (isExists) {
		return;
	}

	// WAL mode for better performance; only requested when the schema is first created.
	await this.#instance.raw('PRAGMA journal_mode = WAL');

	await this.#instance.schema
		.createTable('info', (t) => {
			t.increments('id');
			t.string('version');
			t.string('name');
			t.string('baseUrl');
			t.boolean('recursive');
			t.boolean('useSubprocess');
			t.integer('interval');
			t.boolean('image');
			t.boolean('fetchExternal');
			t.integer('parallels');
			t.json('scope');
			t.json('excludes');
			t.json('excludeKeywords');
			t.json('excludeUrls');
			t.integer('maxExcludedDepth');
			t.integer('retry');
			t.boolean('fromList');
			t.boolean('disableQueries');
		})
		.createTable('pages', (t) => {
			t.increments('id');
			t.string('url', 8190).notNullable().unique();
			// Self-reference: points at the page this URL redirects to, if any.
			t.integer('redirectDestId').unsigned().references('pages.id').defaultTo(null);
			t.boolean('scraped').notNullable();
			t.boolean('isTarget').notNullable();
			t.boolean('isExternal');
			t.integer('status');
			t.string('statusText');
			t.string('contentType').nullable();
			t.integer('contentLength').unsigned().nullable();
			t.json('responseHeaders').nullable();
			t.string('lang');
			t.string('title');
			t.string('description');
			t.string('keywords');
			t.boolean('noindex');
			t.boolean('nofollow');
			t.boolean('noarchive');
			t.string('canonical');
			t.string('alternate');
			t.string('og_type');
			t.string('og_title');
			t.string('og_site_name');
			t.string('og_description');
			t.string('og_url');
			t.string('og_image');
			t.string('twitter_card');
			t.string('html');
			t.boolean('isSkipped');
			t.string('skipReason');
			t.integer('order').unsigned().nullable();

			t.index('isExternal');
			t.index('contentType');
			t.index('scraped');
			t.index('redirectDestId');
			t.index('order');
		})
		.createTable('anchors', (t) => {
			t.increments('id');
			t.integer('pageId').notNullable().unsigned().references('pages.id');
			t.integer('hrefId').notNullable().unsigned().references('pages.id');
			t.string('hash');
			t.string('textContent').nullable();

			t.index('pageId');
			t.index('hrefId');
		})
		.createTable('images', (t) => {
			t.increments('id');
			t.integer('pageId').notNullable().unsigned().references('pages.id');
			t.string('src', 8190);
			t.string('currentSrc', 8190);
			t.string('alt');
			t.float('width').unsigned().notNullable();
			t.float('height').unsigned().notNullable();
			t.integer('naturalWidth').unsigned().notNullable();
			t.integer('naturalHeight').unsigned().notNullable();
			t.boolean('isLazy');
			t.integer('viewportWidth').unsigned().notNullable();
			t.string('sourceCode');

			t.index('pageId');
		})
		.createTable('resources', (t) => {
			t.increments('id');
			t.string('url', 8190).notNullable().unique();
			t.boolean('isExternal');
			t.integer('status');
			t.string('statusText');
			t.string('contentType').nullable();
			t.integer('contentLength').unsigned().nullable();
			t.string('compress').nullable();
			t.string('cdn').nullable();
			t.json('responseHeaders').nullable();
		})
		.createTable('resources-referrers', (t) => {
			t.increments('id');
			t.integer('resourceId').notNullable().unsigned().references('resources.id');
			t.integer('pageId').notNullable().unsigned().references('pages.id');

			// One row per (resource, page) pair.
			t.unique(['resourceId', 'pageId']);
			t.index('resourceId');
			t.index('pageId');
		});
}
866
-
867
/**
 * Stores the scraped data of a page in its `pages` row and marks the row as scraped.
 * The row ID is resolved from the page URL (without hash and auth) via `#getIdByUrl`.
 * @param page - The scraped page whose response and meta data are written.
 * @param isTarget - Value stored in the `isTarget` column.
 * @param trx - Optional transaction to run in; the shared instance is used otherwise.
 * @returns The ID of the updated `pages` row.
 */
async #insertPage(page: PageData, isTarget: boolean, trx?: Knex.Transaction) {
	const id = await this.#getIdByUrl(page.url.withoutHashAndAuth, undefined, trx);

	const { meta } = page;
	const columns = {
		scraped: true,
		isTarget,
		isExternal: page.isExternal,
		status: page.status,
		statusText: page.statusText,
		contentType: page.contentType,
		contentLength: page.contentLength,
		// Headers are persisted as a JSON string.
		responseHeaders: JSON.stringify(page.responseHeaders),
		lang: meta.lang,
		title: meta.title,
		description: meta.description,
		keywords: meta.keywords,
		noindex: meta.noindex,
		nofollow: meta.nofollow,
		noarchive: meta.noarchive,
		canonical: meta.canonical,
		alternate: meta.alternate,
		og_type: meta['og:type'],
		og_title: meta['og:title'],
		og_site_name: meta['og:site_name'],
		og_description: meta['og:description'],
		og_url: meta['og:url'],
		og_image: meta['og:image'],
		twitter_card: meta['twitter:card'],
		isSkipped: page.isSkipped,
	};

	const executor = trx ?? this.#instance;
	await executor('pages').where('id', id).update(columns);
	return id;
}
907
-
908
/**
 * Derives the HTML snapshot location of a page and records it in the `html` column.
 * The file is named `<pageId>.html` inside `snapshotDir`; the stored value is that
 * path expressed relative to the working directory.
 * @param pageId - ID of the `pages` row to update.
 * @param snapshotDir - Directory that holds the snapshot files.
 * @param trx - Optional transaction to run in; the shared instance is used otherwise.
 * @returns The resolved snapshot path (`html`) together with the `pageId`.
 */
async #updateSnapshotPath(pageId: number, snapshotDir: string, trx?: Knex.Transaction) {
	const absolutePath = path.resolve(snapshotDir, `${pageId}.html`);
	const storedPath = path.relative(this.#workingDir, absolutePath);

	const executor = trx ?? this.#instance;
	await executor('pages').where('id', pageId).update({ html: storedPath });

	return { html: absolutePath, pageId };
}
926
-
927
/**
 * Builds a ready-to-use Database instance.
 * For SQLite3 the `mkdir` helper is invoked with the database file path first;
 * then the instance is constructed and `#init()` is awaited before it is returned.
 *
 * NOTE(review): the result of `mkdir` is not awaited here — confirm the helper is synchronous.
 * @param options - The database connection options specifying the type and file path.
 * @returns The initialized Database instance.
 */
static async connect(options: DatabaseOption) {
	if (options.type === 'sqlite3') {
		mkdir(options.filename);
	}

	const database = new Database(options);
	await database.#init();
	return database;
}
945
- }
946
-
947
- // ----- ----- ----- ----- -----
948
- //
949
- // Common Queries
950
- //
951
- // ----- ----- ----- ----- -----
952
-
953
/**
 * Returns a Knex subquery callback that selects page IDs with pagination,
 * ordered by the `order` column (nulls last), excluding redirected pages.
 *
 * The callback only configures the builder it receives. It must not `await`
 * the builder: Knex builders are thenables, so awaiting one would execute the
 * sub-select as an extra standalone query whose result is thrown away.
 * @param limit - The maximum number of page IDs to return.
 * @param offset - The number of page IDs to skip before returning results.
 * @returns A synchronous callback that applies the sub-select to the given builder.
 */
function limitedPageIds(limit: number, offset: number) {
	return (qb: Knex.QueryBuilder<Record<string, unknown>, unknown>): void => {
		// `void` marks the thenable as intentionally not awaited (build only, never run here).
		void qb
			.select('id')
			.from<DB_Page>('pages')
			.orderByRaw('`order` ASC NULLS LAST')
			.whereNull('redirectDestId')
			.limit(limit)
			.offset(offset);
	};
}
970
-
971
/**
 * Returns a Knex subquery callback that joins pages with their redirect destinations,
 * yielding `fromId`, `from`, `to` and `toId` columns.
 * When `includeNull` is true, pages without a redirect are added through a UNION
 * as self-referencing rows (`from` equals `to`).
 *
 * The callbacks only configure the builders they receive. They must not `await`
 * them: Knex builders are thenables, so awaiting one would execute it as an extra
 * standalone query whose result is thrown away.
 * @param includeNull - Whether to include non-redirected pages in the result. Defaults to `true`.
 * @returns A synchronous callback that applies the redirect query to the given builder.
 */
function redirectTable(includeNull = true) {
	return (qb: Knex.QueryBuilder<Record<string, unknown>, unknown>): void => {
		const list = qb
			.select('A.id as fromId', 'A.url as from', 'B.url as to', 'B.id as toId')
			.from('pages as A')
			.join('pages as B', (j) => {
				j.on('A.redirectDestId', '=', 'B.id').andOnNotNull('A.redirectDestId');
			});
		if (includeNull) {
			// `void` marks the thenables as intentionally not awaited (build only, never run here).
			void list.union((sub) => {
				void sub
					.select('A.id as fromId', 'A.url as from', 'A.url as to', 'A.id as toId')
					.from('pages as A')
					.whereNull('A.redirectDestId');
			});
		}
	};
}
994
-
995
- // ----- ----- ----- ----- -----
996
- //
997
- // Utils
998
- //
999
- // ----- ----- ----- ----- -----
1000
-
1001
/**
 * Parses a JSON string without throwing.
 * The fallback is returned when the input is not a string, when it is not valid JSON,
 * or when the parsed value is falsy (`null`, `0`, `false`, `""`).
 * @param data - The value to parse; anything other than a string yields the fallback.
 * @param fallback - The value returned whenever no truthy parse result is available.
 * @returns The parsed JSON value, or the fallback.
 */
function getJSON<T>(data: unknown, fallback: T): T {
	if (typeof data !== 'string') {
		return fallback;
	}

	try {
		// `||` (not `??`) on purpose: every falsy parse result maps to the fallback.
		return JSON.parse(data) || fallback;
	} catch {
		return fallback;
	}
}
1022
-
1023
- // ----- ----- ----- ----- -----
1024
- //
1025
- // Types
1026
- //
1027
- // ----- ----- ----- ----- -----
1028
-
1029
/**
 * Fields common to every database connection configuration.
 */
type AbsDatabaseOption = {
	/** Working directory of the database, used for resolving relative paths. */
	workingDir: string;
};

/**
 * Configuration for a SQLite3 database.
 */
type DatabaseSqlite3Option = AbsDatabaseOption & {
	/** Discriminator identifying the SQLite3 variant. */
	type: 'sqlite3';
	/** Absolute path of the SQLite database file. */
	filename: string;
};

/**
 * Configuration for a MySQL database.
 * Note: MySQL support is not yet implemented.
 */
type DatabaseMySqlOption = AbsDatabaseOption & {
	/** Discriminator identifying the MySQL variant. */
	type: 'mysql';
};

/**
 * Any supported database connection configuration, discriminated by `type`.
 */
type DatabaseOption = DatabaseSqlite3Option | DatabaseMySqlOption;