@nitpicker/crawler 0.4.2 → 0.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119) hide show
  1. package/package.json +5 -2
  2. package/CHANGELOG.md +0 -16
  3. package/src/archive/__mock__/.gitignore +0 -3
  4. package/src/archive/__mock__/mock.sqlite +0 -0
  5. package/src/archive/archive-accessor.ts +0 -337
  6. package/src/archive/archive.ts +0 -408
  7. package/src/archive/database.spec.ts +0 -469
  8. package/src/archive/database.ts +0 -1059
  9. package/src/archive/debug.ts +0 -10
  10. package/src/archive/filesystem/append-text.spec.ts +0 -26
  11. package/src/archive/filesystem/append-text.ts +0 -16
  12. package/src/archive/filesystem/copy-dir-sync.spec.ts +0 -27
  13. package/src/archive/filesystem/copy-dir-sync.ts +0 -10
  14. package/src/archive/filesystem/copy-dir.spec.ts +0 -33
  15. package/src/archive/filesystem/copy-dir.ts +0 -14
  16. package/src/archive/filesystem/exists.spec.ts +0 -33
  17. package/src/archive/filesystem/exists.ts +0 -10
  18. package/src/archive/filesystem/get-file-list.spec.ts +0 -37
  19. package/src/archive/filesystem/get-file-list.ts +0 -13
  20. package/src/archive/filesystem/index.ts +0 -17
  21. package/src/archive/filesystem/is-dir.spec.ts +0 -29
  22. package/src/archive/filesystem/is-dir.ts +0 -11
  23. package/src/archive/filesystem/mkdir.spec.ts +0 -37
  24. package/src/archive/filesystem/mkdir.ts +0 -16
  25. package/src/archive/filesystem/output-json.spec.ts +0 -34
  26. package/src/archive/filesystem/output-json.ts +0 -16
  27. package/src/archive/filesystem/output-text.spec.ts +0 -31
  28. package/src/archive/filesystem/output-text.ts +0 -35
  29. package/src/archive/filesystem/read-json.spec.ts +0 -26
  30. package/src/archive/filesystem/read-json.ts +0 -12
  31. package/src/archive/filesystem/read-text.spec.ts +0 -25
  32. package/src/archive/filesystem/read-text.ts +0 -11
  33. package/src/archive/filesystem/readline.spec.ts +0 -29
  34. package/src/archive/filesystem/readline.ts +0 -30
  35. package/src/archive/filesystem/remove.spec.ts +0 -34
  36. package/src/archive/filesystem/remove.ts +0 -11
  37. package/src/archive/filesystem/rename.spec.ts +0 -46
  38. package/src/archive/filesystem/rename.ts +0 -21
  39. package/src/archive/filesystem/tar.spec.ts +0 -33
  40. package/src/archive/filesystem/tar.ts +0 -27
  41. package/src/archive/filesystem/untar.spec.ts +0 -34
  42. package/src/archive/filesystem/untar.ts +0 -36
  43. package/src/archive/index.ts +0 -13
  44. package/src/archive/page.spec.ts +0 -368
  45. package/src/archive/page.ts +0 -420
  46. package/src/archive/resource.spec.ts +0 -101
  47. package/src/archive/resource.ts +0 -73
  48. package/src/archive/safe-path.spec.ts +0 -44
  49. package/src/archive/safe-path.ts +0 -18
  50. package/src/archive/types.ts +0 -227
  51. package/src/crawler/clear-destination-cache.spec.ts +0 -20
  52. package/src/crawler/clear-destination-cache.ts +0 -9
  53. package/src/crawler/crawler.ts +0 -873
  54. package/src/crawler/decompose-url.spec.ts +0 -48
  55. package/src/crawler/decompose-url.ts +0 -90
  56. package/src/crawler/destination-cache.spec.ts +0 -23
  57. package/src/crawler/destination-cache.ts +0 -8
  58. package/src/crawler/detect-pagination-pattern.spec.ts +0 -169
  59. package/src/crawler/detect-pagination-pattern.ts +0 -66
  60. package/src/crawler/fetch-destination.ts +0 -257
  61. package/src/crawler/fetch-robots-txt.spec.ts +0 -83
  62. package/src/crawler/fetch-robots-txt.ts +0 -91
  63. package/src/crawler/find-best-matching-scope.spec.ts +0 -39
  64. package/src/crawler/find-best-matching-scope.ts +0 -57
  65. package/src/crawler/generate-predicted-urls.spec.ts +0 -42
  66. package/src/crawler/generate-predicted-urls.ts +0 -34
  67. package/src/crawler/handle-ignore-and-skip.spec.ts +0 -66
  68. package/src/crawler/handle-ignore-and-skip.ts +0 -30
  69. package/src/crawler/handle-resource-response.spec.ts +0 -45
  70. package/src/crawler/handle-resource-response.ts +0 -21
  71. package/src/crawler/handle-scrape-end.spec.ts +0 -109
  72. package/src/crawler/handle-scrape-end.ts +0 -115
  73. package/src/crawler/handle-scrape-error.spec.ts +0 -105
  74. package/src/crawler/handle-scrape-error.ts +0 -58
  75. package/src/crawler/index.ts +0 -2
  76. package/src/crawler/inject-scope-auth.spec.ts +0 -36
  77. package/src/crawler/inject-scope-auth.ts +0 -27
  78. package/src/crawler/is-external-url.spec.ts +0 -31
  79. package/src/crawler/is-external-url.ts +0 -17
  80. package/src/crawler/is-in-any-lower-layer.spec.ts +0 -31
  81. package/src/crawler/is-in-any-lower-layer.ts +0 -22
  82. package/src/crawler/link-list.spec.ts +0 -355
  83. package/src/crawler/link-list.ts +0 -275
  84. package/src/crawler/link-to-page-data.spec.ts +0 -133
  85. package/src/crawler/link-to-page-data.ts +0 -34
  86. package/src/crawler/net-timeout-error.spec.ts +0 -25
  87. package/src/crawler/net-timeout-error.ts +0 -11
  88. package/src/crawler/protocol-agnostic-key.spec.ts +0 -40
  89. package/src/crawler/protocol-agnostic-key.ts +0 -11
  90. package/src/crawler/reconstruct-url.spec.ts +0 -37
  91. package/src/crawler/reconstruct-url.ts +0 -37
  92. package/src/crawler/robots-checker.spec.ts +0 -104
  93. package/src/crawler/robots-checker.ts +0 -73
  94. package/src/crawler/should-discard-predicted.spec.ts +0 -125
  95. package/src/crawler/should-discard-predicted.ts +0 -33
  96. package/src/crawler/should-skip-url.spec.ts +0 -77
  97. package/src/crawler/should-skip-url.ts +0 -37
  98. package/src/crawler/types.ts +0 -146
  99. package/src/crawler-orchestrator.ts +0 -401
  100. package/src/debug.ts +0 -10
  101. package/src/index.ts +0 -25
  102. package/src/types.ts +0 -30
  103. package/src/utils/array/each-splitted.spec.ts +0 -38
  104. package/src/utils/array/each-splitted.ts +0 -19
  105. package/src/utils/array/index.ts +0 -1
  106. package/src/utils/debug.ts +0 -6
  107. package/src/utils/error/dom-evaluation-error.spec.ts +0 -20
  108. package/src/utils/error/dom-evaluation-error.ts +0 -6
  109. package/src/utils/error/error-emitter.spec.ts +0 -78
  110. package/src/utils/error/error-emitter.ts +0 -44
  111. package/src/utils/error/index.ts +0 -3
  112. package/src/utils/index.ts +0 -5
  113. package/src/utils/object/clean-object.spec.ts +0 -24
  114. package/src/utils/object/clean-object.ts +0 -13
  115. package/src/utils/object/index.ts +0 -1
  116. package/src/utils/types/index.ts +0 -1
  117. package/src/utils/types/types.ts +0 -65
  118. package/tsconfig.json +0 -11
  119. package/tsconfig.tsbuildinfo +0 -1
@@ -1,355 +0,0 @@
1
- import type { PageData } from '../utils/index.js';
2
- import type { ExURL, ParseURLOptions } from '@d-zero/shared/parse-url';
3
-
4
- import { tryParseUrl as parseUrl } from '@d-zero/shared/parse-url';
5
- import { describe, it, expect } from 'vitest';
6
-
7
- import LinkList from './link-list.js';
8
- import { protocolAgnosticKey } from './protocol-agnostic-key.js';
9
-
10
- const defaultOptions: ParseURLOptions = {};
11
-
12
/**
 * Parse a fixture URL for use in the tests below.
 *
 * Fails fast with a descriptive error instead of relying on a non-null
 * assertion, so a typo in a fixture URL is reported where it happens
 * rather than as a confusing `TypeError` deeper inside a test.
 * @param href - The absolute URL string to parse.
 * @returns The parsed URL.
 * @throws {Error} If `parseUrl` yields no result for `href`.
 */
function createUrl(href: string): ExURL {
	const parsed = parseUrl(href);
	if (!parsed) {
		throw new Error(`createUrl: unable to parse test URL: ${href}`);
	}
	return parsed;
}
19
-
20
/**
 * Build a crawl scope map from plain strings.
 *
 * Each entry pairs a hostname with the URL strings that form its scope;
 * every string is parsed with `parseUrl` and results that are falsy
 * (unparseable input) are left out.
 * @param entries - Tuples of `[hostname, scopeUrlStrings]`.
 * @returns A map from hostname to the parsed scope URLs.
 */
function createScope(entries: [string, string[]][]): Map<string, ExURL[]> {
	const scope = new Map<string, ExURL[]>();
	for (const [hostname, hrefs] of entries) {
		const scopeUrls: ExURL[] = [];
		for (const href of hrefs) {
			const parsed = parseUrl(href);
			if (parsed) {
				scopeUrls.push(parsed);
			}
		}
		scope.set(hostname, scopeUrls);
	}
	return scope;
}
29
-
30
/**
 * Create a `PageData` fixture describing a successful HTML response.
 *
 * The defaults model an in-scope `200 OK` `text/html` page at
 * `https://example.com/page` with no redirects; any field can be
 * replaced through `overrides`.
 * @param overrides - Fields that should replace the defaults.
 * @returns The default page data with `overrides` applied on top.
 */
function createPageData(overrides: Partial<PageData> = {}): PageData {
	const defaults: PageData = {
		url: createUrl('https://example.com/page'),
		redirectPaths: [],
		isTarget: true,
		isExternal: false,
		status: 200,
		statusText: 'OK',
		contentType: 'text/html',
		contentLength: 1024,
		responseHeaders: {},
		meta: { title: 'Test' },
		anchorList: [],
		imageList: [],
		html: '<html></html>',
		isSkipped: false,
	};
	// Later spread wins, so every key present in `overrides` replaces its default.
	return { ...defaults, ...overrides };
}
53
-
54
// Behavioural tests for LinkList: queueing (`add`), state transitions
// (`progress`, `done`), progress counters (`getPageCount`) and session
// restoration (`resume`). Queue entries are compared through
// `protocolAgnosticKey`, the same key function LinkList uses internally.
describe('LinkList', () => {
	describe('add', () => {
		it('adds a new URL to pending', () => {
			const list = new LinkList();
			const url = createUrl('https://example.com/page');
			list.add(url);
			const { pending } = list.getLinks();
			expect(pending).toContain(protocolAgnosticKey(url.withoutHashAndAuth));
		});

		it('deduplicates by withoutHashAndAuth', () => {
			const list = new LinkList();
			const url1 = createUrl('https://example.com/page');
			const url2 = createUrl('https://example.com/page');
			list.add(url1);
			list.add(url2);
			const { pending } = list.getLinks();
			expect(pending.length).toBe(1);
		});

		it('does not re-add a URL that is already done', () => {
			const list = new LinkList();
			const url = createUrl('https://example.com/page');
			const scope = createScope([['example.com', ['https://example.com/']]]);
			list.add(url);
			list.done(url, scope, { page: createPageData() }, defaultOptions);
			list.add(url);
			const { pending } = list.getLinks();
			expect(pending).not.toContain(protocolAgnosticKey(url.withoutHashAndAuth));
		});

		it('supports metadataOnly flag', () => {
			const list = new LinkList();
			const url = createUrl('https://example.com/page');
			list.add(url, { metadataOnly: true });
			expect(list.isMetadataOnly(url.withoutHashAndAuth)).toBe(true);
		});

		it('supports predicted flag', () => {
			const list = new LinkList();
			const url = createUrl('https://example.com/page/5');
			list.add(url, { predicted: true });
			expect(list.isPredicted(url.withoutHashAndAuth)).toBe(true);
		});

		it('returns false for isPredicted when flag is not set', () => {
			const list = new LinkList();
			const url = createUrl('https://example.com/page/5');
			list.add(url);
			expect(list.isPredicted(url.withoutHashAndAuth)).toBe(false);
		});

		it('metadataOnly and predicted flags are independent', () => {
			const list = new LinkList();
			const url = createUrl('https://example.com/page/5');
			list.add(url, { metadataOnly: true, predicted: true });
			expect(list.isMetadataOnly(url.withoutHashAndAuth)).toBe(true);
			expect(list.isPredicted(url.withoutHashAndAuth)).toBe(true);
		});

		it('predicted add on existing URL is a no-op (dedup)', () => {
			const list = new LinkList();
			const url1 = createUrl('https://example.com/page/5');
			const url2 = createUrl('https://example.com/page/5');
			list.add(url1);
			list.add(url2, { predicted: true });
			// predicted flag should not be set since second add was a no-op
			expect(list.isPredicted(url1.withoutHashAndAuth)).toBe(false);
		});

		it('predicted flag is protocol-agnostic', () => {
			const list = new LinkList();
			const httpUrl = createUrl('http://example.com/page/5');
			list.add(httpUrl, { predicted: true });
			const httpsUrl = createUrl('https://example.com/page/5');
			expect(list.isPredicted(httpsUrl.withoutHashAndAuth)).toBe(true);
		});

		it('deduplicates HTTP and HTTPS URLs', () => {
			const list = new LinkList();
			const httpUrl = createUrl('http://example.com/page');
			const httpsUrl = createUrl('https://example.com/page');
			list.add(httpUrl);
			list.add(httpsUrl);
			const { pending } = list.getLinks();
			expect(pending.length).toBe(1);
		});

		it('does not re-add HTTPS URL when HTTP variant is already done', () => {
			const list = new LinkList();
			const httpUrl = createUrl('http://example.com/page');
			const httpsUrl = createUrl('https://example.com/page');
			const scope = createScope([['example.com', ['http://example.com/']]]);
			list.add(httpUrl);
			list.done(httpUrl, scope, { page: createPageData() }, defaultOptions);
			list.add(httpsUrl);
			const { pending } = list.getLinks();
			expect(pending.length).toBe(0);
		});
	});

	describe('progress', () => {
		it('moves URL from pending to progress', () => {
			const list = new LinkList();
			const url = createUrl('https://example.com/page');
			list.add(url);
			list.progress(url);
			const { pending, progress } = list.getLinks();
			expect(pending).not.toContain(protocolAgnosticKey(url.withoutHashAndAuth));
			expect(progress).toContain(protocolAgnosticKey(url.withoutHashAndAuth));
		});

		it('is a no-op if URL is not pending', () => {
			const list = new LinkList();
			const url = createUrl('https://example.com/page');
			list.progress(url);
			const { pending, progress } = list.getLinks();
			expect(pending.length).toBe(0);
			expect(progress.length).toBe(0);
		});
	});

	describe('done', () => {
		it('creates a Link with dest from page data', () => {
			const list = new LinkList();
			const url = createUrl('https://example.com/page');
			const scope = createScope([['example.com', ['https://example.com/']]]);
			list.add(url);
			const page = createPageData();
			const link = list.done(url, scope, { page }, defaultOptions);
			expect(link).not.toBeNull();
			expect(link!.dest).toBeDefined();
			expect(link!.dest!.status).toBe(200);
			expect(link!.dest!.title).toBe('Test');
		});

		it('sets isExternal=true when hostname not in scope', () => {
			const list = new LinkList();
			const url = createUrl('https://external.com/page');
			const scope = createScope([['example.com', ['https://example.com/']]]);
			list.add(url);
			const link = list.done(url, scope, { page: createPageData() }, defaultOptions);
			expect(link).not.toBeNull();
			expect(link!.isExternal).toBe(true);
		});

		it('sets isLowerLayer based on scope matching', () => {
			const list = new LinkList();
			const url = createUrl('https://example.com/blog/post');
			const scope = createScope([['example.com', ['https://example.com/blog/']]]);
			list.add(url);
			const link = list.done(url, scope, { page: createPageData() }, defaultOptions);
			expect(link).not.toBeNull();
			expect(link!.isLowerLayer).toBe(true);
		});

		it('increments completePages for valid internal HTML pages', () => {
			const list = new LinkList();
			const url = createUrl('https://example.com/blog/post');
			const scope = createScope([['example.com', ['https://example.com/']]]);
			list.add(url);
			list.done(
				url,
				scope,
				{
					page: createPageData({
						status: 200,
						contentType: 'text/html',
					}),
				},
				defaultOptions,
			);
			expect(list.completePages).toBe(1);
		});

		it('does not count error pages as completePages', () => {
			const list = new LinkList();
			const url = createUrl('https://example.com/page');
			const scope = createScope([['example.com', ['https://example.com/']]]);
			list.add(url);
			list.done(
				url,
				scope,
				{
					page: createPageData({ status: 404, statusText: 'Not Found' }),
				},
				defaultOptions,
			);
			expect(list.completePages).toBe(0);
		});

		it('handles error resource with ERR_NAME_NOT_RESOLVED', () => {
			const list = new LinkList();
			const url = createUrl('https://nonexistent.example.com/page');
			const scope = createScope([
				['nonexistent.example.com', ['https://nonexistent.example.com/']],
			]);
			list.add(url);
			const link = list.done(
				url,
				scope,
				{ error: new Error('net::ERR_NAME_NOT_RESOLVED') },
				defaultOptions,
			);
			expect(link).not.toBeNull();
			expect(link!.dest).toBeDefined();
			expect(link!.dest!.status).toBe(-1);
			expect(link!.dest!.statusText).toBe('net::ERR_NAME_NOT_RESOLVED');
		});

		it('adds redirectPaths to done set', () => {
			const list = new LinkList();
			const url = createUrl('https://example.com/start');
			const scope = createScope([['example.com', ['https://example.com/']]]);
			list.add(url);
			const page = createPageData({
				redirectPaths: ['https://example.com/redirected'],
			});
			list.done(url, scope, { page }, defaultOptions);
			// The redirect path should be in done set, so adding it again should be a no-op
			const redirectUrl = createUrl('https://example.com/redirected');
			list.add(redirectUrl);
			const { pending } = list.getLinks();
			expect(pending).not.toContain(protocolAgnosticKey(redirectUrl.withoutHashAndAuth));
		});

		it('returns null if URL was not added', () => {
			const list = new LinkList();
			const url = createUrl('https://example.com/unknown');
			const scope = createScope([['example.com', ['https://example.com/']]]);
			const link = list.done(url, scope, { page: createPageData() }, defaultOptions);
			expect(link).toBeNull();
		});

		// Unlike the ERR_NAME_NOT_RESOLVED test above, this one pins the exact
		// shape of the synthetic `dest` and uses a message where the error code
		// is only a substring.
		it('sets dest with status -1 for ERR_NAME_NOT_RESOLVED errors', () => {
			const list = new LinkList();
			const url = createUrl('https://example.com/page');
			const scope = createScope([['example.com', ['https://example.com/']]]);
			list.add(url);
			const link = list.done(
				url,
				scope,
				{ error: new Error('ERR_NAME_NOT_RESOLVED at something') },
				defaultOptions,
			);
			expect(link).not.toBeNull();
			expect(link!.dest).toEqual({
				redirectPaths: [],
				status: -1,
				statusText: 'ERR_NAME_NOT_RESOLVED at something',
				contentType: null,
				contentLength: null,
				responseHeaders: {},
			});
		});
	});

	describe('getPageCount', () => {
		it('returns correct counts', () => {
			const list = new LinkList();
			const url1 = createUrl('https://example.com/page1');
			const url2 = createUrl('https://example.com/page2');
			const scope = createScope([['example.com', ['https://example.com/']]]);
			list.add(url1);
			list.add(url2);
			list.done(
				url1,
				scope,
				{
					page: createPageData({ status: 200, contentType: 'text/html' }),
				},
				defaultOptions,
			);

			// url1 is done (and counts as a page); url2 is still pending.
			const counts = list.getPageCount();
			expect(counts.totalLinks).toBe(2);
			expect(counts.completedLinks).toBe(1);
			expect(counts.completedPages).toBe(1);
		});
	});

	describe('resume', () => {
		it('restores pending and done URLs', () => {
			const list = new LinkList();
			list.resume(
				['https://example.com/pending1'],
				['https://example.com/done1'],
				defaultOptions,
			);

			// pending URLs are re-queued through add(), which places them in the
			// pending set only (they are not marked as done)
			const { pending } = list.getLinks();
			expect(pending.length).toBe(1);

			// done URLs should be marked as done, so adding again is a no-op
			const doneUrl = createUrl('https://example.com/done1');
			list.add(doneUrl);
			const { pending: afterAdd } = list.getLinks();
			expect(afterAdd).not.toContain(protocolAgnosticKey(doneUrl.withoutHashAndAuth));
		});
	});
});
@@ -1,275 +0,0 @@
1
- import type { Link, PageData } from '../utils/index.js';
2
- import type { ExURL, ParseURLOptions } from '@d-zero/shared/parse-url';
3
-
4
- import { isError } from '@d-zero/beholder';
5
- import { isLowerLayer } from '@d-zero/shared/is-lower-layer';
6
- import { tryParseUrl as parseUrl } from '@d-zero/shared/parse-url';
7
-
8
- import { protocolAgnosticKey } from './protocol-agnostic-key.js';
9
-
10
/**
 * Manages the queue of URLs discovered during crawling.
 *
 * Tracks URLs across three states: pending (queued but not started),
 * in-progress (currently being scraped), and done (scraping completed).
 * Every URL is stored under `protocolAgnosticKey(withoutHashAndAuth)`, so
 * deduplication ignores hash, auth and the `http`/`https` distinction.
 * Also tracks page completion counts for progress reporting.
 */
export default class LinkList {
	#completePages = 0;
	#done = new Set<string>();
	#metadataOnlyFlag = new Set<string>();
	#pending = new Set<string>();
	#predictedFlag = new Set<string>();
	#progress = new Set<string>();

	/**
	 * The number of successfully completed internal HTML pages.
	 *
	 * Only counts pages that are internal, in a lower layer, use HTTP(S),
	 * have no error status, and have `text/html` content type. Each scrape
	 * result counts at most once, regardless of how many redirect hops it has.
	 */
	get completePages() {
		return this.#completePages;
	}

	/**
	 * Add a URL to the pending queue if it has not been seen before.
	 *
	 * Deduplication is based on the URL's `withoutHashAndAuth` representation.
	 * If the URL is already pending, in progress, or done, this is a no-op
	 * (in particular, the flags in `options` are NOT applied in that case).
	 * @param linkUrl - The parsed URL to add to the queue.
	 * @param options - Optional flags for the URL.
	 * @param options.metadataOnly - If `true`, marks this URL for title-only scraping
	 * (metadata extraction without full page processing).
	 * @param options.predicted - If `true`, marks this URL as a predicted pagination guess
	 * that should be discarded if it returns a 4xx/5xx status.
	 */
	add(linkUrl: ExURL, options?: { metadataOnly?: true; predicted?: true }) {
		const key = protocolAgnosticKey(linkUrl.withoutHashAndAuth);
		if (this.#pending.has(key) || this.#progress.has(key) || this.#done.has(key)) {
			return;
		}
		this.#pending.add(key);
		if (options?.metadataOnly) {
			this.#metadataOnlyFlag.add(key);
		}
		if (options?.predicted) {
			this.#predictedFlag.add(key);
		}
	}

	/**
	 * Mark a URL as completed and record its scrape result.
	 *
	 * Moves the URL from pending/progress to done, constructs a {@link Link}
	 * object with scope and layer information, and increments the page counter
	 * once if the result qualifies as a valid HTML page. Every entry of
	 * `resource.page.redirectPaths` is also marked as done so that redirect
	 * hops are not queued again later.
	 * @param url - The URL that has been scraped.
	 * @param scope - The current scope map (hostname to scope URLs).
	 * @param resource - The scrape result containing page data and/or error information.
	 * @param resource.page - The scraped page data, if the scrape succeeded.
	 * @param resource.error - The error object, if the scrape failed.
	 * @param options - URL parsing options (e.g., `disableQueries`).
	 * @returns The constructed {@link Link} object, or `null` if the URL was not in the queue
	 * (or could not be re-parsed with `options`).
	 */
	done(
		url: ExURL,
		scope: ReadonlyMap<string /* hostname */, readonly ExURL[]>,
		resource: { page?: PageData; error?: Error },
		options: ParseURLOptions,
	): Link | null {
		const key = protocolAgnosticKey(url.withoutHashAndAuth);
		if (!(this.#pending.has(key) || this.#progress.has(key))) {
			return null;
		}
		this.#pending.delete(key);
		this.#progress.delete(key);
		const linkUrl = parseUrl(url, options);
		if (!linkUrl) {
			// NOTE(review): at this point the key has left the queue without being
			// recorded as done; kept as-is so a failing re-parse cannot stall the crawl.
			return null;
		}
		const sameScopes = scope.get(linkUrl.hostname);
		const link: Link = {
			url: linkUrl,
			isLowerLayer: sameScopes
				? sameScopes.some((s) => isLowerLayer(linkUrl.href, s.href, options))
				: false,
			// A hostname with no scope entry at all is treated as external.
			isExternal: !sameScopes,
		};
		if (resource.page) {
			link.dest = {
				redirectPaths: resource.page.redirectPaths,
				status: resource.page.status,
				statusText: resource.page.statusText,
				contentType: resource.page.contentType,
				contentLength: resource.page.contentLength,
				responseHeaders: resource.page.responseHeaders,
				title: resource.page.meta.title,
			};
		}
		if (resource.error?.message.includes('ERR_NAME_NOT_RESOLVED')) {
			// DNS failures get a synthetic destination with status -1; this
			// replaces any destination built from `resource.page` above.
			link.dest = {
				redirectPaths: [],
				status: -1,
				statusText: resource.error.message,
				contentType: null,
				contentLength: null,
				responseHeaders: {},
			};
		}

		// Mark the scraped URL and every redirect hop as done.
		this.#done.add(key);
		if (resource.page) {
			for (const path of resource.page.redirectPaths) {
				this.#done.add(protocolAgnosticKey(path));
			}
		}

		// One scrape result is one page: count it once, not once per redirect hop.
		if (isPage(link)) {
			this.#completePages += 1;
		}
		return link;
	}

	/**
	 * Get the current pending and in-progress URL lists.
	 * @returns An object containing arrays of pending and in-progress URL keys
	 * (fresh copies; mutating them does not affect the queue).
	 */
	getLinks() {
		return {
			/** URLs queued but not yet started. */
			pending: [...this.#pending.values()],
			/** URLs currently being scraped. */
			progress: [...this.#progress.values()],
		};
	}

	/**
	 * Get a summary of crawl progress counts.
	 *
	 * Every pending or in-progress URL is counted as a potential page in
	 * `totalPages`, since whether it is a page is only known once it is done.
	 * @returns An object with total/completed counts for both all links and pages only.
	 */
	getPageCount() {
		// Read the set sizes directly; there is no need to copy the sets into arrays.
		const queued = this.#pending.size + this.#progress.size;
		const completedLinks = this.#done.size;
		const completedPages = this.#completePages;
		const totalLinks = queued + completedLinks;
		const totalPages = queued + completedPages;

		return {
			/** Total number of discovered links (pending + progress + done). */
			totalLinks,
			/** Number of links that have been fully processed. */
			completedLinks,
			/** Total number of discovered pages (pending + progress + completed pages). */
			totalPages,
			/** Number of pages that have been successfully scraped. */
			completedPages,
		};
	}

	/**
	 * Check whether a URL is flagged for title-only scraping.
	 *
	 * Title-only scraping extracts only the page title and basic metadata,
	 * without processing anchors or capturing the full HTML.
	 * @param urlWithoutHashAndAuth - The normalized URL string (without hash and auth) to check.
	 * @returns `true` if the URL should be scraped in title-only mode.
	 */
	isMetadataOnly(urlWithoutHashAndAuth: string) {
		return this.#metadataOnlyFlag.has(protocolAgnosticKey(urlWithoutHashAndAuth));
	}

	/**
	 * Check whether a URL was added as a predicted pagination URL.
	 * @param urlWithoutHashAndAuth - The normalized URL string (without hash and auth) to check.
	 * @returns `true` if the URL was added with the predicted flag.
	 */
	isPredicted(urlWithoutHashAndAuth: string) {
		return this.#predictedFlag.has(protocolAgnosticKey(urlWithoutHashAndAuth));
	}

	/**
	 * Transition a URL from the pending state to the in-progress state.
	 *
	 * This should be called when scraping of the URL actually begins.
	 * If the URL is not in the pending set, this is a no-op.
	 * @param url - The URL that is now being actively scraped.
	 */
	progress(url: ExURL) {
		const key = protocolAgnosticKey(url.withoutHashAndAuth);
		if (!this.#pending.has(key)) {
			return;
		}
		this.#pending.delete(key);
		this.#progress.add(key);
	}

	/**
	 * Restore the link list state from a previous crawl session.
	 *
	 * Marks previously done URLs as completed first, then re-adds pending URLs
	 * to the queue, enabling the crawler to resume from where it left off.
	 * Pending entries that cannot be parsed, that duplicate an earlier entry,
	 * or that are already done are skipped.
	 * @param pending - URLs that were pending in the previous session.
	 * @param done - URLs that were already completed in the previous session.
	 * @param options - URL parsing options for re-parsing the pending URLs.
	 * @returns The parsed pending URLs that were successfully added to the queue.
	 */
	resume(pending: string[], done: string[], options: ParseURLOptions): ExURL[] {
		const parsedPending: ExURL[] = [];
		for (const url of done) {
			this.#done.add(protocolAgnosticKey(url));
		}
		for (const url of pending) {
			const parsedUrl = parseUrl(url, options);
			if (!parsedUrl) {
				continue;
			}
			const key = protocolAgnosticKey(parsedUrl.withoutHashAndAuth);
			if (this.#pending.has(key) || this.#progress.has(key) || this.#done.has(key)) {
				// `add` would be a no-op here, so the URL must not be reported as queued.
				continue;
			}
			this.add(parsedUrl);
			parsedPending.push(parsedUrl);
		}
		return parsedPending;
	}
}
236
-
237
/**
 * Decide whether a link counts as a valid internal HTML page.
 *
 * All of the following must hold:
 * - the link is not external
 * - the link is in a lower layer of the scope
 * - the URL protocol is exactly `http:` or `https:`
 * - destination data is present and its status is not an error per `isError`
 * - the destination content type is exactly `text/html`
 * @param link - The link to evaluate.
 * @returns `true` if the link represents a valid internal HTML page.
 */
function isPage(link: Link) {
	// Scope and protocol checks; evaluated left to right with short-circuiting.
	if (link.isExternal || !link.isLowerLayer || !/^https?:$/.test(link.url.protocol)) {
		return false;
	}

	// Without destination data, or with an error status, it cannot be a page.
	const dest = link.dest;
	if (!dest || isError(dest.status)) {
		return false;
	}

	return dest.contentType === 'text/html';
}