@nitpicker/crawler 0.4.2 → 0.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119) hide show
  1. package/package.json +5 -2
  2. package/CHANGELOG.md +0 -16
  3. package/src/archive/__mock__/.gitignore +0 -3
  4. package/src/archive/__mock__/mock.sqlite +0 -0
  5. package/src/archive/archive-accessor.ts +0 -337
  6. package/src/archive/archive.ts +0 -408
  7. package/src/archive/database.spec.ts +0 -469
  8. package/src/archive/database.ts +0 -1059
  9. package/src/archive/debug.ts +0 -10
  10. package/src/archive/filesystem/append-text.spec.ts +0 -26
  11. package/src/archive/filesystem/append-text.ts +0 -16
  12. package/src/archive/filesystem/copy-dir-sync.spec.ts +0 -27
  13. package/src/archive/filesystem/copy-dir-sync.ts +0 -10
  14. package/src/archive/filesystem/copy-dir.spec.ts +0 -33
  15. package/src/archive/filesystem/copy-dir.ts +0 -14
  16. package/src/archive/filesystem/exists.spec.ts +0 -33
  17. package/src/archive/filesystem/exists.ts +0 -10
  18. package/src/archive/filesystem/get-file-list.spec.ts +0 -37
  19. package/src/archive/filesystem/get-file-list.ts +0 -13
  20. package/src/archive/filesystem/index.ts +0 -17
  21. package/src/archive/filesystem/is-dir.spec.ts +0 -29
  22. package/src/archive/filesystem/is-dir.ts +0 -11
  23. package/src/archive/filesystem/mkdir.spec.ts +0 -37
  24. package/src/archive/filesystem/mkdir.ts +0 -16
  25. package/src/archive/filesystem/output-json.spec.ts +0 -34
  26. package/src/archive/filesystem/output-json.ts +0 -16
  27. package/src/archive/filesystem/output-text.spec.ts +0 -31
  28. package/src/archive/filesystem/output-text.ts +0 -35
  29. package/src/archive/filesystem/read-json.spec.ts +0 -26
  30. package/src/archive/filesystem/read-json.ts +0 -12
  31. package/src/archive/filesystem/read-text.spec.ts +0 -25
  32. package/src/archive/filesystem/read-text.ts +0 -11
  33. package/src/archive/filesystem/readline.spec.ts +0 -29
  34. package/src/archive/filesystem/readline.ts +0 -30
  35. package/src/archive/filesystem/remove.spec.ts +0 -34
  36. package/src/archive/filesystem/remove.ts +0 -11
  37. package/src/archive/filesystem/rename.spec.ts +0 -46
  38. package/src/archive/filesystem/rename.ts +0 -21
  39. package/src/archive/filesystem/tar.spec.ts +0 -33
  40. package/src/archive/filesystem/tar.ts +0 -27
  41. package/src/archive/filesystem/untar.spec.ts +0 -34
  42. package/src/archive/filesystem/untar.ts +0 -36
  43. package/src/archive/index.ts +0 -13
  44. package/src/archive/page.spec.ts +0 -368
  45. package/src/archive/page.ts +0 -420
  46. package/src/archive/resource.spec.ts +0 -101
  47. package/src/archive/resource.ts +0 -73
  48. package/src/archive/safe-path.spec.ts +0 -44
  49. package/src/archive/safe-path.ts +0 -18
  50. package/src/archive/types.ts +0 -227
  51. package/src/crawler/clear-destination-cache.spec.ts +0 -20
  52. package/src/crawler/clear-destination-cache.ts +0 -9
  53. package/src/crawler/crawler.ts +0 -873
  54. package/src/crawler/decompose-url.spec.ts +0 -48
  55. package/src/crawler/decompose-url.ts +0 -90
  56. package/src/crawler/destination-cache.spec.ts +0 -23
  57. package/src/crawler/destination-cache.ts +0 -8
  58. package/src/crawler/detect-pagination-pattern.spec.ts +0 -169
  59. package/src/crawler/detect-pagination-pattern.ts +0 -66
  60. package/src/crawler/fetch-destination.ts +0 -257
  61. package/src/crawler/fetch-robots-txt.spec.ts +0 -83
  62. package/src/crawler/fetch-robots-txt.ts +0 -91
  63. package/src/crawler/find-best-matching-scope.spec.ts +0 -39
  64. package/src/crawler/find-best-matching-scope.ts +0 -57
  65. package/src/crawler/generate-predicted-urls.spec.ts +0 -42
  66. package/src/crawler/generate-predicted-urls.ts +0 -34
  67. package/src/crawler/handle-ignore-and-skip.spec.ts +0 -66
  68. package/src/crawler/handle-ignore-and-skip.ts +0 -30
  69. package/src/crawler/handle-resource-response.spec.ts +0 -45
  70. package/src/crawler/handle-resource-response.ts +0 -21
  71. package/src/crawler/handle-scrape-end.spec.ts +0 -109
  72. package/src/crawler/handle-scrape-end.ts +0 -115
  73. package/src/crawler/handle-scrape-error.spec.ts +0 -105
  74. package/src/crawler/handle-scrape-error.ts +0 -58
  75. package/src/crawler/index.ts +0 -2
  76. package/src/crawler/inject-scope-auth.spec.ts +0 -36
  77. package/src/crawler/inject-scope-auth.ts +0 -27
  78. package/src/crawler/is-external-url.spec.ts +0 -31
  79. package/src/crawler/is-external-url.ts +0 -17
  80. package/src/crawler/is-in-any-lower-layer.spec.ts +0 -31
  81. package/src/crawler/is-in-any-lower-layer.ts +0 -22
  82. package/src/crawler/link-list.spec.ts +0 -355
  83. package/src/crawler/link-list.ts +0 -275
  84. package/src/crawler/link-to-page-data.spec.ts +0 -133
  85. package/src/crawler/link-to-page-data.ts +0 -34
  86. package/src/crawler/net-timeout-error.spec.ts +0 -25
  87. package/src/crawler/net-timeout-error.ts +0 -11
  88. package/src/crawler/protocol-agnostic-key.spec.ts +0 -40
  89. package/src/crawler/protocol-agnostic-key.ts +0 -11
  90. package/src/crawler/reconstruct-url.spec.ts +0 -37
  91. package/src/crawler/reconstruct-url.ts +0 -37
  92. package/src/crawler/robots-checker.spec.ts +0 -104
  93. package/src/crawler/robots-checker.ts +0 -73
  94. package/src/crawler/should-discard-predicted.spec.ts +0 -125
  95. package/src/crawler/should-discard-predicted.ts +0 -33
  96. package/src/crawler/should-skip-url.spec.ts +0 -77
  97. package/src/crawler/should-skip-url.ts +0 -37
  98. package/src/crawler/types.ts +0 -146
  99. package/src/crawler-orchestrator.ts +0 -401
  100. package/src/debug.ts +0 -10
  101. package/src/index.ts +0 -25
  102. package/src/types.ts +0 -30
  103. package/src/utils/array/each-splitted.spec.ts +0 -38
  104. package/src/utils/array/each-splitted.ts +0 -19
  105. package/src/utils/array/index.ts +0 -1
  106. package/src/utils/debug.ts +0 -6
  107. package/src/utils/error/dom-evaluation-error.spec.ts +0 -20
  108. package/src/utils/error/dom-evaluation-error.ts +0 -6
  109. package/src/utils/error/error-emitter.spec.ts +0 -78
  110. package/src/utils/error/error-emitter.ts +0 -44
  111. package/src/utils/error/index.ts +0 -3
  112. package/src/utils/index.ts +0 -5
  113. package/src/utils/object/clean-object.spec.ts +0 -24
  114. package/src/utils/object/clean-object.ts +0 -13
  115. package/src/utils/object/index.ts +0 -1
  116. package/src/utils/types/index.ts +0 -1
  117. package/src/utils/types/types.ts +0 -65
  118. package/tsconfig.json +0 -11
  119. package/tsconfig.tsbuildinfo +0 -1
@@ -1,469 +0,0 @@
1
- import path from 'node:path';
2
-
3
- import { tryParseUrl as parseUrl } from '@d-zero/shared/parse-url';
4
- import { afterAll, describe, expect, it } from 'vitest';
5
-
6
- import { Database } from './database.js';
7
- import { remove } from './filesystem/index.js';
8
-
9
- const __filename = new URL(import.meta.url).pathname;
10
- const __dirname = path.dirname(__filename);
11
- const workingDir = path.resolve(__dirname, '__mock__');
12
-
13
- afterAll(async () => {
14
- await remove(path.resolve(workingDir, 'tmp.sqlite'));
15
- });
16
-
17
- describe('Pages', () => {
18
- it('insert', async () => {
19
- const db = await Database.connect({
20
- type: 'sqlite3',
21
- workingDir,
22
- filename: path.resolve(workingDir, 'tmp.sqlite'),
23
- });
24
-
25
- await db.updatePage(
26
- {
27
- url: parseUrl('http://localhost/path/to')!,
28
- redirectPaths: [],
29
- isExternal: false,
30
- status: 200,
31
- statusText: 'OK',
32
- contentLength: 1000,
33
- contentType: 'html/text',
34
- responseHeaders: {},
35
- meta: {
36
- title: 'LOCAL_SERVER',
37
- },
38
- anchorList: [],
39
- imageList: [],
40
- html: '',
41
- isSkipped: false,
42
- },
43
- workingDir,
44
- true,
45
- );
46
-
47
- const pages = await db.getPages();
48
-
49
- expect(pages.length).toBe(1);
50
- });
51
-
52
- // Create mock.sqlite for the next test
53
- // it.skip('insert 2', async () => {
54
- // const db = await Database.connect({
55
- // type: 'sqlite3',
56
- // workingDir,
57
- // filename: path.resolve(workingDir, 'mock.sqlite'),
58
- // });
59
-
60
- // await db.updatePage(
61
- // {
62
- // url: parseUrl('http://localhost/path/to')!,
63
- // redirectPaths: ['https://localhost/path/to'],
64
- // isExternal: false,
65
- // status: 200,
66
- // statusText: 'OK',
67
- // contentLength: 1000,
68
- // contentType: 'html/text',
69
- // responseHeaders: {},
70
- // meta: {
71
- // title: 'LOCAL_SERVER',
72
- // },
73
- // anchorList: [
74
- // {
75
- // href: parseUrl('https://localhost/data/1')!,
76
- // textContent: 'DATA-1',
77
- // },
78
- // {
79
- // href: parseUrl('https://localhost/data/2')!,
80
- // textContent: 'DATA-2',
81
- // },
82
- // {
83
- // href: parseUrl('https://localhost/data/3')!,
84
- // textContent: 'DATA-3',
85
- // },
86
- // {
87
- // href: parseUrl('https://localhost/lp')!,
88
- // textContent: 'Advertisement',
89
- // },
90
- // {
91
- // href: parseUrl('https://example.com/abc')!,
92
- // textContent: 'ABC',
93
- // },
94
- // {
95
- // href: parseUrl('https://example.com/xyz')!,
96
- // textContent: 'XYZ',
97
- // },
98
- // ],
99
- // imageList: [],
100
- // html: '',
101
- // isSkipped: false,
102
- // },
103
- // workingDir,
104
- // true,
105
- // );
106
-
107
- // await db.updatePage(
108
- // {
109
- // url: parseUrl('https://localhost/data/1')!,
110
- // redirectPaths: ['https://localhost/data/one'],
111
- // isExternal: false,
112
- // status: 200,
113
- // statusText: 'OK',
114
- // contentLength: 1000,
115
- // contentType: 'html/text',
116
- // responseHeaders: {},
117
- // meta: {
118
- // title: 'DATA ONE | LOCAL_SERVER',
119
- // },
120
- // anchorList: [
121
- // {
122
- // href: parseUrl('https://localhost/data/one')!,
123
- // textContent: 'DATA ONE',
124
- // },
125
- // {
126
- // href: parseUrl('https://localhost/data/two')!,
127
- // textContent: 'DATA TWO',
128
- // },
129
- // {
130
- // href: parseUrl('https://localhost/data/three')!,
131
- // textContent: 'DATA THREE',
132
- // },
133
- // {
134
- // href: parseUrl('https://localhost/lp')!,
135
- // textContent: 'Advertisement',
136
- // },
137
- // {
138
- // href: parseUrl('https://example.com/abc')!,
139
- // textContent: 'ABC',
140
- // },
141
- // {
142
- // href: parseUrl('https://example.com/xyz')!,
143
- // textContent: 'XYZ',
144
- // },
145
- // ],
146
- // imageList: [],
147
- // html: '',
148
- // isSkipped: false,
149
- // },
150
- // workingDir,
151
- // true,
152
- // );
153
-
154
- // await db.updatePage(
155
- // {
156
- // url: parseUrl('https://localhost/data/2')!,
157
- // redirectPaths: ['https://localhost/data/two'],
158
- // isExternal: false,
159
- // status: 200,
160
- // statusText: 'OK',
161
- // contentLength: 1000,
162
- // contentType: 'html/text',
163
- // responseHeaders: {},
164
- // meta: {
165
- // title: 'DATA TWO | LOCAL_SERVER',
166
- // },
167
- // anchorList: [
168
- // {
169
- // href: parseUrl('https://localhost/data/one')!,
170
- // textContent: 'DATA ONE',
171
- // },
172
- // {
173
- // href: parseUrl('https://localhost/data/two')!,
174
- // textContent: 'DATA TWO',
175
- // },
176
- // {
177
- // href: parseUrl('https://localhost/data/three')!,
178
- // textContent: 'DATA THREE',
179
- // },
180
- // {
181
- // href: parseUrl('https://localhost/lp')!,
182
- // textContent: 'Advertisement',
183
- // },
184
- // {
185
- // href: parseUrl('https://example.com/abc')!,
186
- // textContent: 'ABC',
187
- // },
188
- // {
189
- // href: parseUrl('https://example.com/xyz')!,
190
- // textContent: 'XYZ',
191
- // },
192
- // ],
193
- // imageList: [],
194
- // html: '',
195
- // isSkipped: false,
196
- // },
197
- // workingDir,
198
- // true,
199
- // );
200
-
201
- // await db.updatePage(
202
- // {
203
- // url: parseUrl('https://localhost/data/3')!,
204
- // redirectPaths: ['https://localhost/data/three'],
205
- // isExternal: false,
206
- // status: 200,
207
- // statusText: 'OK',
208
- // contentLength: 1000,
209
- // contentType: 'html/text',
210
- // responseHeaders: {},
211
- // meta: {
212
- // title: 'DATA THREE | LOCAL_SERVER',
213
- // },
214
- // anchorList: [
215
- // {
216
- // href: parseUrl('https://localhost/data/one')!,
217
- // textContent: 'DATA ONE',
218
- // },
219
- // {
220
- // href: parseUrl('https://localhost/data/two')!,
221
- // textContent: 'DATA TWO',
222
- // },
223
- // {
224
- // href: parseUrl('https://localhost/data/three')!,
225
- // textContent: 'DATA THREE',
226
- // },
227
- // {
228
- // href: parseUrl('https://localhost/lp')!,
229
- // textContent: 'Advertisement',
230
- // },
231
- // {
232
- // href: parseUrl('https://example.com/abc')!,
233
- // textContent: 'ABC',
234
- // },
235
- // {
236
- // href: parseUrl('https://example.com/xyz')!,
237
- // textContent: 'XYZ',
238
- // },
239
- // ],
240
- // imageList: [],
241
- // html: '',
242
- // isSkipped: false,
243
- // },
244
- // workingDir,
245
- // true,
246
- // );
247
-
248
- // await db.updatePage(
249
- // {
250
- // url: parseUrl('https://localhost/lp')!,
251
- // redirectPaths: [],
252
- // isExternal: false,
253
- // status: 200,
254
- // statusText: 'OK',
255
- // contentLength: 1000,
256
- // contentType: 'html/text',
257
- // responseHeaders: {},
258
- // meta: {
259
- // title: '[AD] THE EARTH IS BLUE',
260
- // },
261
- // anchorList: [
262
- // {
263
- // href: parseUrl('https://ec.localhost/buy?id=0123')!,
264
- // textContent: 'BUY',
265
- // },
266
- // ],
267
- // imageList: [],
268
- // html: '',
269
- // isSkipped: false,
270
- // },
271
- // workingDir,
272
- // true,
273
- // );
274
-
275
- // await db.updatePage(
276
- // {
277
- // url: parseUrl('https://example.com/abc')!,
278
- // redirectPaths: [],
279
- // isExternal: true,
280
- // status: 200,
281
- // statusText: 'OK',
282
- // contentLength: 1000,
283
- // contentType: 'html/text',
284
- // responseHeaders: {},
285
- // meta: {
286
- // title: 'ABC - example.com',
287
- // },
288
- // anchorList: [],
289
- // imageList: [],
290
- // html: '',
291
- // isSkipped: false,
292
- // },
293
- // workingDir,
294
- // true,
295
- // );
296
-
297
- // await db.updatePage(
298
- // {
299
- // url: parseUrl('https://example.com/xyz')!,
300
- // redirectPaths: ['https://example.com/404'],
301
- // isExternal: true,
302
- // status: 404,
303
- // statusText: 'Not Found',
304
- // contentLength: 1000,
305
- // contentType: 'html/text',
306
- // responseHeaders: {},
307
- // meta: {
308
- // title: '404 Not Found - example.com',
309
- // },
310
- // anchorList: [],
311
- // imageList: [],
312
- // html: '',
313
- // isSkipped: false,
314
- // },
315
- // workingDir,
316
- // true,
317
- // );
318
-
319
- // await db.updatePage(
320
- // {
321
- // url: parseUrl('https://ec.localhost/buy?id=0123')!,
322
- // redirectPaths: ['https://ec.localhost/items/0123/details'],
323
- // isExternal: true,
324
- // status: 200,
325
- // statusText: 'OK',
326
- // contentLength: 1000,
327
- // contentType: 'html/text',
328
- // responseHeaders: {},
329
- // meta: {
330
- // title: '[ID-0123] The tool of something | EC',
331
- // },
332
- // anchorList: [],
333
- // imageList: [],
334
- // html: '',
335
- // isSkipped: false,
336
- // },
337
- // workingDir,
338
- // true,
339
- // );
340
- // });
341
-
342
- it('get', async () => {
343
- const db = await Database.connect({
344
- type: 'sqlite3',
345
- workingDir,
346
- filename: path.resolve(workingDir, 'mock.sqlite'),
347
- });
348
-
349
- const { pages, redirects, anchors, referrers } = await db.getPagesWithRels(0, 2);
350
-
351
- expect(pages.map((p) => p.url)).toEqual([
352
- 'https://localhost/data/one',
353
- 'https://localhost/data/three',
354
- ]);
355
-
356
- expect(pages.map((p) => p.title)).toEqual([
357
- 'DATA ONE | LOCAL_SERVER',
358
- 'DATA THREE | LOCAL_SERVER',
359
- ]);
360
-
361
- expect(redirects).toEqual([
362
- {
363
- pageId: 9,
364
- from: 'https://localhost/data/1',
365
- fromId: 3,
366
- },
367
- {
368
- pageId: 11,
369
- from: 'https://localhost/data/3',
370
- fromId: 5,
371
- },
372
- ]);
373
-
374
- expect(
375
- anchors
376
- .filter((a) => a.pageId === 9)
377
- .map((a) => ({
378
- url: a.url,
379
- href: a.href,
380
- title: a.title,
381
- textContent: a.textContent,
382
- })),
383
- ).toEqual([
384
- {
385
- url: 'https://localhost/data/one',
386
- href: 'https://localhost/data/one',
387
- title: 'DATA ONE | LOCAL_SERVER',
388
- textContent: 'DATA ONE',
389
- },
390
- {
391
- url: 'https://localhost/data/two',
392
- href: 'https://localhost/data/two',
393
- title: 'DATA TWO | LOCAL_SERVER',
394
- textContent: 'DATA TWO',
395
- },
396
- {
397
- url: 'https://localhost/data/three',
398
- href: 'https://localhost/data/three',
399
- title: 'DATA THREE | LOCAL_SERVER',
400
- textContent: 'DATA THREE',
401
- },
402
- {
403
- url: 'https://localhost/lp',
404
- href: 'https://localhost/lp',
405
- title: '[AD] THE EARTH IS BLUE',
406
- textContent: 'Advertisement',
407
- },
408
- {
409
- url: 'https://example.com/abc',
410
- href: 'https://example.com/abc',
411
- title: 'ABC - example.com',
412
- textContent: 'ABC',
413
- },
414
- {
415
- url: 'https://example.com/404',
416
- href: 'https://example.com/xyz',
417
- title: '404 Not Found - example.com',
418
- textContent: 'XYZ',
419
- },
420
- ]);
421
-
422
- expect(referrers.filter((r) => r.pageId === 9)).toEqual([
423
- {
424
- pageId: 9,
425
- url: 'https://localhost/path/to',
426
- through: 'https://localhost/data/1',
427
- throughId: 3,
428
- hash: null,
429
- textContent: 'DATA-1',
430
- },
431
- {
432
- pageId: 9,
433
- url: 'https://localhost/data/one',
434
- through: 'https://localhost/data/one',
435
- throughId: 9,
436
- hash: null,
437
- textContent: 'DATA ONE',
438
- },
439
- {
440
- pageId: 9,
441
- url: 'https://localhost/data/two',
442
- through: 'https://localhost/data/one',
443
- throughId: 9,
444
- hash: null,
445
- textContent: 'DATA ONE',
446
- },
447
- {
448
- pageId: 9,
449
- url: 'https://localhost/data/three',
450
- through: 'https://localhost/data/one',
451
- throughId: 9,
452
- hash: null,
453
- textContent: 'DATA ONE',
454
- },
455
- ]);
456
- });
457
-
458
- it('getPageCount', async () => {
459
- const db = await Database.connect({
460
- type: 'sqlite3',
461
- workingDir,
462
- filename: path.resolve(workingDir, 'mock.sqlite'),
463
- });
464
-
465
- const count = await db.getPageCount();
466
-
467
- expect(count).toEqual(14);
468
- });
469
- });