@nitpicker/crawler 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299)
  1. package/CHANGELOG.md +8 -0
  2. package/LICENSE +191 -0
  3. package/README.md +13 -0
  4. package/lib/archive/archive-accessor.d.ts +107 -0
  5. package/lib/archive/archive-accessor.js +264 -0
  6. package/lib/archive/archive.d.ts +174 -0
  7. package/lib/archive/archive.js +331 -0
  8. package/lib/archive/database.d.ts +207 -0
  9. package/lib/archive/database.js +972 -0
  10. package/lib/archive/debug.d.ts +8 -0
  11. package/lib/archive/debug.js +9 -0
  12. package/lib/archive/filesystem/append-text.d.ts +9 -0
  13. package/lib/archive/filesystem/append-text.js +14 -0
  14. package/lib/archive/filesystem/copy-dir-sync.d.ts +6 -0
  15. package/lib/archive/filesystem/copy-dir-sync.js +9 -0
  16. package/lib/archive/filesystem/copy-dir.d.ts +7 -0
  17. package/lib/archive/filesystem/copy-dir.js +13 -0
  18. package/lib/archive/filesystem/exists.d.ts +6 -0
  19. package/lib/archive/filesystem/exists.js +9 -0
  20. package/lib/archive/filesystem/get-file-list.d.ts +8 -0
  21. package/lib/archive/filesystem/get-file-list.js +12 -0
  22. package/lib/archive/filesystem/index.d.ts +17 -0
  23. package/lib/archive/filesystem/index.js +17 -0
  24. package/lib/archive/filesystem/is-dir.d.ts +6 -0
  25. package/lib/archive/filesystem/is-dir.js +10 -0
  26. package/lib/archive/filesystem/mkdir.d.ts +8 -0
  27. package/lib/archive/filesystem/mkdir.js +15 -0
  28. package/lib/archive/filesystem/output-json.d.ts +9 -0
  29. package/lib/archive/filesystem/output-json.js +14 -0
  30. package/lib/archive/filesystem/output-text.d.ts +11 -0
  31. package/lib/archive/filesystem/output-text.js +32 -0
  32. package/lib/archive/filesystem/read-json.d.ts +7 -0
  33. package/lib/archive/filesystem/read-json.js +11 -0
  34. package/lib/archive/filesystem/read-text.d.ts +6 -0
  35. package/lib/archive/filesystem/read-text.js +10 -0
  36. package/lib/archive/filesystem/readline.d.ts +11 -0
  37. package/lib/archive/filesystem/readline.js +26 -0
  38. package/lib/archive/filesystem/remove.d.ts +5 -0
  39. package/lib/archive/filesystem/remove.js +10 -0
  40. package/lib/archive/filesystem/rename.d.ts +11 -0
  41. package/lib/archive/filesystem/rename.js +18 -0
  42. package/lib/archive/filesystem/tar.d.ts +11 -0
  43. package/lib/archive/filesystem/tar.js +22 -0
  44. package/lib/archive/filesystem/untar.d.ts +20 -0
  45. package/lib/archive/filesystem/untar.js +24 -0
  46. package/lib/archive/filesystem/utils.d.ts +109 -0
  47. package/lib/archive/filesystem/utils.js +185 -0
  48. package/lib/archive/filesystem/zip.d.ts +29 -0
  49. package/lib/archive/filesystem/zip.js +53 -0
  50. package/lib/archive/index.d.ts +6 -0
  51. package/lib/archive/index.js +11 -0
  52. package/lib/archive/page.d.ts +263 -0
  53. package/lib/archive/page.js +316 -0
  54. package/lib/archive/resource.d.ts +46 -0
  55. package/lib/archive/resource.js +62 -0
  56. package/lib/archive/safe-path.d.ts +9 -0
  57. package/lib/archive/safe-path.js +17 -0
  58. package/lib/archive/types.d.ts +210 -0
  59. package/lib/archive/types.js +1 -0
  60. package/lib/crawler/clear-destination-cache.d.ts +5 -0
  61. package/lib/crawler/clear-destination-cache.js +8 -0
  62. package/lib/crawler/crawler.d.ts +73 -0
  63. package/lib/crawler/crawler.js +748 -0
  64. package/lib/crawler/decompose-url.d.ts +25 -0
  65. package/lib/crawler/decompose-url.js +71 -0
  66. package/lib/crawler/destination-cache.d.ts +7 -0
  67. package/lib/crawler/destination-cache.js +6 -0
  68. package/lib/crawler/detect-pagination-pattern.d.ts +16 -0
  69. package/lib/crawler/detect-pagination-pattern.js +61 -0
  70. package/lib/crawler/fetch-destination.d.ts +38 -0
  71. package/lib/crawler/fetch-destination.js +208 -0
  72. package/lib/crawler/fetch-robots-txt.d.ts +42 -0
  73. package/lib/crawler/fetch-robots-txt.js +44 -0
  74. package/lib/crawler/find-best-matching-scope.d.ts +12 -0
  75. package/lib/crawler/find-best-matching-scope.js +46 -0
  76. package/lib/crawler/generate-predicted-urls.d.ts +13 -0
  77. package/lib/crawler/generate-predicted-urls.js +27 -0
  78. package/lib/crawler/handle-ignore-and-skip.d.ts +16 -0
  79. package/lib/crawler/handle-ignore-and-skip.js +19 -0
  80. package/lib/crawler/handle-resource-response.d.ts +13 -0
  81. package/lib/crawler/handle-resource-response.js +16 -0
  82. package/lib/crawler/handle-scrape-end.d.ts +24 -0
  83. package/lib/crawler/handle-scrape-end.js +82 -0
  84. package/lib/crawler/handle-scrape-error.d.ts +37 -0
  85. package/lib/crawler/handle-scrape-error.js +38 -0
  86. package/lib/crawler/index.d.ts +2 -0
  87. package/lib/crawler/index.js +2 -0
  88. package/lib/crawler/inject-scope-auth.d.ts +11 -0
  89. package/lib/crawler/inject-scope-auth.js +21 -0
  90. package/lib/crawler/is-external-url.d.ts +11 -0
  91. package/lib/crawler/is-external-url.js +12 -0
  92. package/lib/crawler/is-in-any-lower-layer.d.ts +13 -0
  93. package/lib/crawler/is-in-any-lower-layer.js +15 -0
  94. package/lib/crawler/link-list.d.ts +112 -0
  95. package/lib/crawler/link-list.js +248 -0
  96. package/lib/crawler/link-to-page-data.d.ts +14 -0
  97. package/lib/crawler/link-to-page-data.js +32 -0
  98. package/lib/crawler/net-timeout-error.d.ts +9 -0
  99. package/lib/crawler/net-timeout-error.js +11 -0
  100. package/lib/crawler/network.d.ts +30 -0
  101. package/lib/crawler/network.js +226 -0
  102. package/lib/crawler/protocol-agnostic-key.d.ts +9 -0
  103. package/lib/crawler/protocol-agnostic-key.js +11 -0
  104. package/lib/crawler/reconstruct-url.d.ts +10 -0
  105. package/lib/crawler/reconstruct-url.js +28 -0
  106. package/lib/crawler/result-handler.d.ts +118 -0
  107. package/lib/crawler/result-handler.js +153 -0
  108. package/lib/crawler/robots-checker.d.ts +26 -0
  109. package/lib/crawler/robots-checker.js +62 -0
  110. package/lib/crawler/should-discard-predicted.d.ts +14 -0
  111. package/lib/crawler/should-discard-predicted.js +31 -0
  112. package/lib/crawler/should-skip-url.d.ts +23 -0
  113. package/lib/crawler/should-skip-url.js +15 -0
  114. package/lib/crawler/speculative-pagination.d.ts +52 -0
  115. package/lib/crawler/speculative-pagination.js +215 -0
  116. package/lib/crawler/types.d.ts +119 -0
  117. package/lib/crawler/types.js +1 -0
  118. package/lib/crawler/url-filter.d.ts +56 -0
  119. package/lib/crawler/url-filter.js +110 -0
  120. package/lib/crawler-orchestrator.d.ts +142 -0
  121. package/lib/crawler-orchestrator.js +309 -0
  122. package/lib/debug.d.ts +8 -0
  123. package/lib/debug.js +9 -0
  124. package/lib/index.d.ts +16 -0
  125. package/lib/index.js +18 -0
  126. package/lib/qzilla.d.ts +136 -0
  127. package/lib/qzilla.js +292 -0
  128. package/lib/types.d.ts +27 -0
  129. package/lib/types.js +1 -0
  130. package/lib/utils/array/each-splitted.d.ts +10 -0
  131. package/lib/utils/array/each-splitted.js +14 -0
  132. package/lib/utils/array/index.d.ts +1 -0
  133. package/lib/utils/array/index.js +1 -0
  134. package/lib/utils/async/index.d.ts +1 -0
  135. package/lib/utils/async/index.js +1 -0
  136. package/lib/utils/debug.d.ts +5 -0
  137. package/lib/utils/debug.js +5 -0
  138. package/lib/utils/error/dom-evaluation-error.d.ts +7 -0
  139. package/lib/utils/error/dom-evaluation-error.js +7 -0
  140. package/lib/utils/error/error-emitter.d.ts +18 -0
  141. package/lib/utils/error/error-emitter.js +29 -0
  142. package/lib/utils/error/index.d.ts +3 -0
  143. package/lib/utils/error/index.js +2 -0
  144. package/lib/utils/event-emitter/index.d.ts +6 -0
  145. package/lib/utils/event-emitter/index.js +6 -0
  146. package/lib/utils/index.d.ts +5 -0
  147. package/lib/utils/index.js +5 -0
  148. package/lib/utils/network/index.d.ts +1 -0
  149. package/lib/utils/network/index.js +1 -0
  150. package/lib/utils/object/clean-object.d.ts +8 -0
  151. package/lib/utils/object/clean-object.js +13 -0
  152. package/lib/utils/object/index.d.ts +1 -0
  153. package/lib/utils/object/index.js +1 -0
  154. package/lib/utils/path/index.d.ts +1 -0
  155. package/lib/utils/path/index.js +1 -0
  156. package/lib/utils/path/safe-filepath.d.ts +7 -0
  157. package/lib/utils/path/safe-filepath.js +12 -0
  158. package/lib/utils/regexp/index.d.ts +1 -0
  159. package/lib/utils/regexp/index.js +1 -0
  160. package/lib/utils/retryable/index.d.ts +2 -0
  161. package/lib/utils/retryable/index.js +1 -0
  162. package/lib/utils/sort/index.d.ts +14 -0
  163. package/lib/utils/sort/index.js +61 -0
  164. package/lib/utils/sort/remove-matches.d.ts +9 -0
  165. package/lib/utils/sort/remove-matches.js +23 -0
  166. package/lib/utils/types/index.d.ts +1 -0
  167. package/lib/utils/types/index.js +1 -0
  168. package/lib/utils/types/types.d.ts +46 -0
  169. package/lib/utils/types/types.js +1 -0
  170. package/lib/utils/url/index.d.ts +5 -0
  171. package/lib/utils/url/index.js +5 -0
  172. package/lib/utils/url/is-lower-layer.d.ts +15 -0
  173. package/lib/utils/url/is-lower-layer.js +55 -0
  174. package/lib/utils/url/parse-url.d.ts +11 -0
  175. package/lib/utils/url/parse-url.js +20 -0
  176. package/lib/utils/url/path-match.d.ts +11 -0
  177. package/lib/utils/url/path-match.js +18 -0
  178. package/lib/utils/url/sort-url.d.ts +10 -0
  179. package/lib/utils/url/sort-url.js +24 -0
  180. package/lib/utils/url/url-partial-match.d.ts +11 -0
  181. package/lib/utils/url/url-partial-match.js +32 -0
  182. package/package.json +49 -0
  183. package/src/archive/__mock__/.gitignore +3 -0
  184. package/src/archive/__mock__/mock.sqlite +0 -0
  185. package/src/archive/archive-accessor.ts +337 -0
  186. package/src/archive/archive.ts +408 -0
  187. package/src/archive/database.spec.ts +469 -0
  188. package/src/archive/database.ts +1059 -0
  189. package/src/archive/debug.ts +10 -0
  190. package/src/archive/filesystem/append-text.spec.ts +26 -0
  191. package/src/archive/filesystem/append-text.ts +16 -0
  192. package/src/archive/filesystem/copy-dir-sync.spec.ts +27 -0
  193. package/src/archive/filesystem/copy-dir-sync.ts +10 -0
  194. package/src/archive/filesystem/copy-dir.spec.ts +33 -0
  195. package/src/archive/filesystem/copy-dir.ts +14 -0
  196. package/src/archive/filesystem/exists.spec.ts +33 -0
  197. package/src/archive/filesystem/exists.ts +10 -0
  198. package/src/archive/filesystem/get-file-list.spec.ts +37 -0
  199. package/src/archive/filesystem/get-file-list.ts +13 -0
  200. package/src/archive/filesystem/index.ts +17 -0
  201. package/src/archive/filesystem/is-dir.spec.ts +29 -0
  202. package/src/archive/filesystem/is-dir.ts +11 -0
  203. package/src/archive/filesystem/mkdir.spec.ts +37 -0
  204. package/src/archive/filesystem/mkdir.ts +16 -0
  205. package/src/archive/filesystem/output-json.spec.ts +34 -0
  206. package/src/archive/filesystem/output-json.ts +16 -0
  207. package/src/archive/filesystem/output-text.spec.ts +31 -0
  208. package/src/archive/filesystem/output-text.ts +35 -0
  209. package/src/archive/filesystem/read-json.spec.ts +26 -0
  210. package/src/archive/filesystem/read-json.ts +12 -0
  211. package/src/archive/filesystem/read-text.spec.ts +25 -0
  212. package/src/archive/filesystem/read-text.ts +11 -0
  213. package/src/archive/filesystem/readline.spec.ts +29 -0
  214. package/src/archive/filesystem/readline.ts +30 -0
  215. package/src/archive/filesystem/remove.spec.ts +34 -0
  216. package/src/archive/filesystem/remove.ts +11 -0
  217. package/src/archive/filesystem/rename.spec.ts +46 -0
  218. package/src/archive/filesystem/rename.ts +21 -0
  219. package/src/archive/filesystem/tar.spec.ts +33 -0
  220. package/src/archive/filesystem/tar.ts +27 -0
  221. package/src/archive/filesystem/untar.spec.ts +34 -0
  222. package/src/archive/filesystem/untar.ts +36 -0
  223. package/src/archive/index.ts +13 -0
  224. package/src/archive/page.spec.ts +368 -0
  225. package/src/archive/page.ts +420 -0
  226. package/src/archive/resource.spec.ts +101 -0
  227. package/src/archive/resource.ts +73 -0
  228. package/src/archive/safe-path.spec.ts +44 -0
  229. package/src/archive/safe-path.ts +18 -0
  230. package/src/archive/types.ts +227 -0
  231. package/src/crawler/clear-destination-cache.spec.ts +20 -0
  232. package/src/crawler/clear-destination-cache.ts +9 -0
  233. package/src/crawler/crawler.ts +873 -0
  234. package/src/crawler/decompose-url.spec.ts +48 -0
  235. package/src/crawler/decompose-url.ts +90 -0
  236. package/src/crawler/destination-cache.spec.ts +23 -0
  237. package/src/crawler/destination-cache.ts +8 -0
  238. package/src/crawler/detect-pagination-pattern.spec.ts +169 -0
  239. package/src/crawler/detect-pagination-pattern.ts +66 -0
  240. package/src/crawler/fetch-destination.ts +257 -0
  241. package/src/crawler/fetch-robots-txt.spec.ts +83 -0
  242. package/src/crawler/fetch-robots-txt.ts +91 -0
  243. package/src/crawler/find-best-matching-scope.spec.ts +39 -0
  244. package/src/crawler/find-best-matching-scope.ts +57 -0
  245. package/src/crawler/generate-predicted-urls.spec.ts +42 -0
  246. package/src/crawler/generate-predicted-urls.ts +34 -0
  247. package/src/crawler/handle-ignore-and-skip.spec.ts +66 -0
  248. package/src/crawler/handle-ignore-and-skip.ts +30 -0
  249. package/src/crawler/handle-resource-response.spec.ts +45 -0
  250. package/src/crawler/handle-resource-response.ts +21 -0
  251. package/src/crawler/handle-scrape-end.spec.ts +109 -0
  252. package/src/crawler/handle-scrape-end.ts +115 -0
  253. package/src/crawler/handle-scrape-error.spec.ts +105 -0
  254. package/src/crawler/handle-scrape-error.ts +58 -0
  255. package/src/crawler/index.ts +2 -0
  256. package/src/crawler/inject-scope-auth.spec.ts +36 -0
  257. package/src/crawler/inject-scope-auth.ts +27 -0
  258. package/src/crawler/is-external-url.spec.ts +31 -0
  259. package/src/crawler/is-external-url.ts +17 -0
  260. package/src/crawler/is-in-any-lower-layer.spec.ts +31 -0
  261. package/src/crawler/is-in-any-lower-layer.ts +22 -0
  262. package/src/crawler/link-list.spec.ts +355 -0
  263. package/src/crawler/link-list.ts +275 -0
  264. package/src/crawler/link-to-page-data.spec.ts +133 -0
  265. package/src/crawler/link-to-page-data.ts +34 -0
  266. package/src/crawler/net-timeout-error.spec.ts +25 -0
  267. package/src/crawler/net-timeout-error.ts +11 -0
  268. package/src/crawler/protocol-agnostic-key.spec.ts +40 -0
  269. package/src/crawler/protocol-agnostic-key.ts +11 -0
  270. package/src/crawler/reconstruct-url.spec.ts +37 -0
  271. package/src/crawler/reconstruct-url.ts +37 -0
  272. package/src/crawler/robots-checker.spec.ts +104 -0
  273. package/src/crawler/robots-checker.ts +73 -0
  274. package/src/crawler/should-discard-predicted.spec.ts +125 -0
  275. package/src/crawler/should-discard-predicted.ts +33 -0
  276. package/src/crawler/should-skip-url.spec.ts +77 -0
  277. package/src/crawler/should-skip-url.ts +37 -0
  278. package/src/crawler/types.ts +146 -0
  279. package/src/crawler-orchestrator.ts +401 -0
  280. package/src/debug.ts +10 -0
  281. package/src/index.ts +25 -0
  282. package/src/types.ts +30 -0
  283. package/src/utils/array/each-splitted.spec.ts +38 -0
  284. package/src/utils/array/each-splitted.ts +19 -0
  285. package/src/utils/array/index.ts +1 -0
  286. package/src/utils/debug.ts +6 -0
  287. package/src/utils/error/dom-evaluation-error.spec.ts +20 -0
  288. package/src/utils/error/dom-evaluation-error.ts +6 -0
  289. package/src/utils/error/error-emitter.spec.ts +78 -0
  290. package/src/utils/error/error-emitter.ts +44 -0
  291. package/src/utils/error/index.ts +3 -0
  292. package/src/utils/index.ts +5 -0
  293. package/src/utils/object/clean-object.spec.ts +24 -0
  294. package/src/utils/object/clean-object.ts +13 -0
  295. package/src/utils/object/index.ts +1 -0
  296. package/src/utils/types/index.ts +1 -0
  297. package/src/utils/types/types.ts +65 -0
  298. package/tsconfig.json +11 -0
  299. package/tsconfig.tsbuildinfo +1 -0
@@ -0,0 +1,469 @@
1
+ import path from 'node:path';
2
+ import { fileURLToPath } from 'node:url';
3
+
4
+ import { tryParseUrl as parseUrl } from '@d-zero/shared/parse-url';
5
+ import { afterAll, describe, expect, it } from 'vitest';
6
+
7
+ import { Database } from './database.js';
8
+ import { remove } from './filesystem/index.js';
9
+
10
+ // Fixture directory (`__mock__`) next to this spec. fileURLToPath, unlike URL#pathname, decodes percent-escapes and yields valid Windows paths.
11
+ const workingDir = path.resolve(path.dirname(fileURLToPath(import.meta.url)), '__mock__');
12
+
13
+ // After every test in this file has run, delete tmp.sqlite, the scratch database file used by the 'insert' test.
14
+ const removeScratchDatabase = async () => { await remove(path.resolve(workingDir, 'tmp.sqlite')); };
15
+ afterAll(removeScratchDatabase);
16
+
17
+ describe('Pages', () => {
18
+ // Writes one page record to a scratch database and verifies that it can be listed again.
19
+ it('insert', async () => {
20
+   // Scratch database file inside the fixture directory.
21
+   const filename = path.resolve(workingDir, 'tmp.sqlite');
22
+   const db = await Database.connect({ type: 'sqlite3', workingDir, filename });
23
+
24
+   // NOTE(review): the meaning of the trailing `workingDir` and `true` arguments is defined by Database.updatePage — confirm there.
25
+   await db.updatePage(
26
+     {
27
+       url: parseUrl('http://localhost/path/to')!,
28
+       redirectPaths: [],
29
+       isExternal: false,
30
+       status: 200,
31
+       statusText: 'OK',
32
+       contentLength: 1000,
33
+       contentType: 'html/text',
34
+       responseHeaders: {},
35
+       meta: { title: 'LOCAL_SERVER' },
36
+       anchorList: [],
37
+       imageList: [],
38
+       html: '',
39
+       isSkipped: false,
40
+     },
41
+     workingDir,
42
+     true,
43
+   );
44
+
45
+   // Expect exactly one page — assumes tmp.sqlite did not exist before this run.
46
+   const storedPages = await db.getPages();
47
+   const { length: storedPageCount } = storedPages;
48
+
49
+   expect(storedPageCount).toBe(1);
50
+ });
51
+
52
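+ // Generates mock.sqlite, the fixture read by the tests below. Kept commented out; presumably re-enabled by hand only when the fixture must be rebuilt.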
53
+ // it.skip('insert 2', async () => {
54
+ // const db = await Database.connect({
55
+ // type: 'sqlite3',
56
+ // workingDir,
57
+ // filename: path.resolve(workingDir, 'mock.sqlite'),
58
+ // });
59
+
60
+ // await db.updatePage(
61
+ // {
62
+ // url: parseUrl('http://localhost/path/to')!,
63
+ // redirectPaths: ['https://localhost/path/to'],
64
+ // isExternal: false,
65
+ // status: 200,
66
+ // statusText: 'OK',
67
+ // contentLength: 1000,
68
+ // contentType: 'html/text',
69
+ // responseHeaders: {},
70
+ // meta: {
71
+ // title: 'LOCAL_SERVER',
72
+ // },
73
+ // anchorList: [
74
+ // {
75
+ // href: parseUrl('https://localhost/data/1')!,
76
+ // textContent: 'DATA-1',
77
+ // },
78
+ // {
79
+ // href: parseUrl('https://localhost/data/2')!,
80
+ // textContent: 'DATA-2',
81
+ // },
82
+ // {
83
+ // href: parseUrl('https://localhost/data/3')!,
84
+ // textContent: 'DATA-3',
85
+ // },
86
+ // {
87
+ // href: parseUrl('https://localhost/lp')!,
88
+ // textContent: 'Advertisement',
89
+ // },
90
+ // {
91
+ // href: parseUrl('https://example.com/abc')!,
92
+ // textContent: 'ABC',
93
+ // },
94
+ // {
95
+ // href: parseUrl('https://example.com/xyz')!,
96
+ // textContent: 'XYZ',
97
+ // },
98
+ // ],
99
+ // imageList: [],
100
+ // html: '',
101
+ // isSkipped: false,
102
+ // },
103
+ // workingDir,
104
+ // true,
105
+ // );
106
+
107
+ // await db.updatePage(
108
+ // {
109
+ // url: parseUrl('https://localhost/data/1')!,
110
+ // redirectPaths: ['https://localhost/data/one'],
111
+ // isExternal: false,
112
+ // status: 200,
113
+ // statusText: 'OK',
114
+ // contentLength: 1000,
115
+ // contentType: 'html/text',
116
+ // responseHeaders: {},
117
+ // meta: {
118
+ // title: 'DATA ONE | LOCAL_SERVER',
119
+ // },
120
+ // anchorList: [
121
+ // {
122
+ // href: parseUrl('https://localhost/data/one')!,
123
+ // textContent: 'DATA ONE',
124
+ // },
125
+ // {
126
+ // href: parseUrl('https://localhost/data/two')!,
127
+ // textContent: 'DATA TWO',
128
+ // },
129
+ // {
130
+ // href: parseUrl('https://localhost/data/three')!,
131
+ // textContent: 'DATA THREE',
132
+ // },
133
+ // {
134
+ // href: parseUrl('https://localhost/lp')!,
135
+ // textContent: 'Advertisement',
136
+ // },
137
+ // {
138
+ // href: parseUrl('https://example.com/abc')!,
139
+ // textContent: 'ABC',
140
+ // },
141
+ // {
142
+ // href: parseUrl('https://example.com/xyz')!,
143
+ // textContent: 'XYZ',
144
+ // },
145
+ // ],
146
+ // imageList: [],
147
+ // html: '',
148
+ // isSkipped: false,
149
+ // },
150
+ // workingDir,
151
+ // true,
152
+ // );
153
+
154
+ // await db.updatePage(
155
+ // {
156
+ // url: parseUrl('https://localhost/data/2')!,
157
+ // redirectPaths: ['https://localhost/data/two'],
158
+ // isExternal: false,
159
+ // status: 200,
160
+ // statusText: 'OK',
161
+ // contentLength: 1000,
162
+ // contentType: 'html/text',
163
+ // responseHeaders: {},
164
+ // meta: {
165
+ // title: 'DATA TWO | LOCAL_SERVER',
166
+ // },
167
+ // anchorList: [
168
+ // {
169
+ // href: parseUrl('https://localhost/data/one')!,
170
+ // textContent: 'DATA ONE',
171
+ // },
172
+ // {
173
+ // href: parseUrl('https://localhost/data/two')!,
174
+ // textContent: 'DATA TWO',
175
+ // },
176
+ // {
177
+ // href: parseUrl('https://localhost/data/three')!,
178
+ // textContent: 'DATA THREE',
179
+ // },
180
+ // {
181
+ // href: parseUrl('https://localhost/lp')!,
182
+ // textContent: 'Advertisement',
183
+ // },
184
+ // {
185
+ // href: parseUrl('https://example.com/abc')!,
186
+ // textContent: 'ABC',
187
+ // },
188
+ // {
189
+ // href: parseUrl('https://example.com/xyz')!,
190
+ // textContent: 'XYZ',
191
+ // },
192
+ // ],
193
+ // imageList: [],
194
+ // html: '',
195
+ // isSkipped: false,
196
+ // },
197
+ // workingDir,
198
+ // true,
199
+ // );
200
+
201
+ // await db.updatePage(
202
+ // {
203
+ // url: parseUrl('https://localhost/data/3')!,
204
+ // redirectPaths: ['https://localhost/data/three'],
205
+ // isExternal: false,
206
+ // status: 200,
207
+ // statusText: 'OK',
208
+ // contentLength: 1000,
209
+ // contentType: 'html/text',
210
+ // responseHeaders: {},
211
+ // meta: {
212
+ // title: 'DATA THREE | LOCAL_SERVER',
213
+ // },
214
+ // anchorList: [
215
+ // {
216
+ // href: parseUrl('https://localhost/data/one')!,
217
+ // textContent: 'DATA ONE',
218
+ // },
219
+ // {
220
+ // href: parseUrl('https://localhost/data/two')!,
221
+ // textContent: 'DATA TWO',
222
+ // },
223
+ // {
224
+ // href: parseUrl('https://localhost/data/three')!,
225
+ // textContent: 'DATA THREE',
226
+ // },
227
+ // {
228
+ // href: parseUrl('https://localhost/lp')!,
229
+ // textContent: 'Advertisement',
230
+ // },
231
+ // {
232
+ // href: parseUrl('https://example.com/abc')!,
233
+ // textContent: 'ABC',
234
+ // },
235
+ // {
236
+ // href: parseUrl('https://example.com/xyz')!,
237
+ // textContent: 'XYZ',
238
+ // },
239
+ // ],
240
+ // imageList: [],
241
+ // html: '',
242
+ // isSkipped: false,
243
+ // },
244
+ // workingDir,
245
+ // true,
246
+ // );
247
+
248
+ // await db.updatePage(
249
+ // {
250
+ // url: parseUrl('https://localhost/lp')!,
251
+ // redirectPaths: [],
252
+ // isExternal: false,
253
+ // status: 200,
254
+ // statusText: 'OK',
255
+ // contentLength: 1000,
256
+ // contentType: 'html/text',
257
+ // responseHeaders: {},
258
+ // meta: {
259
+ // title: '[AD] THE EARTH IS BLUE',
260
+ // },
261
+ // anchorList: [
262
+ // {
263
+ // href: parseUrl('https://ec.localhost/buy?id=0123')!,
264
+ // textContent: 'BUY',
265
+ // },
266
+ // ],
267
+ // imageList: [],
268
+ // html: '',
269
+ // isSkipped: false,
270
+ // },
271
+ // workingDir,
272
+ // true,
273
+ // );
274
+
275
+ // await db.updatePage(
276
+ // {
277
+ // url: parseUrl('https://example.com/abc')!,
278
+ // redirectPaths: [],
279
+ // isExternal: true,
280
+ // status: 200,
281
+ // statusText: 'OK',
282
+ // contentLength: 1000,
283
+ // contentType: 'html/text',
284
+ // responseHeaders: {},
285
+ // meta: {
286
+ // title: 'ABC - example.com',
287
+ // },
288
+ // anchorList: [],
289
+ // imageList: [],
290
+ // html: '',
291
+ // isSkipped: false,
292
+ // },
293
+ // workingDir,
294
+ // true,
295
+ // );
296
+
297
+ // await db.updatePage(
298
+ // {
299
+ // url: parseUrl('https://example.com/xyz')!,
300
+ // redirectPaths: ['https://example.com/404'],
301
+ // isExternal: true,
302
+ // status: 404,
303
+ // statusText: 'Not Found',
304
+ // contentLength: 1000,
305
+ // contentType: 'html/text',
306
+ // responseHeaders: {},
307
+ // meta: {
308
+ // title: '404 Not Found - example.com',
309
+ // },
310
+ // anchorList: [],
311
+ // imageList: [],
312
+ // html: '',
313
+ // isSkipped: false,
314
+ // },
315
+ // workingDir,
316
+ // true,
317
+ // );
318
+
319
+ // await db.updatePage(
320
+ // {
321
+ // url: parseUrl('https://ec.localhost/buy?id=0123')!,
322
+ // redirectPaths: ['https://ec.localhost/items/0123/details'],
323
+ // isExternal: true,
324
+ // status: 200,
325
+ // statusText: 'OK',
326
+ // contentLength: 1000,
327
+ // contentType: 'html/text',
328
+ // responseHeaders: {},
329
+ // meta: {
330
+ // title: '[ID-0123] The tool of something | EC',
331
+ // },
332
+ // anchorList: [],
333
+ // imageList: [],
334
+ // html: '',
335
+ // isSkipped: false,
336
+ // },
337
+ // workingDir,
338
+ // true,
339
+ // );
340
+ // });
341
+
342
+ // Checks the pages, redirects, anchors and referrers returned by getPagesWithRels against the mock.sqlite fixture.
343
+ it('get', async () => {
344
+   const filename = path.resolve(workingDir, 'mock.sqlite');
345
+   const db = await Database.connect({ type: 'sqlite3', workingDir, filename });
346
+
347
+   // NOTE(review): (0, 2) presumably means offset 0 / limit 2, since two pages are expected below — confirm against Database.getPagesWithRels.
348
+   const rels = await db.getPagesWithRels(0, 2);
349
+   const { pages, redirects, anchors, referrers } = rels;
350
+
351
+   const pageUrls = pages.map((page) => page.url);
352
+   expect(pageUrls).toEqual([
353
+     'https://localhost/data/one',
354
+     'https://localhost/data/three',
355
+   ]);
356
+
357
+   const pageTitles = pages.map((page) => page.title);
358
+   expect(pageTitles).toEqual([
359
+     'DATA ONE | LOCAL_SERVER',
360
+     'DATA THREE | LOCAL_SERVER',
361
+   ]);
362
+
363
+   // Redirect rows expected from the fixture (presumably one per returned page — verify against the fixture data).
364
+   expect(redirects).toEqual([
365
+     {
366
+       pageId: 9,
367
+       from: 'https://localhost/data/1',
368
+       fromId: 3,
369
+     },
370
+     {
371
+       pageId: 11,
372
+       from: 'https://localhost/data/3',
373
+       fromId: 5,
374
+     },
375
+   ]);
376
+
377
+   // Only the anchors whose pageId is 9, reduced to the four fields under test.
378
+   const anchorsOfPage9 = anchors
379
+     .filter((anchor) => anchor.pageId === 9)
380
+     .map(({ url, href, title, textContent }) => ({ url, href, title, textContent }));
381
+   expect(anchorsOfPage9).toEqual([
382
+     {
383
+       url: 'https://localhost/data/one',
384
+       href: 'https://localhost/data/one',
385
+       title: 'DATA ONE | LOCAL_SERVER',
386
+       textContent: 'DATA ONE',
387
+     },
388
+     {
389
+       url: 'https://localhost/data/two',
390
+       href: 'https://localhost/data/two',
391
+       title: 'DATA TWO | LOCAL_SERVER',
392
+       textContent: 'DATA TWO',
393
+     },
394
+     {
395
+       url: 'https://localhost/data/three',
396
+       href: 'https://localhost/data/three',
397
+       title: 'DATA THREE | LOCAL_SERVER',
398
+       textContent: 'DATA THREE',
399
+     },
400
+     {
401
+       url: 'https://localhost/lp',
402
+       href: 'https://localhost/lp',
403
+       title: '[AD] THE EARTH IS BLUE',
404
+       textContent: 'Advertisement',
405
+     },
406
+     {
407
+       url: 'https://example.com/abc',
408
+       href: 'https://example.com/abc',
409
+       title: 'ABC - example.com',
410
+       textContent: 'ABC',
411
+     },
412
+     {
413
+       url: 'https://example.com/404',
414
+       href: 'https://example.com/xyz',
415
+       title: '404 Not Found - example.com',
416
+       textContent: 'XYZ',
417
+     },
418
+   ]);
419
+
420
+   // Only the referrer rows whose pageId is 9, compared in full.
421
+   const referrersOfPage9 = referrers.filter((referrer) => referrer.pageId === 9);
422
+   expect(referrersOfPage9).toEqual([
423
+     {
424
+       pageId: 9,
425
+       url: 'https://localhost/path/to',
426
+       through: 'https://localhost/data/1',
427
+       throughId: 3,
428
+       hash: null,
429
+       textContent: 'DATA-1',
430
+     },
431
+     {
432
+       pageId: 9,
433
+       url: 'https://localhost/data/one',
434
+       through: 'https://localhost/data/one',
435
+       throughId: 9,
436
+       hash: null,
437
+       textContent: 'DATA ONE',
438
+     },
439
+     {
440
+       pageId: 9,
441
+       url: 'https://localhost/data/two',
442
+       through: 'https://localhost/data/one',
443
+       throughId: 9,
444
+       hash: null,
445
+       textContent: 'DATA ONE',
446
+     },
447
+     {
448
+       pageId: 9,
449
+       url: 'https://localhost/data/three',
450
+       through: 'https://localhost/data/one',
451
+       throughId: 9,
452
+       hash: null,
453
+       textContent: 'DATA ONE',
454
+     },
455
+   ]);
456
+ });
457
+
458
+ // Checks getPageCount() against the mock.sqlite fixture.
459
+ it('getPageCount', async () => {
460
+   const filename = path.resolve(workingDir, 'mock.sqlite');
461
+   const db = await Database.connect({ type: 'sqlite3', workingDir, filename });
462
+
463
+   // 14 is the count this test expects from the mock.sqlite fixture; presumably it must be updated if the fixture is regenerated.
464
+   const expectedPageCount = 14;
465
+   const actualPageCount = await db.getPageCount();
466
+
467
+   expect(actualPageCount).toEqual(expectedPageCount);
468
+ });
469
+ });