@nitpicker/crawler 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299) hide show
  1. package/CHANGELOG.md +8 -0
  2. package/LICENSE +191 -0
  3. package/README.md +13 -0
  4. package/lib/archive/archive-accessor.d.ts +107 -0
  5. package/lib/archive/archive-accessor.js +264 -0
  6. package/lib/archive/archive.d.ts +174 -0
  7. package/lib/archive/archive.js +331 -0
  8. package/lib/archive/database.d.ts +207 -0
  9. package/lib/archive/database.js +972 -0
  10. package/lib/archive/debug.d.ts +8 -0
  11. package/lib/archive/debug.js +9 -0
  12. package/lib/archive/filesystem/append-text.d.ts +9 -0
  13. package/lib/archive/filesystem/append-text.js +14 -0
  14. package/lib/archive/filesystem/copy-dir-sync.d.ts +6 -0
  15. package/lib/archive/filesystem/copy-dir-sync.js +9 -0
  16. package/lib/archive/filesystem/copy-dir.d.ts +7 -0
  17. package/lib/archive/filesystem/copy-dir.js +13 -0
  18. package/lib/archive/filesystem/exists.d.ts +6 -0
  19. package/lib/archive/filesystem/exists.js +9 -0
  20. package/lib/archive/filesystem/get-file-list.d.ts +8 -0
  21. package/lib/archive/filesystem/get-file-list.js +12 -0
  22. package/lib/archive/filesystem/index.d.ts +17 -0
  23. package/lib/archive/filesystem/index.js +17 -0
  24. package/lib/archive/filesystem/is-dir.d.ts +6 -0
  25. package/lib/archive/filesystem/is-dir.js +10 -0
  26. package/lib/archive/filesystem/mkdir.d.ts +8 -0
  27. package/lib/archive/filesystem/mkdir.js +15 -0
  28. package/lib/archive/filesystem/output-json.d.ts +9 -0
  29. package/lib/archive/filesystem/output-json.js +14 -0
  30. package/lib/archive/filesystem/output-text.d.ts +11 -0
  31. package/lib/archive/filesystem/output-text.js +32 -0
  32. package/lib/archive/filesystem/read-json.d.ts +7 -0
  33. package/lib/archive/filesystem/read-json.js +11 -0
  34. package/lib/archive/filesystem/read-text.d.ts +6 -0
  35. package/lib/archive/filesystem/read-text.js +10 -0
  36. package/lib/archive/filesystem/readline.d.ts +11 -0
  37. package/lib/archive/filesystem/readline.js +26 -0
  38. package/lib/archive/filesystem/remove.d.ts +5 -0
  39. package/lib/archive/filesystem/remove.js +10 -0
  40. package/lib/archive/filesystem/rename.d.ts +11 -0
  41. package/lib/archive/filesystem/rename.js +18 -0
  42. package/lib/archive/filesystem/tar.d.ts +11 -0
  43. package/lib/archive/filesystem/tar.js +22 -0
  44. package/lib/archive/filesystem/untar.d.ts +20 -0
  45. package/lib/archive/filesystem/untar.js +24 -0
  46. package/lib/archive/filesystem/utils.d.ts +109 -0
  47. package/lib/archive/filesystem/utils.js +185 -0
  48. package/lib/archive/filesystem/zip.d.ts +29 -0
  49. package/lib/archive/filesystem/zip.js +53 -0
  50. package/lib/archive/index.d.ts +6 -0
  51. package/lib/archive/index.js +11 -0
  52. package/lib/archive/page.d.ts +263 -0
  53. package/lib/archive/page.js +316 -0
  54. package/lib/archive/resource.d.ts +46 -0
  55. package/lib/archive/resource.js +62 -0
  56. package/lib/archive/safe-path.d.ts +9 -0
  57. package/lib/archive/safe-path.js +17 -0
  58. package/lib/archive/types.d.ts +210 -0
  59. package/lib/archive/types.js +1 -0
  60. package/lib/crawler/clear-destination-cache.d.ts +5 -0
  61. package/lib/crawler/clear-destination-cache.js +8 -0
  62. package/lib/crawler/crawler.d.ts +73 -0
  63. package/lib/crawler/crawler.js +748 -0
  64. package/lib/crawler/decompose-url.d.ts +25 -0
  65. package/lib/crawler/decompose-url.js +71 -0
  66. package/lib/crawler/destination-cache.d.ts +7 -0
  67. package/lib/crawler/destination-cache.js +6 -0
  68. package/lib/crawler/detect-pagination-pattern.d.ts +16 -0
  69. package/lib/crawler/detect-pagination-pattern.js +61 -0
  70. package/lib/crawler/fetch-destination.d.ts +38 -0
  71. package/lib/crawler/fetch-destination.js +208 -0
  72. package/lib/crawler/fetch-robots-txt.d.ts +42 -0
  73. package/lib/crawler/fetch-robots-txt.js +44 -0
  74. package/lib/crawler/find-best-matching-scope.d.ts +12 -0
  75. package/lib/crawler/find-best-matching-scope.js +46 -0
  76. package/lib/crawler/generate-predicted-urls.d.ts +13 -0
  77. package/lib/crawler/generate-predicted-urls.js +27 -0
  78. package/lib/crawler/handle-ignore-and-skip.d.ts +16 -0
  79. package/lib/crawler/handle-ignore-and-skip.js +19 -0
  80. package/lib/crawler/handle-resource-response.d.ts +13 -0
  81. package/lib/crawler/handle-resource-response.js +16 -0
  82. package/lib/crawler/handle-scrape-end.d.ts +24 -0
  83. package/lib/crawler/handle-scrape-end.js +82 -0
  84. package/lib/crawler/handle-scrape-error.d.ts +37 -0
  85. package/lib/crawler/handle-scrape-error.js +38 -0
  86. package/lib/crawler/index.d.ts +2 -0
  87. package/lib/crawler/index.js +2 -0
  88. package/lib/crawler/inject-scope-auth.d.ts +11 -0
  89. package/lib/crawler/inject-scope-auth.js +21 -0
  90. package/lib/crawler/is-external-url.d.ts +11 -0
  91. package/lib/crawler/is-external-url.js +12 -0
  92. package/lib/crawler/is-in-any-lower-layer.d.ts +13 -0
  93. package/lib/crawler/is-in-any-lower-layer.js +15 -0
  94. package/lib/crawler/link-list.d.ts +112 -0
  95. package/lib/crawler/link-list.js +248 -0
  96. package/lib/crawler/link-to-page-data.d.ts +14 -0
  97. package/lib/crawler/link-to-page-data.js +32 -0
  98. package/lib/crawler/net-timeout-error.d.ts +9 -0
  99. package/lib/crawler/net-timeout-error.js +11 -0
  100. package/lib/crawler/network.d.ts +30 -0
  101. package/lib/crawler/network.js +226 -0
  102. package/lib/crawler/protocol-agnostic-key.d.ts +9 -0
  103. package/lib/crawler/protocol-agnostic-key.js +11 -0
  104. package/lib/crawler/reconstruct-url.d.ts +10 -0
  105. package/lib/crawler/reconstruct-url.js +28 -0
  106. package/lib/crawler/result-handler.d.ts +118 -0
  107. package/lib/crawler/result-handler.js +153 -0
  108. package/lib/crawler/robots-checker.d.ts +26 -0
  109. package/lib/crawler/robots-checker.js +62 -0
  110. package/lib/crawler/should-discard-predicted.d.ts +14 -0
  111. package/lib/crawler/should-discard-predicted.js +31 -0
  112. package/lib/crawler/should-skip-url.d.ts +23 -0
  113. package/lib/crawler/should-skip-url.js +15 -0
  114. package/lib/crawler/speculative-pagination.d.ts +52 -0
  115. package/lib/crawler/speculative-pagination.js +215 -0
  116. package/lib/crawler/types.d.ts +119 -0
  117. package/lib/crawler/types.js +1 -0
  118. package/lib/crawler/url-filter.d.ts +56 -0
  119. package/lib/crawler/url-filter.js +110 -0
  120. package/lib/crawler-orchestrator.d.ts +142 -0
  121. package/lib/crawler-orchestrator.js +309 -0
  122. package/lib/debug.d.ts +8 -0
  123. package/lib/debug.js +9 -0
  124. package/lib/index.d.ts +16 -0
  125. package/lib/index.js +18 -0
  126. package/lib/qzilla.d.ts +136 -0
  127. package/lib/qzilla.js +292 -0
  128. package/lib/types.d.ts +27 -0
  129. package/lib/types.js +1 -0
  130. package/lib/utils/array/each-splitted.d.ts +10 -0
  131. package/lib/utils/array/each-splitted.js +14 -0
  132. package/lib/utils/array/index.d.ts +1 -0
  133. package/lib/utils/array/index.js +1 -0
  134. package/lib/utils/async/index.d.ts +1 -0
  135. package/lib/utils/async/index.js +1 -0
  136. package/lib/utils/debug.d.ts +5 -0
  137. package/lib/utils/debug.js +5 -0
  138. package/lib/utils/error/dom-evaluation-error.d.ts +7 -0
  139. package/lib/utils/error/dom-evaluation-error.js +7 -0
  140. package/lib/utils/error/error-emitter.d.ts +18 -0
  141. package/lib/utils/error/error-emitter.js +29 -0
  142. package/lib/utils/error/index.d.ts +3 -0
  143. package/lib/utils/error/index.js +2 -0
  144. package/lib/utils/event-emitter/index.d.ts +6 -0
  145. package/lib/utils/event-emitter/index.js +6 -0
  146. package/lib/utils/index.d.ts +5 -0
  147. package/lib/utils/index.js +5 -0
  148. package/lib/utils/network/index.d.ts +1 -0
  149. package/lib/utils/network/index.js +1 -0
  150. package/lib/utils/object/clean-object.d.ts +8 -0
  151. package/lib/utils/object/clean-object.js +13 -0
  152. package/lib/utils/object/index.d.ts +1 -0
  153. package/lib/utils/object/index.js +1 -0
  154. package/lib/utils/path/index.d.ts +1 -0
  155. package/lib/utils/path/index.js +1 -0
  156. package/lib/utils/path/safe-filepath.d.ts +7 -0
  157. package/lib/utils/path/safe-filepath.js +12 -0
  158. package/lib/utils/regexp/index.d.ts +1 -0
  159. package/lib/utils/regexp/index.js +1 -0
  160. package/lib/utils/retryable/index.d.ts +2 -0
  161. package/lib/utils/retryable/index.js +1 -0
  162. package/lib/utils/sort/index.d.ts +14 -0
  163. package/lib/utils/sort/index.js +61 -0
  164. package/lib/utils/sort/remove-matches.d.ts +9 -0
  165. package/lib/utils/sort/remove-matches.js +23 -0
  166. package/lib/utils/types/index.d.ts +1 -0
  167. package/lib/utils/types/index.js +1 -0
  168. package/lib/utils/types/types.d.ts +46 -0
  169. package/lib/utils/types/types.js +1 -0
  170. package/lib/utils/url/index.d.ts +5 -0
  171. package/lib/utils/url/index.js +5 -0
  172. package/lib/utils/url/is-lower-layer.d.ts +15 -0
  173. package/lib/utils/url/is-lower-layer.js +55 -0
  174. package/lib/utils/url/parse-url.d.ts +11 -0
  175. package/lib/utils/url/parse-url.js +20 -0
  176. package/lib/utils/url/path-match.d.ts +11 -0
  177. package/lib/utils/url/path-match.js +18 -0
  178. package/lib/utils/url/sort-url.d.ts +10 -0
  179. package/lib/utils/url/sort-url.js +24 -0
  180. package/lib/utils/url/url-partial-match.d.ts +11 -0
  181. package/lib/utils/url/url-partial-match.js +32 -0
  182. package/package.json +49 -0
  183. package/src/archive/__mock__/.gitignore +3 -0
  184. package/src/archive/__mock__/mock.sqlite +0 -0
  185. package/src/archive/archive-accessor.ts +337 -0
  186. package/src/archive/archive.ts +408 -0
  187. package/src/archive/database.spec.ts +469 -0
  188. package/src/archive/database.ts +1059 -0
  189. package/src/archive/debug.ts +10 -0
  190. package/src/archive/filesystem/append-text.spec.ts +26 -0
  191. package/src/archive/filesystem/append-text.ts +16 -0
  192. package/src/archive/filesystem/copy-dir-sync.spec.ts +27 -0
  193. package/src/archive/filesystem/copy-dir-sync.ts +10 -0
  194. package/src/archive/filesystem/copy-dir.spec.ts +33 -0
  195. package/src/archive/filesystem/copy-dir.ts +14 -0
  196. package/src/archive/filesystem/exists.spec.ts +33 -0
  197. package/src/archive/filesystem/exists.ts +10 -0
  198. package/src/archive/filesystem/get-file-list.spec.ts +37 -0
  199. package/src/archive/filesystem/get-file-list.ts +13 -0
  200. package/src/archive/filesystem/index.ts +17 -0
  201. package/src/archive/filesystem/is-dir.spec.ts +29 -0
  202. package/src/archive/filesystem/is-dir.ts +11 -0
  203. package/src/archive/filesystem/mkdir.spec.ts +37 -0
  204. package/src/archive/filesystem/mkdir.ts +16 -0
  205. package/src/archive/filesystem/output-json.spec.ts +34 -0
  206. package/src/archive/filesystem/output-json.ts +16 -0
  207. package/src/archive/filesystem/output-text.spec.ts +31 -0
  208. package/src/archive/filesystem/output-text.ts +35 -0
  209. package/src/archive/filesystem/read-json.spec.ts +26 -0
  210. package/src/archive/filesystem/read-json.ts +12 -0
  211. package/src/archive/filesystem/read-text.spec.ts +25 -0
  212. package/src/archive/filesystem/read-text.ts +11 -0
  213. package/src/archive/filesystem/readline.spec.ts +29 -0
  214. package/src/archive/filesystem/readline.ts +30 -0
  215. package/src/archive/filesystem/remove.spec.ts +34 -0
  216. package/src/archive/filesystem/remove.ts +11 -0
  217. package/src/archive/filesystem/rename.spec.ts +46 -0
  218. package/src/archive/filesystem/rename.ts +21 -0
  219. package/src/archive/filesystem/tar.spec.ts +33 -0
  220. package/src/archive/filesystem/tar.ts +27 -0
  221. package/src/archive/filesystem/untar.spec.ts +34 -0
  222. package/src/archive/filesystem/untar.ts +36 -0
  223. package/src/archive/index.ts +13 -0
  224. package/src/archive/page.spec.ts +368 -0
  225. package/src/archive/page.ts +420 -0
  226. package/src/archive/resource.spec.ts +101 -0
  227. package/src/archive/resource.ts +73 -0
  228. package/src/archive/safe-path.spec.ts +44 -0
  229. package/src/archive/safe-path.ts +18 -0
  230. package/src/archive/types.ts +227 -0
  231. package/src/crawler/clear-destination-cache.spec.ts +20 -0
  232. package/src/crawler/clear-destination-cache.ts +9 -0
  233. package/src/crawler/crawler.ts +873 -0
  234. package/src/crawler/decompose-url.spec.ts +48 -0
  235. package/src/crawler/decompose-url.ts +90 -0
  236. package/src/crawler/destination-cache.spec.ts +23 -0
  237. package/src/crawler/destination-cache.ts +8 -0
  238. package/src/crawler/detect-pagination-pattern.spec.ts +169 -0
  239. package/src/crawler/detect-pagination-pattern.ts +66 -0
  240. package/src/crawler/fetch-destination.ts +257 -0
  241. package/src/crawler/fetch-robots-txt.spec.ts +83 -0
  242. package/src/crawler/fetch-robots-txt.ts +91 -0
  243. package/src/crawler/find-best-matching-scope.spec.ts +39 -0
  244. package/src/crawler/find-best-matching-scope.ts +57 -0
  245. package/src/crawler/generate-predicted-urls.spec.ts +42 -0
  246. package/src/crawler/generate-predicted-urls.ts +34 -0
  247. package/src/crawler/handle-ignore-and-skip.spec.ts +66 -0
  248. package/src/crawler/handle-ignore-and-skip.ts +30 -0
  249. package/src/crawler/handle-resource-response.spec.ts +45 -0
  250. package/src/crawler/handle-resource-response.ts +21 -0
  251. package/src/crawler/handle-scrape-end.spec.ts +109 -0
  252. package/src/crawler/handle-scrape-end.ts +115 -0
  253. package/src/crawler/handle-scrape-error.spec.ts +105 -0
  254. package/src/crawler/handle-scrape-error.ts +58 -0
  255. package/src/crawler/index.ts +2 -0
  256. package/src/crawler/inject-scope-auth.spec.ts +36 -0
  257. package/src/crawler/inject-scope-auth.ts +27 -0
  258. package/src/crawler/is-external-url.spec.ts +31 -0
  259. package/src/crawler/is-external-url.ts +17 -0
  260. package/src/crawler/is-in-any-lower-layer.spec.ts +31 -0
  261. package/src/crawler/is-in-any-lower-layer.ts +22 -0
  262. package/src/crawler/link-list.spec.ts +355 -0
  263. package/src/crawler/link-list.ts +275 -0
  264. package/src/crawler/link-to-page-data.spec.ts +133 -0
  265. package/src/crawler/link-to-page-data.ts +34 -0
  266. package/src/crawler/net-timeout-error.spec.ts +25 -0
  267. package/src/crawler/net-timeout-error.ts +11 -0
  268. package/src/crawler/protocol-agnostic-key.spec.ts +40 -0
  269. package/src/crawler/protocol-agnostic-key.ts +11 -0
  270. package/src/crawler/reconstruct-url.spec.ts +37 -0
  271. package/src/crawler/reconstruct-url.ts +37 -0
  272. package/src/crawler/robots-checker.spec.ts +104 -0
  273. package/src/crawler/robots-checker.ts +73 -0
  274. package/src/crawler/should-discard-predicted.spec.ts +125 -0
  275. package/src/crawler/should-discard-predicted.ts +33 -0
  276. package/src/crawler/should-skip-url.spec.ts +77 -0
  277. package/src/crawler/should-skip-url.ts +37 -0
  278. package/src/crawler/types.ts +146 -0
  279. package/src/crawler-orchestrator.ts +401 -0
  280. package/src/debug.ts +10 -0
  281. package/src/index.ts +25 -0
  282. package/src/types.ts +30 -0
  283. package/src/utils/array/each-splitted.spec.ts +38 -0
  284. package/src/utils/array/each-splitted.ts +19 -0
  285. package/src/utils/array/index.ts +1 -0
  286. package/src/utils/debug.ts +6 -0
  287. package/src/utils/error/dom-evaluation-error.spec.ts +20 -0
  288. package/src/utils/error/dom-evaluation-error.ts +6 -0
  289. package/src/utils/error/error-emitter.spec.ts +78 -0
  290. package/src/utils/error/error-emitter.ts +44 -0
  291. package/src/utils/error/index.ts +3 -0
  292. package/src/utils/index.ts +5 -0
  293. package/src/utils/object/clean-object.spec.ts +24 -0
  294. package/src/utils/object/clean-object.ts +13 -0
  295. package/src/utils/object/index.ts +1 -0
  296. package/src/utils/types/index.ts +1 -0
  297. package/src/utils/types/types.ts +65 -0
  298. package/tsconfig.json +11 -0
  299. package/tsconfig.tsbuildinfo +1 -0
@@ -0,0 +1,309 @@
1
+ import { tryParseUrl as parseUrl } from '@d-zero/shared/parse-url';
2
+ import { sortUrl } from '@d-zero/shared/sort-url';
3
+ import { TypedAwaitEventEmitter as EventEmitter } from '@d-zero/shared/typed-await-event-emitter';
4
+ import pkg from '../package.json' with { type: 'json' };
5
+ import Archive from './archive/archive.js';
6
+ import { clearDestinationCache, Crawler } from './crawler/index.js';
7
+ import { crawlerLog, log } from './debug.js';
8
+ import { cleanObject } from './utils/index.js';
9
/**
 * External URL prefixes that are excluded from crawling by default.
 *
 * These are social-media share/login endpoints that commonly appear in
 * page footers and share buttons but provide no useful crawl data.
 * Matching is by URL prefix; order is preserved for consumers.
 */
export const DEFAULT_EXCLUDED_EXTERNAL_URLS = [
	// LINE share/login endpoints
	'https://social-plugins.line.me',
	'https://access.line.me',
	'https://lineit.line.me',
	'https://line.me',
	// Google+ (defunct) share endpoint
	'https://plus.google.com',
	// X / Twitter
	'https://twitter.com',
	'https://x.com',
	// Facebook share endpoints
	'https://www.facebook.com/share.php',
	'https://www.facebook.com/share/',
	'https://www.facebook.com/sharer/',
	'https://www.facebook.com/share_channel/',
	// Google top-level domain
	'https://www.google.com',
];
28
/**
 * The main entry point for Nitpicker web crawling and archiving.
 *
 * CrawlerOrchestrator orchestrates the full lifecycle of a crawl session: it creates an archive,
 * configures a {@link Crawler}, processes discovered pages and resources, and
 * writes the final archive file. It emits events defined by {@link CrawlEvent}.
 *
 * Instances are created via the static factory methods {@link CrawlerOrchestrator.crawling}
 * or {@link CrawlerOrchestrator.resume}; the constructor is not meant to be
 * called directly (direct use is lint-restricted, see the eslint-disable below).
 * @example
 * ```ts
 * const orchestrator = await CrawlerOrchestrator.crawling(['https://example.com'], { recursive: true });
 * await orchestrator.write();
 * ```
 */
export class CrawlerOrchestrator extends EventEmitter {
    /** The archive instance for persisting crawl results to SQLite + tar. */
    #archive;
    /** The crawler engine that discovers and scrapes pages. */
    #crawler;
    /** Whether the crawl was started from a pre-defined URL list (non-recursive mode). */
    #fromList;
    /**
     * The underlying archive instance used for storing crawl results.
     */
    get archive() {
        return this.#archive;
    }
    // eslint-disable-next-line no-restricted-syntax
    /**
     * @param archive - The archive to persist crawl results into.
     * @param options - Crawl configuration overrides (interval, parallels, scope, excludes, ...).
     */
    constructor(archive, options) {
        super();
        this.#fromList = !!options?.list;
        this.#archive = archive;
        // NOTE: this handler references this.#crawler, which is assigned only
        // further down in this constructor. That is safe because the handler
        // can only fire on future 'error' events, after construction completes.
        this.#archive.on('error', (e) => {
            // A storage failure makes further crawling pointless: stop the
            // crawler, then surface the error through this emitter.
            this.#crawler.abort();
            void this.emit('error', {
                pid: process.pid,
                isMainProcess: true,
                url: null,
                // Normalize non-Error throwables into Error instances.
                error: e instanceof Error ? e : new Error(String(e)),
            });
        });
        const defaultUserAgent = `Nitpicker/${pkg.version}`;
        // Fall back to defaults for every unset option. Note the deliberate
        // mix: `||` treats 0/'' as unset, `??` only treats null/undefined.
        this.#crawler = new Crawler({
            interval: options?.interval || 0,
            parallels: options?.parallels || 0,
            captureImages: options?.image,
            executablePath: options?.executablePath || null,
            fetchExternal: options?.fetchExternal ?? true,
            recursive: options?.recursive ?? true,
            scope: options?.scope ?? [],
            excludes: normalizeToArray(options?.excludes),
            excludeKeywords: normalizeToArray(options?.excludeKeywords),
            excludeUrls: [
                ...DEFAULT_EXCLUDED_EXTERNAL_URLS,
                ...normalizeToArray(options?.excludeUrls),
            ],
            maxExcludedDepth: options?.maxExcludedDepth || 10,
            retry: options?.retry ?? 3,
            disableQueries: options?.disableQueries,
            verbose: options?.verbose ?? false,
            userAgent: options?.userAgent || defaultUserAgent,
            ignoreRobots: options?.ignoreRobots ?? false,
        });
    }
    /**
     * Abort the current crawl and archive operations.
     *
     * Delegates to the archive's abort method, which stops all in-progress
     * database writes and cleans up temporary resources.
     * @returns The result of the archive abort operation.
     */
    abort() {
        return this.#archive.abort();
    }
    /**
     * Execute the crawl for the given list of URLs.
     *
     * Sets up event listeners on the crawler, starts crawling, and resolves
     * when the crawl completes. Discovered pages, external pages, skipped pages,
     * and resources are forwarded to the archive for storage.
     *
     * NOTE(review): listeners are registered on every call, so calling this
     * more than once per instance would duplicate handlers — presumably it is
     * invoked exactly once by the static factories; verify before reusing.
     * @param list - The list of parsed URLs to crawl. The first URL is used as the root.
     * @returns A promise that resolves when crawling is complete.
     * @throws {Error} If the URL list is empty.
     */
    async crawling(list) {
        const root = list[0];
        if (!root) {
            throw new Error('URL is empty');
        }
        return new Promise((resolve, reject) => {
            // Crawler errors are recorded in the archive and re-emitted, but
            // deliberately do not reject: the crawl continues past bad pages.
            this.#crawler.on('error', (error) => {
                crawlerLog('On error: %O', error);
                void this.#archive.addError(error);
                void this.emit('error', error);
            });
            this.#crawler.on('page', async ({ result }) => {
                await this.#archive.setPage(result).catch((error) => reject(error));
            });
            this.#crawler.on('externalPage', ({ result }) => {
                this.#archive.setExternalPage(result).catch((error) => reject(error));
            });
            this.#crawler.on('skip', ({ url, reason, isExternal }) => {
                this.#archive
                    .setSkippedPage(url, reason, isExternal)
                    .catch((error) => reject(error));
            });
            this.#crawler.on('response', ({ resource }) => {
                this.#archive.setResources(resource).catch((error) => reject(error));
            });
            this.#crawler.on('responseReferrers', (resource) => {
                this.#archive.setResourcesReferrers(resource).catch((error) => reject(error));
            });
            // Completion is signaled by the crawler's 'crawlEnd' event, not by
            // the return value of start()/startMultiple() below.
            this.#crawler.on('crawlEnd', () => {
                resolve();
            });
            if (this.#fromList) {
                this.#crawler.startMultiple(list);
            }
            else {
                this.#crawler.start(root);
            }
        });
    }
    /**
     * Kill any zombie Chromium processes that were not properly cleaned up.
     *
     * Retrieves the list of undead process IDs from the crawler and sends
     * a SIGTERM signal to each one (process.kill's default signal). Chromium
     * is intentionally sent SIGTERM (not SIGKILL) to avoid leaving zombie
     * processes.
     */
    garbageCollect() {
        const pidList = this.getUndeadPid();
        log('Undead PIDs: %O', pidList);
        for (const pid of pidList) {
            try {
                log('Garbage collect: kill PID:%d', pid);
                // Chromium becomes a zombie process if SIGKILL signal.
                process.kill(pid);
            }
            catch (error) {
                // Best effort: the process may already have exited (ESRCH).
                log('Garbage collect: Failed killing PID:%d %O', pid, error);
            }
        }
    }
    /**
     * Retrieve the list of process IDs for Chromium instances that are
     * still running after crawling has ended.
     * @returns An array of process IDs that should be terminated.
     */
    getUndeadPid() {
        return this.#crawler.getUndeadPid();
    }
    /**
     * Write the archive to its configured file path.
     *
     * Emits `writeFileStart` before writing and `writeFileEnd` after
     * the write completes successfully.
     */
    async write() {
        void this.emit('writeFileStart', { filePath: this.#archive.filePath });
        await this.#archive.write();
        void this.emit('writeFileEnd', { filePath: this.#archive.filePath });
    }
    /**
     * Create a new CrawlerOrchestrator instance and start crawling the given URLs.
     *
     * This is the primary factory method for starting a fresh crawl. It:
     * 1. Parses and sorts the input URLs
     * 2. Creates an archive file
     * 3. Saves the crawl configuration
     * 4. Runs the optional initialized callback
     * 5. Executes the crawl
     * 6. Sorts the archived URLs in natural order
     * @param url - One or more URL strings to crawl.
     * @param options - Optional configuration overrides for the crawl session.
     * @param initializedCallback - Optional callback invoked after initialization but before crawling starts.
     * @returns A promise that resolves to the CrawlerOrchestrator instance after crawling completes.
     * @throws {Error} If the URL list is empty or contains no valid URLs.
     */
    static async crawling(url, options, initializedCallback) {
        const list = sortUrl(url, options);
        const urlParsed = list[0];
        if (!urlParsed) {
            throw new Error('URL is empty');
        }
        // Archive file name: "<hostname>-<timestamp>.<ext>" under cwd.
        const fileName = `${urlParsed.hostname}-${Archive.timestamp()}`;
        const cwd = options?.cwd ?? process.cwd();
        const filePath = Archive.joinPath(cwd, `${fileName}.${Archive.FILE_EXTENSION}`);
        const disableQueries = options?.disableQueries || false;
        const defaultUserAgent = `Nitpicker/${pkg.version}`;
        const archive = await Archive.create({ filePath, cwd, disableQueries });
        // Persist the effective configuration so the crawl can be resumed later.
        // NOTE(review): the saved config reads singular option names (exclude,
        // excludeKeyword, excludeUrl) while the constructor reads plural ones
        // (excludes, excludeKeywords, excludeUrls) — tracked by the TODOs below.
        await archive.setConfig({
            version: pkg.version,
            name: fileName,
            baseUrl: urlParsed.withoutHash,
            recursive: options?.recursive ?? true,
            fetchExternal: options?.fetchExternal ?? true,
            image: options?.image ?? true,
            interval: options?.interval || 0,
            parallels: options?.parallels || 0,
            scope: options?.scope ?? [],
            // @ts-expect-error TODO: Fix CLI arguments
            excludes: normalizeToArray(options?.exclude),
            // @ts-expect-error TODO: Fix CLI arguments
            excludeKeywords: normalizeToArray(options?.excludeKeyword),
            excludeUrls: [
                ...DEFAULT_EXCLUDED_EXTERNAL_URLS,
                // @ts-expect-error TODO: Fix CLI arguments
                ...normalizeToArray(options?.excludeUrl),
            ],
            maxExcludedDepth: options?.maxExcludedDepth || 10,
            retry: options?.retry ?? 3,
            fromList: !!options?.list,
            disableQueries,
            userAgent: options?.userAgent || defaultUserAgent,
            ignoreRobots: options?.ignoreRobots ?? false,
        });
        const orchestrator = new CrawlerOrchestrator(archive, options);
        const config = await archive.getConfig();
        if (initializedCallback) {
            await initializedCallback(orchestrator, config);
        }
        log('Start crawling');
        log('URL %O', list.map((url) => url.href));
        log('Config %O', config);
        await orchestrator.crawling(list);
        log('Crawling completed');
        clearDestinationCache();
        log('Set order natural URL sort');
        await archive.setUrlOrder();
        log('Sorting done');
        return orchestrator;
    }
    /**
     * Resume a previously interrupted crawl from an existing archive file.
     *
     * Restores the crawl state (pending URLs, scraped URLs, and resources)
     * from the archive, merges any option overrides, and continues crawling
     * from where it left off.
     * @param stubPath - Path to the existing archive file to resume from.
     * @param options - Optional configuration overrides to apply on top of the archived config.
     * @param initializedCallback - Optional callback invoked after initialization but before crawling resumes.
     * @returns A promise that resolves to the CrawlerOrchestrator instance after crawling completes.
     * @throws {Error} If the archived URL is invalid.
     */
    static async resume(stubPath, options, initializedCallback) {
        const archive = await Archive.resume(stubPath);
        const archivedConfig = await archive.getConfig();
        // Option overrides win over the archived config; cleanObject strips
        // absent values so they do not clobber archived settings.
        const config = {
            ...archivedConfig,
            ...cleanObject(options),
        };
        const orchestrator = new CrawlerOrchestrator(archive, config);
        const _url = await archive.getUrl();
        const url = parseUrl(_url, config);
        if (!url) {
            throw new Error(`URL (${_url}) is invalid`);
        }
        // Re-seed the crawler with the persisted crawl state so already-scraped
        // URLs are not fetched again.
        const { scraped, pending } = await archive.getCrawlingState();
        const resources = await archive.getResourceUrlList();
        orchestrator.#crawler.resume(pending, scraped, resources);
        if (initializedCallback) {
            await initializedCallback(orchestrator, config);
        }
        log('Start resuming');
        log('Data %s', stubPath);
        log('URL %s', url.href);
        log('Config %O', config);
        await orchestrator.crawling([url]);
        return orchestrator;
    }
}
301
/**
 * Normalize an optional parameter that may be a single value, an array,
 * null, or undefined into a guaranteed array.
 *
 * Arrays are returned as-is (same reference). Any falsy scalar
 * (`null`, `undefined`, `''`, `0`, `false`) normalizes to an empty array.
 * @param param - The parameter to normalize.
 * @returns An array containing the parameter value(s), or an empty array if absent.
 */
function normalizeToArray(param) {
    if (Array.isArray(param)) {
        return param;
    }
    if (param) {
        return [param];
    }
    return [];
}
package/lib/debug.d.ts ADDED
@@ -0,0 +1,8 @@
1
// Type declarations for the package's debug loggers (see lib/debug.js).
// All loggers share the `Nitpicker` root namespace from the `debug` package.
/** Debug logger for the core package. Namespace: `Nitpicker`. */
export declare const log: import("debug").Debugger;
/** Debug logger for the crawler module. Namespace: `Nitpicker:Crawler`. */
export declare const crawlerLog: import("debug").Debugger;
/** Debug logger for the dealer integration. Namespace: `Nitpicker:Crawler:Deal`. */
export declare const dealLog: import("debug").Debugger;
/** Debug logger for crawler errors. Namespace: `Nitpicker:Crawler:Error`. */
export declare const crawlerErrorLog: import("debug").Debugger;
package/lib/debug.js ADDED
@@ -0,0 +1,9 @@
1
import { log as globalLog } from './utils/debug.js';
// Namespaced debug loggers derived from the shared root logger.
// Enable with e.g. DEBUG=Nitpicker:* (standard `debug` package convention).
/** Debug logger for the core package. Namespace: `Nitpicker`. */
export const log = globalLog;
/** Debug logger for the crawler module. Namespace: `Nitpicker:Crawler`. */
export const crawlerLog = log.extend('Crawler');
/** Debug logger for the dealer integration. Namespace: `Nitpicker:Crawler:Deal`. */
export const dealLog = crawlerLog.extend('Deal');
/** Debug logger for crawler errors. Namespace: `Nitpicker:Crawler:Error`. */
export const crawlerErrorLog = crawlerLog.extend('Error');
package/lib/index.d.ts ADDED
@@ -0,0 +1,16 @@
1
/**
 * @module @nitpicker/crawler
 *
 * Core module of Nitpicker that provides the main crawling engine,
 * utility functions, type definitions, and archive storage layer.
 */
// Utilities
export * from './utils/index.js';
// Archive storage layer
export { ArchiveAccessor } from './archive/archive-accessor.js';
export type { Redirect, Referrer, Anchor, StaticPageData } from './archive/page.js';
export { default as Page } from './archive/page.js';
export { default as ArchiveResource } from './archive/resource.js';
export * from './archive/types.js';
export { default as Archive } from './archive/archive.js';
// Core crawler entry point
export { DEFAULT_EXCLUDED_EXTERNAL_URLS, CrawlerOrchestrator, } from './crawler-orchestrator.js';
export * from './types.js';
export * from './crawler/types.js';
package/lib/index.js ADDED
@@ -0,0 +1,18 @@
1
/**
 * @module @nitpicker/crawler
 *
 * Core module of Nitpicker that provides the main crawling engine,
 * utility functions, type definitions, and archive storage layer.
 */
// Types + Utils (formerly the @nitpicker/types + utils packages)
export * from './utils/index.js';
// Archive
export { ArchiveAccessor } from './archive/archive-accessor.js';
export { default as Page } from './archive/page.js';
export { default as ArchiveResource } from './archive/resource.js';
export * from './archive/types.js';
export { default as Archive } from './archive/archive.js';
// Core
export { DEFAULT_EXCLUDED_EXTERNAL_URLS, CrawlerOrchestrator, } from './crawler-orchestrator.js';
export * from './types.js';
export * from './crawler/types.js';
@@ -0,0 +1,136 @@
1
+ import type { QzillaEvent } from './types.js';
2
+ import type { Config } from './archive/types.js';
3
+ import Archive from './archive/archive.js';
4
+ import { EventEmitter } from './utils/index.js';
5
+ import type { ExURL } from './utils/index.js';
6
+ /**
7
+ * Default list of external URL prefixes excluded from crawling.
8
+ * Includes social media sharing endpoints that are commonly linked
9
+ * but provide no useful crawl data.
10
+ */
11
+ export declare const DEFAULT_EXCLUDED_EXTERNAL_URLS: string[];
12
+ /**
13
+ * Configuration options for the Qzilla crawler.
14
+ *
15
+ * Extends the archive {@link Config} with additional runtime settings
16
+ * such as working directory, browser executable path, and output options.
17
+ */
18
+ type QzillaConfig = {
19
+ /** The working directory for output files. Defaults to `process.cwd()`. */
20
+ cwd: string;
21
+ /** Path to a Chromium/Chrome executable for Puppeteer. */
22
+ executablePath: string;
23
+ /** Output file path for the archive. */
24
+ filePath: string;
25
+ /** Whether to capture image resources during crawling. */
26
+ image: boolean;
27
+ /** File-size threshold (in bytes) above which images are excluded. */
28
+ imageFileSizeThreshold: number;
29
+ /** Delay in milliseconds between each page request. */
30
+ interval: number;
31
+ /** Whether the input is a pre-defined URL list (non-recursive mode). */
32
+ list: boolean;
33
+ /** Whether to enable verbose logging output. */
34
+ verbose: boolean;
35
+ } & Config;
36
+ /**
37
+ * Callback invoked after the Qzilla instance is fully initialized
38
+ * but before crawling begins.
39
+ * @param qzilla - The initialized Qzilla instance.
40
+ * @param config - The resolved archive configuration.
41
+ */
42
+ type QzillaInitializedCallback = (qzilla: Qzilla, config: Config) => void | Promise<void>;
43
+ /**
44
+ * The main entry point for Qzilla web crawling and archiving.
45
+ *
46
+ * Qzilla orchestrates the full lifecycle of a crawl session: it creates an archive,
47
+ * configures a {@link Crawler}, processes discovered pages and resources, and
48
+ * writes the final archive file. It emits events defined by {@link QzillaEvent}.
49
+ *
50
+ * Instances are created via the static factory methods {@link Qzilla.crawling}
51
+ * or {@link Qzilla.resume}; the constructor is private.
52
+ * @example
53
+ * ```ts
54
+ * const qzilla = await Qzilla.crawling(['https://example.com'], { recursive: true });
55
+ * await qzilla.write();
56
+ * ```
57
+ */
58
+ export declare class Qzilla extends EventEmitter<QzillaEvent> {
59
+ #private;
60
+ /**
61
+ * The underlying archive instance used for storing crawl results.
62
+ */
63
+ get archive(): Archive;
64
+ private constructor();
65
+ /**
66
+ * Abort the current crawl and archive operations.
67
+ *
68
+ * Delegates to the archive's abort method, which stops all in-progress
69
+ * database writes and cleans up temporary resources.
70
+ * @returns The result of the archive abort operation.
71
+ */
72
+ abort(): void;
73
+ /**
74
+ * Execute the crawl for the given list of URLs.
75
+ *
76
+ * Sets up event listeners on the crawler, starts crawling, and resolves
77
+ * when the crawl completes. Discovered pages, external pages, skipped pages,
78
+ * and resources are forwarded to the archive for storage.
79
+ * @param list - The list of parsed URLs to crawl. The first URL is used as the root.
80
+ * @returns A promise that resolves when crawling is complete.
81
+ * @throws {Error} If the URL list is empty.
82
+ */
83
+ crawling(list: ExURL[]): Promise<void>;
84
+ /**
85
+ * Kill any zombie Chromium processes that were not properly cleaned up.
86
+ *
87
+ * Retrieves the list of undead process IDs from the crawler and sends
88
+ * a SIGTERM signal to each one. Chromium is intentionally sent SIGTERM
89
+ * (not SIGKILL) to avoid leaving zombie processes.
90
+ */
91
+ garbageCollect(): void;
92
+ /**
93
+ * Retrieve the list of process IDs for Chromium instances that are
94
+ * still running after crawling has ended.
95
+ * @returns An array of process IDs that should be terminated.
96
+ */
97
+ getUndeadPid(): never[];
98
+ /**
99
+ * Write the archive to its configured file path.
100
+ *
101
+ * Emits `writeFileStart` before writing and `writeFileEnd` after
102
+ * the write completes successfully.
103
+ */
104
+ write(): Promise<void>;
105
+ /**
106
+ * Create a new Qzilla instance and start crawling the given URLs.
107
+ *
108
+ * This is the primary factory method for starting a fresh crawl. It:
109
+ * 1. Parses and sorts the input URLs
110
+ * 2. Creates an archive file
111
+ * 3. Saves the crawl configuration
112
+ * 4. Runs the optional initialized callback
113
+ * 5. Executes the crawl
114
+ * 6. Sorts the archived URLs in natural order
115
+ * @param url - One or more URL strings to crawl.
116
+ * @param options - Optional configuration overrides for the crawl session.
117
+ * @param initializedCallback - Optional callback invoked after initialization but before crawling starts.
118
+ * @returns A promise that resolves to the Qzilla instance after crawling completes.
119
+ * @throws {Error} If the URL list is empty or contains no valid URLs.
120
+ */
121
+ static crawling(url: string[], options?: Partial<QzillaConfig>, initializedCallback?: QzillaInitializedCallback): Promise<Qzilla>;
122
+ /**
123
+ * Resume a previously interrupted crawl from an existing archive file.
124
+ *
125
+ * Restores the crawl state (pending URLs, scraped URLs, and resources)
126
+ * from the archive, merges any option overrides, and continues crawling
127
+ * from where it left off.
128
+ * @param stubPath - Path to the existing archive file to resume from.
129
+ * @param options - Optional configuration overrides to apply on top of the archived config.
130
+ * @param initializedCallback - Optional callback invoked after initialization but before crawling resumes.
131
+ * @returns A promise that resolves to the Qzilla instance after crawling completes.
132
+ * @throws {Error} If the archived URL is invalid.
133
+ */
134
+ static resume(stubPath: string, options?: Partial<QzillaConfig>, initializedCallback?: QzillaInitializedCallback): Promise<Qzilla>;
135
+ }
136
+ export {};