hydra-crawler 2.8.4 → 3.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (467)
  1. package/dist/apis/{autocomplete.api.d.ts → autocomplete.api.d.mts} +4 -4
  2. package/dist/apis/autocomplete.api.mjs +14 -0
  3. package/dist/apis/autocomplete.api.mjs.map +1 -0
  4. package/dist/apis/{bugs.api.d.ts → bugs.api.d.mts} +4 -4
  5. package/dist/apis/bugs.api.mjs +18 -0
  6. package/dist/apis/bugs.api.mjs.map +1 -0
  7. package/dist/apis/{crawl.api.d.ts → crawl.api.d.mts} +4 -4
  8. package/dist/apis/crawl.api.mjs +17 -0
  9. package/dist/apis/crawl.api.mjs.map +1 -0
  10. package/dist/apis/{domains.api.d.ts → domains.api.d.mts} +4 -4
  11. package/dist/apis/{domains.api.js → domains.api.mjs} +10 -18
  12. package/dist/apis/domains.api.mjs.map +1 -0
  13. package/dist/apis/{images.api.d.ts → images.api.d.mts} +4 -4
  14. package/dist/apis/images.api.mjs +19 -0
  15. package/dist/apis/images.api.mjs.map +1 -0
  16. package/dist/apis/{statistics.api.d.ts → statistics.api.d.mts} +4 -4
  17. package/dist/apis/statistics.api.mjs +33 -0
  18. package/dist/apis/statistics.api.mjs.map +1 -0
  19. package/dist/apis/{test.api.d.ts → test.api.d.mts} +3 -3
  20. package/dist/apis/test.api.mjs +10 -0
  21. package/dist/apis/test.api.mjs.map +1 -0
  22. package/dist/apis/{urls.api.d.ts → urls.api.d.mts} +4 -4
  23. package/dist/apis/urls.api.mjs +21 -0
  24. package/dist/apis/urls.api.mjs.map +1 -0
  25. package/dist/apps/{cleanup.app.d.ts → cleanup.app.d.mts} +5 -5
  26. package/dist/apps/cleanup.app.mjs +129 -0
  27. package/dist/apps/cleanup.app.mjs.map +1 -0
  28. package/dist/apps/{cross-populate-export.app.d.ts → cross-populate-export.app.d.mts} +3 -3
  29. package/dist/apps/cross-populate-export.app.mjs +61 -0
  30. package/dist/apps/cross-populate-export.app.mjs.map +1 -0
  31. package/dist/apps/{cross-populate-import.app.d.ts → cross-populate-import.app.d.mts} +3 -3
  32. package/dist/apps/cross-populate-import.app.mjs +61 -0
  33. package/dist/apps/cross-populate-import.app.mjs.map +1 -0
  34. package/dist/apps/{denylist.app.d.ts → denylist.app.d.mts} +5 -5
  35. package/dist/apps/denylist.app.mjs +135 -0
  36. package/dist/apps/denylist.app.mjs.map +1 -0
  37. package/dist/apps/{expire.app.d.ts → expire.app.d.mts} +8 -7
  38. package/dist/apps/expire.app.mjs +51 -0
  39. package/dist/apps/expire.app.mjs.map +1 -0
  40. package/dist/apps/{export-domain-urls.d.ts → export-domain-urls.d.mts} +3 -3
  41. package/dist/apps/export-domain-urls.mjs +85 -0
  42. package/dist/apps/export-domain-urls.mjs.map +1 -0
  43. package/dist/apps/{extract-text.app.d.ts → extract-text.app.d.mts} +2 -2
  44. package/dist/apps/extract-text.app.mjs +43 -0
  45. package/dist/apps/extract-text.app.mjs.map +1 -0
  46. package/dist/apps/{hydra.app.d.ts → hydra.app.d.mts} +10 -10
  47. package/dist/apps/hydra.app.mjs +222 -0
  48. package/dist/apps/hydra.app.mjs.map +1 -0
  49. package/dist/apps/{import.app.d.ts → import.app.d.mts} +3 -3
  50. package/dist/apps/import.app.mjs +48 -0
  51. package/dist/apps/import.app.mjs.map +1 -0
  52. package/dist/apps/{internal-hydra-common.app.d.ts → internal-hydra-common.app.d.mts} +6 -6
  53. package/dist/apps/{internal-hydra-common.app.js → internal-hydra-common.app.mjs} +1 -1
  54. package/dist/apps/internal-hydra-common.app.mjs.map +1 -0
  55. package/dist/apps/{move-to-archive.app.d.ts → move-to-archive.app.d.mts} +2 -2
  56. package/dist/apps/move-to-archive.app.mjs +31 -0
  57. package/dist/apps/move-to-archive.app.mjs.map +1 -0
  58. package/dist/apps/{prune-archive.app.d.ts → prune-archive.app.d.mts} +2 -2
  59. package/dist/apps/prune-archive.app.mjs +40 -0
  60. package/dist/apps/prune-archive.app.mjs.map +1 -0
  61. package/dist/apps/{query.app.d.ts → query.app.d.mts} +6 -6
  62. package/dist/apps/query.app.mjs +69 -0
  63. package/dist/apps/query.app.mjs.map +1 -0
  64. package/dist/apps/{reattempt.app.d.ts → reattempt.app.d.mts} +6 -6
  65. package/dist/apps/reattempt.app.mjs +81 -0
  66. package/dist/apps/reattempt.app.mjs.map +1 -0
  67. package/dist/apps/{requeue-domain.app.d.ts → requeue-domain.app.d.mts} +3 -3
  68. package/dist/apps/requeue-domain.app.mjs +52 -0
  69. package/dist/apps/requeue-domain.app.mjs.map +1 -0
  70. package/dist/apps/{seed.app.d.ts → seed.app.d.mts} +5 -5
  71. package/dist/apps/seed.app.mjs +59 -0
  72. package/dist/apps/seed.app.mjs.map +1 -0
  73. package/dist/apps/{startup.app.d.ts → startup.app.d.mts} +3 -3
  74. package/dist/apps/startup.app.mjs +55 -0
  75. package/dist/apps/startup.app.mjs.map +1 -0
  76. package/dist/apps/{unarchive-urls.app.d.ts → unarchive-urls.app.d.mts} +5 -5
  77. package/dist/apps/unarchive-urls.app.mjs +78 -0
  78. package/dist/apps/unarchive-urls.app.mjs.map +1 -0
  79. package/dist/classes/{cleaner.d.ts → cleaner.d.mts} +2 -2
  80. package/dist/classes/cleaner.mjs +264 -0
  81. package/dist/classes/cleaner.mjs.map +1 -0
  82. package/dist/classes/{crawler.d.ts → crawler.d.mts} +10 -12
  83. package/dist/classes/{crawler.js → crawler.mjs} +216 -184
  84. package/dist/classes/crawler.mjs.map +1 -0
  85. package/dist/classes/{dns.js → dns.mjs} +4 -4
  86. package/dist/classes/dns.mjs.map +1 -0
  87. package/dist/classes/{expirer.d.ts → expirer.d.mts} +2 -2
  88. package/dist/classes/expirer.mjs +117 -0
  89. package/dist/classes/expirer.mjs.map +1 -0
  90. package/dist/classes/{expiry.d.ts → expiry.d.mts} +1 -1
  91. package/dist/classes/{expiry.js → expiry.mjs} +7 -9
  92. package/dist/classes/expiry.mjs.map +1 -0
  93. package/dist/classes/{lists.d.ts → lists.d.mts} +2 -2
  94. package/dist/classes/{lists.js → lists.mjs} +4 -4
  95. package/dist/classes/lists.mjs.map +1 -0
  96. package/dist/classes/{robot.d.ts → robot.d.mts} +3 -3
  97. package/dist/classes/robot.mjs +74 -0
  98. package/dist/classes/robot.mjs.map +1 -0
  99. package/dist/classes/{tracker.d.ts → tracker.d.mts} +3 -3
  100. package/dist/classes/tracker.mjs +101 -0
  101. package/dist/classes/tracker.mjs.map +1 -0
  102. package/dist/{cli.js → cli.mjs} +46 -58
  103. package/dist/cli.mjs.map +1 -0
  104. package/dist/enums/{eavailable-strategy.js → eavailable-strategy.mjs} +1 -1
  105. package/dist/enums/eavailable-strategy.mjs.map +1 -0
  106. package/dist/enums/{elist.js → elist.mjs} +1 -1
  107. package/dist/enums/elist.mjs.map +1 -0
  108. package/dist/enums/{eserver.js → eserver.mjs} +1 -1
  109. package/dist/enums/eserver.mjs.map +1 -0
  110. package/dist/enums/{ex-powered-by.js → ex-powered-by.mjs} +1 -1
  111. package/dist/enums/ex-powered-by.mjs.map +1 -0
  112. package/dist/helpers/{matcher.d.ts → matcher.d.mts} +1 -1
  113. package/dist/helpers/{matcher.js → matcher.mjs} +1 -1
  114. package/dist/helpers/matcher.mjs.map +1 -0
  115. package/dist/helpers/{random.d.ts → random.d.mts} +1 -1
  116. package/dist/helpers/{random.js → random.mjs} +1 -1
  117. package/dist/helpers/random.mjs.map +1 -0
  118. package/dist/helpers/{utf-decoder.d.ts → utf-decoder.d.mts} +0 -1
  119. package/dist/helpers/{utf-decoder.js → utf-decoder.mjs} +3 -3
  120. package/dist/helpers/utf-decoder.mjs.map +1 -0
  121. package/dist/interfaces/{iexpiry.d.ts → iexpiry.d.mts} +1 -1
  122. package/dist/interfaces/{iexpiry.js → iexpiry.mjs} +3 -3
  123. package/dist/interfaces/iexpiry.mjs.map +1 -0
  124. package/dist/interfaces/{imatch.js → imatch.mjs} +2 -2
  125. package/dist/interfaces/imatch.mjs.map +1 -0
  126. package/dist/interfaces/{iparser-config.js → iparser-config.mjs} +2 -2
  127. package/dist/interfaces/iparser-config.mjs.map +1 -0
  128. package/dist/interfaces/{iparser.d.ts → iparser.d.mts} +1 -1
  129. package/dist/interfaces/iparser.mjs +2 -0
  130. package/dist/interfaces/iparser.mjs.map +1 -0
  131. package/dist/interfaces/{irequest-outcome.d.ts → irequest-outcome.d.mts} +0 -2
  132. package/dist/interfaces/irequest-outcome.mjs +2 -0
  133. package/dist/interfaces/irequest-outcome.mjs.map +1 -0
  134. package/dist/interfaces/iserver.mjs +2 -0
  135. package/dist/interfaces/iserver.mjs.map +1 -0
  136. package/dist/parsers/accessibility-metrics.parser.d.mts +11 -0
  137. package/dist/parsers/accessibility-metrics.parser.mjs +30 -0
  138. package/dist/parsers/accessibility-metrics.parser.mjs.map +1 -0
  139. package/dist/parsers/asp-error.parser.d.mts +12 -0
  140. package/dist/parsers/asp-error.parser.mjs +38 -0
  141. package/dist/parsers/asp-error.parser.mjs.map +1 -0
  142. package/dist/parsers/{bad-words.parser.d.ts → bad-words.parser.d.mts} +6 -6
  143. package/dist/parsers/bad-words.parser.mjs +17 -0
  144. package/dist/parsers/bad-words.parser.mjs.map +1 -0
  145. package/dist/parsers/complex-english.parser.d.mts +15 -0
  146. package/dist/parsers/complex-english.parser.mjs +52 -0
  147. package/dist/parsers/complex-english.parser.mjs.map +1 -0
  148. package/dist/parsers/data.parser.d.mts +13 -0
  149. package/dist/parsers/{data.parser.js → data.parser.mjs} +8 -7
  150. package/dist/parsers/data.parser.mjs.map +1 -0
  151. package/dist/parsers/{dictionary.parser.d.ts → dictionary.parser.d.mts} +6 -6
  152. package/dist/parsers/dictionary.parser.mjs +63 -0
  153. package/dist/parsers/dictionary.parser.mjs.map +1 -0
  154. package/dist/parsers/html.parser.d.mts +13 -0
  155. package/dist/parsers/{html.parser.js → html.parser.mjs} +4 -3
  156. package/dist/parsers/html.parser.mjs.map +1 -0
  157. package/dist/parsers/hyperlinks.parser.d.mts +20 -0
  158. package/dist/parsers/hyperlinks.parser.mjs +104 -0
  159. package/dist/parsers/hyperlinks.parser.mjs.map +1 -0
  160. package/dist/parsers/image-tags.parser.d.mts +19 -0
  161. package/dist/parsers/image-tags.parser.mjs +42 -0
  162. package/dist/parsers/image-tags.parser.mjs.map +1 -0
  163. package/dist/parsers/{interest.parser.d.ts → interest.parser.d.mts} +7 -7
  164. package/dist/parsers/interest.parser.mjs +60 -0
  165. package/dist/parsers/interest.parser.mjs.map +1 -0
  166. package/dist/parsers/jpeg.parser.d.mts +11 -0
  167. package/dist/parsers/jpeg.parser.mjs +29 -0
  168. package/dist/parsers/jpeg.parser.mjs.map +1 -0
  169. package/dist/parsers/{llama-guard.parser.d.ts → llama-guard.parser.d.mts} +7 -7
  170. package/dist/parsers/llama-guard.parser.mjs +56 -0
  171. package/dist/parsers/llama-guard.parser.mjs.map +1 -0
  172. package/dist/parsers/{offence.parser.d.ts → offence.parser.d.mts} +7 -7
  173. package/dist/parsers/offence.parser.mjs +60 -0
  174. package/dist/parsers/offence.parser.mjs.map +1 -0
  175. package/dist/parsers/{ollama.parser.d.ts → ollama.parser.d.mts} +6 -6
  176. package/dist/parsers/ollama.parser.mjs +43 -0
  177. package/dist/parsers/ollama.parser.mjs.map +1 -0
  178. package/dist/parsers/{paragraphs.parser.d.ts → paragraphs.parser.d.mts} +5 -5
  179. package/dist/parsers/paragraphs.parser.mjs +38 -0
  180. package/dist/parsers/paragraphs.parser.mjs.map +1 -0
  181. package/dist/parsers/{parser.d.ts → parser.d.mts} +6 -6
  182. package/dist/parsers/parser.mjs +45 -0
  183. package/dist/parsers/parser.mjs.map +1 -0
  184. package/dist/parsers/php-error.parser.d.mts +12 -0
  185. package/dist/parsers/php-error.parser.mjs +42 -0
  186. package/dist/parsers/php-error.parser.mjs.map +1 -0
  187. package/dist/parsers/{phrase.parser.d.ts → phrase.parser.d.mts} +3 -3
  188. package/dist/parsers/phrase.parser.mjs +15 -0
  189. package/dist/parsers/phrase.parser.mjs.map +1 -0
  190. package/dist/parsers/{regex.parser.d.ts → regex.parser.d.mts} +3 -3
  191. package/dist/parsers/regex.parser.mjs +29 -0
  192. package/dist/parsers/regex.parser.mjs.map +1 -0
  193. package/dist/parsers/server.parser.d.mts +11 -0
  194. package/dist/parsers/server.parser.mjs +57 -0
  195. package/dist/parsers/server.parser.mjs.map +1 -0
  196. package/dist/parsers/{spelling.parser.d.ts → spelling.parser.d.mts} +6 -6
  197. package/dist/parsers/spelling.parser.mjs +17 -0
  198. package/dist/parsers/spelling.parser.mjs.map +1 -0
  199. package/dist/parsers/string.parser.d.mts +8 -0
  200. package/dist/parsers/{string.parser.js → string.parser.mjs} +5 -4
  201. package/dist/parsers/string.parser.mjs.map +1 -0
  202. package/dist/parsers/{text.parser.d.ts → text.parser.d.mts} +3 -3
  203. package/dist/parsers/text.parser.mjs +30 -0
  204. package/dist/parsers/text.parser.mjs.map +1 -0
  205. package/dist/parsers/{words.parser.d.ts → words.parser.d.mts} +3 -3
  206. package/dist/parsers/words.parser.mjs +29 -0
  207. package/dist/parsers/words.parser.mjs.map +1 -0
  208. package/dist/queries/complex-english.query.d.mts +2 -0
  209. package/dist/queries/{complex-english.query.js → complex-english.query.mjs} +26 -28
  210. package/dist/queries/complex-english.query.mjs.map +1 -0
  211. package/dist/queries/flash-content.query.d.mts +2 -0
  212. package/dist/queries/flash-content.query.mjs +80 -0
  213. package/dist/queries/flash-content.query.mjs.map +1 -0
  214. package/dist/queries/linking-to-domains.query.d.mts +2 -0
  215. package/dist/queries/linking-to-domains.query.mjs +128 -0
  216. package/dist/queries/linking-to-domains.query.mjs.map +1 -0
  217. package/dist/queries/llamaguard-unsafe-content.query.d.mts +2 -0
  218. package/dist/queries/llamaguard-unsafe-content.query.mjs +90 -0
  219. package/dist/queries/llamaguard-unsafe-content.query.mjs.map +1 -0
  220. package/dist/queries/readability-score.query.d.mts +2 -0
  221. package/dist/queries/{readability-score.query.js → readability-score.query.mjs} +11 -20
  222. package/dist/queries/readability-score.query.mjs.map +1 -0
  223. package/dist/servers/{crawl.server.d.ts → crawl.server.d.mts} +10 -10
  224. package/dist/servers/crawl.server.mjs +192 -0
  225. package/dist/servers/crawl.server.mjs.map +1 -0
  226. package/dist/servers/{express.server.d.ts → express.server.d.mts} +2 -3
  227. package/dist/servers/express.server.mjs +13 -0
  228. package/dist/servers/express.server.mjs.map +1 -0
  229. package/dist/servers/{maintenance.server.d.ts → maintenance.server.d.mts} +5 -5
  230. package/dist/servers/maintenance.server.mjs +97 -0
  231. package/dist/servers/maintenance.server.mjs.map +1 -0
  232. package/dist/servers/{rest.server.d.ts → rest.server.d.mts} +4 -4
  233. package/dist/servers/rest.server.mjs +62 -0
  234. package/dist/servers/rest.server.mjs.map +1 -0
  235. package/dist/servers/{socket-io.server.d.ts → socket-io.server.d.mts} +4 -4
  236. package/dist/servers/socket-io.server.mjs +22 -0
  237. package/dist/servers/socket-io.server.mjs.map +1 -0
  238. package/dist/services/{database.service.d.ts → database.service.d.mts} +12 -12
  239. package/dist/services/database.service.mjs +645 -0
  240. package/dist/services/database.service.mjs.map +1 -0
  241. package/dist/services/{ollama-rest.service.d.ts → ollama-rest.service.d.mts} +2 -2
  242. package/dist/services/ollama-rest.service.mjs +27 -0
  243. package/dist/services/ollama-rest.service.mjs.map +1 -0
  244. package/dist/services/{rig-llama-guard.service.d.ts → rig-llama-guard.service.d.mts} +2 -2
  245. package/dist/services/{rig-llama-guard.service.js → rig-llama-guard.service.mjs} +33 -43
  246. package/dist/services/rig-llama-guard.service.mjs.map +1 -0
  247. package/dist/services/{rig-queue-length.service.d.ts → rig-queue-length.service.d.mts} +1 -1
  248. package/dist/services/rig-queue-length.service.mjs +12 -0
  249. package/dist/services/rig-queue-length.service.mjs.map +1 -0
  250. package/dist/services/{rig-subjectivity-scale.service.d.ts → rig-subjectivity-scale.service.d.mts} +2 -2
  251. package/dist/services/rig-subjectivity-scale.service.mjs +82 -0
  252. package/dist/services/rig-subjectivity-scale.service.mjs.map +1 -0
  253. package/dist/services/{rig-ticket-rest.service.d.ts → rig-ticket-rest.service.d.mts} +3 -3
  254. package/dist/services/rig-ticket-rest.service.mjs +15 -0
  255. package/dist/services/rig-ticket-rest.service.mjs.map +1 -0
  256. package/dist/services/{rig-ticketed-promise.service.d.ts → rig-ticketed-promise.service.d.mts} +4 -6
  257. package/dist/services/{rig-ticketed-promise.service.js → rig-ticketed-promise.service.mjs} +7 -6
  258. package/dist/services/rig-ticketed-promise.service.mjs.map +1 -0
  259. package/dist/types/{tcrawl-config.js → tcrawl-config.mjs} +2 -2
  260. package/dist/types/tcrawl-config.mjs.map +1 -0
  261. package/dist/types/{thydra-config.js → thydra-config.mjs} +2 -2
  262. package/dist/types/thydra-config.mjs.map +1 -0
  263. package/dist/types/tparser-ctor.d.mts +7 -0
  264. package/dist/types/tparser-ctor.mjs +2 -0
  265. package/dist/types/tparser-ctor.mjs.map +1 -0
  266. package/dist/types/tquery.d.mts +7 -0
  267. package/dist/types/tquery.mjs +2 -0
  268. package/dist/types/tquery.mjs.map +1 -0
  269. package/dist/types/tqueue-length.mjs +2 -0
  270. package/dist/types/tqueue-length.mjs.map +1 -0
  271. package/dist/types/{trobots-config.js → trobots-config.mjs} +2 -2
  272. package/dist/types/trobots-config.mjs.map +1 -0
  273. package/package.json +37 -43
  274. package/dist/apis/autocomplete.api.js +0 -22
  275. package/dist/apis/autocomplete.api.js.map +0 -1
  276. package/dist/apis/bugs.api.js +0 -26
  277. package/dist/apis/bugs.api.js.map +0 -1
  278. package/dist/apis/crawl.api.js +0 -25
  279. package/dist/apis/crawl.api.js.map +0 -1
  280. package/dist/apis/domains.api.js.map +0 -1
  281. package/dist/apis/images.api.js +0 -27
  282. package/dist/apis/images.api.js.map +0 -1
  283. package/dist/apis/statistics.api.js +0 -41
  284. package/dist/apis/statistics.api.js.map +0 -1
  285. package/dist/apis/test.api.js +0 -19
  286. package/dist/apis/test.api.js.map +0 -1
  287. package/dist/apis/urls.api.js +0 -29
  288. package/dist/apis/urls.api.js.map +0 -1
  289. package/dist/apps/cleanup.app.js +0 -151
  290. package/dist/apps/cleanup.app.js.map +0 -1
  291. package/dist/apps/cross-populate-export.app.js +0 -75
  292. package/dist/apps/cross-populate-export.app.js.map +0 -1
  293. package/dist/apps/cross-populate-import.app.js +0 -100
  294. package/dist/apps/cross-populate-import.app.js.map +0 -1
  295. package/dist/apps/denylist.app.js +0 -132
  296. package/dist/apps/denylist.app.js.map +0 -1
  297. package/dist/apps/expire.app.js +0 -63
  298. package/dist/apps/expire.app.js.map +0 -1
  299. package/dist/apps/export-domain-urls.js +0 -99
  300. package/dist/apps/export-domain-urls.js.map +0 -1
  301. package/dist/apps/extract-text.app.js +0 -55
  302. package/dist/apps/extract-text.app.js.map +0 -1
  303. package/dist/apps/hydra.app.js +0 -218
  304. package/dist/apps/hydra.app.js.map +0 -1
  305. package/dist/apps/import.app.js +0 -57
  306. package/dist/apps/import.app.js.map +0 -1
  307. package/dist/apps/internal-hydra-common.app.js.map +0 -1
  308. package/dist/apps/move-to-archive.app.js +0 -46
  309. package/dist/apps/move-to-archive.app.js.map +0 -1
  310. package/dist/apps/prune-archive.app.js +0 -55
  311. package/dist/apps/prune-archive.app.js.map +0 -1
  312. package/dist/apps/query.app.js +0 -80
  313. package/dist/apps/query.app.js.map +0 -1
  314. package/dist/apps/reattempt.app.js +0 -83
  315. package/dist/apps/reattempt.app.js.map +0 -1
  316. package/dist/apps/requeue-domain.app.js +0 -64
  317. package/dist/apps/requeue-domain.app.js.map +0 -1
  318. package/dist/apps/seed.app.js +0 -69
  319. package/dist/apps/seed.app.js.map +0 -1
  320. package/dist/apps/startup.app.js +0 -64
  321. package/dist/apps/startup.app.js.map +0 -1
  322. package/dist/apps/unarchive-urls.app.js +0 -83
  323. package/dist/apps/unarchive-urls.app.js.map +0 -1
  324. package/dist/classes/cleaner.js +0 -266
  325. package/dist/classes/cleaner.js.map +0 -1
  326. package/dist/classes/crawler.js.map +0 -1
  327. package/dist/classes/dns.js.map +0 -1
  328. package/dist/classes/expirer.js +0 -121
  329. package/dist/classes/expirer.js.map +0 -1
  330. package/dist/classes/expiry.js.map +0 -1
  331. package/dist/classes/lists.js.map +0 -1
  332. package/dist/classes/robot.js +0 -82
  333. package/dist/classes/robot.js.map +0 -1
  334. package/dist/classes/tracker.js +0 -120
  335. package/dist/classes/tracker.js.map +0 -1
  336. package/dist/cli.js.map +0 -1
  337. package/dist/enums/eavailable-strategy.js.map +0 -1
  338. package/dist/enums/elist.js.map +0 -1
  339. package/dist/enums/eserver.js.map +0 -1
  340. package/dist/enums/ex-powered-by.js.map +0 -1
  341. package/dist/helpers/matcher.js.map +0 -1
  342. package/dist/helpers/random.js.map +0 -1
  343. package/dist/helpers/utf-decoder.js.map +0 -1
  344. package/dist/interfaces/iexpiry.js.map +0 -1
  345. package/dist/interfaces/imatch.js.map +0 -1
  346. package/dist/interfaces/iparser-config.js.map +0 -1
  347. package/dist/interfaces/iparser.js +0 -2
  348. package/dist/interfaces/iparser.js.map +0 -1
  349. package/dist/interfaces/irequest-outcome.js +0 -2
  350. package/dist/interfaces/irequest-outcome.js.map +0 -1
  351. package/dist/interfaces/iserver.js +0 -2
  352. package/dist/interfaces/iserver.js.map +0 -1
  353. package/dist/parsers/accessibility-metrics.parser.d.ts +0 -11
  354. package/dist/parsers/accessibility-metrics.parser.js +0 -40
  355. package/dist/parsers/accessibility-metrics.parser.js.map +0 -1
  356. package/dist/parsers/asp-error.parser.d.ts +0 -12
  357. package/dist/parsers/asp-error.parser.js +0 -48
  358. package/dist/parsers/asp-error.parser.js.map +0 -1
  359. package/dist/parsers/bad-words.parser.js +0 -27
  360. package/dist/parsers/bad-words.parser.js.map +0 -1
  361. package/dist/parsers/complex-english.parser.d.ts +0 -15
  362. package/dist/parsers/complex-english.parser.js +0 -61
  363. package/dist/parsers/complex-english.parser.js.map +0 -1
  364. package/dist/parsers/data.parser.d.ts +0 -14
  365. package/dist/parsers/data.parser.js.map +0 -1
  366. package/dist/parsers/dictionary.parser.js +0 -73
  367. package/dist/parsers/dictionary.parser.js.map +0 -1
  368. package/dist/parsers/html.parser.d.ts +0 -13
  369. package/dist/parsers/html.parser.js.map +0 -1
  370. package/dist/parsers/hyperlinks.parser.d.ts +0 -20
  371. package/dist/parsers/hyperlinks.parser.js +0 -115
  372. package/dist/parsers/hyperlinks.parser.js.map +0 -1
  373. package/dist/parsers/image-tags.parser.d.ts +0 -19
  374. package/dist/parsers/image-tags.parser.js +0 -52
  375. package/dist/parsers/image-tags.parser.js.map +0 -1
  376. package/dist/parsers/interest.parser.js +0 -69
  377. package/dist/parsers/interest.parser.js.map +0 -1
  378. package/dist/parsers/jpeg.parser.d.ts +0 -11
  379. package/dist/parsers/jpeg.parser.js +0 -39
  380. package/dist/parsers/jpeg.parser.js.map +0 -1
  381. package/dist/parsers/llama-guard.parser.js +0 -65
  382. package/dist/parsers/llama-guard.parser.js.map +0 -1
  383. package/dist/parsers/offence.parser.js +0 -69
  384. package/dist/parsers/offence.parser.js.map +0 -1
  385. package/dist/parsers/ollama.parser.js +0 -51
  386. package/dist/parsers/ollama.parser.js.map +0 -1
  387. package/dist/parsers/paragraphs.parser.js +0 -49
  388. package/dist/parsers/paragraphs.parser.js.map +0 -1
  389. package/dist/parsers/parser.js +0 -57
  390. package/dist/parsers/parser.js.map +0 -1
  391. package/dist/parsers/php-error.parser.d.ts +0 -12
  392. package/dist/parsers/php-error.parser.js +0 -52
  393. package/dist/parsers/php-error.parser.js.map +0 -1
  394. package/dist/parsers/phrase.parser.js +0 -26
  395. package/dist/parsers/phrase.parser.js.map +0 -1
  396. package/dist/parsers/regex.parser.js +0 -43
  397. package/dist/parsers/regex.parser.js.map +0 -1
  398. package/dist/parsers/server.parser.d.ts +0 -11
  399. package/dist/parsers/server.parser.js +0 -67
  400. package/dist/parsers/server.parser.js.map +0 -1
  401. package/dist/parsers/spelling.parser.js +0 -27
  402. package/dist/parsers/spelling.parser.js.map +0 -1
  403. package/dist/parsers/string.parser.d.ts +0 -8
  404. package/dist/parsers/string.parser.js.map +0 -1
  405. package/dist/parsers/text.parser.js +0 -41
  406. package/dist/parsers/text.parser.js.map +0 -1
  407. package/dist/parsers/words.parser.js +0 -40
  408. package/dist/parsers/words.parser.js.map +0 -1
  409. package/dist/queries/complex-english.query.d.ts +0 -2
  410. package/dist/queries/complex-english.query.js.map +0 -1
  411. package/dist/queries/flash-content.query.d.ts +0 -2
  412. package/dist/queries/flash-content.query.js +0 -82
  413. package/dist/queries/flash-content.query.js.map +0 -1
  414. package/dist/queries/linking-to-domains.query.d.ts +0 -2
  415. package/dist/queries/linking-to-domains.query.js +0 -130
  416. package/dist/queries/linking-to-domains.query.js.map +0 -1
  417. package/dist/queries/llamaguard-unsafe-content.query.d.ts +0 -2
  418. package/dist/queries/llamaguard-unsafe-content.query.js +0 -92
  419. package/dist/queries/llamaguard-unsafe-content.query.js.map +0 -1
  420. package/dist/queries/readability-score.query.d.ts +0 -2
  421. package/dist/queries/readability-score.query.js.map +0 -1
  422. package/dist/servers/crawl.server.js +0 -198
  423. package/dist/servers/crawl.server.js.map +0 -1
  424. package/dist/servers/express.server.js +0 -13
  425. package/dist/servers/express.server.js.map +0 -1
  426. package/dist/servers/maintenance.server.js +0 -103
  427. package/dist/servers/maintenance.server.js.map +0 -1
  428. package/dist/servers/rest.server.js +0 -46
  429. package/dist/servers/rest.server.js.map +0 -1
  430. package/dist/servers/socket-io.server.js +0 -61
  431. package/dist/servers/socket-io.server.js.map +0 -1
  432. package/dist/services/database.service.js +0 -795
  433. package/dist/services/database.service.js.map +0 -1
  434. package/dist/services/ollama-rest.service.js +0 -46
  435. package/dist/services/ollama-rest.service.js.map +0 -1
  436. package/dist/services/rig-llama-guard.service.js.map +0 -1
  437. package/dist/services/rig-queue-length.service.js +0 -22
  438. package/dist/services/rig-queue-length.service.js.map +0 -1
  439. package/dist/services/rig-subjectivity-scale.service.js +0 -96
  440. package/dist/services/rig-subjectivity-scale.service.js.map +0 -1
  441. package/dist/services/rig-ticket-rest.service.js +0 -29
  442. package/dist/services/rig-ticket-rest.service.js.map +0 -1
  443. package/dist/services/rig-ticketed-promise.service.js.map +0 -1
  444. package/dist/types/tcrawl-config.js.map +0 -1
  445. package/dist/types/thydra-config.js.map +0 -1
  446. package/dist/types/tparser-ctor.d.ts +0 -7
  447. package/dist/types/tparser-ctor.js +0 -2
  448. package/dist/types/tparser-ctor.js.map +0 -1
  449. package/dist/types/tquery.d.ts +0 -7
  450. package/dist/types/tquery.js +0 -2
  451. package/dist/types/tquery.js.map +0 -1
  452. package/dist/types/tqueue-length.js +0 -2
  453. package/dist/types/tqueue-length.js.map +0 -1
  454. package/dist/types/trobots-config.js.map +0 -1
  455. /package/dist/classes/{dns.d.ts → dns.d.mts} +0 -0
  456. /package/dist/{cli.d.ts → cli.d.mts} +0 -0
  457. /package/dist/enums/{eavailable-strategy.d.ts → eavailable-strategy.d.mts} +0 -0
  458. /package/dist/enums/{elist.d.ts → elist.d.mts} +0 -0
  459. /package/dist/enums/{eserver.d.ts → eserver.d.mts} +0 -0
  460. /package/dist/enums/{ex-powered-by.d.ts → ex-powered-by.d.mts} +0 -0
  461. /package/dist/interfaces/{imatch.d.ts → imatch.d.mts} +0 -0
  462. /package/dist/interfaces/{iparser-config.d.ts → iparser-config.d.mts} +0 -0
  463. /package/dist/interfaces/{iserver.d.ts → iserver.d.mts} +0 -0
  464. /package/dist/types/{tcrawl-config.d.ts → tcrawl-config.d.mts} +0 -0
  465. /package/dist/types/{thydra-config.d.ts → thydra-config.d.mts} +0 -0
  466. /package/dist/types/{tqueue-length.d.ts → tqueue-length.d.mts} +0 -0
  467. /package/dist/types/{trobots-config.d.ts → trobots-config.d.mts} +0 -0
@@ -1,795 +0,0 @@
1
- var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
2
- function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
3
- return new (P || (P = Promise))(function (resolve, reject) {
4
- function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
5
- function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
6
- function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
7
- step((generator = generator.apply(thisArg, _arguments || [])).next());
8
- });
9
- };
10
- var __asyncValues = (this && this.__asyncValues) || function (o) {
11
- if (!Symbol.asyncIterator) throw new TypeError("Symbol.asyncIterator is not defined.");
12
- var m = o[Symbol.asyncIterator], i;
13
- return m ? m.call(o) : (o = typeof __values === "function" ? __values(o) : o[Symbol.iterator](), i = {}, verb("next"), verb("throw"), verb("return"), i[Symbol.asyncIterator] = function () { return this; }, i);
14
- function verb(n) { i[n] = o[n] && function (v) { return new Promise(function (resolve, reject) { v = o[n](v), settle(resolve, reject, v.done, v.value); }); }; }
15
- function settle(resolve, reject, d, v) { Promise.resolve(v).then(function(v) { resolve({ value: v, done: d }); }, reject); }
16
- };
17
- import { URL } from 'url';
18
- import mongodb from 'mongodb';
19
- import { commonsObjectStripNulls, commonsStringRegexLike, commonsTypeAttemptNumber, commonsTypeHasPropertyNumber, commonsTypeHasPropertyString, commonsTypeHasPropertyTArray } from 'tscommons-es-core';
20
- import { isIUrl } from 'hydra-crawler-ts-assets';
21
- import { isTPhpError } from 'hydra-crawler-ts-assets';
22
- import { isTAspError } from 'hydra-crawler-ts-assets';
23
- import { isTDomain } from 'hydra-crawler-ts-assets';
24
- import { isTLink } from 'hydra-crawler-ts-assets';
25
- import { EStatus, toEStatus } from 'hydra-crawler-ts-assets';
26
- import { EComparator } from 'hydra-crawler-ts-assets';
27
- import { commonsOutputDebug, commonsOutputDoing, commonsOutputError, commonsOutputProgress, commonsOutputResult, commonsOutputSuccess } from 'nodecommons-es-cli';
28
- import { CommonsMongodbService } from 'nodecommons-es-database-mongodb';
29
- import { EAvailableStrategy } from '../enums/eavailable-strategy';
30
- export function isTMongoIdRow(test) {
31
- if (!commonsTypeHasPropertyString(test, '_id'))
32
- return false;
33
- return true;
34
- }
35
- export function isTMongoIdTallyRow(test) {
36
- if (!isTMongoIdRow(test))
37
- return false;
38
- if (!commonsTypeHasPropertyNumber(test, 'tally'))
39
- return false;
40
- return true;
41
- }
42
- export class DatabaseService extends CommonsMongodbService {
43
- getDomains() {
44
- if (!this.domains)
45
- throw new Error('Domains collected has not been instantiated yet');
46
- return this.domains;
47
- }
48
- getUrls() {
49
- if (!this.urls)
50
- throw new Error('Urls collected has not been instantiated yet');
51
- return this.urls;
52
- }
53
- getLinks() {
54
- if (!this.links)
55
- throw new Error('Links collected has not been instantiated yet');
56
- return this.links;
57
- }
58
- getArchiveds() {
59
- if (!this.archiveds)
60
- throw new Error('Archiveds collected has not been instantiated yet');
61
- return this.archiveds;
62
- }
63
- init() {
64
- const _super = Object.create(null, {
65
- init: { get: () => super.init }
66
- });
67
- return __awaiter(this, void 0, void 0, function* () {
68
- yield _super.init.call(this);
69
- if (!this.database)
70
- throw new Error('Database has not been instantiated yet');
71
- this.domains = yield this.ensureCollection('domains');
72
- yield this.domains.createIndex({ domain: 1 }, { unique: true });
73
- yield this.domains.createIndex({ ip: 1 }, { unique: false });
74
- this.urls = yield this.ensureCollection('urls');
75
- yield this.urls.createIndex({ url: 1 }, { unique: true });
76
- yield this.urls.createIndex({ domain: 1 }, { unique: false });
77
- yield this.urls.createIndex({ status: 1 }, { unique: false });
78
- yield this.urls.createIndex({ attempted: 1 }, { unique: false });
79
- yield this.urls.createIndex({ orphan: 1 }, { unique: false });
80
- yield this.urls.createIndex({ statusCode: 1 }, { unique: false });
81
- yield this.urls.createIndex({ 'headers.content-type': 1 }, { unique: false });
82
- yield this.urls.createIndex({ 'headers.content-length': 1 }, { unique: false });
83
- this.links = yield this.ensureCollection('links');
84
- yield this.links.createIndex({ url: 1 }, { unique: false });
85
- yield this.links.createIndex({ outgoing: 1 }, { unique: false });
86
- yield this.links.createIndex({ url: 1, outgoing: 1 }, { unique: true });
87
- this.archiveds = yield this.ensureCollection('archiveds');
88
- // no indices for the archiveds
89
- });
90
- }
91
- initParser(ctor) {
92
- return __awaiter(this, void 0, void 0, function* () {
93
- const parser = new ctor();
94
- yield parser.init(this);
95
- });
96
- }
97
- getRawDatabase() {
98
- return super.getRawDatabase();
99
- }
100
- wipe() {
101
- return __awaiter(this, void 0, void 0, function* () {
102
- yield this.getLinks().deleteMany({});
103
- yield this.getUrls().deleteMany({});
104
- yield this.getDomains().deleteMany({});
105
- });
106
- }
107
- resetActive() {
108
- return __awaiter(this, void 0, void 0, function* () {
109
- yield this.getUrls().updateMany({ status: EStatus.ACTIVE }, { $set: { status: EStatus.QUEUED } });
110
- });
111
- }
112
- domain(domain, ip) {
113
- return __awaiter(this, void 0, void 0, function* () {
114
- try {
115
- yield this.getDomains().updateOne({ domain: domain }, { $set: { ip: ip } }, { upsert: true });
116
- return true;
117
- }
118
- catch (ex) {
119
- commonsOutputDebug('debug position 7');
120
- console.log(ex);
121
- return false;
122
- }
123
- });
124
- }
125
- queue(url, isDeny) {
126
- return __awaiter(this, void 0, void 0, function* () {
127
- const whatwg = new URL(url);
128
- if (!whatwg.protocol.match(/^http(s?):$/))
129
- return false;
130
- try {
131
- const status = isDeny ? EStatus.DENY : EStatus.QUEUED;
132
- // un-archive if currently archived
133
- yield this.getUrls().deleteOne({ url: url, status: EStatus.ARCHIVED });
134
- yield this.getUrls().insertOne({
135
- url: url,
136
- domain: whatwg.hostname,
137
- status: status
138
- });
139
- return true;
140
- }
141
- catch (ex) {
142
- return false;
143
- }
144
- });
145
- }
146
- available(strategy, threshold, limit, existing, restrictTo) {
147
- return __awaiter(this, void 0, void 0, function* () {
148
- if (limit === 0)
149
- return [];
150
- const comparator = strategy === EAvailableStrategy.LARGEST ? EComparator.GTE : EComparator.LT;
151
- const thresholdMatch = { tally: {} };
152
- switch (comparator) {
153
- case EComparator.GTE:
154
- thresholdMatch.tally = { $gte: threshold };
155
- break;
156
- case EComparator.LT:
157
- thresholdMatch.tally = { $lt: threshold };
158
- break;
159
- }
160
- const sortOrder = strategy === EAvailableStrategy.SMALLEST ? 1 : -1;
161
- const domainMatch = { $nin: existing };
162
- if (restrictTo.length > 0)
163
- domainMatch['$in'] = restrictTo;
164
- const results = this.getUrls().aggregate([
165
- { $match: { status: EStatus.QUEUED, domain: domainMatch } },
166
- { $group: { _id: '$domain', tally: { $sum: 1 } } },
167
- { $match: thresholdMatch },
168
- { $sort: { tally: sortOrder } },
169
- { $limit: limit }
170
- ], { allowDiskUse: true });
171
- const entries = yield this.listQueryResults(results, isTMongoIdTallyRow);
172
- return entries
173
- // eslint-disable-next-line no-underscore-dangle
174
- .map((entry) => entry._id);
175
- });
176
- }
177
- next(domain) {
178
- return __awaiter(this, void 0, void 0, function* () {
179
- const next = yield this.getUrls().findOne({
180
- status: EStatus.QUEUED,
181
- domain: domain
182
- });
183
- if (next === null)
184
- return undefined;
185
- return next.url;
186
- });
187
- }
188
- setStatus(url, status) {
189
- return __awaiter(this, void 0, void 0, function* () {
190
- const updates = { status: status };
191
- if (![EStatus.QUEUED, EStatus.ACTIVE].includes(status))
192
- updates['attempted'] = new Date();
193
- if (status === EStatus.DONE)
194
- updates['done'] = new Date();
195
- try {
196
- yield this.getUrls().updateOne({ url: url }, { $set: updates });
197
- return true;
198
- }
199
- catch (ex) {
200
- console.error(ex);
201
- return false;
202
- }
203
- });
204
- }
205
- setStatusCode(url, code) {
206
- return __awaiter(this, void 0, void 0, function* () {
207
- try {
208
- yield this.getUrls().updateOne({ url: url }, { $set: { statusCode: code } });
209
- return true;
210
- }
211
- catch (ex) {
212
- console.error(ex);
213
- return false;
214
- }
215
- });
216
- }
217
- setHeaders(url, headers) {
218
- return __awaiter(this, void 0, void 0, function* () {
219
- try {
220
- yield this.getUrls().updateOne({ url: url }, { $set: { headers: headers } });
221
- return true;
222
- }
223
- catch (ex) {
224
- console.error(ex);
225
- return false;
226
- }
227
- });
228
- }
229
- setData(url, context, data) {
230
- return __awaiter(this, void 0, void 0, function* () {
231
- try {
232
- const update = {};
233
- update[context] = data;
234
- yield this.getUrls().updateOne({ url: url }, { $set: update });
235
- return true;
236
- }
237
- catch (ex) {
238
- console.error(ex);
239
- return false;
240
- }
241
- });
242
- }
243
- unsetData(url, context) {
244
- return __awaiter(this, void 0, void 0, function* () {
245
- try {
246
- const update = {};
247
- update[context] = true;
248
- yield this.getUrls().updateOne({ url: url }, { $unset: update });
249
- return true;
250
- }
251
- catch (ex) {
252
- console.error(ex);
253
- return false;
254
- }
255
- });
256
- }
257
- getTtl(url) {
258
- return __awaiter(this, void 0, void 0, function* () {
259
- const row = yield this.getUrls().findOne({ url: url });
260
- if (row === null)
261
- return undefined;
262
- if (!commonsTypeHasPropertyNumber(row, 'ttl'))
263
- return undefined;
264
- return row['ttl'];
265
- });
266
- }
267
- setTtl(url, ttl) {
268
- return __awaiter(this, void 0, void 0, function* () {
269
- try {
270
- yield this.getUrls().updateOne({ url: url }, { $set: { ttl: ttl } });
271
- return true;
272
- }
273
- catch (ex) {
274
- console.error(ex);
275
- return false;
276
- }
277
- });
278
- }
279
- unsetTtl(url) {
280
- return __awaiter(this, void 0, void 0, function* () {
281
- try {
282
- yield this.getUrls().updateOne({ url: url }, { $unset: { ttl: true } });
283
- return true;
284
- }
285
- catch (ex) {
286
- console.error(ex);
287
- return false;
288
- }
289
- });
290
- }
291
- getHash(url) {
292
- return __awaiter(this, void 0, void 0, function* () {
293
- const row = yield this.getUrls().findOne({ url: url });
294
- if (row === null)
295
- return undefined;
296
- if (!commonsTypeHasPropertyString(row, 'hash'))
297
- return undefined;
298
- return row['hash'];
299
- });
300
- }
301
- setHash(url, hash) {
302
- return __awaiter(this, void 0, void 0, function* () {
303
- try {
304
- yield this.getUrls().updateOne({ url: url }, { $set: {
305
- hash: hash,
306
- hashSet: new Date()
307
- } });
308
- return true;
309
- }
310
- catch (ex) {
311
- console.error(ex);
312
- return false;
313
- }
314
- });
315
- }
316
- unsetHash(url) {
317
- return __awaiter(this, void 0, void 0, function* () {
318
- try {
319
- yield this.getUrls().updateOne({ url: url }, { $unset: {
320
- hash: true,
321
- hashSet: true
322
- } });
323
- return true;
324
- }
325
- catch (ex) {
326
- console.error(ex);
327
- return false;
328
- }
329
- });
330
- }
331
- setFailReason(url, reason) {
332
- return __awaiter(this, void 0, void 0, function* () {
333
- try {
334
- yield this.getUrls().updateOne({ url: url }, { $set: {
335
- reason: reason
336
- } });
337
- return true;
338
- }
339
- catch (ex) {
340
- console.error(ex);
341
- return false;
342
- }
343
- });
344
- }
345
- unsetFailReason(url) {
346
- return __awaiter(this, void 0, void 0, function* () {
347
- try {
348
- yield this.getUrls().updateOne({ url: url }, { $unset: {
349
- reason: true
350
- } });
351
- return true;
352
- }
353
- catch (ex) {
354
- console.error(ex);
355
- return false;
356
- }
357
- });
358
- }
359
- link(url, links) {
360
- return __awaiter(this, void 0, void 0, function* () {
361
- // more efficient to only remove removed and only add new
362
- // rather than just wiping all existing and re-adding
363
- const find = this.getLinks().find({
364
- url: url
365
- }, {});
366
- const existing = (yield this.listQueryResults(find, isTLink))
367
- .map((link) => link.outgoing);
368
- const removed = [];
369
- const added = [];
370
- for (const link of links) {
371
- if (!existing.includes(link) && !added.includes(link))
372
- added.push(link);
373
- }
374
- for (const link of existing) {
375
- if (!links.includes(link))
376
- removed.push(link);
377
- }
378
- for (const outgoing of removed) {
379
- try {
380
- yield this.getLinks().deleteMany({
381
- url: url,
382
- outgoing: outgoing
383
- });
384
- }
385
- catch (ex) {
386
- /* do nothing */
387
- }
388
- }
389
- for (const outgoing of added) {
390
- try {
391
- yield this.getLinks().insertOne({
392
- url: url,
393
- outgoing: outgoing
394
- });
395
- }
396
- catch (ex) {
397
- switch (ex.code || -1) {
398
- case 11000:
399
- // ignore duplicates
400
- commonsOutputError(`DUPLICATE: ${url}, ${outgoing}`);
401
- break;
402
- case 17280:
403
- case 17282:
404
- commonsOutputError(`INDEX TOO LARGE: ${url}, ${outgoing}`);
405
- // ignore index too large
406
- break;
407
- default:
408
- commonsOutputDebug('debug position 8');
409
- console.log(ex);
410
- throw ex;
411
- }
412
- }
413
- }
414
- return true;
415
- });
416
- }
417
- markDead(domain) {
418
- return __awaiter(this, void 0, void 0, function* () {
419
- try {
420
- yield this.getUrls().updateMany({ domain: domain, status: { $in: [EStatus.QUEUED, EStatus.ACTIVE] } }, { $set: { status: EStatus.DEAD, attempted: new Date() } });
421
- return true;
422
- }
423
- catch (ex) {
424
- console.error(ex);
425
- return false;
426
- }
427
- });
428
- }
429
- listStatusTallies() {
430
- return __awaiter(this, void 0, void 0, function* () {
431
- const results = this.getUrls().aggregate([
432
- { $match: { status: { $ne: EStatus.ARCHIVED } } },
433
- { $group: {
434
- _id: '$status',
435
- tally: { $sum: 1 }
436
- } }
437
- ]);
438
- const rows = yield this.listQueryResults(results, isTMongoIdTallyRow);
439
- const map = new Map();
440
- for (const row of rows) {
441
- // eslint-disable-next-line no-underscore-dangle
442
- const status = toEStatus(row._id);
443
- if (status)
444
- map.set(status, row.tally);
445
- }
446
- return map;
447
- });
448
- }
449
- getLinkTalliesCount() {
450
- return __awaiter(this, void 0, void 0, function* () {
451
- return yield this.getLinks().estimatedDocumentCount();
452
- });
453
- }
454
- getDomainTalliesCount() {
455
- return __awaiter(this, void 0, void 0, function* () {
456
- return yield this.getDomains().estimatedDocumentCount();
457
- });
458
- }
459
- listDomainQueuedTallies() {
460
- return __awaiter(this, void 0, void 0, function* () {
461
- const results = this.getUrls().aggregate([
462
- { $match: {
463
- status: EStatus.QUEUED
464
- } },
465
- { $group: {
466
- _id: '$domain',
467
- tally: { $sum: 1 }
468
- } }
469
- ]);
470
- const rows = yield this.listQueryResults(results, isTMongoIdTallyRow);
471
- const map = new Map();
472
- for (const row of rows) {
473
- // eslint-disable-next-line no-underscore-dangle
474
- map.set(row._id, row.tally);
475
- }
476
- return map;
477
- });
478
- }
479
- listPhpErrors() {
480
- return __awaiter(this, void 0, void 0, function* () {
481
- const results = this.getUrls().find({
482
- status: { $ne: EStatus.ARCHIVED },
483
- phpErrors: { $exists: true }
484
- }, {});
485
- return (yield this.listQueryResults(results, isIUrl))
486
- .map((url) => {
487
- if (!commonsTypeHasPropertyTArray(url, 'phpErrors', isTPhpError))
488
- throw new Error('Invalid PHP error object');
489
- return {
490
- url: url.url,
491
- errors: url.phpErrors
492
- };
493
- });
494
- });
495
- }
496
- listAspErrors() {
497
- return __awaiter(this, void 0, void 0, function* () {
498
- const results = this.getUrls().find({
499
- status: { $ne: EStatus.ARCHIVED },
500
- aspErrors: { $exists: true }
501
- }, {});
502
- return (yield this.listQueryResults(results, isIUrl))
503
- .map((url) => {
504
- if (!commonsTypeHasPropertyTArray(url, 'aspErrors', isTAspError))
505
- throw new Error('Invalid PHP error object');
506
- return {
507
- url: url.url,
508
- errors: url.aspErrors
509
- };
510
- });
511
- });
512
- }
513
- listDone200DomainUrls(domain) {
514
- return __awaiter(this, void 0, void 0, function* () {
515
- const results = this.getUrls().find({ $and: [
516
- { domain: domain },
517
- { status: EStatus.DONE },
518
- { statusCode: { $gte: 200 } },
519
- { statusCode: { $lt: 300 } }
520
- ] }, {});
521
- return (yield this.listQueryResults(results, isIUrl))
522
- .map((url) => url.url);
523
- });
524
- }
525
- listDomains() {
526
- return __awaiter(this, void 0, void 0, function* () {
527
- const results = this.getDomains().find(
528
- // eslint-disable-next-line @typescript-eslint/no-unsafe-assignment
529
- { ip: { $exists: true, $ne: null } }, // this is ok, despite the type objection to null
530
- {});
531
- // since we're doing $ne: null above, we don't need to strip nulls, as there won't be any
532
- return yield this.listQueryResults(results, isTDomain);
533
- });
534
- }
535
- listDomainsByLike(term) {
536
- return __awaiter(this, void 0, void 0, function* () {
537
- const results = this.getDomains().find({
538
- domain: new RegExp(commonsStringRegexLike(`%${term}%`), 'i')
539
- }, {});
540
- return (yield this.listQueryResults(results, isTDomain))
541
- .map((encoded) => commonsObjectStripNulls(encoded));
542
- });
543
- }
544
- listInboundLinks(url) {
545
- return __awaiter(this, void 0, void 0, function* () {
546
- const results = this.getLinks().find({ outgoing: url }, {});
547
- return (yield this.listQueryResults(results, isTLink))
548
- .map((link) => link.url);
549
- });
550
- }
551
- listOutboundLinks(url) {
552
- return __awaiter(this, void 0, void 0, function* () {
553
- const results = this.getLinks().find({ url: url }, {});
554
- return (yield this.listQueryResults(results, isTLink))
555
- .map((link) => link.outgoing);
556
- });
557
- }
558
- listImagesBySizeThreshold(size, comparator) {
559
- return __awaiter(this, void 0, void 0, function* () {
560
- const queries = [
561
- { status: EStatus.DONE },
562
- { 'headers.content-type': /^image\/(jpeg)/ },
563
- { 'headers.content-length': { $exists: true } }
564
- ];
565
- switch (comparator) {
566
- case EComparator.GT:
567
- queries.push({ 'headers.content-length': { $gt: size } });
568
- break;
569
- case EComparator.LT:
570
- queries.push({ 'headers.content-length': { $lt: size } });
571
- break;
572
- case EComparator.GTE:
573
- queries.push({ 'headers.content-length': { $gte: size } });
574
- break;
575
- case EComparator.LTE:
576
- queries.push({ 'headers.content-length': { $lte: size } });
577
- break;
578
- }
579
- const results = this.getUrls().find({ $and: queries }, {});
580
- return (yield this.listQueryResults(results, isIUrl))
581
- .map((row) => ({
582
- url: row.url,
583
- size: commonsTypeAttemptNumber(row['headers']['content-length']) || -1
584
- }));
585
- });
586
- }
587
- insertArchiveBatch(documents) {
588
- var _a, documents_1, documents_1_1;
589
- var _b, e_1, _c, _d;
590
- return __awaiter(this, void 0, void 0, function* () {
591
- const bulkInsert = this.getArchiveds().initializeUnorderedBulkOp();
592
- const insertIds = [];
593
- try {
594
- for (_a = true, documents_1 = __asyncValues(documents); documents_1_1 = yield documents_1.next(), _b = documents_1_1.done, !_b;) {
595
- _d = documents_1_1.value;
596
- _a = false;
597
- try {
598
- const document = _d;
599
- const typecast = document;
600
- // eslint-disable-next-line no-underscore-dangle
601
- const id = typecast._id;
602
- // Insert without raising an error for duplicates
603
- bulkInsert
604
- .find({ _id: id })
605
- .upsert()
606
- .replaceOne(document);
607
- insertIds.push(id);
608
- }
609
- finally {
610
- _a = true;
611
- }
612
- }
613
- }
614
- catch (e_1_1) { e_1 = { error: e_1_1 }; }
615
- finally {
616
- try {
617
- if (!_a && !_b && (_c = documents_1.return)) yield _c.call(documents_1);
618
- }
619
- finally { if (e_1) throw e_1.error; }
620
- }
621
- if (insertIds.length > 0)
622
- yield bulkInsert.execute();
623
- return insertIds;
624
- });
625
- }
626
- deleteUrlsBatch(documents) {
627
- var _a, documents_2, documents_2_1;
628
- var _b, e_2, _c, _d;
629
- return __awaiter(this, void 0, void 0, function* () {
630
- // NB, this presumes that the links to and from this have already been removed by the cleaner
631
- const bulkDelete = this.getUrls().initializeUnorderedBulkOp();
632
- let tally = 0;
633
- try {
634
- for (_a = true, documents_2 = __asyncValues(documents); documents_2_1 = yield documents_2.next(), _b = documents_2_1.done, !_b;) {
635
- _d = documents_2_1.value;
636
- _a = false;
637
- try {
638
- const document = _d;
639
- const typecast = document;
640
- // eslint-disable-next-line no-underscore-dangle
641
- const id = typecast._id;
642
- bulkDelete
643
- .find({ _id: id })
644
- .deleteOne();
645
- tally++;
646
- }
647
- finally {
648
- _a = true;
649
- }
650
- }
651
- }
652
- catch (e_2_1) { e_2 = { error: e_2_1 }; }
653
- finally {
654
- try {
655
- if (!_a && !_b && (_c = documents_2.return)) yield _c.call(documents_2);
656
- }
657
- finally { if (e_2) throw e_2.error; }
658
- }
659
- if (tally > 0)
660
- yield bulkDelete.execute();
661
- });
662
- }
663
- moveArchivedUrlsToArchive(batchSize, limit) {
664
- return __awaiter(this, void 0, void 0, function* () {
665
- let total = 0;
666
- while (true) {
667
- commonsOutputDoing(`Inserting up to ${batchSize} urls entries into archiveds`);
668
- const existings = this.getUrls().find({ status: EStatus.ARCHIVED }).limit(batchSize);
669
- const done = (yield this.insertArchiveBatch(existings))
670
- .map((id) => new mongodb.ObjectId(id));
671
- yield existings.close();
672
- total += done.length;
673
- commonsOutputResult(`${done.length}, ${total} total`);
674
- if (done.length === 0)
675
- break;
676
- commonsOutputDoing('Deleting previously moved urls');
677
- const toDelete = this.getUrls().find({ _id: { $in: done } }).limit(batchSize);
678
- yield this.deleteUrlsBatch(toDelete);
679
- yield toDelete.close();
680
- if (limit !== undefined)
681
- limit -= done.length;
682
- commonsOutputSuccess();
683
- if (limit !== undefined && limit <= 0)
684
- break;
685
- }
686
- });
687
- }
688
- listDuplicateArchivedUrls() {
689
- return __awaiter(this, void 0, void 0, function* () {
690
- commonsOutputDoing('Building cursor for all archiveds');
691
- // we have to use this strange sharded increasing array of sets, as the maximum values a Set (or Map) can hold is 16777216 (2^24)
692
- const urls = [];
693
- const duplicates = new Set();
694
- let tally = 0;
695
- while (true) {
696
- let subTally = 0;
697
- const currentUrls = new Set();
698
- try {
699
- const archiveds = this.getArchiveds().find({}, {
700
- sort: { _id: 1 },
701
- skip: tally,
702
- limit: 1000000
703
- });
704
- commonsOutputSuccess();
705
- commonsOutputDoing('Searching for duplicates');
706
- yield archiveds.forEach((archived) => {
707
- try {
708
- let match = false;
709
- for (const s of [...urls, currentUrls]) {
710
- if (s.has(archived.url))
711
- match = true;
712
- }
713
- if (match) {
714
- duplicates.add(archived.url);
715
- }
716
- else {
717
- currentUrls.add(archived.url);
718
- }
719
- tally++;
720
- subTally++;
721
- if ((tally % 10000) === 0) {
722
- let total = 0;
723
- for (const s of [...urls, currentUrls])
724
- total += s.size;
725
- commonsOutputProgress(`${tally}, ${total} unique urls, ${duplicates.size} duplicates`);
726
- }
727
- }
728
- catch (e) {
729
- console.log(e);
730
- process.exit(1);
731
- }
732
- });
733
- let batchTotal = 0;
734
- for (const s of [...urls, currentUrls])
735
- batchTotal += s.size;
736
- commonsOutputResult(`${tally}, ${batchTotal} unique urls, ${duplicates.size} duplicates`);
737
- yield archiveds.close();
738
- }
739
- catch (e2) {
740
- console.log(e2);
741
- process.exit(1);
742
- }
743
- urls.push(currentUrls);
744
- if (subTally === 0)
745
- break;
746
- }
747
- return Array.from(duplicates.values());
748
- });
749
- }
750
- purgeArchiveDuplicates(url) {
751
- var _a, e_3, _b, _c;
752
- return __awaiter(this, void 0, void 0, function* () {
753
- const archiveds = this.getArchiveds().find({
754
- url: url
755
- }, {});
756
- const bulkDelete = this.getArchiveds().initializeUnorderedBulkOp();
757
- let first = true;
758
- let tally = 0;
759
- try {
760
- for (var _d = true, archiveds_1 = __asyncValues(archiveds), archiveds_1_1; archiveds_1_1 = yield archiveds_1.next(), _a = archiveds_1_1.done, !_a;) {
761
- _c = archiveds_1_1.value;
762
- _d = false;
763
- try {
764
- const document = _c;
765
- if (first) {
766
- first = false;
767
- continue;
768
- }
769
- const typecast = document;
770
- // eslint-disable-next-line no-underscore-dangle
771
- const id = typecast._id;
772
- bulkDelete
773
- .find({ _id: id })
774
- .deleteOne();
775
- tally++;
776
- }
777
- finally {
778
- _d = true;
779
- }
780
- }
781
- }
782
- catch (e_3_1) { e_3 = { error: e_3_1 }; }
783
- finally {
784
- try {
785
- if (!_d && !_a && (_b = archiveds_1.return)) yield _b.call(archiveds_1);
786
- }
787
- finally { if (e_3) throw e_3.error; }
788
- }
789
- if (tally > 0)
790
- yield bulkDelete.execute();
791
- yield archiveds.close();
792
- });
793
- }
794
- }
795
- //# sourceMappingURL=database.service.js.map