hydra-crawler 1.4.5 → 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (305)
  1. package/dist/apis/autocomplete.api.d.ts +7 -0
  2. package/dist/apis/autocomplete.api.js +15 -9
  3. package/dist/apis/autocomplete.api.js.map +1 -0
  4. package/dist/apis/bugs.api.d.ts +7 -0
  5. package/dist/apis/bugs.api.js +21 -15
  6. package/dist/apis/bugs.api.js.map +1 -0
  7. package/dist/apis/crawl.api.d.ts +7 -0
  8. package/dist/apis/crawl.api.js +15 -9
  9. package/dist/apis/crawl.api.js.map +1 -0
  10. package/dist/apis/domains.api.d.ts +7 -0
  11. package/dist/apis/domains.api.js +24 -19
  12. package/dist/apis/domains.api.js.map +1 -0
  13. package/dist/apis/images.api.d.ts +7 -0
  14. package/dist/apis/images.api.js +20 -14
  15. package/dist/apis/images.api.js.map +1 -0
  16. package/dist/apis/statistics.api.d.ts +8 -0
  17. package/dist/apis/statistics.api.js +27 -20
  18. package/dist/apis/statistics.api.js.map +1 -0
  19. package/dist/apis/test.api.d.ts +5 -0
  20. package/dist/apis/test.api.js +15 -9
  21. package/dist/apis/test.api.js.map +1 -0
  22. package/dist/apis/urls.api.d.ts +7 -0
  23. package/dist/apis/urls.api.js +21 -15
  24. package/dist/apis/urls.api.js.map +1 -0
  25. package/dist/apps/cleanup.app.d.ts +19 -0
  26. package/dist/apps/cleanup.app.js +118 -100
  27. package/dist/apps/cleanup.app.js.map +1 -0
  28. package/dist/apps/cross-populate-export.app.d.ts +12 -0
  29. package/dist/apps/cross-populate-export.app.js +60 -47
  30. package/dist/apps/cross-populate-export.app.js.map +1 -0
  31. package/dist/apps/cross-populate-import.app.d.ts +12 -0
  32. package/dist/apps/cross-populate-import.app.js +64 -51
  33. package/dist/apps/cross-populate-import.app.js.map +1 -0
  34. package/dist/apps/denylist.app.d.ts +17 -0
  35. package/dist/apps/denylist.app.js +115 -98
  36. package/dist/apps/denylist.app.js.map +1 -0
  37. package/dist/apps/expire.app.d.ts +19 -0
  38. package/dist/apps/expire.app.js +44 -31
  39. package/dist/apps/expire.app.js.map +1 -0
  40. package/dist/apps/extract-text.app.d.ts +8 -0
  41. package/dist/apps/extract-text.app.js +43 -35
  42. package/dist/apps/extract-text.app.js.map +1 -0
  43. package/dist/apps/hydra.app.d.ts +34 -0
  44. package/dist/apps/hydra.app.js +150 -137
  45. package/dist/apps/hydra.app.js.map +1 -0
  46. package/dist/apps/import.app.d.ts +11 -0
  47. package/dist/apps/import.app.js +44 -32
  48. package/dist/apps/import.app.js.map +1 -0
  49. package/dist/apps/internal-hydra-common.app.d.ts +28 -0
  50. package/dist/apps/internal-hydra-common.app.js +5 -11
  51. package/dist/apps/internal-hydra-common.app.js.map +1 -0
  52. package/dist/apps/query.app.d.ts +20 -0
  53. package/dist/apps/query.app.js +63 -49
  54. package/dist/apps/query.app.js.map +1 -0
  55. package/dist/apps/reattempt.app.d.ts +17 -0
  56. package/dist/apps/reattempt.app.js +66 -53
  57. package/dist/apps/reattempt.app.js.map +1 -0
  58. package/dist/apps/requeue-domain.app.d.ts +13 -0
  59. package/dist/apps/requeue-domain.app.js +50 -37
  60. package/dist/apps/requeue-domain.app.js.map +1 -0
  61. package/dist/apps/seed.app.d.ts +15 -0
  62. package/dist/apps/seed.app.js +53 -40
  63. package/dist/apps/seed.app.js.map +1 -0
  64. package/dist/apps/startup.app.d.ts +11 -0
  65. package/dist/apps/startup.app.js +51 -38
  66. package/dist/apps/startup.app.js.map +1 -0
  67. package/dist/apps/unarchive.app.d.ts +15 -0
  68. package/dist/apps/unarchive.app.js +67 -54
  69. package/dist/apps/unarchive.app.js.map +1 -0
  70. package/dist/classes/cleaner.d.ts +12 -0
  71. package/dist/classes/cleaner.js +227 -207
  72. package/dist/classes/cleaner.js.map +1 -0
  73. package/dist/classes/crawler.d.ts +34 -0
  74. package/dist/classes/crawler.js +248 -241
  75. package/dist/classes/crawler.js.map +1 -0
  76. package/dist/classes/dns.d.ts +3 -0
  77. package/dist/classes/dns.js +10 -13
  78. package/dist/classes/dns.js.map +1 -0
  79. package/dist/classes/expirer.d.ts +10 -0
  80. package/dist/classes/expirer.js +107 -94
  81. package/dist/classes/expirer.js.map +1 -0
  82. package/dist/classes/expiry.d.ts +8 -0
  83. package/dist/classes/expiry.js +16 -19
  84. package/dist/classes/expiry.js.map +1 -0
  85. package/dist/classes/lists.d.ts +9 -0
  86. package/dist/classes/lists.js +13 -18
  87. package/dist/classes/lists.js.map +1 -0
  88. package/dist/classes/robot.d.ts +15 -0
  89. package/dist/classes/robot.js +40 -30
  90. package/dist/classes/robot.js.map +1 -0
  91. package/dist/classes/tracker.d.ts +25 -0
  92. package/dist/classes/tracker.js +82 -64
  93. package/dist/classes/tracker.js.map +1 -0
  94. package/dist/cli.d.ts +1 -0
  95. package/dist/cli.js +72 -65
  96. package/dist/cli.js.map +1 -0
  97. package/dist/enums/eavailable-strategy.d.ts +4 -0
  98. package/dist/enums/eavailable-strategy.js +3 -5
  99. package/dist/enums/eavailable-strategy.js.map +1 -0
  100. package/dist/enums/elist.d.ts +7 -0
  101. package/dist/enums/elist.js +7 -11
  102. package/dist/enums/elist.js.map +1 -0
  103. package/dist/enums/eserver.d.ts +8 -0
  104. package/dist/enums/eserver.js +3 -5
  105. package/dist/enums/eserver.js.map +1 -0
  106. package/dist/enums/ex-powered-by.d.ts +6 -0
  107. package/dist/enums/ex-powered-by.js +3 -5
  108. package/dist/enums/ex-powered-by.js.map +1 -0
  109. package/dist/helpers/matcher.d.ts +5 -0
  110. package/dist/helpers/matcher.js +2 -5
  111. package/dist/helpers/matcher.js.map +1 -0
  112. package/dist/helpers/random.d.ts +4 -0
  113. package/dist/helpers/random.js +2 -5
  114. package/dist/helpers/random.js.map +1 -0
  115. package/dist/helpers/utf-decoder.d.ts +4 -0
  116. package/dist/helpers/utf-decoder.js +3 -6
  117. package/dist/helpers/utf-decoder.js.map +1 -0
  118. package/dist/interfaces/iexpiry.d.ts +7 -0
  119. package/dist/interfaces/iexpiry.js +9 -13
  120. package/dist/interfaces/iexpiry.js.map +1 -0
  121. package/dist/interfaces/imatch.d.ts +6 -0
  122. package/dist/interfaces/imatch.js +6 -9
  123. package/dist/interfaces/imatch.js.map +1 -0
  124. package/dist/interfaces/iparser-config.d.ts +4 -0
  125. package/dist/interfaces/iparser-config.js +4 -7
  126. package/dist/interfaces/iparser-config.js.map +1 -0
  127. package/dist/interfaces/iparser.d.ts +8 -0
  128. package/dist/interfaces/iparser.js +2 -2
  129. package/dist/interfaces/iparser.js.map +1 -0
  130. package/dist/interfaces/irequest-outcome.d.ts +11 -0
  131. package/dist/interfaces/irequest-outcome.js +2 -2
  132. package/dist/interfaces/irequest-outcome.js.map +1 -0
  133. package/dist/interfaces/iserver.d.ts +4 -0
  134. package/dist/interfaces/iserver.js +2 -2
  135. package/dist/interfaces/iserver.js.map +1 -0
  136. package/dist/parsers/accessibility-metrics.parser.d.ts +11 -0
  137. package/dist/parsers/accessibility-metrics.parser.js +34 -26
  138. package/dist/parsers/accessibility-metrics.parser.js.map +1 -0
  139. package/dist/parsers/asp-error.parser.d.ts +12 -0
  140. package/dist/parsers/asp-error.parser.js +36 -28
  141. package/dist/parsers/asp-error.parser.js.map +1 -0
  142. package/dist/parsers/bad-words.parser.d.ts +10 -0
  143. package/dist/parsers/bad-words.parser.js +21 -13
  144. package/dist/parsers/bad-words.parser.js.map +1 -0
  145. package/dist/parsers/complex-english.parser.d.ts +15 -0
  146. package/dist/parsers/complex-english.parser.js +33 -25
  147. package/dist/parsers/complex-english.parser.js.map +1 -0
  148. package/dist/parsers/data.parser.d.ts +14 -0
  149. package/dist/parsers/data.parser.js +12 -16
  150. package/dist/parsers/data.parser.js.map +1 -0
  151. package/dist/parsers/dictionary.parser.d.ts +19 -0
  152. package/dist/parsers/dictionary.parser.js +47 -39
  153. package/dist/parsers/dictionary.parser.js.map +1 -0
  154. package/dist/parsers/html.parser.d.ts +13 -0
  155. package/dist/parsers/html.parser.js +4 -8
  156. package/dist/parsers/html.parser.js.map +1 -0
  157. package/dist/parsers/hyperlinks.parser.d.ts +20 -0
  158. package/dist/parsers/hyperlinks.parser.js +82 -77
  159. package/dist/parsers/hyperlinks.parser.js.map +1 -0
  160. package/dist/parsers/image-tags.parser.d.ts +20 -0
  161. package/dist/parsers/image-tags.parser.js +38 -34
  162. package/dist/parsers/image-tags.parser.js.map +1 -0
  163. package/dist/parsers/jpeg.parser.d.ts +11 -0
  164. package/dist/parsers/jpeg.parser.js +28 -20
  165. package/dist/parsers/jpeg.parser.js.map +1 -0
  166. package/dist/parsers/paragraphs.parser.d.ts +13 -0
  167. package/dist/parsers/paragraphs.parser.js +33 -40
  168. package/dist/parsers/paragraphs.parser.js.map +1 -0
  169. package/dist/parsers/parser.d.ts +19 -0
  170. package/dist/parsers/parser.js +30 -17
  171. package/dist/parsers/parser.js.map +1 -0
  172. package/dist/parsers/php-error.parser.d.ts +12 -0
  173. package/dist/parsers/php-error.parser.js +42 -34
  174. package/dist/parsers/php-error.parser.js.map +1 -0
  175. package/dist/parsers/phrase.parser.d.ts +8 -0
  176. package/dist/parsers/phrase.parser.js +16 -11
  177. package/dist/parsers/phrase.parser.js.map +1 -0
  178. package/dist/parsers/regex.parser.d.ts +10 -0
  179. package/dist/parsers/regex.parser.js +30 -22
  180. package/dist/parsers/regex.parser.js.map +1 -0
  181. package/dist/parsers/server.parser.d.ts +12 -0
  182. package/dist/parsers/server.parser.js +66 -56
  183. package/dist/parsers/server.parser.js.map +1 -0
  184. package/dist/parsers/spelling.parser.d.ts +10 -0
  185. package/dist/parsers/spelling.parser.js +21 -13
  186. package/dist/parsers/spelling.parser.js.map +1 -0
  187. package/dist/parsers/string.parser.d.ts +8 -0
  188. package/dist/parsers/string.parser.js +5 -8
  189. package/dist/parsers/string.parser.js.map +1 -0
  190. package/dist/parsers/text.parser.d.ts +8 -0
  191. package/dist/parsers/text.parser.js +24 -18
  192. package/dist/parsers/text.parser.js.map +1 -0
  193. package/dist/parsers/words.parser.d.ts +11 -0
  194. package/dist/parsers/words.parser.js +32 -28
  195. package/dist/parsers/words.parser.js.map +1 -0
  196. package/dist/queries/complex-english.query.d.ts +2 -0
  197. package/dist/queries/complex-english.query.js +37 -38
  198. package/dist/queries/complex-english.query.js.map +1 -0
  199. package/dist/queries/flash-content.query.d.ts +2 -0
  200. package/dist/queries/flash-content.query.js +45 -32
  201. package/dist/queries/flash-content.query.js.map +1 -0
  202. package/dist/queries/linking-to-domains.query.d.ts +2 -0
  203. package/dist/queries/linking-to-domains.query.js +35 -27
  204. package/dist/queries/linking-to-domains.query.js.map +1 -0
  205. package/dist/queries/readability-score.query.d.ts +2 -0
  206. package/dist/queries/readability-score.query.js +21 -13
  207. package/dist/queries/readability-score.query.js.map +1 -0
  208. package/dist/servers/crawl.server.d.ts +35 -0
  209. package/dist/servers/crawl.server.js +133 -121
  210. package/dist/servers/crawl.server.js.map +1 -0
  211. package/dist/servers/express.server.d.ts +8 -0
  212. package/dist/servers/express.server.js +7 -10
  213. package/dist/servers/express.server.js.map +1 -0
  214. package/dist/servers/maintenance.server.d.ts +22 -0
  215. package/dist/servers/maintenance.server.js +42 -36
  216. package/dist/servers/maintenance.server.js.map +1 -0
  217. package/dist/servers/rest.server.d.ts +7 -0
  218. package/dist/servers/rest.server.js +40 -51
  219. package/dist/servers/rest.server.js.map +1 -0
  220. package/dist/servers/socket-io.server.d.ts +12 -0
  221. package/dist/servers/socket-io.server.js +48 -15
  222. package/dist/servers/socket-io.server.js.map +1 -0
  223. package/dist/services/database.service.d.ts +68 -0
  224. package/dist/services/database.service.js +527 -462
  225. package/dist/services/database.service.js.map +1 -0
  226. package/dist/types/tcrawl-config.d.ts +14 -0
  227. package/dist/types/tcrawl-config.js +14 -17
  228. package/dist/types/tcrawl-config.js.map +1 -0
  229. package/dist/types/thydra-config.d.ts +4 -0
  230. package/dist/types/thydra-config.js +4 -7
  231. package/dist/types/thydra-config.js.map +1 -0
  232. package/dist/types/tparser-ctor.d.ts +7 -0
  233. package/dist/types/tparser-ctor.js +2 -2
  234. package/dist/types/tparser-ctor.js.map +1 -0
  235. package/dist/types/tquery.d.ts +7 -0
  236. package/dist/types/tquery.js +2 -2
  237. package/dist/types/tquery.js.map +1 -0
  238. package/dist/types/trobots-config.d.ts +4 -0
  239. package/dist/types/trobots-config.js +4 -7
  240. package/dist/types/trobots-config.js.map +1 -0
  241. package/package.json +41 -29
  242. package/angular/10-es2015.bacd4ae5dd7913ce55f0.js +0 -1
  243. package/angular/10-es5.bacd4ae5dd7913ce55f0.js +0 -1
  244. package/angular/11-es2015.0f031dcf752d1e8eda6b.js +0 -1
  245. package/angular/11-es5.0f031dcf752d1e8eda6b.js +0 -1
  246. package/angular/3rdpartylicenses.txt +0 -1127
  247. package/angular/5-es2015.951498ca9c1bc74e57bf.js +0 -1
  248. package/angular/5-es5.951498ca9c1bc74e57bf.js +0 -1
  249. package/angular/6-es2015.65f680261a3506b88381.js +0 -1
  250. package/angular/6-es5.65f680261a3506b88381.js +0 -1
  251. package/angular/7-es2015.625197f3af1dbf3e805d.js +0 -1
  252. package/angular/7-es5.625197f3af1dbf3e805d.js +0 -1
  253. package/angular/8-es2015.55518901987a5b834309.js +0 -1
  254. package/angular/8-es5.55518901987a5b834309.js +0 -1
  255. package/angular/9-es2015.6cc9bde262564e7836f2.js +0 -1
  256. package/angular/9-es5.6cc9bde262564e7836f2.js +0 -1
  257. package/angular/Roboto-Black.41ed1105a6ebb8ffe34e.woff2 +0 -0
  258. package/angular/Roboto-Black.937491dfcbe64ca9a9f1.woff +0 -0
  259. package/angular/Roboto-BlackItalic.2e1ee657996854c6f427.woff +0 -0
  260. package/angular/Roboto-BlackItalic.50ca4c51ebc27e7e7d2f.woff2 +0 -0
  261. package/angular/Roboto-Bold.73288d91c325e82a5b92.woff +0 -0
  262. package/angular/Roboto-Bold.92fbd4e93cf0a5dbebaa.woff2 +0 -0
  263. package/angular/Roboto-BoldItalic.5f600d98a73d800ae575.woff2 +0 -0
  264. package/angular/Roboto-BoldItalic.6d89acbd21d7e3fbecb2.woff +0 -0
  265. package/angular/Roboto-Light.c27d89ac77468ae18f28.woff2 +0 -0
  266. package/angular/Roboto-Light.d923dfafc0c5183b59aa.woff +0 -0
  267. package/angular/Roboto-LightItalic.506274c7228cf81cae4d.woff2 +0 -0
  268. package/angular/Roboto-LightItalic.d4b8c137518d9d92bb28.woff +0 -0
  269. package/angular/Roboto-Medium.092c6130df8fd2199888.woff +0 -0
  270. package/angular/Roboto-Medium.1d3bced88509b0838984.woff2 +0 -0
  271. package/angular/Roboto-MediumItalic.18ff1628c628080166c1.woff +0 -0
  272. package/angular/Roboto-MediumItalic.d620b8f53f75966fe42e.woff2 +0 -0
  273. package/angular/Roboto-Regular.64cfb66c866ea50cad47.woff2 +0 -0
  274. package/angular/Roboto-Regular.e02e9d6ff5547f7e9962.woff +0 -0
  275. package/angular/Roboto-RegularItalic.4dd2af1e8df532f41db8.woff2 +0 -0
  276. package/angular/Roboto-RegularItalic.5ea38fff9eebef99c5df.woff +0 -0
  277. package/angular/Roboto-Thin.dbd56bd3357dc3617fe5.woff2 +0 -0
  278. package/angular/Roboto-Thin.e7f7c82374bd0ebef14b.woff +0 -0
  279. package/angular/Roboto-ThinItalic.5dd9349c940073834e9a.woff +0 -0
  280. package/angular/Roboto-ThinItalic.a8cef84f735ef887abdc.woff2 +0 -0
  281. package/angular/assets/config/app-config.json +0 -16
  282. package/angular/assets/images/splashbg.jpg +0 -0
  283. package/angular/assets/web-app-commons/fonts/material-icons/MaterialDesignIcons-Community-2.7.94.woff +0 -0
  284. package/angular/assets/web-app-commons/fonts/material-icons/MaterialDesignIcons-Community-2.7.94.woff2 +0 -0
  285. package/angular/assets/web-app-commons/fonts/material-icons/material-design-icons-community.css +0 -11293
  286. package/angular/favicon.ico +0 -0
  287. package/angular/flUhRq6tzZclQEJ-Vdg-IuiaDsNa.f2a0933406f783065152.woff +0 -0
  288. package/angular/flUhRq6tzZclQEJ-Vdg-IuiaDsNc.6467d9a24f234e8e8e07.woff2 +0 -0
  289. package/angular/index.html +0 -16
  290. package/angular/main-es2015.3a582572476c7f292e52.js +0 -1
  291. package/angular/main-es5.3a582572476c7f292e52.js +0 -1
  292. package/angular/polyfills-es2015.7df68534018bc2f6cb09.js +0 -1
  293. package/angular/polyfills-es5.e79468f406fae2989221.js +0 -1
  294. package/angular/runtime-es2015.6d2cff76cdb2790d3308.js +0 -1
  295. package/angular/runtime-es5.6d2cff76cdb2790d3308.js +0 -1
  296. package/angular/styles.c5c6c2534225b85c4ff0.css +0 -1
  297. package/config/bad-words.json +0 -1
  298. package/config/complex-english.json +0 -400
  299. package/config/hydra-auth.json +0 -8
  300. package/config/hydra-crawler.json +0 -84
  301. package/config/list-allow.json +0 -171
  302. package/config/list-deny.json +0 -248
  303. package/config/list-expiry.json +0 -7
  304. package/config/schedule.json +0 -25
  305. package/config/spelling.json +0 -1
@@ -1,10 +1,16 @@
1
- "use strict";
2
- Object.defineProperty(exports, "__esModule", { value: true });
3
- exports.StartupApp = void 0;
4
- const nodecommons_cli_1 = require("nodecommons-cli");
5
- const nodecommons_app_1 = require("nodecommons-app");
1
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
2
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
3
+ return new (P || (P = Promise))(function (resolve, reject) {
4
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
5
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
6
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
7
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
8
+ });
9
+ };
10
+ import { commonsOutputDoing, commonsOutputResult, commonsOutputSuccess } from 'nodecommons-es-cli';
11
+ import { CommonsApp } from 'nodecommons-es-app';
6
12
  // load the tables into Mongo's memory
7
- class StartupApp extends nodecommons_app_1.CommonsApp {
13
+ export class StartupApp extends CommonsApp {
8
14
  constructor() {
9
15
  super('hydra-crawler');
10
16
  }
@@ -14,38 +20,45 @@ class StartupApp extends nodecommons_app_1.CommonsApp {
14
20
  setDatabaseService(databaseService) {
15
21
  this.databaseService = databaseService;
16
22
  }
17
- async init() {
18
- if (!this.databaseService)
19
- throw new Error('Database service has not been set yet');
20
- nodecommons_cli_1.CommonsOutput.doing('Connecting to database');
21
- await this.databaseService.init();
22
- nodecommons_cli_1.CommonsOutput.success();
23
- await super.init();
23
+ init() {
24
+ const _super = Object.create(null, {
25
+ init: { get: () => super.init }
26
+ });
27
+ return __awaiter(this, void 0, void 0, function* () {
28
+ if (!this.databaseService)
29
+ throw new Error('Database service has not been set yet');
30
+ commonsOutputDoing('Connecting to database');
31
+ yield this.databaseService.init();
32
+ commonsOutputSuccess();
33
+ yield _super.init.call(this);
34
+ });
24
35
  }
25
- async run() {
26
- if (!this.databaseService)
27
- throw new Error('Database service has not been set');
28
- const dbo = this.databaseService.getRawDatabase();
29
- for (const collection of 'domains,urls,links'.split(',')) {
30
- nodecommons_cli_1.CommonsOutput.doing(`Loading ${collection} collection into memory (simple enumeration)`);
31
- const tally = await dbo.collection(collection).find({}).count();
32
- nodecommons_cli_1.CommonsOutput.result(tally);
33
- }
34
- nodecommons_cli_1.CommonsOutput.doing(`Loading domains collection into memory (regex enumeration)`);
35
- await this.databaseService.getDomains().find({
36
- domain: /[0-9]temp[0-9]/
37
- }).count();
38
- nodecommons_cli_1.CommonsOutput.success();
39
- nodecommons_cli_1.CommonsOutput.doing(`Loading urls collection into memory (regex enumeration)`);
40
- await this.databaseService.getUrls().find({
41
- url: /[0-9]temp[0-9]/
42
- }).count();
43
- nodecommons_cli_1.CommonsOutput.success();
44
- nodecommons_cli_1.CommonsOutput.doing(`Loading links collection into memory (regex enumeration)`);
45
- await this.databaseService.getLinks().find({
46
- outgoing: /[0-9]temp[0-9]/
47
- }).count();
48
- nodecommons_cli_1.CommonsOutput.success();
36
+ run() {
37
+ return __awaiter(this, void 0, void 0, function* () {
38
+ if (!this.databaseService)
39
+ throw new Error('Database service has not been set');
40
+ const dbo = this.databaseService.getRawDatabase();
41
+ for (const collection of 'domains,urls,links'.split(',')) {
42
+ commonsOutputDoing(`Loading ${collection} collection into memory (simple enumeration)`);
43
+ const tally = yield dbo.collection(collection).find({}).count();
44
+ commonsOutputResult(tally);
45
+ }
46
+ commonsOutputDoing('Loading domains collection into memory (regex enumeration)');
47
+ yield this.databaseService.getDomains().find({
48
+ domain: /[0-9]temp[0-9]/
49
+ }).count();
50
+ commonsOutputSuccess();
51
+ commonsOutputDoing('Loading urls collection into memory (regex enumeration)');
52
+ yield this.databaseService.getUrls().find({
53
+ url: /[0-9]temp[0-9]/
54
+ }).count();
55
+ commonsOutputSuccess();
56
+ commonsOutputDoing('Loading links collection into memory (regex enumeration)');
57
+ yield this.databaseService.getLinks().find({
58
+ outgoing: /[0-9]temp[0-9]/
59
+ }).count();
60
+ commonsOutputSuccess();
61
+ });
49
62
  }
50
63
  }
51
- exports.StartupApp = StartupApp;
64
+ //# sourceMappingURL=startup.app.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"startup.app.js","sourceRoot":"","sources":["../../src/apps/startup.app.ts"],"names":[],"mappings":";;;;;;;;;AAEA,OAAO,EAAE,kBAAkB,EAAE,mBAAmB,EAAE,oBAAoB,EAAE,MAAM,oBAAoB,CAAC;AACnG,OAAO,EAAE,UAAU,EAAE,MAAM,oBAAoB,CAAC;AAMhD,sCAAsC;AAEtC,MAAM,OAAO,UAAW,SAAQ,UAAU;IAGzC;QACC,KAAK,CAAC,eAAe,CAAC,CAAC;IACxB,CAAC;IAEM,UAAU;QAChB,OAAO,iBAAiB,CAAC;IAC1B,CAAC;IAEM,kBAAkB,CACvB,eAAgC;QAEjC,IAAI,CAAC,eAAe,GAAG,eAAe,CAAC;IACxC,CAAC;IAEY,IAAI;;;;;YAChB,IAAI,CAAC,IAAI,CAAC,eAAe;gBAAE,MAAM,IAAI,KAAK,CAAC,uCAAuC,CAAC,CAAC;YAEpF,kBAAkB,CAAC,wBAAwB,CAAC,CAAC;YAC7C,MAAM,IAAI,CAAC,eAAe,CAAC,IAAI,EAAE,CAAC;YAClC,oBAAoB,EAAE,CAAC;YAEvB,MAAM,OAAM,IAAI,WAAE,CAAC;QACpB,CAAC;KAAA;IAEY,GAAG;;YACf,IAAI,CAAC,IAAI,CAAC,eAAe;gBAAE,MAAM,IAAI,KAAK,CAAC,mCAAmC,CAAC,CAAC;YAEhF,MAAM,GAAG,GAAO,IAAI,CAAC,eAAe,CAAC,cAAc,EAAE,CAAC;YAEtD,KAAK,MAAM,UAAU,IAAI,oBAAoB,CAAC,KAAK,CAAC,GAAG,CAAC,EAAE;gBACzD,kBAAkB,CAAC,WAAW,UAAU,8CAA8C,CAAC,CAAC;gBACxF,MAAM,KAAK,GAAW,MAAM,GAAG,CAAC,UAAU,CAAC,UAAU,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,KAAK,EAAE,CAAC;gBACxE,mBAAmB,CAAC,KAAK,CAAC,CAAC;aAC3B;YAED,kBAAkB,CAAC,4DAA4D,CAAC,CAAC;YACjF,MAAM,IAAI,CAAC,eAAe,CAAC,UAAU,EAAE,CAAC,IAAI,CAAC;gBAC3C,MAAM,EAAE,gBAAgB;aACzB,CAAC,CAAC,KAAK,EAAE,CAAC;YACX,oBAAoB,EAAE,CAAC;YAEvB,kBAAkB,CAAC,yDAAyD,CAAC,CAAC;YAC9E,MAAM,IAAI,CAAC,eAAe,CAAC,OAAO,EAAE,CAAC,IAAI,CAAC;gBACxC,GAAG,EAAE,gBAAgB;aACtB,CAAC,CAAC,KAAK,EAAE,CAAC;YACX,oBAAoB,EAAE,CAAC;YAEvB,kBAAkB,CAAC,0DAA0D,CAAC,CAAC;YAC/E,MAAM,IAAI,CAAC,eAAe,CAAC,QAAQ,EAAE,CAAC,IAAI,CAAC;gBACzC,QAAQ,EAAE,gBAAgB;aAC3B,CAAC,CAAC,KAAK,EAAE,CAAC;YACX,oBAAoB,EAAE,CAAC;QACxB,CAAC;KAAA;CACD"}
@@ -0,0 +1,15 @@
1
+ import { CommonsApp } from 'nodecommons-es-app';
2
+ import { DatabaseService } from '../services/database.service';
3
+ import { IMatch } from '../interfaces/imatch';
4
+ import { EList } from '../enums/elist';
5
+ import { IInternalHydraCommonListApp } from './internal-hydra-common.app';
6
+ export declare class UnarchiveApp extends CommonsApp implements IInternalHydraCommonListApp {
7
+ private databaseService;
8
+ private lists;
9
+ constructor();
10
+ getAppName(): string;
11
+ setDatabaseService(databaseService: DatabaseService): void;
12
+ addToList(list: EList, entries: IMatch[]): void;
13
+ init(): Promise<void>;
14
+ run(): Promise<void>;
15
+ }
@@ -1,15 +1,21 @@
1
- "use strict";
2
- Object.defineProperty(exports, "__esModule", { value: true });
3
- exports.UnarchiveApp = void 0;
4
- const hydra_crawler_ts_assets_1 = require("hydra-crawler-ts-assets");
5
- const nodecommons_cli_1 = require("nodecommons-cli");
6
- const nodecommons_app_1 = require("nodecommons-app");
7
- const lists_1 = require("../classes/lists");
8
- const elist_1 = require("../enums/elist");
9
- class UnarchiveApp extends nodecommons_app_1.CommonsApp {
1
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
2
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
3
+ return new (P || (P = Promise))(function (resolve, reject) {
4
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
5
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
6
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
7
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
8
+ });
9
+ };
10
+ import { EStatus } from 'hydra-crawler-ts-assets';
11
+ import { commonsOutputDoing, commonsOutputFound, commonsOutputProgress, commonsOutputResult, commonsOutputSuccess } from 'nodecommons-es-cli';
12
+ import { CommonsApp } from 'nodecommons-es-app';
13
+ import { Lists } from '../classes/lists';
14
+ import { EList } from '../enums/elist';
15
+ export class UnarchiveApp extends CommonsApp {
10
16
  constructor() {
11
17
  super('hydra-crawler');
12
- this.lists = new lists_1.Lists();
18
+ this.lists = new Lists();
13
19
  }
14
20
  getAppName() {
15
21
  return 'Hydra - Unarchive';
@@ -20,50 +26,57 @@ class UnarchiveApp extends nodecommons_app_1.CommonsApp {
20
26
  addToList(list, entries) {
21
27
  this.lists.add(list, entries);
22
28
  }
23
- async init() {
24
- if (!this.databaseService)
25
- throw new Error('Database service has not been set yet');
26
- nodecommons_cli_1.CommonsOutput.doing('Connecting to database');
27
- await this.databaseService.init();
28
- nodecommons_cli_1.CommonsOutput.success();
29
- await super.init();
29
+ init() {
30
+ const _super = Object.create(null, {
31
+ init: { get: () => super.init }
32
+ });
33
+ return __awaiter(this, void 0, void 0, function* () {
34
+ if (!this.databaseService)
35
+ throw new Error('Database service has not been set yet');
36
+ commonsOutputDoing('Connecting to database');
37
+ yield this.databaseService.init();
38
+ commonsOutputSuccess();
39
+ yield _super.init.call(this);
40
+ });
30
41
  }
31
- async run() {
32
- if (!this.databaseService)
33
- throw new Error('Database service has not been set');
34
- let tally = 0;
35
- let found = 0;
36
- nodecommons_cli_1.CommonsOutput.doing(`Enumerating ARCHIVED URLS`);
37
- const results = this.databaseService.getUrls()
38
- .find({ status: hydra_crawler_ts_assets_1.EStatus.ARCHIVED });
39
- const urls = [];
40
- tally = 0;
41
- found = 0;
42
- while (true) {
43
- if ((tally % 100) === 0)
44
- nodecommons_cli_1.CommonsOutput.found(tally, found);
45
- tally++;
46
- const row = await results.next();
47
- if (row === null)
48
- break;
49
- if (!this.lists.match(elist_1.EList.ALLOW, row.url))
50
- continue;
51
- urls.push(row.url);
52
- found++;
53
- }
54
- nodecommons_cli_1.CommonsOutput.result(found);
55
- nodecommons_cli_1.CommonsOutput.doing('Re-QUEUEING matched URLs');
56
- tally = 0;
57
- while (urls.length > 0) {
58
- nodecommons_cli_1.CommonsOutput.progress(tally);
59
- tally += 100;
60
- const batch = urls.slice(0, 100);
61
- await this.databaseService.getUrls().updateMany({ url: { $in: batch } }, {
62
- $set: { status: hydra_crawler_ts_assets_1.EStatus.QUEUED }
63
- });
64
- urls.splice(0, 100);
65
- }
66
- nodecommons_cli_1.CommonsOutput.success();
42
+ run() {
43
+ return __awaiter(this, void 0, void 0, function* () {
44
+ if (!this.databaseService)
45
+ throw new Error('Database service has not been set');
46
+ let tally = 0;
47
+ let found = 0;
48
+ commonsOutputDoing('Enumerating ARCHIVED URLS');
49
+ const results = this.databaseService.getUrls()
50
+ .find({ status: EStatus.ARCHIVED }, {});
51
+ const urls = [];
52
+ tally = 0;
53
+ found = 0;
54
+ while (true) {
55
+ if ((tally % 100) === 0)
56
+ commonsOutputFound(tally, found);
57
+ tally++;
58
+ const row = yield results.next();
59
+ if (row === null)
60
+ break;
61
+ if (!this.lists.match(EList.ALLOW, row.url))
62
+ continue;
63
+ urls.push(row.url);
64
+ found++;
65
+ }
66
+ commonsOutputResult(found);
67
+ commonsOutputDoing('Re-QUEUEING matched URLs');
68
+ tally = 0;
69
+ while (urls.length > 0) {
70
+ commonsOutputProgress(tally);
71
+ tally += 100;
72
+ const batch = urls.slice(0, 100);
73
+ yield this.databaseService.getUrls().updateMany({ url: { $in: batch } }, {
74
+ $set: { status: EStatus.QUEUED }
75
+ });
76
+ urls.splice(0, 100);
77
+ }
78
+ commonsOutputSuccess();
79
+ });
67
80
  }
68
81
  }
69
- exports.UnarchiveApp = UnarchiveApp;
82
+ //# sourceMappingURL=unarchive.app.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"unarchive.app.js","sourceRoot":"","sources":["../../src/apps/unarchive.app.ts"],"names":[],"mappings":";;;;;;;;;AAEA,OAAO,EAAE,OAAO,EAAE,MAAM,yBAAyB,CAAC;AAGlD,OAAO,EAAE,kBAAkB,EAAE,kBAAkB,EAAE,qBAAqB,EAAE,mBAAmB,EAAE,oBAAoB,EAAE,MAAM,oBAAoB,CAAC;AAC9I,OAAO,EAAE,UAAU,EAAE,MAAM,oBAAoB,CAAC;AAEhD,OAAO,EAAE,KAAK,EAAE,MAAM,kBAAkB,CAAC;AAMzC,OAAO,EAAE,KAAK,EAAE,MAAM,gBAAgB,CAAC;AAIvC,MAAM,OAAO,YAAa,SAAQ,UAAU;IAK3C;QACC,KAAK,CAAC,eAAe,CAAC,CAAC;QAEvB,IAAI,CAAC,KAAK,GAAG,IAAI,KAAK,EAAE,CAAC;IAC1B,CAAC;IAEM,UAAU;QAChB,OAAO,mBAAmB,CAAC;IAC5B,CAAC;IAEM,kBAAkB,CACvB,eAAgC;QAEjC,IAAI,CAAC,eAAe,GAAG,eAAe,CAAC;IACxC,CAAC;IAEM,SAAS,CACd,IAAW,EACX,OAAiB;QAElB,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC;IAC/B,CAAC;IAEY,IAAI;;;;;YAChB,IAAI,CAAC,IAAI,CAAC,eAAe;gBAAE,MAAM,IAAI,KAAK,CAAC,uCAAuC,CAAC,CAAC;YAEpF,kBAAkB,CAAC,wBAAwB,CAAC,CAAC;YAC7C,MAAM,IAAI,CAAC,eAAe,CAAC,IAAI,EAAE,CAAC;YAClC,oBAAoB,EAAE,CAAC;YAEvB,MAAM,OAAM,IAAI,WAAE,CAAC;QACpB,CAAC;KAAA;IAEY,GAAG;;YACf,IAAI,CAAC,IAAI,CAAC,eAAe;gBAAE,MAAM,IAAI,KAAK,CAAC,mCAAmC,CAAC,CAAC;YAEhF,IAAI,KAAK,GAAW,CAAC,CAAC;YACtB,IAAI,KAAK,GAAW,CAAC,CAAC;YAEtB,kBAAkB,CAAC,2BAA2B,CAAC,CAAC;YAEhD,MAAM,OAAO,GAAiB,IAAI,CAAC,eAAe,CAAC,OAAO,EAAE;iBACzD,IAAI,CACH,EAAE,MAAM,EAAE,OAAO,CAAC,QAAQ,EAAE,EAC5B,EAAE,CACH,CAAC;YAEJ,MAAM,IAAI,GAAa,EAAE,CAAC;YAC1B,KAAK,GAAG,CAAC,CAAC;YAAC,KAAK,GAAG,CAAC,CAAC;YAErB,OAAO,IAAI,EAAE;gBACZ,IAAI,CAAC,KAAK,GAAG,GAAG,CAAC,KAAK,CAAC;oBAAE,kBAAkB,CAAC,KAAK,EAAE,KAAK,CAAC,CAAC;gBAC1D,KAAK,EAAE,CAAC;gBAER,MAAM,GAAG,GAAc,MAAM,OAAO,CAAC,IAAI,EAAE,CAAC;gBAC5C,IAAI,GAAG,KAAK,IAAI;oBAAE,MAAM;gBAExB,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,KAAK,CAAC,KAAK,EAAE,GAAG,CAAC,GAAG,CAAC;oBAAE,SAAS;gBAEtD,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;gBAEnB,KAAK,EAAE,CAAC;aACR;YACD,mBAAmB,CAAC,KAAK,CAAC,CAAC;YAE3B,kBAAkB,CAAC,0BAA0B,CAAC,CAAC;YAC/C,KAAK,GAAG,CAAC,CAAC;YACV,OAAO,IAAI,CAAC,MAAM,GAAG,CAAC,EAAE;gBACvB,qBAAqB,CAAC,KAAK,CAAC,CAAC;gBAC7B,KAAK,IAAI,GAAG,CAAC;gBAEb,MAAM,KAAK,GAAa,IAAI,CAAC,KAAK,CAAC,
CAAC,EAAE,GAAG,CAAC,CAAC;gBAE3C,MAAM,IAAI,CAAC,eAAe,CAAC,OAAO,EAAE,CAAC,UAAU,CAC7C,EAAE,GAAG,EAAE,EAAE,GAAG,EAAE,KAAK,EAAE,EAAE,EACvB;oBACE,IAAI,EAAE,EAAE,MAAM,EAAE,OAAO,CAAC,MAAM,EAAE;iBACjC,CACF,CAAC;gBAEF,IAAI,CAAC,MAAM,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC;aACpB;YACD,oBAAoB,EAAE,CAAC;QACxB,CAAC;KAAA;CACD"}
@@ -0,0 +1,12 @@
1
+ import { Lists } from '../classes/lists';
2
+ import { DatabaseService } from '../services/database.service';
3
+ export declare class Cleaner {
4
+ private lists;
5
+ private databaseService;
6
+ constructor(lists: Lists, databaseService: DatabaseService);
7
+ private detectStatusOrphans;
8
+ private detectNonAllowlistOrphans;
9
+ private detectStatusCodeOrphans;
10
+ purgeOrphanUrls(): Promise<void>;
11
+ purgeEmptyDomains(): Promise<void>;
12
+ }