hydra-crawler 1.4.4 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (305) hide show
  1. package/dist/apis/autocomplete.api.d.ts +7 -0
  2. package/dist/apis/autocomplete.api.js +15 -9
  3. package/dist/apis/autocomplete.api.js.map +1 -0
  4. package/dist/apis/bugs.api.d.ts +7 -0
  5. package/dist/apis/bugs.api.js +21 -15
  6. package/dist/apis/bugs.api.js.map +1 -0
  7. package/dist/apis/crawl.api.d.ts +7 -0
  8. package/dist/apis/crawl.api.js +15 -9
  9. package/dist/apis/crawl.api.js.map +1 -0
  10. package/dist/apis/domains.api.d.ts +7 -0
  11. package/dist/apis/domains.api.js +24 -19
  12. package/dist/apis/domains.api.js.map +1 -0
  13. package/dist/apis/images.api.d.ts +7 -0
  14. package/dist/apis/images.api.js +20 -14
  15. package/dist/apis/images.api.js.map +1 -0
  16. package/dist/apis/statistics.api.d.ts +8 -0
  17. package/dist/apis/statistics.api.js +27 -20
  18. package/dist/apis/statistics.api.js.map +1 -0
  19. package/dist/apis/test.api.d.ts +5 -0
  20. package/dist/apis/test.api.js +15 -9
  21. package/dist/apis/test.api.js.map +1 -0
  22. package/dist/apis/urls.api.d.ts +7 -0
  23. package/dist/apis/urls.api.js +21 -15
  24. package/dist/apis/urls.api.js.map +1 -0
  25. package/dist/apps/cleanup.app.d.ts +19 -0
  26. package/dist/apps/cleanup.app.js +118 -100
  27. package/dist/apps/cleanup.app.js.map +1 -0
  28. package/dist/apps/cross-populate-export.app.d.ts +12 -0
  29. package/dist/apps/cross-populate-export.app.js +60 -47
  30. package/dist/apps/cross-populate-export.app.js.map +1 -0
  31. package/dist/apps/cross-populate-import.app.d.ts +12 -0
  32. package/dist/apps/cross-populate-import.app.js +64 -51
  33. package/dist/apps/cross-populate-import.app.js.map +1 -0
  34. package/dist/apps/denylist.app.d.ts +17 -0
  35. package/dist/apps/denylist.app.js +115 -98
  36. package/dist/apps/denylist.app.js.map +1 -0
  37. package/dist/apps/expire.app.d.ts +19 -0
  38. package/dist/apps/expire.app.js +44 -31
  39. package/dist/apps/expire.app.js.map +1 -0
  40. package/dist/apps/extract-text.app.d.ts +8 -0
  41. package/dist/apps/extract-text.app.js +43 -35
  42. package/dist/apps/extract-text.app.js.map +1 -0
  43. package/dist/apps/hydra.app.d.ts +34 -0
  44. package/dist/apps/hydra.app.js +150 -137
  45. package/dist/apps/hydra.app.js.map +1 -0
  46. package/dist/apps/import.app.d.ts +11 -0
  47. package/dist/apps/import.app.js +44 -32
  48. package/dist/apps/import.app.js.map +1 -0
  49. package/dist/apps/internal-hydra-common.app.d.ts +28 -0
  50. package/dist/apps/internal-hydra-common.app.js +5 -11
  51. package/dist/apps/internal-hydra-common.app.js.map +1 -0
  52. package/dist/apps/query.app.d.ts +20 -0
  53. package/dist/apps/query.app.js +63 -49
  54. package/dist/apps/query.app.js.map +1 -0
  55. package/dist/apps/reattempt.app.d.ts +17 -0
  56. package/dist/apps/reattempt.app.js +66 -53
  57. package/dist/apps/reattempt.app.js.map +1 -0
  58. package/dist/apps/requeue-domain.app.d.ts +13 -0
  59. package/dist/apps/requeue-domain.app.js +50 -37
  60. package/dist/apps/requeue-domain.app.js.map +1 -0
  61. package/dist/apps/seed.app.d.ts +15 -0
  62. package/dist/apps/seed.app.js +53 -40
  63. package/dist/apps/seed.app.js.map +1 -0
  64. package/dist/apps/startup.app.d.ts +11 -0
  65. package/dist/apps/startup.app.js +51 -38
  66. package/dist/apps/startup.app.js.map +1 -0
  67. package/dist/apps/unarchive.app.d.ts +15 -0
  68. package/dist/apps/unarchive.app.js +67 -54
  69. package/dist/apps/unarchive.app.js.map +1 -0
  70. package/dist/classes/cleaner.d.ts +12 -0
  71. package/dist/classes/cleaner.js +227 -207
  72. package/dist/classes/cleaner.js.map +1 -0
  73. package/dist/classes/crawler.d.ts +34 -0
  74. package/dist/classes/crawler.js +248 -241
  75. package/dist/classes/crawler.js.map +1 -0
  76. package/dist/classes/dns.d.ts +3 -0
  77. package/dist/classes/dns.js +10 -13
  78. package/dist/classes/dns.js.map +1 -0
  79. package/dist/classes/expirer.d.ts +10 -0
  80. package/dist/classes/expirer.js +107 -94
  81. package/dist/classes/expirer.js.map +1 -0
  82. package/dist/classes/expiry.d.ts +8 -0
  83. package/dist/classes/expiry.js +16 -19
  84. package/dist/classes/expiry.js.map +1 -0
  85. package/dist/classes/lists.d.ts +9 -0
  86. package/dist/classes/lists.js +13 -18
  87. package/dist/classes/lists.js.map +1 -0
  88. package/dist/classes/robot.d.ts +15 -0
  89. package/dist/classes/robot.js +40 -30
  90. package/dist/classes/robot.js.map +1 -0
  91. package/dist/classes/tracker.d.ts +25 -0
  92. package/dist/classes/tracker.js +82 -64
  93. package/dist/classes/tracker.js.map +1 -0
  94. package/dist/cli.d.ts +1 -0
  95. package/dist/cli.js +72 -65
  96. package/dist/cli.js.map +1 -0
  97. package/dist/enums/eavailable-strategy.d.ts +4 -0
  98. package/dist/enums/eavailable-strategy.js +3 -5
  99. package/dist/enums/eavailable-strategy.js.map +1 -0
  100. package/dist/enums/elist.d.ts +7 -0
  101. package/dist/enums/elist.js +7 -11
  102. package/dist/enums/elist.js.map +1 -0
  103. package/dist/enums/eserver.d.ts +8 -0
  104. package/dist/enums/eserver.js +3 -5
  105. package/dist/enums/eserver.js.map +1 -0
  106. package/dist/enums/ex-powered-by.d.ts +6 -0
  107. package/dist/enums/ex-powered-by.js +3 -5
  108. package/dist/enums/ex-powered-by.js.map +1 -0
  109. package/dist/helpers/matcher.d.ts +5 -0
  110. package/dist/helpers/matcher.js +2 -5
  111. package/dist/helpers/matcher.js.map +1 -0
  112. package/dist/helpers/random.d.ts +4 -0
  113. package/dist/helpers/random.js +2 -5
  114. package/dist/helpers/random.js.map +1 -0
  115. package/dist/helpers/utf-decoder.d.ts +4 -0
  116. package/dist/helpers/utf-decoder.js +3 -6
  117. package/dist/helpers/utf-decoder.js.map +1 -0
  118. package/dist/interfaces/iexpiry.d.ts +7 -0
  119. package/dist/interfaces/iexpiry.js +9 -13
  120. package/dist/interfaces/iexpiry.js.map +1 -0
  121. package/dist/interfaces/imatch.d.ts +6 -0
  122. package/dist/interfaces/imatch.js +6 -9
  123. package/dist/interfaces/imatch.js.map +1 -0
  124. package/dist/interfaces/iparser-config.d.ts +4 -0
  125. package/dist/interfaces/iparser-config.js +4 -7
  126. package/dist/interfaces/iparser-config.js.map +1 -0
  127. package/dist/interfaces/iparser.d.ts +8 -0
  128. package/dist/interfaces/iparser.js +2 -2
  129. package/dist/interfaces/iparser.js.map +1 -0
  130. package/dist/interfaces/irequest-outcome.d.ts +11 -0
  131. package/dist/interfaces/irequest-outcome.js +2 -2
  132. package/dist/interfaces/irequest-outcome.js.map +1 -0
  133. package/dist/interfaces/iserver.d.ts +4 -0
  134. package/dist/interfaces/iserver.js +2 -2
  135. package/dist/interfaces/iserver.js.map +1 -0
  136. package/dist/parsers/accessibility-metrics.parser.d.ts +11 -0
  137. package/dist/parsers/accessibility-metrics.parser.js +34 -26
  138. package/dist/parsers/accessibility-metrics.parser.js.map +1 -0
  139. package/dist/parsers/asp-error.parser.d.ts +12 -0
  140. package/dist/parsers/asp-error.parser.js +36 -28
  141. package/dist/parsers/asp-error.parser.js.map +1 -0
  142. package/dist/parsers/bad-words.parser.d.ts +10 -0
  143. package/dist/parsers/bad-words.parser.js +21 -13
  144. package/dist/parsers/bad-words.parser.js.map +1 -0
  145. package/dist/parsers/complex-english.parser.d.ts +15 -0
  146. package/dist/parsers/complex-english.parser.js +33 -25
  147. package/dist/parsers/complex-english.parser.js.map +1 -0
  148. package/dist/parsers/data.parser.d.ts +14 -0
  149. package/dist/parsers/data.parser.js +12 -16
  150. package/dist/parsers/data.parser.js.map +1 -0
  151. package/dist/parsers/dictionary.parser.d.ts +19 -0
  152. package/dist/parsers/dictionary.parser.js +47 -39
  153. package/dist/parsers/dictionary.parser.js.map +1 -0
  154. package/dist/parsers/html.parser.d.ts +13 -0
  155. package/dist/parsers/html.parser.js +4 -8
  156. package/dist/parsers/html.parser.js.map +1 -0
  157. package/dist/parsers/hyperlinks.parser.d.ts +20 -0
  158. package/dist/parsers/hyperlinks.parser.js +82 -77
  159. package/dist/parsers/hyperlinks.parser.js.map +1 -0
  160. package/dist/parsers/image-tags.parser.d.ts +20 -0
  161. package/dist/parsers/image-tags.parser.js +38 -34
  162. package/dist/parsers/image-tags.parser.js.map +1 -0
  163. package/dist/parsers/jpeg.parser.d.ts +11 -0
  164. package/dist/parsers/jpeg.parser.js +28 -20
  165. package/dist/parsers/jpeg.parser.js.map +1 -0
  166. package/dist/parsers/paragraphs.parser.d.ts +13 -0
  167. package/dist/parsers/paragraphs.parser.js +33 -40
  168. package/dist/parsers/paragraphs.parser.js.map +1 -0
  169. package/dist/parsers/parser.d.ts +19 -0
  170. package/dist/parsers/parser.js +30 -17
  171. package/dist/parsers/parser.js.map +1 -0
  172. package/dist/parsers/php-error.parser.d.ts +12 -0
  173. package/dist/parsers/php-error.parser.js +42 -34
  174. package/dist/parsers/php-error.parser.js.map +1 -0
  175. package/dist/parsers/phrase.parser.d.ts +8 -0
  176. package/dist/parsers/phrase.parser.js +16 -11
  177. package/dist/parsers/phrase.parser.js.map +1 -0
  178. package/dist/parsers/regex.parser.d.ts +10 -0
  179. package/dist/parsers/regex.parser.js +30 -22
  180. package/dist/parsers/regex.parser.js.map +1 -0
  181. package/dist/parsers/server.parser.d.ts +12 -0
  182. package/dist/parsers/server.parser.js +66 -56
  183. package/dist/parsers/server.parser.js.map +1 -0
  184. package/dist/parsers/spelling.parser.d.ts +10 -0
  185. package/dist/parsers/spelling.parser.js +21 -13
  186. package/dist/parsers/spelling.parser.js.map +1 -0
  187. package/dist/parsers/string.parser.d.ts +8 -0
  188. package/dist/parsers/string.parser.js +5 -8
  189. package/dist/parsers/string.parser.js.map +1 -0
  190. package/dist/parsers/text.parser.d.ts +8 -0
  191. package/dist/parsers/text.parser.js +24 -18
  192. package/dist/parsers/text.parser.js.map +1 -0
  193. package/dist/parsers/words.parser.d.ts +11 -0
  194. package/dist/parsers/words.parser.js +32 -28
  195. package/dist/parsers/words.parser.js.map +1 -0
  196. package/dist/queries/complex-english.query.d.ts +2 -0
  197. package/dist/queries/complex-english.query.js +37 -38
  198. package/dist/queries/complex-english.query.js.map +1 -0
  199. package/dist/queries/flash-content.query.d.ts +2 -0
  200. package/dist/queries/flash-content.query.js +45 -32
  201. package/dist/queries/flash-content.query.js.map +1 -0
  202. package/dist/queries/linking-to-domains.query.d.ts +2 -0
  203. package/dist/queries/linking-to-domains.query.js +35 -27
  204. package/dist/queries/linking-to-domains.query.js.map +1 -0
  205. package/dist/queries/readability-score.query.d.ts +2 -0
  206. package/dist/queries/readability-score.query.js +21 -13
  207. package/dist/queries/readability-score.query.js.map +1 -0
  208. package/dist/servers/crawl.server.d.ts +35 -0
  209. package/dist/servers/crawl.server.js +133 -121
  210. package/dist/servers/crawl.server.js.map +1 -0
  211. package/dist/servers/express.server.d.ts +8 -0
  212. package/dist/servers/express.server.js +7 -10
  213. package/dist/servers/express.server.js.map +1 -0
  214. package/dist/servers/maintenance.server.d.ts +22 -0
  215. package/dist/servers/maintenance.server.js +42 -36
  216. package/dist/servers/maintenance.server.js.map +1 -0
  217. package/dist/servers/rest.server.d.ts +7 -0
  218. package/dist/servers/rest.server.js +40 -51
  219. package/dist/servers/rest.server.js.map +1 -0
  220. package/dist/servers/socket-io.server.d.ts +12 -0
  221. package/dist/servers/socket-io.server.js +48 -15
  222. package/dist/servers/socket-io.server.js.map +1 -0
  223. package/dist/services/database.service.d.ts +68 -0
  224. package/dist/services/database.service.js +528 -462
  225. package/dist/services/database.service.js.map +1 -0
  226. package/dist/types/tcrawl-config.d.ts +14 -0
  227. package/dist/types/tcrawl-config.js +14 -17
  228. package/dist/types/tcrawl-config.js.map +1 -0
  229. package/dist/types/thydra-config.d.ts +4 -0
  230. package/dist/types/thydra-config.js +4 -7
  231. package/dist/types/thydra-config.js.map +1 -0
  232. package/dist/types/tparser-ctor.d.ts +7 -0
  233. package/dist/types/tparser-ctor.js +2 -2
  234. package/dist/types/tparser-ctor.js.map +1 -0
  235. package/dist/types/tquery.d.ts +7 -0
  236. package/dist/types/tquery.js +2 -2
  237. package/dist/types/tquery.js.map +1 -0
  238. package/dist/types/trobots-config.d.ts +4 -0
  239. package/dist/types/trobots-config.js +4 -7
  240. package/dist/types/trobots-config.js.map +1 -0
  241. package/package.json +41 -29
  242. package/angular/10-es2015.bacd4ae5dd7913ce55f0.js +0 -1
  243. package/angular/10-es5.bacd4ae5dd7913ce55f0.js +0 -1
  244. package/angular/11-es2015.0f031dcf752d1e8eda6b.js +0 -1
  245. package/angular/11-es5.0f031dcf752d1e8eda6b.js +0 -1
  246. package/angular/3rdpartylicenses.txt +0 -1127
  247. package/angular/5-es2015.951498ca9c1bc74e57bf.js +0 -1
  248. package/angular/5-es5.951498ca9c1bc74e57bf.js +0 -1
  249. package/angular/6-es2015.65f680261a3506b88381.js +0 -1
  250. package/angular/6-es5.65f680261a3506b88381.js +0 -1
  251. package/angular/7-es2015.625197f3af1dbf3e805d.js +0 -1
  252. package/angular/7-es5.625197f3af1dbf3e805d.js +0 -1
  253. package/angular/8-es2015.55518901987a5b834309.js +0 -1
  254. package/angular/8-es5.55518901987a5b834309.js +0 -1
  255. package/angular/9-es2015.6cc9bde262564e7836f2.js +0 -1
  256. package/angular/9-es5.6cc9bde262564e7836f2.js +0 -1
  257. package/angular/Roboto-Black.41ed1105a6ebb8ffe34e.woff2 +0 -0
  258. package/angular/Roboto-Black.937491dfcbe64ca9a9f1.woff +0 -0
  259. package/angular/Roboto-BlackItalic.2e1ee657996854c6f427.woff +0 -0
  260. package/angular/Roboto-BlackItalic.50ca4c51ebc27e7e7d2f.woff2 +0 -0
  261. package/angular/Roboto-Bold.73288d91c325e82a5b92.woff +0 -0
  262. package/angular/Roboto-Bold.92fbd4e93cf0a5dbebaa.woff2 +0 -0
  263. package/angular/Roboto-BoldItalic.5f600d98a73d800ae575.woff2 +0 -0
  264. package/angular/Roboto-BoldItalic.6d89acbd21d7e3fbecb2.woff +0 -0
  265. package/angular/Roboto-Light.c27d89ac77468ae18f28.woff2 +0 -0
  266. package/angular/Roboto-Light.d923dfafc0c5183b59aa.woff +0 -0
  267. package/angular/Roboto-LightItalic.506274c7228cf81cae4d.woff2 +0 -0
  268. package/angular/Roboto-LightItalic.d4b8c137518d9d92bb28.woff +0 -0
  269. package/angular/Roboto-Medium.092c6130df8fd2199888.woff +0 -0
  270. package/angular/Roboto-Medium.1d3bced88509b0838984.woff2 +0 -0
  271. package/angular/Roboto-MediumItalic.18ff1628c628080166c1.woff +0 -0
  272. package/angular/Roboto-MediumItalic.d620b8f53f75966fe42e.woff2 +0 -0
  273. package/angular/Roboto-Regular.64cfb66c866ea50cad47.woff2 +0 -0
  274. package/angular/Roboto-Regular.e02e9d6ff5547f7e9962.woff +0 -0
  275. package/angular/Roboto-RegularItalic.4dd2af1e8df532f41db8.woff2 +0 -0
  276. package/angular/Roboto-RegularItalic.5ea38fff9eebef99c5df.woff +0 -0
  277. package/angular/Roboto-Thin.dbd56bd3357dc3617fe5.woff2 +0 -0
  278. package/angular/Roboto-Thin.e7f7c82374bd0ebef14b.woff +0 -0
  279. package/angular/Roboto-ThinItalic.5dd9349c940073834e9a.woff +0 -0
  280. package/angular/Roboto-ThinItalic.a8cef84f735ef887abdc.woff2 +0 -0
  281. package/angular/assets/config/app-config.json +0 -16
  282. package/angular/assets/images/splashbg.jpg +0 -0
  283. package/angular/assets/web-app-commons/fonts/material-icons/MaterialDesignIcons-Community-2.7.94.woff +0 -0
  284. package/angular/assets/web-app-commons/fonts/material-icons/MaterialDesignIcons-Community-2.7.94.woff2 +0 -0
  285. package/angular/assets/web-app-commons/fonts/material-icons/material-design-icons-community.css +0 -11293
  286. package/angular/favicon.ico +0 -0
  287. package/angular/flUhRq6tzZclQEJ-Vdg-IuiaDsNa.f2a0933406f783065152.woff +0 -0
  288. package/angular/flUhRq6tzZclQEJ-Vdg-IuiaDsNc.6467d9a24f234e8e8e07.woff2 +0 -0
  289. package/angular/index.html +0 -16
  290. package/angular/main-es2015.3a582572476c7f292e52.js +0 -1
  291. package/angular/main-es5.3a582572476c7f292e52.js +0 -1
  292. package/angular/polyfills-es2015.7df68534018bc2f6cb09.js +0 -1
  293. package/angular/polyfills-es5.e79468f406fae2989221.js +0 -1
  294. package/angular/runtime-es2015.6d2cff76cdb2790d3308.js +0 -1
  295. package/angular/runtime-es5.6d2cff76cdb2790d3308.js +0 -1
  296. package/angular/styles.c5c6c2534225b85c4ff0.css +0 -1
  297. package/config/bad-words.json +0 -1
  298. package/config/complex-english.json +0 -400
  299. package/config/hydra-auth.json +0 -8
  300. package/config/hydra-crawler.json +0 -84
  301. package/config/list-allow.json +0 -171
  302. package/config/list-deny.json +0 -248
  303. package/config/list-expiry.json +0 -7
  304. package/config/schedule.json +0 -25
  305. package/config/spelling.json +0 -1
@@ -1,237 +1,257 @@
1
- "use strict";
2
- Object.defineProperty(exports, "__esModule", { value: true });
3
- exports.Cleaner = void 0;
4
- const hydra_crawler_ts_assets_1 = require("hydra-crawler-ts-assets");
5
- const nodecommons_cli_1 = require("nodecommons-cli");
6
- const elist_1 = require("../enums/elist");
7
- class Cleaner {
1
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
2
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
3
+ return new (P || (P = Promise))(function (resolve, reject) {
4
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
5
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
6
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
7
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
8
+ });
9
+ };
10
+ import { commonsTypeHasPropertyNumber } from 'tscommons-es-core';
11
+ import { EStatus } from 'hydra-crawler-ts-assets';
12
+ import { commonsOutputDoing, commonsOutputProgress, commonsOutputResult, commonsOutputSuccess } from 'nodecommons-es-cli';
13
+ import { EList } from '../enums/elist';
14
+ export class Cleaner {
8
15
  constructor(lists, databaseService) {
9
16
  this.lists = lists;
10
17
  this.databaseService = databaseService;
11
18
  }
12
- async detectStatusOrphans(statuses) {
13
- if (!this.databaseService)
14
- throw new Error('Database service has not been set yet');
15
- let tally = 0;
16
- let found = 0;
17
- nodecommons_cli_1.CommonsOutput.doing(`Detecting ${statuses.join(', ')} orphan urls`);
18
- while (true) {
19
- const urls = this.databaseService.getUrls()
20
- .find({
21
- $and: [
22
- { status: { $in: statuses } },
23
- { $or: [
24
- { orphan: null },
25
- { orphan: false }
26
- ] }
27
- ]
28
- })
29
- .sort({ _id: 1 })
30
- .skip(tally);
31
- try {
32
- while (true) {
33
- tally++;
34
- if ((tally % 100) === 0)
35
- nodecommons_cli_1.CommonsOutput.progress(`${tally} urls, ${found} orphans`);
36
- const row = await urls.next();
37
- if (row === null)
38
- break;
39
- const incoming = await this.databaseService.getLinks().find({ outgoing: row.url }).count();
40
- if (incoming > 0)
41
- continue;
42
- await this.databaseService.getUrls().updateOne({ _id: row['_id'] }, { $set: { orphan: true } });
43
- found++;
19
+ detectStatusOrphans(statuses) {
20
+ return __awaiter(this, void 0, void 0, function* () {
21
+ if (!this.databaseService)
22
+ throw new Error('Database service has not been set yet');
23
+ let tally = 0;
24
+ let found = 0;
25
+ commonsOutputDoing(`Detecting ${statuses.join(', ')} orphan urls`);
26
+ while (true) {
27
+ const urls = this.databaseService.getUrls()
28
+ .find({
29
+ $and: [
30
+ { status: { $in: statuses } },
31
+ { $or: [
32
+ { orphan: null },
33
+ { orphan: false }
34
+ ] }
35
+ ]
36
+ }, {})
37
+ .sort({ _id: 1 })
38
+ .skip(tally);
39
+ try {
40
+ while (true) {
41
+ tally++;
42
+ if ((tally % 100) === 0)
43
+ commonsOutputProgress(`${tally} urls, ${found} orphans`);
44
+ const row = yield urls.next();
45
+ if (row === null)
46
+ break;
47
+ const incoming = yield this.databaseService.getLinks().find({ outgoing: row.url }).count();
48
+ if (incoming > 0)
49
+ continue;
50
+ // eslint-disable-next-line @typescript-eslint/no-unsafe-assignment
51
+ yield this.databaseService.getUrls().updateOne({ _id: row['_id'] }, { $set: { orphan: true } });
52
+ found++;
53
+ }
54
+ break;
55
+ }
56
+ catch (err) {
57
+ if (!commonsTypeHasPropertyNumber(err, 'code') || err.code !== 43)
58
+ throw err;
44
59
  }
45
- break;
46
- }
47
- catch (err) {
48
- if (err.code !== 43)
49
- throw err;
50
60
  }
51
- }
52
- nodecommons_cli_1.CommonsOutput.result(`${tally} urls, ${found} orphans`);
61
+ commonsOutputResult(`${tally} urls, ${found} orphans`);
62
+ });
53
63
  }
54
- async detectNonAllowlistOrphans(statuses) {
55
- let tally = 0;
56
- let found = 0;
57
- nodecommons_cli_1.CommonsOutput.doing(`Detecting non-allowlist ${statuses.join(', ')} orphan urls`);
58
- while (true) {
59
- const urls = this.databaseService.getUrls()
60
- .find({
61
- $and: [
62
- { status: { $in: statuses } },
63
- { $or: [
64
- { orphan: null },
65
- { orphan: false }
66
- ] }
67
- ]
68
- })
69
- .sort({ _id: 1 })
70
- .skip(tally);
71
- try {
72
- while (true) {
73
- tally++;
74
- if ((tally % 100) === 0)
75
- nodecommons_cli_1.CommonsOutput.progress(`${tally} urls, ${found} orphans`);
76
- const row = await urls.next();
77
- if (row === null)
78
- break;
79
- if (this.lists.match(elist_1.EList.ALLOW, row.url))
80
- continue;
81
- const incoming = await this.databaseService.getLinks().find({ outgoing: row.url }).count();
82
- if (incoming > 0)
83
- continue;
84
- await this.databaseService.getUrls().updateOne({ _id: row['_id'] }, { $set: { orphan: true } });
85
- found++;
64
+ detectNonAllowlistOrphans(statuses) {
65
+ return __awaiter(this, void 0, void 0, function* () {
66
+ let tally = 0;
67
+ let found = 0;
68
+ commonsOutputDoing(`Detecting non-allowlist ${statuses.join(', ')} orphan urls`);
69
+ while (true) {
70
+ const urls = this.databaseService.getUrls()
71
+ .find({
72
+ $and: [
73
+ { status: { $in: statuses } },
74
+ { $or: [
75
+ { orphan: null },
76
+ { orphan: false }
77
+ ] }
78
+ ]
79
+ }, {})
80
+ .sort({ _id: 1 })
81
+ .skip(tally);
82
+ try {
83
+ while (true) {
84
+ tally++;
85
+ if ((tally % 100) === 0)
86
+ commonsOutputProgress(`${tally} urls, ${found} orphans`);
87
+ const row = yield urls.next();
88
+ if (row === null)
89
+ break;
90
+ if (this.lists.match(EList.ALLOW, row.url))
91
+ continue;
92
+ const incoming = yield this.databaseService.getLinks().find({ outgoing: row.url }).count();
93
+ if (incoming > 0)
94
+ continue;
95
+ // eslint-disable-next-line @typescript-eslint/no-unsafe-assignment
96
+ yield this.databaseService.getUrls().updateOne({ _id: row['_id'] }, { $set: { orphan: true } });
97
+ found++;
98
+ }
99
+ break;
100
+ }
101
+ catch (err) {
102
+ if (!commonsTypeHasPropertyNumber(err, 'code') || err.code !== 43)
103
+ throw err;
86
104
  }
87
- break;
88
- }
89
- catch (err) {
90
- if (err.code !== 43)
91
- throw err;
92
105
  }
93
- }
94
- nodecommons_cli_1.CommonsOutput.result(`${tally} urls, ${found} orphans`);
106
+ commonsOutputResult(`${tally} urls, ${found} orphans`);
107
+ });
95
108
  }
96
- async detectStatusCodeOrphans(gt, lt) {
97
- let tally = 0;
98
- let found = 0;
99
- nodecommons_cli_1.CommonsOutput.doing(`Detecting DONE status code between ${gt}<${lt} orphan urls`);
100
- while (true) {
101
- const urls = this.databaseService.getUrls()
102
- .find({
103
- $and: [
104
- { status: hydra_crawler_ts_assets_1.EStatus.DONE },
105
- { statusCode: { $gt: gt } },
106
- { statusCode: { $lt: lt } },
107
- { $or: [
108
- { orphan: null },
109
- { orphan: false }
110
- ] }
111
- ]
112
- })
113
- .sort({ _id: 1 })
114
- .skip(tally);
115
- try {
116
- while (true) {
117
- tally++;
118
- if ((tally % 100) === 0)
119
- nodecommons_cli_1.CommonsOutput.progress(`${tally} urls, ${found} orphans`);
120
- const row = await urls.next();
121
- if (row === null)
122
- break;
123
- const incoming = await this.databaseService.getLinks().find({ outgoing: row.url }).count();
124
- if (incoming > 0)
125
- continue;
126
- await this.databaseService.getUrls().updateOne({ _id: row['_id'] }, { $set: { orphan: true } });
127
- found++;
109
+ detectStatusCodeOrphans(gt, lt) {
110
+ return __awaiter(this, void 0, void 0, function* () {
111
+ let tally = 0;
112
+ let found = 0;
113
+ commonsOutputDoing(`Detecting DONE status code between ${gt}<${lt} orphan urls`);
114
+ while (true) {
115
+ const urls = this.databaseService.getUrls()
116
+ .find({
117
+ $and: [
118
+ { status: EStatus.DONE },
119
+ { statusCode: { $gt: gt } },
120
+ { statusCode: { $lt: lt } },
121
+ { $or: [
122
+ { orphan: null },
123
+ { orphan: false }
124
+ ] }
125
+ ]
126
+ }, {})
127
+ .sort({ _id: 1 })
128
+ .skip(tally);
129
+ try {
130
+ while (true) {
131
+ tally++;
132
+ if ((tally % 100) === 0)
133
+ commonsOutputProgress(`${tally} urls, ${found} orphans`);
134
+ const row = yield urls.next();
135
+ if (row === null)
136
+ break;
137
+ const incoming = yield this.databaseService.getLinks().find({ outgoing: row.url }).count();
138
+ if (incoming > 0)
139
+ continue;
140
+ // eslint-disable-next-line @typescript-eslint/no-unsafe-assignment
141
+ yield this.databaseService.getUrls().updateOne({ _id: row['_id'] }, { $set: { orphan: true } });
142
+ found++;
143
+ }
144
+ break;
145
+ }
146
+ catch (err) {
147
+ if (!commonsTypeHasPropertyNumber(err, 'code') || err.code !== 43)
148
+ throw err;
128
149
  }
129
- break;
130
150
  }
131
- catch (err) {
132
- if (err.code !== 43)
133
- throw err;
151
+ commonsOutputResult(`${tally} urls, ${found} orphans`);
152
+ });
153
+ }
154
+ purgeOrphanUrls() {
155
+ return __awaiter(this, void 0, void 0, function* () {
156
+ yield this.detectStatusOrphans([
157
+ EStatus.DENY,
158
+ EStatus.FAILED,
159
+ EStatus.DEAD,
160
+ EStatus.DISALLOWED
161
+ ]);
162
+ yield this.detectNonAllowlistOrphans([
163
+ EStatus.QUEUED,
164
+ EStatus.DONE
165
+ ]);
166
+ yield this.detectStatusCodeOrphans(300, 310);
167
+ yield this.detectStatusCodeOrphans(400, 500);
168
+ let tally = 0;
169
+ commonsOutputDoing('Removing orphan outgoing links');
170
+ while (true) {
171
+ const urls2 = this.databaseService.getUrls()
172
+ .find({ orphan: true }, {})
173
+ .sort({ _id: 1 })
174
+ .skip(tally);
175
+ try {
176
+ while (true) {
177
+ tally++;
178
+ if ((tally % 100) === 0)
179
+ commonsOutputProgress(tally);
180
+ const row = yield urls2.next();
181
+ if (row === null)
182
+ break;
183
+ yield this.databaseService.getLinks().deleteMany({ url: row.url });
184
+ }
185
+ break;
186
+ }
187
+ catch (err) {
188
+ if (!commonsTypeHasPropertyNumber(err, 'code') || err.code !== 43)
189
+ throw err;
190
+ }
134
191
  }
135
- }
136
- nodecommons_cli_1.CommonsOutput.result(`${tally} urls, ${found} orphans`);
192
+ commonsOutputResult(tally);
193
+ commonsOutputDoing('Archiving orphans');
194
+ yield this.databaseService.getUrls().updateMany({ orphan: true }, {
195
+ $set: { status: EStatus.ARCHIVED },
196
+ $unset: {
197
+ orphan: true,
198
+ headers: true,
199
+ server: true,
200
+ hash: true,
201
+ hashSet: true,
202
+ ttl: true,
203
+ reason: true,
204
+ links: true,
205
+ title: true
206
+ }
207
+ });
208
+ commonsOutputSuccess();
209
+ });
137
210
  }
138
- async purgeOrphanUrls() {
139
- await this.detectStatusOrphans([
140
- hydra_crawler_ts_assets_1.EStatus.DENY,
141
- hydra_crawler_ts_assets_1.EStatus.FAILED,
142
- hydra_crawler_ts_assets_1.EStatus.DEAD,
143
- hydra_crawler_ts_assets_1.EStatus.DISALLOWED
144
- ]);
145
- await this.detectNonAllowlistOrphans([
146
- hydra_crawler_ts_assets_1.EStatus.QUEUED,
147
- hydra_crawler_ts_assets_1.EStatus.DONE
148
- ]);
149
- await this.detectStatusCodeOrphans(300, 310);
150
- await this.detectStatusCodeOrphans(400, 500);
151
- let tally = 0;
152
- nodecommons_cli_1.CommonsOutput.doing(`Removing orphan outgoing links`);
153
- while (true) {
154
- const urls2 = this.databaseService.getUrls()
155
- .find({ orphan: true })
156
- .sort({ _id: 1 })
157
- .skip(tally);
211
+ purgeEmptyDomains() {
212
+ return __awaiter(this, void 0, void 0, function* () {
213
+ let tally = 0;
214
+ let found = 0;
215
+ commonsOutputDoing('Enumerating domains');
216
+ const results = this.databaseService.getDomains().find({}, {});
217
+ const domains = [];
158
218
  try {
219
+ tally = 0;
159
220
  while (true) {
160
221
  tally++;
161
222
  if ((tally % 100) === 0)
162
- nodecommons_cli_1.CommonsOutput.progress(tally);
163
- const row = await urls2.next();
223
+ commonsOutputProgress(tally);
224
+ const row = yield results.next();
164
225
  if (row === null)
165
226
  break;
166
- await this.databaseService.getLinks().deleteMany({ url: row.url });
227
+ domains.push(row.domain);
167
228
  }
168
- break;
169
229
  }
170
230
  catch (err) {
171
- if (err.code !== 43)
231
+ if (!commonsTypeHasPropertyNumber(err, 'code') || err.code !== 43)
172
232
  throw err;
173
233
  }
174
- }
175
- nodecommons_cli_1.CommonsOutput.result(tally);
176
- nodecommons_cli_1.CommonsOutput.doing(`Archiving orphans`);
177
- await this.databaseService.getUrls().updateMany({ orphan: true }, {
178
- $set: { status: hydra_crawler_ts_assets_1.EStatus.ARCHIVED },
179
- $unset: {
180
- orphan: true,
181
- headers: true,
182
- server: true,
183
- hash: true,
184
- hashSet: true,
185
- ttl: true,
186
- reason: true,
187
- links: true,
188
- title: true
189
- }
190
- });
191
- nodecommons_cli_1.CommonsOutput.success();
192
- }
193
- async purgeEmptyDomains() {
194
- let tally = 0;
195
- let found = 0;
196
- nodecommons_cli_1.CommonsOutput.doing('Enumerating domains');
197
- const results = this.databaseService.getDomains().find({});
198
- const domains = [];
199
- try {
234
+ commonsOutputResult(tally);
200
235
  tally = 0;
201
- while (true) {
236
+ found = 0;
237
+ commonsOutputDoing('Detecting and removing empty domains');
238
+ for (const domain of domains) {
202
239
  tally++;
203
- if ((tally % 100) === 0)
204
- nodecommons_cli_1.CommonsOutput.progress(tally);
205
- const row = await results.next();
206
- if (row === null)
207
- break;
208
- domains.push(row.domain);
240
+ if ((tally % 10) === 0)
241
+ commonsOutputProgress(`${tally} domains, ${found} removed`);
242
+ const urls = yield this.databaseService.getUrls()
243
+ .find({
244
+ domain: domain,
245
+ status: { $ne: EStatus.ARCHIVED }
246
+ }, {})
247
+ .count();
248
+ if (urls > 0)
249
+ continue;
250
+ yield this.databaseService.getDomains().deleteOne({ domain: domain });
251
+ found++;
209
252
  }
210
- }
211
- catch (err) {
212
- if (err.code !== 43)
213
- throw err;
214
- }
215
- nodecommons_cli_1.CommonsOutput.result(tally);
216
- tally = 0;
217
- found = 0;
218
- nodecommons_cli_1.CommonsOutput.doing('Detecting and removing empty domains');
219
- for (const domain of domains) {
220
- tally++;
221
- if ((tally % 10) === 0)
222
- nodecommons_cli_1.CommonsOutput.progress(`${tally} domains, ${found} removed`);
223
- const urls = await this.databaseService.getUrls()
224
- .find({
225
- domain: domain,
226
- status: { $ne: hydra_crawler_ts_assets_1.EStatus.ARCHIVED }
227
- })
228
- .count();
229
- if (urls > 0)
230
- continue;
231
- await this.databaseService.getDomains().deleteOne({ domain: domain });
232
- found++;
233
- }
234
- nodecommons_cli_1.CommonsOutput.result(`${tally} domains, ${found} removed`);
253
+ commonsOutputResult(`${tally} domains, ${found} removed`);
254
+ });
235
255
  }
236
256
  }
237
- exports.Cleaner = Cleaner;
257
+ //# sourceMappingURL=cleaner.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"cleaner.js","sourceRoot":"","sources":["../../src/classes/cleaner.ts"],"names":[],"mappings":";;;;;;;;;AAEA,OAAO,EAAE,4BAA4B,EAAE,MAAM,mBAAmB,CAAC;AAGjE,OAAO,EAAE,OAAO,EAAE,MAAM,yBAAyB,CAAC;AAGlD,OAAO,EAAE,kBAAkB,EAAE,qBAAqB,EAAE,mBAAmB,EAAE,oBAAoB,EAAE,MAAM,oBAAoB,CAAC;AAM1H,OAAO,EAAE,KAAK,EAAE,MAAM,gBAAgB,CAAC;AAEvC,MAAM,OAAO,OAAO;IACnB,YACU,KAAY,EACZ,eAAgC;QADhC,UAAK,GAAL,KAAK,CAAO;QACZ,oBAAe,GAAf,eAAe,CAAiB;IACvC,CAAC;IAEU,mBAAmB,CAAC,QAAmB;;YACpD,IAAI,CAAC,IAAI,CAAC,eAAe;gBAAE,MAAM,IAAI,KAAK,CAAC,uCAAuC,CAAC,CAAC;YAEpF,IAAI,KAAK,GAAW,CAAC,CAAC;YACtB,IAAI,KAAK,GAAW,CAAC,CAAC;YAEtB,kBAAkB,CAAC,aAAa,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;YAEnE,OAAO,IAAI,EAAE;gBACZ,MAAM,IAAI,GAAiB,IAAI,CAAC,eAAe,CAAC,OAAO,EAAE;qBACtD,IAAI,CACH;oBACE,IAAI,EAAE;wBACJ,EAAE,MAAM,EAAE,EAAE,GAAG,EAAE,QAAQ,EAAE,EAAE;wBAC7B,EAAE,GAAG,EAAE;gCACL,EAAE,MAAM,EAAE,IAAI,EAAE;gCAChB,EAAE,MAAM,EAAE,KAAK,EAAE;6BAClB,EAAE;qBACJ;iBACF,EACD,EAAE,CACH;qBACA,IAAI,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,CAAC;qBAChB,IAAI,CAAC,KAAK,CAAC,CAAC;gBAEf,IAAI;oBACH,OAAO,IAAI,EAAE;wBACZ,KAAK,EAAE,CAAC;wBACR,IAAI,CAAC,KAAK,GAAG,GAAG,CAAC,KAAK,CAAC;4BAAE,qBAAqB,CAAC,GAAG,KAAK,UAAU,KAAK,UAAU,CAAC,CAAC;wBAElF,MAAM,GAAG,GAAc,MAAM,IAAI,CAAC,IAAI,EAAE,CAAC;wBACzC,IAAI,GAAG,KAAK,IAAI;4BAAE,MAAM;wBAExB,MAAM,QAAQ,GAAW,MAAM,IAAI,CAAC,eAAe,CAAC,QAAQ,EAAE,CAAC,IAAI,CAAC,EAAE,QAAQ,EAAE,GAAG,CAAC,GAAG,EAAE,CAAC,CAAC,KAAK,EAAE,CAAC;wBACnG,IAAI,QAAQ,GAAG,CAAC;4BAAE,SAAS;wBAE3B,mEAAmE;wBACnE,MAAM,IAAI,CAAC,eAAe,CAAC,OAAO,EAAE,CAAC,SAAS,CAAC,EAAE,GAAG,EAAE,GAAG,CAAC,KAAK,CAAC,EAAE,EAAE,EAAE,IAAI,EAAE,EAAE,MAAM,EAAE,IAAI,EAAE,EAAE,CAAC,CAAC;wBAChG,KAAK,EAAE,CAAC;qBACR;oBAED,MAAM;iBACN;gBAAC,OAAO,GAAG,EAAE;oBACb,IAAI,CAAC,4BAA4B,CAAC,GAAG,EAAE,MAAM,CAAC,IAAI,GAAG,CAAC,IAAI,KAAK,EAAE;wBAAE,MAAM,GAAG,CAAC;iBAC7E;aACD;YACD,mBAAmB,CAAC,GAAG,KAAK,UAAU,KAAK,UAAU,CAAC,CAAC;QACxD,CAAC;KAAA;IAEa,yBAAyB,CAAC,QAAmB;;YAC1D,IAAI,KAAK,GAAW,CAAC,CAAC;YACtB,IAAI,KAAK,GAAW,CAAC,CAAC;YAEtB,kBAAkB,CAAC,2BAA2B,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;YAEjF,OAAO,IAAI,EAAE;gBACZ,MAAM,IAAI,GAAiB,IAAI,CAAC,eAAe,CAAC,OAAO,EAAE;qBACtD,IAAI,CACH;oBACE,IAAI,EAAE;wBACJ,EAAE,MAAM,EAAE,EAAE,GAAG,EAAE,QAAQ,EAAE,EAAE;wBAC7B,EAAE,GAAG,EAAE;gCACL,EAAE,MAAM,EAAE,IAAI,EAAE;gCAChB,EAAE,MAAM,EAAE,KAAK,EAAE;6BAClB,EAAE;qBACJ;iBACF,EACD,EAAE,CACH;qBACA,IAAI,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,CAAC;qBAChB,IAAI,CAAC,KAAK,CAAC,CAAC;gBAEf,IAAI;oBACH,OAAO,IAAI,EAAE;wBACZ,KAAK,EAAE,CAAC;wBACR,IAAI,CAAC,KAAK,GAAG,GAAG,CAAC,KAAK,CAAC;4BAAE,qBAAqB,CAAC,GAAG,KAAK,UAAU,KAAK,UAAU,CAAC,CAAC;wBAElF,MAAM,GAAG,GAAc,MAAM,IAAI,CAAC,IAAI,EAAE,CAAC;wBACzC,IAAI,GAAG,KAAK,IAAI;4BAAE,MAAM;wBAExB,IAAI,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,KAAK,CAAC,KAAK,EAAE,GAAG,CAAC,GAAG,CAAC;4BAAE,SAAS;wBAErD,MAAM,QAAQ,GAAW,MAAM,IAAI,CAAC,eAAe,CAAC,QAAQ,EAAE,CAAC,IAAI,CAAC,EAAE,QAAQ,EAAE,GAAG,CAAC,GAAG,EAAE,CAAC,CAAC,KAAK,EAAE,CAAC;wBACnG,IAAI,QAAQ,GAAG,CAAC;4BAAE,SAAS;wBAE3B,mEAAmE;wBACnE,MAAM,IAAI,CAAC,eAAe,CAAC,OAAO,EAAE,CAAC,SAAS,CAAC,EAAE,GAAG,EAAE,GAAG,CAAC,KAAK,CAAC,EAAE,EAAE,EAAE,IAAI,EAAE,EAAE,MAAM,EAAE,IAAI,EAAE,EAAE,CAAC,CAAC;wBAChG,KAAK,EAAE,CAAC;qBACR;oBAED,MAAM;iBACN;gBAAC,OAAO,GAAG,EAAE;oBACb,IAAI,CAAC,4BAA4B,CAAC,GAAG,EAAE,MAAM,CAAC,IAAI,GAAG,CAAC,IAAI,KAAK,EAAE;wBAAE,MAAM,GAAG,CAAC;iBAC7E;aACD;YACD,mBAAmB,CAAC,GAAG,KAAK,UAAU,KAAK,UAAU,CAAC,CAAC;QACxD,CAAC;KAAA;IAEa,uBAAuB,CAAC,EAAU,EAAE,EAAU;;YAC3D,IAAI,KAAK,GAAW,CAAC,CAAC;YACtB,IAAI,KAAK,GAAW,CAAC,CAAC;YAEtB,kBAAkB,CAAC,sCAAsC,EAAE,IAAI,EAAE,cAAc,CAAC,CAAC;YAEjF,OAAO,IAAI,EAAE;gBACZ,MAAM,IAAI,GAAiB,IAAI,CAAC,eAAe,CAAC,OAAO,EAAE;qBACtD,IAAI,CACH;oBACE,IAAI,EAAE;wBACJ,EAAE,MAAM,EAAE,OAAO,CAAC,IAAI,EAAE;wBACxB,EAAE,UAAU,EAAE,EAAE,GAAG,EAAE,EAAE,EAAE,EAAE;wBAC3B,EAAE,UAAU,EAAE,EAAE,GAAG,EAAE,EAAE,EAAE,EAAE;wBAC3B,EAAE,GAAG,EAAE;gCACL,EAAE,MAAM,EAAE,IAAI,EAAE;gCAChB,EAAE,MAAM,EAAE,KAAK,EAAE;6BAClB,EAAE;qBACJ;iBACF,EACD,EAAE,CACH;qBACA,IAAI,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,CAAC;qBAChB,IAAI,CAAC,KAAK,CAAC,CAAC;gBAEf,IAAI;oBACH,OAAO,IAAI,EAAE;wBACZ,KAAK,EAAE,CAAC;wBACR,IAAI,CAAC,KAAK,GAAG,GAAG,CAAC,KAAK,CAAC;4BAAE,qBAAqB,CAAC,GAAG,KAAK,UAAU,KAAK,UAAU,CAAC,CAAC;wBAElF,MAAM,GAAG,GAAc,MAAM,IAAI,CAAC,IAAI,EAAE,CAAC;wBACzC,IAAI,GAAG,KAAK,IAAI;4BAAE,MAAM;wBAExB,MAAM,QAAQ,GAAW,MAAM,IAAI,CAAC,eAAe,CAAC,QAAQ,EAAE,CAAC,IAAI,CAAC,EAAE,QAAQ,EAAE,GAAG,CAAC,GAAG,EAAE,CAAC,CAAC,KAAK,EAAE,CAAC;wBACnG,IAAI,QAAQ,GAAG,CAAC;4BAAE,SAAS;wBAE3B,mEAAmE;wBACnE,MAAM,IAAI,CAAC,eAAe,CAAC,OAAO,EAAE,CAAC,SAAS,CAAC,EAAE,GAAG,EAAE,GAAG,CAAC,KAAK,CAAC,EAAE,EAAE,EAAE,IAAI,EAAE,EAAE,MAAM,EAAE,IAAI,EAAE,EAAE,CAAC,CAAC;wBAChG,KAAK,EAAE,CAAC;qBACR;oBAED,MAAM;iBACN;gBAAC,OAAO,GAAG,EAAE;oBACb,IAAI,CAAC,4BAA4B,CAAC,GAAG,EAAE,MAAM,CAAC,IAAI,GAAG,CAAC,IAAI,KAAK,EAAE;wBAAE,MAAM,GAAG,CAAC;iBAC7E;aACD;YAED,mBAAmB,CAAC,GAAG,KAAK,UAAU,KAAK,UAAU,CAAC,CAAC;QACxD,CAAC;KAAA;IAEY,eAAe;;YAC3B,MAAM,IAAI,CAAC,mBAAmB,CAAC;gBAC7B,OAAO,CAAC,IAAI;gBACZ,OAAO,CAAC,MAAM;gBACd,OAAO,CAAC,IAAI;gBACZ,OAAO,CAAC,UAAU;aACnB,CAAC,CAAC;YAEH,MAAM,IAAI,CAAC,yBAAyB,CAAC;gBACnC,OAAO,CAAC,MAAM;gBACd,OAAO,CAAC,IAAI;aACb,CAAC,CAAC;YAEH,MAAM,IAAI,CAAC,uBAAuB,CAAC,GAAG,EAAE,GAAG,CAAC,CAAC;YAC7C,MAAM,IAAI,CAAC,uBAAuB,CAAC,GAAG,EAAE,GAAG,CAAC,CAAC;YAE7C,IAAI,KAAK,GAAW,CAAC,CAAC;YAEtB,kBAAkB,CAAC,gCAAgC,CAAC,CAAC;YACrD,OAAO,IAAI,EAAE;gBACZ,MAAM,KAAK,GAAiB,IAAI,CAAC,eAAe,CAAC,OAAO,EAAE;qBACvD,IAAI,CAAO,EAAE,MAAM,EAAE,IAAI,EAAE,EAAE,EAAE,CAAC;qBAChC,IAAI,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,CAAC;qBAChB,IAAI,CAAC,KAAK,CAAC,CAAC;gBAEf,IAAI;oBACH,OAAO,IAAI,EAAE;wBACZ,KAAK,EAAE,CAAC;wBACR,IAAI,CAAC,KAAK,GAAG,GAAG,CAAC,KAAK,CAAC;4BAAE,qBAAqB,CAAC,KAAK,CAAC,CAAC;wBAEtD,MAAM,GAAG,GAAc,MAAM,KAAK,CAAC,IAAI,EAAE,CAAC;wBAC1C,IAAI,GAAG,KAAK,IAAI;4BAAE,MAAM;wBAExB,MAAM,IAAI,CAAC,eAAe,CAAC,QAAQ,EAAE,CAAC,UAAU,CAAC,EAAE,GAAG,EAAE,GAAG,CAAC,GAAG,EAAE,CAAC,CAAC;qBACnE;oBAED,MAAM;iBACN;gBAAC,OAAO,GAAG,EAAE;oBACb,IAAI,CAAC,4BAA4B,CAAC,GAAG,EAAE,MAAM,CAAC,IAAI,GAAG,CAAC,IAAI,KAAK,EAAE;wBAAE,MAAM,GAAG,CAAC;iBAC7E;aACD;YACD,mBAAmB,CAAC,KAAK,CAAC,CAAC;YAE3B,kBAAkB,CAAC,mBAAmB,CAAC,CAAC;YACxC,MAAM,IAAI,CAAC,eAAe,CAAC,OAAO,EAAE,CAAC,UAAU,CAC7C,EAAE,MAAM,EAAE,IAAI,EAAE,EAChB;gBACE,IAAI,EAAE,EAAE,MAAM,EAAE,OAAO,CAAC,QAAQ,EAAE;gBAClC,MAAM,EAAE;oBACN,MAAM,EAAE,IAAI;oBACZ,OAAO,EAAE,IAAI;oBACb,MAAM,EAAE,IAAI;oBACZ,IAAI,EAAE,IAAI;oBACV,OAAO,EAAE,IAAI;oBACb,GAAG,EAAE,IAAI;oBACT,MAAM,EAAE,IAAI;oBACZ,KAAK,EAAE,IAAI;oBACX,KAAK,EAAE,IAAI;iBACZ;aACF,CACF,CAAC;YACF,oBAAoB,EAAE,CAAC;QACxB,CAAC;KAAA;IAEY,iBAAiB;;YAC7B,IAAI,KAAK,GAAW,CAAC,CAAC;YACtB,IAAI,KAAK,GAAW,CAAC,CAAC;YAEtB,kBAAkB,CAAC,qBAAqB,CAAC,CAAC;YAE1C,MAAM,OAAO,GAAoB,IAAI,CAAC,eAAe,CAAC,UAAU,EAAE,CAAC,IAAI,CAAU,EAAE,EAAE,EAAE,CAAC,CAAC;YAEzF,MAAM,OAAO,GAAa,EAAE,CAAC;YAC7B,IAAI;gBACH,KAAK,GAAG,CAAC,CAAC;gBAEV,OAAO,IAAI,EAAE;oBACZ,KAAK,EAAE,CAAC;oBACR,IAAI,CAAC,KAAK,GAAG,GAAG,CAAC,KAAK,CAAC;wBAAE,qBAAqB,CAAC,KAAK,CAAC,CAAC;oBAEtD,MAAM,GAAG,GAAiB,MAAM,OAAO,CAAC,IAAI,EAAE,CAAC;oBAC/C,IAAI,GAAG,KAAK,IAAI;wBAAE,MAAM;oBAExB,OAAO,CAAC,IAAI,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC;iBACzB;aACD;YAAC,OAAO,GAAG,EAAE;gBACb,IAAI,CAAC,4BAA4B,CAAC,GAAG,EAAE,MAAM,CAAC,IAAI,GAAG,CAAC,IAAI,KAAK,EAAE;oBAAE,MAAM,GAAG,CAAC;aAC7E;YAED,mBAAmB,CAAC,KAAK,CAAC,CAAC;YAE3B,KAAK,GAAG,CAAC,CAAC;YAAC,KAAK,GAAG,CAAC,CAAC;YACrB,kBAAkB,CAAC,sCAAsC,CAAC,CAAC;YAC3D,KAAK,MAAM,MAAM,IAAI,OAAO,EAAE;gBAC7B,KAAK,EAAE,CAAC;gBACR,IAAI,CAAC,KAAK,GAAG,EAAE,CAAC,KAAK,CAAC;oBAAE,qBAAqB,CAAC,GAAG,KAAK,aAAa,KAAK,UAAU,CAAC,CAAC;gBAEpF,MAAM,IAAI,GAAW,MAAM,IAAI,CAAC,eAAe,CAAC,OAAO,EAAE;qBACtD,IAAI,CAAO;oBACV,MAAM,EAAE,MAAM;oBACd,MAAM,EAAE,EAAE,GAAG,EAAE,OAAO,CAAC,QAAQ,EAAE;iBAClC,EAAE,EAAE,CAAC;qBACL,KAAK,EAAE,CAAC;gBAEX,IAAI,IAAI,GAAG,CAAC;oBAAE,SAAS;gBAEvB,MAAM,IAAI,CAAC,eAAe,CAAC,UAAU,EAAE,CAAC,SAAS,CAAC,EAAE,MAAM,EAAE,MAAM,EAAE,CAAC,CAAC;gBAEtE,KAAK,EAAE,CAAC;aACR;YACD,mBAAmB,CAAC,GAAG,KAAK,aAAa,KAAK,UAAU,CAAC,CAAC;QAC3D,CAAC;KAAA;CAED"}
@@ -0,0 +1,34 @@
1
+ /// <reference types="node" />
2
+ /// <reference types="node" />
3
+ import * as http from 'http';
4
+ import * as https from 'https';
5
+ import { TKeyObject } from 'tscommons-es-core';
6
+ import { Lists } from '../classes/lists';
7
+ import { Tracker } from '../classes/tracker';
8
+ import { DatabaseService } from '../services/database.service';
9
+ import { IRequestOutcome } from '../interfaces/irequest-outcome';
10
+ import { IParserConfig } from '../interfaces/iparser-config';
11
+ import { TCrawlConfig } from '../types/tcrawl-config';
12
+ import { TRobotsConfig } from '../types/trobots-config';
13
+ import { TParserCtor } from '../types/tparser-ctor';
14
+ export declare class Crawler {
15
+ private domain;
16
+ private database;
17
+ private crawlConfig;
18
+ private parsersConfig;
19
+ private robotsConfig;
20
+ private parsers;
21
+ private lists;
22
+ private tracker?;
23
+ static applyMasqueradeHeaders(request: http.ClientRequest): void;
24
+ static request(handler: typeof http | typeof https, url: string, connectTimeout: number, maxFileSize: number, tracker?: Tracker): Promise<IRequestOutcome>;
25
+ private static pruneHeaders;
26
+ private isPaused;
27
+ private isAborted;
28
+ constructor(domain: string, database: DatabaseService, crawlConfig: TCrawlConfig, parsersConfig: TKeyObject<IParserConfig>, robotsConfig: TRobotsConfig, parsers: TParserCtor[], lists: Lists, tracker?: Tracker | undefined);
29
+ private abort;
30
+ pause(): void;
31
+ resume(): void;
32
+ fetch(url: string, setDomainIp: boolean): Promise<number>;
33
+ crawl(): Promise<void>;
34
+ }