hydra-crawler 1.4.6 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (305) hide show
  1. package/dist/apis/autocomplete.api.d.ts +7 -0
  2. package/dist/apis/autocomplete.api.js +15 -9
  3. package/dist/apis/autocomplete.api.js.map +1 -0
  4. package/dist/apis/bugs.api.d.ts +7 -0
  5. package/dist/apis/bugs.api.js +21 -15
  6. package/dist/apis/bugs.api.js.map +1 -0
  7. package/dist/apis/crawl.api.d.ts +7 -0
  8. package/dist/apis/crawl.api.js +15 -9
  9. package/dist/apis/crawl.api.js.map +1 -0
  10. package/dist/apis/domains.api.d.ts +7 -0
  11. package/dist/apis/domains.api.js +24 -19
  12. package/dist/apis/domains.api.js.map +1 -0
  13. package/dist/apis/images.api.d.ts +7 -0
  14. package/dist/apis/images.api.js +20 -14
  15. package/dist/apis/images.api.js.map +1 -0
  16. package/dist/apis/statistics.api.d.ts +8 -0
  17. package/dist/apis/statistics.api.js +27 -20
  18. package/dist/apis/statistics.api.js.map +1 -0
  19. package/dist/apis/test.api.d.ts +5 -0
  20. package/dist/apis/test.api.js +15 -9
  21. package/dist/apis/test.api.js.map +1 -0
  22. package/dist/apis/urls.api.d.ts +7 -0
  23. package/dist/apis/urls.api.js +21 -15
  24. package/dist/apis/urls.api.js.map +1 -0
  25. package/dist/apps/cleanup.app.d.ts +19 -0
  26. package/dist/apps/cleanup.app.js +118 -100
  27. package/dist/apps/cleanup.app.js.map +1 -0
  28. package/dist/apps/cross-populate-export.app.d.ts +12 -0
  29. package/dist/apps/cross-populate-export.app.js +60 -47
  30. package/dist/apps/cross-populate-export.app.js.map +1 -0
  31. package/dist/apps/cross-populate-import.app.d.ts +12 -0
  32. package/dist/apps/cross-populate-import.app.js +64 -51
  33. package/dist/apps/cross-populate-import.app.js.map +1 -0
  34. package/dist/apps/denylist.app.d.ts +17 -0
  35. package/dist/apps/denylist.app.js +115 -98
  36. package/dist/apps/denylist.app.js.map +1 -0
  37. package/dist/apps/expire.app.d.ts +19 -0
  38. package/dist/apps/expire.app.js +44 -31
  39. package/dist/apps/expire.app.js.map +1 -0
  40. package/dist/apps/extract-text.app.d.ts +8 -0
  41. package/dist/apps/extract-text.app.js +43 -35
  42. package/dist/apps/extract-text.app.js.map +1 -0
  43. package/dist/apps/hydra.app.d.ts +34 -0
  44. package/dist/apps/hydra.app.js +150 -137
  45. package/dist/apps/hydra.app.js.map +1 -0
  46. package/dist/apps/import.app.d.ts +11 -0
  47. package/dist/apps/import.app.js +44 -32
  48. package/dist/apps/import.app.js.map +1 -0
  49. package/dist/apps/internal-hydra-common.app.d.ts +28 -0
  50. package/dist/apps/internal-hydra-common.app.js +5 -11
  51. package/dist/apps/internal-hydra-common.app.js.map +1 -0
  52. package/dist/apps/query.app.d.ts +20 -0
  53. package/dist/apps/query.app.js +63 -49
  54. package/dist/apps/query.app.js.map +1 -0
  55. package/dist/apps/reattempt.app.d.ts +17 -0
  56. package/dist/apps/reattempt.app.js +66 -53
  57. package/dist/apps/reattempt.app.js.map +1 -0
  58. package/dist/apps/requeue-domain.app.d.ts +13 -0
  59. package/dist/apps/requeue-domain.app.js +50 -37
  60. package/dist/apps/requeue-domain.app.js.map +1 -0
  61. package/dist/apps/seed.app.d.ts +15 -0
  62. package/dist/apps/seed.app.js +53 -40
  63. package/dist/apps/seed.app.js.map +1 -0
  64. package/dist/apps/startup.app.d.ts +11 -0
  65. package/dist/apps/startup.app.js +51 -38
  66. package/dist/apps/startup.app.js.map +1 -0
  67. package/dist/apps/unarchive.app.d.ts +15 -0
  68. package/dist/apps/unarchive.app.js +67 -54
  69. package/dist/apps/unarchive.app.js.map +1 -0
  70. package/dist/classes/cleaner.d.ts +12 -0
  71. package/dist/classes/cleaner.js +227 -207
  72. package/dist/classes/cleaner.js.map +1 -0
  73. package/dist/classes/crawler.d.ts +34 -0
  74. package/dist/classes/crawler.js +248 -241
  75. package/dist/classes/crawler.js.map +1 -0
  76. package/dist/classes/dns.d.ts +3 -0
  77. package/dist/classes/dns.js +10 -13
  78. package/dist/classes/dns.js.map +1 -0
  79. package/dist/classes/expirer.d.ts +10 -0
  80. package/dist/classes/expirer.js +107 -94
  81. package/dist/classes/expirer.js.map +1 -0
  82. package/dist/classes/expiry.d.ts +8 -0
  83. package/dist/classes/expiry.js +16 -19
  84. package/dist/classes/expiry.js.map +1 -0
  85. package/dist/classes/lists.d.ts +9 -0
  86. package/dist/classes/lists.js +13 -18
  87. package/dist/classes/lists.js.map +1 -0
  88. package/dist/classes/robot.d.ts +15 -0
  89. package/dist/classes/robot.js +40 -30
  90. package/dist/classes/robot.js.map +1 -0
  91. package/dist/classes/tracker.d.ts +25 -0
  92. package/dist/classes/tracker.js +82 -64
  93. package/dist/classes/tracker.js.map +1 -0
  94. package/dist/cli.d.ts +1 -0
  95. package/dist/cli.js +72 -65
  96. package/dist/cli.js.map +1 -0
  97. package/dist/enums/eavailable-strategy.d.ts +4 -0
  98. package/dist/enums/eavailable-strategy.js +3 -5
  99. package/dist/enums/eavailable-strategy.js.map +1 -0
  100. package/dist/enums/elist.d.ts +7 -0
  101. package/dist/enums/elist.js +7 -11
  102. package/dist/enums/elist.js.map +1 -0
  103. package/dist/enums/eserver.d.ts +8 -0
  104. package/dist/enums/eserver.js +3 -5
  105. package/dist/enums/eserver.js.map +1 -0
  106. package/dist/enums/ex-powered-by.d.ts +6 -0
  107. package/dist/enums/ex-powered-by.js +3 -5
  108. package/dist/enums/ex-powered-by.js.map +1 -0
  109. package/dist/helpers/matcher.d.ts +5 -0
  110. package/dist/helpers/matcher.js +2 -5
  111. package/dist/helpers/matcher.js.map +1 -0
  112. package/dist/helpers/random.d.ts +4 -0
  113. package/dist/helpers/random.js +2 -5
  114. package/dist/helpers/random.js.map +1 -0
  115. package/dist/helpers/utf-decoder.d.ts +4 -0
  116. package/dist/helpers/utf-decoder.js +3 -6
  117. package/dist/helpers/utf-decoder.js.map +1 -0
  118. package/dist/interfaces/iexpiry.d.ts +7 -0
  119. package/dist/interfaces/iexpiry.js +9 -13
  120. package/dist/interfaces/iexpiry.js.map +1 -0
  121. package/dist/interfaces/imatch.d.ts +6 -0
  122. package/dist/interfaces/imatch.js +6 -9
  123. package/dist/interfaces/imatch.js.map +1 -0
  124. package/dist/interfaces/iparser-config.d.ts +4 -0
  125. package/dist/interfaces/iparser-config.js +4 -7
  126. package/dist/interfaces/iparser-config.js.map +1 -0
  127. package/dist/interfaces/iparser.d.ts +8 -0
  128. package/dist/interfaces/iparser.js +2 -2
  129. package/dist/interfaces/iparser.js.map +1 -0
  130. package/dist/interfaces/irequest-outcome.d.ts +11 -0
  131. package/dist/interfaces/irequest-outcome.js +2 -2
  132. package/dist/interfaces/irequest-outcome.js.map +1 -0
  133. package/dist/interfaces/iserver.d.ts +4 -0
  134. package/dist/interfaces/iserver.js +2 -2
  135. package/dist/interfaces/iserver.js.map +1 -0
  136. package/dist/parsers/accessibility-metrics.parser.d.ts +11 -0
  137. package/dist/parsers/accessibility-metrics.parser.js +34 -26
  138. package/dist/parsers/accessibility-metrics.parser.js.map +1 -0
  139. package/dist/parsers/asp-error.parser.d.ts +12 -0
  140. package/dist/parsers/asp-error.parser.js +36 -28
  141. package/dist/parsers/asp-error.parser.js.map +1 -0
  142. package/dist/parsers/bad-words.parser.d.ts +10 -0
  143. package/dist/parsers/bad-words.parser.js +21 -13
  144. package/dist/parsers/bad-words.parser.js.map +1 -0
  145. package/dist/parsers/complex-english.parser.d.ts +15 -0
  146. package/dist/parsers/complex-english.parser.js +33 -25
  147. package/dist/parsers/complex-english.parser.js.map +1 -0
  148. package/dist/parsers/data.parser.d.ts +14 -0
  149. package/dist/parsers/data.parser.js +12 -16
  150. package/dist/parsers/data.parser.js.map +1 -0
  151. package/dist/parsers/dictionary.parser.d.ts +19 -0
  152. package/dist/parsers/dictionary.parser.js +47 -39
  153. package/dist/parsers/dictionary.parser.js.map +1 -0
  154. package/dist/parsers/html.parser.d.ts +13 -0
  155. package/dist/parsers/html.parser.js +4 -8
  156. package/dist/parsers/html.parser.js.map +1 -0
  157. package/dist/parsers/hyperlinks.parser.d.ts +20 -0
  158. package/dist/parsers/hyperlinks.parser.js +82 -77
  159. package/dist/parsers/hyperlinks.parser.js.map +1 -0
  160. package/dist/parsers/image-tags.parser.d.ts +20 -0
  161. package/dist/parsers/image-tags.parser.js +38 -34
  162. package/dist/parsers/image-tags.parser.js.map +1 -0
  163. package/dist/parsers/jpeg.parser.d.ts +11 -0
  164. package/dist/parsers/jpeg.parser.js +28 -20
  165. package/dist/parsers/jpeg.parser.js.map +1 -0
  166. package/dist/parsers/paragraphs.parser.d.ts +13 -0
  167. package/dist/parsers/paragraphs.parser.js +33 -40
  168. package/dist/parsers/paragraphs.parser.js.map +1 -0
  169. package/dist/parsers/parser.d.ts +19 -0
  170. package/dist/parsers/parser.js +30 -17
  171. package/dist/parsers/parser.js.map +1 -0
  172. package/dist/parsers/php-error.parser.d.ts +12 -0
  173. package/dist/parsers/php-error.parser.js +42 -34
  174. package/dist/parsers/php-error.parser.js.map +1 -0
  175. package/dist/parsers/phrase.parser.d.ts +8 -0
  176. package/dist/parsers/phrase.parser.js +16 -11
  177. package/dist/parsers/phrase.parser.js.map +1 -0
  178. package/dist/parsers/regex.parser.d.ts +10 -0
  179. package/dist/parsers/regex.parser.js +30 -22
  180. package/dist/parsers/regex.parser.js.map +1 -0
  181. package/dist/parsers/server.parser.d.ts +12 -0
  182. package/dist/parsers/server.parser.js +66 -56
  183. package/dist/parsers/server.parser.js.map +1 -0
  184. package/dist/parsers/spelling.parser.d.ts +10 -0
  185. package/dist/parsers/spelling.parser.js +21 -13
  186. package/dist/parsers/spelling.parser.js.map +1 -0
  187. package/dist/parsers/string.parser.d.ts +8 -0
  188. package/dist/parsers/string.parser.js +5 -8
  189. package/dist/parsers/string.parser.js.map +1 -0
  190. package/dist/parsers/text.parser.d.ts +8 -0
  191. package/dist/parsers/text.parser.js +24 -18
  192. package/dist/parsers/text.parser.js.map +1 -0
  193. package/dist/parsers/words.parser.d.ts +11 -0
  194. package/dist/parsers/words.parser.js +32 -28
  195. package/dist/parsers/words.parser.js.map +1 -0
  196. package/dist/queries/complex-english.query.d.ts +2 -0
  197. package/dist/queries/complex-english.query.js +37 -38
  198. package/dist/queries/complex-english.query.js.map +1 -0
  199. package/dist/queries/flash-content.query.d.ts +2 -0
  200. package/dist/queries/flash-content.query.js +39 -30
  201. package/dist/queries/flash-content.query.js.map +1 -0
  202. package/dist/queries/linking-to-domains.query.d.ts +2 -0
  203. package/dist/queries/linking-to-domains.query.js +35 -27
  204. package/dist/queries/linking-to-domains.query.js.map +1 -0
  205. package/dist/queries/readability-score.query.d.ts +2 -0
  206. package/dist/queries/readability-score.query.js +21 -13
  207. package/dist/queries/readability-score.query.js.map +1 -0
  208. package/dist/servers/crawl.server.d.ts +35 -0
  209. package/dist/servers/crawl.server.js +133 -121
  210. package/dist/servers/crawl.server.js.map +1 -0
  211. package/dist/servers/express.server.d.ts +8 -0
  212. package/dist/servers/express.server.js +7 -10
  213. package/dist/servers/express.server.js.map +1 -0
  214. package/dist/servers/maintenance.server.d.ts +22 -0
  215. package/dist/servers/maintenance.server.js +42 -36
  216. package/dist/servers/maintenance.server.js.map +1 -0
  217. package/dist/servers/rest.server.d.ts +7 -0
  218. package/dist/servers/rest.server.js +40 -51
  219. package/dist/servers/rest.server.js.map +1 -0
  220. package/dist/servers/socket-io.server.d.ts +12 -0
  221. package/dist/servers/socket-io.server.js +48 -15
  222. package/dist/servers/socket-io.server.js.map +1 -0
  223. package/dist/services/database.service.d.ts +68 -0
  224. package/dist/services/database.service.js +528 -462
  225. package/dist/services/database.service.js.map +1 -0
  226. package/dist/types/tcrawl-config.d.ts +14 -0
  227. package/dist/types/tcrawl-config.js +14 -17
  228. package/dist/types/tcrawl-config.js.map +1 -0
  229. package/dist/types/thydra-config.d.ts +4 -0
  230. package/dist/types/thydra-config.js +4 -7
  231. package/dist/types/thydra-config.js.map +1 -0
  232. package/dist/types/tparser-ctor.d.ts +7 -0
  233. package/dist/types/tparser-ctor.js +2 -2
  234. package/dist/types/tparser-ctor.js.map +1 -0
  235. package/dist/types/tquery.d.ts +7 -0
  236. package/dist/types/tquery.js +2 -2
  237. package/dist/types/tquery.js.map +1 -0
  238. package/dist/types/trobots-config.d.ts +4 -0
  239. package/dist/types/trobots-config.js +4 -7
  240. package/dist/types/trobots-config.js.map +1 -0
  241. package/package.json +41 -29
  242. package/angular/10-es2015.bacd4ae5dd7913ce55f0.js +0 -1
  243. package/angular/10-es5.bacd4ae5dd7913ce55f0.js +0 -1
  244. package/angular/11-es2015.0f031dcf752d1e8eda6b.js +0 -1
  245. package/angular/11-es5.0f031dcf752d1e8eda6b.js +0 -1
  246. package/angular/3rdpartylicenses.txt +0 -1127
  247. package/angular/5-es2015.951498ca9c1bc74e57bf.js +0 -1
  248. package/angular/5-es5.951498ca9c1bc74e57bf.js +0 -1
  249. package/angular/6-es2015.65f680261a3506b88381.js +0 -1
  250. package/angular/6-es5.65f680261a3506b88381.js +0 -1
  251. package/angular/7-es2015.625197f3af1dbf3e805d.js +0 -1
  252. package/angular/7-es5.625197f3af1dbf3e805d.js +0 -1
  253. package/angular/8-es2015.55518901987a5b834309.js +0 -1
  254. package/angular/8-es5.55518901987a5b834309.js +0 -1
  255. package/angular/9-es2015.6cc9bde262564e7836f2.js +0 -1
  256. package/angular/9-es5.6cc9bde262564e7836f2.js +0 -1
  257. package/angular/Roboto-Black.41ed1105a6ebb8ffe34e.woff2 +0 -0
  258. package/angular/Roboto-Black.937491dfcbe64ca9a9f1.woff +0 -0
  259. package/angular/Roboto-BlackItalic.2e1ee657996854c6f427.woff +0 -0
  260. package/angular/Roboto-BlackItalic.50ca4c51ebc27e7e7d2f.woff2 +0 -0
  261. package/angular/Roboto-Bold.73288d91c325e82a5b92.woff +0 -0
  262. package/angular/Roboto-Bold.92fbd4e93cf0a5dbebaa.woff2 +0 -0
  263. package/angular/Roboto-BoldItalic.5f600d98a73d800ae575.woff2 +0 -0
  264. package/angular/Roboto-BoldItalic.6d89acbd21d7e3fbecb2.woff +0 -0
  265. package/angular/Roboto-Light.c27d89ac77468ae18f28.woff2 +0 -0
  266. package/angular/Roboto-Light.d923dfafc0c5183b59aa.woff +0 -0
  267. package/angular/Roboto-LightItalic.506274c7228cf81cae4d.woff2 +0 -0
  268. package/angular/Roboto-LightItalic.d4b8c137518d9d92bb28.woff +0 -0
  269. package/angular/Roboto-Medium.092c6130df8fd2199888.woff +0 -0
  270. package/angular/Roboto-Medium.1d3bced88509b0838984.woff2 +0 -0
  271. package/angular/Roboto-MediumItalic.18ff1628c628080166c1.woff +0 -0
  272. package/angular/Roboto-MediumItalic.d620b8f53f75966fe42e.woff2 +0 -0
  273. package/angular/Roboto-Regular.64cfb66c866ea50cad47.woff2 +0 -0
  274. package/angular/Roboto-Regular.e02e9d6ff5547f7e9962.woff +0 -0
  275. package/angular/Roboto-RegularItalic.4dd2af1e8df532f41db8.woff2 +0 -0
  276. package/angular/Roboto-RegularItalic.5ea38fff9eebef99c5df.woff +0 -0
  277. package/angular/Roboto-Thin.dbd56bd3357dc3617fe5.woff2 +0 -0
  278. package/angular/Roboto-Thin.e7f7c82374bd0ebef14b.woff +0 -0
  279. package/angular/Roboto-ThinItalic.5dd9349c940073834e9a.woff +0 -0
  280. package/angular/Roboto-ThinItalic.a8cef84f735ef887abdc.woff2 +0 -0
  281. package/angular/assets/config/app-config.json +0 -16
  282. package/angular/assets/images/splashbg.jpg +0 -0
  283. package/angular/assets/web-app-commons/fonts/material-icons/MaterialDesignIcons-Community-2.7.94.woff +0 -0
  284. package/angular/assets/web-app-commons/fonts/material-icons/MaterialDesignIcons-Community-2.7.94.woff2 +0 -0
  285. package/angular/assets/web-app-commons/fonts/material-icons/material-design-icons-community.css +0 -11293
  286. package/angular/favicon.ico +0 -0
  287. package/angular/flUhRq6tzZclQEJ-Vdg-IuiaDsNa.f2a0933406f783065152.woff +0 -0
  288. package/angular/flUhRq6tzZclQEJ-Vdg-IuiaDsNc.6467d9a24f234e8e8e07.woff2 +0 -0
  289. package/angular/index.html +0 -16
  290. package/angular/main-es2015.3a582572476c7f292e52.js +0 -1
  291. package/angular/main-es5.3a582572476c7f292e52.js +0 -1
  292. package/angular/polyfills-es2015.7df68534018bc2f6cb09.js +0 -1
  293. package/angular/polyfills-es5.e79468f406fae2989221.js +0 -1
  294. package/angular/runtime-es2015.6d2cff76cdb2790d3308.js +0 -1
  295. package/angular/runtime-es5.6d2cff76cdb2790d3308.js +0 -1
  296. package/angular/styles.c5c6c2534225b85c4ff0.css +0 -1
  297. package/config/bad-words.json +0 -1
  298. package/config/complex-english.json +0 -400
  299. package/config/hydra-auth.json +0 -8
  300. package/config/hydra-crawler.json +0 -84
  301. package/config/list-allow.json +0 -171
  302. package/config/list-deny.json +0 -248
  303. package/config/list-expiry.json +0 -7
  304. package/config/schedule.json +0 -25
  305. package/config/spelling.json +0 -1
@@ -0,0 +1,19 @@
1
+ import { TKeyObject } from 'tscommons-es-core';
2
+ import { DatabaseService } from '../services/database.service';
3
+ import { IParserConfig } from '../interfaces/iparser-config';
4
+ import { IRequestOutcome } from '../interfaces/irequest-outcome';
5
+ import { WordsParser, IWordsConfig } from './words.parser';
6
+ export interface IDictionaryConfig extends IWordsConfig {
7
+ dictionary: string;
8
+ caseSensitive?: boolean;
9
+ }
10
+ export declare function isIDictionaryConfig(test: unknown): test is IDictionaryConfig;
11
+ export declare abstract class DictionaryParser<T extends IDictionaryConfig> extends WordsParser<T> {
12
+ protected caseSensitive: boolean;
13
+ private dictionary;
14
+ private cacheId;
15
+ constructor(outcome?: IRequestOutcome, config?: TKeyObject<IParserConfig>, configKey?: string);
16
+ protected getDictionary(): string[];
17
+ protected abstract parseMatches(database: DatabaseService, matches: string[], nonMatches: string[]): Promise<void>;
18
+ protected parseWords(database: DatabaseService, words: string[]): Promise<void>;
19
+ }
@@ -1,22 +1,36 @@
1
- "use strict";
2
- Object.defineProperty(exports, "__esModule", { value: true });
3
- exports.DictionaryParser = exports.isIDictionaryConfig = void 0;
4
- const tscommons_core_1 = require("tscommons-core");
5
- const nodecommons_cli_1 = require("nodecommons-cli");
6
- const nodecommons_file_1 = require("nodecommons-file");
7
- const words_parser_1 = require("./words.parser");
8
- function isIDictionaryConfig(test) {
9
- if (!words_parser_1.isIWordsConfig(test))
1
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
2
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
3
+ return new (P || (P = Promise))(function (resolve, reject) {
4
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
5
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
6
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
7
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
8
+ });
9
+ };
10
+ import { commonsTypeHasPropertyBooleanOrUndefined, commonsTypeHasPropertyString, commonsTypeHasPropertyT, commonsTypeIsStringArray } from 'tscommons-es-core';
11
+ import { commonsOutputError } from 'nodecommons-es-cli';
12
+ import { commonsFileCwdRelativeOrAbsolutePath, commonsFileReadJsonFile } from 'nodecommons-es-file';
13
+ import { WordsParser, isIWordsConfig } from './words.parser';
14
+ export function isIDictionaryConfig(test) {
15
+ if (!isIWordsConfig(test))
10
16
  return false;
11
- if (!tscommons_core_1.CommonsType.hasPropertyString(test, 'dictionary'))
17
+ if (!commonsTypeHasPropertyString(test, 'dictionary'))
12
18
  return false;
13
- if (!tscommons_core_1.CommonsType.hasPropertyBooleanOrUndefined(test, 'caseSensitive'))
19
+ if (!commonsTypeHasPropertyBooleanOrUndefined(test, 'caseSensitive'))
14
20
  return false;
15
21
  return true;
16
22
  }
17
- exports.isIDictionaryConfig = isIDictionaryConfig;
18
23
  const CACHE = new Map();
19
- class DictionaryParser extends words_parser_1.WordsParser {
24
+ function loadDictionary(config) {
25
+ const filename = commonsFileCwdRelativeOrAbsolutePath(`config/${config.dictionary}`);
26
+ const json = commonsFileReadJsonFile(filename);
27
+ if (json === undefined)
28
+ throw new Error('Unable to read dictionary file');
29
+ if (!commonsTypeIsStringArray(json))
30
+ throw new Error('Dictionary file is not a JSON array');
31
+ return json;
32
+ }
33
+ export class DictionaryParser extends WordsParser {
20
34
  constructor(outcome, config, configKey) {
21
35
  super(outcome, config, configKey);
22
36
  this.caseSensitive = false;
@@ -24,42 +38,36 @@ class DictionaryParser extends words_parser_1.WordsParser {
24
38
  if (!config || !configKey)
25
39
  return;
26
40
  this.cacheId = configKey;
27
- if (!tscommons_core_1.CommonsType.hasPropertyT(config, configKey, isIDictionaryConfig)) {
28
- nodecommons_cli_1.CommonsOutput.error(`Invalid config for DictionaryParser (${configKey})`);
41
+ if (!commonsTypeHasPropertyT(config, configKey, isIDictionaryConfig)) {
42
+ commonsOutputError(`Invalid config for DictionaryParser (${configKey})`);
29
43
  }
30
44
  const dictionaryConfig = config[configKey];
31
45
  this.caseSensitive = dictionaryConfig.caseSensitive || false;
32
46
  if (!CACHE.has(this.cacheId)) {
33
- CACHE.set(this.cacheId, DictionaryParser.loadDictionary(dictionaryConfig));
47
+ const loaded = loadDictionary(dictionaryConfig);
48
+ CACHE.set(this.cacheId, loaded);
34
49
  }
35
50
  }
36
- static loadDictionary(config) {
37
- const filename = `${__dirname}/../../config/${config.dictionary}`;
38
- const json = nodecommons_file_1.CommonsFile.readJsonFile(filename);
39
- if (json === undefined)
40
- throw new Error('Unable to read dictionary file');
41
- if (!tscommons_core_1.CommonsType.isStringArray(json))
42
- throw new Error('Dictionary file is not a JSON array');
43
- return json;
44
- }
45
51
  getDictionary() {
46
52
  if (!this.cacheId)
47
53
  return [];
48
54
  return CACHE.get(this.cacheId) || [];
49
55
  }
50
- async parseWords(database, words) {
51
- this.dictionary = this.getDictionary()
52
- .map((word) => this.caseSensitive ? word : word.toLowerCase());
53
- const matches = [];
54
- const nonMatches = [];
55
- for (const word of words) {
56
- const compare = this.caseSensitive ? word : word.toLowerCase();
57
- if (this.dictionary.includes(compare))
58
- matches.push(word);
59
- else
60
- nonMatches.push(word);
61
- }
62
- await this.parseMatches(database, matches, nonMatches);
56
+ parseWords(database, words) {
57
+ return __awaiter(this, void 0, void 0, function* () {
58
+ this.dictionary = this.getDictionary()
59
+ .map((word) => this.caseSensitive ? word : word.toLowerCase());
60
+ const matches = [];
61
+ const nonMatches = [];
62
+ for (const word of words) {
63
+ const compare = this.caseSensitive ? word : word.toLowerCase();
64
+ if (this.dictionary.includes(compare))
65
+ matches.push(word);
66
+ else
67
+ nonMatches.push(word);
68
+ }
69
+ yield this.parseMatches(database, matches, nonMatches);
70
+ });
63
71
  }
64
72
  }
65
- exports.DictionaryParser = DictionaryParser;
73
+ //# sourceMappingURL=dictionary.parser.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"dictionary.parser.js","sourceRoot":"","sources":["../../src/parsers/dictionary.parser.ts"],"names":[],"mappings":";;;;;;;;;AAAA,OAAO,EAAE,wCAAwC,EAAE,4BAA4B,EAAE,uBAAuB,EAAE,wBAAwB,EAAc,MAAM,mBAAmB,CAAC;AAE1K,OAAO,EAAE,kBAAkB,EAAE,MAAM,oBAAoB,CAAC;AACxD,OAAO,EAAE,oCAAoC,EAAE,uBAAuB,EAAE,MAAM,qBAAqB,CAAC;AAOpG,OAAO,EAAE,WAAW,EAAgB,cAAc,EAAE,MAAM,gBAAgB,CAAC;AAM3E,MAAM,UAAU,mBAAmB,CAAC,IAAa;IAChD,IAAI,CAAC,cAAc,CAAC,IAAI,CAAC;QAAE,OAAO,KAAK,CAAC;IAExC,IAAI,CAAC,4BAA4B,CAAC,IAAI,EAAE,YAAY,CAAC;QAAE,OAAO,KAAK,CAAC;IACpE,IAAI,CAAC,wCAAwC,CAAC,IAAI,EAAE,eAAe,CAAC;QAAE,OAAO,KAAK,CAAC;IAEnF,OAAO,IAAI,CAAC;AACb,CAAC;AAED,MAAM,KAAK,GAA0B,IAAI,GAAG,EAAoB,CAAC;AAEjE,SAAS,cAAc,CAAC,MAAyB;IAChD,MAAM,QAAQ,GAAW,oCAAoC,CAAC,UAAU,MAAM,CAAC,UAAU,EAAE,CAAC,CAAC;IAC7F,MAAM,IAAI,GAAsB,uBAAuB,CAAC,QAAQ,CAAC,CAAC;IAClE,IAAI,IAAI,KAAK,SAAS;QAAE,MAAM,IAAI,KAAK,CAAC,gCAAgC,CAAC,CAAC;IAE1E,IAAI,CAAC,wBAAwB,CAAC,IAAI,CAAC;QAAE,MAAM,IAAI,KAAK,CAAC,qCAAqC,CAAC,CAAC;IAE5F,OAAO,IAAI,CAAC;AACb,CAAC;AAED,MAAM,OAAgB,gBAA8C,SAAQ,WAAc;IAKzF,YACE,OAAyB,EACzB,MAAkC,EAClC,SAAkB;QAEnB,KAAK,CAAC,OAAO,EAAE,MAAM,EAAE,SAAS,CAAC,CAAC;QATzB,kBAAa,GAAY,KAAK,CAAC;QACjC,eAAU,GAAa,EAAE,CAAC;QAUjC,IAAI,CAAC,MAAM,IAAI,CAAC,SAAS;YAAE,OAAO;QAElC,IAAI,CAAC,OAAO,GAAG,SAAS,CAAC;QAEzB,IAAI,CAAC,uBAAuB,CAAoB,MAAM,EAAE,SAAS,EAAE,mBAAmB,CAAC,EAAE;YACxF,kBAAkB,CAAC,wCAAwC,SAAS,GAAG,CAAC,CAAC;SACzE;QACD,MAAM,gBAAgB,GAAsB,MAAM,CAAC,SAAS,CAAsB,CAAC;QAEnF,IAAI,CAAC,aAAa,GAAG,gBAAgB,CAAC,aAAa,IAAI,KAAK,CAAC;QAE7D,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,IAAI,CAAC,OAAO,CAAC,EAAE;YAC7B,MAAM,MAAM,GAAa,cAAc,CAAC,gBAAgB,CAAC,CAAC;YAC1D,KAAK,CAAC,GAAG,CAAC,IAAI,CAAC,OAAO,EAAE,MAAM,CAAC,CAAC;SAChC;IACF,CAAC;IAES,aAAa;QACtB,IAAI,CAAC,IAAI,CAAC,OAAO;YAAE,OAAO,EAAE,CAAC;QAE7B,OAAO,KAAK,CAAC,GAAG,CAAC,IAAI,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC;IACtC,CAAC;IAIe,UAAU,CAAC,QAAyB,EAAE,KAAe;;YACpE,IAAI,CAAC,UAAU,GAAG,IAAI,CAAC,aAAa,EAAE;iBACnC,GAAG,CAAC,CAAC,IAAY,EAAU,EAAE,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC,WAAW,EAAE,CAAC,CAAC;YAEjF,MAAM,OAAO,GAAa,EAAE,CAAC;YAC7B,MAAM,UAAU,GAAa,EAAE,CAAC;YAEhC,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE;gBACzB,MAAM,OAAO,GAAW,IAAI,CAAC,aAAa,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC,WAAW,EAAE,CAAC;gBACvE,IAAI,IAAI,CAAC,UAAU,CAAC,QAAQ,CAAC,OAAO,CAAC;oBAAE,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;;oBACrD,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;aAC3B;YAED,MAAM,IAAI,CAAC,YAAY,CAAC,QAAQ,EAAE,OAAO,EAAE,UAAU,CAAC,CAAC;QACxD,CAAC;KAAA;CACD"}
@@ -0,0 +1,13 @@
1
+ /// <reference types="cheerio" />
2
+ import { TKeyObject } from 'tscommons-es-core';
3
+ import { IRequestOutcome } from '../interfaces/irequest-outcome';
4
+ import { IParserConfig } from '../interfaces/iparser-config';
5
+ import { StringParser } from './string.parser';
6
+ import { IDataConfig } from './data.parser';
7
+ export declare abstract class HtmlParser<T extends IDataConfig> extends StringParser<T> {
8
+ static collapseWhitespace(raw: string): string;
9
+ static loadDom(html: string): cheerio.Root | undefined;
10
+ protected dom: cheerio.Root | undefined;
11
+ constructor(outcome?: IRequestOutcome, config?: TKeyObject<IParserConfig>, configKey?: string);
12
+ supports(contentType: string, isAllow: boolean): boolean;
13
+ }
@@ -1,9 +1,6 @@
1
- "use strict";
2
- Object.defineProperty(exports, "__esModule", { value: true });
3
- exports.HtmlParser = void 0;
4
- const cheerio = require("cheerio");
5
- const string_parser_1 = require("./string.parser");
6
- class HtmlParser extends string_parser_1.StringParser {
1
+ import * as cheerio from 'cheerio';
2
+ import { StringParser } from './string.parser';
3
+ export class HtmlParser extends StringParser {
7
4
  constructor(outcome, config, configKey) {
8
5
  super(outcome, config, configKey);
9
6
  if (this.stringData && this.stringData.length) {
@@ -14,7 +11,6 @@ class HtmlParser extends string_parser_1.StringParser {
14
11
  static collapseWhitespace(raw) {
15
12
  return raw.replace(/\s+/g, ' ');
16
13
  }
17
- // @ts-ignore
18
14
  static loadDom(html) {
19
15
  try {
20
16
  return cheerio.load(html);
@@ -30,4 +26,4 @@ class HtmlParser extends string_parser_1.StringParser {
30
26
  return /^text\/html([^a-z0-9].*)?$/.test(contentType);
31
27
  }
32
28
  }
33
- exports.HtmlParser = HtmlParser;
29
+ //# sourceMappingURL=html.parser.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"html.parser.js","sourceRoot":"","sources":["../../src/parsers/html.parser.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,OAAO,MAAM,SAAS,CAAC;AAOnC,OAAO,EAAE,YAAY,EAAE,MAAM,iBAAiB,CAAC;AAG/C,MAAM,OAAgB,UAAkC,SAAQ,YAAe;IAgB9E,YACE,OAAyB,EACzB,MAAkC,EAClC,SAAkB;QAEnB,KAAK,CAAC,OAAO,EAAE,MAAM,EAAE,SAAS,CAAC,CAAC;QAElC,IAAI,IAAI,CAAC,UAAU,IAAI,IAAI,CAAC,UAAU,CAAC,MAAM,EAAE;YAC9C,MAAM,SAAS,GAAW,UAAU,CAAC,kBAAkB,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;YAEzE,IAAI,CAAC,GAAG,GAAG,UAAU,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC;SACzC;IACF,CAAC;IA3BM,MAAM,CAAC,kBAAkB,CAAC,GAAW;QAC3C,OAAO,GAAG,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;IACjC,CAAC;IAEM,MAAM,CAAC,OAAO,CAAC,IAAY;QACjC,IAAI;YACH,OAAO,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;SAC1B;QAAC,OAAO,EAAE,EAAE;YACZ,SAAS;YACT,OAAO,SAAS,CAAC;SACjB;IACF,CAAC;IAkBM,QAAQ,CAAC,WAAmB,EAAE,OAAgB;QACpD,IAAI,CAAC,OAAO;YAAE,OAAO,KAAK,CAAC;QAC3B,OAAO,4BAA4B,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;IACvD,CAAC;CACD"}
@@ -0,0 +1,20 @@
1
+ /// <reference types="cheerio" />
2
+ import { TKeyObject } from 'tscommons-es-core';
3
+ import { DatabaseService } from '../services/database.service';
4
+ import { IParserConfig } from '../interfaces/iparser-config';
5
+ import { IRequestOutcome } from '../interfaces/irequest-outcome';
6
+ import { HtmlParser } from './html.parser';
7
+ import { IDataConfig } from './data.parser';
8
+ export interface IHyperlinksConfig extends IDataConfig {
9
+ sources: string[];
10
+ }
11
+ export declare function isIHyperlinksConfig(test: unknown): test is IHyperlinksConfig;
12
+ export declare class HyperlinksParser extends HtmlParser<IHyperlinksConfig> {
13
+ private url?;
14
+ static parseTagLinks(dom: cheerio.Root, tag: string, attribute: string): string[];
15
+ static getBaseHref(dom: cheerio.Root, url: string): string | undefined;
16
+ private hyperlinksConfig;
17
+ constructor(url?: string | undefined, outcome?: IRequestOutcome, config?: TKeyObject<IParserConfig>);
18
+ parse(database: DatabaseService): Promise<void>;
19
+ links(): Promise<string[]>;
20
+ }
@@ -1,110 +1,115 @@
1
- "use strict";
2
- Object.defineProperty(exports, "__esModule", { value: true });
3
- exports.HyperlinksParser = exports.isIHyperlinksConfig = void 0;
4
- const url_1 = require("url");
5
- const tscommons_core_1 = require("tscommons-core");
6
- const html_parser_1 = require("./html.parser");
7
- const data_parser_1 = require("./data.parser");
8
- function isIHyperlinksConfig(test) {
9
- if (!data_parser_1.isIDataConfig(test))
1
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
2
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
3
+ return new (P || (P = Promise))(function (resolve, reject) {
4
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
5
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
6
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
7
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
8
+ });
9
+ };
10
+ import { URL } from 'url';
11
+ import { commonsArrayUnique, commonsTypeHasPropertyStringArray } from 'tscommons-es-core';
12
+ import { HtmlParser } from './html.parser';
13
+ import { isIDataConfig } from './data.parser';
14
+ export function isIHyperlinksConfig(test) {
15
+ if (!isIDataConfig(test))
10
16
  return false;
11
- if (!tscommons_core_1.CommonsType.hasPropertyStringArray(test, 'sources'))
17
+ if (!commonsTypeHasPropertyStringArray(test, 'sources'))
12
18
  return false;
13
19
  return true;
14
20
  }
15
- exports.isIHyperlinksConfig = isIHyperlinksConfig;
16
- class HyperlinksParser extends html_parser_1.HtmlParser {
21
+ export class HyperlinksParser extends HtmlParser {
17
22
  constructor(url, outcome, config) {
18
23
  super(outcome, config, 'hyperlinks');
19
24
  this.url = url;
20
25
  this.hyperlinksConfig = this.getConfig(isIHyperlinksConfig);
21
26
  }
22
- static parseTagLinks(
23
- // @ts-ignore
24
- dom, tag, attribute) {
27
+ static parseTagLinks(dom, tag, attribute) {
25
28
  const links = [];
26
- // @ts-ignore
27
- dom(`${tag}[${attribute}]`).each(function (_index, _element) {
28
- // @ts-ignore
29
- const link = dom(this).attr(attribute);
30
- links.push(link);
29
+ dom(`${tag}[${attribute}]`).each((_index, element) => {
30
+ const link = dom(element).attr(attribute);
31
+ if (link !== undefined)
32
+ links.push(link);
31
33
  return true;
32
34
  });
33
- return [...new Set(links)];
35
+ return commonsArrayUnique(links);
34
36
  }
35
- // @ts-ignore
36
37
  static getBaseHref(dom, url) {
37
38
  try {
38
- // @ts-ignore
39
39
  const base = dom('base[href]');
40
40
  if (base.length === 0)
41
41
  return undefined;
42
42
  const href = dom(base[0]).attr('href');
43
43
  if (!href)
44
44
  return undefined;
45
- const whatwg = new url_1.URL(href, url);
45
+ const whatwg = new URL(href, url);
46
46
  return whatwg.toString();
47
47
  }
48
48
  catch (ex) { /* do nothing */ }
49
49
  return undefined;
50
50
  }
51
- async parse(database) {
52
- if (!this.dom || !this.url || !this.hyperlinksConfig)
53
- return; // invalid parse
54
- let title;
55
- try {
56
- title = this.dom('title').text();
57
- }
58
- catch (ex) { /* do nothing */ }
59
- if (title !== undefined && title !== null && title !== '') {
60
- await database.setData(this.url, 'title', title);
61
- }
62
- const links = {};
63
- for (const source of this.hyperlinksConfig.sources) {
64
- const pattern = source.match(/^([a-z]+)\[(src|href|action)\]$/);
65
- if (pattern)
66
- links[pattern[1]] = HyperlinksParser.parseTagLinks(this.dom, pattern[1], pattern[2]);
67
- }
68
- await database.setData(this.url, 'links', links);
69
- }
70
- async links() {
71
- if (!this.dom || !this.url || !this.hyperlinksConfig)
72
- return []; // invalid parse
73
- let links = [];
74
- for (const source of this.hyperlinksConfig.sources) {
75
- const pattern = source.match(/^([a-z]+)\[(src|href|action)\]$/);
76
- if (pattern) {
77
- if (pattern[1] === 'form')
78
- continue;
79
- links = [...links, ...HyperlinksParser.parseTagLinks(this.dom, pattern[1], pattern[2])];
80
- }
81
- }
82
- const unique = [...new Set(links)];
83
- let baseHref = HyperlinksParser.getBaseHref(this.dom, this.url);
84
- if (baseHref === undefined)
85
- baseHref = this.url;
86
- const results = [];
87
- for (let l of unique) {
88
- const anchor = l.split('#');
89
- l = anchor[0];
90
- if (l === '')
91
- continue;
51
+ parse(database) {
52
+ return __awaiter(this, void 0, void 0, function* () {
53
+ if (!this.dom || !this.url || !this.hyperlinksConfig)
54
+ return; // invalid parse
55
+ let title;
92
56
  try {
93
- const u = new url_1.URL(l, baseHref);
94
- if (u.toString().indexOf('#') !== -1) {
95
- process.exit(1);
57
+ title = this.dom('title').text();
58
+ }
59
+ catch (ex) { /* do nothing */ }
60
+ if (title !== undefined && title !== null && title !== '') {
61
+ yield database.setData(this.url, 'title', title);
62
+ }
63
+ const links = {};
64
+ for (const source of this.hyperlinksConfig.sources) {
65
+ const pattern = source.match(/^([a-z]+)\[(src|href|action)\]$/);
66
+ if (pattern)
67
+ links[pattern[1]] = HyperlinksParser.parseTagLinks(this.dom, pattern[1], pattern[2]);
68
+ }
69
+ yield database.setData(this.url, 'links', links);
70
+ });
71
+ }
72
+ // eslint-disable-next-line @typescript-eslint/require-await
73
+ links() {
74
+ return __awaiter(this, void 0, void 0, function* () {
75
+ if (!this.dom || !this.url || !this.hyperlinksConfig)
76
+ return []; // invalid parse
77
+ let links = [];
78
+ for (const source of this.hyperlinksConfig.sources) {
79
+ const pattern = source.match(/^([a-z]+)\[(src|href|action)\]$/);
80
+ if (pattern) {
81
+ if (pattern[1] === 'form')
82
+ continue;
83
+ links = [...links, ...HyperlinksParser.parseTagLinks(this.dom, pattern[1], pattern[2])];
96
84
  }
97
- if (!u.protocol.match(/^http(s?):$/))
98
- continue;
99
- if (u.toString() === this.url || u.toString() === baseHref) {
100
- // skip self-links, or presumed self-links
85
+ }
86
+ const unique = [...new Set(links)];
87
+ let baseHref = HyperlinksParser.getBaseHref(this.dom, this.url);
88
+ if (baseHref === undefined)
89
+ baseHref = this.url;
90
+ const results = [];
91
+ for (let l of unique) {
92
+ const anchor = l.split('#');
93
+ l = anchor[0];
94
+ if (l === '')
101
95
  continue;
96
+ try {
97
+ const u = new URL(l, baseHref);
98
+ if (u.toString().indexOf('#') !== -1) {
99
+ process.exit(1);
100
+ }
101
+ if (!u.protocol.match(/^http(s?):$/))
102
+ continue;
103
+ if (u.toString() === this.url || u.toString() === baseHref) {
104
+ // skip self-links, or presumed self-links
105
+ continue;
106
+ }
107
+ results.push(u.toString());
102
108
  }
103
- results.push(u.toString());
109
+ catch (ex) { /* do nothing */ }
104
110
  }
105
- catch (ex) { /* do nothing */ }
106
- }
107
- return results;
111
+ return results;
112
+ });
108
113
  }
109
114
  }
110
- exports.HyperlinksParser = HyperlinksParser;
115
+ //# sourceMappingURL=hyperlinks.parser.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"hyperlinks.parser.js","sourceRoot":"","sources":["../../src/parsers/hyperlinks.parser.ts"],"names":[],"mappings":";;;;;;;;;AAAA,OAAO,EAAE,GAAG,EAAE,MAAM,KAAK,CAAC;AAE1B,OAAO,EAAE,kBAAkB,EAAE,iCAAiC,EAAc,MAAM,mBAAmB,CAAC;AAOtG,OAAO,EAAE,UAAU,EAAE,MAAM,eAAe,CAAC;AAC3C,OAAO,EAAe,aAAa,EAAE,MAAM,eAAe,CAAC;AAK3D,MAAM,UAAU,mBAAmB,CAAC,IAAa;IAChD,IAAI,CAAC,aAAa,CAAC,IAAI,CAAC;QAAE,OAAO,KAAK,CAAC;IAEvC,IAAI,CAAC,iCAAiC,CAAC,IAAI,EAAE,SAAS,CAAC;QAAE,OAAO,KAAK,CAAC;IAEtE,OAAO,IAAI,CAAC;AACb,CAAC;AAED,MAAM,OAAO,gBAAiB,SAAQ,UAA6B;IAkClE,YACU,GAAY,EACpB,OAAyB,EACzB,MAAkC;QAEnC,KAAK,CAAC,OAAO,EAAE,MAAM,EAAE,YAAY,CAAC,CAAC;QAJ5B,QAAG,GAAH,GAAG,CAAS;QAMrB,IAAI,CAAC,gBAAgB,GAAG,IAAI,CAAC,SAAS,CAAC,mBAAmB,CAAC,CAAC;IAC7D,CAAC;IAxCM,MAAM,CAAC,aAAa,CACzB,GAAiB,EACjB,GAAW,EACX,SAAiB;QAElB,MAAM,KAAK,GAAa,EAAE,CAAC;QAE3B,GAAG,CAAC,GAAG,GAAG,IAAI,SAAS,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC,MAAc,EAAE,OAAwB,EAAW,EAAE;YACtF,MAAM,IAAI,GAAqB,GAAG,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;YAC5D,IAAI,IAAI,KAAK,SAAS;gBAAE,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YACzC,OAAO,IAAI,CAAC;QACb,CAAC,CAAC,CAAC;QAEH,OAAO,kBAAkB,CAAC,KAAK,CAAC,CAAC;IAClC,CAAC;IAEM,MAAM,CAAC,WAAW,CAAC,GAAiB,EAAE,GAAW;QACvD,IAAI;YACH,MAAM,IAAI,GAAoB,GAAG,CAAC,YAAY,CAAC,CAAC;YAChD,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC;gBAAE,OAAO,SAAS,CAAC;YAExC,MAAM,IAAI,GAAqB,GAAG,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YACzD,IAAI,CAAC,IAAI;gBAAE,OAAO,SAAS,CAAC;YAE5B,MAAM,MAAM,GAAQ,IAAI,GAAG,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;YACvC,OAAO,MAAM,CAAC,QAAQ,EAAE,CAAC;SACzB;QAAC,OAAO,EAAE,EAAE,EAAE,gBAAgB,EAAE;QACjC,OAAO,SAAS,CAAC;IAClB,CAAC;IAcY,KAAK,CAAC,QAAyB;;YAC3C,IAAI,CAAC,IAAI,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,gBAAgB;gBAAE,OAAO,CAAC,gBAAgB;YAE9E,IAAI,KAAuB,CAAC;YAC5B,IAAI;gBACH,KAAK,GAAG,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE,CAAC;aACjC;YAAC,OAAO,EAAE,EAAE,EAAE,gBAAgB,EAAE;YAEjC,IAAI,KAAK,KAAK,SAAS,IAAI,KAAK,KAAK,IAAI,IAAI,KAAK,KAAK,EAAE,EAAE;gBAC1D,MAAM,QAAQ,CAAC,OAAO,CAAC,IAAI,CAAC,GAAG,EAAE,OAAO,EAAE,KAAK,CAAC,CAAC;aACjD;YAED,MAAM,KAAK,GAAkC,EAAE,CAAC;YAChD,KAAK,MAAM,MAAM,IAAI,IAAI,CAAC,gBAAgB,CAAC,OAAO,EAAE;gBACnD,MAAM,OAAO,GAA0B,MAAM,CAAC,KAAK,CAAC,iCAAiC,CAAC,CAAC;gBACvF,IAAI,OAAO;oBAAE,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,GAAG,gBAAgB,CAAC,aAAa,CAAC,IAAI,CAAC,GAAG,EAAE,OAAO,CAAC,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC;aAClG;YAED,MAAM,QAAQ,CAAC,OAAO,CAAC,IAAI,CAAC,GAAG,EAAE,OAAO,EAAE,KAAK,CAAC,CAAC;QAClD,CAAC;KAAA;IAED,4DAA4D;IAC/C,KAAK;;YACjB,IAAI,CAAC,IAAI,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,gBAAgB;gBAAE,OAAO,EAAE,CAAC,CAAC,gBAAgB;YAEjF,IAAI,KAAK,GAAa,EAAE,CAAC;YACzB,KAAK,MAAM,MAAM,IAAI,IAAI,CAAC,gBAAgB,CAAC,OAAO,EAAE;gBACnD,MAAM,OAAO,GAA0B,MAAM,CAAC,KAAK,CAAC,iCAAiC,CAAC,CAAC;gBACvF,IAAI,OAAO,EAAE;oBACZ,IAAI,OAAO,CAAC,CAAC,CAAC,KAAK,MAAM;wBAAE,SAAS;oBACpC,KAAK,GAAG,CAAC,GAAG,KAAK,EAAE,GAAG,gBAAgB,CAAC,aAAa,CAAC,IAAI,CAAC,GAAG,EAAE,OAAO,CAAC,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;iBACxF;aACD;YAED,MAAM,MAAM,GAAa,CAAC,GAAG,IAAI,GAAG,CAAC,KAAK,CAAC,CAAC,CAAC;YAE7C,IAAI,QAAQ,GAAqB,gBAAgB,CAAC,WAAW,CAAC,IAAI,CAAC,GAAG,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC;YAClF,IAAI,QAAQ,KAAK,SAAS;gBAAE,QAAQ,GAAG,IAAI,CAAC,GAAG,CAAC;YAEhD,MAAM,OAAO,GAAa,EAAE,CAAC;YAC7B,KAAK,IAAI,CAAC,IAAI,MAAM,EAAE;gBACrB,MAAM,MAAM,GAAa,CAAC,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;gBACtC,CAAC,GAAG,MAAM,CAAC,CAAC,CAAC,CAAC;gBACd,IAAI,CAAC,KAAK,EAAE;oBAAE,SAAS;gBAEvB,IAAI;oBACH,MAAM,CAAC,GAAQ,IAAI,GAAG,CAAC,CAAC,EAAE,QAAQ,CAAC,CAAC;oBACpC,IAAI,CAAC,CAAC,QAAQ,EAAE,CAAC,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC,EAAE;wBACrC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;qBAChB;oBACD,IAAI,CAAC,CAAC,CAAC,QAAQ,CAAC,KAAK,CAAC,aAAa,CAAC;wBAAE,SAAS;oBAC/C,IAAI,CAAC,CAAC,QAAQ,EAAE,KAAK,IAAI,CAAC,GAAG,IAAI,CAAC,CAAC,QAAQ,EAAE,KAAK,QAAQ,EAAE;wBAC3D,0CAA0C;wBAC1C,SAAS;qBACT;oBAED,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,QAAQ,EAAE,CAAC,CAAC;iBAC3B;gBAAC,OAAO,EAAE,EAAE,EAAE,gBAAgB,EAAE;aACjC;YAED,OAAO,OAAO,CAAC;QAChB,CAAC;KAAA;CACD"}
@@ -0,0 +1,20 @@
1
+ /// <reference types="cheerio" />
2
+ import { TKeyObject } from 'tscommons-es-core';
3
+ import { DatabaseService } from '../services/database.service';
4
+ import { IParserConfig } from '../interfaces/iparser-config';
5
+ import { IRequestOutcome } from '../interfaces/irequest-outcome';
6
+ import { HtmlParser } from './html.parser';
7
+ import { IDataConfig } from './data.parser';
8
+ declare type TImageTag = {
9
+ src: string;
10
+ alt?: string;
11
+ title?: string;
12
+ };
13
+ export declare class ImageTagsParser extends HtmlParser<IDataConfig> {
14
+ private url?;
15
+ static parseImageTags(dom: cheerio.Root): TImageTag[];
16
+ constructor(url?: string | undefined, outcome?: IRequestOutcome, config?: TKeyObject<IParserConfig>);
17
+ init(database: DatabaseService): Promise<void>;
18
+ parse(database: DatabaseService): Promise<void>;
19
+ }
20
+ export {};
@@ -1,27 +1,27 @@
1
- "use strict";
2
- Object.defineProperty(exports, "__esModule", { value: true });
3
- exports.ImageTagsParser = void 0;
4
- const nodecommons_cli_1 = require("nodecommons-cli");
5
- const html_parser_1 = require("./html.parser");
6
- class ImageTagsParser extends html_parser_1.HtmlParser {
1
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
2
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
3
+ return new (P || (P = Promise))(function (resolve, reject) {
4
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
5
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
6
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
7
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
8
+ });
9
+ };
10
+ import { commonsOutputDebug } from 'nodecommons-es-cli';
11
+ import { HtmlParser } from './html.parser';
12
+ export class ImageTagsParser extends HtmlParser {
7
13
  constructor(url, outcome, config) {
8
14
  super(outcome, config, 'imageTags');
9
15
  this.url = url;
10
16
  }
11
- static parseImageTags(
12
- // @ts-ignore
13
- dom) {
17
+ static parseImageTags(dom) {
14
18
  const images = [];
15
- // @ts-ignore
16
- dom('img').each(function (_index, _element) {
17
- // @ts-ignore
18
- const src = dom(this).attr('src');
19
+ dom('img').each((_index, element) => {
20
+ const src = dom(element).attr('src');
19
21
  if (src === null || src === undefined)
20
22
  return true;
21
- // @ts-ignore
22
- const alt = dom(this).attr('alt');
23
- // @ts-ignore
24
- const title = dom(this).attr('title');
23
+ const alt = dom(element).attr('alt');
24
+ const title = dom(element).attr('title');
25
25
  const image = {
26
26
  src: src
27
27
  };
@@ -34,23 +34,27 @@ class ImageTagsParser extends html_parser_1.HtmlParser {
34
34
  });
35
35
  return images;
36
36
  }
37
- async init(database) {
38
- const urls = database.getUrls();
39
- if (!urls)
40
- return;
41
- await urls.createIndex({ 'images.alt': 1 }, { unique: false });
37
+ init(database) {
38
+ return __awaiter(this, void 0, void 0, function* () {
39
+ const urls = database.getUrls();
40
+ if (!urls)
41
+ return;
42
+ yield urls.createIndex({ 'images.alt': 1 }, { unique: false });
43
+ });
42
44
  }
43
- async parse(database) {
44
- if (!this.dom || !this.url)
45
- return; // invalid parse
46
- const images = ImageTagsParser.parseImageTags(this.dom);
47
- if (images.length > 0) {
48
- nodecommons_cli_1.CommonsOutput.debug(`Detected images for ${this.url}`);
49
- await database.setData(this.url, 'images', images);
50
- }
51
- else {
52
- await database.unsetData(this.url, 'images');
53
- }
45
+ parse(database) {
46
+ return __awaiter(this, void 0, void 0, function* () {
47
+ if (!this.dom || !this.url)
48
+ return; // invalid parse
49
+ const images = ImageTagsParser.parseImageTags(this.dom);
50
+ if (images.length > 0) {
51
+ commonsOutputDebug(`Detected images for ${this.url}`);
52
+ yield database.setData(this.url, 'images', images);
53
+ }
54
+ else {
55
+ yield database.unsetData(this.url, 'images');
56
+ }
57
+ });
54
58
  }
55
59
  }
56
- exports.ImageTagsParser = ImageTagsParser;
60
+ //# sourceMappingURL=image-tags.parser.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"image-tags.parser.js","sourceRoot":"","sources":["../../src/parsers/image-tags.parser.ts"],"names":[],"mappings":";;;;;;;;;AAMA,OAAO,EAAE,kBAAkB,EAAE,MAAM,oBAAoB,CAAC;AAOxD,OAAO,EAAE,UAAU,EAAE,MAAM,eAAe,CAAC;AAS3C,MAAM,OAAO,eAAgB,SAAQ,UAAuB;IA2B3D,YACU,GAAY,EACpB,OAAyB,EACzB,MAAkC;QAEnC,KAAK,CAAC,OAAO,EAAE,MAAM,EAAE,WAAW,CAAC,CAAC;QAJ3B,QAAG,GAAH,GAAG,CAAS;IAKtB,CAAC;IAhCM,MAAM,CAAC,cAAc,CAC1B,GAAiB;QAElB,MAAM,MAAM,GAAgB,EAAE,CAAC;QAE/B,GAAG,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,CAAC,MAAc,EAAE,OAAwB,EAAW,EAAE;YACrE,MAAM,GAAG,GAA0B,GAAG,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;YAC5D,IAAI,GAAG,KAAK,IAAI,IAAI,GAAG,KAAK,SAAS;gBAAE,OAAO,IAAI,CAAC;YAEnD,MAAM,GAAG,GAA0B,GAAG,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;YAC5D,MAAM,KAAK,GAA0B,GAAG,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;YAEhE,MAAM,KAAK,GAAc;gBACvB,GAAG,EAAE,GAAG;aACT,CAAC;YACF,IAAI,GAAG,KAAK,SAAS,IAAI,GAAG,KAAK,IAAI;gBAAE,KAAK,CAAC,GAAG,GAAG,GAAG,CAAC;YACvD,IAAI,KAAK,KAAK,SAAS,IAAI,KAAK,KAAK,IAAI;gBAAE,KAAK,CAAC,KAAK,GAAG,KAAK,CAAC;YAE/D,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;YAEnB,OAAO,IAAI,CAAC;QACb,CAAC,CAAC,CAAC;QAEH,OAAO,MAAM,CAAC;IACf,CAAC;IAUY,IAAI,CAAC,QAAyB;;YAC1C,MAAM,IAAI,GAA+B,QAAQ,CAAC,OAAO,EAAE,CAAC;YAC5D,IAAI,CAAC,IAAI;gBAAE,OAAO;YAElB,MAAM,IAAI,CAAC,WAAW,CAAC,EAAE,YAAY,EAAE,CAAC,EAAE,EAAE,EAAE,MAAM,EAAE,KAAK,EAAE,CAAC,CAAC;QAChE,CAAC;KAAA;IAEY,KAAK,CAAC,QAAyB;;YAC3C,IAAI,CAAC,IAAI,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,GAAG;gBAAE,OAAO,CAAC,gBAAgB;YAEpD,MAAM,MAAM,GAAgB,eAAe,CAAC,cAAc,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;YACrE,IAAI,MAAM,CAAC,MAAM,GAAG,CAAC,EAAE;gBACtB,kBAAkB,CAAC,uBAAuB,IAAI,CAAC,GAAG,EAAE,CAAC,CAAC;gBACtD,MAAM,QAAQ,CAAC,OAAO,CAAC,IAAI,CAAC,GAAG,EAAE,QAAQ,EAAE,MAAM,CAAC,CAAC;aACnD;iBAAM;gBACN,MAAM,QAAQ,CAAC,SAAS,CAAC,IAAI,CAAC,GAAG,EAAE,QAAQ,CAAC,CAAC;aAC7C;QACF,CAAC;KAAA;CACD"}
@@ -0,0 +1,11 @@
1
+ import { TKeyObject } from 'tscommons-es-core';
2
+ import { DatabaseService } from '../services/database.service';
3
+ import { IRequestOutcome } from '../interfaces/irequest-outcome';
4
+ import { IParserConfig } from '../interfaces/iparser-config';
5
+ import { DataParser, IDataConfig } from './data.parser';
6
+ export declare class JpegParser extends DataParser<IDataConfig> {
7
+ private url?;
8
+ constructor(url?: string | undefined, outcome?: IRequestOutcome, config?: TKeyObject<IParserConfig>);
9
+ supports(contentType: string, isAllow: boolean): boolean;
10
+ parse(database: DatabaseService): Promise<void>;
11
+ }
@@ -1,9 +1,15 @@
1
- "use strict";
2
- Object.defineProperty(exports, "__esModule", { value: true });
3
- exports.JpegParser = void 0;
4
- const jpeg = require("jpeg-js");
5
- const data_parser_1 = require("./data.parser");
6
- class JpegParser extends data_parser_1.DataParser {
1
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
2
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
3
+ return new (P || (P = Promise))(function (resolve, reject) {
4
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
5
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
6
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
7
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
8
+ });
9
+ };
10
+ import * as jpeg from 'jpeg-js';
11
+ import { DataParser } from './data.parser';
12
+ export class JpegParser extends DataParser {
7
13
  constructor(url, outcome, config) {
8
14
  super(outcome, config, 'jpeg');
9
15
  this.url = url;
@@ -13,19 +19,21 @@ class JpegParser extends data_parser_1.DataParser {
13
19
  return false;
14
20
  return contentType.match(/^image\/(jpeg|jpg)$/) ? true : false;
15
21
  }
16
- async parse(database) {
17
- if (!this.outcome || !this.url)
18
- return;
19
- if (!this.outcome.data)
20
- return;
21
- if (this.outcome.exceeded)
22
- return; // don't try and parse vast JPEGs than haven't been downloaded properly, regardless of what the IDataConfig.allowExceeded is set to
23
- try {
24
- const rawImageData = jpeg.decode(this.outcome.data);
25
- await database.setData(this.url, 'width', rawImageData.width);
26
- await database.setData(this.url, 'height', rawImageData.height);
27
- }
28
- catch (ex) { /* do nothing */ }
22
+ parse(database) {
23
+ return __awaiter(this, void 0, void 0, function* () {
24
+ if (!this.outcome || !this.url)
25
+ return;
26
+ if (!this.outcome.data)
27
+ return;
28
+ if (this.outcome.exceeded)
29
+ return; // don't try and parse vast JPEGs than haven't been downloaded properly, regardless of what the IDataConfig.allowExceeded is set to
30
+ try {
31
+ const rawImageData = jpeg.decode(this.outcome.data);
32
+ yield database.setData(this.url, 'width', rawImageData.width);
33
+ yield database.setData(this.url, 'height', rawImageData.height);
34
+ }
35
+ catch (ex) { /* do nothing */ }
36
+ });
29
37
  }
30
38
  }
31
- exports.JpegParser = JpegParser;
39
+ //# sourceMappingURL=jpeg.parser.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"jpeg.parser.js","sourceRoot":"","sources":["../../src/parsers/jpeg.parser.ts"],"names":[],"mappings":";;;;;;;;;AAAA,OAAO,KAAK,IAAI,MAAM,SAAS,CAAC;AAShC,OAAO,EAAE,UAAU,EAAe,MAAM,eAAe,CAAC;AAExD,MAAM,OAAO,UAAW,SAAQ,UAAuB;IACtD,YACU,GAAY,EACpB,OAAyB,EACzB,MAAkC;QAEnC,KAAK,CAAC,OAAO,EAAE,MAAM,EAAE,MAAM,CAAC,CAAC;QAJtB,QAAG,GAAH,GAAG,CAAS;IAKtB,CAAC;IAEM,QAAQ,CAAC,WAAmB,EAAE,OAAgB;QACpD,IAAI,CAAC,OAAO;YAAE,OAAO,KAAK,CAAC;QAC3B,OAAO,WAAW,CAAC,KAAK,CAAC,qBAAqB,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,KAAK,CAAC;IAChE,CAAC;IAEY,KAAK,CAAC,QAAyB;;YAC3C,IAAI,CAAC,IAAI,CAAC,OAAO,IAAI,CAAC,IAAI,CAAC,GAAG;gBAAE,OAAO;YAEvC,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,IAAI;gBAAE,OAAO;YAC/B,IAAI,IAAI,CAAC,OAAO,CAAC,QAAQ;gBAAE,OAAO,CAAC,mIAAmI;YAEtK,IAAI;gBACH,MAAM,YAAY,GAAmB,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;gBAEpE,MAAM,QAAQ,CAAC,OAAO,CAAC,IAAI,CAAC,GAAG,EAAE,OAAO,EAAE,YAAY,CAAC,KAAK,CAAC,CAAC;gBAC9D,MAAM,QAAQ,CAAC,OAAO,CAAC,IAAI,CAAC,GAAG,EAAE,QAAQ,EAAE,YAAY,CAAC,MAAM,CAAC,CAAC;aAChE;YAAC,OAAO,EAAE,EAAE,EAAE,gBAAgB,EAAE;QAClC,CAAC;KAAA;CACD"}
@@ -0,0 +1,13 @@
1
+ /// <reference types="cheerio" />
2
+ import { DatabaseService } from '../services/database.service';
3
+ import { HtmlParser } from './html.parser';
4
+ import { IDataConfig } from './data.parser';
5
+ export interface IParagraphsConfig extends IDataConfig {
6
+ paragraphWordsThreshold?: number;
7
+ }
8
+ export declare function isIParagraphsConfig(test: unknown): test is IParagraphsConfig;
9
+ export declare abstract class ParagraphsParser<T extends IParagraphsConfig> extends HtmlParser<T> {
10
+ static parseParagraphs(dom: cheerio.Root, paragraphsConfig: IParagraphsConfig): string[];
11
+ protected abstract parseParagraphs(database: DatabaseService, paragraphs: string[]): Promise<void>;
12
+ parse(database: DatabaseService): Promise<void>;
13
+ }