hydra-crawler 1.4.4 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/apis/autocomplete.api.d.ts +7 -0
- package/dist/apis/autocomplete.api.js +15 -9
- package/dist/apis/autocomplete.api.js.map +1 -0
- package/dist/apis/bugs.api.d.ts +7 -0
- package/dist/apis/bugs.api.js +21 -15
- package/dist/apis/bugs.api.js.map +1 -0
- package/dist/apis/crawl.api.d.ts +7 -0
- package/dist/apis/crawl.api.js +15 -9
- package/dist/apis/crawl.api.js.map +1 -0
- package/dist/apis/domains.api.d.ts +7 -0
- package/dist/apis/domains.api.js +24 -19
- package/dist/apis/domains.api.js.map +1 -0
- package/dist/apis/images.api.d.ts +7 -0
- package/dist/apis/images.api.js +20 -14
- package/dist/apis/images.api.js.map +1 -0
- package/dist/apis/statistics.api.d.ts +8 -0
- package/dist/apis/statistics.api.js +27 -20
- package/dist/apis/statistics.api.js.map +1 -0
- package/dist/apis/test.api.d.ts +5 -0
- package/dist/apis/test.api.js +15 -9
- package/dist/apis/test.api.js.map +1 -0
- package/dist/apis/urls.api.d.ts +7 -0
- package/dist/apis/urls.api.js +21 -15
- package/dist/apis/urls.api.js.map +1 -0
- package/dist/apps/cleanup.app.d.ts +19 -0
- package/dist/apps/cleanup.app.js +118 -100
- package/dist/apps/cleanup.app.js.map +1 -0
- package/dist/apps/cross-populate-export.app.d.ts +12 -0
- package/dist/apps/cross-populate-export.app.js +60 -47
- package/dist/apps/cross-populate-export.app.js.map +1 -0
- package/dist/apps/cross-populate-import.app.d.ts +12 -0
- package/dist/apps/cross-populate-import.app.js +64 -51
- package/dist/apps/cross-populate-import.app.js.map +1 -0
- package/dist/apps/denylist.app.d.ts +17 -0
- package/dist/apps/denylist.app.js +115 -98
- package/dist/apps/denylist.app.js.map +1 -0
- package/dist/apps/expire.app.d.ts +19 -0
- package/dist/apps/expire.app.js +44 -31
- package/dist/apps/expire.app.js.map +1 -0
- package/dist/apps/extract-text.app.d.ts +8 -0
- package/dist/apps/extract-text.app.js +43 -35
- package/dist/apps/extract-text.app.js.map +1 -0
- package/dist/apps/hydra.app.d.ts +34 -0
- package/dist/apps/hydra.app.js +150 -137
- package/dist/apps/hydra.app.js.map +1 -0
- package/dist/apps/import.app.d.ts +11 -0
- package/dist/apps/import.app.js +44 -32
- package/dist/apps/import.app.js.map +1 -0
- package/dist/apps/internal-hydra-common.app.d.ts +28 -0
- package/dist/apps/internal-hydra-common.app.js +5 -11
- package/dist/apps/internal-hydra-common.app.js.map +1 -0
- package/dist/apps/query.app.d.ts +20 -0
- package/dist/apps/query.app.js +63 -49
- package/dist/apps/query.app.js.map +1 -0
- package/dist/apps/reattempt.app.d.ts +17 -0
- package/dist/apps/reattempt.app.js +66 -53
- package/dist/apps/reattempt.app.js.map +1 -0
- package/dist/apps/requeue-domain.app.d.ts +13 -0
- package/dist/apps/requeue-domain.app.js +50 -37
- package/dist/apps/requeue-domain.app.js.map +1 -0
- package/dist/apps/seed.app.d.ts +15 -0
- package/dist/apps/seed.app.js +53 -40
- package/dist/apps/seed.app.js.map +1 -0
- package/dist/apps/startup.app.d.ts +11 -0
- package/dist/apps/startup.app.js +51 -38
- package/dist/apps/startup.app.js.map +1 -0
- package/dist/apps/unarchive.app.d.ts +15 -0
- package/dist/apps/unarchive.app.js +67 -54
- package/dist/apps/unarchive.app.js.map +1 -0
- package/dist/classes/cleaner.d.ts +12 -0
- package/dist/classes/cleaner.js +227 -207
- package/dist/classes/cleaner.js.map +1 -0
- package/dist/classes/crawler.d.ts +34 -0
- package/dist/classes/crawler.js +248 -241
- package/dist/classes/crawler.js.map +1 -0
- package/dist/classes/dns.d.ts +3 -0
- package/dist/classes/dns.js +10 -13
- package/dist/classes/dns.js.map +1 -0
- package/dist/classes/expirer.d.ts +10 -0
- package/dist/classes/expirer.js +107 -94
- package/dist/classes/expirer.js.map +1 -0
- package/dist/classes/expiry.d.ts +8 -0
- package/dist/classes/expiry.js +16 -19
- package/dist/classes/expiry.js.map +1 -0
- package/dist/classes/lists.d.ts +9 -0
- package/dist/classes/lists.js +13 -18
- package/dist/classes/lists.js.map +1 -0
- package/dist/classes/robot.d.ts +15 -0
- package/dist/classes/robot.js +40 -30
- package/dist/classes/robot.js.map +1 -0
- package/dist/classes/tracker.d.ts +25 -0
- package/dist/classes/tracker.js +82 -64
- package/dist/classes/tracker.js.map +1 -0
- package/dist/cli.d.ts +1 -0
- package/dist/cli.js +72 -65
- package/dist/cli.js.map +1 -0
- package/dist/enums/eavailable-strategy.d.ts +4 -0
- package/dist/enums/eavailable-strategy.js +3 -5
- package/dist/enums/eavailable-strategy.js.map +1 -0
- package/dist/enums/elist.d.ts +7 -0
- package/dist/enums/elist.js +7 -11
- package/dist/enums/elist.js.map +1 -0
- package/dist/enums/eserver.d.ts +8 -0
- package/dist/enums/eserver.js +3 -5
- package/dist/enums/eserver.js.map +1 -0
- package/dist/enums/ex-powered-by.d.ts +6 -0
- package/dist/enums/ex-powered-by.js +3 -5
- package/dist/enums/ex-powered-by.js.map +1 -0
- package/dist/helpers/matcher.d.ts +5 -0
- package/dist/helpers/matcher.js +2 -5
- package/dist/helpers/matcher.js.map +1 -0
- package/dist/helpers/random.d.ts +4 -0
- package/dist/helpers/random.js +2 -5
- package/dist/helpers/random.js.map +1 -0
- package/dist/helpers/utf-decoder.d.ts +4 -0
- package/dist/helpers/utf-decoder.js +3 -6
- package/dist/helpers/utf-decoder.js.map +1 -0
- package/dist/interfaces/iexpiry.d.ts +7 -0
- package/dist/interfaces/iexpiry.js +9 -13
- package/dist/interfaces/iexpiry.js.map +1 -0
- package/dist/interfaces/imatch.d.ts +6 -0
- package/dist/interfaces/imatch.js +6 -9
- package/dist/interfaces/imatch.js.map +1 -0
- package/dist/interfaces/iparser-config.d.ts +4 -0
- package/dist/interfaces/iparser-config.js +4 -7
- package/dist/interfaces/iparser-config.js.map +1 -0
- package/dist/interfaces/iparser.d.ts +8 -0
- package/dist/interfaces/iparser.js +2 -2
- package/dist/interfaces/iparser.js.map +1 -0
- package/dist/interfaces/irequest-outcome.d.ts +11 -0
- package/dist/interfaces/irequest-outcome.js +2 -2
- package/dist/interfaces/irequest-outcome.js.map +1 -0
- package/dist/interfaces/iserver.d.ts +4 -0
- package/dist/interfaces/iserver.js +2 -2
- package/dist/interfaces/iserver.js.map +1 -0
- package/dist/parsers/accessibility-metrics.parser.d.ts +11 -0
- package/dist/parsers/accessibility-metrics.parser.js +34 -26
- package/dist/parsers/accessibility-metrics.parser.js.map +1 -0
- package/dist/parsers/asp-error.parser.d.ts +12 -0
- package/dist/parsers/asp-error.parser.js +36 -28
- package/dist/parsers/asp-error.parser.js.map +1 -0
- package/dist/parsers/bad-words.parser.d.ts +10 -0
- package/dist/parsers/bad-words.parser.js +21 -13
- package/dist/parsers/bad-words.parser.js.map +1 -0
- package/dist/parsers/complex-english.parser.d.ts +15 -0
- package/dist/parsers/complex-english.parser.js +33 -25
- package/dist/parsers/complex-english.parser.js.map +1 -0
- package/dist/parsers/data.parser.d.ts +14 -0
- package/dist/parsers/data.parser.js +12 -16
- package/dist/parsers/data.parser.js.map +1 -0
- package/dist/parsers/dictionary.parser.d.ts +19 -0
- package/dist/parsers/dictionary.parser.js +47 -39
- package/dist/parsers/dictionary.parser.js.map +1 -0
- package/dist/parsers/html.parser.d.ts +13 -0
- package/dist/parsers/html.parser.js +4 -8
- package/dist/parsers/html.parser.js.map +1 -0
- package/dist/parsers/hyperlinks.parser.d.ts +20 -0
- package/dist/parsers/hyperlinks.parser.js +82 -77
- package/dist/parsers/hyperlinks.parser.js.map +1 -0
- package/dist/parsers/image-tags.parser.d.ts +20 -0
- package/dist/parsers/image-tags.parser.js +38 -34
- package/dist/parsers/image-tags.parser.js.map +1 -0
- package/dist/parsers/jpeg.parser.d.ts +11 -0
- package/dist/parsers/jpeg.parser.js +28 -20
- package/dist/parsers/jpeg.parser.js.map +1 -0
- package/dist/parsers/paragraphs.parser.d.ts +13 -0
- package/dist/parsers/paragraphs.parser.js +33 -40
- package/dist/parsers/paragraphs.parser.js.map +1 -0
- package/dist/parsers/parser.d.ts +19 -0
- package/dist/parsers/parser.js +30 -17
- package/dist/parsers/parser.js.map +1 -0
- package/dist/parsers/php-error.parser.d.ts +12 -0
- package/dist/parsers/php-error.parser.js +42 -34
- package/dist/parsers/php-error.parser.js.map +1 -0
- package/dist/parsers/phrase.parser.d.ts +8 -0
- package/dist/parsers/phrase.parser.js +16 -11
- package/dist/parsers/phrase.parser.js.map +1 -0
- package/dist/parsers/regex.parser.d.ts +10 -0
- package/dist/parsers/regex.parser.js +30 -22
- package/dist/parsers/regex.parser.js.map +1 -0
- package/dist/parsers/server.parser.d.ts +12 -0
- package/dist/parsers/server.parser.js +66 -56
- package/dist/parsers/server.parser.js.map +1 -0
- package/dist/parsers/spelling.parser.d.ts +10 -0
- package/dist/parsers/spelling.parser.js +21 -13
- package/dist/parsers/spelling.parser.js.map +1 -0
- package/dist/parsers/string.parser.d.ts +8 -0
- package/dist/parsers/string.parser.js +5 -8
- package/dist/parsers/string.parser.js.map +1 -0
- package/dist/parsers/text.parser.d.ts +8 -0
- package/dist/parsers/text.parser.js +24 -18
- package/dist/parsers/text.parser.js.map +1 -0
- package/dist/parsers/words.parser.d.ts +11 -0
- package/dist/parsers/words.parser.js +32 -28
- package/dist/parsers/words.parser.js.map +1 -0
- package/dist/queries/complex-english.query.d.ts +2 -0
- package/dist/queries/complex-english.query.js +37 -38
- package/dist/queries/complex-english.query.js.map +1 -0
- package/dist/queries/flash-content.query.d.ts +2 -0
- package/dist/queries/flash-content.query.js +45 -32
- package/dist/queries/flash-content.query.js.map +1 -0
- package/dist/queries/linking-to-domains.query.d.ts +2 -0
- package/dist/queries/linking-to-domains.query.js +35 -27
- package/dist/queries/linking-to-domains.query.js.map +1 -0
- package/dist/queries/readability-score.query.d.ts +2 -0
- package/dist/queries/readability-score.query.js +21 -13
- package/dist/queries/readability-score.query.js.map +1 -0
- package/dist/servers/crawl.server.d.ts +35 -0
- package/dist/servers/crawl.server.js +133 -121
- package/dist/servers/crawl.server.js.map +1 -0
- package/dist/servers/express.server.d.ts +8 -0
- package/dist/servers/express.server.js +7 -10
- package/dist/servers/express.server.js.map +1 -0
- package/dist/servers/maintenance.server.d.ts +22 -0
- package/dist/servers/maintenance.server.js +42 -36
- package/dist/servers/maintenance.server.js.map +1 -0
- package/dist/servers/rest.server.d.ts +7 -0
- package/dist/servers/rest.server.js +40 -51
- package/dist/servers/rest.server.js.map +1 -0
- package/dist/servers/socket-io.server.d.ts +12 -0
- package/dist/servers/socket-io.server.js +48 -15
- package/dist/servers/socket-io.server.js.map +1 -0
- package/dist/services/database.service.d.ts +68 -0
- package/dist/services/database.service.js +528 -462
- package/dist/services/database.service.js.map +1 -0
- package/dist/types/tcrawl-config.d.ts +14 -0
- package/dist/types/tcrawl-config.js +14 -17
- package/dist/types/tcrawl-config.js.map +1 -0
- package/dist/types/thydra-config.d.ts +4 -0
- package/dist/types/thydra-config.js +4 -7
- package/dist/types/thydra-config.js.map +1 -0
- package/dist/types/tparser-ctor.d.ts +7 -0
- package/dist/types/tparser-ctor.js +2 -2
- package/dist/types/tparser-ctor.js.map +1 -0
- package/dist/types/tquery.d.ts +7 -0
- package/dist/types/tquery.js +2 -2
- package/dist/types/tquery.js.map +1 -0
- package/dist/types/trobots-config.d.ts +4 -0
- package/dist/types/trobots-config.js +4 -7
- package/dist/types/trobots-config.js.map +1 -0
- package/package.json +41 -29
- package/angular/10-es2015.bacd4ae5dd7913ce55f0.js +0 -1
- package/angular/10-es5.bacd4ae5dd7913ce55f0.js +0 -1
- package/angular/11-es2015.0f031dcf752d1e8eda6b.js +0 -1
- package/angular/11-es5.0f031dcf752d1e8eda6b.js +0 -1
- package/angular/3rdpartylicenses.txt +0 -1127
- package/angular/5-es2015.951498ca9c1bc74e57bf.js +0 -1
- package/angular/5-es5.951498ca9c1bc74e57bf.js +0 -1
- package/angular/6-es2015.65f680261a3506b88381.js +0 -1
- package/angular/6-es5.65f680261a3506b88381.js +0 -1
- package/angular/7-es2015.625197f3af1dbf3e805d.js +0 -1
- package/angular/7-es5.625197f3af1dbf3e805d.js +0 -1
- package/angular/8-es2015.55518901987a5b834309.js +0 -1
- package/angular/8-es5.55518901987a5b834309.js +0 -1
- package/angular/9-es2015.6cc9bde262564e7836f2.js +0 -1
- package/angular/9-es5.6cc9bde262564e7836f2.js +0 -1
- package/angular/Roboto-Black.41ed1105a6ebb8ffe34e.woff2 +0 -0
- package/angular/Roboto-Black.937491dfcbe64ca9a9f1.woff +0 -0
- package/angular/Roboto-BlackItalic.2e1ee657996854c6f427.woff +0 -0
- package/angular/Roboto-BlackItalic.50ca4c51ebc27e7e7d2f.woff2 +0 -0
- package/angular/Roboto-Bold.73288d91c325e82a5b92.woff +0 -0
- package/angular/Roboto-Bold.92fbd4e93cf0a5dbebaa.woff2 +0 -0
- package/angular/Roboto-BoldItalic.5f600d98a73d800ae575.woff2 +0 -0
- package/angular/Roboto-BoldItalic.6d89acbd21d7e3fbecb2.woff +0 -0
- package/angular/Roboto-Light.c27d89ac77468ae18f28.woff2 +0 -0
- package/angular/Roboto-Light.d923dfafc0c5183b59aa.woff +0 -0
- package/angular/Roboto-LightItalic.506274c7228cf81cae4d.woff2 +0 -0
- package/angular/Roboto-LightItalic.d4b8c137518d9d92bb28.woff +0 -0
- package/angular/Roboto-Medium.092c6130df8fd2199888.woff +0 -0
- package/angular/Roboto-Medium.1d3bced88509b0838984.woff2 +0 -0
- package/angular/Roboto-MediumItalic.18ff1628c628080166c1.woff +0 -0
- package/angular/Roboto-MediumItalic.d620b8f53f75966fe42e.woff2 +0 -0
- package/angular/Roboto-Regular.64cfb66c866ea50cad47.woff2 +0 -0
- package/angular/Roboto-Regular.e02e9d6ff5547f7e9962.woff +0 -0
- package/angular/Roboto-RegularItalic.4dd2af1e8df532f41db8.woff2 +0 -0
- package/angular/Roboto-RegularItalic.5ea38fff9eebef99c5df.woff +0 -0
- package/angular/Roboto-Thin.dbd56bd3357dc3617fe5.woff2 +0 -0
- package/angular/Roboto-Thin.e7f7c82374bd0ebef14b.woff +0 -0
- package/angular/Roboto-ThinItalic.5dd9349c940073834e9a.woff +0 -0
- package/angular/Roboto-ThinItalic.a8cef84f735ef887abdc.woff2 +0 -0
- package/angular/assets/config/app-config.json +0 -16
- package/angular/assets/images/splashbg.jpg +0 -0
- package/angular/assets/web-app-commons/fonts/material-icons/MaterialDesignIcons-Community-2.7.94.woff +0 -0
- package/angular/assets/web-app-commons/fonts/material-icons/MaterialDesignIcons-Community-2.7.94.woff2 +0 -0
- package/angular/assets/web-app-commons/fonts/material-icons/material-design-icons-community.css +0 -11293
- package/angular/favicon.ico +0 -0
- package/angular/flUhRq6tzZclQEJ-Vdg-IuiaDsNa.f2a0933406f783065152.woff +0 -0
- package/angular/flUhRq6tzZclQEJ-Vdg-IuiaDsNc.6467d9a24f234e8e8e07.woff2 +0 -0
- package/angular/index.html +0 -16
- package/angular/main-es2015.3a582572476c7f292e52.js +0 -1
- package/angular/main-es5.3a582572476c7f292e52.js +0 -1
- package/angular/polyfills-es2015.7df68534018bc2f6cb09.js +0 -1
- package/angular/polyfills-es5.e79468f406fae2989221.js +0 -1
- package/angular/runtime-es2015.6d2cff76cdb2790d3308.js +0 -1
- package/angular/runtime-es5.6d2cff76cdb2790d3308.js +0 -1
- package/angular/styles.c5c6c2534225b85c4ff0.css +0 -1
- package/config/bad-words.json +0 -1
- package/config/complex-english.json +0 -400
- package/config/hydra-auth.json +0 -8
- package/config/hydra-crawler.json +0 -84
- package/config/list-allow.json +0 -171
- package/config/list-deny.json +0 -248
- package/config/list-expiry.json +0 -7
- package/config/schedule.json +0 -25
- package/config/spelling.json +0 -1
|
@@ -1,19 +1,27 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
1
|
+
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
|
|
2
|
+
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
|
|
3
|
+
return new (P || (P = Promise))(function (resolve, reject) {
|
|
4
|
+
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
|
|
5
|
+
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
|
|
6
|
+
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
|
|
7
|
+
step((generator = generator.apply(thisArg, _arguments || [])).next());
|
|
8
|
+
});
|
|
9
|
+
};
|
|
10
|
+
import { DictionaryParser } from './dictionary.parser';
|
|
11
|
+
export class SpellingParser extends DictionaryParser {
|
|
6
12
|
constructor(url, outcome, config) {
|
|
7
13
|
super(outcome, config, 'spelling');
|
|
8
14
|
this.url = url;
|
|
9
15
|
}
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
16
|
+
parseMatches(database, matches, _nonMatches) {
|
|
17
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
18
|
+
if (!this.url)
|
|
19
|
+
return;
|
|
20
|
+
if (matches.length > 0)
|
|
21
|
+
yield database.setData(this.url, 'spelling', matches);
|
|
22
|
+
else
|
|
23
|
+
yield database.unsetData(this.url, 'spelling');
|
|
24
|
+
});
|
|
17
25
|
}
|
|
18
26
|
}
|
|
19
|
-
|
|
27
|
+
//# sourceMappingURL=spelling.parser.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"spelling.parser.js","sourceRoot":"","sources":["../../src/parsers/spelling.parser.ts"],"names":[],"mappings":";;;;;;;;;AAOA,OAAO,EAAE,gBAAgB,EAAqB,MAAM,qBAAqB,CAAC;AAE1E,MAAM,OAAO,cAAe,SAAQ,gBAAmC;IACtE,YACU,GAAY,EACpB,OAAyB,EACzB,MAAkC;QAEnC,KAAK,CAAC,OAAO,EAAE,MAAM,EAAE,UAAU,CAAC,CAAC;QAJ1B,QAAG,GAAH,GAAG,CAAS;IAKtB,CAAC;IAEe,YAAY,CAAC,QAAyB,EAAE,OAAiB,EAAE,WAAqB;;YAC/F,IAAI,CAAC,IAAI,CAAC,GAAG;gBAAE,OAAO;YAEtB,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC;gBAAE,MAAM,QAAQ,CAAC,OAAO,CAAC,IAAI,CAAC,GAAG,EAAE,UAAU,EAAE,OAAO,CAAC,CAAC;;gBACzE,MAAM,QAAQ,CAAC,SAAS,CAAC,IAAI,CAAC,GAAG,EAAE,UAAU,CAAC,CAAC;QACrD,CAAC;KAAA;CACD"}
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
import { TKeyObject } from 'tscommons-es-core';
|
|
2
|
+
import { IRequestOutcome } from '../interfaces/irequest-outcome';
|
|
3
|
+
import { IParserConfig } from '../interfaces/iparser-config';
|
|
4
|
+
import { DataParser, IDataConfig } from './data.parser';
|
|
5
|
+
export declare abstract class StringParser<T extends IDataConfig> extends DataParser<T> {
|
|
6
|
+
protected stringData: string | undefined;
|
|
7
|
+
constructor(outcome?: IRequestOutcome, config?: TKeyObject<IParserConfig>, configKey?: string);
|
|
8
|
+
}
|
|
@@ -1,19 +1,16 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
const utf_decoder_1 = require("../helpers/utf-decoder");
|
|
5
|
-
const data_parser_1 = require("./data.parser");
|
|
6
|
-
class StringParser extends data_parser_1.DataParser {
|
|
1
|
+
import { UtfDecoder } from '../helpers/utf-decoder';
|
|
2
|
+
import { DataParser } from './data.parser';
|
|
3
|
+
export class StringParser extends DataParser {
|
|
7
4
|
constructor(outcome, config, configKey) {
|
|
8
5
|
super(outcome, config, configKey);
|
|
9
6
|
if (!this.data)
|
|
10
7
|
return;
|
|
11
8
|
try {
|
|
12
|
-
this.stringData =
|
|
9
|
+
this.stringData = UtfDecoder.fromBuffer(this.data);
|
|
13
10
|
}
|
|
14
11
|
catch (ex) {
|
|
15
12
|
// ignore
|
|
16
13
|
}
|
|
17
14
|
}
|
|
18
15
|
}
|
|
19
|
-
|
|
16
|
+
//# sourceMappingURL=string.parser.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"string.parser.js","sourceRoot":"","sources":["../../src/parsers/string.parser.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,UAAU,EAAE,MAAM,wBAAwB,CAAC;AAKpD,OAAO,EAAE,UAAU,EAAe,MAAM,eAAe,CAAC;AAExD,MAAM,OAAgB,YAAoC,SAAQ,UAAa;IAG9E,YACE,OAAyB,EACzB,MAAkC,EAClC,SAAkB;QAEnB,KAAK,CAAC,OAAO,EAAE,MAAM,EAAE,SAAS,CAAC,CAAC;QAElC,IAAI,CAAC,IAAI,CAAC,IAAI;YAAE,OAAO;QAEvB,IAAI;YACH,IAAI,CAAC,UAAU,GAAG,UAAU,CAAC,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;SACnD;QAAC,OAAO,EAAE,EAAE;YACZ,SAAS;SACT;IACF,CAAC;CACD"}
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
import { DatabaseService } from '../services/database.service';
|
|
2
|
+
import { HtmlParser } from './html.parser';
|
|
3
|
+
import { IDataConfig } from './data.parser';
|
|
4
|
+
export declare abstract class TextParser<T extends IDataConfig> extends HtmlParser<T> {
|
|
5
|
+
static attemptExtract(html: string): string;
|
|
6
|
+
protected abstract parseText(database: DatabaseService, text: string): Promise<void>;
|
|
7
|
+
parse(database: DatabaseService): Promise<void>;
|
|
8
|
+
}
|
|
@@ -1,11 +1,18 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
1
|
+
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
|
|
2
|
+
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
|
|
3
|
+
return new (P || (P = Promise))(function (resolve, reject) {
|
|
4
|
+
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
|
|
5
|
+
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
|
|
6
|
+
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
|
|
7
|
+
step((generator = generator.apply(thisArg, _arguments || [])).next());
|
|
8
|
+
});
|
|
9
|
+
};
|
|
10
|
+
import extractor from 'unfluff';
|
|
11
|
+
import { HtmlParser } from './html.parser';
|
|
12
|
+
export class TextParser extends HtmlParser {
|
|
7
13
|
static attemptExtract(html) {
|
|
8
14
|
try {
|
|
15
|
+
// eslint-disable-next-line @typescript-eslint/no-unsafe-assignment, @typescript-eslint/no-unsafe-call
|
|
9
16
|
const extracted = extractor(html);
|
|
10
17
|
if (!extracted)
|
|
11
18
|
throw new Error('Unable to run unfluff');
|
|
@@ -19,17 +26,16 @@ class TextParser extends html_parser_1.HtmlParser {
|
|
|
19
26
|
return '';
|
|
20
27
|
}
|
|
21
28
|
}
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
await this.parseText(database, extracted);
|
|
29
|
+
parse(database) {
|
|
30
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
31
|
+
if (!this.dom) {
|
|
32
|
+
yield this.parseText(database, '');
|
|
33
|
+
return;
|
|
34
|
+
}
|
|
35
|
+
const html = this.dom.html();
|
|
36
|
+
const extracted = TextParser.attemptExtract(html);
|
|
37
|
+
yield this.parseText(database, extracted);
|
|
38
|
+
});
|
|
33
39
|
}
|
|
34
40
|
}
|
|
35
|
-
|
|
41
|
+
//# sourceMappingURL=text.parser.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"text.parser.js","sourceRoot":"","sources":["../../src/parsers/text.parser.ts"],"names":[],"mappings":";;;;;;;;;AAAA,OAAO,SAAS,MAAM,SAAS,CAAC;AAIhC,OAAO,EAAE,UAAU,EAAE,MAAM,eAAe,CAAC;AAG3C,MAAM,OAAgB,UAAkC,SAAQ,UAAa;IACrE,MAAM,CAAC,cAAc,CAAC,IAAY;QACxC,IAAI;YACH,sGAAsG;YACtG,MAAM,SAAS,GAAqB,SAAS,CAAC,IAAI,CAAC,CAAC;YACpD,IAAI,CAAC,SAAS;gBAAE,MAAM,IAAI,KAAK,CAAC,uBAAuB,CAAC,CAAC;YAEzD,OAAO,SAAS,CAAC,IAAI;iBAClB,KAAK,CAAC,KAAK,CAAC;iBACZ,GAAG,CAAC,CAAC,CAAS,EAAU,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;iBACpC,MAAM,CAAC,CAAC,CAAS,EAAW,EAAE,CAAC,CAAC,KAAK,EAAE,CAAC;iBACxC,IAAI,CAAC,IAAI,CAAC,CAAC;SACd;QAAC,OAAO,EAAE,EAAE;YACZ,OAAO,EAAE,CAAC;SACV;IACF,CAAC;IAIY,KAAK,CAAC,QAAyB;;YAC3C,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE;gBACd,MAAM,IAAI,CAAC,SAAS,CAAC,QAAQ,EAAE,EAAE,CAAC,CAAC;gBACnC,OAAO;aACP;YAED,MAAM,IAAI,GAAW,IAAI,CAAC,GAAG,CAAC,IAAI,EAAE,CAAC;YACrC,MAAM,SAAS,GAAW,UAAU,CAAC,cAAc,CAAC,IAAI,CAAC,CAAC;YAE1D,MAAM,IAAI,CAAC,SAAS,CAAC,QAAQ,EAAE,SAAS,CAAC,CAAC;QAC3C,CAAC;KAAA;CACD"}
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
import { DatabaseService } from '../services/database.service';
|
|
2
|
+
import { TextParser } from './text.parser';
|
|
3
|
+
import { IDataConfig } from './data.parser';
|
|
4
|
+
export interface IWordsConfig extends IDataConfig {
|
|
5
|
+
allowHyphenatedWords?: boolean;
|
|
6
|
+
}
|
|
7
|
+
export declare function isIWordsConfig(test: unknown): test is IWordsConfig;
|
|
8
|
+
export declare abstract class WordsParser<T extends IWordsConfig> extends TextParser<T> {
|
|
9
|
+
protected abstract parseWords(database: DatabaseService, words: string[]): Promise<void>;
|
|
10
|
+
protected parseText(database: DatabaseService, text: string): Promise<void>;
|
|
11
|
+
}
|
|
@@ -1,36 +1,40 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
1
|
+
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
|
|
2
|
+
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
|
|
3
|
+
return new (P || (P = Promise))(function (resolve, reject) {
|
|
4
|
+
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
|
|
5
|
+
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
|
|
6
|
+
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
|
|
7
|
+
step((generator = generator.apply(thisArg, _arguments || [])).next());
|
|
8
|
+
});
|
|
9
|
+
};
|
|
10
|
+
import { commonsTypeHasPropertyBooleanOrUndefined } from 'tscommons-es-core';
|
|
11
|
+
import { TextParser } from './text.parser';
|
|
12
|
+
import { isIDataConfig } from './data.parser';
|
|
7
13
|
const WORD_WITH_HYPHEN = '(?<![a-z0-9])[a-z0-9](?:[-\']?[a-z0-9])*(?![a-z0-9])';
|
|
8
14
|
const WORD_WITHOUT_HYPHEN = '(?<![a-z0-9])[a-z0-9](?:[\']?[a-z0-9])*(?![a-z0-9])';
|
|
9
|
-
function isIWordsConfig(test) {
|
|
10
|
-
if (!
|
|
15
|
+
export function isIWordsConfig(test) {
|
|
16
|
+
if (!isIDataConfig(test))
|
|
11
17
|
return false;
|
|
12
|
-
if (!
|
|
18
|
+
if (!commonsTypeHasPropertyBooleanOrUndefined(test, 'allowHyphenatedWords'))
|
|
13
19
|
return false;
|
|
14
20
|
return true;
|
|
15
21
|
}
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
}
|
|
33
|
-
await this.parseWords(database, words);
|
|
22
|
+
export class WordsParser extends TextParser {
|
|
23
|
+
parseText(database, text) {
|
|
24
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
25
|
+
const wordsConfig = this.getConfig(isIWordsConfig);
|
|
26
|
+
if (!wordsConfig)
|
|
27
|
+
return;
|
|
28
|
+
const pattern = wordsConfig.allowHyphenatedWords ? new RegExp(WORD_WITH_HYPHEN, 'ig') : new RegExp(WORD_WITHOUT_HYPHEN, 'ig');
|
|
29
|
+
const words = [];
|
|
30
|
+
while (true) {
|
|
31
|
+
const result = pattern.exec(text);
|
|
32
|
+
if (result === null)
|
|
33
|
+
break;
|
|
34
|
+
words.push(result[0]);
|
|
35
|
+
}
|
|
36
|
+
yield this.parseWords(database, words);
|
|
37
|
+
});
|
|
34
38
|
}
|
|
35
39
|
}
|
|
36
|
-
|
|
40
|
+
//# sourceMappingURL=words.parser.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"words.parser.js","sourceRoot":"","sources":["../../src/parsers/words.parser.ts"],"names":[],"mappings":";;;;;;;;;AAAA,OAAO,EAAE,wCAAwC,EAAE,MAAM,mBAAmB,CAAC;AAI7E,OAAO,EAAE,UAAU,EAAE,MAAM,eAAe,CAAC;AAC3C,OAAO,EAAe,aAAa,EAAE,MAAM,eAAe,CAAC;AAE3D,MAAM,gBAAgB,GAAW,sDAAsD,CAAC;AACxF,MAAM,mBAAmB,GAAW,qDAAqD,CAAC;AAK1F,MAAM,UAAU,cAAc,CAAC,IAAa;IAC3C,IAAI,CAAC,aAAa,CAAC,IAAI,CAAC;QAAE,OAAO,KAAK,CAAC;IAEvC,IAAI,CAAC,wCAAwC,CAAC,IAAI,EAAE,sBAAsB,CAAC;QAAE,OAAO,KAAK,CAAC;IAE1F,OAAO,IAAI,CAAC;AACb,CAAC;AAED,MAAM,OAAgB,WAAoC,SAAQ,UAAa;IAG9D,SAAS,CAAC,QAAyB,EAAE,IAAY;;YAChE,MAAM,WAAW,GAA2B,IAAI,CAAC,SAAS,CAAC,cAAc,CAAC,CAAC;YAC3E,IAAI,CAAC,WAAW;gBAAE,OAAO;YAEzB,MAAM,OAAO,GAAW,WAAW,CAAC,oBAAoB,CAAC,CAAC,CAAC,IAAI,MAAM,CAAC,gBAAgB,EAAE,IAAI,CAAC,CAAC,CAAC,CAAC,IAAI,MAAM,CAAC,mBAAmB,EAAE,IAAI,CAAC,CAAC;YAEtI,MAAM,KAAK,GAAa,EAAE,CAAC;YAC3B,OAAO,IAAI,EAAE;gBACZ,MAAM,MAAM,GAAyB,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;gBACxD,IAAI,MAAM,KAAK,IAAI;oBAAE,MAAM;gBAE3B,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC;aACtB;YAED,MAAM,IAAI,CAAC,UAAU,CAAC,QAAQ,EAAE,KAAK,CAAC,CAAC;QACxC,CAAC;KAAA;CACD"}
|
|
@@ -1,42 +1,49 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
1
|
+
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
|
|
2
|
+
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
|
|
3
|
+
return new (P || (P = Promise))(function (resolve, reject) {
|
|
4
|
+
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
|
|
5
|
+
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
|
|
6
|
+
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
|
|
7
|
+
step((generator = generator.apply(thisArg, _arguments || [])).next());
|
|
8
|
+
});
|
|
9
|
+
};
|
|
10
|
+
import * as cheerio from 'cheerio';
|
|
11
|
+
import { commonsArrayUnique } from 'tscommons-es-core';
|
|
12
|
+
import { EStatus } from 'hydra-crawler-ts-assets';
|
|
13
|
+
import { commonsOutputDoing, commonsOutputError, commonsOutputProgress, commonsOutputResult } from 'nodecommons-es-cli';
|
|
14
|
+
import { commonsHttpReadUrlAsBuffer } from 'nodecommons-es-http';
|
|
15
|
+
import { ComplexEnglishParser } from '../parsers/complex-english.parser';
|
|
16
|
+
import { EList } from '../enums/elist';
|
|
17
|
+
export const query = (args, databaseService, lists, _expiry, parsersConfig) => __awaiter(void 0, void 0, void 0, function* () {
|
|
11
18
|
const domain = args.getString('domain');
|
|
12
19
|
if (!domain) {
|
|
13
|
-
|
|
20
|
+
commonsOutputError('No domain specified');
|
|
14
21
|
return;
|
|
15
22
|
}
|
|
16
|
-
const parser = new
|
|
23
|
+
const parser = new ComplexEnglishParser(undefined, undefined, parsersConfig);
|
|
17
24
|
const regexs = parser.getRegExs();
|
|
18
25
|
const keyDictionary = parser.getDictionary();
|
|
19
26
|
if (!keyDictionary)
|
|
20
27
|
throw new Error('No dictionary available');
|
|
21
|
-
|
|
28
|
+
commonsOutputDoing(`Searching for complex english detections for domain ${domain}`);
|
|
22
29
|
const result = databaseService.getUrls().find({
|
|
23
30
|
domain: domain,
|
|
24
|
-
status:
|
|
31
|
+
status: EStatus.DONE,
|
|
25
32
|
statusCode: 200,
|
|
26
33
|
complexEnglish: { $exists: true }
|
|
27
|
-
});
|
|
34
|
+
}, {});
|
|
28
35
|
const matches = [];
|
|
29
36
|
let tally = 0;
|
|
30
37
|
while (true) {
|
|
31
38
|
tally++;
|
|
32
39
|
if ((tally % 100) === 0)
|
|
33
|
-
|
|
34
|
-
const row =
|
|
40
|
+
commonsOutputProgress(`${tally}`);
|
|
41
|
+
const row = yield result.next();
|
|
35
42
|
if (row === null)
|
|
36
43
|
break;
|
|
37
44
|
const typecast = row;
|
|
38
45
|
if (typecast.headers !== undefined && typecast.headers['content-type'] !== undefined) {
|
|
39
|
-
if (!parser.supports(typecast.headers['content-type'], lists.match(
|
|
46
|
+
if (!parser.supports(typecast.headers['content-type'], lists.match(EList.ALLOW, row.url)))
|
|
40
47
|
continue;
|
|
41
48
|
}
|
|
42
49
|
matches.push({
|
|
@@ -44,44 +51,35 @@ const query = async (args, databaseService, lists, _expiry, parsersConfig) => {
|
|
|
44
51
|
complexEnglish: typecast.complexEnglish
|
|
45
52
|
});
|
|
46
53
|
}
|
|
47
|
-
|
|
54
|
+
commonsOutputResult(tally);
|
|
48
55
|
for (const match of matches) {
|
|
49
56
|
console.log('----------------------------------------------------');
|
|
50
57
|
console.log(match.url);
|
|
51
|
-
const data =
|
|
58
|
+
const data = yield commonsHttpReadUrlAsBuffer(match.url);
|
|
52
59
|
if (!data) {
|
|
53
|
-
|
|
60
|
+
commonsOutputError('Unable to read URL. Skipping');
|
|
54
61
|
continue;
|
|
55
62
|
}
|
|
56
|
-
// @ts-ignore
|
|
57
63
|
const dom = cheerio.load(data);
|
|
58
64
|
if (!dom) {
|
|
59
|
-
|
|
65
|
+
commonsOutputError('Unable to parse HTML. Skipping');
|
|
60
66
|
continue;
|
|
61
67
|
}
|
|
62
|
-
// @ts-ignore
|
|
63
|
-
// tslint:disable:no-invalid-this
|
|
64
68
|
const nodes = dom('*')
|
|
65
69
|
.contents()
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
return this.nodeType === 3;
|
|
69
|
-
});
|
|
70
|
-
// tslint:enable:no-invalid-this
|
|
70
|
+
// eslint-disable-next-line @typescript-eslint/no-unsafe-member-access
|
|
71
|
+
.filter((_index, element) => element.nodeType === 3);
|
|
71
72
|
const deconstruct = [];
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
// @ts-ignore
|
|
75
|
-
deconstruct.push(dom(this).text());
|
|
73
|
+
dom(nodes).each((_index, element) => {
|
|
74
|
+
deconstruct.push(dom(element).text());
|
|
76
75
|
});
|
|
77
|
-
// tslint:enable:no-invalid-this
|
|
78
76
|
const lines = deconstruct
|
|
79
77
|
.join('\n')
|
|
80
78
|
.replace(/[\t\r\n]+/g, '\n')
|
|
81
79
|
.split('\n')
|
|
82
80
|
.map((s) => s.trim())
|
|
83
81
|
.filter((s) => s !== '');
|
|
84
|
-
const unique =
|
|
82
|
+
const unique = commonsArrayUnique(lines);
|
|
85
83
|
for (const line of unique) {
|
|
86
84
|
let changed = line;
|
|
87
85
|
for (const complex of match.complexEnglish) {
|
|
@@ -102,5 +100,6 @@ const query = async (args, databaseService, lists, _expiry, parsersConfig) => {
|
|
|
102
100
|
}
|
|
103
101
|
}
|
|
104
102
|
}
|
|
105
|
-
};
|
|
106
|
-
|
|
103
|
+
});
|
|
104
|
+
// export default query;
|
|
105
|
+
//# sourceMappingURL=complex-english.query.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"complex-english.query.js","sourceRoot":"","sources":["../../src/queries/complex-english.query.ts"],"names":[],"mappings":";;;;;;;;;AAAA,OAAO,KAAK,OAAO,MAAM,SAAS,CAAC;AAGnC,OAAO,EAAE,kBAAkB,EAAc,MAAM,mBAAmB,CAAC;AAEnE,OAAO,EAAE,OAAO,EAAE,MAAM,yBAAyB,CAAC;AAGlD,OAAO,EAAe,kBAAkB,EAAE,kBAAkB,EAAE,qBAAqB,EAAE,mBAAmB,EAAE,MAAM,oBAAoB,CAAC;AACrI,OAAO,EAAE,0BAA0B,EAAE,MAAM,qBAAqB,CAAC;AAOjE,OAAO,EAAE,oBAAoB,EAAE,MAAM,mCAAmC,CAAC;AAMzE,OAAO,EAAE,KAAK,EAAE,MAAM,gBAAgB,CAAC;AAUvC,MAAM,CAAC,MAAM,KAAK,GAAW,CAC3B,IAAiB,EACjB,eAAgC,EAChC,KAAY,EACZ,OAAe,EACf,aAAwC,EACzB,EAAE;IAClB,MAAM,MAAM,GAAqB,IAAI,CAAC,SAAS,CAAC,QAAQ,CAAC,CAAC;IAC1D,IAAI,CAAC,MAAM,EAAE;QACZ,kBAAkB,CAAC,qBAAqB,CAAC,CAAC;QAC1C,OAAO;KACP;IAED,MAAM,MAAM,GAAyB,IAAI,oBAAoB,CAC3D,SAAS,EACT,SAAS,EACT,aAAa,CACd,CAAC;IACF,MAAM,MAAM,GAAwB,MAAM,CAAC,SAAS,EAAE,CAAC;IAEvD,MAAM,aAAa,GAAmC,MAAM,CAAC,aAAa,EAAE,CAAC;IAC7E,IAAI,CAAC,aAAa;QAAE,MAAM,IAAI,KAAK,CAAC,yBAAyB,CAAC,CAAC;IAE/D,kBAAkB,CAAC,uDAAuD,MAAM,EAAE,CAAC,CAAC;IAEpF,MAAM,MAAM,GAAiB,eAAe,CAAC,OAAO,EAAE,CAAC,IAAI,CACzD;QACE,MAAM,EAAE,MAAM;QACd,MAAM,EAAE,OAAO,CAAC,IAAI;QACpB,UAAU,EAAE,GAAG;QACf,cAAc,EAAE,EAAE,OAAO,EAAE,IAAI,EAAE;KAClC,EACD,EAAE,CACH,CAAC;IAEF,MAAM,OAAO,GAAoB,EAAE,CAAC;IACpC,IAAI,KAAK,GAAW,CAAC,CAAC;IACtB,OAAO,IAAI,EAAE;QACZ,KAAK,EAAE,CAAC;QACR,IAAI,CAAC,KAAK,GAAG,GAAG,CAAC,KAAK,CAAC;YAAE,qBAAqB,CAAC,GAAG,KAAK,EAAE,CAAC,CAAC;QAE3D,MAAM,GAAG,GAAc,MAAM,MAAM,CAAC,IAAI,EAAE,CAAC;QAC3C,IAAI,GAAG,KAAK,IAAI;YAAE,MAAM;QAExB,MAAM,QAAQ,GAAkB,GAA+B,CAAC;QAEhE,IAAI,QAAQ,CAAC,OAAO,KAAK,SAAS,IAAI,QAAQ,CAAC,OAAO,CAAC,cAAc,CAAC,KAAK,SAAS,EAAE;YACrF,IAAI,CAAC,MAAM,CAAC,QAAQ,CAClB,QAAQ,CAAC,OAAO,CAAC,cAAc,CAAC,EAChC,KAAK,CAAC,KAAK,CAAC,KAAK,CAAC,KAAK,EAAE,GAAG,CAAC,GAAG,CAAC,CAClC;gBAAE,SAAS;SACZ;QAED,OAAO,CAAC,IAAI,CAAC;YACX,GAAG,EAAE,GAAG,CAAC,GAAG;YACZ,cAAc,EAAE,QAAQ,CAAC,cAAc;SACxC,CAAC,CAAC;KACH;IAED,mBAAmB,CAAC,KAAK,CAAC,CAAC;IAE3B,KAAK,MAAM,KAAK,IAAI,OAAO,EAAE;QAC5B,OAAO,CAAC,GAAG,CAAC,sDAAsD,CAAC,CAAC;QACpE,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;QAEvB,MAAM,IAAI,GAAqB,MAAM,0BAA0B,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;QAC3E,IAAI,CAAC,IAAI,EAAE;YACV,kBAAkB,CAAC,8BAA8B,CAAC,CAAC;YACnD,SAAS;SACT;QAED,MAAM,GAAG,GAA2B,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACvD,IAAI,CAAC,GAAG,EAAE;YACT,kBAAkB,CAAC,gCAAgC,CAAC,CAAC;YACrD,SAAS;SACT;QAED,MAAM,KAAK,GAAoB,GAAG,CAAC,GAAG,CAAC;aACpC,QAAQ,EAAE;YACX,sEAAsE;aACrE,MAAM,CAAC,CAAC,MAAc,EAAE,OAAwB,EAAW,EAAE,CAAE,OAAe,CAAC,QAAQ,KAAK,CAAC,CAAC,CAAC;QAElG,MAAM,WAAW,GAAa,EAAE,CAAC;QAEjC,GAAG,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,CAAC,MAAc,EAAE,OAAwB,EAAQ,EAAE;YAClE,WAAW,CAAC,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC;QACvC,CAAC,CAAC,CAAC;QAEH,MAAM,KAAK,GAAa,WAAW;aAChC,IAAI,CAAC,IAAI,CAAC;aACV,OAAO,CAAC,YAAY,EAAE,IAAI,CAAC;aAC3B,KAAK,CAAC,IAAI,CAAC;aACX,GAAG,CAAC,CAAC,CAAS,EAAU,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;aACpC,MAAM,CAAC,CAAC,CAAS,EAAW,EAAE,CAAC,CAAC,KAAK,EAAE,CAAC,CAAC;QAE5C,MAAM,MAAM,GAAa,kBAAkB,CAAC,KAAK,CAAC,CAAC;QAEnD,KAAK,MAAM,IAAI,IAAI,MAAM,EAAE;YAC1B,IAAI,OAAO,GAAW,IAAI,CAAC;YAE3B,KAAK,MAAM,OAAO,IAAI,KAAK,CAAC,cAAc,EAAE;gBAC3C,MAAM,KAAK,GAAqB,MAAM,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC;gBACpD,IAAI,CAAC,KAAK;oBAAE,SAAS;gBAErB,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC;oBAAE,SAAS;gBAEhC,MAAM,WAAW,GAAa,aAAa,CAAC,OAAO,CAAC,CAAC;gBAErD,MAAM,KAAK,GAAa,CAAE,KAAK,OAAO,EAAE,CAAE,CAAC;gBAC3C,IAAI,WAAW,CAAC,MAAM,GAAG,CAAC;oBAAE,KAAK,CAAC,IAAI,CAAC,KAAK,WAAW,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;gBAErE,OAAO,GAAG,OAAO,CAAC,OAAO,CAAC,KAAK,EAAE,MAAM,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC;aAC7D;YAED,IAAI,OAAO,KAAK,IAAI,EAAE;gBACrB,OAAO,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC;gBACrB,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;aAChB;SACD;KACD;AACF,CAAC,CAAA,CAAC;AAEF,wBAAwB"}
|
|
@@ -1,48 +1,56 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
1
|
+
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
|
|
2
|
+
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
|
|
3
|
+
return new (P || (P = Promise))(function (resolve, reject) {
|
|
4
|
+
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
|
|
5
|
+
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
|
|
6
|
+
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
|
|
7
|
+
step((generator = generator.apply(thisArg, _arguments || [])).next());
|
|
8
|
+
});
|
|
9
|
+
};
|
|
10
|
+
import { ECommonsCsvColumnType } from 'tscommons-es-format';
|
|
11
|
+
import { isIUrl } from 'hydra-crawler-ts-assets';
|
|
12
|
+
import { isTLink } from 'hydra-crawler-ts-assets';
|
|
13
|
+
import { EStatus } from 'hydra-crawler-ts-assets';
|
|
14
|
+
import { commonsOutputDoing, commonsOutputProgress, commonsOutputResult, commonsOutputSuccess } from 'nodecommons-es-cli';
|
|
15
|
+
import { CommonsCsv } from 'nodecommons-es-file';
|
|
16
|
+
export const query = (args, databaseService, _lists, _expiry, _parsersConfig) => __awaiter(void 0, void 0, void 0, function* () {
|
|
9
17
|
const filename = args.getString('filename');
|
|
10
|
-
|
|
18
|
+
commonsOutputDoing('Searching for FLV and SWF URLs');
|
|
11
19
|
const results = databaseService.getUrls().find({
|
|
12
20
|
url: /\.(flv|swf)$/i,
|
|
13
21
|
status: { $in: [
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
22
|
+
EStatus.ACTIVE,
|
|
23
|
+
EStatus.DENY,
|
|
24
|
+
EStatus.DISALLOWED,
|
|
25
|
+
EStatus.DONE,
|
|
26
|
+
EStatus.FAILED,
|
|
27
|
+
EStatus.QUEUED
|
|
20
28
|
] }
|
|
21
|
-
});
|
|
22
|
-
const urls =
|
|
23
|
-
|
|
24
|
-
|
|
29
|
+
}, {});
|
|
30
|
+
const urls = yield databaseService.listQueryResults(results, isIUrl);
|
|
31
|
+
commonsOutputResult(urls.length);
|
|
32
|
+
commonsOutputDoing('Searching for outgoing links to URLs');
|
|
25
33
|
let tally = 0;
|
|
26
34
|
const urlLinks = new Map();
|
|
27
35
|
for (const url of urls) {
|
|
28
36
|
const results3 = databaseService.getLinks().find({
|
|
29
37
|
outgoing: url.url
|
|
30
|
-
});
|
|
31
|
-
const links =
|
|
38
|
+
}, {});
|
|
39
|
+
const links = yield databaseService.listQueryResults(results3, isTLink);
|
|
32
40
|
if (links.length === 0)
|
|
33
41
|
continue;
|
|
34
42
|
tally += links.length;
|
|
35
|
-
|
|
43
|
+
commonsOutputProgress(tally);
|
|
36
44
|
urlLinks.set(url, links);
|
|
37
45
|
}
|
|
38
|
-
|
|
39
|
-
|
|
46
|
+
commonsOutputResult(tally);
|
|
47
|
+
commonsOutputDoing('Building CSV array');
|
|
40
48
|
const rows = [];
|
|
41
49
|
for (const url of urls) {
|
|
42
50
|
for (const link of (urlLinks.get(url) || [])) {
|
|
43
51
|
rows.push({
|
|
44
|
-
src:
|
|
45
|
-
dest:
|
|
52
|
+
src: link.url,
|
|
53
|
+
dest: url.url
|
|
46
54
|
});
|
|
47
55
|
}
|
|
48
56
|
}
|
|
@@ -52,19 +60,24 @@ const query = async (args, databaseService, _lists, _expiry, _parsersConfig) =>
|
|
|
52
60
|
return -1;
|
|
53
61
|
if (a.src > b.src)
|
|
54
62
|
return 1;
|
|
63
|
+
if (a.dest < b.dest)
|
|
64
|
+
return -1;
|
|
65
|
+
if (a.dest > b.dest)
|
|
66
|
+
return 1;
|
|
55
67
|
return 0;
|
|
56
68
|
});
|
|
57
|
-
|
|
58
|
-
const csv = new
|
|
69
|
+
commonsOutputSuccess();
|
|
70
|
+
const csv = new CommonsCsv([
|
|
59
71
|
{
|
|
60
72
|
name: 'src',
|
|
61
|
-
type:
|
|
73
|
+
type: ECommonsCsvColumnType.STRING
|
|
62
74
|
},
|
|
63
75
|
{
|
|
64
76
|
name: 'dest',
|
|
65
|
-
type:
|
|
77
|
+
type: ECommonsCsvColumnType.STRING
|
|
66
78
|
}
|
|
67
79
|
]);
|
|
68
80
|
csv.save(rows, filename, true);
|
|
69
|
-
};
|
|
70
|
-
|
|
81
|
+
});
|
|
82
|
+
// export default query;
|
|
83
|
+
//# sourceMappingURL=flash-content.query.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"flash-content.query.js","sourceRoot":"","sources":["../../src/queries/flash-content.query.ts"],"names":[],"mappings":";;;;;;;;;AAGA,OAAO,EAAE,qBAAqB,EAAE,MAAM,qBAAqB,CAAC;AAE5D,OAAO,EAAQ,MAAM,EAAE,MAAM,yBAAyB,CAAC;AACvD,OAAO,EAAS,OAAO,EAAE,MAAM,yBAAyB,CAAC;AACzD,OAAO,EAAE,OAAO,EAAE,MAAM,yBAAyB,CAAC;AAElD,OAAO,EAAe,kBAAkB,EAAE,qBAAqB,EAAE,mBAAmB,EAAE,oBAAoB,EAAE,MAAM,oBAAoB,CAAC;AACvI,OAAO,EAAE,UAAU,EAAE,MAAM,qBAAqB,CAAC;AAgBjD,MAAM,CAAC,MAAM,KAAK,GAAW,CAC3B,IAAiB,EACjB,eAAgC,EAChC,MAAa,EACb,OAAe,EACf,cAAyC,EAC1B,EAAE;IAClB,MAAM,QAAQ,GAAW,IAAI,CAAC,SAAS,CAAC,UAAU,CAAC,CAAC;IAEpD,kBAAkB,CAAC,gCAAgC,CAAC,CAAC;IACrD,MAAM,OAAO,GAAiB,eAAe,CAAC,OAAO,EAAE,CAAC,IAAI,CAC1D;QACE,GAAG,EAAE,eAAe;QACpB,MAAM,EAAE,EAAE,GAAG,EAAE;gBACb,OAAO,CAAC,MAAM;gBACd,OAAO,CAAC,IAAI;gBACZ,OAAO,CAAC,UAAU;gBAClB,OAAO,CAAC,IAAI;gBACZ,OAAO,CAAC,MAAM;gBACd,OAAO,CAAC,MAAM;aACf,EAAE;KACJ,EACD,EAAE,CACH,CAAC;IACF,MAAM,IAAI,GAAW,MAAM,eAAe,CAAC,gBAAgB,CACzD,OAAO,EACP,MAAM,CACP,CAAC;IACF,mBAAmB,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;IAEjC,kBAAkB,CAAC,sCAAsC,CAAC,CAAC;IAC3D,IAAI,KAAK,GAAW,CAAC,CAAC;IACtB,MAAM,QAAQ,GAAuB,IAAI,GAAG,EAAiB,CAAC;IAC9D,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE;QACvB,MAAM,QAAQ,GAAkB,eAAe,CAAC,QAAQ,EAAE,CAAC,IAAI,CAC7D;YACE,QAAQ,EAAE,GAAG,CAAC,GAAG;SAClB,EACD,EAAE,CACH,CAAC;QACF,MAAM,KAAK,GAAY,MAAM,eAAe,CAAC,gBAAgB,CAC3D,QAAQ,EACR,OAAO,CACR,CAAC;QAEF,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC;YAAE,SAAS;QAEjC,KAAK,IAAI,KAAK,CAAC,MAAM,CAAC;QACtB,qBAAqB,CAAC,KAAK,CAAC,CAAC;QAE7B,QAAQ,CAAC,GAAG,CAAC,GAAG,EAAE,KAAK,CAAC,CAAC;KACzB;IACD,mBAAmB,CAAC,KAAK,CAAC,CAAC;IAE3B,kBAAkB,CAAC,oBAAoB,CAAC,CAAC;IAEzC,MAAM,IAAI,GAAW,EAAE,CAAC;IACxB,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE;QACvB,KAAK,MAAM,IAAI,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC,GAAG,CAAC,IAAI,EAAE,CAAC,EAAE;YAC7C,IAAI,CAAC,IAAI,CAAC;gBACR,GAAG,EAAE,IAAI,CAAC,GAAG;gBACb,IAAI,EAAE,GAAG,CAAC,GAAG;aACd,CAAC,CAAC;SACH;KACD;IACD,IAAI;SACD,IAAI,CAAC,CAAC,CAAO,EAAE,CAAO,EAAU,EAAE;QAClC,IAAI,CAAC,CAAC,GAAG,GAAG,CAAC,CAAC,GAAG;YAAE,OAAO,CAAC,CAAC,CAAC;QAC7B,IAAI,CAAC,CAAC,GAAG,GAAG,CAAC,CAAC,GAAG;YAAE,OAAO,CAAC,CAAC;QAC5B,IAAI,CAAC,CAAC,IAAI,GAAG,CAAC,CAAC,IAAI;YAAE,OAAO,CAAC,CAAC,CAAC;QAC/B,IAAI,CAAC,CAAC,IAAI,GAAG,CAAC,CAAC,IAAI;YAAE,OAAO,CAAC,CAAC;QAC9B,OAAO,CAAC,CAAC;IACV,CAAC,CAAC,CAAC;IAEL,oBAAoB,EAAE,CAAC;IAEvB,MAAM,GAAG,GAAe,IAAI,UAAU,CAAC;QACrC;YACE,IAAI,EAAE,KAAK;YACX,IAAI,EAAE,qBAAqB,CAAC,MAAM;SACnC;QACD;YACE,IAAI,EAAE,MAAM;YACZ,IAAI,EAAE,qBAAqB,CAAC,MAAM;SACnC;KACF,CAAC,CAAC;IACH,GAAG,CAAC,IAAI,CACN,IAAI,EACJ,QAAQ,EACR,IAAI,CACL,CAAC;AACH,CAAC,CAAA,CAAC;AAEF,wBAAwB"}
|