hydra-crawler 1.4.6 → 2.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/apis/autocomplete.api.d.ts +7 -0
- package/dist/apis/autocomplete.api.js +15 -9
- package/dist/apis/autocomplete.api.js.map +1 -0
- package/dist/apis/bugs.api.d.ts +7 -0
- package/dist/apis/bugs.api.js +21 -15
- package/dist/apis/bugs.api.js.map +1 -0
- package/dist/apis/crawl.api.d.ts +7 -0
- package/dist/apis/crawl.api.js +15 -9
- package/dist/apis/crawl.api.js.map +1 -0
- package/dist/apis/domains.api.d.ts +7 -0
- package/dist/apis/domains.api.js +24 -19
- package/dist/apis/domains.api.js.map +1 -0
- package/dist/apis/images.api.d.ts +7 -0
- package/dist/apis/images.api.js +20 -14
- package/dist/apis/images.api.js.map +1 -0
- package/dist/apis/statistics.api.d.ts +8 -0
- package/dist/apis/statistics.api.js +27 -20
- package/dist/apis/statistics.api.js.map +1 -0
- package/dist/apis/test.api.d.ts +5 -0
- package/dist/apis/test.api.js +15 -9
- package/dist/apis/test.api.js.map +1 -0
- package/dist/apis/urls.api.d.ts +7 -0
- package/dist/apis/urls.api.js +21 -15
- package/dist/apis/urls.api.js.map +1 -0
- package/dist/apps/cleanup.app.d.ts +19 -0
- package/dist/apps/cleanup.app.js +118 -100
- package/dist/apps/cleanup.app.js.map +1 -0
- package/dist/apps/cross-populate-export.app.d.ts +12 -0
- package/dist/apps/cross-populate-export.app.js +60 -47
- package/dist/apps/cross-populate-export.app.js.map +1 -0
- package/dist/apps/cross-populate-import.app.d.ts +12 -0
- package/dist/apps/cross-populate-import.app.js +64 -51
- package/dist/apps/cross-populate-import.app.js.map +1 -0
- package/dist/apps/denylist.app.d.ts +17 -0
- package/dist/apps/denylist.app.js +115 -98
- package/dist/apps/denylist.app.js.map +1 -0
- package/dist/apps/expire.app.d.ts +19 -0
- package/dist/apps/expire.app.js +44 -31
- package/dist/apps/expire.app.js.map +1 -0
- package/dist/apps/extract-text.app.d.ts +8 -0
- package/dist/apps/extract-text.app.js +43 -35
- package/dist/apps/extract-text.app.js.map +1 -0
- package/dist/apps/hydra.app.d.ts +34 -0
- package/dist/apps/hydra.app.js +150 -137
- package/dist/apps/hydra.app.js.map +1 -0
- package/dist/apps/import.app.d.ts +11 -0
- package/dist/apps/import.app.js +44 -32
- package/dist/apps/import.app.js.map +1 -0
- package/dist/apps/internal-hydra-common.app.d.ts +28 -0
- package/dist/apps/internal-hydra-common.app.js +5 -11
- package/dist/apps/internal-hydra-common.app.js.map +1 -0
- package/dist/apps/query.app.d.ts +20 -0
- package/dist/apps/query.app.js +63 -49
- package/dist/apps/query.app.js.map +1 -0
- package/dist/apps/reattempt.app.d.ts +17 -0
- package/dist/apps/reattempt.app.js +66 -53
- package/dist/apps/reattempt.app.js.map +1 -0
- package/dist/apps/requeue-domain.app.d.ts +13 -0
- package/dist/apps/requeue-domain.app.js +50 -37
- package/dist/apps/requeue-domain.app.js.map +1 -0
- package/dist/apps/seed.app.d.ts +15 -0
- package/dist/apps/seed.app.js +53 -40
- package/dist/apps/seed.app.js.map +1 -0
- package/dist/apps/startup.app.d.ts +11 -0
- package/dist/apps/startup.app.js +51 -38
- package/dist/apps/startup.app.js.map +1 -0
- package/dist/apps/unarchive.app.d.ts +15 -0
- package/dist/apps/unarchive.app.js +67 -54
- package/dist/apps/unarchive.app.js.map +1 -0
- package/dist/classes/cleaner.d.ts +12 -0
- package/dist/classes/cleaner.js +227 -207
- package/dist/classes/cleaner.js.map +1 -0
- package/dist/classes/crawler.d.ts +34 -0
- package/dist/classes/crawler.js +248 -241
- package/dist/classes/crawler.js.map +1 -0
- package/dist/classes/dns.d.ts +3 -0
- package/dist/classes/dns.js +10 -13
- package/dist/classes/dns.js.map +1 -0
- package/dist/classes/expirer.d.ts +10 -0
- package/dist/classes/expirer.js +107 -94
- package/dist/classes/expirer.js.map +1 -0
- package/dist/classes/expiry.d.ts +8 -0
- package/dist/classes/expiry.js +16 -19
- package/dist/classes/expiry.js.map +1 -0
- package/dist/classes/lists.d.ts +9 -0
- package/dist/classes/lists.js +13 -18
- package/dist/classes/lists.js.map +1 -0
- package/dist/classes/robot.d.ts +15 -0
- package/dist/classes/robot.js +40 -30
- package/dist/classes/robot.js.map +1 -0
- package/dist/classes/tracker.d.ts +25 -0
- package/dist/classes/tracker.js +82 -64
- package/dist/classes/tracker.js.map +1 -0
- package/dist/cli.d.ts +1 -0
- package/dist/cli.js +72 -65
- package/dist/cli.js.map +1 -0
- package/dist/enums/eavailable-strategy.d.ts +4 -0
- package/dist/enums/eavailable-strategy.js +3 -5
- package/dist/enums/eavailable-strategy.js.map +1 -0
- package/dist/enums/elist.d.ts +7 -0
- package/dist/enums/elist.js +7 -11
- package/dist/enums/elist.js.map +1 -0
- package/dist/enums/eserver.d.ts +8 -0
- package/dist/enums/eserver.js +3 -5
- package/dist/enums/eserver.js.map +1 -0
- package/dist/enums/ex-powered-by.d.ts +6 -0
- package/dist/enums/ex-powered-by.js +3 -5
- package/dist/enums/ex-powered-by.js.map +1 -0
- package/dist/helpers/matcher.d.ts +5 -0
- package/dist/helpers/matcher.js +2 -5
- package/dist/helpers/matcher.js.map +1 -0
- package/dist/helpers/random.d.ts +4 -0
- package/dist/helpers/random.js +2 -5
- package/dist/helpers/random.js.map +1 -0
- package/dist/helpers/utf-decoder.d.ts +4 -0
- package/dist/helpers/utf-decoder.js +3 -6
- package/dist/helpers/utf-decoder.js.map +1 -0
- package/dist/interfaces/iexpiry.d.ts +7 -0
- package/dist/interfaces/iexpiry.js +9 -13
- package/dist/interfaces/iexpiry.js.map +1 -0
- package/dist/interfaces/imatch.d.ts +6 -0
- package/dist/interfaces/imatch.js +6 -9
- package/dist/interfaces/imatch.js.map +1 -0
- package/dist/interfaces/iparser-config.d.ts +4 -0
- package/dist/interfaces/iparser-config.js +4 -7
- package/dist/interfaces/iparser-config.js.map +1 -0
- package/dist/interfaces/iparser.d.ts +8 -0
- package/dist/interfaces/iparser.js +2 -2
- package/dist/interfaces/iparser.js.map +1 -0
- package/dist/interfaces/irequest-outcome.d.ts +11 -0
- package/dist/interfaces/irequest-outcome.js +2 -2
- package/dist/interfaces/irequest-outcome.js.map +1 -0
- package/dist/interfaces/iserver.d.ts +4 -0
- package/dist/interfaces/iserver.js +2 -2
- package/dist/interfaces/iserver.js.map +1 -0
- package/dist/parsers/accessibility-metrics.parser.d.ts +11 -0
- package/dist/parsers/accessibility-metrics.parser.js +34 -26
- package/dist/parsers/accessibility-metrics.parser.js.map +1 -0
- package/dist/parsers/asp-error.parser.d.ts +12 -0
- package/dist/parsers/asp-error.parser.js +36 -28
- package/dist/parsers/asp-error.parser.js.map +1 -0
- package/dist/parsers/bad-words.parser.d.ts +10 -0
- package/dist/parsers/bad-words.parser.js +21 -13
- package/dist/parsers/bad-words.parser.js.map +1 -0
- package/dist/parsers/complex-english.parser.d.ts +15 -0
- package/dist/parsers/complex-english.parser.js +33 -25
- package/dist/parsers/complex-english.parser.js.map +1 -0
- package/dist/parsers/data.parser.d.ts +14 -0
- package/dist/parsers/data.parser.js +12 -16
- package/dist/parsers/data.parser.js.map +1 -0
- package/dist/parsers/dictionary.parser.d.ts +19 -0
- package/dist/parsers/dictionary.parser.js +47 -39
- package/dist/parsers/dictionary.parser.js.map +1 -0
- package/dist/parsers/html.parser.d.ts +13 -0
- package/dist/parsers/html.parser.js +4 -8
- package/dist/parsers/html.parser.js.map +1 -0
- package/dist/parsers/hyperlinks.parser.d.ts +20 -0
- package/dist/parsers/hyperlinks.parser.js +82 -77
- package/dist/parsers/hyperlinks.parser.js.map +1 -0
- package/dist/parsers/image-tags.parser.d.ts +19 -0
- package/dist/parsers/image-tags.parser.js +31 -35
- package/dist/parsers/image-tags.parser.js.map +1 -0
- package/dist/parsers/jpeg.parser.d.ts +11 -0
- package/dist/parsers/jpeg.parser.js +28 -20
- package/dist/parsers/jpeg.parser.js.map +1 -0
- package/dist/parsers/paragraphs.parser.d.ts +13 -0
- package/dist/parsers/paragraphs.parser.js +33 -40
- package/dist/parsers/paragraphs.parser.js.map +1 -0
- package/dist/parsers/parser.d.ts +19 -0
- package/dist/parsers/parser.js +30 -17
- package/dist/parsers/parser.js.map +1 -0
- package/dist/parsers/php-error.parser.d.ts +12 -0
- package/dist/parsers/php-error.parser.js +42 -34
- package/dist/parsers/php-error.parser.js.map +1 -0
- package/dist/parsers/phrase.parser.d.ts +8 -0
- package/dist/parsers/phrase.parser.js +16 -11
- package/dist/parsers/phrase.parser.js.map +1 -0
- package/dist/parsers/regex.parser.d.ts +10 -0
- package/dist/parsers/regex.parser.js +30 -22
- package/dist/parsers/regex.parser.js.map +1 -0
- package/dist/parsers/server.parser.d.ts +11 -0
- package/dist/parsers/server.parser.js +58 -57
- package/dist/parsers/server.parser.js.map +1 -0
- package/dist/parsers/spelling.parser.d.ts +10 -0
- package/dist/parsers/spelling.parser.js +21 -13
- package/dist/parsers/spelling.parser.js.map +1 -0
- package/dist/parsers/string.parser.d.ts +8 -0
- package/dist/parsers/string.parser.js +5 -8
- package/dist/parsers/string.parser.js.map +1 -0
- package/dist/parsers/text.parser.d.ts +8 -0
- package/dist/parsers/text.parser.js +24 -18
- package/dist/parsers/text.parser.js.map +1 -0
- package/dist/parsers/words.parser.d.ts +11 -0
- package/dist/parsers/words.parser.js +32 -28
- package/dist/parsers/words.parser.js.map +1 -0
- package/dist/queries/complex-english.query.d.ts +2 -0
- package/dist/queries/complex-english.query.js +37 -38
- package/dist/queries/complex-english.query.js.map +1 -0
- package/dist/queries/flash-content.query.d.ts +2 -0
- package/dist/queries/flash-content.query.js +39 -30
- package/dist/queries/flash-content.query.js.map +1 -0
- package/dist/queries/linking-to-domains.query.d.ts +2 -0
- package/dist/queries/linking-to-domains.query.js +35 -27
- package/dist/queries/linking-to-domains.query.js.map +1 -0
- package/dist/queries/readability-score.query.d.ts +2 -0
- package/dist/queries/readability-score.query.js +21 -13
- package/dist/queries/readability-score.query.js.map +1 -0
- package/dist/servers/crawl.server.d.ts +35 -0
- package/dist/servers/crawl.server.js +133 -121
- package/dist/servers/crawl.server.js.map +1 -0
- package/dist/servers/express.server.d.ts +8 -0
- package/dist/servers/express.server.js +7 -10
- package/dist/servers/express.server.js.map +1 -0
- package/dist/servers/maintenance.server.d.ts +22 -0
- package/dist/servers/maintenance.server.js +42 -36
- package/dist/servers/maintenance.server.js.map +1 -0
- package/dist/servers/rest.server.d.ts +7 -0
- package/dist/servers/rest.server.js +40 -51
- package/dist/servers/rest.server.js.map +1 -0
- package/dist/servers/socket-io.server.d.ts +12 -0
- package/dist/servers/socket-io.server.js +48 -15
- package/dist/servers/socket-io.server.js.map +1 -0
- package/dist/services/database.service.d.ts +68 -0
- package/dist/services/database.service.js +527 -462
- package/dist/services/database.service.js.map +1 -0
- package/dist/types/tcrawl-config.d.ts +14 -0
- package/dist/types/tcrawl-config.js +14 -17
- package/dist/types/tcrawl-config.js.map +1 -0
- package/dist/types/thydra-config.d.ts +4 -0
- package/dist/types/thydra-config.js +4 -7
- package/dist/types/thydra-config.js.map +1 -0
- package/dist/types/tparser-ctor.d.ts +7 -0
- package/dist/types/tparser-ctor.js +2 -2
- package/dist/types/tparser-ctor.js.map +1 -0
- package/dist/types/tquery.d.ts +7 -0
- package/dist/types/tquery.js +2 -2
- package/dist/types/tquery.js.map +1 -0
- package/dist/types/trobots-config.d.ts +4 -0
- package/dist/types/trobots-config.js +4 -7
- package/dist/types/trobots-config.js.map +1 -0
- package/package.json +41 -29
- package/angular/10-es2015.bacd4ae5dd7913ce55f0.js +0 -1
- package/angular/10-es5.bacd4ae5dd7913ce55f0.js +0 -1
- package/angular/11-es2015.0f031dcf752d1e8eda6b.js +0 -1
- package/angular/11-es5.0f031dcf752d1e8eda6b.js +0 -1
- package/angular/3rdpartylicenses.txt +0 -1127
- package/angular/5-es2015.951498ca9c1bc74e57bf.js +0 -1
- package/angular/5-es5.951498ca9c1bc74e57bf.js +0 -1
- package/angular/6-es2015.65f680261a3506b88381.js +0 -1
- package/angular/6-es5.65f680261a3506b88381.js +0 -1
- package/angular/7-es2015.625197f3af1dbf3e805d.js +0 -1
- package/angular/7-es5.625197f3af1dbf3e805d.js +0 -1
- package/angular/8-es2015.55518901987a5b834309.js +0 -1
- package/angular/8-es5.55518901987a5b834309.js +0 -1
- package/angular/9-es2015.6cc9bde262564e7836f2.js +0 -1
- package/angular/9-es5.6cc9bde262564e7836f2.js +0 -1
- package/angular/Roboto-Black.41ed1105a6ebb8ffe34e.woff2 +0 -0
- package/angular/Roboto-Black.937491dfcbe64ca9a9f1.woff +0 -0
- package/angular/Roboto-BlackItalic.2e1ee657996854c6f427.woff +0 -0
- package/angular/Roboto-BlackItalic.50ca4c51ebc27e7e7d2f.woff2 +0 -0
- package/angular/Roboto-Bold.73288d91c325e82a5b92.woff +0 -0
- package/angular/Roboto-Bold.92fbd4e93cf0a5dbebaa.woff2 +0 -0
- package/angular/Roboto-BoldItalic.5f600d98a73d800ae575.woff2 +0 -0
- package/angular/Roboto-BoldItalic.6d89acbd21d7e3fbecb2.woff +0 -0
- package/angular/Roboto-Light.c27d89ac77468ae18f28.woff2 +0 -0
- package/angular/Roboto-Light.d923dfafc0c5183b59aa.woff +0 -0
- package/angular/Roboto-LightItalic.506274c7228cf81cae4d.woff2 +0 -0
- package/angular/Roboto-LightItalic.d4b8c137518d9d92bb28.woff +0 -0
- package/angular/Roboto-Medium.092c6130df8fd2199888.woff +0 -0
- package/angular/Roboto-Medium.1d3bced88509b0838984.woff2 +0 -0
- package/angular/Roboto-MediumItalic.18ff1628c628080166c1.woff +0 -0
- package/angular/Roboto-MediumItalic.d620b8f53f75966fe42e.woff2 +0 -0
- package/angular/Roboto-Regular.64cfb66c866ea50cad47.woff2 +0 -0
- package/angular/Roboto-Regular.e02e9d6ff5547f7e9962.woff +0 -0
- package/angular/Roboto-RegularItalic.4dd2af1e8df532f41db8.woff2 +0 -0
- package/angular/Roboto-RegularItalic.5ea38fff9eebef99c5df.woff +0 -0
- package/angular/Roboto-Thin.dbd56bd3357dc3617fe5.woff2 +0 -0
- package/angular/Roboto-Thin.e7f7c82374bd0ebef14b.woff +0 -0
- package/angular/Roboto-ThinItalic.5dd9349c940073834e9a.woff +0 -0
- package/angular/Roboto-ThinItalic.a8cef84f735ef887abdc.woff2 +0 -0
- package/angular/assets/config/app-config.json +0 -16
- package/angular/assets/images/splashbg.jpg +0 -0
- package/angular/assets/web-app-commons/fonts/material-icons/MaterialDesignIcons-Community-2.7.94.woff +0 -0
- package/angular/assets/web-app-commons/fonts/material-icons/MaterialDesignIcons-Community-2.7.94.woff2 +0 -0
- package/angular/assets/web-app-commons/fonts/material-icons/material-design-icons-community.css +0 -11293
- package/angular/favicon.ico +0 -0
- package/angular/flUhRq6tzZclQEJ-Vdg-IuiaDsNa.f2a0933406f783065152.woff +0 -0
- package/angular/flUhRq6tzZclQEJ-Vdg-IuiaDsNc.6467d9a24f234e8e8e07.woff2 +0 -0
- package/angular/index.html +0 -16
- package/angular/main-es2015.3a582572476c7f292e52.js +0 -1
- package/angular/main-es5.3a582572476c7f292e52.js +0 -1
- package/angular/polyfills-es2015.7df68534018bc2f6cb09.js +0 -1
- package/angular/polyfills-es5.e79468f406fae2989221.js +0 -1
- package/angular/runtime-es2015.6d2cff76cdb2790d3308.js +0 -1
- package/angular/runtime-es5.6d2cff76cdb2790d3308.js +0 -1
- package/angular/styles.c5c6c2534225b85c4ff0.css +0 -1
- package/config/bad-words.json +0 -1
- package/config/complex-english.json +0 -400
- package/config/hydra-auth.json +0 -8
- package/config/hydra-crawler.json +0 -84
- package/config/list-allow.json +0 -171
- package/config/list-deny.json +0 -248
- package/config/list-expiry.json +0 -7
- package/config/schedule.json +0 -25
- package/config/spelling.json +0 -1
|
@@ -1,42 +1,37 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
1
|
+
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
|
|
2
|
+
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
|
|
3
|
+
return new (P || (P = Promise))(function (resolve, reject) {
|
|
4
|
+
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
|
|
5
|
+
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
|
|
6
|
+
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
|
|
7
|
+
step((generator = generator.apply(thisArg, _arguments || [])).next());
|
|
8
|
+
});
|
|
9
|
+
};
|
|
10
|
+
import { URL } from 'url';
|
|
11
|
+
import { commonsObjectStripNulls, commonsStringRegexLike, commonsTypeAttemptNumber, commonsTypeHasPropertyNumber, commonsTypeHasPropertyString, commonsTypeHasPropertyTArray } from 'tscommons-es-core';
|
|
12
|
+
import { isIUrl } from 'hydra-crawler-ts-assets';
|
|
13
|
+
import { isTPhpError } from 'hydra-crawler-ts-assets';
|
|
14
|
+
import { isTAspError } from 'hydra-crawler-ts-assets';
|
|
15
|
+
import { isTDomain } from 'hydra-crawler-ts-assets';
|
|
16
|
+
import { isTLink } from 'hydra-crawler-ts-assets';
|
|
17
|
+
import { EStatus, toEStatus } from 'hydra-crawler-ts-assets';
|
|
18
|
+
import { EComparator } from 'hydra-crawler-ts-assets';
|
|
19
|
+
import { commonsOutputDebug, commonsOutputError } from 'nodecommons-es-cli';
|
|
20
|
+
import { CommonsMongodbService } from 'nodecommons-es-database-mongodb';
|
|
21
|
+
import { EAvailableStrategy } from '../enums/eavailable-strategy';
|
|
22
|
+
export function isTMongoIdRow(test) {
|
|
23
|
+
if (!commonsTypeHasPropertyString(test, '_id'))
|
|
20
24
|
return false;
|
|
21
25
|
return true;
|
|
22
26
|
}
|
|
23
|
-
|
|
24
|
-
function isTMongoIdTallyRow(test) {
|
|
27
|
+
export function isTMongoIdTallyRow(test) {
|
|
25
28
|
if (!isTMongoIdRow(test))
|
|
26
29
|
return false;
|
|
27
|
-
if (!
|
|
30
|
+
if (!commonsTypeHasPropertyNumber(test, 'tally'))
|
|
28
31
|
return false;
|
|
29
32
|
return true;
|
|
30
33
|
}
|
|
31
|
-
|
|
32
|
-
class DatabaseService extends nodecommons_database_mongodb_1.CommonsMongodbService {
|
|
33
|
-
constructor(credentials) {
|
|
34
|
-
super(credentials);
|
|
35
|
-
// CommonsGracefulAbort.addCallback((): void => {
|
|
36
|
-
// CommonsOutput.alert(`SIGINT abort flag is set. Closing MongoDB connection.`);
|
|
37
|
-
// super.close();
|
|
38
|
-
// });
|
|
39
|
-
}
|
|
34
|
+
export class DatabaseService extends CommonsMongodbService {
|
|
40
35
|
getDomains() {
|
|
41
36
|
if (!this.domains)
|
|
42
37
|
throw new Error('Domains collected has not been instantiated yet');
|
|
@@ -52,454 +47,524 @@ class DatabaseService extends nodecommons_database_mongodb_1.CommonsMongodbServi
|
|
|
52
47
|
throw new Error('Links collected has not been instantiated yet');
|
|
53
48
|
return this.links;
|
|
54
49
|
}
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
this
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
50
|
+
init() {
|
|
51
|
+
const _super = Object.create(null, {
|
|
52
|
+
init: { get: () => super.init }
|
|
53
|
+
});
|
|
54
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
55
|
+
yield _super.init.call(this);
|
|
56
|
+
if (!this.database)
|
|
57
|
+
throw new Error('Database has not been instantiated yet');
|
|
58
|
+
this.domains = yield this.ensureCollection('domains');
|
|
59
|
+
yield this.domains.createIndex({ domain: 1 }, { unique: true });
|
|
60
|
+
yield this.domains.createIndex({ ip: 1 }, { unique: false });
|
|
61
|
+
this.urls = yield this.ensureCollection('urls');
|
|
62
|
+
yield this.urls.createIndex({ url: 1 }, { unique: true });
|
|
63
|
+
yield this.urls.createIndex({ domain: 1 }, { unique: false });
|
|
64
|
+
yield this.urls.createIndex({ status: 1 }, { unique: false });
|
|
65
|
+
yield this.urls.createIndex({ attempted: 1 }, { unique: false });
|
|
66
|
+
yield this.urls.createIndex({ orphan: 1 }, { unique: false });
|
|
67
|
+
yield this.urls.createIndex({ statusCode: 1 }, { unique: false });
|
|
68
|
+
yield this.urls.createIndex({ 'headers.content-type': 1 }, { unique: false });
|
|
69
|
+
yield this.urls.createIndex({ 'headers.content-length': 1 }, { unique: false });
|
|
70
|
+
this.links = yield this.ensureCollection('links');
|
|
71
|
+
yield this.links.createIndex({ url: 1 }, { unique: false });
|
|
72
|
+
yield this.links.createIndex({ outgoing: 1 }, { unique: false });
|
|
73
|
+
yield this.links.createIndex({ url: 1, outgoing: 1 }, { unique: true });
|
|
74
|
+
});
|
|
75
|
+
}
|
|
76
|
+
initParser(ctor) {
|
|
77
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
78
|
+
const parser = new ctor();
|
|
79
|
+
yield parser.init(this);
|
|
80
|
+
});
|
|
80
81
|
}
|
|
81
82
|
getRawDatabase() {
|
|
82
83
|
return super.getRawDatabase();
|
|
83
84
|
}
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
85
|
+
wipe() {
|
|
86
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
87
|
+
yield this.getLinks().deleteMany({});
|
|
88
|
+
yield this.getUrls().deleteMany({});
|
|
89
|
+
yield this.getDomains().deleteMany({});
|
|
90
|
+
});
|
|
88
91
|
}
|
|
89
|
-
|
|
90
|
-
|
|
92
|
+
resetActive() {
|
|
93
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
94
|
+
yield this.getUrls().updateMany({ status: EStatus.ACTIVE }, { $set: { status: EStatus.QUEUED } });
|
|
95
|
+
});
|
|
91
96
|
}
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
await this.getDomains().updateOne({ domain: domain }, { $set: { ip: ip } }, { upsert: true });
|
|
95
|
-
return true;
|
|
96
|
-
}
|
|
97
|
-
catch (ex) {
|
|
98
|
-
nodecommons_cli_1.CommonsOutput.debug('debug position 7');
|
|
99
|
-
console.log(ex);
|
|
100
|
-
return false;
|
|
101
|
-
}
|
|
102
|
-
}
|
|
103
|
-
async queue(url, isDeny) {
|
|
104
|
-
const whatwg = new url_1.URL(url);
|
|
105
|
-
if (!whatwg.protocol.match(/^http(s?):$/))
|
|
106
|
-
return false;
|
|
107
|
-
try {
|
|
108
|
-
const status = isDeny ? hydra_crawler_ts_assets_6.EStatus.DENY : hydra_crawler_ts_assets_6.EStatus.QUEUED;
|
|
109
|
-
// un-archive if currently archived
|
|
110
|
-
await this.getUrls().deleteOne({ url: url, status: hydra_crawler_ts_assets_6.EStatus.ARCHIVED });
|
|
111
|
-
await this.getUrls().insertOne({
|
|
112
|
-
url: url,
|
|
113
|
-
domain: whatwg.hostname,
|
|
114
|
-
status: status
|
|
115
|
-
});
|
|
116
|
-
return true;
|
|
117
|
-
}
|
|
118
|
-
catch (ex) {
|
|
119
|
-
return false;
|
|
120
|
-
}
|
|
121
|
-
}
|
|
122
|
-
async available(strategy, threshold, limit, existing) {
|
|
123
|
-
if (limit === 0)
|
|
124
|
-
return [];
|
|
125
|
-
const comparator = strategy === eavailable_strategy_1.EAvailableStrategy.LARGEST ? hydra_crawler_ts_assets_7.EComparator.GTE : hydra_crawler_ts_assets_7.EComparator.LT;
|
|
126
|
-
let thresholdMatch = {};
|
|
127
|
-
switch (comparator) {
|
|
128
|
-
case hydra_crawler_ts_assets_7.EComparator.GTE:
|
|
129
|
-
thresholdMatch = { tally: { $gte: threshold } };
|
|
130
|
-
break;
|
|
131
|
-
case hydra_crawler_ts_assets_7.EComparator.LT:
|
|
132
|
-
thresholdMatch = { tally: { $lt: threshold } };
|
|
133
|
-
break;
|
|
134
|
-
}
|
|
135
|
-
const sortOrder = strategy === eavailable_strategy_1.EAvailableStrategy.SMALLEST ? 1 : -1;
|
|
136
|
-
const results = this.getUrls().aggregate([
|
|
137
|
-
{ $match: { status: hydra_crawler_ts_assets_6.EStatus.QUEUED, domain: { $nin: existing } } },
|
|
138
|
-
{ $group: { _id: '$domain', tally: { $sum: 1 } } },
|
|
139
|
-
{ $match: thresholdMatch },
|
|
140
|
-
{ $sort: { tally: sortOrder } },
|
|
141
|
-
{ $limit: limit }
|
|
142
|
-
], { allowDiskUse: true });
|
|
143
|
-
const entries = await this.listQueryResults(results, isTMongoIdTallyRow);
|
|
144
|
-
return entries
|
|
145
|
-
.map((entry) => entry._id);
|
|
146
|
-
}
|
|
147
|
-
async next(domain) {
|
|
148
|
-
const next = await this.getUrls().findOne({
|
|
149
|
-
status: hydra_crawler_ts_assets_6.EStatus.QUEUED,
|
|
150
|
-
domain: domain
|
|
151
|
-
});
|
|
152
|
-
if (next === null)
|
|
153
|
-
return undefined;
|
|
154
|
-
return next.url;
|
|
155
|
-
}
|
|
156
|
-
async setStatus(url, status) {
|
|
157
|
-
const updates = {};
|
|
158
|
-
updates['status'] = status;
|
|
159
|
-
if (![hydra_crawler_ts_assets_6.EStatus.QUEUED, hydra_crawler_ts_assets_6.EStatus.ACTIVE].includes(status))
|
|
160
|
-
updates['attempted'] = new Date();
|
|
161
|
-
if (status === hydra_crawler_ts_assets_6.EStatus.DONE)
|
|
162
|
-
updates['done'] = new Date();
|
|
163
|
-
try {
|
|
164
|
-
await this.getUrls().updateOne({ url: url }, { $set: updates });
|
|
165
|
-
return true;
|
|
166
|
-
}
|
|
167
|
-
catch (ex) {
|
|
168
|
-
console.error(ex);
|
|
169
|
-
return false;
|
|
170
|
-
}
|
|
171
|
-
}
|
|
172
|
-
async setStatusCode(url, code) {
|
|
173
|
-
try {
|
|
174
|
-
await this.getUrls().updateOne({ url: url }, { $set: { statusCode: code }
|
|
175
|
-
});
|
|
176
|
-
return true;
|
|
177
|
-
}
|
|
178
|
-
catch (ex) {
|
|
179
|
-
console.error(ex);
|
|
180
|
-
return false;
|
|
181
|
-
}
|
|
182
|
-
}
|
|
183
|
-
async setHeaders(url, headers) {
|
|
184
|
-
try {
|
|
185
|
-
await this.getUrls().updateOne({ url: url }, { $set: { headers: headers }
|
|
186
|
-
});
|
|
187
|
-
return true;
|
|
188
|
-
}
|
|
189
|
-
catch (ex) {
|
|
190
|
-
console.error(ex);
|
|
191
|
-
return false;
|
|
192
|
-
}
|
|
193
|
-
}
|
|
194
|
-
async setData(url, context, data) {
|
|
195
|
-
try {
|
|
196
|
-
const update = {};
|
|
197
|
-
update[context] = data;
|
|
198
|
-
await this.getUrls().updateOne({ url: url }, { $set: update });
|
|
199
|
-
return true;
|
|
200
|
-
}
|
|
201
|
-
catch (ex) {
|
|
202
|
-
console.error(ex);
|
|
203
|
-
return false;
|
|
204
|
-
}
|
|
205
|
-
}
|
|
206
|
-
async unsetData(url, context) {
|
|
207
|
-
try {
|
|
208
|
-
const update = {};
|
|
209
|
-
update[context] = true;
|
|
210
|
-
await this.getUrls().updateOne({ url: url }, { $unset: update });
|
|
211
|
-
return true;
|
|
212
|
-
}
|
|
213
|
-
catch (ex) {
|
|
214
|
-
console.error(ex);
|
|
215
|
-
return false;
|
|
216
|
-
}
|
|
217
|
-
}
|
|
218
|
-
async getTtl(url) {
|
|
219
|
-
const row = await this.getUrls().findOne({ url: url });
|
|
220
|
-
if (row === null)
|
|
221
|
-
return undefined;
|
|
222
|
-
if (!tscommons_core_1.CommonsType.hasPropertyNumber(row, 'ttl'))
|
|
223
|
-
return undefined;
|
|
224
|
-
return row['ttl'];
|
|
225
|
-
}
|
|
226
|
-
async setTtl(url, ttl) {
|
|
227
|
-
try {
|
|
228
|
-
await this.getUrls().updateOne({ url: url }, { $set: { ttl: ttl }
|
|
229
|
-
});
|
|
230
|
-
return true;
|
|
231
|
-
}
|
|
232
|
-
catch (ex) {
|
|
233
|
-
console.error(ex);
|
|
234
|
-
return false;
|
|
235
|
-
}
|
|
236
|
-
}
|
|
237
|
-
async unsetTtl(url) {
|
|
238
|
-
try {
|
|
239
|
-
await this.getUrls().updateOne({ url: url }, { $unset: { ttl: true }
|
|
240
|
-
});
|
|
241
|
-
return true;
|
|
242
|
-
}
|
|
243
|
-
catch (ex) {
|
|
244
|
-
console.error(ex);
|
|
245
|
-
return false;
|
|
246
|
-
}
|
|
247
|
-
}
|
|
248
|
-
async getHash(url) {
|
|
249
|
-
const row = await this.getUrls().findOne({ url: url });
|
|
250
|
-
if (row === null)
|
|
251
|
-
return undefined;
|
|
252
|
-
if (!tscommons_core_1.CommonsType.hasPropertyString(row, 'hash'))
|
|
253
|
-
return undefined;
|
|
254
|
-
return row['hash'];
|
|
255
|
-
}
|
|
256
|
-
async setHash(url, hash) {
|
|
257
|
-
try {
|
|
258
|
-
await this.getUrls().updateOne({ url: url }, { $set: {
|
|
259
|
-
hash: hash,
|
|
260
|
-
hashSet: new Date()
|
|
261
|
-
} });
|
|
262
|
-
return true;
|
|
263
|
-
}
|
|
264
|
-
catch (ex) {
|
|
265
|
-
console.error(ex);
|
|
266
|
-
return false;
|
|
267
|
-
}
|
|
268
|
-
}
|
|
269
|
-
async unsetHash(url) {
|
|
270
|
-
try {
|
|
271
|
-
await this.getUrls().updateOne({ url: url }, { $unset: {
|
|
272
|
-
hash: true,
|
|
273
|
-
hashSet: true
|
|
274
|
-
} });
|
|
275
|
-
return true;
|
|
276
|
-
}
|
|
277
|
-
catch (ex) {
|
|
278
|
-
console.error(ex);
|
|
279
|
-
return false;
|
|
280
|
-
}
|
|
281
|
-
}
|
|
282
|
-
async setFailReason(url, reason) {
|
|
283
|
-
try {
|
|
284
|
-
await this.getUrls().updateOne({ url: url }, { $set: {
|
|
285
|
-
reason: reason
|
|
286
|
-
} });
|
|
287
|
-
return true;
|
|
288
|
-
}
|
|
289
|
-
catch (ex) {
|
|
290
|
-
console.error(ex);
|
|
291
|
-
return false;
|
|
292
|
-
}
|
|
293
|
-
}
|
|
294
|
-
async unsetFailReason(url) {
|
|
295
|
-
try {
|
|
296
|
-
await this.getUrls().updateOne({ url: url }, { $unset: {
|
|
297
|
-
reason: true
|
|
298
|
-
} });
|
|
299
|
-
return true;
|
|
300
|
-
}
|
|
301
|
-
catch (ex) {
|
|
302
|
-
console.error(ex);
|
|
303
|
-
return false;
|
|
304
|
-
}
|
|
305
|
-
}
|
|
306
|
-
async link(url, links) {
|
|
307
|
-
// more efficient to only remove removed and only add new
|
|
308
|
-
// rather than just wiping all existing and re-adding
|
|
309
|
-
const find = this.getLinks().find({
|
|
310
|
-
url: url
|
|
311
|
-
});
|
|
312
|
-
const existing = (await this.listQueryResults(find, hydra_crawler_ts_assets_5.isTLink))
|
|
313
|
-
.map((link) => link.outgoing);
|
|
314
|
-
const removed = [];
|
|
315
|
-
const added = [];
|
|
316
|
-
for (const link of links) {
|
|
317
|
-
if (!existing.includes(link) && !added.includes(link))
|
|
318
|
-
added.push(link);
|
|
319
|
-
}
|
|
320
|
-
for (const link of existing) {
|
|
321
|
-
if (!links.includes(link))
|
|
322
|
-
removed.push(link);
|
|
323
|
-
}
|
|
324
|
-
for (const outgoing of removed) {
|
|
97
|
+
domain(domain, ip) {
|
|
98
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
325
99
|
try {
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
outgoing: outgoing
|
|
329
|
-
});
|
|
100
|
+
yield this.getDomains().updateOne({ domain: domain }, { $set: { ip: ip } }, { upsert: true });
|
|
101
|
+
return true;
|
|
330
102
|
}
|
|
331
103
|
catch (ex) {
|
|
332
|
-
|
|
104
|
+
commonsOutputDebug('debug position 7');
|
|
105
|
+
console.log(ex);
|
|
106
|
+
return false;
|
|
333
107
|
}
|
|
334
|
-
}
|
|
335
|
-
|
|
108
|
+
});
|
|
109
|
+
}
|
|
110
|
+
queue(url, isDeny) {
|
|
111
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
112
|
+
const whatwg = new URL(url);
|
|
113
|
+
if (!whatwg.protocol.match(/^http(s?):$/))
|
|
114
|
+
return false;
|
|
336
115
|
try {
|
|
337
|
-
|
|
116
|
+
const status = isDeny ? EStatus.DENY : EStatus.QUEUED;
|
|
117
|
+
// un-archive if currently archived
|
|
118
|
+
yield this.getUrls().deleteOne({ url: url, status: EStatus.ARCHIVED });
|
|
119
|
+
yield this.getUrls().insertOne({
|
|
338
120
|
url: url,
|
|
339
|
-
|
|
121
|
+
domain: whatwg.hostname,
|
|
122
|
+
status: status
|
|
340
123
|
});
|
|
124
|
+
return true;
|
|
341
125
|
}
|
|
342
126
|
catch (ex) {
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
127
|
+
return false;
|
|
128
|
+
}
|
|
129
|
+
});
|
|
130
|
+
}
|
|
131
|
+
available(strategy, threshold, limit, existing) {
|
|
132
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
133
|
+
if (limit === 0)
|
|
134
|
+
return [];
|
|
135
|
+
const comparator = strategy === EAvailableStrategy.LARGEST ? EComparator.GTE : EComparator.LT;
|
|
136
|
+
const thresholdMatch = { tally: {} };
|
|
137
|
+
switch (comparator) {
|
|
138
|
+
case EComparator.GTE:
|
|
139
|
+
thresholdMatch.tally = { $gte: threshold };
|
|
140
|
+
break;
|
|
141
|
+
case EComparator.LT:
|
|
142
|
+
thresholdMatch.tally = { $lt: threshold };
|
|
143
|
+
break;
|
|
144
|
+
}
|
|
145
|
+
const sortOrder = strategy === EAvailableStrategy.SMALLEST ? 1 : -1;
|
|
146
|
+
const results = this.getUrls().aggregate([
|
|
147
|
+
{ $match: { status: EStatus.QUEUED, domain: { $nin: existing } } },
|
|
148
|
+
{ $group: { _id: '$domain', tally: { $sum: 1 } } },
|
|
149
|
+
{ $match: thresholdMatch },
|
|
150
|
+
{ $sort: { tally: sortOrder } },
|
|
151
|
+
{ $limit: limit }
|
|
152
|
+
], { allowDiskUse: true });
|
|
153
|
+
const entries = yield this.listQueryResults(results, isTMongoIdTallyRow);
|
|
154
|
+
return entries
|
|
155
|
+
// eslint-disable-next-line no-underscore-dangle
|
|
156
|
+
.map((entry) => entry._id);
|
|
157
|
+
});
|
|
158
|
+
}
|
|
159
|
+
next(domain) {
|
|
160
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
161
|
+
const next = yield this.getUrls().findOne({
|
|
162
|
+
status: EStatus.QUEUED,
|
|
163
|
+
domain: domain
|
|
164
|
+
});
|
|
165
|
+
if (next === null)
|
|
166
|
+
return undefined;
|
|
167
|
+
return next.url;
|
|
168
|
+
});
|
|
169
|
+
}
|
|
170
|
+
setStatus(url, status) {
|
|
171
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
172
|
+
const updates = { status: status };
|
|
173
|
+
if (![EStatus.QUEUED, EStatus.ACTIVE].includes(status))
|
|
174
|
+
updates['attempted'] = new Date();
|
|
175
|
+
if (status === EStatus.DONE)
|
|
176
|
+
updates['done'] = new Date();
|
|
177
|
+
try {
|
|
178
|
+
yield this.getUrls().updateOne({ url: url }, { $set: updates });
|
|
179
|
+
return true;
|
|
180
|
+
}
|
|
181
|
+
catch (ex) {
|
|
182
|
+
console.error(ex);
|
|
183
|
+
return false;
|
|
184
|
+
}
|
|
185
|
+
});
|
|
186
|
+
}
|
|
187
|
+
setStatusCode(url, code) {
|
|
188
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
189
|
+
try {
|
|
190
|
+
yield this.getUrls().updateOne({ url: url }, { $set: { statusCode: code } });
|
|
191
|
+
return true;
|
|
192
|
+
}
|
|
193
|
+
catch (ex) {
|
|
194
|
+
console.error(ex);
|
|
195
|
+
return false;
|
|
196
|
+
}
|
|
197
|
+
});
|
|
198
|
+
}
|
|
199
|
+
setHeaders(url, headers) {
|
|
200
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
201
|
+
try {
|
|
202
|
+
yield this.getUrls().updateOne({ url: url }, { $set: { headers: headers } });
|
|
203
|
+
return true;
|
|
204
|
+
}
|
|
205
|
+
catch (ex) {
|
|
206
|
+
console.error(ex);
|
|
207
|
+
return false;
|
|
208
|
+
}
|
|
209
|
+
});
|
|
210
|
+
}
|
|
211
|
+
setData(url, context, data) {
|
|
212
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
213
|
+
try {
|
|
214
|
+
const update = {};
|
|
215
|
+
update[context] = data;
|
|
216
|
+
yield this.getUrls().updateOne({ url: url }, { $set: update });
|
|
217
|
+
return true;
|
|
218
|
+
}
|
|
219
|
+
catch (ex) {
|
|
220
|
+
console.error(ex);
|
|
221
|
+
return false;
|
|
222
|
+
}
|
|
223
|
+
});
|
|
224
|
+
}
|
|
225
|
+
unsetData(url, context) {
|
|
226
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
227
|
+
try {
|
|
228
|
+
const update = {};
|
|
229
|
+
update[context] = true;
|
|
230
|
+
yield this.getUrls().updateOne({ url: url }, { $unset: update });
|
|
231
|
+
return true;
|
|
358
232
|
}
|
|
359
|
-
|
|
360
|
-
|
|
233
|
+
catch (ex) {
|
|
234
|
+
console.error(ex);
|
|
235
|
+
return false;
|
|
236
|
+
}
|
|
237
|
+
});
|
|
361
238
|
}
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
239
|
+
getTtl(url) {
|
|
240
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
241
|
+
const row = yield this.getUrls().findOne({ url: url });
|
|
242
|
+
if (row === null)
|
|
243
|
+
return undefined;
|
|
244
|
+
if (!commonsTypeHasPropertyNumber(row, 'ttl'))
|
|
245
|
+
return undefined;
|
|
246
|
+
return row['ttl'];
|
|
247
|
+
});
|
|
248
|
+
}
|
|
249
|
+
setTtl(url, ttl) {
|
|
250
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
251
|
+
try {
|
|
252
|
+
yield this.getUrls().updateOne({ url: url }, { $set: { ttl: ttl } });
|
|
253
|
+
return true;
|
|
254
|
+
}
|
|
255
|
+
catch (ex) {
|
|
256
|
+
console.error(ex);
|
|
257
|
+
return false;
|
|
258
|
+
}
|
|
259
|
+
});
|
|
260
|
+
}
|
|
261
|
+
unsetTtl(url) {
|
|
262
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
263
|
+
try {
|
|
264
|
+
yield this.getUrls().updateOne({ url: url }, { $unset: { ttl: true } });
|
|
265
|
+
return true;
|
|
266
|
+
}
|
|
267
|
+
catch (ex) {
|
|
268
|
+
console.error(ex);
|
|
269
|
+
return false;
|
|
270
|
+
}
|
|
271
|
+
});
|
|
272
|
+
}
|
|
273
|
+
getHash(url) {
|
|
274
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
275
|
+
const row = yield this.getUrls().findOne({ url: url });
|
|
276
|
+
if (row === null)
|
|
277
|
+
return undefined;
|
|
278
|
+
if (!commonsTypeHasPropertyString(row, 'hash'))
|
|
279
|
+
return undefined;
|
|
280
|
+
return row['hash'];
|
|
281
|
+
});
|
|
282
|
+
}
|
|
283
|
+
setHash(url, hash) {
|
|
284
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
285
|
+
try {
|
|
286
|
+
yield this.getUrls().updateOne({ url: url }, { $set: {
|
|
287
|
+
hash: hash,
|
|
288
|
+
hashSet: new Date()
|
|
289
|
+
} });
|
|
290
|
+
return true;
|
|
291
|
+
}
|
|
292
|
+
catch (ex) {
|
|
293
|
+
console.error(ex);
|
|
294
|
+
return false;
|
|
295
|
+
}
|
|
296
|
+
});
|
|
297
|
+
}
|
|
298
|
+
unsetHash(url) {
|
|
299
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
300
|
+
try {
|
|
301
|
+
yield this.getUrls().updateOne({ url: url }, { $unset: {
|
|
302
|
+
hash: true,
|
|
303
|
+
hashSet: true
|
|
304
|
+
} });
|
|
305
|
+
return true;
|
|
306
|
+
}
|
|
307
|
+
catch (ex) {
|
|
308
|
+
console.error(ex);
|
|
309
|
+
return false;
|
|
310
|
+
}
|
|
311
|
+
});
|
|
312
|
+
}
|
|
313
|
+
setFailReason(url, reason) {
|
|
314
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
315
|
+
try {
|
|
316
|
+
yield this.getUrls().updateOne({ url: url }, { $set: {
|
|
317
|
+
reason: reason
|
|
318
|
+
} });
|
|
319
|
+
return true;
|
|
320
|
+
}
|
|
321
|
+
catch (ex) {
|
|
322
|
+
console.error(ex);
|
|
323
|
+
return false;
|
|
324
|
+
}
|
|
325
|
+
});
|
|
326
|
+
}
|
|
327
|
+
unsetFailReason(url) {
|
|
328
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
329
|
+
try {
|
|
330
|
+
yield this.getUrls().updateOne({ url: url }, { $unset: {
|
|
331
|
+
reason: true
|
|
332
|
+
} });
|
|
333
|
+
return true;
|
|
334
|
+
}
|
|
335
|
+
catch (ex) {
|
|
336
|
+
console.error(ex);
|
|
337
|
+
return false;
|
|
338
|
+
}
|
|
339
|
+
});
|
|
340
|
+
}
|
|
341
|
+
link(url, links) {
|
|
342
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
343
|
+
// more efficient to only remove removed and only add new
|
|
344
|
+
// rather than just wiping all existing and re-adding
|
|
345
|
+
const find = this.getLinks().find({
|
|
346
|
+
url: url
|
|
347
|
+
}, {});
|
|
348
|
+
const existing = (yield this.listQueryResults(find, isTLink))
|
|
349
|
+
.map((link) => link.outgoing);
|
|
350
|
+
const removed = [];
|
|
351
|
+
const added = [];
|
|
352
|
+
for (const link of links) {
|
|
353
|
+
if (!existing.includes(link) && !added.includes(link))
|
|
354
|
+
added.push(link);
|
|
355
|
+
}
|
|
356
|
+
for (const link of existing) {
|
|
357
|
+
if (!links.includes(link))
|
|
358
|
+
removed.push(link);
|
|
359
|
+
}
|
|
360
|
+
for (const outgoing of removed) {
|
|
361
|
+
try {
|
|
362
|
+
yield this.getLinks().deleteMany({
|
|
363
|
+
url: url,
|
|
364
|
+
outgoing: outgoing
|
|
365
|
+
});
|
|
366
|
+
}
|
|
367
|
+
catch (ex) {
|
|
368
|
+
/* do nothing */
|
|
369
|
+
}
|
|
370
|
+
}
|
|
371
|
+
for (const outgoing of added) {
|
|
372
|
+
try {
|
|
373
|
+
yield this.getLinks().insertOne({
|
|
374
|
+
url: url,
|
|
375
|
+
outgoing: outgoing
|
|
376
|
+
});
|
|
377
|
+
}
|
|
378
|
+
catch (ex) {
|
|
379
|
+
switch (ex.code || -1) {
|
|
380
|
+
case 11000:
|
|
381
|
+
// ignore duplicates
|
|
382
|
+
commonsOutputError(`DUPLICATE: ${url}, ${outgoing}`);
|
|
383
|
+
break;
|
|
384
|
+
case 17280:
|
|
385
|
+
case 17282:
|
|
386
|
+
commonsOutputError(`INDEX TOO LARGE: ${url}, ${outgoing}`);
|
|
387
|
+
// ignore index too large
|
|
388
|
+
break;
|
|
389
|
+
default:
|
|
390
|
+
commonsOutputDebug('debug position 8');
|
|
391
|
+
console.log(ex);
|
|
392
|
+
throw ex;
|
|
393
|
+
}
|
|
394
|
+
}
|
|
395
|
+
}
|
|
365
396
|
return true;
|
|
366
|
-
}
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
return
|
|
437
|
-
url
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
{
|
|
481
|
-
{
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
397
|
+
});
|
|
398
|
+
}
|
|
399
|
+
markDead(domain) {
|
|
400
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
401
|
+
try {
|
|
402
|
+
yield this.getUrls().updateMany({ domain: domain, status: { $in: [EStatus.QUEUED, EStatus.ACTIVE] } }, { $set: { status: EStatus.DEAD, attempted: new Date() } });
|
|
403
|
+
return true;
|
|
404
|
+
}
|
|
405
|
+
catch (ex) {
|
|
406
|
+
console.error(ex);
|
|
407
|
+
return false;
|
|
408
|
+
}
|
|
409
|
+
});
|
|
410
|
+
}
|
|
411
|
+
listStatusTallies() {
|
|
412
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
413
|
+
const results = this.getUrls().aggregate([
|
|
414
|
+
{ $match: { status: { $ne: EStatus.ARCHIVED } } },
|
|
415
|
+
{ $group: {
|
|
416
|
+
_id: '$status',
|
|
417
|
+
tally: { $sum: 1 }
|
|
418
|
+
} }
|
|
419
|
+
]);
|
|
420
|
+
const rows = yield this.listQueryResults(results, isTMongoIdTallyRow);
|
|
421
|
+
const map = new Map();
|
|
422
|
+
for (const row of rows) {
|
|
423
|
+
// eslint-disable-next-line no-underscore-dangle
|
|
424
|
+
const status = toEStatus(row._id);
|
|
425
|
+
if (status)
|
|
426
|
+
map.set(status, row.tally);
|
|
427
|
+
}
|
|
428
|
+
return map;
|
|
429
|
+
});
|
|
430
|
+
}
|
|
431
|
+
getLinkTalliesCount() {
|
|
432
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
433
|
+
return yield this.getLinks().countDocuments();
|
|
434
|
+
});
|
|
435
|
+
}
|
|
436
|
+
getDomainTalliesCount() {
|
|
437
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
438
|
+
return yield this.getDomains().countDocuments();
|
|
439
|
+
});
|
|
440
|
+
}
|
|
441
|
+
listDomainQueuedTallies() {
|
|
442
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
443
|
+
const results = this.getUrls().aggregate([
|
|
444
|
+
{ $match: {
|
|
445
|
+
status: EStatus.QUEUED
|
|
446
|
+
} },
|
|
447
|
+
{ $group: {
|
|
448
|
+
_id: '$domain',
|
|
449
|
+
tally: { $sum: 1 }
|
|
450
|
+
} }
|
|
451
|
+
]);
|
|
452
|
+
const rows = yield this.listQueryResults(results, isTMongoIdTallyRow);
|
|
453
|
+
const map = new Map();
|
|
454
|
+
for (const row of rows) {
|
|
455
|
+
// eslint-disable-next-line no-underscore-dangle
|
|
456
|
+
map.set(row._id, row.tally);
|
|
457
|
+
}
|
|
458
|
+
return map;
|
|
459
|
+
});
|
|
460
|
+
}
|
|
461
|
+
listPhpErrors() {
|
|
462
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
463
|
+
const results = this.getUrls().find({
|
|
464
|
+
status: { $ne: EStatus.ARCHIVED },
|
|
465
|
+
phpErrors: { $exists: true }
|
|
466
|
+
}, {});
|
|
467
|
+
return (yield this.listQueryResults(results, isIUrl))
|
|
468
|
+
.map((url) => {
|
|
469
|
+
if (!commonsTypeHasPropertyTArray(url, 'phpErrors', isTPhpError))
|
|
470
|
+
throw new Error('Invalid PHP error object');
|
|
471
|
+
return {
|
|
472
|
+
url: url.url,
|
|
473
|
+
errors: url.phpErrors
|
|
474
|
+
};
|
|
475
|
+
});
|
|
476
|
+
});
|
|
477
|
+
}
|
|
478
|
+
listAspErrors() {
|
|
479
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
480
|
+
const results = this.getUrls().find({
|
|
481
|
+
status: { $ne: EStatus.ARCHIVED },
|
|
482
|
+
aspErrors: { $exists: true }
|
|
483
|
+
}, {});
|
|
484
|
+
return (yield this.listQueryResults(results, isIUrl))
|
|
485
|
+
.map((url) => {
|
|
486
|
+
if (!commonsTypeHasPropertyTArray(url, 'aspErrors', isTAspError))
|
|
487
|
+
throw new Error('Invalid PHP error object');
|
|
488
|
+
return {
|
|
489
|
+
url: url.url,
|
|
490
|
+
errors: url.aspErrors
|
|
491
|
+
};
|
|
492
|
+
});
|
|
493
|
+
});
|
|
494
|
+
}
|
|
495
|
+
listDone200DomainUrls(domain) {
|
|
496
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
497
|
+
const results = this.getUrls().find({ $and: [
|
|
498
|
+
{ domain: domain },
|
|
499
|
+
{ status: EStatus.DONE },
|
|
500
|
+
{ statusCode: { $gte: 200 } },
|
|
501
|
+
{ statusCode: { $lt: 300 } }
|
|
502
|
+
] }, {});
|
|
503
|
+
return (yield this.listQueryResults(results, isIUrl))
|
|
504
|
+
.map((url) => url.url);
|
|
505
|
+
});
|
|
506
|
+
}
|
|
507
|
+
listDomains() {
|
|
508
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
509
|
+
const results = this.getDomains().find(
|
|
510
|
+
// eslint-disable-next-line @typescript-eslint/no-unsafe-assignment
|
|
511
|
+
{ ip: { $exists: true, $ne: null } }, // this is ok, despite the type objection to null
|
|
512
|
+
{});
|
|
513
|
+
// since we're doing $ne: null above, we don't need to strip nulls, as there won't be any
|
|
514
|
+
return yield this.listQueryResults(results, isTDomain);
|
|
515
|
+
});
|
|
516
|
+
}
|
|
517
|
+
listDomainsByLike(term) {
|
|
518
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
519
|
+
const results = this.getDomains().find({
|
|
520
|
+
domain: new RegExp(commonsStringRegexLike(`%${term}%`), 'i')
|
|
521
|
+
}, {});
|
|
522
|
+
return (yield this.listQueryResults(results, isTDomain))
|
|
523
|
+
.map((encoded) => commonsObjectStripNulls(encoded));
|
|
524
|
+
});
|
|
525
|
+
}
|
|
526
|
+
listInboundLinks(url) {
|
|
527
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
528
|
+
const results = this.getLinks().find({ outgoing: url }, {});
|
|
529
|
+
return (yield this.listQueryResults(results, isTLink))
|
|
530
|
+
.map((link) => link.url);
|
|
531
|
+
});
|
|
532
|
+
}
|
|
533
|
+
listOutboundLinks(url) {
|
|
534
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
535
|
+
const results = this.getLinks().find({ url: url }, {});
|
|
536
|
+
return (yield this.listQueryResults(results, isTLink))
|
|
537
|
+
.map((link) => link.outgoing);
|
|
538
|
+
});
|
|
539
|
+
}
|
|
540
|
+
listImagesBySizeThreshold(size, comparator) {
|
|
541
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
542
|
+
const queries = [
|
|
543
|
+
{ status: EStatus.DONE },
|
|
544
|
+
{ 'headers.content-type': /^image\/(jpeg)/ },
|
|
545
|
+
{ 'headers.content-length': { $exists: true } }
|
|
546
|
+
];
|
|
547
|
+
switch (comparator) {
|
|
548
|
+
case EComparator.GT:
|
|
549
|
+
queries.push({ 'headers.content-length': { $gt: size } });
|
|
550
|
+
break;
|
|
551
|
+
case EComparator.LT:
|
|
552
|
+
queries.push({ 'headers.content-length': { $lt: size } });
|
|
553
|
+
break;
|
|
554
|
+
case EComparator.GTE:
|
|
555
|
+
queries.push({ 'headers.content-length': { $gte: size } });
|
|
556
|
+
break;
|
|
557
|
+
case EComparator.LTE:
|
|
558
|
+
queries.push({ 'headers.content-length': { $lte: size } });
|
|
559
|
+
break;
|
|
560
|
+
}
|
|
561
|
+
const results = this.getUrls().find({ $and: queries }, {});
|
|
562
|
+
return (yield this.listQueryResults(results, isIUrl))
|
|
563
|
+
.map((row) => ({
|
|
564
|
+
url: row.url,
|
|
565
|
+
size: commonsTypeAttemptNumber(row['headers']['content-length']) || -1
|
|
566
|
+
}));
|
|
567
|
+
});
|
|
503
568
|
}
|
|
504
569
|
}
|
|
505
|
-
|
|
570
|
+
//# sourceMappingURL=database.service.js.map
|