@bitblit/ratchet-node-only 4.0.475-alpha → 5.0.107
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/third-party/common-crawl/common-crawl-service.d.ts +15 -0
- package/lib/third-party/common-crawl/common-crawl-service.js +153 -0
- package/lib/third-party/common-crawl/common-crawl-service.js.map +1 -0
- package/lib/third-party/common-crawl/model/common-crawl-fetch-options.d.ts +12 -0
- package/lib/third-party/common-crawl/model/common-crawl-fetch-options.js +2 -0
- package/lib/third-party/common-crawl/model/common-crawl-fetch-options.js.map +1 -0
- package/lib/third-party/common-crawl/model/common-crawl-scan.d.ts +9 -0
- package/lib/third-party/common-crawl/model/common-crawl-scan.js +2 -0
- package/lib/third-party/common-crawl/model/common-crawl-scan.js.map +1 -0
- package/lib/third-party/common-crawl/model/domain-index-entry-raw.d.ts +14 -0
- package/lib/third-party/common-crawl/model/domain-index-entry-raw.js +2 -0
- package/lib/third-party/common-crawl/model/domain-index-entry-raw.js.map +1 -0
- package/lib/third-party/common-crawl/model/index-entry-raw.d.ts +8 -0
- package/lib/third-party/common-crawl/model/index-entry-raw.js +2 -0
- package/lib/third-party/common-crawl/model/index-entry-raw.js.map +1 -0
- package/lib/third-party/common-crawl/model/warc-entry-raw.d.ts +5 -0
- package/lib/third-party/common-crawl/model/warc-entry-raw.js +2 -0
- package/lib/third-party/common-crawl/model/warc-entry-raw.js.map +1 -0
- package/lib/third-party/common-crawl/model/warc-entry.d.ts +5 -0
- package/lib/third-party/common-crawl/model/warc-entry.js +2 -0
- package/lib/third-party/common-crawl/model/warc-entry.js.map +1 -0
- package/package.json +18 -5
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
import { WarcEntry } from './model/warc-entry.js';
|
|
2
|
+
import { CommonCrawlScan } from './model/common-crawl-scan.js';
|
|
3
|
+
import { CommonCrawlFetchOptions } from './model/common-crawl-fetch-options.js';
|
|
4
|
+
import { DomainIndexEntryRaw } from './model/domain-index-entry-raw.js';
|
|
5
|
+
import { IndexEntryRaw } from './model/index-entry-raw.js';
|
|
6
|
+
/**
 * Client for the Common Crawl index server and WARC data host.
 * Lists crawl collections, searches a collection's index, and pulls individual archived records.
 */
export declare class CommonCrawlService {
|
|
7
|
+
/** Base URL of the index server; `collinfo.json` and `<collection>-index?` are appended to it. */
static readonly COMMON_CRAWL_URL: string;
|
|
8
|
+
/** Crawl collection name used by `search` when `options.index` is not supplied. */
static readonly CURRENT_CRAWL: string;
|
|
9
|
+
/** Fetches `collinfo.json` from the index server and returns the parsed JSON. */
fetchIndexes(): Promise<IndexEntryRaw[]>;
|
|
10
|
+
/**
 * For each language reported by `validLanguages(entry)`, pulls the page and collects the trimmed text of
 * every `p`, `div` and `span` element whose text contains a `.`; the result is an object keyed by language.
 */
readPageData(entry: DomainIndexEntryRaw): Promise<any>;
|
|
11
|
+
/** Splits `entry.languages` on commas, trims each piece and drops blank ones. */
static validLanguages(entry: DomainIndexEntryRaw): string[];
|
|
12
|
+
/**
 * Range-requests `entry.filename` from `https://data.commoncrawl.org/`, gunzips the response and returns
 * the first WARC record read, with its content converted to a string.
 * If `language` is given it must be one of `validLanguages(entry)` (otherwise this throws) and it is sent
 * as the `Accept-Language` request header.
 */
pullPageEntry(entry: DomainIndexEntryRaw, language?: string): Promise<WarcEntry>;
|
|
13
|
+
/**
 * Queries the index of `options.index` (or `CURRENT_CRAWL`) for `options.url` and parses each non-blank
 * response line as JSON. `options.url` is required. The implementation resolves to `null` (after logging)
 * when the HTTP status is not 200.
 */
search(options: CommonCrawlFetchOptions): Promise<DomainIndexEntryRaw[]>;
|
|
14
|
+
/**
 * Runs `search(opts)` and then `pullPageEntry` for every hit, sequentially.
 * `onPage`, when supplied, is awaited before each pull with (zero-based index, total hit count, hit url);
 * an error thrown by `onPage` is logged and ignored. A failed pull is logged and recorded in `errors`.
 */
scanSite(opts: CommonCrawlFetchOptions, onPage?: (idx: number, cnt: number, header: string) => Promise<any>): Promise<CommonCrawlScan>;
|
|
15
|
+
}
|
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
import fetch from 'cross-fetch';
|
|
2
|
+
import * as querystring from 'node:querystring';
|
|
3
|
+
import { RequireRatchet } from '@bitblit/ratchet-common/lang/require-ratchet';
|
|
4
|
+
import { StringRatchet } from '@bitblit/ratchet-common/lang/string-ratchet';
|
|
5
|
+
import zlib from 'zlib';
|
|
6
|
+
import { Readable } from 'stream';
|
|
7
|
+
import warc from 'warc';
|
|
8
|
+
import * as cheerio from 'cheerio';
|
|
9
|
+
import { ErrorRatchet } from "@bitblit/ratchet-common/lang/error-ratchet";
|
|
10
|
+
import { NodeStreamRatchet } from "../../stream/node-stream-ratchet";
|
|
11
|
+
import { Logger } from "@bitblit/ratchet-common/logger/logger";
|
|
12
|
+
import { PromiseRatchet } from "@bitblit/ratchet-common/lang/promise-ratchet";
|
|
13
|
+
import { StopWatch } from "@bitblit/ratchet-common/lang/stop-watch";
|
|
14
|
+
/**
 * Client for the Common Crawl index server and WARC data host.
 * Lists crawl collections, searches a collection's index, and pulls individual archived records.
 */
export class CommonCrawlService {
    /** Base URL of the index server; `collinfo.json` and `<collection>-index?` are appended to it. */
    static COMMON_CRAWL_URL = 'https://index.commoncrawl.org/';
    /** Crawl collection queried by `search` when `options.index` is not supplied. */
    static CURRENT_CRAWL = 'CC-MAIN-2024-33';

    /**
     * Fetches the list of crawl collections (`collinfo.json`).
     * @returns the parsed JSON body
     * @throws Error when the server answers with a non-2xx status
     */
    async fetchIndexes() {
        const res = await fetch(CommonCrawlService.COMMON_CRAWL_URL + 'collinfo.json');
        if (!res.ok) {
            // Fail with the HTTP status instead of a confusing JSON parse error on an error page
            throw ErrorRatchet.fErr('Failed to fetch index list: %s : %s', res.status, res.statusText);
        }
        const output = await res.json();
        return output;
    }

    /**
     * Pulls the page once per language listed on the entry and extracts sentence-like text from it.
     * @param entry index entry describing where the archived page lives
     * @returns object keyed by language; each value is the list of trimmed texts of `p`, `div` and `span`
     *          elements that contain a `.`
     */
    async readPageData(entry) {
        const rval = {};
        for (const lang of CommonCrawlService.validLanguages(entry)) {
            // Pass the language through - previously it was computed but never handed to the fetch
            const data = await this.pullPageEntry(entry, lang);
            const parsed = cheerio.load(data.content.toString());
            const found = [];
            for (const tag of ['p', 'div', 'span']) {
                parsed(tag).each((idx, el) => {
                    const txt = StringRatchet.trimToNull(parsed(el).text());
                    if (txt && txt.includes('.')) {
                        found.push(txt);
                    }
                });
            }
            rval[lang] = found;
        }
        return rval;
    }

    /**
     * Splits the comma-separated `languages` field of an entry into trimmed, non-blank language codes.
     * @param entry index entry; a missing `languages` field yields an empty list
     * @returns the language codes, in the order they appear
     */
    static validLanguages(entry) {
        const raw = entry?.languages ?? '';
        return String(raw)
            .split(',')
            .map((s) => StringRatchet.trimToNull(s))
            .filter((s) => !!s);
    }

    /**
     * Downloads the byte range of a single archived record, gunzips it and returns the first WARC record.
     * @param entry index entry; `filename`, `offset` and `length` locate the record
     * @param language optional; must be one of `validLanguages(entry)`, sent as `Accept-Language`
     * @returns `{ protocol, headers, content }` with `content` converted to a string
     * @throws Error on invalid offset/length, unknown language, non-2xx response, unsupported body type,
     *         stream failure, or when the stream finishes without yielding a record
     */
    async pullPageEntry(entry, language) {
        const url = 'https://data.commoncrawl.org/' + entry.filename;
        // offset/length arrive as strings from the index; convert before doing arithmetic,
        // otherwise '+' concatenates ("100" + "50" + 1 === "100501")
        const offset = Number(entry.offset);
        const length = Number(entry.length);
        if (!Number.isInteger(offset) || !Number.isInteger(length) || offset < 0 || length <= 0) {
            throw ErrorRatchet.fErr('Invalid offset/length (%s / %s) for %s', entry.offset, entry.length, entry.filename);
        }
        // HTTP byte ranges are inclusive on both ends, so the last byte is offset + length - 1
        const headers = { Range: 'bytes=' + offset + '-' + (offset + length - 1) };
        if (language) {
            if (!CommonCrawlService.validLanguages(entry).includes(language)) {
                throw ErrorRatchet.fErr('Requested language %s, but valid are %s', language, entry.languages);
            }
            headers['Accept-Language'] = language;
        }
        const resp = await fetch(url, {
            headers: headers,
        });
        if (!resp.ok) {
            // An error page is not gzip data; report the status rather than a decompression failure
            throw ErrorRatchet.fErr('Failed to fetch %s : %s : %s', url, resp.status, resp.statusText);
        }
        let reader = null;
        if (resp.body instanceof Readable) {
            reader = resp.body;
        }
        else if (typeof ReadableStream !== 'undefined' && resp.body instanceof ReadableStream) {
            reader = NodeStreamRatchet.webReadableStreamToNodeReadable(resp.body);
        }
        if (!reader) {
            throw ErrorRatchet.fErr('Unsupported response body type for %s', url);
        }
        const gunzip = zlib.createGunzip();
        const warcstream = new warc();
        Logger.info('Headers is %j', resp.headers);
        const rval = await new Promise((resolve, reject) => {
            let settled = false;
            // Settle exactly once and tear the whole pipeline down so the download does not keep running
            const finish = (err, val) => {
                if (settled) {
                    return;
                }
                settled = true;
                for (const s of [reader, gunzip, warcstream]) {
                    if (typeof s.destroy === 'function') {
                        s.destroy();
                    }
                }
                if (err) {
                    reject(err);
                }
                else {
                    resolve(val);
                }
            };
            const onError = (err) => {
                if (!settled) {
                    Logger.error('Read error: %s', err, err);
                }
                finish(err);
            };
            const onDone = () => {
                Logger.info('Got close event');
                finish(ErrorRatchet.fErr('No WARC record found in %s at offset %s', url, offset));
            };
            // pipe() does not forward errors, so every stage needs its own handler
            reader.on('error', onError);
            gunzip.on('error', onError);
            reader
                .pipe(gunzip)
                .pipe(warcstream)
                .on('data', (val) => {
                    Logger.info('Got data ' + val.content.length);
                    // Only the first record is wanted
                    finish(null, val);
                })
                .on('end', onDone)
                .on('close', onDone)
                .on('error', onError);
        });
        return {
            protocol: rval.protocol,
            headers: rval.headers,
            content: rval.content.toString(),
        };
    }

    /**
     * Queries the index of a crawl collection.
     * Only `url`, `index` and `matchType` of the options are used to build the request.
     * @param options search options; `url` is required
     * @returns the parsed entries (one per non-blank response line), or `null` when the status is not 200
     */
    async search(options) {
        RequireRatchet.notNullOrUndefined(options, 'options');
        RequireRatchet.notNullUndefinedOrOnlyWhitespaceString(options.url, 'options.url');
        const params = {
            url: options.url,
            matchType: options.matchType || 'domain',
            output: 'json',
        };
        const url = CommonCrawlService.COMMON_CRAWL_URL +
            (options.index || CommonCrawlService.CURRENT_CRAWL) +
            '-index?' +
            querystring.stringify(params);
        const res = await fetch(url);
        const body = await res.text();
        if (res.status !== 200) {
            // Log the body text that was actually read; res.body is a (consumed) stream object
            Logger.error('Failed to fetch: %s : %s : %j : %s', res.status, res.statusText, res.headers, body);
            return null;
        }
        return body
            .split('\n')
            .map((s) => (StringRatchet.trimToNull(s) ? JSON.parse(s) : null))
            .filter((s) => !!s);
    }

    /**
     * Searches the index and then pulls every hit, one after another.
     * @param opts search options, passed to `search`
     * @param onPage optional progress callback awaited before each pull with
     *               (zero-based index, total count, hit url); its errors are logged and ignored
     * @returns `{ options, pageIndexes, parsed, errors }`; a failed search yields empty lists
     */
    async scanSite(opts, onPage) {
        const sw = new StopWatch();
        const rval = {
            options: opts,
            pageIndexes: [],
            parsed: [],
            errors: [],
        };
        Logger.info('Performing domain index scan with %j', opts);
        // search() resolves to null on a non-200 response; treat that as "no hits"
        rval.pageIndexes = (await this.search(opts)) ?? [];
        const total = rval.pageIndexes.length;
        Logger.info('Found %d entries, pulling each', total);
        for (const [idx, ent] of rval.pageIndexes.entries()) {
            try {
                Logger.info('Pulling item %d of %d, %s', idx, total, sw.dumpExpected(idx / total));
                if (onPage) {
                    try {
                        await onPage(idx, total, ent.url);
                    }
                    catch (err) {
                        Logger.warn('Failed onpage: %s', err);
                    }
                }
                const parsed = await this.pullPageEntry(ent);
                rval.parsed.push(parsed);
            }
            catch (err) {
                Logger.warn('Failed to pull %j : %s', ent, err);
                rval.errors.push({ pageIdx: ent, error: err });
            }
        }
        Logger.info('Completed full scan in %s', sw.dump());
        return rval;
    }
}
|
|
153
|
+
//# sourceMappingURL=common-crawl-service.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"common-crawl-service.js","sourceRoot":"","sources":["../../../src/third-party/common-crawl/common-crawl-service.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,MAAM,aAAa,CAAC;AAChC,OAAO,KAAK,WAAW,MAAM,kBAAkB,CAAC;AAChD,OAAO,EAAE,cAAc,EAAE,MAAM,8CAA8C,CAAC;AAC9E,OAAO,EAAE,aAAa,EAAE,MAAM,6CAA6C,CAAC;AAC5E,OAAO,IAAI,MAAM,MAAM,CAAC;AACxB,OAAO,EAAE,QAAQ,EAAE,MAAM,QAAQ,CAAC;AAClC,OAAO,IAAI,MAAM,MAAM,CAAC;AACxB,OAAO,KAAK,OAAO,MAAM,SAAS,CAAC;AAOnC,OAAO,EAAE,YAAY,EAAE,MAAM,4CAA4C,CAAC;AAC1E,OAAO,EAAE,iBAAiB,EAAE,MAAM,kCAAkC,CAAC;AACrE,OAAO,EAAE,MAAM,EAAE,MAAM,uCAAuC,CAAC;AAC/D,OAAO,EAAE,cAAc,EAAE,MAAM,8CAA8C,CAAC;AAC9E,OAAO,EAAE,SAAS,EAAE,MAAM,yCAAyC,CAAC;AAKpE,MAAM,OAAO,kBAAkB;IACtB,MAAM,CAAU,gBAAgB,GAAW,gCAAgC,CAAC;IAC5E,MAAM,CAAU,aAAa,GAAW,iBAAiB,CAAC;IAE1D,KAAK,CAAC,YAAY;QACvB,MAAM,GAAG,GAAa,MAAM,KAAK,CAAC,kBAAkB,CAAC,gBAAgB,GAAG,eAAe,CAAC,CAAC;QACzF,MAAM,MAAM,GAAoB,MAAM,GAAG,CAAC,IAAI,EAAE,CAAC;QACjD,OAAO,MAAM,CAAC;IAChB,CAAC;IAEM,KAAK,CAAC,YAAY,CAAC,KAA0B;QAClD,MAAM,IAAI,GAA6B,EAAE,CAAC;QAC1C,MAAM,KAAK,GAAa,kBAAkB,CAAC,cAAc,CAAC,KAAK,CAAC,CAAC;QAEjE,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;YACzB,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE,CAAC;YAChB,MAAM,IAAI,GAAc,MAAM,IAAI,CAAC,aAAa,CAAC,KAAK,CAAC,CAAC;YACxD,MAAM,QAAQ,GAAW,IAAI,CAAC,OAAO,CAAC,QAAQ,EAAE,CAAC;YACjD,MAAM,MAAM,GAAiB,OAAO,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;YACpD,CAAC,GAAG,EAAE,KAAK,EAAE,MAAM,CAAC,CAAC,OAAO,CAAC,CAAC,GAAG,EAAE,EAAE;gBACnC,MAAM,CAAC,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC,GAAW,EAAE,EAAmB,EAAE,EAAE;oBACpD,MAAM,GAAG,GAAW,aAAa,CAAC,UAAU,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC;oBAChE,IAAI,GAAG,IAAI,GAAG,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC;wBAC7B,IAAI,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;oBAEvB,CAAC;gBACH,CAAC,CAAC,CAAC;YACL,CAAC,CAAC,CAAC;QACL,CAAC;QACD,OAAO,IAAI,CAAC;IACd,CAAC;IAEM,MAAM,CAAC,cAAc,CAAC,KAA0B;QACrD,MAAM,UAAU,GAAa,KAAK,CAAC,SAAS;aACzC,KAAK,CAAC,GAAG,CAAC;aACV,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,aAAa,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC;aACvC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC
,CAAC,CAAC;QACtB,OAAO,UAAU,CAAC;IACpB,CAAC;IAEM,KAAK,CAAC,aAAa,CAAC,KAA0B,EAAE,QAAiB;QACtE,MAAM,MAAM,GAAW,+BAA+B,CAAC;QACvD,MAAM,GAAG,GAAW,MAAM,GAAG,KAAK,CAAC,QAAQ,CAAC;QAE5C,MAAM,OAAO,GAA2B,EAAE,KAAK,EAAE,QAAQ,GAAG,KAAK,CAAC,MAAM,GAAG,GAAG,GAAG,CAAC,KAAK,CAAC,MAAM,GAAG,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,EAAE,CAAC;QACrH,IAAI,QAAQ,EAAE,CAAC;YACb,IAAI,CAAC,kBAAkB,CAAC,cAAc,CAAC,KAAK,CAAC,CAAC,QAAQ,CAAC,QAAQ,CAAC,EAAE,CAAC;gBACjE,MAAM,YAAY,CAAC,IAAI,CAAC,yCAAyC,EAAE,QAAQ,EAAE,KAAK,CAAC,SAAS,CAAC,CAAC;YAChG,CAAC;YACD,OAAO,CAAC,iBAAiB,CAAC,GAAG,QAAQ,CAAC;QACxC,CAAC;QAED,MAAM,IAAI,GAAa,MAAM,KAAK,CAAC,GAAG,EAAE;YACtC,OAAO,EAAE,OAAO;SACjB,CAAC,CAAC;QAGH,IAAI,MAAM,GAAa,IAAI,CAAC;QAC5B,IAAI,IAAI,CAAC,IAAI,YAAY,QAAQ,EAAE,CAAC;YAClC,MAAM,GAAG,IAAI,CAAC,IAAI,CAAC;QACrB,CAAC;aAAM,IAAI,IAAI,CAAC,IAAI,YAAY,cAAc,EAAE,CAAC;YAC/C,MAAM,GAAG,iBAAiB,CAAC,+BAA+B,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACxE,CAAC;QAED,MAAM,UAAU,GAAS,IAAI,IAAI,EAAE,CAAC;QAOpC,MAAM,CAAC,IAAI,CAAC,eAAe,EAAE,IAAI,CAAC,OAAO,CAAC,CAAC;QAE3C,IAAI,IAAI,GAAiB,IAAI,CAAC;QAE9B,MAAM;aACH,IAAI,CAAC,IAAI,CAAC,YAAY,EAAE,CAAC;aACzB,IAAI,CAAC,UAAU,CAAC;aAChB,EAAE,CAAC,MAAM,EAAE,CAAC,GAAiB,EAAE,EAAE;YAChC,MAAM,CAAC,IAAI,CAAC,WAAW,GAAG,GAAG,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC;YAC9C,IAAI,GAAG,GAAG,CAAC;YAIX,UAAU,CAAC,OAAO,EAAE,CAAC;QACvB,CAAC,CAAC;aACD,EAAE,CAAC,OAAO,EAAE,GAAG,EAAE;YAChB,MAAM,CAAC,IAAI,CAAC,iBAAiB,CAAC,CAAC;QAEjC,CAAC,CAAC;aACD,EAAE,CAAC,OAAO,EAAE,CAAC,GAAG,EAAE,EAAE;YACnB,MAAM,CAAC,KAAK,CAAC,gBAAgB,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC;QAE3C,CAAC,CAAC,CAAC;QAEL,OAAO,CAAC,IAAI,EAAE,CAAC;YAEb,MAAM,cAAc,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QACjC,CAAC;QAED,MAAM,IAAI,GAAc;YACtB,QAAQ,EAAE,IAAI,CAAC,QAAQ;YACvB,OAAO,EAAE,IAAI,CAAC,OAAO;YACrB,OAAO,EAAE,IAAI,CAAC,OAAO,CAAC,QAAQ,EAAE;SACjC,CAAC;QAEF,OAAO,IAAI,CAAC;IACd,CAAC;IAEM,KAAK,CAAC,MAAM,CAAC,OAAgC;QAClD,cAAc,CAAC,kBAAkB,CAAC,OAAO,EAAE,SAAS,CAAC,CAAC;QACtD,cAAc,CAAC,sCAAsC,CAAC,OAAO,CAAC,GAAG,EAAE,aAAa,CAAC,CAAC;QAElF,IAAI,GAAG,GAAW,kBAAkB,CAAC,gBAAgB,GAAG,CAAC,OAAO,CAAC,KAAK,IAAI,kBAAkB,CAAC,aAAa,
CAAC,CAAC;QAC5G,GAAG,IAAI,SAAS,CAAC;QACjB,MAAM,MAAM,GAAG;YACb,GAAG,EAAE,OAAO,CAAC,GAAG;YAGhB,SAAS,EAAE,OAAO,CAAC,SAAS,IAAI,QAAQ;YAMxC,MAAM,EAAE,MAAM;SACf,CAAC;QAEF,MAAM,OAAO,GAAW,WAAW,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;QACtD,GAAG,IAAI,OAAO,CAAC;QAIf,MAAM,GAAG,GAAa,MAAM,KAAK,CAAC,GAAG,CAAC,CAAC;QACvC,MAAM,IAAI,GAAW,MAAM,GAAG,CAAC,IAAI,EAAE,CAAC;QACtC,IAAI,IAAI,GAA0B,IAAI,CAAC;QACvC,IAAI,GAAG,CAAC,MAAM,KAAK,GAAG,EAAE,CAAC;YACvB,MAAM,KAAK,GAAa,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;YACzC,IAAI,GAAG,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,aAAa,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;QAEnG,CAAC;aAAM,CAAC;YACN,MAAM,CAAC,KAAK,CAAC,oCAAoC,EAAE,GAAG,CAAC,MAAM,EAAE,GAAG,CAAC,UAAU,EAAE,GAAG,CAAC,OAAO,EAAE,GAAG,CAAC,IAAI,CAAC,CAAC;QACxG,CAAC;QAED,OAAO,IAAI,CAAC;IACd,CAAC;IAeM,KAAK,CAAC,QAAQ,CACnB,IAA6B,EAC7B,MAAmE;QAEnE,MAAM,EAAE,GAAc,IAAI,SAAS,EAAE,CAAC;QACtC,MAAM,IAAI,GAAoB;YAC5B,OAAO,EAAE,IAAI;YACb,WAAW,EAAE,EAAE;YACf,MAAM,EAAE,EAAE;YACV,MAAM,EAAE,EAAE;SACX,CAAC;QACF,MAAM,CAAC,IAAI,CAAC,sCAAsC,EAAE,IAAI,CAAC,CAAC;QAC1D,IAAI,CAAC,WAAW,GAAG,MAAM,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;QAC3C,MAAM,CAAC,IAAI,CAAC,gCAAgC,EAAE,IAAI,CAAC,WAAW,CAAC,MAAM,CAAC,CAAC;QACvE,KAAK,MAAM,CAAC,GAAG,EAAE,GAAG,CAAC,IAAI,IAAI,CAAC,WAAW,CAAC,OAAO,EAAE,EAAE,CAAC;YACpD,IAAI,CAAC;gBACH,MAAM,CAAC,IAAI,CAAC,2BAA2B,EAAE,GAAG,EAAE,IAAI,CAAC,WAAW,CAAC,MAAM,EAAE,EAAE,CAAC,YAAY,CAAC,GAAG,GAAG,IAAI,CAAC,WAAW,CAAC,MAAM,CAAC,CAAC,CAAC;gBACvH,IAAI,MAAM,EAAE,CAAC;oBACX,IAAI,CAAC;wBACH,MAAM,MAAM,CAAC,GAAG,EAAE,IAAI,CAAC,WAAW,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,CAAC,CAAC;oBACtD,CAAC;oBAAC,OAAO,GAAG,EAAE,CAAC;wBACb,MAAM,CAAC,IAAI,CAAC,mBAAmB,EAAE,GAAG,CAAC,CAAC;oBACxC,CAAC;gBACH,CAAC;gBACD,MAAM,MAAM,GAAc,MAAM,IAAI,CAAC,aAAa,CAAC,GAAG,CAAC,CAAC;gBAExD,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YAC3B,CAAC;YAAC,OAAO,GAAG,EAAE,CAAC;gBACb,MAAM,CAAC,IAAI,CAAC,wBAAwB,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC;gBAChD,IAAI
,CAAC,MAAM,CAAC,IAAI,CAAC,EAAE,OAAO,EAAE,GAAG,EAAE,KAAK,EAAE,GAAG,EAAE,CAAC,CAAC;YACjD,CAAC;QACH,CAAC;QACD,MAAM,CAAC,IAAI,CAAC,2BAA2B,EAAE,EAAE,CAAC,IAAI,EAAE,CAAC,CAAC;QACpD,OAAO,IAAI,CAAC;IACd,CAAC"}
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
/**
 * Options accepted by `CommonCrawlService.search` and `CommonCrawlService.scanSite`.
 * NOTE(review): the compiled `search` in this package only reads `url`, `index` and `matchType`;
 * the remaining fields are declared but not forwarded to the index server.
 */
export interface CommonCrawlFetchOptions {
|
|
2
|
+
/** URL (or domain) to look up; required - `search` rejects a null or blank value. */
url: string;
|
|
3
|
+
/** Crawl collection name; `search` falls back to `CommonCrawlService.CURRENT_CRAWL` when omitted. */
index?: string;
|
|
4
|
+
from?: string;
|
|
5
|
+
to?: string;
|
|
6
|
+
showNumPages?: boolean;
|
|
7
|
+
/** How `url` is matched; `search` sends `'domain'` when omitted. */
matchType?: 'exact' | 'prefix' | 'host' | 'domain';
|
|
8
|
+
limit?: number;
|
|
9
|
+
sort?: 'asc' | 'desc';
|
|
10
|
+
page?: number;
|
|
11
|
+
pageSize?: number;
|
|
12
|
+
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"common-crawl-fetch-options.js","sourceRoot":"","sources":["../../../../src/third-party/common-crawl/model/common-crawl-fetch-options.ts"],"names":[],"mappings":""}
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
import { WarcEntry } from './warc-entry.js';
|
|
2
|
+
import { DomainIndexEntryRaw } from './domain-index-entry-raw.js';
|
|
3
|
+
import { CommonCrawlFetchOptions } from './common-crawl-fetch-options.js';
|
|
4
|
+
/** Result object built and returned by `CommonCrawlService.scanSite`. */
export interface CommonCrawlScan {
|
|
5
|
+
/** The options object that was passed to `scanSite`. */
options: CommonCrawlFetchOptions;
|
|
6
|
+
/** The index entries returned by `search` for those options. */
pageIndexes: DomainIndexEntryRaw[];
|
|
7
|
+
/** One record per index entry that was pulled successfully. */
parsed: WarcEntry[];
|
|
8
|
+
/** One `{ pageIdx, error }` object per index entry whose pull failed (`pageIdx` holds the entry itself). */
errors: any[];
|
|
9
|
+
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"common-crawl-scan.js","sourceRoot":"","sources":["../../../../src/third-party/common-crawl/model/common-crawl-scan.ts"],"names":[],"mappings":""}
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
/**
 * One entry of an index search result, as produced by `CommonCrawlService.search`
 * (each non-blank response line parsed with `JSON.parse`). All fields are declared as strings.
 */
export interface DomainIndexEntryRaw {
|
|
2
|
+
urlkey: string;
|
|
3
|
+
timestamp: string;
|
|
4
|
+
/** Passed to the `onPage` callback of `scanSite` as its third argument. */
url: string;
|
|
5
|
+
mime: string;
|
|
6
|
+
'mime-detected': string;
|
|
7
|
+
status: string;
|
|
8
|
+
digest: string;
|
|
9
|
+
/** Used together with `offset` by `pullPageEntry` to build the HTTP `Range` header; note it is a string. */
length: string;
|
|
10
|
+
/** Start position used by `pullPageEntry` for the HTTP `Range` header; note it is a string. */
offset: string;
|
|
11
|
+
/** Path appended to `https://data.commoncrawl.org/` by `pullPageEntry`. */
filename: string;
|
|
12
|
+
/** Comma-separated list; split and trimmed by `CommonCrawlService.validLanguages`. */
languages: string;
|
|
13
|
+
encoding: string;
|
|
14
|
+
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"domain-index-entry-raw.js","sourceRoot":"","sources":["../../../../src/third-party/common-crawl/model/domain-index-entry-raw.ts"],"names":[],"mappings":""}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index-entry-raw.js","sourceRoot":"","sources":["../../../../src/third-party/common-crawl/model/index-entry-raw.ts"],"names":[],"mappings":""}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"warc-entry-raw.js","sourceRoot":"","sources":["../../../../src/third-party/common-crawl/model/warc-entry-raw.ts"],"names":[],"mappings":""}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"warc-entry.js","sourceRoot":"","sources":["../../../../src/third-party/common-crawl/model/warc-entry.ts"],"names":[],"mappings":""}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@bitblit/ratchet-node-only",
|
|
3
|
-
"version": "4.0.475-alpha",
|
|
3
|
+
"version": "5.0.107",
|
|
4
4
|
"description": "Ratchet tools for use on node-only",
|
|
5
5
|
"note-on-side-effects": "Technically the entries in 'bin' below might be side effects, but they are called explicitly",
|
|
6
6
|
"sideEffects": false,
|
|
@@ -55,20 +55,27 @@
|
|
|
55
55
|
},
|
|
56
56
|
"license": "Apache-2.0",
|
|
57
57
|
"dependencies": {
|
|
58
|
-
"@bitblit/ratchet-common": "
|
|
58
|
+
"@bitblit/ratchet-common": "5.0.107"
|
|
59
59
|
},
|
|
60
60
|
"optionalDependencies": {
|
|
61
|
+
"cheerio": "1.0.0",
|
|
61
62
|
"csv": "6.3.10",
|
|
62
63
|
"jsonwebtoken": "9.0.2",
|
|
63
|
-
"rxjs": "7.8.1"
|
|
64
|
+
"rxjs": "7.8.1",
|
|
65
|
+
"warc": "1.0.1"
|
|
64
66
|
},
|
|
65
67
|
"peerDependencies": {
|
|
66
|
-
"@bitblit/ratchet-common": "
|
|
68
|
+
"@bitblit/ratchet-common": "5.0.107",
|
|
69
|
+
"cheerio": "^1.0.0",
|
|
67
70
|
"csv": "^6.3.10",
|
|
68
71
|
"jsonwebtoken": "^9.0.2",
|
|
69
|
-
"rxjs": "^7.8.1"
|
|
72
|
+
"rxjs": "^7.8.1",
|
|
73
|
+
"warc": "^1.0.1"
|
|
70
74
|
},
|
|
71
75
|
"peerDependenciesMeta": {
|
|
76
|
+
"cheerio": {
|
|
77
|
+
"optional": true
|
|
78
|
+
},
|
|
72
79
|
"csv": {
|
|
73
80
|
"optional": true
|
|
74
81
|
},
|
|
@@ -77,6 +84,12 @@
|
|
|
77
84
|
},
|
|
78
85
|
"rxjs": {
|
|
79
86
|
"optional": true
|
|
87
|
+
},
|
|
88
|
+
"warc": {
|
|
89
|
+
"optional": true
|
|
80
90
|
}
|
|
91
|
+
},
|
|
92
|
+
"devDependencies": {
|
|
93
|
+
"@types/cheerio": "0.22.35"
|
|
81
94
|
}
|
|
82
95
|
}
|