@bitblit/ratchet-node-only 4.0.475-alpha → 5.0.107

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (22)
  1. package/lib/third-party/common-crawl/common-crawl-service.d.ts +15 -0
  2. package/lib/third-party/common-crawl/common-crawl-service.js +153 -0
  3. package/lib/third-party/common-crawl/common-crawl-service.js.map +1 -0
  4. package/lib/third-party/common-crawl/model/common-crawl-fetch-options.d.ts +12 -0
  5. package/lib/third-party/common-crawl/model/common-crawl-fetch-options.js +2 -0
  6. package/lib/third-party/common-crawl/model/common-crawl-fetch-options.js.map +1 -0
  7. package/lib/third-party/common-crawl/model/common-crawl-scan.d.ts +9 -0
  8. package/lib/third-party/common-crawl/model/common-crawl-scan.js +2 -0
  9. package/lib/third-party/common-crawl/model/common-crawl-scan.js.map +1 -0
  10. package/lib/third-party/common-crawl/model/domain-index-entry-raw.d.ts +14 -0
  11. package/lib/third-party/common-crawl/model/domain-index-entry-raw.js +2 -0
  12. package/lib/third-party/common-crawl/model/domain-index-entry-raw.js.map +1 -0
  13. package/lib/third-party/common-crawl/model/index-entry-raw.d.ts +8 -0
  14. package/lib/third-party/common-crawl/model/index-entry-raw.js +2 -0
  15. package/lib/third-party/common-crawl/model/index-entry-raw.js.map +1 -0
  16. package/lib/third-party/common-crawl/model/warc-entry-raw.d.ts +5 -0
  17. package/lib/third-party/common-crawl/model/warc-entry-raw.js +2 -0
  18. package/lib/third-party/common-crawl/model/warc-entry-raw.js.map +1 -0
  19. package/lib/third-party/common-crawl/model/warc-entry.d.ts +5 -0
  20. package/lib/third-party/common-crawl/model/warc-entry.js +2 -0
  21. package/lib/third-party/common-crawl/model/warc-entry.js.map +1 -0
  22. package/package.json +18 -5
@@ -0,0 +1,15 @@
1
+ import { WarcEntry } from './model/warc-entry.js';
2
+ import { CommonCrawlScan } from './model/common-crawl-scan.js';
3
+ import { CommonCrawlFetchOptions } from './model/common-crawl-fetch-options.js';
4
+ import { DomainIndexEntryRaw } from './model/domain-index-entry-raw.js';
5
+ import { IndexEntryRaw } from './model/index-entry-raw.js';
6
/**
 * Type declarations for the Common Crawl client.
 *
 * The service talks to the Common Crawl index server to list crawls and look
 * up captures, and downloads individual WARC records for those captures.
 */
export declare class CommonCrawlService {
    /** Base URL of the Common Crawl index server. */
    static readonly COMMON_CRAWL_URL: string;

    /** Crawl id used by `search` when `options.index` is not supplied. */
    static readonly CURRENT_CRAWL: string;

    /** Fetches `collinfo.json` from the index server and returns its parsed JSON. */
    fetchIndexes(): Promise<IndexEntryRaw[]>;

    /**
     * Pulls the record for `entry` and collects the trimmed text of its
     * `p`, `div` and `span` elements that contain a `.` character.
     *
     * @returns an object keyed by each language listed on the entry, each
     *   value being an array of the collected text snippets
     */
    readPageData(entry: DomainIndexEntryRaw): Promise<any>;

    /** Splits `entry.languages` on commas and returns the non-blank, trimmed codes. */
    static validLanguages(entry: DomainIndexEntryRaw): string[];

    /**
     * Downloads the WARC record described by `entry` and returns the first
     * record read from the response, with its content converted to a string.
     *
     * @param language optional language; must be one of the entry's valid
     *   languages, otherwise an error is thrown
     */
    pullPageEntry(entry: DomainIndexEntryRaw, language?: string): Promise<WarcEntry>;

    /**
     * Queries the index server for captures matching `options.url`.
     * The response is parsed as one JSON document per line.
     */
    search(options: CommonCrawlFetchOptions): Promise<DomainIndexEntryRaw[]>;

    /**
     * Runs `search` and then pulls every returned entry in order.
     *
     * @param onPage optional callback awaited before each pull with the
     *   zero-based index, the total count and the entry URL; failures in the
     *   callback are logged and do not stop the scan
     */
    scanSite(
        opts: CommonCrawlFetchOptions,
        onPage?: (idx: number, cnt: number, header: string) => Promise<any>,
    ): Promise<CommonCrawlScan>;
}
@@ -0,0 +1,153 @@
1
+ import fetch from 'cross-fetch';
2
+ import * as querystring from 'node:querystring';
3
+ import { RequireRatchet } from '@bitblit/ratchet-common/lang/require-ratchet';
4
+ import { StringRatchet } from '@bitblit/ratchet-common/lang/string-ratchet';
5
+ import zlib from 'zlib';
6
+ import { Readable } from 'stream';
7
+ import warc from 'warc';
8
+ import * as cheerio from 'cheerio';
9
+ import { ErrorRatchet } from "@bitblit/ratchet-common/lang/error-ratchet";
10
+ import { NodeStreamRatchet } from "../../stream/node-stream-ratchet";
11
+ import { Logger } from "@bitblit/ratchet-common/logger/logger";
12
+ import { PromiseRatchet } from "@bitblit/ratchet-common/lang/promise-ratchet";
13
+ import { StopWatch } from "@bitblit/ratchet-common/lang/stop-watch";
14
/**
 * Client for the Common Crawl index server (capture lookup) and the Common
 * Crawl data host (WARC record download).
 */
export class CommonCrawlService {
  /** Base URL of the Common Crawl index server. */
  static COMMON_CRAWL_URL = 'https://index.commoncrawl.org/';
  /** Crawl id queried by search() when options.index is not supplied. */
  static CURRENT_CRAWL = 'CC-MAIN-2024-33';

  /**
   * Lists the crawls known to the index server.
   * @returns the parsed JSON body of collinfo.json
   */
  async fetchIndexes() {
    const res = await fetch(CommonCrawlService.COMMON_CRAWL_URL + 'collinfo.json');
    const output = await res.json();
    return output;
  }

  /**
   * Extracts sentence-like text snippets from the page behind an index entry.
   *
   * The record is downloaded and parsed once; every language listed on the
   * entry receives its own copy of the resulting snippet list (the download
   * itself does not vary by language).
   *
   * @param entry index entry as returned by search()
   * @returns object mapping each valid language of the entry to an array of
   *   trimmed texts of p/div/span elements that contain a '.'; empty object
   *   when the entry lists no languages
   */
  async readPageData(entry) {
    const rval = {};
    const langs = CommonCrawlService.validLanguages(entry);
    if (langs.length === 0) {
      // Nothing to key the result by, so skip the download entirely.
      return rval;
    }

    const data = await this.pullPageEntry(entry);
    const asString = data.content.toString();
    const parsed = cheerio.load(asString);

    const snippets = [];
    for (const tag of ['p', 'div', 'span']) {
      parsed(tag).each((idx, el) => {
        const txt = StringRatchet.trimToNull(parsed(el).text());
        if (txt && txt.includes('.')) {
          snippets.push(txt);
        }
      });
    }

    for (const lang of langs) {
      // Separate array per language so callers may mutate one without affecting the others.
      rval[lang] = [...snippets];
    }
    return rval;
  }

  /**
   * Returns the language codes listed on an index entry.
   *
   * @param entry index entry; its `languages` field is a comma separated list
   * @returns trimmed, non-blank codes; empty array when the entry has no
   *   `languages` string
   */
  static validLanguages(entry) {
    const raw = entry?.languages;
    if (typeof raw !== 'string') {
      return [];
    }
    return raw
      .split(',')
      .map((s) => StringRatchet.trimToNull(s))
      .filter((s) => !!s);
  }

  /**
   * Builds the HTTP Range header value selecting exactly one record.
   *
   * `offset` and `length` arrive as strings in the index output, so they are
   * converted to numbers before any arithmetic. HTTP byte ranges are
   * inclusive, hence the last byte is offset + length - 1.
   *
   * @param entry object carrying `offset` and `length` (number or numeric string)
   * @returns header value of the form "bytes=START-END"
   * @throws Error when offset/length are not usable integers
   */
  static byteRangeHeader(entry) {
    const offset = Number(entry.offset);
    const length = Number(entry.length);
    if (!Number.isInteger(offset) || !Number.isInteger(length) || offset < 0 || length <= 0) {
      throw new Error('Invalid offset/length on index entry: ' + entry.offset + ' / ' + entry.length);
    }
    return 'bytes=' + offset + '-' + (offset + length - 1);
  }

  /**
   * Downloads the WARC record an index entry points at.
   *
   * @param entry index entry (uses `filename`, `offset`, `length`, `languages`)
   * @param language optional language; must be one of the entry's valid languages
   * @returns object with `protocol`, `headers` and `content` (as string) of
   *   the first record read from the response
   * @throws when the language is not valid for the entry, the HTTP request
   *   fails, the body cannot be streamed, or no record can be read
   */
  async pullPageEntry(entry, language) {
    const prefix = 'https://data.commoncrawl.org/';
    const url = prefix + entry.filename;
    const headers = { Range: CommonCrawlService.byteRangeHeader(entry) };
    if (language) {
      if (!CommonCrawlService.validLanguages(entry).includes(language)) {
        throw ErrorRatchet.fErr('Requested language %s, but valid are %s', language, entry.languages);
      }
      headers['Accept-Language'] = language;
    }

    const resp = await fetch(url, {
      headers: headers,
    });
    if (!resp.ok) {
      // An error page is not gzip data; fail here instead of inside the decompressor.
      throw ErrorRatchet.fErr('Failed to fetch %s : %s : %s', url, resp.status, resp.statusText);
    }

    let reader = null;
    if (resp.body instanceof Readable) {
      reader = resp.body;
    } else if (typeof ReadableStream !== 'undefined' && resp.body instanceof ReadableStream) {
      reader = NodeStreamRatchet.webReadableStreamToNodeReadable(resp.body);
    }
    if (!reader) {
      throw ErrorRatchet.fErr('Unsupported response body type for %s', url);
    }

    const gunzip = zlib.createGunzip();
    const warcstream = new warc();
    Logger.info('Headers is %j', resp.headers);

    let rval = null;
    try {
      rval = await new Promise((resolve, reject) => {
        let settled = false;
        const fail = (err) => {
          if (settled) {
            return; // late errors caused by tearing the pipeline down are not interesting
          }
          settled = true;
          Logger.error('Read error: %s', err, err);
          reject(err);
        };
        const finishedWithoutRecord = () => {
          if (!settled) {
            settled = true;
            reject(ErrorRatchet.fErr('No WARC record could be read from %s', url));
          }
        };

        // pipe() does not forward errors, so every stage needs its own handler.
        reader.on('error', fail);
        gunzip.on('error', fail);
        warcstream
          .on('data', (val) => {
            if (settled) {
              return;
            }
            settled = true;
            Logger.info('Got data ' + val.content.length);
            resolve(val);
          })
          .on('end', finishedWithoutRecord)
          .on('close', () => {
            Logger.info('Got close event');
            finishedWithoutRecord();
          })
          .on('error', fail);

        reader.pipe(gunzip).pipe(warcstream);
      });
    } finally {
      // Only the first record is wanted; release the connection and buffers either way.
      warcstream.destroy();
      gunzip.destroy();
      reader.destroy();
    }

    const conv = {
      protocol: rval.protocol,
      headers: rval.headers,
      content: rval.content.toString(),
    };
    return conv;
  }

  /**
   * Looks up captures on the index server.
   *
   * Only `url`, `index` and `matchType` of the options are sent; `index`
   * defaults to CURRENT_CRAWL and `matchType` to 'domain'.
   *
   * @param options search options; `options.url` must be a non-blank string
   * @returns the parsed entries (one JSON document per response line), or
   *   null when the server does not answer with status 200
   */
  async search(options) {
    RequireRatchet.notNullOrUndefined(options, 'options');
    RequireRatchet.notNullUndefinedOrOnlyWhitespaceString(options.url, 'options.url');

    const params = {
      url: options.url,
      matchType: options.matchType || 'domain',
      output: 'json',
    };
    const url =
      CommonCrawlService.COMMON_CRAWL_URL +
      (options.index || CommonCrawlService.CURRENT_CRAWL) +
      '-index?' +
      querystring.stringify(params);

    const res = await fetch(url);
    const body = await res.text();
    if (res.status !== 200) {
      // Log the body text that was actually read; res.body is a consumed stream at this point.
      Logger.error('Failed to fetch: %s : %s : %j : %s', res.status, res.statusText, res.headers, body);
      return null;
    }

    const rval = [];
    for (const line of body.split('\n')) {
      if (StringRatchet.trimToNull(line)) {
        const parsedLine = JSON.parse(line);
        if (parsedLine) {
          rval.push(parsedLine);
        }
      }
    }
    return rval;
  }

  /**
   * Searches the index and pulls every matching record sequentially.
   *
   * @param opts search options, passed to search()
   * @param onPage optional async callback invoked before each pull with
   *   (index, total, url); its failures are logged and ignored
   * @returns object with the options, the index entries found, the pulled
   *   records, and one `{ pageIdx, error }` item per entry that failed
   */
  async scanSite(opts, onPage) {
    const sw = new StopWatch();
    const rval = {
      options: opts,
      pageIndexes: [],
      parsed: [],
      errors: [],
    };
    Logger.info('Performing domain index scan with %j', opts);
    // search() yields null on a non-200 answer; treat that as "nothing found".
    rval.pageIndexes = (await this.search(opts)) ?? [];
    const total = rval.pageIndexes.length;
    Logger.info('Found %d entries, pulling each', total);

    for (const [idx, ent] of rval.pageIndexes.entries()) {
      try {
        Logger.info('Pulling item %d of %d, %s', idx, total, sw.dumpExpected(idx / total));
        if (onPage) {
          try {
            await onPage(idx, total, ent.url);
          } catch (err) {
            Logger.warn('Failed onpage: %s', err);
          }
        }
        const parsed = await this.pullPageEntry(ent);
        rval.parsed.push(parsed);
      } catch (err) {
        Logger.warn('Failed to pull %j : %s', ent, err);
        rval.errors.push({ pageIdx: ent, error: err });
      }
    }
    Logger.info('Completed full scan in %s', sw.dump());
    return rval;
  }
}
153
+ //# sourceMappingURL=common-crawl-service.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"common-crawl-service.js","sourceRoot":"","sources":["../../../src/third-party/common-crawl/common-crawl-service.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,MAAM,aAAa,CAAC;AAChC,OAAO,KAAK,WAAW,MAAM,kBAAkB,CAAC;AAChD,OAAO,EAAE,cAAc,EAAE,MAAM,8CAA8C,CAAC;AAC9E,OAAO,EAAE,aAAa,EAAE,MAAM,6CAA6C,CAAC;AAC5E,OAAO,IAAI,MAAM,MAAM,CAAC;AACxB,OAAO,EAAE,QAAQ,EAAE,MAAM,QAAQ,CAAC;AAClC,OAAO,IAAI,MAAM,MAAM,CAAC;AACxB,OAAO,KAAK,OAAO,MAAM,SAAS,CAAC;AAOnC,OAAO,EAAE,YAAY,EAAE,MAAM,4CAA4C,CAAC;AAC1E,OAAO,EAAE,iBAAiB,EAAE,MAAM,kCAAkC,CAAC;AACrE,OAAO,EAAE,MAAM,EAAE,MAAM,uCAAuC,CAAC;AAC/D,OAAO,EAAE,cAAc,EAAE,MAAM,8CAA8C,CAAC;AAC9E,OAAO,EAAE,SAAS,EAAE,MAAM,yCAAyC,CAAC;AAKpE,MAAM,OAAO,kBAAkB;IACtB,MAAM,CAAU,gBAAgB,GAAW,gCAAgC,CAAC;IAC5E,MAAM,CAAU,aAAa,GAAW,iBAAiB,CAAC;IAE1D,KAAK,CAAC,YAAY;QACvB,MAAM,GAAG,GAAa,MAAM,KAAK,CAAC,kBAAkB,CAAC,gBAAgB,GAAG,eAAe,CAAC,CAAC;QACzF,MAAM,MAAM,GAAoB,MAAM,GAAG,CAAC,IAAI,EAAE,CAAC;QACjD,OAAO,MAAM,CAAC;IAChB,CAAC;IAEM,KAAK,CAAC,YAAY,CAAC,KAA0B;QAClD,MAAM,IAAI,GAA6B,EAAE,CAAC;QAC1C,MAAM,KAAK,GAAa,kBAAkB,CAAC,cAAc,CAAC,KAAK,CAAC,CAAC;QAEjE,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;YACzB,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE,CAAC;YAChB,MAAM,IAAI,GAAc,MAAM,IAAI,CAAC,aAAa,CAAC,KAAK,CAAC,CAAC;YACxD,MAAM,QAAQ,GAAW,IAAI,CAAC,OAAO,CAAC,QAAQ,EAAE,CAAC;YACjD,MAAM,MAAM,GAAiB,OAAO,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;YACpD,CAAC,GAAG,EAAE,KAAK,EAAE,MAAM,CAAC,CAAC,OAAO,CAAC,CAAC,GAAG,EAAE,EAAE;gBACnC,MAAM,CAAC,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC,GAAW,EAAE,EAAmB,EAAE,EAAE;oBACpD,MAAM,GAAG,GAAW,aAAa,CAAC,UAAU,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC;oBAChE,IAAI,GAAG,IAAI,GAAG,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC;wBAC7B,IAAI,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;oBAEvB,CAAC;gBACH,CAAC,CAAC,CAAC;YACL,CAAC,CAAC,CAAC;QACL,CAAC;QACD,OAAO,IAAI,CAAC;IACd,CAAC;IAEM,MAAM,CAAC,cAAc,CAAC,KAA0B;QACrD,MAAM,UAAU,GAAa,KAAK,CAAC,SAAS;aACzC,KAAK,CAAC,GAAG,CAAC;aACV,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,aAAa,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC;aACvC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CA
AC,CAAC,CAAC;QACtB,OAAO,UAAU,CAAC;IACpB,CAAC;IAEM,KAAK,CAAC,aAAa,CAAC,KAA0B,EAAE,QAAiB;QACtE,MAAM,MAAM,GAAW,+BAA+B,CAAC;QACvD,MAAM,GAAG,GAAW,MAAM,GAAG,KAAK,CAAC,QAAQ,CAAC;QAE5C,MAAM,OAAO,GAA2B,EAAE,KAAK,EAAE,QAAQ,GAAG,KAAK,CAAC,MAAM,GAAG,GAAG,GAAG,CAAC,KAAK,CAAC,MAAM,GAAG,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,EAAE,CAAC;QACrH,IAAI,QAAQ,EAAE,CAAC;YACb,IAAI,CAAC,kBAAkB,CAAC,cAAc,CAAC,KAAK,CAAC,CAAC,QAAQ,CAAC,QAAQ,CAAC,EAAE,CAAC;gBACjE,MAAM,YAAY,CAAC,IAAI,CAAC,yCAAyC,EAAE,QAAQ,EAAE,KAAK,CAAC,SAAS,CAAC,CAAC;YAChG,CAAC;YACD,OAAO,CAAC,iBAAiB,CAAC,GAAG,QAAQ,CAAC;QACxC,CAAC;QAED,MAAM,IAAI,GAAa,MAAM,KAAK,CAAC,GAAG,EAAE;YACtC,OAAO,EAAE,OAAO;SACjB,CAAC,CAAC;QAGH,IAAI,MAAM,GAAa,IAAI,CAAC;QAC5B,IAAI,IAAI,CAAC,IAAI,YAAY,QAAQ,EAAE,CAAC;YAClC,MAAM,GAAG,IAAI,CAAC,IAAI,CAAC;QACrB,CAAC;aAAM,IAAI,IAAI,CAAC,IAAI,YAAY,cAAc,EAAE,CAAC;YAC/C,MAAM,GAAG,iBAAiB,CAAC,+BAA+B,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACxE,CAAC;QAED,MAAM,UAAU,GAAS,IAAI,IAAI,EAAE,CAAC;QAOpC,MAAM,CAAC,IAAI,CAAC,eAAe,EAAE,IAAI,CAAC,OAAO,CAAC,CAAC;QAE3C,IAAI,IAAI,GAAiB,IAAI,CAAC;QAE9B,MAAM;aACH,IAAI,CAAC,IAAI,CAAC,YAAY,EAAE,CAAC;aACzB,IAAI,CAAC,UAAU,CAAC;aAChB,EAAE,CAAC,MAAM,EAAE,CAAC,GAAiB,EAAE,EAAE;YAChC,MAAM,CAAC,IAAI,CAAC,WAAW,GAAG,GAAG,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC;YAC9C,IAAI,GAAG,GAAG,CAAC;YAIX,UAAU,CAAC,OAAO,EAAE,CAAC;QACvB,CAAC,CAAC;aACD,EAAE,CAAC,OAAO,EAAE,GAAG,EAAE;YAChB,MAAM,CAAC,IAAI,CAAC,iBAAiB,CAAC,CAAC;QAEjC,CAAC,CAAC;aACD,EAAE,CAAC,OAAO,EAAE,CAAC,GAAG,EAAE,EAAE;YACnB,MAAM,CAAC,KAAK,CAAC,gBAAgB,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC;QAE3C,CAAC,CAAC,CAAC;QAEL,OAAO,CAAC,IAAI,EAAE,CAAC;YAEb,MAAM,cAAc,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QACjC,CAAC;QAED,MAAM,IAAI,GAAc;YACtB,QAAQ,EAAE,IAAI,CAAC,QAAQ;YACvB,OAAO,EAAE,IAAI,CAAC,OAAO;YACrB,OAAO,EAAE,IAAI,CAAC,OAAO,CAAC,QAAQ,EAAE;SACjC,CAAC;QAEF,OAAO,IAAI,CAAC;IACd,CAAC;IAEM,KAAK,CAAC,MAAM,CAAC,OAAgC;QAClD,cAAc,CAAC,kBAAkB,CAAC,OAAO,EAAE,SAAS,CAAC,CAAC;QACtD,cAAc,CAAC,sCAAsC,CAAC,OAAO,CAAC,GAAG,EAAE,aAAa,CAAC,CAAC;QAElF,IAAI,GAAG,GAAW,kBAAkB,CAAC,gBAAgB,GAAG,CAAC,OAAO,CAAC,KAAK,IAAI,kBAAkB,CAAC,aAA
a,CAAC,CAAC;QAC5G,GAAG,IAAI,SAAS,CAAC;QACjB,MAAM,MAAM,GAAG;YACb,GAAG,EAAE,OAAO,CAAC,GAAG;YAGhB,SAAS,EAAE,OAAO,CAAC,SAAS,IAAI,QAAQ;YAMxC,MAAM,EAAE,MAAM;SACf,CAAC;QAEF,MAAM,OAAO,GAAW,WAAW,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;QACtD,GAAG,IAAI,OAAO,CAAC;QAIf,MAAM,GAAG,GAAa,MAAM,KAAK,CAAC,GAAG,CAAC,CAAC;QACvC,MAAM,IAAI,GAAW,MAAM,GAAG,CAAC,IAAI,EAAE,CAAC;QACtC,IAAI,IAAI,GAA0B,IAAI,CAAC;QACvC,IAAI,GAAG,CAAC,MAAM,KAAK,GAAG,EAAE,CAAC;YACvB,MAAM,KAAK,GAAa,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;YACzC,IAAI,GAAG,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,aAAa,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;QAEnG,CAAC;aAAM,CAAC;YACN,MAAM,CAAC,KAAK,CAAC,oCAAoC,EAAE,GAAG,CAAC,MAAM,EAAE,GAAG,CAAC,UAAU,EAAE,GAAG,CAAC,OAAO,EAAE,GAAG,CAAC,IAAI,CAAC,CAAC;QACxG,CAAC;QAED,OAAO,IAAI,CAAC;IACd,CAAC;IAeM,KAAK,CAAC,QAAQ,CACnB,IAA6B,EAC7B,MAAmE;QAEnE,MAAM,EAAE,GAAc,IAAI,SAAS,EAAE,CAAC;QACtC,MAAM,IAAI,GAAoB;YAC5B,OAAO,EAAE,IAAI;YACb,WAAW,EAAE,EAAE;YACf,MAAM,EAAE,EAAE;YACV,MAAM,EAAE,EAAE;SACX,CAAC;QACF,MAAM,CAAC,IAAI,CAAC,sCAAsC,EAAE,IAAI,CAAC,CAAC;QAC1D,IAAI,CAAC,WAAW,GAAG,MAAM,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;QAC3C,MAAM,CAAC,IAAI,CAAC,gCAAgC,EAAE,IAAI,CAAC,WAAW,CAAC,MAAM,CAAC,CAAC;QACvE,KAAK,MAAM,CAAC,GAAG,EAAE,GAAG,CAAC,IAAI,IAAI,CAAC,WAAW,CAAC,OAAO,EAAE,EAAE,CAAC;YACpD,IAAI,CAAC;gBACH,MAAM,CAAC,IAAI,CAAC,2BAA2B,EAAE,GAAG,EAAE,IAAI,CAAC,WAAW,CAAC,MAAM,EAAE,EAAE,CAAC,YAAY,CAAC,GAAG,GAAG,IAAI,CAAC,WAAW,CAAC,MAAM,CAAC,CAAC,CAAC;gBACvH,IAAI,MAAM,EAAE,CAAC;oBACX,IAAI,CAAC;wBACH,MAAM,MAAM,CAAC,GAAG,EAAE,IAAI,CAAC,WAAW,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,CAAC,CAAC;oBACtD,CAAC;oBAAC,OAAO,GAAG,EAAE,CAAC;wBACb,MAAM,CAAC,IAAI,CAAC,mBAAmB,EAAE,GAAG,CAAC,CAAC;oBACxC,CAAC;gBACH,CAAC;gBACD,MAAM,MAAM,GAAc,MAAM,IAAI,CAAC,aAAa,CAAC,GAAG,CAAC,CAAC;gBAExD,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YAC3B,CAAC;YAAC,OAAO,GAAG,EAAE,CAAC;gBACb,MAAM,CAAC,IAAI,CAAC,wBAAwB,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC;gBAChD,IA
AI,CAAC,MAAM,CAAC,IAAI,CAAC,EAAE,OAAO,EAAE,GAAG,EAAE,KAAK,EAAE,GAAG,EAAE,CAAC,CAAC;YACjD,CAAC;QACH,CAAC;QACD,MAAM,CAAC,IAAI,CAAC,2BAA2B,EAAE,EAAE,CAAC,IAAI,EAAE,CAAC,CAAC;QACpD,OAAO,IAAI,CAAC;IACd,CAAC"}
@@ -0,0 +1,12 @@
1
/**
 * Options accepted by `CommonCrawlService.search` and
 * `CommonCrawlService.scanSite`.
 *
 * NOTE(review): the compiled `search` in this package version only reads
 * `url`, `index` and `matchType`. The other fields look like index-server
 * query parameters but are not sent — confirm before relying on them.
 */
export interface CommonCrawlFetchOptions {
    /** URL to look up; `search` rejects null, undefined and blank values. */
    url: string;
    /** Crawl id to query; `search` falls back to `CommonCrawlService.CURRENT_CRAWL`. */
    index?: string;
    from?: string;
    to?: string;
    showNumPages?: boolean;
    /** How `url` is matched; `search` falls back to `'domain'`. */
    matchType?: 'exact' | 'prefix' | 'host' | 'domain';
    limit?: number;
    sort?: 'asc' | 'desc';
    page?: number;
    pageSize?: number;
}
@@ -0,0 +1,2 @@
1
+ export {};
2
+ //# sourceMappingURL=common-crawl-fetch-options.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"common-crawl-fetch-options.js","sourceRoot":"","sources":["../../../../src/third-party/common-crawl/model/common-crawl-fetch-options.ts"],"names":[],"mappings":""}
@@ -0,0 +1,9 @@
1
+ import { WarcEntry } from './warc-entry.js';
2
+ import { DomainIndexEntryRaw } from './domain-index-entry-raw.js';
3
+ import { CommonCrawlFetchOptions } from './common-crawl-fetch-options.js';
4
/** Result object produced by `CommonCrawlService.scanSite`. */
export interface CommonCrawlScan {
    /** The options the scan was started with. */
    options: CommonCrawlFetchOptions;
    /** Index entries returned by the search step. */
    pageIndexes: DomainIndexEntryRaw[];
    /** Records that were pulled successfully, in pull order. */
    parsed: WarcEntry[];
    /** One item per entry whose pull failed; `scanSite` pushes `{ pageIdx, error }` objects. */
    errors: any[];
}
@@ -0,0 +1,2 @@
1
+ export {};
2
+ //# sourceMappingURL=common-crawl-scan.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"common-crawl-scan.js","sourceRoot":"","sources":["../../../../src/third-party/common-crawl/model/common-crawl-scan.ts"],"names":[],"mappings":""}
@@ -0,0 +1,14 @@
1
/**
 * One capture as returned by the index server (one JSON document per
 * response line). Every field is declared as a string, including the numeric
 * looking `offset` and `length`, so convert before doing arithmetic.
 */
export interface DomainIndexEntryRaw {
    urlkey: string;
    timestamp: string;
    /** Capture URL; passed to the `onPage` callback of `scanSite`. */
    url: string;
    mime: string;
    'mime-detected': string;
    status: string;
    digest: string;
    /** Length of the record; used together with `offset` to build the Range request. */
    length: string;
    /** Offset of the record inside `filename`. */
    offset: string;
    /** Path of the archive file; appended to the data host URL when pulling the record. */
    filename: string;
    /** Comma separated language codes; parsed by `CommonCrawlService.validLanguages`. */
    languages: string;
    encoding: string;
}
@@ -0,0 +1,2 @@
1
+ export {};
2
+ //# sourceMappingURL=domain-index-entry-raw.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"domain-index-entry-raw.js","sourceRoot":"","sources":["../../../../src/third-party/common-crawl/model/domain-index-entry-raw.ts"],"names":[],"mappings":""}
@@ -0,0 +1,8 @@
1
/**
 * One element of the `collinfo.json` document returned by
 * `CommonCrawlService.fetchIndexes`.
 *
 * NOTE(review): field meanings are not visible in this package; the names
 * suggest crawl id/name, endpoint URLs and a date range — confirm against the
 * index server output.
 */
export interface IndexEntryRaw {
    id: string;
    name: string;
    timegate: string;
    'cdx-api': string;
    from: string;
    to: string;
}
@@ -0,0 +1,2 @@
1
+ export {};
2
+ //# sourceMappingURL=index-entry-raw.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index-entry-raw.js","sourceRoot":"","sources":["../../../../src/third-party/common-crawl/model/index-entry-raw.ts"],"names":[],"mappings":""}
@@ -0,0 +1,5 @@
1
/**
 * A WARC record whose payload is still binary.
 * `CommonCrawlService.pullPageEntry` reads `protocol`, `headers` and
 * `content` from the parsed record and converts `content` to a string.
 */
export interface WarcEntryRaw {
    protocol: string;
    headers: Record<string, string>;
    /** Record payload as raw bytes. */
    content: Buffer;
}
@@ -0,0 +1,2 @@
1
+ export {};
2
+ //# sourceMappingURL=warc-entry-raw.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"warc-entry-raw.js","sourceRoot":"","sources":["../../../../src/third-party/common-crawl/model/warc-entry-raw.ts"],"names":[],"mappings":""}
@@ -0,0 +1,5 @@
1
/**
 * A WARC record as returned by `CommonCrawlService.pullPageEntry`: same
 * `protocol` and `headers` as the parsed record, with the payload converted
 * to a string.
 */
export interface WarcEntry {
    protocol: string;
    headers: Record<string, string>;
    /** Record payload converted with `toString()`. */
    content: string;
}
@@ -0,0 +1,2 @@
1
+ export {};
2
+ //# sourceMappingURL=warc-entry.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"warc-entry.js","sourceRoot":"","sources":["../../../../src/third-party/common-crawl/model/warc-entry.ts"],"names":[],"mappings":""}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@bitblit/ratchet-node-only",
3
- "version": "4.0.475-alpha",
3
+ "version": "5.0.107",
4
4
  "description": "Ratchet tools for use on node-only",
5
5
  "note-on-side-effects": "Technically the entries in 'bin' below might be side effects, but they are called explicitly",
6
6
  "sideEffects": false,
@@ -55,20 +55,27 @@
55
55
  },
56
56
  "license": "Apache-2.0",
57
57
  "dependencies": {
58
- "@bitblit/ratchet-common": "4.0.475-alpha"
58
+ "@bitblit/ratchet-common": "5.0.107"
59
59
  },
60
60
  "optionalDependencies": {
61
+ "cheerio": "1.0.0",
61
62
  "csv": "6.3.10",
62
63
  "jsonwebtoken": "9.0.2",
63
- "rxjs": "7.8.1"
64
+ "rxjs": "7.8.1",
65
+ "warc": "1.0.1"
64
66
  },
65
67
  "peerDependencies": {
66
- "@bitblit/ratchet-common": "4.0.475-alpha",
68
+ "@bitblit/ratchet-common": "5.0.107",
69
+ "cheerio": "^1.0.0",
67
70
  "csv": "^6.3.10",
68
71
  "jsonwebtoken": "^9.0.2",
69
- "rxjs": "^7.8.1"
72
+ "rxjs": "^7.8.1",
73
+ "warc": "^1.0.1"
70
74
  },
71
75
  "peerDependenciesMeta": {
76
+ "cheerio": {
77
+ "optional": true
78
+ },
72
79
  "csv": {
73
80
  "optional": true
74
81
  },
@@ -77,6 +84,12 @@
77
84
  },
78
85
  "rxjs": {
79
86
  "optional": true
87
+ },
88
+ "warc": {
89
+ "optional": true
80
90
  }
91
+ },
92
+ "devDependencies": {
93
+ "@types/cheerio": "0.22.35"
81
94
  }
82
95
  }