@bitblit/ratchet-node-only 6.0.145-alpha → 6.0.147-alpha

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. package/package.json +4 -3
  2. package/src/build/ratchet-node-only-info.ts +19 -0
  3. package/src/ci/apply-ci-env-variables-to-files.spec.ts +30 -0
  4. package/src/ci/apply-ci-env-variables-to-files.ts +98 -0
  5. package/src/ci/ci-run-information-util.ts +48 -0
  6. package/src/ci/ci-run-information.ts +9 -0
  7. package/src/cli/abstract-ratchet-cli-handler.ts +33 -0
  8. package/src/cli/cli-ratchet.ts +34 -0
  9. package/src/cli/ratchet-cli-handler.ts +24 -0
  10. package/src/csv/csv-ratchet.spec.ts +59 -0
  11. package/src/csv/csv-ratchet.ts +211 -0
  12. package/src/export-builder/export-map-builder-config.ts +10 -0
  13. package/src/export-builder/export-map-builder-target-config.ts +5 -0
  14. package/src/export-builder/export-map-builder.spec.ts +22 -0
  15. package/src/export-builder/export-map-builder.ts +157 -0
  16. package/src/files/files-to-static-class.spec.ts +26 -0
  17. package/src/files/files-to-static-class.ts +101 -0
  18. package/src/files/unique-file-rename.ts +80 -0
  19. package/src/http/local-file-server.ts +129 -0
  20. package/src/http/local-server-cert.ts +72 -0
  21. package/src/jwt/jwt-ratchet-config.ts +18 -0
  22. package/src/jwt/jwt-ratchet-like.ts +19 -0
  23. package/src/jwt/jwt-ratchet.spec.ts +85 -0
  24. package/src/jwt/jwt-ratchet.ts +204 -0
  25. package/src/stream/buffer-writable.ts +16 -0
  26. package/src/stream/multi-stream.ts +15 -0
  27. package/src/stream/node-stream-ratchet.spec.ts +19 -0
  28. package/src/stream/node-stream-ratchet.ts +70 -0
  29. package/src/stream/string-writable.spec.ts +16 -0
  30. package/src/stream/string-writable.ts +14 -0
  31. package/src/third-party/angular/angular-aot-rollup-plugin.ts +18 -0
  32. package/src/third-party/common-crawl/common-crawl-service.ts +220 -0
  33. package/src/third-party/common-crawl/model/common-crawl-fetch-options.ts +12 -0
  34. package/src/third-party/common-crawl/model/common-crawl-scan.ts +11 -0
  35. package/src/third-party/common-crawl/model/domain-index-entry-raw.ts +14 -0
  36. package/src/third-party/common-crawl/model/index-entry-raw.ts +8 -0
  37. package/src/third-party/common-crawl/model/warc-entry-raw.ts +5 -0
  38. package/src/third-party/common-crawl/model/warc-entry.ts +5 -0
  39. package/src/third-party/git/git-ratchet.spec.ts +11 -0
  40. package/src/third-party/git/git-ratchet.ts +107 -0
  41. package/src/third-party/slack/publish-ci-release-to-slack.spec.ts +28 -0
  42. package/src/third-party/slack/publish-ci-release-to-slack.ts +84 -0
@@ -0,0 +1,85 @@
1
+ import { JwtRatchet } from './jwt-ratchet.js';
2
+ import { describe, expect, test } from 'vitest';
3
+ import { JwtRatchetConfig } from './jwt-ratchet-config.js';
4
+ import { ExpiredJwtHandling } from '@bitblit/ratchet-common/jwt/expired-jwt-handling';
5
+
6
describe('#jwtRatchet', () => {
  // Produces a fresh config for each test: one signing key, no decrypt-only keys,
  // and every optional setting left undefined so JwtRatchet applies its own defaults.
  const createConfig = (): JwtRatchetConfig => ({
    encryptionKeyPromise: Promise.resolve('test1234'),
    decryptKeysPromise: Promise.resolve([]),
    jtiGenerator: undefined,
    decryptOnlyKeyUseLogLevel: undefined,
    parseFailureLogLevel: undefined,
  });

  test('should test expiration flag for a token with millisecond expiration', async () => {
    const ratchet: JwtRatchet = new JwtRatchet(createConfig());

    // exp is deliberately written in epoch milliseconds, slightly in the past
    const expiredToken: string = await ratchet.createTokenString({ test: 1, exp: Date.now() - 100 }, null);
    const decoded: any = await ratchet.decodeToken(expiredToken, ExpiredJwtHandling.ADD_FLAG);

    expect(decoded).not.toBeNull();
    expect(JwtRatchet.hasExpiredFlag(decoded)).toBeTruthy();
  });

  test('should test expiration calculation for a token', async () => {
    const ratchet: JwtRatchet = new JwtRatchet(createConfig());

    const token: string = await ratchet.createTokenString({ test: 1 }, 120);
    const remaining: number = await JwtRatchet.secondsRemainingUntilExpiration(token);

    expect(remaining).not.toBeNull();
    expect(remaining).toBeLessThan(121);
    expect(remaining).toBeGreaterThan(115);
  });

  test('should test round-trip for a token', async () => {
    const ratchet: JwtRatchet = new JwtRatchet(createConfig());

    const token: string = await ratchet.createTokenString({ test: 1 }, 120);
    const decoded: any = await ratchet.decodeToken(token);

    expect(decoded).not.toBeNull();
    expect(decoded['test']).toEqual(1);
  });

  test('should test round-trip for a token with array enc keys', async () => {
    const cfg: JwtRatchetConfig = createConfig();
    cfg.encryptionKeyPromise = Promise.resolve(['test1234', 'test5678']);
    const ratchet: JwtRatchet = new JwtRatchet(cfg);

    // Create three tokens first (the signing key is picked at random per token), then decode each
    const tokens: string[] = [];
    for (let i = 0; i < 3; i++) {
      tokens.push(await ratchet.createTokenString({ test: 1 }, 120));
    }
    const decoded: any[] = [];
    for (const token of tokens) {
      decoded.push(await ratchet.decodeToken(token));
    }

    for (const item of decoded) {
      expect(item).not.toBeNull();
      expect(item['test']).toEqual(1);
    }
  });

  test('should decode with a decode key', async () => {
    const oldCfg: JwtRatchetConfig = createConfig();
    oldCfg.encryptionKeyPromise = Promise.resolve('oldKey');

    const newCfg: JwtRatchetConfig = createConfig();
    newCfg.encryptionKeyPromise = Promise.resolve('newKey');
    newCfg.decryptKeysPromise = Promise.resolve(['oldKey']);

    const signer: JwtRatchet = new JwtRatchet(oldCfg);
    const verifier: JwtRatchet = new JwtRatchet(newCfg);

    // Signed with the old key, decoded by an instance that only lists it as decrypt-only
    const token: string = await signer.createTokenString({ test: 1 }, 120);
    const decoded: any = await verifier.decodeToken(token);

    expect(decoded).not.toBeNull();
    expect(decoded['test']).toEqual(1);
  });
});
@@ -0,0 +1,204 @@
1
+ import { JwtRatchetLike } from './jwt-ratchet-like.js';
2
+ import { JwtRatchetConfig } from './jwt-ratchet-config.js';
3
+ import jsonwebtoken from 'jsonwebtoken';
4
+ import { RequireRatchet } from '@bitblit/ratchet-common/lang/require-ratchet';
5
+ import { StringRatchet } from '@bitblit/ratchet-common/lang/string-ratchet';
6
+ import { LoggerLevelName } from '@bitblit/ratchet-common/logger/logger-level-name';
7
+ import { Logger } from '@bitblit/ratchet-common/logger/logger';
8
+ import { JwtPayloadExpirationRatchet } from '@bitblit/ratchet-common/jwt/jwt-payload-expiration-ratchet';
9
+ import { JwtTokenBase } from '@bitblit/ratchet-common/jwt/jwt-token-base';
10
+ import { ExpiredJwtHandling } from '@bitblit/ratchet-common/jwt/expired-jwt-handling';
11
+
12
/**
 * Functions to help with creating and decoding JWTs
 *
 * JWTRatchet accepts promises for its inputs for the simple reason that best practice dictates that the keys
 * should never be in the code, which means it is likely somewhere else. That MIGHT be somewhere synchronous
 * like an environmental variable, but it could very likely be someplace remote like a secure key store. By
 * accepting promises here, we make it easy to do JwtRatchet construction in a place (like an IOT container)
 * that itself must be synchronous
 */
export class JwtRatchet implements JwtRatchetLike {
  /**
   * @param cfg Configuration; `encryptionKeyPromise` is required. Note that unset optional
   * fields (jti generator and the two log levels) are filled in ON the passed object.
   */
  constructor(private cfg: JwtRatchetConfig) {
    RequireRatchet.notNullOrUndefined(cfg, 'config');
    RequireRatchet.notNullOrUndefined(cfg.encryptionKeyPromise, 'encryptionKeyPromise');

    cfg.jtiGenerator = cfg.jtiGenerator ?? StringRatchet.createType4Guid;
    cfg.decryptOnlyKeyUseLogLevel = cfg.decryptOnlyKeyUseLogLevel ?? LoggerLevelName.info;
    cfg.parseFailureLogLevel = cfg.parseFailureLogLevel ?? LoggerLevelName.debug;
  }

  /** Returns a shallow copy of the active configuration (the promises/functions themselves are shared). */
  public get copyConfig(): JwtRatchetConfig {
    const rval: JwtRatchetConfig = {
      encryptionKeyPromise: this.cfg.encryptionKeyPromise,
      decryptKeysPromise: this.cfg.decryptKeysPromise,
      jtiGenerator: this.cfg.jtiGenerator,
      decryptOnlyKeyUseLogLevel: this.cfg.decryptOnlyKeyUseLogLevel,
      parseFailureLogLevel: this.cfg.parseFailureLogLevel,
    };
    return rval;
  }

  public get encryptionKeyPromise(): Promise<string | string[]> {
    return this.cfg.encryptionKeyPromise;
  }

  public get decryptKeysPromise(): Promise<string[]> {
    return this.cfg.decryptKeysPromise;
  }

  public get jtiGenerator(): () => string {
    return this.cfg.jtiGenerator;
  }

  public get decryptOnlyKeyUseLogLevel(): LoggerLevelName {
    return this.cfg.decryptOnlyKeyUseLogLevel;
  }

  public get parseFailureLogLevel(): LoggerLevelName {
    return this.cfg.parseFailureLogLevel;
  }

  /**
   * Verifies a token against a single key, never throwing.
   *
   * @param payloadString The encoded JWT
   * @param decryptKey The key to verify with
   * @param logLevel Level at which a verification failure is logged
   * @returns The decoded payload, or null if verification threw for any reason
   */
  public static async invalidSafeDecode<T>(
    payloadString: string,
    decryptKey: string,
    logLevel: LoggerLevelName = LoggerLevelName.silly,
  ): Promise<T> {
    let rval: T = null;
    try {
      rval = jsonwebtoken.verify(payloadString, decryptKey, { ignoreExpiration: true }) as unknown as T; // We'll check/flag expiration later
    } catch (err) {
      Logger.logByLevel(logLevel, 'Caught %s - ignoring', err);
    }
    return rval;
  }

  /**
   * Reads the `exp` claim WITHOUT verifying the signature and reports how long is left.
   *
   * @returns Whole seconds until expiration (never negative), or null when the input is blank,
   * cannot be decoded, or carries no `exp` claim
   */
  public static async secondsRemainingUntilExpiration(payloadString: string): Promise<number> {
    let rval: number = null;
    if (StringRatchet.trimToNull(payloadString)) {
      const output: JwtTokenBase = await JwtRatchet.decodeTokenNoVerify<any>(payloadString);
      const nowSecond: number = Math.floor(Date.now() / 1000);
      // output is null when the string is not a decodable JWT - treat that like "no expiration known"
      if (output?.exp) {
        // A backwards compatibility hack since some of my old code used to incorrectly write the exp field in milliseconds
        const expSeconds: number = output.exp > nowSecond * 100 ? Math.floor(output.exp / 1000) : output.exp;
        rval = Math.max(0, expSeconds - nowSecond);
      }
    }
    return rval;
  }

  /** Same as secondsRemainingUntilExpiration, scaled to milliseconds (null is passed through). */
  public static async msRemainingUntilExpiration(payloadString: string): Promise<number> {
    const secs: number = await JwtRatchet.secondsRemainingUntilExpiration(payloadString);
    return secs === null || secs === undefined ? null : secs * 1000;
  }

  /**
   * Verifies and decodes a token, trying every encryption key first and then every decrypt-only key,
   * stopping at the first key that verifies.
   *
   * @param payloadString The encoded JWT
   * @param expiredHandling Passed to JwtPayloadExpirationRatchet.processPayloadExpiration to decide
   * what happens to an expired payload
   * @returns The processed payload, or null (after a warning is logged) if no key could verify it
   */
  public async decodeToken<T extends JwtTokenBase>(
    payloadString: string,
    expiredHandling: ExpiredJwtHandling = ExpiredJwtHandling.RETURN_NULL,
  ): Promise<T> {
    const encKeys: string[] = await this.encryptionKeyArray();
    let decKeys: string[] = [...encKeys];
    if (this.decryptKeysPromise) {
      // Guard against a promise that resolves to null/undefined so no bogus key entry is appended
      decKeys = decKeys.concat((await this.decryptKeysPromise) ?? []);
    }

    const keysTried: string[] = [];
    let payload: T = null;

    for (let i = 0; i < decKeys.length && !payload; i++) {
      keysTried.push(StringRatchet.obscure(decKeys[i], 1, 1));
      // Only Log on the last one since it might have just been an old key
      const logLevel: LoggerLevelName =
        i === decKeys.length - 1 && this.parseFailureLogLevel ? this.parseFailureLogLevel : LoggerLevelName.silly;
      payload = await JwtRatchet.invalidSafeDecode(payloadString, decKeys[i], logLevel);
      // Indexes at or past encKeys.length belong to the decrypt-only set
      if (payload && i >= encKeys.length) {
        Logger.logByLevel(this.decryptOnlyKeyUseLogLevel, 'Used old key to decode token : %s', StringRatchet.obscure(decKeys[i], 2));
      }
    }

    if (payload) {
      payload = JwtPayloadExpirationRatchet.processPayloadExpiration(payload, expiredHandling);
    } else {
      Logger.warn('Unable to parse a payload (Tried %j) from : %s', keysTried, payloadString);
    }

    return payload;
  }

  /**
   * Resolves the configured encryption key(s) as an array.
   *
   * @throws Error if the configured key set is an empty array
   */
  public async encryptionKeyArray(): Promise<string[]> {
    const encKey: string | string[] = await this.encryptionKeyPromise;
    const rval: string[] = Array.isArray(encKey) ? encKey : [encKey];
    if (rval.length < 1) {
      throw new Error('Cannot create JwtRatchet with empty encryption key set');
    }
    return rval;
  }

  /** Picks one of the configured encryption keys uniformly at random (via Math.random). */
  public async selectRandomEncryptionKey(): Promise<string> {
    const encKey: string[] = await this.encryptionKeyArray();
    const rval: string = encKey[Math.floor(Math.random() * encKey.length)];
    return rval;
  }

  /**
   * Signs a payload into a token string.
   *
   * NOTE: the passed payload object is modified - `jti` is always (re)written and `exp` is
   * overwritten when expirationSeconds is truthy.
   *
   * @param payload Object to sign (required)
   * @param expirationSeconds If truthy, `exp` is set to now + this many seconds; 0/null/undefined leave `exp` untouched
   * @param overrideEncryptionKey If non-blank, used (trimmed) instead of a randomly selected configured key
   */
  public async createTokenString(payload: any, expirationSeconds?: number, overrideEncryptionKey?: string): Promise<string> {
    const encKey: string = StringRatchet.trimToNull(overrideEncryptionKey) ?? (await this.selectRandomEncryptionKey());

    RequireRatchet.notNullOrUndefined(payload, 'payload');
    payload.jti = this.jtiGenerator ? this.jtiGenerator() : null; // Setup unique code
    if (expirationSeconds) {
      const nowSeconds = Math.floor(Date.now() / 1000);
      const expires = nowSeconds + expirationSeconds;
      Logger.debug('Forcing expiration to %d', expires);
      payload.exp = expires;
    }

    const token: string = jsonwebtoken.sign(payload, encKey); // , algorithm = 'HS256')
    return token;
  }

  /**
   * Decodes a token and re-issues it with fresh JWT fields.
   *
   * @param tokenString The token to refresh
   * @param allowExpired If truthy an expired token is accepted (decoded with ADD_FLAG), otherwise
   * decoding uses THROW_EXCEPTION handling
   * @param expirationSeconds Lifetime of the new token; when falsy, the original exp - iat duration is reused if both were present
   * @throws Error if the token cannot be decoded with any configured key
   */
  public async refreshJWTString(tokenString: string, allowExpired?: boolean, expirationSeconds?: number): Promise<string> {
    const handling: ExpiredJwtHandling = allowExpired ? ExpiredJwtHandling.ADD_FLAG : ExpiredJwtHandling.THROW_EXCEPTION;
    const payload: JwtTokenBase = await this.decodeToken(tokenString, handling);
    if (!payload) {
      // decodeToken returns null when no key verifies the token - fail clearly instead of with a TypeError below
      throw new Error('Cannot refresh token - unable to decode it with any configured key');
    }

    const originalDurationSeconds: number = payload.exp && payload.iat ? payload.exp - payload.iat : null;
    const newExpirationSeconds: number = expirationSeconds || originalDurationSeconds;
    // Remove any old stuff
    JwtRatchet.removeJwtFields(payload);
    JwtRatchet.removeExpiredFlag(payload); // If it wasnt allowed an exception was thrown above anyway
    const token: string = await this.createTokenString(payload, newExpirationSeconds);
    return token;
  }

  // Helper method that reads the token without checking it, therefore the keys are not needed
  public static async decodeTokenNoVerify<T extends JwtTokenBase>(token: string): Promise<T> {
    const rval: T = jsonwebtoken.decode(token) as T;
    return rval;
  }

  // Removes any jwt fields from an object (in place); a null/undefined object is ignored
  public static removeJwtFields(ob: any) {
    if (ob) {
      for (const k of ['iss', 'sub', 'aud', 'exp', 'nbf', 'iat', 'jti']) {
        // This isn't really dynamic
        // eslint-disable-next-line @typescript-eslint/no-dynamic-delete
        delete ob[k];
      }
    }
  }

  public static hasExpiredFlag(ob: any): boolean {
    // Delegate for backwards compatibility
    return JwtPayloadExpirationRatchet.hasExpiredFlag(ob);
  }

  public static removeExpiredFlag(ob: any) {
    // Delegate for backwards compatibility
    return JwtPayloadExpirationRatchet.removeExpiredFlag(ob);
  }
}
@@ -0,0 +1,16 @@
1
+ import { Writable } from 'stream';
2
+
3
/**
 * A Writable that keeps every chunk written to it in memory and exposes the
 * concatenation of those chunks as a single Buffer through `value`.
 */
export class BufferWritable extends Writable {
  // Chunks received so far, in write order
  private _val: any[] = [];

  /**
   * Stores one chunk.
   *
   * @param chunk Data to store; falsy chunks are skipped. String chunks are converted to a
   * Buffer so that `value` can always concatenate what was collected.
   * @param encoding Encoding used to convert string chunks; anything that is not a valid
   * Buffer encoding (e.g. 'buffer' or null) falls back to utf8
   * @param callback Invoked (with no error) once the chunk has been stored
   */
  _write(chunk: any, encoding: string, callback: (error?: Error | null) => void): void {
    if (chunk) {
      if (typeof chunk === 'string') {
        const enc: BufferEncoding = Buffer.isEncoding(encoding) ? encoding : 'utf8';
        this._val.push(Buffer.from(chunk, enc));
      } else {
        this._val.push(chunk);
      }
    }
    callback();
  }

  /** Everything written so far, concatenated into a new Buffer. */
  public get value(): Buffer {
    return Buffer.concat(this._val);
  }
}
@@ -0,0 +1,15 @@
1
+ import { Readable, ReadableOptions } from 'stream';
2
+
3
/**
 * Readable that emits exactly one value - the object handed to the constructor - and then ends.
 * Buffers and strings are streamed using the supplied options; any other value switches
 * the stream to object mode (and the supplied options are not used).
 */
export class MultiStream extends Readable {
  _object: any;

  constructor(object: any, options: ReadableOptions = {}) {
    super(MultiStream.selectOptions(object, options));
    this._object = object;
  }

  // Byte-like inputs keep the caller's options, everything else is forced into object mode
  private static selectOptions(candidate: any, supplied: ReadableOptions): ReadableOptions {
    if (typeof candidate === 'string' || candidate instanceof Buffer) {
      return supplied;
    }
    return { objectMode: true };
  }

  _read() {
    // First call pushes the wrapped value; afterwards the stored value is null, which signals end-of-stream
    const pending: any = this._object;
    this.push(pending);
    this._object = null;
  }
}
@@ -0,0 +1,19 @@
1
+ import { Readable } from 'stream';
2
+ import { NodeStreamRatchet } from './node-stream-ratchet.js';
3
+ import { describe, expect, test } from 'vitest';
4
+
5
describe('#NodeStreamRatchet', () => {
  test('should wrap a string in a readable', async () => {
    const source: Readable = NodeStreamRatchet.stringToReadable('test');
    // Ask for more bytes than exist; the stream has ended so the remainder is returned
    const chunk: Buffer = source.read(200);

    expect(chunk.length).toEqual(4);
  });

  test('should wrap an number in a readable', async () => {
    const source: Readable = NodeStreamRatchet.anyToStringReadable(401);
    const chunk: Buffer = source.read(200);

    // 401 is rendered as the three characters "401"
    expect(chunk.length).toEqual(3);
  });
});
@@ -0,0 +1,70 @@
1
+ import { Readable, Writable } from 'stream';
2
+ import { StringRatchet } from '@bitblit/ratchet-common/lang/string-ratchet';
3
+
4
/**
 * This class is specifically dealing with node streams as opposed to web streams
 * (ie, Readable vs ReadableStream, Writeable vs WritableStream)
 * https://stackoverflow.com/questions/61232291/difference-between-web-streams-and-node-js-stream-apis
 *
 * This is the only class that supports conversion between the two since the web will be assumed
 * to not have access to the NodeJS classes, but Node DOES have access to the web classes
 */
export class NodeStreamRatchet {
  // Empty constructor prevents instantiation
  // eslint-disable-next-line @typescript-eslint/no-empty-function
  private constructor() {}

  /**
   * Drains whatever the stream will hand out synchronously via read() and concatenates it.
   * Reading stops at the first falsy read() result, so data that only arrives later is not included.
   */
  public static readableToBufferSync(stream: Readable): Buffer {
    const bufs = [];
    for (let next: any = stream.read(); next; next = stream.read()) {
      bufs.push(next);
    }
    return Buffer.concat(bufs);
  }

  /** Wraps a string in a Readable that emits the string and then ends. */
  public static stringToReadable(input: string): Readable {
    return new Readable({
      read() {
        this.push(input);
        this.push(null);
      },
    });
  }

  /**
   * Converts the input with StringRatchet.safeString and wraps it in a Readable;
   * null/undefined produce a stream that ends immediately.
   */
  public static anyToStringReadable(input: any): Readable {
    const asString: string = input === null || input === undefined ? null : StringRatchet.safeString(input);
    return NodeStreamRatchet.stringToReadable(asString);
  }

  /**
   * window.ReadableStream to Node.js Readable
   *
   * The web stream is pulled one chunk per _read request (so the Node side's backpressure is
   * respected), a read failure destroys the Node stream with that error, and destroying the
   * Node stream before the end cancels the web reader.
   */
  public static webReadableStreamToNodeReadable(rs: ReadableStream): Readable {
    const reader = rs.getReader();
    let finished: boolean = false;
    return new Readable({
      read() {
        reader.read().then(
          ({ value, done }) => {
            if (done) {
              finished = true;
              this.push(null);
            } else {
              this.push(value);
            }
          },
          (err) => this.destroy(err),
        );
      },
      destroy(err, callback) {
        if (finished) {
          callback(err);
        } else {
          // Release the underlying source; report the original error whatever cancel() does
          reader.cancel(err).then(
            () => callback(err),
            () => callback(err),
          );
        }
      },
    });
  }

  /**
   * window.WritableStream to Node.js Writable
   *
   * Each write/close is acknowledged only once the web writer's promise settles, so a failure
   * in the web stream surfaces as an error on the Node stream instead of an unhandled rejection.
   */
  public static webWritableStreamToNodeWriteable(ws: WritableStream): Writable {
    const writer = ws.getWriter();
    return new Writable({
      write(chunk, _encoding, callback) {
        writer.write(chunk).then(
          () => callback(),
          (err) => callback(err),
        );
      },
      final(callback) {
        writer.close().then(
          () => callback(),
          (err) => callback(err),
        );
      },
    });
  }
}
@@ -0,0 +1,16 @@
1
+ import { StringWritable } from './string-writable.js';
2
+ import { describe, expect, test } from 'vitest';
3
+
4
describe('#StringWritableStream', () => {
  test('should write cumulatively to a string', async () => {
    const target: StringWritable = new StringWritable();
    const ignored = (): void => {
      // Ignore me
    };

    // Feed the chunks straight into _write, one character at a time
    for (const piece of ['a', 'b', 'c']) {
      target._write(piece, null, ignored);
    }

    expect(target.value.length).toEqual(3);
  });
});
@@ -0,0 +1,14 @@
1
+ import { Writable } from 'stream';
2
+
3
/**
 * A Writable that appends the string form of every chunk it receives and
 * exposes the accumulated text through `value`.
 */
export class StringWritable extends Writable {
  // Text collected so far
  private collected: string = '';

  /**
   * Appends chunk.toString() to the collected text (falsy chunks add nothing),
   * then signals completion through the callback.
   */
  _write(chunk: any, encoding: string, callback): void {
    if (chunk) {
      this.collected = this.collected + chunk.toString();
    }
    callback();
  }

  /** The concatenation of everything written so far. */
  public get value(): string {
    return this.collected;
  }
}
@@ -0,0 +1,18 @@
1
+ import { execSync } from 'child_process';
2
+
3
/**
 * Custom Rollup plugin to run Angular AOT
 *
 * On buildStart it synchronously runs `ngc -p tsconfig.json` (output is inherited by the
 * current process) and aborts the build through the plugin context's error() if that fails.
 */
export default function angularAotPlugin() {
  return {
    name: 'angular-aot-rollup-plugin',
    buildStart(this: { error(message: string): never }): void {
      console.log('Running Angular AOT Compilation...');
      try {
        execSync('ngc -p tsconfig.json', { stdio: 'inherit' });
      } catch (error) {
        // The second argument of Rollup's this.error is a source position, not a cause -
        // so the underlying failure is folded into the message instead of being passed there
        const detail: string = error instanceof Error ? error.message : String(error);
        this.error('AOT Compilation failed. ' + detail);
      }
    },
  };
}
17
+
18
+ //module.exports = angularAotPlugin;
@@ -0,0 +1,220 @@
1
+ import fetch from 'cross-fetch';
2
+ import * as querystring from 'node:querystring';
3
+ import { RequireRatchet } from '@bitblit/ratchet-common/lang/require-ratchet';
4
+ import { StringRatchet } from '@bitblit/ratchet-common/lang/string-ratchet';
5
+ import zlib from 'zlib';
6
+ import { Readable } from 'stream';
7
+ import warc from 'warc';
8
+ import * as cheerio from 'cheerio';
9
+ import { WarcEntry } from './model/warc-entry.js';
10
+ import { CommonCrawlScan } from './model/common-crawl-scan.js';
11
+ import { CommonCrawlFetchOptions } from './model/common-crawl-fetch-options.js';
12
+ import { DomainIndexEntryRaw } from './model/domain-index-entry-raw.js';
13
+ import { IndexEntryRaw } from './model/index-entry-raw.js';
14
+ import { WarcEntryRaw } from './model/warc-entry-raw.js';
15
+ import { ErrorRatchet } from '@bitblit/ratchet-common/lang/error-ratchet';
16
+ import { NodeStreamRatchet } from '../../stream/node-stream-ratchet';
17
+ import { Logger } from '@bitblit/ratchet-common/logger/logger';
18
+ import { PromiseRatchet } from '@bitblit/ratchet-common/lang/promise-ratchet';
19
+ import { StopWatch } from '@bitblit/ratchet-common/lang/stop-watch';
20
+ import { CheerioAPI } from "cheerio";
21
+
22
/**
 * A very early take to simplify accessing and using the common crawl
 */
export class CommonCrawlService {
  public static readonly COMMON_CRAWL_URL: string = 'https://index.commoncrawl.org/';
  public static readonly CURRENT_CRAWL: string = 'CC-MAIN-2024-33'; // August 2024 index

  /** Fetches and returns the parsed contents of `collinfo.json` from the index server. */
  public async fetchIndexes(): Promise<IndexEntryRaw[]> {
    const res: Response = await fetch(CommonCrawlService.COMMON_CRAWL_URL + 'collinfo.json');
    const output: IndexEntryRaw[] = await res.json();
    return output;
  }

  /**
   * Pulls the page behind an index entry once per language listed on the entry and collects the
   * text of every p/div/span element that contains a period.
   *
   * @returns A map of language -> extracted text snippets
   */
  public async readPageData(entry: DomainIndexEntryRaw): Promise<any> {
    const rval: Record<string, string[]> = {};
    const langs: string[] = CommonCrawlService.validLanguages(entry);

    for (const lang of langs) {
      const found: string[] = [];
      rval[lang] = found;
      // Request the page for THIS language (previously the language was never passed on)
      const data: WarcEntry = await this.pullPageEntry(entry, lang);
      const asString: string = data.content.toString();
      const parsed: CheerioAPI = cheerio.load(asString);
      for (const tag of ['p', 'div', 'span']) {
        parsed(tag).each((idx: number, el) => {
          const txt: string = StringRatchet.trimToNull(parsed(el).text());
          if (txt && txt.includes('.')) {
            found.push(txt);
          }
        });
      }
    }
    return rval;
  }

  /**
   * Splits the entry's comma separated `languages` field into trimmed, non-blank codes.
   *
   * @returns The language codes, or an empty array when the entry has no languages value
   */
  public static validLanguages(entry: DomainIndexEntryRaw): string[] {
    const validLangs: string[] = (entry?.languages ?? '')
      .split(',')
      .map((s) => StringRatchet.trimToNull(s))
      .filter((s) => !!s);
    return validLangs;
  }

  /**
   * Downloads the byte range of the archive file that an index entry points at, gunzips it and
   * returns the first WARC record found in it.
   *
   * @param entry Index entry supplying filename, offset and length
   * @param language Optional; must be one of the entry's languages and is sent as Accept-Language
   * @throws If the language is not valid for the entry, offset/length are unusable, the HTTP request
   * fails, the response body cannot be read as a stream, or the stream errors/closes before a record is read
   */
  public async pullPageEntry(entry: DomainIndexEntryRaw, language?: string): Promise<WarcEntry> {
    const prefix: string = 'https://data.commoncrawl.org/'; //https://aws-publicdatasets.s3.amazonaws.com/';
    const url: string = prefix + entry.filename;

    // NOTE(review): coerced defensively in case the index JSON delivers these as strings (in which
    // case '+' would concatenate) - confirm against DomainIndexEntryRaw
    const offset: number = Number(entry.offset);
    const length: number = Number(entry.length);
    if (!Number.isFinite(offset) || !Number.isFinite(length) || length < 1) {
      throw ErrorRatchet.fErr('Invalid offset %s / length %s for %s', entry.offset, entry.length, entry.filename);
    }

    // Both ends of an HTTP byte range are inclusive, so the last byte is offset + length - 1
    const headers: Record<string, string> = { Range: 'bytes=' + offset + '-' + (offset + length - 1) };
    if (language) {
      if (!CommonCrawlService.validLanguages(entry).includes(language)) {
        throw ErrorRatchet.fErr('Requested language %s, but valid are %s', language, entry.languages);
      }
      headers['Accept-Language'] = language;
    }

    const resp: Response = await fetch(url, {
      headers: headers,
    });
    Logger.info('Headers is %j', resp.headers);
    if (!resp.ok) {
      throw ErrorRatchet.fErr('Failed to fetch %s : %s : %s', url, resp.status, resp.statusText);
    }

    // Depending on the fetch implementation the body is either a Node stream or a web stream
    let reader: Readable = null;
    if (resp.body instanceof Readable) {
      reader = resp.body;
    } else if (resp.body instanceof ReadableStream) {
      reader = NodeStreamRatchet.webReadableStreamToNodeReadable(resp.body);
    }
    if (!reader) {
      throw ErrorRatchet.fErr('Response body for %s is not a readable stream', url);
    }

    const rval: WarcEntryRaw = await new Promise<WarcEntryRaw>((resolve, reject) => {
      const gunzip = zlib.createGunzip();
      const warcstream: warc = new warc();
      let settled: boolean = false;

      const fail = (err: unknown): void => {
        if (settled) {
          // Typically fallout from tearing the pipe down after the record was already delivered
          Logger.debug('Ignoring late stream error: %s', err);
          return;
        }
        settled = true;
        Logger.error('Read error: %s', err, err);
        reject(err);
      };

      // pipe() does not forward errors, so every stage needs its own handler
      reader.on('error', fail);
      gunzip.on('error', fail);
      warcstream
        .on('data', (val: WarcEntryRaw) => {
          if (settled) {
            return;
          }
          settled = true;
          Logger.info('Got data ' + val.content.length);
          resolve(val);
          // Only the first record is wanted - stop all stages so the connection is released
          warcstream.destroy();
          gunzip.destroy();
          reader.destroy();
        })
        .on('close', () => {
          Logger.info('Got close event');
          if (!settled) {
            fail(new Error('Stream closed before any WARC record was read from ' + url));
          }
        })
        .on('error', fail);

      reader.pipe(gunzip).pipe(warcstream);
    });

    const conv: WarcEntry = {
      protocol: rval.protocol,
      headers: rval.headers,
      content: rval.content.toString(),
    };

    return conv;
  }

  /**
   * Queries the index server for captures of a url.
   *
   * Only `url`, `matchType` (default 'domain') and `index` (default CURRENT_CRAWL) are used from
   * the options; the other option fields (from, to, limit, sort, page, pageSize, showNumPages)
   * are currently not forwarded.
   *
   * @returns The parsed entries (one per non-blank response line), or null when the server did
   * not answer with HTTP 200 (the failure is logged)
   */
  public async search(options: CommonCrawlFetchOptions): Promise<DomainIndexEntryRaw[]> {
    RequireRatchet.notNullOrUndefined(options, 'options');
    RequireRatchet.notNullUndefinedOrOnlyWhitespaceString(options.url, 'options.url');

    const params = {
      url: options.url,
      matchType: options.matchType || 'domain', // exact, prefix, host , domain,
      output: 'json',
    };
    const url: string =
      CommonCrawlService.COMMON_CRAWL_URL +
      (options.index || CommonCrawlService.CURRENT_CRAWL) +
      '-index?' +
      querystring.stringify(params);

    const res: Response = await fetch(url);
    const body: string = await res.text();
    let rval: DomainIndexEntryRaw[] = null;
    if (res.status === 200) {
      // The response is newline-delimited JSON
      rval = body
        .split('\n')
        .filter((s) => !!StringRatchet.trimToNull(s))
        .map((s) => JSON.parse(s));
    } else {
      // res.body was already consumed by text() above, so log the text that was read
      Logger.error('Failed to fetch: %s : %s : %j : %s', res.status, res.statusText, res.headers, body);
    }

    return rval;
  }

  /**
   * Searches the index with the given options and then pulls every matching page, one after another.
   *
   * @param opts Search options (see search)
   * @param onPage Optional callback awaited before each page is pulled, with the zero-based index,
   * the total count and the entry's url; a failure inside it is logged and otherwise ignored
   * @returns The index entries, the successfully pulled pages and a list of per-entry failures
   */
  public async scanSite(
    opts: CommonCrawlFetchOptions,
    onPage?: (idx: number, cnt: number, header: string) => Promise<any>,
  ): Promise<CommonCrawlScan> {
    const sw: StopWatch = new StopWatch();
    const rval: CommonCrawlScan = {
      options: opts,
      pageIndexes: [],
      parsed: [],
      errors: [],
    };
    Logger.info('Performing domain index scan with %j', opts);
    // search() yields null on a non-200 answer - treat that as "nothing found"
    rval.pageIndexes = (await this.search(opts)) ?? [];
    const total: number = rval.pageIndexes.length;
    Logger.info('Found %d entries, pulling each', total);
    for (const [idx, ent] of rval.pageIndexes.entries()) {
      try {
        Logger.info('Pulling item %d of %d, %s', idx, total, sw.dumpExpected(idx / total));
        if (onPage) {
          try {
            await onPage(idx, total, ent.url);
          } catch (err) {
            Logger.warn('Failed onpage: %s', err);
          }
        }
        const parsed: WarcEntry = await this.pullPageEntry(ent);

        rval.parsed.push(parsed);
      } catch (err) {
        Logger.warn('Failed to pull %j : %s', ent, err);
        rval.errors.push({ pageIdx: ent, error: err });
      }
    }
    Logger.info('Completed full scan in %s', sw.dump());
    return rval;
  }
}