pagerts 0.2.0 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/codeql/codeql-config.yml +7 -0
- package/.github/workflows/ci.yml +146 -0
- package/.github/workflows/dependency-update.yml +52 -0
- package/.prettierignore +5 -0
- package/.prettierrc.json +10 -0
- package/MAINTAINERS.md +30 -0
- package/POST-INSTALL.md +205 -0
- package/README.md +220 -16
- package/SECURITY.md +160 -0
- package/bin/main.js +24 -19
- package/bin/main.js.map +4 -4
- package/eslint.config.mjs +83 -0
- package/{jest.config.js → jest.config.cjs} +45 -30
- package/package.json +34 -13
- package/src/__tests__/PageFetcher.test.ts +48 -0
- package/src/__tests__/security.test.ts +153 -0
- package/src/extractors/AbstractExtractor.ts +4 -5
- package/src/extractors/PageExtractor.ts +21 -12
- package/src/extractors/ResourceExtractor.ts +31 -25
- package/src/extractors/TagExtractor.ts +13 -14
- package/src/extractors/index.ts +4 -0
- package/src/main.ts +71 -43
- package/src/page/Page.ts +24 -19
- package/src/page/PageFetcher.ts +81 -30
- package/src/page/index.ts +3 -0
- package/src/printers/AbstractResourcePrinter.ts +6 -6
- package/src/printers/JSONStylePrinter.ts +9 -12
- package/src/printers/LogStylePrinter.ts +30 -28
- package/src/printers/index.ts +3 -0
- package/src/resource.ts +88 -96
- package/src/security.ts +184 -0
- package/tsconfig.eslint.json +5 -0
- package/tsconfig.json +27 -11
- package/bin/package.json +0 -40
- package/bin/src/extractors/AbstractExtractor.js +0 -11
- package/bin/src/extractors/AbstractExtractor.js.map +0 -1
- package/bin/src/extractors/PageExtractor.js +0 -13
- package/bin/src/extractors/PageExtractor.js.map +0 -1
- package/bin/src/extractors/ResourceExtractor.js +0 -32
- package/bin/src/extractors/ResourceExtractor.js.map +0 -1
- package/bin/src/main.js +0 -36
- package/bin/src/main.js.map +0 -1
- package/bin/src/page/Page.js +0 -8
- package/bin/src/page/Page.js.map +0 -1
- package/bin/src/page/PageFetcher.js +0 -26
- package/bin/src/page/PageFetcher.js.map +0 -1
- package/bin/src/printers/AbstractResourcePrinter.js +0 -8
- package/bin/src/printers/AbstractResourcePrinter.js.map +0 -1
- package/bin/src/printers/JSONStylePrinter.js +0 -12
- package/bin/src/printers/JSONStylePrinter.js.map +0 -1
- package/bin/src/printers/LogStylePrinter.js +0 -27
- package/bin/src/printers/LogStylePrinter.js.map +0 -1
- package/bin/src/resource.js +0 -56
- package/bin/src/resource.js.map +0 -1
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
import eslint from '@eslint/js';
|
|
2
|
+
import tseslint from '@typescript-eslint/eslint-plugin';
|
|
3
|
+
import tsparser from '@typescript-eslint/parser';
|
|
4
|
+
import security from 'eslint-plugin-security';
|
|
5
|
+
import prettier from 'eslint-config-prettier';
|
|
6
|
+
|
|
7
|
+
// Paths excluded from linting.
const ignoredPaths = ['bin/**', 'coverage/**', 'node_modules/**'];

// Identifiers declared as read-only globals for files under src/.
const sourceGlobals = {
  console: 'readonly',
  process: 'readonly',
  __dirname: 'readonly',
  __filename: 'readonly',
  Buffer: 'readonly',
};

// Additional read-only globals declared for files under src/__tests__/.
const testGlobals = {
  describe: 'readonly',
  it: 'readonly',
  expect: 'readonly',
  beforeEach: 'readonly',
  setTimeout: 'readonly',
};

// Disable base JS rules in favor of TS-aware equivalents.
const baseRuleOverrides = {
  'no-unused-vars': 'off',
  'no-undef': 'off',
};

// TypeScript rules.
const typescriptRules = {
  '@typescript-eslint/no-explicit-any': 'error',
  '@typescript-eslint/explicit-function-return-type': 'warn',
  // Unused arguments are tolerated only when their name starts with "_".
  '@typescript-eslint/no-unused-vars': ['error', { argsIgnorePattern: '^_' }],
  '@typescript-eslint/no-non-null-assertion': 'error',
  '@typescript-eslint/prefer-nullish-coalescing': 'warn',
  '@typescript-eslint/prefer-optional-chain': 'warn',
};

// Security rules (eslint-plugin-security).
const securityRules = {
  'security/detect-object-injection': 'warn',
  'security/detect-non-literal-regexp': 'warn',
  'security/detect-unsafe-regex': 'error',
  'security/detect-buffer-noassert': 'error',
  'security/detect-child-process': 'warn',
  'security/detect-disable-mustache-escape': 'error',
  'security/detect-eval-with-expression': 'error',
  'security/detect-no-csrf-before-method-override': 'error',
  'security/detect-non-literal-fs-filename': 'warn',
  'security/detect-non-literal-require': 'warn',
  'security/detect-possible-timing-attacks': 'warn',
  'security/detect-pseudoRandomBytes': 'error',
};

// General rules.
const generalRules = {
  // console.warn and console.error are allowed; other console calls warn.
  'no-console': ['warn', { allow: ['warn', 'error'] }],
  'no-debugger': 'error',
  'no-eval': 'error',
  'no-implied-eval': 'error',
  'no-new-func': 'error',
  'prefer-const': 'error',
  'no-var': 'error',
};

// Flat config array: ignores, the ESLint recommended set, the TypeScript
// source block, the test-file globals block, and finally the prettier config.
export default [
  { ignores: ignoredPaths },
  eslint.configs.recommended,
  {
    files: ['src/**/*.ts'],
    languageOptions: {
      parser: tsparser,
      parserOptions: {
        ecmaVersion: 2022,
        sourceType: 'module',
        project: './tsconfig.eslint.json',
      },
      globals: sourceGlobals,
    },
    plugins: {
      '@typescript-eslint': tseslint,
      security,
    },
    rules: {
      ...baseRuleOverrides,
      ...typescriptRules,
      ...securityRules,
      ...generalRules,
    },
  },
  {
    files: ['src/__tests__/**/*.ts'],
    languageOptions: {
      globals: testGlobals,
    },
  },
  prettier,
];
|
|
@@ -5,48 +5,63 @@
|
|
|
5
5
|
|
|
6
6
|
/** @type {import('jest').Config} */
|
|
7
7
|
const config = {
|
|
8
|
-
|
|
9
|
-
|
|
8
|
+
preset: 'ts-jest',
|
|
9
|
+
testEnvironment: 'node',
|
|
10
10
|
|
|
11
|
-
//
|
|
12
|
-
|
|
11
|
+
// Support for ES modules
|
|
12
|
+
extensionsToTreatAsEsm: ['.ts'],
|
|
13
13
|
|
|
14
|
-
//
|
|
15
|
-
|
|
14
|
+
// Strip the trailing .js from relative import specifiers so they resolve to the TypeScript sources
|
|
15
|
+
moduleNameMapper: {
|
|
16
|
+
'^(\\.{1,2}/.*)\\.js$': '$1',
|
|
17
|
+
},
|
|
16
18
|
|
|
17
19
|
// Automatically clear mock calls, instances, contexts and results before every test
|
|
18
|
-
|
|
20
|
+
clearMocks: true,
|
|
19
21
|
|
|
20
22
|
// Indicates whether the coverage information should be collected while executing the test
|
|
21
23
|
collectCoverage: true,
|
|
22
24
|
|
|
23
|
-
// An array of glob patterns indicating a set of files for which coverage information should be collected
|
|
24
|
-
// collectCoverageFrom: undefined,
|
|
25
|
-
|
|
26
25
|
// The directory where Jest should output its coverage files
|
|
27
|
-
coverageDirectory:
|
|
28
|
-
|
|
29
|
-
// An array of regexp pattern strings used to skip coverage collection
|
|
30
|
-
// coveragePathIgnorePatterns: [
|
|
31
|
-
// "/node_modules/"
|
|
32
|
-
// ],
|
|
26
|
+
coverageDirectory: 'coverage',
|
|
33
27
|
|
|
34
28
|
// Indicates which provider should be used to instrument code for coverage
|
|
35
|
-
coverageProvider:
|
|
29
|
+
coverageProvider: 'v8',
|
|
36
30
|
|
|
37
|
-
//
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
//
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
31
|
+
// An array of glob patterns indicating a set of files for which coverage information should be collected
|
|
32
|
+
collectCoverageFrom: ['src/**/*.ts', '!src/**/*.test.ts', '!src/**/*.spec.ts'],
|
|
33
|
+
|
|
34
|
+
// Coverage thresholds - realistic for current state, will improve over time
|
|
35
|
+
coverageThreshold: {
|
|
36
|
+
global: {
|
|
37
|
+
branches: 30,
|
|
38
|
+
functions: 35,
|
|
39
|
+
lines: 30,
|
|
40
|
+
statements: 30,
|
|
41
|
+
},
|
|
42
|
+
},
|
|
43
|
+
|
|
44
|
+
// Test match patterns
|
|
45
|
+
testMatch: ['**/__tests__/**/*.ts', '**/?(*.)+(spec|test).ts'],
|
|
46
|
+
|
|
47
|
+
// Module file extensions
|
|
48
|
+
moduleFileExtensions: ['ts', 'tsx', 'js', 'jsx', 'json', 'node'],
|
|
49
|
+
|
|
50
|
+
// Transform files with ts-jest
|
|
51
|
+
transform: {
|
|
52
|
+
'^.+\\.ts$': [
|
|
53
|
+
'ts-jest',
|
|
54
|
+
{
|
|
55
|
+
useESM: true,
|
|
56
|
+
tsconfig: {
|
|
57
|
+
module: 'ES2022',
|
|
58
|
+
target: 'ES2022',
|
|
59
|
+
esModuleInterop: true,
|
|
60
|
+
moduleResolution: 'node',
|
|
61
|
+
},
|
|
62
|
+
},
|
|
63
|
+
],
|
|
64
|
+
},
|
|
50
65
|
|
|
51
66
|
// Make calling deprecated APIs throw helpful error messages
|
|
52
67
|
// errorOnDeprecated: false,
|
package/package.json
CHANGED
|
@@ -1,22 +1,36 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "pagerts",
|
|
3
3
|
"description": "A tool for viewing external relations in a webpage",
|
|
4
|
-
"version": "0.
|
|
4
|
+
"version": "0.4.1",
|
|
5
5
|
"main": "main.js",
|
|
6
6
|
"bin": {
|
|
7
7
|
"pagerts": "bin/main.js"
|
|
8
8
|
},
|
|
9
|
+
"engines": {
|
|
10
|
+
"node": ">=18.0.0"
|
|
11
|
+
},
|
|
9
12
|
"scripts": {
|
|
10
|
-
"test": "jest",
|
|
11
|
-
"
|
|
12
|
-
"
|
|
13
|
+
"test": "jest --coverage",
|
|
14
|
+
"test:watch": "jest --watch",
|
|
15
|
+
"build": "esbuild src/main.ts --external:jsdom --bundle --outdir=bin --minify --sourcemap --platform=node --format=esm",
|
|
16
|
+
"lint": "eslint src/**/*.ts",
|
|
17
|
+
"lint:fix": "eslint src/**/*.ts --fix",
|
|
18
|
+
"type-check": "tsc --noEmit",
|
|
19
|
+
"format": "prettier --write \"src/**/*.ts\"",
|
|
20
|
+
"format:check": "prettier --check \"src/**/*.ts\"",
|
|
21
|
+
"security:audit": "npm audit --audit-level=moderate",
|
|
22
|
+
"security:check": "npm run security:audit && npm run lint",
|
|
13
23
|
"start": "node ./bin/main.js",
|
|
14
|
-
"dev": "
|
|
24
|
+
"dev": "tsx src/main.ts",
|
|
25
|
+
"prepare": "npm run build"
|
|
15
26
|
},
|
|
16
27
|
"keywords": [
|
|
17
28
|
"webpage",
|
|
18
29
|
"hierarchy",
|
|
19
|
-
"management"
|
|
30
|
+
"management",
|
|
31
|
+
"web-scraping",
|
|
32
|
+
"cli",
|
|
33
|
+
"url-extraction"
|
|
20
34
|
],
|
|
21
35
|
"author": "Kirill kn253 Nevzorov",
|
|
22
36
|
"license": "MIT",
|
|
@@ -25,16 +39,23 @@
|
|
|
25
39
|
},
|
|
26
40
|
"homepage": "https://github.com/akinevz0/pagerts",
|
|
27
41
|
"dependencies": {
|
|
28
|
-
"blessed": "^0.1.81",
|
|
29
42
|
"commander": "^12.1.0",
|
|
30
|
-
"
|
|
31
|
-
"jsdom": "^26.0.0"
|
|
43
|
+
"jsdom": "^25.0.1"
|
|
32
44
|
},
|
|
33
45
|
"devDependencies": {
|
|
34
|
-
"@types/
|
|
46
|
+
"@types/jest": "^29.5.14",
|
|
35
47
|
"@types/jsdom": "^21.1.7",
|
|
36
|
-
"@types/node": "^22.
|
|
48
|
+
"@types/node": "^22.10.5",
|
|
49
|
+
"@typescript-eslint/eslint-plugin": "^8.20.0",
|
|
50
|
+
"@typescript-eslint/parser": "^8.20.0",
|
|
37
51
|
"esbuild": "^0.25.1",
|
|
38
|
-
"
|
|
52
|
+
"eslint": "^9.18.0",
|
|
53
|
+
"eslint-config-prettier": "^9.1.0",
|
|
54
|
+
"eslint-plugin-security": "^3.0.1",
|
|
55
|
+
"jest": "^29.7.0",
|
|
56
|
+
"prettier": "^3.4.2",
|
|
57
|
+
"ts-jest": "^29.2.5",
|
|
58
|
+
"tsx": "^4.19.2",
|
|
59
|
+
"typescript": "^5.7.2"
|
|
39
60
|
}
|
|
40
|
-
}
|
|
61
|
+
}
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
import { PageFetcher } from '../page/PageFetcher';
|
|
2
|
+
|
|
3
|
+
/**
 * Tests for PageFetcher.fetchAll.
 *
 * NOTE(review): every case passes real hostnames (example.com, example.org,
 * httpbin.org) to fetchAll — presumably these tests need network access;
 * confirm against the PageFetcher implementation and the CI environment.
 */
describe('PageFetcher', () => {
  let pageFetcher: PageFetcher;

  // A fresh default-configured fetcher for each test.
  beforeEach(() => {
    pageFetcher = new PageFetcher();
  });

  describe('fetchAll', () => {
    it('should fetch valid URLs', async () => {
      const results = await pageFetcher.fetchAll(['https://example.com']);

      expect(results.length).toBeGreaterThan(0);
      const [first] = results;
      expect(first.url).toBe('https://example.com');
    });

    it('should handle invalid URLs gracefully', async () => {
      const unreachable = ['https://this-domain-definitely-does-not-exist-12345.com'];
      const results = await pageFetcher.fetchAll(unreachable);

      expect(results.length).toBeGreaterThan(0);
      const [first] = results;
      // The message is only checked when an error is reported; a response
      // without `error` also passes this test.
      if (first.error) {
        expect(first.error).toContain('Failed to fetch');
      }
    });

    it('should handle multiple URLs', async () => {
      const results = await pageFetcher.fetchAll(['https://example.com', 'https://example.org']);

      expect(results).toHaveLength(2);
    });

    it('should have timeout for slow requests', async () => {
      // 100ms timeout, no retries
      const slowFetcher = new PageFetcher(100, 0);
      // This will timeout
      const delayed = ['https://httpbin.org/delay/5'];

      const results = await slowFetcher.fetchAll(delayed);
      expect(results.length).toBeGreaterThan(0);

      const [first] = results;
      // As above, the message is only checked when an error is reported.
      if (first.error) {
        expect(first.error).toContain('timeout');
      }
    }, 10000); // per-test timeout in milliseconds
  });
});
|
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
import { validateUrl, validateUrls, RateLimiter, sanitizeText } from '../security';
|
|
2
|
+
|
|
3
|
+
/**
 * Tests for the helpers imported from ../security:
 * validateUrl, validateUrls, RateLimiter and sanitizeText.
 */
describe('Security Module', () => {
  describe('validateUrl', () => {
    it('should validate a proper HTTPS URL', () => {
      const result = validateUrl('https://example.com');
      expect(result.isValid).toBe(true);
      // The sanitized form carries a trailing slash.
      expect(result.sanitizedUrl).toBe('https://example.com/');
    });

    it('should validate a proper HTTP URL', () => {
      const result = validateUrl('http://example.com');
      expect(result.isValid).toBe(true);
    });

    it('should validate a file:// URL', () => {
      const result = validateUrl('file:///path/to/file.html');
      expect(result.isValid).toBe(true);
    });

    it('should reject empty URLs', () => {
      const result = validateUrl('');
      expect(result.isValid).toBe(false);
      expect(result.error).toContain('empty');
    });

    it('should reject URLs with javascript: protocol', () => {
      const result = validateUrl('javascript:alert(1)');
      expect(result.isValid).toBe(false);
      expect(result.error).toContain('suspicious');
    });

    it('should reject URLs with data: protocol', () => {
      const result = validateUrl('data:text/html,<script>alert(1)</script>');
      expect(result.isValid).toBe(false);
      expect(result.error).toContain('suspicious');
    });

    it('should reject URLs exceeding maximum length', () => {
      const longUrl = 'https://example.com/' + 'a'.repeat(3000);
      const result = validateUrl(longUrl);
      expect(result.isValid).toBe(false);
      expect(result.error).toContain('exceeds maximum length');
    });

    it('should reject URLs with script tags', () => {
      const result = validateUrl('https://example.com/<script>alert(1)</script>');
      expect(result.isValid).toBe(false);
      expect(result.error).toContain('suspicious');
    });

    it('should reject invalid URL formats', () => {
      const result = validateUrl('not-a-valid-url');
      expect(result.isValid).toBe(false);
      expect(result.error).toContain('Invalid URL format');
    });

    it('should trim whitespace from URLs', () => {
      const result = validateUrl(' https://example.com ');
      expect(result.isValid).toBe(true);
      expect(result.sanitizedUrl).toBe('https://example.com/');
    });
  });

  describe('validateUrls', () => {
    it('should validate multiple URLs and separate valid from invalid', () => {
      const urls = ['https://example.com', 'javascript:alert(1)', 'http://test.com', 'invalid-url'];
      const result = validateUrls(urls);

      expect(result.validUrls.length).toBe(2);
      expect(result.errors.length).toBe(2);
      expect(result.validUrls).toContain('https://example.com/');
      expect(result.validUrls).toContain('http://test.com/');
    });

    it('should return empty arrays for empty input', () => {
      const result = validateUrls([]);
      expect(result.validUrls.length).toBe(0);
      expect(result.errors.length).toBe(0);
    });
  });

  describe('RateLimiter', () => {
    it('should allow requests within the limit', () => {
      const limiter = new RateLimiter(5, 1000);

      for (let i = 0; i < 5; i++) {
        expect(limiter.isAllowed()).toBe(true);
      }
    });

    it('should block requests exceeding the limit', () => {
      const limiter = new RateLimiter(3, 1000);

      // Use up all allowed requests
      for (let i = 0; i < 3; i++) {
        limiter.isAllowed();
      }

      // Next request should be blocked
      expect(limiter.isAllowed()).toBe(false);
    });

    it('should reset after the time window', async () => {
      const limiter = new RateLimiter(2, 100); // 100ms window

      limiter.isAllowed();
      limiter.isAllowed();
      expect(limiter.isAllowed()).toBe(false);

      // Wait for window to expire
      await new Promise((resolve) => setTimeout(resolve, 150));

      // Should be allowed again
      expect(limiter.isAllowed()).toBe(true);
    });

    it('should correctly report remaining requests', () => {
      const limiter = new RateLimiter(5, 1000);

      expect(limiter.getRemainingRequests()).toBe(5);
      limiter.isAllowed();
      expect(limiter.getRemainingRequests()).toBe(4);
      limiter.isAllowed();
      expect(limiter.getRemainingRequests()).toBe(3);
    });
  });

  // NOTE(review): the expected strings below are written as HTML entities.
  // The hex spellings (&#x27; for the apostrophe, &#x2F; for the slash) are
  // assumed — confirm them against sanitizeText in src/security.ts.
  describe('sanitizeText', () => {
    it('should sanitize HTML special characters', () => {
      const input = '<script>alert("XSS")</script>';
      const output = sanitizeText(input);
      expect(output).toBe('&lt;script&gt;alert(&quot;XSS&quot;)&lt;&#x2F;script&gt;');
    });

    it('should handle empty strings', () => {
      expect(sanitizeText('')).toBe('');
    });

    it('should escape quotes and apostrophes', () => {
      const input = `It's a "test"`;
      const output = sanitizeText(input);
      expect(output).toContain('&#x27;');
      expect(output).toContain('&quot;');
    });

    it('should escape forward slashes', () => {
      const input = '</script>';
      const output = sanitizeText(input);
      expect(output).toBe('&lt;&#x2F;script&gt;');
    });
  });
});
|
|
@@ -1,5 +1,4 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
}
|
|
1
|
+
/**
 * Base class for extractors that turn an input of type `V` into a result of
 * type `R`.
 *
 * @typeParam V - the value handed to {@link AbstractExtractor.extract}
 * @typeParam R - the value the returned promise resolves to
 */
export abstract class AbstractExtractor<V, R> {
  /** Label supplied by the subclass at construction time. */
  readonly name: string;

  /**
   * @param name - label stored on the instance as {@link AbstractExtractor.name}
   */
  constructor(name: string) {
    this.name = name;
  }

  /**
   * Produce a result from `value`.
   *
   * @param value - the input to extract from
   * @returns a promise for the extracted result
   */
  abstract extract(value: V): Promise<R>;
}
|
|
@@ -1,12 +1,21 @@
|
|
|
1
|
-
import {
|
|
2
|
-
import { JSDOM } from 'jsdom';
|
|
3
|
-
import { AbstractExtractor } from './AbstractExtractor';
|
|
4
|
-
|
|
5
|
-
export class PageExtractor extends AbstractExtractor<JSDOM, Page> {
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
1
|
+
import type { Page } from '../page/index.js';
|
|
2
|
+
import { JSDOM } from 'jsdom';
|
|
3
|
+
import { AbstractExtractor } from './AbstractExtractor.js';
|
|
4
|
+
|
|
5
|
+
/**
 * Extractor that reads the title and URL of a JSDOM document and returns them
 * as a `Page`.
 */
export class PageExtractor extends AbstractExtractor<JSDOM, Page> {
  constructor() {
    super('page-extractor');
  }

  /**
   * @param value - the JSDOM instance whose document is inspected
   * @returns a promise for `{ title, url }`, where `title` is the document
   *          title and `url` is the document's `location.href`
   */
  async extract(value: JSDOM): Promise<Page> {
    const doc = value.window.document;
    const title = doc.title;
    const url = doc.location.href;
    return { title, url };
  }
}
|
|
@@ -1,25 +1,31 @@
|
|
|
1
|
-
import type { JSDOM } from
|
|
2
|
-
import {
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
1
|
+
import type { JSDOM } from 'jsdom';
|
|
2
|
+
import {
|
|
3
|
+
findResourceLink,
|
|
4
|
+
findResourceText,
|
|
5
|
+
type ExternalResource,
|
|
6
|
+
type Resource,
|
|
7
|
+
type Tag,
|
|
8
|
+
} from '../resource.js';
|
|
9
|
+
import { AbstractExtractor } from './AbstractExtractor.js';
|
|
10
|
+
|
|
11
|
+
/**
 * Extractor that collects external resources from a JSDOM document: for each
 * configured tag it queries the matching elements and keeps those that yield
 * both a text and a link whose URL starts with "http".
 */
export class ResourceExtractor extends AbstractExtractor<JSDOM, ExternalResource[]> {
  /**
   * @param tags - tag names used as selectors, visited in the given order
   */
  constructor(private readonly tags: Tag[]) {
    // Use a label distinct from PageExtractor's 'page-extractor'.
    super('resource-extractor');
  }

  /**
   * @param value - the JSDOM instance whose document is searched
   * @returns a promise for the `{ text, link }` pairs found, in tag order and
   *          then in the order `querySelectorAll` returns the elements
   */
  async extract(value: JSDOM): Promise<ExternalResource[]> {
    const { document } = value.window;
    const externalResources: ExternalResource[] = [];
    for (const tag of this.tags) {
      const elements = Array.from(document.querySelectorAll<Resource>(tag));
      for (const element of elements) {
        const text = findResourceText(element);
        const link = findResourceLink(element);
        // Skip elements that lack either a text or a link.
        if (!text || !link) continue;
        // Keep only links whose URL begins with "http".
        if (!link.url.startsWith('http')) continue;
        externalResources.push({ text, link });
      }
    }
    return externalResources;
  }
}
|
|
@@ -1,14 +1,13 @@
|
|
|
1
|
-
import { JSDOM } from 'jsdom';
|
|
2
|
-
import type { Resource, Tag } from '../resource';
|
|
3
|
-
import { AbstractExtractor } from './AbstractExtractor';
|
|
4
|
-
|
|
5
|
-
export class TagExtractor<T extends Tag> extends AbstractExtractor<JSDOM, Resource[]> {
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
}
|
|
1
|
+
import { JSDOM } from 'jsdom';
|
|
2
|
+
import type { Resource, Tag } from '../resource.js';
|
|
3
|
+
import { AbstractExtractor } from './AbstractExtractor.js';
|
|
4
|
+
|
|
5
|
+
/**
 * Extractor that returns every element in a JSDOM document matching a single
 * tag name.
 *
 * @typeParam T - the tag name this extractor selects
 */
export class TagExtractor<T extends Tag> extends AbstractExtractor<JSDOM, Resource[]> {
  private readonly tagName: T;

  /**
   * @param tagName - tag used as the selector; also embedded in the extractor
   *                  name as `extract <tagName>`
   */
  constructor(tagName: T) {
    super(`extract <${tagName}>`);
    this.tagName = tagName;
  }

  /**
   * @param value - the JSDOM instance whose document is queried
   * @returns a promise resolving to the matching elements as an array
   */
  extract(value: JSDOM): Promise<Resource[]> {
    const { document } = value.window;
    const matches = document.querySelectorAll<Resource>(this.tagName);
    return Promise.resolve(Array.from(matches));
  }
}
|