pagerts 0.4.1 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/main.js +6 -27
- package/bin/main.js.map +4 -4
- package/package.json +6 -2
- package/.github/codeql/codeql-config.yml +0 -7
- package/.github/workflows/ci.yml +0 -146
- package/.github/workflows/dependency-update.yml +0 -52
- package/.prettierignore +0 -5
- package/.prettierrc.json +0 -10
- package/MAINTAINERS.md +0 -30
- package/POST-INSTALL.md +0 -205
- package/SECURITY.md +0 -160
- package/eslint.config.mjs +0 -83
- package/jest.config.cjs +0 -213
- package/src/__tests__/PageFetcher.test.ts +0 -48
- package/src/__tests__/security.test.ts +0 -153
- package/src/extractors/AbstractExtractor.ts +0 -4
- package/src/extractors/PageExtractor.ts +0 -21
- package/src/extractors/ResourceExtractor.ts +0 -31
- package/src/extractors/TagExtractor.ts +0 -13
- package/src/extractors/index.ts +0 -4
- package/src/main.ts +0 -71
- package/src/page/Page.ts +0 -24
- package/src/page/PageFetcher.ts +0 -81
- package/src/page/index.ts +0 -3
- package/src/printers/AbstractResourcePrinter.ts +0 -6
- package/src/printers/JSONStylePrinter.ts +0 -9
- package/src/printers/LogStylePrinter.ts +0 -30
- package/src/printers/index.ts +0 -3
- package/src/resource.ts +0 -88
- package/src/security.ts +0 -184
- package/tsconfig.eslint.json +0 -5
- package/tsconfig.json +0 -28
package/jest.config.cjs
DELETED
|
@@ -1,213 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* For a detailed explanation regarding each configuration property, visit:
|
|
3
|
-
* https://jestjs.io/docs/configuration
|
|
4
|
-
*/
|
|
5
|
-
|
|
6
|
-
/** @type {import('jest').Config} */
|
|
7
|
-
const config = {
|
|
8
|
-
preset: 'ts-jest',
|
|
9
|
-
testEnvironment: 'node',
|
|
10
|
-
|
|
11
|
-
// Support for ES modules
|
|
12
|
-
extensionsToTreatAsEsm: ['.ts'],
|
|
13
|
-
|
|
14
|
-
// Strip the trailing .js from relative import specifiers so ESM-style imports resolve to the TypeScript sources
|
|
15
|
-
moduleNameMapper: {
|
|
16
|
-
'^(\\.{1,2}/.*)\\.js$': '$1',
|
|
17
|
-
},
|
|
18
|
-
|
|
19
|
-
// Automatically clear mock calls, instances, contexts and results before every test
|
|
20
|
-
clearMocks: true,
|
|
21
|
-
|
|
22
|
-
// Indicates whether the coverage information should be collected while executing the test
|
|
23
|
-
collectCoverage: true,
|
|
24
|
-
|
|
25
|
-
// The directory where Jest should output its coverage files
|
|
26
|
-
coverageDirectory: 'coverage',
|
|
27
|
-
|
|
28
|
-
// Indicates which provider should be used to instrument code for coverage
|
|
29
|
-
coverageProvider: 'v8',
|
|
30
|
-
|
|
31
|
-
// An array of glob patterns indicating a set of files for which coverage information should be collected
|
|
32
|
-
collectCoverageFrom: ['src/**/*.ts', '!src/**/*.test.ts', '!src/**/*.spec.ts'],
|
|
33
|
-
|
|
34
|
-
// Coverage thresholds - realistic for current state, will improve over time
|
|
35
|
-
coverageThreshold: {
|
|
36
|
-
global: {
|
|
37
|
-
branches: 30,
|
|
38
|
-
functions: 35,
|
|
39
|
-
lines: 30,
|
|
40
|
-
statements: 30,
|
|
41
|
-
},
|
|
42
|
-
},
|
|
43
|
-
|
|
44
|
-
// Test match patterns
|
|
45
|
-
testMatch: ['**/__tests__/**/*.ts', '**/?(*.)+(spec|test).ts'],
|
|
46
|
-
|
|
47
|
-
// Module file extensions
|
|
48
|
-
moduleFileExtensions: ['ts', 'tsx', 'js', 'jsx', 'json', 'node'],
|
|
49
|
-
|
|
50
|
-
// Transform files with ts-jest
|
|
51
|
-
transform: {
|
|
52
|
-
'^.+\\.ts$': [
|
|
53
|
-
'ts-jest',
|
|
54
|
-
{
|
|
55
|
-
useESM: true,
|
|
56
|
-
tsconfig: {
|
|
57
|
-
module: 'ES2022',
|
|
58
|
-
target: 'ES2022',
|
|
59
|
-
esModuleInterop: true,
|
|
60
|
-
moduleResolution: 'node',
|
|
61
|
-
},
|
|
62
|
-
},
|
|
63
|
-
],
|
|
64
|
-
},
|
|
65
|
-
|
|
66
|
-
// Make calling deprecated APIs throw helpful error messages
|
|
67
|
-
// errorOnDeprecated: false,
|
|
68
|
-
|
|
69
|
-
// The default configuration for fake timers
|
|
70
|
-
// fakeTimers: {
|
|
71
|
-
// "enableGlobally": false
|
|
72
|
-
// },
|
|
73
|
-
|
|
74
|
-
// Force coverage collection from ignored files using an array of glob patterns
|
|
75
|
-
// forceCoverageMatch: [],
|
|
76
|
-
|
|
77
|
-
// A path to a module which exports an async function that is triggered once before all test suites
|
|
78
|
-
// globalSetup: undefined,
|
|
79
|
-
|
|
80
|
-
// A path to a module which exports an async function that is triggered once after all test suites
|
|
81
|
-
// globalTeardown: undefined,
|
|
82
|
-
|
|
83
|
-
// A set of global variables that need to be available in all test environments
|
|
84
|
-
// globals: {},
|
|
85
|
-
|
|
86
|
-
// The maximum amount of workers used to run your tests. Can be specified as % or a number. E.g. maxWorkers: 10% will use 10% of your CPU amount + 1 as the maximum worker number. maxWorkers: 2 will use a maximum of 2 workers.
|
|
87
|
-
// maxWorkers: "50%",
|
|
88
|
-
|
|
89
|
-
// An array of directory names to be searched recursively up from the requiring module's location
|
|
90
|
-
// moduleDirectories: [
|
|
91
|
-
// "node_modules"
|
|
92
|
-
// ],
|
|
93
|
-
|
|
94
|
-
// An array of file extensions your modules use
|
|
95
|
-
// moduleFileExtensions: [
|
|
96
|
-
// "js",
|
|
97
|
-
// "mjs",
|
|
98
|
-
// "cjs",
|
|
99
|
-
// "jsx",
|
|
100
|
-
// "ts",
|
|
101
|
-
// "tsx",
|
|
102
|
-
// "json",
|
|
103
|
-
// "node"
|
|
104
|
-
// ],
|
|
105
|
-
|
|
106
|
-
// A map from regular expressions to module names or to arrays of module names that allow to stub out resources with a single module
|
|
107
|
-
// moduleNameMapper: {},
|
|
108
|
-
|
|
109
|
-
// An array of regexp pattern strings, matched against all module paths before considered 'visible' to the module loader
|
|
110
|
-
// modulePathIgnorePatterns: [],
|
|
111
|
-
|
|
112
|
-
// Activates notifications for test results
|
|
113
|
-
// notify: false,
|
|
114
|
-
|
|
115
|
-
// An enum that specifies notification mode. Requires { notify: true }
|
|
116
|
-
// notifyMode: "failure-change",
|
|
117
|
-
|
|
118
|
-
// A preset that is used as a base for Jest's configuration
|
|
119
|
-
// preset: undefined,
|
|
120
|
-
|
|
121
|
-
// Run tests from one or more projects
|
|
122
|
-
// projects: undefined,
|
|
123
|
-
|
|
124
|
-
// Use this configuration option to add custom reporters to Jest
|
|
125
|
-
// reporters: undefined,
|
|
126
|
-
|
|
127
|
-
// Automatically reset mock state before every test
|
|
128
|
-
// resetMocks: false,
|
|
129
|
-
|
|
130
|
-
// Reset the module registry before running each individual test
|
|
131
|
-
// resetModules: false,
|
|
132
|
-
|
|
133
|
-
// A path to a custom resolver
|
|
134
|
-
// resolver: undefined,
|
|
135
|
-
|
|
136
|
-
// Automatically restore mock state and implementation before every test
|
|
137
|
-
// restoreMocks: false,
|
|
138
|
-
|
|
139
|
-
// The root directory that Jest should scan for tests and modules within
|
|
140
|
-
// rootDir: undefined,
|
|
141
|
-
|
|
142
|
-
// A list of paths to directories that Jest should use to search for files in
|
|
143
|
-
// roots: [
|
|
144
|
-
// "<rootDir>"
|
|
145
|
-
// ],
|
|
146
|
-
|
|
147
|
-
// Allows you to use a custom runner instead of Jest's default test runner
|
|
148
|
-
// runner: "jest-runner",
|
|
149
|
-
|
|
150
|
-
// The paths to modules that run some code to configure or set up the testing environment before each test
|
|
151
|
-
// setupFiles: [],
|
|
152
|
-
|
|
153
|
-
// A list of paths to modules that run some code to configure or set up the testing framework before each test
|
|
154
|
-
// setupFilesAfterEnv: [],
|
|
155
|
-
|
|
156
|
-
// The number of seconds after which a test is considered as slow and reported as such in the results.
|
|
157
|
-
// slowTestThreshold: 5,
|
|
158
|
-
|
|
159
|
-
// A list of paths to snapshot serializer modules Jest should use for snapshot testing
|
|
160
|
-
// snapshotSerializers: [],
|
|
161
|
-
|
|
162
|
-
// The test environment that will be used for testing
|
|
163
|
-
// testEnvironment: "jest-environment-node",
|
|
164
|
-
|
|
165
|
-
// Options that will be passed to the testEnvironment
|
|
166
|
-
// testEnvironmentOptions: {},
|
|
167
|
-
|
|
168
|
-
// Adds a location field to test results
|
|
169
|
-
// testLocationInResults: false,
|
|
170
|
-
|
|
171
|
-
// The glob patterns Jest uses to detect test files
|
|
172
|
-
// testMatch: [
|
|
173
|
-
// "**/__tests__/**/*.[jt]s?(x)",
|
|
174
|
-
// "**/?(*.)+(spec|test).[tj]s?(x)"
|
|
175
|
-
// ],
|
|
176
|
-
|
|
177
|
-
// An array of regexp pattern strings that are matched against all test paths, matched tests are skipped
|
|
178
|
-
// testPathIgnorePatterns: [
|
|
179
|
-
// "/node_modules/"
|
|
180
|
-
// ],
|
|
181
|
-
|
|
182
|
-
// The regexp pattern or array of patterns that Jest uses to detect test files
|
|
183
|
-
// testRegex: [],
|
|
184
|
-
|
|
185
|
-
// This option allows the use of a custom results processor
|
|
186
|
-
// testResultsProcessor: undefined,
|
|
187
|
-
|
|
188
|
-
// This option allows use of a custom test runner
|
|
189
|
-
// testRunner: "jest-circus/runner",
|
|
190
|
-
|
|
191
|
-
// A map from regular expressions to paths to transformers
|
|
192
|
-
// transform: undefined,
|
|
193
|
-
|
|
194
|
-
// An array of regexp pattern strings that are matched against all source file paths, matched files will skip transformation
|
|
195
|
-
// transformIgnorePatterns: [
|
|
196
|
-
// "/node_modules/",
|
|
197
|
-
// "\\.pnp\\.[^\\/]+$"
|
|
198
|
-
// ],
|
|
199
|
-
|
|
200
|
-
// An array of regexp pattern strings that are matched against all modules before the module loader will automatically return a mock for them
|
|
201
|
-
// unmockedModulePathPatterns: undefined,
|
|
202
|
-
|
|
203
|
-
// Indicates whether each individual test should be reported during the run
|
|
204
|
-
// verbose: undefined,
|
|
205
|
-
|
|
206
|
-
// An array of regexp patterns that are matched against all source file paths before re-running tests in watch mode
|
|
207
|
-
// watchPathIgnorePatterns: [],
|
|
208
|
-
|
|
209
|
-
// Whether to use watchman for file crawling
|
|
210
|
-
// watchman: true,
|
|
211
|
-
};
|
|
212
|
-
|
|
213
|
-
module.exports = config;
|
|
@@ -1,48 +0,0 @@
|
|
|
1
|
-
import { PageFetcher } from '../page/PageFetcher';
|
|
2
|
-
|
|
3
|
-
/**
 * Tests for PageFetcher.fetchAll.
 *
 * These tests issue real requests to the URLs listed in each case, so they
 * need outbound network access to pass.
 */
describe('PageFetcher', () => {
  let pageFetcher: PageFetcher;

  beforeEach(() => {
    pageFetcher = new PageFetcher();
  });

  describe('fetchAll', () => {
    it('should fetch valid URLs', async () => {
      const urls = ['https://example.com'];
      const responses = await pageFetcher.fetchAll(urls);

      expect(responses.length).toBeGreaterThan(0);
      expect(responses[0].url).toBe('https://example.com');
    });

    it('should handle invalid URLs gracefully', async () => {
      const urls = ['https://this-domain-definitely-does-not-exist-12345.com'];
      const responses = await pageFetcher.fetchAll(urls);

      expect(responses.length).toBeGreaterThan(0);
      // Assert unconditionally: a guarded assertion would pass vacuously if
      // the fetcher reported no error at all for an unresolvable host.
      expect(responses[0].error).toBeDefined();
      expect(responses[0].error).toContain('Failed to fetch');
    });

    it('should handle multiple URLs', async () => {
      const urls = ['https://example.com', 'https://example.org'];
      const responses = await pageFetcher.fetchAll(urls);

      expect(responses.length).toBe(2);
    });

    it('should have timeout for slow requests', async () => {
      const slowFetcher = new PageFetcher(100, 0); // 100ms timeout, no retries
      const urls = ['https://httpbin.org/delay/5']; // This will timeout

      const responses = await slowFetcher.fetchAll(urls);
      expect(responses.length).toBeGreaterThan(0);

      // Assert unconditionally so the test fails when the timeout never fires.
      expect(responses[0].error).toBeDefined();
      expect(responses[0].error).toContain('timeout');
    }, 10000);
  });
});
|
|
@@ -1,153 +0,0 @@
|
|
|
1
|
-
import { validateUrl, validateUrls, RateLimiter, sanitizeText } from '../security';
|
|
2
|
-
|
|
3
|
-
describe('Security Module', () => {
|
|
4
|
-
// validateUrl: accepted schemes first, then each rejection reason, then trimming.
describe('validateUrl', () => {
  it('should validate a proper HTTPS URL', () => {
    const { isValid, sanitizedUrl } = validateUrl('https://example.com');
    expect(isValid).toBe(true);
    expect(sanitizedUrl).toBe('https://example.com/');
  });

  it('should validate a proper HTTP URL', () => {
    const { isValid } = validateUrl('http://example.com');
    expect(isValid).toBe(true);
  });

  it('should validate a file:// URL', () => {
    const { isValid } = validateUrl('file:///path/to/file.html');
    expect(isValid).toBe(true);
  });

  it('should reject empty URLs', () => {
    const { isValid, error } = validateUrl('');
    expect(isValid).toBe(false);
    expect(error).toContain('empty');
  });

  it('should reject URLs with javascript: protocol', () => {
    const { isValid, error } = validateUrl('javascript:alert(1)');
    expect(isValid).toBe(false);
    expect(error).toContain('suspicious');
  });

  it('should reject URLs with data: protocol', () => {
    const { isValid, error } = validateUrl('data:text/html,<script>alert(1)</script>');
    expect(isValid).toBe(false);
    expect(error).toContain('suspicious');
  });

  it('should reject URLs exceeding maximum length', () => {
    // 3000 path characters on top of the origin.
    const oversized = 'https://example.com/' + 'a'.repeat(3000);
    const { isValid, error } = validateUrl(oversized);
    expect(isValid).toBe(false);
    expect(error).toContain('exceeds maximum length');
  });

  it('should reject URLs with script tags', () => {
    const { isValid, error } = validateUrl('https://example.com/<script>alert(1)</script>');
    expect(isValid).toBe(false);
    expect(error).toContain('suspicious');
  });

  it('should reject invalid URL formats', () => {
    const { isValid, error } = validateUrl('not-a-valid-url');
    expect(isValid).toBe(false);
    expect(error).toContain('Invalid URL format');
  });

  it('should trim whitespace from URLs', () => {
    const { isValid, sanitizedUrl } = validateUrl(' https://example.com ');
    expect(isValid).toBe(true);
    expect(sanitizedUrl).toBe('https://example.com/');
  });
});
|
|
64
|
-
|
|
65
|
-
// validateUrls: batch validation splits the input into accepted URLs and per-URL errors.
describe('validateUrls', () => {
  it('should validate multiple URLs and separate valid from invalid', () => {
    const { validUrls, errors } = validateUrls([
      'https://example.com',
      'javascript:alert(1)',
      'http://test.com',
      'invalid-url',
    ]);

    expect(validUrls.length).toBe(2);
    expect(errors.length).toBe(2);
    expect(validUrls).toContain('https://example.com/');
    expect(validUrls).toContain('http://test.com/');
  });

  it('should return empty arrays for empty input', () => {
    const { validUrls, errors } = validateUrls([]);
    expect(validUrls.length).toBe(0);
    expect(errors.length).toBe(0);
  });
});
|
|
82
|
-
|
|
83
|
-
// RateLimiter(limit, windowMs): exercised through isAllowed() and getRemainingRequests().
describe('RateLimiter', () => {
  it('should allow requests within the limit', () => {
    const limiter = new RateLimiter(5, 1000);

    let pending = 5;
    while (pending > 0) {
      expect(limiter.isAllowed()).toBe(true);
      pending -= 1;
    }
  });

  it('should block requests exceeding the limit', () => {
    const limiter = new RateLimiter(3, 1000);

    // Consume the whole allowance first.
    let pending = 3;
    while (pending > 0) {
      limiter.isAllowed();
      pending -= 1;
    }

    // One more request than the limit must be refused.
    expect(limiter.isAllowed()).toBe(false);
  });

  it('should reset after the time window', async () => {
    const limiter = new RateLimiter(2, 100); // 100ms window

    limiter.isAllowed();
    limiter.isAllowed();
    expect(limiter.isAllowed()).toBe(false);

    // Sleep past the end of the window.
    await new Promise((resolve) => setTimeout(resolve, 150));

    // The allowance is available again.
    expect(limiter.isAllowed()).toBe(true);
  });

  it('should correctly report remaining requests', () => {
    const limiter = new RateLimiter(5, 1000);

    expect(limiter.getRemainingRequests()).toBe(5);

    limiter.isAllowed();
    expect(limiter.getRemainingRequests()).toBe(4);

    limiter.isAllowed();
    expect(limiter.getRemainingRequests()).toBe(3);
  });
});
|
|
128
|
-
|
|
129
|
-
// sanitizeText: HTML-escaping of text.
//
// NOTE(review): the expected strings in this block were reconstructed. The
// rendering this file was recovered from had decoded every HTML entity, which
// left assertions that compared the output with the raw input and an invalid
// ''' literal. The entity spellings used here (&lt; &gt; &quot; &#x27; &#x2F;)
// are an assumption - confirm them against sanitizeText in ../security.
describe('sanitizeText', () => {
  it('should sanitize HTML special characters', () => {
    const input = '<script>alert("XSS")</script>';
    const output = sanitizeText(input);
    expect(output).toBe('&lt;script&gt;alert(&quot;XSS&quot;)&lt;&#x2F;script&gt;');
  });

  it('should handle empty strings', () => {
    expect(sanitizeText('')).toBe('');
  });

  it('should escape quotes and apostrophes', () => {
    const input = `It's a "test"`;
    const output = sanitizeText(input);
    expect(output).toContain('&#x27;');
    expect(output).toContain('&quot;');
  });

  it('should escape forward slashes', () => {
    const input = '</script>';
    const output = sanitizeText(input);
    expect(output).toBe('&lt;&#x2F;script&gt;');
  });
});
|
|
153
|
-
});
|
|
@@ -1,21 +0,0 @@
|
|
|
1
|
-
import type { Page } from '../page/index.js';
|
|
2
|
-
import { JSDOM } from 'jsdom';
|
|
3
|
-
import { AbstractExtractor } from './AbstractExtractor.js';
|
|
4
|
-
|
|
5
|
-
/**
 * Reads the title and URL of a parsed document.
 */
export class PageExtractor extends AbstractExtractor<JSDOM, Page> {
  constructor() {
    super('page-extractor');
  }

  /**
   * @param value - the parsed page
   * @returns the document's `title` together with its `location.href` as `url`
   */
  async extract(value: JSDOM): Promise<Page> {
    const { document } = value.window;
    const title = document.title;
    const url = document.location.href;
    return { title, url };
  }
}
|
|
@@ -1,31 +0,0 @@
|
|
|
1
|
-
import type { JSDOM } from 'jsdom';
|
|
2
|
-
import {
|
|
3
|
-
findResourceLink,
|
|
4
|
-
findResourceText,
|
|
5
|
-
type ExternalResource,
|
|
6
|
-
type Resource,
|
|
7
|
-
type Tag,
|
|
8
|
-
} from '../resource.js';
|
|
9
|
-
import { AbstractExtractor } from './AbstractExtractor.js';
|
|
10
|
-
|
|
11
|
-
/**
 * Collects the resources referenced by a configurable set of tags in a
 * parsed document, keeping only those whose link URL starts with `http`.
 */
export class ResourceExtractor extends AbstractExtractor<JSDOM, ExternalResource[]> {
  /**
   * @param tags - tag names; each one is used as a `querySelectorAll` selector
   */
  constructor(private readonly tags: Tag[]) {
    // Previously passed 'page-extractor', copied from PageExtractor.
    // NOTE(review): presumably this string is only a label - confirm against AbstractExtractor.
    super('resource-extractor');
  }

  /**
   * @param value - the parsed page to scan
   * @returns one `{ text, link }` entry per matching element, grouped by tag
   *          in the order the tags were given
   */
  async extract(value: JSDOM): Promise<ExternalResource[]> {
    const { document } = value.window;
    const externalResources: ExternalResource[] = [];
    for (const tag of this.tags) {
      const elements = Array.from(document.querySelectorAll<Resource>(tag));
      for (const element of elements) {
        const text = findResourceText(element);
        const link = findResourceLink(element);
        // Skip elements for which either lookup returned nothing usable.
        if (!text || !link) continue;
        // Prefix test only: keeps http:// and https:// links, drops other schemes.
        if (!link.url.startsWith('http')) continue;
        externalResources.push({ text, link });
      }
    }
    return externalResources;
  }
}
|
|
@@ -1,13 +0,0 @@
|
|
|
1
|
-
import { JSDOM } from 'jsdom';
|
|
2
|
-
import type { Resource, Tag } from '../resource.js';
|
|
3
|
-
import { AbstractExtractor } from './AbstractExtractor.js';
|
|
4
|
-
|
|
5
|
-
/**
 * Returns every element of a parsed document that matches one tag name.
 */
export class TagExtractor<T extends Tag> extends AbstractExtractor<JSDOM, Resource[]> {
  /**
   * @param tagName - tag name used as the `querySelectorAll` selector
   */
  constructor(private readonly tagName: T) {
    super(`extract <${tagName}>`);
  }

  /**
   * @param value - the parsed page to query
   * @returns a promise resolved with the matching elements as an array
   */
  extract(value: JSDOM): Promise<Resource[]> {
    const { document } = value.window;
    const matches = document.querySelectorAll<Resource>(this.tagName);
    return Promise.resolve(Array.from(matches));
  }
}
|
package/src/extractors/index.ts
DELETED
package/src/main.ts
DELETED
|
@@ -1,71 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env node
|
|
2
|
-
import { Command, createArgument } from 'commander';
|
|
3
|
-
|
|
4
|
-
import pkg from '../package.json';
|
|
5
|
-
import { PageExtractor, ResourceExtractor } from './extractors/index.js';
|
|
6
|
-
import { PageFetcher, type PageMetadata } from './page/index.js';
|
|
7
|
-
import { JSONStylePrinter } from './printers/index.js';
|
|
8
|
-
import { validateUrls } from './security.js';
|
|
9
|
-
|
|
10
|
-
const { description, name, version } = pkg;

const program = new Command();

const url = createArgument(
  '<url | file...>',
  'remote https://URL or local file://resource.html to extract from'
);

/**
 * CLI action: validates the given URLs, fetches the valid ones, extracts page
 * metadata and resources from each response, and prints the results.
 *
 * Status and error messages are written with console.error. The process
 * exits with code 1 when no URL is valid or when an error is thrown.
 *
 * @param urls - the raw positional arguments from the command line
 */
async function extractAndPrint(urls: string[]): Promise<void> {
  try {
    const { validUrls, errors } = validateUrls(urls);

    // Report every rejected URL, then carry on with the valid ones.
    if (errors.length > 0) {
      console.error('\n❌ URL Validation Errors:');
      for (const { url: invalidUrl, error } of errors) {
        console.error(` - ${invalidUrl}: ${error}`);
      }
    }

    if (validUrls.length === 0) {
      console.error('\n❌ No valid URLs to process. Exiting.');
      process.exit(1);
    }

    console.error(`\n✅ Processing ${validUrls.length} valid URL(s)...`);

    const printer = new JSONStylePrinter();
    const pageFetcher = new PageFetcher();
    const pageExtractor = new PageExtractor();
    const resourceExtractor = new ResourceExtractor(['a', 'meta', 'link', 'embed']);

    const pageResponses = await pageFetcher.fetchAll(validUrls);
    const pageMetadatas: PageMetadata[] = [];

    for (const { content, url: responseUrl, error } of pageResponses) {
      // A response with an error, or without content, becomes a failure entry.
      if (error !== undefined || !content) {
        pageMetadatas.push({
          url: responseUrl,
          error: error ?? 'Unknown error',
          resources: [],
        });
        continue;
      }

      const resources = await resourceExtractor.extract(content);
      const page = await pageExtractor.extract(content);
      pageMetadatas.push({ ...page, resources });
    }

    await printer.print(...pageMetadatas);
  } catch (error) {
    console.error('\n❌ An error occurred:', error instanceof Error ? error.message : error);
    process.exit(1);
  }
}

(async (): Promise<void> => {
  program
    .name(name)
    .version(version, '-v, --version')
    .description(description)
    .addArgument(url)
    .action(extractAndPrint);

  await program.parseAsync(process.argv);
})();
|
package/src/page/Page.ts
DELETED
|
@@ -1,24 +0,0 @@
|
|
|
1
|
-
import type { ExternalResource } from '../resource.js';
|
|
2
|
-
|
|
3
|
-
/** Building block: carries the page title. */
type HasTitle = {
  title: string;
};

/** Building block: carries the page URL. */
type HasUrl = {
  url: string;
};

/** Building block: carries the resources extracted from the page. */
type HasResources = {
  resources: ExternalResource[];
};

/** Title and URL of a page. */
export type Page = HasTitle & HasUrl;

/** A page with title, URL and its resources. */
export type PageSuccess = Page & HasResources;

/** A page identified by URL only, carrying an error message and its resources. */
export type PageFailure = HasUrl & HasResources & { error: string };

/** Either outcome for one page. */
export type PageMetadata = PageSuccess | PageFailure;
|
|
20
|
-
|
|
21
|
-
/**
 * Type guard: true when the metadata object has an `error` property.
 */
export function isError(page: PageMetadata): page is PageFailure {
  return 'error' in page;
}
|
|
22
|
-
|
|
23
|
-
/**
 * Type guard: true when the metadata object has a string `title` and an
 * array of `resources`.
 */
export function isPage(page: PageMetadata): page is PageSuccess {
  if (!('title' in page)) {
    return false;
  }
  return typeof page.title === 'string' && Array.isArray(page.resources);
}
|
package/src/page/PageFetcher.ts
DELETED
|
@@ -1,81 +0,0 @@
|
|
|
1
|
-
import { JSDOM, VirtualConsole } from 'jsdom';
|
|
2
|
-
|
|
3
|
-
/**
 * Outcome of fetching one URL: `content` is set when the page was loaded,
 * `error` when it was not.
 */
interface PageResponse {
  url: string;
  content?: JSDOM;
  error?: string;
}

/**
 * Loads pages into JSDOM from `file://` paths or remote URLs, with a
 * per-attempt timeout and a bounded number of retries.
 */
export class PageFetcher {
  private readonly timeout: number;
  private readonly maxRetries: number;

  /**
   * @param timeout - milliseconds to wait for a single attempt (default 10000)
   * @param maxRetries - additional attempts after a retryable failure (default 2)
   */
  constructor(timeout = 10000, maxRetries = 2) {
    this.timeout = timeout;
    this.maxRetries = maxRetries;
  }

  /**
   * Fetches one URL. Never rejects: failures are returned as
   * `{ url, error }` once retries are exhausted or the error is not retryable.
   */
  private async fetchPage(url: string, retryCount = 0): Promise<PageResponse> {
    const virtualConsole = new VirtualConsole().on('jsdomError', (error: Error) => {
      process.stderr.write(`Error parsing ${url}: ${error.message}\n`);
    });

    try {
      let dom: Promise<JSDOM>;

      if (url.startsWith('file://')) {
        // Everything after the 7-character "file://" prefix is used as the path.
        dom = JSDOM.fromFile(url.substring(7), { virtualConsole });
      } else {
        dom = JSDOM.fromURL(url, {
          virtualConsole,
          resources: 'usable',
          runScripts: 'outside-only', // More secure - don't execute page scripts
          beforeParse(window) {
            // Prevent infinite loops and resource exhaustion
            window.setTimeout = (() => {
              throw new Error('setTimeout disabled for security');
            }) as typeof window.setTimeout;
            window.setInterval = (() => {
              throw new Error('setInterval disabled for security');
            }) as typeof window.setInterval;
          },
        });
      }

      const content = await this.withTimeout(dom);

      return { url, content };
    } catch (error) {
      const message = error instanceof Error ? error.message : 'Unknown error';

      // Retry logic for transient errors
      if (retryCount < this.maxRetries && this.isRetryableError(message)) {
        process.stderr.write(`Retrying ${url} (attempt ${retryCount + 1}/${this.maxRetries})...\n`);
        await this.delay(1000 * (retryCount + 1)); // Linear backoff: 1s, 2s, 3s, ...
        return this.fetchPage(url, retryCount + 1);
      }

      return { url, error: `Failed to fetch: ${message}` };
    }
  }

  /**
   * Races `work` against a timer of `this.timeout` milliseconds and rejects
   * with `Error('Request timeout')` if the timer wins.
   *
   * The timer is cleared as soon as the race settles. Without that, a pending
   * timer would keep the Node.js event loop alive for up to `timeout` ms after
   * every fetch, delaying process exit.
   */
  private async withTimeout<T>(work: Promise<T>): Promise<T> {
    let timer: ReturnType<typeof setTimeout> | undefined;
    const expiry = new Promise<never>((_, reject) => {
      timer = setTimeout(() => reject(new Error('Request timeout')), this.timeout);
    });

    try {
      return await Promise.race([work, expiry]);
    } finally {
      clearTimeout(timer);
    }
  }

  /** True when the error message matches one of the transient-failure patterns. */
  private isRetryableError(message: string): boolean {
    const retryablePatterns = [/timeout/i, /ECONNRESET/i, /ETIMEDOUT/i, /ENOTFOUND/i, /network/i];
    return retryablePatterns.some((pattern) => pattern.test(message));
  }

  /** Resolves after `ms` milliseconds. */
  private delay(ms: number): Promise<void> {
    return new Promise((resolve) => setTimeout(resolve, ms));
  }

  /**
   * Fetches all URLs concurrently.
   *
   * @param urls - URLs to load
   * @returns the responses in input order, keeping those that carry either
   *          content or an error
   */
  async fetchAll(urls: string[]): Promise<PageResponse[]> {
    const responses = await Promise.all(urls.map((url) => this.fetchPage(url)));
    return responses.filter((response) => response.content !== undefined || response.error);
  }
}
|
package/src/page/index.ts
DELETED