smippo 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +116 -0
- package/bin/smippo.js +5 -0
- package/package.json +100 -0
- package/src/cli.js +437 -0
- package/src/crawler.js +408 -0
- package/src/filter.js +155 -0
- package/src/index.js +60 -0
- package/src/interactive.js +391 -0
- package/src/link-extractor.js +212 -0
- package/src/link-rewriter.js +293 -0
- package/src/manifest.js +163 -0
- package/src/page-capture.js +151 -0
- package/src/progress.js +190 -0
- package/src/resource-saver.js +210 -0
- package/src/robots.js +104 -0
- package/src/screenshot.js +185 -0
- package/src/server.js +603 -0
- package/src/utils/logger.js +74 -0
- package/src/utils/path.js +76 -0
- package/src/utils/url.js +295 -0
- package/src/utils/version.js +14 -0
|
@@ -0,0 +1,293 @@
|
|
|
1
|
+
// @flow
|
|
2
|
+
import {load} from 'cheerio';
|
|
3
|
+
import {getRelativePath} from './utils/path.js';
|
|
4
|
+
import {urlToPath, resolveUrl} from './utils/url.js';
|
|
5
|
+
|
|
6
|
+
/**
 * Rewrite links in HTML to point to local files.
 *
 * URL-bearing attributes (`href`, `src`, `srcset`, `poster`, `data`), inline
 * `style` attributes and `<style>` elements are resolved against `pageUrl`
 * and looked up in `urlMap`. When a saved copy is found, the reference is
 * replaced with `getRelativePath(pagePath, target)`; references without a
 * match are left untouched.
 *
 * @param {string} html - Markup of the captured page.
 * @param {string} pageUrl - URL the page was captured from; relative
 *   references are resolved against it.
 * @param {Map<string, string>} urlMap - Absolute URL -> local path of the
 *   saved copy.
 * @param {Object} [_options] - `structure` is forwarded to `urlToPath`;
 *   a truthy `noJs` strips scripts, the listed inline event handlers and
 *   `modulepreload` links.
 * @returns {string} The serialized, rewritten document.
 */
export function rewriteLinks(html, pageUrl, urlMap, _options = {}) {
  const $ = load(html, {decodeEntities: false});
  const pagePath = urlToPath(pageUrl, _options.structure);

  // Strip all scripts if --no-js flag is set
  if (_options.noJs) {
    const handlerAttributes = [
      'onclick',
      'onload',
      'onerror',
      'onmouseover',
      'onmouseout',
      'onkeydown',
      'onkeyup',
      'onsubmit',
      'onchange',
      'onfocus',
      'onblur',
    ];

    $('script').remove();

    const withHandlers = $(
      handlerAttributes.map(name => `[${name}]`).join(', '),
    );
    for (const name of handlerAttributes) {
      withHandlers.removeAttr(name);
    }

    $('link[rel="modulepreload"]').remove();
  }

  // Resolve a raw attribute value to a path relative to this page, or null
  // when no saved copy is known for it.
  const getLocalPath = url => {
    if (!url) return null;

    // The fragment only addresses a position inside the target document, so
    // it must not take part in the lookup. It is re-attached afterwards so
    // in-page anchors and SVG sprite ids keep working offline.
    const hashIndex = url.indexOf('#');
    const fragment = hashIndex === -1 ? '' : url.slice(hashIndex);
    const reference = hashIndex === -1 ? url : url.slice(0, hashIndex);
    if (!reference) return null;

    // Resolve relative URLs (including absolute paths starting with /)
    const absoluteUrl = resolveUrl(reference, pageUrl);
    if (!absoluteUrl) return null;

    // Lookup keys, most specific first: exact, without trailing slash,
    // directory index, without query string.
    const candidates = [absoluteUrl, absoluteUrl.replace(/\/$/, '')];
    if (absoluteUrl.endsWith('/')) {
      candidates.push(`${absoluteUrl}index.html`);
    }
    const urlWithoutQuery = absoluteUrl.split('?')[0];
    if (urlWithoutQuery !== absoluteUrl) {
      candidates.push(urlWithoutQuery);
    }

    const key = candidates.find(candidate => urlMap.has(candidate));
    if (key === undefined) return null;

    return `${getRelativePath(pagePath, urlMap.get(key))}${fragment}`;
  };

  // Selector / attribute pairs that each hold exactly one URL.
  const singleUrlAttributes = [
    ['a[href]', 'href'],
    ['link[href]', 'href'],
    ['script[src]', 'src'],
    ['img[src]', 'src'],
    ['video[src], audio[src], source[src]', 'src'],
    ['video[poster]', 'poster'],
    ['iframe[src]', 'src'],
    ['object[data]', 'data'],
  ];

  for (const [selector, attribute] of singleUrlAttributes) {
    $(selector).each((_, el) => {
      const node = $(el);
      const value = node.attr(attribute);
      if (shouldSkipUrl(value)) return;

      const localPath = getLocalPath(value);
      if (localPath) {
        node.attr(attribute, localPath);
      }
    });
  }

  // Rewrite <img srcset> / <source srcset>
  $('img[srcset], source[srcset]').each((_, el) => {
    const srcset = $(el).attr('srcset');
    if (!srcset) return;

    $(el).attr('srcset', rewriteSrcset(srcset, pageUrl, urlMap, _options));
  });

  // Rewrite style attributes
  $('[style]').each((_, el) => {
    const style = $(el).attr('style');
    $(el).attr(
      'style',
      rewriteCssUrls(style, pageUrl, urlMap, pagePath, _options),
    );
  });

  // Rewrite inline <style> tags
  $('style').each((_, el) => {
    const css = $(el).html();
    $(el).html(rewriteCssUrls(css, pageUrl, urlMap, pagePath, _options));
  });

  return $.html();
}
|
|
190
|
+
|
|
191
|
+
/**
 * Rewrite URLs in CSS content.
 *
 * Every `url(...)` reference and quoted `@import "..."` target is resolved
 * against `baseUrl` and looked up in `urlMap`. Matches are replaced with
 * `getRelativePath(pagePath, target)`; everything else is returned verbatim.
 *
 * @param {string} css - Stylesheet text or inline style value. Falsy input
 *   is returned unchanged.
 * @param {string} baseUrl - URL the references are resolved against.
 * @param {Map<string, string>} urlMap - Absolute URL -> local path.
 * @param {string} pagePath - Local path the relative results are computed from.
 * @param {Object} [_options] - Currently unused.
 * @returns {string} CSS with known references pointing at local files.
 */
export function rewriteCssUrls(css, baseUrl, urlMap, pagePath, _options = {}) {
  if (!css) return css;

  // Map a raw CSS reference to its local relative form, or null if unknown.
  const toLocalReference = rawUrl => {
    // `url( foo.png )` captures the padding along with the URL.
    const trimmed = rawUrl.trim();

    // Keep the fragment (e.g. `font.svg#id`, `sprite.svg#icon`) out of the
    // lookup and put it back on the rewritten reference.
    const hashIndex = trimmed.indexOf('#');
    const fragment = hashIndex === -1 ? '' : trimmed.slice(hashIndex);
    const reference = hashIndex === -1 ? trimmed : trimmed.slice(0, hashIndex);
    if (!reference) return null;

    const absoluteUrl = resolveUrl(reference, baseUrl);
    if (!absoluteUrl) return null;

    // Lookup keys in priority order: exact, without query, without
    // trailing slash.
    const candidates = [absoluteUrl];
    const urlWithoutQuery = absoluteUrl.split('?')[0];
    if (urlWithoutQuery !== absoluteUrl) {
      candidates.push(urlWithoutQuery);
    }
    candidates.push(absoluteUrl.replace(/\/$/, ''));

    const key = candidates.find(candidate => urlMap.has(candidate));
    if (key === undefined) return null;

    const targetPath = urlMap.get(key);
    if (!targetPath) return null;

    return `${getRelativePath(pagePath, targetPath)}${fragment}`;
  };

  // Rewrite url() references
  const withUrls = css.replace(
    /url\s*\(\s*['"]?([^'")]+)['"]?\s*\)/gi,
    (match, url) => {
      if (shouldSkipUrl(url)) return match;

      const local = toLocalReference(url);
      return local ? `url("${local}")` : match;
    },
  );

  // Rewrite @import references
  return withUrls.replace(/@import\s+['"]([^'"]+)['"]/gi, (match, url) => {
    if (shouldSkipUrl(url)) return match;

    const local = toLocalReference(url);
    return local ? `@import "${local}"` : match;
  });
}
|
|
249
|
+
|
|
250
|
+
/**
 * Rewrite srcset attribute.
 *
 * The value is tokenized following the HTML srcset rules: a URL runs up to
 * the next whitespace, and a comma only separates candidates when it ends a
 * URL or follows the descriptors. Splitting on every comma would break
 * `data:` URIs and URLs that contain commas.
 *
 * Each URL with a saved copy in `urlMap` is replaced by
 * `getRelativePath(pagePath, target)`; other candidates keep their URL.
 * Descriptors are preserved and candidates are re-joined with `", "`.
 *
 * @param {string} srcset - Raw attribute value.
 * @param {string} pageUrl - URL relative candidates are resolved against.
 * @param {Map<string, string>} urlMap - Absolute URL -> local path.
 * @param {Object} [options] - `structure` is forwarded to `urlToPath`.
 * @returns {string} The rewritten srcset value.
 */
function rewriteSrcset(srcset, pageUrl, urlMap, options = {}) {
  const pagePath = urlToPath(pageUrl, options.structure);
  const length = srcset.length;
  const candidates = [];
  let pos = 0;

  while (pos < length) {
    // Skip separators between candidates.
    while (pos < length && /[\s,]/.test(srcset[pos])) pos++;
    if (pos >= length) break;

    // The URL is everything up to the next whitespace.
    const urlStart = pos;
    while (pos < length && !/\s/.test(srcset[pos])) pos++;
    let url = srcset.slice(urlStart, pos);
    let descriptor = '';

    if (url.endsWith(',')) {
      // A trailing comma terminates a candidate that has no descriptor.
      url = url.replace(/,+$/, '');
    } else {
      // Descriptors run up to the next comma outside parentheses.
      const descriptorStart = pos;
      let depth = 0;
      while (pos < length) {
        const ch = srcset[pos];
        if (ch === '(') {
          depth++;
        } else if (ch === ')' && depth > 0) {
          depth--;
        } else if (ch === ',' && depth === 0) {
          break;
        }
        pos++;
      }
      descriptor = srcset.slice(descriptorStart, pos).trim();
    }

    candidates.push({url, descriptor});
  }

  return candidates
    .map(({url, descriptor}) => {
      let target = url;

      if (!shouldSkipUrl(url)) {
        const absoluteUrl = resolveUrl(url, pageUrl);
        if (absoluteUrl) {
          // Exact match first, then the same fallbacks used for plain
          // src/href attributes (trailing slash, cache-busting query).
          const key = [
            absoluteUrl,
            absoluteUrl.replace(/\/$/, ''),
            absoluteUrl.split('?')[0],
          ].find(candidate => urlMap.has(candidate));

          if (key !== undefined) {
            target = getRelativePath(pagePath, urlMap.get(key));
          }
        }
      }

      return descriptor ? `${target} ${descriptor}` : target;
    })
    .join(', ');
}
|
|
273
|
+
|
|
274
|
+
/**
 * Decide whether a reference must be left alone by the rewriter.
 *
 * Empty values are skipped, as is anything that (after trimming and
 * lower-casing) starts with a non-fetchable scheme or is a bare fragment.
 *
 * @param {string} url - Raw attribute or CSS reference.
 * @returns {boolean} `true` when the reference should not be rewritten.
 */
function shouldSkipUrl(url) {
  if (!url) {
    return true;
  }

  const normalized = url.trim().toLowerCase();

  for (const prefix of [
    'javascript:',
    'mailto:',
    'tel:',
    'data:',
    '#',
    'blob:',
    'about:',
  ]) {
    if (normalized.startsWith(prefix)) {
      return true;
    }
  }

  return false;
}
|
package/src/manifest.js
ADDED
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
import fs from 'fs-extra';
|
|
2
|
+
import path from 'path';
|
|
3
|
+
|
|
4
|
+
// Name of the metadata directory; every path built in this module joins it
// directly under the caller-supplied output directory.
const SMIPPO_DIR = '.smippo';
|
|
5
|
+
// File name of the manifest stored inside SMIPPO_DIR.
const MANIFEST_FILE = 'manifest.json';
|
|
6
|
+
// File name of the cache stored inside SMIPPO_DIR.
const CACHE_FILE = 'cache.json';
|
|
7
|
+
|
|
8
|
+
/**
 * Report whether a manifest file is present under the output directory.
 *
 * @param {string} outputDir - Root directory of a capture.
 * @returns {boolean} `true` when `<outputDir>/.smippo/manifest.json` exists.
 */
export function manifestExists(outputDir) {
  const segments = [outputDir, SMIPPO_DIR, MANIFEST_FILE];
  return fs.existsSync(path.join(...segments));
}
|
|
15
|
+
|
|
16
|
+
/**
 * Read the manifest file.
 *
 * The file is read directly instead of being probed with `existsSync`
 * first: that avoids a blocking call inside an async function and the race
 * where the file disappears between the check and the read.
 *
 * @param {string} outputDir - Root directory of a capture.
 * @returns {Promise<Object|null>} The parsed manifest, or `null` when no
 *   manifest file exists.
 * @throws {SyntaxError} When the file exists but is not valid JSON; the
 *   message names the offending file and the parser error is kept as `cause`.
 * @throws {Error} Any other read failure (e.g. permissions) is re-thrown.
 */
export async function readManifest(outputDir) {
  const manifestPath = path.join(outputDir, SMIPPO_DIR, MANIFEST_FILE);

  let content;
  try {
    content = await fs.readFile(manifestPath, 'utf8');
  } catch (error) {
    // A missing file or missing parent directory just means "no manifest yet".
    if (error?.code === 'ENOENT' || error?.code === 'ENOTDIR') {
      return null;
    }
    throw error;
  }

  try {
    return JSON.parse(content);
  } catch (error) {
    throw new SyntaxError(
      `Invalid manifest JSON in ${manifestPath}: ${error.message}`,
      {cause: error},
    );
  }
}
|
|
29
|
+
|
|
30
|
+
/**
 * Persist the manifest as pretty-printed JSON.
 *
 * The metadata directory is created first if needed, then the manifest is
 * serialized with two-space indentation and written as UTF-8.
 *
 * @param {string} outputDir - Root directory of a capture.
 * @param {Object} manifest - Manifest object to serialize.
 * @returns {Promise<void>}
 */
export async function writeManifest(outputDir, manifest) {
  const metadataDir = path.join(outputDir, SMIPPO_DIR);
  await fs.ensureDir(metadataDir);

  const target = path.join(metadataDir, MANIFEST_FILE);
  const serialized = JSON.stringify(manifest, null, 2);
  await fs.writeFile(target, serialized, 'utf8');
}
|
|
40
|
+
|
|
41
|
+
/**
 * Read the cache file.
 *
 * The file is read directly rather than probed with `existsSync`, which
 * avoids blocking inside an async function and the check-then-read race.
 * The three lookup tables are always present on the result, so a cache
 * file that lacks one of them does not yield `undefined` tables.
 *
 * @param {string} outputDir - Root directory of a capture.
 * @returns {Promise<{etags: Object, lastModified: Object, contentTypes: Object}>}
 *   The stored cache, or an empty cache when no cache file exists or the
 *   stored value is not a JSON object.
 * @throws {SyntaxError} When the file exists but is not valid JSON.
 * @throws {Error} Any other read failure is re-thrown.
 */
export async function readCache(outputDir) {
  const cachePath = path.join(outputDir, SMIPPO_DIR, CACHE_FILE);
  const emptyCache = () => ({
    etags: {},
    lastModified: {},
    contentTypes: {},
  });

  let content;
  try {
    content = await fs.readFile(cachePath, 'utf8');
  } catch (error) {
    // No cache yet: start from an empty one.
    if (error?.code === 'ENOENT' || error?.code === 'ENOTDIR') {
      return emptyCache();
    }
    throw error;
  }

  let parsed;
  try {
    parsed = JSON.parse(content);
  } catch (error) {
    throw new SyntaxError(
      `Invalid cache JSON in ${cachePath}: ${error.message}`,
      {cause: error},
    );
  }

  if (parsed === null || typeof parsed !== 'object' || Array.isArray(parsed)) {
    return emptyCache();
  }

  // Keep any extra keys, but guarantee the three tables exist.
  return {
    ...parsed,
    etags: parsed.etags ?? {},
    lastModified: parsed.lastModified ?? {},
    contentTypes: parsed.contentTypes ?? {},
  };
}
|
|
58
|
+
|
|
59
|
+
/**
 * Persist the cache as pretty-printed JSON.
 *
 * The metadata directory is created first if needed, then the cache is
 * serialized with two-space indentation and written as UTF-8.
 *
 * @param {string} outputDir - Root directory of a capture.
 * @param {Object} cache - Cache object to serialize.
 * @returns {Promise<void>}
 */
export async function writeCache(outputDir, cache) {
  const metadataDir = path.join(outputDir, SMIPPO_DIR);
  await fs.ensureDir(metadataDir);

  const target = path.join(metadataDir, CACHE_FILE);
  const serialized = JSON.stringify(cache, null, 2);
  await fs.writeFile(target, serialized, 'utf8');
}
|
|
69
|
+
|
|
70
|
+
/**
 * Create initial manifest.
 *
 * `created` and `updated` share one timestamp so a fresh manifest never
 * shows an update time later than its creation time.
 *
 * @param {string} url - Root URL of the capture.
 * @param {Object} [options] - Capture options; `depth`, `scope`,
 *   `stayInDir`, `externalAssets`, `include` and `exclude` are recorded.
 *   May be omitted.
 * @returns {Object} A manifest with zeroed stats and empty page/asset lists.
 */
export function createManifest(url, options = {}) {
  const settings = options ?? {};
  const now = new Date().toISOString();

  return {
    version: '0.0.1',
    created: now,
    updated: now,
    rootUrl: url,
    options: {
      depth: settings.depth,
      scope: settings.scope,
      stayInDir: settings.stayInDir,
      externalAssets: settings.externalAssets,
      filters: {
        include: settings.include || [],
        exclude: settings.exclude || [],
      },
    },
    stats: {
      pagesCapt: 0,
      assetsCapt: 0,
      totalSize: 0,
      duration: 0,
      errors: 0,
    },
    pages: [],
    assets: [],
  };
}
|
|
100
|
+
|
|
101
|
+
/**
 * Append a captured page to the manifest and update the running totals.
 *
 * @param {Object} manifest - Manifest to mutate; needs `pages` and `stats`.
 * @param {Object} page - Captured page (`url`, `localPath`, `status`, `size`,
 *   `title`). A falsy `status` is recorded as 200 and a falsy `size` adds
 *   nothing to `totalSize`.
 * @returns {void}
 */
export function addPageToManifest(manifest, page) {
  const entry = {
    url: page.url,
    localPath: page.localPath,
    status: page.status ? page.status : 200,
    captured: new Date().toISOString(),
    size: page.size,
    title: page.title,
  };
  manifest.pages.push(entry);

  const {stats} = manifest;
  stats.pagesCapt++;
  stats.totalSize += page.size ? page.size : 0;

  manifest.updated = new Date().toISOString();
}
|
|
118
|
+
|
|
119
|
+
/**
 * Append a captured asset to the manifest and update the running totals.
 *
 * @param {Object} manifest - Manifest to mutate; needs `assets` and `stats`.
 * @param {Object} asset - Captured asset (`url`, `localPath`, `mimeType`,
 *   `size`). A falsy `size` adds nothing to `totalSize`.
 * @returns {void}
 */
export function addAssetToManifest(manifest, asset) {
  const entry = {
    url: asset.url,
    localPath: asset.localPath,
    mimeType: asset.mimeType,
    size: asset.size,
  };
  manifest.assets.push(entry);

  const {stats} = manifest;
  stats.assetsCapt++;
  stats.totalSize += asset.size ? asset.size : 0;

  manifest.updated = new Date().toISOString();
}
|
|
134
|
+
|
|
135
|
+
/**
 * Count one more error in the manifest and refresh its `updated` stamp.
 *
 * Only the counter is recorded; the URL and error arguments are accepted
 * but not stored.
 *
 * @param {Object} manifest - Manifest to mutate; needs `stats.errors`.
 * @param {string} _url - Unused.
 * @param {*} _error - Unused.
 * @returns {void}
 */
export function addErrorToManifest(manifest, _url, _error) {
  const {stats} = manifest;
  stats.errors++;

  const timestamp = new Date().toISOString();
  manifest.updated = timestamp;
}
|
|
142
|
+
|
|
143
|
+
/**
 * Store the total run duration and refresh the manifest's `updated` stamp.
 *
 * @param {Object} manifest - Manifest to mutate; needs `stats`.
 * @param {number} duration - Value stored as-is in `stats.duration`.
 * @returns {void}
 */
export function finalizeManifest(manifest, duration) {
  const {stats} = manifest;
  stats.duration = duration;

  const timestamp = new Date().toISOString();
  manifest.updated = timestamp;
}
|
|
150
|
+
|
|
151
|
+
/**
 * Build the location of the HAR file for a capture.
 *
 * @param {string} outputDir - Root directory of a capture.
 * @returns {string} `<outputDir>/.smippo/network.har`.
 */
export function getHarPath(outputDir) {
  const metadataDir = path.join(outputDir, SMIPPO_DIR);
  return path.join(metadataDir, 'network.har');
}
|
|
157
|
+
|
|
158
|
+
/**
 * Build the location of the log file for a capture.
 *
 * @param {string} outputDir - Root directory of a capture.
 * @returns {string} `<outputDir>/.smippo/log.txt`.
 */
export function getLogPath(outputDir) {
  const metadataDir = path.join(outputDir, SMIPPO_DIR);
  return path.join(metadataDir, 'log.txt');
}
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
// @flow
|
|
2
|
+
import {extractLinks} from './link-extractor.js';
|
|
3
|
+
|
|
4
|
+
/**
 * Capture a single page with all its rendered content.
 *
 * Wraps a browser page object and collects the rendered HTML, title,
 * outgoing links, network resources and (optionally) a screenshot and PDF.
 */
export class PageCapture {
  /**
   * @param {Object} page - Browser page exposing `on`, `goto`, `url`,
   *   `content`, `title`, `screenshot`, `pdf` and `waitForTimeout`.
   * @param {Object} [options] - Capture options (`wait`, `timeout`,
   *   `waitTime`, `screenshot`, `pdf`, `mimeInclude`, `mimeExclude`,
   *   `maxSize`, `minSize`); also forwarded to `extractLinks`.
   */
  constructor(page, options = {}) {
    this.page = page;
    this.options = options;
    this.resources = new Map();
  }

  /**
   * Capture the page content and resources.
   *
   * The `response` listener is removed again before returning, so repeated
   * captures on one page do not stack listeners. Response bodies still being
   * read are awaited, so `resources` is complete when the promise resolves.
   *
   * @param {string} url - URL to navigate to.
   * @returns {Promise<Object>} `{url, requestedUrl, html, title, links,
   *   resources, screenshot, pdf, duration}`; `url` is the page URL after
   *   navigation and `duration` is in milliseconds.
   * @throws Re-throws the navigation error when the page did not end up on
   *   an `http` URL.
   */
  async capture(url) {
    const startTime = Date.now();

    // Set up resource collection, keeping every in-flight collection so it
    // can be drained before returning.
    const pending = [];
    const onResponse = response => {
      pending.push(this._collectResource(response));
    };
    const detach = () => {
      const remove = this.page.off ?? this.page.removeListener;
      remove?.call(this.page, 'response', onResponse);
    };
    this.page.on('response', onResponse);

    try {
      // Navigate to the page
      try {
        await this.page.goto(url, {
          waitUntil: this.options.wait || 'networkidle',
          timeout: this.options.timeout || 30000,
        });
      } catch (error) {
        // Handle navigation errors but continue if we got some content
        if (!this.page.url().startsWith('http')) {
          throw error;
        }
      }

      // Additional wait time if specified
      if (this.options.waitTime > 0) {
        await this.page.waitForTimeout(this.options.waitTime);
      }

      // Get the rendered HTML
      const html = await this.page.content();

      // Get page metadata
      const title = await this.page.title();
      const finalUrl = this.page.url();

      // Extract links from the rendered page
      const links = await extractLinks(this.page, finalUrl, this.options);

      // Take screenshot if requested
      let screenshot = null;
      if (this.options.screenshot) {
        screenshot = await this.page.screenshot({
          fullPage: true,
          type: 'png',
        });
      }

      // Generate PDF if requested
      let pdf = null;
      if (this.options.pdf) {
        pdf = await this.page.pdf({
          format: 'A4',
          printBackground: true,
        });
      }

      // Stop accepting new responses, then wait for the bodies already being
      // read. _collectResource never rejects, so Promise.all cannot fail.
      detach();
      await Promise.all(pending);

      return {
        url: finalUrl,
        requestedUrl: url,
        html,
        title,
        links,
        resources: this.resources,
        screenshot,
        pdf,
        duration: Date.now() - startTime,
      };
    } finally {
      // Also covers the error path; removing twice is harmless.
      detach();
    }
  }

  /**
   * Collect a resource from a network response.
   *
   * Failed responses (status outside 200-399), HTML documents, `data:` URLs
   * and anything rejected by the MIME or size filters are ignored. Errors
   * are deliberately swallowed: a resource that cannot be read must not
   * abort the page capture.
   *
   * @param {Object} response - Response exposing `url()`, `status()`,
   *   `headers()` and `body()`.
   * @returns {Promise<void>}
   */
  async _collectResource(response) {
    try {
      const url = response.url();
      const status = response.status();
      const headers = response.headers();
      const contentType = headers['content-type'] || '';

      // Skip failed requests
      if (status < 200 || status >= 400) return;

      // Skip the main HTML page (captured separately)
      if (contentType.includes('text/html')) return;

      // Skip data URLs
      if (url.startsWith('data:')) return;

      // Apply MIME type filters
      const {mimeExclude, mimeInclude} = this.options;
      if (
        mimeExclude?.length &&
        this._matchesMimeFilter(contentType, mimeExclude)
      ) {
        return;
      }
      if (
        mimeInclude?.length &&
        !this._matchesMimeFilter(contentType, mimeInclude)
      ) {
        return;
      }

      // Get the response body
      const body = await response.body().catch(() => null);
      if (!body) return;

      // Apply size filters
      if (this.options.maxSize && body.length > this.options.maxSize) return;
      if (this.options.minSize && body.length < this.options.minSize) return;

      this.resources.set(url, {
        url,
        status,
        contentType,
        size: body.length,
        body,
        headers,
      });
    } catch (error) {
      // Best effort: ignore resource collection errors.
    }
  }

  /**
   * Check if a content type matches a filter pattern.
   *
   * Comparison is case-insensitive and ignores parameters after `;` in the
   * content type. A filter ending in `/*` matches every subtype.
   *
   * @param {string} contentType - Raw `content-type` header value.
   * @param {string[]} filters - Exact MIME types or `type/*` wildcards.
   * @returns {boolean} `true` when any filter matches.
   */
  _matchesMimeFilter(contentType, filters) {
    const type = contentType.split(';')[0].trim().toLowerCase();

    return filters.some(rawFilter => {
      const filter = rawFilter.toLowerCase();
      if (filter.endsWith('/*')) {
        // 'image/*' -> prefix 'image/'
        return type.startsWith(filter.slice(0, -1));
      }
      // `type` never contains ';', so an exact comparison is sufficient.
      return type === filter;
    });
  }
}
|