smippo 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,293 @@
1
+ // @flow
2
+ import {load} from 'cheerio';
3
+ import {getRelativePath} from './utils/path.js';
4
+ import {urlToPath, resolveUrl} from './utils/url.js';
5
+
6
/**
 * Rewrite links in HTML to point to local files.
 *
 * Parses `html` with cheerio, optionally strips JavaScript, then replaces
 * every URL-bearing attribute (and CSS `url()` / `@import` reference in
 * `style` attributes and `<style>` tags) whose target is present in `urlMap`
 * with a path relative to the page's own local path. References that are not
 * in `urlMap` are left untouched.
 *
 * @param {string} html - Markup of the page to rewrite.
 * @param {string} pageUrl - URL of the page; base for resolving references.
 * @param {Map<string, string>} urlMap - Absolute URL -> local path (only
 *   `has`/`get` are used).
 * @param {object} [_options]
 * @param {*} [_options.structure] - Forwarded to `urlToPath`.
 * @param {boolean} [_options.noJs] - Remove `<script>` tags, inline `on*`
 *   handlers and `modulepreload` links.
 * @returns {string} The serialized, rewritten document.
 */
export function rewriteLinks(html, pageUrl, urlMap, _options = {}) {
  const $ = load(html, {decodeEntities: false});
  const pagePath = urlToPath(pageUrl, _options.structure);

  // Strip all scripts if --no-js flag is set
  if (_options.noJs) {
    $('script').remove();

    // Remove every inline event handler. A fixed allow-list of attribute
    // names would let handlers such as oninput/ondblclick/ontouchstart
    // through, so match on the `on` prefix instead.
    $('*').each((_, el) => {
      for (const name of Object.keys(el.attribs || {})) {
        if (name.toLowerCase().startsWith('on')) {
          $(el).removeAttr(name);
        }
      }
    });

    $('link[rel="modulepreload"]').remove();
  }

  // Return the urlMap key that corresponds to `absoluteUrl`, trying the exact
  // URL, then without a trailing slash, then with index.html appended, then
  // without the query string. Returns null when nothing matches.
  const findMappedKey = absoluteUrl => {
    if (urlMap.has(absoluteUrl)) return absoluteUrl;

    const withoutSlash = absoluteUrl.replace(/\/$/, '');
    if (urlMap.has(withoutSlash)) return withoutSlash;

    if (absoluteUrl.endsWith('/')) {
      const indexUrl = `${absoluteUrl}index.html`;
      if (urlMap.has(indexUrl)) return indexUrl;
    }

    const withoutQuery = absoluteUrl.split('?')[0];
    if (withoutQuery !== absoluteUrl && urlMap.has(withoutQuery)) {
      return withoutQuery;
    }

    return null;
  };

  // Helper to get relative path for a URL (null when the URL is not mapped)
  const getLocalPath = url => {
    if (!url) return null;

    // Resolve relative URLs (including absolute paths starting with /)
    const absoluteUrl = resolveUrl(url, pageUrl);
    // resolveUrl's failure value is not visible from here; anything that is
    // not a string cannot be looked up.
    if (typeof absoluteUrl !== 'string') return null;

    let key = urlMap.has(absoluteUrl) ? absoluteUrl : null;
    let fragment = '';
    if (key === null) {
      // Look the resource up without its #fragment and put the fragment back
      // on the rewritten path so in-page anchors keep working.
      const hashAt = absoluteUrl.indexOf('#');
      if (hashAt !== -1) fragment = absoluteUrl.slice(hashAt);
      key = findMappedKey(
        hashAt === -1 ? absoluteUrl : absoluteUrl.slice(0, hashAt),
      );
    }
    if (key === null) return null;

    const relativePath = getRelativePath(pagePath, urlMap.get(key));
    return relativePath ? `${relativePath}${fragment}` : relativePath;
  };

  // Selector -> attribute pairs that hold a single URL.
  const urlAttributes = [
    ['a[href]', 'href'],
    ['link[href]', 'href'],
    ['script[src]', 'src'],
    ['img[src]', 'src'],
    ['video[src], audio[src], source[src]', 'src'],
    ['video[poster]', 'poster'],
    ['iframe[src]', 'src'],
    ['object[data]', 'data'],
  ];

  for (const [selector, attribute] of urlAttributes) {
    $(selector).each((_, el) => {
      const value = $(el).attr(attribute);
      if (shouldSkipUrl(value)) return;

      const localPath = getLocalPath(value);
      if (localPath) {
        $(el).attr(attribute, localPath);
      }
    });
  }

  // Rewrite <img srcset> / <source srcset>
  $('img[srcset], source[srcset]').each((_, el) => {
    const srcset = $(el).attr('srcset');
    if (!srcset) return;

    $(el).attr('srcset', rewriteSrcset(srcset, pageUrl, urlMap, _options));
  });

  // Rewrite style attributes
  $('[style]').each((_, el) => {
    const style = $(el).attr('style');
    $(el).attr(
      'style',
      rewriteCssUrls(style, pageUrl, urlMap, pagePath, _options),
    );
  });

  // Rewrite inline <style> tags
  $('style').each((_, el) => {
    const css = $(el).html();
    $(el).html(rewriteCssUrls(css, pageUrl, urlMap, pagePath, _options));
  });

  return $.html();
}
190
+
191
/**
 * Rewrite URLs in CSS content.
 *
 * Replaces the target of every `url(...)` and quoted `@import "..."` reference
 * that can be found in `urlMap` with a path relative to `pagePath`.
 * References that are skipped by `shouldSkipUrl` or are not in the map are
 * returned unchanged.
 *
 * @param {string} css - CSS text (falsy values are returned as-is).
 * @param {string} baseUrl - Base URL used to resolve relative references.
 * @param {Map<string, string>} urlMap - Absolute URL -> local path.
 * @param {string} pagePath - Local path the rewritten references are made
 *   relative to.
 * @param {object} [_options] - Currently unused.
 * @returns {string} The rewritten CSS.
 */
export function rewriteCssUrls(css, baseUrl, urlMap, pagePath, _options = {}) {
  if (!css) return css;

  // Return the urlMap key for `absoluteUrl`: exact, then without the query
  // string, then without a trailing slash. Null when nothing matches.
  const findMappedKey = absoluteUrl => {
    if (urlMap.has(absoluteUrl)) return absoluteUrl;

    const withoutQuery = absoluteUrl.split('?')[0];
    if (withoutQuery !== absoluteUrl && urlMap.has(withoutQuery)) {
      return withoutQuery;
    }

    const withoutSlash = absoluteUrl.replace(/\/$/, '');
    if (urlMap.has(withoutSlash)) return withoutSlash;

    return null;
  };

  // Map a raw CSS reference to its page-relative local form, or null.
  const toLocalReference = rawUrl => {
    // The url() pattern below can capture whitespace before the closing
    // parenthesis, so trim before resolving.
    const absoluteUrl = resolveUrl(rawUrl.trim(), baseUrl);
    // resolveUrl's failure value is not visible from here.
    if (typeof absoluteUrl !== 'string') return null;

    let key = urlMap.has(absoluteUrl) ? absoluteUrl : null;
    let fragment = '';
    if (key === null) {
      // References such as url(font.svg#face) or url(sprite.svg#icon) point
      // into a file: look the file up without the fragment and keep the
      // fragment on the rewritten reference.
      const hashAt = absoluteUrl.indexOf('#');
      if (hashAt !== -1) fragment = absoluteUrl.slice(hashAt);
      key = findMappedKey(
        hashAt === -1 ? absoluteUrl : absoluteUrl.slice(0, hashAt),
      );
    }
    if (key === null) return null;

    const targetPath = urlMap.get(key);
    if (!targetPath) return null;

    return `${getRelativePath(pagePath, targetPath)}${fragment}`;
  };

  // Rewrite url() references
  const withUrls = css.replace(
    /url\s*\(\s*['"]?([^'")]+)['"]?\s*\)/gi,
    (match, url) => {
      if (shouldSkipUrl(url)) return match;

      const local = toLocalReference(url);
      return local === null ? match : `url("${local}")`;
    },
  );

  // Rewrite @import references (the `@import url(...)` form is already
  // covered by the url() pass above)
  return withUrls.replace(/@import\s+['"]([^'"]+)['"]/gi, (match, url) => {
    if (shouldSkipUrl(url)) return match;

    const local = toLocalReference(url);
    return local === null ? match : `@import "${local}"`;
  });
}
249
+
250
/**
 * Rewrite srcset attribute.
 *
 * Splits `srcset` into image candidates, replaces each candidate URL that is
 * present in `urlMap` with a path relative to the page's local path, and
 * joins the candidates back together with ", ". Candidates that are skipped
 * by `shouldSkipUrl` or are not in the map keep their original URL.
 *
 * @param {string} srcset - Raw attribute value.
 * @param {string} pageUrl - URL of the page; base for resolving candidates.
 * @param {Map<string, string>} urlMap - Absolute URL -> local path.
 * @param {object} options
 * @param {*} [options.structure] - Forwarded to `urlToPath`.
 * @returns {string} The rewritten attribute value.
 */
function rewriteSrcset(srcset, pageUrl, urlMap, options) {
  const pagePath = urlToPath(pageUrl, options.structure);

  return splitSrcsetCandidates(srcset)
    .map(({url, descriptor}) => {
      let rewrittenUrl = url;

      if (!shouldSkipUrl(url)) {
        const absoluteUrl = resolveUrl(url, pageUrl);
        if (urlMap.has(absoluteUrl)) {
          rewrittenUrl = getRelativePath(pagePath, urlMap.get(absoluteUrl));
        }
      }

      return descriptor ? `${rewrittenUrl} ${descriptor}` : rewrittenUrl;
    })
    .join(', ');
}

/**
 * Split a srcset value into `{url, descriptor}` candidates.
 *
 * Modelled on the HTML srcset parsing rules: a URL is a run of non-whitespace
 * characters, so commas inside a URL (for example in a `data:` URI) do not
 * split it; only a comma at the end of the URL, or a comma after the
 * descriptors, separates candidates. A naive split(',') would break such
 * URLs apart.
 *
 * @param {string} srcset
 * @returns {Array<{url: string, descriptor: string}>}
 */
function splitSrcsetCandidates(srcset) {
  const candidates = [];
  const text = String(srcset);
  const isSpace = ch => /\s/.test(ch);
  let pos = 0;

  while (pos < text.length) {
    // Skip whitespace and stray commas between candidates.
    while (pos < text.length && (isSpace(text[pos]) || text[pos] === ',')) {
      pos++;
    }
    if (pos >= text.length) break;

    // The URL runs up to the next whitespace character.
    let start = pos;
    while (pos < text.length && !isSpace(text[pos])) pos++;
    let url = text.slice(start, pos);
    let descriptor = '';

    if (url.endsWith(',')) {
      // Trailing comma(s) terminate a candidate that has no descriptor.
      url = url.replace(/,+$/, '');
    } else {
      // Everything up to the next top-level comma is the descriptor list.
      start = pos;
      let depth = 0;
      while (pos < text.length) {
        const ch = text[pos];
        if (ch === '(') {
          depth++;
        } else if (ch === ')' && depth > 0) {
          depth--;
        } else if (ch === ',' && depth === 0) {
          break;
        }
        pos++;
      }
      descriptor = text.slice(start, pos).trim();
    }

    if (url) candidates.push({url, descriptor});
  }

  return candidates;
}
273
+
274
// Prefixes of references that never point at a downloadable resource.
const SKIP_URL_PREFIXES = Object.freeze([
  'javascript:',
  'mailto:',
  'tel:',
  'data:',
  '#',
  'blob:',
  'about:',
]);

/**
 * Check if URL should be skipped.
 *
 * A URL is skipped when it is empty, blank, not a string, or (ignoring case
 * and surrounding whitespace) starts with one of SKIP_URL_PREFIXES.
 *
 * @param {*} url - Raw attribute or CSS reference value.
 * @returns {boolean} True when the value must be left untouched.
 */
function shouldSkipUrl(url) {
  if (!url || typeof url !== 'string') return true;

  // Normalize once instead of once per prefix.
  const normalized = url.trim().toLowerCase();
  if (normalized === '') return true;

  return SKIP_URL_PREFIXES.some(prefix => normalized.startsWith(prefix));
}
@@ -0,0 +1,163 @@
1
+ import fs from 'fs-extra';
2
+ import path from 'path';
3
+
4
/** Sub-directory of the output directory that holds the files below. */
const SMIPPO_DIR = '.smippo';

/** File name of the manifest stored inside SMIPPO_DIR. */
const MANIFEST_FILE = 'manifest.json';

/** File name of the cache data stored inside SMIPPO_DIR. */
const CACHE_FILE = 'cache.json';
7
+
8
/**
 * Check if a manifest exists.
 *
 * @param {string} outputDir - Output directory of a capture.
 * @returns {boolean} True when `<outputDir>/.smippo/manifest.json` exists.
 */
export function manifestExists(outputDir) {
  return fs.existsSync(path.join(outputDir, SMIPPO_DIR, MANIFEST_FILE));
}
15
+
16
/**
 * Read the manifest file.
 *
 * @param {string} outputDir - Output directory of a capture.
 * @returns {Promise<object|null>} The parsed manifest, or null when the
 *   manifest file does not exist. Rejects when the file cannot be read or
 *   does not contain valid JSON.
 */
export async function readManifest(outputDir) {
  const manifestPath = path.join(outputDir, SMIPPO_DIR, MANIFEST_FILE);
  const isPresent = fs.existsSync(manifestPath);

  if (isPresent) {
    const raw = await fs.readFile(manifestPath, 'utf8');
    return JSON.parse(raw);
  }

  return null;
}
29
+
30
/**
 * Write the manifest file.
 *
 * Creates the `.smippo` directory when needed and stores `manifest` as
 * 2-space indented JSON.
 *
 * @param {string} outputDir - Output directory of a capture.
 * @param {object} manifest - Manifest to serialize.
 * @returns {Promise<void>}
 */
export async function writeManifest(outputDir, manifest) {
  const targetDir = path.join(outputDir, SMIPPO_DIR);
  await fs.ensureDir(targetDir);

  const targetFile = path.join(targetDir, MANIFEST_FILE);
  const serialized = JSON.stringify(manifest, null, 2);
  await fs.writeFile(targetFile, serialized, 'utf8');
}
40
+
41
/**
 * Read the cache file.
 *
 * Always resolves to an object whose `etags`, `lastModified` and
 * `contentTypes` properties are objects: a missing file yields an empty
 * cache, and a file that lacks one of the sections (or stores a non-object
 * there) gets an empty section instead, so lookups on a section cannot hit
 * undefined or null. Any other properties in the file are preserved.
 *
 * @param {string} outputDir - Output directory of a capture.
 * @returns {Promise<{etags: object, lastModified: object, contentTypes: object}>}
 *   Rejects when the file cannot be read or does not contain valid JSON.
 */
export async function readCache(outputDir) {
  const cachePath = path.join(outputDir, SMIPPO_DIR, CACHE_FILE);
  const sections = ['etags', 'lastModified', 'contentTypes'];
  const isPlainObject = value =>
    value !== null && typeof value === 'object' && !Array.isArray(value);

  let stored = {};
  if (fs.existsSync(cachePath)) {
    const content = await fs.readFile(cachePath, 'utf8');
    const parsed = JSON.parse(content);
    if (isPlainObject(parsed)) stored = parsed;
  }

  const cache = {...stored};
  for (const section of sections) {
    if (!isPlainObject(cache[section])) cache[section] = {};
  }
  return cache;
}
58
+
59
/**
 * Write the cache file.
 *
 * Creates the `.smippo` directory when needed and stores `cache` as 2-space
 * indented JSON.
 *
 * @param {string} outputDir - Output directory of a capture.
 * @param {object} cache - Cache data to serialize.
 * @returns {Promise<void>}
 */
export async function writeCache(outputDir, cache) {
  const targetDir = path.join(outputDir, SMIPPO_DIR);
  await fs.ensureDir(targetDir);

  const targetFile = path.join(targetDir, CACHE_FILE);
  const serialized = JSON.stringify(cache, null, 2);
  await fs.writeFile(targetFile, serialized, 'utf8');
}
69
+
70
/**
 * Create initial manifest.
 *
 * @param {string} url - Root URL of the capture.
 * @param {object} [options] - Capture options; `depth`, `scope`, `stayInDir`,
 *   `externalAssets`, `include` and `exclude` are recorded.
 * @returns {object} A manifest with zeroed stats and empty `pages`/`assets`.
 */
export function createManifest(url, options) {
  const {depth, scope, stayInDir, externalAssets, include, exclude} =
    options ?? {};

  // Read the clock once so `created` and `updated` are guaranteed to be
  // identical on a fresh manifest.
  const now = new Date().toISOString();

  return {
    version: '0.0.1',
    created: now,
    updated: now,
    rootUrl: url,
    options: {
      depth,
      scope,
      stayInDir,
      externalAssets,
      filters: {
        include: include || [],
        exclude: exclude || [],
      },
    },
    stats: {
      pagesCapt: 0,
      assetsCapt: 0,
      totalSize: 0,
      duration: 0,
      errors: 0,
    },
    pages: [],
    assets: [],
  };
}
100
+
101
/**
 * Update manifest with captured page.
 *
 * Appends an entry to `manifest.pages`, bumps `stats.pagesCapt`, adds the
 * page size (when present) to `stats.totalSize` and refreshes
 * `manifest.updated`.
 *
 * @param {object} manifest - Manifest to mutate.
 * @param {object} page - Captured page (`url`, `localPath`, `status`, `size`,
 *   `title`); a falsy `status` is recorded as 200.
 * @returns {void}
 */
export function addPageToManifest(manifest, page) {
  // One clock read so the entry's `captured` stamp and the manifest's
  // `updated` stamp always agree.
  const now = new Date().toISOString();

  manifest.pages.push({
    url: page.url,
    localPath: page.localPath,
    status: page.status || 200,
    captured: now,
    size: page.size,
    title: page.title,
  });

  manifest.stats.pagesCapt++;
  manifest.stats.totalSize += page.size || 0;
  manifest.updated = now;
}
118
+
119
/**
 * Update manifest with captured asset.
 *
 * Appends an entry to `manifest.assets`, bumps `stats.assetsCapt`, adds the
 * asset size (when present) to `stats.totalSize` and refreshes
 * `manifest.updated`.
 *
 * @param {object} manifest - Manifest to mutate.
 * @param {object} asset - Captured asset (`url`, `localPath`, `mimeType`,
 *   `size`).
 * @returns {void}
 */
export function addAssetToManifest(manifest, asset) {
  const entry = {
    url: asset.url,
    localPath: asset.localPath,
    mimeType: asset.mimeType,
    size: asset.size,
  };
  manifest.assets.push(entry);

  const {stats} = manifest;
  stats.assetsCapt++;
  stats.totalSize += asset.size || 0;

  manifest.updated = new Date().toISOString();
}
134
+
135
/**
 * Record an error in manifest.
 *
 * Only the error counter and the `updated` timestamp change; the URL and the
 * error itself are accepted but not stored.
 *
 * @param {object} manifest - Manifest to mutate.
 * @param {string} _url - Unused.
 * @param {*} _error - Unused.
 * @returns {void}
 */
export function addErrorToManifest(manifest, _url, _error) {
  const {stats} = manifest;
  stats.errors++;

  const timestamp = new Date().toISOString();
  manifest.updated = timestamp;
}
142
+
143
/**
 * Finalize manifest with duration.
 *
 * @param {object} manifest - Manifest to mutate.
 * @param {number} duration - Value stored in `manifest.stats.duration`.
 * @returns {void}
 */
export function finalizeManifest(manifest, duration) {
  const {stats} = manifest;
  stats.duration = duration;

  const timestamp = new Date().toISOString();
  manifest.updated = timestamp;
}
150
+
151
/**
 * Get HAR file path.
 *
 * @param {string} outputDir - Output directory of a capture.
 * @returns {string} `<outputDir>/.smippo/network.har`, joined with `path.join`.
 */
export function getHarPath(outputDir) {
  const harFile = 'network.har';
  return path.join(outputDir, SMIPPO_DIR, harFile);
}
157
+
158
/**
 * Get log file path.
 *
 * @param {string} outputDir - Output directory of a capture.
 * @returns {string} `<outputDir>/.smippo/log.txt`, joined with `path.join`.
 */
export function getLogPath(outputDir) {
  const logFile = 'log.txt';
  return path.join(outputDir, SMIPPO_DIR, logFile);
}
@@ -0,0 +1,151 @@
1
+ // @flow
2
+ import {extractLinks} from './link-extractor.js';
3
+
4
/**
 * Capture a single page with all its rendered content.
 *
 * Wraps a browser page object. The methods used here are `on`, `off`, `goto`,
 * `url`, `content`, `title`, `screenshot`, `pdf` and `waitForTimeout`
 * (presumably a Playwright `Page` - confirm against the caller). Every
 * successful non-HTML response seen while capturing is stored in
 * `this.resources`, keyed by URL.
 */
export class PageCapture {
  /**
   * @param {object} page - Browser page to drive.
   * @param {object} [options] - Capture options read by this class: `wait`,
   *   `timeout`, `waitTime`, `screenshot`, `pdf`, `mimeInclude`,
   *   `mimeExclude`, `maxSize`, `minSize`. The whole object is also passed to
   *   `extractLinks`.
   */
  constructor(page, options = {}) {
    this.page = page;
    this.options = options;
    this.resources = new Map();
  }

  /**
   * Capture the page content and resources.
   *
   * @param {string} url - URL to navigate to.
   * @returns {Promise<object>} `{url, requestedUrl, html, title, links,
   *   resources, screenshot, pdf, duration}`; `screenshot`/`pdf` are null
   *   unless requested through the options.
   * @throws Re-throws the navigation error when the page did not end up on an
   *   http(s) URL.
   */
  async capture(url) {
    const startTime = Date.now();

    // Track every in-flight collection so they can be awaited before the
    // result is returned. Otherwise response bodies that are still being read
    // would be missing from `resources` and added after the caller got it.
    const inFlight = new Set();
    const onResponse = response => {
      const task = this._collectResource(response).finally(() => {
        inFlight.delete(task);
      });
      inFlight.add(task);
    };

    // Set up resource collection
    this.page.on('response', onResponse);

    try {
      // Navigate to the page
      try {
        await this.page.goto(url, {
          waitUntil: this.options.wait || 'networkidle',
          timeout: this.options.timeout || 30000,
        });
      } catch (error) {
        // Handle navigation errors but continue if we got some content
        if (!this.page.url().startsWith('http')) {
          throw error;
        }
      }

      // Additional wait time if specified
      if (this.options.waitTime > 0) {
        await this.page.waitForTimeout(this.options.waitTime);
      }

      // Get the rendered HTML and page metadata
      const html = await this.page.content();
      const title = await this.page.title();
      const finalUrl = this.page.url();

      // Extract links from the rendered page
      const links = await extractLinks(this.page, finalUrl, this.options);

      // Take screenshot if requested
      let screenshot = null;
      if (this.options.screenshot) {
        screenshot = await this.page.screenshot({
          fullPage: true,
          type: 'png',
        });
      }

      // Generate PDF if requested
      let pdf = null;
      if (this.options.pdf) {
        pdf = await this.page.pdf({
          format: 'A4',
          printBackground: true,
        });
      }

      // Let pending body reads finish, but never wait longer than the
      // navigation timeout (a response that never completes must not hang
      // the capture).
      await this._waitForPending(inFlight, this.options.timeout || 30000);

      return {
        url: finalUrl,
        requestedUrl: url,
        html,
        title,
        links,
        resources: this.resources,
        screenshot,
        pdf,
        duration: Date.now() - startTime,
      };
    } finally {
      // Detach the listener so repeated capture() calls on the same page do
      // not stack handlers.
      this.page.off('response', onResponse);
    }
  }

  /**
   * Wait until all `tasks` have settled or `limitMs` has elapsed, whichever
   * comes first.
   *
   * @param {Set<Promise<*>>} tasks - Pending promises.
   * @param {number} limitMs - Upper bound for the wait, in milliseconds.
   * @returns {Promise<void>}
   */
  async _waitForPending(tasks, limitMs) {
    if (tasks.size === 0) return;

    let timer;
    const deadline = new Promise(resolve => {
      timer = setTimeout(resolve, limitMs);
    });

    try {
      await Promise.race([Promise.allSettled([...tasks]), deadline]);
    } finally {
      clearTimeout(timer);
    }
  }

  /**
   * Collect a resource from a network response.
   *
   * Stores `{url, status, contentType, size, body, headers}` in
   * `this.resources` unless the response is a failure (status outside
   * 200-399), is HTML, is a `data:` URL, is rejected by the MIME or size
   * filters, or has no readable body. Never rejects.
   *
   * @param {object} response - Network response (`url()`, `status()`,
   *   `headers()`, `body()`).
   * @returns {Promise<void>}
   */
  async _collectResource(response) {
    try {
      const url = response.url();
      const status = response.status();
      const headers = response.headers();
      const contentType = headers['content-type'] || '';

      // Skip failed requests
      if (status < 200 || status >= 400) return;

      // Skip the main HTML page (captured separately)
      if (contentType.includes('text/html')) return;

      // Skip data URLs
      if (url.startsWith('data:')) return;

      // Apply MIME type filters
      if (this.options.mimeExclude?.length) {
        if (this._matchesMimeFilter(contentType, this.options.mimeExclude)) {
          return;
        }
      }

      if (this.options.mimeInclude?.length) {
        if (!this._matchesMimeFilter(contentType, this.options.mimeInclude)) {
          return;
        }
      }

      // Get the response body
      const body = await response.body().catch(() => null);
      if (!body) return;

      // Apply size filters
      if (this.options.maxSize && body.length > this.options.maxSize) return;
      if (this.options.minSize && body.length < this.options.minSize) return;

      this.resources.set(url, {
        url,
        status,
        contentType,
        size: body.length,
        body,
        headers,
      });
    } catch (error) {
      // Resource collection is best-effort: a single unreadable response must
      // not fail the whole page capture.
    }
  }

  /**
   * Check if a content type matches a filter pattern.
   *
   * Parameters after `;` in `contentType` are ignored and the comparison is
   * case-insensitive. A filter ending in `/*` matches every subtype of that
   * type; any other filter must equal the type exactly.
   *
   * @param {string} contentType - Raw Content-Type header value.
   * @param {string[]} filters - Patterns such as `image/*` or `text/css`.
   * @returns {boolean} True when at least one filter matches.
   */
  _matchesMimeFilter(contentType, filters) {
    const type = contentType.split(';')[0].trim().toLowerCase();

    return filters.some(filter => {
      const pattern = filter.toLowerCase();
      if (pattern.endsWith('/*')) {
        // Keep the slash so "image/*" matches "image/png" but not "imagex/png".
        return type.startsWith(pattern.slice(0, -1));
      }
      // `type` never contains ';' here, so an exact comparison is sufficient.
      return type === pattern;
    });
  }
}