smippo 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,391 @@
1
+ // @flow
2
+ import * as p from '@clack/prompts';
3
+ import chalk from 'chalk';
4
+ import figlet from 'figlet';
5
+ import gradient from 'gradient-string';
6
+
7
// Smippo brand gradient, running from blue through violet to pink
const smippoGradient = gradient([
  '#60a5fa', // blue
  '#a78bfa', // violet
  '#f472b6', // pink
]);
9
+
10
/**
 * Print the Smippo banner to stdout.
 *
 * Output is: a blank line, the figlet-rendered "Smippo" wordmark coloured
 * with the brand gradient, a dimmed tagline, and a trailing blank line.
 */
export function showBanner() {
  console.log('');

  // Render the ASCII-art wordmark first, then colour it in a separate step
  const wordmark = figlet.textSync('Smippo', {
    font: 'Small',
    horizontalLayout: 'default',
  });
  console.log(smippoGradient(wordmark));

  const tagline = chalk.dim(' Modern Website Copier • Powered by Playwright');
  console.log(tagline);
  console.log('');
}
26
+
27
/**
 * Print the full CLI help screen.
 *
 * Shows the banner, then the USAGE, COMMANDS, option groups and EXAMPLES
 * sections. Each section is a bold white heading, a blank line, its rows and
 * a trailing blank line. Everything is written with console.log.
 */
export function showHelp() {
  showBanner();

  // Drop absent (null) parts, then glue the remaining ones with single spaces
  const compose = parts => parts.filter(Boolean).join(' ');
  const heading = title => chalk.bold.white(title);

  // "<green name> <dim arg> description"
  const commandRow = (name, arg, text) =>
    compose([` ${chalk.green(name)}`, arg && chalk.dim(arg), text]);

  // "<yellow flags> <dim arg> description <dim note>"
  const optionRow = (flags, arg, text, note) =>
    compose([
      ` ${chalk.yellow(flags)}`,
      arg && chalk.dim(arg),
      text,
      note && chalk.dim(note),
    ]);

  // A dimmed comment line, the command line itself, then a blank separator
  const example = (comment, command, target, flags) => [
    chalk.dim(` # ${comment}`),
    compose([` ${chalk.cyan(command)}`, target, flags && chalk.yellow(flags)]),
    '',
  ];

  const sections = [
    [
      'USAGE',
      [
        ` ${chalk.cyan('smippo')} ${chalk.dim('[url]')} ${chalk.yellow('[options]')}`,
        ` ${chalk.cyan('smippo')} ${chalk.green('<command>')} ${chalk.yellow('[options]')}`,
      ],
    ],
    [
      'COMMANDS',
      [
        commandRow('capture', '<url>', 'Take a screenshot of a URL'),
        commandRow('serve', '[directory]', 'Serve captured site locally'),
        commandRow('continue', null, 'Resume interrupted capture'),
        commandRow('update', null, 'Update existing mirror'),
        commandRow('help', null, 'Show this help message'),
      ],
    ],
    [
      'CAPTURE OPTIONS',
      [
        optionRow('-o, --output', '<dir>', 'Output directory', '(default: ./site)'),
        optionRow('-d, --depth', '<n>', 'Recursion depth', '(0 = single page)'),
        optionRow('-s, --scope', '<type>', 'Link scope: subdomain|domain|tld|all', null),
        optionRow('--external-assets', null, 'Include assets from external domains', null),
        optionRow('--static', null, 'Strip JS for static offline viewing', null),
      ],
    ],
    [
      'FILTERING',
      [
        optionRow('-I, --include', '<glob>', 'Include URLs matching pattern', null),
        optionRow('-E, --exclude', '<glob>', 'Exclude URLs matching pattern', null),
        optionRow('--max-size', '<size>', 'Maximum file size', '(e.g., 10MB)'),
        optionRow('--mime-include', '<type>', 'Include MIME types', null),
        optionRow('--mime-exclude', '<type>', 'Exclude MIME types', null),
      ],
    ],
    [
      'BROWSER',
      [
        optionRow('--wait', '<strategy>', 'Wait: networkidle|load|domcontentloaded', null),
        optionRow('--wait-time', '<ms>', 'Extra wait time after load', null),
        optionRow('--timeout', '<ms>', 'Page load timeout', null),
        optionRow('--viewport', '<WxH>', 'Viewport size', '(default: 1920x1080)'),
        optionRow('--device', '<name>', 'Emulate device', null),
      ],
    ],
    [
      'OUTPUT',
      [
        optionRow('--screenshot', null, 'Take screenshot of each page', null),
        optionRow('--pdf', null, 'Save PDF of each page', null),
        optionRow('--har', null, 'Generate HAR file', '(default: true)'),
        optionRow('--no-har', null, 'Disable HAR generation', null),
      ],
    ],
    [
      'PERFORMANCE',
      [
        optionRow('-w, --workers', '<n>', 'Parallel workers', '(default: 8)'),
        optionRow('--max-pages', '<n>', 'Maximum pages to capture', null),
        optionRow('--rate-limit', '<ms>', 'Delay between requests', null),
      ],
    ],
    [
      'OTHER',
      [
        optionRow('-v, --verbose', null, 'Verbose output', null),
        optionRow('-q, --quiet', null, 'Minimal output', null),
        optionRow('--debug', null, 'Debug mode with visible browser', null),
        optionRow('--no-interaction', null, 'Non-interactive mode (for CI)', null),
      ],
    ],
  ];

  const examples = [
    example('Mirror a single page', 'smippo', 'https://example.com', null),
    example('Mirror a site 3 levels deep', 'smippo', 'https://example.com', '-d 3'),
    example(
      'Mirror as static HTML (strips JS for offline viewing)',
      'smippo',
      'https://example.com',
      '--static --external-assets',
    ),
    example('Take a screenshot', 'smippo capture', 'https://example.com', null),
    example('Full-page screenshot', 'smippo capture', 'https://example.com', '--full-page'),
    example(
      'Screenshot with device emulation',
      'smippo capture',
      'https://example.com',
      '--device "iPhone 13"',
    ),
    example('Serve captured site', 'smippo serve', './site', '--open'),
    example('Interactive guided mode', 'smippo', null, null),
  ];

  const lines = [];
  for (const [title, rows] of sections) {
    lines.push(heading(title), '', ...rows, '');
  }
  // Every example already ends with its own blank line, so no extra trailer
  lines.push(heading('EXAMPLES'), '', ...examples.flat());

  lines.forEach(line => console.log(line));
}
199
+
200
/**
 * Run the interactive guided capture wizard.
 *
 * Prompts for URL, crawl depth, link scope, external assets and static mode.
 * If the user opts into advanced options it also asks for output directory,
 * screenshots and worker count. A summary is shown and the user confirms
 * before the options are returned.
 *
 * Cancelling any prompt, or declining the final confirmation, prints
 * "Capture cancelled." and exits the process with code 0.
 *
 * @returns {Promise<Object>} Options with keys `url`, `output`, `depth`,
 *   `scope`, `externalAssets`, `static`, `screenshot` and `workers`.
 *   `depth` and `workers` are strings. `output` defaults to './site' and
 *   `workers` to '8'. `screenshot` is undefined when the advanced questions
 *   are skipped.
 */
export async function runInteractiveCapture() {
  showBanner();

  p.intro(chalk.bgCyan.black(' Capture Wizard '));

  const options = await p.group(
    {
      url: () =>
        p.text({
          message: 'What URL would you like to capture?',
          placeholder: 'https://example.com',
          validate: value => {
            if (!value) return 'URL is required';
            try {
              new URL(value);
            } catch {
              return 'Please enter a valid URL';
            }
          },
        }),

      depth: () =>
        p.select({
          message: 'How deep should we crawl?',
          options: [
            {value: '0', label: 'Single page only', hint: 'fastest'},
            {value: '1', label: '1 level deep', hint: 'links from main page'},
            {value: '2', label: '2 levels deep'},
            {
              value: '3',
              label: '3 levels deep',
              hint: 'recommended for small sites',
            },
            {value: 'custom', label: 'Custom depth...'},
          ],
        }),

      // Only asked when "Custom depth..." was picked above
      customDepth: ({results}) => {
        if (results.depth !== 'custom') return;
        return p.text({
          message: 'Enter custom depth:',
          placeholder: '5',
          // Digits only: a bare parseInt check would accept "5abc" or "-3"
          validate: v =>
            /^\d+$/.test(String(v ?? '').trim())
              ? undefined
              : 'Please enter a non-negative whole number',
        });
      },

      scope: () =>
        p.select({
          message: 'Link following scope:',
          options: [
            {
              value: 'domain',
              label: 'Same domain',
              hint: 'www.example.com + example.com',
            },
            {value: 'subdomain', label: 'Same subdomain only', hint: 'strict'},
            {value: 'all', label: 'All links', hint: 'careful - can be slow!'},
          ],
        }),

      externalAssets: () =>
        p.confirm({
          message: 'Include assets from external domains (CDNs, fonts)?',
          initialValue: true,
        }),

      static: () =>
        p.confirm({
          message: 'Strip JavaScript for static offline viewing?',
          initialValue: false,
        }),

      advanced: () =>
        p.confirm({
          message: 'Configure advanced options?',
          initialValue: false,
        }),

      // The remaining questions are skipped unless `advanced` was confirmed
      output: ({results}) => {
        if (!results.advanced) return;
        return p.text({
          message: 'Output directory:',
          placeholder: './site',
          initialValue: './site',
        });
      },

      screenshot: ({results}) => {
        if (!results.advanced) return;
        return p.confirm({
          message: 'Take screenshots of each page?',
          initialValue: false,
        });
      },

      workers: ({results}) => {
        if (!results.advanced) return;
        return p.select({
          message: 'Parallel workers:',
          options: [
            {
              value: '1',
              label: '1 worker',
              hint: 'slowest, safest for rate-limited sites',
            },
            {value: '4', label: '4 workers', hint: 'moderate'},
            {value: '8', label: '8 workers', hint: 'default'},
            {value: '16', label: '16 workers', hint: 'fast, use with caution'},
          ],
        });
      },
    },
    {
      onCancel: () => {
        p.cancel('Capture cancelled.');
        process.exit(0);
      },
    },
  );

  // Build final options, filling in defaults for skipped advanced questions
  const finalOptions = {
    url: options.url,
    output: options.output || './site',
    // A custom depth, when given, overrides the 'custom' placeholder value
    depth: options.customDepth?.trim() || options.depth,
    scope: options.scope,
    externalAssets: options.externalAssets,
    static: options.static,
    screenshot: options.screenshot,
    workers: options.workers || '8',
  };

  // Show summary
  console.log('');
  p.note(
    [
      `URL: ${chalk.cyan(finalOptions.url)}`,
      `Depth: ${finalOptions.depth}`,
      `Scope: ${finalOptions.scope}`,
      `External: ${finalOptions.externalAssets ? 'Yes' : 'No'}`,
      `Static: ${finalOptions.static ? 'Yes' : 'No'}`,
      `Output: ${finalOptions.output}`,
    ].join('\n'),
    'Capture Settings',
  );

  const confirmed = await p.confirm({
    message: 'Start capture?',
    initialValue: true,
  });

  // A cancelled prompt resolves to a cancel symbol, which is truthy, so a
  // plain `!confirmed` test would let Ctrl+C start the capture.
  if (p.isCancel(confirmed) || !confirmed) {
    p.cancel('Capture cancelled.');
    process.exit(0);
  }

  p.outro(chalk.dim('Starting capture...'));
  console.log('');

  return finalOptions;
}
365
+
366
/**
 * Decide whether the CLI should start the interactive wizard.
 *
 * Interactive mode is used only when all of the following hold:
 *  - neither `--no-interaction` nor `-y` was passed,
 *  - stdin is a TTY,
 *  - no argument is an http(s) URL or a known command name.
 *
 * @param {string[]} args - Command-line arguments to inspect.
 * @returns {boolean} true when the wizard should run.
 */
export function shouldRunInteractive(args) {
  const optedOut = args.includes('--no-interaction') || args.includes('-y');
  if (optedOut || !process.stdin.isTTY) {
    return false;
  }

  const knownCommands = ['capture', 'serve', 'continue', 'update', 'help'];
  const isUrlOrCommand = arg => {
    // Flags never count, whatever follows the dash
    if (arg.startsWith('-')) return false;
    const looksLikeUrl =
      arg.startsWith('http://') || arg.startsWith('https://');
    return looksLikeUrl || knownCommands.includes(arg);
  };

  // Nothing actionable on the command line means the user needs guiding
  return !args.some(isUrlOrCommand);
}
@@ -0,0 +1,212 @@
1
+ // @flow
2
+ import {load} from 'cheerio';
3
+ import {resolveUrl, isLikelyPage} from './utils/url.js';
4
+
5
/**
 * Extract all links and asset URLs from a page.
 *
 * The page's HTML is read via `page.content()` and parsed with cheerio.
 * URLs are resolved against `baseUrl`, stripped of their fragment, and kept
 * only when they are http(s).
 *
 * Treated as links: anchors, iframes, meta-refresh targets and the canonical
 * link. Treated as assets: `<link href>`, scripts, images (`src` and
 * `srcset`), media sources, video posters, `<object data>`, and URLs found in
 * `style` attributes and `<style>` elements.
 *
 * @param {Object} page - Object exposing an async `content()` that resolves
 *   to the page HTML.
 * @param {string} baseUrl - URL that relative references are resolved against.
 * @param {Object} [_options] - Currently unused.
 * @returns {Promise<{pages: string[], assets: string[], all: string[]}>}
 *   `pages` holds links for which `isLikelyPage` is true, `assets` holds
 *   every collected URL for which it is false, and `all` holds every
 *   collected URL. Each list is de-duplicated.
 */
export async function extractLinks(page, baseUrl, _options = {}) {
  const html = await page.content();
  const $ = load(html);

  const links = new Set();
  const assets = new Set();

  // Resolve one raw reference and keep it only if it is an http(s) URL
  const addUrl = (bucket, rawUrl) => {
    const resolved = resolveAndClean(rawUrl, baseUrl);
    if (resolved && isHttpUrl(resolved)) {
      bucket.add(resolved);
    }
  };

  // Collect `attribute` from every element matching `selector`
  const addAttr = (bucket, selector, attribute) => {
    $(selector).each((_, el) => addUrl(bucket, $(el).attr(attribute)));
  };

  // Anchors
  addAttr(links, 'a[href]', 'href');

  // <link> elements (stylesheets, icons, ...), scripts and images
  addAttr(assets, 'link[href]', 'href');
  addAttr(assets, 'script[src]', 'src');
  addAttr(assets, 'img[src]', 'src');

  // Responsive image candidates
  $('img[srcset], source[srcset]').each((_, el) => {
    const srcset = $(el).attr('srcset');
    if (srcset) {
      parseSrcset(srcset).forEach(src => addUrl(assets, src));
    }
  });

  // Media sources and posters
  addAttr(assets, 'video[src], audio[src], source[src]', 'src');
  addAttr(assets, 'video[poster]', 'poster');

  // Embedded documents are treated as links, not assets
  addAttr(links, 'iframe[src]', 'src');

  addAttr(assets, 'object[data]', 'data');

  // url(...) references inside style attributes and <style> elements
  $('[style]').each((_, el) => {
    extractCssUrls($(el).attr('style'), baseUrl).forEach(url =>
      assets.add(url),
    );
  });
  $('style').each((_, el) => {
    extractCssUrls($(el).html(), baseUrl).forEach(url => assets.add(url));
  });

  // Meta refresh targets, e.g. content="5; url='/next'"
  $('meta[http-equiv="refresh"]').each((_, el) => {
    addUrl(links, parseMetaRefreshUrl($(el).attr('content')));
  });

  // Canonical URL
  addAttr(links, 'link[rel="canonical"]', 'href');

  // Separate page links from asset links
  const pageLinks = [...links].filter(url => isLikelyPage(url));
  const assetLinks = [...assets, ...links].filter(url => !isLikelyPage(url));

  return {
    pages: [...new Set(pageLinks)],
    assets: [...new Set(assetLinks)],
    all: [...new Set([...links, ...assets])],
  };
}

/**
 * Pull the target URL out of a meta-refresh `content` value.
 *
 * Surrounding quotes are removed, so `0; url='https://example.com'` yields
 * the bare URL rather than a string that still carries the quotes.
 *
 * @param {string|undefined} content - Raw `content` attribute value.
 * @returns {string|null} The target URL text, or null when none is present.
 */
function parseMetaRefreshUrl(content) {
  const match = content?.match(/url\s*=\s*(.+)/i);
  if (!match) return null;

  const target = match[1].trim();
  const quote = target[0];
  if (quote === '"' || quote === "'") {
    // A quoted URL ends at the matching quote; anything after it is ignored
    const end = target.indexOf(quote, 1);
    return (end === -1 ? target.slice(1) : target.slice(1, end)).trim();
  }
  return target;
}
145
+
146
/**
 * Collect the http(s) URLs referenced by a piece of CSS.
 *
 * All `url(...)` references are reported first, followed by all quoted
 * `@import "..."` targets. Each one is resolved against `baseUrl`.
 * Duplicates are not removed.
 *
 * @param {string} css - CSS text; an empty or missing value yields [].
 * @param {string} baseUrl - URL that relative references are resolved against.
 * @returns {string[]} Resolved URLs in match order.
 */
export function extractCssUrls(css, baseUrl) {
  if (!css) return [];

  // Order matters: url() references are reported before @import targets
  const patterns = [
    /url\s*\(\s*['"]?([^'")]+)['"]?\s*\)/gi,
    /@import\s+['"]([^'"]+)['"]/gi,
  ];

  const found = [];
  for (const pattern of patterns) {
    for (const [, reference] of String(css).matchAll(pattern)) {
      const absolute = resolveAndClean(reference, baseUrl);
      if (absolute && isHttpUrl(absolute)) {
        found.push(absolute);
      }
    }
  }
  return found;
}
173
+
174
/**
 * Parse a `srcset` attribute into its candidate URLs.
 *
 * Candidates are separated by commas, but a URL may itself contain commas
 * (e.g. `/w_300,h_200/a.jpg`), so the string cannot simply be split on ",".
 * A URL is instead read as a run of non-whitespace characters. Trailing
 * commas on that run end the candidate. Otherwise the descriptors that
 * follow are skipped up to the next comma outside parentheses.
 *
 * @param {string} srcset - Raw attribute value.
 * @returns {string[]} Candidate URLs in source order, descriptors removed.
 */
function parseSrcset(srcset) {
  if (!srcset) return [];

  const isWhitespace = ch => /\s/.test(ch);
  const urls = [];
  const length = srcset.length;
  let i = 0;

  while (i < length) {
    // Skip separators before the next candidate
    while (i < length && (isWhitespace(srcset[i]) || srcset[i] === ',')) i++;
    if (i >= length) break;

    // The URL is everything up to the next whitespace
    const start = i;
    while (i < length && !isWhitespace(srcset[i])) i++;
    let url = srcset.slice(start, i);

    if (url.endsWith(',')) {
      // "a.jpg, b.jpg": the comma ends this candidate, no descriptors follow
      url = url.replace(/,+$/, '');
    } else {
      // Skip descriptors ("2x", "300w", ...) up to the separating comma
      let depth = 0;
      while (i < length) {
        const ch = srcset[i];
        if (ch === '(') depth++;
        else if (ch === ')') depth = Math.max(0, depth - 1);
        else if (ch === ',' && depth === 0) break;
        i++;
      }
    }

    if (url) urls.push(url);
  }

  return urls;
}
183
+
184
/**
 * Resolve a raw reference against a base URL and strip its fragment.
 *
 * Returns null for empty input, for pure fragment references (`#...`), for
 * `javascript:`, `mailto:`, `tel:` and `data:` references (matched
 * case-insensitively), and when resolution throws.
 *
 * @param {string|null|undefined} url - Raw reference as found in the markup.
 * @param {string} baseUrl - URL that relative references are resolved against.
 * @returns {string|null} Resolved URL without its `#fragment`, or null.
 */
function resolveAndClean(url, baseUrl) {
  if (!url) return null;

  const candidate = url.trim();

  // Schemes are case-insensitive, so "JavaScript:" must be skipped as well
  if (/^(?:javascript|mailto|tel|data):/i.test(candidate)) return null;
  if (candidate.startsWith('#')) return null;

  try {
    const resolved = resolveUrl(candidate, baseUrl);
    // Remove hash fragment
    return resolved.split('#')[0];
  } catch {
    return null;
  }
}
206
+
207
/**
 * Tell whether a URL string begins with the http:// or https:// prefix.
 *
 * @param {string} url - URL to test.
 * @returns {boolean} true for http:// and https:// URLs.
 */
function isHttpUrl(url) {
  const webPrefixes = ['http://', 'https://'];
  return webPrefixes.some(prefix => url.startsWith(prefix));
}