smippo 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +116 -0
- package/bin/smippo.js +5 -0
- package/package.json +100 -0
- package/src/cli.js +437 -0
- package/src/crawler.js +408 -0
- package/src/filter.js +155 -0
- package/src/index.js +60 -0
- package/src/interactive.js +391 -0
- package/src/link-extractor.js +212 -0
- package/src/link-rewriter.js +293 -0
- package/src/manifest.js +163 -0
- package/src/page-capture.js +151 -0
- package/src/progress.js +190 -0
- package/src/resource-saver.js +210 -0
- package/src/robots.js +104 -0
- package/src/screenshot.js +185 -0
- package/src/server.js +603 -0
- package/src/utils/logger.js +74 -0
- package/src/utils/path.js +76 -0
- package/src/utils/url.js +295 -0
- package/src/utils/version.js +14 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 Smippo
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
<p align="center">
|
|
2
|
+
<img src="assets/logo.svg" alt="Smippo Logo" width="120" height="100">
|
|
3
|
+
</p>
|
|
4
|
+
|
|
5
|
+
<h1 align="center">SMIPPO</h1>
|
|
6
|
+
|
|
7
|
+
<p align="center">
|
|
8
|
+
<strong>S.M.I.P.P.O.</strong> = Structured Mirroring of Internet Pages and Public Objects
|
|
9
|
+
</p>
|
|
10
|
+
|
|
11
|
+
<p align="center">
|
|
12
|
+
Modern website copier — consumes everything fast. Hippos don't nibble. They vacuum.
|
|
13
|
+
</p>
|
|
14
|
+
|
|
15
|
+
<p align="center">
|
|
16
|
+
<a href="https://smippo.com"><img src="https://img.shields.io/badge/docs-smippo.com-blue" alt="Documentation"></a>
|
|
17
|
+
<a href="https://www.npmjs.com/package/smippo"><img src="https://img.shields.io/npm/v/smippo?color=cb0000&label=npm" alt="npm version"></a>
|
|
18
|
+
<a href="https://www.npmjs.com/package/smippo"><img src="https://img.shields.io/npm/dm/smippo?color=cb0000" alt="npm downloads"></a>
|
|
19
|
+
<a href="./LICENSE"><img src="https://img.shields.io/npm/l/smippo?color=blue" alt="license"></a>
|
|
20
|
+
<a href="https://nodejs.org"><img src="https://img.shields.io/node/v/smippo?color=339933" alt="node"></a>
|
|
21
|
+
<a href="https://github.com/pouyanafisi/smippo/pulls"><img src="https://img.shields.io/badge/PRs-welcome-brightgreen.svg" alt="PRs Welcome"></a>
|
|
22
|
+
</p>
|
|
23
|
+
|
|
24
|
+
---
|
|
25
|
+
|
|
26
|
+
**S.M.I.P.P.O.** (Structured Mirroring of Internet Pages and Public Objects) is a command-line website copier and scraper that captures websites exactly as they appear in your browser. Create complete offline mirrors with all assets, styles, and dynamic content preserved. Perfect for website duplication, archiving, and offline browsing.
|
|
27
|
+
|
|
28
|
+
📚 **[View complete documentation →](https://smippo.com)**
|
|
29
|
+
|
|
30
|
+
## Quick Start
|
|
31
|
+
|
|
32
|
+
Install globally:
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
npm install -g smippo
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
Capture a single page:
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
smippo https://example.com
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
Mirror a site (3 levels deep):
|
|
45
|
+
|
|
46
|
+
```bash
|
|
47
|
+
smippo https://example.com --depth 3
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
Or use without installing:
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
npx smippo https://example.com
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
## Commands
|
|
57
|
+
|
|
58
|
+
Smippo provides several commands for different use cases:
|
|
59
|
+
|
|
60
|
+
- **`smippo <url>`** — Capture and mirror websites with full fidelity
|
|
61
|
+
- **`smippo capture <url>`** — Take screenshots of web pages
|
|
62
|
+
- **`smippo serve <directory>`** — Serve captured sites locally
|
|
63
|
+
- **`smippo continue`** — Resume an interrupted capture
|
|
64
|
+
- **`smippo update`** — Update an existing mirror
|
|
65
|
+
|
|
66
|
+
Run `smippo` with no arguments to start the interactive guided mode.
|
|
67
|
+
|
|
68
|
+
## Features
|
|
69
|
+
|
|
70
|
+
- **🚀 Vacuum Architecture** — Parallel workers consume sites rapidly
|
|
71
|
+
- **📸 Complete Fidelity** — Captures pages exactly as rendered, including CSS-in-JS, dynamic content, and lazy-loaded images
|
|
72
|
+
- **🎯 Smart Filtering** — Filter by URL patterns, MIME types, and file sizes. Respects robots.txt
|
|
73
|
+
- **🌐 Built-in Server** — Serve captured sites locally with directory browsing
|
|
74
|
+
- **📊 HAR Files** — Generates HTTP Archive files for debugging and replay
|
|
75
|
+
- **💻 Programmatic API** — Use Smippo in your Node.js applications
|
|
76
|
+
|
|
77
|
+
## Documentation
|
|
78
|
+
|
|
79
|
+
For complete documentation, guides, and API reference, visit **[smippo.com](https://smippo.com)**:
|
|
80
|
+
|
|
81
|
+
- **[Installation Guide](https://smippo.com/getting-started/installation)** — Detailed installation instructions
|
|
82
|
+
- **[Commands Reference](https://smippo.com/commands)** — All available commands and options
|
|
83
|
+
- **[Configuration](https://smippo.com/configuration)** — Filtering, scope control, performance tuning
|
|
84
|
+
- **[Guides](https://smippo.com/guides)** — Output structure, link rewriting, troubleshooting
|
|
85
|
+
- **[Programmatic API](https://smippo.com/api/programmatic)** — Use Smippo in your Node.js code
|
|
86
|
+
- **[Examples](https://smippo.com/getting-started/examples)** — Real-world use cases
|
|
87
|
+
|
|
88
|
+
## Requirements
|
|
89
|
+
|
|
90
|
+
- Node.js 18 or later
|
|
91
|
+
- Chromium (automatically downloaded on first install)
|
|
92
|
+
|
|
93
|
+
## Contributing
|
|
94
|
+
|
|
95
|
+
Contributions are welcome! Whether it's bug reports, feature requests, or pull requests — all contributions help make Smippo better.
|
|
96
|
+
|
|
97
|
+
Please read our [Contributing Guide](CONTRIBUTING.md) for details on development setup, code style guidelines, and the pull request process.
|
|
98
|
+
|
|
99
|
+
Quick start:
|
|
100
|
+
|
|
101
|
+
```bash
|
|
102
|
+
git clone https://github.com/pouyanafisi/smippo.git
|
|
103
|
+
cd smippo
|
|
104
|
+
npm install
|
|
105
|
+
npm test
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
## License
|
|
109
|
+
|
|
110
|
+
[MIT](./LICENSE) — feel free to use this in your own projects.
|
|
111
|
+
|
|
112
|
+
## Acknowledgments
|
|
113
|
+
|
|
114
|
+
- Built with [Playwright](https://playwright.dev/) for reliable browser automation
|
|
115
|
+
- CLI powered by [Commander.js](https://github.com/tj/commander.js) and [@clack/prompts](https://github.com/natemoo-re/clack)
|
|
116
|
+
- Inspired by classic website copiers like [HTTrack](https://www.httrack.com/)
|
package/bin/smippo.js
ADDED
package/package.json
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "smippo",
|
|
3
|
+
"version": "0.0.1",
|
|
4
|
+
"description": "S.M.I.P.P.O. — Structured Mirroring of Internet Pages and Public Objects. Modern website copier that captures sites exactly as they appear in your browser.",
|
|
5
|
+
"main": "src/index.js",
|
|
6
|
+
"bin": {
|
|
7
|
+
"smippo": "bin/smippo.js"
|
|
8
|
+
},
|
|
9
|
+
"type": "module",
|
|
10
|
+
"scripts": {
|
|
11
|
+
"start": "node bin/smippo.js",
|
|
12
|
+
"dev": "node bin/smippo.js",
|
|
13
|
+
"test": "mocha",
|
|
14
|
+
"test:unit": "mocha 'test/**/*.test.js'",
|
|
15
|
+
"test:integration": "mocha 'test/integration/**/*.test.js'",
|
|
16
|
+
"test:watch": "mocha --watch",
|
|
17
|
+
"lint": "eslint .",
|
|
18
|
+
"lint:fix": "eslint . --fix",
|
|
19
|
+
"format": "prettier --write .",
|
|
20
|
+
"format:check": "prettier --check .",
|
|
21
|
+
"typecheck": "tsc --noEmit",
|
|
22
|
+
"prepare": "husky install",
|
|
23
|
+
"postinstall": "npx playwright install chromium",
|
|
24
|
+
"prepublishOnly": "npm run lint && npm run format:check && npm test",
|
|
25
|
+
"verdaccio": "verdaccio --config verdaccio.yml",
|
|
26
|
+
"verdaccio:publish": "node verdaccioPublish.js",
|
|
27
|
+
"publish:github": "npm publish --registry=https://npm.pkg.github.com"
|
|
28
|
+
},
|
|
29
|
+
"keywords": [
|
|
30
|
+
"website",
|
|
31
|
+
"copier",
|
|
32
|
+
"mirror",
|
|
33
|
+
"offline",
|
|
34
|
+
"browser",
|
|
35
|
+
"playwright",
|
|
36
|
+
"crawler",
|
|
37
|
+
"scraper",
|
|
38
|
+
"archiver",
|
|
39
|
+
"smippo",
|
|
40
|
+
"website-mirror",
|
|
41
|
+
"site-copier",
|
|
42
|
+
"web-scraper",
|
|
43
|
+
"offline-browsing"
|
|
44
|
+
],
|
|
45
|
+
"author": "Pouyan Afisi",
|
|
46
|
+
"license": "MIT",
|
|
47
|
+
"repository": {
|
|
48
|
+
"type": "git",
|
|
49
|
+
"url": "git+https://github.com/pouyanafisi/smippo.git"
|
|
50
|
+
},
|
|
51
|
+
"bugs": {
|
|
52
|
+
"url": "https://github.com/pouyanafisi/smippo/issues"
|
|
53
|
+
},
|
|
54
|
+
"homepage": "https://smippo.com",
|
|
55
|
+
"publishConfig": {
|
|
56
|
+
"registry": "https://registry.npmjs.org"
|
|
57
|
+
},
|
|
58
|
+
"engines": {
|
|
59
|
+
"node": ">=18.0.0"
|
|
60
|
+
},
|
|
61
|
+
"files": [
|
|
62
|
+
"bin/",
|
|
63
|
+
"src/",
|
|
64
|
+
"LICENSE",
|
|
65
|
+
"README.md"
|
|
66
|
+
],
|
|
67
|
+
"lint-staged": {
|
|
68
|
+
"*.js": [
|
|
69
|
+
"eslint --fix",
|
|
70
|
+
"prettier --write"
|
|
71
|
+
],
|
|
72
|
+
"*.{json,md}": [
|
|
73
|
+
"prettier --write"
|
|
74
|
+
]
|
|
75
|
+
},
|
|
76
|
+
"dependencies": {
|
|
77
|
+
"@clack/prompts": "^0.11.0",
|
|
78
|
+
"chalk": "^5.3.0",
|
|
79
|
+
"cheerio": "^1.0.0-rc.12",
|
|
80
|
+
"cli-progress": "^3.12.0",
|
|
81
|
+
"commander": "^12.0.0",
|
|
82
|
+
"figlet": "^1.9.4",
|
|
83
|
+
"fs-extra": "^11.2.0",
|
|
84
|
+
"glob": "^10.3.10",
|
|
85
|
+
"gradient-string": "^3.0.0",
|
|
86
|
+
"mime-types": "^2.1.35",
|
|
87
|
+
"minimatch": "^10.1.1",
|
|
88
|
+
"ora": "^8.0.1",
|
|
89
|
+
"p-queue": "^8.0.1",
|
|
90
|
+
"playwright": "^1.41.0",
|
|
91
|
+
"robots-parser": "^3.0.1"
|
|
92
|
+
},
|
|
93
|
+
"devDependencies": {
|
|
94
|
+
"eslint": "^8.57.0",
|
|
95
|
+
"husky": "^9.0.0",
|
|
96
|
+
"lint-staged": "^15.2.0",
|
|
97
|
+
"mocha": "^10.4.0",
|
|
98
|
+
"prettier": "^3.2.5"
|
|
99
|
+
}
|
|
100
|
+
}
|
package/src/cli.js
ADDED
|
@@ -0,0 +1,437 @@
|
|
|
1
|
+
// @flow
|
|
2
|
+
import {Command} from 'commander';
|
|
3
|
+
import chalk from 'chalk';
|
|
4
|
+
import ora from 'ora';
|
|
5
|
+
import {Crawler} from './crawler.js';
|
|
6
|
+
import {readManifest, manifestExists} from './manifest.js';
|
|
7
|
+
import {version} from './utils/version.js';
|
|
8
|
+
import {
|
|
9
|
+
showHelp,
|
|
10
|
+
runInteractiveCapture,
|
|
11
|
+
shouldRunInteractive,
|
|
12
|
+
} from './interactive.js';
|
|
13
|
+
|
|
14
|
+
const program = new Command();
|
|
15
|
+
|
|
16
|
+
/**
 * CLI entry point: registers every command and option on the shared
 * `program` instance and dispatches `process.argv`.
 *
 * Flow:
 *  1. `help`, `--help` or `-h` anywhere in the arguments prints the custom
 *     help screen and returns.
 *  2. If `shouldRunInteractive(args)` says so, the guided prompt flow runs and
 *     its answers are handed to `capture()`.
 *  3. Otherwise the arguments are parsed by Commander and routed to the root
 *     capture action or one of the sub-commands (`continue`, `update`,
 *     `serve`, `capture`, `help`).
 *
 * Failures are reported on stderr as "✗ Error: <message>" and terminate the
 * process with exit code 1.
 *
 * @returns {void}
 */
export function run() {
  // Check for help command first
  const args = process.argv.slice(2);
  if (args.includes('help') || args.includes('--help') || args.includes('-h')) {
    showHelp();
    return;
  }

  // Check if we should run interactive mode
  if (shouldRunInteractive(args)) {
    runInteractiveCapture()
      .then(options => {
        return capture(options.url, {
          output: options.output,
          depth: options.depth,
          scope: options.scope,
          externalAssets: options.externalAssets,
          static: options.static,
          screenshot: options.screenshot,
          workers: options.workers,
        });
      })
      .catch(error => {
        console.error(chalk.red(`\n✗ Error: ${error.message}`));
        process.exit(1);
      });
    return;
  }

  program
    .name('smippo')
    .description(
      'Modern website copier powered by Playwright - capture JS-rendered pages for offline viewing',
    )
    .version(version);

  // Main capture command
  program
    .argument('[url]', 'URL to capture')
    .option('-o, --output <dir>', 'Output directory', './site')
    .option('-d, --depth <n>', 'Recursion depth (0 = single page)', '0')
    .option('--no-crawl', 'Disable link following (same as -d 0)')
    .option('--dry-run', 'Show what would be captured without downloading')

    // Scope options
    .option(
      '-s, --scope <type>',
      'Link scope: subdomain|domain|tld|all',
      'domain',
    )
    .option('--stay-in-dir', 'Only follow links in same directory or subdirs')
    .option('--external-assets', 'Capture assets from external domains')

    // Filter options
    .option('-I, --include <glob...>', 'Include URLs matching pattern')
    .option('-E, --exclude <glob...>', 'Exclude URLs matching pattern')
    .option('--mime-include <type...>', 'Include MIME types')
    .option('--mime-exclude <type...>', 'Exclude MIME types')
    .option('--max-size <size>', 'Maximum file size (e.g., 10MB)')
    .option('--min-size <size>', 'Minimum file size (e.g., 1KB)')

    // Browser options
    .option(
      '--wait <strategy>',
      'Wait strategy: networkidle|load|domcontentloaded',
      'networkidle',
    )
    .option('--wait-time <ms>', 'Additional wait time after network idle', '0')
    .option('--timeout <ms>', 'Page load timeout', '30000')
    .option('--user-agent <string>', 'Custom user agent')
    .option('--viewport <WxH>', 'Viewport size', '1920x1080')
    .option('--device <name>', 'Emulate device (e.g., "iPhone 13")')

    // Network options
    .option('--proxy <url>', 'Proxy server URL')
    .option('--cookies <file>', 'Load cookies from JSON file')
    .option('--headers <json>', 'Custom headers as JSON')
    .option('--capture-auth', 'Interactive authentication capture')

    // Output options
    .option(
      '--structure <type>',
      'Output structure: original|flat|domain',
      'original',
    )
    .option('--har', 'Generate HAR file', true)
    .option('--no-har', 'Disable HAR file generation')
    .option('--screenshot', 'Take screenshot of each page')
    .option('--pdf', 'Save PDF of each page')
    .option('--static', 'Remove scripts for static offline viewing')
    .option('--inline-css', 'Inline CSS into HTML for single-file output')

    // Performance options
    .option('-w, --workers <n>', 'Parallel workers/pages (default: 8)', '8')
    .option('-c, --concurrency <n>', 'Alias for --workers', '8')
    .option('--max-pages <n>', 'Maximum pages to capture')
    .option('--max-time <seconds>', 'Maximum total time')
    .option('--rate-limit <ms>', 'Delay between requests')

    // Robots options
    .option('--ignore-robots', 'Ignore robots.txt')
    .option('--respect-robots', 'Respect robots.txt', true)

    // Cache options
    .option('--no-cache', "Don't use cache")

    // Logging options
    .option('-v, --verbose', 'Verbose output')
    .option('-q, --quiet', 'Minimal output')
    .option('--log-file <path>', 'Write logs to file')
    .option('--debug', 'Debug mode with visible browser')

    // Interaction options
    .option('--no-interaction', 'Non-interactive mode (for CI/scripts)')
    .option('-y, --yes', 'Skip prompts, use defaults')

    .action(async (url, options) => {
      if (!url) {
        showHelp();
        return;
      }

      // Work on a copy so the object Commander handed us is left untouched.
      const captureOptions = {...options};

      // --no-crawl is documented as "same as -d 0"; Commander exposes it as
      // `crawl === false`, so translate it into the depth the crawler reads.
      if (options.crawl === false) {
        captureOptions.depth = '0';
      }

      // --workers always carries its default of '8', which would otherwise
      // shadow an explicit -c/--concurrency. Let the alias win whenever
      // --workers was left at its default and --concurrency was not.
      if (
        program.getOptionValueSource('workers') === 'default' &&
        program.getOptionValueSource('concurrency') !== 'default'
      ) {
        captureOptions.workers = options.concurrency;
      }

      try {
        await capture(url, captureOptions);
      } catch (error) {
        console.error(chalk.red(`\n✗ Error: ${error.message}`));
        if (options.verbose || options.debug) {
          console.error(error.stack);
        }
        process.exit(1);
      }
    });

  // Continue command
  program
    .command('continue')
    .description('Resume an interrupted capture')
    .option('-o, --output <dir>', 'Output directory', './site')
    .option('-v, --verbose', 'Verbose output')
    .action(async options => {
      try {
        await continueCapture(options);
      } catch (error) {
        console.error(chalk.red(`\n✗ Error: ${error.message}`));
        process.exit(1);
      }
    });

  // Update command
  program
    .command('update')
    .description('Update an existing mirror')
    .option('-o, --output <dir>', 'Output directory', './site')
    .option('-v, --verbose', 'Verbose output')
    .action(async options => {
      try {
        await updateCapture(options);
      } catch (error) {
        console.error(chalk.red(`\n✗ Error: ${error.message}`));
        process.exit(1);
      }
    });

  // Serve command
  program
    .command('serve [directory]')
    .description('Serve a captured site locally')
    .option(
      '-p, --port <port>',
      'Port to serve on (auto-finds available)',
      '8080',
    )
    .option('-H, --host <host>', 'Host to bind to', '127.0.0.1')
    .option('-o, --open', 'Open browser automatically')
    .option('--no-cors', 'Disable CORS headers')
    .option('-v, --verbose', 'Show all requests')
    .option('-q, --quiet', 'Minimal output')
    .action(async (directory, options) => {
      // Report failures the same way the other commands do instead of
      // letting them surface as an unhandled rejection.
      try {
        const {serve} = await import('./server.js');
        await serve({
          directory: directory || './site',
          port: options.port,
          host: options.host,
          open: options.open,
          cors: options.cors,
          verbose: options.verbose,
          quiet: options.quiet,
        });
      } catch (error) {
        console.error(chalk.red(`\n✗ Error: ${error.message}`));
        if (options.verbose) {
          console.error(error.stack);
        }
        process.exit(1);
      }
    });

  // Screenshot capture command
  program
    .command('capture <url>')
    .description('Take a screenshot of a URL')
    .option(
      '-O, --out <file>',
      'Output file path (auto-generated if not specified)',
    )
    .option('-f, --full-page', 'Capture full scrollable page')
    .option('--format <type>', 'Image format: png|jpeg', 'png')
    .option('--quality <n>', 'JPEG quality (1-100)', '80')
    .option('--viewport <WxH>', 'Viewport size', '1920x1080')
    .option('--device <name>', 'Emulate device (e.g., "iPhone 13", "iPad Pro")')
    .option('--selector <css>', 'Capture specific element by CSS selector')
    .option(
      '--wait <strategy>',
      'Wait strategy: networkidle|load|domcontentloaded',
      'networkidle',
    )
    .option('--wait-time <ms>', 'Additional wait time after load', '0')
    .option('--timeout <ms>', 'Page load timeout', '30000')
    .option('--dark-mode', 'Use dark color scheme')
    .option('--no-background', 'Transparent background (PNG only)')
    .option('-v, --verbose', 'Verbose output')
    .option('-q, --quiet', 'Minimal output')
    .action(async (url, options) => {
      try {
        // The screenshot module ships its own parseViewport; this binding
        // shadows the file-level helper of the same name inside this action.
        const {captureScreenshot, parseViewport} =
          await import('./screenshot.js');
        await captureScreenshot(url, {
          output: options.out,
          fullPage: options.fullPage,
          format: options.format,
          quality: options.quality ? parseInt(options.quality, 10) : undefined,
          viewport: parseViewport(options.viewport),
          device: options.device,
          selector: options.selector,
          wait: options.wait,
          waitTime: parseInt(options.waitTime, 10),
          timeout: parseInt(options.timeout, 10),
          darkMode: options.darkMode,
          omitBackground: !options.background,
          verbose: options.verbose,
          quiet: options.quiet,
        });
      } catch (error) {
        console.error(chalk.red(`\n✗ Error: ${error.message}`));
        if (options.verbose) {
          console.error(error.stack);
        }
        process.exit(1);
      }
    });

  // Help command
  program
    .command('help')
    .description('Show detailed help')
    .action(() => {
      showHelp();
    });

  // Every action handler above is async, so parse with parseAsync() and
  // handle a rejected promise explicitly rather than leaving it floating.
  program.parseAsync().catch(error => {
    console.error(chalk.red(`\n✗ Error: ${error.message}`));
    process.exit(1);
  });
}
|
|
270
|
+
|
|
271
|
+
/**
 * Run a crawl of `url` and print a summary when it finishes.
 *
 * Translates CLI-style options (mostly strings) into the numeric/structured
 * settings handed to `Crawler`, wires crawler events to the progress
 * spinner, then prints page/asset/size/duration totals.
 *
 * Numeric options that are missing or unparsable fall back to the defaults
 * advertised by the CLI (depth 0, wait-time 0, timeout 30000, 8 workers), so
 * callers that do not go through Commander (for example the interactive
 * flow) never hand NaN to the crawler.
 *
 * @param {string} url - Root URL to capture.
 * @param {object} options - Capture options as produced by the CLI or the
 *   interactive prompts.
 * @returns {Promise<void>}
 * @throws {Error} If `options.headers` is a string that is not valid JSON,
 *   or if the crawler itself fails. The spinner is stopped before rethrowing.
 */
async function capture(url, options) {
  // Parse an integer option, using `fallback` when absent or not a number.
  const toInt = (value, fallback) => {
    const parsed = Number.parseInt(value, 10);
    return Number.isNaN(parsed) ? fallback : parsed;
  };

  const spinner = ora({
    text: 'Initializing browser...',
    isSilent: options.quiet,
  }).start();

  try {
    // Headers arrive as a JSON string from the command line. An object is
    // passed through untouched in case a caller supplies one directly.
    let headers = {};
    if (options.headers) {
      if (typeof options.headers === 'string') {
        try {
          headers = JSON.parse(options.headers);
        } catch (error) {
          throw new Error(`Invalid --headers JSON: ${error.message}`, {
            cause: error,
          });
        }
      } else {
        headers = options.headers;
      }
    }

    const maxTimeSeconds = toInt(options.maxTime, undefined);

    const crawler = new Crawler({
      url,
      output: options.output,
      depth: toInt(options.depth, 0),
      scope: options.scope,
      stayInDir: options.stayInDir,
      externalAssets: options.externalAssets,
      include: options.include || [],
      exclude: options.exclude || [],
      mimeInclude: options.mimeInclude || [],
      mimeExclude: options.mimeExclude || [],
      maxSize: parseSize(options.maxSize),
      minSize: parseSize(options.minSize),
      wait: options.wait,
      waitTime: toInt(options.waitTime, 0),
      timeout: toInt(options.timeout, 30000),
      userAgent: options.userAgent,
      viewport: parseViewport(options.viewport),
      device: options.device,
      proxy: options.proxy,
      cookies: options.cookies,
      headers,
      captureAuth: options.captureAuth,
      structure: options.structure,
      har: options.har,
      screenshot: options.screenshot,
      pdf: options.pdf,
      noJs: options.static,
      inlineCss: options.inlineCss,
      concurrency: toInt(options.workers || options.concurrency, 8),
      maxPages: toInt(options.maxPages, undefined),
      // --max-time is given in seconds; the crawler receives milliseconds.
      maxTime: maxTimeSeconds === undefined ? undefined : maxTimeSeconds * 1000,
      rateLimit: toInt(options.rateLimit, 0),
      ignoreRobots: options.ignoreRobots,
      // The continue/update commands pass `useCache` explicitly, while the
      // root command exposes it as Commander's `cache` (from --no-cache).
      useCache: options.useCache ?? options.cache,
      // Forwarded for the update command.
      // NOTE(review): confirm that Crawler reads an `update` option.
      update: options.update,
      verbose: options.verbose,
      quiet: options.quiet,
      logFile: options.logFile,
      debug: options.debug,
      dryRun: options.dryRun,
      spinner,
    });

    crawler.on('page:start', ({url}) => {
      spinner.text = `Capturing: ${truncateUrl(url, 60)}`;
    });

    crawler.on('page:complete', ({url, size}) => {
      if (options.verbose) {
        // succeed() finalises the current line; restart so progress continues.
        spinner.succeed(
          `Captured: ${truncateUrl(url, 50)} (${formatSize(size)})`,
        );
        spinner.start();
      }
    });

    crawler.on('asset:save', ({url, size}) => {
      if (options.verbose) {
        spinner.text = `Asset: ${truncateUrl(url, 60)} (${formatSize(size)})`;
      }
    });

    crawler.on('error', ({url, error}) => {
      if (!options.quiet) {
        spinner.warn(`Failed: ${truncateUrl(url, 50)} - ${error.message}`);
        spinner.start();
      }
    });

    const result = await crawler.start();

    spinner.succeed(chalk.green(`Capture complete!`));
    console.log('');
    console.log(chalk.cyan('  Summary:'));
    console.log(`    Pages captured: ${result.stats.pagesCapt}`);
    console.log(`    Assets saved:   ${result.stats.assetsCapt}`);
    console.log(`    Total size:     ${formatSize(result.stats.totalSize)}`);
    console.log(`    Duration:       ${formatDuration(result.stats.duration)}`);
    if (result.stats.errors > 0) {
      console.log(chalk.yellow(`    Errors:         ${result.stats.errors}`));
    }
    console.log('');
    console.log(`  Output: ${chalk.underline(options.output)}`);
  } catch (error) {
    // Stop the animation so the caller's error message is not drawn over.
    spinner.stop();
    throw error;
  }
}
|
|
361
|
+
|
|
362
|
+
/**
 * Resume an interrupted capture using the manifest stored in the output
 * directory.
 *
 * The options recorded in the manifest are reused, overridden by whatever
 * was passed on the command line for this invocation, with caching forced on.
 *
 * @param {object} options - Options of the `continue` command; `output` is
 *   the directory holding the previous capture.
 * @returns {Promise<void>}
 * @throws {Error} If no manifest exists in `options.output`, or the manifest
 *   has no root URL to resume from.
 */
async function continueCapture(options) {
  // `await` leaves a plain boolean unchanged and unwraps a promise, so this
  // check is correct whether manifestExists is synchronous or asynchronous.
  if (!(await manifestExists(options.output))) {
    throw new Error(
      'No capture found in the specified directory. Start a new capture first.',
    );
  }

  const manifest = await readManifest(options.output);
  if (!manifest?.rootUrl) {
    throw new Error('Capture manifest is missing or has no root URL.');
  }
  console.log(chalk.cyan(`Continuing capture of ${manifest.rootUrl}...`));

  await capture(manifest.rootUrl, {
    ...manifest.options,
    ...options,
    // capture() maps `options.cache` onto the crawler's cache setting, so it
    // must be set here; `useCache` is kept for backward compatibility.
    cache: true,
    useCache: true,
  });
}
|
|
378
|
+
|
|
379
|
+
/**
 * Refresh an existing mirror using the manifest stored in the output
 * directory.
 *
 * The options recorded in the manifest are reused, overridden by whatever
 * was passed on the command line for this invocation, with caching forced on
 * and the `update` flag set.
 *
 * @param {object} options - Options of the `update` command; `output` is the
 *   directory holding the existing mirror.
 * @returns {Promise<void>}
 * @throws {Error} If no manifest exists in `options.output`, or the manifest
 *   has no root URL to update from.
 */
async function updateCapture(options) {
  // `await` leaves a plain boolean unchanged and unwraps a promise, so this
  // check is correct whether manifestExists is synchronous or asynchronous.
  if (!(await manifestExists(options.output))) {
    throw new Error(
      'No capture found in the specified directory. Start a new capture first.',
    );
  }

  const manifest = await readManifest(options.output);
  if (!manifest?.rootUrl) {
    throw new Error('Capture manifest is missing or has no root URL.');
  }
  console.log(chalk.cyan(`Updating mirror of ${manifest.rootUrl}...`));

  await capture(manifest.rootUrl, {
    ...manifest.options,
    ...options,
    // capture() maps `options.cache` onto the crawler's cache setting, so it
    // must be set here; `useCache` is kept for backward compatibility.
    cache: true,
    useCache: true,
    update: true,
  });
}
|
|
396
|
+
|
|
397
|
+
/**
 * Convert a human-readable size such as "10MB", "1.5 kb" or "512" into bytes.
 *
 * Units are binary multiples (1 KB = 1024 B) and case-insensitive. The
 * trailing "B" is optional, so "10M" equals "10MB". A bare number means
 * bytes. A numeric argument is treated as an already-parsed byte count.
 *
 * @param {string|number|undefined|null} sizeStr - Size to parse.
 * @returns {number|undefined} Size in bytes, or `undefined` when the input is
 *   empty, zero-valued as a number, negative, or not a recognisable size.
 */
function parseSize(sizeStr) {
  if (!sizeStr) return undefined;

  // Already a byte count: accept it as-is rather than failing on `.match`.
  if (typeof sizeStr === 'number') {
    return Number.isFinite(sizeStr) && sizeStr >= 0 ? sizeStr : undefined;
  }

  const match = String(sizeStr)
    .trim()
    .match(/^(\d+(?:\.\d+)?)\s*([KMGT]?B?)$/i);
  if (!match) return undefined;

  const num = parseFloat(match[1]);
  // Only the prefix letter decides the multiplier; "" and "B" mean bytes.
  const prefix = match[2].charAt(0).toUpperCase();

  const multipliers = {
    K: 1024,
    M: 1024 ** 2,
    G: 1024 ** 3,
    T: 1024 ** 4,
  };
  return num * (multipliers[prefix] ?? 1);
}
|
|
408
|
+
|
|
409
|
+
/**
 * Parse a "WIDTHxHEIGHT" viewport specification.
 *
 * The separator may be "x", "X" or "×". Each dimension independently falls
 * back to its default (1920 wide, 1080 high) when it is missing, not a
 * number, or not positive. Fractional values are truncated to whole pixels.
 * An object that already has `width`/`height` is normalised the same way.
 *
 * @param {string|{width?: number, height?: number}|undefined|null} viewportStr
 *   Viewport specification.
 * @returns {{width: number, height: number}} Viewport dimensions in pixels.
 */
function parseViewport(viewportStr) {
  const DEFAULT_WIDTH = 1920;
  const DEFAULT_HEIGHT = 1080;

  // Keep a dimension only if it is a positive finite number.
  const pick = (value, fallback) => {
    const n = Math.trunc(Number(value));
    return Number.isFinite(n) && n > 0 ? n : fallback;
  };

  if (!viewportStr) return {width: DEFAULT_WIDTH, height: DEFAULT_HEIGHT};

  if (typeof viewportStr === 'object') {
    return {
      width: pick(viewportStr.width, DEFAULT_WIDTH),
      height: pick(viewportStr.height, DEFAULT_HEIGHT),
    };
  }

  const [width, height] = String(viewportStr).split(/[x×]/i);
  return {
    width: pick(width, DEFAULT_WIDTH),
    height: pick(height, DEFAULT_HEIGHT),
  };
}
|
|
414
|
+
|
|
415
|
+
/**
 * Shorten a URL for single-line display, ending it with "..." when cut.
 *
 * @param {string|undefined|null} url - Text to shorten; a missing value is
 *   treated as an empty string.
 * @param {number} maxLen - Maximum length of the returned string.
 * @returns {string} `url` unchanged when it fits, otherwise a string of at
 *   most `maxLen` characters. When `maxLen` is 3 or less there is no room
 *   for an ellipsis, so the text is simply cut.
 */
function truncateUrl(url, maxLen) {
  const text = String(url ?? '');
  if (text.length <= maxLen) return text;

  // A negative slice end would count from the end of the string and return
  // more text than requested, so handle tiny limits separately.
  if (maxLen <= 3) return text.slice(0, Math.max(0, maxLen));

  return `${text.slice(0, maxLen - 3)}...`;
}
|
|
419
|
+
|
|
420
|
+
/**
 * Format a byte count using binary units (B, KB, MB, GB, TB).
 *
 * Whole bytes are shown without decimals; larger units get one decimal.
 *
 * @param {number} bytes - Size in bytes.
 * @returns {string} For example "512 B", "1.5 KB" or "2.0 GB". Falsy or
 *   non-numeric input yields "0 B".
 */
function formatSize(bytes) {
  if (!bytes) return '0 B';

  let value = Number(bytes);
  if (Number.isNaN(value)) return '0 B';

  const units = ['B', 'KB', 'MB', 'GB', 'TB'];
  const lastUnit = units.length - 1;
  const decimals = index => (index > 0 ? 1 : 0);

  let i = 0;
  while (value >= 1024 && i < lastUnit) {
    value /= 1024;
    i++;
  }

  // Rounding can push the displayed number up to 1024 (e.g. 1023.97 KB would
  // print as "1024.0 KB"); carry into the next unit when that happens.
  if (i < lastUnit && Number(value.toFixed(decimals(i))) >= 1024) {
    value /= 1024;
    i++;
  }

  return `${value.toFixed(decimals(i))} ${units[i]}`;
}
|
|
430
|
+
|
|
431
|
+
/**
 * Format a duration given in milliseconds for display.
 *
 * - under one second: "<n>ms"
 * - under one minute: seconds with one decimal, e.g. "1.5s"
 * - otherwise: whole minutes and seconds, e.g. "1m 30s"
 *
 * @param {number} ms - Duration in milliseconds.
 * @returns {string} Human-readable duration; "0ms" for non-finite input.
 */
function formatDuration(ms) {
  if (!Number.isFinite(ms)) return '0ms';
  if (ms < 1000) return `${ms}ms`;

  if (ms < 60000) {
    const seconds = (ms / 1000).toFixed(1);
    // Values just under a minute round up to "60.0"; show those as 1m 0s.
    if (seconds !== '60.0') return `${seconds}s`;
  }

  // Round the total first so the seconds part can never display as 60.
  const totalSeconds = Math.round(ms / 1000);
  const mins = Math.floor(totalSeconds / 60);
  const secs = totalSeconds % 60;
  return `${mins}m ${secs}s`;
}
|