smippo 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/crawler.js ADDED
@@ -0,0 +1,408 @@
+ import {chromium, devices} from 'playwright';
+ import {EventEmitter} from 'events';
+ import PQueue from 'p-queue';
+ import fs from 'fs-extra';
+ import {PageCapture} from './page-capture.js';
+ import {ResourceSaver} from './resource-saver.js';
+ import {Filter} from './filter.js';
+ import {RobotsHandler} from './robots.js';
+ import {rewriteLinks, rewriteCssUrls} from './link-rewriter.js';
+ import {normalizeUrl, isLikelyPage} from './utils/url.js';
+ import {Logger} from './utils/logger.js';
+ import {
+   createManifest,
+   writeManifest,
+   readManifest,
+   readCache,
+   writeCache,
+   addPageToManifest,
+   addAssetToManifest,
+   addErrorToManifest,
+   finalizeManifest,
+   getHarPath,
+   getLogPath,
+ } from './manifest.js';
+
+ /**
+  * Main crawler class
+  */
+ export class Crawler extends EventEmitter {
+   constructor(options) {
+     super();
+     this.options = options;
+     this.url = normalizeUrl(options.url);
+     this.depth = options.depth || 0;
+     this.visited = new Set();
+     this.queue = new PQueue({concurrency: options.concurrency || 8});
+     this.startTime = null;
+     this.browser = null;
+     this.context = null;
+     this.manifest = null;
+     this.cache = null;
+
+     this.saver = new ResourceSaver({
+       output: options.output,
+       structure: options.structure,
+     });
+
+     this.filter = new Filter({
+       baseUrl: this.url,
+       scope: options.scope,
+       stayInDir: options.stayInDir,
+       externalAssets: options.externalAssets,
+       include: options.include,
+       exclude: options.exclude,
+       mimeInclude: options.mimeInclude,
+       mimeExclude: options.mimeExclude,
+       maxSize: options.maxSize,
+       minSize: options.minSize,
+     });
+
+     this.robots = new RobotsHandler({
+       ignoreRobots: options.ignoreRobots,
+       userAgent: options.userAgent,
+     });
+
+     this.logger = new Logger({
+       verbose: options.verbose,
+       quiet: options.quiet,
+       logFile: options.logFile || getLogPath(options.output),
+     });
+   }
+
+   /**
+    * Start the crawl
+    */
+   async start() {
+     this.startTime = Date.now();
+
+     try {
+       // Initialize browser
+       await this._initBrowser();
+
+       // Load or create manifest
+       if (this.options.useCache) {
+         this.manifest = await readManifest(this.options.output);
+         this.cache = await readCache(this.options.output);
+       }
+
+       if (!this.manifest) {
+         this.manifest = createManifest(this.url, this.options);
+       }
+
+       if (!this.cache) {
+         this.cache = {etags: {}, lastModified: {}, contentTypes: {}};
+       }
+
+       // Ensure output directory exists
+       await fs.ensureDir(this.options.output);
+
+       // Start crawling
+       await this._crawl(this.url, this.depth);
+
+       // Wait for queue to finish
+       await this.queue.onIdle();
+
+       // Finalize
+       const duration = Date.now() - this.startTime;
+       finalizeManifest(this.manifest, duration);
+       await writeManifest(this.options.output, this.manifest);
+       await writeCache(this.options.output, this.cache);
+       await this.logger.flush();
+
+       return {
+         stats: this.manifest.stats,
+         manifest: this.manifest,
+       };
+     } finally {
+       await this._closeBrowser();
+     }
+   }
+
+   /**
+    * Initialize the browser
+    */
+   async _initBrowser() {
+     const launchOptions = {
+       headless: !this.options.debug,
+     };
+
+     this.browser = await chromium.launch(launchOptions);
+
+     const contextOptions = {
+       viewport: this.options.viewport,
+       userAgent: this.options.userAgent,
+     };
+
+     // Apply device emulation
+     if (this.options.device && devices[this.options.device]) {
+       Object.assign(contextOptions, devices[this.options.device]);
+     }
+
+     // Set up proxy
+     if (this.options.proxy) {
+       contextOptions.proxy = {server: this.options.proxy};
+     }
+
+     // Record HAR if enabled
+     if (this.options.har) {
+       contextOptions.recordHar = {
+         path: getHarPath(this.options.output),
+         mode: 'full',
+       };
+     }
+
+     this.context = await this.browser.newContext(contextOptions);
+
+     // Load cookies if provided
+     if (this.options.cookies) {
+       const cookies = await fs.readJson(this.options.cookies);
+       await this.context.addCookies(cookies);
+     }
+
+     // Set extra headers
+     if (this.options.headers && Object.keys(this.options.headers).length > 0) {
+       await this.context.setExtraHTTPHeaders(this.options.headers);
+     }
+   }
+
+   /**
+    * Close the browser
+    */
+   async _closeBrowser() {
+     if (this.context) {
+       await this.context.close();
+     }
+     if (this.browser) {
+       await this.browser.close();
+     }
+   }
+
+   /**
+    * Crawl a URL
+    */
+   async _crawl(url, remainingDepth) {
+     // Normalize URL
+     url = normalizeUrl(url);
+
+     // Check if already visited
+     if (this.visited.has(url)) {
+       return;
+     }
+
+     // Check max pages limit
+     if (this.options.maxPages && this.visited.size >= this.options.maxPages) {
+       return;
+     }
+
+     // Check max time limit
+     if (
+       this.options.maxTime &&
+       Date.now() - this.startTime >= this.options.maxTime
+     ) {
+       return;
+     }
+
+     // Check filter
+     if (!this.filter.shouldFollow(url)) {
+       this.logger.debug(`Filtered out: ${url}`);
+       return;
+     }
+
+     // Check robots.txt (use the raw response body, not the rendered DOM)
+     const robotsAllowed = await this.robots.isAllowed(url, async robotsUrl => {
+       try {
+         const page = await this.context.newPage();
+         const response = await page.goto(robotsUrl, {timeout: 10000});
+         const content = response?.ok() ? await response.text() : null;
+         await page.close();
+         return content;
+       } catch {
+         return null;
+       }
+     });
+
+     if (!robotsAllowed) {
+       this.logger.debug(`Blocked by robots.txt: ${url}`);
+       return;
+     }
+
+     // Mark as visited
+     this.visited.add(url);
+
+     // Add to queue
+     this.queue.add(async () => {
+       await this._capturePage(url, remainingDepth);
+     });
+   }
+
+   /**
+    * Capture a single page
+    */
+   async _capturePage(url, remainingDepth) {
+     this.emit('page:start', {url});
+
+     let page = null;
+
+     try {
+       // Rate limiting
+       if (this.options.rateLimit > 0) {
+         await sleep(this.options.rateLimit);
+       }
+
+       // Check crawl delay from robots.txt
+       const crawlDelay = this.robots.getCrawlDelay(url);
+       if (crawlDelay > 0) {
+         await sleep(crawlDelay * 1000);
+       }
+
+       // Create page
+       page = await this.context.newPage();
+
+       // Capture the page
+       const capture = new PageCapture(page, {
+         wait: this.options.wait,
+         waitTime: this.options.waitTime,
+         timeout: this.options.timeout,
+         screenshot: this.options.screenshot,
+         pdf: this.options.pdf,
+         mimeInclude: this.options.mimeInclude,
+         mimeExclude: this.options.mimeExclude,
+         maxSize: this.options.maxSize,
+         minSize: this.options.minSize,
+       });
+
+       const result = await capture.capture(url);
+
+       // Save resources
+       const savedResources = await this.saver.saveResources(result.resources);
+
+       for (const resource of savedResources) {
+         addAssetToManifest(this.manifest, {
+           url: resource.url,
+           localPath: this.saver.getRelativePath(resource.localPath),
+           mimeType: result.resources.get(resource.url)?.contentType,
+           size: resource.size,
+         });
+
+         this.emit('asset:save', {
+           url: resource.url,
+           localPath: resource.localPath,
+           size: resource.size,
+         });
+       }
+
+       // Build URL map for link rewriting
+       const urlMap = this.saver.getUrlMap();
+
+       // Rewrite CSS files to fix asset URLs
+       await this._rewriteCssFiles(result.resources, urlMap);
+
+       // Rewrite links in HTML
+       const rewrittenHtml = rewriteLinks(result.html, url, urlMap, {
+         structure: this.options.structure,
+         noJs: this.options.noJs,
+         inlineCss: this.options.inlineCss,
+       });
+
+       // Save HTML
+       const htmlPath = await this.saver.saveHtml(url, rewrittenHtml);
+
+       // Update manifest
+       addPageToManifest(this.manifest, {
+         url,
+         localPath: this.saver.getRelativePath(htmlPath),
+         size: Buffer.byteLength(rewrittenHtml, 'utf8'),
+         title: result.title,
+       });
+
+       // Save screenshot if captured
+       if (result.screenshot) {
+         await this.saver.saveScreenshot(url, result.screenshot);
+       }
+
+       // Save PDF if captured
+       if (result.pdf) {
+         await this.saver.savePdf(url, result.pdf);
+       }
+
+       this.emit('page:complete', {
+         url,
+         localPath: htmlPath,
+         size: Buffer.byteLength(rewrittenHtml, 'utf8'),
+         linksFound: result.links.pages.length,
+       });
+
+       // Continue crawling if depth allows
+       if (remainingDepth > 0) {
+         for (const link of result.links.pages) {
+           if (isLikelyPage(link)) {
+             await this._crawl(link, remainingDepth - 1);
+           }
+         }
+       }
+     } catch (error) {
+       this.logger.error(`Failed to capture ${url}`, error);
+       addErrorToManifest(this.manifest, url, error);
+       this.emit('error', {url, error});
+     } finally {
+       if (page) {
+         await page.close();
+       }
+     }
+   }
+
+   /**
+    * Rewrite URLs in CSS files to point to local files
+    */
+   async _rewriteCssFiles(resources, urlMap) {
+     const {joinPath, sanitizePath} = await import('./utils/path.js');
+
+     for (const [resourceUrl, resource] of resources) {
+       const contentType = resource.contentType || '';
+
+       // Only process CSS files
+       if (!contentType.includes('text/css') && !resourceUrl.endsWith('.css')) {
+         continue;
+       }
+
+       try {
+         // Get the local path where this CSS was saved
+         const cssRelativePath = urlMap.get(resourceUrl);
+         if (!cssRelativePath) continue;
+
+         const cssLocalPath = joinPath(
+           this.options.output,
+           sanitizePath(cssRelativePath),
+         );
+
+         // Read the CSS file
+         const cssContent = await fs.readFile(cssLocalPath, 'utf8');
+
+         // Rewrite URLs in the CSS
+         const rewrittenCss = rewriteCssUrls(
+           cssContent,
+           resourceUrl,
+           urlMap,
+           cssRelativePath,
+           {structure: this.options.structure},
+         );
+
+         // Write the rewritten CSS back
+         if (rewrittenCss !== cssContent) {
+           await fs.writeFile(cssLocalPath, rewrittenCss, 'utf8');
+         }
+       } catch (error) {
+         // Ignore CSS rewriting errors
+         this.logger.debug(
+           `Failed to rewrite CSS ${resourceUrl}:`,
+           error.message,
+         );
+       }
+     }
+   }
+ }
+
+ function sleep(ms) {
+   return new Promise(resolve => setTimeout(resolve, ms));
+ }
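
A minimal usage sketch of the Crawler class above, assuming it is imported from the package entry point (src/index.js, shown further down); the target URL and option values are illustrative only:

    import {Crawler} from 'smippo';

    // Option names mirror the constructor; events are those emitted in _capturePage().
    const crawler = new Crawler({
      url: 'https://example.com',  // hypothetical target
      output: './mirror',
      depth: 2,
      concurrency: 4,
    });

    crawler.on('page:complete', ({url, linksFound}) =>
      console.log(`saved ${url} (${linksFound} links)`),
    );
    crawler.on('error', ({url, error}) => console.warn(`failed ${url}: ${error.message}`));

    // Resolves once the queue is idle and the manifest has been written.
    const {stats} = await crawler.start();
    console.log(stats);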
package/src/filter.js ADDED
@@ -0,0 +1,155 @@
+ import {minimatch} from 'minimatch';
+ import {isInScope} from './utils/url.js';
+
+ /**
+  * URL and resource filter
+  */
+ export class Filter {
+   constructor(options = {}) {
+     this.baseUrl = options.baseUrl;
+     this.scope = options.scope || 'domain';
+     this.stayInDir = options.stayInDir || false;
+     this.externalAssets = options.externalAssets || false;
+     this.include = options.include || [];
+     this.exclude = options.exclude || [];
+     this.mimeInclude = options.mimeInclude || [];
+     this.mimeExclude = options.mimeExclude || [];
+     this.maxSize = options.maxSize;
+     this.minSize = options.minSize;
+   }
+
+   /**
+    * Check if a URL should be followed (for crawling)
+    */
+   shouldFollow(url) {
+     // Check scope
+     if (!isInScope(url, this.baseUrl, this.scope, this.stayInDir)) {
+       return false;
+     }
+
+     // Check exclude patterns first (higher priority)
+     if (this.exclude.length > 0) {
+       if (this.matchesPattern(url, this.exclude)) {
+         return false;
+       }
+     }
+
+     // Check include patterns
+     if (this.include.length > 0) {
+       if (!this.matchesPattern(url, this.include)) {
+         return false;
+       }
+     }
+
+     return true;
+   }
+
+   /**
+    * Check if an asset URL should be downloaded
+    */
+   shouldDownloadAsset(url) {
+     // For assets, we're more permissive if externalAssets is true
+     if (this.externalAssets) {
+       // Still check exclude patterns
+       if (this.exclude.length > 0 && this.matchesPattern(url, this.exclude)) {
+         return false;
+       }
+       return true;
+     }
+
+     // Otherwise, apply same rules as shouldFollow
+     return this.shouldFollow(url);
+   }
+
+   /**
+    * Check if a resource should be saved based on MIME type
+    */
+   shouldSaveByMime(contentType) {
+     if (!contentType) return true;
+
+     const mimeType = contentType.split(';')[0].trim().toLowerCase();
+
+     // Check exclude patterns
+     if (this.mimeExclude.length > 0) {
+       if (this.matchesMime(mimeType, this.mimeExclude)) {
+         return false;
+       }
+     }
+
+     // Check include patterns
+     if (this.mimeInclude.length > 0) {
+       if (!this.matchesMime(mimeType, this.mimeInclude)) {
+         return false;
+       }
+     }
+
+     return true;
+   }
+
+   /**
+    * Check if a resource should be saved based on size
+    */
+   shouldSaveBySize(size) {
+     if (this.maxSize !== undefined && size > this.maxSize) {
+       return false;
+     }
+     if (this.minSize !== undefined && size < this.minSize) {
+       return false;
+     }
+     return true;
+   }
+
+   /**
+    * Full check for a resource
+    */
+   shouldSave(url, contentType, size) {
+     if (!this.shouldDownloadAsset(url)) return false;
+     if (!this.shouldSaveByMime(contentType)) return false;
+     if (!this.shouldSaveBySize(size)) return false;
+     return true;
+   }
+
+   /**
+    * Check if URL matches any of the patterns
+    */
+   matchesPattern(url, patterns) {
+     return patterns.some(pattern => {
+       // Convert HTTrack-style patterns to glob patterns
+       const globPattern = this.toGlobPattern(pattern);
+       return minimatch(url, globPattern, {nocase: true});
+     });
+   }
+
+   /**
+    * Check if MIME type matches any of the patterns
+    */
+   matchesMime(mimeType, patterns) {
+     return patterns.some(pattern => {
+       pattern = pattern.toLowerCase();
+       if (pattern.endsWith('/*')) {
+         return mimeType.startsWith(pattern.slice(0, -1));
+       }
+       return mimeType === pattern;
+     });
+   }
+
+   /**
+    * Convert HTTrack-style filter to glob pattern
+    */
+   toGlobPattern(pattern) {
+     // If already a glob pattern (contains *), use as is
+     if (pattern.includes('*')) {
+       return pattern;
+     }
+
+     // Otherwise, treat as a prefix match
+     return pattern + '*';
+   }
+ }
+
+ /**
+  * Create a filter from options
+  */
+ export function createFilter(options) {
+   return new Filter(options);
+ }
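
A short sketch of how the Filter above treats HTTrack-style URL patterns and MIME wildcards. The expected results assume isInScope (defined in src/utils/url.js, which is not part of this diff) accepts same-domain URLs under the default 'domain' scope; the site and patterns are illustrative:

    import {createFilter} from 'smippo';

    const filter = createFilter({
      baseUrl: 'https://example.com/docs/',      // hypothetical site
      exclude: ['https://example.com/private'],  // no '*', so toGlobPattern() makes it a prefix match
      mimeInclude: ['text/*', 'image/png'],      // 'text/*' matches any text/... type in matchesMime()
    });

    filter.shouldFollow('https://example.com/docs/intro.html');  // true: in scope, not excluded
    filter.shouldFollow('https://example.com/private/admin');    // false: matches the excluded prefix
    filter.shouldSaveByMime('text/css; charset=utf-8');          // true: parameters are stripped before matching
    filter.shouldSaveByMime('application/zip');                  // false: not covered by mimeInclude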
package/src/index.js ADDED
@@ -0,0 +1,60 @@
+ /**
+  * Smippo - Modern Website Copier
+  *
+  * A Playwright-powered website mirroring tool for the JavaScript age.
+  * Captures fully rendered pages with all network artifacts.
+  */
+
+ export {Crawler} from './crawler.js';
+ export {PageCapture} from './page-capture.js';
+ export {ResourceSaver} from './resource-saver.js';
+ export {Filter, createFilter} from './filter.js';
+ export {RobotsHandler} from './robots.js';
+ export {extractLinks, extractCssUrls} from './link-extractor.js';
+ export {rewriteLinks, rewriteCssUrls} from './link-rewriter.js';
+ export {
+   createManifest,
+   readManifest,
+   writeManifest,
+   readCache,
+   writeCache,
+   manifestExists,
+ } from './manifest.js';
+ export {
+   normalizeUrl,
+   resolveUrl,
+   urlToPath,
+   isInScope,
+   isSameOrigin,
+   isSameDomain,
+   isLikelyPage,
+   isAsset,
+ } from './utils/url.js';
+ export {createServer, serve} from './server.js';
+
+ /**
+  * Quick capture function for simple use cases
+  */
+ export async function capture(url, options = {}) {
+   const {Crawler} = await import('./crawler.js');
+
+   const crawler = new Crawler({
+     url,
+     output: options.output || './site',
+     depth: options.depth ?? 0,
+     scope: options.scope || 'domain',
+     ...options,
+   });
+
+   return crawler.start();
+ }
+
+ /**
+  * Default export
+  */
+ const {createServer} = await import('./server.js');
+ export default {
+   capture,
+   Crawler: (await import('./crawler.js')).Crawler,
+   createServer,
+ };
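
A quick sketch of the capture() helper exported above, with a hypothetical URL and output path; it returns the {stats, manifest} object produced by Crawler.start():

    import {capture} from 'smippo';

    // depth defaults to 0, so this snapshots a single fully rendered page.
    const {stats, manifest} = await capture('https://example.com', {
      output: './snapshot',
      screenshot: true,  // forwarded to PageCapture via the crawler options
    });
    console.log(stats);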