smippo 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +116 -0
- package/bin/smippo.js +5 -0
- package/package.json +100 -0
- package/src/cli.js +437 -0
- package/src/crawler.js +408 -0
- package/src/filter.js +155 -0
- package/src/index.js +60 -0
- package/src/interactive.js +391 -0
- package/src/link-extractor.js +212 -0
- package/src/link-rewriter.js +293 -0
- package/src/manifest.js +163 -0
- package/src/page-capture.js +151 -0
- package/src/progress.js +190 -0
- package/src/resource-saver.js +210 -0
- package/src/robots.js +104 -0
- package/src/screenshot.js +185 -0
- package/src/server.js +603 -0
- package/src/utils/logger.js +74 -0
- package/src/utils/path.js +76 -0
- package/src/utils/url.js +295 -0
- package/src/utils/version.js +14 -0
package/src/crawler.js
ADDED
|
@@ -0,0 +1,408 @@
|
|
|
1
|
+
import {chromium, devices} from 'playwright';
|
|
2
|
+
import {EventEmitter} from 'events';
|
|
3
|
+
import PQueue from 'p-queue';
|
|
4
|
+
import fs from 'fs-extra';
|
|
5
|
+
import {PageCapture} from './page-capture.js';
|
|
6
|
+
import {ResourceSaver} from './resource-saver.js';
|
|
7
|
+
import {Filter} from './filter.js';
|
|
8
|
+
import {RobotsHandler} from './robots.js';
|
|
9
|
+
import {rewriteLinks, rewriteCssUrls} from './link-rewriter.js';
|
|
10
|
+
import {normalizeUrl, isLikelyPage} from './utils/url.js';
|
|
11
|
+
import {Logger} from './utils/logger.js';
|
|
12
|
+
import {
|
|
13
|
+
createManifest,
|
|
14
|
+
writeManifest,
|
|
15
|
+
readManifest,
|
|
16
|
+
readCache,
|
|
17
|
+
writeCache,
|
|
18
|
+
addPageToManifest,
|
|
19
|
+
addAssetToManifest,
|
|
20
|
+
addErrorToManifest,
|
|
21
|
+
finalizeManifest,
|
|
22
|
+
getHarPath,
|
|
23
|
+
getLogPath,
|
|
24
|
+
} from './manifest.js';
|
|
25
|
+
|
|
26
|
+
/**
 * Main crawler class.
 *
 * Drives a Playwright Chromium browser over a start URL, captures each page
 * through `PageCapture`, stores pages and assets through `ResourceSaver`, and
 * records everything in a manifest.
 *
 * Events emitted:
 *  - `page:start`    `{url}`
 *  - `asset:save`    `{url, localPath, size}`
 *  - `page:complete` `{url, localPath, size, linksFound}`
 *  - `error`         `{url, error}` (only when an `error` listener is attached)
 */
export class Crawler extends EventEmitter {
  /**
   * @param {object} options - Crawl options. `url` and `output` are read
   *   unconditionally; the remaining fields are forwarded to the saver,
   *   filter, robots handler, logger and browser setup.
   */
  constructor(options) {
    super();
    this.options = options;
    this.url = normalizeUrl(options.url);
    this.depth = options.depth || 0;
    this.visited = new Set();
    this.queue = new PQueue({concurrency: options.concurrency || 8});
    this.startTime = null;
    this.browser = null;
    this.context = null;
    this.manifest = null;
    this.cache = null;

    this.saver = new ResourceSaver({
      output: options.output,
      structure: options.structure,
    });

    this.filter = new Filter({
      baseUrl: this.url,
      scope: options.scope,
      stayInDir: options.stayInDir,
      externalAssets: options.externalAssets,
      include: options.include,
      exclude: options.exclude,
      mimeInclude: options.mimeInclude,
      mimeExclude: options.mimeExclude,
      maxSize: options.maxSize,
      minSize: options.minSize,
    });

    this.robots = new RobotsHandler({
      ignoreRobots: options.ignoreRobots,
      userAgent: options.userAgent,
    });

    this.logger = new Logger({
      verbose: options.verbose,
      quiet: options.quiet,
      logFile: options.logFile || getLogPath(options.output),
    });
  }

  /**
   * Start the crawl.
   *
   * Launches the browser, loads (or creates) the manifest and cache, crawls
   * from the start URL, waits for the queue to drain, then writes the
   * manifest and cache. The browser is always closed, even on failure.
   *
   * @returns {Promise<{stats: object, manifest: object}>}
   */
  async start() {
    this.startTime = Date.now();

    try {
      // Initialize browser
      await this._initBrowser();

      // Load or create manifest
      if (this.options.useCache) {
        this.manifest = await readManifest(this.options.output);
        this.cache = await readCache(this.options.output);
      }

      if (!this.manifest) {
        this.manifest = createManifest(this.url, this.options);
      }

      if (!this.cache) {
        this.cache = {etags: {}, lastModified: {}, contentTypes: {}};
      }

      // Ensure output directory exists
      await fs.ensureDir(this.options.output);

      // Start crawling
      await this._crawl(this.url, this.depth);

      // Wait for queue to finish
      await this.queue.onIdle();

      // Finalize
      const duration = Date.now() - this.startTime;
      finalizeManifest(this.manifest, duration);
      await writeManifest(this.options.output, this.manifest);
      await writeCache(this.options.output, this.cache);
      await this.logger.flush();

      return {
        stats: this.manifest.stats,
        manifest: this.manifest,
      };
    } finally {
      await this._closeBrowser();
    }
  }

  /**
   * Initialize the browser and a browser context configured from the
   * options (viewport, user agent, device emulation, proxy, HAR recording,
   * cookies and extra HTTP headers).
   */
  async _initBrowser() {
    const launchOptions = {
      headless: !this.options.debug,
    };

    this.browser = await chromium.launch(launchOptions);

    const contextOptions = {
      viewport: this.options.viewport,
      userAgent: this.options.userAgent,
    };

    // Apply device emulation (device descriptor fields override the above)
    if (this.options.device && devices[this.options.device]) {
      Object.assign(contextOptions, devices[this.options.device]);
    }

    // Set up proxy
    if (this.options.proxy) {
      contextOptions.proxy = {server: this.options.proxy};
    }

    // Record HAR if enabled
    if (this.options.har) {
      contextOptions.recordHar = {
        path: getHarPath(this.options.output),
        mode: 'full',
      };
    }

    this.context = await this.browser.newContext(contextOptions);

    // Load cookies if provided (options.cookies is a path to a JSON file)
    if (this.options.cookies) {
      const cookies = await fs.readJson(this.options.cookies);
      await this.context.addCookies(cookies);
    }

    // Set extra headers
    if (this.options.headers && Object.keys(this.options.headers).length > 0) {
      await this.context.setExtraHTTPHeaders(this.options.headers);
    }
  }

  /**
   * Close the browser context and the browser.
   *
   * The browser is closed even when closing the context throws, so a failed
   * context shutdown cannot leave the browser process running.
   */
  async _closeBrowser() {
    try {
      if (this.context) {
        await this.context.close();
      }
    } finally {
      if (this.browser) {
        await this.browser.close();
      }
    }
  }

  /**
   * Consider a URL for crawling and, if it passes every check, enqueue it.
   *
   * Checks, in order: already visited, max pages, max time, filter, and
   * robots.txt. Returns without enqueueing when any check fails.
   *
   * @param {string} url - URL to crawl.
   * @param {number} remainingDepth - Link depth still allowed below this URL.
   */
  async _crawl(url, remainingDepth) {
    // Normalize URL
    url = normalizeUrl(url);

    // Check if already visited
    if (this.visited.has(url)) {
      return;
    }

    // Check max pages limit
    if (this.options.maxPages && this.visited.size >= this.options.maxPages) {
      return;
    }

    // Check max time limit
    if (
      this.options.maxTime &&
      Date.now() - this.startTime >= this.options.maxTime
    ) {
      return;
    }

    // Check filter
    if (!this.filter.shouldFollow(url)) {
      this.logger.debug(`Filtered out: ${url}`);
      return;
    }

    // Check robots.txt
    const robotsAllowed = await this.robots.isAllowed(url, async robotsUrl => {
      let robotsPage = null;
      try {
        robotsPage = await this.context.newPage();
        const response = await robotsPage.goto(robotsUrl, {timeout: 10000});
        // Use the raw response body: page.content() would return the
        // browser's serialized DOM instead of the robots.txt text.
        return response?.ok() ? await response.text() : null;
      } catch {
        // Best effort: an unreachable robots.txt is reported as "no content".
        return null;
      } finally {
        if (robotsPage) {
          try {
            await robotsPage.close();
          } catch (closeError) {
            this.logger.debug(
              `Failed to close robots.txt page for ${robotsUrl}:`,
              closeError.message,
            );
          }
        }
      }
    });

    if (!robotsAllowed) {
      this.logger.debug(`Blocked by robots.txt: ${url}`);
      return;
    }

    // The robots check above is asynchronous, so another _crawl call may
    // have claimed this URL (or filled the page budget) in the meantime.
    if (this.visited.has(url)) {
      return;
    }
    if (this.options.maxPages && this.visited.size >= this.options.maxPages) {
      return;
    }

    // Mark as visited
    this.visited.add(url);

    // Add to queue. The returned promise is intentionally not awaited
    // (start() waits on queue.onIdle()), so attach a handler to keep an
    // unexpected failure from becoming an unhandled rejection.
    this.queue
      .add(async () => {
        await this._capturePage(url, remainingDepth);
      })
      .catch(error => {
        this.logger.error(`Unexpected failure while capturing ${url}`, error);
      });
  }

  /**
   * Capture a single page: render it, save its resources, rewrite CSS and
   * HTML links, save the HTML (plus screenshot/PDF when present), update the
   * manifest, and follow page links while depth remains.
   *
   * Failures are logged, recorded in the manifest and emitted as `error`
   * (when a listener exists); they do not reject.
   *
   * @param {string} url - Normalized URL of the page.
   * @param {number} remainingDepth - Link depth still allowed below this page.
   */
  async _capturePage(url, remainingDepth) {
    this.emit('page:start', {url});

    let page = null;

    try {
      // Rate limiting
      if (this.options.rateLimit > 0) {
        await sleep(this.options.rateLimit);
      }

      // Check crawl delay from robots.txt (value is treated as seconds)
      const crawlDelay = this.robots.getCrawlDelay(url);
      if (crawlDelay > 0) {
        await sleep(crawlDelay * 1000);
      }

      // Create page
      page = await this.context.newPage();

      // Capture the page
      const capture = new PageCapture(page, {
        wait: this.options.wait,
        waitTime: this.options.waitTime,
        timeout: this.options.timeout,
        screenshot: this.options.screenshot,
        pdf: this.options.pdf,
        mimeInclude: this.options.mimeInclude,
        mimeExclude: this.options.mimeExclude,
        maxSize: this.options.maxSize,
        minSize: this.options.minSize,
      });

      const result = await capture.capture(url);

      // Save resources
      const savedResources = await this.saver.saveResources(result.resources);

      for (const resource of savedResources) {
        addAssetToManifest(this.manifest, {
          url: resource.url,
          localPath: this.saver.getRelativePath(resource.localPath),
          mimeType: result.resources.get(resource.url)?.contentType,
          size: resource.size,
        });

        this.emit('asset:save', {
          url: resource.url,
          localPath: resource.localPath,
          size: resource.size,
        });
      }

      // Build URL map for link rewriting
      const urlMap = this.saver.getUrlMap();

      // Rewrite CSS files to fix asset URLs
      await this._rewriteCssFiles(result.resources, urlMap);

      // Rewrite links in HTML
      const rewrittenHtml = rewriteLinks(result.html, url, urlMap, {
        structure: this.options.structure,
        noJs: this.options.noJs,
        inlineCss: this.options.inlineCss,
      });
      const htmlSize = Buffer.byteLength(rewrittenHtml, 'utf8');

      // Save HTML
      const htmlPath = await this.saver.saveHtml(url, rewrittenHtml);

      // Update manifest
      addPageToManifest(this.manifest, {
        url,
        localPath: this.saver.getRelativePath(htmlPath),
        size: htmlSize,
        title: result.title,
      });

      // Save screenshot if captured
      if (result.screenshot) {
        await this.saver.saveScreenshot(url, result.screenshot);
      }

      // Save PDF if captured
      if (result.pdf) {
        await this.saver.savePdf(url, result.pdf);
      }

      this.emit('page:complete', {
        url,
        localPath: htmlPath,
        size: htmlSize,
        linksFound: result.links.pages.length,
      });

      // Continue crawling if depth allows
      if (remainingDepth > 0) {
        for (const link of result.links.pages) {
          if (!isLikelyPage(link)) {
            continue;
          }
          try {
            await this._crawl(link, remainingDepth - 1);
          } catch (linkError) {
            // A single bad link must not mark this (already saved) page as
            // failed or stop the remaining links from being followed.
            this.logger.debug(
              `Failed to enqueue ${link}:`,
              linkError.message,
            );
          }
        }
      }
    } catch (error) {
      this.logger.error(`Failed to capture ${url}`, error);
      addErrorToManifest(this.manifest, url, error);
      // EventEmitter throws when 'error' is emitted without a listener;
      // that would escape this handler and reject the queued task.
      if (this.listenerCount('error') > 0) {
        this.emit('error', {url, error});
      }
    } finally {
      if (page) {
        try {
          await page.close();
        } catch (closeError) {
          this.logger.debug(
            `Failed to close page for ${url}:`,
            closeError.message,
          );
        }
      }
    }
  }

  /**
   * Rewrite URLs in saved CSS files to point to local files.
   *
   * A resource is treated as CSS when its content type contains `text/css`
   * or its URL (with or without a query string / fragment) ends in `.css`.
   * Per-file failures are logged at debug level and skipped.
   *
   * @param {Iterable<[string, {contentType?: string}]>} resources - Captured
   *   resources keyed by URL.
   * @param {Map<string, string>} urlMap - Map from resource URL to the
   *   relative path it was saved under.
   */
  async _rewriteCssFiles(resources, urlMap) {
    const {joinPath, sanitizePath} = await import('./utils/path.js');

    for (const [resourceUrl, resource] of resources) {
      const contentType = resource.contentType || '';

      // Only process CSS files
      let isCss =
        contentType.includes('text/css') || resourceUrl.endsWith('.css');
      if (!isCss) {
        try {
          // Catches e.g. "style.css?v=3" served without a CSS content type.
          isCss = new URL(resourceUrl).pathname.endsWith('.css');
        } catch {
          // Not an absolute URL; the plain suffix check above already ran.
          isCss = false;
        }
      }
      if (!isCss) {
        continue;
      }

      try {
        // Get the local path where this CSS was saved
        const cssRelativePath = urlMap.get(resourceUrl);
        if (!cssRelativePath) continue;

        const cssLocalPath = joinPath(
          this.options.output,
          sanitizePath(cssRelativePath),
        );

        // Read the CSS file
        const cssContent = await fs.readFile(cssLocalPath, 'utf8');

        // Rewrite URLs in the CSS
        const rewrittenCss = rewriteCssUrls(
          cssContent,
          resourceUrl,
          urlMap,
          cssRelativePath,
          {structure: this.options.structure},
        );

        // Write the rewritten CSS back only when something changed
        if (rewrittenCss !== cssContent) {
          await fs.writeFile(cssLocalPath, rewrittenCss, 'utf8');
        }
      } catch (error) {
        // CSS rewriting is best effort; log and move on
        this.logger.debug(
          `Failed to rewrite CSS ${resourceUrl}:`,
          error.message,
        );
      }
    }
  }
}
|
|
405
|
+
|
|
406
|
+
/**
 * Resolve after roughly `ms` milliseconds.
 *
 * @param {number} ms - Delay passed to `setTimeout`.
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise(wake => {
    setTimeout(wake, ms);
  });
}
|
package/src/filter.js
ADDED
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
import {minimatch} from 'minimatch';
|
|
2
|
+
import {isInScope} from './utils/url.js';
|
|
3
|
+
|
|
4
|
+
/**
 * URL and resource filter.
 *
 * Decides which URLs are followed, which assets are downloaded, and which
 * resources are saved, based on scope, include/exclude URL patterns,
 * MIME-type patterns and size limits.
 */
export class Filter {
  /**
   * @param {object} [options]
   * @param {string} [options.baseUrl] - Base URL passed to the scope check.
   * @param {string} [options.scope='domain'] - Scope mode passed to the scope check.
   * @param {boolean} [options.stayInDir=false]
   * @param {boolean} [options.externalAssets=false]
   * @param {string[]} [options.include] - URL patterns; when non-empty a URL must match one.
   * @param {string[]} [options.exclude] - URL patterns; a match rejects the URL.
   * @param {string[]} [options.mimeInclude]
   * @param {string[]} [options.mimeExclude]
   * @param {number} [options.maxSize] - Upper size limit; no limit when null/undefined.
   * @param {number} [options.minSize] - Lower size limit; no limit when null/undefined.
   */
  constructor(options = {}) {
    this.baseUrl = options.baseUrl;
    this.scope = options.scope || 'domain';
    this.stayInDir = options.stayInDir || false;
    this.externalAssets = options.externalAssets || false;
    this.include = options.include || [];
    this.exclude = options.exclude || [];
    this.mimeInclude = options.mimeInclude || [];
    this.mimeExclude = options.mimeExclude || [];
    this.maxSize = options.maxSize;
    this.minSize = options.minSize;
  }

  /**
   * Check if a URL should be followed (for crawling).
   *
   * @param {string} url
   * @returns {boolean}
   */
  shouldFollow(url) {
    // Check scope
    if (!isInScope(url, this.baseUrl, this.scope, this.stayInDir)) {
      return false;
    }

    // Check exclude patterns first (higher priority)
    if (this.exclude.length > 0 && this.matchesPattern(url, this.exclude)) {
      return false;
    }

    // Check include patterns
    if (this.include.length > 0 && !this.matchesPattern(url, this.include)) {
      return false;
    }

    return true;
  }

  /**
   * Check if an asset URL should be downloaded.
   *
   * With `externalAssets` enabled only the exclude patterns apply;
   * otherwise the same rules as `shouldFollow` are used.
   *
   * @param {string} url
   * @returns {boolean}
   */
  shouldDownloadAsset(url) {
    if (this.externalAssets) {
      // Still check exclude patterns
      if (this.exclude.length > 0 && this.matchesPattern(url, this.exclude)) {
        return false;
      }
      return true;
    }

    return this.shouldFollow(url);
  }

  /**
   * Check if a resource should be saved based on MIME type.
   *
   * Parameters after `;` in the content type are ignored. A missing content
   * type is always accepted.
   *
   * @param {string} [contentType]
   * @returns {boolean}
   */
  shouldSaveByMime(contentType) {
    if (!contentType) return true;

    const mimeType = contentType.split(';')[0].trim().toLowerCase();

    // Check exclude patterns
    if (
      this.mimeExclude.length > 0 &&
      this.matchesMime(mimeType, this.mimeExclude)
    ) {
      return false;
    }

    // Check include patterns
    if (
      this.mimeInclude.length > 0 &&
      !this.matchesMime(mimeType, this.mimeInclude)
    ) {
      return false;
    }

    return true;
  }

  /**
   * Check if a resource should be saved based on size.
   *
   * A limit of `null` or `undefined` means "no limit"; comparing against
   * `null` directly would coerce it to 0 and reject every non-empty resource.
   *
   * @param {number} size
   * @returns {boolean}
   */
  shouldSaveBySize(size) {
    if (this.maxSize != null && size > this.maxSize) {
      return false;
    }
    if (this.minSize != null && size < this.minSize) {
      return false;
    }
    return true;
  }

  /**
   * Full check for a resource (URL rules, MIME type, size).
   *
   * @param {string} url
   * @param {string} [contentType]
   * @param {number} size
   * @returns {boolean}
   */
  shouldSave(url, contentType, size) {
    if (!this.shouldDownloadAsset(url)) return false;
    if (!this.shouldSaveByMime(contentType)) return false;
    if (!this.shouldSaveBySize(size)) return false;
    return true;
  }

  /**
   * Check if URL matches any of the patterns.
   *
   * A pattern without `*` is a prefix match. That is checked literally
   * (case-insensitively) first, because a glob `*` does not cross `/` and
   * so `prefix*` alone would miss deeper paths. Everything else is matched
   * with minimatch.
   *
   * @param {string} url
   * @param {string[]} patterns
   * @returns {boolean}
   */
  matchesPattern(url, patterns) {
    const lowerUrl = url.toLowerCase();

    return patterns.some(pattern => {
      if (
        !pattern.includes('*') &&
        lowerUrl.startsWith(pattern.toLowerCase())
      ) {
        return true;
      }

      // Convert HTTrack-style patterns to glob patterns
      const globPattern = this.toGlobPattern(pattern);
      return minimatch(url, globPattern, {nocase: true});
    });
  }

  /**
   * Check if MIME type matches any of the patterns.
   *
   * Supported pattern forms: an exact type (`text/css`), a type wildcard
   * (`image/*`), and the match-all forms `*` and `*&#47;*`. Matching is
   * case-insensitive and ignores surrounding whitespace in patterns.
   *
   * @param {string} mimeType
   * @param {string[]} patterns
   * @returns {boolean}
   */
  matchesMime(mimeType, patterns) {
    const candidate = String(mimeType).toLowerCase();

    return patterns.some(rawPattern => {
      const pattern = rawPattern.trim().toLowerCase();
      if (pattern === '*' || pattern === '*/*') {
        return true;
      }
      if (pattern.endsWith('/*')) {
        // Keep the trailing slash so "image/*" does not match "imagex/png"
        return candidate.startsWith(pattern.slice(0, -1));
      }
      return candidate === pattern;
    });
  }

  /**
   * Convert HTTrack-style filter to glob pattern.
   *
   * @param {string} pattern
   * @returns {string} The pattern unchanged when it contains `*`, otherwise
   *   the pattern with `*` appended.
   */
  toGlobPattern(pattern) {
    // If already a glob pattern (contains *), use as is
    if (pattern.includes('*')) {
      return pattern;
    }

    // Otherwise, treat as a prefix match
    return pattern + '*';
  }
}
|
|
149
|
+
|
|
150
|
+
/**
 * Build a `Filter` instance from the given options.
 *
 * @param {object} options - Passed unchanged to the `Filter` constructor.
 * @returns {Filter}
 */
export function createFilter(options) {
  const filter = new Filter(options);
  return filter;
}
|
package/src/index.js
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Smippo - Modern Website Copier
|
|
3
|
+
*
|
|
4
|
+
* A Playwright-powered website mirroring tool for the JavaScript age.
|
|
5
|
+
* Captures fully rendered pages with all network artifacts.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
export {Crawler} from './crawler.js';
|
|
9
|
+
export {PageCapture} from './page-capture.js';
|
|
10
|
+
export {ResourceSaver} from './resource-saver.js';
|
|
11
|
+
export {Filter, createFilter} from './filter.js';
|
|
12
|
+
export {RobotsHandler} from './robots.js';
|
|
13
|
+
export {extractLinks, extractCssUrls} from './link-extractor.js';
|
|
14
|
+
export {rewriteLinks, rewriteCssUrls} from './link-rewriter.js';
|
|
15
|
+
export {
|
|
16
|
+
createManifest,
|
|
17
|
+
readManifest,
|
|
18
|
+
writeManifest,
|
|
19
|
+
readCache,
|
|
20
|
+
writeCache,
|
|
21
|
+
manifestExists,
|
|
22
|
+
} from './manifest.js';
|
|
23
|
+
export {
|
|
24
|
+
normalizeUrl,
|
|
25
|
+
resolveUrl,
|
|
26
|
+
urlToPath,
|
|
27
|
+
isInScope,
|
|
28
|
+
isSameOrigin,
|
|
29
|
+
isSameDomain,
|
|
30
|
+
isLikelyPage,
|
|
31
|
+
isAsset,
|
|
32
|
+
} from './utils/url.js';
|
|
33
|
+
export {createServer, serve} from './server.js';
|
|
34
|
+
|
|
35
|
+
/**
 * Quick capture function for simple use cases.
 *
 * Creates a `Crawler` for `url` and runs it. Defaults: `output` is
 * `'./site'`, `depth` is `0`, `scope` is `'domain'`. All other options are
 * forwarded to the crawler unchanged.
 *
 * @param {string} url - URL to capture.
 * @param {object} [options] - Crawler options.
 * @returns {Promise<object>} Whatever `Crawler#start` resolves with.
 */
export async function capture(url, options = {}) {
  const {Crawler} = await import('./crawler.js');

  const crawler = new Crawler({
    // Spread first so the normalized values below always win; spreading
    // last let an explicit `undefined` (or a stray `options.url`) overwrite
    // the defaults and the positional `url`.
    ...options,
    url,
    output: options.output || './site',
    depth: options.depth ?? 0,
    scope: options.scope || 'domain',
  });

  return crawler.start();
}
|
|
51
|
+
|
|
52
|
+
/**
 * Default export: the quick `capture` helper together with the `Crawler`
 * class and `createServer`, both loaded via dynamic import at module
 * evaluation time.
 */
const serverModule = await import('./server.js');
const crawlerModule = await import('./crawler.js');

export default {
  capture,
  Crawler: crawlerModule.Crawler,
  createServer: serverModule.createServer,
};
|