clean-web-scraper 3.9.1 → 3.9.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +20 -3
- package/main.js +27 -27
- package/package.json +1 -1
package/README.md
CHANGED
@@ -4,13 +4,13 @@ A powerful Node.js web scraper that extracts clean, readable content from websit
 
 ## ✨ Features
 
-- 🌐 Smart
+- 🌐 Smart web crawling of internal links
+- 🔄 Smart retry mechanism with proxy fallback
 - 📖 Clean content extraction using Mozilla's Readability
 - 🧹 Smart content processing and cleaning
 - 🗂️ Maintains original URL structure in saved files
 - 🚫 Excludes unwanted paths from scraping
--
-- 🎯 No duplicate page visits
+- 🚦 Configurable rate limiting and delays
 - 🤖 AI-friendly output formats (JSONL, CSV, clean text)
 - 📊 Rich metadata extraction
 - 🔗 Combine results from multiple scrapers into a unified dataset
@@ -53,6 +53,23 @@ const scraper = new WebScraper({
   jsonlOutputPath: './example.com/train.jsonl', // Optional: Custom JSONL output path
   textOutputPath: "./example.com/texts", // Optional: Custom text output path
   csvOutputPath: "./example.com/train.csv", // Optional: Custom CSV output path
+  strictBaseURL: true, // Optional: Only scrape URLs from same domain
+  maxDepth: Infinity, // Optional: Maximum crawling depth
+  maxArticles: Infinity, // Optional: Maximum articles to scrape
+  concurrencyLimit: 2, // Optional: Limit concurrent requests
+  crawlingDelay: 1000, // Optional: Delay between requests (ms)
+
+  // Network options
+  axiosHeaders: {}, // Optional: Custom HTTP headers
+  axiosProxy: "http://proxy:port", // Optional: HTTP/HTTPS proxy
+  axiosMaxRetries: 5, // Optional: Max retry attempts
+  axiosRetryDelay: 40000, // Optional: Delay between retries (ms)
+  useProxyAsFallback: false, // Optional: Fallback to proxy on failure
+
+  // Puppeteer options for handling dynamic content
+  usePuppeteer: false, // Optional: Enable Puppeteer browser
+  puppeteerProxy: "http://127.0.0.1:2080", // Optional: Puppeteer proxy
+  puppeteerExecutablePath: "/path/to/chrome", // Optional: Custom browser path
 });
 await scraper.start();
 ```
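The new network options in the README describe a retry loop with an optional proxy fallback. A rough sketch of how `axiosMaxRetries`, `axiosRetryDelay`, `axiosProxy`, and `useProxyAsFallback` could fit together (illustrative only, not the package's actual implementation; note that axios's `proxy` option takes an object such as `{ host, port }`, so the README's string form would need parsing first):

```js
const axios = require( "axios" );

// Hypothetical helper; names mirror the documented options.
async function fetchWithRetry ( url, config )
{
	const { axiosMaxRetries = 5, axiosRetryDelay = 40000, axiosProxy, useProxyAsFallback = false } = config;
	let options = {};
	for ( let attempt = 1; attempt <= axiosMaxRetries; attempt++ )
	{
		try
		{
			return await axios.get( url, options );
		}
		catch ( error )
		{
			if ( attempt === axiosMaxRetries ) throw error; // retries exhausted
			if ( useProxyAsFallback && axiosProxy )
			{
				// After a failure, route subsequent attempts through the proxy,
				// e.g. axiosProxy = { host: "127.0.0.1", port: 2080 }
				options = { proxy: axiosProxy };
			}
			await new Promise( resolve => setTimeout( resolve, axiosRetryDelay ) );
		}
	}
}
```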
package/main.js
CHANGED
@@ -10,41 +10,41 @@ class WebScraper
 	constructor ( config = {})
 	{
 		// Base configuration
-		this.baseURL = baseURL;
-		this.startURL = startURL || baseURL;
-		this.strictBaseURL = strictBaseURL || true;
-		this.maxDepth = maxDepth || Infinity;
-		this.maxArticles = maxArticles || Infinity;
-		this.concurrencyLimit = concurrencyLimit || 2;
-		this.crawlingDelay = crawlingDelay || 1000;
+		this.baseURL = config.baseURL;
+		this.startURL = config.startURL || config.baseURL;
+		this.strictBaseURL = config.strictBaseURL || true;
+		this.maxDepth = config.maxDepth || Infinity;
+		this.maxArticles = config.maxArticles || Infinity;
+		this.concurrencyLimit = config.concurrencyLimit || 2;
+		this.crawlingDelay = config.crawlingDelay || 1000;
 
 		// Output paths setup
-		this.scrapResultPath = scrapResultPath || "./dataset";
-		this.textOutputPath = textOutputPath || path.join( this.scrapResultPath, "texts" );
+		this.scrapResultPath = config.scrapResultPath || "./dataset";
+		this.textOutputPath = config.textOutputPath || path.join( this.scrapResultPath, "texts" );
 		this.textOutputPathWithMeta = `${this.textOutputPath }_with_metadata`;
-		this.jsonlOutputPath = jsonlOutputPath || path.join( this.scrapResultPath, "train.jsonl" );
+		this.jsonlOutputPath = config.jsonlOutputPath || path.join( this.scrapResultPath, "train.jsonl" );
 		this.jsonlOutputPathWithMeta = this.jsonlOutputPath.replace( ".jsonl", "_with_metadata.jsonl" );
-		this.csvOutputPath = csvOutputPath || path.join( this.scrapResultPath, "train.csv" );
+		this.csvOutputPath = config.csvOutputPath || path.join( this.scrapResultPath, "train.csv" );
 		this.csvOutputPathWithMeta = this.csvOutputPath.replace( ".csv", "_with_metadata.csv" );
 
 		// Metadata configuration
-		this.includeMetadata = includeMetadata || false;
-		this.metadataFields = new Set( metadataFields || [] );
+		this.includeMetadata = config.includeMetadata || false;
+		this.metadataFields = new Set( config.metadataFields || [] );
 
 		// URL filtering setup
 		this.visited = new Set();
-		this.excludeList = this.normalizeExcludeList( excludeList );
-		this.exactExcludeList = this.normalizeExcludeList( exactExcludeList );
-		this.filterFileTypes = filterFileTypes
-		this.excludedFileTypes = excludedFileTypes || [".mp3", ".mp4", ".wav", ".avi", ".mov", ".pdf", ".zip", ".rar"];
-		this.removeURLFragment = removeURLFragment
+		this.excludeList = this.normalizeExcludeList( config.excludeList );
+		this.exactExcludeList = this.normalizeExcludeList( config.exactExcludeList );
+		this.filterFileTypes = config.filterFileTypes ?? true;
+		this.excludedFileTypes = config.excludedFileTypes || [".mp3", ".mp4", ".wav", ".avi", ".mov", ".pdf", ".zip", ".rar"];
+		this.removeURLFragment = config.removeURLFragment ?? true;
 
 		// Network configuration
-		this.axiosHeaders = axiosHeaders;
-		this.axiosProxy = axiosProxy;
-		this.axiosMaxRetries = axiosMaxRetries || 5;
-		this.axiosRetryDelay = axiosRetryDelay || 40000;
-		this.useProxyAsFallback = useProxyAsFallback || false;
+		this.axiosHeaders = config.axiosHeaders;
+		this.axiosProxy = config.axiosProxy;
+		this.axiosMaxRetries = config.axiosMaxRetries || 5;
+		this.axiosRetryDelay = config.axiosRetryDelay || 40000;
+		this.useProxyAsFallback = config.useProxyAsFallback || false;
 		this.axiosOptions = {};
 		if ( this.axiosHeaders )
 		{
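A detail worth flagging in this hunk: `filterFileTypes` and `removeURLFragment` now default via nullish coalescing (`?? true`), which honors an explicit `false`, while `strictBaseURL` still uses `config.strictBaseURL || true` and therefore evaluates to `true` even when the caller passes `false`. A standalone illustration of the difference:

```js
// `||` falls back on any falsy value, so an explicit `false` is discarded:
console.log( false || true ); // true  (the caller's `false` is lost)
// `??` falls back only on null/undefined, so `false` survives:
console.log( false ?? true ); // false (the caller's choice is kept)
console.log( undefined ?? true ); // true (default applies only when unset)
```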
@@ -59,10 +59,10 @@ class WebScraper
 		this.allProcessedContent = [];
 
 		// Puppeteer configuration
-		this.usePuppeteer = usePuppeteer || false;
-		this.puppeteerProxy = puppeteerProxy; // http://127.0.0.1:2080
-		this.puppeteerExecutablePath = puppeteerExecutablePath;
-		this.puppeteerRealProxy = puppeteerRealProxy;
+		this.usePuppeteer = config.usePuppeteer || false;
+		this.puppeteerProxy = config.puppeteerProxy; // http://127.0.0.1:2080
+		this.puppeteerExecutablePath = config.puppeteerExecutablePath;
+		this.puppeteerRealProxy = config.puppeteerRealProxy;
 		this.configurePuppeteer();
 	}
 
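Taken together, these constructor changes fix a crash rather than a style issue: in 3.9.1 the body read bare identifiers (`baseURL`, `startURL`, and so on) that were never destructured from `config`, and reading an undeclared identifier throws a `ReferenceError` the moment the constructor runs. A minimal reproduction of the failure mode (sketch, not the package's code):

```js
class Broken
{
	constructor ( config = {})
	{
		this.baseURL = baseURL; // ReferenceError: baseURL is not defined
	}
}

class Fixed
{
	constructor ( config = {})
	{
		this.baseURL = config.baseURL; // reads the option off the config object
	}
}

try { new Broken({ baseURL: "https://example.com" }); }
catch ( error ) { console.error( error.message ); } // "baseURL is not defined"

console.log( new Fixed({ baseURL: "https://example.com" }).baseURL ); // https://example.com
```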