clean-web-scraper 3.9.1 → 3.9.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. package/README.md +20 -3
  2. package/main.js +27 -27
  3. package/package.json +1 -1
package/README.md CHANGED
@@ -4,13 +4,13 @@ A powerful Node.js web scraper that extracts clean, readable content from websit
4
4
 
5
5
  ## ✨ Features
6
6
 
7
- - 🌐 Smart recursive web crawling of internal links
7
+ - 🌐 Smart web crawling of internal links
8
+ - 🔄 Smart retry mechanism with proxy fallback
8
9
  - 📝 Clean content extraction using Mozilla's Readability
9
10
  - 🧹 Smart content processing and cleaning
10
11
  - 🗂️ Maintains original URL structure in saved files
11
12
  - 🚫 Excludes unwanted paths from scraping
12
- - 🔄 Handles relative and absolute URLs like a pro
13
- - 🎯 No duplicate page visits
13
+ - 🚦 Configurable rate limiting and delays
14
14
  - 🤖 AI-friendly output formats (JSONL, CSV, clean text)
15
15
  - 📊 Rich metadata extraction
16
16
  - 📁 Combine results from multiple scrapers into a unified dataset
@@ -53,6 +53,23 @@ const scraper = new WebScraper({
53
53
  jsonlOutputPath: './example.com/train.jsonl', // Optional: Custom JSONL output path
54
54
  textOutputPath: "./example.com/texts", // Optional: Custom text output path
55
55
  csvOutputPath: "./example.com/train.csv", // Optional: Custom CSV output path
56
+ strictBaseURL: true, // Optional: Only scrape URLs from same domain
57
+ maxDepth: Infinity, // Optional: Maximum crawling depth
58
+ maxArticles: Infinity, // Optional: Maximum articles to scrape
59
+ concurrencyLimit: 2, // Optional: Limit concurrent requests
60
+ crawlingDelay: 1000, // Optional: Delay between requests (ms)
61
+
62
+ // Network options
63
+ axiosHeaders: {}, // Optional: Custom HTTP headers
64
+ axiosProxy: "http://proxy:port", // Optional: HTTP/HTTPS proxy
65
+ axiosMaxRetries: 5, // Optional: Max retry attempts
66
+ axiosRetryDelay: 40000, // Optional: Delay between retries (ms)
67
+ useProxyAsFallback: false, // Optional: Fallback to proxy on failure
68
+
69
+ // Puppeteer options for handling dynamic content
70
+ usePuppeteer: false, // Optional: Enable Puppeteer browser
71
+ puppeteerProxy: "http://127.0.0.1:2080", // Optional: Puppeteer proxy
72
+ puppeteerExecutablePath: "/path/to/chrome", // Optional: Custom browser path
56
73
  });
57
74
  await scraper.start();
58
75
  ```
package/main.js CHANGED
@@ -10,41 +10,41 @@ class WebScraper
10
10
  constructor ( config = {})
11
11
  {
12
12
  // Base configuration
13
- this.baseURL = baseURL;
14
- this.startURL = startURL || baseURL;
15
- this.strictBaseURL = strictBaseURL || true;
16
- this.maxDepth = maxDepth || Infinity;
17
- this.maxArticles = maxArticles || Infinity;
18
- this.concurrencyLimit = concurrencyLimit || 2;
19
- this.crawlingDelay = crawlingDelay || 1000;
13
+ this.baseURL = config.baseURL;
14
+ this.startURL = config.startURL || config.baseURL;
15
+ this.strictBaseURL = config.strictBaseURL || true;
16
+ this.maxDepth = config.maxDepth || Infinity;
17
+ this.maxArticles = config.maxArticles || Infinity;
18
+ this.concurrencyLimit = config.concurrencyLimit || 2;
19
+ this.crawlingDelay = config.crawlingDelay || 1000;
20
20
 
21
21
  // Output paths setup
22
- this.scrapResultPath = scrapResultPath || "./dataset";
23
- this.textOutputPath = textOutputPath || path.join( this.scrapResultPath, "texts" );
22
+ this.scrapResultPath = config.scrapResultPath || "./dataset";
23
+ this.textOutputPath = config.textOutputPath || path.join( this.scrapResultPath, "texts" );
24
24
  this.textOutputPathWithMeta = `${this.textOutputPath }_with_metadata`;
25
- this.jsonlOutputPath = jsonlOutputPath || path.join( this.scrapResultPath, "train.jsonl" );
25
+ this.jsonlOutputPath = config.jsonlOutputPath || path.join( this.scrapResultPath, "train.jsonl" );
26
26
  this.jsonlOutputPathWithMeta = this.jsonlOutputPath.replace( ".jsonl", "_with_metadata.jsonl" );
27
- this.csvOutputPath = csvOutputPath || path.join( this.scrapResultPath, "train.csv" );
27
+ this.csvOutputPath = config.csvOutputPath || path.join( this.scrapResultPath, "train.csv" );
28
28
  this.csvOutputPathWithMeta = this.csvOutputPath.replace( ".csv", "_with_metadata.csv" );
29
29
 
30
30
  // Metadata configuration
31
- this.includeMetadata = includeMetadata || false;
32
- this.metadataFields = new Set( metadataFields || [] );
31
+ this.includeMetadata = config.includeMetadata || false;
32
+ this.metadataFields = new Set( config.metadataFields || [] );
33
33
 
34
34
  // URL filtering setup
35
35
  this.visited = new Set();
36
- this.excludeList = this.normalizeExcludeList( excludeList );
37
- this.exactExcludeList = this.normalizeExcludeList( exactExcludeList );
38
- this.filterFileTypes = filterFileTypes || true;
39
- this.excludedFileTypes = excludedFileTypes || [".mp3", ".mp4", ".wav", ".avi", ".mov", ".pdf", ".zip", ".rar"];
40
- this.removeURLFragment = removeURLFragment || true;
36
+ this.excludeList = this.normalizeExcludeList( config.excludeList );
37
+ this.exactExcludeList = this.normalizeExcludeList( config.exactExcludeList );
38
+ this.filterFileTypes = config.filterFileTypes ?? true;
39
+ this.excludedFileTypes = config.excludedFileTypes || [".mp3", ".mp4", ".wav", ".avi", ".mov", ".pdf", ".zip", ".rar"];
40
+ this.removeURLFragment = config.removeURLFragment ?? true;
41
41
 
42
42
  // Network configuration
43
- this.axiosHeaders = axiosHeaders;
44
- this.axiosProxy = axiosProxy;
45
- this.axiosMaxRetries = axiosMaxRetries || 5;
46
- this.axiosRetryDelay = axiosRetryDelay || 40000;
47
- this.useProxyAsFallback = useProxyAsFallback || false;
43
+ this.axiosHeaders = config.axiosHeaders;
44
+ this.axiosProxy = config.axiosProxy;
45
+ this.axiosMaxRetries = config.axiosMaxRetries || 5;
46
+ this.axiosRetryDelay = config.axiosRetryDelay || 40000;
47
+ this.useProxyAsFallback = config.useProxyAsFallback || false;
48
48
  this.axiosOptions = {};
49
49
  if ( this.axiosHeaders )
50
50
  {
@@ -59,10 +59,10 @@ class WebScraper
59
59
  this.allProcessedContent = [];
60
60
 
61
61
  // Puppeteer configuration
62
- this.usePuppeteer = usePuppeteer || false;
63
- this.puppeteerProxy = puppeteerProxy; // http://127.0.0.1:2080
64
- this.puppeteerExecutablePath = puppeteerExecutablePath;
65
- this.puppeteerRealProxy = puppeteerRealProxy;
62
+ this.usePuppeteer = config.usePuppeteer || false;
63
+ this.puppeteerProxy = config.puppeteerProxy; // http://127.0.0.1:2080
64
+ this.puppeteerExecutablePath = config.puppeteerExecutablePath;
65
+ this.puppeteerRealProxy = config.puppeteerRealProxy;
66
66
  this.configurePuppeteer();
67
67
  }
68
68
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "clean-web-scraper",
3
- "version": "3.9.1",
3
+ "version": "3.9.3",
4
4
  "main": "main.js",
5
5
  "scripts": {
6
6
  "start": "node main.js",