clean-web-scraper 3.4.1 → 3.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -89,7 +89,8 @@ await WebScraper.combineResults('./combined', [docsScraper, blogScraper]);
89
89
  ```
90
90
 
91
91
  ```bash
92
- node example-usage.js
92
+ # Raise the Node.js heap limit to 8 GB (8192 MB) for large crawls
93
+ node --max-old-space-size=8192 example-usage.js
93
94
  ```
94
95
 
95
96
  ## 📤 Output
package/example-usage.js CHANGED
@@ -101,7 +101,9 @@ async function electronicintifada ()
101
101
  "https://electronicintifada.net/blog",
102
102
  "https://electronicintifada.net/people",
103
103
  "https://electronicintifada.net/location",
104
- "https://electronicintifada.net/file"
104
+ "https://electronicintifada.net/file",
105
+ "https://electronicintifada.net/bytopic/people",
106
+ "https://electronicintifada.net/comment/"
105
107
  ],
106
108
  exactExcludeList: [
107
109
  "https://electronicintifada.net",
@@ -115,6 +117,7 @@ async function electronicintifada ()
115
117
  textOutputPath: "./dataset/electronicintifada/texts",
116
118
  csvOutputPath: "./dataset/electronicintifada/train.csv",
117
119
  includeMetadata: true,
120
+ maxArticles: 1000,
118
121
  metadataFields: ["title", "description", "author"]
119
122
  });
120
123
  await scraper.start();
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "clean-web-scraper",
3
- "version": "3.4.1",
3
+ "version": "3.5.0",
4
4
  "main": "main.js",
5
5
  "scripts": {
6
6
  "start": "node main.js",
package/src/WebScraper.js CHANGED
@@ -13,6 +13,7 @@ class WebScraper
13
13
  baseURL,
14
14
  startURL,
15
15
  maxDepth = Infinity,
16
+ maxArticles = Infinity, // Maximum number of articles to collect (unlimited by default)
16
17
  excludeList,
17
18
  exactExcludeList,
18
19
  scrapResultPath = "./dataset",
@@ -33,6 +34,7 @@ class WebScraper
33
34
  this.baseURL = baseURL;
34
35
  this.startURL = startURL || baseURL;
35
36
  this.maxDepth = maxDepth;
37
+ this.maxArticles = maxArticles; // fetchPage stops once this many articles have been processed
36
38
  this.scrapResultPath = scrapResultPath;
37
39
  this.jsonlOutputPath = jsonlOutputPath || path.join( this.scrapResultPath, "train.jsonl" );
38
40
  this.textOutputPath = textOutputPath || path.join( this.scrapResultPath, "texts" );
@@ -113,6 +115,11 @@ class WebScraper
113
115
 
114
116
  async fetchPage ( url, depth )
115
117
  {
118
+ if ( this.allProcessedContent.length >= this.maxArticles )
119
+ {
120
+ console.log( `Reached maximum number of articles (${this.maxArticles})` );
121
+ return;
122
+ }
116
123
  if ( depth > this.maxDepth )
117
124
  {
118
125
  return;
@@ -125,6 +132,7 @@ class WebScraper
125
132
  try
126
133
  {
127
134
  const data = await this.caller( url );
135
+ if ( !data ) return;
128
136
  const dom = new JSDOM( data, { url });
129
137
  const { document } = dom.window;
130
138