clean-web-scraper 3.4.0 → 3.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -89,7 +89,8 @@ await WebScraper.combineResults('./combined', [docsScraper, blogScraper]);
89
89
  ```
90
90
 
91
91
  ```bash
92
- node example-usage.js
92
+ # Raise Node's V8 old-space heap limit to 8 GB (the flag's value is in MiB)
93
+ node --max-old-space-size=8192 example-usage.js
93
94
  ```
94
95
 
95
96
  ## 📤 Output
package/example-usage.js CHANGED
@@ -101,7 +101,9 @@ async function electronicintifada ()
101
101
  "https://electronicintifada.net/blog",
102
102
  "https://electronicintifada.net/people",
103
103
  "https://electronicintifada.net/location",
104
- "https://electronicintifada.net/file"
104
+ "https://electronicintifada.net/file",
105
+ "https://electronicintifada.net/bytopic/people",
106
+ "https://electronicintifada.net/comment/"
105
107
  ],
106
108
  exactExcludeList: [
107
109
  "https://electronicintifada.net",
@@ -115,6 +117,7 @@ async function electronicintifada ()
115
117
  textOutputPath: "./dataset/electronicintifada/texts",
116
118
  csvOutputPath: "./dataset/electronicintifada/train.csv",
117
119
  includeMetadata: true,
120
+ maxArticles: 1000,
118
121
  metadataFields: ["title", "description", "author"]
119
122
  });
120
123
  await scraper.start();
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "clean-web-scraper",
3
- "version": "3.4.0",
3
+ "version": "3.5.0",
4
4
  "main": "main.js",
5
5
  "scripts": {
6
6
  "start": "node main.js",
package/src/WebScraper.js CHANGED
@@ -13,6 +13,7 @@ class WebScraper
13
13
  baseURL,
14
14
  startURL,
15
15
  maxDepth = Infinity,
16
+ maxArticles = Infinity, // Upper bound on processed articles; Infinity means no limit
16
17
  excludeList,
17
18
  exactExcludeList,
18
19
  scrapResultPath = "./dataset",
@@ -33,6 +34,7 @@ class WebScraper
33
34
  this.baseURL = baseURL;
34
35
  this.startURL = startURL || baseURL;
35
36
  this.maxDepth = maxDepth;
37
+ this.maxArticles = maxArticles; // fetchPage() returns early once this many articles are processed
36
38
  this.scrapResultPath = scrapResultPath;
37
39
  this.jsonlOutputPath = jsonlOutputPath || path.join( this.scrapResultPath, "train.jsonl" );
38
40
  this.textOutputPath = textOutputPath || path.join( this.scrapResultPath, "texts" );
@@ -113,18 +115,28 @@ class WebScraper
113
115
 
114
116
  async fetchPage ( url, depth )
115
117
  {
118
+ if ( this.allProcessedContent.length >= this.maxArticles )
119
+ {
120
+ console.log( `Reached maximum number of articles (${this.maxArticles})` );
121
+ return;
122
+ }
116
123
  if ( depth > this.maxDepth )
117
124
  {
118
125
  return;
119
126
  }
120
127
  this.visited.add( url );
128
+ if ( !this.isValidFileType( url ) )
129
+ {
130
+ return;
131
+ }
121
132
  try
122
133
  {
123
134
  const data = await this.caller( url );
135
+ if ( !data ) return;
124
136
  const dom = new JSDOM( data, { url });
125
137
  const { document } = dom.window;
126
138
 
127
- if ( !this.isExcluded( url ) && this.isValidFileType( url ) )
139
+ if ( !this.isExcluded( url ) )
128
140
  {
129
141
  const reader = new Readability( document, { charThreshold: 500, nbTopCandidates: 20 });
130
142
  const article = reader.parse();