clean-web-scraper 3.4.1 → 3.5.0
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.
- package/README.md +2 -1
- package/example-usage.js +4 -1
- package/package.json +1 -1
- package/src/WebScraper.js +8 -0
package/README.md
CHANGED
package/example-usage.js
CHANGED
|
@@ -101,7 +101,9 @@ async function electronicintifada ()
|
|
|
101
101
|
"https://electronicintifada.net/blog",
|
|
102
102
|
"https://electronicintifada.net/people",
|
|
103
103
|
"https://electronicintifada.net/location",
|
|
104
|
-
"https://electronicintifada.net/file"
|
|
104
|
+
"https://electronicintifada.net/file",
|
|
105
|
+
"https://electronicintifada.net/bytopic/people",
|
|
106
|
+
"https://electronicintifada.net/comment/"
|
|
105
107
|
],
|
|
106
108
|
exactExcludeList: [
|
|
107
109
|
"https://electronicintifada.net",
|
|
@@ -115,6 +117,7 @@ async function electronicintifada ()
|
|
|
115
117
|
textOutputPath: "./dataset/electronicintifada/texts",
|
|
116
118
|
csvOutputPath: "./dataset/electronicintifada/train.csv",
|
|
117
119
|
includeMetadata: true,
|
|
120
|
+
maxArticles: 1000,
|
|
118
121
|
metadataFields: ["title", "description", "author"]
|
|
119
122
|
});
|
|
120
123
|
await scraper.start();
|
package/package.json
CHANGED
package/src/WebScraper.js
CHANGED
|
@@ -13,6 +13,7 @@ class WebScraper
|
|
|
13
13
|
baseURL,
|
|
14
14
|
startURL,
|
|
15
15
|
maxDepth = Infinity,
|
|
16
|
+
maxArticles = Infinity, // Add this line
|
|
16
17
|
excludeList,
|
|
17
18
|
exactExcludeList,
|
|
18
19
|
scrapResultPath = "./dataset",
|
|
@@ -33,6 +34,7 @@ class WebScraper
|
|
|
33
34
|
this.baseURL = baseURL;
|
|
34
35
|
this.startURL = startURL || baseURL;
|
|
35
36
|
this.maxDepth = maxDepth;
|
|
37
|
+
this.maxArticles = maxArticles; // Add this line
|
|
36
38
|
this.scrapResultPath = scrapResultPath;
|
|
37
39
|
this.jsonlOutputPath = jsonlOutputPath || path.join( this.scrapResultPath, "train.jsonl" );
|
|
38
40
|
this.textOutputPath = textOutputPath || path.join( this.scrapResultPath, "texts" );
|
|
@@ -113,6 +115,11 @@ class WebScraper
|
|
|
113
115
|
|
|
114
116
|
async fetchPage ( url, depth )
|
|
115
117
|
{
|
|
118
|
+
if ( this.allProcessedContent.length >= this.maxArticles )
|
|
119
|
+
{
|
|
120
|
+
console.log( `Reached maximum number of articles (${this.maxArticles})` );
|
|
121
|
+
return;
|
|
122
|
+
}
|
|
116
123
|
if ( depth > this.maxDepth )
|
|
117
124
|
{
|
|
118
125
|
return;
|
|
@@ -125,6 +132,7 @@ class WebScraper
|
|
|
125
132
|
try
|
|
126
133
|
{
|
|
127
134
|
const data = await this.caller( url );
|
|
135
|
+
if ( !data ) return;
|
|
128
136
|
const dom = new JSDOM( data, { url });
|
|
129
137
|
const { document } = dom.window;
|
|
130
138
|
|