clean-web-scraper 3.4.0 → 3.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -1
- package/example-usage.js +4 -1
- package/package.json +1 -1
- package/src/WebScraper.js +13 -1
package/README.md
CHANGED
package/example-usage.js
CHANGED
|
@@ -101,7 +101,9 @@ async function electronicintifada ()
|
|
|
101
101
|
"https://electronicintifada.net/blog",
|
|
102
102
|
"https://electronicintifada.net/people",
|
|
103
103
|
"https://electronicintifada.net/location",
|
|
104
|
-
"https://electronicintifada.net/file"
|
|
104
|
+
"https://electronicintifada.net/file",
|
|
105
|
+
"https://electronicintifada.net/bytopic/people",
|
|
106
|
+
"https://electronicintifada.net/comment/"
|
|
105
107
|
],
|
|
106
108
|
exactExcludeList: [
|
|
107
109
|
"https://electronicintifada.net",
|
|
@@ -115,6 +117,7 @@ async function electronicintifada ()
|
|
|
115
117
|
textOutputPath: "./dataset/electronicintifada/texts",
|
|
116
118
|
csvOutputPath: "./dataset/electronicintifada/train.csv",
|
|
117
119
|
includeMetadata: true,
|
|
120
|
+
maxArticles: 1000,
|
|
118
121
|
metadataFields: ["title", "description", "author"]
|
|
119
122
|
});
|
|
120
123
|
await scraper.start();
|
package/package.json
CHANGED
package/src/WebScraper.js
CHANGED
|
@@ -13,6 +13,7 @@ class WebScraper
|
|
|
13
13
|
baseURL,
|
|
14
14
|
startURL,
|
|
15
15
|
maxDepth = Infinity,
|
|
16
|
+
maxArticles = Infinity, // Add this line
|
|
16
17
|
excludeList,
|
|
17
18
|
exactExcludeList,
|
|
18
19
|
scrapResultPath = "./dataset",
|
|
@@ -33,6 +34,7 @@ class WebScraper
|
|
|
33
34
|
this.baseURL = baseURL;
|
|
34
35
|
this.startURL = startURL || baseURL;
|
|
35
36
|
this.maxDepth = maxDepth;
|
|
37
|
+
this.maxArticles = maxArticles; // Add this line
|
|
36
38
|
this.scrapResultPath = scrapResultPath;
|
|
37
39
|
this.jsonlOutputPath = jsonlOutputPath || path.join( this.scrapResultPath, "train.jsonl" );
|
|
38
40
|
this.textOutputPath = textOutputPath || path.join( this.scrapResultPath, "texts" );
|
|
@@ -113,18 +115,28 @@ class WebScraper
|
|
|
113
115
|
|
|
114
116
|
async fetchPage ( url, depth )
|
|
115
117
|
{
|
|
118
|
+
if ( this.allProcessedContent.length >= this.maxArticles )
|
|
119
|
+
{
|
|
120
|
+
console.log( `Reached maximum number of articles (${this.maxArticles})` );
|
|
121
|
+
return;
|
|
122
|
+
}
|
|
116
123
|
if ( depth > this.maxDepth )
|
|
117
124
|
{
|
|
118
125
|
return;
|
|
119
126
|
}
|
|
120
127
|
this.visited.add( url );
|
|
128
|
+
if ( !this.isValidFileType( url ) )
|
|
129
|
+
{
|
|
130
|
+
return;
|
|
131
|
+
}
|
|
121
132
|
try
|
|
122
133
|
{
|
|
123
134
|
const data = await this.caller( url );
|
|
135
|
+
if ( !data ) return;
|
|
124
136
|
const dom = new JSDOM( data, { url });
|
|
125
137
|
const { document } = dom.window;
|
|
126
138
|
|
|
127
|
-
if ( !this.isExcluded( url )
|
|
139
|
+
if ( !this.isExcluded( url ) )
|
|
128
140
|
{
|
|
129
141
|
const reader = new Readability( document, { charThreshold: 500, nbTopCandidates: 20 });
|
|
130
142
|
const article = reader.parse();
|