clean-web-scraper 3.4.1 → 3.5.1

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -89,7 +89,8 @@ await WebScraper.combineResults('./combined', [docsScraper, blogScraper]);
89
89
  ```
90
90
 
91
91
  ```bash
92
- node example-usage.js
92
+ # 8 GB RAM
93
+ node --max-old-space-size=8192 example-usage.js
93
94
  ```
94
95
 
95
96
  ## 📤 Output
package/example-usage.js CHANGED
@@ -27,13 +27,15 @@ async function khameneiIrFreePalestineTag ()
27
27
  includeMetadata: true,
28
28
  metadataFields: ["title", "description", "author"]
29
29
  });
30
- await scraper.start();
30
+ if ( enable )
31
+ {
32
+ await scraper.start();
33
+ }
31
34
  return scraper;
32
35
  }
33
36
 
34
- async function decolonizepalestine ()
37
+ async function decolonizepalestine ( enable )
35
38
  {
36
- // https://decolonizepalestine.com
37
39
  const scraper = new WebScraper({
38
40
  baseURL: "https://decolonizepalestine.com",
39
41
  excludeList: [
@@ -54,13 +56,15 @@ async function decolonizepalestine ()
54
56
  includeMetadata: true,
55
57
  metadataFields: ["title", "description", "author"]
56
58
  });
57
- await scraper.start();
59
+ if ( enable )
60
+ {
61
+ await scraper.start();
62
+ }
58
63
  return scraper;
59
64
  }
60
65
 
61
- async function bdsmovement ()
66
+ async function bdsmovement ( enable )
62
67
  {
63
- // https://bdsmovement.net
64
68
  const scraper = new WebScraper({
65
69
  baseURL: "https://bdsmovement.net",
66
70
  excludeList: [
@@ -83,15 +87,16 @@ async function bdsmovement ()
83
87
  host: "socks5://127.0.0.1",
84
88
  port: "2080",
85
89
  },
86
- // usePuppeteer: true
87
90
  });
88
- await scraper.start();
91
+ if ( enable )
92
+ {
93
+ await scraper.start();
94
+ }
89
95
  return scraper;
90
96
  }
91
97
 
92
- async function electronicintifada ()
98
+ async function electronicintifada ( enable )
93
99
  {
94
- // https://electronicintifada.net
95
100
  const scraper = new WebScraper({
96
101
  baseURL: "https://electronicintifada.net",
97
102
  excludeList: [
@@ -101,7 +106,10 @@ async function electronicintifada ()
101
106
  "https://electronicintifada.net/blog",
102
107
  "https://electronicintifada.net/people",
103
108
  "https://electronicintifada.net/location",
104
- "https://electronicintifada.net/file"
109
+ "https://electronicintifada.net/file",
110
+ "https://electronicintifada.net/bytopic/people",
111
+ "https://electronicintifada.net/comment/",
112
+ "https://electronicintifada.net/search/site/"
105
113
  ],
106
114
  exactExcludeList: [
107
115
  "https://electronicintifada.net",
@@ -115,23 +123,73 @@ async function electronicintifada ()
115
123
  textOutputPath: "./dataset/electronicintifada/texts",
116
124
  csvOutputPath: "./dataset/electronicintifada/train.csv",
117
125
  includeMetadata: true,
126
+ maxArticles: 2000,
118
127
  metadataFields: ["title", "description", "author"]
119
128
  });
120
- await scraper.start();
129
+ if ( enable )
130
+ {
131
+ await scraper.start();
132
+ }
121
133
  return scraper;
122
134
  }
123
135
 
136
+ async function palestineremembered ( enable )
137
+ {
138
+ const scraper = new WebScraper({
139
+ baseURL: "https://www.palestineremembered.com",
140
+ startURL: "https://www.palestineremembered.com/ZionistFAQ.html",
141
+ excludeList: [
142
+ "https://www.palestineremembered.com/GeoPoints",
143
+ "https://www.palestineremembered.com/Donate",
144
+ "https://www.palestineremembered.com/ContactUs.html",
145
+ "https://www.palestineremembered.com/tags/Looting-Palestinian-properties.html",
146
+ "https://www.palestineremembered.com/ar/",
147
+ "https://www.palestineremembered.com/OldNewPictures.html",
148
+ "https://www.palestineremembered.com/Maps/index.html",
149
+ "https://www.palestineremembered.com/OralHistory/Interviews-Listing/",
150
+ "https://www.palestineremembered.com/Acre/Famous-Zionist-Quotes/Story637.html",
151
+ "https://www.palestineremembered.com/Articles/General/Story2045.html",
152
+ "https://www.palestineremembered.com/AllTownsListing.html",
153
+ "https://www.palestineremembered.com/Articles/General/ar/",
154
+ "https://www.palestineremembered.com/SiteVideos.html"
155
+ ],
156
+ exactExcludeList: [
157
+ "https://www.palestineremembered.com/index.html",
158
+ "https://www.palestineremembered.com/ZionistFAQ.html"
159
+ ],
160
+ scrapResultPath: "./dataset/palestineremembered/website",
161
+ jsonlOutputPath: "./dataset/palestineremembered/train.jsonl",
162
+ textOutputPath: "./dataset/palestineremembered/texts",
163
+ csvOutputPath: "./dataset/palestineremembered/train.csv",
164
+ includeMetadata: true,
165
+ metadataFields: ["title", "description", "author"],
166
+ axiosProxy: {
167
+ host: "localhost",
168
+ port: 2080,
169
+ protocol: "http"
170
+ }
171
+ });
172
+ if ( enable )
173
+ {
174
+ await scraper.start();
175
+ }
176
+ return scraper;
177
+ }
178
+
179
+
124
180
  void async function main ()
125
181
  {
126
- const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag();
127
- const decolonizepalestineScraper = await decolonizepalestine();
128
- const bdsmovementScraper = await bdsmovement();
129
- const electronicintifadaScraper = await electronicintifada();
182
+ const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag( false );
183
+ const decolonizepalestineScraper = await decolonizepalestine( false );
184
+ const bdsmovementScraper = await bdsmovement( false );
185
+ const electronicintifadaScraper = await electronicintifada( false );
186
+ const palestinerememberedScraper = await palestineremembered( true );
130
187
  await WebScraper.combineResults( "./dataset/combined", [
131
188
  khameneiIrFreePalestineTagScraper,
132
189
  decolonizepalestineScraper,
133
190
  bdsmovementScraper,
134
- electronicintifadaScraper
191
+ electronicintifadaScraper,
192
+ palestinerememberedScraper
135
193
  ] );
136
194
 
137
195
  // 5
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "clean-web-scraper",
3
- "version": "3.4.1",
3
+ "version": "3.5.1",
4
4
  "main": "main.js",
5
5
  "scripts": {
6
6
  "start": "node main.js",
package/src/WebScraper.js CHANGED
@@ -13,6 +13,7 @@ class WebScraper
13
13
  baseURL,
14
14
  startURL,
15
15
  maxDepth = Infinity,
16
+ maxArticles = Infinity, // Add this line
16
17
  excludeList,
17
18
  exactExcludeList,
18
19
  scrapResultPath = "./dataset",
@@ -21,7 +22,8 @@ class WebScraper
21
22
  csvOutputPath,
22
23
  includeMetadata = false,
23
24
  metadataFields = [], // ['title', 'description', 'author', etc.]
24
- headers,
25
+ axiosHeaders,
26
+ axiosProxy,
25
27
  usePuppeteer,
26
28
  puppeteerProxy, // e.g. http://127.0.0.1:2080
27
29
  puppeteerExecutablePath,
@@ -33,13 +35,14 @@ class WebScraper
33
35
  this.baseURL = baseURL;
34
36
  this.startURL = startURL || baseURL;
35
37
  this.maxDepth = maxDepth;
38
+ this.maxArticles = maxArticles; // Add this line
36
39
  this.scrapResultPath = scrapResultPath;
37
40
  this.jsonlOutputPath = jsonlOutputPath || path.join( this.scrapResultPath, "train.jsonl" );
38
41
  this.textOutputPath = textOutputPath || path.join( this.scrapResultPath, "texts" );
39
42
  this.csvOutputPath = csvOutputPath || path.join( this.scrapResultPath, "train.csv" );
40
43
  this.jsonlOutputPathWithMeta = jsonlOutputPath.replace( ".jsonl", "_with_metadata.jsonl" );
41
44
  this.csvOutputPathWithMeta = csvOutputPath.replace( ".csv", "_with_metadata.csv" );
42
- this.headers = headers;
45
+ this.axiosHeaders = axiosHeaders;
43
46
  this.includeMetadata = includeMetadata;
44
47
  this.metadataFields = new Set( metadataFields );
45
48
  this.visited = new Set();
@@ -48,6 +51,7 @@ class WebScraper
48
51
  this.allProcessedContent = [];
49
52
  this.filterFileTypes = filterFileTypes;
50
53
  this.excludedFileTypes = excludedFileTypes;
54
+ this.axiosProxy = axiosProxy;
51
55
  this.usePuppeteer = usePuppeteer || false;
52
56
  this.puppeteerOptions = {
53
57
  headless: false,
@@ -113,6 +117,11 @@ class WebScraper
113
117
 
114
118
  async fetchPage ( url, depth )
115
119
  {
120
+ if ( this.allProcessedContent.length >= this.maxArticles )
121
+ {
122
+ console.log( `Reached maximum number of articles (${this.maxArticles})` );
123
+ return;
124
+ }
116
125
  if ( depth > this.maxDepth )
117
126
  {
118
127
  return;
@@ -125,6 +134,7 @@ class WebScraper
125
134
  try
126
135
  {
127
136
  const data = await this.caller( url );
137
+ if ( !data ) return;
128
138
  const dom = new JSDOM( data, { url });
129
139
  const { document } = dom.window;
130
140
 
@@ -172,9 +182,13 @@ class WebScraper
172
182
  try
173
183
  {
174
184
  let axiosOptions = {};
175
- if ( this.headers )
185
+ if ( this.axiosHeaders )
186
+ {
187
+ axiosOptions.headers = this.axiosHeaders;
188
+ }
189
+ if ( this.axiosProxy )
176
190
  {
177
- axiosOptions.headers = this.headers;
191
+ axiosOptions.proxy = this.axiosProxy;
178
192
  }
179
193
 
180
194
  // Step 1: Make a GET request with a small timeout and limited data download