clean-web-scraper 3.5.0 → 3.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/example-usage.js CHANGED
@@ -27,13 +27,15 @@ async function khameneiIrFreePalestineTag ()
27
27
  includeMetadata: true,
28
28
  metadataFields: ["title", "description", "author"]
29
29
  });
30
- await scraper.start();
30
+ if ( enable )
31
+ {
32
+ await scraper.start();
33
+ }
31
34
  return scraper;
32
35
  }
33
36
 
34
- async function decolonizepalestine ()
37
+ async function decolonizepalestine ( enable )
35
38
  {
36
- // https://decolonizepalestine.com
37
39
  const scraper = new WebScraper({
38
40
  baseURL: "https://decolonizepalestine.com",
39
41
  excludeList: [
@@ -54,13 +56,15 @@ async function decolonizepalestine ()
54
56
  includeMetadata: true,
55
57
  metadataFields: ["title", "description", "author"]
56
58
  });
57
- await scraper.start();
59
+ if ( enable )
60
+ {
61
+ await scraper.start();
62
+ }
58
63
  return scraper;
59
64
  }
60
65
 
61
- async function bdsmovement ()
66
+ async function bdsmovement ( enable )
62
67
  {
63
- // https://bdsmovement.net
64
68
  const scraper = new WebScraper({
65
69
  baseURL: "https://bdsmovement.net",
66
70
  excludeList: [
@@ -83,15 +87,16 @@ async function bdsmovement ()
83
87
  host: "socks5://127.0.0.1",
84
88
  port: "2080",
85
89
  },
86
- // usePuppeteer: true
87
90
  });
88
- await scraper.start();
91
+ if ( enable )
92
+ {
93
+ await scraper.start();
94
+ }
89
95
  return scraper;
90
96
  }
91
97
 
92
- async function electronicintifada ()
98
+ async function electronicintifada ( enable )
93
99
  {
94
- // https://electronicintifada.net
95
100
  const scraper = new WebScraper({
96
101
  baseURL: "https://electronicintifada.net",
97
102
  excludeList: [
@@ -103,7 +108,8 @@ async function electronicintifada ()
103
108
  "https://electronicintifada.net/location",
104
109
  "https://electronicintifada.net/file",
105
110
  "https://electronicintifada.net/bytopic/people",
106
- "https://electronicintifada.net/comment/"
111
+ "https://electronicintifada.net/comment/",
112
+ "https://electronicintifada.net/search/site/"
107
113
  ],
108
114
  exactExcludeList: [
109
115
  "https://electronicintifada.net",
@@ -117,24 +123,73 @@ async function electronicintifada ()
117
123
  textOutputPath: "./dataset/electronicintifada/texts",
118
124
  csvOutputPath: "./dataset/electronicintifada/train.csv",
119
125
  includeMetadata: true,
120
- maxArticles: 1000,
126
+ maxArticles: 2000,
121
127
  metadataFields: ["title", "description", "author"]
122
128
  });
123
- await scraper.start();
129
+ if ( enable )
130
+ {
131
+ await scraper.start();
132
+ }
124
133
  return scraper;
125
134
  }
126
135
 
136
+ async function palestineremembered ( enable )
137
+ {
138
+ const scraper = new WebScraper({
139
+ baseURL: "https://www.palestineremembered.com",
140
+ startURL: "https://www.palestineremembered.com/ZionistFAQ.html",
141
+ excludeList: [
142
+ "https://www.palestineremembered.com/GeoPoints",
143
+ "https://www.palestineremembered.com/Donate",
144
+ "https://www.palestineremembered.com/ContactUs.html",
145
+ "https://www.palestineremembered.com/tags/Looting-Palestinian-properties.html",
146
+ "https://www.palestineremembered.com/ar/",
147
+ "https://www.palestineremembered.com/OldNewPictures.html",
148
+ "https://www.palestineremembered.com/Maps/index.html",
149
+ "https://www.palestineremembered.com/OralHistory/Interviews-Listing/",
150
+ "https://www.palestineremembered.com/Acre/Famous-Zionist-Quotes/Story637.html",
151
+ "https://www.palestineremembered.com/Articles/General/Story2045.html",
152
+ "https://www.palestineremembered.com/AllTownsListing.html",
153
+ "https://www.palestineremembered.com/Articles/General/ar/",
154
+ "https://www.palestineremembered.com/SiteVideos.html"
155
+ ],
156
+ exactExcludeList: [
157
+ "https://www.palestineremembered.com/index.html",
158
+ "https://www.palestineremembered.com/ZionistFAQ.html"
159
+ ],
160
+ scrapResultPath: "./dataset/palestineremembered/website",
161
+ jsonlOutputPath: "./dataset/palestineremembered/train.jsonl",
162
+ textOutputPath: "./dataset/palestineremembered/texts",
163
+ csvOutputPath: "./dataset/palestineremembered/train.csv",
164
+ includeMetadata: true,
165
+ metadataFields: ["title", "description", "author"],
166
+ axiosProxy: {
167
+ host: "localhost",
168
+ port: 2080,
169
+ protocol: "http"
170
+ }
171
+ });
172
+ if ( enable )
173
+ {
174
+ await scraper.start();
175
+ }
176
+ return scraper;
177
+ }
178
+
179
+
127
180
  void async function main ()
128
181
  {
129
- const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag();
130
- const decolonizepalestineScraper = await decolonizepalestine();
131
- const bdsmovementScraper = await bdsmovement();
132
- const electronicintifadaScraper = await electronicintifada();
182
+ const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag( false );
183
+ const decolonizepalestineScraper = await decolonizepalestine( false );
184
+ const bdsmovementScraper = await bdsmovement( false );
185
+ const electronicintifadaScraper = await electronicintifada( false );
186
+ const palestinerememberedScraper = await palestineremembered( true );
133
187
  await WebScraper.combineResults( "./dataset/combined", [
134
188
  khameneiIrFreePalestineTagScraper,
135
189
  decolonizepalestineScraper,
136
190
  bdsmovementScraper,
137
- electronicintifadaScraper
191
+ electronicintifadaScraper,
192
+ palestinerememberedScraper
138
193
  ] );
139
194
 
140
195
  // 5
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "clean-web-scraper",
3
- "version": "3.5.0",
3
+ "version": "3.5.1",
4
4
  "main": "main.js",
5
5
  "scripts": {
6
6
  "start": "node main.js",
package/src/WebScraper.js CHANGED
@@ -22,7 +22,8 @@ class WebScraper
22
22
  csvOutputPath,
23
23
  includeMetadata = false,
24
24
  metadataFields = [], // ['title', 'description', 'author', etc.]
25
- headers,
25
+ axiosHeaders,
26
+ axiosProxy,
26
27
  usePuppeteer,
27
28
  puppeteerProxy, // e.g. http://127.0.0.1:2080
28
29
  puppeteerExecutablePath,
@@ -41,7 +42,7 @@ class WebScraper
41
42
  this.csvOutputPath = csvOutputPath || path.join( this.scrapResultPath, "train.csv" );
42
43
  this.jsonlOutputPathWithMeta = jsonlOutputPath.replace( ".jsonl", "_with_metadata.jsonl" );
43
44
  this.csvOutputPathWithMeta = csvOutputPath.replace( ".csv", "_with_metadata.csv" );
44
- this.headers = headers;
45
+ this.axiosHeaders = axiosHeaders;
45
46
  this.includeMetadata = includeMetadata;
46
47
  this.metadataFields = new Set( metadataFields );
47
48
  this.visited = new Set();
@@ -50,6 +51,7 @@ class WebScraper
50
51
  this.allProcessedContent = [];
51
52
  this.filterFileTypes = filterFileTypes;
52
53
  this.excludedFileTypes = excludedFileTypes;
54
+ this.axiosProxy = axiosProxy;
53
55
  this.usePuppeteer = usePuppeteer || false;
54
56
  this.puppeteerOptions = {
55
57
  headless: false,
@@ -180,9 +182,13 @@ class WebScraper
180
182
  try
181
183
  {
182
184
  let axiosOptions = {};
183
- if ( this.headers )
185
+ if ( this.axiosHeaders )
184
186
  {
185
- axiosOptions.headers = this.headers;
187
+ axiosOptions.headers = this.axiosHeaders;
188
+ }
189
+ if ( this.axiosProxy )
190
+ {
191
+ axiosOptions.proxy = this.axiosProxy;
186
192
  }
187
193
 
188
194
  // Step 1: Make a GET request with a small timeout and limited data download