clean-web-scraper 4.0.4 → 4.0.6

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (2):
  1. package/example-usage.js +38 -2
  2. package/package.json +1 -1
package/example-usage.js CHANGED
@@ -167,7 +167,15 @@ async function mondoweiss ( enable )
167
167
  "https://mondoweiss.net/recent-comments/",
168
168
  "https://mondoweiss.net/email-newsletters",
169
169
  "https://mondoweiss.net/author",
170
- "https://mondoweiss.net/tag/"
170
+ "https://mondoweiss.net/tag/",
171
+ "https://mondoweiss.net/wp-login.php",
172
+ "https://mondoweiss.net/news/page/",
173
+ "https://mondoweiss.net/news-letters/page/",
174
+ "https://mondoweiss.net/opinion/page/",
175
+ "https://mondoweiss.net/podcasts/page/",
176
+ "https://mondoweiss.net/media-analysis/page/",
177
+ "https://mondoweiss.net/culture/page/",
178
+ "https://mondoweiss.net/activism/page/"
171
179
  ],
172
180
  exactExcludeList: [
173
181
  "https://mondoweiss.net",
@@ -177,7 +185,33 @@ async function mondoweiss ( enable )
177
185
  "https://mondoweiss.net/media-analysis/",
178
186
  "https://mondoweiss.net/culture/",
179
187
  "https://mondoweiss.net/activism/",
180
- "https://mondoweiss.net/news-letters/"
188
+ "https://mondoweiss.net/news-letters/",
189
+ "https://mondoweiss.net/newsletters",
190
+ "https://mondoweiss.net/2006/",
191
+ "https://mondoweiss.net/2007/",
192
+ "https://mondoweiss.net/2008/",
193
+ "https://mondoweiss.net/2009/",
194
+ "https://mondoweiss.net/2010/",
195
+ "https://mondoweiss.net/2011/",
196
+ "https://mondoweiss.net/2012/",
197
+ "https://mondoweiss.net/2013/",
198
+ "https://mondoweiss.net/2014/",
199
+ "https://mondoweiss.net/2015/",
200
+ "https://mondoweiss.net/2016/",
201
+ "https://mondoweiss.net/2017/",
202
+ "https://mondoweiss.net/2018/",
203
+ "https://mondoweiss.net/2019/",
204
+ "https://mondoweiss.net/2020/",
205
+ "https://mondoweiss.net/2021/",
206
+ "https://mondoweiss.net/2022/",
207
+ "https://mondoweiss.net/2023/",
208
+ "https://mondoweiss.net/2024/",
209
+ "https://mondoweiss.net/2025/",
210
+ "https://mondoweiss.net/daily-headlines",
211
+ "https://mondoweiss.net/palestineletter",
212
+ "https://mondoweiss.net/podcasts/",
213
+ "https://mondoweiss.net/the-shift",
214
+ "https://mondoweiss.net/weekly-briefing"
181
215
  ],
182
216
  scrapResultPath: "./dataset/mondoweiss/website",
183
217
  jsonlOutputPath: "./dataset/mondoweiss/train.jsonl",
@@ -196,6 +230,8 @@ async function mondoweiss ( enable )
196
230
  useProxyAsFallback: true,
197
231
  includeMetadata: true,
198
232
  metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
233
+ crawlingDelay: 0,
234
+ batchSize: 10
199
235
  };
200
236
  return await runScraper( config, enable );
201
237
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "clean-web-scraper",
3
- "version": "4.0.4",
3
+ "version": "4.0.6",
4
4
  "main": "main.js",
5
5
  "scripts": {
6
6
  "start": "node main.js",