clean-web-scraper 4.0.4 → 4.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/example-usage.js +36 -2
- package/package.json +1 -1
package/example-usage.js
CHANGED
@@ -167,7 +167,15 @@ async function mondoweiss ( enable )
|
|
167
167
|
"https://mondoweiss.net/recent-comments/",
|
168
168
|
"https://mondoweiss.net/email-newsletters",
|
169
169
|
"https://mondoweiss.net/author",
|
170
|
-
"https://mondoweiss.net/tag/"
|
170
|
+
"https://mondoweiss.net/tag/",
|
171
|
+
"https://mondoweiss.net/wp-login.php",
|
172
|
+
"https://mondoweiss.net/news/page/",
|
173
|
+
"https://mondoweiss.net/news-letters/page/",
|
174
|
+
"https://mondoweiss.net/opinion/page/",
|
175
|
+
"https://mondoweiss.net/podcasts/page/",
|
176
|
+
"https://mondoweiss.net/media-analysis/page/",
|
177
|
+
"https://mondoweiss.net/culture/page/",
|
178
|
+
"https://mondoweiss.net/activism/page/"
|
171
179
|
],
|
172
180
|
exactExcludeList: [
|
173
181
|
"https://mondoweiss.net",
|
@@ -177,7 +185,33 @@ async function mondoweiss ( enable )
|
|
177
185
|
"https://mondoweiss.net/media-analysis/",
|
178
186
|
"https://mondoweiss.net/culture/",
|
179
187
|
"https://mondoweiss.net/activism/",
|
180
|
-
"https://mondoweiss.net/news-letters/"
|
188
|
+
"https://mondoweiss.net/news-letters/",
|
189
|
+
"https://mondoweiss.net/newsletters",
|
190
|
+
"https://mondoweiss.net/2006/",
|
191
|
+
"https://mondoweiss.net/2007/",
|
192
|
+
"https://mondoweiss.net/2008/",
|
193
|
+
"https://mondoweiss.net/2009/",
|
194
|
+
"https://mondoweiss.net/2010/",
|
195
|
+
"https://mondoweiss.net/2011/",
|
196
|
+
"https://mondoweiss.net/2012/",
|
197
|
+
"https://mondoweiss.net/2013/",
|
198
|
+
"https://mondoweiss.net/2014/",
|
199
|
+
"https://mondoweiss.net/2015/",
|
200
|
+
"https://mondoweiss.net/2016/",
|
201
|
+
"https://mondoweiss.net/2017/",
|
202
|
+
"https://mondoweiss.net/2018/",
|
203
|
+
"https://mondoweiss.net/2019/",
|
204
|
+
"https://mondoweiss.net/2020/",
|
205
|
+
"https://mondoweiss.net/2021/",
|
206
|
+
"https://mondoweiss.net/2022/",
|
207
|
+
"https://mondoweiss.net/2023/",
|
208
|
+
"https://mondoweiss.net/2024/",
|
209
|
+
"https://mondoweiss.net/2025/",
|
210
|
+
"https://mondoweiss.net/daily-headlines",
|
211
|
+
"https://mondoweiss.net/palestineletter",
|
212
|
+
"https://mondoweiss.net/podcasts/",
|
213
|
+
"https://mondoweiss.net/the-shift",
|
214
|
+
"https://mondoweiss.net/weekly-briefing"
|
181
215
|
],
|
182
216
|
scrapResultPath: "./dataset/mondoweiss/website",
|
183
217
|
jsonlOutputPath: "./dataset/mondoweiss/train.jsonl",
|