@data-fair/processing-web-scraper 0.1.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/index.js +53 -9
- package/package.json +3 -3
- package/processing-config-schema.json +1 -1
package/index.js
CHANGED
@@ -84,6 +84,7 @@ class PagesIterator {
     if (typeof page === 'string') page = { url: page }
     if (!this.processingConfig.baseURLs.find(b => page.url.startsWith(b))) return
     page.parsedURL = page.parsedURL || new URL(page.url)
+    if (page.parsedURL.hash) return
     if (this.robots[page.parsedURL.origin] && !this.robots[page.parsedURL.origin].isAllowed(page.url, this.pluginConfig.userAgent || 'data-fair-web-scraper')) {
       return
     }
@@ -139,15 +140,27 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
   const pages = new PagesIterator(log, pluginConfig, processingConfig, robots)

   await log.step('Init pages list')
-
-  for (const url of processingConfig.startURLs) await pages.push({ url, source: 'config start URLs' })
+  let existingPages
   if (processingConfig.datasetMode === 'update') {
-
+    existingPages = (await axios.get(`api/v1/datasets/${dataset.id}/lines`, { params: { select: '_id,url,etag,lastModified', size: 10000 } })).data.results
     await log.info(`add ${existingPages.length} pages from previous crawls`)
-    for (const page of existingPages)
+    for (const page of existingPages) {
+      page.parsedURL = new URL(page.url)
+      if (page.parsedURL.hash) {
+        const parentURL = new URL(page.parsedURL)
+        parentURL.hash = ''
+        page.parentId = getId({ url: parentURL.href })
+      }
+      await pages.push({ ...page, source: 'previous exploration' })
+    }
+  }
+  await log.info(`add ${processingConfig.startURLs.length} pages from config`)
+  for (const url of processingConfig.startURLs) {
+    await pages.push({ url, source: 'config start URLs' })
   }
   // TODO: init from sitemap (and use robots.getSitemaps() to help in this)

+  const sentIds = new Set([])
   const sendPage = async (page, data, contentType = 'text/html', filename = 'content.html') => {
     await log.debug('send page', page.url)
     // TODO: apply no-index rules
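The update-mode block above re-seeds the crawl with previously indexed pages and, for URLs that carry a #fragment, records the id of the parent page the fragment was extracted from. A minimal standalone sketch of that derivation, where assumedGetId is only a hypothetical stand-in for the plugin's getId() helper:

// Sketch only: assumedGetId stands in for the plugin's getId() helper,
// whose real implementation lives elsewhere in index.js.
const assumedGetId = (page) => Buffer.from(page.url).toString('base64url')

function attachParentId (page) {
  page.parsedURL = new URL(page.url)
  if (page.parsedURL.hash) {
    // strip the #fragment to point back at the page the fragment came from
    const parentURL = new URL(page.parsedURL)
    parentURL.hash = ''
    page.parentId = assumedGetId({ url: parentURL.href })
  }
  return page
}

// attachParentId({ url: 'https://example.com/doc#section-2' })
// -> parentId computed from 'https://example.com/doc'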
@@ -169,7 +182,10 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
       knownLength: data.length
     }
     form.append('attachment', data, dataOpts)
+    if (page.lastModified) form.append('lastModified', page.lastModified)
+    if (page.etag) form.append('etag', page.etag)
     page._id = getId(page)
+    sentIds.add(page._id)
     const headers = {
       ...form.getHeaders(),
       'content-length': form.getLengthSync()
@@ -189,10 +205,28 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
     await new Promise(resolve => setTimeout(resolve, crawlDelay * 1000))

     // TODO: apply if-none-match and if-modified-since headers if etag or lastModified are available
+    const headers = { 'user-agent': pluginConfig.userAgent || 'data-fair-web-scraper' }
+    if (page.lastModified) headers['if-modified-since'] = page.lastModified
+    if (page.etag) headers['if-none-match'] = page.etag
     let response
     try {
-      response = await axios.get(page.url, { headers })
+      response = await axios.get(page.url, { headers, maxRedirects: 0 })
     } catch (err) {
+      // content did not change
+      if (err.status === 304) {
+        await log.debug(`page was not modified since last exploration ${page.url}`)
+        sentIds.add(page._id)
+        for (const existingPage of existingPages) {
+          if (existingPage.parentId === page._id) sentIds.add(existingPage._id)
+        }
+        continue
+      }
+      // follow a redirect
+      if (err.status === 301) {
+        await log.debug(`page redirected ${page.url} -> ${err.headers.location}`)
+        pages.push({ url: new URL(err.headers.location, page.url).href, source: 'redirect ' + page.url })
+        continue
+      }
       await log.warning(`failed to fetch page ${page.url} - ${err.status || err.message}`)
       if (page.source) await log.warning(`this broken URL comes from ${page.source}`)
       continue
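This hunk is the heart of the incremental re-crawl: the stored etag / lastModified values become If-None-Match / If-Modified-Since request headers, a 304 answer marks the page (and its fragment pages) as still valid, and a 301 re-queues the redirect target. A standalone sketch of the same conditional-fetch pattern, assuming only axios; it uses validateStatus rather than the catch block the plugin relies on, and fetchIfChanged is an illustrative name, not part of the plugin:

const axios = require('axios')

// Illustrative helper: fetch a page only if it changed since the validators
// recorded during the previous crawl.
async function fetchIfChanged (page, userAgent = 'data-fair-web-scraper') {
  const headers = { 'user-agent': userAgent }
  if (page.lastModified) headers['if-modified-since'] = page.lastModified
  if (page.etag) headers['if-none-match'] = page.etag

  const response = await axios.get(page.url, {
    headers,
    maxRedirects: 0,
    // accept 304 (not modified) and 301 (moved) instead of throwing
    validateStatus: s => (s >= 200 && s < 300) || s === 304 || s === 301
  })

  if (response.status === 304) return { changed: false }
  if (response.status === 301) {
    return { changed: false, redirectedTo: new URL(response.headers.location, page.url).href }
  }
  // remember the validators for the next crawl
  page.lastModified = response.headers['last-modified']
  page.etag = response.headers.etag
  return { changed: true, data: response.data }
}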
@@ -205,6 +239,9 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
       }
     }

+    page.lastModified = response.headers['last-modified']
+    page.etag = response.headers.etag
+
     const isHTML = (response.headers['content-type'] && response.headers['content-type'].startsWith('text/html;')) || (typeof response.data === 'string' && response.data.trim().startsWith('<html'))
     if (isHTML) {
       const cheerio = require('cheerio')
@@ -237,10 +274,9 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
       const fragment = anchor.wrapperSelector ? targetElement.closest(anchor.wrapperSelector) : targetElement
       const fragmentHtml = fragment.html()
       if (fragmentHtml) {
-        const anchorPage = { url: parsedURL.href }
+        const anchorPage = { url: parsedURL.href, tags: anchor.tags || [], source: 'anchor ' + page.url }
         if (anchor.titleSelector) anchorPage.title = fragment.find(anchor.titleSelector).text() || page.title
         else anchorPage.title = targetElement.text() || page.title
-        anchorPage.tags = anchor.tags || []
         anchorsPages.push([anchorPage, fragmentHtml])
         $(fragment).remove()
       }
@@ -248,7 +284,6 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
         }
       })
       for (const [anchorPage, fragmentHtml] of anchorsPages) {
-        console.log(anchorPage)
         await sendPage(anchorPage, `<body>
         ${fragmentHtml}
         </body>`)
@@ -258,7 +293,7 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
     if (!page.nofollow) {
       $('a').each(function (i, elem) {
         const href = $(this).attr('href')
-        if (href) pages.push({ url: new URL(href, page.url).href, source: page.url })
+        if (href) pages.push({ url: new URL(href, page.url).href, source: 'link ' + page.url })
       })
     }

@@ -270,6 +305,15 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
       }
     }
   }
+
+  if (existingPages) {
+    for (const existingPage of existingPages) {
+      if (!sentIds.has(existingPage._id)) {
+        await log.info('delete previously explored page that was not indexed this time', existingPage.url)
+        await axios.delete(`api/v1/datasets/${dataset.id}/lines/${existingPage._id}`)
+      }
+    }
+  }
 }

 // used to manage interruption
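Taken together, the changes above give update mode a full lifecycle: load the already indexed lines with their validators, re-crawl while collecting every _id that is re-sent or confirmed unchanged, then delete the leftovers via the data-fair lines API. A compact sketch of that flow; updateRun, dataFair and crawl are illustrative names and assumptions, not the plugin's API:

// Illustrative outline of the 0.2.x update-mode flow.
// dataFair: an axios instance assumed pre-configured with the data-fair base URL and credentials.
// crawl: the actual crawling step, expected to fill sentIds with every kept _id.
async function updateRun (dataFair, dataset, crawl, log) {
  const existingPages = (await dataFair.get(`api/v1/datasets/${dataset.id}/lines`, {
    params: { select: '_id,url,etag,lastModified', size: 10000 }
  })).data.results

  const sentIds = new Set()
  await crawl(existingPages, sentIds)

  // sweep lines that were neither re-sent nor answered with 304 this run
  for (const existingPage of existingPages) {
    if (!sentIds.has(existingPage._id)) {
      await log.info('delete previously explored page that was not indexed this time', existingPage.url)
      await dataFair.delete(`api/v1/datasets/${dataset.id}/lines/${existingPage._id}`)
    }
  }
}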
package/package.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@data-fair/processing-web-scraper",
-  "version": "0.1.0",
+  "version": "0.2.1",
   "description": "A small Web scraper that publishes its data into data-fair datasets.",
   "main": "index.js",
   "scripts": {
@@ -20,9 +20,9 @@
   "bugs": {
     "url": "https://github.com/data-fair/processing-web-scraper/issues"
   },
-  "homepage": "https://github.com/data-fair/
+  "homepage": "https://github.com/data-fair/processing-web-scraper#readme",
   "devDependencies": {
-    "@data-fair/processings-test-utils": "^0.5.
+    "@data-fair/processings-test-utils": "^0.5.1",
     "config": "^3.3.6",
     "eslint": "^7.18.0",
     "express": "^4.18.2",
package/processing-config-schema.json
CHANGED
@@ -14,7 +14,7 @@
    "required": ["title"],
    "properties": {
      "id": {"type": "string", "title": "Identifiant (laissez vide pour calculer un identifiant à partir du titre)"},
-      "title": {"type": "string", "title": "Titre", "default": "
+      "title": {"type": "string", "title": "Titre", "default": "Web scraper"}
    }
  }
}