@data-fair/processing-web-scraper 0.2.0 → 0.3.0
This diff shows the changes between two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
- package/index.js +29 -7
- package/package.json +2 -2
- package/processing-config-schema.json +10 -5
package/index.js
CHANGED

@@ -5,9 +5,7 @@ const robotsParser = require('robots-parser')
 // TODO:
 // handle html but also any file formats
 // add in-links info (at least for files)
-// store last-modified and e-tag and use it when re-crawling a site
 // specifications listed here http://robots-txt.com/
-// normalize URL to prevent duplicates
 
 const datasetSchema = [
   {
@@ -51,13 +49,15 @@ const datasetSchema = [
 // a global variable to manage interruption
 let stopped
 
-const normalizeURL = (url) => {
+const normalizeURL = (url, ignoreHash = false, addSlash = false) => {
   const parsedURL = new URL(url)
   for (const indexSuffix of ['index.html', 'index.php', 'index.jsp', 'index.cgi']) {
     if (parsedURL.pathname.endsWith('/' + indexSuffix)) {
       parsedURL.pathname = parsedURL.pathname.slice(0, parsedURL.pathname.length - indexSuffix.length)
     }
   }
+  if (ignoreHash) parsedURL.hash = ''
+  if (addSlash && !parsedURL.pathname.endsWith('/')) parsedURL.pathname += '/'
   return parsedURL.href
 }
 
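Note (illustration, not part of the published diff): the extended normalizeURL can be copied out and run in plain Node to see what the new flags do. The sample URLs below are invented.

// sketch: same logic as the hunk above, extracted for experimentation
const normalizeURL = (url, ignoreHash = false, addSlash = false) => {
  const parsedURL = new URL(url)
  for (const indexSuffix of ['index.html', 'index.php', 'index.jsp', 'index.cgi']) {
    if (parsedURL.pathname.endsWith('/' + indexSuffix)) {
      // drop the directory-index file name, keeping the trailing slash
      parsedURL.pathname = parsedURL.pathname.slice(0, parsedURL.pathname.length - indexSuffix.length)
    }
  }
  if (ignoreHash) parsedURL.hash = ''
  if (addSlash && !parsedURL.pathname.endsWith('/')) parsedURL.pathname += '/'
  return parsedURL.href
}

console.log(normalizeURL('https://example.com/docs/index.html#intro', true, true))
// -> https://example.com/docs/
console.log(normalizeURL('https://example.com/docs#intro', true, true))
// -> https://example.com/docs/

With ignoreHash and addSlash both set, two spellings of the same page normalize to the same href, which is how the anchor-handling hunk further down decides whether a link points at the current page.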
@@ -158,7 +158,21 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
   for (const url of processingConfig.startURLs) {
     await pages.push({ url, source: 'config start URLs' })
   }
-
+
+  if (processingConfig.sitemap) {
+    await log.info(`fetch start URLs from sitemap ${processingConfig.sitemap}`)
+    const sitemap = (await axios.get(processingConfig.sitemap)).data
+    const cheerio = require('cheerio')
+    const $ = cheerio.load(sitemap)
+    const sitemapURLs = []
+    $('url loc').each(function () {
+      sitemapURLs.push($(this).text())
+    })
+    for (const url of sitemapURLs) {
+      await pages.push({ url, source: 'sitemap' })
+    }
+  }
+  // TODO: use robots.getSitemaps()
 
   const sentIds = new Set([])
   const sendPage = async (page, data, contentType = 'text/html', filename = 'content.html') => {
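Note (illustration, not part of the published diff): the sitemap block boils down to loading the fetched XML with cheerio and collecting the text of every <loc> inside a <url> element. A runnable sketch with an invented sitemap body, relying only on cheerio's default lenient parsing, the same call the diff makes:

const cheerio = require('cheerio')

// stands in for (await axios.get(processingConfig.sitemap)).data
const sitemap = `<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url><loc>https://example.com/</loc></url>
  <url><loc>https://example.com/docs/</loc></url>
</urlset>`

const $ = cheerio.load(sitemap)
const sitemapURLs = []
// same selector as the diff: every <loc> inside a <url> element
$('url loc').each(function () {
  sitemapURLs.push($(this).text())
})
console.log(sitemapURLs)
// -> [ 'https://example.com/', 'https://example.com/docs/' ]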
@@ -210,8 +224,9 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
     if (page.etag) headers['if-none-match'] = page.etag
     let response
     try {
-      response = await axios.get(page.url, { headers })
+      response = await axios.get(page.url, { headers, maxRedirects: 0 })
     } catch (err) {
+      // content did not change
       if (err.status === 304) {
         await log.debug(`page was not modified since last exploration ${page.url}`)
         sentIds.add(page._id)
@@ -220,12 +235,19 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
         }
         continue
       }
+      // follow a redirect
+      if (err.status === 301) {
+        await log.debug(`page redirected ${page.url} -> ${err.headers.location}`)
+        pages.push({ url: new URL(err.headers.location, page.url).href, source: 'redirect ' + page.url })
+        continue
+      }
       await log.warning(`failed to fetch page ${page.url} - ${err.status || err.message}`)
       if (page.source) await log.warning(`this broken URL comes from ${page.source}`)
       continue
     }
 
     if (response.headers['x-robots-tag']) {
+      await log.debug('use x-robots-tag header', response.headers['x-robots-tag'])
       for (const part of response.headers['x-robots-tag'].split(',').map(p => p.trim())) {
         if (part === 'noindex') page.noindex = true
         if (part === 'nofollow') page.nofollow = true
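Note (illustration, not part of the published diff): with maxRedirects: 0, axios stops following redirects itself, so a 301 response is rejected like any other non-2xx status and surfaces in the catch block above. And because a Location header may be relative, the new code resolves it against the page URL; Node's built-in WHATWG URL does that resolution (sample values invented):

// relative Location: resolved against the page that answered 301
console.log(new URL('/new-page', 'https://example.com/old-page').href)
// -> https://example.com/new-page

// absolute Location: the base argument is ignored
console.log(new URL('https://other.example.org/moved', 'https://example.com/old-page').href)
// -> https://other.example.org/moved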
@@ -245,6 +267,7 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
       const name = $(this).attr('name')
       if (name === 'robots') {
         const content = $(this).attr('content')
+        log.debug('use robots meta', content)
         if (content) {
           for (const part of content.split(',').map(p => p.trim())) {
             if (part === 'noindex') page.noindex = true
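Note (illustration, not part of the published diff): the x-robots-tag header and the robots meta tag in the two hunks above carry the same comma-separated directive syntax, so both are parsed the same way. With an invented value:

const content = 'noindex, nofollow'
for (const part of content.split(',').map(p => p.trim())) {
  if (part === 'noindex') console.log('page content will not be indexed')
  if (part === 'nofollow') console.log('links on the page will not be followed')
}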
@@ -260,7 +283,7 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
       const href = $(this).attr('href')
       if (!href) return
       const parsedURL = new URL(href, page.url)
-      if (parsedURL.hash) {
+      if (parsedURL.hash && normalizeURL(parsedURL.href, true, true) === normalizeURL(page.url, true, true)) {
         const targetElement = $(parsedURL.hash)
         if (!targetElement) return
         for (const anchor of processingConfig.anchors || []) {
@@ -282,7 +305,6 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
       </body>`)
     }
   }
-
   if (!page.nofollow) {
     $('a').each(function (i, elem) {
       const href = $(this).attr('href')
package/package.json
CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "@data-fair/processing-web-scraper",
-  "version": "0.2.0",
+  "version": "0.3.0",
   "description": "A small Web scraper that publishes its data into data-fair datasets.",
   "main": "index.js",
   "scripts": {
@@ -22,7 +22,7 @@
   },
   "homepage": "https://github.com/data-fair/processing-web-scraper#readme",
   "devDependencies": {
-    "@data-fair/processings-test-utils": "^0.5.
+    "@data-fair/processings-test-utils": "^0.5.1",
     "config": "^3.3.6",
     "eslint": "^7.18.0",
     "express": "^4.18.2",
package/processing-config-schema.json
CHANGED

@@ -49,13 +49,22 @@
     "startURLs": {
       "type": "array",
       "title": "URLs de départ",
-      "
+      "description": "Peut être omis si vous fournissez une URL vers un fichier sitemap.xml dans les paramètres avancés",
+      "minItems": 0,
       "items": {"type": "string"}
     }
   }
 }, {
   "title": "Paramètres avancés",
   "properties": {
+    "titlePrefix": {
+      "type": "string",
+      "title": "Préfixe à supprimer des titres avant indexation"
+    },
+    "sitemap": {
+      "type": "string",
+      "title": "URL d'un fichier sitemap.xml"
+    },
     "prune": {
       "type": "array",
       "title": "Sélecteurs d'éléments HTML à ignorer",
@@ -98,10 +107,6 @@
         }
       }
     }
-  },
-  "titlePrefix": {
-    "type": "string",
-    "title": "Préfixe à supprimer des titres avant indexation"
   }
 }
}]
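Note (illustration, not part of the published diff): taken together, the schema changes allow startURLs to be empty (minItems: 0) when a sitemap URL is supplied, and titlePrefix moves into the "Paramètres avancés" section next to the new sitemap field. A hypothetical processingConfig that the 0.3.0 schema accepts, with invented values:

const processingConfig = {
  startURLs: [], // may now be empty since a sitemap is given
  sitemap: 'https://example.com/sitemap.xml', // new advanced parameter
  titlePrefix: 'Example docs - ' // same meaning as before, new position in the schema
}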