@data-fair/processing-web-scraper 0.2.1 → 0.3.0
- package/index.js +21 -6
- package/package.json +1 -1
- package/processing-config-schema.json +10 -5
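In short, 0.3.0 adds the ability to seed the crawl from a sitemap.xml URL given in the configuration, extends normalizeURL with ignoreHash and addSlash flags so that fragment links are only treated as in-page anchors when they point at the page being crawled, adds debug logging around robots directives, and moves the titlePrefix property ahead of prune in the advanced parameters of the config schema.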
package/index.js CHANGED

@@ -5,9 +5,7 @@ const robotsParser = require('robots-parser')
 // TODO:
 // handle html but also any file formats
 // add in-links info (at least for files)
-// store last-modified and e-tag and use is when re-crawling a site
 // specifications listed here http://robots-txt.com/
-// normalize URL to prevent duplicates
 
 const datasetSchema = [
   {
@@ -51,13 +49,15 @@ const datasetSchema = [
 // a global variable to manage interruption
 let stopped
 
-const normalizeURL = (url) => {
+const normalizeURL = (url, ignoreHash = false, addSlash = false) => {
   const parsedURL = new URL(url)
   for (const indexSuffix of ['index.html', 'index.php', 'index.jsp', 'index.cgi']) {
     if (parsedURL.pathname.endsWith('/' + indexSuffix)) {
       parsedURL.pathname = parsedURL.pathname.slice(0, parsedURL.pathname.length - indexSuffix.length)
     }
   }
+  if (ignoreHash) parsedURL.hash = ''
+  if (addSlash && !parsedURL.pathname.endsWith('/')) parsedURL.pathname += '/'
   return parsedURL.href
 }
 
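With the two new flags, normalizeURL can be used to compare URLs, not only to deduplicate them. A quick sketch of the resulting behaviour (the example.com URLs are illustrative):

  normalizeURL('https://example.com/docs/index.html#intro')
  // => 'https://example.com/docs/#intro' (index suffix stripped, hash kept)
  normalizeURL('https://example.com/docs/index.html#intro', true)
  // => 'https://example.com/docs/' (ignoreHash drops the fragment)
  normalizeURL('https://example.com/docs', true, true)
  // => 'https://example.com/docs/' (addSlash appends the missing trailing slash)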
@@ -158,7 +158,21 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
   for (const url of processingConfig.startURLs) {
     await pages.push({ url, source: 'config start URLs' })
   }
-
+
+  if (processingConfig.sitemap) {
+    await log.info(`fetch start URLs from sitemmap ${processingConfig.sitemap}`)
+    const sitemap = (await axios.get(processingConfig.sitemap)).data
+    const cheerio = require('cheerio')
+    const $ = cheerio.load(sitemap)
+    const sitemapURLs = []
+    $('url loc').each(function () {
+      sitemapURLs.push($(this).text())
+    })
+    for (const url of sitemapURLs) {
+      await pages.push({ url, source: 'sitemap' })
+    }
+  }
+  // TODO: use robots.getSitemaps()
 
   const sentIds = new Set([])
   const sendPage = async (page, data, contentType = 'text/html', filename = 'content.html') => {
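The $('url loc') selector matches the <url><loc> structure of the sitemaps.org protocol. A self-contained sketch of the same extraction, with an illustrative two-entry sitemap:

  const cheerio = require('cheerio')

  // Minimal sitemap following the sitemaps.org 0.9 format (content illustrative)
  const sitemap = `<?xml version="1.0" encoding="UTF-8"?>
  <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
    <url><loc>https://example.com/</loc></url>
    <url><loc>https://example.com/docs/</loc></url>
  </urlset>`

  const $ = cheerio.load(sitemap)
  const sitemapURLs = []
  $('url loc').each(function () {
    sitemapURLs.push($(this).text()) // text content of each <loc> element
  })
  console.log(sitemapURLs) // [ 'https://example.com/', 'https://example.com/docs/' ]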
@@ -233,6 +247,7 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
   }
 
   if (response.headers['x-robots-tag']) {
+    await log.debug('use x-robots-tag header', response.headers['x-robots-tag'])
     for (const part of response.headers['x-robots-tag'].split(',').map(p => p.trim())) {
       if (part === 'noindex') page.noindex = true
       if (part === 'nofollow') page.nofollow = true
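For reference, a response header such as X-Robots-Tag: noindex, nofollow is tokenized by the loop above:

  'noindex, nofollow'.split(',').map(p => p.trim())
  // => ['noindex', 'nofollow'], so both page.noindex and page.nofollow are set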
@@ -252,6 +267,7 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
       const name = $(this).attr('name')
       if (name === 'robots') {
         const content = $(this).attr('content')
+        log.debug('use robots meta', content)
         if (content) {
           for (const part of content.split(',').map(p => p.trim())) {
             if (part === 'noindex') page.noindex = true
@@ -267,7 +283,7 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
       const href = $(this).attr('href')
       if (!href) return
       const parsedURL = new URL(href, page.url)
-      if (parsedURL.hash) {
+      if (parsedURL.hash && normalizeURL(parsedURL.href, true, true) === normalizeURL(page.url, true, true)) {
         const targetElement = $(parsedURL.hash)
         if (!targetElement) return
         for (const anchor of processingConfig.anchors || []) {
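This condition is what the new normalizeURL flags enable: previously any link carrying a hash was treated as an in-page anchor; now the anchor branch only applies when the link, with the hash ignored and a trailing slash added, resolves to the page being crawled. For a page at https://example.com/docs/index.html (URLs illustrative):

  normalizeURL('https://example.com/docs/#intro', true, true)
  // => 'https://example.com/docs/' — equal to the normalized page URL: in-page anchor
  normalizeURL('https://example.com/other/#intro', true, true)
  // => 'https://example.com/other/' — a different page: handled as an ordinary link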
@@ -289,7 +305,6 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
 </body>`)
     }
   }
-
   if (!page.nofollow) {
     $('a').each(function (i, elem) {
       const href = $(this).attr('href')
package/processing-config-schema.json CHANGED

@@ -49,13 +49,22 @@
       "startURLs": {
         "type": "array",
         "title": "URLs de départ",
-        "
+        "description": "Peut être omis si vous fournissez une URL vers un fichier sitemap.xml dans les paramètres avancés",
+        "minItems": 0,
         "items": {"type": "string"}
       }
     }
   }, {
     "title": "Paramètres avancés",
     "properties": {
+      "titlePrefix": {
+        "type": "string",
+        "title": "Préfixe à supprimer des titres avant indexation"
+      },
+      "sitemap": {
+        "type": "string",
+        "title": "URL d'un fichier sitemap.xml"
+      },
       "prune": {
         "type": "array",
         "title": "Sélecteurs d'éléments HTML à ignorer",
@@ -98,10 +107,6 @@
         }
       }
     }
-    },
-    "titlePrefix": {
-      "type": "string",
-      "title": "Préfixe à supprimer des titres avant indexation"
   }
  }
 }]
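Under the revised schema, a processing configuration can leave startURLs empty and rely on a sitemap instead. A hypothetical configuration (all values illustrative):

  {
    "startURLs": [],
    "sitemap": "https://example.com/sitemap.xml",
    "titlePrefix": "Example - ",
    "prune": ["nav", "footer"]
  }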