@data-fair/processing-web-scraper 0.2.1 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/index.js +42 -9
- package/package.json +3 -2
- package/processing-config-schema.json +21 -14
package/index.js
CHANGED
@@ -5,9 +5,7 @@ const robotsParser = require('robots-parser')
 // TODO:
 // handle html but also any file formats
 // add in-links info (at least for files)
-// store last-modified and e-tag and use is when re-crawling a site
 // specifications listed here http://robots-txt.com/
-// normalize URL to prevent duplicates

 const datasetSchema = [
   {
@@ -51,13 +49,15 @@ const datasetSchema = [
 // a global variable to manage interruption
 let stopped

-const normalizeURL = (url) => {
+const normalizeURL = (url, ignoreHash = false, addSlash = false) => {
   const parsedURL = new URL(url)
   for (const indexSuffix of ['index.html', 'index.php', 'index.jsp', 'index.cgi']) {
     if (parsedURL.pathname.endsWith('/' + indexSuffix)) {
       parsedURL.pathname = parsedURL.pathname.slice(0, parsedURL.pathname.length - indexSuffix.length)
     }
   }
+  if (ignoreHash) parsedURL.hash = ''
+  if (addSlash && !parsedURL.pathname.endsWith('/')) parsedURL.pathname += '/'
   return parsedURL.href
 }

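The two new flags turn normalizeURL into a comparison helper: ignoreHash drops the fragment and addSlash forces a trailing slash, so two spellings of the same page normalize to the same string (the link-extraction hunk further down uses exactly this to detect same-page anchor links). A quick sketch with hypothetical URLs:

// index files are stripped, as before
normalizeURL('https://example.com/docs/index.html')
// -> 'https://example.com/docs/'

// with both flags set, a hash link and its own page compare equal
normalizeURL('https://example.com/docs#install', true, true) ===
  normalizeURL('https://example.com/docs/', true, true)
// -> true
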
@@ -73,6 +73,13 @@ class PagesIterator {
     this.pluginConfig = pluginConfig
     this.processingConfig = processingConfig
     this.robots = robots
+    const UrlPattern = require('url-pattern')
+    this.excludeURLPatterns = (processingConfig.excludeURLPatterns || []).map(p => {
+      const url = new URL(p)
+      const pattern = new UrlPattern(url.pathname)
+      pattern.hostname = url.hostname
+      return pattern
+    })
   }

   [Symbol.asyncIterator] () {
@@ -80,11 +87,13 @@
   }

   push (page) {
-    // TODO: apply no-follow rules
     if (typeof page === 'string') page = { url: page }
     if (!this.processingConfig.baseURLs.find(b => page.url.startsWith(b))) return
     page.parsedURL = page.parsedURL || new URL(page.url)
     if (page.parsedURL.hash) return
+    if (this.excludeURLPatterns.find(p => p.match(page.parsedURL.pathname) && p.hostname === page.parsedURL.hostname)) {
+      return
+    }
     if (this.robots[page.parsedURL.origin] && !this.robots[page.parsedURL.origin].isAllowed(page.url, this.pluginConfig.userAgent || 'data-fair-web-scraper')) {
       return
     }
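The exclusion check relies on the url-pattern package added in this release: each configured URL is compiled into a hostname plus a path pattern, and a candidate page is dropped when both match. A sketch of the expected url-pattern behaviour, using the example pattern from the config schema below:

const UrlPattern = require('url-pattern')
const url = new URL('https://data-fair.github.io/master/en(/*)')
const pattern = new UrlPattern(url.pathname) // '/master/en(/*)'
pattern.hostname = url.hostname              // 'data-fair.github.io'
pattern.match('/master/en/page') // -> truthy, the page is skipped
pattern.match('/master/fr/page') // -> null, the page is crawled
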
@@ -126,12 +135,16 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir

   // parse the robots.txt files if available
   const robots = {}
+  const sitemaps = processingConfig.sitemaps || []
   for (const baseURL of processingConfig.baseURLs) {
     const { origin } = new URL(baseURL)
     if (robots[origin]) continue
     try {
       const response = await axios.get(origin + '/robots.txt')
       robots[origin] = robotsParser(origin + '/robots.txt', response.data)
+      for (const sitemap of robots[origin].getSitemaps()) {
+        if (!sitemaps.includes(sitemap)) sitemaps.push(sitemap)
+      }
     } catch (err) {
       await log.info(`failed to fetch ${origin + '/robots.txt'} - ${err.status || err.message}`)
     }
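Sitemap discovery uses robots-parser's getSitemaps(), which returns the Sitemap: directives found in the file; configured sitemaps and discovered ones are merged into one list. A minimal sketch with a hypothetical robots.txt:

const robotsParser = require('robots-parser')
const robots = robotsParser('https://example.com/robots.txt', [
  'User-agent: *',
  'Disallow: /private/',
  'Sitemap: https://example.com/sitemap.xml'
].join('\n'))
robots.getSitemaps() // -> ['https://example.com/sitemap.xml']
robots.isAllowed('https://example.com/private/page', 'data-fair-web-scraper') // -> false
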
@@ -158,12 +171,24 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
   for (const url of processingConfig.startURLs) {
     await pages.push({ url, source: 'config start URLs' })
   }
-
+
+  for (const sitemapURL of sitemaps) {
+    await log.info(`fetch start URLs from sitemap ${sitemapURL}`)
+    const sitemap = (await axios.get(sitemapURL)).data
+    const cheerio = require('cheerio')
+    const $ = cheerio.load(sitemap)
+    const sitemapURLs = []
+    $('url loc').each(function () {
+      sitemapURLs.push($(this).text())
+    })
+    for (const url of sitemapURLs) {
+      await pages.push({ url, source: 'sitemap' })
+    }
+  }

   const sentIds = new Set([])
   const sendPage = async (page, data, contentType = 'text/html', filename = 'content.html') => {
     await log.debug('send page', page.url)
-    // TODO: apply no-index rules
     const form = new FormData()
     // improve page title
     if (page.title) {
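Each sitemap is parsed with cheerio's default HTML parser, which is lenient enough to select the <loc> elements nested in the <url> entries of standard sitemap XML. A self-contained sketch with a hypothetical two-entry sitemap:

const cheerio = require('cheerio')
const sitemap = `<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url><loc>https://example.com/</loc></url>
  <url><loc>https://example.com/docs/</loc></url>
</urlset>`
const $ = cheerio.load(sitemap)
const urls = []
$('url loc').each(function () { urls.push($(this).text()) })
console.log(urls) // -> ['https://example.com/', 'https://example.com/docs/']
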
@@ -233,6 +258,7 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
     }

     if (response.headers['x-robots-tag']) {
+      await log.debug('use x-robots-tag header', response.headers['x-robots-tag'])
       for (const part of response.headers['x-robots-tag'].split(',').map(p => p.trim())) {
         if (part === 'noindex') page.noindex = true
         if (part === 'nofollow') page.nofollow = true
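Only the debug log is new here; the surrounding loop already splits the header on commas. For a hypothetical header value the flags end up as:

const header = 'noindex, nofollow' // hypothetical X-Robots-Tag value
const page = {}
for (const part of header.split(',').map(p => p.trim())) {
  if (part === 'noindex') page.noindex = true   // page is presumably skipped at indexing time
  if (part === 'nofollow') page.nofollow = true // links are not followed (see the last hunk)
}
// page -> { noindex: true, nofollow: true }
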
@@ -246,12 +272,20 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
     if (isHTML) {
       const cheerio = require('cheerio')
       const $ = cheerio.load(response.data)
-
+      const titleSelectors = (processingConfig.titleSelectors || []).concat(['title', 'h1'])
+      for (const titleSelector of titleSelectors) {
+        page.title = $(titleSelector).text()
+        if (page.title) {
+          log.debug(`used title selector "${titleSelector}" -> ${page.title.trim()}`)
+          break
+        }
+      }

       $('meta').each(function (i, elem) {
         const name = $(this).attr('name')
         if (name === 'robots') {
           const content = $(this).attr('content')
+          log.debug('use robots meta', content)
           if (content) {
             for (const part of content.split(',').map(p => p.trim())) {
               if (part === 'noindex') page.noindex = true
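Configured titleSelectors are tried in order before the 'title' and 'h1' fallbacks; the first selector with non-empty text wins. A sketch with a hypothetical page and config:

const cheerio = require('cheerio')
const $ = cheerio.load('<head><title>Site · Docs</title></head><body><h1 class="page-title">Install</h1></body>')
let title
for (const titleSelector of ['h1.page-title', 'title', 'h1']) { // ['h1.page-title'] from config
  title = $(titleSelector).text()
  if (title) break
}
console.log(title) // -> 'Install', the configured selector won over <title>
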
@@ -267,7 +301,7 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
         const href = $(this).attr('href')
         if (!href) return
         const parsedURL = new URL(href, page.url)
-        if (parsedURL.hash) {
+        if (parsedURL.hash && normalizeURL(parsedURL.href, true, true) === normalizeURL(page.url, true, true)) {
           const targetElement = $(parsedURL.hash)
           if (!targetElement) return
           for (const anchor of processingConfig.anchors || []) {
@@ -289,7 +323,6 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
         </body>`)
       }
     }
-
     if (!page.nofollow) {
       $('a').each(function (i, elem) {
         const href = $(this).attr('href')

package/package.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@data-fair/processing-web-scraper",
-  "version": "0.2.1",
+  "version": "0.4.0",
   "description": "A small Web scraper that publishes its data into data-fair datasets.",
   "main": "index.js",
   "scripts": {
@@ -32,6 +32,7 @@
   "dependencies": {
     "cheerio": "^1.0.0-rc.12",
     "form-data": "^4.0.0",
-    "robots-parser": "^3.0.0"
+    "robots-parser": "^3.0.0",
+    "url-pattern": "^1.0.3"
   }
 }

package/processing-config-schema.json
CHANGED
@@ -49,29 +49,40 @@
         "startURLs": {
           "type": "array",
           "title": "URLs de départ",
-          "
+          "description": "Peut être omis si vous fournissez une URL vers un fichier sitemap.xml dans les paramètres avancés",
+          "minItems": 0,
           "items": {"type": "string"}
         }
       }
     }, {
       "title": "Paramètres avancés",
       "properties": {
-        "
+        "titlePrefix": {
+          "type": "string",
+          "title": "Préfixe à supprimer des titres avant indexation"
+        },
+        "titleSelectors": {
           "type": "array",
-          "title": "Sélecteurs d'éléments HTML à
-          "description": "Par
+          "title": "Sélecteurs d'éléments HTML à utiliser comme titre de page",
+          "description": "Par défaut le sélecteur 'title' est utilisé ce qui correspond au titre de page dans les métadonnées HTML.",
+          "items": {"type": "string"}
+        },
+        "sitemaps": {
+          "type": "array",
+          "title": "URLs de fichiers sitemap.xml",
+          "description": "Ces URLs peuvent également être découvertes depuis le fichier robots.txt",
           "items": {"type": "string"}
         },
-        "
+        "prune": {
           "type": "array",
-          "title": "
-          "description": "
+          "title": "Sélecteurs d'éléments HTML à ignorer",
+          "description": "Par exemple si une barre de navigation est présente sur de nombreuses page du site, répéter son contenu peut nuire à la qualité des résultats. Si cette barre de navigation a pour identifiant 'nav-bar' vous pouvez saisir '#nav-bar'.",
           "items": {"type": "string"}
         },
-        "
+        "excludeURLPatterns": {
           "type": "array",
-          "title": "
-          "description": "
+          "title": "Formats d'URL à exclure de l'exploration",
+          "description": "Exemple: https://data-fair.github.io/master/en(/*)",
           "items": {"type": "string"}
         },
         "anchors": {
@@ -98,10 +109,6 @@
             }
           }
         }
-      },
-      "titlePrefix": {
-        "type": "string",
-        "title": "Préfixe à supprimer des titres avant indexation"
       }
     }
   }]
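
Taken together, a hypothetical processingConfig exercising the new options could look like this (all values invented for illustration; only the keys come from the schema above):

{
  "baseURLs": ["https://data-fair.github.io"],
  "startURLs": [],
  "sitemaps": ["https://data-fair.github.io/sitemap.xml"],
  "titlePrefix": "Data Fair - ",
  "titleSelectors": ["h1.page-title"],
  "prune": ["#nav-bar"],
  "excludeURLPatterns": ["https://data-fair.github.io/master/en(/*)"]
}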