@data-fair/processing-web-scraper 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/index.js +25 -7
- package/package.json +3 -2
- package/processing-config-schema.json +14 -12
package/index.js
CHANGED
|
@@ -73,6 +73,13 @@ class PagesIterator {
|
|
|
73
73
|
this.pluginConfig = pluginConfig
|
|
74
74
|
this.processingConfig = processingConfig
|
|
75
75
|
this.robots = robots
|
|
76
|
+
const UrlPattern = require('url-pattern')
|
|
77
|
+
this.excludeURLPatterns = (processingConfig.excludeURLPatterns || []).map(p => {
|
|
78
|
+
const url = new URL(p)
|
|
79
|
+
const pattern = new UrlPattern(url.pathname)
|
|
80
|
+
pattern.hostname = url.hostname
|
|
81
|
+
return pattern
|
|
82
|
+
})
|
|
76
83
|
}
|
|
77
84
|
|
|
78
85
|
[Symbol.asyncIterator] () {
|
|
@@ -80,11 +87,13 @@ class PagesIterator {
|
|
|
80
87
|
}
|
|
81
88
|
|
|
82
89
|
push (page) {
|
|
83
|
-
// TODO: apply no-follow rules
|
|
84
90
|
if (typeof page === 'string') page = { url: page }
|
|
85
91
|
if (!this.processingConfig.baseURLs.find(b => page.url.startsWith(b))) return
|
|
86
92
|
page.parsedURL = page.parsedURL || new URL(page.url)
|
|
87
93
|
if (page.parsedURL.hash) return
|
|
94
|
+
if (this.excludeURLPatterns.find(p => p.match(page.parsedURL.pathname) && p.hostname === page.parsedURL.hostname)) {
|
|
95
|
+
return
|
|
96
|
+
}
|
|
88
97
|
if (this.robots[page.parsedURL.origin] && !this.robots[page.parsedURL.origin].isAllowed(page.url, this.pluginConfig.userAgent || 'data-fair-web-scraper')) {
|
|
89
98
|
return
|
|
90
99
|
}
|
|
@@ -126,12 +135,16 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
|
|
|
126
135
|
|
|
127
136
|
// parse the robots.txt files if available
|
|
128
137
|
const robots = {}
|
|
138
|
+
const sitemaps = processingConfig.sitemaps || []
|
|
129
139
|
for (const baseURL of processingConfig.baseURLs) {
|
|
130
140
|
const { origin } = new URL(baseURL)
|
|
131
141
|
if (robots[origin]) continue
|
|
132
142
|
try {
|
|
133
143
|
const response = await axios.get(origin + '/robots.txt')
|
|
134
144
|
robots[origin] = robotsParser(origin + '/robots.txt', response.data)
|
|
145
|
+
for (const sitemap of robots[origin].getSitemaps()) {
|
|
146
|
+
if (!sitemaps.includes(sitemap)) sitemaps.push(sitemap)
|
|
147
|
+
}
|
|
135
148
|
} catch (err) {
|
|
136
149
|
await log.info(`failed to fetch ${origin + '/robots.txt'} - ${err.status || err.message}`)
|
|
137
150
|
}
|
|
@@ -159,9 +172,9 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
|
|
|
159
172
|
await pages.push({ url, source: 'config start URLs' })
|
|
160
173
|
}
|
|
161
174
|
|
|
162
|
-
|
|
163
|
-
await log.info(`fetch start URLs from
|
|
164
|
-
const sitemap = (await axios.get(
|
|
175
|
+
for (const sitemapURL of sitemaps) {
|
|
176
|
+
await log.info(`fetch start URLs from sitemap ${sitemapURL}`)
|
|
177
|
+
const sitemap = (await axios.get(sitemapURL)).data
|
|
165
178
|
const cheerio = require('cheerio')
|
|
166
179
|
const $ = cheerio.load(sitemap)
|
|
167
180
|
const sitemapURLs = []
|
|
@@ -172,12 +185,10 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
|
|
|
172
185
|
await pages.push({ url, source: 'sitemap' })
|
|
173
186
|
}
|
|
174
187
|
}
|
|
175
|
-
// TODO: use robots.getSitemaps()
|
|
176
188
|
|
|
177
189
|
const sentIds = new Set([])
|
|
178
190
|
const sendPage = async (page, data, contentType = 'text/html', filename = 'content.html') => {
|
|
179
191
|
await log.debug('send page', page.url)
|
|
180
|
-
// TODO: apply no-index rules
|
|
181
192
|
const form = new FormData()
|
|
182
193
|
// improve page title
|
|
183
194
|
if (page.title) {
|
|
@@ -261,7 +272,14 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
|
|
|
261
272
|
if (isHTML) {
|
|
262
273
|
const cheerio = require('cheerio')
|
|
263
274
|
const $ = cheerio.load(response.data)
|
|
264
|
-
|
|
275
|
+
const titleSelectors = (processingConfig.titleSelectors || []).concat(['title', 'h1'])
|
|
276
|
+
for (const titleSelector of titleSelectors) {
|
|
277
|
+
page.title = $(titleSelector).text()
|
|
278
|
+
if (page.title) {
|
|
279
|
+
log.debug(`used title selector "${titleSelector}" -> ${page.title.trim()}`)
|
|
280
|
+
break
|
|
281
|
+
}
|
|
282
|
+
}
|
|
265
283
|
|
|
266
284
|
$('meta').each(function (i, elem) {
|
|
267
285
|
const name = $(this).attr('name')
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@data-fair/processing-web-scraper",
|
|
3
|
-
"version": "0.3.0",
|
|
3
|
+
"version": "0.4.0",
|
|
4
4
|
"description": "A small Web scraper that publishes its data into data-fair datasets.",
|
|
5
5
|
"main": "index.js",
|
|
6
6
|
"scripts": {
|
|
@@ -32,6 +32,7 @@
|
|
|
32
32
|
"dependencies": {
|
|
33
33
|
"cheerio": "^1.0.0-rc.12",
|
|
34
34
|
"form-data": "^4.0.0",
|
|
35
|
-
"robots-parser": "^3.0.0"
|
|
35
|
+
"robots-parser": "^3.0.0",
|
|
36
|
+
"url-pattern": "^1.0.3"
|
|
36
37
|
}
|
|
37
38
|
}
|
package/processing-config-schema.json
CHANGED
|
@@ -61,26 +61,28 @@
|
|
|
61
61
|
"type": "string",
|
|
62
62
|
"title": "Préfixe à supprimer des titres avant indexation"
|
|
63
63
|
},
|
|
64
|
-
"
|
|
65
|
-
"type": "
|
|
66
|
-
"title": "
|
|
64
|
+
"titleSelectors": {
|
|
65
|
+
"type": "array",
|
|
66
|
+
"title": "Sélecteurs d'éléments HTML à utiliser comme titre de page",
|
|
67
|
+
"description": "Par défaut le sélecteur 'title' est utilisé ce qui correspond au titre de page dans les métadonnées HTML.",
|
|
68
|
+
"items": {"type": "string"}
|
|
67
69
|
},
|
|
68
|
-
"
|
|
70
|
+
"sitemaps": {
|
|
69
71
|
"type": "array",
|
|
70
|
-
"title": "
|
|
71
|
-
"description": "
|
|
72
|
+
"title": "URLs de fichiers sitemap.xml",
|
|
73
|
+
"description": "Ces URLs peuvent également être découvertes depuis le fichier robots.txt",
|
|
72
74
|
"items": {"type": "string"}
|
|
73
75
|
},
|
|
74
|
-
"
|
|
76
|
+
"prune": {
|
|
75
77
|
"type": "array",
|
|
76
|
-
"title": "
|
|
77
|
-
"description": "
|
|
78
|
+
"title": "Sélecteurs d'éléments HTML à ignorer",
|
|
79
|
+
"description": "Par exemple si une barre de navigation est présente sur de nombreuses page du site, répéter son contenu peut nuire à la qualité des résultats. Si cette barre de navigation a pour identifiant 'nav-bar' vous pouvez saisir '#nav-bar'.",
|
|
78
80
|
"items": {"type": "string"}
|
|
79
81
|
},
|
|
80
|
-
"
|
|
82
|
+
"excludeURLPatterns": {
|
|
81
83
|
"type": "array",
|
|
82
|
-
"title": "
|
|
83
|
-
"description": "
|
|
84
|
+
"title": "Formats d'URL à exclure de l'exploration",
|
|
85
|
+
"description": "Exemple: https://data-fair.github.io/master/en(/*)",
|
|
84
86
|
"items": {"type": "string"}
|
|
85
87
|
},
|
|
86
88
|
"anchors": {
|