@data-fair/processing-web-scraper 0.2.1 → 0.3.0

package/index.js CHANGED
@@ -5,9 +5,7 @@ const robotsParser = require('robots-parser')
 // TODO:
 // handle html but also any file formats
 // add in-links info (at least for files)
-// store last-modified and e-tag and use is when re-crawling a site
 // specifications listed here http://robots-txt.com/
-// normalize URL to prevent duplicates
 
 const datasetSchema = [
   {
@@ -51,13 +49,15 @@ const datasetSchema = [
 // a global variable to manage interruption
 let stopped
 
-const normalizeURL = (url) => {
+const normalizeURL = (url, ignoreHash = false, addSlash = false) => {
   const parsedURL = new URL(url)
   for (const indexSuffix of ['index.html', 'index.php', 'index.jsp', 'index.cgi']) {
     if (parsedURL.pathname.endsWith('/' + indexSuffix)) {
       parsedURL.pathname = parsedURL.pathname.slice(0, parsedURL.pathname.length - indexSuffix.length)
     }
   }
+  if (ignoreHash) parsedURL.hash = ''
+  if (addSlash && !parsedURL.pathname.endsWith('/')) parsedURL.pathname += '/'
   return parsedURL.href
 }
 
@@ -158,7 +158,21 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
   for (const url of processingConfig.startURLs) {
     await pages.push({ url, source: 'config start URLs' })
   }
-  // TODO: init from sitemap (and use robots.getSitemaps() to help in this)
+
+  if (processingConfig.sitemap) {
+    await log.info(`fetch start URLs from sitemmap ${processingConfig.sitemap}`)
+    const sitemap = (await axios.get(processingConfig.sitemap)).data
+    const cheerio = require('cheerio')
+    const $ = cheerio.load(sitemap)
+    const sitemapURLs = []
+    $('url loc').each(function () {
+      sitemapURLs.push($(this).text())
+    })
+    for (const url of sitemapURLs) {
+      await pages.push({ url, source: 'sitemap' })
+    }
+  }
+  // TODO: use robots.getSitemaps()
 
   const sentIds = new Set([])
   const sendPage = async (page, data, contentType = 'text/html', filename = 'content.html') => {
@@ -233,6 +247,7 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
     }
 
     if (response.headers['x-robots-tag']) {
+      await log.debug('use x-robots-tag header', response.headers['x-robots-tag'])
      for (const part of response.headers['x-robots-tag'].split(',').map(p => p.trim())) {
        if (part === 'noindex') page.noindex = true
        if (part === 'nofollow') page.nofollow = true
@@ -252,6 +267,7 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
      const name = $(this).attr('name')
      if (name === 'robots') {
        const content = $(this).attr('content')
+        log.debug('use robots meta', content)
        if (content) {
          for (const part of content.split(',').map(p => p.trim())) {
            if (part === 'noindex') page.noindex = true
@@ -267,7 +283,7 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
      const href = $(this).attr('href')
      if (!href) return
      const parsedURL = new URL(href, page.url)
-      if (parsedURL.hash) {
+      if (parsedURL.hash && normalizeURL(parsedURL.href, true, true) === normalizeURL(page.url, true, true)) {
        const targetElement = $(parsedURL.hash)
        if (!targetElement) return
        for (const anchor of processingConfig.anchors || []) {
@@ -289,7 +305,6 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
        </body>`)
      }
    }
-
    if (!page.nofollow) {
      $('a').each(function (i, elem) {
        const href = $(this).attr('href')
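
The reworked normalizeURL is what drives the new same-page anchor test in the -267,7 +283,7 hunk: with ignoreHash and addSlash enabled, a fragment link pointing into the current page normalizes to the same href as the page URL itself, so only genuinely in-page anchors enter that branch. A minimal standalone sketch of the two new flags (the function body is copied from the diff above; the sample URLs are illustrative only):

// Standalone copy of the new normalizeURL, shown here only to illustrate the flags
const normalizeURL = (url, ignoreHash = false, addSlash = false) => {
  const parsedURL = new URL(url)
  // drop common directory-index suffixes
  for (const indexSuffix of ['index.html', 'index.php', 'index.jsp', 'index.cgi']) {
    if (parsedURL.pathname.endsWith('/' + indexSuffix)) {
      parsedURL.pathname = parsedURL.pathname.slice(0, parsedURL.pathname.length - indexSuffix.length)
    }
  }
  if (ignoreHash) parsedURL.hash = ''
  if (addSlash && !parsedURL.pathname.endsWith('/')) parsedURL.pathname += '/'
  return parsedURL.href
}

// Hypothetical URLs: both normalize to 'https://example.com/docs/', so the
// fragment link is treated as an in-page anchor rather than a separate page
normalizeURL('https://example.com/docs/index.html#section-2', true, true)
normalizeURL('https://example.com/docs', true, true)

Without addSlash, 'https://example.com/docs' and 'https://example.com/docs/' would still compare as two different pages.
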
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@data-fair/processing-web-scraper",
-  "version": "0.2.1",
+  "version": "0.3.0",
   "description": "A small Web scraper that publishes its data into data-fair datasets.",
   "main": "index.js",
   "scripts": {
@@ -49,13 +49,22 @@
         "startURLs": {
           "type": "array",
           "title": "URLs de départ",
-          "minItems": 1,
+          "description": "Peut être omis si vous fournissez une URL vers un fichier sitemap.xml dans les paramètres avancés",
+          "minItems": 0,
           "items": {"type": "string"}
         }
       }
     }, {
       "title": "Paramètres avancés",
       "properties": {
+        "titlePrefix": {
+          "type": "string",
+          "title": "Préfixe à supprimer des titres avant indexation"
+        },
+        "sitemap": {
+          "type": "string",
+          "title": "URL d'un fichier sitemap.xml"
+        },
         "prune": {
           "type": "array",
           "title": "Sélecteurs d'éléments HTML à ignorer",
@@ -98,10 +107,6 @@
               }
             }
           }
-        },
-        "titlePrefix": {
-          "type": "string",
-          "title": "Préfixe à supprimer des titres avant indexation"
         }
       }
     }]
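
Taken together, the schema changes mean a crawl no longer needs explicit start URLs as long as a sitemap is provided. A hypothetical processingConfig using the new 0.3.0 options could look like the sketch below (field names come from the schema above; the values are made up):

// Hypothetical configuration relying on the new sitemap option instead of start URLs
const processingConfig = {
  startURLs: [], // may now be empty, since "minItems" dropped from 1 to 0
  sitemap: 'https://example.com/sitemap.xml', // start URLs are collected from this sitemap
  titlePrefix: 'Example site - ', // prefix stripped from page titles before indexing
  prune: ['nav', 'footer'] // pre-existing option: selectors of HTML elements to ignore
}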