@data-fair/processing-web-scraper 0.2.0 → 0.3.0

package/index.js CHANGED
@@ -5,9 +5,7 @@ const robotsParser = require('robots-parser')
 // TODO:
 // handle html but also any file formats
 // add in-links info (at least for files)
-// store last-modified and e-tag and use is when re-crawling a site
 // specifications listed here http://robots-txt.com/
-// normalize URL to prevent duplicates
 
 const datasetSchema = [
   {
@@ -51,13 +49,15 @@ const datasetSchema = [
 // a global variable to manage interruption
 let stopped
 
-const normalizeURL = (url) => {
+const normalizeURL = (url, ignoreHash = false, addSlash = false) => {
   const parsedURL = new URL(url)
   for (const indexSuffix of ['index.html', 'index.php', 'index.jsp', 'index.cgi']) {
     if (parsedURL.pathname.endsWith('/' + indexSuffix)) {
       parsedURL.pathname = parsedURL.pathname.slice(0, parsedURL.pathname.length - indexSuffix.length)
     }
   }
+  if (ignoreHash) parsedURL.hash = ''
+  if (addSlash && !parsedURL.pathname.endsWith('/')) parsedURL.pathname += '/'
   return parsedURL.href
 }
 
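The two new flags let normalizeURL double as a comparison helper for the anchor handling further down. A quick sketch of the new behavior, using made-up URLs:

    // Made-up URLs, purely to illustrate the new flags.
    normalizeURL('https://example.com/docs/index.html#intro')
    // => 'https://example.com/docs/#intro' (index suffix stripped, hash kept)
    normalizeURL('https://example.com/docs/index.html#intro', true)
    // => 'https://example.com/docs/' (hash dropped)
    normalizeURL('https://example.com/page#top', true, true)
    // => 'https://example.com/page/' (hash dropped, trailing slash added)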
@@ -158,7 +158,21 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
   for (const url of processingConfig.startURLs) {
     await pages.push({ url, source: 'config start URLs' })
   }
-  // TODO: init from sitemap (and use robots.getSitemaps() to help in this)
+
+  if (processingConfig.sitemap) {
+    await log.info(`fetch start URLs from sitemmap ${processingConfig.sitemap}`)
+    const sitemap = (await axios.get(processingConfig.sitemap)).data
+    const cheerio = require('cheerio')
+    const $ = cheerio.load(sitemap)
+    const sitemapURLs = []
+    $('url loc').each(function () {
+      sitemapURLs.push($(this).text())
+    })
+    for (const url of sitemapURLs) {
+      await pages.push({ url, source: 'sitemap' })
+    }
+  }
+  // TODO: use robots.getSitemaps()
 
   const sentIds = new Set([])
   const sendPage = async (page, data, contentType = 'text/html', filename = 'content.html') => {
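The sitemap seeding loads the fetched XML with cheerio and collects the text of every <loc> nested in a <url> element. A self-contained sketch of that extraction, with a made-up sitemaps.org document:

    const cheerio = require('cheerio')

    // Made-up sitemap following the sitemaps.org format.
    const xml = `<?xml version="1.0" encoding="UTF-8"?>
    <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
      <url><loc>https://example.com/</loc></url>
      <url><loc>https://example.com/about</loc></url>
    </urlset>`

    const $ = cheerio.load(xml)
    const urls = []
    // 'url loc' matches every <loc> element nested inside a <url>
    $('url loc').each(function () {
      urls.push($(this).text())
    })
    console.log(urls) // ['https://example.com/', 'https://example.com/about']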
@@ -210,8 +224,9 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
     if (page.etag) headers['if-none-match'] = page.etag
     let response
     try {
-      response = await axios.get(page.url, { headers })
+      response = await axios.get(page.url, { headers, maxRedirects: 0 })
     } catch (err) {
+      // content did not change
       if (err.status === 304) {
         await log.debug(`page was not modified since last exploration ${page.url}`)
         sentIds.add(page._id)
@@ -220,12 +235,19 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
         }
         continue
       }
+      // follow a redirect
+      if (err.status === 301) {
+        await log.debug(`page redirected ${page.url} -> ${err.headers.location}`)
+        pages.push({ url: new URL(err.headers.location, page.url).href, source: 'redirect ' + page.url })
+        continue
+      }
       await log.warning(`failed to fetch page ${page.url} - ${err.status || err.message}`)
       if (page.source) await log.warning(`this broken URL comes from ${page.source}`)
       continue
     }
 
     if (response.headers['x-robots-tag']) {
+      await log.debug('use x-robots-tag header', response.headers['x-robots-tag'])
       for (const part of response.headers['x-robots-tag'].split(',').map(p => p.trim())) {
         if (part === 'noindex') page.noindex = true
         if (part === 'nofollow') page.nofollow = true
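Passing maxRedirects: 0 stops axios from silently following redirects; since axios rejects any non-2xx status by default, both the 304 (not modified) and the 301 now land in the catch block, and the redirect target is re-queued as a page of its own. The plugin reads err.status and err.headers directly (presumably flattened by a wrapper); with stock axios those live on err.response, which is what this minimal sketch with a hypothetical URL uses:

    const axios = require('axios')

    // With maxRedirects: 0 a 301 rejects instead of being followed.
    axios.get('https://example.com/old-page', { maxRedirects: 0 })
      .catch((err) => {
        // stock axios exposes the status and headers on err.response
        if (err.response && err.response.status === 301) {
          // the Location header may be relative, so resolve it against the request URL
          const target = new URL(err.response.headers.location, 'https://example.com/old-page').href
          console.log('redirected to', target)
        }
      })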
@@ -245,6 +267,7 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
         const name = $(this).attr('name')
         if (name === 'robots') {
           const content = $(this).attr('content')
+          log.debug('use robots meta', content)
           if (content) {
             for (const part of content.split(',').map(p => p.trim())) {
               if (part === 'noindex') page.noindex = true
@@ -260,7 +283,7 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
       const href = $(this).attr('href')
       if (!href) return
       const parsedURL = new URL(href, page.url)
-      if (parsedURL.hash) {
+      if (parsedURL.hash && normalizeURL(parsedURL.href, true, true) === normalizeURL(page.url, true, true)) {
         const targetElement = $(parsedURL.hash)
         if (!targetElement) return
         for (const anchor of processingConfig.anchors || []) {
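The tightened condition treats a fragment link as an in-page anchor only when it points back at the current page; normalizing both sides with ignoreHash and addSlash makes /docs, /docs/ and /docs/index.html compare equal. With hypothetical values:

    // A fragment link found on https://example.com/docs/index.html
    const pageURL = 'https://example.com/docs/index.html'
    const parsedURL = new URL('#install', pageURL)

    // Both sides normalize to 'https://example.com/docs/': a same-page anchor.
    normalizeURL(parsedURL.href, true, true) === normalizeURL(pageURL, true, true) // true

    // A fragment on another page no longer matches.
    normalizeURL('https://example.com/other#install', true, true) === normalizeURL(pageURL, true, true) // false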
@@ -282,7 +305,6 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
         </body>`)
       }
     }
-
     if (!page.nofollow) {
       $('a').each(function (i, elem) {
         const href = $(this).attr('href')
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@data-fair/processing-web-scraper",
-  "version": "0.2.0",
+  "version": "0.3.0",
   "description": "A small Web scraper that publishes its data into data-fair datasets.",
   "main": "index.js",
   "scripts": {
@@ -22,7 +22,7 @@
   },
   "homepage": "https://github.com/data-fair/processing-web-scraper#readme",
   "devDependencies": {
-    "@data-fair/processings-test-utils": "^0.5.0",
+    "@data-fair/processings-test-utils": "^0.5.1",
     "config": "^3.3.6",
     "eslint": "^7.18.0",
     "express": "^4.18.2",
@@ -49,13 +49,22 @@
       "startURLs": {
         "type": "array",
         "title": "URLs de départ",
-        "minItems": 1,
+        "description": "Peut être omis si vous fournissez une URL vers un fichier sitemap.xml dans les paramètres avancés",
+        "minItems": 0,
         "items": {"type": "string"}
       }
     }
   }, {
     "title": "Paramètres avancés",
     "properties": {
+      "titlePrefix": {
+        "type": "string",
+        "title": "Préfixe à supprimer des titres avant indexation"
+      },
+      "sitemap": {
+        "type": "string",
+        "title": "URL d'un fichier sitemap.xml"
+      },
       "prune": {
         "type": "array",
         "title": "Sélecteurs d'éléments HTML à ignorer",
@@ -98,10 +107,6 @@
           }
         }
       }
-      },
-      "titlePrefix": {
-        "type": "string",
-        "title": "Préfixe à supprimer des titres avant indexation"
       }
     }
   }]
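With minItems relaxed to 0 and the new sitemap field, a processing can now be configured from a sitemap alone. A hypothetical configuration the updated schema would accept, assuming the advanced parameters are flattened into processingConfig as the index.js reads of processingConfig.sitemap suggest:

    // Hypothetical processingConfig: no start URLs, crawl seeded from the sitemap.
    const processingConfig = {
      startURLs: [],
      sitemap: 'https://example.com/sitemap.xml',
      titlePrefix: 'Example Docs - '
    }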