@data-fair/processing-web-scraper 0.2.0 → 0.3.0
This diff shows the changes between two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
- package/index.js +29 -7
- package/package.json +2 -2
- package/processing-config-schema.json +10 -5
package/index.js
CHANGED

@@ -5,9 +5,7 @@ const robotsParser = require('robots-parser')
 // TODO:
 // handle html but also any file formats
 // add in-links info (at least for files)
-// store last-modified and e-tag and use it when re-crawling a site
 // specifications listed here http://robots-txt.com/
-// normalize URL to prevent duplicates
 
 const datasetSchema = [
   {
@@ -51,13 +49,15 @@ const datasetSchema = [
 // a global variable to manage interruption
 let stopped
 
-const normalizeURL = (url) => {
+const normalizeURL = (url, ignoreHash = false, addSlash = false) => {
   const parsedURL = new URL(url)
   for (const indexSuffix of ['index.html', 'index.php', 'index.jsp', 'index.cgi']) {
     if (parsedURL.pathname.endsWith('/' + indexSuffix)) {
       parsedURL.pathname = parsedURL.pathname.slice(0, parsedURL.pathname.length - indexSuffix.length)
     }
   }
+  if (ignoreHash) parsedURL.hash = ''
+  if (addSlash && !parsedURL.pathname.endsWith('/')) parsedURL.pathname += '/'
   return parsedURL.href
 }
 
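Note (illustration, not part of the published diff): the extended normalizeURL can be copied out and run in plain Node to see what the new flags do. The sample URLs below are invented.

// sketch: same logic as the hunk above, extracted for experimentation
const normalizeURL = (url, ignoreHash = false, addSlash = false) => {
  const parsedURL = new URL(url)
  for (const indexSuffix of ['index.html', 'index.php', 'index.jsp', 'index.cgi']) {
    if (parsedURL.pathname.endsWith('/' + indexSuffix)) {
      // drop the directory-index file name, keeping the trailing slash
      parsedURL.pathname = parsedURL.pathname.slice(0, parsedURL.pathname.length - indexSuffix.length)
    }
  }
  if (ignoreHash) parsedURL.hash = ''
  if (addSlash && !parsedURL.pathname.endsWith('/')) parsedURL.pathname += '/'
  return parsedURL.href
}

console.log(normalizeURL('https://example.com/docs/index.html#intro', true, true))
// -> https://example.com/docs/
console.log(normalizeURL('https://example.com/docs#intro', true, true))
// -> https://example.com/docs/

With ignoreHash and addSlash both set, two spellings of the same page normalize to the same href, which is how the anchor-handling hunk further down decides whether a link points at the current page.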
@@ -158,7 +158,21 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
   for (const url of processingConfig.startURLs) {
     await pages.push({ url, source: 'config start URLs' })
   }
-
+
+  if (processingConfig.sitemap) {
+    await log.info(`fetch start URLs from sitemap ${processingConfig.sitemap}`)
+    const sitemap = (await axios.get(processingConfig.sitemap)).data
+    const cheerio = require('cheerio')
+    const $ = cheerio.load(sitemap)
+    const sitemapURLs = []
+    $('url loc').each(function () {
+      sitemapURLs.push($(this).text())
+    })
+    for (const url of sitemapURLs) {
+      await pages.push({ url, source: 'sitemap' })
+    }
+  }
+  // TODO: use robots.getSitemaps()
 
   const sentIds = new Set([])
   const sendPage = async (page, data, contentType = 'text/html', filename = 'content.html') => {
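Note (illustration, not part of the published diff): the sitemap block boils down to loading the fetched XML with cheerio and collecting the text of every <loc> inside a <url> element. A runnable sketch with an invented sitemap body, relying only on cheerio's default lenient parsing, the same call the diff makes:

const cheerio = require('cheerio')

// stands in for (await axios.get(processingConfig.sitemap)).data
const sitemap = `<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url><loc>https://example.com/</loc></url>
  <url><loc>https://example.com/docs/</loc></url>
</urlset>`

const $ = cheerio.load(sitemap)
const sitemapURLs = []
// same selector as the diff: every <loc> inside a <url> element
$('url loc').each(function () {
  sitemapURLs.push($(this).text())
})
console.log(sitemapURLs)
// -> [ 'https://example.com/', 'https://example.com/docs/' ]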
@@ -210,8 +224,9 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
     if (page.etag) headers['if-none-match'] = page.etag
     let response
     try {
-      response = await axios.get(page.url, { headers })
+      response = await axios.get(page.url, { headers, maxRedirects: 0 })
     } catch (err) {
+      // content did not change
       if (err.status === 304) {
         await log.debug(`page was not modified since last exploration ${page.url}`)
         sentIds.add(page._id)
@@ -220,12 +235,19 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
         }
         continue
       }
+      // follow a redirect
+      if (err.status === 301) {
+        await log.debug(`page redirected ${page.url} -> ${err.headers.location}`)
+        pages.push({ url: new URL(err.headers.location, page.url).href, source: 'redirect ' + page.url })
+        continue
+      }
       await log.warning(`failed to fetch page ${page.url} - ${err.status || err.message}`)
       if (page.source) await log.warning(`this broken URL comes from ${page.source}`)
       continue
     }
 
     if (response.headers['x-robots-tag']) {
+      await log.debug('use x-robots-tag header', response.headers['x-robots-tag'])
       for (const part of response.headers['x-robots-tag'].split(',').map(p => p.trim())) {
         if (part === 'noindex') page.noindex = true
         if (part === 'nofollow') page.nofollow = true
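Note (illustration, not part of the published diff): with maxRedirects: 0, axios stops following redirects itself, so a 301 response is rejected like any other non-2xx status and surfaces in the catch block above. And because a Location header may be relative, the new code resolves it against the page URL; Node's built-in WHATWG URL does that resolution (sample values invented):

// relative Location: resolved against the page that answered 301
console.log(new URL('/new-page', 'https://example.com/old-page').href)
// -> https://example.com/new-page

// absolute Location: the base argument is ignored
console.log(new URL('https://other.example.org/moved', 'https://example.com/old-page').href)
// -> https://other.example.org/moved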
@@ -245,6 +267,7 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
       const name = $(this).attr('name')
       if (name === 'robots') {
         const content = $(this).attr('content')
+        log.debug('use robots meta', content)
         if (content) {
           for (const part of content.split(',').map(p => p.trim())) {
             if (part === 'noindex') page.noindex = true
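Note (illustration, not part of the published diff): the x-robots-tag header and the robots meta tag in the two hunks above carry the same comma-separated directive syntax, so both are parsed the same way. With an invented value:

const content = 'noindex, nofollow'
for (const part of content.split(',').map(p => p.trim())) {
  if (part === 'noindex') console.log('page content will not be indexed')
  if (part === 'nofollow') console.log('links on the page will not be followed')
}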
@@ -260,7 +283,7 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
       const href = $(this).attr('href')
       if (!href) return
       const parsedURL = new URL(href, page.url)
-      if (parsedURL.hash) {
+      if (parsedURL.hash && normalizeURL(parsedURL.href, true, true) === normalizeURL(page.url, true, true)) {
         const targetElement = $(parsedURL.hash)
         if (!targetElement) return
         for (const anchor of processingConfig.anchors || []) {
@@ -282,7 +305,6 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
       </body>`)
     }
   }
-
   if (!page.nofollow) {
     $('a').each(function (i, elem) {
       const href = $(this).attr('href')
package/package.json
CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "@data-fair/processing-web-scraper",
-  "version": "0.2.0",
+  "version": "0.3.0",
   "description": "A small Web scraper that publishes its data into data-fair datasets.",
   "main": "index.js",
   "scripts": {
@@ -22,7 +22,7 @@
   },
   "homepage": "https://github.com/data-fair/processing-web-scraper#readme",
   "devDependencies": {
-    "@data-fair/processings-test-utils": "^0.5.
+    "@data-fair/processings-test-utils": "^0.5.1",
     "config": "^3.3.6",
     "eslint": "^7.18.0",
     "express": "^4.18.2",
package/processing-config-schema.json
CHANGED

@@ -49,13 +49,22 @@
     "startURLs": {
       "type": "array",
       "title": "URLs de départ",
-      "
+      "description": "Peut être omis si vous fournissez une URL vers un fichier sitemap.xml dans les paramètres avancés",
+      "minItems": 0,
       "items": {"type": "string"}
     }
   }
 }, {
   "title": "Paramètres avancés",
   "properties": {
+    "titlePrefix": {
+      "type": "string",
+      "title": "Préfixe à supprimer des titres avant indexation"
+    },
+    "sitemap": {
+      "type": "string",
+      "title": "URL d'un fichier sitemap.xml"
+    },
     "prune": {
       "type": "array",
       "title": "Sélecteurs d'éléments HTML à ignorer",
@@ -98,10 +107,6 @@
         }
       }
     }
-  },
-  "titlePrefix": {
-    "type": "string",
-    "title": "Préfixe à supprimer des titres avant indexation"
   }
 }
}]
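Note (illustration, not part of the published diff): taken together, the schema changes allow startURLs to be empty (minItems: 0) when a sitemap URL is supplied, and titlePrefix moves into the "Paramètres avancés" section next to the new sitemap field. A hypothetical processingConfig that the 0.3.0 schema accepts, with invented values:

const processingConfig = {
  startURLs: [], // may now be empty since a sitemap is given
  sitemap: 'https://example.com/sitemap.xml', // new advanced parameter
  titlePrefix: 'Example docs - ' // same meaning as before, new position in the schema
}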