npm - @data-fair/processing-web-scraper - Versions diffs - 0.2.1 → 0.4.0 - Mend

@data-fair/processing-web-scraper 0.2.1 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3)

package/index.js +42 -9
package/package.json +3 -2
package/processing-config-schema.json +21 -14

package/index.js CHANGED Viewed

@@ -5,9 +5,7 @@ const robotsParser = require('robots-parser')
 // TODO:
 // handle html but also any file formats
 // add in-links info (at least for files)
-// store last-modified and e-tag and use is when re-crawling a site
 // specifications listed here http://robots-txt.com/
-// normalize URL to prevent duplicates
 const datasetSchema = [
   {
@@ -51,13 +49,15 @@ const datasetSchema = [
 // a global variable to manage interruption
 let stopped
-const normalizeURL = (url) => {
+const normalizeURL = (url, ignoreHash = false, addSlash = false) => {
   const parsedURL = new URL(url)
   for (const indexSuffix of ['index.html', 'index.php', 'index.jsp', 'index.cgi']) {
     if (parsedURL.pathname.endsWith('/' + indexSuffix)) {
       parsedURL.pathname = parsedURL.pathname.slice(0, parsedURL.pathname.length - indexSuffix.length)
     }
   }
+  if (ignoreHash) parsedURL.hash = ''
+  if (addSlash && !parsedURL.pathname.endsWith('/')) parsedURL.pathname += '/'
   return parsedURL.href
 }
@@ -73,6 +73,13 @@ class PagesIterator {
     this.pluginConfig = pluginConfig
     this.processingConfig = processingConfig
     this.robots = robots
+    const UrlPattern = require('url-pattern')
+    this.excludeURLPatterns = (processingConfig.excludeURLPatterns || []).map(p => {
+      const url = new URL(p)
+      const pattern = new UrlPattern(url.pathname)
+      pattern.hostname = url.hostname
+      return pattern
+    })
   }
   [Symbol.asyncIterator] () {
@@ -80,11 +87,13 @@ class PagesIterator {
   }
   push (page) {
-    // TODO: apply no-follow rules
     if (typeof page === 'string') page = { url: page }
     if (!this.processingConfig.baseURLs.find(b => page.url.startsWith(b))) return
     page.parsedURL = page.parsedURL || new URL(page.url)
     if (page.parsedURL.hash) return
+    if (this.excludeURLPatterns.find(p => p.match(page.parsedURL.pathname) && p.hostname === page.parsedURL.hostname)) {
+      return
+    }
     if (this.robots[page.parsedURL.origin] && !this.robots[page.parsedURL.origin].isAllowed(page.url, this.pluginConfig.userAgent || 'data-fair-web-scraper')) {
       return
     }
@@ -126,12 +135,16 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
   // parse the robots.txt files if available
   const robots = {}
+  const sitemaps = processingConfig.sitemaps || []
   for (const baseURL of processingConfig.baseURLs) {
     const { origin } = new URL(baseURL)
     if (robots[origin]) continue
     try {
       const response = await axios.get(origin + '/robots.txt')
       robots[origin] = robotsParser(origin + '/robots.txt', response.data)
+      for (const sitemap of robots[origin].getSitemaps()) {
+        if (!sitemaps.includes(sitemap)) sitemaps.push(sitemap)
+      }
     } catch (err) {
       await log.info(`failed to fetch ${origin + '/robots.txt'} - ${err.status || err.message}`)
     }
@@ -158,12 +171,24 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
   for (const url of processingConfig.startURLs) {
     await pages.push({ url, source: 'config start URLs' })
   }
-  // TODO: init from sitemap (and use robots.getSitemaps() to help in this)
+  for (const sitemapURL of sitemaps) {
+    await log.info(`fetch start URLs from sitemap ${sitemapURL}`)
+    const sitemap = (await axios.get(sitemapURL)).data
+    const cheerio = require('cheerio')
+    const $ = cheerio.load(sitemap)
+    const sitemapURLs = []
+    $('url loc').each(function () {
+      sitemapURLs.push($(this).text())
+    })
+    for (const url of sitemapURLs) {
+      await pages.push({ url, source: 'sitemap' })
+    }
+  }
   const sentIds = new Set([])
   const sendPage = async (page, data, contentType = 'text/html', filename = 'content.html') => {
     await log.debug('send page', page.url)
-    // TODO: apply no-index rules
     const form = new FormData()
     // improve page title
     if (page.title) {
@@ -233,6 +258,7 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
     }
     if (response.headers['x-robots-tag']) {
+      await log.debug('use x-robots-tag header', response.headers['x-robots-tag'])
       for (const part of response.headers['x-robots-tag'].split(',').map(p => p.trim())) {
         if (part === 'noindex') page.noindex = true
         if (part === 'nofollow') page.nofollow = true
@@ -246,12 +272,20 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
     if (isHTML) {
       const cheerio = require('cheerio')
       const $ = cheerio.load(response.data)
-      page.title = $('title').text()
+      const titleSelectors = (processingConfig.titleSelectors || []).concat(['title', 'h1'])
+      for (const titleSelector of titleSelectors) {
+        page.title = $(titleSelector).text()
+        if (page.title) {
+          log.debug(`used title selector "${titleSelector}" -> ${page.title.trim()}`)
+          break
+        }
+      }
       $('meta').each(function (i, elem) {
         const name = $(this).attr('name')
         if (name === 'robots') {
           const content = $(this).attr('content')
+          log.debug('use robots meta', content)
           if (content) {
             for (const part of content.split(',').map(p => p.trim())) {
               if (part === 'noindex') page.noindex = true
@@ -267,7 +301,7 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
           const href = $(this).attr('href')
           if (!href) return
           const parsedURL = new URL(href, page.url)
-          if (parsedURL.hash) {
+          if (parsedURL.hash && normalizeURL(parsedURL.href, true, true) === normalizeURL(page.url, true, true)) {
             const targetElement = $(parsedURL.hash)
             if (!targetElement) return
             for (const anchor of processingConfig.anchors || []) {
@@ -289,7 +323,6 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
 </body>`)
         }
       }
       if (!page.nofollow) {
         $('a').each(function (i, elem) {
           const href = $(this).attr('href')

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@data-fair/processing-web-scraper",
-  "version": "0.2.1",
+  "version": "0.4.0",
   "description": "A small Web scraper that publishes its data into data-fair datasets.",
   "main": "index.js",
   "scripts": {
@@ -32,6 +32,7 @@
   "dependencies": {
     "cheerio": "^1.0.0-rc.12",
     "form-data": "^4.0.0",
-    "robots-parser": "^3.0.0"
+    "robots-parser": "^3.0.0",
+    "url-pattern": "^1.0.3"
   }
 }

package/processing-config-schema.json CHANGED Viewed

@@ -49,29 +49,40 @@
       "startURLs": {
         "type": "array",
         "title": "URLs de départ",
-        "minItems": 1,
+        "description": "Peut être omis si vous fournissez une URL vers un fichier sitemap.xml dans les paramètres avancés",
+        "minItems": 0,
         "items": {"type": "string"}
       }
     }
   }, {
     "title": "Paramètres avancés",
     "properties": {
-      "prune": {
+      "titlePrefix": {
+        "type": "string",
+        "title": "Préfixe à supprimer des titres avant indexation"
+      },
+      "titleSelectors": {
         "type": "array",
-        "title": "Sélecteurs d'éléments HTML à ignorer",
-        "description": "Par exemple si une barre de navigation est présente sur de nombreuses page du site, répéter son contenu peut nuire à la qualité des résultats. Si cette barre de navigation a pour identifiant 'nav-bar' vous pouvez saisir '#nav-bar'.",
+        "title": "Sélecteurs d'éléments HTML à utiliser comme titre de page",
+        "description": "Par défaut le sélecteur 'title' est utilisé ce qui correspond au titre de page dans les métadonnées HTML.",
+        "items": {"type": "string"}
+      },
+      "sitemaps": {
+        "type": "array",
+        "title": "URLs de fichiers sitemap.xml",
+        "description": "Ces URLs peuvent également être découvertes depuis le fichier robots.txt",
         "items": {"type": "string"}
       },
-      "noIndex": {
+      "prune": {
         "type": "array",
-        "title": "URLs de page à ne pas indexer",
-        "description": "Elles seront potentiellement quand même analysées à la recherche de liens et de fragments. Notez que ce traitement respecte également les instructions des fichiers robots.txt et les metadonnées HTML robots.",
+        "title": "Sélecteurs d'éléments HTML à ignorer",
+        "description": "Par exemple si une barre de navigation est présente sur de nombreuses page du site, répéter son contenu peut nuire à la qualité des résultats. Si cette barre de navigation a pour identifiant 'nav-bar' vous pouvez saisir '#nav-bar'.",
         "items": {"type": "string"}
       },
-      "noFollow": {
+      "excludeURLPatterns": {
         "type": "array",
-        "title": "URLs de page à ne pas explorer",
-        "description": "Elles seront ni indexées ni analysées à la recherche de liens et de fragments. Notez que ce traitement respecte également les instructions des fichiers robots.txt et les metadonnées HTML robots.",
+        "title": "Formats d'URL à exclure de l'exploration",
+        "description": "Exemple: https://data-fair.github.io/master/en(/*)",
         "items": {"type": "string"}
       },
       "anchors": {
@@ -98,10 +109,6 @@
             }
           }
         }
-      },
-      "titlePrefix": {
-        "type": "string",
-        "title": "Préfixe à supprimer des titres avant indexation"
       }
     }
   }]