@data-fair/processing-web-scraper 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/.eslintrc.js ADDED
@@ -0,0 +1,13 @@
+ module.exports = {
+   root: true,
+   parserOptions: {
+     parser: 'babel-eslint',
+     sourceType: 'module'
+   },
+   extends: ['standard'],
+   // add your custom rules here
+   rules: {
+     // allow paren-less arrow functions
+     'arrow-parens': 0
+   }
+ }
package/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2022 data-fair
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,3 @@
+ # processing-web-scraper
+
+ A small Web scraper that publishes its data into data-fair datasets.
package/index.js ADDED
@@ -0,0 +1,281 @@
+ const FormData = require('form-data')
+ const crypto = require('crypto')
+ const robotsParser = require('robots-parser')
+
+ // TODO:
+ // handle html but also any file formats
+ // add in-links info (at least for files)
+ // store last-modified and e-tag and use it when re-crawling a site
+ // specifications listed here http://robots-txt.com/
+ // normalize URL to prevent duplicates
+
+ const datasetSchema = [
+   {
+     key: 'title',
+     type: 'string',
+     'x-refersTo': 'http://www.w3.org/2000/01/rdf-schema#label',
+     'x-capabilities': { textAgg: false }
+   },
+   {
+     key: 'url',
+     type: 'string',
+     'x-refersTo': 'https://schema.org/WebPage',
+     'x-capabilities': { text: false, values: false, textAgg: false, insensitive: false }
+   },
+   {
+     key: 'tags',
+     type: 'string',
+     separator: ',',
+     'x-refersTo': 'https://schema.org/DefinedTermSet',
+     'x-capabilities': { text: false, textStandard: false, textAgg: false, insensitive: false }
+   },
+   {
+     key: 'etag',
+     type: 'string',
+     separator: ',',
+     'x-capabilities': { index: false, values: false, text: false, textStandard: false, textAgg: false, insensitive: false }
+   },
+   {
+     key: 'lastModified',
+     type: 'string',
+     'x-capabilities': { index: false, values: false, text: false, textStandard: false, textAgg: false, insensitive: false }
+   },
+   {
+     key: 'attachmentPath',
+     type: 'string',
+     'x-refersTo': 'http://schema.org/DigitalDocument',
+     'x-capabilities': { text: false, textStandard: false, values: false, textAgg: false, insensitive: false }
+   }
+ ]
+
+ // a global variable to manage interruption
+ let stopped
+
+ const normalizeURL = (url) => {
+   const parsedURL = new URL(url)
+   for (const indexSuffix of ['index.html', 'index.php', 'index.jsp', 'index.cgi']) {
+     if (parsedURL.pathname.endsWith('/' + indexSuffix)) {
+       parsedURL.pathname = parsedURL.pathname.slice(0, parsedURL.pathname.length - indexSuffix.length)
+     }
+   }
+   return parsedURL.href
+ }
+
+ const getId = (page) => {
+   return crypto.createHash('sha256').update(normalizeURL(page.url)).digest('base64url').slice(0, 20)
+ }
+
+ class PagesIterator {
+   constructor (log, pluginConfig, processingConfig, robots) {
+     this.pages = []
+     this.cursor = -1
+     this.log = log
+     this.pluginConfig = pluginConfig
+     this.processingConfig = processingConfig
+     this.robots = robots
+   }
+
+   [Symbol.asyncIterator] () {
+     return this
+   }
+
+   push (page) {
+     // TODO: apply no-follow rules
+     if (typeof page === 'string') page = { url: page }
+     if (!this.processingConfig.baseURLs.find(b => page.url.startsWith(b))) return
+     page.parsedURL = page.parsedURL || new URL(page.url)
+     if (this.robots[page.parsedURL.origin] && !this.robots[page.parsedURL.origin].isAllowed(page.url, this.pluginConfig.userAgent || 'data-fair-web-scraper')) {
+       return
+     }
+     page._id = getId(page)
+     if (this.pages.find(p => p._id === page._id)) return
+     this.pages.push(page)
+   }
+
+   async next () {
+     this.cursor += 1
+     if (this.cursor === 0) await this.log.task('Crawl pages')
+     await this.log.progress('Crawl pages', this.cursor, this.pages.length)
+     const page = this.pages[this.cursor]
+     if (page) await this.log.debug('next page', page.url)
+     return { value: page, done: this.cursor === this.pages.length }
+   }
+ }
+
+ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir, axios, log, patchConfig, ws }) => {
+   let dataset
+   if (processingConfig.datasetMode === 'create') {
+     await log.step('Dataset creation')
+     dataset = (await axios.post('api/v1/datasets', {
+       id: processingConfig.dataset.id,
+       title: processingConfig.dataset.title,
+       isRest: true,
+       schema: datasetSchema,
+       extras: { processingId }
+     })).data
+     await log.info(`dataset created, id="${dataset.id}", title="${dataset.title}"`)
+     await patchConfig({ datasetMode: 'update', dataset: { id: dataset.id, title: dataset.title } })
+     await ws.waitForJournal(dataset.id, 'finalize-end')
+   } else if (processingConfig.datasetMode === 'update') {
+     await log.step('Check dataset')
+     dataset = (await axios.get(`api/v1/datasets/${processingConfig.dataset.id}`)).data
+     if (!dataset) throw new Error(`the dataset does not exist, id="${processingConfig.dataset.id}"`)
+     await log.info(`the dataset exists, id="${dataset.id}", title="${dataset.title}"`)
+   }
+
+   // parse the robots.txt files if available
+   const robots = {}
+   for (const baseURL of processingConfig.baseURLs) {
+     const { origin } = new URL(baseURL)
+     if (robots[origin]) continue
+     try {
+       const response = await axios.get(origin + '/robots.txt')
+       robots[origin] = robotsParser(origin + '/robots.txt', response.data)
+     } catch (err) {
+       await log.info(`failed to fetch ${origin + '/robots.txt'} - ${err.status || err.message}`)
+     }
+   }
+
+   const pages = new PagesIterator(log, pluginConfig, processingConfig, robots)
+
+   await log.step('Init pages list')
+   await log.info(`add ${processingConfig.startURLs.length} pages from config`)
+   for (const url of processingConfig.startURLs) await pages.push({ url, source: 'config start URLs' })
+   if (processingConfig.datasetMode === 'update') {
+     const existingPages = (await axios.get(`api/v1/datasets/${dataset.id}/lines`, { params: { select: '_id,url,etag,lastModified', size: 10000 } })).data.results
+     await log.info(`add ${existingPages.length} pages from previous crawls`)
+     for (const page of existingPages) await pages.push({ ...page, source: 'previous exploration' })
+   }
+   // TODO: init from sitemap (and use robots.getSitemaps() to help in this)
+
+   const sendPage = async (page, data, contentType = 'text/html', filename = 'content.html') => {
+     await log.debug('send page', page.url)
+     // TODO: apply no-index rules
+     const form = new FormData()
+     // improve page title
+     if (page.title) {
+       page.title = page.title.trim()
+       if (processingConfig.titlePrefix && page.title.startsWith(processingConfig.titlePrefix)) {
+         page.title = page.title.replace(processingConfig.titlePrefix, '')
+       }
+     }
+     form.append('title', page.title)
+     form.append('url', page.url)
+     if (page.tags && page.tags.length) form.append('tags', page.tags.join(','))
+     data = typeof data === 'string' ? Buffer.from(data) : data
+     const dataOpts = {
+       contentType,
+       filename,
+       knownLength: data.length
+     }
+     form.append('attachment', data, dataOpts)
+     page._id = getId(page)
+     const headers = {
+       ...form.getHeaders(),
+       'content-length': form.getLengthSync()
+     }
+     await axios({
+       method: 'put',
+       url: `api/v1/datasets/${dataset.id}/lines/${page._id}`,
+       data: form,
+       headers
+     })
+   }
+
+   for await (const page of pages) {
+     if (stopped) break
+
+     const crawlDelay = (robots[page.parsedURL.origin] && robots[page.parsedURL.origin].getCrawlDelay()) || pluginConfig.defaultCrawlDelay || 1
+     await new Promise(resolve => setTimeout(resolve, crawlDelay * 1000))
+
+     // TODO: apply if-none-match and if-modified-since headers if etag or lastModified are available
+     let response
+     try {
+       response = await axios.get(page.url, { headers: { 'user-agent': pluginConfig.userAgent || 'data-fair-web-scraper' } })
+     } catch (err) {
+       await log.warning(`failed to fetch page ${page.url} - ${err.status || err.message}`)
+       if (page.source) await log.warning(`this broken URL comes from ${page.source}`)
+       continue
+     }
+
+     if (response.headers['x-robots-tag']) {
+       for (const part of response.headers['x-robots-tag'].split(',').map(p => p.trim())) {
+         if (part === 'noindex') page.noindex = true
+         if (part === 'nofollow') page.nofollow = true
+       }
+     }
+
+     const isHTML = (response.headers['content-type'] && response.headers['content-type'].startsWith('text/html;')) || (typeof response.data === 'string' && response.data.trim().startsWith('<html'))
+     if (isHTML) {
+       const cheerio = require('cheerio')
+       const $ = cheerio.load(response.data)
+       page.title = $('title').text()
+
+       $('meta').each(function (i, elem) {
+         const name = $(this).attr('name')
+         if (name === 'robots') {
+           const content = $(this).attr('content')
+           if (content) {
+             for (const part of content.split(',').map(p => p.trim())) {
+               if (part === 'noindex') page.noindex = true
+               if (part === 'nofollow') page.nofollow = true
+             }
+           }
+         }
+       })
+
+       if (!page.noindex && processingConfig.anchors && processingConfig.anchors.length) {
+         const anchorsPages = []
+         $('a').each(function (i, elem) {
+           const href = $(this).attr('href')
+           if (!href) return
+           const parsedURL = new URL(href, page.url)
+           if (parsedURL.hash) {
+             const targetElement = $(parsedURL.hash)
+             if (!targetElement) return
+             for (const anchor of processingConfig.anchors || []) {
+               const fragment = anchor.wrapperSelector ? targetElement.closest(anchor.wrapperSelector) : targetElement
+               const fragmentHtml = fragment.html()
+               if (fragmentHtml) {
+                 const anchorPage = { url: parsedURL.href }
+                 if (anchor.titleSelector) anchorPage.title = fragment.find(anchor.titleSelector).text() || page.title
+                 else anchorPage.title = targetElement.text() || page.title
+                 anchorPage.tags = anchor.tags || []
+                 anchorsPages.push([anchorPage, fragmentHtml])
+                 $(fragment).remove()
+               }
+             }
+           }
+         })
+         for (const [anchorPage, fragmentHtml] of anchorsPages) {
+           console.log(anchorPage)
+           await sendPage(anchorPage, `<body>
+ ${fragmentHtml}
+ </body>`)
+         }
+       }
+
+       if (!page.nofollow) {
+         $('a').each(function (i, elem) {
+           const href = $(this).attr('href')
+           if (href) pages.push({ url: new URL(href, page.url).href, source: page.url })
+         })
+       }
+
+       if (!page.noindex) {
+         if (processingConfig.prune) {
+           processingConfig.prune.forEach(s => $(s).remove())
+         }
+         await sendPage(page, $.html())
+       }
+     }
+   }
+ }
+
+ // used to manage interruption
+ // not required but it is a good practice to prevent incoherent state as much as possible
+ // the run method should finish shortly after calling stop, otherwise the process will be forcibly terminated
+ // the grace period before force termination is 20 seconds
+ exports.stop = async () => {
+   stopped = true
+ }
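
Note on the id scheme used above: each dataset line is keyed by a hash of the normalized page URL, so URL variants that only differ by a trailing index.html-style suffix collapse into a single line across crawls. A minimal standalone sketch of that behaviour follows; it simply repeats the normalizeURL and getId logic from index.js, and the example.com URLs are illustrative only.

    const crypto = require('crypto')

    // same normalization as normalizeURL in index.js: strip common directory-index suffixes
    const normalizeURL = (url) => {
      const parsedURL = new URL(url)
      for (const indexSuffix of ['index.html', 'index.php', 'index.jsp', 'index.cgi']) {
        if (parsedURL.pathname.endsWith('/' + indexSuffix)) {
          parsedURL.pathname = parsedURL.pathname.slice(0, parsedURL.pathname.length - indexSuffix.length)
        }
      }
      return parsedURL.href
    }

    // same derivation as getId in index.js: sha256 of the normalized URL, base64url, truncated to 20 chars
    const getId = (url) => crypto.createHash('sha256').update(normalizeURL(url)).digest('base64url').slice(0, 20)

    // both calls print the same id, so a re-crawl overwrites the existing line instead of duplicating it
    console.log(getId('https://example.com/docs/'))
    console.log(getId('https://example.com/docs/index.html'))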
package/package.json ADDED
@@ -0,0 +1,37 @@
+ {
+   "name": "@data-fair/processing-web-scraper",
+   "version": "0.1.0",
+   "description": "A small Web scraper that publishes its data into data-fair datasets.",
+   "main": "index.js",
+   "scripts": {
+     "test": "mocha --exit",
+     "lint": "eslint --ignore-path .gitignore .",
+     "lint-fix": "eslint --ignore-path .gitignore --fix ."
+   },
+   "repository": {
+     "type": "git",
+     "url": "git+https://github.com/data-fair/processing-web-scraper.git"
+   },
+   "keywords": [
+     "data-fair-processings-plugin"
+   ],
+   "author": "Alban Mouton <alban.mouton@gmail.com>",
+   "license": "MIT",
+   "bugs": {
+     "url": "https://github.com/data-fair/processing-web-scraper/issues"
+   },
+   "homepage": "https://github.com/data-fair/processing-web-scraper#readme",
+   "devDependencies": {
+     "@data-fair/processings-test-utils": "^0.5.0",
+     "config": "^3.3.6",
+     "eslint": "^7.18.0",
+     "express": "^4.18.2",
+     "mocha": "^8.2.1",
+     "standard": "^16.0.3"
+   },
+   "dependencies": {
+     "cheerio": "^1.0.0-rc.12",
+     "form-data": "^4.0.0",
+     "robots-parser": "^3.0.0"
+   }
+ }
@@ -0,0 +1,18 @@
+ {
+   "type": "object",
+   "required": ["userAgent"],
+   "properties": {
+     "userAgent": {
+       "type": "string",
+       "title": "Robot identifier",
+       "description": "User-Agent header communicated to the web servers and used to determine the relevant rules from robots.txt files. Example: \"data-fair-web-scraper-koumoul\"",
+       "default": "data-fair-web-scraper"
+     },
+     "defaultCrawlDelay": {
+       "type": "integer",
+       "title": "Number of seconds to wait between each page download",
+       "description": "Can be overridden by robots.txt Crawl-delay rules",
+       "default": 1
+     }
+   }
+ }
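
For reference, a plugin configuration accepted by the schema above could look like the following sketch; the values are illustrative, with the userAgent example taken from the description above.

    // hypothetical pluginConfig values validating against the schema above
    const pluginConfig = {
      userAgent: 'data-fair-web-scraper-koumoul', // sent as the User-Agent header and matched against robots.txt rules
      defaultCrawlDelay: 2 // seconds between page downloads, unless robots.txt defines a Crawl-delay
    }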
@@ -0,0 +1,108 @@
+ {
+   "type": "object",
+   "x-display": "tabs",
+   "required": ["datasetMode"],
+   "allOf": [{
+     "title": "Dataset",
+     "oneOf": [{
+       "title": "Create a dataset",
+       "required": ["dataset"],
+       "properties": {
+         "datasetMode": { "type": "string", "const": "create", "title": "Action" },
+         "dataset": {
+           "type": "object",
+           "required": ["title"],
+           "properties": {
+             "id": {"type": "string", "title": "Identifier (leave empty to derive an identifier from the title)"},
+             "title": {"type": "string", "title": "Title", "default": "Hello world"}
+           }
+         }
+       }
+     }, {
+       "title": "Update a dataset",
+       "required": ["dataset"],
+       "properties": {
+         "datasetMode": { "type": "string", "const": "update" },
+         "dataset": {
+           "type": "object",
+           "x-fromUrl": "{context.dataFairUrl}/api/v1/datasets?q={q}&select=id,title&{context.ownerFilter}",
+           "x-itemsProp": "results",
+           "x-itemTitle": "title",
+           "x-itemKey": "id",
+           "properties": {
+             "id": {"type": "string", "title": "Identifier"},
+             "title": {"type": "string", "title": "Title"}
+           }
+         }
+       }
+     }]
+   }, {
+     "title": "Essential parameters",
+     "properties": {
+       "baseURLs": {
+         "type": "array",
+         "title": "URL prefixes",
+         "description": "For a URL to be crawled it must start with one of these prefixes. If you want to crawl a whole domain you can simply enter its root URL.",
+         "minItems": 1,
+         "items": {"type": "string"}
+       },
+       "startURLs": {
+         "type": "array",
+         "title": "Start URLs",
+         "minItems": 1,
+         "items": {"type": "string"}
+       }
+     }
+   }, {
+     "title": "Advanced parameters",
+     "properties": {
+       "prune": {
+         "type": "array",
+         "title": "Selectors of HTML elements to ignore",
+         "description": "For example, if a navigation bar is present on many pages of the site, repeating its content can harm the quality of the results. If this navigation bar has the identifier 'nav-bar' you can enter '#nav-bar'.",
+         "items": {"type": "string"}
+       },
+       "noIndex": {
+         "type": "array",
+         "title": "URLs of pages not to index",
+         "description": "They may still be analyzed to look for links and fragments. Note that this processing also respects the instructions of robots.txt files and robots HTML metadata.",
+         "items": {"type": "string"}
+       },
+       "noFollow": {
+         "type": "array",
+         "title": "URLs of pages not to crawl",
+         "description": "They will be neither indexed nor analyzed to look for links and fragments. Note that this processing also respects the instructions of robots.txt files and robots HTML metadata.",
+         "items": {"type": "string"}
+       },
+       "anchors": {
+         "type": "array",
+         "title": "Extraction of anchored sections in pages",
+         "description": "An anchor is a link that points to a section of a page. These anchors and their corresponding sections can be extracted and indexed separately from the page that contains them.",
+         "items": {
+           "type": "object",
+           "properties": {
+             "tags": {
+               "type": "array",
+               "title": "Tags to associate with this type of fragment",
+               "items": {"type": "string"}
+             },
+             "wrapperSelector": {
+               "type": "string",
+               "title": "CSS selector of the wrapping element",
+               "description": "The CSS selector is applied to the parent elements of the link target; the closest matching element is returned and its HTML content becomes the content of the fragment. Optional; if absent, the wrapping element is the link target element itself."
+             },
+             "titleSelector": {
+               "type": "string",
+               "title": "CSS selector of the title",
+               "description": "The CSS selector is applied inside the wrapping element to extract the title of the fragment. Optional; if absent, the title is extracted directly from the link target element."
+             }
+           }
+         }
+       },
+       "titlePrefix": {
+         "type": "string",
+         "title": "Prefix to strip from titles before indexing"
+       }
+     }
+   }]
+ }
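
Similarly, a processing configuration accepted by this last schema could look like the following sketch; all URLs, selectors and titles are illustrative and not part of the package.

    // hypothetical processingConfig instance for the schema above
    const processingConfig = {
      datasetMode: 'create',
      dataset: { title: 'Example site crawl' },
      baseURLs: ['https://docs.example.com/'], // only URLs starting with these prefixes are crawled
      startURLs: ['https://docs.example.com/'],
      prune: ['#nav-bar'], // HTML elements removed before indexing
      anchors: [{
        tags: ['section'],
        wrapperSelector: 'section', // closest wrapping element around the anchor target
        titleSelector: 'h2' // title extracted inside the wrapping element
      }],
      titlePrefix: 'Example docs - ' // stripped from page titles before indexing
    }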