@data-fair/processing-web-scraper 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/.eslintrc.js ADDED
@@ -0,0 +1,13 @@
+ module.exports = {
+   root: true,
+   parserOptions: {
+     parser: 'babel-eslint',
+     sourceType: 'module'
+   },
+   extends: ['standard'],
+   // add your custom rules here
+   rules: {
+     // allow paren-less arrow functions
+     'arrow-parens': 0
+   }
+ }
package/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2022 data-fair
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,3 @@
+ # processing-web-scraper
+
+ A small Web scraper that publishes its data into data-fair datasets.
package/index.js ADDED
@@ -0,0 +1,281 @@
+ const FormData = require('form-data')
+ const crypto = require('crypto')
+ const robotsParser = require('robots-parser')
+
+ // TODO:
+ // handle html but also any file formats
+ // add in-links info (at least for files)
+ // store last-modified and e-tag and use it when re-crawling a site
+ // specifications listed here http://robots-txt.com/
+ // normalize URL to prevent duplicates
+
+ const datasetSchema = [
+   {
+     key: 'title',
+     type: 'string',
+     'x-refersTo': 'http://www.w3.org/2000/01/rdf-schema#label',
+     'x-capabilities': { textAgg: false }
+   },
+   {
+     key: 'url',
+     type: 'string',
+     'x-refersTo': 'https://schema.org/WebPage',
+     'x-capabilities': { text: false, values: false, textAgg: false, insensitive: false }
+   },
+   {
+     key: 'tags',
+     type: 'string',
+     separator: ',',
+     'x-refersTo': 'https://schema.org/DefinedTermSet',
+     'x-capabilities': { text: false, textStandard: false, textAgg: false, insensitive: false }
+   },
+   {
+     key: 'etag',
+     type: 'string',
+     separator: ',',
+     'x-capabilities': { index: false, values: false, text: false, textStandard: false, textAgg: false, insensitive: false }
+   },
+   {
+     key: 'lastModified',
+     type: 'string',
+     'x-capabilities': { index: false, values: false, text: false, textStandard: false, textAgg: false, insensitive: false }
+   },
+   {
+     key: 'attachmentPath',
+     type: 'string',
+     'x-refersTo': 'http://schema.org/DigitalDocument',
+     'x-capabilities': { text: false, textStandard: false, values: false, textAgg: false, insensitive: false }
+   }
+ ]
+
+ // a global variable to manage interruption
+ let stopped
+
+ const normalizeURL = (url) => {
+   const parsedURL = new URL(url)
+   for (const indexSuffix of ['index.html', 'index.php', 'index.jsp', 'index.cgi']) {
+     if (parsedURL.pathname.endsWith('/' + indexSuffix)) {
+       parsedURL.pathname = parsedURL.pathname.slice(0, parsedURL.pathname.length - indexSuffix.length)
+     }
+   }
+   return parsedURL.href
+ }
+
+ const getId = (page) => {
+   return crypto.createHash('sha256').update(normalizeURL(page.url)).digest('base64url').slice(0, 20)
+ }
+
+ class PagesIterator {
+   constructor (log, pluginConfig, processingConfig, robots) {
+     this.pages = []
+     this.cursor = -1
+     this.log = log
+     this.pluginConfig = pluginConfig
+     this.processingConfig = processingConfig
+     this.robots = robots
+   }
+
+   [Symbol.asyncIterator] () {
+     return this
+   }
+
+   push (page) {
+     // TODO: apply no-follow rules
+     if (typeof page === 'string') page = { url: page }
+     if (!this.processingConfig.baseURLs.find(b => page.url.startsWith(b))) return
+     page.parsedURL = page.parsedURL || new URL(page.url)
+     if (this.robots[page.parsedURL.origin] && !this.robots[page.parsedURL.origin].isAllowed(page.url, this.pluginConfig.userAgent || 'data-fair-web-scraper')) {
+       return
+     }
+     page._id = getId(page)
+     if (this.pages.find(p => p._id === page._id)) return
+     this.pages.push(page)
+   }
+
+   async next () {
+     this.cursor += 1
+     if (this.cursor === 0) await this.log.task('Crawl pages')
+     await this.log.progress('Crawl pages', this.cursor, this.pages.length)
+     const page = this.pages[this.cursor]
+     if (page) await this.log.debug('next page', page.url)
+     return { value: page, done: this.cursor === this.pages.length }
+   }
+ }
+
+ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir, axios, log, patchConfig, ws }) => {
+   let dataset
+   if (processingConfig.datasetMode === 'create') {
+     await log.step('Dataset creation')
+     dataset = (await axios.post('api/v1/datasets', {
+       id: processingConfig.dataset.id,
+       title: processingConfig.dataset.title,
+       isRest: true,
+       schema: datasetSchema,
+       extras: { processingId }
+     })).data
+     await log.info(`dataset created, id="${dataset.id}", title="${dataset.title}"`)
+     await patchConfig({ datasetMode: 'update', dataset: { id: dataset.id, title: dataset.title } })
+     await ws.waitForJournal(dataset.id, 'finalize-end')
+   } else if (processingConfig.datasetMode === 'update') {
+     await log.step('Check dataset')
+     dataset = (await axios.get(`api/v1/datasets/${processingConfig.dataset.id}`)).data
+     if (!dataset) throw new Error(`the dataset does not exist, id="${processingConfig.dataset.id}"`)
+     await log.info(`the dataset exists, id="${dataset.id}", title="${dataset.title}"`)
+   }
+
+   // parse the robots.txt files if available
+   const robots = {}
+   for (const baseURL of processingConfig.baseURLs) {
+     const { origin } = new URL(baseURL)
+     if (robots[origin]) continue
+     try {
+       const response = await axios.get(origin + '/robots.txt')
+       robots[origin] = robotsParser(origin + '/robots.txt', response.data)
+     } catch (err) {
+       await log.info(`failed to fetch ${origin + '/robots.txt'} - ${err.status || err.message}`)
+     }
+   }
+
+   const pages = new PagesIterator(log, pluginConfig, processingConfig, robots)
+
+   await log.step('Init pages list')
+   await log.info(`add ${processingConfig.startURLs.length} pages from config`)
+   for (const url of processingConfig.startURLs) await pages.push({ url, source: 'config start URLs' })
+   if (processingConfig.datasetMode === 'update') {
+     const existingPages = (await axios.get(`api/v1/datasets/${dataset.id}/lines`, { params: { select: '_id,url,etag,lastModified', size: 10000 } })).data.results
+     await log.info(`add ${existingPages.length} pages from previous crawls`)
+     for (const page of existingPages) await pages.push({ ...page, source: 'previous exploration' })
+   }
+   // TODO: init from sitemap (and use robots.getSitemaps() to help in this)
+
+   const sendPage = async (page, data, contentType = 'text/html', filename = 'content.html') => {
+     await log.debug('send page', page.url)
+     // TODO: apply no-index rules
+     const form = new FormData()
+     // improve page title
+     if (page.title) {
+       page.title = page.title.trim()
+       if (processingConfig.titlePrefix && page.title.startsWith(processingConfig.titlePrefix)) {
+         page.title = page.title.replace(processingConfig.titlePrefix, '')
+       }
+     }
+     form.append('title', page.title)
+     form.append('url', page.url)
+     if (page.tags && page.tags.length) form.append('tags', page.tags.join(','))
+     data = typeof data === 'string' ? Buffer.from(data) : data
+     const dataOpts = {
+       contentType,
+       filename,
+       knownLength: data.length
+     }
+     form.append('attachment', data, dataOpts)
+     page._id = getId(page)
+     const headers = {
+       ...form.getHeaders(),
+       'content-length': form.getLengthSync()
+     }
+     await axios({
+       method: 'put',
+       url: `api/v1/datasets/${dataset.id}/lines/${page._id}`,
+       data: form,
+       headers
+     })
+   }
+
+   for await (const page of pages) {
+     if (stopped) break
+
+     const crawlDelay = (robots[page.parsedURL.origin] && robots[page.parsedURL.origin].getCrawlDelay()) || pluginConfig.defaultCrawlDelay || 1
+     await new Promise(resolve => setTimeout(resolve, crawlDelay * 1000))
+
+     // TODO: apply if-none-match and if-modified-since headers if etag or lastModified are available
+     let response
+     try {
+       response = await axios.get(page.url, { headers: { 'user-agent': pluginConfig.userAgent || 'data-fair-web-scraper' } })
+     } catch (err) {
+       await log.warning(`failed to fetch page ${page.url} - ${err.status || err.message}`)
+       if (page.source) await log.warning(`this broken URL comes from ${page.source}`)
+       continue
+     }
+
+     if (response.headers['x-robots-tag']) {
+       for (const part of response.headers['x-robots-tag'].split(',').map(p => p.trim())) {
+         if (part === 'noindex') page.noindex = true
+         if (part === 'nofollow') page.nofollow = true
+       }
+     }
+
+     const isHTML = (response.headers['content-type'] && response.headers['content-type'].startsWith('text/html;')) || (typeof response.data === 'string' && response.data.trim().startsWith('<html'))
+     if (isHTML) {
+       const cheerio = require('cheerio')
+       const $ = cheerio.load(response.data)
+       page.title = $('title').text()
+
+       $('meta').each(function (i, elem) {
+         const name = $(this).attr('name')
+         if (name === 'robots') {
+           const content = $(this).attr('content')
+           if (content) {
+             for (const part of content.split(',').map(p => p.trim())) {
+               if (part === 'noindex') page.noindex = true
+               if (part === 'nofollow') page.nofollow = true
+             }
+           }
+         }
+       })
+
+       if (!page.noindex && processingConfig.anchors && processingConfig.anchors.length) {
+         const anchorsPages = []
+         $('a').each(function (i, elem) {
+           const href = $(this).attr('href')
+           if (!href) return
+           const parsedURL = new URL(href, page.url)
+           if (parsedURL.hash) {
+             const targetElement = $(parsedURL.hash)
+             if (!targetElement) return
+             for (const anchor of processingConfig.anchors || []) {
+               const fragment = anchor.wrapperSelector ? targetElement.closest(anchor.wrapperSelector) : targetElement
+               const fragmentHtml = fragment.html()
+               if (fragmentHtml) {
+                 const anchorPage = { url: parsedURL.href }
+                 if (anchor.titleSelector) anchorPage.title = fragment.find(anchor.titleSelector).text() || page.title
+                 else anchorPage.title = targetElement.text() || page.title
+                 anchorPage.tags = anchor.tags || []
+                 anchorsPages.push([anchorPage, fragmentHtml])
+                 $(fragment).remove()
+               }
+             }
+           }
+         })
+         for (const [anchorPage, fragmentHtml] of anchorsPages) {
+           console.log(anchorPage)
+           await sendPage(anchorPage, `<body>
+ ${fragmentHtml}
+ </body>`)
+         }
+       }
+
+       if (!page.nofollow) {
+         $('a').each(function (i, elem) {
+           const href = $(this).attr('href')
+           if (href) pages.push({ url: new URL(href, page.url).href, source: page.url })
+         })
+       }
+
+       if (!page.noindex) {
+         if (processingConfig.prune) {
+           processingConfig.prune.forEach(s => $(s).remove())
+         }
+         await sendPage(page, $.html())
+       }
+     }
+   }
+ }
+
+ // used to manage interruption
+ // not required but it is a good practice to prevent incoherent state as much as possible
+ // the run method should finish shortly after calling stop, otherwise the process will be forcibly terminated
+ // the grace period before force termination is 20 seconds
+ exports.stop = async () => {
+   stopped = true
+ }
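
Note on the id scheme used above: each dataset line is keyed by a hash of the normalized page URL, so URL variants that only differ by a trailing index.html-style suffix collapse into a single line across crawls. A minimal standalone sketch of that behaviour follows; it simply repeats the normalizeURL and getId logic from index.js, and the example.com URLs are illustrative only.

    const crypto = require('crypto')

    // same normalization as normalizeURL in index.js: strip common directory-index suffixes
    const normalizeURL = (url) => {
      const parsedURL = new URL(url)
      for (const indexSuffix of ['index.html', 'index.php', 'index.jsp', 'index.cgi']) {
        if (parsedURL.pathname.endsWith('/' + indexSuffix)) {
          parsedURL.pathname = parsedURL.pathname.slice(0, parsedURL.pathname.length - indexSuffix.length)
        }
      }
      return parsedURL.href
    }

    // same derivation as getId in index.js: sha256 of the normalized URL, base64url, truncated to 20 chars
    const getId = (url) => crypto.createHash('sha256').update(normalizeURL(url)).digest('base64url').slice(0, 20)

    // both calls print the same id, so a re-crawl overwrites the existing line instead of duplicating it
    console.log(getId('https://example.com/docs/'))
    console.log(getId('https://example.com/docs/index.html'))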
package/package.json ADDED
@@ -0,0 +1,37 @@
+ {
+   "name": "@data-fair/processing-web-scraper",
+   "version": "0.1.0",
+   "description": "A small Web scraper that publishes its data into data-fair datasets.",
+   "main": "index.js",
+   "scripts": {
+     "test": "mocha --exit",
+     "lint": "eslint --ignore-path .gitignore .",
+     "lint-fix": "eslint --ignore-path .gitignore --fix ."
+   },
+   "repository": {
+     "type": "git",
+     "url": "git+https://github.com/data-fair/processing-web-scraper.git"
+   },
+   "keywords": [
+     "data-fair-processings-plugin"
+   ],
+   "author": "Alban Mouton <alban.mouton@gmail.com>",
+   "license": "MIT",
+   "bugs": {
+     "url": "https://github.com/data-fair/processing-web-scraper/issues"
+   },
+   "homepage": "https://github.com/data-fair/processing-web-scraper#readme",
+   "devDependencies": {
+     "@data-fair/processings-test-utils": "^0.5.0",
+     "config": "^3.3.6",
+     "eslint": "^7.18.0",
+     "express": "^4.18.2",
+     "mocha": "^8.2.1",
+     "standard": "^16.0.3"
+   },
+   "dependencies": {
+     "cheerio": "^1.0.0-rc.12",
+     "form-data": "^4.0.0",
+     "robots-parser": "^3.0.0"
+   }
+ }
@@ -0,0 +1,18 @@
+ {
+   "type": "object",
+   "required": ["userAgent"],
+   "properties": {
+     "userAgent": {
+       "type": "string",
+       "title": "Robot identifier",
+       "description": "User-Agent header communicated to the web servers and used to determine the relevant rules from robots.txt files. Example: \"data-fair-web-scraper-koumoul\"",
+       "default": "data-fair-web-scraper"
+     },
+     "defaultCrawlDelay": {
+       "type": "integer",
+       "title": "Number of seconds to wait between each page download",
+       "description": "Can be overridden by robots.txt Crawl-delay rules",
+       "default": 1
+     }
+   }
+ }
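
For reference, a plugin configuration accepted by the schema above could look like the following sketch; the values are illustrative, with the userAgent example taken from the description above.

    // hypothetical pluginConfig values validating against the schema above
    const pluginConfig = {
      userAgent: 'data-fair-web-scraper-koumoul', // sent as the User-Agent header and matched against robots.txt rules
      defaultCrawlDelay: 2 // seconds between page downloads, unless robots.txt defines a Crawl-delay
    }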
@@ -0,0 +1,108 @@
+ {
+   "type": "object",
+   "x-display": "tabs",
+   "required": ["datasetMode"],
+   "allOf": [{
+     "title": "Dataset",
+     "oneOf": [{
+       "title": "Create a dataset",
+       "required": ["dataset"],
+       "properties": {
+         "datasetMode": { "type": "string", "const": "create", "title": "Action" },
+         "dataset": {
+           "type": "object",
+           "required": ["title"],
+           "properties": {
+             "id": {"type": "string", "title": "Identifier (leave empty to derive an identifier from the title)"},
+             "title": {"type": "string", "title": "Title", "default": "Hello world"}
+           }
+         }
+       }
+     }, {
+       "title": "Update a dataset",
+       "required": ["dataset"],
+       "properties": {
+         "datasetMode": { "type": "string", "const": "update" },
+         "dataset": {
+           "type": "object",
+           "x-fromUrl": "{context.dataFairUrl}/api/v1/datasets?q={q}&select=id,title&{context.ownerFilter}",
+           "x-itemsProp": "results",
+           "x-itemTitle": "title",
+           "x-itemKey": "id",
+           "properties": {
+             "id": {"type": "string", "title": "Identifier"},
+             "title": {"type": "string", "title": "Title"}
+           }
+         }
+       }
+     }]
+   }, {
+     "title": "Essential parameters",
+     "properties": {
+       "baseURLs": {
+         "type": "array",
+         "title": "URL prefixes",
+         "description": "For a URL to be crawled it must start with one of these prefixes. If you want to crawl a whole domain you can simply enter its root URL.",
+         "minItems": 1,
+         "items": {"type": "string"}
+       },
+       "startURLs": {
+         "type": "array",
+         "title": "Start URLs",
+         "minItems": 1,
+         "items": {"type": "string"}
+       }
+     }
+   }, {
+     "title": "Advanced parameters",
+     "properties": {
+       "prune": {
+         "type": "array",
+         "title": "Selectors of HTML elements to ignore",
+         "description": "For example, if a navigation bar is present on many pages of the site, repeating its content can harm the quality of the results. If this navigation bar has the identifier 'nav-bar' you can enter '#nav-bar'.",
+         "items": {"type": "string"}
+       },
+       "noIndex": {
+         "type": "array",
+         "title": "URLs of pages not to index",
+         "description": "They may still be analyzed to look for links and fragments. Note that this processing also respects the instructions of robots.txt files and robots HTML metadata.",
+         "items": {"type": "string"}
+       },
+       "noFollow": {
+         "type": "array",
+         "title": "URLs of pages not to crawl",
+         "description": "They will be neither indexed nor analyzed to look for links and fragments. Note that this processing also respects the instructions of robots.txt files and robots HTML metadata.",
+         "items": {"type": "string"}
+       },
+       "anchors": {
+         "type": "array",
+         "title": "Extraction of anchored sections in pages",
+         "description": "An anchor is a link that points to a section of a page. These anchors and their corresponding sections can be extracted and indexed separately from the page that contains them.",
+         "items": {
+           "type": "object",
+           "properties": {
+             "tags": {
+               "type": "array",
+               "title": "Tags to associate with this type of fragment",
+               "items": {"type": "string"}
+             },
+             "wrapperSelector": {
+               "type": "string",
+               "title": "CSS selector of the wrapping element",
+               "description": "The CSS selector is applied to the parent elements of the link target; the closest matching element is returned and its HTML content becomes the content of the fragment. Optional; if absent, the wrapping element is the link target element itself."
+             },
+             "titleSelector": {
+               "type": "string",
+               "title": "CSS selector of the title",
+               "description": "The CSS selector is applied inside the wrapping element to extract the title of the fragment. Optional; if absent, the title is extracted directly from the link target element."
+             }
+           }
+         }
+       },
+       "titlePrefix": {
+         "type": "string",
+         "title": "Prefix to strip from titles before indexing"
+       }
+     }
+   }]
+ }
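
Similarly, a processing configuration accepted by this last schema could look like the following sketch; all URLs, selectors and titles are illustrative and not part of the package.

    // hypothetical processingConfig instance for the schema above
    const processingConfig = {
      datasetMode: 'create',
      dataset: { title: 'Example site crawl' },
      baseURLs: ['https://docs.example.com/'], // only URLs starting with these prefixes are crawled
      startURLs: ['https://docs.example.com/'],
      prune: ['#nav-bar'], // HTML elements removed before indexing
      anchors: [{
        tags: ['section'],
        wrapperSelector: 'section', // closest wrapping element around the anchor target
        titleSelector: 'h2' // title extracted inside the wrapping element
      }],
      titlePrefix: 'Example docs - ' // stripped from page titles before indexing
    }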