@data-fair/processing-web-scraper 0.4.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/index.ts ADDED
@@ -0,0 +1,19 @@
+ import type { PrepareFunction, ProcessingContext } from '@data-fair/lib-common-types/processings.js'
+ import type { ProcessingConfig } from './types/processingConfig/index.ts'
+
+ export const prepare: PrepareFunction<ProcessingConfig> = async ({ processingConfig, secrets }) => {
+   return {
+     processingConfig,
+     secrets
+   }
+ }
+
+ export const run = async (context: ProcessingContext<ProcessingConfig>) => {
+   const { run } = await import('./lib/scraper.ts')
+   await run(context)
+ }
+
+ export const stop = async () => {
+   const { stop } = await import('./lib/scraper.ts')
+   await stop()
+ }
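
In 0.6.0 the package becomes a native TypeScript ES module: the new index.ts only exposes the prepare, run and stop hooks expected by data-fair processings and lazily loads the crawler, which now lives in lib/scraper.ts (diffed below against the CommonJS implementation that was the 0.4.0 entry point). As a rough sketch of that lifecycle, assuming a hand-built context rather than the real processings runner:

// Hypothetical harness, for illustration only: the real ProcessingContext
// carries a data-fair bound axios instance, log, ws, patchConfig, etc.
import { prepare, run, stop } from '@data-fair/processing-web-scraper'

const processingConfig = {
  datasetMode: 'create' as const,
  dataset: { title: 'Web pages' },
  baseURLs: ['https://example.com'],
  startURLs: ['https://example.com/']
}

// prepare() currently just echoes the config and secrets back to the host
const prepared = await prepare({ processingConfig, secrets: {} } as any)
console.log(prepared)

// run() would then crawl using the full context; stop() flips the module-level
// "stopped" flag so the crawl loop in lib/scraper.ts exits at the next page.
// await run(context)
// await stop()
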
@@ -1,11 +1,10 @@
- const FormData = require('form-data')
- const crypto = require('crypto')
- const robotsParser = require('robots-parser')
-
- // TODO:
- // handle html but also any file formats
- // add in-links info (at least for files)
- // specifications listed here http://robots-txt.com/
+ import { createHash } from 'node:crypto'
+ import UrlPattern from 'url-pattern'
+ import robotsParser from 'robots-parser'
+ import cheerio from 'cheerio'
+ import FormData from 'form-data'
+ import type { ProcessingContext } from '@data-fair/lib-common-types/processings.js'
+ import type { ProcessingConfig } from '../types/processingConfig/index.ts'

  const datasetSchema = [
    {
@@ -14,6 +13,12 @@ const datasetSchema = [
      'x-refersTo': 'http://www.w3.org/2000/01/rdf-schema#label',
      'x-capabilities': { textAgg: false }
    },
+   {
+     key: 'description',
+     type: 'string',
+     'x-refersTo': 'https://schema.org/description',
+     'x-capabilities': { values: false, textAgg: false, insensitive: false }
+   },
    {
      key: 'url',
      type: 'string',
@@ -30,7 +35,6 @@ const datasetSchema = [
    {
      key: 'etag',
      type: 'string',
-     separator: ',',
      'x-capabilities': { index: false, values: false, text: false, textStandard: false, textAgg: false, insensitive: false }
    },
    {
@@ -46,10 +50,9 @@ const datasetSchema = [
    }
  ]

- // a global variable to manage interruption
- let stopped
+ let stopped: boolean | undefined

- const normalizeURL = (url, ignoreHash = false, addSlash = false) => {
+ const normalizeURL = (url: string, ignoreHash = false, addSlash = false): string => {
    const parsedURL = new URL(url)
    for (const indexSuffix of ['index.html', 'index.php', 'index.jsp', 'index.cgi']) {
      if (parsedURL.pathname.endsWith('/' + indexSuffix)) {
@@ -61,20 +64,41 @@ const normalizeURL = (url, ignoreHash = false, addSlash = false) => {
    return parsedURL.href
  }

- const getId = (page) => {
-   return crypto.createHash('sha256').update(normalizeURL(page.url)).digest('base64url').slice(0, 20)
+ const getId = async (page: { url: string }): Promise<string> => {
+   return createHash('sha256').update(normalizeURL(page.url)).digest('base64url').slice(0, 20)
+ }
+
+ interface Page {
+   url: string
+   title?: string
+   description?: string
+   tags?: string[]
+   etag?: string
+   lastModified?: string
+   attachmentPath?: string
+   _id?: string
+   parsedURL?: URL
+   source?: string
+   noindex?: boolean
+   nofollow?: boolean
+   parentId?: string
  }

  class PagesIterator {
-   constructor (log, pluginConfig, processingConfig, robots) {
-     this.pages = []
-     this.cursor = -1
+   pages: Page[] = []
+   cursor = -1
+   log: any
+   pluginConfig: any
+   processingConfig: ProcessingConfig
+   robots: Record<string, any>
+   excludeURLPatterns: any[] = []
+
+   constructor (log: any, pluginConfig: any, processingConfig: ProcessingConfig, robots: Record<string, any>) {
      this.log = log
      this.pluginConfig = pluginConfig
      this.processingConfig = processingConfig
      this.robots = robots
-     const UrlPattern = require('url-pattern')
-     this.excludeURLPatterns = (processingConfig.excludeURLPatterns || []).map(p => {
+     this.excludeURLPatterns = (processingConfig.excludeURLPatterns || []).map((p: string) => {
        const url = new URL(p)
        const pattern = new UrlPattern(url.pathname)
        pattern.hostname = url.hostname
@@ -86,19 +110,19 @@ class PagesIterator {
      return this
    }

-   push (page) {
+   async push (page: Page | string) {
      if (typeof page === 'string') page = { url: page }
-     if (!this.processingConfig.baseURLs.find(b => page.url.startsWith(b))) return
+     if (!this.processingConfig.baseURLs?.find((b: string) => page.url.startsWith(b))) return
      page.parsedURL = page.parsedURL || new URL(page.url)
      if (page.parsedURL.hash) return
-     if (this.excludeURLPatterns.find(p => p.match(page.parsedURL.pathname) && p.hostname === page.parsedURL.hostname)) {
+     if (this.excludeURLPatterns.find((p: any) => p.match(page.parsedURL!.pathname) && p.hostname === page.parsedURL!.hostname)) {
        return
      }
-     if (this.robots[page.parsedURL.origin] && !this.robots[page.parsedURL.origin].isAllowed(page.url, this.pluginConfig.userAgent || 'data-fair-web-scraper')) {
+     if (this.robots[page.parsedURL!.origin] && !this.robots[page.parsedURL!.origin].isAllowed(page.url, this.pluginConfig.userAgent || 'data-fair-web-scraper')) {
        return
      }
-     page._id = getId(page)
-     if (this.pages.find(p => p._id === page._id)) return
+     page._id = await getId(page)
+     if (this.pages.find((p: Page) => p._id === page._id)) return
      this.pages.push(page)
    }

@@ -112,40 +136,45 @@ class PagesIterator {
    }
  }

- exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir, axios, log, patchConfig, ws }) => {
-   let dataset
+ export const run = async (context: ProcessingContext<ProcessingConfig>) => {
+   const { pluginConfig, processingConfig, processingId, axios, log, patchConfig, ws } = context
+   let dataset: any
    if (processingConfig.datasetMode === 'create') {
      await log.step('Dataset creation')
      dataset = (await axios.post('api/v1/datasets', {
-       id: processingConfig.dataset.id,
-       title: processingConfig.dataset.title,
+       id: processingConfig.dataset?.id,
+       title: processingConfig.dataset?.title,
        isRest: true,
        schema: datasetSchema,
        extras: { processingId }
      })).data
+     if (dataset.status !== 'finalized') {
+       await ws.waitForJournal(dataset.id, 'finalize-end')
+     }
      await log.info(`dataset created, id="${dataset.id}", title="${dataset.title}"`)
      await patchConfig({ datasetMode: 'update', dataset: { id: dataset.id, title: dataset.title } })
-     await ws.waitForJournal(dataset.id, 'finalize-end')
    } else if (processingConfig.datasetMode === 'update') {
      await log.step('Check dataset')
-     dataset = (await axios.get(`api/v1/datasets/${processingConfig.dataset.id}`)).data
-     if (!dataset) throw new Error(`the dataset does not exist, id="${processingConfig.dataset.id}"`)
+     dataset = (await axios.get(`api/v1/datasets/${processingConfig.dataset?.id}`)).data
+     if (!dataset) throw new Error(`the dataset does not exist, id="${processingConfig.dataset?.id}"`)
      await log.info(`the dataset exists, id="${dataset.id}", title="${dataset.title}"`)
    }

-   // parse the robots.txt files if available
-   const robots = {}
+   const robots: Record<string, any> = {}
    const sitemaps = processingConfig.sitemaps || []
-   for (const baseURL of processingConfig.baseURLs) {
+   for (const baseURL of processingConfig.baseURLs || []) {
      const { origin } = new URL(baseURL)
      if (robots[origin]) continue
      try {
        const response = await axios.get(origin + '/robots.txt')
        robots[origin] = robotsParser(origin + '/robots.txt', response.data)
        for (const sitemap of robots[origin].getSitemaps()) {
-         if (!sitemaps.includes(sitemap)) sitemaps.push(sitemap)
+         if (!sitemaps.includes(sitemap)) {
+           await log.info(`add sitemap found in robots.txt ${sitemap}`)
+           sitemaps.push(sitemap)
+         }
        }
-     } catch (err) {
+     } catch (err: any) {
        await log.info(`failed to fetch ${origin + '/robots.txt'} - ${err.status || err.message}`)
      }
    }
@@ -153,32 +182,33 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
    const pages = new PagesIterator(log, pluginConfig, processingConfig, robots)

    await log.step('Init pages list')
-   let existingPages
+   let existingPages: Page[] | undefined
    if (processingConfig.datasetMode === 'update') {
      existingPages = (await axios.get(`api/v1/datasets/${dataset.id}/lines`, { params: { select: '_id,url,etag,lastModified', size: 10000 } })).data.results
-     await log.info(`add ${existingPages.length} pages from previous crawls`)
-     for (const page of existingPages) {
-       page.parsedURL = new URL(page.url)
-       if (page.parsedURL.hash) {
-         const parentURL = new URL(page.parsedURL)
-         parentURL.hash = ''
-         page.parentId = getId({ url: parentURL.href })
+     if (existingPages) {
+       await log.info(`add ${existingPages.length} pages from previous crawls`)
+       for (const page of existingPages) {
+         page.parsedURL = new URL(page.url)
+         if (page.parsedURL.hash) {
+           const parentURL = new URL(page.parsedURL)
+           parentURL.hash = ''
+           page.parentId = await getId({ url: parentURL.href })
+         }
+         await pages.push({ ...page, source: 'previous exploration' })
        }
-       await pages.push({ ...page, source: 'previous exploration' })
      }
    }
-   await log.info(`add ${processingConfig.startURLs.length} pages from config`)
-   for (const url of processingConfig.startURLs) {
+   await log.info(`add ${processingConfig.startURLs?.length || 0} pages from config`)
+   for (const url of processingConfig.startURLs || []) {
      await pages.push({ url, source: 'config start URLs' })
    }

    for (const sitemapURL of sitemaps) {
      await log.info(`fetch start URLs from sitemap ${sitemapURL}`)
      const sitemap = (await axios.get(sitemapURL)).data
-     const cheerio = require('cheerio')
      const $ = cheerio.load(sitemap)
-     const sitemapURLs = []
-     $('url loc').each(function () {
+     const sitemapURLs: string[] = []
+     $('url loc').each(function (this: any) {
        sitemapURLs.push($(this).text())
      })
      for (const url of sitemapURLs) {
@@ -186,20 +216,16 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
      }
    }

-   const sentIds = new Set([])
-   const sendPage = async (page, data, contentType = 'text/html', filename = 'content.html') => {
-     await log.debug('send page', page.url)
+   const sentIds = new Set<string>()
+   const sendPage = async (page: Page, data: any, contentType = 'text/html', filename = 'content.html') => {
      const form = new FormData()
-     // improve page title
+
      if (page.title) {
        page.title = page.title.trim()
        if (processingConfig.titlePrefix && page.title.startsWith(processingConfig.titlePrefix)) {
          page.title = page.title.replace(processingConfig.titlePrefix, '')
        }
      }
-     form.append('title', page.title)
-     form.append('url', page.url)
-     if (page.tags && page.tags.length) form.append('tags', page.tags.join(','))
      data = typeof data === 'string' ? Buffer.from(data) : data
      const dataOpts = {
        contentType,
@@ -207,49 +233,52 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
        knownLength: data.length
      }
      form.append('attachment', data, dataOpts)
-     if (page.lastModified) form.append('lastModified', page.lastModified)
-     if (page.etag) form.append('etag', page.etag)
-     page._id = getId(page)
+     page._id = await getId(page)
      sentIds.add(page._id)
+     const body = { ...page }
+     delete body.source
+     delete body.parsedURL
+     delete body.nofollow
+     delete body.noindex
+     form.append('_body', JSON.stringify(body))
      const headers = {
        ...form.getHeaders(),
        'content-length': form.getLengthSync()
      }
+
      await axios({
        method: 'put',
        url: `api/v1/datasets/${dataset.id}/lines/${page._id}`,
        data: form,
-       headers
+       headers,
+       validateStatus: (status) => status === 200 || status === 304
      })
    }

    for await (const page of pages) {
      if (stopped) break

-     const crawlDelay = (robots[page.parsedURL.origin] && robots[page.parsedURL.origin].getCrawlDelay()) || pluginConfig.defaultCrawlDelay || 1
+     const crawlDelay = (robots[page.parsedURL!.origin] && robots[page.parsedURL!.origin].getCrawlDelay()) || pluginConfig.defaultCrawlDelay || 1
      await new Promise(resolve => setTimeout(resolve, crawlDelay * 1000))

-     // TODO: apply if-none-match and if-modified-since headers if etag or lastModified are available
-     const headers = { 'user-agent': pluginConfig.userAgent || 'data-fair-web-scraper' }
+     const headers: Record<string, string> = { 'user-agent': pluginConfig.userAgent || 'data-fair-web-scraper' }
      if (page.lastModified) headers['if-modified-since'] = page.lastModified
      if (page.etag) headers['if-none-match'] = page.etag
-     let response
+     let response: any
      try {
        response = await axios.get(page.url, { headers, maxRedirects: 0 })
-     } catch (err) {
-       // content did not change
+     } catch (err: any) {
        if (err.status === 304) {
          await log.debug(`page was not modified since last exploration ${page.url}`)
-         sentIds.add(page._id)
-         for (const existingPage of existingPages) {
-           if (existingPage.parentId === page._id) sentIds.add(existingPage._id)
+         sentIds.add(page._id!)
+         for (const existingPage of existingPages || []) {
+           if (existingPage.parentId === page._id) sentIds.add(existingPage._id!)
          }
          continue
        }
-       // follow a redirect
-       if (err.status === 301) {
+       if (err.status === 301 || err.status === 302) {
          await log.debug(`page redirected ${page.url} -> ${err.headers.location}`)
-         pages.push({ url: new URL(err.headers.location, page.url).href, source: 'redirect ' + page.url })
+         await pages.push({ url: new URL(err.headers.location, page.url).href, source: 'redirect ' + page.url })
          continue
        }
        await log.warning(`failed to fetch page ${page.url} - ${err.status || err.message}`)
@@ -258,46 +287,70 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
      }

      if (response.headers['x-robots-tag']) {
-       await log.debug('use x-robots-tag header', response.headers['x-robots-tag'])
-       for (const part of response.headers['x-robots-tag'].split(',').map(p => p.trim())) {
+       await log.debug('x-robots-tag header', response.headers['x-robots-tag'])
+       for (const part of response.headers['x-robots-tag'].split(',').map((p: string) => p.trim())) {
          if (part === 'noindex') page.noindex = true
          if (part === 'nofollow') page.nofollow = true
        }
      }
-
      page.lastModified = response.headers['last-modified']
      page.etag = response.headers.etag

      const isHTML = (response.headers['content-type'] && response.headers['content-type'].startsWith('text/html;')) || (typeof response.data === 'string' && response.data.trim().startsWith('<html'))
      if (isHTML) {
-       const cheerio = require('cheerio')
        const $ = cheerio.load(response.data)
        const titleSelectors = (processingConfig.titleSelectors || []).concat(['title', 'h1'])
        for (const titleSelector of titleSelectors) {
          page.title = $(titleSelector).text()
          if (page.title) {
-           log.debug(`used title selector "${titleSelector}" -> ${page.title.trim()}`)
+           await log.debug(`used title selector "${titleSelector}" -> ${page.title.trim()}`)
            break
          }
        }

-       $('meta').each(function (i, elem) {
+       if (processingConfig.tagsSelectors && processingConfig.tagsSelectors.length) {
+         for (const tagsSelector of processingConfig.tagsSelectors) {
+           $(tagsSelector).each(function (this: any) {
+             const tag = $(this).text().trim()
+             if (tag) {
+               page.tags = page.tags ?? []
+               page.tags!.push(tag)
+             }
+           })
+         }
+       }
+
+       $('meta').each(function (this: any) {
          const name = $(this).attr('name')
+         const property = $(this).attr('property')
+         const content = $(this).attr('content')
          if (name === 'robots') {
-           const content = $(this).attr('content')
-           log.debug('use robots meta', content)
+           log.debug('robots meta', content)
            if (content) {
-             for (const part of content.split(',').map(p => p.trim())) {
+             for (const part of content.split(',').map((p) => p.trim())) {
                if (part === 'noindex') page.noindex = true
                if (part === 'nofollow') page.nofollow = true
              }
            }
          }
+         if (processingConfig.extractKeywords && name === 'keywords' && content) {
+           page.tags = page.tags ?? []
+           for (const tag of content.split(',').map((t) => t.trim()).filter((t) => t)) {
+             page.tags!.push(tag)
+           }
+         }
+         if (processingConfig.extractArticleTags && property === 'article:tag' && content) {
+           page.tags = page.tags ?? []
+           page.tags!.push(content.trim())
+         }
+         if (processingConfig.extractDescription && name === 'description' && content) {
+           page.description = content.trim()
+         }
        })

        if (!page.noindex && processingConfig.anchors && processingConfig.anchors.length) {
-         const anchorsPages = []
-         $('a').each(function (i, elem) {
+         const anchorsPages: [Page, string][] = []
+         $('a').each(function (this: any) {
            const href = $(this).attr('href')
            if (!href) return
            const parsedURL = new URL(href, page.url)
@@ -308,7 +361,7 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
            const fragment = anchor.wrapperSelector ? targetElement.closest(anchor.wrapperSelector) : targetElement
            const fragmentHtml = fragment.html()
            if (fragmentHtml) {
-             const anchorPage = { url: parsedURL.href, tags: anchor.tags || [], source: 'anchor ' + page.url }
+             const anchorPage: Page = { url: parsedURL.href, tags: anchor.tags || [], source: 'anchor ' + page.url }
              if (anchor.titleSelector) anchorPage.title = fragment.find(anchor.titleSelector).text() || page.title
              else anchorPage.title = targetElement.text() || page.title
              anchorsPages.push([anchorPage, fragmentHtml])
@@ -324,7 +377,7 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
          }
        }
        if (!page.nofollow) {
-         $('a').each(function (i, elem) {
+         $('a').each(function (this: any) {
            const href = $(this).attr('href')
            if (href) pages.push({ url: new URL(href, page.url).href, source: 'link ' + page.url })
          })
@@ -332,7 +385,7 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir

        if (!page.noindex) {
          if (processingConfig.prune) {
-           processingConfig.prune.forEach(s => $(s).remove())
+           processingConfig.prune.forEach((s: string) => $(s).remove())
          }
          await sendPage(page, $.html())
        }
@@ -341,7 +394,7 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir

    if (existingPages) {
      for (const existingPage of existingPages) {
-       if (!sentIds.has(existingPage._id)) {
+       if (!sentIds.has(existingPage._id!)) {
          await log.info('delete previously explored page that was not indexed this time', existingPage.url)
          await axios.delete(`api/v1/datasets/${dataset.id}/lines/${existingPage._id}`)
        }
@@ -349,10 +402,6 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
      }
    }
  }

- // used to manage interruption
- // not required but it is a good practice to prevent incoherent state a smuch as possible
- // the run method should finish shortly after calling stop, otherwise the process will be forcibly terminated
- // the grace period before force termination is 20 seconds
- exports.stop = async () => {
+ export const stop = async () => {
    stopped = true
  }
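
A detail that explains several of the changes above: every dataset line is upserted under a stable _id derived from the normalized page URL, which is what lets re-crawls overwrite existing rows and lets the 304 / if-none-match path simply mark previously sent lines as still valid. A standalone reproduction of that derivation (same sha256 → base64url → 20-character slice as getId in lib/scraper.ts, minus the URL normalization):

import { createHash } from 'node:crypto'

// Same shape as getId(): hash the (normalized) URL and keep a short,
// URL-safe prefix that stays stable across crawls of the same page.
const pageId = (url: string): string =>
  createHash('sha256').update(url).digest('base64url').slice(0, 20)

console.log(pageId('https://example.com/docs/'))
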
package/package.json CHANGED
@@ -1,12 +1,16 @@
  {
    "name": "@data-fair/processing-web-scraper",
-   "version": "0.4.0",
+   "version": "0.6.0",
    "description": "A small Web scraper that publishes its data into data-fair datasets.",
-   "main": "index.js",
+   "main": "index.ts",
+   "type": "module",
    "scripts": {
-     "test": "mocha --exit",
-     "lint": "eslint --ignore-path .gitignore .",
-     "lint-fix": "eslint --ignore-path .gitignore --fix ."
+     "lint": "eslint .",
+     "lint-fix": "eslint --fix .",
+     "build-types": "export NODE_OPTIONS='--experimental-strip-types' && df-build-types ./",
+     "prepare": "husky || true",
+     "test-base": "NODE_ENV=test node --experimental-strip-types --test-force-exit --test-concurrency=1 --test --test-reporter=spec --test-reporter-destination=stdout --test-timeout=300000",
+     "test": "npm run test-base test-it/*.ts"
    },
    "repository": {
      "type": "git",
@@ -22,17 +26,31 @@
    },
    "homepage": "https://github.com/data-fair/processing-web-scraper#readme",
    "devDependencies": {
-     "@data-fair/processings-test-utils": "^0.5.1",
-     "config": "^3.3.6",
-     "eslint": "^7.18.0",
+     "@commitlint/cli": "^20.4.1",
+     "@commitlint/config-conventional": "^20.4.1",
+     "@data-fair/lib-processing-dev": "^0.2.0",
+     "@data-fair/lib-types-builder": "^1.11.3",
+     "@types/config": "^3.3.5",
+     "@types/express": "^5.0.6",
+     "@types/node": "^25.2.0",
+     "config": "^4.2.0",
+     "eslint": "^9.39.2",
      "express": "^4.18.2",
-     "mocha": "^8.2.1",
-     "standard": "^16.0.3"
+     "husky": "^9.1.7",
+     "neostandard": "^0.12.2",
+     "ws": "^8.19.0"
    },
    "dependencies": {
+     "@data-fair/lib-utils": "^1.9.0",
      "cheerio": "^1.0.0-rc.12",
      "form-data": "^4.0.0",
      "robots-parser": "^3.0.0",
      "url-pattern": "^1.0.3"
-   }
+   },
+   "files": [
+     "plugin-config-schema.json",
+     "processing-config-schema.json",
+     "./index.ts",
+     "./lib/**/*"
+   ]
  }
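
Besides the packaging changes, the processing config schema (next section) gains four options that map directly onto the new tag and description extraction in lib/scraper.ts. Purely as an illustration, a configuration using them might look like this; the URLs and selectors below are placeholders, not defaults:

const processingConfig = {
  datasetMode: 'create',
  dataset: { title: 'Site pages' },
  baseURLs: ['https://example.com'],
  startURLs: ['https://example.com/'],
  excludeURLPatterns: ['https://example.com/admin(/*)'],
  // new in 0.6.0
  tagsSelectors: ['.article-category'], // text of matched elements becomes tags
  extractKeywords: true,                // <meta name="keywords">
  extractArticleTags: true,             // <meta property="article:tag">
  extractDescription: true              // <meta name="description">
}
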
package/processing-config-schema.json CHANGED
@@ -67,6 +67,26 @@
        "description": "Par défaut le sélecteur 'title' est utilisé ce qui correspond au titre de page dans les métadonnées HTML.",
        "items": {"type": "string"}
      },
+     "tagsSelectors": {
+       "type": "array",
+       "title": "Sélecteurs d'éléments HTML à utiliser comme étiquettes",
+       "items": {"type": "string"}
+     },
+     "extractKeywords": {
+       "type": "boolean",
+       "title": "Extraire les mots-clés depuis la balise meta name=\"keywords\"",
+       "default": false
+     },
+     "extractArticleTags": {
+       "type": "boolean",
+       "title": "Extraire les étiquettes depuis les balises meta property=\"article:tag\"",
+       "default": false
+     },
+     "extractDescription": {
+       "type": "boolean",
+       "title": "Extraire la description depuis la balise meta name=\"description\"",
+       "default": false
+     },
      "sitemaps": {
        "type": "array",
        "title": "URLs de fichiers sitemap.xml",
@@ -82,7 +102,7 @@
      "excludeURLPatterns": {
        "type": "array",
        "title": "Formats d'URL à exclure de l'exploration",
-       "description": "Exemple: https://data-fair.github.io/master/en(/*)",
+       "description": "Exemple: https://data-fair.github.io/3/en(/*)",
        "items": {"type": "string"}
      },
      "anchors": {
package/.eslintrc.js DELETED
@@ -1,13 +0,0 @@
- module.exports = {
-   root: true,
-   parserOptions: {
-     parser: 'babel-eslint',
-     sourceType: 'module'
-   },
-   extends: ['standard'],
-   // add your custom rules here
-   rules: {
-     // allow paren-less arrow functions
-     'arrow-parens': 0
-   }
- }
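
The old .eslintrc.js (standard config via babel-eslint) is dropped without a replacement appearing in this diff; given the move to eslint 9 and neostandard in devDependencies, the project presumably uses a flat config now. A minimal sketch of what such a config typically looks like with neostandard (file name and options are assumptions, not part of the published package):

// eslint.config.js — assumed, not included in this diff
import neostandard from 'neostandard'

export default neostandard({
  ts: true // also lint the TypeScript sources (index.ts, lib/, test-it/)
})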