@data-fair/processing-web-scraper 0.5.0 → 0.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/index.ts ADDED
@@ -0,0 +1,19 @@
+ import type { PrepareFunction, ProcessingContext } from '@data-fair/lib-common-types/processings.js'
+ import type { ProcessingConfig } from './types/processingConfig/index.ts'
+
+ export const prepare: PrepareFunction<ProcessingConfig> = async ({ processingConfig, secrets }) => {
+ return {
+ processingConfig,
+ secrets
+ }
+ }
+
+ export const run = async (context: ProcessingContext<ProcessingConfig>) => {
+ const { run } = await import('./lib/scraper.ts')
+ await run(context)
+ }
+
+ export const stop = async () => {
+ const { stop } = await import('./lib/scraper.ts')
+ await stop()
+ }
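The three exports above form the standard data-fair processings plugin surface: prepare echoes the configuration and secrets back to the host, run delegates to lib/scraper.ts, and stop asks a running crawl to wind down. As a rough illustration only, a host (or a test) could drive them like the sketch below; the stub context is an assumption made for this sketch, the real ProcessingContext injects a preconfigured axios client, a log helper, patchConfig and ws.

// Hypothetical harness, not part of the package: it only illustrates the
// prepare/run/stop lifecycle that the processings host drives.
import { prepare, stop } from './index.ts'

const fakeContext: any = {
  pluginConfig: { userAgent: 'data-fair-web-scraper' },
  processingConfig: { datasetMode: 'create', dataset: { title: 'demo' }, baseURLs: ['https://example.com/'], startURLs: ['https://example.com/'] },
  processingId: 'demo-processing',
  axios: undefined, // the real host injects an axios instance bound to the data-fair API
  log: { step: console.log, info: console.log, debug: console.log, warning: console.warn },
  patchConfig: async () => {},
  ws: { waitForJournal: async () => {} }
}

// prepare() simply returns the config and secrets it was given
console.log(await prepare({ processingConfig: fakeContext.processingConfig, secrets: {} } as any))
// run(fakeContext) from the same module would start a crawl against the configured baseURLs;
// stop() flips the module-level flag so the crawl loop in lib/scraper.ts exits.
await stop()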
@@ -1,11 +1,10 @@
- const FormData = require('form-data')
- const crypto = require('crypto')
- const robotsParser = require('robots-parser')
-
- // TODO:
- // handle html but also any file formats
- // add in-links info (at least for files)
- // specifications listed here http://robots-txt.com/
+ import { createHash } from 'node:crypto'
+ import UrlPattern from 'url-pattern'
+ import robotsParser from 'robots-parser'
+ import * as cheerio from 'cheerio'
+ import FormData from 'form-data'
+ import type { ProcessingContext } from '@data-fair/lib-common-types/processings.js'
+ import type { ProcessingConfig } from '../types/processingConfig/index.ts'

  const datasetSchema = [
  {
@@ -14,6 +13,12 @@ const datasetSchema = [
  'x-refersTo': 'http://www.w3.org/2000/01/rdf-schema#label',
  'x-capabilities': { textAgg: false }
  },
+ {
+ key: 'description',
+ type: 'string',
+ 'x-refersTo': 'https://schema.org/description',
+ 'x-capabilities': { values: false, textAgg: false, insensitive: false }
+ },
  {
  key: 'url',
  type: 'string',
@@ -30,7 +35,6 @@ const datasetSchema = [
  {
  key: 'etag',
  type: 'string',
- separator: ',',
  'x-capabilities': { index: false, values: false, text: false, textStandard: false, textAgg: false, insensitive: false }
  },
  {
@@ -46,10 +50,9 @@ const datasetSchema = [
  }
  ]

- // a global variable to manage interruption
- let stopped
+ let stopped: boolean | undefined

- const normalizeURL = (url, ignoreHash = false, addSlash = false) => {
+ const normalizeURL = (url: string, ignoreHash = false, addSlash = false): string => {
  const parsedURL = new URL(url)
  for (const indexSuffix of ['index.html', 'index.php', 'index.jsp', 'index.cgi']) {
  if (parsedURL.pathname.endsWith('/' + indexSuffix)) {
@@ -61,22 +64,45 @@ const normalizeURL = (url, ignoreHash = false, addSlash = false) => {
  return parsedURL.href
  }

- const getId = (page) => {
- return crypto.createHash('sha256').update(normalizeURL(page.url)).digest('base64url').slice(0, 20)
+ const getId = async (page: { url: string }): Promise<string> => {
+ return createHash('sha256').update(normalizeURL(page.url)).digest('base64url').slice(0, 20)
  }

+ interface Page {
+ url: string
+ title?: string
+ description?: string
+ tags?: string[]
+ etag?: string
+ lastModified?: string
+ attachmentPath?: string
+ _id?: string
+ parsedURL?: URL
+ source?: string
+ noindex?: boolean
+ nofollow?: boolean
+ parentId?: string
+ }
+
+ type UrlPatternWithHostname = UrlPattern & { hostname: string }
+
  class PagesIterator {
- constructor (log, pluginConfig, processingConfig, robots) {
- this.pages = []
- this.cursor = -1
+ pages: Page[] = []
+ cursor = -1
+ log: any
+ pluginConfig: any
+ processingConfig: ProcessingConfig
+ robots: Record<string, any>
+ excludeURLPatterns: any[] = []
+
+ constructor (log: any, pluginConfig: any, processingConfig: ProcessingConfig, robots: Record<string, any>) {
  this.log = log
  this.pluginConfig = pluginConfig
  this.processingConfig = processingConfig
  this.robots = robots
- const UrlPattern = require('url-pattern')
- this.excludeURLPatterns = (processingConfig.excludeURLPatterns || []).map(p => {
+ this.excludeURLPatterns = (processingConfig.excludeURLPatterns || []).map((p: string) => {
  const url = new URL(p)
- const pattern = new UrlPattern(url.pathname)
+ const pattern = new UrlPattern(url.pathname) as UrlPatternWithHostname
  pattern.hostname = url.hostname
  return pattern
  })
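Throughout the scraper, a dataset line id is derived from the page URL: sha256 of the normalized URL, base64url-encoded and truncated to 20 characters, so re-crawling a page always updates the same line. A standalone sketch of that derivation (the URL is a placeholder):

import { createHash } from 'node:crypto'

// Same derivation as getId() above: hash the normalized URL, base64url-encode,
// keep the first 20 characters as a stable line id.
const lineId = createHash('sha256')
  .update('https://example.com/docs/')   // would be normalizeURL(page.url) in the scraper
  .digest('base64url')
  .slice(0, 20)

console.log(lineId) // deterministic for a given URL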
@@ -86,19 +112,19 @@ class PagesIterator {
  return this
  }

- push (page) {
+ async push (page: Page | string) {
  if (typeof page === 'string') page = { url: page }
- if (!this.processingConfig.baseURLs.find(b => page.url.startsWith(b))) return
+ if (!this.processingConfig.baseURLs?.find((b: string) => page.url.startsWith(b))) return
  page.parsedURL = page.parsedURL || new URL(page.url)
  if (page.parsedURL.hash) return
- if (this.excludeURLPatterns.find(p => p.match(page.parsedURL.pathname) && p.hostname === page.parsedURL.hostname)) {
+ if (this.excludeURLPatterns.find((p: any) => p.match(page.parsedURL!.pathname) && p.hostname === page.parsedURL!.hostname)) {
  return
  }
- if (this.robots[page.parsedURL.origin] && !this.robots[page.parsedURL.origin].isAllowed(page.url, this.pluginConfig.userAgent || 'data-fair-web-scraper')) {
+ if (this.robots[page.parsedURL!.origin] && !this.robots[page.parsedURL!.origin].isAllowed(page.url, this.pluginConfig.userAgent || 'data-fair-web-scraper')) {
  return
  }
- page._id = getId(page)
- if (this.pages.find(p => p._id === page._id)) return
+ page._id = await getId(page)
+ if (this.pages.find((p: Page) => p._id === page._id)) return
  this.pages.push(page)
  }

@@ -112,40 +138,45 @@ class PagesIterator {
  }
  }

- exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir, axios, log, patchConfig, ws }) => {
- let dataset
+ export const run = async (context: ProcessingContext<ProcessingConfig>) => {
+ const { pluginConfig, processingConfig, processingId, axios, log, patchConfig, ws } = context
+ let dataset: any
  if (processingConfig.datasetMode === 'create') {
  await log.step('Dataset creation')
  dataset = (await axios.post('api/v1/datasets', {
- id: processingConfig.dataset.id,
- title: processingConfig.dataset.title,
+ id: processingConfig.dataset?.id,
+ title: processingConfig.dataset?.title,
  isRest: true,
  schema: datasetSchema,
  extras: { processingId }
  })).data
+ if (dataset.status !== 'finalized') {
+ await ws.waitForJournal(dataset.id, 'finalize-end')
+ }
  await log.info(`dataset created, id="${dataset.id}", title="${dataset.title}"`)
  await patchConfig({ datasetMode: 'update', dataset: { id: dataset.id, title: dataset.title } })
- await ws.waitForJournal(dataset.id, 'finalize-end')
  } else if (processingConfig.datasetMode === 'update') {
  await log.step('Check dataset')
- dataset = (await axios.get(`api/v1/datasets/${processingConfig.dataset.id}`)).data
- if (!dataset) throw new Error(`the dataset does not exist, id="${processingConfig.dataset.id}"`)
+ dataset = (await axios.get(`api/v1/datasets/${processingConfig.dataset?.id}`)).data
+ if (!dataset) throw new Error(`the dataset does not exist, id="${processingConfig.dataset?.id}"`)
  await log.info(`the dataset exists, id="${dataset.id}", title="${dataset.title}"`)
  }

- // parse the robots.txt files if available
- const robots = {}
+ const robots: Record<string, any> = {}
  const sitemaps = processingConfig.sitemaps || []
- for (const baseURL of processingConfig.baseURLs) {
+ for (const baseURL of processingConfig.baseURLs || []) {
  const { origin } = new URL(baseURL)
  if (robots[origin]) continue
  try {
  const response = await axios.get(origin + '/robots.txt')
- robots[origin] = robotsParser(origin + '/robots.txt', response.data)
+ robots[origin] = (robotsParser as any)(origin + '/robots.txt', response.data)
  for (const sitemap of robots[origin].getSitemaps()) {
- if (!sitemaps.includes(sitemap)) sitemaps.push(sitemap)
+ if (!sitemaps.includes(sitemap)) {
+ await log.info(`add sitemap found in robots.txt ${sitemap}`)
+ sitemaps.push(sitemap)
+ }
  }
- } catch (err) {
+ } catch (err: any) {
  await log.info(`failed to fetch ${origin + '/robots.txt'} - ${err.status || err.message}`)
  }
  }
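This hunk keeps the robots.txt handling: robots-parser answers whether a URL may be crawled, what crawl delay applies, and which sitemaps the site declares. A self-contained sketch of that usage with an inline robots.txt (rules and URLs invented for illustration; the cast mirrors the one used above):

import robotsParser from 'robots-parser'

// Inline robots.txt content for illustration; the scraper fetches the real one per origin.
const robotsTxt = [
  'User-agent: *',
  'Disallow: /private/',
  'Crawl-delay: 2',
  'Sitemap: https://example.com/sitemap.xml'
].join('\n')

// The `as any` cast mirrors the scraper's own workaround for the package typings.
const robots = (robotsParser as any)('https://example.com/robots.txt', robotsTxt)

console.log(robots.isAllowed('https://example.com/private/page', 'data-fair-web-scraper')) // false
console.log(robots.getCrawlDelay('data-fair-web-scraper')) // 2
console.log(robots.getSitemaps()) // ['https://example.com/sitemap.xml']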
@@ -153,32 +184,33 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
  const pages = new PagesIterator(log, pluginConfig, processingConfig, robots)

  await log.step('Init pages list')
- let existingPages
+ let existingPages: Page[] | undefined
  if (processingConfig.datasetMode === 'update') {
  existingPages = (await axios.get(`api/v1/datasets/${dataset.id}/lines`, { params: { select: '_id,url,etag,lastModified', size: 10000 } })).data.results
- await log.info(`add ${existingPages.length} pages from previous crawls`)
- for (const page of existingPages) {
- page.parsedURL = new URL(page.url)
- if (page.parsedURL.hash) {
- const parentURL = new URL(page.parsedURL)
- parentURL.hash = ''
- page.parentId = getId({ url: parentURL.href })
+ if (existingPages) {
+ await log.info(`add ${existingPages.length} pages from previous crawls`)
+ for (const page of existingPages) {
+ page.parsedURL = new URL(page.url)
+ if (page.parsedURL.hash) {
+ const parentURL = new URL(page.parsedURL)
+ parentURL.hash = ''
+ page.parentId = await getId({ url: parentURL.href })
+ }
+ await pages.push({ ...page, source: 'previous exploration' })
  }
- await pages.push({ ...page, source: 'previous exploration' })
  }
  }
- await log.info(`add ${processingConfig.startURLs.length} pages from config`)
- for (const url of processingConfig.startURLs) {
+ await log.info(`add ${processingConfig.startURLs?.length || 0} pages from config`)
+ for (const url of processingConfig.startURLs || []) {
  await pages.push({ url, source: 'config start URLs' })
  }

  for (const sitemapURL of sitemaps) {
  await log.info(`fetch start URLs from sitemap ${sitemapURL}`)
  const sitemap = (await axios.get(sitemapURL)).data
- const cheerio = require('cheerio')
  const $ = cheerio.load(sitemap)
- const sitemapURLs = []
- $('url loc').each(function () {
+ const sitemapURLs: string[] = []
+ $('url loc').each(function (this: any) {
  sitemapURLs.push($(this).text())
  })
  for (const url of sitemapURLs) {
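Sitemap parsing stays deliberately simple: cheerio loads the XML and every loc inside a url element becomes a candidate start URL. A reduced sketch with an inline sitemap document (placeholder URLs):

import * as cheerio from 'cheerio'

// Minimal sitemap.xml payload, standing in for (await axios.get(sitemapURL)).data
const sitemap = `<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url><loc>https://example.com/</loc></url>
  <url><loc>https://example.com/about</loc></url>
</urlset>`

const $ = cheerio.load(sitemap)
const sitemapURLs: string[] = []
$('url loc').each(function (this: any) {
  sitemapURLs.push($(this).text())
})
console.log(sitemapURLs) // ['https://example.com/', 'https://example.com/about']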
@@ -186,20 +218,16 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
  }
  }

- const sentIds = new Set([])
- const sendPage = async (page, data, contentType = 'text/html', filename = 'content.html') => {
- await log.debug('send page', page.url)
+ const sentIds = new Set<string>()
+ const sendPage = async (page: Page, data: any, contentType = 'text/html', filename = 'content.html') => {
  const form = new FormData()
- // improve page title
+
  if (page.title) {
  page.title = page.title.trim()
  if (processingConfig.titlePrefix && page.title.startsWith(processingConfig.titlePrefix)) {
  page.title = page.title.replace(processingConfig.titlePrefix, '')
  }
  }
- form.append('title', page.title)
- form.append('url', page.url)
- if (page.tags && page.tags.length) form.append('tags', page.tags.join(','))
  data = typeof data === 'string' ? Buffer.from(data) : data
  const dataOpts = {
  contentType,
@@ -207,49 +235,52 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
  knownLength: data.length
  }
  form.append('attachment', data, dataOpts)
- if (page.lastModified) form.append('lastModified', page.lastModified)
- if (page.etag) form.append('etag', page.etag)
- page._id = getId(page)
+ page._id = await getId(page)
  sentIds.add(page._id)
+ const body = { ...page }
+ delete body.source
+ delete body.parsedURL
+ delete body.nofollow
+ delete body.noindex
+ form.append('_body', JSON.stringify(body))
  const headers = {
  ...form.getHeaders(),
  'content-length': form.getLengthSync()
  }
+
  await axios({
  method: 'put',
  url: `api/v1/datasets/${dataset.id}/lines/${page._id}`,
  data: form,
- headers
+ headers,
+ validateStatus: (status) => status === 200 || status === 304
  })
  }

  for await (const page of pages) {
  if (stopped) break

- const crawlDelay = (robots[page.parsedURL.origin] && robots[page.parsedURL.origin].getCrawlDelay()) || pluginConfig.defaultCrawlDelay || 1
+ const crawlDelay = (robots[page.parsedURL!.origin] && robots[page.parsedURL!.origin].getCrawlDelay()) || pluginConfig.defaultCrawlDelay || 1
  await new Promise(resolve => setTimeout(resolve, crawlDelay * 1000))

- // TODO: apply if-none-match and if-modified-since headers if etag or lastModified are available
- const headers = { 'user-agent': pluginConfig.userAgent || 'data-fair-web-scraper' }
+ const headers: Record<string, string> = { 'user-agent': pluginConfig.userAgent || 'data-fair-web-scraper' }
  if (page.lastModified) headers['if-modified-since'] = page.lastModified
  if (page.etag) headers['if-none-match'] = page.etag
- let response
+ let response: any
  try {
  response = await axios.get(page.url, { headers, maxRedirects: 0 })
- } catch (err) {
- // content did not change
+ } catch (err: any) {
  if (err.status === 304) {
  await log.debug(`page was not modified since last exploration ${page.url}`)
- sentIds.add(page._id)
- for (const existingPage of existingPages) {
- if (existingPage.parentId === page._id) sentIds.add(existingPage._id)
+ sentIds.add(page._id!)
+ for (const existingPage of existingPages || []) {
+ if (existingPage.parentId === page._id) sentIds.add(existingPage._id!)
  }
  continue
  }
- // follow a redirect
- if (err.status === 301) {
+ if (err.status === 301 || err.status === 302) {
  await log.debug(`page redirected ${page.url} -> ${err.headers.location}`)
- pages.push({ url: new URL(err.headers.location, page.url).href, source: 'redirect ' + page.url })
+ await pages.push({ url: new URL(err.headers.location, page.url).href, source: 'redirect ' + page.url })
  continue
  }
  await log.warning(`failed to fetch page ${page.url} - ${err.status || err.message}`)
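The stored etag and lastModified values drive conditional requests: they are replayed as if-none-match / if-modified-since headers, a 304 (surfaced as an error by the injected client) means the page and its anchor fragments are still valid, and 301/302 responses re-queue the redirect target. A minimal sketch of the same pattern with plain axios, which is an assumption here since the plugin normally uses the axios instance provided by the host:

import axios from 'axios'

// Hypothetical metadata stored on the dataset line by a previous crawl of this URL.
const previous = { etag: 'W/"abc123"', lastModified: 'Tue, 01 Jan 2030 00:00:00 GMT' }

const headers: Record<string, string> = { 'user-agent': 'data-fair-web-scraper' }
if (previous.lastModified) headers['if-modified-since'] = previous.lastModified
if (previous.etag) headers['if-none-match'] = previous.etag

try {
  const response = await axios.get('https://example.com/', { headers, maxRedirects: 0 })
  console.log('changed, re-index it', response.headers.etag)
} catch (err: any) {
  const status = err.status ?? err.response?.status
  if (status === 304) console.log('not modified, keep the existing dataset line')
  else if (status === 301 || status === 302) console.log('redirected to', err.response?.headers.location)
  else throw err
}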
@@ -258,55 +289,70 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
  }

  if (response.headers['x-robots-tag']) {
- await log.debug('use x-robots-tag header', response.headers['x-robots-tag'])
- for (const part of response.headers['x-robots-tag'].split(',').map(p => p.trim())) {
+ await log.debug('x-robots-tag header', response.headers['x-robots-tag'])
+ for (const part of response.headers['x-robots-tag'].split(',').map((p: string) => p.trim())) {
  if (part === 'noindex') page.noindex = true
  if (part === 'nofollow') page.nofollow = true
  }
  }
-
  page.lastModified = response.headers['last-modified']
  page.etag = response.headers.etag

  const isHTML = (response.headers['content-type'] && response.headers['content-type'].startsWith('text/html;')) || (typeof response.data === 'string' && response.data.trim().startsWith('<html'))
  if (isHTML) {
- const cheerio = require('cheerio')
  const $ = cheerio.load(response.data)
  const titleSelectors = (processingConfig.titleSelectors || []).concat(['title', 'h1'])
  for (const titleSelector of titleSelectors) {
  page.title = $(titleSelector).text()
  if (page.title) {
- log.debug(`used title selector "${titleSelector}" -> ${page.title.trim()}`)
+ await log.debug(`used title selector "${titleSelector}" -> ${page.title.trim()}`)
  break
  }
  }
- page.tags = []
+
  if (processingConfig.tagsSelectors && processingConfig.tagsSelectors.length) {
  for (const tagsSelector of processingConfig.tagsSelectors) {
- $(tagsSelector).each(function (i, elem) {
+ $(tagsSelector).each(function (this: any) {
  const tag = $(this).text().trim()
- if (tag) page.tags.push(tag)
+ if (tag) {
+ page.tags = page.tags ?? []
+ page.tags!.push(tag)
+ }
  })
  }
  }

- $('meta').each(function (i, elem) {
+ $('meta').each(function (this: any) {
  const name = $(this).attr('name')
+ const property = $(this).attr('property')
+ const content = $(this).attr('content')
  if (name === 'robots') {
- const content = $(this).attr('content')
- log.debug('use robots meta', content)
+ log.debug('robots meta', content)
  if (content) {
- for (const part of content.split(',').map(p => p.trim())) {
+ for (const part of content.split(',').map((p) => p.trim())) {
  if (part === 'noindex') page.noindex = true
  if (part === 'nofollow') page.nofollow = true
  }
  }
  }
+ if (processingConfig.extractKeywords && name === 'keywords' && content) {
+ page.tags = page.tags ?? []
+ for (const tag of content.split(',').map((t) => t.trim()).filter((t) => t)) {
+ page.tags!.push(tag)
+ }
+ }
+ if (processingConfig.extractArticleTags && property === 'article:tag' && content) {
+ page.tags = page.tags ?? []
+ page.tags!.push(content.trim())
+ }
+ if (processingConfig.extractDescription && name === 'description' && content) {
+ page.description = content.trim()
+ }
  })

  if (!page.noindex && processingConfig.anchors && processingConfig.anchors.length) {
- const anchorsPages = []
- $('a').each(function (i, elem) {
+ const anchorsPages: [Page, string][] = []
+ $('a').each(function (this: any) {
  const href = $(this).attr('href')
  if (!href) return
  const parsedURL = new URL(href, page.url)
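The new extractKeywords, extractArticleTags and extractDescription options (declared in the config schema further down) all read standard meta tags during this cheerio pass. A condensed, self-contained sketch of the same extraction over an invented HTML snippet:

import * as cheerio from 'cheerio'

const html = `<html><head>
  <meta name="keywords" content="open data, scraping">
  <meta property="article:tag" content="data-fair">
  <meta name="description" content="A sample page">
</head><body></body></html>`

const $ = cheerio.load(html)
const page: { tags?: string[], description?: string } = {}

$('meta').each(function (this: any) {
  const name = $(this).attr('name')
  const property = $(this).attr('property')
  const content = $(this).attr('content')
  if (name === 'keywords' && content) {
    page.tags = page.tags ?? []
    for (const tag of content.split(',').map((t) => t.trim()).filter((t) => t)) {
      page.tags.push(tag)
    }
  }
  if (property === 'article:tag' && content) {
    page.tags = page.tags ?? []
    page.tags.push(content.trim())
  }
  if (name === 'description' && content) page.description = content.trim()
})

console.log(page) // { tags: ['open data', 'scraping', 'data-fair'], description: 'A sample page' }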
@@ -317,7 +363,7 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
  const fragment = anchor.wrapperSelector ? targetElement.closest(anchor.wrapperSelector) : targetElement
  const fragmentHtml = fragment.html()
  if (fragmentHtml) {
- const anchorPage = { url: parsedURL.href, tags: anchor.tags || [], source: 'anchor ' + page.url }
+ const anchorPage: Page = { url: parsedURL.href, tags: anchor.tags || [], source: 'anchor ' + page.url }
  if (anchor.titleSelector) anchorPage.title = fragment.find(anchor.titleSelector).text() || page.title
  else anchorPage.title = targetElement.text() || page.title
  anchorsPages.push([anchorPage, fragmentHtml])
@@ -333,7 +379,7 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
  }
  }
  if (!page.nofollow) {
- $('a').each(function (i, elem) {
+ $('a').each(function (this: any) {
  const href = $(this).attr('href')
  if (href) pages.push({ url: new URL(href, page.url).href, source: 'link ' + page.url })
  })
@@ -341,7 +387,7 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir

  if (!page.noindex) {
  if (processingConfig.prune) {
- processingConfig.prune.forEach(s => $(s).remove())
+ processingConfig.prune.forEach((s: string) => $(s).remove())
  }
  await sendPage(page, $.html())
  }
@@ -350,7 +396,7 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir

  if (existingPages) {
  for (const existingPage of existingPages) {
- if (!sentIds.has(existingPage._id)) {
+ if (!sentIds.has(existingPage._id!)) {
  await log.info('delete previously explored page that was not indexed this time', existingPage.url)
  await axios.delete(`api/v1/datasets/${dataset.id}/lines/${existingPage._id}`)
  }
@@ -358,10 +404,6 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
  }
  }

- // used to manage interruption
- // not required but it is a good practice to prevent incoherent state a smuch as possible
- // the run method should finish shortly after calling stop, otherwise the process will be forcibly terminated
- // the grace period before force termination is 20 seconds
- exports.stop = async () => {
+ export const stop = async () => {
  stopped = true
  }
package/package.json CHANGED
@@ -1,12 +1,18 @@
  {
  "name": "@data-fair/processing-web-scraper",
- "version": "0.5.0",
+ "version": "0.6.1",
  "description": "A small Web scraper that publishes its data into data-fair datasets.",
- "main": "index.js",
+ "main": "index.ts",
+ "type": "module",
  "scripts": {
- "test": "mocha --exit",
- "lint": "eslint --ignore-path .gitignore .",
- "lint-fix": "eslint --ignore-path .gitignore --fix ."
+ "lint": "eslint .",
+ "lint-fix": "eslint --fix .",
+ "build-types": "export NODE_OPTIONS='--experimental-strip-types' && df-build-types ./",
+ "prepare": "husky || true",
+ "test-base": "NODE_ENV=test node --experimental-strip-types --test-force-exit --test-concurrency=1 --test --test-reporter=spec --test-reporter-destination=stdout --test-timeout=300000",
+ "test": "npm run test-base test-it/*.ts",
+ "check-types": "tsc",
+ "quality": "npm run lint && npm run check-types && npm test"
  },
  "repository": {
  "type": "git",
@@ -22,17 +28,31 @@
  },
  "homepage": "https://github.com/data-fair/processing-web-scraper#readme",
  "devDependencies": {
- "@data-fair/processings-test-utils": "^0.5.1",
- "config": "^3.3.6",
- "eslint": "^7.18.0",
+ "@commitlint/cli": "^20.4.1",
+ "@commitlint/config-conventional": "^20.4.1",
+ "@data-fair/lib-processing-dev": "^0.2.0",
+ "@data-fair/lib-types-builder": "^1.11.3",
+ "@types/config": "^3.3.5",
+ "@types/express": "^5.0.6",
+ "@types/node": "^25.2.0",
+ "config": "^4.2.0",
+ "eslint": "^9.39.2",
  "express": "^4.18.2",
- "mocha": "^8.2.1",
- "standard": "^16.0.3"
+ "husky": "^9.1.7",
+ "neostandard": "^0.12.2",
+ "ws": "^8.19.0"
  },
  "dependencies": {
- "cheerio": "^1.0.0-rc.12",
- "form-data": "^4.0.0",
- "robots-parser": "^3.0.0",
+ "@data-fair/lib-utils": "^1.9.0",
+ "cheerio": "^1.2.0",
+ "form-data": "^4.0.5",
+ "robots-parser": "^3.0.1",
  "url-pattern": "^1.0.3"
- }
+ },
+ "files": [
+ "plugin-config-schema.json",
+ "processing-config-schema.json",
+ "./index.ts",
+ "./lib/**/*"
+ ]
  }
@@ -72,6 +72,21 @@
  "title": "Sélecteurs d'éléments HTML à utiliser comme étiquettes",
  "items": {"type": "string"}
  },
+ "extractKeywords": {
+ "type": "boolean",
+ "title": "Extraire les mots-clés depuis la balise meta name=\"keywords\"",
+ "default": false
+ },
+ "extractArticleTags": {
+ "type": "boolean",
+ "title": "Extraire les étiquettes depuis les balises meta property=\"article:tag\"",
+ "default": false
+ },
+ "extractDescription": {
+ "type": "boolean",
+ "title": "Extraire la description depuis la balise meta name=\"description\"",
+ "default": false
+ },
  "sitemaps": {
  "type": "array",
  "title": "URLs de fichiers sitemap.xml",
package/.eslintrc.js DELETED
@@ -1,13 +0,0 @@
- module.exports = {
- root: true,
- parserOptions: {
- parser: 'babel-eslint',
- sourceType: 'module'
- },
- extends: ['standard'],
- // add your custom rules here
- rules: {
- // allow paren-less arrow functions
- 'arrow-parens': 0
- }
- }