@data-fair/processing-web-scraper 0.5.0 → 0.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/index.ts +19 -0
- package/{index.js → lib/scraper.ts} +141 -99
- package/package.json +34 -14
- package/processing-config-schema.json +15 -0
- package/.eslintrc.js +0 -13
package/index.ts
ADDED
@@ -0,0 +1,19 @@
+import type { PrepareFunction, ProcessingContext } from '@data-fair/lib-common-types/processings.js'
+import type { ProcessingConfig } from './types/processingConfig/index.ts'
+
+export const prepare: PrepareFunction<ProcessingConfig> = async ({ processingConfig, secrets }) => {
+  return {
+    processingConfig,
+    secrets
+  }
+}
+
+export const run = async (context: ProcessingContext<ProcessingConfig>) => {
+  const { run } = await import('./lib/scraper.ts')
+  await run(context)
+}
+
+export const stop = async () => {
+  const { stop } = await import('./lib/scraper.ts')
+  await stop()
+}
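The new index.ts is the whole public entry point: prepare echoes back the config and secrets, while run and stop lazily load lib/scraper.ts. Below is a minimal, hypothetical harness (not part of the package) sketching how a host could drive these exports; only the context fields visible in this diff are stubbed, the axios import and base URL are assumptions, and it would be run with node --experimental-strip-types as the new scripts do.

```ts
// Hypothetical harness: drives the prepare/run/stop exports added in index.ts.
// The real ProcessingContext comes from @data-fair/lib-common-types; only the
// fields used by lib/scraper.ts in this diff (axios, log, patchConfig, ws, ...)
// are stubbed here, so the object is cast to any.
import axios from 'axios' // assumed HTTP client; in production data-fair injects a pre-configured instance
import { prepare, run, stop } from './index.ts'

const log = {
  step: async (msg: string) => console.log('[step]', msg),
  info: async (...args: any[]) => console.log('[info]', ...args),
  debug: async (...args: any[]) => console.log('[debug]', ...args),
  warning: async (...args: any[]) => console.warn('[warning]', ...args)
}

const context = {
  pluginConfig: { userAgent: 'data-fair-web-scraper', defaultCrawlDelay: 1 },
  processingConfig: {
    datasetMode: 'create',
    dataset: { title: 'Scraped pages' },
    baseURLs: ['https://example.com'],
    startURLs: ['https://example.com/']
  },
  processingId: 'local-test',
  secrets: {},
  axios: axios.create({ baseURL: 'https://data-fair.example.com/data-fair/' }), // placeholder API base URL
  log,
  patchConfig: async (patch: any) => console.log('[patchConfig]', patch),
  ws: { waitForJournal: async (_datasetId: string, _event: string) => {} }
} as any

await prepare(context) // returns { processingConfig, secrets } unchanged
const timer = setTimeout(() => { void stop() }, 60_000) // ask the crawl loop to exit after a minute
await run(context) // lazily imports ./lib/scraper.ts and starts crawling
clearTimeout(timer)
```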
package/{index.js → lib/scraper.ts}
RENAMED

@@ -1,11 +1,10 @@
-
-
-
-
-
-
-
-// specifications listed here http://robots-txt.com/
+import { createHash } from 'node:crypto'
+import UrlPattern from 'url-pattern'
+import robotsParser from 'robots-parser'
+import * as cheerio from 'cheerio'
+import FormData from 'form-data'
+import type { ProcessingContext } from '@data-fair/lib-common-types/processings.js'
+import type { ProcessingConfig } from '../types/processingConfig/index.ts'

 const datasetSchema = [
   {
@@ -14,6 +13,12 @@ const datasetSchema = [
     'x-refersTo': 'http://www.w3.org/2000/01/rdf-schema#label',
     'x-capabilities': { textAgg: false }
   },
+  {
+    key: 'description',
+    type: 'string',
+    'x-refersTo': 'https://schema.org/description',
+    'x-capabilities': { values: false, textAgg: false, insensitive: false }
+  },
   {
     key: 'url',
     type: 'string',
@@ -30,7 +35,6 @@ const datasetSchema = [
   {
     key: 'etag',
     type: 'string',
-    separator: ',',
     'x-capabilities': { index: false, values: false, text: false, textStandard: false, textAgg: false, insensitive: false }
   },
   {
@@ -46,10 +50,9 @@ const datasetSchema = [
   }
 ]

-
-let stopped
+let stopped: boolean | undefined

-const normalizeURL = (url, ignoreHash = false, addSlash = false) => {
+const normalizeURL = (url: string, ignoreHash = false, addSlash = false): string => {
   const parsedURL = new URL(url)
   for (const indexSuffix of ['index.html', 'index.php', 'index.jsp', 'index.cgi']) {
     if (parsedURL.pathname.endsWith('/' + indexSuffix)) {
@@ -61,22 +64,45 @@ const normalizeURL = (url, ignoreHash = false, addSlash = false) => {
   return parsedURL.href
 }

-const getId = (page) => {
-  return
+const getId = async (page: { url: string }): Promise<string> => {
+  return createHash('sha256').update(normalizeURL(page.url)).digest('base64url').slice(0, 20)
 }

+interface Page {
+  url: string
+  title?: string
+  description?: string
+  tags?: string[]
+  etag?: string
+  lastModified?: string
+  attachmentPath?: string
+  _id?: string
+  parsedURL?: URL
+  source?: string
+  noindex?: boolean
+  nofollow?: boolean
+  parentId?: string
+}
+
+type UrlPatternWithHostname = UrlPattern & { hostname: string }
+
 class PagesIterator {
-
-
-
+  pages: Page[] = []
+  cursor = -1
+  log: any
+  pluginConfig: any
+  processingConfig: ProcessingConfig
+  robots: Record<string, any>
+  excludeURLPatterns: any[] = []
+
+  constructor (log: any, pluginConfig: any, processingConfig: ProcessingConfig, robots: Record<string, any>) {
     this.log = log
     this.pluginConfig = pluginConfig
     this.processingConfig = processingConfig
     this.robots = robots
-
-    this.excludeURLPatterns = (processingConfig.excludeURLPatterns || []).map(p => {
+    this.excludeURLPatterns = (processingConfig.excludeURLPatterns || []).map((p: string) => {
       const url = new URL(p)
-      const pattern = new UrlPattern(url.pathname)
+      const pattern = new UrlPattern(url.pathname) as UrlPatternWithHostname
       pattern.hostname = url.hostname
       return pattern
     })
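Note on the hunk above: page ids are now derived deterministically from the normalized URL instead of being left empty. A standalone sketch of that derivation (normalizeURL itself is not reproduced here):

```ts
// Mirrors the new getId in lib/scraper.ts: hash the normalized URL with SHA-256
// and keep a short, URL-safe prefix so a given page always maps to the same
// dataset line across crawls.
import { createHash } from 'node:crypto'

const pageId = (normalizedUrl: string): string =>
  createHash('sha256').update(normalizedUrl).digest('base64url').slice(0, 20)

console.log(pageId('https://example.com/docs/'))
// normalizeURL (see the hunk above) strips trailing index.html/index.php/...,
// so https://example.com/docs/index.html normalizes to the same input and id.
```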
@@ -86,19 +112,19 @@ class PagesIterator {
     return this
   }

-  push (page) {
+  async push (page: Page | string) {
     if (typeof page === 'string') page = { url: page }
-    if (!this.processingConfig.baseURLs
+    if (!this.processingConfig.baseURLs?.find((b: string) => page.url.startsWith(b))) return
     page.parsedURL = page.parsedURL || new URL(page.url)
     if (page.parsedURL.hash) return
-    if (this.excludeURLPatterns.find(p => p.match(page.parsedURL
+    if (this.excludeURLPatterns.find((p: any) => p.match(page.parsedURL!.pathname) && p.hostname === page.parsedURL!.hostname)) {
       return
     }
-    if (this.robots[page.parsedURL
+    if (this.robots[page.parsedURL!.origin] && !this.robots[page.parsedURL!.origin].isAllowed(page.url, this.pluginConfig.userAgent || 'data-fair-web-scraper')) {
       return
     }
-    page._id = getId(page)
-    if (this.pages.find(p => p._id === page._id)) return
+    page._id = await getId(page)
+    if (this.pages.find((p: Page) => p._id === page._id)) return
     this.pages.push(page)
   }

@@ -112,40 +138,45 @@ class PagesIterator {
   }
 }

-
-
+export const run = async (context: ProcessingContext<ProcessingConfig>) => {
+  const { pluginConfig, processingConfig, processingId, axios, log, patchConfig, ws } = context
+  let dataset: any
   if (processingConfig.datasetMode === 'create') {
     await log.step('Dataset creation')
     dataset = (await axios.post('api/v1/datasets', {
-      id: processingConfig.dataset
-      title: processingConfig.dataset
+      id: processingConfig.dataset?.id,
+      title: processingConfig.dataset?.title,
       isRest: true,
       schema: datasetSchema,
       extras: { processingId }
     })).data
+    if (dataset.status !== 'finalized') {
+      await ws.waitForJournal(dataset.id, 'finalize-end')
+    }
     await log.info(`dataset created, id="${dataset.id}", title="${dataset.title}"`)
     await patchConfig({ datasetMode: 'update', dataset: { id: dataset.id, title: dataset.title } })
-    await ws.waitForJournal(dataset.id, 'finalize-end')
   } else if (processingConfig.datasetMode === 'update') {
     await log.step('Check dataset')
-    dataset = (await axios.get(`api/v1/datasets/${processingConfig.dataset
-    if (!dataset) throw new Error(`the dataset does not exist, id="${processingConfig.dataset
+    dataset = (await axios.get(`api/v1/datasets/${processingConfig.dataset?.id}`)).data
+    if (!dataset) throw new Error(`the dataset does not exist, id="${processingConfig.dataset?.id}"`)
     await log.info(`the dataset exists, id="${dataset.id}", title="${dataset.title}"`)
   }

-
-  const robots = {}
+  const robots: Record<string, any> = {}
   const sitemaps = processingConfig.sitemaps || []
-  for (const baseURL of processingConfig.baseURLs) {
+  for (const baseURL of processingConfig.baseURLs || []) {
     const { origin } = new URL(baseURL)
     if (robots[origin]) continue
     try {
       const response = await axios.get(origin + '/robots.txt')
-      robots[origin] = robotsParser(origin + '/robots.txt', response.data)
+      robots[origin] = (robotsParser as any)(origin + '/robots.txt', response.data)
       for (const sitemap of robots[origin].getSitemaps()) {
-        if (!sitemaps.includes(sitemap))
+        if (!sitemaps.includes(sitemap)) {
+          await log.info(`add sitemap found in robots.txt ${sitemap}`)
+          sitemaps.push(sitemap)
+        }
       }
-    } catch (err) {
+    } catch (err: any) {
       await log.info(`failed to fetch ${origin + '/robots.txt'} - ${err.status || err.message}`)
     }
   }
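Note on the hunk above: robots.txt is fetched once per origin and parsed with robots-parser; disallowed URLs are filtered out in PagesIterator.push, the crawl delay is honoured, and sitemaps declared there are added to the crawl. A small standalone sketch of that flow, with example.com as a placeholder origin:

```ts
// Fetch and parse robots.txt for one origin, as run() does before crawling.
import robotsParser from 'robots-parser'

const origin = 'https://example.com' // placeholder origin
const robotsTxt = await (await fetch(origin + '/robots.txt')).text()
const robots = robotsParser(origin + '/robots.txt', robotsTxt)

const userAgent = 'data-fair-web-scraper' // default UA used by the scraper
console.log(robots.isAllowed(origin + '/some/page', userAgent)) // pages failing this check are never queued
console.log(robots.getCrawlDelay(userAgent) ?? 1)               // seconds to sleep between requests (defaults to 1)
console.log(robots.getSitemaps())                               // extra sitemap URLs merged into the crawl
```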
@@ -153,32 +184,33 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
   const pages = new PagesIterator(log, pluginConfig, processingConfig, robots)

   await log.step('Init pages list')
-  let existingPages
+  let existingPages: Page[] | undefined
   if (processingConfig.datasetMode === 'update') {
     existingPages = (await axios.get(`api/v1/datasets/${dataset.id}/lines`, { params: { select: '_id,url,etag,lastModified', size: 10000 } })).data.results
-
-
-      page
-
-
-
-
+    if (existingPages) {
+      await log.info(`add ${existingPages.length} pages from previous crawls`)
+      for (const page of existingPages) {
+        page.parsedURL = new URL(page.url)
+        if (page.parsedURL.hash) {
+          const parentURL = new URL(page.parsedURL)
+          parentURL.hash = ''
+          page.parentId = await getId({ url: parentURL.href })
+        }
+        await pages.push({ ...page, source: 'previous exploration' })
       }
-      await pages.push({ ...page, source: 'previous exploration' })
     }
   }
-  await log.info(`add ${processingConfig.startURLs
-  for (const url of processingConfig.startURLs) {
+  await log.info(`add ${processingConfig.startURLs?.length || 0} pages from config`)
+  for (const url of processingConfig.startURLs || []) {
     await pages.push({ url, source: 'config start URLs' })
   }

   for (const sitemapURL of sitemaps) {
     await log.info(`fetch start URLs from sitemap ${sitemapURL}`)
     const sitemap = (await axios.get(sitemapURL)).data
-    const cheerio = require('cheerio')
     const $ = cheerio.load(sitemap)
-    const sitemapURLs = []
-    $('url loc').each(function () {
+    const sitemapURLs: string[] = []
+    $('url loc').each(function (this: any) {
       sitemapURLs.push($(this).text())
     })
     for (const url of sitemapURLs) {
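Note on the hunk above: sitemaps are now parsed with the cheerio instance imported at the top of the module instead of an inline require; the extraction itself is unchanged. A standalone sketch:

```ts
// Extract <loc> entries from a sitemap, as run() does for each sitemap URL.
import * as cheerio from 'cheerio'

const sitemapXml = `<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url><loc>https://example.com/</loc></url>
  <url><loc>https://example.com/docs/</loc></url>
</urlset>`

const $ = cheerio.load(sitemapXml)
const sitemapURLs: string[] = []
$('url loc').each(function (this: any) {
  sitemapURLs.push($(this).text())
})
console.log(sitemapURLs) // [ 'https://example.com/', 'https://example.com/docs/' ]
```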
@@ -186,20 +218,16 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
     }
   }

-  const sentIds = new Set(
-  const sendPage = async (page, data, contentType = 'text/html', filename = 'content.html') => {
-    await log.debug('send page', page.url)
+  const sentIds = new Set<string>()
+  const sendPage = async (page: Page, data: any, contentType = 'text/html', filename = 'content.html') => {
     const form = new FormData()
-
+
     if (page.title) {
       page.title = page.title.trim()
       if (processingConfig.titlePrefix && page.title.startsWith(processingConfig.titlePrefix)) {
         page.title = page.title.replace(processingConfig.titlePrefix, '')
       }
     }
-    form.append('title', page.title)
-    form.append('url', page.url)
-    if (page.tags && page.tags.length) form.append('tags', page.tags.join(','))
     data = typeof data === 'string' ? Buffer.from(data) : data
     const dataOpts = {
       contentType,
@@ -207,49 +235,52 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
       knownLength: data.length
     }
     form.append('attachment', data, dataOpts)
-
-    if (page.etag) form.append('etag', page.etag)
-    page._id = getId(page)
+    page._id = await getId(page)
     sentIds.add(page._id)
+    const body = { ...page }
+    delete body.source
+    delete body.parsedURL
+    delete body.nofollow
+    delete body.noindex
+    form.append('_body', JSON.stringify(body))
     const headers = {
       ...form.getHeaders(),
       'content-length': form.getLengthSync()
     }
+
     await axios({
       method: 'put',
       url: `api/v1/datasets/${dataset.id}/lines/${page._id}`,
       data: form,
-      headers
+      headers,
+      validateStatus: (status) => status === 200 || status === 304
     })
   }

   for await (const page of pages) {
     if (stopped) break

-    const crawlDelay = (robots[page.parsedURL
+    const crawlDelay = (robots[page.parsedURL!.origin] && robots[page.parsedURL!.origin].getCrawlDelay()) || pluginConfig.defaultCrawlDelay || 1
     await new Promise(resolve => setTimeout(resolve, crawlDelay * 1000))

-
-    const headers = { 'user-agent': pluginConfig.userAgent || 'data-fair-web-scraper' }
+    const headers: Record<string, string> = { 'user-agent': pluginConfig.userAgent || 'data-fair-web-scraper' }
     if (page.lastModified) headers['if-modified-since'] = page.lastModified
     if (page.etag) headers['if-none-match'] = page.etag
-    let response
+    let response: any
     try {
       response = await axios.get(page.url, { headers, maxRedirects: 0 })
-    } catch (err) {
-      // content did not change
+    } catch (err: any) {
       if (err.status === 304) {
         await log.debug(`page was not modified since last exploration ${page.url}`)
-        sentIds.add(page._id)
-        for (const existingPage of existingPages) {
-          if (existingPage.parentId === page._id) sentIds.add(existingPage._id)
+        sentIds.add(page._id!)
+        for (const existingPage of existingPages || []) {
+          if (existingPage.parentId === page._id) sentIds.add(existingPage._id!)
         }
         continue
       }
-
-      if (err.status === 301) {
+      if (err.status === 301 || err.status === 302) {
         await log.debug(`page redirected ${page.url} -> ${err.headers.location}`)
-        pages.push({ url: new URL(err.headers.location, page.url).href, source: 'redirect ' + page.url })
+        await pages.push({ url: new URL(err.headers.location, page.url).href, source: 'redirect ' + page.url })
         continue
       }
       await log.warning(`failed to fetch page ${page.url} - ${err.status || err.message}`)
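Note on the hunks above: each page carries the etag and last-modified values stored during the previous crawl, and they are replayed as conditional request headers so unchanged pages come back as 304 and redirects are re-queued rather than followed. A rough standalone sketch of that logic using the global fetch (the plugin itself goes through the axios instance injected in its processing context, whose error shape differs):

```ts
// Replay stored validators as conditional headers and branch on the result,
// mirroring the 304/301/302 handling in the crawl loop above.
const previous = { etag: '"abc123"', lastModified: 'Tue, 01 Oct 2024 00:00:00 GMT' } // hypothetical values read back from the dataset

const headers: Record<string, string> = { 'user-agent': 'data-fair-web-scraper' }
if (previous.lastModified) headers['if-modified-since'] = previous.lastModified
if (previous.etag) headers['if-none-match'] = previous.etag

const res = await fetch('https://example.com/docs/', { headers, redirect: 'manual' })
if (res.status === 304) {
  console.log('not modified: keep the existing dataset line, skip re-indexing')
} else if (res.status === 301 || res.status === 302) {
  console.log('redirected: queue', res.headers.get('location'), 'instead')
} else if (res.ok) {
  console.log('changed: re-index and store the new validators', res.headers.get('etag'), res.headers.get('last-modified'))
}
```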
@@ -258,55 +289,70 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
     }

     if (response.headers['x-robots-tag']) {
-      await log.debug('
-      for (const part of response.headers['x-robots-tag'].split(',').map(p => p.trim())) {
+      await log.debug('x-robots-tag header', response.headers['x-robots-tag'])
+      for (const part of response.headers['x-robots-tag'].split(',').map((p: string) => p.trim())) {
         if (part === 'noindex') page.noindex = true
         if (part === 'nofollow') page.nofollow = true
       }
     }
-
     page.lastModified = response.headers['last-modified']
     page.etag = response.headers.etag

     const isHTML = (response.headers['content-type'] && response.headers['content-type'].startsWith('text/html;')) || (typeof response.data === 'string' && response.data.trim().startsWith('<html'))
     if (isHTML) {
-      const cheerio = require('cheerio')
       const $ = cheerio.load(response.data)
       const titleSelectors = (processingConfig.titleSelectors || []).concat(['title', 'h1'])
       for (const titleSelector of titleSelectors) {
         page.title = $(titleSelector).text()
         if (page.title) {
-          log.debug(`used title selector "${titleSelector}" -> ${page.title.trim()}`)
+          await log.debug(`used title selector "${titleSelector}" -> ${page.title.trim()}`)
           break
         }
       }
-
+
       if (processingConfig.tagsSelectors && processingConfig.tagsSelectors.length) {
         for (const tagsSelector of processingConfig.tagsSelectors) {
-          $(tagsSelector).each(function (
+          $(tagsSelector).each(function (this: any) {
             const tag = $(this).text().trim()
-            if (tag)
+            if (tag) {
+              page.tags = page.tags ?? []
+              page.tags!.push(tag)
+            }
           })
         }
       }

-      $('meta').each(function (
+      $('meta').each(function (this: any) {
         const name = $(this).attr('name')
+        const property = $(this).attr('property')
+        const content = $(this).attr('content')
         if (name === 'robots') {
-
-          log.debug('use robots meta', content)
+          log.debug('robots meta', content)
           if (content) {
-            for (const part of content.split(',').map(p => p.trim())) {
+            for (const part of content.split(',').map((p) => p.trim())) {
               if (part === 'noindex') page.noindex = true
               if (part === 'nofollow') page.nofollow = true
             }
           }
         }
+        if (processingConfig.extractKeywords && name === 'keywords' && content) {
+          page.tags = page.tags ?? []
+          for (const tag of content.split(',').map((t) => t.trim()).filter((t) => t)) {
+            page.tags!.push(tag)
+          }
+        }
+        if (processingConfig.extractArticleTags && property === 'article:tag' && content) {
+          page.tags = page.tags ?? []
+          page.tags!.push(content.trim())
+        }
+        if (processingConfig.extractDescription && name === 'description' && content) {
+          page.description = content.trim()
+        }
       })

       if (!page.noindex && processingConfig.anchors && processingConfig.anchors.length) {
-        const anchorsPages = []
-        $('a').each(function (
+        const anchorsPages: [Page, string][] = []
+        $('a').each(function (this: any) {
           const href = $(this).attr('href')
           if (!href) return
           const parsedURL = new URL(href, page.url)
@@ -317,7 +363,7 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
             const fragment = anchor.wrapperSelector ? targetElement.closest(anchor.wrapperSelector) : targetElement
             const fragmentHtml = fragment.html()
             if (fragmentHtml) {
-              const anchorPage = { url: parsedURL.href, tags: anchor.tags || [], source: 'anchor ' + page.url }
+              const anchorPage: Page = { url: parsedURL.href, tags: anchor.tags || [], source: 'anchor ' + page.url }
               if (anchor.titleSelector) anchorPage.title = fragment.find(anchor.titleSelector).text() || page.title
               else anchorPage.title = targetElement.text() || page.title
               anchorsPages.push([anchorPage, fragmentHtml])
@@ -333,7 +379,7 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
         }
       }
       if (!page.nofollow) {
-        $('a').each(function (
+        $('a').each(function (this: any) {
           const href = $(this).attr('href')
           if (href) pages.push({ url: new URL(href, page.url).href, source: 'link ' + page.url })
         })
@@ -341,7 +387,7 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir

       if (!page.noindex) {
         if (processingConfig.prune) {
-          processingConfig.prune.forEach(s => $(s).remove())
+          processingConfig.prune.forEach((s: string) => $(s).remove())
         }
         await sendPage(page, $.html())
       }
@@ -350,7 +396,7 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir

   if (existingPages) {
     for (const existingPage of existingPages) {
-      if (!sentIds.has(existingPage._id)) {
+      if (!sentIds.has(existingPage._id!)) {
         await log.info('delete previously explored page that was not indexed this time', existingPage.url)
         await axios.delete(`api/v1/datasets/${dataset.id}/lines/${existingPage._id}`)
       }
@@ -358,10 +404,6 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
     }
   }

-
-// not required but it is a good practice to prevent incoherent state a smuch as possible
-// the run method should finish shortly after calling stop, otherwise the process will be forcibly terminated
-// the grace period before force termination is 20 seconds
-exports.stop = async () => {
+export const stop = async () => {
   stopped = true
 }
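Note on the hunks above: the 0.6 crawler can now fill the new description column and enrich tags from <meta> elements, gated by the three new extract* flags in the processing config. A standalone sketch of that extraction with cheerio:

```ts
// Read keywords, article:tag and description from <meta> tags, as the crawl
// loop does when extractKeywords / extractArticleTags / extractDescription
// are enabled in the processing config.
import * as cheerio from 'cheerio'

const html = `<html><head>
  <meta name="keywords" content="open data, scraping , indexation">
  <meta property="article:tag" content="data-fair">
  <meta name="description" content="Example page description.">
</head><body></body></html>`

const $ = cheerio.load(html)
const page: { tags: string[], description?: string } = { tags: [] }

$('meta').each(function (this: any) {
  const name = $(this).attr('name')
  const property = $(this).attr('property')
  const content = $(this).attr('content')
  if (name === 'keywords' && content) {
    page.tags.push(...content.split(',').map((t) => t.trim()).filter((t) => t))
  }
  if (property === 'article:tag' && content) page.tags.push(content.trim())
  if (name === 'description' && content) page.description = content.trim()
})

console.log(page)
// { tags: [ 'open data', 'scraping', 'indexation', 'data-fair' ],
//   description: 'Example page description.' }
```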
package/package.json
CHANGED
@@ -1,12 +1,18 @@
 {
   "name": "@data-fair/processing-web-scraper",
-  "version": "0.
+  "version": "0.6.1",
   "description": "A small Web scraper that publishes its data into data-fair datasets.",
-  "main": "index.
+  "main": "index.ts",
+  "type": "module",
   "scripts": {
-    "
-    "lint": "eslint --
-    "
+    "lint": "eslint .",
+    "lint-fix": "eslint --fix .",
+    "build-types": "export NODE_OPTIONS='--experimental-strip-types' && df-build-types ./",
+    "prepare": "husky || true",
+    "test-base": "NODE_ENV=test node --experimental-strip-types --test-force-exit --test-concurrency=1 --test --test-reporter=spec --test-reporter-destination=stdout --test-timeout=300000",
+    "test": "npm run test-base test-it/*.ts",
+    "check-types": "tsc",
+    "quality": "npm run lint && npm run check-types && npm test"
   },
   "repository": {
     "type": "git",
@@ -22,17 +28,31 @@
   },
   "homepage": "https://github.com/data-fair/processing-web-scraper#readme",
   "devDependencies": {
-    "@
-    "config": "^
-    "
+    "@commitlint/cli": "^20.4.1",
+    "@commitlint/config-conventional": "^20.4.1",
+    "@data-fair/lib-processing-dev": "^0.2.0",
+    "@data-fair/lib-types-builder": "^1.11.3",
+    "@types/config": "^3.3.5",
+    "@types/express": "^5.0.6",
+    "@types/node": "^25.2.0",
+    "config": "^4.2.0",
+    "eslint": "^9.39.2",
     "express": "^4.18.2",
-    "
-    "
+    "husky": "^9.1.7",
+    "neostandard": "^0.12.2",
+    "ws": "^8.19.0"
   },
   "dependencies": {
-    "
-    "
-    "
+    "@data-fair/lib-utils": "^1.9.0",
+    "cheerio": "^1.2.0",
+    "form-data": "^4.0.5",
+    "robots-parser": "^3.0.1",
     "url-pattern": "^1.0.3"
-  }
+  },
+  "files": [
+    "plugin-config-schema.json",
+    "processing-config-schema.json",
+    "./index.ts",
+    "./lib/**/*"
+  ]
 }
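The new scripts run TypeScript directly with Node's type stripping and built-in test runner (test-base picks up test-it/*.ts). A hypothetical test file sketch under those assumptions; the file name and assertions are illustrative, not taken from the package:

```ts
// Hypothetical test-it/prepare.ts: exercised by `npm test`, i.e.
// node --experimental-strip-types --test ... test-it/*.ts
import { test } from 'node:test'
import assert from 'node:assert/strict'
import { prepare } from '../index.ts'

test('prepare returns the config and secrets unchanged', async () => {
  const processingConfig = { datasetMode: 'create', baseURLs: ['https://example.com'] } as any
  const secrets = {}
  const result = await prepare({ processingConfig, secrets } as any)
  assert.deepEqual(result, { processingConfig, secrets })
})
```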
package/processing-config-schema.json
CHANGED

@@ -72,6 +72,21 @@
       "title": "Sélecteurs d'éléments HTML à utiliser comme étiquettes",
       "items": {"type": "string"}
     },
+    "extractKeywords": {
+      "type": "boolean",
+      "title": "Extraire les mots-clés depuis la balise meta name=\"keywords\"",
+      "default": false
+    },
+    "extractArticleTags": {
+      "type": "boolean",
+      "title": "Extraire les étiquettes depuis les balises meta property=\"article:tag\"",
+      "default": false
+    },
+    "extractDescription": {
+      "type": "boolean",
+      "title": "Extraire la description depuis la balise meta name=\"description\"",
+      "default": false
+    },
     "sitemaps": {
       "type": "array",
       "title": "URLs de fichiers sitemap.xml",
package/.eslintrc.js
DELETED
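The legacy .eslintrc.js is deleted while eslint 9 and neostandard appear in devDependencies, which points to ESLint's flat config. A minimal eslint.config.js sketch under that assumption; the ts option is neostandard's documented switch for TypeScript rules, and the project's actual config may differ:

```ts
// Minimal flat-config sketch assuming the neostandard preset; not taken from the package.
import neostandard from 'neostandard'

export default neostandard({ ts: true })
```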