@data-fair/processing-web-scraper 0.4.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/index.ts +19 -0
- package/{index.js → lib/scraper.ts} +143 -94
- package/package.json +29 -11
- package/processing-config-schema.json +21 -1
- package/.eslintrc.js +0 -13
package/index.ts
ADDED
@@ -0,0 +1,19 @@
+import type { PrepareFunction, ProcessingContext } from '@data-fair/lib-common-types/processings.js'
+import type { ProcessingConfig } from './types/processingConfig/index.ts'
+
+export const prepare: PrepareFunction<ProcessingConfig> = async ({ processingConfig, secrets }) => {
+  return {
+    processingConfig,
+    secrets
+  }
+}
+
+export const run = async (context: ProcessingContext<ProcessingConfig>) => {
+  const { run } = await import('./lib/scraper.ts')
+  await run(context)
+}
+
+export const stop = async () => {
+  const { stop } = await import('./lib/scraper.ts')
+  await stop()
+}
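The new index.ts entry point only wires the plugin into the data-fair processings framework: prepare echoes the configuration and secrets back to the host, while run and stop lazily import the actual crawler from lib/scraper.ts so nothing heavy loads before a run starts. The sketch below is a rough, hypothetical driver, not framework code: the objects passed in are stand-ins (a real ProcessingContext also carries a preconfigured axios instance, log, ws.waitForJournal, patchConfig, working directories, etc.), and it assumes Node is started with --experimental-strip-types so the .ts entry point can be loaded directly.

// Hypothetical driver, for illustration only
import { prepare, run, stop } from '@data-fair/processing-web-scraper'

const processingConfig = {
  datasetMode: 'create',
  dataset: { title: 'Scraped pages' },
  startURLs: ['https://example.com/'],
  baseURLs: ['https://example.com/']
}

// prepare() currently just returns the config and secrets unchanged
const prepared = await prepare({ processingConfig, secrets: {} } as any)
console.log(prepared)

// run() would dynamically import lib/scraper.ts and hand it the full context;
// it is left commented out because this fake setup lacks axios, log, ws, ...
// await run({ pluginConfig: {}, processingConfig, processingId: 'demo' } as any)

// stop() only flips the module-level `stopped` flag, letting the crawl loop
// exit cleanly between two pages before the host's grace period runs out
await stop()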
package/{index.js → lib/scraper.ts}
CHANGED

@@ -1,11 +1,10 @@
-…
-…
-…
-…
-…
-…
-…
-// specifications listed here http://robots-txt.com/
+import { createHash } from 'node:crypto'
+import UrlPattern from 'url-pattern'
+import robotsParser from 'robots-parser'
+import cheerio from 'cheerio'
+import FormData from 'form-data'
+import type { ProcessingContext } from '@data-fair/lib-common-types/processings.js'
+import type { ProcessingConfig } from '../types/processingConfig/index.ts'
 
 const datasetSchema = [
   {
@@ -14,6 +13,12 @@ const datasetSchema = [
     'x-refersTo': 'http://www.w3.org/2000/01/rdf-schema#label',
     'x-capabilities': { textAgg: false }
   },
+  {
+    key: 'description',
+    type: 'string',
+    'x-refersTo': 'https://schema.org/description',
+    'x-capabilities': { values: false, textAgg: false, insensitive: false }
+  },
   {
     key: 'url',
     type: 'string',
@@ -30,7 +35,6 @@ const datasetSchema = [
   {
     key: 'etag',
     type: 'string',
-    separator: ',',
     'x-capabilities': { index: false, values: false, text: false, textStandard: false, textAgg: false, insensitive: false }
   },
   {
@@ -46,10 +50,9 @@ const datasetSchema = [
   }
 ]
 
-…
-let stopped
+let stopped: boolean | undefined
 
-const normalizeURL = (url, ignoreHash = false, addSlash = false) => {
+const normalizeURL = (url: string, ignoreHash = false, addSlash = false): string => {
   const parsedURL = new URL(url)
   for (const indexSuffix of ['index.html', 'index.php', 'index.jsp', 'index.cgi']) {
     if (parsedURL.pathname.endsWith('/' + indexSuffix)) {
@@ -61,20 +64,41 @@ const normalizeURL = (url, ignoreHash = false, addSlash = false) => {
   return parsedURL.href
 }
 
-const getId = (page) => {
-  return …
+const getId = async (page: { url: string }): Promise<string> => {
+  return createHash('sha256').update(normalizeURL(page.url)).digest('base64url').slice(0, 20)
+}
+
+interface Page {
+  url: string
+  title?: string
+  description?: string
+  tags?: string[]
+  etag?: string
+  lastModified?: string
+  attachmentPath?: string
+  _id?: string
+  parsedURL?: URL
+  source?: string
+  noindex?: boolean
+  nofollow?: boolean
+  parentId?: string
 }
 
 class PagesIterator {
-…
-…
-…
+  pages: Page[] = []
+  cursor = -1
+  log: any
+  pluginConfig: any
+  processingConfig: ProcessingConfig
+  robots: Record<string, any>
+  excludeURLPatterns: any[] = []
+
+  constructor (log: any, pluginConfig: any, processingConfig: ProcessingConfig, robots: Record<string, any>) {
     this.log = log
     this.pluginConfig = pluginConfig
     this.processingConfig = processingConfig
     this.robots = robots
-…
-    this.excludeURLPatterns = (processingConfig.excludeURLPatterns || []).map(p => {
+    this.excludeURLPatterns = (processingConfig.excludeURLPatterns || []).map((p: string) => {
      const url = new URL(p)
      const pattern = new UrlPattern(url.pathname)
      pattern.hostname = url.hostname
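The reworked getId above derives a dataset line ID from the SHA-256 of the normalized URL, base64url-encoded and truncated to 20 characters, so the same page reached under slightly different spellings collapses to a single line. The standalone sketch below restates that derivation outside the plugin; it is synchronous for brevity and it assumes the normalizeURL loop body (not shown in the hunk) strips the matched index suffix from the pathname.

import { createHash } from 'node:crypto'

// Simplified re-statement of normalizeURL: drop common index suffixes
const normalizeURL = (url: string): string => {
  const parsedURL = new URL(url)
  for (const indexSuffix of ['index.html', 'index.php', 'index.jsp', 'index.cgi']) {
    if (parsedURL.pathname.endsWith('/' + indexSuffix)) {
      // assumption: the real implementation removes the suffix roughly like this
      parsedURL.pathname = parsedURL.pathname.slice(0, -indexSuffix.length)
    }
  }
  return parsedURL.href
}

// Same hash recipe as getId in the diff: sha256 -> base64url -> first 20 chars
const getId = (url: string): string =>
  createHash('sha256').update(normalizeURL(url)).digest('base64url').slice(0, 20)

// Both spellings normalize to https://example.com/docs/ and share one _id
console.log(getId('https://example.com/docs/index.html'))
console.log(getId('https://example.com/docs/'))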
@@ -86,19 +110,19 @@ class PagesIterator {
     return this
   }
 
-  push (page) {
+  async push (page: Page | string) {
     if (typeof page === 'string') page = { url: page }
-    if (!this.processingConfig.baseURLs…
+    if (!this.processingConfig.baseURLs?.find((b: string) => page.url.startsWith(b))) return
     page.parsedURL = page.parsedURL || new URL(page.url)
     if (page.parsedURL.hash) return
-    if (this.excludeURLPatterns.find(p => p.match(page.parsedURL…
+    if (this.excludeURLPatterns.find((p: any) => p.match(page.parsedURL!.pathname) && p.hostname === page.parsedURL!.hostname)) {
       return
     }
-    if (this.robots[page.parsedURL…
+    if (this.robots[page.parsedURL!.origin] && !this.robots[page.parsedURL!.origin].isAllowed(page.url, this.pluginConfig.userAgent || 'data-fair-web-scraper')) {
       return
     }
-    page._id = getId(page)
-    if (this.pages.find(p => p._id === page._id)) return
+    page._id = await getId(page)
+    if (this.pages.find((p: Page) => p._id === page._id)) return
     this.pages.push(page)
   }
 
@@ -112,40 +136,45 @@ class PagesIterator {
   }
 }
 
-…
-…
+export const run = async (context: ProcessingContext<ProcessingConfig>) => {
+  const { pluginConfig, processingConfig, processingId, axios, log, patchConfig, ws } = context
+  let dataset: any
   if (processingConfig.datasetMode === 'create') {
     await log.step('Dataset creation')
     dataset = (await axios.post('api/v1/datasets', {
-      id: processingConfig.dataset…
-      title: processingConfig.dataset…
+      id: processingConfig.dataset?.id,
+      title: processingConfig.dataset?.title,
       isRest: true,
       schema: datasetSchema,
       extras: { processingId }
     })).data
+    if (dataset.status !== 'finalized') {
+      await ws.waitForJournal(dataset.id, 'finalize-end')
+    }
     await log.info(`dataset created, id="${dataset.id}", title="${dataset.title}"`)
     await patchConfig({ datasetMode: 'update', dataset: { id: dataset.id, title: dataset.title } })
-    await ws.waitForJournal(dataset.id, 'finalize-end')
   } else if (processingConfig.datasetMode === 'update') {
     await log.step('Check dataset')
-    dataset = (await axios.get(`api/v1/datasets/${processingConfig.dataset…
-    if (!dataset) throw new Error(`the dataset does not exist, id="${processingConfig.dataset…
+    dataset = (await axios.get(`api/v1/datasets/${processingConfig.dataset?.id}`)).data
+    if (!dataset) throw new Error(`the dataset does not exist, id="${processingConfig.dataset?.id}"`)
     await log.info(`the dataset exists, id="${dataset.id}", title="${dataset.title}"`)
   }
 
-…
-  const robots = {}
+  const robots: Record<string, any> = {}
   const sitemaps = processingConfig.sitemaps || []
-  for (const baseURL of processingConfig.baseURLs) {
+  for (const baseURL of processingConfig.baseURLs || []) {
     const { origin } = new URL(baseURL)
     if (robots[origin]) continue
     try {
       const response = await axios.get(origin + '/robots.txt')
       robots[origin] = robotsParser(origin + '/robots.txt', response.data)
       for (const sitemap of robots[origin].getSitemaps()) {
-        if (!sitemaps.includes(sitemap))…
+        if (!sitemaps.includes(sitemap)) {
+          await log.info(`add sitemap found in robots.txt ${sitemap}`)
+          sitemaps.push(sitemap)
+        }
       }
-    } catch (err) {
+    } catch (err: any) {
       await log.info(`failed to fetch ${origin + '/robots.txt'} - ${err.status || err.message}`)
     }
   }
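Before queueing or fetching anything, the rewritten run collects one robots.txt parser per origin and later asks it whether a URL is allowed, what crawl delay to respect, and which sitemaps it advertises. The snippet below exercises the same robots-parser calls against an inline robots.txt; the fixture content is invented for the example.

import robotsParser from 'robots-parser'

const robotsTxt = [
  'User-agent: *',
  'Disallow: /private/',
  'Crawl-delay: 2',
  'Sitemap: https://example.com/sitemap.xml'
].join('\n')

const robots = robotsParser('https://example.com/robots.txt', robotsTxt)

// The same three checks the crawler relies on
console.log(robots.isAllowed('https://example.com/private/page.html', 'data-fair-web-scraper')) // false
console.log(robots.getCrawlDelay('data-fair-web-scraper')) // 2
console.log(robots.getSitemaps()) // [ 'https://example.com/sitemap.xml' ]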
@@ -153,32 +182,33 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
   const pages = new PagesIterator(log, pluginConfig, processingConfig, robots)
 
   await log.step('Init pages list')
-  let existingPages
+  let existingPages: Page[] | undefined
   if (processingConfig.datasetMode === 'update') {
     existingPages = (await axios.get(`api/v1/datasets/${dataset.id}/lines`, { params: { select: '_id,url,etag,lastModified', size: 10000 } })).data.results
-…
-…
-    page…
-…
-…
-…
+    if (existingPages) {
+      await log.info(`add ${existingPages.length} pages from previous crawls`)
+      for (const page of existingPages) {
+        page.parsedURL = new URL(page.url)
+        if (page.parsedURL.hash) {
+          const parentURL = new URL(page.parsedURL)
+          parentURL.hash = ''
+          page.parentId = await getId({ url: parentURL.href })
+        }
+        await pages.push({ ...page, source: 'previous exploration' })
       }
-      await pages.push({ ...page, source: 'previous exploration' })
     }
   }
-  await log.info(`add ${processingConfig.startURLs…
-  for (const url of processingConfig.startURLs) {
+  await log.info(`add ${processingConfig.startURLs?.length || 0} pages from config`)
+  for (const url of processingConfig.startURLs || []) {
     await pages.push({ url, source: 'config start URLs' })
   }
 
   for (const sitemapURL of sitemaps) {
     await log.info(`fetch start URLs from sitemap ${sitemapURL}`)
     const sitemap = (await axios.get(sitemapURL)).data
-    const cheerio = require('cheerio')
     const $ = cheerio.load(sitemap)
-    const sitemapURLs = []
-    $('url loc').each(function () {
+    const sitemapURLs: string[] = []
+    $('url loc').each(function (this: any) {
       sitemapURLs.push($(this).text())
     })
     for (const url of sitemapURLs) {
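Sitemaps, whether listed in the config or discovered in robots.txt, are fetched and parsed with cheerio, now imported once at module level instead of being require()d inside the loop. A minimal sketch of that parsing step with an inline XML fixture:

import cheerio from 'cheerio'

const sitemapXml = `<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url><loc>https://example.com/</loc></url>
  <url><loc>https://example.com/docs/</loc></url>
</urlset>`

const $ = cheerio.load(sitemapXml)
const sitemapURLs: string[] = []
// Same selector as in the diff: every <loc> inside a <url> element
$('url loc').each(function (this: any) {
  sitemapURLs.push($(this).text())
})
console.log(sitemapURLs) // [ 'https://example.com/', 'https://example.com/docs/' ]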
@@ -186,20 +216,16 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
     }
   }
 
-  const sentIds = new Set(…
-  const sendPage = async (page, data, contentType = 'text/html', filename = 'content.html') => {
-    await log.debug('send page', page.url)
+  const sentIds = new Set<string>()
+  const sendPage = async (page: Page, data: any, contentType = 'text/html', filename = 'content.html') => {
     const form = new FormData()
-…
+
     if (page.title) {
       page.title = page.title.trim()
       if (processingConfig.titlePrefix && page.title.startsWith(processingConfig.titlePrefix)) {
         page.title = page.title.replace(processingConfig.titlePrefix, '')
       }
     }
-    form.append('title', page.title)
-    form.append('url', page.url)
-    if (page.tags && page.tags.length) form.append('tags', page.tags.join(','))
     data = typeof data === 'string' ? Buffer.from(data) : data
     const dataOpts = {
       contentType,
@@ -207,49 +233,52 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
       knownLength: data.length
     }
     form.append('attachment', data, dataOpts)
-…
-    if (page.etag) form.append('etag', page.etag)
-    page._id = getId(page)
+    page._id = await getId(page)
     sentIds.add(page._id)
+    const body = { ...page }
+    delete body.source
+    delete body.parsedURL
+    delete body.nofollow
+    delete body.noindex
+    form.append('_body', JSON.stringify(body))
     const headers = {
       ...form.getHeaders(),
       'content-length': form.getLengthSync()
     }
+
     await axios({
       method: 'put',
       url: `api/v1/datasets/${dataset.id}/lines/${page._id}`,
       data: form,
-      headers
+      headers,
+      validateStatus: (status) => status === 200 || status === 304
     })
   }
 
   for await (const page of pages) {
     if (stopped) break
 
-    const crawlDelay = (robots[page.parsedURL…
+    const crawlDelay = (robots[page.parsedURL!.origin] && robots[page.parsedURL!.origin].getCrawlDelay()) || pluginConfig.defaultCrawlDelay || 1
     await new Promise(resolve => setTimeout(resolve, crawlDelay * 1000))
 
-…
-    const headers = { 'user-agent': pluginConfig.userAgent || 'data-fair-web-scraper' }
+    const headers: Record<string, string> = { 'user-agent': pluginConfig.userAgent || 'data-fair-web-scraper' }
     if (page.lastModified) headers['if-modified-since'] = page.lastModified
     if (page.etag) headers['if-none-match'] = page.etag
-    let response
+    let response: any
     try {
       response = await axios.get(page.url, { headers, maxRedirects: 0 })
-    } catch (err) {
-      // content did not change
+    } catch (err: any) {
       if (err.status === 304) {
         await log.debug(`page was not modified since last exploration ${page.url}`)
-        sentIds.add(page._id)
-        for (const existingPage of existingPages) {
-          if (existingPage.parentId === page._id) sentIds.add(existingPage._id)
+        sentIds.add(page._id!)
+        for (const existingPage of existingPages || []) {
+          if (existingPage.parentId === page._id) sentIds.add(existingPage._id!)
         }
         continue
       }
-…
-      if (err.status === 301) {
+      if (err.status === 301 || err.status === 302) {
        await log.debug(`page redirected ${page.url} -> ${err.headers.location}`)
-        pages.push({ url: new URL(err.headers.location, page.url).href, source: 'redirect ' + page.url })
+        await pages.push({ url: new URL(err.headers.location, page.url).href, source: 'redirect ' + page.url })
        continue
      }
      await log.warning(`failed to fetch page ${page.url} - ${err.status || err.message}`)
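The crawl loop now revalidates pages instead of always re-downloading them: the etag and lastModified values stored on each dataset line are replayed as If-None-Match / If-Modified-Since headers, a 304 keeps the existing line, and 301/302 responses are re-queued. The sketch below reproduces that conditional fetch with stock axios; the plugin itself uses the preconfigured axios instance from its context, which surfaces err.status and err.headers directly, whereas plain axios exposes them under err.response.

import axios from 'axios'

// Values a previous crawl would have stored on the dataset line (placeholders)
const stored = { etag: '"abc123"', lastModified: 'Tue, 01 Jul 2025 00:00:00 GMT' }

const headers: Record<string, string> = { 'user-agent': 'data-fair-web-scraper' }
headers['if-modified-since'] = stored.lastModified
headers['if-none-match'] = stored.etag

try {
  const response = await axios.get('https://example.com/docs/', { headers, maxRedirects: 0 })
  console.log('content changed, re-index it', response.headers.etag)
} catch (err: any) {
  const status = err.response?.status
  if (status === 304) console.log('not modified, keep the existing line and its attachment')
  else if (status === 301 || status === 302) console.log('queue the redirect target', err.response?.headers.location)
  else console.log('fetch failed', status || err.message)
}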
@@ -258,46 +287,70 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
     }
 
     if (response.headers['x-robots-tag']) {
-      await log.debug('…
-      for (const part of response.headers['x-robots-tag'].split(',').map(p => p.trim())) {
+      await log.debug('x-robots-tag header', response.headers['x-robots-tag'])
+      for (const part of response.headers['x-robots-tag'].split(',').map((p: string) => p.trim())) {
         if (part === 'noindex') page.noindex = true
         if (part === 'nofollow') page.nofollow = true
       }
     }
-…
     page.lastModified = response.headers['last-modified']
     page.etag = response.headers.etag
 
     const isHTML = (response.headers['content-type'] && response.headers['content-type'].startsWith('text/html;')) || (typeof response.data === 'string' && response.data.trim().startsWith('<html'))
     if (isHTML) {
-      const cheerio = require('cheerio')
       const $ = cheerio.load(response.data)
       const titleSelectors = (processingConfig.titleSelectors || []).concat(['title', 'h1'])
       for (const titleSelector of titleSelectors) {
         page.title = $(titleSelector).text()
         if (page.title) {
-          log.debug(`used title selector "${titleSelector}" -> ${page.title.trim()}`)
+          await log.debug(`used title selector "${titleSelector}" -> ${page.title.trim()}`)
           break
         }
       }
 
-…
+      if (processingConfig.tagsSelectors && processingConfig.tagsSelectors.length) {
+        for (const tagsSelector of processingConfig.tagsSelectors) {
+          $(tagsSelector).each(function (this: any) {
+            const tag = $(this).text().trim()
+            if (tag) {
+              page.tags = page.tags ?? []
+              page.tags!.push(tag)
+            }
+          })
+        }
+      }
+
+      $('meta').each(function (this: any) {
         const name = $(this).attr('name')
+        const property = $(this).attr('property')
+        const content = $(this).attr('content')
         if (name === 'robots') {
-…
-          log.debug('use robots meta', content)
+          log.debug('robots meta', content)
           if (content) {
-            for (const part of content.split(',').map(p => p.trim())) {
+            for (const part of content.split(',').map((p) => p.trim())) {
               if (part === 'noindex') page.noindex = true
               if (part === 'nofollow') page.nofollow = true
             }
           }
         }
+        if (processingConfig.extractKeywords && name === 'keywords' && content) {
+          page.tags = page.tags ?? []
+          for (const tag of content.split(',').map((t) => t.trim()).filter((t) => t)) {
+            page.tags!.push(tag)
+          }
+        }
+        if (processingConfig.extractArticleTags && property === 'article:tag' && content) {
+          page.tags = page.tags ?? []
+          page.tags!.push(content.trim())
+        }
+        if (processingConfig.extractDescription && name === 'description' && content) {
+          page.description = content.trim()
+        }
       })
 
       if (!page.noindex && processingConfig.anchors && processingConfig.anchors.length) {
-        const anchorsPages = []
-        $('a').each(function (…
+        const anchorsPages: [Page, string][] = []
+        $('a').each(function (this: any) {
          const href = $(this).attr('href')
          if (!href) return
          const parsedURL = new URL(href, page.url)
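This hunk is where the new configuration options take effect: tagsSelectors turns the text of matched elements into tags, extractKeywords and extractArticleTags harvest meta keywords and article:tag values, and extractDescription fills the new description column. The fixture below shows what those rules would collect from a small page; the '.badge' selector stands in for whatever a user would actually configure.

import cheerio from 'cheerio'

// Small page fixture; '.badge' plays the role of a configured tagsSelector
const html = `<html><head>
  <meta name="keywords" content="open data, scraping">
  <meta property="article:tag" content="data-fair">
  <meta name="description" content="A sample page.">
</head><body><span class="badge">tutorial</span></body></html>`

const $ = cheerio.load(html)
const page: { tags: string[], description?: string } = { tags: [] }

// tagsSelectors: the text of every matched element becomes a tag
$('.badge').each(function (this: any) {
  const tag = $(this).text().trim()
  if (tag) page.tags.push(tag)
})

// extractKeywords / extractArticleTags / extractDescription
$('meta').each(function (this: any) {
  const name = $(this).attr('name')
  const property = $(this).attr('property')
  const content = $(this).attr('content')
  if (name === 'keywords' && content) page.tags.push(...content.split(',').map(t => t.trim()).filter(t => t))
  if (property === 'article:tag' && content) page.tags.push(content.trim())
  if (name === 'description' && content) page.description = content.trim()
})

console.log(page.tags)        // [ 'tutorial', 'open data', 'scraping', 'data-fair' ]
console.log(page.description) // 'A sample page.'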
@@ -308,7 +361,7 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
           const fragment = anchor.wrapperSelector ? targetElement.closest(anchor.wrapperSelector) : targetElement
           const fragmentHtml = fragment.html()
           if (fragmentHtml) {
-            const anchorPage = { url: parsedURL.href, tags: anchor.tags || [], source: 'anchor ' + page.url }
+            const anchorPage: Page = { url: parsedURL.href, tags: anchor.tags || [], source: 'anchor ' + page.url }
             if (anchor.titleSelector) anchorPage.title = fragment.find(anchor.titleSelector).text() || page.title
             else anchorPage.title = targetElement.text() || page.title
             anchorsPages.push([anchorPage, fragmentHtml])
@@ -324,7 +377,7 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
         }
       }
       if (!page.nofollow) {
-        $('a').each(function (…
+        $('a').each(function (this: any) {
          const href = $(this).attr('href')
          if (href) pages.push({ url: new URL(href, page.url).href, source: 'link ' + page.url })
        })
@@ -332,7 +385,7 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
 
      if (!page.noindex) {
        if (processingConfig.prune) {
-          processingConfig.prune.forEach(s => $(s).remove())
+          processingConfig.prune.forEach((s: string) => $(s).remove())
        }
        await sendPage(page, $.html())
      }
@@ -341,7 +394,7 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
 
   if (existingPages) {
     for (const existingPage of existingPages) {
-      if (!sentIds.has(existingPage._id)) {
+      if (!sentIds.has(existingPage._id!)) {
         await log.info('delete previously explored page that was not indexed this time', existingPage.url)
         await axios.delete(`api/v1/datasets/${dataset.id}/lines/${existingPage._id}`)
       }
@@ -349,10 +402,6 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
   }
 }
 
-…
-// not required but it is a good practice to prevent incoherent state a smuch as possible
-// the run method should finish shortly after calling stop, otherwise the process will be forcibly terminated
-// the grace period before force termination is 20 seconds
-exports.stop = async () => {
+export const stop = async () => {
   stopped = true
 }
package/package.json
CHANGED
@@ -1,12 +1,16 @@
 {
   "name": "@data-fair/processing-web-scraper",
-  "version": "0.…
+  "version": "0.6.0",
   "description": "A small Web scraper that publishes its data into data-fair datasets.",
-  "main": "index.…
+  "main": "index.ts",
+  "type": "module",
   "scripts": {
-    "…
-    "lint": "eslint --…
-    "…
+    "lint": "eslint .",
+    "lint-fix": "eslint --fix .",
+    "build-types": "export NODE_OPTIONS='--experimental-strip-types' && df-build-types ./",
+    "prepare": "husky || true",
+    "test-base": "NODE_ENV=test node --experimental-strip-types --test-force-exit --test-concurrency=1 --test --test-reporter=spec --test-reporter-destination=stdout --test-timeout=300000",
+    "test": "npm run test-base test-it/*.ts"
   },
   "repository": {
     "type": "git",
@@ -22,17 +26,31 @@
   },
   "homepage": "https://github.com/data-fair/processing-web-scraper#readme",
   "devDependencies": {
-    "@…
-    "config": "^…
-    "…
+    "@commitlint/cli": "^20.4.1",
+    "@commitlint/config-conventional": "^20.4.1",
+    "@data-fair/lib-processing-dev": "^0.2.0",
+    "@data-fair/lib-types-builder": "^1.11.3",
+    "@types/config": "^3.3.5",
+    "@types/express": "^5.0.6",
+    "@types/node": "^25.2.0",
+    "config": "^4.2.0",
+    "eslint": "^9.39.2",
     "express": "^4.18.2",
-    "…
-    "…
+    "husky": "^9.1.7",
+    "neostandard": "^0.12.2",
+    "ws": "^8.19.0"
   },
   "dependencies": {
+    "@data-fair/lib-utils": "^1.9.0",
     "cheerio": "^1.0.0-rc.12",
     "form-data": "^4.0.0",
     "robots-parser": "^3.0.0",
     "url-pattern": "^1.0.3"
-  }
+  },
+  "files": [
+    "plugin-config-schema.json",
+    "processing-config-schema.json",
+    "./index.ts",
+    "./lib/**/*"
+  ]
 }
package/processing-config-schema.json
CHANGED

@@ -67,6 +67,26 @@
       "description": "Par défaut le sélecteur 'title' est utilisé ce qui correspond au titre de page dans les métadonnées HTML.",
       "items": {"type": "string"}
     },
+    "tagsSelectors": {
+      "type": "array",
+      "title": "Sélecteurs d'éléments HTML à utiliser comme étiquettes",
+      "items": {"type": "string"}
+    },
+    "extractKeywords": {
+      "type": "boolean",
+      "title": "Extraire les mots-clés depuis la balise meta name=\"keywords\"",
+      "default": false
+    },
+    "extractArticleTags": {
+      "type": "boolean",
+      "title": "Extraire les étiquettes depuis les balises meta property=\"article:tag\"",
+      "default": false
+    },
+    "extractDescription": {
+      "type": "boolean",
+      "title": "Extraire la description depuis la balise meta name=\"description\"",
+      "default": false
+    },
     "sitemaps": {
       "type": "array",
       "title": "URLs de fichiers sitemap.xml",
@@ -82,7 +102,7 @@
     "excludeURLPatterns": {
       "type": "array",
       "title": "Formats d'URL à exclure de l'exploration",
-      "description": "Exemple: https://data-fair.github.io/…
+      "description": "Exemple: https://data-fair.github.io/3/en(/*)",
      "items": {"type": "string"}
    },
    "anchors": {
package/.eslintrc.js
DELETED