@data-fair/processing-web-scraper 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/index.js +46 -9
- package/package.json +2 -2
- package/processing-config-schema.json +1 -1
package/index.js
CHANGED
@@ -84,6 +84,7 @@ class PagesIterator {
     if (typeof page === 'string') page = { url: page }
     if (!this.processingConfig.baseURLs.find(b => page.url.startsWith(b))) return
     page.parsedURL = page.parsedURL || new URL(page.url)
+    if (page.parsedURL.hash) return
     if (this.robots[page.parsedURL.origin] && !this.robots[page.parsedURL.origin].isAllowed(page.url, this.pluginConfig.userAgent || 'data-fair-web-scraper')) {
       return
     }
@@ -139,15 +140,27 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
   const pages = new PagesIterator(log, pluginConfig, processingConfig, robots)

   await log.step('Init pages list')
-
-  for (const url of processingConfig.startURLs) await pages.push({ url, source: 'config start URLs' })
+  let existingPages
   if (processingConfig.datasetMode === 'update') {
-
+    existingPages = (await axios.get(`api/v1/datasets/${dataset.id}/lines`, { params: { select: '_id,url,etag,lastModified', size: 10000 } })).data.results
     await log.info(`add ${existingPages.length} pages from previous crawls`)
-    for (const page of existingPages)
+    for (const page of existingPages) {
+      page.parsedURL = new URL(page.url)
+      if (page.parsedURL.hash) {
+        const parentURL = new URL(page.parsedURL)
+        parentURL.hash = ''
+        page.parentId = getId({ url: parentURL.href })
+      }
+      await pages.push({ ...page, source: 'previous exploration' })
+    }
+  }
+  await log.info(`add ${processingConfig.startURLs.length} pages from config`)
+  for (const url of processingConfig.startURLs) {
+    await pages.push({ url, source: 'config start URLs' })
   }
   // TODO: init from sitemap (and use robots.getSitemaps() to help in this)

+  const sentIds = new Set([])
   const sendPage = async (page, data, contentType = 'text/html', filename = 'content.html') => {
     await log.debug('send page', page.url)
     // TODO: apply no-index rules
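In update mode the previously crawled lines are re-queued, and URLs that carry a hash fragment get a parentId pointing at the hash-less page they were extracted from. A minimal sketch of that derivation with the WHATWG URL API (the example URL is made up; getId is the plugin's own helper and is not reproduced here):

// copy the URL before mutating it, then drop the fragment
const pageURL = new URL('https://example.com/docs/guide.html#section-2')
if (pageURL.hash) {
  const parentURL = new URL(pageURL)
  parentURL.hash = ''
  console.log(parentURL.href) // https://example.com/docs/guide.html
  // the plugin would then compute page.parentId = getId({ url: parentURL.href })
}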
@@ -169,7 +182,10 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
       knownLength: data.length
     }
     form.append('attachment', data, dataOpts)
+    if (page.lastModified) form.append('lastModified', page.lastModified)
+    if (page.etag) form.append('etag', page.etag)
     page._id = getId(page)
+    sentIds.add(page._id)
     const headers = {
       ...form.getHeaders(),
       'content-length': form.getLengthSync()
@@ -189,10 +205,21 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
     await new Promise(resolve => setTimeout(resolve, crawlDelay * 1000))

     // TODO: apply if-none-match and if-modified-since headers if etag or lastModified are available
+    const headers = { 'user-agent': pluginConfig.userAgent || 'data-fair-web-scraper' }
+    if (page.lastModified) headers['if-modified-since'] = page.lastModified
+    if (page.etag) headers['if-none-match'] = page.etag
     let response
     try {
-      response = await axios.get(page.url, { headers
+      response = await axios.get(page.url, { headers })
     } catch (err) {
+      if (err.status === 304) {
+        await log.debug(`page was not modified since last exploration ${page.url}`)
+        sentIds.add(page._id)
+        for (const existingPage of existingPages) {
+          if (existingPage.parentId === page._id) sentIds.add(existingPage._id)
+        }
+        continue
+      }
       await log.warning(`failed to fetch page ${page.url} - ${err.status || err.message}`)
       if (page.source) await log.warning(`this broken URL comes from ${page.source}`)
       continue
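The new headers turn each fetch into a conditional request: when the stored etag or lastModified value still matches, the server answers 304 Not Modified and the page (and its fragment pages) are kept as-is. A standalone sketch of the same idea, using axios's validateStatus option to treat 304 as a normal response instead of catching it as an error (the URL and stored validators are placeholders, not the plugin's code):

const axios = require('axios')

async function fetchIfChanged (url, { etag, lastModified } = {}) {
  const headers = {}
  if (lastModified) headers['if-modified-since'] = lastModified
  if (etag) headers['if-none-match'] = etag
  const response = await axios.get(url, {
    headers,
    // accept 304 instead of letting axios reject it
    validateStatus: (status) => (status >= 200 && status < 300) || status === 304
  })
  if (response.status === 304) return null // unchanged since the last crawl
  // keep the new validators for the next crawl
  return { data: response.data, etag: response.headers.etag, lastModified: response.headers['last-modified'] }
}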
@@ -205,6 +232,9 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
       }
     }

+    page.lastModified = response.headers['last-modified']
+    page.etag = response.headers.etag
+
     const isHTML = (response.headers['content-type'] && response.headers['content-type'].startsWith('text/html;')) || (typeof response.data === 'string' && response.data.trim().startsWith('<html'))
     if (isHTML) {
       const cheerio = require('cheerio')
@@ -237,10 +267,9 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
         const fragment = anchor.wrapperSelector ? targetElement.closest(anchor.wrapperSelector) : targetElement
         const fragmentHtml = fragment.html()
         if (fragmentHtml) {
-          const anchorPage = { url: parsedURL.href }
+          const anchorPage = { url: parsedURL.href, tags: anchor.tags || [], source: 'anchor ' + page.url }
           if (anchor.titleSelector) anchorPage.title = fragment.find(anchor.titleSelector).text() || page.title
           else anchorPage.title = targetElement.text() || page.title
-          anchorPage.tags = anchor.tags || []
           anchorsPages.push([anchorPage, fragmentHtml])
           $(fragment).remove()
         }
@@ -248,7 +277,6 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
         }
       })
       for (const [anchorPage, fragmentHtml] of anchorsPages) {
-        console.log(anchorPage)
         await sendPage(anchorPage, `<body>
         ${fragmentHtml}
         </body>`)
@@ -258,7 +286,7 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
       if (!page.nofollow) {
         $('a').each(function (i, elem) {
           const href = $(this).attr('href')
-          if (href) pages.push({ url: new URL(href, page.url).href, source: page.url })
+          if (href) pages.push({ url: new URL(href, page.url).href, source: 'link ' + page.url })
         })
       }

@@ -270,6 +298,15 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
       }
     }
   }
+
+  if (existingPages) {
+    for (const existingPage of existingPages) {
+      if (!sentIds.has(existingPage._id)) {
+        await log.info('delete previously explored page that was not indexed this time', existingPage.url)
+        await axios.delete(`api/v1/datasets/${dataset.id}/lines/${existingPage._id}`)
+      }
+    }
+  }
 }

 // used to manage interruption

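Taken together, sentIds acts as a reconciliation set: every page id written (or confirmed unchanged via a 304) during this crawl is recorded, and any line left over from the previous crawl is deleted through the dataset lines API. A small illustration of that set difference (ids and shapes are made up):

const sentIds = new Set(['page-a', 'page-c'])
const existingPages = [{ _id: 'page-a' }, { _id: 'page-b' }, { _id: 'page-c' }]
const stale = existingPages.filter(p => !sentIds.has(p._id))
console.log(stale.map(p => p._id)) // ['page-b']
// each stale line is then removed with DELETE api/v1/datasets/{dataset.id}/lines/{_id}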
package/package.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@data-fair/processing-web-scraper",
-  "version": "0.1.0",
+  "version": "0.2.0",
   "description": "A small Web scraper that publishes its data into data-fair datasets.",
   "main": "index.js",
   "scripts": {
@@ -20,7 +20,7 @@
   "bugs": {
     "url": "https://github.com/data-fair/processing-web-scraper/issues"
   },
-  "homepage": "https://github.com/data-fair/
+  "homepage": "https://github.com/data-fair/processing-web-scraper#readme",
   "devDependencies": {
     "@data-fair/processings-test-utils": "^0.5.0",
     "config": "^3.3.6",

package/processing-config-schema.json
CHANGED

@@ -14,7 +14,7 @@
   "required": ["title"],
   "properties": {
     "id": {"type": "string", "title": "Identifiant (laissez vide pour calculer un identifiant à partir du titre)"},
-    "title": {"type": "string", "title": "Titre", "default": "
+    "title": {"type": "string", "title": "Titre", "default": "Web scraper"}
   }
 }