@data-fair/processing-web-scraper 0.1.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/index.js CHANGED
@@ -84,6 +84,7 @@ class PagesIterator {
  if (typeof page === 'string') page = { url: page }
  if (!this.processingConfig.baseURLs.find(b => page.url.startsWith(b))) return
  page.parsedURL = page.parsedURL || new URL(page.url)
+ if (page.parsedURL.hash) return
  if (this.robots[page.parsedURL.origin] && !this.robots[page.parsedURL.origin].isAllowed(page.url, this.pluginConfig.userAgent || 'data-fair-web-scraper')) {
  return
  }
@@ -139,15 +140,27 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
  const pages = new PagesIterator(log, pluginConfig, processingConfig, robots)

  await log.step('Init pages list')
- await log.info(`add ${processingConfig.startURLs.length} pages from config`)
- for (const url of processingConfig.startURLs) await pages.push({ url, source: 'config start URLs' })
+ let existingPages
  if (processingConfig.datasetMode === 'update') {
- const existingPages = (await axios.get(`api/v1/datasets/${dataset.id}/lines`, { params: { select: '_id,url,etag,lastModified', size: 10000 } })).data.results
+ existingPages = (await axios.get(`api/v1/datasets/${dataset.id}/lines`, { params: { select: '_id,url,etag,lastModified', size: 10000 } })).data.results
  await log.info(`add ${existingPages.length} pages from previous crawls`)
- for (const page of existingPages) await pages.push({ page, source: 'previous exploration' })
+ for (const page of existingPages) {
+ page.parsedURL = new URL(page.url)
+ if (page.parsedURL.hash) {
+ const parentURL = new URL(page.parsedURL)
+ parentURL.hash = ''
+ page.parentId = getId({ url: parentURL.href })
+ }
+ await pages.push({ ...page, source: 'previous exploration' })
+ }
+ }
+ await log.info(`add ${processingConfig.startURLs.length} pages from config`)
+ for (const url of processingConfig.startURLs) {
+ await pages.push({ url, source: 'config start URLs' })
  }
  // TODO: init from sitemap (and use robots.getSitemaps() to help in this)

+ const sentIds = new Set([])
  const sendPage = async (page, data, contentType = 'text/html', filename = 'content.html') => {
  await log.debug('send page', page.url)
  // TODO: apply no-index rules
@@ -169,7 +182,10 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
  knownLength: data.length
  }
  form.append('attachment', data, dataOpts)
+ if (page.lastModified) form.append('lastModified', page.lastModified)
+ if (page.etag) form.append('etag', page.etag)
  page._id = getId(page)
+ sentIds.add(page._id)
  const headers = {
  ...form.getHeaders(),
  'content-length': form.getLengthSync()
@@ -189,10 +205,28 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
  await new Promise(resolve => setTimeout(resolve, crawlDelay * 1000))

  // TODO: apply if-none-match and if-modified-since headers if etag or lastModified are available
+ const headers = { 'user-agent': pluginConfig.userAgent || 'data-fair-web-scraper' }
+ if (page.lastModified) headers['if-modified-since'] = page.lastModified
+ if (page.etag) headers['if-none-match'] = page.etag
  let response
  try {
- response = await axios.get(page.url, { headers: { 'user-agent': pluginConfig.userAgent || 'data-fair-web-scraper' } })
+ response = await axios.get(page.url, { headers, maxRedirects: 0 })
  } catch (err) {
+ // content did not change
+ if (err.status === 304) {
+ await log.debug(`page was not modified since last exploration ${page.url}`)
+ sentIds.add(page._id)
+ for (const existingPage of existingPages) {
+ if (existingPage.parentId === page._id) sentIds.add(existingPage._id)
+ }
+ continue
+ }
+ // follow a redirect
+ if (err.status === 301) {
+ await log.debug(`page redirected ${page.url} -> ${err.headers.location}`)
+ pages.push({ url: new URL(err.headers.location, page.url).href, source: 'redirect ' + page.url })
+ continue
+ }
  await log.warning(`failed to fetch page ${page.url} - ${err.status || err.message}`)
  if (page.source) await log.warning(`this broken URL comes from ${page.source}`)
  continue
@@ -205,6 +239,9 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
  }
  }

+ page.lastModified = response.headers['last-modified']
+ page.etag = response.headers.etag
+
  const isHTML = (response.headers['content-type'] && response.headers['content-type'].startsWith('text/html;')) || (typeof response.data === 'string' && response.data.trim().startsWith('<html'))
  if (isHTML) {
  const cheerio = require('cheerio')
@@ -237,10 +274,9 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
  const fragment = anchor.wrapperSelector ? targetElement.closest(anchor.wrapperSelector) : targetElement
  const fragmentHtml = fragment.html()
  if (fragmentHtml) {
- const anchorPage = { url: parsedURL.href }
+ const anchorPage = { url: parsedURL.href, tags: anchor.tags || [], source: 'anchor ' + page.url }
  if (anchor.titleSelector) anchorPage.title = fragment.find(anchor.titleSelector).text() || page.title
  else anchorPage.title = targetElement.text() || page.title
- anchorPage.tags = anchor.tags || []
  anchorsPages.push([anchorPage, fragmentHtml])
  $(fragment).remove()
  }
@@ -248,7 +284,6 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
  }
  })
  for (const [anchorPage, fragmentHtml] of anchorsPages) {
- console.log(anchorPage)
  await sendPage(anchorPage, `<body>
  ${fragmentHtml}
  </body>`)
@@ -258,7 +293,7 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
  if (!page.nofollow) {
  $('a').each(function (i, elem) {
  const href = $(this).attr('href')
- if (href) pages.push({ url: new URL(href, page.url).href, source: page.url })
+ if (href) pages.push({ url: new URL(href, page.url).href, source: 'link ' + page.url })
  })
  }

@@ -270,6 +305,15 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
  }
  }
  }
+
+ if (existingPages) {
+ for (const existingPage of existingPages) {
+ if (!sentIds.has(existingPage._id)) {
+ await log.info('delete previously explored page that was not indexed this time', existingPage.url)
+ await axios.delete(`api/v1/datasets/${dataset.id}/lines/${existingPage._id}`)
+ }
+ }
+ }
  }

  // used to manage interruption
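
For reference, a minimal standalone sketch (not from the package) of the conditional-request pattern the index.js changes introduce, written against plain axios; the fetchPage name, its parameters, and its return shape are illustrative assumptions only:

// Sketch: conditional GET using validators saved from a previous crawl.
// Unlike the plugin code above, 304 and 301 are accepted via validateStatus
// instead of being handled in a catch block.
const axios = require('axios')

async function fetchPage (page, userAgent = 'data-fair-web-scraper') {
  const headers = { 'user-agent': userAgent }
  if (page.lastModified) headers['if-modified-since'] = page.lastModified
  if (page.etag) headers['if-none-match'] = page.etag

  const response = await axios.get(page.url, {
    headers,
    maxRedirects: 0, // surface redirects instead of following them silently
    validateStatus: (status) => (status >= 200 && status < 300) || status === 304 || status === 301
  })

  if (response.status === 304) return { unchanged: true } // keep previously indexed lines
  if (response.status === 301) return { redirect: new URL(response.headers.location, page.url).href }

  // remember validators so the next crawl can send conditional headers again
  page.lastModified = response.headers['last-modified']
  page.etag = response.headers.etag
  return { response }
}

Accepting 304 and 301 as valid statuses is just one way to structure this; the published plugin instead handles them in its catch block, using the axios instance provided by the processing context (apparently preconfigured, given the relative api/v1/... URLs).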
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "@data-fair/processing-web-scraper",
- "version": "0.1.0",
+ "version": "0.2.1",
  "description": "A small Web scraper that publishes its data into data-fair datasets.",
  "main": "index.js",
  "scripts": {
@@ -20,9 +20,9 @@
  "bugs": {
  "url": "https://github.com/data-fair/processing-web-scraper/issues"
  },
- "homepage": "https://github.com/data-fair/processingshello-world#readme",
+ "homepage": "https://github.com/data-fair/processing-web-scraper#readme",
  "devDependencies": {
- "@data-fair/processings-test-utils": "^0.5.0",
+ "@data-fair/processings-test-utils": "^0.5.1",
  "config": "^3.3.6",
  "eslint": "^7.18.0",
  "express": "^4.18.2",
@@ -14,7 +14,7 @@
  "required": ["title"],
  "properties": {
  "id": {"type": "string", "title": "Identifiant (laissez vide pour calculer un identifiant à partir du titre)"},
- "title": {"type": "string", "title": "Titre", "default": "Hello world "}
+ "title": {"type": "string", "title": "Titre", "default": "Web scraper"}
  }
  }
  }