@data-fair/processing-web-scraper 0.1.0 → 0.2.0

package/index.js CHANGED
@@ -84,6 +84,7 @@ class PagesIterator {
  if (typeof page === 'string') page = { url: page }
  if (!this.processingConfig.baseURLs.find(b => page.url.startsWith(b))) return
  page.parsedURL = page.parsedURL || new URL(page.url)
+ if (page.parsedURL.hash) return
  if (this.robots[page.parsedURL.origin] && !this.robots[page.parsedURL.origin].isAllowed(page.url, this.pluginConfig.userAgent || 'data-fair-web-scraper')) {
  return
  }
@@ -139,15 +140,27 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
  const pages = new PagesIterator(log, pluginConfig, processingConfig, robots)

  await log.step('Init pages list')
- await log.info(`add ${processingConfig.startURLs.length} pages from config`)
- for (const url of processingConfig.startURLs) await pages.push({ url, source: 'config start URLs' })
+ let existingPages
  if (processingConfig.datasetMode === 'update') {
- const existingPages = (await axios.get(`api/v1/datasets/${dataset.id}/lines`, { params: { select: '_id,url,etag,lastModified', size: 10000 } })).data.results
+ existingPages = (await axios.get(`api/v1/datasets/${dataset.id}/lines`, { params: { select: '_id,url,etag,lastModified', size: 10000 } })).data.results
  await log.info(`add ${existingPages.length} pages from previous crawls`)
- for (const page of existingPages) await pages.push({ page, source: 'previous exploration' })
+ for (const page of existingPages) {
+ page.parsedURL = new URL(page.url)
+ if (page.parsedURL.hash) {
+ const parentURL = new URL(page.parsedURL)
+ parentURL.hash = ''
+ page.parentId = getId({ url: parentURL.href })
+ }
+ await pages.push({ ...page, source: 'previous exploration' })
+ }
+ }
+ await log.info(`add ${processingConfig.startURLs.length} pages from config`)
+ for (const url of processingConfig.startURLs) {
+ await pages.push({ url, source: 'config start URLs' })
  }
  // TODO: init from sitemap (and use robots.getSitemaps() to help in this)

+ const sentIds = new Set([])
  const sendPage = async (page, data, contentType = 'text/html', filename = 'content.html') => {
  await log.debug('send page', page.url)
  // TODO: apply no-index rules
@@ -169,7 +182,10 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
  knownLength: data.length
  }
  form.append('attachment', data, dataOpts)
+ if (page.lastModified) form.append('lastModified', page.lastModified)
+ if (page.etag) form.append('etag', page.etag)
  page._id = getId(page)
+ sentIds.add(page._id)
  const headers = {
  ...form.getHeaders(),
  'content-length': form.getLengthSync()
@@ -189,10 +205,21 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
  await new Promise(resolve => setTimeout(resolve, crawlDelay * 1000))

  // TODO: apply if-none-match and if-modified-since headers if etag or lastModified are available
+ const headers = { 'user-agent': pluginConfig.userAgent || 'data-fair-web-scraper' }
+ if (page.lastModified) headers['if-modified-since'] = page.lastModified
+ if (page.etag) headers['if-none-match'] = page.etag
  let response
  try {
- response = await axios.get(page.url, { headers: { 'user-agent': pluginConfig.userAgent || 'data-fair-web-scraper' } })
+ response = await axios.get(page.url, { headers })
  } catch (err) {
+ if (err.status === 304) {
+ await log.debug(`page was not modified since last exploration ${page.url}`)
+ sentIds.add(page._id)
+ for (const existingPage of existingPages) {
+ if (existingPage.parentId === page._id) sentIds.add(existingPage._id)
+ }
+ continue
+ }
  await log.warning(`failed to fetch page ${page.url} - ${err.status || err.message}`)
  if (page.source) await log.warning(`this broken URL comes from ${page.source}`)
  continue
@@ -205,6 +232,9 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
  }
  }

+ page.lastModified = response.headers['last-modified']
+ page.etag = response.headers.etag
+
  const isHTML = (response.headers['content-type'] && response.headers['content-type'].startsWith('text/html;')) || (typeof response.data === 'string' && response.data.trim().startsWith('<html'))
  if (isHTML) {
  const cheerio = require('cheerio')
@@ -237,10 +267,9 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
  const fragment = anchor.wrapperSelector ? targetElement.closest(anchor.wrapperSelector) : targetElement
  const fragmentHtml = fragment.html()
  if (fragmentHtml) {
- const anchorPage = { url: parsedURL.href }
+ const anchorPage = { url: parsedURL.href, tags: anchor.tags || [], source: 'anchor ' + page.url }
  if (anchor.titleSelector) anchorPage.title = fragment.find(anchor.titleSelector).text() || page.title
  else anchorPage.title = targetElement.text() || page.title
- anchorPage.tags = anchor.tags || []
  anchorsPages.push([anchorPage, fragmentHtml])
  $(fragment).remove()
  }
@@ -248,7 +277,6 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
  }
  })
  for (const [anchorPage, fragmentHtml] of anchorsPages) {
- console.log(anchorPage)
  await sendPage(anchorPage, `<body>
  ${fragmentHtml}
  </body>`)
@@ -258,7 +286,7 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
  if (!page.nofollow) {
  $('a').each(function (i, elem) {
  const href = $(this).attr('href')
- if (href) pages.push({ url: new URL(href, page.url).href, source: page.url })
+ if (href) pages.push({ url: new URL(href, page.url).href, source: 'link ' + page.url })
  })
  }

@@ -270,6 +298,15 @@ exports.run = async ({ pluginConfig, processingConfig, processingId, dir, tmpDir
  }
  }
  }
+
+ if (existingPages) {
+ for (const existingPage of existingPages) {
+ if (!sentIds.has(existingPage._id)) {
+ await log.info('delete previously explored page that was not indexed this time', existingPage.url)
+ await axios.delete(`api/v1/datasets/${dataset.id}/lines/${existingPage._id}`)
+ }
+ }
+ }
  }

  // used to manage interruption
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "@data-fair/processing-web-scraper",
- "version": "0.1.0",
+ "version": "0.2.0",
  "description": "A small Web scraper that publishes its data into data-fair datasets.",
  "main": "index.js",
  "scripts": {
@@ -20,7 +20,7 @@
  "bugs": {
  "url": "https://github.com/data-fair/processing-web-scraper/issues"
  },
- "homepage": "https://github.com/data-fair/processingshello-world#readme",
+ "homepage": "https://github.com/data-fair/processing-web-scraper#readme",
  "devDependencies": {
  "@data-fair/processings-test-utils": "^0.5.0",
  "config": "^3.3.6",
@@ -14,7 +14,7 @@
  "required": ["title"],
  "properties": {
  "id": {"type": "string", "title": "Identifiant (laissez vide pour calculer un identifiant à partir du titre)"},
- "title": {"type": "string", "title": "Titre", "default": "Hello world "}
+ "title": {"type": "string", "title": "Titre", "default": "Web scraper"}
  }
  }
  }