@data-fair/processing-web-scraper 0.6.0 → 0.6.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/lib/scraper.ts +13 -5
  2. package/package.json +7 -5
package/lib/scraper.ts CHANGED
```diff
@@ -1,7 +1,7 @@
 import { createHash } from 'node:crypto'
 import UrlPattern from 'url-pattern'
 import robotsParser from 'robots-parser'
-import cheerio from 'cheerio'
+import * as cheerio from 'cheerio'
 import FormData from 'form-data'
 import type { ProcessingContext } from '@data-fair/lib-common-types/processings.js'
 import type { ProcessingConfig } from '../types/processingConfig/index.ts'
```
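The import change follows from the dependency bump below: cheerio dropped its default export when it moved from the 1.0 release candidates to the stable 1.x line, so ESM consumers must use a namespace (or named) import. A minimal sketch of the new form, with an illustrative HTML string:

```ts
// cheerio 1.x stable exposes only named exports, so the old
// `import cheerio from 'cheerio'` no longer resolves.
import * as cheerio from 'cheerio'

const $ = cheerio.load('<ul><li>a</li><li>b</li></ul>')
// Select and extract text, as the scraper does with fetched pages.
console.log($('li').first().text()) // "a"
```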
```diff
@@ -84,6 +84,8 @@ interface Page {
   parentId?: string
 }
 
+type UrlPatternWithHostname = UrlPattern & { hostname: string }
+
 class PagesIterator {
   pages: Page[] = []
   cursor = -1
@@ -100,7 +102,7 @@ class PagesIterator {
     this.robots = robots
     this.excludeURLPatterns = (processingConfig.excludeURLPatterns || []).map((p: string) => {
       const url = new URL(p)
-      const pattern = new UrlPattern(url.pathname)
+      const pattern = new UrlPattern(url.pathname) as UrlPatternWithHostname
       pattern.hostname = url.hostname
       return pattern
     })
```
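These two hunks work together: `url-pattern` matches only path strings, so each exclude URL is split into a path pattern plus a `hostname` property attached to the instance, and the intersection type keeps that extra property visible to the compiler instead of falling back to `any`. A self-contained sketch of the same pattern, using a made-up exclude URL:

```ts
import UrlPattern from 'url-pattern'

// Widen the library type to carry one extra property without losing its methods.
type UrlPatternWithHostname = UrlPattern & { hostname: string }

const url = new URL('https://example.com/admin/*')
const pattern = new UrlPattern(url.pathname) as UrlPatternWithHostname
pattern.hostname = url.hostname

// A candidate page is excluded only if both the hostname and the path match.
const candidate = new URL('https://example.com/admin/users')
const excluded = pattern.hostname === candidate.hostname && pattern.match(candidate.pathname) !== null
console.log(excluded) // true
```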
```diff
@@ -167,7 +169,7 @@ export const run = async (context: ProcessingContext<ProcessingConfig>) => {
     if (robots[origin]) continue
     try {
       const response = await axios.get(origin + '/robots.txt')
-      robots[origin] = robotsParser(origin + '/robots.txt', response.data)
+      robots[origin] = (robotsParser as any)(origin + '/robots.txt', response.data)
       for (const sitemap of robots[origin].getSitemaps()) {
         if (!sitemaps.includes(sitemap)) {
           await log.info(`add sitemap found in robots.txt ${sitemap}`)
```
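The `(robotsParser as any)` cast presumably silences a call-signature mismatch in `robots-parser`'s typings under this project's module settings. A narrower alternative is to assert only the shape actually used; the type below is an assumption covering `getSitemaps` from this hunk, plus `isAllowed`, the usual consumer of a robots object:

```ts
import robotsParser from 'robots-parser'

// Assumed minimal shape: just the members this code calls.
type RobotsParserFn = (url: string, body: string) => {
  isAllowed: (url: string, ua?: string) => boolean | undefined
  getSitemaps: () => string[]
}

const parse = robotsParser as unknown as RobotsParserFn
const robots = parse(
  'https://example.com/robots.txt',
  'User-agent: *\nDisallow: /private\nSitemap: https://example.com/sitemap.xml'
)
console.log(robots.getSitemaps()) // ['https://example.com/sitemap.xml']
```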
```diff
@@ -277,8 +279,14 @@ export const run = async (context: ProcessingContext<ProcessingConfig>) => {
         continue
       }
       if (err.status === 301 || err.status === 302) {
-        await log.debug(`page redirected ${page.url} -> ${err.headers.location}`)
-        await pages.push({ url: new URL(err.headers.location, page.url).href, source: 'redirect ' + page.url })
+        // the error shape differs between the processings service and the dev env
+        const location = err.headers?.location ?? err.response?.headers?.location
+        if (!location) {
+          await log.error('redirection without a location header')
+        } else {
+          await log.debug(`page redirected ${page.url} -> ${location}`)
+          await pages.push({ url: new URL(location, page.url).href, source: 'redirect ' + page.url })
+        }
         continue
       }
       await log.warning(`failed to fetch page ${page.url} - ${err.status || err.message}`)
```
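The rewritten redirect handling reads the `Location` header from two places because, per the comment, the error object is shaped differently in the processings service and in a dev environment using plain axios, where the header sits on `err.response.headers`. A sketch of how a redirect surfaces as an axios error (URL illustrative):

```ts
import axios from 'axios'

try {
  // maxRedirects: 0 keeps axios from following the redirect, so the 301/302
  // fails the default validateStatus check and the promise rejects.
  await axios.get('https://example.com/old-page', { maxRedirects: 0 })
} catch (err: any) {
  // Plain axios: header on err.response; some client wrappers flatten it onto err.
  const location = err.headers?.location ?? err.response?.headers?.location
  if (location) {
    // Resolve relative Location values against the requested URL.
    console.log(new URL(location, 'https://example.com/old-page').href)
  }
}
```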
package/package.json CHANGED
```diff
@@ -1,6 +1,6 @@
 {
   "name": "@data-fair/processing-web-scraper",
-  "version": "0.6.0",
+  "version": "0.6.2",
   "description": "A small Web scraper that publishes its data into data-fair datasets.",
   "main": "index.ts",
   "type": "module",
@@ -10,7 +10,9 @@
     "build-types": "export NODE_OPTIONS='--experimental-strip-types' && df-build-types ./",
     "prepare": "husky || true",
     "test-base": "NODE_ENV=test node --experimental-strip-types --test-force-exit --test-concurrency=1 --test --test-reporter=spec --test-reporter-destination=stdout --test-timeout=300000",
-    "test": "npm run test-base test-it/*.ts"
+    "test": "npm run test-base test-it/*.ts",
+    "check-types": "tsc",
+    "quality": "npm run lint && npm run check-types && npm test"
   },
   "repository": {
     "type": "git",
```
```diff
@@ -42,9 +44,9 @@
   },
   "dependencies": {
     "@data-fair/lib-utils": "^1.9.0",
-    "cheerio": "^1.0.0-rc.12",
-    "form-data": "^4.0.0",
-    "robots-parser": "^3.0.0",
+    "cheerio": "^1.2.0",
+    "form-data": "^4.0.5",
+    "robots-parser": "^3.0.1",
     "url-pattern": "^1.0.3"
   },
   "files": [
```