@data-fair/processing-web-scraper 0.6.0 → 0.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/scraper.ts +5 -3
- package/package.json +7 -5
package/lib/scraper.ts
CHANGED
@@ -1,7 +1,7 @@
 import { createHash } from 'node:crypto'
 import UrlPattern from 'url-pattern'
 import robotsParser from 'robots-parser'
-import cheerio from 'cheerio'
+import * as cheerio from 'cheerio'
 import FormData from 'form-data'
 import type { ProcessingContext } from '@data-fair/lib-common-types/processings.js'
 import type { ProcessingConfig } from '../types/processingConfig/index.ts'
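The only change in the import block follows cheerio's 1.x ESM packaging: the package no longer exposes a default export, so the default import is replaced by a namespace import (matching the dependency bump to ^1.2.0 further down). A minimal sketch of the updated usage, with an illustrative HTML snippet:

import * as cheerio from 'cheerio'

// load() parses an HTML string and returns the jQuery-like traversal API
const $ = cheerio.load('<h1>Hello</h1>')
console.log($('h1').text()) // -> Hello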
@@ -84,6 +84,8 @@ interface Page {
   parentId?: string
 }
 
+type UrlPatternWithHostname = UrlPattern & { hostname: string }
+
 class PagesIterator {
   pages: Page[] = []
   cursor = -1
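The new UrlPatternWithHostname alias is an intersection type: url-pattern's own typings know nothing about a hostname field, and the scraper stores one on each pattern instance, so the intersection makes that extra property visible to the compiler without resorting to any. A self-contained sketch of the technique (pattern and hostname values are illustrative):

import UrlPattern from 'url-pattern'

type UrlPatternWithHostname = UrlPattern & { hostname: string }

// the cast widens the instance so the extra field can be assigned type-safely
const pattern = new UrlPattern('/docs/:page') as UrlPatternWithHostname
pattern.hostname = 'example.com'

pattern.match('/docs/intro') // { page: 'intro' }
pattern.hostname             // 'example.com'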
@@ -100,7 +102,7 @@ class PagesIterator {
     this.robots = robots
     this.excludeURLPatterns = (processingConfig.excludeURLPatterns || []).map((p: string) => {
       const url = new URL(p)
-      const pattern = new UrlPattern(url.pathname)
+      const pattern = new UrlPattern(url.pathname) as UrlPatternWithHostname
       pattern.hostname = url.hostname
       return pattern
     })
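With the cast in place, excludeURLPatterns becomes a typed array of patterns that each carry their origin's hostname. The diff doesn't show the matching side, but a check over such patterns would presumably look like this hypothetical helper (isExcluded is not part of the package):

const isExcluded = (patterns: UrlPatternWithHostname[], candidate: string): boolean => {
  const url = new URL(candidate)
  // exclude only when both the hostname and the pathname pattern match
  return patterns.some((p) => p.hostname === url.hostname && p.match(url.pathname) !== null)
}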
@@ -167,7 +169,7 @@ export const run = async (context: ProcessingContext<ProcessingConfig>) => {
     if (robots[origin]) continue
     try {
       const response = await axios.get(origin + '/robots.txt')
-      robots[origin] = robotsParser(origin + '/robots.txt', response.data)
+      robots[origin] = (robotsParser as any)(origin + '/robots.txt', response.data)
       for (const sitemap of robots[origin].getSitemaps()) {
         if (!sitemaps.includes(sitemap)) {
           await log.info(`add sitemap found in robots.txt ${sitemap}`)
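The (robotsParser as any) cast is a type-level workaround only; the runtime call is unchanged. robots-parser is a CommonJS module, and under this project's ESM/type-stripping setup tsc presumably reports the default import as non-callable even though it is callable at runtime, so the cast silences the call-signature error. A sketch of the parser API used here (sample URL and rules are illustrative):

import robotsParser from 'robots-parser'

// mirror the diff: bypass the mismatched call signature, keep runtime behaviour
const robots = (robotsParser as any)('https://example.com/robots.txt', 'User-agent: *\nDisallow: /private')

robots.isAllowed('https://example.com/', 'my-bot') // true
robots.getSitemaps()                               // [] (none declared above)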
package/package.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@data-fair/processing-web-scraper",
-  "version": "0.6.0",
+  "version": "0.6.1",
   "description": "A small Web scraper that publishes its data into data-fair datasets.",
   "main": "index.ts",
   "type": "module",
@@ -10,7 +10,9 @@
     "build-types": "export NODE_OPTIONS='--experimental-strip-types' && df-build-types ./",
     "prepare": "husky || true",
     "test-base": "NODE_ENV=test node --experimental-strip-types --test-force-exit --test-concurrency=1 --test --test-reporter=spec --test-reporter-destination=stdout --test-timeout=300000",
-    "test": "npm run test-base test-it/*.ts"
+    "test": "npm run test-base test-it/*.ts",
+    "check-types": "tsc",
+    "quality": "npm run lint && npm run check-types && npm test"
   },
   "repository": {
     "type": "git",
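The two new scripts split type checking out of the build: check-types runs tsc as a pure checker, and quality chains lint, type check, and tests into one gate (npm run quality). For tsc to work that way in a project that executes .ts files directly via --experimental-strip-types and imports them with explicit .ts extensions, the tsconfig.json needs roughly the following shape; this is a sketch of assumed settings, not the project's actual configuration:

{
  "compilerOptions": {
    "noEmit": true,                     // type-check only, never emit JS
    "module": "nodenext",               // match Node's ESM resolution rules
    "allowImportingTsExtensions": true  // permit the '.ts' import specifiers seen above
  }
}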
@@ -42,9 +44,9 @@
   },
   "dependencies": {
     "@data-fair/lib-utils": "^1.9.0",
-    "cheerio": "^1.
-    "form-data": "^4.0.
-    "robots-parser": "^3.0.
+    "cheerio": "^1.2.0",
+    "form-data": "^4.0.5",
+    "robots-parser": "^3.0.1",
     "url-pattern": "^1.0.3"
   },
   "files": [