@xcrap/got-scraping-client 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +127 -0
- package/jest.config.ts +5 -0
- package/package.json +32 -0
- package/rollup.config.js +46 -0
- package/tsconfig.json +15 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Marcuth
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
# 🕷️ Xcrap Got Scraping Client
|
|
2
|
+
|
|
3
|
+
**Xcrap Got Scraping Client** is a package of the Xcrap framework that implements an HTTP client using the [Got Scraping](https://www.npmjs.com/package/got-scraping) library.
|
|
4
|
+
|
|
5
|
+
## 📦 Installation
|
|
6
|
+
|
|
7
|
+
There are no secrets to installing it, just use your favorite dependency manager. Here is an example using NPM:
|
|
8
|
+
|
|
9
|
+
```cmd
|
|
10
|
+
npm i @xcrap/got-scraping-client @xcrap/core @xcrap/parser
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
> You need to install `@xcrap/parser` and `@xcrap/core` as well because I left them as `peerDependencies`, which means that the package needs `@xcrap/parser` and `@xcrap/core` as dependencies, however, the ones that the user has installed in the project will be used.
|
|
14
|
+
|
|
15
|
+
## 🚀 Usage
|
|
16
|
+
|
|
17
|
+
Like any HTTP client, `GotScrapingClient` has two methods: `fetch()` to make a request to a specific URL and `fetchMany()` to make requests to multiple URLs at the same time, with control over concurrency and delays between requests.

### Example usage
|
|
18
|
+
|
|
19
|
+
```ts
|
|
20
|
+
import { GotScrapingClient } from "@xcrap/got-scraping-client"
|
|
21
|
+
import { extract } from "@xcrap/parser"
|
|
22
|
+
|
|
23
|
+
;(async() => {
|
|
24
|
+
const client = new GotScrapingClient()
|
|
25
|
+
const url = "https://example.com"
|
|
26
|
+
const response = await client.fetch({ url: url })
|
|
27
|
+
const parser = response.asHtmlParser()
|
|
28
|
+
const pageTitle = await parser.parseFirst({ query: "title", extractor: extract("innerText") })
|
|
29
|
+
|
|
30
|
+
console.log("Page Title:", pageTitle)
|
|
31
|
+
})();
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
### Adding a proxy
|
|
35
|
+
|
|
36
|
+
In an HTTP client that extends `BaseClient` we can add a proxy in the constructor as we can see in the following example:
|
|
37
|
+
|
|
38
|
+
1. **Providing a `proxy` string**:
|
|
39
|
+
|
|
40
|
+
```ts
|
|
41
|
+
const client = new GotScrapingClient({ proxy: "http://47.251.122.81:8888" })
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
2. **Providing a function that will generate a `proxy`**:
|
|
45
|
+
|
|
46
|
+
```ts
|
|
47
|
+
function randomProxy() {
|
|
48
|
+
const proxies = [
|
|
49
|
+
"http://47.251.122.81:8888",
|
|
50
|
+
"http://159.203.61.169:3128"
|
|
51
|
+
]
|
|
52
|
+
|
|
53
|
+
const randomIndex = Math.floor(Math.random() * proxies.length)
|
|
54
|
+
|
|
55
|
+
return proxies[randomIndex]
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
const client = new GotScrapingClient({ proxy: randomProxy })
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
### Using a custom User Agent
|
|
62
|
+
|
|
63
|
+
In a client that extends `BaseClient` we can also customize the `User-Agent` of the requests. We can do this in two ways:
|
|
64
|
+
|
|
65
|
+
1. **By providing a `userAgent` string**:
|
|
66
|
+
|
|
67
|
+
```ts
|
|
68
|
+
const client = new GotScrapingClient({ userAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36" })
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
2. **By providing a function that will generate a `userAgent`**:
|
|
72
|
+
|
|
73
|
+
```ts
|
|
74
|
+
function randomUserAgent() {
|
|
75
|
+
const userAgents = [
|
|
76
|
+
"Mozilla/5.0 (iPhone; CPU iPhone OS 9_8_4; like Mac OS X) AppleWebKit/603.37 (KHTML, like Gecko) Chrome/54.0.1244.188 Mobile Safari/601.5", "Mozilla/5.0 (Windows NT 10.3;; en-US) AppleWebKit/537.35 (KHTML, like Gecko) Chrome/47.0.1707.185 Safari/601"
|
|
77
|
+
]
|
|
78
|
+
|
|
79
|
+
const randomIndex = Math.floor(Math.random() * userAgents.length)
|
|
80
|
+
|
|
81
|
+
return userAgents[randomIndex]
|
|
82
|
+
}
|
|
83
|
+
|
|
84
|
+
const client = new GotScrapingClient({ userAgent: randomUserAgent })
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
### Using custom Proxy URL
|
|
88
|
+
|
|
89
|
+
In a client that extends `BaseClient` we can also use proxy URLs. I discovered this kind of proxy while trying to work around the CORS problem when making requests from the client side, which is how I came across the *CORS Proxy*. Here I have a [template](https://gist.github.com/marcuth/9fbd321b011da44d1287faae31a8dd3a) for one for CloudFlare Workers in case you want to roll your own.
|
|
90
|
+
|
|
91
|
+
Well, we can do it the same way we did with `userAgent`:
|
|
92
|
+
|
|
93
|
+
1. **Providing a `proxyUrl` string**:
|
|
94
|
+
|
|
95
|
+
```ts
|
|
96
|
+
const client = new GotScrapingClient({ proxyUrl: "https://my-proxy-app.my-username.workers.dev" })
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
2. **Providing a function that will generate a `proxyUrl`**:
|
|
100
|
+
|
|
101
|
+
```ts
|
|
102
|
+
function randomProxyUrl() {
|
|
103
|
+
const proxyUrls = [
|
|
104
|
+
"https://my-proxy-app.my-username-1.workers.dev",
|
|
105
|
+
"https://my-proxy-app.my-username-2.workers.dev"
|
|
106
|
+
]
|
|
107
|
+
|
|
108
|
+
const randomIndex = Math.floor(Math.random() * proxyUrls.length)
|
|
109
|
+
|
|
110
|
+
return proxyUrls[randomIndex]
|
|
111
|
+
}
|
|
112
|
+
|
|
113
|
+
const client = new GotScrapingClient({ proxyUrl: randomProxyUrl })
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
## 🤝 Contributing
|
|
117
|
+
|
|
118
|
+
Want to contribute? Follow these steps:
|
|
119
|
+
- Fork the repository.
|
|
120
|
+
- Create a new branch (git checkout -b feature-new).
|
|
121
|
+
- Commit your changes (git commit -m 'Add new feature').
|
|
122
|
+
- Push to the branch (git push origin feature-new).
|
|
123
|
+
- Open a Pull Request.
|
|
124
|
+
|
|
125
|
+
## 📝 License
|
|
126
|
+
|
|
127
|
+
This project is licensed under the MIT License.
|
package/jest.config.ts
ADDED
package/package.json
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@xcrap/got-scraping-client",
|
|
3
|
+
"version": "0.0.1",
|
|
4
|
+
"description": "",
|
|
5
|
+
"main": "index.js",
|
|
6
|
+
"scripts": {
|
|
7
|
+
"test": "echo \"Error: no test specified\" && exit 1"
|
|
8
|
+
},
|
|
9
|
+
"keywords": [],
|
|
10
|
+
"author": "",
|
|
11
|
+
"license": "MIT",
|
|
12
|
+
"type": "commonjs",
|
|
13
|
+
"devDependencies": {
|
|
14
|
+
"@rollup/plugin-commonjs": "^28.0.3",
|
|
15
|
+
"@rollup/plugin-node-resolve": "^16.0.1",
|
|
16
|
+
"@rollup/plugin-typescript": "^12.1.2",
|
|
17
|
+
"@types/node": "^22.13.17",
|
|
18
|
+
"header-generator": "^2.1.63",
|
|
19
|
+
"rollup": "^4.39.0",
|
|
20
|
+
"ts-node": "^10.9.2",
|
|
21
|
+
"tslib": "^2.8.1",
|
|
22
|
+
"typescript": "^5.8.2"
|
|
23
|
+
},
|
|
24
|
+
"dependencies": {
|
|
25
|
+
"got-scraping": "^4.1.1",
|
|
26
|
+
"load-esm": "^1.0.2"
|
|
27
|
+
},
|
|
28
|
+
"peerDependencies": {
|
|
29
|
+
"@xcrap/core": "^0.0.3",
|
|
30
|
+
"@xcrap/parser": "^0.0.2"
|
|
31
|
+
}
|
|
32
|
+
}
|
package/rollup.config.js
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
const typescript = require("@rollup/plugin-typescript")
|
|
2
|
+
const resolve = require("@rollup/plugin-node-resolve")
|
|
3
|
+
const commonjs = require("@rollup/plugin-commonjs")
|
|
4
|
+
const path = require("node:path")
|
|
5
|
+
const fs = require("node:fs")
|
|
6
|
+
|
|
7
|
+
function getAllTypeScriptFiles(dir) {
|
|
8
|
+
let files = []
|
|
9
|
+
|
|
10
|
+
fs.readdirSync(dir).forEach(file => {
|
|
11
|
+
const fullPath = path.join(dir, file)
|
|
12
|
+
|
|
13
|
+
if (fs.statSync(fullPath).isDirectory()) {
|
|
14
|
+
files = files.concat(getAllTypeScriptFiles(fullPath))
|
|
15
|
+
} else if (file.endsWith(".ts")) {
|
|
16
|
+
files.push(fullPath)
|
|
17
|
+
}
|
|
18
|
+
})
|
|
19
|
+
|
|
20
|
+
return files
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
const inputFiles = getAllTypeScriptFiles("src")
|
|
24
|
+
|
|
25
|
+
module.exports = [
|
|
26
|
+
{
|
|
27
|
+
input: inputFiles,
|
|
28
|
+
output: {
|
|
29
|
+
dir: "dist",
|
|
30
|
+
format: "cjs",
|
|
31
|
+
entryFileNames: "[name].cjs",
|
|
32
|
+
preserveModules: true,
|
|
33
|
+
},
|
|
34
|
+
plugins: [resolve(), commonjs(), typescript()]
|
|
35
|
+
},
|
|
36
|
+
{
|
|
37
|
+
input: inputFiles,
|
|
38
|
+
output: {
|
|
39
|
+
dir: "dist",
|
|
40
|
+
format: "esm",
|
|
41
|
+
entryFileNames: "[name].mjs",
|
|
42
|
+
preserveModules: true,
|
|
43
|
+
},
|
|
44
|
+
plugins: [resolve(), commonjs(), typescript()]
|
|
45
|
+
}
|
|
46
|
+
]
|
package/tsconfig.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
{
|
|
2
|
+
"compilerOptions": {
|
|
3
|
+
"target": "es2019",
|
|
4
|
+
"module": "NodeNext",
|
|
5
|
+
"declaration": true,
|
|
6
|
+
"esModuleInterop": true,
|
|
7
|
+
"forceConsistentCasingInFileNames": true,
|
|
8
|
+
"strict": true,
|
|
9
|
+
"skipLibCheck": true,
|
|
10
|
+
"outDir": "./dist",
|
|
11
|
+
"rootDir": "./src"
|
|
12
|
+
},
|
|
13
|
+
"include": ["./src"],
|
|
14
|
+
"exclude": ["./node_modules", "./dist"]
|
|
15
|
+
}
|