@xcrap/puppeteer-client 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +177 -0
- package/dist/constants.cjs +5 -0
- package/dist/constants.d.ts +2 -0
- package/dist/constants.mjs +3 -0
- package/dist/index.cjs +169 -0
- package/dist/index.d.ts +39 -0
- package/dist/index.mjs +167 -0
- package/package.json +41 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Marcuth
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
# 🕷️ Xcrap Puppeteer Client
|
|
2
|
+
|
|
3
|
+
**Xcrap Puppeteer Client** is a package of the Xcrap framework that implements an HTTP client using the [Puppeteer](https://www.npmjs.com/package/puppeteer) library.
|
|
4
|
+
## 📦 Installation
|
|
5
|
+
|
|
6
|
+
There are no secrets to installing it, just use your favorite dependency manager. Here is an example using NPM:
|
|
7
|
+
|
|
8
|
+
```cmd
|
|
9
|
+
npm i @xcrap/puppeteer-client @xcrap/core @xcrap/parser
|
|
10
|
+
```
|
|
11
|
+
|
|
12
|
+
> You also need to install `@xcrap/parser` and `@xcrap/core` because they are declared as `peerDependencies`: the package depends on them, but it uses the versions already installed in your project instead of bundling its own.
|
|
13
|
+
|
|
14
|
+
## 🚀 Usage
|
|
15
|
+
|
|
16
|
+
Like any HTTP client, `PuppeteerClient` has two methods: `fetch()` to make a request to a specific URL and `fetchMany()` to make requests to multiple URLs at the same time, with control over concurrency and delays between requests.

#### Example usage
|
|
17
|
+
|
|
18
|
+
```ts
|
|
19
|
+
import { PuppeteerClient } from "@xcrap/puppeteer-client"
|
|
20
|
+
import { extract } from "@xcrap/parser"
|
|
21
|
+
|
|
22
|
+
;(async() => {
|
|
23
|
+
const client = new PuppeteerClient()
|
|
24
|
+
const url = "https://example.com"
|
|
25
|
+
const response = await client.fetch({ url: url })
|
|
26
|
+
const parser = response.asHtmlParser()
|
|
27
|
+
const pageTitle = await parser.parseFirst({ query: "title", extractor: extract("innerText") })
|
|
28
|
+
|
|
29
|
+
console.log("Page Title:", pageTitle)
|
|
30
|
+
})();
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
#### Using Actions
|
|
34
|
+
|
|
35
|
+
If you want to perform operations on the page before or after requests, you can use the `actions` property, which is an array of functions. Actions are flexible enough that you can do exactly what you would normally do with Puppeteer: login, click buttons, evaluate functions, etc.
|
|
36
|
+
|
|
37
|
+
```ts
|
|
38
|
+
const response = await client.fetch({
|
|
39
|
+
url: "https://example.com",
|
|
40
|
+
actions: [
|
|
41
|
+
async (page) => {
|
|
42
|
+
await page.type("#username", "user")
|
|
43
|
+
await page.type("#password", "mypassword123")
|
|
44
|
+
await page.click("#submit")
|
|
45
|
+
}
|
|
46
|
+
]
|
|
47
|
+
})
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
By default, an action is executed after requests. If you want to manually define when it will be executed, you will have to pass an object instead of a simple function:
|
|
51
|
+
|
|
52
|
+
```ts
|
|
53
|
+
const response = await client.fetch({
|
|
54
|
+
url: "https://example.com",
|
|
55
|
+
actions: [
|
|
56
|
+
{
|
|
57
|
+
type: "afterRequest", // Executed after the request
|
|
58
|
+
exec: async (page) => {
|
|
59
|
+
await page.type("#username", "user")
|
|
60
|
+
await page.type("#password", "mypassword123")
|
|
61
|
+
await page.click("#submit")
|
|
62
|
+
}
|
|
63
|
+
},
|
|
64
|
+
{
|
|
65
|
+
type: "beforeRequest", // Executed before the request
|
|
66
|
+
exec: async (page) => {
|
|
67
|
+
const width = 1920 + Math.floor(Math.random() * 100)
|
|
68
|
+
const height = 3000 + Math.floor(Math.random() * 100)
|
|
69
|
+
|
|
70
|
+
await page.setViewport({
|
|
71
|
+
width: width,
|
|
72
|
+
height: height,
|
|
73
|
+
deviceScaleFactor: 1,
|
|
74
|
+
hasTouch: false,
|
|
75
|
+
isLandscape: false,
|
|
76
|
+
isMobile: false,
|
|
77
|
+
})
|
|
78
|
+
}
|
|
79
|
+
}
|
|
80
|
+
]
|
|
81
|
+
})
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
#### Adding a proxy
|
|
85
|
+
|
|
86
|
+
In an HTTP client that extends `BaseClient` we can add a proxy in the constructor as we can see in the following example:
|
|
87
|
+
|
|
88
|
+
1. **Providing a `proxy` string:**
|
|
89
|
+
|
|
90
|
+
```ts
|
|
91
|
+
const client = new PuppeteerClient({ proxy: "http://47.251.122.81:8888" })
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
2. **Providing a function that will generate a `proxy`:**
|
|
95
|
+
|
|
96
|
+
```ts
|
|
97
|
+
function randomProxy() {
|
|
98
|
+
const proxies = [
|
|
99
|
+
"http://47.251.122.81:8888",
|
|
100
|
+
"http://159.203.61.169:3128"
|
|
101
|
+
]
|
|
102
|
+
|
|
103
|
+
const randomIndex = Math.floor(Math.random() * proxies.length)
|
|
104
|
+
|
|
105
|
+
return proxies[randomIndex]
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
const client = new PuppeteerClient({ proxy: randomProxy })
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
#### Using a custom User Agent
|
|
112
|
+
|
|
113
|
+
In a client that extends `BaseClient` we can also customize the `User-Agent` of the requests. We can do this in two ways:
|
|
114
|
+
|
|
115
|
+
1. **By providing a `userAgent` string:**
|
|
116
|
+
|
|
117
|
+
```ts
|
|
118
|
+
const client = new PuppeteerClient({ userAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36" })
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
2. **By providing a function that will generate a `userAgent`:**
|
|
122
|
+
|
|
123
|
+
```ts
|
|
124
|
+
function randomUserAgent() {
|
|
125
|
+
const userAgents = [
|
|
126
|
+
"Mozilla/5.0 (iPhone; CPU iPhone OS 9_8_4; like Mac OS X) AppleWebKit/603.37 (KHTML, like Gecko) Chrome/54.0.1244.188 Mobile Safari/601.5", "Mozilla/5.0 (Windows NT 10.3;; en-US) AppleWebKit/537.35 (KHTML, like Gecko) Chrome/47.0.1707.185 Safari/601"
|
|
127
|
+
]
|
|
128
|
+
|
|
129
|
+
const randomIndex = Math.floor(Math.random() * userAgents.length)
|
|
130
|
+
|
|
131
|
+
return userAgents[randomIndex]
|
|
132
|
+
}
|
|
133
|
+
|
|
134
|
+
const client = new PuppeteerClient({ userAgent: randomUserAgent })
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
#### Using custom Proxy URL
|
|
138
|
+
|
|
139
|
+
In a client that extends `BaseClient` we can also use proxy URLs. A proxy URL is the address of a service that forwards your request to the target site: the client prefixes the proxy URL to the target URL and requests the combined address. I discovered this kind of proxy while trying to work around the CORS problem when making requests on the client side, which is how I came across the *CORS Proxy*. Here is a [template](https://gist.github.com/marcuth/9fbd321b011da44d1287faae31a8dd3a) for Cloudflare Workers in case you want to roll your own.
|
|
140
|
+
|
|
141
|
+
Well, we can do it the same way we did with `userAgent`:
|
|
142
|
+
|
|
143
|
+
1. **Providing a `proxyUrl` string:**
|
|
144
|
+
|
|
145
|
+
```ts
|
|
146
|
+
const client = new PuppeteerClient({ proxyUrl: "https://my-proxy-app.my-username.workers.dev" })
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
2. **Providing a function that will generate a `proxyUrl`:**
|
|
150
|
+
|
|
151
|
+
```ts
|
|
152
|
+
function randomProxyUrl() {
|
|
153
|
+
const proxyUrls = [
|
|
154
|
+
"https://my-proxy-app.my-username-1.workers.dev",
|
|
155
|
+
"https://my-proxy-app.my-username-2.workers.dev"
|
|
156
|
+
]
|
|
157
|
+
|
|
158
|
+
const randomIndex = Math.floor(Math.random() * proxyUrls.length)
|
|
159
|
+
|
|
160
|
+
return proxyUrls[randomIndex]
|
|
161
|
+
}
|
|
162
|
+
|
|
163
|
+
const client = new PuppeteerClient({ proxyUrl: randomProxyUrl })
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
## 🤝 Contributing
|
|
167
|
+
|
|
168
|
+
- Want to contribute? Follow these steps:
|
|
169
|
+
- Fork the repository.
|
|
170
|
+
- Create a new branch (git checkout -b feature-new).
|
|
171
|
+
- Commit your changes (git commit -m 'Add new feature').
|
|
172
|
+
- Push to the branch (git push origin feature-new).
|
|
173
|
+
- Open a Pull Request.
|
|
174
|
+
|
|
175
|
+
## 📝 License
|
|
176
|
+
|
|
177
|
+
This project is licensed under the MIT License.
|
package/dist/index.cjs
ADDED
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
4
|
+
exports.PuppeteerClient = exports.PuppeteerClientActionType = void 0;
|
|
5
|
+
const tslib_1 = require("tslib");
|
|
6
|
+
const core_1 = require("@xcrap/core");
|
|
7
|
+
const puppeteer_1 = tslib_1.__importDefault(require("puppeteer"));
|
|
8
|
+
const constants_1 = require("./constants");
|
|
9
|
+
var PuppeteerClientActionType;
|
|
10
|
+
(function (PuppeteerClientActionType) {
|
|
11
|
+
PuppeteerClientActionType["BeforeRequest"] = "beforeRequest";
|
|
12
|
+
PuppeteerClientActionType["AfterRequest"] = "afterRequest";
|
|
13
|
+
})(PuppeteerClientActionType || (exports.PuppeteerClientActionType = PuppeteerClientActionType = {}));
|
|
14
|
+
class PuppeteerClient extends core_1.BaseClient {
|
|
15
|
+
constructor(options = {}) {
|
|
16
|
+
super(options);
|
|
17
|
+
this.options = options;
|
|
18
|
+
this.options = options;
|
|
19
|
+
this.browser = undefined;
|
|
20
|
+
}
|
|
21
|
+
initBrowser() {
|
|
22
|
+
return tslib_1.__awaiter(this, void 0, void 0, function* () {
|
|
23
|
+
const puppeteerArguments = [];
|
|
24
|
+
if (this.proxy) {
|
|
25
|
+
puppeteerArguments.push(`--proxy-server=${this.currentProxy}`);
|
|
26
|
+
}
|
|
27
|
+
if (this.options.args && this.options.args.length > 0) {
|
|
28
|
+
puppeteerArguments.push(...this.options.args);
|
|
29
|
+
}
|
|
30
|
+
this.browser = yield puppeteer_1.default.launch(Object.assign(Object.assign({}, this.options), { args: puppeteerArguments, headless: this.options.headless ? "shell" : false }));
|
|
31
|
+
});
|
|
32
|
+
}
|
|
33
|
+
ensureBrowser() {
|
|
34
|
+
return tslib_1.__awaiter(this, void 0, void 0, function* () {
|
|
35
|
+
if (!this.browser) {
|
|
36
|
+
yield this.initBrowser();
|
|
37
|
+
}
|
|
38
|
+
});
|
|
39
|
+
}
|
|
40
|
+
closeBrowser() {
|
|
41
|
+
return tslib_1.__awaiter(this, void 0, void 0, function* () {
|
|
42
|
+
if (this.browser) {
|
|
43
|
+
yield this.browser.close();
|
|
44
|
+
this.browser = undefined;
|
|
45
|
+
}
|
|
46
|
+
});
|
|
47
|
+
}
|
|
48
|
+
configurePage(page_1, _a) {
|
|
49
|
+
return tslib_1.__awaiter(this, arguments, void 0, function* (page, { javaScriptEnabled }) {
|
|
50
|
+
var _b;
|
|
51
|
+
if (this.currentUserAgent) {
|
|
52
|
+
yield page.setUserAgent((_b = this.currentUserAgent) !== null && _b !== void 0 ? _b : core_1.defaultUserAgent);
|
|
53
|
+
}
|
|
54
|
+
if (javaScriptEnabled !== undefined) {
|
|
55
|
+
yield page.setJavaScriptEnabled(javaScriptEnabled);
|
|
56
|
+
}
|
|
57
|
+
});
|
|
58
|
+
}
|
|
59
|
+
extractActions(actions) {
|
|
60
|
+
const actionsBeforeRequest = [];
|
|
61
|
+
const actionsAfterRequest = [];
|
|
62
|
+
if (!actions) {
|
|
63
|
+
actions = [];
|
|
64
|
+
}
|
|
65
|
+
for (const action of actions) {
|
|
66
|
+
const actionType = typeof action === "function" ? constants_1.defaultPuppeteerActionType : action.type;
|
|
67
|
+
const actionFunc = typeof action === "function" ? action : action.exec;
|
|
68
|
+
if (actionType === "beforeRequest") {
|
|
69
|
+
actionsBeforeRequest.push(actionFunc);
|
|
70
|
+
}
|
|
71
|
+
else {
|
|
72
|
+
actionsAfterRequest.push(actionFunc);
|
|
73
|
+
}
|
|
74
|
+
}
|
|
75
|
+
return {
|
|
76
|
+
before: actionsBeforeRequest,
|
|
77
|
+
after: actionsAfterRequest
|
|
78
|
+
};
|
|
79
|
+
}
|
|
80
|
+
executeActions(page, actions) {
|
|
81
|
+
return tslib_1.__awaiter(this, void 0, void 0, function* () {
|
|
82
|
+
for (const action of actions) {
|
|
83
|
+
yield action(page);
|
|
84
|
+
}
|
|
85
|
+
});
|
|
86
|
+
}
|
|
87
|
+
fetch(_a) {
|
|
88
|
+
return tslib_1.__awaiter(this, arguments, void 0, function* ({ url, javaScriptEnabled, maxRetries = 0, actions, retries = 0, retryDelay, }) {
|
|
89
|
+
yield this.ensureBrowser();
|
|
90
|
+
const failedAttempts = [];
|
|
91
|
+
const attemptRequest = (currentRetry) => tslib_1.__awaiter(this, void 0, void 0, function* () {
|
|
92
|
+
let page = undefined;
|
|
93
|
+
try {
|
|
94
|
+
const fullUrl = this.currentProxyUrl ? `${this.currentProxyUrl}${url}` : url;
|
|
95
|
+
const { before: actionsBeforeRequest, after: actionsAfterRequest } = this.extractActions(actions);
|
|
96
|
+
page = yield this.browser.newPage();
|
|
97
|
+
yield this.configurePage(page, { javaScriptEnabled: javaScriptEnabled });
|
|
98
|
+
yield this.executeActions(page, actionsBeforeRequest);
|
|
99
|
+
const response = yield page.goto(fullUrl);
|
|
100
|
+
yield this.executeActions(page, actionsAfterRequest);
|
|
101
|
+
const content = yield page.content();
|
|
102
|
+
yield page.close();
|
|
103
|
+
const status = response === null || response === void 0 ? void 0 : response.status();
|
|
104
|
+
if (status === undefined || !this.isSuccess(status)) {
|
|
105
|
+
throw new core_1.InvalidStatusCodeError(status !== null && status !== void 0 ? status : 500);
|
|
106
|
+
}
|
|
107
|
+
return new core_1.HttpResponse({
|
|
108
|
+
body: content,
|
|
109
|
+
headers: (response === null || response === void 0 ? void 0 : response.headers()) || {},
|
|
110
|
+
status: (response === null || response === void 0 ? void 0 : response.status()) || 200,
|
|
111
|
+
statusText: (response === null || response === void 0 ? void 0 : response.statusText()) || "Ok",
|
|
112
|
+
attempts: currentRetry + 1,
|
|
113
|
+
failedAttempts: failedAttempts,
|
|
114
|
+
});
|
|
115
|
+
}
|
|
116
|
+
catch (error) {
|
|
117
|
+
const errorMessage = error instanceof Error ? error.message : "Unknown error";
|
|
118
|
+
failedAttempts.push({ error: errorMessage, timestamp: new Date() });
|
|
119
|
+
if (page) {
|
|
120
|
+
yield page.close().catch(() => { });
|
|
121
|
+
}
|
|
122
|
+
if (currentRetry < maxRetries) {
|
|
123
|
+
if (retryDelay !== undefined && retryDelay > 0) {
|
|
124
|
+
yield (0, core_1.delay)(retryDelay);
|
|
125
|
+
}
|
|
126
|
+
return yield attemptRequest(currentRetry + 1);
|
|
127
|
+
}
|
|
128
|
+
return new core_1.HttpResponse({
|
|
129
|
+
body: errorMessage,
|
|
130
|
+
headers: {},
|
|
131
|
+
status: error.status || 500,
|
|
132
|
+
statusText: "Request Failed",
|
|
133
|
+
attempts: currentRetry + 1,
|
|
134
|
+
failedAttempts: failedAttempts,
|
|
135
|
+
});
|
|
136
|
+
}
|
|
137
|
+
});
|
|
138
|
+
return yield attemptRequest(retries);
|
|
139
|
+
});
|
|
140
|
+
}
|
|
141
|
+
fetchMany(_a) {
|
|
142
|
+
return tslib_1.__awaiter(this, arguments, void 0, function* ({ requests, concurrency, requestDelay }) {
|
|
143
|
+
const results = [];
|
|
144
|
+
const executing = [];
|
|
145
|
+
for (let i = 0; i < requests.length; i++) {
|
|
146
|
+
const promise = this.executeRequest({
|
|
147
|
+
request: requests[i],
|
|
148
|
+
index: i,
|
|
149
|
+
requestDelay: requestDelay,
|
|
150
|
+
results: results
|
|
151
|
+
}).then(() => undefined);
|
|
152
|
+
executing.push(promise);
|
|
153
|
+
if (this.shouldThrottle(executing, concurrency)) {
|
|
154
|
+
yield this.handleConcurrency(executing);
|
|
155
|
+
}
|
|
156
|
+
}
|
|
157
|
+
yield Promise.all(executing);
|
|
158
|
+
return results;
|
|
159
|
+
});
|
|
160
|
+
}
|
|
161
|
+
close() {
|
|
162
|
+
return tslib_1.__awaiter(this, void 0, void 0, function* () {
|
|
163
|
+
if (this.browser) {
|
|
164
|
+
yield this.closeBrowser();
|
|
165
|
+
}
|
|
166
|
+
});
|
|
167
|
+
}
|
|
168
|
+
}
|
|
169
|
+
exports.PuppeteerClient = PuppeteerClient;
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
import { ClientInterface, ClientFetchManyOptions, ClientRequestOptions, BaseClient, BaseClientOptions, HttpResponse } from "@xcrap/core";
|
|
2
|
+
import { Browser, Page, LaunchOptions } from "puppeteer";
|
|
3
|
+
export type PuppeteerProxy = string;
|
|
4
|
+
export type PuppeteerClientOptions = BaseClientOptions<PuppeteerProxy> & LaunchOptions;
|
|
5
|
+
export type PuppeteerClientActionFunction = (page: Page) => any | Promise<any>;
|
|
6
|
+
export declare enum PuppeteerClientActionType {
|
|
7
|
+
BeforeRequest = "beforeRequest",
|
|
8
|
+
AfterRequest = "afterRequest"
|
|
9
|
+
}
|
|
10
|
+
export type PuppeteerClientAction = PuppeteerClientActionFunction | {
|
|
11
|
+
type: `${PuppeteerClientActionType}`;
|
|
12
|
+
exec: PuppeteerClientActionFunction;
|
|
13
|
+
};
|
|
14
|
+
export type ExtractActionsResult = {
|
|
15
|
+
before: PuppeteerClientActionFunction[];
|
|
16
|
+
after: PuppeteerClientActionFunction[];
|
|
17
|
+
};
|
|
18
|
+
export type PuppeterRequestOptions = Omit<ClientRequestOptions & {
|
|
19
|
+
javaScriptEnabled?: boolean;
|
|
20
|
+
actions?: PuppeteerClientAction[];
|
|
21
|
+
}, "method">;
|
|
22
|
+
export type ConfigurePageOptions = {
|
|
23
|
+
javaScriptEnabled: PuppeterRequestOptions["javaScriptEnabled"];
|
|
24
|
+
};
|
|
25
|
+
export type PuppeteerFetchManyOptions = ClientFetchManyOptions<PuppeterRequestOptions>;
|
|
26
|
+
export declare class PuppeteerClient extends BaseClient<string> implements ClientInterface {
|
|
27
|
+
readonly options: PuppeteerClientOptions;
|
|
28
|
+
protected browser?: Browser;
|
|
29
|
+
constructor(options?: PuppeteerClientOptions);
|
|
30
|
+
protected initBrowser(): Promise<void>;
|
|
31
|
+
protected ensureBrowser(): Promise<void>;
|
|
32
|
+
protected closeBrowser(): Promise<void>;
|
|
33
|
+
protected configurePage(page: Page, { javaScriptEnabled }: ConfigurePageOptions): Promise<void>;
|
|
34
|
+
protected extractActions(actions: PuppeteerClientAction[] | undefined): ExtractActionsResult;
|
|
35
|
+
protected executeActions(page: Page, actions: PuppeteerClientActionFunction[]): Promise<void>;
|
|
36
|
+
fetch({ url, javaScriptEnabled, maxRetries, actions, retries, retryDelay, }: PuppeterRequestOptions): Promise<HttpResponse>;
|
|
37
|
+
fetchMany({ requests, concurrency, requestDelay }: PuppeteerFetchManyOptions): Promise<HttpResponse[]>;
|
|
38
|
+
close(): Promise<void>;
|
|
39
|
+
}
|
package/dist/index.mjs
ADDED
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
2
|
+
exports.PuppeteerClient = exports.PuppeteerClientActionType = void 0;
|
|
3
|
+
const tslib_1 = require("tslib");
|
|
4
|
+
const core_1 = require("@xcrap/core");
|
|
5
|
+
const puppeteer_1 = tslib_1.__importDefault(require("puppeteer"));
|
|
6
|
+
const constants_1 = require("./constants");
|
|
7
|
+
var PuppeteerClientActionType;
|
|
8
|
+
(function (PuppeteerClientActionType) {
|
|
9
|
+
PuppeteerClientActionType["BeforeRequest"] = "beforeRequest";
|
|
10
|
+
PuppeteerClientActionType["AfterRequest"] = "afterRequest";
|
|
11
|
+
})(PuppeteerClientActionType || (exports.PuppeteerClientActionType = PuppeteerClientActionType = {}));
|
|
12
|
+
class PuppeteerClient extends core_1.BaseClient {
|
|
13
|
+
constructor(options = {}) {
|
|
14
|
+
super(options);
|
|
15
|
+
this.options = options;
|
|
16
|
+
this.options = options;
|
|
17
|
+
this.browser = undefined;
|
|
18
|
+
}
|
|
19
|
+
initBrowser() {
|
|
20
|
+
return tslib_1.__awaiter(this, void 0, void 0, function* () {
|
|
21
|
+
const puppeteerArguments = [];
|
|
22
|
+
if (this.proxy) {
|
|
23
|
+
puppeteerArguments.push(`--proxy-server=${this.currentProxy}`);
|
|
24
|
+
}
|
|
25
|
+
if (this.options.args && this.options.args.length > 0) {
|
|
26
|
+
puppeteerArguments.push(...this.options.args);
|
|
27
|
+
}
|
|
28
|
+
this.browser = yield puppeteer_1.default.launch(Object.assign(Object.assign({}, this.options), { args: puppeteerArguments, headless: this.options.headless ? "shell" : false }));
|
|
29
|
+
});
|
|
30
|
+
}
|
|
31
|
+
ensureBrowser() {
|
|
32
|
+
return tslib_1.__awaiter(this, void 0, void 0, function* () {
|
|
33
|
+
if (!this.browser) {
|
|
34
|
+
yield this.initBrowser();
|
|
35
|
+
}
|
|
36
|
+
});
|
|
37
|
+
}
|
|
38
|
+
closeBrowser() {
|
|
39
|
+
return tslib_1.__awaiter(this, void 0, void 0, function* () {
|
|
40
|
+
if (this.browser) {
|
|
41
|
+
yield this.browser.close();
|
|
42
|
+
this.browser = undefined;
|
|
43
|
+
}
|
|
44
|
+
});
|
|
45
|
+
}
|
|
46
|
+
configurePage(page_1, _a) {
|
|
47
|
+
return tslib_1.__awaiter(this, arguments, void 0, function* (page, { javaScriptEnabled }) {
|
|
48
|
+
var _b;
|
|
49
|
+
if (this.currentUserAgent) {
|
|
50
|
+
yield page.setUserAgent((_b = this.currentUserAgent) !== null && _b !== void 0 ? _b : core_1.defaultUserAgent);
|
|
51
|
+
}
|
|
52
|
+
if (javaScriptEnabled !== undefined) {
|
|
53
|
+
yield page.setJavaScriptEnabled(javaScriptEnabled);
|
|
54
|
+
}
|
|
55
|
+
});
|
|
56
|
+
}
|
|
57
|
+
extractActions(actions) {
|
|
58
|
+
const actionsBeforeRequest = [];
|
|
59
|
+
const actionsAfterRequest = [];
|
|
60
|
+
if (!actions) {
|
|
61
|
+
actions = [];
|
|
62
|
+
}
|
|
63
|
+
for (const action of actions) {
|
|
64
|
+
const actionType = typeof action === "function" ? constants_1.defaultPuppeteerActionType : action.type;
|
|
65
|
+
const actionFunc = typeof action === "function" ? action : action.exec;
|
|
66
|
+
if (actionType === "beforeRequest") {
|
|
67
|
+
actionsBeforeRequest.push(actionFunc);
|
|
68
|
+
}
|
|
69
|
+
else {
|
|
70
|
+
actionsAfterRequest.push(actionFunc);
|
|
71
|
+
}
|
|
72
|
+
}
|
|
73
|
+
return {
|
|
74
|
+
before: actionsBeforeRequest,
|
|
75
|
+
after: actionsAfterRequest
|
|
76
|
+
};
|
|
77
|
+
}
|
|
78
|
+
executeActions(page, actions) {
|
|
79
|
+
return tslib_1.__awaiter(this, void 0, void 0, function* () {
|
|
80
|
+
for (const action of actions) {
|
|
81
|
+
yield action(page);
|
|
82
|
+
}
|
|
83
|
+
});
|
|
84
|
+
}
|
|
85
|
+
fetch(_a) {
|
|
86
|
+
return tslib_1.__awaiter(this, arguments, void 0, function* ({ url, javaScriptEnabled, maxRetries = 0, actions, retries = 0, retryDelay, }) {
|
|
87
|
+
yield this.ensureBrowser();
|
|
88
|
+
const failedAttempts = [];
|
|
89
|
+
const attemptRequest = (currentRetry) => tslib_1.__awaiter(this, void 0, void 0, function* () {
|
|
90
|
+
let page = undefined;
|
|
91
|
+
try {
|
|
92
|
+
const fullUrl = this.currentProxyUrl ? `${this.currentProxyUrl}${url}` : url;
|
|
93
|
+
const { before: actionsBeforeRequest, after: actionsAfterRequest } = this.extractActions(actions);
|
|
94
|
+
page = yield this.browser.newPage();
|
|
95
|
+
yield this.configurePage(page, { javaScriptEnabled: javaScriptEnabled });
|
|
96
|
+
yield this.executeActions(page, actionsBeforeRequest);
|
|
97
|
+
const response = yield page.goto(fullUrl);
|
|
98
|
+
yield this.executeActions(page, actionsAfterRequest);
|
|
99
|
+
const content = yield page.content();
|
|
100
|
+
yield page.close();
|
|
101
|
+
const status = response === null || response === void 0 ? void 0 : response.status();
|
|
102
|
+
if (status === undefined || !this.isSuccess(status)) {
|
|
103
|
+
throw new core_1.InvalidStatusCodeError(status !== null && status !== void 0 ? status : 500);
|
|
104
|
+
}
|
|
105
|
+
return new core_1.HttpResponse({
|
|
106
|
+
body: content,
|
|
107
|
+
headers: (response === null || response === void 0 ? void 0 : response.headers()) || {},
|
|
108
|
+
status: (response === null || response === void 0 ? void 0 : response.status()) || 200,
|
|
109
|
+
statusText: (response === null || response === void 0 ? void 0 : response.statusText()) || "Ok",
|
|
110
|
+
attempts: currentRetry + 1,
|
|
111
|
+
failedAttempts: failedAttempts,
|
|
112
|
+
});
|
|
113
|
+
}
|
|
114
|
+
catch (error) {
|
|
115
|
+
const errorMessage = error instanceof Error ? error.message : "Unknown error";
|
|
116
|
+
failedAttempts.push({ error: errorMessage, timestamp: new Date() });
|
|
117
|
+
if (page) {
|
|
118
|
+
yield page.close().catch(() => { });
|
|
119
|
+
}
|
|
120
|
+
if (currentRetry < maxRetries) {
|
|
121
|
+
if (retryDelay !== undefined && retryDelay > 0) {
|
|
122
|
+
yield (0, core_1.delay)(retryDelay);
|
|
123
|
+
}
|
|
124
|
+
return yield attemptRequest(currentRetry + 1);
|
|
125
|
+
}
|
|
126
|
+
return new core_1.HttpResponse({
|
|
127
|
+
body: errorMessage,
|
|
128
|
+
headers: {},
|
|
129
|
+
status: error.status || 500,
|
|
130
|
+
statusText: "Request Failed",
|
|
131
|
+
attempts: currentRetry + 1,
|
|
132
|
+
failedAttempts: failedAttempts,
|
|
133
|
+
});
|
|
134
|
+
}
|
|
135
|
+
});
|
|
136
|
+
return yield attemptRequest(retries);
|
|
137
|
+
});
|
|
138
|
+
}
|
|
139
|
+
fetchMany(_a) {
|
|
140
|
+
return tslib_1.__awaiter(this, arguments, void 0, function* ({ requests, concurrency, requestDelay }) {
|
|
141
|
+
const results = [];
|
|
142
|
+
const executing = [];
|
|
143
|
+
for (let i = 0; i < requests.length; i++) {
|
|
144
|
+
const promise = this.executeRequest({
|
|
145
|
+
request: requests[i],
|
|
146
|
+
index: i,
|
|
147
|
+
requestDelay: requestDelay,
|
|
148
|
+
results: results
|
|
149
|
+
}).then(() => undefined);
|
|
150
|
+
executing.push(promise);
|
|
151
|
+
if (this.shouldThrottle(executing, concurrency)) {
|
|
152
|
+
yield this.handleConcurrency(executing);
|
|
153
|
+
}
|
|
154
|
+
}
|
|
155
|
+
yield Promise.all(executing);
|
|
156
|
+
return results;
|
|
157
|
+
});
|
|
158
|
+
}
|
|
159
|
+
close() {
|
|
160
|
+
return tslib_1.__awaiter(this, void 0, void 0, function* () {
|
|
161
|
+
if (this.browser) {
|
|
162
|
+
yield this.closeBrowser();
|
|
163
|
+
}
|
|
164
|
+
});
|
|
165
|
+
}
|
|
166
|
+
}
|
|
167
|
+
exports.PuppeteerClient = PuppeteerClient;
|
package/package.json
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@xcrap/puppeteer-client",
|
|
3
|
+
"version": "0.0.1",
|
|
4
|
+
"description": "",
|
|
5
|
+
"main": "./dist/index.js",
|
|
6
|
+
"types": "./dist/index.d.ts",
|
|
7
|
+
"files": [
|
|
8
|
+
"dist/*",
|
|
9
|
+
"!/**/__tests__"
|
|
10
|
+
],
|
|
11
|
+
"scripts": {
|
|
12
|
+
"test": "jest",
|
|
13
|
+
"build": "rollup -c"
|
|
14
|
+
},
|
|
15
|
+
"keywords": [
|
|
16
|
+
"xcrap",
|
|
17
|
+
"transformer",
|
|
18
|
+
"scrapy",
|
|
19
|
+
"web scraping"
|
|
20
|
+
],
|
|
21
|
+
"author": "Marcuth",
|
|
22
|
+
"license": "MIT",
|
|
23
|
+
"type": "commonjs",
|
|
24
|
+
"devDependencies": {
|
|
25
|
+
"@rollup/plugin-commonjs": "^28.0.3",
|
|
26
|
+
"@rollup/plugin-node-resolve": "^16.0.1",
|
|
27
|
+
"@rollup/plugin-typescript": "^12.1.2",
|
|
28
|
+
"@types/node": "^22.13.17",
|
|
29
|
+
"rollup": "^4.39.0",
|
|
30
|
+
"ts-node": "^10.9.2",
|
|
31
|
+
"tslib": "^2.8.1",
|
|
32
|
+
"typescript": "^5.8.2"
|
|
33
|
+
},
|
|
34
|
+
"dependencies": {
|
|
35
|
+
"puppeteer": "^24.5.0"
|
|
36
|
+
},
|
|
37
|
+
"peerDependencies": {
|
|
38
|
+
"@xcrap/core": "^0.0.3",
|
|
39
|
+
"@xcrap/parser": "^0.0.2"
|
|
40
|
+
}
|
|
41
|
+
}
|