@hanivanrizky/nestjs-browser-action 0.14.0 → 0.16.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +230 -254
- package/dist/interfaces/browser-action-options.d.ts +1 -1
- package/dist/interfaces/cleansing-options.d.ts +3 -8
- package/dist/interfaces/types.d.ts +32 -1
- package/dist/interfaces/workflow-options.d.ts +2 -2
- package/dist/pipes/alt-flag.pipe.d.ts +1 -1
- package/dist/pipes/alt-flag.pipe.js.map +1 -1
- package/dist/pipes/clean-html.pipe.d.ts +5 -0
- package/dist/pipes/clean-html.pipe.js +52 -0
- package/dist/pipes/clean-html.pipe.js.map +1 -0
- package/dist/pipes/cleansing-pipe.d.ts +4 -3
- package/dist/pipes/cleansing-pipe.js +3 -17
- package/dist/pipes/cleansing-pipe.js.map +1 -1
- package/dist/pipes/date-format-special.pipe.d.ts +6 -0
- package/dist/pipes/date-format-special.pipe.js +24 -0
- package/dist/pipes/date-format-special.pipe.js.map +1 -0
- package/dist/pipes/date-format.pipe.d.ts +1 -1
- package/dist/pipes/date-format.pipe.js.map +1 -1
- package/dist/pipes/extract-email.pipe.d.ts +6 -0
- package/dist/pipes/extract-email.pipe.js +19 -0
- package/dist/pipes/extract-email.pipe.js.map +1 -0
- package/dist/pipes/extract-url-params.pipe.d.ts +10 -0
- package/dist/pipes/extract-url-params.pipe.js +57 -0
- package/dist/pipes/extract-url-params.pipe.js.map +1 -0
- package/dist/pipes/index.d.ts +16 -0
- package/dist/pipes/index.js +16 -0
- package/dist/pipes/index.js.map +1 -1
- package/dist/pipes/json-path.pipe.d.ts +8 -0
- package/dist/pipes/json-path.pipe.js +40 -0
- package/dist/pipes/json-path.pipe.js.map +1 -0
- package/dist/pipes/media-filter.pipe.d.ts +7 -0
- package/dist/pipes/media-filter.pipe.js +21 -0
- package/dist/pipes/media-filter.pipe.js.map +1 -0
- package/dist/pipes/normalize-whitespace.pipe.d.ts +1 -1
- package/dist/pipes/normalize-whitespace.pipe.js.map +1 -1
- package/dist/pipes/number-normalize.pipe.d.ts +6 -0
- package/dist/pipes/number-normalize.pipe.js +38 -0
- package/dist/pipes/number-normalize.pipe.js.map +1 -0
- package/dist/pipes/parse-as-url.pipe.d.ts +7 -0
- package/dist/pipes/parse-as-url.pipe.js +45 -0
- package/dist/pipes/parse-as-url.pipe.js.map +1 -0
- package/dist/pipes/pipe-engine.d.ts +20 -0
- package/dist/pipes/pipe-engine.js +81 -0
- package/dist/pipes/pipe-engine.js.map +1 -0
- package/dist/pipes/pipe-registry.d.ts +3 -0
- package/dist/pipes/pipe-registry.js +89 -0
- package/dist/pipes/pipe-registry.js.map +1 -0
- package/dist/pipes/profiles/currency.profile.js +11 -12
- package/dist/pipes/profiles/currency.profile.js.map +1 -1
- package/dist/pipes/profiles/date.profile.js +10 -7
- package/dist/pipes/profiles/date.profile.js.map +1 -1
- package/dist/pipes/profiles/email.profile.js +9 -9
- package/dist/pipes/profiles/email.profile.js.map +1 -1
- package/dist/pipes/profiles/phone.profile.js +11 -11
- package/dist/pipes/profiles/phone.profile.js.map +1 -1
- package/dist/pipes/profiles/price.profile.js +11 -12
- package/dist/pipes/profiles/price.profile.js.map +1 -1
- package/dist/pipes/profiles.d.ts +2 -2
- package/dist/pipes/profiles.js +5 -5
- package/dist/pipes/profiles.js.map +1 -1
- package/dist/pipes/query-append.pipe.d.ts +9 -0
- package/dist/pipes/query-append.pipe.js +47 -0
- package/dist/pipes/query-append.pipe.js.map +1 -0
- package/dist/pipes/query-remover.pipe.d.ts +22 -0
- package/dist/pipes/query-remover.pipe.js +83 -0
- package/dist/pipes/query-remover.pipe.js.map +1 -0
- package/dist/pipes/regex-extract.pipe.d.ts +1 -1
- package/dist/pipes/regex-extract.pipe.js.map +1 -1
- package/dist/pipes/regex-extraction.pipe.d.ts +25 -0
- package/dist/pipes/regex-extraction.pipe.js +90 -0
- package/dist/pipes/regex-extraction.pipe.js.map +1 -0
- package/dist/pipes/regex-replace-x.pipe.d.ts +28 -0
- package/dist/pipes/regex-replace-x.pipe.js +104 -0
- package/dist/pipes/regex-replace-x.pipe.js.map +1 -0
- package/dist/pipes/regex-replace.pipe.d.ts +1 -1
- package/dist/pipes/regex-replace.pipe.js.map +1 -1
- package/dist/pipes/regex.pipe.d.ts +12 -0
- package/dist/pipes/regex.pipe.js +42 -0
- package/dist/pipes/regex.pipe.js.map +1 -0
- package/dist/pipes/remove-currency-symbol.pipe.d.ts +1 -1
- package/dist/pipes/remove-currency-symbol.pipe.js.map +1 -1
- package/dist/pipes/remove-line-breaks.pipe.d.ts +1 -1
- package/dist/pipes/remove-line-breaks.pipe.js.map +1 -1
- package/dist/pipes/remove-special-chars.pipe.d.ts +1 -1
- package/dist/pipes/remove-special-chars.pipe.js.map +1 -1
- package/dist/pipes/sanitize-text.pipe.d.ts +1 -1
- package/dist/pipes/sanitize-text.pipe.js.map +1 -1
- package/dist/pipes/to-lower-case.pipe.d.ts +1 -1
- package/dist/pipes/to-lower-case.pipe.js.map +1 -1
- package/dist/pipes/to-number.pipe.d.ts +1 -1
- package/dist/pipes/to-number.pipe.js.map +1 -1
- package/dist/pipes/to-upper-case.pipe.d.ts +1 -1
- package/dist/pipes/to-upper-case.pipe.js.map +1 -1
- package/dist/pipes/trim.pipe.d.ts +1 -1
- package/dist/pipes/trim.pipe.js.map +1 -1
- package/dist/pipes/url-resolve.pipe.d.ts +7 -0
- package/dist/pipes/url-resolve.pipe.js +52 -0
- package/dist/pipes/url-resolve.pipe.js.map +1 -0
- package/dist/services/browser-action.service.d.ts +3 -3
- package/dist/services/browser-action.service.js +54 -27
- package/dist/services/browser-action.service.js.map +1 -1
- package/dist/services/cleansing.service.d.ts +2 -4
- package/dist/services/cleansing.service.js +24 -81
- package/dist/services/cleansing.service.js.map +1 -1
- package/dist/tsconfig.build.tsbuildinfo +1 -1
- package/dist/utils/user-agent.util.js +6 -11
- package/dist/utils/user-agent.util.js.map +1 -1
- package/dist/validators/workflow.validator.js +2 -2
- package/dist/validators/workflow.validator.js.map +1 -1
- package/package.json +7 -2
package/README.md
CHANGED
|
@@ -1,53 +1,73 @@
|
|
|
1
1
|
# @hanivanrizky/nestjs-browser-action
|
|
2
2
|
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
3
|
+
<p align="center">
|
|
4
|
+
<a href="http://nestjs.com/" target="_blank"><img src="https://nestjs.com/img/logo-small.svg" width="120" alt="Nest Logo" /></a>
|
|
5
|
+
</p>
|
|
6
6
|
|
|
7
|
-
>
|
|
8
|
-
>
|
|
9
|
-
> This project is currently in **experimental** stage and intended for **personal use only**. The API is subject to change, and production use is not recommended.
|
|
7
|
+
<p align="center">A NestJS module for stealth browser automation using CloakBrowser + puppeteer-core with support for proxy rotation, connection pooling, cookie persistence, and flexible data extraction.</p>
|
|
10
8
|
|
|
11
|
-
|
|
9
|
+
<p align="center">
|
|
10
|
+
<a href="https://www.npmjs.com/package/@hanivanrizky/nestjs-browser-action" target="_blank"><img src="https://img.shields.io/npm/v/@hanivanrizky/nestjs-browser-action.svg" alt="NPM Version" /></a>
|
|
11
|
+
<a href="https://www.npmjs.com/package/@hanivanrizky/nestjs-browser-action" target="_blank"><img src="https://img.shields.io/npm/l/@hanivanrizky/nestjs-browser-action.svg" alt="Package License" /></a>
|
|
12
|
+
<a href="https://www.npmjs.com/package/@hanivanrizky/nestjs-browser-action" target="_blank"><img src="https://img.shields.io/npm/dm/@hanivanrizky/nestjs-browser-action.svg" alt="NPM Downloads" /></a>
|
|
13
|
+
<img src="https://img.shields.io/badge/tests-363%20passed-brightgreen.svg" alt="Tests: 363 passed" />
|
|
14
|
+
</p>
|
|
15
|
+
|
|
16
|
+
> **⚠️ Status: Experimental** — personal use only; API subject to change.
|
|
17
|
+
|
|
18
|
+
## Table of Contents
|
|
19
|
+
|
|
20
|
+
- [Features](#features)
|
|
21
|
+
- [Installation](#installation)
|
|
22
|
+
- [Quick Start](#quick-start)
|
|
23
|
+
- [Documentation](#documentation)
|
|
24
|
+
- [Quick Examples](#quick-examples)
|
|
25
|
+
- [Development](#development)
|
|
26
|
+
- [Contributing](#contributing)
|
|
27
|
+
- [License](#license)
|
|
12
28
|
|
|
13
29
|
## Features
|
|
14
30
|
|
|
15
|
-
- (
|
|
16
|
-
- (
|
|
17
|
-
- (
|
|
18
|
-
- (
|
|
19
|
-
- (
|
|
20
|
-
- (
|
|
21
|
-
- (
|
|
22
|
-
- (
|
|
23
|
-
- (
|
|
24
|
-
- (
|
|
31
|
+
- **(☆^O^☆) Pattern-Based Extraction**: Define extraction patterns with `PatternField` — API-compatible with `nestjs-xpath-parser`
|
|
32
|
+
- **(.\_.) Container Extraction**: Extract lists of items from repeating DOM nodes with pagination
|
|
33
|
+
- **(>\_<) Workflow Automation**: Declarative step-by-step browser automation (navigate, click, fill, extract, screenshot…)
|
|
34
|
+
- **(・\_・) Data Cleaning Pipes**: 33 built-in transformations (trim, case, replace, decode HTML, number, regex, jsonpath, clean-html…)
|
|
35
|
+
- **(☆^O^☆) Custom Pipes**: Extensible pipe registry — `PIPE_REGISTRY['my-type'] = MyPipe`
|
|
36
|
+
- **(>\_<) Connection Pooling**: Efficient browser instance reuse with configurable min/max/idle/acquire timeouts
|
|
37
|
+
- **(.\_.) Cookie Persistence**: Save/load browser sessions for authentication flows
|
|
38
|
+
- **(o_o) Stealth**: CloakBrowser Chromium with proxy, humanize, geoip, timezone/locale spoofing, and anti-detect flags
|
|
39
|
+
- **(.\_.) Remote Chrome**: Connect to remote Chrome instances via CDP (browserURL / browserWSEndpoint)
|
|
40
|
+
- **(>\_<) TLS Fingerprint**: Capture the browser's own TLS/HTTP handshake (ja3/ja4, ciphers, http2 akamai, headers) for use with `nestjs-xpath-parser`'s CycleTLS engine
|
|
41
|
+
- **(☆^O^☆) TypeScript Generics**: Full generic type support for type-safe results
|
|
42
|
+
- **(o_o) Fully Tested**: 363 tests across 37 suites
|
|
25
43
|
|
|
26
44
|
## Installation
|
|
27
45
|
|
|
28
|
-
### From npm
|
|
29
|
-
|
|
30
46
|
```bash
|
|
31
|
-
|
|
47
|
+
pnpm add @hanivanrizky/nestjs-browser-action
|
|
32
48
|
# or
|
|
33
49
|
yarn add @hanivanrizky/nestjs-browser-action
|
|
34
50
|
# or
|
|
35
|
-
|
|
51
|
+
npm install @hanivanrizky/nestjs-browser-action
|
|
36
52
|
```
|
|
37
53
|
|
|
38
|
-
|
|
54
|
+
## Quick Start
|
|
39
55
|
|
|
40
|
-
|
|
41
|
-
npm install https://github.com/Hanivan/nestjs-browser-action.git
|
|
42
|
-
# or
|
|
43
|
-
pnpm add https://github.com/Hanivan/nestjs-browser-action.git
|
|
44
|
-
# or using SSH
|
|
45
|
-
pnpm add git@github.com:Hanivan/nestjs-browser-action.git
|
|
46
|
-
```
|
|
56
|
+
### Import the Module
|
|
47
57
|
|
|
48
|
-
|
|
58
|
+
**Basic usage:**
|
|
49
59
|
|
|
50
|
-
|
|
60
|
+
```typescript
|
|
61
|
+
import { Module } from '@nestjs/common';
|
|
62
|
+
import { BrowserActionModule } from '@hanivanrizky/nestjs-browser-action';
|
|
63
|
+
|
|
64
|
+
@Module({
|
|
65
|
+
imports: [BrowserActionModule.forRoot()],
|
|
66
|
+
})
|
|
67
|
+
export class AppModule {}
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
**With pool and cookie options:**
|
|
51
71
|
|
|
52
72
|
```typescript
|
|
53
73
|
import { Module } from '@nestjs/common';
|
|
@@ -57,304 +77,260 @@ import { BrowserActionModule } from '@hanivanrizky/nestjs-browser-action';
|
|
|
57
77
|
imports: [
|
|
58
78
|
BrowserActionModule.forRoot({
|
|
59
79
|
pool: { min: 2, max: 10 },
|
|
60
|
-
cookies: { enabled: true },
|
|
80
|
+
cookies: { enabled: true, cookiesDir: './cookies' },
|
|
81
|
+
logLevel: 'log',
|
|
61
82
|
}),
|
|
62
83
|
],
|
|
63
84
|
})
|
|
64
85
|
export class AppModule {}
|
|
65
86
|
```
|
|
66
87
|
|
|
67
|
-
|
|
88
|
+
**Async configuration:**
|
|
89
|
+
|
|
90
|
+
```typescript
|
|
91
|
+
import { Module } from '@nestjs/common';
|
|
92
|
+
import { BrowserActionModule } from '@hanivanrizky/nestjs-browser-action';
|
|
93
|
+
import { ConfigModule, ConfigService } from '@nestjs/config';
|
|
94
|
+
|
|
95
|
+
@Module({
|
|
96
|
+
imports: [
|
|
97
|
+
ConfigModule.forRoot(),
|
|
98
|
+
BrowserActionModule.forRootAsync({
|
|
99
|
+
imports: [ConfigModule],
|
|
100
|
+
useFactory: (configService: ConfigService) => ({
|
|
101
|
+
pool: {
|
|
102
|
+
min: configService.get<number>('POOL_MIN', 2),
|
|
103
|
+
max: configService.get<number>('POOL_MAX', 10),
|
|
104
|
+
},
|
|
105
|
+
cloak: {
|
|
106
|
+
proxy: { server: configService.get<string>('PROXY_URL', '') },
|
|
107
|
+
},
|
|
108
|
+
logLevel: configService.get<string>('LOG_LEVEL', 'log'),
|
|
109
|
+
}),
|
|
110
|
+
inject: [ConfigService],
|
|
111
|
+
}),
|
|
112
|
+
],
|
|
113
|
+
})
|
|
114
|
+
export class AppModule {}
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
### Inject the Service
|
|
68
118
|
|
|
69
119
|
```typescript
|
|
70
120
|
import { Injectable } from '@nestjs/common';
|
|
71
121
|
import { BrowserActionService } from '@hanivanrizky/nestjs-browser-action';
|
|
72
122
|
|
|
73
123
|
@Injectable()
|
|
74
|
-
export class
|
|
75
|
-
constructor(
|
|
76
|
-
|
|
77
|
-
) {
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
124
|
+
export class YourService {
|
|
125
|
+
constructor(private readonly browserAction: BrowserActionService) {}
|
|
126
|
+
|
|
127
|
+
async scrapeProducts() {
|
|
128
|
+
const result = await this.browserAction.evaluateWebsite({
|
|
129
|
+
url: 'https://www.scrapingcourse.com/ecommerce/',
|
|
130
|
+
patterns: [
|
|
131
|
+
{
|
|
132
|
+
key: 'container',
|
|
133
|
+
patternType: 'css',
|
|
134
|
+
returnType: 'text',
|
|
135
|
+
patterns: ['.product'],
|
|
136
|
+
meta: { isContainer: true },
|
|
137
|
+
},
|
|
138
|
+
{
|
|
139
|
+
key: 'name',
|
|
140
|
+
patternType: 'css',
|
|
141
|
+
returnType: 'text',
|
|
142
|
+
patterns: ['h2.woocommerce-loop-product__title'],
|
|
143
|
+
pipes: { trim: true },
|
|
144
|
+
},
|
|
145
|
+
{
|
|
146
|
+
key: 'price',
|
|
147
|
+
patternType: 'css',
|
|
148
|
+
returnType: 'text',
|
|
149
|
+
patterns: ['.price'],
|
|
150
|
+
pipes: { trim: true },
|
|
151
|
+
},
|
|
152
|
+
],
|
|
153
|
+
});
|
|
154
|
+
|
|
155
|
+
return result.results;
|
|
90
156
|
}
|
|
91
157
|
}
|
|
92
158
|
```
|
|
93
159
|
|
|
94
160
|
## Documentation
|
|
95
161
|
|
|
96
|
-
###
|
|
97
|
-
|
|
98
|
-
| Method | Description |
|
|
99
|
-
|--------|-------------|
|
|
100
|
-
| [`scrape()`](./docs/methods/scrape.md) | Extract single elements |
|
|
101
|
-
| [`scrapeAll()`](./docs/methods/scrape-all.md) | Extract multiple elements |
|
|
102
|
-
| [`scrapeWithWorkflow()`](./docs/methods/workflow.md) | Workflow-based automation |
|
|
103
|
-
| [`scrapeAllWithWorkflow()`](./docs/methods/workflow.md) | Workflow with multi-element |
|
|
104
|
-
| [`takeScreenshot()`](./docs/methods/screenshots.md) | Capture screenshots |
|
|
105
|
-
| [`generatePDF()`](./docs/methods/screenshots.md) | Generate PDFs |
|
|
106
|
-
| [`captureTlsFingerprint()`](./docs/api-reference.md#capturetlsfingerprintpath-url-promisetlsfingerprint) | Capture browser's TLS/HTTP fingerprint |
|
|
107
|
-
| [Browser & Page Control](./docs/methods/browser-control.md) | Low-level control |
|
|
108
|
-
|
|
109
|
-
### (☆^O^☆) Feature Guides
|
|
162
|
+
### Features
|
|
110
163
|
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
164
|
+
- [Pattern-Based Extraction](docs/methods/scrape.md#evaluatewebsite---unified-xpath-parser-compatible-api) - `evaluateWebsite()` with `PatternField` patterns
|
|
165
|
+
- [Container-Based Extraction](docs/methods/scrape.md#evaluatewebsite---unified-xpath-parser-compatible-api) - Extract lists with `meta.isContainer`
|
|
166
|
+
- [Data Cleaning Pipes](docs/features/pipes.md) - Transform extracted data with pipes
|
|
167
|
+
- [Cookie Management](docs/features/cookies.md) - Session persistence
|
|
168
|
+
- [Workflow Actions](docs/methods/workflow.md) - Declarative step-by-step automation
|
|
116
169
|
|
|
117
|
-
###
|
|
170
|
+
### Reference
|
|
118
171
|
|
|
119
|
-
- [API Reference](
|
|
120
|
-
- [
|
|
121
|
-
- [
|
|
172
|
+
- [API Reference](docs/api-reference.md) - Complete service API documentation
|
|
173
|
+
- [Workflow Actions Reference](docs/workflow-actions.md) - All action types
|
|
174
|
+
- [Browser & Page Control](docs/methods/browser-control.md) - Low-level control
|
|
122
175
|
|
|
123
176
|
## Quick Examples
|
|
124
177
|
|
|
125
|
-
### Simple Scraping
|
|
178
|
+
### Simple Product Scraping
|
|
126
179
|
|
|
127
180
|
```typescript
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
price:
|
|
131
|
-
}
|
|
132
|
-
```
|
|
133
|
-
|
|
134
|
-
### Multi-Element Scraping
|
|
181
|
+
interface Product {
|
|
182
|
+
name: string;
|
|
183
|
+
price: string;
|
|
184
|
+
}
|
|
135
185
|
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
186
|
+
const result = await browserAction.evaluateWebsite<Product>({
|
|
187
|
+
url: 'https://example.com/products',
|
|
188
|
+
patterns: [
|
|
189
|
+
{
|
|
190
|
+
key: 'container',
|
|
191
|
+
patternType: 'css',
|
|
192
|
+
returnType: 'text',
|
|
193
|
+
patterns: ['.product-card'],
|
|
194
|
+
meta: { isContainer: true },
|
|
195
|
+
},
|
|
196
|
+
{
|
|
197
|
+
key: 'name',
|
|
198
|
+
patternType: 'css',
|
|
199
|
+
returnType: 'text',
|
|
200
|
+
patterns: ['h2.name'],
|
|
201
|
+
pipes: { trim: true },
|
|
202
|
+
},
|
|
203
|
+
{
|
|
204
|
+
key: 'price',
|
|
205
|
+
patternType: 'css',
|
|
206
|
+
returnType: 'text',
|
|
207
|
+
patterns: ['.price'],
|
|
208
|
+
pipes: {
|
|
209
|
+
trim: true,
|
|
210
|
+
replace: [{ from: '$', to: '' }],
|
|
211
|
+
},
|
|
212
|
+
},
|
|
213
|
+
],
|
|
140
214
|
});
|
|
141
215
|
```
|
|
142
216
|
|
|
143
|
-
###
|
|
217
|
+
### Article Extraction with Fallbacks
|
|
144
218
|
|
|
145
219
|
```typescript
|
|
146
|
-
const
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
{
|
|
150
|
-
|
|
151
|
-
|
|
220
|
+
const result = await browserAction.evaluateWebsite({
|
|
221
|
+
url: 'https://example.com/article',
|
|
222
|
+
patterns: [
|
|
223
|
+
{
|
|
224
|
+
key: 'title',
|
|
225
|
+
patternType: 'css',
|
|
226
|
+
returnType: 'text',
|
|
227
|
+
patterns: ['meta[property="og:title"]'],
|
|
228
|
+
meta: {
|
|
229
|
+
alterPattern: ['h1', 'title'],
|
|
230
|
+
},
|
|
231
|
+
pipes: { trim: true },
|
|
232
|
+
},
|
|
233
|
+
{
|
|
234
|
+
key: 'description',
|
|
235
|
+
patternType: 'css',
|
|
236
|
+
returnType: 'text',
|
|
237
|
+
patterns: ['meta[name="description"]'],
|
|
238
|
+
pipes: { trim: true, decode: true },
|
|
239
|
+
},
|
|
152
240
|
],
|
|
153
|
-
};
|
|
154
|
-
|
|
155
|
-
const result = await this.actionHelpers.scrapeWithWorkflow(workflow);
|
|
241
|
+
});
|
|
156
242
|
```
|
|
157
243
|
|
|
158
|
-
###
|
|
244
|
+
### XPath Extraction
|
|
159
245
|
|
|
160
246
|
```typescript
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
{
|
|
170
|
-
|
|
171
|
-
|
|
247
|
+
const result = await browserAction.evaluateWebsite({
|
|
248
|
+
url: 'https://example.com/sitemap.xml',
|
|
249
|
+
patterns: [
|
|
250
|
+
{
|
|
251
|
+
key: 'container',
|
|
252
|
+
patternType: 'xpath',
|
|
253
|
+
returnType: 'text',
|
|
254
|
+
patterns: ['//url'],
|
|
255
|
+
meta: { isContainer: true },
|
|
256
|
+
},
|
|
257
|
+
{
|
|
258
|
+
key: 'loc',
|
|
259
|
+
patternType: 'xpath',
|
|
260
|
+
returnType: 'text',
|
|
261
|
+
patterns: ['.//loc/text()'],
|
|
262
|
+
},
|
|
263
|
+
],
|
|
172
264
|
});
|
|
173
265
|
```
|
|
174
266
|
|
|
175
|
-
###
|
|
267
|
+
### Workflow Automation
|
|
176
268
|
|
|
177
269
|
```typescript
|
|
178
|
-
const
|
|
179
|
-
version: '1.0'
|
|
270
|
+
const result = await browserAction.scrapeWithWorkflow({
|
|
271
|
+
version: '1.0',
|
|
180
272
|
actions: [
|
|
181
|
-
{ action: '
|
|
182
|
-
{ action: '
|
|
183
|
-
{ action: '
|
|
273
|
+
{ action: 'navigate', value: 'https://example.com/login' },
|
|
274
|
+
{ action: 'fill', target: { type: 'css', value: '#username' }, value: 'user' },
|
|
275
|
+
{ action: 'fill', target: { type: 'css', value: '#password' }, value: 'pass' },
|
|
276
|
+
{ action: 'click', target: { type: 'css', value: '[type=submit]' } },
|
|
277
|
+
{ action: 'saveCookies', value: 'user-session', options: { overwrite: true } },
|
|
278
|
+
{ id: 'title', action: 'extract', target: { type: 'css', value: 'h1' } },
|
|
184
279
|
],
|
|
185
|
-
};
|
|
280
|
+
});
|
|
186
281
|
```
|
|
187
282
|
|
|
188
283
|
### Stealth (CloakBrowser)
|
|
189
284
|
|
|
190
|
-
Local browsers launch through CloakBrowser stealth Chromium. Configure anti-detect
|
|
191
|
-
features via the `cloak` option:
|
|
192
|
-
|
|
193
285
|
```typescript
|
|
194
286
|
BrowserActionModule.forRoot({
|
|
195
287
|
cloak: {
|
|
196
288
|
proxy: { server: 'http://host:port', username: 'user', password: 'pass' },
|
|
197
|
-
humanize: true,
|
|
198
|
-
geoip: true,
|
|
199
|
-
timezone: 'America/New_York',
|
|
200
|
-
locale: 'en-US',
|
|
201
|
-
stealthArgs: true,
|
|
202
|
-
extensionPaths: ['/path/ext'], // load unpacked extensions
|
|
203
|
-
userDataDir: './profile', // persistent profile (launchPersistentContext)
|
|
204
|
-
launchOptions: { headless: true, args: ['--no-sandbox'] }, // raw puppeteer-core passthrough
|
|
289
|
+
humanize: true,
|
|
290
|
+
geoip: true,
|
|
291
|
+
timezone: 'America/New_York',
|
|
292
|
+
locale: 'en-US',
|
|
293
|
+
stealthArgs: true,
|
|
205
294
|
},
|
|
206
295
|
pool: { min: 2, max: 5 },
|
|
207
296
|
})
|
|
208
297
|
```
|
|
209
298
|
|
|
210
|
-
|
|
211
|
-
passthrough for backward compatibility. `cloak` is ignored when `remote` is set
|
|
212
|
-
(remote uses plain CDP connect).
|
|
299
|
+
### TLS Fingerprint Capture
|
|
213
300
|
|
|
214
|
-
|
|
215
|
-
dedicated off-pool browser with its own stealth config — useful for rotating proxies or
|
|
216
|
-
fingerprints across requests. Not supported in remote CDP mode.
|
|
301
|
+
Capture the browser's own TLS fingerprint for use with `nestjs-xpath-parser`'s CycleTLS engine:
|
|
217
302
|
|
|
218
303
|
```typescript
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
cloak: { proxy: { server: 'http://rotating-proxy:8080' } },
|
|
222
|
-
});
|
|
223
|
-
|
|
224
|
-
// workflow
|
|
225
|
-
await actions.scrapeWithWorkflow(url, {
|
|
226
|
-
version: '1.0',
|
|
227
|
-
cloak: { proxy: { server: 'http://rotating-proxy:8080' } },
|
|
228
|
-
actions: [...],
|
|
229
|
-
});
|
|
230
|
-
```
|
|
231
|
-
|
|
232
|
-
### Remote Chrome Connection
|
|
233
|
-
|
|
234
|
-
Connect to remote Chrome instances via Chrome DevTools Protocol (CDP):
|
|
235
|
-
|
|
236
|
-
```typescript
|
|
237
|
-
BrowserActionModule.forRoot({
|
|
238
|
-
remote: {
|
|
239
|
-
browserURL: 'http://localhost:9222', // Or use browserWSEndpoint
|
|
240
|
-
retryMax: 3, // Connection retry attempts
|
|
241
|
-
retryDelay: 1000, // Delay between retries (ms)
|
|
242
|
-
},
|
|
243
|
-
pool: { min: 2, max: 5 },
|
|
244
|
-
})
|
|
245
|
-
```
|
|
246
|
-
|
|
247
|
-
**Using browserWSEndpoint:**
|
|
248
|
-
|
|
249
|
-
```typescript
|
|
250
|
-
BrowserActionModule.forRoot({
|
|
251
|
-
remote: {
|
|
252
|
-
browserWSEndpoint: 'ws://localhost:9222/devtools/page/abc123',
|
|
253
|
-
},
|
|
254
|
-
})
|
|
255
|
-
```
|
|
256
|
-
|
|
257
|
-
**Remote-first priority:** When both `remote` and `launchOptions` are provided, remote connection takes precedence.
|
|
258
|
-
|
|
259
|
-
**See:** [Remote Chrome Configuration](./docs/api-reference.md#remote-chrome-configuration) for details.
|
|
260
|
-
|
|
261
|
-
## Services
|
|
262
|
-
|
|
263
|
-
| Service | Description |
|
|
264
|
-
|---------|-------------|
|
|
265
|
-
| **BrowserActionService** | High-level automation methods (scrape, screenshot, PDF, workflows) |
|
|
266
|
-
| **BrowserManagerService** | Browser pool management |
|
|
267
|
-
| **PageService** | Page lifecycle and navigation |
|
|
268
|
-
| **CookieService** | Cookie persistence |
|
|
269
|
-
| **CleansingService** | Data cleansing with pipes |
|
|
270
|
-
|
|
271
|
-
## Configuration
|
|
272
|
-
|
|
273
|
-
### Basic Configuration
|
|
274
|
-
|
|
275
|
-
```typescript
|
|
276
|
-
BrowserActionModule.forRoot({
|
|
277
|
-
pool: {
|
|
278
|
-
min: 2,
|
|
279
|
-
max: 10,
|
|
280
|
-
idleTimeoutMs: 30000, // reap idle browsers down to min (0 disables)
|
|
281
|
-
acquireTimeoutMs: 30000, // reject acquire() if none free in time (0 waits forever)
|
|
282
|
-
strategy: 'round-robin',
|
|
283
|
-
},
|
|
284
|
-
cookies: {
|
|
285
|
-
enabled: true,
|
|
286
|
-
cookiesDir: './cookies',
|
|
287
|
-
},
|
|
288
|
-
logLevel: 'log',
|
|
289
|
-
})
|
|
290
|
-
```
|
|
291
|
-
|
|
292
|
-
### All Options
|
|
293
|
-
|
|
294
|
-
See [Configuration Reference](./docs/api-reference.md#configuration) for complete options.
|
|
295
|
-
|
|
296
|
-
## Type Safety
|
|
297
|
-
|
|
298
|
-
Full TypeScript support with generics:
|
|
299
|
-
|
|
300
|
-
```typescript
|
|
301
|
-
// Type-safe selectors
|
|
302
|
-
interface ProductSelectors {
|
|
303
|
-
title: string;
|
|
304
|
-
price: number;
|
|
305
|
-
}
|
|
306
|
-
|
|
307
|
-
const result = await this.actionHelpers.scrape<ProductSelectors>(url, {
|
|
308
|
-
title: 'h1',
|
|
309
|
-
price: '.price',
|
|
310
|
-
});
|
|
311
|
-
|
|
312
|
-
// Type-safe workflow results
|
|
313
|
-
const workflow = await this.actionHelpers.scrapeWithWorkflow<{
|
|
314
|
-
title: string;
|
|
315
|
-
price: number;
|
|
316
|
-
}>(url, workflow);
|
|
304
|
+
const fingerprint = await browserAction.captureTlsFingerprint('./fingerprint.json');
|
|
305
|
+
// fingerprint.json can be passed to ScraperHtmlModule.forRoot({ fingerprint: './fingerprint.json' })
|
|
317
306
|
```
|
|
318
307
|
|
|
319
308
|
## Development
|
|
320
309
|
|
|
321
|
-
### Scripts
|
|
322
|
-
|
|
323
310
|
```bash
|
|
311
|
+
# Install dependencies
|
|
312
|
+
pnpm install
|
|
313
|
+
|
|
324
314
|
# Build
|
|
325
315
|
pnpm build
|
|
326
316
|
|
|
327
|
-
#
|
|
317
|
+
# Test
|
|
328
318
|
pnpm test
|
|
319
|
+
pnpm test:cov
|
|
329
320
|
|
|
330
|
-
# Lint
|
|
321
|
+
# Lint
|
|
331
322
|
pnpm lint
|
|
332
|
-
|
|
333
|
-
# Format code
|
|
334
323
|
pnpm format
|
|
335
324
|
```
|
|
336
325
|
|
|
337
|
-
|
|
326
|
+
## Contributing
|
|
338
327
|
|
|
339
|
-
|
|
340
|
-
|
|
328
|
+
1. Fork the repository
|
|
329
|
+
2. Create your feature branch (`git checkout -b feature/yourusername/amazing-feature`)
|
|
330
|
+
3. Commit your changes (`git commit -m 'Add some amazing feature'`)
|
|
331
|
+
4. Push to the branch (`git push origin feature/yourusername/amazing-feature`)
|
|
332
|
+
5. Open a Pull Request
|
|
341
333
|
|
|
342
334
|
## License
|
|
343
335
|
|
|
344
|
-
MIT
|
|
345
|
-
|
|
346
|
-
## Support
|
|
347
|
-
|
|
348
|
-
For issues and questions, please use [GitHub Issues](https://github.com/Hanivan/nestjs-browser-action/issues).
|
|
349
|
-
|
|
350
|
-
## Examples
|
|
351
|
-
|
|
352
|
-
Check out the test project for complete examples: [test-browser-action](https://github.com/Hanivan/test-browser-action)
|
|
353
|
-
|
|
354
|
-
---
|
|
355
|
-
|
|
356
|
-
**Documentation:**
|
|
357
|
-
- [Methods](./docs/methods) - Method-specific guides
|
|
358
|
-
- [Features](./docs/features) - Feature guides
|
|
359
|
-
- [API Reference](./docs/api-reference.md) - Complete API
|
|
360
|
-
- [Workflow Actions](./docs/workflow-actions.md) - Action reference
|
|
336
|
+
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
|
@@ -1,14 +1,9 @@
|
|
|
1
1
|
import { CleansingProfile } from '../enums/cleansing-profile.enum';
|
|
2
|
-
import type {
|
|
2
|
+
import type { CleanerStepRules } from '../pipes/pipe-engine';
|
|
3
3
|
export interface CleansingOptions {
|
|
4
|
-
pipes?:
|
|
4
|
+
pipes?: CleanerStepRules;
|
|
5
5
|
profile?: CleansingProfile;
|
|
6
6
|
}
|
|
7
|
-
export interface CleansingWithAltOptions {
|
|
8
|
-
primaryPipes: PipeConfig[];
|
|
9
|
-
fallbackPipes: PipeConfig[];
|
|
10
|
-
fallbackOn?: 'empty' | 'null' | 'undefined' | 'all';
|
|
11
|
-
}
|
|
12
7
|
export interface ScrapeCleansingOptions {
|
|
13
|
-
pipes?: Record<string,
|
|
8
|
+
pipes?: Record<string, CleanerStepRules>;
|
|
14
9
|
}
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import { CleansingType } from '../enums/cleansing-type.enum';
|
|
2
2
|
import type { CloakOptions } from './browser-action-options';
|
|
3
|
+
import type { CleanerStepRules } from '../pipes/pipe-engine';
|
|
3
4
|
export interface PipeConfig {
|
|
4
5
|
type: CleansingType | string;
|
|
5
6
|
pattern?: string;
|
|
@@ -12,7 +13,8 @@ export interface PipeConfig {
|
|
|
12
13
|
[key: string]: unknown;
|
|
13
14
|
}
|
|
14
15
|
export type SelectorMap = Record<string, string>;
|
|
15
|
-
export type PipeOptions = Record<string,
|
|
16
|
+
export type PipeOptions = Record<string, CleanerStepRules>;
|
|
17
|
+
export type { CleanerStepRules };
|
|
16
18
|
export type ScrapeResult = Partial<Record<string, unknown>>;
|
|
17
19
|
export type ScrapeAllResult = Partial<Record<string, unknown[]>>;
|
|
18
20
|
export type ScraperOptions = {
|
|
@@ -63,4 +65,33 @@ export interface ContainerScrapeResult<T = Record<string, unknown>> {
|
|
|
63
65
|
items: T[];
|
|
64
66
|
pagination?: PaginationResult;
|
|
65
67
|
}
|
|
68
|
+
export interface PatternMeta {
|
|
69
|
+
multiple?: boolean | string;
|
|
70
|
+
multiline?: boolean;
|
|
71
|
+
alterPattern?: string[];
|
|
72
|
+
isContainer?: boolean;
|
|
73
|
+
isPage?: boolean;
|
|
74
|
+
pageUrlKey?: string;
|
|
75
|
+
pageTextKey?: string;
|
|
76
|
+
}
|
|
77
|
+
export interface PatternField {
|
|
78
|
+
key: string;
|
|
79
|
+
patternType: 'xpath' | 'css';
|
|
80
|
+
returnType: 'text' | 'rawHTML' | 'html';
|
|
81
|
+
patterns: string[];
|
|
82
|
+
meta?: PatternMeta;
|
|
83
|
+
pipes?: CleanerStepRules;
|
|
84
|
+
}
|
|
85
|
+
export interface EvaluateOptions {
|
|
86
|
+
url?: string;
|
|
87
|
+
patterns: PatternField[];
|
|
88
|
+
waitUntil?: 'load' | 'domcontentloaded' | 'networkidle0' | 'networkidle2';
|
|
89
|
+
timeout?: number;
|
|
90
|
+
cloak?: CloakOptions;
|
|
91
|
+
interceptResource?: boolean;
|
|
92
|
+
useRandomUserAgent?: boolean;
|
|
93
|
+
}
|
|
94
|
+
export interface EvaluateResult<T = Record<string, unknown>> {
|
|
95
|
+
results: T[];
|
|
96
|
+
}
|
|
66
97
|
export type { ActionTarget, ActionType, ActionOptions, ActionCondition, ErrorStrategy, WorkflowAction, WorkflowDefinition, WorkflowResult, VariableContext, } from './workflow-options';
|