@purepageio/fetch-engines 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +73 -25
- package/package.json +2 -2
package/README.md
CHANGED
|
@@ -14,7 +14,7 @@ npm install @purepageio/fetch-engines
|
|
|
14
14
|
yarn add @purepageio/fetch-engines
|
|
15
15
|
```
|
|
16
16
|
|
|
17
|
-
If you plan to use the `PlaywrightEngine`, you also need to install Playwright's browser binaries:
|
|
17
|
+
If you plan to use the `PlaywrightEngine` or `HybridEngine`, you also need to install Playwright's browser binaries:
|
|
18
18
|
|
|
19
19
|
```bash
|
|
20
20
|
pnpm exec playwright install
|
|
@@ -25,7 +25,8 @@ npx playwright install
|
|
|
25
25
|
## Engines
|
|
26
26
|
|
|
27
27
|
- **`FetchEngine`**: Uses the standard `fetch` API. Suitable for simple HTML pages or APIs returning HTML. Lightweight and fast.
|
|
28
|
-
- **`PlaywrightEngine`**: Uses Playwright to control a headless
|
|
28
|
+
- **`PlaywrightEngine`**: Uses Playwright to control a managed pool of headless browsers (Chromium by default via `playwright-extra`). Handles JavaScript rendering, complex interactions, and provides automatic stealth/anti-bot detection measures. More resource-intensive but necessary for dynamic websites.
|
|
29
|
+
- **`HybridEngine`**: A smart combination. It first attempts to fetch content using the lightweight `FetchEngine`. If that fails for *any* reason (e.g., network error, non-HTML content, HTTP error like 403), it automatically falls back to using the `PlaywrightEngine`. This provides the speed of `FetchEngine` for simple sites while retaining the power of `PlaywrightEngine` for complex ones.
|
|
29
30
|
|
|
30
31
|
## Basic Usage
|
|
31
32
|
|
|
@@ -40,7 +41,7 @@ async function main() {
|
|
|
40
41
|
try {
|
|
41
42
|
const url = "https://example.com";
|
|
42
43
|
const result = await engine.fetchHTML(url);
|
|
43
|
-
console.log(`Fetched ${result.url}`);
|
|
44
|
+
console.log(`Fetched ${result.url} (Status: ${result.statusCode})`);
|
|
44
45
|
console.log(`Title: ${result.title}`);
|
|
45
46
|
// console.log(`HTML: ${result.html.substring(0, 200)}...`);
|
|
46
47
|
} catch (error) {
|
|
@@ -67,7 +68,7 @@ async function main() {
|
|
|
67
68
|
try {
|
|
68
69
|
const url = "https://quotes.toscrape.com/"; // A site that might benefit from JS rendering
|
|
69
70
|
const result = await engine.fetchHTML(url);
|
|
70
|
-
console.log(`Fetched ${result.url}`);
|
|
71
|
+
console.log(`Fetched ${result.url} (Status: ${result.statusCode})`);
|
|
71
72
|
console.log(`Title: ${result.title}`);
|
|
72
73
|
// console.log(`HTML: ${result.html.substring(0, 200)}...`);
|
|
73
74
|
} catch (error) {
|
|
@@ -81,6 +82,41 @@ async function main() {
|
|
|
81
82
|
main();
|
|
82
83
|
```
|
|
83
84
|
|
|
85
|
+
### HybridEngine
|
|
86
|
+
|
|
87
|
+
```typescript
|
|
88
|
+
import { HybridEngine } from '@purepageio/fetch-engines';
|
|
89
|
+
|
|
90
|
+
// Configure the underlying PlaywrightEngine (optional)
|
|
91
|
+
const engine = new HybridEngine({
|
|
92
|
+
maxRetries: 2, // PlaywrightEngine retry config
|
|
93
|
+
maxBrowsers: 3, // PlaywrightEngine pool config
|
|
94
|
+
// FetchEngine part has no config
|
|
95
|
+
});
|
|
96
|
+
|
|
97
|
+
async function main() {
|
|
98
|
+
try {
|
|
99
|
+
// Try a simple site (likely uses FetchEngine)
|
|
100
|
+
const url1 = 'https://example.com';
|
|
101
|
+
const result1 = await engine.fetchHTML(url1);
|
|
102
|
+
console.log(`Fetched ${result1.url} (Status: ${result1.statusCode}) - Title: ${result1.title}`);
|
|
103
|
+
|
|
104
|
+
// Try a complex site (likely falls back to PlaywrightEngine)
|
|
105
|
+
const url2 = 'https://quotes.toscrape.com/';
|
|
106
|
+
const result2 = await engine.fetchHTML(url2);
|
|
107
|
+
console.log(`Fetched ${result2.url} (Status: ${result2.statusCode}) - Title: ${result2.title}`);
|
|
108
|
+
|
|
109
|
+
} catch (error) {
|
|
110
|
+
console.error("Hybrid fetch failed:", error);
|
|
111
|
+
} finally {
|
|
112
|
+
// Important: Clean up browser resources (for the Playwright part) when done
|
|
113
|
+
await engine.cleanup();
|
|
114
|
+
}
|
|
115
|
+
}
|
|
116
|
+
|
|
117
|
+
main();
|
|
118
|
+
```
|
|
119
|
+
|
|
84
120
|
## Configuration
|
|
85
121
|
|
|
86
122
|
Engines accept an optional configuration object in their constructor to customize behavior.
|
|
@@ -91,50 +127,62 @@ The `FetchEngine` currently has **no configurable options** via its constructor.
|
|
|
91
127
|
|
|
92
128
|
### PlaywrightEngine
|
|
93
129
|
|
|
94
|
-
The `PlaywrightEngine`
|
|
130
|
+
The `PlaywrightEngine` accepts a `PlaywrightEngineConfig` object. See the detailed options below:
|
|
95
131
|
|
|
96
132
|
**General Options:**
|
|
97
133
|
|
|
98
134
|
- `concurrentPages` (`number`, default: `3`)
|
|
99
135
|
- Maximum number of Playwright pages to process concurrently across all browser instances.
|
|
100
136
|
- `maxRetries` (`number`, default: `3`)
|
|
101
|
-
- Maximum number of retry attempts for a failed fetch operation (excluding initial attempt).
|
|
137
|
+
- Maximum number of retry attempts for a failed Playwright fetch operation (excluding initial attempt).
|
|
102
138
|
- `retryDelay` (`number`, default: `5000`)
|
|
103
|
-
- Delay in milliseconds between retry attempts.
|
|
139
|
+
- Delay in milliseconds between Playwright retry attempts.
|
|
104
140
|
- `cacheTTL` (`number`, default: `900000` (15 minutes))
|
|
105
|
-
- Time-to-live for cached results in milliseconds. Set to `0` to disable the in-memory cache.
|
|
141
|
+
- Time-to-live for cached results in milliseconds. Set to `0` to disable the in-memory cache. Affects both HTTP fallback and Playwright results.
|
|
106
142
|
- `useHttpFallback` (`boolean`, default: `true`)
|
|
107
143
|
- If `true`, the engine first attempts a simple, fast HTTP GET request. If this fails or appears to receive a challenge/CAPTCHA page, it then proceeds with a full Playwright browser request.
|
|
108
144
|
- `useHeadedModeFallback` (`boolean`, default: `false`)
|
|
109
|
-
- If `true` and a Playwright request fails (potentially due to bot detection), subsequent requests
|
|
145
|
+
- If `true` and a Playwright request fails (potentially due to bot detection), subsequent Playwright requests *to that specific domain* will automatically use a headed (visible) browser instance.
|
|
110
146
|
- `defaultFastMode` (`boolean`, default: `true`)
|
|
111
|
-
- If `true`, requests initially run in "fast mode", blocking non-essential resources
|
|
147
|
+
- If `true`, Playwright requests initially run in "fast mode", blocking non-essential resources and skipping human behavior simulation. Can be overridden per-request via `fetchHTML` options.
|
|
112
148
|
- `simulateHumanBehavior` (`boolean`, default: `true`)
|
|
113
|
-
- If `true` and the request is
|
|
149
|
+
- If `true` and the Playwright request is *not* in `fastMode`, the engine attempts basic human-like interactions. *Note: This simulation is currently basic.*
|
|
114
150
|
|
|
115
|
-
**Browser Pool Options:**
|
|
116
|
-
|
|
117
|
-
These options are passed down to configure the underlying `PlaywrightBrowserPool` that manages browser instances.
|
|
151
|
+
**Browser Pool Options (Passed to internal `PlaywrightBrowserPool`):**
|
|
118
152
|
|
|
119
153
|
- `maxBrowsers` (`number`, default: `2`)
|
|
120
|
-
- Maximum number of concurrent browser instances
|
|
154
|
+
- Maximum number of concurrent browser instances the pool will manage.
|
|
121
155
|
- `maxPagesPerContext` (`number`, default: `6`)
|
|
122
|
-
- Maximum number of pages
|
|
156
|
+
- Maximum number of pages per browser context before recycling.
|
|
123
157
|
- `maxBrowserAge` (`number`, default: `1200000` (20 minutes))
|
|
124
|
-
- Maximum age in milliseconds a browser instance
|
|
158
|
+
- Maximum age in milliseconds a browser instance lives before recycling.
|
|
125
159
|
- `healthCheckInterval` (`number`, default: `60000` (1 minute))
|
|
126
|
-
- How often (in milliseconds) the pool checks
|
|
160
|
+
- How often (in milliseconds) the pool checks browser health.
|
|
127
161
|
- `useHeadedMode` (`boolean`, default: `false`)
|
|
128
|
-
- Forces the
|
|
162
|
+
- Forces the *entire* browser pool to launch browsers in headed (visible) mode.
|
|
129
163
|
- `poolBlockedDomains` (`string[]`, default: `[]` - uses pool's internal defaults)
|
|
130
|
-
- List of domain
|
|
164
|
+
- List of domain *glob patterns* to block browser requests to.
|
|
131
165
|
- `poolBlockedResourceTypes` (`string[]`, default: `[]` - uses pool's internal defaults)
|
|
132
|
-
- List of Playwright resource types (e.g., `image`, `
|
|
166
|
+
- List of Playwright resource types (e.g., `image`, `font`) to block.
|
|
133
167
|
- `proxy` (`object | undefined`, default: `undefined`)
|
|
134
|
-
- Proxy configuration
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
168
|
+
- Proxy configuration for browser instances (`server`, `username?`, `password?`).
|
|
169
|
+
|
|
170
|
+
### HybridEngine
|
|
171
|
+
|
|
172
|
+
The `HybridEngine` constructor accepts a single optional argument: `playwrightConfig`. This object follows the **`PlaywrightEngineConfig`** structure described above.
|
|
173
|
+
|
|
174
|
+
```typescript
|
|
175
|
+
import { HybridEngine } from '@purepageio/fetch-engines';
|
|
176
|
+
|
|
177
|
+
const engine = new HybridEngine({
|
|
178
|
+
// These options configure the PlaywrightEngine used for fallbacks
|
|
179
|
+
maxRetries: 1,
|
|
180
|
+
maxBrowsers: 1,
|
|
181
|
+
cacheTTL: 0 // Disable caching in the Playwright part
|
|
182
|
+
});
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
The internal `FetchEngine` used by `HybridEngine` is *not* configurable.
|
|
138
186
|
|
|
139
187
|
## Return Value
|
|
140
188
|
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@purepageio/fetch-engines",
|
|
3
|
-
"version": "0.1.0",
|
|
3
|
+
"version": "0.1.2",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"description": "A collection of configurable engines for fetching HTML content using fetch or Playwright.",
|
|
6
6
|
"main": "dist/index.js",
|
|
@@ -43,7 +43,7 @@
|
|
|
43
43
|
},
|
|
44
44
|
"repository": {
|
|
45
45
|
"type": "git",
|
|
46
|
-
"url": "git+https://github.com/
|
|
46
|
+
"url": "git+https://github.com/purepage/fetch-engines"
|
|
47
47
|
},
|
|
48
48
|
"keywords": [
|
|
49
49
|
"fetch",
|