@purepageio/fetch-engines 0.1.4 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +203 -102
- package/dist/FetchEngine.d.ts +9 -8
- package/dist/FetchEngine.d.ts.map +1 -1
- package/dist/FetchEngine.js +54 -77
- package/dist/FetchEngine.js.map +1 -1
- package/dist/HybridEngine.d.ts +13 -7
- package/dist/HybridEngine.d.ts.map +1 -1
- package/dist/HybridEngine.js +37 -17
- package/dist/HybridEngine.js.map +1 -1
- package/dist/PlaywrightEngine.d.ts +4 -2
- package/dist/PlaywrightEngine.d.ts.map +1 -1
- package/dist/PlaywrightEngine.js +97 -60
- package/dist/PlaywrightEngine.js.map +1 -1
- package/dist/types.d.ts +27 -11
- package/dist/types.d.ts.map +1 -1
- package/dist/utils/markdown-converter.d.ts +31 -0
- package/dist/utils/markdown-converter.d.ts.map +1 -0
- package/dist/utils/markdown-converter.js +794 -0
- package/dist/utils/markdown-converter.js.map +1 -0
- package/package.json +6 -2
package/README.md
CHANGED
|
@@ -1,8 +1,53 @@
|
|
|
1
1
|
# @purepageio/fetch-engines
|
|
2
2
|
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
3
|
+
[](https://www.npmjs.com/package/@purepageio/fetch-engines)
|
|
4
|
+
[](https://github.com/purepageio/fetch-engines/actions/workflows/publish.yml) <!-- Assuming publish.yml is the workflow filename -->
|
|
5
|
+
[](https://opensource.org/licenses/MIT)
|
|
6
|
+
|
|
7
|
+
Fetching web content can be complex. You need to handle static HTML, dynamic JavaScript-driven sites, network errors, retries, caching, and potential bot detection measures. Managing browser automation tools like Playwright adds another layer of complexity with resource pooling and stealth configurations.
|
|
8
|
+
|
|
9
|
+
`@purepageio/fetch-engines` simplifies this entire process by providing a set of robust, configurable, and easy-to-use engines for retrieving web page content.
|
|
10
|
+
|
|
11
|
+
**Why use `@purepageio/fetch-engines`?**
|
|
12
|
+
|
|
13
|
+
- **Unified API:** Get content from simple or complex sites using the same `fetchHTML(url, options?)` method.
|
|
14
|
+
- **Flexible Strategies:** Choose the right tool for the job:
|
|
15
|
+
- `FetchEngine`: Lightweight and fast for static HTML, using the standard `fetch` API.
|
|
16
|
+
- `PlaywrightEngine`: Powerful browser automation for JavaScript-heavy sites, handling rendering and interactions.
|
|
17
|
+
- `HybridEngine`: The best of both worlds – tries `FetchEngine` first for speed, automatically falls back to `PlaywrightEngine` for reliability on complex pages.
|
|
18
|
+
- **Robust & Resilient:** Built-in caching, configurable retries, and standardized error handling make your fetching logic more dependable.
|
|
19
|
+
- **Simplified Automation:** `PlaywrightEngine` manages browser instances and contexts automatically through efficient pooling and includes integrated stealth measures to bypass common anti-bot systems.
|
|
20
|
+
- **Content Transformation:** Optionally convert fetched HTML directly to clean Markdown content.
|
|
21
|
+
- **TypeScript Ready:** Fully typed for a better development experience.
|
|
22
|
+
|
|
23
|
+
This package provides a high-level abstraction, letting you focus on using the web content rather than the intricacies of fetching it.
|
|
24
|
+
|
|
25
|
+
## Table of Contents
|
|
26
|
+
|
|
27
|
+
- [Features](#features)
|
|
28
|
+
- [Installation](#installation)
|
|
29
|
+
- [Engines](#engines)
|
|
30
|
+
- [Basic Usage](#basic-usage)
|
|
31
|
+
- [Configuration](#configuration)
|
|
32
|
+
- [Return Value](#return-value)
|
|
33
|
+
- [API Reference](#api-reference)
|
|
34
|
+
- [Stealth / Anti-Detection (`PlaywrightEngine`)](#stealth--anti-detection-playwrightengine)
|
|
35
|
+
- [Error Handling](#error-handling)
|
|
36
|
+
- [Contributing](#contributing)
|
|
37
|
+
- [License](#license)
|
|
38
|
+
|
|
39
|
+
## Features
|
|
40
|
+
|
|
41
|
+
- **Multiple Fetching Strategies:** Choose between `FetchEngine` (lightweight `fetch`), `PlaywrightEngine` (robust JS rendering via Playwright), or `HybridEngine` (smart fallback).
|
|
42
|
+
- **Unified API:** Simple `fetchHTML(url, options?)` interface across all engines.
|
|
43
|
+
- **Configurable Retries:** Automatic retries on failure with customizable attempts and delays.
|
|
44
|
+
- **Built-in Caching:** In-memory caching with configurable TTL to reduce redundant fetches.
|
|
45
|
+
- **Playwright Stealth:** Automatic integration of `playwright-extra` and stealth plugins to bypass common bot detection.
|
|
46
|
+
- **Managed Browser Pooling:** Efficient resource management for `PlaywrightEngine` with configurable browser/context limits and lifecycles.
|
|
47
|
+
- **Smart Fallbacks:** `HybridEngine` uses `FetchEngine` first, falling back to `PlaywrightEngine` only when needed. `PlaywrightEngine` can optionally use a fast HTTP fetch before launching a full browser.
|
|
48
|
+
- **Content Conversion:** Optionally convert fetched HTML directly to Markdown.
|
|
49
|
+
- **Standardized Errors:** Custom `FetchError` classes provide context on failures.
|
|
50
|
+
- **TypeScript Ready:** Fully typed codebase for enhanced developer experience.
|
|
6
51
|
|
|
7
52
|
## Installation
|
|
8
53
|
|
|
@@ -35,20 +80,25 @@ npx playwright install
|
|
|
35
80
|
```typescript
|
|
36
81
|
import { FetchEngine } from "@purepageio/fetch-engines";
|
|
37
82
|
|
|
38
|
-
const engine = new FetchEngine();
|
|
83
|
+
const engine = new FetchEngine(); // Default: fetches HTML
|
|
39
84
|
|
|
40
85
|
async function main() {
|
|
41
86
|
try {
|
|
42
87
|
const url = "https://example.com";
|
|
43
88
|
const result = await engine.fetchHTML(url);
|
|
44
|
-
console.log(`Fetched ${result.url} (
|
|
89
|
+
console.log(`Fetched ${result.url} (ContentType: ${result.contentType})`);
|
|
45
90
|
console.log(`Title: ${result.title}`);
|
|
46
|
-
|
|
91
|
+
console.log(`Content (HTML): ${result.content.substring(0, 100)}...`);
|
|
92
|
+
|
|
93
|
+
// Example fetching Markdown directly via constructor option
|
|
94
|
+
const markdownEngine = new FetchEngine({ markdown: true });
|
|
95
|
+
const mdResult = await markdownEngine.fetchHTML(url);
|
|
96
|
+
console.log(`\nFetched ${mdResult.url} (ContentType: ${mdResult.contentType})`);
|
|
97
|
+
console.log(`Content (Markdown):\n${mdResult.content.substring(0, 300)}...`);
|
|
47
98
|
} catch (error) {
|
|
48
99
|
console.error("Fetch failed:", error);
|
|
49
100
|
}
|
|
50
101
|
}
|
|
51
|
-
|
|
52
102
|
main();
|
|
53
103
|
```
|
|
54
104
|
|
|
@@ -57,28 +107,28 @@ main();
|
|
|
57
107
|
```typescript
|
|
58
108
|
import { PlaywrightEngine } from "@purepageio/fetch-engines";
|
|
59
109
|
|
|
60
|
-
//
|
|
61
|
-
const engine = new PlaywrightEngine({
|
|
62
|
-
maxRetries: 2, // Number of retry attempts
|
|
63
|
-
useHttpFallback: true, // Try simple HTTP fetch first
|
|
64
|
-
cacheTTL: 5 * 60 * 1000, // Cache results for 5 minutes (in milliseconds)
|
|
65
|
-
});
|
|
110
|
+
// Engine configured to fetch HTML by default
|
|
111
|
+
const engine = new PlaywrightEngine({ markdown: false });
|
|
66
112
|
|
|
67
113
|
async function main() {
|
|
68
114
|
try {
|
|
69
|
-
const url = "https://quotes.toscrape.com/";
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
console.log(`
|
|
73
|
-
|
|
115
|
+
const url = "https://quotes.toscrape.com/";
|
|
116
|
+
|
|
117
|
+
// Example: Fetching as Markdown using per-request override
|
|
118
|
+
console.log(`Fetching ${url} as Markdown...`);
|
|
119
|
+
const mdResult = await engine.fetchHTML(url, { markdown: true });
|
|
120
|
+
console.log(`Fetched ${mdResult.url} (ContentType: ${mdResult.contentType}) - Title: ${mdResult.title}`);
|
|
121
|
+
console.log(`Content (Markdown):\n${mdResult.content.substring(0, 300)}...`);
|
|
122
|
+
|
|
123
|
+
// You could also fetch as HTML by default:
|
|
124
|
+
// const htmlResult = await engine.fetchHTML(url);
|
|
125
|
+
// console.log(`\nFetched ${htmlResult.url} (ContentType: ${htmlResult.contentType}) - Title: ${htmlResult.title}`);
|
|
74
126
|
} catch (error) {
|
|
75
127
|
console.error("Playwright fetch failed:", error);
|
|
76
128
|
} finally {
|
|
77
|
-
// Important: Clean up browser resources when done
|
|
78
129
|
await engine.cleanup();
|
|
79
130
|
}
|
|
80
131
|
}
|
|
81
|
-
|
|
82
132
|
main();
|
|
83
133
|
```
|
|
84
134
|
|
|
@@ -87,32 +137,35 @@ main();
|
|
|
87
137
|
```typescript
|
|
88
138
|
import { HybridEngine } from "@purepageio/fetch-engines";
|
|
89
139
|
|
|
90
|
-
//
|
|
91
|
-
const engine = new HybridEngine({
|
|
92
|
-
maxRetries: 2, // PlaywrightEngine retry config
|
|
93
|
-
maxBrowsers: 3, // PlaywrightEngine pool config
|
|
94
|
-
// FetchEngine part has no config
|
|
95
|
-
});
|
|
140
|
+
// Engine configured to fetch HTML by default for both internal engines
|
|
141
|
+
const engine = new HybridEngine({ markdown: false });
|
|
96
142
|
|
|
97
143
|
async function main() {
|
|
98
144
|
try {
|
|
99
|
-
|
|
100
|
-
const
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
//
|
|
105
|
-
const
|
|
106
|
-
|
|
107
|
-
console.log(`
|
|
145
|
+
const url1 = "https://example.com"; // Simple site
|
|
146
|
+
const url2 = "https://quotes.toscrape.com/"; // Complex site
|
|
147
|
+
|
|
148
|
+
// --- Scenario 1: FetchEngine Succeeds ---
|
|
149
|
+
console.log(`\nFetching simple site (${url1}) requesting Markdown...`);
|
|
150
|
+
// FetchEngine uses its constructor config (markdown: false), ignoring the per-request option.
|
|
151
|
+
const result1 = await engine.fetchHTML(url1, { markdown: true });
|
|
152
|
+
console.log(`Fetched ${result1.url} (ContentType: ${result1.contentType}) - Title: ${result1.title}`);
|
|
153
|
+
console.log(`Content is ${result1.contentType} because FetchEngine succeeded and used its own config.`);
|
|
154
|
+
console.log(`${result1.content.substring(0, 300)}...`);
|
|
155
|
+
|
|
156
|
+
// --- Scenario 2: FetchEngine Fails, Playwright Fallback Occurs ---
|
|
157
|
+
console.log(`\nFetching complex site (${url2}) requesting Markdown...`);
|
|
158
|
+
// Assume FetchEngine fails for url2. PlaywrightEngine will be used and *will* receive the markdown: true override.
|
|
159
|
+
const result2 = await engine.fetchHTML(url2, { markdown: true });
|
|
160
|
+
console.log(`Fetched ${result2.url} (ContentType: ${result2.contentType}) - Title: ${result2.title}`);
|
|
161
|
+
console.log(`Content is ${result2.contentType} because Playwright fallback used the per-request option.`);
|
|
162
|
+
console.log(`${result2.content.substring(0, 300)}...`);
|
|
108
163
|
} catch (error) {
|
|
109
164
|
console.error("Hybrid fetch failed:", error);
|
|
110
165
|
} finally {
|
|
111
|
-
// Important: Clean up browser resources (for the Playwright part) when done
|
|
112
166
|
await engine.cleanup();
|
|
113
167
|
}
|
|
114
168
|
}
|
|
115
|
-
|
|
116
169
|
main();
|
|
117
170
|
```
|
|
118
171
|
|
|
@@ -122,99 +175,93 @@ Engines accept an optional configuration object in their constructor to customis
|
|
|
122
175
|
|
|
123
176
|
### FetchEngine
|
|
124
177
|
|
|
125
|
-
The `FetchEngine`
|
|
178
|
+
The `FetchEngine` accepts a `FetchEngineOptions` object with the following properties:
|
|
179
|
+
|
|
180
|
+
| Option | Type | Default | Description |
|
|
181
|
+
| ---------- | --------- | ------- | ------------------------------------------------------------------------------------------------------ |
|
|
182
|
+
| `markdown` | `boolean` | `false` | If `true`, converts fetched HTML to Markdown. `contentType` in the result will be set to `'markdown'`. |
|
|
183
|
+
|
|
184
|
+
```typescript
|
|
185
|
+
// Example: Always convert to Markdown
|
|
186
|
+
const mdFetchEngine = new FetchEngine({ markdown: true });
|
|
187
|
+
```
|
|
126
188
|
|
|
127
189
|
### PlaywrightEngine
|
|
128
190
|
|
|
129
|
-
The `PlaywrightEngine` accepts a `PlaywrightEngineConfig` object
|
|
191
|
+
The `PlaywrightEngine` accepts a `PlaywrightEngineConfig` object with the following properties:
|
|
130
192
|
|
|
131
193
|
**General Options:**
|
|
132
194
|
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
- If `true` and a Playwright request fails (potentially due to bot detection), subsequent Playwright requests _to that specific domain_ will automatically use a headed (visible) browser instance.
|
|
145
|
-
- `defaultFastMode` (`boolean`, default: `true`)
|
|
146
|
-
- If `true`, Playwright requests initially run in "fast mode", blocking non-essential resources and skipping human behavior simulation. Can be overridden per-request via `fetchHTML` options.
|
|
147
|
-
- `simulateHumanBehavior` (`boolean`, default: `true`)
|
|
148
|
-
- If `true` and the Playwright request is _not_ in `fastMode`, the engine attempts basic human-like interactions. _Note: This simulation is currently basic._
|
|
195
|
+
| Option | Type | Default | Description |
|
|
196
|
+
| ----------------------- | --------- | -------- | ----------------------------------------------------------------------------------------------------------------------------------------- |
|
|
197
|
+
| `markdown` | `boolean` | `false` | If `true`, converts content (from Playwright or fallback) to Markdown. `contentType` will be `'markdown'`. Can be overridden per-request. |
|
|
198
|
+
| `useHttpFallback` | `boolean` | `true` | If `true`, attempts a fast HTTP fetch before using Playwright. |
|
|
199
|
+
| `useHeadedModeFallback` | `boolean` | `false` | If `true`, automatically retries specific failed domains in headed (visible) mode. |
|
|
200
|
+
| `defaultFastMode` | `boolean` | `true` | If `true`, initially blocks non-essential resources and skips human simulation. Can be overridden per-request. |
|
|
201
|
+
| `simulateHumanBehavior` | `boolean` | `true` | If `true` (and not `fastMode`), attempts basic human-like interactions. |
|
|
202
|
+
| `concurrentPages` | `number` | `3` | Max number of pages to process concurrently within the engine queue. |
|
|
203
|
+
| `maxRetries` | `number` | `3` | Max retry attempts for a failed fetch (excluding initial try). |
|
|
204
|
+
| `retryDelay` | `number` | `5000` | Delay (ms) between retries. |
|
|
205
|
+
| `cacheTTL` | `number` | `900000` | Cache Time-To-Live (ms). `0` disables caching. (15 mins default) |
|
|
149
206
|
|
|
150
207
|
**Browser Pool Options (Passed to internal `PlaywrightBrowserPool`):**
|
|
151
208
|
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
- `poolBlockedDomains` (`string[]`, default: `[]` - uses pool's internal defaults)
|
|
163
|
-
- List of domain _glob patterns_ to block browser requests to.
|
|
164
|
-
- `poolBlockedResourceTypes` (`string[]`, default: `[]` - uses pool's internal defaults)
|
|
165
|
-
- List of Playwright resource types (e.g., `image`, `font`) to block.
|
|
166
|
-
- `proxy` (`object | undefined`, default: `undefined`)
|
|
167
|
-
- Proxy configuration for browser instances (`server`, `username?`, `password?`).
|
|
209
|
+
| Option | Type | Default | Description |
|
|
210
|
+
| -------------------------- | -------------------------- | ----------- | ------------------------------------------------------------------------- |
|
|
211
|
+
| `maxBrowsers` | `number` | `2` | Max concurrent browser instances managed by the pool. |
|
|
212
|
+
| `maxPagesPerContext` | `number` | `6` | Max pages per browser context before recycling. |
|
|
213
|
+
| `maxBrowserAge` | `number` | `1200000` | Max age (ms) a browser instance lives before recycling. (20 mins default) |
|
|
214
|
+
| `healthCheckInterval` | `number` | `60000` | How often (ms) the pool checks browser health. (1 min default) |
|
|
215
|
+
| `useHeadedMode` | `boolean` | `false` | Forces the _entire pool_ to launch browsers in headed (visible) mode. |
|
|
216
|
+
| `poolBlockedDomains` | `string[]` | `[]` | List of domain glob patterns to block requests to. |
|
|
217
|
+
| `poolBlockedResourceTypes` | `string[]` | `[]` | List of Playwright resource types (e.g., 'image', 'font') to block. |
|
|
218
|
+
| `proxy` | `{ server: string, ... }?` | `undefined` | Proxy configuration object (see `PlaywrightEngineConfig` type). |
|
|
168
219
|
|
|
169
220
|
### HybridEngine
|
|
170
221
|
|
|
171
|
-
The `HybridEngine` constructor accepts a single optional argument
|
|
222
|
+
The `HybridEngine` constructor accepts a single optional argument which uses the **`PlaywrightEngineConfig`** structure (see the `PlaywrightEngine` tables above). These options configure the underlying engines where applicable:
|
|
172
223
|
|
|
173
|
-
|
|
174
|
-
|
|
224
|
+
- Options like `maxRetries`, `cacheTTL`, `proxy`, `maxBrowsers`, etc., are primarily passed to the internal `PlaywrightEngine`.
|
|
225
|
+
- The `markdown` setting in the constructor (`boolean`, default: `false`) applies to **both** internal engines by default.
|
|
226
|
+
- If you provide `markdown: true` in the `options` object when calling `fetchHTML`, this override **only applies if a fallback to `PlaywrightEngine` is necessary**. The `FetchEngine` part will always use the `markdown` setting provided in the `HybridEngine` constructor.
|
|
175
227
|
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
maxRetries: 1,
|
|
179
|
-
maxBrowsers: 1,
|
|
180
|
-
cacheTTL: 0, // Disable caching in the Playwright part
|
|
181
|
-
});
|
|
228
|
+
```typescript
|
|
229
|
+
// ... (HybridEngine examples remain the same) ...
|
|
182
230
|
```
|
|
183
231
|
|
|
184
|
-
The internal `FetchEngine` used by `HybridEngine` is _not_ configurable.
|
|
185
|
-
|
|
186
232
|
## Return Value
|
|
187
233
|
|
|
188
|
-
|
|
234
|
+
All `fetchHTML()` methods return a Promise that resolves to an `HTMLFetchResult` object:
|
|
189
235
|
|
|
190
|
-
- `
|
|
191
|
-
- `
|
|
192
|
-
- `
|
|
193
|
-
- `
|
|
194
|
-
- `
|
|
195
|
-
- `
|
|
236
|
+
- `content` (`string`): The fetched content, either original HTML or converted Markdown.
|
|
237
|
+
- `contentType` (`'html' | 'markdown'`): Indicates the format of the `content` string.
|
|
238
|
+
- `title` (`string | null`): Extracted page title (from original HTML).
|
|
239
|
+
- `url` (`string`): Final URL after redirects.
|
|
240
|
+
- `isFromCache` (`boolean`): True if the result came from cache.
|
|
241
|
+
- `statusCode` (`number | undefined`): HTTP status code.
|
|
242
|
+
- `error` (`Error | undefined`): Error object if the fetch failed after all retries. It's generally recommended to rely on catching thrown errors for failure handling.
|
|
196
243
|
|
|
197
244
|
## API Reference
|
|
198
245
|
|
|
199
246
|
### `engine.fetchHTML(url, options?)`
|
|
200
247
|
|
|
201
|
-
- `url` (`string`):
|
|
202
|
-
- `options
|
|
203
|
-
-
|
|
204
|
-
-
|
|
205
|
-
- **Returns:** `Promise<
|
|
248
|
+
- `url` (`string`): URL to fetch.
|
|
249
|
+
- `options?` (`FetchOptions`): Optional per-request overrides.
|
|
250
|
+
- `markdown?: boolean`: (Playwright/Hybrid only) Request Markdown conversion. For Hybrid, only applies on fallback to Playwright.
|
|
251
|
+
- `fastMode?: boolean`: (Playwright/Hybrid only) Override fast mode.
|
|
252
|
+
- **Returns:** `Promise<HTMLFetchResult>`
|
|
206
253
|
|
|
207
|
-
Fetches
|
|
254
|
+
Fetches content, returning HTML or Markdown based on configuration/options in `result.content` with `result.contentType` indicating the format.
|
|
208
255
|
|
|
209
|
-
### `engine.cleanup()` (PlaywrightEngine
|
|
256
|
+
### `engine.cleanup()` (PlaywrightEngine & HybridEngine)
|
|
210
257
|
|
|
211
258
|
- **Returns:** `Promise<void>`
|
|
212
259
|
|
|
213
|
-
Gracefully shuts down all browser instances managed by the `PlaywrightEngine`'s browser pool. **It is crucial to call `await engine.cleanup()` when you are finished using
|
|
260
|
+
Gracefully shuts down all browser instances managed by the `PlaywrightEngine`'s browser pool (used by both `PlaywrightEngine` and `HybridEngine`). **It is crucial to call `await engine.cleanup()` when you are finished using these engines** to release system resources.
|
|
214
261
|
|
|
215
262
|
## Stealth / Anti-Detection (`PlaywrightEngine`)
|
|
216
263
|
|
|
217
|
-
The `PlaywrightEngine` automatically integrates `playwright-extra` and its powerful stealth plugin (`puppeteer-extra-plugin-stealth`). This plugin applies various techniques to make the headless browser controlled by Playwright appear more like a regular human-operated browser, helping to bypass many common bot detection systems.
|
|
264
|
+
The `PlaywrightEngine` automatically integrates `playwright-extra` and its powerful stealth plugin ([`puppeteer-extra-plugin-stealth`](https://github.com/berstend/puppeteer-extra/tree/master/packages/puppeteer-extra-plugin-stealth)). This plugin applies various techniques to make the headless browser controlled by Playwright appear more like a regular human-operated browser, helping to bypass many common bot detection systems.
|
|
218
265
|
|
|
219
266
|
There are **no manual configuration options** for stealth; it is enabled by default when using `PlaywrightEngine`. The previous options (`useStealthMode`, `randomizeFingerprint`, `evasionLevel`) have been removed.
|
|
220
267
|
|
|
@@ -228,21 +275,75 @@ Errors during fetching are typically thrown as instances of `FetchError` (or its
|
|
|
228
275
|
- `message` (`string`): Description of the error.
|
|
229
276
|
- `code` (`string | undefined`): A specific error code (e.g., `ERR_NAVIGATION_TIMEOUT`, `ERR_HTTP_ERROR`, `ERR_NON_HTML_CONTENT`).
|
|
230
277
|
- `originalError` (`Error | undefined`): The underlying error that caused this fetch error (e.g., a Playwright error object).
|
|
278
|
+
- `statusCode` (`number | undefined`): The HTTP status code, if relevant (especially for `FetchEngineHttpError`).
|
|
231
279
|
|
|
232
280
|
Common error scenarios include:
|
|
233
281
|
|
|
234
282
|
- Network issues (DNS resolution failure, connection refused).
|
|
235
|
-
- HTTP errors (4xx client errors, 5xx server errors)
|
|
236
|
-
- Non-HTML content type received
|
|
237
|
-
- Playwright navigation timeouts
|
|
283
|
+
- HTTP errors (4xx client errors, 5xx server errors) -> `FetchEngineHttpError` from `FetchEngine` or potentially wrapped `FetchError` from `PlaywrightEngine`.
|
|
284
|
+
- Non-HTML content type received -> `FetchError` with code `ERR_NON_HTML_CONTENT` from `FetchEngine`.
|
|
285
|
+
- Playwright navigation timeouts -> `FetchError` wrapping Playwright error, often with code `ERR_NAVIGATION_TIMEOUT`.
|
|
238
286
|
- Proxy connection errors.
|
|
239
287
|
- Page crashes within Playwright.
|
|
240
288
|
- Errors thrown by the browser pool (e.g., failure to launch browser).
|
|
241
289
|
|
|
242
|
-
The `
|
|
290
|
+
The `HTMLFetchResult` object may also contain an `error` property when the final fetch attempt failed after all retries but an earlier attempt produced some intermediate (potentially unusable) result data. Even in that case, it is generally best to rely on the thrown error for failure handling.
|
|
291
|
+
|
|
292
|
+
**Example:**
|
|
293
|
+
|
|
294
|
+
```typescript
|
|
295
|
+
import { FetchEngine, FetchError } from "@purepageio/fetch-engines";
|
|
296
|
+
|
|
297
|
+
const engine = new FetchEngine();
|
|
298
|
+
|
|
299
|
+
async function fetchWithHandling(url: string) {
|
|
300
|
+
try {
|
|
301
|
+
const result = await engine.fetchHTML(url);
|
|
302
|
+
// Note: result.error is rarely set; primary errors are thrown.
|
|
303
|
+
if (result.error) {
|
|
304
|
+
console.error(`Fetch for ${url} reported error after retries: ${result.error.message}`);
|
|
305
|
+
} else {
|
|
306
|
+
console.log(`Success for ${url}! Content type: ${result.contentType}`);
|
|
307
|
+
// Use result.content
|
|
308
|
+
}
|
|
309
|
+
} catch (error) {
|
|
310
|
+
console.error(`Fetch failed entirely for ${url}:`);
|
|
311
|
+
if (error instanceof FetchError) {
|
|
312
|
+
// Handle specific FetchError codes
|
|
313
|
+
switch (error.code) {
|
|
314
|
+
case "ERR_HTTP_ERROR":
|
|
315
|
+
console.error(` HTTP Error: Status ${error.statusCode} - ${error.message}`);
|
|
316
|
+
break;
|
|
317
|
+
case "ERR_NON_HTML_CONTENT":
|
|
318
|
+
console.error(` Wrong Content Type: ${error.message}`);
|
|
319
|
+
break;
|
|
320
|
+
// Add other specific codes as needed
|
|
321
|
+
default:
|
|
322
|
+
console.error(` FetchError (${error.code || "UNKNOWN"}): ${error.message}`);
|
|
323
|
+
break;
|
|
324
|
+
}
|
|
325
|
+
if (error.originalError) {
|
|
326
|
+
console.error(` Original Error: ${error.originalError.message}`);
|
|
327
|
+
}
|
|
328
|
+
} else if (error instanceof Error) {
|
|
329
|
+
// Handle generic JavaScript errors
|
|
330
|
+
console.error(` Generic Error: ${error.message}`);
|
|
331
|
+
} else {
|
|
332
|
+
// Handle unexpected throw types
|
|
333
|
+
console.error(` Unknown error occurred.`);
|
|
334
|
+
}
|
|
335
|
+
}
|
|
336
|
+
}
|
|
337
|
+
|
|
338
|
+
fetchWithHandling("https://example.com");
|
|
339
|
+
fetchWithHandling("https://httpbin.org/status/404"); // Example causing HTTP error
|
|
340
|
+
fetchWithHandling("https://httpbin.org/image/png"); // Example causing non-HTML error
|
|
341
|
+
```
|
|
243
342
|
|
|
244
343
|
## Logging
|
|
245
344
|
|
|
345
|
+
Currently, the library uses `console.warn` and `console.error` for internal warnings (like fallback events) and critical errors. More sophisticated logging options may be added in the future.
|
|
346
|
+
|
|
246
347
|
## Contributing
|
|
247
348
|
|
|
248
349
|
Contributions are welcome! Please open an issue or submit a pull request on the [GitHub repository](https://github.com/purepageio/fetch-engines).
|
package/dist/FetchEngine.d.ts
CHANGED
|
@@ -1,9 +1,10 @@
|
|
|
1
|
-
import type { HTMLFetchResult, BrowserMetrics } from "./types.js";
|
|
1
|
+
import type { HTMLFetchResult, BrowserMetrics, FetchEngineOptions } from "./types.js";
|
|
2
2
|
import type { IEngine } from "./IEngine.js";
|
|
3
|
+
import { FetchError } from "./errors.js";
|
|
3
4
|
/**
|
|
4
5
|
* Custom error class for HTTP errors from FetchEngine.
|
|
5
6
|
*/
|
|
6
|
-
export declare class FetchEngineHttpError extends
|
|
7
|
+
export declare class FetchEngineHttpError extends FetchError {
|
|
7
8
|
readonly statusCode: number;
|
|
8
9
|
constructor(message: string, statusCode: number);
|
|
9
10
|
}
|
|
@@ -14,22 +15,22 @@ export declare class FetchEngineHttpError extends Error {
|
|
|
14
15
|
* It does not support advanced configurations like retries, caching, or proxies directly.
|
|
15
16
|
*/
|
|
16
17
|
export declare class FetchEngine implements IEngine {
|
|
17
|
-
private readonly
|
|
18
|
+
private readonly options;
|
|
19
|
+
private static readonly DEFAULT_OPTIONS;
|
|
18
20
|
/**
|
|
19
21
|
* Creates an instance of FetchEngine.
|
|
20
|
-
*
|
|
22
|
+
* @param options Configuration options for the FetchEngine.
|
|
21
23
|
*/
|
|
22
|
-
constructor();
|
|
24
|
+
constructor(options?: FetchEngineOptions);
|
|
23
25
|
/**
|
|
24
|
-
* Fetches HTML
|
|
26
|
+
* Fetches HTML from the specified URL, optionally converting it to Markdown.
|
|
25
27
|
*
|
|
26
28
|
* @param url The URL to fetch.
|
|
27
29
|
* @returns A Promise resolving to an HTMLFetchResult object.
|
|
28
30
|
* @throws {FetchEngineHttpError} If the HTTP response status is not ok (e.g., 404, 500).
|
|
29
31
|
* @throws {Error} If the content type is not HTML (a `FetchError` with code `ERR_NON_HTML_CONTENT`) or for other network errors.
|
|
30
32
|
*/
|
|
31
|
-
fetchHTML(url: string): Promise<HTMLFetchResult>;
|
|
32
|
-
private detectSPA;
|
|
33
|
+
fetchHTML(url: string, options?: FetchEngineOptions): Promise<HTMLFetchResult>;
|
|
33
34
|
/**
|
|
34
35
|
* Cleans up resources used by the engine.
|
|
35
36
|
* For FetchEngine, this is a no-op as it doesn't manage persistent resources.
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"FetchEngine.d.ts","sourceRoot":"","sources":["../src/FetchEngine.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,eAAe,EAAE,cAAc,EAAE,MAAM,YAAY,CAAC;
|
|
1
|
+
{"version":3,"file":"FetchEngine.d.ts","sourceRoot":"","sources":["../src/FetchEngine.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,eAAe,EAAE,cAAc,EAAE,kBAAkB,EAAE,MAAM,YAAY,CAAC;AACtF,OAAO,KAAK,EAAE,OAAO,EAAE,MAAM,cAAc,CAAC;AAG5C,OAAO,EAAE,UAAU,EAAE,MAAM,aAAa,CAAC;AAEzC;;GAEG;AACH,qBAAa,oBAAqB,SAAQ,UAAU;aAGhC,UAAU,EAAE,MAAM;gBADlC,OAAO,EAAE,MAAM,EACC,UAAU,EAAE,MAAM;CAKrC;AAED;;;;;GAKG;AACH,qBAAa,WAAY,YAAW,OAAO;IACzC,OAAO,CAAC,QAAQ,CAAC,OAAO,CAA+B;IAEvD,OAAO,CAAC,MAAM,CAAC,QAAQ,CAAC,eAAe,CAErC;IAEF;;;OAGG;gBACS,OAAO,GAAE,kBAAuB;IAI5C;;;;;;;OAOG;IACG,SAAS,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,kBAAkB,GAAG,OAAO,CAAC,eAAe,CAAC;IAiEpF;;;;OAIG;IACG,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC;IAI9B;;;;OAIG;IACH,UAAU,IAAI,cAAc,EAAE;CAG/B"}
|