@purepageio/fetch-engines 0.1.4 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -1,8 +1,53 @@
1
1
  # @purepageio/fetch-engines
2
2
 
3
- A collection of configurable engines for fetching HTML content using plain `fetch` or Playwright.
4
-
5
- This package provides robust and customisable ways to retrieve web page content, handling retries, caching, user agents, and optional browser automation via Playwright for complex JavaScript-driven sites.
3
+ [![npm version](https://img.shields.io/npm/v/@purepageio/fetch-engines.svg)](https://www.npmjs.com/package/@purepageio/fetch-engines)
4
+ [![Build Status](https://github.com/purepageio/fetch-engines/actions/workflows/build.yml/badge.svg)](https://github.com/purepageio/fetch-engines/actions/workflows/build.yml) <!-- Assuming build.yml is the workflow filename -->
5
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
6
+
7
+ Fetching web content can be complex. You need to handle static HTML, dynamic JavaScript-driven sites, network errors, retries, caching, and potential bot detection measures. Managing browser automation tools like Playwright adds another layer of complexity with resource pooling and stealth configurations.
8
+
9
+ `@purepageio/fetch-engines` simplifies this entire process by providing a set of robust, configurable, and easy-to-use engines for retrieving web page content.
10
+
11
+ **Why use `@purepageio/fetch-engines`?**
12
+
13
+ - **Unified API:** Get content from simple or complex sites using the same `fetchHTML(url, options?)` method.
14
+ - **Flexible Strategies:** Choose the right tool for the job:
15
+ - `FetchEngine`: Lightweight and fast for static HTML, using the standard `fetch` API.
16
+ - `PlaywrightEngine`: Powerful browser automation for JavaScript-heavy sites, handling rendering and interactions.
17
+ - `HybridEngine`: The best of both worlds – tries `FetchEngine` first for speed, automatically falls back to `PlaywrightEngine` for reliability on complex pages.
18
+ - **Robust & Resilient:** Built-in caching, configurable retries, and standardized error handling make your fetching logic more dependable.
19
+ - **Simplified Automation:** `PlaywrightEngine` manages browser instances and contexts automatically through efficient pooling and includes integrated stealth measures to bypass common anti-bot systems.
20
+ - **Content Transformation:** Optionally convert fetched HTML directly to clean Markdown content.
21
+ - **TypeScript Ready:** Fully typed for a better development experience.
22
+
23
+ This package provides a high-level abstraction, letting you focus on using the web content rather than the intricacies of fetching it.
24
+
25
+ ## Table of Contents
26
+
27
+ - [Features](#features)
28
+ - [Installation](#installation)
29
+ - [Engines](#engines)
30
+ - [Basic Usage](#basic-usage)
31
+ - [Configuration](#configuration)
32
+ - [Return Value](#return-value)
33
+ - [API Reference](#api-reference)
34
+ - [Stealth / Anti-Detection (`PlaywrightEngine`)](#stealth--anti-detection-playwrightengine)
35
+ - [Error Handling](#error-handling)
36
+ - [Contributing](#contributing)
37
+ - [License](#license)
38
+
39
+ ## Features
40
+
41
+ - **Multiple Fetching Strategies:** Choose between `FetchEngine` (lightweight `fetch`), `PlaywrightEngine` (robust JS rendering via Playwright), or `HybridEngine` (smart fallback).
42
+ - **Unified API:** Simple `fetchHTML(url, options?)` interface across all engines.
43
+ - **Configurable Retries:** Automatic retries on failure with customizable attempts and delays.
44
+ - **Built-in Caching:** In-memory caching with configurable TTL to reduce redundant fetches.
45
+ - **Playwright Stealth:** Automatic integration of `playwright-extra` and stealth plugins to bypass common bot detection.
46
+ - **Managed Browser Pooling:** Efficient resource management for `PlaywrightEngine` with configurable browser/context limits and lifecycles.
47
+ - **Smart Fallbacks:** `HybridEngine` uses `FetchEngine` first, falling back to `PlaywrightEngine` only when needed. `PlaywrightEngine` can optionally use a fast HTTP fetch before launching a full browser.
48
+ - **Content Conversion:** Optionally convert fetched HTML directly to Markdown.
49
+ - **Standardized Errors:** Custom `FetchError` classes provide context on failures.
50
+ - **TypeScript Ready:** Fully typed codebase for enhanced developer experience.
6
51
 
7
52
  ## Installation
8
53
 
@@ -35,20 +80,25 @@ npx playwright install
35
80
  ```typescript
36
81
  import { FetchEngine } from "@purepageio/fetch-engines";
37
82
 
38
- const engine = new FetchEngine();
83
+ const engine = new FetchEngine(); // Default: fetches HTML
39
84
 
40
85
  async function main() {
41
86
  try {
42
87
  const url = "https://example.com";
43
88
  const result = await engine.fetchHTML(url);
44
- console.log(`Fetched ${result.url} (Status: ${result.statusCode})`);
89
+ console.log(`Fetched ${result.url} (ContentType: ${result.contentType})`);
45
90
  console.log(`Title: ${result.title}`);
46
- // console.log(`HTML: ${result.html.substring(0, 200)}...`);
91
+ console.log(`Content (HTML): ${result.content.substring(0, 100)}...`);
92
+
93
+ // Example fetching Markdown directly via constructor option
94
+ const markdownEngine = new FetchEngine({ markdown: true });
95
+ const mdResult = await markdownEngine.fetchHTML(url);
96
+ console.log(`\nFetched ${mdResult.url} (ContentType: ${mdResult.contentType})`);
97
+ console.log(`Content (Markdown):\n${mdResult.content.substring(0, 300)}...`);
47
98
  } catch (error) {
48
99
  console.error("Fetch failed:", error);
49
100
  }
50
101
  }
51
-
52
102
  main();
53
103
  ```
54
104
 
@@ -57,28 +107,28 @@ main();
57
107
  ```typescript
58
108
  import { PlaywrightEngine } from "@purepageio/fetch-engines";
59
109
 
60
- // Configure engine options (optional)
61
- const engine = new PlaywrightEngine({
62
- maxRetries: 2, // Number of retry attempts
63
- useHttpFallback: true, // Try simple HTTP fetch first
64
- cacheTTL: 5 * 60 * 1000, // Cache results for 5 minutes (in milliseconds)
65
- });
110
+ // Engine configured to fetch HTML by default
111
+ const engine = new PlaywrightEngine({ markdown: false });
66
112
 
67
113
  async function main() {
68
114
  try {
69
- const url = "https://quotes.toscrape.com/"; // A site that might benefit from JS rendering
70
- const result = await engine.fetchHTML(url);
71
- console.log(`Fetched ${result.url} (Status: ${result.statusCode})`);
72
- console.log(`Title: ${result.title}`);
73
- // console.log(`HTML: ${result.html.substring(0, 200)}...`);
115
+ const url = "https://quotes.toscrape.com/";
116
+
117
+ // Example: Fetching as Markdown using per-request override
118
+ console.log(`Fetching ${url} as Markdown...`);
119
+ const mdResult = await engine.fetchHTML(url, { markdown: true });
120
+ console.log(`Fetched ${mdResult.url} (ContentType: ${mdResult.contentType}) - Title: ${mdResult.title}`);
121
+ console.log(`Content (Markdown):\n${mdResult.content.substring(0, 300)}...`);
122
+
123
+ // You could also fetch as HTML by default:
124
+ // const htmlResult = await engine.fetchHTML(url);
125
+ // console.log(`\nFetched ${htmlResult.url} (ContentType: ${htmlResult.contentType}) - Title: ${htmlResult.title}`);
74
126
  } catch (error) {
75
127
  console.error("Playwright fetch failed:", error);
76
128
  } finally {
77
- // Important: Clean up browser resources when done
78
129
  await engine.cleanup();
79
130
  }
80
131
  }
81
-
82
132
  main();
83
133
  ```
84
134
 
@@ -87,32 +137,35 @@ main();
87
137
  ```typescript
88
138
  import { HybridEngine } from "@purepageio/fetch-engines";
89
139
 
90
- // Configure the underlying PlaywrightEngine (optional)
91
- const engine = new HybridEngine({
92
- maxRetries: 2, // PlaywrightEngine retry config
93
- maxBrowsers: 3, // PlaywrightEngine pool config
94
- // FetchEngine part has no config
95
- });
140
+ // Engine configured to fetch HTML by default for both internal engines
141
+ const engine = new HybridEngine({ markdown: false });
96
142
 
97
143
  async function main() {
98
144
  try {
99
- // Try a simple site (likely uses FetchEngine)
100
- const url1 = "https://example.com";
101
- const result1 = await engine.fetchHTML(url1);
102
- console.log(`Fetched ${result1.url} (Status: ${result1.statusCode}) - Title: ${result1.title}`);
103
-
104
- // Try a complex site (likely falls back to PlaywrightEngine)
105
- const url2 = "https://quotes.toscrape.com/";
106
- const result2 = await engine.fetchHTML(url2);
107
- console.log(`Fetched ${result2.url} (Status: ${result2.statusCode}) - Title: ${result2.title}`);
145
+ const url1 = "https://example.com"; // Simple site
146
+ const url2 = "https://quotes.toscrape.com/"; // Complex site
147
+
148
+ // --- Scenario 1: FetchEngine Succeeds ---
149
+ console.log(`\nFetching simple site (${url1}) requesting Markdown...`);
150
+ // FetchEngine uses its constructor config (markdown: false), ignoring the per-request option.
151
+ const result1 = await engine.fetchHTML(url1, { markdown: true });
152
+ console.log(`Fetched ${result1.url} (ContentType: ${result1.contentType}) - Title: ${result1.title}`);
153
+ console.log(`Content is ${result1.contentType} because FetchEngine succeeded and used its own config.`);
154
+ console.log(`${result1.content.substring(0, 300)}...`);
155
+
156
+ // --- Scenario 2: FetchEngine Fails, Playwright Fallback Occurs ---
157
+ console.log(`\nFetching complex site (${url2}) requesting Markdown...`);
158
+ // Assume FetchEngine fails for url2. PlaywrightEngine will be used and *will* receive the markdown: true override.
159
+ const result2 = await engine.fetchHTML(url2, { markdown: true });
160
+ console.log(`Fetched ${result2.url} (ContentType: ${result2.contentType}) - Title: ${result2.title}`);
161
+ console.log(`Content is ${result2.contentType} because Playwright fallback used the per-request option.`);
162
+ console.log(`${result2.content.substring(0, 300)}...`);
108
163
  } catch (error) {
109
164
  console.error("Hybrid fetch failed:", error);
110
165
  } finally {
111
- // Important: Clean up browser resources (for the Playwright part) when done
112
166
  await engine.cleanup();
113
167
  }
114
168
  }
115
-
116
169
  main();
117
170
  ```
118
171
 
@@ -122,99 +175,93 @@ Engines accept an optional configuration object in their constructor to customis
122
175
 
123
176
  ### FetchEngine
124
177
 
125
- The `FetchEngine` currently has **no configurable options** via its constructor. It uses standard `fetch` with default browser/Node.js retry/timeout behavior and a fixed set of browser-like headers.
178
+ The `FetchEngine` accepts a `FetchEngineOptions` object with the following properties:
179
+
180
+ | Option | Type | Default | Description |
181
+ | ---------- | --------- | ------- | ------------------------------------------------------------------------------------------------------ |
182
+ | `markdown` | `boolean` | `false` | If `true`, converts fetched HTML to Markdown. `contentType` in the result will be set to `'markdown'`. |
183
+
184
+ ```typescript
185
+ // Example: Always convert to Markdown
186
+ const mdFetchEngine = new FetchEngine({ markdown: true });
187
+ ```
126
188
 
127
189
  ### PlaywrightEngine
128
190
 
129
- The `PlaywrightEngine` accepts a `PlaywrightEngineConfig` object. See the detailed options below:
191
+ The `PlaywrightEngine` accepts a `PlaywrightEngineConfig` object with the following properties:
130
192
 
131
193
  **General Options:**
132
194
 
133
- - `concurrentPages` (`number`, default: `3`)
134
- - Maximum number of Playwright pages to process concurrently across all browser instances.
135
- - `maxRetries` (`number`, default: `3`)
136
- - Maximum number of retry attempts for a failed Playwright fetch operation (excluding initial attempt).
137
- - `retryDelay` (`number`, default: `5000`)
138
- - Delay in milliseconds between Playwright retry attempts.
139
- - `cacheTTL` (`number`, default: `900000` (15 minutes))
140
- - Time-to-live for cached results in milliseconds. Set to `0` to disable the in-memory cache. Affects both HTTP fallback and Playwright results.
141
- - `useHttpFallback` (`boolean`, default: `true`)
142
- - If `true`, the engine first attempts a simple, fast HTTP GET request. If this fails or appears to receive a challenge/CAPTCHA page, it then proceeds with a full Playwright browser request.
143
- - `useHeadedModeFallback` (`boolean`, default: `false`)
144
- - If `true` and a Playwright request fails (potentially due to bot detection), subsequent Playwright requests _to that specific domain_ will automatically use a headed (visible) browser instance.
145
- - `defaultFastMode` (`boolean`, default: `true`)
146
- - If `true`, Playwright requests initially run in "fast mode", blocking non-essential resources and skipping human behavior simulation. Can be overridden per-request via `fetchHTML` options.
147
- - `simulateHumanBehavior` (`boolean`, default: `true`)
148
- - If `true` and the Playwright request is _not_ in `fastMode`, the engine attempts basic human-like interactions. _Note: This simulation is currently basic._
195
+ | Option | Type | Default | Description |
196
+ | ----------------------- | --------- | -------- | ----------------------------------------------------------------------------------------------------------------------------------------- |
197
+ | `markdown` | `boolean` | `false` | If `true`, converts content (from Playwright or fallback) to Markdown. `contentType` will be `'markdown'`. Can be overridden per-request. |
198
+ | `useHttpFallback` | `boolean` | `true` | If `true`, attempts a fast HTTP fetch before using Playwright. |
199
+ | `useHeadedModeFallback` | `boolean` | `false` | If `true`, automatically retries specific failed domains in headed (visible) mode. |
200
+ | `defaultFastMode` | `boolean` | `true` | If `true`, initially blocks non-essential resources and skips human simulation. Can be overridden per-request. |
201
+ | `simulateHumanBehavior` | `boolean` | `true` | If `true` (and not `fastMode`), attempts basic human-like interactions. |
202
+ | `concurrentPages` | `number` | `3` | Max number of pages to process concurrently within the engine queue. |
203
+ | `maxRetries` | `number` | `3` | Max retry attempts for a failed fetch (excluding initial try). |
204
+ | `retryDelay` | `number` | `5000` | Delay (ms) between retries. |
205
+ | `cacheTTL` | `number` | `900000` | Cache Time-To-Live (ms). `0` disables caching. (15 mins default) |
149
206
 
150
207
  **Browser Pool Options (Passed to internal `PlaywrightBrowserPool`):**
151
208
 
152
- - `maxBrowsers` (`number`, default: `2`)
153
- - Maximum number of concurrent browser instances the pool will manage.
154
- - `maxPagesPerContext` (`number`, default: `6`)
155
- - Maximum number of pages per browser context before recycling.
156
- - `maxBrowserAge` (`number`, default: `1200000` (20 minutes))
157
- - Maximum age in milliseconds a browser instance lives before recycling.
158
- - `healthCheckInterval` (`number`, default: `60000` (1 minute))
159
- - How often (in milliseconds) the pool checks browser health.
160
- - `useHeadedMode` (`boolean`, default: `false`)
161
- - Forces the _entire_ browser pool to launch browsers in headed (visible) mode.
162
- - `poolBlockedDomains` (`string[]`, default: `[]` - uses pool's internal defaults)
163
- - List of domain _glob patterns_ to block browser requests to.
164
- - `poolBlockedResourceTypes` (`string[]`, default: `[]` - uses pool's internal defaults)
165
- - List of Playwright resource types (e.g., `image`, `font`) to block.
166
- - `proxy` (`object | undefined`, default: `undefined`)
167
- - Proxy configuration for browser instances (`server`, `username?`, `password?`).
209
+ | Option | Type | Default | Description |
210
+ | -------------------------- | -------------------------- | ----------- | ------------------------------------------------------------------------- |
211
+ | `maxBrowsers` | `number` | `2` | Max concurrent browser instances managed by the pool. |
212
+ | `maxPagesPerContext` | `number` | `6` | Max pages per browser context before recycling. |
213
+ | `maxBrowserAge` | `number` | `1200000` | Max age (ms) a browser instance lives before recycling. (20 mins default) |
214
+ | `healthCheckInterval` | `number` | `60000` | How often (ms) the pool checks browser health. (1 min default) |
215
+ | `useHeadedMode` | `boolean` | `false` | Forces the _entire pool_ to launch browsers in headed (visible) mode. |
216
+ | `poolBlockedDomains` | `string[]` | `[]` | List of domain glob patterns to block requests to. |
217
+ | `poolBlockedResourceTypes` | `string[]` | `[]` | List of Playwright resource types (e.g., 'image', 'font') to block. |
218
+ | `proxy` | `{ server: string, ... }?` | `undefined` | Proxy configuration object (see `PlaywrightEngineConfig` type). |
168
219
 
169
220
  ### HybridEngine
170
221
 
171
- The `HybridEngine` constructor accepts a single optional argument: `playwrightConfig`. This object follows the **`PlaywrightEngineConfig`** structure described above.
222
+ The `HybridEngine` constructor accepts a single optional argument which uses the **`PlaywrightEngineConfig`** structure (see the `PlaywrightEngine` tables above). These options configure the underlying engines where applicable:
172
223
 
173
- ```typescript
174
- import { HybridEngine } from "@purepageio/fetch-engines";
224
+ - Options like `maxRetries`, `cacheTTL`, `proxy`, `maxBrowsers`, etc., are primarily passed to the internal `PlaywrightEngine`.
225
+ - The `markdown` setting in the constructor (`boolean`, default: `false`) applies to **both** internal engines by default.
226
+ - If you provide `markdown: true` in the `options` object when calling `fetchHTML`, this override **only applies if a fallback to `PlaywrightEngine` is necessary**. The `FetchEngine` part will always use the `markdown` setting provided in the `HybridEngine` constructor.
175
227
 
176
- const engine = new HybridEngine({
177
- // These options configure the PlaywrightEngine used for fallbacks
178
- maxRetries: 1,
179
- maxBrowsers: 1,
180
- cacheTTL: 0, // Disable caching in the Playwright part
181
- });
228
+ ```typescript
229
+ // ... (HybridEngine examples remain the same) ...
182
230
  ```
183
231
 
184
- The internal `FetchEngine` used by `HybridEngine` is _not_ configurable.
185
-
186
232
  ## Return Value
187
233
 
188
- Both `FetchEngine.fetchHTML()` and `PlaywrightEngine.fetchHTML()` return a Promise that resolves to a `FetchResult` object with the following properties:
234
+ All `fetchHTML()` methods return a Promise that resolves to an `HTMLFetchResult` object:
189
235
 
190
- - `html` (`string`): The full HTML content of the fetched page.
191
- - `title` (`string | null`): The extracted `<title>` tag content, or `null` if no title is found.
192
- - `url` (`string`): The final URL after any redirects.
193
- - `isFromCache` (`boolean`): `true` if the result was served from the engine's cache, `false` otherwise.
194
- - `statusCode` (`number | undefined`): The HTTP status code of the final response. This is typically available for `FetchEngine` and the HTTP fallback in `PlaywrightEngine`, but might be `undefined` for some Playwright navigation scenarios if the primary response wasn't directly captured.
195
- - `error` (`FetchError | Error | undefined`): If an error occurred during the _final_ fetch attempt (after retries), this property will contain the error object. It might be a specific `FetchError` (see Error Handling) or a generic `Error`.
236
+ - `content` (`string`): The fetched content, either original HTML or converted Markdown.
237
+ - `contentType` (`'html' | 'markdown'`): Indicates the format of the `content` string.
238
+ - `title` (`string | null`): Extracted page title (from original HTML).
239
+ - `url` (`string`): Final URL after redirects.
240
+ - `isFromCache` (`boolean`): True if the result came from cache.
241
+ - `statusCode` (`number | undefined`): HTTP status code.
242
+ - `error` (`Error | undefined`): Error object if the fetch failed after all retries. It's generally recommended to rely on catching thrown errors for failure handling.
196
243
 
197
244
  ## API Reference
198
245
 
199
246
  ### `engine.fetchHTML(url, options?)`
200
247
 
201
- - `url` (`string`): The URL of the page to fetch.
202
- - `options` (`object`, optional): Per-request options to override engine defaults.
203
- - For `PlaywrightEngine`, you can override `fastMode` (`boolean`) to force or disable fast mode for this specific request.
204
- - _(Other per-request options may be added in the future)._
205
- - **Returns:** `Promise<FetchResult>`
248
+ - `url` (`string`): URL to fetch.
249
+ - `options?` (`FetchOptions`): Optional per-request overrides.
250
+ - `markdown?: boolean`: (Playwright/Hybrid only) Request Markdown conversion. For Hybrid, only applies on fallback to Playwright.
251
+ - `fastMode?: boolean`: (Playwright/Hybrid only) Override fast mode.
252
+ - **Returns:** `Promise<HTMLFetchResult>`
206
253
 
207
- Fetches the HTML content for the given URL using the engine's configured strategy (plain fetch or Playwright).
254
+ Fetches the page and returns its content in `result.content` as either HTML or Markdown, depending on the engine configuration and any per-request options; `result.contentType` indicates which format was returned.
208
255
 
209
- ### `engine.cleanup()` (PlaywrightEngine only)
256
+ ### `engine.cleanup()` (PlaywrightEngine & HybridEngine)
210
257
 
211
258
  - **Returns:** `Promise<void>`
212
259
 
213
- Gracefully shuts down all browser instances managed by the `PlaywrightEngine`'s browser pool. **It is crucial to call `await engine.cleanup()` when you are finished using a `PlaywrightEngine` instance** to release system resources.
260
+ Gracefully shuts down all browser instances managed by the `PlaywrightEngine`'s browser pool (used by both `PlaywrightEngine` and `HybridEngine`). **It is crucial to call `await engine.cleanup()` when you are finished using these engines** to release system resources.
214
261
 
215
262
  ## Stealth / Anti-Detection (`PlaywrightEngine`)
216
263
 
217
- The `PlaywrightEngine` automatically integrates `playwright-extra` and its powerful stealth plugin (`puppeteer-extra-plugin-stealth`). This plugin applies various techniques to make the headless browser controlled by Playwright appear more like a regular human-operated browser, helping to bypass many common bot detection systems.
264
+ The `PlaywrightEngine` automatically integrates `playwright-extra` and its powerful stealth plugin ([`puppeteer-extra-plugin-stealth`](https://github.com/berstend/puppeteer-extra/tree/master/packages/puppeteer-extra-plugin-stealth)). This plugin applies various techniques to make the headless browser controlled by Playwright appear more like a regular human-operated browser, helping to bypass many common bot detection systems.
218
265
 
219
266
  There are **no manual configuration options** for stealth; it is enabled by default when using `PlaywrightEngine`. The previous options (`useStealthMode`, `randomizeFingerprint`, `evasionLevel`) have been removed.
220
267
 
@@ -228,21 +275,75 @@ Errors during fetching are typically thrown as instances of `FetchError` (or its
228
275
  - `message` (`string`): Description of the error.
229
276
  - `code` (`string | undefined`): A specific error code (e.g., `ERR_NAVIGATION_TIMEOUT`, `ERR_HTTP_ERROR`, `ERR_NON_HTML_CONTENT`).
230
277
  - `originalError` (`Error | undefined`): The underlying error that caused this fetch error (e.g., a Playwright error object).
278
+ - `statusCode` (`number | undefined`): The HTTP status code, if relevant (especially for `FetchEngineHttpError`).
231
279
 
232
280
  Common error scenarios include:
233
281
 
234
282
  - Network issues (DNS resolution failure, connection refused).
235
- - HTTP errors (4xx client errors, 5xx server errors).
236
- - Non-HTML content type received (for `FetchEngine`).
237
- - Playwright navigation timeouts.
283
+ - HTTP errors (4xx client errors, 5xx server errors) -> `FetchEngineHttpError` from `FetchEngine` or potentially wrapped `FetchError` from `PlaywrightEngine`.
284
+ - Non-HTML content type received -> `FetchError` with code `ERR_NON_HTML_CONTENT` from `FetchEngine`.
285
+ - Playwright navigation timeouts -> `FetchError` wrapping Playwright error, often with code `ERR_NAVIGATION_TIMEOUT`.
238
286
  - Proxy connection errors.
239
287
  - Page crashes within Playwright.
240
288
  - Errors thrown by the browser pool (e.g., failure to launch browser).
241
289
 
242
- The `FetchResult` object may also contain an `error` property if the final fetch attempt failed after all retries.
290
+ The `HTMLFetchResult` object may also contain an `error` property when the final fetch attempt failed after all retries but an earlier attempt produced some intermediate (potentially unusable) result data. It's generally best to rely on the thrown error for failure handling.
291
+
292
+ **Example:**
293
+
294
+ ```typescript
295
+ import { FetchEngine, FetchError } from "@purepageio/fetch-engines";
296
+
297
+ const engine = new FetchEngine();
298
+
299
+ async function fetchWithHandling(url: string) {
300
+ try {
301
+ const result = await engine.fetchHTML(url);
302
+ // Note: result.error is less common, primary errors are thrown.
303
+ if (result.error) {
304
+ console.error(`Fetch for ${url} reported error after retries: ${result.error.message}`);
305
+ } else {
306
+ console.log(`Success for ${url}! Content type: ${result.contentType}`);
307
+ // Use result.content
308
+ }
309
+ } catch (error) {
310
+ console.error(`Fetch failed entirely for ${url}:`);
311
+ if (error instanceof FetchError) {
312
+ // Handle specific FetchError codes
313
+ switch (error.code) {
314
+ case "ERR_HTTP_ERROR":
315
+ console.error(` HTTP Error: Status ${error.statusCode} - ${error.message}`);
316
+ break;
317
+ case "ERR_NON_HTML_CONTENT":
318
+ console.error(` Wrong Content Type: ${error.message}`);
319
+ break;
320
+ // Add other specific codes as needed
321
+ default:
322
+ console.error(` FetchError (${error.code || "UNKNOWN"}): ${error.message}`);
323
+ break;
324
+ }
325
+ if (error.originalError) {
326
+ console.error(` Original Error: ${error.originalError.message}`);
327
+ }
328
+ } else if (error instanceof Error) {
329
+ // Handle generic JavaScript errors
330
+ console.error(` Generic Error: ${error.message}`);
331
+ } else {
332
+ // Handle unexpected throw types
333
+ console.error(` Unknown error occurred.`);
334
+ }
335
+ }
336
+ }
337
+
338
+ fetchWithHandling("https://example.com");
339
+ fetchWithHandling("https://httpbin.org/status/404"); // Example causing HTTP error
340
+ fetchWithHandling("https://httpbin.org/image/png"); // Example causing non-HTML error
341
+ ```
243
342
 
244
343
  ## Logging
245
344
 
345
+ Currently, the library uses `console.warn` and `console.error` for internal warnings (like fallback events) and critical errors. More sophisticated logging options may be added in the future.
346
+
246
347
  ## Contributing
247
348
 
248
349
  Contributions are welcome! Please open an issue or submit a pull request on the [GitHub repository](https://github.com/purepageio/fetch-engines).
@@ -1,9 +1,10 @@
1
- import type { HTMLFetchResult, BrowserMetrics } from "./types.js";
1
+ import type { HTMLFetchResult, BrowserMetrics, FetchEngineOptions } from "./types.js";
2
2
  import type { IEngine } from "./IEngine.js";
3
+ import { FetchError } from "./errors.js";
3
4
  /**
4
5
  * Custom error class for HTTP errors from FetchEngine.
5
6
  */
6
- export declare class FetchEngineHttpError extends Error {
7
+ export declare class FetchEngineHttpError extends FetchError {
7
8
  readonly statusCode: number;
8
9
  constructor(message: string, statusCode: number);
9
10
  }
@@ -14,22 +15,22 @@ export declare class FetchEngineHttpError extends Error {
14
15
  * It does not support advanced configurations like retries, caching, or proxies directly.
15
16
  */
16
17
  export declare class FetchEngine implements IEngine {
17
- private readonly headers;
18
+ private readonly options;
19
+ private static readonly DEFAULT_OPTIONS;
18
20
  /**
19
21
  * Creates an instance of FetchEngine.
20
- * Note: This engine currently does not accept configuration options.
22
+ * @param options Configuration options for the FetchEngine.
21
23
  */
22
- constructor();
24
+ constructor(options?: FetchEngineOptions);
23
25
  /**
24
- * Fetches HTML content from the specified URL using the `fetch` API.
26
+ * Fetches HTML from the specified URL, optionally converting it to Markdown.
25
27
  *
26
28
  * @param url The URL to fetch.
27
29
  * @returns A Promise resolving to an HTMLFetchResult object.
28
30
  * @throws {FetchEngineHttpError} If the HTTP response status is not ok (e.g., 404, 500).
29
31
  * @throws {Error} If the content type is not HTML or for other network errors.
30
32
  */
31
- fetchHTML(url: string): Promise<HTMLFetchResult>;
32
- private detectSPA;
33
+ fetchHTML(url: string, options?: FetchEngineOptions): Promise<HTMLFetchResult>;
33
34
  /**
34
35
  * Cleans up resources used by the engine.
35
36
  * For FetchEngine, this is a no-op as it doesn't manage persistent resources.
@@ -1 +1 @@
1
- {"version":3,"file":"FetchEngine.d.ts","sourceRoot":"","sources":["../src/FetchEngine.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,eAAe,EAAE,cAAc,EAAE,MAAM,YAAY,CAAC;AAClE,OAAO,KAAK,EAAE,OAAO,EAAE,MAAM,cAAc,CAAC;AAG5C;;GAEG;AACH,qBAAa,oBAAqB,SAAQ,KAAK;IAC7C,SAAgB,UAAU,EAAE,MAAM,CAAC;gBAEvB,OAAO,EAAE,MAAM,EAAE,UAAU,EAAE,MAAM;CAShD;AAED;;;;;GAKG;AACH,qBAAa,WAAY,YAAW,OAAO;IACzC,OAAO,CAAC,QAAQ,CAAC,OAAO,CAAyB;IAEjD;;;OAGG;;IAeH;;;;;;;OAOG;IACG,SAAS,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC,eAAe,CAAC;IAgDtD,OAAO,CAAC,SAAS;IA+BjB;;;;OAIG;IACG,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC;IAK9B;;;;OAIG;IACH,UAAU,IAAI,cAAc,EAAE;CAI/B"}
1
+ {"version":3,"file":"FetchEngine.d.ts","sourceRoot":"","sources":["../src/FetchEngine.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,eAAe,EAAE,cAAc,EAAE,kBAAkB,EAAE,MAAM,YAAY,CAAC;AACtF,OAAO,KAAK,EAAE,OAAO,EAAE,MAAM,cAAc,CAAC;AAG5C,OAAO,EAAE,UAAU,EAAE,MAAM,aAAa,CAAC;AAEzC;;GAEG;AACH,qBAAa,oBAAqB,SAAQ,UAAU;aAGhC,UAAU,EAAE,MAAM;gBADlC,OAAO,EAAE,MAAM,EACC,UAAU,EAAE,MAAM;CAKrC;AAED;;;;;GAKG;AACH,qBAAa,WAAY,YAAW,OAAO;IACzC,OAAO,CAAC,QAAQ,CAAC,OAAO,CAA+B;IAEvD,OAAO,CAAC,MAAM,CAAC,QAAQ,CAAC,eAAe,CAErC;IAEF;;;OAGG;gBACS,OAAO,GAAE,kBAAuB;IAI5C;;;;;;;OAOG;IACG,SAAS,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,kBAAkB,GAAG,OAAO,CAAC,eAAe,CAAC;IAiEpF;;;;OAIG;IACG,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC;IAI9B;;;;OAIG;IACH,UAAU,IAAI,cAAc,EAAE;CAG/B"}