@purepageio/fetch-engines 0.2.11 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -32,6 +32,7 @@ This package provides a high-level abstraction, letting you focus on using the w
32
32
  - [API Reference](#api-reference)
33
33
  - [Stealth / Anti-Detection (`PlaywrightEngine`)](#stealth--anti-detection-playwrightengine)
34
34
  - [Error Handling](#error-handling)
35
+ - [Logging](#logging)
35
36
  - [Contributing](#contributing)
36
37
  - [License](#license)
37
38
 
@@ -106,8 +107,11 @@ main();
106
107
  ```typescript
107
108
  import { PlaywrightEngine } from "@purepageio/fetch-engines";
108
109
 
109
- // Engine configured to fetch HTML by default
110
- const engine = new PlaywrightEngine({ markdown: false });
110
+ // Engine configured to fetch HTML by default and pass custom launch arguments
111
+ const engine = new PlaywrightEngine({
112
+ markdown: false,
113
+ playwrightLaunchOptions: { args: ["--disable-gpu"] },
114
+ });
111
115
 
112
116
  async function main() {
113
117
  try {
@@ -191,17 +195,20 @@ The `PlaywrightEngine` accepts a `PlaywrightEngineConfig` object with the follow
191
195
 
192
196
  **General Options:**
193
197
 
194
- | Option | Type | Default | Description |
195
- | ----------------------- | --------- | -------- | ----------------------------------------------------------------------------------------------------------------------------------------- |
196
- | `markdown` | `boolean` | `false` | If `true`, converts content (from Playwright or fallback) to Markdown. `contentType` will be `'markdown'`. Can be overridden per-request. |
197
- | `useHttpFallback` | `boolean` | `true` | If `true`, attempts a fast HTTP fetch before using Playwright. |
198
- | `useHeadedModeFallback` | `boolean` | `false` | If `true`, automatically retries specific failed domains in headed (visible) mode. |
199
- | `defaultFastMode` | `boolean` | `true` | If `true`, initially blocks non-essential resources and skips human simulation. Can be overridden per-request. |
200
- | `simulateHumanBehavior` | `boolean` | `true` | If `true` (and not `fastMode`), attempts basic human-like interactions. |
201
- | `concurrentPages` | `number` | `3` | Max number of pages to process concurrently within the engine queue. |
202
- | `maxRetries` | `number` | `3` | Max retry attempts for a failed fetch (excluding initial try). |
203
- | `retryDelay` | `number` | `5000` | Delay (ms) between retries. |
204
- | `cacheTTL` | `number` | `900000` | Cache Time-To-Live (ms). `0` disables caching. (15 mins default) |
198
+ | Option | Type | Default | Description |
199
+ | ------------------------- | --------------- | ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
200
+ | `markdown` | `boolean` | `false` | If `true`, converts content (from Playwright or its internal HTTP fallback) to Markdown. `contentType` will be `'markdown'`. Can be overridden per-request. |
201
+ | `useHttpFallback` | `boolean` | `true` | If `true`, attempts a fast HTTP fetch before using Playwright. Ineffective if `spaMode` is `true`. |
202
+ | `useHeadedModeFallback` | `boolean` | `false` | If `true`, automatically retries specific failed Playwright attempts in headed (visible) mode. |
203
+ | `defaultFastMode` | `boolean` | `true` | If `true`, initially blocks non-essential resources and skips human simulation. Can be overridden per-request. Effectively `false` if `spaMode` is `true`. |
204
+ | `simulateHumanBehavior` | `boolean` | `true` | If `true` (and not `fastMode` or `spaMode`), attempts basic human-like interactions. |
205
+ | `concurrentPages` | `number` | `3` | Max number of pages to process concurrently within the engine queue. |
206
+ | `maxRetries` | `number` | `3` | Max retry attempts for a failed fetch (excluding initial try). |
207
+ | `retryDelay` | `number` | `5000` | Delay (ms) between retries. |
208
+ | `cacheTTL` | `number` | `900000` | Cache Time-To-Live (ms). `0` disables caching. (15 mins default) |
209
+ | `spaMode` | `boolean` | `false` | If `true`, enables Single Page Application mode. This typically bypasses `useHttpFallback`, effectively sets `fastMode` to `false`, uses more patient load conditions (e.g., network idle), and may apply `spaRenderDelayMs`. Recommended for JavaScript-heavy sites. |
210
+ | `spaRenderDelayMs` | `number` | `0` | Explicit delay (ms) after page load events in `spaMode` to allow for client-side rendering. Only applies if `spaMode` is `true`. |
211
+ | `playwrightLaunchOptions` | `LaunchOptions` | `undefined` | Optional Playwright launch options (from `playwright` package, e.g., `{ args: ['--some-flag'] }`) passed when a browser instance is created. Merged with internal defaults. |
205
212
 
206
213
  **Browser Pool Options (Passed to internal `PlaywrightBrowserPool`):**
207
214
 
@@ -218,14 +225,42 @@ The `PlaywrightEngine` accepts a `PlaywrightEngineConfig` object with the follow
218
225
 
219
226
  ### HybridEngine
220
227
 
221
- The `HybridEngine` constructor accepts a single optional argument which uses the **`PlaywrightEngineConfig`** structure (see the `PlaywrightEngine` tables above). These options configure the underlying engines where applicable:
228
+ The `HybridEngine` constructor accepts `PlaywrightEngineConfig` options. These settings configure the underlying engines and the hybrid strategy:
222
229
 
223
- - Options like `maxRetries`, `cacheTTL`, `proxy`, `maxBrowsers`, etc., are primarily passed to the internal `PlaywrightEngine`.
224
- - The `markdown` setting in the constructor (`boolean`, default: `false`) applies to **both** internal engines by default.
225
- - If you provide `markdown: true` in the `options` object when calling `fetchHTML`, this override **only applies if a fallback to `PlaywrightEngine` is necessary**. The `FetchEngine` part will always use the `markdown` setting provided in the `HybridEngine` constructor.
230
+ - **Constructor `markdown` option:**
231
+ - Sets the default Markdown conversion for the internal `FetchEngine`. This `FetchEngine` instance **does not** react to per-request `markdown` overrides.
232
+ - Sets the default for the internal `PlaywrightEngine`.
233
+ - **Constructor `spaMode` option:**
234
+ - Sets the default SPA mode for `HybridEngine`. If `true`, `HybridEngine` checks `FetchEngine`'s output for SPA shell characteristics. If an SPA shell is detected, it forces a fallback to `PlaywrightEngine` (which will also run in SPA mode).
235
+ - Sets the default for the internal `PlaywrightEngine`.
236
+ - **Other `PlaywrightEngineConfig` options** (e.g., `maxRetries`, `cacheTTL`, `playwrightLaunchOptions`, pool settings) are primarily passed to and used by the internal `PlaywrightEngine`.
237
+
238
+ **Per-request `options` in `HybridEngine.fetchHTML(url, options)`:**
239
+
240
+ - **`options.markdown` (`boolean`):**
241
+ - If `FetchEngine` succeeds and its content is used (i.e., not an SPA shell when `spaMode` is active), this per-request `markdown` option is **ignored**. The content's format is determined by the `FetchEngine`'s constructor `markdown` setting.
242
+ - If `HybridEngine` falls back to `PlaywrightEngine` (due to `FetchEngine` failure or SPA shell detection), this per-request `markdown` option **overrides** the `PlaywrightEngine`'s default and determines if its output is Markdown.
243
+ - **`options.spaMode` (`boolean`):**
244
+ - Overrides the `HybridEngine`'s default SPA mode behavior for this specific request (affecting SPA shell detection and potential fallback to `PlaywrightEngine`).
245
+ - If `PlaywrightEngine` is used, this option also overrides its default SPA mode.
246
+ - **`options.fastMode` (`boolean`):**
247
+ - If `PlaywrightEngine` is used, this option overrides its `defaultFastMode` setting. It has no effect on `FetchEngine`.
226
248
 
227
249
  ```typescript
228
- // ... (HybridEngine examples remain the same) ...
250
+ // Example: HybridEngine with SPA mode enabled by default
251
+ const spaHybridEngine = new HybridEngine({ spaMode: true, spaRenderDelayMs: 2000 });
252
+
253
+ async function fetchSpaSite() {
254
+ try {
255
+ // FetchEngine is tried first; PlaywrightEngine is used only if the response looks like an SPA shell (or the fetch fails)
256
+ const result = await spaHybridEngine.fetchHTML(
257
+ "https://www.smallblackdots.net/release/16109/corrina-joseph-wish-tonite-lonely"
258
+ );
259
+ console.log(`Title: ${result.title}`);
260
+ } catch (e) {
261
+ console.error(e);
262
+ }
263
+ }
229
264
  ```
230
265
 
231
266
  ## Return Value
@@ -248,6 +283,7 @@ All `fetchHTML()` methods return a Promise that resolves to an `HTMLFetchResult`
248
283
  - `options?` (`FetchOptions`): Optional per-request overrides.
249
284
  - `markdown?: boolean`: (Playwright/Hybrid only) Request Markdown conversion. For Hybrid, only applies on fallback to Playwright.
250
285
  - `fastMode?: boolean`: (Playwright/Hybrid only) Override fast mode.
286
+ - `spaMode?: boolean`: (Playwright/Hybrid only) Override SPA mode behavior for this request.
251
287
  - **Returns:** `Promise<HTMLFetchResult>`
252
288
 
253
289
  Fetches content, returning HTML or Markdown based on configuration/options in `result.content` with `result.contentType` indicating the format.
@@ -276,67 +312,72 @@ Errors during fetching are typically thrown as instances of `FetchError` (or its
276
312
  - `originalError` (`Error | undefined`): The underlying error that caused this fetch error (e.g., a Playwright error object).
277
313
  - `statusCode` (`number | undefined`): The HTTP status code, if relevant (especially for `FetchEngineHttpError`).
278
314
 
279
- Common error scenarios include:
280
-
281
- - Network issues (DNS resolution failure, connection refused).
282
- - HTTP errors (4xx client errors, 5xx server errors) -> `FetchEngineHttpError` from `FetchEngine` or potentially wrapped `FetchError` from `PlaywrightEngine`.
283
- - Non-HTML content type received -> `FetchError` with code `ERR_NON_HTML_CONTENT` from `FetchEngine`.
284
- - Playwright navigation timeouts -> `FetchError` wrapping Playwright error, often with code `ERR_NAVIGATION_TIMEOUT`.
285
- - Proxy connection errors.
286
- - Page crashes within Playwright.
287
- - Errors thrown by the browser pool (e.g., failure to launch browser).
315
+ Common `FetchError` codes and scenarios:
316
+
317
+ - **`ERR_HTTP_ERROR`**: Thrown by `FetchEngine` for HTTP status codes >= 400. `error.statusCode` will be set.
318
+ - **`ERR_NON_HTML_CONTENT`**: Thrown by `FetchEngine` if the content type is not HTML and `markdown` conversion is not requested.
319
+ - **`ERR_PLAYWRIGHT_OPERATION`**: A general error from `PlaywrightEngine` indicating a failure during a Playwright operation (e.g., page acquisition, navigation, interaction). The `originalError` property will often contain the specific Playwright error.
320
+ - **`ERR_NAVIGATION`**: Often seen as part of `ERR_PLAYWRIGHT_OPERATION`'s message or in `originalError` when a Playwright navigation fails (e.g., timeout, SSL error).
321
+ - **`ERR_MARKDOWN_CONVERSION_NON_HTML`**: Thrown by `PlaywrightEngine` (or `HybridEngine` if falling back to Playwright) if `markdown: true` is requested for a non-HTML content type (e.g., XML, JSON).
322
+ - **`ERR_UNSUPPORTED_RAW_CONTENT_TYPE`**: Thrown by `PlaywrightEngine` if `markdown: false` is requested for a content type it doesn't support for direct fetching (e.g., images, applications). Currently, it primarily supports `text/*` plus JSON- and XML-like types (e.g., `application/json`, `application/xml`) when `markdown: false`.
323
+ - **`ERR_CACHE_ERROR`**: Indicates an issue with cache read/write operations.
324
+ - **`ERR_PROXY_CONFIG_ERROR`**: Indicates a problem with the proxy configuration.
325
+ - **`ERR_BROWSER_POOL_EXHAUSTED`**: Thrown if the browser pool cannot provide a page (e.g., the maximum number of browsers has been reached and all remain busy past the timeout).
326
+ - **Other Scenarios (often wrapped by `ERR_PLAYWRIGHT_OPERATION` or a generic `FetchError`):**
327
+ - Network issues (DNS resolution, connection refused).
328
+ - Proxy connection failures.
329
+ - Page crashes or context/browser disconnections within Playwright.
330
+ - Failures during browser launch or management by the pool.
288
331
 
289
332
  The `HTMLFetchResult` object may also contain an `error` property if the final fetch attempt failed after all retries but an earlier attempt (within retries) might have produced some intermediate (potentially unusable) result data. It's generally best to rely on the thrown error for failure handling.
290
333
 
291
334
  **Example:**
292
335
 
293
336
  ```typescript
294
- import { FetchEngine, FetchError } from "@purepageio/fetch-engines";
337
+ import { PlaywrightEngine, FetchError } from "@purepageio/fetch-engines";
295
338
 
296
- const engine = new FetchEngine();
339
+ // Example using PlaywrightEngine to illustrate more complex error handling
340
+ const engine = new PlaywrightEngine({ useHttpFallback: false, maxRetries: 1 });
297
341
 
298
342
  async function fetchWithHandling(url: string) {
299
343
  try {
300
344
  const result = await engine.fetchHTML(url);
301
- // Note: result.error is less common, primary errors are thrown.
302
345
  if (result.error) {
303
- console.error(`Fetch for ${url} reported error after retries: ${result.error.message}`);
304
- } else {
305
- console.log(`Success for ${url}! Content type: ${result.contentType}`);
306
- // Use result.content
346
+ console.warn(`Fetch for ${url} included non-critical error after retries: ${result.error.message}`);
307
347
  }
348
+ console.log(`Success for ${url}! Title: ${result.title}, Content type: ${result.contentType}`);
349
+ // Use result.content
308
350
  } catch (error) {
309
- console.error(`Fetch failed entirely for ${url}:`);
351
+ console.error(`Fetch failed for ${url}:`);
310
352
  if (error instanceof FetchError) {
311
- // Handle specific FetchError codes
312
- switch (error.code) {
313
- case "ERR_HTTP_ERROR":
314
- console.error(` HTTP Error: Status ${error.statusCode} - ${error.message}`);
315
- break;
316
- case "ERR_NON_HTML_CONTENT":
317
- console.error(` Wrong Content Type: ${error.message}`);
318
- break;
319
- // Add other specific codes as needed
320
- default:
321
- console.error(` FetchError (${error.code || "UNKNOWN"}): ${error.message}`);
322
- break;
353
+ console.error(` Error Code: ${error.code || "N/A"}`);
354
+ console.error(` Message: ${error.message}`);
355
+ if (error.statusCode) {
356
+ console.error(` Status Code: ${error.statusCode}`);
323
357
  }
324
358
  if (error.originalError) {
325
- console.error(` Original Error: ${error.originalError.message}`);
359
+ console.error(` Original Error: ${error.originalError.name} - ${error.originalError.message}`);
360
+ }
361
+ // Example of specific handling:
362
+ if (error.code === "ERR_PLAYWRIGHT_OPERATION") {
363
+ console.error(" Hint: This was a Playwright operation failure. Check Playwright logs or originalError.");
326
364
  }
327
365
  } else if (error instanceof Error) {
328
- // Handle generic JavaScript errors
329
366
  console.error(` Generic Error: ${error.message}`);
330
367
  } else {
331
- // Handle unexpected throw types
332
- console.error(` Unknown error occurred.`);
368
+ console.error(` Unknown error occurred: ${String(error)}`);
333
369
  }
334
370
  }
335
371
  }
336
372
 
337
- fetchWithHandling("https://example.com");
338
- fetchWithHandling("https://httpbin.org/status/404"); // Example causing HTTP error
339
- fetchWithHandling("https://httpbin.org/image/png"); // Example causing non-HTML error
373
+ async function runExamples() {
374
+ await fetchWithHandling("https://nonexistentdomain.example.com"); // Likely DNS or navigation error
375
+ await fetchWithHandling("https://example.com/non_html_resource.json"); // Test with actual JSON URL if available
376
+ // or a site known to cause Playwright issues for a demo.
377
+ await engine.cleanup(); // Important for PlaywrightEngine
378
+ }
379
+
380
+ runExamples();
340
381
  ```
341
382
 
342
383
  ## Logging
@@ -7,7 +7,9 @@ export declare class HybridEngine implements IEngine {
7
7
  private readonly fetchEngine;
8
8
  private readonly playwrightEngine;
9
9
  private readonly config;
10
+ private readonly playwrightOnlyPatterns;
10
11
  constructor(config?: PlaywrightEngineConfig);
12
+ private _isSpaShell;
11
13
  fetchHTML(url: string, options?: FetchOptions): Promise<HTMLFetchResult>;
12
14
  /**
13
15
  * Delegates getMetrics to the PlaywrightEngine.
@@ -1 +1 @@
1
- {"version":3,"file":"HybridEngine.d.ts","sourceRoot":"","sources":["../src/HybridEngine.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,OAAO,EAAE,MAAM,cAAc,CAAC;AAC5C,OAAO,KAAK,EAAE,eAAe,EAAE,sBAAsB,EAAE,YAAY,EAAE,cAAc,EAAE,MAAM,YAAY,CAAC;AAExG;;GAEG;AACH,qBAAa,YAAa,YAAW,OAAO;IAC1C,OAAO,CAAC,QAAQ,CAAC,WAAW,CAAc;IAC1C,OAAO,CAAC,QAAQ,CAAC,gBAAgB,CAAmB;IACpD,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAyB;gBAEpC,MAAM,GAAE,sBAA2B;IAQzC,SAAS,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,GAAE,YAAiB,GAAG,OAAO,CAAC,eAAe,CAAC;IA8BlF;;OAEG;IACH,UAAU,IAAI,cAAc,EAAE;IAI9B;;OAEG;IACG,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC;CAM/B"}
1
+ {"version":3,"file":"HybridEngine.d.ts","sourceRoot":"","sources":["../src/HybridEngine.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,OAAO,EAAE,MAAM,cAAc,CAAC;AAC5C,OAAO,KAAK,EAAE,eAAe,EAAE,sBAAsB,EAAE,YAAY,EAAE,cAAc,EAAE,MAAM,YAAY,CAAC;AAExG;;GAEG;AACH,qBAAa,YAAa,YAAW,OAAO;IAC1C,OAAO,CAAC,QAAQ,CAAC,WAAW,CAAc;IAC1C,OAAO,CAAC,QAAQ,CAAC,gBAAgB,CAAmB;IACpD,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAyB;IAChD,OAAO,CAAC,QAAQ,CAAC,sBAAsB,CAAsB;gBAEjD,MAAM,GAAE,sBAA2B;IAU/C,OAAO,CAAC,WAAW;IAkBb,SAAS,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,GAAE,YAAiB,GAAG,OAAO,CAAC,eAAe,CAAC;IA+DlF;;OAEG;IACH,UAAU,IAAI,cAAc,EAAE;IAI9B;;OAEG;IACG,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC;CAM/B"}
@@ -7,39 +7,84 @@ export class HybridEngine {
7
7
  fetchEngine;
8
8
  playwrightEngine;
9
9
  config; // Store config for potential per-request PW overrides
10
+ playwrightOnlyPatterns;
10
11
  constructor(config = {}) {
11
12
  // Pass relevant config parts to each engine
12
13
  // FetchEngine only takes markdown option from the shared config
14
+ // spaMode from config is primarily for PlaywrightEngine, but HybridEngine uses it for decision making.
13
15
  this.fetchEngine = new FetchEngine({ markdown: config.markdown });
14
16
  this.playwrightEngine = new PlaywrightEngine(config);
15
17
  this.config = config; // Store for merging later
18
+ this.playwrightOnlyPatterns = config.playwrightOnlyPatterns || [];
19
+ }
20
+ _isSpaShell(htmlContent) {
21
+ if (!htmlContent || htmlContent.length < 150) {
22
+ // Very short content might be a shell or error
23
+ // Heuristic: if it's very short AND contains noscript, good chance it's a shell.
24
+ if (htmlContent.includes("<noscript>"))
25
+ return true;
26
+ }
27
+ // Check for <noscript> tag
28
+ if (htmlContent.includes("<noscript>"))
29
+ return true;
30
+ // Check for common empty root divs
31
+ if (/<div id=(?:"|')?(root|app)(?:"|')?[^>]*>\s*<\/div>/i.test(htmlContent))
32
+ return true;
33
+ // Check for empty title tag or no title tag at all
34
+ if (/<title>\s*<\/title>/i.test(htmlContent) || !/<title[^>]*>/i.test(htmlContent))
35
+ return true;
36
+ return false;
16
37
  }
17
38
  async fetchHTML(url, options = {}) {
18
- // FetchEngine uses its constructor config; it doesn't accept per-request options here.
39
+ // Determine effective SPA mode and markdown options
40
+ // HybridEngine defaults to false for these if not otherwise specified in its own config or per-request options.
41
+ const effectiveSpaMode = options.spaMode !== undefined ? options.spaMode : this.config.spaMode !== undefined ? this.config.spaMode : false;
42
+ const effectiveMarkdown = options.markdown !== undefined
43
+ ? options.markdown
44
+ : this.config.markdown !== undefined
45
+ ? this.config.markdown
46
+ : false;
47
+ // Prepare options for PlaywrightEngine, to be used in fallback scenarios or direct calls
48
+ const playwrightOptions = {
49
+ ...this.config, // Start with base config given to HybridEngine (e.g. spaRenderDelayMs)
50
+ ...options, // Apply all per-request overrides first
51
+ markdown: effectiveMarkdown, // Then ensure HybridEngine's resolved markdown is set
52
+ spaMode: effectiveSpaMode, // Then ensure HybridEngine's resolved spaMode is set
53
+ };
54
+ // Check playwrightOnlyPatterns first
55
+ for (const pattern of this.playwrightOnlyPatterns) {
56
+ if (typeof pattern === "string" && url.includes(pattern)) {
57
+ console.warn(`HybridEngine: URL ${url} matches string pattern "${pattern}". Using PlaywrightEngine directly.`);
58
+ return this.playwrightEngine.fetchHTML(url, playwrightOptions);
59
+ }
60
+ else if (pattern instanceof RegExp && pattern.test(url)) {
61
+ console.warn(`HybridEngine: URL ${url} matches regex pattern "${pattern.toString()}". Using PlaywrightEngine directly.`);
62
+ return this.playwrightEngine.fetchHTML(url, playwrightOptions);
63
+ }
64
+ }
19
65
  try {
20
66
  const fetchResult = await this.fetchEngine.fetchHTML(url);
21
- // If fetch succeeded, return its result directly (it handles its own markdown config)
22
- // No need to check contentType here, FetchEngine handles it based on its constructor.
67
+ // If FetchEngine succeeded AND spaMode is active, check if it's just a shell
68
+ if (effectiveSpaMode && fetchResult && fetchResult.content) {
69
+ if (this._isSpaShell(fetchResult.content)) {
70
+ console.warn(`HybridEngine: FetchEngine returned likely SPA shell for ${url} in spaMode. Forcing PlaywrightEngine.`);
71
+ // Fallback to PlaywrightEngine, passing the determined effective options
72
+ return this.playwrightEngine.fetchHTML(url, playwrightOptions);
73
+ }
74
+ }
75
+ // If not spaMode, or if spaMode but content is not a shell, return FetchEngine's result
23
76
  return fetchResult;
24
77
  }
25
78
  catch (fetchError) {
26
- console.warn(`FetchEngine failed for ${url}: ${fetchError.message}. Falling back to PlaywrightEngine.`);
27
- // Merge constructor config with per-request options for Playwright fallback
28
- const playwrightOptions = {
29
- ...this.config, // Start with base config given to HybridEngine
30
- ...options, // Override with per-request options
31
- };
79
+ console.warn(`HybridEngine: FetchEngine failed for ${url}: ${fetchError.message}. Falling back to PlaywrightEngine.`);
32
80
  try {
33
- // Pass merged options to PlaywrightEngine
81
+ // Fallback to PlaywrightEngine, passing the determined effective options
34
82
  const playwrightResult = await this.playwrightEngine.fetchHTML(url, playwrightOptions);
35
83
  return playwrightResult;
36
84
  }
37
85
  catch (playwrightError) {
38
- // Catch potential Playwright error
39
- console.error(`PlaywrightEngine fallback failed for ${url}: ${playwrightError.message}`);
40
- // Optionally, wrap or prioritize which error to throw
41
- // Throwing the Playwright error as it's the last one encountered
42
- throw playwrightError;
86
+ console.error(`HybridEngine: PlaywrightEngine fallback also failed for ${url}: ${playwrightError.message}`);
87
+ throw playwrightError; // Throw the Playwright error as it's the last one encountered
43
88
  }
44
89
  }
45
90
  }
@@ -1 +1 @@
1
- {"version":3,"file":"HybridEngine.js","sourceRoot":"","sources":["../src/HybridEngine.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,WAAW,EAAE,MAAM,kBAAkB,CAAC;AAC/C,OAAO,EAAE,gBAAgB,EAAE,MAAM,uBAAuB,CAAC;AAIzD;;GAEG;AACH,MAAM,OAAO,YAAY;IACN,WAAW,CAAc;IACzB,gBAAgB,CAAmB;IACnC,MAAM,CAAyB,CAAC,sDAAsD;IAEvG,YAAY,SAAiC,EAAE;QAC7C,4CAA4C;QAC5C,gEAAgE;QAChE,IAAI,CAAC,WAAW,GAAG,IAAI,WAAW,CAAC,EAAE,QAAQ,EAAE,MAAM,CAAC,QAAQ,EAAE,CAAC,CAAC;QAClE,IAAI,CAAC,gBAAgB,GAAG,IAAI,gBAAgB,CAAC,MAAM,CAAC,CAAC;QACrD,IAAI,CAAC,MAAM,GAAG,MAAM,CAAC,CAAC,0BAA0B;IAClD,CAAC;IAED,KAAK,CAAC,SAAS,CAAC,GAAW,EAAE,UAAwB,EAAE;QACrD,uFAAuF;QACvF,IAAI,CAAC;YACH,MAAM,WAAW,GAAG,MAAM,IAAI,CAAC,WAAW,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC;YAC1D,sFAAsF;YACtF,sFAAsF;YACtF,OAAO,WAAW,CAAC;QACrB,CAAC;QAAC,OAAO,UAAe,EAAE,CAAC;YACzB,OAAO,CAAC,IAAI,CAAC,0BAA0B,GAAG,KAAK,UAAU,CAAC,OAAO,qCAAqC,CAAC,CAAC;YAExG,4EAA4E;YAC5E,MAAM,iBAAiB,GAAiB;gBACtC,GAAG,IAAI,CAAC,MAAM,EAAE,+CAA+C;gBAC/D,GAAG,OAAO,EAAE,oCAAoC;aACjD,CAAC;YAEF,IAAI,CAAC;gBACH,0CAA0C;gBAC1C,MAAM,gBAAgB,GAAG,MAAM,IAAI,CAAC,gBAAgB,CAAC,SAAS,CAAC,GAAG,EAAE,iBAAiB,CAAC,CAAC;gBACvF,OAAO,gBAAgB,CAAC;YAC1B,CAAC;YAAC,OAAO,eAAoB,EAAE,CAAC;gBAC9B,mCAAmC;gBACnC,OAAO,CAAC,KAAK,CAAC,wCAAwC,GAAG,KAAK,eAAe,CAAC,OAAO,EAAE,CAAC,CAAC;gBACzF,sDAAsD;gBACtD,iEAAiE;gBACjE,MAAM,eAAe,CAAC;YACxB,CAAC;QACH,CAAC;IACH,CAAC;IAED;;OAEG;IACH,UAAU;QACR,OAAO,IAAI,CAAC,gBAAgB,CAAC,UAAU,EAAE,CAAC;IAC5C,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,OAAO;QACX,MAAM,OAAO,CAAC,UAAU,CAAC;YACvB,IAAI,CAAC,WAAW,CAAC,OAAO,EAAE,EAAE,yCAAyC;YACrE,IAAI,CAAC,gBAAgB,CAAC,OAAO,EAAE;SAChC,CAAC,CAAC;IACL,CAAC;CACF"}
1
+ {"version":3,"file":"HybridEngine.js","sourceRoot":"","sources":["../src/HybridEngine.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,WAAW,EAAE,MAAM,kBAAkB,CAAC;AAC/C,OAAO,EAAE,gBAAgB,EAAE,MAAM,uBAAuB,CAAC;AAIzD;;GAEG;AACH,MAAM,OAAO,YAAY;IACN,WAAW,CAAc;IACzB,gBAAgB,CAAmB;IACnC,MAAM,CAAyB,CAAC,sDAAsD;IACtF,sBAAsB,CAAsB;IAE7D,YAAY,SAAiC,EAAE;QAC7C,4CAA4C;QAC5C,gEAAgE;QAChE,uGAAuG;QACvG,IAAI,CAAC,WAAW,GAAG,IAAI,WAAW,CAAC,EAAE,QAAQ,EAAE,MAAM,CAAC,QAAQ,EAAE,CAAC,CAAC;QAClE,IAAI,CAAC,gBAAgB,GAAG,IAAI,gBAAgB,CAAC,MAAM,CAAC,CAAC;QACrD,IAAI,CAAC,MAAM,GAAG,MAAM,CAAC,CAAC,0BAA0B;QAChD,IAAI,CAAC,sBAAsB,GAAG,MAAM,CAAC,sBAAsB,IAAI,EAAE,CAAC;IACpE,CAAC;IAEO,WAAW,CAAC,WAAmB;QACrC,IAAI,CAAC,WAAW,IAAI,WAAW,CAAC,MAAM,GAAG,GAAG,EAAE,CAAC;YAC7C,+CAA+C;YAC/C,iFAAiF;YACjF,IAAI,WAAW,CAAC,QAAQ,CAAC,YAAY,CAAC;gBAAE,OAAO,IAAI,CAAC;QACtD,CAAC;QACD,2BAA2B;QAC3B,IAAI,WAAW,CAAC,QAAQ,CAAC,YAAY,CAAC;YAAE,OAAO,IAAI,CAAC;QAEpD,mCAAmC;QACnC,IAAI,qDAAqD,CAAC,IAAI,CAAC,WAAW,CAAC;YAAE,OAAO,IAAI,CAAC;QAEzF,mDAAmD;QACnD,IAAI,sBAAsB,CAAC,IAAI,CAAC,WAAW,CAAC,IAAI,CAAC,eAAe,CAAC,IAAI,CAAC,WAAW,CAAC;YAAE,OAAO,IAAI,CAAC;QAEhG,OAAO,KAAK,CAAC;IACf,CAAC;IAED,KAAK,CAAC,SAAS,CAAC,GAAW,EAAE,UAAwB,EAAE;QACrD,oDAAoD;QACpD,gHAAgH;QAChH,MAAM,gBAAgB,GACpB,OAAO,CAAC,OAAO,KAAK,SAAS,CAAC,CAAC,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,OAAO,KAAK,SAAS,CAAC,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,KAAK,CAAC;QACpH,MAAM,iBAAiB,GACrB,OAAO,CAAC,QAAQ,KAAK,SAAS;YAC5B,CAAC,CAAC,OAAO,CAAC,QAAQ;YAClB,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,KAAK,SAAS;gBAClC,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ;gBACtB,CAAC,CAAC,KAAK,CAAC;QAEd,yFAAyF;QACzF,MAAM,iBAAiB,GAA6D;YAClF,GAAG,IAAI,CAAC,MAAM,EAAE,uEAAuE;YACvF,GAAG,OAAO,EAAE,wCAAwC;YACpD,QAAQ,EAAE,iBAAiB,EAAE,sDAAsD;YACnF,OAAO,EAAE,gBAAgB,EAAE,qDAAqD;SACjF,CAAC;QAEF,qCAAqC;QACrC,KAAK,MAAM,OAAO,IAAI,IAAI,CAAC,sBAAsB,EAAE,CAAC;YAClD,IAAI,OAAO,OAAO,KAAK,QAAQ,IAAI,GAAG,CAAC,QAAQ,CAAC,OAAO,CAAC,EAAE,CAAC;gBACzD,OAAO,CAAC,IAAI,CAAC,qBAAqB,GAAG,4BAA4B,OAAO,qCAAqC,CAAC,CAAC;gBAC/G,OAAO,IAAI,CAAC,
gBAAgB,CAAC,SAAS,CAAC,GAAG,EAAE,iBAAiB,CAAC,CAAC;YACjE,CAAC;iBAAM,IAAI,OAAO,YAAY,MAAM,IAAI,OAAO,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC;gBAC1D,OAAO,CAAC,IAAI,CACV,qBAAqB,GAAG,2BAA2B,OAAO,CAAC,QAAQ,EAAE,qCAAqC,CAC3G,CAAC;gBACF,OAAO,IAAI,CAAC,gBAAgB,CAAC,SAAS,CAAC,GAAG,EAAE,iBAAiB,CAAC,CAAC;YACjE,CAAC;QACH,CAAC;QAED,IAAI,CAAC;YACH,MAAM,WAAW,GAAG,MAAM,IAAI,CAAC,WAAW,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC;YAE1D,6EAA6E;YAC7E,IAAI,gBAAgB,IAAI,WAAW,IAAI,WAAW,CAAC,OAAO,EAAE,CAAC;gBAC3D,IAAI,IAAI,CAAC,WAAW,CAAC,WAAW,CAAC,OAAO,CAAC,EAAE,CAAC;oBAC1C,OAAO,CAAC,IAAI,CACV,2DAA2D,GAAG,wCAAwC,CACvG,CAAC;oBACF,yEAAyE;oBACzE,OAAO,IAAI,CAAC,gBAAgB,CAAC,SAAS,CAAC,GAAG,EAAE,iBAAiB,CAAC,CAAC;gBACjE,CAAC;YACH,CAAC;YACD,wFAAwF;YACxF,OAAO,WAAW,CAAC;QACrB,CAAC;QAAC,OAAO,UAAe,EAAE,CAAC;YACzB,OAAO,CAAC,IAAI,CACV,wCAAwC,GAAG,KAAK,UAAU,CAAC,OAAO,qCAAqC,CACxG,CAAC;YACF,IAAI,CAAC;gBACH,yEAAyE;gBACzE,MAAM,gBAAgB,GAAG,MAAM,IAAI,CAAC,gBAAgB,CAAC,SAAS,CAAC,GAAG,EAAE,iBAAiB,CAAC,CAAC;gBACvF,OAAO,gBAAgB,CAAC;YAC1B,CAAC;YAAC,OAAO,eAAoB,EAAE,CAAC;gBAC9B,OAAO,CAAC,KAAK,CAAC,2DAA2D,GAAG,KAAK,eAAe,CAAC,OAAO,EAAE,CAAC,CAAC;gBAC5G,MAAM,eAAe,CAAC,CAAC,8DAA8D;YACvF,CAAC;QACH,CAAC;IACH,CAAC;IAED;;OAEG;IACH,UAAU;QACR,OAAO,IAAI,CAAC,gBAAgB,CAAC,UAAU,EAAE,CAAC;IAC5C,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,OAAO;QACX,MAAM,OAAO,CAAC,UAAU,CAAC;YACvB,IAAI,CAAC,WAAW,CAAC,OAAO,EAAE,EAAE,yCAAyC;YACrE,IAAI,CAAC,gBAAgB,CAAC,OAAO,EAAE;SAChC,CAAC,CAAC;IACL,CAAC;CACF"}
@@ -51,19 +51,48 @@ export declare class PlaywrightEngine implements IEngine {
51
51
  * @param url The URL to fetch.
52
52
  * @param options Optional settings for this specific fetch operation.
53
53
  * @param options.fastMode Overrides the engine's `defaultFastMode` configuration for this request.
54
+ * @param options.spaMode Overrides the engine's `spaMode` configuration for this request.
54
55
  * @returns A Promise resolving to an HTMLFetchResult object.
55
56
  * @throws {FetchError} If the fetch fails after all retries or encounters critical errors.
56
57
  */
57
58
  fetchHTML(url: string, options?: FetchOptions & {
58
59
  markdown?: boolean;
60
+ spaMode?: boolean;
59
61
  }): Promise<HTMLFetchResult>;
62
+ /**
63
+ * Helper to check cache and potentially return a cached result.
64
+ * Handles logic for re-fetching if cache is stale or content type mismatch for markdown.
65
+ *
66
+ * @param url URL to check in cache
67
+ * @param currentConfig Current fetch configuration
68
+ * @returns Cached result or null if not found/needs re-fetch.
69
+ */
70
+ private _handleCacheCheck;
71
+ /**
72
+ * Attempts to fetch the URL using a simple HTTP GET request as a fallback.
73
+ *
74
+ * @param url The URL to fetch.
75
+ * @param currentConfig The current fetch configuration.
76
+ * @returns A Promise resolving to an HTMLFetchResult if successful, or null if fallback is skipped or a challenge page is encountered.
77
+ * @throws {FetchError} If the HTTP fallback itself fails with an unrecoverable error.
78
+ */
79
+ private _attemptHttpFallback;
80
+ /**
81
+ * Ensures the browser pool is initialized with the correct mode (headed/headless).
82
+ * Handles one retry attempt if the initial pool initialization fails.
83
+ *
84
+ * @param useHeadedMode Whether to initialize the pool in headed mode.
85
+ * @param currentConfig The current fetch configuration (for retryDelay).
86
+ * @returns A Promise that resolves when the pool is initialized, or rejects if initialization fails after retries.
87
+ * @throws {FetchError} If pool initialization fails after retries or if the pool is unavailable.
88
+ */
89
+ private _ensureBrowserPoolInitialized;
60
90
  /**
61
91
  * Internal recursive method to handle fetching with retries.
62
92
  *
63
93
  * @param url URL to fetch
64
94
  * @param currentConfig The merged configuration including markdown option
65
95
  * @param retryAttempt Current retry attempt number (starts at 0)
66
- * @param parentRetryCount Tracks retries related to pool initialization errors (starts at 0)
67
96
  * @returns Promise resolving to HTMLFetchResult
68
97
  */
69
98
  private _fetchRecursive;
@@ -1 +1 @@
1
- {"version":3,"file":"PlaywrightEngine.d.ts","sourceRoot":"","sources":["../src/PlaywrightEngine.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,eAAe,EAAE,cAAc,EAAE,sBAAsB,EAAE,YAAY,EAAE,MAAM,YAAY,CAAC;AACxG,OAAO,KAAK,EAAE,OAAO,EAAE,MAAM,cAAc,CAAC;AAmB5C;;;;;;GAMG;AACH,qBAAa,gBAAiB,YAAW,OAAO;IAC9C,OAAO,CAAC,WAAW,CAAsC;IACzD,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAS;IAC/B,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAsC;IAC5D,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAmC;IAG1D,OAAO,CAAC,uBAAuB,CAAkB;IACjD,OAAO,CAAC,iBAAiB,CAAkB;IAC3C,OAAO,CAAC,mBAAmB,CAA0B;IAGrD,OAAO,CAAC,MAAM,CAAC,QAAQ,CAAC,cAAc,CAkBpC;IAEF;;;;;OAKG;gBACS,MAAM,GAAE,sBAA2B;IAM/C;;OAEG;YACW,qBAAqB;IAuCnC;;;OAGG;YACW,yBAAyB;IAmFvC,OAAO,CAAC,UAAU;IAalB;;OAEG;YACW,WAAW;IAazB;;OAEG;YACW,qBAAqB;IAqCnC;;OAEG;IACH,OAAO,CAAC,UAAU;IAUlB;;;;;;;;OAQG;IACG,SAAS,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,GAAE,YAAY,GAAG;QAAE,QAAQ,CAAC,EAAE,OAAO,CAAA;KAAO,GAAG,OAAO,CAAC,eAAe,CAAC;IAU3G;;;;;;;;OAQG;YACW,eAAe;IAsH7B;;;OAGG;YACW,mBAAmB;YAmJnB,kBAAkB;IAmChC;;;;;OAKG;IACG,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC;IAe9B;;;OAGG;IACH,UAAU,IAAI,cAAc,EAAE;IAQ9B,OAAO,CAAC,mBAAmB;CAS5B"}
1
+ {"version":3,"file":"PlaywrightEngine.d.ts","sourceRoot":"","sources":["../src/PlaywrightEngine.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,eAAe,EAAE,cAAc,EAAE,sBAAsB,EAAE,YAAY,EAAE,MAAM,YAAY,CAAC;AACxG,OAAO,KAAK,EAAE,OAAO,EAAE,MAAM,cAAc,CAAC;AAyC5C;;;;;;GAMG;AACH,qBAAa,gBAAiB,YAAW,OAAO;IAC9C,OAAO,CAAC,WAAW,CAAsC;IACzD,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAS;IAC/B,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAsC;IAC5D,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAiC;IAGxD,OAAO,CAAC,uBAAuB,CAAkB;IACjD,OAAO,CAAC,iBAAiB,CAAkB;IAC3C,OAAO,CAAC,mBAAmB,CAA0B;IAGrD,OAAO,CAAC,MAAM,CAAC,QAAQ,CAAC,cAAc,CAsBpC;IAEF;;;;;OAKG;gBACS,MAAM,GAAE,sBAA2B;IAM/C;;OAEG;YACW,qBAAqB;IAwCnC;;;OAGG;YACW,yBAAyB;IAiEvC,OAAO,CAAC,UAAU;IAalB;;OAEG;YACW,WAAW;IAazB;;OAEG;YACW,qBAAqB;IAwCnC;;OAEG;IACH,OAAO,CAAC,UAAU;IAUlB;;;;;;;;;OASG;IACG,SAAS,CACb,GAAG,EAAE,MAAM,EACX,OAAO,GAAE,YAAY,GAAG;QAAE,QAAQ,CAAC,EAAE,OAAO,CAAC;QAAC,OAAO,CAAC,EAAE,OAAO,CAAA;KAAO,GACrE,OAAO,CAAC,eAAe,CAAC;IAc3B;;;;;;;OAOG;IACH,OAAO,CAAC,iBAAiB;IAmDzB;;;;;;;OAOG;YACW,oBAAoB;IAiClC;;;;;;;;OAQG;YACW,6BAA6B;IAmC3C;;;;;;;OAOG;YACW,eAAe;IAgH7B;;;OAGG;YACW,mBAAmB;YAqKnB,kBAAkB;IAyChC;;;;;OAKG;IACG,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC;IAoB9B;;;OAGG;IACH,UAAU,IAAI,cAAc,EAAE;IAQ9B,OAAO,CAAC,mBAAmB;CAS5B"}