@purepageio/fetch-engines 0.2.11 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +95 -54
- package/dist/HybridEngine.d.ts +2 -0
- package/dist/HybridEngine.d.ts.map +1 -1
- package/dist/HybridEngine.js +60 -15
- package/dist/HybridEngine.js.map +1 -1
- package/dist/PlaywrightEngine.d.ts +30 -1
- package/dist/PlaywrightEngine.d.ts.map +1 -1
- package/dist/PlaywrightEngine.js +247 -136
- package/dist/PlaywrightEngine.js.map +1 -1
- package/dist/browser/PlaywrightBrowserPool.d.ts +3 -1
- package/dist/browser/PlaywrightBrowserPool.d.ts.map +1 -1
- package/dist/browser/PlaywrightBrowserPool.js +319 -190
- package/dist/browser/PlaywrightBrowserPool.js.map +1 -1
- package/dist/constants.d.ts +29 -0
- package/dist/constants.d.ts.map +1 -0
- package/dist/constants.js +31 -0
- package/dist/constants.js.map +1 -0
- package/dist/types.d.ts +31 -1
- package/dist/types.d.ts.map +1 -1
- package/dist/utils/markdown-converter.d.ts +7 -0
- package/dist/utils/markdown-converter.d.ts.map +1 -1
- package/dist/utils/markdown-converter.js +155 -64
- package/dist/utils/markdown-converter.js.map +1 -1
- package/package.json +2 -1
package/README.md
CHANGED
|
@@ -32,6 +32,7 @@ This package provides a high-level abstraction, letting you focus on using the w
|
|
|
32
32
|
- [API Reference](#api-reference)
|
|
33
33
|
- [Stealth / Anti-Detection (`PlaywrightEngine`)](#stealth--anti-detection-playwrightengine)
|
|
34
34
|
- [Error Handling](#error-handling)
|
|
35
|
+
- [Logging](#logging)
|
|
35
36
|
- [Contributing](#contributing)
|
|
36
37
|
- [License](#license)
|
|
37
38
|
|
|
@@ -106,8 +107,11 @@ main();
|
|
|
106
107
|
```typescript
|
|
107
108
|
import { PlaywrightEngine } from "@purepageio/fetch-engines";
|
|
108
109
|
|
|
109
|
-
// Engine configured to fetch HTML by default
|
|
110
|
-
const engine = new PlaywrightEngine({
|
|
110
|
+
// Engine configured to fetch HTML by default and pass custom launch arguments
|
|
111
|
+
const engine = new PlaywrightEngine({
|
|
112
|
+
markdown: false,
|
|
113
|
+
playwrightLaunchOptions: { args: ["--disable-gpu"] },
|
|
114
|
+
});
|
|
111
115
|
|
|
112
116
|
async function main() {
|
|
113
117
|
try {
|
|
@@ -191,17 +195,20 @@ The `PlaywrightEngine` accepts a `PlaywrightEngineConfig` object with the follow
|
|
|
191
195
|
|
|
192
196
|
**General Options:**
|
|
193
197
|
|
|
194
|
-
| Option
|
|
195
|
-
|
|
|
196
|
-
| `markdown`
|
|
197
|
-
| `useHttpFallback`
|
|
198
|
-
| `useHeadedModeFallback`
|
|
199
|
-
| `defaultFastMode`
|
|
200
|
-
| `simulateHumanBehavior`
|
|
201
|
-
| `concurrentPages`
|
|
202
|
-
| `maxRetries`
|
|
203
|
-
| `retryDelay`
|
|
204
|
-
| `cacheTTL`
|
|
198
|
+
| Option | Type | Default | Description |
|
|
199
|
+
| ------------------------- | --------------- | ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
|
200
|
+
| `markdown` | `boolean` | `false` | If `true`, converts content (from Playwright or its internal HTTP fallback) to Markdown. `contentType` will be `'markdown'`. Can be overridden per-request. |
|
|
201
|
+
| `useHttpFallback` | `boolean` | `true` | If `true`, attempts a fast HTTP fetch before using Playwright. Ineffective if `spaMode` is `true`. |
|
|
202
|
+
| `useHeadedModeFallback` | `boolean` | `false` | If `true`, automatically retries specific failed Playwright attempts in headed (visible) mode. |
|
|
203
|
+
| `defaultFastMode` | `boolean` | `true` | If `true`, initially blocks non-essential resources and skips human simulation. Can be overridden per-request. Effectively `false` if `spaMode` is `true`. |
|
|
204
|
+
| `simulateHumanBehavior` | `boolean` | `true` | If `true` (and not `fastMode` or `spaMode`), attempts basic human-like interactions. |
|
|
205
|
+
| `concurrentPages` | `number` | `3` | Max number of pages to process concurrently within the engine queue. |
|
|
206
|
+
| `maxRetries` | `number` | `3` | Max retry attempts for a failed fetch (excluding initial try). |
|
|
207
|
+
| `retryDelay` | `number` | `5000` | Delay (ms) between retries. |
|
|
208
|
+
| `cacheTTL` | `number` | `900000` | Cache Time-To-Live (ms). `0` disables caching. (15 mins default) |
|
|
209
|
+
| `spaMode` | `boolean` | `false` | If `true`, enables Single Page Application mode. This typically bypasses `useHttpFallback`, effectively sets `fastMode` to `false`, uses more patient load conditions (e.g., network idle), and may apply `spaRenderDelayMs`. Recommended for JavaScript-heavy sites. |
|
|
210
|
+
| `spaRenderDelayMs` | `number` | `0` | Explicit delay (ms) after page load events in `spaMode` to allow for client-side rendering. Only applies if `spaMode` is `true`. |
|
|
211
|
+
| `playwrightLaunchOptions` | `LaunchOptions` | `undefined` | Optional Playwright launch options (from `playwright` package, e.g., `{ args: ['--some-flag'] }`) passed when a browser instance is created. Merged with internal defaults. |
|
|
205
212
|
|
|
206
213
|
**Browser Pool Options (Passed to internal `PlaywrightBrowserPool`):**
|
|
207
214
|
|
|
@@ -218,14 +225,42 @@ The `PlaywrightEngine` accepts a `PlaywrightEngineConfig` object with the follow
|
|
|
218
225
|
|
|
219
226
|
### HybridEngine
|
|
220
227
|
|
|
221
|
-
The `HybridEngine` constructor accepts
|
|
228
|
+
The `HybridEngine` constructor accepts `PlaywrightEngineConfig` options. These settings configure the underlying engines and the hybrid strategy:
|
|
222
229
|
|
|
223
|
-
-
|
|
224
|
-
-
|
|
225
|
-
-
|
|
230
|
+
- **Constructor `markdown` option:**
|
|
231
|
+
- Sets the default Markdown conversion for the internal `FetchEngine`. This `FetchEngine` instance **does not** react to per-request `markdown` overrides.
|
|
232
|
+
- Sets the default for the internal `PlaywrightEngine`.
|
|
233
|
+
- **Constructor `spaMode` option:**
|
|
234
|
+
- Sets the default SPA mode for `HybridEngine`. If `true`, `HybridEngine` checks `FetchEngine`'s output for SPA shell characteristics. If an SPA shell is detected, it forces a fallback to `PlaywrightEngine` (which will also run in SPA mode).
|
|
235
|
+
- Sets the default for the internal `PlaywrightEngine`.
|
|
236
|
+
- **Other `PlaywrightEngineConfig` options** (e.g., `maxRetries`, `cacheTTL`, `playwrightLaunchOptions`, pool settings) are primarily passed to and used by the internal `PlaywrightEngine`.
|
|
237
|
+
|
|
238
|
+
**Per-request `options` in `HybridEngine.fetchHTML(url, options)`:**
|
|
239
|
+
|
|
240
|
+
- **`options.markdown` (`boolean`):**
|
|
241
|
+
- If `FetchEngine` succeeds and its content is used (i.e., not an SPA shell when `spaMode` is active), this per-request `markdown` option is **ignored**. The content's format is determined by the `FetchEngine`'s constructor `markdown` setting.
|
|
242
|
+
- If `HybridEngine` falls back to `PlaywrightEngine` (due to `FetchEngine` failure or SPA shell detection), this per-request `markdown` option **overrides** the `PlaywrightEngine`'s default and determines if its output is Markdown.
|
|
243
|
+
- **`options.spaMode` (`boolean`):**
|
|
244
|
+
- Overrides the `HybridEngine`'s default SPA mode behavior for this specific request (affecting SPA shell detection and potential fallback to `PlaywrightEngine`).
|
|
245
|
+
- If `PlaywrightEngine` is used, this option also overrides its default SPA mode.
|
|
246
|
+
- **`options.fastMode` (`boolean`):**
|
|
247
|
+
- If `PlaywrightEngine` is used, this option overrides its `defaultFastMode` setting. It has no effect on `FetchEngine`.
|
|
226
248
|
|
|
227
249
|
```typescript
|
|
228
|
-
//
|
|
250
|
+
// Example: HybridEngine with SPA mode enabled by default
|
|
251
|
+
const spaHybridEngine = new HybridEngine({ spaMode: true, spaRenderDelayMs: 2000 });
|
|
252
|
+
|
|
253
|
+
async function fetchSpaSite() {
|
|
254
|
+
try {
|
|
255
|
+
// This will use PlaywrightEngine directly if smallblackdots is an SPA shell
|
|
256
|
+
const result = await spaHybridEngine.fetchHTML(
|
|
257
|
+
"https://www.smallblackdots.net/release/16109/corrina-joseph-wish-tonite-lonely"
|
|
258
|
+
);
|
|
259
|
+
console.log(`Title: ${result.title}`);
|
|
260
|
+
} catch (e) {
|
|
261
|
+
console.error(e);
|
|
262
|
+
}
|
|
263
|
+
}
|
|
229
264
|
```
|
|
230
265
|
|
|
231
266
|
## Return Value
|
|
@@ -248,6 +283,7 @@ All `fetchHTML()` methods return a Promise that resolves to an `HTMLFetchResult`
|
|
|
248
283
|
- `options?` (`FetchOptions`): Optional per-request overrides.
|
|
249
284
|
- `markdown?: boolean`: (Playwright/Hybrid only) Request Markdown conversion. For Hybrid, only applies on fallback to Playwright.
|
|
250
285
|
- `fastMode?: boolean`: (Playwright/Hybrid only) Override fast mode.
|
|
286
|
+
- `spaMode?: boolean`: (Playwright/Hybrid only) Override SPA mode behavior for this request.
|
|
251
287
|
- **Returns:** `Promise<HTMLFetchResult>`
|
|
252
288
|
|
|
253
289
|
Fetches content, returning HTML or Markdown based on configuration/options in `result.content` with `result.contentType` indicating the format.
|
|
@@ -276,67 +312,72 @@ Errors during fetching are typically thrown as instances of `FetchError` (or its
|
|
|
276
312
|
- `originalError` (`Error | undefined`): The underlying error that caused this fetch error (e.g., a Playwright error object).
|
|
277
313
|
- `statusCode` (`number | undefined`): The HTTP status code, if relevant (especially for `FetchEngineHttpError`).
|
|
278
314
|
|
|
279
|
-
Common
|
|
280
|
-
|
|
281
|
-
-
|
|
282
|
-
-
|
|
283
|
-
-
|
|
284
|
-
-
|
|
285
|
-
-
|
|
286
|
-
-
|
|
287
|
-
-
|
|
315
|
+
Common `FetchError` codes and scenarios:
|
|
316
|
+
|
|
317
|
+
- **`ERR_HTTP_ERROR`**: Thrown by `FetchEngine` for HTTP status codes >= 400. `error.statusCode` will be set.
|
|
318
|
+
- **`ERR_NON_HTML_CONTENT`**: Thrown by `FetchEngine` if the content type is not HTML and `markdown` conversion is not requested.
|
|
319
|
+
- **`ERR_PLAYWRIGHT_OPERATION`**: A general error from `PlaywrightEngine` indicating a failure during a Playwright operation (e.g., page acquisition, navigation, interaction). The `originalError` property will often contain the specific Playwright error.
|
|
320
|
+
- **`ERR_NAVIGATION`**: Often seen as part of `ERR_PLAYWRIGHT_OPERATION`'s message or in `originalError` when a Playwright navigation fails (e.g., timeout, SSL error).
|
|
321
|
+
- **`ERR_MARKDOWN_CONVERSION_NON_HTML`**: Thrown by `PlaywrightEngine` (or `HybridEngine` if falling back to Playwright) if `markdown: true` is requested for a non-HTML content type (e.g., XML, JSON).
|
|
322
|
+
- **`ERR_UNSUPPORTED_RAW_CONTENT_TYPE`**: Thrown by `PlaywrightEngine` if `markdown: false` is requested for a content type it doesn't support for direct fetching (e.g., images, applications). Currently, it primarily supports `text/*` and `application/json`, `application/xml` like types when `markdown: false`.
|
|
323
|
+
- **`ERR_CACHE_ERROR`**: Indicates an issue with cache read/write operations.
|
|
324
|
+
- **`ERR_PROXY_CONFIG_ERROR`**: Problem with proxy configuration.
|
|
325
|
+
- **`ERR_BROWSER_POOL_EXHAUSTED`**: If the browser pool cannot provide a page (e.g. max browsers reached and all are busy beyond timeout).
|
|
326
|
+
- **Other Scenarios (often wrapped by `ERR_PLAYWRIGHT_OPERATION` or a generic `FetchError`):**
|
|
327
|
+
- Network issues (DNS resolution, connection refused).
|
|
328
|
+
- Proxy connection failures.
|
|
329
|
+
- Page crashes or context/browser disconnections within Playwright.
|
|
330
|
+
- Failures during browser launch or management by the pool.
|
|
288
331
|
|
|
289
332
|
The `HTMLFetchResult` object may also contain an `error` property if the final fetch attempt failed after all retries but an earlier attempt (within retries) might have produced some intermediate (potentially unusable) result data. It's generally best to rely on the thrown error for failure handling.
|
|
290
333
|
|
|
291
334
|
**Example:**
|
|
292
335
|
|
|
293
336
|
```typescript
|
|
294
|
-
import {
|
|
337
|
+
import { PlaywrightEngine, FetchError } from "@purepageio/fetch-engines";
|
|
295
338
|
|
|
296
|
-
|
|
339
|
+
// Example using PlaywrightEngine to illustrate more complex error handling
|
|
340
|
+
const engine = new PlaywrightEngine({ useHttpFallback: false, maxRetries: 1 });
|
|
297
341
|
|
|
298
342
|
async function fetchWithHandling(url: string) {
|
|
299
343
|
try {
|
|
300
344
|
const result = await engine.fetchHTML(url);
|
|
301
|
-
// Note: result.error is less common, primary errors are thrown.
|
|
302
345
|
if (result.error) {
|
|
303
|
-
console.
|
|
304
|
-
} else {
|
|
305
|
-
console.log(`Success for ${url}! Content type: ${result.contentType}`);
|
|
306
|
-
// Use result.content
|
|
346
|
+
console.warn(`Fetch for ${url} included non-critical error after retries: ${result.error.message}`);
|
|
307
347
|
}
|
|
348
|
+
console.log(`Success for ${url}! Title: ${result.title}, Content type: ${result.contentType}`);
|
|
349
|
+
// Use result.content
|
|
308
350
|
} catch (error) {
|
|
309
|
-
console.error(`Fetch failed
|
|
351
|
+
console.error(`Fetch failed for ${url}:`);
|
|
310
352
|
if (error instanceof FetchError) {
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
break;
|
|
316
|
-
case "ERR_NON_HTML_CONTENT":
|
|
317
|
-
console.error(` Wrong Content Type: ${error.message}`);
|
|
318
|
-
break;
|
|
319
|
-
// Add other specific codes as needed
|
|
320
|
-
default:
|
|
321
|
-
console.error(` FetchError (${error.code || "UNKNOWN"}): ${error.message}`);
|
|
322
|
-
break;
|
|
353
|
+
console.error(` Error Code: ${error.code || "N/A"}`);
|
|
354
|
+
console.error(` Message: ${error.message}`);
|
|
355
|
+
if (error.statusCode) {
|
|
356
|
+
console.error(` Status Code: ${error.statusCode}`);
|
|
323
357
|
}
|
|
324
358
|
if (error.originalError) {
|
|
325
|
-
console.error(` Original Error: ${error.originalError.message}`);
|
|
359
|
+
console.error(` Original Error: ${error.originalError.name} - ${error.originalError.message}`);
|
|
360
|
+
}
|
|
361
|
+
// Example of specific handling:
|
|
362
|
+
if (error.code === "ERR_PLAYWRIGHT_OPERATION") {
|
|
363
|
+
console.error(" Hint: This was a Playwright operation failure. Check Playwright logs or originalError.");
|
|
326
364
|
}
|
|
327
365
|
} else if (error instanceof Error) {
|
|
328
|
-
// Handle generic JavaScript errors
|
|
329
366
|
console.error(` Generic Error: ${error.message}`);
|
|
330
367
|
} else {
|
|
331
|
-
|
|
332
|
-
console.error(` Unknown error occurred.`);
|
|
368
|
+
console.error(` Unknown error occurred: ${String(error)}`);
|
|
333
369
|
}
|
|
334
370
|
}
|
|
335
371
|
}
|
|
336
372
|
|
|
337
|
-
|
|
338
|
-
fetchWithHandling("https://
|
|
339
|
-
fetchWithHandling("https://
|
|
373
|
+
async function runExamples() {
|
|
374
|
+
await fetchWithHandling("https://nonexistentdomain.example.com"); // Likely DNS or navigation error
|
|
375
|
+
await fetchWithHandling("https://example.com/non_html_resource.json"); // Test with actual JSON URL if available
|
|
376
|
+
// or a site known to cause Playwright issues for a demo.
|
|
377
|
+
await engine.cleanup(); // Important for PlaywrightEngine
|
|
378
|
+
}
|
|
379
|
+
|
|
380
|
+
runExamples();
|
|
340
381
|
```
|
|
341
382
|
|
|
342
383
|
## Logging
|
package/dist/HybridEngine.d.ts
CHANGED
|
@@ -7,7 +7,9 @@ export declare class HybridEngine implements IEngine {
|
|
|
7
7
|
private readonly fetchEngine;
|
|
8
8
|
private readonly playwrightEngine;
|
|
9
9
|
private readonly config;
|
|
10
|
+
private readonly playwrightOnlyPatterns;
|
|
10
11
|
constructor(config?: PlaywrightEngineConfig);
|
|
12
|
+
private _isSpaShell;
|
|
11
13
|
fetchHTML(url: string, options?: FetchOptions): Promise<HTMLFetchResult>;
|
|
12
14
|
/**
|
|
13
15
|
* Delegates getMetrics to the PlaywrightEngine.
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"HybridEngine.d.ts","sourceRoot":"","sources":["../src/HybridEngine.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,OAAO,EAAE,MAAM,cAAc,CAAC;AAC5C,OAAO,KAAK,EAAE,eAAe,EAAE,sBAAsB,EAAE,YAAY,EAAE,cAAc,EAAE,MAAM,YAAY,CAAC;AAExG;;GAEG;AACH,qBAAa,YAAa,YAAW,OAAO;IAC1C,OAAO,CAAC,QAAQ,CAAC,WAAW,CAAc;IAC1C,OAAO,CAAC,QAAQ,CAAC,gBAAgB,CAAmB;IACpD,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAyB;
|
|
1
|
+
{"version":3,"file":"HybridEngine.d.ts","sourceRoot":"","sources":["../src/HybridEngine.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,OAAO,EAAE,MAAM,cAAc,CAAC;AAC5C,OAAO,KAAK,EAAE,eAAe,EAAE,sBAAsB,EAAE,YAAY,EAAE,cAAc,EAAE,MAAM,YAAY,CAAC;AAExG;;GAEG;AACH,qBAAa,YAAa,YAAW,OAAO;IAC1C,OAAO,CAAC,QAAQ,CAAC,WAAW,CAAc;IAC1C,OAAO,CAAC,QAAQ,CAAC,gBAAgB,CAAmB;IACpD,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAyB;IAChD,OAAO,CAAC,QAAQ,CAAC,sBAAsB,CAAsB;gBAEjD,MAAM,GAAE,sBAA2B;IAU/C,OAAO,CAAC,WAAW;IAkBb,SAAS,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,GAAE,YAAiB,GAAG,OAAO,CAAC,eAAe,CAAC;IA+DlF;;OAEG;IACH,UAAU,IAAI,cAAc,EAAE;IAI9B;;OAEG;IACG,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC;CAM/B"}
|
package/dist/HybridEngine.js
CHANGED
|
@@ -7,39 +7,84 @@ export class HybridEngine {
|
|
|
7
7
|
fetchEngine;
|
|
8
8
|
playwrightEngine;
|
|
9
9
|
config; // Store config for potential per-request PW overrides
|
|
10
|
+
playwrightOnlyPatterns;
|
|
10
11
|
constructor(config = {}) {
|
|
11
12
|
// Pass relevant config parts to each engine
|
|
12
13
|
// FetchEngine only takes markdown option from the shared config
|
|
14
|
+
// spaMode from config is primarily for PlaywrightEngine, but HybridEngine uses it for decision making.
|
|
13
15
|
this.fetchEngine = new FetchEngine({ markdown: config.markdown });
|
|
14
16
|
this.playwrightEngine = new PlaywrightEngine(config);
|
|
15
17
|
this.config = config; // Store for merging later
|
|
18
|
+
this.playwrightOnlyPatterns = config.playwrightOnlyPatterns || [];
|
|
19
|
+
}
|
|
20
|
+
_isSpaShell(htmlContent) {
|
|
21
|
+
if (!htmlContent || htmlContent.length < 150) {
|
|
22
|
+
// Very short content might be a shell or error
|
|
23
|
+
// Heuristic: if it's very short AND contains noscript, good chance it's a shell.
|
|
24
|
+
if (htmlContent.includes("<noscript>"))
|
|
25
|
+
return true;
|
|
26
|
+
}
|
|
27
|
+
// Check for <noscript> tag
|
|
28
|
+
if (htmlContent.includes("<noscript>"))
|
|
29
|
+
return true;
|
|
30
|
+
// Check for common empty root divs
|
|
31
|
+
if (/<div id=(?:"|')?(root|app)(?:"|')?[^>]*>\s*<\/div>/i.test(htmlContent))
|
|
32
|
+
return true;
|
|
33
|
+
// Check for empty title tag or no title tag at all
|
|
34
|
+
if (/<title>\s*<\/title>/i.test(htmlContent) || !/<title[^>]*>/i.test(htmlContent))
|
|
35
|
+
return true;
|
|
36
|
+
return false;
|
|
16
37
|
}
|
|
17
38
|
async fetchHTML(url, options = {}) {
|
|
18
|
-
//
|
|
39
|
+
// Determine effective SPA mode and markdown options
|
|
40
|
+
// HybridEngine defaults to false for these if not otherwise specified in its own config or per-request options.
|
|
41
|
+
const effectiveSpaMode = options.spaMode !== undefined ? options.spaMode : this.config.spaMode !== undefined ? this.config.spaMode : false;
|
|
42
|
+
const effectiveMarkdown = options.markdown !== undefined
|
|
43
|
+
? options.markdown
|
|
44
|
+
: this.config.markdown !== undefined
|
|
45
|
+
? this.config.markdown
|
|
46
|
+
: false;
|
|
47
|
+
// Prepare options for PlaywrightEngine, to be used in fallback scenarios or direct calls
|
|
48
|
+
const playwrightOptions = {
|
|
49
|
+
...this.config, // Start with base config given to HybridEngine (e.g. spaRenderDelayMs)
|
|
50
|
+
...options, // Apply all per-request overrides first
|
|
51
|
+
markdown: effectiveMarkdown, // Then ensure HybridEngine's resolved markdown is set
|
|
52
|
+
spaMode: effectiveSpaMode, // Then ensure HybridEngine's resolved spaMode is set
|
|
53
|
+
};
|
|
54
|
+
// Check playwrightOnlyPatterns first
|
|
55
|
+
for (const pattern of this.playwrightOnlyPatterns) {
|
|
56
|
+
if (typeof pattern === "string" && url.includes(pattern)) {
|
|
57
|
+
console.warn(`HybridEngine: URL ${url} matches string pattern "${pattern}". Using PlaywrightEngine directly.`);
|
|
58
|
+
return this.playwrightEngine.fetchHTML(url, playwrightOptions);
|
|
59
|
+
}
|
|
60
|
+
else if (pattern instanceof RegExp && pattern.test(url)) {
|
|
61
|
+
console.warn(`HybridEngine: URL ${url} matches regex pattern "${pattern.toString()}". Using PlaywrightEngine directly.`);
|
|
62
|
+
return this.playwrightEngine.fetchHTML(url, playwrightOptions);
|
|
63
|
+
}
|
|
64
|
+
}
|
|
19
65
|
try {
|
|
20
66
|
const fetchResult = await this.fetchEngine.fetchHTML(url);
|
|
21
|
-
// If
|
|
22
|
-
|
|
67
|
+
// If FetchEngine succeeded AND spaMode is active, check if it's just a shell
|
|
68
|
+
if (effectiveSpaMode && fetchResult && fetchResult.content) {
|
|
69
|
+
if (this._isSpaShell(fetchResult.content)) {
|
|
70
|
+
console.warn(`HybridEngine: FetchEngine returned likely SPA shell for ${url} in spaMode. Forcing PlaywrightEngine.`);
|
|
71
|
+
// Fallback to PlaywrightEngine, passing the determined effective options
|
|
72
|
+
return this.playwrightEngine.fetchHTML(url, playwrightOptions);
|
|
73
|
+
}
|
|
74
|
+
}
|
|
75
|
+
// If not spaMode, or if spaMode but content is not a shell, return FetchEngine's result
|
|
23
76
|
return fetchResult;
|
|
24
77
|
}
|
|
25
78
|
catch (fetchError) {
|
|
26
|
-
console.warn(`FetchEngine failed for ${url}: ${fetchError.message}. Falling back to PlaywrightEngine.`);
|
|
27
|
-
// Merge constructor config with per-request options for Playwright fallback
|
|
28
|
-
const playwrightOptions = {
|
|
29
|
-
...this.config, // Start with base config given to HybridEngine
|
|
30
|
-
...options, // Override with per-request options
|
|
31
|
-
};
|
|
79
|
+
console.warn(`HybridEngine: FetchEngine failed for ${url}: ${fetchError.message}. Falling back to PlaywrightEngine.`);
|
|
32
80
|
try {
|
|
33
|
-
//
|
|
81
|
+
// Fallback to PlaywrightEngine, passing the determined effective options
|
|
34
82
|
const playwrightResult = await this.playwrightEngine.fetchHTML(url, playwrightOptions);
|
|
35
83
|
return playwrightResult;
|
|
36
84
|
}
|
|
37
85
|
catch (playwrightError) {
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
// Optionally, wrap or prioritize which error to throw
|
|
41
|
-
// Throwing the Playwright error as it's the last one encountered
|
|
42
|
-
throw playwrightError;
|
|
86
|
+
console.error(`HybridEngine: PlaywrightEngine fallback also failed for ${url}: ${playwrightError.message}`);
|
|
87
|
+
throw playwrightError; // Throw the Playwright error as it's the last one encountered
|
|
43
88
|
}
|
|
44
89
|
}
|
|
45
90
|
}
|
package/dist/HybridEngine.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"HybridEngine.js","sourceRoot":"","sources":["../src/HybridEngine.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,WAAW,EAAE,MAAM,kBAAkB,CAAC;AAC/C,OAAO,EAAE,gBAAgB,EAAE,MAAM,uBAAuB,CAAC;AAIzD;;GAEG;AACH,MAAM,OAAO,YAAY;IACN,WAAW,CAAc;IACzB,gBAAgB,CAAmB;IACnC,MAAM,CAAyB,CAAC,sDAAsD;
|
|
1
|
+
{"version":3,"file":"HybridEngine.js","sourceRoot":"","sources":["../src/HybridEngine.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,WAAW,EAAE,MAAM,kBAAkB,CAAC;AAC/C,OAAO,EAAE,gBAAgB,EAAE,MAAM,uBAAuB,CAAC;AAIzD;;GAEG;AACH,MAAM,OAAO,YAAY;IACN,WAAW,CAAc;IACzB,gBAAgB,CAAmB;IACnC,MAAM,CAAyB,CAAC,sDAAsD;IACtF,sBAAsB,CAAsB;IAE7D,YAAY,SAAiC,EAAE;QAC7C,4CAA4C;QAC5C,gEAAgE;QAChE,uGAAuG;QACvG,IAAI,CAAC,WAAW,GAAG,IAAI,WAAW,CAAC,EAAE,QAAQ,EAAE,MAAM,CAAC,QAAQ,EAAE,CAAC,CAAC;QAClE,IAAI,CAAC,gBAAgB,GAAG,IAAI,gBAAgB,CAAC,MAAM,CAAC,CAAC;QACrD,IAAI,CAAC,MAAM,GAAG,MAAM,CAAC,CAAC,0BAA0B;QAChD,IAAI,CAAC,sBAAsB,GAAG,MAAM,CAAC,sBAAsB,IAAI,EAAE,CAAC;IACpE,CAAC;IAEO,WAAW,CAAC,WAAmB;QACrC,IAAI,CAAC,WAAW,IAAI,WAAW,CAAC,MAAM,GAAG,GAAG,EAAE,CAAC;YAC7C,+CAA+C;YAC/C,iFAAiF;YACjF,IAAI,WAAW,CAAC,QAAQ,CAAC,YAAY,CAAC;gBAAE,OAAO,IAAI,CAAC;QACtD,CAAC;QACD,2BAA2B;QAC3B,IAAI,WAAW,CAAC,QAAQ,CAAC,YAAY,CAAC;YAAE,OAAO,IAAI,CAAC;QAEpD,mCAAmC;QACnC,IAAI,qDAAqD,CAAC,IAAI,CAAC,WAAW,CAAC;YAAE,OAAO,IAAI,CAAC;QAEzF,mDAAmD;QACnD,IAAI,sBAAsB,CAAC,IAAI,CAAC,WAAW,CAAC,IAAI,CAAC,eAAe,CAAC,IAAI,CAAC,WAAW,CAAC;YAAE,OAAO,IAAI,CAAC;QAEhG,OAAO,KAAK,CAAC;IACf,CAAC;IAED,KAAK,CAAC,SAAS,CAAC,GAAW,EAAE,UAAwB,EAAE;QACrD,oDAAoD;QACpD,gHAAgH;QAChH,MAAM,gBAAgB,GACpB,OAAO,CAAC,OAAO,KAAK,SAAS,CAAC,CAAC,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,OAAO,KAAK,SAAS,CAAC,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,KAAK,CAAC;QACpH,MAAM,iBAAiB,GACrB,OAAO,CAAC,QAAQ,KAAK,SAAS;YAC5B,CAAC,CAAC,OAAO,CAAC,QAAQ;YAClB,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,KAAK,SAAS;gBAClC,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ;gBACtB,CAAC,CAAC,KAAK,CAAC;QAEd,yFAAyF;QACzF,MAAM,iBAAiB,GAA6D;YAClF,GAAG,IAAI,CAAC,MAAM,EAAE,uEAAuE;YACvF,GAAG,OAAO,EAAE,wCAAwC;YACpD,QAAQ,EAAE,iBAAiB,EAAE,sDAAsD;YACnF,OAAO,EAAE,gBAAgB,EAAE,qDAAqD;SACjF,CAAC;QAEF,qCAAqC;QACrC,KAAK,MAAM,OAAO,IAAI,IAAI,CAAC,sBAAsB,EAAE,CAAC;YAClD,IAAI,OAAO,OAAO,KAAK,QAAQ,IAAI,GAAG,CAAC,QAAQ,CAAC,OAAO,CAAC,EAAE,CAAC;gBACzD,OAAO,CAAC,IAAI,CAAC,qBAAqB,GAAG,4BAA4B,OAAO,qCAAqC,CAAC,CAAC;gBAC/G,OAAO,IAAI,CAAC,gBAAgB,CAAC,SAAS,CAAC,GAAG,EAAE,iBAAiB,CAAC,CAAC;YACjE,CAAC;iBAAM,IAAI,OAAO,YAAY,MAAM,IAAI,OAAO,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC;gBAC1D,OAAO,CAAC,IAAI,CACV,qBAAqB,GAAG,2BAA2B,OAAO,CAAC,QAAQ,EAAE,qCAAqC,CAC3G,CAAC;gBACF,OAAO,IAAI,CAAC,gBAAgB,CAAC,SAAS,CAAC,GAAG,EAAE,iBAAiB,CAAC,CAAC;YACjE,CAAC;QACH,CAAC;QAED,IAAI,CAAC;YACH,MAAM,WAAW,GAAG,MAAM,IAAI,CAAC,WAAW,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC;YAE1D,6EAA6E;YAC7E,IAAI,gBAAgB,IAAI,WAAW,IAAI,WAAW,CAAC,OAAO,EAAE,CAAC;gBAC3D,IAAI,IAAI,CAAC,WAAW,CAAC,WAAW,CAAC,OAAO,CAAC,EAAE,CAAC;oBAC1C,OAAO,CAAC,IAAI,CACV,2DAA2D,GAAG,wCAAwC,CACvG,CAAC;oBACF,yEAAyE;oBACzE,OAAO,IAAI,CAAC,gBAAgB,CAAC,SAAS,CAAC,GAAG,EAAE,iBAAiB,CAAC,CAAC;gBACjE,CAAC;YACH,CAAC;YACD,wFAAwF;YACxF,OAAO,WAAW,CAAC;QACrB,CAAC;QAAC,OAAO,UAAe,EAAE,CAAC;YACzB,OAAO,CAAC,IAAI,CACV,wCAAwC,GAAG,KAAK,UAAU,CAAC,OAAO,qCAAqC,CACxG,CAAC;YACF,IAAI,CAAC;gBACH,yEAAyE;gBACzE,MAAM,gBAAgB,GAAG,MAAM,IAAI,CAAC,gBAAgB,CAAC,SAAS,CAAC,GAAG,EAAE,iBAAiB,CAAC,CAAC;gBACvF,OAAO,gBAAgB,CAAC;YAC1B,CAAC;YAAC,OAAO,eAAoB,EAAE,CAAC;gBAC9B,OAAO,CAAC,KAAK,CAAC,2DAA2D,GAAG,KAAK,eAAe,CAAC,OAAO,EAAE,CAAC,CAAC;gBAC5G,MAAM,eAAe,CAAC,CAAC,8DAA8D;YACvF,CAAC;QACH,CAAC;IACH,CAAC;IAED;;OAEG;IACH,UAAU;QACR,OAAO,IAAI,CAAC,gBAAgB,CAAC,UAAU,EAAE,CAAC;IAC5C,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,OAAO;QACX,MAAM,OAAO,CAAC,UAAU,CAAC;YACvB,IAAI,CAAC,WAAW,CAAC,OAAO,EAAE,EAAE,yCAAyC;YACrE,IAAI,CAAC,gBAAgB,CAAC,OAAO,EAAE;SAChC,CAAC,CAAC;IACL,CAAC;CACF"}
|
|
@@ -51,19 +51,48 @@ export declare class PlaywrightEngine implements IEngine {
|
|
|
51
51
|
* @param url The URL to fetch.
|
|
52
52
|
* @param options Optional settings for this specific fetch operation.
|
|
53
53
|
* @param options.fastMode Overrides the engine's `defaultFastMode` configuration for this request.
|
|
54
|
+
* @param options.spaMode Overrides the engine's `spaMode` configuration for this request.
|
|
54
55
|
* @returns A Promise resolving to an HTMLFetchResult object.
|
|
55
56
|
* @throws {FetchError} If the fetch fails after all retries or encounters critical errors.
|
|
56
57
|
*/
|
|
57
58
|
fetchHTML(url: string, options?: FetchOptions & {
|
|
58
59
|
markdown?: boolean;
|
|
60
|
+
spaMode?: boolean;
|
|
59
61
|
}): Promise<HTMLFetchResult>;
|
|
62
|
+
/**
|
|
63
|
+
* Helper to check cache and potentially return a cached result.
|
|
64
|
+
* Handles logic for re-fetching if cache is stale or content type mismatch for markdown.
|
|
65
|
+
*
|
|
66
|
+
* @param url URL to check in cache
|
|
67
|
+
* @param currentConfig Current fetch configuration
|
|
68
|
+
* @returns Cached result or null if not found/needs re-fetch.
|
|
69
|
+
*/
|
|
70
|
+
private _handleCacheCheck;
|
|
71
|
+
/**
|
|
72
|
+
* Attempts to fetch the URL using a simple HTTP GET request as a fallback.
|
|
73
|
+
*
|
|
74
|
+
* @param url The URL to fetch.
|
|
75
|
+
* @param currentConfig The current fetch configuration.
|
|
76
|
+
* @returns A Promise resolving to an HTMLFetchResult if successful, or null if fallback is skipped or a challenge page is encountered.
|
|
77
|
+
* @throws {FetchError} If the HTTP fallback itself fails with an unrecoverable error.
|
|
78
|
+
*/
|
|
79
|
+
private _attemptHttpFallback;
|
|
80
|
+
/**
|
|
81
|
+
* Ensures the browser pool is initialized with the correct mode (headed/headless).
|
|
82
|
+
* Handles one retry attempt if the initial pool initialization fails.
|
|
83
|
+
*
|
|
84
|
+
* @param useHeadedMode Whether to initialize the pool in headed mode.
|
|
85
|
+
* @param currentConfig The current fetch configuration (for retryDelay).
|
|
86
|
+
* @returns A Promise that resolves when the pool is initialized, or rejects if initialization fails after retries.
|
|
87
|
+
* @throws {FetchError} If pool initialization fails after retries or if the pool is unavailable.
|
|
88
|
+
*/
|
|
89
|
+
private _ensureBrowserPoolInitialized;
|
|
60
90
|
/**
|
|
61
91
|
* Internal recursive method to handle fetching with retries.
|
|
62
92
|
*
|
|
63
93
|
* @param url URL to fetch
|
|
64
94
|
* @param currentConfig The merged configuration including markdown option
|
|
65
95
|
* @param retryAttempt Current retry attempt number (starts at 0)
|
|
66
|
-
* @param parentRetryCount Tracks retries related to pool initialization errors (starts at 0)
|
|
67
96
|
* @returns Promise resolving to HTMLFetchResult
|
|
68
97
|
*/
|
|
69
98
|
private _fetchRecursive;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"PlaywrightEngine.d.ts","sourceRoot":"","sources":["../src/PlaywrightEngine.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,eAAe,EAAE,cAAc,EAAE,sBAAsB,EAAE,YAAY,EAAE,MAAM,YAAY,CAAC;AACxG,OAAO,KAAK,EAAE,OAAO,EAAE,MAAM,cAAc,CAAC;
|
|
1
|
+
{"version":3,"file":"PlaywrightEngine.d.ts","sourceRoot":"","sources":["../src/PlaywrightEngine.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,eAAe,EAAE,cAAc,EAAE,sBAAsB,EAAE,YAAY,EAAE,MAAM,YAAY,CAAC;AACxG,OAAO,KAAK,EAAE,OAAO,EAAE,MAAM,cAAc,CAAC;AAyC5C;;;;;;GAMG;AACH,qBAAa,gBAAiB,YAAW,OAAO;IAC9C,OAAO,CAAC,WAAW,CAAsC;IACzD,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAS;IAC/B,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAsC;IAC5D,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAiC;IAGxD,OAAO,CAAC,uBAAuB,CAAkB;IACjD,OAAO,CAAC,iBAAiB,CAAkB;IAC3C,OAAO,CAAC,mBAAmB,CAA0B;IAGrD,OAAO,CAAC,MAAM,CAAC,QAAQ,CAAC,cAAc,CAsBpC;IAEF;;;;;OAKG;gBACS,MAAM,GAAE,sBAA2B;IAM/C;;OAEG;YACW,qBAAqB;IAwCnC;;;OAGG;YACW,yBAAyB;IAiEvC,OAAO,CAAC,UAAU;IAalB;;OAEG;YACW,WAAW;IAazB;;OAEG;YACW,qBAAqB;IAwCnC;;OAEG;IACH,OAAO,CAAC,UAAU;IAUlB;;;;;;;;;OASG;IACG,SAAS,CACb,GAAG,EAAE,MAAM,EACX,OAAO,GAAE,YAAY,GAAG;QAAE,QAAQ,CAAC,EAAE,OAAO,CAAC;QAAC,OAAO,CAAC,EAAE,OAAO,CAAA;KAAO,GACrE,OAAO,CAAC,eAAe,CAAC;IAc3B;;;;;;;OAOG;IACH,OAAO,CAAC,iBAAiB;IAmDzB;;;;;;;OAOG;YACW,oBAAoB;IAiClC;;;;;;;;OAQG;YACW,6BAA6B;IAmC3C;;;;;;;OAOG;YACW,eAAe;IAgH7B;;;OAGG;YACW,mBAAmB;YAqKnB,kBAAkB;IAyChC;;;;;OAKG;IACG,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC;IAoB9B;;;OAGG;IACH,UAAU,IAAI,cAAc,EAAE;IAQ9B,OAAO,CAAC,mBAAmB;CAS5B"}
|