@purepageio/fetch-engines 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +205 -0
- package/dist/FetchEngine.d.ts +46 -0
- package/dist/FetchEngine.d.ts.map +1 -0
- package/dist/FetchEngine.js +137 -0
- package/dist/FetchEngine.js.map +1 -0
- package/dist/FetchEngine.test.d.ts +2 -0
- package/dist/FetchEngine.test.d.ts.map +1 -0
- package/dist/FetchEngine.test.js +44 -0
- package/dist/FetchEngine.test.js.map +1 -0
- package/dist/HybridEngine.d.ts +15 -0
- package/dist/HybridEngine.d.ts.map +1 -0
- package/dist/HybridEngine.js +45 -0
- package/dist/HybridEngine.js.map +1 -0
- package/dist/IEngine.d.ts +22 -0
- package/dist/IEngine.d.ts.map +1 -0
- package/dist/IEngine.js +2 -0
- package/dist/IEngine.js.map +1 -0
- package/dist/PlaywrightEngine.d.ts +88 -0
- package/dist/PlaywrightEngine.d.ts.map +1 -0
- package/dist/PlaywrightEngine.js +484 -0
- package/dist/PlaywrightEngine.js.map +1 -0
- package/dist/PlaywrightEngine.test.d.ts +2 -0
- package/dist/PlaywrightEngine.test.d.ts.map +1 -0
- package/dist/PlaywrightEngine.test.js +299 -0
- package/dist/PlaywrightEngine.test.js.map +1 -0
- package/dist/PuppeteerEngine.d.ts +21 -0
- package/dist/PuppeteerEngine.d.ts.map +1 -0
- package/dist/PuppeteerEngine.js +412 -0
- package/dist/PuppeteerEngine.js.map +1 -0
- package/dist/browser/BrowserPool.d.ts +29 -0
- package/dist/browser/BrowserPool.d.ts.map +1 -0
- package/dist/browser/BrowserPool.js +378 -0
- package/dist/browser/BrowserPool.js.map +1 -0
- package/dist/browser/PlaywrightBrowserPool.d.ts +78 -0
- package/dist/browser/PlaywrightBrowserPool.d.ts.map +1 -0
- package/dist/browser/PlaywrightBrowserPool.js +429 -0
- package/dist/browser/PlaywrightBrowserPool.js.map +1 -0
- package/dist/browser/PlaywrightBrowserPool.test.d.ts +2 -0
- package/dist/browser/PlaywrightBrowserPool.test.d.ts.map +1 -0
- package/dist/browser/PlaywrightBrowserPool.test.js +422 -0
- package/dist/browser/PlaywrightBrowserPool.test.js.map +1 -0
- package/dist/errors.d.ts +20 -0
- package/dist/errors.d.ts.map +1 -0
- package/dist/errors.js +30 -0
- package/dist/errors.js.map +1 -0
- package/dist/index.d.ts +8 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +5 -0
- package/dist/index.js.map +1 -0
- package/dist/types.d.ts +151 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +2 -0
- package/dist/types.js.map +1 -0
- package/package.json +72 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 Purepage
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,205 @@
|
|
|
1
|
+
# @purepageio/fetch-engines
|
|
2
|
+
|
|
3
|
+
A collection of configurable engines for fetching HTML content using plain `fetch` or Playwright.
|
|
4
|
+
|
|
5
|
+
This package provides robust and customizable ways to retrieve web page content, handling retries, caching, user agents, and optional browser automation via Playwright for complex JavaScript-driven sites.
|
|
6
|
+
|
|
7
|
+
## Installation
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
pnpm add @purepageio/fetch-engines
|
|
11
|
+
# or with npm
|
|
12
|
+
npm install @purepageio/fetch-engines
|
|
13
|
+
# or with yarn
|
|
14
|
+
yarn add @purepageio/fetch-engines
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
If you plan to use the `PlaywrightEngine`, you also need to install Playwright's browser binaries:
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
pnpm exec playwright install
|
|
21
|
+
# or
|
|
22
|
+
npx playwright install
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
## Engines
|
|
26
|
+
|
|
27
|
+
- **`FetchEngine`**: Uses the standard `fetch` API. Suitable for simple HTML pages or APIs returning HTML. Lightweight and fast.
|
|
28
|
+
- **`PlaywrightEngine`**: Uses Playwright to control a headless browser (Chromium, Firefox, WebKit). Handles JavaScript rendering, complex interactions (if needed), and provides options for stealth and anti-bot detection measures. More resource-intensive but necessary for dynamic websites.
|
|
29
|
+
|
|
30
|
+
## Basic Usage
|
|
31
|
+
|
|
32
|
+
### FetchEngine
|
|
33
|
+
|
|
34
|
+
```typescript
|
|
35
|
+
import { FetchEngine } from "@purepageio/fetch-engines";
|
|
36
|
+
|
|
37
|
+
const engine = new FetchEngine();
|
|
38
|
+
|
|
39
|
+
async function main() {
|
|
40
|
+
try {
|
|
41
|
+
const url = "https://example.com";
|
|
42
|
+
const result = await engine.fetchHTML(url);
|
|
43
|
+
console.log(`Fetched ${result.url}`);
|
|
44
|
+
console.log(`Title: ${result.title}`);
|
|
45
|
+
// console.log(`HTML: ${result.html.substring(0, 200)}...`);
|
|
46
|
+
} catch (error) {
|
|
47
|
+
console.error("Fetch failed:", error);
|
|
48
|
+
}
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
main();
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
### PlaywrightEngine
|
|
55
|
+
|
|
56
|
+
```typescript
|
|
57
|
+
import { PlaywrightEngine } from "@purepageio/fetch-engines";
|
|
58
|
+
|
|
59
|
+
// Configure engine options (optional)
|
|
60
|
+
const engine = new PlaywrightEngine({
|
|
61
|
+
maxRetries: 2, // Number of retry attempts
|
|
62
|
+
useHttpFallback: true, // Try simple HTTP fetch first
|
|
63
|
+
cacheTTL: 5 * 60 * 1000, // Cache results for 5 minutes (in milliseconds)
|
|
64
|
+
});
|
|
65
|
+
|
|
66
|
+
async function main() {
|
|
67
|
+
try {
|
|
68
|
+
const url = "https://quotes.toscrape.com/"; // A site that might benefit from JS rendering
|
|
69
|
+
const result = await engine.fetchHTML(url);
|
|
70
|
+
console.log(`Fetched ${result.url}`);
|
|
71
|
+
console.log(`Title: ${result.title}`);
|
|
72
|
+
// console.log(`HTML: ${result.html.substring(0, 200)}...`);
|
|
73
|
+
} catch (error) {
|
|
74
|
+
console.error("Playwright fetch failed:", error);
|
|
75
|
+
} finally {
|
|
76
|
+
// Important: Clean up browser resources when done
|
|
77
|
+
await engine.cleanup();
|
|
78
|
+
}
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
main();
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
## Configuration
|
|
85
|
+
|
|
86
|
+
Engines accept an optional configuration object in their constructor to customize behavior.
|
|
87
|
+
|
|
88
|
+
### FetchEngine
|
|
89
|
+
|
|
90
|
+
The `FetchEngine` currently has **no configurable options** via its constructor. It uses the standard `fetch` API with the runtime's default timeout behavior, performs no retries or caching of its own, and sends a fixed set of browser-like headers.
|
|
91
|
+
|
|
92
|
+
### PlaywrightEngine
|
|
93
|
+
|
|
94
|
+
The `PlaywrightEngine` offers more extensive configuration:
|
|
95
|
+
|
|
96
|
+
**General Options:**
|
|
97
|
+
|
|
98
|
+
- `concurrentPages` (`number`, default: `3`)
|
|
99
|
+
- Maximum number of Playwright pages to process concurrently across all browser instances.
|
|
100
|
+
- `maxRetries` (`number`, default: `3`)
|
|
101
|
+
- Maximum number of retry attempts for a failed fetch operation (excluding initial attempt).
|
|
102
|
+
- `retryDelay` (`number`, default: `5000`)
|
|
103
|
+
- Delay in milliseconds between retry attempts.
|
|
104
|
+
- `cacheTTL` (`number`, default: `900000` (15 minutes))
|
|
105
|
+
- Time-to-live for cached results in milliseconds. Set to `0` to disable the in-memory cache.
|
|
106
|
+
- `useHttpFallback` (`boolean`, default: `true`)
|
|
107
|
+
- If `true`, the engine first attempts a simple, fast HTTP GET request. If this fails or appears to receive a challenge/CAPTCHA page, it then proceeds with a full Playwright browser request.
|
|
108
|
+
- `useHeadedModeFallback` (`boolean`, default: `false`)
|
|
109
|
+
- If `true` and a Playwright request fails (potentially due to bot detection), subsequent requests _to that specific domain_ will automatically use a headed (visible) browser instance, which can sometimes bypass stricter checks. This requires the pool to potentially manage both headless and headed instances.
|
|
110
|
+
- `defaultFastMode` (`boolean`, default: `true`)
|
|
111
|
+
- If `true`, requests initially run in "fast mode", blocking non-essential resources (images, fonts, stylesheets) and skipping human behavior simulation. This can significantly speed up fetches but may break some sites or increase detection risk. This can be overridden per-request via the `fetchHTML` options.
|
|
112
|
+
- `simulateHumanBehavior` (`boolean`, default: `true`)
|
|
113
|
+
- If `true` and the request is _not_ in `fastMode`, the engine attempts basic human-like interactions (e.g., slight delays, mouse movements). _Note: This simulation is currently basic and may not defeat advanced bot detection._
|
|
114
|
+
|
|
115
|
+
**Browser Pool Options:**
|
|
116
|
+
|
|
117
|
+
These options are passed down to configure the underlying `PlaywrightBrowserPool` that manages browser instances.
|
|
118
|
+
|
|
119
|
+
- `maxBrowsers` (`number`, default: `2`)
|
|
120
|
+
- Maximum number of concurrent browser instances (e.g., Chrome processes) the pool will manage.
|
|
121
|
+
- `maxPagesPerContext` (`number`, default: `6`)
|
|
122
|
+
- Maximum number of pages that can be opened within a single browser context (like an isolated browser profile) before the pool prefers using a different context or browser instance. Helps isolate sessions.
|
|
123
|
+
- `maxBrowserAge` (`number`, default: `1200000` (20 minutes))
|
|
124
|
+
- Maximum age in milliseconds a browser instance can live before the pool proactively closes and replaces it. Helps mitigate memory leaks or state issues.
|
|
125
|
+
- `healthCheckInterval` (`number`, default: `60000` (1 minute))
|
|
126
|
+
- How often (in milliseconds) the pool checks the health of its browser instances (e.g., checking connectivity, age).
|
|
127
|
+
- `useHeadedMode` (`boolean`, default: `false`)
|
|
128
|
+
- Forces the _entire_ browser pool to launch browsers in headed (visible) mode instead of the default headless mode. Primarily useful for debugging purposes.
|
|
129
|
+
- `poolBlockedDomains` (`string[]`, default: `[]` - uses pool's internal defaults)
|
|
130
|
+
- List of domain _glob patterns_ (e.g., `*.google-analytics.com`, `*.doubleclick.net`) for requests that the browser should block. An empty array uses the pool's built-in default blocklist (recommended).
|
|
131
|
+
- `poolBlockedResourceTypes` (`string[]`, default: `[]` - uses pool's internal defaults)
|
|
132
|
+
- List of Playwright resource types (e.g., `image`, `stylesheet`, `font`, `media`, `websocket`) to block. Blocking unnecessary resources can speed up page loads. An empty array uses the pool's built-in default blocklist (recommended).
|
|
133
|
+
- `proxy` (`object | undefined`, default: `undefined`)
|
|
134
|
+
- Proxy configuration to be used by the browser instances.
|
|
135
|
+
- `server` (`string`): Proxy URL (e.g., `http://host:port`, `socks5://user:pass@host:port`).
|
|
136
|
+
- `username` (`string`, optional): Proxy username.
|
|
137
|
+
- `password` (`string`, optional): Proxy password.
|
|
138
|
+
|
|
139
|
+
## Return Value
|
|
140
|
+
|
|
141
|
+
Both `FetchEngine.fetchHTML()` and `PlaywrightEngine.fetchHTML()` return a Promise that resolves to an `HTMLFetchResult` object with the following properties:
|
|
142
|
+
|
|
143
|
+
- `html` (`string`): The full HTML content of the fetched page.
|
|
144
|
+
- `title` (`string | null`): The extracted `<title>` tag content. When no title is found, `FetchEngine` returns an empty string; other engines may return `null`.
|
|
145
|
+
- `url` (`string`): The final URL after any redirects.
|
|
146
|
+
- `isFromCache` (`boolean`): `true` if the result was served from the engine's cache, `false` otherwise.
|
|
147
|
+
- `statusCode` (`number | undefined`): The HTTP status code of the final response. This is typically available for `FetchEngine` and the HTTP fallback in `PlaywrightEngine`, but might be `undefined` for some Playwright navigation scenarios if the primary response wasn't directly captured.
|
|
148
|
+
- `error` (`FetchError | Error | undefined`): If an error occurred during the _final_ fetch attempt (after retries), this property will contain the error object. It might be a specific `FetchError` (see Error Handling) or a generic `Error`.
|
|
149
|
+
|
|
150
|
+
## API Reference
|
|
151
|
+
|
|
152
|
+
### `engine.fetchHTML(url, options?)`
|
|
153
|
+
|
|
154
|
+
- `url` (`string`): The URL of the page to fetch.
|
|
155
|
+
- `options` (`object`, optional): Per-request options to override engine defaults.
|
|
156
|
+
- For `PlaywrightEngine`, you can override `fastMode` (`boolean`) to force or disable fast mode for this specific request.
|
|
157
|
+
- _(Other per-request options may be added in the future)._
|
|
158
|
+
- **Returns:** `Promise<HTMLFetchResult>`
|
|
159
|
+
|
|
160
|
+
Fetches the HTML content for the given URL using the engine's configured strategy (plain fetch or Playwright).
|
|
161
|
+
|
|
162
|
+
### `engine.cleanup()` (PlaywrightEngine only)
|
|
163
|
+
|
|
164
|
+
- **Returns:** `Promise<void>`
|
|
165
|
+
|
|
166
|
+
Gracefully shuts down all browser instances managed by the `PlaywrightEngine`'s browser pool. **It is crucial to call `await engine.cleanup()` when you are finished using a `PlaywrightEngine` instance** to release system resources.
|
|
167
|
+
|
|
168
|
+
## Stealth / Anti-Detection (`PlaywrightEngine`)
|
|
169
|
+
|
|
170
|
+
The `PlaywrightEngine` automatically integrates `playwright-extra` and its powerful stealth plugin (`puppeteer-extra-plugin-stealth`). This plugin applies various techniques to make the headless browser controlled by Playwright appear more like a regular human-operated browser, helping to bypass many common bot detection systems.
|
|
171
|
+
|
|
172
|
+
There are **no manual configuration options** for stealth; it is enabled by default when using `PlaywrightEngine`. The previous options (`useStealthMode`, `randomizeFingerprint`, `evasionLevel`) have been removed.
|
|
173
|
+
|
|
174
|
+
While effective, be aware that no stealth technique is foolproof, and sophisticated websites may still detect automated browsing.
|
|
175
|
+
|
|
176
|
+
## Error Handling
|
|
177
|
+
|
|
178
|
+
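Errors during fetching are typically thrown as instances of `FetchError`, which provide more context than standard `Error` objects. `FetchEngine` additionally throws `FetchEngineHttpError` (a direct subclass of `Error` that carries the HTTP `statusCode`) for non-OK responses, and a plain `Error` when the response content type is not HTML.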
|
|
179
|
+
|
|
180
|
+
- `FetchError` properties:
|
|
181
|
+
- `message` (`string`): Description of the error.
|
|
182
|
+
- `code` (`string | undefined`): A specific error code (e.g., `ERR_NAVIGATION_TIMEOUT`, `ERR_HTTP_ERROR`, `ERR_NON_HTML_CONTENT`).
|
|
183
|
+
- `originalError` (`Error | undefined`): The underlying error that caused this fetch error (e.g., a Playwright error object).
|
|
184
|
+
|
|
185
|
+
Common error scenarios include:
|
|
186
|
+
|
|
187
|
+
- Network issues (DNS resolution failure, connection refused).
|
|
188
|
+
- HTTP errors (4xx client errors, 5xx server errors).
|
|
189
|
+
- Non-HTML content type received (for `FetchEngine`).
|
|
190
|
+
- Playwright navigation timeouts.
|
|
191
|
+
- Proxy connection errors.
|
|
192
|
+
- Page crashes within Playwright.
|
|
193
|
+
- Errors thrown by the browser pool (e.g., failure to launch browser).
|
|
194
|
+
|
|
195
|
+
The `HTMLFetchResult` object may also contain an `error` property if the final fetch attempt failed after all retries.
|
|
196
|
+
|
|
197
|
+
## Logging
|
|
198
|
+
|
|
199
|
+
## Contributing
|
|
200
|
+
|
|
201
|
+
Contributions are welcome! Please open an issue or submit a pull request on the [GitHub repository](https://github.com/purepageio/fetch-engines).
|
|
202
|
+
|
|
203
|
+
## License
|
|
204
|
+
|
|
205
|
+
MIT
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
import type { HTMLFetchResult, BrowserMetrics } from "./types.js";
|
|
2
|
+
import type { IEngine } from "./IEngine.js";
|
|
3
|
+
/**
|
|
4
|
+
* Custom error class for HTTP errors from FetchEngine.
|
|
5
|
+
*/
|
|
6
|
+
export declare class FetchEngineHttpError extends Error {
|
|
7
|
+
/** HTTP status code passed to the constructor (the status of the failed response). */
readonly statusCode: number;
|
|
8
|
+
/**
 * @param message Human-readable description of the failure.
 * @param statusCode HTTP status code of the response that triggered the error.
 */
constructor(message: string, statusCode: number);
|
|
9
|
+
}
|
|
10
|
+
/**
|
|
11
|
+
* FetchEngine - A lightweight engine for fetching HTML content using the standard `fetch` API.
|
|
12
|
+
*
|
|
13
|
+
* Ideal for fetching content from static websites or APIs where JavaScript execution is not required.
|
|
14
|
+
* It does not support advanced configurations like retries, caching, or proxies directly.
|
|
15
|
+
*/
|
|
16
|
+
export declare class FetchEngine implements IEngine {
|
|
17
|
+
/** Fixed set of browser-like request headers, assigned once in the constructor and sent with every fetch. */
private readonly headers;
|
|
18
|
+
/**
|
|
19
|
+
* Creates an instance of FetchEngine.
|
|
20
|
+
* Note: This engine currently does not accept configuration options.
|
|
21
|
+
*/
|
|
22
|
+
constructor();
|
|
23
|
+
/**
|
|
24
|
+
* Fetches HTML content from the specified URL using the `fetch` API.
|
|
25
|
+
*
|
|
26
|
+
* @param url The URL to fetch.
|
|
27
|
+
* @returns A Promise resolving to an HTMLFetchResult object.
|
|
28
|
+
* @throws {FetchEngineHttpError} If the HTTP response status is not ok (e.g., 404, 500).
|
|
29
|
+
* @throws {Error} If the content type is not HTML or for other network errors.
|
|
30
|
+
*/
|
|
31
|
+
fetchHTML(url: string): Promise<HTMLFetchResult>;
|
|
32
|
+
/**
 * Heuristic used by fetchHTML to decide whether the parsed document looks like a
 * single-page application; a positive result only produces a console warning.
 */
private detectSPA;
|
|
33
|
+
/**
|
|
34
|
+
* Cleans up resources used by the engine.
|
|
35
|
+
* For FetchEngine, this is a no-op as it doesn't manage persistent resources.
|
|
36
|
+
* @returns A Promise that resolves when cleanup is complete.
|
|
37
|
+
*/
|
|
38
|
+
cleanup(): Promise<void>;
|
|
39
|
+
/**
|
|
40
|
+
* Retrieves metrics for the engine.
|
|
41
|
+
* FetchEngine does not manage browsers, so it returns an empty array.
|
|
42
|
+
* @returns An empty array.
|
|
43
|
+
*/
|
|
44
|
+
getMetrics(): BrowserMetrics[];
|
|
45
|
+
}
|
|
46
|
+
//# sourceMappingURL=FetchEngine.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"FetchEngine.d.ts","sourceRoot":"","sources":["../src/FetchEngine.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,eAAe,EAAE,cAAc,EAAE,MAAM,YAAY,CAAC;AAClE,OAAO,KAAK,EAAE,OAAO,EAAE,MAAM,cAAc,CAAC;AAG5C;;GAEG;AACH,qBAAa,oBAAqB,SAAQ,KAAK;IAC7C,SAAgB,UAAU,EAAE,MAAM,CAAC;gBAEvB,OAAO,EAAE,MAAM,EAAE,UAAU,EAAE,MAAM;CAShD;AAED;;;;;GAKG;AACH,qBAAa,WAAY,YAAW,OAAO;IACzC,OAAO,CAAC,QAAQ,CAAC,OAAO,CAAyB;IAEjD;;;OAGG;;IAeH;;;;;;;OAOG;IACG,SAAS,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC,eAAe,CAAC;IAgDtD,OAAO,CAAC,SAAS;IA+BjB;;;;OAIG;IACG,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC;IAK9B;;;;OAIG;IACH,UAAU,IAAI,cAAc,EAAE;CAI/B"}
|
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
import { JSDOM } from "jsdom";
|
|
2
|
+
/**
|
|
3
|
+
* Custom error class for HTTP errors from FetchEngine.
|
|
4
|
+
*/
|
|
5
|
+
export class FetchEngineHttpError extends Error {
|
|
6
|
+
statusCode;
|
|
7
|
+
constructor(message, statusCode) {
|
|
8
|
+
super(message);
|
|
9
|
+
this.name = "FetchEngineHttpError";
|
|
10
|
+
this.statusCode = statusCode;
|
|
11
|
+
// Maintain proper stack trace (requires target ES2015+ in tsconfig)
|
|
12
|
+
if (Error.captureStackTrace) {
|
|
13
|
+
Error.captureStackTrace(this, FetchEngineHttpError);
|
|
14
|
+
}
|
|
15
|
+
}
|
|
16
|
+
}
|
|
17
|
+
/**
|
|
18
|
+
* FetchEngine - A lightweight engine for fetching HTML content using the standard `fetch` API.
|
|
19
|
+
*
|
|
20
|
+
* Ideal for fetching content from static websites or APIs where JavaScript execution is not required.
|
|
21
|
+
* It does not support advanced configurations like retries, caching, or proxies directly.
|
|
22
|
+
*/
|
|
23
|
+
export class FetchEngine {
|
|
24
|
+
headers;
|
|
25
|
+
/**
|
|
26
|
+
* Creates an instance of FetchEngine.
|
|
27
|
+
* Note: This engine currently does not accept configuration options.
|
|
28
|
+
*/
|
|
29
|
+
constructor() {
|
|
30
|
+
this.headers = {
|
|
31
|
+
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
|
|
32
|
+
Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
|
|
33
|
+
"Accept-Language": "en-US,en;q=0.5",
|
|
34
|
+
"Upgrade-Insecure-Requests": "1",
|
|
35
|
+
"Sec-Fetch-Dest": "document",
|
|
36
|
+
"Sec-Fetch-Mode": "navigate",
|
|
37
|
+
"Sec-Fetch-Site": "none",
|
|
38
|
+
"Sec-Fetch-User": "?1",
|
|
39
|
+
};
|
|
40
|
+
}
|
|
41
|
+
/**
|
|
42
|
+
* Fetches HTML content from the specified URL using the `fetch` API.
|
|
43
|
+
*
|
|
44
|
+
* @param url The URL to fetch.
|
|
45
|
+
* @returns A Promise resolving to an HTMLFetchResult object.
|
|
46
|
+
* @throws {FetchEngineHttpError} If the HTTP response status is not ok (e.g., 404, 500).
|
|
47
|
+
* @throws {Error} If the content type is not HTML or for other network errors.
|
|
48
|
+
*/
|
|
49
|
+
async fetchHTML(url) {
|
|
50
|
+
try {
|
|
51
|
+
const response = await fetch(url, {
|
|
52
|
+
headers: this.headers,
|
|
53
|
+
redirect: "follow",
|
|
54
|
+
});
|
|
55
|
+
if (!response.ok) {
|
|
56
|
+
// Throw the custom error with status code
|
|
57
|
+
throw new FetchEngineHttpError(`HTTP error! status: ${response.status}`, response.status);
|
|
58
|
+
}
|
|
59
|
+
const contentType = response.headers.get("content-type") || "";
|
|
60
|
+
if (!contentType.includes("text/html")) {
|
|
61
|
+
throw new Error("Not an HTML page");
|
|
62
|
+
}
|
|
63
|
+
const html = await response.text();
|
|
64
|
+
// Use JSDOM to parse HTML and extract title
|
|
65
|
+
const dom = new JSDOM(html);
|
|
66
|
+
const title = dom.window.document.title || "";
|
|
67
|
+
// Check for potential SPA markers
|
|
68
|
+
const isSPA = this.detectSPA(dom.window.document);
|
|
69
|
+
if (isSPA) {
|
|
70
|
+
// Removed throwing error here, as the calling code should decide how to handle this.
|
|
71
|
+
// Consider adding a flag to the result instead.
|
|
72
|
+
console.warn(`SPA detected for ${url}, content might be incomplete without JavaScript rendering.`);
|
|
73
|
+
// Example: return { html, title, url: response.url, isSPA: true };
|
|
74
|
+
}
|
|
75
|
+
return {
|
|
76
|
+
html,
|
|
77
|
+
title,
|
|
78
|
+
url: response.url,
|
|
79
|
+
isFromCache: false, // FetchEngine doesn't cache
|
|
80
|
+
statusCode: response.status,
|
|
81
|
+
error: undefined,
|
|
82
|
+
};
|
|
83
|
+
}
|
|
84
|
+
catch (error) {
|
|
85
|
+
// console.error(`FetchEngine failed for ${url}:`, error); // Optional: Keep logging if desired
|
|
86
|
+
// Re-throw the original error to preserve its type (e.g., FetchEngineHttpError)
|
|
87
|
+
// Ensure the result conforms to HTMLFetchResult even on error (for consistency? No, spec says throw)
|
|
88
|
+
throw error;
|
|
89
|
+
}
|
|
90
|
+
}
|
|
91
|
+
detectSPA(document) {
|
|
92
|
+
// Check for common SPA frameworks and patterns
|
|
93
|
+
const spaMarkers = [
|
|
94
|
+
// React
|
|
95
|
+
"[data-reactroot]",
|
|
96
|
+
"#root",
|
|
97
|
+
"#app",
|
|
98
|
+
// Vue
|
|
99
|
+
"[data-v-app]",
|
|
100
|
+
"#app[data-v-]",
|
|
101
|
+
// Angular
|
|
102
|
+
"[ng-version]",
|
|
103
|
+
"[ng-app]",
|
|
104
|
+
// Common SPA patterns
|
|
105
|
+
'script[type="application/json+ld"]', // Less reliable marker
|
|
106
|
+
'meta[name="fragment"]',
|
|
107
|
+
];
|
|
108
|
+
// Check if the body is nearly empty but has JS (More reliable)
|
|
109
|
+
const bodyContent = document.body?.textContent?.trim() || "";
|
|
110
|
+
const hasScripts = document.scripts.length > 0;
|
|
111
|
+
if (bodyContent.length < 150 && hasScripts) {
|
|
112
|
+
// Increased threshold slightly
|
|
113
|
+
return true;
|
|
114
|
+
}
|
|
115
|
+
// Check for SPA markers (Less reliable)
|
|
116
|
+
return spaMarkers.some((selector) => document.querySelector(selector) !== null);
|
|
117
|
+
}
|
|
118
|
+
/**
|
|
119
|
+
* Cleans up resources used by the engine.
|
|
120
|
+
* For FetchEngine, this is a no-op as it doesn't manage persistent resources.
|
|
121
|
+
* @returns A Promise that resolves when cleanup is complete.
|
|
122
|
+
*/
|
|
123
|
+
async cleanup() {
|
|
124
|
+
// No resources to clean up for fetch engine
|
|
125
|
+
return Promise.resolve(); // Explicitly return resolved promise
|
|
126
|
+
}
|
|
127
|
+
/**
|
|
128
|
+
* Retrieves metrics for the engine.
|
|
129
|
+
* FetchEngine does not manage browsers, so it returns an empty array.
|
|
130
|
+
* @returns An empty array.
|
|
131
|
+
*/
|
|
132
|
+
getMetrics() {
|
|
133
|
+
// Fetch engine doesn't maintain browser pool metrics
|
|
134
|
+
return [];
|
|
135
|
+
}
|
|
136
|
+
}
|
|
137
|
+
//# sourceMappingURL=FetchEngine.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"FetchEngine.js","sourceRoot":"","sources":["../src/FetchEngine.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,KAAK,EAAE,MAAM,OAAO,CAAC;AAE9B;;GAEG;AACH,MAAM,OAAO,oBAAqB,SAAQ,KAAK;IAC7B,UAAU,CAAS;IAEnC,YAAY,OAAe,EAAE,UAAkB;QAC7C,KAAK,CAAC,OAAO,CAAC,CAAC;QACf,IAAI,CAAC,IAAI,GAAG,sBAAsB,CAAC;QACnC,IAAI,CAAC,UAAU,GAAG,UAAU,CAAC;QAC7B,oEAAoE;QACpE,IAAI,KAAK,CAAC,iBAAiB,EAAE,CAAC;YAC5B,KAAK,CAAC,iBAAiB,CAAC,IAAI,EAAE,oBAAoB,CAAC,CAAC;QACtD,CAAC;IACH,CAAC;CACF;AAED;;;;;GAKG;AACH,MAAM,OAAO,WAAW;IACL,OAAO,CAAyB;IAEjD;;;OAGG;IACH;QACE,IAAI,CAAC,OAAO,GAAG;YACb,YAAY,EACV,iHAAiH;YACnH,MAAM,EAAE,4EAA4E;YACpF,iBAAiB,EAAE,gBAAgB;YACnC,2BAA2B,EAAE,GAAG;YAChC,gBAAgB,EAAE,UAAU;YAC5B,gBAAgB,EAAE,UAAU;YAC5B,gBAAgB,EAAE,MAAM;YACxB,gBAAgB,EAAE,IAAI;SACvB,CAAC;IACJ,CAAC;IAED;;;;;;;OAOG;IACH,KAAK,CAAC,SAAS,CAAC,GAAW;QACzB,IAAI,CAAC;YACH,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,GAAG,EAAE;gBAChC,OAAO,EAAE,IAAI,CAAC,OAAO;gBACrB,QAAQ,EAAE,QAAQ;aACnB,CAAC,CAAC;YAEH,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;gBACjB,0CAA0C;gBAC1C,MAAM,IAAI,oBAAoB,CAAC,uBAAuB,QAAQ,CAAC,MAAM,EAAE,EAAE,QAAQ,CAAC,MAAM,CAAC,CAAC;YAC5F,CAAC;YAED,MAAM,WAAW,GAAG,QAAQ,CAAC,OAAO,CAAC,GAAG,CAAC,cAAc,CAAC,IAAI,EAAE,CAAC;YAC/D,IAAI,CAAC,WAAW,CAAC,QAAQ,CAAC,WAAW,CAAC,EAAE,CAAC;gBACvC,MAAM,IAAI,KAAK,CAAC,kBAAkB,CAAC,CAAC;YACtC,CAAC;YAED,MAAM,IAAI,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;YAEnC,4CAA4C;YAC5C,MAAM,GAAG,GAAG,IAAI,KAAK,CAAC,IAAI,CAAC,CAAC;YAC5B,MAAM,KAAK,GAAG,GAAG,CAAC,MAAM,CAAC,QAAQ,CAAC,KAAK,IAAI,EAAE,CAAC;YAE9C,kCAAkC;YAClC,MAAM,KAAK,GAAG,IAAI,CAAC,SAAS,CAAC,GAAG,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC;YAClD,IAAI,KAAK,EAAE,CAAC;gBACV,qFAAqF;gBACrF,gDAAgD;gBAChD,OAAO,CAAC,IAAI,CAAC,oBAAoB,GAAG,6DAA6D,CAAC,CAAC;gBACnG,mEAAmE;YACrE,CAAC;YAED,OAAO;gBACL,IAAI;gBACJ,KAAK;gBACL,GAAG,EAAE,QAAQ,CAAC,GAAG;gBACjB,WAAW,EAAE,KAAK,EAAE,4BAA4B;gBAChD,UAAU,EAAE,QAAQ,CAAC,MAAM;gBAC3B,KAAK,EAAE,SAAS;aACjB,CAAC;QACJ,CAAC;QAAC,OAAO,KAAU,EAAE,CAAC;YACpB,+FAA+F;YAC/F,gFAAgF;YAChF,qGAAqG;YACrG,MAAM,KAAK,CAAC;QACd,CAAC;IACH,CAAC;IAEO,SAAS,CAAC,QAAkB;QAClC,
+CAA+C;QAC/C,MAAM,UAAU,GAAG;YACjB,QAAQ;YACR,kBAAkB;YAClB,OAAO;YACP,MAAM;YACN,MAAM;YACN,cAAc;YACd,eAAe;YACf,UAAU;YACV,cAAc;YACd,UAAU;YACV,sBAAsB;YACtB,oCAAoC,EAAE,uBAAuB;YAC7D,uBAAuB;SACxB,CAAC;QAEF,+DAA+D;QAC/D,MAAM,WAAW,GAAG,QAAQ,CAAC,IAAI,EAAE,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;QAC7D,MAAM,UAAU,GAAG,QAAQ,CAAC,OAAO,CAAC,MAAM,GAAG,CAAC,CAAC;QAE/C,IAAI,WAAW,CAAC,MAAM,GAAG,GAAG,IAAI,UAAU,EAAE,CAAC;YAC3C,+BAA+B;YAC/B,OAAO,IAAI,CAAC;QACd,CAAC;QAED,wCAAwC;QACxC,OAAO,UAAU,CAAC,IAAI,CAAC,CAAC,QAAQ,EAAE,EAAE,CAAC,QAAQ,CAAC,aAAa,CAAC,QAAQ,CAAC,KAAK,IAAI,CAAC,CAAC;IAClF,CAAC;IAED;;;;OAIG;IACH,KAAK,CAAC,OAAO;QACX,4CAA4C;QAC5C,OAAO,OAAO,CAAC,OAAO,EAAE,CAAC,CAAC,qCAAqC;IACjE,CAAC;IAED;;;;OAIG;IACH,UAAU;QACR,qDAAqD;QACrD,OAAO,EAAE,CAAC;IACZ,CAAC;CACF"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"FetchEngine.test.d.ts","sourceRoot":"","sources":["../src/FetchEngine.test.ts"],"names":[],"mappings":""}
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
import { describe, it, expect } from "vitest";
|
|
2
|
+
import { FetchEngine } from "./FetchEngine.js";
|
|
3
|
+
// Integration tests for FetchEngine. They hit real endpoints, so network access is required.
describe("FetchEngine", () => {
    it("should fetch HTML and extract title from a static page", async () => {
        const fetcher = new FetchEngine();
        const target = "http://example.com";
        const finalUrl = "http://example.com/"; // fetch normalises the URL with a trailing slash
        const verify = async () => {
            const page = await fetcher.fetchHTML(target);
            expect(page).toBeDefined();
            expect(page.url).toBe(finalUrl);
            expect(page.title).toBe("Example Domain");
            expect(page.html).toContain("<title>Example Domain</title>");
            expect(page.html).toContain("<h1>Example Domain</h1>");
        };
        await verify().catch((error) => {
            // Without fetch or network access this test cannot pass; log a hint,
            // then re-throw so the failure is still reported.
            console.warn("FetchEngine test failed, potentially due to network issues or missing fetch API:", error);
            throw error;
        });
    });
    it("should throw an error for non-HTML content", async () => {
        // httpbin's /json endpoint answers with a JSON content type.
        const pending = new FetchEngine().fetchHTML("https://httpbin.org/json");
        await expect(pending).rejects.toThrow("Not an HTML page");
    });
    it("should throw an error for non-existent domains", async () => {
        // The exact error message depends on the runtime, so only the rejection is asserted.
        const pending = new FetchEngine().fetchHTML("http://domain-that-does-not-exist-fdsahjkl.xyz");
        await expect(pending).rejects.toThrow();
    });
    it("should handle http errors", async () => {
        // This endpoint always answers with a 404 status.
        const pending = new FetchEngine().fetchHTML("https://httpbin.org/status/404");
        await expect(pending).rejects.toThrow(/HTTP error! status: 404/);
    });
    // Add more tests: SPA detection warning, etc.
});
|
|
44
|
+
//# sourceMappingURL=FetchEngine.test.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"FetchEngine.test.js","sourceRoot":"","sources":["../src/FetchEngine.test.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,EAAE,EAAE,MAAM,EAAE,MAAM,QAAQ,CAAC;AAC9C,OAAO,EAAE,WAAW,EAAE,MAAM,kBAAkB,CAAC;AAE/C,QAAQ,CAAC,aAAa,EAAE,GAAG,EAAE;IAC3B,EAAE,CAAC,wDAAwD,EAAE,KAAK,IAAI,EAAE;QACtE,MAAM,MAAM,GAAG,IAAI,WAAW,EAAE,CAAC;QACjC,MAAM,GAAG,GAAG,oBAAoB,CAAC;QACjC,MAAM,WAAW,GAAG,qBAAqB,CAAC,CAAC,wBAAwB;QAEnE,IAAI,CAAC;YACH,MAAM,MAAM,GAAG,MAAM,MAAM,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC;YAE3C,MAAM,CAAC,MAAM,CAAC,CAAC,WAAW,EAAE,CAAC;YAC7B,MAAM,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC,CAAC,kBAAkB;YACxD,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,gBAAgB,CAAC,CAAC;YAC5C,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,SAAS,CAAC,+BAA+B,CAAC,CAAC;YAC/D,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,SAAS,CAAC,yBAAyB,CAAC,CAAC;QAC3D,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,iFAAiF;YACjF,wDAAwD;YACxD,OAAO,CAAC,IAAI,CAAC,kFAAkF,EAAE,KAAK,CAAC,CAAC;YACxG,0DAA0D;YAC1D,MAAM,KAAK,CAAC;QACd,CAAC;IACH,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,4CAA4C,EAAE,KAAK,IAAI,EAAE;QAC1D,MAAM,MAAM,GAAG,IAAI,WAAW,EAAE,CAAC;QACjC,gFAAgF;QAChF,MAAM,GAAG,GAAG,0BAA0B,CAAC;QAEvC,wCAAwC;QACxC,MAAM,MAAM,CAAC,MAAM,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,OAAO,CAAC,OAAO,CAAC,kBAAkB,CAAC,CAAC;IAC1E,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,gDAAgD,EAAE,KAAK,IAAI,EAAE;QAC9D,MAAM,MAAM,GAAG,IAAI,WAAW,EAAE,CAAC;QACjC,MAAM,GAAG,GAAG,gDAAgD,CAAC;QAE7D,mEAAmE;QACnE,MAAM,MAAM,CAAC,MAAM,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,OAAO,CAAC,OAAO,EAAE,CAAC;IACxD,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,2BAA2B,EAAE,KAAK,IAAI,EAAE;QACzC,MAAM,MAAM,GAAG,IAAI,WAAW,EAAE,CAAC;QACjC,MAAM,GAAG,GAAG,gCAAgC,CAAC,CAAC,uBAAuB;QAErE,MAAM,MAAM,CAAC,MAAM,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,OAAO,CAAC,OAAO,CAAC,yBAAyB,CAAC,CAAC;IACjF,CAAC,CAAC,CAAC;IAEH,8CAA8C;AAChD,CAAC,CAAC,CAAC"}
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
import type { HTMLFetchResult, BrowserMetrics, PlaywrightEngineConfig } from "./types.js";
|
|
2
|
+
import { IEngine } from "./IEngine.js";
|
|
3
|
+
/**
|
|
4
|
+
* HybridEngine - Attempts fetching with FetchEngine first for speed,
|
|
5
|
+
* then falls back to PlaywrightEngine for complex sites or specific errors.
|
|
6
|
+
*/
|
|
7
|
+
export declare class HybridEngine implements IEngine {
|
|
8
|
+
/** Engine tried first by fetchHTML; the implementation assigns a FetchEngine instance here. */
private readonly fetchEngine;
|
|
9
|
+
/** Fallback engine; the implementation assigns a PlaywrightEngine built from the constructor's config. */
private readonly playwrightEngine;
|
|
10
|
+
/**
 * @param playwrightConfig Optional settings handed to the PlaywrightEngine constructor;
 *   the implementation defaults this to an empty object.
 */
constructor(playwrightConfig?: PlaywrightEngineConfig);
|
|
11
|
+
/**
 * Fetches HTML for the URL using the fetch engine first and, if that rejects,
 * the Playwright engine.
 * @param url The URL to fetch
 * @returns The result from whichever engine succeeded
 * @throws The Playwright engine's error when both attempts fail
 */
fetchHTML(url: string): Promise<HTMLFetchResult>;
|
|
12
|
+
/**
 * Runs cleanup on both engines concurrently. The implementation waits with
 * Promise.allSettled, so a rejected cleanup promise from one engine does not
 * reject this call.
 */
cleanup(): Promise<void>;
|
|
13
|
+
/**
 * Returns the Playwright engine's metrics only; the implementation does not
 * query the fetch engine.
 */
getMetrics(): BrowserMetrics[];
|
|
14
|
+
}
|
|
15
|
+
//# sourceMappingURL=HybridEngine.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"HybridEngine.d.ts","sourceRoot":"","sources":["../src/HybridEngine.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EACV,eAAe,EACf,cAAc,EACd,sBAAsB,EACvB,MAAM,YAAY,CAAC;AACpB,OAAO,EAAE,OAAO,EAAE,MAAM,cAAc,CAAC;AAEvC;;;GAGG;AACH,qBAAa,YAAa,YAAW,OAAO;IAC1C,OAAO,CAAC,QAAQ,CAAC,WAAW,CAAc;IAC1C,OAAO,CAAC,QAAQ,CAAC,gBAAgB,CAAmB;gBAExC,gBAAgB,GAAE,sBAA2B;IAKnD,SAAS,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC,eAAe,CAAC;IAkBhD,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC;IAQ9B,UAAU,IAAI,cAAc,EAAE;CAI/B"}
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
import { FetchEngine } from "./FetchEngine.js";
|
|
2
|
+
import { PlaywrightEngine } from "./PlaywrightEngine.js";
|
|
3
|
+
/**
|
|
4
|
+
* HybridEngine - Attempts fetching with FetchEngine first for speed,
|
|
5
|
+
* then falls back to PlaywrightEngine for complex sites or specific errors.
|
|
6
|
+
*/
|
|
7
|
+
export class HybridEngine {
|
|
8
|
+
fetchEngine;
|
|
9
|
+
playwrightEngine;
|
|
10
|
+
constructor(playwrightConfig = {}) {
|
|
11
|
+
this.fetchEngine = new FetchEngine();
|
|
12
|
+
this.playwrightEngine = new PlaywrightEngine(playwrightConfig);
|
|
13
|
+
}
|
|
14
|
+
async fetchHTML(url) {
|
|
15
|
+
try {
|
|
16
|
+
// Attempt 1: Use the fast FetchEngine
|
|
17
|
+
const fetchResult = await this.fetchEngine.fetchHTML(url);
|
|
18
|
+
return fetchResult;
|
|
19
|
+
}
|
|
20
|
+
catch (_fetchError) {
|
|
21
|
+
// Prefixed unused error
|
|
22
|
+
// If FetchEngine fails (e.g., 403, network error, non-html), try Playwright
|
|
23
|
+
try {
|
|
24
|
+
const playwrightResult = await this.playwrightEngine.fetchHTML(url);
|
|
25
|
+
return playwrightResult;
|
|
26
|
+
}
|
|
27
|
+
catch (playwrightError) {
|
|
28
|
+
// If Playwright also fails, throw its error (potentially more informative)
|
|
29
|
+
throw playwrightError;
|
|
30
|
+
}
|
|
31
|
+
}
|
|
32
|
+
}
|
|
33
|
+
async cleanup() {
|
|
34
|
+
// Cleanup both engines concurrently
|
|
35
|
+
await Promise.allSettled([
|
|
36
|
+
this.fetchEngine.cleanup(),
|
|
37
|
+
this.playwrightEngine.cleanup(),
|
|
38
|
+
]);
|
|
39
|
+
}
|
|
40
|
+
getMetrics() {
|
|
41
|
+
// FetchEngine doesn't produce metrics, only PlaywrightEngine does
|
|
42
|
+
return this.playwrightEngine.getMetrics();
|
|
43
|
+
}
|
|
44
|
+
}
|
|
45
|
+
//# sourceMappingURL=HybridEngine.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"HybridEngine.js","sourceRoot":"","sources":["../src/HybridEngine.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,WAAW,EAAE,MAAM,kBAAkB,CAAC;AAC/C,OAAO,EAAE,gBAAgB,EAAE,MAAM,uBAAuB,CAAC;AAQzD;;;GAGG;AACH,MAAM,OAAO,YAAY;IACN,WAAW,CAAc;IACzB,gBAAgB,CAAmB;IAEpD,YAAY,mBAA2C,EAAE;QACvD,IAAI,CAAC,WAAW,GAAG,IAAI,WAAW,EAAE,CAAC;QACrC,IAAI,CAAC,gBAAgB,GAAG,IAAI,gBAAgB,CAAC,gBAAgB,CAAC,CAAC;IACjE,CAAC;IAED,KAAK,CAAC,SAAS,CAAC,GAAW;QACzB,IAAI,CAAC;YACH,sCAAsC;YACtC,MAAM,WAAW,GAAG,MAAM,IAAI,CAAC,WAAW,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC;YAC1D,OAAO,WAAW,CAAC;QACrB,CAAC;QAAC,OAAO,WAAgB,EAAE,CAAC;YAC1B,wBAAwB;YACxB,4EAA4E;YAC5E,IAAI,CAAC;gBACH,MAAM,gBAAgB,GAAG,MAAM,IAAI,CAAC,gBAAgB,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC;gBACpE,OAAO,gBAAgB,CAAC;YAC1B,CAAC;YAAC,OAAO,eAAe,EAAE,CAAC;gBACzB,2EAA2E;gBAC3E,MAAM,eAAe,CAAC;YACxB,CAAC;QACH,CAAC;IACH,CAAC;IAED,KAAK,CAAC,OAAO;QACX,oCAAoC;QACpC,MAAM,OAAO,CAAC,UAAU,CAAC;YACvB,IAAI,CAAC,WAAW,CAAC,OAAO,EAAE;YAC1B,IAAI,CAAC,gBAAgB,CAAC,OAAO,EAAE;SAChC,CAAC,CAAC;IACL,CAAC;IAED,UAAU;QACR,kEAAkE;QAClE,OAAO,IAAI,CAAC,gBAAgB,CAAC,UAAU,EAAE,CAAC;IAC5C,CAAC;CACF"}
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
import type { HTMLFetchResult, BrowserMetrics } from "./types.js";
|
|
2
|
+
/**
|
|
3
|
+
* Interface for browser engines that can fetch HTML content from URLs
|
|
4
|
+
*/
|
|
5
|
+
export interface IEngine {
|
|
6
|
+
/**
|
|
7
|
+
* Fetches HTML content from a URL
|
|
8
|
+
* @param url The URL to fetch
|
|
9
|
+
* @returns A promise that resolves to an HTMLFetchResult
|
|
10
|
+
* Failure is reported by rejecting the promise: HybridEngine, which implements
* this interface, rethrows its fallback engine's error when both attempts fail.
*/
|
|
11
|
+
fetchHTML(url: string): Promise<HTMLFetchResult>;
|
|
12
|
+
/**
|
|
13
|
+
* Cleans up resources used by the engine
|
|
14
|
+
* @returns A promise that resolves with no value
*/
|
|
15
|
+
cleanup(): Promise<void>;
|
|
16
|
+
/**
|
|
17
|
+
* Gets metrics about the engine's performance
|
|
18
|
+
* @returns An array of BrowserMetrics
|
|
19
|
+
* An implementation may report only part of its work: HybridEngine returns
* just the metrics of its Playwright engine.
*/
|
|
20
|
+
getMetrics(): BrowserMetrics[];
|
|
21
|
+
}
|
|
22
|
+
//# sourceMappingURL=IEngine.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"IEngine.d.ts","sourceRoot":"","sources":["../src/IEngine.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,eAAe,EAAE,cAAc,EAAE,MAAM,YAAY,CAAC;AAElE;;GAEG;AACH,MAAM,WAAW,OAAO;IACtB;;;;OAIG;IACH,SAAS,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC,eAAe,CAAC,CAAC;IAEjD;;OAEG;IACH,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC,CAAC;IAEzB;;;OAGG;IACH,UAAU,IAAI,cAAc,EAAE,CAAC;CAChC"}
|
package/dist/IEngine.js
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"IEngine.js","sourceRoot":"","sources":["../src/IEngine.ts"],"names":[],"mappings":""}
|