@purepageio/fetch-engines 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +205 -0
  3. package/dist/FetchEngine.d.ts +46 -0
  4. package/dist/FetchEngine.d.ts.map +1 -0
  5. package/dist/FetchEngine.js +137 -0
  6. package/dist/FetchEngine.js.map +1 -0
  7. package/dist/FetchEngine.test.d.ts +2 -0
  8. package/dist/FetchEngine.test.d.ts.map +1 -0
  9. package/dist/FetchEngine.test.js +44 -0
  10. package/dist/FetchEngine.test.js.map +1 -0
  11. package/dist/HybridEngine.d.ts +15 -0
  12. package/dist/HybridEngine.d.ts.map +1 -0
  13. package/dist/HybridEngine.js +45 -0
  14. package/dist/HybridEngine.js.map +1 -0
  15. package/dist/IEngine.d.ts +22 -0
  16. package/dist/IEngine.d.ts.map +1 -0
  17. package/dist/IEngine.js +2 -0
  18. package/dist/IEngine.js.map +1 -0
  19. package/dist/PlaywrightEngine.d.ts +88 -0
  20. package/dist/PlaywrightEngine.d.ts.map +1 -0
  21. package/dist/PlaywrightEngine.js +484 -0
  22. package/dist/PlaywrightEngine.js.map +1 -0
  23. package/dist/PlaywrightEngine.test.d.ts +2 -0
  24. package/dist/PlaywrightEngine.test.d.ts.map +1 -0
  25. package/dist/PlaywrightEngine.test.js +299 -0
  26. package/dist/PlaywrightEngine.test.js.map +1 -0
  27. package/dist/PuppeteerEngine.d.ts +21 -0
  28. package/dist/PuppeteerEngine.d.ts.map +1 -0
  29. package/dist/PuppeteerEngine.js +412 -0
  30. package/dist/PuppeteerEngine.js.map +1 -0
  31. package/dist/browser/BrowserPool.d.ts +29 -0
  32. package/dist/browser/BrowserPool.d.ts.map +1 -0
  33. package/dist/browser/BrowserPool.js +378 -0
  34. package/dist/browser/BrowserPool.js.map +1 -0
  35. package/dist/browser/PlaywrightBrowserPool.d.ts +78 -0
  36. package/dist/browser/PlaywrightBrowserPool.d.ts.map +1 -0
  37. package/dist/browser/PlaywrightBrowserPool.js +429 -0
  38. package/dist/browser/PlaywrightBrowserPool.js.map +1 -0
  39. package/dist/browser/PlaywrightBrowserPool.test.d.ts +2 -0
  40. package/dist/browser/PlaywrightBrowserPool.test.d.ts.map +1 -0
  41. package/dist/browser/PlaywrightBrowserPool.test.js +422 -0
  42. package/dist/browser/PlaywrightBrowserPool.test.js.map +1 -0
  43. package/dist/errors.d.ts +20 -0
  44. package/dist/errors.d.ts.map +1 -0
  45. package/dist/errors.js +30 -0
  46. package/dist/errors.js.map +1 -0
  47. package/dist/index.d.ts +8 -0
  48. package/dist/index.d.ts.map +1 -0
  49. package/dist/index.js +5 -0
  50. package/dist/index.js.map +1 -0
  51. package/dist/types.d.ts +151 -0
  52. package/dist/types.d.ts.map +1 -0
  53. package/dist/types.js +2 -0
  54. package/dist/types.js.map +1 -0
  55. package/package.json +72 -0
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Purepage
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,205 @@
1
+ # @purepageio/fetch-engines
2
+
3
+ A collection of configurable engines for fetching HTML content using plain `fetch` or Playwright.
4
+
5
+ This package provides robust and customizable ways to retrieve web page content, handling retries, caching, user agents, and optional browser automation via Playwright for complex JavaScript-driven sites.
6
+
7
+ ## Installation
8
+
9
+ ```bash
10
+ pnpm add @purepageio/fetch-engines
11
+ # or with npm
12
+ npm install @purepageio/fetch-engines
13
+ # or with yarn
14
+ yarn add @purepageio/fetch-engines
15
+ ```
16
+
17
+ If you plan to use the `PlaywrightEngine`, you also need to install Playwright's browser binaries:
18
+
19
+ ```bash
20
+ pnpm exec playwright install
21
+ # or
22
+ npx playwright install
23
+ ```
24
+
25
+ ## Engines
26
+
27
+ - **`FetchEngine`**: Uses the standard `fetch` API. Suitable for simple HTML pages or APIs returning HTML. Lightweight and fast.
28
+ - **`PlaywrightEngine`**: Uses Playwright to control a headless browser (Chromium, Firefox, WebKit). Handles JavaScript rendering, complex interactions (if needed), and provides options for stealth and anti-bot detection measures. More resource-intensive but necessary for dynamic websites.
29
+
30
+ ## Basic Usage
31
+
32
+ ### FetchEngine
33
+
34
+ ```typescript
35
+ import { FetchEngine } from "@purepageio/fetch-engines";
36
+
37
+ const engine = new FetchEngine();
38
+
39
+ async function main() {
40
+ try {
41
+ const url = "https://example.com";
42
+ const result = await engine.fetchHTML(url);
43
+ console.log(`Fetched ${result.url}`);
44
+ console.log(`Title: ${result.title}`);
45
+ // console.log(`HTML: ${result.html.substring(0, 200)}...`);
46
+ } catch (error) {
47
+ console.error("Fetch failed:", error);
48
+ }
49
+ }
50
+
51
+ main();
52
+ ```
53
+
54
+ ### PlaywrightEngine
55
+
56
+ ```typescript
57
+ import { PlaywrightEngine } from "@purepageio/fetch-engines";
58
+
59
+ // Configure engine options (optional)
60
+ const engine = new PlaywrightEngine({
61
+ maxRetries: 2, // Number of retry attempts
62
+ useHttpFallback: true, // Try simple HTTP fetch first
63
+ cacheTTL: 5 * 60 * 1000, // Cache results for 5 minutes (in milliseconds)
64
+ });
65
+
66
+ async function main() {
67
+ try {
68
+ const url = "https://quotes.toscrape.com/"; // A site that might benefit from JS rendering
69
+ const result = await engine.fetchHTML(url);
70
+ console.log(`Fetched ${result.url}`);
71
+ console.log(`Title: ${result.title}`);
72
+ // console.log(`HTML: ${result.html.substring(0, 200)}...`);
73
+ } catch (error) {
74
+ console.error("Playwright fetch failed:", error);
75
+ } finally {
76
+ // Important: Clean up browser resources when done
77
+ await engine.cleanup();
78
+ }
79
+ }
80
+
81
+ main();
82
+ ```
83
+
84
+ ## Configuration
85
+
86
+ Engines accept an optional configuration object in their constructor to customize behavior.
87
+
88
+ ### FetchEngine
89
+
90
+ The `FetchEngine` currently has **no configurable options** via its constructor. It issues a single request with the standard `fetch` API, relies on the runtime's default timeout behavior, performs no retries of its own, and sends a fixed set of browser-like headers.
91
+
92
+ ### PlaywrightEngine
93
+
94
+ The `PlaywrightEngine` offers more extensive configuration:
95
+
96
+ **General Options:**
97
+
98
+ - `concurrentPages` (`number`, default: `3`)
99
+ - Maximum number of Playwright pages to process concurrently across all browser instances.
100
+ - `maxRetries` (`number`, default: `3`)
101
+ - Maximum number of retry attempts for a failed fetch operation (excluding initial attempt).
102
+ - `retryDelay` (`number`, default: `5000`)
103
+ - Delay in milliseconds between retry attempts.
104
+ - `cacheTTL` (`number`, default: `900000` (15 minutes))
105
+ - Time-to-live for cached results in milliseconds. Set to `0` to disable the in-memory cache.
106
+ - `useHttpFallback` (`boolean`, default: `true`)
107
+ - If `true`, the engine first attempts a simple, fast HTTP GET request. If this fails or appears to receive a challenge/CAPTCHA page, it then proceeds with a full Playwright browser request.
108
+ - `useHeadedModeFallback` (`boolean`, default: `false`)
109
+ - If `true` and a Playwright request fails (potentially due to bot detection), subsequent requests _to that specific domain_ will automatically use a headed (visible) browser instance, which can sometimes bypass stricter checks. This requires the pool to potentially manage both headless and headed instances.
110
+ - `defaultFastMode` (`boolean`, default: `true`)
111
+ - If `true`, requests initially run in "fast mode", blocking non-essential resources (images, fonts, stylesheets) and skipping human behavior simulation. This can significantly speed up fetches but may break some sites or increase detection risk. This can be overridden per-request via the `fetchHTML` options.
112
+ - `simulateHumanBehavior` (`boolean`, default: `true`)
113
+ - If `true` and the request is _not_ in `fastMode`, the engine attempts basic human-like interactions (e.g., slight delays, mouse movements). _Note: This simulation is currently basic and may not defeat advanced bot detection._
114
+
115
+ **Browser Pool Options:**
116
+
117
+ These options are passed down to configure the underlying `PlaywrightBrowserPool` that manages browser instances.
118
+
119
+ - `maxBrowsers` (`number`, default: `2`)
120
+ - Maximum number of concurrent browser instances (e.g., Chrome processes) the pool will manage.
121
+ - `maxPagesPerContext` (`number`, default: `6`)
122
+ - Maximum number of pages that can be opened within a single browser context (like an isolated browser profile) before the pool prefers using a different context or browser instance. Helps isolate sessions.
123
+ - `maxBrowserAge` (`number`, default: `1200000` (20 minutes))
124
+ - Maximum age in milliseconds a browser instance can live before the pool proactively closes and replaces it. Helps mitigate memory leaks or state issues.
125
+ - `healthCheckInterval` (`number`, default: `60000` (1 minute))
126
+ - How often (in milliseconds) the pool checks the health of its browser instances (e.g., checking connectivity, age).
127
+ - `useHeadedMode` (`boolean`, default: `false`)
128
+ - Forces the _entire_ browser pool to launch browsers in headed (visible) mode instead of the default headless mode. Primarily useful for debugging purposes.
129
+ - `poolBlockedDomains` (`string[]`, default: `[]` - uses pool's internal defaults)
130
+ - List of domain _glob patterns_ (e.g., `*.google-analytics.com`, `*.doubleclick.net`) for requests that the browser should block. An empty array uses the pool's built-in default blocklist (recommended).
131
+ - `poolBlockedResourceTypes` (`string[]`, default: `[]` - uses pool's internal defaults)
132
+ - List of Playwright resource types (e.g., `image`, `stylesheet`, `font`, `media`, `websocket`) to block. Blocking unnecessary resources can speed up page loads. An empty array uses the pool's built-in default blocklist (recommended).
133
+ - `proxy` (`object | undefined`, default: `undefined`)
134
+ - Proxy configuration to be used by the browser instances.
135
+ - `server` (`string`): Proxy URL (e.g., `http://host:port`, `socks5://user:pass@host:port`).
136
+ - `username` (`string`, optional): Proxy username.
137
+ - `password` (`string`, optional): Proxy password.
138
+
139
+ ## Return Value
140
+
141
+ Both `FetchEngine.fetchHTML()` and `PlaywrightEngine.fetchHTML()` return a Promise that resolves to a `FetchResult` object (declared as `HTMLFetchResult` in the package's type definitions) with the following properties:
142
+
143
+ - `html` (`string`): The full HTML content of the fetched page.
144
+ - `title` (`string | null`): The extracted `<title>` tag content, or `null` if no title is found. (`FetchEngine` returns an empty string instead of `null` when the page has no title.)
145
+ - `url` (`string`): The final URL after any redirects.
146
+ - `isFromCache` (`boolean`): `true` if the result was served from the engine's cache, `false` otherwise.
147
+ - `statusCode` (`number | undefined`): The HTTP status code of the final response. This is typically available for `FetchEngine` and the HTTP fallback in `PlaywrightEngine`, but might be `undefined` for some Playwright navigation scenarios if the primary response wasn't directly captured.
148
+ - `error` (`FetchError | Error | undefined`): If an error occurred during the _final_ fetch attempt (after retries), this property will contain the error object. It might be a specific `FetchError` (see Error Handling) or a generic `Error`.
149
+
150
+ ## API Reference
151
+
152
+ ### `engine.fetchHTML(url, options?)`
153
+
154
+ - `url` (`string`): The URL of the page to fetch.
155
+ - `options` (`object`, optional): Per-request options to override engine defaults.
156
+ - For `PlaywrightEngine`, you can override `fastMode` (`boolean`) to force or disable fast mode for this specific request.
157
+ - _(Other per-request options may be added in the future)._
158
+ - **Returns:** `Promise<FetchResult>`
159
+
160
+ Fetches the HTML content for the given URL using the engine's configured strategy (plain fetch or Playwright).
161
+
162
+ ### `engine.cleanup()` (PlaywrightEngine only)
163
+
164
+ - **Returns:** `Promise<void>`
165
+
166
+ Gracefully shuts down all browser instances managed by the `PlaywrightEngine`'s browser pool. **It is crucial to call `await engine.cleanup()` when you are finished using a `PlaywrightEngine` instance** to release system resources.
167
+
168
+ ## Stealth / Anti-Detection (`PlaywrightEngine`)
169
+
170
+ The `PlaywrightEngine` automatically integrates `playwright-extra` and its powerful stealth plugin (`puppeteer-extra-plugin-stealth`). This plugin applies various techniques to make the headless browser controlled by Playwright appear more like a regular human-operated browser, helping to bypass many common bot detection systems.
171
+
172
+ There are **no manual configuration options** for stealth; it is enabled by default when using `PlaywrightEngine`. The previous options (`useStealthMode`, `randomizeFingerprint`, `evasionLevel`) have been removed.
173
+
174
+ While effective, be aware that no stealth technique is foolproof, and sophisticated websites may still detect automated browsing.
175
+
176
+ ## Error Handling
177
+
178
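+ Errors during fetching are typically thrown as instances of `FetchError`, providing more context than standard `Error` objects. Note that `FetchEngine` reports non-OK HTTP responses with `FetchEngineHttpError`, which extends the built-in `Error` directly and exposes the response's `statusCode`.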
179
+
180
+ - `FetchError` properties:
181
+ - `message` (`string`): Description of the error.
182
+ - `code` (`string | undefined`): A specific error code (e.g., `ERR_NAVIGATION_TIMEOUT`, `ERR_HTTP_ERROR`, `ERR_NON_HTML_CONTENT`).
183
+ - `originalError` (`Error | undefined`): The underlying error that caused this fetch error (e.g., a Playwright error object).
184
+
185
+ Common error scenarios include:
186
+
187
+ - Network issues (DNS resolution failure, connection refused).
188
+ - HTTP errors (4xx client errors, 5xx server errors).
189
+ - Non-HTML content type received (for `FetchEngine`).
190
+ - Playwright navigation timeouts.
191
+ - Proxy connection errors.
192
+ - Page crashes within Playwright.
193
+ - Errors thrown by the browser pool (e.g., failure to launch browser).
194
+
195
+ The `FetchResult` object may also contain an `error` property if the final fetch attempt failed after all retries.
196
+
197
+ ## Logging
198
+
199
+ ## Contributing
200
+
201
+ Contributions are welcome! Please open an issue or submit a pull request on the [GitHub repository](https://github.com/purepageio/fetch-engines).
202
+
203
+ ## License
204
+
205
+ MIT
@@ -0,0 +1,46 @@
1
+ import type { HTMLFetchResult, BrowserMetrics } from "./types.js";
2
+ import type { IEngine } from "./IEngine.js";
3
+ /**
4
+ * Error thrown by FetchEngine when an HTTP response has a non-OK status.
+ * Extends the built-in Error and additionally carries the numeric status code.
5
+ */
6
+ export declare class FetchEngineHttpError extends Error {
7
+ readonly statusCode: number; // status code passed to the constructor (FetchEngine passes response.status)
8
+ constructor(message: string, statusCode: number); // message goes to Error; statusCode is stored on the instance
9
+ }
10
+ /**
11
+ * FetchEngine - A lightweight engine for fetching HTML content using the standard `fetch` API.
12
+ *
13
+ * Ideal for fetching content from static websites or APIs where JavaScript execution is not required.
14
+ * It does not support advanced configurations like retries, caching, or proxies directly;
+ * each fetchHTML call issues exactly one request.
15
+ */
16
+ export declare class FetchEngine implements IEngine {
17
+ private readonly headers; // fixed, browser-like request headers sent with every request
18
+ /**
19
+ * Creates an instance of FetchEngine and sets up its fixed request headers.
20
+ * Note: This engine currently does not accept configuration options.
21
+ */
22
+ constructor();
23
+ /**
24
+ * Fetches HTML content from the specified URL using the `fetch` API, following redirects.
25
+ * Logs a console warning (but still resolves) when the page looks like a single-page application.
+ *
26
+ * @param url The URL to fetch.
27
+ * @returns A Promise resolving to an HTMLFetchResult with the HTML, the title ("" when absent),
+ * the final response URL, `isFromCache: false` and the response status code.
28
+ * @throws {FetchEngineHttpError} If the HTTP response status is not ok (e.g., 404, 500).
29
+ * @throws {Error} With message "Not an HTML page" if the Content-Type header does not contain
+ * `text/html`; errors raised by `fetch` itself (e.g., network failures) propagate unchanged.
30
+ */
31
+ fetchHTML(url: string): Promise<HTMLFetchResult>;
32
+ private detectSPA; // heuristic SPA check (near-empty body with scripts, or known framework selectors)
33
+ /**
34
+ * Cleans up resources used by the engine.
35
+ * For FetchEngine, this is a no-op as it doesn't manage persistent resources.
36
+ * @returns A Promise that resolves immediately.
37
+ */
38
+ cleanup(): Promise<void>;
39
+ /**
40
+ * Retrieves metrics for the engine.
41
+ * FetchEngine does not manage browsers, so it returns an empty array.
42
+ * @returns An empty array.
43
+ */
44
+ getMetrics(): BrowserMetrics[];
45
+ }
46
+ //# sourceMappingURL=FetchEngine.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"FetchEngine.d.ts","sourceRoot":"","sources":["../src/FetchEngine.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,eAAe,EAAE,cAAc,EAAE,MAAM,YAAY,CAAC;AAClE,OAAO,KAAK,EAAE,OAAO,EAAE,MAAM,cAAc,CAAC;AAG5C;;GAEG;AACH,qBAAa,oBAAqB,SAAQ,KAAK;IAC7C,SAAgB,UAAU,EAAE,MAAM,CAAC;gBAEvB,OAAO,EAAE,MAAM,EAAE,UAAU,EAAE,MAAM;CAShD;AAED;;;;;GAKG;AACH,qBAAa,WAAY,YAAW,OAAO;IACzC,OAAO,CAAC,QAAQ,CAAC,OAAO,CAAyB;IAEjD;;;OAGG;;IAeH;;;;;;;OAOG;IACG,SAAS,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC,eAAe,CAAC;IAgDtD,OAAO,CAAC,SAAS;IA+BjB;;;;OAIG;IACG,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC;IAK9B;;;;OAIG;IACH,UAAU,IAAI,cAAc,EAAE;CAI/B"}
@@ -0,0 +1,137 @@
1
+ import { JSDOM } from "jsdom";
2
+ /**
3
+ * Custom error class for HTTP errors from FetchEngine.
4
+ */
5
+ export class FetchEngineHttpError extends Error {
6
+ statusCode;
7
+ constructor(message, statusCode) {
8
+ super(message);
9
+ this.name = "FetchEngineHttpError";
10
+ this.statusCode = statusCode;
11
+ // Maintain proper stack trace (requires target ES2015+ in tsconfig)
12
+ if (Error.captureStackTrace) {
13
+ Error.captureStackTrace(this, FetchEngineHttpError);
14
+ }
15
+ }
16
+ }
17
+ /**
18
+ * FetchEngine - A lightweight engine for fetching HTML content using the standard `fetch` API.
19
+ *
20
+ * Ideal for fetching content from static websites or APIs where JavaScript execution is not required.
21
+ * It does not support advanced configurations like retries, caching, or proxies directly.
22
+ */
23
+ export class FetchEngine {
24
+ headers;
25
+ /**
26
+ * Creates an instance of FetchEngine.
27
+ * Note: This engine currently does not accept configuration options.
28
+ */
29
+ constructor() {
30
+ this.headers = {
31
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
32
+ Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
33
+ "Accept-Language": "en-US,en;q=0.5",
34
+ "Upgrade-Insecure-Requests": "1",
35
+ "Sec-Fetch-Dest": "document",
36
+ "Sec-Fetch-Mode": "navigate",
37
+ "Sec-Fetch-Site": "none",
38
+ "Sec-Fetch-User": "?1",
39
+ };
40
+ }
41
+ /**
42
+ * Fetches HTML content from the specified URL using the `fetch` API.
43
+ *
44
+ * @param url The URL to fetch.
45
+ * @returns A Promise resolving to an HTMLFetchResult object.
46
+ * @throws {FetchEngineHttpError} If the HTTP response status is not ok (e.g., 404, 500).
47
+ * @throws {Error} If the content type is not HTML or for other network errors.
48
+ */
49
+ async fetchHTML(url) {
50
+ try {
51
+ const response = await fetch(url, {
52
+ headers: this.headers,
53
+ redirect: "follow",
54
+ });
55
+ if (!response.ok) {
56
+ // Throw the custom error with status code
57
+ throw new FetchEngineHttpError(`HTTP error! status: ${response.status}`, response.status);
58
+ }
59
+ const contentType = response.headers.get("content-type") || "";
60
+ if (!contentType.includes("text/html")) {
61
+ throw new Error("Not an HTML page");
62
+ }
63
+ const html = await response.text();
64
+ // Use JSDOM to parse HTML and extract title
65
+ const dom = new JSDOM(html);
66
+ const title = dom.window.document.title || "";
67
+ // Check for potential SPA markers
68
+ const isSPA = this.detectSPA(dom.window.document);
69
+ if (isSPA) {
70
+ // Removed throwing error here, as the calling code should decide how to handle this.
71
+ // Consider adding a flag to the result instead.
72
+ console.warn(`SPA detected for ${url}, content might be incomplete without JavaScript rendering.`);
73
+ // Example: return { html, title, url: response.url, isSPA: true };
74
+ }
75
+ return {
76
+ html,
77
+ title,
78
+ url: response.url,
79
+ isFromCache: false, // FetchEngine doesn't cache
80
+ statusCode: response.status,
81
+ error: undefined,
82
+ };
83
+ }
84
+ catch (error) {
85
+ // console.error(`FetchEngine failed for ${url}:`, error); // Optional: Keep logging if desired
86
+ // Re-throw the original error to preserve its type (e.g., FetchEngineHttpError)
87
+ // Ensure the result conforms to HTMLFetchResult even on error (for consistency? No, spec says throw)
88
+ throw error;
89
+ }
90
+ }
91
+ detectSPA(document) {
92
+ // Check for common SPA frameworks and patterns
93
+ const spaMarkers = [
94
+ // React
95
+ "[data-reactroot]",
96
+ "#root",
97
+ "#app",
98
+ // Vue
99
+ "[data-v-app]",
100
+ "#app[data-v-]",
101
+ // Angular
102
+ "[ng-version]",
103
+ "[ng-app]",
104
+ // Common SPA patterns
105
+ 'script[type="application/json+ld"]', // Less reliable marker
106
+ 'meta[name="fragment"]',
107
+ ];
108
+ // Check if the body is nearly empty but has JS (More reliable)
109
+ const bodyContent = document.body?.textContent?.trim() || "";
110
+ const hasScripts = document.scripts.length > 0;
111
+ if (bodyContent.length < 150 && hasScripts) {
112
+ // Increased threshold slightly
113
+ return true;
114
+ }
115
+ // Check for SPA markers (Less reliable)
116
+ return spaMarkers.some((selector) => document.querySelector(selector) !== null);
117
+ }
118
+ /**
119
+ * Cleans up resources used by the engine.
120
+ * For FetchEngine, this is a no-op as it doesn't manage persistent resources.
121
+ * @returns A Promise that resolves when cleanup is complete.
122
+ */
123
+ async cleanup() {
124
+ // No resources to clean up for fetch engine
125
+ return Promise.resolve(); // Explicitly return resolved promise
126
+ }
127
+ /**
128
+ * Retrieves metrics for the engine.
129
+ * FetchEngine does not manage browsers, so it returns an empty array.
130
+ * @returns An empty array.
131
+ */
132
+ getMetrics() {
133
+ // Fetch engine doesn't maintain browser pool metrics
134
+ return [];
135
+ }
136
+ }
137
+ //# sourceMappingURL=FetchEngine.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"FetchEngine.js","sourceRoot":"","sources":["../src/FetchEngine.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,KAAK,EAAE,MAAM,OAAO,CAAC;AAE9B;;GAEG;AACH,MAAM,OAAO,oBAAqB,SAAQ,KAAK;IAC7B,UAAU,CAAS;IAEnC,YAAY,OAAe,EAAE,UAAkB;QAC7C,KAAK,CAAC,OAAO,CAAC,CAAC;QACf,IAAI,CAAC,IAAI,GAAG,sBAAsB,CAAC;QACnC,IAAI,CAAC,UAAU,GAAG,UAAU,CAAC;QAC7B,oEAAoE;QACpE,IAAI,KAAK,CAAC,iBAAiB,EAAE,CAAC;YAC5B,KAAK,CAAC,iBAAiB,CAAC,IAAI,EAAE,oBAAoB,CAAC,CAAC;QACtD,CAAC;IACH,CAAC;CACF;AAED;;;;;GAKG;AACH,MAAM,OAAO,WAAW;IACL,OAAO,CAAyB;IAEjD;;;OAGG;IACH;QACE,IAAI,CAAC,OAAO,GAAG;YACb,YAAY,EACV,iHAAiH;YACnH,MAAM,EAAE,4EAA4E;YACpF,iBAAiB,EAAE,gBAAgB;YACnC,2BAA2B,EAAE,GAAG;YAChC,gBAAgB,EAAE,UAAU;YAC5B,gBAAgB,EAAE,UAAU;YAC5B,gBAAgB,EAAE,MAAM;YACxB,gBAAgB,EAAE,IAAI;SACvB,CAAC;IACJ,CAAC;IAED;;;;;;;OAOG;IACH,KAAK,CAAC,SAAS,CAAC,GAAW;QACzB,IAAI,CAAC;YACH,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,GAAG,EAAE;gBAChC,OAAO,EAAE,IAAI,CAAC,OAAO;gBACrB,QAAQ,EAAE,QAAQ;aACnB,CAAC,CAAC;YAEH,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;gBACjB,0CAA0C;gBAC1C,MAAM,IAAI,oBAAoB,CAAC,uBAAuB,QAAQ,CAAC,MAAM,EAAE,EAAE,QAAQ,CAAC,MAAM,CAAC,CAAC;YAC5F,CAAC;YAED,MAAM,WAAW,GAAG,QAAQ,CAAC,OAAO,CAAC,GAAG,CAAC,cAAc,CAAC,IAAI,EAAE,CAAC;YAC/D,IAAI,CAAC,WAAW,CAAC,QAAQ,CAAC,WAAW,CAAC,EAAE,CAAC;gBACvC,MAAM,IAAI,KAAK,CAAC,kBAAkB,CAAC,CAAC;YACtC,CAAC;YAED,MAAM,IAAI,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;YAEnC,4CAA4C;YAC5C,MAAM,GAAG,GAAG,IAAI,KAAK,CAAC,IAAI,CAAC,CAAC;YAC5B,MAAM,KAAK,GAAG,GAAG,CAAC,MAAM,CAAC,QAAQ,CAAC,KAAK,IAAI,EAAE,CAAC;YAE9C,kCAAkC;YAClC,MAAM,KAAK,GAAG,IAAI,CAAC,SAAS,CAAC,GAAG,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC;YAClD,IAAI,KAAK,EAAE,CAAC;gBACV,qFAAqF;gBACrF,gDAAgD;gBAChD,OAAO,CAAC,IAAI,CAAC,oBAAoB,GAAG,6DAA6D,CAAC,CAAC;gBACnG,mEAAmE;YACrE,CAAC;YAED,OAAO;gBACL,IAAI;gBACJ,KAAK;gBACL,GAAG,EAAE,QAAQ,CAAC,GAAG;gBACjB,WAAW,EAAE,KAAK,EAAE,4BAA4B;gBAChD,UAAU,EAAE,QAAQ,CAAC,MAAM;gBAC3B,KAAK,EAAE,SAAS;aACjB,CAAC;QACJ,CAAC;QAAC,OAAO,KAAU,EAAE,CAAC;YACpB,+FAA+F;YAC/F,gFAAgF;YAChF,qGAAqG;YACrG,MAAM,KAAK,CAAC;QACd,CAAC;IACH,CAAC;IAEO,SAAS,CAAC,QAAkB;QACl
C,+CAA+C;QAC/C,MAAM,UAAU,GAAG;YACjB,QAAQ;YACR,kBAAkB;YAClB,OAAO;YACP,MAAM;YACN,MAAM;YACN,cAAc;YACd,eAAe;YACf,UAAU;YACV,cAAc;YACd,UAAU;YACV,sBAAsB;YACtB,oCAAoC,EAAE,uBAAuB;YAC7D,uBAAuB;SACxB,CAAC;QAEF,+DAA+D;QAC/D,MAAM,WAAW,GAAG,QAAQ,CAAC,IAAI,EAAE,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;QAC7D,MAAM,UAAU,GAAG,QAAQ,CAAC,OAAO,CAAC,MAAM,GAAG,CAAC,CAAC;QAE/C,IAAI,WAAW,CAAC,MAAM,GAAG,GAAG,IAAI,UAAU,EAAE,CAAC;YAC3C,+BAA+B;YAC/B,OAAO,IAAI,CAAC;QACd,CAAC;QAED,wCAAwC;QACxC,OAAO,UAAU,CAAC,IAAI,CAAC,CAAC,QAAQ,EAAE,EAAE,CAAC,QAAQ,CAAC,aAAa,CAAC,QAAQ,CAAC,KAAK,IAAI,CAAC,CAAC;IAClF,CAAC;IAED;;;;OAIG;IACH,KAAK,CAAC,OAAO;QACX,4CAA4C;QAC5C,OAAO,OAAO,CAAC,OAAO,EAAE,CAAC,CAAC,qCAAqC;IACjE,CAAC;IAED;;;;OAIG;IACH,UAAU;QACR,qDAAqD;QACrD,OAAO,EAAE,CAAC;IACZ,CAAC;CACF"}
@@ -0,0 +1,2 @@
1
+ export {};
2
+ //# sourceMappingURL=FetchEngine.test.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"FetchEngine.test.d.ts","sourceRoot":"","sources":["../src/FetchEngine.test.ts"],"names":[],"mappings":""}
@@ -0,0 +1,44 @@
1
+ import { describe, it, expect } from "vitest";
2
+ import { FetchEngine } from "./FetchEngine.js";
3
// Integration tests for FetchEngine. They talk to real hosts (example.com, httpbin.org),
// so they need network access and a runtime that provides `fetch`.
describe("FetchEngine", () => {
    it("should fetch HTML and extract title from a static page", async () => {
        const engine = new FetchEngine();
        const requestedUrl = "http://example.com";
        const finalUrl = "http://example.com/"; // Expect trailing slash
        const verifyStaticPage = async () => {
            const page = await engine.fetchHTML(requestedUrl);
            expect(page).toBeDefined();
            expect(page.url).toBe(finalUrl);
            expect(page.title).toBe("Example Domain");
            expect(page.html).toContain("<title>Example Domain</title>");
            expect(page.html).toContain("<h1>Example Domain</h1>");
        };
        // Any failure (network or assertion) is logged with a hint, then rethrown so
        // the test still fails.
        await verifyStaticPage().catch((failure) => {
            console.warn("FetchEngine test failed, potentially due to network issues or missing fetch API:", failure);
            throw failure;
        });
    });
    it("should throw an error for non-HTML content", async () => {
        // A JSON endpoint: the response is OK but its content type is not HTML.
        const jsonUrl = "https://httpbin.org/json";
        const pending = new FetchEngine().fetchHTML(jsonUrl);
        await expect(pending).rejects.toThrow("Not an HTML page");
    });
    it("should throw an error for non-existent domains", async () => {
        const unresolvableUrl = "http://domain-that-does-not-exist-fdsahjkl.xyz";
        const pending = new FetchEngine().fetchHTML(unresolvableUrl);
        // The exact message depends on the runtime, so only the rejection is checked.
        await expect(pending).rejects.toThrow();
    });
    it("should handle http errors", async () => {
        const notFoundUrl = "https://httpbin.org/status/404"; // URL that returns 404
        const pending = new FetchEngine().fetchHTML(notFoundUrl);
        await expect(pending).rejects.toThrow(/HTTP error! status: 404/);
    });
    // Add more tests: SPA detection warning, etc.
});
44
+ //# sourceMappingURL=FetchEngine.test.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"FetchEngine.test.js","sourceRoot":"","sources":["../src/FetchEngine.test.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,EAAE,EAAE,MAAM,EAAE,MAAM,QAAQ,CAAC;AAC9C,OAAO,EAAE,WAAW,EAAE,MAAM,kBAAkB,CAAC;AAE/C,QAAQ,CAAC,aAAa,EAAE,GAAG,EAAE;IAC3B,EAAE,CAAC,wDAAwD,EAAE,KAAK,IAAI,EAAE;QACtE,MAAM,MAAM,GAAG,IAAI,WAAW,EAAE,CAAC;QACjC,MAAM,GAAG,GAAG,oBAAoB,CAAC;QACjC,MAAM,WAAW,GAAG,qBAAqB,CAAC,CAAC,wBAAwB;QAEnE,IAAI,CAAC;YACH,MAAM,MAAM,GAAG,MAAM,MAAM,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC;YAE3C,MAAM,CAAC,MAAM,CAAC,CAAC,WAAW,EAAE,CAAC;YAC7B,MAAM,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC,CAAC,kBAAkB;YACxD,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,gBAAgB,CAAC,CAAC;YAC5C,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,SAAS,CAAC,+BAA+B,CAAC,CAAC;YAC/D,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,SAAS,CAAC,yBAAyB,CAAC,CAAC;QAC3D,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,iFAAiF;YACjF,wDAAwD;YACxD,OAAO,CAAC,IAAI,CAAC,kFAAkF,EAAE,KAAK,CAAC,CAAC;YACxG,0DAA0D;YAC1D,MAAM,KAAK,CAAC;QACd,CAAC;IACH,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,4CAA4C,EAAE,KAAK,IAAI,EAAE;QAC1D,MAAM,MAAM,GAAG,IAAI,WAAW,EAAE,CAAC;QACjC,gFAAgF;QAChF,MAAM,GAAG,GAAG,0BAA0B,CAAC;QAEvC,wCAAwC;QACxC,MAAM,MAAM,CAAC,MAAM,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,OAAO,CAAC,OAAO,CAAC,kBAAkB,CAAC,CAAC;IAC1E,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,gDAAgD,EAAE,KAAK,IAAI,EAAE;QAC9D,MAAM,MAAM,GAAG,IAAI,WAAW,EAAE,CAAC;QACjC,MAAM,GAAG,GAAG,gDAAgD,CAAC;QAE7D,mEAAmE;QACnE,MAAM,MAAM,CAAC,MAAM,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,OAAO,CAAC,OAAO,EAAE,CAAC;IACxD,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,2BAA2B,EAAE,KAAK,IAAI,EAAE;QACzC,MAAM,MAAM,GAAG,IAAI,WAAW,EAAE,CAAC;QACjC,MAAM,GAAG,GAAG,gCAAgC,CAAC,CAAC,uBAAuB;QAErE,MAAM,MAAM,CAAC,MAAM,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,OAAO,CAAC,OAAO,CAAC,yBAAyB,CAAC,CAAC;IACjF,CAAC,CAAC,CAAC;IAEH,8CAA8C;AAChD,CAAC,CAAC,CAAC"}
@@ -0,0 +1,15 @@
1
+ import type { HTMLFetchResult, BrowserMetrics, PlaywrightEngineConfig } from "./types.js";
2
+ import { IEngine } from "./IEngine.js";
3
+ /**
4
+ * HybridEngine - Attempts fetching with FetchEngine first for speed,
5
+ * then falls back to PlaywrightEngine whenever the FetchEngine attempt throws
+ * (any error: HTTP error status, non-HTML content, network failure, ...).
6
+ */
7
+ export declare class HybridEngine implements IEngine {
8
+ private readonly fetchEngine; // tried first on every fetchHTML call
9
+ private readonly playwrightEngine; // fallback engine; also the source of getMetrics()
10
+ constructor(playwrightConfig?: PlaywrightEngineConfig); // config is forwarded to the PlaywrightEngine; defaults to {}
11
+ fetchHTML(url: string): Promise<HTMLFetchResult>; // rejects with the PlaywrightEngine error when both engines fail
12
+ cleanup(): Promise<void>; // cleans up both engines; rejections from either cleanup are not propagated
13
+ getMetrics(): BrowserMetrics[]; // delegates to the PlaywrightEngine
14
+ }
15
+ //# sourceMappingURL=HybridEngine.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"HybridEngine.d.ts","sourceRoot":"","sources":["../src/HybridEngine.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EACV,eAAe,EACf,cAAc,EACd,sBAAsB,EACvB,MAAM,YAAY,CAAC;AACpB,OAAO,EAAE,OAAO,EAAE,MAAM,cAAc,CAAC;AAEvC;;;GAGG;AACH,qBAAa,YAAa,YAAW,OAAO;IAC1C,OAAO,CAAC,QAAQ,CAAC,WAAW,CAAc;IAC1C,OAAO,CAAC,QAAQ,CAAC,gBAAgB,CAAmB;gBAExC,gBAAgB,GAAE,sBAA2B;IAKnD,SAAS,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC,eAAe,CAAC;IAkBhD,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC;IAQ9B,UAAU,IAAI,cAAc,EAAE;CAI/B"}
@@ -0,0 +1,45 @@
1
+ import { FetchEngine } from "./FetchEngine.js";
2
+ import { PlaywrightEngine } from "./PlaywrightEngine.js";
3
+ /**
4
+ * HybridEngine - Attempts fetching with FetchEngine first for speed,
5
+ * then falls back to PlaywrightEngine for complex sites or specific errors.
6
+ */
7
+ export class HybridEngine {
8
+ fetchEngine;
9
+ playwrightEngine;
10
+ constructor(playwrightConfig = {}) {
11
+ this.fetchEngine = new FetchEngine();
12
+ this.playwrightEngine = new PlaywrightEngine(playwrightConfig);
13
+ }
14
+ async fetchHTML(url) {
15
+ try {
16
+ // Attempt 1: Use the fast FetchEngine
17
+ const fetchResult = await this.fetchEngine.fetchHTML(url);
18
+ return fetchResult;
19
+ }
20
+ catch (_fetchError) {
21
+ // Prefixed unused error
22
+ // If FetchEngine fails (e.g., 403, network error, non-html), try Playwright
23
+ try {
24
+ const playwrightResult = await this.playwrightEngine.fetchHTML(url);
25
+ return playwrightResult;
26
+ }
27
+ catch (playwrightError) {
28
+ // If Playwright also fails, throw its error (potentially more informative)
29
+ throw playwrightError;
30
+ }
31
+ }
32
+ }
33
+ async cleanup() {
34
+ // Cleanup both engines concurrently
35
+ await Promise.allSettled([
36
+ this.fetchEngine.cleanup(),
37
+ this.playwrightEngine.cleanup(),
38
+ ]);
39
+ }
40
+ getMetrics() {
41
+ // FetchEngine doesn't produce metrics, only PlaywrightEngine does
42
+ return this.playwrightEngine.getMetrics();
43
+ }
44
+ }
45
+ //# sourceMappingURL=HybridEngine.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"HybridEngine.js","sourceRoot":"","sources":["../src/HybridEngine.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,WAAW,EAAE,MAAM,kBAAkB,CAAC;AAC/C,OAAO,EAAE,gBAAgB,EAAE,MAAM,uBAAuB,CAAC;AAQzD;;;GAGG;AACH,MAAM,OAAO,YAAY;IACN,WAAW,CAAc;IACzB,gBAAgB,CAAmB;IAEpD,YAAY,mBAA2C,EAAE;QACvD,IAAI,CAAC,WAAW,GAAG,IAAI,WAAW,EAAE,CAAC;QACrC,IAAI,CAAC,gBAAgB,GAAG,IAAI,gBAAgB,CAAC,gBAAgB,CAAC,CAAC;IACjE,CAAC;IAED,KAAK,CAAC,SAAS,CAAC,GAAW;QACzB,IAAI,CAAC;YACH,sCAAsC;YACtC,MAAM,WAAW,GAAG,MAAM,IAAI,CAAC,WAAW,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC;YAC1D,OAAO,WAAW,CAAC;QACrB,CAAC;QAAC,OAAO,WAAgB,EAAE,CAAC;YAC1B,wBAAwB;YACxB,4EAA4E;YAC5E,IAAI,CAAC;gBACH,MAAM,gBAAgB,GAAG,MAAM,IAAI,CAAC,gBAAgB,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC;gBACpE,OAAO,gBAAgB,CAAC;YAC1B,CAAC;YAAC,OAAO,eAAe,EAAE,CAAC;gBACzB,2EAA2E;gBAC3E,MAAM,eAAe,CAAC;YACxB,CAAC;QACH,CAAC;IACH,CAAC;IAED,KAAK,CAAC,OAAO;QACX,oCAAoC;QACpC,MAAM,OAAO,CAAC,UAAU,CAAC;YACvB,IAAI,CAAC,WAAW,CAAC,OAAO,EAAE;YAC1B,IAAI,CAAC,gBAAgB,CAAC,OAAO,EAAE;SAChC,CAAC,CAAC;IACL,CAAC;IAED,UAAU;QACR,kEAAkE;QAClE,OAAO,IAAI,CAAC,gBAAgB,CAAC,UAAU,EAAE,CAAC;IAC5C,CAAC;CACF"}
@@ -0,0 +1,22 @@
1
+ import type { HTMLFetchResult, BrowserMetrics } from "./types.js";
2
+ /**
3
+ * Common contract for engines that can fetch HTML content from URLs.
+ * Implemented by FetchEngine and HybridEngine.
4
+ */
5
+ export interface IEngine {
6
+ /**
7
+ * Fetches HTML content from a URL.
8
+ * @param url The URL to fetch
9
+ * @returns A promise that resolves to an HTMLFetchResult; the promise rejects when the fetch fails
10
+ */
11
+ fetchHTML(url: string): Promise<HTMLFetchResult>;
12
+ /**
13
+ * Cleans up resources used by the engine.
+ * @returns A promise that resolves once cleanup has finished
14
+ */
15
+ cleanup(): Promise<void>;
16
+ /**
17
+ * Gets browser metrics for the engine.
18
+ * @returns An array of BrowserMetrics (empty for engines that manage no browsers, such as FetchEngine)
19
+ */
20
+ getMetrics(): BrowserMetrics[];
21
+ }
22
+ //# sourceMappingURL=IEngine.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"IEngine.d.ts","sourceRoot":"","sources":["../src/IEngine.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,eAAe,EAAE,cAAc,EAAE,MAAM,YAAY,CAAC;AAElE;;GAEG;AACH,MAAM,WAAW,OAAO;IACtB;;;;OAIG;IACH,SAAS,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC,eAAe,CAAC,CAAC;IAEjD;;OAEG;IACH,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC,CAAC;IAEzB;;;OAGG;IACH,UAAU,IAAI,cAAc,EAAE,CAAC;CAChC"}
@@ -0,0 +1,2 @@
1
+ export {};
2
+ //# sourceMappingURL=IEngine.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"IEngine.js","sourceRoot":"","sources":["../src/IEngine.ts"],"names":[],"mappings":""}