@purepageio/fetch-engines 0.2.4 → 0.2.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. package/dist/index.cjs +1657 -0
  2. package/dist/index.cjs.map +1 -0
  3. package/dist/index.d.cts +323 -0
  4. package/dist/index.d.ts +323 -8
  5. package/dist/index.js +1617 -4
  6. package/dist/index.js.map +1 -1
  7. package/package.json +14 -5
  8. package/dist/FetchEngine.d.ts +0 -47
  9. package/dist/FetchEngine.d.ts.map +0 -1
  10. package/dist/FetchEngine.js +0 -114
  11. package/dist/FetchEngine.js.map +0 -1
  12. package/dist/FetchEngine.test.d.ts +0 -2
  13. package/dist/FetchEngine.test.d.ts.map +0 -1
  14. package/dist/FetchEngine.test.js +0 -44
  15. package/dist/FetchEngine.test.js.map +0 -1
  16. package/dist/HybridEngine.d.ts +0 -21
  17. package/dist/HybridEngine.d.ts.map +0 -1
  18. package/dist/HybridEngine.js +0 -62
  19. package/dist/HybridEngine.js.map +0 -1
  20. package/dist/IEngine.d.ts +0 -22
  21. package/dist/IEngine.d.ts.map +0 -1
  22. package/dist/IEngine.js +0 -2
  23. package/dist/IEngine.js.map +0 -1
  24. package/dist/PlaywrightEngine.d.ts +0 -90
  25. package/dist/PlaywrightEngine.d.ts.map +0 -1
  26. package/dist/PlaywrightEngine.js +0 -505
  27. package/dist/PlaywrightEngine.js.map +0 -1
  28. package/dist/PlaywrightEngine.test.d.ts +0 -2
  29. package/dist/PlaywrightEngine.test.d.ts.map +0 -1
  30. package/dist/PlaywrightEngine.test.js +0 -207
  31. package/dist/PlaywrightEngine.test.js.map +0 -1
  32. package/dist/PuppeteerEngine.d.ts +0 -21
  33. package/dist/PuppeteerEngine.d.ts.map +0 -1
  34. package/dist/PuppeteerEngine.js +0 -412
  35. package/dist/PuppeteerEngine.js.map +0 -1
  36. package/dist/browser/BrowserPool.d.ts +0 -29
  37. package/dist/browser/BrowserPool.d.ts.map +0 -1
  38. package/dist/browser/BrowserPool.js +0 -378
  39. package/dist/browser/BrowserPool.js.map +0 -1
  40. package/dist/browser/PlaywrightBrowserPool.d.ts +0 -78
  41. package/dist/browser/PlaywrightBrowserPool.d.ts.map +0 -1
  42. package/dist/browser/PlaywrightBrowserPool.js +0 -445
  43. package/dist/browser/PlaywrightBrowserPool.js.map +0 -1
  44. package/dist/browser/PlaywrightBrowserPool.test.d.ts +0 -2
  45. package/dist/browser/PlaywrightBrowserPool.test.d.ts.map +0 -1
  46. package/dist/browser/PlaywrightBrowserPool.test.js +0 -422
  47. package/dist/browser/PlaywrightBrowserPool.test.js.map +0 -1
  48. package/dist/errors.d.ts +0 -20
  49. package/dist/errors.d.ts.map +0 -1
  50. package/dist/errors.js +0 -30
  51. package/dist/errors.js.map +0 -1
  52. package/dist/index.d.ts.map +0 -1
  53. package/dist/types.d.ts +0 -167
  54. package/dist/types.d.ts.map +0 -1
  55. package/dist/types.js +0 -2
  56. package/dist/types.js.map +0 -1
  57. package/dist/utils/markdown-converter.d.ts +0 -31
  58. package/dist/utils/markdown-converter.d.ts.map +0 -1
  59. package/dist/utils/markdown-converter.js +0 -796
  60. package/dist/utils/markdown-converter.js.map +0 -1
package/dist/index.d.ts CHANGED
@@ -1,8 +1,323 @@
1
- import type { IEngine } from "./IEngine.js";
2
- import { FetchEngine } from "./FetchEngine.js";
3
- import { PlaywrightEngine } from "./PlaywrightEngine.js";
4
- import type { HTMLFetchResult, BrowserMetrics } from "./types.js";
5
- export type { IEngine, HTMLFetchResult, BrowserMetrics };
6
- export { FetchEngine, PlaywrightEngine };
7
- export * from "./HybridEngine.js";
8
- //# sourceMappingURL=index.d.ts.map
1
+ /**
2
+ * Defines the structure for the result of fetching HTML content.
3
+ */
4
+ interface HTMLFetchResult {
5
+ /** The fetched HTML content OR the converted Markdown content. */
6
+ content: string;
7
+ /** Indicates the type of content in the 'content' field. */
8
+ contentType: "html" | "markdown";
9
+ /** The extracted title of the page, if available. */
10
+ title: string | null;
11
+ /** The final URL after any redirects. */
12
+ url: string;
13
+ /** Indicates if the result came from the cache. */
14
+ isFromCache: boolean;
15
+ /** The HTTP status code of the final response. */
16
+ statusCode: number | undefined;
17
+ /** Any error encountered during the fetch process. */
18
+ error: Error | undefined;
19
+ }
20
+ /**
21
+ * Metrics related to browser pool performance and status.
22
+ */
23
+ interface BrowserMetrics {
24
+ id: string;
25
+ engine?: "playwright" | string;
26
+ pagesCreated: number;
27
+ activePages: number;
28
+ lastUsed: Date;
29
+ errors: number;
30
+ totalRequests?: number;
31
+ avgResponseTime?: number;
32
+ createdAt: Date;
33
+ isHealthy: boolean;
34
+ }
35
+ /**
36
+ * Configuration options for the PlaywrightEngine.
37
+ */
38
+ interface PlaywrightEngineConfig {
39
+ /**
40
+ * Maximum number of Playwright pages to process concurrently.
41
+ * @default 3
42
+ */
43
+ concurrentPages?: number;
44
+ /**
45
+ * Maximum number of retry attempts for a failed fetch operation (excluding initial attempt).
46
+ * @default 3
47
+ */
48
+ maxRetries?: number;
49
+ /**
50
+ * Delay in milliseconds between retry attempts.
51
+ * @default 5000
52
+ */
53
+ retryDelay?: number;
54
+ /**
55
+ * Time-to-live for cached results in milliseconds. Set to 0 to disable.
56
+ * @default 900000 (15 minutes)
57
+ */
58
+ cacheTTL?: number;
59
+ /**
60
+ * If true, attempts a fast HTTP GET first before using Playwright.
61
+ * @default true
62
+ */
63
+ useHttpFallback?: boolean;
64
+ /**
65
+ * If true, automatically retries failed requests for a domain in headed mode.
66
+ * @default false
67
+ */
68
+ useHeadedModeFallback?: boolean;
69
+ /**
70
+ * If true, requests initially block non-essential resources and skip human simulation.
71
+ * Can be overridden per-request via fetchHTML options.
72
+ * @default true
73
+ */
74
+ defaultFastMode?: boolean;
75
+ /**
76
+ * If true (and not in fastMode), attempts basic human-like interactions.
77
+ * @default true
78
+ */
79
+ simulateHumanBehavior?: boolean;
80
+ /**
81
+ * Maximum number of concurrent browser instances the pool manages.
82
+ * Passed to PlaywrightBrowserPool.
83
+ * @default 2
84
+ */
85
+ maxBrowsers?: number;
86
+ /**
87
+ * Maximum number of pages per browser context before recycling.
88
+ * Passed to PlaywrightBrowserPool.
89
+ * @default 6
90
+ */
91
+ maxPagesPerContext?: number;
92
+ /**
93
+ * Maximum age in ms a browser instance lives before recycling.
94
+ * Passed to PlaywrightBrowserPool.
95
+ * @default 1200000 (20 minutes)
96
+ */
97
+ maxBrowserAge?: number;
98
+ /**
99
+ * How often (in ms) the pool checks browser health.
100
+ * Passed to PlaywrightBrowserPool.
101
+ * @default 60000 (1 minute)
102
+ */
103
+ healthCheckInterval?: number;
104
+ /**
105
+ * List of domain glob patterns to block requests to. Overrides pool default.
106
+ * Passed to PlaywrightBrowserPool.
107
+ * @default [] (uses pool's defaults)
108
+ */
109
+ poolBlockedDomains?: string[];
110
+ /**
111
+ * List of Playwright resource types (e.g., 'image', 'font') to block. Overrides pool default.
112
+ * Passed to PlaywrightBrowserPool.
113
+ * @default [] (uses pool's defaults)
114
+ */
115
+ poolBlockedResourceTypes?: string[];
116
+ /**
117
+ * Proxy configuration for browser instances.
118
+ * Passed to PlaywrightBrowserPool.
119
+ * @default undefined
120
+ */
121
+ proxy?: {
122
+ /** Proxy server URL (e.g., "http://host:port", "socks5://user:pass@host:port"). */
123
+ server: string;
124
+ /** Optional proxy username. */
125
+ username?: string;
126
+ /** Optional proxy password. */
127
+ password?: string;
128
+ };
129
+ /**
130
+ * Forces the entire pool to launch browsers in headed (visible) mode.
131
+ * Passed to PlaywrightBrowserPool.
132
+ * @default false
133
+ */
134
+ useHeadedMode?: boolean;
135
+ /**
136
+ * If true, the fetched HTML content will be converted to Markdown.
137
+ * @default false
138
+ */
139
+ markdown?: boolean;
140
+ }
141
+ /**
142
+ * Options that can be passed per-request to engine.fetchHTML().
143
+ */
144
+ interface FetchOptions {
145
+ /** Overrides the engine's defaultFastMode for this specific request. (Playwright/Hybrid only) */
146
+ fastMode?: boolean;
147
+ /** Overrides the engine's markdown setting for this specific request. (Playwright/Hybrid only) */
148
+ markdown?: boolean;
149
+ }
150
+ /**
151
+ * Configuration options specifically for the FetchEngine.
152
+ */
153
+ interface FetchEngineOptions {
154
+ /** If true, convert the fetched HTML to Markdown. Default: false */
155
+ markdown?: boolean;
156
+ }
157
+
158
+ /**
159
+ * Interface for browser engines that can fetch HTML content from URLs
160
+ */
161
+ interface IEngine {
162
+ /**
163
+ * Fetches HTML content from a URL
164
+ * @param url The URL to fetch
165
+ * @returns A promise that resolves to an HTMLFetchResult
166
+ */
167
+ fetchHTML(url: string): Promise<HTMLFetchResult>;
168
+ /**
169
+ * Cleans up resources used by the engine
170
+ */
171
+ cleanup(): Promise<void>;
172
+ /**
173
+ * Gets metrics about the engine's performance
174
+ * @returns An array of BrowserMetrics
175
+ */
176
+ getMetrics(): BrowserMetrics[];
177
+ }
178
+
179
+ /**
180
+ * FetchEngine - A lightweight engine for fetching HTML content using the standard `fetch` API.
181
+ *
182
+ * Ideal for fetching content from static websites or APIs where JavaScript execution is not required.
183
+ * It does not support advanced configurations like retries, caching, or proxies directly.
184
+ */
185
+ declare class FetchEngine implements IEngine {
186
+ private readonly options;
187
+ private static readonly DEFAULT_OPTIONS;
188
+ /**
189
+ * Creates an instance of FetchEngine.
190
+ * @param options Configuration options for the FetchEngine.
191
+ */
192
+ constructor(options?: FetchEngineOptions);
193
+ /**
194
+ * Fetches HTML or converts to Markdown from the specified URL.
195
+ *
196
+ * @param url The URL to fetch.
197
+ * @returns A Promise resolving to an HTMLFetchResult object.
198
+ * @throws {FetchEngineHttpError} If the HTTP response status is not ok (e.g., 404, 500).
199
+ * @throws {Error} If the content type is not HTML or for other network errors.
200
+ */
201
+ fetchHTML(url: string, options?: FetchEngineOptions): Promise<HTMLFetchResult>;
202
+ /**
203
+ * Cleans up resources used by the engine.
204
+ * For FetchEngine, this is a no-op as it doesn't manage persistent resources.
205
+ * @returns A Promise that resolves when cleanup is complete.
206
+ */
207
+ cleanup(): Promise<void>;
208
+ /**
209
+ * Retrieves metrics for the engine.
210
+ * FetchEngine does not manage browsers, so it returns an empty array.
211
+ * @returns An empty array.
212
+ */
213
+ getMetrics(): BrowserMetrics[];
214
+ }
215
+
216
+ /**
217
+ * PlaywrightEngine - Fetches HTML using a managed pool of headless Playwright browser instances.
218
+ *
219
+ * This engine is suitable for dynamic websites that require JavaScript execution.
220
+ * It incorporates `playwright-extra` with the stealth plugin for enhanced anti-detection capabilities.
221
+ * Features include caching, retries, HTTP fallback, and configurable browser pooling.
222
+ */
223
+ declare class PlaywrightEngine implements IEngine {
224
+ private browserPool;
225
+ private readonly queue;
226
+ private readonly cache;
227
+ private readonly config;
228
+ private initializingBrowserPool;
229
+ private isUsingHeadedMode;
230
+ private headedFallbackSites;
231
+ private static readonly DEFAULT_CONFIG;
232
+ /**
233
+ * Creates an instance of PlaywrightEngine.
234
+ *
235
+ * @param config Configuration options for the engine and its browser pool.
236
+ * See `PlaywrightEngineConfig` for details.
237
+ */
238
+ constructor(config?: PlaywrightEngineConfig);
239
+ /**
240
+ * Initialize the browser pool with improved error handling and mode switching.
241
+ */
242
+ private initializeBrowserPool;
243
+ /**
244
+ * Fallback method using simple HTTP requests via Axios.
245
+ * Ensures return type matches HTMLFetchResult.
246
+ */
247
+ private fetchHTMLWithHttpFallback;
248
+ private checkCache;
249
+ /**
250
+ * Safely check if a page is still usable and connected.
251
+ */
252
+ private isPageValid;
253
+ /**
254
+ * Simulate human-like interactions on the page.
255
+ */
256
+ private simulateHumanBehavior;
257
+ /**
258
+ * Adds a result to the in-memory cache.
259
+ */
260
+ private addToCache;
261
+ /**
262
+ * Public method to fetch HTML. Delegates to the internal recursive fetch method.
263
+ *
264
+ * @param url The URL to fetch.
265
+ * @param options Optional settings for this specific fetch operation.
266
+ * @param options.fastMode Overrides the engine's `defaultFastMode` configuration for this request.
267
+ * @returns A Promise resolving to an HTMLFetchResult object.
268
+ * @throws {FetchError} If the fetch fails after all retries or encounters critical errors.
269
+ */
270
+ fetchHTML(url: string, options?: FetchOptions & {
271
+ markdown?: boolean;
272
+ }): Promise<HTMLFetchResult>;
273
+ /**
274
+ * Internal recursive method to handle fetching with retries.
275
+ *
276
+ * @param url URL to fetch
277
+ * @param currentConfig The merged configuration including markdown option
278
+ * @param retryAttempt Current retry attempt number (starts at 0)
279
+ * @param parentRetryCount Tracks retries related to pool initialization errors (starts at 0)
280
+ * @returns Promise resolving to HTMLFetchResult
281
+ */
282
+ private _fetchRecursive;
283
+ /**
284
+ * Performs the actual page fetch using a Playwright page from the pool.
285
+ * Ensures return type matches HTMLFetchResult.
286
+ */
287
+ private fetchWithPlaywright;
288
+ private applyBlockingRules;
289
+ /**
290
+ * Cleans up resources used by the engine, primarily closing browser instances in the pool.
291
+ *
292
+ * It is crucial to call this method when finished with the engine instance to release resources.
293
+ * @returns A Promise that resolves when cleanup is complete.
294
+ */
295
+ cleanup(): Promise<void>;
296
+ /**
297
+ * Retrieves metrics from the underlying browser pool.
298
+ * @returns An array of BrowserMetrics objects, one for each active browser instance, or an empty array if the pool is not initialized.
299
+ */
300
+ getMetrics(): BrowserMetrics[];
301
+ private shouldUseHeadedMode;
302
+ }
303
+
304
+ /**
305
+ * HybridEngine - Tries FetchEngine first, falls back to PlaywrightEngine on failure.
306
+ */
307
+ declare class HybridEngine implements IEngine {
308
+ private readonly fetchEngine;
309
+ private readonly playwrightEngine;
310
+ private readonly config;
311
+ constructor(config?: PlaywrightEngineConfig);
312
+ fetchHTML(url: string, options?: FetchOptions): Promise<HTMLFetchResult>;
313
+ /**
314
+ * Delegates getMetrics to the PlaywrightEngine.
315
+ */
316
+ getMetrics(): BrowserMetrics[];
317
+ /**
318
+ * Calls cleanup on both underlying engines.
319
+ */
320
+ cleanup(): Promise<void>;
321
+ }
322
+
323
+ export { type BrowserMetrics, FetchEngine, type HTMLFetchResult, HybridEngine, type IEngine, PlaywrightEngine };