@purepageio/fetch-engines 0.2.5 → 0.2.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +1657 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +323 -0
- package/dist/index.d.ts +323 -8
- package/dist/index.js +1617 -4
- package/dist/index.js.map +1 -1
- package/package.json +14 -5
- package/dist/FetchEngine.d.ts +0 -47
- package/dist/FetchEngine.d.ts.map +0 -1
- package/dist/FetchEngine.js +0 -114
- package/dist/FetchEngine.js.map +0 -1
- package/dist/FetchEngine.test.d.ts +0 -2
- package/dist/FetchEngine.test.d.ts.map +0 -1
- package/dist/FetchEngine.test.js +0 -44
- package/dist/FetchEngine.test.js.map +0 -1
- package/dist/HybridEngine.d.ts +0 -21
- package/dist/HybridEngine.d.ts.map +0 -1
- package/dist/HybridEngine.js +0 -62
- package/dist/HybridEngine.js.map +0 -1
- package/dist/IEngine.d.ts +0 -22
- package/dist/IEngine.d.ts.map +0 -1
- package/dist/IEngine.js +0 -2
- package/dist/IEngine.js.map +0 -1
- package/dist/PlaywrightEngine.d.ts +0 -90
- package/dist/PlaywrightEngine.d.ts.map +0 -1
- package/dist/PlaywrightEngine.js +0 -505
- package/dist/PlaywrightEngine.js.map +0 -1
- package/dist/PlaywrightEngine.test.d.ts +0 -2
- package/dist/PlaywrightEngine.test.d.ts.map +0 -1
- package/dist/PlaywrightEngine.test.js +0 -207
- package/dist/PlaywrightEngine.test.js.map +0 -1
- package/dist/PuppeteerEngine.d.ts +0 -21
- package/dist/PuppeteerEngine.d.ts.map +0 -1
- package/dist/PuppeteerEngine.js +0 -412
- package/dist/PuppeteerEngine.js.map +0 -1
- package/dist/browser/BrowserPool.d.ts +0 -29
- package/dist/browser/BrowserPool.d.ts.map +0 -1
- package/dist/browser/BrowserPool.js +0 -378
- package/dist/browser/BrowserPool.js.map +0 -1
- package/dist/browser/PlaywrightBrowserPool.d.ts +0 -48
- package/dist/browser/PlaywrightBrowserPool.d.ts.map +0 -1
- package/dist/browser/PlaywrightBrowserPool.js +0 -378
- package/dist/browser/PlaywrightBrowserPool.js.map +0 -1
- package/dist/browser/PlaywrightBrowserPool.test.d.ts +0 -2
- package/dist/browser/PlaywrightBrowserPool.test.d.ts.map +0 -1
- package/dist/browser/PlaywrightBrowserPool.test.js +0 -422
- package/dist/browser/PlaywrightBrowserPool.test.js.map +0 -1
- package/dist/errors.d.ts +0 -20
- package/dist/errors.d.ts.map +0 -1
- package/dist/errors.js +0 -30
- package/dist/errors.js.map +0 -1
- package/dist/index.d.ts.map +0 -1
- package/dist/types.d.ts +0 -167
- package/dist/types.d.ts.map +0 -1
- package/dist/types.js +0 -2
- package/dist/types.js.map +0 -1
- package/dist/utils/markdown-converter.d.ts +0 -31
- package/dist/utils/markdown-converter.d.ts.map +0 -1
- package/dist/utils/markdown-converter.js +0 -796
- package/dist/utils/markdown-converter.js.map +0 -1
package/dist/HybridEngine.js
DELETED
|
@@ -1,62 +0,0 @@
|
|
|
1
|
-
import { FetchEngine } from "./FetchEngine.js";
|
|
2
|
-
import { PlaywrightEngine } from "./PlaywrightEngine.js";
|
|
3
|
-
/**
|
|
4
|
-
* HybridEngine - Tries FetchEngine first, falls back to PlaywrightEngine on failure.
|
|
5
|
-
*/
|
|
6
|
-
export class HybridEngine {
|
|
7
|
-
fetchEngine;
|
|
8
|
-
playwrightEngine;
|
|
9
|
-
config; // Store config for potential per-request PW overrides
|
|
10
|
-
constructor(config = {}) {
|
|
11
|
-
// Pass relevant config parts to each engine
|
|
12
|
-
// FetchEngine only takes markdown option from the shared config
|
|
13
|
-
this.fetchEngine = new FetchEngine({ markdown: config.markdown });
|
|
14
|
-
this.playwrightEngine = new PlaywrightEngine(config);
|
|
15
|
-
this.config = config; // Store for merging later
|
|
16
|
-
}
|
|
17
|
-
async fetchHTML(url, options = {}) {
|
|
18
|
-
// FetchEngine uses its constructor config; it doesn't accept per-request options here.
|
|
19
|
-
try {
|
|
20
|
-
const fetchResult = await this.fetchEngine.fetchHTML(url);
|
|
21
|
-
// If fetch succeeded, return its result directly (it handles its own markdown config)
|
|
22
|
-
// No need to check contentType here, FetchEngine handles it based on its constructor.
|
|
23
|
-
return fetchResult;
|
|
24
|
-
}
|
|
25
|
-
catch (fetchError) {
|
|
26
|
-
console.warn(`FetchEngine failed for ${url}: ${fetchError.message}. Falling back to PlaywrightEngine.`);
|
|
27
|
-
// Merge constructor config with per-request options for Playwright fallback
|
|
28
|
-
const playwrightOptions = {
|
|
29
|
-
...this.config, // Start with base config given to HybridEngine
|
|
30
|
-
...options, // Override with per-request options
|
|
31
|
-
};
|
|
32
|
-
try {
|
|
33
|
-
// Pass merged options to PlaywrightEngine
|
|
34
|
-
const playwrightResult = await this.playwrightEngine.fetchHTML(url, playwrightOptions);
|
|
35
|
-
return playwrightResult;
|
|
36
|
-
}
|
|
37
|
-
catch (playwrightError) {
|
|
38
|
-
// Catch potential Playwright error
|
|
39
|
-
console.error(`PlaywrightEngine fallback failed for ${url}: ${playwrightError.message}`);
|
|
40
|
-
// Optionally, wrap or prioritize which error to throw
|
|
41
|
-
// Throwing the Playwright error as it's the last one encountered
|
|
42
|
-
throw playwrightError;
|
|
43
|
-
}
|
|
44
|
-
}
|
|
45
|
-
}
|
|
46
|
-
/**
|
|
47
|
-
* Delegates getMetrics to the PlaywrightEngine.
|
|
48
|
-
*/
|
|
49
|
-
getMetrics() {
|
|
50
|
-
return this.playwrightEngine.getMetrics();
|
|
51
|
-
}
|
|
52
|
-
/**
|
|
53
|
-
* Calls cleanup on both underlying engines.
|
|
54
|
-
*/
|
|
55
|
-
async cleanup() {
|
|
56
|
-
await Promise.allSettled([
|
|
57
|
-
this.fetchEngine.cleanup(), // Although a no-op, call for consistency
|
|
58
|
-
this.playwrightEngine.cleanup(),
|
|
59
|
-
]);
|
|
60
|
-
}
|
|
61
|
-
}
|
|
62
|
-
//# sourceMappingURL=HybridEngine.js.map
|
package/dist/HybridEngine.js.map
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"HybridEngine.js","sourceRoot":"","sources":["../src/HybridEngine.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,WAAW,EAAE,MAAM,kBAAkB,CAAC;AAC/C,OAAO,EAAE,gBAAgB,EAAE,MAAM,uBAAuB,CAAC;AAIzD;;GAEG;AACH,MAAM,OAAO,YAAY;IACN,WAAW,CAAc;IACzB,gBAAgB,CAAmB;IACnC,MAAM,CAAyB,CAAC,sDAAsD;IAEvG,YAAY,SAAiC,EAAE;QAC7C,4CAA4C;QAC5C,gEAAgE;QAChE,IAAI,CAAC,WAAW,GAAG,IAAI,WAAW,CAAC,EAAE,QAAQ,EAAE,MAAM,CAAC,QAAQ,EAAE,CAAC,CAAC;QAClE,IAAI,CAAC,gBAAgB,GAAG,IAAI,gBAAgB,CAAC,MAAM,CAAC,CAAC;QACrD,IAAI,CAAC,MAAM,GAAG,MAAM,CAAC,CAAC,0BAA0B;IAClD,CAAC;IAED,KAAK,CAAC,SAAS,CAAC,GAAW,EAAE,UAAwB,EAAE;QACrD,uFAAuF;QACvF,IAAI,CAAC;YACH,MAAM,WAAW,GAAG,MAAM,IAAI,CAAC,WAAW,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC;YAC1D,sFAAsF;YACtF,sFAAsF;YACtF,OAAO,WAAW,CAAC;QACrB,CAAC;QAAC,OAAO,UAAe,EAAE,CAAC;YACzB,OAAO,CAAC,IAAI,CAAC,0BAA0B,GAAG,KAAK,UAAU,CAAC,OAAO,qCAAqC,CAAC,CAAC;YAExG,4EAA4E;YAC5E,MAAM,iBAAiB,GAAiB;gBACtC,GAAG,IAAI,CAAC,MAAM,EAAE,+CAA+C;gBAC/D,GAAG,OAAO,EAAE,oCAAoC;aACjD,CAAC;YAEF,IAAI,CAAC;gBACH,0CAA0C;gBAC1C,MAAM,gBAAgB,GAAG,MAAM,IAAI,CAAC,gBAAgB,CAAC,SAAS,CAAC,GAAG,EAAE,iBAAiB,CAAC,CAAC;gBACvF,OAAO,gBAAgB,CAAC;YAC1B,CAAC;YAAC,OAAO,eAAoB,EAAE,CAAC;gBAC9B,mCAAmC;gBACnC,OAAO,CAAC,KAAK,CAAC,wCAAwC,GAAG,KAAK,eAAe,CAAC,OAAO,EAAE,CAAC,CAAC;gBACzF,sDAAsD;gBACtD,iEAAiE;gBACjE,MAAM,eAAe,CAAC;YACxB,CAAC;QACH,CAAC;IACH,CAAC;IAED;;OAEG;IACH,UAAU;QACR,OAAO,IAAI,CAAC,gBAAgB,CAAC,UAAU,EAAE,CAAC;IAC5C,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,OAAO;QACX,MAAM,OAAO,CAAC,UAAU,CAAC;YACvB,IAAI,CAAC,WAAW,CAAC,OAAO,EAAE,EAAE,yCAAyC;YACrE,IAAI,CAAC,gBAAgB,CAAC,OAAO,EAAE;SAChC,CAAC,CAAC;IACL,CAAC;CACF"}
|
package/dist/IEngine.d.ts
DELETED
|
@@ -1,22 +0,0 @@
|
|
|
1
|
-
import type { HTMLFetchResult, BrowserMetrics } from "./types.js";
|
|
2
|
-
/**
|
|
3
|
-
* Interface for browser engines that can fetch HTML content from URLs
|
|
4
|
-
*/
|
|
5
|
-
export interface IEngine {
|
|
6
|
-
/**
|
|
7
|
-
* Fetches HTML content from a URL
|
|
8
|
-
* @param url The URL to fetch
|
|
9
|
-
* @returns A promise that resolves to an HTMLFetchResult
|
|
10
|
-
*/
|
|
11
|
-
fetchHTML(url: string): Promise<HTMLFetchResult>;
|
|
12
|
-
/**
|
|
13
|
-
* Cleans up resources used by the engine
|
|
14
|
-
*/
|
|
15
|
-
cleanup(): Promise<void>;
|
|
16
|
-
/**
|
|
17
|
-
* Gets metrics about the engine's performance
|
|
18
|
-
* @returns An array of BrowserMetrics
|
|
19
|
-
*/
|
|
20
|
-
getMetrics(): BrowserMetrics[];
|
|
21
|
-
}
|
|
22
|
-
//# sourceMappingURL=IEngine.d.ts.map
|
package/dist/IEngine.d.ts.map
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"IEngine.d.ts","sourceRoot":"","sources":["../src/IEngine.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,eAAe,EAAE,cAAc,EAAE,MAAM,YAAY,CAAC;AAElE;;GAEG;AACH,MAAM,WAAW,OAAO;IACtB;;;;OAIG;IACH,SAAS,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC,eAAe,CAAC,CAAC;IAEjD;;OAEG;IACH,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC,CAAC;IAEzB;;;OAGG;IACH,UAAU,IAAI,cAAc,EAAE,CAAC;CAChC"}
|
package/dist/IEngine.js
DELETED
package/dist/IEngine.js.map
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"IEngine.js","sourceRoot":"","sources":["../src/IEngine.ts"],"names":[],"mappings":""}
|
package/dist/PlaywrightEngine.d.ts
DELETED
|
@@ -1,90 +0,0 @@
|
|
|
1
|
-
import type { HTMLFetchResult, BrowserMetrics, PlaywrightEngineConfig, FetchOptions } from "./types.js";
|
|
2
|
-
import type { IEngine } from "./IEngine.js";
|
|
3
|
-
/**
|
|
4
|
-
* PlaywrightEngine - Fetches HTML using a managed pool of headless Playwright browser instances.
|
|
5
|
-
*
|
|
6
|
-
* This engine is suitable for dynamic websites that require JavaScript execution.
|
|
7
|
-
* It incorporates `playwright-extra` with the stealth plugin for enhanced anti-detection capabilities.
|
|
8
|
-
* Features include caching, retries, HTTP fallback, and configurable browser pooling.
|
|
9
|
-
*/
|
|
10
|
-
export declare class PlaywrightEngine implements IEngine {
|
|
11
|
-
private browserPool;
|
|
12
|
-
private readonly queue;
|
|
13
|
-
private readonly cache;
|
|
14
|
-
private readonly config;
|
|
15
|
-
private initializingBrowserPool;
|
|
16
|
-
private isUsingHeadedMode;
|
|
17
|
-
private headedFallbackSites;
|
|
18
|
-
private static readonly DEFAULT_CONFIG;
|
|
19
|
-
/**
|
|
20
|
-
* Creates an instance of PlaywrightEngine.
|
|
21
|
-
*
|
|
22
|
-
* @param config Configuration options for the engine and its browser pool.
|
|
23
|
-
* See `PlaywrightEngineConfig` for details.
|
|
24
|
-
*/
|
|
25
|
-
constructor(config?: PlaywrightEngineConfig);
|
|
26
|
-
/**
|
|
27
|
-
* Initialize the browser pool with improved error handling and mode switching.
|
|
28
|
-
*/
|
|
29
|
-
private initializeBrowserPool;
|
|
30
|
-
/**
|
|
31
|
-
* Fallback method using simple HTTP requests via Axios.
|
|
32
|
-
* Ensures return type matches HTMLFetchResult.
|
|
33
|
-
*/
|
|
34
|
-
private fetchHTMLWithHttpFallback;
|
|
35
|
-
private checkCache;
|
|
36
|
-
/**
|
|
37
|
-
* Safely check if a page is still usable and connected.
|
|
38
|
-
*/
|
|
39
|
-
private isPageValid;
|
|
40
|
-
/**
|
|
41
|
-
* Simulate human-like interactions on the page.
|
|
42
|
-
*/
|
|
43
|
-
private simulateHumanBehavior;
|
|
44
|
-
/**
|
|
45
|
-
* Adds a result to the in-memory cache.
|
|
46
|
-
*/
|
|
47
|
-
private addToCache;
|
|
48
|
-
/**
|
|
49
|
-
* Public method to fetch HTML. Delegates to the internal recursive fetch method.
|
|
50
|
-
*
|
|
51
|
-
* @param url The URL to fetch.
|
|
52
|
-
* @param options Optional settings for this specific fetch operation.
|
|
53
|
-
* @param options.fastMode Overrides the engine's `defaultFastMode` configuration for this request.
|
|
54
|
-
* @returns A Promise resolving to an HTMLFetchResult object.
|
|
55
|
-
* @throws {FetchError} If the fetch fails after all retries or encounters critical errors.
|
|
56
|
-
*/
|
|
57
|
-
fetchHTML(url: string, options?: FetchOptions & {
|
|
58
|
-
markdown?: boolean;
|
|
59
|
-
}): Promise<HTMLFetchResult>;
|
|
60
|
-
/**
|
|
61
|
-
* Internal recursive method to handle fetching with retries.
|
|
62
|
-
*
|
|
63
|
-
* @param url URL to fetch
|
|
64
|
-
* @param currentConfig The merged configuration including markdown option
|
|
65
|
-
* @param retryAttempt Current retry attempt number (starts at 0)
|
|
66
|
-
* @param parentRetryCount Tracks retries related to pool initialization errors (starts at 0)
|
|
67
|
-
* @returns Promise resolving to HTMLFetchResult
|
|
68
|
-
*/
|
|
69
|
-
private _fetchRecursive;
|
|
70
|
-
/**
|
|
71
|
-
* Performs the actual page fetch using a Playwright page from the pool.
|
|
72
|
-
* Ensures return type matches HTMLFetchResult.
|
|
73
|
-
*/
|
|
74
|
-
private fetchWithPlaywright;
|
|
75
|
-
private applyBlockingRules;
|
|
76
|
-
/**
|
|
77
|
-
* Cleans up resources used by the engine, primarily closing browser instances in the pool.
|
|
78
|
-
*
|
|
79
|
-
* It is crucial to call this method when finished with the engine instance to release resources.
|
|
80
|
-
* @returns A Promise that resolves when cleanup is complete.
|
|
81
|
-
*/
|
|
82
|
-
cleanup(): Promise<void>;
|
|
83
|
-
/**
|
|
84
|
-
* Retrieves metrics from the underlying browser pool.
|
|
85
|
-
* @returns An array of BrowserMetrics objects, one for each active browser instance, or an empty array if the pool is not initialized.
|
|
86
|
-
*/
|
|
87
|
-
getMetrics(): BrowserMetrics[];
|
|
88
|
-
private shouldUseHeadedMode;
|
|
89
|
-
}
|
|
90
|
-
//# sourceMappingURL=PlaywrightEngine.d.ts.map
|
package/dist/PlaywrightEngine.d.ts.map
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"PlaywrightEngine.d.ts","sourceRoot":"","sources":["../src/PlaywrightEngine.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,eAAe,EAAE,cAAc,EAAE,sBAAsB,EAAE,YAAY,EAAE,MAAM,YAAY,CAAC;AACxG,OAAO,KAAK,EAAE,OAAO,EAAE,MAAM,cAAc,CAAC;AAmB5C;;;;;;GAMG;AACH,qBAAa,gBAAiB,YAAW,OAAO;IAC9C,OAAO,CAAC,WAAW,CAAsC;IACzD,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAS;IAC/B,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAsC;IAC5D,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAmC;IAG1D,OAAO,CAAC,uBAAuB,CAAkB;IACjD,OAAO,CAAC,iBAAiB,CAAkB;IAC3C,OAAO,CAAC,mBAAmB,CAA0B;IAGrD,OAAO,CAAC,MAAM,CAAC,QAAQ,CAAC,cAAc,CAkBpC;IAEF;;;;;OAKG;gBACS,MAAM,GAAE,sBAA2B;IAM/C;;OAEG;YACW,qBAAqB;IAuCnC;;;OAGG;YACW,yBAAyB;IAmFvC,OAAO,CAAC,UAAU;IAalB;;OAEG;YACW,WAAW;IAazB;;OAEG;YACW,qBAAqB;IAqCnC;;OAEG;IACH,OAAO,CAAC,UAAU;IAUlB;;;;;;;;OAQG;IACG,SAAS,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,GAAE,YAAY,GAAG;QAAE,QAAQ,CAAC,EAAE,OAAO,CAAA;KAAO,GAAG,OAAO,CAAC,eAAe,CAAC;IAU3G;;;;;;;;OAQG;YACW,eAAe;IAsH7B;;;OAGG;YACW,mBAAmB;YAmFnB,kBAAkB;IAmChC;;;;;OAKG;IACG,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC;IAe9B;;;OAGG;IACH,UAAU,IAAI,cAAc,EAAE;IAQ9B,OAAO,CAAC,mBAAmB;CAS5B"}
|