@purepageio/fetch-engines 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +205 -0
- package/dist/FetchEngine.d.ts +46 -0
- package/dist/FetchEngine.d.ts.map +1 -0
- package/dist/FetchEngine.js +137 -0
- package/dist/FetchEngine.js.map +1 -0
- package/dist/FetchEngine.test.d.ts +2 -0
- package/dist/FetchEngine.test.d.ts.map +1 -0
- package/dist/FetchEngine.test.js +44 -0
- package/dist/FetchEngine.test.js.map +1 -0
- package/dist/HybridEngine.d.ts +15 -0
- package/dist/HybridEngine.d.ts.map +1 -0
- package/dist/HybridEngine.js +45 -0
- package/dist/HybridEngine.js.map +1 -0
- package/dist/IEngine.d.ts +22 -0
- package/dist/IEngine.d.ts.map +1 -0
- package/dist/IEngine.js +2 -0
- package/dist/IEngine.js.map +1 -0
- package/dist/PlaywrightEngine.d.ts +88 -0
- package/dist/PlaywrightEngine.d.ts.map +1 -0
- package/dist/PlaywrightEngine.js +484 -0
- package/dist/PlaywrightEngine.js.map +1 -0
- package/dist/PlaywrightEngine.test.d.ts +2 -0
- package/dist/PlaywrightEngine.test.d.ts.map +1 -0
- package/dist/PlaywrightEngine.test.js +299 -0
- package/dist/PlaywrightEngine.test.js.map +1 -0
- package/dist/PuppeteerEngine.d.ts +21 -0
- package/dist/PuppeteerEngine.d.ts.map +1 -0
- package/dist/PuppeteerEngine.js +412 -0
- package/dist/PuppeteerEngine.js.map +1 -0
- package/dist/browser/BrowserPool.d.ts +29 -0
- package/dist/browser/BrowserPool.d.ts.map +1 -0
- package/dist/browser/BrowserPool.js +378 -0
- package/dist/browser/BrowserPool.js.map +1 -0
- package/dist/browser/PlaywrightBrowserPool.d.ts +78 -0
- package/dist/browser/PlaywrightBrowserPool.d.ts.map +1 -0
- package/dist/browser/PlaywrightBrowserPool.js +429 -0
- package/dist/browser/PlaywrightBrowserPool.js.map +1 -0
- package/dist/browser/PlaywrightBrowserPool.test.d.ts +2 -0
- package/dist/browser/PlaywrightBrowserPool.test.d.ts.map +1 -0
- package/dist/browser/PlaywrightBrowserPool.test.js +422 -0
- package/dist/browser/PlaywrightBrowserPool.test.js.map +1 -0
- package/dist/errors.d.ts +20 -0
- package/dist/errors.d.ts.map +1 -0
- package/dist/errors.js +30 -0
- package/dist/errors.js.map +1 -0
- package/dist/index.d.ts +8 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +5 -0
- package/dist/index.js.map +1 -0
- package/dist/types.d.ts +151 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +2 -0
- package/dist/types.js.map +1 -0
- package/package.json +72 -0
package/dist/types.d.ts
ADDED
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
import type { Browser as PlaywrightBrowser, BrowserContext } from "playwright";
|
|
2
|
+
/**
|
|
3
|
+
* Result object returned by engine's fetchHTML method.
|
|
4
|
+
*/
|
|
5
|
+
export interface HTMLFetchResult {
|
|
6
|
+
/** The full HTML content of the fetched page. */
|
|
7
|
+
html: string;
|
|
8
|
+
/** The extracted content of the <title> tag, or an empty string if not found. */
|
|
9
|
+
title: string;
|
|
10
|
+
/** The final URL after any redirects. */
|
|
11
|
+
url: string;
|
|
12
|
+
/** Indicates if the result was served from the engine's cache. */
|
|
13
|
+
isFromCache: boolean;
|
|
14
|
+
/** The HTTP status code of the final response, if available. */
|
|
15
|
+
statusCode?: number;
|
|
16
|
+
/** Error object if the fetch failed after all retries. */
|
|
17
|
+
error?: Error;
|
|
18
|
+
}
|
|
19
|
+
/**
|
|
20
|
+
* Metrics related to browser pool performance and status.
|
|
21
|
+
*/
|
|
22
|
+
export interface BrowserMetrics {
|
|
23
|
+
id: string;
|
|
24
|
+
engine?: "playwright" | string;
|
|
25
|
+
pagesCreated: number;
|
|
26
|
+
activePages: number;
|
|
27
|
+
lastUsed: Date;
|
|
28
|
+
errors: number;
|
|
29
|
+
totalRequests?: number;
|
|
30
|
+
avgResponseTime?: number;
|
|
31
|
+
createdAt: Date;
|
|
32
|
+
isHealthy: boolean;
|
|
33
|
+
}
|
|
34
|
+
/**
|
|
35
|
+
* Internal representation of a Playwright browser instance within the pool.
|
|
36
|
+
*/
|
|
37
|
+
export interface BrowserInstance {
|
|
38
|
+
browser: PlaywrightBrowser;
|
|
39
|
+
context: BrowserContext;
|
|
40
|
+
metrics: BrowserMetrics;
|
|
41
|
+
isHealthy: boolean;
|
|
42
|
+
}
|
|
43
|
+
/**
|
|
44
|
+
* Configuration options for the PlaywrightEngine.
|
|
45
|
+
*/
|
|
46
|
+
export interface PlaywrightEngineConfig {
|
|
47
|
+
/**
|
|
48
|
+
* Maximum number of Playwright pages to process concurrently.
|
|
49
|
+
* @default 3
|
|
50
|
+
*/
|
|
51
|
+
concurrentPages?: number;
|
|
52
|
+
/**
|
|
53
|
+
* Maximum number of retry attempts for a failed fetch operation (excluding initial attempt).
|
|
54
|
+
* @default 3
|
|
55
|
+
*/
|
|
56
|
+
maxRetries?: number;
|
|
57
|
+
/**
|
|
58
|
+
* Delay in milliseconds between retry attempts.
|
|
59
|
+
* @default 5000
|
|
60
|
+
*/
|
|
61
|
+
retryDelay?: number;
|
|
62
|
+
/**
|
|
63
|
+
* Time-to-live for cached results in milliseconds. Set to 0 to disable.
|
|
64
|
+
* @default 900000 (15 minutes)
|
|
65
|
+
*/
|
|
66
|
+
cacheTTL?: number;
|
|
67
|
+
/**
|
|
68
|
+
* If true, attempts a fast HTTP GET before using Playwright.
|
|
69
|
+
* @default true
|
|
70
|
+
*/
|
|
71
|
+
useHttpFallback?: boolean;
|
|
72
|
+
/**
|
|
73
|
+
* If true, automatically retries failed requests for a domain in headed mode.
|
|
74
|
+
* @default false
|
|
75
|
+
*/
|
|
76
|
+
useHeadedModeFallback?: boolean;
|
|
77
|
+
/**
|
|
78
|
+
* If true, requests initially block non-essential resources and skip human simulation.
|
|
79
|
+
* Can be overridden per-request via fetchHTML options.
|
|
80
|
+
* @default true
|
|
81
|
+
*/
|
|
82
|
+
defaultFastMode?: boolean;
|
|
83
|
+
/**
|
|
84
|
+
* If true (and not in fastMode), attempts basic human-like interactions.
|
|
85
|
+
* @default true
|
|
86
|
+
*/
|
|
87
|
+
simulateHumanBehavior?: boolean;
|
|
88
|
+
/**
|
|
89
|
+
* Maximum number of concurrent browser instances the pool manages.
|
|
90
|
+
* Passed to PlaywrightBrowserPool.
|
|
91
|
+
* @default 2
|
|
92
|
+
*/
|
|
93
|
+
maxBrowsers?: number;
|
|
94
|
+
/**
|
|
95
|
+
* Maximum number of pages per browser context before recycling.
|
|
96
|
+
* Passed to PlaywrightBrowserPool.
|
|
97
|
+
* @default 6
|
|
98
|
+
*/
|
|
99
|
+
maxPagesPerContext?: number;
|
|
100
|
+
/**
|
|
101
|
+
* Maximum age in ms a browser instance lives before recycling.
|
|
102
|
+
* Passed to PlaywrightBrowserPool.
|
|
103
|
+
* @default 1200000 (20 minutes)
|
|
104
|
+
*/
|
|
105
|
+
maxBrowserAge?: number;
|
|
106
|
+
/**
|
|
107
|
+
* How often (in ms) the pool checks browser health.
|
|
108
|
+
* Passed to PlaywrightBrowserPool.
|
|
109
|
+
* @default 60000 (1 minute)
|
|
110
|
+
*/
|
|
111
|
+
healthCheckInterval?: number;
|
|
112
|
+
/**
|
|
113
|
+
* List of domain glob patterns to block requests to. Overrides pool default.
|
|
114
|
+
* Passed to PlaywrightBrowserPool.
|
|
115
|
+
* @default [] (uses pool's defaults)
|
|
116
|
+
*/
|
|
117
|
+
poolBlockedDomains?: string[];
|
|
118
|
+
/**
|
|
119
|
+
* List of Playwright resource types (e.g., 'image', 'font') to block. Overrides pool default.
|
|
120
|
+
* Passed to PlaywrightBrowserPool.
|
|
121
|
+
* @default [] (uses pool's defaults)
|
|
122
|
+
*/
|
|
123
|
+
poolBlockedResourceTypes?: string[];
|
|
124
|
+
/**
|
|
125
|
+
* Proxy configuration for browser instances.
|
|
126
|
+
* Passed to PlaywrightBrowserPool.
|
|
127
|
+
* @default undefined
|
|
128
|
+
*/
|
|
129
|
+
proxy?: {
|
|
130
|
+
/** Proxy server URL (e.g., "http://host:port", "socks5://user:pass@host:port"). */
|
|
131
|
+
server: string;
|
|
132
|
+
/** Optional proxy username. */
|
|
133
|
+
username?: string;
|
|
134
|
+
/** Optional proxy password. */
|
|
135
|
+
password?: string;
|
|
136
|
+
};
|
|
137
|
+
/**
|
|
138
|
+
* Forces the entire pool to launch browsers in headed (visible) mode.
|
|
139
|
+
* Passed to PlaywrightBrowserPool.
|
|
140
|
+
* @default false
|
|
141
|
+
*/
|
|
142
|
+
useHeadedMode?: boolean;
|
|
143
|
+
}
|
|
144
|
+
/**
|
|
145
|
+
* Options that can be passed per-request to engine.fetchHTML().
|
|
146
|
+
*/
|
|
147
|
+
export interface FetchOptions {
|
|
148
|
+
/** Overrides the engine's defaultFastMode for this specific request. */
|
|
149
|
+
fastMode?: boolean;
|
|
150
|
+
}
|
|
151
|
+
//# sourceMappingURL=types.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,OAAO,IAAI,iBAAiB,EAAE,cAAc,EAAE,MAAM,YAAY,CAAC;AAE/E;;GAEG;AACH,MAAM,WAAW,eAAe;IAC9B,iDAAiD;IACjD,IAAI,EAAE,MAAM,CAAC;IACb,iFAAiF;IACjF,KAAK,EAAE,MAAM,CAAC;IACd,yCAAyC;IACzC,GAAG,EAAE,MAAM,CAAC;IACZ,kEAAkE;IAClE,WAAW,EAAE,OAAO,CAAC;IACrB,gEAAgE;IAChE,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,0DAA0D;IAC1D,KAAK,CAAC,EAAE,KAAK,CAAC;CACf;AAED;;GAEG;AACH,MAAM,WAAW,cAAc;IAC7B,EAAE,EAAE,MAAM,CAAC;IACX,MAAM,CAAC,EAAE,YAAY,GAAG,MAAM,CAAC;IAC/B,YAAY,EAAE,MAAM,CAAC;IACrB,WAAW,EAAE,MAAM,CAAC;IACpB,QAAQ,EAAE,IAAI,CAAC;IACf,MAAM,EAAE,MAAM,CAAC;IACf,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,SAAS,EAAE,IAAI,CAAC;IAChB,SAAS,EAAE,OAAO,CAAC;CACpB;AAED;;GAEG;AACH,MAAM,WAAW,eAAe;IAC9B,OAAO,EAAE,iBAAiB,CAAC;IAC3B,OAAO,EAAE,cAAc,CAAC;IACxB,OAAO,EAAE,cAAc,CAAC;IACxB,SAAS,EAAE,OAAO,CAAC;CACpB;AAKD;;GAEG;AACH,MAAM,WAAW,sBAAsB;IACrC;;;OAGG;IACH,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB;;;OAGG;IACH,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB;;;OAGG;IACH,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB;;;OAGG;IACH,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB;;;OAGG;IACH,eAAe,CAAC,EAAE,OAAO,CAAC;IAC1B;;;OAGG;IACH,qBAAqB,CAAC,EAAE,OAAO,CAAC;IAChC;;;;OAIG;IACH,eAAe,CAAC,EAAE,OAAO,CAAC;IAC1B;;;OAGG;IACH,qBAAqB,CAAC,EAAE,OAAO,CAAC;IAIhC;;;;OAIG;IACH,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB;;;;OAIG;IACH,kBAAkB,CAAC,EAAE,MAAM,CAAC;IAC5B;;;;OAIG;IACH,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB;;;;OAIG;IACH,mBAAmB,CAAC,EAAE,MAAM,CAAC;IAC7B;;;;OAIG;IACH,kBAAkB,CAAC,EAAE,MAAM,EAAE,CAAC;IAC9B;;;;OAIG;IACH,wBAAwB,CAAC,EAAE,MAAM,EAAE,CAAC;IACpC;;;;OAIG;IACH,KAAK,CAAC,EAAE;QACN,mFAAmF;QACnF,MAAM,EAAE,MAAM,CAAC;QACf,+BAA+B;QAC/B,QAAQ,CAAC,EAAE,MAAM,CAAC;QAClB,+BAA+B;QAC/B,QAAQ,CAAC,EAAE,MAAM,CAAC;KACnB,CAAC;IACF;;;;OAIG;IACH,aAAa,CAAC,EAAE,OAAO,CAAC;CACzB;AAED;;GAEG;AACH,MAAM,WAAW,YAAY;IAC3B,wEAAwE;IACxE,QAAQ,CAAC,EAAE,OAAO,CAAC;CACpB"}
|
package/dist/types.js
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.js","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":""}
|
package/package.json
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@purepageio/fetch-engines",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"type": "module",
|
|
5
|
+
"description": "A collection of configurable engines for fetching HTML content using fetch or Playwright.",
|
|
6
|
+
"main": "dist/index.js",
|
|
7
|
+
"types": "dist/index.d.ts",
|
|
8
|
+
"files": [
|
|
9
|
+
"dist",
|
|
10
|
+
"README.md",
|
|
11
|
+
"LICENSE"
|
|
12
|
+
],
|
|
13
|
+
"dependencies": {
|
|
14
|
+
"axios": "^1.6.8",
|
|
15
|
+
"jsdom": "^24.0.0",
|
|
16
|
+
"p-queue": "^7.4.1",
|
|
17
|
+
"playwright": "^1.43.0",
|
|
18
|
+
"playwright-extra": "^4.3.6",
|
|
19
|
+
"puppeteer-extra-plugin-stealth": "^2.11.2",
|
|
20
|
+
"user-agents": "^1.1.208",
|
|
21
|
+
"uuid": "^11.1.0"
|
|
22
|
+
},
|
|
23
|
+
"devDependencies": {
|
|
24
|
+
"@types/axios": "^0.14.0",
|
|
25
|
+
"@types/jsdom": "^21.1.6",
|
|
26
|
+
"@types/node": "^18.0.0",
|
|
27
|
+
"@types/user-agents": "^1.0.4",
|
|
28
|
+
"@types/uuid": "^10.0.0",
|
|
29
|
+
"@typescript-eslint/eslint-plugin": "^8.29.0",
|
|
30
|
+
"@typescript-eslint/parser": "^8.29.0",
|
|
31
|
+
"eslint": "^9.23.0",
|
|
32
|
+
"eslint-config-prettier": "^10.1.1",
|
|
33
|
+
"eslint-plugin-prettier": "^5.2.5",
|
|
34
|
+
"globals": "^16.0.0",
|
|
35
|
+
"prettier": "^3.5.3",
|
|
36
|
+
"ts-node": "^10.9.2",
|
|
37
|
+
"typescript": "^5.3.3",
|
|
38
|
+
"typescript-eslint": "^8.29.0",
|
|
39
|
+
"vitest": "^3.1.1"
|
|
40
|
+
},
|
|
41
|
+
"engines": {
|
|
42
|
+
"node": ">=22.0.0"
|
|
43
|
+
},
|
|
44
|
+
"repository": {
|
|
45
|
+
"type": "git",
|
|
46
|
+
"url": "git+https://github.com/purepageio/fetch-engines.git"
|
|
47
|
+
},
|
|
48
|
+
"keywords": [
|
|
49
|
+
"fetch",
|
|
50
|
+
"scrape",
|
|
51
|
+
"playwright",
|
|
52
|
+
"headless",
|
|
53
|
+
"browser",
|
|
54
|
+
"automation",
|
|
55
|
+
"engine"
|
|
56
|
+
],
|
|
57
|
+
"author": "Purepage",
|
|
58
|
+
"license": "MIT",
|
|
59
|
+
"bugs": {
|
|
60
|
+
"url": "https://github.com/purepageio/fetch-engines/issues"
|
|
61
|
+
},
|
|
62
|
+
"homepage": "https://github.com/purepageio/fetch-engines#readme",
|
|
63
|
+
"publishConfig": {
|
|
64
|
+
"access": "public"
|
|
65
|
+
},
|
|
66
|
+
"scripts": {
|
|
67
|
+
"build": "tsc",
|
|
68
|
+
"lint": "eslint \"src/**/*.ts\" \"examples/**/*.ts\"",
|
|
69
|
+
"format": "prettier --write \"src/**/*.ts\" \"examples/**/*.ts\" \"*.{js,cjs,json,md}\"",
|
|
70
|
+
"test": "vitest run"
|
|
71
|
+
}
|
|
72
|
+
}
|