@purepageio/fetch-engines 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +205 -0
  3. package/dist/FetchEngine.d.ts +46 -0
  4. package/dist/FetchEngine.d.ts.map +1 -0
  5. package/dist/FetchEngine.js +137 -0
  6. package/dist/FetchEngine.js.map +1 -0
  7. package/dist/FetchEngine.test.d.ts +2 -0
  8. package/dist/FetchEngine.test.d.ts.map +1 -0
  9. package/dist/FetchEngine.test.js +44 -0
  10. package/dist/FetchEngine.test.js.map +1 -0
  11. package/dist/HybridEngine.d.ts +15 -0
  12. package/dist/HybridEngine.d.ts.map +1 -0
  13. package/dist/HybridEngine.js +45 -0
  14. package/dist/HybridEngine.js.map +1 -0
  15. package/dist/IEngine.d.ts +22 -0
  16. package/dist/IEngine.d.ts.map +1 -0
  17. package/dist/IEngine.js +2 -0
  18. package/dist/IEngine.js.map +1 -0
  19. package/dist/PlaywrightEngine.d.ts +88 -0
  20. package/dist/PlaywrightEngine.d.ts.map +1 -0
  21. package/dist/PlaywrightEngine.js +484 -0
  22. package/dist/PlaywrightEngine.js.map +1 -0
  23. package/dist/PlaywrightEngine.test.d.ts +2 -0
  24. package/dist/PlaywrightEngine.test.d.ts.map +1 -0
  25. package/dist/PlaywrightEngine.test.js +299 -0
  26. package/dist/PlaywrightEngine.test.js.map +1 -0
  27. package/dist/PuppeteerEngine.d.ts +21 -0
  28. package/dist/PuppeteerEngine.d.ts.map +1 -0
  29. package/dist/PuppeteerEngine.js +412 -0
  30. package/dist/PuppeteerEngine.js.map +1 -0
  31. package/dist/browser/BrowserPool.d.ts +29 -0
  32. package/dist/browser/BrowserPool.d.ts.map +1 -0
  33. package/dist/browser/BrowserPool.js +378 -0
  34. package/dist/browser/BrowserPool.js.map +1 -0
  35. package/dist/browser/PlaywrightBrowserPool.d.ts +78 -0
  36. package/dist/browser/PlaywrightBrowserPool.d.ts.map +1 -0
  37. package/dist/browser/PlaywrightBrowserPool.js +429 -0
  38. package/dist/browser/PlaywrightBrowserPool.js.map +1 -0
  39. package/dist/browser/PlaywrightBrowserPool.test.d.ts +2 -0
  40. package/dist/browser/PlaywrightBrowserPool.test.d.ts.map +1 -0
  41. package/dist/browser/PlaywrightBrowserPool.test.js +422 -0
  42. package/dist/browser/PlaywrightBrowserPool.test.js.map +1 -0
  43. package/dist/errors.d.ts +20 -0
  44. package/dist/errors.d.ts.map +1 -0
  45. package/dist/errors.js +30 -0
  46. package/dist/errors.js.map +1 -0
  47. package/dist/index.d.ts +8 -0
  48. package/dist/index.d.ts.map +1 -0
  49. package/dist/index.js +5 -0
  50. package/dist/index.js.map +1 -0
  51. package/dist/types.d.ts +151 -0
  52. package/dist/types.d.ts.map +1 -0
  53. package/dist/types.js +2 -0
  54. package/dist/types.js.map +1 -0
  55. package/package.json +72 -0
package/dist/types.d.ts ADDED
@@ -0,0 +1,151 @@
1
+ import type { Browser as PlaywrightBrowser, BrowserContext } from "playwright";
2
+ /**
3
+ * Result object returned by an engine's fetchHTML method. Only statusCode and error are optional; every other field is always present.
4
+ */
5
+ export interface HTMLFetchResult {
6
+ /** The full HTML content of the fetched page. */
7
+ html: string;
8
+ /** The extracted content of the <title> tag, or an empty string if not found. */
9
+ title: string;
10
+ /** The final URL after any redirects. */
11
+ url: string;
12
+ /** Indicates if the result was served from the engine's cache. */
13
+ isFromCache: boolean;
14
+ /** The HTTP status code of the final response, if available. */
15
+ statusCode?: number;
16
+ /** Error object if the fetch failed after all retries. NOTE(review): html, title and url remain required even then; what they hold on failure is not specified in this declaration. */
17
+ error?: Error;
18
+ }
19
+ /**
20
+ * Metrics related to browser pool performance and status. One record is attached to each pooled browser via BrowserInstance.metrics.
21
+ */
22
+ export interface BrowserMetrics {
23
+ id: string; // identifier for the browser these metrics describe; presumably unique within the pool (TODO confirm)
24
+ engine?: "playwright" | string; // NOTE(review): the "playwright" literal is absorbed by string, so this is effectively an optional string
25
+ pagesCreated: number; // presumably the running count of pages opened on this browser (confirm against the pool)
26
+ activePages: number; // presumably the number of pages currently open (confirm against the pool)
27
+ lastUsed: Date; // presumably the time this browser last served a request (TODO confirm)
28
+ errors: number; // presumably a count of errors seen on this browser (TODO confirm)
29
+ totalRequests?: number; // optional; not every producer of metrics has to supply it
30
+ avgResponseTime?: number; // optional; units are not stated here, presumably milliseconds (TODO confirm)
31
+ createdAt: Date; // presumably the launch time of the browser (TODO confirm)
32
+ isHealthy: boolean; // NOTE(review): BrowserInstance carries its own isHealthy as well; which one is authoritative is not visible here
33
+ }
34
+ /**
35
+ * Internal representation of a Playwright browser instance within the pool: the browser handle, one browser context, and the metrics record kept for it.
36
+ */
37
+ export interface BrowserInstance {
38
+ browser: PlaywrightBrowser; // Playwright's Browser type, imported under the alias PlaywrightBrowser
39
+ context: BrowserContext; // a single Playwright BrowserContext; presumably created from `browser` (TODO confirm)
40
+ metrics: BrowserMetrics; // usage and health counters for this instance
41
+ isHealthy: boolean; // NOTE(review): duplicates metrics.isHealthy; confirm the two are kept in sync or which one callers should read
42
+ }
43
+ /**
44
+ * Configuration options for the PlaywrightEngine. Every option is optional; the default tags below only document intended defaults, which a type declaration cannot itself apply.
45
+ */
46
+ export interface PlaywrightEngineConfig {
47
+ /**
48
+ * Maximum number of Playwright pages to process concurrently.
49
+ * @default 3
50
+ */
51
+ concurrentPages?: number;
52
+ /**
53
+ * Maximum number of retry attempts for a failed fetch operation (excluding initial attempt).
54
+ * @default 3
55
+ */
56
+ maxRetries?: number;
57
+ /**
58
+ * Delay in milliseconds between retry attempts.
59
+ * @default 5000
60
+ */
61
+ retryDelay?: number;
62
+ /**
63
+ * Time-to-live for cached results in milliseconds. Set to 0 to disable.
64
+ * @default 900000 (15 minutes)
65
+ */
66
+ cacheTTL?: number;
67
+ /**
68
+ * If true, attempts a fast HTTP GET first before using Playwright. NOTE(review): despite the option name, the behaviour described is HTTP-first with Playwright as the fallback.
69
+ * @default true
70
+ */
71
+ useHttpFallback?: boolean;
72
+ /**
73
+ * If true, automatically retries failed requests for a domain in headed mode.
74
+ * @default false
75
+ */
76
+ useHeadedModeFallback?: boolean;
77
+ /**
78
+ * If true, requests initially block non-essential resources and skip human simulation.
79
+ * Can be overridden per-request via fetchHTML options.
80
+ * @default true
81
+ */
82
+ defaultFastMode?: boolean;
83
+ /**
84
+ * If true (and not in fastMode), attempts basic human-like interactions.
85
+ * @default true
86
+ */
87
+ simulateHumanBehavior?: boolean;
88
+ /**
89
+ * Maximum number of concurrent browser instances the pool manages.
90
+ * Passed to PlaywrightBrowserPool.
91
+ * @default 2
92
+ */
93
+ maxBrowsers?: number;
94
+ /**
95
+ * Maximum number of pages per browser context before recycling.
96
+ * Passed to PlaywrightBrowserPool.
97
+ * @default 6
98
+ */
99
+ maxPagesPerContext?: number;
100
+ /**
101
+ * Maximum age in ms a browser instance lives before recycling.
102
+ * Passed to PlaywrightBrowserPool.
103
+ * @default 1200000 (20 minutes)
104
+ */
105
+ maxBrowserAge?: number;
106
+ /**
107
+ * How often (in ms) the pool checks browser health.
108
+ * Passed to PlaywrightBrowserPool.
109
+ * @default 60000 (1 minute)
110
+ */
111
+ healthCheckInterval?: number;
112
+ /**
113
+ * List of domain glob patterns to block requests to. Overrides pool default.
114
+ * Passed to PlaywrightBrowserPool.
115
+ * @default [] (uses pool's defaults)
116
+ */
117
+ poolBlockedDomains?: string[];
118
+ /**
119
+ * List of Playwright resource types (e.g., 'image', 'font') to block. Overrides pool default.
120
+ * Passed to PlaywrightBrowserPool.
121
+ * @default [] (uses pool's defaults)
122
+ */
123
+ poolBlockedResourceTypes?: string[];
124
+ /**
125
+ * Proxy configuration for browser instances. When present, server is required; username and password are optional.
126
+ * Passed to PlaywrightBrowserPool.
127
+ * @default undefined
128
+ */
129
+ proxy?: {
130
+ /** Proxy server URL (e.g., "http://host:port", "socks5://user:pass@host:port"). NOTE(review): credentials can also be given via username/password below; precedence between the two forms is not specified here. */
131
+ server: string;
132
+ /** Optional proxy username. */
133
+ username?: string;
134
+ /** Optional proxy password. */
135
+ password?: string;
136
+ };
137
+ /**
138
+ * Forces the entire pool to launch browsers in headed (visible) mode.
139
+ * Passed to PlaywrightBrowserPool.
140
+ * @default false
141
+ */
142
+ useHeadedMode?: boolean;
143
+ }
144
+ /**
145
+ * Options that can be passed per-request to engine.fetchHTML(). Currently a single optional flag.
146
+ */
147
+ export interface FetchOptions {
148
+ /** Overrides the engine's defaultFastMode for this specific request. Presumably the engine default applies when omitted (confirm in the engine). */
149
+ fastMode?: boolean;
150
+ }
151
+ //# sourceMappingURL=types.d.ts.map
package/dist/types.d.ts.map ADDED
@@ -0,0 +1 @@
1
+ {"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,OAAO,IAAI,iBAAiB,EAAE,cAAc,EAAE,MAAM,YAAY,CAAC;AAE/E;;GAEG;AACH,MAAM,WAAW,eAAe;IAC9B,iDAAiD;IACjD,IAAI,EAAE,MAAM,CAAC;IACb,iFAAiF;IACjF,KAAK,EAAE,MAAM,CAAC;IACd,yCAAyC;IACzC,GAAG,EAAE,MAAM,CAAC;IACZ,kEAAkE;IAClE,WAAW,EAAE,OAAO,CAAC;IACrB,gEAAgE;IAChE,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,0DAA0D;IAC1D,KAAK,CAAC,EAAE,KAAK,CAAC;CACf;AAED;;GAEG;AACH,MAAM,WAAW,cAAc;IAC7B,EAAE,EAAE,MAAM,CAAC;IACX,MAAM,CAAC,EAAE,YAAY,GAAG,MAAM,CAAC;IAC/B,YAAY,EAAE,MAAM,CAAC;IACrB,WAAW,EAAE,MAAM,CAAC;IACpB,QAAQ,EAAE,IAAI,CAAC;IACf,MAAM,EAAE,MAAM,CAAC;IACf,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,SAAS,EAAE,IAAI,CAAC;IAChB,SAAS,EAAE,OAAO,CAAC;CACpB;AAED;;GAEG;AACH,MAAM,WAAW,eAAe;IAC9B,OAAO,EAAE,iBAAiB,CAAC;IAC3B,OAAO,EAAE,cAAc,CAAC;IACxB,OAAO,EAAE,cAAc,CAAC;IACxB,SAAS,EAAE,OAAO,CAAC;CACpB;AAKD;;GAEG;AACH,MAAM,WAAW,sBAAsB;IACrC;;;OAGG;IACH,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB;;;OAGG;IACH,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB;;;OAGG;IACH,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB;;;OAGG;IACH,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB;;;OAGG;IACH,eAAe,CAAC,EAAE,OAAO,CAAC;IAC1B;;;OAGG;IACH,qBAAqB,CAAC,EAAE,OAAO,CAAC;IAChC;;;;OAIG;IACH,eAAe,CAAC,EAAE,OAAO,CAAC;IAC1B;;;OAGG;IACH,qBAAqB,CAAC,EAAE,OAAO,CAAC;IAIhC;;;;OAIG;IACH,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB;;;;OAIG;IACH,kBAAkB,CAAC,EAAE,MAAM,CAAC;IAC5B;;;;OAIG;IACH,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB;;;;OAIG;IACH,mBAAmB,CAAC,EAAE,MAAM,CAAC;IAC7B;;;;OAIG;IACH,kBAAkB,CAAC,EAAE,MAAM,EAAE,CAAC;IAC9B;;;;OAIG;IACH,wBAAwB,CAAC,EAAE,MAAM,EAAE,CAAC;IACpC;;;;OAIG;IACH,KAAK,CAAC,EAAE;QACN,mFAAmF;QACnF,MAAM,EAAE,MAAM,CAAC;QACf,+BAA+B;QAC/B,QAAQ,CAAC,EAAE,MAAM,CAAC;QAClB,+BAA+B;QAC/B,QAAQ,CAAC,EAAE,MAAM,CAAC;KACnB,CAAC;IACF;;;;OAIG;IACH,aAAa,CAAC,EAAE,OAAO,CAAC;CACzB;AAED;;GAEG;AACH,MAAM,WAAW,YAAY;IAC3B,wEAAwE;IACxE,QAAQ,CAAC,EAAE,OAAO,CAAC;CACpB"}
package/dist/types.js ADDED
@@ -0,0 +1,2 @@
1
+ export {};
2
+ //# sourceMappingURL=types.js.map
package/dist/types.js.map ADDED
@@ -0,0 +1 @@
1
+ {"version":3,"file":"types.js","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":""}
package/package.json ADDED
@@ -0,0 +1,72 @@
1
+ {
2
+ "name": "@purepageio/fetch-engines",
3
+ "version": "0.1.0",
4
+ "type": "module",
5
+ "description": "A collection of configurable engines for fetching HTML content using fetch or Playwright.",
6
+ "main": "dist/index.js",
7
+ "types": "dist/index.d.ts",
8
+ "files": [
9
+ "dist",
10
+ "README.md",
11
+ "LICENSE"
12
+ ],
13
+ "dependencies": {
14
+ "axios": "^1.6.8",
15
+ "jsdom": "^24.0.0",
16
+ "p-queue": "^7.4.1",
17
+ "playwright": "^1.43.0",
18
+ "playwright-extra": "^4.3.6",
19
+ "puppeteer-extra-plugin-stealth": "^2.11.2",
20
+ "user-agents": "^1.1.208",
21
+ "uuid": "^11.1.0"
22
+ },
23
+ "devDependencies": {
24
+ "@types/axios": "^0.14.0",
25
+ "@types/jsdom": "^21.1.6",
26
+ "@types/node": "^18.0.0",
27
+ "@types/user-agents": "^1.0.4",
28
+ "@types/uuid": "^10.0.0",
29
+ "@typescript-eslint/eslint-plugin": "^8.29.0",
30
+ "@typescript-eslint/parser": "^8.29.0",
31
+ "eslint": "^9.23.0",
32
+ "eslint-config-prettier": "^10.1.1",
33
+ "eslint-plugin-prettier": "^5.2.5",
34
+ "globals": "^16.0.0",
35
+ "prettier": "^3.5.3",
36
+ "ts-node": "^10.9.2",
37
+ "typescript": "^5.3.3",
38
+ "typescript-eslint": "^8.29.0",
39
+ "vitest": "^3.1.1"
40
+ },
41
+ "engines": {
42
+ "node": ">=22.0.0"
43
+ },
44
+ "repository": {
45
+ "type": "git",
46
+ "url": "git+https://github.com/purepageio/fetch-engines.git"
47
+ },
48
+ "keywords": [
49
+ "fetch",
50
+ "scrape",
51
+ "playwright",
52
+ "headless",
53
+ "browser",
54
+ "automation",
55
+ "engine"
56
+ ],
57
+ "author": "Purepage",
58
+ "license": "MIT",
59
+ "bugs": {
60
+ "url": "https://github.com/purepageio/fetch-engines/issues"
61
+ },
62
+ "homepage": "https://github.com/purepageio/fetch-engines#readme",
63
+ "publishConfig": {
64
+ "access": "public"
65
+ },
66
+ "scripts": {
67
+ "build": "tsc",
68
+ "lint": "eslint \"src/**/*.ts\" \"examples/**/*.ts\"",
69
+ "format": "prettier --write \"src/**/*.ts\" \"examples/**/*.ts\" \"*.{js,cjs,json,md}\"",
70
+ "test": "vitest run"
71
+ }
72
+ }