@purepageio/fetch-engines 0.2.9 → 0.2.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. package/dist/FetchEngine.d.ts +47 -0
  2. package/dist/FetchEngine.d.ts.map +1 -0
  3. package/dist/FetchEngine.js +114 -0
  4. package/dist/FetchEngine.js.map +1 -0
  5. package/dist/FetchEngine.test.d.ts +2 -0
  6. package/dist/FetchEngine.test.d.ts.map +1 -0
  7. package/dist/FetchEngine.test.js +44 -0
  8. package/dist/FetchEngine.test.js.map +1 -0
  9. package/dist/HybridEngine.d.ts +21 -0
  10. package/dist/HybridEngine.d.ts.map +1 -0
  11. package/dist/HybridEngine.js +62 -0
  12. package/dist/HybridEngine.js.map +1 -0
  13. package/dist/IEngine.d.ts +22 -0
  14. package/dist/IEngine.d.ts.map +1 -0
  15. package/dist/IEngine.js +2 -0
  16. package/dist/IEngine.js.map +1 -0
  17. package/dist/PlaywrightEngine.d.ts +90 -0
  18. package/dist/PlaywrightEngine.d.ts.map +1 -0
  19. package/dist/PlaywrightEngine.js +558 -0
  20. package/dist/PlaywrightEngine.js.map +1 -0
  21. package/dist/PlaywrightEngine.test.d.ts +2 -0
  22. package/dist/PlaywrightEngine.test.d.ts.map +1 -0
  23. package/dist/PlaywrightEngine.test.js +207 -0
  24. package/dist/PlaywrightEngine.test.js.map +1 -0
  25. package/dist/PuppeteerEngine.d.ts +21 -0
  26. package/dist/PuppeteerEngine.d.ts.map +1 -0
  27. package/dist/PuppeteerEngine.js +412 -0
  28. package/dist/PuppeteerEngine.js.map +1 -0
  29. package/dist/browser/BrowserPool.d.ts +29 -0
  30. package/dist/browser/BrowserPool.d.ts.map +1 -0
  31. package/dist/browser/BrowserPool.js +378 -0
  32. package/dist/browser/BrowserPool.js.map +1 -0
  33. package/dist/browser/PlaywrightBrowserPool.d.ts +48 -0
  34. package/dist/browser/PlaywrightBrowserPool.d.ts.map +1 -0
  35. package/dist/browser/PlaywrightBrowserPool.js +378 -0
  36. package/dist/browser/PlaywrightBrowserPool.js.map +1 -0
  37. package/dist/browser/PlaywrightBrowserPool.test.d.ts +2 -0
  38. package/dist/browser/PlaywrightBrowserPool.test.d.ts.map +1 -0
  39. package/dist/browser/PlaywrightBrowserPool.test.js +422 -0
  40. package/dist/browser/PlaywrightBrowserPool.test.js.map +1 -0
  41. package/dist/errors.d.ts +20 -0
  42. package/dist/errors.d.ts.map +1 -0
  43. package/dist/errors.js +30 -0
  44. package/dist/errors.js.map +1 -0
  45. package/dist/index.d.ts +8 -323
  46. package/dist/index.d.ts.map +1 -0
  47. package/dist/index.js +4 -1617
  48. package/dist/index.js.map +1 -1
  49. package/dist/types.d.ts +167 -0
  50. package/dist/types.d.ts.map +1 -0
  51. package/dist/types.js +2 -0
  52. package/dist/types.js.map +1 -0
  53. package/dist/utils/markdown-converter.d.ts +31 -0
  54. package/dist/utils/markdown-converter.d.ts.map +1 -0
  55. package/dist/utils/markdown-converter.js +796 -0
  56. package/dist/utils/markdown-converter.js.map +1 -0
  57. package/package.json +5 -14
  58. package/dist/index.cjs +0 -1657
  59. package/dist/index.cjs.map +0 -1
  60. package/dist/index.d.cts +0 -323
@@ -0,0 +1,47 @@
1
+ import type { HTMLFetchResult, BrowserMetrics, FetchEngineOptions } from "./types.js";
2
+ import type { IEngine } from "./IEngine.js";
3
+ import { FetchError } from "./errors.js";
4
+ /**
5
+ * Custom error class for HTTP errors from FetchEngine.
6
+ */
7
+ export declare class FetchEngineHttpError extends FetchError {
8
+ /** HTTP status code recorded on this error. */
+ readonly statusCode: number;
9
+ /**
+ * @param message Human-readable error message.
+ * @param statusCode HTTP status code to record on the error.
+ */
+ constructor(message: string, statusCode: number);
10
+ }
11
+ /**
12
+ * FetchEngine - A lightweight engine for fetching HTML content using the standard `fetch` API.
13
+ *
14
+ * Ideal for fetching content from static websites or APIs where JavaScript execution is not required.
15
+ * It does not support advanced configurations like retries, caching, or proxies directly.
16
+ */
17
+ export declare class FetchEngine implements IEngine {
18
+ private readonly options;
19
+ private static readonly DEFAULT_OPTIONS;
20
+ /**
21
+ * Creates an instance of FetchEngine.
22
+ * @param options Configuration options for the FetchEngine.
23
+ */
24
+ constructor(options?: FetchEngineOptions);
25
+ /**
26
+ * Fetches HTML or converts to Markdown from the specified URL.
27
+ *
28
+ * @param url The URL to fetch.
+ * @param options Optional per-call options (declared as FetchEngineOptions).
29
+ * @returns A Promise resolving to an HTMLFetchResult object.
30
+ * @throws {FetchEngineHttpError} If the HTTP response status is not ok (e.g., 404, 500).
31
+ * @throws {FetchError} If the content type is not HTML or for other network errors.
32
+ */
33
+ fetchHTML(url: string, options?: FetchEngineOptions): Promise<HTMLFetchResult>;
34
+ /**
35
+ * Cleans up resources used by the engine.
36
+ * For FetchEngine, this is a no-op as it doesn't manage persistent resources.
37
+ * @returns A Promise that resolves when cleanup is complete.
38
+ */
39
+ cleanup(): Promise<void>;
40
+ /**
41
+ * Retrieves metrics for the engine.
42
+ * FetchEngine does not manage browsers, so it returns an empty array.
43
+ * @returns An empty array.
44
+ */
45
+ getMetrics(): BrowserMetrics[];
46
+ }
47
+ //# sourceMappingURL=FetchEngine.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"FetchEngine.d.ts","sourceRoot":"","sources":["../src/FetchEngine.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,eAAe,EAAE,cAAc,EAAE,kBAAkB,EAAE,MAAM,YAAY,CAAC;AACtF,OAAO,KAAK,EAAE,OAAO,EAAE,MAAM,cAAc,CAAC;AAG5C,OAAO,EAAE,UAAU,EAAE,MAAM,aAAa,CAAC;AAEzC;;GAEG;AACH,qBAAa,oBAAqB,SAAQ,UAAU;aAGhC,UAAU,EAAE,MAAM;gBADlC,OAAO,EAAE,MAAM,EACC,UAAU,EAAE,MAAM;CAKrC;AAED;;;;;GAKG;AACH,qBAAa,WAAY,YAAW,OAAO;IACzC,OAAO,CAAC,QAAQ,CAAC,OAAO,CAA+B;IAEvD,OAAO,CAAC,MAAM,CAAC,QAAQ,CAAC,eAAe,CAErC;IAEF;;;OAGG;gBACS,OAAO,GAAE,kBAAuB;IAI5C;;;;;;;OAOG;IACG,SAAS,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,kBAAkB,GAAG,OAAO,CAAC,eAAe,CAAC;IAiEpF;;;;OAIG;IACG,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC;IAI9B;;;;OAIG;IACH,UAAU,IAAI,cAAc,EAAE;CAG/B"}
@@ -0,0 +1,114 @@
1
+ import { MarkdownConverter } from "./utils/markdown-converter.js"; // Import the converter
2
+ import { FetchError } from "./errors.js"; // Only import FetchError
3
+ /**
4
+ * Custom error class for HTTP errors from FetchEngine.
5
+ */
6
export class FetchEngineHttpError extends FetchError {
    // HTTP status code of the response that triggered this error.
    statusCode;
    /**
     * @param message Human-readable error message.
     * @param statusCode HTTP status code; forwarded to the base class and stored on the instance.
     */
    constructor(message, statusCode) {
        super(message, "ERR_HTTP_ERROR", undefined, statusCode);
        // Assigned in the same order as separate statements would be: statusCode first, then name.
        Object.assign(this, { statusCode, name: "FetchEngineHttpError" });
    }
}
14
+ /**
15
+ * FetchEngine - A lightweight engine for fetching HTML content using the standard `fetch` API.
16
+ *
17
+ * Ideal for fetching content from static websites or APIs where JavaScript execution is not required.
18
+ * It does not support advanced configurations like retries, caching, or proxies directly.
19
+ */
20
export class FetchEngine {
    // Effective engine options: DEFAULT_OPTIONS overlaid with the constructor argument.
    options;
    // Defaults used when the caller does not supply an option (Markdown conversion is off).
    static DEFAULT_OPTIONS = {
        markdown: false,
    };
    /**
     * Creates an instance of FetchEngine.
     * @param options Configuration options for the FetchEngine; merged over DEFAULT_OPTIONS.
     */
    constructor(options = {}) {
        this.options = { ...FetchEngine.DEFAULT_OPTIONS, ...options };
    }
    /**
     * Fetches HTML from the specified URL, optionally converting it to Markdown.
     *
     * @param url The URL to fetch.
     * @param options Optional per-call options; merged over the constructor options.
     * @returns A Promise resolving to a result object with `content`, `contentType`
     *          ("html" or "markdown"), `title`, final `url`, `isFromCache`, `statusCode` and `error`.
     * @throws {FetchEngineHttpError} If the HTTP response status is not ok (e.g., 404, 500).
     * @throws {FetchError} With code "ERR_NON_HTML_CONTENT" if the Content-Type is not text/html,
     *         or with code "ERR_FETCH_FAILED" for any other failure (e.g. network errors).
     */
    async fetchHTML(url, options) {
        const effectiveOptions = { ...this.options, ...options }; // Combine constructor and call options
        let response;
        try {
            response = await fetch(url, {
                redirect: "follow",
                headers: {
                    // Standard browser-like headers
                    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
                    Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
                    "Accept-Language": "en-US,en;q=0.9",
                },
            });
            if (!response.ok) {
                throw new FetchEngineHttpError(`HTTP error! status: ${response.status}`, response.status);
            }
            const contentTypeHeader = response.headers.get("content-type");
            // Media type names are case-insensitive, so normalise the header value before
            // matching; otherwise a valid "Text/HTML; charset=UTF-8" would be rejected.
            if (!contentTypeHeader || !contentTypeHeader.toLowerCase().includes("text/html")) {
                throw new FetchError("Content-Type is not text/html", "ERR_NON_HTML_CONTENT");
            }
            const html = await response.text();
            const titleMatch = html.match(/<title[^>]*>([^<]+)<\/title>/i);
            const title = titleMatch ? titleMatch[1].trim() : null;
            let finalContent = html;
            let finalContentType = "html";
            if (effectiveOptions.markdown) {
                try {
                    const converter = new MarkdownConverter();
                    finalContent = converter.convert(html);
                    finalContentType = "markdown";
                }
                catch (conversionError) {
                    console.error(`Markdown conversion failed for ${url} (FetchEngine):`, conversionError);
                    // Deliberate best-effort: fall back to the original HTML on conversion error
                }
            }
            return {
                content: finalContent,
                contentType: finalContentType,
                title: title,
                url: response.url, // Use the final URL after redirects
                isFromCache: false,
                statusCode: response.status,
                error: undefined,
            };
        }
        catch (error) {
            // Re-throw the errors raised deliberately above so callers see them unwrapped
            if (error instanceof FetchEngineHttpError ||
                (error instanceof FetchError && error.code === "ERR_NON_HTML_CONTENT")) {
                throw error;
            }
            // Wrap other/unexpected errors, keeping the original Error as the third argument
            const message = error instanceof Error ? error.message : "Unknown fetch error";
            throw new FetchError(`Fetch failed: ${message}`, "ERR_FETCH_FAILED", error instanceof Error ? error : undefined);
        }
    }
    /**
     * Cleans up resources used by the engine.
     * For FetchEngine, this is a no-op as it doesn't manage persistent resources.
     * @returns A Promise that resolves when cleanup is complete.
     */
    async cleanup() {
        // Nothing to release; the async method already resolves with undefined.
    }
    /**
     * Retrieves metrics for the engine.
     * FetchEngine does not manage browsers, so it returns an empty array.
     * @returns An empty array.
     */
    getMetrics() {
        return [];
    }
}
114
+ //# sourceMappingURL=FetchEngine.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"FetchEngine.js","sourceRoot":"","sources":["../src/FetchEngine.ts"],"names":[],"mappings":"AAGA,OAAO,EAAE,iBAAiB,EAAE,MAAM,+BAA+B,CAAC,CAAC,uBAAuB;AAC1F,OAAO,EAAE,UAAU,EAAE,MAAM,aAAa,CAAC,CAAC,yBAAyB;AAEnE;;GAEG;AACH,MAAM,OAAO,oBAAqB,SAAQ,UAAU;IAGhC;IAFlB,YACE,OAAe,EACC,UAAkB;QAElC,KAAK,CAAC,OAAO,EAAE,gBAAgB,EAAE,SAAS,EAAE,UAAU,CAAC,CAAC;QAFxC,eAAU,GAAV,UAAU,CAAQ;QAGlC,IAAI,CAAC,IAAI,GAAG,sBAAsB,CAAC;IACrC,CAAC;CACF;AAED;;;;;GAKG;AACH,MAAM,OAAO,WAAW;IACL,OAAO,CAA+B;IAE/C,MAAM,CAAU,eAAe,GAAiC;QACtE,QAAQ,EAAE,KAAK;KAChB,CAAC;IAEF;;;OAGG;IACH,YAAY,UAA8B,EAAE;QAC1C,IAAI,CAAC,OAAO,GAAG,EAAE,GAAG,WAAW,CAAC,eAAe,EAAE,GAAG,OAAO,EAAE,CAAC;IAChE,CAAC;IAED;;;;;;;OAOG;IACH,KAAK,CAAC,SAAS,CAAC,GAAW,EAAE,OAA4B;QACvD,MAAM,gBAAgB,GAAG,EAAE,GAAG,IAAI,CAAC,OAAO,EAAE,GAAG,OAAO,EAAE,CAAC,CAAC,uCAAuC;QACjG,IAAI,QAAkB,CAAC;QACvB,IAAI,CAAC;YACH,QAAQ,GAAG,MAAM,KAAK,CAAC,GAAG,EAAE;gBAC1B,QAAQ,EAAE,QAAQ;gBAClB,OAAO,EAAE;oBACP,gCAAgC;oBAChC,YAAY,EACV,iHAAiH;oBACnH,MAAM,EAAE,kGAAkG;oBAC1G,iBAAiB,EAAE,gBAAgB;iBACpC;aACF,CAAC,CAAC;YAEH,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;gBACjB,MAAM,IAAI,oBAAoB,CAAC,uBAAuB,QAAQ,CAAC,MAAM,EAAE,EAAE,QAAQ,CAAC,MAAM,CAAC,CAAC;YAC5F,CAAC;YAED,MAAM,iBAAiB,GAAG,QAAQ,CAAC,OAAO,CAAC,GAAG,CAAC,cAAc,CAAC,CAAC;YAC/D,IAAI,CAAC,iBAAiB,IAAI,CAAC,iBAAiB,CAAC,QAAQ,CAAC,WAAW,CAAC,EAAE,CAAC;gBACnE,MAAM,IAAI,UAAU,CAAC,+BAA+B,EAAE,sBAAsB,CAAC,CAAC;YAChF,CAAC;YAED,MAAM,IAAI,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;YACnC,MAAM,UAAU,GAAG,IAAI,CAAC,KAAK,CAAC,+BAA+B,CAAC,CAAC;YAC/D,MAAM,KAAK,GAAG,UAAU,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC;YAEvD,IAAI,YAAY,GAAG,IAAI,CAAC;YACxB,IAAI,gBAAgB,GAAwB,MAAM,CAAC;YAEnD,IAAI,gBAAgB,CAAC,QAAQ,EAAE,CAAC;gBAC9B,IAAI,CAAC;oBACH,MAAM,SAAS,GAAG,IAAI,iBAAiB,EAAE,CAAC;oBAC1C,YAAY,GAAG,SAAS,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;oBACvC,gBAAgB,GAAG,UAAU,CAAC;gBAChC,CAAC;gBAAC,OAAO,eAAoB,EAAE,CAAC;oBAC9B,OAAO,CAAC,KAAK,CAAC,kCAAkC,GAAG,iBAAiB,EAAE,eAAe,CAAC,CAAC;oBACvF,gDAAgD;gBAClD,CAAC;YACH,CAAC;YAED,OAAO;gBACL,OAAO
,EAAE,YAAY;gBACrB,WAAW,EAAE,gBAAgB;gBAC7B,KAAK,EAAE,KAAK;gBACZ,GAAG,EAAE,QAAQ,CAAC,GAAG,EAAE,oCAAoC;gBACvD,WAAW,EAAE,KAAK;gBAClB,UAAU,EAAE,QAAQ,CAAC,MAAM;gBAC3B,KAAK,EAAE,SAAS;aACjB,CAAC;QACJ,CAAC;QAAC,OAAO,KAAU,EAAE,CAAC;YACpB,0CAA0C;YAC1C,IACE,KAAK,YAAY,oBAAoB;gBACrC,CAAC,KAAK,YAAY,UAAU,IAAI,KAAK,CAAC,IAAI,KAAK,sBAAsB,CAAC,EACtE,CAAC;gBACD,MAAM,KAAK,CAAC;YACd,CAAC;YACD,+BAA+B;YAC/B,MAAM,OAAO,GAAG,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,qBAAqB,CAAC;YAC/E,MAAM,IAAI,UAAU,CAAC,iBAAiB,OAAO,EAAE,EAAE,kBAAkB,EAAE,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC;QACnH,CAAC;IACH,CAAC;IAED;;;;OAIG;IACH,KAAK,CAAC,OAAO;QACX,OAAO,OAAO,CAAC,OAAO,EAAE,CAAC;IAC3B,CAAC;IAED;;;;OAIG;IACH,UAAU;QACR,OAAO,EAAE,CAAC;IACZ,CAAC"}
@@ -0,0 +1,2 @@
1
+ export {};
2
+ //# sourceMappingURL=FetchEngine.test.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"FetchEngine.test.d.ts","sourceRoot":"","sources":["../src/FetchEngine.test.ts"],"names":[],"mappings":""}
@@ -0,0 +1,44 @@
1
+ import { describe, it, expect } from "vitest";
2
+ import { FetchEngine } from "./FetchEngine.js";
3
// Integration tests for FetchEngine. They issue real requests to example.com and
// httpbin.org, so they need network access and a global `fetch`.
describe("FetchEngine", () => {
    it("should fetch HTML and extract title from a static page", async () => {
        const engine = new FetchEngine();
        const url = "http://example.com";
        const expectedUrl = "http://example.com/"; // Expect trailing slash
        try {
            const result = await engine.fetchHTML(url);
            expect(result).toBeDefined();
            expect(result.url).toBe(expectedUrl); // Use expectedUrl
            expect(result.title).toBe("Example Domain");
            // FetchEngine returns the page body under `content` (there is no `html` key).
            expect(result.contentType).toBe("html");
            expect(result.content).toContain("<title>Example Domain</title>");
            expect(result.content).toContain("<h1>Example Domain</h1>");
        }
        catch (error) {
            // If the test environment doesn't have fetch or network access, this might fail.
            // In a real CI/CD, ensure network access or mock fetch.
            console.warn("FetchEngine test failed, potentially due to network issues or missing fetch API:", error);
            // Re-throw to fail the test if fetch was expected to work
            throw error;
        }
    });
    it("should throw an error for non-HTML content", async () => {
        const engine = new FetchEngine();
        // Use a URL known to return non-HTML content, e.g., a JSON endpoint or an image
        const url = "https://httpbin.org/json";
        // The message must match what FetchEngine throws for a non-HTML Content-Type.
        await expect(engine.fetchHTML(url)).rejects.toThrow("Content-Type is not text/html");
    });
    it("should throw an error for non-existent domains", async () => {
        const engine = new FetchEngine();
        const url = "http://domain-that-does-not-exist-fdsahjkl.xyz";
        // Expect the fetchHTML method to reject (error message might vary)
        await expect(engine.fetchHTML(url)).rejects.toThrow();
    });
    it("should handle http errors", async () => {
        const engine = new FetchEngine();
        const url = "https://httpbin.org/status/404"; // URL that returns 404
        await expect(engine.fetchHTML(url)).rejects.toThrow(/HTTP error! status: 404/);
    });
    // Add more tests: SPA detection warning, etc.
});
44
+ //# sourceMappingURL=FetchEngine.test.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"FetchEngine.test.js","sourceRoot":"","sources":["../src/FetchEngine.test.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,EAAE,EAAE,MAAM,EAAE,MAAM,QAAQ,CAAC;AAC9C,OAAO,EAAE,WAAW,EAAE,MAAM,kBAAkB,CAAC;AAE/C,QAAQ,CAAC,aAAa,EAAE,GAAG,EAAE;IAC3B,EAAE,CAAC,wDAAwD,EAAE,KAAK,IAAI,EAAE;QACtE,MAAM,MAAM,GAAG,IAAI,WAAW,EAAE,CAAC;QACjC,MAAM,GAAG,GAAG,oBAAoB,CAAC;QACjC,MAAM,WAAW,GAAG,qBAAqB,CAAC,CAAC,wBAAwB;QAEnE,IAAI,CAAC;YACH,MAAM,MAAM,GAAG,MAAM,MAAM,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC;YAE3C,MAAM,CAAC,MAAM,CAAC,CAAC,WAAW,EAAE,CAAC;YAC7B,MAAM,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC,CAAC,kBAAkB;YACxD,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,gBAAgB,CAAC,CAAC;YAC5C,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,SAAS,CAAC,+BAA+B,CAAC,CAAC;YAC/D,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,SAAS,CAAC,yBAAyB,CAAC,CAAC;QAC3D,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,iFAAiF;YACjF,wDAAwD;YACxD,OAAO,CAAC,IAAI,CAAC,kFAAkF,EAAE,KAAK,CAAC,CAAC;YACxG,0DAA0D;YAC1D,MAAM,KAAK,CAAC;QACd,CAAC;IACH,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,4CAA4C,EAAE,KAAK,IAAI,EAAE;QAC1D,MAAM,MAAM,GAAG,IAAI,WAAW,EAAE,CAAC;QACjC,gFAAgF;QAChF,MAAM,GAAG,GAAG,0BAA0B,CAAC;QAEvC,wCAAwC;QACxC,MAAM,MAAM,CAAC,MAAM,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,OAAO,CAAC,OAAO,CAAC,kBAAkB,CAAC,CAAC;IAC1E,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,gDAAgD,EAAE,KAAK,IAAI,EAAE;QAC9D,MAAM,MAAM,GAAG,IAAI,WAAW,EAAE,CAAC;QACjC,MAAM,GAAG,GAAG,gDAAgD,CAAC;QAE7D,mEAAmE;QACnE,MAAM,MAAM,CAAC,MAAM,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,OAAO,CAAC,OAAO,EAAE,CAAC;IACxD,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,2BAA2B,EAAE,KAAK,IAAI,EAAE;QACzC,MAAM,MAAM,GAAG,IAAI,WAAW,EAAE,CAAC;QACjC,MAAM,GAAG,GAAG,gCAAgC,CAAC,CAAC,uBAAuB;QAErE,MAAM,MAAM,CAAC,MAAM,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,OAAO,CAAC,OAAO,CAAC,yBAAyB,CAAC,CAAC;IACjF,CAAC,CAAC,CAAC;IAEH,8CAA8C;AAChD,CAAC,CAAC,CAAC"}
@@ -0,0 +1,21 @@
1
+ import type { IEngine } from "./IEngine.js";
2
+ import type { HTMLFetchResult, PlaywrightEngineConfig, FetchOptions, BrowserMetrics } from "./types.js";
3
+ /**
4
+ * HybridEngine - Tries FetchEngine first, falls back to PlaywrightEngine on failure.
5
+ */
6
+ export declare class HybridEngine implements IEngine {
7
+ private readonly fetchEngine;
8
+ private readonly playwrightEngine;
9
+ private readonly config;
10
+ /**
+ * @param config Optional configuration, declared as PlaywrightEngineConfig.
+ */
+ constructor(config?: PlaywrightEngineConfig);
11
+ /**
+ * Fetches the given URL and resolves to an HTMLFetchResult.
+ * @param url The URL to fetch.
+ * @param options Optional per-request options, declared as FetchOptions.
+ */
+ fetchHTML(url: string, options?: FetchOptions): Promise<HTMLFetchResult>;
12
+ /**
13
+ * Delegates getMetrics to the PlaywrightEngine.
14
+ */
15
+ getMetrics(): BrowserMetrics[];
16
+ /**
17
+ * Calls cleanup on both underlying engines.
18
+ */
19
+ cleanup(): Promise<void>;
20
+ }
21
+ //# sourceMappingURL=HybridEngine.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"HybridEngine.d.ts","sourceRoot":"","sources":["../src/HybridEngine.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,OAAO,EAAE,MAAM,cAAc,CAAC;AAC5C,OAAO,KAAK,EAAE,eAAe,EAAE,sBAAsB,EAAE,YAAY,EAAE,cAAc,EAAE,MAAM,YAAY,CAAC;AAExG;;GAEG;AACH,qBAAa,YAAa,YAAW,OAAO;IAC1C,OAAO,CAAC,QAAQ,CAAC,WAAW,CAAc;IAC1C,OAAO,CAAC,QAAQ,CAAC,gBAAgB,CAAmB;IACpD,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAyB;gBAEpC,MAAM,GAAE,sBAA2B;IAQzC,SAAS,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,GAAE,YAAiB,GAAG,OAAO,CAAC,eAAe,CAAC;IA8BlF;;OAEG;IACH,UAAU,IAAI,cAAc,EAAE;IAI9B;;OAEG;IACG,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC;CAM/B"}
@@ -0,0 +1,62 @@
1
+ import { FetchEngine } from "./FetchEngine.js";
2
+ import { PlaywrightEngine } from "./PlaywrightEngine.js";
3
+ /**
4
+ * HybridEngine - Tries FetchEngine first, falls back to PlaywrightEngine on failure.
5
+ */
6
export class HybridEngine {
    // Lightweight engine tried first for every request.
    fetchEngine;
    // Browser-backed engine used only when the fetch attempt throws.
    playwrightEngine;
    // Constructor config, kept so it can be merged with per-request options on fallback.
    config;
    /**
     * @param config Shared configuration. Only `markdown` is passed to FetchEngine;
     *               the whole object is passed to PlaywrightEngine.
     */
    constructor(config = {}) {
        this.fetchEngine = new FetchEngine({ markdown: config.markdown });
        this.playwrightEngine = new PlaywrightEngine(config);
        this.config = config;
    }
    /**
     * Fetches a URL with FetchEngine and falls back to PlaywrightEngine if that throws.
     *
     * @param url The URL to fetch.
     * @param options Per-request options. They are applied to the Playwright fallback only;
     *                the FetchEngine attempt uses its constructor configuration.
     * @returns The result of whichever engine succeeded.
     * @throws Whatever the PlaywrightEngine fallback throws when both engines fail.
     */
    async fetchHTML(url, options = {}) {
        try {
            // FetchEngine applies its own (constructor-level) markdown setting.
            return await this.fetchEngine.fetchHTML(url);
        }
        catch (fetchError) {
            // Build the message defensively: reading `.message` on a null/undefined rejection
            // would throw here and prevent the fallback from ever running.
            console.warn(`FetchEngine failed for ${url}: ${describeEngineError(fetchError)}. Falling back to PlaywrightEngine.`);
            // Merge constructor config with per-request options for the Playwright fallback
            const playwrightOptions = {
                ...this.config, // Start with base config given to HybridEngine
                ...options, // Override with per-request options
            };
            try {
                return await this.playwrightEngine.fetchHTML(url, playwrightOptions);
            }
            catch (playwrightError) {
                console.error(`PlaywrightEngine fallback failed for ${url}: ${describeEngineError(playwrightError)}`);
                // The Playwright error is the last one encountered, so it is the one surfaced.
                throw playwrightError;
            }
        }
    }
    /**
     * Delegates getMetrics to the PlaywrightEngine.
     */
    getMetrics() {
        return this.playwrightEngine.getMetrics();
    }
    /**
     * Calls cleanup on both underlying engines.
     * Uses allSettled so a failure in one cleanup does not skip the other.
     */
    async cleanup() {
        await Promise.allSettled([
            this.fetchEngine.cleanup(), // Although a no-op, call for consistency
            this.playwrightEngine.cleanup(),
        ]);
    }
}
// Returns a loggable message for any thrown value without assuming it is an Error.
function describeEngineError(error) {
    if (error != null && typeof error.message === "string") {
        return error.message;
    }
    return String(error);
}
62
+ //# sourceMappingURL=HybridEngine.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"HybridEngine.js","sourceRoot":"","sources":["../src/HybridEngine.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,WAAW,EAAE,MAAM,kBAAkB,CAAC;AAC/C,OAAO,EAAE,gBAAgB,EAAE,MAAM,uBAAuB,CAAC;AAIzD;;GAEG;AACH,MAAM,OAAO,YAAY;IACN,WAAW,CAAc;IACzB,gBAAgB,CAAmB;IACnC,MAAM,CAAyB,CAAC,sDAAsD;IAEvG,YAAY,SAAiC,EAAE;QAC7C,4CAA4C;QAC5C,gEAAgE;QAChE,IAAI,CAAC,WAAW,GAAG,IAAI,WAAW,CAAC,EAAE,QAAQ,EAAE,MAAM,CAAC,QAAQ,EAAE,CAAC,CAAC;QAClE,IAAI,CAAC,gBAAgB,GAAG,IAAI,gBAAgB,CAAC,MAAM,CAAC,CAAC;QACrD,IAAI,CAAC,MAAM,GAAG,MAAM,CAAC,CAAC,0BAA0B;IAClD,CAAC;IAED,KAAK,CAAC,SAAS,CAAC,GAAW,EAAE,UAAwB,EAAE;QACrD,uFAAuF;QACvF,IAAI,CAAC;YACH,MAAM,WAAW,GAAG,MAAM,IAAI,CAAC,WAAW,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC;YAC1D,sFAAsF;YACtF,sFAAsF;YACtF,OAAO,WAAW,CAAC;QACrB,CAAC;QAAC,OAAO,UAAe,EAAE,CAAC;YACzB,OAAO,CAAC,IAAI,CAAC,0BAA0B,GAAG,KAAK,UAAU,CAAC,OAAO,qCAAqC,CAAC,CAAC;YAExG,4EAA4E;YAC5E,MAAM,iBAAiB,GAAiB;gBACtC,GAAG,IAAI,CAAC,MAAM,EAAE,+CAA+C;gBAC/D,GAAG,OAAO,EAAE,oCAAoC;aACjD,CAAC;YAEF,IAAI,CAAC;gBACH,0CAA0C;gBAC1C,MAAM,gBAAgB,GAAG,MAAM,IAAI,CAAC,gBAAgB,CAAC,SAAS,CAAC,GAAG,EAAE,iBAAiB,CAAC,CAAC;gBACvF,OAAO,gBAAgB,CAAC;YAC1B,CAAC;YAAC,OAAO,eAAoB,EAAE,CAAC;gBAC9B,mCAAmC;gBACnC,OAAO,CAAC,KAAK,CAAC,wCAAwC,GAAG,KAAK,eAAe,CAAC,OAAO,EAAE,CAAC,CAAC;gBACzF,sDAAsD;gBACtD,iEAAiE;gBACjE,MAAM,eAAe,CAAC;YACxB,CAAC;QACH,CAAC;IACH,CAAC;IAED;;OAEG;IACH,UAAU;QACR,OAAO,IAAI,CAAC,gBAAgB,CAAC,UAAU,EAAE,CAAC;IAC5C,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,OAAO;QACX,MAAM,OAAO,CAAC,UAAU,CAAC;YACvB,IAAI,CAAC,WAAW,CAAC,OAAO,EAAE,EAAE,yCAAyC;YACrE,IAAI,CAAC,gBAAgB,CAAC,OAAO,EAAE;SAChC,CAAC,CAAC;IACL,CAAC;CACF"}
@@ -0,0 +1,22 @@
1
+ import type { HTMLFetchResult, BrowserMetrics } from "./types.js";
2
+ /**
3
+ * Interface for engines that can fetch HTML content from URLs.
+ * Not every implementer is browser-based: FetchEngine, HybridEngine and PlaywrightEngine all declare `implements IEngine`.
4
+ */
5
+ export interface IEngine {
6
+ /**
7
+ * Fetches HTML content from a URL
8
+ * @param url The URL to fetch
9
+ * @returns A promise that resolves to an HTMLFetchResult
10
+ */
11
+ fetchHTML(url: string): Promise<HTMLFetchResult>;
12
+ /**
13
+ * Cleans up resources used by the engine
14
+ */
15
+ cleanup(): Promise<void>;
16
+ /**
17
+ * Gets metrics about the engine's performance
18
+ * @returns An array of BrowserMetrics
19
+ */
20
+ getMetrics(): BrowserMetrics[];
21
+ }
22
+ //# sourceMappingURL=IEngine.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"IEngine.d.ts","sourceRoot":"","sources":["../src/IEngine.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,eAAe,EAAE,cAAc,EAAE,MAAM,YAAY,CAAC;AAElE;;GAEG;AACH,MAAM,WAAW,OAAO;IACtB;;;;OAIG;IACH,SAAS,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC,eAAe,CAAC,CAAC;IAEjD;;OAEG;IACH,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC,CAAC;IAEzB;;;OAGG;IACH,UAAU,IAAI,cAAc,EAAE,CAAC;CAChC"}
@@ -0,0 +1,2 @@
1
+ export {};
2
+ //# sourceMappingURL=IEngine.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"IEngine.js","sourceRoot":"","sources":["../src/IEngine.ts"],"names":[],"mappings":""}
@@ -0,0 +1,90 @@
1
+ import type { HTMLFetchResult, BrowserMetrics, PlaywrightEngineConfig, FetchOptions } from "./types.js";
2
+ import type { IEngine } from "./IEngine.js";
3
+ /**
4
+ * PlaywrightEngine - Fetches HTML using a managed pool of headless Playwright browser instances.
5
+ *
6
+ * This engine is suitable for dynamic websites that require JavaScript execution.
7
+ * It incorporates `playwright-extra` with the stealth plugin for enhanced anti-detection capabilities.
8
+ * Features include caching, retries, HTTP fallback, and configurable browser pooling.
9
+ */
10
+ export declare class PlaywrightEngine implements IEngine {
11
+ private browserPool;
12
+ private readonly queue;
13
+ private readonly cache;
14
+ private readonly config;
15
+ private initializingBrowserPool;
16
+ private isUsingHeadedMode;
17
+ private headedFallbackSites;
18
+ private static readonly DEFAULT_CONFIG;
19
+ /**
20
+ * Creates an instance of PlaywrightEngine.
21
+ *
22
+ * @param config Configuration options for the engine and its browser pool.
23
+ * See `PlaywrightEngineConfig` for details.
24
+ */
25
+ constructor(config?: PlaywrightEngineConfig);
26
+ /**
27
+ * Initialize the browser pool with improved error handling and mode switching.
28
+ */
29
+ private initializeBrowserPool;
30
+ /**
31
+ * Fallback method using simple HTTP requests via Axios.
32
+ * Ensures return type matches HTMLFetchResult.
33
+ */
34
+ private fetchHTMLWithHttpFallback;
35
+ private checkCache;
36
+ /**
37
+ * Safely check if a page is still usable and connected.
38
+ */
39
+ private isPageValid;
40
+ /**
41
+ * Simulate human-like interactions on the page.
42
+ */
43
+ private simulateHumanBehavior;
44
+ /**
45
+ * Adds a result to the in-memory cache.
46
+ */
47
+ private addToCache;
48
+ /**
49
+ * Public method to fetch HTML. Delegates to the internal recursive fetch method.
50
+ *
51
+ * @param url The URL to fetch.
52
+ * @param options Optional settings for this specific fetch operation.
53
+ * @param options.fastMode Overrides the engine's `defaultFastMode` configuration for this request.
+ * @param options.markdown Optional per-request flag declared in the signature below; presumably requests Markdown output instead of HTML (TODO confirm against the implementation).
54
+ * @returns A Promise resolving to an HTMLFetchResult object.
55
+ * @throws {FetchError} If the fetch fails after all retries or encounters critical errors.
56
+ */
57
+ fetchHTML(url: string, options?: FetchOptions & {
58
+ markdown?: boolean;
59
+ }): Promise<HTMLFetchResult>;
60
+ /**
61
+ * Internal recursive method to handle fetching with retries.
62
+ *
63
+ * @param url URL to fetch
64
+ * @param currentConfig The merged configuration including markdown option
65
+ * @param retryAttempt Current retry attempt number (starts at 0)
66
+ * @param parentRetryCount Tracks retries related to pool initialization errors (starts at 0)
67
+ * @returns Promise resolving to HTMLFetchResult
68
+ */
69
+ private _fetchRecursive;
70
+ /**
71
+ * Performs the actual page fetch using a Playwright page from the pool.
72
+ * Ensures return type matches HTMLFetchResult.
73
+ */
74
+ private fetchWithPlaywright;
75
+ private applyBlockingRules;
76
+ /**
77
+ * Cleans up resources used by the engine, primarily closing browser instances in the pool.
78
+ *
79
+ * It is crucial to call this method when finished with the engine instance to release resources.
80
+ * @returns A Promise that resolves when cleanup is complete.
81
+ */
82
+ cleanup(): Promise<void>;
83
+ /**
84
+ * Retrieves metrics from the underlying browser pool.
85
+ * @returns An array of BrowserMetrics objects, one for each active browser instance, or an empty array if the pool is not initialized.
86
+ */
87
+ getMetrics(): BrowserMetrics[];
88
+ private shouldUseHeadedMode;
89
+ }
90
+ //# sourceMappingURL=PlaywrightEngine.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"PlaywrightEngine.d.ts","sourceRoot":"","sources":["../src/PlaywrightEngine.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,eAAe,EAAE,cAAc,EAAE,sBAAsB,EAAE,YAAY,EAAE,MAAM,YAAY,CAAC;AACxG,OAAO,KAAK,EAAE,OAAO,EAAE,MAAM,cAAc,CAAC;AAmB5C;;;;;;GAMG;AACH,qBAAa,gBAAiB,YAAW,OAAO;IAC9C,OAAO,CAAC,WAAW,CAAsC;IACzD,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAS;IAC/B,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAsC;IAC5D,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAmC;IAG1D,OAAO,CAAC,uBAAuB,CAAkB;IACjD,OAAO,CAAC,iBAAiB,CAAkB;IAC3C,OAAO,CAAC,mBAAmB,CAA0B;IAGrD,OAAO,CAAC,MAAM,CAAC,QAAQ,CAAC,cAAc,CAkBpC;IAEF;;;;;OAKG;gBACS,MAAM,GAAE,sBAA2B;IAM/C;;OAEG;YACW,qBAAqB;IAuCnC;;;OAGG;YACW,yBAAyB;IAmFvC,OAAO,CAAC,UAAU;IAalB;;OAEG;YACW,WAAW;IAazB;;OAEG;YACW,qBAAqB;IAqCnC;;OAEG;IACH,OAAO,CAAC,UAAU;IAUlB;;;;;;;;OAQG;IACG,SAAS,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,GAAE,YAAY,GAAG;QAAE,QAAQ,CAAC,EAAE,OAAO,CAAA;KAAO,GAAG,OAAO,CAAC,eAAe,CAAC;IAU3G;;;;;;;;OAQG;YACW,eAAe;IAsH7B;;;OAGG;YACW,mBAAmB;YAmJnB,kBAAkB;IAmChC;;;;;OAKG;IACG,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC;IAe9B;;;OAGG;IACH,UAAU,IAAI,cAAc,EAAE;IAQ9B,OAAO,CAAC,mBAAmB;CAS5B"}