@purepageio/fetch-engines 0.1.4 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,17 +1,14 @@
1
- import { JSDOM } from "jsdom";
1
+ import { MarkdownConverter } from "./utils/markdown-converter.js"; // Import the converter
2
+ import { FetchError } from "./errors.js"; // Only import FetchError
2
3
  /**
3
4
  * Custom error class for HTTP errors from FetchEngine.
4
5
  */
5
- export class FetchEngineHttpError extends Error {
6
+ export class FetchEngineHttpError extends FetchError {
6
7
  statusCode;
7
8
  constructor(message, statusCode) {
8
- super(message);
9
- this.name = "FetchEngineHttpError";
9
+ super(message, "ERR_HTTP_ERROR", undefined, statusCode);
10
10
  this.statusCode = statusCode;
11
- // Maintain proper stack trace (requires target ES2015+ in tsconfig)
12
- if (Error.captureStackTrace) {
13
- Error.captureStackTrace(this, FetchEngineHttpError);
14
- }
11
+ this.name = "FetchEngineHttpError";
15
12
  }
16
13
  }
17
14
  /**
@@ -21,99 +18,81 @@ export class FetchEngineHttpError extends Error {
21
18
  * It does not support advanced configurations like retries, caching, or proxies directly.
22
19
  */
23
20
  export class FetchEngine {
24
- headers;
21
+ options;
22
+ static DEFAULT_OPTIONS = {
23
+ markdown: false,
24
+ };
25
25
  /**
26
26
  * Creates an instance of FetchEngine.
27
- * Note: This engine currently does not accept configuration options.
27
+ * @param options Configuration options for the FetchEngine.
28
28
  */
29
- constructor() {
30
- this.headers = {
31
- "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
32
- Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
33
- "Accept-Language": "en-US,en;q=0.5",
34
- "Upgrade-Insecure-Requests": "1",
35
- "Sec-Fetch-Dest": "document",
36
- "Sec-Fetch-Mode": "navigate",
37
- "Sec-Fetch-Site": "none",
38
- "Sec-Fetch-User": "?1",
39
- };
29
+ constructor(options = {}) {
30
+ this.options = { ...FetchEngine.DEFAULT_OPTIONS, ...options };
40
31
  }
41
32
  /**
42
- * Fetches HTML content from the specified URL using the `fetch` API.
33
+ * Fetches HTML or converts to Markdown from the specified URL.
43
34
  *
44
35
  * @param url The URL to fetch.
45
36
  * @returns A Promise resolving to an HTMLFetchResult object.
46
37
  * @throws {FetchEngineHttpError} If the HTTP response status is not ok (e.g., 404, 500).
47
38
  * @throws {Error} If the content type is not HTML or for other network errors.
48
39
  */
49
- async fetchHTML(url) {
40
+ async fetchHTML(url, options) {
41
+ const effectiveOptions = { ...this.options, ...options }; // Combine constructor and call options
42
+ let response;
50
43
  try {
51
- const response = await fetch(url, {
52
- headers: this.headers,
44
+ response = await fetch(url, {
53
45
  redirect: "follow",
46
+ headers: {
47
+ // Standard browser-like headers
48
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
49
+ Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
50
+ "Accept-Language": "en-US,en;q=0.9",
51
+ },
54
52
  });
55
53
  if (!response.ok) {
56
- // Throw the custom error with status code
57
54
  throw new FetchEngineHttpError(`HTTP error! status: ${response.status}`, response.status);
58
55
  }
59
- const contentType = response.headers.get("content-type") || "";
60
- if (!contentType.includes("text/html")) {
61
- throw new Error("Not an HTML page");
56
+ const contentTypeHeader = response.headers.get("content-type");
57
+ if (!contentTypeHeader || !contentTypeHeader.includes("text/html")) {
58
+ throw new FetchError("Content-Type is not text/html", "ERR_NON_HTML_CONTENT");
62
59
  }
63
60
  const html = await response.text();
64
- // Use JSDOM to parse HTML and extract title
65
- const dom = new JSDOM(html);
66
- const title = dom.window.document.title || "";
67
- // Check for potential SPA markers
68
- const isSPA = this.detectSPA(dom.window.document);
69
- if (isSPA) {
70
- // Removed throwing error here, as the calling code should decide how to handle this.
71
- // Consider adding a flag to the result instead.
72
- console.warn(`SPA detected for ${url}, content might be incomplete without JavaScript rendering.`);
73
- // Example: return { html, title, url: response.url, isSPA: true };
61
+ const titleMatch = html.match(/<title[^>]*>([^<]+)<\/title>/i);
62
+ const title = titleMatch ? titleMatch[1].trim() : null;
63
+ let finalContent = html;
64
+ let finalContentType = "html";
65
+ if (effectiveOptions.markdown) {
66
+ try {
67
+ const converter = new MarkdownConverter();
68
+ finalContent = converter.convert(html);
69
+ finalContentType = "markdown";
70
+ }
71
+ catch (conversionError) {
72
+ console.error(`Markdown conversion failed for ${url} (FetchEngine):`, conversionError);
73
+ // Fallback to original HTML on conversion error
74
+ }
74
75
  }
75
76
  return {
76
- html,
77
- title,
78
- url: response.url,
79
- isFromCache: false, // FetchEngine doesn't cache
77
+ content: finalContent,
78
+ contentType: finalContentType,
79
+ title: title,
80
+ url: response.url, // Use the final URL after redirects
81
+ isFromCache: false,
80
82
  statusCode: response.status,
81
83
  error: undefined,
82
84
  };
83
85
  }
84
86
  catch (error) {
85
- // console.error(`FetchEngine failed for ${url}:`, error); // Optional: Keep logging if desired
86
- // Re-throw the original error to preserve its type (e.g., FetchEngineHttpError)
87
- // Ensure the result conforms to HTMLFetchResult even on error (for consistency? No, spec says throw)
88
- throw error;
89
- }
90
- }
91
- detectSPA(document) {
92
- // Check for common SPA frameworks and patterns
93
- const spaMarkers = [
94
- // React
95
- "[data-reactroot]",
96
- "#root",
97
- "#app",
98
- // Vue
99
- "[data-v-app]",
100
- "#app[data-v-]",
101
- // Angular
102
- "[ng-version]",
103
- "[ng-app]",
104
- // Common SPA patterns
105
- 'script[type="application/json+ld"]', // Less reliable marker
106
- 'meta[name="fragment"]',
107
- ];
108
- // Check if the body is nearly empty but has JS (More reliable)
109
- const bodyContent = document.body?.textContent?.trim() || "";
110
- const hasScripts = document.scripts.length > 0;
111
- if (bodyContent.length < 150 && hasScripts) {
112
- // Increased threshold slightly
113
- return true;
87
+ // Re-throw specific known errors directly
88
+ if (error instanceof FetchEngineHttpError ||
89
+ (error instanceof FetchError && error.code === "ERR_NON_HTML_CONTENT")) {
90
+ throw error;
91
+ }
92
+ // Wrap other/unexpected errors
93
+ const message = error instanceof Error ? error.message : "Unknown fetch error";
94
+ throw new FetchError(`Fetch failed: ${message}`, "ERR_FETCH_FAILED", error instanceof Error ? error : undefined);
114
95
  }
115
- // Check for SPA markers (Less reliable)
116
- return spaMarkers.some((selector) => document.querySelector(selector) !== null);
117
96
  }
118
97
  /**
119
98
  * Cleans up resources used by the engine.
@@ -121,8 +100,7 @@ export class FetchEngine {
121
100
  * @returns A Promise that resolves when cleanup is complete.
122
101
  */
123
102
  async cleanup() {
124
- // No resources to clean up for fetch engine
125
- return Promise.resolve(); // Explicitly return resolved promise
103
+ return Promise.resolve();
126
104
  }
127
105
  /**
128
106
  * Retrieves metrics for the engine.
@@ -130,7 +108,6 @@ export class FetchEngine {
130
108
  * @returns An empty array.
131
109
  */
132
110
  getMetrics() {
133
- // Fetch engine doesn't maintain browser pool metrics
134
111
  return [];
135
112
  }
136
113
  }
@@ -1 +1 @@
1
- {"version":3,"file":"FetchEngine.js","sourceRoot":"","sources":["../src/FetchEngine.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,KAAK,EAAE,MAAM,OAAO,CAAC;AAE9B;;GAEG;AACH,MAAM,OAAO,oBAAqB,SAAQ,KAAK;IAC7B,UAAU,CAAS;IAEnC,YAAY,OAAe,EAAE,UAAkB;QAC7C,KAAK,CAAC,OAAO,CAAC,CAAC;QACf,IAAI,CAAC,IAAI,GAAG,sBAAsB,CAAC;QACnC,IAAI,CAAC,UAAU,GAAG,UAAU,CAAC;QAC7B,oEAAoE;QACpE,IAAI,KAAK,CAAC,iBAAiB,EAAE,CAAC;YAC5B,KAAK,CAAC,iBAAiB,CAAC,IAAI,EAAE,oBAAoB,CAAC,CAAC;QACtD,CAAC;IACH,CAAC;CACF;AAED;;;;;GAKG;AACH,MAAM,OAAO,WAAW;IACL,OAAO,CAAyB;IAEjD;;;OAGG;IACH;QACE,IAAI,CAAC,OAAO,GAAG;YACb,YAAY,EACV,iHAAiH;YACnH,MAAM,EAAE,4EAA4E;YACpF,iBAAiB,EAAE,gBAAgB;YACnC,2BAA2B,EAAE,GAAG;YAChC,gBAAgB,EAAE,UAAU;YAC5B,gBAAgB,EAAE,UAAU;YAC5B,gBAAgB,EAAE,MAAM;YACxB,gBAAgB,EAAE,IAAI;SACvB,CAAC;IACJ,CAAC;IAED;;;;;;;OAOG;IACH,KAAK,CAAC,SAAS,CAAC,GAAW;QACzB,IAAI,CAAC;YACH,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,GAAG,EAAE;gBAChC,OAAO,EAAE,IAAI,CAAC,OAAO;gBACrB,QAAQ,EAAE,QAAQ;aACnB,CAAC,CAAC;YAEH,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;gBACjB,0CAA0C;gBAC1C,MAAM,IAAI,oBAAoB,CAAC,uBAAuB,QAAQ,CAAC,MAAM,EAAE,EAAE,QAAQ,CAAC,MAAM,CAAC,CAAC;YAC5F,CAAC;YAED,MAAM,WAAW,GAAG,QAAQ,CAAC,OAAO,CAAC,GAAG,CAAC,cAAc,CAAC,IAAI,EAAE,CAAC;YAC/D,IAAI,CAAC,WAAW,CAAC,QAAQ,CAAC,WAAW,CAAC,EAAE,CAAC;gBACvC,MAAM,IAAI,KAAK,CAAC,kBAAkB,CAAC,CAAC;YACtC,CAAC;YAED,MAAM,IAAI,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;YAEnC,4CAA4C;YAC5C,MAAM,GAAG,GAAG,IAAI,KAAK,CAAC,IAAI,CAAC,CAAC;YAC5B,MAAM,KAAK,GAAG,GAAG,CAAC,MAAM,CAAC,QAAQ,CAAC,KAAK,IAAI,EAAE,CAAC;YAE9C,kCAAkC;YAClC,MAAM,KAAK,GAAG,IAAI,CAAC,SAAS,CAAC,GAAG,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC;YAClD,IAAI,KAAK,EAAE,CAAC;gBACV,qFAAqF;gBACrF,gDAAgD;gBAChD,OAAO,CAAC,IAAI,CAAC,oBAAoB,GAAG,6DAA6D,CAAC,CAAC;gBACnG,mEAAmE;YACrE,CAAC;YAED,OAAO;gBACL,IAAI;gBACJ,KAAK;gBACL,GAAG,EAAE,QAAQ,CAAC,GAAG;gBACjB,WAAW,EAAE,KAAK,EAAE,4BAA4B;gBAChD,UAAU,EAAE,QAAQ,CAAC,MAAM;gBAC3B,KAAK,EAAE,SAAS;aACjB,CAAC;QACJ,CAAC;QAAC,OAAO,KAAU,EAAE,CAAC;YACpB,+FAA+F;YAC/F,gFAAgF;YAChF,qGAAqG;YACrG,MAAM,KAAK,CAAC;QACd,CAAC;IACH,CAAC;IAEO,SAAS,CAAC,QAAkB;QAClC,+CAA+C;QAC/C,MAAM,UAAU,GAAG;YACjB,QAAQ;YACR,kBAAkB;YAClB,OAAO;YACP,MAAM;YACN,MAAM;YACN,cAAc;YACd,eAAe;YACf,UAAU;YACV,cAAc;YACd,UAAU;YACV,sBAAsB;YACtB,oCAAoC,EAAE,uBAAuB;YAC7D,uBAAuB;SACxB,CAAC;QAEF,+DAA+D;QAC/D,MAAM,WAAW,GAAG,QAAQ,CAAC,IAAI,EAAE,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;QAC7D,MAAM,UAAU,GAAG,QAAQ,CAAC,OAAO,CAAC,MAAM,GAAG,CAAC,CAAC;QAE/C,IAAI,WAAW,CAAC,MAAM,GAAG,GAAG,IAAI,UAAU,EAAE,CAAC;YAC3C,+BAA+B;YAC/B,OAAO,IAAI,CAAC;QACd,CAAC;QAED,wCAAwC;QACxC,OAAO,UAAU,CAAC,IAAI,CAAC,CAAC,QAAQ,EAAE,EAAE,CAAC,QAAQ,CAAC,aAAa,CAAC,QAAQ,CAAC,KAAK,IAAI,CAAC,CAAC;IAClF,CAAC;IAED;;;;OAIG;IACH,KAAK,CAAC,OAAO;QACX,4CAA4C;QAC5C,OAAO,OAAO,CAAC,OAAO,EAAE,CAAC,CAAC,qCAAqC;IACjE,CAAC;IAED;;;;OAIG;IACH,UAAU;QACR,qDAAqD;QACrD,OAAO,EAAE,CAAC;IACZ,CAAC;CACF"}
1
+ {"version":3,"file":"FetchEngine.js","sourceRoot":"","sources":["../src/FetchEngine.ts"],"names":[],"mappings":"AAGA,OAAO,EAAE,iBAAiB,EAAE,MAAM,+BAA+B,CAAC,CAAC,uBAAuB;AAC1F,OAAO,EAAE,UAAU,EAAE,MAAM,aAAa,CAAC,CAAC,yBAAyB;AAEnE;;GAEG;AACH,MAAM,OAAO,oBAAqB,SAAQ,UAAU;IAGhC;IAFlB,YACE,OAAe,EACC,UAAkB;QAElC,KAAK,CAAC,OAAO,EAAE,gBAAgB,EAAE,SAAS,EAAE,UAAU,CAAC,CAAC;QAFxC,eAAU,GAAV,UAAU,CAAQ;QAGlC,IAAI,CAAC,IAAI,GAAG,sBAAsB,CAAC;IACrC,CAAC;CACF;AAED;;;;;GAKG;AACH,MAAM,OAAO,WAAW;IACL,OAAO,CAA+B;IAE/C,MAAM,CAAU,eAAe,GAAiC;QACtE,QAAQ,EAAE,KAAK;KAChB,CAAC;IAEF;;;OAGG;IACH,YAAY,UAA8B,EAAE;QAC1C,IAAI,CAAC,OAAO,GAAG,EAAE,GAAG,WAAW,CAAC,eAAe,EAAE,GAAG,OAAO,EAAE,CAAC;IAChE,CAAC;IAED;;;;;;;OAOG;IACH,KAAK,CAAC,SAAS,CAAC,GAAW,EAAE,OAA4B;QACvD,MAAM,gBAAgB,GAAG,EAAE,GAAG,IAAI,CAAC,OAAO,EAAE,GAAG,OAAO,EAAE,CAAC,CAAC,uCAAuC;QACjG,IAAI,QAAkB,CAAC;QACvB,IAAI,CAAC;YACH,QAAQ,GAAG,MAAM,KAAK,CAAC,GAAG,EAAE;gBAC1B,QAAQ,EAAE,QAAQ;gBAClB,OAAO,EAAE;oBACP,gCAAgC;oBAChC,YAAY,EACV,iHAAiH;oBACnH,MAAM,EAAE,kGAAkG;oBAC1G,iBAAiB,EAAE,gBAAgB;iBACpC;aACF,CAAC,CAAC;YAEH,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;gBACjB,MAAM,IAAI,oBAAoB,CAAC,uBAAuB,QAAQ,CAAC,MAAM,EAAE,EAAE,QAAQ,CAAC,MAAM,CAAC,CAAC;YAC5F,CAAC;YAED,MAAM,iBAAiB,GAAG,QAAQ,CAAC,OAAO,CAAC,GAAG,CAAC,cAAc,CAAC,CAAC;YAC/D,IAAI,CAAC,iBAAiB,IAAI,CAAC,iBAAiB,CAAC,QAAQ,CAAC,WAAW,CAAC,EAAE,CAAC;gBACnE,MAAM,IAAI,UAAU,CAAC,+BAA+B,EAAE,sBAAsB,CAAC,CAAC;YAChF,CAAC;YAED,MAAM,IAAI,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;YACnC,MAAM,UAAU,GAAG,IAAI,CAAC,KAAK,CAAC,+BAA+B,CAAC,CAAC;YAC/D,MAAM,KAAK,GAAG,UAAU,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC;YAEvD,IAAI,YAAY,GAAG,IAAI,CAAC;YACxB,IAAI,gBAAgB,GAAwB,MAAM,CAAC;YAEnD,IAAI,gBAAgB,CAAC,QAAQ,EAAE,CAAC;gBAC9B,IAAI,CAAC;oBACH,MAAM,SAAS,GAAG,IAAI,iBAAiB,EAAE,CAAC;oBAC1C,YAAY,GAAG,SAAS,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;oBACvC,gBAAgB,GAAG,UAAU,CAAC;gBAChC,CAAC;gBAAC,OAAO,eAAoB,EAAE,CAAC;oBAC9B,OAAO,CAAC,KAAK,CAAC,kCAAkC,GAAG,iBAAiB,EAAE,eAAe,CAAC,CAAC;oBACvF,gDAAgD;gBAClD,CAAC;YACH,CAAC;YAED,OAAO;gBACL,OAAO,EAAE,YAAY;gBACrB,WAAW,EAAE,gBAAgB;gBAC7B,KAAK,EAAE,KAAK;gBACZ,GAAG,EAAE,QAAQ,CAAC,GAAG,EAAE,oCAAoC;gBACvD,WAAW,EAAE,KAAK;gBAClB,UAAU,EAAE,QAAQ,CAAC,MAAM;gBAC3B,KAAK,EAAE,SAAS;aACjB,CAAC;QACJ,CAAC;QAAC,OAAO,KAAU,EAAE,CAAC;YACpB,0CAA0C;YAC1C,IACE,KAAK,YAAY,oBAAoB;gBACrC,CAAC,KAAK,YAAY,UAAU,IAAI,KAAK,CAAC,IAAI,KAAK,sBAAsB,CAAC,EACtE,CAAC;gBACD,MAAM,KAAK,CAAC;YACd,CAAC;YACD,+BAA+B;YAC/B,MAAM,OAAO,GAAG,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,qBAAqB,CAAC;YAC/E,MAAM,IAAI,UAAU,CAAC,iBAAiB,OAAO,EAAE,EAAE,kBAAkB,EAAE,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC;QACnH,CAAC;IACH,CAAC;IAED;;;;OAIG;IACH,KAAK,CAAC,OAAO;QACX,OAAO,OAAO,CAAC,OAAO,EAAE,CAAC;IAC3B,CAAC;IAED;;;;OAIG;IACH,UAAU;QACR,OAAO,EAAE,CAAC;IACZ,CAAC"}
@@ -1,15 +1,21 @@
1
- import type { HTMLFetchResult, BrowserMetrics, PlaywrightEngineConfig } from "./types.js";
2
- import { IEngine } from "./IEngine.js";
1
+ import type { IEngine } from "./IEngine.js";
2
+ import type { HTMLFetchResult, PlaywrightEngineConfig, FetchOptions, BrowserMetrics } from "./types.js";
3
3
  /**
4
- * HybridEngine - Attempts fetching with FetchEngine first for speed,
5
- * then falls back to PlaywrightEngine for complex sites or specific errors.
4
+ * HybridEngine - Tries FetchEngine first, falls back to PlaywrightEngine on failure.
6
5
  */
7
6
  export declare class HybridEngine implements IEngine {
8
7
  private readonly fetchEngine;
9
8
  private readonly playwrightEngine;
10
- constructor(playwrightConfig?: PlaywrightEngineConfig);
11
- fetchHTML(url: string): Promise<HTMLFetchResult>;
12
- cleanup(): Promise<void>;
9
+ private readonly config;
10
+ constructor(config?: PlaywrightEngineConfig);
11
+ fetchHTML(url: string, options?: FetchOptions): Promise<HTMLFetchResult>;
12
+ /**
13
+ * Delegates getMetrics to the PlaywrightEngine.
14
+ */
13
15
  getMetrics(): BrowserMetrics[];
16
+ /**
17
+ * Calls cleanup on both underlying engines.
18
+ */
19
+ cleanup(): Promise<void>;
14
20
  }
15
21
  //# sourceMappingURL=HybridEngine.d.ts.map
@@ -1 +1 @@
1
- {"version":3,"file":"HybridEngine.d.ts","sourceRoot":"","sources":["../src/HybridEngine.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,eAAe,EAAE,cAAc,EAAE,sBAAsB,EAAE,MAAM,YAAY,CAAC;AAC1F,OAAO,EAAE,OAAO,EAAE,MAAM,cAAc,CAAC;AAEvC;;;GAGG;AACH,qBAAa,YAAa,YAAW,OAAO;IAC1C,OAAO,CAAC,QAAQ,CAAC,WAAW,CAAc;IAC1C,OAAO,CAAC,QAAQ,CAAC,gBAAgB,CAAmB;gBAExC,gBAAgB,GAAE,sBAA2B;IAKnD,SAAS,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC,eAAe,CAAC;IAkBhD,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC;IAK9B,UAAU,IAAI,cAAc,EAAE;CAI/B"}
1
+ {"version":3,"file":"HybridEngine.d.ts","sourceRoot":"","sources":["../src/HybridEngine.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,OAAO,EAAE,MAAM,cAAc,CAAC;AAC5C,OAAO,KAAK,EAAE,eAAe,EAAE,sBAAsB,EAAE,YAAY,EAAE,cAAc,EAAE,MAAM,YAAY,CAAC;AAExG;;GAEG;AACH,qBAAa,YAAa,YAAW,OAAO;IAC1C,OAAO,CAAC,QAAQ,CAAC,WAAW,CAAc;IAC1C,OAAO,CAAC,QAAQ,CAAC,gBAAgB,CAAmB;IACpD,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAyB;gBAEpC,MAAM,GAAE,sBAA2B;IAQzC,SAAS,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,GAAE,YAAiB,GAAG,OAAO,CAAC,eAAe,CAAC;IA8BlF;;OAEG;IACH,UAAU,IAAI,cAAc,EAAE;IAI9B;;OAEG;IACG,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC;CAM/B"}
@@ -1,42 +1,62 @@
1
1
  import { FetchEngine } from "./FetchEngine.js";
2
2
  import { PlaywrightEngine } from "./PlaywrightEngine.js";
3
3
  /**
4
- * HybridEngine - Attempts fetching with FetchEngine first for speed,
5
- * then falls back to PlaywrightEngine for complex sites or specific errors.
4
+ * HybridEngine - Tries FetchEngine first, falls back to PlaywrightEngine on failure.
6
5
  */
7
6
  export class HybridEngine {
8
7
  fetchEngine;
9
8
  playwrightEngine;
10
- constructor(playwrightConfig = {}) {
11
- this.fetchEngine = new FetchEngine();
12
- this.playwrightEngine = new PlaywrightEngine(playwrightConfig);
9
+ config; // Store config for potential per-request PW overrides
10
+ constructor(config = {}) {
11
+ // Pass relevant config parts to each engine
12
+ // FetchEngine only takes markdown option from the shared config
13
+ this.fetchEngine = new FetchEngine({ markdown: config.markdown });
14
+ this.playwrightEngine = new PlaywrightEngine(config);
15
+ this.config = config; // Store for merging later
13
16
  }
14
- async fetchHTML(url) {
17
+ async fetchHTML(url, options = {}) {
18
+ // FetchEngine uses its constructor config; it doesn't accept per-request options here.
15
19
  try {
16
- // Attempt 1: Use the fast FetchEngine
17
20
  const fetchResult = await this.fetchEngine.fetchHTML(url);
21
+ // If fetch succeeded, return its result directly (it handles its own markdown config)
22
+ // No need to check contentType here, FetchEngine handles it based on its constructor.
18
23
  return fetchResult;
19
24
  }
20
- catch (_fetchError) {
21
- // Prefixed unused error
22
- // If FetchEngine fails (e.g., 403, network error, non-html), try Playwright
25
+ catch (fetchError) {
26
+ console.warn(`FetchEngine failed for ${url}: ${fetchError.message}. Falling back to PlaywrightEngine.`);
27
+ // Merge constructor config with per-request options for Playwright fallback
28
+ const playwrightOptions = {
29
+ ...this.config, // Start with base config given to HybridEngine
30
+ ...options, // Override with per-request options
31
+ };
23
32
  try {
24
- const playwrightResult = await this.playwrightEngine.fetchHTML(url);
33
+ // Pass merged options to PlaywrightEngine
34
+ const playwrightResult = await this.playwrightEngine.fetchHTML(url, playwrightOptions);
25
35
  return playwrightResult;
26
36
  }
27
37
  catch (playwrightError) {
28
- // If Playwright also fails, throw its error (potentially more informative)
38
+ // Catch potential Playwright error
39
+ console.error(`PlaywrightEngine fallback failed for ${url}: ${playwrightError.message}`);
40
+ // Optionally, wrap or prioritize which error to throw
41
+ // Throwing the Playwright error as it's the last one encountered
29
42
  throw playwrightError;
30
43
  }
31
44
  }
32
45
  }
33
- async cleanup() {
34
- // Cleanup both engines concurrently
35
- await Promise.allSettled([this.fetchEngine.cleanup(), this.playwrightEngine.cleanup()]);
36
- }
46
+ /**
47
+ * Delegates getMetrics to the PlaywrightEngine.
48
+ */
37
49
  getMetrics() {
38
- // FetchEngine doesn't produce metrics, only PlaywrightEngine does
39
50
  return this.playwrightEngine.getMetrics();
40
51
  }
52
+ /**
53
+ * Calls cleanup on both underlying engines.
54
+ */
55
+ async cleanup() {
56
+ await Promise.allSettled([
57
+ this.fetchEngine.cleanup(), // Although a no-op, call for consistency
58
+ this.playwrightEngine.cleanup(),
59
+ ]);
60
+ }
41
61
  }
42
62
  //# sourceMappingURL=HybridEngine.js.map
@@ -1 +1 @@
1
- {"version":3,"file":"HybridEngine.js","sourceRoot":"","sources":["../src/HybridEngine.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,WAAW,EAAE,MAAM,kBAAkB,CAAC;AAC/C,OAAO,EAAE,gBAAgB,EAAE,MAAM,uBAAuB,CAAC;AAIzD;;;GAGG;AACH,MAAM,OAAO,YAAY;IACN,WAAW,CAAc;IACzB,gBAAgB,CAAmB;IAEpD,YAAY,mBAA2C,EAAE;QACvD,IAAI,CAAC,WAAW,GAAG,IAAI,WAAW,EAAE,CAAC;QACrC,IAAI,CAAC,gBAAgB,GAAG,IAAI,gBAAgB,CAAC,gBAAgB,CAAC,CAAC;IACjE,CAAC;IAED,KAAK,CAAC,SAAS,CAAC,GAAW;QACzB,IAAI,CAAC;YACH,sCAAsC;YACtC,MAAM,WAAW,GAAG,MAAM,IAAI,CAAC,WAAW,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC;YAC1D,OAAO,WAAW,CAAC;QACrB,CAAC;QAAC,OAAO,WAAgB,EAAE,CAAC;YAC1B,wBAAwB;YACxB,4EAA4E;YAC5E,IAAI,CAAC;gBACH,MAAM,gBAAgB,GAAG,MAAM,IAAI,CAAC,gBAAgB,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC;gBACpE,OAAO,gBAAgB,CAAC;YAC1B,CAAC;YAAC,OAAO,eAAe,EAAE,CAAC;gBACzB,2EAA2E;gBAC3E,MAAM,eAAe,CAAC;YACxB,CAAC;QACH,CAAC;IACH,CAAC;IAED,KAAK,CAAC,OAAO;QACX,oCAAoC;QACpC,MAAM,OAAO,CAAC,UAAU,CAAC,CAAC,IAAI,CAAC,WAAW,CAAC,OAAO,EAAE,EAAE,IAAI,CAAC,gBAAgB,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC;IAC1F,CAAC;IAED,UAAU;QACR,kEAAkE;QAClE,OAAO,IAAI,CAAC,gBAAgB,CAAC,UAAU,EAAE,CAAC;IAC5C,CAAC;CACF"}
1
+ {"version":3,"file":"HybridEngine.js","sourceRoot":"","sources":["../src/HybridEngine.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,WAAW,EAAE,MAAM,kBAAkB,CAAC;AAC/C,OAAO,EAAE,gBAAgB,EAAE,MAAM,uBAAuB,CAAC;AAIzD;;GAEG;AACH,MAAM,OAAO,YAAY;IACN,WAAW,CAAc;IACzB,gBAAgB,CAAmB;IACnC,MAAM,CAAyB,CAAC,sDAAsD;IAEvG,YAAY,SAAiC,EAAE;QAC7C,4CAA4C;QAC5C,gEAAgE;QAChE,IAAI,CAAC,WAAW,GAAG,IAAI,WAAW,CAAC,EAAE,QAAQ,EAAE,MAAM,CAAC,QAAQ,EAAE,CAAC,CAAC;QAClE,IAAI,CAAC,gBAAgB,GAAG,IAAI,gBAAgB,CAAC,MAAM,CAAC,CAAC;QACrD,IAAI,CAAC,MAAM,GAAG,MAAM,CAAC,CAAC,0BAA0B;IAClD,CAAC;IAED,KAAK,CAAC,SAAS,CAAC,GAAW,EAAE,UAAwB,EAAE;QACrD,uFAAuF;QACvF,IAAI,CAAC;YACH,MAAM,WAAW,GAAG,MAAM,IAAI,CAAC,WAAW,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC;YAC1D,sFAAsF;YACtF,sFAAsF;YACtF,OAAO,WAAW,CAAC;QACrB,CAAC;QAAC,OAAO,UAAe,EAAE,CAAC;YACzB,OAAO,CAAC,IAAI,CAAC,0BAA0B,GAAG,KAAK,UAAU,CAAC,OAAO,qCAAqC,CAAC,CAAC;YAExG,4EAA4E;YAC5E,MAAM,iBAAiB,GAAiB;gBACtC,GAAG,IAAI,CAAC,MAAM,EAAE,+CAA+C;gBAC/D,GAAG,OAAO,EAAE,oCAAoC;aACjD,CAAC;YAEF,IAAI,CAAC;gBACH,0CAA0C;gBAC1C,MAAM,gBAAgB,GAAG,MAAM,IAAI,CAAC,gBAAgB,CAAC,SAAS,CAAC,GAAG,EAAE,iBAAiB,CAAC,CAAC;gBACvF,OAAO,gBAAgB,CAAC;YAC1B,CAAC;YAAC,OAAO,eAAoB,EAAE,CAAC;gBAC9B,mCAAmC;gBACnC,OAAO,CAAC,KAAK,CAAC,wCAAwC,GAAG,KAAK,eAAe,CAAC,OAAO,EAAE,CAAC,CAAC;gBACzF,sDAAsD;gBACtD,iEAAiE;gBACjE,MAAM,eAAe,CAAC;YACxB,CAAC;QACH,CAAC;IACH,CAAC;IAED;;OAEG;IACH,UAAU;QACR,OAAO,IAAI,CAAC,gBAAgB,CAAC,UAAU,EAAE,CAAC;IAC5C,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,OAAO;QACX,MAAM,OAAO,CAAC,UAAU,CAAC;YACvB,IAAI,CAAC,WAAW,CAAC,OAAO,EAAE,EAAE,yCAAyC;YACrE,IAAI,CAAC,gBAAgB,CAAC,OAAO,EAAE;SAChC,CAAC,CAAC;IACL,CAAC;CACF"}
@@ -54,12 +54,14 @@ export declare class PlaywrightEngine implements IEngine {
54
54
  * @returns A Promise resolving to an HTMLFetchResult object.
55
55
  * @throws {FetchError} If the fetch fails after all retries or encounters critical errors.
56
56
  */
57
- fetchHTML(url: string, options?: FetchOptions): Promise<HTMLFetchResult>;
57
+ fetchHTML(url: string, options?: FetchOptions & {
58
+ markdown?: boolean;
59
+ }): Promise<HTMLFetchResult>;
58
60
  /**
59
61
  * Internal recursive method to handle fetching with retries.
60
62
  *
61
63
  * @param url URL to fetch
62
- * @param options Original fetch options (e.g., fastMode override)
64
+ * @param currentConfig The merged configuration including markdown option
63
65
  * @param retryAttempt Current retry attempt number (starts at 0)
64
66
  * @param parentRetryCount Tracks retries related to pool initialization errors (starts at 0)
65
67
  * @returns Promise resolving to HTMLFetchResult
@@ -1 +1 @@
1
- {"version":3,"file":"PlaywrightEngine.d.ts","sourceRoot":"","sources":["../src/PlaywrightEngine.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,eAAe,EAAE,cAAc,EAAE,sBAAsB,EAAE,YAAY,EAAE,MAAM,YAAY,CAAC;AACxG,OAAO,KAAK,EAAE,OAAO,EAAE,MAAM,cAAc,CAAC;AAkB5C;;;;;;GAMG;AACH,qBAAa,gBAAiB,YAAW,OAAO;IAC9C,OAAO,CAAC,WAAW,CAAsC;IACzD,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAS;IAC/B,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAsC;IAC5D,OAAO,CAAC,QAAQ,CAAC,MAAM,CAIE;IAGzB,OAAO,CAAC,uBAAuB,CAAkB;IACjD,OAAO,CAAC,iBAAiB,CAAkB;IAC3C,OAAO,CAAC,mBAAmB,CAA0B;IAGrD,OAAO,CAAC,MAAM,CAAC,QAAQ,CAAC,cAAc,CAoBpC;IAEF;;;;;OAKG;gBACS,MAAM,GAAE,sBAA2B;IAmB/C;;OAEG;YACW,qBAAqB;IAuCnC;;;OAGG;YACW,yBAAyB;IAiEvC,OAAO,CAAC,UAAU;IAWlB;;OAEG;YACW,WAAW;IAazB;;OAEG;YACW,qBAAqB;IAqCnC;;OAEG;IACH,OAAO,CAAC,UAAU;IAUlB;;;;;;;;OAQG;IACG,SAAS,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,GAAE,YAAiB,GAAG,OAAO,CAAC,eAAe,CAAC;IAKlF;;;;;;;;OAQG;YACW,eAAe;IA2G7B;;;OAGG;YACW,mBAAmB;YAiEnB,kBAAkB;IAmChC;;;;;OAKG;IACG,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC;IAe9B;;;OAGG;IACH,UAAU,IAAI,cAAc,EAAE;IAQ9B,OAAO,CAAC,mBAAmB;CAS5B"}
1
+ {"version":3,"file":"PlaywrightEngine.d.ts","sourceRoot":"","sources":["../src/PlaywrightEngine.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,eAAe,EAAE,cAAc,EAAE,sBAAsB,EAAE,YAAY,EAAE,MAAM,YAAY,CAAC;AACxG,OAAO,KAAK,EAAE,OAAO,EAAE,MAAM,cAAc,CAAC;AAmB5C;;;;;;GAMG;AACH,qBAAa,gBAAiB,YAAW,OAAO;IAC9C,OAAO,CAAC,WAAW,CAAsC;IACzD,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAS;IAC/B,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAsC;IAC5D,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAmC;IAG1D,OAAO,CAAC,uBAAuB,CAAkB;IACjD,OAAO,CAAC,iBAAiB,CAAkB;IAC3C,OAAO,CAAC,mBAAmB,CAA0B;IAGrD,OAAO,CAAC,MAAM,CAAC,QAAQ,CAAC,cAAc,CAkBpC;IAEF;;;;;OAKG;gBACS,MAAM,GAAE,sBAA2B;IAM/C;;OAEG;YACW,qBAAqB;IAuCnC;;;OAGG;YACW,yBAAyB;IAmFvC,OAAO,CAAC,UAAU;IAalB;;OAEG;YACW,WAAW;IAazB;;OAEG;YACW,qBAAqB;IAqCnC;;OAEG;IACH,OAAO,CAAC,UAAU;IAUlB;;;;;;;;OAQG;IACG,SAAS,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,GAAE,YAAY,GAAG;QAAE,QAAQ,CAAC,EAAE,OAAO,CAAA;KAAO,GAAG,OAAO,CAAC,eAAe,CAAC;IAU3G;;;;;;;;OAQG;YACW,eAAe;IAsH7B;;;OAGG;YACW,mBAAmB;YAmFnB,kBAAkB;IAmChC;;;;;OAKG;IACG,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC;IAe9B;;;OAGG;IACH,UAAU,IAAI,cAAc,EAAE;IAQ9B,OAAO,CAAC,mBAAmB;CAS5B"}