@purepageio/fetch-engines 0.2.5 → 0.2.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +1657 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +323 -0
- package/dist/index.d.ts +323 -8
- package/dist/index.js +1617 -4
- package/dist/index.js.map +1 -1
- package/package.json +14 -5
- package/dist/FetchEngine.d.ts +0 -47
- package/dist/FetchEngine.d.ts.map +0 -1
- package/dist/FetchEngine.js +0 -114
- package/dist/FetchEngine.js.map +0 -1
- package/dist/FetchEngine.test.d.ts +0 -2
- package/dist/FetchEngine.test.d.ts.map +0 -1
- package/dist/FetchEngine.test.js +0 -44
- package/dist/FetchEngine.test.js.map +0 -1
- package/dist/HybridEngine.d.ts +0 -21
- package/dist/HybridEngine.d.ts.map +0 -1
- package/dist/HybridEngine.js +0 -62
- package/dist/HybridEngine.js.map +0 -1
- package/dist/IEngine.d.ts +0 -22
- package/dist/IEngine.d.ts.map +0 -1
- package/dist/IEngine.js +0 -2
- package/dist/IEngine.js.map +0 -1
- package/dist/PlaywrightEngine.d.ts +0 -90
- package/dist/PlaywrightEngine.d.ts.map +0 -1
- package/dist/PlaywrightEngine.js +0 -505
- package/dist/PlaywrightEngine.js.map +0 -1
- package/dist/PlaywrightEngine.test.d.ts +0 -2
- package/dist/PlaywrightEngine.test.d.ts.map +0 -1
- package/dist/PlaywrightEngine.test.js +0 -207
- package/dist/PlaywrightEngine.test.js.map +0 -1
- package/dist/PuppeteerEngine.d.ts +0 -21
- package/dist/PuppeteerEngine.d.ts.map +0 -1
- package/dist/PuppeteerEngine.js +0 -412
- package/dist/PuppeteerEngine.js.map +0 -1
- package/dist/browser/BrowserPool.d.ts +0 -29
- package/dist/browser/BrowserPool.d.ts.map +0 -1
- package/dist/browser/BrowserPool.js +0 -378
- package/dist/browser/BrowserPool.js.map +0 -1
- package/dist/browser/PlaywrightBrowserPool.d.ts +0 -48
- package/dist/browser/PlaywrightBrowserPool.d.ts.map +0 -1
- package/dist/browser/PlaywrightBrowserPool.js +0 -378
- package/dist/browser/PlaywrightBrowserPool.js.map +0 -1
- package/dist/browser/PlaywrightBrowserPool.test.d.ts +0 -2
- package/dist/browser/PlaywrightBrowserPool.test.d.ts.map +0 -1
- package/dist/browser/PlaywrightBrowserPool.test.js +0 -422
- package/dist/browser/PlaywrightBrowserPool.test.js.map +0 -1
- package/dist/errors.d.ts +0 -20
- package/dist/errors.d.ts.map +0 -1
- package/dist/errors.js +0 -30
- package/dist/errors.js.map +0 -1
- package/dist/index.d.ts.map +0 -1
- package/dist/types.d.ts +0 -167
- package/dist/types.d.ts.map +0 -1
- package/dist/types.js +0 -2
- package/dist/types.js.map +0 -1
- package/dist/utils/markdown-converter.d.ts +0 -31
- package/dist/utils/markdown-converter.d.ts.map +0 -1
- package/dist/utils/markdown-converter.js +0 -796
- package/dist/utils/markdown-converter.js.map +0 -1
package/dist/types.d.ts
DELETED
|
@@ -1,167 +0,0 @@
|
|
|
1
|
-
import type { Browser as PlaywrightBrowser, BrowserContext } from "playwright";
|
|
2
|
-
/**
|
|
3
|
-
* Defines the structure for the result of fetching HTML content.
|
|
4
|
-
*/
|
|
5
|
-
export interface HTMLFetchResult {
|
|
6
|
-
/** The fetched HTML content OR the converted Markdown content. */
|
|
7
|
-
content: string;
|
|
8
|
-
/** Indicates the type of content in the 'content' field. */
|
|
9
|
-
contentType: "html" | "markdown";
|
|
10
|
-
/** The extracted title of the page, if available. */
|
|
11
|
-
title: string | null;
|
|
12
|
-
/** The final URL after any redirects. */
|
|
13
|
-
url: string;
|
|
14
|
-
/** Indicates if the result came from the cache. */
|
|
15
|
-
isFromCache: boolean;
|
|
16
|
-
/** The HTTP status code of the final response. */
|
|
17
|
-
statusCode: number | undefined;
|
|
18
|
-
/** Any error encountered during the fetch process. */
|
|
19
|
-
error: Error | undefined;
|
|
20
|
-
}
|
|
21
|
-
/**
|
|
22
|
-
* Metrics related to browser pool performance and status.
|
|
23
|
-
*/
|
|
24
|
-
export interface BrowserMetrics {
|
|
25
|
-
id: string;
|
|
26
|
-
engine?: "playwright" | string;
|
|
27
|
-
pagesCreated: number;
|
|
28
|
-
activePages: number;
|
|
29
|
-
lastUsed: Date;
|
|
30
|
-
errors: number;
|
|
31
|
-
totalRequests?: number;
|
|
32
|
-
avgResponseTime?: number;
|
|
33
|
-
createdAt: Date;
|
|
34
|
-
isHealthy: boolean;
|
|
35
|
-
}
|
|
36
|
-
/**
|
|
37
|
-
* Internal representation of a Playwright browser instance within the pool.
|
|
38
|
-
*/
|
|
39
|
-
export interface BrowserInstance {
|
|
40
|
-
browser: PlaywrightBrowser;
|
|
41
|
-
context: BrowserContext;
|
|
42
|
-
metrics: BrowserMetrics;
|
|
43
|
-
isHealthy: boolean;
|
|
44
|
-
}
|
|
45
|
-
/**
|
|
46
|
-
* Configuration options for the PlaywrightEngine.
|
|
47
|
-
*/
|
|
48
|
-
export interface PlaywrightEngineConfig {
|
|
49
|
-
/**
|
|
50
|
-
* Maximum number of Playwright pages to process concurrently.
|
|
51
|
-
* @default 3
|
|
52
|
-
*/
|
|
53
|
-
concurrentPages?: number;
|
|
54
|
-
/**
|
|
55
|
-
* Maximum number of retry attempts for a failed fetch operation (excluding initial attempt).
|
|
56
|
-
* @default 3
|
|
57
|
-
*/
|
|
58
|
-
maxRetries?: number;
|
|
59
|
-
/**
|
|
60
|
-
* Delay in milliseconds between retry attempts.
|
|
61
|
-
* @default 5000
|
|
62
|
-
*/
|
|
63
|
-
retryDelay?: number;
|
|
64
|
-
/**
|
|
65
|
-
* Time-to-live for cached results in milliseconds. Set to 0 to disable.
|
|
66
|
-
* @default 900000 (15 minutes)
|
|
67
|
-
*/
|
|
68
|
-
cacheTTL?: number;
|
|
69
|
-
/**
|
|
70
|
-
* If true, attempts a fast HTTP GET first before using Playwright.
|
|
71
|
-
* @default true
|
|
72
|
-
*/
|
|
73
|
-
useHttpFallback?: boolean;
|
|
74
|
-
/**
|
|
75
|
-
* If true, automatically retries failed requests for a domain in headed mode.
|
|
76
|
-
* @default false
|
|
77
|
-
*/
|
|
78
|
-
useHeadedModeFallback?: boolean;
|
|
79
|
-
/**
|
|
80
|
-
* If true, requests initially block non-essential resources and skip human simulation.
|
|
81
|
-
* Can be overridden per-request via fetchHTML options.
|
|
82
|
-
* @default true
|
|
83
|
-
*/
|
|
84
|
-
defaultFastMode?: boolean;
|
|
85
|
-
/**
|
|
86
|
-
* If true (and not in fastMode), attempts basic human-like interactions.
|
|
87
|
-
* @default true
|
|
88
|
-
*/
|
|
89
|
-
simulateHumanBehavior?: boolean;
|
|
90
|
-
/**
|
|
91
|
-
* Maximum number of concurrent browser instances the pool manages.
|
|
92
|
-
* Passed to PlaywrightBrowserPool.
|
|
93
|
-
* @default 2
|
|
94
|
-
*/
|
|
95
|
-
maxBrowsers?: number;
|
|
96
|
-
/**
|
|
97
|
-
* Maximum number of pages per browser context before recycling.
|
|
98
|
-
* Passed to PlaywrightBrowserPool.
|
|
99
|
-
* @default 6
|
|
100
|
-
*/
|
|
101
|
-
maxPagesPerContext?: number;
|
|
102
|
-
/**
|
|
103
|
-
* Maximum age in ms a browser instance lives before recycling.
|
|
104
|
-
* Passed to PlaywrightBrowserPool.
|
|
105
|
-
* @default 1200000 (20 minutes)
|
|
106
|
-
*/
|
|
107
|
-
maxBrowserAge?: number;
|
|
108
|
-
/**
|
|
109
|
-
* How often (in ms) the pool checks browser health.
|
|
110
|
-
* Passed to PlaywrightBrowserPool.
|
|
111
|
-
* @default 60000 (1 minute)
|
|
112
|
-
*/
|
|
113
|
-
healthCheckInterval?: number;
|
|
114
|
-
/**
|
|
115
|
-
* List of domain glob patterns to block requests to. Overrides pool default.
|
|
116
|
-
* Passed to PlaywrightBrowserPool.
|
|
117
|
-
* @default [] (uses pool's defaults)
|
|
118
|
-
*/
|
|
119
|
-
poolBlockedDomains?: string[];
|
|
120
|
-
/**
|
|
121
|
-
* List of Playwright resource types (e.g., 'image', 'font') to block. Overrides pool default.
|
|
122
|
-
* Passed to PlaywrightBrowserPool.
|
|
123
|
-
* @default [] (uses pool's defaults)
|
|
124
|
-
*/
|
|
125
|
-
poolBlockedResourceTypes?: string[];
|
|
126
|
-
/**
|
|
127
|
-
* Proxy configuration for browser instances.
|
|
128
|
-
* Passed to PlaywrightBrowserPool.
|
|
129
|
-
* @default undefined
|
|
130
|
-
*/
|
|
131
|
-
proxy?: {
|
|
132
|
-
/** Proxy server URL (e.g., "http://host:port", "socks5://user:pass@host:port"). */
|
|
133
|
-
server: string;
|
|
134
|
-
/** Optional proxy username. */
|
|
135
|
-
username?: string;
|
|
136
|
-
/** Optional proxy password. */
|
|
137
|
-
password?: string;
|
|
138
|
-
};
|
|
139
|
-
/**
|
|
140
|
-
* Forces the entire pool to launch browsers in headed (visible) mode.
|
|
141
|
-
* Passed to PlaywrightBrowserPool.
|
|
142
|
-
* @default false
|
|
143
|
-
*/
|
|
144
|
-
useHeadedMode?: boolean;
|
|
145
|
-
/**
|
|
146
|
-
* If true, the fetched HTML content will be converted to Markdown.
|
|
147
|
-
* @default false
|
|
148
|
-
*/
|
|
149
|
-
markdown?: boolean;
|
|
150
|
-
}
|
|
151
|
-
/**
|
|
152
|
-
* Options that can be passed per-request to engine.fetchHTML().
|
|
153
|
-
*/
|
|
154
|
-
export interface FetchOptions {
|
|
155
|
-
/** Overrides the engine's defaultFastMode for this specific request. (Playwright/Hybrid only) */
|
|
156
|
-
fastMode?: boolean;
|
|
157
|
-
/** Overrides the engine's markdown setting for this specific request. (Playwright/Hybrid only) */
|
|
158
|
-
markdown?: boolean;
|
|
159
|
-
}
|
|
160
|
-
/**
|
|
161
|
-
* Configuration options specifically for the FetchEngine.
|
|
162
|
-
*/
|
|
163
|
-
export interface FetchEngineOptions {
|
|
164
|
-
/** If true, convert the fetched HTML to Markdown. Default: false */
|
|
165
|
-
markdown?: boolean;
|
|
166
|
-
}
|
|
167
|
-
//# sourceMappingURL=types.d.ts.map
|
package/dist/types.d.ts.map
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,OAAO,IAAI,iBAAiB,EAAE,cAAc,EAAE,MAAM,YAAY,CAAC;AAE/E;;GAEG;AACH,MAAM,WAAW,eAAe;IAC9B,kEAAkE;IAClE,OAAO,EAAE,MAAM,CAAC;IAChB,4DAA4D;IAC5D,WAAW,EAAE,MAAM,GAAG,UAAU,CAAC;IACjC,qDAAqD;IACrD,KAAK,EAAE,MAAM,GAAG,IAAI,CAAC;IACrB,yCAAyC;IACzC,GAAG,EAAE,MAAM,CAAC;IACZ,mDAAmD;IACnD,WAAW,EAAE,OAAO,CAAC;IACrB,kDAAkD;IAClD,UAAU,EAAE,MAAM,GAAG,SAAS,CAAC;IAC/B,sDAAsD;IACtD,KAAK,EAAE,KAAK,GAAG,SAAS,CAAC;CAC1B;AAED;;GAEG;AACH,MAAM,WAAW,cAAc;IAC7B,EAAE,EAAE,MAAM,CAAC;IACX,MAAM,CAAC,EAAE,YAAY,GAAG,MAAM,CAAC;IAC/B,YAAY,EAAE,MAAM,CAAC;IACrB,WAAW,EAAE,MAAM,CAAC;IACpB,QAAQ,EAAE,IAAI,CAAC;IACf,MAAM,EAAE,MAAM,CAAC;IACf,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,SAAS,EAAE,IAAI,CAAC;IAChB,SAAS,EAAE,OAAO,CAAC;CACpB;AAED;;GAEG;AACH,MAAM,WAAW,eAAe;IAC9B,OAAO,EAAE,iBAAiB,CAAC;IAC3B,OAAO,EAAE,cAAc,CAAC;IACxB,OAAO,EAAE,cAAc,CAAC;IACxB,SAAS,EAAE,OAAO,CAAC;CACpB;AAKD;;GAEG;AACH,MAAM,WAAW,sBAAsB;IACrC;;;OAGG;IACH,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB;;;OAGG;IACH,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB;;;OAGG;IACH,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB;;;OAGG;IACH,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB;;;OAGG;IACH,eAAe,CAAC,EAAE,OAAO,CAAC;IAC1B;;;OAGG;IACH,qBAAqB,CAAC,EAAE,OAAO,CAAC;IAChC;;;;OAIG;IACH,eAAe,CAAC,EAAE,OAAO,CAAC;IAC1B;;;OAGG;IACH,qBAAqB,CAAC,EAAE,OAAO,CAAC;IAIhC;;;;OAIG;IACH,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB;;;;OAIG;IACH,kBAAkB,CAAC,EAAE,MAAM,CAAC;IAC5B;;;;OAIG;IACH,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB;;;;OAIG;IACH,mBAAmB,CAAC,EAAE,MAAM,CAAC;IAC7B;;;;OAIG;IACH,kBAAkB,CAAC,EAAE,MAAM,EAAE,CAAC;IAC9B;;;;OAIG;IACH,wBAAwB,CAAC,EAAE,MAAM,EAAE,CAAC;IACpC;;;;OAIG;IACH,KAAK,CAAC,EAAE;QACN,mFAAmF;QACnF,MAAM,EAAE,MAAM,CAAC;QACf,+BAA+B;QAC/B,QAAQ,CAAC,EAAE,MAAM,CAAC;QAClB,+BAA+B;QAC/B,QAAQ,CAAC,EAAE,MAAM,CAAC;KACnB,CAAC;IACF;;;;OAIG;IACH,aAAa,CAAC,EAAE,OAAO,CAAC;IACxB;;;OAGG;IACH,QAAQ,CAAC,EAAE,OAAO,CAAC;CACpB;AAED;;GAEG;AACH,MAAM,WAAW,YAAY;IAC3B,iGAAiG;IACjG,QAAQ,CAAC,EAAE,OAAO,CAAC;IACnB,kGAAkG;IAClG
,QAAQ,CAAC,EAAE,OAAO,CAAC;CACpB;AAED;;GAEG;AACH,MAAM,WAAW,kBAAkB;IACjC,oEAAoE;IACpE,QAAQ,CAAC,EAAE,OAAO,CAAC;CAEpB"}
|
package/dist/types.js
DELETED
package/dist/types.js.map
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"types.js","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":""}
|
package/dist/utils/markdown-converter.d.ts
DELETED
|
@@ -1,31 +0,0 @@
|
|
|
1
|
-
export interface ConversionOptions {
|
|
2
|
-
/** Maximum length of the final Markdown content. Defaults to Infinity. */
|
|
3
|
-
maxContentLength?: number;
|
|
4
|
-
}
|
|
5
|
-
export declare class MarkdownConverter {
|
|
6
|
-
private turndownService;
|
|
7
|
-
constructor();
|
|
8
|
-
/**
|
|
9
|
-
* Converts HTML string to Markdown.
|
|
10
|
-
* @param html The HTML string to convert.
|
|
11
|
-
* @param options Conversion options.
|
|
12
|
-
* @returns The converted Markdown string.
|
|
13
|
-
*/
|
|
14
|
-
convert(html: string, options?: ConversionOptions): string;
|
|
15
|
-
private setupPrioritizedRules;
|
|
16
|
-
private addContentExtractionRules;
|
|
17
|
-
private addStructureRules;
|
|
18
|
-
private addBlockRules;
|
|
19
|
-
private addInlineRules;
|
|
20
|
-
private preprocessHTML;
|
|
21
|
-
private cleanupHtml;
|
|
22
|
-
private cleanupContentHtml;
|
|
23
|
-
private removeHighLinkDensityElements;
|
|
24
|
-
private extractDocumentMetadata;
|
|
25
|
-
private detectForumPage;
|
|
26
|
-
private extractArticleContentElement;
|
|
27
|
-
private extractForumContentElement;
|
|
28
|
-
private hasHighLinkDensity;
|
|
29
|
-
private postprocessMarkdown;
|
|
30
|
-
}
|
|
31
|
-
//# sourceMappingURL=markdown-converter.d.ts.map
|
package/dist/utils/markdown-converter.d.ts.map
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"markdown-converter.d.ts","sourceRoot":"","sources":["../../src/utils/markdown-converter.ts"],"names":[],"mappings":"AAmEA,MAAM,WAAW,iBAAiB;IAChC,0EAA0E;IAC1E,gBAAgB,CAAC,EAAE,MAAM,CAAC;CAC3B;AASD,qBAAa,iBAAiB;IAC5B,OAAO,CAAC,eAAe,CAAkB;;IA+BzC;;;;;OAKG;IACI,OAAO,CAAC,IAAI,EAAE,MAAM,EAAE,OAAO,GAAE,iBAAsB,GAAG,MAAM;IAerE,OAAO,CAAC,qBAAqB;IAS7B,OAAO,CAAC,yBAAyB;IA4CjC,OAAO,CAAC,iBAAiB;IAiBzB,OAAO,CAAC,aAAa;IAmErB,OAAO,CAAC,cAAc;IAqLtB,OAAO,CAAC,cAAc;IAmDtB,OAAO,CAAC,WAAW;IAanB,OAAO,CAAC,kBAAkB;IAqB1B,OAAO,CAAC,6BAA6B;IAkDrC,OAAO,CAAC,uBAAuB;IAwE/B,OAAO,CAAC,eAAe;IAoDvB,OAAO,CAAC,4BAA4B;IA8DpC,OAAO,CAAC,0BAA0B;IAqElC,OAAO,CAAC,kBAAkB;IAwB1B,OAAO,CAAC,mBAAmB;CAkD5B"}
|