@staticn0va/wigolo 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +74 -0
- package/README.md +272 -0
- package/dist/cache/db.d.ts +5 -0
- package/dist/cache/db.d.ts.map +1 -0
- package/dist/cache/db.js +97 -0
- package/dist/cache/db.js.map +1 -0
- package/dist/cache/store.d.ts +26 -0
- package/dist/cache/store.d.ts.map +1 -0
- package/dist/cache/store.js +214 -0
- package/dist/cache/store.js.map +1 -0
- package/dist/cli/daemon.d.ts +2 -0
- package/dist/cli/daemon.d.ts.map +1 -0
- package/dist/cli/daemon.js +5 -0
- package/dist/cli/daemon.js.map +1 -0
- package/dist/cli/health.d.ts +2 -0
- package/dist/cli/health.d.ts.map +1 -0
- package/dist/cli/health.js +5 -0
- package/dist/cli/health.js.map +1 -0
- package/dist/cli/index.d.ts +7 -0
- package/dist/cli/index.d.ts.map +1 -0
- package/dist/cli/index.js +9 -0
- package/dist/cli/index.js.map +1 -0
- package/dist/cli/warmup.d.ts +11 -0
- package/dist/cli/warmup.d.ts.map +1 -0
- package/dist/cli/warmup.js +107 -0
- package/dist/cli/warmup.js.map +1 -0
- package/dist/config.d.ts +41 -0
- package/dist/config.d.ts.map +1 -0
- package/dist/config.js +66 -0
- package/dist/config.js.map +1 -0
- package/dist/crawl/crawler.d.ts +18 -0
- package/dist/crawl/crawler.d.ts.map +1 -0
- package/dist/crawl/crawler.js +228 -0
- package/dist/crawl/crawler.js.map +1 -0
- package/dist/crawl/dedup.d.ts +15 -0
- package/dist/crawl/dedup.d.ts.map +1 -0
- package/dist/crawl/dedup.js +93 -0
- package/dist/crawl/dedup.js.map +1 -0
- package/dist/crawl/mapper.d.ts +17 -0
- package/dist/crawl/mapper.d.ts.map +1 -0
- package/dist/crawl/mapper.js +178 -0
- package/dist/crawl/mapper.js.map +1 -0
- package/dist/crawl/rate-limiter.d.ts +10 -0
- package/dist/crawl/rate-limiter.d.ts.map +1 -0
- package/dist/crawl/rate-limiter.js +72 -0
- package/dist/crawl/rate-limiter.js.map +1 -0
- package/dist/crawl/robots.d.ts +9 -0
- package/dist/crawl/robots.d.ts.map +1 -0
- package/dist/crawl/robots.js +63 -0
- package/dist/crawl/robots.js.map +1 -0
- package/dist/crawl/sitemap.d.ts +4 -0
- package/dist/crawl/sitemap.d.ts.map +1 -0
- package/dist/crawl/sitemap.js +38 -0
- package/dist/crawl/sitemap.js.map +1 -0
- package/dist/crawl/url-utils.d.ts +3 -0
- package/dist/crawl/url-utils.d.ts.map +1 -0
- package/dist/crawl/url-utils.js +41 -0
- package/dist/crawl/url-utils.js.map +1 -0
- package/dist/extraction/defuddle.d.ts +3 -0
- package/dist/extraction/defuddle.d.ts.map +1 -0
- package/dist/extraction/defuddle.js +26 -0
- package/dist/extraction/defuddle.js.map +1 -0
- package/dist/extraction/extract.d.ts +5 -0
- package/dist/extraction/extract.d.ts.map +1 -0
- package/dist/extraction/extract.js +83 -0
- package/dist/extraction/extract.js.map +1 -0
- package/dist/extraction/jsonld.d.ts +4 -0
- package/dist/extraction/jsonld.d.ts.map +1 -0
- package/dist/extraction/jsonld.js +64 -0
- package/dist/extraction/jsonld.js.map +1 -0
- package/dist/extraction/markdown.d.ts +10 -0
- package/dist/extraction/markdown.d.ts.map +1 -0
- package/dist/extraction/markdown.js +107 -0
- package/dist/extraction/markdown.js.map +1 -0
- package/dist/extraction/pipeline.d.ts +11 -0
- package/dist/extraction/pipeline.d.ts.map +1 -0
- package/dist/extraction/pipeline.js +95 -0
- package/dist/extraction/pipeline.js.map +1 -0
- package/dist/extraction/readability.d.ts +3 -0
- package/dist/extraction/readability.d.ts.map +1 -0
- package/dist/extraction/readability.js +32 -0
- package/dist/extraction/readability.js.map +1 -0
- package/dist/extraction/schema.d.ts +7 -0
- package/dist/extraction/schema.d.ts.map +1 -0
- package/dist/extraction/schema.js +86 -0
- package/dist/extraction/schema.js.map +1 -0
- package/dist/extraction/site-extractors/docs-generic.d.ts +3 -0
- package/dist/extraction/site-extractors/docs-generic.d.ts.map +1 -0
- package/dist/extraction/site-extractors/docs-generic.js +104 -0
- package/dist/extraction/site-extractors/docs-generic.js.map +1 -0
- package/dist/extraction/site-extractors/github.d.ts +3 -0
- package/dist/extraction/site-extractors/github.d.ts.map +1 -0
- package/dist/extraction/site-extractors/github.js +107 -0
- package/dist/extraction/site-extractors/github.js.map +1 -0
- package/dist/extraction/site-extractors/mdn.d.ts +3 -0
- package/dist/extraction/site-extractors/mdn.d.ts.map +1 -0
- package/dist/extraction/site-extractors/mdn.js +58 -0
- package/dist/extraction/site-extractors/mdn.js.map +1 -0
- package/dist/extraction/site-extractors/stackoverflow.d.ts +3 -0
- package/dist/extraction/site-extractors/stackoverflow.d.ts.map +1 -0
- package/dist/extraction/site-extractors/stackoverflow.js +88 -0
- package/dist/extraction/site-extractors/stackoverflow.js.map +1 -0
- package/dist/extraction/trafilatura.d.ts +6 -0
- package/dist/extraction/trafilatura.d.ts.map +1 -0
- package/dist/extraction/trafilatura.js +105 -0
- package/dist/extraction/trafilatura.js.map +1 -0
- package/dist/fetch/auth.d.ts +8 -0
- package/dist/fetch/auth.d.ts.map +1 -0
- package/dist/fetch/auth.js +32 -0
- package/dist/fetch/auth.js.map +1 -0
- package/dist/fetch/browser-pool.d.ts +28 -0
- package/dist/fetch/browser-pool.d.ts.map +1 -0
- package/dist/fetch/browser-pool.js +138 -0
- package/dist/fetch/browser-pool.js.map +1 -0
- package/dist/fetch/content-check.d.ts +2 -0
- package/dist/fetch/content-check.d.ts.map +1 -0
- package/dist/fetch/content-check.js +62 -0
- package/dist/fetch/content-check.js.map +1 -0
- package/dist/fetch/http-client.d.ts +15 -0
- package/dist/fetch/http-client.d.ts.map +1 -0
- package/dist/fetch/http-client.js +146 -0
- package/dist/fetch/http-client.js.map +1 -0
- package/dist/fetch/router.d.ts +45 -0
- package/dist/fetch/router.d.ts.map +1 -0
- package/dist/fetch/router.js +89 -0
- package/dist/fetch/router.js.map +1 -0
- package/dist/index.d.ts +3 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +22 -0
- package/dist/index.js.map +1 -0
- package/dist/logger.d.ts +10 -0
- package/dist/logger.d.ts.map +1 -0
- package/dist/logger.js +39 -0
- package/dist/logger.js.map +1 -0
- package/dist/search/dedup.d.ts +10 -0
- package/dist/search/dedup.d.ts.map +1 -0
- package/dist/search/dedup.js +35 -0
- package/dist/search/dedup.js.map +1 -0
- package/dist/search/engines/bing.d.ts +7 -0
- package/dist/search/engines/bing.d.ts.map +1 -0
- package/dist/search/engines/bing.js +48 -0
- package/dist/search/engines/bing.js.map +1 -0
- package/dist/search/engines/duckduckgo.d.ts +7 -0
- package/dist/search/engines/duckduckgo.d.ts.map +1 -0
- package/dist/search/engines/duckduckgo.js +50 -0
- package/dist/search/engines/duckduckgo.js.map +1 -0
- package/dist/search/engines/startpage.d.ts +7 -0
- package/dist/search/engines/startpage.d.ts.map +1 -0
- package/dist/search/engines/startpage.js +50 -0
- package/dist/search/engines/startpage.js.map +1 -0
- package/dist/search/filters.d.ts +16 -0
- package/dist/search/filters.d.ts.map +1 -0
- package/dist/search/filters.js +63 -0
- package/dist/search/filters.js.map +1 -0
- package/dist/search/flashrank.d.ts +12 -0
- package/dist/search/flashrank.d.ts.map +1 -0
- package/dist/search/flashrank.js +63 -0
- package/dist/search/flashrank.js.map +1 -0
- package/dist/search/query.d.ts +2 -0
- package/dist/search/query.d.ts.map +1 -0
- package/dist/search/query.js +41 -0
- package/dist/search/query.js.map +1 -0
- package/dist/search/rerank.d.ts +3 -0
- package/dist/search/rerank.d.ts.map +1 -0
- package/dist/search/rerank.js +40 -0
- package/dist/search/rerank.js.map +1 -0
- package/dist/search/searxng.d.ts +8 -0
- package/dist/search/searxng.d.ts.map +1 -0
- package/dist/search/searxng.js +87 -0
- package/dist/search/searxng.js.map +1 -0
- package/dist/search/validator.d.ts +6 -0
- package/dist/search/validator.d.ts.map +1 -0
- package/dist/search/validator.js +35 -0
- package/dist/search/validator.js.map +1 -0
- package/dist/searxng/bootstrap.d.ts +18 -0
- package/dist/searxng/bootstrap.d.ts.map +1 -0
- package/dist/searxng/bootstrap.js +136 -0
- package/dist/searxng/bootstrap.js.map +1 -0
- package/dist/searxng/docker.d.ts +9 -0
- package/dist/searxng/docker.d.ts.map +1 -0
- package/dist/searxng/docker.js +67 -0
- package/dist/searxng/docker.js.map +1 -0
- package/dist/searxng/process.d.ts +23 -0
- package/dist/searxng/process.d.ts.map +1 -0
- package/dist/searxng/process.js +188 -0
- package/dist/searxng/process.js.map +1 -0
- package/dist/server.d.ts +2 -0
- package/dist/server.d.ts.map +1 -0
- package/dist/server.js +311 -0
- package/dist/server.js.map +1 -0
- package/dist/tools/cache.d.ts +3 -0
- package/dist/tools/cache.d.ts.map +1 -0
- package/dist/tools/cache.js +50 -0
- package/dist/tools/cache.js.map +1 -0
- package/dist/tools/crawl.d.ts +6 -0
- package/dist/tools/crawl.d.ts.map +1 -0
- package/dist/tools/crawl.js +97 -0
- package/dist/tools/crawl.js.map +1 -0
- package/dist/tools/extract.d.ts +4 -0
- package/dist/tools/extract.d.ts.map +1 -0
- package/dist/tools/extract.js +69 -0
- package/dist/tools/extract.js.map +1 -0
- package/dist/tools/fetch.d.ts +4 -0
- package/dist/tools/fetch.d.ts.map +1 -0
- package/dist/tools/fetch.js +76 -0
- package/dist/tools/fetch.js.map +1 -0
- package/dist/tools/search.d.ts +4 -0
- package/dist/tools/search.d.ts.map +1 -0
- package/dist/tools/search.js +160 -0
- package/dist/tools/search.js.map +1 -0
- package/dist/types.d.ts +222 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +2 -0
- package/dist/types.js.map +1 -0
- package/package.json +61 -0
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
import { getConfig } from '../config.js';
|
|
2
|
+
import { createLogger } from '../logger.js';
|
|
3
|
+
// HTTP status codes that fetchWithRedirects reports as a retryable HttpFetchError.
const RETRYABLE_STATUSES = new Set([429, 502, 503]);
|
|
4
|
+
// Values of an error's `code` property that are classified as retryable connection failures.
const RETRYABLE_ERROR_CODES = new Set(['ECONNRESET', 'ETIMEDOUT', 'ECONNREFUSED']);
|
|
5
|
+
// HTTP status codes followed manually via the Location header (requests use redirect: 'manual').
// 303 See Other is included: without it such a response would be returned to the caller as the
// final page instead of being followed.
const REDIRECT_STATUSES = new Set([301, 302, 303, 307, 308]);
|
|
6
|
+
// Pool of browser User-Agent strings; getRotatingUserAgent picks one at random
// when the configuration does not supply its own userAgent.
const DEFAULT_USER_AGENTS = [
|
|
7
|
+
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
|
|
8
|
+
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
|
|
9
|
+
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
|
|
10
|
+
];
|
|
11
|
+
function getRotatingUserAgent(config) {
|
|
12
|
+
if (config.userAgent)
|
|
13
|
+
return config.userAgent;
|
|
14
|
+
return DEFAULT_USER_AGENTS[Math.floor(Math.random() * DEFAULT_USER_AGENTS.length)];
|
|
15
|
+
}
|
|
16
|
+
/**
 * Decide whether a thrown value represents a transient failure worth retrying.
 *
 * An error is retryable when it, or any error reachable through its `cause`
 * chain, either carries a `code` listed in RETRYABLE_ERROR_CODES or is a
 * timeout (`name === 'TimeoutError'`, which is what AbortSignal.timeout raises).
 *
 * The `cause` chain is inspected because Node's built-in fetch wraps socket
 * failures: it rejects with TypeError('fetch failed') and attaches the system
 * error (with its `code`) as `cause`, so checking only the outer error would
 * never match a connection error.
 *
 * @param err The caught value; anything that is not an Error is not retryable.
 * @returns true when the failure looks transient, false otherwise.
 */
function isRetryableError(err) {
    // Start the walk from the raw value so nested causes can be followed.
    let link = err;
    if (!(err instanceof Error))
        return false;
    // Bounded walk: protects against self-referencing or very deep cause chains.
    const maxDepth = 5;
    for (let depth = 0; depth < maxDepth; depth++) {
        if (link === null || typeof link !== 'object')
            break;
        const code = link.code;
        if (code && RETRYABLE_ERROR_CODES.has(code))
            return true;
        // AbortSignal timeout throws DOMException with name TimeoutError
        if (link.name === 'TimeoutError')
            return true;
        link = link.cause;
    }
    return false;
}
|
|
27
|
+
/**
 * Compute the wait before a retry: exponential backoff plus random jitter.
 *
 * @param attempt Zero-based retry index (0 for the first retry).
 * @returns Delay in milliseconds: 500 * 2^attempt, plus up to 500 ms of jitter.
 */
function backoffMs(attempt) {
    const exponential = 500 * 2 ** attempt;
    const jitter = Math.random() * 500;
    return exponential + jitter;
}
|
|
30
|
+
/**
 * Promise-based delay.
 *
 * @param ms Number of milliseconds handed to setTimeout.
 * @returns A promise that resolves (with no value) once the timer fires.
 */
function sleep(ms) {
    return new Promise((wake) => {
        setTimeout(wake, ms);
    });
}
|
|
33
|
+
export async function httpFetch(url, options = {}) {
|
|
34
|
+
const config = getConfig();
|
|
35
|
+
const logger = createLogger('fetch');
|
|
36
|
+
const maxRetries = config.fetchMaxRetries;
|
|
37
|
+
const timeoutMs = options.timeoutMs ?? config.fetchTimeoutMs;
|
|
38
|
+
const maxRedirects = config.maxRedirects;
|
|
39
|
+
let lastError;
|
|
40
|
+
for (let attempt = 0; attempt <= maxRetries; attempt++) {
|
|
41
|
+
if (attempt > 0) {
|
|
42
|
+
const delay = backoffMs(attempt - 1);
|
|
43
|
+
logger.debug('retrying after backoff', { attempt, delayMs: delay, url });
|
|
44
|
+
await sleep(delay);
|
|
45
|
+
}
|
|
46
|
+
try {
|
|
47
|
+
const result = await fetchWithRedirects(url, options, timeoutMs, maxRedirects, logger);
|
|
48
|
+
return result;
|
|
49
|
+
}
|
|
50
|
+
catch (err) {
|
|
51
|
+
lastError = err;
|
|
52
|
+
if (err instanceof HttpFetchError && !err.retryable) {
|
|
53
|
+
throw err;
|
|
54
|
+
}
|
|
55
|
+
const retryable = err instanceof HttpFetchError ? err.retryable : isRetryableError(err);
|
|
56
|
+
if (!retryable || attempt >= maxRetries) {
|
|
57
|
+
throw err;
|
|
58
|
+
}
|
|
59
|
+
logger.warn('fetch failed, will retry', {
|
|
60
|
+
attempt,
|
|
61
|
+
url,
|
|
62
|
+
error: err instanceof Error ? err.message : String(err),
|
|
63
|
+
});
|
|
64
|
+
}
|
|
65
|
+
}
|
|
66
|
+
throw lastError;
|
|
67
|
+
}
|
|
68
|
+
/**
 * Error raised by the HTTP fetch layer that records whether retrying the
 * request could succeed.
 */
class HttpFetchError extends Error {
    /** true when the failure is transient and the request may be retried. */
    retryable;
    /** Fixed error name, so instances are recognisable in logs. */
    name = 'HttpFetchError';
    /**
     * @param message   Human-readable description of the failure.
     * @param retryable Whether the caller may retry the request.
     */
    constructor(message, retryable) {
        super(message);
        this.retryable = retryable;
    }
}
|
|
76
|
+
async function fetchWithRedirects(originalUrl, options, timeoutMs, maxRedirects, logger) {
|
|
77
|
+
const visited = new Set();
|
|
78
|
+
let currentUrl = originalUrl;
|
|
79
|
+
let redirectCount = 0;
|
|
80
|
+
while (true) {
|
|
81
|
+
if (visited.has(currentUrl)) {
|
|
82
|
+
throw new HttpFetchError(`Redirect loop detected at ${currentUrl}`, false);
|
|
83
|
+
}
|
|
84
|
+
visited.add(currentUrl);
|
|
85
|
+
logger.debug('fetching', { url: currentUrl, attempt: redirectCount });
|
|
86
|
+
const signal = AbortSignal.timeout(timeoutMs);
|
|
87
|
+
let response;
|
|
88
|
+
try {
|
|
89
|
+
const ua = getRotatingUserAgent(getConfig());
|
|
90
|
+
const mergedHeaders = { 'User-Agent': ua, ...options.headers };
|
|
91
|
+
response = await fetch(currentUrl, {
|
|
92
|
+
headers: mergedHeaders,
|
|
93
|
+
redirect: 'manual',
|
|
94
|
+
signal,
|
|
95
|
+
});
|
|
96
|
+
}
|
|
97
|
+
catch (err) {
|
|
98
|
+
const isTimeout = err instanceof Error && err.name === 'TimeoutError';
|
|
99
|
+
const isConnErr = err instanceof Error && RETRYABLE_ERROR_CODES.has(err.code ?? '');
|
|
100
|
+
const retryable = isTimeout || isConnErr;
|
|
101
|
+
throw Object.assign(err instanceof Error ? err : new Error(String(err)), { retryable });
|
|
102
|
+
}
|
|
103
|
+
if (REDIRECT_STATUSES.has(response.status)) {
|
|
104
|
+
const location = response.headers.get('location');
|
|
105
|
+
if (!location) {
|
|
106
|
+
throw new HttpFetchError(`Redirect with no location header at ${currentUrl}`, false);
|
|
107
|
+
}
|
|
108
|
+
redirectCount++;
|
|
109
|
+
if (redirectCount > maxRedirects) {
|
|
110
|
+
throw new HttpFetchError(`Too many redirects (>${maxRedirects}) from ${originalUrl}`, false);
|
|
111
|
+
}
|
|
112
|
+
// Resolve relative redirects
|
|
113
|
+
currentUrl = new URL(location, currentUrl).toString();
|
|
114
|
+
continue;
|
|
115
|
+
}
|
|
116
|
+
if (RETRYABLE_STATUSES.has(response.status)) {
|
|
117
|
+
throw new HttpFetchError(`HTTP ${response.status} from ${currentUrl}`, true);
|
|
118
|
+
}
|
|
119
|
+
const contentType = response.headers.get('content-type') ?? '';
|
|
120
|
+
const headers = {};
|
|
121
|
+
response.headers.forEach((value, key) => {
|
|
122
|
+
headers[key] = value;
|
|
123
|
+
});
|
|
124
|
+
const isPdf = contentType.includes('application/pdf');
|
|
125
|
+
let html;
|
|
126
|
+
let rawBuffer;
|
|
127
|
+
if (isPdf) {
|
|
128
|
+
const arrayBuf = await response.arrayBuffer();
|
|
129
|
+
rawBuffer = Buffer.from(arrayBuf);
|
|
130
|
+
html = '';
|
|
131
|
+
}
|
|
132
|
+
else {
|
|
133
|
+
html = await response.text();
|
|
134
|
+
}
|
|
135
|
+
return {
|
|
136
|
+
url: originalUrl,
|
|
137
|
+
finalUrl: currentUrl,
|
|
138
|
+
html,
|
|
139
|
+
contentType,
|
|
140
|
+
statusCode: response.status,
|
|
141
|
+
headers,
|
|
142
|
+
rawBuffer,
|
|
143
|
+
};
|
|
144
|
+
}
|
|
145
|
+
}
|
|
146
|
+
//# sourceMappingURL=http-client.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"http-client.js","sourceRoot":"","sources":["../../src/fetch/http-client.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,MAAM,cAAc,CAAC;AACzC,OAAO,EAAE,YAAY,EAAE,MAAM,cAAc,CAAC;AAiB5C,MAAM,kBAAkB,GAAG,IAAI,GAAG,CAAC,CAAC,GAAG,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,CAAC;AACpD,MAAM,qBAAqB,GAAG,IAAI,GAAG,CAAC,CAAC,YAAY,EAAE,WAAW,EAAE,cAAc,CAAC,CAAC,CAAC;AACnF,MAAM,iBAAiB,GAAG,IAAI,GAAG,CAAC,CAAC,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,CAAC;AAExD,MAAM,mBAAmB,GAAG;IAC1B,uHAAuH;IACvH,iHAAiH;IACjH,uGAAuG;CACxG,CAAC;AAEF,SAAS,oBAAoB,CAAC,MAAqC;IACjE,IAAI,MAAM,CAAC,SAAS;QAAE,OAAO,MAAM,CAAC,SAAS,CAAC;IAC9C,OAAO,mBAAmB,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,MAAM,EAAE,GAAG,mBAAmB,CAAC,MAAM,CAAC,CAAC,CAAC;AACrF,CAAC;AAED,SAAS,gBAAgB,CAAC,GAAY;IACpC,IAAI,GAAG,YAAY,KAAK,EAAE,CAAC;QACzB,MAAM,IAAI,GAAI,GAA6B,CAAC,IAAI,CAAC;QACjD,IAAI,IAAI,IAAI,qBAAqB,CAAC,GAAG,CAAC,IAAI,CAAC;YAAE,OAAO,IAAI,CAAC;QACzD,iEAAiE;QACjE,IAAI,GAAG,CAAC,IAAI,KAAK,cAAc;YAAE,OAAO,IAAI,CAAC;IAC/C,CAAC;IACD,OAAO,KAAK,CAAC;AACf,CAAC;AAED,SAAS,SAAS,CAAC,OAAe;IAChC,OAAO,GAAG,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,OAAO,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,GAAG,GAAG,CAAC;AAC1D,CAAC;AAED,SAAS,KAAK,CAAC,EAAU;IACvB,OAAO,IAAI,OAAO,CAAC,CAAC,OAAO,EAAE,EAAE,CAAC,UAAU,CAAC,OAAO,EAAE,EAAE,CAAC,CAAC,CAAC;AAC3D,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,SAAS,CAAC,GAAW,EAAE,UAA4B,EAAE;IACzE,MAAM,MAAM,GAAG,SAAS,EAAE,CAAC;IAC3B,MAAM,MAAM,GAAG,YAAY,CAAC,OAAO,CAAC,CAAC;IACrC,MAAM,UAAU,GAAG,MAAM,CAAC,eAAe,CAAC;IAC1C,MAAM,SAAS,GAAG,OAAO,CAAC,SAAS,IAAI,MAAM,CAAC,cAAc,CAAC;IAC7D,MAAM,YAAY,GAAG,MAAM,CAAC,YAAY,CAAC;IAEzC,IAAI,SAAkB,CAAC;IAEvB,KAAK,IAAI,OAAO,GAAG,CAAC,EAAE,OAAO,IAAI,UAAU,EAAE,OAAO,EAAE,EAAE,CAAC;QACvD,IAAI,OAAO,GAAG,CAAC,EAAE,CAAC;YAChB,MAAM,KAAK,GAAG,SAAS,CAAC,OAAO,GAAG,CAAC,CAAC,CAAC;YACrC,MAAM,CAAC,KAAK,CAAC,wBAAwB,EAAE,EAAE,OAAO,EAAE,OAAO,EAAE,KAAK,EAAE,GAAG,EAAE,CAAC,CAAC;YACzE,MAAM,KAAK,CAAC,KAAK,CAAC,CAAC;QACrB,CAAC;QAED,IAAI,CAAC;YACH,MAAM,MAAM,GAAG,MAAM,kBAAkB,CAAC,GAAG,EAAE,OAAO,EAAE,SAAS,EAAE,YAAY,EAAE,MAAM,CAAC,CAAC;
YACvF,OAAO,MAAM,CAAC;QAChB,CAAC;QAAC,OAAO,GAAG,EAAE,CAAC;YACb,SAAS,GAAG,GAAG,CAAC;YAEhB,IAAI,GAAG,YAAY,cAAc,IAAI,CAAC,GAAG,CAAC,SAAS,EAAE,CAAC;gBACpD,MAAM,GAAG,CAAC;YACZ,CAAC;YAED,MAAM,SAAS,GAAG,GAAG,YAAY,cAAc,CAAC,CAAC,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC,CAAC,gBAAgB,CAAC,GAAG,CAAC,CAAC;YAExF,IAAI,CAAC,SAAS,IAAI,OAAO,IAAI,UAAU,EAAE,CAAC;gBACxC,MAAM,GAAG,CAAC;YACZ,CAAC;YAED,MAAM,CAAC,IAAI,CAAC,0BAA0B,EAAE;gBACtC,OAAO;gBACP,GAAG;gBACH,KAAK,EAAE,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC;aACxD,CAAC,CAAC;QACL,CAAC;IACH,CAAC;IAED,MAAM,SAAS,CAAC;AAClB,CAAC;AAED,MAAM,cAAe,SAAQ,KAAK;IACa;IAA7C,YAAY,OAAe,EAAkB,SAAkB;QAC7D,KAAK,CAAC,OAAO,CAAC,CAAC;QAD4B,cAAS,GAAT,SAAS,CAAS;QAE7D,IAAI,CAAC,IAAI,GAAG,gBAAgB,CAAC;IAC/B,CAAC;CACF;AAED,KAAK,UAAU,kBAAkB,CAC/B,WAAmB,EACnB,OAAyB,EACzB,SAAiB,EACjB,YAAoB,EACpB,MAAuC;IAEvC,MAAM,OAAO,GAAG,IAAI,GAAG,EAAU,CAAC;IAClC,IAAI,UAAU,GAAG,WAAW,CAAC;IAC7B,IAAI,aAAa,GAAG,CAAC,CAAC;IAEtB,OAAO,IAAI,EAAE,CAAC;QACZ,IAAI,OAAO,CAAC,GAAG,CAAC,UAAU,CAAC,EAAE,CAAC;YAC5B,MAAM,IAAI,cAAc,CAAC,6BAA6B,UAAU,EAAE,EAAE,KAAK,CAAC,CAAC;QAC7E,CAAC;QACD,OAAO,CAAC,GAAG,CAAC,UAAU,CAAC,CAAC;QAExB,MAAM,CAAC,KAAK,CAAC,UAAU,EAAE,EAAE,GAAG,EAAE,UAAU,EAAE,OAAO,EAAE,aAAa,EAAE,CAAC,CAAC;QAEtE,MAAM,MAAM,GAAG,WAAW,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC;QAE9C,IAAI,QAAkB,CAAC;QACvB,IAAI,CAAC;YACH,MAAM,EAAE,GAAG,oBAAoB,CAAC,SAAS,EAAE,CAAC,CAAC;YAC7C,MAAM,aAAa,GAAG,EAAE,YAAY,EAAE,EAAE,EAAE,GAAG,OAAO,CAAC,OAAO,EAAE,CAAC;YAC/D,QAAQ,GAAG,MAAM,KAAK,CAAC,UAAU,EAAE;gBACjC,OAAO,EAAE,aAAa;gBACtB,QAAQ,EAAE,QAAQ;gBAClB,MAAM;aACP,CAAC,CAAC;QACL,CAAC;QAAC,OAAO,GAAG,EAAE,CAAC;YACb,MAAM,SAAS,GAAG,GAAG,YAAY,KAAK,IAAI,GAAG,CAAC,IAAI,KAAK,cAAc,CAAC;YACtE,MAAM,SAAS,GAAG,GAAG,YAAY,KAAK,IAAI,qBAAqB,CAAC,GAAG,CAAE,GAA6B,CAAC,IAAI,IAAI,EAAE,CAAC,CAAC;YAC/G,MAAM,SAAS,GAAG,SAAS,IAAI,SAAS,CAAC;YACzC,MAAM,MAAM,CAAC,MAAM,CAAC,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,IAAI,KAAK,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,EAAE,EAAE,SAAS,EAAE,CAAC,CAAC;QAC1F,CAAC;QAED,IAAI,iBAAiB,CAAC,GAAG,CAAC,QAAQ
,CAAC,MAAM,CAAC,EAAE,CAAC;YAC3C,MAAM,QAAQ,GAAG,QAAQ,CAAC,OAAO,CAAC,GAAG,CAAC,UAAU,CAAC,CAAC;YAClD,IAAI,CAAC,QAAQ,EAAE,CAAC;gBACd,MAAM,IAAI,cAAc,CAAC,uCAAuC,UAAU,EAAE,EAAE,KAAK,CAAC,CAAC;YACvF,CAAC;YAED,aAAa,EAAE,CAAC;YAChB,IAAI,aAAa,GAAG,YAAY,EAAE,CAAC;gBACjC,MAAM,IAAI,cAAc,CAAC,wBAAwB,YAAY,UAAU,WAAW,EAAE,EAAE,KAAK,CAAC,CAAC;YAC/F,CAAC;YAED,6BAA6B;YAC7B,UAAU,GAAG,IAAI,GAAG,CAAC,QAAQ,EAAE,UAAU,CAAC,CAAC,QAAQ,EAAE,CAAC;YACtD,SAAS;QACX,CAAC;QAED,IAAI,kBAAkB,CAAC,GAAG,CAAC,QAAQ,CAAC,MAAM,CAAC,EAAE,CAAC;YAC5C,MAAM,IAAI,cAAc,CAAC,QAAQ,QAAQ,CAAC,MAAM,SAAS,UAAU,EAAE,EAAE,IAAI,CAAC,CAAC;QAC/E,CAAC;QAED,MAAM,WAAW,GAAG,QAAQ,CAAC,OAAO,CAAC,GAAG,CAAC,cAAc,CAAC,IAAI,EAAE,CAAC;QAC/D,MAAM,OAAO,GAA2B,EAAE,CAAC;QAC3C,QAAQ,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC,KAAK,EAAE,GAAG,EAAE,EAAE;YACtC,OAAO,CAAC,GAAG,CAAC,GAAG,KAAK,CAAC;QACvB,CAAC,CAAC,CAAC;QAEH,MAAM,KAAK,GAAG,WAAW,CAAC,QAAQ,CAAC,iBAAiB,CAAC,CAAC;QACtD,IAAI,IAAY,CAAC;QACjB,IAAI,SAA6B,CAAC;QAElC,IAAI,KAAK,EAAE,CAAC;YACV,MAAM,QAAQ,GAAG,MAAM,QAAQ,CAAC,WAAW,EAAE,CAAC;YAC9C,SAAS,GAAG,MAAM,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;YAClC,IAAI,GAAG,EAAE,CAAC;QACZ,CAAC;aAAM,CAAC;YACN,IAAI,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;QAC/B,CAAC;QAED,OAAO;YACL,GAAG,EAAE,WAAW;YAChB,QAAQ,EAAE,UAAU;YACpB,IAAI;YACJ,WAAW;YACX,UAAU,EAAE,QAAQ,CAAC,MAAM;YAC3B,OAAO;YACP,SAAS;SACV,CAAC;IACJ,CAAC;AACH,CAAC"}
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
import type { RawFetchResult } from '../types.js';
|
|
2
|
+
/** Options accepted by SmartRouter.fetch; every field is optional. */
export interface RouterFetchOptions {
|
|
3
|
+
/** Rendering strategy: 'auto', 'always' or 'never'. */
renderJs?: 'auto' | 'always' | 'never';
|
|
4
|
+
/** Whether the request should be made with authentication. */
useAuth?: boolean;
|
|
5
|
+
/** Extra request headers, keyed by header name. */
headers?: Record<string, string>;
|
|
6
|
+
/** Whether a screenshot is requested. */
screenshot?: boolean;
|
|
7
|
+
}
|
|
8
|
+
/** Contract for the plain-HTTP fetcher that SmartRouter is constructed with. */
export interface HttpClient {
|
|
9
|
+
/**
 * Fetch `url`, optionally with extra headers and a per-request timeout in milliseconds.
 * Resolves with the requested and final URLs, the body as `html`, the content type,
 * status code and response headers, and optionally the raw bytes in `rawBuffer`.
 */
fetch(url: string, options?: {
|
|
10
|
+
headers?: Record<string, string>;
|
|
11
|
+
timeoutMs?: number;
|
|
12
|
+
}): Promise<{
|
|
13
|
+
url: string;
|
|
14
|
+
finalUrl: string;
|
|
15
|
+
html: string;
|
|
16
|
+
contentType: string;
|
|
17
|
+
statusCode: number;
|
|
18
|
+
headers: Record<string, string>;
|
|
19
|
+
rawBuffer?: Buffer;
|
|
20
|
+
}>;
|
|
21
|
+
}
|
|
22
|
+
/** Contract for the browser-based fetcher that SmartRouter is constructed with. */
export interface BrowserPoolInterface {
|
|
23
|
+
/**
 * Fetch `url` with a browser and resolve with a RawFetchResult.
 * Options: extra `headers`, a `storageStatePath` and `userDataDir` (both plain strings,
 * presumably browser session/profile locations — confirm against the implementation),
 * and a `screenshot` flag.
 */
fetchWithBrowser(url: string, options?: {
|
|
24
|
+
headers?: Record<string, string>;
|
|
25
|
+
storageStatePath?: string;
|
|
26
|
+
userDataDir?: string;
|
|
27
|
+
screenshot?: boolean;
|
|
28
|
+
}): Promise<RawFetchResult>;
|
|
29
|
+
}
|
|
30
|
+
/** Per-domain bookkeeping record, as returned by SmartRouter.getDomainStats. */
interface DomainStats {
|
|
31
|
+
/** Failure counter for the domain. */
failureCount: number;
|
|
32
|
+
/** Flag indicating that the domain is preferably fetched with Playwright. */
preferPlaywright: boolean;
|
|
33
|
+
}
|
|
34
|
+
/**
 * Fetch router built from an HttpClient and a BrowserPoolInterface.
 * `fetch` resolves with a RawFetchResult; `getDomainStats` exposes the
 * DomainStats recorded for a domain, or undefined when there is none.
 */
export declare class SmartRouter {
|
|
35
|
+
private readonly httpClient;
|
|
36
|
+
private readonly browserPool;
|
|
37
|
+
private readonly domainMap;
|
|
38
|
+
/** Create a router over the given HTTP client and browser pool. */
constructor(httpClient: HttpClient, browserPool: BrowserPoolInterface);
|
|
39
|
+
/** Fetch `url` according to `options` and resolve with the raw result. */
fetch(url: string, options?: RouterFetchOptions): Promise<RawFetchResult>;
|
|
40
|
+
/** Look up the stats recorded for `domain`; undefined when none exist. */
getDomainStats(domain: string): DomainStats | undefined;
|
|
41
|
+
private ensureStats;
|
|
42
|
+
private toRawFetchResult;
|
|
43
|
+
}
|
|
44
|
+
export {};
|
|
45
|
+
//# sourceMappingURL=router.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"router.d.ts","sourceRoot":"","sources":["../../src/fetch/router.ts"],"names":[],"mappings":"AAIA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,MAAM,WAAW,kBAAkB;IACjC,QAAQ,CAAC,EAAE,MAAM,GAAG,QAAQ,GAAG,OAAO,CAAC;IACvC,OAAO,CAAC,EAAE,OAAO,CAAC;IAClB,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IACjC,UAAU,CAAC,EAAE,OAAO,CAAC;CACtB;AAED,MAAM,WAAW,UAAU;IACzB,KAAK,CACH,GAAG,EAAE,MAAM,EACX,OAAO,CAAC,EAAE;QAAE,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;QAAC,SAAS,CAAC,EAAE,MAAM,CAAA;KAAE,GACjE,OAAO,CAAC;QACT,GAAG,EAAE,MAAM,CAAC;QACZ,QAAQ,EAAE,MAAM,CAAC;QACjB,IAAI,EAAE,MAAM,CAAC;QACb,WAAW,EAAE,MAAM,CAAC;QACpB,UAAU,EAAE,MAAM,CAAC;QACnB,OAAO,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;QAChC,SAAS,CAAC,EAAE,MAAM,CAAC;KACpB,CAAC,CAAC;CACJ;AAED,MAAM,WAAW,oBAAoB;IACnC,gBAAgB,CACd,GAAG,EAAE,MAAM,EACX,OAAO,CAAC,EAAE;QAAE,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;QAAC,gBAAgB,CAAC,EAAE,MAAM,CAAC;QAAC,WAAW,CAAC,EAAE,MAAM,CAAC;QAAC,UAAU,CAAC,EAAE,OAAO,CAAA;KAAE,GACpH,OAAO,CAAC,cAAc,CAAC,CAAC;CAC5B;AAED,UAAU,WAAW;IACnB,YAAY,EAAE,MAAM,CAAC;IACrB,gBAAgB,EAAE,OAAO,CAAC;CAC3B;AAED,qBAAa,WAAW;IAIpB,OAAO,CAAC,QAAQ,CAAC,UAAU;IAC3B,OAAO,CAAC,QAAQ,CAAC,WAAW;IAJ9B,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAkC;gBAGzC,UAAU,EAAE,UAAU,EACtB,WAAW,EAAE,oBAAoB;IAG9C,KAAK,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,GAAE,kBAAuB,GAAG,OAAO,CAAC,cAAc,CAAC;IA6DnF,cAAc,CAAC,MAAM,EAAE,MAAM,GAAG,WAAW,GAAG,SAAS;IAIvD,OAAO,CAAC,WAAW;IASnB,OAAO,CAAC,gBAAgB;CAczB"}
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
import { getConfig } from '../config.js';
|
|
2
|
+
import { createLogger } from '../logger.js';
|
|
3
|
+
import { contentAppearsEmpty } from './content-check.js';
|
|
4
|
+
import { getAuthOptions } from './auth.js';
|
|
5
|
+
export class SmartRouter {
|
|
6
|
+
httpClient;
|
|
7
|
+
browserPool;
|
|
8
|
+
domainMap = new Map();
|
|
9
|
+
constructor(httpClient, browserPool) {
|
|
10
|
+
this.httpClient = httpClient;
|
|
11
|
+
this.browserPool = browserPool;
|
|
12
|
+
}
|
|
13
|
+
async fetch(url, options = {}) {
|
|
14
|
+
const { renderJs = 'auto', useAuth = false, headers, screenshot } = options;
|
|
15
|
+
const config = getConfig();
|
|
16
|
+
const logger = createLogger('fetch');
|
|
17
|
+
const threshold = config.browserFallbackThreshold;
|
|
18
|
+
const domain = new URL(url).hostname;
|
|
19
|
+
// Always Playwright for auth or explicit override
|
|
20
|
+
if (renderJs === 'always' || useAuth) {
|
|
21
|
+
const authOptions = useAuth ? (getAuthOptions() ?? {}) : {};
|
|
22
|
+
logger.debug('routing to playwright', { url, reason: useAuth ? 'auth' : 'render_js=always' });
|
|
23
|
+
return this.browserPool.fetchWithBrowser(url, { headers, screenshot, ...authOptions });
|
|
24
|
+
}
|
|
25
|
+
// HTTP only, no fallback
|
|
26
|
+
if (renderJs === 'never') {
|
|
27
|
+
logger.debug('routing to http (never)', { url });
|
|
28
|
+
const result = await this.httpClient.fetch(url, { headers });
|
|
29
|
+
this.ensureStats(domain);
|
|
30
|
+
return this.toRawFetchResult(result);
|
|
31
|
+
}
|
|
32
|
+
// auto: check if domain is already marked for Playwright
|
|
33
|
+
const stats = this.ensureStats(domain);
|
|
34
|
+
if (stats.preferPlaywright) {
|
|
35
|
+
logger.debug('routing to playwright (domain marked)', { url, domain });
|
|
36
|
+
return this.browserPool.fetchWithBrowser(url, { headers, screenshot });
|
|
37
|
+
}
|
|
38
|
+
// Try HTTP first
|
|
39
|
+
try {
|
|
40
|
+
const result = await this.httpClient.fetch(url, { headers });
|
|
41
|
+
// Check for SPA shell / empty content
|
|
42
|
+
if (contentAppearsEmpty(result.html)) {
|
|
43
|
+
logger.info('SPA shell detected, marking domain for playwright', { url, domain });
|
|
44
|
+
stats.preferPlaywright = true;
|
|
45
|
+
return this.browserPool.fetchWithBrowser(url, { headers, screenshot });
|
|
46
|
+
}
|
|
47
|
+
return this.toRawFetchResult(result);
|
|
48
|
+
}
|
|
49
|
+
catch (err) {
|
|
50
|
+
stats.failureCount++;
|
|
51
|
+
logger.warn('http fetch failed', {
|
|
52
|
+
url,
|
|
53
|
+
domain,
|
|
54
|
+
failureCount: stats.failureCount,
|
|
55
|
+
error: err instanceof Error ? err.message : String(err),
|
|
56
|
+
});
|
|
57
|
+
if (stats.failureCount >= threshold) {
|
|
58
|
+
logger.info('failure threshold reached, marking domain for playwright', { url, domain, threshold });
|
|
59
|
+
stats.preferPlaywright = true;
|
|
60
|
+
return this.browserPool.fetchWithBrowser(url, { headers, screenshot });
|
|
61
|
+
}
|
|
62
|
+
throw err;
|
|
63
|
+
}
|
|
64
|
+
}
|
|
65
|
+
getDomainStats(domain) {
|
|
66
|
+
return this.domainMap.get(domain);
|
|
67
|
+
}
|
|
68
|
+
ensureStats(domain) {
|
|
69
|
+
let stats = this.domainMap.get(domain);
|
|
70
|
+
if (!stats) {
|
|
71
|
+
stats = { failureCount: 0, preferPlaywright: false };
|
|
72
|
+
this.domainMap.set(domain, stats);
|
|
73
|
+
}
|
|
74
|
+
return stats;
|
|
75
|
+
}
|
|
76
|
+
toRawFetchResult(result) {
|
|
77
|
+
return {
|
|
78
|
+
url: result.url,
|
|
79
|
+
finalUrl: result.finalUrl,
|
|
80
|
+
html: result.html,
|
|
81
|
+
contentType: result.contentType,
|
|
82
|
+
statusCode: result.statusCode,
|
|
83
|
+
method: 'http',
|
|
84
|
+
headers: result.headers,
|
|
85
|
+
rawBuffer: result.rawBuffer,
|
|
86
|
+
};
|
|
87
|
+
}
|
|
88
|
+
}
|
|
89
|
+
//# sourceMappingURL=router.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"router.js","sourceRoot":"","sources":["../../src/fetch/router.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,MAAM,cAAc,CAAC;AACzC,OAAO,EAAE,YAAY,EAAE,MAAM,cAAc,CAAC;AAC5C,OAAO,EAAE,mBAAmB,EAAE,MAAM,oBAAoB,CAAC;AACzD,OAAO,EAAE,cAAc,EAAE,MAAM,WAAW,CAAC;AAqC3C,MAAM,OAAO,WAAW;IAIH;IACA;IAJF,SAAS,GAAG,IAAI,GAAG,EAAuB,CAAC;IAE5D,YACmB,UAAsB,EACtB,WAAiC;QADjC,eAAU,GAAV,UAAU,CAAY;QACtB,gBAAW,GAAX,WAAW,CAAsB;IACjD,CAAC;IAEJ,KAAK,CAAC,KAAK,CAAC,GAAW,EAAE,UAA8B,EAAE;QACvD,MAAM,EAAE,QAAQ,GAAG,MAAM,EAAE,OAAO,GAAG,KAAK,EAAE,OAAO,EAAE,UAAU,EAAE,GAAG,OAAO,CAAC;QAC5E,MAAM,MAAM,GAAG,SAAS,EAAE,CAAC;QAC3B,MAAM,MAAM,GAAG,YAAY,CAAC,OAAO,CAAC,CAAC;QACrC,MAAM,SAAS,GAAG,MAAM,CAAC,wBAAwB,CAAC;QAClD,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC;QAErC,kDAAkD;QAClD,IAAI,QAAQ,KAAK,QAAQ,IAAI,OAAO,EAAE,CAAC;YACrC,MAAM,WAAW,GAAG,OAAO,CAAC,CAAC,CAAC,CAAC,cAAc,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;YAC5D,MAAM,CAAC,KAAK,CAAC,uBAAuB,EAAE,EAAE,GAAG,EAAE,MAAM,EAAE,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,kBAAkB,EAAE,CAAC,CAAC;YAC9F,OAAO,IAAI,CAAC,WAAW,CAAC,gBAAgB,CAAC,GAAG,EAAE,EAAE,OAAO,EAAE,UAAU,EAAE,GAAG,WAAW,EAAE,CAAC,CAAC;QACzF,CAAC;QAED,yBAAyB;QACzB,IAAI,QAAQ,KAAK,OAAO,EAAE,CAAC;YACzB,MAAM,CAAC,KAAK,CAAC,yBAAyB,EAAE,EAAE,GAAG,EAAE,CAAC,CAAC;YACjD,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,UAAU,CAAC,KAAK,CAAC,GAAG,EAAE,EAAE,OAAO,EAAE,CAAC,CAAC;YAC7D,IAAI,CAAC,WAAW,CAAC,MAAM,CAAC,CAAC;YACzB,OAAO,IAAI,CAAC,gBAAgB,CAAC,MAAM,CAAC,CAAC;QACvC,CAAC;QAED,yDAAyD;QACzD,MAAM,KAAK,GAAG,IAAI,CAAC,WAAW,CAAC,MAAM,CAAC,CAAC;QAEvC,IAAI,KAAK,CAAC,gBAAgB,EAAE,CAAC;YAC3B,MAAM,CAAC,KAAK,CAAC,uCAAuC,EAAE,EAAE,GAAG,EAAE,MAAM,EAAE,CAAC,CAAC;YACvE,OAAO,IAAI,CAAC,WAAW,CAAC,gBAAgB,CAAC,GAAG,EAAE,EAAE,OAAO,EAAE,UAAU,EAAE,CAAC,CAAC;QACzE,CAAC;QAED,iBAAiB;QACjB,IAAI,CAAC;YACH,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,UAAU,CAAC,KAAK,CAAC,GAAG,EAAE,EAAE,OAAO,EAAE,CAAC,CAAC;YAE7D,sCAAsC;YACtC,IAAI,mBAAmB,CAAC,MAAM,CAAC,IAAI,CAAC,EAAE,CAAC;gBACrC,MAAM,CAAC,IAAI,CAAC,mDAAmD,EAAE,EAAE,GAAG,EAAE,MAAM,EAAE,CAAC,CAAC;gBAC
lF,KAAK,CAAC,gBAAgB,GAAG,IAAI,CAAC;gBAC9B,OAAO,IAAI,CAAC,WAAW,CAAC,gBAAgB,CAAC,GAAG,EAAE,EAAE,OAAO,EAAE,UAAU,EAAE,CAAC,CAAC;YACzE,CAAC;YAED,OAAO,IAAI,CAAC,gBAAgB,CAAC,MAAM,CAAC,CAAC;QACvC,CAAC;QAAC,OAAO,GAAG,EAAE,CAAC;YACb,KAAK,CAAC,YAAY,EAAE,CAAC;YACrB,MAAM,CAAC,IAAI,CAAC,mBAAmB,EAAE;gBAC/B,GAAG;gBACH,MAAM;gBACN,YAAY,EAAE,KAAK,CAAC,YAAY;gBAChC,KAAK,EAAE,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC;aACxD,CAAC,CAAC;YAEH,IAAI,KAAK,CAAC,YAAY,IAAI,SAAS,EAAE,CAAC;gBACpC,MAAM,CAAC,IAAI,CAAC,0DAA0D,EAAE,EAAE,GAAG,EAAE,MAAM,EAAE,SAAS,EAAE,CAAC,CAAC;gBACpG,KAAK,CAAC,gBAAgB,GAAG,IAAI,CAAC;gBAC9B,OAAO,IAAI,CAAC,WAAW,CAAC,gBAAgB,CAAC,GAAG,EAAE,EAAE,OAAO,EAAE,UAAU,EAAE,CAAC,CAAC;YACzE,CAAC;YAED,MAAM,GAAG,CAAC;QACZ,CAAC;IACH,CAAC;IAED,cAAc,CAAC,MAAc;QAC3B,OAAO,IAAI,CAAC,SAAS,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC;IACpC,CAAC;IAEO,WAAW,CAAC,MAAc;QAChC,IAAI,KAAK,GAAG,IAAI,CAAC,SAAS,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC;QACvC,IAAI,CAAC,KAAK,EAAE,CAAC;YACX,KAAK,GAAG,EAAE,YAAY,EAAE,CAAC,EAAE,gBAAgB,EAAE,KAAK,EAAE,CAAC;YACrD,IAAI,CAAC,SAAS,CAAC,GAAG,CAAC,MAAM,EAAE,KAAK,CAAC,CAAC;QACpC,CAAC;QACD,OAAO,KAAK,CAAC;IACf,CAAC;IAEO,gBAAgB,CACtB,MAAgD;QAEhD,OAAO;YACL,GAAG,EAAE,MAAM,CAAC,GAAG;YACf,QAAQ,EAAE,MAAM,CAAC,QAAQ;YACzB,IAAI,EAAE,MAAM,CAAC,IAAI;YACjB,WAAW,EAAE,MAAM,CAAC,WAAW;YAC/B,UAAU,EAAE,MAAM,CAAC,UAAU;YAC7B,MAAM,EAAE,MAAM;YACd,OAAO,EAAE,MAAM,CAAC,OAAO;YACvB,SAAS,EAAE,MAAM,CAAC,SAAS;SAC5B,CAAC;IACJ,CAAC;CACF"}
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":""}
|
package/dist/index.js
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
import { parseCommand } from './cli/index.js';
|
|
3
|
+
import { runWarmup } from './cli/warmup.js';
|
|
4
|
+
import { runDaemon } from './cli/daemon.js';
|
|
5
|
+
import { runHealthCheck } from './cli/health.js';
|
|
6
|
+
import { startServer } from './server.js';
|
|
7
|
+
// CLI entry point: parse the arguments that follow the node binary and the
// script path, then run the handler for the selected subcommand.
// Only `warmup` and `mcp` are awaited; `serve` and `health` are invoked
// without awaiting. Any other command value falls through and does nothing.
const { command, args } = parseCommand(process.argv.slice(2));
if (command === 'warmup') {
    await runWarmup(args);
}
else if (command === 'serve') {
    runDaemon(args);
}
else if (command === 'health') {
    runHealthCheck();
}
else if (command === 'mcp') {
    await startServer();
}
|
|
22
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":";AAEA,OAAO,EAAE,YAAY,EAAE,MAAM,gBAAgB,CAAC;AAC9C,OAAO,EAAE,SAAS,EAAE,MAAM,iBAAiB,CAAC;AAC5C,OAAO,EAAE,SAAS,EAAE,MAAM,iBAAiB,CAAC;AAC5C,OAAO,EAAE,cAAc,EAAE,MAAM,iBAAiB,CAAC;AACjD,OAAO,EAAE,WAAW,EAAE,MAAM,aAAa,CAAC;AAE1C,MAAM,EAAE,OAAO,EAAE,IAAI,EAAE,GAAG,YAAY,CAAC,OAAO,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;AAE9D,QAAQ,OAAO,EAAE,CAAC;IAChB,KAAK,QAAQ;QACX,MAAM,SAAS,CAAC,IAAI,CAAC,CAAC;QACtB,MAAM;IAER,KAAK,OAAO;QACV,SAAS,CAAC,IAAI,CAAC,CAAC;QAChB,MAAM;IAER,KAAK,QAAQ;QACX,cAAc,EAAE,CAAC;QACjB,MAAM;IAER,KAAK,KAAK;QACR,MAAM,WAAW,EAAE,CAAC;QACpB,MAAM;AACV,CAAC"}
|
package/dist/logger.d.ts
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
/** String-literal tags naming the subsystem a logger is created for (the parameter type of `createLogger`). */
type Module = 'fetch' | 'search' | 'crawl' | 'cache' | 'extract' | 'searxng' | 'server' | 'cli' | 'jsonld';
|
|
2
|
+
/**
 * Leveled logger returned by `createLogger`.
 *
 * Each method takes a message and an optional record of structured context
 * values, and returns nothing.
 */
export interface Logger {
|
|
3
|
+
debug(msg: string, data?: Record<string, unknown>): void;
|
|
4
|
+
info(msg: string, data?: Record<string, unknown>): void;
|
|
5
|
+
warn(msg: string, data?: Record<string, unknown>): void;
|
|
6
|
+
error(msg: string, data?: Record<string, unknown>): void;
|
|
7
|
+
}
|
|
8
|
+
/**
 * Create a `Logger` tagged with the given subsystem.
 *
 * @param module - Subsystem tag attached to every message the logger emits.
 * @returns A logger exposing `debug`, `info`, `warn` and `error`.
 */
export declare function createLogger(module: Module): Logger;
|
|
9
|
+
export {};
|
|
10
|
+
//# sourceMappingURL=logger.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"logger.d.ts","sourceRoot":"","sources":["../src/logger.ts"],"names":[],"mappings":"AAGA,KAAK,MAAM,GAAG,OAAO,GAAG,QAAQ,GAAG,OAAO,GAAG,OAAO,GAAG,SAAS,GAAG,SAAS,GAAG,QAAQ,GAAG,KAAK,GAAG,QAAQ,CAAC;AAS3G,MAAM,WAAW,MAAM;IACrB,KAAK,CAAC,GAAG,EAAE,MAAM,EAAE,IAAI,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAAG,IAAI,CAAC;IACzD,IAAI,CAAC,GAAG,EAAE,MAAM,EAAE,IAAI,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAAG,IAAI,CAAC;IACxD,IAAI,CAAC,GAAG,EAAE,MAAM,EAAE,IAAI,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAAG,IAAI,CAAC;IACxD,KAAK,CAAC,GAAG,EAAE,MAAM,EAAE,IAAI,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAAG,IAAI,CAAC;CAC1D;AAmBD,wBAAgB,YAAY,CAAC,MAAM,EAAE,MAAM,GAAG,MAAM,CAiBnD"}
|
package/dist/logger.js
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
import { getConfig } from './config.js';
|
|
2
|
+
// Numeric rank of each log level; a larger number means a more severe level.
// createLogger compares these ranks to decide whether a message is written.
const LEVEL_PRIORITY = {
|
|
3
|
+
debug: 0,
|
|
4
|
+
info: 1,
|
|
5
|
+
warn: 2,
|
|
6
|
+
error: 3,
|
|
7
|
+
};
|
|
8
|
+
/**
 * Write one log record to stderr as a single line of JSON.
 *
 * The record contains `ts` (ISO-8601 timestamp), `level`, `msg` and `module`,
 * followed by `data` when a truthy `data` argument is supplied.
 *
 * Logging must never crash the caller: if `data` cannot be serialized
 * (for example it contains a circular reference or a BigInt), the record is
 * still written, with `data` replaced by `{ serialization_error: <reason> }`.
 *
 * @param {string} level - Severity label, written verbatim.
 * @param {string} module - Subsystem tag, written verbatim.
 * @param {string} msg - Log message.
 * @param {Record<string, unknown>} [data] - Optional structured context.
 */
function writeJson(level, module, msg, data) {
    const record = {
        ts: new Date().toISOString(),
        level,
        msg,
        module,
    };
    let line;
    try {
        line = JSON.stringify(data ? { ...record, data } : record);
    }
    catch (err) {
        // JSON.stringify throws on circular structures and BigInt values.
        // Keep the message and report why the context was dropped.
        const reason = err instanceof Error ? err.message : String(err);
        line = JSON.stringify({ ...record, data: { serialization_error: reason } });
    }
    process.stderr.write(line + '\n');
}
|
|
18
|
+
/**
 * Write one log record to stderr as a human-readable line:
 *
 *   [<ISO timestamp>] <LEVEL padded to 5> [<module>] <msg> key=value key=value
 *
 * Primitive values and Error instances are rendered with `String()`.
 * Other objects (including arrays) are rendered as JSON, because plain string
 * interpolation would print them as "[object Object]". If JSON serialization
 * fails (e.g. circular reference), the value falls back to `String()`.
 *
 * @param {string} level - Severity label; upper-cased in the output.
 * @param {string} module - Subsystem tag.
 * @param {string} msg - Log message.
 * @param {Record<string, unknown>} [data] - Optional structured context.
 */
function writeText(level, module, msg, data) {
    const ts = new Date().toISOString();
    const formatValue = (value) => {
        if (typeof value !== 'object' || value === null || value instanceof Error) {
            return String(value);
        }
        try {
            // JSON.stringify returns undefined when toJSON() yields undefined.
            return JSON.stringify(value) ?? String(value);
        }
        catch {
            return String(value);
        }
    };
    const dataStr = data
        ? ' ' + Object.entries(data).map(([k, v]) => `${k}=${formatValue(v)}`).join(' ')
        : '';
    process.stderr.write(`[${ts}] ${level.toUpperCase().padEnd(5)} [${module}] ${msg}${dataStr}\n`);
}
|
|
23
|
+
/**
 * Build a logger for one subsystem.
 *
 * Configuration is read once, when the logger is created: `logLevel` sets the
 * minimum severity that is written, and `logFormat === 'json'` selects JSON
 * lines (any other value selects the plain-text format).
 *
 * If `logLevel` is not one of the known levels, the threshold falls back to
 * `info`. Without the fallback the comparison against `undefined` would be
 * false for every message and the logger would silently emit nothing.
 *
 * @param {string} module - Subsystem tag attached to every record.
 * @returns An object with `debug`, `info`, `warn` and `error` methods, each
 *          taking `(msg, data?)`.
 */
export function createLogger(module) {
    const config = getConfig();
    const configured = LEVEL_PRIORITY[config.logLevel];
    const minPriority = typeof configured === 'number' ? configured : LEVEL_PRIORITY.info;
    const write = config.logFormat === 'json' ? writeJson : writeText;
    function log(level, msg, data) {
        if (LEVEL_PRIORITY[level] >= minPriority) {
            write(level, module, msg, data);
        }
    }
    return {
        debug: (msg, data) => log('debug', msg, data),
        info: (msg, data) => log('info', msg, data),
        warn: (msg, data) => log('warn', msg, data),
        error: (msg, data) => log('error', msg, data),
    };
}
|
|
39
|
+
//# sourceMappingURL=logger.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"logger.js","sourceRoot":"","sources":["../src/logger.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,MAAM,aAAa,CAAC;AAKxC,MAAM,cAAc,GAA6B;IAC/C,KAAK,EAAE,CAAC;IACR,IAAI,EAAE,CAAC;IACP,IAAI,EAAE,CAAC;IACP,KAAK,EAAE,CAAC;CACT,CAAC;AASF,SAAS,SAAS,CAAC,KAAe,EAAE,MAAc,EAAE,GAAW,EAAE,IAA8B;IAC7F,MAAM,IAAI,GAAG,IAAI,CAAC,SAAS,CAAC;QAC1B,EAAE,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;QAC5B,KAAK;QACL,GAAG;QACH,MAAM;QACN,GAAG,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;KAC1B,CAAC,CAAC;IACH,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,IAAI,GAAG,IAAI,CAAC,CAAC;AACpC,CAAC;AAED,SAAS,SAAS,CAAC,KAAe,EAAE,MAAc,EAAE,GAAW,EAAE,IAA8B;IAC7F,MAAM,EAAE,GAAG,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;IACpC,MAAM,OAAO,GAAG,IAAI,CAAC,CAAC,CAAC,GAAG,GAAG,MAAM,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;IAC9F,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,IAAI,EAAE,KAAK,KAAK,CAAC,WAAW,EAAE,CAAC,MAAM,CAAC,CAAC,CAAC,KAAK,MAAM,KAAK,GAAG,GAAG,OAAO,IAAI,CAAC,CAAC;AAClG,CAAC;AAED,MAAM,UAAU,YAAY,CAAC,MAAc;IACzC,MAAM,MAAM,GAAG,SAAS,EAAE,CAAC;IAC3B,MAAM,WAAW,GAAG,cAAc,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC;IACpD,MAAM,KAAK,GAAG,MAAM,CAAC,SAAS,KAAK,MAAM,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,SAAS,CAAC;IAElE,SAAS,GAAG,CAAC,KAAe,EAAE,GAAW,EAAE,IAA8B;QACvE,IAAI,cAAc,CAAC,KAAK,CAAC,IAAI,WAAW,EAAE,CAAC;YACzC,KAAK,CAAC,KAAK,EAAE,MAAM,EAAE,GAAG,EAAE,IAAI,CAAC,CAAC;QAClC,CAAC;IACH,CAAC;IAED,OAAO;QACL,KAAK,EAAE,CAAC,GAAG,EAAE,IAAI,EAAE,EAAE,CAAC,GAAG,CAAC,OAAO,EAAE,GAAG,EAAE,IAAI,CAAC;QAC7C,IAAI,EAAE,CAAC,GAAG,EAAE,IAAI,EAAE,EAAE,CAAC,GAAG,CAAC,MAAM,EAAE,GAAG,EAAE,IAAI,CAAC;QAC3C,IAAI,EAAE,CAAC,GAAG,EAAE,IAAI,EAAE,EAAE,CAAC,GAAG,CAAC,MAAM,EAAE,GAAG,EAAE,IAAI,CAAC;QAC3C,KAAK,EAAE,CAAC,GAAG,EAAE,IAAI,EAAE,EAAE,CAAC,GAAG,CAAC,OAAO,EAAE,GAAG,EAAE,IAAI,CAAC;KAC9C,CAAC;AACJ,CAAC"}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
import type { RawSearchResult } from '../types.js';
|
|
2
|
+
/**
 * One search hit after duplicates have been merged, as returned by
 * `deduplicateResults`. `engines` lists the engines that reported the hit.
 */
export interface MergedSearchResult {
|
|
3
|
+
title: string;
|
|
4
|
+
url: string;
|
|
5
|
+
snippet: string;
|
|
6
|
+
relevance_score: number;
|
|
7
|
+
engines: string[];
|
|
8
|
+
}
|
|
9
|
+
/**
 * Merge raw per-engine results into one list of `MergedSearchResult` entries.
 *
 * @param results - Raw results from one or more search engines.
 * @returns The merged results.
 */
export declare function deduplicateResults(results: RawSearchResult[]): MergedSearchResult[];
|
|
10
|
+
//# sourceMappingURL=dedup.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"dedup.d.ts","sourceRoot":"","sources":["../../src/search/dedup.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,aAAa,CAAC;AAEnD,MAAM,WAAW,kBAAkB;IACjC,KAAK,EAAE,MAAM,CAAC;IACd,GAAG,EAAE,MAAM,CAAC;IACZ,OAAO,EAAE,MAAM,CAAC;IAChB,eAAe,EAAE,MAAM,CAAC;IACxB,OAAO,EAAE,MAAM,EAAE,CAAC;CACnB;AAED,wBAAgB,kBAAkB,CAAC,OAAO,EAAE,eAAe,EAAE,GAAG,kBAAkB,EAAE,CAkCnF"}
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
import { normalizeUrl } from '../cache/store.js';
|
|
2
|
+
/**
 * Collapse search results that point at the same page.
 *
 * Results are grouped by their normalized URL; when normalization throws, the
 * raw URL is used as the grouping key instead. For each group the first
 * result's `url` is kept, the title and snippet come from the highest-scoring
 * result seen, and `engines` accumulates every distinct engine name.
 *
 * @param results - Raw results, each with `title`, `url`, `snippet`,
 *                  `relevance_score` and `engine`.
 * @returns Merged results sorted by `relevance_score`, highest first.
 */
export function deduplicateResults(results) {
    const groupKey = (rawUrl) => {
        try {
            return normalizeUrl(rawUrl);
        }
        catch {
            return rawUrl;
        }
    };
    const merged = new Map();
    for (const hit of results) {
        const key = groupKey(hit.url);
        const entry = merged.get(key);
        if (!entry) {
            // First time this page is seen: start a new merged entry.
            merged.set(key, {
                title: hit.title,
                url: hit.url,
                snippet: hit.snippet,
                relevance_score: hit.relevance_score,
                engines: [hit.engine],
            });
            continue;
        }
        // A better-ranked duplicate replaces the displayed text and the score.
        if (hit.relevance_score > entry.relevance_score) {
            entry.relevance_score = hit.relevance_score;
            entry.title = hit.title;
            entry.snippet = hit.snippet;
        }
        if (!entry.engines.includes(hit.engine)) {
            entry.engines.push(hit.engine);
        }
    }
    const ranked = Array.from(merged.values());
    ranked.sort((left, right) => right.relevance_score - left.relevance_score);
    return ranked;
}
|
|
35
|
+
//# sourceMappingURL=dedup.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"dedup.js","sourceRoot":"","sources":["../../src/search/dedup.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,YAAY,EAAE,MAAM,mBAAmB,CAAC;AAWjD,MAAM,UAAU,kBAAkB,CAAC,OAA0B;IAC3D,MAAM,MAAM,GAAG,IAAI,GAAG,EAA8B,CAAC;IAErD,KAAK,MAAM,MAAM,IAAI,OAAO,EAAE,CAAC;QAC7B,IAAI,UAAkB,CAAC;QACvB,IAAI,CAAC;YACH,UAAU,GAAG,YAAY,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC;QACxC,CAAC;QAAC,MAAM,CAAC;YACP,UAAU,GAAG,MAAM,CAAC,GAAG,CAAC;QAC1B,CAAC;QAED,MAAM,QAAQ,GAAG,MAAM,CAAC,GAAG,CAAC,UAAU,CAAC,CAAC;QAExC,IAAI,QAAQ,EAAE,CAAC;YACb,IAAI,MAAM,CAAC,eAAe,GAAG,QAAQ,CAAC,eAAe,EAAE,CAAC;gBACtD,QAAQ,CAAC,eAAe,GAAG,MAAM,CAAC,eAAe,CAAC;gBAClD,QAAQ,CAAC,KAAK,GAAG,MAAM,CAAC,KAAK,CAAC;gBAC9B,QAAQ,CAAC,OAAO,GAAG,MAAM,CAAC,OAAO,CAAC;YACpC,CAAC;YACD,IAAI,CAAC,QAAQ,CAAC,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAC,MAAM,CAAC,EAAE,CAAC;gBAC9C,QAAQ,CAAC,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC;YACvC,CAAC;QACH,CAAC;aAAM,CAAC;YACN,MAAM,CAAC,GAAG,CAAC,UAAU,EAAE;gBACrB,KAAK,EAAE,MAAM,CAAC,KAAK;gBACnB,GAAG,EAAE,MAAM,CAAC,GAAG;gBACf,OAAO,EAAE,MAAM,CAAC,OAAO;gBACvB,eAAe,EAAE,MAAM,CAAC,eAAe;gBACvC,OAAO,EAAE,CAAC,MAAM,CAAC,MAAM,CAAC;aACzB,CAAC,CAAC;QACL,CAAC;IACH,CAAC;IAED,OAAO,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,eAAe,GAAG,CAAC,CAAC,eAAe,CAAC,CAAC;AACpF,CAAC"}
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
import type { SearchEngine, SearchEngineOptions, RawSearchResult } from '../../types.js';
|
|
2
|
+
/**
 * `SearchEngine` implementation for Bing.
 *
 * `search` resolves to raw results for a query; `parseResults` extracts at
 * most `maxResults` raw results from a results-page HTML string.
 */
export declare class BingEngine implements SearchEngine {
|
|
3
|
+
name: string;
|
|
4
|
+
search(query: string, options?: SearchEngineOptions): Promise<RawSearchResult[]>;
|
|
5
|
+
parseResults(html: string, maxResults: number): RawSearchResult[];
|
|
6
|
+
}
|
|
7
|
+
//# sourceMappingURL=bing.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"bing.d.ts","sourceRoot":"","sources":["../../../src/search/engines/bing.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,YAAY,EAAE,mBAAmB,EAAE,eAAe,EAAE,MAAM,gBAAgB,CAAC;AAKzF,qBAAa,UAAW,YAAW,YAAY;IAC7C,IAAI,SAAU;IAER,MAAM,CAAC,KAAK,EAAE,MAAM,EAAE,OAAO,GAAE,mBAAwB,GAAG,OAAO,CAAC,eAAe,EAAE,CAAC;IAuB1F,YAAY,CAAC,IAAI,EAAE,MAAM,EAAE,UAAU,EAAE,MAAM,GAAG,eAAe,EAAE;CA4BlE"}
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
import { parseHTML } from 'linkedom';
|
|
2
|
+
import { createLogger } from '../../logger.js';
|
|
3
|
+
// Module-level logger tagged 'search'; created once when this module loads.
const log = createLogger('search');
|
|
4
|
+
/**
 * Resolve a Bing result link to the page it actually points at.
 *
 * Bing frequently wraps organic results in a click-tracking redirect of the
 * form `https://www.bing.com/ck/a?...&u=a1<base64url(target)>`. Returning
 * that wrapper as the result URL makes every hit look like a bing.com page.
 * Any href that is not such a wrapper, or whose payload does not decode to an
 * http(s) URL, is returned unchanged.
 *
 * @param {string} href - The `href` attribute taken from the result link.
 * @returns {string} The decoded target URL, or `href` itself.
 */
function unwrapBingRedirect(href) {
    let parsed;
    try {
        parsed = new URL(href);
    }
    catch {
        // Relative or malformed href: nothing to unwrap.
        return href;
    }
    const onBing = parsed.hostname === 'bing.com' || parsed.hostname.endsWith('.bing.com');
    if (!onBing || !parsed.pathname.startsWith('/ck/a')) {
        return href;
    }
    const token = parsed.searchParams.get('u');
    if (!token || !token.startsWith('a1')) {
        return href;
    }
    const target = Buffer.from(token.slice(2), 'base64url').toString('utf8');
    return /^https?:\/\//i.test(target) ? target : href;
}
/**
 * Search engine backed by scraping Bing's HTML results page.
 */
export class BingEngine {
    name = 'bing';
    /**
     * Fetch the Bing results page for `query` and parse it.
     *
     * @param {string} query - Search terms.
     * @param {object} [options] - `timeoutMs` (default 10000), `maxResults`
     *        (default 10) and `language` (sent as `Accept-Language`,
     *        default 'en-US,en;q=0.9').
     * @returns Parsed results, at most `maxResults` of them.
     * @throws {Error} When Bing answers with a non-2xx status; the fetch
     *         itself rejects if the timeout elapses.
     */
    async search(query, options = {}) {
        const timeoutMs = options.timeoutMs ?? 10000;
        const maxResults = options.maxResults ?? 10;
        const params = new URLSearchParams({ q: query });
        const url = `https://www.bing.com/search?${params}`;
        log.debug('scraping bing', { query });
        const response = await fetch(url, {
            signal: AbortSignal.timeout(timeoutMs),
            headers: {
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
                'Accept-Language': options.language ?? 'en-US,en;q=0.9',
            },
        });
        if (!response.ok)
            throw new Error(`Bing returned ${response.status}`);
        const html = await response.text();
        return this.parseResults(html, maxResults);
    }
    /**
     * Extract results from a Bing results page.
     *
     * Only the first `maxResults` `li.b_algo` items are examined; items
     * without both a link href and a non-empty title are skipped.
     * `relevance_score` is derived from page position: the first item scores
     * 1 and the score decreases linearly with the item's index.
     *
     * @param {string} html - Results page markup.
     * @param {number} maxResults - Maximum number of items to examine.
     * @returns Raw results tagged with engine 'bing'.
     */
    parseResults(html, maxResults) {
        const { document } = parseHTML(html);
        const results = [];
        const items = document.querySelectorAll('li.b_algo');
        const total = Math.min(items.length, maxResults);
        for (let i = 0; i < total; i++) {
            const item = items[i];
            const link = item.querySelector('h2 a');
            const snippetEl = item.querySelector('.b_lineclamp2, .b_lineclamp3, .b_caption p');
            const href = link?.getAttribute('href');
            const title = link?.textContent?.trim();
            if (href && title) {
                results.push({
                    title,
                    // Report the real destination, not Bing's tracking redirect.
                    url: unwrapBingRedirect(href),
                    snippet: snippetEl?.textContent?.trim() ?? '',
                    relevance_score: 1 - i / Math.max(items.length, 1),
                    engine: 'bing',
                });
            }
        }
        return results;
    }
}
|
|
48
|
+
//# sourceMappingURL=bing.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"bing.js","sourceRoot":"","sources":["../../../src/search/engines/bing.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,MAAM,UAAU,CAAC;AAErC,OAAO,EAAE,YAAY,EAAE,MAAM,iBAAiB,CAAC;AAE/C,MAAM,GAAG,GAAG,YAAY,CAAC,QAAQ,CAAC,CAAC;AAEnC,MAAM,OAAO,UAAU;IACrB,IAAI,GAAG,MAAM,CAAC;IAEd,KAAK,CAAC,MAAM,CAAC,KAAa,EAAE,UAA+B,EAAE;QAC3D,MAAM,SAAS,GAAG,OAAO,CAAC,SAAS,IAAI,KAAK,CAAC;QAC7C,MAAM,UAAU,GAAG,OAAO,CAAC,UAAU,IAAI,EAAE,CAAC;QAE5C,MAAM,MAAM,GAAG,IAAI,eAAe,CAAC,EAAE,CAAC,EAAE,KAAK,EAAE,CAAC,CAAC;QACjD,MAAM,GAAG,GAAG,+BAA+B,MAAM,EAAE,CAAC;QAEpD,GAAG,CAAC,KAAK,CAAC,eAAe,EAAE,EAAE,KAAK,EAAE,CAAC,CAAC;QAEtC,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,GAAG,EAAE;YAChC,MAAM,EAAE,WAAW,CAAC,OAAO,CAAC,SAAS,CAAC;YACtC,OAAO,EAAE;gBACP,YAAY,EAAE,uHAAuH;gBACrI,iBAAiB,EAAE,OAAO,CAAC,QAAQ,IAAI,gBAAgB;aACxD;SACF,CAAC,CAAC;QAEH,IAAI,CAAC,QAAQ,CAAC,EAAE;YAAE,MAAM,IAAI,KAAK,CAAC,iBAAiB,QAAQ,CAAC,MAAM,EAAE,CAAC,CAAC;QAEtE,MAAM,IAAI,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;QACnC,OAAO,IAAI,CAAC,YAAY,CAAC,IAAI,EAAE,UAAU,CAAC,CAAC;IAC7C,CAAC;IAED,YAAY,CAAC,IAAY,EAAE,UAAkB;QAC3C,MAAM,EAAE,QAAQ,EAAE,GAAG,SAAS,CAAC,IAAI,CAAC,CAAC;QACrC,MAAM,OAAO,GAAsB,EAAE,CAAC;QAEtC,MAAM,KAAK,GAAG,QAAQ,CAAC,gBAAgB,CAAC,WAAW,CAAC,CAAC;QACrD,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,CAAC,KAAK,CAAC,MAAM,EAAE,UAAU,CAAC,CAAC;QAEjD,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,EAAE,CAAC,EAAE,EAAE,CAAC;YAC/B,MAAM,IAAI,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;YACtB,MAAM,IAAI,GAAG,IAAI,CAAC,aAAa,CAAC,MAAM,CAAC,CAAC;YACxC,MAAM,SAAS,GAAG,IAAI,CAAC,aAAa,CAAC,4CAA4C,CAAC,CAAC;YAEnF,MAAM,IAAI,GAAG,IAAI,EAAE,YAAY,CAAC,MAAM,CAAC,CAAC;YACxC,MAAM,KAAK,GAAG,IAAI,EAAE,WAAW,EAAE,IAAI,EAAE,CAAC;YAExC,IAAI,IAAI,IAAI,KAAK,EAAE,CAAC;gBAClB,OAAO,CAAC,IAAI,CAAC;oBACX,KAAK;oBACL,GAAG,EAAE,IAAI;oBACT,OAAO,EAAE,SAAS,EAAE,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE;oBAC7C,eAAe,EAAE,CAAC,GAAG,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,KAAK,CAAC,MAAM,EAAE,CAAC,CAAC;oBAClD,MAAM,EAAE,MAAM;iBACf,CAAC,CAAC;YACL,CAAC;QACH,CAAC;QAED,OAAO,OAAO,CAAC;IACjB,CAAC;CACF"}
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
import type { SearchEngine, SearchEngineOptions, RawSearchResult } from '../../types.js';
|
|
2
|
+
/**
 * `SearchEngine` implementation for DuckDuckGo.
 *
 * `search` resolves to raw results for a query; `parseResults` extracts at
 * most `maxResults` raw results from a results-page HTML string.
 */
export declare class DuckDuckGoEngine implements SearchEngine {
|
|
3
|
+
name: string;
|
|
4
|
+
search(query: string, options?: SearchEngineOptions): Promise<RawSearchResult[]>;
|
|
5
|
+
parseResults(html: string, maxResults: number): RawSearchResult[];
|
|
6
|
+
}
|
|
7
|
+
//# sourceMappingURL=duckduckgo.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"duckduckgo.d.ts","sourceRoot":"","sources":["../../../src/search/engines/duckduckgo.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,YAAY,EAAE,mBAAmB,EAAE,eAAe,EAAE,MAAM,gBAAgB,CAAC;AAWzF,qBAAa,gBAAiB,YAAW,YAAY;IACnD,IAAI,SAAgB;IAEd,MAAM,CAAC,KAAK,EAAE,MAAM,EAAE,OAAO,GAAE,mBAAwB,GAAG,OAAO,CAAC,eAAe,EAAE,CAAC;IAoB1F,YAAY,CAAC,IAAI,EAAE,MAAM,EAAE,UAAU,EAAE,MAAM,GAAG,eAAe,EAAE;CA4BlE"}
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
import { parseHTML } from 'linkedom';
|
|
2
|
+
import { createLogger } from '../../logger.js';
|
|
3
|
+
// Module-level logger tagged 'search'; created once when this module loads.
const log = createLogger('search');
|
|
4
|
+
// Browser User-Agent strings; DuckDuckGoEngine.search picks one at random
// for each request.
const USER_AGENTS = [
|
|
5
|
+
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
|
|
6
|
+
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
|
|
7
|
+
'Mozilla/5.0 (X11; Linux x86_64; rv:128.0) Gecko/20100101 Firefox/128.0',
|
|
8
|
+
];
|
|
9
|
+
/**
 * Resolve a DuckDuckGo result link to the page it actually points at.
 *
 * DuckDuckGo's HTML front-ends commonly emit result links as redirects of the
 * form `//duckduckgo.com/l/?uddg=<percent-encoded target>&rut=...`. Such an
 * href is protocol-relative, so it is not a parseable absolute URL, and it
 * points at DuckDuckGo rather than at the result. Any href that is not such a
 * redirect, or whose `uddg` value is not an http(s) URL, is returned
 * unchanged.
 *
 * @param {string} href - The `href` attribute taken from the result link.
 * @returns {string} The redirect target, or `href` itself.
 */
function unwrapDuckDuckGoRedirect(href) {
    let parsed;
    try {
        // The base lets protocol-relative and path-only hrefs be parsed.
        parsed = new URL(href, 'https://lite.duckduckgo.com/');
    }
    catch {
        return href;
    }
    const onDdg = parsed.hostname === 'duckduckgo.com' || parsed.hostname.endsWith('.duckduckgo.com');
    if (!onDdg || parsed.pathname !== '/l/') {
        return href;
    }
    // searchParams.get() already percent-decodes the value.
    const target = parsed.searchParams.get('uddg');
    return target && /^https?:\/\//i.test(target) ? target : href;
}
/**
 * Search engine backed by scraping DuckDuckGo's "lite" HTML results page.
 */
export class DuckDuckGoEngine {
    name = 'duckduckgo';
    /**
     * Fetch the DuckDuckGo lite results page for `query` and parse it.
     *
     * @param {string} query - Search terms.
     * @param {object} [options] - `timeoutMs` (default 10000) and
     *        `maxResults` (default 10).
     * @returns Parsed results, at most `maxResults` of them.
     * @throws {Error} When DuckDuckGo answers with a non-2xx status; the
     *         fetch itself rejects if the timeout elapses.
     */
    async search(query, options = {}) {
        const timeoutMs = options.timeoutMs ?? 10000;
        const maxResults = options.maxResults ?? 10;
        const params = new URLSearchParams({ q: query });
        const url = `https://lite.duckduckgo.com/lite/?${params}`;
        log.debug('scraping duckduckgo', { query });
        const response = await fetch(url, {
            signal: AbortSignal.timeout(timeoutMs),
            // A User-Agent is chosen at random for every request.
            headers: { 'User-Agent': USER_AGENTS[Math.floor(Math.random() * USER_AGENTS.length)] },
        });
        if (!response.ok)
            throw new Error(`DDG returned ${response.status}`);
        const html = await response.text();
        return this.parseResults(html, maxResults);
    }
    /**
     * Extract results from a DuckDuckGo lite results page.
     *
     * Only the first `maxResults` `a.result-link` anchors are examined;
     * anchors without both an href and a non-empty title are skipped.
     * `relevance_score` is derived from page position: the first link scores
     * 1 and the score decreases linearly with the link's index.
     *
     * @param {string} html - Results page markup.
     * @param {number} maxResults - Maximum number of links to examine.
     * @returns Raw results tagged with engine 'duckduckgo'.
     */
    parseResults(html, maxResults) {
        const { document } = parseHTML(html);
        const results = [];
        const links = document.querySelectorAll('a.result-link');
        const snippets = document.querySelectorAll('.result-snippet');
        const total = Math.min(links.length, maxResults);
        for (let i = 0; i < total; i++) {
            const link = links[i];
            // NOTE(review): snippets are paired with links purely by index, so
            // a result without a snippet would shift every later pairing.
            // Confirm against real page markup.
            const snippet = snippets[i];
            const href = link?.getAttribute('href');
            const title = link?.textContent?.trim();
            if (href && title) {
                results.push({
                    title,
                    // Report the real destination, not DuckDuckGo's redirect.
                    url: unwrapDuckDuckGoRedirect(href),
                    snippet: snippet?.textContent?.trim() ?? '',
                    relevance_score: 1 - i / Math.max(links.length, 1),
                    engine: 'duckduckgo',
                });
            }
        }
        return results;
    }
}
|
|
50
|
+
//# sourceMappingURL=duckduckgo.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"duckduckgo.js","sourceRoot":"","sources":["../../../src/search/engines/duckduckgo.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,MAAM,UAAU,CAAC;AAErC,OAAO,EAAE,YAAY,EAAE,MAAM,iBAAiB,CAAC;AAE/C,MAAM,GAAG,GAAG,YAAY,CAAC,QAAQ,CAAC,CAAC;AAEnC,MAAM,WAAW,GAAG;IAClB,uHAAuH;IACvH,iHAAiH;IACjH,wEAAwE;CACzE,CAAC;AAEF,MAAM,OAAO,gBAAgB;IAC3B,IAAI,GAAG,YAAY,CAAC;IAEpB,KAAK,CAAC,MAAM,CAAC,KAAa,EAAE,UAA+B,EAAE;QAC3D,MAAM,SAAS,GAAG,OAAO,CAAC,SAAS,IAAI,KAAK,CAAC;QAC7C,MAAM,UAAU,GAAG,OAAO,CAAC,UAAU,IAAI,EAAE,CAAC;QAE5C,MAAM,MAAM,GAAG,IAAI,eAAe,CAAC,EAAE,CAAC,EAAE,KAAK,EAAE,CAAC,CAAC;QACjD,MAAM,GAAG,GAAG,qCAAqC,MAAM,EAAE,CAAC;QAE1D,GAAG,CAAC,KAAK,CAAC,qBAAqB,EAAE,EAAE,KAAK,EAAE,CAAC,CAAC;QAE5C,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,GAAG,EAAE;YAChC,MAAM,EAAE,WAAW,CAAC,OAAO,CAAC,SAAS,CAAC;YACtC,OAAO,EAAE,EAAE,YAAY,EAAE,WAAW,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,MAAM,EAAE,GAAG,WAAW,CAAC,MAAM,CAAC,CAAC,EAAE;SACvF,CAAC,CAAC;QAEH,IAAI,CAAC,QAAQ,CAAC,EAAE;YAAE,MAAM,IAAI,KAAK,CAAC,gBAAgB,QAAQ,CAAC,MAAM,EAAE,CAAC,CAAC;QAErE,MAAM,IAAI,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;QACnC,OAAO,IAAI,CAAC,YAAY,CAAC,IAAI,EAAE,UAAU,CAAC,CAAC;IAC7C,CAAC;IAED,YAAY,CAAC,IAAY,EAAE,UAAkB;QAC3C,MAAM,EAAE,QAAQ,EAAE,GAAG,SAAS,CAAC,IAAI,CAAC,CAAC;QACrC,MAAM,OAAO,GAAsB,EAAE,CAAC;QAEtC,MAAM,KAAK,GAAG,QAAQ,CAAC,gBAAgB,CAAC,eAAe,CAAC,CAAC;QACzD,MAAM,QAAQ,GAAG,QAAQ,CAAC,gBAAgB,CAAC,iBAAiB,CAAC,CAAC;QAE9D,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,CAAC,KAAK,CAAC,MAAM,EAAE,UAAU,CAAC,CAAC;QAEjD,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,EAAE,CAAC,EAAE,EAAE,CAAC;YAC/B,MAAM,IAAI,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;YACtB,MAAM,OAAO,GAAG,QAAQ,CAAC,CAAC,CAAC,CAAC;YAC5B,MAAM,IAAI,GAAG,IAAI,EAAE,YAAY,CAAC,MAAM,CAAC,CAAC;YACxC,MAAM,KAAK,GAAG,IAAI,EAAE,WAAW,EAAE,IAAI,EAAE,CAAC;YAExC,IAAI,IAAI,IAAI,KAAK,EAAE,CAAC;gBAClB,OAAO,CAAC,IAAI,CAAC;oBACX,KAAK;oBACL,GAAG,EAAE,IAAI;oBACT,OAAO,EAAE,OAAO,EAAE,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE;oBAC3C,eAAe,EAAE,CAAC,GAAG,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,KAAK,CAAC,MAAM,EAAE,CAAC,CAAC;oBAClD,MAAM,EAAE,YAAY;iBACrB,C
AAC,CAAC;YACL,CAAC;QACH,CAAC;QAED,OAAO,OAAO,CAAC;IACjB,CAAC;CACF"}
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
import type { SearchEngine, SearchEngineOptions, RawSearchResult } from '../../types.js';
|
|
2
|
+
/**
 * `SearchEngine` implementation for Startpage.
 *
 * Declares the same surface as the other engine classes: a `name`, an async
 * `search` resolving to raw results, and `parseResults`, which takes an HTML
 * string and a result cap.
 * NOTE(review): the implementation is not visible here; presumably it scrapes
 * Startpage's HTML results page like the sibling engines. Confirm against
 * startpage.js.
 */
export declare class StartpageEngine implements SearchEngine {
|
|
3
|
+
name: string;
|
|
4
|
+
search(query: string, options?: SearchEngineOptions): Promise<RawSearchResult[]>;
|
|
5
|
+
parseResults(html: string, maxResults: number): RawSearchResult[];
|
|
6
|
+
}
|
|
7
|
+
//# sourceMappingURL=startpage.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"startpage.d.ts","sourceRoot":"","sources":["../../../src/search/engines/startpage.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,YAAY,EAAE,mBAAmB,EAAE,eAAe,EAAE,MAAM,gBAAgB,CAAC;AAKzF,qBAAa,eAAgB,YAAW,YAAY;IAClD,IAAI,SAAe;IAEb,MAAM,CAAC,KAAK,EAAE,MAAM,EAAE,OAAO,GAAE,mBAAwB,GAAG,OAAO,CAAC,eAAe,EAAE,CAAC;IAyB1F,YAAY,CAAC,IAAI,EAAE,MAAM,EAAE,UAAU,EAAE,MAAM,GAAG,eAAe,EAAE;CA4BlE"}
|