@fettstorch/clai 0.1.7 → 0.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.js +148 -78
- package/dist/index.js +133 -65
- package/package.json +36 -36
- package/src/cli.ts +152 -148
- package/src/index.ts +45 -22
- package/src/openai.ts +0 -6
- package/src/scraper.ts +207 -93
- package/src/summarizer.ts +101 -40
package/dist/cli.js
CHANGED
@@ -28997,22 +28997,22 @@ var require_diagnostics = __commonJS((exports, module) => {
|
|
28997
28997
|
const debuglog = fetchDebuglog.enabled ? fetchDebuglog : undiciDebugLog;
|
28998
28998
|
diagnosticsChannel.channel("undici:client:beforeConnect").subscribe((evt) => {
|
28999
28999
|
const {
|
29000
|
-
connectParams: { version, protocol, port, host }
|
29000
|
+
connectParams: { version: version2, protocol, port, host }
|
29001
29001
|
} = evt;
|
29002
|
-
debuglog("connecting to %s using %s%s", `${host}${port ? `:${port}` : ""}`, protocol, version);
|
29002
|
+
debuglog("connecting to %s using %s%s", `${host}${port ? `:${port}` : ""}`, protocol, version2);
|
29003
29003
|
});
|
29004
29004
|
diagnosticsChannel.channel("undici:client:connected").subscribe((evt) => {
|
29005
29005
|
const {
|
29006
|
-
connectParams: { version, protocol, port, host }
|
29006
|
+
connectParams: { version: version2, protocol, port, host }
|
29007
29007
|
} = evt;
|
29008
|
-
debuglog("connected to %s using %s%s", `${host}${port ? `:${port}` : ""}`, protocol, version);
|
29008
|
+
debuglog("connected to %s using %s%s", `${host}${port ? `:${port}` : ""}`, protocol, version2);
|
29009
29009
|
});
|
29010
29010
|
diagnosticsChannel.channel("undici:client:connectError").subscribe((evt) => {
|
29011
29011
|
const {
|
29012
|
-
connectParams: { version, protocol, port, host },
|
29012
|
+
connectParams: { version: version2, protocol, port, host },
|
29013
29013
|
error
|
29014
29014
|
} = evt;
|
29015
|
-
debuglog("connection to %s using %s%s errored - %s", `${host}${port ? `:${port}` : ""}`, protocol, version, error.message);
|
29015
|
+
debuglog("connection to %s using %s%s errored - %s", `${host}${port ? `:${port}` : ""}`, protocol, version2, error.message);
|
29016
29016
|
});
|
29017
29017
|
diagnosticsChannel.channel("undici:client:sendHeaders").subscribe((evt) => {
|
29018
29018
|
const {
|
@@ -29047,22 +29047,22 @@ var require_diagnostics = __commonJS((exports, module) => {
|
|
29047
29047
|
const debuglog = undiciDebugLog.enabled ? undiciDebugLog : websocketDebuglog;
|
29048
29048
|
diagnosticsChannel.channel("undici:client:beforeConnect").subscribe((evt) => {
|
29049
29049
|
const {
|
29050
|
-
connectParams: { version, protocol, port, host }
|
29050
|
+
connectParams: { version: version2, protocol, port, host }
|
29051
29051
|
} = evt;
|
29052
|
-
debuglog("connecting to %s%s using %s%s", host, port ? `:${port}` : "", protocol, version);
|
29052
|
+
debuglog("connecting to %s%s using %s%s", host, port ? `:${port}` : "", protocol, version2);
|
29053
29053
|
});
|
29054
29054
|
diagnosticsChannel.channel("undici:client:connected").subscribe((evt) => {
|
29055
29055
|
const {
|
29056
|
-
connectParams: { version, protocol, port, host }
|
29056
|
+
connectParams: { version: version2, protocol, port, host }
|
29057
29057
|
} = evt;
|
29058
|
-
debuglog("connected to %s%s using %s%s", host, port ? `:${port}` : "", protocol, version);
|
29058
|
+
debuglog("connected to %s%s using %s%s", host, port ? `:${port}` : "", protocol, version2);
|
29059
29059
|
});
|
29060
29060
|
diagnosticsChannel.channel("undici:client:connectError").subscribe((evt) => {
|
29061
29061
|
const {
|
29062
|
-
connectParams: { version, protocol, port, host },
|
29062
|
+
connectParams: { version: version2, protocol, port, host },
|
29063
29063
|
error
|
29064
29064
|
} = evt;
|
29065
|
-
debuglog("connection to %s%s using %s%s errored - %s", host, port ? `:${port}` : "", protocol, version, error.message);
|
29065
|
+
debuglog("connection to %s%s using %s%s errored - %s", host, port ? `:${port}` : "", protocol, version2, error.message);
|
29066
29066
|
});
|
29067
29067
|
diagnosticsChannel.channel("undici:client:sendHeaders").subscribe((evt) => {
|
29068
29068
|
const {
|
@@ -54255,9 +54255,10 @@ function ora(options) {
|
|
54255
54255
|
return new Ora(options);
|
54256
54256
|
}
|
54257
54257
|
// package.json
|
54258
|
+
var version = "0.1.9";
|
54258
54259
|
var package_default = {
|
54259
54260
|
name: "@fettstorch/clai",
|
54260
|
-
version: "0.1.7",
|
54261
|
+
version,
|
54261
54262
|
main: "dist/index.js",
|
54262
54263
|
bin: {
|
54263
54264
|
clai: "dist/cli.js"
|
@@ -68031,8 +68032,7 @@ async function scrape(input) {
|
|
68031
68032
|
}));
|
68032
68033
|
return results.filter((result) => result !== null);
|
68033
68034
|
} catch (error) {
|
68034
|
-
|
68035
|
-
throw error;
|
68035
|
+
return [];
|
68036
68036
|
}
|
68037
68037
|
}
|
68038
68038
|
function isValidUrl(input) {
|
@@ -68048,55 +68048,31 @@ function normalizeUrl(url) {
|
|
68048
68048
|
return url;
|
68049
68049
|
}
|
68050
68050
|
async function getSearchResults(query) {
|
68051
|
-
|
68052
|
-
|
68053
|
-
|
68054
|
-
|
68051
|
+
const searchEngines = [
|
68052
|
+
{ name: "SearX", fn: getSearXResults },
|
68053
|
+
{ name: "Google", fn: getGoogleResults },
|
68054
|
+
{ name: "DuckDuckGo", fn: getDuckDuckGoResults },
|
68055
|
+
{ name: "Wikipedia", fn: getWikipediaResults }
|
68056
|
+
];
|
68057
|
+
for (const engine of searchEngines) {
|
68055
68058
|
try {
|
68056
|
-
|
68057
|
-
|
68058
|
-
|
68059
|
-
|
68060
|
-
|
68061
|
-
} catch (_4) {
|
68062
|
-
console.log("Using emergency fallback...");
|
68063
|
-
return getEmergencyResults(query);
|
68064
|
-
}
|
68065
|
-
}
|
68066
|
-
}
|
68067
|
-
}
|
68068
|
-
function getEmergencyResults(query) {
|
68069
|
-
const results = [];
|
68070
|
-
const cleanQuery = query.toLowerCase().replace(/[^a-z0-9\s]/g, "").trim();
|
68071
|
-
const words = cleanQuery.split(/\s+/).filter((word) => word.length > 2);
|
68072
|
-
if (words.length > 0) {
|
68073
|
-
const mainWord = words[0];
|
68074
|
-
results.push(`https://en.wikipedia.org/wiki/${encodeURIComponent(query.replace(/\s+/g, "_"))}`);
|
68075
|
-
if (mainWord.length > 3) {
|
68076
|
-
results.push(`https://${mainWord}.com`);
|
68077
|
-
results.push(`https://www.${mainWord}.org`);
|
68059
|
+
const result = await engine.fn(query);
|
68060
|
+
console.log(`[${engine.name}]::✅`);
|
68061
|
+
return result;
|
68062
|
+
} catch (_2) {
|
68063
|
+
console.log(`[${engine.name}]::❌`);
|
68078
68064
|
}
|
68079
|
-
results.push(`https://www.reddit.com/search/?q=${encodeURIComponent(query)}`);
|
68080
68065
|
}
|
68081
|
-
console.log("
|
68082
|
-
|
68083
|
-
`https://en.wikipedia.org/wiki/${encodeURIComponent(query.replace(/\s+/g, "_"))}`
|
68084
|
-
];
|
68066
|
+
console.log("All search engines failed - no URLs to scrape");
|
68067
|
+
throw new Error("No search results available");
|
68085
68068
|
}
|
68086
68069
|
async function getSearXResults(query) {
|
68087
|
-
const searxInstances = [
|
68088
|
-
"https://searx.be",
|
68089
|
-
"https://search.sapti.me",
|
68090
|
-
"https://searx.tiekoetter.com",
|
68091
|
-
"https://searx.prvcy.eu"
|
68092
|
-
];
|
68070
|
+
const searxInstances = ["https://searx.be", "https://search.sapti.me"];
|
68093
68071
|
for (const instance of searxInstances) {
|
68094
68072
|
try {
|
68095
68073
|
const searchUrl = `${instance}/search?q=${encodeURIComponent(query)}&format=json&categories=general`;
|
68096
|
-
|
68097
|
-
const response = await fetch(searchUrl, {
|
68074
|
+
const response = await enhancedFetch(searchUrl, {
|
68098
68075
|
headers: {
|
68099
|
-
"User-Agent": getRandomUserAgent(),
|
68100
68076
|
Accept: "application/json"
|
68101
68077
|
}
|
68102
68078
|
});
|
@@ -68113,7 +68089,6 @@ async function getSearXResults(query) {
|
|
68113
68089
|
}
|
68114
68090
|
}
|
68115
68091
|
if (urls.length > 0) {
|
68116
|
-
console.log(`✓ SearX found ${urls.length} results`);
|
68117
68092
|
return urls.slice(0, 3);
|
68118
68093
|
}
|
68119
68094
|
} catch (error) {
|
@@ -68121,9 +68096,31 @@ async function getSearXResults(query) {
|
|
68121
68096
|
}
|
68122
68097
|
throw new Error("All SearX instances failed");
|
68123
68098
|
}
|
68099
|
+
async function getWikipediaResults(query) {
|
68100
|
+
const searchUrl = `https://en.wikipedia.org/w/api.php?action=opensearch&search=${encodeURIComponent(query)}&limit=3&format=json&origin=*`;
|
68101
|
+
const response = await enhancedFetch(searchUrl, {
|
68102
|
+
headers: {
|
68103
|
+
Accept: "application/json"
|
68104
|
+
}
|
68105
|
+
});
|
68106
|
+
if (!response.ok) {
|
68107
|
+
throw new Error(`Wikipedia API error: ${response.status}`);
|
68108
|
+
}
|
68109
|
+
const data2 = await response.json();
|
68110
|
+
if (Array.isArray(data2) && data2.length >= 4 && Array.isArray(data2[3])) {
|
68111
|
+
const urls = data2[3]?.filter((url) => url?.startsWith("https://"));
|
68112
|
+
if (urls?.length > 0) {
|
68113
|
+
return urls;
|
68114
|
+
}
|
68115
|
+
}
|
68116
|
+
throw new Error("No Wikipedia results found");
|
68117
|
+
}
|
68124
68118
|
async function getGoogleResults(query) {
|
68125
68119
|
const searchUrl = `https://www.google.com/search?q=${encodeURIComponent(query)}&num=10`;
|
68126
68120
|
const html3 = await fetchHtml(searchUrl);
|
68121
|
+
if (html3.includes("If you're having trouble accessing Google Search") || html3.includes("unusual traffic from your computer network")) {
|
68122
|
+
throw new Error("Google blocked request - detected as bot");
|
68123
|
+
}
|
68127
68124
|
const cheerioDoc = load2(html3);
|
68128
68125
|
const urls = [];
|
68129
68126
|
cheerioDoc('a[href^="/url?q="]').each((_2, element) => {
|
@@ -68151,13 +68148,18 @@ async function getGoogleResults(query) {
|
|
68151
68148
|
if (uniqueUrls.length === 0) {
|
68152
68149
|
throw new Error("No search results found in Google response");
|
68153
68150
|
}
|
68154
|
-
console.log(`✓ Google found ${uniqueUrls.length} results`);
|
68155
68151
|
return uniqueUrls;
|
68156
68152
|
}
|
68157
68153
|
async function getDuckDuckGoResults(query) {
|
68158
68154
|
const searchUrl = `https://api.duckduckgo.com/?q=${encodeURIComponent(query)}&format=json&no_html=1&skip_disambig=1`;
|
68159
|
-
const response = await
|
68155
|
+
const response = await enhancedFetch(searchUrl);
|
68156
|
+
if (!response.ok) {
|
68157
|
+
throw new Error(`DuckDuckGo API error: ${response.status}`);
|
68158
|
+
}
|
68160
68159
|
const data2 = await response.json();
|
68160
|
+
if (data2.Abstract?.includes("redirects users to a non-JavaScript site") || data2.Abstract?.includes("DuckDuckGo redirects users") || data2.AbstractText?.includes("redirects users to a non-JavaScript site") || data2.AbstractText?.includes("DuckDuckGo redirects users")) {
|
68161
|
+
throw new Error("DuckDuckGo blocked request - JavaScript disabled redirect");
|
68162
|
+
}
|
68161
68163
|
const urls = [];
|
68162
68164
|
if (data2.AbstractURL) {
|
68163
68165
|
urls.push(data2.AbstractURL);
|
@@ -68169,29 +68171,39 @@ async function getDuckDuckGoResults(query) {
|
|
68169
68171
|
}
|
68170
68172
|
}
|
68171
68173
|
}
|
68174
|
+
if (urls.length === 0 && data2.DefinitionURL) {
|
68175
|
+
urls.push(data2.DefinitionURL);
|
68176
|
+
}
|
68172
68177
|
if (urls.length === 0) {
|
68173
68178
|
throw new Error("No search results found in DuckDuckGo response");
|
68174
68179
|
}
|
68175
|
-
console.log(`✓ DuckDuckGo found ${urls.length} results`);
|
68176
68180
|
return urls;
|
68177
68181
|
}
|
68178
|
-
async function
|
68179
|
-
const
|
68180
|
-
|
68181
|
-
|
68182
|
-
|
68183
|
-
|
68184
|
-
|
68185
|
-
|
68186
|
-
|
68187
|
-
|
68188
|
-
|
68189
|
-
|
68190
|
-
|
68191
|
-
|
68192
|
-
|
68193
|
-
|
68182
|
+
async function enhancedFetch(url, options = {}) {
|
68183
|
+
const headers = {
|
68184
|
+
"User-Agent": getRandomUserAgent(),
|
68185
|
+
Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
|
68186
|
+
"Accept-Language": "en-US,en;q=0.9",
|
68187
|
+
"Accept-Encoding": "gzip, deflate, br",
|
68188
|
+
"sec-ch-ua": '"Not A(Brand";v="99", "Google Chrome";v="121", "Chromium";v="121"',
|
68189
|
+
"sec-ch-ua-mobile": "?0",
|
68190
|
+
"sec-ch-ua-platform": '"macOS"',
|
68191
|
+
"Sec-Fetch-Dest": "document",
|
68192
|
+
"Sec-Fetch-Mode": "navigate",
|
68193
|
+
"Sec-Fetch-Site": "cross-site",
|
68194
|
+
"Sec-Fetch-User": "?1",
|
68195
|
+
"Upgrade-Insecure-Requests": "1",
|
68196
|
+
"Cache-Control": "no-cache",
|
68197
|
+
Pragma: "no-cache",
|
68198
|
+
...options.headers
|
68199
|
+
};
|
68200
|
+
return fetch(url, {
|
68201
|
+
...options,
|
68202
|
+
headers
|
68194
68203
|
});
|
68204
|
+
}
|
68205
|
+
async function fetchHtml(url) {
|
68206
|
+
const response = await enhancedFetch(url);
|
68195
68207
|
return response.text();
|
68196
68208
|
}
|
68197
68209
|
function getRandomUserAgent() {
|
@@ -73255,25 +73267,83 @@ async function summarizeWebPage(content, openAIApiKey) {
|
|
73255
73267
|
});
|
73256
73268
|
return result;
|
73257
73269
|
}
|
73270
|
+
async function summarizeQuery(query, openAIApiKey) {
|
73271
|
+
const openai = openaiClient(openAIApiKey);
|
73272
|
+
const prompt2 = `You are an expert educator and researcher. Answer the following query with accurate, helpful information:
|
73273
|
+
|
73274
|
+
"${query}"
|
73275
|
+
|
73276
|
+
Guidelines:
|
73277
|
+
1. Provide a comprehensive but concise answer
|
73278
|
+
2. Use bullet points, lists, and tables when appropriate
|
73279
|
+
3. Include relevant examples or step-by-step instructions if applicable
|
73280
|
+
4. Format your response in valid markdown
|
73281
|
+
5. Be factual and cite general knowledge sources when relevant
|
73282
|
+
6. If you suggest external resources, format them as links in the response
|
73283
|
+
7. Mark proper nouns as bold e.g. **OpenAI**
|
73284
|
+
8. Use appropriate headings (##, ###) to structure your response
|
73285
|
+
9. If the query is about current events beyond your knowledge cutoff, mention that limitation
|
73286
|
+
|
73287
|
+
Provide a thorough, educational response that directly addresses the user's query.`;
|
73288
|
+
const schema = {
|
73289
|
+
textual: {
|
73290
|
+
type: "string",
|
73291
|
+
description: "Comprehensive answer to the user query"
|
73292
|
+
},
|
73293
|
+
links: {
|
73294
|
+
type: "array",
|
73295
|
+
items: {
|
73296
|
+
type: "object",
|
73297
|
+
properties: {
|
73298
|
+
name: {
|
73299
|
+
type: "string",
|
73300
|
+
description: "Descriptive name of the recommended resource"
|
73301
|
+
},
|
73302
|
+
url: {
|
73303
|
+
type: "string",
|
73304
|
+
description: "URL to the recommended resource"
|
73305
|
+
}
|
73306
|
+
},
|
73307
|
+
required: ["name", "url"]
|
73308
|
+
}
|
73309
|
+
}
|
73310
|
+
};
|
73311
|
+
const result = await openai.completeStructured(prompt2, {
|
73312
|
+
temperature: 0.7,
|
73313
|
+
responseSchema: schema
|
73314
|
+
});
|
73315
|
+
return result;
|
73316
|
+
}
|
73258
73317
|
|
73259
73318
|
// src/index.ts
|
73260
73319
|
async function clai(input, openAIKey) {
|
73261
73320
|
const scrapedData = await scrape(input);
|
73262
|
-
const
|
73321
|
+
const usefulData = scrapedData.filter((data2) => data2.content.length > 200 && !data2.content.includes("Wikipedia does not have an article") && !data2.content.includes("page not found") && !data2.content.includes("404") && !data2.content.includes("error"));
|
73322
|
+
if (usefulData.length > 0) {
|
73323
|
+
const combinedContent = usefulData.map((data2) => `Content from ${data2.url}:
|
73263
73324
|
${data2.content}`).join(`
|
73264
73325
|
|
73265
73326
|
`);
|
73266
|
-
|
73327
|
+
const result2 = await summarizeWebPage(combinedContent, openAIKey);
|
73328
|
+
return {
|
73329
|
+
summary: result2.textual.trim(),
|
73330
|
+
links: result2.links,
|
73331
|
+
sources: usefulData.map((data2) => data2.url)
|
73332
|
+
};
|
73333
|
+
}
|
73334
|
+
console.log("No scraped data available - using OpenAI directly for query...");
|
73335
|
+
const result = await summarizeQuery(input, openAIKey);
|
73267
73336
|
return {
|
73268
73337
|
summary: result.textual.trim(),
|
73269
73338
|
links: result.links,
|
73270
|
-
sources:
|
73339
|
+
sources: ["OpenAI Knowledge Base"]
|
73271
73340
|
};
|
73272
73341
|
}
|
73273
73342
|
|
73274
73343
|
// src/cli.ts
|
73275
73344
|
var program2 = new Command;
|
73276
73345
|
async function main2() {
|
73346
|
+
console.log(`[clAi]::${source_default.cyan(version)}`);
|
73277
73347
|
try {
|
73278
73348
|
program2.name("clai").description("AI-powered web scraping tool").version(package_default.version).argument("[input...]", "URL or search terms to analyze").action(async (inputs) => {
|
73279
73349
|
const openAIKey = process.env.OPENAI_API_KEY;
|
package/dist/index.js
CHANGED
@@ -41059,8 +41059,7 @@ async function scrape(input) {
|
|
41059
41059
|
}));
|
41060
41060
|
return results.filter((result) => result !== null);
|
41061
41061
|
} catch (error) {
|
41062
|
-
|
41063
|
-
throw error;
|
41062
|
+
return [];
|
41064
41063
|
}
|
41065
41064
|
}
|
41066
41065
|
function isValidUrl(input) {
|
@@ -41076,55 +41075,31 @@ function normalizeUrl(url) {
|
|
41076
41075
|
return url;
|
41077
41076
|
}
|
41078
41077
|
async function getSearchResults(query) {
|
41079
|
-
|
41080
|
-
|
41081
|
-
|
41082
|
-
|
41078
|
+
const searchEngines = [
|
41079
|
+
{ name: "SearX", fn: getSearXResults },
|
41080
|
+
{ name: "Google", fn: getGoogleResults },
|
41081
|
+
{ name: "DuckDuckGo", fn: getDuckDuckGoResults },
|
41082
|
+
{ name: "Wikipedia", fn: getWikipediaResults }
|
41083
|
+
];
|
41084
|
+
for (const engine of searchEngines) {
|
41083
41085
|
try {
|
41084
|
-
|
41085
|
-
|
41086
|
-
|
41087
|
-
|
41088
|
-
|
41089
|
-
} catch (_3) {
|
41090
|
-
console.log("Using emergency fallback...");
|
41091
|
-
return getEmergencyResults(query);
|
41092
|
-
}
|
41093
|
-
}
|
41094
|
-
}
|
41095
|
-
}
|
41096
|
-
function getEmergencyResults(query) {
|
41097
|
-
const results = [];
|
41098
|
-
const cleanQuery = query.toLowerCase().replace(/[^a-z0-9\s]/g, "").trim();
|
41099
|
-
const words = cleanQuery.split(/\s+/).filter((word) => word.length > 2);
|
41100
|
-
if (words.length > 0) {
|
41101
|
-
const mainWord = words[0];
|
41102
|
-
results.push(`https://en.wikipedia.org/wiki/${encodeURIComponent(query.replace(/\s+/g, "_"))}`);
|
41103
|
-
if (mainWord.length > 3) {
|
41104
|
-
results.push(`https://${mainWord}.com`);
|
41105
|
-
results.push(`https://www.${mainWord}.org`);
|
41086
|
+
const result = await engine.fn(query);
|
41087
|
+
console.log(`[${engine.name}]::✅`);
|
41088
|
+
return result;
|
41089
|
+
} catch (_) {
|
41090
|
+
console.log(`[${engine.name}]::❌`);
|
41106
41091
|
}
|
41107
|
-
results.push(`https://www.reddit.com/search/?q=${encodeURIComponent(query)}`);
|
41108
41092
|
}
|
41109
|
-
console.log("
|
41110
|
-
|
41111
|
-
`https://en.wikipedia.org/wiki/${encodeURIComponent(query.replace(/\s+/g, "_"))}`
|
41112
|
-
];
|
41093
|
+
console.log("All search engines failed - no URLs to scrape");
|
41094
|
+
throw new Error("No search results available");
|
41113
41095
|
}
|
41114
41096
|
async function getSearXResults(query) {
|
41115
|
-
const searxInstances = [
|
41116
|
-
"https://searx.be",
|
41117
|
-
"https://search.sapti.me",
|
41118
|
-
"https://searx.tiekoetter.com",
|
41119
|
-
"https://searx.prvcy.eu"
|
41120
|
-
];
|
41097
|
+
const searxInstances = ["https://searx.be", "https://search.sapti.me"];
|
41121
41098
|
for (const instance of searxInstances) {
|
41122
41099
|
try {
|
41123
41100
|
const searchUrl = `${instance}/search?q=${encodeURIComponent(query)}&format=json&categories=general`;
|
41124
|
-
|
41125
|
-
const response = await fetch(searchUrl, {
|
41101
|
+
const response = await enhancedFetch(searchUrl, {
|
41126
41102
|
headers: {
|
41127
|
-
"User-Agent": getRandomUserAgent(),
|
41128
41103
|
Accept: "application/json"
|
41129
41104
|
}
|
41130
41105
|
});
|
@@ -41141,7 +41116,6 @@ async function getSearXResults(query) {
|
|
41141
41116
|
}
|
41142
41117
|
}
|
41143
41118
|
if (urls.length > 0) {
|
41144
|
-
console.log(`✓ SearX found ${urls.length} results`);
|
41145
41119
|
return urls.slice(0, 3);
|
41146
41120
|
}
|
41147
41121
|
} catch (error) {
|
@@ -41149,9 +41123,31 @@ async function getSearXResults(query) {
|
|
41149
41123
|
}
|
41150
41124
|
throw new Error("All SearX instances failed");
|
41151
41125
|
}
|
41126
|
+
async function getWikipediaResults(query) {
|
41127
|
+
const searchUrl = `https://en.wikipedia.org/w/api.php?action=opensearch&search=${encodeURIComponent(query)}&limit=3&format=json&origin=*`;
|
41128
|
+
const response = await enhancedFetch(searchUrl, {
|
41129
|
+
headers: {
|
41130
|
+
Accept: "application/json"
|
41131
|
+
}
|
41132
|
+
});
|
41133
|
+
if (!response.ok) {
|
41134
|
+
throw new Error(`Wikipedia API error: ${response.status}`);
|
41135
|
+
}
|
41136
|
+
const data2 = await response.json();
|
41137
|
+
if (Array.isArray(data2) && data2.length >= 4 && Array.isArray(data2[3])) {
|
41138
|
+
const urls = data2[3]?.filter((url) => url?.startsWith("https://"));
|
41139
|
+
if (urls?.length > 0) {
|
41140
|
+
return urls;
|
41141
|
+
}
|
41142
|
+
}
|
41143
|
+
throw new Error("No Wikipedia results found");
|
41144
|
+
}
|
41152
41145
|
async function getGoogleResults(query) {
|
41153
41146
|
const searchUrl = `https://www.google.com/search?q=${encodeURIComponent(query)}&num=10`;
|
41154
41147
|
const html3 = await fetchHtml(searchUrl);
|
41148
|
+
if (html3.includes("If you're having trouble accessing Google Search") || html3.includes("unusual traffic from your computer network")) {
|
41149
|
+
throw new Error("Google blocked request - detected as bot");
|
41150
|
+
}
|
41155
41151
|
const cheerioDoc = load(html3);
|
41156
41152
|
const urls = [];
|
41157
41153
|
cheerioDoc('a[href^="/url?q="]').each((_, element) => {
|
@@ -41179,13 +41175,18 @@ async function getGoogleResults(query) {
|
|
41179
41175
|
if (uniqueUrls.length === 0) {
|
41180
41176
|
throw new Error("No search results found in Google response");
|
41181
41177
|
}
|
41182
|
-
console.log(`✓ Google found ${uniqueUrls.length} results`);
|
41183
41178
|
return uniqueUrls;
|
41184
41179
|
}
|
41185
41180
|
async function getDuckDuckGoResults(query) {
|
41186
41181
|
const searchUrl = `https://api.duckduckgo.com/?q=${encodeURIComponent(query)}&format=json&no_html=1&skip_disambig=1`;
|
41187
|
-
const response = await
|
41182
|
+
const response = await enhancedFetch(searchUrl);
|
41183
|
+
if (!response.ok) {
|
41184
|
+
throw new Error(`DuckDuckGo API error: ${response.status}`);
|
41185
|
+
}
|
41188
41186
|
const data2 = await response.json();
|
41187
|
+
if (data2.Abstract?.includes("redirects users to a non-JavaScript site") || data2.Abstract?.includes("DuckDuckGo redirects users") || data2.AbstractText?.includes("redirects users to a non-JavaScript site") || data2.AbstractText?.includes("DuckDuckGo redirects users")) {
|
41188
|
+
throw new Error("DuckDuckGo blocked request - JavaScript disabled redirect");
|
41189
|
+
}
|
41189
41190
|
const urls = [];
|
41190
41191
|
if (data2.AbstractURL) {
|
41191
41192
|
urls.push(data2.AbstractURL);
|
@@ -41197,29 +41198,39 @@ async function getDuckDuckGoResults(query) {
|
|
41197
41198
|
}
|
41198
41199
|
}
|
41199
41200
|
}
|
41201
|
+
if (urls.length === 0 && data2.DefinitionURL) {
|
41202
|
+
urls.push(data2.DefinitionURL);
|
41203
|
+
}
|
41200
41204
|
if (urls.length === 0) {
|
41201
41205
|
throw new Error("No search results found in DuckDuckGo response");
|
41202
41206
|
}
|
41203
|
-
console.log(`✓ DuckDuckGo found ${urls.length} results`);
|
41204
41207
|
return urls;
|
41205
41208
|
}
|
41206
|
-
async function
|
41207
|
-
const
|
41208
|
-
|
41209
|
-
|
41210
|
-
|
41211
|
-
|
41212
|
-
|
41213
|
-
|
41214
|
-
|
41215
|
-
|
41216
|
-
|
41217
|
-
|
41218
|
-
|
41219
|
-
|
41220
|
-
|
41221
|
-
|
41209
|
+
async function enhancedFetch(url, options = {}) {
|
41210
|
+
const headers = {
|
41211
|
+
"User-Agent": getRandomUserAgent(),
|
41212
|
+
Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
|
41213
|
+
"Accept-Language": "en-US,en;q=0.9",
|
41214
|
+
"Accept-Encoding": "gzip, deflate, br",
|
41215
|
+
"sec-ch-ua": '"Not A(Brand";v="99", "Google Chrome";v="121", "Chromium";v="121"',
|
41216
|
+
"sec-ch-ua-mobile": "?0",
|
41217
|
+
"sec-ch-ua-platform": '"macOS"',
|
41218
|
+
"Sec-Fetch-Dest": "document",
|
41219
|
+
"Sec-Fetch-Mode": "navigate",
|
41220
|
+
"Sec-Fetch-Site": "cross-site",
|
41221
|
+
"Sec-Fetch-User": "?1",
|
41222
|
+
"Upgrade-Insecure-Requests": "1",
|
41223
|
+
"Cache-Control": "no-cache",
|
41224
|
+
Pragma: "no-cache",
|
41225
|
+
...options.headers
|
41226
|
+
};
|
41227
|
+
return fetch(url, {
|
41228
|
+
...options,
|
41229
|
+
headers
|
41222
41230
|
});
|
41231
|
+
}
|
41232
|
+
async function fetchHtml(url) {
|
41233
|
+
const response = await enhancedFetch(url);
|
41223
41234
|
return response.text();
|
41224
41235
|
}
|
41225
41236
|
function getRandomUserAgent() {
|
@@ -46353,19 +46364,76 @@ async function summarizeWebPage(content, openAIApiKey) {
|
|
46353
46364
|
});
|
46354
46365
|
return result;
|
46355
46366
|
}
|
46367
|
+
async function summarizeQuery(query, openAIApiKey) {
|
46368
|
+
const openai = openaiClient(openAIApiKey);
|
46369
|
+
const prompt = `You are an expert educator and researcher. Answer the following query with accurate, helpful information:
|
46370
|
+
|
46371
|
+
"${query}"
|
46372
|
+
|
46373
|
+
Guidelines:
|
46374
|
+
1. Provide a comprehensive but concise answer
|
46375
|
+
2. Use bullet points, lists, and tables when appropriate
|
46376
|
+
3. Include relevant examples or step-by-step instructions if applicable
|
46377
|
+
4. Format your response in valid markdown
|
46378
|
+
5. Be factual and cite general knowledge sources when relevant
|
46379
|
+
6. If you suggest external resources, format them as links in the response
|
46380
|
+
7. Mark proper nouns as bold e.g. **OpenAI**
|
46381
|
+
8. Use appropriate headings (##, ###) to structure your response
|
46382
|
+
9. If the query is about current events beyond your knowledge cutoff, mention that limitation
|
46383
|
+
|
46384
|
+
Provide a thorough, educational response that directly addresses the user's query.`;
|
46385
|
+
const schema = {
|
46386
|
+
textual: {
|
46387
|
+
type: "string",
|
46388
|
+
description: "Comprehensive answer to the user query"
|
46389
|
+
},
|
46390
|
+
links: {
|
46391
|
+
type: "array",
|
46392
|
+
items: {
|
46393
|
+
type: "object",
|
46394
|
+
properties: {
|
46395
|
+
name: {
|
46396
|
+
type: "string",
|
46397
|
+
description: "Descriptive name of the recommended resource"
|
46398
|
+
},
|
46399
|
+
url: {
|
46400
|
+
type: "string",
|
46401
|
+
description: "URL to the recommended resource"
|
46402
|
+
}
|
46403
|
+
},
|
46404
|
+
required: ["name", "url"]
|
46405
|
+
}
|
46406
|
+
}
|
46407
|
+
};
|
46408
|
+
const result = await openai.completeStructured(prompt, {
|
46409
|
+
temperature: 0.7,
|
46410
|
+
responseSchema: schema
|
46411
|
+
});
|
46412
|
+
return result;
|
46413
|
+
}
|
46356
46414
|
|
46357
46415
|
// src/index.ts
|
46358
46416
|
async function clai(input, openAIKey) {
|
46359
46417
|
const scrapedData = await scrape(input);
|
46360
|
-
const
|
46418
|
+
const usefulData = scrapedData.filter((data2) => data2.content.length > 200 && !data2.content.includes("Wikipedia does not have an article") && !data2.content.includes("page not found") && !data2.content.includes("404") && !data2.content.includes("error"));
|
46419
|
+
if (usefulData.length > 0) {
|
46420
|
+
const combinedContent = usefulData.map((data2) => `Content from ${data2.url}:
|
46361
46421
|
${data2.content}`).join(`
|
46362
46422
|
|
46363
46423
|
`);
|
46364
|
-
|
46424
|
+
const result2 = await summarizeWebPage(combinedContent, openAIKey);
|
46425
|
+
return {
|
46426
|
+
summary: result2.textual.trim(),
|
46427
|
+
links: result2.links,
|
46428
|
+
sources: usefulData.map((data2) => data2.url)
|
46429
|
+
};
|
46430
|
+
}
|
46431
|
+
console.log("No scraped data available - using OpenAI directly for query...");
|
46432
|
+
const result = await summarizeQuery(input, openAIKey);
|
46365
46433
|
return {
|
46366
46434
|
summary: result.textual.trim(),
|
46367
46435
|
links: result.links,
|
46368
|
-
sources:
|
46436
|
+
sources: ["OpenAI Knowledge Base"]
|
46369
46437
|
};
|
46370
46438
|
}
|
46371
46439
|
var src_default = clai;
|