@fettstorch/clai 0.1.6 → 0.1.7
This diff compares the contents of publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
- package/.vscode/settings.json +10 -0
- package/README.md +1 -0
- package/biome.json +33 -0
- package/dist/cli.js +5875 -5611
- package/dist/index.js +1570 -1365
- package/package.json +36 -36
- package/src/cli.ts +148 -140
- package/src/index.ts +28 -25
- package/src/openai.ts +29 -25
- package/src/scraper.ts +246 -40
- package/src/summarizer.ts +44 -41
- package/tsconfig.json +14 -14
package/src/scraper.ts
CHANGED
@@ -1,4 +1,4 @@
-import * as Cheerio from …
+import * as Cheerio from "cheerio";
 
 export interface ScrapedData {
   title: string;
@@ -9,11 +9,11 @@ export interface ScrapedData {
 export async function scrape(input: string): Promise<ScrapedData[]> {
   try {
     let urls: string[];
-
+
     if (isValidUrl(input)) {
       urls = [normalizeUrl(input)];
     } else {
-      urls = await …
+      urls = await getSearchResults(input);
     }
 
     // Fetch all URLs in parallel
@@ -33,7 +33,7 @@ export async function scrape(input: string): Promise<ScrapedData[]> {
     // Filter out failed scrapes
     return results.filter((result): result is ScrapedData => result !== null);
   } catch (error) {
-    console.error(…
+    console.error("Error during scraping:", error);
     throw error;
   }
 }
@@ -42,66 +42,272 @@ export async function scrape(input: string): Promise<ScrapedData[]> {
 
 function isValidUrl(input: string): boolean {
   // Check for whitespace
-  if (input.includes(…
-
+  if (input.includes(" ")) return false;
+
   // Check for common TLDs using regex
   const tldPattern = /^[^\s]+\.[a-z]{2,}$/i;
   return tldPattern.test(input);
 }
 
 function normalizeUrl(url: string): string {
-  if (!url.startsWith(…
+  if (!url.startsWith("http://") && !url.startsWith("https://")) {
     return `https://${url}`;
   }
   return url;
 }
 
+async function getSearchResults(query: string): Promise<string[]> {
+  try {
+    return await getSearXResults(query);
+  } catch (_) {
+    console.log("Trying Google search...");
+    try {
+      return await getGoogleResults(query);
+    } catch (_) {
+      console.log("Trying DuckDuckGo search...");
+      try {
+        return await getDuckDuckGoResults(query);
+      } catch (_) {
+        console.log("Using emergency fallback...");
+        return getEmergencyResults(query);
+      }
+    }
+  }
+}
+
+function getEmergencyResults(query: string): string[] {
+  // Emergency fallback - construct likely URLs based on the query
+  const results: string[] = [];
+
+  // Try to construct some reasonable URLs based on common patterns
+  const cleanQuery = query
+    .toLowerCase()
+    .replace(/[^a-z0-9\s]/g, "")
+    .trim();
+  const words = cleanQuery.split(/\s+/).filter((word) => word.length > 2);
+
+  if (words.length > 0) {
+    const mainWord = words[0];
+
+    // Add some likely candidates
+    results.push(
+      `https://en.wikipedia.org/wiki/${encodeURIComponent(
+        query.replace(/\s+/g, "_")
+      )}`
+    );
+
+    if (mainWord.length > 3) {
+      results.push(`https://${mainWord}.com`);
+      results.push(`https://www.${mainWord}.org`);
+    }
+
+    // Add a Reddit search as last resort
+    results.push(
+      `https://www.reddit.com/search/?q=${encodeURIComponent(query)}`
+    );
+  }
+
+  console.log("Emergency fallback returning:", results.join(", "));
+  return results.length > 0
+    ? results.slice(0, 3)
+    : [
+        `https://en.wikipedia.org/wiki/${encodeURIComponent(
+          query.replace(/\s+/g, "_")
+        )}`,
+      ];
+}
+
+async function getSearXResults(query: string): Promise<string[]> {
+  // Public SearXNG instances that are scraper-friendly
+  const searxInstances = [
+    "https://searx.be",
+    "https://search.sapti.me",
+    "https://searx.tiekoetter.com",
+    "https://searx.prvcy.eu",
+  ];
+
+  // Try instances until one works
+  for (const instance of searxInstances) {
+    try {
+      const searchUrl = `${instance}/search?q=${encodeURIComponent(
+        query
+      )}&format=json&categories=general`;
+
+      console.log("Trying SearX search...");
+
+      const response = await fetch(searchUrl, {
+        headers: {
+          "User-Agent": getRandomUserAgent(),
+          Accept: "application/json",
+        },
+      });
+
+      if (!response.ok) {
+        continue;
+      }
+
+      const data = await response.json();
+      const urls: string[] = [];
+
+      if (data.results && Array.isArray(data.results)) {
+        for (const result of data.results.slice(0, 5)) {
+          if (
+            result.url &&
+            (result.url.startsWith("http://") ||
+              result.url.startsWith("https://")) &&
+            !result.url.includes("wikipedia.org") && // Skip Wikipedia for diversity
+            !urls.includes(result.url)
+          ) {
+            urls.push(result.url);
+          }
+        }
+      }
+
+      if (urls.length > 0) {
+        console.log(`✓ SearX found ${urls.length} results`);
+        return urls.slice(0, 3); // Limit to 3 results
+      }
+    } catch (error) {
+      // Continue to next instance
+    }
+  }
+
+  throw new Error("All SearX instances failed");
+}
+
 async function getGoogleResults(query: string): Promise<string[]> {
-  const searchUrl = `https://www.google.com/search?q=${encodeURIComponent(…
+  const searchUrl = `https://www.google.com/search?q=${encodeURIComponent(
+    query
+  )}&num=10`;
+
   const html = await fetchHtml(searchUrl);
- [24 old lines truncated in the diff source]
+  const cheerioDoc = Cheerio.load(html);
+  const urls: string[] = [];
+
+  // Google search result links are in <a> tags with href starting with /url?q=
+  cheerioDoc('a[href^="/url?q="]').each((_, element) => {
+    const href = cheerioDoc(element).attr("href");
+    if (href) {
+      // Extract the actual URL from Google's redirect format: /url?q=ACTUAL_URL&sa=...
+      const urlMatch = href.match(/\/url\?q=([^&]+)/);
+      if (urlMatch) {
+        try {
+          const decodedUrl = decodeURIComponent(urlMatch[1]);
+          // Filter out Google's own URLs and other unwanted domains
+          if (
+            !decodedUrl.includes("google.com") &&
+            !decodedUrl.includes("youtube.com") &&
+            (decodedUrl.startsWith("http://") ||
+              decodedUrl.startsWith("https://"))
+          ) {
+            urls.push(decodedUrl);
+          }
+        } catch (error) {
+          // Skip malformed URLs
+        }
+      }
+    }
+  });
+
+  // Also try direct links (sometimes Google shows direct links)
+  cheerioDoc('a[href^="http"]').each((_, element) => {
+    const href = cheerioDoc(element).attr("href");
+    if (
+      href &&
+      !href.includes("google.com") &&
+      !href.includes("youtube.com") &&
+      !urls.includes(href)
+    ) {
+      urls.push(href);
+    }
+  });
+
+  // Remove duplicates and limit to first 3 results
+  const uniqueUrls = [...new Set(urls)].slice(0, 3);
+
+  if (uniqueUrls.length === 0) {
+    throw new Error("No search results found in Google response");
+  }
+
+  console.log(`✓ Google found ${uniqueUrls.length} results`);
+  return uniqueUrls;
+}
+
+async function getDuckDuckGoResults(query: string): Promise<string[]> {
+  const searchUrl = `https://api.duckduckgo.com/?q=${encodeURIComponent(
+    query
+  )}&format=json&no_html=1&skip_disambig=1`;
+
+  const response = await fetch(searchUrl);
+  const data = await response.json();
+
+  const urls: string[] = [];
+
+  if (data.AbstractURL) {
+    urls.push(data.AbstractURL);
+  }
+
+  if (data.RelatedTopics) {
+    for (const topic of data.RelatedTopics.slice(0, 2)) {
+      if (topic.FirstURL) {
+        urls.push(topic.FirstURL);
+      }
+    }
   }
- [2 old lines truncated in the diff source]
+
+  if (urls.length === 0) {
+    throw new Error("No search results found in DuckDuckGo response");
+  }
+
+  console.log(`✓ DuckDuckGo found ${urls.length} results`);
+  return urls;
 }
 
 async function fetchHtml(url: string): Promise<string> {
   const response = await fetch(url, {
     headers: {
- [2 old lines truncated in the diff source]
+      "User-Agent": getRandomUserAgent(),
+      Accept:
+        "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
+      "Accept-Language": "en-US,en;q=0.9",
+      "Accept-Encoding": "gzip, deflate, br",
+      DNT: "1",
+      Connection: "keep-alive",
+      "Upgrade-Insecure-Requests": "1",
+      "Sec-Fetch-Dest": "document",
+      "Sec-Fetch-Mode": "navigate",
+      "Sec-Fetch-Site": "none",
+      "Sec-Fetch-User": "?1",
+      "Cache-Control": "max-age=0",
+    },
   });
   return response.text();
 }
 
+function getRandomUserAgent(): string {
+  const userAgents = [
+    // Latest Chrome on macOS
+    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
+    // Latest Chrome on Windows
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
+    // Latest Firefox on macOS
+    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:132.0) Gecko/20100101 Firefox/132.0",
+    // Latest Firefox on Windows
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:132.0) Gecko/20100101 Firefox/132.0",
+    // Latest Safari on macOS
+    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.1 Safari/605.1.15",
+    // Latest Edge on Windows
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0",
+  ];
+
+  return userAgents[Math.floor(Math.random() * userAgents.length)];
+}
+
 function extractDataFromHtml(html: string): ScrapedData {
   const cheerioDoc = Cheerio.load(html);
   return {
-    title: cheerioDoc(…
-    content: cheerioDoc(…
-    url: cheerioDoc('link[rel="canonical"]').attr(…
+    title: cheerioDoc("title").text(),
+    content: cheerioDoc("body").text(),
+    url: cheerioDoc('link[rel="canonical"]').attr("href") || "",
   };
-}
+}
package/src/summarizer.ts
CHANGED
@@ -1,11 +1,11 @@
-import { openaiClient } from './openai'
+import { openaiClient } from './openai'
 
 export type SummaryResult = Readonly<{
- [5 old lines truncated in the diff source]
+  textual: string
+  links: ReadonlyArray<{
+    name: string
+    url: string
+  }>
 }>
 
 /**
@@ -14,7 +14,7 @@ export type SummaryResult = Readonly<{
  * @param maxLength - Maximum length of the summary in words
  * @returns Promise containing the summary text and extracted links
  * @throws Will throw an error if OpenAI API call fails
- *
+ *
  * @example
  * ```ts
  * const result = await summarizeContent(longText, 100)
@@ -23,10 +23,13 @@ export type SummaryResult = Readonly<{
 * ```
 */
 
-export async function summarizeWebPage(content: string, openAIApiKey: string): Promise<SummaryResult> {
- [3 old lines truncated in the diff source]
+export async function summarizeWebPage(
+  content: string,
+  openAIApiKey: string,
+): Promise<SummaryResult> {
+  const openai = openaiClient(openAIApiKey)
+
+  const prompt = `Your are an expert educator. Analyze the following text and create a
 concise summary with the following guidelines:
 1. Always use bullet points, lists and tables over paragraphs.
 2. Produce valid markdown output
@@ -45,36 +48,36 @@ export async function summarizeWebPage(content: string, openAIApiKey: string): Promise<SummaryResult> {
 
 Don't just summarize, cite the key information.
 
-Text to analyze:\n"${content}\n"
+Text to analyze:\n"${content}\n"`
 
- [23 old lines truncated in the diff source]
+  const schema = {
+    textual: {
+      type: 'string',
+      description: 'Concise summary of the text',
+    },
+    links: {
+      type: 'array',
+      items: {
+        type: 'object',
+        properties: {
+          name: {
+            type: 'string',
+            description: 'Descriptive name or title of the link',
+          },
+          url: {
+            type: 'string',
+            description: 'The URL of the link',
+          },
+        },
+        required: ['name', 'url'],
+      },
+    },
+  }
 
- [4 old lines truncated in the diff source]
+  const result = await openai.completeStructured<SummaryResult>(prompt, {
+    temperature: 0.3,
+    responseSchema: schema,
+  })
 
-
-}
+  return result
+}
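
summarizeWebPage now builds the OpenAI client per call from the key it receives and requests a completion constrained to the SummaryResult schema. A short calling sketch, under stated assumptions: completeStructured is the package's own wrapper in src/openai.ts (not an official OpenAI SDK method), and the import path and env var below are illustrative.

// Hypothetical calling sketch for the updated signature.
import { summarizeWebPage } from "./summarizer";

const apiKey = process.env.OPENAI_API_KEY ?? ""; // assumed env var
const content = "…long scraped page text…";

const summary = await summarizeWebPage(content, apiKey);
console.log(summary.textual); // markdown summary
for (const link of summary.links) {
  console.log(`${link.name} -> ${link.url}`);
}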
package/tsconfig.json
CHANGED
@@ -1,15 +1,15 @@
 {
- [13 old lines truncated in the diff source]
-}
+  "compilerOptions": {
+    "target": "esnext",
+    "module": "esnext",
+    "moduleResolution": "bundler",
+    "types": ["bun-types"],
+    "outDir": "./dist",
+    "rootDir": "./src",
+    "strict": true,
+    "skipLibCheck": true,
+    "forceConsistentCasingInFileNames": true,
+    "resolveJsonModule": true
+  },
+  "include": ["src/**/*"]
+}