@fettstorch/clai 0.1.5 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/scraper.ts CHANGED
@@ -1,4 +1,4 @@
-import * as Cheerio from 'cheerio';
+import * as Cheerio from "cheerio";
 
 export interface ScrapedData {
   title: string;
@@ -9,11 +9,11 @@ export interface ScrapedData {
 export async function scrape(input: string): Promise<ScrapedData[]> {
   try {
     let urls: string[];
-
+
     if (isValidUrl(input)) {
       urls = [normalizeUrl(input)];
     } else {
-      urls = await getGoogleResults(input);
+      urls = await getSearchResults(input);
     }
 
     // Fetch all URLs in parallel
@@ -33,7 +33,7 @@ export async function scrape(input: string): Promise<ScrapedData[]> {
     // Filter out failed scrapes
     return results.filter((result): result is ScrapedData => result !== null);
   } catch (error) {
-    console.error('Error during scraping:', error);
+    console.error("Error during scraping:", error);
     throw error;
   }
 }
@@ -42,66 +42,272 @@ export async function scrape(input: string): Promise<ScrapedData[]> {
 
 function isValidUrl(input: string): boolean {
   // Check for whitespace
-  if (input.includes(' ')) return false;
-
+  if (input.includes(" ")) return false;
+
   // Check for common TLDs using regex
   const tldPattern = /^[^\s]+\.[a-z]{2,}$/i;
   return tldPattern.test(input);
 }
 
 function normalizeUrl(url: string): string {
-  if (!url.startsWith('http://') && !url.startsWith('https://')) {
+  if (!url.startsWith("http://") && !url.startsWith("https://")) {
     return `https://${url}`;
   }
   return url;
 }
 
+async function getSearchResults(query: string): Promise<string[]> {
+  try {
+    return await getSearXResults(query);
+  } catch (_) {
+    console.log("Trying Google search...");
+    try {
+      return await getGoogleResults(query);
+    } catch (_) {
+      console.log("Trying DuckDuckGo search...");
+      try {
+        return await getDuckDuckGoResults(query);
+      } catch (_) {
+        console.log("Using emergency fallback...");
+        return getEmergencyResults(query);
+      }
+    }
+  }
+}
+
+function getEmergencyResults(query: string): string[] {
+  // Emergency fallback - construct likely URLs based on the query
+  const results: string[] = [];
+
+  // Try to construct some reasonable URLs based on common patterns
+  const cleanQuery = query
+    .toLowerCase()
+    .replace(/[^a-z0-9\s]/g, "")
+    .trim();
+  const words = cleanQuery.split(/\s+/).filter((word) => word.length > 2);
+
+  if (words.length > 0) {
+    const mainWord = words[0];
+
+    // Add some likely candidates
+    results.push(
+      `https://en.wikipedia.org/wiki/${encodeURIComponent(
+        query.replace(/\s+/g, "_")
+      )}`
+    );
+
+    if (mainWord.length > 3) {
+      results.push(`https://${mainWord}.com`);
+      results.push(`https://www.${mainWord}.org`);
+    }
+
+    // Add a Reddit search as last resort
+    results.push(
+      `https://www.reddit.com/search/?q=${encodeURIComponent(query)}`
+    );
+  }
+
+  console.log("Emergency fallback returning:", results.join(", "));
+  return results.length > 0
+    ? results.slice(0, 3)
+    : [
+        `https://en.wikipedia.org/wiki/${encodeURIComponent(
+          query.replace(/\s+/g, "_")
+        )}`,
+      ];
+}
+
+async function getSearXResults(query: string): Promise<string[]> {
+  // Public SearXNG instances that are scraper-friendly
+  const searxInstances = [
+    "https://searx.be",
+    "https://search.sapti.me",
+    "https://searx.tiekoetter.com",
+    "https://searx.prvcy.eu",
+  ];
+
+  // Try instances until one works
+  for (const instance of searxInstances) {
+    try {
+      const searchUrl = `${instance}/search?q=${encodeURIComponent(
+        query
+      )}&format=json&categories=general`;
+
+      console.log("Trying SearX search...");
+
+      const response = await fetch(searchUrl, {
+        headers: {
+          "User-Agent": getRandomUserAgent(),
+          Accept: "application/json",
+        },
+      });
+
+      if (!response.ok) {
+        continue;
+      }
+
+      const data = await response.json();
+      const urls: string[] = [];
+
+      if (data.results && Array.isArray(data.results)) {
+        for (const result of data.results.slice(0, 5)) {
+          if (
+            result.url &&
+            (result.url.startsWith("http://") ||
+              result.url.startsWith("https://")) &&
+            !result.url.includes("wikipedia.org") && // Skip Wikipedia for diversity
+            !urls.includes(result.url)
+          ) {
+            urls.push(result.url);
+          }
+        }
+      }
+
+      if (urls.length > 0) {
+        console.log(`✓ SearX found ${urls.length} results`);
+        return urls.slice(0, 3); // Limit to 3 results
+      }
+    } catch (error) {
+      // Continue to next instance
+    }
+  }
+
+  throw new Error("All SearX instances failed");
+}
+
 async function getGoogleResults(query: string): Promise<string[]> {
-  const searchUrl = `https://www.google.com/search?q=${encodeURIComponent(query)}`;
+  const searchUrl = `https://www.google.com/search?q=${encodeURIComponent(
+    query
+  )}&num=10`;
+
   const html = await fetchHtml(searchUrl);
-
-  const urlPattern = /https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)/gi;
-  const urls = html.match(urlPattern) || [];
-
-  const queryWords = query
-    .toLowerCase()
-    .split(/\s+/)
-    .filter(word => word.length > 2);
-
-  const filteredUrls = new Set(
-    urls.filter(url => {
-      const urlLower = url.toLowerCase();
-      return !urlLower.includes('www.google') &&
-        !urlLower.includes('gstatic.com') &&
-        !urlLower.includes('googleapis.com') &&
-        !urlLower.includes('googleadservices') &&
-        queryWords.some(word => urlLower.includes(word));
-    })
-  );
-
-  const results = [...filteredUrls].slice(0, 3);
-
-  if (results.length === 0) {
-    throw new Error('No search results found');
+  const cheerioDoc = Cheerio.load(html);
+  const urls: string[] = [];
+
+  // Google search result links are in <a> tags with href starting with /url?q=
+  cheerioDoc('a[href^="/url?q="]').each((_, element) => {
+    const href = cheerioDoc(element).attr("href");
+    if (href) {
+      // Extract the actual URL from Google's redirect format: /url?q=ACTUAL_URL&sa=...
+      const urlMatch = href.match(/\/url\?q=([^&]+)/);
+      if (urlMatch) {
+        try {
+          const decodedUrl = decodeURIComponent(urlMatch[1]);
+          // Filter out Google's own URLs and other unwanted domains
+          if (
+            !decodedUrl.includes("google.com") &&
+            !decodedUrl.includes("youtube.com") &&
+            (decodedUrl.startsWith("http://") ||
+              decodedUrl.startsWith("https://"))
+          ) {
+            urls.push(decodedUrl);
+          }
+        } catch (error) {
+          // Skip malformed URLs
+        }
+      }
+    }
+  });
+
+  // Also try direct links (sometimes Google shows direct links)
+  cheerioDoc('a[href^="http"]').each((_, element) => {
+    const href = cheerioDoc(element).attr("href");
+    if (
+      href &&
+      !href.includes("google.com") &&
+      !href.includes("youtube.com") &&
+      !urls.includes(href)
+    ) {
+      urls.push(href);
+    }
+  });
+
+  // Remove duplicates and limit to first 3 results
+  const uniqueUrls = [...new Set(urls)].slice(0, 3);
+
+  if (uniqueUrls.length === 0) {
+    throw new Error("No search results found in Google response");
+  }
+
+  console.log(`✓ Google found ${uniqueUrls.length} results`);
+  return uniqueUrls;
+}
+
+async function getDuckDuckGoResults(query: string): Promise<string[]> {
+  const searchUrl = `https://api.duckduckgo.com/?q=${encodeURIComponent(
+    query
+  )}&format=json&no_html=1&skip_disambig=1`;
+
+  const response = await fetch(searchUrl);
+  const data = await response.json();
+
+  const urls: string[] = [];
+
+  if (data.AbstractURL) {
+    urls.push(data.AbstractURL);
+  }
+
+  if (data.RelatedTopics) {
+    for (const topic of data.RelatedTopics.slice(0, 2)) {
+      if (topic.FirstURL) {
+        urls.push(topic.FirstURL);
+      }
+    }
   }
-
-  return results;
+
+  if (urls.length === 0) {
+    throw new Error("No search results found in DuckDuckGo response");
+  }
+
+  console.log(`✓ DuckDuckGo found ${urls.length} results`);
+  return urls;
 }
 
 async function fetchHtml(url: string): Promise<string> {
   const response = await fetch(url, {
     headers: {
-      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
-    }
+      "User-Agent": getRandomUserAgent(),
+      Accept:
+        "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
+      "Accept-Language": "en-US,en;q=0.9",
+      "Accept-Encoding": "gzip, deflate, br",
+      DNT: "1",
+      Connection: "keep-alive",
+      "Upgrade-Insecure-Requests": "1",
+      "Sec-Fetch-Dest": "document",
+      "Sec-Fetch-Mode": "navigate",
+      "Sec-Fetch-Site": "none",
+      "Sec-Fetch-User": "?1",
+      "Cache-Control": "max-age=0",
+    },
   });
   return response.text();
 }
 
+function getRandomUserAgent(): string {
+  const userAgents = [
+    // Latest Chrome on macOS
+    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
+    // Latest Chrome on Windows
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
+    // Latest Firefox on macOS
+    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:132.0) Gecko/20100101 Firefox/132.0",
+    // Latest Firefox on Windows
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:132.0) Gecko/20100101 Firefox/132.0",
+    // Latest Safari on macOS
+    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.1 Safari/605.1.15",
+    // Latest Edge on Windows
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0",
+  ];
+
+  return userAgents[Math.floor(Math.random() * userAgents.length)];
+}
+
 function extractDataFromHtml(html: string): ScrapedData {
   const cheerioDoc = Cheerio.load(html);
   return {
-    title: cheerioDoc('title').text(),
-    content: cheerioDoc('body').text(),
-    url: cheerioDoc('link[rel="canonical"]').attr('href') || ''
+    title: cheerioDoc("title").text(),
+    content: cheerioDoc("body").text(),
+    url: cheerioDoc('link[rel="canonical"]').attr("href") || "",
   };
-}
+}
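
Editor's note: the substance of this release is in scraper.ts. The single regex-based Google scrape from 0.1.5 is replaced by a four-stage fallback chain (public SearXNG instances → Google HTML parsed with Cheerio → the DuckDuckGo Instant Answer API → guessed "emergency" URLs), with rotating browser User-Agent headers on every fetch. A minimal sketch of what callers see, assuming `scrape` is re-exported from the package root (the import path is an assumption; `scrape` and `ScrapedData` are the real exports shown above):

```ts
// Sketch only: import path is an assumption; scrape() and ScrapedData
// come from the diff above.
import { scrape, type ScrapedData } from "@fettstorch/clai";

// Input containing whitespace fails isValidUrl(), so it is treated as a
// search query and routed through getSearchResults():
// SearX → Google → DuckDuckGo → getEmergencyResults() as the last resort.
const searched: ScrapedData[] = await scrape("bun typescript runtime");

// Domain-shaped input skips search entirely: normalizeUrl() prefixes
// "https://" and the page is fetched and parsed directly.
const direct: ScrapedData[] = await scrape("example.com");

for (const page of [...searched, ...direct]) {
  console.log(`${page.title} (${page.url}): ${page.content.length} chars`);
}
```

Note that each provider failure is swallowed (`catch (_)`) and merely logged before the next provider is tried, so a total search outage surfaces as the guessed Wikipedia/`.com` URLs from `getEmergencyResults` rather than as a thrown error.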
package/src/summarizer.ts CHANGED
@@ -1,11 +1,11 @@
-import { openaiClient } from './openai';
+import { openaiClient } from './openai'
 
 export type SummaryResult = Readonly<{
-  textual: string;
-  links: ReadonlyArray<{
-    name: string;
-    url: string;
-  }>;
+  textual: string
+  links: ReadonlyArray<{
+    name: string
+    url: string
+  }>
 }>
 
 /**
@@ -14,7 +14,7 @@ export type SummaryResult = Readonly<{
  * @param maxLength - Maximum length of the summary in words
  * @returns Promise containing the summary text and extracted links
  * @throws Will throw an error if OpenAI API call fails
- *
+ *
  * @example
  * ```ts
 * const result = await summarizeContent(longText, 100)
@@ -23,10 +23,13 @@ export type SummaryResult = Readonly<{
 * ```
 */
 
-export async function summarizeWebPage(content: string, openAIApiKey: string): Promise<SummaryResult> {
-  const openai = openaiClient(openAIApiKey);
-
-  const prompt = `Your are an expert educator. Analyze the following text and create a
+export async function summarizeWebPage(
+  content: string,
+  openAIApiKey: string,
+): Promise<SummaryResult> {
+  const openai = openaiClient(openAIApiKey)
+
+  const prompt = `Your are an expert educator. Analyze the following text and create a
   concise summary with the following guidelines:
   1. Always use bullet points, lists and tables over paragraphs.
   2. Produce valid markdown output
@@ -45,36 +48,36 @@ export async function summarizeWebPage(content: string, openAIApiKey: string): P
 
   Don't just summarize, cite the key information.
 
-  Text to analyze:\n"${content}\n"`;
+  Text to analyze:\n"${content}\n"`
 
-  const schema = {
-    textual: {
-      type: 'string',
-      description: 'Concise summary of the text'
-    },
-    links: {
-      type: 'array',
-      items: {
-        type: 'object',
-        properties: {
-          name: {
-            type: 'string',
-            description: 'Descriptive name or title of the link'
-          },
-          url: {
-            type: 'string',
-            description: 'The URL of the link'
-          }
-        },
-        required: ['name', 'url']
-      }
-    }
-  };
+  const schema = {
+    textual: {
+      type: 'string',
+      description: 'Concise summary of the text',
+    },
+    links: {
+      type: 'array',
+      items: {
+        type: 'object',
+        properties: {
+          name: {
+            type: 'string',
+            description: 'Descriptive name or title of the link',
+          },
+          url: {
+            type: 'string',
+            description: 'The URL of the link',
+          },
+        },
+        required: ['name', 'url'],
+      },
+    },
+  }
 
-  const result = await openai.completeStructured<SummaryResult>(prompt, {
-    temperature: 0.3,
-    responseSchema: schema
-  });
+  const result = await openai.completeStructured<SummaryResult>(prompt, {
+    temperature: 0.3,
+    responseSchema: schema,
+  })
 
-  return result;
-}
+  return result
+}
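
Editor's note: the summarizer.ts changes are formatting only (semicolons dropped, trailing commas and a multi-line signature added); `summarizeWebPage` behaves as before. For orientation, a hedged sketch wiring it to the scraper; the import path and environment-variable name are assumptions, while the signatures and the `SummaryResult` shape come from the diff:

```ts
// Sketch only: import path and env var name are assumptions.
import { scrape, summarizeWebPage } from "@fettstorch/clai";

const apiKey = process.env.OPENAI_API_KEY;
if (!apiKey) throw new Error("OPENAI_API_KEY is not set");

const pages = await scrape("fettstorch clai");
for (const page of pages) {
  // SummaryResult is Readonly<{ textual: string; links: { name; url }[] }>
  const summary = await summarizeWebPage(page.content, apiKey);
  console.log(summary.textual);
  for (const link of summary.links) {
    console.log(`- ${link.name}: ${link.url}`);
  }
}
```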
package/tsconfig.json CHANGED
@@ -1,15 +1,15 @@
 {
-  "compilerOptions": {
-    "target": "esnext",
-    "module": "esnext",
-    "moduleResolution": "bundler",
-    "types": ["bun-types"],
-    "outDir": "./dist",
-    "rootDir": "./src",
-    "strict": true,
-    "skipLibCheck": true,
-    "forceConsistentCasingInFileNames": true,
-    "resolveJsonModule": true
-  },
-  "include": ["src/**/*"]
-}
+  "compilerOptions": {
+    "target": "esnext",
+    "module": "esnext",
+    "moduleResolution": "bundler",
+    "types": ["bun-types"],
+    "outDir": "./dist",
+    "rootDir": "./src",
+    "strict": true,
+    "skipLibCheck": true,
+    "forceConsistentCasingInFileNames": true,
+    "resolveJsonModule": true
+  },
+  "include": ["src/**/*"]
+}