@fettstorch/clai 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +48 -0
- package/dist/cli.js +73056 -0
- package/dist/index.js +46165 -0
- package/package.json +38 -0
- package/src/cli.ts +137 -0
- package/src/index.ts +45 -0
- package/src/openai.ts +92 -0
- package/src/scraper.ts +102 -0
- package/src/summarizer.ts +77 -0
- package/tsconfig.json +14 -0
package/package.json
ADDED
@@ -0,0 +1,38 @@
|
|
1
|
+
{
|
2
|
+
"name": "@fettstorch/clai",
|
3
|
+
"version": "0.1.0",
|
4
|
+
"main": "dist/index.js",
|
5
|
+
"bin": {
|
6
|
+
"clai": "dist/cli.js"
|
7
|
+
},
|
8
|
+
"repository": {
|
9
|
+
"type": "git",
|
10
|
+
"url": "git+https://github.com/schnullerpip/clai.git"
|
11
|
+
},
|
12
|
+
"scripts": {
|
13
|
+
"start": "bun run src/cli.ts",
|
14
|
+
"build": "bun build ./src/index.ts --outdir dist --target node && bun build ./src/cli.ts --outdir dist --target node",
|
15
|
+
"dev": "bun --watch src/cli.ts"
|
16
|
+
},
|
17
|
+
"author": "schnullerpip (https://github.com/schnullerpip)",
|
18
|
+
"license": "ISC",
|
19
|
+
"description": "AI-powered webpage summarizer",
|
20
|
+
"dependencies": {
|
21
|
+
"@fettstorch/jule": "^0.5.3",
|
22
|
+
"chalk": "^5.3.0",
|
23
|
+
"cheerio": "^1.0.0-rc.12",
|
24
|
+
"commander": "^12.1.0",
|
25
|
+
"inquirer": "^12.1.0",
|
26
|
+
"openai": "^4.73.0",
|
27
|
+
"ora": "^8.1.1",
|
28
|
+
"googleapis": "^126.0.1"
|
29
|
+
},
|
30
|
+
"devDependencies": {
|
31
|
+
"@types/inquirer": "^9.0.7",
|
32
|
+
"@types/node": "^20.11.19",
|
33
|
+
"bun-types": "latest"
|
34
|
+
},
|
35
|
+
"publishConfig": {
|
36
|
+
"access": "public"
|
37
|
+
}
|
38
|
+
}
|
package/src/cli.ts
ADDED
@@ -0,0 +1,137 @@
|
|
1
|
+
#!/usr/bin/env bun
|
2
|
+
import { Command } from 'commander';
|
3
|
+
import inquirer from 'inquirer';
|
4
|
+
import chalk from 'chalk';
|
5
|
+
import ora from 'ora';
|
6
|
+
import { clai } from './index';
|
7
|
+
|
8
|
+
// Commander instance for the CLI; it is configured and parsed inside main().
const program = new Command();
|
9
|
+
|
10
|
+
/**
 * CLI entry point: configures the `clai` command, resolves the input (from
 * the command-line arguments or an interactive prompt) and passes it to
 * `analyzeInput`.
 *
 * Exits with code 1 when `OPENAI_API_KEY` is not set or when an error escapes
 * the command, and with code 0 once `analyzeInput` has returned.
 */
async function main() {
  try {
    program
      .name('clai')
      .description('AI-powered web scraping tool')
      // Must match the "version" field of package.json.
      .version('0.1.0')
      .argument('[input...]', 'URL or search terms to analyze')
      .action(async (inputs: string[]) => {
        const openAIKey = process.env.OPENAI_API_KEY;

        if (!openAIKey) {
          console.error(chalk.red('❌ OPENAI_API_KEY environment variable is not set'));
          process.exit(1);
        }

        // The variadic argument arrives as separate words; join them into one
        // query and drop surrounding whitespace so a blank argument falls
        // through to the prompt.
        let input = (inputs ?? []).join(' ').trim();

        if (!input) {
          const answers = await inquirer.prompt([
            {
              type: 'input',
              name: 'input',
              message: 'Enter a URL or search query:',
              validate: (value: string) => value.trim().length > 0
            }
          ]);
          input = answers.input.trim();
        }

        await analyzeInput(input, openAIKey);
        process.exit(0);
      });

    await program.parseAsync();
  } catch (error) {
    console.error(chalk.red('Fatal error:'), error);
    process.exit(1);
  }
}
|
49
|
+
|
50
|
+
/**
 * Writes `text` to stdout one character at a time (typewriter effect) and
 * finishes with a newline. If a `keypress` event for the Return key arrives
 * while the animation runs, the remaining text is printed at once.
 *
 * @param text - Text to print.
 * @param delay - Pause after each character, in milliseconds.
 */
async function animateText(text: string, delay = 25) {
  let shouldComplete = false;

  const keypressHandler = (_str: string, key?: { name?: string }) => {
    if (key?.name === 'return') {
      shouldComplete = true;
    }
  };

  // Raw mode only exists when stdin is a TTY. With piped or redirected input
  // `setRawMode` is undefined, so skip the keyboard shortcut instead of crashing.
  const interactive =
    process.stdin.isTTY === true && typeof process.stdin.setRawMode === 'function';

  if (interactive) {
    process.stdin.on('keypress', keypressHandler);
    // Enable raw mode to get keypress events
    process.stdin.setRawMode(true);
    process.stdin.resume();
  }

  try {
    // Split by code point, not UTF-16 code unit: writing the two halves of a
    // surrogate pair (e.g. an emoji) in separate calls garbles the character.
    const characters = Array.from(text);

    for (const [index, character] of characters.entries()) {
      if (shouldComplete) {
        // Show remaining text immediately
        process.stdout.write(characters.slice(index).join(''));
        break;
      }

      process.stdout.write(character);
      await new Promise(resolve => setTimeout(resolve, delay));
    }
  } finally {
    // Always restore the terminal, even if a write failed.
    if (interactive) {
      process.stdin.setRawMode(false);
      process.stdin.pause();
      process.stdin.removeListener('keypress', keypressHandler);
    }
  }

  process.stdout.write('\n');
}
|
89
|
+
|
90
|
+
/**
 * Interactive analysis session: summarizes `input`, prints the summary, then
 * asks what to do next. Picking a link or entering a new query starts another
 * round; picking "Exit" (or hitting an error) ends the session.
 *
 * Errors are reported on the spinner and on stderr and are not rethrown.
 *
 * @param input - URL or search query for the first round.
 * @param openAIKey - OpenAI API key forwarded to `clai`.
 */
async function analyzeInput(input: string, openAIKey: string) {
  let target = input;
  let pending = true;

  while (pending) {
    pending = false;
    const progress = ora('Analyzing content...').start();

    try {
      const analysis = await clai(target, openAIKey);
      progress.succeed('Analysis complete');

      console.log(chalk.green.bold('\n📝 Summary:'));
      await animateText(analysis.summary);

      // One menu entry per extracted link, framed by "new search" and "exit".
      const linkChoices = analysis.links.map(({ name, url }) => ({
        name: `${chalk.bold(name)}: ${chalk.cyan(url)}`,
        value: url
      }));

      const { selectedLink } = await inquirer.prompt([
        {
          type: 'list',
          name: 'selectedLink',
          message: '\n\nWhat now?:',
          choices: [
            { name: chalk.yellow('🔍 New search'), value: 'new' },
            ...linkChoices,
            { name: 'Exit', value: 'exit' }
          ]
        }
      ]);

      if (selectedLink === 'new') {
        const { input: newInput } = await inquirer.prompt([
          {
            type: 'input',
            name: 'input',
            message: 'Enter a URL or search query:',
            validate: (input) => input.length > 0
          }
        ]);
        target = newInput;
        pending = true;
      } else if (selectedLink && selectedLink !== 'exit') {
        target = selectedLink;
        pending = true;
      }
    } catch (error) {
      progress?.fail('Analysis failed');
      console.error(chalk.red('Error:'), error);
    }
  }
}
|
136
|
+
|
137
|
+
// Start the CLI. main() catches its own errors and exits with code 1 on failure.
main();
|
package/src/index.ts
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
import { scrape } from './scraper';
|
2
|
+
import { summarizeWebPage as summarize } from './summarizer';
|
3
|
+
|
4
|
+
/** Result returned by `clai`. */
export interface SummaryOutput {
  /** Summary text from the summarizer, trimmed by `clai`. */
  summary: string;
  /** Links returned by the summarizer, each with a display name and a URL. */
  links: readonly { name: string; url: string }[];
  /** URLs of the pages that were scraped for this summary. */
  sources: Array<string>;
}
|
12
|
+
|
13
|
+
/**
 * Scrapes the page(s) behind `input` and has OpenAI summarize them.
 *
 * @param input - URL or search query to analyze
 * @param openAIKey - OpenAI API key, forwarded to the summarizer
 * @returns The trimmed summary, the links returned by the summarizer, and the
 *          URL of every page that was scraped
 *
 * @example
 * ```ts
 * const result = await clai('https://example.com', 'your-openai-key')
 * console.log(result.summary)  // AI generated summary
 * console.log(result.links)    // Extracted links
 * console.log(result.sources)  // Source URLs
 * ```
 */
export async function clai(input: string, openAIKey: string): Promise<SummaryOutput> {
  const pages = await scrape(input);

  // Build one attributed section per page and collect the source URLs.
  const sections: string[] = [];
  const sources: string[] = [];
  for (const page of pages) {
    sections.push(`Content from ${page.url}:\n${page.content}`);
    sources.push(page.url);
  }

  const { textual, links } = await summarize(sections.join('\n\n'), openAIKey);

  return { summary: textual.trim(), links, sources };
}
|
43
|
+
|
44
|
+
// Default export for easier importing
|
45
|
+
export default clai;
|
package/src/openai.ts
ADDED
@@ -0,0 +1,92 @@
|
|
1
|
+
import { once } from '@fettstorch/jule';
|
2
|
+
import OpenAI from 'openai';
|
3
|
+
|
4
|
+
/** Upper bound on prompt size, expressed in tokens. */
const MAX_INPUT_TOKENS = 10000;

/**
 * Caps `content` at `MAX_INPUT_TOKENS * 4` characters. Longer input is cut at
 * that length and gets a "... (content truncated)" marker appended; shorter
 * input is returned unchanged.
 */
function truncateContent(content: string): string {
  const limit = MAX_INPUT_TOKENS * 4;
  return content.length > limit
    ? `${content.slice(0, limit)}... (content truncated)`
    : content;
}
|
11
|
+
|
12
|
+
/**
 * A response carrying a function call whose `arguments` is a string.
 *
 * NOTE(review): `T` is not referenced in the body — presumably it names the
 * type the arguments parse to; confirm before relying on it.
 */
export interface StructuredResponse<T> {
  function_call: { arguments: string };
}
|
17
|
+
|
18
|
+
/**
 * Small wrapper around the OpenAI chat-completions endpoint. Every prompt is
 * passed through `truncateContent` before it is sent.
 */
class OpenAIWrapper {
  private readonly client: OpenAI;

  constructor(apiKey: string) {
    this.client = new OpenAI({ apiKey });
  }

  /**
   * Sends `prompt` as a single user message and returns the reply text.
   *
   * @param prompt - Prompt text (truncated if too long).
   * @param options - Optional model name and sampling temperature.
   * @returns The content of the first choice, or '' when there is none.
   */
  async complete(
    prompt: string,
    options: {
      model?: string;
      temperature?: number;
    } = {}
  ): Promise<string> {
    const truncatedPrompt = truncateContent(prompt);
    // The previous default, 'gpt-4o-turbo', is not an OpenAI model id.
    const { model = 'gpt-4o', temperature = 0.6 } = options;

    const response = await this.client.chat.completions.create({
      model,
      messages: [{ role: 'user', content: truncatedPrompt }],
      temperature,
      max_tokens: 2000
    });

    return response.choices[0]?.message?.content ?? '';
  }

  /**
   * Sends `prompt` and forces the model to answer through a function call
   * whose parameters follow `responseSchema`, then parses the arguments.
   *
   * @param prompt - Prompt text (truncated if too long).
   * @param options - `responseSchema` maps property names to JSON-schema
   *   fragments and every key is marked required; model, temperature and
   *   function name are optional.
   * @returns The parsed function-call arguments, cast to `T` without
   *   validation.
   * @throws Error when no arguments come back or they are not valid JSON
   *   (the reply is capped at 2000 tokens, so long output can be cut off).
   */
  async completeStructured<T>(
    prompt: string,
    options: {
      model?: string;
      temperature?: number;
      functionName?: string;
      responseSchema: Record<string, unknown>;
    }
  ): Promise<T> {
    const truncatedPrompt = truncateContent(prompt);
    const {
      model = 'gpt-3.5-turbo',
      temperature = 0.6,
      functionName = 'generate_response',
      responseSchema
    } = options;

    const response = await this.client.chat.completions.create({
      model,
      messages: [{ role: 'user', content: truncatedPrompt }],
      temperature,
      max_tokens: 2000,
      functions: [{
        name: functionName,
        parameters: {
          type: 'object',
          properties: responseSchema,
          required: Object.keys(responseSchema)
        }
      }],
      function_call: { name: functionName }
    });

    const functionCall = response.choices[0]?.message?.function_call;
    if (!functionCall?.arguments) {
      throw new Error('No function call arguments received');
    }

    try {
      return JSON.parse(functionCall.arguments) as T;
    } catch (error) {
      const reason = error instanceof Error ? error.message : String(error);
      throw new Error(`Function call arguments are not valid JSON: ${reason}`);
    }
  }
}
|
86
|
+
|
87
|
+
/**
 * Factory for the OpenAI wrapper, passed through `once` from @fettstorch/jule.
 *
 * NOTE(review): presumably `once` caches the first result, in which case a
 * later call with a different key still gets the first client — confirm.
 *
 * @throws Error when no API key is passed.
 */
export const openaiClient: (apiKey?: string) => OpenAIWrapper = once(
  (apiKey?: string) => {
    if (apiKey) {
      return new OpenAIWrapper(apiKey);
    }
    throw new Error('OPENAI_API_KEY is not set');
  }
);
|
package/src/scraper.ts
ADDED
@@ -0,0 +1,102 @@
|
|
1
|
+
import * as Cheerio from 'cheerio';
|
2
|
+
|
3
|
+
/** Data extracted from one fetched HTML page. */
export interface ScrapedData {
  /** Address of the page. */
  url: string;
  /** Text of the document's `<title>` element. */
  title: string;
  /** Text of the document's `<body>` element. */
  content: string;
}
|
8
|
+
|
9
|
+
/**
 * Fetches and extracts the page(s) behind `input`.
 *
 * When `input` looks like a URL it is fetched directly; otherwise it is used
 * as a search query and the resulting URLs are fetched in parallel. A page
 * that fails to load is logged and skipped.
 *
 * @param input - URL or search query.
 * @returns One entry per page that could be fetched (never empty).
 * @throws Error when the search fails or when no page could be fetched at
 *   all; summarizing empty content would be meaningless for callers.
 */
export async function scrape(input: string): Promise<ScrapedData[]> {
  try {
    const urls = isValidUrl(input)
      ? [normalizeUrl(input)]
      : await getGoogleResults(input);

    // Fetch all URLs in parallel; a single failure must not abort the rest.
    const results = await Promise.all(
      urls.map(async (url) => {
        try {
          const html = await fetchHtml(url);
          const data = extractDataFromHtml(html);
          // The URL that was actually fetched replaces the extracted one.
          return { ...data, url };
        } catch (error) {
          console.error(`Error scraping ${url}:`, error);
          return null;
        }
      })
    );

    // Filter out failed scrapes
    const scraped = results.filter((result): result is ScrapedData => result !== null);

    if (scraped.length === 0) {
      throw new Error(`Could not fetch any content for "${input}"`);
    }

    return scraped;
  } catch (error) {
    console.error('Error during scraping:', error);
    throw error;
  }
}
|
40
|
+
|
41
|
+
// --- module private
|
42
|
+
|
43
|
+
/**
 * Decides whether `input` should be fetched as a URL rather than searched.
 *
 * The input counts as a URL when it contains no whitespace, parses as a URL
 * (with `https://` assumed if no http/https scheme is present) and its host
 * is `localhost`, an IP literal, or contains a dot. A bare word such as
 * `typescript` is therefore treated as a search query.
 *
 * @param input - Raw user input.
 * @returns `true` when the input looks like a fetchable URL.
 */
function isValidUrl(input: string): boolean {
  if (input.length === 0 || /\s/.test(input)) {
    return false;
  }

  const candidate = /^https?:\/\//i.test(input) ? input : `https://${input}`;

  try {
    const { hostname } = new URL(candidate);
    // IPv6 hosts are reported in brackets and contain ':' instead of '.'.
    return hostname === 'localhost' || hostname.includes('.') || hostname.includes(':');
  } catch {
    // Not parseable as a URL at all.
    return false;
  }
}
|
46
|
+
|
47
|
+
/**
 * Ensures `url` carries an http(s) scheme, defaulting to `https://`.
 *
 * The scheme check is case-insensitive (URL schemes are), so `HTTPS://…` is
 * returned unchanged instead of getting a second scheme prepended.
 *
 * @param url - URL with or without a scheme.
 * @returns The URL unchanged if it already starts with http:// or https://,
 *   otherwise prefixed with `https://`.
 */
function normalizeUrl(url: string): string {
  return /^https?:\/\//i.test(url) ? url : `https://${url}`;
}
|
53
|
+
|
54
|
+
/**
 * Runs a Google search for `query` and returns up to three result URLs.
 *
 * URLs are pulled out of the result page with a regular expression. Google's
 * own hosts are dropped, and a URL is kept only if it contains one of the
 * query words.
 *
 * @param query - Free-text search terms.
 * @returns Between one and three distinct URLs.
 * @throws Error('No search results found') when nothing matches.
 */
async function getGoogleResults(query: string): Promise<string[]> {
  const searchUrl = `https://www.google.com/search?q=${encodeURIComponent(query)}`;
  const html = await fetchHtml(searchUrl);

  const urlPattern = /https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)/gi;
  const urls: string[] = html.match(urlPattern) ?? [];

  const allWords = query
    .toLowerCase()
    .split(/\s+/)
    .filter(word => word.length > 0);

  // Prefer words longer than two characters to skip filler like "is" or "of".
  // If the query has only short words (e.g. "go vs c"), fall back to all of
  // them; otherwise the keyword list is empty and every URL is rejected.
  const longWords = allWords.filter(word => word.length > 2);
  const queryWords = longWords.length > 0 ? longWords : allWords;

  const excludedHosts = ['www.google', 'gstatic.com', 'googleapis.com', 'googleadservices'];

  const filteredUrls = new Set(
    urls.filter(url => {
      const urlLower = url.toLowerCase();
      return !excludedHosts.some(host => urlLower.includes(host)) &&
        queryWords.some(word => urlLower.includes(word));
    })
  );

  const results = [...filteredUrls].slice(0, 3);

  if (results.length === 0) {
    throw new Error('No search results found');
  }

  return results;
}
|
85
|
+
|
86
|
+
/**
 * Downloads `url` with a desktop-browser User-Agent and returns the body text.
 *
 * @param url - Absolute URL to fetch.
 * @returns The response body as text.
 * @throws Error when the request fails or the server answers with a non-2xx
 *   status, so that error pages are not mistaken for page content.
 */
async function fetchHtml(url: string): Promise<string> {
  const response = await fetch(url, {
    headers: {
      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
  });

  if (!response.ok) {
    throw new Error(`Request to ${url} failed with HTTP ${response.status} ${response.statusText}`);
  }

  return response.text();
}
|
94
|
+
|
95
|
+
/**
 * Extracts title, body text and canonical URL from an HTML document.
 *
 * @param html - Raw HTML markup.
 * @returns `title` from the `<title>` element, `content` from the `<body>`
 *   text, and `url` from `<link rel="canonical">` ('' when absent).
 */
function extractDataFromHtml(html: string): ScrapedData {
  const cheerioDoc = Cheerio.load(html);

  // Drop non-content elements first. Depending on the cheerio version,
  // `.text()` includes the raw contents of inline scripts and styles, which
  // would waste the limited prompt budget on code instead of page text.
  cheerioDoc('script, style, noscript').remove();

  return {
    title: cheerioDoc('title').text(),
    content: cheerioDoc('body').text(),
    url: cheerioDoc('link[rel="canonical"]').attr('href') || ''
  };
}
|
package/src/summarizer.ts
ADDED
@@ -0,0 +1,77 @@
|
|
1
|
+
import { openaiClient } from './openai';
|
2
|
+
|
3
|
+
/** Structured answer expected from the summarization call. */
export type SummaryResult = Readonly<{
  /** The summary text. */
  textual: string;
  /** Links extracted from the analyzed text, each with a name and a URL. */
  links: readonly { name: string; url: string }[];
}>;
|
10
|
+
|
11
|
+
/**
 * Summarizes content and extracts relevant links using OpenAI
 * @param content - The text content to analyze and summarize
 * @param openAIApiKey - OpenAI API key used to obtain the client
 * @returns Promise containing the summary text and extracted links
 * @throws Will throw an error if the OpenAI API call fails or the response
 *         carries no summary text
 *
 * @example
 * ```ts
 * const result = await summarizeWebPage(pageText, 'your-openai-key')
 * console.log(result.textual) // Summary text
 * console.log(result.links)   // Array of extracted links
 * ```
 */
export async function summarizeWebPage(content: string, openAIApiKey: string): Promise<SummaryResult> {
  const openai = openaiClient(openAIApiKey);

  const prompt = `Your are an expert educator. Analyze the following text and create a
concise summary with the following guidelines:
1. Always use bullet points, lists and tables over paragraphs.
2. Produce valid markdown output
3. Use the articles titles and headings as a guide
4. Try to present the most relevant information
5. Extract all meaningful links from the text
6. Don't narrate the content e.g. 'The text says that the earth is round' but rather use the content itself e.g. 'The earth is round'
7. Don't use the word 'text' or 'content' in your summary
8. Don't try to emulate emotions or tone of the original content, always be neutral and objective
9. If the content is instructional repeat the instructions step by step e.g.:
- Step 1: Do this
- Step 2: Do that
- Step 3: Done
10. Mark proper nouns as bold e.g. **Harry Potter**

Don't just summarize, cite the key information.

Text to analyze:\n"${content}\n"`;

  // JSON-schema fragments for the two properties of the function-call result.
  const schema = {
    textual: {
      type: 'string',
      description: 'Concise summary of the text'
    },
    links: {
      type: 'array',
      items: {
        type: 'object',
        properties: {
          name: {
            type: 'string',
            description: 'Descriptive name or title of the link'
          },
          url: {
            type: 'string',
            description: 'The URL of the link'
          }
        },
        required: ['name', 'url']
      }
    }
  };

  // The model's output is parsed JSON of unknown shape: check it instead of
  // trusting the cast, because callers use `textual` and `links` directly.
  const raw = await openai.completeStructured<Partial<SummaryResult> | null>(prompt, {
    temperature: 0.3,
    responseSchema: schema
  });

  if (typeof raw?.textual !== 'string') {
    throw new Error('OpenAI response did not contain a summary');
  }

  const links = Array.isArray(raw.links)
    ? raw.links.filter(
        link => typeof link?.name === 'string' && typeof link?.url === 'string'
      )
    : [];

  return { textual: raw.textual, links };
}
|
package/tsconfig.json
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
{
|
2
|
+
"compilerOptions": {
|
3
|
+
"target": "esnext",
|
4
|
+
"module": "esnext",
|
5
|
+
"moduleResolution": "bundler",
|
6
|
+
"types": ["bun-types"],
|
7
|
+
"outDir": "./dist",
|
8
|
+
"rootDir": "./src",
|
9
|
+
"strict": true,
|
10
|
+
"skipLibCheck": true,
|
11
|
+
"forceConsistentCasingInFileNames": true
|
12
|
+
},
|
13
|
+
"include": ["src/**/*"]
|
14
|
+
}
|