halbot 1995.1.58 → 1995.1.61
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/index.mjs +31 -40
- package/lib/hal.mjs +27 -3
- package/package.json +11 -11
- package/pipeline/010_broca.mjs +21 -2
- package/pipeline/080_history.mjs +24 -2
- package/pipeline/100_chat.mjs +1 -1
package/index.mjs
CHANGED
|
@@ -11,11 +11,27 @@ const init = async (options = {}) => {
|
|
|
11
11
|
const info = bot.lines([
|
|
12
12
|
`[${bot.BOT} ${pkg.title}](${pkg.homepage})`, pkg.description
|
|
13
13
|
]);
|
|
14
|
-
// use google
|
|
15
|
-
options
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
14
|
+
// use google rerank
|
|
15
|
+
if (options?.googleCredentials && options.googleProjectId) {
|
|
16
|
+
opts = {
|
|
17
|
+
provider: 'GOOGLE', credentials: options.googleCredentials,
|
|
18
|
+
projectId: options.googleProjectId,
|
|
19
|
+
};
|
|
20
|
+
if (!_rerank) {
|
|
21
|
+
await rag.initReranker(opts);
|
|
22
|
+
_rerank = rag.rerank;
|
|
23
|
+
}
|
|
24
|
+
}
|
|
25
|
+
// use jina search and reranker
|
|
26
|
+
if (options.jinaApiKey) {
|
|
27
|
+
opts = { provider: 'JINA', apiKey: options.jinaApiKey };
|
|
28
|
+
await web.initSearch(opts);
|
|
29
|
+
if (!_rerank) {
|
|
30
|
+
await rag.initReranker(opts);
|
|
31
|
+
_rerank = rag.rerank;
|
|
32
|
+
}
|
|
33
|
+
}
|
|
34
|
+
// use openrouter's AI models
|
|
19
35
|
if (options.openrouterApiKey) {
|
|
20
36
|
opts = { provider: 'OPENROUTER', apiKey: options.openrouterApiKey };
|
|
21
37
|
await alan.init({
|
|
@@ -27,17 +43,8 @@ const init = async (options = {}) => {
|
|
|
27
43
|
await rag.initEmbedding(opts);
|
|
28
44
|
_embed = rag.embed;
|
|
29
45
|
}
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
if (options.googleApiKey) {
|
|
33
|
-
opts = { provider: 'GOOGLE', apiKey: options.googleApiKey };
|
|
34
|
-
await alan.init({
|
|
35
|
-
...opts, model: options.googleModel || '*',
|
|
36
|
-
priority: options.googlePriority, ...options,
|
|
37
|
-
});
|
|
38
|
-
}
|
|
39
|
-
// use openai's embedding, tts if openai is enabled, and google is not
|
|
40
|
-
if (options.openaiApiKey) {
|
|
46
|
+
// use openai models, embedding if openrouter is disabled
|
|
47
|
+
} else if (options.openaiApiKey && options.openaiModels) {
|
|
41
48
|
opts = { provider: 'OPENAI', apiKey: options.openaiApiKey };
|
|
42
49
|
await alan.init({
|
|
43
50
|
...opts, model: options.openaiModel || '*',
|
|
@@ -48,31 +55,15 @@ const init = async (options = {}) => {
|
|
|
48
55
|
_embed = rag.embed;
|
|
49
56
|
}
|
|
50
57
|
}
|
|
51
|
-
// use google
|
|
52
|
-
if (options
|
|
53
|
-
opts = {
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
await rag.initReranker(opts);
|
|
59
|
-
_rerank = rag.rerank;
|
|
60
|
-
}
|
|
61
|
-
}
|
|
62
|
-
// init other ai providers
|
|
63
|
-
options.siliconflowApiKey && await alan.init({
|
|
64
|
-
provider: 'SILICONFLOW', apiKey: options.siliconflowApiKey,
|
|
65
|
-
model: options.siliconflowModel || '*',
|
|
66
|
-
priority: options.siliconflowPriority, ...options,
|
|
67
|
-
});
|
|
68
|
-
if (options.jinaApiKey) {
|
|
69
|
-
opts = { provider: 'JINA', apiKey: options.jinaApiKey };
|
|
70
|
-
await web.initSearch(opts);
|
|
71
|
-
if (!_rerank) {
|
|
72
|
-
await rag.initReranker(opts);
|
|
73
|
-
_rerank = rag.rerank;
|
|
74
|
-
}
|
|
58
|
+
// use google's veo
|
|
59
|
+
if (options.googleApiKey) {
|
|
60
|
+
opts = { provider: 'GOOGLE', apiKey: options.googleApiKey };
|
|
61
|
+
await alan.init({
|
|
62
|
+
...opts, model: options.googleModel || '*',
|
|
63
|
+
priority: options.googlePriority, ...options,
|
|
64
|
+
});
|
|
75
65
|
}
|
|
66
|
+
// use ollama
|
|
76
67
|
if (options?.ollamaEnabled || options?.ollamaEndpoint) {
|
|
77
68
|
await alan.init({
|
|
78
69
|
provider: 'OLLAMA', model: options?.ollamaModel || '*',
|
package/lib/hal.mjs
CHANGED
|
@@ -20,6 +20,9 @@ const { __filename } = utilitas.__(import.meta.url);
|
|
|
20
20
|
const workdir = path.dirname(__filename);
|
|
21
21
|
const getPath = (subPath) => { return path.join(workdir, subPath || ''); };
|
|
22
22
|
const table = 'utilitas_hal_events';
|
|
23
|
+
const vectorIndex = `${table}_distilled_vector_index`;
|
|
24
|
+
const vectorIndexLists = 1000;
|
|
25
|
+
const vectorIndexProbes = 20;
|
|
23
26
|
const log = (c, o) => utilitas.log(c, import.meta.url, { time: 1, ...o || {} });
|
|
24
27
|
const [end] = [bot.end];
|
|
25
28
|
const uList = arr => bot.lines(arr.map(x => `- ${x}`));
|
|
@@ -65,6 +68,8 @@ const initSql = {
|
|
|
65
68
|
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4`), [table],
|
|
66
69
|
]],
|
|
67
70
|
[dbio.POSTGRESQL]: [[
|
|
71
|
+
'CREATE EXTENSION IF NOT EXISTS vchord CASCADE',
|
|
72
|
+
], [
|
|
68
73
|
dbio.cleanSql(`CREATE TABLE IF NOT EXISTS ${table} (
|
|
69
74
|
id SERIAL PRIMARY KEY,
|
|
70
75
|
bot_id BIGINT NOT NULL,
|
|
@@ -77,7 +82,7 @@ const initSql = {
|
|
|
77
82
|
response_text TEXT NOT NULL,
|
|
78
83
|
collected TEXT NOT NULL,
|
|
79
84
|
distilled TEXT NOT NULL,
|
|
80
|
-
distilled_vector
|
|
85
|
+
distilled_vector RABITQ8(768) NOT NULL,
|
|
81
86
|
token VARCHAR(255) NOT NULL DEFAULT '',
|
|
82
87
|
created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
|
83
88
|
updated_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP
|
|
@@ -102,8 +107,6 @@ const initSql = {
|
|
|
102
107
|
`CREATE INDEX IF NOT EXISTS ${table}_collected_index ON ${table} USING GIN(to_tsvector('english', collected))`,
|
|
103
108
|
], [
|
|
104
109
|
`CREATE INDEX IF NOT EXISTS ${table}_distilled_index ON ${table} USING GIN(to_tsvector('english', distilled))`,
|
|
105
|
-
], [
|
|
106
|
-
`CREATE INDEX IF NOT EXISTS ${table}_distilled_vector_index ON ${table} USING hnsw(distilled_vector vector_cosine_ops)`,
|
|
107
110
|
], [
|
|
108
111
|
`CREATE INDEX IF NOT EXISTS ${table}_token_index ON ${table} (token)`,
|
|
109
112
|
], [
|
|
@@ -115,6 +118,24 @@ const initSql = {
|
|
|
115
118
|
|
|
116
119
|
let _;
|
|
117
120
|
|
|
121
|
+
const ensurePostgresqlRagIndex = async (client) => {
|
|
122
|
+
const index = await client.query(
|
|
123
|
+
`SELECT indexdef FROM pg_indexes
|
|
124
|
+
WHERE tablename = $1 AND indexname = $2`,
|
|
125
|
+
[table, vectorIndex]
|
|
126
|
+
);
|
|
127
|
+
if (index?.length) { return; }
|
|
128
|
+
await client.query(`CREATE INDEX IF NOT EXISTS ${vectorIndex}
|
|
129
|
+
ON ${table} USING vchordrq (distilled_vector rabitq8_cosine_ops)
|
|
130
|
+
WITH (options = $$
|
|
131
|
+
[build.internal]
|
|
132
|
+
lists = [${vectorIndexLists}]
|
|
133
|
+
spherical_centroids = true
|
|
134
|
+
build_threads = 8
|
|
135
|
+
$$, probes = '${vectorIndexProbes}')`
|
|
136
|
+
);
|
|
137
|
+
};
|
|
138
|
+
|
|
118
139
|
const newCommand = (command, description) => ({
|
|
119
140
|
command: utilitas.ensureString(command, { case: 'SNAKE' }).slice(0, COMMAND_LENGTH),
|
|
120
141
|
description: utilitas.trim(description).slice(0, COMMAND_DESCRIPTION_LENGTH),
|
|
@@ -219,6 +240,9 @@ const init = async (options) => {
|
|
|
219
240
|
for (const act of initSql[_.storage?.provider]) {
|
|
220
241
|
dbResult.push(await _.storage.client.query(...act));
|
|
221
242
|
}
|
|
243
|
+
if (_.storage?.provider === dbio.POSTGRESQL) {
|
|
244
|
+
await ensurePostgresqlRagIndex(_.storage.client);
|
|
245
|
+
}
|
|
222
246
|
} catch (e) { console.error(e); }
|
|
223
247
|
}
|
|
224
248
|
}
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "halbot",
|
|
3
3
|
"description": "Just another AI powered Telegram bot, which is simple design, easy to use, extendable and fun.",
|
|
4
|
-
"version": "1995.1.58",
|
|
4
|
+
"version": "1995.1.61",
|
|
5
5
|
"private": false,
|
|
6
6
|
"homepage": "https://github.com/Leask/halbot",
|
|
7
7
|
"type": "module",
|
|
@@ -31,27 +31,27 @@
|
|
|
31
31
|
"dependencies": {
|
|
32
32
|
"@ffmpeg-installer/ffmpeg": "^1.1.0",
|
|
33
33
|
"@ffprobe-installer/ffprobe": "^2.1.2",
|
|
34
|
-
"@google-cloud/discoveryengine": "^2.
|
|
35
|
-
"@google/genai": "^1.
|
|
34
|
+
"@google-cloud/discoveryengine": "^2.6.0",
|
|
35
|
+
"@google/genai": "^1.51.0",
|
|
36
36
|
"@mozilla/readability": "^0.6.0",
|
|
37
37
|
"@resvg/resvg-js": "^2.6.2",
|
|
38
38
|
"fluent-ffmpeg": "^2.1.3",
|
|
39
39
|
"google-gax": "^5.0.6",
|
|
40
|
-
"ioredis": "^5.
|
|
41
|
-
"jsdom": "^
|
|
40
|
+
"ioredis": "^5.10.1",
|
|
41
|
+
"jsdom": "^29.1.1",
|
|
42
42
|
"lorem-ipsum": "^2.0.8",
|
|
43
43
|
"mime": "^4.1.0",
|
|
44
|
-
"mysql2": "^3.
|
|
44
|
+
"mysql2": "^3.22.3",
|
|
45
45
|
"office-text-extractor": "^4.0.0",
|
|
46
|
-
"openai": "^6.
|
|
46
|
+
"openai": "^6.35.0",
|
|
47
47
|
"parse-numeric-range": "^1.3.0",
|
|
48
|
-
"pg": "^8.
|
|
48
|
+
"pg": "^8.20.0",
|
|
49
49
|
"pgvector": "^0.2.1",
|
|
50
|
-
"satori": "^0.
|
|
50
|
+
"satori": "^0.26.0",
|
|
51
51
|
"telegraf": "^4.16.3",
|
|
52
52
|
"tellegram": "^1.1.18",
|
|
53
53
|
"tesseract.js": "^7.0.0",
|
|
54
|
-
"webjam": "^1995.3.
|
|
55
|
-
"youtube-transcript": "^1.
|
|
54
|
+
"webjam": "^1995.3.21",
|
|
55
|
+
"youtube-transcript": "^1.3.1"
|
|
56
56
|
}
|
|
57
57
|
}
|
package/pipeline/010_broca.mjs
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
|
-
import { bot, hal, uoid, utilitas } from '../index.mjs';
|
|
1
|
+
import { bot, hal, storage, uoid, utilitas } from '../index.mjs';
|
|
2
2
|
import { convert, paginate } from 'tellegram';
|
|
3
|
+
import { v4 as uuidv4 } from 'uuid';
|
|
3
4
|
|
|
4
5
|
const _name = 'Broca';
|
|
5
6
|
const [PRIVATE_LIMIT, GROUP_LIMIT] = [60 / 60, 60 / 20].map(x => x * 1000);
|
|
@@ -44,6 +45,24 @@ const getExtra = (ctx, options) => {
|
|
|
44
45
|
return resp;
|
|
45
46
|
};
|
|
46
47
|
|
|
48
|
+
const getMediaName = func => func.replace(/^replyWith/, '') || 'File';
|
|
49
|
+
|
|
50
|
+
const packInputFile = async (func, media) => {
|
|
51
|
+
if (media.filename || media.url || !Buffer.isBuffer(media.source)) {
|
|
52
|
+
return media;
|
|
53
|
+
}
|
|
54
|
+
const { extension } = await storage.getMime(media.source);
|
|
55
|
+
return { ...media, filename: `${getMediaName(func)}_${uuidv4()}.${extension}` };
|
|
56
|
+
};
|
|
57
|
+
|
|
58
|
+
const getMedia = async (func, src) => {
|
|
59
|
+
if (src?.source || src?.url) { return await packInputFile(func, src); }
|
|
60
|
+
if (Buffer.isBuffer(src)) {
|
|
61
|
+
return await packInputFile(func, { source: src });
|
|
62
|
+
}
|
|
63
|
+
return { [getKey(src)]: src };
|
|
64
|
+
};
|
|
65
|
+
|
|
47
66
|
const resp = async (ctx, text, extra) => {
|
|
48
67
|
// if (ctx._.type === 'inline_query') {
|
|
49
68
|
// return await ctx.answerInlineQuery([{}, {}]);
|
|
@@ -72,7 +91,7 @@ const resp = async (ctx, text, extra) => {
|
|
|
72
91
|
const replyWith = async (ctx, func, src, options) => ctx._.done.push(
|
|
73
92
|
await ctx[func](Array.isArray(src) ? src.map(x => ({
|
|
74
93
|
type: x.type || 'photo', media: { [getKey(x.src)]: x.src },
|
|
75
|
-
})) :
|
|
94
|
+
})) : await getMedia(func, src), getExtra(ctx, options))
|
|
76
95
|
);
|
|
77
96
|
|
|
78
97
|
const edit = async (ctx, lastMsgId, text, extra) => {
|
package/pipeline/080_history.mjs
CHANGED
|
@@ -3,12 +3,31 @@ import { bot, dbio, hal, utilitas } from '../index.mjs';
|
|
|
3
3
|
const [RELEVANCE, SEARCH_LIMIT, SUB_LIMIT] = [0.2, 10, 200]; // Google Rerank limit
|
|
4
4
|
const compact = (str, op) => utilitas.ensureString(str, { ...op || {}, compact: true });
|
|
5
5
|
const compactLimit = (str, op) => compact(str, { ...op || {}, limit: 140 });
|
|
6
|
+
const quote = key => `"${key}"`;
|
|
6
7
|
|
|
7
8
|
const packMessage = (messages) => messages.map(x => ({
|
|
8
9
|
message_id: x.message_id, score: x.score, created_at: x.created_at,
|
|
9
10
|
request: x.received_text, response: x.response_text,
|
|
10
11
|
}));
|
|
11
12
|
|
|
13
|
+
const upsertPostgresqlEvent = async event => {
|
|
14
|
+
event = {
|
|
15
|
+
...event,
|
|
16
|
+
distilled_vector: await dbio.encodeVector(event.distilled_vector),
|
|
17
|
+
};
|
|
18
|
+
const fields = Object.keys(event);
|
|
19
|
+
const values = fields.map(key => event[key]);
|
|
20
|
+
const placeholders = fields.map((key, i) => key === 'distilled_vector'
|
|
21
|
+
? `quantize_to_rabitq8($${i + 1}::vector)` : `$${i + 1}`);
|
|
22
|
+
return await hal._.storage?.client?.query?.(
|
|
23
|
+
`INSERT INTO ${hal.table} (${fields.map(quote).join(', ')}) `
|
|
24
|
+
+ `VALUES (${placeholders.join(', ')}) `
|
|
25
|
+
+ `ON CONFLICT ("id") DO UPDATE SET `
|
|
26
|
+
+ fields.map(key => `${quote(key)} = EXCLUDED.${quote(key)}`).join(', '),
|
|
27
|
+
values
|
|
28
|
+
);
|
|
29
|
+
};
|
|
30
|
+
|
|
12
31
|
const recall = async (sessionId, keyword, offset = 0, limit = SEARCH_LIMIT, options = {}) => {
|
|
13
32
|
assert(sessionId, 'Session ID is required.');
|
|
14
33
|
let [result, _limit, exclude] = [
|
|
@@ -35,13 +54,14 @@ const recall = async (sessionId, keyword, offset = 0, limit = SEARCH_LIMIT, opti
|
|
|
35
54
|
// globalThis.debug = 2;
|
|
36
55
|
const vector = await dbio.encodeVector(await hal._.embed(keyword));
|
|
37
56
|
result = await hal._.storage?.client?.query?.(
|
|
38
|
-
`SELECT *, (1 - (distilled_vector <=>
|
|
57
|
+
`SELECT *, (1 - (distilled_vector <=> `
|
|
58
|
+
+ `quantize_to_rabitq8($1::vector))) as relevance `
|
|
39
59
|
+ `FROM ${hal.table} WHERE bot_id = $2 AND chat_id = $3 `
|
|
40
60
|
+ `AND received_text != '' `
|
|
41
61
|
+ `AND received_text NOT LIKE '/%' `
|
|
42
62
|
+ `AND response_text != '' `
|
|
43
63
|
+ (exclude.length ? `AND message_id NOT IN (${exclude.join(',')}) ` : '')
|
|
44
|
-
+ `ORDER BY (distilled_vector <=> $1) ASC `
|
|
64
|
+
+ `ORDER BY (distilled_vector <=> quantize_to_rabitq8($1::vector)) ASC `
|
|
45
65
|
+ `LIMIT ${_limit} OFFSET $4`,
|
|
46
66
|
[vector, hal._.bot.botInfo.id, sessionId, offset]
|
|
47
67
|
);
|
|
@@ -127,6 +147,8 @@ const memorize = async (ctx) => {
|
|
|
127
147
|
case dbio.MYSQL:
|
|
128
148
|
event.distilled_vector = JSON.stringify(event.distilled_vector);
|
|
129
149
|
break;
|
|
150
|
+
case dbio.POSTGRESQL:
|
|
151
|
+
return await upsertPostgresqlEvent(event);
|
|
130
152
|
}
|
|
131
153
|
await hal._.storage?.client?.upsert?.(hal.table, event, { skipEcho: true });
|
|
132
154
|
}, hal.logOptions);
|
package/pipeline/100_chat.mjs
CHANGED