@yesvara/svara 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +497 -0
- package/dist/chunk-CIESM3BP.mjs +33 -0
- package/dist/chunk-FEA5KIJN.mjs +418 -0
- package/dist/cli/index.d.mts +1 -0
- package/dist/cli/index.d.ts +1 -0
- package/dist/cli/index.js +328 -0
- package/dist/cli/index.mjs +39 -0
- package/dist/dev-OYGXXK2B.mjs +69 -0
- package/dist/index.d.mts +967 -0
- package/dist/index.d.ts +967 -0
- package/dist/index.js +1976 -0
- package/dist/index.mjs +1502 -0
- package/dist/new-7K4NIDZO.mjs +177 -0
- package/dist/retriever-4QY667XF.mjs +7 -0
- package/examples/01-basic/index.ts +26 -0
- package/examples/02-with-tools/index.ts +73 -0
- package/examples/03-rag-knowledge/index.ts +41 -0
- package/examples/04-multi-channel/index.ts +91 -0
- package/package.json +74 -0
- package/src/app/index.ts +176 -0
- package/src/channels/telegram.ts +122 -0
- package/src/channels/web.ts +118 -0
- package/src/channels/whatsapp.ts +161 -0
- package/src/cli/commands/dev.ts +87 -0
- package/src/cli/commands/new.ts +213 -0
- package/src/cli/index.ts +78 -0
- package/src/core/agent.ts +607 -0
- package/src/core/llm.ts +406 -0
- package/src/core/types.ts +183 -0
- package/src/database/schema.ts +79 -0
- package/src/database/sqlite.ts +239 -0
- package/src/index.ts +94 -0
- package/src/memory/context.ts +49 -0
- package/src/memory/conversation.ts +51 -0
- package/src/rag/chunker.ts +165 -0
- package/src/rag/loader.ts +216 -0
- package/src/rag/retriever.ts +248 -0
- package/src/tools/executor.ts +54 -0
- package/src/tools/index.ts +89 -0
- package/src/tools/registry.ts +44 -0
- package/src/types.ts +131 -0
- package/tsconfig.json +26 -0
|
@@ -0,0 +1,239 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @module database/sqlite
|
|
3
|
+
* SvaraJS — SQLite adapter
|
|
4
|
+
*
|
|
5
|
+
* A clean, ergonomic wrapper around better-sqlite3.
|
|
6
|
+
* Provides typed query helpers, migrations, and a KV store.
|
|
7
|
+
* Used internally by SvaraJS and optionally exposed to users.
|
|
8
|
+
*
|
|
9
|
+
* @example
|
|
10
|
+
* const db = new SvaraDB('./data/agent.db');
|
|
11
|
+
*
|
|
12
|
+
* // Typed queries
|
|
13
|
+
* const users = db.query<{ id: string; name: string }>(
|
|
14
|
+
* 'SELECT id, name FROM users WHERE active = ?', [1]
|
|
15
|
+
* );
|
|
16
|
+
*
|
|
17
|
+
* // KV store
|
|
18
|
+
* db.kv.set('onboarding:done', true);
|
|
19
|
+
* const done = db.kv.get<boolean>('onboarding:done');
|
|
20
|
+
*
|
|
21
|
+
* // Custom tables
|
|
22
|
+
* db.exec(`CREATE TABLE IF NOT EXISTS orders (id TEXT PRIMARY KEY, ...)`);
|
|
23
|
+
*/
|
|
24
|
+
|
|
25
|
+
import path from 'path';
|
|
26
|
+
import fs from 'fs';
|
|
27
|
+
import { CREATE_TABLES_SQL, INSERT_META_SQL, SCHEMA_VERSION } from './schema.js';
|
|
28
|
+
|
|
29
|
+
/**
 * Structural view of a prepared statement, limited to the members this
 * module calls on the object returned by `Database.prepare`.
 */
interface Statement {
  /** Executes the statement with the given bound values and reports write metadata. */
  run: (...args: unknown[]) => { lastInsertRowid: bigint | number; changes: number };
  /** Executes the statement and yields a single result (untyped; callers cast it). */
  get: (...args: unknown[]) => unknown;
  /** Executes the statement and yields every result as an array (untyped; callers cast it). */
  all: (...args: unknown[]) => unknown[];
}

/**
 * Structural view of the database handle, limited to the members this module
 * uses. Declared locally so the module type-checks without importing the
 * driver's own typings.
 */
interface Database {
  /** Compiles a SQL string into a reusable statement. */
  prepare: (sql: string) => Statement;
  /** Runs raw SQL without returning rows. */
  exec: (sql: string) => void;
  /** Releases the underlying connection. */
  close: () => void;
  /** Issues a PRAGMA; the result is untyped. */
  pragma: (pragma: string, options?: { simple?: boolean }) => unknown;
  /** Wraps `fn` and returns a function that, when invoked, yields `fn`'s result. */
  transaction: <T>(fn: () => T) => () => T;
}
|
|
42
|
+
|
|
43
|
+
// ─── KV Store ─────────────────────────────────────────────────────────────────
|
|
44
|
+
|
|
45
|
+
/**
 * JSON key-value store persisted in the `svara_kv` table.
 *
 * Values are serialised with `JSON.stringify` on write and revived with
 * `JSON.parse` on read. Rows whose `expires_at` is in the past are ignored by
 * every read method; this class never deletes them on its own.
 */
class KVStore {
  constructor(private db: Database) {}

  /**
   * Set a key-value pair, with optional TTL in seconds.
   *
   * A falsy `ttlSeconds` (omitted or `0`) stores the entry without an expiry.
   * An existing row for `key` is replaced.
   */
  set<T>(key: string, value: T, ttlSeconds?: number): void {
    const expiresAt = ttlSeconds ? Math.floor(Date.now() / 1000) + ttlSeconds : null;
    this.db.prepare(`
      INSERT OR REPLACE INTO svara_kv (key, value, expires_at, updated_at)
      VALUES (?, ?, ?, unixepoch())
    `).run(key, JSON.stringify(value), expiresAt);
  }

  /**
   * Get a value by key. Returns undefined if not found or expired.
   *
   * The `T` type parameter is an unchecked cast of the parsed JSON.
   */
  get<T = unknown>(key: string): T | undefined {
    const row = this.db.prepare(`
      SELECT value, expires_at FROM svara_kv
      WHERE key = ? AND (expires_at IS NULL OR expires_at > unixepoch())
    `).get(key) as { value: string; expires_at: number | null } | undefined;

    if (!row) return undefined;
    return JSON.parse(row.value) as T;
  }

  /** Delete a key. Deleting a key that does not exist is not an error. */
  delete(key: string): void {
    this.db.prepare('DELETE FROM svara_kv WHERE key = ?').run(key);
  }

  /** Check if a key exists and is not expired. */
  has(key: string): boolean {
    return this.get(key) !== undefined;
  }

  /**
   * Get all non-expired keys that start with `prefix` (every key when empty).
   *
   * The prefix is compared literally and case-sensitively. A `LIKE 'prefix%'`
   * pattern is deliberately avoided: `%` and `_` inside the prefix would act as
   * wildcards, and SQLite's LIKE ignores ASCII case by default, so
   * `keys('a_b')` would also return keys such as `aXb:1` or `A_B:1`.
   */
  keys(prefix = ''): string[] {
    const rows = this.db.prepare(`
      SELECT key FROM svara_kv
      WHERE substr(key, 1, length(?)) = ? AND (expires_at IS NULL OR expires_at > unixepoch())
    `).all(prefix, prefix) as Array<{ key: string }>;
    return rows.map((r) => r.key);
  }
}
|
|
87
|
+
|
|
88
|
+
// ─── SvaraDB ──────────────────────────────────────────────────────────────────
|
|
89
|
+
|
|
90
|
+
/**
 * SQLite-backed storage for SvaraJS, wrapping a `better-sqlite3` connection.
 *
 * Construction opens the database, applies connection pragmas, creates the
 * framework tables and exposes a JSON key-value store as `kv`.
 */
export class SvaraDB {
  private db: Database;
  readonly kv: KVStore;

  /**
   * @param dbPath File path of the database. Defaults to `':memory:'`. For any
   *   other value the parent directory is created first.
   * @throws Error if `better-sqlite3` cannot be loaded; any error raised while
   *   opening or initialising the database is propagated unchanged.
   */
  constructor(dbPath = ':memory:') {
    // Ensure the directory exists
    if (dbPath !== ':memory:') {
      fs.mkdirSync(path.dirname(path.resolve(dbPath)), { recursive: true });
    }

    this.db = this.openDatabase(dbPath);
    try {
      this.configure();
      this.migrate();
    } catch (err) {
      // Do not leak the open handle when initialisation fails.
      this.db.close();
      throw err;
    }
    this.kv = new KVStore(this.db);
  }

  // ─── Query Helpers ────────────────────────────────────────────────────────

  /**
   * Run a SELECT and return all matching rows.
   *
   * `T` is an unchecked cast of the rows; `params` are bound positionally.
   */
  query<T = Record<string, unknown>>(sql: string, params: unknown[] = []): T[] {
    return this.db.prepare(sql).all(...params) as T[];
  }

  /**
   * Run a SELECT and return the first matching row, or undefined when there is none.
   */
  queryOne<T = Record<string, unknown>>(sql: string, params: unknown[] = []): T | undefined {
    return this.db.prepare(sql).get(...params) as T | undefined;
  }

  /**
   * Run an INSERT/UPDATE/DELETE. Returns affected row count.
   */
  run(sql: string, params: unknown[] = []): number {
    return this.db.prepare(sql).run(...params).changes;
  }

  /**
   * Execute raw SQL (for DDL, migrations, etc.).
   */
  exec(sql: string): void {
    this.db.exec(sql);
  }

  /**
   * Run multiple operations in a single transaction and return `fn`'s result.
   *
   * @example
   * db.transaction(() => {
   *   db.run('INSERT INTO orders ...', [...]);
   *   db.run('UPDATE inventory ...', [...]);
   * });
   */
  transaction<T>(fn: () => T): T {
    return this.db.transaction(fn)();
  }

  /**
   * Close the database connection.
   */
  close(): void {
    this.db.close();
  }

  // ─── Internal Message Storage ─────────────────────────────────────────────

  /**
   * Insert a message into `svara_messages`, replacing any row with the same `id`.
   * A missing `toolCallId` is stored as NULL.
   */
  saveMessage(params: {
    id: string;
    sessionId: string;
    role: string;
    content: string;
    toolCallId?: string;
  }): void {
    this.db.prepare(`
      INSERT OR REPLACE INTO svara_messages (id, session_id, role, content, tool_call_id)
      VALUES (?, ?, ?, ?, ?)
    `).run(
      params.id,
      params.sessionId,
      params.role,
      params.content,
      params.toolCallId ?? null
    );
  }

  /**
   * Read messages of a session ordered by `created_at` ascending.
   *
   * NOTE(review): `LIMIT` is applied to the ascending order, so once a session
   * holds more than `limit` rows this returns the OLDEST `limit` messages, not
   * the most recent ones — confirm that is what callers expect.
   */
  getMessages(sessionId: string, limit = 50): Array<{
    id: string;
    role: string;
    content: string;
    tool_call_id: string | null;
    created_at: number;
  }> {
    return this.db.prepare(`
      SELECT id, role, content, tool_call_id, created_at
      FROM svara_messages
      WHERE session_id = ?
      ORDER BY created_at ASC
      LIMIT ?
    `).all(sessionId, limit) as ReturnType<SvaraDB['getMessages']>;
  }

  /** Delete every stored message of the given session. */
  clearSession(sessionId: string): void {
    this.db.prepare('DELETE FROM svara_messages WHERE session_id = ?').run(sessionId);
  }

  // ─── Private Setup ────────────────────────────────────────────────────────

  /**
   * Load the `better-sqlite3` driver lazily and open `dbPath`.
   *
   * Only a failure to LOAD the driver is reported as the "package required"
   * error (with the underlying reason appended). Failures while opening the
   * database propagate as the driver's own error instead of being mislabelled
   * as a missing dependency.
   */
  private openDatabase(dbPath: string): Database {
    let Driver: new (path: string) => Database;
    try {
      // eslint-disable-next-line @typescript-eslint/no-require-imports
      Driver = require('better-sqlite3') as new (path: string) => Database;
    } catch (err) {
      const detail = err instanceof Error ? err.message : String(err);
      throw new Error(
        '[SvaraJS] Database requires the "better-sqlite3" package.\n' +
        'Run: npm install better-sqlite3\n' +
        `Cause: ${detail}`
      );
    }
    return new Driver(dbPath);
  }

  /** Apply connection-level pragmas. */
  private configure(): void {
    // WAL mode = faster writes, better concurrency
    this.db.pragma('journal_mode = WAL');
    this.db.pragma('synchronous = NORMAL');
    this.db.pragma('foreign_keys = ON');
  }

  /**
   * Create the framework tables and record the schema version on first run.
   *
   * When a `schema_version` row already exists nothing else happens: no
   * version comparison or upgrade step is performed here.
   */
  private migrate(): void {
    this.db.exec(CREATE_TABLES_SQL);

    const meta = this.db.prepare(
      "SELECT value FROM svara_meta WHERE key = 'schema_version'"
    ).get() as { value: string } | undefined;

    if (!meta) {
      this.db.prepare(INSERT_META_SQL).run(
        String(SCHEMA_VERSION),
        new Date().toISOString()
      );
    }
  }
}
|
package/src/index.ts
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @yesvara/svara — Agentic AI Backend Framework
|
|
3
|
+
*
|
|
4
|
+
* Build production-ready AI agents in minutes, not months.
|
|
5
|
+
*
|
|
6
|
+
* @example The 15-line agent
|
|
7
|
+
* ```ts
|
|
8
|
+
* import { SvaraApp, SvaraAgent } from '@yesvara/svara';
|
|
9
|
+
*
|
|
10
|
+
* const app = new SvaraApp();
|
|
11
|
+
*
|
|
12
|
+
* const agent = new SvaraAgent({
|
|
13
|
+
* name: 'Support Bot',
|
|
14
|
+
* model: 'gpt-4o-mini',
|
|
15
|
+
* knowledge: './docs',
|
|
16
|
+
* });
|
|
17
|
+
*
|
|
18
|
+
* app.route('/chat', agent.handler());
|
|
19
|
+
* app.listen(3000);
|
|
20
|
+
* ```
|
|
21
|
+
*
|
|
22
|
+
* @example With tools and channels
|
|
23
|
+
* ```ts
|
|
24
|
+
* import { SvaraAgent, createTool } from '@yesvara/svara';
|
|
25
|
+
*
|
|
26
|
+
* const agent = new SvaraAgent({ name: 'Aria', model: 'gpt-4o' });
|
|
27
|
+
*
|
|
28
|
+
* agent
|
|
29
|
+
* .addTool(createTool({
|
|
30
|
+
* name: 'get_time',
|
|
31
|
+
* description: 'Get current date and time',
|
|
32
|
+
* parameters: {},
|
|
33
|
+
* async run() { return { time: new Date().toISOString() }; },
|
|
34
|
+
* }))
|
|
35
|
+
* .connectChannel('telegram', { token: process.env.TG_TOKEN! })
|
|
36
|
+
* .connectChannel('whatsapp', {
|
|
37
|
+
* token: process.env.WA_TOKEN!,
|
|
38
|
+
* phoneId: process.env.WA_PHONE_ID!,
|
|
39
|
+
* verifyToken: process.env.WA_VERIFY_TOKEN!,
|
|
40
|
+
* });
|
|
41
|
+
*
|
|
42
|
+
* await agent.start();
|
|
43
|
+
* ```
|
|
44
|
+
*/
|
|
45
|
+
|
|
46
|
+
// ─── Framework Classes ────────────────────────────────────────────────────────
|
|
47
|
+
|
|
48
|
+
export { SvaraApp } from './app/index.js';
|
|
49
|
+
export { SvaraAgent } from './core/agent.js';
|
|
50
|
+
|
|
51
|
+
// ─── Tool Helpers ─────────────────────────────────────────────────────────────
|
|
52
|
+
|
|
53
|
+
export { createTool } from './tools/index.js';
|
|
54
|
+
|
|
55
|
+
// ─── Database ─────────────────────────────────────────────────────────────────
|
|
56
|
+
|
|
57
|
+
export { SvaraDB } from './database/sqlite.js';
|
|
58
|
+
|
|
59
|
+
// ─── Advanced: Direct Channel Classes ────────────────────────────────────────
|
|
60
|
+
// Most users won't need these — use agent.connectChannel() instead.
|
|
61
|
+
|
|
62
|
+
export { WebChannel } from './channels/web.js';
|
|
63
|
+
export { TelegramChannel } from './channels/telegram.js';
|
|
64
|
+
export { WhatsAppChannel } from './channels/whatsapp.js';
|
|
65
|
+
|
|
66
|
+
// ─── Advanced: RAG Components ─────────────────────────────────────────────────
|
|
67
|
+
// For building custom knowledge pipeline integrations.
|
|
68
|
+
|
|
69
|
+
export { DocumentLoader } from './rag/loader.js';
|
|
70
|
+
export { Chunker } from './rag/chunker.js';
|
|
71
|
+
export { VectorRetriever } from './rag/retriever.js';
|
|
72
|
+
|
|
73
|
+
// ─── Public Types ─────────────────────────────────────────────────────────────
|
|
74
|
+
|
|
75
|
+
export type {
|
|
76
|
+
// The main types you'll use every day
|
|
77
|
+
AgentConfig,
|
|
78
|
+
Tool,
|
|
79
|
+
ToolParameter,
|
|
80
|
+
AgentContext,
|
|
81
|
+
ProcessResult,
|
|
82
|
+
MemoryOptions,
|
|
83
|
+
AppOptions,
|
|
84
|
+
ChannelName,
|
|
85
|
+
} from './types.js';
|
|
86
|
+
|
|
87
|
+
// Channel-specific configs (for advanced usage)
|
|
88
|
+
export type { WebChannelConfig } from './channels/web.js';
|
|
89
|
+
export type { TelegramChannelConfig } from './channels/telegram.js';
|
|
90
|
+
export type { WhatsAppChannelConfig } from './channels/whatsapp.js';
|
|
91
|
+
|
|
92
|
+
// ─── Version ──────────────────────────────────────────────────────────────────
|
|
93
|
+
|
|
94
|
+
export const VERSION = '0.1.0';
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @internal
|
|
3
|
+
* Builds the messages array sent to the LLM on each call.
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
import type { LLMMessage } from '../core/types.js';
|
|
7
|
+
import type { LLMAdapter } from '../core/llm.js';
|
|
8
|
+
|
|
9
|
+
/**
 * Assembles the ordered message list for a single LLM call: one system
 * prompt, the prior non-system turns, then the new user turn.
 */
export class ContextBuilder {
  constructor(private llm: LLMAdapter) {}

  /**
   * Build the messages for one request.
   *
   * @param systemPrompt Content of the single leading system message.
   * @param history Earlier turns; entries with role `system` are dropped
   *   because the prompt above replaces them.
   * @param userMessage The new user input.
   * @param ragContext Retrieved context; when truthy, the user input is
   *   wrapped in a context-grounded question template.
   * @returns A new array; `history` is not modified.
   */
  buildMessages(
    systemPrompt: string,
    history: LLMMessage[],
    userMessage: string,
    ragContext?: string
  ): LLMMessage[] {
    const priorTurns = history.filter((entry) => entry.role !== 'system');

    let finalContent = userMessage;
    if (ragContext) {
      finalContent = this.augmentWithRAG(userMessage, ragContext);
    }

    return [
      { role: 'system', content: systemPrompt },
      ...priorTurns,
      { role: 'user', content: finalContent },
    ];
  }

  /**
   * Estimate the token count of `messages` by joining their contents with a
   * single space and delegating to the adapter's `countTokens`.
   */
  estimateTokens(messages: LLMMessage[]): number {
    const joined = messages.map(({ content }) => content).join(' ');
    return this.llm.countTokens(joined);
  }

  /** Wrap the question in the retrieval template, context first. */
  private augmentWithRAG(message: string, context: string): string {
    const instructions =
      'Use the following context to answer the question.\n' +
      "If the answer isn't in the context, say so honestly — don't guess.\n";

    return `${instructions}\n--- Context ---\n${context}\n--- End Context ---\n\nQuestion: ${message}`;
  }
}
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @internal
|
|
3
|
+
* Per-session conversation history store.
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
import type { LLMMessage, SessionStore } from '../core/types.js';
|
|
7
|
+
|
|
8
|
+
/** Settings consumed by `ConversationMemory`. */
export interface MemoryConfig {
  /** `'none'` turns `ConversationMemory.append` into a no-op; `'conversation'` stores history. */
  type: 'none' | 'conversation';
  /** Stored history longer than this is trimmed on append. */
  maxMessages: number;
}
|
|
12
|
+
|
|
13
|
+
/**
 * In-process conversation history, keyed by session id.
 *
 * History lives in a `Map` held by this instance; nothing here persists it.
 */
export class ConversationMemory {
  private sessions: Map<string, SessionStore> = new Map();

  constructor(private config: MemoryConfig) {}

  /**
   * Return the stored messages of a session (empty when unknown).
   *
   * The result is a shallow copy, so callers that push to or splice the
   * returned array cannot corrupt the stored history.
   */
  async getHistory(sessionId: string): Promise<LLMMessage[]> {
    const stored = this.sessions.get(sessionId)?.messages;
    return stored ? [...stored] : [];
  }

  /**
   * Append messages to a session, creating it on first use, then trim.
   *
   * Does nothing when the memory type is `'none'`. When the stored total
   * exceeds `maxMessages`, every system message is kept and only the most
   * recent `maxMessages` non-system messages are retained.
   */
  async append(sessionId: string, messages: LLMMessage[]): Promise<void> {
    if (this.config.type === 'none') return;

    const store = this.sessions.get(sessionId) ?? {
      messages: [],
      createdAt: new Date(),
      updatedAt: new Date(),
    };

    store.messages.push(...messages);
    store.updatedAt = new Date();

    // Trim to window — always keep system messages
    const { maxMessages } = this.config;
    if (store.messages.length > maxMessages) {
      const system = store.messages.filter((m) => m.role === 'system');
      const rest = store.messages.filter((m) => m.role !== 'system');
      // `slice(-0)` is `slice(0)` and would keep everything, so a window of
      // zero (or less) must be handled explicitly.
      const recent = maxMessages > 0 ? rest.slice(-maxMessages) : [];
      store.messages = [...system, ...recent];
    }

    this.sessions.set(sessionId, store);
  }

  /** Forget everything stored for a session. */
  async clear(sessionId: string): Promise<void> {
    this.sessions.delete(sessionId);
  }

  /** Ids of all sessions that currently have stored history. */
  getSessionIds(): string[] {
    return [...this.sessions.keys()];
  }
}
|
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @module rag/chunker
|
|
3
|
+
* SvaraJS — Document chunking strategies
|
|
4
|
+
*
|
|
5
|
+
* Breaks documents into retrieval-optimized chunks.
|
|
6
|
+
* Strategy selection matters: fixed for code/data, sentence for prose, paragraph for docs.
|
|
7
|
+
*
|
|
8
|
+
* @example
|
|
9
|
+
* const chunker = new Chunker({ strategy: 'sentence', size: 512, overlap: 50 });
|
|
10
|
+
* const chunks = chunker.chunk(document);
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
import crypto from 'crypto';
|
|
14
|
+
import type { Document, DocumentChunk } from '../core/types.js';
|
|
15
|
+
|
|
16
|
+
/** Tuning knobs accepted by the `Chunker` constructor; every field is optional. */
export interface ChunkOptions {
  /** Splitting strategy. Default: `'sentence'`. */
  strategy?: 'fixed' | 'sentence' | 'paragraph';
  /** Target chunk size in characters (not tokens). Default: 2000. */
  size?: number;
  /** Overlap between consecutive chunks, in characters. Default: 200. */
  overlap?: number;
}
|
|
21
|
+
|
|
22
|
+
// ─── Chunker ──────────────────────────────────────────────────────────────────
|
|
23
|
+
|
|
24
|
+
/**
 * Splits documents into chunks using a fixed-window, sentence or paragraph
 * strategy. Sizes and overlaps are measured in characters.
 */
export class Chunker {
  private options: Required<ChunkOptions>;

  /** @param options Missing fields default to `sentence` / 2000 / 200. */
  constructor(options: ChunkOptions = {}) {
    this.options = {
      strategy: options.strategy ?? 'sentence',
      size: options.size ?? 2000,
      overlap: options.overlap ?? 200,
    };
  }

  /**
   * Split a document into overlapping chunks.
   *
   * @returns The chunks in document order (empty when the document content is
   *   blank). Each chunk carries the document's metadata plus `chunkIndex`,
   *   `strategy` and `charCount`, the length of the trimmed chunk content.
   * @throws RangeError with the `fixed` strategy when `overlap >= size`.
   */
  chunk(document: Document): DocumentChunk[] {
    const text = document.content.trim();
    if (!text) return [];

    let texts: string[];

    switch (this.options.strategy) {
      case 'fixed':
        texts = this.fixedChunk(text);
        break;
      case 'paragraph':
        texts = this.paragraphChunk(text);
        break;
      case 'sentence':
      default:
        texts = this.sentenceChunk(text);
        break;
    }

    return texts
      // Trim first so `content` and `charCount` describe the same string.
      .map((t) => t.trim())
      .filter((t) => t.length > 0)
      .map((content, index) => ({
        id: this.chunkId(document.id, index),
        documentId: document.id,
        content,
        index,
        metadata: {
          ...document.metadata,
          chunkIndex: index,
          strategy: this.options.strategy,
          charCount: content.length,
        },
      }));
  }

  /**
   * Chunk multiple documents at once; results are concatenated in input order.
   */
  chunkMany(documents: Document[]): DocumentChunk[] {
    return documents.flatMap((doc) => this.chunk(doc));
  }

  // ─── Strategies ───────────────────────────────────────────────────────────

  /**
   * Split into fixed-size windows with overlap. Good for code and structured data.
   *
   * The window advances by `size - overlap`. A non-positive step could never
   * reach the end of the text, so it is rejected instead of looping forever.
   */
  private fixedChunk(text: string): string[] {
    const { size, overlap } = this.options;
    const step = size - overlap;
    if (step <= 0) {
      throw new RangeError(
        `[SvaraJS] Chunker: "overlap" (${overlap}) must be smaller than "size" (${size}) for the "fixed" strategy.`
      );
    }

    const chunks: string[] = [];
    for (let start = 0; start < text.length; start += step) {
      const end = Math.min(start + size, text.length);
      chunks.push(text.slice(start, end));
      // Once a window reaches the end, any further window would lie entirely
      // inside this one — stop instead of emitting a redundant tail chunk.
      if (end >= text.length) break;
    }

    return chunks;
  }

  /**
   * Split by sentences, grouping them until size limit.
   * Best for prose text — preserves natural reading units.
   */
  private sentenceChunk(text: string): string[] {
    const sentences = this.splitSentences(text);
    return this.groupBySize(sentences);
  }

  /**
   * Split by paragraphs (double newline), grouping small ones.
   * Best for documentation, articles, and manuals.
   */
  private paragraphChunk(text: string): string[] {
    const paragraphs = text
      .split(/\n{2,}/)
      .map((p) => p.trim())
      .filter(Boolean);

    return this.groupBySize(paragraphs);
  }

  // ─── Helpers ──────────────────────────────────────────────────────────────

  /**
   * Split on `.`, `!` or `?` followed by whitespace and then an uppercase
   * ASCII letter, a quote or `(`.
   *
   * This is a heuristic: an abbreviation followed by a capitalised word
   * (e.g. "Dr. Smith") is split as well, while "U.S.A." stays together
   * because no whitespace follows its inner periods.
   */
  private splitSentences(text: string): string[] {
    return text
      .split(/(?<=[.!?])\s+(?=[A-Z"'(])/)
      .map((s) => s.trim())
      .filter(Boolean);
  }

  /**
   * Greedily join units with single spaces until adding the next unit would
   * exceed `size`; each following chunk starts with the last `overlap`
   * characters of the chunk before it.
   *
   * A single unit longer than `size` is not split and yields an oversized chunk.
   */
  private groupBySize(units: string[]): string[] {
    const { size, overlap } = this.options;
    const chunks: string[] = [];
    let current = '';

    for (const unit of units) {
      if (current.length > 0 && current.length + unit.length + 1 > size) {
        chunks.push(current);
        // `slice(-0)` returns the WHOLE string, so with no overlap requested
        // the carry must be empty; otherwise every chunk would repeat the
        // entire previous chunk.
        const carry = overlap > 0 ? current.slice(-overlap) : '';
        current = carry ? `${carry} ${unit}` : unit;
      } else {
        current += (current ? ' ' : '') + unit;
      }
    }

    if (current.trim()) chunks.push(current);
    return chunks;
  }

  /** Deterministic chunk id: hex MD5 of `<documentId>:<index>`. */
  private chunkId(documentId: string, index: number): string {
    return crypto
      .createHash('md5')
      .update(`${documentId}:${index}`)
      .digest('hex');
  }
}
|