@rws-framework/ai-tools 2.2.0 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,319 @@
1
/**
 * Text chunking utility following the LangChain tutorial approach.
 *
 * Text is split recursively on a list of separators (paragraphs first, single
 * characters last), the resulting pieces are merged back together up to a
 * character budget, and finally a short tail of the previous chunk is
 * prepended to every following chunk as overlap.
 *
 * Guarantees for chunks produced by the splitting path of `chunkText`:
 *  - every chunk, including its overlap prefix, is at most
 *    `floor(maxTokens * 3.5)` characters long (minimum budget: 1 character);
 *  - unbreakable text is cut by position, never re-joined with extra spaces;
 *  - the call terminates for any `maxTokens` / `overlap` values.
 */
export class TextChunker {
    /** Average characters per token used by `estimateTokens`. */
    private static readonly ESTIMATE_CHARS_PER_TOKEN = 3.7;

    /** More conservative ratio used to turn a token limit into a character budget. */
    private static readonly BUDGET_CHARS_PER_TOKEN = 3.5;

    /**
     * Default separators following LangChain's RecursiveCharacterTextSplitter,
     * ordered by preference for breaking text.
     */
    private static readonly DEFAULT_SEPARATORS = [
        '\n\n', // Double newlines (paragraphs)
        '\n', // Single newlines
        '. ', // Sentence endings
        '! ', // Exclamation sentence endings
        '? ', // Question sentence endings
        '; ', // Semicolons
        ', ', // Commas
        ' ', // Spaces (words)
        '' // Character level (fallback)
    ];

    /** Separators that are re-attached to the text preceding them after a split. */
    private static readonly PRESERVED_SEPARATORS = ['. ', '! ', '? ', '; '];

    /**
     * Conservative token estimation based on character count
     * (~3.7 characters per token on average).
     *
     * @param text - Text to measure
     * @returns Estimated token count, rounded up
     */
    static estimateTokens(text: string): number {
        return Math.ceil(text.length / TextChunker.ESTIMATE_CHARS_PER_TOKEN);
    }

    /**
     * Chunk text using the RecursiveCharacterTextSplitter approach.
     *
     * @param text - The text to chunk
     * @param maxTokens - Maximum estimated tokens per chunk (default: 450 to be safe under 512)
     * @param overlap - Number of characters to overlap between chunks; values
     *   that do not fit into the chunk budget are reduced automatically
     * @param separators - Custom separators (uses defaults if not provided)
     * @returns Array of trimmed text chunks; empty for empty/whitespace-only input
     */
    static chunkText(
        text: string,
        maxTokens: number = 450,
        overlap: number = 50,
        separators: string[] = TextChunker.DEFAULT_SEPARATORS
    ): string[] {
        if (!text || text.trim().length === 0) {
            return [];
        }

        // If text is already within limits, return as-is
        const estimatedTokens = this.estimateTokens(text);
        if (estimatedTokens <= maxTokens) {
            return [text.trim()];
        }

        console.log(`[TextChunker] Chunking text: ${text.length} chars, estimated ${estimatedTokens} tokens, max ${maxTokens} tokens per chunk`);

        // Convert token limit to character limit (conservative)
        const maxCharsPerChunk = Math.floor(maxTokens * TextChunker.BUDGET_CHARS_PER_TOKEN);

        return this.recursiveSplit(text, maxCharsPerChunk, overlap, separators);
    }

    /**
     * Split `text` into chunks of at most `maxChars` characters, overlap included.
     *
     * The overlap is applied exactly once, after all splitting and merging is
     * done, and room for it is reserved up front so the final chunks stay
     * within `maxChars`.
     */
    private static recursiveSplit(
        text: string,
        maxChars: number,
        overlap: number,
        separators: string[]
    ): string[] {
        // Non-positive or NaN budgets would make splitting impossible; fall back to 1.
        const limit = maxChars >= 1 ? Math.floor(maxChars) : 1;

        // The overlap may use at most half of the budget (minus the joining space),
        // so each chunk always has room for new content.
        const requestedOverlap = overlap > 0 ? Math.floor(overlap) : 0;
        const safeOverlap = Math.min(requestedOverlap, Math.floor((limit - 1) / 2));

        // Reserve space for "<overlap> <chunk>" when an overlap will be added.
        const bodyLimit = safeOverlap > 0 ? limit - safeOverlap - 1 : limit;

        const chunks = this.splitAndMerge(text, bodyLimit, separators);
        return this.addOverlapsBetweenChunks(chunks, safeOverlap);
    }

    /**
     * Recursive core: split by the first separator, recurse into pieces that
     * are still too large, then merge neighbouring pieces up to `maxChars`.
     * No overlap is added here.
     */
    private static splitAndMerge(text: string, maxChars: number, separators: string[]): string[] {
        const [separator, ...remainingSeparators] = separators;

        if (!separator) {
            // No separators left, or the '' character-level fallback was reached:
            // cut by position. Splitting into single characters and merging them
            // again would insert a space between every character.
            return this.forceSplitByCharacterLimit(text, maxChars, 0);
        }

        const pieces: string[] = [];
        for (const split of this.splitTextBySeparator(text, separator)) {
            if (split.length <= maxChars) {
                pieces.push(split);
                continue;
            }
            for (const piece of this.splitAndMerge(split, maxChars, remainingSeparators)) {
                pieces.push(piece);
            }
        }

        return this.mergeChunks(pieces, maxChars);
    }

    /**
     * Split text by a non-empty separator. Sentence-like separators are
     * re-attached to the text preceding them; all pieces are trimmed and
     * empty pieces are dropped.
     */
    private static splitTextBySeparator(text: string, separator: string): string[] {
        if (separator === '' || !text.includes(separator)) {
            return [text];
        }

        const keepSeparator = this.shouldPreserveSeparator(separator);
        const parts = text.split(separator);
        const lastIndex = parts.length - 1;
        const result: string[] = [];

        parts.forEach((part, index) => {
            // The last part has no separator after it.
            const candidate = (index < lastIndex && keepSeparator ? part + separator : part).trim();
            if (candidate) {
                result.push(candidate);
            }
        });

        return result;
    }

    /**
     * Determine if separator should be preserved in the text
     * (sentence endings and meaningful punctuation).
     */
    private static shouldPreserveSeparator(separator: string): boolean {
        return TextChunker.PRESERVED_SEPARATORS.includes(separator);
    }

    /**
     * Cut text into windows of at most `maxChars` characters, each window
     * starting `overlap` characters before the end of the previous one.
     *
     * The window always advances by at least one character, so the loop
     * terminates even when `overlap >= maxChars` or `maxChars <= 0`.
     */
    private static forceSplitByCharacterLimit(text: string, maxChars: number, overlap: number): string[] {
        const limit = maxChars >= 1 ? Math.floor(maxChars) : 1;
        const safeOverlap = Math.min(overlap > 0 ? Math.floor(overlap) : 0, limit - 1);
        const chunks: string[] = [];
        let position = 0;

        while (position < text.length) {
            let endPosition = position + limit;

            if (endPosition >= text.length) {
                // Last chunk
                const lastChunk = text.substring(position).trim();
                if (lastChunk) {
                    chunks.push(lastChunk);
                }
                break;
            }

            // Do not cut a UTF-16 surrogate pair in half when the window allows it.
            if (endPosition - 1 > position && this.isInsideSurrogatePair(text, endPosition)) {
                endPosition -= 1;
            }

            const chunk = text.substring(position, endPosition).trim();
            if (chunk) {
                chunks.push(chunk);
            }

            // Step back by the overlap, but always move forward.
            position = Math.max(endPosition - safeOverlap, position + 1);
        }

        return chunks;
    }

    /** True when cutting `text` at `index` would separate a high/low surrogate pair. */
    private static isInsideSurrogatePair(text: string, index: number): boolean {
        const before = text.charCodeAt(index - 1);
        const after = text.charCodeAt(index);
        return before >= 0xd800 && before <= 0xdbff && after >= 0xdc00 && after <= 0xdfff;
    }

    /**
     * Greedily merge neighbouring chunks (joined by a single space) while the
     * result stays within `maxChars`.
     */
    private static mergeChunks(chunks: string[], maxChars: number): string[] {
        const mergedChunks: string[] = [];
        let currentChunk = '';

        for (const chunk of chunks) {
            const combined = currentChunk ? currentChunk + ' ' + chunk : chunk;

            if (combined.length <= maxChars) {
                currentChunk = combined;
                continue;
            }

            // Can't merge: flush what we have and start a new chunk.
            if (currentChunk.trim()) {
                mergedChunks.push(currentChunk.trim());
            }
            currentChunk = chunk;
        }

        if (currentChunk.trim()) {
            mergedChunks.push(currentChunk.trim());
        }

        return mergedChunks;
    }

    /**
     * Prefix every chunk except the first with the tail of the chunk before it.
     * The prefix is skipped when the chunk already starts with that tail.
     */
    private static addOverlapsBetweenChunks(chunks: string[], overlap: number): string[] {
        if (chunks.length <= 1 || overlap <= 0) {
            return chunks;
        }

        const chunksWithOverlap: string[] = [];
        let previousChunk = '';

        for (const chunk of chunks) {
            let chunkWithOverlap = chunk;

            if (chunksWithOverlap.length > 0) {
                // The tail is always taken from the un-prefixed previous chunk.
                const prevOverlap = this.extractOverlap(previousChunk, overlap);
                if (prevOverlap && !chunkWithOverlap.startsWith(prevOverlap)) {
                    chunkWithOverlap = prevOverlap + ' ' + chunkWithOverlap;
                }
            }

            chunksWithOverlap.push(chunkWithOverlap.trim());
            previousChunk = chunk;
        }

        return chunksWithOverlap;
    }

    /**
     * Extract overlap text from the end of a chunk. The result is never longer
     * than `overlapLength` and starts at a word boundary when one exists.
     */
    private static extractOverlap(text: string, overlapLength: number): string {
        if (text.length <= overlapLength) {
            return text;
        }

        // Search forward for the next space so the overlap starts on a whole word.
        let startPosition = text.length - overlapLength;
        while (startPosition < text.length && text[startPosition] !== ' ') {
            startPosition++;
        }

        if (startPosition >= text.length) {
            // No word boundary found, just take last characters
            return text.substring(text.length - overlapLength);
        }

        return text.substring(startPosition + 1); // +1 to skip the space
    }

    /**
     * Truncate text to fit within token limit (utility method).
     *
     * The cut is moved back to a space when one exists within the last 20% of
     * the character budget; otherwise the text is cut at the budget itself.
     *
     * @param text - Text to truncate
     * @param maxTokens - Token limit, converted to characters at 3.5 chars/token
     * @returns The original text when it already fits, else the trimmed prefix
     */
    static truncateText(text: string, maxTokens: number): string {
        const maxChars = Math.floor(maxTokens * TextChunker.BUDGET_CHARS_PER_TOKEN);

        if (text.length <= maxChars) {
            return text;
        }

        const earliestCut = maxChars * 0.8;
        let truncatePosition = maxChars;
        while (truncatePosition > earliestCut && text[truncatePosition] !== ' ') {
            truncatePosition--;
        }

        if (truncatePosition <= earliestCut) {
            // No word boundary found, truncate at character limit
            truncatePosition = maxChars;
        }

        return text.substring(0, truncatePosition).trim();
    }

    /**
     * Create documents from text chunks (tutorial-style helper).
     *
     * @param text - Text to chunk
     * @param metadata - Metadata copied onto every document; `documentId`, when
     *   present, prefixes the generated chunk ids (a numeric id of 0 is kept)
     * @param maxTokens - Maximum estimated tokens per chunk
     * @param overlap - Number of characters to overlap between chunks
     * @returns One `{ pageContent, metadata }` object per chunk, with
     *   `chunkIndex`, `id` and `totalChunks` added to the metadata
     */
    static createDocumentsFromChunks(
        text: string,
        metadata: Record<string, any> = {},
        maxTokens: number = 450,
        overlap: number = 50
    ): Array<{ pageContent: string; metadata: Record<string, any> }> {
        const chunks = this.chunkText(text, maxTokens, overlap);

        // `||` alone would discard the valid numeric id 0.
        const documentId = metadata.documentId;
        const idPrefix = documentId || documentId === 0 ? documentId : 'doc';

        return chunks.map((chunk, index) => ({
            pageContent: chunk,
            metadata: {
                ...metadata,
                chunkIndex: index,
                id: `${idPrefix}_chunk_${index}`,
                totalChunks: chunks.length
            }
        }));
    }
}
@@ -0,0 +1,15 @@
1
+ /**
2
+ * Embedding service configuration interfaces
3
+ */
4
+ export interface IEmbeddingConfig { // Settings for the embedding provider; referenced by ILangChainRAGConfig.embedding.
5
+ provider: 'cohere'; // Literal type: 'cohere' is the only value this declaration accepts.
6
+ apiKey: string; // Required credential string for the provider.
7
+ model?: string; // Optional model identifier; the fallback used when omitted is not visible in this file.
8
+ batchSize?: number; // Optional; presumably texts per embedding request — TODO confirm against the consumer.
9
+ }
10
+
11
+ export interface IChunkConfig { // Optional text-chunking settings; referenced by ILangChainRAGConfig.chunking.
12
+ chunkSize?: number; // NOTE(review): unit (characters vs. tokens) is not stated here — confirm against the consumer.
13
+ chunkOverlap?: number; // NOTE(review): presumably in the same unit as chunkSize — confirm.
14
+ separators?: string[]; // Optional list of separator strings; ordering semantics are defined by the consumer.
15
+ }
@@ -0,0 +1,5 @@
1
+ // Re-export all types from a central location
2
+ export * from './embedding.types';
3
+ export * from './search.types';
4
+ export * from './vectorstore.types';
5
+ export * from './rag.types';
@@ -0,0 +1,44 @@
1
+ import { ISearchResult } from './search.types';
2
+
3
+ /**
4
+ * RAG service configuration and request/response interfaces
5
+ */
6
+ export interface ILangChainRAGConfig { // Top-level configuration object for the RAG service; only vectorStore is required.
7
+ embedding?: import('./embedding.types').IEmbeddingConfig; // Optional embedding provider settings (inline type import).
8
+ vectorStore: import('./vectorstore.types').IVectorStoreConfig; // Required vector store settings.
9
+ chunking?: import('./embedding.types').IChunkConfig; // Optional chunking settings.
10
+ persistence?: { // Optional persistence settings.
11
+ enabled: boolean; // Required whenever the persistence object is supplied.
12
+ storagePath?: string; // presumably the location the store is persisted to — TODO confirm format (file vs. directory).
13
+ autoSave?: boolean; // NOTE(review): IVectorStoreConfig also declares autoSave; precedence is not visible here.
14
+ };
15
+ }
16
+
17
+ export interface IRAGIndexRequest { // Request payload for indexing one document's content.
18
+ content: string; // Raw text to index.
19
+ documentId: string | number; // Caller-supplied identifier; numeric ids (including 0) are valid per this type.
20
+ metadata?: Record<string, any>; // Optional free-form metadata; how it is stored is decided by the consumer.
21
+ }
22
+
23
+ export interface IRAGSearchRequest { // Request payload for a RAG search.
24
+ query: string; // Search text.
25
+ maxResults?: number; // Optional cap on returned results; default is not visible in this file.
26
+ threshold?: number; // Optional score cut-off; NOTE(review): scale and direction are not defined here — confirm.
27
+ filter?: { // Optional result filter.
28
+ knowledgeIds?: (string | number)[]; // Accepts string and numeric ids (IVectorSearchRequest accepts strings only).
29
+ documentIds?: (string | number)[]; // Accepts string and numeric ids.
30
+ [key: string]: any; // Any additional filter keys are allowed and untyped.
31
+ };
32
+ }
33
+
34
+ export interface IRAGResponse<T = any> { // Generic result envelope; T is the payload type and defaults to any.
35
+ success: boolean; // Outcome flag.
36
+ data: T | null; // Payload, or null; presumably null on failure — TODO confirm with producers.
37
+ error?: string; // Optional error message; presumably set when success is false — TODO confirm.
38
+ }
39
+
40
+ export interface IRAGStats { // Numeric counters describing the RAG index.
41
+ totalChunks: number; // Count of chunks.
42
+ totalDocuments: number; // Count of documents.
43
+ knowledgeItems: number; // Count of knowledge items; NOTE(review): relation to documents is not defined here.
44
+ }
@@ -0,0 +1,56 @@
1
+ /**
2
+ * Search and result interfaces
3
+ */
4
+ export interface ISearchResult { // One search hit.
5
+ content: string; // Text of the matched chunk.
6
+ score: number; // Match score; NOTE(review): range and ordering are not defined in this file — confirm.
7
+ metadata: any; // Untyped metadata attached to the hit; narrow before use.
8
+ chunkId: string; // Identifier of the matched chunk.
9
+ }
10
+
11
+ export interface IVectorSearchRequest { // Request payload for a vector search.
12
+ query: string; // Search text.
13
+ maxResults?: number; // Optional cap on returned results; default is not visible in this file.
14
+ similarityThreshold?: number; // Optional score cut-off; NOTE(review): IRAGSearchRequest names this field `threshold`.
15
+ filter?: { // Optional result filter.
16
+ knowledgeIds?: string[]; // NOTE(review): strings only here, while IRAGSearchRequest allows (string | number)[] — confirm conversion.
17
+ documentIds?: string[]; // NOTE(review): strings only here as well.
18
+ [key: string]: any; // Any additional filter keys are allowed and untyped.
19
+ };
20
+ }
21
+
22
+ export interface IVectorSearchResponse { // Response payload for a vector search.
23
+ results: ISearchResult[]; // The hits.
24
+ totalResults: number; // Result count; NOTE(review): whether this can differ from results.length is not visible here.
25
+ }
26
+
27
+ /**
28
+ * Optimized search interfaces
29
+ */
30
+ export interface IOptimizedSearchRequest { // Search request that carries the candidate chunks and their embeddings inline.
31
+ query: string; // Search text.
32
+ knowledgeVectors: Array<{ // One entry per knowledge item to search.
33
+ knowledgeId: string | number; // Identifier of the knowledge item (string or numeric).
34
+ chunks: Array<{ // Chunks belonging to that knowledge item.
35
+ content: string; // Chunk text.
36
+ embedding: number[]; // Embedding vector for the chunk; required dimensionality is not stated here.
37
+ metadata: any; // Untyped chunk metadata.
38
+ }>;
39
+ }>;
40
+ maxResults?: number; // Optional cap on returned results; default is not visible in this file.
41
+ threshold?: number; // Optional score cut-off; NOTE(review): scale is not defined here — confirm.
42
+ }
43
+
44
+ export interface IOptimizedSearchResult { // One hit of an optimized search; same fields as ISearchResult plus knowledgeId.
45
+ content: string; // Text of the matched chunk.
46
+ score: number; // Match score; NOTE(review): range and ordering are not defined in this file.
47
+ metadata: any; // Untyped metadata attached to the hit.
48
+ knowledgeId: string | number; // Identifier of the knowledge item the chunk belongs to.
49
+ chunkId: string; // Identifier of the matched chunk.
50
+ }
51
+
52
+ export interface IOptimizedSearchResponse { // Response payload for an optimized search.
53
+ results: IOptimizedSearchResult[]; // The hits.
54
+ searchTime: number; // Search duration; NOTE(review): unit (presumably milliseconds) is not stated — confirm.
55
+ totalCandidates: number; // Candidate count; presumably chunks considered before filtering — TODO confirm.
56
+ }
@@ -0,0 +1,23 @@
1
+ /**
2
+ * Vector store configuration interfaces
3
+ */
4
+ export interface IVectorStoreConfig { // Vector store settings; referenced by ILangChainRAGConfig.vectorStore.
5
+ type: 'memory'; // Literal type: 'memory' is the only value this declaration accepts.
6
+ maxResults?: number; // Optional default result cap; fallback value is not visible in this file.
7
+ autoSave?: boolean; // NOTE(review): ILangChainRAGConfig.persistence also declares autoSave; precedence is not visible here.
8
+ similarityThreshold?: number; // Optional score cut-off; scale is not defined here.
9
+ }
10
+
11
+ export interface IDocumentChunk { // One chunk of a document, optionally with its embedding and metadata.
12
+ id: string; // Chunk identifier.
13
+ content: string; // Chunk text.
14
+ embedding?: number[]; // Optional embedding vector; absent until computed, presumably — TODO confirm.
15
+ metadata?: { // Optional metadata; when present, documentId and chunkIndex are required.
16
+ documentId: string; // NOTE(review): string only, while IRAGIndexRequest.documentId is string | number — confirm conversion.
17
+ chunkIndex: number; // Index of the chunk within its document; NOTE(review): base (0 or 1) is not stated here.
18
+ source?: string; // Optional source descriptor.
19
+ title?: string; // Optional title.
20
+ knowledgeId?: string; // Optional knowledge item id (string only here).
21
+ [key: string]: any; // Any additional metadata keys are allowed and untyped.
22
+ };
23
+ }
@@ -1,15 +0,0 @@
1
- import { EmbeddingsInterface } from '@langchain/core/embeddings';
2
- import { Injectable } from '@rws-framework/server/nest';
3
-
4
- import RWSVectorStore, { VectorDocType } from '../models/convo/VectorStore';
5
-
6
- @Injectable() // Removed in 3.0.0 (every line of this file is a deletion in the diff).
7
- class VectorStoreService // Injectable service whose only method builds an RWSVectorStore.
8
- {
9
- async createStore(docs: VectorDocType, embeddings: EmbeddingsInterface): Promise<RWSVectorStore> // Builds a store from documents and an embeddings implementation.
10
- {
11
- return await (new RWSVectorStore(docs, embeddings)).init(); // Constructs the store, then awaits init(); what init() does is defined in ../models/convo/VectorStore.
12
- }
13
- }
14
-
15
- export { VectorStoreService };