@yamo/memory-mesh 2.3.2 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102)
  1. package/bin/memory_mesh.js +1 -1
  2. package/lib/llm/client.d.ts +111 -0
  3. package/lib/llm/client.js +299 -357
  4. package/lib/llm/client.ts +413 -0
  5. package/lib/llm/index.d.ts +17 -0
  6. package/lib/llm/index.js +15 -8
  7. package/lib/llm/index.ts +19 -0
  8. package/lib/memory/adapters/client.d.ts +183 -0
  9. package/lib/memory/adapters/client.js +518 -0
  10. package/lib/memory/adapters/client.ts +678 -0
  11. package/lib/memory/adapters/config.d.ts +137 -0
  12. package/lib/memory/adapters/config.js +189 -0
  13. package/lib/memory/adapters/config.ts +259 -0
  14. package/lib/memory/adapters/errors.d.ts +76 -0
  15. package/lib/memory/adapters/errors.js +128 -0
  16. package/lib/memory/adapters/errors.ts +166 -0
  17. package/lib/memory/context-manager.d.ts +44 -0
  18. package/lib/memory/context-manager.js +344 -0
  19. package/lib/memory/context-manager.ts +432 -0
  20. package/lib/memory/embeddings/factory.d.ts +59 -0
  21. package/lib/memory/embeddings/factory.js +148 -0
  22. package/lib/{embeddings/factory.js → memory/embeddings/factory.ts} +69 -28
  23. package/lib/memory/embeddings/index.d.ts +2 -0
  24. package/lib/memory/embeddings/index.js +2 -0
  25. package/lib/memory/embeddings/index.ts +2 -0
  26. package/lib/memory/embeddings/service.d.ts +164 -0
  27. package/lib/memory/embeddings/service.js +515 -0
  28. package/lib/{embeddings/service.js → memory/embeddings/service.ts} +223 -156
  29. package/lib/memory/index.d.ts +9 -0
  30. package/lib/memory/index.js +9 -1
  31. package/lib/memory/index.ts +20 -0
  32. package/lib/memory/memory-mesh.d.ts +274 -0
  33. package/lib/memory/memory-mesh.js +1469 -678
  34. package/lib/memory/memory-mesh.ts +1803 -0
  35. package/lib/memory/memory-translator.d.ts +19 -0
  36. package/lib/memory/memory-translator.js +125 -0
  37. package/lib/memory/memory-translator.ts +158 -0
  38. package/lib/memory/schema.d.ts +111 -0
  39. package/lib/memory/schema.js +183 -0
  40. package/lib/memory/schema.ts +267 -0
  41. package/lib/memory/scorer.d.ts +26 -0
  42. package/lib/memory/scorer.js +77 -0
  43. package/lib/memory/scorer.ts +95 -0
  44. package/lib/memory/search/index.d.ts +1 -0
  45. package/lib/memory/search/index.js +1 -0
  46. package/lib/memory/search/index.ts +1 -0
  47. package/lib/memory/search/keyword-search.d.ts +62 -0
  48. package/lib/memory/search/keyword-search.js +135 -0
  49. package/lib/{search/keyword-search.js → memory/search/keyword-search.ts} +66 -36
  50. package/lib/scrubber/config/defaults.d.ts +53 -0
  51. package/lib/scrubber/config/defaults.js +49 -57
  52. package/lib/scrubber/config/defaults.ts +117 -0
  53. package/lib/scrubber/index.d.ts +6 -0
  54. package/lib/scrubber/index.js +3 -23
  55. package/lib/scrubber/index.ts +7 -0
  56. package/lib/scrubber/scrubber.d.ts +61 -0
  57. package/lib/scrubber/scrubber.js +99 -121
  58. package/lib/scrubber/scrubber.ts +168 -0
  59. package/lib/scrubber/stages/chunker.d.ts +13 -0
  60. package/lib/scrubber/stages/metadata-annotator.d.ts +18 -0
  61. package/lib/scrubber/stages/normalizer.d.ts +13 -0
  62. package/lib/scrubber/stages/semantic-filter.d.ts +13 -0
  63. package/lib/scrubber/stages/structural-cleaner.d.ts +13 -0
  64. package/lib/scrubber/stages/validator.d.ts +18 -0
  65. package/lib/scrubber/telemetry.d.ts +36 -0
  66. package/lib/scrubber/telemetry.js +53 -58
  67. package/lib/scrubber/telemetry.ts +99 -0
  68. package/lib/utils/logger.d.ts +29 -0
  69. package/lib/utils/logger.js +64 -0
  70. package/lib/utils/logger.ts +85 -0
  71. package/lib/utils/skill-metadata.d.ts +32 -0
  72. package/lib/utils/skill-metadata.js +132 -0
  73. package/lib/utils/skill-metadata.ts +147 -0
  74. package/lib/yamo/emitter.d.ts +73 -0
  75. package/lib/yamo/emitter.js +78 -143
  76. package/lib/yamo/emitter.ts +249 -0
  77. package/lib/yamo/schema.d.ts +58 -0
  78. package/lib/yamo/schema.js +81 -108
  79. package/lib/yamo/schema.ts +165 -0
  80. package/package.json +11 -8
  81. package/index.d.ts +0 -111
  82. package/lib/embeddings/index.js +0 -2
  83. package/lib/index.js +0 -6
  84. package/lib/lancedb/client.js +0 -633
  85. package/lib/lancedb/config.js +0 -215
  86. package/lib/lancedb/errors.js +0 -144
  87. package/lib/lancedb/index.js +0 -4
  88. package/lib/lancedb/schema.js +0 -217
  89. package/lib/scrubber/errors/scrubber-error.js +0 -43
  90. package/lib/scrubber/stages/chunker.js +0 -103
  91. package/lib/scrubber/stages/metadata-annotator.js +0 -74
  92. package/lib/scrubber/stages/normalizer.js +0 -59
  93. package/lib/scrubber/stages/semantic-filter.js +0 -61
  94. package/lib/scrubber/stages/structural-cleaner.js +0 -82
  95. package/lib/scrubber/stages/validator.js +0 -66
  96. package/lib/scrubber/utils/hash.js +0 -39
  97. package/lib/scrubber/utils/html-parser.js +0 -45
  98. package/lib/scrubber/utils/pattern-matcher.js +0 -63
  99. package/lib/scrubber/utils/token-counter.js +0 -31
  100. package/lib/search/index.js +0 -1
  101. package/lib/utils/index.js +0 -1
  102. package/lib/yamo/index.js +0 -15
@@ -0,0 +1,267 @@
1
+ /**
2
+ * LanceDB Schema Definitions for MemoryManager
3
+ * Uses Apache Arrow Schema format for LanceDB JavaScript SDK
4
+ *
5
+ * Supports dynamic vector dimensions for different embedding models:
6
+ * - all-MiniLM-L6-v2: 384 dimensions
7
+ * - all-mpnet-base-v2: 768 dimensions
8
+ * - text-embedding-3-small: 1536 dimensions
9
+ */
10
+
11
+ import * as arrow from "apache-arrow";
12
+ import * as lancedb from "@lancedb/lancedb";
13
+
14
/**
 * Default vector dimension (all-MiniLM-L6-v2)
 */
export const DEFAULT_VECTOR_DIMENSION = 384;

/**
 * Common embedding model dimensions.
 * Keyed by "<provider>/<model>" identifier; consumed by
 * getEmbeddingDimension() for exact and partial model-name matching.
 */
export const EMBEDDING_DIMENSIONS: Record<string, number> = {
  // transformers.js (Xenova) models
  "Xenova/all-MiniLM-L6-v2": 384,
  "Xenova/all-mpnet-base-v2": 768,
  "Xenova/distiluse-base-multilingual-cased-v1": 512,
  // sentence-transformers models
  "sentence-transformers/all-MiniLM-L6-v2": 384,
  "sentence-transformers/all-mpnet-base-v2": 768,
  // OpenAI models
  "openai/text-embedding-3-small": 1536,
  "openai/text-embedding-3-large": 3072,
  // Cohere models
  "cohere/embed-english-light-v3.0": 1024,
  "cohere/embed-english-v3.0": 1024,
};
33
+
34
+ /**
35
+ * Get dimension for a given embedding model
36
+ * @param {string} modelName - Embedding model name or path
37
+ * @returns {number} Vector dimension
38
+ */
39
+ export function getEmbeddingDimension(modelName?: string): number {
40
+ if (!modelName) {
41
+ return DEFAULT_VECTOR_DIMENSION;
42
+ }
43
+
44
+ // Check exact match
45
+ if (EMBEDDING_DIMENSIONS[modelName]) {
46
+ return EMBEDDING_DIMENSIONS[modelName];
47
+ }
48
+
49
+ // Check for partial matches
50
+ for (const [key, dimension] of Object.entries(EMBEDDING_DIMENSIONS)) {
51
+ if (modelName.toLowerCase().includes(key.toLowerCase())) {
52
+ return dimension;
53
+ }
54
+ }
55
+
56
+ // Fallback to default
57
+ return DEFAULT_VECTOR_DIMENSION;
58
+ }
59
+
60
+ /**
61
+ * Create a memory schema with a specific vector dimension
62
+ * @param {number} vectorDim - Vector dimension (e.g., 384, 768, 1536)
63
+ * @returns {arrow.Schema} Arrow schema with specified dimension
64
+ */
65
+ export function createMemorySchema(
66
+ vectorDim: number = DEFAULT_VECTOR_DIMENSION,
67
+ ): arrow.Schema {
68
+ return new arrow.Schema([
69
+ new arrow.Field("id", new arrow.Utf8(), false),
70
+ new arrow.Field(
71
+ "vector",
72
+ new arrow.FixedSizeList(
73
+ vectorDim,
74
+ new arrow.Field("item", new arrow.Float32(), true),
75
+ ),
76
+ false,
77
+ ),
78
+ new arrow.Field("content", new arrow.Utf8(), false),
79
+ new arrow.Field("metadata", new arrow.Utf8(), true), // Stored as JSON string
80
+ new arrow.Field(
81
+ "created_at",
82
+ new arrow.Timestamp(arrow.TimeUnit.MILLISECOND),
83
+ false,
84
+ ),
85
+ new arrow.Field(
86
+ "updated_at",
87
+ new arrow.Timestamp(arrow.TimeUnit.MILLISECOND),
88
+ true,
89
+ ),
90
+ ]);
91
+ }
92
+
93
+ /**
94
+ * Create V2 memory schema with automatic recall fields
95
+ * All new fields are nullable for backward compatibility
96
+ * @param {number} vectorDim - Vector dimension (e.g., 384, 768, 1536)
97
+ * @returns {arrow.Schema} Arrow schema with V2 fields
98
+ */
99
+ export function createMemorySchemaV2(
100
+ vectorDim: number = DEFAULT_VECTOR_DIMENSION,
101
+ ): arrow.Schema {
102
+ return new arrow.Schema([
103
+ // ========== V1 Fields (Backward Compatible) ==========
104
+ new arrow.Field("id", new arrow.Utf8(), false),
105
+ new arrow.Field(
106
+ "vector",
107
+ new arrow.FixedSizeList(
108
+ vectorDim,
109
+ new arrow.Field("item", new arrow.Float32(), true),
110
+ ),
111
+ false,
112
+ ),
113
+ new arrow.Field("content", new arrow.Utf8(), false),
114
+ new arrow.Field("metadata", new arrow.Utf8(), true),
115
+ new arrow.Field(
116
+ "created_at",
117
+ new arrow.Timestamp(arrow.TimeUnit.MILLISECOND),
118
+ false,
119
+ ),
120
+ new arrow.Field(
121
+ "updated_at",
122
+ new arrow.Timestamp(arrow.TimeUnit.MILLISECOND),
123
+ true,
124
+ ),
125
+
126
+ // ========== V2 Fields (All Nullable) ==========
127
+ new arrow.Field("session_id", new arrow.Utf8(), true), // Session association
128
+ new arrow.Field("agent_id", new arrow.Utf8(), true), // Agent/skill that created memory
129
+ new arrow.Field("memory_type", new arrow.Utf8(), true), // 'global', 'session', 'agent'
130
+ new arrow.Field("importance_score", new arrow.Float32(), true), // 0.0-1.0 importance
131
+ new arrow.Field("access_count", new arrow.Int32(), true), // Popularity tracking
132
+ new arrow.Field(
133
+ "last_accessed",
134
+ new arrow.Timestamp(arrow.TimeUnit.MILLISECOND),
135
+ true,
136
+ ),
137
+ ]);
138
+ }
139
+
140
+ /**
141
+ * Create schema for synthesized skills (Recursive Skill Synthesis)
142
+ * @param {number} vectorDim - Vector dimension for intent embedding
143
+ * @returns {arrow.Schema} Arrow schema
144
+ */
145
+ export function createSynthesizedSkillSchema(
146
+ vectorDim: number = DEFAULT_VECTOR_DIMENSION,
147
+ ): arrow.Schema {
148
+ return new arrow.Schema([
149
+ new arrow.Field("id", new arrow.Utf8(), false),
150
+ new arrow.Field("name", new arrow.Utf8(), false),
151
+ new arrow.Field("intent", new arrow.Utf8(), false),
152
+ new arrow.Field("yamo_text", new arrow.Utf8(), false),
153
+ new arrow.Field(
154
+ "vector",
155
+ new arrow.FixedSizeList(
156
+ vectorDim,
157
+ new arrow.Field("item", new arrow.Float32(), true),
158
+ ),
159
+ false,
160
+ ),
161
+ new arrow.Field("metadata", new arrow.Utf8(), true), // Stored as JSON: {reliability, use_count, created_at}
162
+ new arrow.Field(
163
+ "created_at",
164
+ new arrow.Timestamp(arrow.TimeUnit.MILLISECOND),
165
+ false,
166
+ ),
167
+ ]);
168
+ }
169
+
170
+ /**
171
+ * Check if a table is using V2 schema
172
+ * @param {arrow.Schema} schema - Table schema to check
173
+ * @returns {boolean} True if V2 schema detected
174
+ */
175
+ export function isSchemaV2(schema: arrow.Schema): boolean {
176
+ return schema.fields.some((f) => f.name === "session_id");
177
+ }
178
+
179
/**
 * Memory table schema using Apache Arrow format (default 384 dimensions)
 * @deprecated Use createMemorySchema(vectorDim) for dynamic dimensions
 */
export const MEMORY_SCHEMA = createMemorySchema(DEFAULT_VECTOR_DIMENSION);

/**
 * Index configuration for memory table.
 * Indices should be created after data is inserted.
 * NOTE(review): option names look like LanceDB index-builder parameters —
 * confirm against the consumer that actually creates the indices.
 */
export const INDEX_CONFIG = {
  vector: {
    index_type: "ivf_pq", // IVF + product quantization ANN index
    metric: "cosine", // similarity metric for vector search
    num_partitions: 256,
    num_sub_vectors: 8,
  },
  full_text: {
    fields: ["content"], // full-text index over the raw content column
  },
};
200
+
201
+ /**
202
+ * Creates a memory table in LanceDB with the predefined schema (384 dimensions)
203
+ * @param {lancedb.Connection} db - LanceDB connection
204
+ * @param {string} tableName - Name of the table to create (default: 'memory_entries')
205
+ * @returns {Promise<lancedb.Table>} The created or opened table
206
+ * @throws {Error} If table creation fails
207
+ * @deprecated Use createMemoryTableWithDimension() for dynamic dimensions
208
+ */
209
+ export async function createMemoryTable(
210
+ db: lancedb.Connection,
211
+ tableName: string = "memory_entries",
212
+ ): Promise<lancedb.Table> {
213
+ return createMemoryTableWithDimension(
214
+ db,
215
+ tableName,
216
+ DEFAULT_VECTOR_DIMENSION,
217
+ );
218
+ }
219
+
220
+ /**
221
+ * Creates a memory table in LanceDB with a specific vector dimension
222
+ * @param {lancedb.Connection} db - LanceDB connection
223
+ * @param {string} tableName - Name of the table to create
224
+ * @param {number} vectorDim - Vector dimension (384, 768, 1536, etc.)
225
+ * @returns {Promise<lancedb.Table>} The created or opened table
226
+ * @throws {Error} If table creation fails
227
+ */
228
+ export async function createMemoryTableWithDimension(
229
+ db: lancedb.Connection,
230
+ tableName: string,
231
+ vectorDim: number,
232
+ ): Promise<lancedb.Table> {
233
+ try {
234
+ // Check if table already exists
235
+ const existingTables = await db.tableNames();
236
+
237
+ if (existingTables.includes(tableName)) {
238
+ return await db.openTable(tableName);
239
+ }
240
+
241
+ // Create schema with specified dimension
242
+ const schema = createMemorySchema(vectorDim);
243
+
244
+ // Create table with schema
245
+ // LanceDB v0.23.0+ accepts empty array as initial data with schema option
246
+ const table = await db.createTable(tableName, [], { schema } as any); // Cast to any because lancedb types might be strict about options
247
+ return table;
248
+ } catch (error) {
249
+ const message = error instanceof Error ? error.message : String(error);
250
+ throw new Error(
251
+ `Failed to create memory table with dimension ${vectorDim}: ${message}`,
252
+ );
253
+ }
254
+ }
255
+
256
// Aggregate default export mirroring the named exports above, for consumers
// that prefer a single namespace object.
export default {
  MEMORY_SCHEMA,
  INDEX_CONFIG,
  createMemoryTable,
  createMemoryTableWithDimension,
  createMemorySchema,
  createMemorySchemaV2,
  isSchemaV2,
  getEmbeddingDimension,
  DEFAULT_VECTOR_DIMENSION,
  EMBEDDING_DIMENSIONS,
};
@@ -0,0 +1,26 @@
1
/**
 * MemoryScorer - Calculate memory importance and detect duplicates
 */
import { MemoryMesh } from "./memory-mesh.js";
export declare class MemoryScorer {
    #private;
    /**
     * @param {MemoryMesh} mesh - MemoryMesh instance for duplicate checking
     */
    constructor(mesh: MemoryMesh);
    /**
     * Calculate importance score for content.
     * Synchronous heuristic (no mesh access).
     * @param {string} content - Content to score
     * @param {Object} metadata - Associated metadata
     * @returns {number} Importance score (0-1)
     */
    calculateImportance(content: string, metadata?: any): number;
    /**
     * Check if content is duplicate of existing memory
     * @param {string} content - Content to check
     * @param {number} threshold - Similarity threshold (default 0.9)
     * @returns {Promise<boolean>} True if duplicate exists
     */
    isDuplicate(content: string, threshold?: number): Promise<boolean>;
}
export default MemoryScorer;
@@ -0,0 +1,77 @@
1
/**
 * MemoryScorer - Calculate memory importance and detect duplicates
 */
export class MemoryScorer {
  #mesh;

  /**
   * @param {MemoryMesh} mesh - MemoryMesh instance for duplicate checking
   */
  constructor(mesh) {
    this.#mesh = mesh;
  }

  /**
   * Calculate importance score for content.
   * Sums heuristic bonuses — length (≤0.2), structured data (+0.1),
   * interaction type (+0.1/0.15), tools/files (≤0.15 each),
   * importance keywords (≤0.15) — and clamps the total to [0, 1].
   * @param {string} content - Content to score
   * @param {Object} metadata - Associated metadata
   * @returns {number} Importance score (0-1)
   */
  calculateImportance(content, metadata = {}) {
    let total = 0;

    // Longer content earns up to 0.2 (caps at 1000 chars).
    total += Math.min(content.length / 1000, 0.2);

    // Code fences or JSON-ish braces hint at structured data.
    if (content.includes("```") || content.includes("{")) {
      total += 0.1;
    }

    // Interaction-type bonuses.
    switch (metadata.interaction_type) {
      case "tool_execution":
        total += 0.15;
        break;
      case "file_operation":
        total += 0.1;
        break;
      default:
        break;
    }

    // Tool usage and file involvement each add up to 0.15.
    const toolCount = metadata.tools_used?.length ?? 0;
    if (toolCount > 0) {
      total += Math.min(toolCount * 0.05, 0.15);
    }
    const fileCount = metadata.files_involved?.length ?? 0;
    if (fileCount > 0) {
      total += Math.min(fileCount * 0.05, 0.15);
    }

    // Importance keywords add 0.05 each, up to 0.15.
    const importantKeywords = [
      "error",
      "bug",
      "fix",
      "important",
      "critical",
      "note",
      "remember",
    ];
    const haystack = content.toLowerCase();
    let hits = 0;
    for (const word of importantKeywords) {
      if (haystack.includes(word)) {
        hits += 1;
      }
    }
    total += Math.min(hits * 0.05, 0.15);

    return Math.min(total, 1.0);
  }

  /**
   * Check if content is duplicate of existing memory.
   * @param {string} content - Content to check
   * @param {number} threshold - Similarity threshold (default 0.9)
   * @returns {Promise<boolean>} True if duplicate exists
   */
  async isDuplicate(content, threshold = 0.9) {
    try {
      const [best] = await this.#mesh.search(content, {
        limit: 1,
        useCache: false,
      });
      return best !== undefined && best.score >= threshold;
    } catch (_error) {
      // Fail open: on search errors, allow the memory to be stored.
      return false;
    }
  }
}
export default MemoryScorer;
@@ -0,0 +1,95 @@
1
+ /**
2
+ * MemoryScorer - Calculate memory importance and detect duplicates
3
+ */
4
+
5
+ import { MemoryMesh } from "./memory-mesh.js";
6
+
7
+ export class MemoryScorer {
8
+ #mesh: MemoryMesh;
9
+
10
+ /**
11
+ * @param {MemoryMesh} mesh - MemoryMesh instance for duplicate checking
12
+ */
13
+ constructor(mesh: MemoryMesh) {
14
+ this.#mesh = mesh;
15
+ }
16
+
17
+ /**
18
+ * Calculate importance score for content
19
+ * @param {string} content - Content to score
20
+ * @param {Object} metadata - Associated metadata
21
+ * @returns {Promise<number>} Importance score (0-1)
22
+ */
23
+ calculateImportance(content: string, metadata: any = {}): number {
24
+ let score = 0;
25
+
26
+ // Content length (longer = more important, up to a point)
27
+ const length = content.length;
28
+ score += Math.min(length / 1000, 0.2);
29
+
30
+ // Has structured data (JSON, code blocks)
31
+ if (content.includes("```") || content.includes("{")) {
32
+ score += 0.1;
33
+ }
34
+
35
+ // Interaction type bonuses
36
+ if (metadata.interaction_type === "tool_execution") {
37
+ score += 0.15;
38
+ }
39
+ if (metadata.interaction_type === "file_operation") {
40
+ score += 0.1;
41
+ }
42
+
43
+ // Tool usage indicates importance
44
+ if (metadata.tools_used?.length > 0) {
45
+ score += Math.min(metadata.tools_used.length * 0.05, 0.15);
46
+ }
47
+
48
+ // File involvement
49
+ if (metadata.files_involved?.length > 0) {
50
+ score += Math.min(metadata.files_involved.length * 0.05, 0.15);
51
+ }
52
+
53
+ // Keywords that indicate importance
54
+ const importantKeywords = [
55
+ "error",
56
+ "bug",
57
+ "fix",
58
+ "important",
59
+ "critical",
60
+ "note",
61
+ "remember",
62
+ ];
63
+ const lowerContent = content.toLowerCase();
64
+ const keywordMatches = importantKeywords.filter((k) =>
65
+ lowerContent.includes(k),
66
+ ).length;
67
+ score += Math.min(keywordMatches * 0.05, 0.15);
68
+
69
+ return Math.min(score, 1.0);
70
+ }
71
+
72
+ /**
73
+ * Check if content is duplicate of existing memory
74
+ * @param {string} content - Content to check
75
+ * @param {number} threshold - Similarity threshold (default 0.9)
76
+ * @returns {Promise<boolean>} True if duplicate exists
77
+ */
78
+ async isDuplicate(
79
+ content: string,
80
+ threshold: number = 0.9,
81
+ ): Promise<boolean> {
82
+ try {
83
+ const results = await this.#mesh.search(content, {
84
+ limit: 1,
85
+ useCache: false,
86
+ });
87
+ return results.length > 0 && results[0].score >= threshold;
88
+ } catch (_error) {
89
+ // On error, assume not duplicate to allow storage
90
+ return false;
91
+ }
92
+ }
93
+ }
94
+
95
+ export default MemoryScorer;
@@ -0,0 +1 @@
1
+ export { KeywordSearch } from "./keyword-search.js";
@@ -0,0 +1 @@
1
+ export { KeywordSearch } from "./keyword-search.js";
@@ -0,0 +1 @@
1
+ export { KeywordSearch } from "./keyword-search.js";
@@ -0,0 +1,62 @@
1
/**
 * Simple Keyword Search Engine (In-Memory)
 * Provides basic TF-IDF style retrieval to complement vector search
 */
export interface KeywordDoc {
    content: string;
    metadata?: any;
}
export interface KeywordSearchResult extends KeywordDoc {
    id: string;
    score: number;
    matches: string[];
}
export interface SearchOptions {
    limit?: number;
}
export declare class KeywordSearch {
    // Inverted index: token -> (docId -> term frequency)
    index: Map<string, Map<string, number>>;
    // docId -> token count of the indexed content
    docLengths: Map<string, number>;
    // token -> inverse document frequency (recomputed lazily)
    idf: Map<string, number>;
    // docId -> original document, returned in search results
    docs: Map<string, KeywordDoc>;
    // True when idf must be recomputed before the next search
    isDirty: boolean;
    constructor();
    /**
     * Tokenize text into normalized terms
     * (lowercased, punctuation stripped, short tokens dropped)
     * @param {string} text
     * @returns {string[]} tokens
     */
    tokenize(text: string): string[];
    /**
     * Add a document to the index
     * @param {string} id
     * @param {string} content
     * @param {Object} [metadata]
     */
    add(id: string, content: string, metadata?: any): void;
    /**
     * Remove a document
     * @param {string} id
     */
    remove(id: string): void;
    /**
     * Recalculate IDF scores (no-op when the index is unchanged)
     */
    _computeStats(): void;
    /**
     * Search for query terms, ranked by accumulated tf*idf
     * @param {string} query
     * @param {Object} options
     * @returns {Array<{id: string, score: number, matches: string[], content: string, metadata: Object}>}
     */
    search(query: string, options?: SearchOptions): KeywordSearchResult[];
    /**
     * Bulk load records
     * @param {Array} records
     */
    load(records: {
        id: string;
        content: string;
        metadata?: any;
    }[]): void;
}
@@ -0,0 +1,135 @@
1
/**
 * Simple Keyword Search Engine (In-Memory)
 * Provides basic TF-IDF style retrieval to complement vector search
 */
export class KeywordSearch {
  index; // token -> Map<docId, tf>
  docLengths; // docId -> token count
  idf; // token -> idf value
  docs; // docId -> { content, metadata } (returned in search results)
  isDirty; // true when idf must be recomputed before the next search
  constructor() {
    this.index = new Map();
    this.docLengths = new Map();
    this.idf = new Map();
    this.docs = new Map();
    this.isDirty = false;
  }
  /**
   * Tokenize text into normalized terms:
   * lowercased, punctuation stripped, tokens of <= 2 chars dropped,
   * remaining tokens truncated to 20 chars.
   * @param {string} text
   * @returns {string[]} tokens
   */
  tokenize(text) {
    if (!text) {
      return [];
    }
    return text
      .toLowerCase()
      .replace(/[^\w\s]/g, "") // Remove punctuation
      .split(/\s+/)
      .filter((t) => t.length > 2) // Drop very short tokens
      .map((t) => t.substring(0, 20)); // Truncate
  }
  /**
   * Add (or replace) a document in the index.
   * @param {string} id
   * @param {string} content
   * @param {Object} [metadata]
   */
  add(id, content, metadata = {}) {
    // Fix: re-adding an existing id must first drop its stale postings,
    // otherwise tokens from the previous content keep matching this doc.
    if (this.docs.has(id)) {
      this.remove(id);
    }
    const tokens = this.tokenize(content);
    const termFreqs = new Map();
    tokens.forEach((t) => {
      termFreqs.set(t, (termFreqs.get(t) || 0) + 1);
    });
    this.docLengths.set(id, tokens.length);
    this.docs.set(id, { content, metadata });
    // Update inverted index
    for (const [token, freq] of termFreqs.entries()) {
      if (!this.index.has(token)) {
        this.index.set(token, new Map());
      }
      this.index.get(token).set(id, freq);
    }
    this.isDirty = true;
  }
  /**
   * Remove a document.
   * @param {string} id
   */
  remove(id) {
    this.docLengths.delete(id);
    this.docs.delete(id);
    // This is expensive O(Vocab), but okay for small scale
    for (const docMap of this.index.values()) {
      docMap.delete(id);
    }
    this.isDirty = true;
  }
  /**
   * Recalculate IDF scores (lazy: no-op unless the index changed).
   */
  _computeStats() {
    if (!this.isDirty) {
      return;
    }
    const N = this.docLengths.size;
    this.idf.clear();
    for (const [token, docMap] of this.index.entries()) {
      const df = docMap.size;
      // Smoothed IDF: log(N / (df + 1)) + 1
      this.idf.set(token, Math.log(N / (df + 1)) + 1);
    }
    this.isDirty = false;
  }
  /**
   * Search for query terms, ranking documents by accumulated tf*idf.
   * @param {string} query
   * @param {Object} options
   * @param {number} [options.limit=10] - Max results; an explicit 0 is honored
   * @returns {Array<{id: string, score: number, matches: string[], content: string, metadata: Object}>}
   */
  search(query, options = {}) {
    this._computeStats();
    const tokens = this.tokenize(query);
    const scores = new Map(); // docId -> score
    const matches = new Map(); // docId -> matched tokens
    // Fix: use ?? rather than || so a caller-supplied limit of 0 is respected.
    const limit = options.limit ?? 10;
    for (const token of tokens) {
      const docMap = this.index.get(token);
      if (!docMap) {
        continue;
      }
      const idf = this.idf.get(token) || 0;
      for (const [docId, tf] of docMap.entries()) {
        // Plain tf * idf accumulation (no length normalization).
        scores.set(docId, (scores.get(docId) || 0) + tf * idf);
        if (!matches.has(docId)) {
          matches.set(docId, []);
        }
        matches.get(docId).push(token);
      }
    }
    // Rank by descending score and cap at `limit`.
    return Array.from(scores.entries())
      .map(([id, score]) => ({
        id,
        score,
        matches: matches.get(id) || [],
        ...this.docs.get(id),
      }))
      .sort((a, b) => b.score - a.score)
      .slice(0, limit);
  }
  /**
   * Bulk load records.
   * @param {Array<{id: string, content: string, metadata?: Object}>} records
   */
  load(records) {
    records.forEach((r) => this.add(r.id, r.content, r.metadata));
  }
}