mumpix 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mumpix might be problematic. Click here for more details.

@@ -0,0 +1,176 @@
1
+ 'use strict';
2
+
3
+ /**
4
+ * MumpixRecall — hybrid retrieval engine
5
+ *
6
+ * Strategy (in order):
7
+ * 1. Exact substring match (zero-latency)
8
+ * 2. TF-IDF cosine similarity (local semantic approximation, no API needed)
9
+ * 3. Token overlap fallback (always produces a result)
10
+ *
11
+ * Optional: pass embedFn to use your own embeddings (OpenAI, Cohere, etc.)
12
+ */
13
+
14
+ // ── Stopwords ────────────────────────────────────
15
/**
 * Common English function words excluded during tokenization so that
 * TF-IDF and overlap scoring focus on content-bearing terms.
 * Fix: the original list declared 'do' and 'did' twice; a Set ignores
 * duplicates at runtime, but the redundant entries are removed here.
 */
const STOPWORDS = new Set([
  'a','an','the','is','are','was','were','be','been','being',
  'have','has','had','do','does','did','will','would','could',
  'should','may','might','i','you','he','she','it','we','they',
  'my','your','his','her','its','our','their','what','which',
  'who','whom','that','this','these','those','and','but','or',
  'nor','for','so','yet','in','on','at','to','of','up','by',
  'with','about','into','through','during','before','after',
  'above','below','from','out','off','over','under','again',
  'then','once','here','there','when','where','why','how','all',
  'both','each','few','more','most','other','some','such','no',
  'not','only','own','same','than','too','very','just','can',
  'me','him','us','them','am','get','got','put','set','let',
  'if','as','also','even','still','already','now',
]);
30
+
31
+ // ── TF-IDF utilities ─────────────────────────────
32
+
33
/**
 * Split free text into lowercase content tokens.
 * Keeps letters, digits, apostrophes and hyphens; everything else becomes
 * whitespace. Drops one-character tokens and anything in STOPWORDS.
 */
function tokenize(text) {
  const normalized = text.toLowerCase().replace(/[^a-z0-9\s'-]/g, ' ');
  const tokens = [];
  for (const candidate of normalized.split(/\s+/)) {
    if (candidate.length > 1 && !STOPWORDS.has(candidate)) {
      tokens.push(candidate);
    }
  }
  return tokens;
}
40
+
41
/**
 * Term frequency: maps each token to count / total tokens.
 * An empty token list yields an empty map (the `|| 1` guard only
 * prevents division by zero; it never produces entries).
 */
function tf(tokens) {
  const total = tokens.length || 1;
  const counts = new Map();
  for (const token of tokens) {
    counts.set(token, (counts.get(token) || 0) + 1);
  }
  const frequencies = {};
  for (const [token, count] of counts) {
    frequencies[token] = count / total;
  }
  return frequencies;
}
49
+
50
/**
 * Smoothed inverse document frequency over a corpus of token arrays:
 *   idf(t) = ln((N + 1) / (df(t) + 1)) + 1
 * where df(t) counts documents containing t at least once.
 */
function buildIDF(corpus) {
  const documentCount = corpus.length;
  const documentFrequency = {};
  for (const tokens of corpus) {
    for (const term of new Set(tokens)) {
      documentFrequency[term] = (documentFrequency[term] || 0) + 1;
    }
  }
  const idf = {};
  for (const [term, df] of Object.entries(documentFrequency)) {
    idf[term] = Math.log((documentCount + 1) / (df + 1)) + 1;
  }
  return idf;
}
63
+
64
/** Weight a TF map by IDF; terms absent from the IDF table default to 1. */
function tfidfVec(tfMap, idf) {
  return Object.fromEntries(
    Object.entries(tfMap).map(([term, weight]) => [term, weight * (idf[term] || 1)])
  );
}
71
+
72
/**
 * Cosine similarity between two sparse vectors (term → weight maps).
 * Missing terms count as 0; returns 0 when either vector has zero norm.
 */
function cosine(a, b) {
  const keys = new Set([...Object.keys(a), ...Object.keys(b)]);
  let dot = 0;
  let sqA = 0;
  let sqB = 0;
  keys.forEach((key) => {
    const x = a[key] || 0;
    const y = b[key] || 0;
    dot += x * y;
    sqA += x * x;
    sqB += y * y;
  });
  const denom = Math.sqrt(sqA) * Math.sqrt(sqB);
  return denom === 0 ? 0 : dot / denom;
}
85
+
86
+ // ── Token overlap (tie-breaker / fallback) ───────
87
+
88
/**
 * Fraction of query tokens that also appear in the document's tokens.
 * Duplicated query tokens each count; an empty query scores 0.
 */
function tokenOverlap(queryTokens, docTokens) {
  if (queryTokens.length === 0) return 0;
  const docSet = new Set(docTokens);
  let hits = 0;
  for (const token of queryTokens) {
    if (docSet.has(token)) hits += 1;
  }
  return hits / queryTokens.length;
}
94
+
95
+ // ── Main recall function ──────────────────────────
96
+
97
/**
 * recall(query, records, opts) → Record | null
 *
 * Convenience wrapper around recallMany that returns the single best
 * match, or null when nothing qualifies.
 *
 * opts.k       — number of results to consider (default 1)
 * opts.embedFn — async fn(texts[]) → number[][] for custom embeddings
 * opts.filter  — fn(record) → bool for pre-filtering
 * opts.since   — timestamp: only consider records newer than this
 * opts.mode    — "exact" | "semantic" | "hybrid" (default "hybrid")
 */
async function recall(query, records, opts = {}) {
  const matches = await recallMany(query, records, { ...opts, k: opts.k || 1 });
  return matches.length > 0 ? matches[0] : null;
}
110
+
111
/**
 * recallMany(query, records, opts) → Record[] (ranked, best first)
 *
 * Retrieval strategy, in the order the module docstring promises:
 *   1. Exact substring match (case-insensitive, zero-latency shortcut)
 *   2. Custom embeddings via opts.embedFn (best-effort)
 *   3. TF-IDF + token overlap + recency blend (always produces a result)
 *
 * opts.k       — number of results to return (default 5)
 * opts.mode    — "exact" | "semantic" | "hybrid" (default "hybrid")
 * opts.filter  — fn(record) → bool pre-filter
 * opts.since   — timestamp: only records with r.ts >= since are considered
 * opts.embedFn — async fn(texts[]) → number[][]
 *
 * Fixes over the original implementation:
 *   - In "hybrid" mode exact matches were computed but never returned
 *     (the shortcut only fired for mode === "exact"); they now
 *     short-circuit as documented.
 *   - In "exact" mode a miss used to fall through to TF-IDF scoring,
 *     violating the mode contract; it now returns [].
 *   - The blend comment claimed 70/30 but the weights are 70/20/10.
 */
async function recallMany(query, records, opts = {}) {
  const k = opts.k || 5;
  const mode = opts.mode || 'hybrid';
  const filter = opts.filter || null;
  const since = opts.since || null;

  let pool = records;
  if (filter) pool = pool.filter(filter);
  if (since) pool = pool.filter(r => r.ts >= since);
  if (!pool.length) return [];

  // 1. Exact substring match — short-circuits in both exact and hybrid mode.
  if (mode !== 'semantic') {
    const queryLower = query.toLowerCase();
    const exact = pool.filter(r => r.content.toLowerCase().includes(queryLower));
    if (exact.length) return exact.slice(0, k).map(r => ({ ...r, _score: 1 }));
    // Exact mode never falls back to semantic scoring.
    if (mode === 'exact') return [];
  }

  // 2. Custom embeddings — best-effort: any failure falls through to TF-IDF.
  if (opts.embedFn && mode !== 'exact') {
    try {
      const texts = [query, ...pool.map(r => r.content)];
      const vectors = await opts.embedFn(texts);
      const qVec = vectors[0];
      const scored = pool.map((r, i) => ({ r, score: cosineArrays(qVec, vectors[i + 1]) }));
      scored.sort((a, b) => b.score - a.score);
      return scored.slice(0, k).map(s => ({ ...s.r, _score: s.score }));
    } catch (_) { /* fall through to TF-IDF */ }
  }

  // 3. TF-IDF semantic scoring, blended with token overlap and recency.
  const qTokens = tokenize(query);
  const docTokens = pool.map(r => tokenize(r.content));
  const idf = buildIDF([qTokens, ...docTokens]);
  const qVec = tfidfVec(tf(qTokens), idf);

  const scored = pool.map((r, i) => {
    const sem = cosine(qVec, tfidfVec(tf(docTokens[i]), idf));
    const over = tokenOverlap(qTokens, docTokens[i]);
    // Blend: 70% semantic + 20% overlap + 10% recency (exponential ~7-day decay).
    const recency = Math.exp(-(Date.now() - r.ts) / (1000 * 60 * 60 * 24 * 7));
    const score = (sem * 0.70) + (over * 0.20) + (recency * 0.10);
    return { r, score, _debug: { sem, over, recency } };
  });

  scored.sort((a, b) => b.score - a.score);
  return scored.slice(0, k).map(s => ({ ...s.r, _score: s.score }));
}
164
+
165
/**
 * Cosine similarity between two equal-length dense numeric arrays
 * (used for caller-supplied embedding vectors).
 * Returns 0 when either vector has zero norm.
 */
function cosineArrays(a, b) {
  let dot = 0;
  let sqA = 0;
  let sqB = 0;
  a.forEach((x, i) => {
    const y = b[i];
    dot += x * y;
    sqA += x * x;
    sqB += y * y;
  });
  const denom = Math.sqrt(sqA) * Math.sqrt(sqB);
  return denom === 0 ? 0 : dot / denom;
}
175
+
176
// Public API of the recall module; tokenize is exported for custom pipelines.
module.exports = { recall, recallMany, tokenize };
@@ -0,0 +1,230 @@
1
+ 'use strict';
2
+
3
+ /**
4
+ * MumpixStore — crash-safe, append-only storage engine
5
+ *
6
+ * File format (.mumpix):
7
+ * Line 0: JSON header {"v":1,"consistency":"strict","created":ts}
8
+ * Line N: JSON record {"id":1,"content":"...","ts":ts,"h":"0xabc"}
9
+ *
10
+ * WAL (.mumpix.wal):
11
+ * Each line: {"op":"write"|"clear","entry"?:{...},"ts":ts}
12
+ * Replayed on open if present, then merged and deleted.
13
+ */
14
+
15
+ const fs = require('fs');
16
+ const path = require('path');
17
+ const os = require('os');
18
+
19
// On-disk format version stamped into every file header; files carrying
// a different version are rejected by _load().
const MAGIC_VERSION = 1;

/**
 * MumpixStore — crash-safe, append-only storage engine.
 *
 * Main file (.mumpix):
 *   Line 0: JSON header {"v":1,"consistency":"strict","created":ts}
 *   Line N: JSON record {"id":1,"content":"...","ts":ts,"h":"0xabc"}
 *
 * WAL (.mumpix.wal):
 *   One JSON line per pending op: {"op":"write"|"clear","entry"?:{...},"ts":ts}
 *   Replayed on open if present, merged into the main file, then deleted.
 */
class MumpixStore {
  /** @param {string} filePath — path of the .mumpix file (created on open). */
  constructor(filePath) {
    this.filePath = path.resolve(filePath);
    this.walPath = this.filePath + '.wal';
    this.header = null;   // parsed/constructed header object
    this.records = [];    // in-memory mirror: { id, content, ts, h }
    this._nextId = 1;     // next record id to assign
    this._fd = null;      // open file descriptor for appends
  }

  // ── Public ──────────────────────────────────────

  /**
   * Open (or create) the store. Replays any leftover WAL from a previous
   * crash, then keeps an append-mode fd for writes.
   * opts.consistency — 'eventual' (default) | 'strict' | 'verified';
   * strict/verified fdatasync() after every write.
   * @returns {MumpixStore} this, for chaining
   */
  open(opts = {}) {
    const consistency = opts.consistency || 'eventual';

    if (fs.existsSync(this.filePath)) {
      this._load();
      // Recover any writes that never reached the main file.
      this._replayWAL();
    } else {
      this.header = {
        v: MAGIC_VERSION,
        consistency,
        created: Date.now(),
        path: path.basename(this.filePath),
      };
      this._writeHeader();
    }

    // Persist a consistency change requested by the caller.
    if (this.header.consistency !== consistency) {
      this.header.consistency = consistency;
      this._rewriteFull();
    }

    this._fd = fs.openSync(this.filePath, 'a');
    return this;
  }

  /** Close the append fd; the store may be re-open()ed afterwards. */
  close() {
    if (this._fd !== null) {
      fs.closeSync(this._fd);
      this._fd = null;
    }
  }

  /**
   * Append a record. WAL-first for crash safety: the entry hits the WAL,
   * then the main file, then (strict/verified) the disk; only then is the
   * WAL cleared.
   *
   * Fixes vs. original: validates input, refuses writes on a closed store,
   * and hashes the *trimmed* content that is actually stored so record.h
   * can later be re-verified against record.content.
   *
   * @param {string} content — text to store (leading/trailing space trimmed)
   * @returns the new record { id, content, ts, h }
   * @throws {TypeError} when content is not a string
   * @throws {Error} when the store has not been open()ed
   */
  write(content) {
    if (typeof content !== 'string') {
      throw new TypeError('mumpix: write() expects a string');
    }
    if (this._fd === null) {
      throw new Error('mumpix: store is not open — call open() first');
    }

    const trimmed = content.trim();
    const record = {
      id: this._nextId++,
      content: trimmed,
      ts: Date.now(),
      h: this._hash(trimmed),
    };

    // 1. Write-ahead log entry.
    const walEntry = JSON.stringify({ op: 'write', entry: record, ts: Date.now() }) + '\n';
    fs.appendFileSync(this.walPath, walEntry, 'utf8');

    // 2. Commit to main file.
    fs.writeSync(this._fd, JSON.stringify(record) + '\n', null, 'utf8');

    // 3. Flush to disk when the caller asked for durability.
    const consistency = this.header.consistency;
    if (consistency === 'strict' || consistency === 'verified') {
      fs.fdatasyncSync(this._fd);
    }

    // 4. Commit succeeded — drop the WAL entry.
    this._clearWAL();

    this.records.push(record);
    return record;
  }

  /**
   * Clear all records — WAL-first, like write().
   * @returns {number} how many records were removed
   */
  clear() {
    const count = this.records.length;

    fs.appendFileSync(this.walPath, JSON.stringify({ op: 'clear', count, ts: Date.now() }) + '\n', 'utf8');

    this.records = [];
    this._nextId = 1;
    this._rewriteFull();
    this._clearWAL();

    return count;
  }

  /** Return all records as shallow copies (callers cannot mutate the store). */
  all() {
    return this.records.map(r => ({ ...r }));
  }

  /** Return store metadata: path, consistency, record count, size, version. */
  stats() {
    const stat = fs.existsSync(this.filePath) ? fs.statSync(this.filePath) : null;
    return {
      path: this.filePath,
      consistency: this.header.consistency,
      records: this.records.length,
      created: this.header.created,
      sizeBytes: stat ? stat.size : 0,
      version: this.header.v,
    };
  }

  // ── Private ─────────────────────────────────────

  // Parse the main file: line 0 is the header, the rest are records.
  // Corrupt record lines are skipped; _nextId advances past the max id seen.
  _load() {
    const raw = fs.readFileSync(this.filePath, 'utf8');
    const lines = raw.split('\n').filter(l => l.trim());

    if (!lines.length) throw new Error(`mumpix: corrupt or empty file: ${this.filePath}`);

    this.header = JSON.parse(lines[0]);
    if (this.header.v !== MAGIC_VERSION) {
      throw new Error(`mumpix: unsupported file version ${this.header.v}`);
    }

    this.records = [];
    for (let i = 1; i < lines.length; i++) {
      try {
        const r = JSON.parse(lines[i]);
        if (r && r.id && r.content) {
          this.records.push(r);
          if (r.id >= this._nextId) this._nextId = r.id + 1;
        }
      } catch (_) { /* skip corrupt lines */ }
    }
  }

  // Replay a leftover WAL (crash between WAL append and main-file commit):
  // apply uncommitted writes/clears, rewrite the main file if anything
  // changed, then delete the WAL.
  _replayWAL() {
    if (!fs.existsSync(this.walPath)) return;

    const raw = fs.readFileSync(this.walPath, 'utf8');
    const lines = raw.split('\n').filter(l => l.trim());
    let dirty = false;

    for (const line of lines) {
      try {
        const entry = JSON.parse(line);
        if (entry.op === 'write' && entry.entry) {
          const exists = this.records.find(r => r.id === entry.entry.id);
          if (!exists) {
            this.records.push(entry.entry);
            if (entry.entry.id >= this._nextId) this._nextId = entry.entry.id + 1;
            dirty = true;
          }
        } else if (entry.op === 'clear') {
          this.records = [];
          this._nextId = 1;
          dirty = true;
        }
      } catch (_) { /* skip corrupt WAL lines */ }
    }

    if (dirty) this._rewriteFull();
    this._clearWAL();
  }

  // Create the file (and parent directory) containing only the header line.
  _writeHeader() {
    const dir = path.dirname(this.filePath);
    if (!fs.existsSync(dir)) fs.mkdirSync(dir, { recursive: true });
    fs.writeFileSync(this.filePath, JSON.stringify(this.header) + '\n', 'utf8');
  }

  // Atomically rewrite header + all records via temp-file rename, then
  // re-open the append fd (the old fd points at the replaced inode).
  _rewriteFull() {
    const tmp = this.filePath + '.tmp.' + process.pid;
    const lines = [JSON.stringify(this.header)];
    for (const r of this.records) lines.push(JSON.stringify(r));
    fs.writeFileSync(tmp, lines.join('\n') + '\n', 'utf8');
    fs.renameSync(tmp, this.filePath);

    if (this._fd !== null) {
      fs.closeSync(this._fd);
      this._fd = fs.openSync(this.filePath, 'a');
    }
  }

  // Delete the WAL file if present (all its entries are committed).
  _clearWAL() {
    if (fs.existsSync(this.walPath)) {
      fs.unlinkSync(this.walPath);
    }
  }

  // 32-bit FNV-1a hash, rendered as a zero-padded hex string.
  _hash(s) {
    let h = 0x811c9dc5;
    for (let i = 0; i < s.length; i++) {
      h ^= s.charCodeAt(i);
      h = Math.imul(h, 0x01000193);
    }
    return '0x' + (h >>> 0).toString(16).padStart(8, '0');
  }
}
229
+
230
// Sole export: the crash-safe append-only storage engine.
module.exports = { MumpixStore };
package/src/index.js ADDED
@@ -0,0 +1,38 @@
1
+ 'use strict';
2
+
3
+ /**
4
+ * mumpix — SQLite for AI
5
+ *
6
+ * Quick start:
7
+ * const { Mumpix } = require('mumpix')
8
+ * const db = Mumpix.open('./agent.mumpix', { consistency: 'strict' })
9
+ *
10
+ * await db.remember('User prefers TypeScript')
11
+ * const ans = await db.recall('What language do they prefer?')
12
+ * console.log(ans) // → 'User prefers TypeScript'
13
+ *
14
+ * await db.close()
15
+ */
16
+
17
const { MumpixDB } = require('./core/MumpixDB');
const { MumpixStore } = require('./core/store');
const { MumpixAudit } = require('./core/audit');
const { recall, recallMany, tokenize } = require('./core/recall');

// Convenience alias so callers can write Mumpix.open(...) — identical object
// to MumpixDB, not a wrapper.
const Mumpix = MumpixDB;

module.exports = {
  // Primary export — the high-level facade most callers should use.
  Mumpix,
  MumpixDB,

  // Lower-level building blocks for advanced use: MumpixStore is the
  // append-only file engine, MumpixAudit the audit log.
  MumpixStore,
  MumpixAudit,

  // Recall utilities — useful for custom retrieval pipelines.
  recall,
  recallMany,
  tokenize,
};
@@ -0,0 +1,131 @@
1
+ 'use strict';
2
+
3
+ /**
4
+ * Mumpix × LangChain integration
5
+ *
6
+ * Usage:
7
+ * const { MumpixVectorStore } = require('mumpix/src/integrations/langchain')
8
+ * const db = Mumpix.open('./agent.mumpix', { consistency: 'strict' })
9
+ * const store = new MumpixVectorStore(db)
10
+ *
11
+ * // Use as a LangChain Memory:
12
+ * const memory = new MumpixChatMemory({ db })
13
+ */
14
+
15
+ const { recallMany } = require('../core/recall');
16
+
17
/**
 * MumpixVectorStore — drop-in VectorStore adapter for LangChain.
 *
 * Works with any LangChain retriever that accepts .similaritySearch().
 * No external vector library required — retrieval is delegated to the
 * wrapped MumpixDB instance.
 */
class MumpixVectorStore {
  /** @param db — an opened MumpixDB (must provide recallMany/remember). */
  constructor(db) {
    this.db = db;
  }

  /**
   * LangChain VectorStore interface: top-k documents for a query.
   * NOTE(review): the core recall module attaches the score as `_score`,
   * but this adapter originally read `r.score` (undefined in that case).
   * We now prefer `score` and fall back to `_score` — confirm which field
   * MumpixDB.recallMany actually emits.
   */
  async similaritySearch(query, k = 4) {
    const results = await this.db.recallMany(query, k);
    return results.map(r => ({
      pageContent: r.content,
      metadata: {
        id: r.id,
        ts: r.ts,
        score: r.score !== undefined ? r.score : r._score,
      },
    }));
  }

  /** Like similaritySearch, but returns [document, score] pairs. */
  async similaritySearchWithScore(query, k = 4) {
    const results = await this.db.recallMany(query, k);
    return results.map(r => ([
      { pageContent: r.content, metadata: { id: r.id, ts: r.ts } },
      r.score !== undefined ? r.score : r._score,
    ]));
  }

  /**
   * Store LangChain documents; returns the new record ids as strings.
   * Delegates to addTexts so both insert paths share one implementation.
   */
  async addDocuments(docs) {
    return this.addTexts(docs.map(doc => doc.pageContent));
  }

  /**
   * Store raw texts; returns the new record ids as strings.
   * NOTE(review): `metadatas` is accepted for interface compatibility but
   * is not persisted — MumpixDB.remember() only receives the text.
   */
  async addTexts(texts, metadatas = []) {
    const ids = [];
    for (const text of texts) {
      const r = await this.db.remember(text);
      ids.push(r.id.toString());
    }
    return ids;
  }
}
65
+
66
/**
 * MumpixChatMemory — LangChain BaseChatMemory compatible adapter.
 *
 * Persists conversation turns as memories and retrieves the k most
 * relevant ones for each new input.
 *
 *   const memory = new MumpixChatMemory({ db, k: 3 })
 *   // Then pass as `memory` to LLMChain, ConversationChain, etc.
 */
class MumpixChatMemory {
  constructor({ db, k = 4, inputKey = 'input', outputKey = 'output' } = {}) {
    this.db = db;
    this.k = k;
    this.inputKey = inputKey;
    this.outputKey = outputKey;
    this.memoryKey = 'history';
  }

  // LangChain inspects this to learn which variables the memory provides.
  get memoryKeys() {
    return [this.memoryKey];
  }

  // Look up the k memories most relevant to the incoming input and expose
  // them, newline-joined, under the `history` key.
  async loadMemoryVariables(values) {
    const query = values[this.inputKey] || '';
    const matches = await this.db.recallMany(query, this.k);
    const lines = matches.map(m => m.content);
    return { [this.memoryKey]: lines.join('\n') };
  }

  // Persist one human/AI exchange as two separate memories; empty turns
  // are skipped.
  async saveContext(inputs, outputs) {
    const humanTurn = inputs[this.inputKey] || '';
    const aiTurn = outputs[this.outputKey] || '';
    if (humanTurn) await this.db.remember(`Human: ${humanTurn}`);
    if (aiTurn) await this.db.remember(`AI: ${aiTurn}`);
  }

  // Wipe all stored memories.
  async clear() {
    await this.db.clear();
  }
}
105
+
106
/**
 * MumpixRetriever — LangChain BaseRetriever compatible adapter.
 *
 *   const retriever = new MumpixRetriever(db, { k: 5 })
 *   const results = await retriever.getRelevantDocuments("query")
 */
class MumpixRetriever {
  /** @param db — an opened MumpixDB; opts.k — number of documents (default 4). */
  constructor(db, opts = {}) {
    this.db = db;
    this.k = opts.k || 4;
  }

  /**
   * Return the top-k records as LangChain-style documents.
   * NOTE(review): the core recall module attaches the score as `_score`,
   * but this adapter originally read `r.score` (undefined in that case);
   * we now fall back to `_score` — confirm which field
   * MumpixDB.recallMany actually emits.
   */
  async getRelevantDocuments(query) {
    const results = await this.db.recallMany(query, this.k);
    return results.map(r => ({
      pageContent: r.content,
      metadata: {
        id: r.id,
        ts: r.ts,
        score: r.score !== undefined ? r.score : r._score,
      },
    }));
  }
}
126
+
127
// LangChain adapters: vector store, chat memory, and retriever.
module.exports = {
  MumpixVectorStore,
  MumpixChatMemory,
  MumpixRetriever,
};
@@ -0,0 +1,86 @@
1
+ 'use strict';
2
+
3
+ /**
4
+ * Mumpix × LlamaIndex integration
5
+ *
6
+ * Usage:
7
+ * const { MumpixReader, MumpixIndex } = require('mumpix/src/integrations/llamaindex')
8
+ * const db = Mumpix.open('./agent.mumpix', { consistency: 'strict' })
9
+ * const index = new MumpixIndex(db)
10
+ * const retriever = index.asRetriever({ topK: 5 })
11
+ */
12
+
13
/**
 * MumpixIndex — wraps a MumpixDB as a LlamaIndex-style index.
 */
class MumpixIndex {
  constructor(db) {
    this.db = db;
  }

  // Build a LlamaIndex-compatible retriever over this index.
  asRetriever(opts = {}) {
    return new MumpixIndexRetriever(this.db, opts);
  }

  /**
   * Insert a document / text node. Accepts a plain string or a node
   * object exposing `.text` or `.content`.
   */
  async insert(node) {
    let text;
    if (typeof node === 'string') {
      text = node;
    } else {
      text = node.text || node.content || '';
    }
    return this.db.remember(text);
  }

  // Insert several nodes sequentially, preserving result order.
  async insertMany(nodes) {
    const inserted = [];
    for (const node of nodes) {
      inserted.push(await this.insert(node));
    }
    return inserted;
  }
}
39
+
40
/**
 * MumpixIndexRetriever — LlamaIndex-compatible retriever.
 */
class MumpixIndexRetriever {
  /** @param db — an opened MumpixDB; opts.topK — result count (default 5). */
  constructor(db, opts = {}) {
    this.db = db;
    this.topK = opts.topK || 5;
  }

  /**
   * Retrieve the topK most relevant records as LlamaIndex node-with-score
   * objects. Accepts a raw query string or a { queryStr } bundle.
   * NOTE(review): the core recall module attaches the score as `_score`,
   * but this adapter originally read `r.score` (undefined in that case);
   * we now fall back to `_score` — confirm which field
   * MumpixDB.recallMany actually emits.
   */
  async retrieve(queryBundle) {
    const query = typeof queryBundle === 'string' ? queryBundle : queryBundle.queryStr;
    const results = await this.db.recallMany(query, this.topK);
    return results.map(r => ({
      node: {
        id_: r.id.toString(),
        text: r.content,
        metadata: { ts: r.ts },
        getContent: () => r.content,
      },
      score: r.score !== undefined ? r.score : r._score,
    }));
  }
}
63
+
64
/**
 * MumpixReader — load .mumpix file contents as LlamaIndex documents.
 */
class MumpixReader {
  constructor(db) {
    this.db = db;
  }

  // Fetch every stored memory and shape it as a LlamaIndex document.
  async loadData() {
    const memories = await this.db.list();
    const documents = [];
    for (const memory of memories) {
      documents.push({
        id_: memory.id.toString(),
        text: memory.content,
        metadata: { source: 'mumpix', ts: memory.ts },
      });
    }
    return documents;
  }
}
81
+
82
// LlamaIndex adapters: index wrapper, retriever, and document reader.
module.exports = {
  MumpixIndex,
  MumpixIndexRetriever,
  MumpixReader,
};