mumpix 1.0.20 → 1.0.29

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,176 +1 @@
1
- 'use strict';
2
-
3
- /**
4
- * MumpixRecall — hybrid retrieval engine
5
- *
6
- * Strategy (in order):
7
- * 1. Exact substring match (zero-latency)
8
- * 2. TF-IDF cosine similarity (local semantic approximation, no API needed)
9
- * 3. Token overlap fallback (always produces a result)
10
- *
11
- * Optional: pass embedFn to use your own embeddings (OpenAI, Cohere, etc.)
12
- */
13
-
14
- // ── Stopwords ────────────────────────────────────
15
// High-frequency English function words that are excluded from token lists
// (see the STOPWORDS.has() check in tokenize). Every word is listed exactly
// once; the earlier list repeated 'do' and 'did'.
const STOPWORDS = new Set([
  'a', 'an', 'the', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
  'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could',
  'should', 'may', 'might', 'i', 'you', 'he', 'she', 'it', 'we', 'they',
  'my', 'your', 'his', 'her', 'its', 'our', 'their', 'what', 'which',
  'who', 'whom', 'that', 'this', 'these', 'those', 'and', 'but', 'or',
  'nor', 'for', 'so', 'yet', 'in', 'on', 'at', 'to', 'of', 'up', 'by',
  'with', 'about', 'into', 'through', 'during', 'before', 'after',
  'above', 'below', 'from', 'out', 'off', 'over', 'under', 'again',
  'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all',
  'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no',
  'not', 'only', 'own', 'same', 'than', 'too', 'very', 'just', 'can',
  'me', 'him', 'us', 'them', 'am', 'get', 'got', 'put', 'set', 'let',
  'if', 'as', 'also', 'even', 'still', 'already', 'now',
]);
30
-
31
- // ── TF-IDF utilities ─────────────────────────────
32
-
33
/**
 * Split text into lowercase search tokens.
 *
 * Letters, combining marks, digits, apostrophes and hyphens are kept; every
 * other character acts as a separator. Apostrophes/hyphens are then trimmed
 * from both ends of each token so that quoted words ("'hello'") and bare
 * dashes ("--") do not produce tokens that can never match. Tokens shorter
 * than two characters and stopwords are dropped.
 *
 * For plain unquoted ASCII text the output is the same as before; non-ASCII
 * letters (e.g. "café") are now preserved instead of being cut apart.
 *
 * @param {*} text - Value to tokenize; null/undefined yield an empty list,
 *   anything else is converted with String().
 * @returns {string[]} Tokens in their original order (duplicates kept).
 */
function tokenize(text) {
  const input = text == null ? '' : String(text);
  return input
    .toLowerCase()
    .replace(/[^\p{L}\p{M}\p{N}\s'-]/gu, ' ')
    .split(/\s+/)
    // Trim edge punctuation only; "don't" and "well-known" stay intact.
    .map(token => token.replace(/^['-]+|['-]+$/g, ''))
    .filter(token => token.length > 1 && !STOPWORDS.has(token));
}
40
-
41
/**
 * Compute normalised term frequencies for one token list.
 *
 * @param {string[]} tokens - Tokens of a single document or query.
 * @returns {Object<string, number>} Prototype-less object mapping each
 *   distinct token to (occurrences / tokens.length). Empty input yields an
 *   empty object.
 */
function tf(tokens) {
  // Count in a Map: with a plain {} accumulator, a token such as
  // "constructor" reads the inherited Object.prototype member instead of
  // undefined, which turned the count into a string and the frequency into NaN.
  const counts = new Map();
  for (const token of tokens) {
    counts.set(token, (counts.get(token) || 0) + 1);
  }

  const total = tokens.length || 1; // avoid dividing by zero for empty input
  const out = Object.create(null); // no inherited keys for later lookups
  for (const [token, count] of counts) {
    out[token] = count / total;
  }
  return out;
}
49
-
50
/**
 * Build a smoothed inverse-document-frequency table for a corpus.
 *
 * idf(term) = ln((N + 1) / (df + 1)) + 1, where N is the number of documents
 * and df the number of documents containing the term at least once.
 *
 * @param {string[][]} corpus - One token array per document.
 * @returns {Object<string, number>} Prototype-less object mapping each term
 *   that occurs in the corpus to its IDF weight.
 */
function buildIDF(corpus) {
  const docCount = corpus.length;

  // A Map keeps terms like "constructor" from colliding with members
  // inherited from Object.prototype (which made the count a string → NaN).
  const docFreq = new Map();
  for (const doc of corpus) {
    // Set: a term counts once per document however often it repeats.
    for (const term of new Set(doc)) {
      docFreq.set(term, (docFreq.get(term) || 0) + 1);
    }
  }

  const idf = Object.create(null);
  for (const [term, count] of docFreq) {
    idf[term] = Math.log((docCount + 1) / (count + 1)) + 1;
  }
  return idf;
}
63
-
64
/**
 * Weight a term-frequency map by IDF to obtain a sparse TF-IDF vector.
 *
 * @param {Object<string, number>} tfMap - term → term frequency.
 * @param {Object<string, number>} idf - term → IDF weight.
 * @returns {Object<string, number>} Prototype-less object, term → tf * idf.
 *   Terms without an own (non-zero) entry in `idf` use a weight of 1.
 */
function tfidfVec(tfMap, idf) {
  const hasOwn = Object.prototype.hasOwnProperty;
  const vec = Object.create(null);
  for (const [term, weight] of Object.entries(tfMap)) {
    // Only trust own properties: for a term like "constructor", idf[term] on
    // a plain object would otherwise return an inherited function → NaN.
    const termIdf = hasOwn.call(idf, term) ? idf[term] : undefined;
    vec[term] = weight * (termIdf || 1);
  }
  return vec;
}
71
-
72
/**
 * Cosine similarity between two sparse vectors stored as term → weight objects.
 *
 * @param {Object<string, number>} a
 * @param {Object<string, number>} b
 * @returns {number} dot(a, b) / (|a| * |b|), or 0 when either vector has
 *   zero magnitude.
 */
function cosine(a, b) {
  const hasOwn = Object.prototype.hasOwnProperty;
  // Read own properties only. A term present in just one vector (e.g.
  // "constructor") must count as 0 in the other, not as the function that a
  // plain object inherits under that name — that produced NaN scores.
  const weightOf = (vec, term) => (hasOwn.call(vec, term) ? vec[term] || 0 : 0);

  let dot = 0;
  let normA = 0;
  let normB = 0;
  const terms = new Set([...Object.keys(a), ...Object.keys(b)]);
  for (const term of terms) {
    const wa = weightOf(a, term);
    const wb = weightOf(b, term);
    dot += wa * wb;
    normA += wa * wa;
    normB += wb * wb;
  }

  const denom = Math.sqrt(normA) * Math.sqrt(normB);
  return denom === 0 ? 0 : dot / denom;
}
85
-
86
- // ── Token overlap (tie-breaker / fallback) ───────
87
-
88
/**
 * Fraction of query tokens that also occur in the document.
 *
 * Repeated query tokens are counted once per occurrence.
 *
 * @param {string[]} queryTokens
 * @param {string[]} docTokens
 * @returns {number} Value in [0, 1]; 0 when the query has no tokens.
 */
function tokenOverlap(queryTokens, docTokens) {
  const total = queryTokens.length;
  if (!total) {
    return 0;
  }

  const vocabulary = new Set(docTokens);
  let matched = 0;
  for (const token of queryTokens) {
    if (vocabulary.has(token)) {
      matched += 1;
    }
  }
  return matched / total;
}
94
-
95
- // ── Main recall function ──────────────────────────
96
-
97
/**
 * recall(query, records, opts) → Record | null
 *
 * Convenience wrapper around recallMany that resolves to the single
 * best-ranked record, or null when recallMany yields nothing.
 *
 * opts is forwarded to recallMany unchanged except for `k`:
 *   opts.k       — forwarded as-is when truthy, otherwise 1; only the first
 *                  result is returned either way
 *   opts.embedFn — async fn(texts[]) → number[][] for custom embeddings
 *   opts.filter  — fn(record) → bool for pre-filtering
 *   opts.since   — timestamp: only consider records with ts >= this value
 *   opts.mode    — "exact" | "semantic" | "hybrid" (default "hybrid")
 */
async function recall(query, records, opts = {}) {
  const limit = opts.k || 1;
  const matches = await recallMany(query, records, { ...opts, k: limit });
  if (!matches.length) {
    return null;
  }
  return matches[0];
}
110
-
111
/**
 * Rank `records` against `query` and resolve to at most `k` matches, best first.
 *
 * Stages, in order:
 *   1. mode "exact": records whose content contains the query
 *      (case-insensitive) are returned directly — the original record
 *      objects, in pool order, without a `_score`. When nothing matches, the
 *      call falls through to stage 3.
 *   2. opts.embedFn (any mode except "exact"): records are ranked by cosine
 *      similarity between the embedded query and the embedded contents. If
 *      embedFn throws, or any similarity is not a finite number, this stage
 *      is abandoned and stage 3 runs instead.
 *   3. TF-IDF: score = 0.70 * cosine + 0.20 * token overlap + 0.10 * recency.
 *
 * Stages 2 and 3 return shallow copies of the records with `_score` added.
 *
 * @param {string} query
 * @param {Array<object>} records - Objects with a string `content` and a
 *   numeric `ts` (epoch milliseconds, as compared against Date.now()).
 * @param {object} [opts]
 * @param {number} [opts.k=5] - Maximum number of results (falsy → 5).
 * @param {string} [opts.mode="hybrid"] - "exact" | "semantic" | "hybrid".
 * @param {function(object): boolean} [opts.filter] - Pre-filter predicate.
 * @param {number} [opts.since] - Keep only records with ts >= since.
 * @param {function(string[]): Promise<number[][]>} [opts.embedFn] - Returns
 *   one vector per input text; index 0 is the query.
 * @returns {Promise<object[]>}
 */
async function recallMany(query, records, opts = {}) {
  const k = opts.k || 5;
  const mode = opts.mode || 'hybrid';
  const filter = opts.filter || null;
  const since = opts.since || null;

  let pool = records;
  if (filter) pool = pool.filter(filter);
  if (since) pool = pool.filter(r => r.ts >= since);
  if (!pool.length) return [];

  // 1. Exact substring match. Only "exact" mode ever returned these hits, so
  //    the scan is skipped in the other modes.
  if (mode === 'exact') {
    const queryLower = query.toLowerCase();
    const exact = pool.filter(r => r.content.toLowerCase().includes(queryLower));
    if (exact.length) return exact.slice(0, k);
  }

  // 2. Custom embeddings
  if (opts.embedFn && mode !== 'exact') {
    try {
      const texts = [query, ...pool.map(r => r.content)];
      const vectors = await opts.embedFn(texts);
      const queryVec = vectors[0];
      const ranked = pool.map((r, i) => ({ r, score: cosineArrays(queryVec, vectors[i + 1]) }));
      // NaN scores make the sort comparator inconsistent; treat them like
      // any other embedding failure.
      if (ranked.some(s => !Number.isFinite(s.score))) {
        throw new RangeError('embedFn produced a non-finite similarity');
      }
      ranked.sort((a, b) => b.score - a.score);
      return ranked.slice(0, k).map(s => ({ ...s.r, _score: s.score }));
    } catch (_) {
      // Deliberate best effort: any embedding failure falls through to TF-IDF.
    }
  }

  // 3. TF-IDF semantic ranking. The query is part of the corpus so that every
  //    query term has an IDF weight.
  const qTokens = tokenize(query);
  const docTokens = pool.map(r => tokenize(r.content));
  const idf = buildIDF([qTokens, ...docTokens]);
  const qVec = tfidfVec(tf(qTokens), idf);

  // exp(-age / 7 days): the boost drops to 1/e (not 1/2) after one week.
  const RECENCY_SCALE_MS = 1000 * 60 * 60 * 24 * 7;
  const now = Date.now(); // one clock reading so all records share a reference

  const scored = pool.map((r, i) => {
    const sem = cosine(qVec, tfidfVec(tf(docTokens[i]), idf));
    const over = tokenOverlap(qTokens, docTokens[i]);
    const age = now - r.ts;
    // Missing/invalid timestamps get no boost (previously NaN poisoned the
    // score); future timestamps are capped at 1 instead of growing unbounded.
    const recency = Number.isFinite(age) ? Math.min(1, Math.exp(-age / RECENCY_SCALE_MS)) : 0;
    return { r, score: (sem * 0.70) + (over * 0.20) + (recency * 0.10) };
  });

  scored.sort((a, b) => b.score - a.score);
  return scored.slice(0, k).map(s => ({ ...s.r, _score: s.score }));
}
164
-
165
/**
 * Cosine similarity between two dense numeric vectors.
 *
 * Vectors of different lengths are compared as if the shorter one were padded
 * with zeros. Previously the loop ran over `a` only, which produced NaN when
 * `b` was shorter and silently ignored the tail of `b` when it was longer.
 *
 * @param {ArrayLike<number>} a
 * @param {ArrayLike<number>} b
 * @returns {number} dot(a, b) / (|a| * |b|), or 0 when either vector has zero
 *   magnitude.
 * @throws {TypeError} If `a` or `b` is null or undefined.
 */
function cosineArrays(a, b) {
  const dims = Math.max(a.length, b.length);
  let dot = 0;
  let normA = 0;
  let normB = 0;
  for (let i = 0; i < dims; i++) {
    const x = a[i] || 0; // a missing component counts as 0
    const y = b[i] || 0;
    dot += x * y;
    normA += x * x;
    normB += y * y;
  }
  const denom = Math.sqrt(normA) * Math.sqrt(normB);
  return denom === 0 ? 0 : dot / denom;
}
175
-
176
- module.exports = { recall, recallMany, tokenize };
1
+ "use strict";const e=new Set(["a","an","the","is","are","was","were","be","been","being","have","has","had","do","does","did","will","would","could","should","may","might","i","you","he","she","it","we","they","my","your","his","her","its","our","their","what","which","who","whom","that","this","these","those","and","but","or","nor","for","so","yet","in","on","at","to","of","up","by","with","about","into","through","during","before","after","above","below","from","out","off","over","under","again","then","once","here","there","when","where","why","how","all","both","each","few","more","most","other","some","such","no","not","only","own","same","than","too","very","just","can","me","him","us","them","am","get","got","put","set","let","if","as","also","even","still","already","now","do","did"]);function t(t){return t.toLowerCase().replace(/[^a-z0-9\s'-]/g," ").split(/\s+/).filter(t=>t.length>1&&!e.has(t))}function o(e){const t={};for(const o of e)t[o]=(t[o]||0)+1;const o=e.length||1,n={};for(const[e,r]of Object.entries(t))n[e]=r/o;return n}function n(e,t){const o={};for(const[n,r]of Object.entries(e))o[n]=r*(t[n]||1);return o}async function r(e,r,c={}){const a=c.k||5,h=c.mode||"hybrid",i=c.filter||null,l=c.since||null;let u=r;if(i&&(u=u.filter(i)),l&&(u=u.filter(e=>e.ts>=l)),!u.length)return[];const f=e.toLowerCase();if("semantic"!==h){const e=u.filter(e=>e.content.toLowerCase().includes(f));if(e.length>=a&&"exact"===h)return e.slice(0,a);if(e.length&&"exact"===h)return e}if(c.embedFn&&"exact"!==h)try{const t=[e,...u.map(e=>e.content)],o=await c.embedFn(t),n=o[0],r=u.map((e,t)=>({r:e,score:s(n,o[t+1])}));return r.sort((e,t)=>t.score-e.score),r.slice(0,a).map(e=>({...e.r,_score:e.score}))}catch(e){}const w=t(e),m=u.map(e=>t(e.content)),d=function(e){const t={},o=e.length;for(const o of e){const e=new Set(o);for(const o of e)t[o]=(t[o]||0)+1}const n={};for(const[e,r]of Object.entries(t))n[e]=Math.log((o+1)/(r+1))+1;return 
n}([w,...m]),g=n(o(w),d),b=u.map((e,t)=>{const r=n(o(m[t]),d),s=function(e,t){let o=0,n=0,r=0;const s=new Set([...Object.keys(e),...Object.keys(t)]);for(const c of s){const s=e[c]||0,a=t[c]||0;o+=s*a,n+=s*s,r+=a*a}const c=Math.sqrt(n)*Math.sqrt(r);return 0===c?0:o/c}(g,r),c=function(e,t){if(!e.length)return 0;const o=new Set(t);return e.filter(e=>o.has(e)).length/e.length}(w,m[t]),a=Math.exp(-(Date.now()-e.ts)/6048e5);return{r:e,score:.7*s+.2*c+.1*a,_debug:{sem:s,over:c,recency:a}}});return b.sort((e,t)=>t.score-e.score),b.slice(0,a).map(e=>({...e.r,_score:e.score}))}function s(e,t){let o=0,n=0,r=0;for(let s=0;s<e.length;s++)o+=e[s]*t[s],n+=e[s]*e[s],r+=t[s]*t[s];const s=Math.sqrt(n)*Math.sqrt(r);return 0===s?0:o/s}module.exports={recall:async function(e,t,o={}){const n=await r(e,t,{...o,k:o.k||1});return n.length?n[0]:null},recallMany:r,tokenize:t};