@plur-ai/core 0.7.7 → 0.8.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/chunk-GRDNBUIJ.js +452 -0
- package/dist/{chunk-KMVQYBNP.js → chunk-MY4XVDCE.js} +1 -192
- package/dist/chunk-UETCDULF.js +196 -0
- package/dist/{embeddings-2IODIQAF.js → embeddings-EX7QPXJS.js} +2 -1
- package/dist/index.d.ts +693 -193
- package/dist/index.js +1320 -420
- package/dist/learn-async-VXBH3TYE.js +184 -0
- package/package.json +1 -1
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
import {
|
|
2
|
+
atomicWrite
|
|
3
|
+
} from "./chunk-MY4XVDCE.js";
|
|
4
|
+
|
|
5
|
+
// src/fts.ts
// Common English words excluded from full-text tokens. All entries are
// 3+ characters, matching the minimum token length enforced below.
var STOP_WORDS = /* @__PURE__ */ new Set([
  "the", "and", "for", "that", "this", "with", "from",
  "are", "was", "were", "been", "have", "has",
  "not", "but", "its", "you", "your",
  "can", "will", "should", "would", "could", "may", "might"
]);
/**
 * Tokenize free text for full-text search: lowercase, replace punctuation
 * with spaces, then keep only words longer than 2 characters that are not
 * stop words.
 * @param {string} text - Raw text to tokenize.
 * @returns {string[]} Filtered lowercase tokens (may be empty).
 */
function ftsTokenize(text) {
  return text
    .toLowerCase()
    .replace(/[^\w\s]/g, " ")
    .split(/\s+/)
    .filter((word) => word.length > 2 && !STOP_WORDS.has(word));
}
|
|
36
|
+
/**
 * Flatten an engram's searchable fields (statement, domain, tags, entities,
 * temporal bounds, rationale) into a single space-joined string for
 * tokenization and embedding.
 * NOTE(review): assumes `engram.tags` is always an array — confirm schema.
 * @param {object} engram - Engram record.
 * @returns {string} Space-joined searchable text.
 */
function engramSearchText(engram) {
  const pieces = [engram.statement];
  if (engram.domain) {
    // Dots separate domain segments; turn them into spaces so each
    // segment becomes its own token.
    pieces.push(engram.domain.replace(/\./g, " "));
  }
  if (engram.tags.length > 0) {
    pieces.push(engram.tags.join(" "));
  }
  if (engram.entities) {
    for (const entity of engram.entities) {
      pieces.push(entity.name);
      // The catch-all "other" type carries no search signal.
      if (entity.type !== "other") pieces.push(entity.type);
    }
  }
  const temporal = engram.temporal;
  if (temporal) {
    if (temporal.valid_from) pieces.push(temporal.valid_from);
    if (temporal.valid_until) pieces.push(temporal.valid_until);
  }
  if (engram.rationale) pieces.push(engram.rationale);
  return pieces.join(" ");
}
|
|
53
|
+
/**
 * Compute an IDF weight per query token over the engram corpus.
 * A token "appears" in a document if any document term contains it or is
 * contained by it (same fuzzy match used by ftsScore). Weights are clamped
 * at 0 so very common tokens never contribute negative scores.
 * @param {object[]} engrams - Corpus of engrams.
 * @param {string[]} queryTokens - Tokens produced by ftsTokenize.
 * @returns {Map<string, number>} token -> max(0, ln(N / (1 + df))).
 */
function computeIdf(engrams, queryTokens) {
  const N = engrams.length;
  if (N === 0) return /* @__PURE__ */ new Map();
  // Pre-tokenize each document once; membership checks below reuse these sets.
  const docTermSets = engrams.map((e) => new Set(ftsTokenize(engramSearchText(e))));
  const idf = /* @__PURE__ */ new Map();
  for (const token of queryTokens) {
    let docFreq = 0;
    for (const terms of docTermSets) {
      const matches =
        terms.has(token) ||
        Array.from(terms).some((t) => t.includes(token) || token.includes(t));
      if (matches) docFreq += 1;
    }
    idf.set(token, Math.max(0, Math.log(N / (1 + docFreq))));
  }
  return idf;
}
|
|
69
|
+
// BM25 tuning constants (standard defaults).
var BM25_K1 = 1.2; // term-frequency saturation
var BM25_B = 0.75; // strength of document-length normalization
/**
 * BM25-style relevance score of one engram against pre-tokenized query terms.
 * Term matching is fuzzy: a document term counts if it contains the query
 * token or vice versa.
 * @param {object} engram - Engram to score.
 * @param {string[]} queryTokens - Tokens from ftsTokenize(query).
 * @param {Map<string, number>|undefined} idfWeights - Per-token IDF weights.
 * @param {number|undefined} avgDocLength - Corpus average token count.
 * @returns {number} Non-negative score; 0 when nothing matches.
 */
function ftsScore(engram, queryTokens, idfWeights, avgDocLength) {
  const docTerms = ftsTokenize(engramSearchText(engram));
  if (queryTokens.length === 0) return 0;
  const docLen = docTerms.length;
  // Fall back to this document's own length when no usable average is given.
  const avgdl = avgDocLength && avgDocLength > 0 ? avgDocLength : docLen;
  // If every supplied IDF weight is zero (degenerate corpus), score with
  // uniform weights instead of returning 0 for everything.
  const useIdf = idfWeights && Array.from(idfWeights.values()).some((w) => w > 0);
  let score = 0;
  for (const token of queryTokens) {
    let weight = 1;
    if (useIdf) {
      weight = idfWeights.get(token) ?? 0;
      if (weight === 0) continue;
    }
    let tf = 0;
    for (const term of docTerms) {
      if (term.includes(token) || token.includes(term)) tf += 1;
    }
    if (tf === 0) continue;
    const numerator = tf * (BM25_K1 + 1);
    const denominator = tf + BM25_K1 * (1 - BM25_B + BM25_B * docLen / avgdl);
    score += weight * (numerator / denominator);
  }
  return score;
}
|
|
99
|
+
/**
 * Rank engrams against a free-text query using BM25-style full-text scoring.
 * @param {object[]} engrams - Corpus to search.
 * @param {string} query - Free-text query.
 * @param {number} [limit=20] - Maximum number of results.
 * @returns {object[]} Matching engrams, best first; [] when the query has no tokens.
 */
function searchEngrams(engrams, query, limit = 20) {
  const queryTokens = ftsTokenize(query);
  if (queryTokens.length === 0) return [];
  const idfWeights = computeIdf(engrams, queryTokens);
  let avgDocLength = 0;
  if (engrams.length > 0) {
    let totalLen = 0;
    for (const e of engrams) totalLen += ftsTokenize(engramSearchText(e)).length;
    avgDocLength = totalLen / engrams.length;
  }
  const scored = [];
  for (const engram of engrams) {
    const score = ftsScore(engram, queryTokens, idfWeights, avgDocLength);
    // Zero-score documents are dropped before ranking.
    if (score > 0) scored.push({ engram, score });
  }
  scored.sort((a, b) => b.score - a.score);
  return scored.slice(0, limit).map((entry) => entry.engram);
}
|
|
106
|
+
|
|
107
|
+
// src/embeddings.ts
|
|
108
|
+
import { existsSync, mkdirSync, readFileSync } from "fs";
import { dirname, join } from "path";
import { createHash } from "crypto";
|
|
111
|
+
// Lazily-created feature-extraction pipeline; null until getEmbedder() first succeeds.
var embedPipeline = null;
// Latched to true after a failed transformers import so later calls skip retrying.
var transformersUnavailable = false;
|
|
113
|
+
/**
 * Lazily load and cache the embedding pipeline.
 * Returns the cached pipeline on subsequent calls; returns null (and never
 * retries) once the transformers module fails to load.
 * @returns {Promise<object|null>} Pipeline instance or null when unavailable.
 */
async function getEmbedder() {
  if (transformersUnavailable) return null;
  if (embedPipeline) return embedPipeline;
  try {
    const mod = await import("./transformers.node-PH5YK5EA.js");
    embedPipeline = await mod.pipeline("feature-extraction", "Xenova/bge-small-en-v1.5", {
      dtype: "fp32"
    });
    return embedPipeline;
  } catch {
    // Deliberate best-effort: embedding search degrades to unavailable
    // rather than crashing when the optional runtime is missing.
    transformersUnavailable = true;
    return null;
  }
}
|
|
128
|
+
/**
 * Embed text with the shared pipeline (CLS pooling, L2-normalized output).
 * @param {string} text - Text to embed.
 * @returns {Promise<Float32Array|null>} Embedding vector, or null when the
 *   transformers runtime is unavailable.
 */
async function embed(text) {
  const pipe = await getEmbedder();
  if (!pipe) return null;
  const output = await pipe(text, { pooling: "cls", normalize: true });
  return new Float32Array(output.data);
}
|
|
134
|
+
/**
 * Dot product of two vectors. Because embed() normalizes its output
 * (normalize: true), the dot product of two embeddings equals their cosine
 * similarity. Iterates over a's length; callers pass equal-length vectors.
 * @param {ArrayLike<number>} a
 * @param {ArrayLike<number>} b
 * @returns {number} Dot product (cosine similarity for unit vectors).
 */
function cosineSimilarity(a, b) {
  let total = 0;
  const n = a.length;
  for (let i = 0; i < n; i += 1) {
    total += a[i] * b[i];
  }
  return total;
}
|
|
139
|
+
/**
 * Load the embeddings cache from disk.
 * Missing or unparseable files yield an empty cache rather than an error,
 * so a corrupt cache simply forces re-embedding.
 * @param {string} cachePath - Path to the JSON cache file.
 * @returns {object} Parsed cache object, or {} when absent/invalid.
 */
function loadCache(cachePath) {
  if (!existsSync(cachePath)) return {};
  try {
    const raw = readFileSync(cachePath, "utf8");
    return JSON.parse(raw);
  } catch {
    return {};
  }
}
|
|
147
|
+
/**
 * Persist the embeddings cache atomically, creating the parent directory
 * if needed.
 * Fix: derive the parent with path.dirname instead of slicing at the last
 * "/" — the old code produced "" for a bare filename (the default
 * ".embeddings-cache.json" when no storagePath is given), making
 * mkdirSync("") throw, and it mishandled Windows "\\" separators.
 * dirname returns "." for bare filenames, which always exists.
 * @param {string} cachePath - Destination JSON file.
 * @param {object} cache - Cache object to serialize.
 */
function saveCache(cachePath, cache) {
  const dir = dirname(cachePath);
  if (!existsSync(dir)) mkdirSync(dir, { recursive: true });
  atomicWrite(cachePath, JSON.stringify(cache));
}
|
|
152
|
+
/**
 * Short content fingerprint used as a cache-invalidation key: the first
 * 16 hex chars (64 bits) of the SHA-256 of the input.
 * @param {string} statement - Text to fingerprint.
 * @returns {string} 16-character lowercase hex digest prefix.
 */
function hashStatement(statement) {
  const digest = createHash("sha256").update(statement).digest("hex");
  return digest.slice(0, 16);
}
|
|
155
|
+
/**
 * Rank engrams by embedding similarity to a query, using an on-disk cache
 * keyed by engram id with entries { hash, embedding } so unchanged engrams
 * are not re-embedded.
 * Returns [] when the embedder is unavailable (query or any uncached engram
 * fails to embed); in the mid-loop failure case the cache file is not
 * rewritten.
 * @param {object[]} engrams - Corpus to search.
 * @param {string} query - Free-text query.
 * @param {number|undefined} limit - Max results (undefined = all).
 * @param {string|undefined} storagePath - Directory holding the cache file;
 *   falls back to the current working directory when omitted.
 * @returns {Promise<object[]>} Engrams ordered by descending similarity.
 */
async function embeddingSearch(engrams, query, limit, storagePath) {
  if (engrams.length === 0) return [];
  const cachePath = storagePath ? join(storagePath, ".embeddings-cache.json") : ".embeddings-cache.json";
  const cache = loadCache(cachePath);
  const queryVec = await embed(query);
  if (!queryVec) {
    return [];
  }
  const ranked = [];
  for (const engram of engrams) {
    const text = engramSearchText(engram);
    const textHash = hashStatement(text);
    const cached = cache[engram.id];
    let vec;
    if (cached?.hash === textHash) {
      // Cache hit: the engram's searchable text is unchanged.
      vec = new Float32Array(cached.embedding);
    } else {
      vec = await embed(text);
      if (!vec) return [];
      cache[engram.id] = {
        hash: textHash,
        embedding: Array.from(vec)
      };
    }
    ranked.push({ engram, score: cosineSimilarity(queryVec, vec) });
  }
  // Persist any embeddings computed this run before returning results.
  saveCache(cachePath, cache);
  ranked.sort((a, b) => b.score - a.score);
  return ranked.slice(0, limit).map((entry) => entry.engram);
}
|
|
186
|
+
|
|
187
|
+
export {
|
|
188
|
+
ftsTokenize,
|
|
189
|
+
engramSearchText,
|
|
190
|
+
computeIdf,
|
|
191
|
+
ftsScore,
|
|
192
|
+
searchEngrams,
|
|
193
|
+
embed,
|
|
194
|
+
cosineSimilarity,
|
|
195
|
+
embeddingSearch
|
|
196
|
+
};
|