mikoshi 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +41 -0
- package/package.json +23 -0
- package/src/mikoshi/auth.js +199 -0
- package/src/mikoshi/chunking.js +31 -0
- package/src/mikoshi/cli.js +274 -0
- package/src/mikoshi/config.js +58 -0
- package/src/mikoshi/entitlements.js +6 -0
- package/src/mikoshi/hashing.js +9 -0
- package/src/mikoshi/ignore.js +90 -0
- package/src/mikoshi/indexing/file_scanner.js +39 -0
- package/src/mikoshi/indexing/index_store.js +82 -0
- package/src/mikoshi/indexing/indexer.js +198 -0
- package/src/mikoshi/mcp_server/server.js +121 -0
- package/src/mikoshi/retrieval/hybrid.js +85 -0
- package/src/mikoshi/retrieval/lexical.js +53 -0
- package/src/mikoshi/retrieval/rerank.js +3 -0
- package/src/mikoshi/retrieval/semantic.js +210 -0
- package/src/mikoshi/utils/timer.js +9 -0
- package/src/mikoshi/utils/types.js +44 -0
|
@@ -0,0 +1,210 @@
|
|
|
1
|
+
import os from "node:os";
|
|
2
|
+
import path from "node:path";
|
|
3
|
+
|
|
4
|
+
// Vector dimension used by the fallback hash-based embeddings provider.
const DEFAULT_DIM = 384;
|
|
5
|
+
|
|
6
|
+
/**
 * Scale `vec` in place to unit (L2) length.
 * A zero vector is left unchanged: the norm falls back to 1 to avoid
 * division by zero.
 * @param {Float32Array | number[]} vec - vector, mutated in place
 * @returns {Float32Array | number[]} the same vector, normalized
 */
function normalizeVector(vec) {
  let squared = 0;
  for (const value of vec) {
    squared += value * value;
  }
  const magnitude = Math.sqrt(squared) || 1;
  for (let idx = 0; idx < vec.length; idx += 1) {
    vec[idx] /= magnitude;
  }
  return vec;
}
|
|
13
|
+
|
|
14
|
+
/**
 * L2-normalize each `dim`-length row of a flat embeddings matrix, in place.
 * A trailing partial row (when the length is not a multiple of `dim`) is
 * left untouched; zero rows are unchanged (norm falls back to 1).
 * @param {Float32Array | number[]} matrix - flat row-major matrix, mutated in place
 * @param {number} dim - embedding dimension (row length)
 * @returns {Float32Array | number[]} the same matrix; returned as-is when
 *   the matrix is null/empty or `dim` is falsy
 */
export function normalizeEmbeddings(matrix, dim) {
  if (!matrix || !matrix.length || !dim) return matrix;
  const rowCount = Math.floor(matrix.length / dim);
  for (let row = 0; row < rowCount; row += 1) {
    const base = row * dim;
    let squared = 0;
    for (let col = 0; col < dim; col += 1) {
      const component = matrix[base + col];
      squared += component * component;
    }
    const magnitude = Math.sqrt(squared) || 1;
    for (let col = 0; col < dim; col += 1) {
      matrix[base + col] /= magnitude;
    }
  }
  return matrix;
}
|
|
31
|
+
|
|
32
|
+
/**
 * Deterministic, dependency-free embeddings provider (used in test mode).
 * Tokenizes on non-[a-z0-9_] runs, buckets each token into a fixed-size
 * vector via a rolling 31-multiplier hash, then L2-normalizes the vector.
 */
class HashEmbeddingsProvider {
  /**
   * @param {number} [dim] - output vector dimension (defaults to DEFAULT_DIM)
   */
  constructor(dim = DEFAULT_DIM) {
    this.dimension = dim;
  }

  /**
   * Embed each text as a unit-length Float32Array of `this.dimension`.
   * @param {string[]} texts
   * @returns {Promise<Float32Array[]>}
   */
  async embedTexts(texts) {
    return texts.map((text) => {
      const bucketed = new Float32Array(this.dimension);
      const tokens = text.toLowerCase().split(/[^a-z0-9_]+/).filter(Boolean);
      tokens.forEach((token) => {
        let code = 0;
        for (let pos = 0; pos < token.length; pos += 1) {
          code = (code * 31 + token.charCodeAt(pos)) >>> 0;
        }
        bucketed[code % this.dimension] += 1;
      });
      return normalizeVector(bucketed);
    });
  }
}
|
|
56
|
+
|
|
57
|
+
/**
 * Embeddings provider backed by an OpenAI-compatible `/v1/embeddings`
 * HTTP endpoint. Vectors are L2-normalized before being returned.
 */
class OpenAIEmbeddingsProvider {
  /**
   * @param {Object} config - app config; reads `config.embeddings.openai_base_url`,
   *   `.openai_key`, and `.openai_model`
   */
  constructor(config) {
    this.config = config;
    this.dimension = null; // learned from the first successful response
  }

  /**
   * Embed the given texts via the configured endpoint.
   * @param {string[]} texts
   * @returns {Promise<Float32Array[]>} one normalized vector per input text
   * @throws {Error} when the endpoint responds with a non-2xx status
   */
  async embedTexts(texts) {
    const { openai_base_url, openai_key, openai_model } = this.config.embeddings;
    const response = await fetch(`${openai_base_url}/v1/embeddings`, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        Authorization: `Bearer ${openai_key}`,
      },
      body: JSON.stringify({ model: openai_model, input: texts }),
    });
    if (!response.ok) {
      const detail = await response.text();
      throw new Error(`OpenAI embeddings failed: ${detail}`);
    }
    const payload = await response.json();
    const vectors = payload.data.map(({ embedding }) => Float32Array.from(embedding));
    if (vectors.length) {
      this.dimension = vectors[0].length;
      for (const vec of vectors) normalizeVector(vec);
    }
    return vectors;
  }
}
|
|
88
|
+
|
|
89
|
+
/**
 * Embeddings provider backed by a local @xenova/transformers
 * feature-extraction pipeline. Models are cached under ~/.mikoshi/models;
 * the first load may download the model unless MIKOSHI_OFFLINE is set.
 */
class TransformersEmbeddingsProvider {
  /**
   * @param {Object} config - app config; reads `config.embeddings.model`
   */
  constructor(config) {
    this.config = config;
    this.dimension = null; // set after the first embedding is produced
    this._pipelinePromise = null; // memoized pipeline load; cleared on failure
  }

  /**
   * Load (and memoize) the feature-extraction pipeline. Tries the local
   * cache first; if that fails and we are not offline, downloads the model
   * with a timeout. Fix over the original: a failed load no longer stays
   * cached in `_pipelinePromise`, so a transient failure can be retried on
   * the next call, and the offline error preserves the original cause.
   * @returns {Promise<Function>} the transformers.js extractor
   * @throws {Error} when offline and the model is not cached, or the
   *   download times out / fails
   */
  async _loadPipeline() {
    if (this._pipelinePromise) return this._pipelinePromise;
    const load = (async () => {
      const { pipeline, env } = await import("@xenova/transformers");
      const cacheDir = path.join(os.homedir(), ".mikoshi", "models");
      env.cacheDir = cacheDir;
      env.allowLocalModels = true;

      const offline = isOffline();
      const timeoutMs = downloadTimeoutMs();

      console.error("🧠 Loading embeddings model…");

      // First attempt: local cache only.
      env.allowRemoteModels = false;
      try {
        const extractor = await pipeline("feature-extraction", this.config.embeddings.model, {
          quantized: false,
        });
        console.error("✅ Embeddings model loaded");
        return extractor;
      } catch (err) {
        if (offline) {
          throw new Error(
            "Model download disabled. Set MIKOSHI_OFFLINE=0 or pre-cache the model.",
            { cause: err }
          );
        }
        // Not offline: fall through to a one-time remote download.
      }

      console.error("⬇️ Downloading local model (one-time)…");
      env.allowRemoteModels = true;
      const extractor = await withTimeout(
        pipeline("feature-extraction", this.config.embeddings.model, { quantized: false }),
        timeoutMs
      );
      console.error("✅ Model ready");
      return extractor;
    })();
    // Do not cache a rejected load: reset the memo so callers can retry.
    this._pipelinePromise = load.catch((err) => {
      this._pipelinePromise = null;
      throw err;
    });
    return this._pipelinePromise;
  }

  /**
   * Embed each text with mean pooling and normalization (done by the
   * pipeline itself via `normalize: true`).
   * @param {string[]} texts
   * @returns {Promise<Float32Array[]>}
   */
  async embedTexts(texts) {
    const extractor = await this._loadPipeline();
    const vectors = [];
    for (const text of texts) {
      const output = await extractor(text, { pooling: "mean", normalize: true });
      const vec = Float32Array.from(output.data);
      this.dimension = vec.length;
      vectors.push(vec);
    }
    return vectors;
  }
}
|
|
150
|
+
|
|
151
|
+
/**
 * Whether model downloads are disabled via the MIKOSHI_OFFLINE env var.
 * Accepts the usual truthy spellings: "1", "true", "yes", "on" (any case).
 * @returns {boolean}
 */
function isOffline() {
  const flag = (process.env.MIKOSHI_OFFLINE || "").toLowerCase();
  switch (flag) {
    case "1":
    case "true":
    case "yes":
    case "on":
      return true;
    default:
      return false;
  }
}
|
|
155
|
+
|
|
156
|
+
/**
 * Timeout for the one-time model download, in milliseconds.
 * Read from MIKOSHI_MODEL_DOWNLOAD_TIMEOUT_SEC (seconds). Unset, unparseable,
 * or non-positive values fall back to 300 seconds (300 000 ms).
 * @returns {number} timeout in milliseconds
 */
function downloadTimeoutMs() {
  const DEFAULT_MS = 300_000;
  const raw = process.env.MIKOSHI_MODEL_DOWNLOAD_TIMEOUT_SEC;
  if (!raw) return DEFAULT_MS;
  const seconds = Number.parseInt(raw, 10);
  return Number.isFinite(seconds) && seconds > 0 ? seconds * 1000 : DEFAULT_MS;
}
|
|
162
|
+
|
|
163
|
+
/**
 * Race `promise` against a timer. Rejects with a descriptive error if the
 * promise does not settle within `timeoutMs`; the timer is always cleared
 * once either side settles.
 * @param {Promise<*>} promise
 * @param {number} timeoutMs
 * @returns {Promise<*>} resolves/rejects with `promise`, or rejects on timeout
 */
function withTimeout(promise, timeoutMs) {
  let timer;
  const expiry = new Promise((_resolve, reject) => {
    timer = setTimeout(
      () =>
        reject(
          new Error(
            "Model download timed out. Set MIKOSHI_OFFLINE=1 to skip, or retry on a stable network."
          )
        ),
      timeoutMs
    );
  });
  return Promise.race([promise, expiry]).finally(() => clearTimeout(timer));
}
|
|
176
|
+
|
|
177
|
+
/**
 * Choose the embeddings provider for this process: the deterministic hash
 * provider under MIKOSHI_TEST_MODE=1, the OpenAI-backed provider when
 * configured, otherwise the local transformers.js provider.
 * @param {Object} config - app config with an `embeddings` section
 * @returns {HashEmbeddingsProvider | OpenAIEmbeddingsProvider | TransformersEmbeddingsProvider}
 */
export function getEmbeddingsProvider(config) {
  const testMode = process.env.MIKOSHI_TEST_MODE === "1";
  if (testMode) {
    return new HashEmbeddingsProvider();
  }
  return config.embeddings.provider === "openai"
    ? new OpenAIEmbeddingsProvider(config)
    : new TransformersEmbeddingsProvider(config);
}
|
|
186
|
+
|
|
187
|
+
/**
 * Brute-force similarity search over a flat, row-major embeddings matrix.
 * Scores are dot products against the embedded query (cosine similarity
 * when both sides are unit-normalized, as the providers here produce).
 */
export class SemanticSearcher {
  /**
   * @param {Float32Array} embeddings - flat matrix, one `dim`-length row per chunk
   * @param {number} dim - embedding dimension
   * @param {Object} provider - embeddings provider exposing `embedTexts(texts)`
   */
  constructor(embeddings, dim, provider) {
    this.embeddings = embeddings;
    this.dim = dim;
    this.provider = provider;
  }

  /**
   * Return the top-k rows by dot product with the embedded query.
   * @param {string} query
   * @param {number} k - maximum number of results
   * @returns {Promise<Array<[number, number]>>} [rowIndex, score] pairs, best first
   */
  async search(query, k) {
    const [queryVec] = await this.provider.embedTexts([query]);
    const rowCount = Math.floor(this.embeddings.length / this.dim);
    const scored = [];
    for (let row = 0; row < rowCount; row += 1) {
      const base = row * this.dim;
      let dot = 0;
      for (let j = 0; j < this.dim; j += 1) {
        dot += this.embeddings[base + j] * queryVec[j];
      }
      scored.push([row, dot]);
    }
    scored.sort((left, right) => right[1] - left[1]);
    return scored.slice(0, k);
  }
}
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
/**
 * Shared JSDoc type definitions for the mikoshi indexing and retrieval
 * modules. This file exports no runtime values (`export {}` below); it
 * exists only so these typedefs can be referenced from other modules.
 */

/**
 * A contiguous slice of a source file, as stored in the index.
 * @typedef {Object} Chunk
 * @property {string} id - chunk identifier
 * @property {string} relpath - source file path, relative to the indexed repo
 * @property {number} start_line - first line of the chunk
 * @property {number} end_line - last line of the chunk
 * @property {string} text - raw chunk text
 * @property {string} file_hash - hash of the originating file's content
 * @property {number | null} vector_idx - row index into the embeddings matrix, or null
 */

/**
 * Metadata describing one built index.
 * @typedef {Object} IndexMeta
 * @property {string} repo_id - identifier of the indexed repository
 * @property {string} repo_path - filesystem path of the indexed repository
 * @property {string} created_at - index creation timestamp
 * @property {string} updated_at - last update timestamp
 * @property {string} embedding_provider - which embeddings provider built the index
 * @property {string} model - embedding model name
 * @property {number} embedding_dim - embedding vector dimension
 * @property {number} chunk_lines - lines per chunk used when chunking
 * @property {number} chunk_overlap - overlapping lines between adjacent chunks
 * @property {number} max_bytes - per-file size limit used during indexing
 * @property {Object.<string, string>} files - map of file path to file hash (presumably; verify against indexer)
 * @property {number} chunks - total number of chunks in the index
 */

/**
 * Summary returned after an indexing run.
 * @typedef {Object} IndexResult
 * @property {string} repo_id - identifier of the indexed repository
 * @property {number} chunks_indexed - number of chunks written
 * @property {number} took_ms - wall-clock duration in milliseconds
 */

/**
 * One search hit returned to callers.
 * @typedef {Object} SearchResult
 * @property {string} relpath - matching file path, relative to the repo
 * @property {number} start_line - first line of the matching chunk
 * @property {number} end_line - last line of the matching chunk
 * @property {number} score - relevance score (higher is better)
 * @property {string} snippet - excerpt of the matching text
 */

export {};
|