mikoshi 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,210 @@
1
+ import os from "node:os";
2
+ import path from "node:path";
3
+
4
+ const DEFAULT_DIM = 384;
5
+
6
/**
 * Scale `vec` to unit (L2) length in place. A zero vector is left untouched
 * because the norm falls back to 1 to avoid division by zero.
 * @param {Float32Array} vec - vector to normalize (mutated).
 * @returns {Float32Array} the same array, for chaining.
 */
function normalizeVector(vec) {
  const sumSquares = vec.reduce((acc, v) => acc + v * v, 0);
  const divisor = Math.sqrt(sumSquares) || 1;
  vec.forEach((v, i) => {
    vec[i] = v / divisor;
  });
  return vec;
}
13
+
14
/**
 * Normalize each `dim`-length row of a flat, row-major embedding matrix to
 * unit length, in place. Rows that are all zeros are left unchanged (norm
 * falls back to 1). Trailing elements that do not fill a whole row are ignored.
 * @param {Float32Array|number[]} matrix - flat row-major matrix (mutated).
 * @param {number} dim - length of each row.
 * @returns {Float32Array|number[]} the same matrix, for chaining.
 */
export function normalizeEmbeddings(matrix, dim) {
  if (!matrix || !matrix.length || !dim) return matrix;
  const rowCount = Math.floor(matrix.length / dim);
  for (let row = 0; row < rowCount; row += 1) {
    const base = row * dim;
    let sumSquares = 0;
    for (let col = 0; col < dim; col += 1) {
      sumSquares += matrix[base + col] ** 2;
    }
    const scale = Math.sqrt(sumSquares) || 1;
    for (let col = 0; col < dim; col += 1) {
      matrix[base + col] /= scale;
    }
  }
  return matrix;
}
31
+
32
/**
 * Deterministic, dependency-free embeddings built from hashed token counts.
 * Used in test mode so no model download or network access is needed.
 */
class HashEmbeddingsProvider {
  /**
   * @param {number} [dim] - output dimensionality (defaults to DEFAULT_DIM).
   */
  constructor(dim = DEFAULT_DIM) {
    this.dimension = dim;
  }

  /**
   * Produce one unit-normalized Float32Array per input text. Each token is
   * hashed (31-polynomial over char codes) into a bucket whose count is
   * incremented; the bucket vector is then L2-normalized.
   * @param {string[]} texts
   * @returns {Promise<Float32Array[]>}
   */
  async embedTexts(texts) {
    return texts.map((text) => {
      const vec = new Float32Array(this.dimension);
      const tokens = text.toLowerCase().split(/[^a-z0-9_]+/).filter(Boolean);
      for (const token of tokens) {
        let h = 0;
        // Tokens are ASCII [a-z0-9_], so per-char charCodeAt is safe.
        for (const ch of token) h = (h * 31 + ch.charCodeAt(0)) >>> 0;
        vec[h % this.dimension] += 1;
      }
      const norm = Math.sqrt(vec.reduce((acc, v) => acc + v * v, 0)) || 1;
      for (let i = 0; i < vec.length; i += 1) vec[i] /= norm;
      return vec;
    });
  }
}
56
+
57
/**
 * Embeddings backed by an OpenAI-compatible `/v1/embeddings` endpoint.
 * Vectors are unit-normalized before being returned so downstream dot
 * products are cosine similarities.
 */
class OpenAIEmbeddingsProvider {
  /**
   * @param {{embeddings: {openai_base_url: string, openai_key: string, openai_model: string}}} config
   */
  constructor(config) {
    this.config = config;
    // Learned from the first successful response; null until then.
    this.dimension = null;
  }

  /**
   * Embed `texts` via the remote API.
   * @param {string[]} texts
   * @returns {Promise<Float32Array[]>} unit-normalized vectors, aligned with `texts`.
   * @throws {Error} when the HTTP request is not OK.
   */
  async embedTexts(texts) {
    const { openai_base_url: baseUrl, openai_key: key, openai_model: model } =
      this.config.embeddings;
    // Trim trailing slashes so "https://host/" does not produce "//v1/embeddings".
    const url = `${baseUrl.replace(/\/+$/, "")}/v1/embeddings`;
    const response = await fetch(url, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        Authorization: `Bearer ${key}`,
      },
      body: JSON.stringify({ model, input: texts }),
    });
    if (!response.ok) {
      const detail = await response.text();
      throw new Error(`OpenAI embeddings failed (HTTP ${response.status}): ${detail}`);
    }
    const data = await response.json();
    // The API does not guarantee response ordering; re-align rows with the
    // inputs via each item's `index` field before building vectors.
    const items = [...data.data].sort((a, b) => (a.index ?? 0) - (b.index ?? 0));
    const vectors = items.map((item) => Float32Array.from(item.embedding));
    if (vectors.length) {
      this.dimension = vectors[0].length;
      for (const vec of vectors) {
        let sum = 0;
        for (const v of vec) sum += v * v;
        const norm = Math.sqrt(sum) || 1;
        for (let i = 0; i < vec.length; i += 1) vec[i] /= norm;
      }
    }
    return vectors;
  }
}
88
+
89
/**
 * Local embeddings via `@xenova/transformers` feature-extraction pipelines.
 * The model is loaded lazily and cached; first use may download it unless
 * MIKOSHI_OFFLINE forbids network access.
 */
class TransformersEmbeddingsProvider {
  constructor(config) {
    this.config = config;
    // Learned from the first embedding; null until then.
    this.dimension = null;
    // Cached loader promise; cleared on failure so callers can retry.
    this._pipelinePromise = null;
  }

  /**
   * Lazily load the pipeline exactly once. A failed load clears the cache so
   * a later call retries instead of reusing a permanently rejected promise.
   */
  async _loadPipeline() {
    if (!this._pipelinePromise) {
      this._pipelinePromise = this._createPipeline().catch((err) => {
        this._pipelinePromise = null;
        throw err;
      });
    }
    return this._pipelinePromise;
  }

  /** Try the local cache first; fall back to a one-time download when allowed. */
  async _createPipeline() {
    const { pipeline, env } = await import("@xenova/transformers");
    env.cacheDir = path.join(os.homedir(), ".mikoshi", "models");
    env.allowLocalModels = true;

    const offline = isOffline();
    const timeoutMs = downloadTimeoutMs();

    console.error("🧠 Loading embeddings model…");

    // First attempt: local cache only (no network traffic).
    env.allowRemoteModels = false;
    try {
      const extractor = await pipeline("feature-extraction", this.config.embeddings.model, {
        quantized: false,
      });
      console.error("✅ Embeddings model loaded");
      return extractor;
    } catch (err) {
      // A cache miss is expected on first run; it is only fatal when
      // downloading is disabled.
      if (offline) {
        throw new Error(
          "Model download disabled. Set MIKOSHI_OFFLINE=0 or pre-cache the model.",
          { cause: err }
        );
      }
    }

    console.error("⬇️ Downloading local model (one-time)…");
    env.allowRemoteModels = true;
    const extractor = await withTimeout(
      pipeline("feature-extraction", this.config.embeddings.model, { quantized: false }),
      timeoutMs
    );
    console.error("✅ Embeddings model loaded");
    return extractor;
  }

  /**
   * Embed each text with mean pooling; `normalize: true` asks the pipeline
   * for unit-length output.
   * @param {string[]} texts
   * @returns {Promise<Float32Array[]>}
   */
  async embedTexts(texts) {
    const extractor = await this._loadPipeline();
    const vectors = [];
    for (const text of texts) {
      const output = await extractor(text, { pooling: "mean", normalize: true });
      const vec = Float32Array.from(output.data);
      this.dimension = vec.length;
      vectors.push(vec);
    }
    return vectors;
  }
}
150
+
151
/**
 * True when the MIKOSHI_OFFLINE environment variable is set to a truthy
 * flag value ("1", "true", "yes", "on" — case-insensitive).
 * @returns {boolean}
 */
function isOffline() {
  const flag = (process.env.MIKOSHI_OFFLINE || "").toLowerCase();
  return flag === "1" || flag === "true" || flag === "yes" || flag === "on";
}
155
+
156
/**
 * Model-download budget in milliseconds, read from
 * MIKOSHI_MODEL_DOWNLOAD_TIMEOUT_SEC. Falls back to 5 minutes when the
 * variable is unset, non-numeric, or non-positive.
 * @returns {number} timeout in ms.
 */
function downloadTimeoutMs() {
  const DEFAULT_MS = 300_000; // 5 minutes
  const raw = process.env.MIKOSHI_MODEL_DOWNLOAD_TIMEOUT_SEC;
  if (!raw) return DEFAULT_MS;
  const seconds = Number.parseInt(raw, 10);
  return Number.isFinite(seconds) && seconds > 0 ? seconds * 1000 : DEFAULT_MS;
}
162
+
163
/**
 * Race `promise` against a deadline. Rejects with a descriptive error once
 * `timeoutMs` elapses; the timer is always cleared afterwards so it never
 * keeps the process alive.
 * @param {Promise<any>} promise
 * @param {number} timeoutMs
 * @returns {Promise<any>} the first settlement of `promise` or the timeout.
 */
function withTimeout(promise, timeoutMs) {
  let timer;
  const deadline = new Promise((_, reject) => {
    const fail = () =>
      reject(
        new Error(
          "Model download timed out. Set MIKOSHI_OFFLINE=1 to skip, or retry on a stable network."
        )
      );
    timer = setTimeout(fail, timeoutMs);
  });
  return Promise.race([promise, deadline]).finally(() => clearTimeout(timer));
}
176
+
177
/**
 * Pick the embeddings backend: deterministic hash vectors in test mode,
 * the OpenAI-compatible client when configured, otherwise a local
 * Transformers pipeline.
 * @param {{embeddings: {provider: string}}} config
 * @returns {HashEmbeddingsProvider|OpenAIEmbeddingsProvider|TransformersEmbeddingsProvider}
 */
export function getEmbeddingsProvider(config) {
  if (process.env.MIKOSHI_TEST_MODE === "1") {
    return new HashEmbeddingsProvider();
  }
  return config.embeddings.provider === "openai"
    ? new OpenAIEmbeddingsProvider(config)
    : new TransformersEmbeddingsProvider(config);
}
186
+
187
/**
 * Brute-force similarity search over a flat, row-major embedding matrix.
 * Rows and query vectors are assumed unit-normalized, so a plain dot
 * product is the cosine similarity.
 */
export class SemanticSearcher {
  /**
   * @param {Float32Array|number[]} embeddings - flat matrix, one row per chunk.
   * @param {number} dim - length of each row.
   * @param {{embedTexts(texts: string[]): Promise<Float32Array[]>}} provider
   */
  constructor(embeddings, dim, provider) {
    this.embeddings = embeddings;
    this.dim = dim;
    this.provider = provider;
  }

  /**
   * Embed `query` and score it against every row.
   * @param {string} query
   * @param {number} k - number of top matches to return.
   * @returns {Promise<Array<[number, number]>>} [rowIndex, score] pairs,
   *   sorted by score descending, truncated to `k`.
   */
  async search(query, k) {
    const [queryVec] = await this.provider.embedTexts([query]);
    const rowCount = Math.floor(this.embeddings.length / this.dim);
    const scored = [];
    for (let row = 0; row < rowCount; row += 1) {
      const base = row * this.dim;
      let dot = 0;
      for (let j = 0; j < this.dim; j += 1) {
        dot += this.embeddings[base + j] * queryVec[j];
      }
      scored.push([row, dot]);
    }
    return scored.sort((a, b) => b[1] - a[1]).slice(0, k);
  }
}
@@ -0,0 +1,9 @@
1
/** Minimal stopwatch: records its creation time, reports elapsed ms. */
export class Timer {
  constructor() {
    // Epoch milliseconds at construction; baseline for all `.ms` reads.
    this.start = Date.now();
  }

  /** @returns {number} milliseconds elapsed since construction. */
  get ms() {
    return Date.now() - this.start;
  }
}
@@ -0,0 +1,44 @@
1
+ /**
2
+ * @typedef {Object} Chunk
3
+ * @property {string} id
4
+ * @property {string} relpath
5
+ * @property {number} start_line
6
+ * @property {number} end_line
7
+ * @property {string} text
8
+ * @property {string} file_hash
9
+ * @property {number | null} vector_idx
10
+ */
11
+
12
+ /**
13
+ * @typedef {Object} IndexMeta
14
+ * @property {string} repo_id
15
+ * @property {string} repo_path
16
+ * @property {string} created_at
17
+ * @property {string} updated_at
18
+ * @property {string} embedding_provider
19
+ * @property {string} model
20
+ * @property {number} embedding_dim
21
+ * @property {number} chunk_lines
22
+ * @property {number} chunk_overlap
23
+ * @property {number} max_bytes
24
+ * @property {Object.<string, string>} files
25
+ * @property {number} chunks
26
+ */
27
+
28
+ /**
29
+ * @typedef {Object} IndexResult
30
+ * @property {string} repo_id
31
+ * @property {number} chunks_indexed
32
+ * @property {number} took_ms
33
+ */
34
+
35
+ /**
36
+ * @typedef {Object} SearchResult
37
+ * @property {string} relpath
38
+ * @property {number} start_line
39
+ * @property {number} end_line
40
+ * @property {number} score
41
+ * @property {string} snippet
42
+ */
43
+
44
+ export {};