@grumppie/ownsearch 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +137 -0
- package/dist/chunk-LGXCBOO4.js +752 -0
- package/dist/cli.d.ts +1 -0
- package/dist/cli.js +229 -0
- package/dist/mcp/server.d.ts +1 -0
- package/dist/mcp/server.js +243 -0
- package/package.json +61 -0
|
@@ -0,0 +1,752 @@
|
|
|
1
|
+
// src/context.ts
/**
 * Assemble search hits into a character-budgeted context bundle.
 *
 * Hits are consumed in the given order; a hit that would push the running
 * total past `maxChars` stops the loop — except the very first hit, which is
 * always included even if it alone exceeds the budget.
 *
 * @param {string} query - the search query the hits answer
 * @param {Array} hits - scored chunk hits, best first
 * @param {number} [maxChars=12e3] - soft cap on total included content length
 * @returns {{query: string, totalChars: number, results: Array}}
 */
function buildContextBundle(query, hits, maxChars = 12e3) {
  const included = [];
  let usedChars = 0;
  for (const hit of hits) {
    const wouldExceed = usedChars + hit.content.length > maxChars;
    if (wouldExceed && included.length > 0) {
      break;
    }
    const { id, score, rootId, rootName, relativePath, chunkIndex, content } = hit;
    included.push({ id, score, rootId, rootName, relativePath, chunkIndex, content });
    usedChars += content.length;
  }
  return {
    query,
    totalChars: usedChars,
    results: included
  };
}
|
|
26
|
+
|
|
27
|
+
// src/config.ts
|
|
28
|
+
import fsSync from "fs";
|
|
29
|
+
import fs from "fs/promises";
|
|
30
|
+
import os from "os";
|
|
31
|
+
import path2 from "path";
|
|
32
|
+
import dotenv from "dotenv";
|
|
33
|
+
|
|
34
|
+
// src/constants.ts
// Per-user config lives at ~/.ownsearch/config.json (see src/config.ts).
var CONFIG_DIR_NAME = ".ownsearch";
var CONFIG_FILE_NAME = "config.json";
// Collection name encodes the embedding model and vector size it was built for.
var DEFAULT_COLLECTION = "text_gemini_embedding_001_768";
var DEFAULT_EMBEDDING_MODEL = "gemini-embedding-001";
var DEFAULT_VECTOR_SIZE = 768;
// Local Qdrant instance defaults (Docker container + named volume).
var DEFAULT_QDRANT_URL = "http://127.0.0.1:6333";
var DEFAULT_QDRANT_CONTAINER = "ownsearch-qdrant";
var DEFAULT_QDRANT_VOLUME = "ownsearch-qdrant-storage";
// Chunking defaults, measured in characters (see chunkText in src/chunking.ts).
var DEFAULT_CHUNK_SIZE = 1200;
var DEFAULT_CHUNK_OVERLAP = 200;
// 50 MiB per-file ceiling for indexing.
var DEFAULT_MAX_FILE_BYTES = 50 * 1024 * 1024;
// File extensions eligible for indexing (lowercase, leading dot).
// NOTE(review): ".env" here is only reachable if the walker special-cases
// dotfiles, since path.extname(".env") is "" — see collectTextFiles.
var SUPPORTED_TEXT_EXTENSIONS = /* @__PURE__ */ new Set([
  ".c",
  ".cpp",
  ".cs",
  ".css",
  ".csv",
  ".env",
  ".go",
  ".h",
  ".hpp",
  ".html",
  ".java",
  ".js",
  ".json",
  ".jsx",
  ".md",
  ".mdx",
  ".mjs",
  ".pdf",
  ".ps1",
  ".py",
  ".rb",
  ".rs",
  ".sh",
  ".sql",
  ".toml",
  ".ts",
  ".tsx",
  ".txt",
  ".xml",
  ".yaml",
  ".yml"
]);
// Directory names skipped entirely during the directory walk.
var IGNORED_DIRECTORIES = /* @__PURE__ */ new Set([
  ".git",
  ".hg",
  ".idea",
  ".next",
  ".svn",
  ".turbo",
  ".venv",
  ".vscode",
  "build",
  "coverage",
  "dist",
  "node_modules",
  "venv"
]);
|
|
94
|
+
|
|
95
|
+
// src/utils.ts
|
|
96
|
+
import crypto from "crypto";
|
|
97
|
+
import path from "path";
|
|
98
|
+
// Convert a platform-specific path into forward-slash (POSIX) form.
function toPosixPath(inputPath) {
  const segments = inputPath.split(path.sep);
  return segments.join("/");
}
|
|
101
|
+
// Hex-encoded SHA-256 digest of the given string or buffer.
function sha256(input) {
  const hasher = crypto.createHash("sha256");
  hasher.update(input);
  return hasher.digest("hex");
}
|
|
104
|
+
/**
 * Deterministically map arbitrary input to an RFC 4122-shaped UUID string.
 *
 * The SHA-256 hex digest of the input is sliced into the five UUID fields;
 * the version nibble is forced to "5" and the variant nibble to the 10xx
 * family. Purely a stable-ID scheme — not a spec-conformant UUIDv5
 * (no namespace hashing).
 */
function hashToUuid(input) {
  const hash = sha256(input);
  const timeLow = hash.slice(0, 8);
  const timeMid = hash.slice(8, 12);
  // Version nibble pinned to 5 (digest nibble 12 is discarded).
  const versionField = `5${hash.slice(13, 16)}`;
  // Variant nibble forced into 8..b (binary 10xx) per RFC 4122.
  const variantNibble = parseInt(hash.slice(16, 17), 16) & 3 | 8;
  const variantField = `${variantNibble.toString(16)}${hash.slice(17, 20)}`;
  const node = hash.slice(20, 32);
  return [timeLow, timeMid, versionField, variantField, node].join("-");
}
|
|
114
|
+
/**
 * Scale a vector to unit (L2) length.
 * Returns the input array unchanged (same reference) for the zero vector,
 * avoiding division by zero.
 */
function normalizeVector(values) {
  let sumOfSquares = 0;
  for (const value of values) {
    sumOfSquares += value * value;
  }
  const magnitude = Math.sqrt(sumOfSquares);
  if (magnitude === 0) {
    return values;
  }
  return values.map((value) => value / magnitude);
}
|
|
121
|
+
// Turn an arbitrary display name into a lowercase, dash-separated slug of
// at most 64 characters (runs of non-alphanumerics collapse to one dash;
// leading/trailing dashes are stripped).
function slugifyName(value) {
  const lowered = value.trim().toLowerCase();
  const dashed = lowered.replace(/[^a-z0-9]+/g, "-");
  const trimmedDashes = dashed.replace(/^-+|-+$/g, "");
  return trimmedDashes.slice(0, 64);
}
|
|
124
|
+
|
|
125
|
+
// src/config.ts
|
|
126
|
+
// src/config.ts
// Build a fresh configuration object populated entirely from package
// defaults (no roots). Key order is preserved as-is because saveConfig
// serializes this object with JSON.stringify.
function defaultConfig() {
  return {
    qdrantUrl: DEFAULT_QDRANT_URL,
    qdrantCollection: DEFAULT_COLLECTION,
    qdrantContainerName: DEFAULT_QDRANT_CONTAINER,
    qdrantVolumeName: DEFAULT_QDRANT_VOLUME,
    embeddingModel: DEFAULT_EMBEDDING_MODEL,
    vectorSize: DEFAULT_VECTOR_SIZE,
    chunkSize: DEFAULT_CHUNK_SIZE,
    chunkOverlap: DEFAULT_CHUNK_OVERLAP,
    maxFileBytes: DEFAULT_MAX_FILE_BYTES,
    roots: []
  };
}
|
|
140
|
+
// Absolute path of the per-user config directory (~/.ownsearch).
function getConfigDir() {
  const home = os.homedir();
  return path2.join(home, CONFIG_DIR_NAME);
}
|
|
143
|
+
// Absolute path of the config file (~/.ownsearch/config.json).
function getConfigPath() {
  const dir = getConfigDir();
  return path2.join(dir, CONFIG_FILE_NAME);
}
|
|
146
|
+
// Absolute path of the per-user env file (~/.ownsearch/.env).
function getEnvPath() {
  const dir = getConfigDir();
  return path2.join(dir, ".env");
}
|
|
149
|
+
// Create ~/.ownsearch if it does not exist yet (no-op when present).
async function ensureConfigDir() {
  const dir = getConfigDir();
  await fs.mkdir(dir, { recursive: true });
}
|
|
152
|
+
/**
 * Load environment variables from ./.env (cwd) and ~/.ownsearch/.env,
 * in that order. Existing process.env entries are never overwritten, so
 * the working-directory file takes precedence over the per-user file,
 * and real environment variables beat both.
 */
function loadOwnSearchEnv() {
  const candidates = [path2.resolve(process.cwd(), ".env"), getEnvPath()];
  for (const envPath of candidates) {
    if (!fsSync.existsSync(envPath)) {
      continue;
    }
    const parsed = dotenv.parse(fsSync.readFileSync(envPath, "utf8"));
    for (const [key, value] of Object.entries(parsed)) {
      if (process.env[key] === undefined) {
        process.env[key] = value;
      }
    }
  }
}
|
|
165
|
+
/**
 * Load the persisted configuration, merging it over package defaults.
 *
 * - Missing file (first run): defaults are created and persisted.
 * - Corrupt JSON or other read failures: defaults are returned in memory
 *   WITHOUT overwriting the file. (Previously any error — including a
 *   transient read failure or a typo in hand-edited JSON — silently
 *   clobbered the config file with defaults, destroying the user's saved
 *   roots.)
 * - maxFileBytes is clamped up to at least DEFAULT_MAX_FILE_BYTES so older
 *   configs pick up a raised default; when clamping changes the value the
 *   file is rewritten.
 *
 * @returns {Promise<object>} the effective configuration
 */
async function loadConfig() {
  await ensureConfigDir();
  const configPath = getConfigPath();
  let parsed;
  try {
    const raw = await fs.readFile(configPath, "utf8");
    parsed = JSON.parse(raw);
  } catch (error) {
    const fallback = defaultConfig();
    if (error?.code === "ENOENT") {
      // First run: persist the defaults so the file exists from now on.
      await saveConfig(fallback);
    }
    // Any other failure: keep the on-disk file untouched and run with
    // defaults for this invocation only.
    return fallback;
  }
  const config = {
    ...defaultConfig(),
    ...parsed,
    maxFileBytes: Math.max(parsed.maxFileBytes ?? DEFAULT_MAX_FILE_BYTES, DEFAULT_MAX_FILE_BYTES),
    roots: parsed.roots ?? []
  };
  if (config.maxFileBytes !== parsed.maxFileBytes) {
    await saveConfig(config);
  }
  return config;
}
|
|
187
|
+
// Persist the configuration as pretty-printed JSON with a trailing newline.
async function saveConfig(config) {
  await ensureConfigDir();
  const serialized = `${JSON.stringify(config, null, 2)}\n`;
  await fs.writeFile(getConfigPath(), serialized, "utf8");
}
|
|
192
|
+
// Write the trimmed Gemini API key to ~/.ownsearch/.env, replacing the file.
async function saveGeminiApiKey(apiKey) {
  await ensureConfigDir();
  const line = `GEMINI_API_KEY=${apiKey.trim()}\n`;
  await fs.writeFile(getEnvPath(), line, "utf8");
}
|
|
197
|
+
/**
 * Build a new root definition for an indexed directory.
 * The display name defaults to the directory's basename when `name` is
 * missing or blank; the id is a slug of "<name>-<path>".
 */
function createRootDefinition(rootPath, name) {
  const timestamp = new Date().toISOString();
  const trimmedName = name?.trim();
  const rootName = trimmedName || path2.basename(rootPath);
  return {
    id: slugifyName(`${rootName}-${rootPath}`),
    name: rootName,
    path: path2.resolve(rootPath),
    createdAt: timestamp,
    updatedAt: timestamp
  };
}
|
|
208
|
+
/**
 * Register a directory as a root, or refresh it if already registered.
 * Matching is by resolved absolute path; an existing root keeps its name
 * unless a non-blank new name is supplied, and gets its updatedAt bumped.
 */
async function upsertRoot(rootPath, name) {
  const config = await loadConfig();
  const absolutePath = path2.resolve(rootPath);
  const now = new Date().toISOString();
  const existing = config.roots.find((candidate) => candidate.path === absolutePath);
  if (existing) {
    const trimmed = name?.trim();
    if (trimmed) {
      existing.name = trimmed;
    }
    existing.updatedAt = now;
    await saveConfig(config);
    return existing;
  }
  const root = createRootDefinition(absolutePath, name);
  config.roots.push(root);
  await saveConfig(config);
  return root;
}
|
|
224
|
+
// Remove a root from the config by id; returns true when something was
// actually removed. (Only the definition — chunk deletion is the store's job.)
async function deleteRootDefinition(rootId) {
  const config = await loadConfig();
  const remaining = config.roots.filter((root) => root.id !== rootId);
  const removed = remaining.length !== config.roots.length;
  config.roots = remaining;
  await saveConfig(config);
  return removed;
}
|
|
231
|
+
// Look up a configured root by id; resolves to undefined when absent.
async function findRoot(rootId) {
  const { roots } = await loadConfig();
  return roots.find((root) => root.id === rootId);
}
|
|
235
|
+
// All configured roots, in config-file order.
async function listRoots() {
  const { roots } = await loadConfig();
  return roots;
}
|
|
239
|
+
|
|
240
|
+
// src/errors.ts
|
|
241
|
+
// src/errors.ts
// Domain error type for user-facing ownsearch failures (missing API key,
// unreadable paths, empty embedding responses, ...). Distinguishable from
// generic errors via `instanceof` or `error.name === "OwnSearchError"`.
var OwnSearchError = class extends Error {
  constructor(message) {
    super(message);
    this.name = "OwnSearchError";
  }
};
|
|
247
|
+
|
|
248
|
+
// src/gemini.ts
|
|
249
|
+
import { GoogleGenAI } from "@google/genai";
|
|
250
|
+
// Lazily-created GoogleGenAI client, memoized for the process lifetime
// (see getClient below).
var client;
// Cap on how many strings are sent per embedContent request.
var MAX_EMBED_BATCH_SIZE = 20;
|
|
252
|
+
/**
 * Return the shared GoogleGenAI client, creating it on first use.
 * @throws {OwnSearchError} when GEMINI_API_KEY is not set in the environment.
 */
function getClient() {
  const apiKey = process.env.GEMINI_API_KEY;
  if (!apiKey) {
    throw new OwnSearchError("GEMINI_API_KEY is required.");
  }
  client = client ?? new GoogleGenAI({ apiKey });
  return client;
}
|
|
262
|
+
/**
 * Embed an array of strings with the configured Gemini model.
 *
 * Requests are issued sequentially in batches of MAX_EMBED_BATCH_SIZE, and
 * each returned vector is passed through normalizeVector (vectors at a
 * reduced outputDimensionality are presumably not unit length — normalizing
 * keeps Cosine scoring consistent; TODO confirm against the Gemini docs).
 *
 * @param {string[]} contents - texts to embed, order preserved in the result
 * @param {string} taskType - Gemini task type, e.g. "RETRIEVAL_DOCUMENT"
 * @returns {Promise<number[][]>} one normalized vector per input
 * @throws {OwnSearchError} when a batch response contains no embeddings
 */
async function embed(contents, taskType) {
  const config = await loadConfig();
  const vectors = [];
  // Verbose per-batch logging, opt-in via OWNSEARCH_DEBUG_INDEX=1.
  const debug = process.env.OWNSEARCH_DEBUG_INDEX === "1";
  for (let index = 0; index < contents.length; index += MAX_EMBED_BATCH_SIZE) {
    const batch = contents.slice(index, index + MAX_EMBED_BATCH_SIZE);
    if (debug) {
      console.log("[ownsearch:embed]", "batch", index / MAX_EMBED_BATCH_SIZE + 1, "size", batch.length, "chars", batch.reduce((sum, text) => sum + text.length, 0));
    }
    const response = await getClient().models.embedContent({
      model: config.embeddingModel,
      contents: batch,
      config: {
        taskType,
        outputDimensionality: config.vectorSize
      }
    });
    if (!response.embeddings?.length) {
      throw new OwnSearchError("Gemini returned no embeddings.");
    }
    vectors.push(...response.embeddings.map((embedding) => normalizeVector(embedding.values ?? [])));
  }
  return vectors;
}
|
|
286
|
+
// Embed chunks for storage, using the RETRIEVAL_DOCUMENT task type.
async function embedDocuments(contents) {
  const vectors = await embed(contents, "RETRIEVAL_DOCUMENT");
  return vectors;
}
|
|
289
|
+
// Embed a single search query, using the RETRIEVAL_QUERY task type.
async function embedQuery(query) {
  const vectors = await embed([query], "RETRIEVAL_QUERY");
  return vectors[0];
}
|
|
293
|
+
|
|
294
|
+
// src/qdrant.ts
|
|
295
|
+
import { QdrantClient } from "@qdrant/js-client-rest";
|
|
296
|
+
// Thin wrapper around a Qdrant collection holding ownsearch chunk points.
// Payload fields are snake_case on the wire (root_id, relative_path, ...)
// and camelCase in the JS records this class returns.
var OwnSearchStore = class {
  constructor(client2, collectionName, vectorSize) {
    this.client = client2;
    this.collectionName = collectionName;
    this.vectorSize = vectorSize;
  }
  // Create the collection if missing; if it exists, verify its vector size
  // matches the configured one. Always (re-)requests the payload indexes —
  // allSettled because they may already exist.
  async ensureCollection() {
    const collections = await this.client.getCollections();
    const exists = collections.collections.some((collection) => collection.name === this.collectionName);
    if (!exists) {
      await this.client.createCollection(this.collectionName, {
        vectors: {
          size: this.vectorSize,
          distance: "Cosine"
        }
      });
    } else {
      const info = await this.client.getCollection(this.collectionName);
      const vectorConfig = info.config?.params?.vectors;
      // Named-vector configs are objects/arrays; only a single unnamed
      // vector config exposes a usable "size" here.
      const actualSize = vectorConfig && !Array.isArray(vectorConfig) && "size" in vectorConfig ? Number(vectorConfig.size) : void 0;
      if (actualSize && actualSize !== this.vectorSize) {
        throw new Error(
          `Qdrant collection ${this.collectionName} has vector size ${actualSize}, expected ${this.vectorSize}.`
        );
      }
    }
    await Promise.allSettled([
      this.client.createPayloadIndex(this.collectionName, {
        field_name: "root_id",
        field_schema: "keyword"
      }),
      this.client.createPayloadIndex(this.collectionName, {
        field_name: "relative_path",
        field_schema: "keyword"
      })
    ]);
  }
  // Upsert chunk records with their vectors, 50 points per request,
  // waiting for each write to be applied (wait: true).
  async upsertChunks(records, vectors) {
    const batchSize = 50;
    for (let index = 0; index < records.length; index += batchSize) {
      const batchRecords = records.slice(index, index + batchSize);
      const batchVectors = vectors.slice(index, index + batchSize);
      const points = batchRecords.map((record, batchIndex) => ({
        id: record.id,
        vector: batchVectors[batchIndex],
        payload: {
          root_id: record.rootId,
          root_name: record.rootName,
          root_path: record.rootPath,
          file_path: record.filePath,
          relative_path: record.relativePath,
          file_extension: record.fileExtension,
          chunk_index: record.chunkIndex,
          content: record.content,
          content_hash: record.contentHash,
          file_hash: record.fileHash,
          mtime_ms: record.mtimeMs,
          size_bytes: record.sizeBytes
        }
      }));
      await this.client.upsert(this.collectionName, {
        wait: true,
        points
      });
    }
  }
  // Page through every chunk belonging to a root (payload only, no vectors)
  // and return them as plain records with defaulted fields.
  async scrollRootChunks(rootId) {
    const chunks = [];
    let offset;
    do {
      const result = await this.client.scroll(this.collectionName, {
        limit: 1024,
        offset,
        with_payload: true,
        with_vector: false,
        filter: {
          must: [
            {
              key: "root_id",
              match: {
                value: rootId
              }
            }
          ]
        }
      });
      chunks.push(
        ...(result.points ?? []).map((point) => ({
          id: String(point.id),
          rootId: String(point.payload?.root_id ?? ""),
          rootPath: String(point.payload?.root_path ?? ""),
          rootName: String(point.payload?.root_name ?? ""),
          filePath: String(point.payload?.file_path ?? ""),
          relativePath: String(point.payload?.relative_path ?? ""),
          fileExtension: String(point.payload?.file_extension ?? ""),
          chunkIndex: Number(point.payload?.chunk_index ?? 0),
          content: String(point.payload?.content ?? ""),
          contentHash: String(point.payload?.content_hash ?? ""),
          fileHash: String(point.payload?.file_hash ?? ""),
          mtimeMs: Number(point.payload?.mtime_ms ?? 0),
          sizeBytes: Number(point.payload?.size_bytes ?? 0)
        }))
      );
      offset = result.next_page_offset;
    } while (offset !== void 0 && offset !== null);
    return chunks;
  }
  // Delete every point belonging to a root.
  async deleteRoot(rootId) {
    await this.client.delete(this.collectionName, {
      wait: true,
      filter: {
        must: [
          {
            key: "root_id",
            match: {
              value: rootId
            }
          }
        ]
      }
    });
  }
  // Delete all points for the given absolute file paths within one root
  // (one delete request per path, applied sequentially).
  async deleteFiles(rootId, filePaths) {
    for (const filePath of filePaths) {
      await this.client.delete(this.collectionName, {
        wait: true,
        filter: {
          must: [
            {
              key: "root_id",
              match: {
                value: rootId
              }
            },
            {
              key: "file_path",
              match: {
                value: filePath
              }
            }
          ]
        }
      });
    }
  }
  // Vector search with optional root filtering. When a pathSubstring filter
  // is set, over-fetches (3x limit) and post-filters client-side, since
  // substring match is not expressible as a Qdrant keyword filter here.
  async search(vector, filters, limit) {
    const must = [];
    if (filters.rootIds?.length) {
      must.push({
        key: "root_id",
        match: {
          any: filters.rootIds
        }
      });
    }
    const results = await this.client.search(this.collectionName, {
      vector,
      limit: filters.pathSubstring ? Math.max(limit * 3, limit) : limit,
      with_payload: true,
      filter: must.length ? { must } : void 0
    });
    const hits = results.map((result) => ({
      id: String(result.id),
      score: result.score,
      rootId: String(result.payload?.root_id ?? ""),
      rootName: String(result.payload?.root_name ?? ""),
      filePath: String(result.payload?.file_path ?? ""),
      relativePath: String(result.payload?.relative_path ?? ""),
      chunkIndex: Number(result.payload?.chunk_index ?? 0),
      content: String(result.payload?.content ?? "")
    }));
    if (!filters.pathSubstring) {
      return hits.slice(0, limit);
    }
    const needle = filters.pathSubstring.toLowerCase();
    return hits.filter((hit) => hit.relativePath.toLowerCase().includes(needle)).slice(0, limit);
  }
  // Fetch specific chunks by id (payload only); [] in, [] out.
  async getChunks(ids) {
    if (ids.length === 0) {
      return [];
    }
    const points = await this.client.retrieve(this.collectionName, {
      ids,
      with_payload: true,
      with_vector: false
    });
    return (points ?? []).map((point) => ({
      id: String(point.id),
      rootId: String(point.payload?.root_id ?? ""),
      rootName: String(point.payload?.root_name ?? ""),
      filePath: String(point.payload?.file_path ?? ""),
      relativePath: String(point.payload?.relative_path ?? ""),
      chunkIndex: Number(point.payload?.chunk_index ?? 0),
      content: String(point.payload?.content ?? "")
    }));
  }
  // Summarize the collection: status, point counts, and vector config.
  async getStatus() {
    const info = await this.client.getCollection(this.collectionName);
    return {
      collection: this.collectionName,
      status: info.status,
      pointsCount: info.points_count,
      indexedVectorsCount: info.indexed_vectors_count,
      vectorConfig: info.config?.params?.vectors ?? null
    };
  }
};
|
|
503
|
+
// Build an OwnSearchStore from the persisted config and make sure the
// backing Qdrant collection exists before returning it.
async function createStore() {
  const { qdrantUrl, qdrantCollection, vectorSize } = await loadConfig();
  const qdrant = new QdrantClient({ url: qdrantUrl, checkCompatibility: false });
  const store = new OwnSearchStore(qdrant, qdrantCollection, vectorSize);
  await store.ensureCollection();
  return store;
}
|
|
510
|
+
|
|
511
|
+
// src/indexer.ts
|
|
512
|
+
import fs3 from "fs/promises";
|
|
513
|
+
import path4 from "path";
|
|
514
|
+
|
|
515
|
+
// src/chunking.ts
/**
 * Split text into overlapping character chunks.
 *
 * CRLF is normalized to LF and the whole text trimmed first. Each chunk is
 * up to `chunkSize` characters; when a newline falls in the second half of
 * the window, the chunk breaks there instead. Consecutive chunks overlap by
 * `chunkOverlap` characters (with a minimum forward step of one character,
 * guaranteeing termination). Empty/whitespace-only pieces are dropped.
 *
 * @param {string} content - raw text to split
 * @param {number} chunkSize - target maximum chunk length in characters
 * @param {number} chunkOverlap - characters shared between adjacent chunks
 * @returns {string[]} trimmed, non-empty chunks in document order
 */
function chunkText(content, chunkSize, chunkOverlap) {
  const normalized = content.replace(/\r\n/g, "\n").trim();
  if (normalized.length === 0) {
    return [];
  }
  const chunks = [];
  let start = 0;
  while (start < normalized.length) {
    let end = Math.min(start + chunkSize, normalized.length);
    if (end < normalized.length) {
      // Prefer a newline boundary, but only if it keeps the chunk at least
      // half the requested size.
      const boundary = normalized.lastIndexOf("\n", end);
      if (boundary > start + Math.floor(chunkSize * 0.5)) {
        end = boundary;
      }
    }
    const piece = normalized.slice(start, end).trim();
    if (piece.length > 0) {
      chunks.push(piece);
    }
    if (end >= normalized.length) {
      break;
    }
    start = Math.max(end - chunkOverlap, start + 1);
  }
  return chunks;
}
|
|
542
|
+
|
|
543
|
+
// src/files.ts
|
|
544
|
+
import fs2 from "fs/promises";
|
|
545
|
+
import path3 from "path";
|
|
546
|
+
import { PDFParse } from "pdf-parse";
|
|
547
|
+
// Clean text extracted from files: drop NUL bytes, replace other C0 control
// characters (except tab/LF/CR) and DEL with spaces, and normalize CRLF to LF.
function sanitizeExtractedText(input) {
  const withoutNulls = input.replace(/\u0000/g, "");
  const withoutControls = withoutNulls.replace(/[\u0001-\u0008\u000B\u000C\u000E-\u001F\u007F]/g, " ");
  return withoutControls.replace(/\r\n/g, "\n");
}
|
|
550
|
+
/**
 * Recursively collect indexable text files under `rootPath`.
 *
 * Skips hidden directories (except .github), IGNORED_DIRECTORIES, files with
 * unsupported extensions, files larger than `maxFileBytes`, unparsable files,
 * and empty files. PDFs are text-extracted; everything else is read as UTF-8.
 * All extracted content is passed through sanitizeExtractedText.
 *
 * Fix: path.extname(".env") returns "" in Node for dotfiles, so ".env" —
 * explicitly exempted from the hidden-entry skip AND listed in
 * SUPPORTED_TEXT_EXTENSIONS — could never actually be indexed. For
 * extension-less names we now fall back to the full file name, which makes
 * ".env" (and only names that themselves appear in the supported set)
 * indexable.
 *
 * @param {string} rootPath - directory to walk (resolved to absolute)
 * @param {number} maxFileBytes - per-file size ceiling in bytes
 * @returns {Promise<Array>} file records: path, relativePath, extension,
 *   sizeBytes, mtimeMs, content
 */
async function collectTextFiles(rootPath, maxFileBytes) {
  const files = [];
  const absoluteRoot = path3.resolve(rootPath);
  // Verbose walk logging, opt-in via OWNSEARCH_DEBUG_INDEX=1.
  const debug = process.env.OWNSEARCH_DEBUG_INDEX === "1";
  function debugLog(...parts) {
    if (debug) {
      console.log("[ownsearch:index]", ...parts);
    }
  }
  // Extract plain text from a PDF, always releasing the parser.
  async function parsePdf(filePath) {
    const buffer = await fs2.readFile(filePath);
    const parser = new PDFParse({ data: buffer });
    try {
      const pdfData = await parser.getText();
      return pdfData.text ?? "";
    } finally {
      await parser.destroy();
    }
  }
  async function walk(currentPath) {
    const entries = await fs2.readdir(currentPath, { withFileTypes: true });
    for (const entry of entries) {
      // Skip hidden DIRECTORIES other than .github (".env" can only be a
      // file). NOTE(review): hidden files fall through here and are mostly
      // rejected later by the extension check — e.g. ".eslintrc.json" would
      // still be indexed; confirm whether that is intended.
      if (entry.name.startsWith(".") && entry.name !== ".env" && entry.name !== ".github") {
        if (entry.isDirectory()) {
          continue;
        }
      }
      const nextPath = path3.join(currentPath, entry.name);
      if (entry.isDirectory()) {
        if (IGNORED_DIRECTORIES.has(entry.name)) {
          continue;
        }
        await walk(nextPath);
        continue;
      }
      // path.extname() yields "" for dotfiles like ".env"; fall back to the
      // full name so dotfile names listed in SUPPORTED_TEXT_EXTENSIONS work.
      const extension = (path3.extname(entry.name) || entry.name).toLowerCase();
      if (!SUPPORTED_TEXT_EXTENSIONS.has(extension)) {
        debugLog("skip-extension", nextPath, extension);
        continue;
      }
      const stats = await fs2.stat(nextPath);
      if (stats.size > maxFileBytes) {
        debugLog("skip-size", nextPath, stats.size);
        continue;
      }
      let content = "";
      try {
        if (extension === ".pdf") {
          content = await parsePdf(nextPath);
        } else {
          content = await fs2.readFile(nextPath, "utf8");
        }
        content = sanitizeExtractedText(content);
      } catch (error) {
        // Unreadable/unparsable files are skipped, not fatal.
        debugLog("skip-parse", nextPath, String(error));
        continue;
      }
      if (!content || !content.trim()) {
        debugLog("skip-empty", nextPath);
        continue;
      }
      files.push({
        path: nextPath,
        relativePath: path3.relative(absoluteRoot, nextPath),
        extension,
        sizeBytes: stats.size,
        mtimeMs: stats.mtimeMs,
        content
      });
    }
  }
  await walk(absoluteRoot);
  return files;
}
|
|
624
|
+
|
|
625
|
+
// src/indexer.ts
// Stable, deterministic chunk id. Includes fileHash, so every chunk of a
// file gets a fresh id whenever the file's content changes.
function buildChunkId(rootId, relativePath, chunkIndex, fileHash) {
  const key = [rootId, relativePath, chunkIndex, fileHash].join(":");
  return hashToUuid(key);
}
|
|
629
|
+
/**
 * Embed chunk records, recovering from embedding failures by bisection.
 *
 * On failure the batch is split in half and each half retried recursively,
 * so a single problematic chunk is eventually isolated (length === 1) and
 * dropped — counted in `skipped` — instead of failing the entire index run.
 * Records and vectors are returned in matching order.
 *
 * @param {Array} records - chunk records whose `content` will be embedded
 * @returns {Promise<{records: Array, vectors: number[][], skipped: number}>}
 */
async function embedRecords(records) {
  if (records.length === 0) {
    return { records: [], vectors: [], skipped: 0 };
  }
  try {
    const vectors = await embedDocuments(records.map((record) => record.content));
    return { records, vectors, skipped: 0 };
  } catch (error) {
    // Base case: a single record failed on its own — skip it.
    if (records.length === 1) {
      const debug = process.env.OWNSEARCH_DEBUG_INDEX === "1";
      if (debug) {
        console.log("[ownsearch:embed]", "skip-chunk", records[0].relativePath, String(error));
      }
      return { records: [], vectors: [], skipped: 1 };
    }
    // Recursive case: bisect and retry each half independently.
    const midpoint = Math.floor(records.length / 2);
    const left = await embedRecords(records.slice(0, midpoint));
    const right = await embedRecords(records.slice(midpoint));
    return {
      records: [...left.records, ...right.records],
      vectors: [...left.vectors, ...right.vectors],
      skipped: left.skipped + right.skipped
    };
  }
}
|
|
654
|
+
/**
 * Index (or re-index) a directory into the vector store.
 *
 * Registers/refreshes the root, walks its files, and incrementally syncs:
 * unchanged files (same file hash and chunk count) are skipped, changed
 * files have their old chunks deleted and new ones embedded, and chunks for
 * files that no longer exist on disk are removed.
 *
 * @param {string} rootPath - directory to index
 * @param {{name?: string, maxFileBytes?: number}} [options]
 * @returns {Promise<{root, indexedFiles, indexedChunks, skippedFiles}>}
 * @throws {OwnSearchError} when rootPath is not a readable directory
 */
async function indexPath(rootPath, options = {}) {
  const absolutePath = path4.resolve(rootPath);
  const stats = await fs3.stat(absolutePath).catch(() => void 0);
  if (!stats?.isDirectory()) {
    throw new OwnSearchError(`Path is not a readable directory: ${absolutePath}`);
  }
  const config = await loadConfig();
  const root = await upsertRoot(absolutePath, options.name);
  const store = await createStore();
  const files = await collectTextFiles(root.path, options.maxFileBytes ?? config.maxFileBytes);
  const existingChunks = await store.scrollRootChunks(root.id);
  const records = [];
  const filesByPath = /* @__PURE__ */ new Map();
  const existingByPath = /* @__PURE__ */ new Map();
  // If the root was renamed or moved, stored chunk metadata is stale —
  // force every file to be re-recorded so payloads pick up the new values.
  const refreshAllMetadata = existingChunks.some(
    (chunk) => chunk.rootName !== root.name || chunk.rootPath !== root.path
  );
  for (const file of files) {
    filesByPath.set(file.path, file);
  }
  // Group previously stored chunks by their source file path.
  for (const chunk of existingChunks) {
    const list = existingByPath.get(chunk.filePath) ?? [];
    list.push(chunk);
    existingByPath.set(chunk.filePath, list);
  }
  const staleFiles = [];
  for (const file of files) {
    const fileHash = sha256(file.content);
    const chunks = chunkText(file.content, config.chunkSize, config.chunkOverlap);
    const existing = existingByPath.get(file.path);
    const existingFileHash = existing?.[0]?.fileHash;
    const existingChunkCount = existing?.length ?? 0;
    // Unchanged file: identical content hash and chunk count — skip.
    if (!refreshAllMetadata && existing && existingFileHash === fileHash && existingChunkCount === chunks.length) {
      continue;
    }
    // Changed file: its old chunks must be deleted before re-upserting
    // (chunk ids include fileHash, so new chunks get different ids).
    if (existing?.length) {
      staleFiles.push(file.path);
    }
    chunks.forEach((content, chunkIndex) => {
      records.push({
        id: buildChunkId(root.id, toPosixPath(file.relativePath), chunkIndex, fileHash),
        rootId: root.id,
        rootPath: root.path,
        rootName: root.name,
        filePath: file.path,
        relativePath: toPosixPath(file.relativePath),
        fileExtension: file.extension,
        chunkIndex,
        content,
        contentHash: sha256(content),
        fileHash,
        mtimeMs: file.mtimeMs,
        sizeBytes: file.sizeBytes
      });
    });
  }
  // Files that were indexed before but no longer exist on disk.
  for (const [existingFilePath] of existingByPath.entries()) {
    if (!filesByPath.has(existingFilePath)) {
      staleFiles.push(existingFilePath);
    }
  }
  if (staleFiles.length > 0) {
    await store.deleteFiles(root.id, Array.from(new Set(staleFiles)));
  }
  if (records.length === 0) {
    return {
      root,
      indexedFiles: files.length,
      indexedChunks: 0,
      skippedFiles: 0
    };
  }
  const embedded = await embedRecords(records);
  if (embedded.records.length > 0) {
    await store.upsertChunks(embedded.records, embedded.vectors);
  }
  // NOTE(review): embedded.skipped (chunks dropped by embedRecords) is never
  // surfaced; skippedFiles is hardwired to 0 — confirm whether callers
  // should see the skip count.
  return {
    root,
    indexedFiles: files.length,
    indexedChunks: embedded.records.length,
    skippedFiles: 0
  };
}
|
|
737
|
+
|
|
738
|
+
// Public API of this shared chunk (presumably re-used by the CLI and MCP
// server entry points listed in the package manifest — verify against
// dist/cli.js and dist/mcp/server.js).
export {
  buildContextBundle,
  getConfigPath,
  getEnvPath,
  loadOwnSearchEnv,
  loadConfig,
  saveGeminiApiKey,
  deleteRootDefinition,
  findRoot,
  listRoots,
  OwnSearchError,
  embedQuery,
  createStore,
  indexPath
};
|