@khoinguyen2002/doc-mcp 1.0.4 → 1.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,87 @@
1
+ import { Redis } from "@upstash/redis";
2
+ import { config } from "../config.js";
3
+
4
+ const HASH_KEY = "doc_sync_state";
5
+
6
+ let _redis: Redis | null = null;
7
+
8
/**
 * Returns the module-wide Upstash Redis client, creating it on first use
 * from `config.UPSTASH_REDIS_REST_URL` / `config.UPSTASH_REDIS_REST_TOKEN`.
 * Later calls reuse the same instance.
 */
function getRedis(): Redis {
  if (_redis !== null) {
    return _redis;
  }

  const { UPSTASH_REDIS_REST_URL: url, UPSTASH_REDIS_REST_TOKEN: token } = config;
  _redis = new Redis({ url, token });
  return _redis;
}
17
+
18
/**
 * Per-file record kept in the `doc_sync_state` Redis hash (keyed by file ID).
 */
export interface SyncEntry {
  /** Modification timestamp string recorded for the file (format not shown here — TODO confirm). */
  modifiedTime: string;
  /** Number of blocks recorded for the file (presumably at last sync — verify against caller). */
  blockCount: number;
  /** Title recorded for the file. */
  title: string;
}
23
+
24
/**
 * Reads the whole sync-state hash and returns it as a `fileId → SyncEntry` map.
 *
 * Values that arrive as strings are JSON-parsed; values that arrive as
 * anything else are used as-is. Empty values and strings that fail to parse
 * are left out of the result.
 *
 * @returns The decoded entries, or an empty object when the hash is missing.
 */
export async function getAllSyncEntries(): Promise<Record<string, SyncEntry>> {
  const stored = await getRedis().hgetall(HASH_KEY);
  const entries: Record<string, SyncEntry> = {};
  if (!stored) {
    return entries;
  }

  Object.entries(stored).forEach(([fileId, payload]) => {
    if (!payload) {
      return;
    }
    try {
      if (typeof payload === "string") {
        entries[fileId] = JSON.parse(payload) as SyncEntry;
      } else {
        entries[fileId] = payload as unknown as SyncEntry;
      }
    } catch {
      // Malformed JSON: leave this file out of the result.
    }
  });

  return entries;
}
43
+
44
/**
 * Looks up the sync-state entry for one file.
 *
 * @param fileId - Hash field to read.
 * @returns The entry, or `null` when the field is missing/empty or holds a
 *   string that is not valid JSON.
 */
export async function getSyncEntry(fileId: string): Promise<SyncEntry | null> {
  const stored = await getRedis().hget(HASH_KEY, fileId);
  if (!stored) {
    return null;
  }

  // Non-string values are used as-is; only strings need parsing.
  if (typeof stored !== "string") {
    return stored as unknown as SyncEntry;
  }

  try {
    return JSON.parse(stored) as SyncEntry;
  } catch {
    return null;
  }
}
56
+
57
/**
 * Stores (or overwrites) the sync-state entry for a file as a JSON string.
 *
 * @param fileId - Hash field to write.
 * @param entry - Entry to serialize and store.
 */
export async function setSyncEntry(
  fileId: string,
  entry: SyncEntry
): Promise<void> {
  const serialized = JSON.stringify(entry);
  const fields = { [fileId]: serialized };
  await getRedis().hset(HASH_KEY, fields);
}
64
+
65
/**
 * Removes a file's field from the sync-state hash.
 *
 * @param fileId - Hash field to delete.
 */
export async function deleteSyncEntry(fileId: string): Promise<void> {
  const store = getRedis();
  await store.hdel(HASH_KEY, fileId);
}
69
+
70
+ // ─── Image Description Cache ──────────────────────────────────────────────────
71
+ // Global hash: md5(imageBinary) → description text
72
+ // Deduplicates across docs (same image used in multiple files reuses description)
73
+ const IMG_DESC_KEY = "img_desc";
74
+
75
/**
 * Reads the cached description for an image hash.
 *
 * The stored value may come back as something other than a string (this
 * file's sync-entry readers handle the same situation), e.g. when the client
 * JSON-decodes a description that happens to be valid JSON. Such values are
 * turned back into text explicitly instead of via `String()`, which would
 * produce "[object Object]" for objects.
 *
 * @param imageHash - Hash field to read.
 * @returns The description text, or `null` when nothing (or an empty string)
 *   is cached.
 */
export async function getImageDesc(imageHash: string): Promise<string | null> {
  const redis = getRedis();
  const raw: unknown = await redis.hget(IMG_DESC_KEY, imageHash);

  if (raw === null || raw === undefined) return null;
  if (typeof raw === "string") {
    // An empty description is treated as "not cached", as before.
    return raw.length > 0 ? raw : null;
  }
  // Decoded JSON value: re-serialize objects/arrays, stringify primitives.
  return typeof raw === "object" ? JSON.stringify(raw) : String(raw);
}
80
+
81
/**
 * Caches a description under the given image hash (overwrites any existing one).
 *
 * @param imageHash - Hash field to write.
 * @param description - Description text to store.
 */
export async function setImageDesc(
  imageHash: string,
  description: string
): Promise<void> {
  const fields = { [imageHash]: description };
  await getRedis().hset(IMG_DESC_KEY, fields);
}
package/src/db/vector.ts CHANGED
@@ -1,9 +1,20 @@
1
- import { QdrantClient } from '@qdrant/js-client-rest';
2
- import { v4 as uuidv4 } from 'uuid';
3
- import { config } from '../config.js';
1
+ import { QdrantClient } from "@qdrant/js-client-rest";
2
+ import { v4 as uuidv4, v5 as uuidv5 } from "uuid";
3
+ import { config } from "../config.js";
4
4
 
5
5
  let client: QdrantClient | null = null;
6
- const COLLECTION_NAME = 'project_memory';
6
+ const COLLECTION_NAME = "project_memory";
7
+
8
+ // Fixed namespace for deterministic point IDs (uuid v5)
9
+ const POINT_NAMESPACE = "1b671a64-40d5-491e-99b0-da01ff1f3341";
10
+
11
/**
 * Builds the Qdrant point ID for a document block.
 *
 * The ID is a UUID v5 of `"<fileId>:<blockIndex>"` under `POINT_NAMESPACE`,
 * so the same file/block pair always maps to the same ID.
 *
 * @param fileId - Identifier of the source file.
 * @param blockIndex - Index of the block within that file.
 * @returns The deterministic UUID string.
 */
export function getBlockPointId(fileId: string, blockIndex: number): string {
  const blockKey = `${fileId}:${blockIndex}`;
  return uuidv5(blockKey, POINT_NAMESPACE);
}
7
18
 
8
19
  export async function initVectorDB() {
9
20
  if (!client) {
@@ -13,119 +24,282 @@ export async function initVectorDB() {
13
24
  });
14
25
  console.error(`Connected to Qdrant at ${config.QDRANT_URL}`);
15
26
 
16
- // Check if collection exists
17
27
  const res = await client.getCollections();
18
- const exists = res.collections.some(c => c.name === COLLECTION_NAME);
28
+ const exists = res.collections.some((c) => c.name === COLLECTION_NAME);
19
29
  if (!exists) {
20
30
  console.error(`Creating Qdrant collection: ${COLLECTION_NAME}`);
21
31
  const dummyVector = await embedText("test");
22
32
  const dimension = dummyVector.length;
23
33
 
24
34
  await client.createCollection(COLLECTION_NAME, {
25
- vectors: {
26
- size: dimension,
27
- distance: "Cosine",
28
- },
35
+ vectors: { size: dimension, distance: "Cosine" },
29
36
  });
30
37
  await client.createPayloadIndex(COLLECTION_NAME, {
31
- field_name: "folderId",
38
+ field_name: "source",
32
39
  field_schema: "keyword",
33
40
  });
34
41
  await client.createPayloadIndex(COLLECTION_NAME, {
35
- field_name: "file_id",
36
- field_schema: "keyword",
42
+ field_name: "block_index",
43
+ field_schema: "integer",
37
44
  });
38
45
  await client.createPayloadIndex(COLLECTION_NAME, {
39
- field_name: "source",
46
+ field_name: "block_hash",
40
47
  field_schema: "keyword",
41
48
  });
42
- console.error(`Collection ${COLLECTION_NAME} created with dimension ${dimension}.`);
49
+ // Full-text index on `text` payload for exact/keyword search.
50
+ // whitespace tokenizer keeps API paths (e.g. /v1/foo/bar) as single tokens.
51
+ // lowercase=true makes searches case-insensitive.
52
+ await client.createPayloadIndex(COLLECTION_NAME, {
53
+ field_name: "text",
54
+ field_schema: {
55
+ type: "text",
56
+ tokenizer: "whitespace",
57
+ min_token_len: 2,
58
+ max_token_len: 200,
59
+ lowercase: true,
60
+ } as any,
61
+ });
62
+ console.error(
63
+ `Collection ${COLLECTION_NAME} created with dimension ${dimension}.`
64
+ );
43
65
  }
44
66
  }
45
67
  }
46
68
 
47
- export async function embedText(text: string): Promise<number[]> {
48
- const response = await fetch("https://openrouter.ai/api/v1/embeddings", {
49
- method: "POST",
50
- headers: {
51
- "Authorization": `Bearer ${config.OPENROUTER_API_KEY}`,
52
- "Content-Type": "application/json"
53
- },
54
- body: JSON.stringify({
55
- model: config.EMBEDDING_MODEL_ID,
56
- input: text
57
- })
58
- });
69
/**
 * Embeds a single text through the OpenRouter embeddings endpoint.
 *
 * Failed attempts are retried with exponential backoff plus jitter
 * (2^attempt seconds + up to 1s). Retried failures are network errors,
 * HTTP 408/429/5xx and malformed success payloads. Any other HTTP error
 * (e.g. 400/401/403) is thrown immediately, since repeating the same
 * request cannot succeed.
 *
 * @param text - Text to embed.
 * @param maxRetries - Total number of attempts (not extra retries). Values
 *   below 1 still result in a single attempt.
 * @returns The vector found at `data[0].embedding` in the response.
 * @throws Error when the API keeps failing, or fails with a non-retryable status.
 */
export async function embedText(
  text: string,
  maxRetries = 5
): Promise<number[]> {
  const attempts = Math.max(1, Math.floor(maxRetries) || 1);

  for (let attempt = 0; attempt < attempts; attempt++) {
    const isLastAttempt = attempt >= attempts - 1;
    const delay = Math.pow(2, attempt) * 1000 + Math.random() * 1000;
    let retryable = true;
    let failure: unknown;

    try {
      const response = await fetch("https://openrouter.ai/api/v1/embeddings", {
        method: "POST",
        headers: {
          Authorization: `Bearer ${config.OPENROUTER_API_KEY}`,
          "Content-Type": "application/json",
        },
        body: JSON.stringify({
          model: config.EMBEDDING_MODEL_ID,
          input: text,
        }),
      });

      if (response.ok) {
        const json: any = await response.json();
        const embedding = json?.data?.[0]?.embedding;
        if (!embedding) {
          throw new Error(
            `Invalid response from OpenRouter: ${JSON.stringify(json)}`
          );
        }
        return embedding;
      }

      if (response.status === 429 && !isLastAttempt) {
        console.error(
          `[Rate Limit] OpenRouter 429. Retrying in ${Math.round(delay)}ms... (Attempt ${attempt + 1}/${attempts})`
        );
        await new Promise((res) => setTimeout(res, delay));
        continue;
      }

      // Only timeouts, rate limits and server-side errors are worth retrying.
      retryable =
        response.status === 408 ||
        response.status === 429 ||
        response.status >= 500;
      const errText = await response.text();
      throw new Error(
        `OpenRouter Embedding API failed: ${response.status} ${errText}`
      );
    } catch (err: unknown) {
      failure = err;
    }

    if (!retryable || isLastAttempt) throw failure;

    const message = failure instanceof Error ? failure.message : String(failure);
    console.error(
      `[Error] ${message}. Retrying in ${Math.round(delay)}ms... (Attempt ${attempt + 1}/${attempts})`
    );
    await new Promise((res) => setTimeout(res, delay));
  }
  // Unreachable in practice (the last attempt returns or throws); keeps the
  // return type total.
  throw new Error("Max retries reached for embedding");
}
120
+
121
/**
 * Embeds several texts in a single OpenRouter API call (`input: string[]`).
 *
 * The result is aligned with `texts`: when every response item carries a
 * numeric `index`, items are ordered by it; the number of embeddings must
 * equal the number of inputs, otherwise the response is treated as invalid.
 *
 * Failed attempts are retried with exponential backoff plus jitter. Retried
 * failures are network errors, HTTP 408/429/5xx and invalid payloads. Other
 * HTTP errors (e.g. 400/401) are thrown immediately.
 *
 * @param texts - Texts to embed; an empty array returns `[]` without a request.
 * @param maxRetries - Total number of attempts (not extra retries). Values
 *   below 1 still result in a single attempt.
 * @returns One embedding per input text, in input order.
 * @throws Error when the API keeps failing, or fails with a non-retryable status.
 */
export async function embedBatch(
  texts: string[],
  maxRetries = 5
): Promise<number[][]> {
  if (texts.length === 0) return [];

  const attempts = Math.max(1, Math.floor(maxRetries) || 1);

  for (let attempt = 0; attempt < attempts; attempt++) {
    const isLastAttempt = attempt >= attempts - 1;
    const delay = Math.pow(2, attempt) * 1000 + Math.random() * 1000;
    let retryable = true;
    let failure: unknown;

    try {
      const response = await fetch("https://openrouter.ai/api/v1/embeddings", {
        method: "POST",
        headers: {
          Authorization: `Bearer ${config.OPENROUTER_API_KEY}`,
          "Content-Type": "application/json",
        },
        body: JSON.stringify({
          model: config.EMBEDDING_MODEL_ID,
          input: texts,
        }),
      });

      if (response.ok) {
        const json: any = await response.json();
        if (!json || !Array.isArray(json.data)) {
          throw new Error(
            `Invalid batch response from OpenRouter: ${JSON.stringify(json)}`
          );
        }

        const items: any[] = json.data;
        // Order by the item's own index when present so vectors line up with inputs.
        const ordered = items.every((item) => typeof item?.index === "number")
          ? [...items].sort((a, b) => a.index - b.index)
          : items;
        const vectors = ordered.map((item) => item?.embedding);

        if (
          vectors.length !== texts.length ||
          vectors.some((vector) => !Array.isArray(vector))
        ) {
          throw new Error(
            `Invalid batch response from OpenRouter: expected ${texts.length} embedding(s), got ${vectors.filter((vector) => Array.isArray(vector)).length}`
          );
        }
        return vectors as number[][];
      }

      if (response.status === 429 && !isLastAttempt) {
        console.error(
          `[Rate Limit] OpenRouter 429 (batch). Retrying in ${Math.round(delay)}ms... (Attempt ${attempt + 1}/${attempts})`
        );
        await new Promise((res) => setTimeout(res, delay));
        continue;
      }

      // Only timeouts, rate limits and server-side errors are worth retrying.
      retryable =
        response.status === 408 ||
        response.status === 429 ||
        response.status >= 500;
      const errText = await response.text();
      throw new Error(
        `OpenRouter Batch Embedding API failed: ${response.status} ${errText}`
      );
    } catch (err: unknown) {
      failure = err;
    }

    if (!retryable || isLastAttempt) throw failure;

    const message = failure instanceof Error ? failure.message : String(failure);
    console.error(
      `[Error] ${message}. Retrying in ${Math.round(delay)}ms... (Attempt ${attempt + 1}/${attempts})`
    );
    await new Promise((res) => setTimeout(res, delay));
  }
  // Unreachable in practice (the last attempt returns or throws); keeps the
  // return type total.
  throw new Error("Max retries reached for batch embedding");
}
69
178
 
70
- return json.data[0].embedding;
179
/**
 * One chunk ready to be written to Qdrant: the point ID, its embedding and
 * the payload fields stored alongside it.
 */
export interface ChunkUpsert {
  /** Qdrant point ID to write under. */
  pointId: string;
  /** Embedding vector for `text`. */
  vector: number[];
  /** Chunk text, stored in the `text` payload field. */
  text: string;
  /** Title stored in the `title` payload field. */
  title: string;
  /** Stored as `block_index` in the payload. */
  blockIndex: number;
  /** Stored as `block_hash` in the payload. */
  blockHash: string;
  /** Stored as `source` in the payload. */
  source: string;
  /** Character offset in the Markdown string. */
  offset: number;
}
72
189
 
73
- export async function upsertProjectDocument(folderId: string, text: string, metadata: Record<string, any> = {}): Promise<void> {
190
/**
 * Writes many chunks to Qdrant with a single upsert request.
 *
 * Each chunk becomes one point whose payload carries `text`, `title`,
 * `block_index`, `block_hash`, `source` and `offset`. Nothing is sent when
 * `chunks` is empty.
 *
 * @param chunks - Chunks (with precomputed vectors and point IDs) to store.
 * @throws Error if the Qdrant client is not available after initialization.
 */
export async function upsertChunkBatch(chunks: ChunkUpsert[]): Promise<void> {
  await initVectorDB();
  if (!client) {
    throw new Error("Qdrant not initialized");
  }
  if (chunks.length === 0) {
    return;
  }

  const points = chunks.map(
    ({ pointId, vector, text, title, blockIndex, blockHash, source, offset }) => ({
      id: pointId,
      vector,
      payload: {
        text,
        title,
        block_index: blockIndex,
        block_hash: blockHash,
        source,
        offset,
      },
    })
  );

  await client.upsert(COLLECTION_NAME, { wait: true, points });
  console.error(`Upserted ${chunks.length} chunk(s) to Qdrant.`);
}
215
+
216
/**
 * Retrieves the stored `block_hash` and `offset` payload fields for the
 * given point IDs.
 *
 * Points without a `block_hash` are omitted from the result; a missing
 * `offset` is reported as 0.
 *
 * @param pointIds - Qdrant point IDs to look up.
 * @returns Map of point ID → `{ hash, offset }`; empty when `pointIds` is
 *   empty or no client is available.
 */
export async function getBlockMetaByIds(
  pointIds: string[]
): Promise<Record<string, { hash: string; offset: number }>> {
  await initVectorDB();
  if (pointIds.length === 0 || !client) {
    return {};
  }

  const points = await client.retrieve(COLLECTION_NAME, {
    ids: pointIds,
    with_payload: ["block_hash", "offset"],
    with_vector: false,
  });

  const byId: Record<string, { hash: string; offset: number }> = {};
  for (const { id, payload } of points) {
    const storedHash = payload?.block_hash as string | undefined;
    if (storedHash === undefined) {
      continue;
    }
    const storedOffset = payload?.offset as number | undefined;
    byId[id as string] = { hash: storedHash, offset: storedOffset ?? 0 };
  }
  return byId;
}
243
+
244
/**
 * Rewrites only the `offset` payload field of existing points; vectors and
 * other payload fields are not sent.
 *
 * One `setPayload` request is issued per update, all started together and
 * awaited as a group. Requests use `wait: false`.
 * NOTE(review): with `wait: false` Qdrant presumably acknowledges before the
 * change is applied — confirm if callers read the offsets right after.
 *
 * @param updates - Point IDs with their new offsets; an empty list is a no-op.
 * @throws Error if the Qdrant client is not available after initialization.
 */
export async function updateBlockOffsets(
  updates: { pointId: string; offset: number }[]
): Promise<void> {
  if (updates.length === 0) {
    return;
  }
  await initVectorDB();
  if (!client) {
    throw new Error("Qdrant not initialized");
  }

  const qdrant = client;
  const pending = updates.map((update) =>
    qdrant.setPayload(COLLECTION_NAME, {
      payload: { offset: update.offset },
      points: [update.pointId],
      wait: false,
    })
  );
  await Promise.all(pending);

  console.error(`[Sync] Updated offset for ${updates.length} unchanged block(s).`);
}
267
+
268
/**
 * Deletes Qdrant points by their IDs.
 *
 * Does nothing when the list is empty or no client is available.
 *
 * @param pointIds - IDs of the points to remove.
 */
export async function deletePointsByIds(pointIds: string[]): Promise<void> {
  await initVectorDB();
  if (pointIds.length === 0 || !client) {
    return;
  }

  const request = { wait: true, points: pointIds };
  await client.delete(COLLECTION_NAME, request);
  console.error(`Deleted ${pointIds.length} obsolete block(s) from Qdrant.`);
}
100
281
 
101
- export async function searchProjectMemory(folderId: string, query: string, topK: number = 3): Promise<any[]> {
282
+ /**
283
+ * Global semantic search — results are not filtered by folder or file.
284
+ */
285
+ export async function searchProjectMemory(
286
+ query: string,
287
+ topK: number = 3
288
+ ): Promise<any[]> {
102
289
  await initVectorDB();
103
290
  if (!client) throw new Error("Qdrant not initialized");
104
291
 
105
292
  try {
106
293
  const queryVector = await embedText(query);
107
-
108
294
  const results = await client.search(COLLECTION_NAME, {
109
295
  vector: queryVector,
110
296
  limit: topK,
111
297
  with_payload: true,
112
- filter: {
113
- must: [
114
- {
115
- key: "folderId",
116
- match: {
117
- value: folderId
118
- }
119
- }
120
- ]
121
- }
122
298
  });
123
299
 
124
- // Map to match LanceDB format expected by other tools
125
- return results.map(r => ({
300
+ return results.map((r) => ({
126
301
  id: r.id,
127
- vector: r.vector,
128
- ...r.payload
302
+ ...r.payload,
129
303
  }));
130
304
  } catch (err: any) {
131
305
  console.error("Qdrant search error:", err.message);
@@ -133,58 +307,74 @@ export async function searchProjectMemory(folderId: string, query: string, topK:
133
307
  }
134
308
  }
135
309
 
136
- export async function deleteProjectDocument(folderId: string, fileId: string): Promise<void> {
310
/**
 * Full-text search over the `text` payload field using a Qdrant `match.text`
 * filter, collected page by page with `scroll`.
 *
 * The term is lowercased before matching. Pages of at most 100 points are
 * fetched until `limit` results are collected or Qdrant reports no further
 * page.
 *
 * @param term - Text to match against the `text` payload field.
 * @param limit - Maximum number of results. Values that are not at least 1
 *   (including NaN) return an empty list without querying, because a page
 *   size below 1 is not a valid scroll request.
 * @returns Matching points as `{ id, ...payload }` objects (vectors omitted).
 * @throws Error if the Qdrant client is not available after initialization.
 */
export async function exactSearchChunks(
  term: string,
  limit: number = 50
): Promise<any[]> {
  await initVectorDB();
  if (!client) throw new Error("Qdrant not initialized");

  const maxResults = Math.floor(limit);
  if (!(maxResults >= 1)) return [];

  const filter = {
    must: [
      {
        key: "text",
        match: { text: term.toLowerCase() },
      },
    ],
  };

  const results: any[] = [];
  let offset: string | number | null | undefined = undefined;

  // Paginate until all matching points are collected or the limit is reached.
  do {
    const page: { points: any[]; next_page_offset?: string | number | null } =
      await (client as any).scroll(COLLECTION_NAME, {
        filter,
        with_payload: true,
        with_vector: false,
        limit: Math.min(100, maxResults - results.length),
        // Only send an offset once a previous page has produced one.
        ...(offset !== undefined ? { offset } : {}),
      });

    for (const point of page.points) {
      results.push({ id: point.id, ...point.payload });
    }
    offset = page.next_page_offset;
  } while (offset != null && results.length < maxResults);

  return results;
}
165
353
 
166
- export async function getProjectDocumentMetadata(folderId: string): Promise<Record<string, string>> {
354
/**
 * Stores a free-form agent note as a new Qdrant point.
 *
 * The note is embedded with `embedText` and written under a random UUID v4
 * (it has no file ID), with payload `title: "Agent Note"`, `source: "agent"`,
 * `block_index: 0`, an empty `block_hash` and `offset: 0`.
 *
 * @param text - Note text to embed and store.
 * @throws Error if the Qdrant client is not available after initialization.
 */
export async function upsertAgentNote(text: string): Promise<void> {
  await initVectorDB();
  if (!client) {
    throw new Error("Qdrant not initialized");
  }

  const vector = await embedText(text);
  const notePoint = {
    id: uuidv4(),
    vector,
    payload: {
      text,
      title: "Agent Note",
      block_index: 0,
      block_hash: "",
      source: "agent",
      offset: 0,
    },
  };

  await client.upsert(COLLECTION_NAME, { wait: true, points: [notePoint] });
  console.error("Upserted agent note to Qdrant.");
}