@khoinguyen2002/doc-mcp 1.0.4 → 1.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,87 @@
1
+ import { Redis } from "@upstash/redis";
2
+ import { config } from "../config.js";
3
+
4
/** Name of the Redis hash that holds one sync entry per file ID. */
const HASH_KEY = "doc_sync_state";

/** Lazily created Redis client, shared by every helper in this module. */
let _redis: Redis | null = null;

/**
 * Returns the shared Upstash Redis client.
 *
 * The client is built from `config.UPSTASH_REDIS_REST_URL` and
 * `config.UPSTASH_REDIS_REST_TOKEN` on the first call and reused afterwards.
 */
function getRedis(): Redis {
  if (_redis) {
    return _redis;
  }
  const { UPSTASH_REDIS_REST_URL: url, UPSTASH_REDIS_REST_TOKEN: token } =
    config;
  _redis = new Redis({ url, token });
  return _redis;
}
17
+
18
/**
 * Per-file record stored (JSON-encoded) in the sync-state Redis hash.
 */
export interface SyncEntry {
  /**
   * Modification timestamp, kept as a string.
   * NOTE(review): presumably the source file's last-modified time — confirm
   * against the code that writes these entries.
   */
  modifiedTime: string;
  /**
   * Block count recorded for the file.
   * NOTE(review): presumably the number of indexed blocks — confirm against
   * the code that writes these entries.
   */
  blockCount: number;
  /** Title recorded for the file. */
  title: string;
}
23
+
24
/**
 * Loads every entry of the sync-state hash, keyed by file ID.
 *
 * String values are JSON-parsed; any other truthy value is used as-is
 * (the Redis client may already have decoded it). Falsy values and strings
 * that fail to parse are left out of the result.
 *
 * @returns A map from file ID to its sync entry; empty when Redis returns
 *          nothing for the hash.
 */
export async function getAllSyncEntries(): Promise<Record<string, SyncEntry>> {
  const stored = await getRedis().hgetall(HASH_KEY);
  const entries: Record<string, SyncEntry> = {};
  if (!stored) return entries;

  Object.entries(stored).forEach(([fileId, value]) => {
    if (!value) return;
    if (typeof value !== "string") {
      entries[fileId] = value as unknown as SyncEntry;
      return;
    }
    try {
      entries[fileId] = JSON.parse(value) as SyncEntry;
    } catch {
      // Malformed JSON: drop this entry and keep going.
    }
  });
  return entries;
}
43
+
44
/**
 * Reads the sync entry stored for a single file.
 *
 * @param fileId Field name inside the sync-state hash.
 * @returns The entry, or `null` when the field is absent/falsy or holds a
 *          string that is not valid JSON.
 */
export async function getSyncEntry(fileId: string): Promise<SyncEntry | null> {
  const stored = await getRedis().hget(HASH_KEY, fileId);
  if (!stored) {
    return null;
  }
  // Non-string values are taken as an already-decoded entry.
  if (typeof stored !== "string") {
    return stored as unknown as SyncEntry;
  }
  try {
    return JSON.parse(stored) as SyncEntry;
  } catch {
    return null;
  }
}
56
+
57
/**
 * Stores (or overwrites) the sync entry of a file as a JSON string.
 *
 * @param fileId Field name inside the sync-state hash.
 * @param entry  Entry to serialize and store.
 */
export async function setSyncEntry(
  fileId: string,
  entry: SyncEntry
): Promise<void> {
  const store = getRedis();
  const serialized = JSON.stringify(entry);
  await store.hset(HASH_KEY, { [fileId]: serialized });
}
64
+
65
/**
 * Removes the sync entry of a file from the sync-state hash.
 *
 * @param fileId Field name to delete.
 */
export async function deleteSyncEntry(fileId: string): Promise<void> {
  const store = getRedis();
  await store.hdel(HASH_KEY, fileId);
}
69
+
70
// ─── Image Description Cache ──────────────────────────────────────────────────
// Global hash: md5(imageBinary) → description text
// Deduplicates across docs (same image used in multiple files reuses description)
const IMG_DESC_KEY = "img_desc";

/**
 * Looks up the cached description for an image hash.
 *
 * @param imageHash Field name inside the image-description hash.
 * @returns The cached description, or `null` when nothing (or an empty
 *          string) is stored for the hash.
 */
export async function getImageDesc(imageHash: string): Promise<string | null> {
  const redis = getRedis();
  const raw: unknown = await redis.hget(IMG_DESC_KEY, imageHash);

  // Missing field or empty text: report a cache miss.
  if (raw === null || raw === undefined || raw === "") return null;
  if (typeof raw === "string") return raw;

  // The Upstash client JSON-decodes responses by default, so a description
  // whose text happens to be valid JSON (e.g. `{"a":1}`, `0`, `false`) can
  // come back as an object, number or boolean. Re-encode objects instead of
  // letting String() produce "[object Object]"; the re-encoded text may differ
  // from the stored text in whitespace only.
  if (typeof raw === "object") return JSON.stringify(raw);
  return String(raw);
}
80
+
81
/**
 * Caches the description text for an image hash, overwriting any previous one.
 *
 * @param imageHash   Field name inside the image-description hash.
 * @param description Description text to store.
 */
export async function setImageDesc(
  imageHash: string,
  description: string
): Promise<void> {
  const store = getRedis();
  const fields = { [imageHash]: description };
  await store.hset(IMG_DESC_KEY, fields);
}
package/src/db/vector.ts CHANGED
@@ -1,9 +1,20 @@
1
- import { QdrantClient } from '@qdrant/js-client-rest';
2
- import { v4 as uuidv4 } from 'uuid';
3
- import { config } from '../config.js';
1
+ import { QdrantClient } from "@qdrant/js-client-rest";
2
+ import { v4 as uuidv4, v5 as uuidv5 } from "uuid";
3
+ import { config } from "../config.js";
4
4
 
5
5
  let client: QdrantClient | null = null;
6
- const COLLECTION_NAME = 'project_memory';
6
+ const COLLECTION_NAME = "project_memory";
7
+
8
// Fixed namespace for deterministic point IDs (uuid v5)
const POINT_NAMESPACE = "1b671a64-40d5-491e-99b0-da01ff1f3341";

/**
 * Builds the deterministic Qdrant point ID of a block.
 *
 * The ID is the UUID v5 of `"<fileId>:<blockIndex>"` under a fixed namespace,
 * so the same file/block pair always maps to the same ID and an upsert
 * overwrites the existing point.
 *
 * @param fileId     ID of the file the block belongs to.
 * @param blockIndex Index of the block.
 * @returns The point ID as a UUID string.
 */
export function getBlockPointId(fileId: string, blockIndex: number): string {
  const name = fileId + ":" + blockIndex;
  return uuidv5(name, POINT_NAMESPACE);
}
7
18
 
8
19
  export async function initVectorDB() {
9
20
  if (!client) {
@@ -13,22 +24,18 @@ export async function initVectorDB() {
13
24
  });
14
25
  console.error(`Connected to Qdrant at ${config.QDRANT_URL}`);
15
26
 
16
- // Check if collection exists
17
27
  const res = await client.getCollections();
18
- const exists = res.collections.some(c => c.name === COLLECTION_NAME);
28
+ const exists = res.collections.some((c) => c.name === COLLECTION_NAME);
19
29
  if (!exists) {
20
30
  console.error(`Creating Qdrant collection: ${COLLECTION_NAME}`);
21
31
  const dummyVector = await embedText("test");
22
32
  const dimension = dummyVector.length;
23
33
 
24
34
  await client.createCollection(COLLECTION_NAME, {
25
- vectors: {
26
- size: dimension,
27
- distance: "Cosine",
28
- },
35
+ vectors: { size: dimension, distance: "Cosine" },
29
36
  });
30
37
  await client.createPayloadIndex(COLLECTION_NAME, {
31
- field_name: "folderId",
38
+ field_name: "source",
32
39
  field_schema: "keyword",
33
40
  });
34
41
  await client.createPayloadIndex(COLLECTION_NAME, {
@@ -36,96 +43,269 @@ export async function initVectorDB() {
36
43
  field_schema: "keyword",
37
44
  });
38
45
  await client.createPayloadIndex(COLLECTION_NAME, {
39
- field_name: "source",
46
+ field_name: "block_index",
47
+ field_schema: "integer",
48
+ });
49
+ await client.createPayloadIndex(COLLECTION_NAME, {
50
+ field_name: "block_hash",
40
51
  field_schema: "keyword",
41
52
  });
42
- console.error(`Collection ${COLLECTION_NAME} created with dimension ${dimension}.`);
53
+ // Full-text index on `text` payload for exact/keyword search.
54
+ // whitespace tokenizer keeps API paths (e.g. /v1/foo/bar) as single tokens.
55
+ // lowercase=true makes searches case-insensitive.
56
+ await client.createPayloadIndex(COLLECTION_NAME, {
57
+ field_name: "text",
58
+ field_schema: {
59
+ type: "text",
60
+ tokenizer: "whitespace",
61
+ min_token_len: 2,
62
+ max_token_len: 200,
63
+ lowercase: true,
64
+ } as any,
65
+ });
66
+ console.error(
67
+ `Collection ${COLLECTION_NAME} created with dimension ${dimension}.`
68
+ );
43
69
  }
44
70
  }
45
71
  }
46
72
 
47
- export async function embedText(text: string): Promise<number[]> {
48
- const response = await fetch("https://openrouter.ai/api/v1/embeddings", {
49
- method: "POST",
50
- headers: {
51
- "Authorization": `Bearer ${config.OPENROUTER_API_KEY}`,
52
- "Content-Type": "application/json"
53
- },
54
- body: JSON.stringify({
55
- model: config.EMBEDDING_MODEL_ID,
56
- input: text
57
- })
58
- });
73
/**
 * Embeds a single text through the OpenRouter embeddings endpoint.
 *
 * Transient failures (network errors, HTTP 408/429/5xx, malformed response
 * bodies) are retried with exponential backoff plus jitter. Other 4xx
 * responses (bad request, invalid key, ...) are thrown immediately, because
 * repeating the same request cannot succeed.
 *
 * @param text       Text to embed.
 * @param maxRetries Maximum number of attempts (default 5).
 * @returns The embedding vector of `text`.
 * @throws Error when the API keeps failing, answers with a non-retryable
 *         status, or `maxRetries` is not positive.
 */
export async function embedText(
  text: string,
  maxRetries = 5
): Promise<number[]> {
  for (let attempt = 0; attempt < maxRetries; attempt++) {
    const canRetry = attempt < maxRetries - 1;
    // Set when the server rejected the request itself, so retrying is useless.
    let permanent = false;
    try {
      const response = await fetch("https://openrouter.ai/api/v1/embeddings", {
        method: "POST",
        headers: {
          Authorization: `Bearer ${config.OPENROUTER_API_KEY}`,
          "Content-Type": "application/json",
        },
        body: JSON.stringify({
          model: config.EMBEDDING_MODEL_ID,
          input: text,
        }),
      });

      if (!response.ok) {
        if (response.status === 429 && canRetry) {
          const delay = Math.pow(2, attempt) * 1000 + Math.random() * 1000;
          console.error(
            `[Rate Limit] OpenRouter 429. Retrying in ${Math.round(delay)}ms... (Attempt ${attempt + 1}/${maxRetries})`
          );
          await new Promise((res) => setTimeout(res, delay));
          continue;
        }
        const errText = await response.text();
        // Client errors other than timeout / rate limit will not go away.
        permanent =
          response.status >= 400 &&
          response.status < 500 &&
          response.status !== 408 &&
          response.status !== 429;
        throw new Error(
          `OpenRouter Embedding API failed: ${response.status} ${errText}`
        );
      }

      const json: any = await response.json();
      if (!json.data || !json.data[0] || !json.data[0].embedding) {
        throw new Error(
          `Invalid response from OpenRouter: ${JSON.stringify(json)}`
        );
      }
      return json.data[0].embedding;
    } catch (err: unknown) {
      if (permanent || !canRetry) throw err;
      const message = err instanceof Error ? err.message : String(err);
      const delay = Math.pow(2, attempt) * 1000 + Math.random() * 1000;
      console.error(
        `[Error] ${message}. Retrying in ${Math.round(delay)}ms... (Attempt ${attempt + 1}/${maxRetries})`
      );
      await new Promise((res) => setTimeout(res, delay));
    }
  }
  throw new Error("Max retries reached for embedding");
}
124
+
125
/**
 * Embeds several texts in a single OpenRouter API call.
 *
 * The endpoint accepts `input: string[]` and answers with one
 * `data[i].embedding` per input. The result is validated so that callers get
 * exactly one vector per text, in input order.
 *
 * Transient failures (network errors, HTTP 408/429/5xx, malformed or
 * incomplete response bodies) are retried with exponential backoff plus
 * jitter. Other 4xx responses are thrown immediately.
 *
 * @param texts      Texts to embed; an empty array returns `[]` without a call.
 * @param maxRetries Maximum number of attempts (default 5).
 * @returns One embedding per input text, aligned with `texts`.
 * @throws Error when the API keeps failing, answers with a non-retryable
 *         status, or `maxRetries` is not positive.
 */
export async function embedBatch(
  texts: string[],
  maxRetries = 5
): Promise<number[][]> {
  if (texts.length === 0) return [];

  for (let attempt = 0; attempt < maxRetries; attempt++) {
    const canRetry = attempt < maxRetries - 1;
    // Set when the server rejected the request itself, so retrying is useless.
    let permanent = false;
    try {
      const response = await fetch("https://openrouter.ai/api/v1/embeddings", {
        method: "POST",
        headers: {
          Authorization: `Bearer ${config.OPENROUTER_API_KEY}`,
          "Content-Type": "application/json",
        },
        body: JSON.stringify({
          model: config.EMBEDDING_MODEL_ID,
          input: texts,
        }),
      });

      if (!response.ok) {
        if (response.status === 429 && canRetry) {
          const delay = Math.pow(2, attempt) * 1000 + Math.random() * 1000;
          console.error(
            `[Rate Limit] OpenRouter 429 (batch). Retrying in ${Math.round(delay)}ms... (Attempt ${attempt + 1}/${maxRetries})`
          );
          await new Promise((res) => setTimeout(res, delay));
          continue;
        }
        const errText = await response.text();
        // Client errors other than timeout / rate limit will not go away.
        permanent =
          response.status >= 400 &&
          response.status < 500 &&
          response.status !== 408 &&
          response.status !== 429;
        throw new Error(
          `OpenRouter Batch Embedding API failed: ${response.status} ${errText}`
        );
      }

      const json: any = await response.json();
      if (!json.data || !Array.isArray(json.data)) {
        throw new Error(
          `Invalid batch response from OpenRouter: ${JSON.stringify(json)}`
        );
      }

      // OpenAI-compatible responses tag each item with the `index` of its
      // input; when every item carries one, order by it instead of trusting
      // the array order. NOTE(review): confirm OpenRouter always sets `index`.
      const items: any[] = json.data;
      const ordered = items.every((item) => typeof item?.index === "number")
        ? [...items].sort((a, b) => a.index - b.index)
        : items;

      const vectors: number[][] = ordered.map((item) => item?.embedding);
      const usable = vectors.filter((v) => Array.isArray(v)).length;
      if (vectors.length !== texts.length || usable !== texts.length) {
        // A short or partially empty answer would misalign vectors and texts.
        throw new Error(
          `Invalid batch response from OpenRouter: expected ${texts.length} embedding(s), received ${usable}`
        );
      }
      return vectors;
    } catch (err: unknown) {
      if (permanent || !canRetry) throw err;
      const message = err instanceof Error ? err.message : String(err);
      const delay = Math.pow(2, attempt) * 1000 + Math.random() * 1000;
      console.error(
        `[Error] ${message}. Retrying in ${Math.round(delay)}ms... (Attempt ${attempt + 1}/${maxRetries})`
      );
      await new Promise((res) => setTimeout(res, delay));
    }
  }
  throw new Error("Max retries reached for batch embedding");
}
69
182
 
70
- return json.data[0].embedding;
183
/**
 * One chunk ready to be written to Qdrant: the point ID, its vector and the
 * values that become the point's payload.
 */
export interface ChunkUpsert {
  /** Qdrant point ID. */
  pointId: string;
  /** Embedding vector stored for the point. */
  vector: number[];
  /** Chunk text (payload `text`). */
  text: string;
  /** Title (payload `title`). */
  title: string;
  /** ID of the originating file (payload `file_id`). */
  fileId: string;
  /** Index of the block (payload `block_index`). */
  blockIndex: number;
  /** Hash of the block (payload `block_hash`). */
  blockHash: string;
  /** Origin label of the chunk (payload `source`). */
  source: string;
  /** Character offset in the Markdown string (payload `offset`). */
  offset: number;
}
72
194
 
73
- export async function upsertProjectDocument(folderId: string, text: string, metadata: Record<string, any> = {}): Promise<void> {
195
/**
 * Bulk-upserts several chunks into Qdrant with a single HTTP call.
 *
 * @param chunks Chunks to write; an empty list is a no-op (after the
 *               collection has been initialised).
 * @throws Error when the Qdrant client is not available after initialisation.
 */
export async function upsertChunkBatch(chunks: ChunkUpsert[]): Promise<void> {
  await initVectorDB();
  if (!client) throw new Error("Qdrant not initialized");
  if (chunks.length === 0) return;

  // Translate each chunk into a Qdrant point with snake_case payload keys.
  const points = chunks.map(
    ({
      pointId,
      vector,
      text,
      title,
      fileId,
      blockIndex,
      blockHash,
      source,
      offset,
    }) => ({
      id: pointId,
      vector,
      payload: {
        text,
        title,
        file_id: fileId,
        block_index: blockIndex,
        block_hash: blockHash,
        source,
        offset,
      },
    })
  );

  await client.upsert(COLLECTION_NAME, { wait: true, points });
  console.error(`Upserted ${chunks.length} chunk(s) to Qdrant.`);
}
221
+
222
/**
 * Fetches `block_hash` and `offset` for a list of point IDs.
 *
 * The hash is used to diff block-level changes during re-sync; the offset is
 * used to detect stale offsets in unchanged blocks.
 *
 * @param pointIds Qdrant point IDs to look up.
 * @returns A map from point ID to `{ hash, offset }`. Points without a
 *          `block_hash` payload are omitted; a missing `offset` becomes 0.
 *          Empty when there is no client or no IDs were given.
 */
export async function getBlockMetaByIds(
  pointIds: string[]
): Promise<Record<string, { hash: string; offset: number }>> {
  await initVectorDB();
  if (!client || pointIds.length === 0) return {};

  const points = await client.retrieve(COLLECTION_NAME, {
    ids: pointIds,
    with_payload: ["block_hash", "offset"],
    with_vector: false,
  });

  return points.reduce<Record<string, { hash: string; offset: number }>>(
    (metaById, point) => {
      const hash = point.payload?.block_hash as string | undefined;
      if (hash === undefined) return metaById;
      const offset = point.payload?.offset as number | undefined;
      metaById[point.id as string] = { hash, offset: offset ?? 0 };
      return metaById;
    },
    {}
  );
}
100
249
 
101
- export async function searchProjectMemory(folderId: string, query: string, topK: number = 3): Promise<any[]> {
250
/**
 * Rewrites only the `offset` payload field of the given points, without
 * re-embedding them.
 *
 * Meant for unchanged blocks whose character position shifted because of
 * earlier edits. One `setPayload` request is issued per point, all in
 * parallel, and the function returns once every request has resolved.
 *
 * @param updates Point IDs paired with their new offsets; empty is a no-op.
 * @throws Error when the Qdrant client is not available after initialisation.
 */
export async function updateBlockOffsets(
  updates: { pointId: string; offset: number }[]
): Promise<void> {
  if (updates.length === 0) return;
  await initVectorDB();
  if (!client) throw new Error("Qdrant not initialized");

  const qdrant = client;
  const pending = updates.map((update) =>
    qdrant.setPayload(COLLECTION_NAME, {
      payload: { offset: update.offset },
      points: [update.pointId],
      wait: false, // do not wait for each write to be applied server-side
    })
  );
  await Promise.all(pending);

  console.error(`[Sync] Updated offset for ${updates.length} unchanged block(s).`);
}
273
+
274
/**
 * Deletes Qdrant points by their IDs.
 *
 * @param pointIds IDs of the points to remove; nothing happens when the list
 *                 is empty or no client is available.
 */
export async function deletePointsByIds(pointIds: string[]): Promise<void> {
  await initVectorDB();
  const nothingToDo = !client || pointIds.length === 0;
  if (nothingToDo) return;

  await client!.delete(COLLECTION_NAME, { wait: true, points: pointIds });
  console.error(`Deleted ${pointIds.length} obsolete block(s) from Qdrant.`);
}
287
+
288
+ /**
289
+ * Global semantic search — không filter theo folder hay file.
290
+ */
291
+ export async function searchProjectMemory(
292
+ query: string,
293
+ topK: number = 3
294
+ ): Promise<any[]> {
102
295
  await initVectorDB();
103
296
  if (!client) throw new Error("Qdrant not initialized");
104
297
 
105
298
  try {
106
299
  const queryVector = await embedText(query);
107
-
108
300
  const results = await client.search(COLLECTION_NAME, {
109
301
  vector: queryVector,
110
302
  limit: topK,
111
303
  with_payload: true,
112
- filter: {
113
- must: [
114
- {
115
- key: "folderId",
116
- match: {
117
- value: folderId
118
- }
119
- }
120
- ]
121
- }
122
304
  });
123
305
 
124
- // Map to match LanceDB format expected by other tools
125
- return results.map(r => ({
306
+ return results.map((r) => ({
126
307
  id: r.id,
127
- vector: r.vector,
128
- ...r.payload
308
+ ...r.payload,
129
309
  }));
130
310
  } catch (err: any) {
131
311
  console.error("Qdrant search error:", err.message);
@@ -133,58 +313,70 @@ export async function searchProjectMemory(folderId: string, query: string, topK:
133
313
  }
134
314
  }
135
315
 
136
- export async function deleteProjectDocument(folderId: string, fileId: string): Promise<void> {
316
/**
 * Exhaustive substring search: scrolls ALL points and filters client-side.
 *
 * More reliable than the Qdrant full-text filter: the whitespace tokenizer
 * does not strip surrounding punctuation, causing false negatives for terms
 * like "ServiceCode.mkp" appearing as "ServiceCode.mkp)" in headings.
 * For typical collection sizes (~few hundred chunks) the O(N) cost is
 * negligible.
 *
 * Matching is case-insensitive. An empty `term` matches every point that is
 * scanned.
 *
 * @param term  Substring to look for in each point's `text` payload.
 * @param limit Maximum number of matches to return (default 50); a value
 *              that is not greater than zero yields an empty result.
 * @returns Matching points as `{ id, ...payload }`, at most `limit` of them.
 * @throws Error when the Qdrant client is not available after initialisation.
 */
export async function exactSearchChunks(
  term: string,
  limit: number = 50
): Promise<any[]> {
  await initVectorDB();
  if (!client) throw new Error("Qdrant not initialized");

  const results: any[] = [];
  // The loop below runs its body before testing the limit, so a non-positive
  // (or NaN) limit has to be rejected up front.
  if (!(limit > 0)) return results;

  const lowerTerm = term.toLowerCase();
  let offset: string | number | null | undefined = undefined;

  do {
    const page: { points: any[]; next_page_offset?: string | number | null } =
      await (client as any).scroll(COLLECTION_NAME, {
        with_payload: true,
        with_vector: false,
        limit: 100,
        ...(offset !== undefined ? { offset } : {}),
      });

    for (const point of page.points) {
      // Points without a string `text` payload are compared as empty text
      // instead of throwing on `.toLowerCase()`.
      const rawText: unknown = point.payload?.text;
      const text = typeof rawText === "string" ? rawText.toLowerCase() : "";
      if (text.includes(lowerTerm)) {
        results.push({ id: point.id, ...point.payload });
        if (results.length >= limit) break;
      }
    }
    offset = page.next_page_offset;
  } while (offset != null && results.length < limit);

  return results;
}
165
355
 
166
- export async function getProjectDocumentMetadata(folderId: string): Promise<Record<string, string>> {
356
/**
 * Stores an agent note in Qdrant under a random UUID (notes have no file ID).
 *
 * The note text is embedded first and then written as a single point whose
 * payload marks it with `source: "agent"` and the title "Agent Note".
 *
 * @param text Note text to embed and store.
 * @throws Error when the Qdrant client is not available after initialisation.
 */
export async function upsertAgentNote(text: string): Promise<void> {
  await initVectorDB();
  if (!client) throw new Error("Qdrant not initialized");

  const vector = await embedText(text);
  const payload = {
    text,
    title: "Agent Note",
    block_index: 0,
    block_hash: "",
    source: "agent",
    offset: 0,
  };

  await client.upsert(COLLECTION_NAME, {
    wait: true,
    points: [{ id: uuidv4(), vector, payload }],
  });
  console.error("Upserted agent note to Qdrant.");
}