@khoinguyen2002/doc-mcp 1.0.4 → 1.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/config.d.ts +6 -4
- package/dist/config.d.ts.map +1 -1
- package/dist/config.js +22 -7
- package/dist/db/rateLimiter.d.ts +6 -0
- package/dist/db/rateLimiter.d.ts.map +1 -0
- package/dist/db/rateLimiter.js +20 -0
- package/dist/db/syncState.d.ts +12 -0
- package/dist/db/syncState.d.ts.map +1 -0
- package/dist/db/syncState.js +69 -0
- package/dist/db/vector.d.ts +64 -6
- package/dist/db/vector.d.ts.map +1 -1
- package/dist/db/vector.js +250 -107
- package/dist/mcp-server.js +44 -23
- package/dist/tools/driveTools.d.ts +20 -16
- package/dist/tools/driveTools.d.ts.map +1 -1
- package/dist/tools/driveTools.js +100 -149
- package/dist/tools/ingestFlow.d.ts +8 -0
- package/dist/tools/ingestFlow.d.ts.map +1 -0
- package/dist/tools/ingestFlow.js +408 -0
- package/dist/tools/knowledgeTools.d.ts +26 -5
- package/dist/tools/knowledgeTools.d.ts.map +1 -1
- package/dist/tools/knowledgeTools.js +31 -40
- package/package.json +8 -1
- package/src/config.ts +28 -9
- package/src/db/rateLimiter.ts +25 -0
- package/src/db/syncState.ts +87 -0
- package/src/db/vector.ts +305 -113
- package/src/mcp-server.ts +55 -33
- package/src/tools/driveTools.ts +111 -175
- package/src/tools/ingestFlow.ts +509 -0
- package/src/tools/knowledgeTools.ts +36 -38
- package/src/types/turndown-plugin-gfm.d.ts +8 -0
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
import { Redis } from "@upstash/redis";
|
|
2
|
+
import { config } from "../config.js";
|
|
3
|
+
|
|
4
|
+
/** Redis hash that holds one sync entry per file (hash field = fileId). */
const HASH_KEY = "doc_sync_state";

/** Lazily created Redis client, reused by every helper in this module. */
let _redis: Redis | null = null;

/**
 * Returns the shared Redis client, constructing it on first use from
 * `config.UPSTASH_REDIS_REST_URL` and `config.UPSTASH_REDIS_REST_TOKEN`.
 */
function getRedis(): Redis {
  if (_redis) {
    return _redis;
  }

  const created = new Redis({
    url: config.UPSTASH_REDIS_REST_URL,
    token: config.UPSTASH_REDIS_REST_TOKEN,
  });
  _redis = created;
  return created;
}
|
|
17
|
+
|
|
18
|
+
/**
 * Per-file bookkeeping record stored (JSON-encoded) in the sync-state hash.
 */
export interface SyncEntry {
  /** Modification timestamp recorded for the file — presumably the source document's `modifiedTime`; confirm against the ingest flow. */
  modifiedTime: string;
  /** Block count recorded for the file — presumably the number of blocks indexed at the last sync; verify against the caller. */
  blockCount: number;
  /** Title recorded for the file. */
  title: string;
}
|
|
23
|
+
|
|
24
|
+
/**
 * Normalizes one raw hash value into a `SyncEntry`.
 *
 * The value may arrive either as a JSON string or as an already-deserialized
 * value, so both forms are accepted. Anything that does not end up as a plain
 * object (malformed JSON, `null`, numbers, arrays, ...) is rejected instead of
 * being returned under a `SyncEntry` type it does not satisfy.
 *
 * @param raw Value read from the sync-state hash.
 * @returns The entry, or `null` when the value is missing or malformed.
 */
function parseSyncEntry(raw: unknown): SyncEntry | null {
  let candidate: unknown = raw;
  if (typeof raw === "string") {
    try {
      candidate = JSON.parse(raw);
    } catch {
      // Malformed JSON is deliberately treated as "no entry".
      return null;
    }
  }

  if (
    typeof candidate !== "object" ||
    candidate === null ||
    Array.isArray(candidate)
  ) {
    return null;
  }
  return candidate as SyncEntry;
}

/**
 * Reads every sync entry from the sync-state hash.
 *
 * @returns Map of fileId → entry. Missing or malformed values are skipped;
 *          an empty object is returned when the hash does not exist.
 */
export async function getAllSyncEntries(): Promise<Record<string, SyncEntry>> {
  const redis = getRedis();
  const raw = await redis.hgetall(HASH_KEY);
  if (!raw) return {};

  const result: Record<string, SyncEntry> = {};
  for (const [fileId, value] of Object.entries(raw)) {
    const entry = parseSyncEntry(value);
    if (entry) {
      result[fileId] = entry;
    }
  }
  return result;
}

/**
 * Reads the sync entry of a single file.
 *
 * @param fileId Hash field to look up.
 * @returns The entry, or `null` when it is missing or malformed.
 */
export async function getSyncEntry(fileId: string): Promise<SyncEntry | null> {
  const redis = getRedis();
  const raw = await redis.hget(HASH_KEY, fileId);
  return parseSyncEntry(raw);
}
|
|
56
|
+
|
|
57
|
+
/**
 * Stores (or overwrites) the sync entry of one file.
 *
 * @param fileId Hash field under which the entry is written.
 * @param entry  Entry to persist; it is written as a JSON string.
 */
export async function setSyncEntry(
  fileId: string,
  entry: SyncEntry
): Promise<void> {
  const store = getRedis();
  const fields = { [fileId]: JSON.stringify(entry) };
  await store.hset(HASH_KEY, fields);
}
|
|
64
|
+
|
|
65
|
+
/**
 * Removes the sync entry of one file from the sync-state hash.
 *
 * @param fileId Hash field to delete.
 */
export async function deleteSyncEntry(fileId: string): Promise<void> {
  await getRedis().hdel(HASH_KEY, fileId);
}
|
|
69
|
+
|
|
70
|
+
// ─── Image Description Cache ──────────────────────────────────────────────────
// Global hash: md5(imageBinary) → description text
// Deduplicates across docs (same image used in multiple files reuses description)
const IMG_DESC_KEY = "img_desc";

/**
 * Looks up the cached description of an image.
 *
 * The stored value is a description string, but the Redis client may hand it
 * back JSON-deserialized (NOTE(review): this is the documented default of the
 * Upstash client — confirm the client configuration). A description that
 * happens to be valid JSON, such as "0", "false" or `{"a":1}`, therefore
 * arrives as a number, boolean or object, so the value is converted back to
 * text explicitly rather than relying on truthiness and `String()`.
 *
 * @param imageHash Hash field identifying the image.
 * @returns The description, or `null` when nothing (or an empty string) is cached.
 */
export async function getImageDesc(imageHash: string): Promise<string | null> {
  const redis = getRedis();
  const raw: unknown = await redis.hget(IMG_DESC_KEY, imageHash);

  if (raw === null || raw === undefined || raw === "") return null;
  if (typeof raw === "string") return raw;
  // Objects/arrays would stringify to "[object Object]"; re-encode them instead.
  if (typeof raw === "object") return JSON.stringify(raw);
  return String(raw);
}
|
|
80
|
+
|
|
81
|
+
/**
 * Caches the description of an image.
 *
 * @param imageHash   Hash field identifying the image.
 * @param description Description text to store.
 */
export async function setImageDesc(
  imageHash: string,
  description: string
): Promise<void> {
  const store = getRedis();
  const fields = { [imageHash]: description };
  await store.hset(IMG_DESC_KEY, fields);
}
|
package/src/db/vector.ts
CHANGED
|
@@ -1,9 +1,20 @@
|
|
|
1
|
-
import { QdrantClient } from
|
|
2
|
-
import { v4 as uuidv4 } from
|
|
3
|
-
import { config } from
|
|
1
|
+
import { QdrantClient } from "@qdrant/js-client-rest";
|
|
2
|
+
import { v4 as uuidv4, v5 as uuidv5 } from "uuid";
|
|
3
|
+
import { config } from "../config.js";
|
|
4
4
|
|
|
5
5
|
let client: QdrantClient | null = null;
|
|
6
|
-
const COLLECTION_NAME =
|
|
6
|
+
const COLLECTION_NAME = "project_memory";
|
|
7
|
+
|
|
8
|
+
// Constant UUID namespace used to derive point IDs with uuid v5.
const POINT_NAMESPACE = "1b671a64-40d5-491e-99b0-da01ff1f3341";

/**
 * Derives the point ID of a block as `uuidv5("<fileId>:<blockIndex>", POINT_NAMESPACE)`.
 *
 * The ID depends only on its inputs, so writing the same block again targets
 * the same point.
 *
 * @param fileId     Identifier of the file the block belongs to.
 * @param blockIndex Index of the block within that file.
 * @returns The derived UUID string.
 */
export function getBlockPointId(fileId: string, blockIndex: number): string {
  const seed = `${fileId}:${blockIndex}`;
  return uuidv5(seed, POINT_NAMESPACE);
}
|
|
7
18
|
|
|
8
19
|
export async function initVectorDB() {
|
|
9
20
|
if (!client) {
|
|
@@ -13,22 +24,18 @@ export async function initVectorDB() {
|
|
|
13
24
|
});
|
|
14
25
|
console.error(`Connected to Qdrant at ${config.QDRANT_URL}`);
|
|
15
26
|
|
|
16
|
-
// Check if collection exists
|
|
17
27
|
const res = await client.getCollections();
|
|
18
|
-
const exists = res.collections.some(c => c.name === COLLECTION_NAME);
|
|
28
|
+
const exists = res.collections.some((c) => c.name === COLLECTION_NAME);
|
|
19
29
|
if (!exists) {
|
|
20
30
|
console.error(`Creating Qdrant collection: ${COLLECTION_NAME}`);
|
|
21
31
|
const dummyVector = await embedText("test");
|
|
22
32
|
const dimension = dummyVector.length;
|
|
23
33
|
|
|
24
34
|
await client.createCollection(COLLECTION_NAME, {
|
|
25
|
-
vectors: {
|
|
26
|
-
size: dimension,
|
|
27
|
-
distance: "Cosine",
|
|
28
|
-
},
|
|
35
|
+
vectors: { size: dimension, distance: "Cosine" },
|
|
29
36
|
});
|
|
30
37
|
await client.createPayloadIndex(COLLECTION_NAME, {
|
|
31
|
-
field_name: "
|
|
38
|
+
field_name: "source",
|
|
32
39
|
field_schema: "keyword",
|
|
33
40
|
});
|
|
34
41
|
await client.createPayloadIndex(COLLECTION_NAME, {
|
|
@@ -36,96 +43,269 @@ export async function initVectorDB() {
|
|
|
36
43
|
field_schema: "keyword",
|
|
37
44
|
});
|
|
38
45
|
await client.createPayloadIndex(COLLECTION_NAME, {
|
|
39
|
-
field_name: "
|
|
46
|
+
field_name: "block_index",
|
|
47
|
+
field_schema: "integer",
|
|
48
|
+
});
|
|
49
|
+
await client.createPayloadIndex(COLLECTION_NAME, {
|
|
50
|
+
field_name: "block_hash",
|
|
40
51
|
field_schema: "keyword",
|
|
41
52
|
});
|
|
42
|
-
|
|
53
|
+
// Full-text index on `text` payload for exact/keyword search.
|
|
54
|
+
// whitespace tokenizer keeps API paths (e.g. /v1/foo/bar) as single tokens.
|
|
55
|
+
// lowercase=true makes searches case-insensitive.
|
|
56
|
+
await client.createPayloadIndex(COLLECTION_NAME, {
|
|
57
|
+
field_name: "text",
|
|
58
|
+
field_schema: {
|
|
59
|
+
type: "text",
|
|
60
|
+
tokenizer: "whitespace",
|
|
61
|
+
min_token_len: 2,
|
|
62
|
+
max_token_len: 200,
|
|
63
|
+
lowercase: true,
|
|
64
|
+
} as any,
|
|
65
|
+
});
|
|
66
|
+
console.error(
|
|
67
|
+
`Collection ${COLLECTION_NAME} created with dimension ${dimension}.`
|
|
68
|
+
);
|
|
43
69
|
}
|
|
44
70
|
}
|
|
45
71
|
}
|
|
46
72
|
|
|
47
|
-
export async function embedText(
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
73
|
+
/** Marks an HTTP failure that a retry cannot fix (4xx other than 408/429). */
class PermanentEmbeddingError extends Error {}

/**
 * Sends one embeddings request to OpenRouter and returns the embeddings in
 * input order. Shared by `embedText` (string input) and `embedBatch`
 * (string[] input); log and error messages differ only by the batch wording.
 *
 * Retry policy (exponential backoff with jitter, at most `maxRetries` attempts):
 * - 429, 408, 5xx, network errors and malformed 200 responses are retried;
 * - any other 4xx (bad request, bad API key, ...) fails immediately, because
 *   repeating the same request cannot succeed.
 */
async function requestEmbeddings(
  input: string | string[],
  maxRetries: number
): Promise<number[][]> {
  const isBatch = Array.isArray(input);
  const expectedCount = Array.isArray(input) ? input.length : 1;
  const rateLimitTag = isBatch ? "OpenRouter 429 (batch)" : "OpenRouter 429";
  const apiName = isBatch
    ? "OpenRouter Batch Embedding API"
    : "OpenRouter Embedding API";
  const invalidPrefix = isBatch
    ? "Invalid batch response from OpenRouter"
    : "Invalid response from OpenRouter";

  const backoffMs = (attempt: number): number =>
    Math.pow(2, attempt) * 1000 + Math.random() * 1000;
  const sleep = (ms: number): Promise<void> =>
    new Promise((resolve) => setTimeout(resolve, ms));

  for (let attempt = 0; attempt < maxRetries; attempt++) {
    const isLastAttempt = attempt >= maxRetries - 1;
    try {
      const response = await fetch("https://openrouter.ai/api/v1/embeddings", {
        method: "POST",
        headers: {
          Authorization: `Bearer ${config.OPENROUTER_API_KEY}`,
          "Content-Type": "application/json",
        },
        body: JSON.stringify({
          model: config.EMBEDDING_MODEL_ID,
          input,
        }),
      });

      if (!response.ok) {
        const status = response.status;
        if (status === 429 && !isLastAttempt) {
          const delay = backoffMs(attempt);
          console.error(
            `[Rate Limit] ${rateLimitTag}. Retrying in ${Math.round(delay)}ms... (Attempt ${attempt + 1}/${maxRetries})`
          );
          await sleep(delay);
          continue;
        }
        const errText = await response.text();
        const message = `${apiName} failed: ${status} ${errText}`;
        const permanent =
          status >= 400 && status < 500 && status !== 408 && status !== 429;
        throw permanent
          ? new PermanentEmbeddingError(message)
          : new Error(message);
      }

      const json: any = await response.json();
      const items: any[] | null = Array.isArray(json?.data) ? json.data : null;
      // A batch must return exactly one embedding per input; a single request
      // only needs the first item.
      const relevant =
        items === null ? null : isBatch ? items : items.slice(0, 1);
      const valid =
        relevant !== null &&
        relevant.length === expectedCount &&
        relevant.every((item) => Array.isArray(item?.embedding));
      if (relevant === null || !valid) {
        throw new Error(`${invalidPrefix}: ${JSON.stringify(json)}`);
      }

      // Items carry an `index` naming the input they belong to; honour it when
      // present so the result always lines up with the caller's inputs.
      const ordered = relevant.every((item) => Number.isInteger(item.index))
        ? [...relevant].sort((a, b) => a.index - b.index)
        : relevant;
      return ordered.map((item) => item.embedding as number[]);
    } catch (err: unknown) {
      if (err instanceof PermanentEmbeddingError || isLastAttempt) throw err;
      const reason = err instanceof Error ? err.message : String(err);
      const delay = backoffMs(attempt);
      console.error(
        `[Error] ${reason}. Retrying in ${Math.round(delay)}ms... (Attempt ${attempt + 1}/${maxRetries})`
      );
      await sleep(delay);
    }
  }

  throw new Error(
    isBatch
      ? "Max retries reached for batch embedding"
      : "Max retries reached for embedding"
  );
}

/**
 * Embeds a single text through the OpenRouter embeddings endpoint.
 *
 * @param text       Text to embed.
 * @param maxRetries Maximum number of attempts (default 5).
 * @returns The embedding vector.
 * @throws Error when the request fails permanently or every attempt fails.
 */
export async function embedText(
  text: string,
  maxRetries = 5
): Promise<number[]> {
  const vectors = await requestEmbeddings(text, maxRetries);
  return vectors[0];
}

/**
 * Embeds several texts in a single API call (batch).
 *
 * @param texts      Texts to embed; an empty array returns `[]` without a request.
 * @param maxRetries Maximum number of attempts (default 5).
 * @returns One embedding per input text, in input order.
 * @throws Error when the request fails permanently, the response does not
 *         contain one embedding per input, or every attempt fails.
 */
export async function embedBatch(
  texts: string[],
  maxRetries = 5
): Promise<number[][]> {
  if (texts.length === 0) return [];
  return requestEmbeddings(texts, maxRetries);
}
|
|
69
182
|
|
|
70
|
-
|
|
183
|
+
/**
 * One chunk to write to the vector store: the point ID, its vector, and the
 * values stored in the point's payload.
 */
export interface ChunkUpsert {
  /** ID of the point to create or overwrite. */
  pointId: string;
  /** Embedding vector stored for the point. */
  vector: number[];
  /** Chunk text (payload field `text`). */
  text: string;
  /** Document title (payload field `title`). */
  title: string;
  /** Source file identifier (payload field `file_id`). */
  fileId: string;
  /** Index of the block within its file (payload field `block_index`). */
  blockIndex: number;
  /** Hash of the block (payload field `block_hash`). */
  blockHash: string;
  /** Origin label of the chunk (payload field `source`). */
  source: string;
  /** Character offset in the Markdown string (payload field `offset`). */
  offset: number;
}
|
|
72
194
|
|
|
73
|
-
|
|
195
|
+
/**
 * Bulk-upserts several chunks into Qdrant with a single upsert call.
 *
 * @param chunks Chunks to write; an empty array is a no-op once the client is initialized.
 * @throws Error when the Qdrant client is not initialized.
 */
export async function upsertChunkBatch(chunks: ChunkUpsert[]): Promise<void> {
  await initVectorDB();
  if (!client) {
    throw new Error("Qdrant not initialized");
  }
  if (chunks.length === 0) {
    return;
  }

  const points = [];
  for (const chunk of chunks) {
    points.push({
      id: chunk.pointId,
      vector: chunk.vector,
      payload: {
        text: chunk.text,
        title: chunk.title,
        file_id: chunk.fileId,
        block_index: chunk.blockIndex,
        block_hash: chunk.blockHash,
        source: chunk.source,
        offset: chunk.offset,
      },
    });
  }

  await client.upsert(COLLECTION_NAME, { wait: true, points });
  console.error(`Upserted ${chunks.length} chunk(s) to Qdrant.`);
}
|
|
221
|
+
|
|
222
|
+
/**
 * Retrieves the `block_hash` and `offset` payload fields for the given points.
 *
 * Per the original author's note, the hash is used to diff block-level changes
 * during re-sync and the offset to detect stale offsets in unchanged blocks.
 *
 * @param pointIds IDs of the points to look up.
 * @returns Map of point ID → `{ hash, offset }`. Points without a `block_hash`
 *          are omitted; a missing `offset` is reported as 0. Returns `{}` when
 *          `pointIds` is empty or the client is not initialized.
 */
export async function getBlockMetaByIds(
  pointIds: string[]
): Promise<Record<string, { hash: string; offset: number }>> {
  await initVectorDB();
  if (!client || pointIds.length === 0) {
    return {};
  }

  const points = await client.retrieve(COLLECTION_NAME, {
    ids: pointIds,
    with_payload: ["block_hash", "offset"],
    with_vector: false,
  });

  const metaMap: Record<string, { hash: string; offset: number }> = {};
  for (const { id, payload } of points) {
    const hash = payload?.block_hash as string | undefined;
    if (hash === undefined) {
      continue;
    }
    const storedOffset = payload?.offset as number | undefined;
    metaMap[id as string] = { hash, offset: storedOffset ?? 0 };
  }
  return metaMap;
}
|
|
100
249
|
|
|
101
|
-
|
|
250
|
+
/**
 * Updates only the `offset` payload field of a set of points (no re-embedding).
 *
 * Per the original author's note, this is called for unchanged blocks whose
 * character position shifted because of earlier edits.
 *
 * Each point needs its own `setPayload` call because every point gets a
 * different offset. The calls are issued in bounded waves rather than all at
 * once, so a document with many shifted blocks cannot open an unbounded number
 * of simultaneous HTTP requests.
 *
 * @param updates Point ID / new offset pairs; an empty array is a no-op.
 * @throws Error when the Qdrant client is not initialized, or when any
 *         `setPayload` request fails.
 */
export async function updateBlockOffsets(
  updates: { pointId: string; offset: number }[]
): Promise<void> {
  if (updates.length === 0) return;
  await initVectorDB();
  if (!client) throw new Error("Qdrant not initialized");

  const qdrant = client;
  const maxInFlight = 32;

  for (let start = 0; start < updates.length; start += maxInFlight) {
    const wave = updates.slice(start, start + maxInFlight);
    await Promise.all(
      wave.map(({ pointId, offset }) =>
        qdrant.setPayload(COLLECTION_NAME, {
          payload: { offset },
          points: [pointId],
          // wait: false — the request is awaited, but the server is not asked
          // to wait for the change to be applied before responding.
          wait: false,
        })
      )
    );
  }
  console.error(`[Sync] Updated offset for ${updates.length} unchanged block(s).`);
}
|
|
273
|
+
|
|
274
|
+
/**
 * Deletes Qdrant points by a list of IDs.
 *
 * @param pointIds IDs of the points to delete. Nothing happens when the list
 *                 is empty or the client is not initialized.
 */
export async function deletePointsByIds(pointIds: string[]): Promise<void> {
  await initVectorDB();

  const nothingToDo = !client || pointIds.length === 0;
  if (nothingToDo) {
    return;
  }

  const request = { wait: true, points: pointIds };
  await client.delete(COLLECTION_NAME, request);
  console.error(`Deleted ${pointIds.length} obsolete block(s) from Qdrant.`);
}
|
|
287
|
+
|
|
288
|
+
/**
|
|
289
|
+
* Global semantic search — không filter theo folder hay file.
|
|
290
|
+
*/
|
|
291
|
+
export async function searchProjectMemory(
|
|
292
|
+
query: string,
|
|
293
|
+
topK: number = 3
|
|
294
|
+
): Promise<any[]> {
|
|
102
295
|
await initVectorDB();
|
|
103
296
|
if (!client) throw new Error("Qdrant not initialized");
|
|
104
297
|
|
|
105
298
|
try {
|
|
106
299
|
const queryVector = await embedText(query);
|
|
107
|
-
|
|
108
300
|
const results = await client.search(COLLECTION_NAME, {
|
|
109
301
|
vector: queryVector,
|
|
110
302
|
limit: topK,
|
|
111
303
|
with_payload: true,
|
|
112
|
-
filter: {
|
|
113
|
-
must: [
|
|
114
|
-
{
|
|
115
|
-
key: "folderId",
|
|
116
|
-
match: {
|
|
117
|
-
value: folderId
|
|
118
|
-
}
|
|
119
|
-
}
|
|
120
|
-
]
|
|
121
|
-
}
|
|
122
304
|
});
|
|
123
305
|
|
|
124
|
-
|
|
125
|
-
return results.map(r => ({
|
|
306
|
+
return results.map((r) => ({
|
|
126
307
|
id: r.id,
|
|
127
|
-
|
|
128
|
-
...r.payload
|
|
308
|
+
...r.payload,
|
|
129
309
|
}));
|
|
130
310
|
} catch (err: any) {
|
|
131
311
|
console.error("Qdrant search error:", err.message);
|
|
@@ -133,58 +313,70 @@ export async function searchProjectMemory(folderId: string, query: string, topK:
|
|
|
133
313
|
}
|
|
134
314
|
}
|
|
135
315
|
|
|
136
|
-
|
|
316
|
+
/**
 * Exhaustive, case-insensitive substring search: scrolls through the points
 * of the collection page by page and filters on the `text` payload client-side.
 *
 * Per the original author's note, this is preferred over the Qdrant full-text
 * filter because the whitespace tokenizer does not strip surrounding
 * punctuation (e.g. "ServiceCode.mkp" indexed as "ServiceCode.mkp)"), and the
 * O(N) cost is considered negligible for the expected collection sizes.
 *
 * @param term  Substring to look for (compared lower-cased).
 * @param limit Maximum number of matches to return (default 50). A value of 0
 *              or less returns no matches.
 * @returns Matching points as `{ id, ...payload }`; scrolling stops as soon as
 *          `limit` matches were collected.
 * @throws Error when the Qdrant client is not initialized.
 */
export async function exactSearchChunks(
  term: string,
  limit: number = 50
): Promise<any[]> {
  await initVectorDB();
  if (!client) throw new Error("Qdrant not initialized");
  // Without this guard the first match would be returned even for limit <= 0.
  if (limit <= 0) return [];

  const lowerTerm = term.toLowerCase();
  const results: any[] = [];
  let offset: string | number | null | undefined = undefined;

  do {
    const page: { points: any[]; next_page_offset?: string | number | null } =
      await (client as any).scroll(COLLECTION_NAME, {
        with_payload: true,
        with_vector: false,
        limit: 100,
        // The first request carries no offset; later ones resume from the cursor.
        ...(offset !== undefined ? { offset } : {}),
      });

    for (const point of page.points) {
      // A missing or non-string `text` is treated as empty text instead of
      // throwing on `.toLowerCase()`.
      const rawText: unknown = point.payload?.text;
      const text = typeof rawText === "string" ? rawText.toLowerCase() : "";
      if (text.includes(lowerTerm)) {
        results.push({ id: point.id, ...point.payload });
        if (results.length >= limit) break;
      }
    }
    offset = page.next_page_offset;
  } while (offset != null && results.length < limit);

  return results;
}
|
|
165
355
|
|
|
166
|
-
|
|
356
|
+
/**
 * Upserts an agent note under a random (v4) UUID; the note carries no file ID.
 *
 * The text is embedded first and then written as a single point with title
 * "Agent Note" and source "agent".
 *
 * @param text Note text to embed and store.
 * @throws Error when the Qdrant client is not initialized.
 */
export async function upsertAgentNote(text: string): Promise<void> {
  await initVectorDB();
  if (!client) {
    throw new Error("Qdrant not initialized");
  }

  const embedding = await embedText(text);
  const payload = {
    text,
    title: "Agent Note",
    block_index: 0,
    block_hash: "",
    source: "agent",
    offset: 0,
  };
  const notePoint = { id: uuidv4(), vector: embedding, payload };

  await client.upsert(COLLECTION_NAME, { wait: true, points: [notePoint] });
  console.error("Upserted agent note to Qdrant.");
}
|