@khoinguyen2002/doc-mcp 1.0.4 → 1.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,407 @@
1
+ import { google } from "googleapis";
2
+ import { toHast } from "@googleworkspace/google-docs-hast";
3
+ import { toHtml } from "hast-util-to-html";
4
+ import * as crypto from "crypto";
5
+ import TurndownService from "turndown";
6
+ import { gfm } from "turndown-plugin-gfm";
7
+ import { config } from "../config.js";
8
+ import { get_encoding } from "tiktoken";
9
+ import { embedBatch, getBlockPointId, getBlockMetaByIds, deletePointsByIds, upsertChunkBatch, updateBlockOffsets, } from "../db/vector.js";
10
+ import { getSyncEntry, setSyncEntry, getImageDesc, setImageDesc } from "../db/syncState.js";
11
+ import { waitForRateLimit } from "../db/rateLimiter.js";
12
+ // ─── Turndown setup ───────────────────────────────────────────────────────────
13
// Shared HTML → Markdown converter: ATX headings, fenced code blocks and "-"
// bullets, extended with the GFM plugin (the table handling further down in
// this file relies on the plugin's table rule).
const turndownService = new TurndownService({
    bulletListMarker: "-",
    codeBlockStyle: "fenced",
    headingStyle: "atx",
});
turndownService.use(gfm);
// Render <img> as a readable text placeholder instead of a Markdown image:
// the Drive blob URLs behind these images are useless to a reader.
turndownService.addRule("images", {
    filter: "img",
    replacement(_content, node) {
        const altText = node.getAttribute?.("alt") || "";
        if (!altText) {
            return "[Image]";
        }
        return `[Image: ${altText}]`;
    },
});
27
+ // ─── Google Auth ──────────────────────────────────────────────────────────────
28
/**
 * Build Google Drive (v3) and Google Docs (v1) clients that share one
 * read-only JWT credential taken from `config`.
 *
 * The configured private key is normalised before use: one pair of
 * surrounding double quotes is removed and literal `\n` sequences become real
 * newlines (presumably because the key is supplied through an environment
 * variable — confirm against deployment docs).
 *
 * @returns An object with `drive` and `docs` API clients.
 * @throws {Error} When the client email or the private key is not configured.
 */
function getGoogleClients() {
    const { DOC_MCP_GOOGLE_CLIENT_EMAIL: clientEmail, DOC_MCP_GOOGLE_PRIVATE_KEY: rawKey, } = config;
    if (!clientEmail || !rawKey) {
        throw new Error("Google credentials not configured.");
    }
    const isQuoted = rawKey.startsWith('"') && rawKey.endsWith('"');
    const unquotedKey = isQuoted ? rawKey.slice(1, -1) : rawKey;
    const key = unquotedKey.replace(/\\n/g, "\n");
    const scopes = [
        "https://www.googleapis.com/auth/drive.readonly",
        "https://www.googleapis.com/auth/documents.readonly",
    ];
    const auth = new google.auth.JWT({ email: clientEmail, key, scopes });
    const drive = google.drive({ version: "v3", auth });
    const docs = google.docs({ version: "v1", auth });
    return { drive, docs };
}
51
+ // ─── HAST Image Collection ───────────────────────────────────────────────────
52
+ /** Collect all img src URLs from a HAST tree. */
53
+ function collectImageSrcs(node, srcs) {
54
+ if (!node)
55
+ return;
56
+ if (node.type === "element" && node.tagName === "img" && node.properties?.src) {
57
+ srcs.add(String(node.properties.src));
58
+ }
59
+ if (Array.isArray(node.children)) {
60
+ for (const child of node.children)
61
+ collectImageSrcs(child, srcs);
62
+ }
63
+ }
64
+ /** Replace img nodes with description text from descMap. */
65
+ function sanitizeHast(node, descMap) {
66
+ if (!node)
67
+ return;
68
+ if (node.type === "element" && node.tagName === "img") {
69
+ const src = String(node.properties?.src ?? "");
70
+ const description = descMap.get(src) ||
71
+ (node.properties?.alt ? String(node.properties.alt) : "");
72
+ const label = description ? `: ${description}` : "";
73
+ node.tagName = "span";
74
+ node.properties = { className: ["img-placeholder"] };
75
+ node.children = [{ type: "text", value: `[Image${label}]` }];
76
+ return;
77
+ }
78
+ if (Array.isArray(node.children)) {
79
+ for (const child of node.children)
80
+ sanitizeHast(child, descMap);
81
+ }
82
+ }
83
+ // ─── Vision LLM ──────────────────────────────────────────────────────────────
84
/**
 * Best-effort download of an image.
 *
 * The request is aborted after 10 seconds. Any failure — invalid URL, network
 * error, timeout, non-2xx status — yields `null` rather than an exception.
 *
 * @param url Image URL to fetch.
 * @returns `{ buffer, mimeType }` on success, otherwise `null`. `mimeType` is
 *          the Content-Type header without parameters, defaulting to
 *          "image/png" when the header is absent.
 */
async function downloadImage(url) {
    try {
        const response = await fetch(url, { signal: AbortSignal.timeout(10_000) });
        if (!response.ok) {
            return null;
        }
        const headerValue = response.headers.get("content-type") || "image/png";
        const [mediaType] = headerValue.split(";");
        const bytes = await response.arrayBuffer();
        return { buffer: Buffer.from(bytes), mimeType: mediaType.trim() };
    }
    catch {
        return null;
    }
}
98
/**
 * Ask the configured vision model (OpenRouter chat-completions endpoint) for
 * a short description of an image.
 *
 * Image descriptions are an optional enrichment, so no runtime failure may
 * abort the surrounding document sync: a network error, a timeout, a non-2xx
 * status, an unparsable body or a non-text answer is logged and reported as
 * an empty string.
 *
 * @param buffer   Raw image bytes (sent base64-encoded in a data URL).
 * @param mimeType MIME type placed in the data URL.
 * @returns The trimmed description, or "" when none could be obtained.
 */
async function describeImageWithVision(buffer, mimeType) {
    const base64 = buffer.toString("base64");
    // Built outside the try block so that programming/configuration errors
    // surface instead of being mistaken for a transient API failure.
    const request = {
        method: "POST",
        headers: {
            Authorization: `Bearer ${config.OPENROUTER_API_KEY}`,
            "Content-Type": "application/json",
        },
        body: JSON.stringify({
            model: config.VISION_MODEL_ID,
            messages: [
                {
                    role: "user",
                    content: [
                        {
                            type: "image_url",
                            image_url: { url: `data:${mimeType};base64,${base64}` },
                        },
                        {
                            type: "text",
                            text: "Describe this image concisely in 1-3 sentences for a developer reading technical documentation. Focus on UI layout, data shown, flow diagrams, or key visible text.",
                        },
                    ],
                },
            ],
            max_tokens: 300,
        }),
        // Bound the call so a stalled request cannot hang the sync forever.
        signal: AbortSignal.timeout(30_000),
    };
    try {
        const res = await fetch("https://openrouter.ai/api/v1/chat/completions", request);
        if (!res.ok) {
            console.error(`[Vision] API error: ${res.status}`);
            return "";
        }
        const json = await res.json();
        const content = json?.choices?.[0]?.message?.content;
        // Some models answer with structured (non-string) content; treat that
        // as "no description" rather than crashing on .trim().
        return typeof content === "string" ? content.trim() : "";
    }
    catch (err) {
        const reason = err instanceof Error ? err.message : String(err);
        console.error(`[Vision] Request failed: ${reason}`);
        return "";
    }
}
133
/**
 * Produce a description for every distinct image referenced in a HAST tree.
 *
 * When no vision model is configured the result is an empty map. Otherwise,
 * for each distinct image `src`, in order:
 *   1. download the image (a failed download is logged and skipped);
 *   2. hash the bytes (MD5) and look the hash up in the description cache;
 *   3. on a cache miss, ask the vision model and store a non-empty answer
 *      in the cache.
 *
 * @param hast Root of the HAST tree to scan.
 * @returns Map from image `src` to its description; images that could not be
 *          downloaded or described are absent.
 */
async function processImages(hast) {
    const descriptions = new Map();
    // Vision not configured — leave the map empty.
    if (!config.VISION_MODEL_ID) {
        return descriptions;
    }
    const uniqueSrcs = new Set();
    collectImageSrcs(hast, uniqueSrcs);
    if (uniqueSrcs.size === 0) {
        return descriptions;
    }
    console.error(`[Vision] Processing ${uniqueSrcs.size} image(s)...`);
    for (const src of uniqueSrcs) {
        const downloaded = await downloadImage(src);
        if (!downloaded) {
            console.error(`[Vision] Failed to download: ${src.substring(0, 60)}...`);
            continue;
        }
        const { buffer, mimeType } = downloaded;
        // The cache is keyed by content hash, not URL, so identical bytes
        // share one description regardless of where they were fetched from.
        const digest = crypto.createHash("md5").update(buffer).digest("hex");
        const known = await getImageDesc(digest);
        if (known) {
            console.error(`[Vision] Cache hit for image hash ${digest.substring(0, 8)}`);
            descriptions.set(src, known);
        }
        else {
            console.error(`[Vision] Describing image hash ${digest.substring(0, 8)}...`);
            const fresh = await describeImageWithVision(buffer, mimeType);
            if (fresh) {
                await setImageDesc(digest, fresh);
                descriptions.set(src, fresh);
                console.error(`[Vision] Stored: "${fresh.substring(0, 80)}..."`);
            }
        }
    }
    return descriptions;
}
176
+ // ─── HTML → Markdown ─────────────────────────────────────────────────────────
177
/**
 * Convert one Google Docs document (or a single tab merged into document
 * shape) to Markdown.
 *
 * Pipeline: Docs JSON → HAST → image placeholders → HTML → table clean-up →
 * Markdown via the shared Turndown service.
 *
 * @param docJson Document JSON as returned by the Google Docs API.
 * @returns The document rendered as Markdown.
 */
export async function googleDocToMarkdown(docJson) {
    const hast = toHast(docJson);
    const descMap = await processImages(hast);
    sanitizeHast(hast, descMap);
    const html = toHtml(hast);
    // 1. Strip inline styles/attrs from table/row/cell tags.
    const bareTableHtml = html.replace(/<(table|thead|tbody|tr|td|th)(\s[^>]*)>/gi, (_match, tag) => `<${tag}>`);
    // 2. Fix tables for turndown-plugin-gfm:
    //    - Unwrap <p> inside cells (GFM requires inline content only).
    //    - Unwrap <span> (inline styles break cell content parsing).
    //    - Convert the first <tr>'s <td> → <th> so the plugin recognises a
    //      heading row (Google Docs never emits <th>; without this the table
    //      rule does not fire).
    const cleanHtml = bareTableHtml.replace(/<table[\s\S]*?<\/table>/gi, (tableBlock) => {
        const inlineOnly = tableBlock
            // Keep adjacent paragraphs of one cell apart: a space, not a line
            // break, because a newline would end the Markdown table row.
            .replace(/<\/p\s*>\s*<p(?:\s[^>]*)?>/gi, " ")
            // Match <p> exactly; a bare `p[^>]*` would also eat <pre>,
            // <picture> and every other tag whose name starts with "p".
            .replace(/<\/?p(?:\s[^>]*)?>/gi, "")
            .replace(/<span(?:\s[^>]*)?>/gi, "")
            .replace(/<\/span>/gi, "");
        // Promote only the first row's cells to <th>.
        let firstRow = true;
        return inlineOnly.replace(/<tr>([\s\S]*?)<\/tr>/gi, (rowMatch, rowContent) => {
            if (!firstRow) {
                return rowMatch;
            }
            firstRow = false;
            const headerCells = rowContent
                .replace(/<td>/gi, "<th>")
                .replace(/<\/td>/gi, "</th>");
            return "<tr>" + headerCells + "</tr>";
        });
    });
    return turndownService.turndown(cleanHtml);
}
210
/**
 * Flatten a tab tree depth-first: each tab is followed by its descendants.
 *
 * @param tabs Tabs at one nesting level (may be undefined).
 * @returns All tabs of the subtree in reading order.
 */
function flattenDocTabs(tabs) {
    const flat = [];
    for (const tab of tabs ?? []) {
        flat.push(tab, ...flattenDocTabs(tab.childTabs));
    }
    return flat;
}
/**
 * Convert a (possibly multi-tab) Google Doc to a single Markdown string.
 *
 * Every tab that has a body — including nested child tabs, which the Docs API
 * reports under each tab's `childTabs` rather than in the top-level `tabs`
 * list — becomes a `# <tab title>` section; sections are separated by `---`.
 * Documents without tabs are converted as a whole.
 *
 * @param docData Document JSON as returned by `documents.get`.
 * @returns The whole document as Markdown.
 */
async function docToMarkdown(docData) {
    const allTabs = flattenDocTabs(docData.tabs);
    if (allTabs.length > 0) {
        const tabMarkdowns = [];
        for (const tab of allTabs) {
            if (!tab.documentTab?.body)
                continue;
            const tabTitle = tab.tabProperties?.title || "Tab";
            // Spread the full documentTab over the document so toHast
            // resolves inline objects per-tab.
            const tabDoc = { ...docData, ...tab.documentTab };
            const md = await googleDocToMarkdown(tabDoc);
            tabMarkdowns.push(`# ${tabTitle}\n\n${md}`);
        }
        return tabMarkdowns.join("\n\n---\n\n");
    }
    // Single-tab (legacy) document
    return googleDocToMarkdown(docData);
}
231
+ // ─── Chunking ─────────────────────────────────────────────────────────────────
232
/**
 * Split Markdown into chunks of at most an effective MAX_CHUNK_SIZE characters.
 *
 * The text is first split in front of every level-1/level-2 heading (the
 * heading stays with the section it introduces). Consecutive small sections
 * are merged while they fit; a section larger than the limit is cut at the
 * last newline in the second half of the window, or hard-cut at the limit
 * when there is none.
 *
 * The effective limit is the smaller of `config.MAX_CHUNK_SIZE` and a cap
 * derived from the token budget: 40% of `config.EMBEDDING_MAX_TOKENS` divided
 * by a worst-case tokens-per-char estimate (3 × TOKEN_SAFETY_MULTIPLIER),
 * with a floor of 500 characters on that cap.
 *
 * @param markdown Markdown text to split.
 * @returns Chunks in document order; none is longer than the effective limit.
 */
function chunkMarkdown(markdown) {
    // Worst-case Thai tokenization: 3 cl100k tokens/char × safety multiplier.
    const worstCaseTokensPerChar = 3 * TOKEN_SAFETY_MULTIPLIER;
    // Allow each chunk to use at most 40% of the token budget.
    const maxCharsFromBudget = Math.max(500, Math.floor((config.EMBEDDING_MAX_TOKENS * 0.4) / worstCaseTokensPerChar));
    const MAX_CHUNK_SIZE = Math.min(config.MAX_CHUNK_SIZE, maxCharsFromBudget);
    console.error(`[Chunk] effectiveChunkSize=${MAX_CHUNK_SIZE} ` +
        `(config=${config.MAX_CHUNK_SIZE}, budgetCap=${maxCharsFromBudget}, ` +
        `maxTokens=${config.EMBEDDING_MAX_TOKENS})`);
    // Split at markdown headings (keep the heading in the next chunk).
    const sections = markdown
        .split(/(?=\n#{1,2} )/g)
        .filter((s) => s.trim().length > 0);
    const chunks = [];
    let current = "";
    for (let section of sections) {
        // Section exceeds MAX_CHUNK_SIZE → split at newline boundaries.
        while (section.length > MAX_CHUNK_SIZE) {
            if (current.length > 0) {
                chunks.push(current);
                current = "";
            }
            let cutAt = MAX_CHUNK_SIZE;
            // Look for a newline strictly inside the window. The newline is
            // kept in the chunk, so the search must start at MAX - 1:
            // starting at MAX could yield a chunk of MAX + 1 characters.
            const newlineIdx = section.lastIndexOf("\n", MAX_CHUNK_SIZE - 1);
            if (newlineIdx > MAX_CHUNK_SIZE * 0.5) {
                cutAt = newlineIdx + 1; // include the \n in the current chunk
            }
            else if (cutAt > 1) {
                // Hard cut: do not separate a UTF-16 surrogate pair, which
                // would leave a lone surrogate at the end of this chunk.
                const lastUnit = section.charCodeAt(cutAt - 1);
                if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
                    cutAt -= 1;
                }
            }
            chunks.push(section.substring(0, cutAt));
            section = section.substring(cutAt);
        }
        if (current.length > 0 &&
            current.length + section.length > MAX_CHUNK_SIZE) {
            chunks.push(current);
            current = section;
        }
        else {
            current += section;
        }
    }
    if (current.trim())
        chunks.push(current);
    return chunks;
}
286
/**
 * Compute the MD5 digest of a chunk's content, as lowercase hex.
 * Used as a change-detection fingerprint, not for security.
 *
 * @param content Text (or bytes) to fingerprint.
 * @returns 32-character hexadecimal digest.
 */
function calculateHash(content) {
    const hasher = crypto.createHash("md5");
    hasher.update(content);
    return hasher.digest("hex");
}
289
+ // ─── Batch Packing ────────────────────────────────────────────────────────────
290
/**
 * Safety factor applied on top of the cl100k_base token count. The original
 * authors chose 1.4 because the target embedding models (LLaMA-2 based,
 * SentencePiece tokenizer) tokenize Thai/multilingual text noticeably worse
 * than cl100k_base does.
 */
const TOKEN_SAFETY_MULTIPLIER = 1.4;
// Cached tiktoken encoder; created on first use and then reused so the
// encoder is not re-initialised on every call.
let _enc = null;
/** Return the shared cl100k_base encoder, creating it on first call. */
function getEncoder() {
    if (_enc === null) {
        _enc = get_encoding("cl100k_base");
    }
    return _enc;
}
/**
 * Conservative token estimate for a text: the cl100k_base token count
 * multiplied by TOKEN_SAFETY_MULTIPLIER, rounded up.
 *
 * @param text Text to measure.
 * @returns Estimated token count (integer).
 */
function countTokens(text) {
    const rawCount = getEncoder().encode(text).length;
    return Math.ceil(rawCount * TOKEN_SAFETY_MULTIPLIER);
}
308
/**
 * Greedily group blocks, in order, into batches whose estimated token total
 * stays within `maxTokens`.
 *
 * A batch is closed as soon as the next block would push it over the limit.
 * A single block that alone exceeds `maxTokens` still gets a batch of its own.
 *
 * @param blocks    Blocks to pack; each must expose a `text` string.
 * @param maxTokens Token budget per batch.
 * @returns Array of batches (arrays of blocks); empty for empty input.
 */
function packIntoBatches(blocks, maxTokens) {
    const batches = [];
    let open = [];
    let openTokens = 0;
    for (const block of blocks) {
        const cost = countTokens(block.text);
        const overflows = open.length > 0 && openTokens + cost > maxTokens;
        if (overflows) {
            // Close the running batch and start a fresh one for this block.
            batches.push(open);
            open = [];
            openTokens = 0;
        }
        open.push(block);
        openTokens += cost;
    }
    if (open.length > 0) {
        batches.push(open);
    }
    return batches;
}
328
+ // ─── Core Sync ────────────────────────────────────────────────────────────────
329
/**
 * Bring the vector index up to date for one Google Doc.
 *
 * The document is always fetched and converted to Markdown (the Markdown is
 * part of the result). Embedding work is skipped when a stored sync entry
 * carries the same modified time as Drive reports. Otherwise the Markdown is
 * chunked and diffed against the stored blocks by content hash: unchanged
 * blocks are kept (their offset is corrected if it shifted), changed or new
 * blocks are embedded and upserted, and blocks beyond the new block count
 * are deleted.
 *
 * @param fileId            Google Docs document ID.
 * @param driveModifiedTime Modified time reported by Drive for this file.
 * @param title             Document title, stored with every chunk and used in logs.
 * @returns `{ synced: false, content }` when nothing had to be embedded,
 *          otherwise `{ synced: true, content, upsertedCount, skippedCount }`.
 * @throws {Error} When the embedding call returns a different number of
 *         vectors than texts sent; errors from the Docs API and the storage
 *         helpers propagate unchanged.
 */
export async function syncSingleDocument(fileId, driveModifiedTime, title) {
    const { docs } = getGoogleClients();
    // 1. Fetch doc with ALL tabs content + convert to Markdown
    const docRes = await docs.documents.get({
        documentId: fileId,
        includeTabsContent: true, // fetch all document tabs
    });
    const markdown = await docToMarkdown(docRes.data);
    // 2. Check sync state — skip embedding if unchanged.
    //    The entry must exist: comparing with `?.` alone would treat
    //    "no entry" and "no modified time" as equal (undefined === undefined)
    //    and skip a document that was never indexed.
    const syncEntry = await getSyncEntry(fileId);
    if (syncEntry && syncEntry.modifiedTime === driveModifiedTime) {
        console.error(`[Sync] "${title}": unchanged, skipping embedding.`);
        return { synced: false, content: markdown };
    }
    // 3. Chunk Markdown
    const newBlocks = chunkMarkdown(markdown);
    // 4. Get existing block hashes via deterministic IDs
    const oldBlockCount = syncEntry?.blockCount ?? 0;
    const oldPointIds = Array.from({ length: oldBlockCount }, (_, i) => getBlockPointId(fileId, i));
    const existingMeta = await getBlockMetaByIds(oldPointIds);
    // 5. Diff blocks
    const blocksToEmbed = [];
    const blocksToUpdateOffset = [];
    let skippedCount = 0;
    let charOffset = 0;
    for (let i = 0; i < newBlocks.length; i++) {
        const text = newBlocks[i];
        const hash = calculateHash(text);
        const pointId = getBlockPointId(fileId, i);
        const existing = existingMeta[pointId];
        if (existing && existing.hash === hash) {
            // Content unchanged — but the offset may have shifted because of
            // edits in earlier blocks.
            if (existing.offset !== charOffset) {
                blocksToUpdateOffset.push({ pointId, offset: charOffset });
            }
            skippedCount++;
        }
        else {
            blocksToEmbed.push({ index: i, offset: charOffset, text, hash, pointId });
        }
        charOffset += text.length;
    }
    // 6. Delete obsolete blocks (doc shrunk)
    const obsoletePointIds = Array.from({ length: Math.max(0, oldBlockCount - newBlocks.length) }, (_, i) => getBlockPointId(fileId, newBlocks.length + i));
    if (obsoletePointIds.length > 0) {
        await deletePointsByIds(obsoletePointIds);
    }
    // 7. Fix stale offsets for unchanged blocks (no re-embed needed)
    await updateBlockOffsets(blocksToUpdateOffset);
    // 8. Batch embed + upsert
    const batches = packIntoBatches(blocksToEmbed, config.EMBEDDING_MAX_TOKENS);
    let upsertedCount = 0;
    for (let b = 0; b < batches.length; b++) {
        const batch = batches[b];
        console.error(`[Embed] Batch ${b + 1}/${batches.length}: ${batch.length} chunk(s)`);
        await waitForRateLimit();
        const vectors = await embedBatch(batch.map((bl) => bl.text));
        // Vectors are paired with blocks by position; a short or missing
        // response would otherwise upsert points with undefined vectors.
        if (vectors?.length !== batch.length) {
            throw new Error(`[Embed] Expected ${batch.length} vector(s) for "${title}" but received ${vectors?.length ?? 0}.`);
        }
        const chunkUpserts = batch.map((bl, vi) => ({
            pointId: bl.pointId,
            vector: vectors[vi],
            text: bl.text,
            title,
            blockIndex: bl.index,
            blockHash: bl.hash,
            source: "google_drive",
            offset: bl.offset,
        }));
        await upsertChunkBatch(chunkUpserts);
        upsertedCount += batch.length;
    }
    // 9. Update sync state — only reached when every batch succeeded, so a
    //    failed run is retried on the next sync.
    await setSyncEntry(fileId, {
        modifiedTime: driveModifiedTime,
        blockCount: newBlocks.length,
        title,
    });
    console.error(`[Sync] "${title}": ${upsertedCount} upserted, ${skippedCount} skipped, ${obsoletePointIds.length} deleted.`);
    return { synced: true, content: markdown, upsertedCount, skippedCount };
}
@@ -1,28 +1,47 @@
1
1
  export declare function saveAgentNote(content: string): Promise<{
2
+ success: boolean;
3
+ message: string;
4
+ error?: undefined;
5
+ } | {
2
6
  success: boolean;
3
7
  error: string;
4
8
  message?: undefined;
9
+ }>;
10
+ export declare function searchKnowledge(query: string, topK?: number): Promise<{
11
+ success: boolean;
12
+ results: string;
13
+ error?: undefined;
5
14
  } | {
6
15
  success: boolean;
7
- message: string;
16
+ results: {
17
+ title: any;
18
+ offset: any;
19
+ text: any;
20
+ }[];
8
21
  error?: undefined;
9
- }>;
10
- export declare function searchKnowledge(query: string, topK?: number): Promise<{
22
+ } | {
11
23
  success: boolean;
12
24
  error: string;
13
25
  results?: undefined;
14
- } | {
26
+ }>;
27
+ export declare function searchExact(term: string, limit?: number): Promise<{
15
28
  success: boolean;
16
29
  results: string;
30
+ totalFound?: undefined;
17
31
  error?: undefined;
18
32
  } | {
19
33
  success: boolean;
34
+ totalFound: number;
20
35
  results: {
21
- title: string;
22
- fileId: any;
36
+ title: any;
23
37
  offset: any;
24
38
  text: any;
25
39
  }[];
26
40
  error?: undefined;
41
+ } | {
42
+ success: boolean;
43
+ error: string;
44
+ results?: undefined;
45
+ totalFound?: undefined;
27
46
  }>;
28
47
  //# sourceMappingURL=knowledgeTools.d.ts.map
@@ -1 +1 @@
1
- {"version":3,"file":"knowledgeTools.d.ts","sourceRoot":"","sources":["../../src/tools/knowledgeTools.ts"],"names":[],"mappings":"AAIA,wBAAsB,aAAa,CAAC,OAAO,EAAE,MAAM;;;;;;;;GAmBlD;AAED,wBAAsB,eAAe,CAAC,KAAK,EAAE,MAAM,EAAE,IAAI,GAAE,MAAU;;;;;;;;;;;;;;;;;GA0CpE"}
1
+ {"version":3,"file":"knowledgeTools.d.ts","sourceRoot":"","sources":["../../src/tools/knowledgeTools.ts"],"names":[],"mappings":"AAGA,wBAAsB,aAAa,CAAC,OAAO,EAAE,MAAM;;;;;;;;GAUlD;AAED,wBAAsB,eAAe,CAAC,KAAK,EAAE,MAAM,EAAE,IAAI,GAAE,MAAU;;;;;;;;;;;;;;;;GAsBpE;AACD,wBAAsB,WAAW,CAC/B,IAAI,EAAE,MAAM,EACZ,KAAK,GAAE,MAAW;;;;;;;;;;;;;;;;;;;GAuBnB"}
@@ -1,18 +1,8 @@
1
- import { config } from "../config.js";
2
- import { upsertProjectDocument, searchProjectMemory } from "../db/vector.js";
3
- import { syncFolderState } from "./driveTools.js";
1
+ import { searchProjectMemory, upsertAgentNote, exactSearchChunks } from "../db/vector.js";
2
+ import { syncAllDocuments } from "./driveTools.js";
4
3
  export async function saveAgentNote(content) {
5
- const folderId = config.DOC_MCP_DRIVE_FOLDER_ID;
6
- if (!folderId) {
7
- return {
8
- success: false,
9
- error: "DOC_MCP_DRIVE_FOLDER_ID is not configured.",
10
- };
11
- }
12
4
  try {
13
- await upsertProjectDocument(folderId, content, {
14
- source: "agent",
15
- });
5
+ await upsertAgentNote(content);
16
6
  return {
17
7
  success: true,
18
8
  message: "Successfully stored note in vector memory.",
@@ -23,42 +13,41 @@ export async function saveAgentNote(content) {
23
13
  }
24
14
  }
25
15
  export async function searchKnowledge(query, topK = 3) {
26
- const folderId = config.DOC_MCP_DRIVE_FOLDER_ID;
27
- if (!folderId) {
16
+ try {
17
+ // Auto-sync all documents before searching
18
+ await syncAllDocuments();
19
+ const results = await searchProjectMemory(query, topK);
20
+ if (!results || results.length === 0) {
21
+ return { success: true, results: "NOT_FOUND" };
22
+ }
28
23
  return {
29
- success: false,
30
- error: "DOC_MCP_DRIVE_FOLDER_ID is not configured.",
24
+ success: true,
25
+ results: results.map((r) => ({
26
+ title: r.title || "Unknown",
27
+ offset: r.offset ?? 0,
28
+ text: r.text,
29
+ })),
31
30
  };
32
31
  }
32
+ catch (err) {
33
+ return { success: false, error: `Failed to search: ${err.message}` };
34
+ }
35
+ }
36
+ export async function searchExact(term, limit = 50) {
33
37
  try {
34
- // Auto-sync folder state before searching
35
- await syncFolderState(folderId);
36
- const results = await searchProjectMemory(folderId, query, topK);
38
+ await syncAllDocuments();
39
+ const results = await exactSearchChunks(term, limit);
37
40
  if (!results || results.length === 0) {
38
41
  return { success: true, results: "NOT_FOUND" };
39
42
  }
40
43
  return {
41
44
  success: true,
42
- results: results.map((r) => {
43
- let title = "Unknown Source";
44
- let offset = undefined;
45
- if (r.metadata) {
46
- try {
47
- const metaObj = JSON.parse(r.metadata);
48
- if (metaObj.title)
49
- title = metaObj.title;
50
- if (metaObj.offset !== undefined)
51
- offset = metaObj.offset;
52
- }
53
- catch (e) { }
54
- }
55
- return {
56
- title,
57
- fileId: r.file_id || "N/A",
58
- offset,
59
- text: r.text,
60
- };
61
- }),
45
+ totalFound: results.length,
46
+ results: results.map((r) => ({
47
+ title: r.title || "Unknown",
48
+ offset: r.offset ?? 0,
49
+ text: r.text,
50
+ })),
62
51
  };
63
52
  }
64
53
  catch (err) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@khoinguyen2002/doc-mcp",
3
- "version": "1.0.4",
3
+ "version": "1.0.5",
4
4
  "type": "module",
5
5
  "main": "dist/index.js",
6
6
  "bin": {
@@ -11,10 +11,16 @@
11
11
  "start": "node dist/mcp-server.js"
12
12
  },
13
13
  "dependencies": {
14
+ "@googleworkspace/google-docs-hast": "^1.0.5",
14
15
  "@langchain/textsplitters": "^1.0.1",
15
16
  "@modelcontextprotocol/sdk": "^1.29.0",
16
17
  "@qdrant/js-client-rest": "^1.18.0",
18
+ "@upstash/redis": "^1.38.0",
17
19
  "googleapis": "^173.0.0",
20
+ "hast-util-to-html": "^9.0.5",
21
+ "tiktoken": "^1.0.22",
22
+ "turndown": "^7.2.4",
23
+ "turndown-plugin-gfm": "^1.0.2",
18
24
  "uuid": "^14.0.0",
19
25
  "zod": "^4.4.3"
20
26
  },
@@ -23,6 +29,7 @@
23
29
  ".": "./dist/index.js"
24
30
  },
25
31
  "devDependencies": {
32
+ "@types/turndown": "^5.0.6",
26
33
  "@types/uuid": "^11.0.0"
27
34
  }
28
35
  }
package/src/config.ts CHANGED
@@ -1,17 +1,35 @@
1
1
  import { z } from "zod";
2
2
 
3
3
  const schema = z.object({
4
- DOC_MCP_DRIVE_FOLDER_ID: z.string().optional(),
5
4
  DOC_MCP_GOOGLE_CLIENT_EMAIL: z.string().email().optional(),
6
5
  DOC_MCP_GOOGLE_PRIVATE_KEY: z.string().optional(),
7
-
8
- // Vector DB / Embeddings
6
+
7
+ // Vector DB
9
8
  QDRANT_URL: z.string().url().describe("The URL of your Qdrant instance"),
10
- QDRANT_API_KEY: z.string().optional().describe("API Key for Qdrant Cloud (optional for local)"),
9
+ QDRANT_API_KEY: z
10
+ .string()
11
+ .optional()
12
+ .describe("API Key for Qdrant Cloud (optional for local)"),
13
+
14
+ // Embeddings
11
15
  OPENROUTER_API_KEY: z.string().min(1),
12
- EMBEDDING_MODEL_ID: z.string().default("nvidia/llama-nemotron-embed-vl-1b-v2:free"),
13
- CHUNK_SIZE: z.coerce.number().int().positive().default(4000),
14
- CHUNK_OVERLAP: z.coerce.number().int().nonnegative().default(500),
16
+ EMBEDDING_MODEL_ID: z
17
+ .string()
18
+ .default("nvidia/llama-nemotron-embed-vl-1b-v2:free"),
19
+ // Max chunk size in Markdown chars — system may use a smaller value if
20
+ // the embedding model's token budget requires it (see ingestFlow.ts)
21
+ MAX_CHUNK_SIZE: z.coerce.number().int().positive().default(3000),
22
+ // Max tokens per embedding API call (for batch packing)
23
+ EMBEDDING_MAX_TOKENS: z.coerce.number().int().positive().default(32000),
24
+ // Max embedding API requests per minute
25
+ EMBEDDING_RPM: z.coerce.number().int().positive().default(40),
26
+
27
+ // Vision LLM model ID for image descriptions (optional, skip if not set)
28
+ VISION_MODEL_ID: z.string().optional(),
29
+
30
+ // Upstash Redis (for sync state)
31
+ UPSTASH_REDIS_REST_URL: z.string().url(),
32
+ UPSTASH_REDIS_REST_TOKEN: z.string().min(1),
15
33
  });
16
34
 
17
35
  function loadConfig() {
@@ -20,10 +38,11 @@ function loadConfig() {
20
38
  const missing = result.error.issues
21
39
  .map((i) => ` ${i.path.join(".")}: ${i.message}`)
22
40
  .join("\n");
23
- throw new Error(`Invalid environment configuration for doc-mcp:\n${missing}`);
41
+ throw new Error(
42
+ `doc-mcp configuration error:\n${missing}\n\nPlease check your environment variables.`
43
+ );
24
44
  }
25
45
  return result.data;
26
46
  }
27
47
 
28
48
  export const config = loadConfig();
29
- export type Config = typeof config;
@@ -0,0 +1,25 @@
1
+ import { config } from "../config.js";
2
+
3
// Minimum spacing, in milliseconds, between two embedding calls so that at
// most EMBEDDING_RPM calls happen per minute (60 000 ms / RPM, rounded up).
const MIN_GAP_MS = Math.ceil(60_000 / config.EMBEDDING_RPM);
// Epoch-millisecond timestamp maintained by waitForRateLimit to space calls;
// 0 means no call has been made yet.
let lastCallTime = 0;
5
+
6
/**
 * Promise-based delay.
 *
 * @param ms Delay in milliseconds.
 * @returns A promise that resolves (with no value) once the timer fires.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
9
+
10
/**
 * Call this before every embedBatch() so the embedding API is not called more
 * often than EMBEDDING_RPM allows.
 *
 * This is a simple minimum-gap limiter: consecutive calls are spaced at least
 * MIN_GAP_MS (= 60000 / RPM) apart. Each caller reserves its time slot
 * synchronously, before awaiting, so callers that arrive concurrently are
 * queued one gap after another instead of all waking at the same moment.
 *
 * @returns A promise that resolves when the caller may issue its request.
 */
export async function waitForRateLimit(): Promise<void> {
  const now = Date.now();
  // Earliest moment this caller may proceed. Recording it immediately (no
  // await in between) is what makes the next caller see the reservation.
  const slot = Math.max(now, lastCallTime + MIN_GAP_MS);
  lastCallTime = slot;
  const waitMs = slot - now;
  if (waitMs > 0) {
    console.error(
      `[RateLimit] Waiting ${waitMs}ms before next embedding call...`
    );
    await sleep(waitMs);
  }
}