@lorrylurui/code-intelligence-mcp 1.2.0 → 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -14,87 +14,132 @@
14
14
  * 大仓分片:
15
15
  * - 直接启动多个 worker 进程(同一 Redis)即可水平扩展,BullMQ 原生分布式协调
16
16
  */
17
- import { Worker } from 'bullmq';
18
- import Redis from 'ioredis';
17
+ import { Worker, QueueEvents } from 'bullmq';
18
+ import { Redis } from 'ioredis';
19
19
  import { env } from '../config/env.js';
20
- import { getMySqlPool } from '../db/mysql.js';
21
- import { createEmbeddingClient } from './embeddingClient.js';
20
+ import { getPool } from '../db/postgres.js';
21
+ import { createEmbeddingClient } from '../services/embeddingClient.js';
22
22
  import { indexedRowToEmbedText } from '../indexer/embedText.js';
23
+ import { initCategoryEmbeddings, resolveCategory, } from '../indexer/categoryClassifier.js';
23
24
  import { SYMBOL_STATUS } from '../config/symbolStatus.js';
24
25
  async function processEmbedJob(job, pool) {
25
26
  const { semanticHash } = job.data;
26
- const table = env.mysqlSymbolsTable;
27
+ // 优先使用 job payload 里的表名(跨项目场景),降级到 env(单项目场景)
28
+ const table = job.data.symbolsTable ?? env.symbolsTable;
29
+ const shortHash = semanticHash.slice(0, 10);
27
30
  const embedClient = createEmbeddingClient(env.embeddingServiceUrl);
31
+ const ts = () => new Date().toISOString();
28
32
  // Step 1: 缓存命中检查 —— 相同 semantic_hash 已有 online 向量
29
- const [cached] = await pool.query(`SELECT embedding FROM ${table}
30
- WHERE semantic_hash = ? AND status = ? AND embedding IS NOT NULL
33
+ const { rows: cached } = await pool.query(`SELECT embedding FROM ${table}
34
+ WHERE semantic_hash = $1 AND status = $2 AND embedding IS NOT NULL
31
35
  LIMIT 1`, [semanticHash, SYMBOL_STATUS.ONLINE]);
32
36
  let vector;
33
37
  if (cached.length > 0) {
34
38
  // Cache hit: 直接复用已有向量,0 次 API 调用
39
+ // pgvector 返回字符串 "[x1,x2,...]", JSON.parse 可直接解析
35
40
  vector =
36
41
  typeof cached[0].embedding === 'string'
37
42
  ? JSON.parse(cached[0].embedding)
38
43
  : cached[0].embedding;
39
- console.error(`[worker] cache hit hash=${semanticHash.slice(0, 10)}…`);
44
+ // cache hit 时只需把 pending 行的向量补齐(有可能是新增的同语义符号)
45
+ const cacheResult = await pool.query(`UPDATE ${table}
46
+ SET embedding = $1::vector, status = $2
47
+ WHERE semantic_hash = $3 AND status = $4`, [
48
+ `[${vector.join(',')}]`,
49
+ SYMBOL_STATUS.ONLINE,
50
+ semanticHash,
51
+ SYMBOL_STATUS.PENDING,
52
+ ]);
53
+ console.error(`[worker] ✅ cache hit [${ts()}] table=${table} hash=${shortHash}… updated ${cacheResult.rowCount ?? 0} row(s) (0 API calls)`);
54
+ return { updatedRows: cacheResult.rowCount ?? 0 };
40
55
  }
41
- else {
42
- // Cache miss: 取一条 pending 行做 embedding
43
- const [pending] = await pool.query(`SELECT name, type, category, path, description, content, meta
44
- FROM ${table}
45
- WHERE semantic_hash = ? AND status = ?
46
- LIMIT 1`, [semanticHash, SYMBOL_STATUS.PENDING]);
47
- if (pending.length === 0) {
48
- // 所有行已被并发 worker 处理,幂等退出
49
- return;
50
- }
51
- const row = pending[0];
52
- const meta = typeof row.meta === 'string'
53
- ? JSON.parse(row.meta)
54
- : (row.meta ?? {});
55
- const doc = indexedRowToEmbedText({ ...row, meta });
56
- const vectors = await embedClient.embed([doc]);
57
- vector = vectors[0];
58
- console.error(`[worker] embedded hash=${semanticHash.slice(0, 10)}… path=${row.path}:${row.name}`);
56
+ // Cache miss: 取一条 pending 行做 embedding
57
+ const { rows: pending } = await pool.query(`SELECT name, type, category, path, description, content, meta
58
+ FROM ${table}
59
+ WHERE semantic_hash = $1 AND status = $2
60
+ LIMIT 1`, [semanticHash, SYMBOL_STATUS.PENDING]);
61
+ if (pending.length === 0) {
62
+ // 所有行已被并发 worker 处理,幂等退出
63
+ console.error(`[worker] ⚠️ skip [${ts()}] table=${table} hash=${shortHash}… (no pending rows)`);
64
+ return { updatedRows: 0 };
59
65
  }
60
- // Step 2: 批量写入 —— 批量更新所有拥有相同 semantic_hash 的 pending
61
- // 一次 API 调用覆盖 N 个同义符号(大仓重复代码/多文件同函数场景收益明显)
62
- await pool.query(`UPDATE ${table}
63
- SET embedding = CAST(? AS JSON), status = ?
64
- WHERE semantic_hash = ? AND status = ?`, [
65
- JSON.stringify(vector),
66
+ const row = pending[0];
67
+ const meta = typeof row.meta === 'string' ? JSON.parse(row.meta) : (row.meta ?? {});
68
+ const rowObj = { ...row, meta };
69
+ console.error(`[worker] 🔄 embedding [${ts()}] table=${table} hash=${shortHash}… ${row.path}:${row.name}`);
70
+ // reindex 保持一致:优先用 content(语义模板),降级用 indexedRowToEmbedText
71
+ const doc = row.content ?? indexedRowToEmbedText(rowObj);
72
+ const vectors = await embedClient.embed([doc]);
73
+ vector = vectors[0];
74
+ // 生成 category(规则 → embedding → LLM 三层融合)
75
+ const [resolvedRow] = await resolveCategory([rowObj], [vector]);
76
+ const resolvedCategory = resolvedRow.category ?? null;
77
+ // Step 2: 批量写入 —— 覆盖所有相同 semantic_hash 的 pending 行
78
+ const result = await pool.query(`UPDATE ${table}
79
+ SET embedding = $1::vector, status = $2, category = COALESCE($3, category)
80
+ WHERE semantic_hash = $4 AND status = $5`, [
81
+ `[${vector.join(',')}]`,
66
82
  SYMBOL_STATUS.ONLINE,
83
+ resolvedCategory,
67
84
  semanticHash,
68
85
  SYMBOL_STATUS.PENDING,
69
86
  ]);
87
+ console.error(`[worker] ✓ done [${ts()}] table=${table} hash=${shortHash}… category=${resolvedCategory ?? 'null'} updated ${result.rowCount ?? 0} row(s)`);
88
+ return { updatedRows: result.rowCount ?? 0 };
70
89
  }
71
90
  /**
72
- * 启动 embedding worker,返回 Worker 实例(可用于优雅关闭)。
91
+ * 启动 embedding worker,返回包含 stop() 的句柄。
73
92
  */
74
- export function startEmbeddingWorker(opts = {}) {
93
+ export async function startEmbeddingWorker(opts = {}) {
75
94
  const { concurrency = 5, rpmLimit = 100 } = opts;
76
95
  const connection = new Redis(env.redisUrl, {
77
96
  maxRetriesPerRequest: null,
78
97
  enableReadyCheck: false,
79
98
  });
80
- const pool = getMySqlPool();
81
- if (!pool) {
82
- throw new Error('[embeddingWorker] MySQL pool unavailable — check env vars');
99
+ // 独立连接监听队列事件(BullMQ 要求不共用 Worker 连接)
100
+ const eventsConnection = new Redis(env.redisUrl, {
101
+ maxRetriesPerRequest: null,
102
+ enableReadyCheck: false,
103
+ });
104
+ const queueEvents = new QueueEvents('embedding', {
105
+ connection: eventsConnection,
106
+ });
107
+ const pool = getPool();
108
+ // 预热 category embeddings(仅在服务启动时调用一次)
109
+ if (env.embeddingServiceUrl) {
110
+ await initCategoryEmbeddings();
111
+ console.error('[embedding-worker] category embeddings initialized');
83
112
  }
84
113
  const worker = new Worker('embedding', (job) => processEmbedJob(job, pool), {
85
114
  connection,
86
115
  concurrency,
87
116
  // 全局限流:所有 worker 进程共享,防止触发 OpenAI rate limit
88
117
  limiter: { max: rpmLimit, duration: 60_000 },
118
+ // 完成后立即从 Redis 清除,避免 jobId 残留导致下次同 hash 无法入队
119
+ removeOnComplete: { count: 0 },
120
+ removeOnFail: { count: 100 },
89
121
  });
90
- worker.on('completed', (job) => {
91
- console.error(`[worker] job done hash=${job.data.semanticHash.slice(0, 10)}…`);
122
+ // 累计统计:每次 drained 后重置
123
+ const stats = { completed: 0, updatedRows: 0 };
124
+ worker.on('completed', (_job, result) => {
125
+ stats.completed++;
126
+ stats.updatedRows += result?.updatedRows ?? 0;
92
127
  });
93
128
  worker.on('failed', (job, err) => {
94
- console.error(`[worker] ✗ job fail hash=${job?.data?.semanticHash?.slice(0, 10)}… err=${err.message}`);
129
+ console.error(`[worker] ✗ failed [${new Date().toISOString()}] table=${job?.data?.symbolsTable ?? env.symbolsTable} hash=${job?.data?.semanticHash?.slice(0, 10)}… err=${err.message}`);
95
130
  });
96
131
  worker.on('error', (err) => {
97
132
  console.error(`[worker] error: ${err.message}`);
98
133
  });
99
- return worker;
134
+ // 队列清空时打汇总(全量 reindex 入队后监听,确认所有 embedding 已处理)
135
+ queueEvents.on('drained', () => {
136
+ console.error(`[worker] ✅ queue drained [${new Date().toISOString()}] completed=${stats.completed} jobs rows_updated=${stats.updatedRows}`);
137
+ stats.completed = 0;
138
+ stats.updatedRows = 0;
139
+ });
140
+ const stop = async () => {
141
+ await worker.close();
142
+ await queueEvents.close();
143
+ };
144
+ return { worker, stop };
100
145
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@lorrylurui/code-intelligence-mcp",
3
- "version": "1.2.0",
3
+ "version": "2.0.1",
4
4
  "private": false,
5
5
  "description": "MCP server 提供仓库内可复用代码块(ts/tsx/js/jsx/css/less)的索引和查询能力,支持基于代码上下文的智能推荐。",
6
6
  "type": "module",
@@ -23,7 +23,7 @@
23
23
  "embedding:dev": "cd embedding-service && python3 -m uvicorn app:app --host 127.0.0.1 --port 8765",
24
24
  "docker:up": "docker compose up -d",
25
25
  "docker:down": "docker compose down",
26
- "docker:logs": "docker compose logs -f mysql"
26
+ "docker:logs": "docker compose logs -f postgres"
27
27
  },
28
28
  "dependencies": {
29
29
  "@babel/parser": "^7.29.2",
@@ -35,12 +35,14 @@
35
35
  "fast-glob": "^3.3.2",
36
36
  "ioredis": "^5.10.1",
37
37
  "mysql2": "^3.11.3",
38
+ "pg": "^8.20.0",
38
39
  "react": "^19.2.4",
39
40
  "ts-morph": "^25.0.0",
40
41
  "zod": "^3.23.8"
41
42
  },
42
43
  "devDependencies": {
43
44
  "@types/node": "^22.10.1",
45
+ "@types/pg": "^8.20.0",
44
46
  "tsx": "^4.19.2",
45
47
  "typescript": "^5.6.3"
46
48
  },