grepmax 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -81,7 +81,7 @@ const TOOLS = [
81
81
  },
82
82
  limit: {
83
83
  type: "number",
84
- description: "Max results to return (default 10, max 50)",
84
+ description: "Max results to return (default 3, max 50)",
85
85
  },
86
86
  root: {
87
87
  type: "string",
@@ -91,6 +91,10 @@ const TOOLS = [
91
91
  type: "string",
92
92
  description: "Restrict search to files under this path prefix (e.g. 'src/auth/'). Relative to the search root.",
93
93
  },
94
+ detail: {
95
+ type: "string",
96
+ description: "Output detail: 'pointer' (default, metadata only — symbol, location, role, calls) or 'code' (include 4-line code snippets)",
97
+ },
94
98
  min_score: {
95
99
  type: "number",
96
100
  description: "Minimum relevance score (0-1). Results below this threshold are filtered out. Default: 0 (no filtering)",
@@ -115,7 +119,11 @@ const TOOLS = [
115
119
  },
116
120
  limit: {
117
121
  type: "number",
118
- description: "Max results to return (default 10, max 50)",
122
+ description: "Max results to return (default 3, max 50)",
123
+ },
124
+ detail: {
125
+ type: "string",
126
+ description: "Output detail: 'pointer' (default) or 'code' (include snippets)",
119
127
  },
120
128
  min_score: {
121
129
  type: "number",
@@ -335,21 +343,21 @@ exports.mcp = new commander_1.Command("mcp")
335
343
  const query = String(args.query || "");
336
344
  if (!query)
337
345
  return err("Missing required parameter: query");
338
- const limit = Math.min(Math.max(Number(args.limit) || 10, 1), 50);
346
+ const limit = Math.min(Math.max(Number(args.limit) || 3, 1), 50);
339
347
  yield ensureIndexReady();
340
348
  try {
341
349
  const searcher = getSearcher();
342
- // Determine path prefix for scoping
350
+ // Determine path prefix and display root for relative paths
343
351
  let pathPrefix;
352
+ let displayRoot = projectRoot;
344
353
  if (!searchAll) {
345
- // Resolve search root — default to project root
346
354
  const searchRoot = typeof args.root === "string"
347
355
  ? path.resolve(args.root)
348
356
  : path.resolve(projectRoot);
357
+ displayRoot = searchRoot;
349
358
  pathPrefix = searchRoot.endsWith("/")
350
359
  ? searchRoot
351
360
  : `${searchRoot}/`;
352
- // If a sub-path is specified, append it
353
361
  if (typeof args.path === "string") {
354
362
  pathPrefix = path.join(searchRoot, args.path);
355
363
  if (!pathPrefix.endsWith("/"))
@@ -362,47 +370,72 @@ exports.mcp = new commander_1.Command("mcp")
362
370
  }
363
371
  const minScore = typeof args.min_score === "number" ? args.min_score : 0;
364
372
  const maxPerFile = typeof args.max_per_file === "number" ? args.max_per_file : 0;
365
- const MAX_SNIPPET_LINES = 8;
366
- let compact = result.data.map((r) => {
373
+ const detail = typeof args.detail === "string" ? args.detail : "pointer";
374
+ let results = result.data.map((r) => {
367
375
  var _a, _b, _c, _d, _e, _f, _g, _h, _j, _k, _l, _m;
368
- const startLine = (_c = (_a = r.startLine) !== null && _a !== void 0 ? _a : (_b = r.generated_metadata) === null || _b === void 0 ? void 0 : _b.start_line) !== null && _c !== void 0 ? _c : 0;
369
- const raw = typeof r.content === "string"
370
- ? r.content
371
- : typeof r.text === "string"
372
- ? r.text
373
- : "";
374
- // Add line numbers and cap at MAX_SNIPPET_LINES
375
- const lines = raw.split("\n");
376
- const capped = lines.slice(0, MAX_SNIPPET_LINES);
377
- const numbered = capped.map((line, i) => `${startLine + i + 1}│${line}`);
378
- const snippet = lines.length > MAX_SNIPPET_LINES
379
- ? `${numbered.join("\n")}\n… (+${lines.length - MAX_SNIPPET_LINES} more lines)`
380
- : numbered.join("\n");
376
+ const absPath = (_c = (_a = r.path) !== null && _a !== void 0 ? _a : (_b = r.metadata) === null || _b === void 0 ? void 0 : _b.path) !== null && _c !== void 0 ? _c : "";
377
+ const relPath = absPath.startsWith(displayRoot)
378
+ ? absPath.slice(displayRoot.length + 1)
379
+ : absPath;
380
+ const startLine = (_f = (_d = r.startLine) !== null && _d !== void 0 ? _d : (_e = r.generated_metadata) === null || _e === void 0 ? void 0 : _e.start_line) !== null && _f !== void 0 ? _f : 0;
381
+ const endLine = (_j = (_g = r.endLine) !== null && _g !== void 0 ? _g : (_h = r.generated_metadata) === null || _h === void 0 ? void 0 : _h.end_line) !== null && _j !== void 0 ? _j : 0;
382
+ const defs = toStringArray((_k = r.definedSymbols) !== null && _k !== void 0 ? _k : r.defined_symbols);
383
+ const refs = toStringArray((_l = r.referenced_symbols) !== null && _l !== void 0 ? _l : r.referencedSymbols);
384
+ const symbol = defs[0] || "(anonymous)";
385
+ const role = ((_m = r.role) !== null && _m !== void 0 ? _m : "IMPL").slice(0, 4).toUpperCase();
386
+ const exported = r.is_exported ? "exported " : "";
387
+ const complexity = typeof r.complexity === "number" && r.complexity > 0
388
+ ? ` C:${Math.round(r.complexity)}`
389
+ : "";
390
+ const parentStr = r.parent_symbol
391
+ ? `parent:${r.parent_symbol} `
392
+ : "";
393
+ const callsStr = refs.length > 0
394
+ ? `calls:${refs.slice(0, 8).join(",")}`
395
+ : "";
396
+ const line1 = `${symbol} [${exported}${role}${complexity}] ${relPath}:${startLine + 1}-${endLine + 1}`;
397
+ const summaryStr = r.summary ? ` ${r.summary}` : "";
398
+ const line2 = parentStr || callsStr
399
+ ? ` ${parentStr}${callsStr}`
400
+ : "";
401
+ let snippet = "";
402
+ if (detail === "code") {
403
+ const raw = typeof r.content === "string"
404
+ ? r.content
405
+ : typeof r.text === "string"
406
+ ? r.text
407
+ : "";
408
+ const lines = raw.split("\n").slice(0, 4);
409
+ snippet =
410
+ "\n" +
411
+ lines
412
+ .map((l, i) => `${startLine + i + 1}│${l}`)
413
+ .join("\n");
414
+ }
415
+ const text = line1 +
416
+ (summaryStr ? `\n${summaryStr}` : "") +
417
+ (line2 ? `\n${line2}` : "") +
418
+ snippet;
381
419
  return {
382
- path: (_f = (_d = r.path) !== null && _d !== void 0 ? _d : (_e = r.metadata) === null || _e === void 0 ? void 0 : _e.path) !== null && _f !== void 0 ? _f : "",
383
- startLine,
384
- endLine: (_j = (_g = r.endLine) !== null && _g !== void 0 ? _g : (_h = r.generated_metadata) === null || _h === void 0 ? void 0 : _h.end_line) !== null && _j !== void 0 ? _j : 0,
385
- score: typeof r.score === "number" ? +r.score.toFixed(3) : 0,
386
- role: (_k = r.role) !== null && _k !== void 0 ? _k : "IMPLEMENTATION",
387
- confidence: (_l = r.confidence) !== null && _l !== void 0 ? _l : "Unknown",
388
- definedSymbols: toStringArray((_m = r.definedSymbols) !== null && _m !== void 0 ? _m : r.defined_symbols).slice(0, 5),
389
- snippet,
420
+ absPath,
421
+ text,
422
+ score: typeof r.score === "number" ? r.score : 0,
390
423
  };
391
424
  });
392
425
  if (minScore > 0) {
393
- compact = compact.filter((r) => r.score >= minScore);
426
+ results = results.filter((r) => r.score >= minScore);
394
427
  }
395
428
  if (maxPerFile > 0) {
396
429
  const counts = new Map();
397
- compact = compact.filter((r) => {
398
- const count = counts.get(r.path) || 0;
430
+ results = results.filter((r) => {
431
+ const count = counts.get(r.absPath) || 0;
399
432
  if (count >= maxPerFile)
400
433
  return false;
401
- counts.set(r.path, count + 1);
434
+ counts.set(r.absPath, count + 1);
402
435
  return true;
403
436
  });
404
437
  }
405
- return ok(JSON.stringify(compact));
438
+ return ok(results.map((r) => r.text).join("\n\n"));
406
439
  }
407
440
  catch (e) {
408
441
  const msg = e instanceof Error ? e.message : String(e);
@@ -127,8 +127,7 @@ class Searcher {
127
127
  referenced_symbols: referencedSymbols,
128
128
  imports,
129
129
  exports,
130
- // Remove 'context' field entirely from JSON output
131
- // context: record.context_prev ? [record.context_prev] : [],
130
+ summary: record.summary,
132
131
  };
133
132
  }
134
133
  applyStructureBoost(record, score, intent) {
@@ -97,6 +97,7 @@ class VectorDB {
97
97
  role: "",
98
98
  parent_symbol: "",
99
99
  file_skeleton: "",
100
+ summary: "",
100
101
  };
101
102
  }
102
103
  validateSchema(table) {
@@ -138,6 +139,7 @@ class VectorDB {
138
139
  new apache_arrow_1.Field("role", new apache_arrow_1.Utf8(), true),
139
140
  new apache_arrow_1.Field("parent_symbol", new apache_arrow_1.Utf8(), true),
140
141
  new apache_arrow_1.Field("file_skeleton", new apache_arrow_1.Utf8(), true),
142
+ new apache_arrow_1.Field("summary", new apache_arrow_1.Utf8(), true),
141
143
  ]);
142
144
  }
143
145
  ensureTable() {
@@ -201,7 +203,7 @@ class VectorDB {
201
203
  return [];
202
204
  };
203
205
  const rows = records.map((rec) => {
204
- var _a, _b, _c, _d, _e, _f, _g, _h, _j, _k, _l, _m, _o;
206
+ var _a, _b, _c, _d, _e, _f, _g, _h, _j, _k, _l, _m, _o, _p;
205
207
  const vec = (() => {
206
208
  const arr = toNumberArray(rec.vector);
207
209
  if (arr.length < this.vectorDim) {
@@ -241,6 +243,7 @@ class VectorDB {
241
243
  role: (_l = rec.role) !== null && _l !== void 0 ? _l : "",
242
244
  parent_symbol: (_m = rec.parent_symbol) !== null && _m !== void 0 ? _m : "",
243
245
  file_skeleton: (_o = rec.file_skeleton) !== null && _o !== void 0 ? _o : "",
246
+ summary: (_p = rec.summary) !== null && _p !== void 0 ? _p : null,
244
247
  };
245
248
  });
246
249
  try {
@@ -49,6 +49,7 @@ const transformers_1 = require("@huggingface/transformers");
49
49
  const ort = __importStar(require("onnxruntime-node"));
50
50
  const uuid_1 = require("uuid");
51
51
  const config_1 = require("../../config");
52
+ const llm_client_1 = require("./summarize/llm-client");
52
53
  const chunker_1 = require("../index/chunker");
53
54
  const skeleton_1 = require("../skeleton");
54
55
  const file_utils_1 = require("../utils/file-utils");
@@ -213,7 +214,23 @@ class WorkerOrchestrator {
213
214
  if (!chunks.length)
214
215
  return { vectors: [], hash, mtimeMs, size };
215
216
  const preparedChunks = this.toPreparedChunks(input.path, hash, chunks, skeletonResult.success ? skeletonResult.skeleton : undefined);
216
- const hybrids = yield this.computeHybrid(preparedChunks.map((chunk) => chunk.content), onProgress);
217
+ // Run embedding and summarization in parallel
218
+ const lang = path.extname(input.path).replace(/^\./, "") || "unknown";
219
+ const [hybrids, summaries] = yield Promise.all([
220
+ this.computeHybrid(preparedChunks.map((chunk) => chunk.content), onProgress),
221
+ (0, llm_client_1.summarizeChunks)(preparedChunks.map((c) => ({
222
+ code: c.content,
223
+ language: lang,
224
+ file: c.path,
225
+ }))),
226
+ ]);
227
+ // Attach summaries if available
228
+ if (summaries) {
229
+ for (let i = 0; i < preparedChunks.length; i++) {
230
+ if (summaries[i])
231
+ preparedChunks[i].summary = summaries[i];
232
+ }
233
+ }
217
234
  const vectors = preparedChunks.map((chunk, idx) => {
218
235
  var _a;
219
236
  const hybrid = (_a = hybrids[idx]) !== null && _a !== void 0 ? _a : {
@@ -0,0 +1,165 @@
1
+ "use strict";
2
+ /**
3
+ * LLM summarizer HTTP client.
4
+ * Talks to the MLX summarizer server to generate code summaries.
5
+ * Returns null if server isn't running — caller skips summaries gracefully.
6
+ */
7
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
8
+ if (k2 === undefined) k2 = k;
9
+ var desc = Object.getOwnPropertyDescriptor(m, k);
10
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
11
+ desc = { enumerable: true, get: function() { return m[k]; } };
12
+ }
13
+ Object.defineProperty(o, k2, desc);
14
+ }) : (function(o, m, k, k2) {
15
+ if (k2 === undefined) k2 = k;
16
+ o[k2] = m[k];
17
+ }));
18
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
19
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
20
+ }) : function(o, v) {
21
+ o["default"] = v;
22
+ });
23
+ var __importStar = (this && this.__importStar) || (function () {
24
+ var ownKeys = function(o) {
25
+ ownKeys = Object.getOwnPropertyNames || function (o) {
26
+ var ar = [];
27
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
28
+ return ar;
29
+ };
30
+ return ownKeys(o);
31
+ };
32
+ return function (mod) {
33
+ if (mod && mod.__esModule) return mod;
34
+ var result = {};
35
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
36
+ __setModuleDefault(result, mod);
37
+ return result;
38
+ };
39
+ })();
40
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
41
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
42
+ return new (P || (P = Promise))(function (resolve, reject) {
43
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
44
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
45
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
46
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
47
+ });
48
+ };
49
+ Object.defineProperty(exports, "__esModule", { value: true });
50
+ exports.summarizeChunks = summarizeChunks;
51
+ exports.resetSummarizerCache = resetSummarizerCache;
52
+ const http = __importStar(require("node:http"));
53
+ const SUMMARY_PORT = parseInt(process.env.GMAX_SUMMARY_PORT || "8101", 10);
54
+ const SUMMARY_HOST = "127.0.0.1";
55
+ const SUMMARY_TIMEOUT_MS = 120000; // 2 min — per-chunk LLM generation can take time
56
+ let summarizerAvailable = null;
57
+ let lastCheck = 0;
58
+ const CHECK_INTERVAL_MS = 5000; // short cache — retry quickly if server just started
59
+ function postJSON(path, body) {
60
+ return new Promise((resolve) => {
61
+ const payload = JSON.stringify(body);
62
+ const req = http.request({
63
+ hostname: SUMMARY_HOST,
64
+ port: SUMMARY_PORT,
65
+ path,
66
+ method: "POST",
67
+ headers: {
68
+ "Content-Type": "application/json",
69
+ "Content-Length": Buffer.byteLength(payload),
70
+ },
71
+ timeout: SUMMARY_TIMEOUT_MS,
72
+ }, (res) => {
73
+ const chunks = [];
74
+ res.on("data", (chunk) => chunks.push(chunk));
75
+ res.on("end", () => {
76
+ try {
77
+ const data = JSON.parse(Buffer.concat(chunks).toString("utf-8"));
78
+ resolve({ ok: res.statusCode === 200, data });
79
+ }
80
+ catch (_a) {
81
+ resolve({ ok: false });
82
+ }
83
+ });
84
+ });
85
+ req.on("error", () => resolve({ ok: false }));
86
+ req.on("timeout", () => {
87
+ req.destroy();
88
+ resolve({ ok: false });
89
+ });
90
+ req.write(payload);
91
+ req.end();
92
+ });
93
+ }
94
+ function isSummarizerUp() {
95
+ return __awaiter(this, void 0, void 0, function* () {
96
+ const now = Date.now();
97
+ if (summarizerAvailable !== null && now - lastCheck < CHECK_INTERVAL_MS) {
98
+ return summarizerAvailable;
99
+ }
100
+ const result = yield new Promise((resolve) => {
101
+ const req = http.get({
102
+ hostname: SUMMARY_HOST,
103
+ port: SUMMARY_PORT,
104
+ path: "/health",
105
+ timeout: 5000,
106
+ }, (res) => {
107
+ res.resume();
108
+ resolve(res.statusCode === 200);
109
+ });
110
+ req.on("error", () => resolve(false));
111
+ req.on("timeout", () => {
112
+ req.destroy();
113
+ resolve(false);
114
+ });
115
+ });
116
+ summarizerAvailable = result;
117
+ lastCheck = now;
118
+ return result;
119
+ });
120
+ }
121
+ /**
122
+ * Generate summaries for code chunks via the local LLM server.
123
+ * Sends one chunk at a time. Health-checks the server only when availability is unknown or stale; otherwise requests go straight through.
124
+ * If the server is busy, the TCP connection queues until it's ready.
125
+ * Returns string[] on success, null if server unavailable.
126
+ */
127
+ function summarizeChunks(chunks) {
128
+ return __awaiter(this, void 0, void 0, function* () {
129
+ var _a;
130
+ if (chunks.length === 0)
131
+ return [];
132
+ // Quick check only if we've never connected
133
+ if (summarizerAvailable === null) {
134
+ summarizerAvailable = yield isSummarizerUp();
135
+ if (!summarizerAvailable)
136
+ return null;
137
+ }
138
+ if (summarizerAvailable === false) {
139
+ // Recheck periodically
140
+ const now = Date.now();
141
+ if (now - lastCheck < CHECK_INTERVAL_MS)
142
+ return null;
143
+ summarizerAvailable = yield isSummarizerUp();
144
+ if (!summarizerAvailable)
145
+ return null;
146
+ }
147
+ const summaries = [];
148
+ for (const chunk of chunks) {
149
+ const { ok, data } = yield postJSON("/summarize", {
150
+ chunks: [chunk],
151
+ });
152
+ if (!ok || !((_a = data === null || data === void 0 ? void 0 : data.summaries) === null || _a === void 0 ? void 0 : _a[0])) {
153
+ summaries.push("");
154
+ }
155
+ else {
156
+ summaries.push(data.summaries[0]);
157
+ }
158
+ }
159
+ return summaries;
160
+ });
161
+ }
162
+ function resetSummarizerCache() {
163
+ summarizerAvailable = null;
164
+ lastCheck = 0;
165
+ }
@@ -1,13 +1,15 @@
1
1
  [project]
2
2
  name = "mlx-embed-server"
3
3
  version = "0.1.0"
4
- description = "MLX-accelerated embedding server for grepmax"
4
+ description = "MLX-accelerated embedding and summarization server for grepmax"
5
5
  requires-python = ">=3.13"
6
6
  dependencies = [
7
7
  "fastapi>=0.115.0",
8
8
  "uvicorn>=0.34.0",
9
9
  "mlx-embeddings @ git+https://github.com/Blaizzy/mlx-embeddings.git",
10
+ "mlx-lm>=0.22.0",
10
11
  ]
11
12
 
12
13
  [project.scripts]
13
14
  mlx-embed-server = "server:main"
15
+ mlx-summarizer = "summarizer:main"
@@ -0,0 +1,169 @@
1
+ """MLX-accelerated code summarizer for grepmax.
2
+
3
+ Runs Qwen3-Coder-30B-A3B on Apple Silicon GPU to generate one-line
4
+ summaries of code chunks during indexing. Summaries are stored in
5
+ LanceDB and returned in search results.
6
+
7
+ IMPORTANT: All MLX operations must run on a single thread. FastAPI async
8
+ endpoints run on the event loop thread, avoiding Metal thread-safety crashes.
9
+ """
10
+
11
+ import asyncio
12
+ import logging
13
+ import os
14
+ import signal
15
+ import socket
16
+ import time
17
+ import warnings
18
+ from contextlib import asynccontextmanager
19
+
20
+ os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "1"
21
+ os.environ["HF_HUB_DISABLE_IMPLICIT_TOKEN"] = "1"
22
+ os.environ["HF_HUB_VERBOSITY"] = "error"
23
+ os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"
24
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
25
+ warnings.filterwarnings("ignore", message=".*PyTorch.*")
26
+ warnings.filterwarnings("ignore", message=".*resource_tracker.*")
27
+ logging.getLogger("huggingface_hub").setLevel(logging.ERROR)
28
+
29
+ import mlx.core as mx
30
+ import uvicorn
31
+ from fastapi import FastAPI
32
+ from mlx_lm import load, generate
33
+ from pydantic import BaseModel
34
+
35
+ MODEL_ID = os.environ.get(
36
+ "MLX_SUMMARY_MODEL",
37
+ "lmstudio-community/Qwen3-Coder-30B-A3B-Instruct-MLX-4bit",
38
+ )
39
+ PORT = int(os.environ.get("MLX_SUMMARY_PORT", "8101"))
40
+ IDLE_TIMEOUT_S = int(os.environ.get("MLX_SUMMARY_IDLE_TIMEOUT", "1800")) # 30 min
41
+ MAX_TOKENS = 100 # summaries should be one line
42
+
43
+ model = None
44
+ tokenizer = None
45
+ last_activity = time.time()
46
+
47
+ _mlx_lock = asyncio.Lock()
48
+
49
+ SYSTEM_PROMPT = """You are a code summarizer. Given a code chunk, produce exactly one line describing what it does.
50
+ Be specific about business logic, services, and side effects. Do not describe syntax.
51
+ Do not use phrases like "This function" or "This code". Start with a verb."""
52
+
53
+ def build_prompt(code: str, language: str, file: str) -> str:
54
+ return f"Language: {language}\nFile: {file}\n\n```\n{code}\n```"
55
+
56
+
57
+ def is_port_in_use(port: int) -> bool:
58
+ with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
59
+ return s.connect_ex(("127.0.0.1", port)) == 0
60
+
61
+
62
+ def summarize_chunk(code: str, language: str, file: str) -> str:
63
+ """Generate a one-line summary for a code chunk."""
64
+ messages = [
65
+ {"role": "system", "content": SYSTEM_PROMPT},
66
+ {"role": "user", "content": build_prompt(code, language, file)},
67
+ ]
68
+ prompt = tokenizer.apply_chat_template(
69
+ messages, tokenize=False, add_generation_prompt=True
70
+ )
71
+ response = generate(
72
+ model,
73
+ tokenizer,
74
+ prompt=prompt,
75
+ max_tokens=MAX_TOKENS,
76
+ verbose=False,
77
+ )
78
+ # Take first line only, strip whitespace
79
+ summary = response.strip().split("\n")[0].strip()
80
+ # Remove common prefixes the model might add
81
+ for prefix in ["Summary: ", "summary: ", "- "]:
82
+ if summary.startswith(prefix):
83
+ summary = summary[len(prefix):]
84
+ return summary
85
+
86
+
87
+ def load_model():
88
+ global model, tokenizer
89
+ print(f"[summarizer] Loading {MODEL_ID}...")
90
+ model, tokenizer = load(MODEL_ID)
91
+ # Warm up
92
+ _ = summarize_chunk("function hello() { return 'world'; }", "javascript", "test.js")
93
+ print("[summarizer] Model ready on Metal GPU.")
94
+
95
+
96
+ @asynccontextmanager
97
+ async def lifespan(app: FastAPI):
98
+ load_model()
99
+ yield
100
+
101
+
102
+ app = FastAPI(lifespan=lifespan)
103
+
104
+
105
+ class ChunkInput(BaseModel):
106
+ code: str
107
+ language: str = "unknown"
108
+ file: str = ""
109
+
110
+
111
+ class SummarizeRequest(BaseModel):
112
+ chunks: list[ChunkInput]
113
+
114
+
115
+ class SummarizeResponse(BaseModel):
116
+ summaries: list[str]
117
+
118
+
119
+ @app.post("/summarize")
120
+ async def summarize(request: SummarizeRequest) -> SummarizeResponse:
121
+ global last_activity
122
+ last_activity = time.time()
123
+
124
+ summaries = []
125
+ async with _mlx_lock:
126
+ for chunk in request.chunks:
127
+ try:
128
+ summary = summarize_chunk(chunk.code, chunk.language, chunk.file)
129
+ summaries.append(summary)
130
+ except Exception as e:
131
+ summaries.append(f"(summary failed: {e})")
132
+
133
+ return SummarizeResponse(summaries=summaries)
134
+
135
+
136
+ @app.get("/health")
137
+ async def health():
138
+ # Health check must NOT acquire _mlx_lock — it must respond instantly
139
+ # even when a summarization is in progress
140
+ global last_activity
141
+ last_activity = time.time()
142
+ return {"status": "ok", "model": MODEL_ID}
143
+
144
+
145
+ def main():
146
+ if is_port_in_use(PORT):
147
+ print(f"[summarizer] Port {PORT} already in use — server is already running.")
148
+ return
149
+
150
+ print(f"[summarizer] Starting on port {PORT}")
151
+
152
+ def handle_signal(sig, frame):
153
+ print("[summarizer] Stopped.")
154
+ try:
155
+ from multiprocessing.resource_tracker import _resource_tracker
156
+ if _resource_tracker._pid is not None:
157
+ os.kill(_resource_tracker._pid, signal.SIGKILL)
158
+ except Exception:
159
+ pass
160
+ os._exit(0)
161
+
162
+ signal.signal(signal.SIGINT, handle_signal)
163
+ signal.signal(signal.SIGTERM, handle_signal)
164
+
165
+ uvicorn.run(app, host="127.0.0.1", port=PORT, log_level="warning")
166
+
167
+
168
+ if __name__ == "__main__":
169
+ main()
@@ -610,6 +610,7 @@ source = { virtual = "." }
610
610
  dependencies = [
611
611
  { name = "fastapi" },
612
612
  { name = "mlx-embeddings" },
613
+ { name = "mlx-lm" },
613
614
  { name = "uvicorn" },
614
615
  ]
615
616
 
@@ -617,6 +618,7 @@ dependencies = [
617
618
  requires-dist = [
618
619
  { name = "fastapi", specifier = ">=0.115.0" },
619
620
  { name = "mlx-embeddings", git = "https://github.com/Blaizzy/mlx-embeddings.git" },
621
+ { name = "mlx-lm", specifier = ">=0.22.0" },
620
622
  { name = "uvicorn", specifier = ">=0.34.0" },
621
623
  ]
622
624
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "grepmax",
3
- "version": "0.3.0",
3
+ "version": "0.4.0",
4
4
  "author": "Robert Owens <robowens@me.com>",
5
5
  "homepage": "https://github.com/reowens/grepmax",
6
6
  "bugs": {
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "grepmax",
3
- "version": "0.3.0",
3
+ "version": "0.4.0",
4
4
  "description": "Semantic code search for Claude Code. Automatically indexes your project and provides intelligent search capabilities.",
5
5
  "author": {
6
6
  "name": "Robert Owens",
@@ -3,10 +3,10 @@ const _path = require("node:path");
3
3
  const http = require("node:http");
4
4
  const { spawn } = require("node:child_process");
5
5
 
6
- function isMlxRunning() {
6
+ function isServerRunning(port) {
7
7
  return new Promise((resolve) => {
8
8
  const req = http.get(
9
- { hostname: "127.0.0.1", port: 8100, path: "/health", timeout: 1000 },
9
+ { hostname: "127.0.0.1", port, path: "/health", timeout: 1000 },
10
10
  (res) => {
11
11
  res.resume();
12
12
  resolve(res.statusCode === 200);
@@ -20,17 +20,17 @@ function isMlxRunning() {
20
20
  });
21
21
  }
22
22
 
23
- function startMlxServer() {
23
+ function startPythonServer(scriptName, logName) {
24
24
  const pluginRoot = __dirname.replace(/\/hooks$/, "");
25
25
  const gmaxRoot = _path.resolve(pluginRoot, "../..");
26
26
  const serverDir = _path.join(gmaxRoot, "mlx-embed-server");
27
27
 
28
- if (!fs.existsSync(_path.join(serverDir, "server.py"))) return;
28
+ if (!fs.existsSync(_path.join(serverDir, scriptName))) return;
29
29
 
30
- const logPath = "/tmp/mlx-embed-server.log";
30
+ const logPath = `/tmp/${logName}.log`;
31
31
  const out = fs.openSync(logPath, "a");
32
32
 
33
- const child = spawn("uv", ["run", "python", "server.py"], {
33
+ const child = spawn("uv", ["run", "python", scriptName], {
34
34
  cwd: serverDir,
35
35
  detached: true,
36
36
  stdio: ["ignore", out, out],
@@ -40,17 +40,21 @@ function startMlxServer() {
40
40
  }
41
41
 
42
42
  async function main() {
43
- // Start MLX embed server if not running (set GMAX_EMBED_MODE=cpu to skip)
44
43
  const embedMode =
45
44
  process.env.GMAX_EMBED_MODE || process.env.OSGREP_EMBED_MODE || "auto";
45
+
46
46
  if (embedMode !== "cpu") {
47
- const mlxUp = await isMlxRunning();
48
- if (!mlxUp) {
49
- startMlxServer();
47
+ // Start MLX embed server (port 8100)
48
+ if (!(await isServerRunning(8100))) {
49
+ startPythonServer("server.py", "mlx-embed-server");
50
+ }
51
+
52
+ // Start LLM summarizer server (port 8101)
53
+ if (!(await isServerRunning(8101))) {
54
+ startPythonServer("summarizer.py", "mlx-summarizer");
50
55
  }
51
56
  }
52
57
 
53
- // MCP server handles indexing and search directly — no daemon needed
54
58
  const response = {
55
59
  hookSpecificOutput: {
56
60
  hookEventName: "SessionStart",
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  name: gmax
3
3
  description: Semantic code search. Use alongside grep - grep for exact strings, gmax for concepts.
4
- allowed-tools: "mcp__grepmax__semantic_search, mcp__grepmax__code_skeleton, mcp__grepmax__trace_calls, mcp__grepmax__list_symbols, mcp__grepmax__index_status, Bash(gmax:*), Read"
4
+ allowed-tools: "mcp__grepmax__semantic_search, mcp__grepmax__search_all, mcp__grepmax__code_skeleton, mcp__grepmax__trace_calls, mcp__grepmax__list_symbols, mcp__grepmax__index_status, Bash(gmax:*), Read"
5
5
  ---
6
6
 
7
7
  ## What gmax does
@@ -11,24 +11,35 @@ Finds code by meaning. When you'd ask a colleague "where do we handle auth?", us
11
11
  - grep/ripgrep: exact string match, fast
12
12
  - gmax: concept match, finds code you couldn't grep for
13
13
 
14
- ## MCP tools (preferred)
15
-
16
- Use these structured tools when available — they return typed JSON and don't need output parsing.
14
+ ## MCP tools
17
15
 
18
16
  ### semantic_search
19
- Search code by meaning. Returns ranked snippets with file paths, line numbers, scores.
17
+ Search code by meaning. Returns **pointers** by default symbol, file:line, role, calls. No code snippets unless requested.
20
18
  - `query` (required): Natural language. Be specific — more words = better results.
21
- - `limit` (optional): Max results (default 10, max 50)
19
+ - `limit` (optional): Max results (default 3, max 50)
20
+ - `root` (optional): Directory to search. Defaults to project root. Use to search a parent directory (e.g. `root: "../"` to search the monorepo).
22
21
  - `path` (optional): Restrict to path prefix (e.g. "src/auth/")
22
+ - `detail` (optional): `"pointer"` (default) or `"code"` (adds 4-line numbered snippets)
23
23
  - `min_score` (optional): Filter by minimum relevance score (0-1)
24
24
  - `max_per_file` (optional): Cap results per file for diversity
25
25
 
26
+ **Output format (pointer mode):**
27
+ ```
28
+ handleAuth [exported ORCH C:8] src/auth/handler.ts:45-90
29
+ parent:AuthController calls:validateToken,checkRole,respond
30
+ ```
31
+
32
+ **When to use `detail: "code"`:** Only when you need to see the actual code before deciding to Read — e.g. comparing implementations, checking syntax. For navigation ("where is X?"), pointer mode is sufficient.
33
+
34
+ ### search_all
35
+ Search ALL indexed code across every directory. Same output format as semantic_search. Use when code could be anywhere — e.g. tracing a function across projects.
36
+
26
37
  ### code_skeleton
27
38
  Show file structure — signatures with bodies collapsed (~4x fewer tokens).
28
39
  - `target` (required): File path relative to project root
29
40
 
30
41
  ### trace_calls
31
- Trace call graph — who calls a symbol and what it calls.
42
+ Trace call graph — who calls a symbol and what it calls. Unscoped — follows calls across all indexed directories.
32
43
  - `symbol` (required): Function/method/class name (e.g. "handleAuth")
33
44
 
34
45
  ### list_symbols
@@ -38,45 +49,20 @@ List indexed symbols with definition locations.
38
49
  - `path` (optional): Only symbols under this path prefix
39
50
 
40
51
  ### index_status
41
- Check index and daemon health — file count, chunks, embed mode, age, watching status.
42
-
43
- ## CLI fallback
52
+ Check centralized index health — chunk count, files, indexed directories, model info.
44
53
 
45
- If MCP tools aren't available, use the CLI via Bash:
54
+ ## Workflow
46
55
 
47
- ```bash
48
- gmax "where do we validate user permissions" # Semantic search
49
- gmax "authentication" --compact # Just file paths + line ranges
50
- gmax skeleton src/giant-2000-line-file.ts # File structure
51
- gmax trace handleAuth # Call graph
52
- gmax symbols booking # Find symbols by name
53
- ```
54
-
55
- ## Output explained (CLI)
56
- ```
57
- ORCHESTRATION src/auth/handler.ts:45
58
- Defines: handleAuth | Calls: validate, checkRole, respond | Score: .94
56
+ 1. **Locate** — `semantic_search` with pointer mode to find relevant code
57
+ 2. **Read** `Read file:line` for the specific ranges you need
58
+ 3. **Trace** `trace_calls` to understand how functions connect
59
+ 4. **Skeleton** `code_skeleton` before reading large files
59
60
 
60
- export async function handleAuth(req: Request) {
61
- const token = req.headers.get("Authorization");
62
- const claims = await validateToken(token);
63
- ...
64
- ```
65
-
66
- - **ORCHESTRATION** = contains logic, coordinates other code
67
- - **DEFINITION** = types, interfaces, classes
68
- - **Score** = relevance (1 = best match)
69
- - **Calls** = what this code calls (helps trace flow)
61
+ Don't read entire files. Use the line ranges from search results.
70
62
 
71
63
  ## Tips
72
64
 
73
65
  - More words = better results. "auth" is vague. "where does the server validate JWT tokens" is specific.
74
- - ORCH results contain the logic — prioritize these.
75
- - Don't read entire files. Use the line ranges from results.
76
- - If results seem off, rephrase like you'd ask a teammate.
77
- - Use `code_skeleton` before reading large files — understand structure first.
78
- - Use `trace_calls` to understand how functions connect across the codebase.
79
-
80
- ## If Index is Building
81
-
82
- If you see "Indexing" or daemon not ready: tell the user. Ask if they want to wait or proceed with partial results.
66
+ - ORCH results contain the logic — prioritize these over DEF/IMPL.
67
+ - Use `root` to search parent directories (monorepo, workspace).
68
+ - Use `search_all` sparingly — it searches everything indexed.