code-memory 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
parser.py ADDED
@@ -0,0 +1,392 @@
1
+ """
2
+ Language-agnostic AST parser and incremental indexer for code-memory.
3
+
4
+ Uses **tree-sitter** for multi-language structural parsing. Supports
5
+ Python, JavaScript, TypeScript, Java, Go, Rust, C, C++, and Ruby out of
6
+ the box. Falls back to whole-file indexing for unsupported languages so
7
+ that every source file is still searchable via BM25 / vector search.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import logging
13
+ import os
14
+ from pathlib import Path
15
+ from typing import Any
16
+
17
+ from tree_sitter import Language, Parser, Node
18
+
19
+ import db as db_mod
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
# ── Directories to skip ───────────────────────────────────────────────
# VCS metadata, virtualenvs, caches, and build output are never indexed.
_SKIP_DIRS = frozenset({
    ".git", ".mypy_cache", ".pytest_cache", ".ruff_cache", ".tox",
    ".venv", "__pycache__", "bin", "build", "dist",
    "node_modules", "obj", "target", "venv",
})

# ── File extensions we consider "source code" ─────────────────────────
# Files with these extensions are indexed even when no tree-sitter
# grammar is available (whole-file fallback keeps them searchable).
_SOURCE_EXTENSIONS = frozenset({
    # compiled / systems languages
    ".c", ".cc", ".cpp", ".cs", ".cxx", ".go", ".h", ".hpp",
    ".java", ".kt", ".kts", ".rs", ".scala", ".swift",
    # scripting languages
    ".bash", ".js", ".jsx", ".lua", ".py", ".rb", ".sh",
    ".ts", ".tsx", ".zsh",
    # markup / data / config / docs
    ".css", ".dockerfile", ".html", ".json", ".makefile", ".md",
    ".scss", ".sql", ".toml", ".txt", ".yaml", ".yml",
})

# ---------------------------------------------------------------------------
# Tree-sitter language registry (lazy-loaded)
# ---------------------------------------------------------------------------

# Cache of successfully loaded grammars, keyed by lowercase file extension.
_LANGUAGES: dict[str, Language] = {}
45
+
46
+
47
def _load_language(ext: str) -> Language | None:
    """Return the cached tree-sitter Language for *ext*, loading on demand.

    Only successful loads are cached; a failed lookup is retried on the
    next call, so a grammar installed mid-process can still be found.
    """
    cached = _LANGUAGES.get(ext)
    if cached is not None:
        return cached

    loaded = _try_import_language(ext)
    if loaded is not None:
        _LANGUAGES[ext] = loaded
    return loaded
56
+
57
+
58
def _try_import_language(ext: str) -> Language | None:
    """Attempt to import the tree-sitter grammar for *ext*.

    Each supported extension maps to a separately installable grammar
    package; a missing package just means the language is unsupported.
    Returns None for unknown extensions or uninstalled grammars.
    """
    try:
        if ext == ".py":
            import tree_sitter_python as mod
            return Language(mod.language())
        if ext in (".js", ".jsx"):
            import tree_sitter_javascript as mod
            return Language(mod.language())
        if ext in (".ts", ".tsx"):
            import tree_sitter_typescript as ts_mod
            # TypeScript grammar exposes typescript and tsx separately
            if ext == ".tsx":
                return Language(ts_mod.language_tsx())
            return Language(ts_mod.language_typescript())
        if ext == ".java":
            import tree_sitter_java as mod
            return Language(mod.language())
        if ext == ".go":
            import tree_sitter_go as mod
            return Language(mod.language())
        if ext == ".rs":
            import tree_sitter_rust as mod
            return Language(mod.language())
        if ext in (".c", ".h"):
            import tree_sitter_c as mod
            return Language(mod.language())
        if ext in (".cpp", ".hpp", ".cc", ".cxx"):
            import tree_sitter_cpp as mod
            return Language(mod.language())
        if ext == ".rb":
            import tree_sitter_ruby as mod
            return Language(mod.language())
        if ext in (".kt", ".kts"):
            import tree_sitter_kotlin as mod
            return Language(mod.language())
    except ImportError:
        logger.debug("No tree-sitter grammar for %s", ext)
        return None
    # No branch matched: extension has no known grammar package.
    return None
91
+
92
+
93
+ # ---------------------------------------------------------------------------
94
+ # Tree-sitter node-type → symbol kind mapping (per language family)
95
+ # ---------------------------------------------------------------------------
96
+
97
# Maps tree-sitter node types to our normalised (kind, is_container) pairs.
# Several node types are shared across language families (Python and C/C++
# both emit "function_definition"; JS/TS and Kotlin both emit
# "function_declaration"/"class_declaration"; Java and Go both emit
# "method_declaration"). Each key appears exactly once here — the original
# literal repeated keys, which Python silently resolves to the last
# occurrence and which hides any future divergence between the values.
_NODE_KIND_MAP: dict[str, tuple[str, bool]] = {
    # Python / C / C++
    "function_definition": ("function", False),
    # Python
    "class_definition": ("class", True),
    # JS / TS / Kotlin
    "function_declaration": ("function", False),
    "class_declaration": ("class", True),
    # JS / TS
    "arrow_function": ("function", False),
    "method_definition": ("method", False),
    "lexical_declaration": ("variable", False),
    # Java / Go
    "method_declaration": ("method", False),
    # Java
    "constructor_declaration": ("method", False),
    "interface_declaration": ("class", True),
    # Go
    "type_spec": ("class", False),
    # Rust
    "function_item": ("function", False),
    "struct_item": ("class", False),
    "impl_item": ("class", True),
    "enum_item": ("class", False),
    "trait_item": ("class", True),
    # C / C++
    "struct_specifier": ("class", False),
    "class_specifier": ("class", True),
    # Kotlin
    "object_declaration": ("class", True),
    "companion_object": ("class", True),
    # Ruby
    "method": ("method", False),
    "singleton_method": ("method", False),
    "class": ("class", True),
    "module": ("class", True),
}
136
+
137
+
138
def _node_name(node: Node, source: bytes) -> str:
    """Extract the symbol name from a tree-sitter node.

    Prefers a direct name-like child; otherwise falls back to the first
    identifier anywhere inside the node, then to a synthetic
    ``<anonymous@LINE>`` placeholder.
    """
    name_like = ("identifier", "name", "property_identifier",
                 "type_identifier", "constant")
    # Named definitions usually carry the name as an immediate child.
    for child in node.children:
        if child.type in name_like:
            return source[child.start_byte:child.end_byte].decode("utf-8", errors="replace")
    # Fall back to a deep search, then to an anonymous placeholder.
    found = _first_identifier(node, source)
    return found or f"<anonymous@{node.start_point[0] + 1}>"
150
+
151
+
152
def _first_identifier(node: Node, source: bytes) -> str | None:
    """Depth-first search for the text of the first identifier node.

    A child subtree that yields a falsy result (no match, or an empty
    span) is skipped in favour of later siblings.
    """
    if node.type in ("identifier", "name"):
        # Decode the node's byte span; undecodable bytes are replaced.
        return source[node.start_byte:node.end_byte].decode("utf-8", errors="replace")
    return next(
        (found
         for found in (_first_identifier(child, source) for child in node.children)
         if found),
        None,
    )
161
+
162
+
163
+ # ---------------------------------------------------------------------------
164
+ # Symbol extraction via tree-sitter
165
+ # ---------------------------------------------------------------------------
166
+
167
def _extract_symbols(
    tree_root: Node,
    source: bytes,
) -> list[dict[str, Any]]:
    """Walk the tree-sitter AST and extract symbols.

    Returns the list of *top-level* symbol dicts with keys:
        name, kind, line_start, line_end, source_text, children

    Symbols found inside a container (class, impl block, trait, …) are
    attached to their parent's ``children`` list and removed from the
    top-level list, so a consumer that recurses through ``children``
    sees each symbol exactly once.
    """
    symbols: list[dict[str, Any]] = []

    def _walk(node: Node, parent_kind: str | None = None) -> None:
        mapping = _NODE_KIND_MAP.get(node.type)

        if mapping:
            kind, is_container = mapping
            # Promote function → method if parent is a class/container
            if kind == "function" and parent_kind in ("class",):
                kind = "method"

            src_text = source[node.start_byte:node.end_byte].decode(
                "utf-8", errors="replace"
            )
            sym = {
                "name": _node_name(node, source),
                "kind": kind,
                "line_start": node.start_point[0] + 1,  # 1-indexed
                "line_end": node.end_point[0] + 1,
                "source_text": src_text,
                "children": [],
            }
            symbols.append(sym)

            # Recurse into container nodes (classes, impl blocks, etc.)
            if is_container:
                first_child_idx = len(symbols)
                for child in node.children:
                    _walk(child, parent_kind=kind)
                # BUG FIX: nested symbols previously stayed in the flat
                # list AND were copied into ``children``, so callers
                # indexed every nested symbol twice (once with a null
                # parent). Move — don't copy — them under their parent.
                sym["children"] = symbols[first_child_idx:]
                del symbols[first_child_idx:]
                return
            # Non-container symbols (e.g. functions) fall through and
            # keep scanning their bodies with the *same* parent kind.

        # Recurse into children looking for more symbols.
        for child in node.children:
            _walk(child, parent_kind=parent_kind)

    _walk(tree_root)
    return symbols
217
+
218
+
219
def _extract_references(tree_root: Node, source: bytes) -> list[dict[str, Any]]:
    """Collect unique identifier references from the tree-sitter AST.

    Each reference is a ``{"name": ..., "line": ...}`` dict; duplicates
    on the same (name, line) pair are emitted once, in pre-order.
    """
    references: list[dict[str, Any]] = []
    visited: set[tuple[str, int]] = set()
    # Explicit pre-order DFS; children are pushed reversed so the
    # left-most child is popped (and therefore visited) first.
    pending = [tree_root]

    while pending:
        current = pending.pop()
        if current.type in ("identifier", "name", "type_identifier"):
            name = source[current.start_byte:current.end_byte].decode(
                "utf-8", errors="replace"
            )
            line = current.start_point[0] + 1
            if (name, line) not in visited:
                visited.add((name, line))
                references.append({"name": name, "line": line})
        pending.extend(reversed(current.children))

    return references
239
+
240
+
241
+ # ---------------------------------------------------------------------------
242
+ # Single-file indexer
243
+ # ---------------------------------------------------------------------------
244
+
245
def index_file(filepath: str, db) -> dict:
    """Parse a single source file and index its symbols + references.

    Uses tree-sitter when a grammar is available for the file's language.
    Falls back to indexing the whole file as a single symbol otherwise.
    Skips the file if its ``last_modified`` timestamp has not changed.

    Args:
        filepath: Absolute path to a source file.
        db: An open ``sqlite3.Connection`` from ``db.get_db()``.

    Returns:
        A dict with ``file``, ``symbols_indexed``, ``references_indexed``,
        and ``skipped`` keys.
    """
    filepath = os.path.abspath(filepath)
    ext = os.path.splitext(filepath)[1].lower()

    # ── Check freshness ───────────────────────────────────────────────
    # Compare the on-disk mtime against the timestamp stored at the last
    # successful index of this path.
    mtime = os.path.getmtime(filepath)
    row = db.execute(
        "SELECT id, last_modified FROM files WHERE path = ?", (filepath,)
    ).fetchone()

    # Stored timestamp at least as new as the file → nothing to re-index.
    if row and row[1] >= mtime:
        return {"file": filepath, "symbols_indexed": 0,
                "references_indexed": 0, "skipped": True}

    # ── Read file ─────────────────────────────────────────────────────
    # Read raw bytes for tree-sitter; keep a replace-decoded text copy
    # for the whole-file fallback below.
    source_bytes = Path(filepath).read_bytes()
    source_text = source_bytes.decode("utf-8", errors="replace")

    # NOTE(review): db_mod helpers are defined in the sibling db module;
    # signatures are inferred from call sites here — confirm against db.py.
    fhash = db_mod.file_hash(filepath)
    file_id = db_mod.upsert_file(db, filepath, mtime, fhash)

    # Delete stale data before re-inserting
    db_mod.delete_file_data(db, file_id)

    symbols_indexed = 0
    references_indexed = 0

    # ── Try tree-sitter parsing ───────────────────────────────────────
    lang = _load_language(ext)

    if lang is not None:
        parser = Parser(lang)
        tree = parser.parse(source_bytes)

        # Extract and store symbols
        raw_symbols = _extract_symbols(tree.root_node, source_bytes)

        # Flatten: process top-level symbols and nested children
        def _store_symbols(sym_list: list[dict[str, Any]],
                           parent_id: int | None = None) -> None:
            # Persist each symbol, embed it, then recurse so nested
            # symbols are linked to their parent row id.
            nonlocal symbols_indexed
            for sym in sym_list:
                sym_id = db_mod.upsert_symbol(
                    db, sym["name"], sym["kind"], file_id,
                    sym["line_start"], sym["line_end"],
                    parent_id, sym["source_text"],
                )
                symbols_indexed += 1

                # Generate embedding
                # Truncate to 1000 chars so the embedding input stays small.
                embed_input = f"{sym['kind']} {sym['name']}: {sym['source_text'][:1000]}"
                vec = db_mod.embed_text(embed_input)
                db_mod.upsert_embedding(db, sym_id, vec)

                # Recurse into children
                if sym.get("children"):
                    _store_symbols(sym["children"], parent_id=sym_id)

        _store_symbols(raw_symbols)

        # Extract and store references
        refs = _extract_references(tree.root_node, source_bytes)
        for ref in refs:
            db_mod.upsert_reference(db, ref["name"], file_id, ref["line"])
            references_indexed += 1

    else:
        # ── Fallback: index entire file as one symbol ─────────────────
        # No grammar: store the file itself as a single "file" symbol so
        # it remains searchable. Note: references_indexed stays 0 here.
        basename = os.path.basename(filepath)
        sym_id = db_mod.upsert_symbol(
            db, basename, "file", file_id,
            1, source_text.count("\n") + 1,
            None, source_text[:5000],
        )
        symbols_indexed += 1

        embed_input = f"file {basename}: {source_text[:1000]}"
        vec = db_mod.embed_text(embed_input)
        db_mod.upsert_embedding(db, sym_id, vec)

    # Commit once per file so a crash mid-directory leaves prior files intact.
    db.commit()
    return {
        "file": filepath,
        "symbols_indexed": symbols_indexed,
        "references_indexed": references_indexed,
        "skipped": False,
    }
345
+
346
+
347
+ # ---------------------------------------------------------------------------
348
+ # Directory indexer
349
+ # ---------------------------------------------------------------------------
350
+
351
def index_directory(dirpath: str, db) -> list[dict]:
    """Recursively index all source files under *dirpath*.

    Skips directories in ``_SKIP_DIRS`` (plus ``*.egg-info``) and relies
    on :func:`index_file` to skip unchanged files. A file is indexed when
    its extension is in ``_SOURCE_EXTENSIONS`` or a tree-sitter grammar
    is available for it.

    Args:
        dirpath: Root directory to scan.
        db: An open ``sqlite3.Connection`` from ``db.get_db()``.

    Returns:
        A list of per-file result dicts (see :func:`index_file`).
    """
    root_dir = os.path.abspath(dirpath)
    outcomes: list[dict] = []

    for current_dir, subdirs, filenames in os.walk(root_dir, topdown=True):
        # Mutating subdirs in place tells os.walk not to descend into them.
        subdirs[:] = [
            d for d in subdirs
            if d not in _SKIP_DIRS and not d.endswith(".egg-info")
        ]

        for filename in sorted(filenames):
            extension = os.path.splitext(filename)[1].lower()
            # Accept known source extensions, or anything a grammar exists
            # for (the grammar check only runs when the set lookup misses).
            if extension not in _SOURCE_EXTENSIONS and _load_language(extension) is None:
                continue

            full_path = os.path.join(current_dir, filename)
            try:
                outcomes.append(index_file(full_path, db))
            except Exception:
                # One bad file must not abort the whole scan; record it.
                logger.exception("Failed to index %s", full_path)
                outcomes.append({
                    "file": full_path,
                    "symbols_indexed": 0,
                    "references_indexed": 0,
                    "skipped": True,
                    "error": True,
                })
    return outcomes
@@ -0,0 +1,62 @@
1
+ <system_prompt>
2
+ <role_and_objective>
3
+ You are an expert Python developer and systems architect. Your objective is to build the foundation for a Model Context Protocol (MCP) server named `code-memory`. This server acts as a deterministic, high-precision code intelligence layer.
4
+
5
+ You must generate the complete scaffolding for this Python MCP server using the official `mcp` SDK and the `FastMCP` wrapper.
6
+ </role_and_objective>
7
+
8
+ <architectural_strategy>
9
+ To prevent context window bloat and reduce cognitive load on the LLM client, this server uses a strict "Progressive Disclosure" three-pathway routing architecture:
10
+
11
+ 1. **"Who/Why?"** → `search_history` (Temporal/Git data)
12
+ 2. **"Where/What?"** → `search_code` (Deterministic AST data)
13
+ 3. **"How?"** → `search_docs` (Semantic/Fuzzy logic)
14
+ </architectural_strategy>
15
+
16
+ <instructions>
17
+ Please complete the following steps in order. Before writing any code, use a <thinking> block to plan your implementation for each step.
18
+
19
+ <step_1_project_setup>
20
+ Provide the exact CLI commands to initialize the project using `uv` (e.g., `uv init code-memory`) and install the required dependencies (e.g., `uv add "mcp[cli]"`).
21
+ </step_1_project_setup>
22
+
23
+ <step_2_server_initialization>
24
+ Create the main entry point (e.g., `server.py`) that initializes the FastMCP Server (`mcp = FastMCP("code-memory")`).
25
+ </step_2_server_initialization>
26
+
27
+ <step_3_tool_registration>
28
+ Implement the three tools below using the `@mcp.tool()` decorator.
29
+ CRITICAL RULE: You must use strict Python type hints (e.g., `Literal` for enums) and rich docstrings. FastMCP uses these docstrings and type hints to generate the tool descriptions and schemas for the LLM.
30
+
31
+ - **Tool 1: `search_code`**
32
+ - **Docstring**: "Use this tool to find exact structural code definitions, locate where functions/classes are defined, or map out dependency references (call graphs). Do NOT use this for conceptual questions."
33
+ - **Parameters**:
34
+ - `query` (str): The exact symbol, function name, class name, or file path to look up.
35
+ - `search_type` (Literal["definition", "references", "file_structure"]): The type of structural search to perform.
36
+
37
+ - **Tool 2: `search_docs`**
38
+ - **Docstring**: "Use this tool to understand the codebase conceptually. Ideal for 'how does X work?', 'explain the architecture', or finding standard operating procedures in the documentation."
39
+ - **Parameters**:
40
+ - `query` (str): A natural language question about the codebase architecture, logic, or workflow.
41
+
42
+ - **Tool 3: `search_history`**
43
+ - **Docstring**: "Use this tool to debug regressions, understand developer intent, or find out WHY a specific change was made by searching Git history and commit messages."
44
+ - **Parameters**:
45
+ - `query` (str): A search term, author name, or specific commit hash.
46
+ - `target_file` (str, optional): Restrict the history search to a specific file path. Default to `None`.
47
+ </step_3_tool_registration>
48
+
49
+ <step_4_handler_mocking>
50
+ Inside each of the three decorated tool functions, return a mock string or dictionary response for now (e.g., `{"status": "mocked", "tool": "search_code"}`).
51
+
52
+ CRITICAL RULE: Do NOT implement the actual SQLite, Git, or vector logic yet. The goal is strictly to get the MCP protocol wiring fully functional and ready to run via `uv run mcp run server.py`.
53
+ </step_4_handler_mocking>
54
+
55
+ </instructions>
56
+
57
+ <output_formatting>
58
+ - Wrap your internal planning process inside `<thinking>` tags.
59
+ - Output the CLI commands in a `bash` markdown code block.
60
+ - Output the Python code in a single `python` markdown code block (for `server.py`).
61
+ </output_formatting>
62
+ </system_prompt>