code-memory 1.0.6__tar.gz → 1.0.9__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. {code_memory-1.0.6 → code_memory-1.0.9}/PKG-INFO +39 -1
  2. {code_memory-1.0.6 → code_memory-1.0.9}/README.md +37 -0
  3. {code_memory-1.0.6 → code_memory-1.0.9}/db.py +104 -11
  4. {code_memory-1.0.6 → code_memory-1.0.9}/parser.py +64 -6
  5. {code_memory-1.0.6 → code_memory-1.0.9}/pyproject.toml +2 -1
  6. {code_memory-1.0.6 → code_memory-1.0.9}/uv.lock +3 -1
  7. {code_memory-1.0.6 → code_memory-1.0.9}/.github/workflows/ci.yml +0 -0
  8. {code_memory-1.0.6 → code_memory-1.0.9}/.github/workflows/publish.yml +0 -0
  9. {code_memory-1.0.6 → code_memory-1.0.9}/.gitignore +0 -0
  10. {code_memory-1.0.6 → code_memory-1.0.9}/.python-version +0 -0
  11. {code_memory-1.0.6 → code_memory-1.0.9}/CHANGELOG.md +0 -0
  12. {code_memory-1.0.6 → code_memory-1.0.9}/CONTRIBUTING.md +0 -0
  13. {code_memory-1.0.6 → code_memory-1.0.9}/LICENSE +0 -0
  14. {code_memory-1.0.6 → code_memory-1.0.9}/Makefile +0 -0
  15. {code_memory-1.0.6 → code_memory-1.0.9}/doc_parser.py +0 -0
  16. {code_memory-1.0.6 → code_memory-1.0.9}/errors.py +0 -0
  17. {code_memory-1.0.6 → code_memory-1.0.9}/git_search.py +0 -0
  18. {code_memory-1.0.6 → code_memory-1.0.9}/logging_config.py +0 -0
  19. {code_memory-1.0.6 → code_memory-1.0.9}/prompts/milestone_1.xml +0 -0
  20. {code_memory-1.0.6 → code_memory-1.0.9}/prompts/milestone_2.xml +0 -0
  21. {code_memory-1.0.6 → code_memory-1.0.9}/prompts/milestone_3.xml +0 -0
  22. {code_memory-1.0.6 → code_memory-1.0.9}/prompts/milestone_4.xml +0 -0
  23. {code_memory-1.0.6 → code_memory-1.0.9}/prompts/milestone_5.xml +0 -0
  24. {code_memory-1.0.6 → code_memory-1.0.9}/prompts/milestone_6.xml +0 -0
  25. {code_memory-1.0.6 → code_memory-1.0.9}/queries.py +0 -0
  26. {code_memory-1.0.6 → code_memory-1.0.9}/server.py +0 -0
  27. {code_memory-1.0.6 → code_memory-1.0.9}/tests/__init__.py +0 -0
  28. {code_memory-1.0.6 → code_memory-1.0.9}/tests/conftest.py +0 -0
  29. {code_memory-1.0.6 → code_memory-1.0.9}/tests/test_errors.py +0 -0
  30. {code_memory-1.0.6 → code_memory-1.0.9}/tests/test_logging.py +0 -0
  31. {code_memory-1.0.6 → code_memory-1.0.9}/tests/test_tools.py +0 -0
  32. {code_memory-1.0.6 → code_memory-1.0.9}/tests/test_validation.py +0 -0
  33. {code_memory-1.0.6 → code_memory-1.0.9}/validation.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: code-memory
3
- Version: 1.0.6
3
+ Version: 1.0.9
4
4
  Summary: A deterministic, high-precision code intelligence MCP server
5
5
  Project-URL: Homepage, https://github.com/kapillamba4/code-memory
6
6
  Project-URL: Documentation, https://github.com/kapillamba4/code-memory#readme
@@ -19,6 +19,7 @@ Requires-Python: >=3.13
19
19
  Requires-Dist: gitpython>=3.1.46
20
20
  Requires-Dist: markdown-it-py>=4.0.0
21
21
  Requires-Dist: mcp[cli]>=1.26.0
22
+ Requires-Dist: pathspec>=0.12.1
22
23
  Requires-Dist: sentence-transformers>=5.2.3
23
24
  Requires-Dist: sqlite-vec>=0.1.6
24
25
  Requires-Dist: tree-sitter-c>=0.24.1
@@ -47,6 +48,43 @@ A deterministic, high-precision **code intelligence layer** exposed as a [Model
47
48
 
48
49
  `code-memory` gives your AI coding assistant structured access to your codebase through three focused pathways — eliminating context-window bloat and vague "search everything" queries.
49
50
 
51
+ ## Supported Languages
52
+
53
+ ### Full AST Support (Tree-sitter)
54
+
55
+ These languages have structural parsing with symbol extraction (functions, classes, methods, etc.):
56
+
57
+ | Language | Extensions |
58
+ |----------|------------|
59
+ | Python | `.py` |
60
+ | JavaScript | `.js`, `.jsx` |
61
+ | TypeScript | `.ts`, `.tsx` |
62
+ | Java | `.java` |
63
+ | Go | `.go` |
64
+ | Rust | `.rs` |
65
+ | C | `.c`, `.h` |
66
+ | C++ | `.cpp`, `.hpp`, `.cc`, `.cxx` |
67
+ | Ruby | `.rb` |
68
+ | Kotlin | `.kt`, `.kts` |
69
+
70
+ ### Fallback Support (Whole-file Indexing)
71
+
72
+ These file types are indexed as complete units for BM25 and semantic search:
73
+
74
+ | Category | Extensions |
75
+ |----------|------------|
76
+ | C# | `.cs` |
77
+ | Swift | `.swift` |
78
+ | Scala | `.scala` |
79
+ | Lua | `.lua` |
80
+ | Shell | `.sh`, `.bash`, `.zsh` |
81
+ | Config | `.yaml`, `.yml`, `.toml`, `.json` |
82
+ | Web | `.html`, `.css`, `.scss` |
83
+ | Database | `.sql` |
84
+ | Docs | `.md`, `.txt` |
85
+
86
+ > **Note:** Files and directories matching patterns in your `.gitignore` are automatically skipped during indexing. This excludes build artifacts, dependencies, and other generated files.
87
+
50
88
  ## Architecture: Progressive Disclosure
51
89
 
52
90
  Instead of a single monolithic search, `code-memory` routes queries through **three purpose-built tools**:
@@ -4,6 +4,43 @@ A deterministic, high-precision **code intelligence layer** exposed as a [Model
4
4
 
5
5
  `code-memory` gives your AI coding assistant structured access to your codebase through three focused pathways — eliminating context-window bloat and vague "search everything" queries.
6
6
 
7
+ ## Supported Languages
8
+
9
+ ### Full AST Support (Tree-sitter)
10
+
11
+ These languages have structural parsing with symbol extraction (functions, classes, methods, etc.):
12
+
13
+ | Language | Extensions |
14
+ |----------|------------|
15
+ | Python | `.py` |
16
+ | JavaScript | `.js`, `.jsx` |
17
+ | TypeScript | `.ts`, `.tsx` |
18
+ | Java | `.java` |
19
+ | Go | `.go` |
20
+ | Rust | `.rs` |
21
+ | C | `.c`, `.h` |
22
+ | C++ | `.cpp`, `.hpp`, `.cc`, `.cxx` |
23
+ | Ruby | `.rb` |
24
+ | Kotlin | `.kt`, `.kts` |
25
+
26
+ ### Fallback Support (Whole-file Indexing)
27
+
28
+ These file types are indexed as complete units for BM25 and semantic search:
29
+
30
+ | Category | Extensions |
31
+ |----------|------------|
32
+ | C# | `.cs` |
33
+ | Swift | `.swift` |
34
+ | Scala | `.scala` |
35
+ | Lua | `.lua` |
36
+ | Shell | `.sh`, `.bash`, `.zsh` |
37
+ | Config | `.yaml`, `.yml`, `.toml`, `.json` |
38
+ | Web | `.html`, `.css`, `.scss` |
39
+ | Database | `.sql` |
40
+ | Docs | `.md`, `.txt` |
41
+
42
+ > **Note:** Files and directories matching patterns in your `.gitignore` are automatically skipped during indexing. This excludes build artifacts, dependencies, and other generated files.
43
+
7
44
  ## Architecture: Progressive Disclosure
8
45
 
9
46
  Instead of a single monolithic search, `code-memory` routes queries through **three purpose-built tools**:
@@ -29,26 +29,42 @@ logger = logging.getLogger(__name__)
29
29
  # ---------------------------------------------------------------------------
30
30
 
31
31
  _model = None
32
- EMBEDDING_DIM = 1024 # jina-code-embeddings-0.5b (Matryoshka truncated)
32
+ _embedding_dim = None
33
+
34
+ # Model identifier - change this if you switch to a different embedding model
35
+ EMBEDDING_MODEL_NAME = "jinaai/jina-code-embeddings-0.5b"
33
36
 
34
37
 
35
38
  def get_embedding_model():
36
39
  """Lazy-load and cache the sentence-transformers model."""
37
- global _model
40
+ global _model, _embedding_dim
38
41
  if _model is None:
39
42
  from sentence_transformers import SentenceTransformer
40
43
 
41
44
  _model = SentenceTransformer(
42
- "jinaai/jina-code-embeddings-0.5b", trust_remote_code=True
45
+ EMBEDDING_MODEL_NAME, trust_remote_code=True
43
46
  )
47
+ # Cache the embedding dimension from the model
48
+ _embedding_dim = _model.get_sentence_embedding_dimension()
49
+ logger.info(f"Loaded embedding model with dimension: {_embedding_dim}")
44
50
  return _model
45
51
 
46
52
 
53
+ def get_embedding_dim() -> int:
54
+ """Get the embedding dimension from the model.
55
+
56
+ Loads the model if not already loaded.
57
+ Returns the native embedding dimension of the model.
58
+ """
59
+ if _embedding_dim is None:
60
+ get_embedding_model()
61
+ return _embedding_dim
62
+
63
+
47
64
  def embed_text(text: str, task_type: str = "nl2code") -> list[float]:
48
65
  """Generate a dense vector embedding for *text*.
49
66
 
50
67
  Uses jina-code-embeddings with task prefix for better code retrieval.
51
- Matryoshka embedding truncated to 1024 dims for efficiency.
52
68
 
53
69
  Args:
54
70
  text: The text to embed.
@@ -57,7 +73,7 @@ def embed_text(text: str, task_type: str = "nl2code") -> list[float]:
57
73
  model = get_embedding_model()
58
74
  prefixed_text = f"{task_type}: {text}"
59
75
  vec = model.encode(prefixed_text, normalize_embeddings=True, show_progress_bar=False)
60
- return vec.tolist()[:EMBEDDING_DIM]
76
+ return vec.tolist()
61
77
 
62
78
 
63
79
  def embed_texts_batch(
@@ -93,7 +109,7 @@ def embed_texts_batch(
93
109
  convert_to_numpy=True,
94
110
  )
95
111
 
96
- return [v.tolist()[:EMBEDDING_DIM] for v in vectors]
112
+ return [v.tolist() for v in vectors]
97
113
 
98
114
 
99
115
  def warmup_embedding_model() -> None:
@@ -141,6 +157,12 @@ def transaction(db: sqlite3.Connection):
141
157
  # ---------------------------------------------------------------------------
142
158
 
143
159
  _SCHEMA_SQL = """
160
+ -- 0. Metadata table for tracking index version and model info
161
+ CREATE TABLE IF NOT EXISTS index_metadata (
162
+ key TEXT PRIMARY KEY,
163
+ value TEXT NOT NULL
164
+ );
165
+
144
166
  -- 1. Tracked source files
145
167
  CREATE TABLE IF NOT EXISTS files (
146
168
  id INTEGER PRIMARY KEY,
@@ -256,6 +278,9 @@ def get_db(project_dir: str) -> sqlite3.Connection:
256
278
  The database is stored as {project_dir}/code_memory.db to ensure each
257
279
  project has its own isolated index.
258
280
 
281
+ If the embedding model has changed since the last index, all indexed data
282
+ is automatically invalidated and the index will need to be rebuilt.
283
+
259
284
  Args:
260
285
  project_dir: The project directory where code_memory.db will be stored.
261
286
 
@@ -274,13 +299,83 @@ def get_db(project_dir: str) -> sqlite3.Connection:
274
299
 
275
300
  db.executescript(_SCHEMA_SQL)
276
301
 
277
- # sqlite-vec virtual table for code embeddings (must be created outside executescript)
302
+ # Get embedding dimension from the model (loads model if needed)
303
+ embedding_dim = get_embedding_dim()
304
+
305
+ # Check if the embedding model has changed
306
+ stored_model = db.execute(
307
+ "SELECT value FROM index_metadata WHERE key = 'embedding_model'"
308
+ ).fetchone()
309
+ stored_dim = db.execute(
310
+ "SELECT value FROM index_metadata WHERE key = 'embedding_dim'"
311
+ ).fetchone()
312
+
313
+ model_changed = (
314
+ stored_model is None
315
+ or stored_model[0] != EMBEDDING_MODEL_NAME
316
+ or stored_dim is None
317
+ or int(stored_dim[0]) != embedding_dim
318
+ )
319
+
320
+ if model_changed:
321
+ if stored_model is not None:
322
+ # Model changed - invalidate existing index
323
+ logger.info(
324
+ f"Embedding model changed from '{stored_model[0] if stored_model else 'none'}' "
325
+ f"to '{EMBEDDING_MODEL_NAME}'. Invalidating index..."
326
+ )
327
+ _invalidate_index(db, embedding_dim)
328
+ else:
329
+ # New database - just create the embedding tables
330
+ _create_embedding_tables(db, embedding_dim)
331
+
332
+ # Store the current model info
333
+ db.execute(
334
+ "INSERT OR REPLACE INTO index_metadata (key, value) VALUES ('embedding_model', ?)",
335
+ (EMBEDDING_MODEL_NAME,)
336
+ )
337
+ db.execute(
338
+ "INSERT OR REPLACE INTO index_metadata (key, value) VALUES ('embedding_dim', ?)",
339
+ (str(embedding_dim),)
340
+ )
341
+ db.commit()
342
+
343
+ return db
344
+
345
+
346
+ def _invalidate_index(db: sqlite3.Connection, embedding_dim: int) -> None:
347
+ """Invalidate the index by clearing all data and recreating embedding tables.
348
+
349
+ This is called when the embedding model changes.
350
+ """
351
+ # Drop existing embedding virtual tables
352
+ db.execute("DROP TABLE IF EXISTS symbol_embeddings")
353
+ db.execute("DROP TABLE IF EXISTS doc_embeddings")
354
+
355
+ # Clear all indexed data (cascades will handle related data via foreign keys,
356
+ # but we need to be explicit since FK enforcement may vary)
357
+ db.execute("DELETE FROM symbol_embeddings")
358
+ db.execute("DELETE FROM doc_embeddings")
359
+ db.execute("DELETE FROM symbols")
360
+ db.execute("DELETE FROM files")
361
+ db.execute("DELETE FROM references_")
362
+ db.execute("DELETE FROM doc_chunks")
363
+ db.execute("DELETE FROM doc_files")
364
+
365
+ # Recreate embedding tables with new dimension
366
+ _create_embedding_tables(db, embedding_dim)
367
+ logger.info("Index invalidated and embedding tables recreated")
368
+
369
+
370
+ def _create_embedding_tables(db: sqlite3.Connection, embedding_dim: int) -> None:
371
+ """Create the embedding virtual tables with the specified dimension."""
372
+ # sqlite-vec virtual table for code embeddings
278
373
  db.execute(
279
374
  f"""
280
375
  CREATE VIRTUAL TABLE IF NOT EXISTS symbol_embeddings
281
376
  USING vec0(
282
377
  symbol_id INTEGER PRIMARY KEY,
283
- embedding float[{EMBEDDING_DIM}]
378
+ embedding float[{embedding_dim}]
284
379
  )
285
380
  """
286
381
  )
@@ -291,12 +386,10 @@ def get_db(project_dir: str) -> sqlite3.Connection:
291
386
  CREATE VIRTUAL TABLE IF NOT EXISTS doc_embeddings
292
387
  USING vec0(
293
388
  chunk_id INTEGER PRIMARY KEY,
294
- embedding float[{EMBEDDING_DIM}]
389
+ embedding float[{embedding_dim}]
295
390
  )
296
391
  """
297
392
  )
298
- db.commit()
299
- return db
300
393
 
301
394
 
302
395
  # ---------------------------------------------------------------------------
@@ -14,19 +14,56 @@ import os
14
14
  from pathlib import Path
15
15
  from typing import Any
16
16
 
17
+ import pathspec
17
18
  from tree_sitter import Language, Node, Parser
18
19
 
19
20
  import db as db_mod
20
21
 
21
22
  logger = logging.getLogger(__name__)
22
23
 
23
- # ── Directories to skip ───────────────────────────────────────────────
24
+ # ── Directories to always skip (even without .gitignore) ───────────────
24
25
  _SKIP_DIRS = frozenset({
25
26
  ".venv", "venv", "__pycache__", ".git", "node_modules",
26
27
  ".mypy_cache", ".pytest_cache", ".ruff_cache", ".tox",
27
28
  "dist", "build", "target", "bin", "obj",
28
29
  })
29
30
 
31
+
32
+ def _load_gitignore_spec(root_dir: str) -> pathspec.PathSpec | None:
33
+ """Load .gitignore patterns from the given directory.
34
+
35
+ Returns a PathSpec object if .gitignore exists, None otherwise.
36
+ """
37
+ gitignore_path = os.path.join(root_dir, ".gitignore")
38
+ if not os.path.isfile(gitignore_path):
39
+ return None
40
+
41
+ try:
42
+ with open(gitignore_path, encoding="utf-8") as f:
43
+ lines = f.readlines()
44
+ return pathspec.PathSpec.from_lines("gitwildmatch", lines)
45
+ except (OSError, UnicodeDecodeError) as e:
46
+ logger.debug("Failed to read .gitignore: %s", e)
47
+ return None
48
+
49
+
50
+ def _should_skip_path(
51
+ rel_path: str,
52
+ is_dir: bool,
53
+ gitignore_spec: pathspec.PathSpec | None,
54
+ ) -> bool:
55
+ """Check if a path should be skipped based on .gitignore patterns."""
56
+ if gitignore_spec is None:
57
+ return False
58
+
59
+ # Check both the path as-is and with trailing slash for directories
60
+ if gitignore_spec.match_file(rel_path):
61
+ return True
62
+ if is_dir and gitignore_spec.match_file(rel_path + "/"):
63
+ return True
64
+
65
+ return False
66
+
30
67
  # ── File extensions we consider "source code" ─────────────────────────
31
68
  _SOURCE_EXTENSIONS = frozenset({
32
69
  ".py", ".js", ".jsx", ".ts", ".tsx", ".java",
@@ -360,8 +397,8 @@ def index_file(filepath: str, db) -> dict:
360
397
  def index_directory(dirpath: str, db) -> list[dict]:
361
398
  """Recursively index all source files under *dirpath*.
362
399
 
363
- Skips directories in ``_SKIP_DIRS`` and unchanged files. Indexes any
364
- file with a recognised source-code extension.
400
+ Skips directories in ``_SKIP_DIRS``, files matching ``.gitignore`` patterns,
401
+ and unchanged files. Indexes any file with a recognised source-code extension.
365
402
 
366
403
  Args:
367
404
  dirpath: Root directory to scan.
@@ -376,12 +413,33 @@ def index_directory(dirpath: str, db) -> list[dict]:
376
413
  dirpath = os.path.abspath(dirpath)
377
414
  total_start = time.perf_counter()
378
415
 
416
+ # Load .gitignore patterns from the root directory
417
+ gitignore_spec = _load_gitignore_spec(dirpath)
418
+ if gitignore_spec:
419
+ logger.debug("Loaded .gitignore patterns from %s", dirpath)
420
+
379
421
  for root, dirs, files in os.walk(dirpath, topdown=True):
380
- # Prune skipped directories in-place
381
- dirs[:] = [d for d in dirs if d not in _SKIP_DIRS
382
- and not d.endswith(".egg-info")]
422
+ rel_root = os.path.relpath(root, dirpath)
423
+
424
+ # Prune skipped directories in-place (always-skip + gitignore)
425
+ def _should_keep_dir(d: str) -> bool:
426
+ if d in _SKIP_DIRS or d.endswith(".egg-info"):
427
+ return False
428
+ if gitignore_spec:
429
+ rel_path = os.path.join(rel_root, d) if rel_root != "." else d
430
+ if _should_skip_path(rel_path, is_dir=True, gitignore_spec=gitignore_spec):
431
+ return False
432
+ return True
433
+
434
+ dirs[:] = [d for d in dirs if _should_keep_dir(d)]
383
435
 
384
436
  for fname in sorted(files):
437
+ # Skip files matching .gitignore patterns
438
+ if gitignore_spec:
439
+ rel_path = os.path.join(rel_root, fname) if rel_root != "." else fname
440
+ if _should_skip_path(rel_path, is_dir=False, gitignore_spec=gitignore_spec):
441
+ continue
442
+
385
443
  ext = os.path.splitext(fname)[1].lower()
386
444
  # Accept files with known extensions, or files with a
387
445
  # tree-sitter grammar available
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "code-memory"
7
- version = "1.0.6"
7
+ version = "1.0.9"
8
8
  description = "A deterministic, high-precision code intelligence MCP server"
9
9
  readme = "README.md"
10
10
  license = "MIT"
@@ -22,6 +22,7 @@ classifiers = [
22
22
  ]
23
23
  dependencies = [
24
24
  "gitpython>=3.1.46",
25
+ "pathspec>=0.12.1",
25
26
  "markdown-it-py>=4.0.0",
26
27
  "mcp[cli]>=1.26.0",
27
28
  "sentence-transformers>=5.2.3",
@@ -109,12 +109,13 @@ wheels = [
109
109
 
110
110
  [[package]]
111
111
  name = "code-memory"
112
- version = "1.0.4"
112
+ version = "1.0.6"
113
113
  source = { editable = "." }
114
114
  dependencies = [
115
115
  { name = "gitpython" },
116
116
  { name = "markdown-it-py" },
117
117
  { name = "mcp", extra = ["cli"] },
118
+ { name = "pathspec" },
118
119
  { name = "sentence-transformers" },
119
120
  { name = "sqlite-vec" },
120
121
  { name = "tree-sitter" },
@@ -146,6 +147,7 @@ requires-dist = [
146
147
  { name = "markdown-it-py", specifier = ">=4.0.0" },
147
148
  { name = "mcp", extras = ["cli"], specifier = ">=1.26.0" },
148
149
  { name = "mypy", marker = "extra == 'dev'", specifier = ">=1.13.0" },
150
+ { name = "pathspec", specifier = ">=0.12.1" },
149
151
  { name = "pytest", marker = "extra == 'dev'", specifier = ">=8.0.0" },
150
152
  { name = "pytest-asyncio", marker = "extra == 'dev'", specifier = ">=0.24.0" },
151
153
  { name = "pytest-cov", marker = "extra == 'dev'", specifier = ">=5.0.0" },
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes