cocoindex-code 0.1.12__tar.gz → 0.1.14__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: cocoindex-code
3
- Version: 0.1.12
3
+ Version: 0.1.14
4
4
  Summary: MCP server for indexing and querying codebases using CocoIndex
5
5
  Project-URL: Homepage, https://github.com/cocoindex-io/cocoindex-code
6
6
  Project-URL: Repository, https://github.com/cocoindex-io/cocoindex-code
@@ -17,7 +17,7 @@ Classifier: Programming Language :: Python :: 3.12
17
17
  Classifier: Programming Language :: Python :: 3.13
18
18
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
19
19
  Requires-Python: >=3.11
20
- Requires-Dist: cocoindex[litellm]==1.0.0a26
20
+ Requires-Dist: cocoindex[litellm]==1.0.0a29
21
21
  Requires-Dist: einops>=0.8.2
22
22
  Requires-Dist: mcp>=1.0.0
23
23
  Requires-Dist: numpy>=1.24.0
@@ -165,6 +165,7 @@ Use the cocoindex-code MCP server for semantic code search when:
165
165
  | `COCOINDEX_CODE_EMBEDDING_MODEL` | Embedding model (see below) | `sbert/sentence-transformers/all-MiniLM-L6-v2` |
166
166
  | `COCOINDEX_CODE_BATCH_SIZE` | Max batch size for local embedding model | `16` |
167
167
  | `COCOINDEX_CODE_EXTRA_EXTENSIONS` | Additional file extensions to index (comma-separated, e.g. `"inc:php,yaml,toml"` — use `ext:lang` to override language detection) | _(none)_ |
168
+ | `COCOINDEX_CODE_EXCLUDED_PATTERNS` | Additional glob patterns to exclude from indexing as a JSON array (e.g. `'["**/migration.sql", "{**/*.md,**/*.txt}"]'`) | _(none)_ |
168
169
 
169
170
 
170
171
  ### Root Path Discovery
@@ -297,9 +298,20 @@ claude mcp add cocoindex-code \
297
298
 
298
299
  Any model supported by LiteLLM works — see the [full list of embedding providers](https://docs.litellm.ai/docs/embedding/supported_embedding).
299
300
 
300
- ### GPU-optimised local model
301
+ ### Local SentenceTransformers models
301
302
 
302
- If you have a GPU, [`nomic-ai/CodeRankEmbed`](https://huggingface.co/nomic-ai/CodeRankEmbed) delivers significantly better code retrieval than the default model. It is 137M parameters, requires ~1 GB VRAM, and has an 8192-token context window.
303
+ Use the `sbert/` prefix to load any [SentenceTransformers](https://www.sbert.net/) model locally (no API key required).
304
+
305
+ **Example — general purpose text model:**
306
+ ```bash
307
+ claude mcp add cocoindex-code \
308
+ -e COCOINDEX_CODE_EMBEDDING_MODEL=sbert/nomic-ai/nomic-embed-text-v1 \
309
+ -- cocoindex-code
310
+ ```
311
+
312
+ **GPU-optimised code retrieval:**
313
+
314
+ [`nomic-ai/CodeRankEmbed`](https://huggingface.co/nomic-ai/CodeRankEmbed) delivers significantly better code retrieval than the default model. It is 137M parameters, requires ~1 GB VRAM, and has an 8192-token context window.
303
315
 
304
316
  ```bash
305
317
  claude mcp add cocoindex-code \
@@ -355,6 +367,7 @@ Returns matching code chunks with:
355
367
  | javascript | js | `.js` |
356
368
  | json | | `.json` |
357
369
  | kotlin | | `.kt`, `.kts` |
370
+ | lua | | `.lua` |
358
371
  | markdown | md | `.md`, `.mdx` |
359
372
  | pascal | pas, dpr, delphi | `.pas`, `.dpr` |
360
373
  | php | | `.php` |
@@ -130,6 +130,7 @@ Use the cocoindex-code MCP server for semantic code search when:
130
130
  | `COCOINDEX_CODE_EMBEDDING_MODEL` | Embedding model (see below) | `sbert/sentence-transformers/all-MiniLM-L6-v2` |
131
131
  | `COCOINDEX_CODE_BATCH_SIZE` | Max batch size for local embedding model | `16` |
132
132
  | `COCOINDEX_CODE_EXTRA_EXTENSIONS` | Additional file extensions to index (comma-separated, e.g. `"inc:php,yaml,toml"` — use `ext:lang` to override language detection) | _(none)_ |
133
+ | `COCOINDEX_CODE_EXCLUDED_PATTERNS` | Additional glob patterns to exclude from indexing as a JSON array (e.g. `'["**/migration.sql", "{**/*.md,**/*.txt}"]'`) | _(none)_ |
133
134
 
134
135
 
135
136
  ### Root Path Discovery
@@ -262,9 +263,20 @@ claude mcp add cocoindex-code \
262
263
 
263
264
  Any model supported by LiteLLM works — see the [full list of embedding providers](https://docs.litellm.ai/docs/embedding/supported_embedding).
264
265
 
265
- ### GPU-optimised local model
266
+ ### Local SentenceTransformers models
266
267
 
267
- If you have a GPU, [`nomic-ai/CodeRankEmbed`](https://huggingface.co/nomic-ai/CodeRankEmbed) delivers significantly better code retrieval than the default model. It is 137M parameters, requires ~1 GB VRAM, and has an 8192-token context window.
268
+ Use the `sbert/` prefix to load any [SentenceTransformers](https://www.sbert.net/) model locally (no API key required).
269
+
270
+ **Example — general purpose text model:**
271
+ ```bash
272
+ claude mcp add cocoindex-code \
273
+ -e COCOINDEX_CODE_EMBEDDING_MODEL=sbert/nomic-ai/nomic-embed-text-v1 \
274
+ -- cocoindex-code
275
+ ```
276
+
277
+ **GPU-optimised code retrieval:**
278
+
279
+ [`nomic-ai/CodeRankEmbed`](https://huggingface.co/nomic-ai/CodeRankEmbed) delivers significantly better code retrieval than the default model. It is 137M parameters, requires ~1 GB VRAM, and has an 8192-token context window.
268
280
 
269
281
  ```bash
270
282
  claude mcp add cocoindex-code \
@@ -320,6 +332,7 @@ Returns matching code chunks with:
320
332
  | javascript | js | `.js` |
321
333
  | json | | `.json` |
322
334
  | kotlin | | `.kt`, `.kts` |
335
+ | lua | | `.lua` |
323
336
  | markdown | md | `.md`, `.mdx` |
324
337
  | pascal | pas, dpr, delphi | `.pas`, `.dpr` |
325
338
  | php | | `.php` |
@@ -23,7 +23,7 @@ classifiers = [
23
23
 
24
24
  dependencies = [
25
25
  "mcp>=1.0.0",
26
- "cocoindex[litellm]==1.0.0a26",
26
+ "cocoindex[litellm]==1.0.0a29",
27
27
  "sentence-transformers>=2.2.0",
28
28
  "sqlite-vec>=0.1.0",
29
29
  "pydantic>=2.0.0",
@@ -2,6 +2,7 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
+ import json
5
6
  import os
6
7
  from dataclasses import dataclass
7
8
  from pathlib import Path
@@ -42,6 +43,31 @@ def _discover_codebase_root() -> Path:
42
43
  return root if root is not None else cwd
43
44
 
44
45
 
46
+ def _parse_json_string_list_env(var_name: str) -> list[str]:
47
+ """Parse an environment variable as a JSON array of strings."""
48
+ raw_value = os.environ.get(var_name, "")
49
+ if not raw_value.strip():
50
+ return []
51
+
52
+ try:
53
+ parsed = json.loads(raw_value)
54
+ except json.JSONDecodeError as exc:
55
+ raise ValueError(f"{var_name} must be a JSON array of strings, got invalid JSON") from exc
56
+
57
+ if not isinstance(parsed, list):
58
+ raise ValueError(f"{var_name} must be a JSON array of strings")
59
+
60
+ result: list[str] = []
61
+ for item in parsed:
62
+ if not isinstance(item, str):
63
+ raise ValueError(f"{var_name} must be a JSON array of strings")
64
+ item = item.strip()
65
+ if item:
66
+ result.append(item)
67
+
68
+ return result
69
+
70
+
45
71
  @dataclass
46
72
  class Config:
47
73
  """Configuration loaded from environment variables."""
@@ -50,8 +76,8 @@ class Config:
50
76
  embedding_model: str
51
77
  index_dir: Path
52
78
  device: str | None
53
- trust_remote_code: bool
54
79
  extra_extensions: dict[str, str | None]
80
+ excluded_patterns: list[str]
55
81
 
56
82
  @classmethod
57
83
  def from_env(cls) -> Config:
@@ -76,16 +102,6 @@ class Config:
76
102
  # Device: auto-detect CUDA or use env override
77
103
  device = os.environ.get("COCOINDEX_CODE_DEVICE")
78
104
 
79
- # trust_remote_code: opt-in via env var only.
80
- # sentence-transformers 5.x+ supports Jina models natively, so
81
- # auto-enabling this for jinaai/ models causes failures with
82
- # transformers 5.x (removed find_pruneable_heads_and_indices).
83
- trust_remote_code = os.environ.get("COCOINDEX_CODE_TRUST_REMOTE_CODE", "").lower() in (
84
- "1",
85
- "true",
86
- "yes",
87
- )
88
-
89
105
  # Extra file extensions (format: "inc:php,yaml,toml" — optional lang after colon)
90
106
  raw_extra = os.environ.get("COCOINDEX_CODE_EXTRA_EXTENSIONS", "")
91
107
  extra_extensions: dict[str, str | None] = {}
@@ -99,13 +115,16 @@ class Config:
99
115
  else:
100
116
  extra_extensions[f".{token}"] = None
101
117
 
118
+ # Excluded file glob patterns
119
+ excluded_patterns = _parse_json_string_list_env("COCOINDEX_CODE_EXCLUDED_PATTERNS")
120
+
102
121
  return cls(
103
122
  codebase_root_path=root,
104
123
  embedding_model=embedding_model,
105
124
  index_dir=index_dir,
106
125
  device=device,
107
- trust_remote_code=trust_remote_code,
108
126
  extra_extensions=extra_extensions,
127
+ excluded_patterns=excluded_patterns,
109
128
  )
110
129
 
111
130
  @property
@@ -42,6 +42,7 @@ DEFAULT_INCLUDED_PATTERNS = [
42
42
  "**/*.txt", # Plain text
43
43
  "**/*.rst", # reStructuredText
44
44
  "**/*.php", # PHP
45
+ "**/*.lua", # Lua
45
46
  ]
46
47
 
47
48
  INCLUDED_PATTERNS = DEFAULT_INCLUDED_PATTERNS + [f"**/*{ext}" for ext in config.extra_extensions]
@@ -51,7 +52,7 @@ LANGUAGE_OVERRIDES: dict[str, str] = {
51
52
  ext: lang for ext, lang in config.extra_extensions.items() if lang is not None
52
53
  }
53
54
 
54
- EXCLUDED_PATTERNS = [
55
+ DEFAULT_EXCLUDED_PATTERNS = [
55
56
  "**/.*", # Hidden directories
56
57
  "**/__pycache__", # Python cache
57
58
  "**/node_modules", # Node.js dependencies
@@ -63,6 +64,8 @@ EXCLUDED_PATTERNS = [
63
64
  "**/.cocoindex_code", # Our own index directory
64
65
  ]
65
66
 
67
+ EXCLUDED_PATTERNS = DEFAULT_EXCLUDED_PATTERNS + config.excluded_patterns
68
+
66
69
  # Chunking configuration
67
70
  CHUNK_SIZE = 2000
68
71
  MIN_CHUNK_SIZE = 300
@@ -106,7 +106,7 @@ async def query_codebase(
106
106
  db = coco_env.get_context(SQLITE_DB)
107
107
 
108
108
  # Generate query embedding.
109
- query_embedding = await embedder.embed(query, True, query_prompt_name)
109
+ query_embedding = await embedder.embed(query, query_prompt_name)
110
110
 
111
111
  embedding_bytes = query_embedding.astype("float32").tobytes()
112
112
 
@@ -31,19 +31,15 @@ if config.embedding_model.startswith(SBERT_PREFIX):
31
31
  # Models that define a "query" prompt for asymmetric retrieval.
32
32
  _QUERY_PROMPT_MODELS = {"nomic-ai/nomic-embed-code", "nomic-ai/CodeRankEmbed"}
33
33
  query_prompt_name: str | None = "query" if _model_name in _QUERY_PROMPT_MODELS else None
34
- # Models whose custom remote code is known-compatible with transformers 5.x.
35
- _KNOWN_REMOTE_CODE_MODELS = {"nomic-ai/CodeRankEmbed"}
36
- _trust = config.trust_remote_code or _model_name in _KNOWN_REMOTE_CODE_MODELS
37
34
  embedder = SentenceTransformerEmbedder(
38
35
  _model_name,
39
36
  device=config.device,
40
- trust_remote_code=_trust,
37
+ trust_remote_code=True,
41
38
  )
42
39
  logger.info(
43
- "Embedding model: %s | device: %s | trust_remote_code: %s",
40
+ "Embedding model: %s | device: %s",
44
41
  config.embedding_model,
45
42
  config.device,
46
- _trust,
47
43
  )
48
44
  else:
49
45
  from cocoindex.ops.litellm import LiteLLMEmbedder
File without changes