cocoindex-code 0.1.12__tar.gz → 0.1.14__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cocoindex_code-0.1.12 → cocoindex_code-0.1.14}/PKG-INFO +17 -4
- {cocoindex_code-0.1.12 → cocoindex_code-0.1.14}/README.md +15 -2
- {cocoindex_code-0.1.12 → cocoindex_code-0.1.14}/pyproject.toml +1 -1
- {cocoindex_code-0.1.12 → cocoindex_code-0.1.14}/src/cocoindex_code/config.py +31 -12
- {cocoindex_code-0.1.12 → cocoindex_code-0.1.14}/src/cocoindex_code/indexer.py +4 -1
- {cocoindex_code-0.1.12 → cocoindex_code-0.1.14}/src/cocoindex_code/query.py +1 -1
- {cocoindex_code-0.1.12 → cocoindex_code-0.1.14}/src/cocoindex_code/shared.py +2 -6
- {cocoindex_code-0.1.12 → cocoindex_code-0.1.14}/.gitignore +0 -0
- {cocoindex_code-0.1.12 → cocoindex_code-0.1.14}/LICENSE +0 -0
- {cocoindex_code-0.1.12 → cocoindex_code-0.1.14}/src/cocoindex_code/__init__.py +0 -0
- {cocoindex_code-0.1.12 → cocoindex_code-0.1.14}/src/cocoindex_code/__main__.py +0 -0
- {cocoindex_code-0.1.12 → cocoindex_code-0.1.14}/src/cocoindex_code/schema.py +0 -0
- {cocoindex_code-0.1.12 → cocoindex_code-0.1.14}/src/cocoindex_code/server.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: cocoindex-code
|
|
3
|
-
Version: 0.1.12
|
|
3
|
+
Version: 0.1.14
|
|
4
4
|
Summary: MCP server for indexing and querying codebases using CocoIndex
|
|
5
5
|
Project-URL: Homepage, https://github.com/cocoindex-io/cocoindex-code
|
|
6
6
|
Project-URL: Repository, https://github.com/cocoindex-io/cocoindex-code
|
|
@@ -17,7 +17,7 @@ Classifier: Programming Language :: Python :: 3.12
|
|
|
17
17
|
Classifier: Programming Language :: Python :: 3.13
|
|
18
18
|
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
19
19
|
Requires-Python: >=3.11
|
|
20
|
-
Requires-Dist: cocoindex[litellm]==1.0.
|
|
20
|
+
Requires-Dist: cocoindex[litellm]==1.0.0a29
|
|
21
21
|
Requires-Dist: einops>=0.8.2
|
|
22
22
|
Requires-Dist: mcp>=1.0.0
|
|
23
23
|
Requires-Dist: numpy>=1.24.0
|
|
@@ -165,6 +165,7 @@ Use the cocoindex-code MCP server for semantic code search when:
|
|
|
165
165
|
| `COCOINDEX_CODE_EMBEDDING_MODEL` | Embedding model (see below) | `sbert/sentence-transformers/all-MiniLM-L6-v2` |
|
|
166
166
|
| `COCOINDEX_CODE_BATCH_SIZE` | Max batch size for local embedding model | `16` |
|
|
167
167
|
| `COCOINDEX_CODE_EXTRA_EXTENSIONS` | Additional file extensions to index (comma-separated, e.g. `"inc:php,yaml,toml"` — use `ext:lang` to override language detection) | _(none)_ |
|
|
168
|
+
| `COCOINDEX_CODE_EXCLUDED_PATTERNS` | Additional glob patterns to exclude from indexing as a JSON array (e.g. `'["**/migration.sql", "{**/*.md,**/*.txt}"]'`) | _(none)_ |
|
|
168
169
|
|
|
169
170
|
|
|
170
171
|
### Root Path Discovery
|
|
@@ -297,9 +298,20 @@ claude mcp add cocoindex-code \
|
|
|
297
298
|
|
|
298
299
|
Any model supported by LiteLLM works — see the [full list of embedding providers](https://docs.litellm.ai/docs/embedding/supported_embedding).
|
|
299
300
|
|
|
300
|
-
###
|
|
301
|
+
### Local SentenceTransformers models
|
|
301
302
|
|
|
302
|
-
|
|
303
|
+
Use the `sbert/` prefix to load any [SentenceTransformers](https://www.sbert.net/) model locally (no API key required).
|
|
304
|
+
|
|
305
|
+
**Example — general purpose text model:**
|
|
306
|
+
```bash
|
|
307
|
+
claude mcp add cocoindex-code \
|
|
308
|
+
-e COCOINDEX_CODE_EMBEDDING_MODEL=sbert/nomic-ai/nomic-embed-text-v1 \
|
|
309
|
+
-- cocoindex-code
|
|
310
|
+
```
|
|
311
|
+
|
|
312
|
+
**GPU-optimised code retrieval:**
|
|
313
|
+
|
|
314
|
+
[`nomic-ai/CodeRankEmbed`](https://huggingface.co/nomic-ai/CodeRankEmbed) delivers significantly better code retrieval than the default model. It is 137M parameters, requires ~1 GB VRAM, and has an 8192-token context window.
|
|
303
315
|
|
|
304
316
|
```bash
|
|
305
317
|
claude mcp add cocoindex-code \
|
|
@@ -355,6 +367,7 @@ Returns matching code chunks with:
|
|
|
355
367
|
| javascript | js | `.js` |
|
|
356
368
|
| json | | `.json` |
|
|
357
369
|
| kotlin | | `.kt`, `.kts` |
|
|
370
|
+
| lua | | `.lua` |
|
|
358
371
|
| markdown | md | `.md`, `.mdx` |
|
|
359
372
|
| pascal | pas, dpr, delphi | `.pas`, `.dpr` |
|
|
360
373
|
| php | | `.php` |
|
|
@@ -130,6 +130,7 @@ Use the cocoindex-code MCP server for semantic code search when:
|
|
|
130
130
|
| `COCOINDEX_CODE_EMBEDDING_MODEL` | Embedding model (see below) | `sbert/sentence-transformers/all-MiniLM-L6-v2` |
|
|
131
131
|
| `COCOINDEX_CODE_BATCH_SIZE` | Max batch size for local embedding model | `16` |
|
|
132
132
|
| `COCOINDEX_CODE_EXTRA_EXTENSIONS` | Additional file extensions to index (comma-separated, e.g. `"inc:php,yaml,toml"` — use `ext:lang` to override language detection) | _(none)_ |
|
|
133
|
+
| `COCOINDEX_CODE_EXCLUDED_PATTERNS` | Additional glob patterns to exclude from indexing as a JSON array (e.g. `'["**/migration.sql", "{**/*.md,**/*.txt}"]'`) | _(none)_ |
|
|
133
134
|
|
|
134
135
|
|
|
135
136
|
### Root Path Discovery
|
|
@@ -262,9 +263,20 @@ claude mcp add cocoindex-code \
|
|
|
262
263
|
|
|
263
264
|
Any model supported by LiteLLM works — see the [full list of embedding providers](https://docs.litellm.ai/docs/embedding/supported_embedding).
|
|
264
265
|
|
|
265
|
-
###
|
|
266
|
+
### Local SentenceTransformers models
|
|
266
267
|
|
|
267
|
-
|
|
268
|
+
Use the `sbert/` prefix to load any [SentenceTransformers](https://www.sbert.net/) model locally (no API key required).
|
|
269
|
+
|
|
270
|
+
**Example — general purpose text model:**
|
|
271
|
+
```bash
|
|
272
|
+
claude mcp add cocoindex-code \
|
|
273
|
+
-e COCOINDEX_CODE_EMBEDDING_MODEL=sbert/nomic-ai/nomic-embed-text-v1 \
|
|
274
|
+
-- cocoindex-code
|
|
275
|
+
```
|
|
276
|
+
|
|
277
|
+
**GPU-optimised code retrieval:**
|
|
278
|
+
|
|
279
|
+
[`nomic-ai/CodeRankEmbed`](https://huggingface.co/nomic-ai/CodeRankEmbed) delivers significantly better code retrieval than the default model. It is 137M parameters, requires ~1 GB VRAM, and has an 8192-token context window.
|
|
268
280
|
|
|
269
281
|
```bash
|
|
270
282
|
claude mcp add cocoindex-code \
|
|
@@ -320,6 +332,7 @@ Returns matching code chunks with:
|
|
|
320
332
|
| javascript | js | `.js` |
|
|
321
333
|
| json | | `.json` |
|
|
322
334
|
| kotlin | | `.kt`, `.kts` |
|
|
335
|
+
| lua | | `.lua` |
|
|
323
336
|
| markdown | md | `.md`, `.mdx` |
|
|
324
337
|
| pascal | pas, dpr, delphi | `.pas`, `.dpr` |
|
|
325
338
|
| php | | `.php` |
|
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
|
+
import json
|
|
5
6
|
import os
|
|
6
7
|
from dataclasses import dataclass
|
|
7
8
|
from pathlib import Path
|
|
@@ -42,6 +43,31 @@ def _discover_codebase_root() -> Path:
|
|
|
42
43
|
return root if root is not None else cwd
|
|
43
44
|
|
|
44
45
|
|
|
46
|
+
def _parse_json_string_list_env(var_name: str) -> list[str]:
|
|
47
|
+
"""Parse an environment variable as a JSON array of strings."""
|
|
48
|
+
raw_value = os.environ.get(var_name, "")
|
|
49
|
+
if not raw_value.strip():
|
|
50
|
+
return []
|
|
51
|
+
|
|
52
|
+
try:
|
|
53
|
+
parsed = json.loads(raw_value)
|
|
54
|
+
except json.JSONDecodeError as exc:
|
|
55
|
+
raise ValueError(f"{var_name} must be a JSON array of strings, got invalid JSON") from exc
|
|
56
|
+
|
|
57
|
+
if not isinstance(parsed, list):
|
|
58
|
+
raise ValueError(f"{var_name} must be a JSON array of strings")
|
|
59
|
+
|
|
60
|
+
result: list[str] = []
|
|
61
|
+
for item in parsed:
|
|
62
|
+
if not isinstance(item, str):
|
|
63
|
+
raise ValueError(f"{var_name} must be a JSON array of strings")
|
|
64
|
+
item = item.strip()
|
|
65
|
+
if item:
|
|
66
|
+
result.append(item)
|
|
67
|
+
|
|
68
|
+
return result
|
|
69
|
+
|
|
70
|
+
|
|
45
71
|
@dataclass
|
|
46
72
|
class Config:
|
|
47
73
|
"""Configuration loaded from environment variables."""
|
|
@@ -50,8 +76,8 @@ class Config:
|
|
|
50
76
|
embedding_model: str
|
|
51
77
|
index_dir: Path
|
|
52
78
|
device: str | None
|
|
53
|
-
trust_remote_code: bool
|
|
54
79
|
extra_extensions: dict[str, str | None]
|
|
80
|
+
excluded_patterns: list[str]
|
|
55
81
|
|
|
56
82
|
@classmethod
|
|
57
83
|
def from_env(cls) -> Config:
|
|
@@ -76,16 +102,6 @@ class Config:
|
|
|
76
102
|
# Device: auto-detect CUDA or use env override
|
|
77
103
|
device = os.environ.get("COCOINDEX_CODE_DEVICE")
|
|
78
104
|
|
|
79
|
-
# trust_remote_code: opt-in via env var only.
|
|
80
|
-
# sentence-transformers 5.x+ supports Jina models natively, so
|
|
81
|
-
# auto-enabling this for jinaai/ models causes failures with
|
|
82
|
-
# transformers 5.x (removed find_pruneable_heads_and_indices).
|
|
83
|
-
trust_remote_code = os.environ.get("COCOINDEX_CODE_TRUST_REMOTE_CODE", "").lower() in (
|
|
84
|
-
"1",
|
|
85
|
-
"true",
|
|
86
|
-
"yes",
|
|
87
|
-
)
|
|
88
|
-
|
|
89
105
|
# Extra file extensions (format: "inc:php,yaml,toml" — optional lang after colon)
|
|
90
106
|
raw_extra = os.environ.get("COCOINDEX_CODE_EXTRA_EXTENSIONS", "")
|
|
91
107
|
extra_extensions: dict[str, str | None] = {}
|
|
@@ -99,13 +115,16 @@ class Config:
|
|
|
99
115
|
else:
|
|
100
116
|
extra_extensions[f".{token}"] = None
|
|
101
117
|
|
|
118
|
+
# Excluded file glob patterns
|
|
119
|
+
excluded_patterns = _parse_json_string_list_env("COCOINDEX_CODE_EXCLUDED_PATTERNS")
|
|
120
|
+
|
|
102
121
|
return cls(
|
|
103
122
|
codebase_root_path=root,
|
|
104
123
|
embedding_model=embedding_model,
|
|
105
124
|
index_dir=index_dir,
|
|
106
125
|
device=device,
|
|
107
|
-
trust_remote_code=trust_remote_code,
|
|
108
126
|
extra_extensions=extra_extensions,
|
|
127
|
+
excluded_patterns=excluded_patterns,
|
|
109
128
|
)
|
|
110
129
|
|
|
111
130
|
@property
|
|
@@ -42,6 +42,7 @@ DEFAULT_INCLUDED_PATTERNS = [
|
|
|
42
42
|
"**/*.txt", # Plain text
|
|
43
43
|
"**/*.rst", # reStructuredText
|
|
44
44
|
"**/*.php", # PHP
|
|
45
|
+
"**/*.lua", # Lua
|
|
45
46
|
]
|
|
46
47
|
|
|
47
48
|
INCLUDED_PATTERNS = DEFAULT_INCLUDED_PATTERNS + [f"**/*{ext}" for ext in config.extra_extensions]
|
|
@@ -51,7 +52,7 @@ LANGUAGE_OVERRIDES: dict[str, str] = {
|
|
|
51
52
|
ext: lang for ext, lang in config.extra_extensions.items() if lang is not None
|
|
52
53
|
}
|
|
53
54
|
|
|
54
|
-
|
|
55
|
+
DEFAULT_EXCLUDED_PATTERNS = [
|
|
55
56
|
"**/.*", # Hidden directories
|
|
56
57
|
"**/__pycache__", # Python cache
|
|
57
58
|
"**/node_modules", # Node.js dependencies
|
|
@@ -63,6 +64,8 @@ EXCLUDED_PATTERNS = [
|
|
|
63
64
|
"**/.cocoindex_code", # Our own index directory
|
|
64
65
|
]
|
|
65
66
|
|
|
67
|
+
EXCLUDED_PATTERNS = DEFAULT_EXCLUDED_PATTERNS + config.excluded_patterns
|
|
68
|
+
|
|
66
69
|
# Chunking configuration
|
|
67
70
|
CHUNK_SIZE = 2000
|
|
68
71
|
MIN_CHUNK_SIZE = 300
|
|
@@ -106,7 +106,7 @@ async def query_codebase(
|
|
|
106
106
|
db = coco_env.get_context(SQLITE_DB)
|
|
107
107
|
|
|
108
108
|
# Generate query embedding.
|
|
109
|
-
query_embedding = await embedder.embed(query,
|
|
109
|
+
query_embedding = await embedder.embed(query, query_prompt_name)
|
|
110
110
|
|
|
111
111
|
embedding_bytes = query_embedding.astype("float32").tobytes()
|
|
112
112
|
|
|
@@ -31,19 +31,15 @@ if config.embedding_model.startswith(SBERT_PREFIX):
|
|
|
31
31
|
# Models that define a "query" prompt for asymmetric retrieval.
|
|
32
32
|
_QUERY_PROMPT_MODELS = {"nomic-ai/nomic-embed-code", "nomic-ai/CodeRankEmbed"}
|
|
33
33
|
query_prompt_name: str | None = "query" if _model_name in _QUERY_PROMPT_MODELS else None
|
|
34
|
-
# Models whose custom remote code is known-compatible with transformers 5.x.
|
|
35
|
-
_KNOWN_REMOTE_CODE_MODELS = {"nomic-ai/CodeRankEmbed"}
|
|
36
|
-
_trust = config.trust_remote_code or _model_name in _KNOWN_REMOTE_CODE_MODELS
|
|
37
34
|
embedder = SentenceTransformerEmbedder(
|
|
38
35
|
_model_name,
|
|
39
36
|
device=config.device,
|
|
40
|
-
trust_remote_code=_trust,
|
|
37
|
+
trust_remote_code=True,
|
|
41
38
|
)
|
|
42
39
|
logger.info(
|
|
43
|
-
"Embedding model: %s | device: %s
|
|
40
|
+
"Embedding model: %s | device: %s",
|
|
44
41
|
config.embedding_model,
|
|
45
42
|
config.device,
|
|
46
|
-
_trust,
|
|
47
43
|
)
|
|
48
44
|
else:
|
|
49
45
|
from cocoindex.ops.litellm import LiteLLMEmbedder
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|