cocoindex-code 0.1.8__tar.gz → 0.1.10__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cocoindex_code-0.1.8 → cocoindex_code-0.1.10}/PKG-INFO +37 -6
- {cocoindex_code-0.1.8 → cocoindex_code-0.1.10}/README.md +35 -4
- {cocoindex_code-0.1.8 → cocoindex_code-0.1.10}/pyproject.toml +1 -5
- {cocoindex_code-0.1.8 → cocoindex_code-0.1.10}/src/cocoindex_code/config.py +14 -15
- {cocoindex_code-0.1.8 → cocoindex_code-0.1.10}/src/cocoindex_code/indexer.py +15 -2
- {cocoindex_code-0.1.8 → cocoindex_code-0.1.10}/src/cocoindex_code/query.py +2 -5
- {cocoindex_code-0.1.8 → cocoindex_code-0.1.10}/src/cocoindex_code/shared.py +6 -7
- cocoindex_code-0.1.8/src/cocoindex_code/embedder.py +0 -119
- {cocoindex_code-0.1.8 → cocoindex_code-0.1.10}/.gitignore +0 -0
- {cocoindex_code-0.1.8 → cocoindex_code-0.1.10}/LICENSE +0 -0
- {cocoindex_code-0.1.8 → cocoindex_code-0.1.10}/src/cocoindex_code/__init__.py +0 -0
- {cocoindex_code-0.1.8 → cocoindex_code-0.1.10}/src/cocoindex_code/__main__.py +0 -0
- {cocoindex_code-0.1.8 → cocoindex_code-0.1.10}/src/cocoindex_code/schema.py +0 -0
- {cocoindex_code-0.1.8 → cocoindex_code-0.1.10}/src/cocoindex_code/server.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: cocoindex-code
|
|
3
|
-
Version: 0.1.8
|
|
3
|
+
Version: 0.1.10
|
|
4
4
|
Summary: MCP server for indexing and querying codebases using CocoIndex
|
|
5
5
|
Project-URL: Homepage, https://github.com/cocoindex-io/cocoindex-code
|
|
6
6
|
Project-URL: Repository, https://github.com/cocoindex-io/cocoindex-code
|
|
@@ -17,7 +17,7 @@ Classifier: Programming Language :: Python :: 3.12
|
|
|
17
17
|
Classifier: Programming Language :: Python :: 3.13
|
|
18
18
|
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
19
19
|
Requires-Python: >=3.11
|
|
20
|
-
Requires-Dist: cocoindex[litellm]==1.0.
|
|
20
|
+
Requires-Dist: cocoindex[litellm]==1.0.0a26
|
|
21
21
|
Requires-Dist: einops>=0.8.2
|
|
22
22
|
Requires-Dist: mcp>=1.0.0
|
|
23
23
|
Requires-Dist: numpy>=1.24.0
|
|
@@ -40,6 +40,10 @@ Description-Content-Type: text/markdown
|
|
|
40
40
|
|
|
41
41
|
<h1 align="center">light weight MCP for code that just works </h1>
|
|
42
42
|
|
|
43
|
+

|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
|
|
43
47
|
|
|
44
48
|
A super light-weight, effective embedded MCP **(AST-based)** that understands and searches your codebase and just works! Built on [CocoIndex](https://github.com/cocoindex-io/cocoindex) - a Rust-based, ultra-performant data transformation engine. No black box. Works for Claude, Codex, Cursor - any coding agent.
|
|
45
49
|
|
|
@@ -75,10 +79,15 @@ A super light-weight, effective embedded MCP **(AST-based)** that understand and
|
|
|
75
79
|
|
|
76
80
|
## Get Started - zero config, let's go!!
|
|
77
81
|
|
|
78
|
-
|
|
82
|
+
Using [pipx](https://pipx.pypa.io/stable/installation/):
|
|
83
|
+
```bash
|
|
84
|
+
pipx install cocoindex-code # first install
|
|
85
|
+
pipx upgrade cocoindex-code # upgrade
|
|
86
|
+
```
|
|
79
87
|
|
|
88
|
+
Using [uv](https://docs.astral.sh/uv/getting-started/installation/):
|
|
80
89
|
```bash
|
|
81
|
-
|
|
90
|
+
uv tool install --upgrade cocoindex-code --prerelease explicit --with "cocoindex>=1.0.0a24"
|
|
82
91
|
```
|
|
83
92
|
|
|
84
93
|
### Claude
|
|
@@ -116,6 +125,20 @@ Or use opencode.json:
|
|
|
116
125
|
|
|
117
126
|
Optionally, you can run `cocoindex-code index` to create or update the index. Without running it, the MCP server will automatically build and keep the index up-to-date in the background.
|
|
118
127
|
|
|
128
|
+
## When Is the MCP Triggered?
|
|
129
|
+
|
|
130
|
+
Once configured, your coding agent (Claude Code, Codex, Cursor, etc.) automatically decides when semantic code search is helpful — especially for finding code by description, exploring unfamiliar codebases, fuzzy/conceptual matches, or locating implementations without knowing exact names.
|
|
131
|
+
|
|
132
|
+
You can also nudge the agent explicitly, e.g. *"Use the cocoindex-code MCP to find how user sessions are managed."* For persistent instructions, add guidance to your project's `AGENTS.md` or `CLAUDE.md`:
|
|
133
|
+
|
|
134
|
+
```
|
|
135
|
+
Use the cocoindex-code MCP server for semantic code search when:
|
|
136
|
+
- Searching for code by meaning or description rather than exact text
|
|
137
|
+
- Exploring unfamiliar parts of the codebase
|
|
138
|
+
- Looking for implementations without knowing exact names
|
|
139
|
+
- Finding similar code patterns or related functionality
|
|
140
|
+
```
|
|
141
|
+
|
|
119
142
|
## Features
|
|
120
143
|
- **Semantic Code Search**: Find relevant code using natural language queries when grep doesn't work well, and save tokens immediately.
|
|
121
144
|
- **Ultra-performant on code changes**: ⚡ Built on top of the ultra-performant [Rust indexing engine](https://github.com/cocoindex-io/cocoindex). Only re-indexes changed files for fast updates.
|
|
@@ -131,6 +154,7 @@ Optionally, you can run `cocoindex-code index` to create or update the index. Wi
|
|
|
131
154
|
| `COCOINDEX_CODE_ROOT_PATH` | Root path of the codebase | Auto-discovered (see below) |
|
|
132
155
|
| `COCOINDEX_CODE_EMBEDDING_MODEL` | Embedding model (see below) | `sbert/sentence-transformers/all-MiniLM-L6-v2` |
|
|
133
156
|
| `COCOINDEX_CODE_BATCH_SIZE` | Max batch size for local embedding model | `16` |
|
|
157
|
+
| `COCOINDEX_CODE_EXTRA_EXTENSIONS` | Additional file extensions to index (comma-separated, e.g. `"inc:php,yaml,toml"` — use `ext:lang` to override language detection) | _(none)_ |
|
|
134
158
|
|
|
135
159
|
|
|
136
160
|
### Root Path Discovery
|
|
@@ -358,10 +382,17 @@ Some Python installations (e.g. the one pre-installed on macOS) ship with a SQLi
|
|
|
358
382
|
brew install python3
|
|
359
383
|
```
|
|
360
384
|
|
|
361
|
-
Then re-install cocoindex-code
|
|
385
|
+
Then re-install cocoindex-code (see [Get Started](#get-started---zero-config-lets-go) for install options):
|
|
386
|
+
|
|
387
|
+
Using pipx:
|
|
388
|
+
```bash
|
|
389
|
+
pipx install cocoindex-code # first install
|
|
390
|
+
pipx upgrade cocoindex-code # upgrade
|
|
391
|
+
```
|
|
362
392
|
|
|
393
|
+
Using uv (install or upgrade):
|
|
363
394
|
```bash
|
|
364
|
-
|
|
395
|
+
uv tool install --upgrade cocoindex-code --prerelease explicit --with "cocoindex>=1.0.0a24"
|
|
365
396
|
```
|
|
366
397
|
|
|
367
398
|
## Large codebase / Enterprise
|
|
@@ -5,6 +5,10 @@
|
|
|
5
5
|
|
|
6
6
|
<h1 align="center">light weight MCP for code that just works </h1>
|
|
7
7
|
|
|
8
|
+

|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
|
|
8
12
|
|
|
9
13
|
A super light-weight, effective embedded MCP **(AST-based)** that understands and searches your codebase and just works! Built on [CocoIndex](https://github.com/cocoindex-io/cocoindex) - a Rust-based, ultra-performant data transformation engine. No black box. Works for Claude, Codex, Cursor - any coding agent.
|
|
10
14
|
|
|
@@ -40,10 +44,15 @@ A super light-weight, effective embedded MCP **(AST-based)** that understand and
|
|
|
40
44
|
|
|
41
45
|
## Get Started - zero config, let's go!!
|
|
42
46
|
|
|
43
|
-
|
|
47
|
+
Using [pipx](https://pipx.pypa.io/stable/installation/):
|
|
48
|
+
```bash
|
|
49
|
+
pipx install cocoindex-code # first install
|
|
50
|
+
pipx upgrade cocoindex-code # upgrade
|
|
51
|
+
```
|
|
44
52
|
|
|
53
|
+
Using [uv](https://docs.astral.sh/uv/getting-started/installation/):
|
|
45
54
|
```bash
|
|
46
|
-
|
|
55
|
+
uv tool install --upgrade cocoindex-code --prerelease explicit --with "cocoindex>=1.0.0a24"
|
|
47
56
|
```
|
|
48
57
|
|
|
49
58
|
### Claude
|
|
@@ -81,6 +90,20 @@ Or use opencode.json:
|
|
|
81
90
|
|
|
82
91
|
Optionally, you can run `cocoindex-code index` to create or update the index. Without running it, the MCP server will automatically build and keep the index up-to-date in the background.
|
|
83
92
|
|
|
93
|
+
## When Is the MCP Triggered?
|
|
94
|
+
|
|
95
|
+
Once configured, your coding agent (Claude Code, Codex, Cursor, etc.) automatically decides when semantic code search is helpful — especially for finding code by description, exploring unfamiliar codebases, fuzzy/conceptual matches, or locating implementations without knowing exact names.
|
|
96
|
+
|
|
97
|
+
You can also nudge the agent explicitly, e.g. *"Use the cocoindex-code MCP to find how user sessions are managed."* For persistent instructions, add guidance to your project's `AGENTS.md` or `CLAUDE.md`:
|
|
98
|
+
|
|
99
|
+
```
|
|
100
|
+
Use the cocoindex-code MCP server for semantic code search when:
|
|
101
|
+
- Searching for code by meaning or description rather than exact text
|
|
102
|
+
- Exploring unfamiliar parts of the codebase
|
|
103
|
+
- Looking for implementations without knowing exact names
|
|
104
|
+
- Finding similar code patterns or related functionality
|
|
105
|
+
```
|
|
106
|
+
|
|
84
107
|
## Features
|
|
85
108
|
- **Semantic Code Search**: Find relevant code using natural language queries when grep doesn't work well, and save tokens immediately.
|
|
86
109
|
- **Ultra-performant on code changes**: ⚡ Built on top of the ultra-performant [Rust indexing engine](https://github.com/cocoindex-io/cocoindex). Only re-indexes changed files for fast updates.
|
|
@@ -96,6 +119,7 @@ Optionally, you can run `cocoindex-code index` to create or update the index. Wi
|
|
|
96
119
|
| `COCOINDEX_CODE_ROOT_PATH` | Root path of the codebase | Auto-discovered (see below) |
|
|
97
120
|
| `COCOINDEX_CODE_EMBEDDING_MODEL` | Embedding model (see below) | `sbert/sentence-transformers/all-MiniLM-L6-v2` |
|
|
98
121
|
| `COCOINDEX_CODE_BATCH_SIZE` | Max batch size for local embedding model | `16` |
|
|
122
|
+
| `COCOINDEX_CODE_EXTRA_EXTENSIONS` | Additional file extensions to index (comma-separated, e.g. `"inc:php,yaml,toml"` — use `ext:lang` to override language detection) | _(none)_ |
|
|
99
123
|
|
|
100
124
|
|
|
101
125
|
### Root Path Discovery
|
|
@@ -323,10 +347,17 @@ Some Python installations (e.g. the one pre-installed on macOS) ship with a SQLi
|
|
|
323
347
|
brew install python3
|
|
324
348
|
```
|
|
325
349
|
|
|
326
|
-
Then re-install cocoindex-code
|
|
350
|
+
Then re-install cocoindex-code (see [Get Started](#get-started---zero-config-lets-go) for install options):
|
|
351
|
+
|
|
352
|
+
Using pipx:
|
|
353
|
+
```bash
|
|
354
|
+
pipx install cocoindex-code # first install
|
|
355
|
+
pipx upgrade cocoindex-code # upgrade
|
|
356
|
+
```
|
|
327
357
|
|
|
358
|
+
Using uv (install or upgrade):
|
|
328
359
|
```bash
|
|
329
|
-
|
|
360
|
+
uv tool install --upgrade cocoindex-code --prerelease explicit --with "cocoindex>=1.0.0a24"
|
|
330
361
|
```
|
|
331
362
|
|
|
332
363
|
## Large codebase / Enterprise
|
|
@@ -23,7 +23,7 @@ classifiers = [
|
|
|
23
23
|
|
|
24
24
|
dependencies = [
|
|
25
25
|
"mcp>=1.0.0",
|
|
26
|
-
"cocoindex[litellm]==1.0.
|
|
26
|
+
"cocoindex[litellm]==1.0.0a26",
|
|
27
27
|
"sentence-transformers>=2.2.0",
|
|
28
28
|
"sqlite-vec>=0.1.0",
|
|
29
29
|
"pydantic>=2.0.0",
|
|
@@ -82,10 +82,6 @@ python_version = "3.11"
|
|
|
82
82
|
strict = true
|
|
83
83
|
ignore_missing_imports = true
|
|
84
84
|
|
|
85
|
-
[[tool.mypy.overrides]]
|
|
86
|
-
module = "cocoindex_code.embedder"
|
|
87
|
-
warn_unused_ignores = false
|
|
88
|
-
|
|
89
85
|
[tool.pytest.ini_options]
|
|
90
86
|
testpaths = ["tests"]
|
|
91
87
|
python_files = ["test_*.py"]
|
|
@@ -6,7 +6,6 @@ import os
|
|
|
6
6
|
from dataclasses import dataclass
|
|
7
7
|
from pathlib import Path
|
|
8
8
|
|
|
9
|
-
_SBERT_PREFIX = "sbert/"
|
|
10
9
|
_DEFAULT_MODEL = "sbert/sentence-transformers/all-MiniLM-L6-v2"
|
|
11
10
|
|
|
12
11
|
|
|
@@ -65,7 +64,7 @@ class Config:
|
|
|
65
64
|
index_dir: Path
|
|
66
65
|
device: str
|
|
67
66
|
trust_remote_code: bool
|
|
68
|
-
|
|
67
|
+
extra_extensions: dict[str, str | None]
|
|
69
68
|
|
|
70
69
|
@classmethod
|
|
71
70
|
def from_env(cls) -> Config:
|
|
@@ -100,18 +99,18 @@ class Config:
|
|
|
100
99
|
"yes",
|
|
101
100
|
)
|
|
102
101
|
|
|
103
|
-
#
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
102
|
+
# Extra file extensions (format: "inc:php,yaml,toml" — optional lang after colon)
|
|
103
|
+
raw_extra = os.environ.get("COCOINDEX_CODE_EXTRA_EXTENSIONS", "")
|
|
104
|
+
extra_extensions: dict[str, str | None] = {}
|
|
105
|
+
for token in raw_extra.split(","):
|
|
106
|
+
token = token.strip()
|
|
107
|
+
if not token:
|
|
108
|
+
continue
|
|
109
|
+
if ":" in token:
|
|
110
|
+
ext, lang = token.split(":", 1)
|
|
111
|
+
extra_extensions[f".{ext.strip()}"] = lang.strip() or None
|
|
112
|
+
else:
|
|
113
|
+
extra_extensions[f".{token}"] = None
|
|
115
114
|
|
|
116
115
|
return cls(
|
|
117
116
|
codebase_root_path=root,
|
|
@@ -119,7 +118,7 @@ class Config:
|
|
|
119
118
|
index_dir=index_dir,
|
|
120
119
|
device=device,
|
|
121
120
|
trust_remote_code=trust_remote_code,
|
|
122
|
-
|
|
121
|
+
extra_extensions=extra_extensions,
|
|
123
122
|
)
|
|
124
123
|
|
|
125
124
|
@property
|
|
@@ -8,10 +8,11 @@ from cocoindex.resources.chunk import Chunk
|
|
|
8
8
|
from cocoindex.resources.file import PatternFilePathMatcher
|
|
9
9
|
from cocoindex.resources.id import IdGenerator
|
|
10
10
|
|
|
11
|
+
from .config import config
|
|
11
12
|
from .shared import CODEBASE_DIR, SQLITE_DB, CodeChunk, embedder
|
|
12
13
|
|
|
13
14
|
# File patterns for supported languages
|
|
14
|
-
|
|
15
|
+
DEFAULT_INCLUDED_PATTERNS = [
|
|
15
16
|
"**/*.py", # Python
|
|
16
17
|
"**/*.pyi", # Python stubs
|
|
17
18
|
"**/*.js", # JavaScript
|
|
@@ -43,6 +44,13 @@ INCLUDED_PATTERNS = [
|
|
|
43
44
|
"**/*.php", # PHP
|
|
44
45
|
]
|
|
45
46
|
|
|
47
|
+
INCLUDED_PATTERNS = DEFAULT_INCLUDED_PATTERNS + [f"**/*{ext}" for ext in config.extra_extensions]
|
|
48
|
+
|
|
49
|
+
# Language overrides from extra_extensions (e.g. ".inc" -> "php")
|
|
50
|
+
LANGUAGE_OVERRIDES: dict[str, str] = {
|
|
51
|
+
ext: lang for ext, lang in config.extra_extensions.items() if lang is not None
|
|
52
|
+
}
|
|
53
|
+
|
|
46
54
|
EXCLUDED_PATTERNS = [
|
|
47
55
|
"**/.*", # Hidden directories
|
|
48
56
|
"**/__pycache__", # Python cache
|
|
@@ -81,7 +89,12 @@ async def process_file(
|
|
|
81
89
|
return
|
|
82
90
|
|
|
83
91
|
# Get relative path and detect language
|
|
84
|
-
|
|
92
|
+
suffix = file.file_path.path.suffix
|
|
93
|
+
language = (
|
|
94
|
+
LANGUAGE_OVERRIDES.get(suffix)
|
|
95
|
+
or detect_code_language(filename=file.file_path.path.name)
|
|
96
|
+
or "text"
|
|
97
|
+
)
|
|
85
98
|
|
|
86
99
|
# Split into chunks
|
|
87
100
|
chunks = splitter.split(
|
|
@@ -8,7 +8,7 @@ import cocoindex as coco
|
|
|
8
8
|
|
|
9
9
|
from .config import config
|
|
10
10
|
from .schema import QueryResult
|
|
11
|
-
from .shared import SQLITE_DB, embedder
|
|
11
|
+
from .shared import SQLITE_DB, embedder, query_prompt_name
|
|
12
12
|
|
|
13
13
|
|
|
14
14
|
def _l2_to_score(distance: float) -> float:
|
|
@@ -106,10 +106,7 @@ async def query_codebase(
|
|
|
106
106
|
db = coco_env.get_context(SQLITE_DB)
|
|
107
107
|
|
|
108
108
|
# Generate query embedding.
|
|
109
|
-
|
|
110
|
-
query_embedding = await embedder.embed_query(query)
|
|
111
|
-
else:
|
|
112
|
-
query_embedding = await embedder.embed(query)
|
|
109
|
+
query_embedding = await embedder.embed(query, True, query_prompt_name)
|
|
113
110
|
|
|
114
111
|
embedding_bytes = query_embedding.astype("float32").tobytes()
|
|
115
112
|
|
|
@@ -14,8 +14,7 @@ from numpy.typing import NDArray
|
|
|
14
14
|
|
|
15
15
|
if TYPE_CHECKING:
|
|
16
16
|
from cocoindex.ops.litellm import LiteLLMEmbedder
|
|
17
|
-
|
|
18
|
-
from .embedder import LocalEmbedder
|
|
17
|
+
from cocoindex.ops.sentence_transformers import SentenceTransformerEmbedder
|
|
19
18
|
|
|
20
19
|
from .config import config
|
|
21
20
|
|
|
@@ -24,22 +23,21 @@ logger = logging.getLogger(__name__)
|
|
|
24
23
|
SBERT_PREFIX = "sbert/"
|
|
25
24
|
|
|
26
25
|
# Initialize embedder at module level based on model prefix
|
|
27
|
-
embedder:
|
|
26
|
+
embedder: SentenceTransformerEmbedder | LiteLLMEmbedder
|
|
28
27
|
if config.embedding_model.startswith(SBERT_PREFIX):
|
|
29
|
-
from .
|
|
28
|
+
from cocoindex.ops.sentence_transformers import SentenceTransformerEmbedder
|
|
30
29
|
|
|
31
30
|
_model_name = config.embedding_model[len(SBERT_PREFIX) :]
|
|
32
31
|
# Models that define a "query" prompt for asymmetric retrieval.
|
|
33
32
|
_QUERY_PROMPT_MODELS = {"nomic-ai/nomic-embed-code", "nomic-ai/CodeRankEmbed"}
|
|
34
|
-
|
|
33
|
+
query_prompt_name: str | None = "query" if _model_name in _QUERY_PROMPT_MODELS else None
|
|
35
34
|
# Models whose custom remote code is known-compatible with transformers 5.x.
|
|
36
35
|
_KNOWN_REMOTE_CODE_MODELS = {"nomic-ai/CodeRankEmbed"}
|
|
37
36
|
_trust = config.trust_remote_code or _model_name in _KNOWN_REMOTE_CODE_MODELS
|
|
38
|
-
embedder =
|
|
37
|
+
embedder = SentenceTransformerEmbedder(
|
|
39
38
|
_model_name,
|
|
40
39
|
device=config.device,
|
|
41
40
|
trust_remote_code=_trust,
|
|
42
|
-
query_prompt_name=_query_prompt_name,
|
|
43
41
|
)
|
|
44
42
|
logger.info(
|
|
45
43
|
"Embedding model: %s | device: %s | trust_remote_code: %s",
|
|
@@ -51,6 +49,7 @@ else:
|
|
|
51
49
|
from cocoindex.ops.litellm import LiteLLMEmbedder
|
|
52
50
|
|
|
53
51
|
embedder = LiteLLMEmbedder(config.embedding_model)
|
|
52
|
+
query_prompt_name = None
|
|
54
53
|
logger.info("Embedding model (LiteLLM): %s", config.embedding_model)
|
|
55
54
|
|
|
56
55
|
# Context key for SQLite database (connection managed in lifespan)
|
|
@@ -1,119 +0,0 @@
|
|
|
1
|
-
"""Local SentenceTransformer embedder with device and trust_remote_code support."""
|
|
2
|
-
|
|
3
|
-
from __future__ import annotations
|
|
4
|
-
|
|
5
|
-
import threading
|
|
6
|
-
from typing import TYPE_CHECKING, Any
|
|
7
|
-
|
|
8
|
-
import cocoindex as coco
|
|
9
|
-
import numpy as np
|
|
10
|
-
from cocoindex.resources import schema as _schema
|
|
11
|
-
from numpy.typing import NDArray
|
|
12
|
-
|
|
13
|
-
from .config import config as _config
|
|
14
|
-
|
|
15
|
-
if TYPE_CHECKING:
|
|
16
|
-
from sentence_transformers import SentenceTransformer
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
class LocalEmbedder(_schema.VectorSchemaProvider):
|
|
20
|
-
"""SentenceTransformer embedder with explicit device and trust_remote_code support.
|
|
21
|
-
|
|
22
|
-
Drop-in replacement for cocoindex's SentenceTransformerEmbedder that supports:
|
|
23
|
-
- Explicit device selection (e.g. "cuda", "cpu")
|
|
24
|
-
- trust_remote_code for models with custom pooling (e.g. Jina models)
|
|
25
|
-
"""
|
|
26
|
-
|
|
27
|
-
def __init__(
|
|
28
|
-
self,
|
|
29
|
-
model_name_or_path: str,
|
|
30
|
-
*,
|
|
31
|
-
device: str = "cpu",
|
|
32
|
-
trust_remote_code: bool = False,
|
|
33
|
-
normalize_embeddings: bool = True,
|
|
34
|
-
query_prompt_name: str | None = None,
|
|
35
|
-
) -> None:
|
|
36
|
-
self._model_name_or_path = model_name_or_path
|
|
37
|
-
self._device = device
|
|
38
|
-
self._trust_remote_code = trust_remote_code
|
|
39
|
-
self._normalize_embeddings = normalize_embeddings
|
|
40
|
-
self._query_prompt_name = query_prompt_name
|
|
41
|
-
self._model: SentenceTransformer | None = None
|
|
42
|
-
self._lock = threading.Lock()
|
|
43
|
-
|
|
44
|
-
def __getstate__(self) -> dict[str, Any]:
|
|
45
|
-
return {
|
|
46
|
-
"model_name_or_path": self._model_name_or_path,
|
|
47
|
-
"device": self._device,
|
|
48
|
-
"trust_remote_code": self._trust_remote_code,
|
|
49
|
-
"normalize_embeddings": self._normalize_embeddings,
|
|
50
|
-
"query_prompt_name": self._query_prompt_name,
|
|
51
|
-
}
|
|
52
|
-
|
|
53
|
-
def __setstate__(self, state: dict[str, Any]) -> None:
|
|
54
|
-
self._model_name_or_path = state["model_name_or_path"]
|
|
55
|
-
self._device = state["device"]
|
|
56
|
-
self._trust_remote_code = state["trust_remote_code"]
|
|
57
|
-
self._normalize_embeddings = state["normalize_embeddings"]
|
|
58
|
-
self._query_prompt_name = state.get("query_prompt_name")
|
|
59
|
-
self._model = None
|
|
60
|
-
self._lock = threading.Lock()
|
|
61
|
-
|
|
62
|
-
def _get_model(self) -> SentenceTransformer:
|
|
63
|
-
"""Lazy-load the model with thread-safe double-checked locking."""
|
|
64
|
-
if self._model is None:
|
|
65
|
-
with self._lock:
|
|
66
|
-
if self._model is None:
|
|
67
|
-
from sentence_transformers import SentenceTransformer
|
|
68
|
-
|
|
69
|
-
self._model = SentenceTransformer(
|
|
70
|
-
self._model_name_or_path,
|
|
71
|
-
device=self._device,
|
|
72
|
-
trust_remote_code=self._trust_remote_code,
|
|
73
|
-
)
|
|
74
|
-
return self._model
|
|
75
|
-
|
|
76
|
-
@coco.fn.as_async(batching=True, runner=coco.GPU, memo=True, max_batch_size=_config.batch_size)
|
|
77
|
-
def embed(self, texts: list[str]) -> list[NDArray[np.float32]]:
|
|
78
|
-
"""Embed a batch of texts into float32 vectors."""
|
|
79
|
-
model = self._get_model()
|
|
80
|
-
embeddings: NDArray[np.float32] = model.encode(
|
|
81
|
-
texts,
|
|
82
|
-
convert_to_numpy=True,
|
|
83
|
-
normalize_embeddings=self._normalize_embeddings,
|
|
84
|
-
show_progress_bar=False,
|
|
85
|
-
) # type: ignore[assignment]
|
|
86
|
-
return list(embeddings)
|
|
87
|
-
|
|
88
|
-
@coco.fn.as_async(batching=True, runner=coco.GPU, memo=True, max_batch_size=_config.batch_size)
|
|
89
|
-
def embed_query(self, texts: list[str]) -> list[NDArray[np.float32]]:
|
|
90
|
-
"""Embed query texts, applying query_prompt_name if configured."""
|
|
91
|
-
model = self._get_model()
|
|
92
|
-
embeddings: NDArray[np.float32] = model.encode(
|
|
93
|
-
texts,
|
|
94
|
-
prompt_name=self._query_prompt_name,
|
|
95
|
-
convert_to_numpy=True,
|
|
96
|
-
normalize_embeddings=self._normalize_embeddings,
|
|
97
|
-
show_progress_bar=False,
|
|
98
|
-
) # type: ignore[assignment]
|
|
99
|
-
return list(embeddings)
|
|
100
|
-
|
|
101
|
-
@coco.fn.as_async(runner=coco.GPU, memo=True)
|
|
102
|
-
def __coco_vector_schema__(self) -> _schema.VectorSchema:
|
|
103
|
-
"""Return the vector schema (dimension + dtype) for this model."""
|
|
104
|
-
model = self._get_model()
|
|
105
|
-
dim = model.get_sentence_embedding_dimension()
|
|
106
|
-
if dim is None:
|
|
107
|
-
raise RuntimeError(
|
|
108
|
-
f"Embedding dimension is unknown for model {self._model_name_or_path}."
|
|
109
|
-
)
|
|
110
|
-
return _schema.VectorSchema(dtype=np.dtype(np.float32), size=dim)
|
|
111
|
-
|
|
112
|
-
def __coco_memo_key__(self) -> object:
|
|
113
|
-
return (
|
|
114
|
-
self._model_name_or_path,
|
|
115
|
-
self._device,
|
|
116
|
-
self._trust_remote_code,
|
|
117
|
-
self._normalize_embeddings,
|
|
118
|
-
self._query_prompt_name,
|
|
119
|
-
)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|