cocoindex-code 0.2.28__tar.gz → 0.2.29__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cocoindex_code-0.2.28 → cocoindex_code-0.2.29}/PKG-INFO +35 -3
- {cocoindex_code-0.2.28 → cocoindex_code-0.2.29}/README.md +34 -2
- {cocoindex_code-0.2.28 → cocoindex_code-0.2.29}/src/cocoindex_code/_version.py +2 -2
- {cocoindex_code-0.2.28 → cocoindex_code-0.2.29}/src/cocoindex_code/cli.py +15 -1
- {cocoindex_code-0.2.28 → cocoindex_code-0.2.29}/src/cocoindex_code/client.py +34 -0
- {cocoindex_code-0.2.28 → cocoindex_code-0.2.29}/src/cocoindex_code/daemon.py +96 -16
- cocoindex_code-0.2.29/src/cocoindex_code/embedder_defaults.py +152 -0
- cocoindex_code-0.2.29/src/cocoindex_code/embedder_params.py +95 -0
- {cocoindex_code-0.2.28 → cocoindex_code-0.2.29}/src/cocoindex_code/indexer.py +3 -1
- {cocoindex_code-0.2.28 → cocoindex_code-0.2.29}/src/cocoindex_code/litellm_embedder.py +2 -0
- {cocoindex_code-0.2.28 → cocoindex_code-0.2.29}/src/cocoindex_code/project.py +13 -1
- {cocoindex_code-0.2.28 → cocoindex_code-0.2.29}/src/cocoindex_code/protocol.py +3 -0
- {cocoindex_code-0.2.28 → cocoindex_code-0.2.29}/src/cocoindex_code/query.py +3 -2
- {cocoindex_code-0.2.28 → cocoindex_code-0.2.29}/src/cocoindex_code/settings.py +51 -3
- {cocoindex_code-0.2.28 → cocoindex_code-0.2.29}/src/cocoindex_code/shared.py +15 -24
- {cocoindex_code-0.2.28 → cocoindex_code-0.2.29}/.gitignore +0 -0
- {cocoindex_code-0.2.28 → cocoindex_code-0.2.29}/LICENSE +0 -0
- {cocoindex_code-0.2.28 → cocoindex_code-0.2.29}/pyproject.toml +0 -0
- {cocoindex_code-0.2.28 → cocoindex_code-0.2.29}/src/cocoindex_code/__init__.py +0 -0
- {cocoindex_code-0.2.28 → cocoindex_code-0.2.29}/src/cocoindex_code/__main__.py +0 -0
- {cocoindex_code-0.2.28 → cocoindex_code-0.2.29}/src/cocoindex_code/_daemon_paths.py +0 -0
- {cocoindex_code-0.2.28 → cocoindex_code-0.2.29}/src/cocoindex_code/chunking.py +0 -0
- {cocoindex_code-0.2.28 → cocoindex_code-0.2.29}/src/cocoindex_code/schema.py +0 -0
- {cocoindex_code-0.2.28 → cocoindex_code-0.2.29}/src/cocoindex_code/server.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: cocoindex-code
|
|
3
|
-
Version: 0.2.28
|
|
3
|
+
Version: 0.2.29
|
|
4
4
|
Summary: MCP server for indexing and querying codebases using CocoIndex
|
|
5
5
|
Project-URL: Homepage, https://github.com/cocoindex-io/cocoindex-code
|
|
6
6
|
Project-URL: Repository, https://github.com/cocoindex-io/cocoindex-code
|
|
@@ -96,7 +96,7 @@ pipx upgrade cocoindex-code # upgrade
|
|
|
96
96
|
|
|
97
97
|
Using [uv](https://docs.astral.sh/uv/getting-started/installation/):
|
|
98
98
|
```bash
|
|
99
|
-
uv tool install --upgrade 'cocoindex-code[full]'
|
|
99
|
+
uv tool install --upgrade 'cocoindex-code[full]'
|
|
100
100
|
```
|
|
101
101
|
|
|
102
102
|
Two install styles — they mirror the Docker image variants of the same names:
|
|
@@ -437,6 +437,14 @@ embedding:
|
|
|
437
437
|
device: mps # optional: cpu, cuda, mps (auto-detected if omitted)
|
|
438
438
|
min_interval_ms: 300 # optional: pace LiteLLM embedding requests to reduce 429s; defaults to 5 for LiteLLM
|
|
439
439
|
|
|
440
|
+
# Optional extra kwargs passed to the embedder, separately for indexing vs query.
|
|
441
|
+
# `ccc init` auto-populates these for known models (e.g. Cohere, Voyage, Nvidia NIM,
|
|
442
|
+
# nomic-ai code-retrieval models, Snowflake arctic-embed).
|
|
443
|
+
# indexing_params:
|
|
444
|
+
# input_type: search_document # litellm: input_type, dimensions
|
|
445
|
+
# query_params:
|
|
446
|
+
# input_type: search_query # sentence-transformers: prompt_name
|
|
447
|
+
|
|
440
448
|
envs: # extra environment variables for the daemon
|
|
441
449
|
OPENAI_API_KEY: your-key # only needed if not already in your shell environment
|
|
442
450
|
```
|
|
@@ -445,6 +453,30 @@ envs: # extra environment variabl
|
|
|
445
453
|
|
|
446
454
|
> **Custom location:** set `COCOINDEX_CODE_DIR` to place `global_settings.yml` somewhere other than `~/.cocoindex_code/` — useful if you want the file to live alongside your projects (e.g. on a synced folder).
|
|
447
455
|
|
|
456
|
+
#### `indexing_params` / `query_params`
|
|
457
|
+
|
|
458
|
+
Some embedding models expose different modes for documents vs queries (asymmetric retrieval). For example, Cohere's v3 models want `input_type: search_document` when embedding corpus content and `input_type: search_query` when embedding a user query; several SentenceTransformers models use `prompt_name: passage` / `prompt_name: query` for the same purpose. These knobs live under `indexing_params` and `query_params`:
|
|
459
|
+
|
|
460
|
+
```yaml
|
|
461
|
+
embedding:
|
|
462
|
+
provider: litellm
|
|
463
|
+
model: cohere/embed-english-v3.0
|
|
464
|
+
indexing_params:
|
|
465
|
+
input_type: search_document
|
|
466
|
+
query_params:
|
|
467
|
+
input_type: search_query
|
|
468
|
+
```
|
|
469
|
+
|
|
470
|
+
`ccc init` populates these automatically for models it recognizes — including all Cohere v3, Voyage, Nvidia NIM, Gemini embedding (`gemini/gemini-embedding-*`, `gemini/text-embedding-*`, `gemini/embedding-*` — LiteLLM auto-maps `input_type` to Gemini's `task_type`), `nomic-ai/CodeRankEmbed`, `nomic-ai/nomic-embed-code`, `nomic-ai/nomic-embed-text-v1`/`v1.5`, `mixedbread-ai/mxbai-embed-large-v1`, and the `Snowflake/snowflake-arctic-embed-*` family — and prints the chosen defaults. For other models, it leaves a commented-out template under `embedding:` so you can fill it in by hand.
|
|
471
|
+
|
|
472
|
+
OpenAI embeddings (`text-embedding-3-*`, `text-embedding-ada-002`) are intentionally not in the list: they're symmetric and have no equivalent knob.
|
|
473
|
+
|
|
474
|
+
**Accepted keys:** `prompt_name` (sentence-transformers), `input_type` and `dimensions` (litellm). Other keys are rejected at daemon startup with a clear error.
|
|
475
|
+
|
|
476
|
+
**Doctor checks both sides.** `ccc doctor` exercises the model once with `indexing_params` and once with `query_params`, reporting each as a separate `Model Check (indexing)` / `Model Check (query)` entry — so a misconfiguration on one side is diagnosable without hiding behind the other.
|
|
477
|
+
|
|
478
|
+
**Legacy-bridge warning:** if you're upgrading from an earlier version and your `global_settings.yml` uses `nomic-ai/CodeRankEmbed` or `nomic-ai/nomic-embed-code` without `indexing_params` / `query_params`, the daemon continues to apply the previous behavior (`prompt_name: query` at query time) and prints a one-time warning asking you to make the setting explicit. You can silence the warning by adding an empty block such as `query_params: {}`.
|
|
479
|
+
|
|
448
480
|
### Project Settings (`<project>/.cocoindex_code/settings.yml`)
|
|
449
481
|
|
|
450
482
|
Per-project. Controls which files to index.
|
|
@@ -727,7 +759,7 @@ pipx upgrade cocoindex-code # upgrade
|
|
|
727
759
|
|
|
728
760
|
Using uv (install or upgrade):
|
|
729
761
|
```bash
|
|
730
|
-
uv tool install --upgrade cocoindex-code
|
|
762
|
+
uv tool install --upgrade cocoindex-code
|
|
731
763
|
```
|
|
732
764
|
|
|
733
765
|
## Legacy: Environment Variables
|
|
@@ -52,7 +52,7 @@ pipx upgrade cocoindex-code # upgrade
|
|
|
52
52
|
|
|
53
53
|
Using [uv](https://docs.astral.sh/uv/getting-started/installation/):
|
|
54
54
|
```bash
|
|
55
|
-
uv tool install --upgrade 'cocoindex-code[full]'
|
|
55
|
+
uv tool install --upgrade 'cocoindex-code[full]'
|
|
56
56
|
```
|
|
57
57
|
|
|
58
58
|
Two install styles — they mirror the Docker image variants of the same names:
|
|
@@ -393,6 +393,14 @@ embedding:
|
|
|
393
393
|
device: mps # optional: cpu, cuda, mps (auto-detected if omitted)
|
|
394
394
|
min_interval_ms: 300 # optional: pace LiteLLM embedding requests to reduce 429s; defaults to 5 for LiteLLM
|
|
395
395
|
|
|
396
|
+
# Optional extra kwargs passed to the embedder, separately for indexing vs query.
|
|
397
|
+
# `ccc init` auto-populates these for known models (e.g. Cohere, Voyage, Nvidia NIM,
|
|
398
|
+
# nomic-ai code-retrieval models, Snowflake arctic-embed).
|
|
399
|
+
# indexing_params:
|
|
400
|
+
# input_type: search_document # litellm: input_type, dimensions
|
|
401
|
+
# query_params:
|
|
402
|
+
# input_type: search_query # sentence-transformers: prompt_name
|
|
403
|
+
|
|
396
404
|
envs: # extra environment variables for the daemon
|
|
397
405
|
OPENAI_API_KEY: your-key # only needed if not already in your shell environment
|
|
398
406
|
```
|
|
@@ -401,6 +409,30 @@ envs: # extra environment variabl
|
|
|
401
409
|
|
|
402
410
|
> **Custom location:** set `COCOINDEX_CODE_DIR` to place `global_settings.yml` somewhere other than `~/.cocoindex_code/` — useful if you want the file to live alongside your projects (e.g. on a synced folder).
|
|
403
411
|
|
|
412
|
+
#### `indexing_params` / `query_params`
|
|
413
|
+
|
|
414
|
+
Some embedding models expose different modes for documents vs queries (asymmetric retrieval). For example, Cohere's v3 models want `input_type: search_document` when embedding corpus content and `input_type: search_query` when embedding a user query; several SentenceTransformers models use `prompt_name: passage` / `prompt_name: query` for the same purpose. These knobs live under `indexing_params` and `query_params`:
|
|
415
|
+
|
|
416
|
+
```yaml
|
|
417
|
+
embedding:
|
|
418
|
+
provider: litellm
|
|
419
|
+
model: cohere/embed-english-v3.0
|
|
420
|
+
indexing_params:
|
|
421
|
+
input_type: search_document
|
|
422
|
+
query_params:
|
|
423
|
+
input_type: search_query
|
|
424
|
+
```
|
|
425
|
+
|
|
426
|
+
`ccc init` populates these automatically for models it recognizes — including all Cohere v3, Voyage, Nvidia NIM, Gemini embedding (`gemini/gemini-embedding-*`, `gemini/text-embedding-*`, `gemini/embedding-*` — LiteLLM auto-maps `input_type` to Gemini's `task_type`), `nomic-ai/CodeRankEmbed`, `nomic-ai/nomic-embed-code`, `nomic-ai/nomic-embed-text-v1`/`v1.5`, `mixedbread-ai/mxbai-embed-large-v1`, and the `Snowflake/snowflake-arctic-embed-*` family — and prints the chosen defaults. For other models, it leaves a commented-out template under `embedding:` so you can fill it in by hand.
|
|
427
|
+
|
|
428
|
+
OpenAI embeddings (`text-embedding-3-*`, `text-embedding-ada-002`) are intentionally not in the list: they're symmetric and have no equivalent knob.
|
|
429
|
+
|
|
430
|
+
**Accepted keys:** `prompt_name` (sentence-transformers), `input_type` and `dimensions` (litellm). Other keys are rejected at daemon startup with a clear error.
|
|
431
|
+
|
|
432
|
+
**Doctor checks both sides.** `ccc doctor` exercises the model once with `indexing_params` and once with `query_params`, reporting each as a separate `Model Check (indexing)` / `Model Check (query)` entry — so a misconfiguration on one side is diagnosable without hiding behind the other.
|
|
433
|
+
|
|
434
|
+
**Legacy-bridge warning:** if you're upgrading from an earlier version and your `global_settings.yml` uses `nomic-ai/CodeRankEmbed` or `nomic-ai/nomic-embed-code` without `indexing_params` / `query_params`, the daemon continues to apply the previous behavior (`prompt_name: query` at query time) and prints a one-time warning asking you to make the setting explicit. You can silence the warning by adding an empty block such as `query_params: {}`.
|
|
435
|
+
|
|
404
436
|
### Project Settings (`<project>/.cocoindex_code/settings.yml`)
|
|
405
437
|
|
|
406
438
|
Per-project. Controls which files to index.
|
|
@@ -683,7 +715,7 @@ pipx upgrade cocoindex-code # upgrade
|
|
|
683
715
|
|
|
684
716
|
Using uv (install or upgrade):
|
|
685
717
|
```bash
|
|
686
|
-
uv tool install --upgrade cocoindex-code
|
|
718
|
+
uv tool install --upgrade cocoindex-code
|
|
687
719
|
```
|
|
688
720
|
|
|
689
721
|
## Legacy: Environment Variables
|
|
@@ -18,7 +18,7 @@ version_tuple: tuple[int | str, ...]
|
|
|
18
18
|
commit_id: str | None
|
|
19
19
|
__commit_id__: str | None
|
|
20
20
|
|
|
21
|
-
__version__ = version = '0.2.28'
|
|
22
|
-
__version_tuple__ = version_tuple = (0, 2, 28)
|
|
21
|
+
__version__ = version = '0.2.29'
|
|
22
|
+
__version_tuple__ = version_tuple = (0, 2, 29)
|
|
23
23
|
|
|
24
24
|
__commit_id__ = commit_id = None
|
|
@@ -436,6 +436,7 @@ def _run_init_model_check(settings_path: Path) -> None:
|
|
|
436
436
|
|
|
437
437
|
def _setup_user_settings_interactive(litellm_model_flag: str | None) -> None:
|
|
438
438
|
"""Interactive global-settings setup — only runs when settings are missing."""
|
|
439
|
+
from .embedder_defaults import lookup_defaults
|
|
439
440
|
from .shared import is_sentence_transformers_installed
|
|
440
441
|
|
|
441
442
|
embedding = _resolve_embedding_choice(
|
|
@@ -444,10 +445,23 @@ def _setup_user_settings_interactive(litellm_model_flag: str | None) -> None:
|
|
|
444
445
|
tty=sys.stdin.isatty(),
|
|
445
446
|
)
|
|
446
447
|
|
|
447
|
-
|
|
448
|
+
# Apply curated defaults if the model is in our table.
|
|
449
|
+
indexing_defaults, query_defaults = lookup_defaults(embedding.provider, embedding.model)
|
|
450
|
+
defaults_applied = indexing_defaults is not None or query_defaults is not None
|
|
451
|
+
if defaults_applied:
|
|
452
|
+
embedding.indexing_params = indexing_defaults or {}
|
|
453
|
+
embedding.query_params = query_defaults or {}
|
|
454
|
+
|
|
455
|
+
path = save_initial_user_settings(embedding, defaults_applied=defaults_applied)
|
|
448
456
|
_typer.echo()
|
|
449
457
|
_typer.echo(f"Created user settings: {format_path_for_display(path)}")
|
|
450
458
|
|
|
459
|
+
if defaults_applied:
|
|
460
|
+
_typer.echo()
|
|
461
|
+
_typer.echo(f"Applied recommended defaults for {embedding.model}:")
|
|
462
|
+
_typer.echo(f" indexing_params: {embedding.indexing_params}")
|
|
463
|
+
_typer.echo(f" query_params: {embedding.query_params}")
|
|
464
|
+
|
|
451
465
|
_typer.echo()
|
|
452
466
|
_typer.echo(f"Testing embedding model: {embedding.provider} / {embedding.model}")
|
|
453
467
|
_run_init_model_check(path)
|
|
@@ -65,6 +65,38 @@ logger = logging.getLogger(__name__)
|
|
|
65
65
|
|
|
66
66
|
_daemon_ensured = False
|
|
67
67
|
|
|
68
|
+
# Tracks which daemon-side handshake warnings have already been surfaced to
|
|
69
|
+
# the user in this process. We print each distinct warning at most once per
|
|
70
|
+
# `ccc` invocation — see `_print_handshake_warnings`.
|
|
71
|
+
_surfaced_warnings: set[str] = set()
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def print_warning(message: str) -> None:
|
|
75
|
+
"""Render a user-facing warning to stderr with a uniform style.
|
|
76
|
+
|
|
77
|
+
Prefixes with ``Warning:`` and renders in yellow when stderr is a TTY;
|
|
78
|
+
falls through as plain text for pipes / files / CI logs. Intended as
|
|
79
|
+
the single entry point for warnings the user should notice — reuse it
|
|
80
|
+
for any new warning rather than inventing a local style.
|
|
81
|
+
"""
|
|
82
|
+
import click
|
|
83
|
+
|
|
84
|
+
click.secho(f"Warning: {message}", fg="yellow", err=True)
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def _print_handshake_warnings(resp: HandshakeResponse) -> None:
|
|
88
|
+
"""Print any new daemon-side warnings to stderr (once per process).
|
|
89
|
+
|
|
90
|
+
The daemon populates ``HandshakeResponse.warnings`` on every handshake;
|
|
91
|
+
the dedup set here ensures a warning is printed at most once within a
|
|
92
|
+
single CLI invocation even though several connections are opened.
|
|
93
|
+
"""
|
|
94
|
+
for w in resp.warnings:
|
|
95
|
+
if w in _surfaced_warnings:
|
|
96
|
+
continue
|
|
97
|
+
_surfaced_warnings.add(w)
|
|
98
|
+
print_warning(w)
|
|
99
|
+
|
|
68
100
|
|
|
69
101
|
def _is_daemon_supervised() -> bool:
|
|
70
102
|
"""True when an external supervisor (Docker entrypoint loop, systemd, …) owns
|
|
@@ -146,6 +178,7 @@ def _raw_connect_and_handshake() -> Connection:
|
|
|
146
178
|
if not resp.ok or _needs_restart(resp):
|
|
147
179
|
conn.close()
|
|
148
180
|
raise DaemonVersionError(resp)
|
|
181
|
+
_print_handshake_warnings(resp)
|
|
149
182
|
return conn
|
|
150
183
|
|
|
151
184
|
|
|
@@ -452,6 +485,7 @@ def stop_daemon() -> None:
|
|
|
452
485
|
"""
|
|
453
486
|
global _daemon_ensured # noqa: PLW0603
|
|
454
487
|
_daemon_ensured = False
|
|
488
|
+
_surfaced_warnings.clear()
|
|
455
489
|
pid_path = daemon_pid_path()
|
|
456
490
|
|
|
457
491
|
pid: int | None = None
|
|
@@ -24,6 +24,7 @@ from ._daemon_paths import (
|
|
|
24
24
|
)
|
|
25
25
|
from ._version import __version__
|
|
26
26
|
from .chunking import ChunkerFn as _ChunkerFn
|
|
27
|
+
from .embedder_params import resolve_embedder_params
|
|
27
28
|
from .project import Project
|
|
28
29
|
from .protocol import (
|
|
29
30
|
DaemonEnvRequest,
|
|
@@ -56,6 +57,7 @@ from .protocol import (
|
|
|
56
57
|
)
|
|
57
58
|
from .settings import (
|
|
58
59
|
ChunkerMapping,
|
|
60
|
+
UserSettings,
|
|
59
61
|
format_path_for_display,
|
|
60
62
|
get_host_path_mappings,
|
|
61
63
|
global_settings_mtime_us,
|
|
@@ -69,6 +71,27 @@ from .shared import Embedder, check_embedding, create_embedder
|
|
|
69
71
|
logger = logging.getLogger(__name__)
|
|
70
72
|
|
|
71
73
|
|
|
74
|
+
def _build_backward_compat_warning(
|
|
75
|
+
user_settings: UserSettings,
|
|
76
|
+
settings_path: Path,
|
|
77
|
+
) -> str:
|
|
78
|
+
"""Compose the one-time handshake warning for legacy-bridge models.
|
|
79
|
+
|
|
80
|
+
Fired when a user's settings omit ``indexing_params`` / ``query_params`` for
|
|
81
|
+
a model that was previously hardcoded to use ``prompt_name="query"`` for
|
|
82
|
+
queries. See embedder_defaults.LEGACY_QUERY_PROMPT_MODELS.
|
|
83
|
+
"""
|
|
84
|
+
return (
|
|
85
|
+
f"Your embedding model ({user_settings.embedding.model}) was previously "
|
|
86
|
+
f'hardcoded to use prompt_name="query" for queries. Add the following to '
|
|
87
|
+
f"{settings_path} to keep this behavior and silence this warning:\n"
|
|
88
|
+
f"\n"
|
|
89
|
+
f" embedding:\n"
|
|
90
|
+
f" query_params:\n"
|
|
91
|
+
f" prompt_name: query\n"
|
|
92
|
+
)
|
|
93
|
+
|
|
94
|
+
|
|
72
95
|
def _resolve_chunker_registry(mappings: list[ChunkerMapping]) -> dict[str, _ChunkerFn]:
|
|
73
96
|
"""Resolve ``ChunkerMapping`` settings entries to a ``{suffix: fn}`` dict.
|
|
74
97
|
|
|
@@ -105,10 +128,19 @@ class ProjectRegistry:
|
|
|
105
128
|
|
|
106
129
|
_projects: dict[str, Project]
|
|
107
130
|
_embedder: Embedder | None
|
|
108
|
-
|
|
109
|
-
|
|
131
|
+
indexing_params: dict[str, Any]
|
|
132
|
+
query_params: dict[str, Any]
|
|
133
|
+
|
|
134
|
+
def __init__(
|
|
135
|
+
self,
|
|
136
|
+
embedder: Embedder | None,
|
|
137
|
+
indexing_params: dict[str, Any] | None = None,
|
|
138
|
+
query_params: dict[str, Any] | None = None,
|
|
139
|
+
) -> None:
|
|
110
140
|
self._projects = {}
|
|
111
141
|
self._embedder = embedder
|
|
142
|
+
self.indexing_params = dict(indexing_params) if indexing_params else {}
|
|
143
|
+
self.query_params = dict(query_params) if query_params else {}
|
|
112
144
|
|
|
113
145
|
async def get_project(self, project_root: str) -> Project:
|
|
114
146
|
"""Get or create a Project for the given root. Lazy initialization."""
|
|
@@ -120,7 +152,13 @@ class ProjectRegistry:
|
|
|
120
152
|
root = Path(project_root)
|
|
121
153
|
project_settings = load_project_settings(root)
|
|
122
154
|
chunker_registry = _resolve_chunker_registry(project_settings.chunkers)
|
|
123
|
-
project = await Project.create(
|
|
155
|
+
project = await Project.create(
|
|
156
|
+
root,
|
|
157
|
+
self._embedder,
|
|
158
|
+
indexing_params=self.indexing_params,
|
|
159
|
+
query_params=self.query_params,
|
|
160
|
+
chunker_registry=chunker_registry,
|
|
161
|
+
)
|
|
124
162
|
self._projects[project_root] = project
|
|
125
163
|
return self._projects[project_root]
|
|
126
164
|
|
|
@@ -168,6 +206,7 @@ async def handle_connection(
|
|
|
168
206
|
on_shutdown: Callable[[], None],
|
|
169
207
|
settings_mtime_us: int | None,
|
|
170
208
|
settings_env_names: list[str],
|
|
209
|
+
handshake_warnings: list[str],
|
|
171
210
|
) -> None:
|
|
172
211
|
"""Handle a single client connection (per-request model).
|
|
173
212
|
|
|
@@ -193,6 +232,7 @@ async def handle_connection(
|
|
|
193
232
|
ok=ok,
|
|
194
233
|
daemon_version=__version__,
|
|
195
234
|
global_settings_mtime_us=settings_mtime_us,
|
|
235
|
+
warnings=list(handshake_warnings),
|
|
196
236
|
)
|
|
197
237
|
)
|
|
198
238
|
)
|
|
@@ -260,8 +300,19 @@ async def _handle_doctor(
|
|
|
260
300
|
appear before project settings in the output.
|
|
261
301
|
"""
|
|
262
302
|
if req.project_root is None:
|
|
263
|
-
# Global-scope checks
|
|
264
|
-
|
|
303
|
+
# Global-scope checks — two separate embed calls because indexing and
|
|
304
|
+
# query may pass different kwargs (asymmetric embedding models), and
|
|
305
|
+
# either side can fail independently (e.g. a malformed input_type).
|
|
306
|
+
yield DoctorResponse(
|
|
307
|
+
result=await _check_model(
|
|
308
|
+
registry._embedder, label="indexing", params=registry.indexing_params
|
|
309
|
+
)
|
|
310
|
+
)
|
|
311
|
+
yield DoctorResponse(
|
|
312
|
+
result=await _check_model(
|
|
313
|
+
registry._embedder, label="query", params=registry.query_params
|
|
314
|
+
)
|
|
315
|
+
)
|
|
265
316
|
else:
|
|
266
317
|
# Project-scope checks
|
|
267
318
|
yield DoctorResponse(result=await _check_file_walk(req.project_root))
|
|
@@ -274,31 +325,39 @@ async def _handle_doctor(
|
|
|
274
325
|
)
|
|
275
326
|
|
|
276
327
|
|
|
277
|
-
async def _check_model(
|
|
278
|
-
|
|
328
|
+
async def _check_model(
|
|
329
|
+
embedder: Embedder | None,
|
|
330
|
+
label: str,
|
|
331
|
+
params: dict[str, Any],
|
|
332
|
+
) -> DoctorCheckResult:
|
|
333
|
+
"""Test the embedding model by embedding a short string using *params*.
|
|
279
334
|
|
|
280
|
-
|
|
281
|
-
|
|
335
|
+
*label* appears in the check's name (e.g. ``"indexing"`` / ``"query"``) so
|
|
336
|
+
users see which side of the config the result corresponds to. Returns a
|
|
337
|
+
failed result when the embedder is ``None`` (daemon running in no-settings
|
|
338
|
+
mode).
|
|
282
339
|
"""
|
|
340
|
+
name = f"Model Check ({label})"
|
|
283
341
|
if embedder is None:
|
|
284
342
|
return DoctorCheckResult(
|
|
285
|
-
name=
|
|
343
|
+
name=name,
|
|
286
344
|
ok=False,
|
|
287
345
|
details=[],
|
|
288
346
|
errors=["Daemon has no global settings loaded. Run `ccc init` to set up."],
|
|
289
347
|
)
|
|
290
|
-
result = await check_embedding(embedder)
|
|
348
|
+
result = await check_embedding(embedder, params)
|
|
349
|
+
params_detail = f"params: {params}" if params else "params: {} (no extra kwargs)"
|
|
291
350
|
if result.error is None:
|
|
292
351
|
return DoctorCheckResult(
|
|
293
|
-
name=
|
|
352
|
+
name=name,
|
|
294
353
|
ok=True,
|
|
295
|
-
details=[f"Embedding dimension: {result.dim}"],
|
|
354
|
+
details=[params_detail, f"Embedding dimension: {result.dim}"],
|
|
296
355
|
errors=[],
|
|
297
356
|
)
|
|
298
357
|
return DoctorCheckResult(
|
|
299
|
-
name=
|
|
358
|
+
name=name,
|
|
300
359
|
ok=False,
|
|
301
|
-
details=[],
|
|
360
|
+
details=[params_detail],
|
|
302
361
|
errors=[result.error],
|
|
303
362
|
)
|
|
304
363
|
|
|
@@ -506,11 +565,27 @@ def run_daemon() -> None:
|
|
|
506
565
|
# provider/model picker in `ccc init`.
|
|
507
566
|
settings_mtime_us = global_settings_mtime_us() # None when file is missing
|
|
508
567
|
embedder: Embedder | None
|
|
568
|
+
indexing_params: dict[str, Any] = {}
|
|
569
|
+
query_params: dict[str, Any] = {}
|
|
570
|
+
handshake_warnings: list[str] = []
|
|
509
571
|
if user_settings_path().is_file():
|
|
510
572
|
user_settings = load_user_settings()
|
|
511
573
|
settings_env_keys = list(user_settings.envs.keys())
|
|
512
574
|
for key, value in user_settings.envs.items():
|
|
513
575
|
os.environ[key] = value
|
|
576
|
+
# Resolve params BEFORE constructing the embedder so invalid configs
|
|
577
|
+
# fail fast without paying the model-load cost.
|
|
578
|
+
try:
|
|
579
|
+
embedder_params = resolve_embedder_params(user_settings.embedding)
|
|
580
|
+
except ValueError:
|
|
581
|
+
logger.exception("Invalid embedder params in global_settings.yml")
|
|
582
|
+
sys.exit(1)
|
|
583
|
+
indexing_params = embedder_params.indexing
|
|
584
|
+
query_params = embedder_params.query
|
|
585
|
+
if embedder_params.used_backward_compat:
|
|
586
|
+
handshake_warnings.append(
|
|
587
|
+
_build_backward_compat_warning(user_settings, user_settings_path())
|
|
588
|
+
)
|
|
514
589
|
embedder = create_embedder(user_settings.embedding)
|
|
515
590
|
else:
|
|
516
591
|
settings_env_keys = []
|
|
@@ -532,7 +607,11 @@ def run_daemon() -> None:
|
|
|
532
607
|
logger.info("Daemon starting (PID %d, version %s)", os.getpid(), __version__)
|
|
533
608
|
|
|
534
609
|
start_time = time.monotonic()
|
|
535
|
-
registry = ProjectRegistry(embedder)
|
|
610
|
+
registry = ProjectRegistry(
|
|
611
|
+
embedder,
|
|
612
|
+
indexing_params=indexing_params,
|
|
613
|
+
query_params=query_params,
|
|
614
|
+
)
|
|
536
615
|
|
|
537
616
|
sock_path = daemon_socket_path()
|
|
538
617
|
if sys.platform != "win32":
|
|
@@ -560,6 +639,7 @@ def run_daemon() -> None:
|
|
|
560
639
|
_request_shutdown,
|
|
561
640
|
settings_mtime_us,
|
|
562
641
|
settings_env_keys,
|
|
642
|
+
handshake_warnings,
|
|
563
643
|
)
|
|
564
644
|
)
|
|
565
645
|
tasks.add(task)
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
"""Curated default embedder params for known models.
|
|
2
|
+
|
|
3
|
+
Consulted only by ``ccc init`` — the table is NOT read at daemon runtime.
|
|
4
|
+
The runtime path reads the user's YAML verbatim; the legacy-bridge in
|
|
5
|
+
``embedder_params.resolve_embedder_params`` is the only runtime-level fallback
|
|
6
|
+
and is scoped to :data:`LEGACY_QUERY_PROMPT_MODELS`.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import re
|
|
12
|
+
from typing import Any, NamedTuple
|
|
13
|
+
|
|
14
|
+
__all__ = [
|
|
15
|
+
"DefaultParamsEntry",
|
|
16
|
+
"LEGACY_QUERY_PROMPT_MODELS",
|
|
17
|
+
"lookup_defaults",
|
|
18
|
+
]
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class DefaultParamsEntry(NamedTuple):
|
|
22
|
+
provider: str # "sentence-transformers" | "litellm"
|
|
23
|
+
model: str | re.Pattern[str] # str = exact match; Pattern = regex match
|
|
24
|
+
indexing_params: dict[str, Any] # may be empty
|
|
25
|
+
query_params: dict[str, Any] # may be empty
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
# Models previously hardcoded in shared.py:_QUERY_PROMPT_MODELS. Retained as
|
|
29
|
+
# a frozenset so the legacy-bridge in ``embedder_params`` can recognize
|
|
30
|
+
# pre-existing configs that predate this feature.
|
|
31
|
+
LEGACY_QUERY_PROMPT_MODELS: frozenset[str] = frozenset(
|
|
32
|
+
{"nomic-ai/nomic-embed-code", "nomic-ai/CodeRankEmbed"}
|
|
33
|
+
)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
_DEFAULT_PARAMS: list[DefaultParamsEntry] = [
|
|
37
|
+
# --- sentence-transformers ---
|
|
38
|
+
DefaultParamsEntry(
|
|
39
|
+
"sentence-transformers",
|
|
40
|
+
"nomic-ai/CodeRankEmbed",
|
|
41
|
+
{},
|
|
42
|
+
{"prompt_name": "query"},
|
|
43
|
+
),
|
|
44
|
+
DefaultParamsEntry(
|
|
45
|
+
"sentence-transformers",
|
|
46
|
+
"nomic-ai/nomic-embed-code",
|
|
47
|
+
{},
|
|
48
|
+
{"prompt_name": "query"},
|
|
49
|
+
),
|
|
50
|
+
DefaultParamsEntry(
|
|
51
|
+
"sentence-transformers",
|
|
52
|
+
"nomic-ai/nomic-embed-text-v1",
|
|
53
|
+
{"prompt_name": "passage"},
|
|
54
|
+
{"prompt_name": "query"},
|
|
55
|
+
),
|
|
56
|
+
DefaultParamsEntry(
|
|
57
|
+
"sentence-transformers",
|
|
58
|
+
"nomic-ai/nomic-embed-text-v1.5",
|
|
59
|
+
{"prompt_name": "passage"},
|
|
60
|
+
{"prompt_name": "query"},
|
|
61
|
+
),
|
|
62
|
+
DefaultParamsEntry(
|
|
63
|
+
"sentence-transformers",
|
|
64
|
+
"mixedbread-ai/mxbai-embed-large-v1",
|
|
65
|
+
{},
|
|
66
|
+
{"prompt_name": "query"},
|
|
67
|
+
),
|
|
68
|
+
DefaultParamsEntry(
|
|
69
|
+
"sentence-transformers",
|
|
70
|
+
re.compile(r"Snowflake/snowflake-arctic-embed-.+"),
|
|
71
|
+
{},
|
|
72
|
+
{"prompt_name": "query"},
|
|
73
|
+
),
|
|
74
|
+
# --- litellm ---
|
|
75
|
+
DefaultParamsEntry(
|
|
76
|
+
"litellm",
|
|
77
|
+
re.compile(r"cohere/embed-(english|multilingual)(-light)?-v3\.0"),
|
|
78
|
+
{"input_type": "search_document"},
|
|
79
|
+
{"input_type": "search_query"},
|
|
80
|
+
),
|
|
81
|
+
DefaultParamsEntry(
|
|
82
|
+
"litellm",
|
|
83
|
+
re.compile(r"voyage/.+"),
|
|
84
|
+
{"input_type": "document"},
|
|
85
|
+
{"input_type": "query"},
|
|
86
|
+
),
|
|
87
|
+
DefaultParamsEntry(
|
|
88
|
+
"litellm",
|
|
89
|
+
re.compile(r"nvidia_nim/nvidia/.+"),
|
|
90
|
+
{"input_type": "passage"},
|
|
91
|
+
{"input_type": "query"},
|
|
92
|
+
),
|
|
93
|
+
# Gemini embedding models: LiteLLM's Gemini transformation auto-maps
|
|
94
|
+
# `input_type` → `task_type` (RETRIEVAL_DOCUMENT / RETRIEVAL_QUERY work
|
|
95
|
+
# across all Gemini embedding generations).
|
|
96
|
+
DefaultParamsEntry(
|
|
97
|
+
"litellm",
|
|
98
|
+
re.compile(r"gemini/(gemini-embedding|text-embedding|embedding)[-\w.]*"),
|
|
99
|
+
{"input_type": "RETRIEVAL_DOCUMENT"},
|
|
100
|
+
{"input_type": "RETRIEVAL_QUERY"},
|
|
101
|
+
),
|
|
102
|
+
]
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def lookup_defaults(
|
|
106
|
+
provider: str, model: str
|
|
107
|
+
) -> tuple[dict[str, Any] | None, dict[str, Any] | None]:
|
|
108
|
+
"""Look up recommended (indexing_params, query_params) for *model*.
|
|
109
|
+
|
|
110
|
+
Walks :data:`_DEFAULT_PARAMS` in order; an exact-name entry matches iff
|
|
111
|
+
``entry.model == model``; a compiled-regex entry matches via
|
|
112
|
+
``entry.model.fullmatch(model)``. First match wins. Returns the pair of
|
|
113
|
+
dicts (each possibly empty) or ``(None, None)`` when no entry matches.
|
|
114
|
+
"""
|
|
115
|
+
for entry in _DEFAULT_PARAMS:
|
|
116
|
+
if entry.provider != provider:
|
|
117
|
+
continue
|
|
118
|
+
if isinstance(entry.model, str):
|
|
119
|
+
matched = entry.model == model
|
|
120
|
+
else:
|
|
121
|
+
matched = entry.model.fullmatch(model) is not None
|
|
122
|
+
if matched:
|
|
123
|
+
return dict(entry.indexing_params), dict(entry.query_params)
|
|
124
|
+
return None, None
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def _assert_legacy_bridge_invariant() -> None:
|
|
128
|
+
"""Each legacy model must have an exact sentence-transformers entry with
|
|
129
|
+
``query_params == {"prompt_name": "query"}``. Guarantees users who run
|
|
130
|
+
``ccc init`` against a legacy model get the same effective behavior the
|
|
131
|
+
runtime legacy-bridge produces.
|
|
132
|
+
"""
|
|
133
|
+
for legacy in LEGACY_QUERY_PROMPT_MODELS:
|
|
134
|
+
found = False
|
|
135
|
+
for entry in _DEFAULT_PARAMS:
|
|
136
|
+
if (
|
|
137
|
+
entry.provider == "sentence-transformers"
|
|
138
|
+
and isinstance(entry.model, str)
|
|
139
|
+
and entry.model == legacy
|
|
140
|
+
and entry.query_params == {"prompt_name": "query"}
|
|
141
|
+
):
|
|
142
|
+
found = True
|
|
143
|
+
break
|
|
144
|
+
if not found:
|
|
145
|
+
raise AssertionError(
|
|
146
|
+
f"Legacy model {legacy!r} has no matching sentence-transformers "
|
|
147
|
+
f"exact-name entry in _DEFAULT_PARAMS with "
|
|
148
|
+
f"query_params={{'prompt_name': 'query'}}"
|
|
149
|
+
)
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
_assert_legacy_bridge_invariant()
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
"""Validation and resolution of embedder ``indexing_params`` / ``query_params``.
|
|
2
|
+
|
|
3
|
+
Runtime entry point is :func:`resolve_embedder_params`. The curated defaults
|
|
4
|
+
table lives in :mod:`embedder_defaults` and is used only by ``ccc init`` —
|
|
5
|
+
this module does not consult it.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from typing import Any, NamedTuple
|
|
11
|
+
|
|
12
|
+
from .embedder_defaults import LEGACY_QUERY_PROMPT_MODELS
|
|
13
|
+
from .settings import EmbeddingSettings
|
|
14
|
+
|
|
15
|
+
__all__ = [
|
|
16
|
+
"EmbedderParams",
|
|
17
|
+
"accepted_kwargs_for",
|
|
18
|
+
"resolve_embedder_params",
|
|
19
|
+
"validate_params",
|
|
20
|
+
]
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
# Allow-list of ``embed()`` kwarg names, keyed by provider. Kept deliberately
# small: only knobs users have a reason to tune are exposed.
#
# Two keys are left out on purpose:
#   * ``normalize_embeddings`` (sentence-transformers) — other code assumes
#     unit vectors (query._l2_to_score).
#   * ``encoding_format`` (litellm) — other code assumes float payloads
#     (litellm_embedder hardcodes encoding_format="float").
_ACCEPTED_KWARGS: dict[str, frozenset[str]] = {
    "sentence-transformers": frozenset(("prompt_name",)),
    "litellm": frozenset(("input_type", "dimensions")),
}
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def accepted_kwargs_for(provider: str) -> frozenset[str]:
    """Look up the kwarg names that *provider* is allowed to receive.

    Args:
        provider: Provider name used as the key into ``_ACCEPTED_KWARGS``.

    Returns:
        The frozen set of accepted kwarg names registered for *provider*.

    Raises:
        ValueError: if *provider* has no entry in ``_ACCEPTED_KWARGS``; the
            underlying ``KeyError`` is attached as the cause.
    """
    try:
        allowed = _ACCEPTED_KWARGS[provider]
    except KeyError as missing:
        raise ValueError(f"Unknown provider: {provider!r}") from missing
    return allowed
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def validate_params(
    provider: str,
    indexing_params: dict[str, Any] | None,
    query_params: dict[str, Any] | None,
) -> None:
    """Reject param dicts that carry keys *provider* does not accept.

    ``indexing_params`` is checked before ``query_params``; a side that is
    ``None`` or empty is skipped.

    Args:
        provider: Provider name; resolved through ``accepted_kwargs_for``.
        indexing_params: Kwargs intended for the indexing side, or ``None``.
        query_params: Kwargs intended for the query side, or ``None``.

    Raises:
        ValueError: if *provider* is unknown, or if either dict contains a key
            outside the provider's accepted set. The message names the side,
            the offending keys, and the accepted keys.
    """
    allowed = accepted_kwargs_for(provider)
    sides = {"indexing_params": indexing_params, "query_params": query_params}
    for label, candidate in sides.items():
        if not candidate:
            continue
        rejected = sorted(set(candidate).difference(allowed))
        if not rejected:
            continue
        raise ValueError(
            f"{label}: unknown key(s) {rejected!r} for provider {provider!r}. "
            f"Accepted keys: {sorted(allowed)!r}."
        )
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
class EmbedderParams(NamedTuple):
    """Resolved kwargs to spread into ``embedder.embed()`` calls at runtime.

    Fields, in order:
        indexing: kwargs for the indexing side; never ``None``, may be empty.
        query: kwargs for the query side; never ``None``, may be empty.
        used_backward_compat: ``True`` iff the legacy bridge fired.
    """

    indexing: dict[str, Any]
    query: dict[str, Any]
    used_backward_compat: bool
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def resolve_embedder_params(settings: EmbeddingSettings) -> EmbedderParams:
    """Resolve the effective embedder params from user settings.

    The user's ``indexing_params`` / ``query_params`` are copied verbatim
    (``None`` or empty becomes ``{}``), with one backward-compatibility
    exception: if neither key is set, the provider is
    ``"sentence-transformers"``, and the model is one of
    ``LEGACY_QUERY_PROMPT_MODELS``, then ``query`` is filled in with
    ``{"prompt_name": "query"}`` and ``used_backward_compat`` is raised so the
    daemon can emit a handshake warning.

    The legacy lookup ignores a leading ``"sbert/"`` on the model name. The
    previous hardcoded path applied the query prompt after that prefix had
    been stripped, so prefixed spellings of a legacy model must keep getting
    the prompt too.

    Args:
        settings: Embedding settings as loaded from the user's settings file.

    Returns:
        An ``EmbedderParams`` whose dicts are fresh copies, never ``None``.

    Raises:
        ValueError: propagated from ``validate_params`` when the provider is
            unknown or a params dict holds a key the provider does not accept.
    """
    indexing: dict[str, Any] = dict(settings.indexing_params) if settings.indexing_params else {}
    query: dict[str, Any] = dict(settings.query_params) if settings.query_params else {}
    used_backward_compat = False

    # ``None`` means "key absent"; an explicit ``{}`` opts out of the bridge.
    nothing_configured = settings.indexing_params is None and settings.query_params is None
    if nothing_configured and settings.provider == "sentence-transformers":
        # Mirrors shared.SBERT_PREFIX; kept local so this module does not
        # need to import from ``shared``.
        sbert_prefix = "sbert/"
        legacy_name = settings.model
        if isinstance(legacy_name, str) and legacy_name.startswith(sbert_prefix):
            legacy_name = legacy_name[len(sbert_prefix) :]
        if legacy_name in LEGACY_QUERY_PROMPT_MODELS:
            query = {"prompt_name": "query"}
            used_backward_compat = True

    validate_params(settings.provider, indexing, query)
    return EmbedderParams(indexing=indexing, query=query, used_backward_compat=used_backward_compat)
|
|
@@ -19,6 +19,7 @@ from .settings import load_gitignore_spec, load_project_settings
|
|
|
19
19
|
from .shared import (
|
|
20
20
|
CODEBASE_DIR,
|
|
21
21
|
EMBEDDER,
|
|
22
|
+
INDEXING_EMBED_PARAMS,
|
|
22
23
|
SQLITE_DB,
|
|
23
24
|
CodeChunk,
|
|
24
25
|
)
|
|
@@ -140,6 +141,7 @@ async def process_file(
|
|
|
140
141
|
) -> None:
|
|
141
142
|
"""Process a single file: chunk, embed, and store."""
|
|
142
143
|
embedder = coco.use_context(EMBEDDER)
|
|
144
|
+
indexing_params = coco.use_context(INDEXING_EMBED_PARAMS)
|
|
143
145
|
|
|
144
146
|
try:
|
|
145
147
|
content = await file.read_text()
|
|
@@ -185,7 +187,7 @@ async def process_file(
|
|
|
185
187
|
content=chunk.text,
|
|
186
188
|
start_line=chunk.start.line,
|
|
187
189
|
end_line=chunk.end.line,
|
|
188
|
-
embedding=await embedder.embed(chunk.text),
|
|
190
|
+
embedding=await embedder.embed(chunk.text, **indexing_params),
|
|
189
191
|
)
|
|
190
192
|
)
|
|
191
193
|
|
|
@@ -13,6 +13,8 @@ import numpy as np
|
|
|
13
13
|
from cocoindex.ops.litellm import LiteLLMEmbedder, litellm
|
|
14
14
|
from numpy.typing import NDArray
|
|
15
15
|
|
|
16
|
+
litellm.drop_params = True
|
|
17
|
+
|
|
16
18
|
logger = logging.getLogger(__name__)
|
|
17
19
|
|
|
18
20
|
_RATE_LIMIT_DELAY_RE = re.compile(r"Please try again in ([0-9.]+)(ms|s)", re.IGNORECASE)
|
|
@@ -6,6 +6,7 @@ import asyncio
|
|
|
6
6
|
import sqlite3
|
|
7
7
|
from collections.abc import AsyncIterator, Callable
|
|
8
8
|
from pathlib import Path
|
|
9
|
+
from typing import Any
|
|
9
10
|
|
|
10
11
|
import cocoindex as coco
|
|
11
12
|
from cocoindex.connectors import sqlite as coco_sqlite
|
|
@@ -34,6 +35,8 @@ from .settings import (
|
|
|
34
35
|
from .shared import (
|
|
35
36
|
CODEBASE_DIR,
|
|
36
37
|
EMBEDDER,
|
|
38
|
+
INDEXING_EMBED_PARAMS,
|
|
39
|
+
QUERY_EMBED_PARAMS,
|
|
37
40
|
SQLITE_DB,
|
|
38
41
|
Embedder,
|
|
39
42
|
)
|
|
@@ -257,9 +260,11 @@ class Project:
|
|
|
257
260
|
async def create(
|
|
258
261
|
project_root: Path,
|
|
259
262
|
embedder: Embedder,
|
|
263
|
+
indexing_params: dict[str, Any],
|
|
264
|
+
query_params: dict[str, Any],
|
|
260
265
|
chunker_registry: dict[str, ChunkerFn] | None = None,
|
|
261
266
|
) -> Project:
|
|
262
|
-
"""Create a project with explicit embedder.
|
|
267
|
+
"""Create a project with explicit embedder and per-call params.
|
|
263
268
|
|
|
264
269
|
Project-level settings and .gitignore are NOT cached here — the
|
|
265
270
|
indexer loads them fresh from disk on every run so that user edits
|
|
@@ -268,6 +273,11 @@ class Project:
|
|
|
268
273
|
Args:
|
|
269
274
|
project_root: Root directory of the codebase to index.
|
|
270
275
|
embedder: Embedding model instance.
|
|
276
|
+
indexing_params: Extra kwargs spread into ``embedder.embed()`` during
|
|
277
|
+
indexing (e.g. ``{"prompt_name": "passage"}``). Pass ``{}`` for
|
|
278
|
+
no extras.
|
|
279
|
+
query_params: Extra kwargs spread into ``embedder.embed()`` for the
|
|
280
|
+
query side.
|
|
271
281
|
chunker_registry: Optional mapping of file suffix (e.g. ``".toml"``)
|
|
272
282
|
to a ``ChunkerFn``. When a suffix matches, the registered
|
|
273
283
|
chunker is called instead of the built-in splitter.
|
|
@@ -287,6 +297,8 @@ class Project:
|
|
|
287
297
|
context.provide(CODEBASE_DIR, project_root)
|
|
288
298
|
context.provide(SQLITE_DB, coco_sqlite.connect(str(target_sqlite_db), load_vec=True))
|
|
289
299
|
context.provide(EMBEDDER, embedder)
|
|
300
|
+
context.provide(INDEXING_EMBED_PARAMS, dict(indexing_params))
|
|
301
|
+
context.provide(QUERY_EMBED_PARAMS, dict(query_params))
|
|
290
302
|
context.provide(CHUNKER_REGISTRY, dict(chunker_registry) if chunker_registry else {})
|
|
291
303
|
|
|
292
304
|
env = coco.Environment(settings, context_provider=context)
|
|
@@ -71,6 +71,9 @@ class HandshakeResponse(_msgspec.Struct, tag="handshake"):
|
|
|
71
71
|
ok: bool
|
|
72
72
|
daemon_version: str
|
|
73
73
|
global_settings_mtime_us: int | None = None
|
|
74
|
+
# Non-fatal daemon-side warnings surfaced to the client on every handshake.
|
|
75
|
+
# The client dedupes and prints them to stderr (see client._print_handshake_warnings).
|
|
76
|
+
warnings: list[str] = []
|
|
74
77
|
|
|
75
78
|
|
|
76
79
|
class IndexResponse(_msgspec.Struct, tag="index"):
|
|
@@ -8,7 +8,7 @@ from pathlib import Path
|
|
|
8
8
|
from typing import Any
|
|
9
9
|
|
|
10
10
|
from .schema import QueryResult
|
|
11
|
-
from .shared import EMBEDDER,
|
|
11
|
+
from .shared import EMBEDDER, QUERY_EMBED_PARAMS, SQLITE_DB
|
|
12
12
|
|
|
13
13
|
|
|
14
14
|
def _l2_to_score(distance: float) -> float:
|
|
@@ -106,9 +106,10 @@ async def query_codebase(
|
|
|
106
106
|
|
|
107
107
|
db = env.get_context(SQLITE_DB)
|
|
108
108
|
embedder = env.get_context(EMBEDDER)
|
|
109
|
+
query_params = env.get_context(QUERY_EMBED_PARAMS)
|
|
109
110
|
|
|
110
111
|
# Generate query embedding.
|
|
111
|
-
query_embedding = await embedder.embed(query,
|
|
112
|
+
query_embedding = await embedder.embed(query, **query_params)
|
|
112
113
|
|
|
113
114
|
embedding_bytes = query_embedding.astype("float32").tobytes()
|
|
114
115
|
|
|
@@ -93,6 +93,11 @@ class EmbeddingSettings:
|
|
|
93
93
|
provider: str = "litellm"
|
|
94
94
|
device: str | None = None
|
|
95
95
|
min_interval_ms: int | None = None
|
|
96
|
+
# Extra kwargs spread into ``embedder.embed()`` during indexing/query.
|
|
97
|
+
# ``None`` means the user did not set the key; ``{}`` is an explicit empty
|
|
98
|
+
# dict (used to opt out of the legacy-bridge warning).
|
|
99
|
+
indexing_params: dict[str, Any] | None = None
|
|
100
|
+
query_params: dict[str, Any] | None = None
|
|
96
101
|
|
|
97
102
|
|
|
98
103
|
@dataclass
|
|
@@ -410,6 +415,10 @@ def _embedding_settings_to_dict(embedding: EmbeddingSettings) -> dict[str, Any]:
|
|
|
410
415
|
d["device"] = embedding.device
|
|
411
416
|
if embedding.min_interval_ms is not None:
|
|
412
417
|
d["min_interval_ms"] = embedding.min_interval_ms
|
|
418
|
+
if embedding.indexing_params is not None:
|
|
419
|
+
d["indexing_params"] = dict(embedding.indexing_params)
|
|
420
|
+
if embedding.query_params is not None:
|
|
421
|
+
d["query_params"] = dict(embedding.query_params)
|
|
413
422
|
return d
|
|
414
423
|
|
|
415
424
|
|
|
@@ -432,6 +441,13 @@ def _user_settings_from_dict(d: dict[str, Any]) -> UserSettings:
|
|
|
432
441
|
emb_kwargs["device"] = emb_dict["device"]
|
|
433
442
|
if "min_interval_ms" in emb_dict:
|
|
434
443
|
emb_kwargs["min_interval_ms"] = emb_dict["min_interval_ms"]
|
|
444
|
+
# indexing_params / query_params: missing → None (dataclass default);
|
|
445
|
+
# present-but-null → {} (treat the same as an empty dict, since both mean
|
|
446
|
+
# "user acknowledged the key and wants no extra kwargs").
|
|
447
|
+
if "indexing_params" in emb_dict:
|
|
448
|
+
emb_kwargs["indexing_params"] = dict(emb_dict["indexing_params"] or {})
|
|
449
|
+
if "query_params" in emb_dict:
|
|
450
|
+
emb_kwargs["query_params"] = dict(emb_dict["query_params"] or {})
|
|
435
451
|
embedding = EmbeddingSettings(**emb_kwargs)
|
|
436
452
|
envs = d.get("envs", {})
|
|
437
453
|
return UserSettings(embedding=embedding, envs=envs)
|
|
@@ -514,21 +530,53 @@ _INITIAL_ENVS_COMMENT = (
|
|
|
514
530
|
"# VOYAGE_API_KEY: ...\n"
|
|
515
531
|
)
|
|
516
532
|
|
|
517
|
-
|
|
518
|
-
|
|
533
|
+
# Comment-template blocks inserted after `embedding:` when we don't have
|
|
534
|
+
# curated defaults for the chosen model, so users know the fields exist.
|
|
535
|
+
# Keyed by provider name.
|
|
536
|
+
_PARAMS_COMMENT_BY_PROVIDER: dict[str, str] = {
|
|
537
|
+
"sentence-transformers": (
|
|
538
|
+
" #\n"
|
|
539
|
+
" # Extra kwargs passed to the embedder. Supported keys:\n"
|
|
540
|
+
" # prompt_name\n"
|
|
541
|
+
" # indexing_params: {}\n"
|
|
542
|
+
" # query_params: {}\n"
|
|
543
|
+
),
|
|
544
|
+
"litellm": (
|
|
545
|
+
" #\n"
|
|
546
|
+
" # Extra kwargs passed to the embedder. Supported keys:\n"
|
|
547
|
+
" # input_type, dimensions\n"
|
|
548
|
+
" # indexing_params: {}\n"
|
|
549
|
+
" # query_params: {}\n"
|
|
550
|
+
),
|
|
551
|
+
}
|
|
552
|
+
|
|
553
|
+
|
|
554
|
+
def save_initial_user_settings(
|
|
555
|
+
embedding: EmbeddingSettings,
|
|
556
|
+
defaults_applied: bool,
|
|
557
|
+
) -> Path:
|
|
519
558
|
"""Write the initial global_settings.yml with comment hints and env examples.
|
|
520
559
|
|
|
521
560
|
Only used by `ccc init` on first-time setup. Emits only the `embedding:`
|
|
522
561
|
block from the input; the `envs:` section is a commented-out template.
|
|
523
562
|
Subsequent programmatic writes use `save_user_settings` and do not
|
|
524
563
|
preserve comments.
|
|
564
|
+
|
|
565
|
+
When ``defaults_applied`` is False, a provider-specific commented-out
|
|
566
|
+
template for ``indexing_params`` / ``query_params`` is inserted under the
|
|
567
|
+
``embedding:`` block so the user sees the fields exist.
|
|
525
568
|
"""
|
|
526
569
|
emb_block = _yaml.safe_dump(
|
|
527
570
|
{"embedding": _embedding_settings_to_dict(embedding)},
|
|
528
571
|
default_flow_style=False,
|
|
529
572
|
sort_keys=False,
|
|
530
573
|
)
|
|
531
|
-
content = _INITIAL_HEADER + emb_block
|
|
574
|
+
content = _INITIAL_HEADER + emb_block
|
|
575
|
+
if not defaults_applied:
|
|
576
|
+
hint = _PARAMS_COMMENT_BY_PROVIDER.get(embedding.provider)
|
|
577
|
+
if hint is not None:
|
|
578
|
+
content += hint
|
|
579
|
+
content += _INITIAL_ENVS_COMMENT
|
|
532
580
|
|
|
533
581
|
path = user_settings_path()
|
|
534
582
|
path.parent.mkdir(parents=True, exist_ok=True)
|
|
@@ -6,7 +6,7 @@ import importlib.util
|
|
|
6
6
|
import logging
|
|
7
7
|
import pathlib
|
|
8
8
|
from dataclasses import dataclass
|
|
9
|
-
from typing import TYPE_CHECKING, Annotated, NamedTuple, Union
|
|
9
|
+
from typing import TYPE_CHECKING, Annotated, Any, NamedTuple, Union
|
|
10
10
|
|
|
11
11
|
import cocoindex as coco
|
|
12
12
|
import numpy as np
|
|
@@ -24,9 +24,6 @@ logger = logging.getLogger(__name__)
|
|
|
24
24
|
SBERT_PREFIX = "sbert/"
|
|
25
25
|
DEFAULT_LITELLM_MIN_INTERVAL_MS = 5
|
|
26
26
|
|
|
27
|
-
# Models that define a "query" prompt for asymmetric retrieval.
|
|
28
|
-
_QUERY_PROMPT_MODELS = {"nomic-ai/nomic-embed-code", "nomic-ai/CodeRankEmbed"}
|
|
29
|
-
|
|
30
27
|
# Type alias
|
|
31
28
|
Embedder = Union["SentenceTransformerEmbedder", "LiteLLMEmbedder"]
|
|
32
29
|
|
|
@@ -34,12 +31,8 @@ Embedder = Union["SentenceTransformerEmbedder", "LiteLLMEmbedder"]
|
|
|
34
31
|
EMBEDDER = coco.ContextKey[Embedder]("embedder", detect_change=True)
|
|
35
32
|
SQLITE_DB = coco.ContextKey[sqlite.ManagedConnection]("index_db")
|
|
36
33
|
CODEBASE_DIR = coco.ContextKey[pathlib.Path]("codebase")
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
embedder: Embedder | None = None
|
|
40
|
-
|
|
41
|
-
# Query prompt name — set alongside embedder by create_embedder().
|
|
42
|
-
query_prompt_name: str | None = None
|
|
34
|
+
INDEXING_EMBED_PARAMS = coco.ContextKey[dict[str, Any]]("indexing_embed_params")
|
|
35
|
+
QUERY_EMBED_PARAMS = coco.ContextKey[dict[str, Any]]("query_embed_params")
|
|
43
36
|
|
|
44
37
|
|
|
45
38
|
def is_sentence_transformers_installed() -> bool:
|
|
@@ -61,29 +54,30 @@ class EmbeddingCheckResult(NamedTuple):
|
|
|
61
54
|
error: str | None
|
|
62
55
|
|
|
63
56
|
|
|
64
|
-
async def check_embedding(
|
|
57
|
+
async def check_embedding(
|
|
58
|
+
embedder: Embedder,
|
|
59
|
+
params: dict[str, Any] | None = None,
|
|
60
|
+
) -> EmbeddingCheckResult:
|
|
65
61
|
"""Run a single embed call against *embedder* and report dim or error.
|
|
66
62
|
|
|
67
|
-
|
|
68
|
-
|
|
63
|
+
*params* are spread into ``embed()`` so callers can verify indexing vs
|
|
64
|
+
query params separately (they may use different keys at runtime).
|
|
65
|
+
|
|
66
|
+
Never raises. Used by the daemon's doctor path (`daemon._check_model`).
|
|
69
67
|
"""
|
|
68
|
+
kwargs = dict(params) if params else {}
|
|
70
69
|
try:
|
|
71
|
-
vec = await embedder.embed("hello world")
|
|
70
|
+
vec = await embedder.embed("hello world", **kwargs)
|
|
72
71
|
return EmbeddingCheckResult(dim=len(vec), error=None)
|
|
73
72
|
except Exception as e:
|
|
74
|
-
msg = f"{type(e).__name__}: {e}".splitlines()
|
|
73
|
+
msg = " ".join(f"{type(e).__name__}: {e}".splitlines())
|
|
75
74
|
if len(msg) > 500:
|
|
76
75
|
msg = msg[:500] + "…"
|
|
77
76
|
return EmbeddingCheckResult(dim=None, error=msg)
|
|
78
77
|
|
|
79
78
|
|
|
80
79
|
def create_embedder(settings: EmbeddingSettings) -> Embedder:
|
|
81
|
-
"""Create and return an embedder instance based on settings.
|
|
82
|
-
|
|
83
|
-
Also sets the module-level ``embedder`` and ``query_prompt_name`` variables.
|
|
84
|
-
"""
|
|
85
|
-
global embedder, query_prompt_name
|
|
86
|
-
|
|
80
|
+
"""Create and return an embedder instance based on settings."""
|
|
87
81
|
if settings.provider == "sentence-transformers":
|
|
88
82
|
from cocoindex.ops.sentence_transformers import SentenceTransformerEmbedder
|
|
89
83
|
|
|
@@ -92,7 +86,6 @@ def create_embedder(settings: EmbeddingSettings) -> Embedder:
|
|
|
92
86
|
if model_name.startswith(SBERT_PREFIX):
|
|
93
87
|
model_name = model_name[len(SBERT_PREFIX) :]
|
|
94
88
|
|
|
95
|
-
query_prompt_name = "query" if model_name in _QUERY_PROMPT_MODELS else None
|
|
96
89
|
instance: Embedder = SentenceTransformerEmbedder(
|
|
97
90
|
model_name,
|
|
98
91
|
device=settings.device,
|
|
@@ -111,14 +104,12 @@ def create_embedder(settings: EmbeddingSettings) -> Embedder:
|
|
|
111
104
|
settings.model,
|
|
112
105
|
min_interval_ms=min_interval_ms,
|
|
113
106
|
)
|
|
114
|
-
query_prompt_name = None
|
|
115
107
|
logger.info(
|
|
116
108
|
"Embedding model (LiteLLM): %s | min_interval_ms: %s",
|
|
117
109
|
settings.model,
|
|
118
110
|
min_interval_ms,
|
|
119
111
|
)
|
|
120
112
|
|
|
121
|
-
embedder = instance
|
|
122
113
|
return instance
|
|
123
114
|
|
|
124
115
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|