cocoindex-code 0.2.8__tar.gz → 0.2.9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cocoindex_code-0.2.8 → cocoindex_code-0.2.9}/PKG-INFO +34 -1
- {cocoindex_code-0.2.8 → cocoindex_code-0.2.9}/README.md +33 -0
- {cocoindex_code-0.2.8 → cocoindex_code-0.2.9}/pyproject.toml +2 -0
- {cocoindex_code-0.2.8 → cocoindex_code-0.2.9}/src/cocoindex_code/_version.py +2 -2
- cocoindex_code-0.2.9/src/cocoindex_code/chunking.py +29 -0
- {cocoindex_code-0.2.8 → cocoindex_code-0.2.9}/src/cocoindex_code/cli.py +19 -18
- {cocoindex_code-0.2.8 → cocoindex_code-0.2.9}/src/cocoindex_code/client.py +2 -2
- {cocoindex_code-0.2.8 → cocoindex_code-0.2.9}/src/cocoindex_code/daemon.py +29 -3
- {cocoindex_code-0.2.8 → cocoindex_code-0.2.9}/src/cocoindex_code/indexer.py +15 -7
- {cocoindex_code-0.2.8 → cocoindex_code-0.2.9}/src/cocoindex_code/project.py +24 -6
- {cocoindex_code-0.2.8 → cocoindex_code-0.2.9}/src/cocoindex_code/settings.py +49 -1
- cocoindex_code-0.2.8/src/cocoindex_code/config.py +0 -144
- {cocoindex_code-0.2.8 → cocoindex_code-0.2.9}/.gitignore +0 -0
- {cocoindex_code-0.2.8 → cocoindex_code-0.2.9}/LICENSE +0 -0
- {cocoindex_code-0.2.8 → cocoindex_code-0.2.9}/src/cocoindex_code/__init__.py +0 -0
- {cocoindex_code-0.2.8 → cocoindex_code-0.2.9}/src/cocoindex_code/__main__.py +0 -0
- {cocoindex_code-0.2.8 → cocoindex_code-0.2.9}/src/cocoindex_code/protocol.py +0 -0
- {cocoindex_code-0.2.8 → cocoindex_code-0.2.9}/src/cocoindex_code/query.py +0 -0
- {cocoindex_code-0.2.8 → cocoindex_code-0.2.9}/src/cocoindex_code/schema.py +0 -0
- {cocoindex_code-0.2.8 → cocoindex_code-0.2.9}/src/cocoindex_code/server.py +0 -0
- {cocoindex_code-0.2.8 → cocoindex_code-0.2.9}/src/cocoindex_code/shared.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: cocoindex-code
|
|
3
|
-
Version: 0.2.8
|
|
3
|
+
Version: 0.2.9
|
|
4
4
|
Summary: MCP server for indexing and querying codebases using CocoIndex
|
|
5
5
|
Project-URL: Homepage, https://github.com/cocoindex-io/cocoindex-code
|
|
6
6
|
Project-URL: Repository, https://github.com/cocoindex-io/cocoindex-code
|
|
@@ -276,10 +276,43 @@ exclude_patterns:
|
|
|
276
276
|
language_overrides:
|
|
277
277
|
- ext: inc # treat .inc files as PHP
|
|
278
278
|
lang: php
|
|
279
|
+
|
|
280
|
+
chunkers:
|
|
281
|
+
- ext: toml # use a custom chunker for .toml files
|
|
282
|
+
module: example_toml_chunker:toml_chunker
|
|
279
283
|
```
|
|
280
284
|
|
|
281
285
|
> `.cocoindex_code/` is automatically added to `.gitignore` during init.
|
|
282
286
|
|
|
287
|
+
Use `chunkers` when you want to control how a file type is split into chunks before indexing.
|
|
288
|
+
|
|
289
|
+
`module: example_toml_chunker:toml_chunker` means:
|
|
290
|
+
- `example_toml_chunker` is a local Python module
|
|
291
|
+
- `toml_chunker` is the function inside that module
|
|
292
|
+
|
|
293
|
+
In practice, this usually means:
|
|
294
|
+
- you create a Python file in your project, for example `example_toml_chunker.py`
|
|
295
|
+
- you add a function in that file
|
|
296
|
+
- you point `settings.yml` at it with `module.path:function_name`
|
|
297
|
+
|
|
298
|
+
The function should use this signature:
|
|
299
|
+
|
|
300
|
+
```python
|
|
301
|
+
from pathlib import Path
|
|
302
|
+
from cocoindex_code.chunking import Chunk
|
|
303
|
+
|
|
304
|
+
def my_chunker(path: Path, content: str) -> tuple[str | None, list[Chunk]]:
|
|
305
|
+
...
|
|
306
|
+
```
|
|
307
|
+
|
|
308
|
+
- `path` is the file being indexed
|
|
309
|
+
- `content` is the full text of that file
|
|
310
|
+
- return `language_override` as a string like `"toml"` if you want to override language detection
|
|
311
|
+
- return `None` as `language_override` if you want to keep the detected language
|
|
312
|
+
- return a `list[Chunk]` with the chunks you want stored in the index
|
|
313
|
+
|
|
314
|
+
See [`src/cocoindex_code/chunking.py`](./src/cocoindex_code/chunking.py) for the public types and [`tests/example_toml_chunker.py`](./tests/example_toml_chunker.py) for a complete example.
|
|
315
|
+
|
|
283
316
|
## Embedding Models
|
|
284
317
|
|
|
285
318
|
By default, a local SentenceTransformers model ([sentence-transformers/all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2)) is used — no API key required. To use a different model, edit `~/.cocoindex_code/global_settings.yml`.
|
|
@@ -237,10 +237,43 @@ exclude_patterns:
|
|
|
237
237
|
language_overrides:
|
|
238
238
|
- ext: inc # treat .inc files as PHP
|
|
239
239
|
lang: php
|
|
240
|
+
|
|
241
|
+
chunkers:
|
|
242
|
+
- ext: toml # use a custom chunker for .toml files
|
|
243
|
+
module: example_toml_chunker:toml_chunker
|
|
240
244
|
```
|
|
241
245
|
|
|
242
246
|
> `.cocoindex_code/` is automatically added to `.gitignore` during init.
|
|
243
247
|
|
|
248
|
+
Use `chunkers` when you want to control how a file type is split into chunks before indexing.
|
|
249
|
+
|
|
250
|
+
`module: example_toml_chunker:toml_chunker` means:
|
|
251
|
+
- `example_toml_chunker` is a local Python module
|
|
252
|
+
- `toml_chunker` is the function inside that module
|
|
253
|
+
|
|
254
|
+
In practice, this usually means:
|
|
255
|
+
- you create a Python file in your project, for example `example_toml_chunker.py`
|
|
256
|
+
- you add a function in that file
|
|
257
|
+
- you point `settings.yml` at it with `module.path:function_name`
|
|
258
|
+
|
|
259
|
+
The function should use this signature:
|
|
260
|
+
|
|
261
|
+
```python
|
|
262
|
+
from pathlib import Path
|
|
263
|
+
from cocoindex_code.chunking import Chunk
|
|
264
|
+
|
|
265
|
+
def my_chunker(path: Path, content: str) -> tuple[str | None, list[Chunk]]:
|
|
266
|
+
...
|
|
267
|
+
```
|
|
268
|
+
|
|
269
|
+
- `path` is the file being indexed
|
|
270
|
+
- `content` is the full text of that file
|
|
271
|
+
- return `language_override` as a string like `"toml"` if you want to override language detection
|
|
272
|
+
- return `None` as `language_override` if you want to keep the detected language
|
|
273
|
+
- return a `list[Chunk]` with the chunks you want stored in the index
|
|
274
|
+
|
|
275
|
+
See [`src/cocoindex_code/chunking.py`](./src/cocoindex_code/chunking.py) for the public types and [`tests/example_toml_chunker.py`](./tests/example_toml_chunker.py) for a complete example.
|
|
276
|
+
|
|
244
277
|
## Embedding Models
|
|
245
278
|
|
|
246
279
|
By default, a local SentenceTransformers model ([sentence-transformers/all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2)) is used — no API key required. To use a different model, edit `~/.cocoindex_code/global_settings.yml`.
|
|
@@ -91,9 +91,11 @@ select = ["E", "F", "I", "N", "W", "UP"]
|
|
|
91
91
|
python_version = "3.11"
|
|
92
92
|
strict = true
|
|
93
93
|
ignore_missing_imports = true
|
|
94
|
+
explicit_package_bases = true
|
|
94
95
|
|
|
95
96
|
[tool.pytest.ini_options]
|
|
96
97
|
testpaths = ["tests"]
|
|
97
98
|
python_files = ["test_*.py"]
|
|
98
99
|
python_functions = ["test_*"]
|
|
99
100
|
addopts = "-v --tb=short"
|
|
101
|
+
asyncio_mode = "auto"
|
|
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
|
|
|
28
28
|
commit_id: COMMIT_ID
|
|
29
29
|
__commit_id__: COMMIT_ID
|
|
30
30
|
|
|
31
|
-
__version__ = version = '0.2.8'
|
|
32
|
-
__version_tuple__ = version_tuple = (0, 2, 8)
|
|
31
|
+
__version__ = version = '0.2.9'
|
|
32
|
+
__version_tuple__ = version_tuple = (0, 2, 9)
|
|
33
33
|
|
|
34
34
|
__commit_id__ = commit_id = None
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
"""Public API for writing custom chunkers.
|
|
2
|
+
|
|
3
|
+
Example usage::
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from cocoindex_code.chunking import Chunk, ChunkerFn, TextPosition
|
|
7
|
+
|
|
8
|
+
def my_chunker(path: Path, content: str) -> tuple[str | None, list[Chunk]]:
|
|
9
|
+
pos = TextPosition(byte_offset=0, char_offset=0, line=1, column=0)
|
|
10
|
+
return "mylang", [Chunk(text=content, start=pos, end=pos)]
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import pathlib as _pathlib
|
|
16
|
+
from collections.abc import Callable as _Callable
|
|
17
|
+
|
|
18
|
+
import cocoindex as _coco
|
|
19
|
+
from cocoindex.resources.chunk import Chunk, TextPosition
|
|
20
|
+
|
|
21
|
+
# Callable alias (not Protocol) — consistent with codebase style.
|
|
22
|
+
# language_override=None keeps the language detected by detect_code_language.
|
|
23
|
+
# path is not resolved (no syscall); call path.resolve() inside the chunker if needed.
|
|
24
|
+
ChunkerFn = _Callable[[_pathlib.Path, str], tuple[str | None, list[Chunk]]]
|
|
25
|
+
|
|
26
|
+
# tracked=False: callables are not fingerprint-able; daemon restart re-indexes anyway.
|
|
27
|
+
CHUNKER_REGISTRY = _coco.ContextKey[dict[str, ChunkerFn]]("chunker_registry", tracked=False)
|
|
28
|
+
|
|
29
|
+
__all__ = ["Chunk", "ChunkerFn", "CHUNKER_REGISTRY", "TextPosition"]
|
|
@@ -12,13 +12,16 @@ import typer as _typer
|
|
|
12
12
|
from .client import DaemonStartError
|
|
13
13
|
from .protocol import DoctorCheckResult, IndexingProgress, ProjectStatusResponse, SearchResponse
|
|
14
14
|
from .settings import (
|
|
15
|
+
cocoindex_db_path,
|
|
15
16
|
default_project_settings,
|
|
16
17
|
default_user_settings,
|
|
17
18
|
find_parent_with_marker,
|
|
18
19
|
find_project_root,
|
|
20
|
+
project_settings_path,
|
|
19
21
|
resolve_db_dir,
|
|
20
22
|
save_project_settings,
|
|
21
23
|
save_user_settings,
|
|
24
|
+
target_sqlite_db_path,
|
|
22
25
|
user_settings_path,
|
|
23
26
|
)
|
|
24
27
|
|
|
@@ -284,10 +287,8 @@ def init(
|
|
|
284
287
|
force: bool = _typer.Option(False, "-f", "--force", help="Skip parent directory warning"),
|
|
285
288
|
) -> None:
|
|
286
289
|
"""Initialize a project for cocoindex-code."""
|
|
287
|
-
from .settings import project_settings_path as _project_settings_path
|
|
288
|
-
|
|
289
290
|
cwd = Path.cwd().resolve()
|
|
290
|
-
settings_file = _project_settings_path(cwd)
|
|
291
|
+
settings_file = project_settings_path(cwd)
|
|
291
292
|
|
|
292
293
|
# Always ensure user settings exist
|
|
293
294
|
user_path = user_settings_path()
|
|
@@ -377,8 +378,15 @@ def status() -> None:
|
|
|
377
378
|
"""Show project status."""
|
|
378
379
|
from . import client as _client
|
|
379
380
|
|
|
380
|
-
|
|
381
|
+
project_root_path = require_project_root()
|
|
382
|
+
project_root = str(project_root_path)
|
|
381
383
|
print_project_header(project_root)
|
|
384
|
+
|
|
385
|
+
_typer.echo(f"Settings: {project_settings_path(project_root_path)}")
|
|
386
|
+
db_path = target_sqlite_db_path(project_root_path)
|
|
387
|
+
if db_path.exists():
|
|
388
|
+
_typer.echo(f"Index DB: {db_path}")
|
|
389
|
+
|
|
382
390
|
print_index_stats(_client.project_status(project_root))
|
|
383
391
|
|
|
384
392
|
|
|
@@ -393,10 +401,10 @@ def reset(
|
|
|
393
401
|
db_dir = resolve_db_dir(project_root)
|
|
394
402
|
|
|
395
403
|
db_files = [
|
|
396
|
-
|
|
397
|
-
|
|
404
|
+
cocoindex_db_path(project_root),
|
|
405
|
+
target_sqlite_db_path(project_root),
|
|
398
406
|
]
|
|
399
|
-
settings_file =
|
|
407
|
+
settings_file = project_settings_path(project_root)
|
|
400
408
|
|
|
401
409
|
# Determine what will be deleted
|
|
402
410
|
to_delete = [f for f in db_files if f.exists()]
|
|
@@ -503,16 +511,10 @@ def doctor() -> None:
|
|
|
503
511
|
from .settings import (
|
|
504
512
|
load_user_settings as _load_user_settings,
|
|
505
513
|
)
|
|
506
|
-
from .settings import (
|
|
507
|
-
project_settings_path as _project_settings_path,
|
|
508
|
-
)
|
|
509
|
-
from .settings import (
|
|
510
|
-
user_settings_path as _user_settings_path,
|
|
511
|
-
)
|
|
512
514
|
|
|
513
515
|
# --- 1. Global settings (local, no daemon needed) ---
|
|
514
516
|
_print_section("Global Settings")
|
|
515
|
-
settings_path = _user_settings_path()
|
|
517
|
+
settings_path = user_settings_path()
|
|
516
518
|
_typer.echo(f" Settings: {settings_path}")
|
|
517
519
|
try:
|
|
518
520
|
user_settings = _load_user_settings()
|
|
@@ -570,7 +572,7 @@ def doctor() -> None:
|
|
|
570
572
|
# --- 6. Project settings (local, no daemon needed) ---
|
|
571
573
|
if project_root is not None:
|
|
572
574
|
_print_section("Project Settings")
|
|
573
|
-
ps_path = _project_settings_path(project_root)
|
|
575
|
+
ps_path = project_settings_path(project_root)
|
|
574
576
|
_typer.echo(f" Settings: {ps_path}")
|
|
575
577
|
try:
|
|
576
578
|
ps = _load_project_settings(project_root)
|
|
@@ -597,10 +599,9 @@ def doctor() -> None:
|
|
|
597
599
|
|
|
598
600
|
# --- 8. Log files ---
|
|
599
601
|
_print_section("Log Files")
|
|
600
|
-
from .daemon import
|
|
602
|
+
from .daemon import daemon_log_path as _daemon_log_path
|
|
601
603
|
|
|
602
|
-
|
|
603
|
-
_typer.echo(f" Daemon logs: {log_dir / 'daemon.log'}")
|
|
604
|
+
_typer.echo(f" Daemon logs: {_daemon_log_path()}")
|
|
604
605
|
_typer.echo(" Check logs above for further troubleshooting.")
|
|
605
606
|
|
|
606
607
|
|
|
@@ -343,10 +343,10 @@ def start_daemon() -> subprocess.Popen[bytes]:
|
|
|
343
343
|
Returns the ``Popen`` object so callers can detect early process death
|
|
344
344
|
(via ``proc.poll()``) instead of waiting for a full timeout.
|
|
345
345
|
"""
|
|
346
|
-
from .daemon import daemon_dir
|
|
346
|
+
from .daemon import daemon_dir, daemon_log_path
|
|
347
347
|
|
|
348
348
|
daemon_dir().mkdir(parents=True, exist_ok=True)
|
|
349
|
-
log_path = daemon_dir() / "daemon.log"
|
|
349
|
+
log_path = daemon_log_path()
|
|
350
350
|
|
|
351
351
|
ccc_path = _find_ccc_executable()
|
|
352
352
|
if ccc_path:
|
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
5
|
import asyncio
|
|
6
|
+
import importlib
|
|
6
7
|
import logging
|
|
7
8
|
import os
|
|
8
9
|
import signal
|
|
@@ -15,6 +16,7 @@ from pathlib import Path
|
|
|
15
16
|
from typing import Any
|
|
16
17
|
|
|
17
18
|
from ._version import __version__
|
|
19
|
+
from .chunking import ChunkerFn as _ChunkerFn
|
|
18
20
|
from .project import Project
|
|
19
21
|
from .protocol import (
|
|
20
22
|
DaemonEnvRequest,
|
|
@@ -46,15 +48,37 @@ from .protocol import (
|
|
|
46
48
|
encode_response,
|
|
47
49
|
)
|
|
48
50
|
from .settings import (
|
|
51
|
+
ChunkerMapping,
|
|
49
52
|
global_settings_mtime_us,
|
|
53
|
+
load_project_settings,
|
|
50
54
|
load_user_settings,
|
|
51
|
-
|
|
55
|
+
target_sqlite_db_path,
|
|
52
56
|
user_settings_dir,
|
|
53
57
|
)
|
|
54
58
|
from .shared import Embedder, create_embedder
|
|
55
59
|
|
|
56
60
|
logger = logging.getLogger(__name__)
|
|
57
61
|
|
|
62
|
+
|
|
63
|
+
def _resolve_chunker_registry(mappings: list[ChunkerMapping]) -> dict[str, _ChunkerFn]:
|
|
64
|
+
"""Resolve ``ChunkerMapping`` settings entries to a ``{suffix: fn}`` dict.
|
|
65
|
+
|
|
66
|
+
Each ``mapping.module`` must be a ``"module.path:callable"`` string importable
|
|
67
|
+
from the current environment.
|
|
68
|
+
"""
|
|
69
|
+
registry: dict[str, _ChunkerFn] = {}
|
|
70
|
+
for cm in mappings:
|
|
71
|
+
module_path, _, attr = cm.module.partition(":")
|
|
72
|
+
if not attr:
|
|
73
|
+
raise ValueError(f"chunker module {cm.module!r} must use 'module.path:callable' format")
|
|
74
|
+
mod = importlib.import_module(module_path)
|
|
75
|
+
fn = getattr(mod, attr)
|
|
76
|
+
if not callable(fn):
|
|
77
|
+
raise ValueError(f"chunker {cm.module!r}: {attr!r} is not callable")
|
|
78
|
+
registry[f".{cm.ext}"] = fn
|
|
79
|
+
return registry
|
|
80
|
+
|
|
81
|
+
|
|
58
82
|
# ---------------------------------------------------------------------------
|
|
59
83
|
# Daemon paths
|
|
60
84
|
# ---------------------------------------------------------------------------
|
|
@@ -111,7 +135,9 @@ class ProjectRegistry:
|
|
|
111
135
|
"""Get or create a Project for the given root. Lazy initialization."""
|
|
112
136
|
if project_root not in self._projects:
|
|
113
137
|
root = Path(project_root)
|
|
114
|
-
|
|
138
|
+
project_settings = load_project_settings(root)
|
|
139
|
+
chunker_registry = _resolve_chunker_registry(project_settings.chunkers)
|
|
140
|
+
project = await Project.create(root, self._embedder, chunker_registry=chunker_registry)
|
|
115
141
|
self._projects[project_root] = project
|
|
116
142
|
return self._projects[project_root]
|
|
117
143
|
|
|
@@ -346,7 +372,7 @@ async def _check_index_status(project_root_str: str) -> DoctorCheckResult:
|
|
|
346
372
|
from cocoindex.connectors import sqlite as coco_sqlite
|
|
347
373
|
|
|
348
374
|
project_root = Path(project_root_str)
|
|
349
|
-
db_path =
|
|
375
|
+
db_path = target_sqlite_db_path(project_root)
|
|
350
376
|
details = [f"Index: {db_path}"]
|
|
351
377
|
|
|
352
378
|
if not db_path.exists():
|
|
@@ -14,6 +14,7 @@ from cocoindex.resources.file import FilePathMatcher, PatternFilePathMatcher
|
|
|
14
14
|
from cocoindex.resources.id import IdGenerator
|
|
15
15
|
from pathspec import GitIgnoreSpec
|
|
16
16
|
|
|
17
|
+
from .chunking import CHUNKER_REGISTRY
|
|
17
18
|
from .settings import load_gitignore_spec, load_project_settings
|
|
18
19
|
from .shared import (
|
|
19
20
|
CODEBASE_DIR,
|
|
@@ -158,13 +159,20 @@ async def process_file(
|
|
|
158
159
|
or "text"
|
|
159
160
|
)
|
|
160
161
|
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
162
|
+
chunker_registry = coco.use_context(CHUNKER_REGISTRY)
|
|
163
|
+
chunker = chunker_registry.get(suffix)
|
|
164
|
+
if chunker is not None:
|
|
165
|
+
language_override, chunks = chunker(Path(file.file_path.path), content)
|
|
166
|
+
if language_override is not None:
|
|
167
|
+
language = language_override
|
|
168
|
+
else:
|
|
169
|
+
chunks = splitter.split(
|
|
170
|
+
content,
|
|
171
|
+
chunk_size=CHUNK_SIZE,
|
|
172
|
+
min_chunk_size=MIN_CHUNK_SIZE,
|
|
173
|
+
chunk_overlap=CHUNK_OVERLAP,
|
|
174
|
+
language=language,
|
|
175
|
+
)
|
|
168
176
|
|
|
169
177
|
id_gen = IdGenerator()
|
|
170
178
|
|
|
@@ -10,6 +10,7 @@ from pathlib import Path
|
|
|
10
10
|
import cocoindex as coco
|
|
11
11
|
from cocoindex.connectors import sqlite as coco_sqlite
|
|
12
12
|
|
|
13
|
+
from .chunking import CHUNKER_REGISTRY, ChunkerFn
|
|
13
14
|
from .indexer import indexer_main
|
|
14
15
|
from .protocol import (
|
|
15
16
|
IndexingProgress,
|
|
@@ -21,7 +22,15 @@ from .protocol import (
|
|
|
21
22
|
SearchResult,
|
|
22
23
|
)
|
|
23
24
|
from .query import query_codebase
|
|
24
|
-
from .settings import resolve_db_dir
|
|
25
|
+
from .settings import (
|
|
26
|
+
cocoindex_db_path as _cocoindex_db_path,
|
|
27
|
+
)
|
|
28
|
+
from .settings import (
|
|
29
|
+
resolve_db_dir,
|
|
30
|
+
)
|
|
31
|
+
from .settings import (
|
|
32
|
+
target_sqlite_db_path as _target_sqlite_db_path,
|
|
33
|
+
)
|
|
25
34
|
from .shared import (
|
|
26
35
|
CODEBASE_DIR,
|
|
27
36
|
EMBEDDER,
|
|
@@ -171,7 +180,7 @@ class Project:
|
|
|
171
180
|
offset: int = 0,
|
|
172
181
|
) -> list[SearchResult]:
|
|
173
182
|
"""Search within this project."""
|
|
174
|
-
target_db =
|
|
183
|
+
target_db = _target_sqlite_db_path(self._project_root)
|
|
175
184
|
results = await query_codebase(
|
|
176
185
|
query=query,
|
|
177
186
|
target_sqlite_db_path=target_db,
|
|
@@ -248,12 +257,20 @@ class Project:
|
|
|
248
257
|
async def create(
|
|
249
258
|
project_root: Path,
|
|
250
259
|
embedder: Embedder,
|
|
260
|
+
chunker_registry: dict[str, ChunkerFn] | None = None,
|
|
251
261
|
) -> Project:
|
|
252
262
|
"""Create a project with explicit embedder.
|
|
253
263
|
|
|
254
264
|
Project-level settings and .gitignore are NOT cached here — the
|
|
255
265
|
indexer loads them fresh from disk on every run so that user edits
|
|
256
266
|
take effect without restarting the daemon.
|
|
267
|
+
|
|
268
|
+
Args:
|
|
269
|
+
project_root: Root directory of the codebase to index.
|
|
270
|
+
embedder: Embedding model instance.
|
|
271
|
+
chunker_registry: Optional mapping of file suffix (e.g. ``".toml"``)
|
|
272
|
+
to a ``ChunkerFn``. When a suffix matches, the registered
|
|
273
|
+
chunker is called instead of the built-in splitter.
|
|
257
274
|
"""
|
|
258
275
|
settings_dir = project_root / ".cocoindex_code"
|
|
259
276
|
settings_dir.mkdir(parents=True, exist_ok=True)
|
|
@@ -261,15 +278,16 @@ class Project:
|
|
|
261
278
|
db_dir = resolve_db_dir(project_root)
|
|
262
279
|
db_dir.mkdir(parents=True, exist_ok=True)
|
|
263
280
|
|
|
264
|
-
|
|
265
|
-
|
|
281
|
+
cocoindex_db = _cocoindex_db_path(project_root)
|
|
282
|
+
target_sqlite_db = _target_sqlite_db_path(project_root)
|
|
266
283
|
|
|
267
|
-
settings = coco.Settings.from_env(
|
|
284
|
+
settings = coco.Settings.from_env(cocoindex_db)
|
|
268
285
|
|
|
269
286
|
context = coco.ContextProvider()
|
|
270
287
|
context.provide(CODEBASE_DIR, project_root)
|
|
271
|
-
context.provide(SQLITE_DB, coco_sqlite.connect(str(
|
|
288
|
+
context.provide(SQLITE_DB, coco_sqlite.connect(str(target_sqlite_db), load_vec=True))
|
|
272
289
|
context.provide(EMBEDDER, embedder)
|
|
290
|
+
context.provide(CHUNKER_REGISTRY, dict(chunker_registry) if chunker_registry else {})
|
|
273
291
|
|
|
274
292
|
env = coco.Environment(settings, context_provider=context)
|
|
275
293
|
app = coco.App(
|
|
@@ -45,6 +45,29 @@ DEFAULT_INCLUDED_PATTERNS: list[str] = [
|
|
|
45
45
|
"**/*.rst", # reStructuredText
|
|
46
46
|
"**/*.php", # PHP
|
|
47
47
|
"**/*.lua", # Lua
|
|
48
|
+
"**/*.rb", # Ruby
|
|
49
|
+
"**/*.swift", # Swift
|
|
50
|
+
"**/*.kt", # Kotlin
|
|
51
|
+
"**/*.kts", # Kotlin script
|
|
52
|
+
"**/*.scala", # Scala
|
|
53
|
+
"**/*.r", # R
|
|
54
|
+
"**/*.html", # HTML
|
|
55
|
+
"**/*.htm", # HTML
|
|
56
|
+
"**/*.css", # CSS
|
|
57
|
+
"**/*.scss", # SCSS
|
|
58
|
+
"**/*.json", # JSON
|
|
59
|
+
"**/*.xml", # XML
|
|
60
|
+
"**/*.yaml", # YAML
|
|
61
|
+
"**/*.yml", # YAML
|
|
62
|
+
"**/*.toml", # TOML
|
|
63
|
+
"**/*.sol", # Solidity
|
|
64
|
+
"**/*.pas", # Pascal
|
|
65
|
+
"**/*.dpr", # Pascal/Delphi
|
|
66
|
+
"**/*.dtd", # DTD
|
|
67
|
+
"**/*.f", # Fortran
|
|
68
|
+
"**/*.f90", # Fortran
|
|
69
|
+
"**/*.f95", # Fortran
|
|
70
|
+
"**/*.f03", # Fortran
|
|
48
71
|
]
|
|
49
72
|
|
|
50
73
|
DEFAULT_EXCLUDED_PATTERNS: list[str] = [
|
|
@@ -83,11 +106,18 @@ class LanguageOverride:
|
|
|
83
106
|
lang: str # e.g. "php"
|
|
84
107
|
|
|
85
108
|
|
|
109
|
+
@dataclass
|
|
110
|
+
class ChunkerMapping:
|
|
111
|
+
ext: str # without dot, e.g. "toml"
|
|
112
|
+
module: str # "module.path:callable", e.g. "cocoindex_code.toml_chunker:toml_chunker"
|
|
113
|
+
|
|
114
|
+
|
|
86
115
|
@dataclass
|
|
87
116
|
class ProjectSettings:
|
|
88
117
|
include_patterns: list[str] = field(default_factory=lambda: list(DEFAULT_INCLUDED_PATTERNS))
|
|
89
118
|
exclude_patterns: list[str] = field(default_factory=lambda: list(DEFAULT_EXCLUDED_PATTERNS))
|
|
90
119
|
language_overrides: list[LanguageOverride] = field(default_factory=list)
|
|
120
|
+
chunkers: list[ChunkerMapping] = field(default_factory=list)
|
|
91
121
|
|
|
92
122
|
|
|
93
123
|
# ---------------------------------------------------------------------------
|
|
@@ -194,6 +224,20 @@ def _reset_db_path_mapping_cache() -> None:
|
|
|
194
224
|
_db_path_mapping = None
|
|
195
225
|
|
|
196
226
|
|
|
227
|
+
_TARGET_SQLITE_DB_NAME = "target_sqlite.db"
|
|
228
|
+
_COCOINDEX_DB_NAME = "cocoindex.db"
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
def target_sqlite_db_path(project_root: Path) -> Path:
|
|
232
|
+
"""Return the path to the vector index SQLite database for a project."""
|
|
233
|
+
return resolve_db_dir(project_root) / _TARGET_SQLITE_DB_NAME
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
def cocoindex_db_path(project_root: Path) -> Path:
|
|
237
|
+
"""Return the path to the CocoIndex state database for a project."""
|
|
238
|
+
return resolve_db_dir(project_root) / _COCOINDEX_DB_NAME
|
|
239
|
+
|
|
240
|
+
|
|
197
241
|
def user_settings_dir() -> Path:
|
|
198
242
|
"""Return ``~/.cocoindex_code/``.
|
|
199
243
|
|
|
@@ -238,7 +282,7 @@ def find_legacy_project_root(start: Path) -> Path | None:
|
|
|
238
282
|
"""
|
|
239
283
|
current = start.resolve()
|
|
240
284
|
while True:
|
|
241
|
-
if (current / _SETTINGS_DIR_NAME / "cocoindex.db").exists():
|
|
285
|
+
if (current / _SETTINGS_DIR_NAME / _COCOINDEX_DB_NAME).exists():
|
|
242
286
|
return current
|
|
243
287
|
parent = current.parent
|
|
244
288
|
if parent == current:
|
|
@@ -337,6 +381,8 @@ def _project_settings_to_dict(settings: ProjectSettings) -> dict[str, Any]:
|
|
|
337
381
|
d["language_overrides"] = [
|
|
338
382
|
{"ext": lo.ext, "lang": lo.lang} for lo in settings.language_overrides
|
|
339
383
|
]
|
|
384
|
+
if settings.chunkers:
|
|
385
|
+
d["chunkers"] = [{"ext": cm.ext, "module": cm.module} for cm in settings.chunkers]
|
|
340
386
|
return d
|
|
341
387
|
|
|
342
388
|
|
|
@@ -344,10 +390,12 @@ def _project_settings_from_dict(d: dict[str, Any]) -> ProjectSettings:
|
|
|
344
390
|
overrides = [
|
|
345
391
|
LanguageOverride(ext=lo["ext"], lang=lo["lang"]) for lo in d.get("language_overrides", [])
|
|
346
392
|
]
|
|
393
|
+
chunkers = [ChunkerMapping(ext=cm["ext"], module=cm["module"]) for cm in d.get("chunkers", [])]
|
|
347
394
|
return ProjectSettings(
|
|
348
395
|
include_patterns=d.get("include_patterns", list(DEFAULT_INCLUDED_PATTERNS)),
|
|
349
396
|
exclude_patterns=d.get("exclude_patterns", list(DEFAULT_EXCLUDED_PATTERNS)),
|
|
350
397
|
language_overrides=overrides,
|
|
398
|
+
chunkers=chunkers,
|
|
351
399
|
)
|
|
352
400
|
|
|
353
401
|
|
|
@@ -1,144 +0,0 @@
|
|
|
1
|
-
"""Configuration management for cocoindex-code."""
|
|
2
|
-
|
|
3
|
-
from __future__ import annotations
|
|
4
|
-
|
|
5
|
-
import json
|
|
6
|
-
import os
|
|
7
|
-
from dataclasses import dataclass
|
|
8
|
-
from pathlib import Path
|
|
9
|
-
|
|
10
|
-
from .settings import resolve_db_dir
|
|
11
|
-
|
|
12
|
-
_DEFAULT_MODEL = "sbert/sentence-transformers/all-MiniLM-L6-v2"
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
def _find_root_with_marker(start: Path, markers: list[str]) -> Path | None:
|
|
16
|
-
"""Walk up from start, return first directory containing any marker."""
|
|
17
|
-
current = start
|
|
18
|
-
while True:
|
|
19
|
-
if any((current / m).exists() for m in markers):
|
|
20
|
-
return current
|
|
21
|
-
parent = current.parent
|
|
22
|
-
if parent == current:
|
|
23
|
-
return None
|
|
24
|
-
current = parent
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
def _discover_codebase_root() -> Path:
|
|
28
|
-
"""Discover the codebase root directory.
|
|
29
|
-
|
|
30
|
-
Discovery order:
|
|
31
|
-
1. Find nearest parent with `.cocoindex_code` directory (re-anchor to previously-indexed tree)
|
|
32
|
-
2. Find nearest parent with any common project root marker
|
|
33
|
-
3. Fall back to current working directory
|
|
34
|
-
"""
|
|
35
|
-
cwd = Path.cwd()
|
|
36
|
-
|
|
37
|
-
# First, look for existing .cocoindex_code directory
|
|
38
|
-
root = _find_root_with_marker(cwd, [".cocoindex_code"])
|
|
39
|
-
if root is not None:
|
|
40
|
-
return root
|
|
41
|
-
|
|
42
|
-
# Then, look for common project root markers
|
|
43
|
-
markers = [".git", "pyproject.toml", "package.json", "Cargo.toml", "go.mod"]
|
|
44
|
-
root = _find_root_with_marker(cwd, markers)
|
|
45
|
-
return root if root is not None else cwd
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
def _parse_json_string_list_env(var_name: str) -> list[str]:
|
|
49
|
-
"""Parse an environment variable as a JSON array of strings."""
|
|
50
|
-
raw_value = os.environ.get(var_name, "")
|
|
51
|
-
if not raw_value.strip():
|
|
52
|
-
return []
|
|
53
|
-
|
|
54
|
-
try:
|
|
55
|
-
parsed = json.loads(raw_value)
|
|
56
|
-
except json.JSONDecodeError as exc:
|
|
57
|
-
raise ValueError(f"{var_name} must be a JSON array of strings, got invalid JSON") from exc
|
|
58
|
-
|
|
59
|
-
if not isinstance(parsed, list):
|
|
60
|
-
raise ValueError(f"{var_name} must be a JSON array of strings")
|
|
61
|
-
|
|
62
|
-
result: list[str] = []
|
|
63
|
-
for item in parsed:
|
|
64
|
-
if not isinstance(item, str):
|
|
65
|
-
raise ValueError(f"{var_name} must be a JSON array of strings")
|
|
66
|
-
item = item.strip()
|
|
67
|
-
if item:
|
|
68
|
-
result.append(item)
|
|
69
|
-
|
|
70
|
-
return result
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
@dataclass
|
|
74
|
-
class Config:
|
|
75
|
-
"""Configuration loaded from environment variables."""
|
|
76
|
-
|
|
77
|
-
codebase_root_path: Path
|
|
78
|
-
embedding_model: str
|
|
79
|
-
index_dir: Path
|
|
80
|
-
device: str | None
|
|
81
|
-
extra_extensions: dict[str, str | None]
|
|
82
|
-
excluded_patterns: list[str]
|
|
83
|
-
|
|
84
|
-
@classmethod
|
|
85
|
-
def from_env(cls) -> Config:
|
|
86
|
-
"""Load configuration from environment variables."""
|
|
87
|
-
# Get root path from env or discover it
|
|
88
|
-
root_path_str = os.environ.get("COCOINDEX_CODE_ROOT_PATH")
|
|
89
|
-
if root_path_str:
|
|
90
|
-
root = Path(root_path_str).resolve()
|
|
91
|
-
else:
|
|
92
|
-
root = _discover_codebase_root()
|
|
93
|
-
|
|
94
|
-
# Get embedding model
|
|
95
|
-
# Prefix "sbert/" for SentenceTransformers models, otherwise LiteLLM.
|
|
96
|
-
embedding_model = os.environ.get(
|
|
97
|
-
"COCOINDEX_CODE_EMBEDDING_MODEL",
|
|
98
|
-
_DEFAULT_MODEL,
|
|
99
|
-
)
|
|
100
|
-
|
|
101
|
-
# Index directory: apply DB path mapping if configured
|
|
102
|
-
index_dir = resolve_db_dir(root)
|
|
103
|
-
|
|
104
|
-
# Device: auto-detect CUDA or use env override
|
|
105
|
-
device = os.environ.get("COCOINDEX_CODE_DEVICE")
|
|
106
|
-
|
|
107
|
-
# Extra file extensions (format: "inc:php,yaml,toml" — optional lang after colon)
|
|
108
|
-
raw_extra = os.environ.get("COCOINDEX_CODE_EXTRA_EXTENSIONS", "")
|
|
109
|
-
extra_extensions: dict[str, str | None] = {}
|
|
110
|
-
for token in raw_extra.split(","):
|
|
111
|
-
token = token.strip()
|
|
112
|
-
if not token:
|
|
113
|
-
continue
|
|
114
|
-
if ":" in token:
|
|
115
|
-
ext, lang = token.split(":", 1)
|
|
116
|
-
extra_extensions[f".{ext.strip()}"] = lang.strip() or None
|
|
117
|
-
else:
|
|
118
|
-
extra_extensions[f".{token}"] = None
|
|
119
|
-
|
|
120
|
-
# Excluded file glob patterns
|
|
121
|
-
excluded_patterns = _parse_json_string_list_env("COCOINDEX_CODE_EXCLUDED_PATTERNS")
|
|
122
|
-
|
|
123
|
-
return cls(
|
|
124
|
-
codebase_root_path=root,
|
|
125
|
-
embedding_model=embedding_model,
|
|
126
|
-
index_dir=index_dir,
|
|
127
|
-
device=device,
|
|
128
|
-
extra_extensions=extra_extensions,
|
|
129
|
-
excluded_patterns=excluded_patterns,
|
|
130
|
-
)
|
|
131
|
-
|
|
132
|
-
@property
|
|
133
|
-
def target_sqlite_db_path(self) -> Path:
|
|
134
|
-
"""Path to the vector index SQLite database."""
|
|
135
|
-
return self.index_dir / "target_sqlite.db"
|
|
136
|
-
|
|
137
|
-
@property
|
|
138
|
-
def cocoindex_db_path(self) -> Path:
|
|
139
|
-
"""Path to the CocoIndex state database."""
|
|
140
|
-
return self.index_dir / "cocoindex.db"
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
# Module-level singleton — imported directly by all modules that need configuration
|
|
144
|
-
config: Config = Config.from_env()
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|