cocoindex-code 0.2.7__tar.gz → 0.2.9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cocoindex_code-0.2.7 → cocoindex_code-0.2.9}/PKG-INFO +67 -1
- {cocoindex_code-0.2.7 → cocoindex_code-0.2.9}/README.md +66 -0
- {cocoindex_code-0.2.7 → cocoindex_code-0.2.9}/pyproject.toml +2 -0
- {cocoindex_code-0.2.7 → cocoindex_code-0.2.9}/src/cocoindex_code/_version.py +2 -2
- cocoindex_code-0.2.9/src/cocoindex_code/chunking.py +29 -0
- {cocoindex_code-0.2.7 → cocoindex_code-0.2.9}/src/cocoindex_code/cli.py +31 -18
- {cocoindex_code-0.2.7 → cocoindex_code-0.2.9}/src/cocoindex_code/client.py +2 -2
- {cocoindex_code-0.2.7 → cocoindex_code-0.2.9}/src/cocoindex_code/daemon.py +36 -2
- {cocoindex_code-0.2.7 → cocoindex_code-0.2.9}/src/cocoindex_code/indexer.py +15 -7
- {cocoindex_code-0.2.7 → cocoindex_code-0.2.9}/src/cocoindex_code/project.py +29 -7
- {cocoindex_code-0.2.7 → cocoindex_code-0.2.9}/src/cocoindex_code/protocol.py +6 -0
- {cocoindex_code-0.2.7 → cocoindex_code-0.2.9}/src/cocoindex_code/settings.py +127 -3
- cocoindex_code-0.2.7/src/cocoindex_code/config.py +0 -142
- {cocoindex_code-0.2.7 → cocoindex_code-0.2.9}/.gitignore +0 -0
- {cocoindex_code-0.2.7 → cocoindex_code-0.2.9}/LICENSE +0 -0
- {cocoindex_code-0.2.7 → cocoindex_code-0.2.9}/src/cocoindex_code/__init__.py +0 -0
- {cocoindex_code-0.2.7 → cocoindex_code-0.2.9}/src/cocoindex_code/__main__.py +0 -0
- {cocoindex_code-0.2.7 → cocoindex_code-0.2.9}/src/cocoindex_code/query.py +0 -0
- {cocoindex_code-0.2.7 → cocoindex_code-0.2.9}/src/cocoindex_code/schema.py +0 -0
- {cocoindex_code-0.2.7 → cocoindex_code-0.2.9}/src/cocoindex_code/server.py +0 -0
- {cocoindex_code-0.2.7 → cocoindex_code-0.2.9}/src/cocoindex_code/shared.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: cocoindex-code
|
|
3
|
-
Version: 0.2.7
|
|
3
|
+
Version: 0.2.9
|
|
4
4
|
Summary: MCP server for indexing and querying codebases using CocoIndex
|
|
5
5
|
Project-URL: Homepage, https://github.com/cocoindex-io/cocoindex-code
|
|
6
6
|
Project-URL: Repository, https://github.com/cocoindex-io/cocoindex-code
|
|
@@ -276,10 +276,43 @@ exclude_patterns:
|
|
|
276
276
|
language_overrides:
|
|
277
277
|
- ext: inc # treat .inc files as PHP
|
|
278
278
|
lang: php
|
|
279
|
+
|
|
280
|
+
chunkers:
|
|
281
|
+
- ext: toml # use a custom chunker for .toml files
|
|
282
|
+
module: example_toml_chunker:toml_chunker
|
|
279
283
|
```
|
|
280
284
|
|
|
281
285
|
> `.cocoindex_code/` is automatically added to `.gitignore` during init.
|
|
282
286
|
|
|
287
|
+
Use `chunkers` when you want to control how a file type is split into chunks before indexing.
|
|
288
|
+
|
|
289
|
+
`module: example_toml_chunker:toml_chunker` means:
|
|
290
|
+
- `example_toml_chunker` is a local Python module
|
|
291
|
+
- `toml_chunker` is the function inside that module
|
|
292
|
+
|
|
293
|
+
In practice, this usually means:
|
|
294
|
+
- you create a Python file in your project, for example `example_toml_chunker.py`
|
|
295
|
+
- you add a function in that file
|
|
296
|
+
- you point `settings.yml` at it with `module.path:function_name`
|
|
297
|
+
|
|
298
|
+
The function should use this signature:
|
|
299
|
+
|
|
300
|
+
```python
|
|
301
|
+
from pathlib import Path
|
|
302
|
+
from cocoindex_code.chunking import Chunk
|
|
303
|
+
|
|
304
|
+
def my_chunker(path: Path, content: str) -> tuple[str | None, list[Chunk]]:
|
|
305
|
+
...
|
|
306
|
+
```
|
|
307
|
+
|
|
308
|
+
- `path` is the file being indexed
|
|
309
|
+
- `content` is the full text of that file
|
|
310
|
+
- return `language_override` as a string like `"toml"` if you want to override language detection
|
|
311
|
+
- return `None` as `language_override` if you want to keep the detected language
|
|
312
|
+
- return a `list[Chunk]` with the chunks you want stored in the index
|
|
313
|
+
|
|
314
|
+
See [`src/cocoindex_code/chunking.py`](./src/cocoindex_code/chunking.py) for the public types and [`tests/example_toml_chunker.py`](./tests/example_toml_chunker.py) for a complete example.
|
|
315
|
+
|
|
283
316
|
## Embedding Models
|
|
284
317
|
|
|
285
318
|
By default, a local SentenceTransformers model ([sentence-transformers/all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2)) is used — no API key required. To use a different model, edit `~/.cocoindex_code/global_settings.yml`.
|
|
@@ -457,6 +490,26 @@ embedding:
|
|
|
457
490
|
| xml | | `.xml` |
|
|
458
491
|
| yaml | | `.yaml`, `.yml` |
|
|
459
492
|
|
|
493
|
+
### Custom Database Location
|
|
494
|
+
|
|
495
|
+
By default, index databases (`cocoindex.db` and `target_sqlite.db`) live alongside settings in `<project>/.cocoindex_code/`. When running in Docker, you may want the databases on the container's native filesystem for performance (LMDB doesn't work well on mounted volumes) while keeping the source code and settings on a mounted volume.
|
|
496
|
+
|
|
497
|
+
Set `COCOINDEX_CODE_DB_PATH_MAPPING` to remap database locations by path prefix:
|
|
498
|
+
|
|
499
|
+
```bash
|
|
500
|
+
COCOINDEX_CODE_DB_PATH_MAPPING=/workspace=/db-files
|
|
501
|
+
```
|
|
502
|
+
|
|
503
|
+
With this mapping, a project at `/workspace/myrepo` stores its databases in `/db-files/myrepo/` instead of `/workspace/myrepo/.cocoindex_code/`. Settings files remain in the original location.
|
|
504
|
+
|
|
505
|
+
Multiple mappings are comma-separated and resolved in order (first match wins):
|
|
506
|
+
|
|
507
|
+
```bash
|
|
508
|
+
COCOINDEX_CODE_DB_PATH_MAPPING=/workspace=/db-files,/workspace2=/db-files2
|
|
509
|
+
```
|
|
510
|
+
|
|
511
|
+
Both source and target must be absolute paths. If no mapping matches, the default location is used.
|
|
512
|
+
|
|
460
513
|
## Troubleshooting
|
|
461
514
|
|
|
462
515
|
Run `ccc doctor` to diagnose common issues. It checks your settings, daemon health, embedding model, file matching, and index status — all in one command.
|
|
@@ -501,6 +554,19 @@ If you previously configured `cocoindex-code` via environment variables, the `co
|
|
|
501
554
|
|
|
502
555
|
If you need help with remote setup, please email our maintainer linghua@cocoindex.io, happy to help!
|
|
503
556
|
|
|
557
|
+
## Contributing
|
|
558
|
+
|
|
559
|
+
We welcome contributions! Before you start, please install the [pre-commit](https://pre-commit.com/) hooks so that linting, formatting, type checking, and tests run automatically before each commit:
|
|
560
|
+
|
|
561
|
+
```bash
|
|
562
|
+
pip install pre-commit
|
|
563
|
+
pre-commit install
|
|
564
|
+
```
|
|
565
|
+
|
|
566
|
+
This catches common issues — trailing whitespace, lint errors (Ruff), type errors (mypy), and test failures — before they reach CI.
|
|
567
|
+
|
|
568
|
+
For more details, see our [contributing guide](https://cocoindex.io/docs/contributing/guide).
|
|
569
|
+
|
|
504
570
|
## License
|
|
505
571
|
|
|
506
572
|
Apache-2.0
|
|
@@ -237,10 +237,43 @@ exclude_patterns:
|
|
|
237
237
|
language_overrides:
|
|
238
238
|
- ext: inc # treat .inc files as PHP
|
|
239
239
|
lang: php
|
|
240
|
+
|
|
241
|
+
chunkers:
|
|
242
|
+
- ext: toml # use a custom chunker for .toml files
|
|
243
|
+
module: example_toml_chunker:toml_chunker
|
|
240
244
|
```
|
|
241
245
|
|
|
242
246
|
> `.cocoindex_code/` is automatically added to `.gitignore` during init.
|
|
243
247
|
|
|
248
|
+
Use `chunkers` when you want to control how a file type is split into chunks before indexing.
|
|
249
|
+
|
|
250
|
+
`module: example_toml_chunker:toml_chunker` means:
|
|
251
|
+
- `example_toml_chunker` is a local Python module
|
|
252
|
+
- `toml_chunker` is the function inside that module
|
|
253
|
+
|
|
254
|
+
In practice, this usually means:
|
|
255
|
+
- you create a Python file in your project, for example `example_toml_chunker.py`
|
|
256
|
+
- you add a function in that file
|
|
257
|
+
- you point `settings.yml` at it with `module.path:function_name`
|
|
258
|
+
|
|
259
|
+
The function should use this signature:
|
|
260
|
+
|
|
261
|
+
```python
|
|
262
|
+
from pathlib import Path
|
|
263
|
+
from cocoindex_code.chunking import Chunk
|
|
264
|
+
|
|
265
|
+
def my_chunker(path: Path, content: str) -> tuple[str | None, list[Chunk]]:
|
|
266
|
+
...
|
|
267
|
+
```
|
|
268
|
+
|
|
269
|
+
- `path` is the file being indexed
|
|
270
|
+
- `content` is the full text of that file
|
|
271
|
+
- return `language_override` as a string like `"toml"` if you want to override language detection
|
|
272
|
+
- return `None` as `language_override` if you want to keep the detected language
|
|
273
|
+
- return a `list[Chunk]` with the chunks you want stored in the index
|
|
274
|
+
|
|
275
|
+
See [`src/cocoindex_code/chunking.py`](./src/cocoindex_code/chunking.py) for the public types and [`tests/example_toml_chunker.py`](./tests/example_toml_chunker.py) for a complete example.
|
|
276
|
+
|
|
244
277
|
## Embedding Models
|
|
245
278
|
|
|
246
279
|
By default, a local SentenceTransformers model ([sentence-transformers/all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2)) is used — no API key required. To use a different model, edit `~/.cocoindex_code/global_settings.yml`.
|
|
@@ -418,6 +451,26 @@ embedding:
|
|
|
418
451
|
| xml | | `.xml` |
|
|
419
452
|
| yaml | | `.yaml`, `.yml` |
|
|
420
453
|
|
|
454
|
+
### Custom Database Location
|
|
455
|
+
|
|
456
|
+
By default, index databases (`cocoindex.db` and `target_sqlite.db`) live alongside settings in `<project>/.cocoindex_code/`. When running in Docker, you may want the databases on the container's native filesystem for performance (LMDB doesn't work well on mounted volumes) while keeping the source code and settings on a mounted volume.
|
|
457
|
+
|
|
458
|
+
Set `COCOINDEX_CODE_DB_PATH_MAPPING` to remap database locations by path prefix:
|
|
459
|
+
|
|
460
|
+
```bash
|
|
461
|
+
COCOINDEX_CODE_DB_PATH_MAPPING=/workspace=/db-files
|
|
462
|
+
```
|
|
463
|
+
|
|
464
|
+
With this mapping, a project at `/workspace/myrepo` stores its databases in `/db-files/myrepo/` instead of `/workspace/myrepo/.cocoindex_code/`. Settings files remain in the original location.
|
|
465
|
+
|
|
466
|
+
Multiple mappings are comma-separated and resolved in order (first match wins):
|
|
467
|
+
|
|
468
|
+
```bash
|
|
469
|
+
COCOINDEX_CODE_DB_PATH_MAPPING=/workspace=/db-files,/workspace2=/db-files2
|
|
470
|
+
```
|
|
471
|
+
|
|
472
|
+
Both source and target must be absolute paths. If no mapping matches, the default location is used.
|
|
473
|
+
|
|
421
474
|
## Troubleshooting
|
|
422
475
|
|
|
423
476
|
Run `ccc doctor` to diagnose common issues. It checks your settings, daemon health, embedding model, file matching, and index status — all in one command.
|
|
@@ -462,6 +515,19 @@ If you previously configured `cocoindex-code` via environment variables, the `co
|
|
|
462
515
|
|
|
463
516
|
If you need help with remote setup, please email our maintainer linghua@cocoindex.io, happy to help!
|
|
464
517
|
|
|
518
|
+
## Contributing
|
|
519
|
+
|
|
520
|
+
We welcome contributions! Before you start, please install the [pre-commit](https://pre-commit.com/) hooks so that linting, formatting, type checking, and tests run automatically before each commit:
|
|
521
|
+
|
|
522
|
+
```bash
|
|
523
|
+
pip install pre-commit
|
|
524
|
+
pre-commit install
|
|
525
|
+
```
|
|
526
|
+
|
|
527
|
+
This catches common issues — trailing whitespace, lint errors (Ruff), type errors (mypy), and test failures — before they reach CI.
|
|
528
|
+
|
|
529
|
+
For more details, see our [contributing guide](https://cocoindex.io/docs/contributing/guide).
|
|
530
|
+
|
|
465
531
|
## License
|
|
466
532
|
|
|
467
533
|
Apache-2.0
|
|
@@ -91,9 +91,11 @@ select = ["E", "F", "I", "N", "W", "UP"]
|
|
|
91
91
|
python_version = "3.11"
|
|
92
92
|
strict = true
|
|
93
93
|
ignore_missing_imports = true
|
|
94
|
+
explicit_package_bases = true
|
|
94
95
|
|
|
95
96
|
[tool.pytest.ini_options]
|
|
96
97
|
testpaths = ["tests"]
|
|
97
98
|
python_files = ["test_*.py"]
|
|
98
99
|
python_functions = ["test_*"]
|
|
99
100
|
addopts = "-v --tb=short"
|
|
101
|
+
asyncio_mode = "auto"
|
|
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
|
|
|
28
28
|
commit_id: COMMIT_ID
|
|
29
29
|
__commit_id__: COMMIT_ID
|
|
30
30
|
|
|
31
|
-
__version__ = version = '0.2.7'
|
|
32
|
-
__version_tuple__ = version_tuple = (0, 2, 7)
|
|
31
|
+
__version__ = version = '0.2.9'
|
|
32
|
+
__version_tuple__ = version_tuple = (0, 2, 9)
|
|
33
33
|
|
|
34
34
|
__commit_id__ = commit_id = None
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
"""Public API for writing custom chunkers.
|
|
2
|
+
|
|
3
|
+
Example usage::
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from cocoindex_code.chunking import Chunk, ChunkerFn, TextPosition
|
|
7
|
+
|
|
8
|
+
def my_chunker(path: Path, content: str) -> tuple[str | None, list[Chunk]]:
|
|
9
|
+
pos = TextPosition(byte_offset=0, char_offset=0, line=1, column=0)
|
|
10
|
+
return "mylang", [Chunk(text=content, start=pos, end=pos)]
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import pathlib as _pathlib
|
|
16
|
+
from collections.abc import Callable as _Callable
|
|
17
|
+
|
|
18
|
+
import cocoindex as _coco
|
|
19
|
+
from cocoindex.resources.chunk import Chunk, TextPosition
|
|
20
|
+
|
|
21
|
+
# Callable alias (not Protocol) — consistent with codebase style.
|
|
22
|
+
# language_override=None keeps the language detected by detect_code_language.
|
|
23
|
+
# path is not resolved (no syscall); call path.resolve() inside the chunker if needed.
|
|
24
|
+
ChunkerFn = _Callable[[_pathlib.Path, str], tuple[str | None, list[Chunk]]]
|
|
25
|
+
|
|
26
|
+
# tracked=False: callables are not fingerprint-able; daemon restart re-indexes anyway.
|
|
27
|
+
CHUNKER_REGISTRY = _coco.ContextKey[dict[str, ChunkerFn]]("chunker_registry", tracked=False)
|
|
28
|
+
|
|
29
|
+
__all__ = ["Chunk", "ChunkerFn", "CHUNKER_REGISTRY", "TextPosition"]
|
|
@@ -12,12 +12,16 @@ import typer as _typer
|
|
|
12
12
|
from .client import DaemonStartError
|
|
13
13
|
from .protocol import DoctorCheckResult, IndexingProgress, ProjectStatusResponse, SearchResponse
|
|
14
14
|
from .settings import (
|
|
15
|
+
cocoindex_db_path,
|
|
15
16
|
default_project_settings,
|
|
16
17
|
default_user_settings,
|
|
17
18
|
find_parent_with_marker,
|
|
18
19
|
find_project_root,
|
|
20
|
+
project_settings_path,
|
|
21
|
+
resolve_db_dir,
|
|
19
22
|
save_project_settings,
|
|
20
23
|
save_user_settings,
|
|
24
|
+
target_sqlite_db_path,
|
|
21
25
|
user_settings_path,
|
|
22
26
|
)
|
|
23
27
|
|
|
@@ -283,10 +287,8 @@ def init(
|
|
|
283
287
|
force: bool = _typer.Option(False, "-f", "--force", help="Skip parent directory warning"),
|
|
284
288
|
) -> None:
|
|
285
289
|
"""Initialize a project for cocoindex-code."""
|
|
286
|
-
from .settings import project_settings_path as _project_settings_path
|
|
287
|
-
|
|
288
290
|
cwd = Path.cwd().resolve()
|
|
289
|
-
settings_file = _project_settings_path(cwd)
|
|
291
|
+
settings_file = project_settings_path(cwd)
|
|
290
292
|
|
|
291
293
|
# Always ensure user settings exist
|
|
292
294
|
user_path = user_settings_path()
|
|
@@ -376,8 +378,15 @@ def status() -> None:
|
|
|
376
378
|
"""Show project status."""
|
|
377
379
|
from . import client as _client
|
|
378
380
|
|
|
379
|
-
|
|
381
|
+
project_root_path = require_project_root()
|
|
382
|
+
project_root = str(project_root_path)
|
|
380
383
|
print_project_header(project_root)
|
|
384
|
+
|
|
385
|
+
_typer.echo(f"Settings: {project_settings_path(project_root_path)}")
|
|
386
|
+
db_path = target_sqlite_db_path(project_root_path)
|
|
387
|
+
if db_path.exists():
|
|
388
|
+
_typer.echo(f"Index DB: {db_path}")
|
|
389
|
+
|
|
381
390
|
print_index_stats(_client.project_status(project_root))
|
|
382
391
|
|
|
383
392
|
|
|
@@ -389,12 +398,13 @@ def reset(
|
|
|
389
398
|
"""Reset project databases and optionally remove settings."""
|
|
390
399
|
project_root = require_project_root()
|
|
391
400
|
cocoindex_dir = project_root / ".cocoindex_code"
|
|
401
|
+
db_dir = resolve_db_dir(project_root)
|
|
392
402
|
|
|
393
403
|
db_files = [
|
|
394
|
-
|
|
395
|
-
|
|
404
|
+
cocoindex_db_path(project_root),
|
|
405
|
+
target_sqlite_db_path(project_root),
|
|
396
406
|
]
|
|
397
|
-
settings_file =
|
|
407
|
+
settings_file = project_settings_path(project_root)
|
|
398
408
|
|
|
399
409
|
# Determine what will be deleted
|
|
400
410
|
to_delete = [f for f in db_files if f.exists()]
|
|
@@ -436,6 +446,12 @@ def reset(
|
|
|
436
446
|
f.unlink(missing_ok=True)
|
|
437
447
|
|
|
438
448
|
if all_:
|
|
449
|
+
# Remove db_dir if empty and different from cocoindex_dir
|
|
450
|
+
if db_dir != cocoindex_dir:
|
|
451
|
+
try:
|
|
452
|
+
db_dir.rmdir()
|
|
453
|
+
except OSError:
|
|
454
|
+
pass # Not empty or doesn't exist
|
|
439
455
|
# Remove .cocoindex_code/ if empty
|
|
440
456
|
try:
|
|
441
457
|
cocoindex_dir.rmdir()
|
|
@@ -495,16 +511,10 @@ def doctor() -> None:
|
|
|
495
511
|
from .settings import (
|
|
496
512
|
load_user_settings as _load_user_settings,
|
|
497
513
|
)
|
|
498
|
-
from .settings import (
|
|
499
|
-
project_settings_path as _project_settings_path,
|
|
500
|
-
)
|
|
501
|
-
from .settings import (
|
|
502
|
-
user_settings_path as _user_settings_path,
|
|
503
|
-
)
|
|
504
514
|
|
|
505
515
|
# --- 1. Global settings (local, no daemon needed) ---
|
|
506
516
|
_print_section("Global Settings")
|
|
507
|
-
settings_path = _user_settings_path()
|
|
517
|
+
settings_path = user_settings_path()
|
|
508
518
|
_typer.echo(f" Settings: {settings_path}")
|
|
509
519
|
try:
|
|
510
520
|
user_settings = _load_user_settings()
|
|
@@ -539,6 +549,10 @@ def doctor() -> None:
|
|
|
539
549
|
other_keys = [k for k in env_resp.env_names if k not in settings_keys]
|
|
540
550
|
if other_keys:
|
|
541
551
|
_typer.echo(f" Other env vars in daemon: {', '.join(sorted(other_keys))}")
|
|
552
|
+
if env_resp.db_path_mappings:
|
|
553
|
+
_typer.echo(" DB path mappings:")
|
|
554
|
+
for m in env_resp.db_path_mappings:
|
|
555
|
+
_typer.echo(f" {m.source} \u2192 {m.target}")
|
|
542
556
|
except Exception as e:
|
|
543
557
|
_print_error(f"Failed to get daemon env: {e}")
|
|
544
558
|
|
|
@@ -558,7 +572,7 @@ def doctor() -> None:
|
|
|
558
572
|
# --- 6. Project settings (local, no daemon needed) ---
|
|
559
573
|
if project_root is not None:
|
|
560
574
|
_print_section("Project Settings")
|
|
561
|
-
ps_path = _project_settings_path(project_root)
|
|
575
|
+
ps_path = project_settings_path(project_root)
|
|
562
576
|
_typer.echo(f" Settings: {ps_path}")
|
|
563
577
|
try:
|
|
564
578
|
ps = _load_project_settings(project_root)
|
|
@@ -585,10 +599,9 @@ def doctor() -> None:
|
|
|
585
599
|
|
|
586
600
|
# --- 8. Log files ---
|
|
587
601
|
_print_section("Log Files")
|
|
588
|
-
from .daemon import
|
|
602
|
+
from .daemon import daemon_log_path as _daemon_log_path
|
|
589
603
|
|
|
590
|
-
|
|
591
|
-
_typer.echo(f" Daemon logs: {log_dir / 'daemon.log'}")
|
|
604
|
+
_typer.echo(f" Daemon logs: {_daemon_log_path()}")
|
|
592
605
|
_typer.echo(" Check logs above for further troubleshooting.")
|
|
593
606
|
|
|
594
607
|
|
|
@@ -343,10 +343,10 @@ def start_daemon() -> subprocess.Popen[bytes]:
|
|
|
343
343
|
Returns the ``Popen`` object so callers can detect early process death
|
|
344
344
|
(via ``proc.poll()``) instead of waiting for a full timeout.
|
|
345
345
|
"""
|
|
346
|
-
from .daemon import daemon_dir
|
|
346
|
+
from .daemon import daemon_dir, daemon_log_path
|
|
347
347
|
|
|
348
348
|
daemon_dir().mkdir(parents=True, exist_ok=True)
|
|
349
|
-
log_path =
|
|
349
|
+
log_path = daemon_log_path()
|
|
350
350
|
|
|
351
351
|
ccc_path = _find_ccc_executable()
|
|
352
352
|
if ccc_path:
|
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
5
|
import asyncio
|
|
6
|
+
import importlib
|
|
6
7
|
import logging
|
|
7
8
|
import os
|
|
8
9
|
import signal
|
|
@@ -15,6 +16,7 @@ from pathlib import Path
|
|
|
15
16
|
from typing import Any
|
|
16
17
|
|
|
17
18
|
from ._version import __version__
|
|
19
|
+
from .chunking import ChunkerFn as _ChunkerFn
|
|
18
20
|
from .project import Project
|
|
19
21
|
from .protocol import (
|
|
20
22
|
DaemonEnvRequest,
|
|
@@ -46,14 +48,37 @@ from .protocol import (
|
|
|
46
48
|
encode_response,
|
|
47
49
|
)
|
|
48
50
|
from .settings import (
|
|
51
|
+
ChunkerMapping,
|
|
49
52
|
global_settings_mtime_us,
|
|
53
|
+
load_project_settings,
|
|
50
54
|
load_user_settings,
|
|
55
|
+
target_sqlite_db_path,
|
|
51
56
|
user_settings_dir,
|
|
52
57
|
)
|
|
53
58
|
from .shared import Embedder, create_embedder
|
|
54
59
|
|
|
55
60
|
logger = logging.getLogger(__name__)
|
|
56
61
|
|
|
62
|
+
|
|
63
|
+
def _resolve_chunker_registry(mappings: list[ChunkerMapping]) -> dict[str, _ChunkerFn]:
|
|
64
|
+
"""Resolve ``ChunkerMapping`` settings entries to a ``{suffix: fn}`` dict.
|
|
65
|
+
|
|
66
|
+
Each ``mapping.module`` must be a ``"module.path:callable"`` string importable
|
|
67
|
+
from the current environment.
|
|
68
|
+
"""
|
|
69
|
+
registry: dict[str, _ChunkerFn] = {}
|
|
70
|
+
for cm in mappings:
|
|
71
|
+
module_path, _, attr = cm.module.partition(":")
|
|
72
|
+
if not attr:
|
|
73
|
+
raise ValueError(f"chunker module {cm.module!r} must use 'module.path:callable' format")
|
|
74
|
+
mod = importlib.import_module(module_path)
|
|
75
|
+
fn = getattr(mod, attr)
|
|
76
|
+
if not callable(fn):
|
|
77
|
+
raise ValueError(f"chunker {cm.module!r}: {attr!r} is not callable")
|
|
78
|
+
registry[f".{cm.ext}"] = fn
|
|
79
|
+
return registry
|
|
80
|
+
|
|
81
|
+
|
|
57
82
|
# ---------------------------------------------------------------------------
|
|
58
83
|
# Daemon paths
|
|
59
84
|
# ---------------------------------------------------------------------------
|
|
@@ -110,7 +135,9 @@ class ProjectRegistry:
|
|
|
110
135
|
"""Get or create a Project for the given root. Lazy initialization."""
|
|
111
136
|
if project_root not in self._projects:
|
|
112
137
|
root = Path(project_root)
|
|
113
|
-
|
|
138
|
+
project_settings = load_project_settings(root)
|
|
139
|
+
chunker_registry = _resolve_chunker_registry(project_settings.chunkers)
|
|
140
|
+
project = await Project.create(root, self._embedder, chunker_registry=chunker_registry)
|
|
114
141
|
self._projects[project_root] = project
|
|
115
142
|
return self._projects[project_root]
|
|
116
143
|
|
|
@@ -345,7 +372,7 @@ async def _check_index_status(project_root_str: str) -> DoctorCheckResult:
|
|
|
345
372
|
from cocoindex.connectors import sqlite as coco_sqlite
|
|
346
373
|
|
|
347
374
|
project_root = Path(project_root_str)
|
|
348
|
-
db_path = project_root
|
|
375
|
+
db_path = target_sqlite_db_path(project_root)
|
|
349
376
|
details = [f"Index: {db_path}"]
|
|
350
377
|
|
|
351
378
|
if not db_path.exists():
|
|
@@ -441,9 +468,16 @@ async def _dispatch(
|
|
|
441
468
|
return StopResponse(ok=True)
|
|
442
469
|
|
|
443
470
|
if isinstance(req, DaemonEnvRequest):
|
|
471
|
+
from .protocol import DbPathMappingEntry
|
|
472
|
+
from .settings import get_db_path_mappings
|
|
473
|
+
|
|
444
474
|
return DaemonEnvResponse(
|
|
445
475
|
env_names=sorted(os.environ.keys()),
|
|
446
476
|
settings_env_names=settings_env_names,
|
|
477
|
+
db_path_mappings=[
|
|
478
|
+
DbPathMappingEntry(source=str(m.source), target=str(m.target))
|
|
479
|
+
for m in get_db_path_mappings()
|
|
480
|
+
],
|
|
447
481
|
)
|
|
448
482
|
|
|
449
483
|
if isinstance(req, DoctorRequest):
|
|
@@ -14,6 +14,7 @@ from cocoindex.resources.file import FilePathMatcher, PatternFilePathMatcher
|
|
|
14
14
|
from cocoindex.resources.id import IdGenerator
|
|
15
15
|
from pathspec import GitIgnoreSpec
|
|
16
16
|
|
|
17
|
+
from .chunking import CHUNKER_REGISTRY
|
|
17
18
|
from .settings import load_gitignore_spec, load_project_settings
|
|
18
19
|
from .shared import (
|
|
19
20
|
CODEBASE_DIR,
|
|
@@ -158,13 +159,20 @@ async def process_file(
|
|
|
158
159
|
or "text"
|
|
159
160
|
)
|
|
160
161
|
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
162
|
+
chunker_registry = coco.use_context(CHUNKER_REGISTRY)
|
|
163
|
+
chunker = chunker_registry.get(suffix)
|
|
164
|
+
if chunker is not None:
|
|
165
|
+
language_override, chunks = chunker(Path(file.file_path.path), content)
|
|
166
|
+
if language_override is not None:
|
|
167
|
+
language = language_override
|
|
168
|
+
else:
|
|
169
|
+
chunks = splitter.split(
|
|
170
|
+
content,
|
|
171
|
+
chunk_size=CHUNK_SIZE,
|
|
172
|
+
min_chunk_size=MIN_CHUNK_SIZE,
|
|
173
|
+
chunk_overlap=CHUNK_OVERLAP,
|
|
174
|
+
language=language,
|
|
175
|
+
)
|
|
168
176
|
|
|
169
177
|
id_gen = IdGenerator()
|
|
170
178
|
|
|
@@ -10,6 +10,7 @@ from pathlib import Path
|
|
|
10
10
|
import cocoindex as coco
|
|
11
11
|
from cocoindex.connectors import sqlite as coco_sqlite
|
|
12
12
|
|
|
13
|
+
from .chunking import CHUNKER_REGISTRY, ChunkerFn
|
|
13
14
|
from .indexer import indexer_main
|
|
14
15
|
from .protocol import (
|
|
15
16
|
IndexingProgress,
|
|
@@ -21,6 +22,15 @@ from .protocol import (
|
|
|
21
22
|
SearchResult,
|
|
22
23
|
)
|
|
23
24
|
from .query import query_codebase
|
|
25
|
+
from .settings import (
|
|
26
|
+
cocoindex_db_path as _cocoindex_db_path,
|
|
27
|
+
)
|
|
28
|
+
from .settings import (
|
|
29
|
+
resolve_db_dir,
|
|
30
|
+
)
|
|
31
|
+
from .settings import (
|
|
32
|
+
target_sqlite_db_path as _target_sqlite_db_path,
|
|
33
|
+
)
|
|
24
34
|
from .shared import (
|
|
25
35
|
CODEBASE_DIR,
|
|
26
36
|
EMBEDDER,
|
|
@@ -170,7 +180,7 @@ class Project:
|
|
|
170
180
|
offset: int = 0,
|
|
171
181
|
) -> list[SearchResult]:
|
|
172
182
|
"""Search within this project."""
|
|
173
|
-
target_db = self._project_root
|
|
183
|
+
target_db = _target_sqlite_db_path(self._project_root)
|
|
174
184
|
results = await query_codebase(
|
|
175
185
|
query=query,
|
|
176
186
|
target_sqlite_db_path=target_db,
|
|
@@ -247,25 +257,37 @@ class Project:
|
|
|
247
257
|
async def create(
|
|
248
258
|
project_root: Path,
|
|
249
259
|
embedder: Embedder,
|
|
260
|
+
chunker_registry: dict[str, ChunkerFn] | None = None,
|
|
250
261
|
) -> Project:
|
|
251
262
|
"""Create a project with explicit embedder.
|
|
252
263
|
|
|
253
264
|
Project-level settings and .gitignore are NOT cached here — the
|
|
254
265
|
indexer loads them fresh from disk on every run so that user edits
|
|
255
266
|
take effect without restarting the daemon.
|
|
267
|
+
|
|
268
|
+
Args:
|
|
269
|
+
project_root: Root directory of the codebase to index.
|
|
270
|
+
embedder: Embedding model instance.
|
|
271
|
+
chunker_registry: Optional mapping of file suffix (e.g. ``".toml"``)
|
|
272
|
+
to a ``ChunkerFn``. When a suffix matches, the registered
|
|
273
|
+
chunker is called instead of the built-in splitter.
|
|
256
274
|
"""
|
|
257
|
-
|
|
258
|
-
|
|
275
|
+
settings_dir = project_root / ".cocoindex_code"
|
|
276
|
+
settings_dir.mkdir(parents=True, exist_ok=True)
|
|
277
|
+
|
|
278
|
+
db_dir = resolve_db_dir(project_root)
|
|
279
|
+
db_dir.mkdir(parents=True, exist_ok=True)
|
|
259
280
|
|
|
260
|
-
|
|
261
|
-
|
|
281
|
+
cocoindex_db = _cocoindex_db_path(project_root)
|
|
282
|
+
target_sqlite_db = _target_sqlite_db_path(project_root)
|
|
262
283
|
|
|
263
|
-
settings = coco.Settings.from_env(
|
|
284
|
+
settings = coco.Settings.from_env(cocoindex_db)
|
|
264
285
|
|
|
265
286
|
context = coco.ContextProvider()
|
|
266
287
|
context.provide(CODEBASE_DIR, project_root)
|
|
267
|
-
context.provide(SQLITE_DB, coco_sqlite.connect(str(
|
|
288
|
+
context.provide(SQLITE_DB, coco_sqlite.connect(str(target_sqlite_db), load_vec=True))
|
|
268
289
|
context.provide(EMBEDDER, embedder)
|
|
290
|
+
context.provide(CHUNKER_REGISTRY, dict(chunker_registry) if chunker_registry else {})
|
|
269
291
|
|
|
270
292
|
env = coco.Environment(settings, context_provider=context)
|
|
271
293
|
app = coco.App(
|
|
@@ -158,9 +158,15 @@ class DoctorResponse(_msgspec.Struct, tag="doctor"):
|
|
|
158
158
|
final: bool = False
|
|
159
159
|
|
|
160
160
|
|
|
161
|
+
class DbPathMappingEntry(_msgspec.Struct):
|
|
162
|
+
source: str
|
|
163
|
+
target: str
|
|
164
|
+
|
|
165
|
+
|
|
161
166
|
class DaemonEnvResponse(_msgspec.Struct, tag="daemon_env"):
|
|
162
167
|
env_names: list[str]
|
|
163
168
|
settings_env_names: list[str]
|
|
169
|
+
db_path_mappings: list[DbPathMappingEntry] = []
|
|
164
170
|
|
|
165
171
|
|
|
166
172
|
class ErrorResponse(_msgspec.Struct, tag="error"):
|
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
|
+
import os
|
|
5
6
|
from dataclasses import dataclass, field
|
|
6
7
|
from pathlib import Path
|
|
7
8
|
from typing import Any
|
|
@@ -44,6 +45,29 @@ DEFAULT_INCLUDED_PATTERNS: list[str] = [
|
|
|
44
45
|
"**/*.rst", # reStructuredText
|
|
45
46
|
"**/*.php", # PHP
|
|
46
47
|
"**/*.lua", # Lua
|
|
48
|
+
"**/*.rb", # Ruby
|
|
49
|
+
"**/*.swift", # Swift
|
|
50
|
+
"**/*.kt", # Kotlin
|
|
51
|
+
"**/*.kts", # Kotlin script
|
|
52
|
+
"**/*.scala", # Scala
|
|
53
|
+
"**/*.r", # R
|
|
54
|
+
"**/*.html", # HTML
|
|
55
|
+
"**/*.htm", # HTML
|
|
56
|
+
"**/*.css", # CSS
|
|
57
|
+
"**/*.scss", # SCSS
|
|
58
|
+
"**/*.json", # JSON
|
|
59
|
+
"**/*.xml", # XML
|
|
60
|
+
"**/*.yaml", # YAML
|
|
61
|
+
"**/*.yml", # YAML
|
|
62
|
+
"**/*.toml", # TOML
|
|
63
|
+
"**/*.sol", # Solidity
|
|
64
|
+
"**/*.pas", # Pascal
|
|
65
|
+
"**/*.dpr", # Pascal/Delphi
|
|
66
|
+
"**/*.dtd", # DTD
|
|
67
|
+
"**/*.f", # Fortran
|
|
68
|
+
"**/*.f90", # Fortran
|
|
69
|
+
"**/*.f95", # Fortran
|
|
70
|
+
"**/*.f03", # Fortran
|
|
47
71
|
]
|
|
48
72
|
|
|
49
73
|
DEFAULT_EXCLUDED_PATTERNS: list[str] = [
|
|
@@ -82,11 +106,18 @@ class LanguageOverride:
|
|
|
82
106
|
lang: str # e.g. "php"
|
|
83
107
|
|
|
84
108
|
|
|
109
|
+
@dataclass
|
|
110
|
+
class ChunkerMapping:
|
|
111
|
+
ext: str # without dot, e.g. "toml"
|
|
112
|
+
module: str # "module.path:callable", e.g. "cocoindex_code.toml_chunker:toml_chunker"
|
|
113
|
+
|
|
114
|
+
|
|
85
115
|
@dataclass
|
|
86
116
|
class ProjectSettings:
|
|
87
117
|
include_patterns: list[str] = field(default_factory=lambda: list(DEFAULT_INCLUDED_PATTERNS))
|
|
88
118
|
exclude_patterns: list[str] = field(default_factory=lambda: list(DEFAULT_EXCLUDED_PATTERNS))
|
|
89
119
|
language_overrides: list[LanguageOverride] = field(default_factory=list)
|
|
120
|
+
chunkers: list[ChunkerMapping] = field(default_factory=list)
|
|
90
121
|
|
|
91
122
|
|
|
92
123
|
# ---------------------------------------------------------------------------
|
|
@@ -115,14 +146,103 @@ _SETTINGS_DIR_NAME = ".cocoindex_code"
|
|
|
115
146
|
_SETTINGS_FILE_NAME = "settings.yml" # project-level
|
|
116
147
|
_USER_SETTINGS_FILE_NAME = "global_settings.yml" # user-level
|
|
117
148
|
|
|
149
|
+
_ENV_DB_PATH_MAPPING = "COCOINDEX_CODE_DB_PATH_MAPPING"
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
@dataclass
|
|
153
|
+
class DbPathMapping:
|
|
154
|
+
source: Path
|
|
155
|
+
target: Path
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
_db_path_mapping: list[DbPathMapping] | None = None
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def _parse_db_path_mapping() -> list[DbPathMapping]:
    """Parse the ``COCOINDEX_CODE_DB_PATH_MAPPING`` env var into mappings.

    Format: ``/src1=/dst1,/src2=/dst2``. Empty entries (for example from a
    trailing comma) are skipped. Whitespace around each entry and around the
    ``=`` separator is ignored. Both source and target must be absolute
    paths; they are stored in resolved form.

    Returns:
        The mappings in the order they appear in the variable, or an empty
        list when the variable is unset or blank.

    Raises:
        ValueError: If an entry is not of the form ``source=target`` or if
            either side is not an absolute path.
    """
    raw = os.environ.get(_ENV_DB_PATH_MAPPING, "")
    if not raw.strip():
        return []

    mappings: list[DbPathMapping] = []
    for entry in raw.split(","):
        entry = entry.strip()
        if not entry:
            continue
        # Split on the first "=" only, so a target may itself contain "=".
        source_text, separator, target_text = entry.partition("=")
        # Accept "/src = /dst": without stripping, the stray spaces would become
        # part of the source path and make the target look relative.
        source_text = source_text.strip()
        target_text = target_text.strip()
        if not separator or not source_text or not target_text:
            raise ValueError(
                f"{_ENV_DB_PATH_MAPPING}: invalid entry {entry!r}, expected format 'source=target'"
            )
        source = Path(source_text)
        target = Path(target_text)
        if not source.is_absolute():
            raise ValueError(
                f"{_ENV_DB_PATH_MAPPING}: source path must be absolute, got {source!r}"
            )
        if not target.is_absolute():
            raise ValueError(
                f"{_ENV_DB_PATH_MAPPING}: target path must be absolute, got {target!r}"
            )
        mappings.append(DbPathMapping(source=source.resolve(), target=target.resolve()))
    return mappings
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
def resolve_db_dir(project_root: Path) -> Path:
    """Return the directory that holds the database files of a project.

    The resolved project root is checked against each entry of
    ``COCOINDEX_CODE_DB_PATH_MAPPING`` in order. For the first entry whose
    source equals the root or is one of its ancestors, the source prefix is
    replaced by that entry's target. When no entry matches, the result is
    ``project_root / ".cocoindex_code"``.
    """
    global _db_path_mapping  # noqa: PLW0603
    if _db_path_mapping is None:
        # Parse the env var lazily, once, on first use.
        _db_path_mapping = _parse_db_path_mapping()

    absolute_root = project_root.resolve()
    for entry in _db_path_mapping:
        try:
            suffix = absolute_root.relative_to(entry.source)
        except ValueError:
            # The project does not live under this entry's source.
            continue
        return entry.target / suffix
    return project_root / _SETTINGS_DIR_NAME
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
def get_db_path_mappings() -> list[DbPathMapping]:
    """Return the DB path mappings parsed from ``COCOINDEX_CODE_DB_PATH_MAPPING``.

    The env var is parsed on first use and the result is cached. A new list is
    returned on every call, so callers cannot alter the cached list itself.
    """
    global _db_path_mapping  # noqa: PLW0603
    if _db_path_mapping is None:
        _db_path_mapping = _parse_db_path_mapping()
    return [*_db_path_mapping]
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
def _reset_db_path_mapping_cache() -> None:
    """Reset the cached mapping (for tests).

    Sets the module-level cache back to ``None``, the "not parsed yet" state.
    """
    global _db_path_mapping  # noqa: PLW0603
    _db_path_mapping = None
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
# File names of the two databases kept in a project's DB directory.
_TARGET_SQLITE_DB_NAME = "target_sqlite.db"  # vector index SQLite database
_COCOINDEX_DB_NAME = "cocoindex.db"  # CocoIndex state database
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
def target_sqlite_db_path(project_root: Path) -> Path:
    """Return the location of the project's vector index SQLite database.

    The file lives in the directory chosen by ``resolve_db_dir``.
    """
    db_dir = resolve_db_dir(project_root)
    return db_dir / _TARGET_SQLITE_DB_NAME
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
def cocoindex_db_path(project_root: Path) -> Path:
    """Return the location of the project's CocoIndex state database.

    The file lives in the directory chosen by ``resolve_db_dir``.
    """
    db_dir = resolve_db_dir(project_root)
    return db_dir / _COCOINDEX_DB_NAME
|
|
239
|
+
|
|
118
240
|
|
|
119
241
|
def user_settings_dir() -> Path:
|
|
120
242
|
"""Return ``~/.cocoindex_code/``.
|
|
121
243
|
|
|
122
244
|
Respects ``COCOINDEX_CODE_DIR`` env var for overriding the base directory.
|
|
123
245
|
"""
|
|
124
|
-
import os
|
|
125
|
-
|
|
126
246
|
override = os.environ.get("COCOINDEX_CODE_DIR")
|
|
127
247
|
if override:
|
|
128
248
|
return Path(override)
|
|
@@ -162,7 +282,7 @@ def find_legacy_project_root(start: Path) -> Path | None:
|
|
|
162
282
|
"""
|
|
163
283
|
current = start.resolve()
|
|
164
284
|
while True:
|
|
165
|
-
if (current / _SETTINGS_DIR_NAME /
|
|
285
|
+
if (current / _SETTINGS_DIR_NAME / _COCOINDEX_DB_NAME).exists():
|
|
166
286
|
return current
|
|
167
287
|
parent = current.parent
|
|
168
288
|
if parent == current:
|
|
@@ -261,6 +381,8 @@ def _project_settings_to_dict(settings: ProjectSettings) -> dict[str, Any]:
|
|
|
261
381
|
d["language_overrides"] = [
|
|
262
382
|
{"ext": lo.ext, "lang": lo.lang} for lo in settings.language_overrides
|
|
263
383
|
]
|
|
384
|
+
if settings.chunkers:
|
|
385
|
+
d["chunkers"] = [{"ext": cm.ext, "module": cm.module} for cm in settings.chunkers]
|
|
264
386
|
return d
|
|
265
387
|
|
|
266
388
|
|
|
@@ -268,10 +390,12 @@ def _project_settings_from_dict(d: dict[str, Any]) -> ProjectSettings:
|
|
|
268
390
|
overrides = [
|
|
269
391
|
LanguageOverride(ext=lo["ext"], lang=lo["lang"]) for lo in d.get("language_overrides", [])
|
|
270
392
|
]
|
|
393
|
+
chunkers = [ChunkerMapping(ext=cm["ext"], module=cm["module"]) for cm in d.get("chunkers", [])]
|
|
271
394
|
return ProjectSettings(
|
|
272
395
|
include_patterns=d.get("include_patterns", list(DEFAULT_INCLUDED_PATTERNS)),
|
|
273
396
|
exclude_patterns=d.get("exclude_patterns", list(DEFAULT_EXCLUDED_PATTERNS)),
|
|
274
397
|
language_overrides=overrides,
|
|
398
|
+
chunkers=chunkers,
|
|
275
399
|
)
|
|
276
400
|
|
|
277
401
|
|
|
@@ -1,142 +0,0 @@
|
|
|
1
|
-
"""Configuration management for cocoindex-code."""
|
|
2
|
-
|
|
3
|
-
from __future__ import annotations
|
|
4
|
-
|
|
5
|
-
import json
|
|
6
|
-
import os
|
|
7
|
-
from dataclasses import dataclass
|
|
8
|
-
from pathlib import Path
|
|
9
|
-
|
|
10
|
-
# Embedding model used when COCOINDEX_CODE_EMBEDDING_MODEL is not set.
_DEFAULT_MODEL = "sbert/sentence-transformers/all-MiniLM-L6-v2"
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
def _find_root_with_marker(start: Path, markers: list[str]) -> Path | None:
    """Return the nearest directory, from *start* upwards, holding a marker.

    *start* itself is checked first, then each of its ancestors in turn. A
    directory qualifies when at least one name in *markers* exists inside it.
    Returns ``None`` when no directory on the way up qualifies.
    """
    for candidate in (start, *start.parents):
        for marker in markers:
            if (candidate / marker).exists():
                return candidate
    return None
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
def _discover_codebase_root() -> Path:
    """Work out the codebase root, starting from the current directory.

    Marker groups are tried in order and the first hit wins:
    1. A `.cocoindex_code` directory (re-anchor to a previously-indexed tree)
    2. Any common project root marker
    If neither group matches, the current working directory is returned.
    """
    here = Path.cwd()
    marker_groups = (
        [".cocoindex_code"],
        [".git", "pyproject.toml", "package.json", "Cargo.toml", "go.mod"],
    )
    for group in marker_groups:
        found = _find_root_with_marker(here, group)
        if found is not None:
            return found
    return here
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
def _parse_json_string_list_env(var_name: str) -> list[str]:
    """Read environment variable *var_name* as a JSON array of strings.

    An unset or blank variable yields an empty list. Each item is stripped of
    surrounding whitespace, and items that end up empty are dropped.

    Raises:
        ValueError: If the value is not valid JSON, is not a JSON array, or
            contains an item that is not a string.
    """
    text = os.environ.get(var_name, "")
    if not text.strip():
        return []

    try:
        decoded = json.loads(text)
    except json.JSONDecodeError as exc:
        raise ValueError(f"{var_name} must be a JSON array of strings, got invalid JSON") from exc

    if not isinstance(decoded, list):
        raise ValueError(f"{var_name} must be a JSON array of strings")
    if not all(isinstance(element, str) for element in decoded):
        raise ValueError(f"{var_name} must be a JSON array of strings")

    stripped_items = (element.strip() for element in decoded)
    return [element for element in stripped_items if element]
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
@dataclass
class Config:
    """Settings for cocoindex-code, read from environment variables."""

    codebase_root_path: Path
    embedding_model: str
    index_dir: Path
    device: str | None
    extra_extensions: dict[str, str | None]
    excluded_patterns: list[str]

    @classmethod
    def from_env(cls) -> Config:
        """Build a ``Config`` from the current process environment."""
        # Root: explicit env override, otherwise discovered from the cwd.
        explicit_root = os.environ.get("COCOINDEX_CODE_ROOT_PATH")
        root = Path(explicit_root).resolve() if explicit_root else _discover_codebase_root()

        # Prefix "sbert/" for SentenceTransformers models, otherwise LiteLLM.
        embedding_model = os.environ.get("COCOINDEX_CODE_EMBEDDING_MODEL", _DEFAULT_MODEL)

        # Left as None when COCOINDEX_CODE_DEVICE is not set.
        device = os.environ.get("COCOINDEX_CODE_DEVICE")

        # Extra file extensions, e.g. "inc:php,yaml,toml"; the language after
        # the colon is optional and stored as None when absent or empty.
        extra_extensions: dict[str, str | None] = {}
        for raw_token in os.environ.get("COCOINDEX_CODE_EXTRA_EXTENSIONS", "").split(","):
            token = raw_token.strip()
            if not token:
                continue
            ext, _, lang = token.partition(":")
            extra_extensions[f".{ext.strip()}"] = lang.strip() or None

        # Excluded file glob patterns.
        excluded_patterns = _parse_json_string_list_env("COCOINDEX_CODE_EXCLUDED_PATTERNS")

        return cls(
            codebase_root_path=root,
            embedding_model=embedding_model,
            # The index directory is always under the root.
            index_dir=root / ".cocoindex_code",
            device=device,
            extra_extensions=extra_extensions,
            excluded_patterns=excluded_patterns,
        )

    @property
    def target_sqlite_db_path(self) -> Path:
        """Location of the vector index SQLite database inside ``index_dir``."""
        return self.index_dir.joinpath("target_sqlite.db")

    @property
    def cocoindex_db_path(self) -> Path:
        """Location of the CocoIndex state database inside ``index_dir``."""
        return self.index_dir.joinpath("cocoindex.db")
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
# Module-level singleton — imported directly by all modules that need configuration.
# It is built when this module is imported, so the environment variables are
# read at that point.
config: Config = Config.from_env()
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|