cocoindex-code 0.2.8__tar.gz → 0.2.10__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (21)
  1. {cocoindex_code-0.2.8 → cocoindex_code-0.2.10}/PKG-INFO +134 -2
  2. {cocoindex_code-0.2.8 → cocoindex_code-0.2.10}/README.md +132 -0
  3. {cocoindex_code-0.2.8 → cocoindex_code-0.2.10}/pyproject.toml +3 -1
  4. {cocoindex_code-0.2.8 → cocoindex_code-0.2.10}/src/cocoindex_code/_version.py +2 -2
  5. cocoindex_code-0.2.10/src/cocoindex_code/chunking.py +29 -0
  6. {cocoindex_code-0.2.8 → cocoindex_code-0.2.10}/src/cocoindex_code/cli.py +19 -18
  7. {cocoindex_code-0.2.8 → cocoindex_code-0.2.10}/src/cocoindex_code/client.py +2 -2
  8. {cocoindex_code-0.2.8 → cocoindex_code-0.2.10}/src/cocoindex_code/daemon.py +29 -3
  9. {cocoindex_code-0.2.8 → cocoindex_code-0.2.10}/src/cocoindex_code/indexer.py +15 -7
  10. {cocoindex_code-0.2.8 → cocoindex_code-0.2.10}/src/cocoindex_code/project.py +24 -6
  11. {cocoindex_code-0.2.8 → cocoindex_code-0.2.10}/src/cocoindex_code/settings.py +49 -1
  12. cocoindex_code-0.2.8/src/cocoindex_code/config.py +0 -144
  13. {cocoindex_code-0.2.8 → cocoindex_code-0.2.10}/.gitignore +0 -0
  14. {cocoindex_code-0.2.8 → cocoindex_code-0.2.10}/LICENSE +0 -0
  15. {cocoindex_code-0.2.8 → cocoindex_code-0.2.10}/src/cocoindex_code/__init__.py +0 -0
  16. {cocoindex_code-0.2.8 → cocoindex_code-0.2.10}/src/cocoindex_code/__main__.py +0 -0
  17. {cocoindex_code-0.2.8 → cocoindex_code-0.2.10}/src/cocoindex_code/protocol.py +0 -0
  18. {cocoindex_code-0.2.8 → cocoindex_code-0.2.10}/src/cocoindex_code/query.py +0 -0
  19. {cocoindex_code-0.2.8 → cocoindex_code-0.2.10}/src/cocoindex_code/schema.py +0 -0
  20. {cocoindex_code-0.2.8 → cocoindex_code-0.2.10}/src/cocoindex_code/server.py +0 -0
  21. {cocoindex_code-0.2.8 → cocoindex_code-0.2.10}/src/cocoindex_code/shared.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: cocoindex-code
3
- Version: 0.2.8
3
+ Version: 0.2.10
4
4
  Summary: MCP server for indexing and querying codebases using CocoIndex
5
5
  Project-URL: Homepage, https://github.com/cocoindex-io/cocoindex-code
6
6
  Project-URL: Repository, https://github.com/cocoindex-io/cocoindex-code
@@ -17,7 +17,7 @@ Classifier: Programming Language :: Python :: 3.12
17
17
  Classifier: Programming Language :: Python :: 3.13
18
18
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
19
19
  Requires-Python: >=3.11
20
- Requires-Dist: cocoindex[litellm]==1.0.0a37
20
+ Requires-Dist: cocoindex[litellm]==1.0.0a38
21
21
  Requires-Dist: einops>=0.8.2
22
22
  Requires-Dist: mcp>=1.0.0
23
23
  Requires-Dist: msgspec>=0.19.0
@@ -226,6 +226,105 @@ ccc search --refresh database schema # update index first, then
226
226
 
227
227
  By default, `ccc search` scopes results to your current working directory (relative to the project root). Use `--path` to override.
228
228
 
229
+ ## Docker
230
+
231
+ A Docker image is available for teams who want a reproducible, dependency-free
232
+ setup — no Python, `uv`, or system dependencies required on the host.
233
+
234
+ The recommended approach is a **persistent container**: start it once, and use
235
+ `docker exec` to run CLI commands or connect MCP sessions to it. The daemon
236
+ inside stays warm across sessions, so the embedding model is loaded only once.
237
+
238
+ ### Step 1 — Start the container
239
+
240
+ ```bash
241
+ docker run -d --name cocoindex-code \
242
+ --volume "$(pwd):/workspace" \
243
+ --volume cocoindex-db:/db \
244
+ --volume cocoindex-model-cache:/root/.cache \
245
+ ghcr.io/cocoindex-io/cocoindex-code:latest
246
+ ```
247
+
248
+ - `/workspace` — mount your project root here
249
+ - `cocoindex-db` — index databases live inside the container (fast native I/O, no cross-OS volume issues)
250
+ - `cocoindex-model-cache` — persists the embedding model across image upgrades
251
+
252
+ ### Step 2 — Index your codebase
253
+
254
+ ```bash
255
+ docker exec -it cocoindex-code ccc index
256
+ ```
257
+
258
+ ### Step 3 — Connect your coding agent
259
+
260
+ <details>
261
+ <summary>Claude Code</summary>
262
+
263
+ ```bash
264
+ claude mcp add cocoindex-code -- docker exec -i cocoindex-code ccc mcp
265
+ ```
266
+
267
+ Or via `.mcp.json`:
268
+
269
+ ```json
270
+ {
271
+ "mcpServers": {
272
+ "cocoindex-code": {
273
+ "type": "stdio",
274
+ "command": "docker",
275
+ "args": ["exec", "-i", "cocoindex-code", "ccc", "mcp"]
276
+ }
277
+ }
278
+ }
279
+ ```
280
+ </details>
281
+
282
+ <details>
283
+ <summary>Codex</summary>
284
+
285
+ ```bash
286
+ codex mcp add cocoindex-code -- docker exec -i cocoindex-code ccc mcp
287
+ ```
288
+ </details>
289
+
290
+ ### CLI usage inside the container
291
+
292
+ All `ccc` commands work via `docker exec`:
293
+
294
+ ```bash
295
+ docker exec -it cocoindex-code ccc index
296
+ docker exec -it cocoindex-code ccc search "authentication logic"
297
+ docker exec -it cocoindex-code ccc status
298
+ ```
299
+
300
+ Or set an alias on your host so it feels native:
301
+
302
+ ```bash
303
+ alias ccc='docker exec -it cocoindex-code ccc'
304
+ ```
305
+
306
+ ### Configuration via environment variables
307
+
308
+ Pass configuration to `docker run` with `-e`:
309
+
310
+ ```bash
311
+ # Extra extensions (e.g. Typesafe Config, SBT build files)
312
+ -e COCOINDEX_CODE_EXTRA_EXTENSIONS="conf,sbt"
313
+
314
+ # Exclude build artefacts (Scala/SBT example)
315
+ -e COCOINDEX_CODE_EXCLUDE_PATTERNS='["**/target/**","**/.bloop/**","**/.metals/**"]'
316
+
317
+ # Swap in a code-optimised embedding model
318
+ -e COCOINDEX_CODE_EMBEDDING_MODEL=voyage/voyage-code-3
319
+ -e VOYAGE_API_KEY=your-key
320
+ ```
321
+
322
+ ### Build the image locally
323
+
324
+ ```bash
325
+ docker build -t cocoindex-code:local -f docker/Dockerfile .
326
+ ```
327
+
229
328
  ## Features
230
329
  - **Semantic Code Search**: Find relevant code using natural language queries when grep doesn't work well, and save tokens immediately.
231
330
  - **Ultra Performant**: ⚡ Built on top of ultra performant [Rust indexing engine](https://github.com/cocoindex-io/cocoindex). Only re-indexes changed files for fast updates.
@@ -276,10 +375,43 @@ exclude_patterns:
276
375
  language_overrides:
277
376
  - ext: inc # treat .inc files as PHP
278
377
  lang: php
378
+
379
+ chunkers:
380
+ - ext: toml # use a custom chunker for .toml files
381
+ module: example_toml_chunker:toml_chunker
279
382
  ```
280
383
 
281
384
  > `.cocoindex_code/` is automatically added to `.gitignore` during init.
282
385
 
386
+ Use `chunkers` when you want to control how a file type is split into chunks before indexing.
387
+
388
+ `module: example_toml_chunker:toml_chunker` means:
389
+ - `example_toml_chunker` is a local Python module
390
+ - `toml_chunker` is the function inside that module
391
+
392
+ In practice, this usually means:
393
+ - you create a Python file in your project, for example `example_toml_chunker.py`
394
+ - you add a function in that file
395
+ - you point `settings.yml` at it with `module.path:function_name`
396
+
397
+ The function should use this signature:
398
+
399
+ ```python
400
+ from pathlib import Path
401
+ from cocoindex_code.chunking import Chunk
402
+
403
+ def my_chunker(path: Path, content: str) -> tuple[str | None, list[Chunk]]:
404
+ ...
405
+ ```
406
+
407
+ - `path` is the file being indexed
408
+ - `content` is the full text of that file
409
+ - return `language_override` as a string like `"toml"` if you want to override language detection
410
+ - return `None` as `language_override` if you want to keep the detected language
411
+ - return a `list[Chunk]` with the chunks you want stored in the index
412
+
413
+ See [`src/cocoindex_code/chunking.py`](./src/cocoindex_code/chunking.py) for the public types and [`tests/example_toml_chunker.py`](./tests/example_toml_chunker.py) for a complete example.
414
+
283
415
  ## Embedding Models
284
416
 
285
417
  By default, a local SentenceTransformers model ([sentence-transformers/all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2)) is used — no API key required. To use a different model, edit `~/.cocoindex_code/global_settings.yml`.
@@ -187,6 +187,105 @@ ccc search --refresh database schema # update index first, then
187
187
 
188
188
  By default, `ccc search` scopes results to your current working directory (relative to the project root). Use `--path` to override.
189
189
 
190
+ ## Docker
191
+
192
+ A Docker image is available for teams who want a reproducible, dependency-free
193
+ setup — no Python, `uv`, or system dependencies required on the host.
194
+
195
+ The recommended approach is a **persistent container**: start it once, and use
196
+ `docker exec` to run CLI commands or connect MCP sessions to it. The daemon
197
+ inside stays warm across sessions, so the embedding model is loaded only once.
198
+
199
+ ### Step 1 — Start the container
200
+
201
+ ```bash
202
+ docker run -d --name cocoindex-code \
203
+ --volume "$(pwd):/workspace" \
204
+ --volume cocoindex-db:/db \
205
+ --volume cocoindex-model-cache:/root/.cache \
206
+ ghcr.io/cocoindex-io/cocoindex-code:latest
207
+ ```
208
+
209
+ - `/workspace` — mount your project root here
210
+ - `cocoindex-db` — index databases live inside the container (fast native I/O, no cross-OS volume issues)
211
+ - `cocoindex-model-cache` — persists the embedding model across image upgrades
212
+
213
+ ### Step 2 — Index your codebase
214
+
215
+ ```bash
216
+ docker exec -it cocoindex-code ccc index
217
+ ```
218
+
219
+ ### Step 3 — Connect your coding agent
220
+
221
+ <details>
222
+ <summary>Claude Code</summary>
223
+
224
+ ```bash
225
+ claude mcp add cocoindex-code -- docker exec -i cocoindex-code ccc mcp
226
+ ```
227
+
228
+ Or via `.mcp.json`:
229
+
230
+ ```json
231
+ {
232
+ "mcpServers": {
233
+ "cocoindex-code": {
234
+ "type": "stdio",
235
+ "command": "docker",
236
+ "args": ["exec", "-i", "cocoindex-code", "ccc", "mcp"]
237
+ }
238
+ }
239
+ }
240
+ ```
241
+ </details>
242
+
243
+ <details>
244
+ <summary>Codex</summary>
245
+
246
+ ```bash
247
+ codex mcp add cocoindex-code -- docker exec -i cocoindex-code ccc mcp
248
+ ```
249
+ </details>
250
+
251
+ ### CLI usage inside the container
252
+
253
+ All `ccc` commands work via `docker exec`:
254
+
255
+ ```bash
256
+ docker exec -it cocoindex-code ccc index
257
+ docker exec -it cocoindex-code ccc search "authentication logic"
258
+ docker exec -it cocoindex-code ccc status
259
+ ```
260
+
261
+ Or set an alias on your host so it feels native:
262
+
263
+ ```bash
264
+ alias ccc='docker exec -it cocoindex-code ccc'
265
+ ```
266
+
267
+ ### Configuration via environment variables
268
+
269
+ Pass configuration to `docker run` with `-e`:
270
+
271
+ ```bash
272
+ # Extra extensions (e.g. Typesafe Config, SBT build files)
273
+ -e COCOINDEX_CODE_EXTRA_EXTENSIONS="conf,sbt"
274
+
275
+ # Exclude build artefacts (Scala/SBT example)
276
+ -e COCOINDEX_CODE_EXCLUDE_PATTERNS='["**/target/**","**/.bloop/**","**/.metals/**"]'
277
+
278
+ # Swap in a code-optimised embedding model
279
+ -e COCOINDEX_CODE_EMBEDDING_MODEL=voyage/voyage-code-3
280
+ -e VOYAGE_API_KEY=your-key
281
+ ```
282
+
283
+ ### Build the image locally
284
+
285
+ ```bash
286
+ docker build -t cocoindex-code:local -f docker/Dockerfile .
287
+ ```
288
+
190
289
  ## Features
191
290
  - **Semantic Code Search**: Find relevant code using natural language queries when grep doesn't work well, and save tokens immediately.
192
291
  - **Ultra Performant**: ⚡ Built on top of ultra performant [Rust indexing engine](https://github.com/cocoindex-io/cocoindex). Only re-indexes changed files for fast updates.
@@ -237,10 +336,43 @@ exclude_patterns:
237
336
  language_overrides:
238
337
  - ext: inc # treat .inc files as PHP
239
338
  lang: php
339
+
340
+ chunkers:
341
+ - ext: toml # use a custom chunker for .toml files
342
+ module: example_toml_chunker:toml_chunker
240
343
  ```
241
344
 
242
345
  > `.cocoindex_code/` is automatically added to `.gitignore` during init.
243
346
 
347
+ Use `chunkers` when you want to control how a file type is split into chunks before indexing.
348
+
349
+ `module: example_toml_chunker:toml_chunker` means:
350
+ - `example_toml_chunker` is a local Python module
351
+ - `toml_chunker` is the function inside that module
352
+
353
+ In practice, this usually means:
354
+ - you create a Python file in your project, for example `example_toml_chunker.py`
355
+ - you add a function in that file
356
+ - you point `settings.yml` at it with `module.path:function_name`
357
+
358
+ The function should use this signature:
359
+
360
+ ```python
361
+ from pathlib import Path
362
+ from cocoindex_code.chunking import Chunk
363
+
364
+ def my_chunker(path: Path, content: str) -> tuple[str | None, list[Chunk]]:
365
+ ...
366
+ ```
367
+
368
+ - `path` is the file being indexed
369
+ - `content` is the full text of that file
370
+ - return `language_override` as a string like `"toml"` if you want to override language detection
371
+ - return `None` as `language_override` if you want to keep the detected language
372
+ - return a `list[Chunk]` with the chunks you want stored in the index
373
+
374
+ See [`src/cocoindex_code/chunking.py`](./src/cocoindex_code/chunking.py) for the public types and [`tests/example_toml_chunker.py`](./tests/example_toml_chunker.py) for a complete example.
375
+
244
376
  ## Embedding Models
245
377
 
246
378
  By default, a local SentenceTransformers model ([sentence-transformers/all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2)) is used — no API key required. To use a different model, edit `~/.cocoindex_code/global_settings.yml`.
@@ -23,7 +23,7 @@ classifiers = [
23
23
 
24
24
  dependencies = [
25
25
  "mcp>=1.0.0",
26
- "cocoindex[litellm]==1.0.0a37",
26
+ "cocoindex[litellm]==1.0.0a38",
27
27
  "sentence-transformers>=2.2.0",
28
28
  "sqlite-vec>=0.1.0",
29
29
  "pydantic>=2.0.0",
@@ -91,9 +91,11 @@ select = ["E", "F", "I", "N", "W", "UP"]
91
91
  python_version = "3.11"
92
92
  strict = true
93
93
  ignore_missing_imports = true
94
+ explicit_package_bases = true
94
95
 
95
96
  [tool.pytest.ini_options]
96
97
  testpaths = ["tests"]
97
98
  python_files = ["test_*.py"]
98
99
  python_functions = ["test_*"]
99
100
  addopts = "-v --tb=short"
101
+ asyncio_mode = "auto"
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
28
28
  commit_id: COMMIT_ID
29
29
  __commit_id__: COMMIT_ID
30
30
 
31
- __version__ = version = '0.2.8'
32
- __version_tuple__ = version_tuple = (0, 2, 8)
31
+ __version__ = version = '0.2.10'
32
+ __version_tuple__ = version_tuple = (0, 2, 10)
33
33
 
34
34
  __commit_id__ = commit_id = None
@@ -0,0 +1,29 @@
1
+ """Public API for writing custom chunkers.
2
+
3
+ Example usage::
4
+
5
+ from pathlib import Path
6
+ from cocoindex_code.chunking import Chunk, ChunkerFn, TextPosition
7
+
8
+ def my_chunker(path: Path, content: str) -> tuple[str | None, list[Chunk]]:
9
+ pos = TextPosition(byte_offset=0, char_offset=0, line=1, column=0)
10
+ return "mylang", [Chunk(text=content, start=pos, end=pos)]
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import pathlib as _pathlib
16
+ from collections.abc import Callable as _Callable
17
+
18
+ import cocoindex as _coco
19
+ from cocoindex.resources.chunk import Chunk, TextPosition
20
+
21
+ # Callable alias (not Protocol) — consistent with codebase style.
22
+ # language_override=None keeps the language detected by detect_code_language.
23
+ # path is not resolved (no syscall); call path.resolve() inside the chunker if needed.
24
+ ChunkerFn = _Callable[[_pathlib.Path, str], tuple[str | None, list[Chunk]]]
25
+
26
+ # tracked=False: callables are not fingerprint-able; daemon restart re-indexes anyway.
27
+ CHUNKER_REGISTRY = _coco.ContextKey[dict[str, ChunkerFn]]("chunker_registry", tracked=False)
28
+
29
+ __all__ = ["Chunk", "ChunkerFn", "CHUNKER_REGISTRY", "TextPosition"]
@@ -12,13 +12,16 @@ import typer as _typer
12
12
  from .client import DaemonStartError
13
13
  from .protocol import DoctorCheckResult, IndexingProgress, ProjectStatusResponse, SearchResponse
14
14
  from .settings import (
15
+ cocoindex_db_path,
15
16
  default_project_settings,
16
17
  default_user_settings,
17
18
  find_parent_with_marker,
18
19
  find_project_root,
20
+ project_settings_path,
19
21
  resolve_db_dir,
20
22
  save_project_settings,
21
23
  save_user_settings,
24
+ target_sqlite_db_path,
22
25
  user_settings_path,
23
26
  )
24
27
 
@@ -284,10 +287,8 @@ def init(
284
287
  force: bool = _typer.Option(False, "-f", "--force", help="Skip parent directory warning"),
285
288
  ) -> None:
286
289
  """Initialize a project for cocoindex-code."""
287
- from .settings import project_settings_path as _project_settings_path
288
-
289
290
  cwd = Path.cwd().resolve()
290
- settings_file = _project_settings_path(cwd)
291
+ settings_file = project_settings_path(cwd)
291
292
 
292
293
  # Always ensure user settings exist
293
294
  user_path = user_settings_path()
@@ -377,8 +378,15 @@ def status() -> None:
377
378
  """Show project status."""
378
379
  from . import client as _client
379
380
 
380
- project_root = str(require_project_root())
381
+ project_root_path = require_project_root()
382
+ project_root = str(project_root_path)
381
383
  print_project_header(project_root)
384
+
385
+ _typer.echo(f"Settings: {project_settings_path(project_root_path)}")
386
+ db_path = target_sqlite_db_path(project_root_path)
387
+ if db_path.exists():
388
+ _typer.echo(f"Index DB: {db_path}")
389
+
382
390
  print_index_stats(_client.project_status(project_root))
383
391
 
384
392
 
@@ -393,10 +401,10 @@ def reset(
393
401
  db_dir = resolve_db_dir(project_root)
394
402
 
395
403
  db_files = [
396
- db_dir / "cocoindex.db",
397
- db_dir / "target_sqlite.db",
404
+ cocoindex_db_path(project_root),
405
+ target_sqlite_db_path(project_root),
398
406
  ]
399
- settings_file = cocoindex_dir / "settings.yml"
407
+ settings_file = project_settings_path(project_root)
400
408
 
401
409
  # Determine what will be deleted
402
410
  to_delete = [f for f in db_files if f.exists()]
@@ -503,16 +511,10 @@ def doctor() -> None:
503
511
  from .settings import (
504
512
  load_user_settings as _load_user_settings,
505
513
  )
506
- from .settings import (
507
- project_settings_path as _project_settings_path,
508
- )
509
- from .settings import (
510
- user_settings_path as _user_settings_path,
511
- )
512
514
 
513
515
  # --- 1. Global settings (local, no daemon needed) ---
514
516
  _print_section("Global Settings")
515
- settings_path = _user_settings_path()
517
+ settings_path = user_settings_path()
516
518
  _typer.echo(f" Settings: {settings_path}")
517
519
  try:
518
520
  user_settings = _load_user_settings()
@@ -570,7 +572,7 @@ def doctor() -> None:
570
572
  # --- 6. Project settings (local, no daemon needed) ---
571
573
  if project_root is not None:
572
574
  _print_section("Project Settings")
573
- ps_path = _project_settings_path(project_root)
575
+ ps_path = project_settings_path(project_root)
574
576
  _typer.echo(f" Settings: {ps_path}")
575
577
  try:
576
578
  ps = _load_project_settings(project_root)
@@ -597,10 +599,9 @@ def doctor() -> None:
597
599
 
598
600
  # --- 8. Log files ---
599
601
  _print_section("Log Files")
600
- from .daemon import daemon_dir as _daemon_dir
602
+ from .daemon import daemon_log_path as _daemon_log_path
601
603
 
602
- log_dir = _daemon_dir()
603
- _typer.echo(f" Daemon logs: {log_dir / 'daemon.log'}")
604
+ _typer.echo(f" Daemon logs: {_daemon_log_path()}")
604
605
  _typer.echo(" Check logs above for further troubleshooting.")
605
606
 
606
607
 
@@ -343,10 +343,10 @@ def start_daemon() -> subprocess.Popen[bytes]:
343
343
  Returns the ``Popen`` object so callers can detect early process death
344
344
  (via ``proc.poll()``) instead of waiting for a full timeout.
345
345
  """
346
- from .daemon import daemon_dir
346
+ from .daemon import daemon_dir, daemon_log_path
347
347
 
348
348
  daemon_dir().mkdir(parents=True, exist_ok=True)
349
- log_path = daemon_dir() / "daemon.log"
349
+ log_path = daemon_log_path()
350
350
 
351
351
  ccc_path = _find_ccc_executable()
352
352
  if ccc_path:
@@ -3,6 +3,7 @@
3
3
  from __future__ import annotations
4
4
 
5
5
  import asyncio
6
+ import importlib
6
7
  import logging
7
8
  import os
8
9
  import signal
@@ -15,6 +16,7 @@ from pathlib import Path
15
16
  from typing import Any
16
17
 
17
18
  from ._version import __version__
19
+ from .chunking import ChunkerFn as _ChunkerFn
18
20
  from .project import Project
19
21
  from .protocol import (
20
22
  DaemonEnvRequest,
@@ -46,15 +48,37 @@ from .protocol import (
46
48
  encode_response,
47
49
  )
48
50
  from .settings import (
51
+ ChunkerMapping,
49
52
  global_settings_mtime_us,
53
+ load_project_settings,
50
54
  load_user_settings,
51
- resolve_db_dir,
55
+ target_sqlite_db_path,
52
56
  user_settings_dir,
53
57
  )
54
58
  from .shared import Embedder, create_embedder
55
59
 
56
60
  logger = logging.getLogger(__name__)
57
61
 
62
+
63
+ def _resolve_chunker_registry(mappings: list[ChunkerMapping]) -> dict[str, _ChunkerFn]:
64
+ """Resolve ``ChunkerMapping`` settings entries to a ``{suffix: fn}`` dict.
65
+
66
+ Each ``mapping.module`` must be a ``"module.path:callable"`` string importable
67
+ from the current environment.
68
+ """
69
+ registry: dict[str, _ChunkerFn] = {}
70
+ for cm in mappings:
71
+ module_path, _, attr = cm.module.partition(":")
72
+ if not attr:
73
+ raise ValueError(f"chunker module {cm.module!r} must use 'module.path:callable' format")
74
+ mod = importlib.import_module(module_path)
75
+ fn = getattr(mod, attr)
76
+ if not callable(fn):
77
+ raise ValueError(f"chunker {cm.module!r}: {attr!r} is not callable")
78
+ registry[f".{cm.ext}"] = fn
79
+ return registry
80
+
81
+
58
82
  # ---------------------------------------------------------------------------
59
83
  # Daemon paths
60
84
  # ---------------------------------------------------------------------------
@@ -111,7 +135,9 @@ class ProjectRegistry:
111
135
  """Get or create a Project for the given root. Lazy initialization."""
112
136
  if project_root not in self._projects:
113
137
  root = Path(project_root)
114
- project = await Project.create(root, self._embedder)
138
+ project_settings = load_project_settings(root)
139
+ chunker_registry = _resolve_chunker_registry(project_settings.chunkers)
140
+ project = await Project.create(root, self._embedder, chunker_registry=chunker_registry)
115
141
  self._projects[project_root] = project
116
142
  return self._projects[project_root]
117
143
 
@@ -346,7 +372,7 @@ async def _check_index_status(project_root_str: str) -> DoctorCheckResult:
346
372
  from cocoindex.connectors import sqlite as coco_sqlite
347
373
 
348
374
  project_root = Path(project_root_str)
349
- db_path = resolve_db_dir(project_root) / "target_sqlite.db"
375
+ db_path = target_sqlite_db_path(project_root)
350
376
  details = [f"Index: {db_path}"]
351
377
 
352
378
  if not db_path.exists():
@@ -14,6 +14,7 @@ from cocoindex.resources.file import FilePathMatcher, PatternFilePathMatcher
14
14
  from cocoindex.resources.id import IdGenerator
15
15
  from pathspec import GitIgnoreSpec
16
16
 
17
+ from .chunking import CHUNKER_REGISTRY
17
18
  from .settings import load_gitignore_spec, load_project_settings
18
19
  from .shared import (
19
20
  CODEBASE_DIR,
@@ -158,13 +159,20 @@ async def process_file(
158
159
  or "text"
159
160
  )
160
161
 
161
- chunks = splitter.split(
162
- content,
163
- chunk_size=CHUNK_SIZE,
164
- min_chunk_size=MIN_CHUNK_SIZE,
165
- chunk_overlap=CHUNK_OVERLAP,
166
- language=language,
167
- )
162
+ chunker_registry = coco.use_context(CHUNKER_REGISTRY)
163
+ chunker = chunker_registry.get(suffix)
164
+ if chunker is not None:
165
+ language_override, chunks = chunker(Path(file.file_path.path), content)
166
+ if language_override is not None:
167
+ language = language_override
168
+ else:
169
+ chunks = splitter.split(
170
+ content,
171
+ chunk_size=CHUNK_SIZE,
172
+ min_chunk_size=MIN_CHUNK_SIZE,
173
+ chunk_overlap=CHUNK_OVERLAP,
174
+ language=language,
175
+ )
168
176
 
169
177
  id_gen = IdGenerator()
170
178
 
@@ -10,6 +10,7 @@ from pathlib import Path
10
10
  import cocoindex as coco
11
11
  from cocoindex.connectors import sqlite as coco_sqlite
12
12
 
13
+ from .chunking import CHUNKER_REGISTRY, ChunkerFn
13
14
  from .indexer import indexer_main
14
15
  from .protocol import (
15
16
  IndexingProgress,
@@ -21,7 +22,15 @@ from .protocol import (
21
22
  SearchResult,
22
23
  )
23
24
  from .query import query_codebase
24
- from .settings import resolve_db_dir
25
+ from .settings import (
26
+ cocoindex_db_path as _cocoindex_db_path,
27
+ )
28
+ from .settings import (
29
+ resolve_db_dir,
30
+ )
31
+ from .settings import (
32
+ target_sqlite_db_path as _target_sqlite_db_path,
33
+ )
25
34
  from .shared import (
26
35
  CODEBASE_DIR,
27
36
  EMBEDDER,
@@ -171,7 +180,7 @@ class Project:
171
180
  offset: int = 0,
172
181
  ) -> list[SearchResult]:
173
182
  """Search within this project."""
174
- target_db = resolve_db_dir(self._project_root) / "target_sqlite.db"
183
+ target_db = _target_sqlite_db_path(self._project_root)
175
184
  results = await query_codebase(
176
185
  query=query,
177
186
  target_sqlite_db_path=target_db,
@@ -248,12 +257,20 @@ class Project:
248
257
  async def create(
249
258
  project_root: Path,
250
259
  embedder: Embedder,
260
+ chunker_registry: dict[str, ChunkerFn] | None = None,
251
261
  ) -> Project:
252
262
  """Create a project with explicit embedder.
253
263
 
254
264
  Project-level settings and .gitignore are NOT cached here — the
255
265
  indexer loads them fresh from disk on every run so that user edits
256
266
  take effect without restarting the daemon.
267
+
268
+ Args:
269
+ project_root: Root directory of the codebase to index.
270
+ embedder: Embedding model instance.
271
+ chunker_registry: Optional mapping of file suffix (e.g. ``".toml"``)
272
+ to a ``ChunkerFn``. When a suffix matches, the registered
273
+ chunker is called instead of the built-in splitter.
257
274
  """
258
275
  settings_dir = project_root / ".cocoindex_code"
259
276
  settings_dir.mkdir(parents=True, exist_ok=True)
@@ -261,15 +278,16 @@ class Project:
261
278
  db_dir = resolve_db_dir(project_root)
262
279
  db_dir.mkdir(parents=True, exist_ok=True)
263
280
 
264
- cocoindex_db_path = db_dir / "cocoindex.db"
265
- target_sqlite_db_path = db_dir / "target_sqlite.db"
281
+ cocoindex_db = _cocoindex_db_path(project_root)
282
+ target_sqlite_db = _target_sqlite_db_path(project_root)
266
283
 
267
- settings = coco.Settings.from_env(cocoindex_db_path)
284
+ settings = coco.Settings.from_env(cocoindex_db)
268
285
 
269
286
  context = coco.ContextProvider()
270
287
  context.provide(CODEBASE_DIR, project_root)
271
- context.provide(SQLITE_DB, coco_sqlite.connect(str(target_sqlite_db_path), load_vec=True))
288
+ context.provide(SQLITE_DB, coco_sqlite.connect(str(target_sqlite_db), load_vec=True))
272
289
  context.provide(EMBEDDER, embedder)
290
+ context.provide(CHUNKER_REGISTRY, dict(chunker_registry) if chunker_registry else {})
273
291
 
274
292
  env = coco.Environment(settings, context_provider=context)
275
293
  app = coco.App(
@@ -45,6 +45,29 @@ DEFAULT_INCLUDED_PATTERNS: list[str] = [
45
45
  "**/*.rst", # reStructuredText
46
46
  "**/*.php", # PHP
47
47
  "**/*.lua", # Lua
48
+ "**/*.rb", # Ruby
49
+ "**/*.swift", # Swift
50
+ "**/*.kt", # Kotlin
51
+ "**/*.kts", # Kotlin script
52
+ "**/*.scala", # Scala
53
+ "**/*.r", # R
54
+ "**/*.html", # HTML
55
+ "**/*.htm", # HTML
56
+ "**/*.css", # CSS
57
+ "**/*.scss", # SCSS
58
+ "**/*.json", # JSON
59
+ "**/*.xml", # XML
60
+ "**/*.yaml", # YAML
61
+ "**/*.yml", # YAML
62
+ "**/*.toml", # TOML
63
+ "**/*.sol", # Solidity
64
+ "**/*.pas", # Pascal
65
+ "**/*.dpr", # Pascal/Delphi
66
+ "**/*.dtd", # DTD
67
+ "**/*.f", # Fortran
68
+ "**/*.f90", # Fortran
69
+ "**/*.f95", # Fortran
70
+ "**/*.f03", # Fortran
48
71
  ]
49
72
 
50
73
  DEFAULT_EXCLUDED_PATTERNS: list[str] = [
@@ -83,11 +106,18 @@ class LanguageOverride:
83
106
  lang: str # e.g. "php"
84
107
 
85
108
 
109
+ @dataclass
110
+ class ChunkerMapping:
111
+ ext: str # without dot, e.g. "toml"
112
+ module: str # "module.path:callable", e.g. "cocoindex_code.toml_chunker:toml_chunker"
113
+
114
+
86
115
  @dataclass
87
116
  class ProjectSettings:
88
117
  include_patterns: list[str] = field(default_factory=lambda: list(DEFAULT_INCLUDED_PATTERNS))
89
118
  exclude_patterns: list[str] = field(default_factory=lambda: list(DEFAULT_EXCLUDED_PATTERNS))
90
119
  language_overrides: list[LanguageOverride] = field(default_factory=list)
120
+ chunkers: list[ChunkerMapping] = field(default_factory=list)
91
121
 
92
122
 
93
123
  # ---------------------------------------------------------------------------
@@ -194,6 +224,20 @@ def _reset_db_path_mapping_cache() -> None:
194
224
  _db_path_mapping = None
195
225
 
196
226
 
227
+ _TARGET_SQLITE_DB_NAME = "target_sqlite.db"
228
+ _COCOINDEX_DB_NAME = "cocoindex.db"
229
+
230
+
231
+ def target_sqlite_db_path(project_root: Path) -> Path:
232
+ """Return the path to the vector index SQLite database for a project."""
233
+ return resolve_db_dir(project_root) / _TARGET_SQLITE_DB_NAME
234
+
235
+
236
+ def cocoindex_db_path(project_root: Path) -> Path:
237
+ """Return the path to the CocoIndex state database for a project."""
238
+ return resolve_db_dir(project_root) / _COCOINDEX_DB_NAME
239
+
240
+
197
241
  def user_settings_dir() -> Path:
198
242
  """Return ``~/.cocoindex_code/``.
199
243
 
@@ -238,7 +282,7 @@ def find_legacy_project_root(start: Path) -> Path | None:
238
282
  """
239
283
  current = start.resolve()
240
284
  while True:
241
- if (current / _SETTINGS_DIR_NAME / "cocoindex.db").exists():
285
+ if (current / _SETTINGS_DIR_NAME / _COCOINDEX_DB_NAME).exists():
242
286
  return current
243
287
  parent = current.parent
244
288
  if parent == current:
@@ -337,6 +381,8 @@ def _project_settings_to_dict(settings: ProjectSettings) -> dict[str, Any]:
337
381
  d["language_overrides"] = [
338
382
  {"ext": lo.ext, "lang": lo.lang} for lo in settings.language_overrides
339
383
  ]
384
+ if settings.chunkers:
385
+ d["chunkers"] = [{"ext": cm.ext, "module": cm.module} for cm in settings.chunkers]
340
386
  return d
341
387
 
342
388
 
@@ -344,10 +390,12 @@ def _project_settings_from_dict(d: dict[str, Any]) -> ProjectSettings:
344
390
  overrides = [
345
391
  LanguageOverride(ext=lo["ext"], lang=lo["lang"]) for lo in d.get("language_overrides", [])
346
392
  ]
393
+ chunkers = [ChunkerMapping(ext=cm["ext"], module=cm["module"]) for cm in d.get("chunkers", [])]
347
394
  return ProjectSettings(
348
395
  include_patterns=d.get("include_patterns", list(DEFAULT_INCLUDED_PATTERNS)),
349
396
  exclude_patterns=d.get("exclude_patterns", list(DEFAULT_EXCLUDED_PATTERNS)),
350
397
  language_overrides=overrides,
398
+ chunkers=chunkers,
351
399
  )
352
400
 
353
401
 
@@ -1,144 +0,0 @@
1
- """Configuration management for cocoindex-code."""
2
-
3
- from __future__ import annotations
4
-
5
- import json
6
- import os
7
- from dataclasses import dataclass
8
- from pathlib import Path
9
-
10
- from .settings import resolve_db_dir
11
-
12
# Embedding model used when COCOINDEX_CODE_EMBEDDING_MODEL is not set.
_DEFAULT_MODEL = "sbert/sentence-transformers/all-MiniLM-L6-v2"
13
-
14
-
15
def _find_root_with_marker(start: Path, markers: list[str]) -> Path | None:
    """Search *start* and then each of its ancestors for a marker entry.

    Returns the nearest directory (starting with *start* itself) that
    contains at least one of the names in *markers*, or ``None`` when no
    directory up to the top of the path has one.
    """
    # ``start.parents`` yields the same chain as following ``.parent`` repeatedly
    # until it stops changing.
    for candidate in (start, *start.parents):
        for marker in markers:
            if (candidate / marker).exists():
                return candidate
    return None
25
-
26
-
27
def _discover_codebase_root() -> Path:
    """Work out which directory should be treated as the codebase root.

    Starting from the current working directory, the candidates are tried
    in this order:

    1. The nearest directory (cwd or an ancestor) containing
       ``.cocoindex_code``, so a previously-indexed tree is re-used.
    2. The nearest directory containing a common project root marker.
    3. The current working directory itself.
    """
    here = Path.cwd()

    # Earlier groups take priority over later ones.
    marker_groups = (
        [".cocoindex_code"],
        [".git", "pyproject.toml", "package.json", "Cargo.toml", "go.mod"],
    )
    for group in marker_groups:
        found = _find_root_with_marker(here, group)
        if found is not None:
            return found
    return here
46
-
47
-
48
def _parse_json_string_list_env(var_name: str) -> list[str]:
    """Read environment variable *var_name* as a JSON array of strings.

    An unset or whitespace-only variable yields an empty list. Each string
    is stripped of surrounding whitespace, and strings that end up empty
    are dropped.

    Raises:
        ValueError: if the value is not valid JSON, is not a JSON array,
            or contains a non-string element.
    """
    raw = os.environ.get(var_name, "")
    if raw.strip() == "":
        return []

    try:
        decoded = json.loads(raw)
    except json.JSONDecodeError as exc:
        raise ValueError(f"{var_name} must be a JSON array of strings, got invalid JSON") from exc

    if not isinstance(decoded, list):
        raise ValueError(f"{var_name} must be a JSON array of strings")

    cleaned: list[str] = []
    for entry in decoded:
        if not isinstance(entry, str):
            raise ValueError(f"{var_name} must be a JSON array of strings")
        stripped = entry.strip()
        if stripped:
            cleaned.append(stripped)
    return cleaned
71
-
72
-
73
@dataclass
class Config:
    """Settings assembled from ``COCOINDEX_CODE_*`` environment variables."""

    codebase_root_path: Path
    embedding_model: str
    index_dir: Path
    device: str | None
    extra_extensions: dict[str, str | None]
    excluded_patterns: list[str]

    @staticmethod
    def _parse_extra_extensions(raw: str) -> dict[str, str | None]:
        """Parse a comma-separated ``ext[:lang]`` list into a mapping.

        Keys are the extensions with a leading dot added; values are the
        language given after the colon, or ``None`` when no language (or an
        empty one) was supplied. Empty list entries are ignored.
        """
        mapping: dict[str, str | None] = {}
        for piece in raw.split(","):
            piece = piece.strip()
            if not piece:
                continue
            # Without a colon, ``partition`` leaves ``lang`` empty, which maps to None.
            ext, _, lang = piece.partition(":")
            mapping[f".{ext.strip()}"] = lang.strip() or None
        return mapping

    @classmethod
    def from_env(cls) -> Config:
        """Build a ``Config`` from the process environment.

        Variables read:

        * ``COCOINDEX_CODE_ROOT_PATH`` - codebase root; when unset or empty
          the root is discovered from the current working directory.
        * ``COCOINDEX_CODE_EMBEDDING_MODEL`` - embedding model name; a
          ``sbert/`` prefix selects SentenceTransformers, otherwise LiteLLM.
        * ``COCOINDEX_CODE_DEVICE`` - device override, ``None`` when unset.
        * ``COCOINDEX_CODE_EXTRA_EXTENSIONS`` - comma-separated extensions,
          each optionally followed by ``:lang`` (e.g. ``"inc:php,yaml,toml"``).
        * ``COCOINDEX_CODE_EXCLUDED_PATTERNS`` - JSON array of glob patterns.
        """
        env = os.environ

        explicit_root = env.get("COCOINDEX_CODE_ROOT_PATH")
        root = Path(explicit_root).resolve() if explicit_root else _discover_codebase_root()

        return cls(
            codebase_root_path=root,
            embedding_model=env.get("COCOINDEX_CODE_EMBEDDING_MODEL", _DEFAULT_MODEL),
            # The index directory goes through the DB path mapping, if one is configured.
            index_dir=resolve_db_dir(root),
            device=env.get("COCOINDEX_CODE_DEVICE"),
            extra_extensions=cls._parse_extra_extensions(
                env.get("COCOINDEX_CODE_EXTRA_EXTENSIONS", "")
            ),
            excluded_patterns=_parse_json_string_list_env("COCOINDEX_CODE_EXCLUDED_PATTERNS"),
        )

    @property
    def target_sqlite_db_path(self) -> Path:
        """Location of the vector index SQLite database inside ``index_dir``."""
        return self.index_dir / "target_sqlite.db"

    @property
    def cocoindex_db_path(self) -> Path:
        """Location of the CocoIndex state database inside ``index_dir``."""
        return self.index_dir / "cocoindex.db"
141
-
142
-
143
# Shared configuration instance, built from the environment once when this module
# is imported; modules that need configuration import this object directly.
config: Config = Config.from_env()
File without changes