cocoindex-code 0.2.7__tar.gz → 0.2.9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (21) hide show
  1. {cocoindex_code-0.2.7 → cocoindex_code-0.2.9}/PKG-INFO +67 -1
  2. {cocoindex_code-0.2.7 → cocoindex_code-0.2.9}/README.md +66 -0
  3. {cocoindex_code-0.2.7 → cocoindex_code-0.2.9}/pyproject.toml +2 -0
  4. {cocoindex_code-0.2.7 → cocoindex_code-0.2.9}/src/cocoindex_code/_version.py +2 -2
  5. cocoindex_code-0.2.9/src/cocoindex_code/chunking.py +29 -0
  6. {cocoindex_code-0.2.7 → cocoindex_code-0.2.9}/src/cocoindex_code/cli.py +31 -18
  7. {cocoindex_code-0.2.7 → cocoindex_code-0.2.9}/src/cocoindex_code/client.py +2 -2
  8. {cocoindex_code-0.2.7 → cocoindex_code-0.2.9}/src/cocoindex_code/daemon.py +36 -2
  9. {cocoindex_code-0.2.7 → cocoindex_code-0.2.9}/src/cocoindex_code/indexer.py +15 -7
  10. {cocoindex_code-0.2.7 → cocoindex_code-0.2.9}/src/cocoindex_code/project.py +29 -7
  11. {cocoindex_code-0.2.7 → cocoindex_code-0.2.9}/src/cocoindex_code/protocol.py +6 -0
  12. {cocoindex_code-0.2.7 → cocoindex_code-0.2.9}/src/cocoindex_code/settings.py +127 -3
  13. cocoindex_code-0.2.7/src/cocoindex_code/config.py +0 -142
  14. {cocoindex_code-0.2.7 → cocoindex_code-0.2.9}/.gitignore +0 -0
  15. {cocoindex_code-0.2.7 → cocoindex_code-0.2.9}/LICENSE +0 -0
  16. {cocoindex_code-0.2.7 → cocoindex_code-0.2.9}/src/cocoindex_code/__init__.py +0 -0
  17. {cocoindex_code-0.2.7 → cocoindex_code-0.2.9}/src/cocoindex_code/__main__.py +0 -0
  18. {cocoindex_code-0.2.7 → cocoindex_code-0.2.9}/src/cocoindex_code/query.py +0 -0
  19. {cocoindex_code-0.2.7 → cocoindex_code-0.2.9}/src/cocoindex_code/schema.py +0 -0
  20. {cocoindex_code-0.2.7 → cocoindex_code-0.2.9}/src/cocoindex_code/server.py +0 -0
  21. {cocoindex_code-0.2.7 → cocoindex_code-0.2.9}/src/cocoindex_code/shared.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: cocoindex-code
3
- Version: 0.2.7
3
+ Version: 0.2.9
4
4
  Summary: MCP server for indexing and querying codebases using CocoIndex
5
5
  Project-URL: Homepage, https://github.com/cocoindex-io/cocoindex-code
6
6
  Project-URL: Repository, https://github.com/cocoindex-io/cocoindex-code
@@ -276,10 +276,43 @@ exclude_patterns:
276
276
  language_overrides:
277
277
  - ext: inc # treat .inc files as PHP
278
278
  lang: php
279
+
280
+ chunkers:
281
+ - ext: toml # use a custom chunker for .toml files
282
+ module: example_toml_chunker:toml_chunker
279
283
  ```
280
284
 
281
285
  > `.cocoindex_code/` is automatically added to `.gitignore` during init.
282
286
 
287
+ Use `chunkers` when you want to control how a file type is split into chunks before indexing.
288
+
289
+ `module: example_toml_chunker:toml_chunker` means:
290
+ - `example_toml_chunker` is a local Python module
291
+ - `toml_chunker` is the function inside that module
292
+
293
+ In practice, this usually means:
294
+ - you create a Python file in your project, for example `example_toml_chunker.py`
295
+ - you add a function in that file
296
+ - you point `settings.yml` at it with `module.path:function_name`
297
+
298
+ The function should use this signature:
299
+
300
+ ```python
301
+ from pathlib import Path
302
+ from cocoindex_code.chunking import Chunk
303
+
304
+ def my_chunker(path: Path, content: str) -> tuple[str | None, list[Chunk]]:
305
+ ...
306
+ ```
307
+
308
+ - `path` is the file being indexed
309
+ - `content` is the full text of that file
310
+ - return a string like `"toml"` as the first element of the tuple (the `language_override`) if you want to override language detection
311
+ - return `None` as `language_override` if you want to keep the detected language
312
+ - return a `list[Chunk]` with the chunks you want stored in the index
313
+
314
+ See [`src/cocoindex_code/chunking.py`](./src/cocoindex_code/chunking.py) for the public types and [`tests/example_toml_chunker.py`](./tests/example_toml_chunker.py) for a complete example.
315
+
283
316
  ## Embedding Models
284
317
 
285
318
  By default, a local SentenceTransformers model ([sentence-transformers/all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2)) is used — no API key required. To use a different model, edit `~/.cocoindex_code/global_settings.yml`.
@@ -457,6 +490,26 @@ embedding:
457
490
  | xml | | `.xml` |
458
491
  | yaml | | `.yaml`, `.yml` |
459
492
 
493
+ ### Custom Database Location
494
+
495
+ By default, index databases (`cocoindex.db` and `target_sqlite.db`) live alongside settings in `<project>/.cocoindex_code/`. When running in Docker, you may want the databases on the container's native filesystem for performance (LMDB doesn't work well on mounted volumes) while keeping the source code and settings on a mounted volume.
496
+
497
+ Set `COCOINDEX_CODE_DB_PATH_MAPPING` to remap database locations by path prefix:
498
+
499
+ ```bash
500
+ COCOINDEX_CODE_DB_PATH_MAPPING=/workspace=/db-files
501
+ ```
502
+
503
+ With this mapping, a project at `/workspace/myrepo` stores its databases in `/db-files/myrepo/` instead of `/workspace/myrepo/.cocoindex_code/`. Settings files remain in the original location.
504
+
505
+ Multiple mappings are comma-separated and resolved in order (first match wins):
506
+
507
+ ```bash
508
+ COCOINDEX_CODE_DB_PATH_MAPPING=/workspace=/db-files,/workspace2=/db-files2
509
+ ```
510
+
511
+ Both source and target must be absolute paths. If no mapping matches, the default location is used.
512
+
460
513
  ## Troubleshooting
461
514
 
462
515
  Run `ccc doctor` to diagnose common issues. It checks your settings, daemon health, embedding model, file matching, and index status — all in one command.
@@ -501,6 +554,19 @@ If you previously configured `cocoindex-code` via environment variables, the `co
501
554
 
502
555
  If you need help with remote setup, please email our maintainer linghua@cocoindex.io, happy to help!
503
556
 
557
+ ## Contributing
558
+
559
+ We welcome contributions! Before you start, please install the [pre-commit](https://pre-commit.com/) hooks so that linting, formatting, type checking, and tests run automatically before each commit:
560
+
561
+ ```bash
562
+ pip install pre-commit
563
+ pre-commit install
564
+ ```
565
+
566
+ This catches common issues — trailing whitespace, lint errors (Ruff), type errors (mypy), and test failures — before they reach CI.
567
+
568
+ For more details, see our [contributing guide](https://cocoindex.io/docs/contributing/guide).
569
+
504
570
  ## License
505
571
 
506
572
  Apache-2.0
@@ -237,10 +237,43 @@ exclude_patterns:
237
237
  language_overrides:
238
238
  - ext: inc # treat .inc files as PHP
239
239
  lang: php
240
+
241
+ chunkers:
242
+ - ext: toml # use a custom chunker for .toml files
243
+ module: example_toml_chunker:toml_chunker
240
244
  ```
241
245
 
242
246
  > `.cocoindex_code/` is automatically added to `.gitignore` during init.
243
247
 
248
+ Use `chunkers` when you want to control how a file type is split into chunks before indexing.
249
+
250
+ `module: example_toml_chunker:toml_chunker` means:
251
+ - `example_toml_chunker` is a local Python module
252
+ - `toml_chunker` is the function inside that module
253
+
254
+ In practice, this usually means:
255
+ - you create a Python file in your project, for example `example_toml_chunker.py`
256
+ - you add a function in that file
257
+ - you point `settings.yml` at it with `module.path:function_name`
258
+
259
+ The function should use this signature:
260
+
261
+ ```python
262
+ from pathlib import Path
263
+ from cocoindex_code.chunking import Chunk
264
+
265
+ def my_chunker(path: Path, content: str) -> tuple[str | None, list[Chunk]]:
266
+ ...
267
+ ```
268
+
269
+ - `path` is the file being indexed
270
+ - `content` is the full text of that file
271
+ - return a string like `"toml"` as the first element of the tuple (the `language_override`) if you want to override language detection
272
+ - return `None` as `language_override` if you want to keep the detected language
273
+ - return a `list[Chunk]` with the chunks you want stored in the index
274
+
275
+ See [`src/cocoindex_code/chunking.py`](./src/cocoindex_code/chunking.py) for the public types and [`tests/example_toml_chunker.py`](./tests/example_toml_chunker.py) for a complete example.
276
+
244
277
  ## Embedding Models
245
278
 
246
279
  By default, a local SentenceTransformers model ([sentence-transformers/all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2)) is used — no API key required. To use a different model, edit `~/.cocoindex_code/global_settings.yml`.
@@ -418,6 +451,26 @@ embedding:
418
451
  | xml | | `.xml` |
419
452
  | yaml | | `.yaml`, `.yml` |
420
453
 
454
+ ### Custom Database Location
455
+
456
+ By default, index databases (`cocoindex.db` and `target_sqlite.db`) live alongside settings in `<project>/.cocoindex_code/`. When running in Docker, you may want the databases on the container's native filesystem for performance (LMDB doesn't work well on mounted volumes) while keeping the source code and settings on a mounted volume.
457
+
458
+ Set `COCOINDEX_CODE_DB_PATH_MAPPING` to remap database locations by path prefix:
459
+
460
+ ```bash
461
+ COCOINDEX_CODE_DB_PATH_MAPPING=/workspace=/db-files
462
+ ```
463
+
464
+ With this mapping, a project at `/workspace/myrepo` stores its databases in `/db-files/myrepo/` instead of `/workspace/myrepo/.cocoindex_code/`. Settings files remain in the original location.
465
+
466
+ Multiple mappings are comma-separated and resolved in order (first match wins):
467
+
468
+ ```bash
469
+ COCOINDEX_CODE_DB_PATH_MAPPING=/workspace=/db-files,/workspace2=/db-files2
470
+ ```
471
+
472
+ Both source and target must be absolute paths. If no mapping matches, the default location is used.
473
+
421
474
  ## Troubleshooting
422
475
 
423
476
  Run `ccc doctor` to diagnose common issues. It checks your settings, daemon health, embedding model, file matching, and index status — all in one command.
@@ -462,6 +515,19 @@ If you previously configured `cocoindex-code` via environment variables, the `co
462
515
 
463
516
  If you need help with remote setup, please email our maintainer linghua@cocoindex.io, happy to help!
464
517
 
518
+ ## Contributing
519
+
520
+ We welcome contributions! Before you start, please install the [pre-commit](https://pre-commit.com/) hooks so that linting, formatting, type checking, and tests run automatically before each commit:
521
+
522
+ ```bash
523
+ pip install pre-commit
524
+ pre-commit install
525
+ ```
526
+
527
+ This catches common issues — trailing whitespace, lint errors (Ruff), type errors (mypy), and test failures — before they reach CI.
528
+
529
+ For more details, see our [contributing guide](https://cocoindex.io/docs/contributing/guide).
530
+
465
531
  ## License
466
532
 
467
533
  Apache-2.0
@@ -91,9 +91,11 @@ select = ["E", "F", "I", "N", "W", "UP"]
91
91
  python_version = "3.11"
92
92
  strict = true
93
93
  ignore_missing_imports = true
94
+ explicit_package_bases = true
94
95
 
95
96
  [tool.pytest.ini_options]
96
97
  testpaths = ["tests"]
97
98
  python_files = ["test_*.py"]
98
99
  python_functions = ["test_*"]
99
100
  addopts = "-v --tb=short"
101
+ asyncio_mode = "auto"
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
28
28
  commit_id: COMMIT_ID
29
29
  __commit_id__: COMMIT_ID
30
30
 
31
- __version__ = version = '0.2.7'
32
- __version_tuple__ = version_tuple = (0, 2, 7)
31
+ __version__ = version = '0.2.9'
32
+ __version_tuple__ = version_tuple = (0, 2, 9)
33
33
 
34
34
  __commit_id__ = commit_id = None
@@ -0,0 +1,29 @@
1
+ """Public API for writing custom chunkers.
2
+
3
+ Example usage::
4
+
5
+ from pathlib import Path
6
+ from cocoindex_code.chunking import Chunk, ChunkerFn, TextPosition
7
+
8
+ def my_chunker(path: Path, content: str) -> tuple[str | None, list[Chunk]]:
9
+ pos = TextPosition(byte_offset=0, char_offset=0, line=1, column=0)
10
+ return "mylang", [Chunk(text=content, start=pos, end=pos)]
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import pathlib as _pathlib
16
+ from collections.abc import Callable as _Callable
17
+
18
+ import cocoindex as _coco
19
+ from cocoindex.resources.chunk import Chunk, TextPosition
20
+
21
+ # Callable alias (not Protocol) — consistent with codebase style.
22
+ # language_override=None keeps the language detected by detect_code_language.
23
+ # path is not resolved (no syscall); call path.resolve() inside the chunker if needed.
24
+ ChunkerFn = _Callable[[_pathlib.Path, str], tuple[str | None, list[Chunk]]]
25
+
26
+ # tracked=False: callables are not fingerprint-able; daemon restart re-indexes anyway.
27
+ CHUNKER_REGISTRY = _coco.ContextKey[dict[str, ChunkerFn]]("chunker_registry", tracked=False)
28
+
29
+ __all__ = ["Chunk", "ChunkerFn", "CHUNKER_REGISTRY", "TextPosition"]
@@ -12,12 +12,16 @@ import typer as _typer
12
12
  from .client import DaemonStartError
13
13
  from .protocol import DoctorCheckResult, IndexingProgress, ProjectStatusResponse, SearchResponse
14
14
  from .settings import (
15
+ cocoindex_db_path,
15
16
  default_project_settings,
16
17
  default_user_settings,
17
18
  find_parent_with_marker,
18
19
  find_project_root,
20
+ project_settings_path,
21
+ resolve_db_dir,
19
22
  save_project_settings,
20
23
  save_user_settings,
24
+ target_sqlite_db_path,
21
25
  user_settings_path,
22
26
  )
23
27
 
@@ -283,10 +287,8 @@ def init(
283
287
  force: bool = _typer.Option(False, "-f", "--force", help="Skip parent directory warning"),
284
288
  ) -> None:
285
289
  """Initialize a project for cocoindex-code."""
286
- from .settings import project_settings_path as _project_settings_path
287
-
288
290
  cwd = Path.cwd().resolve()
289
- settings_file = _project_settings_path(cwd)
291
+ settings_file = project_settings_path(cwd)
290
292
 
291
293
  # Always ensure user settings exist
292
294
  user_path = user_settings_path()
@@ -376,8 +378,15 @@ def status() -> None:
376
378
  """Show project status."""
377
379
  from . import client as _client
378
380
 
379
- project_root = str(require_project_root())
381
+ project_root_path = require_project_root()
382
+ project_root = str(project_root_path)
380
383
  print_project_header(project_root)
384
+
385
+ _typer.echo(f"Settings: {project_settings_path(project_root_path)}")
386
+ db_path = target_sqlite_db_path(project_root_path)
387
+ if db_path.exists():
388
+ _typer.echo(f"Index DB: {db_path}")
389
+
381
390
  print_index_stats(_client.project_status(project_root))
382
391
 
383
392
 
@@ -389,12 +398,13 @@ def reset(
389
398
  """Reset project databases and optionally remove settings."""
390
399
  project_root = require_project_root()
391
400
  cocoindex_dir = project_root / ".cocoindex_code"
401
+ db_dir = resolve_db_dir(project_root)
392
402
 
393
403
  db_files = [
394
- cocoindex_dir / "cocoindex.db",
395
- cocoindex_dir / "target_sqlite.db",
404
+ cocoindex_db_path(project_root),
405
+ target_sqlite_db_path(project_root),
396
406
  ]
397
- settings_file = cocoindex_dir / "settings.yml"
407
+ settings_file = project_settings_path(project_root)
398
408
 
399
409
  # Determine what will be deleted
400
410
  to_delete = [f for f in db_files if f.exists()]
@@ -436,6 +446,12 @@ def reset(
436
446
  f.unlink(missing_ok=True)
437
447
 
438
448
  if all_:
449
+ # Remove db_dir if empty and different from cocoindex_dir
450
+ if db_dir != cocoindex_dir:
451
+ try:
452
+ db_dir.rmdir()
453
+ except OSError:
454
+ pass # Not empty or doesn't exist
439
455
  # Remove .cocoindex_code/ if empty
440
456
  try:
441
457
  cocoindex_dir.rmdir()
@@ -495,16 +511,10 @@ def doctor() -> None:
495
511
  from .settings import (
496
512
  load_user_settings as _load_user_settings,
497
513
  )
498
- from .settings import (
499
- project_settings_path as _project_settings_path,
500
- )
501
- from .settings import (
502
- user_settings_path as _user_settings_path,
503
- )
504
514
 
505
515
  # --- 1. Global settings (local, no daemon needed) ---
506
516
  _print_section("Global Settings")
507
- settings_path = _user_settings_path()
517
+ settings_path = user_settings_path()
508
518
  _typer.echo(f" Settings: {settings_path}")
509
519
  try:
510
520
  user_settings = _load_user_settings()
@@ -539,6 +549,10 @@ def doctor() -> None:
539
549
  other_keys = [k for k in env_resp.env_names if k not in settings_keys]
540
550
  if other_keys:
541
551
  _typer.echo(f" Other env vars in daemon: {', '.join(sorted(other_keys))}")
552
+ if env_resp.db_path_mappings:
553
+ _typer.echo(" DB path mappings:")
554
+ for m in env_resp.db_path_mappings:
555
+ _typer.echo(f" {m.source} \u2192 {m.target}")
542
556
  except Exception as e:
543
557
  _print_error(f"Failed to get daemon env: {e}")
544
558
 
@@ -558,7 +572,7 @@ def doctor() -> None:
558
572
  # --- 6. Project settings (local, no daemon needed) ---
559
573
  if project_root is not None:
560
574
  _print_section("Project Settings")
561
- ps_path = _project_settings_path(project_root)
575
+ ps_path = project_settings_path(project_root)
562
576
  _typer.echo(f" Settings: {ps_path}")
563
577
  try:
564
578
  ps = _load_project_settings(project_root)
@@ -585,10 +599,9 @@ def doctor() -> None:
585
599
 
586
600
  # --- 8. Log files ---
587
601
  _print_section("Log Files")
588
- from .daemon import daemon_dir as _daemon_dir
602
+ from .daemon import daemon_log_path as _daemon_log_path
589
603
 
590
- log_dir = _daemon_dir()
591
- _typer.echo(f" Daemon logs: {log_dir / 'daemon.log'}")
604
+ _typer.echo(f" Daemon logs: {_daemon_log_path()}")
592
605
  _typer.echo(" Check logs above for further troubleshooting.")
593
606
 
594
607
 
@@ -343,10 +343,10 @@ def start_daemon() -> subprocess.Popen[bytes]:
343
343
  Returns the ``Popen`` object so callers can detect early process death
344
344
  (via ``proc.poll()``) instead of waiting for a full timeout.
345
345
  """
346
- from .daemon import daemon_dir
346
+ from .daemon import daemon_dir, daemon_log_path
347
347
 
348
348
  daemon_dir().mkdir(parents=True, exist_ok=True)
349
- log_path = daemon_dir() / "daemon.log"
349
+ log_path = daemon_log_path()
350
350
 
351
351
  ccc_path = _find_ccc_executable()
352
352
  if ccc_path:
@@ -3,6 +3,7 @@
3
3
  from __future__ import annotations
4
4
 
5
5
  import asyncio
6
+ import importlib
6
7
  import logging
7
8
  import os
8
9
  import signal
@@ -15,6 +16,7 @@ from pathlib import Path
15
16
  from typing import Any
16
17
 
17
18
  from ._version import __version__
19
+ from .chunking import ChunkerFn as _ChunkerFn
18
20
  from .project import Project
19
21
  from .protocol import (
20
22
  DaemonEnvRequest,
@@ -46,14 +48,37 @@ from .protocol import (
46
48
  encode_response,
47
49
  )
48
50
  from .settings import (
51
+ ChunkerMapping,
49
52
  global_settings_mtime_us,
53
+ load_project_settings,
50
54
  load_user_settings,
55
+ target_sqlite_db_path,
51
56
  user_settings_dir,
52
57
  )
53
58
  from .shared import Embedder, create_embedder
54
59
 
55
60
  logger = logging.getLogger(__name__)
56
61
 
62
+
63
def _resolve_chunker_registry(mappings: list[ChunkerMapping]) -> dict[str, _ChunkerFn]:
    """Resolve ``ChunkerMapping`` settings entries to a ``{suffix: fn}`` dict.

    Each ``mapping.module`` must be a ``"module.path:callable"`` string importable
    from the current environment.  ``mapping.ext`` is expected without a dot
    (``"toml"``); a leading dot (``".toml"``) is tolerated and normalized so the
    resulting key always has exactly one leading dot.

    Args:
        mappings: Chunker entries; each must expose ``ext`` and ``module``
            string attributes.

    Returns:
        A dict keyed by dotted suffix (e.g. ``".toml"``).  If two entries share
        an extension, the later entry replaces the earlier one.

    Raises:
        ValueError: If ``module`` is not in ``module.path:callable`` form, or
            the resolved attribute is not callable.
        ImportError: If the module cannot be imported (propagated unchanged).
        AttributeError: If the module has no such attribute (propagated
            unchanged).
    """
    registry: dict[str, _ChunkerFn] = {}
    for cm in mappings:
        module_path, _, attr = cm.module.partition(":")
        # Reject both "module" (no callable) and ":callable" (no module) with
        # the same format error rather than an opaque importlib failure.
        if not module_path or not attr:
            raise ValueError(f"chunker module {cm.module!r} must use 'module.path:callable' format")
        mod = importlib.import_module(module_path)
        fn = getattr(mod, attr)
        if not callable(fn):
            raise ValueError(f"chunker {cm.module!r}: {attr!r} is not callable")
        # Normalize ".toml" and "toml" to the same ".toml" key; without this a
        # dotted ext would be stored as "..toml" and never match a file suffix.
        registry[f".{cm.ext.lstrip('.')}"] = fn
    return registry
80
+
81
+
57
82
  # ---------------------------------------------------------------------------
58
83
  # Daemon paths
59
84
  # ---------------------------------------------------------------------------
@@ -110,7 +135,9 @@ class ProjectRegistry:
110
135
  """Get or create a Project for the given root. Lazy initialization."""
111
136
  if project_root not in self._projects:
112
137
  root = Path(project_root)
113
- project = await Project.create(root, self._embedder)
138
+ project_settings = load_project_settings(root)
139
+ chunker_registry = _resolve_chunker_registry(project_settings.chunkers)
140
+ project = await Project.create(root, self._embedder, chunker_registry=chunker_registry)
114
141
  self._projects[project_root] = project
115
142
  return self._projects[project_root]
116
143
 
@@ -345,7 +372,7 @@ async def _check_index_status(project_root_str: str) -> DoctorCheckResult:
345
372
  from cocoindex.connectors import sqlite as coco_sqlite
346
373
 
347
374
  project_root = Path(project_root_str)
348
- db_path = project_root / ".cocoindex_code" / "target_sqlite.db"
375
+ db_path = target_sqlite_db_path(project_root)
349
376
  details = [f"Index: {db_path}"]
350
377
 
351
378
  if not db_path.exists():
@@ -441,9 +468,16 @@ async def _dispatch(
441
468
  return StopResponse(ok=True)
442
469
 
443
470
  if isinstance(req, DaemonEnvRequest):
471
+ from .protocol import DbPathMappingEntry
472
+ from .settings import get_db_path_mappings
473
+
444
474
  return DaemonEnvResponse(
445
475
  env_names=sorted(os.environ.keys()),
446
476
  settings_env_names=settings_env_names,
477
+ db_path_mappings=[
478
+ DbPathMappingEntry(source=str(m.source), target=str(m.target))
479
+ for m in get_db_path_mappings()
480
+ ],
447
481
  )
448
482
 
449
483
  if isinstance(req, DoctorRequest):
@@ -14,6 +14,7 @@ from cocoindex.resources.file import FilePathMatcher, PatternFilePathMatcher
14
14
  from cocoindex.resources.id import IdGenerator
15
15
  from pathspec import GitIgnoreSpec
16
16
 
17
+ from .chunking import CHUNKER_REGISTRY
17
18
  from .settings import load_gitignore_spec, load_project_settings
18
19
  from .shared import (
19
20
  CODEBASE_DIR,
@@ -158,13 +159,20 @@ async def process_file(
158
159
  or "text"
159
160
  )
160
161
 
161
- chunks = splitter.split(
162
- content,
163
- chunk_size=CHUNK_SIZE,
164
- min_chunk_size=MIN_CHUNK_SIZE,
165
- chunk_overlap=CHUNK_OVERLAP,
166
- language=language,
167
- )
162
+ chunker_registry = coco.use_context(CHUNKER_REGISTRY)
163
+ chunker = chunker_registry.get(suffix)
164
+ if chunker is not None:
165
+ language_override, chunks = chunker(Path(file.file_path.path), content)
166
+ if language_override is not None:
167
+ language = language_override
168
+ else:
169
+ chunks = splitter.split(
170
+ content,
171
+ chunk_size=CHUNK_SIZE,
172
+ min_chunk_size=MIN_CHUNK_SIZE,
173
+ chunk_overlap=CHUNK_OVERLAP,
174
+ language=language,
175
+ )
168
176
 
169
177
  id_gen = IdGenerator()
170
178
 
@@ -10,6 +10,7 @@ from pathlib import Path
10
10
  import cocoindex as coco
11
11
  from cocoindex.connectors import sqlite as coco_sqlite
12
12
 
13
+ from .chunking import CHUNKER_REGISTRY, ChunkerFn
13
14
  from .indexer import indexer_main
14
15
  from .protocol import (
15
16
  IndexingProgress,
@@ -21,6 +22,15 @@ from .protocol import (
21
22
  SearchResult,
22
23
  )
23
24
  from .query import query_codebase
25
+ from .settings import (
26
+ cocoindex_db_path as _cocoindex_db_path,
27
+ )
28
+ from .settings import (
29
+ resolve_db_dir,
30
+ )
31
+ from .settings import (
32
+ target_sqlite_db_path as _target_sqlite_db_path,
33
+ )
24
34
  from .shared import (
25
35
  CODEBASE_DIR,
26
36
  EMBEDDER,
@@ -170,7 +180,7 @@ class Project:
170
180
  offset: int = 0,
171
181
  ) -> list[SearchResult]:
172
182
  """Search within this project."""
173
- target_db = self._project_root / ".cocoindex_code" / "target_sqlite.db"
183
+ target_db = _target_sqlite_db_path(self._project_root)
174
184
  results = await query_codebase(
175
185
  query=query,
176
186
  target_sqlite_db_path=target_db,
@@ -247,25 +257,37 @@ class Project:
247
257
  async def create(
248
258
  project_root: Path,
249
259
  embedder: Embedder,
260
+ chunker_registry: dict[str, ChunkerFn] | None = None,
250
261
  ) -> Project:
251
262
  """Create a project with explicit embedder.
252
263
 
253
264
  Project-level settings and .gitignore are NOT cached here — the
254
265
  indexer loads them fresh from disk on every run so that user edits
255
266
  take effect without restarting the daemon.
267
+
268
+ Args:
269
+ project_root: Root directory of the codebase to index.
270
+ embedder: Embedding model instance.
271
+ chunker_registry: Optional mapping of file suffix (e.g. ``".toml"``)
272
+ to a ``ChunkerFn``. When a suffix matches, the registered
273
+ chunker is called instead of the built-in splitter.
256
274
  """
257
- index_dir = project_root / ".cocoindex_code"
258
- index_dir.mkdir(parents=True, exist_ok=True)
275
+ settings_dir = project_root / ".cocoindex_code"
276
+ settings_dir.mkdir(parents=True, exist_ok=True)
277
+
278
+ db_dir = resolve_db_dir(project_root)
279
+ db_dir.mkdir(parents=True, exist_ok=True)
259
280
 
260
- cocoindex_db_path = index_dir / "cocoindex.db"
261
- target_sqlite_db_path = index_dir / "target_sqlite.db"
281
+ cocoindex_db = _cocoindex_db_path(project_root)
282
+ target_sqlite_db = _target_sqlite_db_path(project_root)
262
283
 
263
- settings = coco.Settings.from_env(cocoindex_db_path)
284
+ settings = coco.Settings.from_env(cocoindex_db)
264
285
 
265
286
  context = coco.ContextProvider()
266
287
  context.provide(CODEBASE_DIR, project_root)
267
- context.provide(SQLITE_DB, coco_sqlite.connect(str(target_sqlite_db_path), load_vec=True))
288
+ context.provide(SQLITE_DB, coco_sqlite.connect(str(target_sqlite_db), load_vec=True))
268
289
  context.provide(EMBEDDER, embedder)
290
+ context.provide(CHUNKER_REGISTRY, dict(chunker_registry) if chunker_registry else {})
269
291
 
270
292
  env = coco.Environment(settings, context_provider=context)
271
293
  app = coco.App(
@@ -158,9 +158,15 @@ class DoctorResponse(_msgspec.Struct, tag="doctor"):
158
158
  final: bool = False
159
159
 
160
160
 
161
+ class DbPathMappingEntry(_msgspec.Struct):
162
+ source: str
163
+ target: str
164
+
165
+
161
166
  class DaemonEnvResponse(_msgspec.Struct, tag="daemon_env"):
162
167
  env_names: list[str]
163
168
  settings_env_names: list[str]
169
+ db_path_mappings: list[DbPathMappingEntry] = []
164
170
 
165
171
 
166
172
  class ErrorResponse(_msgspec.Struct, tag="error"):
@@ -2,6 +2,7 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
+ import os
5
6
  from dataclasses import dataclass, field
6
7
  from pathlib import Path
7
8
  from typing import Any
@@ -44,6 +45,29 @@ DEFAULT_INCLUDED_PATTERNS: list[str] = [
44
45
  "**/*.rst", # reStructuredText
45
46
  "**/*.php", # PHP
46
47
  "**/*.lua", # Lua
48
+ "**/*.rb", # Ruby
49
+ "**/*.swift", # Swift
50
+ "**/*.kt", # Kotlin
51
+ "**/*.kts", # Kotlin script
52
+ "**/*.scala", # Scala
53
+ "**/*.r", # R
54
+ "**/*.html", # HTML
55
+ "**/*.htm", # HTML
56
+ "**/*.css", # CSS
57
+ "**/*.scss", # SCSS
58
+ "**/*.json", # JSON
59
+ "**/*.xml", # XML
60
+ "**/*.yaml", # YAML
61
+ "**/*.yml", # YAML
62
+ "**/*.toml", # TOML
63
+ "**/*.sol", # Solidity
64
+ "**/*.pas", # Pascal
65
+ "**/*.dpr", # Pascal/Delphi
66
+ "**/*.dtd", # DTD
67
+ "**/*.f", # Fortran
68
+ "**/*.f90", # Fortran
69
+ "**/*.f95", # Fortran
70
+ "**/*.f03", # Fortran
47
71
  ]
48
72
 
49
73
  DEFAULT_EXCLUDED_PATTERNS: list[str] = [
@@ -82,11 +106,18 @@ class LanguageOverride:
82
106
  lang: str # e.g. "php"
83
107
 
84
108
 
109
+ @dataclass
110
+ class ChunkerMapping:
111
+ ext: str # without dot, e.g. "toml"
112
+ module: str # "module.path:callable", e.g. "cocoindex_code.toml_chunker:toml_chunker"
113
+
114
+
85
115
  @dataclass
86
116
  class ProjectSettings:
87
117
  include_patterns: list[str] = field(default_factory=lambda: list(DEFAULT_INCLUDED_PATTERNS))
88
118
  exclude_patterns: list[str] = field(default_factory=lambda: list(DEFAULT_EXCLUDED_PATTERNS))
89
119
  language_overrides: list[LanguageOverride] = field(default_factory=list)
120
+ chunkers: list[ChunkerMapping] = field(default_factory=list)
90
121
 
91
122
 
92
123
  # ---------------------------------------------------------------------------
@@ -115,14 +146,103 @@ _SETTINGS_DIR_NAME = ".cocoindex_code"
115
146
  _SETTINGS_FILE_NAME = "settings.yml" # project-level
116
147
  _USER_SETTINGS_FILE_NAME = "global_settings.yml" # user-level
117
148
 
149
+ _ENV_DB_PATH_MAPPING = "COCOINDEX_CODE_DB_PATH_MAPPING"
150
+
151
+
152
+ @dataclass
153
+ class DbPathMapping:
154
+ source: Path
155
+ target: Path
156
+
157
+
158
+ _db_path_mapping: list[DbPathMapping] | None = None
159
+
160
+
161
def _parse_db_path_mapping() -> list[DbPathMapping]:
    """Parse the ``COCOINDEX_CODE_DB_PATH_MAPPING`` env var.

    Format: ``/src1=/dst1,/src2=/dst2``
    Both source and target must be absolute paths.  Whitespace around entries
    and around the ``=`` separator is ignored; empty entries (e.g. from a
    trailing comma) are skipped.

    Returns:
        Mappings in the order they appear in the variable, with both paths
        resolved.  An unset or blank variable yields an empty list.

    Raises:
        ValueError: If an entry is not ``source=target`` or either side is not
            an absolute path.
    """
    raw = os.environ.get(_ENV_DB_PATH_MAPPING, "")
    if not raw.strip():
        return []

    mappings: list[DbPathMapping] = []
    for entry in raw.split(","):
        entry = entry.strip()
        if not entry:
            continue
        # Split on the first "=" only, so a target that itself contains "="
        # stays intact.  Strip each side so "/a = /b" parses like "/a=/b"
        # instead of failing the absolute-path check on " /b".
        source_text, sep, target_text = (part.strip() for part in entry.partition("="))
        if not sep or not source_text or not target_text:
            raise ValueError(
                f"{_ENV_DB_PATH_MAPPING}: invalid entry {entry!r}, expected format 'source=target'"
            )
        source = Path(source_text)
        target = Path(target_text)
        if not source.is_absolute():
            raise ValueError(
                f"{_ENV_DB_PATH_MAPPING}: source path must be absolute, got {source!r}"
            )
        if not target.is_absolute():
            raise ValueError(
                f"{_ENV_DB_PATH_MAPPING}: target path must be absolute, got {target!r}"
            )
        mappings.append(DbPathMapping(source=source.resolve(), target=target.resolve()))
    return mappings
193
+
194
+
195
def resolve_db_dir(project_root: Path) -> Path:
    """Return the directory that holds the database files for a project.

    The first ``COCOINDEX_CODE_DB_PATH_MAPPING`` entry whose source is the
    resolved project root, or an ancestor of it, wins: the result is that
    entry's target joined with the root's path relative to the source.  When
    nothing matches, the result is ``project_root / ".cocoindex_code"``.
    """
    global _db_path_mapping  # noqa: PLW0603
    if _db_path_mapping is None:
        # Parse the env var lazily and keep the result for later calls.
        _db_path_mapping = _parse_db_path_mapping()

    root = project_root.resolve()
    for entry in _db_path_mapping:
        try:
            # Succeeds when root equals entry.source or lies beneath it.
            relative = root.relative_to(entry.source)
        except ValueError:
            continue
        return entry.target / relative
    return project_root / _SETTINGS_DIR_NAME
211
+
212
+
213
def get_db_path_mappings() -> list[DbPathMapping]:
    """Return a copy of the mappings parsed from ``COCOINDEX_CODE_DB_PATH_MAPPING``.

    The env var is parsed on first use and the result is kept in the
    module-level cache; callers receive a fresh list each time.
    """
    global _db_path_mapping  # noqa: PLW0603
    cached = _db_path_mapping
    if cached is None:
        cached = _db_path_mapping = _parse_db_path_mapping()
    return [*cached]
219
+
220
+
221
+ def _reset_db_path_mapping_cache() -> None:
222
+ """Reset the cached mapping (for tests)."""
223
+ global _db_path_mapping # noqa: PLW0603
224
+ _db_path_mapping = None
225
+
226
+
227
+ _TARGET_SQLITE_DB_NAME = "target_sqlite.db"
228
+ _COCOINDEX_DB_NAME = "cocoindex.db"
229
+
230
+
231
+ def target_sqlite_db_path(project_root: Path) -> Path:
232
+ """Return the path to the vector index SQLite database for a project."""
233
+ return resolve_db_dir(project_root) / _TARGET_SQLITE_DB_NAME
234
+
235
+
236
+ def cocoindex_db_path(project_root: Path) -> Path:
237
+ """Return the path to the CocoIndex state database for a project."""
238
+ return resolve_db_dir(project_root) / _COCOINDEX_DB_NAME
239
+
118
240
 
119
241
  def user_settings_dir() -> Path:
120
242
  """Return ``~/.cocoindex_code/``.
121
243
 
122
244
  Respects ``COCOINDEX_CODE_DIR`` env var for overriding the base directory.
123
245
  """
124
- import os
125
-
126
246
  override = os.environ.get("COCOINDEX_CODE_DIR")
127
247
  if override:
128
248
  return Path(override)
@@ -162,7 +282,7 @@ def find_legacy_project_root(start: Path) -> Path | None:
162
282
  """
163
283
  current = start.resolve()
164
284
  while True:
165
- if (current / _SETTINGS_DIR_NAME / "cocoindex.db").exists():
285
+ if (current / _SETTINGS_DIR_NAME / _COCOINDEX_DB_NAME).exists():
166
286
  return current
167
287
  parent = current.parent
168
288
  if parent == current:
@@ -261,6 +381,8 @@ def _project_settings_to_dict(settings: ProjectSettings) -> dict[str, Any]:
261
381
  d["language_overrides"] = [
262
382
  {"ext": lo.ext, "lang": lo.lang} for lo in settings.language_overrides
263
383
  ]
384
+ if settings.chunkers:
385
+ d["chunkers"] = [{"ext": cm.ext, "module": cm.module} for cm in settings.chunkers]
264
386
  return d
265
387
 
266
388
 
@@ -268,10 +390,12 @@ def _project_settings_from_dict(d: dict[str, Any]) -> ProjectSettings:
268
390
  overrides = [
269
391
  LanguageOverride(ext=lo["ext"], lang=lo["lang"]) for lo in d.get("language_overrides", [])
270
392
  ]
393
+ chunkers = [ChunkerMapping(ext=cm["ext"], module=cm["module"]) for cm in d.get("chunkers", [])]
271
394
  return ProjectSettings(
272
395
  include_patterns=d.get("include_patterns", list(DEFAULT_INCLUDED_PATTERNS)),
273
396
  exclude_patterns=d.get("exclude_patterns", list(DEFAULT_EXCLUDED_PATTERNS)),
274
397
  language_overrides=overrides,
398
+ chunkers=chunkers,
275
399
  )
276
400
 
277
401
 
@@ -1,142 +0,0 @@
1
- """Configuration management for cocoindex-code."""
2
-
3
- from __future__ import annotations
4
-
5
- import json
6
- import os
7
- from dataclasses import dataclass
8
- from pathlib import Path
9
-
10
- _DEFAULT_MODEL = "sbert/sentence-transformers/all-MiniLM-L6-v2"
11
-
12
-
13
- def _find_root_with_marker(start: Path, markers: list[str]) -> Path | None:
14
- """Walk up from start, return first directory containing any marker."""
15
- current = start
16
- while True:
17
- if any((current / m).exists() for m in markers):
18
- return current
19
- parent = current.parent
20
- if parent == current:
21
- return None
22
- current = parent
23
-
24
-
25
- def _discover_codebase_root() -> Path:
26
- """Discover the codebase root directory.
27
-
28
- Discovery order:
29
- 1. Find nearest parent with `.cocoindex_code` directory (re-anchor to previously-indexed tree)
30
- 2. Find nearest parent with any common project root marker
31
- 3. Fall back to current working directory
32
- """
33
- cwd = Path.cwd()
34
-
35
- # First, look for existing .cocoindex_code directory
36
- root = _find_root_with_marker(cwd, [".cocoindex_code"])
37
- if root is not None:
38
- return root
39
-
40
- # Then, look for common project root markers
41
- markers = [".git", "pyproject.toml", "package.json", "Cargo.toml", "go.mod"]
42
- root = _find_root_with_marker(cwd, markers)
43
- return root if root is not None else cwd
44
-
45
-
46
- def _parse_json_string_list_env(var_name: str) -> list[str]:
47
- """Parse an environment variable as a JSON array of strings."""
48
- raw_value = os.environ.get(var_name, "")
49
- if not raw_value.strip():
50
- return []
51
-
52
- try:
53
- parsed = json.loads(raw_value)
54
- except json.JSONDecodeError as exc:
55
- raise ValueError(f"{var_name} must be a JSON array of strings, got invalid JSON") from exc
56
-
57
- if not isinstance(parsed, list):
58
- raise ValueError(f"{var_name} must be a JSON array of strings")
59
-
60
- result: list[str] = []
61
- for item in parsed:
62
- if not isinstance(item, str):
63
- raise ValueError(f"{var_name} must be a JSON array of strings")
64
- item = item.strip()
65
- if item:
66
- result.append(item)
67
-
68
- return result
69
-
70
-
71
- @dataclass
72
- class Config:
73
- """Configuration loaded from environment variables."""
74
-
75
- codebase_root_path: Path
76
- embedding_model: str
77
- index_dir: Path
78
- device: str | None
79
- extra_extensions: dict[str, str | None]
80
- excluded_patterns: list[str]
81
-
82
- @classmethod
83
- def from_env(cls) -> Config:
84
- """Load configuration from environment variables."""
85
- # Get root path from env or discover it
86
- root_path_str = os.environ.get("COCOINDEX_CODE_ROOT_PATH")
87
- if root_path_str:
88
- root = Path(root_path_str).resolve()
89
- else:
90
- root = _discover_codebase_root()
91
-
92
- # Get embedding model
93
- # Prefix "sbert/" for SentenceTransformers models, otherwise LiteLLM.
94
- embedding_model = os.environ.get(
95
- "COCOINDEX_CODE_EMBEDDING_MODEL",
96
- _DEFAULT_MODEL,
97
- )
98
-
99
- # Index directory is always under the root
100
- index_dir = root / ".cocoindex_code"
101
-
102
- # Device: auto-detect CUDA or use env override
103
- device = os.environ.get("COCOINDEX_CODE_DEVICE")
104
-
105
- # Extra file extensions (format: "inc:php,yaml,toml" — optional lang after colon)
106
- raw_extra = os.environ.get("COCOINDEX_CODE_EXTRA_EXTENSIONS", "")
107
- extra_extensions: dict[str, str | None] = {}
108
- for token in raw_extra.split(","):
109
- token = token.strip()
110
- if not token:
111
- continue
112
- if ":" in token:
113
- ext, lang = token.split(":", 1)
114
- extra_extensions[f".{ext.strip()}"] = lang.strip() or None
115
- else:
116
- extra_extensions[f".{token}"] = None
117
-
118
- # Excluded file glob patterns
119
- excluded_patterns = _parse_json_string_list_env("COCOINDEX_CODE_EXCLUDED_PATTERNS")
120
-
121
- return cls(
122
- codebase_root_path=root,
123
- embedding_model=embedding_model,
124
- index_dir=index_dir,
125
- device=device,
126
- extra_extensions=extra_extensions,
127
- excluded_patterns=excluded_patterns,
128
- )
129
-
130
- @property
131
- def target_sqlite_db_path(self) -> Path:
132
- """Path to the vector index SQLite database."""
133
- return self.index_dir / "target_sqlite.db"
134
-
135
- @property
136
- def cocoindex_db_path(self) -> Path:
137
- """Path to the CocoIndex state database."""
138
- return self.index_dir / "cocoindex.db"
139
-
140
-
141
- # Module-level singleton — imported directly by all modules that need configuration
142
- config: Config = Config.from_env()
File without changes