coderay 1.1.1__tar.gz → 1.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (121) hide show
  1. {coderay-1.1.1/src/coderay.egg-info → coderay-1.2.1}/PKG-INFO +47 -38
  2. {coderay-1.1.1 → coderay-1.2.1}/README.md +45 -37
  3. {coderay-1.1.1 → coderay-1.2.1}/pyproject.toml +2 -1
  4. coderay-1.2.1/src/coderay/__init__.py +1 -0
  5. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/cli/commands.py +40 -4
  6. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/core/defaults/default.coderay.toml +2 -2
  7. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/embedding/README.md +21 -13
  8. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/mcp_server/server.py +46 -9
  9. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/parsing/languages.py +39 -20
  10. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/skeleton/README.md +3 -2
  11. coderay-1.2.1/src/coderay/skeleton/extractor.py +402 -0
  12. coderay-1.2.1/src/coderay/skeleton/path_range.py +39 -0
  13. {coderay-1.1.1 → coderay-1.2.1/src/coderay.egg-info}/PKG-INFO +47 -38
  14. {coderay-1.1.1 → coderay-1.2.1}/src/coderay.egg-info/SOURCES.txt +1 -0
  15. {coderay-1.1.1 → coderay-1.2.1}/src/coderay.egg-info/requires.txt +1 -0
  16. coderay-1.1.1/src/coderay/__init__.py +0 -1
  17. coderay-1.1.1/src/coderay/skeleton/extractor.py +0 -281
  18. {coderay-1.1.1 → coderay-1.2.1}/LICENSE +0 -0
  19. {coderay-1.1.1 → coderay-1.2.1}/MANIFEST.in +0 -0
  20. {coderay-1.1.1 → coderay-1.2.1}/NOTICE +0 -0
  21. {coderay-1.1.1 → coderay-1.2.1}/setup.cfg +0 -0
  22. {coderay-1.1.1 → coderay-1.2.1}/src/README.md +0 -0
  23. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/chunking/README.md +0 -0
  24. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/chunking/__init__.py +0 -0
  25. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/chunking/chunker.py +0 -0
  26. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/cli/README.md +0 -0
  27. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/cli/__init__.py +0 -0
  28. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/cli/search_input.py +0 -0
  29. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/core/README.md +0 -0
  30. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/core/__init__.py +0 -0
  31. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/core/config.py +0 -0
  32. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/core/defaults/__init__.py +0 -0
  33. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/core/errors.py +0 -0
  34. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/core/index_workspace.py +0 -0
  35. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/core/lock.py +0 -0
  36. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/core/models.py +0 -0
  37. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/core/timing.py +0 -0
  38. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/core/utils.py +0 -0
  39. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/embedding/__init__.py +0 -0
  40. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/embedding/backend_resolve.py +0 -0
  41. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/embedding/base.py +0 -0
  42. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/embedding/format.py +0 -0
  43. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/embedding/local.py +0 -0
  44. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/embedding/mlx_backend.py +0 -0
  45. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/embedding/prefixes.py +0 -0
  46. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/graph/README.md +0 -0
  47. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/graph/__init__.py +0 -0
  48. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/graph/builder.py +0 -0
  49. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/graph/code_graph.py +0 -0
  50. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/graph/extractors/__init__.py +0 -0
  51. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/graph/extractors/base.py +0 -0
  52. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/graph/extractors/js_ts/__init__.py +0 -0
  53. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/graph/extractors/js_ts/extractor.py +0 -0
  54. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/graph/extractors/python/__init__.py +0 -0
  55. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/graph/extractors/python/extractor.py +0 -0
  56. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/graph/facts.py +0 -0
  57. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/graph/graph_builder.py +0 -0
  58. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/graph/handlers/__init__.py +0 -0
  59. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/graph/handlers/assignment_binder.py +0 -0
  60. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/graph/handlers/call_emitter.py +0 -0
  61. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/graph/handlers/decorator_emitter.py +0 -0
  62. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/graph/handlers/definition_binder.py +0 -0
  63. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/graph/handlers/definition_emitter.py +0 -0
  64. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/graph/handlers/helpers.py +0 -0
  65. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/graph/handlers/js_ts/__init__.py +0 -0
  66. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/graph/handlers/js_ts/import_binder.py +0 -0
  67. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/graph/handlers/js_ts/import_emitter.py +0 -0
  68. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/graph/handlers/python/__init__.py +0 -0
  69. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/graph/handlers/python/assignment_binder.py +0 -0
  70. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/graph/handlers/python/function_binder.py +0 -0
  71. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/graph/handlers/python/import_binder.py +0 -0
  72. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/graph/handlers/python/import_emitter.py +0 -0
  73. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/graph/handlers/python/with_binder.py +0 -0
  74. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/graph/handlers/typed_annotations.py +0 -0
  75. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/graph/handlers/typed_params.py +0 -0
  76. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/graph/impact.py +0 -0
  77. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/graph/language_plugin.py +0 -0
  78. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/graph/lowering/__init__.py +0 -0
  79. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/graph/lowering/callee_resolver.py +0 -0
  80. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/graph/lowering/callee_strategy.py +0 -0
  81. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/graph/lowering/cst_helpers.py +0 -0
  82. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/graph/lowering/name_bindings.py +0 -0
  83. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/graph/materialise.py +0 -0
  84. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/graph/passes/__init__.py +0 -0
  85. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/graph/passes/python.py +0 -0
  86. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/graph/passes/resolve_bare_phantoms.py +0 -0
  87. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/graph/pipeline.py +0 -0
  88. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/graph/project_index.py +0 -0
  89. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/graph/refs.py +0 -0
  90. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/graph/utils.py +0 -0
  91. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/mcp_server/README.md +0 -0
  92. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/mcp_server/__init__.py +0 -0
  93. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/mcp_server/errors.py +0 -0
  94. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/parsing/README.md +0 -0
  95. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/parsing/base.py +0 -0
  96. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/parsing/conventions.py +0 -0
  97. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/parsing/cst_kind.py +0 -0
  98. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/parsing/cst_traversal.py +0 -0
  99. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/pipeline/README.md +0 -0
  100. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/pipeline/__init__.py +0 -0
  101. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/pipeline/indexer.py +0 -0
  102. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/pipeline/watcher.py +0 -0
  103. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/retrieval/README.md +0 -0
  104. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/retrieval/__init__.py +0 -0
  105. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/retrieval/boosting.py +0 -0
  106. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/retrieval/models.py +0 -0
  107. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/retrieval/search.py +0 -0
  108. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/skeleton/__init__.py +0 -0
  109. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/state/README.md +0 -0
  110. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/state/__init__.py +0 -0
  111. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/state/machine.py +0 -0
  112. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/state/version.py +0 -0
  113. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/storage/README.md +0 -0
  114. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/storage/__init__.py +0 -0
  115. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/storage/lancedb.py +0 -0
  116. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/vcs/README.md +0 -0
  117. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/vcs/__init__.py +0 -0
  118. {coderay-1.1.1 → coderay-1.2.1}/src/coderay/vcs/git.py +0 -0
  119. {coderay-1.1.1 → coderay-1.2.1}/src/coderay.egg-info/dependency_links.txt +0 -0
  120. {coderay-1.1.1 → coderay-1.2.1}/src/coderay.egg-info/entry_points.txt +0 -0
  121. {coderay-1.1.1 → coderay-1.2.1}/src/coderay.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: coderay
3
- Version: 1.1.1
3
+ Version: 1.2.1
4
4
  Summary: X-ray your codebase — semantic search, code graphs, file skeletons, and MCP server
5
5
  Author-email: Bogdan Copocean <bogdancopocean@gmail.com>
6
6
  License-Expression: MIT
@@ -41,6 +41,7 @@ Requires-Dist: pytest>=7.0; extra == "dev"
41
41
  Requires-Dist: pytest-cov>=4.0; extra == "dev"
42
42
  Requires-Dist: ruff>=0.8.0; extra == "dev"
43
43
  Requires-Dist: mypy>=1.0.0; extra == "dev"
44
+ Requires-Dist: tiktoken>=0.5.0; extra == "dev"
44
45
  Provides-Extra: maintain
45
46
  Requires-Dist: pylance>=0.15.0; extra == "maintain"
46
47
  Provides-Extra: mlx
@@ -56,38 +57,71 @@ Dynamic: license-file
56
57
  [![License](https://img.shields.io/github/license/bogdan-copocean/coderay)](LICENSE)
57
58
  [![CI](https://github.com/bogdan-copocean/coderay/actions/workflows/ci.yml/badge.svg)](https://github.com/bogdan-copocean/coderay/actions/workflows/ci.yml)
58
59
 
59
- **CodeRay** ships a **local code index** with **semantic search**, **file skeletons** (signatures and docstrings, no bodies), and **blast radius** (callers, imports, inheritance) plus an **MCP stdio server** so agents can use the same tools. Ask *by meaning*, skim **API shape**, trace **who calls what**, then read implementation when it matters: fewer tokens, less noise, answers anchored to the right files.
60
+ **CodeRay** builds a **local code index** that gives AI agents a smarter way to explore a codebase — reading only what they need, not whole files.
60
61
 
61
- **No LLM inside CodeRay, no network, no API key – it runs locally and offline on your machine.**
62
+ **Runs locally. No LLM. No network. No API key.**
62
63
 
64
+ ## The problem
63
65
 
64
- ## Tools
66
+ AI agents exploring a codebase default to reading whole files – even when one function is all that's needed. Every unnecessary line **burns tokens** and **floods the context window**: driving up API costs and noise with every read.
67
+
68
+ The root cause is simple: agents know the **file paths** but no finer location. Without knowing *where* in a file something lives, they have no choice but to read everything.
69
+
70
+ **CodeRay fixes this.** Every tool returns **file paths with exact line ranges** — so agents locate first, then read only the lines that matter.
71
+
72
+ ## How it works
73
+
74
+ CodeRay exposes three primitives, each returning **paths + line ranges**:
75
+
76
+ | Tool | Question it answers | What agents get |
77
+ |------|---------------------|-----------------|
78
+ | **search** | *Where is the code that does X?* | Relevant chunks with file paths and line ranges |
79
+ | **skeleton** | *What's the shape of this file?* | Signatures + docstrings only, each tagged with its line range |
80
+ | **impact** | *What breaks if I change this?* | Callers, imports, and inheritors — located by line range |
81
+
82
+ ### The two-phase flow
65
83
 
66
- **CodeRay sits next to ripgrep, not instead of it.** Ripgrep when you know the string or symbol; search, skeleton, and impact when you care about *intent*, *structure*, or *dependencies*—then open the file when you need real implementation detail.
84
+ 1. **Locate** — run `search`, `skeleton`, or `impact` to find what's needed. Every result includes a file path and a symbol-level line range.
85
+ 2. **Read precisely** — use those line ranges to load only the relevant snippet. Skip the rest.
67
86
 
68
- Semantic search is retrieval, not proof: hits can miss or rank oddly. Treat them as candidates, confirm with a skeleton or read, and keep the index fresh with `coderay watch` or `coderay build` when things drift.
87
+ This keeps context windows lean and agent reasoning focused. CodeRay is not a replacement for `grep` — it fills the gap when exact names are unknown or a map is needed before reading.
88
+
89
+ ### Token savings (tiktoken, `cl100k_base`)
90
+
91
+ | File | Lines | Full read | Skeleton | Savings | % reduction |
92
+ |------|-------|-----------|----------|---------|-------------|
93
+ | `src/coderay/graph/impact.py` | 249 | 2,333 | 693 | **3.4×** | **70%** |
94
+ | `src/coderay/cli/commands.py` | 584 | 4,327 | 1,906 | **2.3×** | **56%** |
95
+ | `src/coderay/pipeline/indexer.py` | 408 | 3,065 | 1,433 | **2.1×** | **53%** |
96
+
97
+ | Query | Search hit tokens | vs full `indexer.py` read |
98
+ |-------|-------------------|---------------------------|
99
+ | "how are files re-indexed on change" | 479 | **~6x cheaper** |
100
+
101
+
102
+ ## Tools
69
103
 
70
- Skeleton shows API shape and docstrings, not every branch. Use **search** and **impact** to narrow where to look, then read the file (or spans) when you need control flow or line-accurate edits. CodeRay trims noise on those round trips; it does not forbid them.
104
+ ### Semantic search
71
105
 
72
- **Semantic search** — “How/where” by meaning.
106
+ Agents search by **meaning**, not by name — useful when the exact function or class is unknown. Results return **file paths with line ranges** pointing at relevant chunks. Treat them as candidates: confirm with `skeleton` or a ranged read before acting. Keep the index fresh with `coderay watch` or `coderay build` when the tree drifts.
73
107
 
74
108
  <img src="assets/coderay-search.gif" alt="coderay search demo" width="100%" />
75
109
 
76
110
  ### Blast radius
77
111
 
78
- Callers and dependents (calls, imports, inheritance).
112
+ Shows **callers, imports, and inheritance** for a symbol before it changes. Each result is tied to a file path and line range — combine with `skeleton` or ranged reads on those locations when bodies are needed.
79
113
 
80
114
  <img src="assets/coderay-impact.gif" alt="coderay impact demo" width="100%" />
81
115
 
82
116
  ### Skeleton
83
117
 
84
- Signatures and docstrings only; API surface without bodies.
118
+ Returns **signatures and docstrings only** — no function bodies. Every block is tagged with its path and line range so subsequent reads can be scoped to exactly those lines. A full file read should happen only when the skeleton isn't enough.
85
119
 
86
120
  <img src="assets/coderay-skeleton.gif" alt="coderay skeleton demo" width="100%" />
87
121
 
88
122
  ### Full read
89
123
 
90
- Same file as skeleton: raw source costs more tokens.
124
+ **Same file, raw source for comparison:**
91
125
 
92
126
  <img src="assets/coderay-fullread.gif" alt="same file, raw source head" width="100%" />
93
127
 
@@ -102,7 +136,7 @@ Same file as skeleton: raw source costs more tokens.
102
136
 
103
137
  ## MCP
104
138
 
105
- Same tools as above, exposed to the agent so it can search, sketch structure, and trace impact instead of vacuuming whole files by default. Point the server at a checkout whose root contains `.coderay.toml` (`CODERAY_REPO_ROOT` below). For choosing tools versus a plain read, see [AGENTS.md](AGENTS.md).
139
+ Same three tools over MCP: search, skeleton (paths and line ranges), and impact—so **AI agents** can **narrow context** before full-file reads. Point the server at a checkout whose root contains `.coderay.toml` (`CODERAY_REPO_ROOT` below). For tool choice versus a plain read, see [AGENTS.md](AGENTS.md).
106
140
 
107
141
  ```bash
108
142
  which coderay-mcp
@@ -123,37 +157,12 @@ which coderay-mcp
123
157
  `CODERAY_REPO_ROOT` must be the directory that contains `.coderay.toml`. More detail: [`mcp_server/README.md`](src/coderay/mcp_server/README.md).
124
158
 
125
159
 
126
- ## Why this matters
127
-
128
- Noisy context windows make models confident about the wrong code. CodeRay front-loads **intent** (search), **shape** (skeleton), and **dependencies** (impact) so the expensive read happens after you have a map—not instead of ever reading implementation when control flow matters.
129
-
130
- ### Token savings (tiktoken, `cl100k_base`)
131
-
132
- Measured on this repo after a full index.
133
-
134
-
135
- | File | Lines | Full read | Skeleton | Savings |
136
- | ---------------------------------- | ----- | --------- | -------- | -------- |
137
- | `src/coderay/pipeline/indexer.py` | 400 | 3,024 | 757 | **4.0x** |
138
- | `src/coderay/graph/code_graph.py` | 500 | 4,261 | 1,022 | **4.2x** |
139
- | `src/coderay/mcp_server/server.py` | 316 | 2,268 | 1,313 | **1.7x** |
140
-
141
-
142
-
143
- | Query | Search hit tokens | vs full `indexer.py` read |
144
- | ------------------------------------ | ----------------- | ------------------------- |
145
- | "how are files re-indexed on change" | 479 | **~6x cheaper** |
146
-
147
-
148
- *Not guarantees — model, chunks, and files affect counts.*
149
-
150
-
151
160
  ## Features
152
161
 
153
162
  - **Languages** — Python, JavaScript, and TypeScript — [`parsing/README.md`](src/coderay/parsing/README.md)
154
163
  - **Multi-repo / monorepo** — roots, aliases, optional `include` subtrees — [`core/README.md`](src/coderay/core/README.md)
155
164
  - **Hybrid search** — vector + BM25 (RRF), optional boosting — [`retrieval/README.md`](src/coderay/retrieval/README.md)
156
- - **Embeddings** — fastembed (CPU) or MLX on Apple Silicon — [`embedding/README.md`](src/coderay/embedding/README.md)
165
+ - **Embeddings** — fastembed (CPU) or MLX on Apple Silicon; defaults to MiniLM L6 for speed — configure BGE in `.coderay.toml` for stronger (heavier) vectors — [`embedding/README.md`](src/coderay/embedding/README.md)
157
166
  - **Watch** — incremental re-index; `.coderay.toml` is the source of truth for what’s indexed
158
167
 
159
168
 
@@ -4,38 +4,71 @@
4
4
  [![License](https://img.shields.io/github/license/bogdan-copocean/coderay)](LICENSE)
5
5
  [![CI](https://github.com/bogdan-copocean/coderay/actions/workflows/ci.yml/badge.svg)](https://github.com/bogdan-copocean/coderay/actions/workflows/ci.yml)
6
6
 
7
- **CodeRay** ships a **local code index** with **semantic search**, **file skeletons** (signatures and docstrings, no bodies), and **blast radius** (callers, imports, inheritance) plus an **MCP stdio server** so agents can use the same tools. Ask *by meaning*, skim **API shape**, trace **who calls what**, then read implementation when it matters: fewer tokens, less noise, answers anchored to the right files.
7
+ **CodeRay** builds a **local code index** that gives AI agents a smarter way to explore a codebase — reading only what they need, not whole files.
8
8
 
9
- **No LLM inside CodeRay, no network, no API key – it runs locally and offline on your machine.**
9
+ **Runs locally. No LLM. No network. No API key.**
10
10
 
11
+ ## The problem
11
12
 
12
- ## Tools
13
+ AI agents exploring a codebase default to reading whole files – even when one function is all that's needed. Every unnecessary line **burns tokens** and **floods the context window**: driving up API costs and noise with every read.
14
+
15
+ The root cause is simple: agents know the **file paths** but no finer location. Without knowing *where* in a file something lives, they have no choice but to read everything.
16
+
17
+ **CodeRay fixes this.** Every tool returns **file paths with exact line ranges** — so agents locate first, then read only the lines that matter.
18
+
19
+ ## How it works
20
+
21
+ CodeRay exposes three primitives, each returning **paths + line ranges**:
22
+
23
+ | Tool | Question it answers | What agents get |
24
+ |------|---------------------|-----------------|
25
+ | **search** | *Where is the code that does X?* | Relevant chunks with file paths and line ranges |
26
+ | **skeleton** | *What's the shape of this file?* | Signatures + docstrings only, each tagged with its line range |
27
+ | **impact** | *What breaks if I change this?* | Callers, imports, and inheritors — located by line range |
28
+
29
+ ### The two-phase flow
13
30
 
14
- **CodeRay sits next to ripgrep, not instead of it.** Ripgrep when you know the string or symbol; search, skeleton, and impact when you care about *intent*, *structure*, or *dependencies*—then open the file when you need real implementation detail.
31
+ 1. **Locate** — run `search`, `skeleton`, or `impact` to find what's needed. Every result includes a file path and a symbol-level line range.
32
+ 2. **Read precisely** — use those line ranges to load only the relevant snippet. Skip the rest.
15
33
 
16
- Semantic search is retrieval, not proof: hits can miss or rank oddly. Treat them as candidates, confirm with a skeleton or read, and keep the index fresh with `coderay watch` or `coderay build` when things drift.
34
+ This keeps context windows lean and agent reasoning focused. CodeRay is not a replacement for `grep` — it fills the gap when exact names are unknown or a map is needed before reading.
35
+
36
+ ### Token savings (tiktoken, `cl100k_base`)
37
+
38
+ | File | Lines | Full read | Skeleton | Savings | % reduction |
39
+ |------|-------|-----------|----------|---------|-------------|
40
+ | `src/coderay/graph/impact.py` | 249 | 2,333 | 693 | **3.4×** | **70%** |
41
+ | `src/coderay/cli/commands.py` | 584 | 4,327 | 1,906 | **2.3×** | **56%** |
42
+ | `src/coderay/pipeline/indexer.py` | 408 | 3,065 | 1,433 | **2.1×** | **53%** |
43
+
44
+ | Query | Search hit tokens | vs full `indexer.py` read |
45
+ |-------|-------------------|---------------------------|
46
+ | "how are files re-indexed on change" | 479 | **~6x cheaper** |
47
+
48
+
49
+ ## Tools
17
50
 
18
- Skeleton shows API shape and docstrings, not every branch. Use **search** and **impact** to narrow where to look, then read the file (or spans) when you need control flow or line-accurate edits. CodeRay trims noise on those round trips; it does not forbid them.
51
+ ### Semantic search
19
52
 
20
- **Semantic search** — “How/where” by meaning.
53
+ Agents search by **meaning**, not by name — useful when the exact function or class is unknown. Results return **file paths with line ranges** pointing at relevant chunks. Treat them as candidates: confirm with `skeleton` or a ranged read before acting. Keep the index fresh with `coderay watch` or `coderay build` when the tree drifts.
21
54
 
22
55
  <img src="assets/coderay-search.gif" alt="coderay search demo" width="100%" />
23
56
 
24
57
  ### Blast radius
25
58
 
26
- Callers and dependents (calls, imports, inheritance).
59
+ Shows **callers, imports, and inheritance** for a symbol before it changes. Each result is tied to a file path and line range — combine with `skeleton` or ranged reads on those locations when bodies are needed.
27
60
 
28
61
  <img src="assets/coderay-impact.gif" alt="coderay impact demo" width="100%" />
29
62
 
30
63
  ### Skeleton
31
64
 
32
- Signatures and docstrings only; API surface without bodies.
65
+ Returns **signatures and docstrings only** — no function bodies. Every block is tagged with its path and line range so subsequent reads can be scoped to exactly those lines. A full file read should happen only when the skeleton isn't enough.
33
66
 
34
67
  <img src="assets/coderay-skeleton.gif" alt="coderay skeleton demo" width="100%" />
35
68
 
36
69
  ### Full read
37
70
 
38
- Same file as skeleton: raw source costs more tokens.
71
+ **Same file, raw source for comparison:**
39
72
 
40
73
  <img src="assets/coderay-fullread.gif" alt="same file, raw source head" width="100%" />
41
74
 
@@ -50,7 +83,7 @@ Same file as skeleton: raw source costs more tokens.
50
83
 
51
84
  ## MCP
52
85
 
53
- Same tools as above, exposed to the agent so it can search, sketch structure, and trace impact instead of vacuuming whole files by default. Point the server at a checkout whose root contains `.coderay.toml` (`CODERAY_REPO_ROOT` below). For choosing tools versus a plain read, see [AGENTS.md](AGENTS.md).
86
+ Same three tools over MCP: search, skeleton (paths and line ranges), and impact—so **AI agents** can **narrow context** before full-file reads. Point the server at a checkout whose root contains `.coderay.toml` (`CODERAY_REPO_ROOT` below). For tool choice versus a plain read, see [AGENTS.md](AGENTS.md).
54
87
 
55
88
  ```bash
56
89
  which coderay-mcp
@@ -71,37 +104,12 @@ which coderay-mcp
71
104
  `CODERAY_REPO_ROOT` must be the directory that contains `.coderay.toml`. More detail: [`mcp_server/README.md`](src/coderay/mcp_server/README.md).
72
105
 
73
106
 
74
- ## Why this matters
75
-
76
- Noisy context windows make models confident about the wrong code. CodeRay front-loads **intent** (search), **shape** (skeleton), and **dependencies** (impact) so the expensive read happens after you have a map—not instead of ever reading implementation when control flow matters.
77
-
78
- ### Token savings (tiktoken, `cl100k_base`)
79
-
80
- Measured on this repo after a full index.
81
-
82
-
83
- | File | Lines | Full read | Skeleton | Savings |
84
- | ---------------------------------- | ----- | --------- | -------- | -------- |
85
- | `src/coderay/pipeline/indexer.py` | 400 | 3,024 | 757 | **4.0x** |
86
- | `src/coderay/graph/code_graph.py` | 500 | 4,261 | 1,022 | **4.2x** |
87
- | `src/coderay/mcp_server/server.py` | 316 | 2,268 | 1,313 | **1.7x** |
88
-
89
-
90
-
91
- | Query | Search hit tokens | vs full `indexer.py` read |
92
- | ------------------------------------ | ----------------- | ------------------------- |
93
- | "how are files re-indexed on change" | 479 | **~6x cheaper** |
94
-
95
-
96
- *Not guarantees — model, chunks, and files affect counts.*
97
-
98
-
99
107
  ## Features
100
108
 
101
109
  - **Languages** — Python, JavaScript, and TypeScript — [`parsing/README.md`](src/coderay/parsing/README.md)
102
110
  - **Multi-repo / monorepo** — roots, aliases, optional `include` subtrees — [`core/README.md`](src/coderay/core/README.md)
103
111
  - **Hybrid search** — vector + BM25 (RRF), optional boosting — [`retrieval/README.md`](src/coderay/retrieval/README.md)
104
- - **Embeddings** — fastembed (CPU) or MLX on Apple Silicon — [`embedding/README.md`](src/coderay/embedding/README.md)
112
+ - **Embeddings** — fastembed (CPU) or MLX on Apple Silicon; defaults to MiniLM L6 for speed — configure BGE in `.coderay.toml` for stronger (heavier) vectors — [`embedding/README.md`](src/coderay/embedding/README.md)
105
113
  - **Watch** — incremental re-index; `.coderay.toml` is the source of truth for what’s indexed
106
114
 
107
115
 
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "coderay"
7
- version = "1.1.1"
7
+ version = "1.2.1"
8
8
  description = "X-ray your codebase — semantic search, code graphs, file skeletons, and MCP server"
9
9
  readme = "README.md"
10
10
  license = "MIT"
@@ -53,6 +53,7 @@ dev = [
53
53
  "pytest-cov>=4.0",
54
54
  "ruff>=0.8.0",
55
55
  "mypy>=1.0.0",
56
+ "tiktoken>=0.5.0",
56
57
  ]
57
58
  maintain = [
58
59
  "pylance>=0.15.0",
@@ -0,0 +1 @@
1
+ __version__ = "1.2.1"
@@ -340,7 +340,7 @@ def maintain(ctx: click.Context) -> None:
340
340
 
341
341
 
342
342
  @cli.command()
343
- @click.argument("file_path", type=click.Path(exists=True, path_type=Path))
343
+ @click.argument("file_path", type=str)
344
344
  @click.option(
345
345
  "--include-imports",
346
346
  is_flag=True,
@@ -353,17 +353,53 @@ def maintain(ctx: click.Context) -> None:
353
353
  default=None,
354
354
  help="Filter to a specific class or top-level function by name.",
355
355
  )
356
+ @click.option(
357
+ "--lines",
358
+ "line_range",
359
+ default=None,
360
+ metavar="START-END",
361
+ help=(
362
+ "File line range (1-based inclusive); keep only symbols fully within this span."
363
+ " Do not combine with a :START-END suffix on FILE_PATH (same meaning)."
364
+ ),
365
+ )
356
366
  def skeleton(
357
- file_path: Path,
367
+ file_path: str,
358
368
  include_imports: bool,
359
369
  symbol: str | None,
370
+ line_range: str | None,
360
371
  ) -> None:
361
372
  """Print signatures without bodies (cheaper than reading the full file)."""
362
373
  from coderay.skeleton.extractor import extract_skeleton
374
+ from coderay.skeleton.path_range import (
375
+ parse_file_line_range,
376
+ parse_skeleton_file_arg,
377
+ )
363
378
 
364
- content = file_path.read_text(encoding="utf-8", errors="replace")
379
+ try:
380
+ path_str, rng_from_path = parse_skeleton_file_arg(file_path, parse_suffix=True)
381
+ except ValueError as e:
382
+ raise click.BadParameter(str(e)) from e
383
+ file_line_range = rng_from_path
384
+ if line_range:
385
+ if file_line_range is not None:
386
+ raise click.UsageError(
387
+ "Use either a path ending with :START-END or --lines, not both."
388
+ )
389
+ try:
390
+ file_line_range = parse_file_line_range(line_range)
391
+ except ValueError as e:
392
+ raise click.BadParameter(str(e), param_hint="--lines") from e
393
+ resolved = Path(path_str)
394
+ if not resolved.is_file():
395
+ raise click.BadParameter(f"not a file: {path_str}", param_hint="file_path")
396
+ content = resolved.read_text(encoding="utf-8", errors="replace")
365
397
  out = extract_skeleton(
366
- file_path, content, include_imports=include_imports, symbol=symbol
398
+ resolved,
399
+ content,
400
+ include_imports=include_imports,
401
+ symbol=symbol,
402
+ line_range=file_line_range,
367
403
  )
368
404
  click.echo(out)
369
405
 
@@ -38,13 +38,13 @@ backend = "auto"
38
38
 
39
39
  [embedder.fastembed]
40
40
  # Default embedder. Runs on CPU.
41
- model_name = "BAAI/bge-small-en-v1.5"
41
+ model_name = "sentence-transformers/all-MiniLM-L6-v2"
42
42
  dimensions = 384
43
43
  batch_size = 64
44
44
 
45
45
  [embedder.mlx]
46
46
  # Apple Silicon embedder (MLX/Metal; device depends on runtime).
47
- model_name = "mlx-community/bge-small-en-v1.5-bf16"
47
+ model_name = "mlx-community/all-MiniLM-L6-v2-4bit"
48
48
  dimensions = 384
49
49
  batch_size = 256
50
50
 
@@ -27,25 +27,33 @@ Maps code chunks to dense vectors for storage and query.
27
27
  Run `coderay build --full` after any change to `[embedder]` config. Vectors
28
28
  from different models are not compatible.
29
29
 
30
- ## If indexing is slow
30
+ ## Defaults and trade-offs
31
31
 
32
- The default model (BGE Small, ~67MB via fastembed / ~25MB via MLX bf16) is a
33
- good balance of speed and retrieval quality. If your repo is large and the first
34
- build takes too long, consider a lighter model:
32
+ The default is **MiniLM L6** (`sentence-transformers/all-MiniLM-L6-v2` on CPU,
33
+ `mlx-community/all-MiniLM-L6-v2-bf16` on MLX): fast indexing and good enough
34
+ semantic search for most workflows. For **stronger embeddings** (often better
35
+ retrieval on code), switch to **BGE Small** — expect more compute per batch than
36
+ MiniLM (download sizes are listed in the table below).
35
37
 
36
- | Model | Backend | Size | Dimensions | Trade-off |
37
- |-------|---------|------|------------|-----------|
38
- | `BAAI/bge-small-en-v1.5` | fastembed | ~67MB | 384 | **Default.** Best retrieval quality in this size class. |
39
- | `sentence-transformers/all-MiniLM-L6-v2` | fastembed | ~90MB | 384 | Widely used, slightly lower code retrieval quality than BGE Small. Larger download. |
40
- | `mlx-community/bge-small-en-v1.5-4bit` | mlx | ~19MB | 384 | 4-bit quantised BGE Small. Fast on Apple Silicon, minimal download. Small quality delta vs bf16 — untested on code retrieval specifically. |
41
- | `mlx-community/all-MiniLM-L6-v2-4bit` | mlx | ~13MB | 384 | Smallest option. Fastest cold start. Noticeably lower retrieval quality for code; best suited for quick experimentation. |
38
+ | Model | Backend | Size (approx.) | Dimensions | Notes |
39
+ |-------|---------|----------------|------------|-------|
40
+ | `sentence-transformers/all-MiniLM-L6-v2` | fastembed | ~90MB | 384 | **Default.** Fast; widely used. |
41
+ | `BAAI/bge-small-en-v1.5` | fastembed | ~67MB | 384 | Heavier quality focus; strong retrieval in this size class. |
42
+ | `mlx-community/all-MiniLM-L6-v2-bf16` | mlx | ~45MB | 384 | **Default** on Apple Silicon with `coderay[mlx]`. |
43
+ | `mlx-community/bge-small-en-v1.5-bf16` | mlx | ~25MB | 384 | BGE on MLX; better embeddings than MiniLM, more work per batch. |
44
+ | `mlx-community/bge-small-en-v1.5-4bit` | mlx | ~19MB | 384 | 4-bit BGE; smaller download, small quality delta vs bf16. |
45
+ | `mlx-community/all-MiniLM-L6-v2-4bit` | mlx | ~13MB | 384 | Smallest; fastest cold start; lower retrieval quality for code. |
42
46
 
43
- To switch, update `.coderay.toml` and run `coderay build --full`:
47
+ To use BGE instead of the defaults, edit `.coderay.toml` and run `coderay build --full`:
44
48
 
45
49
  ```toml
46
- # Example: lighter MLX model on Apple Silicon
50
+ [embedder.fastembed]
51
+ model_name = "BAAI/bge-small-en-v1.5"
52
+ dimensions = 384
53
+ batch_size = 64
54
+
47
55
  [embedder.mlx]
48
- model_name = "mlx-community/bge-small-en-v1.5-4bit"
56
+ model_name = "mlx-community/bge-small-en-v1.5-bf16"
49
57
  dimensions = 384
50
58
  batch_size = 256
51
59
  ```
@@ -37,8 +37,10 @@ mcp = FastMCP(
37
37
  "\n"
38
38
  "- semantic_search: search code by meaning. Best for "
39
39
  "'how/where' questions. Use grep for exact symbol lookup.\n"
40
- "- get_file_skeleton: signatures and docstrings only, no bodies. "
41
- "Check a file's API before reading full source. "
40
+ "- get_file_skeleton: signatures and docstrings only, no bodies; "
41
+ "absolute path line per symbol (with optional symbol line range suffix) "
42
+ "for filepath:START-END style refs. "
43
+ "Optional file line range narrows output. "
42
44
  "Works without the index.\n"
43
45
  "- get_impact_radius: reverse dependency traversal from the code "
44
46
  "graph. Shows callers/dependents of a function or class. "
@@ -177,10 +179,11 @@ async def semantic_search(
177
179
  @mcp.tool(
178
180
  description=(
179
181
  "Extracts class/function signatures and docstrings from a file — "
180
- "no bodies. Significantly fewer tokens than reading the full source "
181
- "(a 500-line file typically compresses to ~100 lines of skeleton). "
182
- "Use this before deciding whether to read a file in full. "
183
- "Does not require the index."
182
+ "no bodies. Each symbol is preceded by the absolute file path and "
183
+ "symbol line range suffix (1-based inclusive) for filepath:START-END refs. "
184
+ "Optional file line range via path suffix :START-END or file_line_range "
185
+ "(same meaning; do not pass both). Narrows to declarations fully within that "
186
+ "range. Does not require the index."
184
187
  ),
185
188
  annotations=READ_ONLY_ANNOTATIONS,
186
189
  tags={"analysis"},
@@ -188,7 +191,12 @@ async def semantic_search(
188
191
  async def get_file_skeleton(
189
192
  file_path: Annotated[
190
193
  str,
191
- Field(description="Absolute or relative path to the file"),
194
+ Field(
195
+ description=(
196
+ "Path to the file. Optional :START-END suffix (same as file_line_range)"
197
+ "; do not combine with file_line_range."
198
+ ),
199
+ ),
192
200
  ],
193
201
  include_imports: Annotated[
194
202
  bool,
@@ -206,18 +214,46 @@ async def get_file_skeleton(
206
214
  ),
207
215
  ),
208
216
  ] = None,
217
+ file_line_range: Annotated[
218
+ str | None,
219
+ Field(
220
+ description=(
221
+ "Optional file line range as START-END (1-based inclusive). "
222
+ "Do not combine with a :START-END suffix on file_path."
223
+ ),
224
+ ),
225
+ ] = None,
209
226
  ) -> str:
210
227
  """Get file API surface (signatures, no bodies)."""
211
228
  from coderay.skeleton.extractor import extract_skeleton
229
+ from coderay.skeleton.path_range import (
230
+ parse_file_line_range,
231
+ parse_skeleton_file_arg,
232
+ )
233
+
234
+ try:
235
+ path_str, rng_suffix = parse_skeleton_file_arg(file_path, parse_suffix=True)
236
+ except ValueError as e:
237
+ raise ValueError(str(e)) from e
238
+ line_range: tuple[int, int] | None = rng_suffix
239
+ if file_line_range:
240
+ if line_range is not None:
241
+ raise ValueError(
242
+ "Use either file_path :START-END suffix or file_line_range, not both."
243
+ )
244
+ try:
245
+ line_range = parse_file_line_range(file_line_range)
246
+ except ValueError as e:
247
+ raise ValueError(str(e)) from e
212
248
 
213
249
  workspace_root = _resolve_index_dir().parent.resolve()
214
- candidate = (workspace_root / file_path).resolve()
250
+ candidate = (workspace_root / path_str).resolve()
215
251
  try:
216
252
  candidate.relative_to(workspace_root)
217
253
  except ValueError:
218
254
  raise FileNotFoundError(f"File not found: {file_path}")
219
255
  if not candidate.is_file():
220
- raise FileNotFoundError(f"File not found: {file_path}")
256
+ raise FileNotFoundError(f"File not found: {path_str}")
221
257
  content = await asyncio.to_thread(
222
258
  candidate.read_text, encoding="utf-8", errors="replace"
223
259
  )
@@ -227,6 +263,7 @@ async def get_file_skeleton(
227
263
  content,
228
264
  include_imports=include_imports,
229
265
  symbol=symbol,
266
+ line_range=line_range,
230
267
  )
231
268
 
232
269
 
@@ -37,8 +37,11 @@ class GraphConfig:
37
37
 
38
38
  @dataclass
39
39
  class SkeletonConfig:
40
- """Skeleton-only: docstrings and pass-through at module scope."""
40
+ """Skeleton: declaration types (chunker-style), docstrings, module pass-through."""
41
41
 
42
+ # Node types that emit as declarations. JS/TS omits export_statement (unwrap) and
43
+ # lexical_declaration (top_level_expr_types). See chunk_types in this file.
44
+ symbol_types: tuple[str, ...]
42
45
  docstring_expr_type: str = "expression_statement"
43
46
  top_level_expr_types: tuple[str, ...] = ("expression_statement",)
44
47
  body_block_types: tuple[str, ...] = ("block", "statement_block")
@@ -102,8 +105,16 @@ _PYTHON_CST_DISPATCH = CstDispatchConfig(
102
105
  )
103
106
 
104
107
 
108
+ _PY_CHUNK_TYPES: tuple[str, ...] = (
109
+ "function_definition",
110
+ "class_definition",
111
+ "decorated_definition",
112
+ )
113
+
114
+
105
115
  def _python_skeleton() -> SkeletonConfig:
106
116
  return SkeletonConfig(
117
+ symbol_types=_PY_CHUNK_TYPES,
107
118
  docstring_expr_type="expression_statement",
108
119
  top_level_expr_types=("expression_statement",),
109
120
  body_block_types=("block",),
@@ -111,13 +122,7 @@ def _python_skeleton() -> SkeletonConfig:
111
122
 
112
123
 
113
124
  def _python_chunker() -> ChunkerConfig:
114
- return ChunkerConfig(
115
- chunk_types=(
116
- "function_definition",
117
- "class_definition",
118
- "decorated_definition",
119
- ),
120
- )
125
+ return ChunkerConfig(chunk_types=_PY_CHUNK_TYPES)
121
126
 
122
127
 
123
128
  _PYTHON_GRAPH = GraphConfig(
@@ -172,8 +177,33 @@ _JS_TS_GRAPH = GraphConfig(
172
177
  )
173
178
 
174
179
 
180
+ # Chunker includes export_statement and lexical_declaration; skeleton unwraps exports
181
+ # and treats top-level lexical_declaration via top_level_expr_types.
182
+ _JS_TS_CHUNK_TYPES: tuple[str, ...] = (
183
+ "function_declaration",
184
+ "class_declaration",
185
+ "method_definition",
186
+ "arrow_function",
187
+ "export_statement",
188
+ "lexical_declaration",
189
+ "interface_declaration",
190
+ "type_alias_declaration",
191
+ )
192
+
193
+ _JS_TS_SKELETON_SYMBOL_TYPES: tuple[str, ...] = (
194
+ "function_declaration",
195
+ "class_declaration",
196
+ "method_definition",
197
+ "arrow_function",
198
+ "interface_declaration",
199
+ "type_alias_declaration",
200
+ "type_declaration",
201
+ )
202
+
203
+
175
204
  def _js_ts_skeleton() -> SkeletonConfig:
176
205
  return SkeletonConfig(
206
+ symbol_types=_JS_TS_SKELETON_SYMBOL_TYPES,
177
207
  docstring_expr_type="expression_statement",
178
208
  top_level_expr_types=("expression_statement", "lexical_declaration"),
179
209
  body_block_types=("statement_block",),
@@ -181,18 +211,7 @@ def _js_ts_skeleton() -> SkeletonConfig:
181
211
 
182
212
 
183
213
  def _js_ts_chunker() -> ChunkerConfig:
184
- return ChunkerConfig(
185
- chunk_types=(
186
- "function_declaration",
187
- "class_declaration",
188
- "method_definition",
189
- "arrow_function",
190
- "export_statement",
191
- "lexical_declaration",
192
- "interface_declaration",
193
- "type_alias_declaration",
194
- ),
195
- )
214
+ return ChunkerConfig(chunk_types=_JS_TS_CHUNK_TYPES)
196
215
 
197
216
 
198
217
  @dataclass
@@ -6,8 +6,9 @@ demand (not stored in the index). Works without a built index.
6
6
 
7
7
  ## How it works
8
8
 
9
- `extractor.py` uses tree-sitter to parse the file, then walks the CST using
10
- `classify_node` from `parsing/cst_kind.py` to identify structural boundaries.
9
+ `extractor.py` uses tree-sitter to parse the file, then walks the CST; declaration
10
+ nodes come from `LanguageConfig.skeleton.symbol_types` (per language, like chunk
11
+ types in `parsing/languages.py`), with shape from `cst` function/class/decorator sets.
11
12
  Function and method bodies are replaced with `...`. Class headers are kept as
12
13
  context even when filtering to a specific symbol.
13
14