coderay 1.1.0__tar.gz → 1.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (146) hide show
  1. {coderay-1.1.0/src/coderay.egg-info → coderay-1.2.0}/PKG-INFO +46 -39
  2. {coderay-1.1.0 → coderay-1.2.0}/README.md +44 -38
  3. {coderay-1.1.0 → coderay-1.2.0}/pyproject.toml +2 -1
  4. coderay-1.2.0/src/coderay/__init__.py +1 -0
  5. {coderay-1.1.0 → coderay-1.2.0}/src/coderay/cli/commands.py +47 -4
  6. {coderay-1.1.0 → coderay-1.2.0}/src/coderay/core/timing.py +7 -2
  7. {coderay-1.1.0 → coderay-1.2.0}/src/coderay/embedding/base.py +2 -2
  8. {coderay-1.1.0 → coderay-1.2.0}/src/coderay/embedding/local.py +21 -9
  9. {coderay-1.1.0 → coderay-1.2.0}/src/coderay/embedding/mlx_backend.py +20 -19
  10. coderay-1.2.0/src/coderay/graph/README.md +45 -0
  11. {coderay-1.1.0 → coderay-1.2.0}/src/coderay/graph/__init__.py +3 -2
  12. {coderay-1.1.0 → coderay-1.2.0}/src/coderay/graph/builder.py +6 -31
  13. coderay-1.2.0/src/coderay/graph/code_graph.py +276 -0
  14. coderay-1.2.0/src/coderay/graph/extractors/__init__.py +5 -0
  15. coderay-1.2.0/src/coderay/graph/extractors/base.py +248 -0
  16. coderay-1.2.0/src/coderay/graph/extractors/js_ts/__init__.py +7 -0
  17. coderay-1.2.0/src/coderay/graph/extractors/js_ts/extractor.py +53 -0
  18. coderay-1.2.0/src/coderay/graph/extractors/python/__init__.py +7 -0
  19. coderay-1.2.0/src/coderay/graph/extractors/python/extractor.py +58 -0
  20. {coderay-1.1.0 → coderay-1.2.0}/src/coderay/graph/facts.py +10 -0
  21. coderay-1.2.0/src/coderay/graph/graph_builder.py +92 -0
  22. coderay-1.2.0/src/coderay/graph/handlers/__init__.py +1 -0
  23. coderay-1.2.0/src/coderay/graph/handlers/assignment_binder.py +140 -0
  24. coderay-1.2.0/src/coderay/graph/handlers/call_emitter.py +47 -0
  25. coderay-1.2.0/src/coderay/graph/handlers/decorator_emitter.py +74 -0
  26. coderay-1.2.0/src/coderay/graph/handlers/definition_binder.py +41 -0
  27. coderay-1.2.0/src/coderay/graph/handlers/definition_emitter.py +68 -0
  28. coderay-1.2.0/src/coderay/graph/handlers/helpers.py +20 -0
  29. coderay-1.2.0/src/coderay/graph/handlers/js_ts/__init__.py +1 -0
  30. coderay-1.2.0/src/coderay/graph/handlers/js_ts/import_binder.py +111 -0
  31. coderay-1.2.0/src/coderay/graph/handlers/js_ts/import_emitter.py +41 -0
  32. coderay-1.2.0/src/coderay/graph/handlers/python/__init__.py +1 -0
  33. coderay-1.2.0/src/coderay/graph/handlers/python/assignment_binder.py +129 -0
  34. coderay-1.2.0/src/coderay/graph/handlers/python/function_binder.py +55 -0
  35. coderay-1.2.0/src/coderay/graph/handlers/python/import_binder.py +133 -0
  36. coderay-1.2.0/src/coderay/graph/handlers/python/import_emitter.py +58 -0
  37. coderay-1.2.0/src/coderay/graph/handlers/python/with_binder.py +59 -0
  38. coderay-1.2.0/src/coderay/graph/handlers/typed_annotations.py +77 -0
  39. coderay-1.2.0/src/coderay/graph/handlers/typed_params.py +139 -0
  40. coderay-1.2.0/src/coderay/graph/impact.py +249 -0
  41. coderay-1.2.0/src/coderay/graph/language_plugin.py +63 -0
  42. coderay-1.2.0/src/coderay/graph/lowering/__init__.py +3 -0
  43. coderay-1.2.0/src/coderay/graph/lowering/callee_resolver.py +141 -0
  44. coderay-1.2.0/src/coderay/graph/lowering/callee_strategy.py +25 -0
  45. coderay-1.2.0/src/coderay/graph/lowering/cst_helpers.py +64 -0
  46. coderay-1.2.0/src/coderay/graph/lowering/name_bindings.py +229 -0
  47. coderay-1.1.0/src/coderay/graph/emit.py → coderay-1.2.0/src/coderay/graph/materialise.py +8 -10
  48. coderay-1.2.0/src/coderay/graph/passes/resolve_bare_phantoms.py +49 -0
  49. coderay-1.2.0/src/coderay/graph/pipeline.py +35 -0
  50. coderay-1.2.0/src/coderay/graph/project_index.py +43 -0
  51. coderay-1.2.0/src/coderay/graph/refs.py +59 -0
  52. coderay-1.1.0/src/coderay/graph/identifiers.py → coderay-1.2.0/src/coderay/graph/utils.py +6 -3
  53. {coderay-1.1.0 → coderay-1.2.0}/src/coderay/mcp_server/server.py +53 -9
  54. {coderay-1.1.0 → coderay-1.2.0}/src/coderay/parsing/README.md +2 -2
  55. {coderay-1.1.0 → coderay-1.2.0}/src/coderay/parsing/base.py +4 -1
  56. coderay-1.1.0/src/coderay/graph/_utils.py → coderay-1.2.0/src/coderay/parsing/conventions.py +16 -19
  57. coderay-1.2.0/src/coderay/parsing/cst_traversal.py +118 -0
  58. {coderay-1.1.0 → coderay-1.2.0}/src/coderay/parsing/languages.py +39 -20
  59. {coderay-1.1.0 → coderay-1.2.0}/src/coderay/pipeline/indexer.py +48 -40
  60. {coderay-1.1.0 → coderay-1.2.0}/src/coderay/skeleton/README.md +3 -2
  61. coderay-1.2.0/src/coderay/skeleton/extractor.py +402 -0
  62. coderay-1.2.0/src/coderay/skeleton/path_range.py +39 -0
  63. {coderay-1.1.0 → coderay-1.2.0}/src/coderay/vcs/git.py +0 -4
  64. {coderay-1.1.0 → coderay-1.2.0/src/coderay.egg-info}/PKG-INFO +46 -39
  65. {coderay-1.1.0 → coderay-1.2.0}/src/coderay.egg-info/SOURCES.txt +40 -25
  66. {coderay-1.1.0 → coderay-1.2.0}/src/coderay.egg-info/requires.txt +1 -0
  67. coderay-1.1.0/src/coderay/__init__.py +0 -1
  68. coderay-1.1.0/src/coderay/graph/README.md +0 -58
  69. coderay-1.1.0/src/coderay/graph/code_graph.py +0 -500
  70. coderay-1.1.0/src/coderay/graph/extractor.py +0 -67
  71. coderay-1.1.0/src/coderay/graph/file_context.py +0 -140
  72. coderay-1.1.0/src/coderay/graph/passes/global_passes.py +0 -10
  73. coderay-1.1.0/src/coderay/graph/pipeline.py +0 -17
  74. coderay-1.1.0/src/coderay/graph/plugin_protocol.py +0 -62
  75. coderay-1.1.0/src/coderay/graph/plugins/__init__.py +0 -1
  76. coderay-1.1.0/src/coderay/graph/plugins/base/__init__.py +0 -10
  77. coderay-1.1.0/src/coderay/graph/plugins/base/extractor.py +0 -142
  78. coderay-1.1.0/src/coderay/graph/plugins/base/handlers/__init__.py +0 -13
  79. coderay-1.1.0/src/coderay/graph/plugins/base/handlers/assignments.py +0 -137
  80. coderay-1.1.0/src/coderay/graph/plugins/base/handlers/calls.py +0 -304
  81. coderay-1.1.0/src/coderay/graph/plugins/base/handlers/definitions.py +0 -143
  82. coderay-1.1.0/src/coderay/graph/plugins/base/handlers/type_resolution.py +0 -202
  83. coderay-1.1.0/src/coderay/graph/plugins/base/plugin.py +0 -43
  84. coderay-1.1.0/src/coderay/graph/plugins/js_ts/__init__.py +0 -8
  85. coderay-1.1.0/src/coderay/graph/plugins/js_ts/extractor.py +0 -65
  86. coderay-1.1.0/src/coderay/graph/plugins/js_ts/import_handler.py +0 -106
  87. coderay-1.1.0/src/coderay/graph/plugins/python/__init__.py +0 -7
  88. coderay-1.1.0/src/coderay/graph/plugins/python/extractor.py +0 -280
  89. coderay-1.1.0/src/coderay/graph/plugins/python/import_handler.py +0 -139
  90. coderay-1.1.0/src/coderay/graph/registry.py +0 -23
  91. coderay-1.1.0/src/coderay/graph/resolution.py +0 -16
  92. coderay-1.1.0/src/coderay/parsing/conventions.py +0 -13
  93. coderay-1.1.0/src/coderay/skeleton/extractor.py +0 -281
  94. {coderay-1.1.0 → coderay-1.2.0}/LICENSE +0 -0
  95. {coderay-1.1.0 → coderay-1.2.0}/MANIFEST.in +0 -0
  96. {coderay-1.1.0 → coderay-1.2.0}/NOTICE +0 -0
  97. {coderay-1.1.0 → coderay-1.2.0}/setup.cfg +0 -0
  98. {coderay-1.1.0 → coderay-1.2.0}/src/README.md +0 -0
  99. {coderay-1.1.0 → coderay-1.2.0}/src/coderay/chunking/README.md +0 -0
  100. {coderay-1.1.0 → coderay-1.2.0}/src/coderay/chunking/__init__.py +0 -0
  101. {coderay-1.1.0 → coderay-1.2.0}/src/coderay/chunking/chunker.py +0 -0
  102. {coderay-1.1.0 → coderay-1.2.0}/src/coderay/cli/README.md +0 -0
  103. {coderay-1.1.0 → coderay-1.2.0}/src/coderay/cli/__init__.py +0 -0
  104. {coderay-1.1.0 → coderay-1.2.0}/src/coderay/cli/search_input.py +0 -0
  105. {coderay-1.1.0 → coderay-1.2.0}/src/coderay/core/README.md +0 -0
  106. {coderay-1.1.0 → coderay-1.2.0}/src/coderay/core/__init__.py +0 -0
  107. {coderay-1.1.0 → coderay-1.2.0}/src/coderay/core/config.py +0 -0
  108. {coderay-1.1.0 → coderay-1.2.0}/src/coderay/core/defaults/__init__.py +0 -0
  109. {coderay-1.1.0 → coderay-1.2.0}/src/coderay/core/defaults/default.coderay.toml +0 -0
  110. {coderay-1.1.0 → coderay-1.2.0}/src/coderay/core/errors.py +0 -0
  111. {coderay-1.1.0 → coderay-1.2.0}/src/coderay/core/index_workspace.py +0 -0
  112. {coderay-1.1.0 → coderay-1.2.0}/src/coderay/core/lock.py +0 -0
  113. {coderay-1.1.0 → coderay-1.2.0}/src/coderay/core/models.py +0 -0
  114. {coderay-1.1.0 → coderay-1.2.0}/src/coderay/core/utils.py +0 -0
  115. {coderay-1.1.0 → coderay-1.2.0}/src/coderay/embedding/README.md +0 -0
  116. {coderay-1.1.0 → coderay-1.2.0}/src/coderay/embedding/__init__.py +0 -0
  117. {coderay-1.1.0 → coderay-1.2.0}/src/coderay/embedding/backend_resolve.py +0 -0
  118. {coderay-1.1.0 → coderay-1.2.0}/src/coderay/embedding/format.py +0 -0
  119. {coderay-1.1.0 → coderay-1.2.0}/src/coderay/embedding/prefixes.py +0 -0
  120. {coderay-1.1.0 → coderay-1.2.0}/src/coderay/graph/passes/__init__.py +0 -0
  121. /coderay-1.1.0/src/coderay/graph/plugins/python/passes.py → /coderay-1.2.0/src/coderay/graph/passes/python.py +0 -0
  122. {coderay-1.1.0 → coderay-1.2.0}/src/coderay/mcp_server/README.md +0 -0
  123. {coderay-1.1.0 → coderay-1.2.0}/src/coderay/mcp_server/__init__.py +0 -0
  124. {coderay-1.1.0 → coderay-1.2.0}/src/coderay/mcp_server/errors.py +0 -0
  125. {coderay-1.1.0 → coderay-1.2.0}/src/coderay/parsing/cst_kind.py +0 -0
  126. {coderay-1.1.0 → coderay-1.2.0}/src/coderay/pipeline/README.md +0 -0
  127. {coderay-1.1.0 → coderay-1.2.0}/src/coderay/pipeline/__init__.py +0 -0
  128. {coderay-1.1.0 → coderay-1.2.0}/src/coderay/pipeline/watcher.py +0 -0
  129. {coderay-1.1.0 → coderay-1.2.0}/src/coderay/retrieval/README.md +0 -0
  130. {coderay-1.1.0 → coderay-1.2.0}/src/coderay/retrieval/__init__.py +0 -0
  131. {coderay-1.1.0 → coderay-1.2.0}/src/coderay/retrieval/boosting.py +0 -0
  132. {coderay-1.1.0 → coderay-1.2.0}/src/coderay/retrieval/models.py +0 -0
  133. {coderay-1.1.0 → coderay-1.2.0}/src/coderay/retrieval/search.py +0 -0
  134. {coderay-1.1.0 → coderay-1.2.0}/src/coderay/skeleton/__init__.py +0 -0
  135. {coderay-1.1.0 → coderay-1.2.0}/src/coderay/state/README.md +0 -0
  136. {coderay-1.1.0 → coderay-1.2.0}/src/coderay/state/__init__.py +0 -0
  137. {coderay-1.1.0 → coderay-1.2.0}/src/coderay/state/machine.py +0 -0
  138. {coderay-1.1.0 → coderay-1.2.0}/src/coderay/state/version.py +0 -0
  139. {coderay-1.1.0 → coderay-1.2.0}/src/coderay/storage/README.md +0 -0
  140. {coderay-1.1.0 → coderay-1.2.0}/src/coderay/storage/__init__.py +0 -0
  141. {coderay-1.1.0 → coderay-1.2.0}/src/coderay/storage/lancedb.py +0 -0
  142. {coderay-1.1.0 → coderay-1.2.0}/src/coderay/vcs/README.md +0 -0
  143. {coderay-1.1.0 → coderay-1.2.0}/src/coderay/vcs/__init__.py +0 -0
  144. {coderay-1.1.0 → coderay-1.2.0}/src/coderay.egg-info/dependency_links.txt +0 -0
  145. {coderay-1.1.0 → coderay-1.2.0}/src/coderay.egg-info/entry_points.txt +0 -0
  146. {coderay-1.1.0 → coderay-1.2.0}/src/coderay.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: coderay
3
- Version: 1.1.0
3
+ Version: 1.2.0
4
4
  Summary: X-ray your codebase — semantic search, code graphs, file skeletons, and MCP server
5
5
  Author-email: Bogdan Copocean <bogdancopocean@gmail.com>
6
6
  License-Expression: MIT
@@ -41,6 +41,7 @@ Requires-Dist: pytest>=7.0; extra == "dev"
41
41
  Requires-Dist: pytest-cov>=4.0; extra == "dev"
42
42
  Requires-Dist: ruff>=0.8.0; extra == "dev"
43
43
  Requires-Dist: mypy>=1.0.0; extra == "dev"
44
+ Requires-Dist: tiktoken>=0.5.0; extra == "dev"
44
45
  Provides-Extra: maintain
45
46
  Requires-Dist: pylance>=0.15.0; extra == "maintain"
46
47
  Provides-Extra: mlx
@@ -56,38 +57,71 @@ Dynamic: license-file
56
57
  [![License](https://img.shields.io/github/license/bogdan-copocean/coderay)](LICENSE)
57
58
  [![CI](https://github.com/bogdan-copocean/coderay/actions/workflows/ci.yml/badge.svg)](https://github.com/bogdan-copocean/coderay/actions/workflows/ci.yml)
58
59
 
59
- **CodeRay** ships a **local code index** with **semantic search**, **file skeletons** (signatures and docstrings, no bodies), and **blast radius** (callers, imports, inheritance) plus an **MCP stdio server** so agents can use the same tools. Ask *by meaning*, skim **API shape**, trace **who calls what**, then read implementation when it matters: fewer tokens, less noise, answers anchored to the right files.
60
+ **CodeRay** builds a **local code index** that gives AI agents a smarter way to explore a codebase — reading only what they need, not whole files.
60
61
 
61
- **No LLM inside CodeRay, no network, no API key – it runs on your machine.**
62
+ **Runs locally. No LLM. No network. No API key.**
62
63
 
64
+ ## The problem
63
65
 
64
- ## Tools
66
+ AI agents exploring a codebase default to reading whole files – even when one function is all that's needed. Every unnecessary line **burns tokens** and **floods the context window**: driving up API costs and noise with every read.
67
+
68
+ The root cause is simple: agents know the **file paths** but no finer location. Without knowing *where* in a file something lives, they have no choice but to read everything.
69
+
70
+ **CodeRay fixes this.** Every tool returns **file paths with exact line ranges** — so agents locate first, then read only the lines that matter.
71
+
72
+ ## How it works
73
+
74
+ CodeRay exposes three primitives, each returning **paths + line ranges**:
75
+
76
+ | Tool | Question it answers | What agents get |
77
+ |------|---------------------|-----------------|
78
+ | **search** | *Where is the code that does X?* | Relevant chunks with file paths and line ranges |
79
+ | **skeleton** | *What's the shape of this file?* | Signatures + docstrings only, each tagged with its line range |
80
+ | **impact** | *What breaks if I change this?* | Callers, imports, and inheritors — located by line range |
81
+
82
+ ### The two-phase flow
65
83
 
66
- **CodeRay sits next to ripgrep, not instead of it.** Ripgrep when you know the string or symbol; search, skeleton, and impact when you care about *intent*, *structure*, or *dependencies*—then open the file when you need real implementation detail.
84
+ 1. **Locate** — run `search`, `skeleton`, or `impact` to find what's needed. Every result includes a file path and a symbol-level line range.
85
+ 2. **Read precisely** — use those line ranges to load only the relevant snippet. Skip the rest.
67
86
 
68
- Semantic search is retrieval, not proof: hits can miss or rank oddly. Treat them as candidates, confirm with a skeleton or read, and keep the index fresh with `coderay watch` or `coderay build` when things drift.
87
+ This keeps context windows lean and agent reasoning focused. CodeRay is not a replacement for `grep` — it fills the gap when exact names are unknown or a map is needed before reading.
88
+
89
+ ### Token savings (tiktoken, `cl100k_base`)
90
+
91
+ | File | Lines | Full read | Skeleton | Savings | % reduction |
92
+ |------|-------|-----------|----------|---------|-------------|
93
+ | `src/coderay/graph/impact.py` | 249 | 2,333 | 693 | **3.4×** | **70%** |
94
+ | `src/coderay/cli/commands.py` | 584 | 4,327 | 1,906 | **2.3×** | **56%** |
95
+ | `src/coderay/pipeline/indexer.py` | 408 | 3,065 | 1,433 | **2.1×** | **53%** |
96
+
97
+ | Query | Search hit tokens | vs full `indexer.py` read |
98
+ |-------|-------------------|---------------------------|
99
+ | "how are files re-indexed on change" | 479 | **~6x cheaper** |
100
+
101
+
102
+ ## Tools
69
103
 
70
- Skeleton shows API shape and docstrings, not every branch. Use **search** and **impact** to narrow where to look, then read the file (or spans) when you need control flow or line-accurate edits. CodeRay trims noise on those round trips; it does not forbid them.
104
+ ### Semantic search
71
105
 
72
- **Semantic search** — “How/where” by meaning.
106
+ Agents search by **meaning**, not by name — useful when the exact function or class is unknown. Results return **file paths with line ranges** pointing at relevant chunks. Treat them as candidates: confirm with `skeleton` or a ranged read before acting. Keep the index fresh with `coderay watch` or `coderay build` when the tree drifts.
73
107
 
74
108
  <img src="assets/coderay-search.gif" alt="coderay search demo" width="100%" />
75
109
 
76
110
  ### Blast radius
77
111
 
78
- Callers and dependents (calls, imports, inheritance).
112
+ Shows **callers, imports, and inheritance** for a symbol before it changes. Each result is tied to a file path and line range — combine with `skeleton` or ranged reads on those locations when bodies are needed.
79
113
 
80
114
  <img src="assets/coderay-impact.gif" alt="coderay impact demo" width="100%" />
81
115
 
82
116
  ### Skeleton
83
117
 
84
- Signatures and docstrings only; API surface without bodies.
118
+ Returns **signatures and docstrings only** — no function bodies. Every block is tagged with its path and line range so subsequent reads can be scoped to exactly those lines. A full file read should happen only when the skeleton isn't enough.
85
119
 
86
120
  <img src="assets/coderay-skeleton.gif" alt="coderay skeleton demo" width="100%" />
87
121
 
88
122
  ### Full read
89
123
 
90
- Same file as skeleton: raw source costs more tokens.
124
+ **Same file, raw source for comparison:**
91
125
 
92
126
  <img src="assets/coderay-fullread.gif" alt="same file, raw source head" width="100%" />
93
127
 
@@ -102,7 +136,7 @@ Same file as skeleton: raw source costs more tokens.
102
136
 
103
137
  ## MCP
104
138
 
105
- Same tools as above, exposed to the agent so it can search, sketch structure, and trace impact instead of vacuuming whole files by default. Point the server at a checkout whose root contains `.coderay.toml` (`CODERAY_REPO_ROOT` below). For choosing tools versus a plain read, see [AGENTS.md](AGENTS.md).
139
+ Same three tools over MCP: search, skeleton (paths and line ranges), and impact—so **AI agents** can **narrow context** before full-file reads. Point the server at a checkout whose root contains `.coderay.toml` (`CODERAY_REPO_ROOT` below). For tool choice versus a plain read, see [AGENTS.md](AGENTS.md).
106
140
 
107
141
  ```bash
108
142
  which coderay-mcp
@@ -123,32 +157,6 @@ which coderay-mcp
123
157
  `CODERAY_REPO_ROOT` must be the directory that contains `.coderay.toml`. More detail: [`mcp_server/README.md`](src/coderay/mcp_server/README.md).
124
158
 
125
159
 
126
- ## Why this matters
127
-
128
- Noisy context windows make models confident about the wrong code. CodeRay front-loads **intent** (search), **shape** (skeleton), and **dependencies** (impact) so the expensive read happens after you have a map—not instead of ever reading implementation when control flow matters.
129
-
130
- ### Token savings (tiktoken, `cl100k_base`)
131
-
132
- Measured on this repo after a full index.
133
-
134
-
135
- | File | Lines | Full read | Skeleton | Savings |
136
- | ---------------------------------- | ----- | --------- | -------- | -------- |
137
- | `src/coderay/pipeline/indexer.py` | 400 | 3,024 | 757 | **4.0x** |
138
- | `src/coderay/graph/code_graph.py` | 500 | 4,261 | 1,022 | **4.2x** |
139
- | `src/coderay/mcp_server/server.py` | 316 | 2,268 | 1,313 | **1.7x** |
140
-
141
-
142
-
143
- | Query | Search hit tokens | vs full `indexer.py` read |
144
- | ------------------------------------ | ----------------- | ------------------------- |
145
- | "how are files re-indexed on change" | 479 | **~6x cheaper** |
146
-
147
-
148
- *Not guarantees — model, chunks, and files affect counts.*
149
-
150
- ---
151
-
152
160
  ## Features
153
161
 
154
162
  - **Languages** — Python, JavaScript, and TypeScript — [`parsing/README.md`](src/coderay/parsing/README.md)
@@ -157,7 +165,6 @@ Measured on this repo after a full index.
157
165
  - **Embeddings** — fastembed (CPU) or MLX on Apple Silicon — [`embedding/README.md`](src/coderay/embedding/README.md)
158
166
  - **Watch** — incremental re-index; `.coderay.toml` is the source of truth for what’s indexed
159
167
 
160
- ---
161
168
 
162
169
  ## Install
163
170
 
@@ -4,38 +4,71 @@
4
4
  [![License](https://img.shields.io/github/license/bogdan-copocean/coderay)](LICENSE)
5
5
  [![CI](https://github.com/bogdan-copocean/coderay/actions/workflows/ci.yml/badge.svg)](https://github.com/bogdan-copocean/coderay/actions/workflows/ci.yml)
6
6
 
7
- **CodeRay** ships a **local code index** with **semantic search**, **file skeletons** (signatures and docstrings, no bodies), and **blast radius** (callers, imports, inheritance) plus an **MCP stdio server** so agents can use the same tools. Ask *by meaning*, skim **API shape**, trace **who calls what**, then read implementation when it matters: fewer tokens, less noise, answers anchored to the right files.
7
+ **CodeRay** builds a **local code index** that gives AI agents a smarter way to explore a codebase — reading only what they need, not whole files.
8
8
 
9
- **No LLM inside CodeRay, no network, no API key – it runs on your machine.**
9
+ **Runs locally. No LLM. No network. No API key.**
10
10
 
11
+ ## The problem
11
12
 
12
- ## Tools
13
+ AI agents exploring a codebase default to reading whole files – even when one function is all that's needed. Every unnecessary line **burns tokens** and **floods the context window**: driving up API costs and noise with every read.
14
+
15
+ The root cause is simple: agents know the **file paths** but no finer location. Without knowing *where* in a file something lives, they have no choice but to read everything.
16
+
17
+ **CodeRay fixes this.** Every tool returns **file paths with exact line ranges** — so agents locate first, then read only the lines that matter.
18
+
19
+ ## How it works
20
+
21
+ CodeRay exposes three primitives, each returning **paths + line ranges**:
22
+
23
+ | Tool | Question it answers | What agents get |
24
+ |------|---------------------|-----------------|
25
+ | **search** | *Where is the code that does X?* | Relevant chunks with file paths and line ranges |
26
+ | **skeleton** | *What's the shape of this file?* | Signatures + docstrings only, each tagged with its line range |
27
+ | **impact** | *What breaks if I change this?* | Callers, imports, and inheritors — located by line range |
28
+
29
+ ### The two-phase flow
13
30
 
14
- **CodeRay sits next to ripgrep, not instead of it.** Ripgrep when you know the string or symbol; search, skeleton, and impact when you care about *intent*, *structure*, or *dependencies*—then open the file when you need real implementation detail.
31
+ 1. **Locate** — run `search`, `skeleton`, or `impact` to find what's needed. Every result includes a file path and a symbol-level line range.
32
+ 2. **Read precisely** — use those line ranges to load only the relevant snippet. Skip the rest.
15
33
 
16
- Semantic search is retrieval, not proof: hits can miss or rank oddly. Treat them as candidates, confirm with a skeleton or read, and keep the index fresh with `coderay watch` or `coderay build` when things drift.
34
+ This keeps context windows lean and agent reasoning focused. CodeRay is not a replacement for `grep` — it fills the gap when exact names are unknown or a map is needed before reading.
35
+
36
+ ### Token savings (tiktoken, `cl100k_base`)
37
+
38
+ | File | Lines | Full read | Skeleton | Savings | % reduction |
39
+ |------|-------|-----------|----------|---------|-------------|
40
+ | `src/coderay/graph/impact.py` | 249 | 2,333 | 693 | **3.4×** | **70%** |
41
+ | `src/coderay/cli/commands.py` | 584 | 4,327 | 1,906 | **2.3×** | **56%** |
42
+ | `src/coderay/pipeline/indexer.py` | 408 | 3,065 | 1,433 | **2.1×** | **53%** |
43
+
44
+ | Query | Search hit tokens | vs full `indexer.py` read |
45
+ |-------|-------------------|---------------------------|
46
+ | "how are files re-indexed on change" | 479 | **~6x cheaper** |
47
+
48
+
49
+ ## Tools
17
50
 
18
- Skeleton shows API shape and docstrings, not every branch. Use **search** and **impact** to narrow where to look, then read the file (or spans) when you need control flow or line-accurate edits. CodeRay trims noise on those round trips; it does not forbid them.
51
+ ### Semantic search
19
52
 
20
- **Semantic search** — “How/where” by meaning.
53
+ Agents search by **meaning**, not by name — useful when the exact function or class is unknown. Results return **file paths with line ranges** pointing at relevant chunks. Treat them as candidates: confirm with `skeleton` or a ranged read before acting. Keep the index fresh with `coderay watch` or `coderay build` when the tree drifts.
21
54
 
22
55
  <img src="assets/coderay-search.gif" alt="coderay search demo" width="100%" />
23
56
 
24
57
  ### Blast radius
25
58
 
26
- Callers and dependents (calls, imports, inheritance).
59
+ Shows **callers, imports, and inheritance** for a symbol before it changes. Each result is tied to a file path and line range — combine with `skeleton` or ranged reads on those locations when bodies are needed.
27
60
 
28
61
  <img src="assets/coderay-impact.gif" alt="coderay impact demo" width="100%" />
29
62
 
30
63
  ### Skeleton
31
64
 
32
- Signatures and docstrings only; API surface without bodies.
65
+ Returns **signatures and docstrings only** — no function bodies. Every block is tagged with its path and line range so subsequent reads can be scoped to exactly those lines. A full file read should happen only when the skeleton isn't enough.
33
66
 
34
67
  <img src="assets/coderay-skeleton.gif" alt="coderay skeleton demo" width="100%" />
35
68
 
36
69
  ### Full read
37
70
 
38
- Same file as skeleton: raw source costs more tokens.
71
+ **Same file, raw source for comparison:**
39
72
 
40
73
  <img src="assets/coderay-fullread.gif" alt="same file, raw source head" width="100%" />
41
74
 
@@ -50,7 +83,7 @@ Same file as skeleton: raw source costs more tokens.
50
83
 
51
84
  ## MCP
52
85
 
53
- Same tools as above, exposed to the agent so it can search, sketch structure, and trace impact instead of vacuuming whole files by default. Point the server at a checkout whose root contains `.coderay.toml` (`CODERAY_REPO_ROOT` below). For choosing tools versus a plain read, see [AGENTS.md](AGENTS.md).
86
+ Same three tools over MCP: search, skeleton (paths and line ranges), and impact—so **AI agents** can **narrow context** before full-file reads. Point the server at a checkout whose root contains `.coderay.toml` (`CODERAY_REPO_ROOT` below). For tool choice versus a plain read, see [AGENTS.md](AGENTS.md).
54
87
 
55
88
  ```bash
56
89
  which coderay-mcp
@@ -71,32 +104,6 @@ which coderay-mcp
71
104
  `CODERAY_REPO_ROOT` must be the directory that contains `.coderay.toml`. More detail: [`mcp_server/README.md`](src/coderay/mcp_server/README.md).
72
105
 
73
106
 
74
- ## Why this matters
75
-
76
- Noisy context windows make models confident about the wrong code. CodeRay front-loads **intent** (search), **shape** (skeleton), and **dependencies** (impact) so the expensive read happens after you have a map—not instead of ever reading implementation when control flow matters.
77
-
78
- ### Token savings (tiktoken, `cl100k_base`)
79
-
80
- Measured on this repo after a full index.
81
-
82
-
83
- | File | Lines | Full read | Skeleton | Savings |
84
- | ---------------------------------- | ----- | --------- | -------- | -------- |
85
- | `src/coderay/pipeline/indexer.py` | 400 | 3,024 | 757 | **4.0x** |
86
- | `src/coderay/graph/code_graph.py` | 500 | 4,261 | 1,022 | **4.2x** |
87
- | `src/coderay/mcp_server/server.py` | 316 | 2,268 | 1,313 | **1.7x** |
88
-
89
-
90
-
91
- | Query | Search hit tokens | vs full `indexer.py` read |
92
- | ------------------------------------ | ----------------- | ------------------------- |
93
- | "how are files re-indexed on change" | 479 | **~6x cheaper** |
94
-
95
-
96
- *Not guarantees — model, chunks, and files affect counts.*
97
-
98
- ---
99
-
100
107
  ## Features
101
108
 
102
109
  - **Languages** — Python, JavaScript, and TypeScript — [`parsing/README.md`](src/coderay/parsing/README.md)
@@ -105,7 +112,6 @@ Measured on this repo after a full index.
105
112
  - **Embeddings** — fastembed (CPU) or MLX on Apple Silicon — [`embedding/README.md`](src/coderay/embedding/README.md)
106
113
  - **Watch** — incremental re-index; `.coderay.toml` is the source of truth for what’s indexed
107
114
 
108
- ---
109
115
 
110
116
  ## Install
111
117
 
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "coderay"
7
- version = "1.1.0"
7
+ version = "1.2.0"
8
8
  description = "X-ray your codebase — semantic search, code graphs, file skeletons, and MCP server"
9
9
  readme = "README.md"
10
10
  license = "MIT"
@@ -53,6 +53,7 @@ dev = [
53
53
  "pytest-cov>=4.0",
54
54
  "ruff>=0.8.0",
55
55
  "mypy>=1.0.0",
56
+ "tiktoken>=0.5.0",
56
57
  ]
57
58
  maintain = [
58
59
  "pylance>=0.15.0",
@@ -0,0 +1 @@
1
+ __version__ = "1.2.0"
@@ -4,6 +4,7 @@ import logging
4
4
  import os
5
5
  import sys
6
6
  import time
7
+ import warnings
7
8
  from pathlib import Path
8
9
 
9
10
  import click
@@ -51,6 +52,12 @@ def _setup_logging(verbose: bool = False) -> None:
51
52
  ):
52
53
  logging.getLogger(name).setLevel(logging.WARNING)
53
54
  os.environ.setdefault("HF_HUB_DISABLE_PROGRESS_BARS", "1")
55
+ warnings.filterwarnings(
56
+ "ignore",
57
+ message="Cannot enable progress bars: environment variable",
58
+ category=UserWarning,
59
+ module="huggingface_hub.utils.tqdm",
60
+ )
54
61
 
55
62
 
56
63
  def _set_repo_root(repo_root: Path) -> None:
@@ -333,7 +340,7 @@ def maintain(ctx: click.Context) -> None:
333
340
 
334
341
 
335
342
  @cli.command()
336
- @click.argument("file_path", type=click.Path(exists=True, path_type=Path))
343
+ @click.argument("file_path", type=str)
337
344
  @click.option(
338
345
  "--include-imports",
339
346
  is_flag=True,
@@ -346,17 +353,53 @@ def maintain(ctx: click.Context) -> None:
346
353
  default=None,
347
354
  help="Filter to a specific class or top-level function by name.",
348
355
  )
356
+ @click.option(
357
+ "--lines",
358
+ "line_range",
359
+ default=None,
360
+ metavar="START-END",
361
+ help=(
362
+ "File line range (1-based inclusive); keep only symbols fully within this span."
363
+ " Do not combine with a :START-END suffix on FILE_PATH (same meaning)."
364
+ ),
365
+ )
349
366
  def skeleton(
350
- file_path: Path,
367
+ file_path: str,
351
368
  include_imports: bool,
352
369
  symbol: str | None,
370
+ line_range: str | None,
353
371
  ) -> None:
354
372
  """Print signatures without bodies (cheaper than reading the full file)."""
355
373
  from coderay.skeleton.extractor import extract_skeleton
374
+ from coderay.skeleton.path_range import (
375
+ parse_file_line_range,
376
+ parse_skeleton_file_arg,
377
+ )
356
378
 
357
- content = file_path.read_text(encoding="utf-8", errors="replace")
379
+ try:
380
+ path_str, rng_from_path = parse_skeleton_file_arg(file_path, parse_suffix=True)
381
+ except ValueError as e:
382
+ raise click.BadParameter(str(e)) from e
383
+ file_line_range = rng_from_path
384
+ if line_range:
385
+ if file_line_range is not None:
386
+ raise click.UsageError(
387
+ "Use either a path ending with :START-END or --lines, not both."
388
+ )
389
+ try:
390
+ file_line_range = parse_file_line_range(line_range)
391
+ except ValueError as e:
392
+ raise click.BadParameter(str(e), param_hint="--lines") from e
393
+ resolved = Path(path_str)
394
+ if not resolved.is_file():
395
+ raise click.BadParameter(f"not a file: {path_str}", param_hint="file_path")
396
+ content = resolved.read_text(encoding="utf-8", errors="replace")
358
397
  out = extract_skeleton(
359
- file_path, content, include_imports=include_imports, symbol=symbol
398
+ resolved,
399
+ content,
400
+ include_imports=include_imports,
401
+ symbol=symbol,
402
+ line_range=file_line_range,
360
403
  )
361
404
  click.echo(out)
362
405
 
@@ -30,7 +30,7 @@ def timed(phase: str) -> Callable[[F], F]:
30
30
 
31
31
 
32
32
  class TimedPhase:
33
- """Context manager: measure block execution time."""
33
+ """Context manager: measure block execution time; log completion at DEBUG."""
34
34
 
35
35
  def __init__(self, phase: str, *, log: bool = True) -> None:
36
36
  self.phase = phase
@@ -42,10 +42,15 @@ class TimedPhase:
42
42
  self.t0 = time.perf_counter()
43
43
  return self
44
44
 
45
+ def elapsed_so_far(self) -> float:
46
+ """Return seconds since __enter__ (before __exit__)."""
47
+
48
+ return time.perf_counter() - self.t0
49
+
45
50
  def __exit__(self, *args: object) -> None:
46
51
  self.elapsed = time.perf_counter() - self.t0
47
52
  if self.log:
48
- logger.info("%s: %.3fs", self.phase, self.elapsed)
53
+ logger.debug("%s: %.3fs", self.phase, self.elapsed)
49
54
 
50
55
 
51
56
  timed_phase = TimedPhase # Convenience alias for context manager usage
@@ -48,13 +48,13 @@ def load_embedder_from_config() -> Embedder:
48
48
  config = get_config()
49
49
  ed = config.embedder
50
50
  backend = resolved_embedder_backend(ed.backend)
51
- if (ed.backend or "auto").strip().lower() == "auto":
52
- logger.info("embedder.backend=auto -> %s", backend)
53
51
  if backend == "mlx" and not mlx_optional_installed():
54
52
  raise RuntimeError(
55
53
  "embedder.backend is 'mlx' but MLX is not installed. "
56
54
  "On Apple Silicon: pip install 'coderay[mlx]'"
57
55
  )
56
+ model_name = ed.mlx.model_name if backend == "mlx" else ed.fastembed.model_name
57
+ logger.info("embedder.backend=%s model=%s", backend, model_name)
58
58
  if backend == "mlx":
59
59
  mx = ed.mlx
60
60
  return MLXEmbedder(
@@ -5,6 +5,7 @@ from typing import Any
5
5
 
6
6
  from onnxruntime.capi.onnxruntime_pybind11_state import NoSuchFile
7
7
 
8
+ from coderay.core.timing import timed_phase
8
9
  from coderay.embedding.base import Embedder, EmbedTask
9
10
  from coderay.embedding.prefixes import SEARCH_PREFIXES, requires_prefix
10
11
 
@@ -38,15 +39,14 @@ class LocalEmbedder(Embedder):
38
39
  return TextEmbedding(model_name=name, local_files_only=local_only)
39
40
 
40
41
  try:
41
- logger.info("Loading model %s from cache...", self._model_name)
42
42
  self._model = _open(name=self._model_name, local_only=True)
43
- logger.info("Model %s loaded from cache.", self._model_name)
44
43
  except (NoSuchFile, ValueError) as e:
45
44
  if isinstance(e, ValueError) and "Could not load model" not in str(e):
46
45
  raise
47
- logger.info("Downloading model %s (one-time)...", self._model_name)
48
46
  self._model = _open(name=self._model_name, local_only=False)
49
- logger.info("Model %s downloaded and ready.", self._model_name)
47
+ logger.info("Model %s ready (downloaded).", self._model_name)
48
+ else:
49
+ logger.info("Model %s ready (cache).", self._model_name)
50
50
 
51
51
  def _apply_prefix(self, texts: list[str], task: EmbedTask) -> list[str]:
52
52
  if not requires_prefix(self._model_name):
@@ -66,9 +66,21 @@ class LocalEmbedder(Embedder):
66
66
  self._load_model()
67
67
 
68
68
  prefixed = self._apply_prefix(texts, task)
69
-
70
- logger.info("Embedding %d chunks (task=%s)...", len(prefixed), task.value)
71
- embeddings = list(self._model.embed(prefixed, batch_size=self._batch_size))
69
+ n = len(prefixed)
70
+ logger.info("Embedding %d chunks (task=%s)...", n, task.value)
71
+ raw: list[Any] = []
72
+ bs = self._batch_size
73
+ with timed_phase("embedding", log=False) as tp:
74
+ for i in range(0, n, bs):
75
+ sub = prefixed[i : i + bs]
76
+ part = list(self._model.embed(sub, batch_size=self._batch_size))
77
+ raw.extend(part)
78
+ logger.info("Embedded %d/%d chunks", min(i + len(sub), n), n)
79
+ logger.info(
80
+ "Embedding complete: %d chunks in %.2fs",
81
+ n,
82
+ tp.elapsed,
83
+ )
72
84
  if self._matryoshka_dimensions is not None:
73
- return [e.tolist()[: self._matryoshka_dimensions] for e in embeddings]
74
- return [e.tolist() for e in embeddings]
85
+ return [e.tolist()[: self._matryoshka_dimensions] for e in raw]
86
+ return [e.tolist() for e in raw]
@@ -1,5 +1,6 @@
1
1
  import logging
2
2
 
3
+ from coderay.core.timing import timed_phase
3
4
  from coderay.embedding.base import Embedder, EmbedTask
4
5
  from coderay.embedding.prefixes import SEARCH_PREFIXES, requires_prefix
5
6
 
@@ -44,6 +45,7 @@ class MLXEmbedder(Embedder):
44
45
  prefix = SEARCH_PREFIXES.get(task, "")
45
46
  texts = [prefix + t for t in texts] if prefix else texts
46
47
 
48
+ logger.info("Embedding %d chunks (task=%s)...", len(texts), task.value)
47
49
  return self._embed_batched(texts)
48
50
 
49
51
  def _ensure_loaded(self) -> None:
@@ -53,20 +55,13 @@ class MLXEmbedder(Embedder):
53
55
  from mlx_embeddings import load
54
56
 
55
57
  cached = self._is_cached()
56
- if cached:
57
- logger.info(
58
- "Loading model %s from cache (%s)...",
59
- self._model_name,
60
- mx.default_device(),
61
- )
62
- else:
63
- logger.info(
64
- "Downloading model %s (one-time, %s)...",
65
- self._model_name,
66
- mx.default_device(),
67
- )
68
58
  self._model, self._tokenizer = load(self._model_name)
69
- logger.info("Model %s ready.", self._model_name)
59
+ logger.info(
60
+ "Model %s ready (%s, %s).",
61
+ self._model_name,
62
+ "cache" if cached else "downloaded",
63
+ mx.default_device(),
64
+ )
70
65
 
71
66
  def _is_cached(self) -> bool:
72
67
  """Check if model exists in huggingface cache."""
@@ -83,12 +78,18 @@ class MLXEmbedder(Embedder):
83
78
  n = len(texts)
84
79
  bs = self._batch_size
85
80
 
86
- for i in range(0, n, bs):
87
- batch = texts[i : i + bs]
88
- arr = self._embed_single_batch(batch)
89
- out.extend(arr.tolist())
90
- logger.info("MLX embedded %d/%d", min(i + bs, n), n)
91
-
81
+ with timed_phase("embedding", log=False) as tp:
82
+ for i in range(0, n, bs):
83
+ batch = texts[i : i + bs]
84
+ arr = self._embed_single_batch(batch)
85
+ out.extend(arr.tolist())
86
+ logger.info("Embedded %d/%d chunks", min(i + bs, n), n)
87
+
88
+ logger.info(
89
+ "Embedding complete: %d chunks in %.2fs",
90
+ n,
91
+ tp.elapsed,
92
+ )
92
93
  return out
93
94
 
94
95
  def _embed_single_batch(self, batch: list[str]):
@@ -0,0 +1,45 @@
1
+ # graph
2
+
3
+ A directed graph of **call**, **import**, and **inheritance** edges over indexed source. The implementation is laid out as extractors, lowering, merge, and post-merge passes in this package; this file describes **behavior**, not file names.
4
+
5
+ ## Pipeline (conceptual)
6
+
7
+ Per file: CST → **facts** (definitions, calls, imports, inherits) → **materialise** into `GraphNode` / `GraphEdge`. Multi-file **merge** builds one `CodeGraph`. **Post-merge** runs language passes and global rewrites (e.g. resolving bare-name call targets when unambiguous repo-wide).
8
+
9
+ Cross-file lowering uses a **module index** (dotted name → file path) so imports and qualified names can become `file_path::symbol` targets. Edges may point at **phantom** strings (unresolved callee) until passes or later tooling refine them.
10
+
11
+ ## Targets and phantoms
12
+
13
+ Call/import/inherit **targets are strings**: resolved node ids (`file::qual`), module-style refs (`pkg.mod.sym`), or **phantoms** (short names, unknowns). Heuristics classify targets for filtering and UX; **materialise** can emit edges whose endpoints are not yet graph nodes.
14
+
15
+ **`include_external`** (config) drops edges whose targets are not considered “in repo” for the current index.
16
+
17
+ ## Symbol resolution (`CodeGraph`)
18
+
19
+ Indexes map **short names** and **qualified names** back to node ids. **Unique** short name → one id; **ambiguous** → `resolve_symbol` returns `None` (callers must use full id or disambiguate).
20
+
21
+ ## Impact radius (`impact.py`)
22
+
23
+ **Reverse** traversal from a symbol: which symbols **call**, **import**, or **inherit from** it, up to a **depth** limit. Not every edge kind is impact-relevant; module nodes are filtered when the same file is already represented by concrete symbols.
24
+
25
+ **Resolution layers:** exact id → optional **fuzzy** match by trailing name within a file → hints when ambiguous or empty results. **Seeds** for a method can include the **parent class’s** same-named method when inheritance is present, so callers of the base implementation count toward impact on overrides. **Phantom aliases** (same symbol under different string ids) are considered so edges from re-exports or legacy shapes are not missed.
26
+
27
+ **Limitations:** static graph only—dynamic dispatch, reflection, and cross-repo callers are not modeled; hints may suggest grep when imports exist but call edges could not be resolved.
28
+
29
+ ## Callee lowering (`CalleeResolver`)
30
+
31
+ Raw callee text from the tree (e.g. `self.m`, `super().x`, `a.b`) is combined with **per-file bindings** (imports, instance typing, scopes) to produce target strings. Order matters: **super** / **self** handling runs before generic **simple** and **dotted-chain** resolution. Behavior is shared across languages where configs align (`self`/`super` prefixes); edge cases differ by language grammar and binding richness.
32
+
33
+ ## Known limitations (general)
34
+
35
+ - **Soundness:** graph is **heuristic**, not a type system; wrong or missing edges are expected under metaprogramming, conditional imports, and incomplete index scope.
36
+ - **Staleness:** graph reflects last build; **watch** / rebuild needed after large refactors.
37
+ - **Language coverage:** depth varies by language (Python/JS/TS today); new languages plug in via the same fact/materialise/merge shape but need their own extractors and tests.
38
+
39
+ ## Tests
40
+
41
+ [`tests/unit/graph/`](../../../tests/unit/graph/) (invariants, extractors, resolver), [`tests/regression/graph/`](../../../tests/regression/graph/) (multi-file fixtures).
42
+
43
+ ## Persistence
44
+
45
+ `graph.json` under the index directory; **`schema_version`** supports loading older serialised shapes when bumped.
@@ -6,14 +6,15 @@ from coderay.graph.builder import (
6
6
  save_graph,
7
7
  )
8
8
  from coderay.graph.code_graph import CodeGraph
9
- from coderay.graph.extractor import extract_graph_from_file
9
+ from coderay.graph.graph_builder import GraphBuilder, build_project_index
10
10
 
11
11
  __all__ = [
12
12
  "GRAPH_FILENAME",
13
13
  "CodeGraph",
14
+ "GraphBuilder",
15
+ "build_project_index",
14
16
  "build_and_save_graph",
15
17
  "build_graph",
16
- "extract_graph_from_file",
17
18
  "load_graph",
18
19
  "save_graph",
19
20
  ]