codespine 0.5.3__tar.gz → 0.5.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. codespine-0.5.4/PKG-INFO +270 -0
  2. codespine-0.5.4/README.md +206 -0
  3. {codespine-0.5.3 → codespine-0.5.4}/codespine/__init__.py +1 -1
  4. codespine-0.5.4/codespine/analysis/community.py +182 -0
  5. {codespine-0.5.3 → codespine-0.5.4}/codespine/analysis/deadcode.py +66 -0
  6. {codespine-0.5.3 → codespine-0.5.4}/codespine/cli.py +10 -6
  7. {codespine-0.5.3 → codespine-0.5.4}/codespine/db/store.py +54 -33
  8. codespine-0.5.4/codespine.egg-info/PKG-INFO +270 -0
  9. {codespine-0.5.3 → codespine-0.5.4}/codespine.egg-info/SOURCES.txt +2 -0
  10. {codespine-0.5.3 → codespine-0.5.4}/pyproject.toml +1 -1
  11. codespine-0.5.4/tests/test_community_detection.py +44 -0
  12. codespine-0.5.4/tests/test_deadcode.py +52 -0
  13. codespine-0.5.3/PKG-INFO +0 -333
  14. codespine-0.5.3/README.md +0 -269
  15. codespine-0.5.3/codespine/analysis/community.py +0 -88
  16. codespine-0.5.3/codespine.egg-info/PKG-INFO +0 -333
  17. {codespine-0.5.3 → codespine-0.5.4}/LICENSE +0 -0
  18. {codespine-0.5.3 → codespine-0.5.4}/codespine/analysis/__init__.py +0 -0
  19. {codespine-0.5.3 → codespine-0.5.4}/codespine/analysis/context.py +0 -0
  20. {codespine-0.5.3 → codespine-0.5.4}/codespine/analysis/coupling.py +0 -0
  21. {codespine-0.5.3 → codespine-0.5.4}/codespine/analysis/crossmodule.py +0 -0
  22. {codespine-0.5.3 → codespine-0.5.4}/codespine/analysis/flow.py +0 -0
  23. {codespine-0.5.3 → codespine-0.5.4}/codespine/analysis/impact.py +0 -0
  24. {codespine-0.5.3 → codespine-0.5.4}/codespine/config.py +0 -0
  25. {codespine-0.5.3 → codespine-0.5.4}/codespine/db/__init__.py +0 -0
  26. {codespine-0.5.3 → codespine-0.5.4}/codespine/db/schema.py +0 -0
  27. {codespine-0.5.3 → codespine-0.5.4}/codespine/diff/__init__.py +0 -0
  28. {codespine-0.5.3 → codespine-0.5.4}/codespine/diff/branch_diff.py +0 -0
  29. {codespine-0.5.3 → codespine-0.5.4}/codespine/indexer/__init__.py +0 -0
  30. {codespine-0.5.3 → codespine-0.5.4}/codespine/indexer/call_resolver.py +0 -0
  31. {codespine-0.5.3 → codespine-0.5.4}/codespine/indexer/engine.py +0 -0
  32. {codespine-0.5.3 → codespine-0.5.4}/codespine/indexer/java_parser.py +0 -0
  33. {codespine-0.5.3 → codespine-0.5.4}/codespine/indexer/symbol_builder.py +0 -0
  34. {codespine-0.5.3 → codespine-0.5.4}/codespine/mcp/__init__.py +0 -0
  35. {codespine-0.5.3 → codespine-0.5.4}/codespine/mcp/server.py +0 -0
  36. {codespine-0.5.3 → codespine-0.5.4}/codespine/noise/__init__.py +0 -0
  37. {codespine-0.5.3 → codespine-0.5.4}/codespine/noise/blocklist.py +0 -0
  38. {codespine-0.5.3 → codespine-0.5.4}/codespine/search/__init__.py +0 -0
  39. {codespine-0.5.3 → codespine-0.5.4}/codespine/search/bm25.py +0 -0
  40. {codespine-0.5.3 → codespine-0.5.4}/codespine/search/fuzzy.py +0 -0
  41. {codespine-0.5.3 → codespine-0.5.4}/codespine/search/hybrid.py +0 -0
  42. {codespine-0.5.3 → codespine-0.5.4}/codespine/search/rrf.py +0 -0
  43. {codespine-0.5.3 → codespine-0.5.4}/codespine/search/vector.py +0 -0
  44. {codespine-0.5.3 → codespine-0.5.4}/codespine/watch/__init__.py +0 -0
  45. {codespine-0.5.3 → codespine-0.5.4}/codespine/watch/watcher.py +0 -0
  46. {codespine-0.5.3 → codespine-0.5.4}/codespine.egg-info/dependency_links.txt +0 -0
  47. {codespine-0.5.3 → codespine-0.5.4}/codespine.egg-info/entry_points.txt +0 -0
  48. {codespine-0.5.3 → codespine-0.5.4}/codespine.egg-info/requires.txt +0 -0
  49. {codespine-0.5.3 → codespine-0.5.4}/codespine.egg-info/top_level.txt +0 -0
  50. {codespine-0.5.3 → codespine-0.5.4}/gindex.py +0 -0
  51. {codespine-0.5.3 → codespine-0.5.4}/setup.cfg +0 -0
  52. {codespine-0.5.3 → codespine-0.5.4}/tests/test_branch_diff_normalize.py +0 -0
  53. {codespine-0.5.3 → codespine-0.5.4}/tests/test_call_resolver.py +0 -0
  54. {codespine-0.5.3 → codespine-0.5.4}/tests/test_index_and_hybrid.py +0 -0
  55. {codespine-0.5.3 → codespine-0.5.4}/tests/test_java_parser.py +0 -0
  56. {codespine-0.5.3 → codespine-0.5.4}/tests/test_multimodule_index.py +0 -0
  57. {codespine-0.5.3 → codespine-0.5.4}/tests/test_search_ranking.py +0 -0
@@ -0,0 +1,270 @@
1
+ Metadata-Version: 2.4
2
+ Name: codespine
3
+ Version: 0.5.4
4
+ Summary: Local Java code intelligence indexer backed by a graph database
5
+ Author: CodeSpine contributors
6
+ License: MIT License
7
+
8
+ Copyright (c) 2026 CodeSpine contributors
9
+
10
+ Permission is hereby granted, free of charge, to any person obtaining a copy
11
+ of this software and associated documentation files (the "Software"), to deal
12
+ in the Software without restriction, including without limitation the rights
13
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14
+ copies of the Software, and to permit persons to whom the Software is
15
+ furnished to do so, subject to the following conditions:
16
+
17
+ The above copyright notice and this permission notice shall be included in all
18
+ copies or substantial portions of the Software.
19
+
20
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26
+ SOFTWARE.
27
+
28
+ Project-URL: Homepage, https://github.com/vinayak3022/codeSpine
29
+ Project-URL: Repository, https://github.com/vinayak3022/codeSpine
30
+ Project-URL: Issues, https://github.com/vinayak3022/codeSpine/issues
31
+ Keywords: java,code-indexing,graph,kuzu,mcp
32
+ Classifier: Development Status :: 3 - Alpha
33
+ Classifier: Intended Audience :: Developers
34
+ Classifier: License :: OSI Approved :: MIT License
35
+ Classifier: Programming Language :: Python :: 3
36
+ Classifier: Programming Language :: Python :: 3.10
37
+ Classifier: Programming Language :: Python :: 3.11
38
+ Classifier: Programming Language :: Python :: 3.12
39
+ Classifier: Programming Language :: Python :: 3.13
40
+ Classifier: Topic :: Software Development :: Libraries
41
+ Classifier: Topic :: Software Development :: Quality Assurance
42
+ Requires-Python: >=3.10
43
+ Description-Content-Type: text/markdown
44
+ License-File: LICENSE
45
+ Requires-Dist: click
46
+ Requires-Dist: kuzu
47
+ Requires-Dist: tree-sitter
48
+ Requires-Dist: tree-sitter-java
49
+ Requires-Dist: fastmcp>=2.3.0
50
+ Requires-Dist: psutil
51
+ Requires-Dist: watchfiles
52
+ Provides-Extra: ml
53
+ Requires-Dist: sentence-transformers; extra == "ml"
54
+ Requires-Dist: numpy; extra == "ml"
55
+ Provides-Extra: community
56
+ Requires-Dist: igraph; extra == "community"
57
+ Requires-Dist: leidenalg; extra == "community"
58
+ Provides-Extra: full
59
+ Requires-Dist: sentence-transformers; extra == "full"
60
+ Requires-Dist: numpy; extra == "full"
61
+ Requires-Dist: igraph; extra == "full"
62
+ Requires-Dist: leidenalg; extra == "full"
63
+ Dynamic: license-file
64
+
65
+ # CodeSpine
66
+
67
+ CodeSpine cuts token burn for coding agents working on Java codebases.
68
+
69
+ Instead of having an agent open dozens of `.java` files to answer one question, CodeSpine indexes the codebase once and serves the structure over MCP. The agent asks for symbols, callers, impact, flows, dead code, and module boundaries directly, which means fewer file reads, fewer wasted context windows, and fewer hallucinated code paths.
70
+
71
+ It indexes classes, methods, calls, type relationships, cross-module links, git coupling, dead-code candidates, and execution flows so agents can work from graph answers first and source files second.
72
+
73
+ ## Why It Saves Tokens
74
+
75
+ - One MCP call can replace many file opens. `get_symbol_context("PaymentService")` returns a resolved neighborhood instead of forcing the agent to read every caller and callee file manually.
76
+ - Search is structure-aware. Agents can ask for a symbol, concept, impact radius, or dead-code candidate without scanning entire packages.
77
+ - Multi-module repos stay scoped. Project-aware IDs and `project=` parameters reduce noise from unrelated modules and workspaces.
78
+ - Repeat sessions get cheaper. Once indexed, the agent reuses the graph instead of re-discovering the same relationships every turn.
79
+
80
+ ## Install
81
+
82
+ ```bash
83
+ pip install codespine
84
+ ```
85
+
86
+ Optional semantic search:
87
+
88
+ ```bash
89
+ pip install "codespine[ml]"
90
+ ```
91
+
92
+ ## What It Does
93
+
94
+ - Hybrid search: BM25 + fuzzy by default, semantic vector search with `--embed`
95
+ - Impact analysis: callers, dependencies, and confidence-scored edges
96
+ - Dead code detection: Java-aware exemptions for tests, framework hooks, contracts, and common DI patterns
97
+ - Execution flows: traces from entry points through the call graph
98
+ - Community detection: structural clusters for architectural context
99
+ - Change coupling: git-history-based file relationships
100
+ - Multi-project and multi-module indexing: workspaces, Maven modules, Gradle subprojects
101
+ - MCP server: structured tools for Claude, Cursor, Cline, Copilot, and similar clients
102
+
103
+ ## Quick Start
104
+
105
+ Index a repo:
106
+
107
+ ```bash
108
+ codespine analyse /path/to/project
109
+ ```
110
+
111
+ Run a deeper pass:
112
+
113
+ ```bash
114
+ codespine analyse /path/to/project --deep
115
+ ```
116
+
117
+ Add embeddings for semantic search:
118
+
119
+ ```bash
120
+ codespine analyse /path/to/project --embed
121
+ ```
122
+
123
+ Typical output:
124
+
125
+ ```text
126
+ $ codespine analyse .
127
+ Walking files... 142 files found
128
+ Index mode... incremental (8 files to index, 0 deleted)
129
+ Parsing code... 8/8
130
+ Tracing calls... 847 calls resolved
131
+ Analyzing types... 234 type relationships
132
+ Cross-module linking... skipped (single module)
133
+ Detecting communities... 8 clusters found
134
+ Detecting execution flows... 34 processes found
135
+ Finding dead code... 12 unreachable symbols
136
+ Analyzing git history... 18 coupled file pairs
137
+ Generating embeddings... 0 vectors stored
138
+
139
+ Done in 4.2s - 623 symbols, 1847 edges, 8 clusters, 34 flows (no embeddings; rerun with --embed for semantic search)
140
+ ```
141
+
142
+ Search the index:
143
+
144
+ ```bash
145
+ codespine search "retry payment"
146
+ codespine context "PaymentService"
147
+ codespine impact "com.example.PaymentService#charge(java.lang.String)"
148
+ codespine stats
149
+ ```
150
+
151
+ ## MCP
152
+
153
+ Foreground MCP server:
154
+
155
+ ```bash
156
+ codespine mcp
157
+ ```
158
+
159
+ Minimal MCP config:
160
+
161
+ ```json
162
+ {
163
+ "mcpServers": {
164
+ "codespine": {
165
+ "command": "codespine",
166
+ "args": ["mcp"]
167
+ }
168
+ }
169
+ }
170
+ ```
171
+
172
+ If the client launches the wrong Python environment, use the absolute binary path instead:
173
+
174
+ ```json
175
+ {
176
+ "mcpServers": {
177
+ "codespine": {
178
+ "command": "/absolute/path/to/codespine",
179
+ "args": ["mcp"]
180
+ }
181
+ }
182
+ }
183
+ ```
184
+
185
+ Common MCP tools:
186
+
187
+ - `search_hybrid(query, k, project)`
188
+ - `find_symbol(name, kind, project, limit)`
189
+ - `get_symbol_context(query, max_depth, project)`
190
+ - `get_impact(symbol, max_depth, project)`
191
+ - `detect_dead_code(limit, project, strict)`
192
+ - `trace_execution_flows(entry_symbol, max_depth, project)`
193
+ - `get_symbol_community(symbol)`
194
+ - `get_change_coupling(months, min_strength, min_cochanges, project)`
195
+ - `compare_branches(base_ref, head_ref)`
196
+ - `get_codebase_stats()`
197
+
198
+ ## CLI
199
+
200
+ Core commands:
201
+
202
+ ```bash
203
+ codespine analyse <path>
204
+ codespine analyse <path> --full
205
+ codespine analyse <path> --deep
206
+ codespine analyse <path> --embed
207
+ codespine watch --path .
208
+ codespine search "query"
209
+ codespine context "symbol"
210
+ codespine impact "symbol"
211
+ codespine deadcode
212
+ codespine flow
213
+ codespine community
214
+ codespine coupling
215
+ codespine diff main..feature
216
+ codespine stats
217
+ codespine list
218
+ codespine clear-project <project_id>
219
+ codespine clear-index
220
+ ```
221
+
222
+ `analyse` defaults to incremental mode. Repeat runs are designed to be fast when files have not changed.
223
+
224
+ ## Workspace And Module Detection
225
+
226
+ CodeSpine can index:
227
+
228
+ - a single Java repo
229
+ - a multi-module Maven or Gradle repo
230
+ - a workspace directory containing multiple repos
231
+
232
+ Project IDs are:
233
+
234
+ - single-module repo: `payments-service`
235
+ - multi-module repo: `payments-service::core`, `payments-service::api`
236
+
237
+ That same project ID can be passed into MCP tools and CLI analysis calls that support project scoping.
238
+
239
+ ## Deep Analysis Trade-Offs
240
+
241
+ `--deep` enables the expensive graph-wide passes:
242
+
243
+ - communities
244
+ - execution flows
245
+ - dead code
246
+ - git coupling
247
+
248
+ Use it when you want architecture-level context. Skip it when you just need the graph refreshed for search, context, and impact.
249
+
250
+ `--embed` is also optional. Without it, CodeSpine still supports exact, keyword, and fuzzy search. Add embeddings when you need concept-level retrieval.
251
+
252
+ ## Runtime Files
253
+
254
+ - `~/.codespine_db` - graph database
255
+ - `~/.codespine.pid` - MCP background server PID
256
+ - `~/.codespine.log` - server log
257
+ - `~/.codespine_embedding_cache.json` - embedding cache
258
+ - `~/.codespine_index_meta/` - incremental file metadata cache
259
+
260
+ ## Notes
261
+
262
+ - `codespine start` launches a background MCP server. Most IDE MCP clients should use `codespine mcp` instead and manage the process themselves.
263
+ - `codespine clear-index` rebuilds the local index database from scratch.
264
+ - For large Spring or JPA-heavy repos, dead-code results should still be reviewed before deletion. The tool is conservative, not authoritative.
265
+
266
+ ## Project Docs
267
+
268
+ - [Contributing](.github/CONTRIBUTING.md)
269
+ - [Security](.github/SECURITY.md)
270
+ - [Code of Conduct](.github/CODE_OF_CONDUCT.md)
@@ -0,0 +1,206 @@
1
+ # CodeSpine
2
+
3
+ CodeSpine cuts token burn for coding agents working on Java codebases.
4
+
5
+ Instead of having an agent open dozens of `.java` files to answer one question, CodeSpine indexes the codebase once and serves the structure over MCP. The agent asks for symbols, callers, impact, flows, dead code, and module boundaries directly, which means fewer file reads, fewer wasted context windows, and fewer hallucinated code paths.
6
+
7
+ It indexes classes, methods, calls, type relationships, cross-module links, git coupling, dead-code candidates, and execution flows so agents can work from graph answers first and source files second.
8
+
9
+ ## Why It Saves Tokens
10
+
11
+ - One MCP call can replace many file opens. `get_symbol_context("PaymentService")` returns a resolved neighborhood instead of forcing the agent to read every caller and callee file manually.
12
+ - Search is structure-aware. Agents can ask for a symbol, concept, impact radius, or dead-code candidate without scanning entire packages.
13
+ - Multi-module repos stay scoped. Project-aware IDs and `project=` parameters reduce noise from unrelated modules and workspaces.
14
+ - Repeat sessions get cheaper. Once indexed, the agent reuses the graph instead of re-discovering the same relationships every turn.
15
+
16
+ ## Install
17
+
18
+ ```bash
19
+ pip install codespine
20
+ ```
21
+
22
+ Optional semantic search:
23
+
24
+ ```bash
25
+ pip install "codespine[ml]"
26
+ ```
27
+
28
+ ## What It Does
29
+
30
+ - Hybrid search: BM25 + fuzzy by default, semantic vector search with `--embed`
31
+ - Impact analysis: callers, dependencies, and confidence-scored edges
32
+ - Dead code detection: Java-aware exemptions for tests, framework hooks, contracts, and common DI patterns
33
+ - Execution flows: traces from entry points through the call graph
34
+ - Community detection: structural clusters for architectural context
35
+ - Change coupling: git-history-based file relationships
36
+ - Multi-project and multi-module indexing: workspaces, Maven modules, Gradle subprojects
37
+ - MCP server: structured tools for Claude, Cursor, Cline, Copilot, and similar clients
38
+
39
+ ## Quick Start
40
+
41
+ Index a repo:
42
+
43
+ ```bash
44
+ codespine analyse /path/to/project
45
+ ```
46
+
47
+ Run a deeper pass:
48
+
49
+ ```bash
50
+ codespine analyse /path/to/project --deep
51
+ ```
52
+
53
+ Add embeddings for semantic search:
54
+
55
+ ```bash
56
+ codespine analyse /path/to/project --embed
57
+ ```
58
+
59
+ Typical output:
60
+
61
+ ```text
62
+ $ codespine analyse .
63
+ Walking files... 142 files found
64
+ Index mode... incremental (8 files to index, 0 deleted)
65
+ Parsing code... 8/8
66
+ Tracing calls... 847 calls resolved
67
+ Analyzing types... 234 type relationships
68
+ Cross-module linking... skipped (single module)
69
+ Detecting communities... 8 clusters found
70
+ Detecting execution flows... 34 processes found
71
+ Finding dead code... 12 unreachable symbols
72
+ Analyzing git history... 18 coupled file pairs
73
+ Generating embeddings... 0 vectors stored
74
+
75
+ Done in 4.2s - 623 symbols, 1847 edges, 8 clusters, 34 flows (no embeddings; rerun with --embed for semantic search)
76
+ ```
77
+
78
+ Search the index:
79
+
80
+ ```bash
81
+ codespine search "retry payment"
82
+ codespine context "PaymentService"
83
+ codespine impact "com.example.PaymentService#charge(java.lang.String)"
84
+ codespine stats
85
+ ```
86
+
87
+ ## MCP
88
+
89
+ Foreground MCP server:
90
+
91
+ ```bash
92
+ codespine mcp
93
+ ```
94
+
95
+ Minimal MCP config:
96
+
97
+ ```json
98
+ {
99
+ "mcpServers": {
100
+ "codespine": {
101
+ "command": "codespine",
102
+ "args": ["mcp"]
103
+ }
104
+ }
105
+ }
106
+ ```
107
+
108
+ If the client launches the wrong Python environment, use the absolute binary path instead:
109
+
110
+ ```json
111
+ {
112
+ "mcpServers": {
113
+ "codespine": {
114
+ "command": "/absolute/path/to/codespine",
115
+ "args": ["mcp"]
116
+ }
117
+ }
118
+ }
119
+ ```
120
+
121
+ Common MCP tools:
122
+
123
+ - `search_hybrid(query, k, project)`
124
+ - `find_symbol(name, kind, project, limit)`
125
+ - `get_symbol_context(query, max_depth, project)`
126
+ - `get_impact(symbol, max_depth, project)`
127
+ - `detect_dead_code(limit, project, strict)`
128
+ - `trace_execution_flows(entry_symbol, max_depth, project)`
129
+ - `get_symbol_community(symbol)`
130
+ - `get_change_coupling(months, min_strength, min_cochanges, project)`
131
+ - `compare_branches(base_ref, head_ref)`
132
+ - `get_codebase_stats()`
133
+
134
+ ## CLI
135
+
136
+ Core commands:
137
+
138
+ ```bash
139
+ codespine analyse <path>
140
+ codespine analyse <path> --full
141
+ codespine analyse <path> --deep
142
+ codespine analyse <path> --embed
143
+ codespine watch --path .
144
+ codespine search "query"
145
+ codespine context "symbol"
146
+ codespine impact "symbol"
147
+ codespine deadcode
148
+ codespine flow
149
+ codespine community
150
+ codespine coupling
151
+ codespine diff main..feature
152
+ codespine stats
153
+ codespine list
154
+ codespine clear-project <project_id>
155
+ codespine clear-index
156
+ ```
157
+
158
+ `analyse` defaults to incremental mode. Repeat runs are designed to be fast when files have not changed.
159
+
160
+ ## Workspace And Module Detection
161
+
162
+ CodeSpine can index:
163
+
164
+ - a single Java repo
165
+ - a multi-module Maven or Gradle repo
166
+ - a workspace directory containing multiple repos
167
+
168
+ Project IDs are:
169
+
170
+ - single-module repo: `payments-service`
171
+ - multi-module repo: `payments-service::core`, `payments-service::api`
172
+
173
+ That same project ID can be passed into MCP tools and CLI analysis calls that support project scoping.
174
+
175
+ ## Deep Analysis Trade-Offs
176
+
177
+ `--deep` enables the expensive graph-wide passes:
178
+
179
+ - communities
180
+ - execution flows
181
+ - dead code
182
+ - git coupling
183
+
184
+ Use it when you want architecture-level context. Skip it when you just need the graph refreshed for search, context, and impact.
185
+
186
+ `--embed` is also optional. Without it, CodeSpine still supports exact, keyword, and fuzzy search. Add embeddings when you need concept-level retrieval.
187
+
188
+ ## Runtime Files
189
+
190
+ - `~/.codespine_db` - graph database
191
+ - `~/.codespine.pid` - MCP background server PID
192
+ - `~/.codespine.log` - server log
193
+ - `~/.codespine_embedding_cache.json` - embedding cache
194
+ - `~/.codespine_index_meta/` - incremental file metadata cache
195
+
196
+ ## Notes
197
+
198
+ - `codespine start` launches a background MCP server. Most IDE MCP clients should use `codespine mcp` instead and manage the process themselves.
199
+ - `codespine clear-index` rebuilds the local index database from scratch.
200
+ - For large Spring or JPA-heavy repos, dead-code results should still be reviewed before deletion. The tool is conservative, not authoritative.
201
+
202
+ ## Project Docs
203
+
204
+ - [Contributing](.github/CONTRIBUTING.md)
205
+ - [Security](.github/SECURITY.md)
206
+ - [Code of Conduct](.github/CODE_OF_CONDUCT.md)
@@ -1,4 +1,4 @@
1
1
  """CodeSpine package."""
2
2
 
3
3
  __all__ = ["__version__"]
4
- __version__ = "0.5.3"
4
+ __version__ = "0.5.4"
@@ -0,0 +1,182 @@
1
+ from __future__ import annotations
2
+
3
+ from collections import Counter, defaultdict
4
+
5
+ MAX_LEIDEN_SYMBOLS = 12000
6
+ MIN_COMMUNITY_SIZE = 2
7
+ PACKAGE_BUCKET_DEPTH = 5
8
+
9
+
10
+ def _package_bucket(fqname: str) -> str:
11
+ base = (fqname or "").split("#", 1)[0]
12
+ parts = [p for p in base.split(".") if p]
13
+ if len(parts) <= 2:
14
+ return base or "default"
15
+ package_parts = parts[:-1] if len(parts) > 1 else parts
16
+ depth = min(PACKAGE_BUCKET_DEPTH, len(package_parts))
17
+ return ".".join(package_parts[:depth]) or base or "default"
18
+
19
+
20
+ def _community_label(symbol_ids: list[str], symbol_meta: dict[str, dict]) -> str:
21
+ bucket_counts = Counter(_package_bucket(symbol_meta[sid].get("fqname", "")) for sid in symbol_ids if sid in symbol_meta)
22
+ if bucket_counts:
23
+ return bucket_counts.most_common(1)[0][0]
24
+ return "community"
25
+
26
+
27
+ def _call_graph_communities(symbol_meta: dict[str, dict], method_edges: list[tuple[str, str]], progress=None) -> dict[str, int]:
28
+ def _ping(msg: str) -> None:
29
+ if progress:
30
+ progress(msg)
31
+
32
+ graph_nodes = sorted({sid for edge in method_edges for sid in edge})
33
+ if not graph_nodes:
34
+ return {}
35
+
36
+ if len(graph_nodes) > MAX_LEIDEN_SYMBOLS:
37
+ _ping(f"graph too large for leiden ({len(graph_nodes)} symbols), using package fallback")
38
+ return {}
39
+
40
+ index_of = {sid: i for i, sid in enumerate(graph_nodes)}
41
+ membership: dict[str, int] = {}
42
+ try:
43
+ import igraph as ig
44
+ import leidenalg
45
+
46
+ _ping(f"{len(graph_nodes)} connected symbols, running leiden")
47
+ g = ig.Graph(directed=False)
48
+ g.add_vertices(len(graph_nodes))
49
+ g.add_edges([(index_of[src], index_of[dst]) for src, dst in method_edges if src in index_of and dst in index_of])
50
+ part = leidenalg.find_partition(g, leidenalg.ModularityVertexPartition)
51
+ for idx, cid in enumerate(part.membership):
52
+ membership[graph_nodes[idx]] = int(cid)
53
+ except Exception:
54
+ _ping("leiden unavailable, using package fallback")
55
+ return {}
56
+ return membership
57
+
58
+
59
def detect_communities(store, progress=None) -> list[dict]:
    """Cluster all indexed symbols into communities and persist them.

    Steps, in order:

    1. Load every ``Symbol`` record; return ``[]`` when there are none.
    2. Build undirected structural edges: method symbol <-> owning class
       symbol, and caller <-> callee for every ``CALLS`` relationship whose
       two methods map to known symbols.
    3. Ask ``_call_graph_communities`` for a graph partition. Graph clusters
       with at least ``MIN_COMMUNITY_SIZE`` members are kept as ``graph:<n>``;
       members of smaller clusters and symbols outside the partition are
       grouped by package bucket as ``pkg:<bucket>``.
    4. Drop groups smaller than ``MIN_COMMUNITY_SIZE``; if nothing remains,
       put every symbol into one ``fallback:1000000`` group.
    5. Clear the stored communities and write the new ones.

    Args:
        store: Graph store exposing ``query_records``, ``clear_communities``
            and ``set_community``.
        progress: Optional callable receiving status messages.

    Returns:
        One dict per community (``community_id``, ``label``, ``cohesion``,
        ``size``), ordered by size then label, both descending. ``cohesion``
        is the community's share of all symbols, capped at 1.0.
    """

    def _report(message: str) -> None:
        if progress:
            progress(message)

    def _undirected(left: str, right: str) -> tuple[str, str]:
        # Normalise endpoint order so (a, b) and (b, a) collapse into one edge.
        return tuple(sorted((left, right)))

    _report("loading symbols")
    symbol_rows = store.query_records(
        """
        MATCH (s:Symbol)
        RETURN s.id as id, s.kind as kind, s.fqname as fqname, s.file_id as file_id
        """
    )
    if not symbol_rows:
        return []

    symbol_meta = {record["id"]: record for record in symbol_rows}

    # Index method and class symbols by (file_id, fqname) so graph rows can be
    # mapped back to symbol ids.
    methods_by_key: dict[tuple[str, str], str] = {}
    classes_by_key: dict[tuple[str, str], str] = {}
    for record in symbol_rows:
        lookup_key = (record.get("file_id", ""), record.get("fqname", ""))
        kind = record.get("kind")
        if kind == "method":
            methods_by_key[lookup_key] = record["id"]
        elif kind == "class":
            classes_by_key[lookup_key] = record["id"]

    _report("loading methods")
    method_rows = store.query_records(
        """
        MATCH (m:Method), (c:Class)
        WHERE m.class_id = c.id
        RETURN m.id as method_id, c.file_id as file_id, c.fqcn as class_fqcn, m.signature as signature
        """
    )
    symbol_for_method: dict[str, str] = {}
    edges: set[tuple[str, str]] = set()
    for row in method_rows:
        file_id = row.get("file_id", "")
        method_fqname = f"{row.get('class_fqcn', '')}#{row.get('signature', '')}"
        method_sid = methods_by_key.get((file_id, method_fqname))
        if not method_sid:
            continue
        symbol_for_method[row["method_id"]] = method_sid
        owner_sid = classes_by_key.get((file_id, row.get("class_fqcn", "")))
        if owner_sid and owner_sid != method_sid:
            edges.add(_undirected(method_sid, owner_sid))

    _report("loading call edges")
    call_rows = store.query_records(
        """
        MATCH (a:Method)-[:CALLS]->(b:Method)
        RETURN a.id as src, b.id as dst
        """
    )
    for row in call_rows:
        caller = symbol_for_method.get(row.get("src", ""))
        callee = symbol_for_method.get(row.get("dst", ""))
        if caller and callee and caller != callee:
            edges.add(_undirected(caller, callee))

    _report(f"{len(symbol_rows)} symbols, {len(edges)} structural edges")
    membership = _call_graph_communities(symbol_meta, sorted(edges), progress=progress)

    # Collect graph clusters first; only the meaningful ones survive as-is,
    # tiny ones are folded into package buckets.
    by_graph_cluster: dict[int, list[str]] = defaultdict(list)
    for sid, cluster in membership.items():
        by_graph_cluster[cluster].append(sid)

    grouped: dict[str, list[str]] = defaultdict(list)
    for cluster, members in by_graph_cluster.items():
        if len(members) < MIN_COMMUNITY_SIZE:
            for sid in members:
                grouped[f"pkg:{_package_bucket(symbol_meta[sid].get('fqname', ''))}"].append(sid)
            continue
        grouped[f"graph:{cluster}"].extend(members)

    # Symbols the graph partition never saw are grouped by package bucket too.
    for sid, meta in symbol_meta.items():
        if sid not in membership:
            grouped[f"pkg:{_package_bucket(meta.get('fqname', ''))}"].append(sid)

    # Residual singletons are not useful architectural communities.
    kept = {key: members for key, members in grouped.items() if len(members) >= MIN_COMMUNITY_SIZE}
    if not kept:
        # Last resort: one broad bucket so callers still get some context.
        kept["fallback:1000000"] = list(symbol_meta)

    _report(f"{len(kept)} clusters, replacing previous communities")
    store.clear_communities()

    symbol_total = max(len(symbol_meta), 1)
    cluster_total = len(kept)
    communities: list[dict] = []
    for position, community_id in enumerate(sorted(kept), start=1):
        member_ids = kept[community_id]
        label = _community_label(member_ids, symbol_meta)
        cohesion = min(1.0, len(member_ids) / symbol_total)
        store.set_community(community_id, label, cohesion, member_ids)
        if position == cluster_total or position % 100 == 0:
            _report(f"persisting {position}/{cluster_total} clusters")
        communities.append(
            {
                "community_id": community_id,
                "label": label,
                "cohesion": cohesion,
                "size": len(member_ids),
            }
        )

    return sorted(communities, key=lambda entry: (entry["size"], entry["label"]), reverse=True)
170
+
171
+
172
def symbol_community(store, symbol_query: str) -> dict:
    """Look up the communities of the symbols matching ``symbol_query``.

    A symbol matches when its id equals the query exactly, or when its
    fqname or name equals the query ignoring case. At most 20 rows are
    requested from the store.

    Returns:
        ``{"query": symbol_query, "matches": rows}`` where ``rows`` is
        whatever ``store.query_records`` returned.
    """
    cypher = """
        MATCH (s:Symbol)-[:IN_COMMUNITY]->(c:Community)
        WHERE s.id = $q OR lower(s.fqname) = lower($q) OR lower(s.name) = lower($q)
        RETURN s.id as symbol_id, s.fqname as fqname, c.id as community_id, c.label as label, c.cohesion as cohesion
        LIMIT 20
        """
    bindings = {"q": symbol_query}
    matches = store.query_records(cypher, bindings)
    return {"query": symbol_query, "matches": matches}