clewdex 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- clewdex-0.1.0/.claude/settings.local.json +65 -0
- clewdex-0.1.0/.claude/skills/clew-enrich/SKILL.md +166 -0
- clewdex-0.1.0/.clew/cache.db +0 -0
- clewdex-0.1.0/.clew/state.db +0 -0
- clewdex-0.1.0/.env.example +8 -0
- clewdex-0.1.0/.github/workflows/ci.yml +37 -0
- clewdex-0.1.0/.github/workflows/publish-npm.yml +25 -0
- clewdex-0.1.0/.github/workflows/publish-pypi.yml +47 -0
- clewdex-0.1.0/.gitignore +31 -0
- clewdex-0.1.0/CLAUDE.md +156 -0
- clewdex-0.1.0/PKG-INFO +440 -0
- clewdex-0.1.0/README.md +388 -0
- clewdex-0.1.0/clew/__init__.py +0 -0
- clewdex-0.1.0/clew/__main__.py +5 -0
- clewdex-0.1.0/clew/chunker/__init__.py +0 -0
- clewdex-0.1.0/clew/chunker/fallback.py +205 -0
- clewdex-0.1.0/clew/chunker/parser.py +64 -0
- clewdex-0.1.0/clew/chunker/strategies.py +132 -0
- clewdex-0.1.0/clew/chunker/tokenizer.py +22 -0
- clewdex-0.1.0/clew/cli.py +364 -0
- clewdex-0.1.0/clew/clients/__init__.py +22 -0
- clewdex-0.1.0/clew/clients/base.py +29 -0
- clewdex-0.1.0/clew/clients/circuit_breaker.py +56 -0
- clewdex-0.1.0/clew/clients/description.py +252 -0
- clewdex-0.1.0/clew/clients/qdrant.py +165 -0
- clewdex-0.1.0/clew/clients/voyage.py +83 -0
- clewdex-0.1.0/clew/config.py +57 -0
- clewdex-0.1.0/clew/discovery.py +104 -0
- clewdex-0.1.0/clew/exceptions.py +107 -0
- clewdex-0.1.0/clew/factory.py +120 -0
- clewdex-0.1.0/clew/indexer/__init__.py +0 -0
- clewdex-0.1.0/clew/indexer/cache.py +700 -0
- clewdex-0.1.0/clew/indexer/change_detector.py +87 -0
- clewdex-0.1.0/clew/indexer/extractors/__init__.py +0 -0
- clewdex-0.1.0/clew/indexer/extractors/api_boundary.py +115 -0
- clewdex-0.1.0/clew/indexer/extractors/base.py +26 -0
- clewdex-0.1.0/clew/indexer/extractors/django_models.py +133 -0
- clewdex-0.1.0/clew/indexer/extractors/django_urls.py +80 -0
- clewdex-0.1.0/clew/indexer/extractors/python.py +373 -0
- clewdex-0.1.0/clew/indexer/extractors/tests.py +97 -0
- clewdex-0.1.0/clew/indexer/extractors/typescript.py +256 -0
- clewdex-0.1.0/clew/indexer/file_hash.py +47 -0
- clewdex-0.1.0/clew/indexer/git_tracker.py +225 -0
- clewdex-0.1.0/clew/indexer/ignore.py +85 -0
- clewdex-0.1.0/clew/indexer/importance.py +33 -0
- clewdex-0.1.0/clew/indexer/metadata.py +123 -0
- clewdex-0.1.0/clew/indexer/pipeline.py +1039 -0
- clewdex-0.1.0/clew/indexer/relationships.py +16 -0
- clewdex-0.1.0/clew/mcp_server.py +571 -0
- clewdex-0.1.0/clew/models.py +144 -0
- clewdex-0.1.0/clew/safety.py +59 -0
- clewdex-0.1.0/clew/search/__init__.py +1 -0
- clewdex-0.1.0/clew/search/engine.py +183 -0
- clewdex-0.1.0/clew/search/enhance.py +76 -0
- clewdex-0.1.0/clew/search/filters.py +55 -0
- clewdex-0.1.0/clew/search/hybrid.py +185 -0
- clewdex-0.1.0/clew/search/intent.py +113 -0
- clewdex-0.1.0/clew/search/models.py +59 -0
- clewdex-0.1.0/clew/search/rerank.py +95 -0
- clewdex-0.1.0/clew/search/tokenize.py +108 -0
- clewdex-0.1.0/docker-compose.yml +23 -0
- clewdex-0.1.0/docs/CLAUDE_CONTEXT_ANALYSIS.md +250 -0
- clewdex-0.1.0/docs/DESIGN.md +1117 -0
- clewdex-0.1.0/docs/IMPLEMENTATION.md +1976 -0
- clewdex-0.1.0/docs/adr/001-qdrant-as-vector-database.md +333 -0
- clewdex-0.1.0/docs/adr/002-build-vs-adopt-claude-context.md +100 -0
- clewdex-0.1.0/docs/adr/003-ported-features-from-claude-context.md +134 -0
- clewdex-0.1.0/docs/plans/2026-02-06-phase1-core-infrastructure.md +2693 -0
- clewdex-0.1.0/docs/plans/2026-02-06-phase2-search-pipeline.md +3210 -0
- clewdex-0.1.0/docs/plans/2026-02-06-three-layer-knowledge-design.md +369 -0
- clewdex-0.1.0/docs/plans/2026-02-09-v1.1-nl-descriptions.md +1180 -0
- clewdex-0.1.0/docs/plans/2026-02-10-compact-responses-and-cache-fix.md +709 -0
- clewdex-0.1.0/docs/plans/2026-02-10-fix-evaluation-shortcomings.md +312 -0
- clewdex-0.1.0/docs/plans/2026-02-12-index-enrichment-design.md +431 -0
- clewdex-0.1.0/docs/plans/2026-02-12-plan-review-findings.md +93 -0
- clewdex-0.1.0/docs/plans/2026-02-12-v2-consolidated-implementation.md +863 -0
- clewdex-0.1.0/homebrew/clewdex.rb +39 -0
- clewdex-0.1.0/npm/README.md +28 -0
- clewdex-0.1.0/npm/bin/clewdex.js +37 -0
- clewdex-0.1.0/npm/package.json +31 -0
- clewdex-0.1.0/pyproject.toml +106 -0
- clewdex-0.1.0/tests/__init__.py +0 -0
- clewdex-0.1.0/tests/conftest.py +42 -0
- clewdex-0.1.0/tests/fixtures/python/sample_models.py +28 -0
- clewdex-0.1.0/tests/integration/__init__.py +0 -0
- clewdex-0.1.0/tests/integration/conftest.py +91 -0
- clewdex-0.1.0/tests/integration/test_end_to_end.py +308 -0
- clewdex-0.1.0/tests/unit/__init__.py +0 -0
- clewdex-0.1.0/tests/unit/test_api_boundary.py +114 -0
- clewdex-0.1.0/tests/unit/test_cache.py +552 -0
- clewdex-0.1.0/tests/unit/test_change_detector.py +178 -0
- clewdex-0.1.0/tests/unit/test_chunker.py +93 -0
- clewdex-0.1.0/tests/unit/test_cli.py +345 -0
- clewdex-0.1.0/tests/unit/test_code_tokenize.py +81 -0
- clewdex-0.1.0/tests/unit/test_config.py +97 -0
- clewdex-0.1.0/tests/unit/test_description_provider.py +319 -0
- clewdex-0.1.0/tests/unit/test_discovery.py +198 -0
- clewdex-0.1.0/tests/unit/test_django_model_extractor.py +127 -0
- clewdex-0.1.0/tests/unit/test_django_url_extractor.py +60 -0
- clewdex-0.1.0/tests/unit/test_embedding_provider.py +56 -0
- clewdex-0.1.0/tests/unit/test_enhance.py +92 -0
- clewdex-0.1.0/tests/unit/test_factory.py +269 -0
- clewdex-0.1.0/tests/unit/test_fallback.py +110 -0
- clewdex-0.1.0/tests/unit/test_file_hash.py +76 -0
- clewdex-0.1.0/tests/unit/test_filters.py +69 -0
- clewdex-0.1.0/tests/unit/test_git_tracker.py +78 -0
- clewdex-0.1.0/tests/unit/test_hybrid_search.py +380 -0
- clewdex-0.1.0/tests/unit/test_ignore.py +87 -0
- clewdex-0.1.0/tests/unit/test_importance.py +69 -0
- clewdex-0.1.0/tests/unit/test_indexing_pipeline.py +904 -0
- clewdex-0.1.0/tests/unit/test_intent.py +60 -0
- clewdex-0.1.0/tests/unit/test_mcp_server.py +995 -0
- clewdex-0.1.0/tests/unit/test_metadata.py +122 -0
- clewdex-0.1.0/tests/unit/test_models.py +51 -0
- clewdex-0.1.0/tests/unit/test_python_extractor.py +226 -0
- clewdex-0.1.0/tests/unit/test_qdrant_manager.py +136 -0
- clewdex-0.1.0/tests/unit/test_relationships.py +30 -0
- clewdex-0.1.0/tests/unit/test_rerank.py +146 -0
- clewdex-0.1.0/tests/unit/test_safety.py +75 -0
- clewdex-0.1.0/tests/unit/test_search_engine.py +368 -0
- clewdex-0.1.0/tests/unit/test_search_models.py +105 -0
- clewdex-0.1.0/tests/unit/test_strategies.py +116 -0
- clewdex-0.1.0/tests/unit/test_test_extractor.py +60 -0
- clewdex-0.1.0/tests/unit/test_tokenizer.py +35 -0
- clewdex-0.1.0/tests/unit/test_typescript_extractor.py +178 -0
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
{
|
|
2
|
+
"permissions": {
|
|
3
|
+
"allow": [
|
|
4
|
+
"Bash(python -m pytest:*)",
|
|
5
|
+
"Bash(python3 -m pytest:*)",
|
|
6
|
+
"Bash(pytest:*)",
|
|
7
|
+
"Bash(uv run pytest:*)",
|
|
8
|
+
"Bash(python3 -m ruff check:*)",
|
|
9
|
+
"Bash(python3 -m ruff format:*)",
|
|
10
|
+
"Bash(ruff check:*)",
|
|
11
|
+
"Bash(ruff format:*)",
|
|
12
|
+
"Bash(mypy:*)",
|
|
13
|
+
"Bash(git branch:*)",
|
|
14
|
+
"Bash(python3:*)",
|
|
15
|
+
"Bash(git filter-branch:*)",
|
|
16
|
+
"Bash(FILTER_BRANCH_SQUELCH_WARNING=1 git filter-branch:*)",
|
|
17
|
+
"Bash(grep:*)",
|
|
18
|
+
"Bash(find:*)",
|
|
19
|
+
"Bash(git fsck:*)",
|
|
20
|
+
"WebSearch",
|
|
21
|
+
"Bash(gh search:*)",
|
|
22
|
+
"WebFetch(domain:pypi.org)",
|
|
23
|
+
"WebFetch(domain:www.npmjs.com)",
|
|
24
|
+
"WebFetch(domain:github.com)",
|
|
25
|
+
"WebFetch(domain:formulae.brew.sh)",
|
|
26
|
+
"Bash(pip show:*)",
|
|
27
|
+
"Bash(pip uninstall:*)",
|
|
28
|
+
"Bash(git mv:*)",
|
|
29
|
+
"Bash(pip install:*)",
|
|
30
|
+
"Bash(clew --help:*)",
|
|
31
|
+
"mcp__plugin_episodic-memory_episodic-memory__search",
|
|
32
|
+
"mcp__plugin_episodic-memory_episodic-memory__read",
|
|
33
|
+
"mcp__plugin_context7_context7__resolve-library-id",
|
|
34
|
+
"mcp__plugin_context7_context7__query-docs",
|
|
35
|
+
"WebFetch(domain:weaviate.io)",
|
|
36
|
+
"WebFetch(domain:sourcegraph.com)",
|
|
37
|
+
"WebFetch(domain:www.greptile.com)",
|
|
38
|
+
"WebFetch(domain:towardsdatascience.com)",
|
|
39
|
+
"WebFetch(domain:read.engineerscodex.com)",
|
|
40
|
+
"WebFetch(domain:blog.voyageai.com)",
|
|
41
|
+
"WebFetch(domain:www.infoq.com)",
|
|
42
|
+
"WebFetch(domain:qdrant.tech)",
|
|
43
|
+
"WebFetch(domain:arxiv.org)",
|
|
44
|
+
"WebFetch(domain:news.ycombinator.com)",
|
|
45
|
+
"WebFetch(domain:medium.com)",
|
|
46
|
+
"WebFetch(domain:registry.npmjs.org)",
|
|
47
|
+
"WebFetch(domain:crates.io)",
|
|
48
|
+
"Bash(curl:*)",
|
|
49
|
+
"Bash(python -m build:*)",
|
|
50
|
+
"Bash(node:*)",
|
|
51
|
+
"Bash(clew index:*)",
|
|
52
|
+
"Bash(QDRANT_URL=http://localhost:6333 clew index:*)",
|
|
53
|
+
"Bash(QDRANT_URL=http://localhost:6333 clew search:*)",
|
|
54
|
+
"Bash(QDRANT_URL=http://localhost:6333 clew trace:*)",
|
|
55
|
+
"Bash(clew status:*)",
|
|
56
|
+
"Bash(QDRANT_URL=http://localhost:6333 clew status:*)",
|
|
57
|
+
"Bash(clew trace:*)",
|
|
58
|
+
"Bash(clew search:*)",
|
|
59
|
+
"Bash(sqlite3:*)",
|
|
60
|
+
"Bash(python -m clew.cli:*)",
|
|
61
|
+
"Bash(clew reembed:*)",
|
|
62
|
+
"Bash(QDRANT_URL=http://localhost:6333 clew reembed:*)"
|
|
63
|
+
]
|
|
64
|
+
}
|
|
65
|
+
}
|
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
# /clew-enrich
|
|
2
|
+
|
|
3
|
+
Enrich indexed code chunks with LLM-generated descriptions and keywords to improve semantic search quality.
|
|
4
|
+
|
|
5
|
+
## When to Use
|
|
6
|
+
|
|
7
|
+
Run `/clew-enrich` after `clew index` has completed. Enrichment adds natural language descriptions and search keywords to each code chunk, enabling better semantic search results for natural language queries like "how does order processing work".
|
|
8
|
+
|
|
9
|
+
## Prerequisites
|
|
10
|
+
|
|
11
|
+
- The project must already be indexed (`clew index [PROJECT_ROOT]`)
|
|
12
|
+
- The `.clew/` directory must exist in the project root (created by `clew index`)
|
|
13
|
+
|
|
14
|
+
## What It Does
|
|
15
|
+
|
|
16
|
+
1. Reads all chunk IDs from the SQLite cache (`cache.db`)
|
|
17
|
+
2. Identifies chunks that have NOT yet been enriched
|
|
18
|
+
3. For each unenriched chunk, reads the source code from disk
|
|
19
|
+
4. Generates a description (2-3 sentences) and keywords (8-15 terms) per chunk
|
|
20
|
+
5. Writes enrichment data back to the SQLite cache
|
|
21
|
+
6. Triggers `clew reembed` to re-embed all enriched chunks with the new content
|
|
22
|
+
|
|
23
|
+
## Step-by-Step Instructions
|
|
24
|
+
|
|
25
|
+
### Step 1: Locate the Cache Directory
|
|
26
|
+
|
|
27
|
+
The cache directory is at `{project_root}/.clew/`. The project root is typically the git root. If `CLEW_CACHE_DIR` is set, use that path instead.
|
|
28
|
+
|
|
29
|
+
```bash
|
|
30
|
+
# Find the git root
|
|
31
|
+
git rev-parse --show-toplevel
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
The cache database file is `{cache_dir}/cache.db`.
|
|
35
|
+
|
|
36
|
+
### Step 2: Read Unenriched Chunks
|
|
37
|
+
|
|
38
|
+
Run this Python script to get the list of unenriched chunk IDs and their file paths:
|
|
39
|
+
|
|
40
|
+
```python
|
|
41
|
+
import sqlite3
|
|
42
|
+
import json
|
|
43
|
+
|
|
44
|
+
cache_dir = "{PROJECT_ROOT}/.clew" # Replace with actual path
|
|
45
|
+
conn = sqlite3.connect(f"{cache_dir}/cache.db")
|
|
46
|
+
conn.row_factory = sqlite3.Row
|
|
47
|
+
|
|
48
|
+
# Get all chunk_ids from chunk_cache
|
|
49
|
+
rows = conn.execute("SELECT file_path, chunk_ids FROM chunk_cache").fetchall()
|
|
50
|
+
all_chunks = []
|
|
51
|
+
for row in rows:
|
|
52
|
+
file_path = row["file_path"]
|
|
53
|
+
chunk_ids = json.loads(row["chunk_ids"])
|
|
54
|
+
for cid in chunk_ids:
|
|
55
|
+
all_chunks.append((cid, file_path))
|
|
56
|
+
|
|
57
|
+
# Get already-enriched chunk_ids
|
|
58
|
+
enriched = set(
|
|
59
|
+
r[0] for r in conn.execute("SELECT chunk_id FROM enrichment_cache").fetchall()
|
|
60
|
+
)
|
|
61
|
+
conn.close()
|
|
62
|
+
|
|
63
|
+
# Filter to unenriched
|
|
64
|
+
unenriched = [(cid, fp) for cid, fp in all_chunks if cid not in enriched]
|
|
65
|
+
print(f"Total chunks: {len(all_chunks)}")
|
|
66
|
+
print(f"Already enriched: {len(enriched)}")
|
|
67
|
+
print(f"Unenriched: {len(unenriched)}")
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
### Step 3: Enrich Each Chunk
|
|
71
|
+
|
|
72
|
+
For each unenriched chunk, you need to:
|
|
73
|
+
|
|
74
|
+
1. **Parse the chunk_id** to extract entity info. The format is: `file_path::entity_type::qualified_name` (for named entities) or `file_path::toplevel::sha256hash` (for anonymous/toplevel chunks).
|
|
75
|
+
|
|
76
|
+
2. **Read the source file** from disk using the `file_path` from the chunk_id (the part before the first `::`).
|
|
77
|
+
|
|
78
|
+
3. **Generate description and keywords**. For each chunk, produce output in this exact format:
|
|
79
|
+
```
|
|
80
|
+
Description: 2-3 sentences explaining what the code does, why it exists, and what domain concept it represents.
|
|
81
|
+
Keywords: 8-15 space-separated terms a developer might search for when looking for this code.
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
When generating descriptions and keywords, consider:
|
|
85
|
+
- What does this code do at a high level?
|
|
86
|
+
- What domain concepts does it relate to?
|
|
87
|
+
- What would someone search for to find this code?
|
|
88
|
+
- Include both technical terms and domain-specific vocabulary
|
|
89
|
+
|
|
90
|
+
4. **Write enrichment to cache** using this Python script (batch mode):
|
|
91
|
+
|
|
92
|
+
```python
|
|
93
|
+
import sqlite3
|
|
94
|
+
import time
|
|
95
|
+
|
|
96
|
+
cache_dir = "{PROJECT_ROOT}/.clew" # Replace with actual path
|
|
97
|
+
conn = sqlite3.connect(f"{cache_dir}/cache.db")
|
|
98
|
+
|
|
99
|
+
enrichments = [
|
|
100
|
+
# ("chunk_id", "description text", "keyword1 keyword2 keyword3 ..."),
|
|
101
|
+
]
|
|
102
|
+
|
|
103
|
+
for chunk_id, description, keywords in enrichments:
|
|
104
|
+
conn.execute(
|
|
105
|
+
"INSERT OR REPLACE INTO enrichment_cache "
|
|
106
|
+
"(chunk_id, description, keywords, enriched_at) "
|
|
107
|
+
"VALUES (?, ?, ?, ?)",
|
|
108
|
+
(chunk_id, description, keywords, time.time()),
|
|
109
|
+
)
|
|
110
|
+
|
|
111
|
+
conn.commit()
|
|
112
|
+
conn.close()
|
|
113
|
+
print(f"Wrote {len(enrichments)} enrichments to cache")
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
### Step 4: Process in Batches
|
|
117
|
+
|
|
118
|
+
Process chunks in batches of 20-50 to manage context window size:
|
|
119
|
+
|
|
120
|
+
1. Read a batch of source files using the Read tool
|
|
121
|
+
2. Generate descriptions and keywords for all chunks in the batch
|
|
122
|
+
3. Write the batch to SQLite
|
|
123
|
+
4. Repeat until all chunks are processed
|
|
124
|
+
|
|
125
|
+
Skip chunks where:
|
|
126
|
+
- The file no longer exists on disk
|
|
127
|
+
- The chunk_id contains `::toplevel::` (anonymous chunks have less value to enrich)
|
|
128
|
+
- The chunk_id contains `::file_summary::` (already synthetic)
|
|
129
|
+
|
|
130
|
+
### Step 5: Re-embed
|
|
131
|
+
|
|
132
|
+
After all enrichments are written to the cache, run:
|
|
133
|
+
|
|
134
|
+
```bash
|
|
135
|
+
clew reembed {PROJECT_ROOT}
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
This reads the enrichment data from cache and re-embeds all enriched chunks with the full content (description + keywords + code) into Qdrant's named vectors.
|
|
139
|
+
|
|
140
|
+
## Enrichment Cache Schema
|
|
141
|
+
|
|
142
|
+
The enrichment data is stored in `{cache_dir}/cache.db` in the `enrichment_cache` table:
|
|
143
|
+
|
|
144
|
+
```sql
|
|
145
|
+
CREATE TABLE IF NOT EXISTS enrichment_cache (
|
|
146
|
+
chunk_id TEXT PRIMARY KEY,
|
|
147
|
+
description TEXT,
|
|
148
|
+
keywords TEXT,
|
|
149
|
+
enriched_at REAL
|
|
150
|
+
);
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
## Example Output
|
|
154
|
+
|
|
155
|
+
For a chunk with ID `backend/ecomm/utils.py::function::EcommUtils._process_shopify_order_impl`:
|
|
156
|
+
|
|
157
|
+
```
|
|
158
|
+
Description: Processes incoming Shopify orders by validating order data, creating internal Order records, and triggering fulfillment workflows. This is the core order ingestion handler called by the Shopify webhook receiver.
|
|
159
|
+
Keywords: shopify order processing webhook fulfillment ecommerce cart purchase payment order_ingestion
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
## Notes
|
|
163
|
+
|
|
164
|
+
- Enrichment is idempotent: running `/clew-enrich` again skips already-enriched chunks
|
|
165
|
+
- To re-enrich everything, delete the `enrichment_cache` table contents first: `DELETE FROM enrichment_cache` in `cache.db`
|
|
166
|
+
- The `clew reembed` step is required after writing enrichments; without it, the search index won't reflect the new descriptions
|
|
Binary file
|
|
Binary file
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [main]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
test:
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
strategy:
|
|
13
|
+
matrix:
|
|
14
|
+
python-version: ["3.10", "3.11", "3.12", "3.13"]
|
|
15
|
+
|
|
16
|
+
steps:
|
|
17
|
+
- uses: actions/checkout@v4
|
|
18
|
+
|
|
19
|
+
- name: Set up Python ${{ matrix.python-version }}
|
|
20
|
+
uses: actions/setup-python@v5
|
|
21
|
+
with:
|
|
22
|
+
python-version: ${{ matrix.python-version }}
|
|
23
|
+
|
|
24
|
+
- name: Install dependencies
|
|
25
|
+
run: pip install -e ".[dev]"
|
|
26
|
+
|
|
27
|
+
- name: Lint
|
|
28
|
+
run: ruff check .
|
|
29
|
+
|
|
30
|
+
- name: Format check
|
|
31
|
+
run: ruff format --check .
|
|
32
|
+
|
|
33
|
+
- name: Type check
|
|
34
|
+
run: mypy clew/
|
|
35
|
+
|
|
36
|
+
- name: Test
|
|
37
|
+
run: pytest --cov=clew -v --ignore=tests/integration
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
name: Publish to npm
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
release:
|
|
5
|
+
types: [published]
|
|
6
|
+
|
|
7
|
+
jobs:
|
|
8
|
+
publish:
|
|
9
|
+
runs-on: ubuntu-latest
|
|
10
|
+
defaults:
|
|
11
|
+
run:
|
|
12
|
+
working-directory: npm
|
|
13
|
+
steps:
|
|
14
|
+
- uses: actions/checkout@v4
|
|
15
|
+
|
|
16
|
+
- name: Set up Node.js
|
|
17
|
+
uses: actions/setup-node@v4
|
|
18
|
+
with:
|
|
19
|
+
node-version: "20"
|
|
20
|
+
registry-url: "https://registry.npmjs.org"
|
|
21
|
+
|
|
22
|
+
- name: Publish to npm
|
|
23
|
+
run: npm publish --access public
|
|
24
|
+
env:
|
|
25
|
+
NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }}
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
release:
|
|
5
|
+
types: [published]
|
|
6
|
+
|
|
7
|
+
permissions:
|
|
8
|
+
id-token: write
|
|
9
|
+
|
|
10
|
+
jobs:
|
|
11
|
+
build:
|
|
12
|
+
runs-on: ubuntu-latest
|
|
13
|
+
steps:
|
|
14
|
+
- uses: actions/checkout@v4
|
|
15
|
+
|
|
16
|
+
- name: Set up Python
|
|
17
|
+
uses: actions/setup-python@v5
|
|
18
|
+
with:
|
|
19
|
+
python-version: "3.12"
|
|
20
|
+
|
|
21
|
+
- name: Install build tools
|
|
22
|
+
run: pip install build
|
|
23
|
+
|
|
24
|
+
- name: Build package
|
|
25
|
+
run: python -m build
|
|
26
|
+
|
|
27
|
+
- name: Upload artifacts
|
|
28
|
+
uses: actions/upload-artifact@v4
|
|
29
|
+
with:
|
|
30
|
+
name: dist
|
|
31
|
+
path: dist/
|
|
32
|
+
|
|
33
|
+
publish:
|
|
34
|
+
needs: build
|
|
35
|
+
runs-on: ubuntu-latest
|
|
36
|
+
environment: pypi
|
|
37
|
+
permissions:
|
|
38
|
+
id-token: write
|
|
39
|
+
steps:
|
|
40
|
+
- name: Download artifacts
|
|
41
|
+
uses: actions/download-artifact@v4
|
|
42
|
+
with:
|
|
43
|
+
name: dist
|
|
44
|
+
path: dist/
|
|
45
|
+
|
|
46
|
+
- name: Publish to PyPI
|
|
47
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
clewdex-0.1.0/.gitignore
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.pyc
|
|
4
|
+
*.pyo
|
|
5
|
+
*.egg-info/
|
|
6
|
+
dist/
|
|
7
|
+
build/
|
|
8
|
+
.venv/
|
|
9
|
+
venv/
|
|
10
|
+
|
|
11
|
+
# IDE
|
|
12
|
+
.vscode/
|
|
13
|
+
.idea/
|
|
14
|
+
|
|
15
|
+
# Environment
|
|
16
|
+
.env
|
|
17
|
+
|
|
18
|
+
# Cache
|
|
19
|
+
.code-search/
|
|
20
|
+
.mypy_cache/
|
|
21
|
+
.pytest_cache/
|
|
22
|
+
.ruff_cache/
|
|
23
|
+
|
|
24
|
+
# Qdrant local data
|
|
25
|
+
.qdrant/
|
|
26
|
+
|
|
27
|
+
# Worktrees
|
|
28
|
+
.worktrees/
|
|
29
|
+
|
|
30
|
+
# Coverage
|
|
31
|
+
.coverage
|
clewdex-0.1.0/CLAUDE.md
ADDED
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
# clew
|
|
2
|
+
|
|
3
|
+
Semantic code search tool with hybrid retrieval and MCP integration for Claude Code.
|
|
4
|
+
|
|
5
|
+
## Project State
|
|
6
|
+
|
|
7
|
+
**Phase 1 (Core Infrastructure) is complete.** 20 source modules, 10 test files, 80 tests passing at 86% coverage.
|
|
8
|
+
|
|
9
|
+
**Phase 2 (Search Pipeline) is complete.** 27 source modules, 16 test files, 240 tests passing at 91% coverage.
|
|
10
|
+
|
|
11
|
+
**Phase 3 (MCP Integration & CLI) is complete.** 31 source modules, 27 test files, 349 tests passing at 92% coverage.
|
|
12
|
+
|
|
13
|
+
**V1.1 (NL Descriptions) is complete.** 32 source modules, 30 test files, 394 tests passing. LLM-generated descriptions for undocumented code chunks, prepended before embedding to improve semantic search quality.
|
|
14
|
+
|
|
15
|
+
**V1.2 (Structural Layer) is complete.** 39 source modules, 36 test files, 472 tests passing. Code relationship extraction (imports, inherits, calls, decorates, renders, tests, calls_api) with BFS graph traversal via `trace` MCP tool and CLI command.
|
|
16
|
+
|
|
17
|
+
**V1.3 (Compact Responses & Cache Fix) is complete.** 39 source modules, 36 test files, 491 tests passing. Compact MCP responses by default (~20x token reduction), opt-in full content via `detail="full"`. CACHE_DIR now resolves from git root so MCP server and indexer share the same state.db.
|
|
18
|
+
|
|
19
|
+
## Module Inventory
|
|
20
|
+
|
|
21
|
+
```
|
|
22
|
+
clew/
|
|
23
|
+
├── chunker/ # AST parsing (tree-sitter), language strategies, token-aware fallback splitting
|
|
24
|
+
│ ├── parser.py # ASTParser — tree-sitter wrapper, language detection by extension
|
|
25
|
+
│ ├── strategies.py # PythonChunker — extracts functions/classes as CodeEntity dataclasses
|
|
26
|
+
│ ├── fallback.py # Token-recursive + line-based splitting; Chunk dataclass with metadata dict
|
|
27
|
+
│ └── tokenizer.py # tiktoken cl100k_base token counting
|
|
28
|
+
├── clients/ # External service abstractions
|
|
29
|
+
│ ├── base.py # EmbeddingProvider ABC (embed, embed_query, dimensions, model_name)
|
|
30
|
+
│ ├── description.py # DescriptionProvider ABC + AnthropicDescriptionProvider — NL descriptions for code
|
|
31
|
+
│ ├── qdrant.py # QdrantManager — collection CRUD, hybrid query with RRF fusion, delete by file_path
|
|
32
|
+
│ └── voyage.py # VoyageEmbeddingProvider — httpx async client for Voyage AI
|
|
33
|
+
├── indexer/ # File discovery, caching, change detection, indexing pipeline, relationship extraction
|
|
34
|
+
│ ├── cache.py # CacheDB — SQLite via contextmanager, embedding + chunk caches, state tracking, relationship store
|
|
35
|
+
│ ├── change_detector.py # ChangeDetector — unified interface: git-first, file-hash fallback
|
|
36
|
+
│ ├── extractors/ # Pluggable relationship extractors (V1.2)
|
|
37
|
+
│ │ ├── base.py # RelationshipExtractor ABC
|
|
38
|
+
│ │ ├── python.py # Python: imports, inherits, decorates, calls
|
|
39
|
+
│ │ ├── typescript.py # TypeScript/JS: imports, inherits, renders (JSX), calls, calls_api (fetch/axios)
|
|
40
|
+
│ │ ├── tests.py # Test file detection: maps test files to tested modules
|
|
41
|
+
│ │ ├── django_urls.py # Django URL pattern extraction from urls.py
|
|
42
|
+
│ │ └── api_boundary.py # Cross-language API boundary matching (frontend→backend)
|
|
43
|
+
│ ├── file_hash.py # FileHashTracker — SHA256-based change detection (added/modified/unchanged)
|
|
44
|
+
│ ├── git_tracker.py # GitChangeTracker — git diff --name-status change detection (A/M/D/R parsing)
|
|
45
|
+
│ ├── ignore.py # IgnorePatternLoader — .gitignore + .clewignore + defaults
|
|
46
|
+
│ ├── metadata.py # detect_app_name, classify_layer, extract_signature, build_chunk_id
|
|
47
|
+
│ ├── pipeline.py # IndexingPipeline — file -> chunk -> metadata -> embed -> upsert + relationship extraction
|
|
48
|
+
│ └── relationships.py # Relationship dataclass — entity-relationship-entity with confidence
|
|
49
|
+
├── search/ # Search pipeline: enhance -> classify -> hybrid search -> rerank
|
|
50
|
+
│ ├── engine.py # SearchEngine — top-level orchestrator, full pipeline coordination
|
|
51
|
+
│ ├── enhance.py # QueryEnhancer — terminology expansion from YAML (abbreviations + synonyms)
|
|
52
|
+
│ ├── hybrid.py # HybridSearchEngine — dense + BM25 multi-prefetch with structural boosting
|
|
53
|
+
│ ├── intent.py # classify_intent — keyword heuristic intent routing (DEBUG > LOCATION > DOCS > CODE)
|
|
54
|
+
│ ├── models.py # QueryIntent, SearchResult, SearchRequest, SearchResponse dataclasses
|
|
55
|
+
│ ├── filters.py # build_qdrant_filter() — converts SearchRequest.filters to Qdrant Filter objects
|
|
56
|
+
│ ├── rerank.py # RerankProvider — Voyage rerank-2.5 integration with configurable skip conditions
|
|
57
|
+
│ └── tokenize.py # BM25 tokenization — camelCase/snake_case splitting, raw term count sparse vectors
|
|
58
|
+
├── cli.py # Typer app — index, search, status, trace, reembed, serve commands (fully wired)
|
|
59
|
+
├── config.py # Environment class — env var loading with defaults
|
|
60
|
+
├── discovery.py # discover_files() — centralized file discovery using IgnorePatternLoader + SafetyChecker
|
|
61
|
+
├── exceptions.py # Exception hierarchy with user-facing fix suggestions
|
|
62
|
+
├── factory.py # Component factory — centralized wiring, create_components() returns Components dataclass
|
|
63
|
+
├── mcp_server.py # FastMCP server — 5 tools: search, get_context, explain, index_status, trace
|
|
64
|
+
├── models.py # Pydantic v2 models — ProjectConfig, SearchConfig, CollectionConfig, SafetyConfig, etc.
|
|
65
|
+
└── safety.py # SafetyChecker — file size, chunk count, collection limits
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
## Established Patterns
|
|
69
|
+
|
|
70
|
+
- **Data models:** Pydantic v2 `BaseModel` for config/validation, `@dataclass` for internal data (CodeEntity, FileChange, SearchResult)
|
|
71
|
+
- **Provider abstraction:** ABC base classes (e.g., `EmbeddingProvider`) with concrete implementations
|
|
72
|
+
- **SQLite access:** `contextmanager` pattern in `CacheDB._connect()` — no ORM
|
|
73
|
+
- **Config:** `Environment` class reads env vars with sensible defaults; `ProjectConfig` loaded from YAML
|
|
74
|
+
- **Exceptions:** Hierarchy rooted in `ClewError`, each with `fix_hint` for user-facing messages
|
|
75
|
+
- **Async:** Used for external API calls (Voyage AI, Qdrant hybrid search, embedding); sync for file I/O and SQLite
|
|
76
|
+
- **Deterministic IDs:** Qdrant point IDs are UUID5 derived from structured chunk IDs (format: `file_path::entity_type::qualified_name`)
|
|
77
|
+
|
|
78
|
+
## Commands
|
|
79
|
+
|
|
80
|
+
```bash
|
|
81
|
+
# Dev commands
|
|
82
|
+
pytest --cov=clew -v # Run tests with coverage
|
|
83
|
+
pytest -m integration # Run integration tests only
|
|
84
|
+
ruff check . # Lint
|
|
85
|
+
ruff format --check . # Check formatting
|
|
86
|
+
mypy clew/ # Type check
|
|
87
|
+
|
|
88
|
+
# CLI commands
|
|
89
|
+
clew index [PROJECT_ROOT] --full # Full reindex
|
|
90
|
+
clew index [PROJECT_ROOT] # Incremental (change detection)
|
|
91
|
+
clew index [PROJECT_ROOT] --nl-descriptions # Generate NL descriptions (requires ANTHROPIC_API_KEY)
|
|
92
|
+
clew search "query" --raw # Search with JSON output
|
|
93
|
+
clew trace "entity::name" # Trace code relationships (BFS graph traversal)
|
|
94
|
+
clew trace "entity" --direction outbound --depth 3 # Directed trace with depth limit
|
|
95
|
+
clew status # Show Qdrant health + index stats
|
|
96
|
+
clew serve # Start MCP server (stdio transport)
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
## Key Files
|
|
100
|
+
|
|
101
|
+
- `docs/DESIGN.md` — Architecture, chunking strategy, search pipeline, MCP tools, metadata schema
|
|
102
|
+
- `docs/IMPLEMENTATION.md` — Concrete specs: dependencies, SQLite schemas, Pydantic models, tree-sitter setup, phase tasks
|
|
103
|
+
- `docs/adr/` — Architecture Decision Records (Qdrant over Milvus, build vs adopt, features ported from claude-context)
|
|
104
|
+
- `docs/plans/2026-02-06-phase1-core-infrastructure.md` — Phase 1 plan (complete)
|
|
105
|
+
- `docs/plans/2026-02-06-phase2-search-pipeline.md` — Phase 2 plan (complete)
|
|
106
|
+
- `docs/plans/2026-02-09-v1.1-nl-descriptions.md` — V1.1 NL Descriptions plan (complete)
|
|
107
|
+
- `docs/plans/2026-02-09-v1.2-structural-layer.md` — V1.2 Structural Layer plan (complete; note: this file is not included in the published package)
|
|
108
|
+
- `docs/plans/2026-02-10-compact-responses-and-cache-fix.md` — V1.3 Compact Responses & Cache Fix plan (complete)
|
|
109
|
+
- `docs/plans/2026-02-06-three-layer-knowledge-design.md` — Future roadmap (V1.4+)
|
|
110
|
+
|
|
111
|
+
## Tech Stack
|
|
112
|
+
|
|
113
|
+
- Python >=3.10, Qdrant (Docker), Voyage AI voyage-code-3, tree-sitter, typer + rich CLI, Pydantic v2, SQLite for caching
|
|
114
|
+
- Testing: pytest + pytest-asyncio, respx for HTTP mocking
|
|
115
|
+
- Linting: ruff, mypy (strict)
|
|
116
|
+
|
|
117
|
+
## MCP Server Configuration
|
|
118
|
+
|
|
119
|
+
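Add to Claude Code's `.mcp.json` (the parenthetical notes inside the values below are explanatory only — replace each value with the real setting, and omit the optional keys you don't need):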
|
|
120
|
+
```json
|
|
121
|
+
{
|
|
122
|
+
"mcpServers": {
|
|
123
|
+
"clew": {
|
|
124
|
+
"command": "clew",
|
|
125
|
+
"args": ["serve"],
|
|
126
|
+
"env": {
|
|
127
|
+
"VOYAGE_API_KEY": "your-key-here",
|
|
128
|
+
"QDRANT_URL": "http://localhost:6333",
|
|
129
|
+
"ANTHROPIC_API_KEY": "your-key-here (optional, for NL descriptions)",
|
|
130
|
+
"CLEW_CACHE_DIR": "/absolute/path/to/project/.clew (optional, auto-detected from git root)"
|
|
131
|
+
}
|
|
132
|
+
}
|
|
133
|
+
}
|
|
134
|
+
}
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
## MCP Tool Response Modes
|
|
138
|
+
|
|
139
|
+
MCP tools default to **compact** responses to minimize context window usage:
|
|
140
|
+
|
|
141
|
+
- **`search`** — Returns `snippet` (signature + docstring preview) instead of full source. Default `limit=5`. Pass `detail="full"` for complete content.
|
|
142
|
+
- **`explain`** — Same compact/full behavior. Default `limit=5`.
|
|
143
|
+
- **`get_context`** — Returns file content only. Pass `include_related=True` to also get related code chunks (compact format).
|
|
144
|
+
- **`trace`** and **`index_status`** — Already compact, no changes needed.
|
|
145
|
+
|
|
146
|
+
The agent can always use the `Read` tool to fetch specific lines from results that look promising.
|
|
147
|
+
|
|
148
|
+
## Conventions
|
|
149
|
+
|
|
150
|
+
- Use `ruff` for formatting and linting
|
|
151
|
+
- Use `mypy --strict` for type checking
|
|
152
|
+
- Async where interacting with Voyage API or Qdrant
|
|
153
|
+
- All config through Pydantic models validated from YAML
|
|
154
|
+
- Error messages should tell the user how to fix the problem (e.g., "Qdrant not running. Start with: docker compose up -d qdrant")
|
|
155
|
+
- Component wiring through `factory.py` — no global state, one factory call per invocation
|
|
156
|
+
- MCP tools return structured dicts with `error` + `fix` keys on failure
|