clewdex 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (125) hide show
  1. clewdex-0.1.0/.claude/settings.local.json +65 -0
  2. clewdex-0.1.0/.claude/skills/clew-enrich/SKILL.md +166 -0
  3. clewdex-0.1.0/.clew/cache.db +0 -0
  4. clewdex-0.1.0/.clew/state.db +0 -0
  5. clewdex-0.1.0/.env.example +8 -0
  6. clewdex-0.1.0/.github/workflows/ci.yml +37 -0
  7. clewdex-0.1.0/.github/workflows/publish-npm.yml +25 -0
  8. clewdex-0.1.0/.github/workflows/publish-pypi.yml +47 -0
  9. clewdex-0.1.0/.gitignore +31 -0
  10. clewdex-0.1.0/CLAUDE.md +156 -0
  11. clewdex-0.1.0/PKG-INFO +440 -0
  12. clewdex-0.1.0/README.md +388 -0
  13. clewdex-0.1.0/clew/__init__.py +0 -0
  14. clewdex-0.1.0/clew/__main__.py +5 -0
  15. clewdex-0.1.0/clew/chunker/__init__.py +0 -0
  16. clewdex-0.1.0/clew/chunker/fallback.py +205 -0
  17. clewdex-0.1.0/clew/chunker/parser.py +64 -0
  18. clewdex-0.1.0/clew/chunker/strategies.py +132 -0
  19. clewdex-0.1.0/clew/chunker/tokenizer.py +22 -0
  20. clewdex-0.1.0/clew/cli.py +364 -0
  21. clewdex-0.1.0/clew/clients/__init__.py +22 -0
  22. clewdex-0.1.0/clew/clients/base.py +29 -0
  23. clewdex-0.1.0/clew/clients/circuit_breaker.py +56 -0
  24. clewdex-0.1.0/clew/clients/description.py +252 -0
  25. clewdex-0.1.0/clew/clients/qdrant.py +165 -0
  26. clewdex-0.1.0/clew/clients/voyage.py +83 -0
  27. clewdex-0.1.0/clew/config.py +57 -0
  28. clewdex-0.1.0/clew/discovery.py +104 -0
  29. clewdex-0.1.0/clew/exceptions.py +107 -0
  30. clewdex-0.1.0/clew/factory.py +120 -0
  31. clewdex-0.1.0/clew/indexer/__init__.py +0 -0
  32. clewdex-0.1.0/clew/indexer/cache.py +700 -0
  33. clewdex-0.1.0/clew/indexer/change_detector.py +87 -0
  34. clewdex-0.1.0/clew/indexer/extractors/__init__.py +0 -0
  35. clewdex-0.1.0/clew/indexer/extractors/api_boundary.py +115 -0
  36. clewdex-0.1.0/clew/indexer/extractors/base.py +26 -0
  37. clewdex-0.1.0/clew/indexer/extractors/django_models.py +133 -0
  38. clewdex-0.1.0/clew/indexer/extractors/django_urls.py +80 -0
  39. clewdex-0.1.0/clew/indexer/extractors/python.py +373 -0
  40. clewdex-0.1.0/clew/indexer/extractors/tests.py +97 -0
  41. clewdex-0.1.0/clew/indexer/extractors/typescript.py +256 -0
  42. clewdex-0.1.0/clew/indexer/file_hash.py +47 -0
  43. clewdex-0.1.0/clew/indexer/git_tracker.py +225 -0
  44. clewdex-0.1.0/clew/indexer/ignore.py +85 -0
  45. clewdex-0.1.0/clew/indexer/importance.py +33 -0
  46. clewdex-0.1.0/clew/indexer/metadata.py +123 -0
  47. clewdex-0.1.0/clew/indexer/pipeline.py +1039 -0
  48. clewdex-0.1.0/clew/indexer/relationships.py +16 -0
  49. clewdex-0.1.0/clew/mcp_server.py +571 -0
  50. clewdex-0.1.0/clew/models.py +144 -0
  51. clewdex-0.1.0/clew/safety.py +59 -0
  52. clewdex-0.1.0/clew/search/__init__.py +1 -0
  53. clewdex-0.1.0/clew/search/engine.py +183 -0
  54. clewdex-0.1.0/clew/search/enhance.py +76 -0
  55. clewdex-0.1.0/clew/search/filters.py +55 -0
  56. clewdex-0.1.0/clew/search/hybrid.py +185 -0
  57. clewdex-0.1.0/clew/search/intent.py +113 -0
  58. clewdex-0.1.0/clew/search/models.py +59 -0
  59. clewdex-0.1.0/clew/search/rerank.py +95 -0
  60. clewdex-0.1.0/clew/search/tokenize.py +108 -0
  61. clewdex-0.1.0/docker-compose.yml +23 -0
  62. clewdex-0.1.0/docs/CLAUDE_CONTEXT_ANALYSIS.md +250 -0
  63. clewdex-0.1.0/docs/DESIGN.md +1117 -0
  64. clewdex-0.1.0/docs/IMPLEMENTATION.md +1976 -0
  65. clewdex-0.1.0/docs/adr/001-qdrant-as-vector-database.md +333 -0
  66. clewdex-0.1.0/docs/adr/002-build-vs-adopt-claude-context.md +100 -0
  67. clewdex-0.1.0/docs/adr/003-ported-features-from-claude-context.md +134 -0
  68. clewdex-0.1.0/docs/plans/2026-02-06-phase1-core-infrastructure.md +2693 -0
  69. clewdex-0.1.0/docs/plans/2026-02-06-phase2-search-pipeline.md +3210 -0
  70. clewdex-0.1.0/docs/plans/2026-02-06-three-layer-knowledge-design.md +369 -0
  71. clewdex-0.1.0/docs/plans/2026-02-09-v1.1-nl-descriptions.md +1180 -0
  72. clewdex-0.1.0/docs/plans/2026-02-10-compact-responses-and-cache-fix.md +709 -0
  73. clewdex-0.1.0/docs/plans/2026-02-10-fix-evaluation-shortcomings.md +312 -0
  74. clewdex-0.1.0/docs/plans/2026-02-12-index-enrichment-design.md +431 -0
  75. clewdex-0.1.0/docs/plans/2026-02-12-plan-review-findings.md +93 -0
  76. clewdex-0.1.0/docs/plans/2026-02-12-v2-consolidated-implementation.md +863 -0
  77. clewdex-0.1.0/homebrew/clewdex.rb +39 -0
  78. clewdex-0.1.0/npm/README.md +28 -0
  79. clewdex-0.1.0/npm/bin/clewdex.js +37 -0
  80. clewdex-0.1.0/npm/package.json +31 -0
  81. clewdex-0.1.0/pyproject.toml +106 -0
  82. clewdex-0.1.0/tests/__init__.py +0 -0
  83. clewdex-0.1.0/tests/conftest.py +42 -0
  84. clewdex-0.1.0/tests/fixtures/python/sample_models.py +28 -0
  85. clewdex-0.1.0/tests/integration/__init__.py +0 -0
  86. clewdex-0.1.0/tests/integration/conftest.py +91 -0
  87. clewdex-0.1.0/tests/integration/test_end_to_end.py +308 -0
  88. clewdex-0.1.0/tests/unit/__init__.py +0 -0
  89. clewdex-0.1.0/tests/unit/test_api_boundary.py +114 -0
  90. clewdex-0.1.0/tests/unit/test_cache.py +552 -0
  91. clewdex-0.1.0/tests/unit/test_change_detector.py +178 -0
  92. clewdex-0.1.0/tests/unit/test_chunker.py +93 -0
  93. clewdex-0.1.0/tests/unit/test_cli.py +345 -0
  94. clewdex-0.1.0/tests/unit/test_code_tokenize.py +81 -0
  95. clewdex-0.1.0/tests/unit/test_config.py +97 -0
  96. clewdex-0.1.0/tests/unit/test_description_provider.py +319 -0
  97. clewdex-0.1.0/tests/unit/test_discovery.py +198 -0
  98. clewdex-0.1.0/tests/unit/test_django_model_extractor.py +127 -0
  99. clewdex-0.1.0/tests/unit/test_django_url_extractor.py +60 -0
  100. clewdex-0.1.0/tests/unit/test_embedding_provider.py +56 -0
  101. clewdex-0.1.0/tests/unit/test_enhance.py +92 -0
  102. clewdex-0.1.0/tests/unit/test_factory.py +269 -0
  103. clewdex-0.1.0/tests/unit/test_fallback.py +110 -0
  104. clewdex-0.1.0/tests/unit/test_file_hash.py +76 -0
  105. clewdex-0.1.0/tests/unit/test_filters.py +69 -0
  106. clewdex-0.1.0/tests/unit/test_git_tracker.py +78 -0
  107. clewdex-0.1.0/tests/unit/test_hybrid_search.py +380 -0
  108. clewdex-0.1.0/tests/unit/test_ignore.py +87 -0
  109. clewdex-0.1.0/tests/unit/test_importance.py +69 -0
  110. clewdex-0.1.0/tests/unit/test_indexing_pipeline.py +904 -0
  111. clewdex-0.1.0/tests/unit/test_intent.py +60 -0
  112. clewdex-0.1.0/tests/unit/test_mcp_server.py +995 -0
  113. clewdex-0.1.0/tests/unit/test_metadata.py +122 -0
  114. clewdex-0.1.0/tests/unit/test_models.py +51 -0
  115. clewdex-0.1.0/tests/unit/test_python_extractor.py +226 -0
  116. clewdex-0.1.0/tests/unit/test_qdrant_manager.py +136 -0
  117. clewdex-0.1.0/tests/unit/test_relationships.py +30 -0
  118. clewdex-0.1.0/tests/unit/test_rerank.py +146 -0
  119. clewdex-0.1.0/tests/unit/test_safety.py +75 -0
  120. clewdex-0.1.0/tests/unit/test_search_engine.py +368 -0
  121. clewdex-0.1.0/tests/unit/test_search_models.py +105 -0
  122. clewdex-0.1.0/tests/unit/test_strategies.py +116 -0
  123. clewdex-0.1.0/tests/unit/test_test_extractor.py +60 -0
  124. clewdex-0.1.0/tests/unit/test_tokenizer.py +35 -0
  125. clewdex-0.1.0/tests/unit/test_typescript_extractor.py +178 -0
@@ -0,0 +1,65 @@
1
+ {
2
+ "permissions": {
3
+ "allow": [
4
+ "Bash(python -m pytest:*)",
5
+ "Bash(python3 -m pytest:*)",
6
+ "Bash(pytest:*)",
7
+ "Bash(uv run pytest:*)",
8
+ "Bash(python3 -m ruff check:*)",
9
+ "Bash(python3 -m ruff format:*)",
10
+ "Bash(ruff check:*)",
11
+ "Bash(ruff format:*)",
12
+ "Bash(mypy:*)",
13
+ "Bash(git branch:*)",
14
+ "Bash(python3:*)",
15
+ "Bash(git filter-branch:*)",
16
+ "Bash(FILTER_BRANCH_SQUELCH_WARNING=1 git filter-branch:*)",
17
+ "Bash(grep:*)",
18
+ "Bash(find:*)",
19
+ "Bash(git fsck:*)",
20
+ "WebSearch",
21
+ "Bash(gh search:*)",
22
+ "WebFetch(domain:pypi.org)",
23
+ "WebFetch(domain:www.npmjs.com)",
24
+ "WebFetch(domain:github.com)",
25
+ "WebFetch(domain:formulae.brew.sh)",
26
+ "Bash(pip show:*)",
27
+ "Bash(pip uninstall:*)",
28
+ "Bash(git mv:*)",
29
+ "Bash(pip install:*)",
30
+ "Bash(clew --help:*)",
31
+ "mcp__plugin_episodic-memory_episodic-memory__search",
32
+ "mcp__plugin_episodic-memory_episodic-memory__read",
33
+ "mcp__plugin_context7_context7__resolve-library-id",
34
+ "mcp__plugin_context7_context7__query-docs",
35
+ "WebFetch(domain:weaviate.io)",
36
+ "WebFetch(domain:sourcegraph.com)",
37
+ "WebFetch(domain:www.greptile.com)",
38
+ "WebFetch(domain:towardsdatascience.com)",
39
+ "WebFetch(domain:read.engineerscodex.com)",
40
+ "WebFetch(domain:blog.voyageai.com)",
41
+ "WebFetch(domain:www.infoq.com)",
42
+ "WebFetch(domain:qdrant.tech)",
43
+ "WebFetch(domain:arxiv.org)",
44
+ "WebFetch(domain:news.ycombinator.com)",
45
+ "WebFetch(domain:medium.com)",
46
+ "WebFetch(domain:registry.npmjs.org)",
47
+ "WebFetch(domain:crates.io)",
48
+ "Bash(curl:*)",
49
+ "Bash(python -m build:*)",
50
+ "Bash(node:*)",
51
+ "Bash(clew index:*)",
52
+ "Bash(QDRANT_URL=http://localhost:6333 clew index:*)",
53
+ "Bash(QDRANT_URL=http://localhost:6333 clew search:*)",
54
+ "Bash(QDRANT_URL=http://localhost:6333 clew trace:*)",
55
+ "Bash(clew status:*)",
56
+ "Bash(QDRANT_URL=http://localhost:6333 clew status:*)",
57
+ "Bash(clew trace:*)",
58
+ "Bash(clew search:*)",
59
+ "Bash(sqlite3:*)",
60
+ "Bash(python -m clew.cli:*)",
61
+ "Bash(clew reembed:*)",
62
+ "Bash(QDRANT_URL=http://localhost:6333 clew reembed:*)"
63
+ ]
64
+ }
65
+ }
@@ -0,0 +1,166 @@
1
+ # /clew-enrich
2
+
3
+ Enrich indexed code chunks with LLM-generated descriptions and keywords to improve semantic search quality.
4
+
5
+ ## When to Use
6
+
7
+ Run `/clew-enrich` after `clew index` has completed. Enrichment adds natural language descriptions and search keywords to each code chunk, enabling better semantic search results for natural language queries like "how does order processing work".
8
+
9
+ ## Prerequisites
10
+
11
+ - The project must already be indexed (`clew index [PROJECT_ROOT]`)
12
+ - The `.clew/` directory must exist in the project root (created by `clew index`)
13
+
14
+ ## What It Does
15
+
16
+ 1. Reads all chunk IDs from the SQLite cache (`cache.db`)
17
+ 2. Identifies chunks that have NOT yet been enriched
18
+ 3. For each unenriched chunk, reads the source code from disk
19
+ 4. Generates a description (2-3 sentences) and keywords (8-15 terms) per chunk
20
+ 5. Writes enrichment data back to the SQLite cache
21
+ 6. Triggers `clew reembed` to re-embed all enriched chunks with the new content
22
+
23
+ ## Step-by-Step Instructions
24
+
25
+ ### Step 1: Locate the Cache Directory
26
+
27
+ The cache directory is at `{project_root}/.clew/`. The project root is typically the git root. If `CLEW_CACHE_DIR` is set, use that path instead.
28
+
29
+ ```bash
30
+ # Find the git root
31
+ git rev-parse --show-toplevel
32
+ ```
33
+
34
+ The cache database file is `{cache_dir}/cache.db`.
35
+
36
+ ### Step 2: Read Unenriched Chunks
37
+
38
+ Run this Python script to get the list of unenriched chunk IDs and their file paths:
39
+
40
+ ```python
41
+ import sqlite3
42
+ import json
43
+
44
+ cache_dir = "{PROJECT_ROOT}/.clew" # Replace with actual path
45
+ conn = sqlite3.connect(f"{cache_dir}/cache.db")
46
+ conn.row_factory = sqlite3.Row
47
+
48
+ # Get all chunk_ids from chunk_cache
49
+ rows = conn.execute("SELECT file_path, chunk_ids FROM chunk_cache").fetchall()
50
+ all_chunks = []
51
+ for row in rows:
52
+ file_path = row["file_path"]
53
+ chunk_ids = json.loads(row["chunk_ids"])
54
+ for cid in chunk_ids:
55
+ all_chunks.append((cid, file_path))
56
+
57
+ # Get already-enriched chunk_ids
58
+ enriched = set(
59
+ r[0] for r in conn.execute("SELECT chunk_id FROM enrichment_cache").fetchall()
60
+ )
61
+ conn.close()
62
+
63
+ # Filter to unenriched
64
+ unenriched = [(cid, fp) for cid, fp in all_chunks if cid not in enriched]
65
+ print(f"Total chunks: {len(all_chunks)}")
66
+ print(f"Already enriched: {len(enriched)}")
67
+ print(f"Unenriched: {len(unenriched)}")
68
+ ```
69
+
70
+ ### Step 3: Enrich Each Chunk
71
+
72
+ For each unenriched chunk, you need to:
73
+
74
+ 1. **Parse the chunk_id** to extract entity info. The format is: `file_path::entity_type::qualified_name` (for named entities) or `file_path::toplevel::sha256hash` (for anonymous/toplevel chunks).
75
+
76
+ 2. **Read the source file** from disk using the `file_path` from the chunk_id (the part before the first `::`).
77
+
78
+ 3. **Generate description and keywords**. For each chunk, produce output in this exact format:
79
+ ```
80
+ Description: 2-3 sentences explaining what the code does, why it exists, and what domain concept it represents.
81
+ Keywords: 8-15 space-separated terms a developer might search for when looking for this code.
82
+ ```
83
+
84
+ When generating descriptions and keywords, consider:
85
+ - What does this code do at a high level?
86
+ - What domain concepts does it relate to?
87
+ - What would someone search for to find this code?
88
+ - Include both technical terms and domain-specific vocabulary
89
+
90
+ 4. **Write enrichment to cache** using this Python script (batch mode):
91
+
92
+ ```python
93
+ import sqlite3
94
+ import time
95
+
96
+ cache_dir = "{PROJECT_ROOT}/.clew" # Replace with actual path
97
+ conn = sqlite3.connect(f"{cache_dir}/cache.db")
98
+
99
+ enrichments = [
100
+ # ("chunk_id", "description text", "keyword1 keyword2 keyword3 ..."),
101
+ ]
102
+
103
+ for chunk_id, description, keywords in enrichments:
104
+ conn.execute(
105
+ "INSERT OR REPLACE INTO enrichment_cache "
106
+ "(chunk_id, description, keywords, enriched_at) "
107
+ "VALUES (?, ?, ?, ?)",
108
+ (chunk_id, description, keywords, time.time()),
109
+ )
110
+
111
+ conn.commit()
112
+ conn.close()
113
+ print(f"Wrote {len(enrichments)} enrichments to cache")
114
+ ```
115
+
116
+ ### Step 4: Process in Batches
117
+
118
+ Process chunks in batches of 20-50 to manage context window size:
119
+
120
+ 1. Read a batch of source files using the Read tool
121
+ 2. Generate descriptions and keywords for all chunks in the batch
122
+ 3. Write the batch to SQLite
123
+ 4. Repeat until all chunks are processed
124
+
125
+ Skip chunks where:
126
+ - The file no longer exists on disk
127
+ - The chunk_id contains `::toplevel::` (anonymous chunks have less value to enrich)
128
+ - The chunk_id contains `::file_summary::` (already synthetic)
129
+
130
+ ### Step 5: Re-embed
131
+
132
+ After all enrichments are written to the cache, run:
133
+
134
+ ```bash
135
+ clew reembed {PROJECT_ROOT}
136
+ ```
137
+
138
+ This reads the enrichment data from cache and re-embeds all enriched chunks with the full content (description + keywords + code) into Qdrant's named vectors.
139
+
140
+ ## Enrichment Cache Schema
141
+
142
+ The enrichment data is stored in `{cache_dir}/cache.db` in the `enrichment_cache` table:
143
+
144
+ ```sql
145
+ CREATE TABLE IF NOT EXISTS enrichment_cache (
146
+ chunk_id TEXT PRIMARY KEY,
147
+ description TEXT,
148
+ keywords TEXT,
149
+ enriched_at REAL
150
+ );
151
+ ```
152
+
153
+ ## Example Output
154
+
155
+ For a chunk with ID `backend/ecomm/utils.py::function::EcommUtils._process_shopify_order_impl`:
156
+
157
+ ```
158
+ Description: Processes incoming Shopify orders by validating order data, creating internal Order records, and triggering fulfillment workflows. This is the core order ingestion handler called by the Shopify webhook receiver.
159
+ Keywords: shopify order processing webhook fulfillment ecommerce cart purchase payment order_ingestion
160
+ ```
161
+
162
+ ## Notes
163
+
164
+ - Enrichment is idempotent: running `/clew-enrich` again skips already-enriched chunks
165
+ - To re-enrich everything, first clear the `enrichment_cache` table by running `DELETE FROM enrichment_cache;` against `cache.db`, then run `/clew-enrich` again
166
+ - The `clew reembed` step is required after writing enrichments; without it, the search index won't reflect the new descriptions
Binary file
Binary file
@@ -0,0 +1,8 @@
1
+ # Required
2
+ VOYAGE_API_KEY=pa-xxxxxxxxxxxxxxxxxxxx
3
+
4
+ # Optional (defaults shown)
5
+ QDRANT_URL=http://localhost:6333
6
+ QDRANT_API_KEY=
7
+ CLEW_CACHE_DIR=.clew
8
+ CLEW_LOG_LEVEL=INFO
@@ -0,0 +1,37 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+ branches: [main]
8
+
9
+ jobs:
10
+ test:
11
+ runs-on: ubuntu-latest
12
+ strategy:
13
+ matrix:
14
+ python-version: ["3.10", "3.11", "3.12", "3.13"]
15
+
16
+ steps:
17
+ - uses: actions/checkout@v4
18
+
19
+ - name: Set up Python ${{ matrix.python-version }}
20
+ uses: actions/setup-python@v5
21
+ with:
22
+ python-version: ${{ matrix.python-version }}
23
+
24
+ - name: Install dependencies
25
+ run: pip install -e ".[dev]"
26
+
27
+ - name: Lint
28
+ run: ruff check .
29
+
30
+ - name: Format check
31
+ run: ruff format --check .
32
+
33
+ - name: Type check
34
+ run: mypy clew/
35
+
36
+ - name: Test
37
+ run: pytest --cov=clew -v --ignore=tests/integration
@@ -0,0 +1,25 @@
1
+ name: Publish to npm
2
+
3
+ on:
4
+ release:
5
+ types: [published]
6
+
7
+ jobs:
8
+ publish:
9
+ runs-on: ubuntu-latest
10
+ defaults:
11
+ run:
12
+ working-directory: npm
13
+ steps:
14
+ - uses: actions/checkout@v4
15
+
16
+ - name: Set up Node.js
17
+ uses: actions/setup-node@v4
18
+ with:
19
+ node-version: "20"
20
+ registry-url: "https://registry.npmjs.org"
21
+
22
+ - name: Publish to npm
23
+ run: npm publish --access public
24
+ env:
25
+ NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }}
@@ -0,0 +1,47 @@
1
+ name: Publish to PyPI
2
+
3
+ on:
4
+ release:
5
+ types: [published]
6
+
7
+ permissions:
8
+ id-token: write
9
+
10
+ jobs:
11
+ build:
12
+ runs-on: ubuntu-latest
13
+ steps:
14
+ - uses: actions/checkout@v4
15
+
16
+ - name: Set up Python
17
+ uses: actions/setup-python@v5
18
+ with:
19
+ python-version: "3.12"
20
+
21
+ - name: Install build tools
22
+ run: pip install build
23
+
24
+ - name: Build package
25
+ run: python -m build
26
+
27
+ - name: Upload artifacts
28
+ uses: actions/upload-artifact@v4
29
+ with:
30
+ name: dist
31
+ path: dist/
32
+
33
+ publish:
34
+ needs: build
35
+ runs-on: ubuntu-latest
36
+ environment: pypi
37
+ permissions:
38
+ id-token: write
39
+ steps:
40
+ - name: Download artifacts
41
+ uses: actions/download-artifact@v4
42
+ with:
43
+ name: dist
44
+ path: dist/
45
+
46
+ - name: Publish to PyPI
47
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,31 @@
1
+ # Python
2
+ __pycache__/
3
+ *.pyc
4
+ *.pyo
5
+ *.egg-info/
6
+ dist/
7
+ build/
8
+ .venv/
9
+ venv/
10
+
11
+ # IDE
12
+ .vscode/
13
+ .idea/
14
+
15
+ # Environment
16
+ .env
17
+
18
+ # Cache
19
+ .code-search/
20
+ .mypy_cache/
21
+ .pytest_cache/
22
+ .ruff_cache/
23
+
24
+ # Qdrant local data
25
+ .qdrant/
26
+
27
+ # Worktrees
28
+ .worktrees/
29
+
30
+ # Coverage
31
+ .coverage
@@ -0,0 +1,156 @@
1
+ # clew
2
+
3
+ Semantic code search tool with hybrid retrieval and MCP integration for Claude Code.
4
+
5
+ ## Project State
6
+
7
+ **Phase 1 (Core Infrastructure) is complete.** 20 source modules, 10 test files, 80 tests passing at 86% coverage.
8
+
9
+ **Phase 2 (Search Pipeline) is complete.** 27 source modules, 16 test files, 240 tests passing at 91% coverage.
10
+
11
+ **Phase 3 (MCP Integration & CLI) is complete.** 31 source modules, 27 test files, 349 tests passing at 92% coverage.
12
+
13
+ **V1.1 (NL Descriptions) is complete.** 32 source modules, 30 test files, 394 tests passing. LLM-generated descriptions for undocumented code chunks, prepended before embedding to improve semantic search quality.
14
+
15
+ **V1.2 (Structural Layer) is complete.** 39 source modules, 36 test files, 472 tests passing. Code relationship extraction (imports, inherits, calls, decorates, renders, tests, calls_api) with BFS graph traversal via `trace` MCP tool and CLI command.
16
+
17
+ **V1.3 (Compact Responses & Cache Fix) is complete.** 39 source modules, 36 test files, 491 tests passing. Compact MCP responses by default (~20x token reduction), opt-in full content via `detail="full"`. CACHE_DIR now resolves from git root so MCP server and indexer share the same state.db.
18
+
19
+ ## Module Inventory
20
+
21
+ ```
22
+ clew/
23
+ ├── chunker/ # AST parsing (tree-sitter), language strategies, token-aware fallback splitting
24
+ │ ├── parser.py # ASTParser — tree-sitter wrapper, language detection by extension
25
+ │ ├── strategies.py # PythonChunker — extracts functions/classes as CodeEntity dataclasses
26
+ │ ├── fallback.py # Token-recursive + line-based splitting; Chunk dataclass with metadata dict
27
+ │ └── tokenizer.py # tiktoken cl100k_base token counting
28
+ ├── clients/ # External service abstractions
29
+ │ ├── base.py # EmbeddingProvider ABC (embed, embed_query, dimensions, model_name)
30
+ │ ├── description.py # DescriptionProvider ABC + AnthropicDescriptionProvider — NL descriptions for code
31
+ │ ├── qdrant.py # QdrantManager — collection CRUD, hybrid query with RRF fusion, delete by file_path
32
+ │ └── voyage.py # VoyageEmbeddingProvider — httpx async client for Voyage AI
33
+ ├── indexer/ # File discovery, caching, change detection, indexing pipeline, relationship extraction
34
+ │ ├── cache.py # CacheDB — SQLite via contextmanager, embedding + chunk caches, state tracking, relationship store
35
+ │ ├── change_detector.py # ChangeDetector — unified interface: git-first, file-hash fallback
36
+ │ ├── extractors/ # Pluggable relationship extractors (V1.2)
37
+ │ │ ├── base.py # RelationshipExtractor ABC
38
+ │ │ ├── python.py # Python: imports, inherits, decorates, calls
39
+ │ │ ├── typescript.py # TypeScript/JS: imports, inherits, renders (JSX), calls, calls_api (fetch/axios)
40
+ │ │ ├── tests.py # Test file detection: maps test files to tested modules
41
+ │ │ ├── django_urls.py # Django URL pattern extraction from urls.py
42
+ │ │ └── api_boundary.py # Cross-language API boundary matching (frontend→backend)
43
+ │ ├── file_hash.py # FileHashTracker — SHA256-based change detection (added/modified/unchanged)
44
+ │ ├── git_tracker.py # GitChangeTracker — git diff --name-status change detection (A/M/D/R parsing)
45
+ │ ├── ignore.py # IgnorePatternLoader — .gitignore + .clewignore + defaults
46
+ │ ├── metadata.py # detect_app_name, classify_layer, extract_signature, build_chunk_id
47
+ │ ├── pipeline.py # IndexingPipeline — file -> chunk -> metadata -> embed -> upsert + relationship extraction
48
+ │ └── relationships.py # Relationship dataclass — entity-relationship-entity with confidence
49
+ ├── search/ # Search pipeline: enhance -> classify -> hybrid search -> rerank
50
+ │ ├── engine.py # SearchEngine — top-level orchestrator, full pipeline coordination
51
+ │ ├── enhance.py # QueryEnhancer — terminology expansion from YAML (abbreviations + synonyms)
52
+ │ ├── hybrid.py # HybridSearchEngine — dense + BM25 multi-prefetch with structural boosting
53
+ │ ├── intent.py # classify_intent — keyword heuristic intent routing (DEBUG > LOCATION > DOCS > CODE)
54
+ │ ├── models.py # QueryIntent, SearchResult, SearchRequest, SearchResponse dataclasses
55
+ │ ├── filters.py # build_qdrant_filter() — converts SearchRequest.filters to Qdrant Filter objects
56
+ │ ├── rerank.py # RerankProvider — Voyage rerank-2.5 integration with configurable skip conditions
57
+ │ └── tokenize.py # BM25 tokenization — camelCase/snake_case splitting, raw term count sparse vectors
58
+ ├── cli.py # Typer app — index, search, status, trace, reembed, serve commands (fully wired)
59
+ ├── config.py # Environment class — env var loading with defaults
60
+ ├── discovery.py # discover_files() — centralized file discovery using IgnorePatternLoader + SafetyChecker
61
+ ├── exceptions.py # Exception hierarchy with user-facing fix suggestions
62
+ ├── factory.py # Component factory — centralized wiring, create_components() returns Components dataclass
63
+ ├── mcp_server.py # FastMCP server — 5 tools: search, get_context, explain, index_status, trace
64
+ ├── models.py # Pydantic v2 models — ProjectConfig, SearchConfig, CollectionConfig, SafetyConfig, etc.
65
+ └── safety.py # SafetyChecker — file size, chunk count, collection limits
66
+ ```
67
+
68
+ ## Established Patterns
69
+
70
+ - **Data models:** Pydantic v2 `BaseModel` for config/validation, `@dataclass` for internal data (CodeEntity, FileChange, SearchResult)
71
+ - **Provider abstraction:** ABC base classes (e.g., `EmbeddingProvider`) with concrete implementations
72
+ - **SQLite access:** `contextmanager` pattern in `CacheDB._connect()` — no ORM
73
+ - **Config:** `Environment` class reads env vars with sensible defaults; `ProjectConfig` loaded from YAML
74
+ - **Exceptions:** Hierarchy rooted in `ClewError`, each with `fix_hint` for user-facing messages
75
+ - **Async:** Used for external API calls (Voyage AI, Qdrant hybrid search, embedding); sync for file I/O and SQLite
76
+ - **Deterministic IDs:** Qdrant point IDs are UUID5 derived from structured chunk IDs (format: `file_path::entity_type::qualified_name`)
77
+
78
+ ## Commands
79
+
80
+ ```bash
81
+ # Dev commands
82
+ pytest --cov=clew -v # Run tests with coverage
83
+ pytest -m integration # Run integration tests only
84
+ ruff check . # Lint
85
+ ruff format --check . # Check formatting
86
+ mypy clew/ # Type check
87
+
88
+ # CLI commands
89
+ clew index [PROJECT_ROOT] --full # Full reindex
90
+ clew index [PROJECT_ROOT] # Incremental (change detection)
91
+ clew index [PROJECT_ROOT] --nl-descriptions # Generate NL descriptions (requires ANTHROPIC_API_KEY)
92
+ clew search "query" --raw # Search with JSON output
93
+ clew trace "entity::name" # Trace code relationships (BFS graph traversal)
94
+ clew trace "entity" --direction outbound --depth 3 # Directed trace with depth limit
95
+ clew status # Show Qdrant health + index stats
96
+ clew serve # Start MCP server (stdio transport)
97
+ ```
98
+
99
+ ## Key Files
100
+
101
+ - `docs/DESIGN.md` — Architecture, chunking strategy, search pipeline, MCP tools, metadata schema
102
+ - `docs/IMPLEMENTATION.md` — Concrete specs: dependencies, SQLite schemas, Pydantic models, tree-sitter setup, phase tasks
103
+ - `docs/adr/` — Architecture Decision Records (Qdrant over Milvus, build vs adopt claude-context, features ported from claude-context)
104
+ - `docs/plans/2026-02-06-phase1-core-infrastructure.md` — Phase 1 plan (complete)
105
+ - `docs/plans/2026-02-06-phase2-search-pipeline.md` — Phase 2 plan (complete)
106
+ - `docs/plans/2026-02-09-v1.1-nl-descriptions.md` — V1.1 NL Descriptions plan (complete)
107
+ - `docs/plans/2026-02-09-v1.2-structural-layer.md` — V1.2 Structural Layer plan (complete)
108
+ - `docs/plans/2026-02-10-compact-responses-and-cache-fix.md` — V1.3 Compact Responses & Cache Fix plan (complete)
109
+ - `docs/plans/2026-02-06-three-layer-knowledge-design.md` — Future roadmap (V1.4+)
110
+
111
+ ## Tech Stack
112
+
113
+ - Python >=3.10, Qdrant (Docker), Voyage AI voyage-code-3, tree-sitter, typer + rich CLI, Pydantic v2, SQLite for caching
114
+ - Testing: pytest + pytest-asyncio, respx for HTTP mocking
115
+ - Linting: ruff, mypy (strict)
116
+
117
+ ## MCP Server Configuration
118
+
119
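+ Add to Claude Code's `.mcp.json` (entries marked "optional" may be omitted; if you keep them, replace the whole value — the parenthetical notes are annotations, not part of a valid key or path):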
120
+ ```json
121
+ {
122
+ "mcpServers": {
123
+ "clew": {
124
+ "command": "clew",
125
+ "args": ["serve"],
126
+ "env": {
127
+ "VOYAGE_API_KEY": "your-key-here",
128
+ "QDRANT_URL": "http://localhost:6333",
129
+ "ANTHROPIC_API_KEY": "your-key-here (optional, for NL descriptions)",
130
+ "CLEW_CACHE_DIR": "/absolute/path/to/project/.clew (optional, auto-detected from git root)"
131
+ }
132
+ }
133
+ }
134
+ }
135
+ ```
136
+
137
+ ## MCP Tool Response Modes
138
+
139
+ MCP tools default to **compact** responses to minimize context window usage:
140
+
141
+ - **`search`** — Returns `snippet` (signature + docstring preview) instead of full source. Default `limit=5`. Pass `detail="full"` for complete content.
142
+ - **`explain`** — Same compact/full behavior. Default `limit=5`.
143
+ - **`get_context`** — Returns file content only. Pass `include_related=True` to also get related code chunks (compact format).
144
+ - **`trace`** and **`index_status`** — Already compact, no changes needed.
145
+
146
+ The agent can always use the `Read` tool to fetch specific lines from results that look promising.
147
+
148
+ ## Conventions
149
+
150
+ - Use `ruff` for formatting and linting
151
+ - Use `mypy --strict` for type checking
152
+ - Async where interacting with Voyage API or Qdrant
153
+ - All config through Pydantic models validated from YAML
154
+ - Error messages should tell the user how to fix the problem (e.g., "Qdrant not running. Start with: docker compose up -d qdrant")
155
+ - Component wiring through `factory.py` — no global state, one factory call per invocation
156
+ - MCP tools return structured dicts with `error` + `fix` keys on failure