code-finder 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. claude_context/__init__.py +33 -0
  2. claude_context/agentic_integration.py +309 -0
  3. claude_context/ast_chunker.py +646 -0
  4. claude_context/config.py +239 -0
  5. claude_context/context_manager.py +627 -0
  6. claude_context/embeddings.py +307 -0
  7. claude_context/embeddings_interface.py +226 -0
  8. claude_context/enhanced_ast_chunker.py +1129 -0
  9. claude_context/explorer.py +951 -0
  10. claude_context/explorer_with_context.py +1008 -0
  11. claude_context/indexer.py +893 -0
  12. claude_context/markdown_chunker.py +421 -0
  13. claude_context/mode_handler.py +1774 -0
  14. claude_context/query_metrics.py +164 -0
  15. claude_context/question_generator.py +800 -0
  16. claude_context/readme_extractor.py +485 -0
  17. claude_context/repository_adapter.py +399 -0
  18. claude_context/search.py +493 -0
  19. claude_context/skills/__init__.py +11 -0
  20. claude_context/skills/_cli_common.py +74 -0
  21. claude_context/skills/_index_manager.py +98 -0
  22. claude_context/skills/api_surface.py +219 -0
  23. claude_context/skills/evidence_retrieval.py +151 -0
  24. claude_context/skills/grounded_review.py +212 -0
  25. claude_context/synthesis/__init__.py +8 -0
  26. claude_context/synthesis/editor_agent.py +391 -0
  27. claude_context/synthesis/llm_synthesizer.py +153 -0
  28. claude_context/synthesis/logic_explainer.py +235 -0
  29. claude_context/synthesis/multi_review_pipeline.py +717 -0
  30. claude_context/synthesis/prompt_builder.py +439 -0
  31. claude_context/synthesis/providers.py +115 -0
  32. claude_context/synthesis/validators.py +458 -0
  33. code_finder-0.1.0.dist-info/METADATA +823 -0
  34. code_finder-0.1.0.dist-info/RECORD +37 -0
  35. code_finder-0.1.0.dist-info/WHEEL +5 -0
  36. code_finder-0.1.0.dist-info/entry_points.txt +4 -0
  37. code_finder-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,307 @@
1
+ """
2
+ Local embeddings for Claude Context - No API keys needed
3
+
4
+ This module provides local text embeddings using sentence-transformers,
5
+ following our fail-fast principles with clear error reporting.
6
+ """
7
+
8
+ import logging
9
+ from typing import List, Optional, Union
10
+ import numpy as np
11
+
12
+ # Import sentence-transformers - fail fast if not installed
13
+ try:
14
+ from sentence_transformers import SentenceTransformer
15
+ except ImportError as e:
16
+ raise ImportError(
17
+ "sentence-transformers is required for embeddings. "
18
+ "Install with: pip install sentence-transformers"
19
+ ) from e
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+
24
class LocalEmbeddings:
    """
    Local embeddings using sentence-transformers.

    No API keys needed, runs entirely on your machine.
    Fail-fast approach: errors are explicit, not hidden.

    Instance attributes set by ``__init__``:
        model_name: Name that was passed to ``SentenceTransformer``.
        model: The loaded ``SentenceTransformer`` object.
        dimension: Embedding size reported by the loaded model.
    """

    # Recommended models with trade-offs clearly documented
    MODELS = {
        "all-MiniLM-L6-v2": {
            "dimension": 384,
            "description": "Fast and good quality, best for development",
            "size_mb": 80,
            "speed": "fast"
        },
        "all-mpnet-base-v2": {
            "dimension": 768,
            "description": "Higher quality, slower, good for production",
            "size_mb": 420,
            "speed": "medium"
        },
        "all-distilroberta-v1": {
            "dimension": 768,
            "description": "Good balance of speed and quality",
            "size_mb": 290,
            "speed": "medium"
        }
    }

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """
        Initialize the embedding model.

        Args:
            model_name: Name of the sentence-transformers model to use

        Raises:
            ValueError: If model_name is empty
            RuntimeError: If model fails to load
        """
        if not model_name:
            raise ValueError("model_name cannot be empty")

        self.model_name = model_name
        logger.info(f"Loading embedding model: {self.model_name}")

        # Log the documented trade-offs when this is one of the known models
        model_info = self.MODELS.get(self.model_name, {})
        if model_info:
            logger.info(f" Description: {model_info.get('description')}")
            logger.info(f" Dimension: {model_info.get('dimension')}")
            logger.info(f" Speed: {model_info.get('speed')}")

        # Load model - fail fast if it doesn't work
        try:
            self.model = SentenceTransformer(model_name)
            self.dimension = self.model.get_sentence_embedding_dimension()
        except Exception as e:
            # Don't hide the error - make it clear what failed
            raise RuntimeError(
                f"Failed to load embedding model '{model_name}': {e}\n"
                f"Try one of the tested models: {list(self.MODELS.keys())}"
            ) from e

        logger.info(f"✅ Model loaded successfully (dimension: {self.dimension})")

    def embed_texts(self, texts: List[str], batch_size: int = 32) -> np.ndarray:
        """
        Embed a list of texts into vectors.

        Args:
            texts: List of texts to embed
            batch_size: Batch size for processing (for memory efficiency)

        Returns:
            numpy array of embeddings, shape (n_texts, dimension)

        Raises:
            ValueError: If texts is empty or contains only empty strings
            RuntimeError: If embedding fails or the model returns a different
                number of vectors than texts supplied
        """
        # Validate input - fail fast on bad data
        if not texts:
            raise ValueError("Cannot embed empty text list")

        if not any(texts):  # All texts are empty
            raise ValueError("Cannot embed list of empty strings")

        logger.info(f"Embedding {len(texts)} texts (batch_size={batch_size})")

        # Only the model call lives inside the try, so that our own
        # validation error below is not re-wrapped as "Failed to embed texts".
        try:
            embeddings = self.model.encode(
                texts,
                batch_size=batch_size,
                show_progress_bar=len(texts) > 100,  # Show progress for large batches
                convert_to_numpy=True
            )
        except Exception as e:
            # Don't hide errors - be explicit about what failed
            raise RuntimeError(f"Failed to embed texts: {e}") from e

        # Validate output
        if embeddings.shape[0] != len(texts):
            raise RuntimeError(
                f"Embedding count mismatch: expected {len(texts)}, got {embeddings.shape[0]}"
            )

        logger.debug(f"Generated embeddings shape: {embeddings.shape}")
        return embeddings

    def embed_text(self, text: str) -> np.ndarray:
        """
        Embed a single text into a vector.

        Args:
            text: Text to embed

        Returns:
            numpy array of embedding, shape (dimension,)

        Raises:
            ValueError: If text is empty or whitespace-only
            RuntimeError: If embedding fails
        """
        if not text or not text.strip():
            raise ValueError("Cannot embed empty text")

        logger.debug(f"Embedding single text of length {len(text)}")

        # Use embed_texts for consistency
        embeddings = self.embed_texts([text], batch_size=1)
        return embeddings[0]

    def embed_query(self, query: str) -> np.ndarray:
        """
        Embed a search query.

        This delegates to embed_text but semantically indicates
        this is for a search query rather than document text.

        Args:
            query: Search query to embed

        Returns:
            numpy array of embedding, shape (dimension,)

        Raises:
            ValueError: If query is empty or whitespace-only
            RuntimeError: If embedding fails
        """
        if not query or not query.strip():
            raise ValueError("Cannot embed empty query")

        # Only mark the preview as truncated when it actually was
        preview = query[:50] + ("..." if len(query) > 50 else "")
        logger.debug(f"Embedding query: '{preview}'")
        return self.embed_text(query)

    def get_dimension(self) -> int:
        """
        Get the dimension of the embeddings.

        Returns:
            Embedding dimension
        """
        return self.dimension

    def get_model_info(self) -> dict:
        """
        Get information about the current model.

        Returns:
            Dictionary with model_name, dimension, type and requires_api_key,
            plus the extra MODELS fields when the model is a known one.
        """
        info = {
            "model_name": self.model_name,
            "dimension": self.dimension,
            "type": "local",
            "requires_api_key": False
        }

        # Add known model info if available. setdefault keeps the values
        # gathered from the loaded model (notably "dimension") authoritative
        # over the static table.
        for key, value in self.MODELS.get(self.model_name, {}).items():
            info.setdefault(key, value)

        return info

    def __repr__(self) -> str:
        """String representation"""
        return f"LocalEmbeddings(model='{self.model_name}', dim={self.dimension})"
215
+
216
+
217
+ # Convenience function for quick setup
218
def create_embeddings(model_name: Optional[str] = None) -> LocalEmbeddings:
    """
    Build a LocalEmbeddings instance for the given model.

    Args:
        model_name: Model to load; a falsy value (None or "") selects
            "all-MiniLM-L6-v2"

    Returns:
        LocalEmbeddings instance

    Raises:
        RuntimeError: If the model cannot be loaded
    """
    chosen = model_name if model_name else "all-MiniLM-L6-v2"
    logger.info(f"Creating embeddings with model: {chosen}")
    return LocalEmbeddings(chosen)
234
+
235
+
236
+ # Example usage and testing
237
if __name__ == "__main__":
    # Manual smoke test: loads the default model and exercises each public
    # method, printing results. Exits with status 1 if a check fails.
    import time
    from numpy.linalg import norm

    all_passed = True

    print("Testing Local Embeddings Module...")
    print("-" * 50)

    # Test 1: Create embeddings
    print("\n1. Creating embedding model...")
    embeddings = LocalEmbeddings()
    print(f" ✅ {embeddings}")

    # Test 2: Embed single text
    print("\n2. Testing single text embedding...")
    text = "This is a test sentence for embedding."
    embedding = embeddings.embed_text(text)
    print(f" ✅ Embedded text to shape: {embedding.shape}")
    print(f" Sample values: {embedding[:5]}")

    # Test 3: Embed multiple texts
    print("\n3. Testing batch embedding...")
    texts = [
        "Python is a programming language",
        "JavaScript runs in the browser",
        "Rust is memory safe",
        "Go has great concurrency"
    ]
    start = time.time()
    batch_embeddings = embeddings.embed_texts(texts)
    elapsed = time.time() - start
    print(f" ✅ Embedded {len(texts)} texts in {elapsed:.3f}s")
    print(f" Output shape: {batch_embeddings.shape}")

    # Test 4: Embed query (semantic similarity)
    print("\n4. Testing query embedding...")
    query = "What programming language is best for web development?"
    query_embedding = embeddings.embed_query(query)

    # Cosine similarity between the query and every embedded text
    similarities = []
    for candidate, vector in zip(texts, batch_embeddings):
        similarity = np.dot(query_embedding, vector) / (
            norm(query_embedding) * norm(vector)
        )
        similarities.append((candidate, similarity))

    # Sort by similarity, best match first
    similarities.sort(key=lambda x: x[1], reverse=True)
    print(f" Query: '{query}'")
    print(" Most similar texts:")
    for candidate, sim in similarities[:2]:
        print(f" - {candidate}: {sim:.3f}")

    # Test 5: Error handling
    print("\n5. Testing error handling...")
    try:
        embeddings.embed_text("")
    except ValueError as e:
        print(f" ✅ Correctly raised error: {e}")
    else:
        # Record the failure so the summary below does not claim success
        all_passed = False
        print(" ❌ Should have raised error for empty text")

    # Test 6: Model info
    print("\n6. Model information:")
    info = embeddings.get_model_info()
    for key, value in info.items():
        print(f" - {key}: {value}")

    print("\n" + "=" * 50)
    if all_passed:
        print("🎉 All tests passed! Embeddings module ready.")
    else:
        print("❌ Some checks failed - see output above.")
    print("=" * 50)

    if not all_passed:
        raise SystemExit(1)
@@ -0,0 +1,226 @@
1
+ """
2
+ Embeddings Interface - Easy to swap between different providers
3
+
4
+ This shows how we can easily switch between local and API-based embeddings
5
+ in the future without changing the rest of our code.
6
+ """
7
+
8
+ from abc import ABC, abstractmethod
9
+ from typing import List, Optional
10
+ import numpy as np
11
+ import logging
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
class EmbeddingsInterface(ABC):
    """Contract shared by every embedding provider in this module.

    Subclasses must implement all three methods before they can be
    instantiated.
    """

    @abstractmethod
    def embed_texts(self, texts: List[str]) -> np.ndarray:
        """Embed several texts and return their vectors as one array."""

    @abstractmethod
    def embed_text(self, text: str) -> np.ndarray:
        """Embed one text and return its vector."""

    @abstractmethod
    def get_dimension(self) -> int:
        """Return the size of the vectors this provider produces."""
33
+
34
+
35
class LocalEmbeddings(EmbeddingsInterface):
    """Provider backed by a locally loaded sentence-transformers model."""

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """Load ``model_name`` and record the vector size it reports."""
        # Imported here rather than at module level, so it is only loaded
        # when this provider is constructed.
        from sentence_transformers import SentenceTransformer

        encoder = SentenceTransformer(model_name)
        self.model = encoder
        self.dimension = encoder.get_sentence_embedding_dimension()

    def embed_texts(self, texts: List[str]) -> np.ndarray:
        """Encode ``texts`` with the loaded model, returning a numpy array."""
        vectors = self.model.encode(texts, convert_to_numpy=True)
        return vectors

    def embed_text(self, text: str) -> np.ndarray:
        """Encode one text by sending it through ``embed_texts``."""
        batch = self.embed_texts([text])
        return batch[0]

    def get_dimension(self) -> int:
        """Return the vector size recorded at construction time."""
        return self.dimension
51
+
52
+
53
class OpenAIEmbeddings(EmbeddingsInterface):
    """OpenAI embeddings (future option)"""

    # Vector sizes for the models this class knows about; any other model
    # name falls back to 1536.
    _KNOWN_DIMENSIONS = {
        "text-embedding-3-small": 1536,
        "text-embedding-3-large": 3072,
        "text-embedding-ada-002": 1536,
    }

    # The OpenAI embeddings endpoint documents a cap on the number of items
    # in one ``input`` array; longer lists are split into several requests.
    _MAX_INPUTS_PER_REQUEST = 2048

    def __init__(self, api_key: str, model: str = "text-embedding-3-small"):
        """Create the OpenAI client and record the model and its vector size.

        Args:
            api_key: Key passed to ``openai.OpenAI``
            model: Embedding model name sent with every request
        """
        import openai

        self.client = openai.OpenAI(api_key=api_key)
        self.model = model
        self.dimension = self._KNOWN_DIMENSIONS.get(model, 1536)

    def embed_texts(self, texts: List[str]) -> np.ndarray:
        """Embed ``texts`` through the OpenAI embeddings endpoint.

        Args:
            texts: Texts to embed

        Returns:
            Array with one row per input text, in input order. An empty
            input yields an empty array of shape (0, dimension) without
            contacting the API.
        """
        if not texts:
            return np.empty((0, self.dimension))

        vectors = []
        step = self._MAX_INPUTS_PER_REQUEST
        for start in range(0, len(texts), step):
            response = self.client.embeddings.create(
                input=texts[start:start + step],
                model=self.model
            )
            # Each result carries the position of its input; sort on it so
            # row order never depends on the order of the response payload.
            ordered = sorted(response.data, key=lambda item: item.index)
            vectors.extend(item.embedding for item in ordered)

        return np.array(vectors)

    def embed_text(self, text: str) -> np.ndarray:
        """Embed one text by sending it through ``embed_texts``."""
        return self.embed_texts([text])[0]

    def get_dimension(self) -> int:
        """Return the vector size recorded at construction time."""
        return self.dimension
78
+
79
+
80
class VoyageAIEmbeddings(EmbeddingsInterface):
    """Voyage AI embeddings - optimized for code (future option)"""

    # Default vector sizes per model, taken from the Voyage AI model
    # documentation. Model names not listed here fall back to 1536, the
    # value this class previously reported for every model.
    _KNOWN_DIMENSIONS = {
        "voyage-code-2": 1536,
        "voyage-large-2": 1536,
        "voyage-2": 1024,
        "voyage-3": 1024,
        "voyage-3-lite": 512,
        "voyage-code-3": 1024,
    }

    def __init__(self, api_key: str, model: str = "voyage-code-2"):
        """Create the Voyage client and record the model and its vector size.

        Args:
            api_key: Key passed to ``voyageai.Client``
            model: Embedding model name sent with every request
        """
        import voyageai

        self.client = voyageai.Client(api_key=api_key)
        self.model = model
        # Look the size up per model instead of assuming voyage-code-2
        self.dimension = self._KNOWN_DIMENSIONS.get(model, 1536)

    def embed_texts(self, texts: List[str]) -> np.ndarray:
        """Embed ``texts`` with the configured model and return an array."""
        result = self.client.embed(texts, model=self.model)
        return np.array(result.embeddings)

    def embed_text(self, text: str) -> np.ndarray:
        """Embed one text by sending it through ``embed_texts``."""
        return self.embed_texts([text])[0]

    def get_dimension(self) -> int:
        """Return the vector size recorded at construction time."""
        return self.dimension
98
+
99
+
100
def create_embeddings(provider: str = "local", **kwargs) -> EmbeddingsInterface:
    """
    Build the embeddings implementation that matches ``provider``.

    Args:
        provider: One of "local", "openai", "voyage"
        **kwargs: Provider-specific arguments. "model" is read for every
            provider; "api_key" is read for "openai" and "voyage".

    Returns:
        EmbeddingsInterface implementation

    Raises:
        ValueError: If the provider is unknown, or an API provider is
            requested without an api_key

    Examples:
        # Development
        embeddings = create_embeddings("local")

        # Production with OpenAI
        embeddings = create_embeddings("openai", api_key="sk-...")

        # Production with code-optimized embeddings
        embeddings = create_embeddings("voyage", api_key="...", model="voyage-code-2")
    """
    if provider == "local":
        local_model = kwargs.get("model", "all-MiniLM-L6-v2")
        logger.info(f"Creating local embeddings: {local_model}")
        return LocalEmbeddings(local_model)

    if provider == "openai":
        openai_key = kwargs.get("api_key")
        if not openai_key:
            raise ValueError("OpenAI requires api_key")
        openai_model = kwargs.get("model", "text-embedding-3-small")
        logger.info(f"Creating OpenAI embeddings: {openai_model}")
        return OpenAIEmbeddings(openai_key, openai_model)

    if provider == "voyage":
        voyage_key = kwargs.get("api_key")
        if not voyage_key:
            raise ValueError("Voyage requires api_key")
        voyage_model = kwargs.get("model", "voyage-code-2")
        logger.info(f"Creating Voyage AI embeddings: {voyage_model}")
        return VoyageAIEmbeddings(voyage_key, voyage_model)

    # Nothing matched: reject instead of guessing a provider
    raise ValueError(f"Unknown provider: {provider}")
144
+
145
+
146
+ # Configuration-based selection
147
class EmbeddingsConfig:
    """Configuration for embeddings selection"""

    @staticmethod
    def from_environment() -> EmbeddingsInterface:
        """
        Build embeddings from environment variables.

        Variables read:
            EMBEDDINGS_PROVIDER: "local" (default), "openai" or "voyage"
            EMBEDDINGS_MODEL: optional model name; each provider has its
                own default
            OPENAI_API_KEY: required when the provider is "openai"
            VOYAGE_API_KEY: required when the provider is "voyage"

        Raises:
            ValueError: If the provider is unknown or its API key
                variable is missing or empty
        """
        import os

        provider = os.getenv("EMBEDDINGS_PROVIDER", "local")

        if provider == "local":
            local_model = os.getenv("EMBEDDINGS_MODEL", "all-MiniLM-L6-v2")
            return create_embeddings("local", model=local_model)

        # API providers: (key variable, missing-key message, default model)
        api_providers = {
            "openai": (
                "OPENAI_API_KEY",
                "OPENAI_API_KEY environment variable required",
                "text-embedding-3-small",
            ),
            "voyage": (
                "VOYAGE_API_KEY",
                "VOYAGE_API_KEY environment variable required",
                "voyage-code-2",
            ),
        }

        if provider not in api_providers:
            raise ValueError(f"Unknown EMBEDDINGS_PROVIDER: {provider}")

        key_variable, missing_message, default_model = api_providers[provider]
        secret = os.getenv(key_variable)
        if not secret:
            raise ValueError(missing_message)

        chosen_model = os.getenv("EMBEDDINGS_MODEL", default_model)
        return create_embeddings(provider, api_key=secret, model=chosen_model)
192
+
193
+
194
if __name__ == "__main__":
    # Demo: build the local provider and print a comparison of the options.
    print("Embeddings Interface Example")
    print("-" * 50)

    # Show how easy it is to switch
    print("\n1. Local embeddings (current):")
    local_provider = create_embeddings("local")
    print(" Provider: Local")
    print(f" Dimension: {local_provider.get_dimension()}")
    for line in (" Cost: Free", " Speed: Fast", " Quality: Good"):
        print(line)

    print("\n2. Future upgrade path:")
    openai_lines = (
        " OpenAI text-embedding-3-large:",
        " - Dimension: 3072",
        " - Cost: ~$0.13 per 1M tokens",
        " - Speed: API latency",
        " - Quality: Excellent",
    )
    for line in openai_lines:
        print(line)

    voyage_lines = (
        "\n Voyage AI voyage-code-2:",
        " - Dimension: 1536",
        " - Cost: ~$0.10 per 1M tokens",
        " - Speed: API latency",
        " - Quality: Excellent for code",
    )
    for line in voyage_lines:
        print(line)

    print("\n3. Deployment flexibility:")
    deployment_lines = (
        " - Development: Local model on Mac (MPS)",
        " - Staging: Local model on Linux (CPU/GPU)",
        " - Production: API-based for best quality",
        " - Enterprise: Self-hosted models on GPU cluster",
    )
    for line in deployment_lines:
        print(line)

    print("\n✅ Same interface, different providers!")