codexa 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (189) hide show
  1. codexa-0.4.0.dist-info/METADATA +650 -0
  2. codexa-0.4.0.dist-info/RECORD +189 -0
  3. codexa-0.4.0.dist-info/WHEEL +5 -0
  4. codexa-0.4.0.dist-info/entry_points.txt +2 -0
  5. codexa-0.4.0.dist-info/licenses/LICENSE +21 -0
  6. codexa-0.4.0.dist-info/top_level.txt +1 -0
  7. semantic_code_intelligence/__init__.py +5 -0
  8. semantic_code_intelligence/analysis/__init__.py +21 -0
  9. semantic_code_intelligence/analysis/ai_features.py +351 -0
  10. semantic_code_intelligence/bridge/__init__.py +28 -0
  11. semantic_code_intelligence/bridge/context_provider.py +245 -0
  12. semantic_code_intelligence/bridge/protocol.py +167 -0
  13. semantic_code_intelligence/bridge/server.py +348 -0
  14. semantic_code_intelligence/bridge/vscode.py +271 -0
  15. semantic_code_intelligence/ci/__init__.py +13 -0
  16. semantic_code_intelligence/ci/hooks.py +98 -0
  17. semantic_code_intelligence/ci/hotspots.py +272 -0
  18. semantic_code_intelligence/ci/impact.py +246 -0
  19. semantic_code_intelligence/ci/metrics.py +591 -0
  20. semantic_code_intelligence/ci/pr.py +412 -0
  21. semantic_code_intelligence/ci/quality.py +557 -0
  22. semantic_code_intelligence/ci/templates.py +164 -0
  23. semantic_code_intelligence/ci/trace.py +224 -0
  24. semantic_code_intelligence/cli/__init__.py +0 -0
  25. semantic_code_intelligence/cli/commands/__init__.py +0 -0
  26. semantic_code_intelligence/cli/commands/ask_cmd.py +153 -0
  27. semantic_code_intelligence/cli/commands/benchmark_cmd.py +303 -0
  28. semantic_code_intelligence/cli/commands/chat_cmd.py +252 -0
  29. semantic_code_intelligence/cli/commands/ci_gen_cmd.py +74 -0
  30. semantic_code_intelligence/cli/commands/context_cmd.py +120 -0
  31. semantic_code_intelligence/cli/commands/cross_refactor_cmd.py +113 -0
  32. semantic_code_intelligence/cli/commands/deps_cmd.py +91 -0
  33. semantic_code_intelligence/cli/commands/docs_cmd.py +101 -0
  34. semantic_code_intelligence/cli/commands/doctor_cmd.py +147 -0
  35. semantic_code_intelligence/cli/commands/evolve_cmd.py +171 -0
  36. semantic_code_intelligence/cli/commands/explain_cmd.py +112 -0
  37. semantic_code_intelligence/cli/commands/gate_cmd.py +135 -0
  38. semantic_code_intelligence/cli/commands/grep_cmd.py +234 -0
  39. semantic_code_intelligence/cli/commands/hotspots_cmd.py +119 -0
  40. semantic_code_intelligence/cli/commands/impact_cmd.py +131 -0
  41. semantic_code_intelligence/cli/commands/index_cmd.py +138 -0
  42. semantic_code_intelligence/cli/commands/init_cmd.py +152 -0
  43. semantic_code_intelligence/cli/commands/investigate_cmd.py +163 -0
  44. semantic_code_intelligence/cli/commands/languages_cmd.py +101 -0
  45. semantic_code_intelligence/cli/commands/lsp_cmd.py +49 -0
  46. semantic_code_intelligence/cli/commands/mcp_cmd.py +50 -0
  47. semantic_code_intelligence/cli/commands/metrics_cmd.py +264 -0
  48. semantic_code_intelligence/cli/commands/models_cmd.py +157 -0
  49. semantic_code_intelligence/cli/commands/plugin_cmd.py +275 -0
  50. semantic_code_intelligence/cli/commands/pr_summary_cmd.py +178 -0
  51. semantic_code_intelligence/cli/commands/quality_cmd.py +208 -0
  52. semantic_code_intelligence/cli/commands/refactor_cmd.py +103 -0
  53. semantic_code_intelligence/cli/commands/review_cmd.py +88 -0
  54. semantic_code_intelligence/cli/commands/search_cmd.py +236 -0
  55. semantic_code_intelligence/cli/commands/serve_cmd.py +117 -0
  56. semantic_code_intelligence/cli/commands/suggest_cmd.py +100 -0
  57. semantic_code_intelligence/cli/commands/summary_cmd.py +78 -0
  58. semantic_code_intelligence/cli/commands/tool_cmd.py +282 -0
  59. semantic_code_intelligence/cli/commands/trace_cmd.py +123 -0
  60. semantic_code_intelligence/cli/commands/tui_cmd.py +58 -0
  61. semantic_code_intelligence/cli/commands/viz_cmd.py +127 -0
  62. semantic_code_intelligence/cli/commands/watch_cmd.py +72 -0
  63. semantic_code_intelligence/cli/commands/web_cmd.py +61 -0
  64. semantic_code_intelligence/cli/commands/workspace_cmd.py +250 -0
  65. semantic_code_intelligence/cli/main.py +65 -0
  66. semantic_code_intelligence/cli/router.py +92 -0
  67. semantic_code_intelligence/config/__init__.py +0 -0
  68. semantic_code_intelligence/config/settings.py +260 -0
  69. semantic_code_intelligence/context/__init__.py +19 -0
  70. semantic_code_intelligence/context/engine.py +429 -0
  71. semantic_code_intelligence/context/memory.py +253 -0
  72. semantic_code_intelligence/daemon/__init__.py +1 -0
  73. semantic_code_intelligence/daemon/watcher.py +515 -0
  74. semantic_code_intelligence/docs/__init__.py +1080 -0
  75. semantic_code_intelligence/embeddings/__init__.py +0 -0
  76. semantic_code_intelligence/embeddings/enhanced.py +131 -0
  77. semantic_code_intelligence/embeddings/generator.py +149 -0
  78. semantic_code_intelligence/embeddings/model_registry.py +100 -0
  79. semantic_code_intelligence/evolution/__init__.py +1 -0
  80. semantic_code_intelligence/evolution/budget_guard.py +111 -0
  81. semantic_code_intelligence/evolution/commit_manager.py +88 -0
  82. semantic_code_intelligence/evolution/context_builder.py +131 -0
  83. semantic_code_intelligence/evolution/engine.py +249 -0
  84. semantic_code_intelligence/evolution/patch_generator.py +229 -0
  85. semantic_code_intelligence/evolution/task_selector.py +214 -0
  86. semantic_code_intelligence/evolution/test_runner.py +111 -0
  87. semantic_code_intelligence/indexing/__init__.py +0 -0
  88. semantic_code_intelligence/indexing/chunker.py +174 -0
  89. semantic_code_intelligence/indexing/parallel.py +86 -0
  90. semantic_code_intelligence/indexing/scanner.py +146 -0
  91. semantic_code_intelligence/indexing/semantic_chunker.py +337 -0
  92. semantic_code_intelligence/llm/__init__.py +62 -0
  93. semantic_code_intelligence/llm/cache.py +219 -0
  94. semantic_code_intelligence/llm/cached_provider.py +145 -0
  95. semantic_code_intelligence/llm/conversation.py +190 -0
  96. semantic_code_intelligence/llm/cross_refactor.py +272 -0
  97. semantic_code_intelligence/llm/investigation.py +274 -0
  98. semantic_code_intelligence/llm/mock_provider.py +77 -0
  99. semantic_code_intelligence/llm/ollama_provider.py +122 -0
  100. semantic_code_intelligence/llm/openai_provider.py +100 -0
  101. semantic_code_intelligence/llm/provider.py +92 -0
  102. semantic_code_intelligence/llm/rate_limiter.py +164 -0
  103. semantic_code_intelligence/llm/reasoning.py +438 -0
  104. semantic_code_intelligence/llm/safety.py +110 -0
  105. semantic_code_intelligence/llm/streaming.py +251 -0
  106. semantic_code_intelligence/lsp/__init__.py +609 -0
  107. semantic_code_intelligence/mcp/__init__.py +393 -0
  108. semantic_code_intelligence/parsing/__init__.py +19 -0
  109. semantic_code_intelligence/parsing/parser.py +375 -0
  110. semantic_code_intelligence/plugins/__init__.py +255 -0
  111. semantic_code_intelligence/plugins/examples/__init__.py +1 -0
  112. semantic_code_intelligence/plugins/examples/code_quality.py +73 -0
  113. semantic_code_intelligence/plugins/examples/search_annotator.py +56 -0
  114. semantic_code_intelligence/scalability/__init__.py +205 -0
  115. semantic_code_intelligence/search/__init__.py +0 -0
  116. semantic_code_intelligence/search/formatter.py +123 -0
  117. semantic_code_intelligence/search/grep.py +361 -0
  118. semantic_code_intelligence/search/hybrid_search.py +170 -0
  119. semantic_code_intelligence/search/keyword_search.py +311 -0
  120. semantic_code_intelligence/search/section_expander.py +103 -0
  121. semantic_code_intelligence/services/__init__.py +0 -0
  122. semantic_code_intelligence/services/indexing_service.py +630 -0
  123. semantic_code_intelligence/services/search_service.py +269 -0
  124. semantic_code_intelligence/storage/__init__.py +0 -0
  125. semantic_code_intelligence/storage/chunk_hash_store.py +86 -0
  126. semantic_code_intelligence/storage/hash_store.py +66 -0
  127. semantic_code_intelligence/storage/index_manifest.py +85 -0
  128. semantic_code_intelligence/storage/index_stats.py +138 -0
  129. semantic_code_intelligence/storage/query_history.py +160 -0
  130. semantic_code_intelligence/storage/symbol_registry.py +209 -0
  131. semantic_code_intelligence/storage/vector_store.py +297 -0
  132. semantic_code_intelligence/tests/__init__.py +0 -0
  133. semantic_code_intelligence/tests/test_ai_features.py +351 -0
  134. semantic_code_intelligence/tests/test_chunker.py +119 -0
  135. semantic_code_intelligence/tests/test_cli.py +188 -0
  136. semantic_code_intelligence/tests/test_config.py +154 -0
  137. semantic_code_intelligence/tests/test_context.py +381 -0
  138. semantic_code_intelligence/tests/test_embeddings.py +73 -0
  139. semantic_code_intelligence/tests/test_endtoend.py +1142 -0
  140. semantic_code_intelligence/tests/test_enhanced_embeddings.py +92 -0
  141. semantic_code_intelligence/tests/test_hash_store.py +79 -0
  142. semantic_code_intelligence/tests/test_logging.py +55 -0
  143. semantic_code_intelligence/tests/test_new_cli.py +138 -0
  144. semantic_code_intelligence/tests/test_parser.py +495 -0
  145. semantic_code_intelligence/tests/test_phase10.py +355 -0
  146. semantic_code_intelligence/tests/test_phase11.py +593 -0
  147. semantic_code_intelligence/tests/test_phase12.py +375 -0
  148. semantic_code_intelligence/tests/test_phase13.py +663 -0
  149. semantic_code_intelligence/tests/test_phase14.py +568 -0
  150. semantic_code_intelligence/tests/test_phase15.py +814 -0
  151. semantic_code_intelligence/tests/test_phase16.py +792 -0
  152. semantic_code_intelligence/tests/test_phase17.py +815 -0
  153. semantic_code_intelligence/tests/test_phase18.py +934 -0
  154. semantic_code_intelligence/tests/test_phase19.py +986 -0
  155. semantic_code_intelligence/tests/test_phase20.py +2753 -0
  156. semantic_code_intelligence/tests/test_phase20b.py +2058 -0
  157. semantic_code_intelligence/tests/test_phase20c.py +962 -0
  158. semantic_code_intelligence/tests/test_phase21.py +428 -0
  159. semantic_code_intelligence/tests/test_phase22.py +799 -0
  160. semantic_code_intelligence/tests/test_phase23.py +783 -0
  161. semantic_code_intelligence/tests/test_phase24.py +715 -0
  162. semantic_code_intelligence/tests/test_phase25.py +496 -0
  163. semantic_code_intelligence/tests/test_phase26.py +251 -0
  164. semantic_code_intelligence/tests/test_phase27.py +531 -0
  165. semantic_code_intelligence/tests/test_phase8.py +592 -0
  166. semantic_code_intelligence/tests/test_phase9.py +643 -0
  167. semantic_code_intelligence/tests/test_plugins.py +293 -0
  168. semantic_code_intelligence/tests/test_priority_features.py +727 -0
  169. semantic_code_intelligence/tests/test_router.py +41 -0
  170. semantic_code_intelligence/tests/test_scalability.py +138 -0
  171. semantic_code_intelligence/tests/test_scanner.py +125 -0
  172. semantic_code_intelligence/tests/test_search.py +160 -0
  173. semantic_code_intelligence/tests/test_semantic_chunker.py +255 -0
  174. semantic_code_intelligence/tests/test_tools.py +182 -0
  175. semantic_code_intelligence/tests/test_vector_store.py +151 -0
  176. semantic_code_intelligence/tests/test_watcher.py +211 -0
  177. semantic_code_intelligence/tools/__init__.py +442 -0
  178. semantic_code_intelligence/tools/executor.py +232 -0
  179. semantic_code_intelligence/tools/protocol.py +200 -0
  180. semantic_code_intelligence/tui/__init__.py +454 -0
  181. semantic_code_intelligence/utils/__init__.py +0 -0
  182. semantic_code_intelligence/utils/logging.py +112 -0
  183. semantic_code_intelligence/version.py +3 -0
  184. semantic_code_intelligence/web/__init__.py +11 -0
  185. semantic_code_intelligence/web/api.py +289 -0
  186. semantic_code_intelligence/web/server.py +397 -0
  187. semantic_code_intelligence/web/ui.py +659 -0
  188. semantic_code_intelligence/web/visualize.py +226 -0
  189. semantic_code_intelligence/workspace/__init__.py +427 -0
File without changes
@@ -0,0 +1,131 @@
1
+ """Enhanced embedding pipeline — semantic preprocessing for code vectors.
2
+
3
+ Wraps the base generator with code-aware preprocessing:
4
+ - Prepends semantic labels to improve embedding quality
5
+ - Normalizes code formatting for more consistent representations
6
+ - Supports batch processing with progress tracking
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import re
12
+ from typing import TYPE_CHECKING, Any
13
+
14
+ import numpy as np
15
+
16
+ from semantic_code_intelligence.embeddings.generator import (
17
+ generate_embeddings,
18
+ get_embedding_dimension,
19
+ get_model,
20
+ )
21
+ from semantic_code_intelligence.indexing.semantic_chunker import SemanticChunk
22
+ from semantic_code_intelligence.utils.logging import get_logger
23
+
24
+ if TYPE_CHECKING:
25
+ pass
26
+
27
+ logger = get_logger("embeddings.enhanced")
28
+
29
+
30
def preprocess_code_for_embedding(content: str, semantic_label: str = "") -> str:
    """Clean up a code string before it is handed to the embedding model.

    Steps applied, in order:
        1. Trailing whitespace is removed from every line.
        2. Each run of consecutive blank (or whitespace-only) lines is
           collapsed to a single blank line.
        3. Leading/trailing whitespace of the whole text is stripped, which
           also removes any indentation in front of the first line.
        4. If ``semantic_label`` is non-empty it is prepended on its own line
           (e.g. "[python] function authenticate(user, password)").

    Indentation of the remaining lines is left untouched.

    Args:
        content: Raw code string.
        semantic_label: Optional semantic prefix.

    Returns:
        Preprocessed text ready for embedding.
    """
    cleaned: list[str] = []
    previous_was_blank = False

    for raw_line in content.splitlines():
        trimmed = raw_line.rstrip()
        if trimmed:
            cleaned.append(trimmed)
            previous_was_blank = False
        elif not previous_was_blank:
            # Keep only the first blank line of a run.
            cleaned.append("")
            previous_was_blank = True

    body = "\n".join(cleaned).strip()
    return f"{semantic_label}\n{body}" if semantic_label else body
66
+
67
+
68
def prepare_semantic_texts(chunks: list[SemanticChunk]) -> list[str]:
    """Turn semantic chunks into the text strings that will be embedded.

    Every chunk's ``content`` is run through
    :func:`preprocess_code_for_embedding` together with its
    ``semantic_label``, so the label ends up as the first line of the text.

    Args:
        chunks: List of SemanticChunk objects.

    Returns:
        List of preprocessed text strings, one per chunk, in input order.
    """
    texts: list[str] = []
    for chunk in chunks:
        prepared = preprocess_code_for_embedding(chunk.content, chunk.semantic_label)
        texts.append(prepared)
    return texts
84
+
85
+
86
def generate_semantic_embeddings(
    chunks: list[SemanticChunk],
    model_name: str = "all-MiniLM-L6-v2",
    batch_size: int = 64,
    show_progress: bool = False,
) -> np.ndarray:
    """Embed semantic chunks after label-aware preprocessing.

    Each chunk is first converted to text by :func:`prepare_semantic_texts`
    and the resulting strings are encoded with :func:`generate_embeddings`.

    Args:
        chunks: List of SemanticChunk objects.
        model_name: Sentence-transformers model name.
        batch_size: Encoding batch size.
        show_progress: Show progress bar.

    Returns:
        The float32 array produced by :func:`generate_embeddings`, one row
        per chunk. For an empty ``chunks`` list the result has shape
        ``(0, 0)`` and no model is loaded.
    """
    if not chunks:
        # Nothing to encode: skip model loading entirely.
        return np.empty((0, 0), dtype=np.float32)

    prepared_texts = prepare_semantic_texts(chunks)
    return generate_embeddings(prepared_texts, model_name, batch_size, show_progress)
111
+
112
+
113
def generate_query_embedding(
    query: str,
    model_name: str = "all-MiniLM-L6-v2",
) -> np.ndarray:
    """Embed a single search query.

    Unlike code chunks, the query gets no semantic label and no line-level
    normalisation; only surrounding whitespace is removed before encoding.

    Args:
        query: Natural language search query.
        model_name: Sentence-transformers model name.

    Returns:
        The array returned by :func:`generate_embeddings` for a one-element
        input list (a single row).
    """
    return generate_embeddings([query.strip()], model_name)
@@ -0,0 +1,149 @@
1
+ """Embedding generator — converts code chunks into vector embeddings.
2
+
3
+ Supports two backends:
4
+ - **sentence-transformers** (default): PyTorch-based, full-featured.
5
+ - **onnx**: Lightweight ONNX Runtime backend via ``optimum`` — lower
6
+ memory (~50% less) and often faster inference on CPU.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import os
12
+ from typing import TYPE_CHECKING
13
+
14
+ import numpy as np
15
+
16
+ from semantic_code_intelligence.embeddings.model_registry import resolve_model_name
17
+ from semantic_code_intelligence.utils.logging import get_logger
18
+
19
+ if TYPE_CHECKING:
20
+ from sentence_transformers import SentenceTransformer
21
+
22
+ logger = get_logger("embeddings")
23
+
24
+ # Module-level cache for loaded model instances
25
+ _model_cache: dict[str, "SentenceTransformer"] = {}
26
+
27
+
28
def _configure_hf_token() -> None:
    """Populate ``HF_TOKEN`` from an alternative variable when it is unset.

    If ``HF_TOKEN`` is missing or empty, the first non-empty value among
    ``HUGGING_FACE_HUB_TOKEN`` and ``HUGGINGFACE_TOKEN`` (checked in that
    order) is copied into ``HF_TOKEN``. An existing non-empty ``HF_TOKEN``
    is never overwritten.
    """
    env = os.environ
    if env.get("HF_TOKEN"):
        return

    # `or` picks the first non-empty candidate, preserving the lookup order.
    fallback = env.get("HUGGING_FACE_HUB_TOKEN") or env.get("HUGGINGFACE_TOKEN")
    if fallback:
        env["HF_TOKEN"] = fallback
41
+
42
+
43
def _onnx_available() -> bool:
    """Report whether both ``optimum`` and ``onnxruntime`` can be imported."""
    try:
        import optimum  # noqa: F401
        import onnxruntime  # noqa: F401
    except ImportError:
        return False
    return True
51
+
52
+
53
def get_model(
    model_name: str = "all-MiniLM-L6-v2",
    backend: str = "auto",
) -> "SentenceTransformer":
    """Load a sentence-transformers model, reusing a cached instance if any.

    Instances are cached per ``(resolved model name, requested backend)``
    pair, so ``"auto"`` and ``"torch"`` are cached separately even when they
    end up loading the same PyTorch model.

    Args:
        model_name: Name of the model to load (full HF name or alias).
        backend: ``"auto"`` (ONNX if available, else PyTorch),
            ``"onnx"``, or ``"torch"``. Any other value is treated like
            ``"torch"``.

    Returns:
        A SentenceTransformer model instance.
    """
    model_name = resolve_model_name(model_name)
    cache_key = f"{model_name}:{backend}"

    cached = _model_cache.get(cache_key)
    if cached is not None:
        return cached

    _configure_hf_token()
    # Imported lazily so importing this module does not pull in the heavy
    # sentence-transformers dependency.
    from sentence_transformers import SentenceTransformer

    use_onnx = backend in ("onnx", "auto") and _onnx_available()
    if backend == "onnx" and not use_onnx:
        logger.warning("ONNX requested but optimum/onnxruntime not installed; falling back to PyTorch.")

    logger.info("Loading embedding model: %s (backend=%s)", model_name, "onnx" if use_onnx else "torch")

    model = None
    if use_onnx:
        try:
            model = SentenceTransformer(model_name, backend="onnx")
        except Exception as exc:
            # Deliberate best-effort: any ONNX failure falls back to PyTorch,
            # but the cause is logged so the failure can be diagnosed.
            logger.warning(
                "ONNX load failed (%s: %s); falling back to PyTorch.",
                type(exc).__name__,
                exc,
            )
        else:
            logger.info("Model loaded with ONNX backend.")

    if model is None:
        model = SentenceTransformer(model_name)
        logger.info("Model loaded successfully (PyTorch).")

    _model_cache[cache_key] = model
    return model
97
+
98
+
99
def generate_embeddings(
    texts: list[str],
    model_name: str = "all-MiniLM-L6-v2",
    batch_size: int = 64,
    show_progress: bool = False,
    backend: str = "auto",
) -> np.ndarray:
    """Encode a list of strings into float32 embedding vectors.

    The model is obtained through :func:`get_model` and asked to return
    normalized NumPy embeddings (``normalize_embeddings=True``).

    Args:
        texts: List of code/text strings to embed.
        model_name: Name of the sentence-transformers model (or alias).
        batch_size: Batch size for encoding.
        show_progress: Whether to show a progress bar.
        backend: ``"auto"``, ``"onnx"``, or ``"torch"``.

    Returns:
        The encoder output cast to float32, one row per input text. For an
        empty ``texts`` list the result has shape ``(0, 0)`` and no model is
        loaded.
    """
    if not texts:
        return np.empty((0, 0), dtype=np.float32)

    encoder = get_model(model_name, backend=backend)
    encode_options = {
        "batch_size": batch_size,
        "show_progress_bar": show_progress,
        "convert_to_numpy": True,
        "normalize_embeddings": True,
    }
    vectors = encoder.encode(texts, **encode_options)
    return vectors.astype(np.float32)
130
+
131
+
132
def get_embedding_dimension(
    model_name: str = "all-MiniLM-L6-v2",
    backend: str = "auto",
) -> int:
    """Return the size of the vectors produced by the given model.

    The model is loaded (or fetched from the cache) via :func:`get_model`
    and queried with ``get_sentence_embedding_dimension()``.

    Args:
        model_name: Name of the sentence-transformers model (or alias).
        backend: ``"auto"``, ``"onnx"``, or ``"torch"``.

    Returns:
        Integer dimension of the embedding vectors.

    Raises:
        RuntimeError: If the model reports ``None`` as its dimension.
    """
    dimension = get_model(model_name, backend=backend).get_sentence_embedding_dimension()
    if dimension is not None:
        return dimension
    raise RuntimeError(f"Model {model_name!r} returned None for embedding dimension")
@@ -0,0 +1,100 @@
1
+ """Embedding model registry — defines available models and their properties.
2
+
3
+ Provides a catalogue of supported embedding models so users can switch
4
+ between models optimised for different use-cases (code-heavy, doc-heavy,
5
+ speed vs quality, etc.).
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from dataclasses import dataclass
11
+
12
+
13
@dataclass(frozen=True)
class ModelInfo:
    """Immutable descriptor for one entry of the embedding model catalogue."""

    # Full model identifier; catalogue entries are keyed by this value.
    name: str
    # Short human-readable label.
    display_name: str
    # Embedding vector size listed for the model.
    dimension: int
    # One-line summary of the model's characteristics.
    description: str
    # Workloads the model is suggested for.
    recommended_for: str
    # Backend label: "sentence-transformers" (default) or "onnx".
    backend: str = "sentence-transformers"
23
+
24
+
25
# Built-in model catalogue — mirrors ck's 4 models plus extras.
# Entries are listed once and keyed by their own ``name`` below, so the
# dictionary key and the ModelInfo.name can never drift apart.
_CATALOGUE: tuple[ModelInfo, ...] = (
    ModelInfo(
        name="all-MiniLM-L6-v2",
        display_name="MiniLM L6 v2",
        dimension=384,
        description="Default balanced model — good quality, fast inference.",
        recommended_for="General purpose, balanced speed/quality.",
        backend="sentence-transformers",
    ),
    ModelInfo(
        name="BAAI/bge-small-en-v1.5",
        display_name="BGE Small EN v1.5",
        dimension=384,
        description="Compact BGE model — strong text retrieval performance.",
        recommended_for="Retrieval-heavy workloads, lower memory.",
        backend="sentence-transformers",
    ),
    ModelInfo(
        name="nomic-ai/nomic-embed-text-v1.5",
        display_name="Nomic Embed Text v1.5",
        dimension=768,
        description="High-quality long-context model (8192 tokens).",
        recommended_for="Documentation-heavy repos, long files.",
        backend="sentence-transformers",
    ),
    ModelInfo(
        name="jinaai/jina-embeddings-v2-base-code",
        display_name="Jina Code v2",
        dimension=768,
        description="Code-specialised model trained on programming languages.",
        recommended_for="Code-heavy repos, programming-specific search.",
        backend="sentence-transformers",
    ),
    ModelInfo(
        name="mixedbread-ai/mxbai-embed-xsmall-v1",
        display_name="Mixedbread XSmall v1",
        dimension=384,
        description="Ultra-compact model — fastest inference, smallest footprint.",
        recommended_for="Large repos where speed matters most.",
        backend="sentence-transformers",
    ),
)

AVAILABLE_MODELS: dict[str, ModelInfo] = {info.name: info for info in _CATALOGUE}

# Shorthand aliases for CLI convenience (keys are lower-case).
MODEL_ALIASES: dict[str, str] = {
    "minilm": "all-MiniLM-L6-v2",
    "bge-small": "BAAI/bge-small-en-v1.5",
    "nomic": "nomic-ai/nomic-embed-text-v1.5",
    "jina-code": "jinaai/jina-embeddings-v2-base-code",
    "mxbai-xsmall": "mixedbread-ai/mxbai-embed-xsmall-v1",
}

DEFAULT_MODEL = "all-MiniLM-L6-v2"
79
+
80
+
81
def resolve_model_name(name_or_alias: str) -> str:
    """Map a model name or CLI alias to the full model identifier.

    Lookup order:
        1. An exact (case-sensitive) catalogue name is returned unchanged.
        2. Otherwise the lower-cased input is looked up in the alias table.
        3. Anything else is assumed to be a custom HF model name and is
           returned as given.
    """
    if name_or_alias in AVAILABLE_MODELS:
        return name_or_alias
    return MODEL_ALIASES.get(name_or_alias.lower()) or name_or_alias
90
+
91
+
92
def get_model_info(name_or_alias: str) -> ModelInfo | None:
    """Return the catalogue entry for a name or alias, or None if unknown."""
    return AVAILABLE_MODELS.get(resolve_model_name(name_or_alias))
96
+
97
+
98
def list_models() -> list[ModelInfo]:
    """Return every catalogue entry as a new list, in catalogue order."""
    return [*AVAILABLE_MODELS.values()]
@@ -0,0 +1 @@
1
+ """Self-improving development loop — LLM-driven incremental code evolution."""
@@ -0,0 +1,111 @@
1
+ """Budget guard — tracks token usage, iterations, and wall-clock time.
2
+
3
+ Enforces hard limits so that the evolution loop cannot run away with
4
+ unbounded LLM calls. The guard is passed through every stage and
5
+ checked before each LLM invocation.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import time
11
+ from dataclasses import dataclass, field
12
+
13
+
14
@dataclass
class BudgetGuard:
    """Resource budget tracker for the evolution loop.

    Tracks three independent limits — total tokens, iteration count and
    elapsed time — and reports when any of them has been reached.

    Parameters
    ----------
    max_tokens : int
        Maximum total tokens (prompt + completion) across all LLM calls.
    max_iterations : int
        Maximum evolution iterations to attempt.
    max_seconds : float
        Maximum elapsed seconds before the loop is force-stopped. Only
        enforced once :meth:`start` has been called.
    """

    max_tokens: int = 20_000
    max_iterations: int = 5
    max_seconds: float = 600.0  # 10 minutes default

    # Counters
    tokens_used: int = 0
    iterations_done: int = 0
    # Monotonic-clock reading taken by start(); 0.0 means "not started".
    _start_time: float = field(default=0.0, repr=False)

    # ------------------------------------------------------------------ #
    # Lifecycle
    # ------------------------------------------------------------------ #

    def start(self) -> None:
        """Mark the beginning of the evolution run."""
        # A monotonic clock is used so that system clock adjustments
        # (NTP sync, manual changes, DST) cannot distort the elapsed time.
        self._start_time = time.monotonic()

    @property
    def elapsed_seconds(self) -> float:
        """Seconds elapsed since :meth:`start` was called (0.0 before that)."""
        if self._start_time == 0.0:
            return 0.0
        return time.monotonic() - self._start_time

    # ------------------------------------------------------------------ #
    # Checks
    # ------------------------------------------------------------------ #

    @property
    def tokens_remaining(self) -> int:
        """Tokens still available before the budget is exhausted."""
        return max(0, self.max_tokens - self.tokens_used)

    @property
    def iterations_remaining(self) -> int:
        """Iterations still available before the limit is hit."""
        return max(0, self.max_iterations - self.iterations_done)

    def can_continue(self) -> bool:
        """Return ``True`` if budget allows another iteration."""
        # Single source of truth: the limits are evaluated in stop_reason().
        return self.stop_reason() is None

    def stop_reason(self) -> str | None:
        """Return a human-readable reason if budget is exhausted, else ``None``.

        Limits are checked in the order iterations, tokens, time; the first
        one that has been reached determines the message.
        """
        if self.iterations_done >= self.max_iterations:
            return f"iteration limit reached ({self.max_iterations})"
        if self.tokens_used >= self.max_tokens:
            return f"token budget exhausted ({self.tokens_used}/{self.max_tokens})"
        # The time limit only applies after start() has been called.
        if self._start_time > 0.0 and self.elapsed_seconds >= self.max_seconds:
            return f"time limit reached ({self.elapsed_seconds:.0f}s/{self.max_seconds:.0f}s)"
        return None

    # ------------------------------------------------------------------ #
    # Recording
    # ------------------------------------------------------------------ #

    def record_tokens(self, tokens: int) -> None:
        """Add the token usage of one LLM call to the running total."""
        self.tokens_used += tokens

    def record_iteration(self) -> None:
        """Mark one iteration as completed."""
        self.iterations_done += 1

    # ------------------------------------------------------------------ #
    # Summary
    # ------------------------------------------------------------------ #

    def summary(self) -> dict[str, object]:
        """Return a dict snapshot of current budget usage.

        ``elapsed_seconds`` is rounded to two decimal places.
        """
        return {
            "tokens_used": self.tokens_used,
            "tokens_max": self.max_tokens,
            "iterations_done": self.iterations_done,
            "iterations_max": self.max_iterations,
            "elapsed_seconds": round(self.elapsed_seconds, 2),
            "max_seconds": self.max_seconds,
        }
@@ -0,0 +1,88 @@
1
+ """Commit manager — handles git add / commit / revert for evolution patches.
2
+
3
+ All git operations run as subprocesses against the project root.
4
+ The manager only commits files that the evolution loop explicitly touched.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import subprocess
10
+ from pathlib import Path
11
+
12
+ from semantic_code_intelligence.utils.logging import get_logger
13
+
14
+ logger = get_logger("evolution.commit_manager")
15
+
16
+
17
class CommitManager:
    """Thin wrapper around git for commit/revert cycles.

    Every command runs as a subprocess with the resolved project root as
    its working directory. A non-zero git exit status does not raise: it is
    logged as a warning and the command's stdout (possibly empty) is still
    returned to the caller.
    """

    def __init__(self, project_root: Path) -> None:
        self._root = project_root.resolve()

    # ------------------------------------------------------------------ #
    # Queries
    # ------------------------------------------------------------------ #

    def git_diff(self, staged: bool = False) -> str:
        """Return the current ``git diff`` output (unstaged by default)."""
        cmd = ["git", "diff"]
        if staged:
            cmd.append("--staged")
        return self._run(cmd)

    def git_diff_stat(self) -> str:
        """Return the ``git diff --stat`` summary of unstaged changes."""
        return self._run(["git", "diff", "--stat"])

    def has_changes(self) -> bool:
        """Return ``True`` if ``git status --porcelain`` prints anything."""
        output = self._run(["git", "status", "--porcelain"])
        return bool(output.strip())

    # ------------------------------------------------------------------ #
    # Mutations
    # ------------------------------------------------------------------ #

    def stage_files(self, paths: list[str]) -> None:
        """``git add`` a list of relative file paths (no-op for an empty list)."""
        if not paths:
            return
        # "--" stops option parsing so a path can never be read as a flag.
        self._run(["git", "add", "--"] + paths)

    def commit(self, message: str) -> str:
        """Create a commit with the given message.

        Returns:
            The short SHA of ``HEAD`` after the commit attempt. If
            ``git commit`` did not succeed this is the SHA of the previous
            commit (or an empty string when ``HEAD`` cannot be resolved).
        """
        result = self._exec(["git", "commit", "-m", message])
        sha = self._run(["git", "rev-parse", "--short", "HEAD"]).strip()
        if result.returncode == 0:
            logger.info("Committed %s: %s", sha, message)
        else:
            # Do not claim success: HEAD was not moved by this call.
            logger.warning("No commit was created; HEAD remains at %s", sha or "<unknown>")
        return sha

    def revert_files(self, paths: list[str]) -> None:
        """Discard working-tree modifications via ``git checkout -- <paths>``.

        No-op for an empty list.
        """
        # NOTE(review): without a tree-ish, `git checkout -- <paths>` restores
        # from the index, so changes that were already staged are kept.
        if not paths:
            return
        self._run(["git", "checkout", "--"] + paths)
        logger.info("Reverted %d file(s).", len(paths))

    def stash_push(self, message: str = "evolution-wip") -> None:
        """Stash current changes under the given stash message."""
        self._run(["git", "stash", "push", "-m", message])

    def stash_pop(self) -> None:
        """Pop the most recent stash."""
        self._run(["git", "stash", "pop"])

    # ------------------------------------------------------------------ #
    # Internal
    # ------------------------------------------------------------------ #

    def _exec(self, cmd: list[str]) -> subprocess.CompletedProcess:
        """Run ``cmd`` in the project root and log unexpected failures."""
        proc = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            # Diffs can contain arbitrary file bytes; decode leniently so an
            # undecodable byte cannot raise UnicodeDecodeError.
            encoding="utf-8",
            errors="replace",
            cwd=str(self._root),
        )
        # "nothing to commit" is an expected, benign non-zero exit.
        if proc.returncode != 0 and "nothing to commit" not in proc.stdout:
            logger.warning(
                "git command failed (exit %d): %s: %s",
                proc.returncode,
                " ".join(cmd),
                proc.stderr.strip(),
            )
        return proc

    def _run(self, cmd: list[str]) -> str:
        """Run ``cmd`` and return its stdout, regardless of exit status."""
        return self._exec(cmd).stdout