codexa 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codexa-0.4.0.dist-info/METADATA +650 -0
- codexa-0.4.0.dist-info/RECORD +189 -0
- codexa-0.4.0.dist-info/WHEEL +5 -0
- codexa-0.4.0.dist-info/entry_points.txt +2 -0
- codexa-0.4.0.dist-info/licenses/LICENSE +21 -0
- codexa-0.4.0.dist-info/top_level.txt +1 -0
- semantic_code_intelligence/__init__.py +5 -0
- semantic_code_intelligence/analysis/__init__.py +21 -0
- semantic_code_intelligence/analysis/ai_features.py +351 -0
- semantic_code_intelligence/bridge/__init__.py +28 -0
- semantic_code_intelligence/bridge/context_provider.py +245 -0
- semantic_code_intelligence/bridge/protocol.py +167 -0
- semantic_code_intelligence/bridge/server.py +348 -0
- semantic_code_intelligence/bridge/vscode.py +271 -0
- semantic_code_intelligence/ci/__init__.py +13 -0
- semantic_code_intelligence/ci/hooks.py +98 -0
- semantic_code_intelligence/ci/hotspots.py +272 -0
- semantic_code_intelligence/ci/impact.py +246 -0
- semantic_code_intelligence/ci/metrics.py +591 -0
- semantic_code_intelligence/ci/pr.py +412 -0
- semantic_code_intelligence/ci/quality.py +557 -0
- semantic_code_intelligence/ci/templates.py +164 -0
- semantic_code_intelligence/ci/trace.py +224 -0
- semantic_code_intelligence/cli/__init__.py +0 -0
- semantic_code_intelligence/cli/commands/__init__.py +0 -0
- semantic_code_intelligence/cli/commands/ask_cmd.py +153 -0
- semantic_code_intelligence/cli/commands/benchmark_cmd.py +303 -0
- semantic_code_intelligence/cli/commands/chat_cmd.py +252 -0
- semantic_code_intelligence/cli/commands/ci_gen_cmd.py +74 -0
- semantic_code_intelligence/cli/commands/context_cmd.py +120 -0
- semantic_code_intelligence/cli/commands/cross_refactor_cmd.py +113 -0
- semantic_code_intelligence/cli/commands/deps_cmd.py +91 -0
- semantic_code_intelligence/cli/commands/docs_cmd.py +101 -0
- semantic_code_intelligence/cli/commands/doctor_cmd.py +147 -0
- semantic_code_intelligence/cli/commands/evolve_cmd.py +171 -0
- semantic_code_intelligence/cli/commands/explain_cmd.py +112 -0
- semantic_code_intelligence/cli/commands/gate_cmd.py +135 -0
- semantic_code_intelligence/cli/commands/grep_cmd.py +234 -0
- semantic_code_intelligence/cli/commands/hotspots_cmd.py +119 -0
- semantic_code_intelligence/cli/commands/impact_cmd.py +131 -0
- semantic_code_intelligence/cli/commands/index_cmd.py +138 -0
- semantic_code_intelligence/cli/commands/init_cmd.py +152 -0
- semantic_code_intelligence/cli/commands/investigate_cmd.py +163 -0
- semantic_code_intelligence/cli/commands/languages_cmd.py +101 -0
- semantic_code_intelligence/cli/commands/lsp_cmd.py +49 -0
- semantic_code_intelligence/cli/commands/mcp_cmd.py +50 -0
- semantic_code_intelligence/cli/commands/metrics_cmd.py +264 -0
- semantic_code_intelligence/cli/commands/models_cmd.py +157 -0
- semantic_code_intelligence/cli/commands/plugin_cmd.py +275 -0
- semantic_code_intelligence/cli/commands/pr_summary_cmd.py +178 -0
- semantic_code_intelligence/cli/commands/quality_cmd.py +208 -0
- semantic_code_intelligence/cli/commands/refactor_cmd.py +103 -0
- semantic_code_intelligence/cli/commands/review_cmd.py +88 -0
- semantic_code_intelligence/cli/commands/search_cmd.py +236 -0
- semantic_code_intelligence/cli/commands/serve_cmd.py +117 -0
- semantic_code_intelligence/cli/commands/suggest_cmd.py +100 -0
- semantic_code_intelligence/cli/commands/summary_cmd.py +78 -0
- semantic_code_intelligence/cli/commands/tool_cmd.py +282 -0
- semantic_code_intelligence/cli/commands/trace_cmd.py +123 -0
- semantic_code_intelligence/cli/commands/tui_cmd.py +58 -0
- semantic_code_intelligence/cli/commands/viz_cmd.py +127 -0
- semantic_code_intelligence/cli/commands/watch_cmd.py +72 -0
- semantic_code_intelligence/cli/commands/web_cmd.py +61 -0
- semantic_code_intelligence/cli/commands/workspace_cmd.py +250 -0
- semantic_code_intelligence/cli/main.py +65 -0
- semantic_code_intelligence/cli/router.py +92 -0
- semantic_code_intelligence/config/__init__.py +0 -0
- semantic_code_intelligence/config/settings.py +260 -0
- semantic_code_intelligence/context/__init__.py +19 -0
- semantic_code_intelligence/context/engine.py +429 -0
- semantic_code_intelligence/context/memory.py +253 -0
- semantic_code_intelligence/daemon/__init__.py +1 -0
- semantic_code_intelligence/daemon/watcher.py +515 -0
- semantic_code_intelligence/docs/__init__.py +1080 -0
- semantic_code_intelligence/embeddings/__init__.py +0 -0
- semantic_code_intelligence/embeddings/enhanced.py +131 -0
- semantic_code_intelligence/embeddings/generator.py +149 -0
- semantic_code_intelligence/embeddings/model_registry.py +100 -0
- semantic_code_intelligence/evolution/__init__.py +1 -0
- semantic_code_intelligence/evolution/budget_guard.py +111 -0
- semantic_code_intelligence/evolution/commit_manager.py +88 -0
- semantic_code_intelligence/evolution/context_builder.py +131 -0
- semantic_code_intelligence/evolution/engine.py +249 -0
- semantic_code_intelligence/evolution/patch_generator.py +229 -0
- semantic_code_intelligence/evolution/task_selector.py +214 -0
- semantic_code_intelligence/evolution/test_runner.py +111 -0
- semantic_code_intelligence/indexing/__init__.py +0 -0
- semantic_code_intelligence/indexing/chunker.py +174 -0
- semantic_code_intelligence/indexing/parallel.py +86 -0
- semantic_code_intelligence/indexing/scanner.py +146 -0
- semantic_code_intelligence/indexing/semantic_chunker.py +337 -0
- semantic_code_intelligence/llm/__init__.py +62 -0
- semantic_code_intelligence/llm/cache.py +219 -0
- semantic_code_intelligence/llm/cached_provider.py +145 -0
- semantic_code_intelligence/llm/conversation.py +190 -0
- semantic_code_intelligence/llm/cross_refactor.py +272 -0
- semantic_code_intelligence/llm/investigation.py +274 -0
- semantic_code_intelligence/llm/mock_provider.py +77 -0
- semantic_code_intelligence/llm/ollama_provider.py +122 -0
- semantic_code_intelligence/llm/openai_provider.py +100 -0
- semantic_code_intelligence/llm/provider.py +92 -0
- semantic_code_intelligence/llm/rate_limiter.py +164 -0
- semantic_code_intelligence/llm/reasoning.py +438 -0
- semantic_code_intelligence/llm/safety.py +110 -0
- semantic_code_intelligence/llm/streaming.py +251 -0
- semantic_code_intelligence/lsp/__init__.py +609 -0
- semantic_code_intelligence/mcp/__init__.py +393 -0
- semantic_code_intelligence/parsing/__init__.py +19 -0
- semantic_code_intelligence/parsing/parser.py +375 -0
- semantic_code_intelligence/plugins/__init__.py +255 -0
- semantic_code_intelligence/plugins/examples/__init__.py +1 -0
- semantic_code_intelligence/plugins/examples/code_quality.py +73 -0
- semantic_code_intelligence/plugins/examples/search_annotator.py +56 -0
- semantic_code_intelligence/scalability/__init__.py +205 -0
- semantic_code_intelligence/search/__init__.py +0 -0
- semantic_code_intelligence/search/formatter.py +123 -0
- semantic_code_intelligence/search/grep.py +361 -0
- semantic_code_intelligence/search/hybrid_search.py +170 -0
- semantic_code_intelligence/search/keyword_search.py +311 -0
- semantic_code_intelligence/search/section_expander.py +103 -0
- semantic_code_intelligence/services/__init__.py +0 -0
- semantic_code_intelligence/services/indexing_service.py +630 -0
- semantic_code_intelligence/services/search_service.py +269 -0
- semantic_code_intelligence/storage/__init__.py +0 -0
- semantic_code_intelligence/storage/chunk_hash_store.py +86 -0
- semantic_code_intelligence/storage/hash_store.py +66 -0
- semantic_code_intelligence/storage/index_manifest.py +85 -0
- semantic_code_intelligence/storage/index_stats.py +138 -0
- semantic_code_intelligence/storage/query_history.py +160 -0
- semantic_code_intelligence/storage/symbol_registry.py +209 -0
- semantic_code_intelligence/storage/vector_store.py +297 -0
- semantic_code_intelligence/tests/__init__.py +0 -0
- semantic_code_intelligence/tests/test_ai_features.py +351 -0
- semantic_code_intelligence/tests/test_chunker.py +119 -0
- semantic_code_intelligence/tests/test_cli.py +188 -0
- semantic_code_intelligence/tests/test_config.py +154 -0
- semantic_code_intelligence/tests/test_context.py +381 -0
- semantic_code_intelligence/tests/test_embeddings.py +73 -0
- semantic_code_intelligence/tests/test_endtoend.py +1142 -0
- semantic_code_intelligence/tests/test_enhanced_embeddings.py +92 -0
- semantic_code_intelligence/tests/test_hash_store.py +79 -0
- semantic_code_intelligence/tests/test_logging.py +55 -0
- semantic_code_intelligence/tests/test_new_cli.py +138 -0
- semantic_code_intelligence/tests/test_parser.py +495 -0
- semantic_code_intelligence/tests/test_phase10.py +355 -0
- semantic_code_intelligence/tests/test_phase11.py +593 -0
- semantic_code_intelligence/tests/test_phase12.py +375 -0
- semantic_code_intelligence/tests/test_phase13.py +663 -0
- semantic_code_intelligence/tests/test_phase14.py +568 -0
- semantic_code_intelligence/tests/test_phase15.py +814 -0
- semantic_code_intelligence/tests/test_phase16.py +792 -0
- semantic_code_intelligence/tests/test_phase17.py +815 -0
- semantic_code_intelligence/tests/test_phase18.py +934 -0
- semantic_code_intelligence/tests/test_phase19.py +986 -0
- semantic_code_intelligence/tests/test_phase20.py +2753 -0
- semantic_code_intelligence/tests/test_phase20b.py +2058 -0
- semantic_code_intelligence/tests/test_phase20c.py +962 -0
- semantic_code_intelligence/tests/test_phase21.py +428 -0
- semantic_code_intelligence/tests/test_phase22.py +799 -0
- semantic_code_intelligence/tests/test_phase23.py +783 -0
- semantic_code_intelligence/tests/test_phase24.py +715 -0
- semantic_code_intelligence/tests/test_phase25.py +496 -0
- semantic_code_intelligence/tests/test_phase26.py +251 -0
- semantic_code_intelligence/tests/test_phase27.py +531 -0
- semantic_code_intelligence/tests/test_phase8.py +592 -0
- semantic_code_intelligence/tests/test_phase9.py +643 -0
- semantic_code_intelligence/tests/test_plugins.py +293 -0
- semantic_code_intelligence/tests/test_priority_features.py +727 -0
- semantic_code_intelligence/tests/test_router.py +41 -0
- semantic_code_intelligence/tests/test_scalability.py +138 -0
- semantic_code_intelligence/tests/test_scanner.py +125 -0
- semantic_code_intelligence/tests/test_search.py +160 -0
- semantic_code_intelligence/tests/test_semantic_chunker.py +255 -0
- semantic_code_intelligence/tests/test_tools.py +182 -0
- semantic_code_intelligence/tests/test_vector_store.py +151 -0
- semantic_code_intelligence/tests/test_watcher.py +211 -0
- semantic_code_intelligence/tools/__init__.py +442 -0
- semantic_code_intelligence/tools/executor.py +232 -0
- semantic_code_intelligence/tools/protocol.py +200 -0
- semantic_code_intelligence/tui/__init__.py +454 -0
- semantic_code_intelligence/utils/__init__.py +0 -0
- semantic_code_intelligence/utils/logging.py +112 -0
- semantic_code_intelligence/version.py +3 -0
- semantic_code_intelligence/web/__init__.py +11 -0
- semantic_code_intelligence/web/api.py +289 -0
- semantic_code_intelligence/web/server.py +397 -0
- semantic_code_intelligence/web/ui.py +659 -0
- semantic_code_intelligence/web/visualize.py +226 -0
- semantic_code_intelligence/workspace/__init__.py +427 -0
|
File without changes
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
"""Enhanced embedding pipeline — semantic preprocessing for code vectors.
|
|
2
|
+
|
|
3
|
+
Wraps the base generator with code-aware preprocessing:
|
|
4
|
+
- Prepends semantic labels to improve embedding quality
|
|
5
|
+
- Normalizes code formatting for more consistent representations
|
|
6
|
+
- Supports batch processing with progress tracking
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import re
|
|
12
|
+
from typing import TYPE_CHECKING, Any
|
|
13
|
+
|
|
14
|
+
import numpy as np
|
|
15
|
+
|
|
16
|
+
from semantic_code_intelligence.embeddings.generator import (
|
|
17
|
+
generate_embeddings,
|
|
18
|
+
get_embedding_dimension,
|
|
19
|
+
get_model,
|
|
20
|
+
)
|
|
21
|
+
from semantic_code_intelligence.indexing.semantic_chunker import SemanticChunk
|
|
22
|
+
from semantic_code_intelligence.utils.logging import get_logger
|
|
23
|
+
|
|
24
|
+
if TYPE_CHECKING:
|
|
25
|
+
pass
|
|
26
|
+
|
|
27
|
+
logger = get_logger("embeddings.enhanced")
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def preprocess_code_for_embedding(content: str, semantic_label: str = "") -> str:
    """Clean up a code string before it is handed to the embedding model.

    Steps applied, in order:
        1. Trailing whitespace is removed from every line.
        2. Each run of consecutive blank lines is reduced to one blank line.
        3. Leading and trailing whitespace is stripped from the joined text.
        4. If given, the semantic label (e.g.
           "[python] function authenticate(user, password)") is placed on
           its own line in front of the text.

    Indentation inside the code is left untouched, apart from step 3
    stripping any leading whitespace of the very first line.

    Args:
        content: Raw code string.
        semantic_label: Optional semantic prefix; skipped when empty.

    Returns:
        Preprocessed text ready for embedding.
    """
    kept: list[str] = []
    previous_blank = False
    for raw_line in content.splitlines():
        line = raw_line.rstrip()
        is_blank = not line
        # Only the first blank line of a run survives.
        if not (is_blank and previous_blank):
            kept.append(line)
        previous_blank = is_blank

    body = "\n".join(kept).strip()
    return f"{semantic_label}\n{body}" if semantic_label else body
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def prepare_semantic_texts(chunks: list[SemanticChunk]) -> list[str]:
    """Turn semantic chunks into the text strings that will be embedded.

    Every chunk's ``content`` is run through
    :func:`preprocess_code_for_embedding` together with its
    ``semantic_label``, so the embedding model also sees the structural
    description of what it is encoding.

    Args:
        chunks: List of SemanticChunk objects.

    Returns:
        List of preprocessed text strings, one per chunk, in input order.
    """
    texts: list[str] = []
    for chunk in chunks:
        texts.append(
            preprocess_code_for_embedding(chunk.content, chunk.semantic_label)
        )
    return texts
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def generate_semantic_embeddings(
    chunks: list[SemanticChunk],
    model_name: str = "all-MiniLM-L6-v2",
    batch_size: int = 64,
    show_progress: bool = False,
) -> np.ndarray:
    """Embed semantic chunks after label-aware preprocessing.

    Recommended entry point of the enhanced pipeline: each chunk is first
    converted to text by :func:`prepare_semantic_texts`, then the texts are
    encoded by :func:`generate_embeddings`.

    Args:
        chunks: List of SemanticChunk objects.
        model_name: Sentence-transformers model name.
        batch_size: Encoding batch size.
        show_progress: Show progress bar.

    Returns:
        NumPy array with one row per chunk. For an empty ``chunks`` list an
        empty float32 array of shape (0, 0) is returned and no model is
        loaded.
    """
    if chunks:
        prepared = prepare_semantic_texts(chunks)
        return generate_embeddings(prepared, model_name, batch_size, show_progress)

    # Nothing to encode: hand back an empty 2-D float32 array.
    return np.zeros((0, 0), dtype=np.float32)
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def generate_query_embedding(
    query: str,
    model_name: str = "all-MiniLM-L6-v2",
) -> np.ndarray:
    """Embed a single search query.

    Queries are natural language rather than code, so the only
    preprocessing is trimming surrounding whitespace.

    Args:
        query: Natural language search query.
        model_name: Sentence-transformers model name.

    Returns:
        The result of :func:`generate_embeddings` for a one-element list
        containing the trimmed query (a single-row array).
    """
    # Light cleanup only — no code-style normalization for queries.
    return generate_embeddings([query.strip()], model_name)
|
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
"""Embedding generator — converts code chunks into vector embeddings.
|
|
2
|
+
|
|
3
|
+
Supports two backends:
|
|
4
|
+
- **sentence-transformers** (default): PyTorch-based, full-featured.
|
|
5
|
+
- **onnx**: Lightweight ONNX Runtime backend via ``optimum`` — lower
|
|
6
|
+
memory (~50% less) and often faster inference on CPU.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import os
|
|
12
|
+
from typing import TYPE_CHECKING
|
|
13
|
+
|
|
14
|
+
import numpy as np
|
|
15
|
+
|
|
16
|
+
from semantic_code_intelligence.embeddings.model_registry import resolve_model_name
|
|
17
|
+
from semantic_code_intelligence.utils.logging import get_logger
|
|
18
|
+
|
|
19
|
+
if TYPE_CHECKING:
|
|
20
|
+
from sentence_transformers import SentenceTransformer
|
|
21
|
+
|
|
22
|
+
logger = get_logger("embeddings")
|
|
23
|
+
|
|
24
|
+
# Module-level cache for loaded model instances
|
|
25
|
+
_model_cache: dict[str, "SentenceTransformer"] = {}
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _configure_hf_token() -> None:
    """Populate ``HF_TOKEN`` from an alternative env var when it is unset.

    If ``HF_TOKEN`` already has a non-empty value nothing happens.
    Otherwise ``HUGGING_FACE_HUB_TOKEN`` and then ``HUGGINGFACE_TOKEN`` are
    consulted, and the first non-empty value is copied into ``HF_TOKEN`` so
    the user only needs to export one of the three.
    """
    if os.environ.get("HF_TOKEN"):
        return

    candidates = map(
        os.environ.get, ("HUGGING_FACE_HUB_TOKEN", "HUGGINGFACE_TOKEN")
    )
    # First non-empty fallback wins; None when neither variable is set.
    token = next((value for value in candidates if value), None)
    if token:
        os.environ["HF_TOKEN"] = token
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _onnx_available() -> bool:
    """Report whether both ``optimum`` and ``onnxruntime`` can be imported."""
    try:
        import optimum  # noqa: F401
        import onnxruntime  # noqa: F401
    except ImportError:
        # Either package missing means the ONNX backend cannot be used.
        return False
    else:
        return True
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def get_model(
    model_name: str = "all-MiniLM-L6-v2",
    backend: str = "auto",
) -> "SentenceTransformer":
    """Load and cache a sentence-transformers model.

    Loaded models are kept in the module-level cache, keyed by the resolved
    model name together with the *requested* backend string, so repeated
    calls with the same arguments return the same instance.

    Args:
        model_name: Name of the model to load (full HF name or alias).
        backend: ``"auto"`` (ONNX if available, else PyTorch),
            ``"onnx"``, or ``"torch"``. Any other value loads with PyTorch.

    Returns:
        A SentenceTransformer model instance. When ONNX was requested or
        auto-selected but is unavailable or fails to load, the PyTorch
        model is returned instead (and cached under the same key).
    """
    model_name = resolve_model_name(model_name)
    cache_key = f"{model_name}:{backend}"

    if cache_key in _model_cache:
        return _model_cache[cache_key]

    _configure_hf_token()
    # Imported lazily so importing this module stays cheap.
    from sentence_transformers import SentenceTransformer

    use_onnx = False
    if backend == "onnx":
        if _onnx_available():
            use_onnx = True
        else:
            logger.warning("ONNX requested but optimum/onnxruntime not installed; falling back to PyTorch.")
    elif backend == "auto" and _onnx_available():
        use_onnx = True

    logger.info("Loading embedding model: %s (backend=%s)", model_name, "onnx" if use_onnx else "torch")

    if use_onnx:
        try:
            onnx_model = SentenceTransformer(model_name, backend="onnx")
        except Exception as exc:
            # Best-effort: any ONNX failure falls through to PyTorch, but the
            # reason is logged so the fallback can be diagnosed.
            logger.warning("ONNX load failed (%s); falling back to PyTorch.", exc)
        else:
            _model_cache[cache_key] = onnx_model
            logger.info("Model loaded with ONNX backend.")
            return onnx_model

    _model_cache[cache_key] = SentenceTransformer(model_name)
    logger.info("Model loaded successfully (PyTorch).")
    return _model_cache[cache_key]
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def generate_embeddings(
    texts: list[str],
    model_name: str = "all-MiniLM-L6-v2",
    batch_size: int = 64,
    show_progress: bool = False,
    backend: str = "auto",
) -> np.ndarray:
    """Encode a list of text strings into float32 embedding vectors.

    Args:
        texts: List of code/text strings to embed.
        model_name: Name of the sentence-transformers model (or alias).
        batch_size: Batch size for encoding.
        show_progress: Whether to show a progress bar.
        backend: ``"auto"``, ``"onnx"``, or ``"torch"``.

    Returns:
        NumPy float32 array with one row per input text, encoded with
        ``normalize_embeddings=True``. For an empty ``texts`` list an empty
        array of shape (0, 0) is returned without loading any model.
    """
    if not texts:
        return np.zeros((0, 0), dtype=np.float32)

    encoder = get_model(model_name, backend=backend)
    encode_options = {
        "batch_size": batch_size,
        "show_progress_bar": show_progress,
        "convert_to_numpy": True,
        "normalize_embeddings": True,
    }
    vectors = encoder.encode(texts, **encode_options)
    # Force float32 regardless of the dtype the backend produced.
    return vectors.astype(np.float32)
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def get_embedding_dimension(
    model_name: str = "all-MiniLM-L6-v2",
    backend: str = "auto",
) -> int:
    """Look up the vector size produced by the given embedding model.

    The model is obtained through :func:`get_model`, so it is loaded (and
    cached) if it has not been used yet.

    Args:
        model_name: Name of the sentence-transformers model (or alias).
        backend: ``"auto"``, ``"onnx"``, or ``"torch"``.

    Returns:
        Integer dimension of the embedding vectors.

    Raises:
        RuntimeError: If the model reports ``None`` as its dimension.
    """
    dimension = get_model(model_name, backend=backend).get_sentence_embedding_dimension()
    if dimension is not None:
        return dimension
    raise RuntimeError(f"Model {model_name!r} returned None for embedding dimension")
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
"""Embedding model registry — defines available models and their properties.
|
|
2
|
+
|
|
3
|
+
Provides a catalogue of supported embedding models so users can switch
|
|
4
|
+
between models optimised for different use-cases (code-heavy, doc-heavy,
|
|
5
|
+
speed vs quality, etc.).
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from dataclasses import dataclass
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@dataclass(frozen=True)
class ModelInfo:
    """Immutable description of one supported embedding model."""

    # Full model identifier; also used as the catalogue key.
    name: str
    # Human-friendly label for listings.
    display_name: str
    # Size of the embedding vectors the model produces.
    dimension: int
    # One-line summary of the model.
    description: str
    # Short note on the workloads the model suits best.
    recommended_for: str
    backend: str = "sentence-transformers"  # or "onnx"
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
# Built-in model catalogue — mirrors ck's 4 models plus extras.
# Each entry is keyed by its own full model identifier (``ModelInfo.name``).
AVAILABLE_MODELS: dict[str, ModelInfo] = {
    info.name: info
    for info in (
        ModelInfo(
            name="all-MiniLM-L6-v2",
            display_name="MiniLM L6 v2",
            dimension=384,
            description="Default balanced model — good quality, fast inference.",
            recommended_for="General purpose, balanced speed/quality.",
            backend="sentence-transformers",
        ),
        ModelInfo(
            name="BAAI/bge-small-en-v1.5",
            display_name="BGE Small EN v1.5",
            dimension=384,
            description="Compact BGE model — strong text retrieval performance.",
            recommended_for="Retrieval-heavy workloads, lower memory.",
            backend="sentence-transformers",
        ),
        ModelInfo(
            name="nomic-ai/nomic-embed-text-v1.5",
            display_name="Nomic Embed Text v1.5",
            dimension=768,
            description="High-quality long-context model (8192 tokens).",
            recommended_for="Documentation-heavy repos, long files.",
            backend="sentence-transformers",
        ),
        ModelInfo(
            name="jinaai/jina-embeddings-v2-base-code",
            display_name="Jina Code v2",
            dimension=768,
            description="Code-specialised model trained on programming languages.",
            recommended_for="Code-heavy repos, programming-specific search.",
            backend="sentence-transformers",
        ),
        ModelInfo(
            name="mixedbread-ai/mxbai-embed-xsmall-v1",
            display_name="Mixedbread XSmall v1",
            dimension=384,
            description="Ultra-compact model — fastest inference, smallest footprint.",
            recommended_for="Large repos where speed matters most.",
            backend="sentence-transformers",
        ),
    )
}

# Shorthand aliases for CLI convenience (lower-case alias -> full identifier).
MODEL_ALIASES: dict[str, str] = {
    "minilm": "all-MiniLM-L6-v2",
    "bge-small": "BAAI/bge-small-en-v1.5",
    "nomic": "nomic-ai/nomic-embed-text-v1.5",
    "jina-code": "jinaai/jina-embeddings-v2-base-code",
    "mxbai-xsmall": "mixedbread-ai/mxbai-embed-xsmall-v1",
}

# Model identifier used when the caller does not pick one.
DEFAULT_MODEL = "all-MiniLM-L6-v2"
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def resolve_model_name(name_or_alias: str) -> str:
    """Map a model name or shorthand alias to the full model identifier.

    Catalogue names are returned unchanged. Otherwise the lower-cased input
    is looked up among the aliases. Anything still unmatched is returned
    as-is and treated as a custom HF model name.
    """
    if name_or_alias not in AVAILABLE_MODELS:
        alias_target = MODEL_ALIASES.get(name_or_alias.lower())
        if alias_target:
            return alias_target
    # Either a catalogue name or an unknown (custom) identifier.
    return name_or_alias
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def get_model_info(name_or_alias: str) -> ModelInfo | None:
    """Return the catalogue entry for a name or alias, or None if unknown."""
    return AVAILABLE_MODELS.get(resolve_model_name(name_or_alias))
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def list_models() -> list[ModelInfo]:
    """Return every catalogue entry as a new list, in catalogue order."""
    return [*AVAILABLE_MODELS.values()]
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Self-improving development loop — LLM-driven incremental code evolution."""
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
"""Budget guard — tracks token usage, iterations, and wall-clock time.
|
|
2
|
+
|
|
3
|
+
Enforces hard limits so that the evolution loop cannot run away with
|
|
4
|
+
unbounded LLM calls. The guard is passed through every stage and
|
|
5
|
+
checked before each LLM invocation.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import time
|
|
11
|
+
from dataclasses import dataclass, field
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass
class BudgetGuard:
    """Keeps the evolution loop within token, iteration and time limits.

    Parameters
    ----------
    max_tokens : int
        Maximum total tokens (prompt + completion) across all LLM calls.
    max_iterations : int
        Maximum evolution iterations to attempt.
    max_seconds : float
        Maximum wall-clock seconds before the loop is force-stopped.
    """

    max_tokens: int = 20_000
    max_iterations: int = 5
    max_seconds: float = 600.0  # 10 minutes default

    # Running counters, updated through the record_* methods.
    tokens_used: int = 0
    iterations_done: int = 0
    # time.time() captured by start(); 0.0 means "not started yet".
    _start_time: float = field(default=0.0, repr=False)

    # ------------------------------------------------------------------ #
    # Lifecycle
    # ------------------------------------------------------------------ #

    def start(self) -> None:
        """Record the current time as the start of the evolution run."""
        self._start_time = time.time()

    @property
    def elapsed_seconds(self) -> float:
        """Seconds since :meth:`start`; ``0.0`` while the run is not started."""
        started_at = self._start_time
        return time.time() - started_at if started_at != 0.0 else 0.0

    # ------------------------------------------------------------------ #
    # Checks
    # ------------------------------------------------------------------ #

    @property
    def tokens_remaining(self) -> int:
        """Tokens left in the budget, never below zero."""
        left = self.max_tokens - self.tokens_used
        return left if left > 0 else 0

    @property
    def iterations_remaining(self) -> int:
        """Iterations left before the limit, never below zero."""
        left = self.max_iterations - self.iterations_done
        return left if left > 0 else 0

    def can_continue(self) -> bool:
        """Return ``True`` while no limit (iterations, tokens, time) is hit."""
        limits_hit = (
            self.iterations_done >= self.max_iterations
            or self.tokens_used >= self.max_tokens
        )
        if limits_hit:
            return False
        # The time limit only applies once start() has been called.
        return not (self._start_time > 0.0 and self.elapsed_seconds >= self.max_seconds)

    def stop_reason(self) -> str | None:
        """Describe the first exhausted limit, or return ``None`` if none is.

        Limits are checked in the same order as :meth:`can_continue`:
        iterations, then tokens, then elapsed time.
        """
        reason: str | None = None
        if self.iterations_done >= self.max_iterations:
            reason = f"iteration limit reached ({self.max_iterations})"
        elif self.tokens_used >= self.max_tokens:
            reason = f"token budget exhausted ({self.tokens_used}/{self.max_tokens})"
        elif self._start_time > 0.0 and self.elapsed_seconds >= self.max_seconds:
            reason = f"time limit reached ({self.elapsed_seconds:.0f}s/{self.max_seconds:.0f}s)"
        return reason

    # ------------------------------------------------------------------ #
    # Recording
    # ------------------------------------------------------------------ #

    def record_tokens(self, tokens: int) -> None:
        """Add the token count of one LLM call to the running total."""
        self.tokens_used = self.tokens_used + tokens

    def record_iteration(self) -> None:
        """Count one more completed iteration."""
        self.iterations_done = self.iterations_done + 1

    # ------------------------------------------------------------------ #
    # Summary
    # ------------------------------------------------------------------ #

    def summary(self) -> dict[str, object]:
        """Return a snapshot of usage and limits as a plain dict."""
        snapshot: dict[str, object] = {}
        snapshot["tokens_used"] = self.tokens_used
        snapshot["tokens_max"] = self.max_tokens
        snapshot["iterations_done"] = self.iterations_done
        snapshot["iterations_max"] = self.max_iterations
        snapshot["elapsed_seconds"] = round(self.elapsed_seconds, 2)
        snapshot["max_seconds"] = self.max_seconds
        return snapshot
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
"""Commit manager — handles git add / commit / revert for evolution patches.
|
|
2
|
+
|
|
3
|
+
All git operations run as subprocesses against the project root.
|
|
4
|
+
The manager only commits files that the evolution loop explicitly touched.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import subprocess
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
|
|
12
|
+
from semantic_code_intelligence.utils.logging import get_logger
|
|
13
|
+
|
|
14
|
+
logger = get_logger("evolution.commit_manager")
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class CommitManager:
    """Thin wrapper around git for commit/revert cycles in one project.

    Every command runs as a subprocess whose working directory is the
    resolved project root. A failing git command does not raise: it is
    logged as a warning and whatever git wrote to stdout is returned.
    """

    def __init__(self, project_root: Path) -> None:
        """Remember the resolved project root that git commands run in."""
        self._root = project_root.resolve()

    # ------------------------------------------------------------------ #
    # Queries
    # ------------------------------------------------------------------ #

    def git_diff(self, staged: bool = False) -> str:
        """Return the current ``git diff`` output (unstaged by default).

        Args:
            staged: When true, pass ``--staged`` to git.
        """
        cmd = ["git", "diff"]
        if staged:
            cmd.append("--staged")
        return self._run(cmd)

    def git_diff_stat(self) -> str:
        """Return the output of ``git diff --stat``."""
        return self._run(["git", "diff", "--stat"])

    def has_changes(self) -> bool:
        """Return ``True`` if ``git status --porcelain`` prints anything."""
        output = self._run(["git", "status", "--porcelain"])
        return bool(output.strip())

    # ------------------------------------------------------------------ #
    # Mutations
    # ------------------------------------------------------------------ #

    def stage_files(self, paths: list[str]) -> None:
        """``git add`` a list of relative file paths; no-op for an empty list."""
        if not paths:
            return
        # "--" keeps git from interpreting any path as an option.
        self._run(["git", "add", "--"] + paths)

    def commit(self, message: str) -> str:
        """Run ``git commit -m`` and return the short SHA of ``HEAD``.

        The SHA is read from ``HEAD`` after the commit command, so if the
        commit itself did nothing (or failed) it is the SHA of the existing
        ``HEAD``; a failure is reported by the warning logged in ``_run``.
        """
        self._run(["git", "commit", "-m", message])
        sha = self._run(["git", "rev-parse", "--short", "HEAD"]).strip()
        logger.info("Committed %s: %s", sha, message)
        return sha

    def revert_files(self, paths: list[str]) -> None:
        """Restore files via ``git checkout --``; no-op for an empty list."""
        if not paths:
            return
        self._run(["git", "checkout", "--"] + paths)
        logger.info("Reverted %d file(s).", len(paths))

    def stash_push(self, message: str = "evolution-wip") -> None:
        """Stash current changes under the given stash message."""
        self._run(["git", "stash", "push", "-m", message])

    def stash_pop(self) -> None:
        """Pop the most recent stash."""
        self._run(["git", "stash", "pop"])

    # ------------------------------------------------------------------ #
    # Internal
    # ------------------------------------------------------------------ #

    def _run(self, cmd: list[str]) -> str:
        """Run a command in the project root and return its stdout.

        Args:
            cmd: Argument vector, executed without a shell.

        Returns:
            Captured stdout as text, also when the command fails.
        """
        proc = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            cwd=str(self._root),
        )
        # A non-zero exit whose stdout says "nothing to commit" is expected
        # and stays quiet; every other failure is surfaced as a warning with
        # the command, so it is visible at the default log level.
        if proc.returncode != 0 and "nothing to commit" not in proc.stdout:
            logger.warning(
                "git command failed (exit %d): %s: %s",
                proc.returncode,
                " ".join(cmd),
                proc.stderr.strip(),
            )
        return proc.stdout
|