codeembed 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codeembed/__init__.py +59 -0
- codeembed/bootstrap/__init__.py +17 -0
- codeembed/bootstrap/services.py +220 -0
- codeembed/cli.py +454 -0
- codeembed/config/__init__.py +5 -0
- codeembed/config/models.py +13 -0
- codeembed/cost_tracking/__init__.py +7 -0
- codeembed/cost_tracking/llm_wrapper.py +39 -0
- codeembed/cost_tracking/models.py +52 -0
- codeembed/delta_computer/__init__.py +5 -0
- codeembed/delta_computer/delta_computer.py +75 -0
- codeembed/doc_embedder/__init__.py +5 -0
- codeembed/doc_embedder/doc_embedder.py +134 -0
- codeembed/doc_provider/__init__.py +10 -0
- codeembed/doc_provider/base.py +14 -0
- codeembed/doc_provider/local_doc_provider.py +58 -0
- codeembed/doc_provider/models.py +20 -0
- codeembed/doc_search_service/__init__.py +5 -0
- codeembed/doc_search_service/doc_search_service.py +48 -0
- codeembed/doc_splitters/__init__.py +8 -0
- codeembed/doc_splitters/generic_splitter.py +165 -0
- codeembed/doc_splitters/models.py +14 -0
- codeembed/llm/__init__.py +13 -0
- codeembed/llm/base.py +31 -0
- codeembed/llm/models.py +27 -0
- codeembed/llm/ollama_adapter.py +64 -0
- codeembed/llm/openai_adapter.py +96 -0
- codeembed/mcp_server.py +45 -0
- codeembed/setup_logger.py +34 -0
- codeembed/utils/__init__.py +9 -0
- codeembed/utils/checksum_utils.py +5 -0
- codeembed/utils/string_utils.py +5 -0
- codeembed/utils/time_utils.py +5 -0
- codeembed/vector_db/__init__.py +9 -0
- codeembed/vector_db/base.py +27 -0
- codeembed/vector_db/chromadb_adapter.py +130 -0
- codeembed/vector_db/models.py +16 -0
- codeembed-0.1.0.dist-info/METADATA +292 -0
- codeembed-0.1.0.dist-info/RECORD +42 -0
- codeembed-0.1.0.dist-info/WHEEL +4 -0
- codeembed-0.1.0.dist-info/entry_points.txt +2 -0
- codeembed-0.1.0.dist-info/licenses/LICENSE +21 -0
codeembed/__init__.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
"""Public API of the codeembed package.

Re-exports the user-facing classes and the bootstrap factory helpers so
callers can write `import codeembed` instead of reaching into submodules.
"""

from codeembed.bootstrap import (
    embed_loop,
    get_config,
    get_embedder_service,
    get_llm_service,
    get_search_service,
    get_session,
)
from codeembed.config import CodeEmbedConfig
from codeembed.cost_tracking import LLMServiceWithCostTracking, Session
from codeembed.delta_computer import DeltaComputer
from codeembed.doc_embedder import DocEmbedder
from codeembed.doc_provider import DocProviderBase, DocumentContent, DocumentMeta, LocalDocProvider
from codeembed.doc_search_service import DocSearchService
from codeembed.doc_splitters import FileSegment, FileSplitter, SplittedFile
from codeembed.llm import (
    ChatMessage,
    LLMResponse,
    LLMServiceBase,
    OllamaLLMService,
    OpenAILLMService,
    StructuredLLMResponse,
)
from codeembed.utils import string_to_sha256, truncate_string, utc_now
from codeembed.vector_db import ChromaDbAdapter, Chunk, VectorDbBase

# Explicit public API; kept alphabetically sorted (classes first, then
# functions, mirroring the import groups above).
__all__ = [
    "ChatMessage",
    "ChromaDbAdapter",
    "Chunk",
    "CodeEmbedConfig",
    "DeltaComputer",
    "DocEmbedder",
    "DocProviderBase",
    "DocSearchService",
    "DocumentContent",
    "DocumentMeta",
    "FileSegment",
    "FileSplitter",
    "LLMResponse",
    "LLMServiceBase",
    "LLMServiceWithCostTracking",
    "LocalDocProvider",
    "OllamaLLMService",
    "OpenAILLMService",
    "Session",
    "SplittedFile",
    "StructuredLLMResponse",
    "VectorDbBase",
    "embed_loop",
    "get_config",
    "get_embedder_service",
    "get_llm_service",
    "get_search_service",
    "get_session",
    "string_to_sha256",
    "truncate_string",
    "utc_now",
]
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
"""Bootstrap package: cached factory helpers that wire up codeembed services."""

from codeembed.bootstrap.services import (
    embed_loop,
    get_config,
    get_embedder_service,
    get_llm_service,
    get_search_service,
    get_session,
)

__all__ = [
    "embed_loop",
    "get_config",
    "get_embedder_service",
    "get_llm_service",
    "get_search_service",
    "get_session",
]
|
|
@@ -0,0 +1,220 @@
|
|
|
1
|
+
import asyncio
import logging
import os
import tomllib
from functools import lru_cache

from codeembed.config.models import CodeEmbedConfig
from codeembed.cost_tracking.llm_wrapper import LLMServiceWithCostTracking
from codeembed.cost_tracking.models import Session
from codeembed.doc_embedder.doc_embedder import DocEmbedder
from codeembed.doc_provider.local_doc_provider import LocalDocProvider
from codeembed.doc_search_service.doc_search_service import DocSearchService
from codeembed.llm.base import LLMServiceBase
from codeembed.llm.ollama_adapter import OllamaLLMService
from codeembed.vector_db.chromadb_adapter import ChromaDbAdapter

logger = logging.getLogger(__name__)

# File extensions the embedder will index (see get_embedder_service()).
_SUPPORTED_FILE_EXTENSIONS = ["py", "md", "ts", "tsx", "js", "jsx"]
# Optional TOML config file, looked up in the current working directory.
_CONFIG_FILE_PATH = "codeembed.toml"
# Defaults applied when no config file is present or it fails to parse.
_DEFAULT_LLM_MODEL = "gpt-oss:20b"
_DEFAULT_DEBOUNCE = 10       # seconds; passed as DocEmbedder debounce_seconds
_DEFAULT_SLEEP_INTERVAL = 60  # seconds between embed_loop() iterations
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@lru_cache(maxsize=1)
def get_search_service() -> DocSearchService:
    """Return the process-wide document search service (created once).

    Backed by the shared "codebase" ChromaDB collection.
    """
    return DocSearchService(ChromaDbAdapter(collection_name="codebase"))
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@lru_cache(maxsize=1)
def get_config() -> CodeEmbedConfig:
    """Load the CodeEmbed configuration, cached for the process lifetime.

    Reads the ``[codeembed]`` table from ``codeembed.toml`` in the current
    working directory when present. Any failure to read, parse, or validate
    the file falls back to built-in defaults — but is now logged instead of
    being swallowed silently, so users learn their config was ignored.

    Returns:
        The parsed configuration, or a default CodeEmbedConfig.
    """
    if os.path.isfile(_CONFIG_FILE_PATH):
        try:
            with open(_CONFIG_FILE_PATH, "rb") as f:
                data = tomllib.load(f)
            return CodeEmbedConfig(**data["codeembed"])
        except Exception:
            # Deliberate best-effort: a broken config must not stop the tool,
            # but the failure is surfaced rather than silently discarded.
            logger.exception(
                "Failed to load %s; falling back to default configuration",
                _CONFIG_FILE_PATH,
            )
    return CodeEmbedConfig(
        llm_model=_DEFAULT_LLM_MODEL,
        debounce=_DEFAULT_DEBOUNCE,
        sleep_interval=_DEFAULT_SLEEP_INTERVAL,
    )
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def _get_llm_service() -> LLMServiceBase:
    """Build the raw (untracked) LLM service for the configured provider.

    Credential resolution order — first match wins:
      1. Ollama (``provider == "ollama"``; no credentials needed).
      2. OpenAI or any OpenAI-compatible endpoint via an API key.
      3. Azure OpenAI with an API key.
      4. Azure OpenAI via Entra ID / RBAC (DefaultAzureCredential).

    Explicit env-var names from the config (``llm_api_endpoint_env_var`` /
    ``llm_api_key_env_var``) override the well-known OPENAI_* / AZURE_* vars.

    Raises:
        ValueError: unsupported provider, invalid Azure endpoint, or no
            usable credential source found.
        ImportError: the optional "openai" extra is not installed.
    """
    config = get_config()

    #
    # Ollama
    #
    if config.provider == "ollama":
        return OllamaLLMService()

    #
    # OpenAI-compatible providers
    #
    if config.provider != "openai":
        raise ValueError(f"Unsupported LLM provider: {config.provider}")

    # The OpenAI/Azure SDKs are optional dependencies; import lazily so the
    # Ollama-only install stays lightweight.
    try:
        from azure.identity import DefaultAzureCredential, get_bearer_token_provider
        from openai import OpenAI

        from codeembed.llm.openai_adapter import OpenAILLMService
    except ImportError as e:
        raise ImportError(
            "OpenAI provider requires optional dependencies. Install them with:\n uv tool install 'codeembed[openai]'"
        ) from e

    #
    # Explicit config env-var overrides
    #
    custom_endpoint = os.getenv(config.llm_api_endpoint_env_var) if config.llm_api_endpoint_env_var else None

    custom_api_key = os.getenv(config.llm_api_key_env_var) if config.llm_api_key_env_var else None

    #
    # Generic OpenAI-compatible configuration
    #
    openai_api_key = custom_api_key or os.getenv("OPENAI_API_KEY")

    openai_base_url = custom_endpoint or os.getenv("OPENAI_BASE_URL")

    #
    # Azure OpenAI configuration
    #
    azure_openai_endpoint = custom_endpoint or os.getenv("AZURE_OPENAI_ENDPOINT")

    azure_openai_api_key = custom_api_key or os.getenv("AZURE_OPENAI_API_KEY")

    #
    # ----------------------------------------------------------
    # Standard OpenAI or OpenAI-compatible endpoint
    #
    # Examples:
    # - OpenAI cloud
    # - vLLM
    # - LM Studio
    # - Ollama OpenAI shim
    # - OpenRouter
    # - local gateways
    # ----------------------------------------------------------
    #
    if openai_api_key:
        client = OpenAI(
            api_key=openai_api_key,
            base_url=openai_base_url,
        )

        return OpenAILLMService(client)

    #
    # ----------------------------------------------------------
    # Azure OpenAI with API key
    # ----------------------------------------------------------
    #
    if azure_openai_endpoint and azure_openai_api_key:
        client = OpenAI(
            base_url=azure_openai_endpoint,
            api_key=azure_openai_api_key,
        )

        return OpenAILLMService(client)

    #
    # ----------------------------------------------------------
    # Azure OpenAI with RBAC / Entra ID
    #
    # DefaultAzureCredential tries these in order:
    # - Environment / service principal (AZURE_CLIENT_ID etc.)
    # - Workload Identity
    # - Managed Identity
    # - VS Code Azure sign-in
    # - Azure CLI (az login)
    # - Azure PowerShell (Connect-AzAccount)
    # - Azure Developer CLI (azd auth login)
    # - Interactive browser (last resort; enabled via exclude_interactive_browser_credential=False)
    # ----------------------------------------------------------
    #
    if azure_openai_endpoint:
        # expected format: https://<resource-name>.openai.azure.com/openai/v1/
        if not azure_openai_endpoint.startswith("https://") or ".openai.azure.com" not in azure_openai_endpoint:
            raise ValueError(f"Invalid Azure OpenAI endpoint: {azure_openai_endpoint}")
        elif not azure_openai_endpoint.endswith("/openai/v1/"):
            # Non-fatal: some gateways rewrite the path, so only warn.
            logger.warning(f"Azure OpenAI endpoint {azure_openai_endpoint} does not end with the expected /openai/v1/.")

        credential = DefaultAzureCredential(
            exclude_interactive_browser_credential=False,
        )

        # Bearer tokens are minted per-request against the Cognitive
        # Services scope and passed where the API key would normally go.
        token_provider = get_bearer_token_provider(
            credential,
            "https://cognitiveservices.azure.com/.default",
        )

        client = OpenAI(
            base_url=azure_openai_endpoint,
            api_key=token_provider,
        )

        return OpenAILLMService(client)

    raise ValueError(
        "Unable to configure OpenAI client.\n"
        "Expected one of:\n"
        "- OPENAI_API_KEY\n"
        "- AZURE_OPENAI_API_KEY + AZURE_OPENAI_ENDPOINT\n"
        "- AZURE_OPENAI_ENDPOINT with RBAC-enabled identity"
    )
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
@lru_cache(maxsize=1)
def get_session() -> Session:
    """Return the process-wide cost-tracking session (created once)."""
    session = Session()
    return session
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
@lru_cache(maxsize=1)
def get_llm_service() -> LLMServiceBase:
    """Return the shared LLM service, wrapped with cost tracking.

    The provider-specific service from _get_llm_service() is decorated with
    LLMServiceWithCostTracking so token usage accrues to the shared session.
    """
    return LLMServiceWithCostTracking(_get_llm_service(), get_session())
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
@lru_cache(maxsize=1)
def get_embedder_service() -> DocEmbedder:
    """Return the shared DocEmbedder (created once).

    Wires a LocalDocProvider rooted at the current directory to the
    "codebase" ChromaDB collection and the cost-tracked LLM service,
    using the model and debounce settings from the config.
    """
    cfg = get_config()
    provider = LocalDocProvider(
        base_path=".",
        supported_file_extensions=_SUPPORTED_FILE_EXTENSIONS,
    )
    db = ChromaDbAdapter(collection_name="codebase")
    return DocEmbedder(
        provider,
        db,
        get_llm_service(),
        llm_model=cfg.llm_model,
        debounce_seconds=cfg.debounce,
    )
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
async def embed_loop() -> None:
    """Run the embedding pipeline forever, sleeping between iterations.

    Each iteration embeds the codebase in a worker thread (so the event
    loop stays responsive), persists the cost-tracking session, logs token
    usage, and sleeps for the configured interval. A failed run is logged
    and does not stop the loop.
    """
    embedder = get_embedder_service()
    session = get_session()
    config = get_config()
    while True:
        try:
            # embed_codebase() is blocking; run it off the event loop.
            await asyncio.to_thread(embedder.embed_codebase)
        except Exception:
            logger.exception("Embedding run failed")
        session.save()
        # Fix: the two concatenated f-strings previously joined without a
        # space, producing "...seconds.Sleeping for...".
        logger.info(
            f"Input tokens used: {session.input_tokens}, output tokens used: {session.output_tokens}. "
            f"Sleeping for {config.sleep_interval} seconds..."
        )
        await asyncio.sleep(config.sleep_interval)
|