codeembed 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. codeembed/__init__.py +59 -0
  2. codeembed/bootstrap/__init__.py +17 -0
  3. codeembed/bootstrap/services.py +220 -0
  4. codeembed/cli.py +454 -0
  5. codeembed/config/__init__.py +5 -0
  6. codeembed/config/models.py +13 -0
  7. codeembed/cost_tracking/__init__.py +7 -0
  8. codeembed/cost_tracking/llm_wrapper.py +39 -0
  9. codeembed/cost_tracking/models.py +52 -0
  10. codeembed/delta_computer/__init__.py +5 -0
  11. codeembed/delta_computer/delta_computer.py +75 -0
  12. codeembed/doc_embedder/__init__.py +5 -0
  13. codeembed/doc_embedder/doc_embedder.py +134 -0
  14. codeembed/doc_provider/__init__.py +10 -0
  15. codeembed/doc_provider/base.py +14 -0
  16. codeembed/doc_provider/local_doc_provider.py +58 -0
  17. codeembed/doc_provider/models.py +20 -0
  18. codeembed/doc_search_service/__init__.py +5 -0
  19. codeembed/doc_search_service/doc_search_service.py +48 -0
  20. codeembed/doc_splitters/__init__.py +8 -0
  21. codeembed/doc_splitters/generic_splitter.py +165 -0
  22. codeembed/doc_splitters/models.py +14 -0
  23. codeembed/llm/__init__.py +13 -0
  24. codeembed/llm/base.py +31 -0
  25. codeembed/llm/models.py +27 -0
  26. codeembed/llm/ollama_adapter.py +64 -0
  27. codeembed/llm/openai_adapter.py +96 -0
  28. codeembed/mcp_server.py +45 -0
  29. codeembed/setup_logger.py +34 -0
  30. codeembed/utils/__init__.py +9 -0
  31. codeembed/utils/checksum_utils.py +5 -0
  32. codeembed/utils/string_utils.py +5 -0
  33. codeembed/utils/time_utils.py +5 -0
  34. codeembed/vector_db/__init__.py +9 -0
  35. codeembed/vector_db/base.py +27 -0
  36. codeembed/vector_db/chromadb_adapter.py +130 -0
  37. codeembed/vector_db/models.py +16 -0
  38. codeembed-0.1.0.dist-info/METADATA +292 -0
  39. codeembed-0.1.0.dist-info/RECORD +42 -0
  40. codeembed-0.1.0.dist-info/WHEEL +4 -0
  41. codeembed-0.1.0.dist-info/entry_points.txt +2 -0
  42. codeembed-0.1.0.dist-info/licenses/LICENSE +21 -0
codeembed/__init__.py ADDED
@@ -0,0 +1,59 @@
"""Public API for the codeembed package.

Re-exports the bootstrap factories, configuration model, services, LLM
adapters, vector-DB adapters, and utility helpers so consumers can import
everything directly from ``codeembed``.
"""

from codeembed.bootstrap import (
    embed_loop,
    get_config,
    get_embedder_service,
    get_llm_service,
    get_search_service,
    get_session,
)
from codeembed.config import CodeEmbedConfig
from codeembed.cost_tracking import LLMServiceWithCostTracking, Session
from codeembed.delta_computer import DeltaComputer
from codeembed.doc_embedder import DocEmbedder
from codeembed.doc_provider import DocProviderBase, DocumentContent, DocumentMeta, LocalDocProvider
from codeembed.doc_search_service import DocSearchService
from codeembed.doc_splitters import FileSegment, FileSplitter, SplittedFile
from codeembed.llm import (
    ChatMessage,
    LLMResponse,
    LLMServiceBase,
    OllamaLLMService,
    OpenAILLMService,
    StructuredLLMResponse,
)
from codeembed.utils import string_to_sha256, truncate_string, utc_now
from codeembed.vector_db import ChromaDbAdapter, Chunk, VectorDbBase

# Explicit public API (kept alphabetically sorted).
__all__ = [
    "ChatMessage",
    "ChromaDbAdapter",
    "Chunk",
    "CodeEmbedConfig",
    "DeltaComputer",
    "DocEmbedder",
    "DocProviderBase",
    "DocSearchService",
    "DocumentContent",
    "DocumentMeta",
    "FileSegment",
    "FileSplitter",
    "LLMResponse",
    "LLMServiceBase",
    "LLMServiceWithCostTracking",
    "LocalDocProvider",
    "OllamaLLMService",
    "OpenAILLMService",
    "Session",
    "SplittedFile",
    "StructuredLLMResponse",
    "VectorDbBase",
    "embed_loop",
    "get_config",
    "get_embedder_service",
    "get_llm_service",
    "get_search_service",
    "get_session",
    "string_to_sha256",
    "truncate_string",
    "utc_now",
]
@@ -0,0 +1,17 @@
"""Bootstrap package: cached singleton factories and the embedding loop."""

from codeembed.bootstrap.services import (
    embed_loop,
    get_config,
    get_embedder_service,
    get_llm_service,
    get_search_service,
    get_session,
)

__all__ = [
    "embed_loop",
    "get_config",
    "get_embedder_service",
    "get_llm_service",
    "get_search_service",
    "get_session",
]
@@ -0,0 +1,220 @@
"""Wiring for codeembed services: cached singleton factories and the embed loop."""

import asyncio
import logging
import os
import tomllib
from functools import lru_cache

from codeembed.config.models import CodeEmbedConfig
from codeembed.cost_tracking.llm_wrapper import LLMServiceWithCostTracking
from codeembed.cost_tracking.models import Session
from codeembed.doc_embedder.doc_embedder import DocEmbedder
from codeembed.doc_provider.local_doc_provider import LocalDocProvider
from codeembed.doc_search_service.doc_search_service import DocSearchService
from codeembed.llm.base import LLMServiceBase
from codeembed.llm.ollama_adapter import OllamaLLMService
from codeembed.vector_db.chromadb_adapter import ChromaDbAdapter

logger = logging.getLogger(__name__)

# File extensions (without the leading dot) that the embedder will pick up.
_SUPPORTED_FILE_EXTENSIONS = ["py", "md", "ts", "tsx", "js", "jsx"]
# Optional config file, looked up relative to the current working directory.
_CONFIG_FILE_PATH = "codeembed.toml"
# Defaults used when no codeembed.toml is found (or it fails to parse).
_DEFAULT_LLM_MODEL = "gpt-oss:20b"
_DEFAULT_DEBOUNCE = 10  # passed to DocEmbedder as debounce_seconds
_DEFAULT_SLEEP_INTERVAL = 60  # seconds between embed runs in embed_loop
24
+
25
+
26
@lru_cache(maxsize=1)
def get_search_service() -> DocSearchService:
    """Return the process-wide document search service (built once, then cached)."""
    return DocSearchService(ChromaDbAdapter(collection_name="codebase"))
31
+
32
+
33
@lru_cache(maxsize=1)
def get_config() -> CodeEmbedConfig:
    """Load the codeembed configuration (cached for the process lifetime).

    Reads the ``[codeembed]`` table from ``codeembed.toml`` in the current
    working directory when present. If the file is missing, unreadable,
    malformed, or fails validation, falls back to built-in defaults.

    Returns:
        The parsed ``CodeEmbedConfig``, or a default one on any failure.
    """
    if os.path.isfile(_CONFIG_FILE_PATH):
        try:
            with open(_CONFIG_FILE_PATH, "rb") as f:
                data = tomllib.load(f)
            return CodeEmbedConfig(**data["codeembed"])
        except Exception as e:
            # Best-effort load: keep the default-config fallback, but surface
            # the problem instead of silently ignoring a broken config file.
            logger.warning(
                "Failed to load config from %s (%s: %s); using defaults.",
                _CONFIG_FILE_PATH,
                type(e).__name__,
                e,
            )
    return CodeEmbedConfig(
        llm_model=_DEFAULT_LLM_MODEL,
        debounce=_DEFAULT_DEBOUNCE,
        sleep_interval=_DEFAULT_SLEEP_INTERVAL,
    )
49
+
50
+
51
def _get_llm_service() -> LLMServiceBase:
    """Build the raw (untracked) LLM service for the configured provider.

    Supported providers:
      - ``ollama``: returns an ``OllamaLLMService`` directly.
      - ``openai``: builds an OpenAI-compatible client, trying in order:
        plain OpenAI/compatible endpoint (API key), Azure OpenAI with an API
        key, then Azure OpenAI with Entra ID / RBAC credentials.

    Raises:
        ValueError: for an unsupported provider, an invalid Azure endpoint,
            or when no usable credentials/endpoint can be resolved.
        ImportError: when the optional OpenAI/Azure dependencies are absent.
    """
    config = get_config()

    #
    # Ollama
    #
    if config.provider == "ollama":
        return OllamaLLMService()

    #
    # OpenAI-compatible providers
    #
    if config.provider != "openai":
        raise ValueError(f"Unsupported LLM provider: {config.provider}")

    # Imported lazily: these are optional extras, only needed for "openai".
    try:
        from azure.identity import DefaultAzureCredential, get_bearer_token_provider
        from openai import OpenAI

        from codeembed.llm.openai_adapter import OpenAILLMService
    except ImportError as e:
        raise ImportError(
            "OpenAI provider requires optional dependencies. Install them with:\n uv tool install 'codeembed[openai]'"
        ) from e

    #
    # Explicit config env-var overrides: when the config names custom env
    # vars, their values take precedence over the standard OPENAI_*/AZURE_*
    # variables below.
    #
    custom_endpoint = os.getenv(config.llm_api_endpoint_env_var) if config.llm_api_endpoint_env_var else None

    custom_api_key = os.getenv(config.llm_api_key_env_var) if config.llm_api_key_env_var else None

    #
    # Generic OpenAI-compatible configuration
    #
    openai_api_key = custom_api_key or os.getenv("OPENAI_API_KEY")

    openai_base_url = custom_endpoint or os.getenv("OPENAI_BASE_URL")

    #
    # Azure OpenAI configuration
    #
    azure_openai_endpoint = custom_endpoint or os.getenv("AZURE_OPENAI_ENDPOINT")

    azure_openai_api_key = custom_api_key or os.getenv("AZURE_OPENAI_API_KEY")

    #
    # ----------------------------------------------------------
    # Standard OpenAI or OpenAI-compatible endpoint
    #
    # Examples:
    #   - OpenAI cloud
    #   - vLLM
    #   - LM Studio
    #   - Ollama OpenAI shim
    #   - OpenRouter
    #   - local gateways
    # ----------------------------------------------------------
    #
    if openai_api_key:
        # base_url may be None, in which case the client uses its default.
        client = OpenAI(
            api_key=openai_api_key,
            base_url=openai_base_url,
        )

        return OpenAILLMService(client)

    #
    # ----------------------------------------------------------
    # Azure OpenAI with API key
    # ----------------------------------------------------------
    #
    if azure_openai_endpoint and azure_openai_api_key:
        client = OpenAI(
            base_url=azure_openai_endpoint,
            api_key=azure_openai_api_key,
        )

        return OpenAILLMService(client)

    #
    # ----------------------------------------------------------
    # Azure OpenAI with RBAC / Entra ID
    #
    # DefaultAzureCredential tries these in order:
    #   - Environment / service principal (AZURE_CLIENT_ID etc.)
    #   - Workload Identity
    #   - Managed Identity
    #   - VS Code Azure sign-in
    #   - Azure CLI (az login)
    #   - Azure PowerShell (Connect-AzAccount)
    #   - Azure Developer CLI (azd auth login)
    #   - Interactive browser (last resort; enabled via exclude_interactive_browser_credential=False)
    # ----------------------------------------------------------
    #
    if azure_openai_endpoint:
        # expected format: https://<resource-name>.openai.azure.com/openai/v1/
        if not azure_openai_endpoint.startswith("https://") or ".openai.azure.com" not in azure_openai_endpoint:
            raise ValueError(f"Invalid Azure OpenAI endpoint: {azure_openai_endpoint}")
        elif not azure_openai_endpoint.endswith("/openai/v1/"):
            logger.warning(f"Azure OpenAI endpoint {azure_openai_endpoint} does not end with the expected /openai/v1/.")

        credential = DefaultAzureCredential(
            exclude_interactive_browser_credential=False,
        )

        token_provider = get_bearer_token_provider(
            credential,
            "https://cognitiveservices.azure.com/.default",
        )

        # NOTE(review): passes the bearer-token provider callable as api_key —
        # assumes the openai client accepts a callable here; confirm against
        # the installed openai SDK version.
        client = OpenAI(
            base_url=azure_openai_endpoint,
            api_key=token_provider,
        )

        return OpenAILLMService(client)

    raise ValueError(
        "Unable to configure OpenAI client.\n"
        "Expected one of:\n"
        "- OPENAI_API_KEY\n"
        "- AZURE_OPENAI_API_KEY + AZURE_OPENAI_ENDPOINT\n"
        "- AZURE_OPENAI_ENDPOINT with RBAC-enabled identity"
    )
176
+
177
+
178
@lru_cache(maxsize=1)
def get_session() -> Session:
    """Return the singleton cost-tracking session for this process."""
    session = Session()
    return session
181
+
182
+
183
@lru_cache(maxsize=1)
def get_llm_service() -> LLMServiceBase:
    """Return the singleton LLM service, wrapped with session cost tracking."""
    session = get_session()
    return LLMServiceWithCostTracking(_get_llm_service(), session)
189
+
190
+
191
@lru_cache(maxsize=1)
def get_embedder_service() -> DocEmbedder:
    """Build (once) the DocEmbedder that indexes the current directory."""
    cfg = get_config()
    provider = LocalDocProvider(
        base_path=".",
        supported_file_extensions=_SUPPORTED_FILE_EXTENSIONS,
    )
    return DocEmbedder(
        provider,
        ChromaDbAdapter(collection_name="codebase"),
        get_llm_service(),
        llm_model=cfg.llm_model,
        debounce_seconds=cfg.debounce,
    )
204
+
205
+
206
async def embed_loop() -> None:
    """Run the embedding pipeline forever.

    Each iteration runs ``embedder.embed_codebase`` in a worker thread (so
    the event loop stays responsive), persists the session's cost counters,
    logs token usage, and sleeps for the configured interval.

    A failed run is logged and does not stop the loop; the next iteration
    retries from scratch.
    """
    embedder = get_embedder_service()
    session = get_session()
    config = get_config()
    while True:
        try:
            await asyncio.to_thread(embedder.embed_codebase)
        except Exception:
            # Keep the loop alive across failures; counters are still saved.
            logger.exception("Embedding run failed")
        session.save()
        logger.info(
            # Trailing space added: the two fragments previously joined as
            # "...tokens used: N.Sleeping for ...".
            f"Input tokens used: {session.input_tokens}, output tokens used: {session.output_tokens}. "
            f"Sleeping for {config.sleep_interval} seconds..."
        )
        await asyncio.sleep(config.sleep_interval)