codeembed 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. codeembed/__init__.py +59 -0
  2. codeembed/bootstrap/__init__.py +17 -0
  3. codeembed/bootstrap/services.py +220 -0
  4. codeembed/cli.py +454 -0
  5. codeembed/config/__init__.py +5 -0
  6. codeembed/config/models.py +13 -0
  7. codeembed/cost_tracking/__init__.py +7 -0
  8. codeembed/cost_tracking/llm_wrapper.py +39 -0
  9. codeembed/cost_tracking/models.py +52 -0
  10. codeembed/delta_computer/__init__.py +5 -0
  11. codeembed/delta_computer/delta_computer.py +75 -0
  12. codeembed/doc_embedder/__init__.py +5 -0
  13. codeembed/doc_embedder/doc_embedder.py +134 -0
  14. codeembed/doc_provider/__init__.py +10 -0
  15. codeembed/doc_provider/base.py +14 -0
  16. codeembed/doc_provider/local_doc_provider.py +58 -0
  17. codeembed/doc_provider/models.py +20 -0
  18. codeembed/doc_search_service/__init__.py +5 -0
  19. codeembed/doc_search_service/doc_search_service.py +48 -0
  20. codeembed/doc_splitters/__init__.py +8 -0
  21. codeembed/doc_splitters/generic_splitter.py +165 -0
  22. codeembed/doc_splitters/models.py +14 -0
  23. codeembed/llm/__init__.py +13 -0
  24. codeembed/llm/base.py +31 -0
  25. codeembed/llm/models.py +27 -0
  26. codeembed/llm/ollama_adapter.py +64 -0
  27. codeembed/llm/openai_adapter.py +96 -0
  28. codeembed/mcp_server.py +45 -0
  29. codeembed/setup_logger.py +34 -0
  30. codeembed/utils/__init__.py +9 -0
  31. codeembed/utils/checksum_utils.py +5 -0
  32. codeembed/utils/string_utils.py +5 -0
  33. codeembed/utils/time_utils.py +5 -0
  34. codeembed/vector_db/__init__.py +9 -0
  35. codeembed/vector_db/base.py +27 -0
  36. codeembed/vector_db/chromadb_adapter.py +130 -0
  37. codeembed/vector_db/models.py +16 -0
  38. codeembed-0.1.0.dist-info/METADATA +292 -0
  39. codeembed-0.1.0.dist-info/RECORD +42 -0
  40. codeembed-0.1.0.dist-info/WHEEL +4 -0
  41. codeembed-0.1.0.dist-info/entry_points.txt +2 -0
  42. codeembed-0.1.0.dist-info/licenses/LICENSE +21 -0
codeembed/cli.py ADDED
@@ -0,0 +1,454 @@
1
+ import json
2
+ import os
3
+ import shutil
4
+ import subprocess
5
+ from typing import Any, Dict, Literal, Optional
6
+
7
+ import typer
8
+
9
+ from codeembed.bootstrap.services import get_config, get_llm_service, get_session
10
+ from codeembed.llm.base import LLMServiceBase
11
+ from codeembed.setup_logger import setup_logger
12
+
13
app = typer.Typer()

# Location of CodeEmbed's on-disk data and of the project-level config file.
_CODEEMBED_DIR = ".codeembed"
_CONFIG_FILE = "codeembed.toml"
# Entry that must be present in .gitignore so the data directory is never committed.
_GITIGNORE_ENTRY = ".codeembed/"
# Defaults written into codeembed.toml; presumably seconds — confirm against the embedder.
_DEFAULT_DEBOUNCE = 10
_DEFAULT_SLEEP_INTERVAL = 60

# Curated local Ollama models offered by `codeembed init`: (name, description).
_CURATED_MODELS = [
    ("gpt-oss:20b", "OpenAI's open source model, ~14GB"),
    ("gemma4:e4b", "Google's all-around model, ~9.6GB"),
]

# Curated OpenAI deployments offered by `codeembed init`: (name, description).
_OPENAI_CURATED_MODELS = [
    ("gpt-4.1-mini", "Lightweight and cost-effective"),
    ("gpt-5.4-mini", "Newer, lightweight and cost-effective"),
    ("gpt-5.4-nano", "Newer and super lightweight option"),
]

# Candidate agent-instruction files, in priority order; the first existing one
# receives the CodeEmbed search instructions (see _add_agent_instructions).
_AGENT_INSTRUCTION_FILES = [
    "AGENTS.md",
    "CLAUDE.md",
    os.path.join(".github", "copilot-instructions.md"),
]

# Substring used to detect that the instructions were already added to a file.
_AGENT_INSTRUCTIONS_MARKER = "mcp__codeembed__search"

_AGENT_INSTRUCTIONS_CONTENT = """\
## Codebase search

Use the `mcp__codeembed__search` tool as the first step for any question about how this \
codebase works — how something is implemented, where something is defined, what calls what. \
Prefer it over grep or file reads for exploratory questions.
"""
47
+
48
+
49
def _ensure_gitignore() -> None:
    """Ensure `.codeembed/` is git-ignored before any data is written.

    Exits when no .gitignore exists at all, or when the user declines to add
    the required entry.
    """
    if not os.path.isfile(".gitignore"):
        typer.echo("Error: No .gitignore found. Run 'codeembed init' from the root of your git repository.")
        typer.echo("A .gitignore is required to prevent CodeEmbed from embedding your sensitive files.")
        raise typer.Exit(1)

    with open(".gitignore", "r", encoding="utf-8") as f:
        content = f.read()

    # NOTE(review): plain substring check — a commented-out '.codeembed/' line
    # would also count as present; confirm this is acceptable.
    if _GITIGNORE_ENTRY not in content:
        # ask user for permission to modify .gitignore
        typer.echo(f"CodeEmbed stores its data in the '{_CODEEMBED_DIR}/' directory.")
        typer.echo(f"You must add '{_GITIGNORE_ENTRY}' to your .gitignore to use CodeEmbed safely.")
        if not typer.confirm(f"Add '{_GITIGNORE_ENTRY}' to your .gitignore now?", default=True):
            typer.echo(f"Error: Gitignore is missing '{_GITIGNORE_ENTRY}' entry for safe operation.")
            raise typer.Exit(1)
        with open(".gitignore", "a", encoding="utf-8") as f:
            f.write(f"\n# CodeEmbed\n{_GITIGNORE_ENTRY}\n")
        typer.echo(f"Added '{_GITIGNORE_ENTRY}' to .gitignore. Remember to commit this change.\n")
68
+
69
+
70
def _create_codeembed_dir() -> None:
    """Create the `.codeembed/` data directory when it does not exist yet."""
    if os.path.isdir(_CODEEMBED_DIR):
        return
    os.makedirs(_CODEEMBED_DIR)
    typer.echo(f"Created '{_CODEEMBED_DIR}/' directory.\n")
74
+
75
+
76
def _check_ollama_installed() -> None:
    """Exit with an error when the `ollama` binary is not on PATH."""
    if shutil.which("ollama") is not None:
        return
    typer.echo("Error: Ollama is not installed or not in your PATH.")
    typer.echo("Install it from https://ollama.com/ then re-run 'codeembed init'.")
    raise typer.Exit(1)
81
+
82
+
83
def _check_ollama_model_is_available(model: str) -> None:
    """Exit with an error when `model` has not been pulled locally."""
    if model in _get_downloaded_models():
        return
    typer.echo(f"Error: Ollama model '{model}' is not available.")
    typer.echo(f"Download it with: ollama pull {model}")
    # Alternatively give option to download now.
    raise typer.Exit(1)
90
+
91
+
92
def _check_ollama_running() -> None:
    """Exit with an error when the Ollama server does not answer `ollama list`."""
    probe = subprocess.run(["ollama", "list"], capture_output=True, text=True)
    if probe.returncode == 0:
        return
    typer.echo("Error: Ollama server is not running.")
    typer.echo("Start it with: ollama serve")
    raise typer.Exit(1)
98
+
99
+
100
def _get_downloaded_models() -> list[str]:
    """Return model names reported by `ollama list` (first column, header skipped)."""
    listing = subprocess.run(["ollama", "list"], capture_output=True, text=True)
    rows = listing.stdout.strip().splitlines()
    # The first whitespace-separated token of each data row is the model name;
    # blank rows are skipped via the walrus-bound split.
    return [cols[0] for row in rows[1:] if (cols := row.split())]
109
+
110
+
111
def _select_ollama_llm_model(downloaded_models: list[str]) -> str:
    """Interactively pick a local Ollama model for code summarization."""
    typer.echo("\nSelect a local LLM model for code summarization:\n")

    choices: list[str] = []
    curated_names = dict(_CURATED_MODELS)

    # Curated suggestions first, flagged when already pulled locally.
    for name, blurb in _CURATED_MODELS:
        choices.append(name)
        tag = " [downloaded]" if name in downloaded_models else ""
        typer.echo(f" {len(choices)}. {name} — {blurb}{tag}")

    # Then any other locally downloaded models not in the curated list.
    for name in (m for m in downloaded_models if m not in curated_names):
        choices.append(name)
        typer.echo(f" {len(choices)}. {name} [downloaded]")

    # Final slot is always the free-form entry.
    choices.append("custom")
    typer.echo(f" {len(choices)}. Enter a custom model name\n")

    answer = typer.prompt(f"Choice (1-{len(choices)})")

    try:
        selected = int(answer) - 1
        if not 0 <= selected < len(choices):
            raise ValueError()
    except ValueError:
        typer.echo("Invalid choice. Please re-run 'codeembed init'.")
        raise typer.Exit(1)

    if choices[selected] == "custom":
        return typer.prompt("Model name (e.g. gpt-oss:20b)")

    return choices[selected]
143
+
144
+
145
def _select_openai_model() -> str:
    """Interactively pick an OpenAI deployment for code summarization."""
    typer.echo("\nSelect an OpenAI LLM deployment for code summarization:\n")

    curated = list(_OPENAI_CURATED_MODELS)
    for position, (name, blurb) in enumerate(curated, 1):
        typer.echo(f" {position}. {name} — {blurb}")

    # One extra slot after the curated list for a free-form entry.
    custom_choice = len(curated) + 1
    typer.echo(f" {custom_choice}. Enter a custom model name\n")

    answer = typer.prompt(f"Choice (1-{custom_choice})")

    try:
        picked = int(answer) - 1
    except ValueError:
        picked = -1
    if not 0 <= picked < custom_choice:
        typer.echo("Invalid choice. Please re-run 'codeembed init'.")
        raise typer.Exit(1)

    if picked == len(curated):
        return typer.prompt("Model name (e.g. gpt-4o)")

    return curated[picked][0]
169
+
170
+
171
+ def _check_openai_credentials() -> str:
172
+ if os.getenv("OPENAI_API_KEY"):
173
+ return "[OPENAI_API_KEY set]"
174
+ azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
175
+ if azure_endpoint:
176
+ if os.getenv("AZURE_OPENAI_API_KEY"):
177
+ return "[Azure: endpoint + key set]"
178
+ return "[Azure: endpoint set, using RBAC]"
179
+ return "[no credentials found]"
180
+
181
+
182
def _select_provider() -> Literal["ollama", "openai"]:
    """Interactively choose the LLM backend, showing availability hints."""
    typer.echo("\nSelect an LLM provider for code summarization:\n")

    ollama_installed = shutil.which("ollama") is not None
    ollama_running = False
    if ollama_installed:
        # `ollama list` succeeds only when the local server is reachable.
        ollama_running = subprocess.run(["ollama", "list"], capture_output=True, text=True).returncode == 0

    if ollama_running:
        ollama_status = "[running]"
    elif ollama_installed:
        ollama_status = "[not running]"
    else:
        ollama_status = "[not installed]"

    typer.echo(" 1. ollama " + ollama_status)
    typer.echo(" 2. openai " + _check_openai_credentials())

    answer = typer.prompt("Choice (1-2)")

    try:
        picked = int(answer)
    except ValueError:
        picked = 0
    if picked not in (1, 2):
        typer.echo("Invalid choice. Please re-run 'codeembed init'.")
        raise typer.Exit(1)

    return "ollama" if picked == 1 else "openai"
213
+
214
+
215
def _ensure_model_downloaded(model: str, downloaded_models: list[str]) -> None:
    """Offer to pull `model` via Ollama when it is not downloaded yet.

    Fix: the exit code of `ollama pull` was previously ignored, so a failed
    download looked like success; a warning is now printed on failure.
    """
    if model in downloaded_models:
        return

    typer.echo(f"\nModel '{model}' is not downloaded yet.")
    if not typer.confirm("Download it now?", default=False):
        typer.echo(f"Skipping. You can pull it later with: ollama pull {model}")
        return

    typer.echo(f"Pulling '{model}'... (this may take a while)")
    result = subprocess.run(["ollama", "pull", model])
    if result.returncode != 0:
        # Surface the failure so the user does not discover it at serve time.
        typer.echo(f"Warning: 'ollama pull {model}' failed. Pull it manually before serving.")
225
+
226
+
227
def _write_config(model: str, provider: Literal["ollama", "openai"], env_var_path: Optional[str] = None) -> None:
    """Write `codeembed.toml` with the chosen model/provider and default timings.

    NOTE(review): values are interpolated directly into the TOML text, so a
    model name containing a double quote would yield invalid TOML — confirm
    inputs are restricted upstream.
    """
    config_toml = f"""\
[codeembed]
llm_model = "{model}"
provider = "{provider}"
debounce = {_DEFAULT_DEBOUNCE}
sleep_interval = {_DEFAULT_SLEEP_INTERVAL}
"""
    # Optional key, appended while still inside the [codeembed] table.
    if env_var_path:
        config_toml += f'env_var_path = "{env_var_path}"\n'

    with open(_CONFIG_FILE, "w", encoding="utf-8") as f:
        f.write(config_toml)

    typer.echo(f"Created '{_CONFIG_FILE}'.")
242
+
243
+
244
def _load_env_file(env_var_path: Optional[str]) -> None:
    """Load environment variables from `env_var_path`, when one is configured.

    Exits with an error when the configured file does not exist. No-op when
    `env_var_path` is None or empty.
    """
    if not env_var_path:
        return
    # Imported lazily so python-dotenv is only needed when a .env file is used.
    from dotenv import load_dotenv

    if not os.path.isfile(env_var_path):
        typer.echo(f"Error: Environment variable file '{env_var_path}' not found.")
        raise typer.Exit(1)
    load_dotenv(env_var_path)
253
+
254
+
255
def _check_llm_is_available(llm_service: LLMServiceBase, llm_model: str) -> None:
    # Pings the LLM deployment. Raises exception if it's not available.
    try:
        # Cheapest possible round-trip: single-token, temperature-0 request.
        llm_service.generate_response(
            [{"role": "system", "content": "Ping!"}],
            llm_model,
            temperature=0.0,
            max_tokens=1,
        )
    except Exception as e:
        # Broad catch is deliberate: any failure mode should turn into one
        # actionable CLI error rather than a raw traceback.
        typer.echo(f"Error: Failed to ping LLM model or deployment '{llm_model}'. Details: {e}")
        raise typer.Exit(1)
267
+
268
+
269
# MCP server registration shared by all editor integrations: launches this
# same CLI with the `serve` subcommand (stdio transport, see serve()).
_MCP_SERVER_CONFIG = {
    "command": "codeembed",
    "args": ["serve"],
}
273
+
274
+
275
+ def _read_json(path: str) -> Dict[str, Any]:
276
+ with open(path, "r", encoding="utf-8") as f:
277
+ return json.load(f)
278
+
279
+
280
+ def _write_json(path: str, data: Dict[str, Any]) -> None:
281
+ os.makedirs(os.path.dirname(path), exist_ok=True) if os.path.dirname(path) else None
282
+ with open(path, "w", encoding="utf-8") as f:
283
+ f.write(json.dumps(data, indent=2) + "\n")
284
+
285
+
286
def _add_to_claude_code() -> None:
    """Register the CodeEmbed MCP server for Claude Code.

    Writes the server definition to `.mcp.json` and enables/pre-approves it in
    `.claude/settings.local.json`, preserving any existing content.
    """
    mcp_json_path = ".mcp.json"
    settings_path = os.path.join(".claude", "settings.local.json")

    # Merge into any existing .mcp.json rather than overwriting it.
    data: Dict[str, Any] = _read_json(mcp_json_path) if os.path.isfile(mcp_json_path) else {}
    data.setdefault("mcpServers", {})["codeembed"] = _MCP_SERVER_CONFIG
    _write_json(mcp_json_path, data)
    typer.echo(f" Updated '{mcp_json_path}'.")

    # Enable the server and pre-approve its search tool in local settings,
    # keeping the additions idempotent across re-runs of init.
    data = _read_json(settings_path) if os.path.isfile(settings_path) else {}
    enabled = data.setdefault("enabledMcpjsonServers", [])
    if "codeembed" not in enabled:
        enabled.append("codeembed")
    perms = data.setdefault("permissions", {})
    allowed = perms.setdefault("allow", [])
    if "mcp__codeembed__search" not in allowed:
        allowed.append("mcp__codeembed__search")
    _write_json(settings_path, data)
    typer.echo(f" Updated '{settings_path}'.")
305
+
306
+
307
def _add_agent_instructions() -> None:
    """Append (or create) CodeEmbed usage instructions for coding agents.

    Targets the first existing file from _AGENT_INSTRUCTION_FILES, defaulting
    to AGENTS.md when none exists. Skips files that already contain the marker.
    """
    target = next(
        (f for f in _AGENT_INSTRUCTION_FILES if os.path.isfile(f)),
        "AGENTS.md",
    )

    if os.path.isfile(target):
        with open(target, "r", encoding="utf-8") as f:
            existing = f.read()
        # Idempotence: the tool name doubles as the "already added" marker.
        if _AGENT_INSTRUCTIONS_MARKER in existing:
            typer.echo(f" '{target}' already contains CodeEmbed instructions, skipping.")
            return
        with open(target, "a", encoding="utf-8") as f:
            f.write("\n" + _AGENT_INSTRUCTIONS_CONTENT)
        typer.echo(f" Appended CodeEmbed search instructions to '{target}'.")
    else:
        # Only relevant for nested defaults like .github/copilot-instructions.md.
        parent = os.path.dirname(target)
        if parent:
            os.makedirs(parent, exist_ok=True)
        with open(target, "w", encoding="utf-8") as f:
            f.write(_AGENT_INSTRUCTIONS_CONTENT)
        typer.echo(f" Created '{target}' with CodeEmbed search instructions.")
329
+
330
+
331
def _add_to_github_copilot() -> None:
    """Register the CodeEmbed MCP server in VS Code's `.vscode/mcp.json`."""
    vscode_mcp_path = os.path.join(".vscode", "mcp.json")

    # Merge into any existing file so other configured servers are kept.
    config = _read_json(vscode_mcp_path) if os.path.isfile(vscode_mcp_path) else {}
    servers = config.setdefault("servers", {})
    servers["codeembed"] = _MCP_SERVER_CONFIG
    _write_json(vscode_mcp_path, config)
    typer.echo(f" Updated '{vscode_mcp_path}'.")
338
+
339
+
340
@app.command()
def init():
    """Initialize CodeEmbed in the current project."""
    typer.echo("Initializing CodeEmbed...\n")

    # Re-running init is allowed, but only with explicit consent to overwrite.
    if os.path.isfile(_CONFIG_FILE):
        if not typer.confirm(f"'{_CONFIG_FILE}' already exists. Overwrite?", default=False):
            raise typer.Exit(0)

    # Optional .env file, loaded up-front so provider credentials are visible
    # to the provider-selection step below.
    env_var_path = typer.prompt(
        "Do you have a .env file path? (optional, press Enter to skip)", default="", show_default=False
    )
    _load_env_file(env_var_path or None)

    # Safety first: the data directory must be git-ignored before it is created.
    _ensure_gitignore()
    _create_codeembed_dir()

    provider = _select_provider()

    if provider == "ollama":
        _check_ollama_installed()
        _check_ollama_running()
        downloaded_models = _get_downloaded_models()
        model = _select_ollama_llm_model(downloaded_models)
        _ensure_model_downloaded(model, downloaded_models)
    else:
        model = _select_openai_model()

    # env_var_path may be "" (skipped); _write_config omits the key for falsy values.
    _write_config(model, provider, env_var_path)

    # Optional editor/agent integrations.
    typer.echo("")
    if typer.confirm(
        "Add CodeEmbed to Claude Code? (creates/updates .mcp.json and .claude/settings.local.json)", default=True
    ):
        _add_to_claude_code()

    if typer.confirm("Add CodeEmbed to GitHub Copilot? (creates/updates .vscode/mcp.json)", default=False):
        _add_to_github_copilot()

    if typer.confirm(
        "Add CodeEmbed search instructions to AGENTS.md? (or existing CLAUDE.md / .github/copilot-instructions.md)",
        default=True,
    ):
        _add_agent_instructions()

    typer.echo(
        "\nDone.\n\n"
        "Tip: Run 'codeembed embed' before starting the server to pre-populate the index.\n"
        "The server also embeds in the background automatically, but searches will return\n"
        "empty results until the first file is embedded.\n\n"
        "Then run 'codeembed serve' to start the MCP server."
    )
392
+
393
+
394
@app.command()
def serve():
    """Start the MCP server.

    Fix: the missing-config error previously hardcoded 'codeembed.toml'
    instead of using the `_CONFIG_FILE` constant (same rendered text today,
    but the message could silently drift if the constant changed).
    """
    if not os.path.isfile(_CONFIG_FILE):
        typer.echo(f"Error: '{_CONFIG_FILE}' not found. Run 'codeembed init' first.")
        raise typer.Exit(1)

    config = get_config()
    # Load the configured .env file before any provider client is built.
    _load_env_file(config.env_var_path)

    setup_logger()

    if config.provider == "ollama":
        # Fail fast with actionable messages before starting the server.
        _check_ollama_installed()
        _check_ollama_running()
        _check_ollama_model_is_available(config.llm_model)

    llm_service = get_llm_service()

    # One cheap round-trip to verify the configured model actually answers.
    _check_llm_is_available(llm_service, config.llm_model)

    # Imported lazily: the MCP server pulls in the full service stack.
    from codeembed.mcp_server import mcp

    typer.echo("Starting CodeEmbed MCP server...")
    mcp.run(transport="stdio")
419
+
420
+
421
@app.command()
def embed():
    """Embed codebase into the vector database.

    Fix: the missing-config error previously hardcoded 'codeembed.toml'
    instead of using the `_CONFIG_FILE` constant, unlike the rest of the CLI.
    """
    if not os.path.isfile(_CONFIG_FILE):
        typer.echo(f"Error: '{_CONFIG_FILE}' not found. Run 'codeembed init' first.")
        raise typer.Exit(1)

    config = get_config()
    _load_env_file(config.env_var_path)

    setup_logger()

    if config.provider == "ollama":
        # Fail fast with actionable messages before doing any work.
        _check_ollama_installed()
        _check_ollama_running()
        _check_ollama_model_is_available(config.llm_model)

    try:
        llm_service = get_llm_service()

        _check_llm_is_available(llm_service, config.llm_model)

        typer.echo("Embedding codebase...\n")

        # Imported lazily: the embedder pulls in the full service stack.
        from codeembed.bootstrap.services import get_embedder_service

        embedder = get_embedder_service()
        embedder.embed_codebase()
    finally:
        # Always persist token usage, even when embedding fails midway.
        session = get_session()
        session.save()
        typer.echo(f"\nInput tokens used: {session.input_tokens}. Output tokens used: {session.output_tokens}.")

    typer.echo("\nDone.")
@@ -0,0 +1,5 @@
1
# Public API of the config package: re-export the settings dataclass.
from codeembed.config.models import CodeEmbedConfig

__all__ = [
    "CodeEmbedConfig",
]
@@ -0,0 +1,13 @@
1
+ from dataclasses import dataclass
2
+ from typing import Literal, Optional
3
+
4
+
5
@dataclass
class CodeEmbedConfig:
    """Settings loaded from `codeembed.toml` (see _write_config in the CLI)."""

    # Debounce window; used as seconds by the delta computation — TODO confirm.
    debounce: int
    # Presumably seconds between background embedding runs — confirm in server.
    sleep_interval: int
    # Model name (Ollama) or deployment name (OpenAI) used for summarization.
    llm_model: str
    # LLM backend; local Ollama by default.
    provider: Literal["ollama", "openai"] = "ollama"
    # Names of env vars holding the API endpoint/key — verify against bootstrap.
    llm_api_endpoint_env_var: Optional[str] = None
    llm_api_key_env_var: Optional[str] = None
    # Optional path to a .env file loaded before contacting the provider.
    env_var_path: Optional[str] = None
@@ -0,0 +1,7 @@
1
# Public API of the cost_tracking package.
from codeembed.cost_tracking.llm_wrapper import LLMServiceWithCostTracking
from codeembed.cost_tracking.models import Session

__all__ = [
    "LLMServiceWithCostTracking",
    "Session",
]
@@ -0,0 +1,39 @@
1
+ from typing import TypeVar
2
+
3
+ from pydantic import BaseModel
4
+
5
+ from codeembed.cost_tracking.models import Session
6
+ from codeembed.llm.base import LLMServiceBase
7
+ from codeembed.llm.models import LLMResponse, StructuredLLMResponse
8
+
9
+ T = TypeVar("T", bound=BaseModel)
10
+
11
+
12
class LLMServiceWithCostTracking(LLMServiceBase):
    """Wrapper for LLM service that tracks token usage.

    Delegates every call to the wrapped service and records the reported
    input/output token counts in the session.
    """

    def __init__(self, llm_service: LLMServiceBase, session: Session) -> None:
        self._llm_service = llm_service
        self._session = session

    def _track(self, res):
        # Shared bookkeeping for both call paths (previously duplicated):
        # records usage from any response exposing llm_model / input_tokens /
        # output_tokens, then passes the response through unchanged.
        self._session.add(
            model_name=res.llm_model,
            input_tokens=res.input_tokens,
            output_tokens=res.output_tokens,
        )
        return res

    def generate_structured_output(self, *args, **kwargs) -> StructuredLLMResponse[T]:
        """Delegate to the wrapped service, then record token usage."""
        return self._track(self._llm_service.generate_structured_output(*args, **kwargs))

    def generate_response(self, *args, **kwargs) -> LLMResponse:
        """Delegate to the wrapped service, then record token usage."""
        return self._track(self._llm_service.generate_response(*args, **kwargs))
@@ -0,0 +1,52 @@
1
+ import json
2
+ import os
3
+ from typing import Dict, Literal, Optional
4
+
5
+ from codeembed.utils.time_utils import utc_now
6
+
7
# Per-model token counters, keyed by model name.
_SessionData = Dict[str, Dict[Literal["input_tokens", "output_tokens", "embedding_tokens"], int]]

# Directory where per-run usage reports are written.
_SESSIONS_DIR = ".codeembed/sessions"
10
+
11
+
12
+ class Session:
13
+ """Writes token usage to a file under `.codeembed/sessions/<timestamp>.json`"""
14
+
15
+ def __init__(self):
16
+ self._by_model: _SessionData = {}
17
+ self._session_id = utc_now().strftime("%Y-%m-%dT%H-%M-%S")
18
+
19
+ def add(
20
+ self,
21
+ model_name: str,
22
+ input_tokens: Optional[int] = None,
23
+ output_tokens: Optional[int] = None,
24
+ ) -> None:
25
+ if model_name not in self._by_model:
26
+ self._by_model[model_name] = {
27
+ "input_tokens": 0,
28
+ "output_tokens": 0,
29
+ "embedding_tokens": 0,
30
+ }
31
+ if input_tokens is not None:
32
+ self._by_model[model_name]["input_tokens"] += input_tokens
33
+ if output_tokens is not None:
34
+ self._by_model[model_name]["output_tokens"] += output_tokens
35
+
36
+ def save(self) -> None:
37
+ if not self._by_model:
38
+ return
39
+ os.makedirs(_SESSIONS_DIR, exist_ok=True)
40
+ with open(f"{_SESSIONS_DIR}/{self._session_id}.json", "w") as f:
41
+ f.write(json.dumps(self._by_model, indent=2))
42
+
43
+ def get_usage(self) -> _SessionData:
44
+ return self._by_model
45
+
46
+ @property
47
+ def input_tokens(self) -> int:
48
+ return sum(tokens["input_tokens"] for tokens in self._by_model.values())
49
+
50
+ @property
51
+ def output_tokens(self) -> int:
52
+ return sum(tokens["output_tokens"] for tokens in self._by_model.values())
@@ -0,0 +1,5 @@
1
# Public API of the delta_computer package.
from codeembed.delta_computer.delta_computer import DeltaComputer

__all__ = [
    "DeltaComputer",
]
@@ -0,0 +1,75 @@
1
+ from datetime import datetime, timedelta
2
+ from typing import Dict, List, Set, Tuple
3
+ from uuid import UUID
4
+
5
+ from codeembed.doc_provider.base import DocProviderBase
6
+ from codeembed.utils.time_utils import utc_now
7
+ from codeembed.vector_db.base import VectorDbBase
8
+
9
+
10
+ class DeltaComputer:
11
+ """Figures out which files to add, delete or update."""
12
+
13
+ def __init__(self, doc_provider: DocProviderBase, vector_db: VectorDbBase, debounce_seconds: int = 10) -> None:
14
+ self._doc_provider = doc_provider
15
+ self._vector_db = vector_db
16
+ self._debounce_seconds = debounce_seconds
17
+
18
+ def compute_deltas(self) -> Tuple[Set[UUID], Set[str]]:
19
+ """
20
+ Returns chunk IDs to delete and file paths to process.
21
+
22
+ May not have best perfomance since we iterate each chunk stored in the vector database.
23
+ """
24
+
25
+ file_paths_to_update: Set[str] = set()
26
+
27
+ file_path_to_chunk_ids: Dict[str, List[UUID]] = {}
28
+ chunk_ids_to_delete: Set[UUID] = set()
29
+
30
+ # Collect modified_at stored in our database.
31
+ old_modified_at: Dict[str, datetime] = {}
32
+ old_checksums: Dict[str, str] = {}
33
+ for chunk in self._vector_db.iter_chunks():
34
+ old_modified_at[chunk.file_path] = max(
35
+ old_modified_at.get(chunk.file_path, chunk.modified_at), chunk.modified_at
36
+ )
37
+ old_checksums[chunk.file_path] = chunk.file_sha256_checksum
38
+
39
+ file_path_to_chunk_ids[chunk.file_path] = file_path_to_chunk_ids.get(chunk.file_path, []) + [chunk.id]
40
+
41
+ # Collect current modified_at in file system.
42
+ current: Dict[str, datetime] = {}
43
+ for doc in self._doc_provider.iter():
44
+ current[doc.file_path] = doc.modified_at
45
+
46
+ # Figure out which files have been added or modified.
47
+ for file_path, modified_at in current.items():
48
+ if modified_at > utc_now() - timedelta(seconds=self._debounce_seconds):
49
+ # We skip files modified within the last N seconds
50
+ continue
51
+
52
+ if file_path not in old_modified_at or old_modified_at[file_path] < modified_at:
53
+ if file_path in old_modified_at:
54
+ doc = self._doc_provider.get_content(file_path)
55
+ if doc.sha256_checksum == old_checksums[file_path]:
56
+ # We skip files with same checksum even if modified_at is updated.
57
+ # Some editors update modified_at even without any changes.
58
+ # TODO: We should probably update modified_at in vector database
59
+ # to avoid re-reading this file on every run.
60
+ continue
61
+
62
+ # file updated or added
63
+ file_paths_to_update.add(file_path)
64
+
65
+ # We delete all old chunks for any modified files.
66
+ for chunk_id in file_path_to_chunk_ids.get(file_path, []):
67
+ chunk_ids_to_delete.add(chunk_id)
68
+
69
+ # Figure out which files have been removed.
70
+ for file_path in old_modified_at:
71
+ if file_path not in current:
72
+ for chunk_id in file_path_to_chunk_ids.get(file_path, []):
73
+ chunk_ids_to_delete.add(chunk_id)
74
+
75
+ return chunk_ids_to_delete, file_paths_to_update
@@ -0,0 +1,5 @@
1
# Public API of the doc_embedder package.
from codeembed.doc_embedder.doc_embedder import DocEmbedder

__all__ = [
    "DocEmbedder",
]