kodit 0.2.3__py3-none-any.whl → 0.2.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of kodit might be problematic. Click here for more details.

Files changed (118) hide show
  1. kodit/_version.py +2 -2
  2. kodit/application/__init__.py +1 -0
  3. kodit/application/commands/__init__.py +1 -0
  4. kodit/application/commands/snippet_commands.py +22 -0
  5. kodit/application/services/__init__.py +1 -0
  6. kodit/application/services/indexing_application_service.py +363 -0
  7. kodit/application/services/snippet_application_service.py +143 -0
  8. kodit/cli.py +105 -82
  9. kodit/database.py +0 -22
  10. kodit/domain/__init__.py +1 -0
  11. kodit/{source/source_models.py → domain/entities.py} +88 -19
  12. kodit/domain/enums.py +9 -0
  13. kodit/domain/interfaces.py +27 -0
  14. kodit/domain/repositories.py +95 -0
  15. kodit/domain/services/__init__.py +1 -0
  16. kodit/domain/services/bm25_service.py +124 -0
  17. kodit/domain/services/embedding_service.py +155 -0
  18. kodit/domain/services/enrichment_service.py +48 -0
  19. kodit/domain/services/ignore_service.py +45 -0
  20. kodit/domain/services/indexing_service.py +203 -0
  21. kodit/domain/services/snippet_extraction_service.py +89 -0
  22. kodit/domain/services/source_service.py +83 -0
  23. kodit/domain/value_objects.py +215 -0
  24. kodit/infrastructure/__init__.py +1 -0
  25. kodit/infrastructure/bm25/__init__.py +1 -0
  26. kodit/infrastructure/bm25/bm25_factory.py +28 -0
  27. kodit/{bm25/local_bm25.py → infrastructure/bm25/local_bm25_repository.py} +33 -22
  28. kodit/{bm25/vectorchord_bm25.py → infrastructure/bm25/vectorchord_bm25_repository.py} +40 -35
  29. kodit/infrastructure/cloning/__init__.py +1 -0
  30. kodit/infrastructure/cloning/folder/__init__.py +1 -0
  31. kodit/infrastructure/cloning/folder/factory.py +119 -0
  32. kodit/infrastructure/cloning/folder/working_copy.py +38 -0
  33. kodit/infrastructure/cloning/git/__init__.py +1 -0
  34. kodit/infrastructure/cloning/git/factory.py +133 -0
  35. kodit/infrastructure/cloning/git/working_copy.py +32 -0
  36. kodit/infrastructure/cloning/metadata.py +127 -0
  37. kodit/infrastructure/embedding/__init__.py +1 -0
  38. kodit/infrastructure/embedding/embedding_factory.py +87 -0
  39. kodit/infrastructure/embedding/embedding_providers/__init__.py +1 -0
  40. kodit/infrastructure/embedding/embedding_providers/batching.py +93 -0
  41. kodit/infrastructure/embedding/embedding_providers/hash_embedding_provider.py +79 -0
  42. kodit/infrastructure/embedding/embedding_providers/local_embedding_provider.py +129 -0
  43. kodit/infrastructure/embedding/embedding_providers/openai_embedding_provider.py +113 -0
  44. kodit/infrastructure/embedding/local_vector_search_repository.py +114 -0
  45. kodit/{embedding/vectorchord_vector_search_service.py → infrastructure/embedding/vectorchord_vector_search_repository.py} +98 -32
  46. kodit/infrastructure/enrichment/__init__.py +1 -0
  47. kodit/{enrichment → infrastructure/enrichment}/enrichment_factory.py +28 -12
  48. kodit/infrastructure/enrichment/legacy_enrichment_models.py +42 -0
  49. kodit/infrastructure/enrichment/local_enrichment_provider.py +115 -0
  50. kodit/infrastructure/enrichment/null_enrichment_provider.py +25 -0
  51. kodit/infrastructure/enrichment/openai_enrichment_provider.py +89 -0
  52. kodit/infrastructure/git/__init__.py +1 -0
  53. kodit/{source/git.py → infrastructure/git/git_utils.py} +10 -2
  54. kodit/infrastructure/ignore/__init__.py +1 -0
  55. kodit/{source/ignore.py → infrastructure/ignore/ignore_pattern_provider.py} +23 -6
  56. kodit/infrastructure/indexing/__init__.py +1 -0
  57. kodit/infrastructure/indexing/fusion_service.py +55 -0
  58. kodit/infrastructure/indexing/index_repository.py +296 -0
  59. kodit/infrastructure/indexing/indexing_factory.py +111 -0
  60. kodit/infrastructure/snippet_extraction/__init__.py +1 -0
  61. kodit/infrastructure/snippet_extraction/language_detection_service.py +39 -0
  62. kodit/infrastructure/snippet_extraction/snippet_extraction_factory.py +95 -0
  63. kodit/infrastructure/snippet_extraction/snippet_query_provider.py +45 -0
  64. kodit/{snippets/method_snippets.py → infrastructure/snippet_extraction/tree_sitter_snippet_extractor.py} +123 -61
  65. kodit/infrastructure/sqlalchemy/__init__.py +1 -0
  66. kodit/{embedding → infrastructure/sqlalchemy}/embedding_repository.py +40 -24
  67. kodit/infrastructure/sqlalchemy/file_repository.py +73 -0
  68. kodit/infrastructure/sqlalchemy/repository.py +121 -0
  69. kodit/infrastructure/sqlalchemy/snippet_repository.py +75 -0
  70. kodit/infrastructure/ui/__init__.py +1 -0
  71. kodit/infrastructure/ui/progress.py +127 -0
  72. kodit/{util → infrastructure/ui}/spinner.py +19 -4
  73. kodit/mcp.py +50 -28
  74. kodit/migrations/env.py +1 -4
  75. kodit/reporting.py +78 -0
  76. {kodit-0.2.3.dist-info → kodit-0.2.5.dist-info}/METADATA +1 -1
  77. kodit-0.2.5.dist-info/RECORD +99 -0
  78. kodit/bm25/__init__.py +0 -1
  79. kodit/bm25/keyword_search_factory.py +0 -17
  80. kodit/bm25/keyword_search_service.py +0 -34
  81. kodit/embedding/__init__.py +0 -1
  82. kodit/embedding/embedding_factory.py +0 -63
  83. kodit/embedding/embedding_models.py +0 -28
  84. kodit/embedding/embedding_provider/__init__.py +0 -1
  85. kodit/embedding/embedding_provider/embedding_provider.py +0 -64
  86. kodit/embedding/embedding_provider/hash_embedding_provider.py +0 -77
  87. kodit/embedding/embedding_provider/local_embedding_provider.py +0 -64
  88. kodit/embedding/embedding_provider/openai_embedding_provider.py +0 -77
  89. kodit/embedding/local_vector_search_service.py +0 -54
  90. kodit/embedding/vector_search_service.py +0 -38
  91. kodit/enrichment/__init__.py +0 -1
  92. kodit/enrichment/enrichment_provider/__init__.py +0 -1
  93. kodit/enrichment/enrichment_provider/enrichment_provider.py +0 -16
  94. kodit/enrichment/enrichment_provider/local_enrichment_provider.py +0 -92
  95. kodit/enrichment/enrichment_provider/openai_enrichment_provider.py +0 -81
  96. kodit/enrichment/enrichment_service.py +0 -33
  97. kodit/indexing/__init__.py +0 -1
  98. kodit/indexing/fusion.py +0 -67
  99. kodit/indexing/indexing_models.py +0 -43
  100. kodit/indexing/indexing_repository.py +0 -216
  101. kodit/indexing/indexing_service.py +0 -338
  102. kodit/snippets/__init__.py +0 -1
  103. kodit/snippets/languages/__init__.py +0 -53
  104. kodit/snippets/snippets.py +0 -50
  105. kodit/source/__init__.py +0 -1
  106. kodit/source/source_factories.py +0 -356
  107. kodit/source/source_repository.py +0 -169
  108. kodit/source/source_service.py +0 -150
  109. kodit/util/__init__.py +0 -1
  110. kodit-0.2.3.dist-info/RECORD +0 -71
  111. /kodit/{snippets → infrastructure/snippet_extraction}/languages/csharp.scm +0 -0
  112. /kodit/{snippets → infrastructure/snippet_extraction}/languages/go.scm +0 -0
  113. /kodit/{snippets → infrastructure/snippet_extraction}/languages/javascript.scm +0 -0
  114. /kodit/{snippets → infrastructure/snippet_extraction}/languages/python.scm +0 -0
  115. /kodit/{snippets → infrastructure/snippet_extraction}/languages/typescript.scm +0 -0
  116. {kodit-0.2.3.dist-info → kodit-0.2.5.dist-info}/WHEEL +0 -0
  117. {kodit-0.2.3.dist-info → kodit-0.2.5.dist-info}/entry_points.txt +0 -0
  118. {kodit-0.2.3.dist-info → kodit-0.2.5.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,42 @@
1
+ """Legacy enrichment models for backward compatibility."""
2
+
3
+ from abc import ABC, abstractmethod
4
+ from collections.abc import AsyncGenerator
5
+ from dataclasses import dataclass
6
+
7
+
8
@dataclass
class EnrichmentRequest:
    """Legacy request model: one snippet's text submitted for enrichment.

    Kept for backward compatibility with the pre-refactor enrichment API.
    """

    # Identifier of the snippet the text belongs to.
    snippet_id: int
    # Text to be enriched.
    text: str
14
+
15
+
16
@dataclass
class EnrichmentResponse:
    """Legacy response model: the enrichment text produced for one snippet.

    Kept for backward compatibility with the pre-refactor enrichment API.
    """

    # Identifier of the snippet this response belongs to.
    snippet_id: int
    # Enrichment text for the snippet.
    text: str
22
+
23
+
24
class EnrichmentService(ABC):
    """Abstract interface of the legacy enrichment service.

    Concrete services implement ``enrich`` as an async generator.
    """

    @abstractmethod
    def enrich(
        self, data: list[EnrichmentRequest]
    ) -> AsyncGenerator[EnrichmentResponse, None]:
        """Enrich the given requests.

        Args:
            data: The enrichment requests to process.

        Returns:
            An async generator of enrichment responses.

        """
32
+
33
+
34
class NullEnrichmentService(EnrichmentService):
    """No-op enrichment service for testing; every response has empty text."""

    async def enrich(
        self, data: list[EnrichmentRequest]
    ) -> AsyncGenerator[EnrichmentResponse, None]:
        """Yield one empty-text response per request, in input order.

        Args:
            data: The enrichment requests to answer.

        Yields:
            A response carrying the request's ``snippet_id`` and ``text=""``.

        """
        for item in data:
            empty_response = EnrichmentResponse(snippet_id=item.snippet_id, text="")
            yield empty_response
@@ -0,0 +1,115 @@
1
+ """Local enrichment provider implementation."""
2
+
3
+ import os
4
+ from collections.abc import AsyncGenerator
5
+
6
+ import structlog
7
+ import tiktoken
8
+
9
+ from kodit.domain.services.enrichment_service import EnrichmentProvider
10
+ from kodit.domain.value_objects import EnrichmentRequest, EnrichmentResponse
11
+
12
# System message sent ahead of every snippet. The value starts and ends with
# a newline.
ENRICHMENT_SYSTEM_PROMPT = (
    "\n"
    "You are a professional software developer. "
    "You will be given a snippet of code.\n"
    "Please provide a concise explanation of the code.\n"
)

# Model identifier used when the caller does not supply one.
DEFAULT_ENRICHMENT_MODEL = "Qwen/Qwen3-0.6B"

# Small so it works even on low-powered devices.
DEFAULT_CONTEXT_WINDOW_SIZE = 2048
19
+
20
+
21
class LocalEnrichmentProvider(EnrichmentProvider):
    """Enrichment provider that explains snippets with a locally loaded model.

    The tokenizer and the model are created on the first ``enrich`` call that
    has at least one non-empty request, and are then reused by later calls.
    """

    def __init__(
        self,
        model_name: str = DEFAULT_ENRICHMENT_MODEL,
        context_window: int = DEFAULT_CONTEXT_WINDOW_SIZE,
    ) -> None:
        """Store the configuration; no model is loaded here.

        Args:
            model_name: Name of the model passed to ``from_pretrained``.
            context_window: Passed to ``generate`` as ``max_new_tokens``.

        """
        self.log = structlog.get_logger(__name__)
        self.model_name = model_name
        self.context_window = context_window
        # Both are populated lazily inside ``enrich``.
        self.model = None
        self.tokenizer = None
        # NOTE(review): this encoding is not read anywhere in this class;
        # confirm whether external callers rely on the attribute.
        self.encoding = tiktoken.encoding_for_model("text-embedding-3-small")

    async def enrich(
        self, requests: list[EnrichmentRequest]
    ) -> AsyncGenerator[EnrichmentResponse, None]:
        """Generate an explanation for each non-empty request.

        Requests whose ``text`` is falsy are dropped and get no response.
        The remaining requests are processed one at a time, in input order.

        Args:
            requests: The enrichment requests to process.

        Yields:
            One response per non-empty request, carrying the decoded model
            output with leading and trailing newlines stripped.

        """
        pending = [request for request in requests if request.text]
        if not pending:
            self.log.warning("No valid requests for enrichment")
            return

        # Imported lazily so transformers is only needed when there is work.
        from transformers.models.auto.modeling_auto import (
            AutoModelForCausalLM,
        )
        from transformers.models.auto.tokenization_auto import AutoTokenizer

        if self.tokenizer is None:
            self.tokenizer = AutoTokenizer.from_pretrained(
                self.model_name, padding_side="left"
            )
        if self.model is None:
            os.environ["TOKENIZERS_PARALLELISM"] = "false"  # Avoid warnings
            # NOTE(review): trust_remote_code=True presumably lets the model
            # repository run its own code at load time; confirm this is
            # acceptable for user-supplied model names.
            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_name,
                torch_dtype="auto",
                trust_remote_code=True,
                device_map="auto",
            )

        # Render every chat prompt up front, before any generation starts.
        rendered: list[tuple] = []
        for request in pending:
            conversation = [
                {"role": "system", "content": ENRICHMENT_SYSTEM_PROMPT},
                {"role": "user", "content": request.text},
            ]
            prompt_text = self.tokenizer.apply_chat_template(
                conversation,
                tokenize=False,
                add_generation_prompt=True,
                enable_thinking=False,
            )
            rendered.append((request.snippet_id, prompt_text))

        # NOTE(review): generate() is called synchronously inside this
        # coroutine, so it presumably blocks the event loop while it runs;
        # confirm that is acceptable.
        for snippet_id, prompt_text in rendered:
            encoded = self.tokenizer(
                prompt_text,
                return_tensors="pt",
                padding=True,
                truncation=True,
            ).to(self.model.device)
            generated = self.model.generate(
                **encoded, max_new_tokens=self.context_window
            )
            # The generated sequence starts with the prompt tokens; keep only
            # what follows them.
            prompt_length = len(encoded["input_ids"][0])
            completion_ids = generated[0][prompt_length:].tolist()
            decoded = self.tokenizer.decode(completion_ids, skip_special_tokens=True)
            yield EnrichmentResponse(
                snippet_id=snippet_id,
                text=decoded.strip("\n"),
            )
@@ -0,0 +1,25 @@
1
+ """Null enrichment provider for testing."""
2
+
3
+ from collections.abc import AsyncGenerator
4
+
5
+ from kodit.domain.services.enrichment_service import EnrichmentProvider
6
+ from kodit.domain.value_objects import EnrichmentRequest, EnrichmentResponse
7
+
8
+
9
class NullEnrichmentProvider(EnrichmentProvider):
    """Enrichment provider that performs no enrichment.

    Every request is answered with an empty-text response.
    """

    async def enrich(
        self, requests: list[EnrichmentRequest]
    ) -> AsyncGenerator[EnrichmentResponse, None]:
        """Yield one empty-text response per request, in input order.

        Args:
            requests: The enrichment requests to answer.

        Yields:
            A response carrying the request's ``snippet_id`` and ``text=""``.

        """
        for item in requests:
            empty_response = EnrichmentResponse(snippet_id=item.snippet_id, text="")
            yield empty_response
@@ -0,0 +1,89 @@
1
+ """OpenAI enrichment provider implementation."""
2
+
3
+ import asyncio
4
+ from collections.abc import AsyncGenerator
5
+ from typing import Any
6
+
7
+ import structlog
8
+
9
+ from kodit.domain.services.enrichment_service import EnrichmentProvider
10
+ from kodit.domain.value_objects import EnrichmentRequest, EnrichmentResponse
11
+
12
# System message sent ahead of every snippet. The value starts and ends with
# a newline.
ENRICHMENT_SYSTEM_PROMPT = (
    "\n"
    "You are a professional software developer. "
    "You will be given a snippet of code.\n"
    "Please provide a concise explanation of the code.\n"
)

# Maximum number of concurrent requests. The default is tuned to approximately
# fit within OpenAI's rate limit of 500 requests per minute (RPM).
OPENAI_NUM_PARALLEL_TASKS = 40
19
+
20
+
21
class OpenAIEnrichmentProvider(EnrichmentProvider):
    """Enrichment provider that asks an OpenAI chat model to explain snippets."""

    def __init__(self, openai_client: Any, model_name: str = "gpt-4o-mini") -> None:
        """Store the client and model name used for later requests.

        Args:
            openai_client: Client whose ``chat.completions.create`` coroutine
                is awaited for each request.
            model_name: Model name sent with every completion request.

        """
        self.log = structlog.get_logger(__name__)
        self.openai_client = openai_client
        self.model_name = model_name

    async def enrich(
        self, requests: list[EnrichmentRequest]
    ) -> AsyncGenerator[EnrichmentResponse, None]:
        """Enrich all requests concurrently through the OpenAI API.

        Every request produces exactly one response. A request with empty
        text, or one whose API call raises, gets a response with ``text=""``.

        Args:
            requests: The enrichment requests to process.

        Yields:
            Responses in the order their work completes, which is not
            necessarily the order of ``requests``.

        """
        if not requests:
            self.log.warning("No requests for enrichment")
            return

        # At most OPENAI_NUM_PARALLEL_TASKS requests hold the limiter at once.
        limiter = asyncio.Semaphore(OPENAI_NUM_PARALLEL_TASKS)

        async def enrich_one(request: EnrichmentRequest) -> EnrichmentResponse:
            async with limiter:
                # Empty snippets are answered without calling the API.
                if not request.text:
                    return EnrichmentResponse(
                        snippet_id=request.snippet_id,
                        text="",
                    )
                try:
                    completion = await self.openai_client.chat.completions.create(
                        model=self.model_name,
                        messages=[
                            {
                                "role": "system",
                                "content": ENRICHMENT_SYSTEM_PROMPT,
                            },
                            {"role": "user", "content": request.text},
                        ],
                    )
                    # The message content may be None; normalise to "".
                    explanation = completion.choices[0].message.content or ""
                    return EnrichmentResponse(
                        snippet_id=request.snippet_id,
                        text=explanation,
                    )
                except Exception as exc:
                    # Best effort: log the failure and return an empty result
                    # so one bad request does not abort the rest.
                    self.log.exception("Error enriching request", error=str(exc))
                    return EnrichmentResponse(
                        snippet_id=request.snippet_id,
                        text="",
                    )

        # One coroutine per request; as_completed schedules them all.
        pending = [enrich_one(request) for request in requests]

        for finished in asyncio.as_completed(pending):
            yield await finished
@@ -0,0 +1 @@
1
+ """Git infrastructure module."""
@@ -1,4 +1,4 @@
1
- """Git utilities."""
1
+ """Git utilities for infrastructure operations."""
2
2
 
3
3
  import tempfile
4
4
 
@@ -6,7 +6,15 @@ import git
6
6
 
7
7
 
8
8
  def is_valid_clone_target(target: str) -> bool:
9
- """Return True if the target is clonable."""
9
+ """Return True if the target is clonable.
10
+
11
+ Args:
12
+ target: The git repository URL or path to validate.
13
+
14
+ Returns:
15
+ True if the target can be cloned, False otherwise.
16
+
17
+ """
10
18
  with tempfile.TemporaryDirectory() as temp_dir:
11
19
  try:
12
20
  git.Repo.clone_from(target, temp_dir)
@@ -0,0 +1 @@
1
+ """Ignore infrastructure module."""
@@ -1,18 +1,27 @@
1
- """Ignore patterns."""
1
+ """Infrastructure implementation of ignore pattern provider."""
2
2
 
3
3
  from pathlib import Path
4
4
 
5
5
  import git
6
6
  import pathspec
7
7
 
8
- from kodit.source.git import is_valid_clone_target
8
+ from kodit.domain.services.ignore_service import IgnorePatternProvider
9
+ from kodit.infrastructure.git.git_utils import is_valid_clone_target
9
10
 
10
11
 
11
- class IgnorePatterns:
12
- """Ignore patterns."""
12
+ class GitIgnorePatternProvider(IgnorePatternProvider):
13
+ """Ignore pattern provider for git repositories."""
13
14
 
14
15
  def __init__(self, base_dir: Path) -> None:
15
- """Initialize the ignore patterns."""
16
+ """Initialize the ignore pattern provider.
17
+
18
+ Args:
19
+ base_dir: The base directory to check for ignore patterns.
20
+
21
+ Raises:
22
+ ValueError: If the base directory is not a directory.
23
+
24
+ """
16
25
  if not base_dir.is_dir():
17
26
  msg = f"Base directory is not a directory: {base_dir}"
18
27
  raise ValueError(msg)
@@ -25,7 +34,15 @@ class IgnorePatterns:
25
34
  self.git_repo = git.Repo(base_dir)
26
35
 
27
36
  def should_ignore(self, path: Path) -> bool:
28
- """Check if a path should be ignored."""
37
+ """Check if a path should be ignored.
38
+
39
+ Args:
40
+ path: The path to check.
41
+
42
+ Returns:
43
+ True if the path should be ignored, False otherwise.
44
+
45
+ """
29
46
  if path.is_dir():
30
47
  return False
31
48
 
@@ -0,0 +1 @@
1
+ """Infrastructure indexing module."""
@@ -0,0 +1,55 @@
1
+ """Infrastructure implementation of the fusion service."""
2
+
3
+ from collections import defaultdict
4
+
5
+ from kodit.domain.services.indexing_service import FusionService
6
+ from kodit.domain.value_objects import FusionRequest, FusionResult
7
+
8
+
9
class ReciprocalRankFusionService(FusionService):
    """Fusion service that merges ranked lists with reciprocal rank fusion."""

    def reciprocal_rank_fusion(
        self, rankings: list[list[FusionRequest]], k: float = 60
    ) -> list[FusionResult]:
        """Fuse several ranked result lists into one scored list.

        Each entry adds ``1 / (k + position)`` to the score of its id, where
        ``position`` is the entry's zero-based index in its ranking.

        Args:
            rankings: One list per ranker, best result first. Each entry
                exposes ``id`` and ``score``.
            k: Offset added to the position before taking the reciprocal.
                Because positions start at zero, ``k=0`` raises
                ``ZeroDivisionError`` for the first entry of a ranking.

        Returns:
            One result per distinct id, sorted by fused score in descending
            order. Ties keep the order in which the ids were first seen. Each
            result also carries the scores the id had in the input rankings,
            in the order they were seen.

        """
        # Register every id with a zero score, in first-seen order.
        fused = dict.fromkeys(
            (entry.id for ranker in rankings for entry in ranker), 0.0
        )

        # Add each ranker's reciprocal-rank contribution.
        for ranker in rankings:
            for position, entry in enumerate(ranker):
                fused[entry.id] += 1.0 / (k + position)

        # Highest fused score first; sorted() is stable, so ties keep
        # first-seen order.
        ordered = sorted(fused.items(), key=lambda pair: pair[1], reverse=True)

        # Collect the per-ranker scores recorded for each id.
        source_scores = defaultdict(list)
        for ranker in rankings:
            for entry in ranker:
                source_scores[entry.id].append(entry.score)

        return [
            FusionResult(
                id=doc_id,
                score=fused_score,
                original_scores=source_scores[doc_id],
            )
            for doc_id, fused_score in ordered
        ]