kodit-0.2.3-py3-none-any.whl → kodit-0.2.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of kodit might be problematic.

Files changed (118)
  1. kodit/_version.py +2 -2
  2. kodit/application/__init__.py +1 -0
  3. kodit/application/commands/__init__.py +1 -0
  4. kodit/application/commands/snippet_commands.py +22 -0
  5. kodit/application/services/__init__.py +1 -0
  6. kodit/application/services/indexing_application_service.py +363 -0
  7. kodit/application/services/snippet_application_service.py +143 -0
  8. kodit/cli.py +105 -82
  9. kodit/database.py +0 -22
  10. kodit/domain/__init__.py +1 -0
  11. kodit/{source/source_models.py → domain/entities.py} +88 -19
  12. kodit/domain/enums.py +9 -0
  13. kodit/domain/interfaces.py +27 -0
  14. kodit/domain/repositories.py +95 -0
  15. kodit/domain/services/__init__.py +1 -0
  16. kodit/domain/services/bm25_service.py +124 -0
  17. kodit/domain/services/embedding_service.py +155 -0
  18. kodit/domain/services/enrichment_service.py +48 -0
  19. kodit/domain/services/ignore_service.py +45 -0
  20. kodit/domain/services/indexing_service.py +203 -0
  21. kodit/domain/services/snippet_extraction_service.py +89 -0
  22. kodit/domain/services/source_service.py +83 -0
  23. kodit/domain/value_objects.py +215 -0
  24. kodit/infrastructure/__init__.py +1 -0
  25. kodit/infrastructure/bm25/__init__.py +1 -0
  26. kodit/infrastructure/bm25/bm25_factory.py +28 -0
  27. kodit/{bm25/local_bm25.py → infrastructure/bm25/local_bm25_repository.py} +33 -22
  28. kodit/{bm25/vectorchord_bm25.py → infrastructure/bm25/vectorchord_bm25_repository.py} +40 -35
  29. kodit/infrastructure/cloning/__init__.py +1 -0
  30. kodit/infrastructure/cloning/folder/__init__.py +1 -0
  31. kodit/infrastructure/cloning/folder/factory.py +119 -0
  32. kodit/infrastructure/cloning/folder/working_copy.py +38 -0
  33. kodit/infrastructure/cloning/git/__init__.py +1 -0
  34. kodit/infrastructure/cloning/git/factory.py +133 -0
  35. kodit/infrastructure/cloning/git/working_copy.py +32 -0
  36. kodit/infrastructure/cloning/metadata.py +127 -0
  37. kodit/infrastructure/embedding/__init__.py +1 -0
  38. kodit/infrastructure/embedding/embedding_factory.py +87 -0
  39. kodit/infrastructure/embedding/embedding_providers/__init__.py +1 -0
  40. kodit/infrastructure/embedding/embedding_providers/batching.py +93 -0
  41. kodit/infrastructure/embedding/embedding_providers/hash_embedding_provider.py +79 -0
  42. kodit/infrastructure/embedding/embedding_providers/local_embedding_provider.py +129 -0
  43. kodit/infrastructure/embedding/embedding_providers/openai_embedding_provider.py +113 -0
  44. kodit/infrastructure/embedding/local_vector_search_repository.py +114 -0
  45. kodit/{embedding/vectorchord_vector_search_service.py → infrastructure/embedding/vectorchord_vector_search_repository.py} +98 -32
  46. kodit/infrastructure/enrichment/__init__.py +1 -0
  47. kodit/{enrichment → infrastructure/enrichment}/enrichment_factory.py +28 -12
  48. kodit/infrastructure/enrichment/legacy_enrichment_models.py +42 -0
  49. kodit/infrastructure/enrichment/local_enrichment_provider.py +115 -0
  50. kodit/infrastructure/enrichment/null_enrichment_provider.py +25 -0
  51. kodit/infrastructure/enrichment/openai_enrichment_provider.py +89 -0
  52. kodit/infrastructure/git/__init__.py +1 -0
  53. kodit/{source/git.py → infrastructure/git/git_utils.py} +10 -2
  54. kodit/infrastructure/ignore/__init__.py +1 -0
  55. kodit/{source/ignore.py → infrastructure/ignore/ignore_pattern_provider.py} +23 -6
  56. kodit/infrastructure/indexing/__init__.py +1 -0
  57. kodit/infrastructure/indexing/fusion_service.py +55 -0
  58. kodit/infrastructure/indexing/index_repository.py +296 -0
  59. kodit/infrastructure/indexing/indexing_factory.py +111 -0
  60. kodit/infrastructure/snippet_extraction/__init__.py +1 -0
  61. kodit/infrastructure/snippet_extraction/language_detection_service.py +39 -0
  62. kodit/infrastructure/snippet_extraction/snippet_extraction_factory.py +95 -0
  63. kodit/infrastructure/snippet_extraction/snippet_query_provider.py +45 -0
  64. kodit/{snippets/method_snippets.py → infrastructure/snippet_extraction/tree_sitter_snippet_extractor.py} +123 -61
  65. kodit/infrastructure/sqlalchemy/__init__.py +1 -0
  66. kodit/{embedding → infrastructure/sqlalchemy}/embedding_repository.py +40 -24
  67. kodit/infrastructure/sqlalchemy/file_repository.py +73 -0
  68. kodit/infrastructure/sqlalchemy/repository.py +121 -0
  69. kodit/infrastructure/sqlalchemy/snippet_repository.py +75 -0
  70. kodit/infrastructure/ui/__init__.py +1 -0
  71. kodit/infrastructure/ui/progress.py +127 -0
  72. kodit/{util → infrastructure/ui}/spinner.py +19 -4
  73. kodit/mcp.py +50 -28
  74. kodit/migrations/env.py +1 -4
  75. kodit/reporting.py +78 -0
  76. {kodit-0.2.3.dist-info → kodit-0.2.5.dist-info}/METADATA +1 -1
  77. kodit-0.2.5.dist-info/RECORD +99 -0
  78. kodit/bm25/__init__.py +0 -1
  79. kodit/bm25/keyword_search_factory.py +0 -17
  80. kodit/bm25/keyword_search_service.py +0 -34
  81. kodit/embedding/__init__.py +0 -1
  82. kodit/embedding/embedding_factory.py +0 -63
  83. kodit/embedding/embedding_models.py +0 -28
  84. kodit/embedding/embedding_provider/__init__.py +0 -1
  85. kodit/embedding/embedding_provider/embedding_provider.py +0 -64
  86. kodit/embedding/embedding_provider/hash_embedding_provider.py +0 -77
  87. kodit/embedding/embedding_provider/local_embedding_provider.py +0 -64
  88. kodit/embedding/embedding_provider/openai_embedding_provider.py +0 -77
  89. kodit/embedding/local_vector_search_service.py +0 -54
  90. kodit/embedding/vector_search_service.py +0 -38
  91. kodit/enrichment/__init__.py +0 -1
  92. kodit/enrichment/enrichment_provider/__init__.py +0 -1
  93. kodit/enrichment/enrichment_provider/enrichment_provider.py +0 -16
  94. kodit/enrichment/enrichment_provider/local_enrichment_provider.py +0 -92
  95. kodit/enrichment/enrichment_provider/openai_enrichment_provider.py +0 -81
  96. kodit/enrichment/enrichment_service.py +0 -33
  97. kodit/indexing/__init__.py +0 -1
  98. kodit/indexing/fusion.py +0 -67
  99. kodit/indexing/indexing_models.py +0 -43
  100. kodit/indexing/indexing_repository.py +0 -216
  101. kodit/indexing/indexing_service.py +0 -338
  102. kodit/snippets/__init__.py +0 -1
  103. kodit/snippets/languages/__init__.py +0 -53
  104. kodit/snippets/snippets.py +0 -50
  105. kodit/source/__init__.py +0 -1
  106. kodit/source/source_factories.py +0 -356
  107. kodit/source/source_repository.py +0 -169
  108. kodit/source/source_service.py +0 -150
  109. kodit/util/__init__.py +0 -1
  110. kodit-0.2.3.dist-info/RECORD +0 -71
  111. /kodit/{snippets → infrastructure/snippet_extraction}/languages/csharp.scm +0 -0
  112. /kodit/{snippets → infrastructure/snippet_extraction}/languages/go.scm +0 -0
  113. /kodit/{snippets → infrastructure/snippet_extraction}/languages/javascript.scm +0 -0
  114. /kodit/{snippets → infrastructure/snippet_extraction}/languages/python.scm +0 -0
  115. /kodit/{snippets → infrastructure/snippet_extraction}/languages/typescript.scm +0 -0
  116. {kodit-0.2.3.dist-info → kodit-0.2.5.dist-info}/WHEEL +0 -0
  117. {kodit-0.2.3.dist-info → kodit-0.2.5.dist-info}/entry_points.txt +0 -0
  118. {kodit-0.2.3.dist-info → kodit-0.2.5.dist-info}/licenses/LICENSE +0 -0
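
The list above amounts to a wholesale restructuring: the feature-oriented packages of 0.2.3 (kodit/bm25, kodit/embedding, kodit/enrichment, kodit/indexing, kodit/snippets, kodit/source) are replaced by layered application/domain/infrastructure packages in 0.2.5. A minimal compatibility sketch for downstream code that imported 0.2.3 internals; the 0.2.5 module path comes from entry 41 above, but the assumption that the class name HashEmbeddingProvider survives unchanged in the new module is not confirmed by this diff:

```python
# Hedged import shim: prefer the 0.2.5 layered path, fall back to the 0.2.3
# feature-oriented path. The class name in the new module is an assumption.
try:
    from kodit.infrastructure.embedding.embedding_providers.hash_embedding_provider import (
        HashEmbeddingProvider,
    )
except ImportError:
    from kodit.embedding.embedding_provider.hash_embedding_provider import (
        HashEmbeddingProvider,
    )
```
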
kodit/embedding/embedding_provider/embedding_provider.py DELETED
@@ -1,64 +0,0 @@
- """Embedding provider."""
-
- from abc import ABC, abstractmethod
-
- import structlog
- import tiktoken
-
- OPENAI_MAX_EMBEDDING_SIZE = 8192
-
- Vector = list[float]
-
-
- class EmbeddingProvider(ABC):
-     """Embedding provider."""
-
-     @abstractmethod
-     async def embed(self, data: list[str]) -> list[Vector]:
-         """Embed a list of strings.
-
-         The embedding provider is responsible for embedding a list of strings into a
-         list of vectors. The embedding provider is responsible for splitting the list of
-         strings into smaller sub-batches and embedding them in parallel.
-         """
-
-
- def split_sub_batches(
-     encoding: tiktoken.Encoding,
-     data: list[str],
-     max_context_window: int = OPENAI_MAX_EMBEDDING_SIZE,
- ) -> list[list[str]]:
-     """Split a list of strings into smaller sub-batches."""
-     log = structlog.get_logger(__name__)
-     result = []
-     data_to_process = [s for s in data if s.strip()]  # Filter out empty strings
-
-     while data_to_process:
-         next_batch = []
-         current_tokens = 0
-
-         while data_to_process:
-             next_item = data_to_process[0]
-             item_tokens = len(encoding.encode(next_item, disallowed_special=()))
-
-             if item_tokens > max_context_window:
-                 # Loop around trying to truncate the snippet until it fits in the max
-                 # embedding size
-                 while item_tokens > max_context_window:
-                     next_item = next_item[:-1]
-                     item_tokens = len(encoding.encode(next_item, disallowed_special=()))
-
-                 data_to_process[0] = next_item
-
-                 log.warning("Truncated snippet", snippet=next_item)
-
-             if current_tokens + item_tokens > max_context_window:
-                 break
-
-             next_batch.append(data_to_process.pop(0))
-             current_tokens += item_tokens
-
-         if next_batch:
-             result.append(next_batch)
-
-     return result
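
split_sub_batches is greedy: it drops blank strings, truncates any single item that exceeds max_context_window, and packs items in input order until the next item would overflow the batch. A short usage sketch against the 0.2.3 module deleted above (per the file list, this logic appears to move to kodit/infrastructure/embedding/embedding_providers/batching.py in 0.2.5):

```python
import tiktoken

from kodit.embedding.embedding_provider.embedding_provider import split_sub_batches

# cl100k_base is the encoding tiktoken returns for text-embedding-3-small.
encoding = tiktoken.get_encoding("cl100k_base")

snippets = ["def add(a, b):\n    return a + b", "print('hi')", "   "]
batches = split_sub_batches(encoding, snippets, max_context_window=16)

# The all-whitespace entry is filtered out, order is preserved, and every
# batch stays within the 16-token budget.
for batch in batches:
    total = sum(len(encoding.encode(s, disallowed_special=())) for s in batch)
    assert total <= 16
```
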
kodit/embedding/embedding_provider/hash_embedding_provider.py DELETED
@@ -1,77 +0,0 @@
- """Hash embedding provider, for use in tests only."""
-
- import asyncio
- import hashlib
- import math
- from collections.abc import Generator, Sequence
-
- from kodit.embedding.embedding_provider.embedding_provider import (
-     EmbeddingProvider,
-     Vector,
- )
-
-
- class HashEmbeddingProvider(EmbeddingProvider):
-     """A minimal test-time embedding provider.
-
-     • Zero third-party dependencies (uses only std-lib)
-     • Distinguishes strings by hashing with SHA-256
-     • Maps the digest to a fixed-size float vector, then ℓ₂-normalises
-     • Splits work into small asynchronous chunks for speed in event loops
-     """
-
-     def __init__(self, dim: int = 16, batch_size: int = 64) -> None:
-         """Initialize the hash embedding provider."""
-         if dim <= 0:
-             msg = f"dim must be > 0, got {dim}"
-             raise ValueError(msg)
-         if batch_size <= 0:
-             msg = f"batch_size must be > 0, got {batch_size}"
-             raise ValueError(msg)
-         self.dim = dim
-         self.batch_size = batch_size
-
-     async def embed(self, data: list[str]) -> list[Vector]:
-         """Embed every string in *data*, preserving order.
-
-         Work is sliced into *batch_size* chunks and scheduled concurrently
-         (still CPU-bound, but enough to cooperate with an asyncio loop).
-         """
-         if not data:
-             return []
-
-         async def _embed_chunk(chunk: Sequence[str]) -> list[Vector]:
-             return [self._string_to_vector(text) for text in chunk]
-
-         tasks = [
-             asyncio.create_task(_embed_chunk(chunk))
-             for chunk in self._chunked(data, self.batch_size)
-         ]
-
-         vectors: list[Vector] = []
-         for task in tasks:
-             vectors.extend(await task)
-         return vectors
-
-     @staticmethod
-     def _chunked(seq: Sequence[str], size: int) -> Generator[Sequence[str], None, None]:
-         """Yield successive *size*-sized slices from *seq*."""
-         for i in range(0, len(seq), size):
-             yield seq[i : i + size]
-
-     def _string_to_vector(self, text: str) -> Vector:
-         """Deterministically convert *text* to a normalised float vector."""
-         digest = hashlib.sha256(text.encode("utf-8")).digest()
-
-         # Build the vector from 4-byte windows of the digest.
-         vec = [
-             int.from_bytes(
-                 digest[(i * 4) % len(digest) : (i * 4) % len(digest) + 4], "big"
-             )
-             / 0xFFFFFFFF
-             for i in range(self.dim)
-         ]
-
-         # ℓ₂-normalise so magnitudes are comparable.
-         norm = math.sqrt(sum(x * x for x in vec)) or 1.0
-         return [x / norm for x in vec]
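
A quick sketch of the two properties the class above guarantees, determinism and unit norm; it exercises the deleted 0.2.3 import path:

```python
import asyncio

from kodit.embedding.embedding_provider.hash_embedding_provider import (
    HashEmbeddingProvider,
)


async def main() -> None:
    provider = HashEmbeddingProvider(dim=8)
    a, b = await provider.embed(["hello", "hello"])
    assert a == b  # SHA-256 makes the embedding deterministic
    assert abs(sum(x * x for x in a) - 1.0) < 1e-9  # ℓ₂-normalised


asyncio.run(main())
```
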
kodit/embedding/embedding_provider/local_embedding_provider.py DELETED
@@ -1,64 +0,0 @@
- """Local embedding service."""
-
- from __future__ import annotations
-
- import os
- from typing import TYPE_CHECKING
-
- import structlog
- import tiktoken
- from tqdm import tqdm
-
- from kodit.embedding.embedding_provider.embedding_provider import (
-     EmbeddingProvider,
-     Vector,
-     split_sub_batches,
- )
-
- if TYPE_CHECKING:
-     from sentence_transformers import SentenceTransformer
-
- TINY = "tiny"
- CODE = "code"
- TEST = "test"
-
- COMMON_EMBEDDING_MODELS = {
-     TINY: "ibm-granite/granite-embedding-30m-english",
-     CODE: "flax-sentence-embeddings/st-codesearch-distilroberta-base",
-     TEST: "minishlab/potion-base-4M",
- }
-
-
- class LocalEmbeddingProvider(EmbeddingProvider):
-     """Local embedder."""
-
-     def __init__(self, model_name: str) -> None:
-         """Initialize the local embedder."""
-         self.log = structlog.get_logger(__name__)
-         self.model_name = COMMON_EMBEDDING_MODELS.get(model_name, model_name)
-         self.embedding_model = None
-         self.encoding = tiktoken.encoding_for_model("text-embedding-3-small")
-
-     def _model(self) -> SentenceTransformer:
-         """Get the embedding model."""
-         if self.embedding_model is None:
-             os.environ["TOKENIZERS_PARALLELISM"] = "false"  # Avoid warnings
-             from sentence_transformers import SentenceTransformer
-
-             self.embedding_model = SentenceTransformer(
-                 self.model_name,
-                 trust_remote_code=True,
-             )
-         return self.embedding_model
-
-     async def embed(self, data: list[str]) -> list[Vector]:
-         """Embed a list of strings."""
-         model = self._model()
-
-         batched_data = split_sub_batches(self.encoding, data)
-
-         results: list[Vector] = []
-         for batch in tqdm(batched_data, total=len(batched_data), leave=False):
-             embeddings = model.encode(batch, show_progress_bar=False, batch_size=4)
-             results.extend([[float(x) for x in embedding] for embedding in embeddings])
-         return results
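
Usage is a one-liner once sentence-transformers is installed: "tiny", "code", and "test" resolve through COMMON_EMBEDDING_MODELS, and any other string is passed to SentenceTransformer verbatim. A hedged sketch (the first call downloads the model):

```python
import asyncio

from kodit.embedding.embedding_provider.local_embedding_provider import (
    LocalEmbeddingProvider,
)


async def main() -> None:
    # "tiny" resolves to ibm-granite/granite-embedding-30m-english.
    provider = LocalEmbeddingProvider("tiny")
    vectors = await provider.embed(["hello world"])
    print(len(vectors), len(vectors[0]))  # 1 vector; width depends on the model


asyncio.run(main())
```
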
kodit/embedding/embedding_provider/openai_embedding_provider.py DELETED
@@ -1,77 +0,0 @@
- """OpenAI embedding service."""
-
- import asyncio
-
- import structlog
- import tiktoken
- from openai import AsyncOpenAI
-
- from kodit.embedding.embedding_provider.embedding_provider import (
-     EmbeddingProvider,
-     Vector,
-     split_sub_batches,
- )
-
- OPENAI_NUM_PARALLEL_TASKS = 10
-
-
- class OpenAIEmbeddingProvider(EmbeddingProvider):
-     """OpenAI embedder."""
-
-     def __init__(
-         self,
-         openai_client: AsyncOpenAI,
-         model_name: str = "text-embedding-3-small",
-     ) -> None:
-         """Initialize the OpenAI embedder."""
-         self.log = structlog.get_logger(__name__)
-         self.openai_client = openai_client
-         self.model_name = model_name
-         self.encoding = tiktoken.encoding_for_model(
-             "text-embedding-3-small"
-         )  # Sensible default
-
-     async def embed(self, data: list[str]) -> list[Vector]:
-         """Embed a list of documents."""
-         # First split the list into a list of list where each sublist has fewer than
-         # max tokens.
-         batched_data = split_sub_batches(self.encoding, data)
-
-         # Process batches in parallel with a semaphore to limit concurrent requests
-         sem = asyncio.Semaphore(OPENAI_NUM_PARALLEL_TASKS)
-
-         # Create a list of tuples with a temporary id for each batch
-         # We need to do this so that we can return the results in the same order as the
-         # input data
-         input_data = [(i, batch) for i, batch in enumerate(batched_data)]
-
-         async def process_batch(
-             data: tuple[int, list[str]],
-         ) -> tuple[int, list[Vector]]:
-             batch_id, batch = data
-             async with sem:
-                 try:
-                     response = await self.openai_client.embeddings.create(
-                         model=self.model_name,
-                         input=batch,
-                     )
-                     return batch_id, [
-                         [float(x) for x in embedding.embedding]
-                         for embedding in response.data
-                     ]
-                 except Exception as e:
-                     self.log.exception("Error embedding batch", error=str(e))
-                     return batch_id, []
-
-         # Create tasks for all batches
-         tasks = [process_batch(batch) for batch in input_data]
-
-         # Process all batches and yield results as they complete
-         results: list[tuple[int, list[Vector]]] = []
-         for task in asyncio.as_completed(tasks):
-             result = await task
-             results.append(result)
-
-         # Output in the same order as the input data
-         ordered_results = [result for _, result in sorted(results, key=lambda x: x[0])]
-         return [item for sublist in ordered_results for item in sublist]
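
The concurrency pattern here (tag each batch with its index, bound parallelism with a semaphore, drain with asyncio.as_completed, then re-sort by tag) is independent of OpenAI. A stripped-down, self-contained sketch of the same pattern, with an uppercase transform standing in for the API call:

```python
import asyncio


async def bounded_ordered_map(items: list[str], limit: int = 10) -> list[str]:
    """Run one task per item with at most `limit` in flight, preserving input order."""
    sem = asyncio.Semaphore(limit)

    async def work(tagged: tuple[int, str]) -> tuple[int, str]:
        index, item = tagged
        async with sem:
            await asyncio.sleep(0)  # stand-in for a real network call
            return index, item.upper()

    tasks = [work(t) for t in enumerate(items)]
    # as_completed yields in completion order; the index tag restores input order.
    results = [await t for t in asyncio.as_completed(tasks)]
    return [value for _, value in sorted(results)]


print(asyncio.run(bounded_ordered_map(["a", "b", "c"])))  # ['A', 'B', 'C']
```
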
kodit/embedding/local_vector_search_service.py DELETED
@@ -1,54 +0,0 @@
- """Local vector search."""
-
- import structlog
- import tiktoken
-
- from kodit.embedding.embedding_models import Embedding, EmbeddingType
- from kodit.embedding.embedding_provider.embedding_provider import EmbeddingProvider
- from kodit.embedding.embedding_repository import EmbeddingRepository
- from kodit.embedding.vector_search_service import (
-     VectorSearchRequest,
-     VectorSearchResponse,
-     VectorSearchService,
- )
-
-
- class LocalVectorSearchService(VectorSearchService):
-     """Local vector search."""
-
-     def __init__(
-         self,
-         embedding_repository: EmbeddingRepository,
-         embedding_provider: EmbeddingProvider,
-     ) -> None:
-         """Initialize the local embedder."""
-         self.log = structlog.get_logger(__name__)
-         self.embedding_repository = embedding_repository
-         self.embedding_provider = embedding_provider
-         self.encoding = tiktoken.encoding_for_model("text-embedding-3-small")
-
-     async def index(self, data: list[VectorSearchRequest]) -> None:
-         """Embed a list of documents."""
-         if not data or len(data) == 0:
-             self.log.warning("Embedding data is empty, skipping embedding")
-             return
-
-         embeddings = await self.embedding_provider.embed([i.text for i in data])
-         for i, x in zip(data, embeddings, strict=False):
-             await self.embedding_repository.create_embedding(
-                 Embedding(
-                     snippet_id=i.snippet_id,
-                     embedding=[float(y) for y in x],
-                     type=EmbeddingType.CODE,
-                 )
-             )
-
-     async def retrieve(self, query: str, top_k: int = 10) -> list[VectorSearchResponse]:
-         """Query the embedding model."""
-         embedding = (await self.embedding_provider.embed([query]))[0]
-         results = await self.embedding_repository.list_semantic_results(
-             EmbeddingType.CODE, [float(x) for x in embedding], top_k
-         )
-         return [
-             VectorSearchResponse(snippet_id, score) for snippet_id, score in results
-         ]
kodit/embedding/vector_search_service.py DELETED
@@ -1,38 +0,0 @@
- """Embedding service."""
-
- from abc import ABC, abstractmethod
- from typing import NamedTuple
-
-
- class VectorSearchResponse(NamedTuple):
-     """Embedding result."""
-
-     snippet_id: int
-     score: float
-
-
- class VectorSearchRequest(NamedTuple):
-     """Input for embedding."""
-
-     snippet_id: int
-     text: str
-
-
- class VectorSearchService(ABC):
-     """Semantic search service interface."""
-
-     @abstractmethod
-     async def index(self, data: list[VectorSearchRequest]) -> None:
-         """Embed a list of documents.
-
-         The embedding service accepts a massive list of id,strings to embed. Behind the
-         scenes it batches up requests and parallelizes them for performance according to
-         the specifics of the embedding service.
-
-         The id reference is required because the parallelization may return results out
-         of order.
-         """
-
-     @abstractmethod
-     async def retrieve(self, query: str, top_k: int = 10) -> list[VectorSearchResponse]:
-         """Query the embedding model."""
kodit/enrichment/__init__.py DELETED
@@ -1 +0,0 @@
- """Enrichment."""
kodit/enrichment/enrichment_provider/__init__.py DELETED
@@ -1 +0,0 @@
- """Enrichment provider."""
kodit/enrichment/enrichment_provider/enrichment_provider.py DELETED
@@ -1,16 +0,0 @@
- """Enrichment provider."""
-
- from abc import ABC, abstractmethod
-
- ENRICHMENT_SYSTEM_PROMPT = """
- You are a professional software developer. You will be given a snippet of code.
- Please provide a concise explanation of the code.
- """
-
-
- class EnrichmentProvider(ABC):
-     """Enrichment provider."""
-
-     @abstractmethod
-     async def enrich(self, data: list[str]) -> list[str]:
-         """Enrich a list of strings."""
kodit/enrichment/enrichment_provider/local_enrichment_provider.py DELETED
@@ -1,92 +0,0 @@
- """Local embedding service."""
-
- import os
-
- import structlog
- import tiktoken
- from tqdm import tqdm
-
- from kodit.embedding.embedding_provider.embedding_provider import split_sub_batches
- from kodit.enrichment.enrichment_provider.enrichment_provider import (
-     ENRICHMENT_SYSTEM_PROMPT,
-     EnrichmentProvider,
- )
-
- DEFAULT_ENRICHMENT_MODEL = "Qwen/Qwen3-0.6B"
- DEFAULT_CONTEXT_WINDOW_SIZE = 2048  # Small so it works even on low-powered devices
-
-
- class LocalEnrichmentProvider(EnrichmentProvider):
-     """Local embedder."""
-
-     def __init__(
-         self,
-         model_name: str = DEFAULT_ENRICHMENT_MODEL,
-         context_window: int = DEFAULT_CONTEXT_WINDOW_SIZE,
-     ) -> None:
-         """Initialize the local enrichment provider."""
-         self.log = structlog.get_logger(__name__)
-         self.model_name = model_name
-         self.context_window = context_window
-         self.model = None
-         self.tokenizer = None
-         self.encoding = tiktoken.encoding_for_model("text-embedding-3-small")
-
-     async def enrich(self, data: list[str]) -> list[str]:
-         """Enrich a list of strings."""
-         if not data or len(data) == 0:
-             self.log.warning("Data is empty, skipping enrichment")
-             return []
-
-         from transformers.models.auto.modeling_auto import (
-             AutoModelForCausalLM,
-         )
-         from transformers.models.auto.tokenization_auto import AutoTokenizer
-
-         if self.tokenizer is None:
-             self.tokenizer = AutoTokenizer.from_pretrained(
-                 self.model_name, padding_side="left"
-             )
-         if self.model is None:
-             os.environ["TOKENIZERS_PARALLELISM"] = "false"  # Avoid warnings
-             self.model = AutoModelForCausalLM.from_pretrained(
-                 self.model_name,
-                 torch_dtype="auto",
-                 trust_remote_code=True,
-                 device_map="auto",
-             )
-
-         # Prepare prompts
-         prompts = [
-             self.tokenizer.apply_chat_template(
-                 [
-                     {"role": "system", "content": ENRICHMENT_SYSTEM_PROMPT},
-                     {"role": "user", "content": snippet},
-                 ],
-                 tokenize=False,
-                 add_generation_prompt=True,
-                 enable_thinking=False,
-             )
-             for snippet in data
-         ]
-
-         # Batch prompts using split_sub_batches
-         batched_prompts = split_sub_batches(
-             self.encoding, prompts, max_context_window=self.context_window
-         )
-         results = []
-         for batch in tqdm(batched_prompts, leave=False, total=len(batched_prompts)):
-             model_inputs = self.tokenizer(
-                 batch, return_tensors="pt", padding=True, truncation=True
-             ).to(self.model.device)
-             generated_ids = self.model.generate(
-                 **model_inputs, max_new_tokens=self.context_window
-             )
-             # For each prompt in the batch, decode only the generated part
-             for i, input_ids in enumerate(model_inputs["input_ids"]):
-                 output_ids = generated_ids[i][len(input_ids) :].tolist()
-                 content = self.tokenizer.decode(
-                     output_ids, skip_special_tokens=True
-                 ).strip("\n")
-                 results.append(content)
-         return results
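
A hedged usage sketch of the deleted provider above; it needs transformers and torch installed and downloads Qwen/Qwen3-0.6B on first use (per the file list, an equivalent provider appears at kodit/infrastructure/enrichment/local_enrichment_provider.py in 0.2.5):

```python
import asyncio

from kodit.enrichment.enrichment_provider.local_enrichment_provider import (
    LocalEnrichmentProvider,
)


async def main() -> None:
    provider = LocalEnrichmentProvider(context_window=1024)
    summaries = await provider.enrich(["def add(a, b):\n    return a + b"])
    print(summaries[0])  # a short natural-language explanation of the snippet


asyncio.run(main())
```
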
kodit/enrichment/enrichment_provider/openai_enrichment_provider.py DELETED
@@ -1,81 +0,0 @@
- """OpenAI embedding service."""
-
- import asyncio
-
- import structlog
- import tiktoken
- from openai import AsyncOpenAI
- from tqdm import tqdm
-
- from kodit.enrichment.enrichment_provider.enrichment_provider import (
-     ENRICHMENT_SYSTEM_PROMPT,
-     EnrichmentProvider,
- )
-
- OPENAI_NUM_PARALLEL_TASKS = 10
-
-
- class OpenAIEnrichmentProvider(EnrichmentProvider):
-     """OpenAI enrichment provider."""
-
-     def __init__(
-         self,
-         openai_client: AsyncOpenAI,
-         model_name: str = "gpt-4o-mini",
-     ) -> None:
-         """Initialize the OpenAI enrichment provider."""
-         self.log = structlog.get_logger(__name__)
-         self.openai_client = openai_client
-         self.model_name = model_name
-         self.encoding = tiktoken.encoding_for_model("gpt-4o-mini")  # Approximation
-
-     async def enrich(self, data: list[str]) -> list[str]:
-         """Enrich a list of documents."""
-         if not data or len(data) == 0:
-             self.log.warning("Data is empty, skipping enrichment")
-             return []
-
-         # Process batches in parallel with a semaphore to limit concurrent requests
-         sem = asyncio.Semaphore(OPENAI_NUM_PARALLEL_TASKS)
-
-         # Create a list of tuples with a temporary id for each snippet
-         # We need to do this so that we can return the results in the same order as the
-         # input data
-         input_data = [(i, snippet) for i, snippet in enumerate(data)]
-
-         async def process_data(data: tuple[int, str]) -> tuple[int, str]:
-             snippet_id, snippet = data
-             if not snippet:
-                 return snippet_id, ""
-             async with sem:
-                 try:
-                     response = await self.openai_client.chat.completions.create(
-                         model=self.model_name,
-                         messages=[
-                             {
-                                 "role": "system",
-                                 "content": ENRICHMENT_SYSTEM_PROMPT,
-                             },
-                             {"role": "user", "content": snippet},
-                         ],
-                     )
-                     return snippet_id, response.choices[0].message.content or ""
-                 except Exception as e:
-                     self.log.exception("Error enriching data", error=str(e))
-                     return snippet_id, ""
-
-         # Create tasks for all data
-         tasks = [process_data(snippet) for snippet in input_data]
-
-         # Process all data and yield results as they complete
-         results: list[tuple[int, str]] = []
-         for task in tqdm(
-             asyncio.as_completed(tasks),
-             total=len(tasks),
-             leave=False,
-         ):
-             result = await task
-             results.append(result)
-
-         # Output in the same order as the input data
-         return [result for _, result in sorted(results, key=lambda x: x[0])]
kodit/enrichment/enrichment_service.py DELETED
@@ -1,33 +0,0 @@
- """Enrichment service."""
-
- from abc import ABC, abstractmethod
-
- from kodit.enrichment.enrichment_provider.enrichment_provider import EnrichmentProvider
-
-
- class EnrichmentService(ABC):
-     """Enrichment service."""
-
-     @abstractmethod
-     async def enrich(self, data: list[str]) -> list[str]:
-         """Enrich a list of strings."""
-
-
- class NullEnrichmentService(EnrichmentService):
-     """Null enrichment service."""
-
-     async def enrich(self, data: list[str]) -> list[str]:
-         """Enrich a list of strings."""
-         return [""] * len(data)
-
-
- class LLMEnrichmentService(EnrichmentService):
-     """Enrichment service using an LLM."""
-
-     def __init__(self, enrichment_provider: EnrichmentProvider) -> None:
-         """Initialize the enrichment service."""
-         self.enrichment_provider = enrichment_provider
-
-     async def enrich(self, data: list[str]) -> list[str]:
-         """Enrich a list of strings."""
-         return await self.enrichment_provider.enrich(data)
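
The split between NullEnrichmentService and LLMEnrichmentService lets callers toggle enrichment without changing the pipeline shape, since both return exactly one string per input. A small wiring sketch; the build_enrichment_service helper is hypothetical, not part of kodit:

```python
from kodit.enrichment.enrichment_provider.local_enrichment_provider import (
    LocalEnrichmentProvider,
)
from kodit.enrichment.enrichment_service import (
    EnrichmentService,
    LLMEnrichmentService,
    NullEnrichmentService,
)


def build_enrichment_service(enabled: bool) -> EnrichmentService:
    """Hypothetical factory: keep the pipeline shape, skip the LLM when disabled."""
    if not enabled:
        return NullEnrichmentService()  # returns "" per input, no model load
    return LLMEnrichmentService(LocalEnrichmentProvider())
```
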
@@ -1 +0,0 @@
1
- """Indexing package for managing code indexes and search functionality."""
kodit/indexing/fusion.py DELETED
@@ -1,67 +0,0 @@
- """Fusion functions for combining search results."""
-
- from collections import defaultdict
- from dataclasses import dataclass
-
-
- @dataclass
- class FusionResult:
-     """Result of a fusion operation."""
-
-     id: int
-     score: float
-     original_scores: list[float]
-
-
- @dataclass
- class FusionRequest:
-     """Result of a RRF operation."""
-
-     id: int
-     score: float
-
-
- def reciprocal_rank_fusion(
-     rankings: list[list[FusionRequest]], k: float = 60
- ) -> list[FusionResult]:
-     """RRF prioritises results that are present in all results.
-
-     Args:
-         rankings: List of rankers, each containing a list of document ids. Top of the
-             list is considered to be the best result.
-         k: Parameter for RRF.
-
-     Returns:
-         Dictionary of ids and their scores.
-
-     """
-     scores = {}
-     for ranker in rankings:
-         for rank in ranker:
-             scores[rank.id] = float(0)
-
-     for ranker in rankings:
-         for i, rank in enumerate(ranker):
-             scores[rank.id] += 1.0 / (k + i)
-
-     # Create a list of tuples of ids and their scores
-     results = [(rank, scores[rank]) for rank in scores]
-
-     # Sort results by score
-     results.sort(key=lambda x: x[1], reverse=True)
-
-     # Create a map of original scores to ids
-     original_scores_to_ids = defaultdict(list)
-     for ranker in rankings:
-         for rank in ranker:
-             original_scores_to_ids[rank.id].append(rank.score)
-
-     # Rebuild a list of final results with their original scores
-     return [
-         FusionResult(
-             id=result[0],
-             score=result[1],
-             original_scores=original_scores_to_ids[result[0]],
-         )
-         for result in results
-     ]
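
A worked example of the deleted helper, using the 0.2.3 import path; the two rankers score on different scales, which is exactly why RRF uses only rank positions:

```python
from kodit.indexing.fusion import FusionRequest, reciprocal_rank_fusion

# Keyword scores (BM25-like) on the left, vector similarities on the right;
# only document 2 appears in both rankings.
keyword = [FusionRequest(id=1, score=9.1), FusionRequest(id=2, score=4.2)]
semantic = [FusionRequest(id=2, score=0.87), FusionRequest(id=3, score=0.71)]

fused = reciprocal_rank_fusion([keyword, semantic], k=60)

# Document 2 wins despite never ranking first:
#   id=2: 1/(60+1) + 1/(60+0) ≈ 0.0331
#   id=1: 1/(60+0)            ≈ 0.0167
#   id=3: 1/(60+1)            ≈ 0.0164
assert [r.id for r in fused] == [2, 1, 3]
print(fused[0].original_scores)  # [4.2, 0.87]
```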