kodit 0.2.3__py3-none-any.whl → 0.2.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- kodit/_version.py +2 -2
- kodit/application/__init__.py +1 -0
- kodit/application/commands/__init__.py +1 -0
- kodit/application/commands/snippet_commands.py +22 -0
- kodit/application/services/__init__.py +1 -0
- kodit/application/services/indexing_application_service.py +363 -0
- kodit/application/services/snippet_application_service.py +143 -0
- kodit/cli.py +105 -82
- kodit/database.py +0 -22
- kodit/domain/__init__.py +1 -0
- kodit/{source/source_models.py → domain/entities.py} +88 -19
- kodit/domain/enums.py +9 -0
- kodit/domain/interfaces.py +27 -0
- kodit/domain/repositories.py +95 -0
- kodit/domain/services/__init__.py +1 -0
- kodit/domain/services/bm25_service.py +124 -0
- kodit/domain/services/embedding_service.py +155 -0
- kodit/domain/services/enrichment_service.py +48 -0
- kodit/domain/services/ignore_service.py +45 -0
- kodit/domain/services/indexing_service.py +203 -0
- kodit/domain/services/snippet_extraction_service.py +89 -0
- kodit/domain/services/source_service.py +83 -0
- kodit/domain/value_objects.py +215 -0
- kodit/infrastructure/__init__.py +1 -0
- kodit/infrastructure/bm25/__init__.py +1 -0
- kodit/infrastructure/bm25/bm25_factory.py +28 -0
- kodit/{bm25/local_bm25.py → infrastructure/bm25/local_bm25_repository.py} +33 -22
- kodit/{bm25/vectorchord_bm25.py → infrastructure/bm25/vectorchord_bm25_repository.py} +40 -35
- kodit/infrastructure/cloning/__init__.py +1 -0
- kodit/infrastructure/cloning/folder/__init__.py +1 -0
- kodit/infrastructure/cloning/folder/factory.py +119 -0
- kodit/infrastructure/cloning/folder/working_copy.py +38 -0
- kodit/infrastructure/cloning/git/__init__.py +1 -0
- kodit/infrastructure/cloning/git/factory.py +133 -0
- kodit/infrastructure/cloning/git/working_copy.py +32 -0
- kodit/infrastructure/cloning/metadata.py +127 -0
- kodit/infrastructure/embedding/__init__.py +1 -0
- kodit/infrastructure/embedding/embedding_factory.py +87 -0
- kodit/infrastructure/embedding/embedding_providers/__init__.py +1 -0
- kodit/infrastructure/embedding/embedding_providers/batching.py +93 -0
- kodit/infrastructure/embedding/embedding_providers/hash_embedding_provider.py +79 -0
- kodit/infrastructure/embedding/embedding_providers/local_embedding_provider.py +129 -0
- kodit/infrastructure/embedding/embedding_providers/openai_embedding_provider.py +113 -0
- kodit/infrastructure/embedding/local_vector_search_repository.py +114 -0
- kodit/{embedding/vectorchord_vector_search_service.py → infrastructure/embedding/vectorchord_vector_search_repository.py} +98 -32
- kodit/infrastructure/enrichment/__init__.py +1 -0
- kodit/{enrichment → infrastructure/enrichment}/enrichment_factory.py +28 -12
- kodit/infrastructure/enrichment/legacy_enrichment_models.py +42 -0
- kodit/infrastructure/enrichment/local_enrichment_provider.py +115 -0
- kodit/infrastructure/enrichment/null_enrichment_provider.py +25 -0
- kodit/infrastructure/enrichment/openai_enrichment_provider.py +89 -0
- kodit/infrastructure/git/__init__.py +1 -0
- kodit/{source/git.py → infrastructure/git/git_utils.py} +10 -2
- kodit/infrastructure/ignore/__init__.py +1 -0
- kodit/{source/ignore.py → infrastructure/ignore/ignore_pattern_provider.py} +23 -6
- kodit/infrastructure/indexing/__init__.py +1 -0
- kodit/infrastructure/indexing/fusion_service.py +55 -0
- kodit/infrastructure/indexing/index_repository.py +296 -0
- kodit/infrastructure/indexing/indexing_factory.py +111 -0
- kodit/infrastructure/snippet_extraction/__init__.py +1 -0
- kodit/infrastructure/snippet_extraction/language_detection_service.py +39 -0
- kodit/infrastructure/snippet_extraction/snippet_extraction_factory.py +95 -0
- kodit/infrastructure/snippet_extraction/snippet_query_provider.py +45 -0
- kodit/{snippets/method_snippets.py → infrastructure/snippet_extraction/tree_sitter_snippet_extractor.py} +123 -61
- kodit/infrastructure/sqlalchemy/__init__.py +1 -0
- kodit/{embedding → infrastructure/sqlalchemy}/embedding_repository.py +40 -24
- kodit/infrastructure/sqlalchemy/file_repository.py +73 -0
- kodit/infrastructure/sqlalchemy/repository.py +121 -0
- kodit/infrastructure/sqlalchemy/snippet_repository.py +75 -0
- kodit/infrastructure/ui/__init__.py +1 -0
- kodit/infrastructure/ui/progress.py +127 -0
- kodit/{util → infrastructure/ui}/spinner.py +19 -4
- kodit/mcp.py +50 -28
- kodit/migrations/env.py +1 -4
- kodit/reporting.py +78 -0
- {kodit-0.2.3.dist-info → kodit-0.2.5.dist-info}/METADATA +1 -1
- kodit-0.2.5.dist-info/RECORD +99 -0
- kodit/bm25/__init__.py +0 -1
- kodit/bm25/keyword_search_factory.py +0 -17
- kodit/bm25/keyword_search_service.py +0 -34
- kodit/embedding/__init__.py +0 -1
- kodit/embedding/embedding_factory.py +0 -63
- kodit/embedding/embedding_models.py +0 -28
- kodit/embedding/embedding_provider/__init__.py +0 -1
- kodit/embedding/embedding_provider/embedding_provider.py +0 -64
- kodit/embedding/embedding_provider/hash_embedding_provider.py +0 -77
- kodit/embedding/embedding_provider/local_embedding_provider.py +0 -64
- kodit/embedding/embedding_provider/openai_embedding_provider.py +0 -77
- kodit/embedding/local_vector_search_service.py +0 -54
- kodit/embedding/vector_search_service.py +0 -38
- kodit/enrichment/__init__.py +0 -1
- kodit/enrichment/enrichment_provider/__init__.py +0 -1
- kodit/enrichment/enrichment_provider/enrichment_provider.py +0 -16
- kodit/enrichment/enrichment_provider/local_enrichment_provider.py +0 -92
- kodit/enrichment/enrichment_provider/openai_enrichment_provider.py +0 -81
- kodit/enrichment/enrichment_service.py +0 -33
- kodit/indexing/__init__.py +0 -1
- kodit/indexing/fusion.py +0 -67
- kodit/indexing/indexing_models.py +0 -43
- kodit/indexing/indexing_repository.py +0 -216
- kodit/indexing/indexing_service.py +0 -338
- kodit/snippets/__init__.py +0 -1
- kodit/snippets/languages/__init__.py +0 -53
- kodit/snippets/snippets.py +0 -50
- kodit/source/__init__.py +0 -1
- kodit/source/source_factories.py +0 -356
- kodit/source/source_repository.py +0 -169
- kodit/source/source_service.py +0 -150
- kodit/util/__init__.py +0 -1
- kodit-0.2.3.dist-info/RECORD +0 -71
- /kodit/{snippets → infrastructure/snippet_extraction}/languages/csharp.scm +0 -0
- /kodit/{snippets → infrastructure/snippet_extraction}/languages/go.scm +0 -0
- /kodit/{snippets → infrastructure/snippet_extraction}/languages/javascript.scm +0 -0
- /kodit/{snippets → infrastructure/snippet_extraction}/languages/python.scm +0 -0
- /kodit/{snippets → infrastructure/snippet_extraction}/languages/typescript.scm +0 -0
- {kodit-0.2.3.dist-info → kodit-0.2.5.dist-info}/WHEEL +0 -0
- {kodit-0.2.3.dist-info → kodit-0.2.5.dist-info}/entry_points.txt +0 -0
- {kodit-0.2.3.dist-info → kodit-0.2.5.dist-info}/licenses/LICENSE +0 -0
kodit/embedding/embedding_provider/embedding_provider.py
DELETED
@@ -1,64 +0,0 @@
-"""Embedding provider."""
-
-from abc import ABC, abstractmethod
-
-import structlog
-import tiktoken
-
-OPENAI_MAX_EMBEDDING_SIZE = 8192
-
-Vector = list[float]
-
-
-class EmbeddingProvider(ABC):
-    """Embedding provider."""
-
-    @abstractmethod
-    async def embed(self, data: list[str]) -> list[Vector]:
-        """Embed a list of strings.
-
-        The embedding provider is responsible for embedding a list of strings into a
-        list of vectors. The embedding provider is responsible for splitting the list of
-        strings into smaller sub-batches and embedding them in parallel.
-        """
-
-
-def split_sub_batches(
-    encoding: tiktoken.Encoding,
-    data: list[str],
-    max_context_window: int = OPENAI_MAX_EMBEDDING_SIZE,
-) -> list[list[str]]:
-    """Split a list of strings into smaller sub-batches."""
-    log = structlog.get_logger(__name__)
-    result = []
-    data_to_process = [s for s in data if s.strip()]  # Filter out empty strings
-
-    while data_to_process:
-        next_batch = []
-        current_tokens = 0
-
-        while data_to_process:
-            next_item = data_to_process[0]
-            item_tokens = len(encoding.encode(next_item, disallowed_special=()))
-
-            if item_tokens > max_context_window:
-                # Loop around trying to truncate the snippet until it fits in the max
-                # embedding size
-                while item_tokens > max_context_window:
-                    next_item = next_item[:-1]
-                    item_tokens = len(encoding.encode(next_item, disallowed_special=()))
-
-                data_to_process[0] = next_item
-
-                log.warning("Truncated snippet", snippet=next_item)
-
-            if current_tokens + item_tokens > max_context_window:
-                break
-
-            next_batch.append(data_to_process.pop(0))
-            current_tokens += item_tokens
-
-        if next_batch:
-            result.append(next_batch)
-
-    return result
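The removed helper packs strings greedily by token count and character-truncates any single string that exceeds the window. A minimal sketch of how it was driven (the sample data is illustrative):

import tiktoken

encoding = tiktoken.encoding_for_model("text-embedding-3-small")
snippets = ["def add(a, b):\n    return a + b", "print('hi')\n" * 2000]
batches = split_sub_batches(encoding, snippets, max_context_window=8192)
# Every batch's summed token count fits the window; the oversized second
# snippet is trimmed one character at a time until it fits on its own.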
kodit/embedding/embedding_provider/hash_embedding_provider.py
DELETED
@@ -1,77 +0,0 @@
-"""Hash embedding provider, for use in tests only."""
-
-import asyncio
-import hashlib
-import math
-from collections.abc import Generator, Sequence
-
-from kodit.embedding.embedding_provider.embedding_provider import (
-    EmbeddingProvider,
-    Vector,
-)
-
-
-class HashEmbeddingProvider(EmbeddingProvider):
-    """A minimal test-time embedding provider.
-
-    • Zero third-party dependencies (uses only std-lib)
-    • Distinguishes strings by hashing with SHA-256
-    • Maps the digest to a fixed-size float vector, then ℓ₂-normalises
-    • Splits work into small asynchronous chunks for speed in event loops
-    """
-
-    def __init__(self, dim: int = 16, batch_size: int = 64) -> None:
-        """Initialize the hash embedding provider."""
-        if dim <= 0:
-            msg = f"dim must be > 0, got {dim}"
-            raise ValueError(msg)
-        if batch_size <= 0:
-            msg = f"batch_size must be > 0, got {batch_size}"
-            raise ValueError(msg)
-        self.dim = dim
-        self.batch_size = batch_size
-
-    async def embed(self, data: list[str]) -> list[Vector]:
-        """Embed every string in *data*, preserving order.
-
-        Work is sliced into *batch_size* chunks and scheduled concurrently
-        (still CPU-bound, but enough to cooperate with an asyncio loop).
-        """
-        if not data:
-            return []
-
-        async def _embed_chunk(chunk: Sequence[str]) -> list[Vector]:
-            return [self._string_to_vector(text) for text in chunk]
-
-        tasks = [
-            asyncio.create_task(_embed_chunk(chunk))
-            for chunk in self._chunked(data, self.batch_size)
-        ]
-
-        vectors: list[Vector] = []
-        for task in tasks:
-            vectors.extend(await task)
-        return vectors
-
-    @staticmethod
-    def _chunked(seq: Sequence[str], size: int) -> Generator[Sequence[str], None, None]:
-        """Yield successive *size*-sized slices from *seq*."""
-        for i in range(0, len(seq), size):
-            yield seq[i : i + size]
-
-    def _string_to_vector(self, text: str) -> Vector:
-        """Deterministically convert *text* to a normalised float vector."""
-        digest = hashlib.sha256(text.encode("utf-8")).digest()
-
-        # Build the vector from 4-byte windows of the digest.
-        vec = [
-            int.from_bytes(
-                digest[(i * 4) % len(digest) : (i * 4) % len(digest) + 4], "big"
-            )
-            / 0xFFFFFFFF
-            for i in range(self.dim)
-        ]
-
-        # ℓ₂-normalise so magnitudes are comparable.
-        norm = math.sqrt(sum(x * x for x in vec)) or 1.0
-        return [x / norm for x in vec]
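Because each vector is a pure function of the SHA-256 digest, embeddings are deterministic across runs, which is what made this provider suitable for tests. A usage sketch (the asyncio.run wrapper is illustrative):

import asyncio

provider = HashEmbeddingProvider(dim=16, batch_size=64)
vecs = asyncio.run(provider.embed(["hello", "world", "hello"]))
assert vecs[0] == vecs[2]  # identical input, identical vector
assert abs(sum(x * x for x in vecs[0]) - 1.0) < 1e-9  # unit length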
kodit/embedding/embedding_provider/local_embedding_provider.py
DELETED
@@ -1,64 +0,0 @@
-"""Local embedding service."""
-
-from __future__ import annotations
-
-import os
-from typing import TYPE_CHECKING
-
-import structlog
-import tiktoken
-from tqdm import tqdm
-
-from kodit.embedding.embedding_provider.embedding_provider import (
-    EmbeddingProvider,
-    Vector,
-    split_sub_batches,
-)
-
-if TYPE_CHECKING:
-    from sentence_transformers import SentenceTransformer
-
-TINY = "tiny"
-CODE = "code"
-TEST = "test"
-
-COMMON_EMBEDDING_MODELS = {
-    TINY: "ibm-granite/granite-embedding-30m-english",
-    CODE: "flax-sentence-embeddings/st-codesearch-distilroberta-base",
-    TEST: "minishlab/potion-base-4M",
-}
-
-
-class LocalEmbeddingProvider(EmbeddingProvider):
-    """Local embedder."""
-
-    def __init__(self, model_name: str) -> None:
-        """Initialize the local embedder."""
-        self.log = structlog.get_logger(__name__)
-        self.model_name = COMMON_EMBEDDING_MODELS.get(model_name, model_name)
-        self.embedding_model = None
-        self.encoding = tiktoken.encoding_for_model("text-embedding-3-small")
-
-    def _model(self) -> SentenceTransformer:
-        """Get the embedding model."""
-        if self.embedding_model is None:
-            os.environ["TOKENIZERS_PARALLELISM"] = "false"  # Avoid warnings
-            from sentence_transformers import SentenceTransformer
-
-            self.embedding_model = SentenceTransformer(
-                self.model_name,
-                trust_remote_code=True,
-            )
-        return self.embedding_model
-
-    async def embed(self, data: list[str]) -> list[Vector]:
-        """Embed a list of strings."""
-        model = self._model()
-
-        batched_data = split_sub_batches(self.encoding, data)
-
-        results: list[Vector] = []
-        for batch in tqdm(batched_data, total=len(batched_data), leave=False):
-            embeddings = model.encode(batch, show_progress_bar=False, batch_size=4)
-            results.extend([[float(x) for x in embedding] for embedding in embeddings])
-        return results
kodit/embedding/embedding_provider/openai_embedding_provider.py
DELETED
@@ -1,77 +0,0 @@
-"""OpenAI embedding service."""
-
-import asyncio
-
-import structlog
-import tiktoken
-from openai import AsyncOpenAI
-
-from kodit.embedding.embedding_provider.embedding_provider import (
-    EmbeddingProvider,
-    Vector,
-    split_sub_batches,
-)
-
-OPENAI_NUM_PARALLEL_TASKS = 10
-
-
-class OpenAIEmbeddingProvider(EmbeddingProvider):
-    """OpenAI embedder."""
-
-    def __init__(
-        self,
-        openai_client: AsyncOpenAI,
-        model_name: str = "text-embedding-3-small",
-    ) -> None:
-        """Initialize the OpenAI embedder."""
-        self.log = structlog.get_logger(__name__)
-        self.openai_client = openai_client
-        self.model_name = model_name
-        self.encoding = tiktoken.encoding_for_model(
-            "text-embedding-3-small"
-        )  # Sensible default
-
-    async def embed(self, data: list[str]) -> list[Vector]:
-        """Embed a list of documents."""
-        # First split the list into a list of list where each sublist has fewer than
-        # max tokens.
-        batched_data = split_sub_batches(self.encoding, data)
-
-        # Process batches in parallel with a semaphore to limit concurrent requests
-        sem = asyncio.Semaphore(OPENAI_NUM_PARALLEL_TASKS)
-
-        # Create a list of tuples with a temporary id for each batch
-        # We need to do this so that we can return the results in the same order as the
-        # input data
-        input_data = [(i, batch) for i, batch in enumerate(batched_data)]
-
-        async def process_batch(
-            data: tuple[int, list[str]],
-        ) -> tuple[int, list[Vector]]:
-            batch_id, batch = data
-            async with sem:
-                try:
-                    response = await self.openai_client.embeddings.create(
-                        model=self.model_name,
-                        input=batch,
-                    )
-                    return batch_id, [
-                        [float(x) for x in embedding.embedding]
-                        for embedding in response.data
-                    ]
-                except Exception as e:
-                    self.log.exception("Error embedding batch", error=str(e))
-                    return batch_id, []
-
-        # Create tasks for all batches
-        tasks = [process_batch(batch) for batch in input_data]
-
-        # Process all batches and yield results as they complete
-        results: list[tuple[int, list[Vector]]] = []
-        for task in asyncio.as_completed(tasks):
-            result = await task
-            results.append(result)
-
-        # Output in the same order as the input data
-        ordered_results = [result for _, result in sorted(results, key=lambda x: x[0])]
-        return [item for sublist in ordered_results for item in sublist]
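The provider fanned batches out under a 10-task semaphore and restored input order via the temporary batch ids. A wiring sketch, to be run inside an async context (the API key is a placeholder):

from openai import AsyncOpenAI

async def embed_snippets(snippets: list[str]) -> list[Vector]:
    client = AsyncOpenAI(api_key="sk-...")  # placeholder key
    provider = OpenAIEmbeddingProvider(client)
    # Results come back in input order despite as_completed scheduling,
    # thanks to the batch-id sort; empty strings are dropped by
    # split_sub_batches, so they produce no vector.
    return await provider.embed(snippets)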
kodit/embedding/local_vector_search_service.py
DELETED
@@ -1,54 +0,0 @@
-"""Local vector search."""
-
-import structlog
-import tiktoken
-
-from kodit.embedding.embedding_models import Embedding, EmbeddingType
-from kodit.embedding.embedding_provider.embedding_provider import EmbeddingProvider
-from kodit.embedding.embedding_repository import EmbeddingRepository
-from kodit.embedding.vector_search_service import (
-    VectorSearchRequest,
-    VectorSearchResponse,
-    VectorSearchService,
-)
-
-
-class LocalVectorSearchService(VectorSearchService):
-    """Local vector search."""
-
-    def __init__(
-        self,
-        embedding_repository: EmbeddingRepository,
-        embedding_provider: EmbeddingProvider,
-    ) -> None:
-        """Initialize the local embedder."""
-        self.log = structlog.get_logger(__name__)
-        self.embedding_repository = embedding_repository
-        self.embedding_provider = embedding_provider
-        self.encoding = tiktoken.encoding_for_model("text-embedding-3-small")
-
-    async def index(self, data: list[VectorSearchRequest]) -> None:
-        """Embed a list of documents."""
-        if not data or len(data) == 0:
-            self.log.warning("Embedding data is empty, skipping embedding")
-            return
-
-        embeddings = await self.embedding_provider.embed([i.text for i in data])
-        for i, x in zip(data, embeddings, strict=False):
-            await self.embedding_repository.create_embedding(
-                Embedding(
-                    snippet_id=i.snippet_id,
-                    embedding=[float(y) for y in x],
-                    type=EmbeddingType.CODE,
-                )
-            )
-
-    async def retrieve(self, query: str, top_k: int = 10) -> list[VectorSearchResponse]:
-        """Query the embedding model."""
-        embedding = (await self.embedding_provider.embed([query]))[0]
-        results = await self.embedding_repository.list_semantic_results(
-            EmbeddingType.CODE, [float(x) for x in embedding], top_k
-        )
-        return [
-            VectorSearchResponse(snippet_id, score) for snippet_id, score in results
-        ]
kodit/embedding/vector_search_service.py
DELETED
@@ -1,38 +0,0 @@
-"""Embedding service."""
-
-from abc import ABC, abstractmethod
-from typing import NamedTuple
-
-
-class VectorSearchResponse(NamedTuple):
-    """Embedding result."""
-
-    snippet_id: int
-    score: float
-
-
-class VectorSearchRequest(NamedTuple):
-    """Input for embedding."""
-
-    snippet_id: int
-    text: str
-
-
-class VectorSearchService(ABC):
-    """Semantic search service interface."""
-
-    @abstractmethod
-    async def index(self, data: list[VectorSearchRequest]) -> None:
-        """Embed a list of documents.
-
-        The embedding service accepts a massive list of id,strings to embed. Behind the
-        scenes it batches up requests and parallelizes them for performance according to
-        the specifics of the embedding service.
-
-        The id reference is required because the parallelization may return results out
-        of order.
-        """
-
-    @abstractmethod
-    async def retrieve(self, query: str, top_k: int = 10) -> list[VectorSearchResponse]:
-        """Query the embedding model."""
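The interface decouples indexing from retrieval, so any backend only has to return snippet ids with scores. A minimal in-memory implementation sketch against this removed interface (brute-force cosine similarity; the class name is hypothetical, the other names come from the deleted modules above):

import math

class InMemoryVectorSearchService(VectorSearchService):
    """Toy backend: brute-force cosine similarity over stored vectors."""

    def __init__(self, provider: EmbeddingProvider) -> None:
        self.provider = provider
        self.store: dict[int, Vector] = {}

    async def index(self, data: list[VectorSearchRequest]) -> None:
        vectors = await self.provider.embed([r.text for r in data])
        for req, vec in zip(data, vectors, strict=False):
            self.store[req.snippet_id] = vec

    async def retrieve(self, query: str, top_k: int = 10) -> list[VectorSearchResponse]:
        q = (await self.provider.embed([query]))[0]

        def cos(a: Vector, b: Vector) -> float:
            dot = sum(x * y for x, y in zip(a, b))
            na = math.sqrt(sum(x * x for x in a)) or 1.0
            nb = math.sqrt(sum(x * x for x in b)) or 1.0
            return dot / (na * nb)

        # Score every stored snippet against the query and keep the top_k.
        scored = sorted(
            ((sid, cos(q, v)) for sid, v in self.store.items()),
            key=lambda t: t[1],
            reverse=True,
        )
        return [VectorSearchResponse(sid, s) for sid, s in scored[:top_k]]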
kodit/enrichment/__init__.py
DELETED
@@ -1 +0,0 @@
-"""Enrichment."""
kodit/enrichment/enrichment_provider/__init__.py
DELETED
@@ -1 +0,0 @@
-"""Enrichment provider."""
kodit/enrichment/enrichment_provider/enrichment_provider.py
DELETED
@@ -1,16 +0,0 @@
-"""Enrichment provider."""
-
-from abc import ABC, abstractmethod
-
-ENRICHMENT_SYSTEM_PROMPT = """
-You are a professional software developer. You will be given a snippet of code.
-Please provide a concise explanation of the code.
-"""
-
-
-class EnrichmentProvider(ABC):
-    """Enrichment provider."""
-
-    @abstractmethod
-    async def enrich(self, data: list[str]) -> list[str]:
-        """Enrich a list of strings."""
kodit/enrichment/enrichment_provider/local_enrichment_provider.py
DELETED
@@ -1,92 +0,0 @@
-"""Local embedding service."""
-
-import os
-
-import structlog
-import tiktoken
-from tqdm import tqdm
-
-from kodit.embedding.embedding_provider.embedding_provider import split_sub_batches
-from kodit.enrichment.enrichment_provider.enrichment_provider import (
-    ENRICHMENT_SYSTEM_PROMPT,
-    EnrichmentProvider,
-)
-
-DEFAULT_ENRICHMENT_MODEL = "Qwen/Qwen3-0.6B"
-DEFAULT_CONTEXT_WINDOW_SIZE = 2048  # Small so it works even on low-powered devices
-
-
-class LocalEnrichmentProvider(EnrichmentProvider):
-    """Local embedder."""
-
-    def __init__(
-        self,
-        model_name: str = DEFAULT_ENRICHMENT_MODEL,
-        context_window: int = DEFAULT_CONTEXT_WINDOW_SIZE,
-    ) -> None:
-        """Initialize the local enrichment provider."""
-        self.log = structlog.get_logger(__name__)
-        self.model_name = model_name
-        self.context_window = context_window
-        self.model = None
-        self.tokenizer = None
-        self.encoding = tiktoken.encoding_for_model("text-embedding-3-small")
-
-    async def enrich(self, data: list[str]) -> list[str]:
-        """Enrich a list of strings."""
-        if not data or len(data) == 0:
-            self.log.warning("Data is empty, skipping enrichment")
-            return []
-
-        from transformers.models.auto.modeling_auto import (
-            AutoModelForCausalLM,
-        )
-        from transformers.models.auto.tokenization_auto import AutoTokenizer
-
-        if self.tokenizer is None:
-            self.tokenizer = AutoTokenizer.from_pretrained(
-                self.model_name, padding_side="left"
-            )
-        if self.model is None:
-            os.environ["TOKENIZERS_PARALLELISM"] = "false"  # Avoid warnings
-            self.model = AutoModelForCausalLM.from_pretrained(
-                self.model_name,
-                torch_dtype="auto",
-                trust_remote_code=True,
-                device_map="auto",
-            )
-
-        # Prepare prompts
-        prompts = [
-            self.tokenizer.apply_chat_template(
-                [
-                    {"role": "system", "content": ENRICHMENT_SYSTEM_PROMPT},
-                    {"role": "user", "content": snippet},
-                ],
-                tokenize=False,
-                add_generation_prompt=True,
-                enable_thinking=False,
-            )
-            for snippet in data
-        ]
-
-        # Batch prompts using split_sub_batches
-        batched_prompts = split_sub_batches(
-            self.encoding, prompts, max_context_window=self.context_window
-        )
-        results = []
-        for batch in tqdm(batched_prompts, leave=False, total=len(batched_prompts)):
-            model_inputs = self.tokenizer(
-                batch, return_tensors="pt", padding=True, truncation=True
-            ).to(self.model.device)
-            generated_ids = self.model.generate(
-                **model_inputs, max_new_tokens=self.context_window
-            )
-            # For each prompt in the batch, decode only the generated part
-            for i, input_ids in enumerate(model_inputs["input_ids"]):
-                output_ids = generated_ids[i][len(input_ids) :].tolist()
-                content = self.tokenizer.decode(
-                    output_ids, skip_special_tokens=True
-                ).strip("\n")
-                results.append(content)
-        return results
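A usage sketch for the removed local provider, to be awaited inside an async context (the snippet is illustrative):

provider = LocalEnrichmentProvider()  # defaults to Qwen/Qwen3-0.6B
explanations = await provider.enrich(["def add(a, b):\n    return a + b"])
# One explanation per input snippet, in input order; the model and
# tokenizer are downloaded lazily on the first call to enrich().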
kodit/enrichment/enrichment_provider/openai_enrichment_provider.py
DELETED
@@ -1,81 +0,0 @@
-"""OpenAI embedding service."""
-
-import asyncio
-
-import structlog
-import tiktoken
-from openai import AsyncOpenAI
-from tqdm import tqdm
-
-from kodit.enrichment.enrichment_provider.enrichment_provider import (
-    ENRICHMENT_SYSTEM_PROMPT,
-    EnrichmentProvider,
-)
-
-OPENAI_NUM_PARALLEL_TASKS = 10
-
-
-class OpenAIEnrichmentProvider(EnrichmentProvider):
-    """OpenAI enrichment provider."""
-
-    def __init__(
-        self,
-        openai_client: AsyncOpenAI,
-        model_name: str = "gpt-4o-mini",
-    ) -> None:
-        """Initialize the OpenAI enrichment provider."""
-        self.log = structlog.get_logger(__name__)
-        self.openai_client = openai_client
-        self.model_name = model_name
-        self.encoding = tiktoken.encoding_for_model("gpt-4o-mini")  # Approximation
-
-    async def enrich(self, data: list[str]) -> list[str]:
-        """Enrich a list of documents."""
-        if not data or len(data) == 0:
-            self.log.warning("Data is empty, skipping enrichment")
-            return []
-
-        # Process batches in parallel with a semaphore to limit concurrent requests
-        sem = asyncio.Semaphore(OPENAI_NUM_PARALLEL_TASKS)
-
-        # Create a list of tuples with a temporary id for each snippet
-        # We need to do this so that we can return the results in the same order as the
-        # input data
-        input_data = [(i, snippet) for i, snippet in enumerate(data)]
-
-        async def process_data(data: tuple[int, str]) -> tuple[int, str]:
-            snippet_id, snippet = data
-            if not snippet:
-                return snippet_id, ""
-            async with sem:
-                try:
-                    response = await self.openai_client.chat.completions.create(
-                        model=self.model_name,
-                        messages=[
-                            {
-                                "role": "system",
-                                "content": ENRICHMENT_SYSTEM_PROMPT,
-                            },
-                            {"role": "user", "content": snippet},
-                        ],
-                    )
-                    return snippet_id, response.choices[0].message.content or ""
-                except Exception as e:
-                    self.log.exception("Error enriching data", error=str(e))
-                    return snippet_id, ""
-
-        # Create tasks for all data
-        tasks = [process_data(snippet) for snippet in input_data]
-
-        # Process all data and yield results as they complete
-        results: list[tuple[int, str]] = []
-        for task in tqdm(
-            asyncio.as_completed(tasks),
-            total=len(tasks),
-            leave=False,
-        ):
-            result = await task
-            results.append(result)
-
-        # Output in the same order as the input data
-        return [result for _, result in sorted(results, key=lambda x: x[0])]
kodit/enrichment/enrichment_service.py
DELETED
@@ -1,33 +0,0 @@
-"""Enrichment service."""
-
-from abc import ABC, abstractmethod
-
-from kodit.enrichment.enrichment_provider.enrichment_provider import EnrichmentProvider
-
-
-class EnrichmentService(ABC):
-    """Enrichment service."""
-
-    @abstractmethod
-    async def enrich(self, data: list[str]) -> list[str]:
-        """Enrich a list of strings."""
-
-
-class NullEnrichmentService(EnrichmentService):
-    """Null enrichment service."""
-
-    async def enrich(self, data: list[str]) -> list[str]:
-        """Enrich a list of strings."""
-        return [""] * len(data)
-
-
-class LLMEnrichmentService(EnrichmentService):
-    """Enrichment service using an LLM."""
-
-    def __init__(self, enrichment_provider: EnrichmentProvider) -> None:
-        """Initialize the enrichment service."""
-        self.enrichment_provider = enrichment_provider
-
-    async def enrich(self, data: list[str]) -> list[str]:
-        """Enrich a list of strings."""
-        return await self.enrichment_provider.enrich(data)
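The removed service layer was a thin strategy wrapper: NullEnrichmentService for indexing without an LLM, LLMEnrichmentService to delegate to any EnrichmentProvider. A wiring sketch, to be awaited inside an async context (values illustrative):

# Indexing without an LLM: one empty enrichment per snippet.
service = NullEnrichmentService()

# Or delegate to a concrete provider:
service = LLMEnrichmentService(LocalEnrichmentProvider())
docs = await service.enrich(["class Foo: ..."])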
kodit/indexing/__init__.py
DELETED
@@ -1 +0,0 @@
-"""Indexing package for managing code indexes and search functionality."""
kodit/indexing/fusion.py
DELETED
@@ -1,67 +0,0 @@
-"""Fusion functions for combining search results."""
-
-from collections import defaultdict
-from dataclasses import dataclass
-
-
-@dataclass
-class FusionResult:
-    """Result of a fusion operation."""
-
-    id: int
-    score: float
-    original_scores: list[float]
-
-
-@dataclass
-class FusionRequest:
-    """Result of a RRF operation."""
-
-    id: int
-    score: float
-
-
-def reciprocal_rank_fusion(
-    rankings: list[list[FusionRequest]], k: float = 60
-) -> list[FusionResult]:
-    """RRF prioritises results that are present in all results.
-
-    Args:
-        rankings: List of rankers, each containing a list of document ids. Top of the
-            list is considered to be the best result.
-        k: Parameter for RRF.
-
-    Returns:
-        Dictionary of ids and their scores.
-
-    """
-    scores = {}
-    for ranker in rankings:
-        for rank in ranker:
-            scores[rank.id] = float(0)
-
-    for ranker in rankings:
-        for i, rank in enumerate(ranker):
-            scores[rank.id] += 1.0 / (k + i)
-
-    # Create a list of tuples of ids and their scores
-    results = [(rank, scores[rank]) for rank in scores]
-
-    # Sort results by score
-    results.sort(key=lambda x: x[1], reverse=True)
-
-    # Create a map of original scores to ids
-    original_scores_to_ids = defaultdict(list)
-    for ranker in rankings:
-        for rank in ranker:
-            original_scores_to_ids[rank.id].append(rank.score)
-
-    # Rebuild a list of final results with their original scores
-    return [
-        FusionResult(
-            id=result[0],
-            score=result[1],
-            original_scores=original_scores_to_ids[result[0]],
-        )
-        for result in results
-    ]
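A worked example of the removed fusion function: a document ranked near the top of both the keyword and vector lists beats one that tops only a single list. (The scores below are illustrative.)

bm25 = [FusionRequest(id=1, score=12.3), FusionRequest(id=2, score=9.1)]
vector = [FusionRequest(id=2, score=0.92), FusionRequest(id=3, score=0.88)]

fused = reciprocal_rank_fusion([bm25, vector], k=60)
# id=2 wins: 1/(60+0) + 1/(60+1) ≈ 0.0331, versus id=1 at 1/60 ≈ 0.0167
# and id=3 at 1/61 ≈ 0.0164. original_scores keeps [9.1, 0.92] for id=2.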