kodit 0.2.4__py3-none-any.whl → 0.2.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of kodit might be problematic. Click here for more details.
- kodit/_version.py +2 -2
- kodit/application/__init__.py +1 -0
- kodit/application/commands/__init__.py +1 -0
- kodit/application/commands/snippet_commands.py +22 -0
- kodit/application/services/__init__.py +1 -0
- kodit/application/services/indexing_application_service.py +363 -0
- kodit/application/services/snippet_application_service.py +143 -0
- kodit/cli.py +105 -82
- kodit/database.py +0 -22
- kodit/domain/__init__.py +1 -0
- kodit/{source/source_models.py → domain/entities.py} +88 -19
- kodit/domain/enums.py +9 -0
- kodit/domain/interfaces.py +27 -0
- kodit/domain/repositories.py +95 -0
- kodit/domain/services/__init__.py +1 -0
- kodit/domain/services/bm25_service.py +124 -0
- kodit/domain/services/embedding_service.py +155 -0
- kodit/domain/services/enrichment_service.py +48 -0
- kodit/domain/services/ignore_service.py +45 -0
- kodit/domain/services/indexing_service.py +203 -0
- kodit/domain/services/snippet_extraction_service.py +89 -0
- kodit/domain/services/source_service.py +83 -0
- kodit/domain/value_objects.py +215 -0
- kodit/infrastructure/__init__.py +1 -0
- kodit/infrastructure/bm25/__init__.py +1 -0
- kodit/infrastructure/bm25/bm25_factory.py +28 -0
- kodit/{bm25/local_bm25.py → infrastructure/bm25/local_bm25_repository.py} +33 -22
- kodit/{bm25/vectorchord_bm25.py → infrastructure/bm25/vectorchord_bm25_repository.py} +40 -35
- kodit/infrastructure/cloning/__init__.py +1 -0
- kodit/infrastructure/cloning/folder/__init__.py +1 -0
- kodit/infrastructure/cloning/folder/factory.py +119 -0
- kodit/infrastructure/cloning/folder/working_copy.py +38 -0
- kodit/infrastructure/cloning/git/__init__.py +1 -0
- kodit/infrastructure/cloning/git/factory.py +133 -0
- kodit/infrastructure/cloning/git/working_copy.py +32 -0
- kodit/infrastructure/cloning/metadata.py +127 -0
- kodit/infrastructure/embedding/__init__.py +1 -0
- kodit/infrastructure/embedding/embedding_factory.py +87 -0
- kodit/infrastructure/embedding/embedding_providers/__init__.py +1 -0
- kodit/infrastructure/embedding/embedding_providers/batching.py +93 -0
- kodit/infrastructure/embedding/embedding_providers/hash_embedding_provider.py +79 -0
- kodit/infrastructure/embedding/embedding_providers/local_embedding_provider.py +129 -0
- kodit/infrastructure/embedding/embedding_providers/openai_embedding_provider.py +113 -0
- kodit/infrastructure/embedding/local_vector_search_repository.py +114 -0
- kodit/{embedding/vectorchord_vector_search_service.py → infrastructure/embedding/vectorchord_vector_search_repository.py} +65 -46
- kodit/infrastructure/enrichment/__init__.py +1 -0
- kodit/{enrichment → infrastructure/enrichment}/enrichment_factory.py +28 -12
- kodit/infrastructure/enrichment/legacy_enrichment_models.py +42 -0
- kodit/{enrichment/enrichment_provider → infrastructure/enrichment}/local_enrichment_provider.py +38 -26
- kodit/infrastructure/enrichment/null_enrichment_provider.py +25 -0
- kodit/infrastructure/enrichment/openai_enrichment_provider.py +89 -0
- kodit/infrastructure/git/__init__.py +1 -0
- kodit/{source/git.py → infrastructure/git/git_utils.py} +10 -2
- kodit/infrastructure/ignore/__init__.py +1 -0
- kodit/{source/ignore.py → infrastructure/ignore/ignore_pattern_provider.py} +23 -6
- kodit/infrastructure/indexing/__init__.py +1 -0
- kodit/infrastructure/indexing/fusion_service.py +55 -0
- kodit/infrastructure/indexing/index_repository.py +296 -0
- kodit/infrastructure/indexing/indexing_factory.py +111 -0
- kodit/infrastructure/snippet_extraction/__init__.py +1 -0
- kodit/infrastructure/snippet_extraction/language_detection_service.py +39 -0
- kodit/infrastructure/snippet_extraction/snippet_extraction_factory.py +95 -0
- kodit/infrastructure/snippet_extraction/snippet_query_provider.py +45 -0
- kodit/{snippets/method_snippets.py → infrastructure/snippet_extraction/tree_sitter_snippet_extractor.py} +123 -61
- kodit/infrastructure/sqlalchemy/__init__.py +1 -0
- kodit/{embedding → infrastructure/sqlalchemy}/embedding_repository.py +40 -24
- kodit/infrastructure/sqlalchemy/file_repository.py +73 -0
- kodit/infrastructure/sqlalchemy/repository.py +121 -0
- kodit/infrastructure/sqlalchemy/snippet_repository.py +75 -0
- kodit/infrastructure/ui/__init__.py +1 -0
- kodit/infrastructure/ui/progress.py +127 -0
- kodit/{util → infrastructure/ui}/spinner.py +19 -4
- kodit/mcp.py +50 -28
- kodit/migrations/env.py +1 -4
- kodit/reporting.py +78 -0
- {kodit-0.2.4.dist-info → kodit-0.2.5.dist-info}/METADATA +1 -1
- kodit-0.2.5.dist-info/RECORD +99 -0
- kodit/bm25/__init__.py +0 -1
- kodit/bm25/keyword_search_factory.py +0 -17
- kodit/bm25/keyword_search_service.py +0 -34
- kodit/embedding/__init__.py +0 -1
- kodit/embedding/embedding_factory.py +0 -69
- kodit/embedding/embedding_models.py +0 -28
- kodit/embedding/embedding_provider/__init__.py +0 -1
- kodit/embedding/embedding_provider/embedding_provider.py +0 -92
- kodit/embedding/embedding_provider/hash_embedding_provider.py +0 -86
- kodit/embedding/embedding_provider/local_embedding_provider.py +0 -96
- kodit/embedding/embedding_provider/openai_embedding_provider.py +0 -73
- kodit/embedding/local_vector_search_service.py +0 -87
- kodit/embedding/vector_search_service.py +0 -55
- kodit/enrichment/__init__.py +0 -1
- kodit/enrichment/enrichment_provider/__init__.py +0 -1
- kodit/enrichment/enrichment_provider/enrichment_provider.py +0 -36
- kodit/enrichment/enrichment_provider/openai_enrichment_provider.py +0 -79
- kodit/enrichment/enrichment_service.py +0 -45
- kodit/indexing/__init__.py +0 -1
- kodit/indexing/fusion.py +0 -67
- kodit/indexing/indexing_models.py +0 -43
- kodit/indexing/indexing_repository.py +0 -216
- kodit/indexing/indexing_service.py +0 -344
- kodit/snippets/__init__.py +0 -1
- kodit/snippets/languages/__init__.py +0 -53
- kodit/snippets/snippets.py +0 -50
- kodit/source/__init__.py +0 -1
- kodit/source/source_factories.py +0 -356
- kodit/source/source_repository.py +0 -169
- kodit/source/source_service.py +0 -150
- kodit/util/__init__.py +0 -1
- kodit-0.2.4.dist-info/RECORD +0 -71
- /kodit/{snippets → infrastructure/snippet_extraction}/languages/csharp.scm +0 -0
- /kodit/{snippets → infrastructure/snippet_extraction}/languages/go.scm +0 -0
- /kodit/{snippets → infrastructure/snippet_extraction}/languages/javascript.scm +0 -0
- /kodit/{snippets → infrastructure/snippet_extraction}/languages/python.scm +0 -0
- /kodit/{snippets → infrastructure/snippet_extraction}/languages/typescript.scm +0 -0
- {kodit-0.2.4.dist-info → kodit-0.2.5.dist-info}/WHEEL +0 -0
- {kodit-0.2.4.dist-info → kodit-0.2.5.dist-info}/entry_points.txt +0 -0
- {kodit-0.2.4.dist-info → kodit-0.2.5.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,92 +0,0 @@
|
|
|
1
|
-
"""Embedding provider."""
|
|
2
|
-
|
|
3
|
-
from abc import ABC, abstractmethod
|
|
4
|
-
from collections.abc import AsyncGenerator
|
|
5
|
-
from dataclasses import dataclass
|
|
6
|
-
|
|
7
|
-
import structlog
|
|
8
|
-
import tiktoken
|
|
9
|
-
|
|
10
|
-
OPENAI_MAX_EMBEDDING_SIZE = 8192
|
|
11
|
-
|
|
12
|
-
Vector = list[float]
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
@dataclass
|
|
16
|
-
class EmbeddingRequest:
|
|
17
|
-
"""Embedding request."""
|
|
18
|
-
|
|
19
|
-
id: int
|
|
20
|
-
text: str
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
@dataclass
|
|
24
|
-
class EmbeddingResponse:
|
|
25
|
-
"""Embedding response."""
|
|
26
|
-
|
|
27
|
-
id: int
|
|
28
|
-
embedding: Vector
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
class EmbeddingProvider(ABC):
|
|
32
|
-
"""Embedding provider."""
|
|
33
|
-
|
|
34
|
-
@abstractmethod
|
|
35
|
-
def embed(
|
|
36
|
-
self, data: list[EmbeddingRequest]
|
|
37
|
-
) -> AsyncGenerator[list[EmbeddingResponse], None]:
|
|
38
|
-
"""Embed a list of strings.
|
|
39
|
-
|
|
40
|
-
The embedding provider is responsible for embedding a list of strings into a
|
|
41
|
-
list of vectors. The embedding provider is responsible for splitting the list of
|
|
42
|
-
strings into smaller sub-batches and embedding them in parallel.
|
|
43
|
-
"""
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
def split_sub_batches(
|
|
47
|
-
encoding: tiktoken.Encoding,
|
|
48
|
-
data: list[EmbeddingRequest],
|
|
49
|
-
max_context_window: int = OPENAI_MAX_EMBEDDING_SIZE,
|
|
50
|
-
) -> list[list[EmbeddingRequest]]:
|
|
51
|
-
"""Split a list of strings into smaller sub-batches."""
|
|
52
|
-
log = structlog.get_logger(__name__)
|
|
53
|
-
result = []
|
|
54
|
-
data_to_process = [s for s in data if s.text.strip()] # Filter out empty strings
|
|
55
|
-
|
|
56
|
-
while data_to_process:
|
|
57
|
-
next_batch = []
|
|
58
|
-
current_tokens = 0
|
|
59
|
-
|
|
60
|
-
while data_to_process:
|
|
61
|
-
next_item = data_to_process[0]
|
|
62
|
-
item_tokens = len(encoding.encode(next_item.text, disallowed_special=()))
|
|
63
|
-
|
|
64
|
-
if item_tokens > max_context_window:
|
|
65
|
-
# Optimise truncation by operating on tokens directly instead of
|
|
66
|
-
# removing one character at a time and repeatedly re-encoding.
|
|
67
|
-
tokens = encoding.encode(next_item.text, disallowed_special=())
|
|
68
|
-
if len(tokens) > max_context_window:
|
|
69
|
-
# Keep only the first *max_context_window* tokens.
|
|
70
|
-
tokens = tokens[:max_context_window]
|
|
71
|
-
# Convert back to text. This requires only one decode call and
|
|
72
|
-
# guarantees that the resulting string fits the token budget.
|
|
73
|
-
next_item.text = encoding.decode(tokens)
|
|
74
|
-
item_tokens = max_context_window # We know the exact size now
|
|
75
|
-
|
|
76
|
-
data_to_process[0] = next_item
|
|
77
|
-
|
|
78
|
-
log.warning(
|
|
79
|
-
"Truncated snippet because it was too long to embed",
|
|
80
|
-
snippet=next_item.text[:100] + "...",
|
|
81
|
-
)
|
|
82
|
-
|
|
83
|
-
if current_tokens + item_tokens > max_context_window:
|
|
84
|
-
break
|
|
85
|
-
|
|
86
|
-
next_batch.append(data_to_process.pop(0))
|
|
87
|
-
current_tokens += item_tokens
|
|
88
|
-
|
|
89
|
-
if next_batch:
|
|
90
|
-
result.append(next_batch)
|
|
91
|
-
|
|
92
|
-
return result
|
|
@@ -1,86 +0,0 @@
|
|
|
1
|
-
"""Hash embedding provider, for use in tests only."""
|
|
2
|
-
|
|
3
|
-
import asyncio
|
|
4
|
-
import hashlib
|
|
5
|
-
import math
|
|
6
|
-
from collections.abc import AsyncGenerator, Generator, Sequence
|
|
7
|
-
|
|
8
|
-
from kodit.embedding.embedding_provider.embedding_provider import (
|
|
9
|
-
EmbeddingProvider,
|
|
10
|
-
EmbeddingRequest,
|
|
11
|
-
EmbeddingResponse,
|
|
12
|
-
Vector,
|
|
13
|
-
)
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
class HashEmbeddingProvider(EmbeddingProvider):
|
|
17
|
-
"""A minimal test-time embedding provider.
|
|
18
|
-
|
|
19
|
-
• Zero third-party dependencies (uses only std-lib)
|
|
20
|
-
• Distinguishes strings by hashing with SHA-256
|
|
21
|
-
• Maps the digest to a fixed-size float vector, then ℓ₂-normalises
|
|
22
|
-
• Splits work into small asynchronous chunks for speed in event loops
|
|
23
|
-
"""
|
|
24
|
-
|
|
25
|
-
def __init__(self, dim: int = 16, batch_size: int = 64) -> None:
|
|
26
|
-
"""Initialize the hash embedding provider."""
|
|
27
|
-
if dim <= 0:
|
|
28
|
-
msg = f"dim must be > 0, got {dim}"
|
|
29
|
-
raise ValueError(msg)
|
|
30
|
-
if batch_size <= 0:
|
|
31
|
-
msg = f"batch_size must be > 0, got {batch_size}"
|
|
32
|
-
raise ValueError(msg)
|
|
33
|
-
self.dim = dim
|
|
34
|
-
self.batch_size = batch_size
|
|
35
|
-
|
|
36
|
-
async def embed(
|
|
37
|
-
self, data: list[EmbeddingRequest]
|
|
38
|
-
) -> AsyncGenerator[list[EmbeddingResponse], None]:
|
|
39
|
-
"""Embed every string in *data*, preserving order.
|
|
40
|
-
|
|
41
|
-
Work is sliced into *batch_size* chunks and scheduled concurrently
|
|
42
|
-
(still CPU-bound, but enough to cooperate with an asyncio loop).
|
|
43
|
-
"""
|
|
44
|
-
if not data:
|
|
45
|
-
yield []
|
|
46
|
-
|
|
47
|
-
async def _embed_chunk(chunk: Sequence[str]) -> list[Vector]:
|
|
48
|
-
return [self._string_to_vector(text) for text in chunk]
|
|
49
|
-
|
|
50
|
-
tasks = [
|
|
51
|
-
asyncio.create_task(_embed_chunk(chunk))
|
|
52
|
-
for chunk in self._chunked([i.text for i in data], self.batch_size)
|
|
53
|
-
]
|
|
54
|
-
|
|
55
|
-
for task in tasks:
|
|
56
|
-
result = await task
|
|
57
|
-
yield [
|
|
58
|
-
EmbeddingResponse(
|
|
59
|
-
id=item.id,
|
|
60
|
-
embedding=embedding,
|
|
61
|
-
)
|
|
62
|
-
for item, embedding in zip(data, result, strict=True)
|
|
63
|
-
]
|
|
64
|
-
|
|
65
|
-
@staticmethod
|
|
66
|
-
def _chunked(seq: Sequence[str], size: int) -> Generator[Sequence[str], None, None]:
|
|
67
|
-
"""Yield successive *size*-sized slices from *seq*."""
|
|
68
|
-
for i in range(0, len(seq), size):
|
|
69
|
-
yield seq[i : i + size]
|
|
70
|
-
|
|
71
|
-
def _string_to_vector(self, text: str) -> Vector:
|
|
72
|
-
"""Deterministically convert *text* to a normalised float vector."""
|
|
73
|
-
digest = hashlib.sha256(text.encode("utf-8")).digest()
|
|
74
|
-
|
|
75
|
-
# Build the vector from 4-byte windows of the digest.
|
|
76
|
-
vec = [
|
|
77
|
-
int.from_bytes(
|
|
78
|
-
digest[(i * 4) % len(digest) : (i * 4) % len(digest) + 4], "big"
|
|
79
|
-
)
|
|
80
|
-
/ 0xFFFFFFFF
|
|
81
|
-
for i in range(self.dim)
|
|
82
|
-
]
|
|
83
|
-
|
|
84
|
-
# ℓ₂-normalise so magnitudes are comparable.
|
|
85
|
-
norm = math.sqrt(sum(x * x for x in vec)) or 1.0
|
|
86
|
-
return [x / norm for x in vec]
|
|
@@ -1,96 +0,0 @@
|
|
|
1
|
-
"""Local embedding service."""
|
|
2
|
-
|
|
3
|
-
from __future__ import annotations
|
|
4
|
-
|
|
5
|
-
import os
|
|
6
|
-
from time import time
|
|
7
|
-
from typing import TYPE_CHECKING
|
|
8
|
-
|
|
9
|
-
import structlog
|
|
10
|
-
|
|
11
|
-
from kodit.embedding.embedding_provider.embedding_provider import (
|
|
12
|
-
EmbeddingProvider,
|
|
13
|
-
EmbeddingRequest,
|
|
14
|
-
EmbeddingResponse,
|
|
15
|
-
split_sub_batches,
|
|
16
|
-
)
|
|
17
|
-
|
|
18
|
-
if TYPE_CHECKING:
|
|
19
|
-
from collections.abc import AsyncGenerator
|
|
20
|
-
|
|
21
|
-
from sentence_transformers import SentenceTransformer
|
|
22
|
-
from tiktoken import Encoding
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
TINY = "tiny"
|
|
26
|
-
CODE = "code"
|
|
27
|
-
TEST = "test"
|
|
28
|
-
|
|
29
|
-
COMMON_EMBEDDING_MODELS = {
|
|
30
|
-
TINY: "ibm-granite/granite-embedding-30m-english",
|
|
31
|
-
CODE: "flax-sentence-embeddings/st-codesearch-distilroberta-base",
|
|
32
|
-
TEST: "minishlab/potion-base-4M",
|
|
33
|
-
}
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
class LocalEmbeddingProvider(EmbeddingProvider):
|
|
37
|
-
"""Local embedder."""
|
|
38
|
-
|
|
39
|
-
def __init__(self, model_name: str) -> None:
|
|
40
|
-
"""Initialize the local embedder."""
|
|
41
|
-
self.log = structlog.get_logger(__name__)
|
|
42
|
-
self.model_name = COMMON_EMBEDDING_MODELS.get(model_name, model_name)
|
|
43
|
-
self.encoding_name = "text-embedding-3-small"
|
|
44
|
-
self.embedding_model = None
|
|
45
|
-
self.encoding = None
|
|
46
|
-
|
|
47
|
-
def _encoding(self) -> Encoding:
|
|
48
|
-
if self.encoding is None:
|
|
49
|
-
from tiktoken import encoding_for_model
|
|
50
|
-
|
|
51
|
-
start_time = time()
|
|
52
|
-
self.encoding = encoding_for_model(self.encoding_name)
|
|
53
|
-
self.log.debug(
|
|
54
|
-
"Encoding loaded",
|
|
55
|
-
model_name=self.encoding_name,
|
|
56
|
-
duration=time() - start_time,
|
|
57
|
-
)
|
|
58
|
-
return self.encoding
|
|
59
|
-
|
|
60
|
-
def _model(self) -> SentenceTransformer:
|
|
61
|
-
"""Get the embedding model."""
|
|
62
|
-
if self.embedding_model is None:
|
|
63
|
-
os.environ["TOKENIZERS_PARALLELISM"] = "false" # Avoid warnings
|
|
64
|
-
from sentence_transformers import SentenceTransformer
|
|
65
|
-
|
|
66
|
-
start_time = time()
|
|
67
|
-
self.embedding_model = SentenceTransformer(
|
|
68
|
-
self.model_name,
|
|
69
|
-
trust_remote_code=True,
|
|
70
|
-
)
|
|
71
|
-
self.log.debug(
|
|
72
|
-
"Model loaded",
|
|
73
|
-
model_name=self.model_name,
|
|
74
|
-
duration=time() - start_time,
|
|
75
|
-
)
|
|
76
|
-
return self.embedding_model
|
|
77
|
-
|
|
78
|
-
async def embed(
|
|
79
|
-
self, data: list[EmbeddingRequest]
|
|
80
|
-
) -> AsyncGenerator[list[EmbeddingResponse], None]:
|
|
81
|
-
"""Embed a list of strings."""
|
|
82
|
-
model = self._model()
|
|
83
|
-
|
|
84
|
-
batched_data = split_sub_batches(self._encoding(), data)
|
|
85
|
-
|
|
86
|
-
for batch in batched_data:
|
|
87
|
-
embeddings = model.encode(
|
|
88
|
-
[i.text for i in batch], show_progress_bar=False, batch_size=4
|
|
89
|
-
)
|
|
90
|
-
yield [
|
|
91
|
-
EmbeddingResponse(
|
|
92
|
-
id=item.id,
|
|
93
|
-
embedding=[float(x) for x in embedding],
|
|
94
|
-
)
|
|
95
|
-
for item, embedding in zip(batch, embeddings, strict=True)
|
|
96
|
-
]
|
|
@@ -1,73 +0,0 @@
|
|
|
1
|
-
"""OpenAI embedding service."""
|
|
2
|
-
|
|
3
|
-
import asyncio
|
|
4
|
-
from collections.abc import AsyncGenerator
|
|
5
|
-
|
|
6
|
-
import structlog
|
|
7
|
-
import tiktoken
|
|
8
|
-
from openai import AsyncOpenAI
|
|
9
|
-
|
|
10
|
-
from kodit.embedding.embedding_provider.embedding_provider import (
|
|
11
|
-
EmbeddingProvider,
|
|
12
|
-
EmbeddingRequest,
|
|
13
|
-
EmbeddingResponse,
|
|
14
|
-
split_sub_batches,
|
|
15
|
-
)
|
|
16
|
-
|
|
17
|
-
OPENAI_NUM_PARALLEL_TASKS = 10
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
class OpenAIEmbeddingProvider(EmbeddingProvider):
|
|
21
|
-
"""OpenAI embedder."""
|
|
22
|
-
|
|
23
|
-
def __init__(
|
|
24
|
-
self,
|
|
25
|
-
openai_client: AsyncOpenAI,
|
|
26
|
-
model_name: str = "text-embedding-3-small",
|
|
27
|
-
) -> None:
|
|
28
|
-
"""Initialize the OpenAI embedder."""
|
|
29
|
-
self.log = structlog.get_logger(__name__)
|
|
30
|
-
self.openai_client = openai_client
|
|
31
|
-
self.model_name = model_name
|
|
32
|
-
self.encoding = tiktoken.encoding_for_model(
|
|
33
|
-
"text-embedding-3-small"
|
|
34
|
-
) # Sensible default
|
|
35
|
-
|
|
36
|
-
async def embed(
|
|
37
|
-
self, data: list[EmbeddingRequest]
|
|
38
|
-
) -> AsyncGenerator[list[EmbeddingResponse], None]:
|
|
39
|
-
"""Embed a list of documents."""
|
|
40
|
-
# First split the list into a list of list where each sublist has fewer than
|
|
41
|
-
# max tokens.
|
|
42
|
-
batched_data = split_sub_batches(self.encoding, data)
|
|
43
|
-
|
|
44
|
-
# Process batches in parallel with a semaphore to limit concurrent requests
|
|
45
|
-
sem = asyncio.Semaphore(OPENAI_NUM_PARALLEL_TASKS)
|
|
46
|
-
|
|
47
|
-
async def process_batch(
|
|
48
|
-
data: list[EmbeddingRequest],
|
|
49
|
-
) -> list[EmbeddingResponse]:
|
|
50
|
-
async with sem:
|
|
51
|
-
try:
|
|
52
|
-
response = await self.openai_client.embeddings.create(
|
|
53
|
-
model=self.model_name,
|
|
54
|
-
input=[i.text for i in data],
|
|
55
|
-
)
|
|
56
|
-
return [
|
|
57
|
-
EmbeddingResponse(
|
|
58
|
-
id=item.id,
|
|
59
|
-
embedding=embedding.embedding,
|
|
60
|
-
)
|
|
61
|
-
for item, embedding in zip(data, response.data, strict=True)
|
|
62
|
-
]
|
|
63
|
-
except Exception as e:
|
|
64
|
-
self.log.exception("Error embedding batch", error=str(e))
|
|
65
|
-
return []
|
|
66
|
-
|
|
67
|
-
# Create tasks for all batches
|
|
68
|
-
tasks = [process_batch(batch) for batch in batched_data]
|
|
69
|
-
|
|
70
|
-
# Process all batches and yield results as they complete
|
|
71
|
-
for task in asyncio.as_completed(tasks):
|
|
72
|
-
result = await task
|
|
73
|
-
yield result
|
|
@@ -1,87 +0,0 @@
|
|
|
1
|
-
"""Local vector search."""
|
|
2
|
-
|
|
3
|
-
from collections.abc import AsyncGenerator
|
|
4
|
-
|
|
5
|
-
import structlog
|
|
6
|
-
import tiktoken
|
|
7
|
-
|
|
8
|
-
from kodit.embedding.embedding_models import Embedding, EmbeddingType
|
|
9
|
-
from kodit.embedding.embedding_provider.embedding_provider import (
|
|
10
|
-
EmbeddingProvider,
|
|
11
|
-
EmbeddingRequest,
|
|
12
|
-
)
|
|
13
|
-
from kodit.embedding.embedding_repository import EmbeddingRepository
|
|
14
|
-
from kodit.embedding.vector_search_service import (
|
|
15
|
-
IndexResult,
|
|
16
|
-
VectorSearchRequest,
|
|
17
|
-
VectorSearchResponse,
|
|
18
|
-
VectorSearchService,
|
|
19
|
-
)
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
class LocalVectorSearchService(VectorSearchService):
|
|
23
|
-
"""Local vector search."""
|
|
24
|
-
|
|
25
|
-
def __init__(
|
|
26
|
-
self,
|
|
27
|
-
embedding_repository: EmbeddingRepository,
|
|
28
|
-
embedding_provider: EmbeddingProvider,
|
|
29
|
-
embedding_type: EmbeddingType = EmbeddingType.CODE,
|
|
30
|
-
) -> None:
|
|
31
|
-
"""Initialize the local embedder."""
|
|
32
|
-
self.log = structlog.get_logger(__name__)
|
|
33
|
-
self.embedding_repository = embedding_repository
|
|
34
|
-
self.embedding_provider = embedding_provider
|
|
35
|
-
self.encoding = tiktoken.encoding_for_model("text-embedding-3-small")
|
|
36
|
-
self.embedding_type = embedding_type
|
|
37
|
-
|
|
38
|
-
async def index(
|
|
39
|
-
self, data: list[VectorSearchRequest]
|
|
40
|
-
) -> AsyncGenerator[list[IndexResult], None]:
|
|
41
|
-
"""Embed a list of documents."""
|
|
42
|
-
if not data or len(data) == 0:
|
|
43
|
-
return
|
|
44
|
-
|
|
45
|
-
requests = [EmbeddingRequest(id=doc.snippet_id, text=doc.text) for doc in data]
|
|
46
|
-
|
|
47
|
-
async for batch in self.embedding_provider.embed(requests):
|
|
48
|
-
for result in batch:
|
|
49
|
-
await self.embedding_repository.create_embedding(
|
|
50
|
-
Embedding(
|
|
51
|
-
snippet_id=result.id,
|
|
52
|
-
embedding=result.embedding,
|
|
53
|
-
type=self.embedding_type,
|
|
54
|
-
)
|
|
55
|
-
)
|
|
56
|
-
yield [IndexResult(snippet_id=result.id)]
|
|
57
|
-
|
|
58
|
-
async def retrieve(self, query: str, top_k: int = 10) -> list[VectorSearchResponse]:
|
|
59
|
-
"""Query the embedding model."""
|
|
60
|
-
# Build a single-item request and collect its embedding.
|
|
61
|
-
req = EmbeddingRequest(id=0, text=query)
|
|
62
|
-
embedding_vec: list[float] | None = None
|
|
63
|
-
async for batch in self.embedding_provider.embed([req]):
|
|
64
|
-
if batch:
|
|
65
|
-
embedding_vec = [float(v) for v in batch[0].embedding]
|
|
66
|
-
break
|
|
67
|
-
|
|
68
|
-
if not embedding_vec:
|
|
69
|
-
return []
|
|
70
|
-
|
|
71
|
-
results = await self.embedding_repository.list_semantic_results(
|
|
72
|
-
self.embedding_type, embedding_vec, top_k
|
|
73
|
-
)
|
|
74
|
-
return [
|
|
75
|
-
VectorSearchResponse(snippet_id, score) for snippet_id, score in results
|
|
76
|
-
]
|
|
77
|
-
|
|
78
|
-
async def has_embedding(
|
|
79
|
-
self, snippet_id: int, embedding_type: EmbeddingType
|
|
80
|
-
) -> bool:
|
|
81
|
-
"""Check if a snippet has an embedding."""
|
|
82
|
-
return (
|
|
83
|
-
await self.embedding_repository.get_embedding_by_snippet_id_and_type(
|
|
84
|
-
snippet_id, embedding_type
|
|
85
|
-
)
|
|
86
|
-
is not None
|
|
87
|
-
)
|
|
@@ -1,55 +0,0 @@
|
|
|
1
|
-
"""Embedding service."""
|
|
2
|
-
|
|
3
|
-
from abc import ABC, abstractmethod
|
|
4
|
-
from collections.abc import AsyncGenerator
|
|
5
|
-
from typing import NamedTuple
|
|
6
|
-
|
|
7
|
-
from kodit.embedding.embedding_models import EmbeddingType
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
class VectorSearchResponse(NamedTuple):
|
|
11
|
-
"""Embedding result."""
|
|
12
|
-
|
|
13
|
-
snippet_id: int
|
|
14
|
-
score: float
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
class VectorSearchRequest(NamedTuple):
|
|
18
|
-
"""Input for embedding."""
|
|
19
|
-
|
|
20
|
-
snippet_id: int
|
|
21
|
-
text: str
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
class IndexResult(NamedTuple):
|
|
25
|
-
"""Result of indexing."""
|
|
26
|
-
|
|
27
|
-
snippet_id: int
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
class VectorSearchService(ABC):
|
|
31
|
-
"""Semantic search service interface."""
|
|
32
|
-
|
|
33
|
-
@abstractmethod
|
|
34
|
-
def index(
|
|
35
|
-
self, data: list[VectorSearchRequest]
|
|
36
|
-
) -> AsyncGenerator[list[IndexResult], None]:
|
|
37
|
-
"""Embed a list of documents.
|
|
38
|
-
|
|
39
|
-
The embedding service accepts a massive list of id,strings to embed. Behind the
|
|
40
|
-
scenes it batches up requests and parallelizes them for performance according to
|
|
41
|
-
the specifics of the embedding service.
|
|
42
|
-
|
|
43
|
-
The id reference is required because the parallelization may return results out
|
|
44
|
-
of order.
|
|
45
|
-
"""
|
|
46
|
-
|
|
47
|
-
@abstractmethod
|
|
48
|
-
async def retrieve(self, query: str, top_k: int = 10) -> list[VectorSearchResponse]:
|
|
49
|
-
"""Query the embedding model."""
|
|
50
|
-
|
|
51
|
-
@abstractmethod
|
|
52
|
-
async def has_embedding(
|
|
53
|
-
self, snippet_id: int, embedding_type: EmbeddingType
|
|
54
|
-
) -> bool:
|
|
55
|
-
"""Check if a snippet has an embedding."""
|
kodit/enrichment/__init__.py
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
"""Enrichment."""
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
"""Enrichment provider."""
|
|
@@ -1,36 +0,0 @@
|
|
|
1
|
-
"""Enrichment provider."""
|
|
2
|
-
|
|
3
|
-
from abc import ABC, abstractmethod
|
|
4
|
-
from collections.abc import AsyncGenerator
|
|
5
|
-
from dataclasses import dataclass
|
|
6
|
-
|
|
7
|
-
ENRICHMENT_SYSTEM_PROMPT = """
|
|
8
|
-
You are a professional software developer. You will be given a snippet of code.
|
|
9
|
-
Please provide a concise explanation of the code.
|
|
10
|
-
"""
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
@dataclass
|
|
14
|
-
class EnrichmentRequest:
|
|
15
|
-
"""Enrichment request."""
|
|
16
|
-
|
|
17
|
-
snippet_id: int
|
|
18
|
-
text: str
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
@dataclass
|
|
22
|
-
class EnrichmentResponse:
|
|
23
|
-
"""Enrichment response."""
|
|
24
|
-
|
|
25
|
-
snippet_id: int
|
|
26
|
-
text: str
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
class EnrichmentProvider(ABC):
|
|
30
|
-
"""Enrichment provider."""
|
|
31
|
-
|
|
32
|
-
@abstractmethod
|
|
33
|
-
def enrich(
|
|
34
|
-
self, data: list[EnrichmentRequest]
|
|
35
|
-
) -> AsyncGenerator[EnrichmentResponse, None]:
|
|
36
|
-
"""Enrich a list of strings."""
|
|
@@ -1,79 +0,0 @@
|
|
|
1
|
-
"""OpenAI embedding service."""
|
|
2
|
-
|
|
3
|
-
import asyncio
|
|
4
|
-
from collections.abc import AsyncGenerator
|
|
5
|
-
|
|
6
|
-
import structlog
|
|
7
|
-
import tiktoken
|
|
8
|
-
from openai import AsyncOpenAI
|
|
9
|
-
|
|
10
|
-
from kodit.enrichment.enrichment_provider.enrichment_provider import (
|
|
11
|
-
ENRICHMENT_SYSTEM_PROMPT,
|
|
12
|
-
EnrichmentProvider,
|
|
13
|
-
EnrichmentRequest,
|
|
14
|
-
EnrichmentResponse,
|
|
15
|
-
)
|
|
16
|
-
|
|
17
|
-
OPENAI_NUM_PARALLEL_TASKS = 10
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
class OpenAIEnrichmentProvider(EnrichmentProvider):
|
|
21
|
-
"""OpenAI enrichment provider."""
|
|
22
|
-
|
|
23
|
-
def __init__(
|
|
24
|
-
self,
|
|
25
|
-
openai_client: AsyncOpenAI,
|
|
26
|
-
model_name: str = "gpt-4o-mini",
|
|
27
|
-
) -> None:
|
|
28
|
-
"""Initialize the OpenAI enrichment provider."""
|
|
29
|
-
self.log = structlog.get_logger(__name__)
|
|
30
|
-
self.openai_client = openai_client
|
|
31
|
-
self.model_name = model_name
|
|
32
|
-
self.encoding = tiktoken.encoding_for_model("gpt-4o-mini") # Approximation
|
|
33
|
-
|
|
34
|
-
async def enrich(
|
|
35
|
-
self, data: list[EnrichmentRequest]
|
|
36
|
-
) -> AsyncGenerator[EnrichmentResponse, None]:
|
|
37
|
-
"""Enrich a list of documents."""
|
|
38
|
-
if not data or len(data) == 0:
|
|
39
|
-
self.log.warning("Data is empty, skipping enrichment")
|
|
40
|
-
return
|
|
41
|
-
|
|
42
|
-
# Process batches in parallel with a semaphore to limit concurrent requests
|
|
43
|
-
sem = asyncio.Semaphore(OPENAI_NUM_PARALLEL_TASKS)
|
|
44
|
-
|
|
45
|
-
async def process_data(data: EnrichmentRequest) -> EnrichmentResponse:
|
|
46
|
-
async with sem:
|
|
47
|
-
if not data.text:
|
|
48
|
-
return EnrichmentResponse(
|
|
49
|
-
snippet_id=data.snippet_id,
|
|
50
|
-
text="",
|
|
51
|
-
)
|
|
52
|
-
try:
|
|
53
|
-
response = await self.openai_client.chat.completions.create(
|
|
54
|
-
model=self.model_name,
|
|
55
|
-
messages=[
|
|
56
|
-
{
|
|
57
|
-
"role": "system",
|
|
58
|
-
"content": ENRICHMENT_SYSTEM_PROMPT,
|
|
59
|
-
},
|
|
60
|
-
{"role": "user", "content": data.text},
|
|
61
|
-
],
|
|
62
|
-
)
|
|
63
|
-
return EnrichmentResponse(
|
|
64
|
-
snippet_id=data.snippet_id,
|
|
65
|
-
text=response.choices[0].message.content or "",
|
|
66
|
-
)
|
|
67
|
-
except Exception as e:
|
|
68
|
-
self.log.exception("Error enriching data", error=str(e))
|
|
69
|
-
return EnrichmentResponse(
|
|
70
|
-
snippet_id=data.snippet_id,
|
|
71
|
-
text="",
|
|
72
|
-
)
|
|
73
|
-
|
|
74
|
-
# Create tasks for all data
|
|
75
|
-
tasks = [process_data(snippet) for snippet in data]
|
|
76
|
-
|
|
77
|
-
# Process all data and yield results as they complete
|
|
78
|
-
for task in asyncio.as_completed(tasks):
|
|
79
|
-
yield await task
|
|
@@ -1,45 +0,0 @@
|
|
|
1
|
-
"""Enrichment service."""
|
|
2
|
-
|
|
3
|
-
from abc import ABC, abstractmethod
|
|
4
|
-
from collections.abc import AsyncGenerator
|
|
5
|
-
|
|
6
|
-
from kodit.enrichment.enrichment_provider.enrichment_provider import (
|
|
7
|
-
EnrichmentProvider,
|
|
8
|
-
EnrichmentRequest,
|
|
9
|
-
EnrichmentResponse,
|
|
10
|
-
)
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
class EnrichmentService(ABC):
|
|
14
|
-
"""Enrichment service."""
|
|
15
|
-
|
|
16
|
-
@abstractmethod
|
|
17
|
-
def enrich(
|
|
18
|
-
self, data: list[EnrichmentRequest]
|
|
19
|
-
) -> AsyncGenerator[EnrichmentResponse, None]:
|
|
20
|
-
"""Enrich a list of strings."""
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
class NullEnrichmentService(EnrichmentService):
|
|
24
|
-
"""Null enrichment service."""
|
|
25
|
-
|
|
26
|
-
async def enrich(
|
|
27
|
-
self, data: list[EnrichmentRequest]
|
|
28
|
-
) -> AsyncGenerator[EnrichmentResponse, None]:
|
|
29
|
-
"""Enrich a list of strings."""
|
|
30
|
-
for request in data:
|
|
31
|
-
yield EnrichmentResponse(snippet_id=request.snippet_id, text="")
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
class LLMEnrichmentService(EnrichmentService):
|
|
35
|
-
"""Enrichment service using an LLM."""
|
|
36
|
-
|
|
37
|
-
def __init__(self, enrichment_provider: EnrichmentProvider) -> None:
|
|
38
|
-
"""Initialize the enrichment service."""
|
|
39
|
-
self.enrichment_provider = enrichment_provider
|
|
40
|
-
|
|
41
|
-
def enrich(
|
|
42
|
-
self, data: list[EnrichmentRequest]
|
|
43
|
-
) -> AsyncGenerator[EnrichmentResponse, None]:
|
|
44
|
-
"""Enrich a list of snippets."""
|
|
45
|
-
return self.enrichment_provider.enrich(data)
|
kodit/indexing/__init__.py
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
"""Indexing package for managing code indexes and search functionality."""
|