kodit-0.2.3-py3-none-any.whl → kodit-0.2.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of kodit might be problematic.
- kodit/_version.py +2 -2
- kodit/embedding/embedding_factory.py +6 -0
- kodit/embedding/embedding_provider/embedding_provider.py +42 -14
- kodit/embedding/embedding_provider/hash_embedding_provider.py +16 -7
- kodit/embedding/embedding_provider/local_embedding_provider.py +43 -11
- kodit/embedding/embedding_provider/openai_embedding_provider.py +18 -22
- kodit/embedding/local_vector_search_service.py +46 -13
- kodit/embedding/vector_search_service.py +18 -1
- kodit/embedding/vectorchord_vector_search_service.py +63 -16
- kodit/enrichment/enrichment_provider/enrichment_provider.py +21 -1
- kodit/enrichment/enrichment_provider/local_enrichment_provider.py +39 -28
- kodit/enrichment/enrichment_provider/openai_enrichment_provider.py +25 -27
- kodit/enrichment/enrichment_service.py +19 -7
- kodit/indexing/indexing_service.py +26 -20
- {kodit-0.2.3.dist-info → kodit-0.2.4.dist-info}/METADATA +1 -1
- {kodit-0.2.3.dist-info → kodit-0.2.4.dist-info}/RECORD +19 -19
- {kodit-0.2.3.dist-info → kodit-0.2.4.dist-info}/WHEEL +0 -0
- {kodit-0.2.3.dist-info → kodit-0.2.4.dist-info}/entry_points.txt +0 -0
- {kodit-0.2.3.dist-info → kodit-0.2.4.dist-info}/licenses/LICENSE +0 -0
kodit/_version.py
CHANGED
kodit/embedding/embedding_factory.py
CHANGED
@@ -3,6 +3,7 @@
 from sqlalchemy.ext.asyncio import AsyncSession

 from kodit.config import AppContext, Endpoint
+from kodit.embedding.embedding_models import EmbeddingType
 from kodit.embedding.embedding_provider.local_embedding_provider import (
     CODE,
     LocalEmbeddingProvider,
@@ -54,9 +55,14 @@ def embedding_factory(
         return VectorChordVectorSearchService(task_name, session, embedding_provider)
     if app_context.default_search.provider == "sqlite":
         log_event("kodit.database", {"provider": "sqlite"})
+        if task_name == "code":
+            embedding_type = EmbeddingType.CODE
+        elif task_name == "text":
+            embedding_type = EmbeddingType.TEXT
         return LocalVectorSearchService(
             embedding_repository=embedding_repository,
             embedding_provider=embedding_provider,
+            embedding_type=embedding_type,
         )

     msg = f"Invalid semantic search provider: {app_context.default_search.provider}"
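The notable change in the factory hunk above is that the SQLite path now resolves an EmbeddingType from the task name before constructing LocalVectorSearchService. A standalone sketch of the equivalent mapping; the ValueError fallback for unknown task names is illustrative and not something the factory itself adds:

from kodit.embedding.embedding_models import EmbeddingType


def embedding_type_for(task_name: str) -> EmbeddingType:
    # Mirrors the factory's mapping; the error branch is hypothetical.
    if task_name == "code":
        return EmbeddingType.CODE
    if task_name == "text":
        return EmbeddingType.TEXT
    msg = f"Unknown task name: {task_name}"
    raise ValueError(msg)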
kodit/embedding/embedding_provider/embedding_provider.py
CHANGED
@@ -1,6 +1,8 @@
 """Embedding provider."""

 from abc import ABC, abstractmethod
+from collections.abc import AsyncGenerator
+from dataclasses import dataclass

 import structlog
 import tiktoken
@@ -10,11 +12,29 @@ OPENAI_MAX_EMBEDDING_SIZE = 8192
 Vector = list[float]


+@dataclass
+class EmbeddingRequest:
+    """Embedding request."""
+
+    id: int
+    text: str
+
+
+@dataclass
+class EmbeddingResponse:
+    """Embedding response."""
+
+    id: int
+    embedding: Vector
+
+
 class EmbeddingProvider(ABC):
     """Embedding provider."""

     @abstractmethod
-
+    def embed(
+        self, data: list[EmbeddingRequest]
+    ) -> AsyncGenerator[list[EmbeddingResponse], None]:
         """Embed a list of strings.

         The embedding provider is responsible for embedding a list of strings into a
@@ -25,13 +45,13 @@ class EmbeddingProvider(ABC):

 def split_sub_batches(
     encoding: tiktoken.Encoding,
-    data: list[
+    data: list[EmbeddingRequest],
     max_context_window: int = OPENAI_MAX_EMBEDDING_SIZE,
-) -> list[list[
+) -> list[list[EmbeddingRequest]]:
     """Split a list of strings into smaller sub-batches."""
     log = structlog.get_logger(__name__)
     result = []
-    data_to_process = [s for s in data if s.strip()]  # Filter out empty strings
+    data_to_process = [s for s in data if s.text.strip()]  # Filter out empty strings

     while data_to_process:
         next_batch = []
@@ -39,18 +59,26 @@ def split_sub_batches(

         while data_to_process:
             next_item = data_to_process[0]
-            item_tokens = len(encoding.encode(next_item, disallowed_special=()))
+            item_tokens = len(encoding.encode(next_item.text, disallowed_special=()))

             if item_tokens > max_context_window:
-                #
-                #
-
-
-
-
-
-
+                # Optimise truncation by operating on tokens directly instead of
+                # removing one character at a time and repeatedly re-encoding.
+                tokens = encoding.encode(next_item.text, disallowed_special=())
+                if len(tokens) > max_context_window:
+                    # Keep only the first *max_context_window* tokens.
+                    tokens = tokens[:max_context_window]
+                    # Convert back to text. This requires only one decode call and
+                    # guarantees that the resulting string fits the token budget.
+                    next_item.text = encoding.decode(tokens)
+                    item_tokens = max_context_window  # We know the exact size now
+
+                data_to_process[0] = next_item
+
+                log.warning(
+                    "Truncated snippet because it was too long to embed",
+                    snippet=next_item.text[:100] + "...",
+                )

             if current_tokens + item_tokens > max_context_window:
                 break
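The truncation rewrite above replaces a character-by-character shrink loop with a single encode/slice/decode pass. A standalone sketch of the idea, assuming tiktoken's cl100k_base encoding (the package derives its encoding from a model name instead):

import tiktoken

encoding = tiktoken.get_encoding("cl100k_base")
text = "some very long snippet " * 4000
tokens = encoding.encode(text, disallowed_special=())
if len(tokens) > 8192:              # OPENAI_MAX_EMBEDDING_SIZE in the package
    tokens = tokens[:8192]          # keep only the first 8192 tokens
    text = encoding.decode(tokens)  # one decode call; fits the token budget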
kodit/embedding/embedding_provider/hash_embedding_provider.py
CHANGED
@@ -3,10 +3,12 @@
 import asyncio
 import hashlib
 import math
-from collections.abc import Generator, Sequence
+from collections.abc import AsyncGenerator, Generator, Sequence

 from kodit.embedding.embedding_provider.embedding_provider import (
     EmbeddingProvider,
+    EmbeddingRequest,
+    EmbeddingResponse,
     Vector,
 )

@@ -31,27 +33,34 @@ class HashEmbeddingProvider(EmbeddingProvider):
         self.dim = dim
         self.batch_size = batch_size

-    async def embed(
+    async def embed(
+        self, data: list[EmbeddingRequest]
+    ) -> AsyncGenerator[list[EmbeddingResponse], None]:
         """Embed every string in *data*, preserving order.

         Work is sliced into *batch_size* chunks and scheduled concurrently
         (still CPU-bound, but enough to cooperate with an asyncio loop).
         """
         if not data:
-
+            yield []

         async def _embed_chunk(chunk: Sequence[str]) -> list[Vector]:
             return [self._string_to_vector(text) for text in chunk]

         tasks = [
             asyncio.create_task(_embed_chunk(chunk))
-            for chunk in self._chunked(data, self.batch_size)
+            for chunk in self._chunked([i.text for i in data], self.batch_size)
         ]

-        vectors: list[Vector] = []
         for task in tasks:
-
-
+            result = await task
+            yield [
+                EmbeddingResponse(
+                    id=item.id,
+                    embedding=embedding,
+                )
+                for item, embedding in zip(data, result, strict=True)
+            ]

     @staticmethod
     def _chunked(seq: Sequence[str], size: int) -> Generator[Sequence[str], None, None]:
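Because embed() is now an async generator of id-tagged batches, callers match results by id rather than by position. A consumption sketch; it assumes HashEmbeddingProvider is constructible with default arguments, which this diff does not show:

import asyncio

from kodit.embedding.embedding_provider.embedding_provider import EmbeddingRequest
from kodit.embedding.embedding_provider.hash_embedding_provider import (
    HashEmbeddingProvider,
)


async def main() -> None:
    provider = HashEmbeddingProvider()  # assumed default constructor
    requests = [
        EmbeddingRequest(id=1, text="def add(a, b): return a + b"),
        EmbeddingRequest(id=2, text="def sub(a, b): return a - b"),
    ]
    # Results stream in batches; each response carries the request id.
    async for batch in provider.embed(requests):
        for response in batch:
            print(response.id, len(response.embedding))


asyncio.run(main())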
kodit/embedding/embedding_provider/local_embedding_provider.py
CHANGED
@@ -3,20 +3,24 @@
 from __future__ import annotations

 import os
+from time import time
 from typing import TYPE_CHECKING

 import structlog
-import tiktoken
-from tqdm import tqdm

 from kodit.embedding.embedding_provider.embedding_provider import (
     EmbeddingProvider,
-
+    EmbeddingRequest,
+    EmbeddingResponse,
     split_sub_batches,
 )

 if TYPE_CHECKING:
+    from collections.abc import AsyncGenerator
+
     from sentence_transformers import SentenceTransformer
+    from tiktoken import Encoding
+

 TINY = "tiny"
 CODE = "code"
@@ -36,8 +40,22 @@ class LocalEmbeddingProvider(EmbeddingProvider):
         """Initialize the local embedder."""
         self.log = structlog.get_logger(__name__)
         self.model_name = COMMON_EMBEDDING_MODELS.get(model_name, model_name)
+        self.encoding_name = "text-embedding-3-small"
         self.embedding_model = None
-        self.encoding =
+        self.encoding = None
+
+    def _encoding(self) -> Encoding:
+        if self.encoding is None:
+            from tiktoken import encoding_for_model
+
+            start_time = time()
+            self.encoding = encoding_for_model(self.encoding_name)
+            self.log.debug(
+                "Encoding loaded",
+                model_name=self.encoding_name,
+                duration=time() - start_time,
+            )
+        return self.encoding

     def _model(self) -> SentenceTransformer:
         """Get the embedding model."""
@@ -45,20 +63,34 @@ class LocalEmbeddingProvider(EmbeddingProvider):
         os.environ["TOKENIZERS_PARALLELISM"] = "false"  # Avoid warnings
         from sentence_transformers import SentenceTransformer

+        start_time = time()
         self.embedding_model = SentenceTransformer(
             self.model_name,
             trust_remote_code=True,
         )
+        self.log.debug(
+            "Model loaded",
+            model_name=self.model_name,
+            duration=time() - start_time,
+        )
         return self.embedding_model

-    async def embed(
+    async def embed(
+        self, data: list[EmbeddingRequest]
+    ) -> AsyncGenerator[list[EmbeddingResponse], None]:
         """Embed a list of strings."""
         model = self._model()

-        batched_data = split_sub_batches(self.
+        batched_data = split_sub_batches(self._encoding(), data)

-
-
-
-
-
+        for batch in batched_data:
+            embeddings = model.encode(
+                [i.text for i in batch], show_progress_bar=False, batch_size=4
+            )
+            yield [
+                EmbeddingResponse(
+                    id=item.id,
+                    embedding=[float(x) for x in embedding],
+                )
+                for item, embedding in zip(batch, embeddings, strict=True)
+            ]
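Both the tiktoken encoding and the SentenceTransformer model are now loaded lazily, with a timed debug log on first use. The pattern in isolation (a generic sketch, not package code):

from time import time


class LazyResource:
    """Load an expensive resource on first access and log how long it took."""

    def __init__(self) -> None:
        self._resource: str | None = None

    def get(self) -> str:
        if self._resource is None:
            start = time()
            self._resource = "expensive model"  # stand-in for the real load
            print(f"loaded in {time() - start:.3f}s")
        return self._resource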
kodit/embedding/embedding_provider/openai_embedding_provider.py
CHANGED
@@ -1,6 +1,7 @@
 """OpenAI embedding service."""

 import asyncio
+from collections.abc import AsyncGenerator

 import structlog
 import tiktoken
@@ -8,7 +9,8 @@ from openai import AsyncOpenAI

 from kodit.embedding.embedding_provider.embedding_provider import (
     EmbeddingProvider,
-
+    EmbeddingRequest,
+    EmbeddingResponse,
     split_sub_batches,
 )

@@ -31,7 +33,9 @@ class OpenAIEmbeddingProvider(EmbeddingProvider):
             "text-embedding-3-small"
         )  # Sensible default

-    async def embed(
+    async def embed(
+        self, data: list[EmbeddingRequest]
+    ) -> AsyncGenerator[list[EmbeddingResponse], None]:
         """Embed a list of documents."""
         # First split the list into a list of list where each sublist has fewer than
         # max tokens.
@@ -40,38 +44,30 @@
         # Process batches in parallel with a semaphore to limit concurrent requests
         sem = asyncio.Semaphore(OPENAI_NUM_PARALLEL_TASKS)

-        # Create a list of tuples with a temporary id for each batch
-        # We need to do this so that we can return the results in the same order as the
-        # input data
-        input_data = [(i, batch) for i, batch in enumerate(batched_data)]
-
         async def process_batch(
-            data:
-        ) ->
-            batch_id, batch = data
+            data: list[EmbeddingRequest],
+        ) -> list[EmbeddingResponse]:
             async with sem:
                 try:
                     response = await self.openai_client.embeddings.create(
                         model=self.model_name,
-                        input=
+                        input=[i.text for i in data],
                     )
-                    return
-
-
+                    return [
+                        EmbeddingResponse(
+                            id=item.id,
+                            embedding=embedding.embedding,
+                        )
+                        for item, embedding in zip(data, response.data, strict=True)
                     ]
                 except Exception as e:
                     self.log.exception("Error embedding batch", error=str(e))
-                    return
+                    return []

         # Create tasks for all batches
-        tasks = [process_batch(batch) for batch in
+        tasks = [process_batch(batch) for batch in batched_data]

         # Process all batches and yield results as they complete
-        results: list[tuple[int, list[Vector]]] = []
         for task in asyncio.as_completed(tasks):
             result = await task
-
-
-        # Output in the same order as the input data
-        ordered_results = [result for _, result in sorted(results, key=lambda x: x[0])]
-        return [item for sublist in ordered_results for item in sublist]
+            yield result
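The OpenAI provider keeps its bounded concurrency but drops the order-tracking bookkeeping: results now stream out via asyncio.as_completed, with ordering carried by the ids inside each EmbeddingResponse. The concurrency skeleton in isolation (the sleep stands in for the API call):

import asyncio

sem = asyncio.Semaphore(10)  # mirrors OPENAI_NUM_PARALLEL_TASKS


async def work(i: int) -> int:
    async with sem:  # at most 10 requests in flight
        await asyncio.sleep(0.01)  # stand-in for the remote call
        return i


async def main() -> None:
    tasks = [work(i) for i in range(25)]
    for task in asyncio.as_completed(tasks):  # yields in completion order
        print(await task)


asyncio.run(main())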
kodit/embedding/local_vector_search_service.py
CHANGED
@@ -1,12 +1,18 @@
 """Local vector search."""

+from collections.abc import AsyncGenerator
+
 import structlog
 import tiktoken

 from kodit.embedding.embedding_models import Embedding, EmbeddingType
-from kodit.embedding.embedding_provider.embedding_provider import
+from kodit.embedding.embedding_provider.embedding_provider import (
+    EmbeddingProvider,
+    EmbeddingRequest,
+)
 from kodit.embedding.embedding_repository import EmbeddingRepository
 from kodit.embedding.vector_search_service import (
+    IndexResult,
     VectorSearchRequest,
     VectorSearchResponse,
     VectorSearchService,
@@ -20,35 +26,62 @@ class LocalVectorSearchService(VectorSearchService):
         self,
         embedding_repository: EmbeddingRepository,
         embedding_provider: EmbeddingProvider,
+        embedding_type: EmbeddingType = EmbeddingType.CODE,
     ) -> None:
         """Initialize the local embedder."""
         self.log = structlog.get_logger(__name__)
         self.embedding_repository = embedding_repository
         self.embedding_provider = embedding_provider
         self.encoding = tiktoken.encoding_for_model("text-embedding-3-small")
+        self.embedding_type = embedding_type

-    async def index(
+    async def index(
+        self, data: list[VectorSearchRequest]
+    ) -> AsyncGenerator[list[IndexResult], None]:
         """Embed a list of documents."""
         if not data or len(data) == 0:
-            self.log.warning("Embedding data is empty, skipping embedding")
             return

-
-
-
-
-
-
-
+        requests = [EmbeddingRequest(id=doc.snippet_id, text=doc.text) for doc in data]
+
+        async for batch in self.embedding_provider.embed(requests):
+            for result in batch:
+                await self.embedding_repository.create_embedding(
+                    Embedding(
+                        snippet_id=result.id,
+                        embedding=result.embedding,
+                        type=self.embedding_type,
+                    )
                 )
-
+                yield [IndexResult(snippet_id=result.id)]

     async def retrieve(self, query: str, top_k: int = 10) -> list[VectorSearchResponse]:
         """Query the embedding model."""
-
+        # Build a single-item request and collect its embedding.
+        req = EmbeddingRequest(id=0, text=query)
+        embedding_vec: list[float] | None = None
+        async for batch in self.embedding_provider.embed([req]):
+            if batch:
+                embedding_vec = [float(v) for v in batch[0].embedding]
+                break
+
+        if not embedding_vec:
+            return []
+
         results = await self.embedding_repository.list_semantic_results(
-
+            self.embedding_type, embedding_vec, top_k
         )
         return [
             VectorSearchResponse(snippet_id, score) for snippet_id, score in results
         ]
+
+    async def has_embedding(
+        self, snippet_id: int, embedding_type: EmbeddingType
+    ) -> bool:
+        """Check if a snippet has an embedding."""
+        return (
+            await self.embedding_repository.get_embedding_by_snippet_id_and_type(
+                snippet_id, embedding_type
+            )
+            is not None
+        )
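retrieve() now drains a single-request generator to obtain the query vector. A hypothetical helper with the same shape (not part of the package):

from kodit.embedding.embedding_provider.embedding_provider import (
    EmbeddingProvider,
    EmbeddingRequest,
)


async def first_embedding(
    provider: EmbeddingProvider, text: str
) -> list[float] | None:
    """Return the first embedding yielded for a single-item request, if any."""
    async for batch in provider.embed([EmbeddingRequest(id=0, text=text)]):
        if batch:
            return [float(v) for v in batch[0].embedding]
    return None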
kodit/embedding/vector_search_service.py
CHANGED
@@ -1,8 +1,11 @@
 """Embedding service."""

 from abc import ABC, abstractmethod
+from collections.abc import AsyncGenerator
 from typing import NamedTuple

+from kodit.embedding.embedding_models import EmbeddingType
+

 class VectorSearchResponse(NamedTuple):
     """Embedding result."""
@@ -18,11 +21,19 @@ class VectorSearchRequest(NamedTuple):
     text: str


+class IndexResult(NamedTuple):
+    """Result of indexing."""
+
+    snippet_id: int
+
+
 class VectorSearchService(ABC):
     """Semantic search service interface."""

     @abstractmethod
-
+    def index(
+        self, data: list[VectorSearchRequest]
+    ) -> AsyncGenerator[list[IndexResult], None]:
         """Embed a list of documents.

         The embedding service accepts a massive list of id,strings to embed. Behind the
@@ -36,3 +47,9 @@ class VectorSearchService(ABC):
     @abstractmethod
     async def retrieve(self, query: str, top_k: int = 10) -> list[VectorSearchResponse]:
         """Query the embedding model."""
+
+    @abstractmethod
+    async def has_embedding(
+        self, snippet_id: int, embedding_type: EmbeddingType
+    ) -> bool:
+        """Check if a snippet has an embedding."""
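The new abstract has_embedding lets callers skip snippets that are already indexed. A hedged usage sketch; the calling convention below is assumed, not shown in this release:

from kodit.embedding.embedding_models import EmbeddingType
from kodit.embedding.vector_search_service import VectorSearchService


async def unindexed(service: VectorSearchService, snippet_ids: list[int]) -> list[int]:
    """Filter down to snippets that do not yet have a CODE embedding."""
    return [
        sid
        for sid in snippet_ids
        if not await service.has_embedding(sid, EmbeddingType.CODE)
    ]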
kodit/embedding/vectorchord_vector_search_service.py
CHANGED
@@ -1,13 +1,19 @@
 """Vectorchord vector search."""

+from collections.abc import AsyncGenerator
 from typing import Any, Literal

 import structlog
 from sqlalchemy import Result, TextClause, text
 from sqlalchemy.ext.asyncio import AsyncSession

-from kodit.embedding.
+from kodit.embedding.embedding_models import EmbeddingType
+from kodit.embedding.embedding_provider.embedding_provider import (
+    EmbeddingProvider,
+    EmbeddingRequest,
+)
 from kodit.embedding.vector_search_service import (
+    IndexResult,
     VectorSearchRequest,
     VectorSearchResponse,
     VectorSearchService,
@@ -52,6 +58,10 @@ ORDER BY score ASC
 LIMIT :top_k;
 """

+CHECK_VCHORD_EMBEDDING_EXISTS = """
+SELECT EXISTS(SELECT 1 FROM {TABLE_NAME} WHERE snippet_id = :snippet_id)
+"""
+
 TaskName = Literal["code", "text"]


@@ -89,7 +99,15 @@ class VectorChordVectorSearchService(VectorSearchService):

     async def _create_tables(self) -> None:
         """Create the necessary tables."""
-
+        req = EmbeddingRequest(id=0, text="dimension")
+        vector_dim: list[float] | None = None
+        async for batch in self.embedding_provider.embed([req]):
+            if batch:
+                vector_dim = batch[0].embedding
+                break
+        if vector_dim is None:
+            msg = "Failed to obtain embedding dimension from provider"
+            raise RuntimeError(msg)
         await self._session.execute(
             text(
                 f"""CREATE TABLE IF NOT EXISTS {self.table_name} (
@@ -130,31 +148,48 @@ class VectorChordVectorSearchService(VectorSearchService):
         """Commit the session."""
         await self._session.commit()

-    async def index(
+    async def index(
+        self, data: list[VectorSearchRequest]
+    ) -> AsyncGenerator[list[IndexResult], None]:
         """Embed a list of documents."""
         if not data or len(data) == 0:
             self.log.warning("Embedding data is empty, skipping embedding")
             return

-
-
-
-
-
-
-
-
-
-
+        requests = [EmbeddingRequest(id=doc.snippet_id, text=doc.text) for doc in data]
+
+        async for batch in self.embedding_provider.embed(requests):
+            await self._execute(
+                text(INSERT_QUERY.format(TABLE_NAME=self.table_name)),
+                [
+                    {
+                        "snippet_id": result.id,
+                        "embedding": str(result.embedding),
+                    }
+                    for result in batch
+                ],
+            )
+            await self._commit()
+            yield [IndexResult(snippet_id=result.id) for result in batch]

     async def retrieve(self, query: str, top_k: int = 10) -> list[VectorSearchResponse]:
         """Query the embedding model."""
-
-
+        from kodit.embedding.embedding_provider.embedding_provider import (
+            EmbeddingRequest,
+        )
+
+        req = EmbeddingRequest(id=0, text=query)
+        embedding_vec: list[float] | None = None
+        async for batch in self.embedding_provider.embed([req]):
+            if batch:
+                embedding_vec = batch[0].embedding
+                break
+
+        if not embedding_vec:
             return []
         result = await self._execute(
             text(SEARCH_QUERY.format(TABLE_NAME=self.table_name)),
-            {"query": str(
+            {"query": str(embedding_vec), "top_k": top_k},
         )
         rows = result.mappings().all()

@@ -162,3 +197,15 @@ class VectorChordVectorSearchService(VectorSearchService):
             VectorSearchResponse(snippet_id=row["snippet_id"], score=row["score"])
             for row in rows
         ]
+
+    async def has_embedding(
+        self,
+        snippet_id: int,
+        embedding_type: EmbeddingType,  # noqa: ARG002
+    ) -> bool:
+        """Check if a snippet has an embedding."""
+        result = await self._execute(
+            text(CHECK_VCHORD_EMBEDDING_EXISTS.format(TABLE_NAME=self.table_name)),
+            {"snippet_id": snippet_id},
+        )
+        return result.scalar_one()
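The VectorChord implementation answers has_embedding with a SQL EXISTS probe: the table name is interpolated with str.format because identifiers cannot be bound parameters, while snippet_id stays a bound parameter. The query-building step in isolation (the table name here is illustrative; the real one is dynamic):

from sqlalchemy import text

TABLE_NAME = "vectorchord_code_embeddings"  # illustrative stand-in
query = text(
    "SELECT EXISTS(SELECT 1 FROM {TABLE_NAME} WHERE snippet_id = :snippet_id)".format(
        TABLE_NAME=TABLE_NAME
    )
)
# Usage inside an async session (sketch):
#   result = await session.execute(query, {"snippet_id": 42})
#   exists = result.scalar_one()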
kodit/enrichment/enrichment_provider/enrichment_provider.py
CHANGED
@@ -1,6 +1,8 @@
 """Enrichment provider."""

 from abc import ABC, abstractmethod
+from collections.abc import AsyncGenerator
+from dataclasses import dataclass

 ENRICHMENT_SYSTEM_PROMPT = """
 You are a professional software developer. You will be given a snippet of code.
@@ -8,9 +10,27 @@ Please provide a concise explanation of the code.
 """


+@dataclass
+class EnrichmentRequest:
+    """Enrichment request."""
+
+    snippet_id: int
+    text: str
+
+
+@dataclass
+class EnrichmentResponse:
+    """Enrichment response."""
+
+    snippet_id: int
+    text: str
+
+
 class EnrichmentProvider(ABC):
     """Enrichment provider."""

     @abstractmethod
-
+    def enrich(
+        self, data: list[EnrichmentRequest]
+    ) -> AsyncGenerator[EnrichmentResponse, None]:
         """Enrich a list of strings."""
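Implementers of the new interface write an async generator keyed by snippet_id. A minimal hypothetical provider, for illustration only:

from collections.abc import AsyncGenerator

from kodit.enrichment.enrichment_provider.enrichment_provider import (
    EnrichmentProvider,
    EnrichmentRequest,
    EnrichmentResponse,
)


class EchoEnrichmentProvider(EnrichmentProvider):
    """Hypothetical provider that echoes each request back as its enrichment."""

    async def enrich(
        self, data: list[EnrichmentRequest]
    ) -> AsyncGenerator[EnrichmentResponse, None]:
        for request in data:
            yield EnrichmentResponse(snippet_id=request.snippet_id, text=request.text)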
kodit/enrichment/enrichment_provider/local_enrichment_provider.py
CHANGED
@@ -1,15 +1,19 @@
 """Local embedding service."""

 import os
+from collections.abc import AsyncGenerator

 import structlog
 import tiktoken
-from tqdm import tqdm

-from kodit.embedding.embedding_provider.embedding_provider import
+from kodit.embedding.embedding_provider.embedding_provider import (
+    EmbeddingRequest,
+)
 from kodit.enrichment.enrichment_provider.enrichment_provider import (
     ENRICHMENT_SYSTEM_PROMPT,
     EnrichmentProvider,
+    EnrichmentRequest,
+    EnrichmentResponse,
 )

 DEFAULT_ENRICHMENT_MODEL = "Qwen/Qwen3-0.6B"
@@ -32,11 +36,16 @@ class LocalEnrichmentProvider(EnrichmentProvider):
         self.tokenizer = None
         self.encoding = tiktoken.encoding_for_model("text-embedding-3-small")

-    async def enrich(
+    async def enrich(
+        self, data: list[EnrichmentRequest]
+    ) -> AsyncGenerator[EnrichmentResponse, None]:
         """Enrich a list of strings."""
+        # Remove empty snippets
+        data = [snippet for snippet in data if snippet.text]
+
         if not data or len(data) == 0:
             self.log.warning("Data is empty, skipping enrichment")
-            return
+            return

         from transformers.models.auto.modeling_auto import (
             AutoModelForCausalLM,
@@ -57,36 +66,38 @@ class LocalEnrichmentProvider(EnrichmentProvider):
         )

         # Prepare prompts
-        prompts = [
-
-
-
-
-
-
-
-
+        prompts: list[EmbeddingRequest] = [
+            EmbeddingRequest(
+                id=snippet.snippet_id,
+                text=self.tokenizer.apply_chat_template(
+                    [
+                        {"role": "system", "content": ENRICHMENT_SYSTEM_PROMPT},
+                        {"role": "user", "content": snippet.text},
+                    ],
+                    tokenize=False,
+                    add_generation_prompt=True,
+                    enable_thinking=False,
+                ),
             )
             for snippet in data
         ]

-
-        batched_prompts = split_sub_batches(
-            self.encoding, prompts, max_context_window=self.context_window
-        )
-        results = []
-        for batch in tqdm(batched_prompts, leave=False, total=len(batched_prompts)):
+        for prompt in prompts:
             model_inputs = self.tokenizer(
-
+                prompt.text,
+                return_tensors="pt",
+                padding=True,
+                truncation=True,
             ).to(self.model.device)
             generated_ids = self.model.generate(
                 **model_inputs, max_new_tokens=self.context_window
             )
-
-
-
-
-
-
-
-
+            input_ids = model_inputs["input_ids"][0]
+            output_ids = generated_ids[0][len(input_ids) :].tolist()
+            content = self.tokenizer.decode(output_ids, skip_special_tokens=True).strip(
+                "\n"
+            )
+            yield EnrichmentResponse(
+                snippet_id=prompt.id,
+                text=content,
+            )
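The generation loop now decodes only the newly generated tokens by slicing the output at the prompt length, a common transformers idiom. The slicing logic in isolation, with plain lists standing in for the tensors:

prompt_tokens = [101, 2023, 2003, 1037, 3231]  # stand-in for model_inputs["input_ids"][0]
generated = prompt_tokens + [7592, 2088, 102]  # stand-in for generated_ids[0]
new_tokens = generated[len(prompt_tokens):]    # only what the model produced
print(new_tokens)  # [7592, 2088, 102] -> what gets passed to tokenizer.decode(...)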
kodit/enrichment/enrichment_provider/openai_enrichment_provider.py
CHANGED
@@ -1,15 +1,17 @@
 """OpenAI embedding service."""

 import asyncio
+from collections.abc import AsyncGenerator

 import structlog
 import tiktoken
 from openai import AsyncOpenAI
-from tqdm import tqdm

 from kodit.enrichment.enrichment_provider.enrichment_provider import (
     ENRICHMENT_SYSTEM_PROMPT,
     EnrichmentProvider,
+    EnrichmentRequest,
+    EnrichmentResponse,
 )

 OPENAI_NUM_PARALLEL_TASKS = 10
@@ -29,25 +31,24 @@ class OpenAIEnrichmentProvider(EnrichmentProvider):
         self.model_name = model_name
         self.encoding = tiktoken.encoding_for_model("gpt-4o-mini")  # Approximation

-    async def enrich(
+    async def enrich(
+        self, data: list[EnrichmentRequest]
+    ) -> AsyncGenerator[EnrichmentResponse, None]:
         """Enrich a list of documents."""
         if not data or len(data) == 0:
             self.log.warning("Data is empty, skipping enrichment")
-            return
+            return

         # Process batches in parallel with a semaphore to limit concurrent requests
         sem = asyncio.Semaphore(OPENAI_NUM_PARALLEL_TASKS)

-
-        # We need to do this so that we can return the results in the same order as the
-        # input data
-        input_data = [(i, snippet) for i, snippet in enumerate(data)]
-
-        async def process_data(data: tuple[int, str]) -> tuple[int, str]:
-            snippet_id, snippet = data
-            if not snippet:
-                return snippet_id, ""
+        async def process_data(data: EnrichmentRequest) -> EnrichmentResponse:
             async with sem:
+                if not data.text:
+                    return EnrichmentResponse(
+                        snippet_id=data.snippet_id,
+                        text="",
+                    )
                 try:
                     response = await self.openai_client.chat.completions.create(
                         model=self.model_name,
@@ -56,26 +57,23 @@ class OpenAIEnrichmentProvider(EnrichmentProvider):
                             "role": "system",
                             "content": ENRICHMENT_SYSTEM_PROMPT,
                         },
-                        {"role": "user", "content":
+                        {"role": "user", "content": data.text},
                     ],
                 )
-                return
+                return EnrichmentResponse(
+                    snippet_id=data.snippet_id,
+                    text=response.choices[0].message.content or "",
+                )
             except Exception as e:
                 self.log.exception("Error enriching data", error=str(e))
-                return
+                return EnrichmentResponse(
+                    snippet_id=data.snippet_id,
+                    text="",
+                )

         # Create tasks for all data
-        tasks = [process_data(snippet) for snippet in
+        tasks = [process_data(snippet) for snippet in data]

         # Process all data and yield results as they complete
-
-
-            asyncio.as_completed(tasks),
-            total=len(tasks),
-            leave=False,
-        ):
-            result = await task
-            results.append(result)
-
-        # Output in the same order as the input data
-        return [result for _, result in sorted(results, key=lambda x: x[0])]
+        for task in asyncio.as_completed(tasks):
+            yield await task
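On failure the OpenAI enricher now yields an empty EnrichmentResponse for the same snippet_id instead of dropping the item, so downstream consumers always see one response per request. The shape of that choice in isolation (call_llm is a hypothetical stand-in for the remote request):

from kodit.enrichment.enrichment_provider.enrichment_provider import (
    EnrichmentRequest,
    EnrichmentResponse,
)


async def call_llm(text: str) -> str:
    """Hypothetical remote call; stands in for the OpenAI request."""
    raise RuntimeError("simulated API failure")


async def safe_enrich_one(request: EnrichmentRequest) -> EnrichmentResponse:
    # Mirrors process_data's contract: never drop an item, degrade to "".
    try:
        text = await call_llm(request.text)
    except Exception:
        text = ""
    return EnrichmentResponse(snippet_id=request.snippet_id, text=text)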
kodit/enrichment/enrichment_service.py
CHANGED
@@ -1,24 +1,34 @@
 """Enrichment service."""

 from abc import ABC, abstractmethod
+from collections.abc import AsyncGenerator

-from kodit.enrichment.enrichment_provider.enrichment_provider import
+from kodit.enrichment.enrichment_provider.enrichment_provider import (
+    EnrichmentProvider,
+    EnrichmentRequest,
+    EnrichmentResponse,
+)


 class EnrichmentService(ABC):
     """Enrichment service."""

     @abstractmethod
-
+    def enrich(
+        self, data: list[EnrichmentRequest]
+    ) -> AsyncGenerator[EnrichmentResponse, None]:
         """Enrich a list of strings."""


 class NullEnrichmentService(EnrichmentService):
     """Null enrichment service."""

-    async def enrich(
+    async def enrich(
+        self, data: list[EnrichmentRequest]
+    ) -> AsyncGenerator[EnrichmentResponse, None]:
         """Enrich a list of strings."""
-
+        for request in data:
+            yield EnrichmentResponse(snippet_id=request.snippet_id, text="")


 class LLMEnrichmentService(EnrichmentService):
@@ -28,6 +38,8 @@ class LLMEnrichmentService(EnrichmentService):
         """Initialize the enrichment service."""
         self.enrichment_provider = enrichment_provider

-
-
-
+    def enrich(
+        self, data: list[EnrichmentRequest]
+    ) -> AsyncGenerator[EnrichmentResponse, None]:
+        """Enrich a list of snippets."""
+        return self.enrichment_provider.enrich(data)
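Note the two shapes this file now uses: NullEnrichmentService declares an `async def enrich` containing `yield`, which makes the method itself an async generator, while LLMEnrichmentService declares a plain `def enrich` that returns the provider's async generator directly. Both are consumed identically with `async for`. A minimal sketch of the equivalence:

from collections.abc import AsyncGenerator


async def gen() -> AsyncGenerator[int, None]:
    yield 1  # `async def` + `yield` -> async generator function


def delegate() -> AsyncGenerator[int, None]:
    return gen()  # a plain `def` returning an async generator works the same way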
kodit/indexing/indexing_service.py
CHANGED
@@ -22,6 +22,7 @@ from kodit.embedding.vector_search_service import (
     VectorSearchRequest,
     VectorSearchService,
 )
+from kodit.enrichment.enrichment_provider.enrichment_provider import EnrichmentRequest
 from kodit.enrichment.enrichment_service import EnrichmentService
 from kodit.indexing.fusion import FusionRequest, reciprocal_rank_fusion
 from kodit.indexing.indexing_models import Snippet
@@ -200,37 +201,42 @@ class IndexService:
         )

         self.log.info("Creating semantic code index")
-        with
-
+        with tqdm(total=len(snippets), leave=False) as pbar:
+            async for result in self.code_search_service.index(
                 [
                     VectorSearchRequest(snippet.id, snippet.content)
                     for snippet in snippets
                 ]
-        )
+            ):
+                pbar.update(len(result))

         self.log.info("Enriching snippets", num_snippets=len(snippets))
-        enriched_contents =
-
+        enriched_contents = []
+        with tqdm(total=len(snippets), leave=False) as pbar:
+            async for result in self.enrichment_service.enrich(
+                [
+                    EnrichmentRequest(snippet_id=snippet.id, text=snippet.content)
+                    for snippet in snippets
+                ]
+            ):
+                snippet = next(s for s in snippets if s.id == result.snippet_id)
+                if snippet:
+                    snippet.content = (
+                        result.text + "\n\n```\n" + snippet.content + "\n```"
+                    )
+                    await self.repository.add_snippet(snippet)
+                    enriched_contents.append(result)
+                    pbar.update(1)

         self.log.info("Creating semantic text index")
-        with
-
+        with tqdm(total=len(snippets), leave=False) as pbar:
+            async for result in self.text_search_service.index(
                 [
-                    VectorSearchRequest(snippet.id,
-                    for snippet
-                        snippets, enriched_contents, strict=True
-                    )
+                    VectorSearchRequest(snippet.id, snippet.content)
+                    for snippet in snippets
                 ]
-            )
-        # Add the enriched text back to the snippets and write to the database
-        for snippet, enriched_content in zip(
-            snippets, enriched_contents, strict=True
         ):
-
-            enriched_content + "\n\n```\n" + snippet.content + "\n```"
-        )
-        await self.repository.add_snippet(snippet)
+            pbar.update(len(result))

         # Update index timestamp
         await self.repository.update_index_timestamp(index)
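The indexing pipeline now drives a manual tqdm bar from the async generators, updating by batch size as results stream in. The progress pattern in isolation (the sleep and sizes are stand-ins for real embedding work):

import asyncio

from tqdm import tqdm


async def batches():
    for n in (3, 2, 5):
        await asyncio.sleep(0)  # stand-in for real embedding work
        yield list(range(n))


async def main() -> None:
    with tqdm(total=10, leave=False) as pbar:
        async for batch in batches():
            pbar.update(len(batch))


asyncio.run(main())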
{kodit-0.2.3.dist-info → kodit-0.2.4.dist-info}/RECORD
CHANGED
@@ -1,6 +1,6 @@
 kodit/.gitignore,sha256=ztkjgRwL9Uud1OEi36hGQeDGk3OLK1NfDEO8YqGYy8o,11
 kodit/__init__.py,sha256=aEKHYninUq1yh6jaNfvJBYg-6fenpN132nJt1UU6Jxs,59
-kodit/_version.py,sha256=
+kodit/_version.py,sha256=1LUN_sRKOiFInoB6AlW6TYoQMCh1Z4KutwcHNvHcfB0,511
 kodit/app.py,sha256=qKBWJ0VNSY_M6G3VFfAQ0133q5bnS99cUFD0p396taw,1032
 kodit/cli.py,sha256=wKFXGUMX-fDLooaK-3po2TBpNNRBwgSD7BRbUddg-_M,11562
 kodit/config.py,sha256=3yh7hfLSILjZK_qJMhcExwRcrWJ0b5Eb1JjjOvMPJZo,4146
@@ -14,29 +14,29 @@ kodit/bm25/keyword_search_service.py,sha256=aBbWQKgQmi2re3EIHdXFS00n7Wj3b2D0pZsL
 kodit/bm25/local_bm25.py,sha256=nokrd_xAeqXi3m68X5P1R5KBhRRB1E2L_J6Zgm26PCg,3869
 kodit/bm25/vectorchord_bm25.py,sha256=0p_FgliaoevB8GLSmzWnV3zUjdcWgCgOKIpLURr7Qfo,6549
 kodit/embedding/__init__.py,sha256=h9NXzDA1r-K23nvBajBV-RJzHJN0p3UJ7UQsmdnOoRw,24
-kodit/embedding/embedding_factory.py,sha256
+kodit/embedding/embedding_factory.py,sha256=lFcgqsDxw8L5mygq-TppQ2wtoIA2p2OL7XmtOyX8Omw,2683
 kodit/embedding/embedding_models.py,sha256=rN90vSs86dYiqoawcp8E9jtwY31JoJXYfaDlsJK7uqc,656
 kodit/embedding/embedding_repository.py,sha256=-ux3scpBzel8c0pMH9fNOEsSXFIzl-IfgaWrkTb1szo,6907
-kodit/embedding/local_vector_search_service.py,sha256=
-kodit/embedding/vector_search_service.py,sha256=
-kodit/embedding/vectorchord_vector_search_service.py,sha256=
+kodit/embedding/local_vector_search_service.py,sha256=yZm0ahQQKhfYZ943yxKHp04cairmzgGBUNi5PB_GDbo,3002
+kodit/embedding/vector_search_service.py,sha256=frN9baAlqFmsY3xiv1ZeSgsfhK9FzKPkVR55MEvMV4I,1416
+kodit/embedding/vectorchord_vector_search_service.py,sha256=JQeIl9mtR4E_izOoFD_4ZRfENHNfwoKr16pQkkGoK3o,6884
 kodit/embedding/embedding_provider/__init__.py,sha256=h9NXzDA1r-K23nvBajBV-RJzHJN0p3UJ7UQsmdnOoRw,24
-kodit/embedding/embedding_provider/embedding_provider.py,sha256=
-kodit/embedding/embedding_provider/hash_embedding_provider.py,sha256=
-kodit/embedding/embedding_provider/local_embedding_provider.py,sha256=
-kodit/embedding/embedding_provider/openai_embedding_provider.py,sha256
+kodit/embedding/embedding_provider/embedding_provider.py,sha256=WDHifrsQOnpXwIDzSfau32Eq8z8BF3XNeVYd6X989uc,2841
+kodit/embedding/embedding_provider/hash_embedding_provider.py,sha256=AhGize94EoScyQMhCjo26zlO0eP_m3F_1qvrVmB6MTE,2941
+kodit/embedding/embedding_provider/local_embedding_provider.py,sha256=kqbGd7TW6BUsOq_f_IzPCsD7z8LsFieTOZ7saY11I8o,2877
+kodit/embedding/embedding_provider/openai_embedding_provider.py,sha256=2FTIL34yVstf0NTJNSi-sjk38OJd4Aa66TH5FMPJul0,2425
 kodit/enrichment/__init__.py,sha256=vBEolHpKaHUhfINX0dSGyAPlvgpLNAer9YzFtdvCB24,18
 kodit/enrichment/enrichment_factory.py,sha256=AAzvxgjo-FQU5aAm9Zla4DAwUMKGrcw8mQwJsMhIsHY,1566
-kodit/enrichment/enrichment_service.py,sha256=
+kodit/enrichment/enrichment_service.py,sha256=z7VrrQ-Jhb-oO26rQCaqlpmkGRlDQGAu7qVsI0cwHak,1310
 kodit/enrichment/enrichment_provider/__init__.py,sha256=klf8iuLVWX4iRz-DZQauFFNAoJC5CByczh48TBZPW-o,27
-kodit/enrichment/enrichment_provider/enrichment_provider.py,sha256=
-kodit/enrichment/enrichment_provider/local_enrichment_provider.py,sha256=
-kodit/enrichment/enrichment_provider/openai_enrichment_provider.py,sha256=
+kodit/enrichment/enrichment_provider/enrichment_provider.py,sha256=kiDgg2G8G85K4KqwCQKHE_ANybANURPO6NbASf4yAr0,751
+kodit/enrichment/enrichment_provider/local_enrichment_provider.py,sha256=JipvgZwfNvHaECqLJRmQo3W10yb9rOVSrV2U0Jpp4d8,3456
+kodit/enrichment/enrichment_provider/openai_enrichment_provider.py,sha256=xAzbGHJHqGxZxa3yPvHAcPgjOMzQ05qLes0XW6OIdYc,2758
 kodit/indexing/__init__.py,sha256=cPyi2Iej3G1JFWlWr7X80_UrsMaTu5W5rBwgif1B3xo,75
 kodit/indexing/fusion.py,sha256=TZb4fPAedXdEUXzwzOofW98QIOymdbclBOP1KOijuEk,1674
 kodit/indexing/indexing_models.py,sha256=6NX9HVcj6Pu9ePwHC7n-PWSyAgukpJq0nCNmUIigtbo,1282
 kodit/indexing/indexing_repository.py,sha256=dqOS0pxKM6bUjMXWqYukAK8XdiD36OnskFASgZRXRQM,6955
-kodit/indexing/indexing_service.py,sha256=
+kodit/indexing/indexing_service.py,sha256=UD7RKQRkAlpmepl20vcdEgQapwEA2kDJQBmn4_kGWwU,11841
 kodit/migrations/README,sha256=ISVtAOvqvKk_5ThM5ioJE-lMkvf9IbknFUFVU_vPma4,58
 kodit/migrations/__init__.py,sha256=lP5MuwlyWRMO6UcDWnQcQ3G-GYHcFb6rl9gYPHJ1sjo,40
 kodit/migrations/env.py,sha256=w1M7OZh-ZeR2dPHS0ByXAUxQjfZQ8xIzMseWuzLDTWw,2469
@@ -64,8 +64,8 @@ kodit/source/source_repository.py,sha256=eme0C3pRqwFZ1ZSbqq4Z6SV9CC6AvRmiOjy3eHQ
 kodit/source/source_service.py,sha256=E1KPG7TrorqdreJVHxZPx8CVLncOxGEvZ5uDQ6yZugo,5050
 kodit/util/__init__.py,sha256=bPu6CtqDWCRGU7VgW2_aiQrCBi8G89FS6k1PjvDajJ0,37
 kodit/util/spinner.py,sha256=R9bzrHtBiIH6IfLbmsIVHL53s8vg-tqW4lwGGALu4dw,1932
-kodit-0.2.
-kodit-0.2.
-kodit-0.2.
-kodit-0.2.
-kodit-0.2.
+kodit-0.2.4.dist-info/METADATA,sha256=PLQQVNKVnMyyliP9TEapeXUuog_N1bTFlup6F89B7NU,5867
+kodit-0.2.4.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+kodit-0.2.4.dist-info/entry_points.txt,sha256=hoTn-1aKyTItjnY91fnO-rV5uaWQLQ-Vi7V5et2IbHY,40
+kodit-0.2.4.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+kodit-0.2.4.dist-info/RECORD,,
{kodit-0.2.3.dist-info → kodit-0.2.4.dist-info}/WHEEL
File without changes
{kodit-0.2.3.dist-info → kodit-0.2.4.dist-info}/entry_points.txt
File without changes
{kodit-0.2.3.dist-info → kodit-0.2.4.dist-info}/licenses/LICENSE
File without changes