kodit 0.2.2__py3-none-any.whl → 0.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of kodit might be problematic. Click here for more details.
- kodit/_version.py +2 -2
- kodit/app.py +6 -0
- kodit/cli.py +8 -2
- kodit/embedding/embedding_factory.py +11 -0
- kodit/embedding/embedding_provider/embedding_provider.py +42 -14
- kodit/embedding/embedding_provider/hash_embedding_provider.py +16 -7
- kodit/embedding/embedding_provider/local_embedding_provider.py +43 -11
- kodit/embedding/embedding_provider/openai_embedding_provider.py +18 -22
- kodit/embedding/local_vector_search_service.py +46 -13
- kodit/embedding/vector_search_service.py +18 -1
- kodit/embedding/vectorchord_vector_search_service.py +63 -16
- kodit/enrichment/enrichment_factory.py +3 -0
- kodit/enrichment/enrichment_provider/enrichment_provider.py +21 -1
- kodit/enrichment/enrichment_provider/local_enrichment_provider.py +39 -28
- kodit/enrichment/enrichment_provider/openai_enrichment_provider.py +25 -27
- kodit/enrichment/enrichment_service.py +19 -7
- kodit/indexing/indexing_service.py +50 -23
- kodit/log.py +126 -24
- kodit/migrations/versions/9e53ea8bb3b0_add_authors.py +103 -0
- kodit/source/source_factories.py +356 -0
- kodit/source/source_models.py +17 -5
- kodit/source/source_repository.py +49 -20
- kodit/source/source_service.py +41 -218
- {kodit-0.2.2.dist-info → kodit-0.2.4.dist-info}/METADATA +2 -2
- {kodit-0.2.2.dist-info → kodit-0.2.4.dist-info}/RECORD +28 -27
- kodit/migrations/versions/42e836b21102_add_authors.py +0 -64
- {kodit-0.2.2.dist-info → kodit-0.2.4.dist-info}/WHEEL +0 -0
- {kodit-0.2.2.dist-info → kodit-0.2.4.dist-info}/entry_points.txt +0 -0
- {kodit-0.2.2.dist-info → kodit-0.2.4.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,13 +1,19 @@
|
|
|
1
1
|
"""Vectorchord vector search."""
|
|
2
2
|
|
|
3
|
+
from collections.abc import AsyncGenerator
|
|
3
4
|
from typing import Any, Literal
|
|
4
5
|
|
|
5
6
|
import structlog
|
|
6
7
|
from sqlalchemy import Result, TextClause, text
|
|
7
8
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
8
9
|
|
|
9
|
-
from kodit.embedding.
|
|
10
|
+
from kodit.embedding.embedding_models import EmbeddingType
|
|
11
|
+
from kodit.embedding.embedding_provider.embedding_provider import (
|
|
12
|
+
EmbeddingProvider,
|
|
13
|
+
EmbeddingRequest,
|
|
14
|
+
)
|
|
10
15
|
from kodit.embedding.vector_search_service import (
|
|
16
|
+
IndexResult,
|
|
11
17
|
VectorSearchRequest,
|
|
12
18
|
VectorSearchResponse,
|
|
13
19
|
VectorSearchService,
|
|
@@ -52,6 +58,10 @@ ORDER BY score ASC
|
|
|
52
58
|
LIMIT :top_k;
|
|
53
59
|
"""
|
|
54
60
|
|
|
61
|
+
CHECK_VCHORD_EMBEDDING_EXISTS = """
|
|
62
|
+
SELECT EXISTS(SELECT 1 FROM {TABLE_NAME} WHERE snippet_id = :snippet_id)
|
|
63
|
+
"""
|
|
64
|
+
|
|
55
65
|
TaskName = Literal["code", "text"]
|
|
56
66
|
|
|
57
67
|
|
|
@@ -89,7 +99,15 @@ class VectorChordVectorSearchService(VectorSearchService):
|
|
|
89
99
|
|
|
90
100
|
async def _create_tables(self) -> None:
|
|
91
101
|
"""Create the necessary tables."""
|
|
92
|
-
|
|
102
|
+
req = EmbeddingRequest(id=0, text="dimension")
|
|
103
|
+
vector_dim: list[float] | None = None
|
|
104
|
+
async for batch in self.embedding_provider.embed([req]):
|
|
105
|
+
if batch:
|
|
106
|
+
vector_dim = batch[0].embedding
|
|
107
|
+
break
|
|
108
|
+
if vector_dim is None:
|
|
109
|
+
msg = "Failed to obtain embedding dimension from provider"
|
|
110
|
+
raise RuntimeError(msg)
|
|
93
111
|
await self._session.execute(
|
|
94
112
|
text(
|
|
95
113
|
f"""CREATE TABLE IF NOT EXISTS {self.table_name} (
|
|
@@ -130,31 +148,48 @@ class VectorChordVectorSearchService(VectorSearchService):
|
|
|
130
148
|
"""Commit the session."""
|
|
131
149
|
await self._session.commit()
|
|
132
150
|
|
|
133
|
-
async def index(
|
|
151
|
+
async def index(
|
|
152
|
+
self, data: list[VectorSearchRequest]
|
|
153
|
+
) -> AsyncGenerator[list[IndexResult], None]:
|
|
134
154
|
"""Embed a list of documents."""
|
|
135
155
|
if not data or len(data) == 0:
|
|
136
156
|
self.log.warning("Embedding data is empty, skipping embedding")
|
|
137
157
|
return
|
|
138
158
|
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
159
|
+
requests = [EmbeddingRequest(id=doc.snippet_id, text=doc.text) for doc in data]
|
|
160
|
+
|
|
161
|
+
async for batch in self.embedding_provider.embed(requests):
|
|
162
|
+
await self._execute(
|
|
163
|
+
text(INSERT_QUERY.format(TABLE_NAME=self.table_name)),
|
|
164
|
+
[
|
|
165
|
+
{
|
|
166
|
+
"snippet_id": result.id,
|
|
167
|
+
"embedding": str(result.embedding),
|
|
168
|
+
}
|
|
169
|
+
for result in batch
|
|
170
|
+
],
|
|
171
|
+
)
|
|
172
|
+
await self._commit()
|
|
173
|
+
yield [IndexResult(snippet_id=result.id) for result in batch]
|
|
149
174
|
|
|
150
175
|
async def retrieve(self, query: str, top_k: int = 10) -> list[VectorSearchResponse]:
|
|
151
176
|
"""Query the embedding model."""
|
|
152
|
-
|
|
153
|
-
|
|
177
|
+
from kodit.embedding.embedding_provider.embedding_provider import (
|
|
178
|
+
EmbeddingRequest,
|
|
179
|
+
)
|
|
180
|
+
|
|
181
|
+
req = EmbeddingRequest(id=0, text=query)
|
|
182
|
+
embedding_vec: list[float] | None = None
|
|
183
|
+
async for batch in self.embedding_provider.embed([req]):
|
|
184
|
+
if batch:
|
|
185
|
+
embedding_vec = batch[0].embedding
|
|
186
|
+
break
|
|
187
|
+
|
|
188
|
+
if not embedding_vec:
|
|
154
189
|
return []
|
|
155
190
|
result = await self._execute(
|
|
156
191
|
text(SEARCH_QUERY.format(TABLE_NAME=self.table_name)),
|
|
157
|
-
{"query": str(
|
|
192
|
+
{"query": str(embedding_vec), "top_k": top_k},
|
|
158
193
|
)
|
|
159
194
|
rows = result.mappings().all()
|
|
160
195
|
|
|
@@ -162,3 +197,15 @@ class VectorChordVectorSearchService(VectorSearchService):
|
|
|
162
197
|
VectorSearchResponse(snippet_id=row["snippet_id"], score=row["score"])
|
|
163
198
|
for row in rows
|
|
164
199
|
]
|
|
200
|
+
|
|
201
|
+
async def has_embedding(
|
|
202
|
+
self,
|
|
203
|
+
snippet_id: int,
|
|
204
|
+
embedding_type: EmbeddingType, # noqa: ARG002
|
|
205
|
+
) -> bool:
|
|
206
|
+
"""Check if a snippet has an embedding."""
|
|
207
|
+
result = await self._execute(
|
|
208
|
+
text(CHECK_VCHORD_EMBEDDING_EXISTS.format(TABLE_NAME=self.table_name)),
|
|
209
|
+
{"snippet_id": snippet_id},
|
|
210
|
+
)
|
|
211
|
+
return result.scalar_one()
|
|
@@ -11,6 +11,7 @@ from kodit.enrichment.enrichment_service import (
|
|
|
11
11
|
EnrichmentService,
|
|
12
12
|
LLMEnrichmentService,
|
|
13
13
|
)
|
|
14
|
+
from kodit.log import log_event
|
|
14
15
|
|
|
15
16
|
|
|
16
17
|
def _get_endpoint_configuration(app_context: AppContext) -> Endpoint | None:
|
|
@@ -24,6 +25,7 @@ def enrichment_factory(app_context: AppContext) -> EnrichmentService:
|
|
|
24
25
|
endpoint = app_context.enrichment_endpoint or app_context.default_endpoint or None
|
|
25
26
|
|
|
26
27
|
if endpoint and endpoint.type == "openai":
|
|
28
|
+
log_event("kodit.enrichment", {"provider": "openai"})
|
|
27
29
|
from openai import AsyncOpenAI
|
|
28
30
|
|
|
29
31
|
enrichment_provider = OpenAIEnrichmentProvider(
|
|
@@ -34,6 +36,7 @@ def enrichment_factory(app_context: AppContext) -> EnrichmentService:
|
|
|
34
36
|
model_name=endpoint.model or "gpt-4o-mini",
|
|
35
37
|
)
|
|
36
38
|
else:
|
|
39
|
+
log_event("kodit.enrichment", {"provider": "local"})
|
|
37
40
|
enrichment_provider = LocalEnrichmentProvider()
|
|
38
41
|
|
|
39
42
|
return LLMEnrichmentService(enrichment_provider=enrichment_provider)
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
"""Enrichment provider."""
|
|
2
2
|
|
|
3
3
|
from abc import ABC, abstractmethod
|
|
4
|
+
from collections.abc import AsyncGenerator
|
|
5
|
+
from dataclasses import dataclass
|
|
4
6
|
|
|
5
7
|
ENRICHMENT_SYSTEM_PROMPT = """
|
|
6
8
|
You are a professional software developer. You will be given a snippet of code.
|
|
@@ -8,9 +10,27 @@ Please provide a concise explanation of the code.
|
|
|
8
10
|
"""
|
|
9
11
|
|
|
10
12
|
|
|
13
|
+
@dataclass
|
|
14
|
+
class EnrichmentRequest:
|
|
15
|
+
"""Enrichment request."""
|
|
16
|
+
|
|
17
|
+
snippet_id: int
|
|
18
|
+
text: str
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass
|
|
22
|
+
class EnrichmentResponse:
|
|
23
|
+
"""Enrichment response."""
|
|
24
|
+
|
|
25
|
+
snippet_id: int
|
|
26
|
+
text: str
|
|
27
|
+
|
|
28
|
+
|
|
11
29
|
class EnrichmentProvider(ABC):
|
|
12
30
|
"""Enrichment provider."""
|
|
13
31
|
|
|
14
32
|
@abstractmethod
|
|
15
|
-
|
|
33
|
+
def enrich(
|
|
34
|
+
self, data: list[EnrichmentRequest]
|
|
35
|
+
) -> AsyncGenerator[EnrichmentResponse, None]:
|
|
16
36
|
"""Enrich a list of strings."""
|
|
@@ -1,15 +1,19 @@
|
|
|
1
1
|
"""Local embedding service."""
|
|
2
2
|
|
|
3
3
|
import os
|
|
4
|
+
from collections.abc import AsyncGenerator
|
|
4
5
|
|
|
5
6
|
import structlog
|
|
6
7
|
import tiktoken
|
|
7
|
-
from tqdm import tqdm
|
|
8
8
|
|
|
9
|
-
from kodit.embedding.embedding_provider.embedding_provider import
|
|
9
|
+
from kodit.embedding.embedding_provider.embedding_provider import (
|
|
10
|
+
EmbeddingRequest,
|
|
11
|
+
)
|
|
10
12
|
from kodit.enrichment.enrichment_provider.enrichment_provider import (
|
|
11
13
|
ENRICHMENT_SYSTEM_PROMPT,
|
|
12
14
|
EnrichmentProvider,
|
|
15
|
+
EnrichmentRequest,
|
|
16
|
+
EnrichmentResponse,
|
|
13
17
|
)
|
|
14
18
|
|
|
15
19
|
DEFAULT_ENRICHMENT_MODEL = "Qwen/Qwen3-0.6B"
|
|
@@ -32,11 +36,16 @@ class LocalEnrichmentProvider(EnrichmentProvider):
|
|
|
32
36
|
self.tokenizer = None
|
|
33
37
|
self.encoding = tiktoken.encoding_for_model("text-embedding-3-small")
|
|
34
38
|
|
|
35
|
-
async def enrich(
|
|
39
|
+
async def enrich(
|
|
40
|
+
self, data: list[EnrichmentRequest]
|
|
41
|
+
) -> AsyncGenerator[EnrichmentResponse, None]:
|
|
36
42
|
"""Enrich a list of strings."""
|
|
43
|
+
# Remove empty snippets
|
|
44
|
+
data = [snippet for snippet in data if snippet.text]
|
|
45
|
+
|
|
37
46
|
if not data or len(data) == 0:
|
|
38
47
|
self.log.warning("Data is empty, skipping enrichment")
|
|
39
|
-
return
|
|
48
|
+
return
|
|
40
49
|
|
|
41
50
|
from transformers.models.auto.modeling_auto import (
|
|
42
51
|
AutoModelForCausalLM,
|
|
@@ -57,36 +66,38 @@ class LocalEnrichmentProvider(EnrichmentProvider):
|
|
|
57
66
|
)
|
|
58
67
|
|
|
59
68
|
# Prepare prompts
|
|
60
|
-
prompts = [
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
+
prompts: list[EmbeddingRequest] = [
|
|
70
|
+
EmbeddingRequest(
|
|
71
|
+
id=snippet.snippet_id,
|
|
72
|
+
text=self.tokenizer.apply_chat_template(
|
|
73
|
+
[
|
|
74
|
+
{"role": "system", "content": ENRICHMENT_SYSTEM_PROMPT},
|
|
75
|
+
{"role": "user", "content": snippet.text},
|
|
76
|
+
],
|
|
77
|
+
tokenize=False,
|
|
78
|
+
add_generation_prompt=True,
|
|
79
|
+
enable_thinking=False,
|
|
80
|
+
),
|
|
69
81
|
)
|
|
70
82
|
for snippet in data
|
|
71
83
|
]
|
|
72
84
|
|
|
73
|
-
|
|
74
|
-
batched_prompts = split_sub_batches(
|
|
75
|
-
self.encoding, prompts, max_context_window=self.context_window
|
|
76
|
-
)
|
|
77
|
-
results = []
|
|
78
|
-
for batch in tqdm(batched_prompts, leave=False, total=len(batched_prompts)):
|
|
85
|
+
for prompt in prompts:
|
|
79
86
|
model_inputs = self.tokenizer(
|
|
80
|
-
|
|
87
|
+
prompt.text,
|
|
88
|
+
return_tensors="pt",
|
|
89
|
+
padding=True,
|
|
90
|
+
truncation=True,
|
|
81
91
|
).to(self.model.device)
|
|
82
92
|
generated_ids = self.model.generate(
|
|
83
93
|
**model_inputs, max_new_tokens=self.context_window
|
|
84
94
|
)
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
95
|
+
input_ids = model_inputs["input_ids"][0]
|
|
96
|
+
output_ids = generated_ids[0][len(input_ids) :].tolist()
|
|
97
|
+
content = self.tokenizer.decode(output_ids, skip_special_tokens=True).strip(
|
|
98
|
+
"\n"
|
|
99
|
+
)
|
|
100
|
+
yield EnrichmentResponse(
|
|
101
|
+
snippet_id=prompt.id,
|
|
102
|
+
text=content,
|
|
103
|
+
)
|
|
@@ -1,15 +1,17 @@
|
|
|
1
1
|
"""OpenAI embedding service."""
|
|
2
2
|
|
|
3
3
|
import asyncio
|
|
4
|
+
from collections.abc import AsyncGenerator
|
|
4
5
|
|
|
5
6
|
import structlog
|
|
6
7
|
import tiktoken
|
|
7
8
|
from openai import AsyncOpenAI
|
|
8
|
-
from tqdm import tqdm
|
|
9
9
|
|
|
10
10
|
from kodit.enrichment.enrichment_provider.enrichment_provider import (
|
|
11
11
|
ENRICHMENT_SYSTEM_PROMPT,
|
|
12
12
|
EnrichmentProvider,
|
|
13
|
+
EnrichmentRequest,
|
|
14
|
+
EnrichmentResponse,
|
|
13
15
|
)
|
|
14
16
|
|
|
15
17
|
OPENAI_NUM_PARALLEL_TASKS = 10
|
|
@@ -29,25 +31,24 @@ class OpenAIEnrichmentProvider(EnrichmentProvider):
|
|
|
29
31
|
self.model_name = model_name
|
|
30
32
|
self.encoding = tiktoken.encoding_for_model("gpt-4o-mini") # Approximation
|
|
31
33
|
|
|
32
|
-
async def enrich(
|
|
34
|
+
async def enrich(
|
|
35
|
+
self, data: list[EnrichmentRequest]
|
|
36
|
+
) -> AsyncGenerator[EnrichmentResponse, None]:
|
|
33
37
|
"""Enrich a list of documents."""
|
|
34
38
|
if not data or len(data) == 0:
|
|
35
39
|
self.log.warning("Data is empty, skipping enrichment")
|
|
36
|
-
return
|
|
40
|
+
return
|
|
37
41
|
|
|
38
42
|
# Process batches in parallel with a semaphore to limit concurrent requests
|
|
39
43
|
sem = asyncio.Semaphore(OPENAI_NUM_PARALLEL_TASKS)
|
|
40
44
|
|
|
41
|
-
|
|
42
|
-
# We need to do this so that we can return the results in the same order as the
|
|
43
|
-
# input data
|
|
44
|
-
input_data = [(i, snippet) for i, snippet in enumerate(data)]
|
|
45
|
-
|
|
46
|
-
async def process_data(data: tuple[int, str]) -> tuple[int, str]:
|
|
47
|
-
snippet_id, snippet = data
|
|
48
|
-
if not snippet:
|
|
49
|
-
return snippet_id, ""
|
|
45
|
+
async def process_data(data: EnrichmentRequest) -> EnrichmentResponse:
|
|
50
46
|
async with sem:
|
|
47
|
+
if not data.text:
|
|
48
|
+
return EnrichmentResponse(
|
|
49
|
+
snippet_id=data.snippet_id,
|
|
50
|
+
text="",
|
|
51
|
+
)
|
|
51
52
|
try:
|
|
52
53
|
response = await self.openai_client.chat.completions.create(
|
|
53
54
|
model=self.model_name,
|
|
@@ -56,26 +57,23 @@ class OpenAIEnrichmentProvider(EnrichmentProvider):
|
|
|
56
57
|
"role": "system",
|
|
57
58
|
"content": ENRICHMENT_SYSTEM_PROMPT,
|
|
58
59
|
},
|
|
59
|
-
{"role": "user", "content":
|
|
60
|
+
{"role": "user", "content": data.text},
|
|
60
61
|
],
|
|
61
62
|
)
|
|
62
|
-
return
|
|
63
|
+
return EnrichmentResponse(
|
|
64
|
+
snippet_id=data.snippet_id,
|
|
65
|
+
text=response.choices[0].message.content or "",
|
|
66
|
+
)
|
|
63
67
|
except Exception as e:
|
|
64
68
|
self.log.exception("Error enriching data", error=str(e))
|
|
65
|
-
return
|
|
69
|
+
return EnrichmentResponse(
|
|
70
|
+
snippet_id=data.snippet_id,
|
|
71
|
+
text="",
|
|
72
|
+
)
|
|
66
73
|
|
|
67
74
|
# Create tasks for all data
|
|
68
|
-
tasks = [process_data(snippet) for snippet in
|
|
75
|
+
tasks = [process_data(snippet) for snippet in data]
|
|
69
76
|
|
|
70
77
|
# Process all data and yield results as they complete
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
asyncio.as_completed(tasks),
|
|
74
|
-
total=len(tasks),
|
|
75
|
-
leave=False,
|
|
76
|
-
):
|
|
77
|
-
result = await task
|
|
78
|
-
results.append(result)
|
|
79
|
-
|
|
80
|
-
# Output in the same order as the input data
|
|
81
|
-
return [result for _, result in sorted(results, key=lambda x: x[0])]
|
|
78
|
+
for task in asyncio.as_completed(tasks):
|
|
79
|
+
yield await task
|
|
@@ -1,24 +1,34 @@
|
|
|
1
1
|
"""Enrichment service."""
|
|
2
2
|
|
|
3
3
|
from abc import ABC, abstractmethod
|
|
4
|
+
from collections.abc import AsyncGenerator
|
|
4
5
|
|
|
5
|
-
from kodit.enrichment.enrichment_provider.enrichment_provider import
|
|
6
|
+
from kodit.enrichment.enrichment_provider.enrichment_provider import (
|
|
7
|
+
EnrichmentProvider,
|
|
8
|
+
EnrichmentRequest,
|
|
9
|
+
EnrichmentResponse,
|
|
10
|
+
)
|
|
6
11
|
|
|
7
12
|
|
|
8
13
|
class EnrichmentService(ABC):
|
|
9
14
|
"""Enrichment service."""
|
|
10
15
|
|
|
11
16
|
@abstractmethod
|
|
12
|
-
|
|
17
|
+
def enrich(
|
|
18
|
+
self, data: list[EnrichmentRequest]
|
|
19
|
+
) -> AsyncGenerator[EnrichmentResponse, None]:
|
|
13
20
|
"""Enrich a list of strings."""
|
|
14
21
|
|
|
15
22
|
|
|
16
23
|
class NullEnrichmentService(EnrichmentService):
|
|
17
24
|
"""Null enrichment service."""
|
|
18
25
|
|
|
19
|
-
async def enrich(
|
|
26
|
+
async def enrich(
|
|
27
|
+
self, data: list[EnrichmentRequest]
|
|
28
|
+
) -> AsyncGenerator[EnrichmentResponse, None]:
|
|
20
29
|
"""Enrich a list of strings."""
|
|
21
|
-
|
|
30
|
+
for request in data:
|
|
31
|
+
yield EnrichmentResponse(snippet_id=request.snippet_id, text="")
|
|
22
32
|
|
|
23
33
|
|
|
24
34
|
class LLMEnrichmentService(EnrichmentService):
|
|
@@ -28,6 +38,8 @@ class LLMEnrichmentService(EnrichmentService):
|
|
|
28
38
|
"""Initialize the enrichment service."""
|
|
29
39
|
self.enrichment_provider = enrichment_provider
|
|
30
40
|
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
41
|
+
def enrich(
|
|
42
|
+
self, data: list[EnrichmentRequest]
|
|
43
|
+
) -> AsyncGenerator[EnrichmentResponse, None]:
|
|
44
|
+
"""Enrich a list of snippets."""
|
|
45
|
+
return self.enrichment_provider.enrich(data)
|
|
@@ -22,10 +22,12 @@ from kodit.embedding.vector_search_service import (
|
|
|
22
22
|
VectorSearchRequest,
|
|
23
23
|
VectorSearchService,
|
|
24
24
|
)
|
|
25
|
+
from kodit.enrichment.enrichment_provider.enrichment_provider import EnrichmentRequest
|
|
25
26
|
from kodit.enrichment.enrichment_service import EnrichmentService
|
|
26
27
|
from kodit.indexing.fusion import FusionRequest, reciprocal_rank_fusion
|
|
27
28
|
from kodit.indexing.indexing_models import Snippet
|
|
28
29
|
from kodit.indexing.indexing_repository import IndexRepository
|
|
30
|
+
from kodit.log import log_event
|
|
29
31
|
from kodit.snippets.snippets import SnippetService
|
|
30
32
|
from kodit.source.source_service import SourceService
|
|
31
33
|
from kodit.util.spinner import Spinner
|
|
@@ -45,7 +47,7 @@ class IndexView(pydantic.BaseModel):
|
|
|
45
47
|
created_at: datetime
|
|
46
48
|
updated_at: datetime | None = None
|
|
47
49
|
source: str | None = None
|
|
48
|
-
num_snippets: int
|
|
50
|
+
num_snippets: int
|
|
49
51
|
|
|
50
52
|
|
|
51
53
|
class SearchRequest(pydantic.BaseModel):
|
|
@@ -119,6 +121,8 @@ class IndexService:
|
|
|
119
121
|
ValueError: If the source doesn't exist or already has an index.
|
|
120
122
|
|
|
121
123
|
"""
|
|
124
|
+
log_event("kodit.index.create")
|
|
125
|
+
|
|
122
126
|
# Check if the source exists
|
|
123
127
|
source = await self.source_service.get(source_id)
|
|
124
128
|
|
|
@@ -129,6 +133,8 @@ class IndexService:
|
|
|
129
133
|
return IndexView(
|
|
130
134
|
id=index.id,
|
|
131
135
|
created_at=index.created_at,
|
|
136
|
+
num_snippets=await self.repository.num_snippets_for_index(index.id),
|
|
137
|
+
source=source.uri,
|
|
132
138
|
)
|
|
133
139
|
|
|
134
140
|
async def list_indexes(self) -> list[IndexView]:
|
|
@@ -142,19 +148,33 @@ class IndexService:
|
|
|
142
148
|
indexes = await self.repository.list_indexes()
|
|
143
149
|
|
|
144
150
|
# Transform database results into DTOs
|
|
145
|
-
|
|
151
|
+
indexes = [
|
|
146
152
|
IndexView(
|
|
147
153
|
id=index.id,
|
|
148
154
|
created_at=index.created_at,
|
|
149
155
|
updated_at=index.updated_at,
|
|
150
|
-
num_snippets=await self.repository.num_snippets_for_index(index.id)
|
|
156
|
+
num_snippets=await self.repository.num_snippets_for_index(index.id)
|
|
157
|
+
or 0,
|
|
151
158
|
source=source.uri,
|
|
152
159
|
)
|
|
153
160
|
for index, source in indexes
|
|
154
161
|
]
|
|
155
162
|
|
|
163
|
+
# Help Kodit by measuring how much people are using indexes
|
|
164
|
+
log_event(
|
|
165
|
+
"kodit.index.list",
|
|
166
|
+
{
|
|
167
|
+
"num_indexes": len(indexes),
|
|
168
|
+
"num_snippets": sum([index.num_snippets for index in indexes]),
|
|
169
|
+
},
|
|
170
|
+
)
|
|
171
|
+
|
|
172
|
+
return indexes
|
|
173
|
+
|
|
156
174
|
async def run(self, index_id: int) -> None:
|
|
157
175
|
"""Run the indexing process for a specific index."""
|
|
176
|
+
log_event("kodit.index.run")
|
|
177
|
+
|
|
158
178
|
# Get and validate index
|
|
159
179
|
index = await self.repository.get_by_id(index_id)
|
|
160
180
|
if not index:
|
|
@@ -181,43 +201,50 @@ class IndexService:
|
|
|
181
201
|
)
|
|
182
202
|
|
|
183
203
|
self.log.info("Creating semantic code index")
|
|
184
|
-
with
|
|
185
|
-
|
|
204
|
+
with tqdm(total=len(snippets), leave=False) as pbar:
|
|
205
|
+
async for result in self.code_search_service.index(
|
|
186
206
|
[
|
|
187
207
|
VectorSearchRequest(snippet.id, snippet.content)
|
|
188
208
|
for snippet in snippets
|
|
189
209
|
]
|
|
190
|
-
)
|
|
210
|
+
):
|
|
211
|
+
pbar.update(len(result))
|
|
191
212
|
|
|
192
213
|
self.log.info("Enriching snippets", num_snippets=len(snippets))
|
|
193
|
-
enriched_contents =
|
|
194
|
-
|
|
195
|
-
|
|
214
|
+
enriched_contents = []
|
|
215
|
+
with tqdm(total=len(snippets), leave=False) as pbar:
|
|
216
|
+
async for result in self.enrichment_service.enrich(
|
|
217
|
+
[
|
|
218
|
+
EnrichmentRequest(snippet_id=snippet.id, text=snippet.content)
|
|
219
|
+
for snippet in snippets
|
|
220
|
+
]
|
|
221
|
+
):
|
|
222
|
+
snippet = next(s for s in snippets if s.id == result.snippet_id)
|
|
223
|
+
if snippet:
|
|
224
|
+
snippet.content = (
|
|
225
|
+
result.text + "\n\n```\n" + snippet.content + "\n```"
|
|
226
|
+
)
|
|
227
|
+
await self.repository.add_snippet(snippet)
|
|
228
|
+
enriched_contents.append(result)
|
|
229
|
+
pbar.update(1)
|
|
196
230
|
|
|
197
231
|
self.log.info("Creating semantic text index")
|
|
198
|
-
with
|
|
199
|
-
|
|
232
|
+
with tqdm(total=len(snippets), leave=False) as pbar:
|
|
233
|
+
async for result in self.text_search_service.index(
|
|
200
234
|
[
|
|
201
|
-
VectorSearchRequest(snippet.id,
|
|
202
|
-
for snippet
|
|
203
|
-
snippets, enriched_contents, strict=True
|
|
204
|
-
)
|
|
235
|
+
VectorSearchRequest(snippet.id, snippet.content)
|
|
236
|
+
for snippet in snippets
|
|
205
237
|
]
|
|
206
|
-
)
|
|
207
|
-
# Add the enriched text back to the snippets and write to the database
|
|
208
|
-
for snippet, enriched_content in zip(
|
|
209
|
-
snippets, enriched_contents, strict=True
|
|
210
238
|
):
|
|
211
|
-
|
|
212
|
-
enriched_content + "\n\n```\n" + snippet.content + "\n```"
|
|
213
|
-
)
|
|
214
|
-
await self.repository.add_snippet(snippet)
|
|
239
|
+
pbar.update(len(result))
|
|
215
240
|
|
|
216
241
|
# Update index timestamp
|
|
217
242
|
await self.repository.update_index_timestamp(index)
|
|
218
243
|
|
|
219
244
|
async def search(self, request: SearchRequest) -> list[SearchResult]:
|
|
220
245
|
"""Search for relevant data."""
|
|
246
|
+
log_event("kodit.index.search")
|
|
247
|
+
|
|
221
248
|
fusion_list: list[list[FusionRequest]] = []
|
|
222
249
|
if request.keywords:
|
|
223
250
|
# Gather results for each keyword
|