kodit 0.2.2__py3-none-any.whl → 0.2.4__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of kodit might be problematic. Click here for more details.

Files changed (29)
  1. kodit/_version.py +2 -2
  2. kodit/app.py +6 -0
  3. kodit/cli.py +8 -2
  4. kodit/embedding/embedding_factory.py +11 -0
  5. kodit/embedding/embedding_provider/embedding_provider.py +42 -14
  6. kodit/embedding/embedding_provider/hash_embedding_provider.py +16 -7
  7. kodit/embedding/embedding_provider/local_embedding_provider.py +43 -11
  8. kodit/embedding/embedding_provider/openai_embedding_provider.py +18 -22
  9. kodit/embedding/local_vector_search_service.py +46 -13
  10. kodit/embedding/vector_search_service.py +18 -1
  11. kodit/embedding/vectorchord_vector_search_service.py +63 -16
  12. kodit/enrichment/enrichment_factory.py +3 -0
  13. kodit/enrichment/enrichment_provider/enrichment_provider.py +21 -1
  14. kodit/enrichment/enrichment_provider/local_enrichment_provider.py +39 -28
  15. kodit/enrichment/enrichment_provider/openai_enrichment_provider.py +25 -27
  16. kodit/enrichment/enrichment_service.py +19 -7
  17. kodit/indexing/indexing_service.py +50 -23
  18. kodit/log.py +126 -24
  19. kodit/migrations/versions/9e53ea8bb3b0_add_authors.py +103 -0
  20. kodit/source/source_factories.py +356 -0
  21. kodit/source/source_models.py +17 -5
  22. kodit/source/source_repository.py +49 -20
  23. kodit/source/source_service.py +41 -218
  24. {kodit-0.2.2.dist-info → kodit-0.2.4.dist-info}/METADATA +2 -2
  25. {kodit-0.2.2.dist-info → kodit-0.2.4.dist-info}/RECORD +28 -27
  26. kodit/migrations/versions/42e836b21102_add_authors.py +0 -64
  27. {kodit-0.2.2.dist-info → kodit-0.2.4.dist-info}/WHEEL +0 -0
  28. {kodit-0.2.2.dist-info → kodit-0.2.4.dist-info}/entry_points.txt +0 -0
  29. {kodit-0.2.2.dist-info → kodit-0.2.4.dist-info}/licenses/LICENSE +0 -0
@@ -1,13 +1,19 @@
1
1
  """Vectorchord vector search."""
2
2
 
3
+ from collections.abc import AsyncGenerator
3
4
  from typing import Any, Literal
4
5
 
5
6
  import structlog
6
7
  from sqlalchemy import Result, TextClause, text
7
8
  from sqlalchemy.ext.asyncio import AsyncSession
8
9
 
9
- from kodit.embedding.embedding_provider.embedding_provider import EmbeddingProvider
10
+ from kodit.embedding.embedding_models import EmbeddingType
11
+ from kodit.embedding.embedding_provider.embedding_provider import (
12
+ EmbeddingProvider,
13
+ EmbeddingRequest,
14
+ )
10
15
  from kodit.embedding.vector_search_service import (
16
+ IndexResult,
11
17
  VectorSearchRequest,
12
18
  VectorSearchResponse,
13
19
  VectorSearchService,
@@ -52,6 +58,10 @@ ORDER BY score ASC
52
58
  LIMIT :top_k;
53
59
  """
54
60
 
61
+ CHECK_VCHORD_EMBEDDING_EXISTS = """
62
+ SELECT EXISTS(SELECT 1 FROM {TABLE_NAME} WHERE snippet_id = :snippet_id)
63
+ """
64
+
55
65
  TaskName = Literal["code", "text"]
56
66
 
57
67
 
@@ -89,7 +99,15 @@ class VectorChordVectorSearchService(VectorSearchService):
89
99
 
90
100
  async def _create_tables(self) -> None:
91
101
  """Create the necessary tables."""
92
- vector_dim = (await self.embedding_provider.embed(["dimension"]))[0]
102
+ req = EmbeddingRequest(id=0, text="dimension")
103
+ vector_dim: list[float] | None = None
104
+ async for batch in self.embedding_provider.embed([req]):
105
+ if batch:
106
+ vector_dim = batch[0].embedding
107
+ break
108
+ if vector_dim is None:
109
+ msg = "Failed to obtain embedding dimension from provider"
110
+ raise RuntimeError(msg)
93
111
  await self._session.execute(
94
112
  text(
95
113
  f"""CREATE TABLE IF NOT EXISTS {self.table_name} (
@@ -130,31 +148,48 @@ class VectorChordVectorSearchService(VectorSearchService):
130
148
  """Commit the session."""
131
149
  await self._session.commit()
132
150
 
133
- async def index(self, data: list[VectorSearchRequest]) -> None:
151
+ async def index(
152
+ self, data: list[VectorSearchRequest]
153
+ ) -> AsyncGenerator[list[IndexResult], None]:
134
154
  """Embed a list of documents."""
135
155
  if not data or len(data) == 0:
136
156
  self.log.warning("Embedding data is empty, skipping embedding")
137
157
  return
138
158
 
139
- embeddings = await self.embedding_provider.embed([doc.text for doc in data])
140
- # Execute inserts
141
- await self._execute(
142
- text(INSERT_QUERY.format(TABLE_NAME=self.table_name)),
143
- [
144
- {"snippet_id": doc.snippet_id, "embedding": str(embedding)}
145
- for doc, embedding in zip(data, embeddings, strict=True)
146
- ],
147
- )
148
- await self._commit()
159
+ requests = [EmbeddingRequest(id=doc.snippet_id, text=doc.text) for doc in data]
160
+
161
+ async for batch in self.embedding_provider.embed(requests):
162
+ await self._execute(
163
+ text(INSERT_QUERY.format(TABLE_NAME=self.table_name)),
164
+ [
165
+ {
166
+ "snippet_id": result.id,
167
+ "embedding": str(result.embedding),
168
+ }
169
+ for result in batch
170
+ ],
171
+ )
172
+ await self._commit()
173
+ yield [IndexResult(snippet_id=result.id) for result in batch]
149
174
 
150
175
  async def retrieve(self, query: str, top_k: int = 10) -> list[VectorSearchResponse]:
151
176
  """Query the embedding model."""
152
- embedding = await self.embedding_provider.embed([query])
153
- if len(embedding) == 0 or len(embedding[0]) == 0:
177
+ from kodit.embedding.embedding_provider.embedding_provider import (
178
+ EmbeddingRequest,
179
+ )
180
+
181
+ req = EmbeddingRequest(id=0, text=query)
182
+ embedding_vec: list[float] | None = None
183
+ async for batch in self.embedding_provider.embed([req]):
184
+ if batch:
185
+ embedding_vec = batch[0].embedding
186
+ break
187
+
188
+ if not embedding_vec:
154
189
  return []
155
190
  result = await self._execute(
156
191
  text(SEARCH_QUERY.format(TABLE_NAME=self.table_name)),
157
- {"query": str(embedding[0]), "top_k": top_k},
192
+ {"query": str(embedding_vec), "top_k": top_k},
158
193
  )
159
194
  rows = result.mappings().all()
160
195
 
@@ -162,3 +197,15 @@ class VectorChordVectorSearchService(VectorSearchService):
162
197
  VectorSearchResponse(snippet_id=row["snippet_id"], score=row["score"])
163
198
  for row in rows
164
199
  ]
200
+
201
+ async def has_embedding(
202
+ self,
203
+ snippet_id: int,
204
+ embedding_type: EmbeddingType, # noqa: ARG002
205
+ ) -> bool:
206
+ """Check if a snippet has an embedding."""
207
+ result = await self._execute(
208
+ text(CHECK_VCHORD_EMBEDDING_EXISTS.format(TABLE_NAME=self.table_name)),
209
+ {"snippet_id": snippet_id},
210
+ )
211
+ return result.scalar_one()
@@ -11,6 +11,7 @@ from kodit.enrichment.enrichment_service import (
11
11
  EnrichmentService,
12
12
  LLMEnrichmentService,
13
13
  )
14
+ from kodit.log import log_event
14
15
 
15
16
 
16
17
  def _get_endpoint_configuration(app_context: AppContext) -> Endpoint | None:
@@ -24,6 +25,7 @@ def enrichment_factory(app_context: AppContext) -> EnrichmentService:
24
25
  endpoint = app_context.enrichment_endpoint or app_context.default_endpoint or None
25
26
 
26
27
  if endpoint and endpoint.type == "openai":
28
+ log_event("kodit.enrichment", {"provider": "openai"})
27
29
  from openai import AsyncOpenAI
28
30
 
29
31
  enrichment_provider = OpenAIEnrichmentProvider(
@@ -34,6 +36,7 @@ def enrichment_factory(app_context: AppContext) -> EnrichmentService:
34
36
  model_name=endpoint.model or "gpt-4o-mini",
35
37
  )
36
38
  else:
39
+ log_event("kodit.enrichment", {"provider": "local"})
37
40
  enrichment_provider = LocalEnrichmentProvider()
38
41
 
39
42
  return LLMEnrichmentService(enrichment_provider=enrichment_provider)
@@ -1,6 +1,8 @@
1
1
  """Enrichment provider."""
2
2
 
3
3
  from abc import ABC, abstractmethod
4
+ from collections.abc import AsyncGenerator
5
+ from dataclasses import dataclass
4
6
 
5
7
  ENRICHMENT_SYSTEM_PROMPT = """
6
8
  You are a professional software developer. You will be given a snippet of code.
@@ -8,9 +10,27 @@ Please provide a concise explanation of the code.
8
10
  """
9
11
 
10
12
 
13
+ @dataclass
14
+ class EnrichmentRequest:
15
+ """Enrichment request."""
16
+
17
+ snippet_id: int
18
+ text: str
19
+
20
+
21
+ @dataclass
22
+ class EnrichmentResponse:
23
+ """Enrichment response."""
24
+
25
+ snippet_id: int
26
+ text: str
27
+
28
+
11
29
  class EnrichmentProvider(ABC):
12
30
  """Enrichment provider."""
13
31
 
14
32
  @abstractmethod
15
- async def enrich(self, data: list[str]) -> list[str]:
33
+ def enrich(
34
+ self, data: list[EnrichmentRequest]
35
+ ) -> AsyncGenerator[EnrichmentResponse, None]:
16
36
  """Enrich a list of strings."""
@@ -1,15 +1,19 @@
1
1
  """Local embedding service."""
2
2
 
3
3
  import os
4
+ from collections.abc import AsyncGenerator
4
5
 
5
6
  import structlog
6
7
  import tiktoken
7
- from tqdm import tqdm
8
8
 
9
- from kodit.embedding.embedding_provider.embedding_provider import split_sub_batches
9
+ from kodit.embedding.embedding_provider.embedding_provider import (
10
+ EmbeddingRequest,
11
+ )
10
12
  from kodit.enrichment.enrichment_provider.enrichment_provider import (
11
13
  ENRICHMENT_SYSTEM_PROMPT,
12
14
  EnrichmentProvider,
15
+ EnrichmentRequest,
16
+ EnrichmentResponse,
13
17
  )
14
18
 
15
19
  DEFAULT_ENRICHMENT_MODEL = "Qwen/Qwen3-0.6B"
@@ -32,11 +36,16 @@ class LocalEnrichmentProvider(EnrichmentProvider):
32
36
  self.tokenizer = None
33
37
  self.encoding = tiktoken.encoding_for_model("text-embedding-3-small")
34
38
 
35
- async def enrich(self, data: list[str]) -> list[str]:
39
+ async def enrich(
40
+ self, data: list[EnrichmentRequest]
41
+ ) -> AsyncGenerator[EnrichmentResponse, None]:
36
42
  """Enrich a list of strings."""
43
+ # Remove empty snippets
44
+ data = [snippet for snippet in data if snippet.text]
45
+
37
46
  if not data or len(data) == 0:
38
47
  self.log.warning("Data is empty, skipping enrichment")
39
- return []
48
+ return
40
49
 
41
50
  from transformers.models.auto.modeling_auto import (
42
51
  AutoModelForCausalLM,
@@ -57,36 +66,38 @@ class LocalEnrichmentProvider(EnrichmentProvider):
57
66
  )
58
67
 
59
68
  # Prepare prompts
60
- prompts = [
61
- self.tokenizer.apply_chat_template(
62
- [
63
- {"role": "system", "content": ENRICHMENT_SYSTEM_PROMPT},
64
- {"role": "user", "content": snippet},
65
- ],
66
- tokenize=False,
67
- add_generation_prompt=True,
68
- enable_thinking=False,
69
+ prompts: list[EmbeddingRequest] = [
70
+ EmbeddingRequest(
71
+ id=snippet.snippet_id,
72
+ text=self.tokenizer.apply_chat_template(
73
+ [
74
+ {"role": "system", "content": ENRICHMENT_SYSTEM_PROMPT},
75
+ {"role": "user", "content": snippet.text},
76
+ ],
77
+ tokenize=False,
78
+ add_generation_prompt=True,
79
+ enable_thinking=False,
80
+ ),
69
81
  )
70
82
  for snippet in data
71
83
  ]
72
84
 
73
- # Batch prompts using split_sub_batches
74
- batched_prompts = split_sub_batches(
75
- self.encoding, prompts, max_context_window=self.context_window
76
- )
77
- results = []
78
- for batch in tqdm(batched_prompts, leave=False, total=len(batched_prompts)):
85
+ for prompt in prompts:
79
86
  model_inputs = self.tokenizer(
80
- batch, return_tensors="pt", padding=True, truncation=True
87
+ prompt.text,
88
+ return_tensors="pt",
89
+ padding=True,
90
+ truncation=True,
81
91
  ).to(self.model.device)
82
92
  generated_ids = self.model.generate(
83
93
  **model_inputs, max_new_tokens=self.context_window
84
94
  )
85
- # For each prompt in the batch, decode only the generated part
86
- for i, input_ids in enumerate(model_inputs["input_ids"]):
87
- output_ids = generated_ids[i][len(input_ids) :].tolist()
88
- content = self.tokenizer.decode(
89
- output_ids, skip_special_tokens=True
90
- ).strip("\n")
91
- results.append(content)
92
- return results
95
+ input_ids = model_inputs["input_ids"][0]
96
+ output_ids = generated_ids[0][len(input_ids) :].tolist()
97
+ content = self.tokenizer.decode(output_ids, skip_special_tokens=True).strip(
98
+ "\n"
99
+ )
100
+ yield EnrichmentResponse(
101
+ snippet_id=prompt.id,
102
+ text=content,
103
+ )
@@ -1,15 +1,17 @@
1
1
  """OpenAI embedding service."""
2
2
 
3
3
  import asyncio
4
+ from collections.abc import AsyncGenerator
4
5
 
5
6
  import structlog
6
7
  import tiktoken
7
8
  from openai import AsyncOpenAI
8
- from tqdm import tqdm
9
9
 
10
10
  from kodit.enrichment.enrichment_provider.enrichment_provider import (
11
11
  ENRICHMENT_SYSTEM_PROMPT,
12
12
  EnrichmentProvider,
13
+ EnrichmentRequest,
14
+ EnrichmentResponse,
13
15
  )
14
16
 
15
17
  OPENAI_NUM_PARALLEL_TASKS = 10
@@ -29,25 +31,24 @@ class OpenAIEnrichmentProvider(EnrichmentProvider):
29
31
  self.model_name = model_name
30
32
  self.encoding = tiktoken.encoding_for_model("gpt-4o-mini") # Approximation
31
33
 
32
- async def enrich(self, data: list[str]) -> list[str]:
34
+ async def enrich(
35
+ self, data: list[EnrichmentRequest]
36
+ ) -> AsyncGenerator[EnrichmentResponse, None]:
33
37
  """Enrich a list of documents."""
34
38
  if not data or len(data) == 0:
35
39
  self.log.warning("Data is empty, skipping enrichment")
36
- return []
40
+ return
37
41
 
38
42
  # Process batches in parallel with a semaphore to limit concurrent requests
39
43
  sem = asyncio.Semaphore(OPENAI_NUM_PARALLEL_TASKS)
40
44
 
41
- # Create a list of tuples with a temporary id for each snippet
42
- # We need to do this so that we can return the results in the same order as the
43
- # input data
44
- input_data = [(i, snippet) for i, snippet in enumerate(data)]
45
-
46
- async def process_data(data: tuple[int, str]) -> tuple[int, str]:
47
- snippet_id, snippet = data
48
- if not snippet:
49
- return snippet_id, ""
45
+ async def process_data(data: EnrichmentRequest) -> EnrichmentResponse:
50
46
  async with sem:
47
+ if not data.text:
48
+ return EnrichmentResponse(
49
+ snippet_id=data.snippet_id,
50
+ text="",
51
+ )
51
52
  try:
52
53
  response = await self.openai_client.chat.completions.create(
53
54
  model=self.model_name,
@@ -56,26 +57,23 @@ class OpenAIEnrichmentProvider(EnrichmentProvider):
56
57
  "role": "system",
57
58
  "content": ENRICHMENT_SYSTEM_PROMPT,
58
59
  },
59
- {"role": "user", "content": snippet},
60
+ {"role": "user", "content": data.text},
60
61
  ],
61
62
  )
62
- return snippet_id, response.choices[0].message.content or ""
63
+ return EnrichmentResponse(
64
+ snippet_id=data.snippet_id,
65
+ text=response.choices[0].message.content or "",
66
+ )
63
67
  except Exception as e:
64
68
  self.log.exception("Error enriching data", error=str(e))
65
- return snippet_id, ""
69
+ return EnrichmentResponse(
70
+ snippet_id=data.snippet_id,
71
+ text="",
72
+ )
66
73
 
67
74
  # Create tasks for all data
68
- tasks = [process_data(snippet) for snippet in input_data]
75
+ tasks = [process_data(snippet) for snippet in data]
69
76
 
70
77
  # Process all data and yield results as they complete
71
- results: list[tuple[int, str]] = []
72
- for task in tqdm(
73
- asyncio.as_completed(tasks),
74
- total=len(tasks),
75
- leave=False,
76
- ):
77
- result = await task
78
- results.append(result)
79
-
80
- # Output in the same order as the input data
81
- return [result for _, result in sorted(results, key=lambda x: x[0])]
78
+ for task in asyncio.as_completed(tasks):
79
+ yield await task
@@ -1,24 +1,34 @@
1
1
  """Enrichment service."""
2
2
 
3
3
  from abc import ABC, abstractmethod
4
+ from collections.abc import AsyncGenerator
4
5
 
5
- from kodit.enrichment.enrichment_provider.enrichment_provider import EnrichmentProvider
6
+ from kodit.enrichment.enrichment_provider.enrichment_provider import (
7
+ EnrichmentProvider,
8
+ EnrichmentRequest,
9
+ EnrichmentResponse,
10
+ )
6
11
 
7
12
 
8
13
  class EnrichmentService(ABC):
9
14
  """Enrichment service."""
10
15
 
11
16
  @abstractmethod
12
- async def enrich(self, data: list[str]) -> list[str]:
17
+ def enrich(
18
+ self, data: list[EnrichmentRequest]
19
+ ) -> AsyncGenerator[EnrichmentResponse, None]:
13
20
  """Enrich a list of strings."""
14
21
 
15
22
 
16
23
  class NullEnrichmentService(EnrichmentService):
17
24
  """Null enrichment service."""
18
25
 
19
- async def enrich(self, data: list[str]) -> list[str]:
26
+ async def enrich(
27
+ self, data: list[EnrichmentRequest]
28
+ ) -> AsyncGenerator[EnrichmentResponse, None]:
20
29
  """Enrich a list of strings."""
21
- return [""] * len(data)
30
+ for request in data:
31
+ yield EnrichmentResponse(snippet_id=request.snippet_id, text="")
22
32
 
23
33
 
24
34
  class LLMEnrichmentService(EnrichmentService):
@@ -28,6 +38,8 @@ class LLMEnrichmentService(EnrichmentService):
28
38
  """Initialize the enrichment service."""
29
39
  self.enrichment_provider = enrichment_provider
30
40
 
31
- async def enrich(self, data: list[str]) -> list[str]:
32
- """Enrich a list of strings."""
33
- return await self.enrichment_provider.enrich(data)
41
+ def enrich(
42
+ self, data: list[EnrichmentRequest]
43
+ ) -> AsyncGenerator[EnrichmentResponse, None]:
44
+ """Enrich a list of snippets."""
45
+ return self.enrichment_provider.enrich(data)
@@ -22,10 +22,12 @@ from kodit.embedding.vector_search_service import (
22
22
  VectorSearchRequest,
23
23
  VectorSearchService,
24
24
  )
25
+ from kodit.enrichment.enrichment_provider.enrichment_provider import EnrichmentRequest
25
26
  from kodit.enrichment.enrichment_service import EnrichmentService
26
27
  from kodit.indexing.fusion import FusionRequest, reciprocal_rank_fusion
27
28
  from kodit.indexing.indexing_models import Snippet
28
29
  from kodit.indexing.indexing_repository import IndexRepository
30
+ from kodit.log import log_event
29
31
  from kodit.snippets.snippets import SnippetService
30
32
  from kodit.source.source_service import SourceService
31
33
  from kodit.util.spinner import Spinner
@@ -45,7 +47,7 @@ class IndexView(pydantic.BaseModel):
45
47
  created_at: datetime
46
48
  updated_at: datetime | None = None
47
49
  source: str | None = None
48
- num_snippets: int | None = None
50
+ num_snippets: int
49
51
 
50
52
 
51
53
  class SearchRequest(pydantic.BaseModel):
@@ -119,6 +121,8 @@ class IndexService:
119
121
  ValueError: If the source doesn't exist or already has an index.
120
122
 
121
123
  """
124
+ log_event("kodit.index.create")
125
+
122
126
  # Check if the source exists
123
127
  source = await self.source_service.get(source_id)
124
128
 
@@ -129,6 +133,8 @@ class IndexService:
129
133
  return IndexView(
130
134
  id=index.id,
131
135
  created_at=index.created_at,
136
+ num_snippets=await self.repository.num_snippets_for_index(index.id),
137
+ source=source.uri,
132
138
  )
133
139
 
134
140
  async def list_indexes(self) -> list[IndexView]:
@@ -142,19 +148,33 @@ class IndexService:
142
148
  indexes = await self.repository.list_indexes()
143
149
 
144
150
  # Transform database results into DTOs
145
- return [
151
+ indexes = [
146
152
  IndexView(
147
153
  id=index.id,
148
154
  created_at=index.created_at,
149
155
  updated_at=index.updated_at,
150
- num_snippets=await self.repository.num_snippets_for_index(index.id),
156
+ num_snippets=await self.repository.num_snippets_for_index(index.id)
157
+ or 0,
151
158
  source=source.uri,
152
159
  )
153
160
  for index, source in indexes
154
161
  ]
155
162
 
163
+ # Help Kodit by measuring how much people are using indexes
164
+ log_event(
165
+ "kodit.index.list",
166
+ {
167
+ "num_indexes": len(indexes),
168
+ "num_snippets": sum([index.num_snippets for index in indexes]),
169
+ },
170
+ )
171
+
172
+ return indexes
173
+
156
174
  async def run(self, index_id: int) -> None:
157
175
  """Run the indexing process for a specific index."""
176
+ log_event("kodit.index.run")
177
+
158
178
  # Get and validate index
159
179
  index = await self.repository.get_by_id(index_id)
160
180
  if not index:
@@ -181,43 +201,50 @@ class IndexService:
181
201
  )
182
202
 
183
203
  self.log.info("Creating semantic code index")
184
- with Spinner():
185
- await self.code_search_service.index(
204
+ with tqdm(total=len(snippets), leave=False) as pbar:
205
+ async for result in self.code_search_service.index(
186
206
  [
187
207
  VectorSearchRequest(snippet.id, snippet.content)
188
208
  for snippet in snippets
189
209
  ]
190
- )
210
+ ):
211
+ pbar.update(len(result))
191
212
 
192
213
  self.log.info("Enriching snippets", num_snippets=len(snippets))
193
- enriched_contents = await self.enrichment_service.enrich(
194
- [snippet.content for snippet in snippets]
195
- )
214
+ enriched_contents = []
215
+ with tqdm(total=len(snippets), leave=False) as pbar:
216
+ async for result in self.enrichment_service.enrich(
217
+ [
218
+ EnrichmentRequest(snippet_id=snippet.id, text=snippet.content)
219
+ for snippet in snippets
220
+ ]
221
+ ):
222
+ snippet = next(s for s in snippets if s.id == result.snippet_id)
223
+ if snippet:
224
+ snippet.content = (
225
+ result.text + "\n\n```\n" + snippet.content + "\n```"
226
+ )
227
+ await self.repository.add_snippet(snippet)
228
+ enriched_contents.append(result)
229
+ pbar.update(1)
196
230
 
197
231
  self.log.info("Creating semantic text index")
198
- with Spinner():
199
- await self.text_search_service.index(
232
+ with tqdm(total=len(snippets), leave=False) as pbar:
233
+ async for result in self.text_search_service.index(
200
234
  [
201
- VectorSearchRequest(snippet.id, enriched_content)
202
- for snippet, enriched_content in zip(
203
- snippets, enriched_contents, strict=True
204
- )
235
+ VectorSearchRequest(snippet.id, snippet.content)
236
+ for snippet in snippets
205
237
  ]
206
- )
207
- # Add the enriched text back to the snippets and write to the database
208
- for snippet, enriched_content in zip(
209
- snippets, enriched_contents, strict=True
210
238
  ):
211
- snippet.content = (
212
- enriched_content + "\n\n```\n" + snippet.content + "\n```"
213
- )
214
- await self.repository.add_snippet(snippet)
239
+ pbar.update(len(result))
215
240
 
216
241
  # Update index timestamp
217
242
  await self.repository.update_index_timestamp(index)
218
243
 
219
244
  async def search(self, request: SearchRequest) -> list[SearchResult]:
220
245
  """Search for relevant data."""
246
+ log_event("kodit.index.search")
247
+
221
248
  fusion_list: list[list[FusionRequest]] = []
222
249
  if request.keywords:
223
250
  # Gather results for each keyword