kodit 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of kodit might be problematic. Click here for more details.

kodit/_version.py CHANGED
@@ -17,5 +17,5 @@ __version__: str
17
17
  __version_tuple__: VERSION_TUPLE
18
18
  version_tuple: VERSION_TUPLE
19
19
 
20
- __version__ = version = '0.2.0'
21
- __version_tuple__ = version_tuple = (0, 2, 0)
20
+ __version__ = version = '0.2.2'
21
+ __version_tuple__ = version_tuple = (0, 2, 2)
kodit/bm25/local_bm25.py CHANGED
@@ -1,13 +1,14 @@
1
1
  """Locally hosted BM25 service primarily for use with SQLite."""
2
2
 
3
+ from __future__ import annotations
4
+
3
5
  import json
4
6
  from pathlib import Path
7
+ from typing import TYPE_CHECKING
5
8
 
6
9
  import aiofiles
7
- import bm25s
8
10
  import Stemmer
9
11
  import structlog
10
- from bm25s.tokenization import Tokenized
11
12
 
12
13
  from kodit.bm25.keyword_search_service import (
13
14
  BM25Document,
@@ -15,6 +16,11 @@ from kodit.bm25.keyword_search_service import (
15
16
  KeywordSearchProvider,
16
17
  )
17
18
 
19
+ if TYPE_CHECKING:
20
+ import bm25s
21
+ from bm25s.tokenization import Tokenized
22
+
23
+
18
24
  SNIPPET_IDS_FILE = "snippet_ids.jsonl"
19
25
 
20
26
 
@@ -26,19 +32,28 @@ class BM25Service(KeywordSearchProvider):
26
32
  self.log = structlog.get_logger(__name__)
27
33
  self.index_path = data_dir / "bm25s_index"
28
34
  self.snippet_ids: list[int] = []
29
- try:
30
- self.log.debug("Loading BM25 index")
31
- self.retriever = bm25s.BM25.load(self.index_path, mmap=True)
32
- with Path(self.index_path / SNIPPET_IDS_FILE).open() as f:
33
- self.snippet_ids = json.load(f)
34
- except FileNotFoundError:
35
- self.log.debug("BM25 index not found, creating new index")
36
- self.retriever = bm25s.BM25()
37
-
38
35
  self.stemmer = Stemmer.Stemmer("english")
36
+ self.__retriever: bm25s.BM25 | None = None
37
+
38
+ def _retriever(self) -> bm25s.BM25:
39
+ """Get the BM25 retriever."""
40
+ if self.__retriever is None:
41
+ import bm25s
42
+
43
+ try:
44
+ self.log.debug("Loading BM25 index")
45
+ self.__retriever = bm25s.BM25.load(self.index_path, mmap=True)
46
+ with Path(self.index_path / SNIPPET_IDS_FILE).open() as f:
47
+ self.snippet_ids = json.load(f)
48
+ except FileNotFoundError:
49
+ self.log.debug("BM25 index not found, creating new index")
50
+ self.__retriever = bm25s.BM25()
51
+ return self.__retriever
39
52
 
40
53
  def _tokenize(self, corpus: list[str]) -> list[list[str]] | Tokenized:
41
- return bm25s.tokenize(
54
+ from bm25s import tokenize
55
+
56
+ return tokenize(
42
57
  corpus,
43
58
  stopwords="en",
44
59
  stemmer=self.stemmer,
@@ -49,10 +64,13 @@ class BM25Service(KeywordSearchProvider):
49
64
  async def index(self, corpus: list[BM25Document]) -> None:
50
65
  """Index a new corpus."""
51
66
  self.log.debug("Indexing corpus")
67
+ if not corpus or len(corpus) == 0:
68
+ self.log.warning("Corpus is empty, skipping bm25 index")
69
+ return
70
+
52
71
  vocab = self._tokenize([doc.text for doc in corpus])
53
- self.retriever = bm25s.BM25()
54
- self.retriever.index(vocab, show_progress=False)
55
- self.retriever.save(self.index_path)
72
+ self._retriever().index(vocab, show_progress=False)
73
+ self._retriever().save(self.index_path)
56
74
  self.snippet_ids = self.snippet_ids + [doc.snippet_id for doc in corpus]
57
75
  async with aiofiles.open(self.index_path / SNIPPET_IDS_FILE, "w") as f:
58
76
  await f.write(json.dumps(self.snippet_ids))
@@ -63,8 +81,12 @@ class BM25Service(KeywordSearchProvider):
63
81
  self.log.warning("Top k is 0, returning empty list")
64
82
  return []
65
83
 
84
+ # Check that the index has data
85
+ if not hasattr(self._retriever(), "scores"):
86
+ return []
87
+
66
88
  # Get the number of documents in the index
67
- num_docs = self.retriever.scores["num_docs"]
89
+ num_docs = self._retriever().scores["num_docs"]
68
90
  if num_docs == 0:
69
91
  return []
70
92
 
@@ -80,7 +102,7 @@ class BM25Service(KeywordSearchProvider):
80
102
 
81
103
  self.log.debug("Query tokens", query_tokens=query_tokens)
82
104
 
83
- results, scores = self.retriever.retrieve(
105
+ results, scores = self._retriever().retrieve(
84
106
  query_tokens=query_tokens,
85
107
  corpus=self.snippet_ids,
86
108
  k=top_k,
@@ -2,6 +2,7 @@
2
2
 
3
3
  from typing import Any
4
4
 
5
+ import structlog
5
6
  from sqlalchemy import Result, TextClause, bindparam, text
6
7
  from sqlalchemy.ext.asyncio import AsyncSession
7
8
 
@@ -93,6 +94,7 @@ class VectorChordBM25(KeywordSearchProvider):
93
94
  """Initialize the VectorChord BM25."""
94
95
  self.__session = session
95
96
  self._initialized = False
97
+ self.log = structlog.get_logger(__name__)
96
98
 
97
99
  async def _initialize(self) -> None:
98
100
  """Initialize the VectorChord environment."""
@@ -149,7 +151,8 @@ class VectorChordBM25(KeywordSearchProvider):
149
151
  if doc.snippet_id is not None and doc.text is not None and doc.text != ""
150
152
  ]
151
153
 
152
- if not corpus:
154
+ if not corpus or len(corpus) == 0:
155
+ self.log.warning("Corpus is empty, skipping bm25 index")
153
156
  return
154
157
 
155
158
  # Execute inserts
kodit/config.py CHANGED
@@ -1,16 +1,20 @@
1
1
  """Global configuration for the kodit project."""
2
2
 
3
+ from __future__ import annotations
4
+
3
5
  import asyncio
4
- from collections.abc import Callable, Coroutine
5
6
  from functools import wraps
6
7
  from pathlib import Path
7
- from typing import Any, Literal, TypeVar
8
+ from typing import TYPE_CHECKING, Any, Literal, TypeVar
8
9
 
9
10
  import click
10
- from openai import AsyncOpenAI
11
11
  from pydantic import BaseModel, Field
12
12
  from pydantic_settings import BaseSettings, SettingsConfigDict
13
13
 
14
+ if TYPE_CHECKING:
15
+ from collections.abc import Callable, Coroutine
16
+
17
+
14
18
  from kodit.database import Database
15
19
 
16
20
  DEFAULT_BASE_DIR = Path.home() / ".kodit"
@@ -20,13 +24,16 @@ DEFAULT_LOG_FORMAT = "pretty"
20
24
  DEFAULT_DISABLE_TELEMETRY = False
21
25
  T = TypeVar("T")
22
26
 
27
+ EndpointType = Literal["openai"]
28
+
23
29
 
24
30
  class Endpoint(BaseModel):
25
31
  """Endpoint provides configuration for an AI service."""
26
32
 
27
- type: Literal["openai"] = Field(default="openai")
28
- api_key: str | None = None
33
+ type: EndpointType | None = None
29
34
  base_url: str | None = None
35
+ model: str | None = None
36
+ api_key: str | None = None
30
37
 
31
38
 
32
39
  class Search(BaseModel):
@@ -52,15 +59,20 @@ class AppContext(BaseSettings):
52
59
  log_format: str = Field(default=DEFAULT_LOG_FORMAT)
53
60
  disable_telemetry: bool = Field(default=DEFAULT_DISABLE_TELEMETRY)
54
61
  default_endpoint: Endpoint | None = Field(
55
- default=Endpoint(
56
- type="openai",
57
- base_url="https://api.openai.com/v1",
58
- ),
62
+ default=None,
59
63
  description=(
60
64
  "Default endpoint to use for all AI interactions "
61
65
  "(can be overridden by task-specific configuration)."
62
66
  ),
63
67
  )
68
+ embedding_endpoint: Endpoint | None = Field(
69
+ default=None,
70
+ description="Endpoint to use for embedding.",
71
+ )
72
+ enrichment_endpoint: Endpoint | None = Field(
73
+ default=None,
74
+ description="Endpoint to use for enrichment.",
75
+ )
64
76
  default_search: Search = Field(
65
77
  default=Search(),
66
78
  )
@@ -90,21 +102,6 @@ class AppContext(BaseSettings):
90
102
  await self._db.run_migrations(self.db_url)
91
103
  return self._db
92
104
 
93
- def get_default_openai_client(self) -> AsyncOpenAI | None:
94
- """Get the default OpenAI client, if it is configured."""
95
- endpoint = self.default_endpoint
96
- if not (
97
- endpoint
98
- and endpoint.type == "openai"
99
- and endpoint.api_key
100
- and endpoint.base_url
101
- ):
102
- return None
103
- return AsyncOpenAI(
104
- api_key=endpoint.api_key,
105
- base_url=endpoint.base_url,
106
- )
107
-
108
105
 
109
106
  with_app_context = click.make_pass_decorator(AppContext)
110
107
 
@@ -2,7 +2,7 @@
2
2
 
3
3
  from sqlalchemy.ext.asyncio import AsyncSession
4
4
 
5
- from kodit.config import AppContext
5
+ from kodit.config import AppContext, Endpoint
6
6
  from kodit.embedding.embedding_provider.local_embedding_provider import (
7
7
  CODE,
8
8
  LocalEmbeddingProvider,
@@ -16,19 +16,33 @@ from kodit.embedding.vector_search_service import (
16
16
  VectorSearchService,
17
17
  )
18
18
  from kodit.embedding.vectorchord_vector_search_service import (
19
+ TaskName,
19
20
  VectorChordVectorSearchService,
20
21
  )
21
22
 
22
23
 
24
+ def _get_endpoint_configuration(app_context: AppContext) -> Endpoint | None:
25
+ """Get the endpoint configuration for the embedding service."""
26
+ return app_context.embedding_endpoint or app_context.default_endpoint or None
27
+
28
+
23
29
  def embedding_factory(
24
- task_name: str, app_context: AppContext, session: AsyncSession
30
+ task_name: TaskName, app_context: AppContext, session: AsyncSession
25
31
  ) -> VectorSearchService:
26
32
  """Create an embedding service."""
27
33
  embedding_repository = EmbeddingRepository(session=session)
28
- embedding_provider = None
29
- openai_client = app_context.get_default_openai_client()
30
- if openai_client is not None:
31
- embedding_provider = OpenAIEmbeddingProvider(openai_client=openai_client)
34
+ endpoint = _get_endpoint_configuration(app_context)
35
+
36
+ if endpoint and endpoint.type == "openai":
37
+ from openai import AsyncOpenAI
38
+
39
+ embedding_provider = OpenAIEmbeddingProvider(
40
+ openai_client=AsyncOpenAI(
41
+ api_key=endpoint.api_key or "default",
42
+ base_url=endpoint.base_url or "https://api.openai.com/v1",
43
+ ),
44
+ model_name=endpoint.model or "text-embedding-3-small",
45
+ )
32
46
  else:
33
47
  embedding_provider = LocalEmbeddingProvider(CODE)
34
48
 
@@ -23,7 +23,11 @@ class EmbeddingProvider(ABC):
23
23
  """
24
24
 
25
25
 
26
- def split_sub_batches(encoding: tiktoken.Encoding, data: list[str]) -> list[list[str]]:
26
+ def split_sub_batches(
27
+ encoding: tiktoken.Encoding,
28
+ data: list[str],
29
+ max_context_window: int = OPENAI_MAX_EMBEDDING_SIZE,
30
+ ) -> list[list[str]]:
27
31
  """Split a list of strings into smaller sub-batches."""
28
32
  log = structlog.get_logger(__name__)
29
33
  result = []
@@ -37,10 +41,10 @@ def split_sub_batches(encoding: tiktoken.Encoding, data: list[str]) -> list[list
37
41
  next_item = data_to_process[0]
38
42
  item_tokens = len(encoding.encode(next_item))
39
43
 
40
- if item_tokens > OPENAI_MAX_EMBEDDING_SIZE:
44
+ if item_tokens > max_context_window:
41
45
  # Loop around trying to truncate the snippet until it fits in the max
42
46
  # embedding size
43
- while item_tokens > OPENAI_MAX_EMBEDDING_SIZE:
47
+ while item_tokens > max_context_window:
44
48
  next_item = next_item[:-1]
45
49
  item_tokens = len(encoding.encode(next_item))
46
50
 
@@ -48,7 +52,7 @@ def split_sub_batches(encoding: tiktoken.Encoding, data: list[str]) -> list[list
48
52
 
49
53
  log.warning("Truncated snippet", snippet=next_item)
50
54
 
51
- if current_tokens + item_tokens > OPENAI_MAX_EMBEDDING_SIZE:
55
+ if current_tokens + item_tokens > max_context_window:
52
56
  break
53
57
 
54
58
  next_batch.append(data_to_process.pop(0))
@@ -1,10 +1,12 @@
1
1
  """Local embedding service."""
2
2
 
3
+ from __future__ import annotations
4
+
3
5
  import os
6
+ from typing import TYPE_CHECKING
4
7
 
5
8
  import structlog
6
9
  import tiktoken
7
- from sentence_transformers import SentenceTransformer
8
10
  from tqdm import tqdm
9
11
 
10
12
  from kodit.embedding.embedding_provider.embedding_provider import (
@@ -13,6 +15,9 @@ from kodit.embedding.embedding_provider.embedding_provider import (
13
15
  split_sub_batches,
14
16
  )
15
17
 
18
+ if TYPE_CHECKING:
19
+ from sentence_transformers import SentenceTransformer
20
+
16
21
  TINY = "tiny"
17
22
  CODE = "code"
18
23
  TEST = "test"
@@ -38,10 +43,11 @@ class LocalEmbeddingProvider(EmbeddingProvider):
38
43
  """Get the embedding model."""
39
44
  if self.embedding_model is None:
40
45
  os.environ["TOKENIZERS_PARALLELISM"] = "false" # Avoid warnings
46
+ from sentence_transformers import SentenceTransformer
47
+
41
48
  self.embedding_model = SentenceTransformer(
42
49
  self.model_name,
43
50
  trust_remote_code=True,
44
- device="cpu", # Force CPU so we don't have to install accelerate, etc.
45
51
  )
46
52
  return self.embedding_model
47
53
 
@@ -27,7 +27,9 @@ class OpenAIEmbeddingProvider(EmbeddingProvider):
27
27
  self.log = structlog.get_logger(__name__)
28
28
  self.openai_client = openai_client
29
29
  self.model_name = model_name
30
- self.encoding = tiktoken.encoding_for_model(model_name)
30
+ self.encoding = tiktoken.encoding_for_model(
31
+ "text-embedding-3-small"
32
+ ) # Sensible default
31
33
 
32
34
  async def embed(self, data: list[str]) -> list[Vector]:
33
35
  """Embed a list of documents."""
@@ -29,6 +29,10 @@ class LocalVectorSearchService(VectorSearchService):
29
29
 
30
30
  async def index(self, data: list[VectorSearchRequest]) -> None:
31
31
  """Embed a list of documents."""
32
+ if not data or len(data) == 0:
33
+ self.log.warning("Embedding data is empty, skipping embedding")
34
+ return
35
+
32
36
  embeddings = await self.embedding_provider.embed([i.text for i in data])
33
37
  for i, x in zip(data, embeddings, strict=False):
34
38
  await self.embedding_repository.create_embedding(
@@ -1,7 +1,8 @@
1
1
  """Vectorchord vector search."""
2
2
 
3
- from typing import Any
3
+ from typing import Any, Literal
4
4
 
5
+ import structlog
5
6
  from sqlalchemy import Result, TextClause, text
6
7
  from sqlalchemy.ext.asyncio import AsyncSession
7
8
 
@@ -51,13 +52,15 @@ ORDER BY score ASC
51
52
  LIMIT :top_k;
52
53
  """
53
54
 
55
+ TaskName = Literal["code", "text"]
56
+
54
57
 
55
58
  class VectorChordVectorSearchService(VectorSearchService):
56
59
  """VectorChord vector search."""
57
60
 
58
61
  def __init__(
59
62
  self,
60
- task_name: str,
63
+ task_name: TaskName,
61
64
  session: AsyncSession,
62
65
  embedding_provider: EmbeddingProvider,
63
66
  ) -> None:
@@ -67,6 +70,7 @@ class VectorChordVectorSearchService(VectorSearchService):
67
70
  self._initialized = False
68
71
  self.table_name = f"vectorchord_{task_name}_embeddings"
69
72
  self.index_name = f"{self.table_name}_idx"
73
+ self.log = structlog.get_logger(__name__)
70
74
 
71
75
  async def _initialize(self) -> None:
72
76
  """Initialize the VectorChord environment."""
@@ -128,6 +132,10 @@ class VectorChordVectorSearchService(VectorSearchService):
128
132
 
129
133
  async def index(self, data: list[VectorSearchRequest]) -> None:
130
134
  """Embed a list of documents."""
135
+ if not data or len(data) == 0:
136
+ self.log.warning("Embedding data is empty, skipping embedding")
137
+ return
138
+
131
139
  embeddings = await self.embedding_provider.embed([doc.text for doc in data])
132
140
  # Execute inserts
133
141
  await self._execute(
@@ -1,6 +1,6 @@
1
1
  """Embedding service."""
2
2
 
3
- from kodit.config import AppContext
3
+ from kodit.config import AppContext, Endpoint
4
4
  from kodit.enrichment.enrichment_provider.local_enrichment_provider import (
5
5
  LocalEnrichmentProvider,
6
6
  )
@@ -13,11 +13,27 @@ from kodit.enrichment.enrichment_service import (
13
13
  )
14
14
 
15
15
 
16
+ def _get_endpoint_configuration(app_context: AppContext) -> Endpoint | None:
17
+ """Get the endpoint configuration for the enrichment service."""
18
+ return app_context.enrichment_endpoint or app_context.default_endpoint or None
19
+
20
+
16
21
  def enrichment_factory(app_context: AppContext) -> EnrichmentService:
17
- """Create an embedding service."""
18
- openai_client = app_context.get_default_openai_client()
19
- if openai_client is not None:
20
- enrichment_provider = OpenAIEnrichmentProvider(openai_client=openai_client)
21
- return LLMEnrichmentService(enrichment_provider)
22
+ """Create an enrichment service."""
23
+ endpoint = _get_endpoint_configuration(app_context)
24
+ endpoint = app_context.enrichment_endpoint or app_context.default_endpoint or None
25
+
26
+ if endpoint and endpoint.type == "openai":
27
+ from openai import AsyncOpenAI
28
+
29
+ enrichment_provider = OpenAIEnrichmentProvider(
30
+ openai_client=AsyncOpenAI(
31
+ api_key=endpoint.api_key or "default",
32
+ base_url=endpoint.base_url or "https://api.openai.com/v1",
33
+ ),
34
+ model_name=endpoint.model or "gpt-4o-mini",
35
+ )
36
+ else:
37
+ enrichment_provider = LocalEnrichmentProvider()
22
38
 
23
- return LLMEnrichmentService(LocalEnrichmentProvider())
39
+ return LLMEnrichmentService(enrichment_provider=enrichment_provider)
@@ -3,61 +3,90 @@
3
3
  import os
4
4
 
5
5
  import structlog
6
- from transformers.models.auto.modeling_auto import AutoModelForCausalLM
7
- from transformers.models.auto.tokenization_auto import AutoTokenizer
6
+ import tiktoken
7
+ from tqdm import tqdm
8
8
 
9
+ from kodit.embedding.embedding_provider.embedding_provider import split_sub_batches
9
10
  from kodit.enrichment.enrichment_provider.enrichment_provider import (
10
11
  ENRICHMENT_SYSTEM_PROMPT,
11
12
  EnrichmentProvider,
12
13
  )
13
14
 
15
+ DEFAULT_ENRICHMENT_MODEL = "Qwen/Qwen3-0.6B"
16
+ DEFAULT_CONTEXT_WINDOW_SIZE = 2048 # Small so it works even on low-powered devices
17
+
14
18
 
15
19
  class LocalEnrichmentProvider(EnrichmentProvider):
16
20
  """Local embedder."""
17
21
 
18
- def __init__(self, model_name: str = "Qwen/Qwen3-0.6B") -> None:
22
+ def __init__(
23
+ self,
24
+ model_name: str = DEFAULT_ENRICHMENT_MODEL,
25
+ context_window: int = DEFAULT_CONTEXT_WINDOW_SIZE,
26
+ ) -> None:
19
27
  """Initialize the local enrichment provider."""
20
28
  self.log = structlog.get_logger(__name__)
21
29
  self.model_name = model_name
30
+ self.context_window = context_window
22
31
  self.model = None
23
32
  self.tokenizer = None
33
+ self.encoding = tiktoken.encoding_for_model("text-embedding-3-small")
24
34
 
25
35
  async def enrich(self, data: list[str]) -> list[str]:
26
36
  """Enrich a list of strings."""
37
+ if not data or len(data) == 0:
38
+ self.log.warning("Data is empty, skipping enrichment")
39
+ return []
40
+
41
+ from transformers.models.auto.modeling_auto import (
42
+ AutoModelForCausalLM,
43
+ )
44
+ from transformers.models.auto.tokenization_auto import AutoTokenizer
45
+
27
46
  if self.tokenizer is None:
28
- self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
47
+ self.tokenizer = AutoTokenizer.from_pretrained(
48
+ self.model_name, padding_side="left"
49
+ )
29
50
  if self.model is None:
30
51
  os.environ["TOKENIZERS_PARALLELISM"] = "false" # Avoid warnings
31
52
  self.model = AutoModelForCausalLM.from_pretrained(
32
53
  self.model_name,
33
54
  torch_dtype="auto",
34
55
  trust_remote_code=True,
56
+ device_map="auto",
35
57
  )
36
58
 
37
- results = []
38
- for snippet in data:
39
- # prepare the model input
40
- messages = [
41
- {"role": "system", "content": ENRICHMENT_SYSTEM_PROMPT},
42
- {"role": "user", "content": snippet},
43
- ]
44
- text = self.tokenizer.apply_chat_template(
45
- messages,
59
+ # Prepare prompts
60
+ prompts = [
61
+ self.tokenizer.apply_chat_template(
62
+ [
63
+ {"role": "system", "content": ENRICHMENT_SYSTEM_PROMPT},
64
+ {"role": "user", "content": snippet},
65
+ ],
46
66
  tokenize=False,
47
67
  add_generation_prompt=True,
48
68
  enable_thinking=False,
49
69
  )
50
- model_inputs = self.tokenizer([text], return_tensors="pt").to(
51
- self.model.device
52
- )
70
+ for snippet in data
71
+ ]
53
72
 
54
- # conduct text completion
55
- generated_ids = self.model.generate(**model_inputs, max_new_tokens=32768)
56
- output_ids = generated_ids[0][len(model_inputs.input_ids[0]) :].tolist()
57
- content = self.tokenizer.decode(output_ids, skip_special_tokens=True).strip(
58
- "\n"
73
+ # Batch prompts using split_sub_batches
74
+ batched_prompts = split_sub_batches(
75
+ self.encoding, prompts, max_context_window=self.context_window
76
+ )
77
+ results = []
78
+ for batch in tqdm(batched_prompts, leave=False, total=len(batched_prompts)):
79
+ model_inputs = self.tokenizer(
80
+ batch, return_tensors="pt", padding=True, truncation=True
81
+ ).to(self.model.device)
82
+ generated_ids = self.model.generate(
83
+ **model_inputs, max_new_tokens=self.context_window
59
84
  )
60
-
61
- results.append(content)
62
-
85
+ # For each prompt in the batch, decode only the generated part
86
+ for i, input_ids in enumerate(model_inputs["input_ids"]):
87
+ output_ids = generated_ids[i][len(input_ids) :].tolist()
88
+ content = self.tokenizer.decode(
89
+ output_ids, skip_special_tokens=True
90
+ ).strip("\n")
91
+ results.append(content)
63
92
  return results
@@ -27,10 +27,14 @@ class OpenAIEnrichmentProvider(EnrichmentProvider):
27
27
  self.log = structlog.get_logger(__name__)
28
28
  self.openai_client = openai_client
29
29
  self.model_name = model_name
30
- self.encoding = tiktoken.encoding_for_model(model_name)
30
+ self.encoding = tiktoken.encoding_for_model("gpt-4o-mini") # Approximation
31
31
 
32
32
  async def enrich(self, data: list[str]) -> list[str]:
33
33
  """Enrich a list of documents."""
34
+ if not data or len(data) == 0:
35
+ self.log.warning("Data is empty, skipping enrichment")
36
+ return []
37
+
34
38
  # Process batches in parallel with a semaphore to limit concurrent requests
35
39
  sem = asyncio.Semaphore(OPENAI_NUM_PARALLEL_TASKS)
36
40
 
@@ -289,6 +289,10 @@ class IndexService:
289
289
 
290
290
  """
291
291
  files = await self.repository.files_for_index(index_id)
292
+ if not files:
293
+ self.log.warning("No files to create snippets for")
294
+ return
295
+
292
296
  for file in tqdm(files, total=len(files), leave=False):
293
297
  # Skip unsupported file types
294
298
  if file.mime_type in MIME_BLACKLIST:
@@ -0,0 +1,64 @@
1
+ # ruff: noqa
2
+ """add authors
3
+
4
+ Revision ID: 42e836b21102
5
+ Revises: c3f5137d30f5
6
+ Create Date: 2025-06-13 14:48:50.152940
7
+
8
+ """
9
+
10
+ from typing import Sequence, Union
11
+
12
+ from alembic import op
13
+ import sqlalchemy as sa
14
+
15
+
16
+ # revision identifiers, used by Alembic.
17
+ revision: str = '42e836b21102'
18
+ down_revision: Union[str, None] = 'c3f5137d30f5'
19
+ branch_labels: Union[str, Sequence[str], None] = None
20
+ depends_on: Union[str, Sequence[str], None] = None
21
+
22
+
23
+ def upgrade() -> None:
24
+ """Upgrade schema."""
25
+ # ### commands auto generated by Alembic - please adjust! ###
26
+ op.create_table('authors',
27
+ sa.Column('name', sa.String(length=255), nullable=False),
28
+ sa.Column('email', sa.String(length=255), nullable=False),
29
+ sa.Column('id', sa.Integer(), autoincrement=True, nullable=False),
30
+ sa.Column('created_at', sa.DateTime(timezone=True), nullable=False),
31
+ sa.Column('updated_at', sa.DateTime(timezone=True), nullable=False),
32
+ sa.PrimaryKeyConstraint('id')
33
+ )
34
+ op.create_index(op.f('ix_authors_email'), 'authors', ['email'], unique=True)
35
+ op.create_index(op.f('ix_authors_name'), 'authors', ['name'], unique=True)
36
+ op.create_table('author_file_mappings',
37
+ sa.Column('author_id', sa.Integer(), nullable=False),
38
+ sa.Column('file_id', sa.Integer(), nullable=False),
39
+ sa.Column('id', sa.Integer(), autoincrement=True, nullable=False),
40
+ sa.Column('created_at', sa.DateTime(timezone=True), nullable=False),
41
+ sa.Column('updated_at', sa.DateTime(timezone=True), nullable=False),
42
+ sa.ForeignKeyConstraint(['author_id'], ['authors.id'], ),
43
+ sa.ForeignKeyConstraint(['file_id'], ['files.id'], ),
44
+ sa.PrimaryKeyConstraint('id')
45
+ )
46
+ op.add_column('files', sa.Column('extension', sa.String(length=255), nullable=False))
47
+ op.create_index(op.f('ix_files_extension'), 'files', ['extension'], unique=False)
48
+ op.add_column('sources', sa.Column('type', sa.Enum('UNKNOWN', 'FOLDER', 'GIT', name='sourcetype'), nullable=False))
49
+ op.create_index(op.f('ix_sources_type'), 'sources', ['type'], unique=False)
50
+ # ### end Alembic commands ###
51
+
52
+
53
+ def downgrade() -> None:
54
+ """Downgrade schema."""
55
+ # ### commands auto generated by Alembic - please adjust! ###
56
+ op.drop_index(op.f('ix_sources_type'), table_name='sources')
57
+ op.drop_column('sources', 'type')
58
+ op.drop_index(op.f('ix_files_extension'), table_name='files')
59
+ op.drop_column('files', 'extension')
60
+ op.drop_table('author_file_mappings')
61
+ op.drop_index(op.f('ix_authors_name'), table_name='authors')
62
+ op.drop_index(op.f('ix_authors_email'), table_name='authors')
63
+ op.drop_table('authors')
64
+ # ### end Alembic commands ###
kodit/source/git.py ADDED
@@ -0,0 +1,16 @@
1
+ """Git utilities."""
2
+
3
+ import tempfile
4
+
5
+ import git
6
+
7
+
8
+ def is_valid_clone_target(target: str) -> bool:
9
+ """Return True if the target is clonable."""
10
+ with tempfile.TemporaryDirectory() as temp_dir:
11
+ try:
12
+ git.Repo.clone_from(target, temp_dir)
13
+ except git.GitCommandError:
14
+ return False
15
+ else:
16
+ return True
kodit/source/ignore.py ADDED
@@ -0,0 +1,53 @@
1
+ """Ignore patterns."""
2
+
3
+ from pathlib import Path
4
+
5
+ import git
6
+ import pathspec
7
+
8
+ from kodit.source.git import is_valid_clone_target
9
+
10
+
11
+ class IgnorePatterns:
12
+ """Ignore patterns."""
13
+
14
+ def __init__(self, base_dir: Path) -> None:
15
+ """Initialize the ignore patterns."""
16
+ if not base_dir.is_dir():
17
+ msg = f"Base directory is not a directory: {base_dir}"
18
+ raise ValueError(msg)
19
+
20
+ self.base_dir = base_dir
21
+
22
+ # Check if the base_dir is a valid git repository
23
+ self.git_repo = None
24
+ if is_valid_clone_target(str(base_dir)):
25
+ self.git_repo = git.Repo(base_dir)
26
+
27
+ def should_ignore(self, path: Path) -> bool:
28
+ """Check if a path should be ignored."""
29
+ if path.is_dir():
30
+ return False
31
+
32
+ # Get the path relative to the base_dir
33
+ relative_path = path.relative_to(self.base_dir)
34
+
35
+ # If this file is _part_ of a .git directory, then it should be ignored
36
+ if relative_path.as_posix().startswith(".git"):
37
+ return True
38
+
39
+ # If it is a git repository, then we need to check if the file is ignored
40
+ if self.git_repo and len(self.git_repo.ignored(path)) > 0:
41
+ return True
42
+
43
+ # If the repo has a .noindex file
44
+ noindex_path = Path(self.base_dir / ".noindex")
45
+ if noindex_path.exists():
46
+ with noindex_path.open() as f:
47
+ patterns = [line.strip() for line in f if line.strip()]
48
+ if patterns:
49
+ spec = pathspec.PathSpec.from_lines("gitwildmatch", patterns)
50
+ if spec.match_file(relative_path.as_posix()):
51
+ return True
52
+
53
+ return False
@@ -5,7 +5,10 @@ It includes models for tracking different types of sources (git repositories and
5
5
  folders) and their relationships.
6
6
  """
7
7
 
8
- from sqlalchemy import ForeignKey, Integer, String
8
+ import datetime
9
+ from enum import Enum as EnumType
10
+
11
+ from sqlalchemy import Enum, ForeignKey, Integer, String
9
12
  from sqlalchemy.orm import Mapped, mapped_column
10
13
 
11
14
  from kodit.database import Base, CommonMixin
@@ -14,6 +17,14 @@ from kodit.database import Base, CommonMixin
14
17
  __all__ = ["File", "Source"]
15
18
 
16
19
 
20
+ class SourceType(EnumType):
21
+ """The type of source."""
22
+
23
+ UNKNOWN = 0
24
+ FOLDER = 1
25
+ GIT = 2
26
+
27
+
17
28
  class Source(Base, CommonMixin):
18
29
  """Base model for tracking code sources.
19
30
 
@@ -32,12 +43,34 @@ class Source(Base, CommonMixin):
32
43
  __tablename__ = "sources"
33
44
  uri: Mapped[str] = mapped_column(String(1024), index=True, unique=True)
34
45
  cloned_path: Mapped[str] = mapped_column(String(1024), index=True)
46
+ type: Mapped[SourceType] = mapped_column(
47
+ Enum(SourceType), default=SourceType.UNKNOWN, index=True
48
+ )
35
49
 
36
- def __init__(self, uri: str, cloned_path: str) -> None:
50
+ def __init__(self, uri: str, cloned_path: str, source_type: SourceType) -> None:
37
51
  """Initialize a new Source instance for typing purposes."""
38
52
  super().__init__()
39
53
  self.uri = uri
40
54
  self.cloned_path = cloned_path
55
+ self.type = source_type
56
+
57
+
58
+ class Author(Base, CommonMixin):
59
+ """Author model."""
60
+
61
+ __tablename__ = "authors"
62
+
63
+ name: Mapped[str] = mapped_column(String(255), index=True, unique=True)
64
+ email: Mapped[str] = mapped_column(String(255), index=True, unique=True)
65
+
66
+
67
+ class AuthorFileMapping(Base, CommonMixin):
68
+ """Author file mapping model."""
69
+
70
+ __tablename__ = "author_file_mappings"
71
+
72
+ author_id: Mapped[int] = mapped_column(ForeignKey("authors.id"))
73
+ file_id: Mapped[int] = mapped_column(ForeignKey("files.id"))
41
74
 
42
75
 
43
76
  class File(Base, CommonMixin):
@@ -51,9 +84,12 @@ class File(Base, CommonMixin):
51
84
  cloned_path: Mapped[str] = mapped_column(String(1024), index=True)
52
85
  sha256: Mapped[str] = mapped_column(String(64), default="", index=True)
53
86
  size_bytes: Mapped[int] = mapped_column(Integer, default=0)
87
+ extension: Mapped[str] = mapped_column(String(255), default="", index=True)
54
88
 
55
89
  def __init__( # noqa: PLR0913
56
90
  self,
91
+ created_at: datetime.datetime,
92
+ updated_at: datetime.datetime,
57
93
  source_id: int,
58
94
  cloned_path: str,
59
95
  mime_type: str = "",
@@ -63,6 +99,8 @@ class File(Base, CommonMixin):
63
99
  ) -> None:
64
100
  """Initialize a new File instance for typing purposes."""
65
101
  super().__init__()
102
+ self.created_at = created_at
103
+ self.updated_at = updated_at
66
104
  self.source_id = source_id
67
105
  self.cloned_path = cloned_path
68
106
  self.mime_type = mime_type
@@ -3,7 +3,13 @@
3
3
  from sqlalchemy import func, select
4
4
  from sqlalchemy.ext.asyncio import AsyncSession
5
5
 
6
- from kodit.source.source_models import File, Source
6
+ from kodit.source.source_models import (
7
+ Author,
8
+ AuthorFileMapping,
9
+ File,
10
+ Source,
11
+ SourceType,
12
+ )
7
13
 
8
14
 
9
15
  class SourceRepository:
@@ -22,22 +28,12 @@ class SourceRepository:
22
28
  self.session = session
23
29
 
24
30
  async def create_source(self, source: Source) -> Source:
25
- """Create a new folder source record in the database.
31
+ """Add a new source to the database."""
32
+ # Validate the source
33
+ if source.type == SourceType.UNKNOWN:
34
+ msg = "Source type is required"
35
+ raise ValueError(msg)
26
36
 
27
- This method creates both a Source record and a linked FolderSource record
28
- in a single transaction.
29
-
30
- Args:
31
- path: The absolute path of the folder to create a source for.
32
-
33
- Returns:
34
- The created Source model instance.
35
-
36
- Note:
37
- This method commits the transaction to ensure the source.id is available
38
- for creating the linked FolderSource record.
39
-
40
- """
41
37
  self.session.add(source)
42
38
  await self.session.commit()
43
39
  return source
@@ -52,6 +48,12 @@ class SourceRepository:
52
48
  await self.session.commit()
53
49
  return file
54
50
 
51
+ async def list_files_for_source(self, source_id: int) -> list[File]:
52
+ """List all files for a source."""
53
+ query = select(File).where(File.source_id == source_id)
54
+ result = await self.session.execute(query)
55
+ return list(result.scalars())
56
+
55
57
  async def num_files_for_source(self, source_id: int) -> int:
56
58
  """Get the number of files for a source.
57
59
 
@@ -103,3 +105,36 @@ class SourceRepository:
103
105
  query = select(Source).where(Source.id == source_id)
104
106
  result = await self.session.execute(query)
105
107
  return result.scalar_one_or_none()
108
+
109
+ async def get_or_create_author(self, name: str, email: str) -> Author:
110
+ """Get or create an author by name and email.
111
+
112
+ Args:
113
+ name: The name of the author.
114
+ email: The email of the author.
115
+
116
+ """
117
+ query = select(Author).where(Author.name == name, Author.email == email)
118
+ result = await self.session.execute(query)
119
+ author = result.scalar_one_or_none()
120
+ if not author:
121
+ author = Author(name=name, email=email)
122
+ self.session.add(author)
123
+ await self.session.commit()
124
+ return author
125
+
126
+ async def get_or_create_author_file_mapping(
127
+ self, author_id: int, file_id: int
128
+ ) -> AuthorFileMapping:
129
+ """Create a new author file mapping record in the database."""
130
+ query = select(AuthorFileMapping).where(
131
+ AuthorFileMapping.author_id == author_id,
132
+ AuthorFileMapping.file_id == file_id,
133
+ )
134
+ result = await self.session.execute(query)
135
+ mapping = result.scalar_one_or_none()
136
+ if not mapping:
137
+ mapping = AuthorFileMapping(author_id=author_id, file_id=file_id)
138
+ self.session.add(mapping)
139
+ await self.session.commit()
140
+ return mapping
@@ -8,7 +8,8 @@ source management.
8
8
 
9
9
  import mimetypes
10
10
  import shutil
11
- from datetime import datetime
11
+ import tempfile
12
+ from datetime import UTC, datetime
12
13
  from hashlib import sha256
13
14
  from pathlib import Path
14
15
 
@@ -17,9 +18,15 @@ import git
17
18
  import pydantic
18
19
  import structlog
19
20
  from tqdm import tqdm
20
- from uritools import isuri, urisplit
21
21
 
22
- from kodit.source.source_models import File, Source
22
+ from kodit.source.git import is_valid_clone_target
23
+ from kodit.source.ignore import IgnorePatterns
24
+ from kodit.source.source_models import (
25
+ Author,
26
+ File,
27
+ Source,
28
+ SourceType,
29
+ )
23
30
  from kodit.source.source_repository import SourceRepository
24
31
 
25
32
 
@@ -82,39 +89,16 @@ class SourceService:
82
89
  )
83
90
 
84
91
  async def create(self, uri_or_path_like: str) -> SourceView:
85
- """Create a new source from a URI.
92
+ """Create a new source from a URI or path."""
93
+ # If it's possible to clone it, then do so
94
+ if is_valid_clone_target(uri_or_path_like):
95
+ return await self._create_git_source(uri_or_path_like)
86
96
 
87
- Args:
88
- uri: The URI of the source to create. Can be a git-like URI or a local
89
- directory.
90
-
91
- Raises:
92
- ValueError: If the source type is not supported or if the folder doesn't
93
- exist.
94
-
95
- """
97
+ # Otherwise just treat it as a directory
96
98
  if Path(uri_or_path_like).is_dir():
97
99
  return await self._create_folder_source(Path(uri_or_path_like))
98
- if isuri(uri_or_path_like):
99
- parsed = urisplit(uri_or_path_like)
100
- if parsed.scheme == "file":
101
- return await self._create_folder_source(Path(parsed.path))
102
- if parsed.scheme in ("git", "http", "https") and parsed.path.endswith(
103
- ".git"
104
- ):
105
- return await self._create_git_source(uri_or_path_like)
106
-
107
- # Try adding a .git suffix, sometimes people just pass the url
108
- if not uri_or_path_like.endswith(".git"):
109
- uri_or_path_like = uri_or_path_like + ".git"
110
- try:
111
- return await self._create_git_source(uri_or_path_like)
112
- except git.GitCommandError:
113
- raise
114
- except ValueError:
115
- pass
116
-
117
- msg = f"Unsupported source type: {uri_or_path_like}"
100
+
101
+ msg = f"Unsupported source: {uri_or_path_like}"
118
102
  raise ValueError(msg)
119
103
 
120
104
  async def _create_folder_source(self, directory: Path) -> SourceView:
@@ -159,7 +143,11 @@ class SourceService:
159
143
  )
160
144
 
161
145
  source = await self.repository.create_source(
162
- Source(uri=directory.as_uri(), cloned_path=str(clone_path)),
146
+ Source(
147
+ uri=directory.as_uri(),
148
+ cloned_path=str(clone_path),
149
+ source_type=SourceType.FOLDER,
150
+ ),
163
151
  )
164
152
 
165
153
  # Add all files to the source
@@ -168,7 +156,7 @@ class SourceService:
168
156
 
169
157
  # Process each file in the source directory
170
158
  for path in tqdm(clone_path.rglob("*"), total=file_count, leave=False):
171
- await self._process_file(source.id, path.absolute())
159
+ await self._process_file(source, path.absolute())
172
160
 
173
161
  return SourceView(
174
162
  id=source.id,
@@ -188,7 +176,13 @@ class SourceService:
188
176
  ValueError: If the repository cloning fails.
189
177
 
190
178
  """
191
- # Check if the repository is already added
179
+ self.log.debug("Normalising git uri", uri=uri)
180
+ with tempfile.TemporaryDirectory() as temp_dir:
181
+ git.Repo.clone_from(uri, temp_dir)
182
+ remote = git.Repo(temp_dir).remote()
183
+ uri = remote.url
184
+
185
+ self.log.debug("Checking if source already exists", uri=uri)
192
186
  source = await self.repository.get_source_by_uri(uri)
193
187
 
194
188
  if source:
@@ -208,18 +202,27 @@ class SourceService:
208
202
  msg = f"Failed to clone repository: {e}"
209
203
  raise ValueError(msg) from e
210
204
 
205
+ self.log.debug("Creating source", uri=uri, clone_path=str(clone_path))
211
206
  source = await self.repository.create_source(
212
- Source(uri=uri, cloned_path=str(clone_path)),
207
+ Source(
208
+ uri=uri,
209
+ cloned_path=str(clone_path),
210
+ source_type=SourceType.GIT,
211
+ ),
213
212
  )
214
213
 
215
- # Add all files to the source
216
- # Count total files for progress bar
217
- file_count = sum(1 for _ in clone_path.rglob("*") if _.is_file())
214
+ # Get the ignore patterns for this source
215
+ ignore_patterns = IgnorePatterns(clone_path)
216
+
217
+ # Get all files that are not ignored
218
+ files = [
219
+ f for f in clone_path.rglob("*") if not ignore_patterns.should_ignore(f)
220
+ ]
218
221
 
219
222
  # Process each file in the source directory
220
- self.log.info("Inspecting files", source_id=source.id)
221
- for path in tqdm(clone_path.rglob("*"), total=file_count, leave=False):
222
- await self._process_file(source.id, path.absolute())
223
+ self.log.info("Inspecting files", source_id=source.id, num_files=len(files))
224
+ for path in tqdm(files, total=len(files), leave=False):
225
+ await self._process_file(source, path.absolute())
223
226
 
224
227
  return SourceView(
225
228
  id=source.id,
@@ -231,32 +234,79 @@ class SourceService:
231
234
 
232
235
  async def _process_file(
233
236
  self,
234
- source_id: int,
235
- cloned_path: Path,
237
+ source: Source,
238
+ cloned_file: Path,
236
239
  ) -> None:
237
240
  """Process a single file for indexing."""
238
- if not cloned_path.is_file():
241
+ if not cloned_file.is_file():
239
242
  return
240
243
 
241
- async with aiofiles.open(cloned_path, "rb") as f:
244
+ # If this file exists in a git repository, pull out the file's metadata
245
+ authors: list[Author] = []
246
+ first_modified_at: datetime | None = None
247
+ last_modified_at: datetime | None = None
248
+ if source.type == SourceType.GIT:
249
+ # Get the git repository
250
+ git_repo = git.Repo(source.cloned_path)
251
+
252
+ # Get the last commit that touched this file
253
+ commits = list(
254
+ git_repo.iter_commits(
255
+ paths=str(cloned_file),
256
+ all=True,
257
+ )
258
+ )
259
+ if len(commits) > 0:
260
+ last_modified_at = commits[0].committed_datetime
261
+ first_modified_at = commits[-1].committed_datetime
262
+
263
+ # Get the file's blame
264
+ blames = git_repo.blame("HEAD", str(cloned_file))
265
+
266
+ # Extract the blame's authors
267
+ actors = [
268
+ commit.author
269
+ for blame in blames or []
270
+ for commit in blame
271
+ if isinstance(commit, git.Commit)
272
+ ]
273
+
274
+ # Get or create the authors in the database
275
+ for actor in actors:
276
+ if actor.name or actor.email:
277
+ author = await self.repository.get_or_create_author(
278
+ actor.name or "", actor.email or ""
279
+ )
280
+ authors.append(author)
281
+
282
+ # Create the file record
283
+ async with aiofiles.open(cloned_file, "rb") as f:
242
284
  content = await f.read()
243
- mime_type = mimetypes.guess_type(cloned_path)
285
+ mime_type = mimetypes.guess_type(cloned_file)
244
286
  sha = sha256(content).hexdigest()
245
287
 
246
288
  # Create file record
247
289
  file = File(
248
- source_id=source_id,
249
- cloned_path=cloned_path.as_posix(),
290
+ created_at=first_modified_at or datetime.now(UTC),
291
+ updated_at=last_modified_at or datetime.now(UTC),
292
+ source_id=source.id,
293
+ cloned_path=str(cloned_file),
250
294
  mime_type=mime_type[0]
251
295
  if mime_type and mime_type[0]
252
296
  else "application/octet-stream",
253
- uri=cloned_path.as_uri(),
297
+ uri=cloned_file.as_uri(),
254
298
  sha256=sha,
255
299
  size_bytes=len(content),
256
300
  )
257
301
 
258
302
  await self.repository.create_file(file)
259
303
 
304
+ # Create mapping of authors to the file
305
+ for author in authors:
306
+ await self.repository.get_or_create_author_file_mapping(
307
+ author_id=author.id, file_id=file.id
308
+ )
309
+
260
310
  async def list_sources(self) -> list[SourceView]:
261
311
  """List all available sources.
262
312
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kodit
3
- Version: 0.2.0
3
+ Version: 0.2.2
4
4
  Summary: Code indexing for better AI code generation
5
5
  Project-URL: Homepage, https://docs.helixml.tech/kodit/
6
6
  Project-URL: Documentation, https://docs.helixml.tech/kodit/
@@ -18,6 +18,7 @@ Classifier: Programming Language :: Python :: 3.12
18
18
  Classifier: Programming Language :: Python :: 3.13
19
19
  Classifier: Topic :: Software Development :: Code Generators
20
20
  Requires-Python: >=3.12
21
+ Requires-Dist: accelerate>=1.7.0
21
22
  Requires-Dist: aiofiles>=24.1.0
22
23
  Requires-Dist: aiosqlite>=0.20.0
23
24
  Requires-Dist: alembic>=1.15.2
@@ -35,6 +36,7 @@ Requires-Dist: hf-xet>=1.1.2
35
36
  Requires-Dist: httpx-retries>=0.3.2
36
37
  Requires-Dist: httpx>=0.28.1
37
38
  Requires-Dist: openai>=1.82.0
39
+ Requires-Dist: pathspec>=0.12.1
38
40
  Requires-Dist: posthog>=4.0.1
39
41
  Requires-Dist: pydantic-settings>=2.9.1
40
42
  Requires-Dist: pytable-formatter>=0.1.1
@@ -92,6 +94,7 @@ code. This index is used to build a snippet library, ready for ingestion into an
92
94
  - Build comprehensive snippet libraries for LLM ingestion
93
95
  - Support for multiple codebase types and languages
94
96
  - Efficient indexing and search capabilities
97
+ - Privacy first: respects .gitignore and .noindex files.
95
98
 
96
99
  ### MCP Server
97
100
 
@@ -1,9 +1,9 @@
1
1
  kodit/.gitignore,sha256=ztkjgRwL9Uud1OEi36hGQeDGk3OLK1NfDEO8YqGYy8o,11
2
2
  kodit/__init__.py,sha256=aEKHYninUq1yh6jaNfvJBYg-6fenpN132nJt1UU6Jxs,59
3
- kodit/_version.py,sha256=iB5DfB5V6YB5Wo4JmvS-txT42QtmGaWcWp3udRT7zCI,511
3
+ kodit/_version.py,sha256=OjGGK5TcHVG44Y62aAqeJH4CskkZoY9ydbHOtCDew50,511
4
4
  kodit/app.py,sha256=Mr5BFHOHx5zppwjC4XPWVvHjwgl1yrKbUjTWXKubJQM,891
5
5
  kodit/cli.py,sha256=i7eEt0FdIQGEfXKFte-8fBcZZGE8BPXBp40aGwJDQGI,11323
6
- kodit/config.py,sha256=2W2u5J8j-Mbt-C4xzOuK-PeuDCx0S_rnCXPhBwvfLT4,4353
6
+ kodit/config.py,sha256=3yh7hfLSILjZK_qJMhcExwRcrWJ0b5Eb1JjjOvMPJZo,4146
7
7
  kodit/database.py,sha256=WB1KpVxUYPgiJGU0gJa2hqytYB8wJEJ5z3WayhWzNMU,2403
8
8
  kodit/log.py,sha256=HU1OmuxO4FcVw61k4WW7Y4WM7BrDaeplw1PcBHhuIZY,5434
9
9
  kodit/mcp.py,sha256=QruyPskWB0_x59pkfj5BBeXuR13GMny5TAZEa2j4U9s,5752
@@ -11,36 +11,37 @@ kodit/middleware.py,sha256=I6FOkqG9-8RH5kR1-0ZoQWfE4qLCB8lZYv8H_OCH29o,2714
11
11
  kodit/bm25/__init__.py,sha256=j8zyriNWhbwE5Lbybzg1hQAhANlU9mKHWw4beeUR6og,19
12
12
  kodit/bm25/keyword_search_factory.py,sha256=rp-wx3DJsc2KlELK1V337EyeYvmwnMQwUqOo1WVPSmg,631
13
13
  kodit/bm25/keyword_search_service.py,sha256=aBbWQKgQmi2re3EIHdXFS00n7Wj3b2D0pZsLZ4qmHfE,754
14
- kodit/bm25/local_bm25.py,sha256=AAbFhbQDqyL3d7jsPL7W4HsLxdoYctaDsREUXOLy6jM,3260
15
- kodit/bm25/vectorchord_bm25.py,sha256=_nGrkUReYLLV-L8RIuIVLwjuhSYZl9T532n5OVf0kWs,6393
14
+ kodit/bm25/local_bm25.py,sha256=nokrd_xAeqXi3m68X5P1R5KBhRRB1E2L_J6Zgm26PCg,3869
15
+ kodit/bm25/vectorchord_bm25.py,sha256=0p_FgliaoevB8GLSmzWnV3zUjdcWgCgOKIpLURr7Qfo,6549
16
16
  kodit/embedding/__init__.py,sha256=h9NXzDA1r-K23nvBajBV-RJzHJN0p3UJ7UQsmdnOoRw,24
17
- kodit/embedding/embedding_factory.py,sha256=UGnFRyyQXazSUOwyW4Hg7Vq2-kfAoDj9lD4CTLu8x04,1630
17
+ kodit/embedding/embedding_factory.py,sha256=UbrTl3NEqBBH3ecvRG7vGW5wuvUMbWJEWbAAFALOGqs,2141
18
18
  kodit/embedding/embedding_models.py,sha256=rN90vSs86dYiqoawcp8E9jtwY31JoJXYfaDlsJK7uqc,656
19
19
  kodit/embedding/embedding_repository.py,sha256=-ux3scpBzel8c0pMH9fNOEsSXFIzl-IfgaWrkTb1szo,6907
20
- kodit/embedding/local_vector_search_service.py,sha256=hkF0qlfzjyGt400qIX9Mr6B7b7i8WvYIYWN2Z2C_pcs,1907
20
+ kodit/embedding/local_vector_search_service.py,sha256=dgMi8hQNUbYEgHnEYmLIpon4yLduoNUpu7k7VP6sOHI,2042
21
21
  kodit/embedding/vector_search_service.py,sha256=pQJ129QjGrAWOXzqkywmgtDRpy8_gtzYgkivyqF9Vrs,1009
22
- kodit/embedding/vectorchord_vector_search_service.py,sha256=63Xf7_nAz3xWOwrmZibw8Q-xoRdCrPDDpdSA_WE7mrc,5131
22
+ kodit/embedding/vectorchord_vector_search_service.py,sha256=TKNR3HgWHwwWtJ1SsvSaj_BXLJ_uw6Bdr_tpaePMeAA,5383
23
23
  kodit/embedding/embedding_provider/__init__.py,sha256=h9NXzDA1r-K23nvBajBV-RJzHJN0p3UJ7UQsmdnOoRw,24
24
- kodit/embedding/embedding_provider/embedding_provider.py,sha256=Tf3bwUsUMzAgoyLFM5qBtOLqPp1qr03TzrwGczkDvy0,1835
24
+ kodit/embedding/embedding_provider/embedding_provider.py,sha256=IC7fZaZ_ze-DxpxKfK44pRDwHWUQhVIqVKKQ3alO5Qc,1882
25
25
  kodit/embedding/embedding_provider/hash_embedding_provider.py,sha256=nAhlhh8j8PqqCCbhVl26Y8ntFBm2vJBCtB4X04g5Wwg,2638
26
- kodit/embedding/embedding_provider/local_embedding_provider.py,sha256=4ER-UPq506Y0TWU6qcs0nUqw6bSKQkSrdog-DhNQWM8,1906
27
- kodit/embedding/embedding_provider/openai_embedding_provider.py,sha256=V_jdUXiaGdslplwxMlfgFc4_hAVS2eaJXMTs2C7RiLI,2666
26
+ kodit/embedding/embedding_provider/local_embedding_provider.py,sha256=WP8lw6XG7v1_5Mw4_rhIOETooYRsxhkwmFaXCqCouQU,1977
27
+ kodit/embedding/embedding_provider/openai_embedding_provider.py,sha256=-phz5FKYM_tI3Q4_3SPzjzIOK3k92Uk52TAOTmoVoWI,2722
28
28
  kodit/enrichment/__init__.py,sha256=vBEolHpKaHUhfINX0dSGyAPlvgpLNAer9YzFtdvCB24,18
29
- kodit/enrichment/enrichment_factory.py,sha256=vKjkUTdhj74IW2S4GENDWdWMJx6BwUSZjJGDC0i7DSk,787
29
+ kodit/enrichment/enrichment_factory.py,sha256=JbWFNciB6Yf79SFVjG9UhLgCcrXZ1rIJrenU8QmNLBE,1411
30
30
  kodit/enrichment/enrichment_service.py,sha256=87Sd3gGbEMJYb_wVrHG8L1yGIZmQNR7foUS4_y94azI,977
31
31
  kodit/enrichment/enrichment_provider/__init__.py,sha256=klf8iuLVWX4iRz-DZQauFFNAoJC5CByczh48TBZPW-o,27
32
32
  kodit/enrichment/enrichment_provider/enrichment_provider.py,sha256=E0H5rq3OENM0yYbA8K_3nSnj5lUHCpoIOqpWLo-2MVU,413
33
- kodit/enrichment/enrichment_provider/local_enrichment_provider.py,sha256=bR6HR1gH7wtZdMLOwaKdASjvllRo1FlNW9GyZC11zAM,2164
34
- kodit/enrichment/enrichment_provider/openai_enrichment_provider.py,sha256=gYuFTAeIVdQNlCUvNSPgRoiRwCvRD0C8419h8ubyABA,2725
33
+ kodit/enrichment/enrichment_provider/local_enrichment_provider.py,sha256=RqwUD0BnwRQ8zlkFNkaKq8d58r33k2jIdnSdf6zla1w,3325
34
+ kodit/enrichment/enrichment_provider/openai_enrichment_provider.py,sha256=0Yw7h9RXptoI4bKuqJSKIRQXPUUhNV7eACavgoy_T8s,2874
35
35
  kodit/indexing/__init__.py,sha256=cPyi2Iej3G1JFWlWr7X80_UrsMaTu5W5rBwgif1B3xo,75
36
36
  kodit/indexing/fusion.py,sha256=TZb4fPAedXdEUXzwzOofW98QIOymdbclBOP1KOijuEk,1674
37
37
  kodit/indexing/indexing_models.py,sha256=6NX9HVcj6Pu9ePwHC7n-PWSyAgukpJq0nCNmUIigtbo,1282
38
38
  kodit/indexing/indexing_repository.py,sha256=dqOS0pxKM6bUjMXWqYukAK8XdiD36OnskFASgZRXRQM,6955
39
- kodit/indexing/indexing_service.py,sha256=_uhoqBic3_zXNJOsKt_w-TgX5ebf7OBwbqMdO9zectM,10779
39
+ kodit/indexing/indexing_service.py,sha256=79BZ4yaSJqADkivzjsq1bDCBtbfWikVRC7Fjlp1HmZw,10885
40
40
  kodit/migrations/README,sha256=ISVtAOvqvKk_5ThM5ioJE-lMkvf9IbknFUFVU_vPma4,58
41
41
  kodit/migrations/__init__.py,sha256=lP5MuwlyWRMO6UcDWnQcQ3G-GYHcFb6rl9gYPHJ1sjo,40
42
42
  kodit/migrations/env.py,sha256=w1M7OZh-ZeR2dPHS0ByXAUxQjfZQ8xIzMseWuzLDTWw,2469
43
43
  kodit/migrations/script.py.mako,sha256=zWziKtiwYKEWuwPV_HBNHwa9LCT45_bi01-uSNFaOOE,703
44
+ kodit/migrations/versions/42e836b21102_add_authors.py,sha256=KmXlHb_y8bIa_ABNU67zZi13r0DAfHA9G8tjQNkdITM,2638
44
45
  kodit/migrations/versions/7c3bbc2ab32b_add_embeddings_table.py,sha256=-61qol9PfQKILCDQRA5jEaats9aGZs9Wdtp-j-38SF4,1644
45
46
  kodit/migrations/versions/85155663351e_initial.py,sha256=Cg7zlF871o9ShV5rQMQ1v7hRV7fI59veDY9cjtTrs-8,3306
46
47
  kodit/migrations/versions/__init__.py,sha256=9-lHzptItTzq_fomdIRBegQNm4Znx6pVjwD4MiqRIdo,36
@@ -55,13 +56,15 @@ kodit/snippets/languages/javascript.scm,sha256=Ini5TsVNmcBKQ8aL46a5Id9ut0g9Udmvm
55
56
  kodit/snippets/languages/python.scm,sha256=ee85R9PBzwye3IMTE7-iVoKWd_ViU3EJISTyrFGrVeo,429
56
57
  kodit/snippets/languages/typescript.scm,sha256=U-ujbbv4tylbUBj9wuhL-e5cW6hmgPCNs4xrIX3r_hE,448
57
58
  kodit/source/__init__.py,sha256=1NTZyPdjThVQpZO1Mp1ColVsS7sqYanOVLqnoqV9Ipo,83
58
- kodit/source/source_models.py,sha256=kcC59XPSDDMth2mOYK3FakqTN0jxKFaTDch0ejyD9Sw,2446
59
- kodit/source/source_repository.py,sha256=0EksMpoLzdkfe8S4eeCm4Sf7TuxsOzOzaF4BBsMYo-4,3163
60
- kodit/source/source_service.py,sha256=u_GaH07ewakThQJRfT8O_yZ54A52qLtJuM1bF3xUT2A,9633
59
+ kodit/source/git.py,sha256=CpNczc06SbxpzfQKq76lZFzuol10ZJvTRSzeXW9DFUs,363
60
+ kodit/source/ignore.py,sha256=W7cuIrYlgfu3S1qyoIepXe8PqYmtFv61Tt5RO8cbZbg,1701
61
+ kodit/source/source_models.py,sha256=lCaaoukLlMHuRWJBuYM2nkNKGtFASgbk7ZXq8kp4H5c,3519
62
+ kodit/source/source_repository.py,sha256=4L-W0uE4LOB9LQlefk5f2sgHlsJjj8t33USPxU0na40,4448
63
+ kodit/source/source_service.py,sha256=v-lY-7tsNFCyXo9yCUo7Q00NOWYKGiDB_M2-Hr8hp3U,11391
61
64
  kodit/util/__init__.py,sha256=bPu6CtqDWCRGU7VgW2_aiQrCBi8G89FS6k1PjvDajJ0,37
62
65
  kodit/util/spinner.py,sha256=R9bzrHtBiIH6IfLbmsIVHL53s8vg-tqW4lwGGALu4dw,1932
63
- kodit-0.2.0.dist-info/METADATA,sha256=0CdegivoI9rcZLpmwzGTFfW_bui1D1tjNtz7ajXFOJk,5735
64
- kodit-0.2.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
65
- kodit-0.2.0.dist-info/entry_points.txt,sha256=hoTn-1aKyTItjnY91fnO-rV5uaWQLQ-Vi7V5et2IbHY,40
66
- kodit-0.2.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
67
- kodit-0.2.0.dist-info/RECORD,,
66
+ kodit-0.2.2.dist-info/METADATA,sha256=UU1curOx-XMql_IiXty-eoz-MJrd5QdlzfCj7ZoSzhg,5857
67
+ kodit-0.2.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
68
+ kodit-0.2.2.dist-info/entry_points.txt,sha256=hoTn-1aKyTItjnY91fnO-rV5uaWQLQ-Vi7V5et2IbHY,40
69
+ kodit-0.2.2.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
70
+ kodit-0.2.2.dist-info/RECORD,,
File without changes