kodit 0.2.0__tar.gz → 0.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of kodit might be problematic. More details are available on the original diff page.

Files changed (135)
  1. {kodit-0.2.0 → kodit-0.2.1}/.github/workflows/docker.yaml +3 -0
  2. {kodit-0.2.0 → kodit-0.2.1}/.gitignore +1 -1
  3. {kodit-0.2.0 → kodit-0.2.1}/Dockerfile +10 -1
  4. {kodit-0.2.0 → kodit-0.2.1}/PKG-INFO +2 -1
  5. {kodit-0.2.0 → kodit-0.2.1}/pyproject.toml +1 -0
  6. {kodit-0.2.0 → kodit-0.2.1}/src/kodit/_version.py +2 -2
  7. {kodit-0.2.0 → kodit-0.2.1}/src/kodit/bm25/local_bm25.py +31 -17
  8. {kodit-0.2.0 → kodit-0.2.1}/src/kodit/config.py +10 -3
  9. {kodit-0.2.0 → kodit-0.2.1}/src/kodit/embedding/embedding_provider/embedding_provider.py +8 -4
  10. {kodit-0.2.0 → kodit-0.2.1}/src/kodit/embedding/embedding_provider/local_embedding_provider.py +8 -2
  11. kodit-0.2.1/src/kodit/enrichment/enrichment_provider/local_enrichment_provider.py +88 -0
  12. {kodit-0.2.0 → kodit-0.2.1}/src/kodit/source/source_service.py +5 -22
  13. kodit-0.2.1/tests/docker-smoke.sh +13 -0
  14. {kodit-0.2.0 → kodit-0.2.1}/tests/kodit/bm25/local_bm25_test.py +1 -8
  15. {kodit-0.2.0 → kodit-0.2.1}/tests/kodit/source/source_service_test.py +11 -3
  16. {kodit-0.2.0 → kodit-0.2.1}/uv.lock +20 -0
  17. kodit-0.2.0/src/kodit/enrichment/enrichment_provider/local_enrichment_provider.py +0 -63
  18. {kodit-0.2.0 → kodit-0.2.1}/.cursor/rules/kodit.mdc +0 -0
  19. {kodit-0.2.0 → kodit-0.2.1}/.github/CODE_OF_CONDUCT.md +0 -0
  20. {kodit-0.2.0 → kodit-0.2.1}/.github/CONTRIBUTING.md +0 -0
  21. {kodit-0.2.0 → kodit-0.2.1}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
  22. {kodit-0.2.0 → kodit-0.2.1}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
  23. {kodit-0.2.0 → kodit-0.2.1}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
  24. {kodit-0.2.0 → kodit-0.2.1}/.github/dependabot.yml +0 -0
  25. {kodit-0.2.0 → kodit-0.2.1}/.github/workflows/docs.yaml +0 -0
  26. {kodit-0.2.0 → kodit-0.2.1}/.github/workflows/pull_request.yaml +0 -0
  27. {kodit-0.2.0 → kodit-0.2.1}/.github/workflows/pypi-test.yaml +0 -0
  28. {kodit-0.2.0 → kodit-0.2.1}/.github/workflows/pypi.yaml +0 -0
  29. {kodit-0.2.0 → kodit-0.2.1}/.github/workflows/test.yaml +0 -0
  30. {kodit-0.2.0 → kodit-0.2.1}/.python-version +0 -0
  31. {kodit-0.2.0 → kodit-0.2.1}/.vscode/launch.json +0 -0
  32. {kodit-0.2.0 → kodit-0.2.1}/.vscode/settings.json +0 -0
  33. {kodit-0.2.0 → kodit-0.2.1}/LICENSE +0 -0
  34. {kodit-0.2.0 → kodit-0.2.1}/README.md +0 -0
  35. {kodit-0.2.0 → kodit-0.2.1}/alembic.ini +0 -0
  36. {kodit-0.2.0 → kodit-0.2.1}/docs/_index.md +0 -0
  37. {kodit-0.2.0 → kodit-0.2.1}/docs/demos/_index.md +0 -0
  38. {kodit-0.2.0 → kodit-0.2.1}/docs/demos/knock-knock-auth/index.md +0 -0
  39. {kodit-0.2.0 → kodit-0.2.1}/docs/developer/index.md +0 -0
  40. {kodit-0.2.0 → kodit-0.2.1}/docs/getting-started/_index.md +0 -0
  41. {kodit-0.2.0 → kodit-0.2.1}/docs/getting-started/installation/index.md +0 -0
  42. {kodit-0.2.0 → kodit-0.2.1}/docs/getting-started/integration/index.md +0 -0
  43. {kodit-0.2.0 → kodit-0.2.1}/docs/getting-started/quick-start/index.md +0 -0
  44. {kodit-0.2.0 → kodit-0.2.1}/docs/reference/_index.md +0 -0
  45. {kodit-0.2.0 → kodit-0.2.1}/docs/reference/configuration/index.md +0 -0
  46. {kodit-0.2.0 → kodit-0.2.1}/docs/reference/telemetry/index.md +0 -0
  47. {kodit-0.2.0 → kodit-0.2.1}/src/kodit/.gitignore +0 -0
  48. {kodit-0.2.0 → kodit-0.2.1}/src/kodit/__init__.py +0 -0
  49. {kodit-0.2.0 → kodit-0.2.1}/src/kodit/app.py +0 -0
  50. {kodit-0.2.0 → kodit-0.2.1}/src/kodit/bm25/__init__.py +0 -0
  51. {kodit-0.2.0 → kodit-0.2.1}/src/kodit/bm25/keyword_search_factory.py +0 -0
  52. {kodit-0.2.0 → kodit-0.2.1}/src/kodit/bm25/keyword_search_service.py +0 -0
  53. {kodit-0.2.0 → kodit-0.2.1}/src/kodit/bm25/vectorchord_bm25.py +0 -0
  54. {kodit-0.2.0 → kodit-0.2.1}/src/kodit/cli.py +0 -0
  55. {kodit-0.2.0 → kodit-0.2.1}/src/kodit/database.py +0 -0
  56. {kodit-0.2.0 → kodit-0.2.1}/src/kodit/embedding/__init__.py +0 -0
  57. {kodit-0.2.0 → kodit-0.2.1}/src/kodit/embedding/embedding_factory.py +0 -0
  58. {kodit-0.2.0 → kodit-0.2.1}/src/kodit/embedding/embedding_models.py +0 -0
  59. {kodit-0.2.0 → kodit-0.2.1}/src/kodit/embedding/embedding_provider/__init__.py +0 -0
  60. {kodit-0.2.0 → kodit-0.2.1}/src/kodit/embedding/embedding_provider/hash_embedding_provider.py +0 -0
  61. {kodit-0.2.0 → kodit-0.2.1}/src/kodit/embedding/embedding_provider/openai_embedding_provider.py +0 -0
  62. {kodit-0.2.0 → kodit-0.2.1}/src/kodit/embedding/embedding_repository.py +0 -0
  63. {kodit-0.2.0 → kodit-0.2.1}/src/kodit/embedding/local_vector_search_service.py +0 -0
  64. {kodit-0.2.0 → kodit-0.2.1}/src/kodit/embedding/vector_search_service.py +0 -0
  65. {kodit-0.2.0 → kodit-0.2.1}/src/kodit/embedding/vectorchord_vector_search_service.py +0 -0
  66. {kodit-0.2.0 → kodit-0.2.1}/src/kodit/enrichment/__init__.py +0 -0
  67. {kodit-0.2.0 → kodit-0.2.1}/src/kodit/enrichment/enrichment_factory.py +0 -0
  68. {kodit-0.2.0 → kodit-0.2.1}/src/kodit/enrichment/enrichment_provider/__init__.py +0 -0
  69. {kodit-0.2.0 → kodit-0.2.1}/src/kodit/enrichment/enrichment_provider/enrichment_provider.py +0 -0
  70. {kodit-0.2.0 → kodit-0.2.1}/src/kodit/enrichment/enrichment_provider/openai_enrichment_provider.py +0 -0
  71. {kodit-0.2.0 → kodit-0.2.1}/src/kodit/enrichment/enrichment_service.py +0 -0
  72. {kodit-0.2.0 → kodit-0.2.1}/src/kodit/indexing/__init__.py +0 -0
  73. {kodit-0.2.0 → kodit-0.2.1}/src/kodit/indexing/fusion.py +0 -0
  74. {kodit-0.2.0 → kodit-0.2.1}/src/kodit/indexing/indexing_models.py +0 -0
  75. {kodit-0.2.0 → kodit-0.2.1}/src/kodit/indexing/indexing_repository.py +0 -0
  76. {kodit-0.2.0 → kodit-0.2.1}/src/kodit/indexing/indexing_service.py +0 -0
  77. {kodit-0.2.0 → kodit-0.2.1}/src/kodit/log.py +0 -0
  78. {kodit-0.2.0 → kodit-0.2.1}/src/kodit/mcp.py +0 -0
  79. {kodit-0.2.0 → kodit-0.2.1}/src/kodit/middleware.py +0 -0
  80. {kodit-0.2.0 → kodit-0.2.1}/src/kodit/migrations/README +0 -0
  81. {kodit-0.2.0 → kodit-0.2.1}/src/kodit/migrations/__init__.py +0 -0
  82. {kodit-0.2.0 → kodit-0.2.1}/src/kodit/migrations/env.py +0 -0
  83. {kodit-0.2.0 → kodit-0.2.1}/src/kodit/migrations/script.py.mako +0 -0
  84. {kodit-0.2.0 → kodit-0.2.1}/src/kodit/migrations/versions/7c3bbc2ab32b_add_embeddings_table.py +0 -0
  85. {kodit-0.2.0 → kodit-0.2.1}/src/kodit/migrations/versions/85155663351e_initial.py +0 -0
  86. {kodit-0.2.0 → kodit-0.2.1}/src/kodit/migrations/versions/__init__.py +0 -0
  87. {kodit-0.2.0 → kodit-0.2.1}/src/kodit/migrations/versions/c3f5137d30f5_index_all_the_things.py +0 -0
  88. {kodit-0.2.0 → kodit-0.2.1}/src/kodit/snippets/__init__.py +0 -0
  89. {kodit-0.2.0 → kodit-0.2.1}/src/kodit/snippets/languages/__init__.py +0 -0
  90. {kodit-0.2.0 → kodit-0.2.1}/src/kodit/snippets/languages/csharp.scm +0 -0
  91. {kodit-0.2.0 → kodit-0.2.1}/src/kodit/snippets/languages/go.scm +0 -0
  92. {kodit-0.2.0 → kodit-0.2.1}/src/kodit/snippets/languages/javascript.scm +0 -0
  93. {kodit-0.2.0 → kodit-0.2.1}/src/kodit/snippets/languages/python.scm +0 -0
  94. {kodit-0.2.0 → kodit-0.2.1}/src/kodit/snippets/languages/typescript.scm +0 -0
  95. {kodit-0.2.0 → kodit-0.2.1}/src/kodit/snippets/method_snippets.py +0 -0
  96. {kodit-0.2.0 → kodit-0.2.1}/src/kodit/snippets/snippets.py +0 -0
  97. {kodit-0.2.0 → kodit-0.2.1}/src/kodit/source/__init__.py +0 -0
  98. {kodit-0.2.0 → kodit-0.2.1}/src/kodit/source/source_models.py +0 -0
  99. {kodit-0.2.0 → kodit-0.2.1}/src/kodit/source/source_repository.py +0 -0
  100. {kodit-0.2.0 → kodit-0.2.1}/src/kodit/util/__init__.py +0 -0
  101. {kodit-0.2.0 → kodit-0.2.1}/src/kodit/util/spinner.py +0 -0
  102. {kodit-0.2.0 → kodit-0.2.1}/tests/__init__.py +0 -0
  103. {kodit-0.2.0 → kodit-0.2.1}/tests/conftest.py +0 -0
  104. {kodit-0.2.0 → kodit-0.2.1}/tests/experiments/cline-prompt-regression-tests/cline_prompt.txt +0 -0
  105. {kodit-0.2.0 → kodit-0.2.1}/tests/experiments/cline-prompt-regression-tests/cline_prompt_test.py +0 -0
  106. {kodit-0.2.0 → kodit-0.2.1}/tests/experiments/embedding.py +0 -0
  107. {kodit-0.2.0 → kodit-0.2.1}/tests/experiments/similarity_test.py +0 -0
  108. {kodit-0.2.0 → kodit-0.2.1}/tests/kodit/__init__.py +0 -0
  109. {kodit-0.2.0 → kodit-0.2.1}/tests/kodit/bm25/vectorchord_repository_test.py +0 -0
  110. {kodit-0.2.0 → kodit-0.2.1}/tests/kodit/cli_test.py +0 -0
  111. {kodit-0.2.0 → kodit-0.2.1}/tests/kodit/e2e.py +0 -0
  112. {kodit-0.2.0 → kodit-0.2.1}/tests/kodit/embedding/__init__.py +0 -0
  113. {kodit-0.2.0 → kodit-0.2.1}/tests/kodit/embedding/embedding_provider/local_embedding_provider_test.py +0 -0
  114. {kodit-0.2.0 → kodit-0.2.1}/tests/kodit/embedding/embedding_provider/openai_embedding_provider_test.py +0 -0
  115. {kodit-0.2.0 → kodit-0.2.1}/tests/kodit/embedding/local_vector_search_service_test.py +0 -0
  116. {kodit-0.2.0 → kodit-0.2.1}/tests/kodit/embedding/vectorchord_vector_search_service_test.py +0 -0
  117. {kodit-0.2.0 → kodit-0.2.1}/tests/kodit/enrichment/__init__.py +0 -0
  118. {kodit-0.2.0 → kodit-0.2.1}/tests/kodit/enrichment/enrichment_provider/__init__.py +0 -0
  119. {kodit-0.2.0 → kodit-0.2.1}/tests/kodit/enrichment/enrichment_provider/openai_enrichment_provider_test.py +0 -0
  120. {kodit-0.2.0 → kodit-0.2.1}/tests/kodit/indexing/__init__.py +0 -0
  121. {kodit-0.2.0 → kodit-0.2.1}/tests/kodit/indexing/indexing_repository_test.py +0 -0
  122. {kodit-0.2.0 → kodit-0.2.1}/tests/kodit/indexing/indexing_service_test.py +0 -0
  123. {kodit-0.2.0 → kodit-0.2.1}/tests/kodit/mcp_test.py +0 -0
  124. {kodit-0.2.0 → kodit-0.2.1}/tests/kodit/snippets/__init__.py +0 -0
  125. {kodit-0.2.0 → kodit-0.2.1}/tests/kodit/snippets/csharp.cs +0 -0
  126. {kodit-0.2.0 → kodit-0.2.1}/tests/kodit/snippets/detect_language_test.py +0 -0
  127. {kodit-0.2.0 → kodit-0.2.1}/tests/kodit/snippets/golang.go +0 -0
  128. {kodit-0.2.0 → kodit-0.2.1}/tests/kodit/snippets/javascript.js +0 -0
  129. {kodit-0.2.0 → kodit-0.2.1}/tests/kodit/snippets/knock-knock-server.py +0 -0
  130. {kodit-0.2.0 → kodit-0.2.1}/tests/kodit/snippets/method_extraction_test.py +0 -0
  131. {kodit-0.2.0 → kodit-0.2.1}/tests/kodit/snippets/python.py +0 -0
  132. {kodit-0.2.0 → kodit-0.2.1}/tests/kodit/snippets/typescript.tsx +0 -0
  133. {kodit-0.2.0 → kodit-0.2.1}/tests/kodit/source/__init__.py +0 -0
  134. {kodit-0.2.0 → kodit-0.2.1}/tests/performance/similarity.py +0 -0
  135. {kodit-0.2.0 → kodit-0.2.1}/tests/smoke.sh +0 -0
@@ -26,6 +26,9 @@ jobs:
26
26
  load: true
27
27
  tags: ${{ env.TEST_TAG }}
28
28
 
29
+ - name: Docker smoke test
30
+ run: ./tests/docker-smoke.sh
31
+
29
32
  push_to_registry:
30
33
  name: Push Docker image to registry
31
34
  runs-on: ubuntu-latest
@@ -128,7 +128,7 @@ celerybeat.pid
128
128
  *.sage.py
129
129
 
130
130
  # Environments
131
- .env
131
+ .env*
132
132
  .venv
133
133
  env/
134
134
  venv/
@@ -14,7 +14,8 @@ apt-get update -qy
14
14
  apt-get install -qyy \
15
15
  -o APT::Install-Recommends=false \
16
16
  -o APT::Install-Suggests=false \
17
- git
17
+ git \
18
+ build-essential
18
19
  EOT
19
20
 
20
21
  # Security-conscious organizations should package/review uv themselves.
@@ -63,6 +64,14 @@ RUN --mount=type=cache,target=/root/.cache \
63
64
  FROM python:3.13.4-slim-bookworm
64
65
  SHELL ["sh", "-exc"]
65
66
 
67
+ RUN <<EOT
68
+ apt-get update -qy
69
+ apt-get install -qyy \
70
+ -o APT::Install-Recommends=false \
71
+ -o APT::Install-Suggests=false \
72
+ git
73
+ EOT
74
+
66
75
  ENV PATH=/app/bin:$PATH
67
76
 
68
77
  # Don't run your app as root.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kodit
3
- Version: 0.2.0
3
+ Version: 0.2.1
4
4
  Summary: Code indexing for better AI code generation
5
5
  Project-URL: Homepage, https://docs.helixml.tech/kodit/
6
6
  Project-URL: Documentation, https://docs.helixml.tech/kodit/
@@ -18,6 +18,7 @@ Classifier: Programming Language :: Python :: 3.12
18
18
  Classifier: Programming Language :: Python :: 3.13
19
19
  Classifier: Topic :: Software Development :: Code Generators
20
20
  Requires-Python: >=3.12
21
+ Requires-Dist: accelerate>=1.7.0
21
22
  Requires-Dist: aiofiles>=24.1.0
22
23
  Requires-Dist: aiosqlite>=0.20.0
23
24
  Requires-Dist: alembic>=1.15.2
@@ -51,6 +51,7 @@ dependencies = [
51
51
  "tiktoken>=0.9.0",
52
52
  "asyncpg>=0.30.0",
53
53
  "transformers>=4.51.3",
54
+ "accelerate>=1.7.0",
54
55
  ]
55
56
 
56
57
  [dependency-groups]
@@ -17,5 +17,5 @@ __version__: str
17
17
  __version_tuple__: VERSION_TUPLE
18
18
  version_tuple: VERSION_TUPLE
19
19
 
20
- __version__ = version = '0.2.0'
21
- __version_tuple__ = version_tuple = (0, 2, 0)
20
+ __version__ = version = '0.2.1'
21
+ __version_tuple__ = version_tuple = (0, 2, 1)
@@ -1,13 +1,14 @@
1
1
  """Locally hosted BM25 service primarily for use with SQLite."""
2
2
 
3
+ from __future__ import annotations
4
+
3
5
  import json
4
6
  from pathlib import Path
7
+ from typing import TYPE_CHECKING
5
8
 
6
9
  import aiofiles
7
- import bm25s
8
10
  import Stemmer
9
11
  import structlog
10
- from bm25s.tokenization import Tokenized
11
12
 
12
13
  from kodit.bm25.keyword_search_service import (
13
14
  BM25Document,
@@ -15,6 +16,11 @@ from kodit.bm25.keyword_search_service import (
15
16
  KeywordSearchProvider,
16
17
  )
17
18
 
19
+ if TYPE_CHECKING:
20
+ import bm25s
21
+ from bm25s.tokenization import Tokenized
22
+
23
+
18
24
  SNIPPET_IDS_FILE = "snippet_ids.jsonl"
19
25
 
20
26
 
@@ -26,19 +32,28 @@ class BM25Service(KeywordSearchProvider):
26
32
  self.log = structlog.get_logger(__name__)
27
33
  self.index_path = data_dir / "bm25s_index"
28
34
  self.snippet_ids: list[int] = []
29
- try:
30
- self.log.debug("Loading BM25 index")
31
- self.retriever = bm25s.BM25.load(self.index_path, mmap=True)
32
- with Path(self.index_path / SNIPPET_IDS_FILE).open() as f:
33
- self.snippet_ids = json.load(f)
34
- except FileNotFoundError:
35
- self.log.debug("BM25 index not found, creating new index")
36
- self.retriever = bm25s.BM25()
37
-
38
35
  self.stemmer = Stemmer.Stemmer("english")
36
+ self.__retriever: bm25s.BM25 | None = None
37
+
38
+ def _retriever(self) -> bm25s.BM25:
39
+ """Get the BM25 retriever."""
40
+ if self.__retriever is None:
41
+ import bm25s
42
+
43
+ try:
44
+ self.log.debug("Loading BM25 index")
45
+ self.__retriever = bm25s.BM25.load(self.index_path, mmap=True)
46
+ with Path(self.index_path / SNIPPET_IDS_FILE).open() as f:
47
+ self.snippet_ids = json.load(f)
48
+ except FileNotFoundError:
49
+ self.log.debug("BM25 index not found, creating new index")
50
+ self.__retriever = bm25s.BM25()
51
+ return self.__retriever
39
52
 
40
53
  def _tokenize(self, corpus: list[str]) -> list[list[str]] | Tokenized:
41
- return bm25s.tokenize(
54
+ from bm25s import tokenize
55
+
56
+ return tokenize(
42
57
  corpus,
43
58
  stopwords="en",
44
59
  stemmer=self.stemmer,
@@ -50,9 +65,8 @@ class BM25Service(KeywordSearchProvider):
50
65
  """Index a new corpus."""
51
66
  self.log.debug("Indexing corpus")
52
67
  vocab = self._tokenize([doc.text for doc in corpus])
53
- self.retriever = bm25s.BM25()
54
- self.retriever.index(vocab, show_progress=False)
55
- self.retriever.save(self.index_path)
68
+ self._retriever().index(vocab, show_progress=False)
69
+ self._retriever().save(self.index_path)
56
70
  self.snippet_ids = self.snippet_ids + [doc.snippet_id for doc in corpus]
57
71
  async with aiofiles.open(self.index_path / SNIPPET_IDS_FILE, "w") as f:
58
72
  await f.write(json.dumps(self.snippet_ids))
@@ -64,7 +78,7 @@ class BM25Service(KeywordSearchProvider):
64
78
  return []
65
79
 
66
80
  # Get the number of documents in the index
67
- num_docs = self.retriever.scores["num_docs"]
81
+ num_docs = self._retriever().scores["num_docs"]
68
82
  if num_docs == 0:
69
83
  return []
70
84
 
@@ -80,7 +94,7 @@ class BM25Service(KeywordSearchProvider):
80
94
 
81
95
  self.log.debug("Query tokens", query_tokens=query_tokens)
82
96
 
83
- results, scores = self.retriever.retrieve(
97
+ results, scores = self._retriever().retrieve(
84
98
  query_tokens=query_tokens,
85
99
  corpus=self.snippet_ids,
86
100
  k=top_k,
@@ -1,16 +1,21 @@
1
1
  """Global configuration for the kodit project."""
2
2
 
3
+ from __future__ import annotations
4
+
3
5
  import asyncio
4
- from collections.abc import Callable, Coroutine
5
6
  from functools import wraps
6
7
  from pathlib import Path
7
- from typing import Any, Literal, TypeVar
8
+ from typing import TYPE_CHECKING, Any, Literal, TypeVar
8
9
 
9
10
  import click
10
- from openai import AsyncOpenAI
11
11
  from pydantic import BaseModel, Field
12
12
  from pydantic_settings import BaseSettings, SettingsConfigDict
13
13
 
14
+ if TYPE_CHECKING:
15
+ from collections.abc import Callable, Coroutine
16
+
17
+ from openai import AsyncOpenAI
18
+
14
19
  from kodit.database import Database
15
20
 
16
21
  DEFAULT_BASE_DIR = Path.home() / ".kodit"
@@ -92,6 +97,8 @@ class AppContext(BaseSettings):
92
97
 
93
98
  def get_default_openai_client(self) -> AsyncOpenAI | None:
94
99
  """Get the default OpenAI client, if it is configured."""
100
+ from openai import AsyncOpenAI
101
+
95
102
  endpoint = self.default_endpoint
96
103
  if not (
97
104
  endpoint
@@ -23,7 +23,11 @@ class EmbeddingProvider(ABC):
23
23
  """
24
24
 
25
25
 
26
- def split_sub_batches(encoding: tiktoken.Encoding, data: list[str]) -> list[list[str]]:
26
+ def split_sub_batches(
27
+ encoding: tiktoken.Encoding,
28
+ data: list[str],
29
+ max_context_window: int = OPENAI_MAX_EMBEDDING_SIZE,
30
+ ) -> list[list[str]]:
27
31
  """Split a list of strings into smaller sub-batches."""
28
32
  log = structlog.get_logger(__name__)
29
33
  result = []
@@ -37,10 +41,10 @@ def split_sub_batches(encoding: tiktoken.Encoding, data: list[str]) -> list[list
37
41
  next_item = data_to_process[0]
38
42
  item_tokens = len(encoding.encode(next_item))
39
43
 
40
- if item_tokens > OPENAI_MAX_EMBEDDING_SIZE:
44
+ if item_tokens > max_context_window:
41
45
  # Loop around trying to truncate the snippet until it fits in the max
42
46
  # embedding size
43
- while item_tokens > OPENAI_MAX_EMBEDDING_SIZE:
47
+ while item_tokens > max_context_window:
44
48
  next_item = next_item[:-1]
45
49
  item_tokens = len(encoding.encode(next_item))
46
50
 
@@ -48,7 +52,7 @@ def split_sub_batches(encoding: tiktoken.Encoding, data: list[str]) -> list[list
48
52
 
49
53
  log.warning("Truncated snippet", snippet=next_item)
50
54
 
51
- if current_tokens + item_tokens > OPENAI_MAX_EMBEDDING_SIZE:
55
+ if current_tokens + item_tokens > max_context_window:
52
56
  break
53
57
 
54
58
  next_batch.append(data_to_process.pop(0))
@@ -1,10 +1,12 @@
1
1
  """Local embedding service."""
2
2
 
3
+ from __future__ import annotations
4
+
3
5
  import os
6
+ from typing import TYPE_CHECKING
4
7
 
5
8
  import structlog
6
9
  import tiktoken
7
- from sentence_transformers import SentenceTransformer
8
10
  from tqdm import tqdm
9
11
 
10
12
  from kodit.embedding.embedding_provider.embedding_provider import (
@@ -13,6 +15,9 @@ from kodit.embedding.embedding_provider.embedding_provider import (
13
15
  split_sub_batches,
14
16
  )
15
17
 
18
+ if TYPE_CHECKING:
19
+ from sentence_transformers import SentenceTransformer
20
+
16
21
  TINY = "tiny"
17
22
  CODE = "code"
18
23
  TEST = "test"
@@ -38,10 +43,11 @@ class LocalEmbeddingProvider(EmbeddingProvider):
38
43
  """Get the embedding model."""
39
44
  if self.embedding_model is None:
40
45
  os.environ["TOKENIZERS_PARALLELISM"] = "false" # Avoid warnings
46
+ from sentence_transformers import SentenceTransformer
47
+
41
48
  self.embedding_model = SentenceTransformer(
42
49
  self.model_name,
43
50
  trust_remote_code=True,
44
- device="cpu", # Force CPU so we don't have to install accelerate, etc.
45
51
  )
46
52
  return self.embedding_model
47
53
 
@@ -0,0 +1,88 @@
1
+ """Local embedding service."""
2
+
3
+ import os
4
+
5
+ import structlog
6
+ import tiktoken
7
+ from tqdm import tqdm
8
+
9
+ from kodit.embedding.embedding_provider.embedding_provider import split_sub_batches
10
+ from kodit.enrichment.enrichment_provider.enrichment_provider import (
11
+ ENRICHMENT_SYSTEM_PROMPT,
12
+ EnrichmentProvider,
13
+ )
14
+
15
+ DEFAULT_ENRICHMENT_MODEL = "Qwen/Qwen3-0.6B"
16
+ DEFAULT_CONTEXT_WINDOW_SIZE = 2048 # Small so it works even on low-powered devices
17
+
18
+
19
+ class LocalEnrichmentProvider(EnrichmentProvider):
20
+ """Local embedder."""
21
+
22
+ def __init__(
23
+ self,
24
+ model_name: str = DEFAULT_ENRICHMENT_MODEL,
25
+ context_window: int = DEFAULT_CONTEXT_WINDOW_SIZE,
26
+ ) -> None:
27
+ """Initialize the local enrichment provider."""
28
+ self.log = structlog.get_logger(__name__)
29
+ self.model_name = model_name
30
+ self.context_window = context_window
31
+ self.model = None
32
+ self.tokenizer = None
33
+ self.encoding = tiktoken.encoding_for_model("text-embedding-3-small")
34
+
35
+ async def enrich(self, data: list[str]) -> list[str]:
36
+ """Enrich a list of strings."""
37
+ from transformers.models.auto.modeling_auto import (
38
+ AutoModelForCausalLM,
39
+ )
40
+ from transformers.models.auto.tokenization_auto import AutoTokenizer
41
+
42
+ if self.tokenizer is None:
43
+ self.tokenizer = AutoTokenizer.from_pretrained(
44
+ self.model_name, padding_side="left"
45
+ )
46
+ if self.model is None:
47
+ os.environ["TOKENIZERS_PARALLELISM"] = "false" # Avoid warnings
48
+ self.model = AutoModelForCausalLM.from_pretrained(
49
+ self.model_name,
50
+ torch_dtype="auto",
51
+ trust_remote_code=True,
52
+ device_map="auto",
53
+ )
54
+
55
+ # Prepare prompts
56
+ prompts = [
57
+ self.tokenizer.apply_chat_template(
58
+ [
59
+ {"role": "system", "content": ENRICHMENT_SYSTEM_PROMPT},
60
+ {"role": "user", "content": snippet},
61
+ ],
62
+ tokenize=False,
63
+ add_generation_prompt=True,
64
+ enable_thinking=False,
65
+ )
66
+ for snippet in data
67
+ ]
68
+
69
+ # Batch prompts using split_sub_batches
70
+ batched_prompts = split_sub_batches(
71
+ self.encoding, prompts, max_context_window=self.context_window
72
+ )
73
+ results = []
74
+ for batch in tqdm(batched_prompts, leave=False, total=len(batched_prompts)):
75
+ model_inputs = self.tokenizer(
76
+ batch, return_tensors="pt", padding=True, truncation=True
77
+ ).to(self.model.device)
78
+ generated_ids = self.model.generate(
79
+ **model_inputs, max_new_tokens=self.context_window
80
+ )
81
+ # For each prompt in the batch, decode only the generated part
82
+ for i, input_ids in enumerate(model_inputs["input_ids"]):
83
+ output_ids = generated_ids[i][len(input_ids) :].tolist()
84
+ content = self.tokenizer.decode(
85
+ output_ids, skip_special_tokens=True
86
+ ).strip("\n")
87
+ results.append(content)
88
+ return results
@@ -82,17 +82,7 @@ class SourceService:
82
82
  )
83
83
 
84
84
  async def create(self, uri_or_path_like: str) -> SourceView:
85
- """Create a new source from a URI.
86
-
87
- Args:
88
- uri: The URI of the source to create. Can be a git-like URI or a local
89
- directory.
90
-
91
- Raises:
92
- ValueError: If the source type is not supported or if the folder doesn't
93
- exist.
94
-
95
- """
85
+ """Create a new source from a URI or path."""
96
86
  if Path(uri_or_path_like).is_dir():
97
87
  return await self._create_folder_source(Path(uri_or_path_like))
98
88
  if isuri(uri_or_path_like):
@@ -103,18 +93,11 @@ class SourceService:
103
93
  ".git"
104
94
  ):
105
95
  return await self._create_git_source(uri_or_path_like)
106
-
107
- # Try adding a .git suffix, sometimes people just pass the url
108
96
  if not uri_or_path_like.endswith(".git"):
109
- uri_or_path_like = uri_or_path_like + ".git"
110
- try:
111
- return await self._create_git_source(uri_or_path_like)
112
- except git.GitCommandError:
113
- raise
114
- except ValueError:
115
- pass
116
-
117
- msg = f"Unsupported source type: {uri_or_path_like}"
97
+ uri_or_path_like = uri_or_path_like.strip("/") + ".git"
98
+ return await self._create_git_source(uri_or_path_like)
99
+
100
+ msg = f"Unsupported source: {uri_or_path_like}"
118
101
  raise ValueError(msg)
119
102
 
120
103
  async def _create_folder_source(self, directory: Path) -> SourceView:
@@ -0,0 +1,13 @@
1
+ #!/bin/bash
2
+ set -e
3
+
4
+ if [ -z "$TEST_TAG" ]; then
5
+ echo "TEST_TAG is not set"
6
+ exit 1
7
+ fi
8
+
9
+ # Get the directory of this script
10
+ script_dir=$(dirname "$0")
11
+
12
+ # Start the container, mount the smoke test and run it
13
+ docker run -i -v $script_dir:/tests --entrypoint /bin/bash --env CI=true $TEST_TAG -c "/tests/smoke.sh"
@@ -34,14 +34,6 @@ def sample_documents():
34
34
  ]
35
35
 
36
36
 
37
- @pytest.mark.asyncio
38
- async def test_initialization(bm25_service):
39
- """Test that the service initializes correctly."""
40
- assert bm25_service.retriever is not None
41
- assert isinstance(bm25_service.snippet_ids, list)
42
- assert len(bm25_service.snippet_ids) == 0
43
-
44
-
45
37
  @pytest.mark.asyncio
46
38
  async def test_index_and_retrieve(bm25_service, sample_documents):
47
39
  """Test indexing and retrieving documents."""
@@ -134,6 +126,7 @@ async def test_persistence(bm25_service, sample_documents):
134
126
 
135
127
  # Create a new service instance with the same data directory
136
128
  new_service = BM25Service(bm25_service.index_path.parent)
129
+ new_service._retriever()
137
130
 
138
131
  # Verify the new instance loaded the correct data
139
132
  assert len(new_service.snippet_ids) == 3
@@ -32,7 +32,7 @@ async def test_create_source_nonexistent_path(service: SourceService) -> None:
32
32
  uri = nonexistent_path.as_uri()
33
33
 
34
34
  # Try to create a source with the nonexistent path
35
- with pytest.raises(ValueError, match=f"Folder does not exist: {nonexistent_path}"):
35
+ with pytest.raises(ValueError):
36
36
  await service.create(uri)
37
37
 
38
38
 
@@ -41,7 +41,7 @@ async def test_create_source_invalid_path_and_uri(service: SourceService) -> Non
41
41
  """Test creating a source with an invalid path that is also not a valid URI."""
42
42
  # Try to create a source with an invalid path that is also not a valid URI
43
43
  invalid_path = "not/a/valid/path/or/uri"
44
- with pytest.raises(ValueError, match=f"Unsupported source type: {invalid_path}"):
44
+ with pytest.raises(ValueError):
45
45
  await service.create(invalid_path)
46
46
 
47
47
 
@@ -65,7 +65,7 @@ async def test_create_source_already_added(
65
65
  async def test_create_source_unsupported_uri(service: SourceService) -> None:
66
66
  """Test creating a source with an unsupported URI."""
67
67
  # Try to create a source with an unsupported URI (e.g., http)
68
- with pytest.raises(ValueError, match="Unsupported source type: http://example.com"):
68
+ with pytest.raises(ValueError):
69
69
  await service.create("http://example.com")
70
70
 
71
71
 
@@ -155,3 +155,11 @@ async def test_create_source_relative_path(
155
155
 
156
156
  # Should not raise an error
157
157
  await service.create(".")
158
+
159
+
160
+ @pytest.mark.asyncio
161
+ async def test_strip_trailing_slash_on_gh_url(service: SourceService) -> None:
162
+ """Test creating a source with a file URI."""
163
+
164
+ # Should work
165
+ await service.create("https://github.com/helixml/kodit/")
@@ -6,6 +6,24 @@ resolution-markers = [
6
6
  "python_full_version < '3.13'",
7
7
  ]
8
8
 
9
+ [[package]]
10
+ name = "accelerate"
11
+ version = "1.7.0"
12
+ source = { registry = "https://pypi.org/simple/" }
13
+ dependencies = [
14
+ { name = "huggingface-hub" },
15
+ { name = "numpy" },
16
+ { name = "packaging" },
17
+ { name = "psutil" },
18
+ { name = "pyyaml" },
19
+ { name = "safetensors" },
20
+ { name = "torch" },
21
+ ]
22
+ sdist = { url = "https://files.pythonhosted.org/packages/97/33/47bbd507e3a851d33d19ce7b2141c5ea3689bfae91ba168044d7db24b0e9/accelerate-1.7.0.tar.gz", hash = "sha256:e8a2a5503d6237b9eee73cc8d36cf543f9c2d8dd2c6713450b322f5e6d53a610", size = 376026, upload-time = "2025-05-15T10:00:52.117Z" }
23
+ wheels = [
24
+ { url = "https://files.pythonhosted.org/packages/f8/bb/be8146c196ad6e4dec78385d91e92591f8a433576c4e04c342a636fcd811/accelerate-1.7.0-py3-none-any.whl", hash = "sha256:cf57165cca28769c6cf2650812371c81b18e05743dfa3c748524b1bb4f2b272f", size = 362095, upload-time = "2025-05-15T10:00:49.914Z" },
25
+ ]
26
+
9
27
  [[package]]
10
28
  name = "aiofiles"
11
29
  version = "24.1.0"
@@ -840,6 +858,7 @@ wheels = [
840
858
  name = "kodit"
841
859
  source = { editable = "." }
842
860
  dependencies = [
861
+ { name = "accelerate" },
843
862
  { name = "aiofiles" },
844
863
  { name = "aiosqlite" },
845
864
  { name = "alembic" },
@@ -884,6 +903,7 @@ dev = [
884
903
 
885
904
  [package.metadata]
886
905
  requires-dist = [
906
+ { name = "accelerate", specifier = ">=1.7.0" },
887
907
  { name = "aiofiles", specifier = ">=24.1.0" },
888
908
  { name = "aiosqlite", specifier = ">=0.20.0" },
889
909
  { name = "alembic", specifier = ">=1.15.2" },
@@ -1,63 +0,0 @@
1
- """Local embedding service."""
2
-
3
- import os
4
-
5
- import structlog
6
- from transformers.models.auto.modeling_auto import AutoModelForCausalLM
7
- from transformers.models.auto.tokenization_auto import AutoTokenizer
8
-
9
- from kodit.enrichment.enrichment_provider.enrichment_provider import (
10
- ENRICHMENT_SYSTEM_PROMPT,
11
- EnrichmentProvider,
12
- )
13
-
14
-
15
- class LocalEnrichmentProvider(EnrichmentProvider):
16
- """Local embedder."""
17
-
18
- def __init__(self, model_name: str = "Qwen/Qwen3-0.6B") -> None:
19
- """Initialize the local enrichment provider."""
20
- self.log = structlog.get_logger(__name__)
21
- self.model_name = model_name
22
- self.model = None
23
- self.tokenizer = None
24
-
25
- async def enrich(self, data: list[str]) -> list[str]:
26
- """Enrich a list of strings."""
27
- if self.tokenizer is None:
28
- self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
29
- if self.model is None:
30
- os.environ["TOKENIZERS_PARALLELISM"] = "false" # Avoid warnings
31
- self.model = AutoModelForCausalLM.from_pretrained(
32
- self.model_name,
33
- torch_dtype="auto",
34
- trust_remote_code=True,
35
- )
36
-
37
- results = []
38
- for snippet in data:
39
- # prepare the model input
40
- messages = [
41
- {"role": "system", "content": ENRICHMENT_SYSTEM_PROMPT},
42
- {"role": "user", "content": snippet},
43
- ]
44
- text = self.tokenizer.apply_chat_template(
45
- messages,
46
- tokenize=False,
47
- add_generation_prompt=True,
48
- enable_thinking=False,
49
- )
50
- model_inputs = self.tokenizer([text], return_tensors="pt").to(
51
- self.model.device
52
- )
53
-
54
- # conduct text completion
55
- generated_ids = self.model.generate(**model_inputs, max_new_tokens=32768)
56
- output_ids = generated_ids[0][len(model_inputs.input_ids[0]) :].tolist()
57
- content = self.tokenizer.decode(output_ids, skip_special_tokens=True).strip(
58
- "\n"
59
- )
60
-
61
- results.append(content)
62
-
63
- return results
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes