kodit 0.4.0__py3-none-any.whl → 0.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of kodit might be problematic. Click here for more details.
- kodit/_version.py +16 -3
- kodit/app.py +10 -3
- kodit/application/factories/code_indexing_factory.py +54 -7
- kodit/application/factories/reporting_factory.py +27 -0
- kodit/application/services/auto_indexing_service.py +16 -4
- kodit/application/services/code_indexing_application_service.py +115 -133
- kodit/application/services/indexing_worker_service.py +18 -20
- kodit/application/services/queue_service.py +15 -12
- kodit/application/services/reporting.py +86 -0
- kodit/application/services/sync_scheduler.py +21 -20
- kodit/cli.py +14 -18
- kodit/config.py +35 -17
- kodit/database.py +2 -1
- kodit/domain/protocols.py +9 -1
- kodit/domain/services/bm25_service.py +1 -6
- kodit/domain/services/index_service.py +22 -58
- kodit/domain/value_objects.py +57 -9
- kodit/infrastructure/api/v1/__init__.py +2 -2
- kodit/infrastructure/api/v1/dependencies.py +23 -10
- kodit/infrastructure/api/v1/routers/__init__.py +2 -1
- kodit/infrastructure/api/v1/routers/queue.py +76 -0
- kodit/infrastructure/api/v1/schemas/queue.py +35 -0
- kodit/infrastructure/cloning/git/working_copy.py +36 -7
- kodit/infrastructure/embedding/embedding_factory.py +18 -19
- kodit/infrastructure/embedding/embedding_providers/litellm_embedding_provider.py +156 -0
- kodit/infrastructure/enrichment/enrichment_factory.py +7 -16
- kodit/infrastructure/enrichment/{openai_enrichment_provider.py → litellm_enrichment_provider.py} +70 -60
- kodit/infrastructure/git/git_utils.py +9 -2
- kodit/infrastructure/mappers/index_mapper.py +1 -0
- kodit/infrastructure/reporting/__init__.py +1 -0
- kodit/infrastructure/reporting/log_progress.py +65 -0
- kodit/infrastructure/reporting/tdqm_progress.py +73 -0
- kodit/infrastructure/sqlalchemy/embedding_repository.py +47 -68
- kodit/infrastructure/sqlalchemy/entities.py +28 -2
- kodit/infrastructure/sqlalchemy/index_repository.py +274 -236
- kodit/infrastructure/sqlalchemy/task_repository.py +55 -39
- kodit/infrastructure/sqlalchemy/unit_of_work.py +59 -0
- kodit/log.py +6 -0
- kodit/mcp.py +10 -2
- {kodit-0.4.0.dist-info → kodit-0.4.2.dist-info}/METADATA +3 -2
- {kodit-0.4.0.dist-info → kodit-0.4.2.dist-info}/RECORD +44 -41
- kodit/domain/interfaces.py +0 -27
- kodit/infrastructure/embedding/embedding_providers/openai_embedding_provider.py +0 -183
- kodit/infrastructure/ui/__init__.py +0 -1
- kodit/infrastructure/ui/progress.py +0 -170
- kodit/infrastructure/ui/spinner.py +0 -74
- kodit/reporting.py +0 -78
- {kodit-0.4.0.dist-info → kodit-0.4.2.dist-info}/WHEEL +0 -0
- {kodit-0.4.0.dist-info → kodit-0.4.2.dist-info}/entry_points.txt +0 -0
- {kodit-0.4.0.dist-info → kodit-0.4.2.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
"""Task repository for the task queue."""
|
|
2
2
|
|
|
3
|
+
from collections.abc import Callable
|
|
4
|
+
|
|
3
5
|
import structlog
|
|
4
6
|
from sqlalchemy import select
|
|
5
7
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
@@ -9,14 +11,23 @@ from kodit.domain.protocols import TaskRepository
|
|
|
9
11
|
from kodit.domain.value_objects import TaskType
|
|
10
12
|
from kodit.infrastructure.mappers.task_mapper import TaskMapper, TaskTypeMapper
|
|
11
13
|
from kodit.infrastructure.sqlalchemy import entities as db_entities
|
|
14
|
+
from kodit.infrastructure.sqlalchemy.unit_of_work import SqlAlchemyUnitOfWork
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def create_task_repository(
|
|
18
|
+
session_factory: Callable[[], AsyncSession],
|
|
19
|
+
) -> TaskRepository:
|
|
20
|
+
"""Create an index repository."""
|
|
21
|
+
uow = SqlAlchemyUnitOfWork(session_factory=session_factory)
|
|
22
|
+
return SqlAlchemyTaskRepository(uow)
|
|
12
23
|
|
|
13
24
|
|
|
14
25
|
class SqlAlchemyTaskRepository(TaskRepository):
|
|
15
26
|
"""Repository for task persistence using the existing Task entity."""
|
|
16
27
|
|
|
17
|
-
def __init__(self,
|
|
28
|
+
def __init__(self, uow: SqlAlchemyUnitOfWork) -> None:
|
|
18
29
|
"""Initialize the repository."""
|
|
19
|
-
self.
|
|
30
|
+
self.uow = uow
|
|
20
31
|
self.log = structlog.get_logger(__name__)
|
|
21
32
|
|
|
22
33
|
async def add(
|
|
@@ -24,58 +35,63 @@ class SqlAlchemyTaskRepository(TaskRepository):
|
|
|
24
35
|
task: Task,
|
|
25
36
|
) -> None:
|
|
26
37
|
"""Create a new task in the database."""
|
|
27
|
-
self.
|
|
38
|
+
async with self.uow:
|
|
39
|
+
self.uow.session.add(TaskMapper.from_domain_task(task))
|
|
28
40
|
|
|
29
41
|
async def get(self, task_id: str) -> Task | None:
|
|
30
42
|
"""Get a task by ID."""
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
43
|
+
async with self.uow:
|
|
44
|
+
stmt = select(db_entities.Task).where(db_entities.Task.dedup_key == task_id)
|
|
45
|
+
result = await self.uow.session.execute(stmt)
|
|
46
|
+
db_task = result.scalar_one_or_none()
|
|
47
|
+
if not db_task:
|
|
48
|
+
return None
|
|
49
|
+
return TaskMapper.to_domain_task(db_task)
|
|
37
50
|
|
|
38
51
|
async def take(self) -> Task | None:
|
|
39
52
|
"""Take a task for processing and remove it from the database."""
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
53
|
+
async with self.uow:
|
|
54
|
+
stmt = (
|
|
55
|
+
select(db_entities.Task)
|
|
56
|
+
.order_by(db_entities.Task.priority.desc(), db_entities.Task.created_at)
|
|
57
|
+
.limit(1)
|
|
58
|
+
)
|
|
59
|
+
result = await self.uow.session.execute(stmt)
|
|
60
|
+
db_task = result.scalar_one_or_none()
|
|
61
|
+
if not db_task:
|
|
62
|
+
return None
|
|
63
|
+
await self.uow.session.delete(db_task)
|
|
64
|
+
return TaskMapper.to_domain_task(db_task)
|
|
51
65
|
|
|
52
66
|
async def update(self, task: Task) -> None:
|
|
53
67
|
"""Update a task in the database."""
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
68
|
+
async with self.uow:
|
|
69
|
+
stmt = select(db_entities.Task).where(db_entities.Task.dedup_key == task.id)
|
|
70
|
+
result = await self.uow.session.execute(stmt)
|
|
71
|
+
db_task = result.scalar_one_or_none()
|
|
57
72
|
|
|
58
|
-
|
|
59
|
-
|
|
73
|
+
if not db_task:
|
|
74
|
+
raise ValueError(f"Task not found: {task.id}")
|
|
60
75
|
|
|
61
|
-
|
|
62
|
-
|
|
76
|
+
db_task.priority = task.priority
|
|
77
|
+
db_task.payload = task.payload
|
|
63
78
|
|
|
64
79
|
async def list(self, task_type: TaskType | None = None) -> list[Task]:
|
|
65
80
|
"""List tasks with optional status filter."""
|
|
66
|
-
|
|
81
|
+
async with self.uow:
|
|
82
|
+
stmt = select(db_entities.Task)
|
|
67
83
|
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
84
|
+
if task_type:
|
|
85
|
+
stmt = stmt.where(
|
|
86
|
+
db_entities.Task.type == TaskTypeMapper.from_domain_type(task_type)
|
|
87
|
+
)
|
|
72
88
|
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
89
|
+
stmt = stmt.order_by(
|
|
90
|
+
db_entities.Task.priority.desc(), db_entities.Task.created_at
|
|
91
|
+
)
|
|
76
92
|
|
|
77
|
-
|
|
78
|
-
|
|
93
|
+
result = await self.uow.session.execute(stmt)
|
|
94
|
+
records = result.scalars().all()
|
|
79
95
|
|
|
80
|
-
|
|
81
|
-
|
|
96
|
+
# Convert to domain entities
|
|
97
|
+
return [TaskMapper.to_domain_task(record) for record in records]
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
"""SQLAlchemy implementation of Unit of Work pattern."""
|
|
2
|
+
|
|
3
|
+
from collections.abc import Callable
|
|
4
|
+
from types import TracebackType
|
|
5
|
+
|
|
6
|
+
from sqlalchemy.ext.asyncio import AsyncSession
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class SqlAlchemyUnitOfWork:
|
|
10
|
+
"""SQLAlchemy implementation of Unit of Work pattern."""
|
|
11
|
+
|
|
12
|
+
def __init__(self, session_factory: Callable[[], AsyncSession]) -> None:
|
|
13
|
+
"""Initialize the unit of work with a session factory."""
|
|
14
|
+
self._session_factory = session_factory
|
|
15
|
+
self._session: AsyncSession | None = None
|
|
16
|
+
|
|
17
|
+
@property
|
|
18
|
+
def session(self) -> AsyncSession:
|
|
19
|
+
"""Get the current session."""
|
|
20
|
+
if self._session is None:
|
|
21
|
+
raise RuntimeError("UnitOfWork must be used within async context")
|
|
22
|
+
return self._session
|
|
23
|
+
|
|
24
|
+
async def __aenter__(self) -> "SqlAlchemyUnitOfWork":
|
|
25
|
+
"""Enter the unit of work context."""
|
|
26
|
+
self._session = self._session_factory()
|
|
27
|
+
return self
|
|
28
|
+
|
|
29
|
+
async def __aexit__(
|
|
30
|
+
self,
|
|
31
|
+
exc_type: type[BaseException] | None,
|
|
32
|
+
exc_val: BaseException | None,
|
|
33
|
+
exc_tb: TracebackType | None,
|
|
34
|
+
) -> None:
|
|
35
|
+
"""Exit the unit of work context."""
|
|
36
|
+
if self._session:
|
|
37
|
+
if exc_type is not None:
|
|
38
|
+
await self._session.rollback()
|
|
39
|
+
await self._session.commit()
|
|
40
|
+
await self._session.close()
|
|
41
|
+
self._session = None
|
|
42
|
+
|
|
43
|
+
async def commit(self) -> None:
|
|
44
|
+
"""Commit the current transaction."""
|
|
45
|
+
if self._session is None:
|
|
46
|
+
raise RuntimeError("UnitOfWork must be used within async context")
|
|
47
|
+
await self._session.commit()
|
|
48
|
+
|
|
49
|
+
async def rollback(self) -> None:
|
|
50
|
+
"""Rollback the current transaction."""
|
|
51
|
+
if self._session is None:
|
|
52
|
+
raise RuntimeError("UnitOfWork must be used within async context")
|
|
53
|
+
await self._session.rollback()
|
|
54
|
+
|
|
55
|
+
async def flush(self) -> None:
|
|
56
|
+
"""Flush pending changes to the database without committing."""
|
|
57
|
+
if self._session is None:
|
|
58
|
+
raise RuntimeError("UnitOfWork must be used within async context")
|
|
59
|
+
await self._session.flush()
|
kodit/log.py
CHANGED
|
@@ -11,6 +11,7 @@ from functools import lru_cache
|
|
|
11
11
|
from pathlib import Path
|
|
12
12
|
from typing import Any
|
|
13
13
|
|
|
14
|
+
import litellm
|
|
14
15
|
import rudderstack.analytics as rudder_analytics # type: ignore[import-untyped]
|
|
15
16
|
import structlog
|
|
16
17
|
from structlog.types import EventDict
|
|
@@ -99,6 +100,7 @@ def configure_logging(app_context: AppContext) -> None:
|
|
|
99
100
|
"bm25s",
|
|
100
101
|
"sentence_transformers.SentenceTransformer",
|
|
101
102
|
"httpx",
|
|
103
|
+
"LiteLLM",
|
|
102
104
|
]:
|
|
103
105
|
if root_logger.getEffectiveLevel() == logging.DEBUG:
|
|
104
106
|
logging.getLogger(_log).handlers.clear()
|
|
@@ -106,6 +108,9 @@ def configure_logging(app_context: AppContext) -> None:
|
|
|
106
108
|
else:
|
|
107
109
|
logging.getLogger(_log).disabled = True
|
|
108
110
|
|
|
111
|
+
# More litellm logging cruft
|
|
112
|
+
litellm.suppress_debug_info = True
|
|
113
|
+
|
|
109
114
|
# Configure SQLAlchemy loggers to use our structlog setup
|
|
110
115
|
for _log in ["sqlalchemy.engine", "alembic"]:
|
|
111
116
|
engine_logger = logging.getLogger(_log)
|
|
@@ -138,6 +143,7 @@ def configure_logging(app_context: AppContext) -> None:
|
|
|
138
143
|
|
|
139
144
|
def configure_telemetry(app_context: AppContext) -> None:
|
|
140
145
|
"""Configure telemetry for the application."""
|
|
146
|
+
litellm.telemetry = False # Disable litellm telemetry by default
|
|
141
147
|
if app_context.disable_telemetry:
|
|
142
148
|
structlog.stdlib.get_logger(__name__).info("Telemetry has been disabled")
|
|
143
149
|
rudder_analytics.send = False
|
kodit/mcp.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
"""MCP server for kodit."""
|
|
2
2
|
|
|
3
|
-
from collections.abc import AsyncIterator
|
|
3
|
+
from collections.abc import AsyncIterator, Callable
|
|
4
4
|
from contextlib import asynccontextmanager
|
|
5
5
|
from dataclasses import dataclass
|
|
6
6
|
from pathlib import Path
|
|
@@ -15,6 +15,7 @@ from kodit._version import version
|
|
|
15
15
|
from kodit.application.factories.code_indexing_factory import (
|
|
16
16
|
create_code_indexing_application_service,
|
|
17
17
|
)
|
|
18
|
+
from kodit.application.factories.reporting_factory import create_server_operation
|
|
18
19
|
from kodit.config import AppContext
|
|
19
20
|
from kodit.database import Database
|
|
20
21
|
from kodit.domain.value_objects import (
|
|
@@ -32,6 +33,7 @@ class MCPContext:
|
|
|
32
33
|
"""Context for the MCP server."""
|
|
33
34
|
|
|
34
35
|
session: AsyncSession
|
|
36
|
+
session_factory: Callable[[], AsyncSession]
|
|
35
37
|
app_context: AppContext
|
|
36
38
|
|
|
37
39
|
|
|
@@ -55,7 +57,11 @@ async def mcp_lifespan(_: FastMCP) -> AsyncIterator[MCPContext]:
|
|
|
55
57
|
if _mcp_db is None:
|
|
56
58
|
_mcp_db = await app_context.get_db()
|
|
57
59
|
async with _mcp_db.session_factory() as session:
|
|
58
|
-
yield MCPContext(
|
|
60
|
+
yield MCPContext(
|
|
61
|
+
session=session,
|
|
62
|
+
app_context=app_context,
|
|
63
|
+
session_factory=_mcp_db.session_factory,
|
|
64
|
+
)
|
|
59
65
|
|
|
60
66
|
|
|
61
67
|
def create_mcp_server(name: str, instructions: str | None = None) -> FastMCP:
|
|
@@ -174,6 +180,8 @@ def register_mcp_tools(mcp_server: FastMCP) -> None:
|
|
|
174
180
|
service = create_code_indexing_application_service(
|
|
175
181
|
app_context=mcp_context.app_context,
|
|
176
182
|
session=mcp_context.session,
|
|
183
|
+
session_factory=mcp_context.session_factory,
|
|
184
|
+
operation=create_server_operation(),
|
|
177
185
|
)
|
|
178
186
|
|
|
179
187
|
log.debug("Searching for snippets")
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: kodit
|
|
3
|
-
Version: 0.4.
|
|
3
|
+
Version: 0.4.2
|
|
4
4
|
Summary: Code indexing for better AI code generation
|
|
5
5
|
Project-URL: Homepage, https://docs.helixml.tech/kodit/
|
|
6
6
|
Project-URL: Documentation, https://docs.helixml.tech/kodit/
|
|
@@ -35,7 +35,8 @@ Requires-Dist: gitpython>=3.1.44
|
|
|
35
35
|
Requires-Dist: hf-xet>=1.1.2
|
|
36
36
|
Requires-Dist: httpx-retries>=0.3.2
|
|
37
37
|
Requires-Dist: httpx>=0.28.1
|
|
38
|
-
Requires-Dist:
|
|
38
|
+
Requires-Dist: litellm>=1.75.8
|
|
39
|
+
Requires-Dist: openai==1.99.9
|
|
39
40
|
Requires-Dist: pathspec>=0.12.1
|
|
40
41
|
Requires-Dist: pydantic-settings>=2.9.1
|
|
41
42
|
Requires-Dist: pystemmer>=3.0.0
|
|
@@ -1,36 +1,36 @@
|
|
|
1
1
|
kodit/.gitignore,sha256=ztkjgRwL9Uud1OEi36hGQeDGk3OLK1NfDEO8YqGYy8o,11
|
|
2
2
|
kodit/__init__.py,sha256=aEKHYninUq1yh6jaNfvJBYg-6fenpN132nJt1UU6Jxs,59
|
|
3
|
-
kodit/_version.py,sha256=
|
|
4
|
-
kodit/app.py,sha256=
|
|
5
|
-
kodit/cli.py,sha256=
|
|
3
|
+
kodit/_version.py,sha256=A45grTqzrHuDn1CT9K5GVUbY4_Q3OSTcXAl3zdHzcEI,704
|
|
4
|
+
kodit/app.py,sha256=xLy0cM3fduXSQSws3wq9fWg5eJB1xD6vrMpkVFYpnhA,4468
|
|
5
|
+
kodit/cli.py,sha256=ugy0L9m5lVgudLebD5FpmZfJEAVAtvxbCvUTfJvU46Y,27948
|
|
6
6
|
kodit/cli_utils.py,sha256=bW4rIm-elrsyM_pSGHh30zV0_oX7V-64pL3YSaBcOt0,2810
|
|
7
|
-
kodit/config.py,sha256=
|
|
8
|
-
kodit/database.py,sha256=
|
|
9
|
-
kodit/log.py,sha256=
|
|
10
|
-
kodit/mcp.py,sha256=
|
|
7
|
+
kodit/config.py,sha256=wKXUb06j7VbpD7ydCARd6_DNeAY5tLeJqHvhWozFhyI,11052
|
|
8
|
+
kodit/database.py,sha256=k93byjVUX1VjAb0hLZxUo4liEKKxAWUBJNw2e7rzaiI,1771
|
|
9
|
+
kodit/log.py,sha256=ZpM0eMo_DVGQqrHxg0VV6dMrN2AAmu_3C0I3G7p2nMw,8828
|
|
10
|
+
kodit/mcp.py,sha256=GWh9krkcP37wh8ZmvfXaGJPknhaautBxzvbMMr5FRdg,7555
|
|
11
11
|
kodit/middleware.py,sha256=TiwebNpaEmiP7QRuZrfZcCL51IUefQyNLSPuzVyk8UM,2813
|
|
12
|
-
kodit/reporting.py,sha256=icce1ZyiADsA_Qz-mSjgn2H4SSqKuGfLKnw-yrl9nsg,2722
|
|
13
12
|
kodit/application/__init__.py,sha256=mH50wTpgP9dhbKztFsL8Dda9Hi18TSnMVxXtpp4aGOA,35
|
|
14
13
|
kodit/application/factories/__init__.py,sha256=bU5CvEnaBePZ7JbkCOp1MGTNP752bnU2uEqmfy5FdRk,37
|
|
15
|
-
kodit/application/factories/code_indexing_factory.py,sha256=
|
|
14
|
+
kodit/application/factories/code_indexing_factory.py,sha256=4c5LS2t7FOHiNS_Xb5sPRngf3we-VbTWKa-NcjZmf0Q,7300
|
|
15
|
+
kodit/application/factories/reporting_factory.py,sha256=Plf3c1KIx36eM5YefU5svPr9QeaNcKFH5UlmDuET8R0,1013
|
|
16
16
|
kodit/application/services/__init__.py,sha256=p5UQNw-H5sxQvs5Etfte93B3cJ1kKW6DNxK34uFvU1E,38
|
|
17
|
-
kodit/application/services/auto_indexing_service.py,sha256=
|
|
18
|
-
kodit/application/services/code_indexing_application_service.py,sha256=
|
|
19
|
-
kodit/application/services/indexing_worker_service.py,sha256=
|
|
20
|
-
kodit/application/services/queue_service.py,sha256=
|
|
21
|
-
kodit/application/services/
|
|
17
|
+
kodit/application/services/auto_indexing_service.py,sha256=rJPWiV755eskFNKjYliPr1WMFylXlG8BWPpFcwwOhm0,3973
|
|
18
|
+
kodit/application/services/code_indexing_application_service.py,sha256=tLbbo-fyAc3iZoCOJU9lIfhNI_6Lz9SqQdfjeN5m8yA,16213
|
|
19
|
+
kodit/application/services/indexing_worker_service.py,sha256=B8MdXrzjaYVS7zVTTz8cXUQItkGb8Fk1aXeim2dfCJw,5311
|
|
20
|
+
kodit/application/services/queue_service.py,sha256=G42lR31maFRZ9cSvnWZrzeyb4P1R6yFqrcHWVKAqc9U,1924
|
|
21
|
+
kodit/application/services/reporting.py,sha256=hDisTU_XUBTfiOtnJ5-6x0jj8rHSlq9zZgeNnBT7W5Y,2834
|
|
22
|
+
kodit/application/services/sync_scheduler.py,sha256=FUUpDtxUh7Eg-lnzOrUHzmSWGpzdpYJQYgPhnQYwTcg,3446
|
|
22
23
|
kodit/domain/__init__.py,sha256=TCpg4Xx-oF4mKV91lo4iXqMEfBT1OoRSYnbG-zVWolA,66
|
|
23
24
|
kodit/domain/entities.py,sha256=QsCzKXT7gF9jTPAjJo5lqjFGRsIklAFC2qRy_Gt3RbA,10377
|
|
24
25
|
kodit/domain/errors.py,sha256=yIsgCjM_yOFIg8l7l-t7jM8pgeAX4cfPq0owf7iz3DA,106
|
|
25
|
-
kodit/domain/
|
|
26
|
-
kodit/domain/
|
|
27
|
-
kodit/domain/value_objects.py,sha256=dkfbg99PSCrfj6nJ7tZ2UzDG3QUgNa_Cpj2gLakDM5k,17512
|
|
26
|
+
kodit/domain/protocols.py,sha256=RGNOlHyvNq6Nx_95ETTO9DkzeZmjtubfC7qdGvA5iPk,2753
|
|
27
|
+
kodit/domain/value_objects.py,sha256=uIpAdIvq6VefEGa8yq5Uqyuyit72SHtDptnoOUd73u0,18882
|
|
28
28
|
kodit/domain/services/__init__.py,sha256=Q1GhCK_PqKHYwYE4tkwDz5BIyXkJngLBBOHhzvX8nzo,42
|
|
29
|
-
kodit/domain/services/bm25_service.py,sha256=
|
|
29
|
+
kodit/domain/services/bm25_service.py,sha256=seRo0V-zW6Uq-Y67j0-zp1xz93gbfQgvlEbpQeYHN1U,3529
|
|
30
30
|
kodit/domain/services/embedding_service.py,sha256=7drYRC2kjg0WJmo06a2E9N0vDnwInUlBB96twjz2BT8,4526
|
|
31
31
|
kodit/domain/services/enrichment_service.py,sha256=XsXg3nV-KN4rqtC7Zro_ZiZ6RSq-1eA1MG6IDzFGyBA,1316
|
|
32
32
|
kodit/domain/services/index_query_service.py,sha256=cDQkgpJ3JbyeZ3z3GTIqH1JzhhKE_LBIwYE6b-lakwU,2172
|
|
33
|
-
kodit/domain/services/index_service.py,sha256=
|
|
33
|
+
kodit/domain/services/index_service.py,sha256=TSvM-UuOtq30hz6eNPgu9AEVFrLDugdYoBgBf1xZDcI,10377
|
|
34
34
|
kodit/infrastructure/__init__.py,sha256=HzEYIjoXnkz_i_MHO2e0sIVYweUcRnl2RpyBiTbMObU,28
|
|
35
35
|
kodit/infrastructure/api/__init__.py,sha256=U0TSMPpHrlj1zbAtleuZjU3nXGwudyMe-veNBgvODwM,34
|
|
36
36
|
kodit/infrastructure/api/client/__init__.py,sha256=6RSYqeuxjDe_zTUq48D0F-VfBBUvDmTkO3K3vD61q3I,349
|
|
@@ -41,14 +41,16 @@ kodit/infrastructure/api/client/index_client.py,sha256=OxsakDQBEulwmqZVzwOSSI0Lk
|
|
|
41
41
|
kodit/infrastructure/api/client/search_client.py,sha256=f4mM5ZJpAuR7w-i9yASbh4SYMxOq7_f4hXgaQesGquI,2614
|
|
42
42
|
kodit/infrastructure/api/middleware/__init__.py,sha256=6m7eE5k5buboJbuzyX5E9-Tf99yNwFaeJF0f_6HwLyM,30
|
|
43
43
|
kodit/infrastructure/api/middleware/auth.py,sha256=QSnMcMLWvfumqN1iG4ePj2vEZb2Dlsgr-WHptkEkkhE,1064
|
|
44
|
-
kodit/infrastructure/api/v1/__init__.py,sha256=
|
|
45
|
-
kodit/infrastructure/api/v1/dependencies.py,sha256=
|
|
46
|
-
kodit/infrastructure/api/v1/routers/__init__.py,sha256=
|
|
44
|
+
kodit/infrastructure/api/v1/__init__.py,sha256=hQ03es21FSgzQlmdP5xWZzK80woIvuYGjiZLwFYuYwk,151
|
|
45
|
+
kodit/infrastructure/api/v1/dependencies.py,sha256=MBmCpTtwDAtdsLjJ06Bzod3Vwqon8mMASknZobdoaMU,2919
|
|
46
|
+
kodit/infrastructure/api/v1/routers/__init__.py,sha256=YYyeiuyphIPc-Q_2totF8zfR0BoseOH4ZYFdHP0ed_M,218
|
|
47
47
|
kodit/infrastructure/api/v1/routers/indexes.py,sha256=_lUir1M0SW6kPHeGqjiPjtSa50rY4PN2es5TZEpSHYE,3442
|
|
48
|
+
kodit/infrastructure/api/v1/routers/queue.py,sha256=EZbR-G0qDO9W5ajV_75GRk2pW1Qdgc0ggOwrGKlBE2A,2138
|
|
48
49
|
kodit/infrastructure/api/v1/routers/search.py,sha256=da9YTR6VTzU85_6X3aaZemdTHGCEvcPNeKuMFBgmT_A,2452
|
|
49
50
|
kodit/infrastructure/api/v1/schemas/__init__.py,sha256=_5BVqv4EUi_vvWlAQOE_VfRulUDAF21ZQ7z27y7YOdw,498
|
|
50
51
|
kodit/infrastructure/api/v1/schemas/context.py,sha256=NlsIn9j1R3se7JkGZivS_CUN4gGP5NYaAtkRe3QH6dk,214
|
|
51
52
|
kodit/infrastructure/api/v1/schemas/index.py,sha256=NtL09YtO50h-ddpAFxNf-dyxu_Xi5v3yOpKW0W4xsAM,1950
|
|
53
|
+
kodit/infrastructure/api/v1/schemas/queue.py,sha256=oa4wumWOvGzi53Q3cjwIrQJRoentp5nsQSsaj-l-B4U,652
|
|
52
54
|
kodit/infrastructure/api/v1/schemas/search.py,sha256=CWzg5SIMUJ_4yM-ZfgSLWCanMxov6AyGgQQcOMkRlGw,5618
|
|
53
55
|
kodit/infrastructure/bm25/__init__.py,sha256=DmGbrEO34FOJy4e685BbyxLA7gPW1eqs2gAxsp6JOuM,34
|
|
54
56
|
kodit/infrastructure/bm25/bm25_factory.py,sha256=I4eo7qRslnyXIRkBf-StZ5ga2Evrr5J5YFocTChFD3g,884
|
|
@@ -57,42 +59,43 @@ kodit/infrastructure/bm25/vectorchord_bm25_repository.py,sha256=p6ht5K-jlDTvEkmo
|
|
|
57
59
|
kodit/infrastructure/cloning/__init__.py,sha256=IzIvX-yeRRFZ-lfvPVSEe_qXszO6DGQdjKwwDigexyQ,30
|
|
58
60
|
kodit/infrastructure/cloning/metadata.py,sha256=GD2UnCC1oR82RD0SVUqk9CJOqzXPxhOAHVOp7jqN6Qc,3571
|
|
59
61
|
kodit/infrastructure/cloning/git/__init__.py,sha256=20ePcp0qE6BuLsjsv4KYB1DzKhMIMsPXwEqIEZtjTJs,34
|
|
60
|
-
kodit/infrastructure/cloning/git/working_copy.py,sha256=
|
|
62
|
+
kodit/infrastructure/cloning/git/working_copy.py,sha256=Lt_NWSoQ1pZAi0u_MKUhrwGeul4XWf3zqCuzG3dn70s,3608
|
|
61
63
|
kodit/infrastructure/embedding/__init__.py,sha256=F-8nLlWAerYJ0MOIA4tbXHLan8bW5rRR84vzxx6tRKI,39
|
|
62
|
-
kodit/infrastructure/embedding/embedding_factory.py,sha256=
|
|
64
|
+
kodit/infrastructure/embedding/embedding_factory.py,sha256=BNhrrYQAkcnXkuuQy-Q-lwJhyoGONsTsbgN4t0UdGeY,3395
|
|
63
65
|
kodit/infrastructure/embedding/local_vector_search_repository.py,sha256=ExweyNEL5cP-g3eDhGqZSih7zhdOrop2WdFPPJL-tB4,3505
|
|
64
66
|
kodit/infrastructure/embedding/vectorchord_vector_search_repository.py,sha256=PIoU0HsDlaoXDXnGjOR0LAkAcW4JiE3ymJy_SBhEopc,8030
|
|
65
67
|
kodit/infrastructure/embedding/embedding_providers/__init__.py,sha256=qeZ-oAIAxMl5QqebGtO1lq-tHjl_ucAwOXePklcwwGk,34
|
|
66
68
|
kodit/infrastructure/embedding/embedding_providers/batching.py,sha256=a8CL9PX2VLmbeg616fc_lQzfC4BWTVn32m4SEhXpHxc,3279
|
|
67
69
|
kodit/infrastructure/embedding/embedding_providers/hash_embedding_provider.py,sha256=V6OdCuWyQQOvo3OJGRi-gBKDApIcrELydFg7T696P5s,2257
|
|
70
|
+
kodit/infrastructure/embedding/embedding_providers/litellm_embedding_provider.py,sha256=9Q5he_MI8xXENODwCvYCbhVawTjTv1bArGQrmxoWLas,5297
|
|
68
71
|
kodit/infrastructure/embedding/embedding_providers/local_embedding_provider.py,sha256=9aLV1Zg4KMhYWlGRwgAUtswW4aIabNqbsipWhAn64RI,4133
|
|
69
|
-
kodit/infrastructure/embedding/embedding_providers/openai_embedding_provider.py,sha256=CE86s8IicieUjIDWn2xzswteHXCzmw1Qz6Kp4GBIcus,6316
|
|
70
72
|
kodit/infrastructure/enrichment/__init__.py,sha256=8acZKNzql8Fs0lceFu9U3KoUrOptRBtVIxr_Iw6lz3Y,40
|
|
71
|
-
kodit/infrastructure/enrichment/enrichment_factory.py,sha256=
|
|
73
|
+
kodit/infrastructure/enrichment/enrichment_factory.py,sha256=NFGY6u9SJ_GOgiB_RtotbQmte0kGFQUymwzZCbbsx34,1530
|
|
74
|
+
kodit/infrastructure/enrichment/litellm_enrichment_provider.py,sha256=AM4-4KApDndzWzQzzKAedy21iGMhkwylR5VCmV9K-uI,6040
|
|
72
75
|
kodit/infrastructure/enrichment/local_enrichment_provider.py,sha256=aVU3_kbLJ0BihwGIwvJ00DBe0voHkiKdFSjPxxkVfVA,4150
|
|
73
76
|
kodit/infrastructure/enrichment/null_enrichment_provider.py,sha256=DhZkJBnkvXg_XSAs-oKiFnKqYFPnmTl3ikdxrqeEfbc,713
|
|
74
|
-
kodit/infrastructure/enrichment/openai_enrichment_provider.py,sha256=C0y0NEPu1GpFr22TGi1voxYGsYTV0ZITYuDzvRJ5vW4,5573
|
|
75
77
|
kodit/infrastructure/enrichment/utils.py,sha256=FE9UCuxxzSdoHrmAC8Si2b5D6Nf6kVqgM1yjUVyCvW0,930
|
|
76
78
|
kodit/infrastructure/git/__init__.py,sha256=0iMosFzudj4_xNIMe2SRbV6l5bWqkjnUsZoFsoZFuM8,33
|
|
77
|
-
kodit/infrastructure/git/git_utils.py,sha256=
|
|
79
|
+
kodit/infrastructure/git/git_utils.py,sha256=5lH94AcF7Hac4h6kBzo_B9pzC1S6AK2-Dy13gz--Zf0,781
|
|
78
80
|
kodit/infrastructure/ignore/__init__.py,sha256=VzFv8XOzHmsu0MEGnWVSF6KsgqLBmvHlRqAkT1Xb1MY,36
|
|
79
81
|
kodit/infrastructure/ignore/ignore_pattern_provider.py,sha256=zdxun3GodLfXxyssBK8QDUK58xb4fBJ0SKcHUyn3pzM,2131
|
|
80
82
|
kodit/infrastructure/indexing/__init__.py,sha256=7UPRa2jwCAsa0Orsp6PqXSF8iIXJVzXHMFmrKkI9yH8,38
|
|
81
83
|
kodit/infrastructure/indexing/fusion_service.py,sha256=2B0guBsuKz19uWcs18sIJpUJPzXoRvULgl7UNWQGysA,1809
|
|
82
84
|
kodit/infrastructure/mappers/__init__.py,sha256=QPHOjNreXmBPPovZ6elnYFS0vD-IsmrGl4TT01FCKro,77
|
|
83
|
-
kodit/infrastructure/mappers/index_mapper.py,sha256=
|
|
85
|
+
kodit/infrastructure/mappers/index_mapper.py,sha256=XWtv_him2Sd9dR-Jy_ndy9jYXVtv3LttzmmUGzNK6CE,12825
|
|
84
86
|
kodit/infrastructure/mappers/task_mapper.py,sha256=QW7uL8rji6QJ7RRdHwbvkWqmwDcUDGTYPLwbwiKlViY,2919
|
|
87
|
+
kodit/infrastructure/reporting/__init__.py,sha256=4Qu38YbDOaeDqLdT_CbK8tOZHTKGrHRXncVKlGRzOeQ,32
|
|
88
|
+
kodit/infrastructure/reporting/log_progress.py,sha256=sNF0oeg56NaTfO3DVg1AXQWwgrHSTaZrOWqPDq-FhVE,2180
|
|
89
|
+
kodit/infrastructure/reporting/tdqm_progress.py,sha256=g01P7PItQqqSXzM5jjXL6uOUIKJQ6O9zaO1WZZ7XKSM,2512
|
|
85
90
|
kodit/infrastructure/slicing/__init__.py,sha256=x7cjvHA9Ay2weUYE_dpdAaPaStp20M-4U2b5MLgT5KM,37
|
|
86
91
|
kodit/infrastructure/slicing/language_detection_service.py,sha256=JGJXrq9bLyfnisWJXeP7y1jbZMmKAISdPBlRBCosUcE,684
|
|
87
92
|
kodit/infrastructure/slicing/slicer.py,sha256=GOqJykd00waOTO1WJHyE5KUgJ2RLx2rOQ7M7T_u5LLg,35600
|
|
88
93
|
kodit/infrastructure/sqlalchemy/__init__.py,sha256=UXPMSF_hgWaqr86cawRVqM8XdVNumQyyK5B8B97GnlA,33
|
|
89
|
-
kodit/infrastructure/sqlalchemy/embedding_repository.py,sha256=
|
|
90
|
-
kodit/infrastructure/sqlalchemy/entities.py,sha256=
|
|
91
|
-
kodit/infrastructure/sqlalchemy/index_repository.py,sha256=
|
|
92
|
-
kodit/infrastructure/sqlalchemy/task_repository.py,sha256=
|
|
93
|
-
kodit/infrastructure/
|
|
94
|
-
kodit/infrastructure/ui/progress.py,sha256=SHEUoQA_x36z4nqHrQduVrrWIvFfX6QxAawC7zQ50pw,6433
|
|
95
|
-
kodit/infrastructure/ui/spinner.py,sha256=GcP115qtR0VEnGfMEtsGoAUpRzVGUSfiUXfoJJERngA,2357
|
|
94
|
+
kodit/infrastructure/sqlalchemy/embedding_repository.py,sha256=YYxbUEdzDdlKdy0FyAP4EzhJMAIdEnNZiXT6hzPHk9I,7731
|
|
95
|
+
kodit/infrastructure/sqlalchemy/entities.py,sha256=P3BitWqnTxMVXmyez7OX-SB3-UG66XorqvPMjXspwoM,7894
|
|
96
|
+
kodit/infrastructure/sqlalchemy/index_repository.py,sha256=x8MPl0j7GrW_lEZh464EZyb0w935p_EHv2NIMNxjJu0,25680
|
|
97
|
+
kodit/infrastructure/sqlalchemy/task_repository.py,sha256=60ECbxiXC2_UR80f4uPSmJiP_so7PTBzZG_w1WXSiuE,3546
|
|
98
|
+
kodit/infrastructure/sqlalchemy/unit_of_work.py,sha256=gK-C8yk2HYBrAEDrblWxBrldrGb83SBHn-8lURkFeMg,2093
|
|
96
99
|
kodit/migrations/README,sha256=ISVtAOvqvKk_5ThM5ioJE-lMkvf9IbknFUFVU_vPma4,58
|
|
97
100
|
kodit/migrations/__init__.py,sha256=lP5MuwlyWRMO6UcDWnQcQ3G-GYHcFb6rl9gYPHJ1sjo,40
|
|
98
101
|
kodit/migrations/env.py,sha256=m57TkFLYjQ4w2aw1YICXkeek27M6qjwRDMHvThWqIL0,2383
|
|
@@ -109,8 +112,8 @@ kodit/utils/__init__.py,sha256=DPEB1i8evnLF4Ns3huuAYg-0pKBFKUFuiDzOKG9r-sw,33
|
|
|
109
112
|
kodit/utils/dump_openapi.py,sha256=29VdjHpNSaGAg7RjQw0meq1OLhljCx1ElgBlTC8xoF4,1247
|
|
110
113
|
kodit/utils/generate_api_paths.py,sha256=TMtx9v55podDfUmiWaHgJHLtEWLV2sLL-5ejGFMPzAo,3569
|
|
111
114
|
kodit/utils/path_utils.py,sha256=thK6YGGNvQThdBaCYCCeCvS1L8x-lwl3AoGht2jnjGw,1645
|
|
112
|
-
kodit-0.4.
|
|
113
|
-
kodit-0.4.
|
|
114
|
-
kodit-0.4.
|
|
115
|
-
kodit-0.4.
|
|
116
|
-
kodit-0.4.
|
|
115
|
+
kodit-0.4.2.dist-info/METADATA,sha256=bC5eza2ORs3v3w5-bwW1uybuk8b4JdNI13GZQvrP4ps,7702
|
|
116
|
+
kodit-0.4.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
117
|
+
kodit-0.4.2.dist-info/entry_points.txt,sha256=hoTn-1aKyTItjnY91fnO-rV5uaWQLQ-Vi7V5et2IbHY,40
|
|
118
|
+
kodit-0.4.2.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
119
|
+
kodit-0.4.2.dist-info/RECORD,,
|
kodit/domain/interfaces.py
DELETED
|
@@ -1,27 +0,0 @@
|
|
|
1
|
-
"""Domain interfaces."""
|
|
2
|
-
|
|
3
|
-
from abc import ABC, abstractmethod
|
|
4
|
-
|
|
5
|
-
from kodit.domain.value_objects import ProgressEvent
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
class ProgressCallback(ABC):
|
|
9
|
-
"""Abstract interface for progress callbacks."""
|
|
10
|
-
|
|
11
|
-
@abstractmethod
|
|
12
|
-
async def on_progress(self, event: ProgressEvent) -> None:
|
|
13
|
-
"""On progress hook."""
|
|
14
|
-
|
|
15
|
-
@abstractmethod
|
|
16
|
-
async def on_complete(self, operation: str) -> None:
|
|
17
|
-
"""On complete hook."""
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
class NullProgressCallback(ProgressCallback):
|
|
21
|
-
"""Null implementation of progress callback that does nothing."""
|
|
22
|
-
|
|
23
|
-
async def on_progress(self, event: ProgressEvent) -> None:
|
|
24
|
-
"""Do nothing on progress."""
|
|
25
|
-
|
|
26
|
-
async def on_complete(self, operation: str) -> None:
|
|
27
|
-
"""Do nothing on complete."""
|
|
@@ -1,183 +0,0 @@
|
|
|
1
|
-
"""OpenAI embedding provider implementation using httpx."""
|
|
2
|
-
|
|
3
|
-
import asyncio
|
|
4
|
-
from collections.abc import AsyncGenerator
|
|
5
|
-
from typing import Any
|
|
6
|
-
|
|
7
|
-
import httpx
|
|
8
|
-
import structlog
|
|
9
|
-
import tiktoken
|
|
10
|
-
from tiktoken import Encoding
|
|
11
|
-
|
|
12
|
-
from kodit.domain.services.embedding_service import EmbeddingProvider
|
|
13
|
-
from kodit.domain.value_objects import EmbeddingRequest, EmbeddingResponse
|
|
14
|
-
|
|
15
|
-
from .batching import split_sub_batches
|
|
16
|
-
|
|
17
|
-
# Constants
|
|
18
|
-
MAX_TOKENS = 8192 # Conservative token limit for the embedding model
|
|
19
|
-
BATCH_SIZE = (
|
|
20
|
-
10 # Maximum number of items per API call (keeps existing test expectations)
|
|
21
|
-
)
|
|
22
|
-
OPENAI_NUM_PARALLEL_TASKS = 10 # Semaphore limit for concurrent OpenAI requests
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
class OpenAIEmbeddingProvider(EmbeddingProvider):
|
|
26
|
-
"""OpenAI embedding provider that uses OpenAI's embedding API via httpx."""
|
|
27
|
-
|
|
28
|
-
def __init__( # noqa: PLR0913
|
|
29
|
-
self,
|
|
30
|
-
api_key: str | None = None,
|
|
31
|
-
base_url: str = "https://api.openai.com",
|
|
32
|
-
model_name: str = "text-embedding-3-small",
|
|
33
|
-
num_parallel_tasks: int = OPENAI_NUM_PARALLEL_TASKS,
|
|
34
|
-
socket_path: str | None = None,
|
|
35
|
-
timeout: float = 30.0,
|
|
36
|
-
) -> None:
|
|
37
|
-
"""Initialize the OpenAI embedding provider.
|
|
38
|
-
|
|
39
|
-
Args:
|
|
40
|
-
api_key: The OpenAI API key.
|
|
41
|
-
base_url: The base URL for the OpenAI API.
|
|
42
|
-
model_name: The model name to use for embeddings.
|
|
43
|
-
num_parallel_tasks: Maximum number of concurrent requests.
|
|
44
|
-
socket_path: Optional Unix socket path for local communication.
|
|
45
|
-
timeout: Request timeout in seconds.
|
|
46
|
-
|
|
47
|
-
"""
|
|
48
|
-
self.model_name = model_name
|
|
49
|
-
self.num_parallel_tasks = num_parallel_tasks
|
|
50
|
-
self.log = structlog.get_logger(__name__)
|
|
51
|
-
self.api_key = api_key
|
|
52
|
-
self.base_url = base_url
|
|
53
|
-
self.socket_path = socket_path
|
|
54
|
-
self.timeout = timeout
|
|
55
|
-
|
|
56
|
-
# Lazily initialised token encoding
|
|
57
|
-
self._encoding: Encoding | None = None
|
|
58
|
-
|
|
59
|
-
# Create httpx client with optional Unix socket support
|
|
60
|
-
if socket_path:
|
|
61
|
-
transport = httpx.AsyncHTTPTransport(uds=socket_path)
|
|
62
|
-
self.http_client = httpx.AsyncClient(
|
|
63
|
-
transport=transport,
|
|
64
|
-
base_url="http://localhost", # Base URL for Unix socket
|
|
65
|
-
timeout=timeout,
|
|
66
|
-
)
|
|
67
|
-
else:
|
|
68
|
-
self.http_client = httpx.AsyncClient(
|
|
69
|
-
base_url=base_url,
|
|
70
|
-
timeout=timeout,
|
|
71
|
-
)
|
|
72
|
-
|
|
73
|
-
# ---------------------------------------------------------------------
|
|
74
|
-
# Helper utilities
|
|
75
|
-
# ---------------------------------------------------------------------
|
|
76
|
-
|
|
77
|
-
def _get_encoding(self) -> "Encoding":
    """Return the tiktoken encoding for ``self.model_name``, caching it.

    The encoding is looked up once and stored on the instance. When
    tiktoken raises ``KeyError`` for the model name, the ``o200k_base``
    encoding is used instead and the fallback is logged.
    """
    cached = self._encoding
    if cached is not None:
        return cached

    fallback_name = "o200k_base"
    try:
        resolved = tiktoken.encoding_for_model(self.model_name)
    except KeyError:
        # tiktoken has no mapping for this model name.
        self.log.info(
            "Model not supported by tiktoken, using default encoding",
            model_name=self.model_name,
            default_encoding=fallback_name,
        )
        resolved = tiktoken.get_encoding(fallback_name)

    self._encoding = resolved
    return resolved
|
|
92
|
-
|
|
93
|
-
def _split_sub_batches(
    self, encoding: "Encoding", data: list[EmbeddingRequest]
) -> list[list[EmbeddingRequest]]:
    """Forward to the shared ``split_sub_batches`` helper.

    Kept as a method for backward compatibility; it passes the
    module-level ``MAX_TOKENS`` and ``BATCH_SIZE`` limits.
    """
    limits = {"max_tokens": MAX_TOKENS, "batch_size": BATCH_SIZE}
    return split_sub_batches(encoding, data, **limits)
|
|
103
|
-
|
|
104
|
-
async def _call_embeddings_api(
    self, texts: list[str]
) -> dict[str, Any]:
    """POST ``texts`` to ``/v1/embeddings`` and return the decoded JSON.

    Args:
        texts: The texts to embed.

    Returns:
        The API response parsed from JSON.

    Raises:
        httpx.HTTPStatusError: If the response carries an error status.

    """
    payload = {"model": self.model_name, "input": texts}

    request_headers = {"Content-Type": "application/json"}
    if self.api_key:
        # Only authenticate when a key was configured.
        request_headers["Authorization"] = f"Bearer {self.api_key}"

    reply = await self.http_client.post(
        "/v1/embeddings",
        json=payload,
        headers=request_headers,
    )
    reply.raise_for_status()
    return reply.json()
|
|
134
|
-
|
|
135
|
-
async def embed(
    self, data: list[EmbeddingRequest]
) -> AsyncGenerator[list[EmbeddingResponse], None]:
    """Embed the given requests, yielding the results batch by batch.

    The requests are split into sub-batches, each sub-batch is sent to the
    embeddings API with at most ``self.num_parallel_tasks`` requests in
    flight, and each batch's responses are yielded as soon as that batch
    finishes. The yield order can therefore differ from the input order.

    A batch that fails for any reason is logged and yielded as an empty
    list, so one bad batch does not abort the others.

    Args:
        data: The embedding requests to process.

    Yields:
        One list of ``EmbeddingResponse`` per sub-batch, or a single empty
        list when ``data`` is empty.

    """
    if not data:
        yield []
        # Nothing to embed: stop here rather than falling through and
        # resolving the tokenizer and batching for no work.
        return

    encoding = self._get_encoding()

    # First, split by token limits (and max batch size)
    batched_data = self._split_sub_batches(encoding, data)

    # Bound the number of API calls running at the same time.
    sem = asyncio.Semaphore(self.num_parallel_tasks)

    async def _process_batch(
        batch: list[EmbeddingRequest],
    ) -> list[EmbeddingResponse]:
        async with sem:
            try:
                response = await self._call_embeddings_api(
                    [item.text for item in batch]
                )
                embeddings_data = response.get("data", [])

                # strict=True turns a count mismatch between requests and
                # returned embeddings into an error handled below.
                return [
                    EmbeddingResponse(
                        snippet_id=item.snippet_id,
                        embedding=emb_data.get("embedding", []),
                    )
                    for item, emb_data in zip(batch, embeddings_data, strict=True)
                ]
            except Exception as e:
                self.log.exception("Error embedding batch", error=str(e))
                # Return no embeddings for this batch if there was an error
                return []

    tasks = [_process_batch(batch) for batch in batched_data]
    for task in asyncio.as_completed(tasks):
        yield await task
|
|
178
|
-
|
|
179
|
-
async def close(self) -> None:
    """Close the HTTP client if this instance has one."""
    try:
        client = self.http_client
    except AttributeError:
        # No client attribute on this instance; nothing to release.
        return
    await client.aclose()
|
|
183
|
-
|