kodit 0.2.4__py3-none-any.whl → 0.2.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of kodit might be problematic. Click here for more details.
- kodit/_version.py +2 -2
- kodit/application/__init__.py +1 -0
- kodit/application/commands/__init__.py +1 -0
- kodit/application/commands/snippet_commands.py +22 -0
- kodit/application/services/__init__.py +1 -0
- kodit/application/services/indexing_application_service.py +387 -0
- kodit/application/services/snippet_application_service.py +149 -0
- kodit/cli.py +118 -82
- kodit/database.py +0 -22
- kodit/domain/__init__.py +1 -0
- kodit/{source/source_models.py → domain/entities.py} +88 -19
- kodit/domain/enums.py +9 -0
- kodit/domain/errors.py +5 -0
- kodit/domain/interfaces.py +27 -0
- kodit/domain/repositories.py +95 -0
- kodit/domain/services/__init__.py +1 -0
- kodit/domain/services/bm25_service.py +124 -0
- kodit/domain/services/embedding_service.py +155 -0
- kodit/domain/services/enrichment_service.py +48 -0
- kodit/domain/services/ignore_service.py +45 -0
- kodit/domain/services/indexing_service.py +203 -0
- kodit/domain/services/snippet_extraction_service.py +89 -0
- kodit/domain/services/source_service.py +85 -0
- kodit/domain/value_objects.py +215 -0
- kodit/infrastructure/__init__.py +1 -0
- kodit/infrastructure/bm25/__init__.py +1 -0
- kodit/infrastructure/bm25/bm25_factory.py +28 -0
- kodit/{bm25/local_bm25.py → infrastructure/bm25/local_bm25_repository.py} +33 -22
- kodit/{bm25/vectorchord_bm25.py → infrastructure/bm25/vectorchord_bm25_repository.py} +40 -35
- kodit/infrastructure/cloning/__init__.py +1 -0
- kodit/infrastructure/cloning/folder/__init__.py +1 -0
- kodit/infrastructure/cloning/folder/factory.py +128 -0
- kodit/infrastructure/cloning/folder/working_copy.py +38 -0
- kodit/infrastructure/cloning/git/__init__.py +1 -0
- kodit/infrastructure/cloning/git/factory.py +147 -0
- kodit/infrastructure/cloning/git/working_copy.py +32 -0
- kodit/infrastructure/cloning/metadata.py +127 -0
- kodit/infrastructure/embedding/__init__.py +1 -0
- kodit/infrastructure/embedding/embedding_factory.py +87 -0
- kodit/infrastructure/embedding/embedding_providers/__init__.py +1 -0
- kodit/infrastructure/embedding/embedding_providers/batching.py +93 -0
- kodit/infrastructure/embedding/embedding_providers/hash_embedding_provider.py +79 -0
- kodit/infrastructure/embedding/embedding_providers/local_embedding_provider.py +129 -0
- kodit/infrastructure/embedding/embedding_providers/openai_embedding_provider.py +113 -0
- kodit/infrastructure/embedding/local_vector_search_repository.py +114 -0
- kodit/{embedding/vectorchord_vector_search_service.py → infrastructure/embedding/vectorchord_vector_search_repository.py} +65 -46
- kodit/infrastructure/enrichment/__init__.py +1 -0
- kodit/{enrichment → infrastructure/enrichment}/enrichment_factory.py +28 -12
- kodit/infrastructure/enrichment/legacy_enrichment_models.py +42 -0
- kodit/{enrichment/enrichment_provider → infrastructure/enrichment}/local_enrichment_provider.py +38 -26
- kodit/infrastructure/enrichment/null_enrichment_provider.py +25 -0
- kodit/infrastructure/enrichment/openai_enrichment_provider.py +89 -0
- kodit/infrastructure/git/__init__.py +1 -0
- kodit/{source/git.py → infrastructure/git/git_utils.py} +10 -2
- kodit/infrastructure/ignore/__init__.py +1 -0
- kodit/{source/ignore.py → infrastructure/ignore/ignore_pattern_provider.py} +23 -6
- kodit/infrastructure/indexing/__init__.py +1 -0
- kodit/infrastructure/indexing/fusion_service.py +55 -0
- kodit/infrastructure/indexing/index_repository.py +291 -0
- kodit/infrastructure/indexing/indexing_factory.py +113 -0
- kodit/infrastructure/snippet_extraction/__init__.py +1 -0
- kodit/infrastructure/snippet_extraction/language_detection_service.py +39 -0
- kodit/infrastructure/snippet_extraction/snippet_extraction_factory.py +95 -0
- kodit/infrastructure/snippet_extraction/snippet_query_provider.py +45 -0
- kodit/{snippets/method_snippets.py → infrastructure/snippet_extraction/tree_sitter_snippet_extractor.py} +123 -61
- kodit/infrastructure/sqlalchemy/__init__.py +1 -0
- kodit/{embedding → infrastructure/sqlalchemy}/embedding_repository.py +40 -26
- kodit/infrastructure/sqlalchemy/file_repository.py +78 -0
- kodit/infrastructure/sqlalchemy/repository.py +133 -0
- kodit/infrastructure/sqlalchemy/snippet_repository.py +79 -0
- kodit/infrastructure/ui/__init__.py +1 -0
- kodit/infrastructure/ui/progress.py +127 -0
- kodit/{util → infrastructure/ui}/spinner.py +19 -4
- kodit/mcp.py +51 -28
- kodit/migrations/env.py +1 -4
- kodit/reporting.py +78 -0
- {kodit-0.2.4.dist-info → kodit-0.2.6.dist-info}/METADATA +1 -1
- kodit-0.2.6.dist-info/RECORD +100 -0
- kodit/bm25/__init__.py +0 -1
- kodit/bm25/keyword_search_factory.py +0 -17
- kodit/bm25/keyword_search_service.py +0 -34
- kodit/embedding/__init__.py +0 -1
- kodit/embedding/embedding_factory.py +0 -69
- kodit/embedding/embedding_models.py +0 -28
- kodit/embedding/embedding_provider/__init__.py +0 -1
- kodit/embedding/embedding_provider/embedding_provider.py +0 -92
- kodit/embedding/embedding_provider/hash_embedding_provider.py +0 -86
- kodit/embedding/embedding_provider/local_embedding_provider.py +0 -96
- kodit/embedding/embedding_provider/openai_embedding_provider.py +0 -73
- kodit/embedding/local_vector_search_service.py +0 -87
- kodit/embedding/vector_search_service.py +0 -55
- kodit/enrichment/__init__.py +0 -1
- kodit/enrichment/enrichment_provider/__init__.py +0 -1
- kodit/enrichment/enrichment_provider/enrichment_provider.py +0 -36
- kodit/enrichment/enrichment_provider/openai_enrichment_provider.py +0 -79
- kodit/enrichment/enrichment_service.py +0 -45
- kodit/indexing/__init__.py +0 -1
- kodit/indexing/fusion.py +0 -67
- kodit/indexing/indexing_models.py +0 -43
- kodit/indexing/indexing_repository.py +0 -216
- kodit/indexing/indexing_service.py +0 -344
- kodit/snippets/__init__.py +0 -1
- kodit/snippets/languages/__init__.py +0 -53
- kodit/snippets/snippets.py +0 -50
- kodit/source/__init__.py +0 -1
- kodit/source/source_factories.py +0 -356
- kodit/source/source_repository.py +0 -169
- kodit/source/source_service.py +0 -150
- kodit/util/__init__.py +0 -1
- kodit-0.2.4.dist-info/RECORD +0 -71
- /kodit/{snippets → infrastructure/snippet_extraction}/languages/csharp.scm +0 -0
- /kodit/{snippets → infrastructure/snippet_extraction}/languages/go.scm +0 -0
- /kodit/{snippets → infrastructure/snippet_extraction}/languages/javascript.scm +0 -0
- /kodit/{snippets → infrastructure/snippet_extraction}/languages/python.scm +0 -0
- /kodit/{snippets → infrastructure/snippet_extraction}/languages/typescript.scm +0 -0
- {kodit-0.2.4.dist-info → kodit-0.2.6.dist-info}/WHEEL +0 -0
- {kodit-0.2.4.dist-info → kodit-0.2.6.dist-info}/entry_points.txt +0 -0
- {kodit-0.2.4.dist-info → kodit-0.2.6.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
"""Legacy enrichment models for backward compatibility."""
|
|
2
|
+
|
|
3
|
+
from abc import ABC, abstractmethod
|
|
4
|
+
from collections.abc import AsyncGenerator
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@dataclass
class EnrichmentRequest:
    """Legacy request model: a snippet's text, keyed by the snippet's id."""

    # Identifier of the snippet this request refers to.
    snippet_id: int
    # Text payload of the request.
    text: str
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@dataclass
class EnrichmentResponse:
    """Legacy response model: the text produced for a snippet, keyed by its id."""

    # Identifier of the snippet this response refers to.
    snippet_id: int
    # Text payload of the response (the null service always sends "").
    text: str
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class EnrichmentService(ABC):
    """Abstract base class of the legacy enrichment service."""

    @abstractmethod
    def enrich(
        self, data: list[EnrichmentRequest]
    ) -> AsyncGenerator[EnrichmentResponse, None]:
        """Enrich a list of requests.

        Args:
            data: The enrichment requests to process.

        Returns:
            An async generator of enrichment responses.

        """
        ...
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class NullEnrichmentService(EnrichmentService):
    """Enrichment service for testing that produces no enrichment text."""

    async def enrich(
        self, data: list[EnrichmentRequest]
    ) -> AsyncGenerator[EnrichmentResponse, None]:
        """Yield one empty-text response per request, in input order.

        Args:
            data: The enrichment requests to answer.

        Yields:
            A response carrying each request's ``snippet_id`` and ``text=""``.

        """
        blank = ""
        for item in data:
            yield EnrichmentResponse(text=blank, snippet_id=item.snippet_id)
|
kodit/{enrichment/enrichment_provider → infrastructure/enrichment}/local_enrichment_provider.py
RENAMED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
"""Local
|
|
1
|
+
"""Local enrichment provider implementation."""
|
|
2
2
|
|
|
3
3
|
import os
|
|
4
4
|
from collections.abc import AsyncGenerator
|
|
@@ -6,29 +6,33 @@ from collections.abc import AsyncGenerator
|
|
|
6
6
|
import structlog
|
|
7
7
|
import tiktoken
|
|
8
8
|
|
|
9
|
-
from kodit.
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
EnrichmentResponse,
|
|
17
|
-
)
|
|
9
|
+
from kodit.domain.services.enrichment_service import EnrichmentProvider
|
|
10
|
+
from kodit.domain.value_objects import EnrichmentRequest, EnrichmentResponse
|
|
11
|
+
|
|
12
|
+
# System-role message placed ahead of each snippet in the chat template.
ENRICHMENT_SYSTEM_PROMPT = (
    "\n"
    "You are a professional software developer. "
    "You will be given a snippet of code.\n"
    "Please provide a concise explanation of the code.\n"
)

# Model used when the provider is constructed without an explicit model_name.
DEFAULT_ENRICHMENT_MODEL = "Qwen/Qwen3-0.6B"
# Small so it works even on low-powered devices
DEFAULT_CONTEXT_WINDOW_SIZE = 2048
|
|
21
19
|
|
|
22
20
|
|
|
23
21
|
class LocalEnrichmentProvider(EnrichmentProvider):
|
|
24
|
-
"""Local
|
|
22
|
+
"""Local enrichment provider implementation."""
|
|
25
23
|
|
|
26
24
|
def __init__(
|
|
27
25
|
self,
|
|
28
26
|
model_name: str = DEFAULT_ENRICHMENT_MODEL,
|
|
29
27
|
context_window: int = DEFAULT_CONTEXT_WINDOW_SIZE,
|
|
30
28
|
) -> None:
|
|
31
|
-
"""Initialize the local enrichment provider.
|
|
29
|
+
"""Initialize the local enrichment provider.
|
|
30
|
+
|
|
31
|
+
Args:
|
|
32
|
+
model_name: The model name to use for enrichment.
|
|
33
|
+
context_window: The context window size for the model.
|
|
34
|
+
|
|
35
|
+
"""
|
|
32
36
|
self.log = structlog.get_logger(__name__)
|
|
33
37
|
self.model_name = model_name
|
|
34
38
|
self.context_window = context_window
|
|
@@ -37,14 +41,22 @@ class LocalEnrichmentProvider(EnrichmentProvider):
|
|
|
37
41
|
self.encoding = tiktoken.encoding_for_model("text-embedding-3-small")
|
|
38
42
|
|
|
39
43
|
async def enrich(
|
|
40
|
-
self,
|
|
44
|
+
self, requests: list[EnrichmentRequest]
|
|
41
45
|
) -> AsyncGenerator[EnrichmentResponse, None]:
|
|
42
|
-
"""Enrich a list of
|
|
46
|
+
"""Enrich a list of requests using local model.
|
|
47
|
+
|
|
48
|
+
Args:
|
|
49
|
+
requests: List of enrichment requests.
|
|
50
|
+
|
|
51
|
+
Yields:
|
|
52
|
+
Enrichment responses as they are processed.
|
|
53
|
+
|
|
54
|
+
"""
|
|
43
55
|
# Remove empty snippets
|
|
44
|
-
|
|
56
|
+
requests = [req for req in requests if req.text]
|
|
45
57
|
|
|
46
|
-
if not
|
|
47
|
-
self.log.warning("
|
|
58
|
+
if not requests:
|
|
59
|
+
self.log.warning("No valid requests for enrichment")
|
|
48
60
|
return
|
|
49
61
|
|
|
50
62
|
from transformers.models.auto.modeling_auto import (
|
|
@@ -66,25 +78,25 @@ class LocalEnrichmentProvider(EnrichmentProvider):
|
|
|
66
78
|
)
|
|
67
79
|
|
|
68
80
|
# Prepare prompts
|
|
69
|
-
prompts
|
|
70
|
-
|
|
71
|
-
id
|
|
72
|
-
text
|
|
81
|
+
prompts = [
|
|
82
|
+
{
|
|
83
|
+
"id": req.snippet_id,
|
|
84
|
+
"text": self.tokenizer.apply_chat_template(
|
|
73
85
|
[
|
|
74
86
|
{"role": "system", "content": ENRICHMENT_SYSTEM_PROMPT},
|
|
75
|
-
{"role": "user", "content":
|
|
87
|
+
{"role": "user", "content": req.text},
|
|
76
88
|
],
|
|
77
89
|
tokenize=False,
|
|
78
90
|
add_generation_prompt=True,
|
|
79
91
|
enable_thinking=False,
|
|
80
92
|
),
|
|
81
|
-
|
|
82
|
-
for
|
|
93
|
+
}
|
|
94
|
+
for req in requests
|
|
83
95
|
]
|
|
84
96
|
|
|
85
97
|
for prompt in prompts:
|
|
86
98
|
model_inputs = self.tokenizer(
|
|
87
|
-
prompt
|
|
99
|
+
prompt["text"],
|
|
88
100
|
return_tensors="pt",
|
|
89
101
|
padding=True,
|
|
90
102
|
truncation=True,
|
|
@@ -98,6 +110,6 @@ class LocalEnrichmentProvider(EnrichmentProvider):
|
|
|
98
110
|
"\n"
|
|
99
111
|
)
|
|
100
112
|
yield EnrichmentResponse(
|
|
101
|
-
snippet_id=prompt
|
|
113
|
+
snippet_id=prompt["id"],
|
|
102
114
|
text=content,
|
|
103
115
|
)
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
"""Null enrichment provider for testing."""
|
|
2
|
+
|
|
3
|
+
from collections.abc import AsyncGenerator
|
|
4
|
+
|
|
5
|
+
from kodit.domain.services.enrichment_service import EnrichmentProvider
|
|
6
|
+
from kodit.domain.value_objects import EnrichmentRequest, EnrichmentResponse
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class NullEnrichmentProvider(EnrichmentProvider):
    """Enrichment provider that answers every request with empty text."""

    async def enrich(
        self, requests: list[EnrichmentRequest]
    ) -> AsyncGenerator[EnrichmentResponse, None]:
        """Yield one empty-text response per request, in input order.

        Args:
            requests: List of enrichment requests.

        Yields:
            A response carrying each request's ``snippet_id`` and ``text=""``.

        """
        blank = ""
        for item in requests:
            yield EnrichmentResponse(text=blank, snippet_id=item.snippet_id)
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
"""OpenAI enrichment provider implementation."""
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
from collections.abc import AsyncGenerator
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
import structlog
|
|
8
|
+
|
|
9
|
+
from kodit.domain.services.enrichment_service import EnrichmentProvider
|
|
10
|
+
from kodit.domain.value_objects import EnrichmentRequest, EnrichmentResponse
|
|
11
|
+
|
|
12
|
+
# System-role message sent with every chat completion request.
ENRICHMENT_SYSTEM_PROMPT = (
    "\n"
    "You are a professional software developer. "
    "You will be given a snippet of code.\n"
    "Please provide a concise explanation of the code.\n"
)

# Cap on concurrent requests; the default is tuned to approximately fit
# within OpenAI's rate limit of 500 requests per minute.
OPENAI_NUM_PARALLEL_TASKS = 40
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class OpenAIEnrichmentProvider(EnrichmentProvider):
    """OpenAI enrichment provider implementation."""

    def __init__(self, openai_client: Any, model_name: str = "gpt-4o-mini") -> None:
        """Initialize the OpenAI enrichment provider.

        Args:
            openai_client: The OpenAI client instance. Its
                ``chat.completions.create`` call is awaited, so it must be an
                asynchronous client.
            model_name: The model name to use for enrichment.

        """
        self.log = structlog.get_logger(__name__)
        self.openai_client = openai_client
        self.model_name = model_name

    async def enrich(
        self, requests: list[EnrichmentRequest]
    ) -> AsyncGenerator[EnrichmentResponse, None]:
        """Enrich a list of requests using OpenAI API.

        Requests are sent concurrently, at most ``OPENAI_NUM_PARALLEL_TASKS``
        at a time, and responses are yielded in completion order rather than
        input order. A request with empty text, or one whose API call raises,
        produces a response with empty text instead of an error.

        If the consumer stops iterating early (or the generator is cancelled),
        requests that have not finished yet are cancelled rather than left
        running in the background.

        Args:
            requests: List of enrichment requests.

        Yields:
            Enrichment responses as they are processed.

        """
        if not requests:
            self.log.warning("No requests for enrichment")
            return

        # Limit the number of API calls that are in flight at the same time.
        sem = asyncio.Semaphore(OPENAI_NUM_PARALLEL_TASKS)

        async def process_request(request: EnrichmentRequest) -> EnrichmentResponse:
            # Nothing to send for an empty snippet, so do not occupy a slot.
            if not request.text:
                return EnrichmentResponse(
                    snippet_id=request.snippet_id,
                    text="",
                )
            async with sem:
                try:
                    response = await self.openai_client.chat.completions.create(
                        model=self.model_name,
                        messages=[
                            {
                                "role": "system",
                                "content": ENRICHMENT_SYSTEM_PROMPT,
                            },
                            {"role": "user", "content": request.text},
                        ],
                    )
                    return EnrichmentResponse(
                        snippet_id=request.snippet_id,
                        text=response.choices[0].message.content or "",
                    )
                except Exception as e:
                    # Best effort: one failing snippet must not abort the batch.
                    self.log.exception("Error enriching request", error=str(e))
                    return EnrichmentResponse(
                        snippet_id=request.snippet_id,
                        text="",
                    )

        # Own the tasks explicitly so they can be cleaned up on early exit.
        tasks = [asyncio.create_task(process_request(request)) for request in requests]

        try:
            # Process all requests and yield results as they complete
            for next_done in asyncio.as_completed(tasks):
                yield await next_done
        finally:
            # Reached on normal exhaustion (every task is done, so this is a
            # no-op) and when the generator is closed or cancelled mid-way.
            for task in tasks:
                if not task.done():
                    task.cancel()
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Git infrastructure module."""
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
"""Git utilities."""
|
|
1
|
+
"""Git utilities for infrastructure operations."""
|
|
2
2
|
|
|
3
3
|
import tempfile
|
|
4
4
|
|
|
@@ -6,7 +6,15 @@ import git
|
|
|
6
6
|
|
|
7
7
|
|
|
8
8
|
def is_valid_clone_target(target: str) -> bool:
|
|
9
|
-
"""Return True if the target is clonable.
|
|
9
|
+
"""Return True if the target is clonable.
|
|
10
|
+
|
|
11
|
+
Args:
|
|
12
|
+
target: The git repository URL or path to validate.
|
|
13
|
+
|
|
14
|
+
Returns:
|
|
15
|
+
True if the target can be cloned, False otherwise.
|
|
16
|
+
|
|
17
|
+
"""
|
|
10
18
|
with tempfile.TemporaryDirectory() as temp_dir:
|
|
11
19
|
try:
|
|
12
20
|
git.Repo.clone_from(target, temp_dir)
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Ignore infrastructure module."""
|
|
@@ -1,18 +1,27 @@
|
|
|
1
|
-
"""
|
|
1
|
+
"""Infrastructure implementation of ignore pattern provider."""
|
|
2
2
|
|
|
3
3
|
from pathlib import Path
|
|
4
4
|
|
|
5
5
|
import git
|
|
6
6
|
import pathspec
|
|
7
7
|
|
|
8
|
-
from kodit.
|
|
8
|
+
from kodit.domain.services.ignore_service import IgnorePatternProvider
|
|
9
|
+
from kodit.infrastructure.git.git_utils import is_valid_clone_target
|
|
9
10
|
|
|
10
11
|
|
|
11
|
-
class
|
|
12
|
-
"""Ignore
|
|
12
|
+
class GitIgnorePatternProvider(IgnorePatternProvider):
|
|
13
|
+
"""Ignore pattern provider for git repositories."""
|
|
13
14
|
|
|
14
15
|
def __init__(self, base_dir: Path) -> None:
|
|
15
|
-
"""Initialize the ignore
|
|
16
|
+
"""Initialize the ignore pattern provider.
|
|
17
|
+
|
|
18
|
+
Args:
|
|
19
|
+
base_dir: The base directory to check for ignore patterns.
|
|
20
|
+
|
|
21
|
+
Raises:
|
|
22
|
+
ValueError: If the base directory is not a directory.
|
|
23
|
+
|
|
24
|
+
"""
|
|
16
25
|
if not base_dir.is_dir():
|
|
17
26
|
msg = f"Base directory is not a directory: {base_dir}"
|
|
18
27
|
raise ValueError(msg)
|
|
@@ -25,7 +34,15 @@ class IgnorePatterns:
|
|
|
25
34
|
self.git_repo = git.Repo(base_dir)
|
|
26
35
|
|
|
27
36
|
def should_ignore(self, path: Path) -> bool:
|
|
28
|
-
"""Check if a path should be ignored.
|
|
37
|
+
"""Check if a path should be ignored.
|
|
38
|
+
|
|
39
|
+
Args:
|
|
40
|
+
path: The path to check.
|
|
41
|
+
|
|
42
|
+
Returns:
|
|
43
|
+
True if the path should be ignored, False otherwise.
|
|
44
|
+
|
|
45
|
+
"""
|
|
29
46
|
if path.is_dir():
|
|
30
47
|
return False
|
|
31
48
|
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Infrastructure indexing module."""
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
"""Infrastructure implementation of the fusion service."""
|
|
2
|
+
|
|
3
|
+
from collections import defaultdict
|
|
4
|
+
|
|
5
|
+
from kodit.domain.services.indexing_service import FusionService
|
|
6
|
+
from kodit.domain.value_objects import FusionRequest, FusionResult
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class ReciprocalRankFusionService(FusionService):
    """Fusion service that merges several rankings by reciprocal rank fusion."""

    def reciprocal_rank_fusion(
        self, rankings: list[list[FusionRequest]], k: float = 60
    ) -> list[FusionResult]:
        """Fuse several ranked result lists into one list ordered by RRF score.

        Each entry adds ``1 / (k + position)`` to the fused score of its id,
        where ``position`` is the entry's 0-based index within its ranking.

        Args:
            rankings: One list per ranker, best result first. Each entry
                provides an ``id`` and the ``score`` its ranker gave it.
            k: Parameter for RRF; added to the position in the denominator.

        Returns:
            One result per distinct id, sorted by fused score in descending
            order. Each result also carries the scores the id received from
            the individual rankers, in ranker order.

        """
        fused = defaultdict(float)
        per_ranker_scores = defaultdict(list)

        # Single sweep: accumulate the fused score and remember each ranker's
        # own score for the id.
        for ranking in rankings:
            for position, entry in enumerate(ranking):
                fused[entry.id] += 1.0 / (k + position)
                per_ranker_scores[entry.id].append(entry.score)

        # Highest fused score first; ties keep first-seen order (stable sort).
        ordered = sorted(fused.items(), key=lambda pair: pair[1], reverse=True)

        return [
            FusionResult(
                id=doc_id,
                score=total,
                original_scores=per_ranker_scores[doc_id],
            )
            for doc_id, total in ordered
        ]
|