kiln-ai 0.20.1__py3-none-any.whl → 0.22.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of kiln-ai might be problematic.
- kiln_ai/adapters/__init__.py +6 -0
- kiln_ai/adapters/adapter_registry.py +43 -226
- kiln_ai/adapters/chunkers/__init__.py +13 -0
- kiln_ai/adapters/chunkers/base_chunker.py +42 -0
- kiln_ai/adapters/chunkers/chunker_registry.py +16 -0
- kiln_ai/adapters/chunkers/fixed_window_chunker.py +39 -0
- kiln_ai/adapters/chunkers/helpers.py +23 -0
- kiln_ai/adapters/chunkers/test_base_chunker.py +63 -0
- kiln_ai/adapters/chunkers/test_chunker_registry.py +28 -0
- kiln_ai/adapters/chunkers/test_fixed_window_chunker.py +346 -0
- kiln_ai/adapters/chunkers/test_helpers.py +75 -0
- kiln_ai/adapters/data_gen/test_data_gen_task.py +9 -3
- kiln_ai/adapters/embedding/__init__.py +0 -0
- kiln_ai/adapters/embedding/base_embedding_adapter.py +44 -0
- kiln_ai/adapters/embedding/embedding_registry.py +32 -0
- kiln_ai/adapters/embedding/litellm_embedding_adapter.py +199 -0
- kiln_ai/adapters/embedding/test_base_embedding_adapter.py +283 -0
- kiln_ai/adapters/embedding/test_embedding_registry.py +166 -0
- kiln_ai/adapters/embedding/test_litellm_embedding_adapter.py +1149 -0
- kiln_ai/adapters/eval/eval_runner.py +6 -2
- kiln_ai/adapters/eval/test_base_eval.py +1 -3
- kiln_ai/adapters/eval/test_g_eval.py +1 -1
- kiln_ai/adapters/extractors/__init__.py +18 -0
- kiln_ai/adapters/extractors/base_extractor.py +72 -0
- kiln_ai/adapters/extractors/encoding.py +20 -0
- kiln_ai/adapters/extractors/extractor_registry.py +44 -0
- kiln_ai/adapters/extractors/extractor_runner.py +112 -0
- kiln_ai/adapters/extractors/litellm_extractor.py +406 -0
- kiln_ai/adapters/extractors/test_base_extractor.py +244 -0
- kiln_ai/adapters/extractors/test_encoding.py +54 -0
- kiln_ai/adapters/extractors/test_extractor_registry.py +181 -0
- kiln_ai/adapters/extractors/test_extractor_runner.py +181 -0
- kiln_ai/adapters/extractors/test_litellm_extractor.py +1290 -0
- kiln_ai/adapters/fine_tune/test_dataset_formatter.py +2 -2
- kiln_ai/adapters/fine_tune/test_fireworks_finetune.py +2 -6
- kiln_ai/adapters/fine_tune/test_together_finetune.py +2 -6
- kiln_ai/adapters/ml_embedding_model_list.py +494 -0
- kiln_ai/adapters/ml_model_list.py +876 -18
- kiln_ai/adapters/model_adapters/litellm_adapter.py +40 -75
- kiln_ai/adapters/model_adapters/test_litellm_adapter.py +79 -1
- kiln_ai/adapters/model_adapters/test_litellm_adapter_tools.py +119 -5
- kiln_ai/adapters/model_adapters/test_saving_adapter_results.py +9 -3
- kiln_ai/adapters/model_adapters/test_structured_output.py +9 -10
- kiln_ai/adapters/ollama_tools.py +69 -12
- kiln_ai/adapters/provider_tools.py +190 -46
- kiln_ai/adapters/rag/deduplication.py +49 -0
- kiln_ai/adapters/rag/progress.py +252 -0
- kiln_ai/adapters/rag/rag_runners.py +844 -0
- kiln_ai/adapters/rag/test_deduplication.py +195 -0
- kiln_ai/adapters/rag/test_progress.py +785 -0
- kiln_ai/adapters/rag/test_rag_runners.py +2376 -0
- kiln_ai/adapters/remote_config.py +80 -8
- kiln_ai/adapters/test_adapter_registry.py +579 -86
- kiln_ai/adapters/test_ml_embedding_model_list.py +239 -0
- kiln_ai/adapters/test_ml_model_list.py +202 -0
- kiln_ai/adapters/test_ollama_tools.py +340 -1
- kiln_ai/adapters/test_prompt_builders.py +1 -1
- kiln_ai/adapters/test_provider_tools.py +199 -8
- kiln_ai/adapters/test_remote_config.py +551 -56
- kiln_ai/adapters/vector_store/__init__.py +1 -0
- kiln_ai/adapters/vector_store/base_vector_store_adapter.py +83 -0
- kiln_ai/adapters/vector_store/lancedb_adapter.py +389 -0
- kiln_ai/adapters/vector_store/test_base_vector_store.py +160 -0
- kiln_ai/adapters/vector_store/test_lancedb_adapter.py +1841 -0
- kiln_ai/adapters/vector_store/test_vector_store_registry.py +199 -0
- kiln_ai/adapters/vector_store/vector_store_registry.py +33 -0
- kiln_ai/datamodel/__init__.py +16 -13
- kiln_ai/datamodel/basemodel.py +201 -4
- kiln_ai/datamodel/chunk.py +158 -0
- kiln_ai/datamodel/datamodel_enums.py +27 -0
- kiln_ai/datamodel/embedding.py +64 -0
- kiln_ai/datamodel/external_tool_server.py +206 -54
- kiln_ai/datamodel/extraction.py +317 -0
- kiln_ai/datamodel/project.py +33 -1
- kiln_ai/datamodel/rag.py +79 -0
- kiln_ai/datamodel/task.py +5 -0
- kiln_ai/datamodel/task_output.py +41 -11
- kiln_ai/datamodel/test_attachment.py +649 -0
- kiln_ai/datamodel/test_basemodel.py +270 -14
- kiln_ai/datamodel/test_chunk_models.py +317 -0
- kiln_ai/datamodel/test_dataset_split.py +1 -1
- kiln_ai/datamodel/test_datasource.py +50 -0
- kiln_ai/datamodel/test_embedding_models.py +448 -0
- kiln_ai/datamodel/test_eval_model.py +6 -6
- kiln_ai/datamodel/test_external_tool_server.py +534 -152
- kiln_ai/datamodel/test_extraction_chunk.py +206 -0
- kiln_ai/datamodel/test_extraction_model.py +501 -0
- kiln_ai/datamodel/test_rag.py +641 -0
- kiln_ai/datamodel/test_task.py +35 -1
- kiln_ai/datamodel/test_tool_id.py +187 -1
- kiln_ai/datamodel/test_vector_store.py +320 -0
- kiln_ai/datamodel/tool_id.py +58 -0
- kiln_ai/datamodel/vector_store.py +141 -0
- kiln_ai/tools/base_tool.py +12 -3
- kiln_ai/tools/built_in_tools/math_tools.py +12 -4
- kiln_ai/tools/kiln_task_tool.py +158 -0
- kiln_ai/tools/mcp_server_tool.py +2 -2
- kiln_ai/tools/mcp_session_manager.py +51 -22
- kiln_ai/tools/rag_tools.py +164 -0
- kiln_ai/tools/test_kiln_task_tool.py +527 -0
- kiln_ai/tools/test_mcp_server_tool.py +4 -15
- kiln_ai/tools/test_mcp_session_manager.py +187 -227
- kiln_ai/tools/test_rag_tools.py +929 -0
- kiln_ai/tools/test_tool_registry.py +290 -7
- kiln_ai/tools/tool_registry.py +69 -16
- kiln_ai/utils/__init__.py +3 -0
- kiln_ai/utils/async_job_runner.py +62 -17
- kiln_ai/utils/config.py +2 -2
- kiln_ai/utils/env.py +15 -0
- kiln_ai/utils/filesystem.py +14 -0
- kiln_ai/utils/filesystem_cache.py +60 -0
- kiln_ai/utils/litellm.py +94 -0
- kiln_ai/utils/lock.py +100 -0
- kiln_ai/utils/mime_type.py +38 -0
- kiln_ai/utils/open_ai_types.py +19 -2
- kiln_ai/utils/pdf_utils.py +59 -0
- kiln_ai/utils/test_async_job_runner.py +151 -35
- kiln_ai/utils/test_env.py +142 -0
- kiln_ai/utils/test_filesystem_cache.py +316 -0
- kiln_ai/utils/test_litellm.py +206 -0
- kiln_ai/utils/test_lock.py +185 -0
- kiln_ai/utils/test_mime_type.py +66 -0
- kiln_ai/utils/test_open_ai_types.py +88 -12
- kiln_ai/utils/test_pdf_utils.py +86 -0
- kiln_ai/utils/test_uuid.py +111 -0
- kiln_ai/utils/test_validation.py +524 -0
- kiln_ai/utils/uuid.py +9 -0
- kiln_ai/utils/validation.py +90 -0
- {kiln_ai-0.20.1.dist-info → kiln_ai-0.22.0.dist-info}/METADATA +9 -1
- kiln_ai-0.22.0.dist-info/RECORD +213 -0
- kiln_ai-0.20.1.dist-info/RECORD +0 -138
- {kiln_ai-0.20.1.dist-info → kiln_ai-0.22.0.dist-info}/WHEEL +0 -0
- {kiln_ai-0.20.1.dist-info → kiln_ai-0.22.0.dist-info}/licenses/LICENSE.txt +0 -0
kiln_ai/adapters/rag/test_progress.py
@@ -0,0 +1,785 @@
from unittest.mock import AsyncMock, MagicMock, patch

import pytest
from kiln_ai.adapters.rag.progress import (
    LogMessage,
    RagProgress,
    compute_current_progress_for_rag_config,
    compute_current_progress_for_rag_configs,
    count_records_in_vector_store,
    count_records_in_vector_store_for_rag_config,
)
from kiln_ai.datamodel.chunk import ChunkedDocument
from kiln_ai.datamodel.embedding import ChunkEmbeddings
from kiln_ai.datamodel.extraction import Document, Extraction
from kiln_ai.datamodel.project import Project
from kiln_ai.datamodel.rag import RagConfig


@pytest.fixture
def mock_project(tmp_path):
    project_path = tmp_path / "test_project" / "project.kiln"
    project_path.parent.mkdir()

    project = Project(name="Test Project", path=project_path)
    project.save_to_file()

    return project


@pytest.fixture
def mock_project_magic():
    """This mock is more flexible than the mock_project fixture. Can mock the base model methods easily"""
    return MagicMock(spec=Project)


@pytest.fixture
def mock_vector_store_count():
    """Mock the vector store count operations to return 0 by default"""
    with patch(
        "kiln_ai.adapters.rag.progress.count_records_in_vector_store_for_rag_config",
        new_callable=AsyncMock,
        return_value=0,
    ) as mock:
        yield mock


def create_mock_embedding(embedding_config_id):
    """Helper to create a mock embedding with the specified config ID"""
    mock_embedding = MagicMock(spec=ChunkEmbeddings)
    mock_embedding.embedding_config_id = embedding_config_id
    mock_embedding.created_at = "2024-01-01T00:00:00Z"
    return mock_embedding


def create_mock_chunked_document(chunker_config_id, embeddings=None, num_chunks=1):
    """Helper to create a mock chunked document with the specified config ID and embeddings"""
    if embeddings is None:
        embeddings = []

    mock_chunked_doc = MagicMock(spec=ChunkedDocument)
    mock_chunked_doc.chunker_config_id = chunker_config_id
    mock_chunked_doc.chunk_embeddings.return_value = embeddings
    mock_chunked_doc.created_at = "2024-01-01T00:00:00Z"
    # Mock the chunks attribute to return a list with the specified number of chunks
    mock_chunked_doc.chunks = [MagicMock() for _ in range(num_chunks)]
    return mock_chunked_doc


def create_mock_extraction(extractor_config_id, chunked_documents=None):
    """Helper to create a mock extraction with the specified config ID and chunked documents"""
    if chunked_documents is None:
        chunked_documents = []

    mock_extraction = MagicMock(spec=Extraction)
    mock_extraction.extractor_config_id = extractor_config_id
    mock_extraction.chunked_documents.return_value = chunked_documents
    mock_extraction.created_at = "2024-01-01T00:00:00Z"
    return mock_extraction


def create_mock_document(extractions=None, tags=None):
    """Helper to create a mock document with the specified extractions"""
    if extractions is None:
        extractions = []

    mock_document = MagicMock(spec=Document)
    mock_document.extractions.return_value = extractions
    mock_document.tags = tags
    return mock_document


def create_mock_rag_config(
    config_id,
    extractor_config_id,
    chunker_config_id,
    embedding_config_id,
    vector_store_config_id="vector_store_1",
    tags=None,
):
    """Helper to create a mock RAG config with the specified IDs"""
    mock_rag_config = MagicMock(spec=RagConfig)
    mock_rag_config.id = config_id
    mock_rag_config.extractor_config_id = extractor_config_id
    mock_rag_config.chunker_config_id = chunker_config_id
    mock_rag_config.embedding_config_id = embedding_config_id
    mock_rag_config.vector_store_config_id = vector_store_config_id
    mock_rag_config.tags = tags
    return mock_rag_config


class TestLogMessage:
    def test_log_message_creation(self):
        log = LogMessage(level="info", message="Test message")
        assert log.level == "info"
        assert log.message == "Test message"

    def test_log_message_validation(self):
        # Test valid levels
        for level in ["info", "error", "warning"]:
            log = LogMessage(level=level, message="Test")  # type: ignore
            assert log.level == level


class TestRagProgress:
    def test_rag_progress_default_values(self):
        progress = RagProgress()
        assert progress.total_document_count == 0
        assert progress.total_document_completed_count == 0
        assert progress.total_chunk_count == 0
        assert progress.total_chunk_completed_count == 0
        assert progress.total_document_extracted_count == 0
        assert progress.total_document_extracted_error_count == 0
        assert progress.total_document_chunked_count == 0
        assert progress.total_document_chunked_error_count == 0
        assert progress.total_document_embedded_count == 0
        assert progress.total_document_embedded_error_count == 0
        assert progress.total_chunks_indexed_count == 0
        assert progress.total_chunks_indexed_error_count == 0
        assert progress.logs is None

    def test_rag_progress_with_values(self):
        logs = [LogMessage(level="info", message="Processing")]
        progress = RagProgress(
            total_document_count=10,
            total_document_completed_count=5,
            total_document_extracted_count=8,
            total_document_chunked_count=6,
            total_document_embedded_count=5,
            total_chunk_count=6,
            total_chunk_completed_count=3,
            total_chunks_indexed_count=3,
            logs=logs,
        )
        assert progress.total_document_count == 10
        assert progress.total_document_completed_count == 5
        assert progress.total_document_extracted_count == 8
        assert progress.total_document_chunked_count == 6
        assert progress.total_document_embedded_count == 5
        assert progress.total_chunk_count == 6
        assert progress.total_chunk_completed_count == 3
        assert progress.total_chunks_indexed_count == 3
        assert progress.logs is not None
        assert len(progress.logs) == 1
        assert progress.logs[0].level == "info"


class TestComputeCurrentProgressForRagConfigs:
    @pytest.mark.asyncio
    async def test_empty_project_empty_configs(
        self, mock_project_magic, mock_vector_store_count
    ):
        """Test with no documents and no RAG configs"""
        mock_project_magic.documents.return_value = []

        result = await compute_current_progress_for_rag_configs(mock_project_magic, [])
        assert result == {}

    @pytest.mark.asyncio
    async def test_empty_project_with_config(
        self, mock_project_magic, mock_vector_store_count
    ):
        """Test with no documents but with a RAG config"""
        rag_config = create_mock_rag_config("rag1", "ext1", "chunk1", "embed1")
        mock_project_magic.documents.return_value = []

        result = await compute_current_progress_for_rag_configs(
            mock_project_magic, [rag_config]
        )

        assert "rag1" in result
        progress = result["rag1"]
        assert progress.total_document_count == 0
        assert progress.total_document_completed_count == 0
        assert progress.total_document_extracted_count == 0
        assert progress.total_document_chunked_count == 0
        assert progress.total_document_embedded_count == 0
        assert progress.total_chunks_indexed_count == 0
        assert progress.total_chunk_count == 0
        assert progress.total_chunk_completed_count == 0

    @pytest.mark.asyncio
    async def test_documents_no_extractions(
        self, mock_project_magic, mock_vector_store_count
    ):
        """Test with documents but no extractions"""
        documents = [create_mock_document() for _ in range(3)]
        rag_config = create_mock_rag_config("rag1", "ext1", "chunk1", "embed1")
        mock_project_magic.documents.return_value = documents

        result = await compute_current_progress_for_rag_configs(
            mock_project_magic, [rag_config]
        )

        assert "rag1" in result
        progress = result["rag1"]
        assert progress.total_document_count == 3
        assert progress.total_document_completed_count == 0
        assert progress.total_document_extracted_count == 0
        assert progress.total_document_chunked_count == 0
        assert progress.total_document_embedded_count == 0
        assert progress.total_chunks_indexed_count == 0

    @pytest.mark.asyncio
    async def test_full_pipeline_single_config(
        self, mock_project_magic, mock_vector_store_count
    ):
        """Test complete pipeline with one RAG config"""
        # Create documents with separate extraction trees
        documents = []
        for i in range(2):
            # Each document gets its own unique extraction tree
            embedding = create_mock_embedding("embed1")
            chunked_doc = create_mock_chunked_document(
                "chunk1", [embedding], num_chunks=3
            )  # 3 chunks per document
            extraction = create_mock_extraction("ext1", [chunked_doc])
            document = create_mock_document([extraction])
            documents.append(document)

        rag_config = create_mock_rag_config("rag1", "ext1", "chunk1", "embed1")

        mock_project_magic.documents.return_value = documents
        result = await compute_current_progress_for_rag_configs(
            mock_project_magic, [rag_config]
        )

        assert "rag1" in result
        progress = result["rag1"]
        assert progress.total_document_count == 2
        assert (
            progress.total_document_completed_count == 2
        )  # min of extraction, chunking, embedding (all complete)
        assert progress.total_document_extracted_count == 2
        assert progress.total_document_chunked_count == 2
        assert progress.total_document_embedded_count == 2
        assert progress.total_chunks_indexed_count == 0
        assert progress.total_chunk_count == 6  # 2 documents * 3 chunks each
        assert progress.total_chunk_completed_count == 0  # same as indexed count

    @pytest.mark.asyncio
    async def test_partial_pipeline_progress(
        self, mock_project_magic, mock_vector_store_count
    ):
        """Test pipeline where some steps are incomplete"""
        # Document 1: fully processed
        embedding1 = create_mock_embedding("embed1")
        chunked_doc1 = create_mock_chunked_document(
            "chunk1", [embedding1], num_chunks=2
        )
        extraction1 = create_mock_extraction("ext1", [chunked_doc1])
        doc1 = create_mock_document([extraction1])

        # Document 2: extracted and chunked but not embedded
        chunked_doc2 = create_mock_chunked_document(
            "chunk1", [], num_chunks=3
        )  # no embeddings
        extraction2 = create_mock_extraction("ext1", [chunked_doc2])
        doc2 = create_mock_document([extraction2])

        # Document 3: extracted but not chunked
        extraction3 = create_mock_extraction("ext1", [])  # no chunked docs
        doc3 = create_mock_document([extraction3])

        # Document 4: not extracted
        doc4 = create_mock_document([])  # no extractions

        rag_config = create_mock_rag_config("rag1", "ext1", "chunk1", "embed1")

        mock_project_magic.documents.return_value = [doc1, doc2, doc3, doc4]
        result = await compute_current_progress_for_rag_configs(
            mock_project_magic, [rag_config]
        )

        assert "rag1" in result
        progress = result["rag1"]
        assert progress.total_document_count == 4
        assert progress.total_document_extracted_count == 3  # docs 1, 2, 3
        assert progress.total_document_chunked_count == 2  # docs 1, 2
        assert progress.total_document_embedded_count == 1  # doc 1 only
        assert progress.total_chunks_indexed_count == 0  # no indexing implemented yet
        assert progress.total_chunk_count == 5  # doc1 has 2 chunks + doc2 has 3 chunks
        assert progress.total_chunk_completed_count == 0  # same as indexed count
        assert progress.total_document_completed_count == 1  # min(3,2,1) = 1

    @pytest.mark.asyncio
    async def test_multiple_rag_configs_shared_prefixes(
        self, mock_project_magic, mock_vector_store_count
    ):
        """Test multiple RAG configs that share common path prefixes"""
        # Create data that matches multiple configs
        embedding1 = create_mock_embedding("embed1")
        embedding2 = create_mock_embedding("embed2")

        chunked_doc = create_mock_chunked_document(
            "chunk1", [embedding1, embedding2], num_chunks=4
        )
        extraction = create_mock_extraction("ext1", [chunked_doc])
        document = create_mock_document([extraction])

        # Two configs that share extractor and chunker but differ in embedding
        rag_config1 = create_mock_rag_config("rag1", "ext1", "chunk1", "embed1")
        rag_config2 = create_mock_rag_config("rag2", "ext1", "chunk1", "embed2")

        mock_project_magic.documents.return_value = [document]
        result = await compute_current_progress_for_rag_configs(
            mock_project_magic, [rag_config1, rag_config2]
        )

        # Both configs should have same extraction and chunking counts
        assert "rag1" in result
        assert "rag2" in result

        for config_id in ["rag1", "rag2"]:
            progress = result[config_id]
            assert progress.total_document_count == 1
            assert progress.total_document_extracted_count == 1
            assert progress.total_document_chunked_count == 1
            assert progress.total_document_embedded_count == 1
            assert (
                progress.total_chunks_indexed_count == 0
            )  # no indexing implemented yet
            assert progress.total_chunk_count == 4  # 4 chunks in the document
            assert progress.total_chunk_completed_count == 0  # same as indexed count
            assert (
                progress.total_document_completed_count == 1
            )  # min of extraction, chunking, embedding

    @pytest.mark.asyncio
    async def test_multiple_rag_configs_different_extractors(
        self, mock_project_magic, mock_vector_store_count
    ):
        """Test multiple RAG configs with different extractors"""
        # Create extractions for different extractors
        embedding = create_mock_embedding("embed1")
        chunked_doc = create_mock_chunked_document("chunk1", [embedding], num_chunks=5)

        extraction1 = create_mock_extraction("ext1", [chunked_doc])
        extraction2 = create_mock_extraction("ext2", [chunked_doc])

        document = create_mock_document([extraction1, extraction2])

        # Two configs with different extractors
        rag_config1 = create_mock_rag_config("rag1", "ext1", "chunk1", "embed1")
        rag_config2 = create_mock_rag_config("rag2", "ext2", "chunk1", "embed1")

        mock_project_magic.documents.return_value = [document]
        result = await compute_current_progress_for_rag_configs(
            mock_project_magic, [rag_config1, rag_config2]
        )

        # Both should show progress since document has extractions for both extractors
        for config_id in ["rag1", "rag2"]:
            assert config_id in result
            progress = result[config_id]
            assert progress.total_document_count == 1
            assert progress.total_document_extracted_count == 1
            assert progress.total_document_chunked_count == 1
            assert progress.total_document_embedded_count == 1
            assert (
                progress.total_chunks_indexed_count == 0
            )  # no indexing implemented yet
            assert progress.total_chunk_count == 5  # 5 chunks in the document
            assert progress.total_chunk_completed_count == 0  # same as indexed count
            assert (
                progress.total_document_completed_count == 1
            )  # min of extraction, chunking, embedding

    @pytest.mark.asyncio
    async def test_complex_tree_structure(
        self, mock_project_magic, mock_vector_store_count
    ):
        """Test a complex tree with multiple documents, extractors, chunkers, and embeddings"""
        # Document 1: ext1 -> chunk1 -> embed1, embed2
        embedding1_1 = create_mock_embedding("embed1")
        embedding1_2 = create_mock_embedding("embed2")
        chunked_doc1_1 = create_mock_chunked_document(
            "chunk1", [embedding1_1, embedding1_2], num_chunks=2
        )
        extraction1_1 = create_mock_extraction("ext1", [chunked_doc1_1])

        # Document 1: ext2 -> chunk2 -> embed1
        embedding1_3 = create_mock_embedding("embed1")
        chunked_doc1_2 = create_mock_chunked_document(
            "chunk2", [embedding1_3], num_chunks=3
        )
        extraction1_2 = create_mock_extraction("ext2", [chunked_doc1_2])

        doc1 = create_mock_document([extraction1_1, extraction1_2])

        # Document 2: ext1 -> chunk1 -> embed1 only
        embedding2_1 = create_mock_embedding("embed1")
        chunked_doc2_1 = create_mock_chunked_document(
            "chunk1", [embedding2_1], num_chunks=4
        )
        extraction2_1 = create_mock_extraction("ext1", [chunked_doc2_1])
        doc2 = create_mock_document([extraction2_1])

        # Test various RAG config combinations
        configs = [
            create_mock_rag_config(
                "rag1", "ext1", "chunk1", "embed1"
            ),  # Should match both docs
            create_mock_rag_config(
                "rag2", "ext1", "chunk1", "embed2"
            ),  # Should match doc1 only
            create_mock_rag_config(
                "rag3", "ext2", "chunk2", "embed1"
            ),  # Should match doc1 only
        ]

        mock_project_magic.documents.return_value = [doc1, doc2]
        result = await compute_current_progress_for_rag_configs(
            mock_project_magic,
            configs,  # type: ignore
        )

        # rag1: ext1->chunk1->embed1 appears in both documents
        progress1 = result["rag1"]
        assert progress1.total_document_count == 2
        assert progress1.total_document_extracted_count == 2
        assert progress1.total_document_chunked_count == 2
        assert progress1.total_document_embedded_count == 2
        assert progress1.total_chunks_indexed_count == 0  # no indexing implemented yet
        assert progress1.total_chunk_count == 6  # doc1 has 2 chunks + doc2 has 4 chunks
        assert progress1.total_chunk_completed_count == 0  # same as indexed count
        assert (
            progress1.total_document_completed_count == 2
        )  # min of extraction, chunking, embedding

        # rag2: ext1->chunk1->embed2 appears only in doc1
        progress2 = result["rag2"]
        assert progress2.total_document_count == 2
        assert progress2.total_document_extracted_count == 2  # Both docs have ext1
        assert (
            progress2.total_document_chunked_count == 2
        )  # Both docs have ext1->chunk1
        assert (
            progress2.total_document_embedded_count == 1
        )  # Only doc1 has ext1->chunk1->embed2
        assert progress2.total_chunks_indexed_count == 0  # no indexing implemented yet
        assert progress2.total_chunk_count == 6  # doc1 has 2 chunks + doc2 has 4 chunks
        assert progress2.total_chunk_completed_count == 0  # same as indexed count
        assert progress2.total_document_completed_count == 1  # min(2,2,1) = 1

        # rag3: ext2->chunk2->embed1 appears only in doc1
        progress3 = result["rag3"]
        assert progress3.total_document_count == 2
        assert progress3.total_document_extracted_count == 1  # Only doc1 has ext2
        assert progress3.total_document_chunked_count == 1  # Only doc1 has ext2->chunk2
        assert (
            progress3.total_document_embedded_count == 1
        )  # Only doc1 has ext2->chunk2->embed1
        assert progress3.total_chunks_indexed_count == 0  # no indexing implemented yet
        assert progress3.total_chunk_count == 3  # doc1 ext2->chunk2 has 3 chunks
        assert progress3.total_chunk_completed_count == 0  # same as indexed count
        assert progress3.total_document_completed_count == 1  # min(1,1,1) = 1


class TestComputeCurrentProgressForRagConfig:
    @pytest.mark.asyncio
    async def test_single_config_success(
        self, mock_project_magic, mock_vector_store_count
    ):
        """Test computing progress for a single RAG config"""
        embedding = create_mock_embedding("embed1")
        chunked_doc = create_mock_chunked_document("chunk1", [embedding], num_chunks=3)
        extraction = create_mock_extraction("ext1", [chunked_doc])
        document = create_mock_document([extraction])

        rag_config = create_mock_rag_config("rag1", "ext1", "chunk1", "embed1")

        mock_project_magic.documents.return_value = [document]
        result = await compute_current_progress_for_rag_config(
            mock_project_magic, rag_config
        )

        assert isinstance(result, RagProgress)
        assert result.total_document_count == 1
        assert result.total_chunk_count == 3  # 3 chunks in the document
        assert result.total_chunk_completed_count == 0  # same as indexed count
        assert (
            result.total_document_completed_count == 1
        )  # min of extraction, chunking, embedding

    @pytest.mark.asyncio
    async def test_single_config_not_found_error(
        self, mock_project_magic, mock_vector_store_count
    ):
        """Test error case when RAG config is not found in results"""
        # Create a config that won't be found (this shouldn't happen in practice)
        rag_config = create_mock_rag_config("nonexistent", "ext1", "chunk1", "embed1")

        # Mock the underlying function to return empty dict to simulate the error
        with patch(
            "kiln_ai.adapters.rag.progress.compute_current_progress_for_rag_configs",
            new_callable=AsyncMock,
            return_value={},
        ):
            with pytest.raises(
                ValueError,
                match="Failed to compute progress for rag config nonexistent",
            ):
                await compute_current_progress_for_rag_config(
                    mock_project_magic, rag_config
                )


class TestCountRecordsInVectorStore:
    @pytest.mark.asyncio
    async def test_count_records_success(self):
        """Test successful counting of records in vector store"""
        mock_rag_config = MagicMock()
        mock_vector_store_config = MagicMock()
        mock_vector_store = AsyncMock()
        mock_vector_store.count_records.return_value = 42

        with patch(
            "kiln_ai.adapters.rag.progress.vector_store_adapter_for_config",
            new_callable=AsyncMock,
            return_value=mock_vector_store,
        ) as mock_adapter:
            result = await count_records_in_vector_store(
                mock_rag_config, mock_vector_store_config
            )

            assert result == 42
            mock_adapter.assert_called_once_with(
                mock_rag_config, mock_vector_store_config
            )
            mock_vector_store.count_records.assert_called_once()


class TestCountRecordsInVectorStoreForRagConfig:
    @pytest.mark.asyncio
    async def test_count_records_success(self, mock_project):
        """Test successful counting of records for RAG config"""

        mock_rag_config = MagicMock()
        mock_rag_config.id = "rag1"
        mock_rag_config.vector_store_config_id = "vector_store_1"

        mock_vector_store_config = MagicMock()

        with (
            patch(
                "kiln_ai.adapters.rag.progress.VectorStoreConfig.from_id_and_parent_path",
                return_value=mock_vector_store_config,
            ) as mock_from_id,
            patch(
                "kiln_ai.adapters.rag.progress.count_records_in_vector_store",
                new_callable=AsyncMock,
                return_value=25,
            ) as mock_count,
        ):
            result = await count_records_in_vector_store_for_rag_config(
                mock_project, mock_rag_config
            )

            assert result == 25
            mock_from_id.assert_called_once_with("vector_store_1", mock_project.path)
            mock_count.assert_called_once_with(
                mock_rag_config, mock_vector_store_config
            )

    @pytest.mark.asyncio
    async def test_count_records_no_vector_store_config_error(self, mock_project):
        """Test error case when vector store config is None"""

        mock_rag_config = MagicMock()
        mock_rag_config.id = "rag1"
        mock_rag_config.vector_store_config_id = "vector_store_1"

        with patch(
            "kiln_ai.adapters.rag.progress.VectorStoreConfig.from_id_and_parent_path",
            return_value=None,
        ) as mock_from_id:
            with pytest.raises(
                ValueError,
                match="Rag config rag1 has no vector store config",
            ):
                await count_records_in_vector_store_for_rag_config(
                    mock_project, mock_rag_config
                )

            mock_from_id.assert_called_once_with("vector_store_1", mock_project.path)


class TestComputeCurrentProgressForRagConfigsWithTags:
    """Test progress computation with document tag filtering"""

    @pytest.mark.asyncio
    async def test_rag_config_with_matching_tags(
        self, mock_project_magic, mock_vector_store_count
    ):
        """Test RAG config that filters by tags - some documents match"""
        # Create documents with different tags
        doc1 = create_mock_document([], tags=["python", "backend"])
        doc2 = create_mock_document([], tags=["javascript", "frontend"])
        doc3 = create_mock_document([], tags=["python", "ml"])
        doc4 = create_mock_document([], tags=["java", "backend"])

        # RAG config that filters for "python" tag
        rag_config = create_mock_rag_config(
            "rag1", "ext1", "chunk1", "embed1", tags=["python"]
        )

        mock_project_magic.documents.return_value = [doc1, doc2, doc3, doc4]
        result = await compute_current_progress_for_rag_configs(
            mock_project_magic, [rag_config]
        )

        # Should only count doc1 and doc3 (have "python" tag)
        assert len(result) == 1
        assert "rag1" in result
        assert result["rag1"].total_document_count == 2

    @pytest.mark.asyncio
    async def test_rag_config_with_multiple_tags(
        self, mock_project_magic, mock_vector_store_count
    ):
        """Test RAG config with multiple tags (OR logic)"""
        # Create documents with different tags
        doc1 = create_mock_document([], tags=["python", "backend"])
        doc2 = create_mock_document([], tags=["javascript", "frontend"])
        doc3 = create_mock_document([], tags=["rust", "systems"])
        doc4 = create_mock_document([], tags=["go", "backend"])

        # RAG config that filters for "python" OR "javascript"
        rag_config = create_mock_rag_config(
            "rag1", "ext1", "chunk1", "embed1", tags=["python", "javascript"]
        )

        mock_project_magic.documents.return_value = [doc1, doc2, doc3, doc4]
        result = await compute_current_progress_for_rag_configs(
            mock_project_magic, [rag_config]
        )

        # Should count doc1 (python) and doc2 (javascript)
        assert len(result) == 1
        assert "rag1" in result
        assert result["rag1"].total_document_count == 2

    @pytest.mark.asyncio
    async def test_rag_config_with_no_matching_tags(
        self, mock_project_magic, mock_vector_store_count
    ):
        """Test RAG config where no documents match the tags"""
        # Create documents with tags that don't match filter
        doc1 = create_mock_document([], tags=["python", "backend"])
        doc2 = create_mock_document([], tags=["javascript", "frontend"])

        # RAG config that filters for "rust" tag
        rag_config = create_mock_rag_config(
            "rag1", "ext1", "chunk1", "embed1", tags=["rust"]
        )

        mock_project_magic.documents.return_value = [doc1, doc2]
        result = await compute_current_progress_for_rag_configs(
            mock_project_magic, [rag_config]
        )

        # Should count 0 documents
        assert len(result) == 1
        assert "rag1" in result
        assert result["rag1"].total_document_count == 0

    @pytest.mark.asyncio
    async def test_rag_config_with_tags_and_extractions(
        self, mock_project_magic, mock_vector_store_count
    ):
        """Test progress calculation with tag filtering and existing extractions"""
        # Create documents with tags and extractions
        embedding1 = create_mock_embedding("embed1")
        chunked_doc1 = create_mock_chunked_document(
            "chunk1", [embedding1], num_chunks=3
        )
        extraction1 = create_mock_extraction("ext1", [chunked_doc1])
        doc1 = create_mock_document([extraction1], tags=["python", "ml"])

        # Document with different tag - should be filtered out
        embedding2 = create_mock_embedding("embed1")
        chunked_doc2 = create_mock_chunked_document(
            "chunk1", [embedding2], num_chunks=2
        )
        extraction2 = create_mock_extraction("ext1", [chunked_doc2])
        doc2 = create_mock_document([extraction2], tags=["java", "web"])

        # Document with matching tag but no extractions
        doc3 = create_mock_document([], tags=["python", "backend"])

        rag_config = create_mock_rag_config(
            "rag1", "ext1", "chunk1", "embed1", tags=["python"]
        )

        mock_project_magic.documents.return_value = [doc1, doc2, doc3]
        result = await compute_current_progress_for_rag_configs(
            mock_project_magic, [rag_config]
        )

        # Should only consider doc1 and doc3 (have "python" tag)
        assert len(result) == 1
        assert "rag1" in result
        progress = result["rag1"]

        assert progress.total_document_count == 2  # doc1 and doc3
        assert progress.total_document_extracted_count == 1  # only doc1 has extraction
        assert progress.total_document_chunked_count == 1  # only doc1 has chunks
        assert progress.total_document_embedded_count == 1  # only doc1 has embeddings
        assert progress.total_chunk_count == 3  # doc1 has 3 chunks

    @pytest.mark.asyncio
    async def test_multiple_rag_configs_different_tag_filters(
        self, mock_project_magic, mock_vector_store_count
    ):
        """Test multiple RAG configs with different tag filters"""
        # Create documents with various tags
        doc1 = create_mock_document([], tags=["python", "ml"])
        doc2 = create_mock_document([], tags=["javascript", "frontend"])
        doc3 = create_mock_document([], tags=["python", "web"])
        doc4 = create_mock_document([], tags=["rust", "systems"])

        # Two RAG configs with different tag filters
        rag_config1 = create_mock_rag_config(
            "rag1", "ext1", "chunk1", "embed1", tags=["python"]
        )
        rag_config2 = create_mock_rag_config(
            "rag2", "ext1", "chunk1", "embed1", tags=["javascript", "rust"]
        )

        mock_project_magic.documents.return_value = [doc1, doc2, doc3, doc4]
        result = await compute_current_progress_for_rag_configs(
            mock_project_magic, [rag_config1, rag_config2]
        )

        assert len(result) == 2

        # rag1 should count doc1 and doc3 (python)
        assert result["rag1"].total_document_count == 2

        # rag2 should count doc2 (javascript) and doc4 (rust)
        assert result["rag2"].total_document_count == 2

    @pytest.mark.asyncio
    async def test_rag_config_documents_with_no_tags(
        self, mock_project_magic, mock_vector_store_count
    ):
        """Test RAG config filtering when some documents have no tags"""
        # Mix of documents with and without tags
        doc1 = create_mock_document([], tags=["python", "ml"])
        doc2 = create_mock_document([], tags=None)  # No tags
        doc3 = create_mock_document([], tags=[])  # Empty tags
        doc4 = create_mock_document([], tags=["python", "web"])

        rag_config = create_mock_rag_config(
            "rag1", "ext1", "chunk1", "embed1", tags=["python"]
        )

        mock_project_magic.documents.return_value = [doc1, doc2, doc3, doc4]
        result = await compute_current_progress_for_rag_configs(
            mock_project_magic, [rag_config]
        )

        # Should only count doc1 and doc4 (have "python" tag)
        assert len(result) == 1
        assert result["rag1"].total_document_count == 2