kiln-ai: kiln_ai-0.20.1-py3-none-any.whl → kiln_ai-0.22.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of kiln-ai might be problematic. Click here for more details.

Files changed (133):
  1. kiln_ai/adapters/__init__.py +6 -0
  2. kiln_ai/adapters/adapter_registry.py +43 -226
  3. kiln_ai/adapters/chunkers/__init__.py +13 -0
  4. kiln_ai/adapters/chunkers/base_chunker.py +42 -0
  5. kiln_ai/adapters/chunkers/chunker_registry.py +16 -0
  6. kiln_ai/adapters/chunkers/fixed_window_chunker.py +39 -0
  7. kiln_ai/adapters/chunkers/helpers.py +23 -0
  8. kiln_ai/adapters/chunkers/test_base_chunker.py +63 -0
  9. kiln_ai/adapters/chunkers/test_chunker_registry.py +28 -0
  10. kiln_ai/adapters/chunkers/test_fixed_window_chunker.py +346 -0
  11. kiln_ai/adapters/chunkers/test_helpers.py +75 -0
  12. kiln_ai/adapters/data_gen/test_data_gen_task.py +9 -3
  13. kiln_ai/adapters/embedding/__init__.py +0 -0
  14. kiln_ai/adapters/embedding/base_embedding_adapter.py +44 -0
  15. kiln_ai/adapters/embedding/embedding_registry.py +32 -0
  16. kiln_ai/adapters/embedding/litellm_embedding_adapter.py +199 -0
  17. kiln_ai/adapters/embedding/test_base_embedding_adapter.py +283 -0
  18. kiln_ai/adapters/embedding/test_embedding_registry.py +166 -0
  19. kiln_ai/adapters/embedding/test_litellm_embedding_adapter.py +1149 -0
  20. kiln_ai/adapters/eval/eval_runner.py +6 -2
  21. kiln_ai/adapters/eval/test_base_eval.py +1 -3
  22. kiln_ai/adapters/eval/test_g_eval.py +1 -1
  23. kiln_ai/adapters/extractors/__init__.py +18 -0
  24. kiln_ai/adapters/extractors/base_extractor.py +72 -0
  25. kiln_ai/adapters/extractors/encoding.py +20 -0
  26. kiln_ai/adapters/extractors/extractor_registry.py +44 -0
  27. kiln_ai/adapters/extractors/extractor_runner.py +112 -0
  28. kiln_ai/adapters/extractors/litellm_extractor.py +406 -0
  29. kiln_ai/adapters/extractors/test_base_extractor.py +244 -0
  30. kiln_ai/adapters/extractors/test_encoding.py +54 -0
  31. kiln_ai/adapters/extractors/test_extractor_registry.py +181 -0
  32. kiln_ai/adapters/extractors/test_extractor_runner.py +181 -0
  33. kiln_ai/adapters/extractors/test_litellm_extractor.py +1290 -0
  34. kiln_ai/adapters/fine_tune/test_dataset_formatter.py +2 -2
  35. kiln_ai/adapters/fine_tune/test_fireworks_tinetune.py +2 -6
  36. kiln_ai/adapters/fine_tune/test_together_finetune.py +2 -6
  37. kiln_ai/adapters/ml_embedding_model_list.py +494 -0
  38. kiln_ai/adapters/ml_model_list.py +876 -18
  39. kiln_ai/adapters/model_adapters/litellm_adapter.py +40 -75
  40. kiln_ai/adapters/model_adapters/test_litellm_adapter.py +79 -1
  41. kiln_ai/adapters/model_adapters/test_litellm_adapter_tools.py +119 -5
  42. kiln_ai/adapters/model_adapters/test_saving_adapter_results.py +9 -3
  43. kiln_ai/adapters/model_adapters/test_structured_output.py +9 -10
  44. kiln_ai/adapters/ollama_tools.py +69 -12
  45. kiln_ai/adapters/provider_tools.py +190 -46
  46. kiln_ai/adapters/rag/deduplication.py +49 -0
  47. kiln_ai/adapters/rag/progress.py +252 -0
  48. kiln_ai/adapters/rag/rag_runners.py +844 -0
  49. kiln_ai/adapters/rag/test_deduplication.py +195 -0
  50. kiln_ai/adapters/rag/test_progress.py +785 -0
  51. kiln_ai/adapters/rag/test_rag_runners.py +2376 -0
  52. kiln_ai/adapters/remote_config.py +80 -8
  53. kiln_ai/adapters/test_adapter_registry.py +579 -86
  54. kiln_ai/adapters/test_ml_embedding_model_list.py +239 -0
  55. kiln_ai/adapters/test_ml_model_list.py +202 -0
  56. kiln_ai/adapters/test_ollama_tools.py +340 -1
  57. kiln_ai/adapters/test_prompt_builders.py +1 -1
  58. kiln_ai/adapters/test_provider_tools.py +199 -8
  59. kiln_ai/adapters/test_remote_config.py +551 -56
  60. kiln_ai/adapters/vector_store/__init__.py +1 -0
  61. kiln_ai/adapters/vector_store/base_vector_store_adapter.py +83 -0
  62. kiln_ai/adapters/vector_store/lancedb_adapter.py +389 -0
  63. kiln_ai/adapters/vector_store/test_base_vector_store.py +160 -0
  64. kiln_ai/adapters/vector_store/test_lancedb_adapter.py +1841 -0
  65. kiln_ai/adapters/vector_store/test_vector_store_registry.py +199 -0
  66. kiln_ai/adapters/vector_store/vector_store_registry.py +33 -0
  67. kiln_ai/datamodel/__init__.py +16 -13
  68. kiln_ai/datamodel/basemodel.py +201 -4
  69. kiln_ai/datamodel/chunk.py +158 -0
  70. kiln_ai/datamodel/datamodel_enums.py +27 -0
  71. kiln_ai/datamodel/embedding.py +64 -0
  72. kiln_ai/datamodel/external_tool_server.py +206 -54
  73. kiln_ai/datamodel/extraction.py +317 -0
  74. kiln_ai/datamodel/project.py +33 -1
  75. kiln_ai/datamodel/rag.py +79 -0
  76. kiln_ai/datamodel/task.py +5 -0
  77. kiln_ai/datamodel/task_output.py +41 -11
  78. kiln_ai/datamodel/test_attachment.py +649 -0
  79. kiln_ai/datamodel/test_basemodel.py +270 -14
  80. kiln_ai/datamodel/test_chunk_models.py +317 -0
  81. kiln_ai/datamodel/test_dataset_split.py +1 -1
  82. kiln_ai/datamodel/test_datasource.py +50 -0
  83. kiln_ai/datamodel/test_embedding_models.py +448 -0
  84. kiln_ai/datamodel/test_eval_model.py +6 -6
  85. kiln_ai/datamodel/test_external_tool_server.py +534 -152
  86. kiln_ai/datamodel/test_extraction_chunk.py +206 -0
  87. kiln_ai/datamodel/test_extraction_model.py +501 -0
  88. kiln_ai/datamodel/test_rag.py +641 -0
  89. kiln_ai/datamodel/test_task.py +35 -1
  90. kiln_ai/datamodel/test_tool_id.py +187 -1
  91. kiln_ai/datamodel/test_vector_store.py +320 -0
  92. kiln_ai/datamodel/tool_id.py +58 -0
  93. kiln_ai/datamodel/vector_store.py +141 -0
  94. kiln_ai/tools/base_tool.py +12 -3
  95. kiln_ai/tools/built_in_tools/math_tools.py +12 -4
  96. kiln_ai/tools/kiln_task_tool.py +158 -0
  97. kiln_ai/tools/mcp_server_tool.py +2 -2
  98. kiln_ai/tools/mcp_session_manager.py +51 -22
  99. kiln_ai/tools/rag_tools.py +164 -0
  100. kiln_ai/tools/test_kiln_task_tool.py +527 -0
  101. kiln_ai/tools/test_mcp_server_tool.py +4 -15
  102. kiln_ai/tools/test_mcp_session_manager.py +187 -227
  103. kiln_ai/tools/test_rag_tools.py +929 -0
  104. kiln_ai/tools/test_tool_registry.py +290 -7
  105. kiln_ai/tools/tool_registry.py +69 -16
  106. kiln_ai/utils/__init__.py +3 -0
  107. kiln_ai/utils/async_job_runner.py +62 -17
  108. kiln_ai/utils/config.py +2 -2
  109. kiln_ai/utils/env.py +15 -0
  110. kiln_ai/utils/filesystem.py +14 -0
  111. kiln_ai/utils/filesystem_cache.py +60 -0
  112. kiln_ai/utils/litellm.py +94 -0
  113. kiln_ai/utils/lock.py +100 -0
  114. kiln_ai/utils/mime_type.py +38 -0
  115. kiln_ai/utils/open_ai_types.py +19 -2
  116. kiln_ai/utils/pdf_utils.py +59 -0
  117. kiln_ai/utils/test_async_job_runner.py +151 -35
  118. kiln_ai/utils/test_env.py +142 -0
  119. kiln_ai/utils/test_filesystem_cache.py +316 -0
  120. kiln_ai/utils/test_litellm.py +206 -0
  121. kiln_ai/utils/test_lock.py +185 -0
  122. kiln_ai/utils/test_mime_type.py +66 -0
  123. kiln_ai/utils/test_open_ai_types.py +88 -12
  124. kiln_ai/utils/test_pdf_utils.py +86 -0
  125. kiln_ai/utils/test_uuid.py +111 -0
  126. kiln_ai/utils/test_validation.py +524 -0
  127. kiln_ai/utils/uuid.py +9 -0
  128. kiln_ai/utils/validation.py +90 -0
  129. {kiln_ai-0.20.1.dist-info → kiln_ai-0.22.0.dist-info}/METADATA +9 -1
  130. kiln_ai-0.22.0.dist-info/RECORD +213 -0
  131. kiln_ai-0.20.1.dist-info/RECORD +0 -138
  132. {kiln_ai-0.20.1.dist-info → kiln_ai-0.22.0.dist-info}/WHEEL +0 -0
  133. {kiln_ai-0.20.1.dist-info → kiln_ai-0.22.0.dist-info}/licenses/LICENSE.txt +0 -0
@@ -0,0 +1,785 @@
1
+ from unittest.mock import AsyncMock, MagicMock, patch
2
+
3
+ import pytest
4
+ from kiln_ai.adapters.rag.progress import (
5
+ LogMessage,
6
+ RagProgress,
7
+ compute_current_progress_for_rag_config,
8
+ compute_current_progress_for_rag_configs,
9
+ count_records_in_vector_store,
10
+ count_records_in_vector_store_for_rag_config,
11
+ )
12
+ from kiln_ai.datamodel.chunk import ChunkedDocument
13
+ from kiln_ai.datamodel.embedding import ChunkEmbeddings
14
+ from kiln_ai.datamodel.extraction import Document, Extraction
15
+ from kiln_ai.datamodel.project import Project
16
+ from kiln_ai.datamodel.rag import RagConfig
17
+
18
+
19
@pytest.fixture
def mock_project(tmp_path):
    """Provide a real ``Project`` saved to disk beneath pytest's ``tmp_path``.

    The project file lives at ``<tmp_path>/test_project/project.kiln``; the
    parent directory is created before the project is saved.
    """
    project_dir = tmp_path / "test_project"
    project_dir.mkdir()

    saved_project = Project(name="Test Project", path=project_dir / "project.kiln")
    saved_project.save_to_file()
    return saved_project
28
+
29
+
30
@pytest.fixture
def mock_project_magic():
    """Provide a ``MagicMock`` constrained to the ``Project`` spec.

    More flexible than the ``mock_project`` fixture: base model methods such
    as ``documents`` can be stubbed directly by each test.
    """
    project_double = MagicMock(spec=Project)
    return project_double
34
+
35
+
36
@pytest.fixture
def mock_vector_store_count():
    """Patch the per-RAG-config vector store record count to return 0.

    Yields the ``AsyncMock`` standing in for
    ``count_records_in_vector_store_for_rag_config`` so a test may inspect or
    reconfigure it; the patch is removed when the fixture is torn down.
    """
    patcher = patch(
        "kiln_ai.adapters.rag.progress.count_records_in_vector_store_for_rag_config",
        new_callable=AsyncMock,
        return_value=0,
    )
    count_mock = patcher.start()
    try:
        yield count_mock
    finally:
        # Always undo the patch, mirroring what a ``with patch(...)`` block does.
        patcher.stop()
45
+
46
+
47
def create_mock_embedding(embedding_config_id):
    """Build a ``MagicMock`` (spec ``ChunkEmbeddings``) tied to an embedding config.

    Args:
        embedding_config_id: Value exposed as ``embedding_config_id`` on the mock.

    Returns:
        The configured mock, with a fixed ``created_at`` timestamp string.
    """
    embedding_double = MagicMock(spec=ChunkEmbeddings)
    embedding_double.configure_mock(
        embedding_config_id=embedding_config_id,
        created_at="2024-01-01T00:00:00Z",
    )
    return embedding_double
53
+
54
+
55
def create_mock_chunked_document(chunker_config_id, embeddings=None, num_chunks=1):
    """Build a ``MagicMock`` (spec ``ChunkedDocument``) for a given chunker config.

    Args:
        chunker_config_id: Value exposed as ``chunker_config_id`` on the mock.
        embeddings: Objects returned by ``chunk_embeddings()``; defaults to an
            empty list.
        num_chunks: Length of the ``chunks`` list attached to the mock.

    Returns:
        The configured mock, with a fixed ``created_at`` timestamp string.
    """
    child_embeddings = [] if embeddings is None else embeddings

    chunked_double = MagicMock(spec=ChunkedDocument)
    chunked_double.chunker_config_id = chunker_config_id
    chunked_double.created_at = "2024-01-01T00:00:00Z"
    chunked_double.chunk_embeddings.return_value = child_embeddings

    # ``chunks`` is a plain list of placeholder mocks; it is built to have
    # exactly ``num_chunks`` entries.
    placeholder_chunks = []
    for _ in range(num_chunks):
        placeholder_chunks.append(MagicMock())
    chunked_double.chunks = placeholder_chunks
    return chunked_double
67
+
68
+
69
def create_mock_extraction(extractor_config_id, chunked_documents=None):
    """Build a ``MagicMock`` (spec ``Extraction``) for a given extractor config.

    Args:
        extractor_config_id: Value exposed as ``extractor_config_id`` on the mock.
        chunked_documents: Objects returned by ``chunked_documents()``;
            defaults to an empty list.

    Returns:
        The configured mock, with a fixed ``created_at`` timestamp string.
    """
    children = [] if chunked_documents is None else chunked_documents

    extraction_double = MagicMock(spec=Extraction)
    extraction_double.extractor_config_id = extractor_config_id
    extraction_double.created_at = "2024-01-01T00:00:00Z"
    extraction_double.chunked_documents.return_value = children
    return extraction_double
79
+
80
+
81
def create_mock_document(extractions=None, tags=None):
    """Build a ``MagicMock`` (spec ``Document``) with the given extractions and tags.

    Args:
        extractions: Objects returned by ``extractions()``; defaults to an
            empty list.
        tags: Value assigned to ``tags`` as-is (``None`` when omitted).

    Returns:
        The configured mock document.
    """
    children = [] if extractions is None else extractions

    document_double = MagicMock(spec=Document)
    document_double.tags = tags
    document_double.extractions.return_value = children
    return document_double
90
+
91
+
92
def create_mock_rag_config(
    config_id,
    extractor_config_id,
    chunker_config_id,
    embedding_config_id,
    vector_store_config_id="vector_store_1",
    tags=None,
):
    """Build a ``MagicMock`` (spec ``RagConfig``) carrying the given ids and tags.

    Args:
        config_id: Value exposed as ``id`` on the mock.
        extractor_config_id: Value exposed as ``extractor_config_id``.
        chunker_config_id: Value exposed as ``chunker_config_id``.
        embedding_config_id: Value exposed as ``embedding_config_id``.
        vector_store_config_id: Value exposed as ``vector_store_config_id``.
        tags: Value assigned to ``tags`` as-is (``None`` when omitted).

    Returns:
        The configured mock RAG config.
    """
    rag_config_double = MagicMock(spec=RagConfig)
    rag_config_double.configure_mock(
        id=config_id,
        extractor_config_id=extractor_config_id,
        chunker_config_id=chunker_config_id,
        embedding_config_id=embedding_config_id,
        vector_store_config_id=vector_store_config_id,
        tags=tags,
    )
    return rag_config_double
109
+
110
+
111
class TestLogMessage:
    """Construction checks for ``LogMessage``."""

    def test_log_message_creation(self):
        """Both fields are stored exactly as passed to the constructor."""
        entry = LogMessage(level="info", message="Test message")
        assert (entry.level, entry.message) == ("info", "Test message")

    def test_log_message_validation(self):
        """Each of the known levels is accepted and stored unchanged."""
        accepted_levels = ("info", "error", "warning")
        for candidate in accepted_levels:
            entry = LogMessage(level=candidate, message="Test")  # type: ignore
            assert entry.level == candidate
122
+
123
+
124
class TestRagProgress:
    """Default and explicit field values of ``RagProgress``."""

    def test_rag_progress_default_values(self):
        """A bare ``RagProgress()`` has all counters at 0 and ``logs`` unset."""
        progress = RagProgress()

        counter_fields = (
            "total_document_count",
            "total_document_completed_count",
            "total_chunk_count",
            "total_chunk_completed_count",
            "total_document_extracted_count",
            "total_document_extracted_error_count",
            "total_document_chunked_count",
            "total_document_chunked_error_count",
            "total_document_embedded_count",
            "total_document_embedded_error_count",
            "total_chunks_indexed_count",
            "total_chunks_indexed_error_count",
        )
        for field_name in counter_fields:
            assert getattr(progress, field_name) == 0, field_name
        assert progress.logs is None

    def test_rag_progress_with_values(self):
        """Values passed to the constructor are readable from the instance."""
        supplied_counts = {
            "total_document_count": 10,
            "total_document_completed_count": 5,
            "total_document_extracted_count": 8,
            "total_document_chunked_count": 6,
            "total_document_embedded_count": 5,
            "total_chunk_count": 6,
            "total_chunk_completed_count": 3,
            "total_chunks_indexed_count": 3,
        }
        progress = RagProgress(
            logs=[LogMessage(level="info", message="Processing")],
            **supplied_counts,
        )

        for field_name, supplied in supplied_counts.items():
            assert getattr(progress, field_name) == supplied, field_name

        assert progress.logs is not None
        assert len(progress.logs) == 1
        assert progress.logs[0].level == "info"
165
+
166
+
167
class TestComputeCurrentProgressForRagConfigs:
    """Drive ``compute_current_progress_for_rag_configs`` with mocked document trees.

    Each test requests ``mock_vector_store_count``, which patches the vector
    store record count to 0; the indexed and completed chunk counts are
    expected to be 0 throughout (presumably as a consequence of that patch).
    """

    @staticmethod
    def _expect(progress, **expected):
        """Assert that every named attribute of ``progress`` has the given value."""
        for field_name, wanted in expected.items():
            assert getattr(progress, field_name) == wanted, field_name

    @pytest.mark.asyncio
    async def test_empty_project_empty_configs(
        self, mock_project_magic, mock_vector_store_count
    ):
        """No documents and no RAG configs produce an empty mapping."""
        mock_project_magic.documents.return_value = []

        progress_by_config = await compute_current_progress_for_rag_configs(
            mock_project_magic, []
        )

        assert progress_by_config == {}

    @pytest.mark.asyncio
    async def test_empty_project_with_config(
        self, mock_project_magic, mock_vector_store_count
    ):
        """A config evaluated against a project without documents reports zeros."""
        mock_project_magic.documents.return_value = []
        config = create_mock_rag_config("rag1", "ext1", "chunk1", "embed1")

        progress_by_config = await compute_current_progress_for_rag_configs(
            mock_project_magic, [config]
        )

        assert "rag1" in progress_by_config
        self._expect(
            progress_by_config["rag1"],
            total_document_count=0,
            total_document_completed_count=0,
            total_document_extracted_count=0,
            total_document_chunked_count=0,
            total_document_embedded_count=0,
            total_chunks_indexed_count=0,
            total_chunk_count=0,
            total_chunk_completed_count=0,
        )

    @pytest.mark.asyncio
    async def test_documents_no_extractions(
        self, mock_project_magic, mock_vector_store_count
    ):
        """Documents without extractions are counted but show no stage progress."""
        mock_project_magic.documents.return_value = [
            create_mock_document() for _ in range(3)
        ]
        config = create_mock_rag_config("rag1", "ext1", "chunk1", "embed1")

        progress_by_config = await compute_current_progress_for_rag_configs(
            mock_project_magic, [config]
        )

        assert "rag1" in progress_by_config
        self._expect(
            progress_by_config["rag1"],
            total_document_count=3,
            total_document_completed_count=0,
            total_document_extracted_count=0,
            total_document_chunked_count=0,
            total_document_embedded_count=0,
            total_chunks_indexed_count=0,
        )

    @pytest.mark.asyncio
    async def test_full_pipeline_single_config(
        self, mock_project_magic, mock_vector_store_count
    ):
        """Two fully processed documents evaluated against one RAG config."""
        # Every document owns an independent extraction -> chunks -> embedding
        # tree, with 3 chunks per document.
        documents = [
            create_mock_document(
                [
                    create_mock_extraction(
                        "ext1",
                        [
                            create_mock_chunked_document(
                                "chunk1",
                                [create_mock_embedding("embed1")],
                                num_chunks=3,
                            )
                        ],
                    )
                ]
            )
            for _ in range(2)
        ]
        config = create_mock_rag_config("rag1", "ext1", "chunk1", "embed1")
        mock_project_magic.documents.return_value = documents

        progress_by_config = await compute_current_progress_for_rag_configs(
            mock_project_magic, [config]
        )

        assert "rag1" in progress_by_config
        self._expect(
            progress_by_config["rag1"],
            total_document_count=2,
            # min of extraction, chunking, embedding (all complete)
            total_document_completed_count=2,
            total_document_extracted_count=2,
            total_document_chunked_count=2,
            total_document_embedded_count=2,
            total_chunks_indexed_count=0,
            total_chunk_count=6,  # 2 documents * 3 chunks each
            total_chunk_completed_count=0,  # same as indexed count
        )

    @pytest.mark.asyncio
    async def test_partial_pipeline_progress(
        self, mock_project_magic, mock_vector_store_count
    ):
        """Four documents, each stopped at a different pipeline stage."""
        # Fully processed: extracted, chunked (2 chunks) and embedded.
        embedded_chunks = create_mock_chunked_document(
            "chunk1", [create_mock_embedding("embed1")], num_chunks=2
        )
        fully_processed = create_mock_document(
            [create_mock_extraction("ext1", [embedded_chunks])]
        )

        # Extracted and chunked (3 chunks) but carrying no embeddings.
        bare_chunks = create_mock_chunked_document("chunk1", [], num_chunks=3)
        chunked_only = create_mock_document(
            [create_mock_extraction("ext1", [bare_chunks])]
        )

        # Extracted only: the extraction has no chunked documents.
        extracted_only = create_mock_document([create_mock_extraction("ext1", [])])

        # Not extracted at all.
        untouched = create_mock_document([])

        config = create_mock_rag_config("rag1", "ext1", "chunk1", "embed1")
        mock_project_magic.documents.return_value = [
            fully_processed,
            chunked_only,
            extracted_only,
            untouched,
        ]

        progress_by_config = await compute_current_progress_for_rag_configs(
            mock_project_magic, [config]
        )

        assert "rag1" in progress_by_config
        self._expect(
            progress_by_config["rag1"],
            total_document_count=4,
            total_document_extracted_count=3,  # first three documents
            total_document_chunked_count=2,  # first two documents
            total_document_embedded_count=1,  # first document only
            total_chunks_indexed_count=0,
            total_chunk_count=5,  # 2 chunks + 3 chunks
            total_chunk_completed_count=0,  # same as indexed count
            total_document_completed_count=1,  # min(3,2,1) = 1
        )

    @pytest.mark.asyncio
    async def test_multiple_rag_configs_shared_prefixes(
        self, mock_project_magic, mock_vector_store_count
    ):
        """Two configs sharing extractor and chunker but differing in embedding."""
        # One chunked document (4 chunks) carries embeddings for both configs.
        shared_chunks = create_mock_chunked_document(
            "chunk1",
            [create_mock_embedding(embed_id) for embed_id in ("embed1", "embed2")],
            num_chunks=4,
        )
        document = create_mock_document(
            [create_mock_extraction("ext1", [shared_chunks])]
        )
        configs = [
            create_mock_rag_config(config_id, "ext1", "chunk1", embed_id)
            for config_id, embed_id in (("rag1", "embed1"), ("rag2", "embed2"))
        ]
        mock_project_magic.documents.return_value = [document]

        progress_by_config = await compute_current_progress_for_rag_configs(
            mock_project_magic, configs
        )

        assert "rag1" in progress_by_config
        assert "rag2" in progress_by_config

        # Both configs are expected to report identical numbers.
        for config_id in ("rag1", "rag2"):
            self._expect(
                progress_by_config[config_id],
                total_document_count=1,
                total_document_extracted_count=1,
                total_document_chunked_count=1,
                total_document_embedded_count=1,
                total_chunks_indexed_count=0,
                total_chunk_count=4,  # 4 chunks in the document
                total_chunk_completed_count=0,  # same as indexed count
                # min of extraction, chunking, embedding
                total_document_completed_count=1,
            )

    @pytest.mark.asyncio
    async def test_multiple_rag_configs_different_extractors(
        self, mock_project_magic, mock_vector_store_count
    ):
        """Two configs that differ only in which extractor they use."""
        # The same chunked document (5 chunks) hangs off both extractions.
        chunked = create_mock_chunked_document(
            "chunk1", [create_mock_embedding("embed1")], num_chunks=5
        )
        document = create_mock_document(
            [
                create_mock_extraction(extractor_id, [chunked])
                for extractor_id in ("ext1", "ext2")
            ]
        )
        configs = [
            create_mock_rag_config(config_id, extractor_id, "chunk1", "embed1")
            for config_id, extractor_id in (("rag1", "ext1"), ("rag2", "ext2"))
        ]
        mock_project_magic.documents.return_value = [document]

        progress_by_config = await compute_current_progress_for_rag_configs(
            mock_project_magic, configs
        )

        # The document has an extraction for each extractor, so both configs
        # should show progress.
        for config_id in ("rag1", "rag2"):
            assert config_id in progress_by_config
            self._expect(
                progress_by_config[config_id],
                total_document_count=1,
                total_document_extracted_count=1,
                total_document_chunked_count=1,
                total_document_embedded_count=1,
                total_chunks_indexed_count=0,
                total_chunk_count=5,  # 5 chunks in the document
                total_chunk_completed_count=0,  # same as indexed count
                # min of extraction, chunking, embedding
                total_document_completed_count=1,
            )

    @pytest.mark.asyncio
    async def test_complex_tree_structure(
        self, mock_project_magic, mock_vector_store_count
    ):
        """Several documents, extractors, chunkers and embeddings at once."""
        # First document, branch A: ext1 -> chunk1 (2 chunks) -> embed1, embed2
        branch_a_chunks = create_mock_chunked_document(
            "chunk1",
            [create_mock_embedding("embed1"), create_mock_embedding("embed2")],
            num_chunks=2,
        )
        branch_a = create_mock_extraction("ext1", [branch_a_chunks])

        # First document, branch B: ext2 -> chunk2 (3 chunks) -> embed1
        branch_b_chunks = create_mock_chunked_document(
            "chunk2", [create_mock_embedding("embed1")], num_chunks=3
        )
        branch_b = create_mock_extraction("ext2", [branch_b_chunks])

        first_document = create_mock_document([branch_a, branch_b])

        # Second document: ext1 -> chunk1 (4 chunks) -> embed1 only
        second_chunks = create_mock_chunked_document(
            "chunk1", [create_mock_embedding("embed1")], num_chunks=4
        )
        second_document = create_mock_document(
            [create_mock_extraction("ext1", [second_chunks])]
        )

        configs = [
            # Matches both documents.
            create_mock_rag_config("rag1", "ext1", "chunk1", "embed1"),
            # Fully matches the first document only.
            create_mock_rag_config("rag2", "ext1", "chunk1", "embed2"),
            # Matches the first document only.
            create_mock_rag_config("rag3", "ext2", "chunk2", "embed1"),
        ]
        mock_project_magic.documents.return_value = [first_document, second_document]

        progress_by_config = await compute_current_progress_for_rag_configs(
            mock_project_magic,
            configs,  # type: ignore
        )

        # rag1: ext1->chunk1->embed1 appears in both documents
        self._expect(
            progress_by_config["rag1"],
            total_document_count=2,
            total_document_extracted_count=2,
            total_document_chunked_count=2,
            total_document_embedded_count=2,
            total_chunks_indexed_count=0,
            total_chunk_count=6,  # 2 chunks + 4 chunks
            total_chunk_completed_count=0,  # same as indexed count
            total_document_completed_count=2,  # min of extraction, chunking, embedding
        )

        # rag2: ext1->chunk1->embed2 appears only in the first document
        self._expect(
            progress_by_config["rag2"],
            total_document_count=2,
            total_document_extracted_count=2,  # both documents have ext1
            total_document_chunked_count=2,  # both documents have ext1->chunk1
            total_document_embedded_count=1,  # only the first has embed2
            total_chunks_indexed_count=0,
            total_chunk_count=6,  # 2 chunks + 4 chunks
            total_chunk_completed_count=0,  # same as indexed count
            total_document_completed_count=1,  # min(2,2,1) = 1
        )

        # rag3: ext2->chunk2->embed1 appears only in the first document
        self._expect(
            progress_by_config["rag3"],
            total_document_count=2,
            total_document_extracted_count=1,  # only the first has ext2
            total_document_chunked_count=1,  # only the first has ext2->chunk2
            total_document_embedded_count=1,  # only the first has the full path
            total_chunks_indexed_count=0,
            total_chunk_count=3,  # ext2->chunk2 has 3 chunks
            total_chunk_completed_count=0,  # same as indexed count
            total_document_completed_count=1,  # min(1,1,1) = 1
        )
477
+
478
+
479
class TestComputeCurrentProgressForRagConfig:
    """Cover the single-config entry point ``compute_current_progress_for_rag_config``."""

    @pytest.mark.asyncio
    async def test_single_config_success(
        self, mock_project_magic, mock_vector_store_count
    ):
        """One fully processed document yields a populated ``RagProgress``."""
        chunked = create_mock_chunked_document(
            "chunk1", [create_mock_embedding("embed1")], num_chunks=3
        )
        mock_project_magic.documents.return_value = [
            create_mock_document([create_mock_extraction("ext1", [chunked])])
        ]
        config = create_mock_rag_config("rag1", "ext1", "chunk1", "embed1")

        progress = await compute_current_progress_for_rag_config(
            mock_project_magic, config
        )

        assert isinstance(progress, RagProgress)
        assert progress.total_document_count == 1
        assert progress.total_chunk_count == 3  # 3 chunks in the document
        assert progress.total_chunk_completed_count == 0  # same as indexed count
        # min of extraction, chunking, embedding
        assert progress.total_document_completed_count == 1

    @pytest.mark.asyncio
    async def test_single_config_not_found_error(
        self, mock_project_magic, mock_vector_store_count
    ):
        """A ``ValueError`` is raised when the config is absent from the results."""
        # This config will not be found (this shouldn't happen in practice).
        missing_config = create_mock_rag_config(
            "nonexistent", "ext1", "chunk1", "embed1"
        )

        # Force the multi-config function to return an empty dict so the
        # lookup for this config fails.
        empty_results = patch(
            "kiln_ai.adapters.rag.progress.compute_current_progress_for_rag_configs",
            new_callable=AsyncMock,
            return_value={},
        )
        expected_error = pytest.raises(
            ValueError,
            match="Failed to compute progress for rag config nonexistent",
        )
        with empty_results, expected_error:
            await compute_current_progress_for_rag_config(
                mock_project_magic, missing_config
            )
526
+
527
+
528
class TestCountRecordsInVectorStore:
    """Checks for ``count_records_in_vector_store``."""

    @pytest.mark.asyncio
    async def test_count_records_success(self):
        """The adapter's ``count_records`` result is returned to the caller."""
        rag_config = MagicMock()
        store_config = MagicMock()
        vector_store = AsyncMock()
        vector_store.count_records.return_value = 42

        adapter_patch = patch(
            "kiln_ai.adapters.rag.progress.vector_store_adapter_for_config",
            new_callable=AsyncMock,
            return_value=vector_store,
        )
        with adapter_patch as adapter_factory:
            record_count = await count_records_in_vector_store(
                rag_config, store_config
            )

        # The mocks keep their call history after the patch is lifted.
        assert record_count == 42
        adapter_factory.assert_called_once_with(rag_config, store_config)
        vector_store.count_records.assert_called_once()
551
+
552
+
553
class TestCountRecordsInVectorStoreForRagConfig:
    """Checks for ``count_records_in_vector_store_for_rag_config``."""

    @pytest.mark.asyncio
    async def test_count_records_success(self, mock_project):
        """The vector store config is loaded by id and its count is returned."""
        rag_config = MagicMock()
        rag_config.configure_mock(id="rag1", vector_store_config_id="vector_store_1")
        store_config = MagicMock()

        from_id_patch = patch(
            "kiln_ai.adapters.rag.progress.VectorStoreConfig.from_id_and_parent_path",
            return_value=store_config,
        )
        count_patch = patch(
            "kiln_ai.adapters.rag.progress.count_records_in_vector_store",
            new_callable=AsyncMock,
            return_value=25,
        )
        with from_id_patch as from_id_mock, count_patch as count_mock:
            record_count = await count_records_in_vector_store_for_rag_config(
                mock_project, rag_config
            )

        # The mocks keep their call history after the patches are lifted.
        assert record_count == 25
        from_id_mock.assert_called_once_with("vector_store_1", mock_project.path)
        count_mock.assert_called_once_with(rag_config, store_config)

    @pytest.mark.asyncio
    async def test_count_records_no_vector_store_config_error(self, mock_project):
        """A ``ValueError`` is raised when the vector store config lookup gives ``None``."""
        rag_config = MagicMock()
        rag_config.configure_mock(id="rag1", vector_store_config_id="vector_store_1")

        from_id_patch = patch(
            "kiln_ai.adapters.rag.progress.VectorStoreConfig.from_id_and_parent_path",
            return_value=None,
        )
        expected_error = pytest.raises(
            ValueError,
            match="Rag config rag1 has no vector store config",
        )
        with from_id_patch as from_id_mock:
            with expected_error:
                await count_records_in_vector_store_for_rag_config(
                    mock_project, rag_config
                )

        from_id_mock.assert_called_once_with("vector_store_1", mock_project.path)
606
+
607
+
608
+ class TestComputeCurrentProgressForRagConfigsWithTags:
609
+ """Test progress computation with document tag filtering"""
610
+
611
@pytest.mark.asyncio
async def test_rag_config_with_matching_tags(
    self, mock_project_magic, mock_vector_store_count
):
    """A single-tag filter: only some of the project's documents match."""
    # Tag sets for the four documents; the 1st and 3rd carry "python".
    tag_sets = [
        ["python", "backend"],
        ["javascript", "frontend"],
        ["python", "ml"],
        ["java", "backend"],
    ]
    documents = [create_mock_document([], tags=tag_set) for tag_set in tag_sets]

    # RAG config restricted to the "python" tag
    python_only_config = create_mock_rag_config(
        "rag1", "ext1", "chunk1", "embed1", tags=["python"]
    )

    mock_project_magic.documents.return_value = documents
    progress_by_id = await compute_current_progress_for_rag_configs(
        mock_project_magic, [python_only_config]
    )

    # Exactly one entry, keyed by the config id, counting the two
    # "python"-tagged documents.
    assert len(progress_by_id) == 1
    assert "rag1" in progress_by_id
    assert progress_by_id["rag1"].total_document_count == 2
636
+
637
@pytest.mark.asyncio
async def test_rag_config_with_multiple_tags(
    self, mock_project_magic, mock_vector_store_count
):
    """A multi-tag filter is expected to match a document on any listed tag (OR)."""
    # Only the first two documents carry one of the filtered tags.
    tag_sets = [
        ["python", "backend"],
        ["javascript", "frontend"],
        ["rust", "systems"],
        ["go", "backend"],
    ]
    documents = [create_mock_document([], tags=tag_set) for tag_set in tag_sets]

    # RAG config accepting "python" OR "javascript"
    either_tag_config = create_mock_rag_config(
        "rag1", "ext1", "chunk1", "embed1", tags=["python", "javascript"]
    )

    mock_project_magic.documents.return_value = documents
    progress_by_id = await compute_current_progress_for_rag_configs(
        mock_project_magic, [either_tag_config]
    )

    # One python document plus one javascript document
    assert len(progress_by_id) == 1
    assert "rag1" in progress_by_id
    assert progress_by_id["rag1"].total_document_count == 2
662
+
663
@pytest.mark.asyncio
async def test_rag_config_with_no_matching_tags(
    self, mock_project_magic, mock_vector_store_count
):
    """A filter that matches no document still yields an entry, with a zero total."""
    # Neither document carries the filtered tag.
    documents = [
        create_mock_document([], tags=tag_set)
        for tag_set in (["python", "backend"], ["javascript", "frontend"])
    ]

    # RAG config restricted to the "rust" tag
    rust_only_config = create_mock_rag_config(
        "rag1", "ext1", "chunk1", "embed1", tags=["rust"]
    )

    mock_project_magic.documents.return_value = documents
    progress_by_id = await compute_current_progress_for_rag_configs(
        mock_project_magic, [rust_only_config]
    )

    # The config is still reported, but with no documents counted.
    assert len(progress_by_id) == 1
    assert "rag1" in progress_by_id
    assert progress_by_id["rag1"].total_document_count == 0
686
+
687
@pytest.mark.asyncio
async def test_rag_config_with_tags_and_extractions(
    self, mock_project_magic, mock_vector_store_count
):
    """Tag filtering combined with documents that already have pipeline output."""

    def processed_document(num_chunks, tags):
        # Build embedding -> chunked document -> extraction -> document,
        # all using the ids the RAG config below refers to.
        embedding = create_mock_embedding("embed1")
        chunked = create_mock_chunked_document(
            "chunk1", [embedding], num_chunks=num_chunks
        )
        extraction = create_mock_extraction("ext1", [chunked])
        return create_mock_document([extraction], tags=tags)

    # Matching tag, fully processed with 3 chunks
    processed_python_doc = processed_document(3, ["python", "ml"])

    # Fully processed as well, but its tags do not match the filter
    processed_java_doc = processed_document(2, ["java", "web"])

    # Matching tag, nothing extracted yet
    bare_python_doc = create_mock_document([], tags=["python", "backend"])

    python_only_config = create_mock_rag_config(
        "rag1", "ext1", "chunk1", "embed1", tags=["python"]
    )

    mock_project_magic.documents.return_value = [
        processed_python_doc,
        processed_java_doc,
        bare_python_doc,
    ]
    progress_by_id = await compute_current_progress_for_rag_configs(
        mock_project_magic, [python_only_config]
    )

    assert len(progress_by_id) == 1
    assert "rag1" in progress_by_id
    rag1_progress = progress_by_id["rag1"]

    # Two documents pass the tag filter; only the processed one contributes
    # to the extracted / chunked / embedded counters and to the chunk total.
    assert rag1_progress.total_document_count == 2
    assert rag1_progress.total_document_extracted_count == 1
    assert rag1_progress.total_document_chunked_count == 1
    assert rag1_progress.total_document_embedded_count == 1
    assert rag1_progress.total_chunk_count == 3
730
+
731
@pytest.mark.asyncio
async def test_multiple_rag_configs_different_tag_filters(
    self, mock_project_magic, mock_vector_store_count
):
    """Two configs with different tag filters are each counted on their own."""
    # Two "python" documents, one "javascript" document, one "rust" document
    tag_sets = [
        ["python", "ml"],
        ["javascript", "frontend"],
        ["python", "web"],
        ["rust", "systems"],
    ]
    documents = [create_mock_document([], tags=tag_set) for tag_set in tag_sets]

    # Same pipeline ids, different tag filters
    python_config = create_mock_rag_config(
        "rag1", "ext1", "chunk1", "embed1", tags=["python"]
    )
    js_or_rust_config = create_mock_rag_config(
        "rag2", "ext1", "chunk1", "embed1", tags=["javascript", "rust"]
    )

    mock_project_magic.documents.return_value = documents
    progress_by_id = await compute_current_progress_for_rag_configs(
        mock_project_magic, [python_config, js_or_rust_config]
    )

    assert len(progress_by_id) == 2

    # rag1: the two python documents
    assert progress_by_id["rag1"].total_document_count == 2

    # rag2: the javascript document plus the rust document
    assert progress_by_id["rag2"].total_document_count == 2
762
+
763
@pytest.mark.asyncio
async def test_rag_config_documents_with_no_tags(
    self, mock_project_magic, mock_vector_store_count
):
    """Documents whose tags are None or empty must not match a tag filter."""
    # The 2nd document has no tags at all (None), the 3rd an empty tag list.
    tag_sets = [
        ["python", "ml"],
        None,
        [],
        ["python", "web"],
    ]
    documents = [create_mock_document([], tags=tag_set) for tag_set in tag_sets]

    python_only_config = create_mock_rag_config(
        "rag1", "ext1", "chunk1", "embed1", tags=["python"]
    )

    mock_project_magic.documents.return_value = documents
    progress_by_id = await compute_current_progress_for_rag_configs(
        mock_project_magic, [python_only_config]
    )

    # Only the two "python"-tagged documents are counted.
    assert len(progress_by_id) == 1
    assert progress_by_id["rag1"].total_document_count == 2