kiln-ai 0.21.0__py3-none-any.whl → 0.22.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of kiln-ai might be problematic.
- kiln_ai/adapters/extractors/litellm_extractor.py +52 -32
- kiln_ai/adapters/extractors/test_litellm_extractor.py +169 -71
- kiln_ai/adapters/ml_embedding_model_list.py +330 -28
- kiln_ai/adapters/ml_model_list.py +503 -23
- kiln_ai/adapters/model_adapters/litellm_adapter.py +39 -8
- kiln_ai/adapters/model_adapters/test_litellm_adapter.py +78 -0
- kiln_ai/adapters/model_adapters/test_litellm_adapter_tools.py +119 -5
- kiln_ai/adapters/model_adapters/test_saving_adapter_results.py +9 -3
- kiln_ai/adapters/model_adapters/test_structured_output.py +6 -9
- kiln_ai/adapters/test_ml_embedding_model_list.py +89 -279
- kiln_ai/adapters/test_ml_model_list.py +0 -10
- kiln_ai/adapters/vector_store/lancedb_adapter.py +24 -70
- kiln_ai/adapters/vector_store/lancedb_helpers.py +101 -0
- kiln_ai/adapters/vector_store/test_lancedb_adapter.py +9 -16
- kiln_ai/adapters/vector_store/test_lancedb_helpers.py +142 -0
- kiln_ai/adapters/vector_store_loaders/__init__.py +0 -0
- kiln_ai/adapters/vector_store_loaders/test_lancedb_loader.py +282 -0
- kiln_ai/adapters/vector_store_loaders/test_vector_store_loader.py +544 -0
- kiln_ai/adapters/vector_store_loaders/vector_store_loader.py +91 -0
- kiln_ai/datamodel/basemodel.py +31 -3
- kiln_ai/datamodel/external_tool_server.py +206 -54
- kiln_ai/datamodel/extraction.py +14 -0
- kiln_ai/datamodel/task.py +5 -0
- kiln_ai/datamodel/task_output.py +41 -11
- kiln_ai/datamodel/test_attachment.py +3 -3
- kiln_ai/datamodel/test_basemodel.py +269 -13
- kiln_ai/datamodel/test_datasource.py +50 -0
- kiln_ai/datamodel/test_external_tool_server.py +534 -152
- kiln_ai/datamodel/test_extraction_model.py +31 -0
- kiln_ai/datamodel/test_task.py +35 -1
- kiln_ai/datamodel/test_tool_id.py +106 -1
- kiln_ai/datamodel/tool_id.py +49 -0
- kiln_ai/tools/base_tool.py +30 -6
- kiln_ai/tools/built_in_tools/math_tools.py +12 -4
- kiln_ai/tools/kiln_task_tool.py +162 -0
- kiln_ai/tools/mcp_server_tool.py +7 -5
- kiln_ai/tools/mcp_session_manager.py +50 -24
- kiln_ai/tools/rag_tools.py +17 -6
- kiln_ai/tools/test_kiln_task_tool.py +527 -0
- kiln_ai/tools/test_mcp_server_tool.py +4 -15
- kiln_ai/tools/test_mcp_session_manager.py +186 -226
- kiln_ai/tools/test_rag_tools.py +86 -5
- kiln_ai/tools/test_tool_registry.py +199 -5
- kiln_ai/tools/tool_registry.py +49 -17
- kiln_ai/utils/filesystem.py +4 -4
- kiln_ai/utils/open_ai_types.py +19 -2
- kiln_ai/utils/pdf_utils.py +21 -0
- kiln_ai/utils/test_open_ai_types.py +88 -12
- kiln_ai/utils/test_pdf_utils.py +14 -1
- {kiln_ai-0.21.0.dist-info → kiln_ai-0.22.1.dist-info}/METADATA +79 -1
- {kiln_ai-0.21.0.dist-info → kiln_ai-0.22.1.dist-info}/RECORD +53 -45
- {kiln_ai-0.21.0.dist-info → kiln_ai-0.22.1.dist-info}/WHEEL +0 -0
- {kiln_ai-0.21.0.dist-info → kiln_ai-0.22.1.dist-info}/licenses/LICENSE.txt +0 -0
kiln_ai/adapters/vector_store_loaders/test_vector_store_loader.py
ADDED
@@ -0,0 +1,544 @@
+import uuid
+from dataclasses import dataclass
+
+import pytest
+
+from kiln_ai.adapters.vector_store_loaders.vector_store_loader import VectorStoreLoader
+from kiln_ai.datamodel.chunk import Chunk, ChunkedDocument
+from kiln_ai.datamodel.datamodel_enums import KilnMimeType
+from kiln_ai.datamodel.embedding import ChunkEmbeddings, Embedding
+from kiln_ai.datamodel.extraction import (
+    Document,
+    Extraction,
+    ExtractionSource,
+    FileInfo,
+    Kind,
+)
+from kiln_ai.datamodel.project import Project
+from kiln_ai.datamodel.rag import RagConfig
+
+
+@dataclass
+class DocWithChunks:
+    document: Document
+    extraction: Extraction
+    chunked_document: ChunkedDocument
+    chunked_embeddings: ChunkEmbeddings
+
+
+def lorem_ipsum(n: int) -> str:
+    return " ".join(
+        ["Lorem ipsum dolor sit amet, consectetur adipiscing elit." for _ in range(n)]
+    )
+
+
+@pytest.fixture
+def mock_chunks_factory(mock_attachment_factory):
+    def fn(
+        project: Project,
+        rag_config: RagConfig,
+        num_chunks: int = 1,
+        text: str | None = None,
+        extractor_config_id: str | None = None,
+        chunker_config_id: str | None = None,
+        embedding_config_id: str | None = None,
+    ) -> DocWithChunks:
+        doc = Document(
+            id=f"doc_{uuid.uuid4()}",
+            name="Test Document",
+            description="Test Document",
+            original_file=FileInfo(
+                filename="test.pdf",
+                size=100,
+                mime_type="application/pdf",
+                attachment=mock_attachment_factory(KilnMimeType.PDF),
+            ),
+            kind=Kind.DOCUMENT,
+            parent=project,
+        )
+        doc.save_to_file()
+
+        extraction = Extraction(
+            source=ExtractionSource.PROCESSED,
+            extractor_config_id=extractor_config_id or rag_config.extractor_config_id,
+            output=mock_attachment_factory(KilnMimeType.PDF),
+            parent=doc,
+        )
+        extraction.save_to_file()
+
+        chunks = [
+            Chunk(
+                content=mock_attachment_factory(
+                    KilnMimeType.TXT, text=f"text-{i}: {text or lorem_ipsum(10)}"
+                )
+            )
+            for i in range(num_chunks)
+        ]
+        chunked_document = ChunkedDocument(
+            chunks=chunks,
+            chunker_config_id=chunker_config_id or rag_config.chunker_config_id,
+            parent=extraction,
+        )
+        chunked_document.save_to_file()
+        chunked_embeddings = ChunkEmbeddings(
+            embeddings=[
+                Embedding(vector=[i + 0.1, i + 0.2, i + 0.3, i + 0.4, i + 0.5])
+                for i in range(num_chunks)
+            ],
+            embedding_config_id=embedding_config_id or rag_config.embedding_config_id,
+            parent=chunked_document,
+        )
+        chunked_embeddings.save_to_file()
+        return DocWithChunks(
+            document=doc,
+            extraction=extraction,
+            chunked_document=chunked_document,
+            chunked_embeddings=chunked_embeddings,
+        )
+
+    return fn
+
+
+@pytest.fixture
+def mock_project(tmp_path):
+    project = Project(
+        name="Test Project", path=tmp_path / "test_project" / "project.kiln"
+    )
+    project.save_to_file()
+    return project
+
+
+@pytest.fixture
+def rag_config_factory(mock_project):
+    def fn(
+        extractor_config_id: str = "test_extractor",
+        chunker_config_id: str = "test_chunker",
+        embedding_config_id: str = "test_embedding",
+    ) -> RagConfig:
+        rag_config = RagConfig(
+            name="Test Rag Config",
+            parent=mock_project,
+            vector_store_config_id="test_vector_store",
+            tool_name="test_tool",
+            tool_description="test_description",
+            extractor_config_id=extractor_config_id,
+            chunker_config_id=chunker_config_id,
+            embedding_config_id=embedding_config_id,
+        )
+        rag_config.save_to_file()
+        return rag_config
+
+    return fn
+
+
+# Tests for VectorStoreLoader.iter_llama_index_nodes
+
+
+@pytest.mark.asyncio
+async def test_iter_llama_index_nodes_single_document(
+    mock_project, mock_chunks_factory, rag_config_factory
+):
+    """Test iter_llama_index_nodes with a single document that matches all config IDs."""
+    rag_config = rag_config_factory()
+    loader = VectorStoreLoader(project=mock_project, rag_config=rag_config)
+
+    # Create a document with chunks
+    doc_with_chunks = mock_chunks_factory(
+        mock_project, rag_config, num_chunks=3, text="Test content"
+    )
+
+    # Test iterating through nodes
+    all_nodes = []
+    async for batch in loader.iter_llama_index_nodes():
+        all_nodes.extend(batch)
+
+    assert len(all_nodes) == 3
+    # Check that all nodes have the correct document ID
+    for node in all_nodes:
+        assert node.metadata["kiln_doc_id"] == str(doc_with_chunks.document.id)
+        assert "kiln_chunk_idx" in node.metadata
+        assert "text" in node.text
+
+
+@pytest.mark.asyncio
+async def test_iter_llama_index_nodes_multiple_documents(
+    mock_project, mock_chunks_factory, rag_config_factory
+):
+    """Test iter_llama_index_nodes with multiple documents."""
+    rag_config = rag_config_factory()
+    loader = VectorStoreLoader(project=mock_project, rag_config=rag_config)
+
+    # Create multiple documents
+    doc1 = mock_chunks_factory(mock_project, rag_config, num_chunks=2, text="Doc 1")
+    doc2 = mock_chunks_factory(mock_project, rag_config, num_chunks=3, text="Doc 2")
+    doc3 = mock_chunks_factory(mock_project, rag_config, num_chunks=1, text="Doc 3")
+
+    # Test iterating through nodes
+    all_nodes = []
+    async for batch in loader.iter_llama_index_nodes():
+        all_nodes.extend(batch)
+
+    assert len(all_nodes) == 6  # 2 + 3 + 1 = 6 total chunks
+
+    # Group nodes by document ID
+    nodes_by_doc = {}
+    for node in all_nodes:
+        doc_id = node.metadata["kiln_doc_id"]
+        if doc_id not in nodes_by_doc:
+            nodes_by_doc[doc_id] = []
+        nodes_by_doc[doc_id].append(node)
+
+    # Check that we have nodes from all three documents
+    expected_doc_ids = {
+        str(doc1.document.id),
+        str(doc2.document.id),
+        str(doc3.document.id),
+    }
+    assert set(nodes_by_doc.keys()) == expected_doc_ids
+
+    # Check chunk counts
+    assert len(nodes_by_doc[str(doc1.document.id)]) == 2
+    assert len(nodes_by_doc[str(doc2.document.id)]) == 3
+    assert len(nodes_by_doc[str(doc3.document.id)]) == 1
+
+
+@pytest.mark.asyncio
+async def test_iter_llama_index_nodes_filters_by_extractor_config_id(
+    mock_project, mock_chunks_factory, rag_config_factory
+):
+    """Test that iter_llama_index_nodes filters by extractor_config_id."""
+    rag_config = rag_config_factory(extractor_config_id="target_extractor")
+    loader = VectorStoreLoader(project=mock_project, rag_config=rag_config)
+
+    # Create documents with different extractor config IDs
+    matching_doc = mock_chunks_factory(
+        mock_project,
+        rag_config,
+        num_chunks=2,
+        text="Matching doc",
+        extractor_config_id="target_extractor",
+    )
+    mock_chunks_factory(
+        mock_project,
+        rag_config,
+        num_chunks=2,
+        text="Non-matching doc",
+        extractor_config_id="other_extractor",
+    )
+
+    # Test iterating through nodes
+    all_nodes = []
+    async for batch in loader.iter_llama_index_nodes():
+        all_nodes.extend(batch)
+
+    assert len(all_nodes) == 2  # Only the matching document's chunks
+    for node in all_nodes:
+        assert node.metadata["kiln_doc_id"] == str(matching_doc.document.id)
+
+
+@pytest.mark.asyncio
+async def test_iter_llama_index_nodes_filters_by_chunker_config_id(
+    mock_project, mock_chunks_factory, rag_config_factory
+):
+    """Test that iter_llama_index_nodes filters by chunker_config_id."""
+    rag_config = rag_config_factory(chunker_config_id="target_chunker")
+    loader = VectorStoreLoader(project=mock_project, rag_config=rag_config)
+
+    # Create documents with different chunker config IDs
+    matching_doc = mock_chunks_factory(
+        mock_project,
+        rag_config,
+        num_chunks=2,
+        text="Matching doc",
+        chunker_config_id="target_chunker",
+    )
+    mock_chunks_factory(
+        mock_project,
+        rag_config,
+        num_chunks=2,
+        text="Non-matching doc",
+        chunker_config_id="other_chunker",
+    )
+
+    # Test iterating through nodes
+    all_nodes = []
+    async for batch in loader.iter_llama_index_nodes():
+        all_nodes.extend(batch)
+
+    assert len(all_nodes) == 2  # Only the matching document's chunks
+    for node in all_nodes:
+        assert node.metadata["kiln_doc_id"] == str(matching_doc.document.id)
+
+
+@pytest.mark.asyncio
+async def test_iter_llama_index_nodes_filters_by_embedding_config_id(
+    mock_project, mock_chunks_factory, rag_config_factory
+):
+    """Test that iter_llama_index_nodes filters by embedding_config_id."""
+    rag_config = rag_config_factory(embedding_config_id="target_embedding")
+    loader = VectorStoreLoader(project=mock_project, rag_config=rag_config)
+
+    # Create documents with different embedding config IDs
+    matching_doc = mock_chunks_factory(
+        mock_project,
+        rag_config,
+        num_chunks=2,
+        text="Matching doc",
+        embedding_config_id="target_embedding",
+    )
+    mock_chunks_factory(
+        mock_project,
+        rag_config,
+        num_chunks=2,
+        text="Non-matching doc",
+        embedding_config_id="other_embedding",
+    )
+
+    # Test iterating through nodes
+    all_nodes = []
+    async for batch in loader.iter_llama_index_nodes():
+        all_nodes.extend(batch)
+
+    assert len(all_nodes) == 2  # Only the matching document's chunks
+    for node in all_nodes:
+        assert node.metadata["kiln_doc_id"] == str(matching_doc.document.id)
+
+
+@pytest.mark.asyncio
+async def test_iter_llama_index_nodes_filters_by_all_config_ids(
+    mock_project, mock_chunks_factory, rag_config_factory
+):
+    """Test that iter_llama_index_nodes filters by all config IDs simultaneously."""
+    rag_config = rag_config_factory(
+        extractor_config_id="target_extractor",
+        chunker_config_id="target_chunker",
+        embedding_config_id="target_embedding",
+    )
+    loader = VectorStoreLoader(project=mock_project, rag_config=rag_config)
+
+    # Create documents with different combinations of config IDs
+    fully_matching_doc = mock_chunks_factory(
+        mock_project,
+        rag_config,
+        num_chunks=2,
+        text="Fully matching doc",
+        extractor_config_id="target_extractor",
+        chunker_config_id="target_chunker",
+        embedding_config_id="target_embedding",
+    )
+    mock_chunks_factory(
+        mock_project,
+        rag_config,
+        num_chunks=2,
+        text="Partially matching doc",
+        extractor_config_id="target_extractor",
+        chunker_config_id="other_chunker",  # Different chunker
+        embedding_config_id="target_embedding",
+    )
+    mock_chunks_factory(
+        mock_project,
+        rag_config,
+        num_chunks=2,
+        text="Non-matching doc",
+        extractor_config_id="other_extractor",
+        chunker_config_id="other_chunker",
+        embedding_config_id="other_embedding",
+    )
+
+    # Test iterating through nodes
+    all_nodes = []
+    async for batch in loader.iter_llama_index_nodes():
+        all_nodes.extend(batch)
+
+    assert len(all_nodes) == 2  # Only the fully matching document's chunks
+    for node in all_nodes:
+        assert node.metadata["kiln_doc_id"] == str(fully_matching_doc.document.id)
+
+
+@pytest.mark.asyncio
+async def test_iter_llama_index_nodes_empty_project(mock_project, rag_config_factory):
+    """Test iter_llama_index_nodes with an empty project."""
+    rag_config = rag_config_factory()
+    loader = VectorStoreLoader(project=mock_project, rag_config=rag_config)
+
+    # Test iterating through nodes
+    all_nodes = []
+    async for batch in loader.iter_llama_index_nodes():
+        all_nodes.extend(batch)
+
+    assert len(all_nodes) == 0
+
+
+@pytest.mark.asyncio
+async def test_iter_llama_index_nodes_batch_size(
+    mock_project, mock_chunks_factory, rag_config_factory
+):
+    """Test that iter_llama_index_nodes respects batch_size parameter."""
+    rag_config = rag_config_factory()
+    loader = VectorStoreLoader(project=mock_project, rag_config=rag_config)
+
+    # Create a document with many chunks
+    mock_chunks_factory(mock_project, rag_config, num_chunks=5, text="Test content")
+
+    # Test with small batch size
+    batch_size = 2
+    batches = []
+    async for batch in loader.iter_llama_index_nodes(batch_size=batch_size):
+        batches.append(batch)
+        assert len(batch) <= batch_size
+
+    # Should have 3 batches: [2, 2, 1] chunks
+    assert len(batches) == 3
+    assert len(batches[0]) == 2
+    assert len(batches[1]) == 2
+    assert len(batches[2]) == 1
+
+
+@pytest.mark.asyncio
+async def test_iter_llama_index_nodes_chunk_text_embedding_mismatch(
+    mock_project, mock_chunks_factory, rag_config_factory, mock_attachment_factory
+):
+    """Test that iter_llama_index_nodes raises error on chunk text/embedding count mismatch."""
+    rag_config = rag_config_factory()
+    loader = VectorStoreLoader(project=mock_project, rag_config=rag_config)
+
+    # Create a document with mismatched chunk text and embeddings
+    doc = Document(
+        id=f"doc_{uuid.uuid4()}",
+        name="Test Document",
+        description="Test Document",
+        original_file=FileInfo(
+            filename="test.pdf",
+            size=100,
+            mime_type="application/pdf",
+            attachment=mock_attachment_factory(KilnMimeType.PDF),
+        ),
+        kind=Kind.DOCUMENT,
+        parent=mock_project,
+    )
+    doc.save_to_file()
+
+    extraction = Extraction(
+        source=ExtractionSource.PROCESSED,
+        extractor_config_id=rag_config.extractor_config_id,
+        output=mock_attachment_factory(KilnMimeType.PDF),
+        parent=doc,
+    )
+    extraction.save_to_file()
+
+    # Create 2 chunks but only 1 embedding
+    chunks = [
+        Chunk(content=mock_attachment_factory(KilnMimeType.TXT, text=f"chunk-{i}"))
+        for i in range(2)
+    ]
+    chunked_document = ChunkedDocument(
+        chunks=chunks,
+        chunker_config_id=rag_config.chunker_config_id,
+        parent=extraction,
+    )
+    chunked_document.save_to_file()
+
+    # Only 1 embedding for 2 chunks
+    chunked_embeddings = ChunkEmbeddings(
+        embeddings=[Embedding(vector=[0.1, 0.2, 0.3])],  # Only 1 embedding
+        embedding_config_id=rag_config.embedding_config_id,
+        parent=chunked_document,
+    )
+    chunked_embeddings.save_to_file()
+
+    # Test that it raises an error
+    with pytest.raises(ValueError, match="Chunk text/embedding count mismatch"):
+        async for batch in loader.iter_llama_index_nodes():
+            pass
+
+
+@pytest.mark.asyncio
+async def test_iter_llama_index_nodes_multiple_extractions_per_document(
+    mock_project, mock_chunks_factory, rag_config_factory, mock_attachment_factory
+):
+    """Test iter_llama_index_nodes with multiple extractions per document."""
+    rag_config = rag_config_factory()
+    loader = VectorStoreLoader(project=mock_project, rag_config=rag_config)
+
+    # Create a document
+    doc = Document(
+        id=f"doc_{uuid.uuid4()}",
+        name="Test Document",
+        description="Test Document",
+        original_file=FileInfo(
+            filename="test.pdf",
+            size=100,
+            mime_type="application/pdf",
+            attachment=mock_attachment_factory(KilnMimeType.PDF),
+        ),
+        kind=Kind.DOCUMENT,
+        parent=mock_project,
+    )
+    doc.save_to_file()
+
+    # Create multiple extractions for the same document
+    extraction1 = Extraction(
+        source=ExtractionSource.PROCESSED,
+        extractor_config_id=rag_config.extractor_config_id,
+        output=mock_attachment_factory(KilnMimeType.PDF),
+        parent=doc,
+    )
+    extraction1.save_to_file()
+
+    extraction2 = Extraction(
+        source=ExtractionSource.PROCESSED,
+        extractor_config_id="other_extractor",  # Different extractor
+        output=mock_attachment_factory(KilnMimeType.PDF),
+        parent=doc,
+    )
+    extraction2.save_to_file()
+
+    # Create chunked documents and embeddings for each extraction
+    chunks1 = [
+        Chunk(content=mock_attachment_factory(KilnMimeType.TXT, text=f"chunk1-{i}"))
+        for i in range(2)
+    ]
+    chunked_doc1 = ChunkedDocument(
+        chunks=chunks1,
+        chunker_config_id=rag_config.chunker_config_id,
+        parent=extraction1,
+    )
+    chunked_doc1.save_to_file()
+
+    chunks2 = [
+        Chunk(content=mock_attachment_factory(KilnMimeType.TXT, text=f"chunk2-{i}"))
+        for i in range(3)
+    ]
+    chunked_doc2 = ChunkedDocument(
+        chunks=chunks2,
+        chunker_config_id=rag_config.chunker_config_id,
+        parent=extraction2,
+    )
+    chunked_doc2.save_to_file()
+
+    # Create embeddings for each chunked document
+    embeddings1 = ChunkEmbeddings(
+        embeddings=[Embedding(vector=[0.1, 0.2, 0.3]) for _ in range(2)],
+        embedding_config_id=rag_config.embedding_config_id,
+        parent=chunked_doc1,
+    )
+    embeddings1.save_to_file()
+
+    embeddings2 = ChunkEmbeddings(
+        embeddings=[Embedding(vector=[0.4, 0.5, 0.6]) for _ in range(3)],
+        embedding_config_id=rag_config.embedding_config_id,
+        parent=chunked_doc2,
+    )
+    embeddings2.save_to_file()
+
+    # Test iterating through nodes
+    all_nodes = []
+    async for batch in loader.iter_llama_index_nodes():
+        all_nodes.extend(batch)
+
+    # Should only return nodes from the first extraction since the second has a different extractor_config_id
+    assert len(all_nodes) == 2
+    for node in all_nodes:
+        assert node.metadata["kiln_doc_id"] == str(doc.id)
+        # All nodes should have chunk indices 0 and 1 (from the first extraction)
+        assert node.metadata["kiln_chunk_idx"] in [0, 1]
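The batch-size test above pins down the generator's yielding contract: full batches of at most batch_size nodes, with one short batch flushed at the end for any remainder. As a minimal, self-contained sketch of that batching pattern (generic Python, not part of this package; iter_batches is a hypothetical name):

import asyncio
from typing import AsyncGenerator, Iterable, List, TypeVar

T = TypeVar("T")


async def iter_batches(
    items: Iterable[T], batch_size: int
) -> AsyncGenerator[List[T], None]:
    batch: List[T] = []
    for item in items:
        await asyncio.sleep(0)  # cooperative yield, mirroring the loader's per-document sleep
        batch.append(item)
        if len(batch) >= batch_size:
            yield batch
            batch = []
    if batch:
        yield batch  # final short batch, hence sizes [2, 2, 1] for 5 items at size 2


async def main() -> None:
    sizes = [len(b) async for b in iter_batches(range(5), 2)]
    assert sizes == [2, 2, 1]


asyncio.run(main())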
kiln_ai/adapters/vector_store_loaders/vector_store_loader.py
ADDED
@@ -0,0 +1,91 @@
+import asyncio
+from typing import AsyncGenerator, List
+
+from llama_index.core.schema import TextNode
+
+from kiln_ai.adapters.rag.deduplication import (
+    deduplicate_chunk_embeddings,
+    deduplicate_chunked_documents,
+    deduplicate_extractions,
+)
+from kiln_ai.adapters.vector_store.lancedb_helpers import (
+    convert_to_llama_index_node,
+    deterministic_chunk_id,
+)
+from kiln_ai.datamodel.project import Project
+from kiln_ai.datamodel.rag import RagConfig
+
+
+class VectorStoreLoader:
+    """
+    Class for loading data as LlamaIndex Nodes.
+    """
+
+    def __init__(
+        self,
+        project: Project,
+        rag_config: RagConfig,
+    ):
+        self.project = project
+        self.rag_config = rag_config
+
+    async def iter_llama_index_nodes(
+        self, batch_size: int = 100
+    ) -> AsyncGenerator[List[TextNode], None]:
+        """Returns a generator of documents with their corresponding chunks and embeddings."""
+        batch: List[TextNode] = []
+        for document in self.project.documents():
+            await asyncio.sleep(0)
+            for extraction in deduplicate_extractions(document.extractions()):
+                if (
+                    extraction.extractor_config_id
+                    != self.rag_config.extractor_config_id
+                ):
+                    continue
+                for chunked_document in deduplicate_chunked_documents(
+                    extraction.chunked_documents()
+                ):
+                    if (
+                        chunked_document.chunker_config_id
+                        != self.rag_config.chunker_config_id
+                    ):
+                        continue
+                    for chunk_embeddings in deduplicate_chunk_embeddings(
+                        chunked_document.chunk_embeddings()
+                    ):
+                        if (
+                            chunk_embeddings.embedding_config_id
+                            != self.rag_config.embedding_config_id
+                        ):
+                            continue
+
+                        document_id = str(document.id)
+                        chunks_text = await chunked_document.load_chunks_text()
+                        embeddings = chunk_embeddings.embeddings
+                        if len(chunks_text) != len(embeddings):
+                            raise ValueError(
+                                f"Chunk text/embedding count mismatch for document {document_id}: "
+                                f"{len(chunks_text)} texts vs {len(embeddings)} embeddings"
+                            )
+
+                        for chunk_idx, (chunk_text, chunk_embeddings) in enumerate(
+                            zip(chunks_text, embeddings)
+                        ):
+                            batch.append(
+                                convert_to_llama_index_node(
+                                    document_id=document_id,
+                                    chunk_idx=chunk_idx,
+                                    node_id=deterministic_chunk_id(
+                                        document_id, chunk_idx
+                                    ),
+                                    text=chunk_text,
+                                    vector=chunk_embeddings.vector,
+                                )
+                            )
+
+                            if len(batch) >= batch_size:
+                                yield batch
+                                batch = []
+
+        if batch:
+            yield batch
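A minimal consumption sketch for the new loader, assuming a project and RAG config already saved to disk. The count_nodes helper and the file paths are illustrative; load_from_file is the KilnBaseModel loader referenced in the basemodel.py diff below:

import asyncio

from kiln_ai.adapters.vector_store_loaders.vector_store_loader import VectorStoreLoader
from kiln_ai.datamodel.project import Project
from kiln_ai.datamodel.rag import RagConfig


async def count_nodes(project_path: str, rag_config_path: str) -> int:
    project = Project.load_from_file(project_path)
    rag_config = RagConfig.load_from_file(rag_config_path)
    loader = VectorStoreLoader(project=project, rag_config=rag_config)
    total = 0
    # Each yielded batch is a List[TextNode], at most batch_size long,
    # ready to hand to a vector store writer.
    async for batch in loader.iter_llama_index_nodes(batch_size=100):
        total += len(batch)
    return total


# asyncio.run(count_nodes("project.kiln", "rag_config.kiln"))  # paths are illustrative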
kiln_ai/datamodel/basemodel.py
CHANGED
@@ -9,7 +9,7 @@ from abc import ABCMeta
 from builtins import classmethod
 from datetime import datetime
 from pathlib import Path
-from typing import Any, Callable, Dict, List, Optional, Type, TypeVar
+from typing import Any, Callable, Dict, List, Optional, Set, Type, TypeVar
 
 from pydantic import (
     BaseModel,
@@ -78,9 +78,9 @@ def string_to_valid_name(name: str) -> str:
     # https://docs.python.org/3/library/unicodedata.html#unicodedata.normalize
     valid_name = unicodedata.normalize("NFKD", name)
     # Replace any forbidden chars with an underscore
-    valid_name = re.sub(FORBIDDEN_CHARS_REGEX, "_", valid_name)
+    valid_name = re.sub(FORBIDDEN_CHARS_REGEX, " ", valid_name)
     # Replace control characters with an underscore
-    valid_name = re.sub(r"[\x00-\x1F]", "_", valid_name)
+    valid_name = re.sub(r"[\x00-\x1F]", " ", valid_name)
     # Replace consecutive whitespace with a single space
     valid_name = re.sub(r"\s+", " ", valid_name)
     # Replace consecutive underscores with a single underscore
@@ -594,6 +594,34 @@ class KilnParentedModel(KilnBaseModel, metaclass=ABCMeta):
             return child
         return None
 
+    @classmethod
+    def from_ids_and_parent_path(
+        cls: Type[PT], ids: Set[str], parent_path: Path | None
+    ) -> Dict[str, PT]:
+        """
+        Bulk equivalent of from_id_and_parent_path, much faster for large collections.
+
+        It picks out the matching models from the directory only once. This avoids
+        doing individual costly lookups that scan the whole directory in scenarios
+        where we need to iterate over a large collection of models (e.g. bulk tagging).
+        """
+        if parent_path is None:
+            return {}
+
+        children = {}
+
+        # Note: we're using the in-file ID. We could make this faster using the path-ID if this becomes perf bottleneck, but it's better to have 1 source of truth.
+        for child_path in cls.iterate_children_paths_of_parent_path(parent_path):
+            child_id = ModelCache.shared().get_model_id(child_path, cls)
+            if child_id in ids:
+                children[child_id] = cls.load_from_file(child_path)
+            if child_id is None:
+                child = cls.load_from_file(child_path)
+                if child.id in ids:
+                    children[child.id] = child
+
+        return children
+
 
 # Parent create methods for all child relationships
 # You must pass in parent_of in the subclass definition, defining the child relationships