kiln-ai 0.21.0__py3-none-any.whl → 0.22.1__py3-none-any.whl

This diff shows the content changes between publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.

Potentially problematic release. This version of kiln-ai might be problematic.

Files changed (53)
  1. kiln_ai/adapters/extractors/litellm_extractor.py +52 -32
  2. kiln_ai/adapters/extractors/test_litellm_extractor.py +169 -71
  3. kiln_ai/adapters/ml_embedding_model_list.py +330 -28
  4. kiln_ai/adapters/ml_model_list.py +503 -23
  5. kiln_ai/adapters/model_adapters/litellm_adapter.py +39 -8
  6. kiln_ai/adapters/model_adapters/test_litellm_adapter.py +78 -0
  7. kiln_ai/adapters/model_adapters/test_litellm_adapter_tools.py +119 -5
  8. kiln_ai/adapters/model_adapters/test_saving_adapter_results.py +9 -3
  9. kiln_ai/adapters/model_adapters/test_structured_output.py +6 -9
  10. kiln_ai/adapters/test_ml_embedding_model_list.py +89 -279
  11. kiln_ai/adapters/test_ml_model_list.py +0 -10
  12. kiln_ai/adapters/vector_store/lancedb_adapter.py +24 -70
  13. kiln_ai/adapters/vector_store/lancedb_helpers.py +101 -0
  14. kiln_ai/adapters/vector_store/test_lancedb_adapter.py +9 -16
  15. kiln_ai/adapters/vector_store/test_lancedb_helpers.py +142 -0
  16. kiln_ai/adapters/vector_store_loaders/__init__.py +0 -0
  17. kiln_ai/adapters/vector_store_loaders/test_lancedb_loader.py +282 -0
  18. kiln_ai/adapters/vector_store_loaders/test_vector_store_loader.py +544 -0
  19. kiln_ai/adapters/vector_store_loaders/vector_store_loader.py +91 -0
  20. kiln_ai/datamodel/basemodel.py +31 -3
  21. kiln_ai/datamodel/external_tool_server.py +206 -54
  22. kiln_ai/datamodel/extraction.py +14 -0
  23. kiln_ai/datamodel/task.py +5 -0
  24. kiln_ai/datamodel/task_output.py +41 -11
  25. kiln_ai/datamodel/test_attachment.py +3 -3
  26. kiln_ai/datamodel/test_basemodel.py +269 -13
  27. kiln_ai/datamodel/test_datasource.py +50 -0
  28. kiln_ai/datamodel/test_external_tool_server.py +534 -152
  29. kiln_ai/datamodel/test_extraction_model.py +31 -0
  30. kiln_ai/datamodel/test_task.py +35 -1
  31. kiln_ai/datamodel/test_tool_id.py +106 -1
  32. kiln_ai/datamodel/tool_id.py +49 -0
  33. kiln_ai/tools/base_tool.py +30 -6
  34. kiln_ai/tools/built_in_tools/math_tools.py +12 -4
  35. kiln_ai/tools/kiln_task_tool.py +162 -0
  36. kiln_ai/tools/mcp_server_tool.py +7 -5
  37. kiln_ai/tools/mcp_session_manager.py +50 -24
  38. kiln_ai/tools/rag_tools.py +17 -6
  39. kiln_ai/tools/test_kiln_task_tool.py +527 -0
  40. kiln_ai/tools/test_mcp_server_tool.py +4 -15
  41. kiln_ai/tools/test_mcp_session_manager.py +186 -226
  42. kiln_ai/tools/test_rag_tools.py +86 -5
  43. kiln_ai/tools/test_tool_registry.py +199 -5
  44. kiln_ai/tools/tool_registry.py +49 -17
  45. kiln_ai/utils/filesystem.py +4 -4
  46. kiln_ai/utils/open_ai_types.py +19 -2
  47. kiln_ai/utils/pdf_utils.py +21 -0
  48. kiln_ai/utils/test_open_ai_types.py +88 -12
  49. kiln_ai/utils/test_pdf_utils.py +14 -1
  50. {kiln_ai-0.21.0.dist-info → kiln_ai-0.22.1.dist-info}/METADATA +79 -1
  51. {kiln_ai-0.21.0.dist-info → kiln_ai-0.22.1.dist-info}/RECORD +53 -45
  52. {kiln_ai-0.21.0.dist-info → kiln_ai-0.22.1.dist-info}/WHEEL +0 -0
  53. {kiln_ai-0.21.0.dist-info → kiln_ai-0.22.1.dist-info}/licenses/LICENSE.txt +0 -0
kiln_ai/adapters/vector_store_loaders/test_vector_store_loader.py
@@ -0,0 +1,544 @@
+ import uuid
+ from dataclasses import dataclass
+
+ import pytest
+
+ from kiln_ai.adapters.vector_store_loaders.vector_store_loader import VectorStoreLoader
+ from kiln_ai.datamodel.chunk import Chunk, ChunkedDocument
+ from kiln_ai.datamodel.datamodel_enums import KilnMimeType
+ from kiln_ai.datamodel.embedding import ChunkEmbeddings, Embedding
+ from kiln_ai.datamodel.extraction import (
+     Document,
+     Extraction,
+     ExtractionSource,
+     FileInfo,
+     Kind,
+ )
+ from kiln_ai.datamodel.project import Project
+ from kiln_ai.datamodel.rag import RagConfig
+
+
+ @dataclass
+ class DocWithChunks:
+     document: Document
+     extraction: Extraction
+     chunked_document: ChunkedDocument
+     chunked_embeddings: ChunkEmbeddings
+
+
+ def lorem_ipsum(n: int) -> str:
+     return " ".join(
+         ["Lorem ipsum dolor sit amet, consectetur adipiscing elit." for _ in range(n)]
+     )
+
+
+ @pytest.fixture
+ def mock_chunks_factory(mock_attachment_factory):
+     def fn(
+         project: Project,
+         rag_config: RagConfig,
+         num_chunks: int = 1,
+         text: str | None = None,
+         extractor_config_id: str | None = None,
+         chunker_config_id: str | None = None,
+         embedding_config_id: str | None = None,
+     ) -> DocWithChunks:
+         doc = Document(
+             id=f"doc_{uuid.uuid4()}",
+             name="Test Document",
+             description="Test Document",
+             original_file=FileInfo(
+                 filename="test.pdf",
+                 size=100,
+                 mime_type="application/pdf",
+                 attachment=mock_attachment_factory(KilnMimeType.PDF),
+             ),
+             kind=Kind.DOCUMENT,
+             parent=project,
+         )
+         doc.save_to_file()
+
+         extraction = Extraction(
+             source=ExtractionSource.PROCESSED,
+             extractor_config_id=extractor_config_id or rag_config.extractor_config_id,
+             output=mock_attachment_factory(KilnMimeType.PDF),
+             parent=doc,
+         )
+         extraction.save_to_file()
+
+         chunks = [
+             Chunk(
+                 content=mock_attachment_factory(
+                     KilnMimeType.TXT, text=f"text-{i}: {text or lorem_ipsum(10)}"
+                 )
+             )
+             for i in range(num_chunks)
+         ]
+         chunked_document = ChunkedDocument(
+             chunks=chunks,
+             chunker_config_id=chunker_config_id or rag_config.chunker_config_id,
+             parent=extraction,
+         )
+         chunked_document.save_to_file()
+         chunked_embeddings = ChunkEmbeddings(
+             embeddings=[
+                 Embedding(vector=[i + 0.1, i + 0.2, i + 0.3, i + 0.4, i + 0.5])
+                 for i in range(num_chunks)
+             ],
+             embedding_config_id=embedding_config_id or rag_config.embedding_config_id,
+             parent=chunked_document,
+         )
+         chunked_embeddings.save_to_file()
+         return DocWithChunks(
+             document=doc,
+             extraction=extraction,
+             chunked_document=chunked_document,
+             chunked_embeddings=chunked_embeddings,
+         )
+
+     return fn
+
+
+ @pytest.fixture
+ def mock_project(tmp_path):
+     project = Project(
+         name="Test Project", path=tmp_path / "test_project" / "project.kiln"
+     )
+     project.save_to_file()
+     return project
+
+
+ @pytest.fixture
+ def rag_config_factory(mock_project):
+     def fn(
+         extractor_config_id: str = "test_extractor",
+         chunker_config_id: str = "test_chunker",
+         embedding_config_id: str = "test_embedding",
+     ) -> RagConfig:
+         rag_config = RagConfig(
+             name="Test Rag Config",
+             parent=mock_project,
+             vector_store_config_id="test_vector_store",
+             tool_name="test_tool",
+             tool_description="test_description",
+             extractor_config_id=extractor_config_id,
+             chunker_config_id=chunker_config_id,
+             embedding_config_id=embedding_config_id,
+         )
+         rag_config.save_to_file()
+         return rag_config
+
+     return fn
+
+
+ # Tests for VectorStoreLoader.iter_llama_index_nodes
+
+
+ @pytest.mark.asyncio
+ async def test_iter_llama_index_nodes_single_document(
+     mock_project, mock_chunks_factory, rag_config_factory
+ ):
+     """Test iter_llama_index_nodes with a single document that matches all config IDs."""
+     rag_config = rag_config_factory()
+     loader = VectorStoreLoader(project=mock_project, rag_config=rag_config)
+
+     # Create a document with chunks
+     doc_with_chunks = mock_chunks_factory(
+         mock_project, rag_config, num_chunks=3, text="Test content"
+     )
+
+     # Test iterating through nodes
+     all_nodes = []
+     async for batch in loader.iter_llama_index_nodes():
+         all_nodes.extend(batch)
+
+     assert len(all_nodes) == 3
+     # Check that all nodes have the correct document ID
+     for node in all_nodes:
+         assert node.metadata["kiln_doc_id"] == str(doc_with_chunks.document.id)
+         assert "kiln_chunk_idx" in node.metadata
+         assert "text" in node.text
+
+
+ @pytest.mark.asyncio
+ async def test_iter_llama_index_nodes_multiple_documents(
+     mock_project, mock_chunks_factory, rag_config_factory
+ ):
+     """Test iter_llama_index_nodes with multiple documents."""
+     rag_config = rag_config_factory()
+     loader = VectorStoreLoader(project=mock_project, rag_config=rag_config)
+
+     # Create multiple documents
+     doc1 = mock_chunks_factory(mock_project, rag_config, num_chunks=2, text="Doc 1")
+     doc2 = mock_chunks_factory(mock_project, rag_config, num_chunks=3, text="Doc 2")
+     doc3 = mock_chunks_factory(mock_project, rag_config, num_chunks=1, text="Doc 3")
+
+     # Test iterating through nodes
+     all_nodes = []
+     async for batch in loader.iter_llama_index_nodes():
+         all_nodes.extend(batch)
+
+     assert len(all_nodes) == 6  # 2 + 3 + 1 = 6 total chunks
+
+     # Group nodes by document ID
+     nodes_by_doc = {}
+     for node in all_nodes:
+         doc_id = node.metadata["kiln_doc_id"]
+         if doc_id not in nodes_by_doc:
+             nodes_by_doc[doc_id] = []
+         nodes_by_doc[doc_id].append(node)
+
+     # Check that we have nodes from all three documents
+     expected_doc_ids = {
+         str(doc1.document.id),
+         str(doc2.document.id),
+         str(doc3.document.id),
+     }
+     assert set(nodes_by_doc.keys()) == expected_doc_ids
+
+     # Check chunk counts
+     assert len(nodes_by_doc[str(doc1.document.id)]) == 2
+     assert len(nodes_by_doc[str(doc2.document.id)]) == 3
+     assert len(nodes_by_doc[str(doc3.document.id)]) == 1
+
+
+ @pytest.mark.asyncio
+ async def test_iter_llama_index_nodes_filters_by_extractor_config_id(
+     mock_project, mock_chunks_factory, rag_config_factory
+ ):
+     """Test that iter_llama_index_nodes filters by extractor_config_id."""
+     rag_config = rag_config_factory(extractor_config_id="target_extractor")
+     loader = VectorStoreLoader(project=mock_project, rag_config=rag_config)
+
+     # Create documents with different extractor config IDs
+     matching_doc = mock_chunks_factory(
+         mock_project,
+         rag_config,
+         num_chunks=2,
+         text="Matching doc",
+         extractor_config_id="target_extractor",
+     )
+     mock_chunks_factory(
+         mock_project,
+         rag_config,
+         num_chunks=2,
+         text="Non-matching doc",
+         extractor_config_id="other_extractor",
+     )
+
+     # Test iterating through nodes
+     all_nodes = []
+     async for batch in loader.iter_llama_index_nodes():
+         all_nodes.extend(batch)
+
+     assert len(all_nodes) == 2  # Only the matching document's chunks
+     for node in all_nodes:
+         assert node.metadata["kiln_doc_id"] == str(matching_doc.document.id)
+
+
+ @pytest.mark.asyncio
+ async def test_iter_llama_index_nodes_filters_by_chunker_config_id(
+     mock_project, mock_chunks_factory, rag_config_factory
+ ):
+     """Test that iter_llama_index_nodes filters by chunker_config_id."""
+     rag_config = rag_config_factory(chunker_config_id="target_chunker")
+     loader = VectorStoreLoader(project=mock_project, rag_config=rag_config)
+
+     # Create documents with different chunker config IDs
+     matching_doc = mock_chunks_factory(
+         mock_project,
+         rag_config,
+         num_chunks=2,
+         text="Matching doc",
+         chunker_config_id="target_chunker",
+     )
+     mock_chunks_factory(
+         mock_project,
+         rag_config,
+         num_chunks=2,
+         text="Non-matching doc",
+         chunker_config_id="other_chunker",
+     )
+
+     # Test iterating through nodes
+     all_nodes = []
+     async for batch in loader.iter_llama_index_nodes():
+         all_nodes.extend(batch)
+
+     assert len(all_nodes) == 2  # Only the matching document's chunks
+     for node in all_nodes:
+         assert node.metadata["kiln_doc_id"] == str(matching_doc.document.id)
+
+
+ @pytest.mark.asyncio
+ async def test_iter_llama_index_nodes_filters_by_embedding_config_id(
+     mock_project, mock_chunks_factory, rag_config_factory
+ ):
+     """Test that iter_llama_index_nodes filters by embedding_config_id."""
+     rag_config = rag_config_factory(embedding_config_id="target_embedding")
+     loader = VectorStoreLoader(project=mock_project, rag_config=rag_config)
+
+     # Create documents with different embedding config IDs
+     matching_doc = mock_chunks_factory(
+         mock_project,
+         rag_config,
+         num_chunks=2,
+         text="Matching doc",
+         embedding_config_id="target_embedding",
+     )
+     mock_chunks_factory(
+         mock_project,
+         rag_config,
+         num_chunks=2,
+         text="Non-matching doc",
+         embedding_config_id="other_embedding",
+     )
+
+     # Test iterating through nodes
+     all_nodes = []
+     async for batch in loader.iter_llama_index_nodes():
+         all_nodes.extend(batch)
+
+     assert len(all_nodes) == 2  # Only the matching document's chunks
+     for node in all_nodes:
+         assert node.metadata["kiln_doc_id"] == str(matching_doc.document.id)
+
+
+ @pytest.mark.asyncio
+ async def test_iter_llama_index_nodes_filters_by_all_config_ids(
+     mock_project, mock_chunks_factory, rag_config_factory
+ ):
+     """Test that iter_llama_index_nodes filters by all config IDs simultaneously."""
+     rag_config = rag_config_factory(
+         extractor_config_id="target_extractor",
+         chunker_config_id="target_chunker",
+         embedding_config_id="target_embedding",
+     )
+     loader = VectorStoreLoader(project=mock_project, rag_config=rag_config)
+
+     # Create documents with different combinations of config IDs
+     fully_matching_doc = mock_chunks_factory(
+         mock_project,
+         rag_config,
+         num_chunks=2,
+         text="Fully matching doc",
+         extractor_config_id="target_extractor",
+         chunker_config_id="target_chunker",
+         embedding_config_id="target_embedding",
+     )
+     mock_chunks_factory(
+         mock_project,
+         rag_config,
+         num_chunks=2,
+         text="Partially matching doc",
+         extractor_config_id="target_extractor",
+         chunker_config_id="other_chunker",  # Different chunker
+         embedding_config_id="target_embedding",
+     )
+     mock_chunks_factory(
+         mock_project,
+         rag_config,
+         num_chunks=2,
+         text="Non-matching doc",
+         extractor_config_id="other_extractor",
+         chunker_config_id="other_chunker",
+         embedding_config_id="other_embedding",
+     )
+
+     # Test iterating through nodes
+     all_nodes = []
+     async for batch in loader.iter_llama_index_nodes():
+         all_nodes.extend(batch)
+
+     assert len(all_nodes) == 2  # Only the fully matching document's chunks
+     for node in all_nodes:
+         assert node.metadata["kiln_doc_id"] == str(fully_matching_doc.document.id)
+
+
+ @pytest.mark.asyncio
+ async def test_iter_llama_index_nodes_empty_project(mock_project, rag_config_factory):
+     """Test iter_llama_index_nodes with an empty project."""
+     rag_config = rag_config_factory()
+     loader = VectorStoreLoader(project=mock_project, rag_config=rag_config)
+
+     # Test iterating through nodes
+     all_nodes = []
+     async for batch in loader.iter_llama_index_nodes():
+         all_nodes.extend(batch)
+
+     assert len(all_nodes) == 0
+
+
+ @pytest.mark.asyncio
+ async def test_iter_llama_index_nodes_batch_size(
+     mock_project, mock_chunks_factory, rag_config_factory
+ ):
+     """Test that iter_llama_index_nodes respects batch_size parameter."""
+     rag_config = rag_config_factory()
+     loader = VectorStoreLoader(project=mock_project, rag_config=rag_config)
+
+     # Create a document with many chunks
+     mock_chunks_factory(mock_project, rag_config, num_chunks=5, text="Test content")
+
+     # Test with small batch size
+     batch_size = 2
+     batches = []
+     async for batch in loader.iter_llama_index_nodes(batch_size=batch_size):
+         batches.append(batch)
+         assert len(batch) <= batch_size
+
+     # Should have 3 batches: [2, 2, 1] chunks
+     assert len(batches) == 3
+     assert len(batches[0]) == 2
+     assert len(batches[1]) == 2
+     assert len(batches[2]) == 1
+
+
+ @pytest.mark.asyncio
+ async def test_iter_llama_index_nodes_chunk_text_embedding_mismatch(
+     mock_project, mock_chunks_factory, rag_config_factory, mock_attachment_factory
+ ):
+     """Test that iter_llama_index_nodes raises error on chunk text/embedding count mismatch."""
+     rag_config = rag_config_factory()
+     loader = VectorStoreLoader(project=mock_project, rag_config=rag_config)
+
+     # Create a document with mismatched chunk text and embeddings
+     doc = Document(
+         id=f"doc_{uuid.uuid4()}",
+         name="Test Document",
+         description="Test Document",
+         original_file=FileInfo(
+             filename="test.pdf",
+             size=100,
+             mime_type="application/pdf",
+             attachment=mock_attachment_factory(KilnMimeType.PDF),
+         ),
+         kind=Kind.DOCUMENT,
+         parent=mock_project,
+     )
+     doc.save_to_file()
+
+     extraction = Extraction(
+         source=ExtractionSource.PROCESSED,
+         extractor_config_id=rag_config.extractor_config_id,
+         output=mock_attachment_factory(KilnMimeType.PDF),
+         parent=doc,
+     )
+     extraction.save_to_file()
+
+     # Create 2 chunks but only 1 embedding
+     chunks = [
+         Chunk(content=mock_attachment_factory(KilnMimeType.TXT, text=f"chunk-{i}"))
+         for i in range(2)
+     ]
+     chunked_document = ChunkedDocument(
+         chunks=chunks,
+         chunker_config_id=rag_config.chunker_config_id,
+         parent=extraction,
+     )
+     chunked_document.save_to_file()
+
+     # Only 1 embedding for 2 chunks
+     chunked_embeddings = ChunkEmbeddings(
+         embeddings=[Embedding(vector=[0.1, 0.2, 0.3])],  # Only 1 embedding
+         embedding_config_id=rag_config.embedding_config_id,
+         parent=chunked_document,
+     )
+     chunked_embeddings.save_to_file()
+
+     # Test that it raises an error
+     with pytest.raises(ValueError, match="Chunk text/embedding count mismatch"):
+         async for batch in loader.iter_llama_index_nodes():
+             pass
+
+
+ @pytest.mark.asyncio
+ async def test_iter_llama_index_nodes_multiple_extractions_per_document(
+     mock_project, mock_chunks_factory, rag_config_factory, mock_attachment_factory
+ ):
+     """Test iter_llama_index_nodes with multiple extractions per document."""
+     rag_config = rag_config_factory()
+     loader = VectorStoreLoader(project=mock_project, rag_config=rag_config)
+
+     # Create a document
+     doc = Document(
+         id=f"doc_{uuid.uuid4()}",
+         name="Test Document",
+         description="Test Document",
+         original_file=FileInfo(
+             filename="test.pdf",
+             size=100,
+             mime_type="application/pdf",
+             attachment=mock_attachment_factory(KilnMimeType.PDF),
+         ),
+         kind=Kind.DOCUMENT,
+         parent=mock_project,
+     )
+     doc.save_to_file()
+
+     # Create multiple extractions for the same document
+     extraction1 = Extraction(
+         source=ExtractionSource.PROCESSED,
+         extractor_config_id=rag_config.extractor_config_id,
+         output=mock_attachment_factory(KilnMimeType.PDF),
+         parent=doc,
+     )
+     extraction1.save_to_file()
+
+     extraction2 = Extraction(
+         source=ExtractionSource.PROCESSED,
+         extractor_config_id="other_extractor",  # Different extractor
+         output=mock_attachment_factory(KilnMimeType.PDF),
+         parent=doc,
+     )
+     extraction2.save_to_file()
+
+     # Create chunked documents and embeddings for each extraction
+     chunks1 = [
+         Chunk(content=mock_attachment_factory(KilnMimeType.TXT, text=f"chunk1-{i}"))
+         for i in range(2)
+     ]
+     chunked_doc1 = ChunkedDocument(
+         chunks=chunks1,
+         chunker_config_id=rag_config.chunker_config_id,
+         parent=extraction1,
+     )
+     chunked_doc1.save_to_file()
+
+     chunks2 = [
+         Chunk(content=mock_attachment_factory(KilnMimeType.TXT, text=f"chunk2-{i}"))
+         for i in range(3)
+     ]
+     chunked_doc2 = ChunkedDocument(
+         chunks=chunks2,
+         chunker_config_id=rag_config.chunker_config_id,
+         parent=extraction2,
+     )
+     chunked_doc2.save_to_file()
+
+     # Create embeddings for each chunked document
+     embeddings1 = ChunkEmbeddings(
+         embeddings=[Embedding(vector=[0.1, 0.2, 0.3]) for _ in range(2)],
+         embedding_config_id=rag_config.embedding_config_id,
+         parent=chunked_doc1,
+     )
+     embeddings1.save_to_file()
+
+     embeddings2 = ChunkEmbeddings(
+         embeddings=[Embedding(vector=[0.4, 0.5, 0.6]) for _ in range(3)],
+         embedding_config_id=rag_config.embedding_config_id,
+         parent=chunked_doc2,
+     )
+     embeddings2.save_to_file()
+
+     # Test iterating through nodes
+     all_nodes = []
+     async for batch in loader.iter_llama_index_nodes():
+         all_nodes.extend(batch)
+
+     # Should only return nodes from the first extraction since the second has a different extractor_config_id
+     assert len(all_nodes) == 2
+     for node in all_nodes:
+         assert node.metadata["kiln_doc_id"] == str(doc.id)
+         # All nodes should have chunk indices 0 and 1 (from the first extraction)
+         assert node.metadata["kiln_chunk_idx"] in [0, 1]
kiln_ai/adapters/vector_store_loaders/vector_store_loader.py
@@ -0,0 +1,91 @@
+ import asyncio
+ from typing import AsyncGenerator, List
+
+ from llama_index.core.schema import TextNode
+
+ from kiln_ai.adapters.rag.deduplication import (
+     deduplicate_chunk_embeddings,
+     deduplicate_chunked_documents,
+     deduplicate_extractions,
+ )
+ from kiln_ai.adapters.vector_store.lancedb_helpers import (
+     convert_to_llama_index_node,
+     deterministic_chunk_id,
+ )
+ from kiln_ai.datamodel.project import Project
+ from kiln_ai.datamodel.rag import RagConfig
+
+
+ class VectorStoreLoader:
+     """
+     Class for loading data as LlamaIndex Nodes.
+     """
+
+     def __init__(
+         self,
+         project: Project,
+         rag_config: RagConfig,
+     ):
+         self.project = project
+         self.rag_config = rag_config
+
+     async def iter_llama_index_nodes(
+         self, batch_size: int = 100
+     ) -> AsyncGenerator[List[TextNode], None]:
+         """Returns a generator of documents with their corresponding chunks and embeddings."""
+         batch: List[TextNode] = []
+         for document in self.project.documents():
+             await asyncio.sleep(0)
+             for extraction in deduplicate_extractions(document.extractions()):
+                 if (
+                     extraction.extractor_config_id
+                     != self.rag_config.extractor_config_id
+                 ):
+                     continue
+                 for chunked_document in deduplicate_chunked_documents(
+                     extraction.chunked_documents()
+                 ):
+                     if (
+                         chunked_document.chunker_config_id
+                         != self.rag_config.chunker_config_id
+                     ):
+                         continue
+                     for chunk_embeddings in deduplicate_chunk_embeddings(
+                         chunked_document.chunk_embeddings()
+                     ):
+                         if (
+                             chunk_embeddings.embedding_config_id
+                             != self.rag_config.embedding_config_id
+                         ):
+                             continue
+
+                         document_id = str(document.id)
+                         chunks_text = await chunked_document.load_chunks_text()
+                         embeddings = chunk_embeddings.embeddings
+                         if len(chunks_text) != len(embeddings):
+                             raise ValueError(
+                                 f"Chunk text/embedding count mismatch for document {document_id}: "
+                                 f"{len(chunks_text)} texts vs {len(embeddings)} embeddings"
+                             )
+
+                         for chunk_idx, (chunk_text, chunk_embeddings) in enumerate(
+                             zip(chunks_text, embeddings)
+                         ):
+                             batch.append(
+                                 convert_to_llama_index_node(
+                                     document_id=document_id,
+                                     chunk_idx=chunk_idx,
+                                     node_id=deterministic_chunk_id(
+                                         document_id, chunk_idx
+                                     ),
+                                     text=chunk_text,
+                                     vector=chunk_embeddings.vector,
+                                 )
+                             )
+
+                             if len(batch) >= batch_size:
+                                 yield batch
+                                 batch = []
+
+         if batch:
+             yield batch
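
For orientation, a consumer of the new loader constructs it from a persisted Project and RagConfig, then drains the async generator, exactly as the tests above do. A minimal sketch (the project and rag_config arguments are hypothetical stand-ins for real persisted models):

    from kiln_ai.adapters.vector_store_loaders.vector_store_loader import VectorStoreLoader

    async def collect_all_nodes(project, rag_config):
        # Only chunks whose extractor, chunker, and embedding config IDs match
        # rag_config are yielded, in batches of at most batch_size nodes.
        loader = VectorStoreLoader(project=project, rag_config=rag_config)
        all_nodes = []
        async for batch in loader.iter_llama_index_nodes(batch_size=100):
            all_nodes.extend(batch)
        return all_nodes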
kiln_ai/datamodel/basemodel.py
@@ -9,7 +9,7 @@ from abc import ABCMeta
  from builtins import classmethod
  from datetime import datetime
  from pathlib import Path
- from typing import Any, Callable, Dict, List, Optional, Type, TypeVar
+ from typing import Any, Callable, Dict, List, Optional, Set, Type, TypeVar

  from pydantic import (
      BaseModel,
@@ -78,9 +78,9 @@ def string_to_valid_name(name: str) -> str:
      # https://docs.python.org/3/library/unicodedata.html#unicodedata.normalize
      valid_name = unicodedata.normalize("NFKD", name)
      # Replace any forbidden chars with an underscore
-     valid_name = re.sub(FORBIDDEN_CHARS_REGEX, "_", valid_name)
+     valid_name = re.sub(FORBIDDEN_CHARS_REGEX, " ", valid_name)
      # Replace control characters with an underscore
-     valid_name = re.sub(r"[\x00-\x1F]", "_", valid_name)
+     valid_name = re.sub(r"[\x00-\x1F]", " ", valid_name)
      # Replace consecutive whitespace with a single space
      valid_name = re.sub(r"\s+", " ", valid_name)
      # Replace consecutive underscores with a single underscore
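
Net effect of this hunk: forbidden and control characters are now replaced with spaces, which the subsequent \s+ pass collapses, rather than with underscores. An illustrative sketch (assuming FORBIDDEN_CHARS_REGEX matches a character such as "/"; the inputs are hypothetical):

    # 0.21.0: string_to_valid_name("a/b") -> "a_b"
    # 0.22.1: string_to_valid_name("a/b") -> "a b"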
@@ -594,6 +594,34 @@ class KilnParentedModel(KilnBaseModel, metaclass=ABCMeta):
                  return child
          return None

+     @classmethod
+     def from_ids_and_parent_path(
+         cls: Type[PT], ids: Set[str], parent_path: Path | None
+     ) -> Dict[str, PT]:
+         """
+         Bulk equivalent of from_id_and_parent_path, much faster for large collections.
+
+         It picks out the matching models from the directory only once. This avoids
+         doing individual costly lookups that scan the whole directory in scenarios
+         where we need to iterate over a large collection of models (e.g. bulk tagging).
+         """
+         if parent_path is None:
+             return {}
+
+         children = {}
+
+         # Note: we're using the in-file ID. We could make this faster using the path-ID if this becomes perf bottleneck, but it's better to have 1 source of truth.
+         for child_path in cls.iterate_children_paths_of_parent_path(parent_path):
+             child_id = ModelCache.shared().get_model_id(child_path, cls)
+             if child_id in ids:
+                 children[child_id] = cls.load_from_file(child_path)
+             if child_id is None:
+                 child = cls.load_from_file(child_path)
+                 if child.id in ids:
+                     children[child.id] = child
+
+         return children
+

  # Parent create methods for all child relationships
  # You must pass in parent_of in the subclass definition, defining the child relationships
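
Usage-wise, the new bulk lookup replaces N per-ID directory scans with a single pass over the parent's children. A hedged sketch (the TaskRun subclass and parent_task.path wiring are assumptions for illustration; any KilnParentedModel subclass with a known parent path would work the same way):

    # Assumed: parent_task is a loaded parent model (e.g. a Task)
    ids = {"run_1", "run_2", "run_3"}  # hypothetical child IDs
    runs_by_id = TaskRun.from_ids_and_parent_path(ids, parent_task.path)
    for run_id, run in runs_by_id.items():
        ...  # e.g. bulk tagging, the motivating scenario in the docstring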