kiln-ai 0.20.1__py3-none-any.whl → 0.22.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of kiln-ai might be problematic. Click here for more details.
- kiln_ai/adapters/__init__.py +6 -0
- kiln_ai/adapters/adapter_registry.py +43 -226
- kiln_ai/adapters/chunkers/__init__.py +13 -0
- kiln_ai/adapters/chunkers/base_chunker.py +42 -0
- kiln_ai/adapters/chunkers/chunker_registry.py +16 -0
- kiln_ai/adapters/chunkers/fixed_window_chunker.py +39 -0
- kiln_ai/adapters/chunkers/helpers.py +23 -0
- kiln_ai/adapters/chunkers/test_base_chunker.py +63 -0
- kiln_ai/adapters/chunkers/test_chunker_registry.py +28 -0
- kiln_ai/adapters/chunkers/test_fixed_window_chunker.py +346 -0
- kiln_ai/adapters/chunkers/test_helpers.py +75 -0
- kiln_ai/adapters/data_gen/test_data_gen_task.py +9 -3
- kiln_ai/adapters/embedding/__init__.py +0 -0
- kiln_ai/adapters/embedding/base_embedding_adapter.py +44 -0
- kiln_ai/adapters/embedding/embedding_registry.py +32 -0
- kiln_ai/adapters/embedding/litellm_embedding_adapter.py +199 -0
- kiln_ai/adapters/embedding/test_base_embedding_adapter.py +283 -0
- kiln_ai/adapters/embedding/test_embedding_registry.py +166 -0
- kiln_ai/adapters/embedding/test_litellm_embedding_adapter.py +1149 -0
- kiln_ai/adapters/eval/eval_runner.py +6 -2
- kiln_ai/adapters/eval/test_base_eval.py +1 -3
- kiln_ai/adapters/eval/test_g_eval.py +1 -1
- kiln_ai/adapters/extractors/__init__.py +18 -0
- kiln_ai/adapters/extractors/base_extractor.py +72 -0
- kiln_ai/adapters/extractors/encoding.py +20 -0
- kiln_ai/adapters/extractors/extractor_registry.py +44 -0
- kiln_ai/adapters/extractors/extractor_runner.py +112 -0
- kiln_ai/adapters/extractors/litellm_extractor.py +406 -0
- kiln_ai/adapters/extractors/test_base_extractor.py +244 -0
- kiln_ai/adapters/extractors/test_encoding.py +54 -0
- kiln_ai/adapters/extractors/test_extractor_registry.py +181 -0
- kiln_ai/adapters/extractors/test_extractor_runner.py +181 -0
- kiln_ai/adapters/extractors/test_litellm_extractor.py +1290 -0
- kiln_ai/adapters/fine_tune/test_dataset_formatter.py +2 -2
- kiln_ai/adapters/fine_tune/test_fireworks_tinetune.py +2 -6
- kiln_ai/adapters/fine_tune/test_together_finetune.py +2 -6
- kiln_ai/adapters/ml_embedding_model_list.py +494 -0
- kiln_ai/adapters/ml_model_list.py +876 -18
- kiln_ai/adapters/model_adapters/litellm_adapter.py +40 -75
- kiln_ai/adapters/model_adapters/test_litellm_adapter.py +79 -1
- kiln_ai/adapters/model_adapters/test_litellm_adapter_tools.py +119 -5
- kiln_ai/adapters/model_adapters/test_saving_adapter_results.py +9 -3
- kiln_ai/adapters/model_adapters/test_structured_output.py +9 -10
- kiln_ai/adapters/ollama_tools.py +69 -12
- kiln_ai/adapters/provider_tools.py +190 -46
- kiln_ai/adapters/rag/deduplication.py +49 -0
- kiln_ai/adapters/rag/progress.py +252 -0
- kiln_ai/adapters/rag/rag_runners.py +844 -0
- kiln_ai/adapters/rag/test_deduplication.py +195 -0
- kiln_ai/adapters/rag/test_progress.py +785 -0
- kiln_ai/adapters/rag/test_rag_runners.py +2376 -0
- kiln_ai/adapters/remote_config.py +80 -8
- kiln_ai/adapters/test_adapter_registry.py +579 -86
- kiln_ai/adapters/test_ml_embedding_model_list.py +239 -0
- kiln_ai/adapters/test_ml_model_list.py +202 -0
- kiln_ai/adapters/test_ollama_tools.py +340 -1
- kiln_ai/adapters/test_prompt_builders.py +1 -1
- kiln_ai/adapters/test_provider_tools.py +199 -8
- kiln_ai/adapters/test_remote_config.py +551 -56
- kiln_ai/adapters/vector_store/__init__.py +1 -0
- kiln_ai/adapters/vector_store/base_vector_store_adapter.py +83 -0
- kiln_ai/adapters/vector_store/lancedb_adapter.py +389 -0
- kiln_ai/adapters/vector_store/test_base_vector_store.py +160 -0
- kiln_ai/adapters/vector_store/test_lancedb_adapter.py +1841 -0
- kiln_ai/adapters/vector_store/test_vector_store_registry.py +199 -0
- kiln_ai/adapters/vector_store/vector_store_registry.py +33 -0
- kiln_ai/datamodel/__init__.py +16 -13
- kiln_ai/datamodel/basemodel.py +201 -4
- kiln_ai/datamodel/chunk.py +158 -0
- kiln_ai/datamodel/datamodel_enums.py +27 -0
- kiln_ai/datamodel/embedding.py +64 -0
- kiln_ai/datamodel/external_tool_server.py +206 -54
- kiln_ai/datamodel/extraction.py +317 -0
- kiln_ai/datamodel/project.py +33 -1
- kiln_ai/datamodel/rag.py +79 -0
- kiln_ai/datamodel/task.py +5 -0
- kiln_ai/datamodel/task_output.py +41 -11
- kiln_ai/datamodel/test_attachment.py +649 -0
- kiln_ai/datamodel/test_basemodel.py +270 -14
- kiln_ai/datamodel/test_chunk_models.py +317 -0
- kiln_ai/datamodel/test_dataset_split.py +1 -1
- kiln_ai/datamodel/test_datasource.py +50 -0
- kiln_ai/datamodel/test_embedding_models.py +448 -0
- kiln_ai/datamodel/test_eval_model.py +6 -6
- kiln_ai/datamodel/test_external_tool_server.py +534 -152
- kiln_ai/datamodel/test_extraction_chunk.py +206 -0
- kiln_ai/datamodel/test_extraction_model.py +501 -0
- kiln_ai/datamodel/test_rag.py +641 -0
- kiln_ai/datamodel/test_task.py +35 -1
- kiln_ai/datamodel/test_tool_id.py +187 -1
- kiln_ai/datamodel/test_vector_store.py +320 -0
- kiln_ai/datamodel/tool_id.py +58 -0
- kiln_ai/datamodel/vector_store.py +141 -0
- kiln_ai/tools/base_tool.py +12 -3
- kiln_ai/tools/built_in_tools/math_tools.py +12 -4
- kiln_ai/tools/kiln_task_tool.py +158 -0
- kiln_ai/tools/mcp_server_tool.py +2 -2
- kiln_ai/tools/mcp_session_manager.py +51 -22
- kiln_ai/tools/rag_tools.py +164 -0
- kiln_ai/tools/test_kiln_task_tool.py +527 -0
- kiln_ai/tools/test_mcp_server_tool.py +4 -15
- kiln_ai/tools/test_mcp_session_manager.py +187 -227
- kiln_ai/tools/test_rag_tools.py +929 -0
- kiln_ai/tools/test_tool_registry.py +290 -7
- kiln_ai/tools/tool_registry.py +69 -16
- kiln_ai/utils/__init__.py +3 -0
- kiln_ai/utils/async_job_runner.py +62 -17
- kiln_ai/utils/config.py +2 -2
- kiln_ai/utils/env.py +15 -0
- kiln_ai/utils/filesystem.py +14 -0
- kiln_ai/utils/filesystem_cache.py +60 -0
- kiln_ai/utils/litellm.py +94 -0
- kiln_ai/utils/lock.py +100 -0
- kiln_ai/utils/mime_type.py +38 -0
- kiln_ai/utils/open_ai_types.py +19 -2
- kiln_ai/utils/pdf_utils.py +59 -0
- kiln_ai/utils/test_async_job_runner.py +151 -35
- kiln_ai/utils/test_env.py +142 -0
- kiln_ai/utils/test_filesystem_cache.py +316 -0
- kiln_ai/utils/test_litellm.py +206 -0
- kiln_ai/utils/test_lock.py +185 -0
- kiln_ai/utils/test_mime_type.py +66 -0
- kiln_ai/utils/test_open_ai_types.py +88 -12
- kiln_ai/utils/test_pdf_utils.py +86 -0
- kiln_ai/utils/test_uuid.py +111 -0
- kiln_ai/utils/test_validation.py +524 -0
- kiln_ai/utils/uuid.py +9 -0
- kiln_ai/utils/validation.py +90 -0
- {kiln_ai-0.20.1.dist-info → kiln_ai-0.22.0.dist-info}/METADATA +9 -1
- kiln_ai-0.22.0.dist-info/RECORD +213 -0
- kiln_ai-0.20.1.dist-info/RECORD +0 -138
- {kiln_ai-0.20.1.dist-info → kiln_ai-0.22.0.dist-info}/WHEEL +0 -0
- {kiln_ai-0.20.1.dist-info → kiln_ai-0.22.0.dist-info}/licenses/LICENSE.txt +0 -0
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
from unittest.mock import MagicMock
|
|
2
|
+
|
|
3
|
+
from kiln_ai.adapters.rag.deduplication import (
|
|
4
|
+
deduplicate_chunk_embeddings,
|
|
5
|
+
deduplicate_chunked_documents,
|
|
6
|
+
deduplicate_extractions,
|
|
7
|
+
filter_documents_by_tags,
|
|
8
|
+
)
|
|
9
|
+
from kiln_ai.datamodel.chunk import ChunkedDocument
|
|
10
|
+
from kiln_ai.datamodel.embedding import ChunkEmbeddings
|
|
11
|
+
from kiln_ai.datamodel.extraction import Document, Extraction
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class TestFilterDocumentsByTags:
    """Checks for ``filter_documents_by_tags`` using ``Document``-spec'd mocks."""

    @staticmethod
    def _tagged_document(tags):
        """Return a ``MagicMock(spec=Document)`` whose ``tags`` is set to *tags*."""
        document = MagicMock(spec=Document)
        document.tags = tags
        return document

    def test_filter_documents_by_tags_with_none_tags(self):
        """A ``None`` tag filter hands back every document."""
        documents = [
            self._tagged_document(["tag1", "tag2"]),
            self._tagged_document(["tag3"]),
        ]

        filtered = filter_documents_by_tags(documents, None)

        assert filtered == documents
        assert len(filtered) == 2

    def test_filter_documents_by_tags_with_empty_tags(self):
        """An empty tag filter hands back every document."""
        documents = [
            self._tagged_document(["tag1", "tag2"]),
            self._tagged_document(["tag3"]),
        ]

        filtered = filter_documents_by_tags(documents, [])

        assert filtered == documents
        assert len(filtered) == 2

    def test_filter_documents_by_tags_single_matching_tag(self):
        """Only the documents carrying the single requested tag are kept."""
        documents = [
            self._tagged_document(tags)
            for tags in (["tag1", "tag2"], ["tag3"], ["tag1", "tag4"])
        ]
        first_match, non_match, second_match = documents

        filtered = filter_documents_by_tags(documents, ["tag1"])

        assert len(filtered) == 2
        assert first_match in filtered
        assert second_match in filtered
        assert non_match not in filtered

    def test_filter_documents_by_tags_multiple_matching_tags(self):
        """A document is kept when it carries any one of the requested tags."""
        documents = [
            self._tagged_document(tags)
            for tags in (
                ["tag1", "tag2"],
                ["tag3"],
                ["tag4", "tag5"],
                ["tag2", "tag6"],
            )
        ]
        with_tag1, with_tag3, unrelated, only_tag2 = documents

        filtered = filter_documents_by_tags(documents, ["tag1", "tag3"])

        assert len(filtered) == 2
        assert with_tag1 in filtered  # matched through "tag1"
        assert with_tag3 in filtered  # matched through "tag3"
        assert unrelated not in filtered
        assert only_tag2 not in filtered

    def test_filter_documents_by_tags_no_matching_documents(self):
        """Nothing is returned when no document carries a requested tag."""
        documents = [
            self._tagged_document(["tag1", "tag2"]),
            self._tagged_document(["tag3"]),
        ]

        filtered = filter_documents_by_tags(documents, ["tag4", "tag5"])

        assert len(filtered) == 0

    def test_filter_documents_by_tags_documents_with_no_tags(self):
        """Documents whose tags are ``None`` or empty are left out of the result."""
        documents = [
            self._tagged_document(tags)
            for tags in (["tag1", "tag2"], None, [], ["tag1"])
        ]
        tagged_a, none_tagged, empty_tagged, tagged_b = documents

        filtered = filter_documents_by_tags(documents, ["tag1"])

        assert len(filtered) == 2
        assert tagged_a in filtered
        assert tagged_b in filtered
        assert none_tagged not in filtered  # tags is None
        assert empty_tagged not in filtered  # tags is []

    def test_filter_documents_by_tags_empty_document_list(self):
        """Filtering an empty document list yields an empty result."""
        filtered = filter_documents_by_tags([], ["tag1"])

        assert len(filtered) == 0

    def test_filter_documents_by_tags_case_sensitive(self):
        """Tag comparison distinguishes upper and lower case."""
        capitalised = self._tagged_document(["Tag1", "tag2"])
        lowercase = self._tagged_document(["tag1", "tag3"])

        filtered = filter_documents_by_tags([capitalised, lowercase], ["tag1"])

        assert len(filtered) == 1
        assert lowercase in filtered
        assert capitalised not in filtered  # "Tag1" is not "tag1"

    def test_filter_documents_by_tags_partial_match(self):
        """A requested tag must equal a document tag; a shared prefix is not enough."""
        prefixed_only = self._tagged_document(["tag1", "tag12"])
        exact = self._tagged_document(["tag", "other"])

        filtered = filter_documents_by_tags([prefixed_only, exact], ["tag"])

        assert len(filtered) == 1
        assert exact in filtered
        assert prefixed_only not in filtered  # neither "tag1" nor "tag12" equals "tag"
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
class TestDeduplicationFunctions:
    """Smoke tests confirming the existing deduplication helpers keep working."""

    @staticmethod
    def _mock_record(spec, **attributes):
        """Return a ``MagicMock`` spec'd on *spec* with *attributes* set in order."""
        record = MagicMock(spec=spec)
        for attribute_name, attribute_value in attributes.items():
            setattr(record, attribute_name, attribute_value)
        return record

    def test_deduplicate_extractions_basic(self):
        """Two extractions sharing an extractor config id reduce to one.

        The survivor is expected to be the entry with the earlier
        ``created_at`` (it is also the first item in the input list).
        """
        older = self._mock_record(
            Extraction, extractor_config_id="config1", created_at="2024-01-01"
        )
        newer = self._mock_record(
            Extraction, extractor_config_id="config1", created_at="2024-01-02"
        )

        deduplicated = deduplicate_extractions([older, newer])

        assert len(deduplicated) == 1
        assert deduplicated[0] == older  # earlier created_at

    def test_deduplicate_chunked_documents_basic(self):
        """Two chunked documents sharing a chunker config id reduce to one.

        The survivor is expected to be the entry with the earlier
        ``created_at`` (it is also the first item in the input list).
        """
        older = self._mock_record(
            ChunkedDocument, chunker_config_id="config1", created_at="2024-01-01"
        )
        newer = self._mock_record(
            ChunkedDocument, chunker_config_id="config1", created_at="2024-01-02"
        )

        deduplicated = deduplicate_chunked_documents([older, newer])

        assert len(deduplicated) == 1
        assert deduplicated[0] == older  # earlier created_at

    def test_deduplicate_chunk_embeddings_basic(self):
        """Two chunk embeddings sharing an embedding config id reduce to one.

        The survivor is expected to be the entry with the earlier
        ``created_at`` (it is also the first item in the input list).
        """
        older = self._mock_record(
            ChunkEmbeddings, embedding_config_id="config1", created_at="2024-01-01"
        )
        newer = self._mock_record(
            ChunkEmbeddings, embedding_config_id="config1", created_at="2024-01-02"
        )

        deduplicated = deduplicate_chunk_embeddings([older, newer])

        assert len(deduplicated) == 1
        assert deduplicated[0] == older  # earlier created_at
|