kiln-ai: kiln_ai-0.19.0-py3-none-any.whl → kiln_ai-0.21.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of kiln-ai might be problematic. Click here for more details.

Files changed (158):
  1. kiln_ai/adapters/__init__.py +8 -2
  2. kiln_ai/adapters/adapter_registry.py +43 -208
  3. kiln_ai/adapters/chat/chat_formatter.py +8 -12
  4. kiln_ai/adapters/chat/test_chat_formatter.py +6 -2
  5. kiln_ai/adapters/chunkers/__init__.py +13 -0
  6. kiln_ai/adapters/chunkers/base_chunker.py +42 -0
  7. kiln_ai/adapters/chunkers/chunker_registry.py +16 -0
  8. kiln_ai/adapters/chunkers/fixed_window_chunker.py +39 -0
  9. kiln_ai/adapters/chunkers/helpers.py +23 -0
  10. kiln_ai/adapters/chunkers/test_base_chunker.py +63 -0
  11. kiln_ai/adapters/chunkers/test_chunker_registry.py +28 -0
  12. kiln_ai/adapters/chunkers/test_fixed_window_chunker.py +346 -0
  13. kiln_ai/adapters/chunkers/test_helpers.py +75 -0
  14. kiln_ai/adapters/data_gen/test_data_gen_task.py +9 -3
  15. kiln_ai/adapters/docker_model_runner_tools.py +119 -0
  16. kiln_ai/adapters/embedding/__init__.py +0 -0
  17. kiln_ai/adapters/embedding/base_embedding_adapter.py +44 -0
  18. kiln_ai/adapters/embedding/embedding_registry.py +32 -0
  19. kiln_ai/adapters/embedding/litellm_embedding_adapter.py +199 -0
  20. kiln_ai/adapters/embedding/test_base_embedding_adapter.py +283 -0
  21. kiln_ai/adapters/embedding/test_embedding_registry.py +166 -0
  22. kiln_ai/adapters/embedding/test_litellm_embedding_adapter.py +1149 -0
  23. kiln_ai/adapters/eval/base_eval.py +2 -2
  24. kiln_ai/adapters/eval/eval_runner.py +9 -3
  25. kiln_ai/adapters/eval/g_eval.py +2 -2
  26. kiln_ai/adapters/eval/test_base_eval.py +2 -4
  27. kiln_ai/adapters/eval/test_g_eval.py +4 -5
  28. kiln_ai/adapters/extractors/__init__.py +18 -0
  29. kiln_ai/adapters/extractors/base_extractor.py +72 -0
  30. kiln_ai/adapters/extractors/encoding.py +20 -0
  31. kiln_ai/adapters/extractors/extractor_registry.py +44 -0
  32. kiln_ai/adapters/extractors/extractor_runner.py +112 -0
  33. kiln_ai/adapters/extractors/litellm_extractor.py +386 -0
  34. kiln_ai/adapters/extractors/test_base_extractor.py +244 -0
  35. kiln_ai/adapters/extractors/test_encoding.py +54 -0
  36. kiln_ai/adapters/extractors/test_extractor_registry.py +181 -0
  37. kiln_ai/adapters/extractors/test_extractor_runner.py +181 -0
  38. kiln_ai/adapters/extractors/test_litellm_extractor.py +1192 -0
  39. kiln_ai/adapters/fine_tune/__init__.py +1 -1
  40. kiln_ai/adapters/fine_tune/openai_finetune.py +14 -4
  41. kiln_ai/adapters/fine_tune/test_dataset_formatter.py +2 -2
  42. kiln_ai/adapters/fine_tune/test_fireworks_tinetune.py +2 -6
  43. kiln_ai/adapters/fine_tune/test_openai_finetune.py +108 -111
  44. kiln_ai/adapters/fine_tune/test_together_finetune.py +2 -6
  45. kiln_ai/adapters/ml_embedding_model_list.py +192 -0
  46. kiln_ai/adapters/ml_model_list.py +761 -37
  47. kiln_ai/adapters/model_adapters/base_adapter.py +51 -21
  48. kiln_ai/adapters/model_adapters/litellm_adapter.py +380 -138
  49. kiln_ai/adapters/model_adapters/test_base_adapter.py +193 -17
  50. kiln_ai/adapters/model_adapters/test_litellm_adapter.py +407 -2
  51. kiln_ai/adapters/model_adapters/test_litellm_adapter_tools.py +1103 -0
  52. kiln_ai/adapters/model_adapters/test_saving_adapter_results.py +5 -5
  53. kiln_ai/adapters/model_adapters/test_structured_output.py +113 -5
  54. kiln_ai/adapters/ollama_tools.py +69 -12
  55. kiln_ai/adapters/parsers/__init__.py +1 -1
  56. kiln_ai/adapters/provider_tools.py +205 -47
  57. kiln_ai/adapters/rag/deduplication.py +49 -0
  58. kiln_ai/adapters/rag/progress.py +252 -0
  59. kiln_ai/adapters/rag/rag_runners.py +844 -0
  60. kiln_ai/adapters/rag/test_deduplication.py +195 -0
  61. kiln_ai/adapters/rag/test_progress.py +785 -0
  62. kiln_ai/adapters/rag/test_rag_runners.py +2376 -0
  63. kiln_ai/adapters/remote_config.py +80 -8
  64. kiln_ai/adapters/repair/test_repair_task.py +12 -9
  65. kiln_ai/adapters/run_output.py +3 -0
  66. kiln_ai/adapters/test_adapter_registry.py +657 -85
  67. kiln_ai/adapters/test_docker_model_runner_tools.py +305 -0
  68. kiln_ai/adapters/test_ml_embedding_model_list.py +429 -0
  69. kiln_ai/adapters/test_ml_model_list.py +251 -1
  70. kiln_ai/adapters/test_ollama_tools.py +340 -1
  71. kiln_ai/adapters/test_prompt_adaptors.py +13 -6
  72. kiln_ai/adapters/test_prompt_builders.py +1 -1
  73. kiln_ai/adapters/test_provider_tools.py +254 -8
  74. kiln_ai/adapters/test_remote_config.py +651 -58
  75. kiln_ai/adapters/vector_store/__init__.py +1 -0
  76. kiln_ai/adapters/vector_store/base_vector_store_adapter.py +83 -0
  77. kiln_ai/adapters/vector_store/lancedb_adapter.py +389 -0
  78. kiln_ai/adapters/vector_store/test_base_vector_store.py +160 -0
  79. kiln_ai/adapters/vector_store/test_lancedb_adapter.py +1841 -0
  80. kiln_ai/adapters/vector_store/test_vector_store_registry.py +199 -0
  81. kiln_ai/adapters/vector_store/vector_store_registry.py +33 -0
  82. kiln_ai/datamodel/__init__.py +39 -34
  83. kiln_ai/datamodel/basemodel.py +170 -1
  84. kiln_ai/datamodel/chunk.py +158 -0
  85. kiln_ai/datamodel/datamodel_enums.py +28 -0
  86. kiln_ai/datamodel/embedding.py +64 -0
  87. kiln_ai/datamodel/eval.py +1 -1
  88. kiln_ai/datamodel/external_tool_server.py +298 -0
  89. kiln_ai/datamodel/extraction.py +303 -0
  90. kiln_ai/datamodel/json_schema.py +25 -10
  91. kiln_ai/datamodel/project.py +40 -1
  92. kiln_ai/datamodel/rag.py +79 -0
  93. kiln_ai/datamodel/registry.py +0 -15
  94. kiln_ai/datamodel/run_config.py +62 -0
  95. kiln_ai/datamodel/task.py +2 -77
  96. kiln_ai/datamodel/task_output.py +6 -1
  97. kiln_ai/datamodel/task_run.py +41 -0
  98. kiln_ai/datamodel/test_attachment.py +649 -0
  99. kiln_ai/datamodel/test_basemodel.py +4 -4
  100. kiln_ai/datamodel/test_chunk_models.py +317 -0
  101. kiln_ai/datamodel/test_dataset_split.py +1 -1
  102. kiln_ai/datamodel/test_embedding_models.py +448 -0
  103. kiln_ai/datamodel/test_eval_model.py +6 -6
  104. kiln_ai/datamodel/test_example_models.py +175 -0
  105. kiln_ai/datamodel/test_external_tool_server.py +691 -0
  106. kiln_ai/datamodel/test_extraction_chunk.py +206 -0
  107. kiln_ai/datamodel/test_extraction_model.py +470 -0
  108. kiln_ai/datamodel/test_rag.py +641 -0
  109. kiln_ai/datamodel/test_registry.py +8 -3
  110. kiln_ai/datamodel/test_task.py +15 -47
  111. kiln_ai/datamodel/test_tool_id.py +320 -0
  112. kiln_ai/datamodel/test_vector_store.py +320 -0
  113. kiln_ai/datamodel/tool_id.py +105 -0
  114. kiln_ai/datamodel/vector_store.py +141 -0
  115. kiln_ai/tools/__init__.py +8 -0
  116. kiln_ai/tools/base_tool.py +82 -0
  117. kiln_ai/tools/built_in_tools/__init__.py +13 -0
  118. kiln_ai/tools/built_in_tools/math_tools.py +124 -0
  119. kiln_ai/tools/built_in_tools/test_math_tools.py +204 -0
  120. kiln_ai/tools/mcp_server_tool.py +95 -0
  121. kiln_ai/tools/mcp_session_manager.py +246 -0
  122. kiln_ai/tools/rag_tools.py +157 -0
  123. kiln_ai/tools/test_base_tools.py +199 -0
  124. kiln_ai/tools/test_mcp_server_tool.py +457 -0
  125. kiln_ai/tools/test_mcp_session_manager.py +1585 -0
  126. kiln_ai/tools/test_rag_tools.py +848 -0
  127. kiln_ai/tools/test_tool_registry.py +562 -0
  128. kiln_ai/tools/tool_registry.py +85 -0
  129. kiln_ai/utils/__init__.py +3 -0
  130. kiln_ai/utils/async_job_runner.py +62 -17
  131. kiln_ai/utils/config.py +24 -2
  132. kiln_ai/utils/env.py +15 -0
  133. kiln_ai/utils/filesystem.py +14 -0
  134. kiln_ai/utils/filesystem_cache.py +60 -0
  135. kiln_ai/utils/litellm.py +94 -0
  136. kiln_ai/utils/lock.py +100 -0
  137. kiln_ai/utils/mime_type.py +38 -0
  138. kiln_ai/utils/open_ai_types.py +94 -0
  139. kiln_ai/utils/pdf_utils.py +38 -0
  140. kiln_ai/utils/project_utils.py +17 -0
  141. kiln_ai/utils/test_async_job_runner.py +151 -35
  142. kiln_ai/utils/test_config.py +138 -1
  143. kiln_ai/utils/test_env.py +142 -0
  144. kiln_ai/utils/test_filesystem_cache.py +316 -0
  145. kiln_ai/utils/test_litellm.py +206 -0
  146. kiln_ai/utils/test_lock.py +185 -0
  147. kiln_ai/utils/test_mime_type.py +66 -0
  148. kiln_ai/utils/test_open_ai_types.py +131 -0
  149. kiln_ai/utils/test_pdf_utils.py +73 -0
  150. kiln_ai/utils/test_uuid.py +111 -0
  151. kiln_ai/utils/test_validation.py +524 -0
  152. kiln_ai/utils/uuid.py +9 -0
  153. kiln_ai/utils/validation.py +90 -0
  154. {kiln_ai-0.19.0.dist-info → kiln_ai-0.21.0.dist-info}/METADATA +12 -5
  155. kiln_ai-0.21.0.dist-info/RECORD +211 -0
  156. kiln_ai-0.19.0.dist-info/RECORD +0 -115
  157. {kiln_ai-0.19.0.dist-info → kiln_ai-0.21.0.dist-info}/WHEEL +0 -0
  158. {kiln_ai-0.19.0.dist-info → kiln_ai-0.21.0.dist-info}/licenses/LICENSE.txt +0 -0
@@ -0,0 +1,195 @@
1
+ from unittest.mock import MagicMock
2
+
3
+ from kiln_ai.adapters.rag.deduplication import (
4
+ deduplicate_chunk_embeddings,
5
+ deduplicate_chunked_documents,
6
+ deduplicate_extractions,
7
+ filter_documents_by_tags,
8
+ )
9
+ from kiln_ai.datamodel.chunk import ChunkedDocument
10
+ from kiln_ai.datamodel.embedding import ChunkEmbeddings
11
+ from kiln_ai.datamodel.extraction import Document, Extraction
12
+
13
+
14
class TestFilterDocumentsByTags:
    """Exercise ``filter_documents_by_tags`` against ``Document``-spec'd mocks."""

    @staticmethod
    def _doc(tags):
        """Return a ``MagicMock(spec=Document)`` whose ``tags`` is set to *tags*."""
        mock_doc = MagicMock(spec=Document)
        mock_doc.tags = tags
        return mock_doc

    def test_filter_documents_by_tags_with_none_tags(self):
        """A ``None`` tag filter hands back every document."""
        first = self._doc(["tag1", "tag2"])
        second = self._doc(["tag3"])
        documents = [first, second]

        result = filter_documents_by_tags(documents, None)

        assert result == documents
        assert len(result) == 2

    def test_filter_documents_by_tags_with_empty_tags(self):
        """An empty tag filter hands back every document."""
        first = self._doc(["tag1", "tag2"])
        second = self._doc(["tag3"])
        documents = [first, second]

        result = filter_documents_by_tags(documents, [])

        assert result == documents
        assert len(result) == 2

    def test_filter_documents_by_tags_single_matching_tag(self):
        """One requested tag keeps exactly the documents carrying it."""
        with_tag_a = self._doc(["tag1", "tag2"])
        without_tag = self._doc(["tag3"])
        with_tag_b = self._doc(["tag1", "tag4"])

        result = filter_documents_by_tags(
            [with_tag_a, without_tag, with_tag_b], ["tag1"]
        )

        assert len(result) == 2
        assert with_tag_a in result
        assert with_tag_b in result
        assert without_tag not in result

    def test_filter_documents_by_tags_multiple_matching_tags(self):
        """Several requested tags combine with OR semantics."""
        has_tag1 = self._doc(["tag1", "tag2"])
        has_tag3 = self._doc(["tag3"])
        unrelated_a = self._doc(["tag4", "tag5"])
        unrelated_b = self._doc(["tag2", "tag6"])

        result = filter_documents_by_tags(
            [has_tag1, has_tag3, unrelated_a, unrelated_b], ["tag1", "tag3"]
        )

        assert len(result) == 2
        assert has_tag1 in result  # matched via tag1
        assert has_tag3 in result  # matched via tag3
        assert unrelated_a not in result
        assert unrelated_b not in result

    def test_filter_documents_by_tags_no_matching_documents(self):
        """Requested tags that no document carries give an empty result."""
        first = self._doc(["tag1", "tag2"])
        second = self._doc(["tag3"])

        result = filter_documents_by_tags([first, second], ["tag4", "tag5"])

        assert len(result) == 0

    def test_filter_documents_by_tags_documents_with_no_tags(self):
        """Documents whose tags are ``None`` or empty never match."""
        tagged_a = self._doc(["tag1", "tag2"])
        none_tags = self._doc(None)
        empty_tags = self._doc([])
        tagged_b = self._doc(["tag1"])

        result = filter_documents_by_tags(
            [tagged_a, none_tags, empty_tags, tagged_b], ["tag1"]
        )

        assert len(result) == 2
        assert tagged_a in result
        assert tagged_b in result
        assert none_tags not in result  # tags is None
        assert empty_tags not in result  # tags is []

    def test_filter_documents_by_tags_empty_document_list(self):
        """Filtering an empty document list gives an empty result."""
        result = filter_documents_by_tags([], ["tag1"])

        assert len(result) == 0

    def test_filter_documents_by_tags_case_sensitive(self):
        """Tag comparison distinguishes upper and lower case."""
        capitalised = self._doc(["Tag1", "tag2"])
        lowercase = self._doc(["tag1", "tag3"])

        result = filter_documents_by_tags([capitalised, lowercase], ["tag1"])

        assert len(result) == 1
        assert lowercase in result
        assert capitalised not in result  # "Tag1" is not "tag1"

    def test_filter_documents_by_tags_partial_match(self):
        """A tag matches only when equal in full, never as a prefix."""
        longer_tags = self._doc(["tag1", "tag12"])
        exact_tag = self._doc(["tag", "other"])

        result = filter_documents_by_tags([longer_tags, exact_tag], ["tag"])

        assert len(result) == 1
        assert exact_tag in result
        assert longer_tags not in result  # "tag1"/"tag12" merely start with "tag"
144
+
145
+
146
class TestDeduplicationFunctions:
    """Smoke tests for the three ``deduplicate_*`` helpers.

    Each test builds two mocks that share a config id and differ only in
    ``created_at``, then checks that a single item comes back and that it is
    the one with the earlier ``created_at``.
    """

    def test_deduplicate_extractions_basic(self):
        """Two extractions with one extractor config collapse to the older one."""
        older = MagicMock(spec=Extraction)
        older.extractor_config_id = "config1"
        older.created_at = "2024-01-01"

        newer = MagicMock(spec=Extraction)
        newer.extractor_config_id = "config1"
        newer.created_at = "2024-01-02"

        survivors = deduplicate_extractions([older, newer])

        assert len(survivors) == 1
        assert survivors[0] == older  # the earlier created_at is kept

    def test_deduplicate_chunked_documents_basic(self):
        """Two chunked documents with one chunker config collapse to the older one."""
        older = MagicMock(spec=ChunkedDocument)
        older.chunker_config_id = "config1"
        older.created_at = "2024-01-01"

        newer = MagicMock(spec=ChunkedDocument)
        newer.chunker_config_id = "config1"
        newer.created_at = "2024-01-02"

        survivors = deduplicate_chunked_documents([older, newer])

        assert len(survivors) == 1
        assert survivors[0] == older  # the earlier created_at is kept

    def test_deduplicate_chunk_embeddings_basic(self):
        """Two chunk embeddings with one embedding config collapse to the older one."""
        older = MagicMock(spec=ChunkEmbeddings)
        older.embedding_config_id = "config1"
        older.created_at = "2024-01-01"

        newer = MagicMock(spec=ChunkEmbeddings)
        newer.embedding_config_id = "config1"
        newer.created_at = "2024-01-02"

        survivors = deduplicate_chunk_embeddings([older, newer])

        assert len(survivors) == 1
        assert survivors[0] == older  # the earlier created_at is kept