kiln-ai 0.20.1__py3-none-any.whl → 0.22.0__py3-none-any.whl

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of kiln-ai might be problematic. Click here for more details.

Files changed (133)
  1. kiln_ai/adapters/__init__.py +6 -0
  2. kiln_ai/adapters/adapter_registry.py +43 -226
  3. kiln_ai/adapters/chunkers/__init__.py +13 -0
  4. kiln_ai/adapters/chunkers/base_chunker.py +42 -0
  5. kiln_ai/adapters/chunkers/chunker_registry.py +16 -0
  6. kiln_ai/adapters/chunkers/fixed_window_chunker.py +39 -0
  7. kiln_ai/adapters/chunkers/helpers.py +23 -0
  8. kiln_ai/adapters/chunkers/test_base_chunker.py +63 -0
  9. kiln_ai/adapters/chunkers/test_chunker_registry.py +28 -0
  10. kiln_ai/adapters/chunkers/test_fixed_window_chunker.py +346 -0
  11. kiln_ai/adapters/chunkers/test_helpers.py +75 -0
  12. kiln_ai/adapters/data_gen/test_data_gen_task.py +9 -3
  13. kiln_ai/adapters/embedding/__init__.py +0 -0
  14. kiln_ai/adapters/embedding/base_embedding_adapter.py +44 -0
  15. kiln_ai/adapters/embedding/embedding_registry.py +32 -0
  16. kiln_ai/adapters/embedding/litellm_embedding_adapter.py +199 -0
  17. kiln_ai/adapters/embedding/test_base_embedding_adapter.py +283 -0
  18. kiln_ai/adapters/embedding/test_embedding_registry.py +166 -0
  19. kiln_ai/adapters/embedding/test_litellm_embedding_adapter.py +1149 -0
  20. kiln_ai/adapters/eval/eval_runner.py +6 -2
  21. kiln_ai/adapters/eval/test_base_eval.py +1 -3
  22. kiln_ai/adapters/eval/test_g_eval.py +1 -1
  23. kiln_ai/adapters/extractors/__init__.py +18 -0
  24. kiln_ai/adapters/extractors/base_extractor.py +72 -0
  25. kiln_ai/adapters/extractors/encoding.py +20 -0
  26. kiln_ai/adapters/extractors/extractor_registry.py +44 -0
  27. kiln_ai/adapters/extractors/extractor_runner.py +112 -0
  28. kiln_ai/adapters/extractors/litellm_extractor.py +406 -0
  29. kiln_ai/adapters/extractors/test_base_extractor.py +244 -0
  30. kiln_ai/adapters/extractors/test_encoding.py +54 -0
  31. kiln_ai/adapters/extractors/test_extractor_registry.py +181 -0
  32. kiln_ai/adapters/extractors/test_extractor_runner.py +181 -0
  33. kiln_ai/adapters/extractors/test_litellm_extractor.py +1290 -0
  34. kiln_ai/adapters/fine_tune/test_dataset_formatter.py +2 -2
  35. kiln_ai/adapters/fine_tune/test_fireworks_tinetune.py +2 -6
  36. kiln_ai/adapters/fine_tune/test_together_finetune.py +2 -6
  37. kiln_ai/adapters/ml_embedding_model_list.py +494 -0
  38. kiln_ai/adapters/ml_model_list.py +876 -18
  39. kiln_ai/adapters/model_adapters/litellm_adapter.py +40 -75
  40. kiln_ai/adapters/model_adapters/test_litellm_adapter.py +79 -1
  41. kiln_ai/adapters/model_adapters/test_litellm_adapter_tools.py +119 -5
  42. kiln_ai/adapters/model_adapters/test_saving_adapter_results.py +9 -3
  43. kiln_ai/adapters/model_adapters/test_structured_output.py +9 -10
  44. kiln_ai/adapters/ollama_tools.py +69 -12
  45. kiln_ai/adapters/provider_tools.py +190 -46
  46. kiln_ai/adapters/rag/deduplication.py +49 -0
  47. kiln_ai/adapters/rag/progress.py +252 -0
  48. kiln_ai/adapters/rag/rag_runners.py +844 -0
  49. kiln_ai/adapters/rag/test_deduplication.py +195 -0
  50. kiln_ai/adapters/rag/test_progress.py +785 -0
  51. kiln_ai/adapters/rag/test_rag_runners.py +2376 -0
  52. kiln_ai/adapters/remote_config.py +80 -8
  53. kiln_ai/adapters/test_adapter_registry.py +579 -86
  54. kiln_ai/adapters/test_ml_embedding_model_list.py +239 -0
  55. kiln_ai/adapters/test_ml_model_list.py +202 -0
  56. kiln_ai/adapters/test_ollama_tools.py +340 -1
  57. kiln_ai/adapters/test_prompt_builders.py +1 -1
  58. kiln_ai/adapters/test_provider_tools.py +199 -8
  59. kiln_ai/adapters/test_remote_config.py +551 -56
  60. kiln_ai/adapters/vector_store/__init__.py +1 -0
  61. kiln_ai/adapters/vector_store/base_vector_store_adapter.py +83 -0
  62. kiln_ai/adapters/vector_store/lancedb_adapter.py +389 -0
  63. kiln_ai/adapters/vector_store/test_base_vector_store.py +160 -0
  64. kiln_ai/adapters/vector_store/test_lancedb_adapter.py +1841 -0
  65. kiln_ai/adapters/vector_store/test_vector_store_registry.py +199 -0
  66. kiln_ai/adapters/vector_store/vector_store_registry.py +33 -0
  67. kiln_ai/datamodel/__init__.py +16 -13
  68. kiln_ai/datamodel/basemodel.py +201 -4
  69. kiln_ai/datamodel/chunk.py +158 -0
  70. kiln_ai/datamodel/datamodel_enums.py +27 -0
  71. kiln_ai/datamodel/embedding.py +64 -0
  72. kiln_ai/datamodel/external_tool_server.py +206 -54
  73. kiln_ai/datamodel/extraction.py +317 -0
  74. kiln_ai/datamodel/project.py +33 -1
  75. kiln_ai/datamodel/rag.py +79 -0
  76. kiln_ai/datamodel/task.py +5 -0
  77. kiln_ai/datamodel/task_output.py +41 -11
  78. kiln_ai/datamodel/test_attachment.py +649 -0
  79. kiln_ai/datamodel/test_basemodel.py +270 -14
  80. kiln_ai/datamodel/test_chunk_models.py +317 -0
  81. kiln_ai/datamodel/test_dataset_split.py +1 -1
  82. kiln_ai/datamodel/test_datasource.py +50 -0
  83. kiln_ai/datamodel/test_embedding_models.py +448 -0
  84. kiln_ai/datamodel/test_eval_model.py +6 -6
  85. kiln_ai/datamodel/test_external_tool_server.py +534 -152
  86. kiln_ai/datamodel/test_extraction_chunk.py +206 -0
  87. kiln_ai/datamodel/test_extraction_model.py +501 -0
  88. kiln_ai/datamodel/test_rag.py +641 -0
  89. kiln_ai/datamodel/test_task.py +35 -1
  90. kiln_ai/datamodel/test_tool_id.py +187 -1
  91. kiln_ai/datamodel/test_vector_store.py +320 -0
  92. kiln_ai/datamodel/tool_id.py +58 -0
  93. kiln_ai/datamodel/vector_store.py +141 -0
  94. kiln_ai/tools/base_tool.py +12 -3
  95. kiln_ai/tools/built_in_tools/math_tools.py +12 -4
  96. kiln_ai/tools/kiln_task_tool.py +158 -0
  97. kiln_ai/tools/mcp_server_tool.py +2 -2
  98. kiln_ai/tools/mcp_session_manager.py +51 -22
  99. kiln_ai/tools/rag_tools.py +164 -0
  100. kiln_ai/tools/test_kiln_task_tool.py +527 -0
  101. kiln_ai/tools/test_mcp_server_tool.py +4 -15
  102. kiln_ai/tools/test_mcp_session_manager.py +187 -227
  103. kiln_ai/tools/test_rag_tools.py +929 -0
  104. kiln_ai/tools/test_tool_registry.py +290 -7
  105. kiln_ai/tools/tool_registry.py +69 -16
  106. kiln_ai/utils/__init__.py +3 -0
  107. kiln_ai/utils/async_job_runner.py +62 -17
  108. kiln_ai/utils/config.py +2 -2
  109. kiln_ai/utils/env.py +15 -0
  110. kiln_ai/utils/filesystem.py +14 -0
  111. kiln_ai/utils/filesystem_cache.py +60 -0
  112. kiln_ai/utils/litellm.py +94 -0
  113. kiln_ai/utils/lock.py +100 -0
  114. kiln_ai/utils/mime_type.py +38 -0
  115. kiln_ai/utils/open_ai_types.py +19 -2
  116. kiln_ai/utils/pdf_utils.py +59 -0
  117. kiln_ai/utils/test_async_job_runner.py +151 -35
  118. kiln_ai/utils/test_env.py +142 -0
  119. kiln_ai/utils/test_filesystem_cache.py +316 -0
  120. kiln_ai/utils/test_litellm.py +206 -0
  121. kiln_ai/utils/test_lock.py +185 -0
  122. kiln_ai/utils/test_mime_type.py +66 -0
  123. kiln_ai/utils/test_open_ai_types.py +88 -12
  124. kiln_ai/utils/test_pdf_utils.py +86 -0
  125. kiln_ai/utils/test_uuid.py +111 -0
  126. kiln_ai/utils/test_validation.py +524 -0
  127. kiln_ai/utils/uuid.py +9 -0
  128. kiln_ai/utils/validation.py +90 -0
  129. {kiln_ai-0.20.1.dist-info → kiln_ai-0.22.0.dist-info}/METADATA +9 -1
  130. kiln_ai-0.22.0.dist-info/RECORD +213 -0
  131. kiln_ai-0.20.1.dist-info/RECORD +0 -138
  132. {kiln_ai-0.20.1.dist-info → kiln_ai-0.22.0.dist-info}/WHEEL +0 -0
  133. {kiln_ai-0.20.1.dist-info → kiln_ai-0.22.0.dist-info}/licenses/LICENSE.txt +0 -0
@@ -3,28 +3,53 @@ import json
3
3
  import logging
4
4
  import os
5
5
  import threading
6
+ from dataclasses import dataclass
6
7
  from pathlib import Path
7
8
  from typing import Any, List
8
9
 
9
10
  import requests
10
11
  from pydantic import ValidationError
11
12
 
13
+ from kiln_ai.adapters.ml_embedding_model_list import (
14
+ KilnEmbeddingModel,
15
+ KilnEmbeddingModelProvider,
16
+ built_in_embedding_models,
17
+ )
18
+ from kiln_ai.datamodel.datamodel_enums import KilnMimeType
19
+
12
20
  from .ml_model_list import KilnModel, KilnModelProvider, built_in_models
13
21
 
14
22
  logger = logging.getLogger(__name__)
15
23
 
16
24
 
17
- def serialize_config(models: List[KilnModel], path: str | Path) -> None:
18
- data = {"model_list": [m.model_dump(mode="json") for m in models]}
25
+ @dataclass
26
+ class KilnRemoteConfig:
27
+ model_list: List[KilnModel]
28
+ embedding_model_list: List[KilnEmbeddingModel]
29
+
30
+
31
+ def serialize_config(
32
+ models: List[KilnModel],
33
+ embedding_models: List[KilnEmbeddingModel],
34
+ path: str | Path,
35
+ ) -> None:
36
+ data = {
37
+ "model_list": [m.model_dump(mode="json") for m in models],
38
+ "embedding_model_list": [m.model_dump(mode="json") for m in embedding_models],
39
+ }
19
40
  Path(path).write_text(json.dumps(data, indent=2, sort_keys=True))
20
41
 
21
42
 
22
- def deserialize_config_at_path(path: str | Path) -> List[KilnModel]:
43
+ def deserialize_config_at_path(
44
+ path: str | Path,
45
+ ) -> KilnRemoteConfig:
23
46
  raw = json.loads(Path(path).read_text())
24
47
  return deserialize_config_data(raw)
25
48
 
26
49
 
27
- def deserialize_config_data(config_data: Any) -> List[KilnModel]:
50
+ def deserialize_config_data(
51
+ config_data: Any,
52
+ ) -> KilnRemoteConfig:
28
53
  if not isinstance(config_data, dict):
29
54
  raise ValueError(f"Remote config expected dict, got {type(config_data)}")
30
55
 
@@ -34,6 +59,12 @@ def deserialize_config_data(config_data: Any) -> List[KilnModel]:
34
59
  f"Remote config expected list of models, got {type(model_list)}"
35
60
  )
36
61
 
62
+ embedding_model_data = config_data.get("embedding_model_list", [])
63
+ if not isinstance(embedding_model_data, list):
64
+ raise ValueError(
65
+ f"Remote config expected list of embedding models, got {type(embedding_model_data)}"
66
+ )
67
+
37
68
  # We must be careful here, because some of the JSON data may be generated from a forward
38
69
  # version of the code that has newer fields / versions of the fields, that may cause
39
70
  # the current client this code is running on to fail to validate the item into a KilnModel.
@@ -48,6 +79,14 @@ def deserialize_config_data(config_data: Any) -> List[KilnModel]:
48
79
  providers = []
49
80
  for provider_data in providers_list:
50
81
  try:
82
+ # we filter out the mime types that we don't support
83
+ mime_types = provider_data.get("multimodal_mime_types")
84
+ if mime_types is not None:
85
+ provider_data["multimodal_mime_types"] = [
86
+ mime_type
87
+ for mime_type in mime_types
88
+ if mime_type in list(KilnMimeType)
89
+ ]
51
90
  provider = KilnModelProvider.model_validate(provider_data)
52
91
  providers.append(provider)
53
92
  except ValidationError as e:
@@ -72,10 +111,38 @@ def deserialize_config_data(config_data: Any) -> List[KilnModel]:
72
111
  model_data,
73
112
  e,
74
113
  )
75
- return models
114
+
115
+ embedding_models = []
116
+ for embedding_model_data in embedding_model_data:
117
+ try:
118
+ provider_list = embedding_model_data.get("providers", [])
119
+ providers = []
120
+ for provider_data in provider_list:
121
+ try:
122
+ provider = KilnEmbeddingModelProvider.model_validate(provider_data)
123
+ providers.append(provider)
124
+ except ValidationError as e:
125
+ logger.warning(
126
+ "Failed to validate an embedding model provider from remote config. Upgrade Kiln to use this model. Details %s: %s",
127
+ provider_data,
128
+ e,
129
+ )
130
+
131
+ embedding_model_data["providers"] = []
132
+ embedding_model = KilnEmbeddingModel.model_validate(embedding_model_data)
133
+ embedding_model.providers = providers
134
+ embedding_models.append(embedding_model)
135
+ except ValidationError as e:
136
+ logger.warning(
137
+ "Failed to validate an embedding model from remote config. Upgrade Kiln to use this model. Details %s: %s",
138
+ embedding_model_data,
139
+ e,
140
+ )
141
+
142
+ return KilnRemoteConfig(model_list=models, embedding_model_list=embedding_models)
76
143
 
77
144
 
78
- def load_from_url(url: str) -> List[KilnModel]:
145
+ def load_from_url(url: str) -> KilnRemoteConfig:
79
146
  response = requests.get(url, timeout=10)
80
147
  response.raise_for_status()
81
148
  data = response.json()
@@ -83,7 +150,11 @@ def load_from_url(url: str) -> List[KilnModel]:
83
150
 
84
151
 
85
152
  def dump_builtin_config(path: str | Path) -> None:
86
- serialize_config(built_in_models, path)
153
+ serialize_config(
154
+ models=built_in_models,
155
+ embedding_models=built_in_embedding_models,
156
+ path=path,
157
+ )
87
158
 
88
159
 
89
160
  def load_remote_models(url: str) -> None:
@@ -93,7 +164,8 @@ def load_remote_models(url: str) -> None:
93
164
  def fetch_and_replace() -> None:
94
165
  try:
95
166
  models = load_from_url(url)
96
- built_in_models[:] = models
167
+ built_in_models[:] = models.model_list
168
+ built_in_embedding_models[:] = models.embedding_model_list
97
169
  except Exception as exc:
98
170
  # Do not crash startup, but surface the issue
99
171
  logger.warning("Failed to fetch remote model list from %s: %s", url, exc)