kiln-ai 0.19.0__py3-none-any.whl → 0.21.0__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release.

Files changed (158)
  1. kiln_ai/adapters/__init__.py +8 -2
  2. kiln_ai/adapters/adapter_registry.py +43 -208
  3. kiln_ai/adapters/chat/chat_formatter.py +8 -12
  4. kiln_ai/adapters/chat/test_chat_formatter.py +6 -2
  5. kiln_ai/adapters/chunkers/__init__.py +13 -0
  6. kiln_ai/adapters/chunkers/base_chunker.py +42 -0
  7. kiln_ai/adapters/chunkers/chunker_registry.py +16 -0
  8. kiln_ai/adapters/chunkers/fixed_window_chunker.py +39 -0
  9. kiln_ai/adapters/chunkers/helpers.py +23 -0
  10. kiln_ai/adapters/chunkers/test_base_chunker.py +63 -0
  11. kiln_ai/adapters/chunkers/test_chunker_registry.py +28 -0
  12. kiln_ai/adapters/chunkers/test_fixed_window_chunker.py +346 -0
  13. kiln_ai/adapters/chunkers/test_helpers.py +75 -0
  14. kiln_ai/adapters/data_gen/test_data_gen_task.py +9 -3
  15. kiln_ai/adapters/docker_model_runner_tools.py +119 -0
  16. kiln_ai/adapters/embedding/__init__.py +0 -0
  17. kiln_ai/adapters/embedding/base_embedding_adapter.py +44 -0
  18. kiln_ai/adapters/embedding/embedding_registry.py +32 -0
  19. kiln_ai/adapters/embedding/litellm_embedding_adapter.py +199 -0
  20. kiln_ai/adapters/embedding/test_base_embedding_adapter.py +283 -0
  21. kiln_ai/adapters/embedding/test_embedding_registry.py +166 -0
  22. kiln_ai/adapters/embedding/test_litellm_embedding_adapter.py +1149 -0
  23. kiln_ai/adapters/eval/base_eval.py +2 -2
  24. kiln_ai/adapters/eval/eval_runner.py +9 -3
  25. kiln_ai/adapters/eval/g_eval.py +2 -2
  26. kiln_ai/adapters/eval/test_base_eval.py +2 -4
  27. kiln_ai/adapters/eval/test_g_eval.py +4 -5
  28. kiln_ai/adapters/extractors/__init__.py +18 -0
  29. kiln_ai/adapters/extractors/base_extractor.py +72 -0
  30. kiln_ai/adapters/extractors/encoding.py +20 -0
  31. kiln_ai/adapters/extractors/extractor_registry.py +44 -0
  32. kiln_ai/adapters/extractors/extractor_runner.py +112 -0
  33. kiln_ai/adapters/extractors/litellm_extractor.py +386 -0
  34. kiln_ai/adapters/extractors/test_base_extractor.py +244 -0
  35. kiln_ai/adapters/extractors/test_encoding.py +54 -0
  36. kiln_ai/adapters/extractors/test_extractor_registry.py +181 -0
  37. kiln_ai/adapters/extractors/test_extractor_runner.py +181 -0
  38. kiln_ai/adapters/extractors/test_litellm_extractor.py +1192 -0
  39. kiln_ai/adapters/fine_tune/__init__.py +1 -1
  40. kiln_ai/adapters/fine_tune/openai_finetune.py +14 -4
  41. kiln_ai/adapters/fine_tune/test_dataset_formatter.py +2 -2
  42. kiln_ai/adapters/fine_tune/test_fireworks_finetune.py +2 -6
  43. kiln_ai/adapters/fine_tune/test_openai_finetune.py +108 -111
  44. kiln_ai/adapters/fine_tune/test_together_finetune.py +2 -6
  45. kiln_ai/adapters/ml_embedding_model_list.py +192 -0
  46. kiln_ai/adapters/ml_model_list.py +761 -37
  47. kiln_ai/adapters/model_adapters/base_adapter.py +51 -21
  48. kiln_ai/adapters/model_adapters/litellm_adapter.py +380 -138
  49. kiln_ai/adapters/model_adapters/test_base_adapter.py +193 -17
  50. kiln_ai/adapters/model_adapters/test_litellm_adapter.py +407 -2
  51. kiln_ai/adapters/model_adapters/test_litellm_adapter_tools.py +1103 -0
  52. kiln_ai/adapters/model_adapters/test_saving_adapter_results.py +5 -5
  53. kiln_ai/adapters/model_adapters/test_structured_output.py +113 -5
  54. kiln_ai/adapters/ollama_tools.py +69 -12
  55. kiln_ai/adapters/parsers/__init__.py +1 -1
  56. kiln_ai/adapters/provider_tools.py +205 -47
  57. kiln_ai/adapters/rag/deduplication.py +49 -0
  58. kiln_ai/adapters/rag/progress.py +252 -0
  59. kiln_ai/adapters/rag/rag_runners.py +844 -0
  60. kiln_ai/adapters/rag/test_deduplication.py +195 -0
  61. kiln_ai/adapters/rag/test_progress.py +785 -0
  62. kiln_ai/adapters/rag/test_rag_runners.py +2376 -0
  63. kiln_ai/adapters/remote_config.py +80 -8
  64. kiln_ai/adapters/repair/test_repair_task.py +12 -9
  65. kiln_ai/adapters/run_output.py +3 -0
  66. kiln_ai/adapters/test_adapter_registry.py +657 -85
  67. kiln_ai/adapters/test_docker_model_runner_tools.py +305 -0
  68. kiln_ai/adapters/test_ml_embedding_model_list.py +429 -0
  69. kiln_ai/adapters/test_ml_model_list.py +251 -1
  70. kiln_ai/adapters/test_ollama_tools.py +340 -1
  71. kiln_ai/adapters/test_prompt_adaptors.py +13 -6
  72. kiln_ai/adapters/test_prompt_builders.py +1 -1
  73. kiln_ai/adapters/test_provider_tools.py +254 -8
  74. kiln_ai/adapters/test_remote_config.py +651 -58
  75. kiln_ai/adapters/vector_store/__init__.py +1 -0
  76. kiln_ai/adapters/vector_store/base_vector_store_adapter.py +83 -0
  77. kiln_ai/adapters/vector_store/lancedb_adapter.py +389 -0
  78. kiln_ai/adapters/vector_store/test_base_vector_store.py +160 -0
  79. kiln_ai/adapters/vector_store/test_lancedb_adapter.py +1841 -0
  80. kiln_ai/adapters/vector_store/test_vector_store_registry.py +199 -0
  81. kiln_ai/adapters/vector_store/vector_store_registry.py +33 -0
  82. kiln_ai/datamodel/__init__.py +39 -34
  83. kiln_ai/datamodel/basemodel.py +170 -1
  84. kiln_ai/datamodel/chunk.py +158 -0
  85. kiln_ai/datamodel/datamodel_enums.py +28 -0
  86. kiln_ai/datamodel/embedding.py +64 -0
  87. kiln_ai/datamodel/eval.py +1 -1
  88. kiln_ai/datamodel/external_tool_server.py +298 -0
  89. kiln_ai/datamodel/extraction.py +303 -0
  90. kiln_ai/datamodel/json_schema.py +25 -10
  91. kiln_ai/datamodel/project.py +40 -1
  92. kiln_ai/datamodel/rag.py +79 -0
  93. kiln_ai/datamodel/registry.py +0 -15
  94. kiln_ai/datamodel/run_config.py +62 -0
  95. kiln_ai/datamodel/task.py +2 -77
  96. kiln_ai/datamodel/task_output.py +6 -1
  97. kiln_ai/datamodel/task_run.py +41 -0
  98. kiln_ai/datamodel/test_attachment.py +649 -0
  99. kiln_ai/datamodel/test_basemodel.py +4 -4
  100. kiln_ai/datamodel/test_chunk_models.py +317 -0
  101. kiln_ai/datamodel/test_dataset_split.py +1 -1
  102. kiln_ai/datamodel/test_embedding_models.py +448 -0
  103. kiln_ai/datamodel/test_eval_model.py +6 -6
  104. kiln_ai/datamodel/test_example_models.py +175 -0
  105. kiln_ai/datamodel/test_external_tool_server.py +691 -0
  106. kiln_ai/datamodel/test_extraction_chunk.py +206 -0
  107. kiln_ai/datamodel/test_extraction_model.py +470 -0
  108. kiln_ai/datamodel/test_rag.py +641 -0
  109. kiln_ai/datamodel/test_registry.py +8 -3
  110. kiln_ai/datamodel/test_task.py +15 -47
  111. kiln_ai/datamodel/test_tool_id.py +320 -0
  112. kiln_ai/datamodel/test_vector_store.py +320 -0
  113. kiln_ai/datamodel/tool_id.py +105 -0
  114. kiln_ai/datamodel/vector_store.py +141 -0
  115. kiln_ai/tools/__init__.py +8 -0
  116. kiln_ai/tools/base_tool.py +82 -0
  117. kiln_ai/tools/built_in_tools/__init__.py +13 -0
  118. kiln_ai/tools/built_in_tools/math_tools.py +124 -0
  119. kiln_ai/tools/built_in_tools/test_math_tools.py +204 -0
  120. kiln_ai/tools/mcp_server_tool.py +95 -0
  121. kiln_ai/tools/mcp_session_manager.py +246 -0
  122. kiln_ai/tools/rag_tools.py +157 -0
  123. kiln_ai/tools/test_base_tools.py +199 -0
  124. kiln_ai/tools/test_mcp_server_tool.py +457 -0
  125. kiln_ai/tools/test_mcp_session_manager.py +1585 -0
  126. kiln_ai/tools/test_rag_tools.py +848 -0
  127. kiln_ai/tools/test_tool_registry.py +562 -0
  128. kiln_ai/tools/tool_registry.py +85 -0
  129. kiln_ai/utils/__init__.py +3 -0
  130. kiln_ai/utils/async_job_runner.py +62 -17
  131. kiln_ai/utils/config.py +24 -2
  132. kiln_ai/utils/env.py +15 -0
  133. kiln_ai/utils/filesystem.py +14 -0
  134. kiln_ai/utils/filesystem_cache.py +60 -0
  135. kiln_ai/utils/litellm.py +94 -0
  136. kiln_ai/utils/lock.py +100 -0
  137. kiln_ai/utils/mime_type.py +38 -0
  138. kiln_ai/utils/open_ai_types.py +94 -0
  139. kiln_ai/utils/pdf_utils.py +38 -0
  140. kiln_ai/utils/project_utils.py +17 -0
  141. kiln_ai/utils/test_async_job_runner.py +151 -35
  142. kiln_ai/utils/test_config.py +138 -1
  143. kiln_ai/utils/test_env.py +142 -0
  144. kiln_ai/utils/test_filesystem_cache.py +316 -0
  145. kiln_ai/utils/test_litellm.py +206 -0
  146. kiln_ai/utils/test_lock.py +185 -0
  147. kiln_ai/utils/test_mime_type.py +66 -0
  148. kiln_ai/utils/test_open_ai_types.py +131 -0
  149. kiln_ai/utils/test_pdf_utils.py +73 -0
  150. kiln_ai/utils/test_uuid.py +111 -0
  151. kiln_ai/utils/test_validation.py +524 -0
  152. kiln_ai/utils/uuid.py +9 -0
  153. kiln_ai/utils/validation.py +90 -0
  154. {kiln_ai-0.19.0.dist-info → kiln_ai-0.21.0.dist-info}/METADATA +12 -5
  155. kiln_ai-0.21.0.dist-info/RECORD +211 -0
  156. kiln_ai-0.19.0.dist-info/RECORD +0 -115
  157. {kiln_ai-0.19.0.dist-info → kiln_ai-0.21.0.dist-info}/WHEEL +0 -0
  158. {kiln_ai-0.19.0.dist-info → kiln_ai-0.21.0.dist-info}/licenses/LICENSE.txt +0 -0
kiln_ai/datamodel/extraction.py ADDED
@@ -0,0 +1,303 @@
+ import logging
+ from enum import Enum
+ from typing import TYPE_CHECKING, Any, List, Union
+
+ import anyio
+ from pydantic import (
+     BaseModel,
+     Field,
+     SerializationInfo,
+     ValidationInfo,
+     field_serializer,
+     field_validator,
+     model_validator,
+ )
+ from typing_extensions import Self
+
+ from kiln_ai.datamodel.basemodel import (
+     ID_TYPE,
+     FilenameString,
+     KilnAttachmentModel,
+     KilnParentedModel,
+     KilnParentModel,
+ )
+ from kiln_ai.datamodel.chunk import ChunkedDocument
+
+ logger = logging.getLogger(__name__)
+
+ if TYPE_CHECKING:
+     from kiln_ai.datamodel.project import Project
+
+ logger = logging.getLogger(__name__)
+
+
+ class Kind(str, Enum):
+     DOCUMENT = "document"
+     IMAGE = "image"
+     VIDEO = "video"
+     AUDIO = "audio"
+
+
+ class OutputFormat(str, Enum):
+     TEXT = "text/plain"
+     MARKDOWN = "text/markdown"
+
+
+ class ExtractorType(str, Enum):
+     LITELLM = "litellm"
+
+
+ SUPPORTED_MIME_TYPES = {
+     Kind.DOCUMENT: {
+         "application/pdf",
+         "text/plain",
+         "text/markdown",
+         "text/html",
+         "text/md",
+     },
+     Kind.IMAGE: {
+         "image/png",
+         "image/jpeg",
+     },
+     Kind.VIDEO: {
+         "video/mp4",
+         "video/quicktime",
+     },
+     Kind.AUDIO: {
+         "audio/wav",
+         "audio/mpeg",
+         "audio/ogg",
+     },
+ }
+
+
+ class ExtractionModel(BaseModel):
+     name: str
+     label: str
+
+
+ def validate_prompt(prompt: Any, name: str):
+     if not isinstance(prompt, str):
+         raise ValueError(f"{name} must be a string.")
+     if prompt == "":
+         raise ValueError(f"{name} cannot be empty.")
+
+
+ class ExtractionSource(str, Enum):
+     PROCESSED = "processed"
+     PASSTHROUGH = "passthrough"
+
+
+ class Extraction(
+     KilnParentedModel, KilnParentModel, parent_of={"chunked_documents": ChunkedDocument}
+ ):
+     source: ExtractionSource = Field(
+         description="The source of the extraction.",
+     )
+     extractor_config_id: ID_TYPE = Field(
+         description="The ID of the extractor config used to extract the data.",
+     )
+     output: KilnAttachmentModel = Field(
+         description="The extraction output.",
+     )
+
+     def parent_document(self) -> Union["Document", None]:
+         if self.parent is None or self.parent.__class__.__name__ != "Document":
+             return None
+         return self.parent  # type: ignore
+
+     async def output_content(self) -> str | None:
+         if not self.path:
+             raise ValueError(
+                 "Failed to resolve the path of extraction output attachment because the extraction does not have a path."
+             )
+
+         full_path = self.output.resolve_path(self.path.parent)
+
+         try:
+             return await anyio.Path(full_path).read_text(encoding="utf-8")
+         except Exception as e:
+             logger.error(
+                 f"Failed to read extraction output for {full_path}: {e}", exc_info=True
+             )
+             raise ValueError(f"Failed to read extraction output: {e}")
+
+     def chunked_documents(self, readonly: bool = False) -> list[ChunkedDocument]:
+         return super().chunked_documents(readonly=readonly)  # type: ignore
+
+
+ class ExtractorConfig(KilnParentedModel):
+     name: FilenameString = Field(
+         description="A name to identify the extractor config.",
+     )
+     is_archived: bool = Field(
+         default=False,
+         description="Whether the extractor config is archived. Archived extractor configs are not shown in the UI and are not available for use.",
+     )
+     description: str | None = Field(
+         default=None, description="The description of the extractor config"
+     )
+     model_provider_name: str = Field(
+         description="The name of the model provider to use for the extractor config.",
+     )
+     model_name: str = Field(
+         description="The name of the model to use for the extractor config.",
+     )
+     output_format: OutputFormat = Field(
+         default=OutputFormat.MARKDOWN,
+         description="The format to use for the output.",
+     )
+     passthrough_mimetypes: list[OutputFormat] = Field(
+         default_factory=list,
+         description="If the mimetype is in this list, the extractor will not be used and the text content of the file will be returned as is.",
+     )
+     extractor_type: ExtractorType = Field(
+         description="This is used to determine the type of extractor to use.",
+     )
+     properties: dict[str, str | int | float | bool | dict[str, str] | None] = Field(
+         default_factory=dict,
+         description="Properties to be used to execute the extractor config. This is extractor_type specific and should serialize to a json dict.",
+     )
+
+     @field_validator("properties")
+     @classmethod
+     def validate_properties(
+         cls, properties: dict[str, Any], info: ValidationInfo
+     ) -> dict[str, Any]:
+         def get_property(key: str) -> str:
+             value = properties.get(key)
+             if value is None or value == "" or not isinstance(value, str):
+                 raise ValueError(f"Prompt for {key} must be a string")
+             return value
+
+         return {
+             "prompt_document": get_property(
+                 "prompt_document",
+             ),
+             "prompt_image": get_property(
+                 "prompt_image",
+             ),
+             "prompt_video": get_property(
+                 "prompt_video",
+             ),
+             "prompt_audio": get_property(
+                 "prompt_audio",
+             ),
+         }
+
+     def prompt_document(self) -> str | None:
+         prompt = self.properties.get("prompt_document")
+         if prompt is None:
+             return None
+         if not isinstance(prompt, str):
+             raise ValueError(
+                 "Invalid prompt_document. prompt_document must be a string."
+             )
+         return prompt
+
+     def prompt_video(self) -> str | None:
+         prompt = self.properties.get("prompt_video")
+         if prompt is None:
+             return None
+         if not isinstance(prompt, str):
+             raise ValueError("Invalid prompt_video. prompt_video must be a string.")
+         return prompt
+
+     def prompt_audio(self) -> str | None:
+         prompt = self.properties.get("prompt_audio")
+         if prompt is None:
+             return None
+         if not isinstance(prompt, str):
+             raise ValueError("Invalid prompt_audio. prompt_audio must be a string.")
+         return prompt
+
+     def prompt_image(self) -> str | None:
+         prompt = self.properties.get("prompt_image")
+         if prompt is None:
+             return None
+         if not isinstance(prompt, str):
+             raise ValueError("Invalid prompt_image. prompt_image must be a string.")
+         return prompt
+
+     # Workaround to return typed parent without importing Project
+     def parent_project(self) -> Union["Project", None]:
+         if self.parent is None or self.parent.__class__.__name__ != "Project":
+             return None
+         return self.parent  # type: ignore
+
+
+ class FileInfo(BaseModel):
+     filename: str = Field(description="The filename of the file")
+
+     size: int = Field(description="The size of the file in bytes")
+
+     mime_type: str = Field(description="The MIME type of the file")
+
+     attachment: KilnAttachmentModel = Field(
+         description="The attachment to the file",
+     )
+
+     @field_serializer("attachment")
+     def serialize_attachment(
+         self, attachment: KilnAttachmentModel, info: SerializationInfo
+     ) -> dict:
+         context = info.context or {}
+         context["filename_prefix"] = "attachment"
+         return attachment.model_dump(mode="json", context=context)
+
+     @field_validator("mime_type")
+     @classmethod
+     def validate_mime_type(cls, mime_type: str, info: ValidationInfo) -> str:
+         filename = info.data.get("filename") or ""
+
+         for mime_types in SUPPORTED_MIME_TYPES.values():
+             if mime_type in mime_types:
+                 return mime_type
+         raise ValueError(f"MIME type is not supported: {mime_type} (for {filename})")
+
+
+ class Document(
+     KilnParentedModel, KilnParentModel, parent_of={"extractions": Extraction}
+ ):
+     name: FilenameString = Field(
+         description="A name to identify the document.",
+     )
+
+     description: str = Field(description="A description for the file")
+
+     original_file: FileInfo = Field(description="The original file")
+
+     kind: Kind = Field(
+         description="The kind of document. The kind is a broad family of filetypes that can be handled in a similar way"
+     )
+
+     tags: List[str] = Field(
+         default_factory=list,
+         description="Tags for the document. Tags are used to categorize documents for filtering and reporting.",
+     )
+
+     @model_validator(mode="after")
+     def validate_tags(self) -> Self:
+         for tag in self.tags:
+             if not tag:
+                 raise ValueError("Tags cannot be empty strings")
+             if " " in tag:
+                 raise ValueError("Tags cannot contain spaces. Try underscores.")
+
+         return self
+
+     # Workaround to return typed parent without importing Project
+     def parent_project(self) -> Union["Project", None]:
+         if self.parent is None or self.parent.__class__.__name__ != "Project":
+             return None
+         return self.parent  # type: ignore
+
+     def extractions(self, readonly: bool = False) -> list[Extraction]:
+         return super().extractions(readonly=readonly)  # type: ignore
+
+
+ def get_kind_from_mime_type(mime_type: str) -> Kind | None:
+     for kind, mime_types in SUPPORTED_MIME_TYPES.items():
+         if mime_type in mime_types:
+             return kind
+     return None
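
The properties validator above only accepts an ExtractorConfig when all four prompt_* keys are present as non-empty strings, and get_kind_from_mime_type maps a MIME type onto the Kind families in SUPPORTED_MIME_TYPES. A minimal sketch of both, assuming an ExtractorConfig can be instantiated standalone and treating the name, provider, and model strings as placeholders:

    from kiln_ai.datamodel.extraction import (
        ExtractorConfig,
        ExtractorType,
        Kind,
        get_kind_from_mime_type,
    )

    # All four prompt_* keys are required; omitting one raises
    # "Prompt for <key> must be a string".
    config = ExtractorConfig(
        name="pdf_extractor",                    # placeholder name
        model_provider_name="example_provider",  # placeholder provider/model strings
        model_name="example-model",
        extractor_type=ExtractorType.LITELLM,
        properties={
            "prompt_document": "Transcribe the document as markdown.",
            "prompt_image": "Describe the image in detail.",
            "prompt_video": "Describe the video, including any speech.",
            "prompt_audio": "Transcribe the audio.",
        },
    )
    print(config.prompt_document())

    # MIME types are routed to a Kind via SUPPORTED_MIME_TYPES.
    assert get_kind_from_mime_type("application/pdf") == Kind.DOCUMENT
    assert get_kind_from_mime_type("image/webp") is None  # unsupported type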
kiln_ai/datamodel/json_schema.py CHANGED
@@ -84,25 +84,40 @@ def schema_from_json_str(v: str) -> Dict:
      """
      try:
          parsed = json.loads(v)
-         jsonschema.Draft202012Validator.check_schema(parsed)
          if not isinstance(parsed, dict):
              raise ValueError(f"JSON schema must be a dict, not {type(parsed)}")
-         # Top level arrays are valid JSON schemas, but we don't want to allow them here as they often cause issues
-         if (
-             "type" not in parsed
-             or parsed["type"] != "object"
-             or "properties" not in parsed
-         ):
-             raise ValueError(f"JSON schema must be an object with properties: {v}")
+
+         validate_schema_dict(parsed)
          return parsed
-     except jsonschema.exceptions.SchemaError as e:
-         raise ValueError(f"Invalid JSON schema: {v} \n{e}")
      except json.JSONDecodeError as e:
          raise ValueError(f"Invalid JSON: {v}\n {e}")
      except Exception as e:
          raise ValueError(f"Unexpected error parsing JSON schema: {v}\n {e}")


+ def validate_schema_dict(v: Dict):
+     """Parse and validate a JSON schema dictionary.
+
+     Args:
+         v: Dictionary containing a JSON schema definition
+
+     Returns:
+         Dict containing the parsed JSON schema
+
+     Raises:
+         ValueError: If the input is not a valid JSON schema object with required properties
+     """
+     try:
+         jsonschema.Draft202012Validator.check_schema(v)
+         # Top level arrays are valid JSON schemas, but we don't want to allow them here as they often cause issues
+         if "type" not in v or v["type"] != "object" or "properties" not in v:
+             raise ValueError(f"JSON schema must be an object with properties: {v}")
+     except jsonschema.exceptions.SchemaError as e:
+         raise ValueError(f"Invalid JSON schema: {v} \n{e}")
+     except Exception as e:
+         raise ValueError(f"Unexpected error validating dict JSON schema: {v}\n {e}")
+
+
  def string_to_json_key(s: str) -> str:
      """Convert a string to a valid JSON key."""
      return re.sub(r"[^a-z0-9_]", "", s.strip().lower().replace(" ", "_"))
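
The change above moves the object-with-properties check out of schema_from_json_str into validate_schema_dict, so callers that already hold a parsed dict can validate it without round-tripping through JSON. A small usage sketch:

    from kiln_ai.datamodel.json_schema import schema_from_json_str, validate_schema_dict

    schema = schema_from_json_str(
        '{"type": "object", "properties": {"answer": {"type": "string"}}}'
    )

    # Top-level arrays are valid JSON Schema but are rejected here with a ValueError.
    try:
        validate_schema_dict({"type": "array", "items": {"type": "string"}})
    except ValueError as e:
        print(e)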
kiln_ai/datamodel/project.py CHANGED
@@ -1,10 +1,28 @@
  from pydantic import Field

  from kiln_ai.datamodel.basemodel import FilenameString, KilnParentModel
+ from kiln_ai.datamodel.chunk import ChunkerConfig
+ from kiln_ai.datamodel.embedding import EmbeddingConfig
+ from kiln_ai.datamodel.external_tool_server import ExternalToolServer
+ from kiln_ai.datamodel.extraction import Document, ExtractorConfig
+ from kiln_ai.datamodel.rag import RagConfig
  from kiln_ai.datamodel.task import Task
+ from kiln_ai.datamodel.vector_store import VectorStoreConfig


- class Project(KilnParentModel, parent_of={"tasks": Task}):
+ class Project(
+     KilnParentModel,
+     parent_of={
+         "tasks": Task,
+         "documents": Document,
+         "extractor_configs": ExtractorConfig,
+         "chunker_configs": ChunkerConfig,
+         "embedding_configs": EmbeddingConfig,
+         "rag_configs": RagConfig,
+         "vector_store_configs": VectorStoreConfig,
+         "external_tool_servers": ExternalToolServer,
+     },
+ ):
      """
      A collection of related tasks.

@@ -21,3 +39,24 @@ class Project(KilnParentModel, parent_of={"tasks": Task}):
      # Needed for typechecking. We should fix this in KilnParentModel
      def tasks(self) -> list[Task]:
          return super().tasks()  # type: ignore
+
+     def documents(self, readonly: bool = False) -> list[Document]:
+         return super().documents(readonly=readonly)  # type: ignore
+
+     def extractor_configs(self, readonly: bool = False) -> list[ExtractorConfig]:
+         return super().extractor_configs(readonly=readonly)  # type: ignore
+
+     def chunker_configs(self, readonly: bool = False) -> list[ChunkerConfig]:
+         return super().chunker_configs(readonly=readonly)  # type: ignore
+
+     def embedding_configs(self, readonly: bool = False) -> list[EmbeddingConfig]:
+         return super().embedding_configs(readonly=readonly)  # type: ignore
+
+     def vector_store_configs(self, readonly: bool = False) -> list[VectorStoreConfig]:
+         return super().vector_store_configs(readonly=readonly)  # type: ignore
+
+     def rag_configs(self, readonly: bool = False) -> list[RagConfig]:
+         return super().rag_configs(readonly=readonly)  # type: ignore
+
+     def external_tool_servers(self, readonly: bool = False) -> list[ExternalToolServer]:
+         return super().external_tool_servers(readonly=readonly)  # type: ignore
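
With the expanded parent_of map, a loaded Project now exposes typed accessors for each of the new child collections alongside tasks(). A sketch, with a hypothetical project file path:

    from kiln_ai.datamodel.project import Project

    project = Project.load_from_file("/path/to/project.kiln")  # hypothetical path

    for doc in project.documents(readonly=True):
        print(doc.name, doc.kind, doc.original_file.mime_type)

    for rag in project.rag_configs(readonly=True):
        print(rag.name, rag.tool_name)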
kiln_ai/datamodel/rag.py ADDED
@@ -0,0 +1,79 @@
+ from typing import TYPE_CHECKING, Union
+
+ from pydantic import Field, model_validator
+
+ from kiln_ai.datamodel.basemodel import ID_TYPE, FilenameString, KilnParentedModel
+ from kiln_ai.utils.validation import ToolNameString
+
+ if TYPE_CHECKING:
+     from kiln_ai.datamodel.project import Project
+
+
+ class RagConfig(KilnParentedModel):
+     name: FilenameString = Field(
+         description="A name to identify this RAG configuration for your own reference.",
+     )
+
+     is_archived: bool = Field(
+         default=False,
+         description="Whether the RAG configuration is archived. Archived RAG configurations are not shown in the UI and are not available for use.",
+     )
+
+     description: str | None = Field(
+         default=None,
+         description="A description of the RAG configuration for you and your team. Will not be used in prompts/training/validation.",
+     )
+
+     tool_name: ToolNameString = Field(
+         description="A name for the model to identify the Search Tool in conversations.",
+     )
+
+     tool_description: str = Field(
+         description="A description of the purpose of the tool. The model will use this description to understand the tool's capabilities.",
+         max_length=128,
+     )
+
+     extractor_config_id: ID_TYPE = Field(
+         description="The ID of the extractor config used to extract the documents.",
+     )
+
+     chunker_config_id: ID_TYPE = Field(
+         description="The ID of the chunker config used to chunk the documents.",
+     )
+
+     embedding_config_id: ID_TYPE = Field(
+         description="The ID of the embedding config used to embed the documents.",
+     )
+
+     vector_store_config_id: ID_TYPE = Field(
+         description="The ID of the vector store config used to store the documents.",
+     )
+
+     tags: list[str] | None = Field(
+         default=None,
+         description="List of document tags to filter by. If None, all documents in the project are used.",
+     )
+
+     # Workaround to return typed parent without importing Project
+     def parent_project(self) -> Union["Project", None]:
+         if self.parent is None or self.parent.__class__.__name__ != "Project":
+             return None
+         return self.parent  # type: ignore
+
+     @model_validator(mode="after")
+     def validate_tags(self):
+         if self.tags is not None:
+             if len(self.tags) == 0:
+                 raise ValueError("Tags cannot be an empty list.")
+             for tag in self.tags:
+                 if not tag:
+                     raise ValueError("Tags cannot be empty.")
+                 if " " in tag:
+                     raise ValueError("Tags cannot contain spaces. Try underscores.")
+
+         if self.tool_name.strip() == "":
+             raise ValueError("Tool name cannot be empty.")
+         if self.tool_description.strip() == "":
+             raise ValueError("Tool description cannot be empty.")
+
+         return self
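
A RagConfig ties together the IDs of an extractor, chunker, embedding, and vector store config, and its after-validator rejects empty tag lists, empty tags, and tags containing spaces. A minimal sketch with placeholder IDs (in practice these reference existing configs in the same project):

    from kiln_ai.datamodel.rag import RagConfig

    rag = RagConfig(
        name="docs_search",
        tool_name="search_docs",
        tool_description="Searches the project documentation.",
        extractor_config_id="extractor-123",       # placeholder IDs
        chunker_config_id="chunker-456",
        embedding_config_id="embedding-789",
        vector_store_config_id="vector-store-101",
        tags=["api_docs", "v2"],  # [] or tags with spaces raise a ValueError
    )
    print(rag.tool_name, rag.tags)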
kiln_ai/datamodel/registry.py CHANGED
@@ -14,18 +14,3 @@ def all_projects() -> list[Project]:
                  # deleted files are possible continue with the rest
                  continue
      return projects
-
-
- def project_from_id(project_id: str) -> Project | None:
-     project_paths = Config.shared().projects
-     if project_paths is not None:
-         for project_path in project_paths:
-             try:
-                 project = Project.load_from_file(project_path)
-                 if project.id == project_id:
-                     return project
-             except Exception:
-                 # deleted files are possible continue with the rest
-                 continue
-
-     return None
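
project_from_id is removed from this module (it may live elsewhere in the package now). Callers that imported it from kiln_ai.datamodel.registry can rebuild the same lookup on top of the remaining all_projects() helper, for example:

    from kiln_ai.datamodel.project import Project
    from kiln_ai.datamodel.registry import all_projects

    def find_project_by_id(project_id: str) -> Project | None:
        # Scan the projects known to the config, matching on id.
        for project in all_projects():
            if project.id == project_id:
                return project
        return None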
kiln_ai/datamodel/run_config.py ADDED
@@ -0,0 +1,62 @@
+ from typing import List
+
+ from pydantic import BaseModel, Field, model_validator
+ from typing_extensions import Self
+
+ from kiln_ai.datamodel.datamodel_enums import (
+     ModelProviderName,
+     StructuredOutputMode,
+ )
+ from kiln_ai.datamodel.prompt_id import PromptId
+ from kiln_ai.datamodel.tool_id import ToolId
+
+
+ class ToolsRunConfig(BaseModel):
+     """
+     A config describing which tools are available to a task.
+     """
+
+     tools: List[ToolId] = Field(
+         description="The IDs of the tools available to the task."
+     )
+
+
+ class RunConfigProperties(BaseModel):
+     """
+     A configuration for running a task.
+
+     This includes everything needed to run a task, except the input and task ID. Running the same RunConfig with the same input should make identical calls to the model (output may vary as models are non-deterministic).
+     """
+
+     model_name: str = Field(description="The model to use for this run config.")
+     model_provider_name: ModelProviderName = Field(
+         description="The provider to use for this run config."
+     )
+     prompt_id: PromptId = Field(
+         description="The prompt to use for this run config. Defaults to building a simple prompt from the task if not provided.",
+     )
+     top_p: float = Field(
+         default=1.0,
+         description="The top-p value to use for this run config. Defaults to 1.0.",
+     )
+     temperature: float = Field(
+         default=1.0,
+         description="The temperature to use for this run config. Defaults to 1.0.",
+     )
+     structured_output_mode: StructuredOutputMode = Field(
+         description="The structured output mode to use for this run config.",
+     )
+     tools_config: ToolsRunConfig | None = Field(
+         default=None,
+         description="The tools config to use for this run config, defining which tools are available to the model.",
+     )
+
+     @model_validator(mode="after")
+     def validate_required_fields(self) -> Self:
+         if not (0 <= self.top_p <= 1):
+             raise ValueError("top_p must be between 0 and 1")
+
+         elif self.temperature < 0 or self.temperature > 2:
+             raise ValueError("temperature must be between 0 and 2")
+
+         return self
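
RunConfigProperties now lives in its own module and gains an optional tools_config. A minimal construction sketch; the enum members and prompt id below are assumptions about typical values, not taken from this diff:

    from kiln_ai.datamodel.datamodel_enums import ModelProviderName, StructuredOutputMode
    from kiln_ai.datamodel.run_config import RunConfigProperties

    props = RunConfigProperties(
        model_name="gpt-4o",                           # assumed model/provider
        model_provider_name=ModelProviderName.openai,
        prompt_id="simple_prompt_builder",             # assumed prompt generator id
        top_p=0.9,        # validator requires 0 <= top_p <= 1
        temperature=0.2,  # validator requires 0 <= temperature <= 2
        structured_output_mode=StructuredOutputMode.json_schema,
        # tools_config defaults to None; pass ToolsRunConfig(tools=[...]) to expose tools.
    )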
kiln_ai/datamodel/task.py CHANGED
@@ -1,9 +1,7 @@
  from typing import TYPE_CHECKING, Dict, List, Union

  from pydantic import BaseModel, Field, ValidationInfo, model_validator
- from typing_extensions import Self

- from kiln_ai.datamodel import Finetune
  from kiln_ai.datamodel.basemodel import (
      ID_FIELD,
      ID_TYPE,
@@ -13,16 +11,16 @@ from kiln_ai.datamodel.basemodel import (
      KilnParentModel,
  )
  from kiln_ai.datamodel.datamodel_enums import (
-     ModelProviderName,
      Priority,
      StructuredOutputMode,
      TaskOutputRatingType,
  )
  from kiln_ai.datamodel.dataset_split import DatasetSplit
  from kiln_ai.datamodel.eval import Eval
+ from kiln_ai.datamodel.finetune import Finetune
  from kiln_ai.datamodel.json_schema import JsonObjectSchema, schema_from_json_str
  from kiln_ai.datamodel.prompt import BasePrompt, Prompt
- from kiln_ai.datamodel.prompt_id import PromptId
+ from kiln_ai.datamodel.run_config import RunConfigProperties
  from kiln_ai.datamodel.task_run import TaskRun

  if TYPE_CHECKING:
@@ -45,55 +43,6 @@ class TaskRequirement(BaseModel):
      type: TaskOutputRatingType = Field(default=TaskOutputRatingType.five_star)


- class RunConfigProperties(BaseModel):
-     """
-     A configuration for running a task.
-
-     This includes everything needed to run a task, except the input and task ID. Running the same RunConfig with the same input should make identical calls to the model (output may vary as models are non-deterministic).
-     """
-
-     model_name: str = Field(description="The model to use for this run config.")
-     model_provider_name: ModelProviderName = Field(
-         description="The provider to use for this run config."
-     )
-     prompt_id: PromptId = Field(
-         description="The prompt to use for this run config. Defaults to building a simple prompt from the task if not provided.",
-     )
-     top_p: float = Field(
-         default=1.0,
-         description="The top-p value to use for this run config. Defaults to 1.0.",
-     )
-     temperature: float = Field(
-         default=1.0,
-         description="The temperature to use for this run config. Defaults to 1.0.",
-     )
-     structured_output_mode: StructuredOutputMode = Field(
-         description="The structured output mode to use for this run config.",
-     )
-
-     @model_validator(mode="after")
-     def validate_required_fields(self) -> Self:
-         if not (0 <= self.top_p <= 1):
-             raise ValueError("top_p must be between 0 and 1")
-
-         elif self.temperature < 0 or self.temperature > 2:
-             raise ValueError("temperature must be between 0 and 2")
-
-         return self
-
-
- class RunConfig(RunConfigProperties):
-     """
-     A configuration for running a task.
-
-     This includes everything needed to run a task, except the input. Running the same RunConfig with the same input should make identical calls to the model (output may vary as models are non-deterministic).
-
-     For example: task, model, provider, prompt, etc.
-     """
-
-     task: "Task" = Field(description="The task to run.")
-
-
  class TaskRunConfig(KilnParentedModel):
      """
      A Kiln model for persisting a run config in a Kiln Project, nested under a task.
@@ -124,15 +73,6 @@ class TaskRunConfig(KilnParentedModel):
              return None
          return self.parent  # type: ignore

-     def run_config(self) -> RunConfig:
-         parent_task = self.parent_task()
-         if parent_task is None:
-             raise ValueError("Run config must be parented to a task")
-         return run_config_from_run_config_properties(
-             task=parent_task,
-             run_config_properties=self.run_config_properties,
-         )
-
      # Previously we didn't store structured_output_mode in the run_config_properties. Updgrade old models when loading from file.
      @model_validator(mode="before")
      def upgrade_old_entries(cls, data: dict, info: ValidationInfo) -> dict:
@@ -155,21 +95,6 @@ class TaskRunConfig(KilnParentedModel):
          return data


- def run_config_from_run_config_properties(
-     task: "Task",
-     run_config_properties: RunConfigProperties,
- ) -> RunConfig:
-     return RunConfig(
-         task=task,
-         model_name=run_config_properties.model_name,
-         model_provider_name=run_config_properties.model_provider_name,
-         prompt_id=run_config_properties.prompt_id,
-         top_p=run_config_properties.top_p,
-         temperature=run_config_properties.temperature,
-         structured_output_mode=run_config_properties.structured_output_mode,
-     )
-
-
  class Task(
      KilnParentedModel,
      KilnParentModel,
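
For downstream code, the practical effect of this change is an import move: RunConfigProperties is no longer defined in kiln_ai.datamodel.task, and the task-bound RunConfig class plus the run_config_from_run_config_properties helper are gone. A migration sketch:

    # 0.19.x
    # from kiln_ai.datamodel.task import RunConfigProperties

    # 0.21.0
    from kiln_ai.datamodel.run_config import RunConfigProperties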