kiln-ai 0.11.1__py3-none-any.whl → 0.13.0__py3-none-any.whl

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of kiln-ai might be problematic. Click here for more details.

Files changed (80)
  1. kiln_ai/adapters/__init__.py +4 -0
  2. kiln_ai/adapters/adapter_registry.py +163 -39
  3. kiln_ai/adapters/data_gen/data_gen_task.py +18 -0
  4. kiln_ai/adapters/eval/__init__.py +28 -0
  5. kiln_ai/adapters/eval/base_eval.py +164 -0
  6. kiln_ai/adapters/eval/eval_runner.py +270 -0
  7. kiln_ai/adapters/eval/g_eval.py +368 -0
  8. kiln_ai/adapters/eval/registry.py +16 -0
  9. kiln_ai/adapters/eval/test_base_eval.py +325 -0
  10. kiln_ai/adapters/eval/test_eval_runner.py +641 -0
  11. kiln_ai/adapters/eval/test_g_eval.py +498 -0
  12. kiln_ai/adapters/eval/test_g_eval_data.py +4 -0
  13. kiln_ai/adapters/fine_tune/base_finetune.py +16 -2
  14. kiln_ai/adapters/fine_tune/finetune_registry.py +2 -0
  15. kiln_ai/adapters/fine_tune/test_dataset_formatter.py +4 -1
  16. kiln_ai/adapters/fine_tune/test_fireworks_tinetune.py +1 -1
  17. kiln_ai/adapters/fine_tune/test_openai_finetune.py +1 -1
  18. kiln_ai/adapters/fine_tune/test_together_finetune.py +531 -0
  19. kiln_ai/adapters/fine_tune/together_finetune.py +325 -0
  20. kiln_ai/adapters/ml_model_list.py +758 -163
  21. kiln_ai/adapters/model_adapters/__init__.py +2 -4
  22. kiln_ai/adapters/model_adapters/base_adapter.py +61 -43
  23. kiln_ai/adapters/model_adapters/litellm_adapter.py +391 -0
  24. kiln_ai/adapters/model_adapters/litellm_config.py +13 -0
  25. kiln_ai/adapters/model_adapters/test_base_adapter.py +22 -13
  26. kiln_ai/adapters/model_adapters/test_litellm_adapter.py +407 -0
  27. kiln_ai/adapters/model_adapters/test_saving_adapter_results.py +41 -19
  28. kiln_ai/adapters/model_adapters/test_structured_output.py +59 -35
  29. kiln_ai/adapters/ollama_tools.py +3 -3
  30. kiln_ai/adapters/parsers/r1_parser.py +19 -14
  31. kiln_ai/adapters/parsers/test_r1_parser.py +17 -5
  32. kiln_ai/adapters/prompt_builders.py +80 -42
  33. kiln_ai/adapters/provider_tools.py +50 -58
  34. kiln_ai/adapters/repair/repair_task.py +9 -21
  35. kiln_ai/adapters/repair/test_repair_task.py +6 -6
  36. kiln_ai/adapters/run_output.py +3 -0
  37. kiln_ai/adapters/test_adapter_registry.py +26 -29
  38. kiln_ai/adapters/test_generate_docs.py +4 -4
  39. kiln_ai/adapters/test_ollama_tools.py +0 -1
  40. kiln_ai/adapters/test_prompt_adaptors.py +47 -33
  41. kiln_ai/adapters/test_prompt_builders.py +91 -31
  42. kiln_ai/adapters/test_provider_tools.py +26 -81
  43. kiln_ai/datamodel/__init__.py +50 -952
  44. kiln_ai/datamodel/basemodel.py +2 -0
  45. kiln_ai/datamodel/datamodel_enums.py +60 -0
  46. kiln_ai/datamodel/dataset_filters.py +114 -0
  47. kiln_ai/datamodel/dataset_split.py +170 -0
  48. kiln_ai/datamodel/eval.py +298 -0
  49. kiln_ai/datamodel/finetune.py +105 -0
  50. kiln_ai/datamodel/json_schema.py +7 -1
  51. kiln_ai/datamodel/project.py +23 -0
  52. kiln_ai/datamodel/prompt.py +37 -0
  53. kiln_ai/datamodel/prompt_id.py +83 -0
  54. kiln_ai/datamodel/strict_mode.py +24 -0
  55. kiln_ai/datamodel/task.py +181 -0
  56. kiln_ai/datamodel/task_output.py +328 -0
  57. kiln_ai/datamodel/task_run.py +164 -0
  58. kiln_ai/datamodel/test_basemodel.py +19 -11
  59. kiln_ai/datamodel/test_dataset_filters.py +71 -0
  60. kiln_ai/datamodel/test_dataset_split.py +32 -8
  61. kiln_ai/datamodel/test_datasource.py +22 -2
  62. kiln_ai/datamodel/test_eval_model.py +635 -0
  63. kiln_ai/datamodel/test_example_models.py +9 -13
  64. kiln_ai/datamodel/test_json_schema.py +23 -0
  65. kiln_ai/datamodel/test_models.py +2 -2
  66. kiln_ai/datamodel/test_prompt_id.py +129 -0
  67. kiln_ai/datamodel/test_task.py +159 -0
  68. kiln_ai/utils/config.py +43 -1
  69. kiln_ai/utils/dataset_import.py +232 -0
  70. kiln_ai/utils/test_dataset_import.py +596 -0
  71. {kiln_ai-0.11.1.dist-info → kiln_ai-0.13.0.dist-info}/METADATA +86 -6
  72. kiln_ai-0.13.0.dist-info/RECORD +103 -0
  73. kiln_ai/adapters/model_adapters/langchain_adapters.py +0 -302
  74. kiln_ai/adapters/model_adapters/openai_compatible_config.py +0 -11
  75. kiln_ai/adapters/model_adapters/openai_model_adapter.py +0 -246
  76. kiln_ai/adapters/model_adapters/test_langchain_adapter.py +0 -350
  77. kiln_ai/adapters/model_adapters/test_openai_model_adapter.py +0 -225
  78. kiln_ai-0.11.1.dist-info/RECORD +0 -76
  79. {kiln_ai-0.11.1.dist-info → kiln_ai-0.13.0.dist-info}/WHEEL +0 -0
  80. {kiln_ai-0.11.1.dist-info → kiln_ai-0.13.0.dist-info}/licenses/LICENSE.txt +0 -0
@@ -1,50 +1,59 @@
1
1
  """
2
- See our docs for details about our datamodel: https://kiln-ai.github.io/Kiln/kiln_core_docs/kiln_ai.html
3
- """
2
+ See our docs for details about our datamodel classes and hierarchy:
4
3
 
5
- from __future__ import annotations
4
+ Developer docs: https://kiln-ai.github.io/Kiln/kiln_core_docs/kiln_ai.html
6
5
 
7
- import json
8
- import math
9
- import random
10
- from enum import Enum, IntEnum
11
- from typing import TYPE_CHECKING, Callable, Dict, List, Type, Union
6
+ User docs: https://docs.getkiln.ai/developers/kiln-datamodel
7
+ """
12
8
 
13
- import jsonschema
14
- import jsonschema.exceptions
15
- from pydantic import (
16
- BaseModel,
17
- Field,
18
- ValidationInfo,
19
- model_validator,
20
- )
21
- from typing_extensions import Self
9
+ # This component uses "flat" imports so we don't have too much internal structure exposed in the API.
10
+ # for example you can just `from datamodel import Task, Project` instead of `from datamodel.task import Task; from datamodel.project import Project`
22
11
 
23
- from kiln_ai.datamodel.json_schema import JsonObjectSchema, schema_from_json_str
12
+ from __future__ import annotations
24
13
 
25
- from .basemodel import (
26
- ID_FIELD,
27
- ID_TYPE,
28
- NAME_FIELD,
29
- SHORT_NAME_FIELD,
30
- KilnBaseModel,
31
- KilnParentedModel,
32
- KilnParentModel,
14
+ from kiln_ai.datamodel import dataset_split, eval, strict_mode
15
+ from kiln_ai.datamodel.datamodel_enums import (
16
+ FinetuneDataStrategy,
17
+ FineTuneStatusType,
18
+ Priority,
19
+ StructuredOutputMode,
20
+ TaskOutputRatingType,
21
+ )
22
+ from kiln_ai.datamodel.dataset_split import (
23
+ DatasetSplit,
24
+ DatasetSplitDefinition,
25
+ )
26
+ from kiln_ai.datamodel.finetune import (
27
+ Finetune,
28
+ )
29
+ from kiln_ai.datamodel.project import Project
30
+ from kiln_ai.datamodel.prompt import BasePrompt, Prompt
31
+ from kiln_ai.datamodel.prompt_id import (
32
+ PromptGenerators,
33
+ PromptId,
34
+ prompt_generator_values,
35
+ )
36
+ from kiln_ai.datamodel.task import Task, TaskRequirement
37
+ from kiln_ai.datamodel.task_output import (
38
+ DataSource,
39
+ DataSourceProperty,
40
+ DataSourceType,
41
+ RequirementRating,
42
+ TaskOutput,
43
+ TaskOutputRating,
44
+ )
45
+ from kiln_ai.datamodel.task_run import (
46
+ TaskRun,
33
47
  )
34
- from .json_schema import validate_schema
35
-
36
- if TYPE_CHECKING:
37
- from . import Task
38
-
39
48
 
40
49
  __all__ = [
41
- "basemodel",
42
- "json_schema",
50
+ "strict_mode",
51
+ "dataset_split",
52
+ "eval",
43
53
  "Task",
44
54
  "Project",
45
55
  "TaskRun",
46
56
  "TaskOutput",
47
- "TaskOutputRating",
48
57
  "Priority",
49
58
  "DataSource",
50
59
  "DataSourceType",
@@ -53,927 +62,16 @@ __all__ = [
53
62
  "FineTuneStatusType",
54
63
  "TaskOutputRatingType",
55
64
  "TaskRequirement",
56
- "TaskDeterminism",
57
65
  "DatasetSplitDefinition",
58
66
  "DatasetSplit",
59
67
  "RequirementRating",
60
68
  "TaskRequirement",
61
- "strict_mode",
62
- "set_strict_mode",
69
+ "BasePrompt",
63
70
  "Prompt",
71
+ "TaskOutputRating",
72
+ "StructuredOutputMode",
73
+ "FinetuneDataStrategy",
74
+ "PromptId",
75
+ "PromptGenerators",
76
+ "prompt_generator_values",
64
77
  ]
65
-
66
-
67
- # We want to be hard on ourselves for data completeness generated by the Kiln App, but don't want to make it hard for users to use the datamodel/library.
68
- # Strict mode enables extra validations that we want to enforce in Kiln App (and any other client that wants best practices), but not in the library (unless they opt in)
69
- _strict_mode: bool = False
70
-
71
-
72
- def strict_mode() -> bool:
73
- return _strict_mode
74
-
75
-
76
- def set_strict_mode(value: bool) -> None:
77
- global _strict_mode
78
- _strict_mode = value
79
-
80
-
81
- class Priority(IntEnum):
82
- """Defines priority levels for tasks and requirements, where P0 is highest priority."""
83
-
84
- p0 = 0
85
- p1 = 1
86
- p2 = 2
87
- p3 = 3
88
-
89
-
90
- # Only one rating type for now, but this allows for extensibility if we want to add more in the future
91
- class TaskOutputRatingType(str, Enum):
92
- """Defines the types of rating systems available for task outputs."""
93
-
94
- five_star = "five_star"
95
- pass_fail = "pass_fail"
96
- pass_fail_critical = "pass_fail_critical"
97
- custom = "custom"
98
-
99
-
100
- class RequirementRating(BaseModel):
101
- """Rating for a specific requirement within a task output."""
102
-
103
- value: float = Field(
104
- description="The rating value. Interpretation depends on rating type"
105
- )
106
- type: TaskOutputRatingType = Field(description="The type of rating")
107
-
108
-
109
- class TaskOutputRating(KilnBaseModel):
110
- """
111
- A rating for a task output, including an overall rating and ratings for each requirement.
112
-
113
- Supports:
114
- - five_star: 1-5 star ratings
115
- - pass_fail: boolean pass/fail (1.0 = pass, 0.0 = fail)
116
- - pass_fail_critical: tri-state (1.0 = pass, 0.0 = fail, -1.0 = critical fail)
117
- """
118
-
119
- type: TaskOutputRatingType = Field(default=TaskOutputRatingType.five_star)
120
- value: float | None = Field(
121
- description="The rating value. Interpretation depends on rating type:\n- five_star: 1-5 stars\n- pass_fail: 1.0 (pass) or 0.0 (fail)\n- pass_fail_critical: 1.0 (pass), 0.0 (fail), or -1.0 (critical fail)",
122
- default=None,
123
- )
124
- requirement_ratings: Dict[ID_TYPE, RequirementRating] = Field(
125
- default={},
126
- description="The ratings of the requirements of the task.",
127
- )
128
-
129
- # Previously we stored rating values as a dict of floats, but now we store them as RequirementRating objects.
130
- @model_validator(mode="before")
131
- def upgrade_old_format(cls, data: dict) -> dict:
132
- if not isinstance(data, dict):
133
- return data
134
-
135
- # Check if we have the old format (dict of floats)
136
- req_ratings = data.get("requirement_ratings", {})
137
- if req_ratings and all(
138
- isinstance(v, (int, float)) for v in req_ratings.values()
139
- ):
140
- # Convert each float to a RequirementRating object
141
- # all ratings are five star at the point we used this format
142
- data["requirement_ratings"] = {
143
- k: {"value": v, "type": TaskOutputRatingType.five_star}
144
- for k, v in req_ratings.items()
145
- }
146
-
147
- return data
148
-
149
- # Used to select high quality outputs for example selection (MultiShotPromptBuilder, etc)
150
- def is_high_quality(self) -> bool:
151
- if self.value is None:
152
- return False
153
-
154
- if self.type == TaskOutputRatingType.five_star:
155
- return self.value >= 4
156
- elif self.type == TaskOutputRatingType.pass_fail:
157
- return self.value == 1.0
158
- elif self.type == TaskOutputRatingType.pass_fail_critical:
159
- return self.value == 1.0
160
- return False
161
-
162
- @model_validator(mode="after")
163
- def validate_rating(self) -> Self:
164
- if self.type not in TaskOutputRatingType:
165
- raise ValueError(f"Invalid rating type: {self.type}")
166
-
167
- # Overall rating is optional
168
- if self.value is not None:
169
- self._validate_rating(self.type, self.value, "overall rating")
170
-
171
- for req_id, req_rating in self.requirement_ratings.items():
172
- self._validate_rating(
173
- req_rating.type,
174
- req_rating.value,
175
- f"requirement rating for req ID: {req_id}",
176
- )
177
-
178
- return self
179
-
180
- def _validate_rating(
181
- self, type: TaskOutputRatingType, rating: float | None, rating_name: str
182
- ) -> None:
183
- if type == TaskOutputRatingType.five_star:
184
- self._validate_five_star(rating, rating_name)
185
- elif type == TaskOutputRatingType.pass_fail:
186
- self._validate_pass_fail(rating, rating_name)
187
- elif type == TaskOutputRatingType.pass_fail_critical:
188
- self._validate_pass_fail_critical(rating, rating_name)
189
-
190
- def _validate_five_star(self, rating: float | None, rating_name: str) -> None:
191
- if rating is None or not isinstance(rating, float) or not rating.is_integer():
192
- raise ValueError(
193
- f"{rating_name.capitalize()} of type five_star must be an integer value (1-5)"
194
- )
195
- if rating < 1 or rating > 5:
196
- raise ValueError(
197
- f"{rating_name.capitalize()} of type five_star must be between 1 and 5 stars"
198
- )
199
-
200
- def _validate_pass_fail(self, rating: float | None, rating_name: str) -> None:
201
- if rating is None or not isinstance(rating, float) or not rating.is_integer():
202
- raise ValueError(
203
- f"{rating_name.capitalize()} of type pass_fail must be an integer value (0 or 1)"
204
- )
205
- if rating not in [0, 1]:
206
- raise ValueError(
207
- f"{rating_name.capitalize()} of type pass_fail must be 0 (fail) or 1 (pass)"
208
- )
209
-
210
- def _validate_pass_fail_critical(
211
- self, rating: float | None, rating_name: str
212
- ) -> None:
213
- if rating is None or not isinstance(rating, float) or not rating.is_integer():
214
- raise ValueError(
215
- f"{rating_name.capitalize()} of type pass_fail_critical must be an integer value (-1, 0, or 1)"
216
- )
217
- if rating not in [-1, 0, 1]:
218
- raise ValueError(
219
- f"{rating_name.capitalize()} of type pass_fail_critical must be -1 (critical fail), 0 (fail), or 1 (pass)"
220
- )
221
-
222
-
223
- class TaskOutput(KilnBaseModel):
224
- """
225
- An output for a specific task run.
226
-
227
- Contains the actual output content, its source (human or synthetic),
228
- and optional rating information.
229
- """
230
-
231
- output: str = Field(
232
- description="The output of the task. JSON formatted for structured output, plaintext for unstructured output."
233
- )
234
- source: DataSource | None = Field(
235
- description="The source of the output: human or synthetic.",
236
- default=None,
237
- )
238
- rating: TaskOutputRating | None = Field(
239
- default=None, description="The rating of the output"
240
- )
241
-
242
- def validate_output_format(self, task: Task) -> Self:
243
- # validate output
244
- if task.output_json_schema is not None:
245
- try:
246
- validate_schema(json.loads(self.output), task.output_json_schema)
247
- except json.JSONDecodeError:
248
- raise ValueError("Output is not a valid JSON object")
249
- except jsonschema.exceptions.ValidationError as e:
250
- raise ValueError(f"Output does not match task output schema: {e}")
251
- return self
252
-
253
- @model_validator(mode="after")
254
- def validate_output_source(self, info: ValidationInfo) -> Self:
255
- # On strict mode and not loaded from file, we validate output_source is not None.
256
- # We want to be able to load any data, even if it's not perfect. But we want to create perfect data when adding new data.
257
- if not strict_mode():
258
- return self
259
- if self.loaded_from_file(info):
260
- return self
261
- if self.source is None:
262
- raise ValueError("Output source is required when strict mode is enabled")
263
- return self
264
-
265
-
266
- class FineTuneStatusType(str, Enum):
267
- """
268
- The status type of a fine-tune (running, completed, failed, etc).
269
- """
270
-
271
- unknown = "unknown" # server error
272
- pending = "pending"
273
- running = "running"
274
- completed = "completed"
275
- failed = "failed"
276
-
277
-
278
- class StructuredOutputMode(str, Enum):
279
- """
280
- Enumeration of supported structured output modes.
281
-
282
- - default: let the adapter decide
283
- - json_schema: request json using API capabilities for json_schema
284
- - function_calling: request json using API capabilities for function calling
285
- - json_mode: request json using API's JSON mode, which should return valid JSON, but isn't checking/passing the schema
286
- - json_instructions: append instructions to the prompt to request json matching the schema. No API capabilities are used. You should have a custom parser on these models as they will be returning strings.
287
- - json_instruction_and_object: append instructions to the prompt to request json matching the schema. Also request the response as json_mode via API capabilities (returning dictionaries).
288
- """
289
-
290
- default = "default"
291
- json_schema = "json_schema"
292
- function_calling = "function_calling"
293
- json_mode = "json_mode"
294
- json_instructions = "json_instructions"
295
- json_instruction_and_object = "json_instruction_and_object"
296
-
297
-
298
- class FinetuneDataStrategy(str, Enum):
299
- final_only = "final_only"
300
- final_and_intermediate = "final_and_intermediate"
301
-
302
-
303
- class Finetune(KilnParentedModel):
304
- """
305
- The Kiln fine-tune datamodel.
306
-
307
- Initially holds a reference to a training job, with needed identifiers to update the status. When complete, contains the new model ID.
308
- """
309
-
310
- name: str = NAME_FIELD
311
- description: str | None = Field(
312
- default=None,
313
- description="A description of the fine-tune for you and your team. Not used in training.",
314
- )
315
- structured_output_mode: StructuredOutputMode | None = Field(
316
- default=None,
317
- description="The mode to use to train the model for structured output, if it was trained with structured output. Will determine how we call the tuned model, so we call with the matching mode.",
318
- )
319
- provider: str = Field(
320
- description="The provider to use for the fine-tune (e.g. 'openai')."
321
- )
322
- base_model_id: str = Field(
323
- description="The id of the base model to use for the fine-tune. This string relates to the provider's IDs for their own models, not Kiln IDs."
324
- )
325
- provider_id: str | None = Field(
326
- default=None,
327
- description="The ID of the fine-tune job on the provider's side. May not be the same as the fine_tune_model_id.",
328
- )
329
- fine_tune_model_id: str | None = Field(
330
- default=None,
331
- description="The ID of the fine-tuned model on the provider's side. May not be the same as the provider_id.",
332
- )
333
- dataset_split_id: str = Field(
334
- description="The ID of the dataset split to use for this fine-tune.",
335
- )
336
- train_split_name: str = Field(
337
- default="train",
338
- description="The name of the training split to use for this fine-tune.",
339
- )
340
- validation_split_name: str | None = Field(
341
- default=None,
342
- description="The name of the validation split to use for this fine-tune. Optional.",
343
- )
344
- parameters: dict[str, str | int | float | bool] = Field(
345
- default={},
346
- description="The parameters to use for this fine-tune. These are provider-specific.",
347
- )
348
- # These two fields are saved exactly used for training. Even if they map exactly to a custom prompt or generator, those can change, so we want to keep a record of the training prompt.
349
- system_message: str = Field(
350
- description="The system message to use for this fine-tune.",
351
- )
352
- thinking_instructions: str | None = Field(
353
- default=None,
354
- description="The thinking instructions to use for this fine-tune. Only used when data_strategy is final_and_intermediate.",
355
- )
356
- latest_status: FineTuneStatusType = Field(
357
- default=FineTuneStatusType.unknown,
358
- description="The latest known status of this fine-tune. Not updated in real time.",
359
- )
360
- properties: Dict[str, str | int | float] = Field(
361
- default={},
362
- description="Properties of the fine-tune. Different providers may use different properties.",
363
- )
364
- data_strategy: FinetuneDataStrategy = Field(
365
- default=FinetuneDataStrategy.final_only,
366
- description="The strategy to use for training the model. 'final_only' will only train on the final response. 'final_and_intermediate' will train on the final response and intermediate outputs (chain of thought or reasoning).",
367
- )
368
-
369
- def parent_task(self) -> Task | None:
370
- if not isinstance(self.parent, Task):
371
- return None
372
- return self.parent
373
-
374
- @model_validator(mode="after")
375
- def validate_thinking_instructions(self) -> Self:
376
- if (
377
- self.thinking_instructions is not None
378
- and self.data_strategy != FinetuneDataStrategy.final_and_intermediate
379
- ):
380
- raise ValueError(
381
- "Thinking instructions can only be used when data_strategy is final_and_intermediate"
382
- )
383
- if (
384
- self.thinking_instructions is None
385
- and self.data_strategy == FinetuneDataStrategy.final_and_intermediate
386
- ):
387
- raise ValueError(
388
- "Thinking instructions are required when data_strategy is final_and_intermediate"
389
- )
390
- return self
391
-
392
-
393
- class DataSourceType(str, Enum):
394
- """
395
- The source type of a piece of data.
396
-
397
- Human: a human created the data
398
- Synthetic: a model created the data
399
- """
400
-
401
- human = "human"
402
- synthetic = "synthetic"
403
-
404
-
405
- class DataSourceProperty(BaseModel):
406
- """
407
- Defines a property that can be associated with a data source.
408
-
409
- Includes validation rules for when properties are required or not allowed
410
- based on the data source type.
411
- """
412
-
413
- name: str
414
- type: Type[Union[str, int, float]]
415
- required_for: List[DataSourceType] = []
416
- not_allowed_for: List[DataSourceType] = []
417
-
418
-
419
- class DataSource(BaseModel):
420
- """
421
- Represents the origin of data, either human or synthetic, with associated properties.
422
-
423
- Properties vary based on the source type - for synthetic sources this includes
424
- model information, for human sources this includes creator information.
425
- """
426
-
427
- type: DataSourceType
428
- properties: Dict[str, str | int | float] = Field(
429
- default={},
430
- description="Properties describing the data source. For synthetic things like model. For human, the human's name.",
431
- )
432
-
433
- _data_source_properties = [
434
- DataSourceProperty(
435
- name="created_by",
436
- type=str,
437
- required_for=[DataSourceType.human],
438
- not_allowed_for=[DataSourceType.synthetic],
439
- ),
440
- DataSourceProperty(
441
- name="model_name",
442
- type=str,
443
- required_for=[DataSourceType.synthetic],
444
- not_allowed_for=[DataSourceType.human],
445
- ),
446
- DataSourceProperty(
447
- name="model_provider",
448
- type=str,
449
- required_for=[DataSourceType.synthetic],
450
- not_allowed_for=[DataSourceType.human],
451
- ),
452
- DataSourceProperty(
453
- name="adapter_name",
454
- type=str,
455
- required_for=[DataSourceType.synthetic],
456
- not_allowed_for=[DataSourceType.human],
457
- ),
458
- DataSourceProperty(
459
- name="prompt_builder_name",
460
- type=str,
461
- not_allowed_for=[DataSourceType.human],
462
- ),
463
- DataSourceProperty(
464
- # Optional: an ID within the scope of the prompt_builder_name.
465
- # Used for prompt builders with IDs (like saved prompts, fine-tune prompts)
466
- name="prompt_id",
467
- type=str,
468
- not_allowed_for=[DataSourceType.human],
469
- ),
470
- ]
471
-
472
- @model_validator(mode="after")
473
- def validate_type(self) -> "DataSource":
474
- if self.type not in DataSourceType:
475
- raise ValueError(f"Invalid data source type: {self.type}")
476
- return self
477
-
478
- @model_validator(mode="after")
479
- def validate_properties(self) -> "DataSource":
480
- for prop in self._data_source_properties:
481
- # Check the property type is correct
482
- if prop.name in self.properties:
483
- if not isinstance(self.properties[prop.name], prop.type):
484
- raise ValueError(
485
- f"'{prop.name}' must be of type {prop.type.__name__} for {self.type} data source"
486
- )
487
- # Check the property is required for the data source type
488
- if self.type in prop.required_for:
489
- if prop.name not in self.properties:
490
- raise ValueError(
491
- f"'{prop.name}' is required for {self.type} data source"
492
- )
493
- # Check the property is not allowed for the data source type
494
- elif self.type in prop.not_allowed_for and prop.name in self.properties:
495
- raise ValueError(
496
- f"'{prop.name}' is not allowed for {self.type} data source"
497
- )
498
- return self
499
-
500
- @model_validator(mode="after")
501
- def validate_no_empty_properties(self) -> Self:
502
- for prop, value in self.properties.items():
503
- if isinstance(value, str) and value == "":
504
- raise ValueError(
505
- f"Property '{prop}' must be a non-empty string for {self.type} data source"
506
- )
507
- return self
508
-
509
-
510
- class TaskRun(KilnParentedModel):
511
- """
512
- Represents a single execution of a Task.
513
-
514
- Contains the input used, its source, the output produced, and optional
515
- repair information if the output needed correction.
516
- """
517
-
518
- input: str = Field(
519
- description="The inputs to the task. JSON formatted for structured input, plaintext for unstructured input."
520
- )
521
- input_source: DataSource | None = Field(
522
- default=None, description="The source of the input: human or synthetic."
523
- )
524
-
525
- output: TaskOutput = Field(description="The output of the task run.")
526
- repair_instructions: str | None = Field(
527
- default=None,
528
- description="Instructions for fixing the output. Should define what is wrong, and how to fix it. Will be used by models for both generating a fixed output, and evaluating future models.",
529
- )
530
- repaired_output: TaskOutput | None = Field(
531
- default=None,
532
- description="An version of the output with issues fixed. This must be a 'fixed' version of the existing output, and not an entirely new output. If you wish to generate an ideal curatorial output for this task unrelated to this output, generate a new TaskOutput with type 'human' instead of using this field.",
533
- )
534
- intermediate_outputs: Dict[str, str] | None = Field(
535
- default=None,
536
- description="Intermediate outputs from the task run. Keys are the names of the intermediate output steps (cot=chain of thought, etc), values are the output data.",
537
- )
538
- tags: List[str] = Field(
539
- default=[],
540
- description="Tags for the task run. Tags are used to categorize task runs for filtering and reporting.",
541
- )
542
-
543
- def has_thinking_training_data(self) -> bool:
544
- """
545
- Does this run have thinking data that we can use to train a thinking model?
546
- """
547
- if self.intermediate_outputs is None:
548
- return False
549
- return (
550
- "chain_of_thought" in self.intermediate_outputs
551
- or "reasoning" in self.intermediate_outputs
552
- )
553
-
554
- def parent_task(self) -> Task | None:
555
- if not isinstance(self.parent, Task):
556
- return None
557
- return self.parent
558
-
559
- @model_validator(mode="after")
560
- def validate_input_format(self, info: ValidationInfo) -> Self:
561
- # Don't validate if loading from file (not new). Too slow.
562
- # We don't allow changing task schema, so this is redundant validation.
563
- # Note: we still validate if editing a loaded model
564
- if self.loading_from_file(info):
565
- # Consider loading an existing model as validated.
566
- self._last_validated_input = self.input
567
- return self
568
-
569
- # Don't validate if input has not changed. Too slow to run this every time.
570
- if (
571
- hasattr(self, "_last_validated_input")
572
- and self.input == self._last_validated_input
573
- ):
574
- return self
575
-
576
- task = self.parent_task()
577
- if task is None:
578
- # don't validate this relationship until we have a path or parent. Give them time to build it (but will catch it before saving)
579
- return self
580
-
581
- # validate output
582
- if task.input_json_schema is not None:
583
- try:
584
- validate_schema(json.loads(self.input), task.input_json_schema)
585
- except json.JSONDecodeError:
586
- raise ValueError("Input is not a valid JSON object")
587
- except jsonschema.exceptions.ValidationError as e:
588
- raise ValueError(f"Input does not match task input schema: {e}")
589
- self._last_validated_input = self.input
590
- return self
591
-
592
- @model_validator(mode="after")
593
- def validate_output_format(self, info: ValidationInfo) -> Self:
594
- # Don't validate if loading from file (not new). Too slow.
595
- # Note: we still validate if editing a loaded model's output.
596
- if self.loading_from_file(info):
597
- # Consider loading an existing model as validated.
598
- self._last_validated_output = self.output.output if self.output else None
599
- return self
600
-
601
- # Don't validate unless output has changed since last validation.
602
- # The validator is slow and costly, don't want it running when setting other fields.
603
- if (
604
- hasattr(self, "_last_validated_output")
605
- and self.output is not None
606
- and self.output.output == self._last_validated_output
607
- ):
608
- return self
609
-
610
- task = self.parent_task()
611
- if task is None:
612
- return self
613
-
614
- self.output.validate_output_format(task)
615
- self._last_validated_output = self.output.output if self.output else None
616
- return self
617
-
618
- @model_validator(mode="after")
619
- def validate_repaired_output(self) -> Self:
620
- if self.repaired_output is not None:
621
- if self.repaired_output.rating is not None:
622
- raise ValueError(
623
- "Repaired output rating must be None. Repaired outputs are assumed to have a perfect rating, as they have been fixed."
624
- )
625
- if self.repair_instructions is None and self.repaired_output is not None:
626
- raise ValueError(
627
- "Repair instructions are required if providing a repaired output."
628
- )
629
- if self.repair_instructions is not None and self.repaired_output is None:
630
- raise ValueError(
631
- "A repaired output is required if providing repair instructions."
632
- )
633
- return self
634
-
635
- @model_validator(mode="after")
636
- def validate_input_source(self, info: ValidationInfo) -> Self:
637
- # On strict mode and not loaded from file, we validate input_source is not None.
638
- # We want to be able to load any data, even if it's not perfect. But we want to create perfect data when adding new data.
639
- if not strict_mode():
640
- return self
641
- if self.loaded_from_file(info):
642
- return self
643
- if self.input_source is None:
644
- raise ValueError("input_source is required when strict mode is enabled")
645
- return self
646
-
647
- @model_validator(mode="after")
648
- def validate_tags(self) -> Self:
649
- for tag in self.tags:
650
- if not tag:
651
- raise ValueError("Tags cannot be empty strings")
652
- if " " in tag:
653
- raise ValueError("Tags cannot contain spaces. Try underscores.")
654
-
655
- return self
656
-
657
-
658
- # Define the type alias for clarity
659
- DatasetFilter = Callable[[TaskRun], bool]
660
-
661
-
662
- def AllDatasetFilter(_: TaskRun) -> bool:
663
- return True
664
-
665
-
666
- def HighRatingDatasetFilter(task_run: TaskRun) -> bool:
667
- if task_run.output is None:
668
- return False
669
- if task_run.repaired_output is not None:
670
- # Repairs always considered high quality
671
- return True
672
- if task_run.output.rating is None:
673
- return False
674
- return task_run.output.rating.is_high_quality()
675
-
676
-
677
def ThinkingModelDatasetFilter(task_run: TaskRun) -> bool:
    """
    A filter that returns True if the task run has intermediate outputs we can
    train a 'thinking' model on (reasoning or chain of thought).

    The decision is delegated to ``task_run.has_thinking_training_data()``.
    """
    has_thinking_data = task_run.has_thinking_training_data()
    return has_thinking_data
682
-
683
-
684
def ThinkingModelHighRatedFilter(task_run: TaskRun) -> bool:
    """
    A filter that returns True if the task run has thinking data and its
    output is high quality.

    The high-rating check is only evaluated when the thinking-data check
    passes.
    """
    has_thinking = ThinkingModelDatasetFilter(task_run)
    if not has_thinking:
        # Hand back the falsy result itself, as `a and b` would.
        return has_thinking
    return HighRatingDatasetFilter(task_run)
689
-
690
-
691
class DatasetFilterType(str, Enum):
    """Names of the available dataset filters.

    Members are also ``str`` instances, so they compare equal to their
    string values.
    """

    # Keep every run.
    ALL = "all"
    # Keep only runs judged high quality.
    HIGH_RATING = "high_rating"
    # Keep only runs that carry thinking (reasoning / chain of thought) data.
    THINKING_MODEL = "thinking_model"
    # Keep runs that satisfy both of the above.
    THINKING_MODEL_HIGH_RATED = "thinking_model_high_rated"
698
-
699
-
700
# Lookup table from each filter name to the callable that implements it.
dataset_filters = {
    DatasetFilterType.ALL: AllDatasetFilter,
    DatasetFilterType.HIGH_RATING: HighRatingDatasetFilter,
    DatasetFilterType.THINKING_MODEL: ThinkingModelDatasetFilter,
    DatasetFilterType.THINKING_MODEL_HIGH_RATED: ThinkingModelHighRatedFilter,
}
706
-
707
-
708
class DatasetSplitDefinition(BaseModel):
    """
    A definition of a split in a dataset.

    Example: name="train", description="The training set", percentage=0.8 (80% of the dataset)
    """

    # Split name, e.g. "train" or "test".
    name: str = NAME_FIELD
    # Optional free-text note for humans.
    description: str | None = Field(
        description="A description of the dataset for you and your team. Not used in training.",
        default=None,
    )
    # Fraction of the dataset assigned to this split, constrained to [0, 1].
    percentage: float = Field(
        description="The percentage of the dataset that this split represents (between 0 and 1).",
        ge=0.0,
        le=1.0,
    )
725
-
726
-
727
# Ready-made split layouts. The percentages in each list add up to 1.0.

# A single split holding the whole dataset.
AllSplitDefinition: list[DatasetSplitDefinition] = [
    DatasetSplitDefinition(name="all", percentage=1.0),
]

# 80% train / 20% test.
Train80Test20SplitDefinition: list[DatasetSplitDefinition] = [
    DatasetSplitDefinition(name="train", percentage=0.8),
    DatasetSplitDefinition(name="test", percentage=0.2),
]

# 60% train / 20% test / 20% validation.
Train60Test20Val20SplitDefinition: list[DatasetSplitDefinition] = [
    DatasetSplitDefinition(name="train", percentage=0.6),
    DatasetSplitDefinition(name="test", percentage=0.2),
    DatasetSplitDefinition(name="val", percentage=0.2),
]

# 80% train / 10% test / 10% validation.
Train80Test10Val10SplitDefinition: list[DatasetSplitDefinition] = [
    DatasetSplitDefinition(name="train", percentage=0.8),
    DatasetSplitDefinition(name="test", percentage=0.1),
    DatasetSplitDefinition(name="val", percentage=0.1),
]
744
-
745
-
746
class DatasetSplit(KilnParentedModel):
    """
    A collection of task runs, with optional splits (train, test, validation).

    Used to freeze a dataset into train/test/validation splits for repeatable fine-tuning or other tasks.

    Maintains a list of IDs for each split, to avoid data duplication.
    """

    name: str = NAME_FIELD
    description: str | None = Field(
        default=None,
        description="A description of the dataset for you and your team. Not used in training.",
    )
    splits: list[DatasetSplitDefinition] = Field(
        default_factory=list,
        description="The splits in the dataset.",
    )
    split_contents: dict[str, list[str]] = Field(
        description="The contents of each split in the dataset. The key is the split name, and the value is a list of task run IDs.",
    )
    filter: DatasetFilterType | None = Field(
        default=None,
        description="The filter used to build the dataset.",
    )

    @model_validator(mode="after")
    def validate_split_percentages(self) -> "DatasetSplit":
        """Ensure the split percentages add up to 1.0.

        Raises:
            ValueError: if the sum is not close to 1.0. An empty ``splits``
                list sums to 0 and is therefore rejected as well.
        """
        total = sum(split.percentage for split in self.splits)
        # isclose tolerates floating-point error from sums like 0.8 + 0.1 + 0.1.
        if not math.isclose(total, 1.0, rel_tol=1e-9):
            raise ValueError(f"The sum of split percentages must be 1.0 (got {total})")
        return self

    @classmethod
    def from_task(
        cls,
        name: str,
        task: "Task",
        splits: list[DatasetSplitDefinition],
        filter_type: DatasetFilterType = DatasetFilterType.ALL,
        description: str | None = None,
    ) -> "DatasetSplit":
        """
        Build a dataset split from a task.

        Args:
            name: name of the new dataset split.
            task: the task whose runs are partitioned; it becomes the parent.
            splits: the split definitions to partition the runs into.
            filter_type: which registered filter selects the eligible runs.
            description: optional description for the new dataset split.

        Returns:
            A new instance parented to ``task`` with ``split_contents`` filled in.
        """
        # Named filter_fn rather than `filter` to avoid shadowing the builtin.
        filter_fn = dataset_filters[filter_type]
        split_contents = cls.build_split_contents(task, splits, filter_fn)
        return cls(
            parent=task,
            name=name,
            description=description,
            splits=splits,
            split_contents=split_contents,
            filter=filter_type,
        )

    @classmethod
    def build_split_contents(
        cls,
        task: "Task",
        splits: list[DatasetSplitDefinition],
        filter: DatasetFilter,
    ) -> dict[str, list[str]]:
        """Randomly assign the IDs of the task's matching runs to the splits.

        Args:
            task: the task whose runs are considered.
            splits: ordered split definitions; each one's ``percentage``
                determines its share of the matching runs.
            filter: predicate deciding which runs are eligible.

        Returns:
            A mapping from split name to the list of run IDs in that split.
            Empty when ``splits`` is empty.
        """
        valid_ids = [task_run.id for task_run in task.runs() if filter(task_run)]

        # Shuffle and split by split percentage
        random.shuffle(valid_ids)
        total = len(valid_ids)
        split_contents = {}
        start_idx = 0

        # Handle all splits except the last one
        for split in splits[:-1]:
            split_size = round(total * split.percentage)
            split_contents[split.name] = valid_ids[start_idx : start_idx + split_size]
            start_idx += split_size

        # Last split gets all remaining items (for rounding)
        if splits:
            split_contents[splits[-1].name] = valid_ids[start_idx:]

        return split_contents

    def parent_task(self) -> "Task | None":
        """Return the parent if it is a Task, otherwise None."""
        # inline import to avoid circular import
        from kiln_ai.datamodel import Task

        if not isinstance(self.parent, Task):
            return None
        return self.parent

    def missing_count(self) -> int:
        """
        Returns:
            int: the number of task runs that have an ID persisted in this dataset split, but no longer exist in the dataset

        Raises:
            ValueError: if this dataset split has no parent task.
        """
        parent = self.parent_task()
        if parent is None:
            raise ValueError("DatasetSplit has no parent task")

        existing_ids = {run.id for run in parent.runs(readonly=True)}
        ids_in_splits = {
            run_id for ids in self.split_contents.values() for run_id in ids
        }
        return len(ids_in_splits - existing_ids)
857
-
858
-
859
class Prompt(KilnParentedModel):
    """
    A prompt for a task.
    """

    name: str = NAME_FIELD
    # The prompt text; must be non-empty.
    prompt: str = Field(
        min_length=1,
        description="The prompt for the task.",
    )
    # Optional chain-of-thought guidance.
    chain_of_thought_instructions: str | None = Field(
        description="Instructions for the model 'thinking' about the requirement prior to answering. Used for chain of thought style prompting. COT will not be used unless this is provided.",
        default=None,
    )
873
-
874
-
875
class TaskRequirement(BaseModel):
    """
    Defines a specific requirement that should be met by task outputs.

    Includes an identifier, name, description, instruction for meeting the requirement,
    priority level, and rating type (five_star, pass_fail, pass_fail_critical, custom).
    """

    # Identity and labelling.
    id: ID_TYPE = ID_FIELD
    name: str = SHORT_NAME_FIELD
    description: str | None = Field(default=None)

    # How to satisfy the requirement; must be non-empty.
    instruction: str = Field(min_length=1)

    # Defaults: priority p2, rated on a five-star scale.
    priority: Priority = Field(default=Priority.p2)
    type: TaskOutputRatingType = Field(default=TaskOutputRatingType.five_star)
889
-
890
-
891
class TaskDeterminism(str, Enum):
    """
    Describes how closely task outputs are expected to match expected results.

    - deterministic: outputs must match exactly
    - semantic_match: wording may differ as long as the meaning is the same
    - flexible: both wording and meaning may vary within the requirements
    """

    # Expect exact match
    deterministic = "deterministic"
    # Expect same meaning, but flexible on expression of the meaning
    semantic_match = "semantic_match"
    # Flexible on semantic output. Eval should be custom based on parsing requirements.
    flexible = "flexible"
903
-
904
-
905
class Task(
    KilnParentedModel,
    KilnParentModel,
    parent_of={
        "runs": TaskRun,
        "dataset_splits": DatasetSplit,
        "finetunes": Finetune,
        "prompts": Prompt,
    },
):
    """
    Represents a specific task to be performed, with associated requirements and validation rules.

    Contains the task definition, requirements, input/output schemas, and maintains
    a collection of task runs.
    """

    name: str = NAME_FIELD
    description: str | None = Field(
        description="A description of the task for you and your team. Will not be used in prompts/training/validation.",
        default=None,
    )
    instruction: str = Field(
        description="The instructions for the task. Will be used in prompts/training/validation.",
        min_length=1,
    )
    requirements: List[TaskRequirement] = Field(default=[])
    # JSON schemas for output and input; None means no schema is set.
    output_json_schema: JsonObjectSchema | None = None
    input_json_schema: JsonObjectSchema | None = None
    thinking_instruction: str | None = Field(
        description="Instructions for the model 'thinking' about the requirement prior to answering. Used for chain of thought style prompting.",
        default=None,
    )

    def output_schema(self) -> Dict | None:
        """Return the parsed output schema, or None when none is set."""
        raw_schema = self.output_json_schema
        return None if raw_schema is None else schema_from_json_str(raw_schema)

    def input_schema(self) -> Dict | None:
        """Return the parsed input schema, or None when none is set."""
        raw_schema = self.input_json_schema
        return None if raw_schema is None else schema_from_json_str(raw_schema)

    # The accessors below only narrow the return types of the parent-class
    # accessors for the type checker. TODO P2: fix this in KilnParentModel
    def runs(self, readonly: bool = False) -> list[TaskRun]:
        """Return this task's runs, forwarding ``readonly`` to the parent accessor."""
        return super().runs(readonly=readonly)  # type: ignore

    def dataset_splits(self, readonly: bool = False) -> list[DatasetSplit]:
        """Return this task's dataset splits, forwarding ``readonly``."""
        return super().dataset_splits(readonly=readonly)  # type: ignore

    def finetunes(self, readonly: bool = False) -> list[Finetune]:
        """Return this task's finetunes, forwarding ``readonly``."""
        return super().finetunes(readonly=readonly)  # type: ignore

    def prompts(self, readonly: bool = False) -> list[Prompt]:
        """Return this task's prompts, forwarding ``readonly``."""
        return super().prompts(readonly=readonly)  # type: ignore
961
-
962
-
963
class Project(KilnParentModel, parent_of={"tasks": Task}):
    """
    A collection of related tasks.

    Projects organize tasks into logical groups and provide high-level descriptions
    of the overall goals.
    """

    name: str = NAME_FIELD
    description: str | None = Field(
        default=None,
        description="A description of the project for you and your team. Will not be used in prompts/training/validation.",
    )

    # Needed for typechecking. TODO P2: fix this in KilnParentModel
    def tasks(self, readonly: bool = False) -> list[Task]:
        """Return the tasks belonging to this project.

        Args:
            readonly: forwarded to the parent-class ``tasks`` accessor, the
                same way the child accessors on ``Task`` forward it.
                Defaults to False, so existing callers are unaffected.
        """
        # NOTE(review): assumes the parent-class `tasks` accessor accepts
        # `readonly`, as the Task child accessors do — confirm in
        # KilnParentModel.
        return super().tasks(readonly=readonly)  # type: ignore