kiln-ai 0.8.1__py3-none-any.whl → 0.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of kiln-ai might be problematic. Click here for more details.

Files changed (88)
  1. kiln_ai/adapters/__init__.py +7 -7
  2. kiln_ai/adapters/adapter_registry.py +81 -10
  3. kiln_ai/adapters/data_gen/data_gen_task.py +21 -3
  4. kiln_ai/adapters/data_gen/test_data_gen_task.py +23 -3
  5. kiln_ai/adapters/eval/base_eval.py +164 -0
  6. kiln_ai/adapters/eval/eval_runner.py +267 -0
  7. kiln_ai/adapters/eval/g_eval.py +367 -0
  8. kiln_ai/adapters/eval/registry.py +16 -0
  9. kiln_ai/adapters/eval/test_base_eval.py +324 -0
  10. kiln_ai/adapters/eval/test_eval_runner.py +640 -0
  11. kiln_ai/adapters/eval/test_g_eval.py +497 -0
  12. kiln_ai/adapters/eval/test_g_eval_data.py +4 -0
  13. kiln_ai/adapters/fine_tune/base_finetune.py +5 -1
  14. kiln_ai/adapters/fine_tune/dataset_formatter.py +310 -65
  15. kiln_ai/adapters/fine_tune/fireworks_finetune.py +47 -32
  16. kiln_ai/adapters/fine_tune/openai_finetune.py +12 -11
  17. kiln_ai/adapters/fine_tune/test_base_finetune.py +19 -0
  18. kiln_ai/adapters/fine_tune/test_dataset_formatter.py +472 -129
  19. kiln_ai/adapters/fine_tune/test_fireworks_tinetune.py +114 -22
  20. kiln_ai/adapters/fine_tune/test_openai_finetune.py +125 -14
  21. kiln_ai/adapters/ml_model_list.py +434 -93
  22. kiln_ai/adapters/model_adapters/__init__.py +18 -0
  23. kiln_ai/adapters/model_adapters/base_adapter.py +250 -0
  24. kiln_ai/adapters/model_adapters/langchain_adapters.py +309 -0
  25. kiln_ai/adapters/model_adapters/openai_compatible_config.py +10 -0
  26. kiln_ai/adapters/model_adapters/openai_model_adapter.py +289 -0
  27. kiln_ai/adapters/model_adapters/test_base_adapter.py +199 -0
  28. kiln_ai/adapters/{test_langchain_adapter.py → model_adapters/test_langchain_adapter.py} +105 -97
  29. kiln_ai/adapters/model_adapters/test_openai_model_adapter.py +216 -0
  30. kiln_ai/adapters/{test_saving_adapter_results.py → model_adapters/test_saving_adapter_results.py} +80 -30
  31. kiln_ai/adapters/{test_structured_output.py → model_adapters/test_structured_output.py} +125 -46
  32. kiln_ai/adapters/ollama_tools.py +0 -1
  33. kiln_ai/adapters/parsers/__init__.py +10 -0
  34. kiln_ai/adapters/parsers/base_parser.py +12 -0
  35. kiln_ai/adapters/parsers/json_parser.py +37 -0
  36. kiln_ai/adapters/parsers/parser_registry.py +19 -0
  37. kiln_ai/adapters/parsers/r1_parser.py +69 -0
  38. kiln_ai/adapters/parsers/test_json_parser.py +81 -0
  39. kiln_ai/adapters/parsers/test_parser_registry.py +32 -0
  40. kiln_ai/adapters/parsers/test_r1_parser.py +144 -0
  41. kiln_ai/adapters/prompt_builders.py +193 -49
  42. kiln_ai/adapters/provider_tools.py +91 -36
  43. kiln_ai/adapters/repair/repair_task.py +18 -19
  44. kiln_ai/adapters/repair/test_repair_task.py +7 -7
  45. kiln_ai/adapters/run_output.py +11 -0
  46. kiln_ai/adapters/test_adapter_registry.py +177 -0
  47. kiln_ai/adapters/test_generate_docs.py +69 -0
  48. kiln_ai/adapters/test_ollama_tools.py +0 -1
  49. kiln_ai/adapters/test_prompt_adaptors.py +25 -18
  50. kiln_ai/adapters/test_prompt_builders.py +265 -44
  51. kiln_ai/adapters/test_provider_tools.py +268 -46
  52. kiln_ai/datamodel/__init__.py +51 -772
  53. kiln_ai/datamodel/basemodel.py +31 -11
  54. kiln_ai/datamodel/datamodel_enums.py +58 -0
  55. kiln_ai/datamodel/dataset_filters.py +114 -0
  56. kiln_ai/datamodel/dataset_split.py +170 -0
  57. kiln_ai/datamodel/eval.py +298 -0
  58. kiln_ai/datamodel/finetune.py +105 -0
  59. kiln_ai/datamodel/json_schema.py +14 -3
  60. kiln_ai/datamodel/model_cache.py +8 -3
  61. kiln_ai/datamodel/project.py +23 -0
  62. kiln_ai/datamodel/prompt.py +37 -0
  63. kiln_ai/datamodel/prompt_id.py +83 -0
  64. kiln_ai/datamodel/strict_mode.py +24 -0
  65. kiln_ai/datamodel/task.py +181 -0
  66. kiln_ai/datamodel/task_output.py +321 -0
  67. kiln_ai/datamodel/task_run.py +164 -0
  68. kiln_ai/datamodel/test_basemodel.py +80 -2
  69. kiln_ai/datamodel/test_dataset_filters.py +71 -0
  70. kiln_ai/datamodel/test_dataset_split.py +127 -6
  71. kiln_ai/datamodel/test_datasource.py +3 -2
  72. kiln_ai/datamodel/test_eval_model.py +635 -0
  73. kiln_ai/datamodel/test_example_models.py +34 -17
  74. kiln_ai/datamodel/test_json_schema.py +23 -0
  75. kiln_ai/datamodel/test_model_cache.py +24 -0
  76. kiln_ai/datamodel/test_model_perf.py +125 -0
  77. kiln_ai/datamodel/test_models.py +131 -2
  78. kiln_ai/datamodel/test_prompt_id.py +129 -0
  79. kiln_ai/datamodel/test_task.py +159 -0
  80. kiln_ai/utils/config.py +6 -1
  81. kiln_ai/utils/exhaustive_error.py +6 -0
  82. {kiln_ai-0.8.1.dist-info → kiln_ai-0.12.0.dist-info}/METADATA +45 -7
  83. kiln_ai-0.12.0.dist-info/RECORD +100 -0
  84. kiln_ai/adapters/base_adapter.py +0 -191
  85. kiln_ai/adapters/langchain_adapters.py +0 -256
  86. kiln_ai-0.8.1.dist-info/RECORD +0 -58
  87. {kiln_ai-0.8.1.dist-info → kiln_ai-0.12.0.dist-info}/WHEEL +0 -0
  88. {kiln_ai-0.8.1.dist-info → kiln_ai-0.12.0.dist-info}/licenses/LICENSE.txt +0 -0
@@ -1,50 +1,59 @@
1
1
  """
2
- See our docs for details about our datamodel: https://kiln-ai.github.io/Kiln/kiln_core_docs/kiln_ai.html
3
- """
2
+ See our docs for details about our datamodel classes and hierarchy:
4
3
 
5
- from __future__ import annotations
4
+ Developer docs: https://kiln-ai.github.io/Kiln/kiln_core_docs/kiln_ai.html
6
5
 
7
- import json
8
- import math
9
- import random
10
- from enum import Enum, IntEnum
11
- from typing import TYPE_CHECKING, Callable, Dict, List, Type, Union
6
+ User docs: https://docs.getkiln.ai/developers/kiln-datamodel
7
+ """
12
8
 
13
- import jsonschema
14
- import jsonschema.exceptions
15
- from pydantic import (
16
- BaseModel,
17
- Field,
18
- ValidationInfo,
19
- model_validator,
20
- )
21
- from typing_extensions import Self
9
+ # This component uses "flat" imports so we don't have too much internal structure exposed in the API.
10
+ # for example you can just `from datamodel import Task, Project` instead of `from datamodel.task import Task; from datamodel.project import Project`
22
11
 
23
- from kiln_ai.datamodel.json_schema import JsonObjectSchema, schema_from_json_str
12
+ from __future__ import annotations
24
13
 
25
- from .basemodel import (
26
- ID_FIELD,
27
- ID_TYPE,
28
- NAME_FIELD,
29
- SHORT_NAME_FIELD,
30
- KilnBaseModel,
31
- KilnParentedModel,
32
- KilnParentModel,
14
+ from kiln_ai.datamodel import dataset_split, eval, strict_mode
15
+ from kiln_ai.datamodel.datamodel_enums import (
16
+ FinetuneDataStrategy,
17
+ FineTuneStatusType,
18
+ Priority,
19
+ StructuredOutputMode,
20
+ TaskOutputRatingType,
21
+ )
22
+ from kiln_ai.datamodel.dataset_split import (
23
+ DatasetSplit,
24
+ DatasetSplitDefinition,
25
+ )
26
+ from kiln_ai.datamodel.finetune import (
27
+ Finetune,
28
+ )
29
+ from kiln_ai.datamodel.project import Project
30
+ from kiln_ai.datamodel.prompt import BasePrompt, Prompt
31
+ from kiln_ai.datamodel.prompt_id import (
32
+ PromptGenerators,
33
+ PromptId,
34
+ prompt_generator_values,
35
+ )
36
+ from kiln_ai.datamodel.task import Task, TaskRequirement
37
+ from kiln_ai.datamodel.task_output import (
38
+ DataSource,
39
+ DataSourceProperty,
40
+ DataSourceType,
41
+ RequirementRating,
42
+ TaskOutput,
43
+ TaskOutputRating,
44
+ )
45
+ from kiln_ai.datamodel.task_run import (
46
+ TaskRun,
33
47
  )
34
- from .json_schema import validate_schema
35
-
36
- if TYPE_CHECKING:
37
- from . import Task
38
-
39
48
 
40
49
  __all__ = [
41
- "basemodel",
42
- "json_schema",
50
+ "strict_mode",
51
+ "dataset_split",
52
+ "eval",
43
53
  "Task",
44
54
  "Project",
45
55
  "TaskRun",
46
56
  "TaskOutput",
47
- "TaskOutputRating",
48
57
  "Priority",
49
58
  "DataSource",
50
59
  "DataSourceType",
@@ -53,746 +62,16 @@ __all__ = [
53
62
  "FineTuneStatusType",
54
63
  "TaskOutputRatingType",
55
64
  "TaskRequirement",
56
- "TaskDeterminism",
57
65
  "DatasetSplitDefinition",
58
66
  "DatasetSplit",
59
67
  "RequirementRating",
60
68
  "TaskRequirement",
61
- "strict_mode",
62
- "set_strict_mode",
63
- ]
64
-
65
-
66
- # We want to be hard on ourselves for data completeness generated by the Kiln App, but don't want to make it hard for users to use the datamodel/library.
67
- # Strict mode enables extra validations that we want to enforce in Kiln App (and any other client that wants best practices), but not in the library (unless they opt in)
68
- _strict_mode: bool = False
69
-
70
-
71
- def strict_mode() -> bool:
72
- return _strict_mode
73
-
74
-
75
- def set_strict_mode(value: bool) -> None:
76
- global _strict_mode
77
- _strict_mode = value
78
-
79
-
80
- class Priority(IntEnum):
81
- """Defines priority levels for tasks and requirements, where P0 is highest priority."""
82
-
83
- p0 = 0
84
- p1 = 1
85
- p2 = 2
86
- p3 = 3
87
-
88
-
89
- # Only one rating type for now, but this allows for extensibility if we want to add more in the future
90
- class TaskOutputRatingType(str, Enum):
91
- """Defines the types of rating systems available for task outputs."""
92
-
93
- five_star = "five_star"
94
- pass_fail = "pass_fail"
95
- pass_fail_critical = "pass_fail_critical"
96
- custom = "custom"
97
-
98
-
99
- class RequirementRating(BaseModel):
100
- """Rating for a specific requirement within a task output."""
101
-
102
- value: float = Field(
103
- description="The rating value. Interpretation depends on rating type"
104
- )
105
- type: TaskOutputRatingType = Field(description="The type of rating")
106
-
107
-
108
- class TaskOutputRating(KilnBaseModel):
109
- """
110
- A rating for a task output, including an overall rating and ratings for each requirement.
111
-
112
- Supports:
113
- - five_star: 1-5 star ratings
114
- - pass_fail: boolean pass/fail (1.0 = pass, 0.0 = fail)
115
- - pass_fail_critical: tri-state (1.0 = pass, 0.0 = fail, -1.0 = critical fail)
116
- """
117
-
118
- type: TaskOutputRatingType = Field(default=TaskOutputRatingType.five_star)
119
- value: float | None = Field(
120
- description="The rating value. Interpretation depends on rating type:\n- five_star: 1-5 stars\n- pass_fail: 1.0 (pass) or 0.0 (fail)\n- pass_fail_critical: 1.0 (pass), 0.0 (fail), or -1.0 (critical fail)",
121
- default=None,
122
- )
123
- requirement_ratings: Dict[ID_TYPE, RequirementRating] = Field(
124
- default={},
125
- description="The ratings of the requirements of the task.",
126
- )
127
-
128
- # Previously we stored rating values as a dict of floats, but now we store them as RequirementRating objects.
129
- @model_validator(mode="before")
130
- def upgrade_old_format(cls, data: dict) -> dict:
131
- if not isinstance(data, dict):
132
- return data
133
-
134
- # Check if we have the old format (dict of floats)
135
- req_ratings = data.get("requirement_ratings", {})
136
- if req_ratings and all(
137
- isinstance(v, (int, float)) for v in req_ratings.values()
138
- ):
139
- # Convert each float to a RequirementRating object
140
- # all ratings are five star at the point we used this format
141
- data["requirement_ratings"] = {
142
- k: {"value": v, "type": TaskOutputRatingType.five_star}
143
- for k, v in req_ratings.items()
144
- }
145
-
146
- return data
147
-
148
- # Used to select high quality outputs for example selection (MultiShotPromptBuilder, etc)
149
- def is_high_quality(self) -> bool:
150
- if self.value is None:
151
- return False
152
-
153
- if self.type == TaskOutputRatingType.five_star:
154
- return self.value >= 4
155
- elif self.type == TaskOutputRatingType.pass_fail:
156
- return self.value == 1.0
157
- elif self.type == TaskOutputRatingType.pass_fail_critical:
158
- return self.value == 1.0
159
- return False
160
-
161
- @model_validator(mode="after")
162
- def validate_rating(self) -> Self:
163
- if self.type not in TaskOutputRatingType:
164
- raise ValueError(f"Invalid rating type: {self.type}")
165
-
166
- # Overall rating is optional
167
- if self.value is not None:
168
- self._validate_rating(self.type, self.value, "overall rating")
169
-
170
- for req_id, req_rating in self.requirement_ratings.items():
171
- self._validate_rating(
172
- req_rating.type,
173
- req_rating.value,
174
- f"requirement rating for req ID: {req_id}",
175
- )
176
-
177
- return self
178
-
179
- def _validate_rating(
180
- self, type: TaskOutputRatingType, rating: float | None, rating_name: str
181
- ) -> None:
182
- if type == TaskOutputRatingType.five_star:
183
- self._validate_five_star(rating, rating_name)
184
- elif type == TaskOutputRatingType.pass_fail:
185
- self._validate_pass_fail(rating, rating_name)
186
- elif type == TaskOutputRatingType.pass_fail_critical:
187
- self._validate_pass_fail_critical(rating, rating_name)
188
-
189
- def _validate_five_star(self, rating: float | None, rating_name: str) -> None:
190
- if rating is None or not isinstance(rating, float) or not rating.is_integer():
191
- raise ValueError(
192
- f"{rating_name.capitalize()} of type five_star must be an integer value (1-5)"
193
- )
194
- if rating < 1 or rating > 5:
195
- raise ValueError(
196
- f"{rating_name.capitalize()} of type five_star must be between 1 and 5 stars"
197
- )
198
-
199
- def _validate_pass_fail(self, rating: float | None, rating_name: str) -> None:
200
- if rating is None or not isinstance(rating, float) or not rating.is_integer():
201
- raise ValueError(
202
- f"{rating_name.capitalize()} of type pass_fail must be an integer value (0 or 1)"
203
- )
204
- if rating not in [0, 1]:
205
- raise ValueError(
206
- f"{rating_name.capitalize()} of type pass_fail must be 0 (fail) or 1 (pass)"
207
- )
208
-
209
- def _validate_pass_fail_critical(
210
- self, rating: float | None, rating_name: str
211
- ) -> None:
212
- if rating is None or not isinstance(rating, float) or not rating.is_integer():
213
- raise ValueError(
214
- f"{rating_name.capitalize()} of type pass_fail_critical must be an integer value (-1, 0, or 1)"
215
- )
216
- if rating not in [-1, 0, 1]:
217
- raise ValueError(
218
- f"{rating_name.capitalize()} of type pass_fail_critical must be -1 (critical fail), 0 (fail), or 1 (pass)"
219
- )
220
-
221
-
222
- class TaskOutput(KilnBaseModel):
223
- """
224
- An output for a specific task run.
225
-
226
- Contains the actual output content, its source (human or synthetic),
227
- and optional rating information.
228
- """
229
-
230
- output: str = Field(
231
- description="The output of the task. JSON formatted for structured output, plaintext for unstructured output."
232
- )
233
- source: DataSource | None = Field(
234
- description="The source of the output: human or synthetic.",
235
- default=None,
236
- )
237
- rating: TaskOutputRating | None = Field(
238
- default=None, description="The rating of the output"
239
- )
240
-
241
- def validate_output_format(self, task: Task) -> Self:
242
- # validate output
243
- if task.output_json_schema is not None:
244
- try:
245
- validate_schema(json.loads(self.output), task.output_json_schema)
246
- except json.JSONDecodeError:
247
- raise ValueError("Output is not a valid JSON object")
248
- except jsonschema.exceptions.ValidationError as e:
249
- raise ValueError(f"Output does not match task output schema: {e}")
250
- return self
251
-
252
- @model_validator(mode="after")
253
- def validate_output_source(self, info: ValidationInfo) -> Self:
254
- # On strict mode and not loaded from file, we validate output_source is not None.
255
- # We want to be able to load any data, even if it's not perfect. But we want to create perfect data when adding new data.
256
- if not strict_mode():
257
- return self
258
- if self.loaded_from_file(info):
259
- return self
260
- if self.source is None:
261
- raise ValueError("Output source is required when strict mode is enabled")
262
- return self
263
-
264
-
265
- class FineTuneStatusType(str, Enum):
266
- """
267
- The status type of a fine-tune (running, completed, failed, etc).
268
- """
269
-
270
- unknown = "unknown" # server error
271
- pending = "pending"
272
- running = "running"
273
- completed = "completed"
274
- failed = "failed"
275
-
276
-
277
- class Finetune(KilnParentedModel):
278
- name: str = NAME_FIELD
279
- description: str | None = Field(
280
- default=None,
281
- description="A description of the fine-tune for you and your team. Not used in training.",
282
- )
283
- provider: str = Field(
284
- description="The provider to use for the fine-tune (e.g. 'openai')."
285
- )
286
- base_model_id: str = Field(
287
- description="The id of the base model to use for the fine-tune. This string relates to the provider's IDs for their own models, not Kiln IDs."
288
- )
289
- provider_id: str | None = Field(
290
- default=None,
291
- description="The ID of the fine-tune job on the provider's side. May not be the same as the fine_tune_model_id.",
292
- )
293
- fine_tune_model_id: str | None = Field(
294
- default=None,
295
- description="The ID of the fine-tuned model on the provider's side. May not be the same as the provider_id.",
296
- )
297
- dataset_split_id: str = Field(
298
- description="The ID of the dataset split to use for this fine-tune.",
299
- )
300
- train_split_name: str = Field(
301
- default="train",
302
- description="The name of the training split to use for this fine-tune.",
303
- )
304
- validation_split_name: str | None = Field(
305
- default=None,
306
- description="The name of the validation split to use for this fine-tune. Optional.",
307
- )
308
- parameters: dict[str, str | int | float | bool] = Field(
309
- default={},
310
- description="The parameters to use for this fine-tune. These are provider-specific.",
311
- )
312
- system_message: str = Field(
313
- description="The system message to use for this fine-tune.",
314
- )
315
- latest_status: FineTuneStatusType = Field(
316
- default=FineTuneStatusType.unknown,
317
- description="The latest known status of this fine-tune. Not updated in real time.",
318
- )
319
- properties: Dict[str, str | int | float] = Field(
320
- default={},
321
- description="Properties of the fine-tune. Different providers may use different properties.",
322
- )
323
-
324
- def parent_task(self) -> Task | None:
325
- if not isinstance(self.parent, Task):
326
- return None
327
- return self.parent
328
-
329
-
330
- class DataSourceType(str, Enum):
331
- """
332
- The source type of a piece of data.
333
-
334
- Human: a human created the data
335
- Synthetic: a model created the data
336
- """
337
-
338
- human = "human"
339
- synthetic = "synthetic"
340
-
341
-
342
- class DataSourceProperty(BaseModel):
343
- """
344
- Defines a property that can be associated with a data source.
345
-
346
- Includes validation rules for when properties are required or not allowed
347
- based on the data source type.
348
- """
349
-
350
- name: str
351
- type: Type[Union[str, int, float]]
352
- required_for: List[DataSourceType] = []
353
- not_allowed_for: List[DataSourceType] = []
354
-
355
-
356
- class DataSource(BaseModel):
357
- """
358
- Represents the origin of data, either human or synthetic, with associated properties.
359
-
360
- Properties vary based on the source type - for synthetic sources this includes
361
- model information, for human sources this includes creator information.
362
- """
363
-
364
- type: DataSourceType
365
- properties: Dict[str, str | int | float] = Field(
366
- default={},
367
- description="Properties describing the data source. For synthetic things like model. For human, the human's name.",
368
- )
369
-
370
- _data_source_properties = [
371
- DataSourceProperty(
372
- name="created_by",
373
- type=str,
374
- required_for=[DataSourceType.human],
375
- not_allowed_for=[DataSourceType.synthetic],
376
- ),
377
- DataSourceProperty(
378
- name="model_name",
379
- type=str,
380
- required_for=[DataSourceType.synthetic],
381
- not_allowed_for=[DataSourceType.human],
382
- ),
383
- DataSourceProperty(
384
- name="model_provider",
385
- type=str,
386
- required_for=[DataSourceType.synthetic],
387
- not_allowed_for=[DataSourceType.human],
388
- ),
389
- DataSourceProperty(
390
- name="adapter_name",
391
- type=str,
392
- required_for=[DataSourceType.synthetic],
393
- not_allowed_for=[DataSourceType.human],
394
- ),
395
- DataSourceProperty(
396
- name="prompt_builder_name",
397
- type=str,
398
- not_allowed_for=[DataSourceType.human],
399
- ),
400
- ]
401
-
402
- @model_validator(mode="after")
403
- def validate_type(self) -> "DataSource":
404
- if self.type not in DataSourceType:
405
- raise ValueError(f"Invalid data source type: {self.type}")
406
- return self
407
-
408
- @model_validator(mode="after")
409
- def validate_properties(self) -> "DataSource":
410
- for prop in self._data_source_properties:
411
- # Check the property type is correct
412
- if prop.name in self.properties:
413
- if not isinstance(self.properties[prop.name], prop.type):
414
- raise ValueError(
415
- f"'{prop.name}' must be of type {prop.type.__name__} for {self.type} data source"
416
- )
417
- # Check the property is required for the data source type
418
- if self.type in prop.required_for:
419
- if prop.name not in self.properties:
420
- raise ValueError(
421
- f"'{prop.name}' is required for {self.type} data source"
422
- )
423
- # Check the property is not allowed for the data source type
424
- elif self.type in prop.not_allowed_for and prop.name in self.properties:
425
- raise ValueError(
426
- f"'{prop.name}' is not allowed for {self.type} data source"
427
- )
428
- return self
429
-
430
- @model_validator(mode="after")
431
- def validate_no_empty_properties(self) -> Self:
432
- for prop, value in self.properties.items():
433
- if isinstance(value, str) and value == "":
434
- raise ValueError(
435
- f"Property '{prop}' must be a non-empty string for {self.type} data source"
436
- )
437
- return self
438
-
439
-
440
- class TaskRun(KilnParentedModel):
441
- """
442
- Represents a single execution of a Task.
443
-
444
- Contains the input used, its source, the output produced, and optional
445
- repair information if the output needed correction.
446
- """
447
-
448
- input: str = Field(
449
- description="The inputs to the task. JSON formatted for structured input, plaintext for unstructured input."
450
- )
451
- input_source: DataSource | None = Field(
452
- default=None, description="The source of the input: human or synthetic."
453
- )
454
-
455
- output: TaskOutput = Field(description="The output of the task run.")
456
- repair_instructions: str | None = Field(
457
- default=None,
458
- description="Instructions for fixing the output. Should define what is wrong, and how to fix it. Will be used by models for both generating a fixed output, and evaluating future models.",
459
- )
460
- repaired_output: TaskOutput | None = Field(
461
- default=None,
462
- description="An version of the output with issues fixed. This must be a 'fixed' version of the existing output, and not an entirely new output. If you wish to generate an ideal curatorial output for this task unrelated to this output, generate a new TaskOutput with type 'human' instead of using this field.",
463
- )
464
- intermediate_outputs: Dict[str, str] | None = Field(
465
- default=None,
466
- description="Intermediate outputs from the task run. Keys are the names of the intermediate output steps (cot=chain of thought, etc), values are the output data.",
467
- )
468
- tags: List[str] = Field(
469
- default=[],
470
- description="Tags for the task run. Tags are used to categorize task runs for filtering and reporting.",
471
- )
472
-
473
- def parent_task(self) -> Task | None:
474
- if not isinstance(self.parent, Task):
475
- return None
476
- return self.parent
477
-
478
- @model_validator(mode="after")
479
- def validate_input_format(self) -> Self:
480
- task = self.parent_task()
481
- if task is None:
482
- # don't validate this relationship until we have a path or parent. Give them time to build it (but will catch it before saving)
483
- return self
484
-
485
- # validate output
486
- if task.input_json_schema is not None:
487
- try:
488
- validate_schema(json.loads(self.input), task.input_json_schema)
489
- except json.JSONDecodeError:
490
- raise ValueError("Input is not a valid JSON object")
491
- except jsonschema.exceptions.ValidationError as e:
492
- raise ValueError(f"Input does not match task input schema: {e}")
493
- return self
494
-
495
- @model_validator(mode="after")
496
- def validate_output_format(self) -> Self:
497
- task = self.parent_task()
498
- if task is None:
499
- return self
500
-
501
- self.output.validate_output_format(task)
502
- return self
503
-
504
- @model_validator(mode="after")
505
- def validate_repaired_output(self) -> Self:
506
- if self.repaired_output is not None:
507
- if self.repaired_output.rating is not None:
508
- raise ValueError(
509
- "Repaired output rating must be None. Repaired outputs are assumed to have a perfect rating, as they have been fixed."
510
- )
511
- if self.repair_instructions is None and self.repaired_output is not None:
512
- raise ValueError(
513
- "Repair instructions are required if providing a repaired output."
514
- )
515
- if self.repair_instructions is not None and self.repaired_output is None:
516
- raise ValueError(
517
- "A repaired output is required if providing repair instructions."
518
- )
519
- return self
520
-
521
- @model_validator(mode="after")
522
- def validate_input_source(self, info: ValidationInfo) -> Self:
523
- # On strict mode and not loaded from file, we validate input_source is not None.
524
- # We want to be able to load any data, even if it's not perfect. But we want to create perfect data when adding new data.
525
- if not strict_mode():
526
- return self
527
- if self.loaded_from_file(info):
528
- return self
529
- if self.input_source is None:
530
- raise ValueError("input_source is required when strict mode is enabled")
531
- return self
532
-
533
- @model_validator(mode="after")
534
- def validate_tags(self) -> Self:
535
- for tag in self.tags:
536
- if not tag:
537
- raise ValueError("Tags cannot be empty strings")
538
- if " " in tag:
539
- raise ValueError("Tags cannot contain spaces. Try underscores.")
540
-
541
- return self
542
-
543
-
544
- # Define the type alias for clarity
545
- DatasetFilter = Callable[[TaskRun], bool]
546
-
547
-
548
- def AllDatasetFilter(_: TaskRun) -> bool:
549
- return True
550
-
551
-
552
- def HighRatingDatasetFilter(task_run: TaskRun) -> bool:
553
- if task_run.output is None or task_run.output.rating is None:
554
- return False
555
- return task_run.output.rating.is_high_quality()
556
-
557
-
558
- class DatasetSplitDefinition(BaseModel):
559
- """
560
- A definition of a split in a dataset.
561
-
562
- Example: name="train", description="The training set", percentage=0.8 (80% of the dataset)
563
- """
564
-
565
- name: str = NAME_FIELD
566
- description: str | None = Field(
567
- default=None,
568
- description="A description of the dataset for you and your team. Not used in training.",
569
- )
570
- percentage: float = Field(
571
- ge=0.0,
572
- le=1.0,
573
- description="The percentage of the dataset that this split represents (between 0 and 1).",
574
- )
575
-
576
-
577
- AllSplitDefinition: list[DatasetSplitDefinition] = [
578
- DatasetSplitDefinition(name="all", percentage=1.0)
579
- ]
580
- Train80Test20SplitDefinition: list[DatasetSplitDefinition] = [
581
- DatasetSplitDefinition(name="train", percentage=0.8),
582
- DatasetSplitDefinition(name="test", percentage=0.2),
583
- ]
584
- Train60Test20Val20SplitDefinition: list[DatasetSplitDefinition] = [
585
- DatasetSplitDefinition(name="train", percentage=0.6),
586
- DatasetSplitDefinition(name="test", percentage=0.2),
587
- DatasetSplitDefinition(name="val", percentage=0.2),
69
+ "BasePrompt",
70
+ "Prompt",
71
+ "TaskOutputRating",
72
+ "StructuredOutputMode",
73
+ "FinetuneDataStrategy",
74
+ "PromptId",
75
+ "PromptGenerators",
76
+ "prompt_generator_values",
588
77
  ]
589
-
590
-
591
- class DatasetSplit(KilnParentedModel):
592
- """
593
- A collection of task runs, with optional splits (train, test, validation).
594
-
595
- Used to freeze a dataset into train/test/validation splits for repeatable fine-tuning or other tasks.
596
-
597
- Maintains a list of IDs for each split, to avoid data duplication.
598
- """
599
-
600
- name: str = NAME_FIELD
601
- description: str | None = Field(
602
- default=None,
603
- description="A description of the dataset for you and your team. Not used in training.",
604
- )
605
- splits: list[DatasetSplitDefinition] = Field(
606
- default_factory=list,
607
- description="The splits in the dataset.",
608
- )
609
- split_contents: dict[str, list[str]] = Field(
610
- description="The contents of each split in the dataset. The key is the split name, and the value is a list of task run IDs.",
611
- )
612
-
613
- @model_validator(mode="after")
614
- def validate_split_percentages(self) -> "DatasetSplit":
615
- total = sum(split.percentage for split in self.splits)
616
- if not math.isclose(total, 1.0, rel_tol=1e-9):
617
- raise ValueError(f"The sum of split percentages must be 1.0 (got {total})")
618
- return self
619
-
620
- @classmethod
621
- def from_task(
622
- cls,
623
- name: str,
624
- task: "Task",
625
- splits: list[DatasetSplitDefinition],
626
- filter: DatasetFilter = AllDatasetFilter,
627
- description: str | None = None,
628
- ):
629
- """
630
- Build a dataset split from a task.
631
- """
632
- split_contents = cls.build_split_contents(task, splits, filter)
633
- return cls(
634
- parent=task,
635
- name=name,
636
- description=description,
637
- splits=splits,
638
- split_contents=split_contents,
639
- )
640
-
641
- @classmethod
642
- def build_split_contents(
643
- cls,
644
- task: "Task",
645
- splits: list[DatasetSplitDefinition],
646
- filter: DatasetFilter,
647
- ) -> dict[str, list[str]]:
648
- valid_ids = []
649
- for task_run in task.runs():
650
- if filter(task_run):
651
- valid_ids.append(task_run.id)
652
-
653
- # Shuffle and split by split percentage
654
- random.shuffle(valid_ids)
655
- split_contents = {}
656
- start_idx = 0
657
- remaining_items = len(valid_ids)
658
-
659
- # Handle all splits except the last one
660
- for split in splits[:-1]:
661
- split_size = round(len(valid_ids) * split.percentage)
662
- split_contents[split.name] = valid_ids[start_idx : start_idx + split_size]
663
- start_idx += split_size
664
- remaining_items -= split_size
665
-
666
- # Last split gets all remaining items (for rounding)
667
- if splits:
668
- split_contents[splits[-1].name] = valid_ids[start_idx:]
669
-
670
- return split_contents
671
-
672
- def parent_task(self) -> "Task | None":
673
- # inline import to avoid circular import
674
- from kiln_ai.datamodel import Task
675
-
676
- if not isinstance(self.parent, Task):
677
- return None
678
- return self.parent
679
-
680
- def missing_count(self) -> int:
681
- """
682
- Returns:
683
- int: the number of task runs that have an ID persisted in this dataset split, but no longer exist in the dataset
684
- """
685
- parent = self.parent_task()
686
- if parent is None:
687
- raise ValueError("DatasetSplit has no parent task")
688
-
689
- runs = parent.runs()
690
- all_ids = set(run.id for run in runs)
691
- all_ids_in_splits = set()
692
- for ids in self.split_contents.values():
693
- all_ids_in_splits.update(ids)
694
- missing = all_ids_in_splits - all_ids
695
- return len(missing)
696
-
697
-
698
- class TaskRequirement(BaseModel):
699
- """
700
- Defines a specific requirement that should be met by task outputs.
701
-
702
- Includes an identifier, name, description, instruction for meeting the requirement,
703
- priority level, and rating type (five_star, pass_fail, pass_fail_critical, custom).
704
- """
705
-
706
- id: ID_TYPE = ID_FIELD
707
- name: str = SHORT_NAME_FIELD
708
- description: str | None = Field(default=None)
709
- instruction: str = Field(min_length=1)
710
- priority: Priority = Field(default=Priority.p2)
711
- type: TaskOutputRatingType = Field(default=TaskOutputRatingType.five_star)
712
-
713
-
714
- class TaskDeterminism(str, Enum):
715
- """
716
- Defines how strictly task outputs should match expected results.
717
-
718
- - deterministic: Requires exact matches
719
- - semantic_match: Allows different wording with same meaning
720
- - flexible: Allows variation in both wording and meaning within requirements
721
- """
722
-
723
- deterministic = "deterministic" # Expect exact match
724
- semantic_match = "semantic_match" # Expect same meaning, but flexible on expression of the meaning
725
- flexible = "flexible" # Flexible on semantic output. Eval should be custom based on parsing requirements.
726
-
727
-
728
- class Task(
729
- KilnParentedModel,
730
- KilnParentModel,
731
- parent_of={
732
- "runs": TaskRun,
733
- "dataset_splits": DatasetSplit,
734
- "finetunes": Finetune,
735
- },
736
- ):
737
- """
738
- Represents a specific task to be performed, with associated requirements and validation rules.
739
-
740
- Contains the task definition, requirements, input/output schemas, and maintains
741
- a collection of task runs.
742
- """
743
-
744
- name: str = NAME_FIELD
745
- description: str | None = Field(
746
- default=None,
747
- description="A description of the task for you and your team. Will not be used in prompts/training/validation.",
748
- )
749
- instruction: str = Field(
750
- min_length=1,
751
- description="The instructions for the task. Will be used in prompts/training/validation.",
752
- )
753
- requirements: List[TaskRequirement] = Field(default=[])
754
- output_json_schema: JsonObjectSchema | None = None
755
- input_json_schema: JsonObjectSchema | None = None
756
- thinking_instruction: str | None = Field(
757
- default=None,
758
- description="Instructions for the model 'thinking' about the requirement prior to answering. Used for chain of thought style prompting.",
759
- )
760
-
761
- def output_schema(self) -> Dict | None:
762
- if self.output_json_schema is None:
763
- return None
764
- return schema_from_json_str(self.output_json_schema)
765
-
766
- def input_schema(self) -> Dict | None:
767
- if self.input_json_schema is None:
768
- return None
769
- return schema_from_json_str(self.input_json_schema)
770
-
771
- # Needed for typechecking. TODO P2: fix this in KilnParentModel
772
- def runs(self) -> list[TaskRun]:
773
- return super().runs() # type: ignore
774
-
775
- def dataset_splits(self) -> list[DatasetSplit]:
776
- return super().dataset_splits() # type: ignore
777
-
778
- def finetunes(self) -> list[Finetune]:
779
- return super().finetunes() # type: ignore
780
-
781
-
782
- class Project(KilnParentModel, parent_of={"tasks": Task}):
783
- """
784
- A collection of related tasks.
785
-
786
- Projects organize tasks into logical groups and provide high-level descriptions
787
- of the overall goals.
788
- """
789
-
790
- name: str = NAME_FIELD
791
- description: str | None = Field(
792
- default=None,
793
- description="A description of the project for you and your team. Will not be used in prompts/training/validation.",
794
- )
795
-
796
- # Needed for typechecking. TODO P2: fix this in KilnParentModel
797
- def tasks(self) -> list[Task]:
798
- return super().tasks() # type: ignore