kiln-ai 0.8.1__py3-none-any.whl → 0.12.0__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.

Potentially problematic release.

Files changed (88)
  1. kiln_ai/adapters/__init__.py +7 -7
  2. kiln_ai/adapters/adapter_registry.py +81 -10
  3. kiln_ai/adapters/data_gen/data_gen_task.py +21 -3
  4. kiln_ai/adapters/data_gen/test_data_gen_task.py +23 -3
  5. kiln_ai/adapters/eval/base_eval.py +164 -0
  6. kiln_ai/adapters/eval/eval_runner.py +267 -0
  7. kiln_ai/adapters/eval/g_eval.py +367 -0
  8. kiln_ai/adapters/eval/registry.py +16 -0
  9. kiln_ai/adapters/eval/test_base_eval.py +324 -0
  10. kiln_ai/adapters/eval/test_eval_runner.py +640 -0
  11. kiln_ai/adapters/eval/test_g_eval.py +497 -0
  12. kiln_ai/adapters/eval/test_g_eval_data.py +4 -0
  13. kiln_ai/adapters/fine_tune/base_finetune.py +5 -1
  14. kiln_ai/adapters/fine_tune/dataset_formatter.py +310 -65
  15. kiln_ai/adapters/fine_tune/fireworks_finetune.py +47 -32
  16. kiln_ai/adapters/fine_tune/openai_finetune.py +12 -11
  17. kiln_ai/adapters/fine_tune/test_base_finetune.py +19 -0
  18. kiln_ai/adapters/fine_tune/test_dataset_formatter.py +472 -129
  19. kiln_ai/adapters/fine_tune/test_fireworks_finetune.py +114 -22
  20. kiln_ai/adapters/fine_tune/test_openai_finetune.py +125 -14
  21. kiln_ai/adapters/ml_model_list.py +434 -93
  22. kiln_ai/adapters/model_adapters/__init__.py +18 -0
  23. kiln_ai/adapters/model_adapters/base_adapter.py +250 -0
  24. kiln_ai/adapters/model_adapters/langchain_adapters.py +309 -0
  25. kiln_ai/adapters/model_adapters/openai_compatible_config.py +10 -0
  26. kiln_ai/adapters/model_adapters/openai_model_adapter.py +289 -0
  27. kiln_ai/adapters/model_adapters/test_base_adapter.py +199 -0
  28. kiln_ai/adapters/{test_langchain_adapter.py → model_adapters/test_langchain_adapter.py} +105 -97
  29. kiln_ai/adapters/model_adapters/test_openai_model_adapter.py +216 -0
  30. kiln_ai/adapters/{test_saving_adapter_results.py → model_adapters/test_saving_adapter_results.py} +80 -30
  31. kiln_ai/adapters/{test_structured_output.py → model_adapters/test_structured_output.py} +125 -46
  32. kiln_ai/adapters/ollama_tools.py +0 -1
  33. kiln_ai/adapters/parsers/__init__.py +10 -0
  34. kiln_ai/adapters/parsers/base_parser.py +12 -0
  35. kiln_ai/adapters/parsers/json_parser.py +37 -0
  36. kiln_ai/adapters/parsers/parser_registry.py +19 -0
  37. kiln_ai/adapters/parsers/r1_parser.py +69 -0
  38. kiln_ai/adapters/parsers/test_json_parser.py +81 -0
  39. kiln_ai/adapters/parsers/test_parser_registry.py +32 -0
  40. kiln_ai/adapters/parsers/test_r1_parser.py +144 -0
  41. kiln_ai/adapters/prompt_builders.py +193 -49
  42. kiln_ai/adapters/provider_tools.py +91 -36
  43. kiln_ai/adapters/repair/repair_task.py +18 -19
  44. kiln_ai/adapters/repair/test_repair_task.py +7 -7
  45. kiln_ai/adapters/run_output.py +11 -0
  46. kiln_ai/adapters/test_adapter_registry.py +177 -0
  47. kiln_ai/adapters/test_generate_docs.py +69 -0
  48. kiln_ai/adapters/test_ollama_tools.py +0 -1
  49. kiln_ai/adapters/test_prompt_adaptors.py +25 -18
  50. kiln_ai/adapters/test_prompt_builders.py +265 -44
  51. kiln_ai/adapters/test_provider_tools.py +268 -46
  52. kiln_ai/datamodel/__init__.py +51 -772
  53. kiln_ai/datamodel/basemodel.py +31 -11
  54. kiln_ai/datamodel/datamodel_enums.py +58 -0
  55. kiln_ai/datamodel/dataset_filters.py +114 -0
  56. kiln_ai/datamodel/dataset_split.py +170 -0
  57. kiln_ai/datamodel/eval.py +298 -0
  58. kiln_ai/datamodel/finetune.py +105 -0
  59. kiln_ai/datamodel/json_schema.py +14 -3
  60. kiln_ai/datamodel/model_cache.py +8 -3
  61. kiln_ai/datamodel/project.py +23 -0
  62. kiln_ai/datamodel/prompt.py +37 -0
  63. kiln_ai/datamodel/prompt_id.py +83 -0
  64. kiln_ai/datamodel/strict_mode.py +24 -0
  65. kiln_ai/datamodel/task.py +181 -0
  66. kiln_ai/datamodel/task_output.py +321 -0
  67. kiln_ai/datamodel/task_run.py +164 -0
  68. kiln_ai/datamodel/test_basemodel.py +80 -2
  69. kiln_ai/datamodel/test_dataset_filters.py +71 -0
  70. kiln_ai/datamodel/test_dataset_split.py +127 -6
  71. kiln_ai/datamodel/test_datasource.py +3 -2
  72. kiln_ai/datamodel/test_eval_model.py +635 -0
  73. kiln_ai/datamodel/test_example_models.py +34 -17
  74. kiln_ai/datamodel/test_json_schema.py +23 -0
  75. kiln_ai/datamodel/test_model_cache.py +24 -0
  76. kiln_ai/datamodel/test_model_perf.py +125 -0
  77. kiln_ai/datamodel/test_models.py +131 -2
  78. kiln_ai/datamodel/test_prompt_id.py +129 -0
  79. kiln_ai/datamodel/test_task.py +159 -0
  80. kiln_ai/utils/config.py +6 -1
  81. kiln_ai/utils/exhaustive_error.py +6 -0
  82. {kiln_ai-0.8.1.dist-info → kiln_ai-0.12.0.dist-info}/METADATA +45 -7
  83. kiln_ai-0.12.0.dist-info/RECORD +100 -0
  84. kiln_ai/adapters/base_adapter.py +0 -191
  85. kiln_ai/adapters/langchain_adapters.py +0 -256
  86. kiln_ai-0.8.1.dist-info/RECORD +0 -58
  87. {kiln_ai-0.8.1.dist-info → kiln_ai-0.12.0.dist-info}/WHEEL +0 -0
  88. {kiln_ai-0.8.1.dist-info → kiln_ai-0.12.0.dist-info}/licenses/LICENSE.txt +0 -0

kiln_ai/datamodel/eval.py
@@ -0,0 +1,298 @@
+ import json
+ from enum import Enum
+ from typing import TYPE_CHECKING, Any, Dict, List, Union
+
+ from pydantic import BaseModel, Field, model_validator
+ from typing_extensions import Self
+
+ from kiln_ai.datamodel.basemodel import (
+     ID_TYPE,
+     NAME_FIELD,
+     KilnParentedModel,
+     KilnParentModel,
+ )
+ from kiln_ai.datamodel.datamodel_enums import TaskOutputRatingType
+ from kiln_ai.datamodel.dataset_filters import DatasetFilterId
+ from kiln_ai.datamodel.json_schema import string_to_json_key
+ from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error
+
+ if TYPE_CHECKING:
+     from kiln_ai.datamodel.task import Task
+
+ EvalScores = Dict[str, float]
+
+
+ class EvalTemplateId(str, Enum):
+     """
+     An eval template is a pre-defined eval that can be used as a starting point for a new eval.
+     """
+
+     kiln_requirements = "kiln_requirements"
+     toxicity = "toxicity"
+     bias = "bias"
+     maliciousness = "maliciousness"
+     factual_correctness = "factual_correctness"
+     jailbreak = "jailbreak"
+
+
+ class EvalConfigType(str, Enum):
+     g_eval = "g_eval"
+     llm_as_judge = "llm_as_judge"
+
+
+ class EvalOutputScore(BaseModel):
+     """
+     A definition of a score that an evaluator will produce.
+
+     Very similar to TaskRequirement, but conceptually different, so kept in a separate model.
+     """
+
+     name: str = Field(
+         description="The name of the score. Will be provided to the model so use a descriptive name. Should align to the model's TaskRequirement name if you want to use human evals to evaluate the evaluator's performance."
+     )
+     instruction: str | None = Field(
+         default=None,
+         description="A description of the score, used to help the model understand the goal of the score. Will be provided to evaluator models, so should be written for the model, not the team/user.",
+     )
+     type: TaskOutputRatingType = Field(
+         description="The type of rating to use ('five_star', 'pass_fail', 'pass_fail_critical')."
+     )
+
+     def json_key(self) -> str:
+         """
+         The JSON key for the score, used when running the evaluator with an LLM and we need JSON output.
+
+         For example, "Overall Rating" -> "overall_rating"
+         """
+         return string_to_json_key(self.name)
+
+     @model_validator(mode="after")
+     def validate_type(self) -> Self:
+         if self.type == TaskOutputRatingType.custom:
+             raise ValueError(
+                 f"Custom scores are not supported in evaluators. Score '{self.name}' was set to a custom score."
+             )
+         return self
+
+
+ class EvalRun(KilnParentedModel):
+     """
+     The results of running an eval on a single dataset item.
+
+     This is a child of an EvalConfig, which specifies how the scores were generated.
+
+     Eval runs can be one of 2 types:
+     1) eval_config_eval=False: we were evaluating a task run (a method of running the task). We get the task input from the dataset_id.input, run the task with the task_run_config, then run the evaluator on that output. task_run_config_id must be set. The output saved in this model is the output of the task run.
+     2) eval_config_eval=True: we were evaluating an eval config (a method of evaluating the task). We used the existing dataset item input/output, and ran the evaluator on it. task_run_config_id must be None. The input/output saved in this model is the input/output of the dataset item.
+     """
+
+     dataset_id: ID_TYPE = Field(
+         description="The ID of the dataset item that was used for this run. Must belong to the same Task as the grand-parent eval of this EvalRun."
+     )
+     task_run_config_id: ID_TYPE | None = Field(
+         description="The ID of the TaskRunConfig that was run, if this eval run was based on a task run. Must belong to the same Task as this eval. Can be None if this eval run is based on an eval config."
+     )
+     eval_config_eval: bool = Field(
+         description="Whether this eval run evaluates the parent eval config (evaluating the config using an existing dataset item). If true, task_run_config_id must be None, as we're not running the task.",
+         default=False,
+     )
+     # These two may duplicate the dataset_id.input/output, but we're denormalizing intentionally.
+     input: str = Field(
+         description="The input to the task. JSON formatted for structured input, plaintext for unstructured input."
+     )
+     output: str = Field(
+         description="The output of the task. JSON formatted for structured output, plaintext for unstructured output."
+     )
+     intermediate_outputs: Dict[str, str] | None = Field(
+         default=None,
+         description="The intermediate outputs of the task (for example, eval thinking).",
+     )
+     scores: EvalScores = Field(
+         description="The output scores of the evaluator (aligning to those required by the grand-parent Eval this object is a child of)."
+     )
+
+     def parent_eval_config(self) -> Union["EvalConfig", None]:
+         if self.parent is not None and self.parent.__class__.__name__ != "EvalConfig":
+             raise ValueError("parent must be an EvalConfig")
+         return self.parent  # type: ignore
+
+     @model_validator(mode="after")
+     def validate_eval_run_types(self) -> Self:
+         if self.eval_config_eval and self.task_run_config_id is not None:
+             raise ValueError(
+                 "task_run_config_id must be None if eval_config_eval is true"
+             )
+         if not self.eval_config_eval and self.task_run_config_id is None:
+             raise ValueError(
+                 "task_run_config_id must be set if eval_config_eval is false"
+             )
+         return self
+
+     @model_validator(mode="after")
+     def validate_scores(self) -> Self:
+         # We're checking the scores have the expected keys from the grand-parent eval
+         if self.scores is None or len(self.scores) == 0:
+             raise ValueError("scores are required, and must have at least one score.")
+
+         parent_eval_config = self.parent_eval_config()
+         eval = parent_eval_config.parent_eval() if parent_eval_config else None
+         if not eval:
+             # Can't validate without the grand-parent eval, allow it to be validated later
+             return self
+
+         output_score_keys = [score.json_key() for score in eval.output_scores]
+         if set(output_score_keys) != set(self.scores.keys()):
+             raise ValueError(
+                 f"The scores produced by the evaluator must match the scores expected by the eval. Got: [{', '.join(self.scores.keys())}] and expected: [{', '.join(output_score_keys)}]"
+             )
+
+         # Check that each score is expected in this eval and the correct type
+         for output_score in eval.output_scores:
+             match output_score.type:
+                 case TaskOutputRatingType.five_star:
+                     five_star_score = self.scores[output_score.json_key()]
+                     if (
+                         not isinstance(five_star_score, float)
+                         or five_star_score < 1.0
+                         or five_star_score > 5.0
+                     ):
+                         raise ValueError(
+                             f"Score {output_score.name} is a five_star rating and must be a float between 1.0 and 5.0 inclusive. Got: {five_star_score}"
+                         )
+                 case TaskOutputRatingType.pass_fail:
+                     pass_fail_score = self.scores[output_score.json_key()]
+                     if (
+                         not isinstance(pass_fail_score, float)
+                         or pass_fail_score < 0.0
+                         or pass_fail_score > 1.0
+                     ):
+                         raise ValueError(
+                             f"Score {output_score.name} is a pass_fail rating and must be a float between 0.0 and 1.0 inclusive. Got: {pass_fail_score}"
+                         )
+                 case TaskOutputRatingType.pass_fail_critical:
+                     pass_fail_critical_score = self.scores[output_score.json_key()]
+                     if (
+                         not isinstance(pass_fail_critical_score, float)
+                         or pass_fail_critical_score < -1.0
+                         or pass_fail_critical_score > 1.0
+                     ):
+                         raise ValueError(
+                             f"Score {output_score.name} is a pass_fail_critical rating and must be a float between -1.0 and 1.0 inclusive. Got: {pass_fail_critical_score}"
+                         )
+                 case TaskOutputRatingType.custom:
+                     raise ValueError(
+                         f"Custom scores are not supported in evaluators. '{output_score.name}' was set to a custom score."
+                     )
+                 case _:
+                     # Catch missing cases
+                     raise_exhaustive_enum_error(output_score.type)
+         return self
+
+
+ class EvalConfig(KilnParentedModel, KilnParentModel, parent_of={"runs": EvalRun}):
+     """
+     A configuration for running an eval. This includes anything needed to run the eval on a dataset, like the prompt, model, thresholds, etc.
+
+     An eval might have many configs, for example running the same eval with 2 different models. Comparing eval results is only valid within the scope of the same config.
+     """
+
+     name: str = NAME_FIELD
+     model_name: str = Field(
+         description="The name of the model to use for this eval config.",
+     )
+     model_provider: str = Field(
+         description="The provider of the model to use for this eval config.",
+     )
+     config_type: EvalConfigType = Field(
+         default=EvalConfigType.g_eval,
+         description="This is used to determine the type of eval to run.",
+     )
+     properties: dict[str, Any] = Field(
+         default={},
+         description="Properties to be used to execute the eval config. This is config_type specific and should serialize to a json dict.",
+     )
+
+     def parent_eval(self) -> Union["Eval", None]:
+         if self.parent is not None and self.parent.__class__.__name__ != "Eval":
+             raise ValueError("parent must be an Eval")
+         return self.parent  # type: ignore
+
+     def runs(self, readonly: bool = False) -> list[EvalRun]:
+         return super().runs(readonly=readonly)  # type: ignore
+
+     @model_validator(mode="after")
+     def validate_properties(self) -> Self:
+         if (
+             self.config_type == EvalConfigType.g_eval
+             or self.config_type == EvalConfigType.llm_as_judge
+         ):
+             if "eval_steps" not in self.properties or not isinstance(
+                 self.properties["eval_steps"], list
+             ):
+                 raise ValueError("eval_steps is required and must be a list for g_eval")
+             if "task_description" in self.properties and not isinstance(
+                 self.properties["task_description"], str
+             ):
+                 raise ValueError(
+                     "task_description is optional, but if provided must be a string"
+                 )
+             return self
+         else:
+             raise ValueError(f"Invalid eval config type: {self.config_type}")
+
+     @model_validator(mode="after")
+     def validate_json_serializable(self) -> "EvalConfig":
+         try:
+             # This will raise a TypeError if the dict contains non-JSON-serializable objects
+             json.dumps(self.properties)
+         except TypeError as e:
+             raise ValueError(f"Properties must be JSON serializable: {str(e)}")
+         return self
+
+
+ class Eval(KilnParentedModel, KilnParentModel, parent_of={"configs": EvalConfig}):
+     name: str = NAME_FIELD
+     description: str | None = Field(
+         default=None, description="The description of the eval"
+     )
+     template: EvalTemplateId | None = Field(
+         default=None,
+         description="The template selected when creating this eval. Useful for suggesting eval steps and output scores.",
+     )
+     current_config_id: ID_TYPE = Field(
+         default=None,
+         description="The id of the current config to use for this eval. This can be changed over time to run the same eval with different configs.",
+     )
+     eval_set_filter_id: DatasetFilterId = Field(
+         description="The id of the dataset filter which defines which dataset items are included when running this eval. Should be mutually exclusive with eval_configs_filter_id."
+     )
+     eval_configs_filter_id: DatasetFilterId = Field(
+         description="The id of the dataset filter which defines which dataset items are included when comparing the quality of the eval configs under this eval. Should consist of dataset items with ratings. Should be mutually exclusive with eval_set_filter_id."
+     )
+     output_scores: List[EvalOutputScore] = Field(
+         description="The scores this evaluator should produce."
+     )
+
+     # Workaround to return typed parent without importing Task
+     def parent_task(self) -> Union["Task", None]:
+         if self.parent is not None and self.parent.__class__.__name__ != "Task":
+             raise ValueError("parent must be a Task")
+         return self.parent  # type: ignore
+
+     def configs(self, readonly: bool = False) -> list[EvalConfig]:
+         return super().configs(readonly=readonly)  # type: ignore
+
+     @model_validator(mode="after")
+     def validate_scores(self) -> Self:
+         if self.output_scores is None or len(self.output_scores) == 0:
+             raise ValueError(
+                 "output_scores are required, and must have at least one score."
+             )
+
+         # check for duplicate names (once transformed to JSON keys)
+         output_score_keys = [score.json_key() for score in self.output_scores]
+         if len(output_score_keys) != len(set(output_score_keys)):
+             raise ValueError(
+                 f"output_scores must have unique names (once transformed to JSON keys). Got: [{', '.join(output_score_keys)}]"
+             )
+         return self
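
For reference, a minimal usage sketch of the new score model (illustrative only, not part of the package):

    from pydantic import ValidationError

    from kiln_ai.datamodel.datamodel_enums import TaskOutputRatingType
    from kiln_ai.datamodel.eval import EvalOutputScore

    # Score names are normalized to JSON keys when the evaluator is asked for JSON output
    score = EvalOutputScore(name="Overall Rating", type=TaskOutputRatingType.five_star)
    print(score.json_key())  # overall_rating

    # The validate_type validator rejects custom rating types
    try:
        EvalOutputScore(name="My Metric", type=TaskOutputRatingType.custom)
    except ValidationError as e:
        print(e)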

kiln_ai/datamodel/finetune.py
@@ -0,0 +1,105 @@
+ from typing import TYPE_CHECKING, Dict, Union
+
+ from pydantic import Field, model_validator
+ from typing_extensions import Self
+
+ from kiln_ai.datamodel.basemodel import NAME_FIELD, KilnParentedModel
+ from kiln_ai.datamodel.datamodel_enums import (
+     FinetuneDataStrategy,
+     FineTuneStatusType,
+     StructuredOutputMode,
+ )
+
+ if TYPE_CHECKING:
+     from kiln_ai.datamodel.task import Task
+
+
+ class Finetune(KilnParentedModel):
+     """
+     The Kiln fine-tune datamodel.
+
+     Initially holds a reference to a training job, with the identifiers needed to update its status. When complete, contains the new model ID.
+     """
+
+     name: str = NAME_FIELD
+     description: str | None = Field(
+         default=None,
+         description="A description of the fine-tune for you and your team. Not used in training.",
+     )
+     structured_output_mode: StructuredOutputMode | None = Field(
+         default=None,
+         description="The mode to use to train the model for structured output, if it was trained with structured output. Will determine how we call the tuned model, so we call with the matching mode.",
+     )
+     provider: str = Field(
+         description="The provider to use for the fine-tune (e.g. 'openai')."
+     )
+     base_model_id: str = Field(
+         description="The id of the base model to use for the fine-tune. This string relates to the provider's IDs for their own models, not Kiln IDs."
+     )
+     provider_id: str | None = Field(
+         default=None,
+         description="The ID of the fine-tune job on the provider's side. May not be the same as the fine_tune_model_id.",
+     )
+     fine_tune_model_id: str | None = Field(
+         default=None,
+         description="The ID of the fine-tuned model on the provider's side. May not be the same as the provider_id.",
+     )
+     dataset_split_id: str = Field(
+         description="The ID of the dataset split to use for this fine-tune.",
+     )
+     train_split_name: str = Field(
+         default="train",
+         description="The name of the training split to use for this fine-tune.",
+     )
+     validation_split_name: str | None = Field(
+         default=None,
+         description="The name of the validation split to use for this fine-tune. Optional.",
+     )
+     parameters: dict[str, str | int | float | bool] = Field(
+         default={},
+         description="The parameters to use for this fine-tune. These are provider-specific.",
+     )
+     # These two fields are saved exactly as used for training. Even if they map exactly to a custom prompt or generator, those can change, so we want to keep a record of the training prompt.
+     system_message: str = Field(
+         description="The system message to use for this fine-tune.",
+     )
+     thinking_instructions: str | None = Field(
+         default=None,
+         description="The thinking instructions to use for this fine-tune. Only used when data_strategy is final_and_intermediate.",
+     )
+     latest_status: FineTuneStatusType = Field(
+         default=FineTuneStatusType.unknown,
+         description="The latest known status of this fine-tune. Not updated in real time.",
+     )
+     properties: Dict[str, str | int | float] = Field(
+         default={},
+         description="Properties of the fine-tune. Different providers may use different properties.",
+     )
+     data_strategy: FinetuneDataStrategy = Field(
+         default=FinetuneDataStrategy.final_only,
+         description="The strategy to use for training the model. 'final_only' will only train on the final response. 'final_and_intermediate' will train on the final response and intermediate outputs (chain of thought or reasoning).",
+     )
+
+     # Workaround to return typed parent without importing Task
+     def parent_task(self) -> Union["Task", None]:
+         if self.parent is None or self.parent.__class__.__name__ != "Task":
+             return None
+         return self.parent  # type: ignore
+
+     @model_validator(mode="after")
+     def validate_thinking_instructions(self) -> Self:
+         if (
+             self.thinking_instructions is not None
+             and self.data_strategy != FinetuneDataStrategy.final_and_intermediate
+         ):
+             raise ValueError(
+                 "Thinking instructions can only be used when data_strategy is final_and_intermediate"
+             )
+         if (
+             self.thinking_instructions is None
+             and self.data_strategy == FinetuneDataStrategy.final_and_intermediate
+         ):
+             raise ValueError(
+                 "Thinking instructions are required when data_strategy is final_and_intermediate"
+             )
+         return self
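
A sketch of the new thinking_instructions rule (illustrative field values, not from the package):

    from kiln_ai.datamodel.datamodel_enums import FinetuneDataStrategy
    from kiln_ai.datamodel.finetune import Finetune

    # Training on intermediate outputs requires thinking instructions
    tune = Finetune(
        name="Support Assistant Tune",  # illustrative
        provider="openai",
        base_model_id="gpt-4o-mini",  # provider-side id, illustrative
        dataset_split_id="split_123",  # illustrative
        system_message="You are a helpful assistant.",
        data_strategy=FinetuneDataStrategy.final_and_intermediate,
        thinking_instructions="Think step by step before answering.",
    )

    # Omitting thinking_instructions with final_and_intermediate (or supplying them with
    # final_only) fails validate_thinking_instructions and raises a pydantic ValidationError.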

kiln_ai/datamodel/json_schema.py
@@ -1,4 +1,5 @@
  import json
+ import re
  from typing import Annotated, Dict

  import jsonschema
@@ -42,9 +43,14 @@ def validate_schema(instance: Dict, schema_str: str) -> None:
          jsonschema.exceptions.ValidationError: If validation fails
          ValueError: If the schema is invalid
      """
-     schema = schema_from_json_str(schema_str)
-     v = jsonschema.Draft202012Validator(schema)
-     return v.validate(instance)
+     try:
+         schema = schema_from_json_str(schema_str)
+         v = jsonschema.Draft202012Validator(schema)
+         v.validate(instance)
+     except jsonschema.exceptions.ValidationError as e:
+         raise ValueError(
+             f"This task requires a specific output schema. While the model produced JSON, that JSON didn't meet the schema. Search 'Troubleshooting Structured Data Issues' in our docs for more information. The error from the schema check was: {e.message}"
+         ) from e


  def schema_from_json_str(v: str) -> Dict:
@@ -78,3 +84,8 @@ def schema_from_json_str(v: str) -> Dict:
          raise ValueError(f"Invalid JSON: {v}\n {e}")
      except Exception as e:
          raise ValueError(f"Unexpected error parsing JSON schema: {v}\n {e}")
+
+
+ def string_to_json_key(s: str) -> str:
+     """Convert a string to a valid JSON key."""
+     return re.sub(r"[^a-z0-9_]", "", s.strip().lower().replace(" ", "_"))
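
A short sketch of the two behaviors added here (illustrative, not part of the package):

    from kiln_ai.datamodel.json_schema import string_to_json_key, validate_schema

    # New helper: normalize a display name into a JSON key
    print(string_to_json_key("Overall Rating"))  # overall_rating

    # validate_schema now wraps jsonschema's ValidationError in a ValueError with a friendlier message
    schema = '{"type": "object", "properties": {"count": {"type": "integer"}}, "required": ["count"]}'
    validate_schema({"count": 3}, schema)  # passes silently
    try:
        validate_schema({"count": "three"}, schema)  # valid JSON, but doesn't match the schema
    except ValueError as e:
        print(e)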

kiln_ai/datamodel/model_cache.py
@@ -62,12 +62,17 @@ class ModelCache:
              raise ValueError(f"Model at {path} is not of type {model_type.__name__}")
          return model

-     def get_model(self, path: Path, model_type: Type[T]) -> Optional[T]:
-         # We return a copy so in-memory edits don't impact the cache until they are saved
+     def get_model(
+         self, path: Path, model_type: Type[T], readonly: bool = False
+     ) -> Optional[T]:
+         # We return a copy by default, so in-memory edits don't impact the cache until they are saved
          # Benchmark shows about 2x slower, but much more foolproof
          model = self._get_model(path, model_type)
          if model:
-             return model.model_copy(deep=True)
+             if readonly:
+                 return model
+             else:
+                 return model.model_copy(deep=True)
          return None

      def get_model_id(self, path: Path, model_type: Type[T]) -> Optional[str]:
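
A sketch of the new readonly flag. It assumes ModelCache() can be constructed without arguments and that a Task has been saved at the given path; neither is shown in this hunk:

    from pathlib import Path

    from kiln_ai.datamodel.model_cache import ModelCache
    from kiln_ai.datamodel.task import Task

    cache = ModelCache()  # assumed no-arg constructor
    task_path = Path("projects/example/task.kiln")  # hypothetical path to a saved Task

    # Default: a deep copy, so in-memory edits don't leak back into the cache
    editable = cache.get_model(task_path, Task)

    # New: readonly=True skips the roughly 2x-slower deep copy and returns the cached
    # instance directly; callers must treat it as read-only.
    shared = cache.get_model(task_path, Task, readonly=True)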

kiln_ai/datamodel/project.py
@@ -0,0 +1,23 @@
+ from pydantic import Field
+
+ from kiln_ai.datamodel.basemodel import NAME_FIELD, KilnParentModel
+ from kiln_ai.datamodel.task import Task
+
+
+ class Project(KilnParentModel, parent_of={"tasks": Task}):
+     """
+     A collection of related tasks.
+
+     Projects organize tasks into logical groups and provide high-level descriptions
+     of the overall goals.
+     """
+
+     name: str = NAME_FIELD
+     description: str | None = Field(
+         default=None,
+         description="A description of the project for you and your team. Will not be used in prompts/training/validation.",
+     )
+
+     # Needed for typechecking. TODO P2: fix this in KilnParentModel
+     def tasks(self) -> list[Task]:
+         return super().tasks()  # type: ignore
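
A minimal construction sketch (illustrative values, not part of the package):

    from kiln_ai.datamodel.project import Project

    # A Project is a KilnParentModel; its child tasks are reached via the typed tasks() helper above.
    project = Project(
        name="Customer Support",  # illustrative
        description="Tasks for the customer support assistant.",
    )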

kiln_ai/datamodel/prompt.py
@@ -0,0 +1,37 @@
+ from pydantic import BaseModel, Field
+
+ from kiln_ai.datamodel.basemodel import NAME_FIELD, KilnParentedModel
+
+
+ class BasePrompt(BaseModel):
+     """
+     A prompt for a task. This is the basic data storage format which can be used throughout a project.
+
+     The "Prompt" model name is reserved for the custom prompts parented by a task.
+     """
+
+     name: str = NAME_FIELD
+     description: str | None = Field(
+         default=None,
+         description="A more detailed description of the prompt.",
+     )
+     generator_id: str | None = Field(
+         default=None,
+         description="The id of the generator that created this prompt.",
+     )
+     prompt: str = Field(
+         description="The prompt for the task.",
+         min_length=1,
+     )
+     chain_of_thought_instructions: str | None = Field(
+         default=None,
+         description="Instructions for the model 'thinking' about the requirement prior to answering. Used for chain of thought style prompting. COT will not be used unless this is provided.",
+     )
+
+
+ class Prompt(KilnParentedModel, BasePrompt):
+     """
+     A prompt for a task. This is the custom prompt parented by a task.
+     """
+
+     pass
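
A minimal construction sketch (illustrative values, not part of the package):

    from kiln_ai.datamodel.prompt import BasePrompt

    prompt = BasePrompt(
        name="Concise Answer",  # illustrative
        prompt="Answer the user's question in two sentences or fewer.",
        # Optional: providing this enables chain-of-thought style prompting
        chain_of_thought_instructions="Think through the question before answering.",
    )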

kiln_ai/datamodel/prompt_id.py
@@ -0,0 +1,83 @@
+ from enum import Enum
+ from typing import Annotated
+
+ from pydantic import AfterValidator
+
+
+ # Generators that can take any task and build a prompt
+ class PromptGenerators(str, Enum):
+     SIMPLE = "simple_prompt_builder"
+     MULTI_SHOT = "multi_shot_prompt_builder"
+     FEW_SHOT = "few_shot_prompt_builder"
+     REPAIRS = "repairs_prompt_builder"
+     SIMPLE_CHAIN_OF_THOUGHT = "simple_chain_of_thought_prompt_builder"
+     FEW_SHOT_CHAIN_OF_THOUGHT = "few_shot_chain_of_thought_prompt_builder"
+     MULTI_SHOT_CHAIN_OF_THOUGHT = "multi_shot_chain_of_thought_prompt_builder"
+
+
+ prompt_generator_values = [pg.value for pg in PromptGenerators]
+
+
+ PromptId = Annotated[
+     str,
+     AfterValidator(lambda v: _check_prompt_id(v)),
+ ]
+ """
+ A pydantic type that validates strings containing a valid prompt ID.
+
+ Prompt IDs can be one of:
+ - A saved prompt ID
+ - A fine-tune prompt ID
+ - A task run config ID
+ - A prompt generator name
+ """
+
+
+ def _check_prompt_id(id: str) -> str:
+     """
+     Check that the prompt ID is valid.
+     """
+     if id in prompt_generator_values:
+         return id
+
+     if id.startswith("id::"):
+         # check it has 2 parts divided by :: -- 'id::prompt_id'
+         parts = id.split("::")
+         if len(parts) != 2 or len(parts[1]) == 0:
+             raise ValueError(
+                 f"Invalid saved prompt ID: {id}. Expected format: 'id::[prompt_id]'."
+             )
+         return id
+
+     if id.startswith("task_run_config::"):
+         # check it has a task_run_config_id after the :: -- 'task_run_config::project_id::task_id::task_run_config_id'
+         parts = id.split("::")
+         if len(parts) != 4:
+             raise ValueError(
+                 f"Invalid task run config prompt ID: {id}. Expected format: 'task_run_config::[project_id]::[task_id]::[task_run_config_id]'."
+             )
+         return id
+
+     if id.startswith("fine_tune_prompt::"):
+         # check it has a fine_tune_id after the :: -- 'fine_tune_prompt::fine_tune_id'
+         fine_tune_id = id[18:]
+         if len(fine_tune_id) == 0:
+             raise ValueError(
+                 f"Invalid fine-tune prompt ID: {id}. Expected format: 'fine_tune_prompt::[fine_tune_id]'."
+             )
+         return id
+
+     raise ValueError(f"Invalid prompt ID: {id}")
+
+
+ def is_frozen_prompt(id: PromptId) -> bool:
+     """
+     Check if the prompt ID is a frozen prompt.
+     """
+     if id.startswith("id::"):
+         return True
+     if id.startswith("task_run_config::"):
+         return True
+     if id.startswith("fine_tune_prompt::"):
+         return True
+     return False
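
A sketch of the accepted ID formats (illustrative IDs, not part of the package):

    from kiln_ai.datamodel.prompt_id import _check_prompt_id, is_frozen_prompt

    # Generator names, saved prompts, fine-tune prompts and task run configs all validate
    _check_prompt_id("simple_prompt_builder")
    _check_prompt_id("id::prompt_123")  # illustrative saved prompt ID
    _check_prompt_id("fine_tune_prompt::ft_123")  # illustrative fine-tune ID
    _check_prompt_id("task_run_config::proj_1::task_1::config_1")  # illustrative IDs

    # Only the non-generator forms are treated as frozen prompts
    print(is_frozen_prompt("simple_prompt_builder"))  # False
    print(is_frozen_prompt("id::prompt_123"))  # True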

kiln_ai/datamodel/strict_mode.py
@@ -0,0 +1,24 @@
+ """
+ Strict mode is a feature that enables extra validations that we want to enforce in Kiln App, ensuring everything follows the ideal schema.
+
+ It's off by default when used through the library. Enable it by calling `set_strict_mode(True)`.
+ """
+
+ # We want to be hard on ourselves for data completeness generated by the Kiln App, but don't want to make it hard for users to use the datamodel/library.
+ # Strict mode enables extra validations that we want to enforce in Kiln App (and any other client that wants best practices), but not in the library (unless they opt in)
+ _strict_mode: bool = False
+
+
+ def strict_mode() -> bool:
+     """
+     Get the current strict mode setting.
+     """
+     return _strict_mode
+
+
+ def set_strict_mode(value: bool) -> None:
+     """
+     Set the strict mode setting.
+     """
+     global _strict_mode
+     _strict_mode = value
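
A usage sketch (not part of the package):

    from kiln_ai.datamodel.strict_mode import set_strict_mode, strict_mode

    print(strict_mode())  # False: strict mode is off by default for library users
    set_strict_mode(True)  # opt in to the extra validations the Kiln App enforces
    print(strict_mode())  # True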