kiln-ai 0.0.4__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of this package as they appear in their public registry, and is provided for informational purposes only.
Potentially problematic release.
This version of kiln-ai might be problematic.
- kiln_ai/adapters/base_adapter.py +168 -0
- kiln_ai/adapters/langchain_adapters.py +113 -0
- kiln_ai/adapters/ml_model_list.py +436 -0
- kiln_ai/adapters/prompt_builders.py +122 -0
- kiln_ai/adapters/repair/repair_task.py +71 -0
- kiln_ai/adapters/repair/test_repair_task.py +248 -0
- kiln_ai/adapters/test_langchain_adapter.py +50 -0
- kiln_ai/adapters/test_ml_model_list.py +99 -0
- kiln_ai/adapters/test_prompt_adaptors.py +167 -0
- kiln_ai/adapters/test_prompt_builders.py +315 -0
- kiln_ai/adapters/test_saving_adapter_results.py +168 -0
- kiln_ai/adapters/test_structured_output.py +218 -0
- kiln_ai/datamodel/__init__.py +362 -2
- kiln_ai/datamodel/basemodel.py +372 -0
- kiln_ai/datamodel/json_schema.py +45 -0
- kiln_ai/datamodel/test_basemodel.py +277 -0
- kiln_ai/datamodel/test_datasource.py +107 -0
- kiln_ai/datamodel/test_example_models.py +644 -0
- kiln_ai/datamodel/test_json_schema.py +124 -0
- kiln_ai/datamodel/test_models.py +190 -0
- kiln_ai/datamodel/test_nested_save.py +205 -0
- kiln_ai/datamodel/test_output_rating.py +88 -0
- kiln_ai/utils/config.py +170 -0
- kiln_ai/utils/formatting.py +5 -0
- kiln_ai/utils/test_config.py +245 -0
- {kiln_ai-0.0.4.dist-info → kiln_ai-0.5.0.dist-info}/METADATA +20 -1
- kiln_ai-0.5.0.dist-info/RECORD +29 -0
- kiln_ai/__init.__.py +0 -3
- kiln_ai/coreadd.py +0 -3
- kiln_ai/datamodel/project.py +0 -15
- kiln_ai-0.0.4.dist-info/RECORD +0 -8
- {kiln_ai-0.0.4.dist-info → kiln_ai-0.5.0.dist-info}/LICENSE.txt +0 -0
- {kiln_ai-0.0.4.dist-info → kiln_ai-0.5.0.dist-info}/WHEEL +0 -0
kiln_ai/datamodel/__init__.py
CHANGED
@@ -1,3 +1,363 @@
-from
+from __future__ import annotations
 
-
+import json
+from enum import Enum, IntEnum
+from typing import TYPE_CHECKING, Dict, List, Self, Type, Union
+
+import jsonschema
+import jsonschema.exceptions
+from kiln_ai.datamodel.json_schema import JsonObjectSchema, schema_from_json_str
+from pydantic import BaseModel, Field, model_validator
+
+from .basemodel import (
+    ID_FIELD,
+    ID_TYPE,
+    KilnBaseModel,
+    KilnParentedModel,
+    KilnParentModel,
+)
+from .json_schema import validate_schema
+
+if TYPE_CHECKING:
+    from . import Task
+
+# Conventions:
+# 1) Names are filename safe as they may be used as file names. They are informational and not to be used in prompts/training/validation.
+# 2) Descriptions are for Kiln users to describe/understand the purpose of this object. They must never be used in prompts/training/validation. Use "instruction/requirements" instead.
+
+# Filename compatible names
+NAME_REGEX = r"^[A-Za-z0-9 _-]+$"
+NAME_FIELD = Field(min_length=1, max_length=120, pattern=NAME_REGEX)
+SHORT_NAME_FIELD = Field(min_length=1, max_length=20, pattern=NAME_REGEX)
+
+
+class Priority(IntEnum):
+    p0 = 0
+    p1 = 1
+    p2 = 2
+    p3 = 3
+
+
+# Only one rating type for now, but this allows for extensibility if we want to add more in the future
+class TaskOutputRatingType(str, Enum):
+    five_star = "five_star"
+    custom = "custom"
+
+
+class TaskOutputRating(KilnBaseModel):
+    """
+    A rating for a task output, including an overall rating and ratings for each requirement.
+
+    Only supports five star ratings for now, but extensible for custom values.
+    """
+
+    type: TaskOutputRatingType = Field(default=TaskOutputRatingType.five_star)
+    value: float | None = Field(
+        description="The overall rating value (typically 1-5 stars).",
+        default=None,
+    )
+    requirement_ratings: Dict[ID_TYPE, float] = Field(
+        default={},
+        description="The ratings of the requirements of the task. The keys are the ids of the requirements. The values are the ratings (typically 1-5 stars).",
+    )
+
+    # Used to select high quality outputs for example selection (MultiShotPromptBuilder, etc)
+    def is_high_quality(self) -> bool:
+        if self.type == TaskOutputRatingType.five_star:
+            return self.value is not None and self.value >= 4
+        return False
+
+    @model_validator(mode="after")
+    def validate_rating(self) -> Self:
+        if self.type not in TaskOutputRatingType:
+            raise ValueError(f"Invalid rating type: {self.type}")
+
+        if self.type == TaskOutputRatingType.five_star:
+            if self.value is not None:
+                self._validate_five_star(self.value, "overall rating")
+            for req_id, req_rating in self.requirement_ratings.items():
+                self._validate_five_star(req_rating, f"requirement rating for {req_id}")
+
+        return self
+
+    def _validate_five_star(self, rating: float, rating_name: str) -> None:
+        if not isinstance(rating, float) or not rating.is_integer():
+            raise ValueError(
+                f"{rating_name.capitalize()} of type five_star must be an integer value (1.0, 2.0, 3.0, 4.0, or 5.0)"
+            )
+        if rating < 1 or rating > 5:
+            raise ValueError(
+                f"{rating_name.capitalize()} of type five_star must be between 1 and 5 stars"
+            )
+
+    def validate_requirement_rating_keys(self, task: Task) -> Self:
+        if len(self.requirement_ratings) == 0:
+            return self
+
+        valid_requirement_ids = {req.id for req in task.requirements}
+        for key in self.requirement_ratings.keys():
+            if key not in valid_requirement_ids:
+                raise ValueError(
+                    f"Requirement ID '{key}' is not a valid requirement ID for this task"
+                )
+        return self
+
+
+class TaskOutput(KilnBaseModel):
+    """
+    An output for a specific task run.
+    """
+
+    output: str = Field(
+        description="The output of the task. JSON formatted for structured output, plaintext for unstructured output."
+    )
+    source: DataSource = Field(
+        description="The source of the output: human or synthetic."
+    )
+    rating: TaskOutputRating | None = Field(
+        default=None, description="The rating of the output"
+    )
+
+    def validate_output_format(self, task: Task) -> Self:
+        # validate output
+        if task.output_json_schema is not None:
+            try:
+                validate_schema(json.loads(self.output), task.output_json_schema)
+            except json.JSONDecodeError:
+                raise ValueError("Output is not a valid JSON object")
+            except jsonschema.exceptions.ValidationError as e:
+                raise ValueError(f"Output does not match task output schema: {e}")
+        return self
+
+
+class DataSourceType(str, Enum):
+    """
+    The source of a piece of data.
+    """
+
+    human = "human"
+    synthetic = "synthetic"
+
+
+class DataSourceProperty(BaseModel):
+    name: str
+    type: Type[Union[str, int, float]]
+    required_for: List[DataSourceType] = []
+    not_allowed_for: List[DataSourceType] = []
+
+
+class DataSource(BaseModel):
+    type: DataSourceType
+    properties: Dict[str, str | int | float] = Field(
+        default={},
+        description="Properties describing the data source. For synthetic things like model. For human, the human's name.",
+    )
+
+    _data_source_properties = [
+        DataSourceProperty(
+            name="created_by",
+            type=str,
+            required_for=[DataSourceType.human],
+            not_allowed_for=[DataSourceType.synthetic],
+        ),
+        DataSourceProperty(
+            name="model_name",
+            type=str,
+            required_for=[DataSourceType.synthetic],
+            not_allowed_for=[DataSourceType.human],
+        ),
+        DataSourceProperty(
+            name="model_provider",
+            type=str,
+            required_for=[DataSourceType.synthetic],
+            not_allowed_for=[DataSourceType.human],
+        ),
+        DataSourceProperty(
+            name="adapter_name",
+            type=str,
+            required_for=[DataSourceType.synthetic],
+            not_allowed_for=[DataSourceType.human],
+        ),
+        DataSourceProperty(
+            name="prompt_builder_name",
+            type=str,
+            not_allowed_for=[DataSourceType.human],
+        ),
+    ]
+
+    @model_validator(mode="after")
+    def validate_type(self) -> "DataSource":
+        if self.type not in DataSourceType:
+            raise ValueError(f"Invalid data source type: {self.type}")
+        return self
+
+    @model_validator(mode="after")
+    def validate_properties(self) -> "DataSource":
+        for prop in self._data_source_properties:
+            # Check the property type is correct
+            if prop.name in self.properties:
+                if not isinstance(self.properties[prop.name], prop.type):
+                    raise ValueError(
+                        f"'{prop.name}' must be of type {prop.type.__name__} for {self.type} data source"
+                    )
+            # Check the property is required for the data source type
+            if self.type in prop.required_for:
+                if prop.name not in self.properties:
+                    raise ValueError(
+                        f"'{prop.name}' is required for {self.type} data source"
+                    )
+            # Check the property is not allowed for the data source type
+            elif self.type in prop.not_allowed_for and prop.name in self.properties:
+                raise ValueError(
+                    f"'{prop.name}' is not allowed for {self.type} data source"
+                )
+        return self
+
+    @model_validator(mode="after")
+    def validate_no_empty_properties(self) -> Self:
+        for prop, value in self.properties.items():
+            if isinstance(value, str) and value == "":
+                raise ValueError(
+                    f"Property '{prop}' must be a non-empty string for {self.type} data source"
+                )
+        return self
+
+
+class TaskRun(KilnParentedModel):
+    """
+    A run of a specific Task, including the input and output.
+    """
+
+    input: str = Field(
+        description="The inputs to the task. JSON formatted for structured input, plaintext for unstructured input."
+    )
+    input_source: DataSource = Field(
+        description="The source of the input: human or synthetic."
+    )
+
+    output: TaskOutput = Field(description="The output of the task run.")
+    repair_instructions: str | None = Field(
+        default=None,
+        description="Instructions for fixing the output. Should define what is wrong, and how to fix it. Will be used by models for both generating a fixed output, and evaluating future models.",
+    )
+    repaired_output: TaskOutput | None = Field(
+        default=None,
+        description="A version of the output with issues fixed. This must be a 'fixed' version of the existing output, and not an entirely new output. If you wish to generate an ideal curatorial output for this task unrelated to this output, generate a new TaskOutput with type 'human' instead of using this field.",
+    )
+
+    def parent_task(self) -> Task | None:
+        if not isinstance(self.parent, Task):
+            return None
+        return self.parent
+
+    @model_validator(mode="after")
+    def validate_input_format(self) -> Self:
+        task = self.parent_task()
+        if task is None:
+            # don't validate this relationship until we have a path or parent. Give them time to build it (but will catch it before saving)
+            return self
+
+        # validate input
+        if task.input_json_schema is not None:
+            try:
+                validate_schema(json.loads(self.input), task.input_json_schema)
+            except json.JSONDecodeError:
+                raise ValueError("Input is not a valid JSON object")
+            except jsonschema.exceptions.ValidationError as e:
+                raise ValueError(f"Input does not match task input schema: {e}")
+        return self
+
+    @model_validator(mode="after")
+    def validate_output_format(self) -> Self:
+        task = self.parent_task()
+        if task is None:
+            return self
+
+        self.output.validate_output_format(task)
+        return self
+
+    @model_validator(mode="after")
+    def validate_requirement_ratings(self) -> Self:
+        task = self.parent_task()
+        if task is None:
+            return self
+
+        if self.output.rating is not None:
+            self.output.rating.validate_requirement_rating_keys(task)
+        if self.repaired_output is not None and self.repaired_output.rating is not None:
+            self.repaired_output.rating.validate_requirement_rating_keys(task)
+
+        return self
+
+    @model_validator(mode="after")
+    def validate_repaired_output(self) -> Self:
+        if self.repaired_output is not None:
+            if self.repaired_output.rating is not None:
+                raise ValueError(
+                    "Repaired output rating must be None. Repaired outputs are assumed to have a perfect rating, as they have been fixed."
+                )
+        if self.repair_instructions is None and self.repaired_output is not None:
+            raise ValueError(
+                "Repair instructions are required if providing a repaired output."
+            )
+        if self.repair_instructions is not None and self.repaired_output is None:
+            raise ValueError(
+                "A repaired output is required if providing repair instructions."
+            )
+        return self
+
+
+class TaskRequirement(BaseModel):
+    id: ID_TYPE = ID_FIELD
+    name: str = SHORT_NAME_FIELD
+    description: str | None = Field(default=None)
+    instruction: str = Field(min_length=1)
+    priority: Priority = Field(default=Priority.p2)
+
+
+class TaskDeterminism(str, Enum):
+    deterministic = "deterministic"  # Expect exact match
+    semantic_match = "semantic_match"  # Expect same meaning, but flexible on expression of the meaning
+    flexible = "flexible"  # Flexible on semantic output. Eval should be custom based on parsing requirements.
+
+
+class Task(
+    KilnParentedModel,
+    KilnParentModel,
+    parent_of={"runs": TaskRun},
+):
+    name: str = NAME_FIELD
+    description: str = Field(default="")
+    priority: Priority = Field(default=Priority.p2)
+    determinism: TaskDeterminism = Field(default=TaskDeterminism.flexible)
+    instruction: str = Field(min_length=1)
+    requirements: List[TaskRequirement] = Field(default=[])
+    # TODO: make this required, or formalize the default message output schema
+    output_json_schema: JsonObjectSchema | None = None
+    input_json_schema: JsonObjectSchema | None = None
+
+    def output_schema(self) -> Dict | None:
+        if self.output_json_schema is None:
+            return None
+        return schema_from_json_str(self.output_json_schema)
+
+    def input_schema(self) -> Dict | None:
+        if self.input_json_schema is None:
+            return None
+        return schema_from_json_str(self.input_json_schema)
+
+    # Needed for typechecking. TODO P2: fix this in KilnParentModel
+    def runs(self) -> list[TaskRun]:
+        return super().runs()  # type: ignore
+
+
+class Project(KilnParentModel, parent_of={"tasks": Task}):
+    name: str = NAME_FIELD
+    description: str | None = Field(
+        default=None,
+        description="A description of the project for you and your team. Will not be used in prompts/training/validation.",
+    )
+
+    # Needed for typechecking. TODO P2: fix this in KilnParentModel
+    def tasks(self) -> list[Task]:
+        return super().tasks()  # type: ignore
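
A few notes on how the new datamodel behaves, each with a minimal sketch. First, TaskOutputRating: the five_star validator only accepts whole-number float values from 1.0 to 5.0, and is_high_quality treats 4.0 and above as high quality. This sketch assumes KilnBaseModel (defined in basemodel.py, not shown in this diff) adds no required constructor fields:

from pydantic import ValidationError

from kiln_ai.datamodel import TaskOutputRating

# Whole-number floats between 1.0 and 5.0 pass the five_star validator.
rating = TaskOutputRating(value=5.0)
assert rating.is_high_quality()  # 4.0 and above counts as high quality

# Fractional values are rejected; pydantic surfaces the validator's
# ValueError as a ValidationError at construction time.
try:
    TaskOutputRating(value=4.5)
except ValidationError as err:
    print(err)  # "... must be an integer value (1.0, 2.0, 3.0, 4.0, or 5.0)"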
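
Second, DataSource: the _data_source_properties table drives per-type validation. Human sources require created_by, synthetic sources require model_name, model_provider, and adapter_name, and each type rejects the other's keys. A sketch, with illustrative property values (not entries from the package's model list):

from pydantic import ValidationError

from kiln_ai.datamodel import DataSource, DataSourceType

# Human sources must name their author; synthetic-only keys are rejected.
human = DataSource(type=DataSourceType.human, properties={"created_by": "jane"})

# Synthetic sources must record what produced them.
synthetic = DataSource(
    type=DataSourceType.synthetic,
    properties={
        "model_name": "example-model",  # illustrative values
        "model_provider": "example-provider",
        "adapter_name": "example-adapter",
    },
)

# Omitting a required property fails at construction time,
# e.g. "'model_name' is required for ... data source".
try:
    DataSource(type=DataSourceType.synthetic, properties={})
except ValidationError as err:
    print(err)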
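
Third, structured output: output_json_schema and input_json_schema are stored as JSON strings, so validation round-trips through json.loads before the jsonschema check. This sketch assumes JsonObjectSchema is a validated string type (its use with schema_from_json_str suggests so) and that a Task can be constructed standalone, with its parent Project attached later; the task name and schema are illustrative:

from kiln_ai.datamodel import DataSource, DataSourceType, Task, TaskOutput

schema = '{"type": "object", "properties": {"answer": {"type": "string"}}, "required": ["answer"]}'
task = Task(name="qa task", instruction="Answer the question.", output_json_schema=schema)

output = TaskOutput(
    output='{"answer": "42"}',  # JSON-formatted, since the task declares an output schema
    source=DataSource(type=DataSourceType.human, properties={"created_by": "jane"}),
)
output.validate_output_format(task)  # raises ValueError on malformed JSON or schema mismatch
print(task.output_schema())  # the schema string parsed into a dict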
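
Finally, repairs on TaskRun travel in pairs: repair_instructions and repaired_output must both be set or both be None, and a repaired output may not carry a rating since it is assumed to be a perfect fix. A sketch of a run with a repair, constructed without a parent Task (the validators' None-handling suggests the parent is optional until save):

from kiln_ai.datamodel import DataSource, DataSourceType, TaskOutput, TaskRun

human = DataSource(type=DataSourceType.human, properties={"created_by": "jane"})

run = TaskRun(
    input="What is 6 x 7?",
    input_source=human,
    output=TaskOutput(output="41", source=human),
    # Both repair fields must be provided together, and the repaired
    # output must be unrated (it is assumed to be a perfect fix).
    repair_instructions="The arithmetic is wrong; 6 x 7 is 42.",
    repaired_output=TaskOutput(output="42", source=human),
)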