kiln-ai 0.18.0__py3-none-any.whl → 0.20.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
This release has been flagged as potentially problematic.
- kiln_ai/adapters/__init__.py +2 -2
- kiln_ai/adapters/adapter_registry.py +46 -0
- kiln_ai/adapters/chat/chat_formatter.py +8 -12
- kiln_ai/adapters/chat/test_chat_formatter.py +6 -2
- kiln_ai/adapters/data_gen/data_gen_task.py +2 -2
- kiln_ai/adapters/data_gen/test_data_gen_task.py +7 -3
- kiln_ai/adapters/docker_model_runner_tools.py +119 -0
- kiln_ai/adapters/eval/base_eval.py +2 -2
- kiln_ai/adapters/eval/eval_runner.py +3 -1
- kiln_ai/adapters/eval/g_eval.py +2 -2
- kiln_ai/adapters/eval/test_base_eval.py +1 -1
- kiln_ai/adapters/eval/test_eval_runner.py +6 -12
- kiln_ai/adapters/eval/test_g_eval.py +3 -4
- kiln_ai/adapters/eval/test_g_eval_data.py +1 -1
- kiln_ai/adapters/fine_tune/__init__.py +1 -1
- kiln_ai/adapters/fine_tune/base_finetune.py +1 -0
- kiln_ai/adapters/fine_tune/fireworks_finetune.py +32 -20
- kiln_ai/adapters/fine_tune/openai_finetune.py +14 -4
- kiln_ai/adapters/fine_tune/test_fireworks_finetune.py +30 -21
- kiln_ai/adapters/fine_tune/test_openai_finetune.py +108 -111
- kiln_ai/adapters/ml_model_list.py +1009 -111
- kiln_ai/adapters/model_adapters/base_adapter.py +62 -28
- kiln_ai/adapters/model_adapters/litellm_adapter.py +397 -80
- kiln_ai/adapters/model_adapters/test_base_adapter.py +194 -18
- kiln_ai/adapters/model_adapters/test_litellm_adapter.py +428 -4
- kiln_ai/adapters/model_adapters/test_litellm_adapter_tools.py +1103 -0
- kiln_ai/adapters/model_adapters/test_saving_adapter_results.py +5 -5
- kiln_ai/adapters/model_adapters/test_structured_output.py +120 -14
- kiln_ai/adapters/parsers/__init__.py +1 -1
- kiln_ai/adapters/parsers/test_r1_parser.py +1 -1
- kiln_ai/adapters/provider_tools.py +35 -20
- kiln_ai/adapters/remote_config.py +57 -10
- kiln_ai/adapters/repair/repair_task.py +1 -1
- kiln_ai/adapters/repair/test_repair_task.py +12 -9
- kiln_ai/adapters/run_output.py +3 -0
- kiln_ai/adapters/test_adapter_registry.py +109 -2
- kiln_ai/adapters/test_docker_model_runner_tools.py +305 -0
- kiln_ai/adapters/test_ml_model_list.py +51 -1
- kiln_ai/adapters/test_prompt_adaptors.py +13 -6
- kiln_ai/adapters/test_provider_tools.py +73 -12
- kiln_ai/adapters/test_remote_config.py +470 -16
- kiln_ai/datamodel/__init__.py +23 -21
- kiln_ai/datamodel/basemodel.py +54 -28
- kiln_ai/datamodel/datamodel_enums.py +3 -0
- kiln_ai/datamodel/dataset_split.py +5 -3
- kiln_ai/datamodel/eval.py +4 -4
- kiln_ai/datamodel/external_tool_server.py +298 -0
- kiln_ai/datamodel/finetune.py +2 -2
- kiln_ai/datamodel/json_schema.py +25 -10
- kiln_ai/datamodel/project.py +11 -4
- kiln_ai/datamodel/prompt.py +2 -2
- kiln_ai/datamodel/prompt_id.py +4 -4
- kiln_ai/datamodel/registry.py +0 -15
- kiln_ai/datamodel/run_config.py +62 -0
- kiln_ai/datamodel/task.py +8 -83
- kiln_ai/datamodel/task_output.py +7 -2
- kiln_ai/datamodel/task_run.py +41 -0
- kiln_ai/datamodel/test_basemodel.py +213 -21
- kiln_ai/datamodel/test_eval_model.py +6 -6
- kiln_ai/datamodel/test_example_models.py +175 -0
- kiln_ai/datamodel/test_external_tool_server.py +691 -0
- kiln_ai/datamodel/test_model_perf.py +1 -1
- kiln_ai/datamodel/test_prompt_id.py +5 -1
- kiln_ai/datamodel/test_registry.py +8 -3
- kiln_ai/datamodel/test_task.py +20 -47
- kiln_ai/datamodel/test_tool_id.py +239 -0
- kiln_ai/datamodel/tool_id.py +83 -0
- kiln_ai/tools/__init__.py +8 -0
- kiln_ai/tools/base_tool.py +82 -0
- kiln_ai/tools/built_in_tools/__init__.py +13 -0
- kiln_ai/tools/built_in_tools/math_tools.py +124 -0
- kiln_ai/tools/built_in_tools/test_math_tools.py +204 -0
- kiln_ai/tools/mcp_server_tool.py +95 -0
- kiln_ai/tools/mcp_session_manager.py +243 -0
- kiln_ai/tools/test_base_tools.py +199 -0
- kiln_ai/tools/test_mcp_server_tool.py +457 -0
- kiln_ai/tools/test_mcp_session_manager.py +1585 -0
- kiln_ai/tools/test_tool_registry.py +473 -0
- kiln_ai/tools/tool_registry.py +64 -0
- kiln_ai/utils/config.py +32 -0
- kiln_ai/utils/open_ai_types.py +94 -0
- kiln_ai/utils/project_utils.py +17 -0
- kiln_ai/utils/test_config.py +138 -1
- kiln_ai/utils/test_open_ai_types.py +131 -0
- {kiln_ai-0.18.0.dist-info → kiln_ai-0.20.1.dist-info}/METADATA +37 -6
- kiln_ai-0.20.1.dist-info/RECORD +138 -0
- kiln_ai-0.18.0.dist-info/RECORD +0 -115
- {kiln_ai-0.18.0.dist-info → kiln_ai-0.20.1.dist-info}/WHEEL +0 -0
- {kiln_ai-0.18.0.dist-info → kiln_ai-0.20.1.dist-info}/licenses/LICENSE.txt +0 -0
kiln_ai/datamodel/run_config.py
ADDED
@@ -0,0 +1,62 @@
+from typing import List
+
+from pydantic import BaseModel, Field, model_validator
+from typing_extensions import Self
+
+from kiln_ai.datamodel.datamodel_enums import (
+    ModelProviderName,
+    StructuredOutputMode,
+)
+from kiln_ai.datamodel.prompt_id import PromptId
+from kiln_ai.datamodel.tool_id import ToolId
+
+
+class ToolsRunConfig(BaseModel):
+    """
+    A config describing which tools are available to a task.
+    """
+
+    tools: List[ToolId] = Field(
+        description="The IDs of the tools available to the task."
+    )
+
+
+class RunConfigProperties(BaseModel):
+    """
+    A configuration for running a task.
+
+    This includes everything needed to run a task, except the input and task ID. Running the same RunConfig with the same input should make identical calls to the model (output may vary as models are non-deterministic).
+    """
+
+    model_name: str = Field(description="The model to use for this run config.")
+    model_provider_name: ModelProviderName = Field(
+        description="The provider to use for this run config."
+    )
+    prompt_id: PromptId = Field(
+        description="The prompt to use for this run config. Defaults to building a simple prompt from the task if not provided.",
+    )
+    top_p: float = Field(
+        default=1.0,
+        description="The top-p value to use for this run config. Defaults to 1.0.",
+    )
+    temperature: float = Field(
+        default=1.0,
+        description="The temperature to use for this run config. Defaults to 1.0.",
+    )
+    structured_output_mode: StructuredOutputMode = Field(
+        description="The structured output mode to use for this run config.",
+    )
+    tools_config: ToolsRunConfig | None = Field(
+        default=None,
+        description="The tools config to use for this run config, defining which tools are available to the model.",
+    )
+
+    @model_validator(mode="after")
+    def validate_required_fields(self) -> Self:
+        if not (0 <= self.top_p <= 1):
+            raise ValueError("top_p must be between 0 and 1")
+
+        elif self.temperature < 0 or self.temperature > 2:
+            raise ValueError("temperature must be between 0 and 2")
+
+        return self
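For illustration, a minimal sketch of how the new RunConfigProperties model behaves. The provider value "openai" and the prompt id "simple_prompt_builder" appear in test fixtures later in this diff; the model name is made up, and the StructuredOutputMode value "json_schema" is an assumed enum member, not something shown in this diff.

from pydantic import ValidationError

from kiln_ai.datamodel.run_config import RunConfigProperties

# Valid config: top_p and temperature fall back to their 1.0 defaults.
config = RunConfigProperties(
    model_name="some_model",               # illustrative name
    model_provider_name="openai",          # provider value used in tests later in this diff
    prompt_id="simple_prompt_builder",     # prompt id used in tests later in this diff
    structured_output_mode="json_schema",  # assumed enum value; not shown in this diff
)
assert config.top_p == 1.0 and config.temperature == 1.0

# The "after" validator rejects out-of-range sampling parameters.
try:
    RunConfigProperties(
        model_name="some_model",
        model_provider_name="openai",
        prompt_id="simple_prompt_builder",
        structured_output_mode="json_schema",  # assumed enum value
        top_p=1.5,                             # outside [0, 1]
    )
except ValidationError as exc:
    print(exc)  # includes "top_p must be between 0 and 1"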
kiln_ai/datamodel/task.py
CHANGED
@@ -1,28 +1,26 @@
 from typing import TYPE_CHECKING, Dict, List, Union
 
 from pydantic import BaseModel, Field, ValidationInfo, model_validator
-from typing_extensions import Self
 
-from kiln_ai.datamodel import Finetune
 from kiln_ai.datamodel.basemodel import (
     ID_FIELD,
     ID_TYPE,
-
-
+    FilenameString,
+    FilenameStringShort,
     KilnParentedModel,
     KilnParentModel,
 )
 from kiln_ai.datamodel.datamodel_enums import (
-    ModelProviderName,
     Priority,
     StructuredOutputMode,
     TaskOutputRatingType,
 )
 from kiln_ai.datamodel.dataset_split import DatasetSplit
 from kiln_ai.datamodel.eval import Eval
+from kiln_ai.datamodel.finetune import Finetune
 from kiln_ai.datamodel.json_schema import JsonObjectSchema, schema_from_json_str
 from kiln_ai.datamodel.prompt import BasePrompt, Prompt
-from kiln_ai.datamodel.
+from kiln_ai.datamodel.run_config import RunConfigProperties
 from kiln_ai.datamodel.task_run import TaskRun
 
 if TYPE_CHECKING:
@@ -38,62 +36,13 @@ class TaskRequirement(BaseModel):
     """
 
     id: ID_TYPE = ID_FIELD
-    name:
+    name: FilenameStringShort = Field(description="The name of the task requirement.")
     description: str | None = Field(default=None)
     instruction: str = Field(min_length=1)
     priority: Priority = Field(default=Priority.p2)
     type: TaskOutputRatingType = Field(default=TaskOutputRatingType.five_star)
 
 
-class RunConfigProperties(BaseModel):
-    """
-    A configuration for running a task.
-
-    This includes everything needed to run a task, except the input and task ID. Running the same RunConfig with the same input should make identical calls to the model (output may vary as models are non-deterministic).
-    """
-
-    model_name: str = Field(description="The model to use for this run config.")
-    model_provider_name: ModelProviderName = Field(
-        description="The provider to use for this run config."
-    )
-    prompt_id: PromptId = Field(
-        description="The prompt to use for this run config. Defaults to building a simple prompt from the task if not provided.",
-    )
-    top_p: float = Field(
-        default=1.0,
-        description="The top-p value to use for this run config. Defaults to 1.0.",
-    )
-    temperature: float = Field(
-        default=1.0,
-        description="The temperature to use for this run config. Defaults to 1.0.",
-    )
-    structured_output_mode: StructuredOutputMode = Field(
-        description="The structured output mode to use for this run config.",
-    )
-
-    @model_validator(mode="after")
-    def validate_required_fields(self) -> Self:
-        if not (0 <= self.top_p <= 1):
-            raise ValueError("top_p must be between 0 and 1")
-
-        elif self.temperature < 0 or self.temperature > 2:
-            raise ValueError("temperature must be between 0 and 2")
-
-        return self
-
-
-class RunConfig(RunConfigProperties):
-    """
-    A configuration for running a task.
-
-    This includes everything needed to run a task, except the input. Running the same RunConfig with the same input should make identical calls to the model (output may vary as models are non-deterministic).
-
-    For example: task, model, provider, prompt, etc.
-    """
-
-    task: "Task" = Field(description="The task to run.")
-
-
 class TaskRunConfig(KilnParentedModel):
     """
     A Kiln model for persisting a run config in a Kiln Project, nested under a task.
@@ -103,7 +52,7 @@ class TaskRunConfig(KilnParentedModel):
     A run config includes everything needed to run a task, except the input. Running the same RunConfig with the same input should make identical calls to the model (output may vary as models are non-deterministic).
     """
 
-    name:
+    name: FilenameString = Field(description="The name of the task run config.")
     description: str | None = Field(
         default=None, description="The description of the task run config."
     )
@@ -124,15 +73,6 @@ class TaskRunConfig(KilnParentedModel):
             return None
         return self.parent  # type: ignore
 
-    def run_config(self) -> RunConfig:
-        parent_task = self.parent_task()
-        if parent_task is None:
-            raise ValueError("Run config must be parented to a task")
-        return run_config_from_run_config_properties(
-            task=parent_task,
-            run_config_properties=self.run_config_properties,
-        )
-
     # Previously we didn't store structured_output_mode in the run_config_properties. Updgrade old models when loading from file.
     @model_validator(mode="before")
     def upgrade_old_entries(cls, data: dict, info: ValidationInfo) -> dict:
@@ -155,21 +95,6 @@ class TaskRunConfig(KilnParentedModel):
         return data
 
 
-def run_config_from_run_config_properties(
-    task: "Task",
-    run_config_properties: RunConfigProperties,
-) -> RunConfig:
-    return RunConfig(
-        task=task,
-        model_name=run_config_properties.model_name,
-        model_provider_name=run_config_properties.model_provider_name,
-        prompt_id=run_config_properties.prompt_id,
-        top_p=run_config_properties.top_p,
-        temperature=run_config_properties.temperature,
-        structured_output_mode=run_config_properties.structured_output_mode,
-    )
-
-
 class Task(
     KilnParentedModel,
     KilnParentModel,
@@ -189,7 +114,7 @@ class Task(
     a collection of task runs.
     """
 
-    name:
+    name: FilenameString = Field(description="The name of the task.")
     description: str | None = Field(
         default=None,
         description="A description of the task for you and your team. Will not be used in prompts/training/validation.",
@@ -216,7 +141,7 @@ class Task(
             return None
         return schema_from_json_str(self.input_json_schema)
 
-    # These wrappers help for typechecking.
+    # These wrappers help for typechecking. We should fix this in KilnParentModel
     def runs(self, readonly: bool = False) -> list[TaskRun]:
         return super().runs(readonly=readonly)  # type: ignore
 
kiln_ai/datamodel/task_output.py
CHANGED
@@ -1,6 +1,6 @@
 import json
 from enum import Enum
-from typing import TYPE_CHECKING, Dict, List, Type, Union
+from typing import TYPE_CHECKING, Dict, List, Optional, Type, Union
 
 from pydantic import BaseModel, Field, ValidationInfo, model_validator
 from typing_extensions import Self
@@ -8,6 +8,7 @@ from typing_extensions import Self
 from kiln_ai.datamodel.basemodel import ID_TYPE, KilnBaseModel
 from kiln_ai.datamodel.datamodel_enums import TaskOutputRatingType
 from kiln_ai.datamodel.json_schema import validate_schema_with_value_error
+from kiln_ai.datamodel.run_config import RunConfigProperties
 from kiln_ai.datamodel.strict_mode import strict_mode
 from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error
 
@@ -199,6 +200,10 @@ class DataSource(BaseModel):
         default={},
         description="Properties describing the data source. For synthetic things like model. For human, the human's name.",
     )
+    run_config: Optional[RunConfigProperties] = Field(
+        default=None,
+        description="The run config used to generate the data, if generated by a running a model in Kiln (only true for type=synthetic).",
+    )
 
     _data_source_properties = [
         DataSourceProperty(
@@ -307,7 +312,7 @@ class TaskOutput(KilnBaseModel):
         if task.output_json_schema is not None:
             try:
                 output_parsed = json.loads(self.output)
-            except json.JSONDecodeError
+            except json.JSONDecodeError:
                 raise ValueError("Output is not a valid JSON object")
 
             validate_schema_with_value_error(
kiln_ai/datamodel/task_run.py
CHANGED
@@ -8,6 +8,7 @@ from kiln_ai.datamodel.basemodel import KilnParentedModel
 from kiln_ai.datamodel.json_schema import validate_schema_with_value_error
 from kiln_ai.datamodel.strict_mode import strict_mode
 from kiln_ai.datamodel.task_output import DataSource, TaskOutput
+from kiln_ai.utils.open_ai_types import ChatCompletionMessageParam
 
 if TYPE_CHECKING:
     from kiln_ai.datamodel.task import Task
@@ -35,6 +36,42 @@ class Usage(BaseModel):
         ge=0,
     )
 
+    def __add__(self, other: "Usage") -> "Usage":
+        """Add two Usage objects together, handling None values gracefully.
+
+        None + None = None
+        None + value = value
+        value + None = value
+        value1 + value2 = value1 + value2
+        """
+        if not isinstance(other, Usage):
+            raise TypeError(f"Cannot add Usage with {type(other).__name__}")
+
+        def _add_optional_int(a: int | None, b: int | None) -> int | None:
+            if a is None and b is None:
+                return None
+            if a is None:
+                return b
+            if b is None:
+                return a
+            return a + b
+
+        def _add_optional_float(a: float | None, b: float | None) -> float | None:
+            if a is None and b is None:
+                return None
+            if a is None:
+                return b
+            if b is None:
+                return a
+            return a + b
+
+        return Usage(
+            input_tokens=_add_optional_int(self.input_tokens, other.input_tokens),
+            output_tokens=_add_optional_int(self.output_tokens, other.output_tokens),
+            total_tokens=_add_optional_int(self.total_tokens, other.total_tokens),
+            cost=_add_optional_float(self.cost, other.cost),
+        )
+
 
 class TaskRun(KilnParentedModel):
     """
@@ -72,6 +109,10 @@ class TaskRun(KilnParentedModel):
         default=None,
         description="Usage information for the task run. This includes the number of input tokens, output tokens, and total tokens used.",
     )
+    trace: list[ChatCompletionMessageParam] | None = Field(
+        default=None,
+        description="The trace of the task run in OpenAI format. This is the list of messages that were sent to/from the model.",
+    )
 
     def thinking_training_data(self) -> str | None:
         """
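For illustration, a minimal sketch of the None-aware addition the new Usage.__add__ above implements; the field names come from the hunk, the values are made up.

from kiln_ai.datamodel.task_run import Usage

a = Usage(input_tokens=10, output_tokens=5, total_tokens=15, cost=None)
b = Usage(input_tokens=3, output_tokens=None, total_tokens=3, cost=0.002)

combined = a + b
assert combined.input_tokens == 13   # 10 + 3
assert combined.output_tokens == 5   # 5 + None -> 5
assert combined.total_tokens == 18   # 15 + 3
assert combined.cost == 0.002        # None + 0.002 -> 0.002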
kiln_ai/datamodel/test_basemodel.py
CHANGED
@@ -1,5 +1,6 @@
 import datetime
 import json
+import uuid
 from pathlib import Path
 from typing import Optional
 from unittest.mock import MagicMock, patch
@@ -12,10 +13,11 @@ from kiln_ai.datamodel import Task, TaskRun
 from kiln_ai.datamodel.basemodel import (
     KilnBaseModel,
     KilnParentedModel,
+    name_validator,
     string_to_valid_name,
 )
 from kiln_ai.datamodel.model_cache import ModelCache
-from kiln_ai.datamodel.task import
+from kiln_ai.datamodel.task import RunConfigProperties
 
 
 @pytest.fixture
@@ -328,28 +330,81 @@ def test_delete_no_path():
     model.delete()
 
 
-
-
-
-
-
+@pytest.mark.parametrize(
+    "name,expected",
+    [
+        # Basic valid strings remain unchanged
+        ("Hello World", "Hello World"),
+        ("Test-123", "Test-123"),
+        ("my_file_name", "my_file_name"),
+        ("multiple!!!symbols", "multiple!!!symbols"),
+        # Emoji
+        ("Hello 👍", "Hello 👍"),
+        # Invalid characters are replaced
+        ("Hello@World!", "Hello@World!"),
+        ("File.name.txt", "File_name_txt"),
+        ("Special%%%Chars", "Special_Chars"),
+        ("Special#$%Chars", "Special#$_Chars"),
+        # Consecutive invalid characters are replaced
+        ("Special%%%Chars", "Special_Chars"),
+        ("path/to/file", "path_to_file"),
+        # Leading/trailing special characters are removed
+        ("__test__", "test"),
+        ("...test...", "test"),
+        # Whitespace is replaced
+        ("", ""),
+        (" ", ""),
+        ("Hello World", "Hello World"),
+        # Unicode characters are replaced
+        ("你好", "你好"),
+        ("你好_世界", "你好_世界"),
+        ("你好_世界_你好", "你好_世界_你好"),
+        # Newlines, tabs, and other control characters are replaced
+        ("Hello\nworld", "Hello_world"),
+        ("Hello\tworld", "Hello_world"),
+        ("Hello\rworld", "Hello_world"),
+        ("Hello\fworld", "Hello_world"),
+        ("Hello\bworld", "Hello_world"),
+        ("Hello\vworld", "Hello_world"),
+        ("Hello\0world", "Hello_world"),
+        ("Hello\x00world", "Hello_world"),
+    ],
+)
+def test_string_to_valid_name(tmp_path, name, expected):
+    assert string_to_valid_name(name) == expected
 
-    #
-
-
-    assert string_to_valid_name("Special#$%Chars") == "Special_Chars"
+    # check we can create a folder with the valid name
+    dir_path = tmp_path / str(uuid.uuid4()) / expected
+    dir_path.mkdir(parents=True)
 
-    # Test consecutive invalid characters
-    assert string_to_valid_name("multiple!!!symbols") == "multiple_symbols"
-    assert string_to_valid_name("path/to/file") == "path_to_file"
 
-
-
-
+@pytest.mark.parametrize(
+    "name,min_length,max_length,should_pass",
+    [
+        # Valid cases
+        ("ValidName", 5, 20, True),
+        ("Short", 1, 10, True),
+        ("LongerValidName", 5, 20, True),
+        # None case (line 53)
+        (None, 5, 20, False),
+        # Too short cases (lines 57-59)
+        ("Hi", 5, 20, False),
+        ("", 1, 20, False),
+        ("a", 2, 20, False),
+        # Too long cases (lines 61-63)
+        ("ThisNameIsTooLong", 5, 10, False),
+        ("VeryVeryVeryLongName", 1, 15, False),
+    ],
+)
+def test_name_validator_error_conditions(name, min_length, max_length, should_pass):
+    validator = name_validator(min_length=min_length, max_length=max_length)
 
-
-
-
+    if should_pass:
+        result = validator(name)
+        assert result == name
+    else:
+        with pytest.raises(ValueError):
+            validator(name)
 
 
 def test_load_from_file_with_cache(test_base_file, tmp_model_cache):
@@ -497,8 +552,8 @@ def base_task():
 @pytest.fixture
 def adapter(base_task):
     return MockAdapter(
-
-
+        task=base_task,
+        run_config=RunConfigProperties(
             model_name="test_model",
             model_provider_name="openai",
             prompt_id="simple_prompt_builder",
@@ -553,3 +608,140 @@ async def test_invoke_parsing_flow(adapter):
         match="Reasoning is required for this model, but no reasoning was returned.",
     ):
         await adapter.invoke("test input")
+
+
+async def test_invoke_parsing_flow_basic_no_reasoning(adapter):
+    """Test for reasoning_optional_for_structured_output
+    when reasoning is not required.
+    This is a special case where we want to return the output as is.
+    """
+    # Mock dependencies
+    mock_provider = MagicMock()
+    mock_provider.parser = "test_parser"
+    mock_provider.formatter = None
+    mock_provider.reasoning_capable = False
+    mock_provider.reasoning_optional_for_structured_output = True
+
+    mock_parser = MagicMock()
+    mock_parser.parse_output.return_value = RunOutput(
+        output="parsed test output", intermediate_outputs={"key": "value"}
+    )
+
+    with (
+        patch.object(adapter, "model_provider", return_value=mock_provider),
+        patch(
+            "kiln_ai.adapters.model_adapters.base_adapter.model_parser_from_id",
+            return_value=mock_parser,
+        ),
+        patch("kiln_ai.adapters.model_adapters.base_adapter.Config") as mock_config,
+    ):
+        # Disable autosaving for this test
+        mock_config.shared.return_value.autosave_runs = False
+        mock_config.shared.return_value.user_id = "test_user_id"
+
+        # Execute
+        result = await adapter.invoke("test input")
+
+        # Verify parsing occurred
+        mock_parser.parse_output.assert_called_once()
+        parsed_args = mock_parser.parse_output.call_args[1]
+        assert isinstance(parsed_args["original_output"], RunOutput)
+        assert parsed_args["original_output"].output == "test output"
+
+        # Verify result contains parsed output
+        assert isinstance(result, TaskRun)
+        assert result.output.output == "parsed test output"
+        assert result.intermediate_outputs == {"key": "value"}
+        assert result.input == "test input"
+
+
+async def test_invoke_parsing_flow_no_reasoning_with_structured_output(adapter):
+    """Test for reasoning_optional_for_structured_output
+    when reasoning is required but not provided, with structured output enabled.
+    This is a special case where we don't want to error, but we want to return the output as is.
+    """
+    # Mock dependencies
+    mock_provider = MagicMock()
+    mock_provider.parser = "test_parser"
+    mock_provider.formatter = None
+    mock_provider.reasoning_capable = True
+    mock_provider.reasoning_optional_for_structured_output = True
+
+    mock_parser = MagicMock()
+    mock_parser.parse_output.return_value = RunOutput(
+        output="parsed test output", intermediate_outputs={"key": "value"}
+    )
+
+    with (
+        patch.object(adapter, "model_provider", return_value=mock_provider),
+        patch(
+            "kiln_ai.adapters.model_adapters.base_adapter.model_parser_from_id",
+            return_value=mock_parser,
+        ),
+        patch("kiln_ai.adapters.model_adapters.base_adapter.Config") as mock_config,
+        patch.object(adapter, "has_structured_output", return_value=True),
+    ):
+        # Disable autosaving for this test
+        mock_config.shared.return_value.autosave_runs = False
+        mock_config.shared.return_value.user_id = "test_user_id"
+
+        # Execute
+        result = await adapter.invoke("test input")
+
+        # Verify parsing occurred
+        mock_parser.parse_output.assert_called_once()
+        parsed_args = mock_parser.parse_output.call_args[1]
+        assert isinstance(parsed_args["original_output"], RunOutput)
+        assert parsed_args["original_output"].output == "test output"
+
+        # Verify result contains parsed output
+        assert isinstance(result, TaskRun)
+        assert result.output.output == "parsed test output"
+        assert result.intermediate_outputs == {"key": "value"}
+        assert result.input == "test input"
+
+
+async def test_invoke_parsing_flow_with_reasoning_and_structured_output(adapter):
+    """Test for reasoning_optional_for_structured_output
+    when reasoning is provided with structured output enabled.
+    This is a special case where we want to return the output as is.
+    """
+    # Mock dependencies
+    mock_provider = MagicMock()
+    mock_provider.parser = "test_parser"
+    mock_provider.formatter = None
+    mock_provider.reasoning_capable = True
+    mock_provider.reasoning_optional_for_structured_output = True
+
+    mock_parser = MagicMock()
+    mock_parser.parse_output.return_value = RunOutput(
+        output="parsed test output", intermediate_outputs={"reasoning": "value"}
+    )
+
+    with (
+        patch.object(adapter, "model_provider", return_value=mock_provider),
+        patch(
+            "kiln_ai.adapters.model_adapters.base_adapter.model_parser_from_id",
+            return_value=mock_parser,
+        ),
+        patch("kiln_ai.adapters.model_adapters.base_adapter.Config") as mock_config,
+        patch.object(adapter, "has_structured_output", return_value=True),
+    ):
+        # Disable autosaving for this test
+        mock_config.shared.return_value.autosave_runs = False
+        mock_config.shared.return_value.user_id = "test_user_id"
+
+        # Execute
+        result = await adapter.invoke("test input")
+
+        # Verify parsing occurred
+        mock_parser.parse_output.assert_called_once()
+        parsed_args = mock_parser.parse_output.call_args[1]
+        assert isinstance(parsed_args["original_output"], RunOutput)
+        assert parsed_args["original_output"].output == "test output"
+
+        # Verify result contains parsed output
+        assert isinstance(result, TaskRun)
+        assert result.output.output == "parsed test output"
+        assert result.intermediate_outputs == {"reasoning": "value"}
+        assert result.input == "test input"
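For illustration, a minimal sketch of the name_validator factory exercised by test_name_validator_error_conditions above; the behavior (return the name when it is within bounds, raise ValueError otherwise) is inferred from those parametrized cases, not from the validator's source.

from kiln_ai.datamodel.basemodel import name_validator

validate = name_validator(min_length=5, max_length=20)

assert validate("ValidName") == "ValidName"  # within bounds: returned unchanged

try:
    validate("Hi")  # shorter than min_length
except ValueError as exc:
    print(exc)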
kiln_ai/datamodel/test_eval_model.py
CHANGED
@@ -517,13 +517,13 @@ def test_eval_run_score_keys_must_match(valid_eval_config, valid_eval_run_data):
     valid_eval_config.parent = eval
 
     # Correct
-
+    EvalRun(
         parent=valid_eval_config,
         **{**valid_eval_run_data, "scores": {"accuracy": 4.5, "critical": 1.0}},
     )
 
     # Correct but wrong order still okay
-
+    EvalRun(
         parent=valid_eval_config,
         **{**valid_eval_run_data, "scores": {"critical": 1.0, "accuracy": 4.5}},
     )
@@ -533,7 +533,7 @@ def test_eval_run_score_keys_must_match(valid_eval_config, valid_eval_run_data):
         ValueError,
         match="The scores produced by the evaluator must match the scores expected by the eval",
     ):
-
+        EvalRun(
             parent=valid_eval_config,
             **{**valid_eval_run_data, "scores": {"accuracy": 4.5}},
         )
@@ -543,7 +543,7 @@ def test_eval_run_score_keys_must_match(valid_eval_config, valid_eval_run_data):
         ValueError,
         match="The scores produced by the evaluator must match the scores expected by the eval",
     ):
-
+        EvalRun(
             parent=valid_eval_config,
             **{
                 **valid_eval_run_data,
@@ -556,7 +556,7 @@ def test_eval_run_score_keys_must_match(valid_eval_config, valid_eval_run_data):
         ValueError,
         match="The scores produced by the evaluator must match the scores expected by the eval",
     ):
-
+        EvalRun(
             parent=valid_eval_config,
             **{**valid_eval_run_data, "scores": {"accuracy": 4.5, "wrong": 1.0}},
         )
@@ -566,7 +566,7 @@ def test_eval_run_custom_scores_not_allowed(valid_eval_config, valid_eval_run_data):
     with pytest.raises(
         ValueError, match="Custom scores are not supported in evaluators"
     ):
-
+        Eval(
            name="Test Eval",
            eval_set_filter_id="tag::tag1",
            eval_configs_filter_id="tag::tag2",