kiln-ai 0.18.0__py3-none-any.whl → 0.20.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of kiln-ai might be problematic.

Files changed (89)
  1. kiln_ai/adapters/__init__.py +2 -2
  2. kiln_ai/adapters/adapter_registry.py +46 -0
  3. kiln_ai/adapters/chat/chat_formatter.py +8 -12
  4. kiln_ai/adapters/chat/test_chat_formatter.py +6 -2
  5. kiln_ai/adapters/data_gen/data_gen_task.py +2 -2
  6. kiln_ai/adapters/data_gen/test_data_gen_task.py +7 -3
  7. kiln_ai/adapters/docker_model_runner_tools.py +119 -0
  8. kiln_ai/adapters/eval/base_eval.py +2 -2
  9. kiln_ai/adapters/eval/eval_runner.py +3 -1
  10. kiln_ai/adapters/eval/g_eval.py +2 -2
  11. kiln_ai/adapters/eval/test_base_eval.py +1 -1
  12. kiln_ai/adapters/eval/test_eval_runner.py +6 -12
  13. kiln_ai/adapters/eval/test_g_eval.py +3 -4
  14. kiln_ai/adapters/eval/test_g_eval_data.py +1 -1
  15. kiln_ai/adapters/fine_tune/__init__.py +1 -1
  16. kiln_ai/adapters/fine_tune/base_finetune.py +1 -0
  17. kiln_ai/adapters/fine_tune/fireworks_finetune.py +32 -20
  18. kiln_ai/adapters/fine_tune/openai_finetune.py +14 -4
  19. kiln_ai/adapters/fine_tune/test_fireworks_finetune.py +30 -21
  20. kiln_ai/adapters/fine_tune/test_openai_finetune.py +108 -111
  21. kiln_ai/adapters/ml_model_list.py +1009 -111
  22. kiln_ai/adapters/model_adapters/base_adapter.py +62 -28
  23. kiln_ai/adapters/model_adapters/litellm_adapter.py +397 -80
  24. kiln_ai/adapters/model_adapters/test_base_adapter.py +194 -18
  25. kiln_ai/adapters/model_adapters/test_litellm_adapter.py +428 -4
  26. kiln_ai/adapters/model_adapters/test_litellm_adapter_tools.py +1103 -0
  27. kiln_ai/adapters/model_adapters/test_saving_adapter_results.py +5 -5
  28. kiln_ai/adapters/model_adapters/test_structured_output.py +120 -14
  29. kiln_ai/adapters/parsers/__init__.py +1 -1
  30. kiln_ai/adapters/parsers/test_r1_parser.py +1 -1
  31. kiln_ai/adapters/provider_tools.py +35 -20
  32. kiln_ai/adapters/remote_config.py +57 -10
  33. kiln_ai/adapters/repair/repair_task.py +1 -1
  34. kiln_ai/adapters/repair/test_repair_task.py +12 -9
  35. kiln_ai/adapters/run_output.py +3 -0
  36. kiln_ai/adapters/test_adapter_registry.py +109 -2
  37. kiln_ai/adapters/test_docker_model_runner_tools.py +305 -0
  38. kiln_ai/adapters/test_ml_model_list.py +51 -1
  39. kiln_ai/adapters/test_prompt_adaptors.py +13 -6
  40. kiln_ai/adapters/test_provider_tools.py +73 -12
  41. kiln_ai/adapters/test_remote_config.py +470 -16
  42. kiln_ai/datamodel/__init__.py +23 -21
  43. kiln_ai/datamodel/basemodel.py +54 -28
  44. kiln_ai/datamodel/datamodel_enums.py +3 -0
  45. kiln_ai/datamodel/dataset_split.py +5 -3
  46. kiln_ai/datamodel/eval.py +4 -4
  47. kiln_ai/datamodel/external_tool_server.py +298 -0
  48. kiln_ai/datamodel/finetune.py +2 -2
  49. kiln_ai/datamodel/json_schema.py +25 -10
  50. kiln_ai/datamodel/project.py +11 -4
  51. kiln_ai/datamodel/prompt.py +2 -2
  52. kiln_ai/datamodel/prompt_id.py +4 -4
  53. kiln_ai/datamodel/registry.py +0 -15
  54. kiln_ai/datamodel/run_config.py +62 -0
  55. kiln_ai/datamodel/task.py +8 -83
  56. kiln_ai/datamodel/task_output.py +7 -2
  57. kiln_ai/datamodel/task_run.py +41 -0
  58. kiln_ai/datamodel/test_basemodel.py +213 -21
  59. kiln_ai/datamodel/test_eval_model.py +6 -6
  60. kiln_ai/datamodel/test_example_models.py +175 -0
  61. kiln_ai/datamodel/test_external_tool_server.py +691 -0
  62. kiln_ai/datamodel/test_model_perf.py +1 -1
  63. kiln_ai/datamodel/test_prompt_id.py +5 -1
  64. kiln_ai/datamodel/test_registry.py +8 -3
  65. kiln_ai/datamodel/test_task.py +20 -47
  66. kiln_ai/datamodel/test_tool_id.py +239 -0
  67. kiln_ai/datamodel/tool_id.py +83 -0
  68. kiln_ai/tools/__init__.py +8 -0
  69. kiln_ai/tools/base_tool.py +82 -0
  70. kiln_ai/tools/built_in_tools/__init__.py +13 -0
  71. kiln_ai/tools/built_in_tools/math_tools.py +124 -0
  72. kiln_ai/tools/built_in_tools/test_math_tools.py +204 -0
  73. kiln_ai/tools/mcp_server_tool.py +95 -0
  74. kiln_ai/tools/mcp_session_manager.py +243 -0
  75. kiln_ai/tools/test_base_tools.py +199 -0
  76. kiln_ai/tools/test_mcp_server_tool.py +457 -0
  77. kiln_ai/tools/test_mcp_session_manager.py +1585 -0
  78. kiln_ai/tools/test_tool_registry.py +473 -0
  79. kiln_ai/tools/tool_registry.py +64 -0
  80. kiln_ai/utils/config.py +32 -0
  81. kiln_ai/utils/open_ai_types.py +94 -0
  82. kiln_ai/utils/project_utils.py +17 -0
  83. kiln_ai/utils/test_config.py +138 -1
  84. kiln_ai/utils/test_open_ai_types.py +131 -0
  85. {kiln_ai-0.18.0.dist-info → kiln_ai-0.20.1.dist-info}/METADATA +37 -6
  86. kiln_ai-0.20.1.dist-info/RECORD +138 -0
  87. kiln_ai-0.18.0.dist-info/RECORD +0 -115
  88. {kiln_ai-0.18.0.dist-info → kiln_ai-0.20.1.dist-info}/WHEEL +0 -0
  89. {kiln_ai-0.18.0.dist-info → kiln_ai-0.20.1.dist-info}/licenses/LICENSE.txt +0 -0
kiln_ai/datamodel/run_config.py ADDED
@@ -0,0 +1,62 @@
+ from typing import List
+
+ from pydantic import BaseModel, Field, model_validator
+ from typing_extensions import Self
+
+ from kiln_ai.datamodel.datamodel_enums import (
+     ModelProviderName,
+     StructuredOutputMode,
+ )
+ from kiln_ai.datamodel.prompt_id import PromptId
+ from kiln_ai.datamodel.tool_id import ToolId
+
+
+ class ToolsRunConfig(BaseModel):
+     """
+     A config describing which tools are available to a task.
+     """
+
+     tools: List[ToolId] = Field(
+         description="The IDs of the tools available to the task."
+     )
+
+
+ class RunConfigProperties(BaseModel):
+     """
+     A configuration for running a task.
+
+     This includes everything needed to run a task, except the input and task ID. Running the same RunConfig with the same input should make identical calls to the model (output may vary as models are non-deterministic).
+     """
+
+     model_name: str = Field(description="The model to use for this run config.")
+     model_provider_name: ModelProviderName = Field(
+         description="The provider to use for this run config."
+     )
+     prompt_id: PromptId = Field(
+         description="The prompt to use for this run config. Defaults to building a simple prompt from the task if not provided.",
+     )
+     top_p: float = Field(
+         default=1.0,
+         description="The top-p value to use for this run config. Defaults to 1.0.",
+     )
+     temperature: float = Field(
+         default=1.0,
+         description="The temperature to use for this run config. Defaults to 1.0.",
+     )
+     structured_output_mode: StructuredOutputMode = Field(
+         description="The structured output mode to use for this run config.",
+     )
+     tools_config: ToolsRunConfig | None = Field(
+         default=None,
+         description="The tools config to use for this run config, defining which tools are available to the model.",
+     )
+
+     @model_validator(mode="after")
+     def validate_required_fields(self) -> Self:
+         if not (0 <= self.top_p <= 1):
+             raise ValueError("top_p must be between 0 and 1")
+
+         elif self.temperature < 0 or self.temperature > 2:
+             raise ValueError("temperature must be between 0 and 2")
+
+         return self
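For orientation, the following is a minimal usage sketch of the new RunConfigProperties model; it is not code from the package. The field values are illustrative, and the "json_schema" structured output mode is an assumed StructuredOutputMode value.

    # Hypothetical usage sketch; values below are illustrative, not from this diff.
    from kiln_ai.datamodel.run_config import RunConfigProperties

    run_config = RunConfigProperties(
        model_name="test_model",
        model_provider_name="openai",          # coerced to a ModelProviderName member
        prompt_id="simple_prompt_builder",
        structured_output_mode="json_schema",  # assumed enum value
    )

    # tools_config is optional; when set it wraps ToolsRunConfig(tools=[...]) with
    # ToolId strings in whatever formats kiln_ai/datamodel/tool_id.py defines.

    # The after-validator enforces the sampling ranges, so e.g. top_p=1.5 or
    # temperature=3.0 raise ValueError.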
kiln_ai/datamodel/task.py CHANGED
@@ -1,28 +1,26 @@
  from typing import TYPE_CHECKING, Dict, List, Union

  from pydantic import BaseModel, Field, ValidationInfo, model_validator
- from typing_extensions import Self

- from kiln_ai.datamodel import Finetune
  from kiln_ai.datamodel.basemodel import (
      ID_FIELD,
      ID_TYPE,
-     NAME_FIELD,
-     SHORT_NAME_FIELD,
+     FilenameString,
+     FilenameStringShort,
      KilnParentedModel,
      KilnParentModel,
  )
  from kiln_ai.datamodel.datamodel_enums import (
-     ModelProviderName,
      Priority,
      StructuredOutputMode,
      TaskOutputRatingType,
  )
  from kiln_ai.datamodel.dataset_split import DatasetSplit
  from kiln_ai.datamodel.eval import Eval
+ from kiln_ai.datamodel.finetune import Finetune
  from kiln_ai.datamodel.json_schema import JsonObjectSchema, schema_from_json_str
  from kiln_ai.datamodel.prompt import BasePrompt, Prompt
- from kiln_ai.datamodel.prompt_id import PromptId
+ from kiln_ai.datamodel.run_config import RunConfigProperties
  from kiln_ai.datamodel.task_run import TaskRun

  if TYPE_CHECKING:
@@ -38,62 +36,13 @@ class TaskRequirement(BaseModel):
      """

      id: ID_TYPE = ID_FIELD
-     name: str = SHORT_NAME_FIELD
+     name: FilenameStringShort = Field(description="The name of the task requirement.")
      description: str | None = Field(default=None)
      instruction: str = Field(min_length=1)
      priority: Priority = Field(default=Priority.p2)
      type: TaskOutputRatingType = Field(default=TaskOutputRatingType.five_star)


- class RunConfigProperties(BaseModel):
-     """
-     A configuration for running a task.
-
-     This includes everything needed to run a task, except the input and task ID. Running the same RunConfig with the same input should make identical calls to the model (output may vary as models are non-deterministic).
-     """
-
-     model_name: str = Field(description="The model to use for this run config.")
-     model_provider_name: ModelProviderName = Field(
-         description="The provider to use for this run config."
-     )
-     prompt_id: PromptId = Field(
-         description="The prompt to use for this run config. Defaults to building a simple prompt from the task if not provided.",
-     )
-     top_p: float = Field(
-         default=1.0,
-         description="The top-p value to use for this run config. Defaults to 1.0.",
-     )
-     temperature: float = Field(
-         default=1.0,
-         description="The temperature to use for this run config. Defaults to 1.0.",
-     )
-     structured_output_mode: StructuredOutputMode = Field(
-         description="The structured output mode to use for this run config.",
-     )
-
-     @model_validator(mode="after")
-     def validate_required_fields(self) -> Self:
-         if not (0 <= self.top_p <= 1):
-             raise ValueError("top_p must be between 0 and 1")
-
-         elif self.temperature < 0 or self.temperature > 2:
-             raise ValueError("temperature must be between 0 and 2")
-
-         return self
-
-
- class RunConfig(RunConfigProperties):
-     """
-     A configuration for running a task.
-
-     This includes everything needed to run a task, except the input. Running the same RunConfig with the same input should make identical calls to the model (output may vary as models are non-deterministic).
-
-     For example: task, model, provider, prompt, etc.
-     """
-
-     task: "Task" = Field(description="The task to run.")
-
-
  class TaskRunConfig(KilnParentedModel):
      """
      A Kiln model for persisting a run config in a Kiln Project, nested under a task.
@@ -103,7 +52,7 @@ class TaskRunConfig(KilnParentedModel):
      A run config includes everything needed to run a task, except the input. Running the same RunConfig with the same input should make identical calls to the model (output may vary as models are non-deterministic).
      """

-     name: str = NAME_FIELD
+     name: FilenameString = Field(description="The name of the task run config.")
      description: str | None = Field(
          default=None, description="The description of the task run config."
      )
@@ -124,15 +73,6 @@ class TaskRunConfig(KilnParentedModel):
              return None
          return self.parent  # type: ignore

-     def run_config(self) -> RunConfig:
-         parent_task = self.parent_task()
-         if parent_task is None:
-             raise ValueError("Run config must be parented to a task")
-         return run_config_from_run_config_properties(
-             task=parent_task,
-             run_config_properties=self.run_config_properties,
-         )
-
      # Previously we didn't store structured_output_mode in the run_config_properties. Updgrade old models when loading from file.
      @model_validator(mode="before")
      def upgrade_old_entries(cls, data: dict, info: ValidationInfo) -> dict:
@@ -155,21 +95,6 @@ class TaskRunConfig(KilnParentedModel):
          return data


- def run_config_from_run_config_properties(
-     task: "Task",
-     run_config_properties: RunConfigProperties,
- ) -> RunConfig:
-     return RunConfig(
-         task=task,
-         model_name=run_config_properties.model_name,
-         model_provider_name=run_config_properties.model_provider_name,
-         prompt_id=run_config_properties.prompt_id,
-         top_p=run_config_properties.top_p,
-         temperature=run_config_properties.temperature,
-         structured_output_mode=run_config_properties.structured_output_mode,
-     )
-
-
  class Task(
      KilnParentedModel,
      KilnParentModel,
@@ -189,7 +114,7 @@ class Task(
      a collection of task runs.
      """

-     name: str = NAME_FIELD
+     name: FilenameString = Field(description="The name of the task.")
      description: str | None = Field(
          default=None,
          description="A description of the task for you and your team. Will not be used in prompts/training/validation.",
@@ -216,7 +141,7 @@ class Task(
              return None
          return schema_from_json_str(self.input_json_schema)

-     # These wrappers help for typechecking. TODO P2: fix this in KilnParentModel
+     # These wrappers help for typechecking. We should fix this in KilnParentModel
      def runs(self, readonly: bool = False) -> list[TaskRun]:
          return super().runs(readonly=readonly)  # type: ignore

kiln_ai/datamodel/task_output.py CHANGED
@@ -1,6 +1,6 @@
  import json
  from enum import Enum
- from typing import TYPE_CHECKING, Dict, List, Type, Union
+ from typing import TYPE_CHECKING, Dict, List, Optional, Type, Union

  from pydantic import BaseModel, Field, ValidationInfo, model_validator
  from typing_extensions import Self
@@ -8,6 +8,7 @@ from typing_extensions import Self
  from kiln_ai.datamodel.basemodel import ID_TYPE, KilnBaseModel
  from kiln_ai.datamodel.datamodel_enums import TaskOutputRatingType
  from kiln_ai.datamodel.json_schema import validate_schema_with_value_error
+ from kiln_ai.datamodel.run_config import RunConfigProperties
  from kiln_ai.datamodel.strict_mode import strict_mode
  from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error

@@ -199,6 +200,10 @@ class DataSource(BaseModel):
          default={},
          description="Properties describing the data source. For synthetic things like model. For human, the human's name.",
      )
+     run_config: Optional[RunConfigProperties] = Field(
+         default=None,
+         description="The run config used to generate the data, if generated by a running a model in Kiln (only true for type=synthetic).",
+     )

      _data_source_properties = [
          DataSourceProperty(
@@ -307,7 +312,7 @@ class TaskOutput(KilnBaseModel):
          if task.output_json_schema is not None:
              try:
                  output_parsed = json.loads(self.output)
-             except json.JSONDecodeError as e:
+             except json.JSONDecodeError:
                  raise ValueError("Output is not a valid JSON object")

              validate_schema_with_value_error(
kiln_ai/datamodel/task_run.py CHANGED
@@ -8,6 +8,7 @@ from kiln_ai.datamodel.basemodel import KilnParentedModel
  from kiln_ai.datamodel.json_schema import validate_schema_with_value_error
  from kiln_ai.datamodel.strict_mode import strict_mode
  from kiln_ai.datamodel.task_output import DataSource, TaskOutput
+ from kiln_ai.utils.open_ai_types import ChatCompletionMessageParam

  if TYPE_CHECKING:
      from kiln_ai.datamodel.task import Task
@@ -35,6 +36,42 @@ class Usage(BaseModel):
          ge=0,
      )

+     def __add__(self, other: "Usage") -> "Usage":
+         """Add two Usage objects together, handling None values gracefully.
+
+         None + None = None
+         None + value = value
+         value + None = value
+         value1 + value2 = value1 + value2
+         """
+         if not isinstance(other, Usage):
+             raise TypeError(f"Cannot add Usage with {type(other).__name__}")
+
+         def _add_optional_int(a: int | None, b: int | None) -> int | None:
+             if a is None and b is None:
+                 return None
+             if a is None:
+                 return b
+             if b is None:
+                 return a
+             return a + b
+
+         def _add_optional_float(a: float | None, b: float | None) -> float | None:
+             if a is None and b is None:
+                 return None
+             if a is None:
+                 return b
+             if b is None:
+                 return a
+             return a + b
+
+         return Usage(
+             input_tokens=_add_optional_int(self.input_tokens, other.input_tokens),
+             output_tokens=_add_optional_int(self.output_tokens, other.output_tokens),
+             total_tokens=_add_optional_int(self.total_tokens, other.total_tokens),
+             cost=_add_optional_float(self.cost, other.cost),
+         )
+

  class TaskRun(KilnParentedModel):
      """
@@ -72,6 +109,10 @@ class TaskRun(KilnParentedModel):
          default=None,
          description="Usage information for the task run. This includes the number of input tokens, output tokens, and total tokens used.",
      )
+     trace: list[ChatCompletionMessageParam] | None = Field(
+         default=None,
+         description="The trace of the task run in OpenAI format. This is the list of messages that were sent to/from the model.",
+     )

      def thinking_training_data(self) -> str | None:
          """
kiln_ai/datamodel/test_basemodel.py CHANGED
@@ -1,5 +1,6 @@
  import datetime
  import json
+ import uuid
  from pathlib import Path
  from typing import Optional
  from unittest.mock import MagicMock, patch
@@ -12,10 +13,11 @@ from kiln_ai.datamodel import Task, TaskRun
  from kiln_ai.datamodel.basemodel import (
      KilnBaseModel,
      KilnParentedModel,
+     name_validator,
      string_to_valid_name,
  )
  from kiln_ai.datamodel.model_cache import ModelCache
- from kiln_ai.datamodel.task import RunConfig
+ from kiln_ai.datamodel.task import RunConfigProperties


  @pytest.fixture
@@ -328,28 +330,81 @@ def test_delete_no_path():
          model.delete()


- def test_string_to_valid_name():
-     # Test basic valid strings remain unchanged
-     assert string_to_valid_name("Hello World") == "Hello World"
-     assert string_to_valid_name("Test-123") == "Test-123"
-     assert string_to_valid_name("my_file_name") == "my_file_name"
+ @pytest.mark.parametrize(
+     "name,expected",
+     [
+         # Basic valid strings remain unchanged
+         ("Hello World", "Hello World"),
+         ("Test-123", "Test-123"),
+         ("my_file_name", "my_file_name"),
+         ("multiple!!!symbols", "multiple!!!symbols"),
+         # Emoji
+         ("Hello 👍", "Hello 👍"),
+         # Invalid characters are replaced
+         ("Hello@World!", "Hello@World!"),
+         ("File.name.txt", "File_name_txt"),
+         ("Special%%%Chars", "Special_Chars"),
+         ("Special#$%Chars", "Special#$_Chars"),
+         # Consecutive invalid characters are replaced
+         ("Special%%%Chars", "Special_Chars"),
+         ("path/to/file", "path_to_file"),
+         # Leading/trailing special characters are removed
+         ("__test__", "test"),
+         ("...test...", "test"),
+         # Whitespace is replaced
+         ("", ""),
+         (" ", ""),
+         ("Hello World", "Hello World"),
+         # Unicode characters are replaced
+         ("你好", "你好"),
+         ("你好_世界", "你好_世界"),
+         ("你好_世界_你好", "你好_世界_你好"),
+         # Newlines, tabs, and other control characters are replaced
+         ("Hello\nworld", "Hello_world"),
+         ("Hello\tworld", "Hello_world"),
+         ("Hello\rworld", "Hello_world"),
+         ("Hello\fworld", "Hello_world"),
+         ("Hello\bworld", "Hello_world"),
+         ("Hello\vworld", "Hello_world"),
+         ("Hello\0world", "Hello_world"),
+         ("Hello\x00world", "Hello_world"),
+     ],
+ )
+ def test_string_to_valid_name(tmp_path, name, expected):
+     assert string_to_valid_name(name) == expected

-     # Test invalid characters are replaced
-     assert string_to_valid_name("Hello@World!") == "Hello_World"
-     assert string_to_valid_name("File.name.txt") == "File_name_txt"
-     assert string_to_valid_name("Special#$%Chars") == "Special_Chars"
+     # check we can create a folder with the valid name
+     dir_path = tmp_path / str(uuid.uuid4()) / expected
+     dir_path.mkdir(parents=True)

-     # Test consecutive invalid characters
-     assert string_to_valid_name("multiple!!!symbols") == "multiple_symbols"
-     assert string_to_valid_name("path/to/file") == "path_to_file"

-     # Test leading/trailing special characters
-     assert string_to_valid_name("__test__") == "test"
-     assert string_to_valid_name("...test...") == "test"
+ @pytest.mark.parametrize(
+     "name,min_length,max_length,should_pass",
+     [
+         # Valid cases
+         ("ValidName", 5, 20, True),
+         ("Short", 1, 10, True),
+         ("LongerValidName", 5, 20, True),
+         # None case (line 53)
+         (None, 5, 20, False),
+         # Too short cases (lines 57-59)
+         ("Hi", 5, 20, False),
+         ("", 1, 20, False),
+         ("a", 2, 20, False),
+         # Too long cases (lines 61-63)
+         ("ThisNameIsTooLong", 5, 10, False),
+         ("VeryVeryVeryLongName", 1, 15, False),
+     ],
+ )
+ def test_name_validator_error_conditions(name, min_length, max_length, should_pass):
+     validator = name_validator(min_length=min_length, max_length=max_length)

-     # Test empty string and whitespace
-     assert string_to_valid_name("") == ""
-     assert string_to_valid_name(" ") == ""
+     if should_pass:
+         result = validator(name)
+         assert result == name
+     else:
+         with pytest.raises(ValueError):
+             validator(name)


  def test_load_from_file_with_cache(test_base_file, tmp_model_cache):
@@ -497,8 +552,8 @@ def base_task():
  @pytest.fixture
  def adapter(base_task):
      return MockAdapter(
-         run_config=RunConfig(
-             task=base_task,
+         task=base_task,
+         run_config=RunConfigProperties(
              model_name="test_model",
              model_provider_name="openai",
              prompt_id="simple_prompt_builder",
@@ -553,3 +608,140 @@ async def test_invoke_parsing_flow(adapter):
          match="Reasoning is required for this model, but no reasoning was returned.",
      ):
          await adapter.invoke("test input")
+
+
+ async def test_invoke_parsing_flow_basic_no_reasoning(adapter):
+     """Test for reasoning_optional_for_structured_output
+     when reasoning is not required.
+     This is a special case where we want to return the output as is.
+     """
+     # Mock dependencies
+     mock_provider = MagicMock()
+     mock_provider.parser = "test_parser"
+     mock_provider.formatter = None
+     mock_provider.reasoning_capable = False
+     mock_provider.reasoning_optional_for_structured_output = True
+
+     mock_parser = MagicMock()
+     mock_parser.parse_output.return_value = RunOutput(
+         output="parsed test output", intermediate_outputs={"key": "value"}
+     )
+
+     with (
+         patch.object(adapter, "model_provider", return_value=mock_provider),
+         patch(
+             "kiln_ai.adapters.model_adapters.base_adapter.model_parser_from_id",
+             return_value=mock_parser,
+         ),
+         patch("kiln_ai.adapters.model_adapters.base_adapter.Config") as mock_config,
+     ):
+         # Disable autosaving for this test
+         mock_config.shared.return_value.autosave_runs = False
+         mock_config.shared.return_value.user_id = "test_user_id"
+
+         # Execute
+         result = await adapter.invoke("test input")
+
+         # Verify parsing occurred
+         mock_parser.parse_output.assert_called_once()
+         parsed_args = mock_parser.parse_output.call_args[1]
+         assert isinstance(parsed_args["original_output"], RunOutput)
+         assert parsed_args["original_output"].output == "test output"
+
+         # Verify result contains parsed output
+         assert isinstance(result, TaskRun)
+         assert result.output.output == "parsed test output"
+         assert result.intermediate_outputs == {"key": "value"}
+         assert result.input == "test input"
+
+
+ async def test_invoke_parsing_flow_no_reasoning_with_structured_output(adapter):
+     """Test for reasoning_optional_for_structured_output
+     when reasoning is required but not provided, with structured output enabled.
+     This is a special case where we don't want to error, but we want to return the output as is.
+     """
+     # Mock dependencies
+     mock_provider = MagicMock()
+     mock_provider.parser = "test_parser"
+     mock_provider.formatter = None
+     mock_provider.reasoning_capable = True
+     mock_provider.reasoning_optional_for_structured_output = True
+
+     mock_parser = MagicMock()
+     mock_parser.parse_output.return_value = RunOutput(
+         output="parsed test output", intermediate_outputs={"key": "value"}
+     )
+
+     with (
+         patch.object(adapter, "model_provider", return_value=mock_provider),
+         patch(
+             "kiln_ai.adapters.model_adapters.base_adapter.model_parser_from_id",
+             return_value=mock_parser,
+         ),
+         patch("kiln_ai.adapters.model_adapters.base_adapter.Config") as mock_config,
+         patch.object(adapter, "has_structured_output", return_value=True),
+     ):
+         # Disable autosaving for this test
+         mock_config.shared.return_value.autosave_runs = False
+         mock_config.shared.return_value.user_id = "test_user_id"
+
+         # Execute
+         result = await adapter.invoke("test input")
+
+         # Verify parsing occurred
+         mock_parser.parse_output.assert_called_once()
+         parsed_args = mock_parser.parse_output.call_args[1]
+         assert isinstance(parsed_args["original_output"], RunOutput)
+         assert parsed_args["original_output"].output == "test output"
+
+         # Verify result contains parsed output
+         assert isinstance(result, TaskRun)
+         assert result.output.output == "parsed test output"
+         assert result.intermediate_outputs == {"key": "value"}
+         assert result.input == "test input"
+
+
+ async def test_invoke_parsing_flow_with_reasoning_and_structured_output(adapter):
+     """Test for reasoning_optional_for_structured_output
+     when reasoning is provided with structured output enabled.
+     This is a special case where we want to return the output as is.
+     """
+     # Mock dependencies
+     mock_provider = MagicMock()
+     mock_provider.parser = "test_parser"
+     mock_provider.formatter = None
+     mock_provider.reasoning_capable = True
+     mock_provider.reasoning_optional_for_structured_output = True
+
+     mock_parser = MagicMock()
+     mock_parser.parse_output.return_value = RunOutput(
+         output="parsed test output", intermediate_outputs={"reasoning": "value"}
+     )
+
+     with (
+         patch.object(adapter, "model_provider", return_value=mock_provider),
+         patch(
+             "kiln_ai.adapters.model_adapters.base_adapter.model_parser_from_id",
+             return_value=mock_parser,
+         ),
+         patch("kiln_ai.adapters.model_adapters.base_adapter.Config") as mock_config,
+         patch.object(adapter, "has_structured_output", return_value=True),
+     ):
+         # Disable autosaving for this test
+         mock_config.shared.return_value.autosave_runs = False
+         mock_config.shared.return_value.user_id = "test_user_id"
+
+         # Execute
+         result = await adapter.invoke("test input")
+
+         # Verify parsing occurred
+         mock_parser.parse_output.assert_called_once()
+         parsed_args = mock_parser.parse_output.call_args[1]
+         assert isinstance(parsed_args["original_output"], RunOutput)
+         assert parsed_args["original_output"].output == "test output"
+
+         # Verify result contains parsed output
+         assert isinstance(result, TaskRun)
+         assert result.output.output == "parsed test output"
+         assert result.intermediate_outputs == {"reasoning": "value"}
+         assert result.input == "test input"
kiln_ai/datamodel/test_eval_model.py CHANGED
@@ -517,13 +517,13 @@ def test_eval_run_score_keys_must_match(valid_eval_config, valid_eval_run_data):
      valid_eval_config.parent = eval

      # Correct
-     run = EvalRun(
+     EvalRun(
          parent=valid_eval_config,
          **{**valid_eval_run_data, "scores": {"accuracy": 4.5, "critical": 1.0}},
      )

      # Correct but wrong order still okay
-     run = EvalRun(
+     EvalRun(
          parent=valid_eval_config,
          **{**valid_eval_run_data, "scores": {"critical": 1.0, "accuracy": 4.5}},
      )
@@ -533,7 +533,7 @@ def test_eval_run_score_keys_must_match(valid_eval_config, valid_eval_run_data):
          ValueError,
          match="The scores produced by the evaluator must match the scores expected by the eval",
      ):
-         run = EvalRun(
+         EvalRun(
              parent=valid_eval_config,
              **{**valid_eval_run_data, "scores": {"accuracy": 4.5}},
          )
@@ -543,7 +543,7 @@ def test_eval_run_score_keys_must_match(valid_eval_config, valid_eval_run_data):
          ValueError,
          match="The scores produced by the evaluator must match the scores expected by the eval",
      ):
-         run = EvalRun(
+         EvalRun(
              parent=valid_eval_config,
              **{
                  **valid_eval_run_data,
@@ -556,7 +556,7 @@ def test_eval_run_score_keys_must_match(valid_eval_config, valid_eval_run_data):
          ValueError,
          match="The scores produced by the evaluator must match the scores expected by the eval",
      ):
-         run = EvalRun(
+         EvalRun(
              parent=valid_eval_config,
              **{**valid_eval_run_data, "scores": {"accuracy": 4.5, "wrong": 1.0}},
          )
@@ -566,7 +566,7 @@ def test_eval_run_custom_scores_not_allowed(valid_eval_config, valid_eval_run_da
      with pytest.raises(
          ValueError, match="Custom scores are not supported in evaluators"
      ):
-         eval = Eval(
+         Eval(
              name="Test Eval",
              eval_set_filter_id="tag::tag1",
              eval_configs_filter_id="tag::tag2",