kiln-ai 0.8.1__py3-none-any.whl → 0.12.0__py3-none-any.whl
This diff shows the content of publicly released package versions as they appear in their public registries. It is provided for informational purposes only and reflects the changes between the two versions.
Potentially problematic release: this version of kiln-ai might be problematic.
- kiln_ai/adapters/__init__.py +7 -7
- kiln_ai/adapters/adapter_registry.py +81 -10
- kiln_ai/adapters/data_gen/data_gen_task.py +21 -3
- kiln_ai/adapters/data_gen/test_data_gen_task.py +23 -3
- kiln_ai/adapters/eval/base_eval.py +164 -0
- kiln_ai/adapters/eval/eval_runner.py +267 -0
- kiln_ai/adapters/eval/g_eval.py +367 -0
- kiln_ai/adapters/eval/registry.py +16 -0
- kiln_ai/adapters/eval/test_base_eval.py +324 -0
- kiln_ai/adapters/eval/test_eval_runner.py +640 -0
- kiln_ai/adapters/eval/test_g_eval.py +497 -0
- kiln_ai/adapters/eval/test_g_eval_data.py +4 -0
- kiln_ai/adapters/fine_tune/base_finetune.py +5 -1
- kiln_ai/adapters/fine_tune/dataset_formatter.py +310 -65
- kiln_ai/adapters/fine_tune/fireworks_finetune.py +47 -32
- kiln_ai/adapters/fine_tune/openai_finetune.py +12 -11
- kiln_ai/adapters/fine_tune/test_base_finetune.py +19 -0
- kiln_ai/adapters/fine_tune/test_dataset_formatter.py +472 -129
- kiln_ai/adapters/fine_tune/test_fireworks_finetune.py +114 -22
- kiln_ai/adapters/fine_tune/test_openai_finetune.py +125 -14
- kiln_ai/adapters/ml_model_list.py +434 -93
- kiln_ai/adapters/model_adapters/__init__.py +18 -0
- kiln_ai/adapters/model_adapters/base_adapter.py +250 -0
- kiln_ai/adapters/model_adapters/langchain_adapters.py +309 -0
- kiln_ai/adapters/model_adapters/openai_compatible_config.py +10 -0
- kiln_ai/adapters/model_adapters/openai_model_adapter.py +289 -0
- kiln_ai/adapters/model_adapters/test_base_adapter.py +199 -0
- kiln_ai/adapters/{test_langchain_adapter.py → model_adapters/test_langchain_adapter.py} +105 -97
- kiln_ai/adapters/model_adapters/test_openai_model_adapter.py +216 -0
- kiln_ai/adapters/{test_saving_adapter_results.py → model_adapters/test_saving_adapter_results.py} +80 -30
- kiln_ai/adapters/{test_structured_output.py → model_adapters/test_structured_output.py} +125 -46
- kiln_ai/adapters/ollama_tools.py +0 -1
- kiln_ai/adapters/parsers/__init__.py +10 -0
- kiln_ai/adapters/parsers/base_parser.py +12 -0
- kiln_ai/adapters/parsers/json_parser.py +37 -0
- kiln_ai/adapters/parsers/parser_registry.py +19 -0
- kiln_ai/adapters/parsers/r1_parser.py +69 -0
- kiln_ai/adapters/parsers/test_json_parser.py +81 -0
- kiln_ai/adapters/parsers/test_parser_registry.py +32 -0
- kiln_ai/adapters/parsers/test_r1_parser.py +144 -0
- kiln_ai/adapters/prompt_builders.py +193 -49
- kiln_ai/adapters/provider_tools.py +91 -36
- kiln_ai/adapters/repair/repair_task.py +18 -19
- kiln_ai/adapters/repair/test_repair_task.py +7 -7
- kiln_ai/adapters/run_output.py +11 -0
- kiln_ai/adapters/test_adapter_registry.py +177 -0
- kiln_ai/adapters/test_generate_docs.py +69 -0
- kiln_ai/adapters/test_ollama_tools.py +0 -1
- kiln_ai/adapters/test_prompt_adaptors.py +25 -18
- kiln_ai/adapters/test_prompt_builders.py +265 -44
- kiln_ai/adapters/test_provider_tools.py +268 -46
- kiln_ai/datamodel/__init__.py +51 -772
- kiln_ai/datamodel/basemodel.py +31 -11
- kiln_ai/datamodel/datamodel_enums.py +58 -0
- kiln_ai/datamodel/dataset_filters.py +114 -0
- kiln_ai/datamodel/dataset_split.py +170 -0
- kiln_ai/datamodel/eval.py +298 -0
- kiln_ai/datamodel/finetune.py +105 -0
- kiln_ai/datamodel/json_schema.py +14 -3
- kiln_ai/datamodel/model_cache.py +8 -3
- kiln_ai/datamodel/project.py +23 -0
- kiln_ai/datamodel/prompt.py +37 -0
- kiln_ai/datamodel/prompt_id.py +83 -0
- kiln_ai/datamodel/strict_mode.py +24 -0
- kiln_ai/datamodel/task.py +181 -0
- kiln_ai/datamodel/task_output.py +321 -0
- kiln_ai/datamodel/task_run.py +164 -0
- kiln_ai/datamodel/test_basemodel.py +80 -2
- kiln_ai/datamodel/test_dataset_filters.py +71 -0
- kiln_ai/datamodel/test_dataset_split.py +127 -6
- kiln_ai/datamodel/test_datasource.py +3 -2
- kiln_ai/datamodel/test_eval_model.py +635 -0
- kiln_ai/datamodel/test_example_models.py +34 -17
- kiln_ai/datamodel/test_json_schema.py +23 -0
- kiln_ai/datamodel/test_model_cache.py +24 -0
- kiln_ai/datamodel/test_model_perf.py +125 -0
- kiln_ai/datamodel/test_models.py +131 -2
- kiln_ai/datamodel/test_prompt_id.py +129 -0
- kiln_ai/datamodel/test_task.py +159 -0
- kiln_ai/utils/config.py +6 -1
- kiln_ai/utils/exhaustive_error.py +6 -0
- {kiln_ai-0.8.1.dist-info → kiln_ai-0.12.0.dist-info}/METADATA +45 -7
- kiln_ai-0.12.0.dist-info/RECORD +100 -0
- kiln_ai/adapters/base_adapter.py +0 -191
- kiln_ai/adapters/langchain_adapters.py +0 -256
- kiln_ai-0.8.1.dist-info/RECORD +0 -58
- {kiln_ai-0.8.1.dist-info → kiln_ai-0.12.0.dist-info}/WHEEL +0 -0
- {kiln_ai-0.8.1.dist-info → kiln_ai-0.12.0.dist-info}/licenses/LICENSE.txt +0 -0
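The test diffs below illustrate the main API moves in this release: LangchainAdapter now lives under kiln_ai.adapters.model_adapters and takes an explicit provider, and adapters are built via adapter_for_task with a prompt_id rather than a prompt builder name. A minimal sketch of the new call shape, assuming a Task loaded from an existing Kiln project; the model and provider names are placeholders:

import kiln_ai.datamodel as datamodel
from kiln_ai.adapters.adapter_registry import adapter_for_task


async def run_once(task: datamodel.Task) -> str:
    # prompt_id replaces the old prompt_builder_name; built-in generator IDs such as
    # "simple_prompt_builder" are accepted, as are saved/fine-tune prompt IDs.
    adapter = adapter_for_task(
        task,
        model_name="llama_3_1_8b",  # placeholder model name
        provider="ollama",  # placeholder provider
        prompt_id="simple_chain_of_thought_prompt_builder",
    )
    run = await adapter.invoke("two plus two")
    return run.output.output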

kiln_ai/adapters/test_prompt_adaptors.py +25 -18

@@ -6,13 +6,14 @@ from langchain_core.language_models.fake_chat_models import FakeListChatModel
 
 import kiln_ai.datamodel as datamodel
 from kiln_ai.adapters.adapter_registry import adapter_for_task
-from kiln_ai.adapters.langchain_adapters import LangchainAdapter
 from kiln_ai.adapters.ml_model_list import built_in_models
+from kiln_ai.adapters.model_adapters.langchain_adapters import LangchainAdapter
 from kiln_ai.adapters.ollama_tools import ollama_online
 from kiln_ai.adapters.prompt_builders import (
     BasePromptBuilder,
     SimpleChainOfThoughtPromptBuilder,
 )
+from kiln_ai.datamodel import PromptId
 
 
 def get_all_models_and_providers():
@@ -108,7 +109,11 @@ async def test_amazon_bedrock(tmp_path):
 async def test_mock(tmp_path):
     task = build_test_task(tmp_path)
     mockChatModel = FakeListChatModel(responses=["mock response"])
-    adapter = LangchainAdapter(
+    adapter = LangchainAdapter(
+        task,
+        custom_model=mockChatModel,
+        provider="ollama",
+    )
     run = await adapter.invoke("You are a mock, send me the response!")
     assert "mock response" in run.output.output
 
@@ -116,7 +121,7 @@ async def test_mock(tmp_path):
 async def test_mock_returning_run(tmp_path):
     task = build_test_task(tmp_path)
     mockChatModel = FakeListChatModel(responses=["mock response"])
-    adapter = LangchainAdapter(task, custom_model=mockChatModel)
+    adapter = LangchainAdapter(task, custom_model=mockChatModel, provider="ollama")
     run = await adapter.invoke("You are a mock, send me the response!")
     assert run.output.output == "mock response"
     assert run is not None
@@ -127,8 +132,8 @@ async def test_mock_returning_run(tmp_path):
     assert run.output.source.properties == {
         "adapter_name": "kiln_langchain_adapter",
         "model_name": "custom.langchain:unknown_model",
-        "model_provider": "
-        "
+        "model_provider": "ollama",
+        "prompt_id": "simple_prompt_builder",
     }
 
 
@@ -145,8 +150,9 @@ async def test_all_models_providers_plaintext(tmp_path, model_name, provider_name):
 @pytest.mark.parametrize("model_name,provider_name", get_all_models_and_providers())
 async def test_cot_prompt_builder(tmp_path, model_name, provider_name):
     task = build_test_task(tmp_path)
-
-
+    await run_simple_task(
+        task, model_name, provider_name, "simple_chain_of_thought_prompt_builder"
+    )
 
 
 def build_test_task(tmp_path: Path):
@@ -182,20 +188,20 @@ async def run_simple_test(
     tmp_path: Path,
     model_name: str,
     provider: str | None = None,
-
+    prompt_id: PromptId | None = None,
 ):
     task = build_test_task(tmp_path)
-    return await run_simple_task(task, model_name, provider,
+    return await run_simple_task(task, model_name, provider, prompt_id)
 
 
 async def run_simple_task(
     task: datamodel.Task,
     model_name: str,
     provider: str,
-
+    prompt_id: PromptId | None = None,
 ) -> datamodel.TaskRun:
     adapter = adapter_for_task(
-        task, model_name=model_name, provider=provider,
+        task, model_name=model_name, provider=provider, prompt_id=prompt_id
     )
 
     run = await adapter.invoke(
@@ -208,13 +214,14 @@ async def run_simple_task(
     )
     assert "64" in run.output.output
     source_props = run.output.source.properties
-    assert source_props["adapter_name"]
+    assert source_props["adapter_name"] in [
+        "kiln_langchain_adapter",
+        "kiln_openai_compatible_adapter",
+    ]
     assert source_props["model_name"] == model_name
     assert source_props["model_provider"] == provider
-
-
-
-
-    )
-    assert source_props["prompt_builder_name"] == expected_prompt_builder_name
+    if prompt_id is None:
+        assert source_props["prompt_id"] == "simple_prompt_builder"
+    else:
+        assert source_props["prompt_id"] == prompt_id
     return run

kiln_ai/adapters/test_prompt_builders.py +265 -44

@@ -1,37 +1,49 @@
 import json
+import logging
 
 import pytest
 
-from kiln_ai.adapters.base_adapter import
+from kiln_ai.adapters.model_adapters.base_adapter import BaseAdapter
+from kiln_ai.adapters.model_adapters.test_structured_output import (
+    build_structured_output_test_task,
+)
 from kiln_ai.adapters.prompt_builders import (
     FewShotChainOfThoughtPromptBuilder,
     FewShotPromptBuilder,
+    FineTunePromptBuilder,
     MultiShotChainOfThoughtPromptBuilder,
     MultiShotPromptBuilder,
     RepairsPromptBuilder,
+    SavedPromptBuilder,
     SimpleChainOfThoughtPromptBuilder,
     SimplePromptBuilder,
+    TaskRunConfigPromptBuilder,
     chain_of_thought_prompt,
-
+    prompt_builder_from_id,
 )
 from kiln_ai.adapters.test_prompt_adaptors import build_test_task
-from kiln_ai.adapters.test_structured_output import build_structured_output_test_task
 from kiln_ai.datamodel import (
     DataSource,
     DataSourceType,
+    Finetune,
+    FinetuneDataStrategy,
     Project,
+    Prompt,
     Task,
     TaskOutput,
     TaskOutputRating,
     TaskRun,
 )
+from kiln_ai.datamodel.task import RunConfigProperties, TaskRunConfig
+
+logger = logging.getLogger(__name__)
 
 
 def test_simple_prompt_builder(tmp_path):
     task = build_test_task(tmp_path)
     builder = SimplePromptBuilder(task=task)
     input = "two plus two"
-    prompt = builder.build_prompt()
+    prompt = builder.build_prompt(include_json_instructions=False)
     assert (
         "You are an assistant which performs math tasks provided in plain text."
         in prompt
@@ -50,19 +62,15 @@ class MockAdapter(BaseAdapter):
     def _run(self, input: str) -> str:
         return "mock response"
 
-    def
-        return
-            adapter_name="mock_adapter",
-            model_name="mock_model",
-            model_provider="mock_provider",
-        )
+    def adapter_name(self) -> str:
+        return "mock_adapter"
 
 
 def test_simple_prompt_builder_structured_output(tmp_path):
     task = build_structured_output_test_task(tmp_path)
     builder = SimplePromptBuilder(task=task)
     input = "Cows"
-    prompt = builder.build_prompt()
+    prompt = builder.build_prompt(include_json_instructions=False)
     assert "You are an assistant which tells a joke, given a subject." in prompt
 
     user_msg = builder.build_user_message(input)
@@ -70,6 +78,14 @@ def test_simple_prompt_builder_structured_output(tmp_path):
     assert input not in prompt
 
 
+def test_simple_prompt_builder_structured_input_non_ascii(tmp_path):
+    task = build_structured_output_test_task(tmp_path)
+    builder = SimplePromptBuilder(task=task)
+    input = {"key": "你好👋"}
+    user_msg = builder.build_user_message(input)
+    assert "你好👋" in user_msg
+
+
 @pytest.fixture
 def task_with_examples(tmp_path):
     # Create a project and task hierarchy
@@ -198,7 +214,7 @@ def task_with_examples(tmp_path):
 def test_multi_shot_prompt_builder(task_with_examples):
     # Verify the order of examples
     prompt_builder = MultiShotPromptBuilder(task=task_with_examples)
-    prompt = prompt_builder.build_prompt()
+    prompt = prompt_builder.build_prompt(include_json_instructions=False)
     assert "Why did the cow cross the road?" in prompt
     assert prompt.index("Why did the cow cross the road?") < prompt.index(
         "Why don't cats play poker in the jungle?"
@@ -239,14 +255,14 @@ def test_few_shot_prompt_builder(tmp_path):
     # Create 6 examples (2 repaired, 4 high-quality)
     for i in range(6):
         run = TaskRun(
-            input=f'{{"subject": "Subject {i+1}"}}',
+            input=f'{{"subject": "Subject {i + 1}"}}',
             input_source=DataSource(
                 type=DataSourceType.human,
                 properties={"created_by": "john_doe"},
             ),
             parent=task,
             output=TaskOutput(
-                output=f'{{"joke": "Joke Initial Output {i+1}"}}',
+                output=f'{{"joke": "Joke Initial Output {i + 1}"}}',
                 source=DataSource(
                     type=DataSourceType.human,
                     properties={"created_by": "john_doe"},
@@ -254,13 +270,12 @@ def test_few_shot_prompt_builder(tmp_path):
                 rating=TaskOutputRating(value=4 + (i % 2), reason="Good joke"),
             ),
         )
-        print("RATING", "Joke Initial Output ", i + 1, " - RATED:", 4 + (i % 2), "\n")
         if i < 2:
             run = run.model_copy(
                 update={
                     "repair_instructions": "Fix the joke",
                     "repaired_output": TaskOutput(
-                        output=f'{{"joke": "Repaired Joke {i+1}"}}',
+                        output=f'{{"joke": "Repaired Joke {i + 1}"}}',
                         source=DataSource(
                             type=DataSourceType.human,
                             properties={"created_by": "jane_doe"},
@@ -272,10 +287,10 @@ def test_few_shot_prompt_builder(tmp_path):
 
     # Check that only 4 examples are included
     prompt_builder = FewShotPromptBuilder(task=task)
-    prompt = prompt_builder.build_prompt()
+    prompt = prompt_builder.build_prompt(include_json_instructions=False)
     assert prompt.count("## Example") == 4
 
-
+    logger.info("PROMPT: %s", prompt)
     # Verify the order of examples (2 repaired, then 2 highest-rated)
     assert "Repaired Joke 1" in prompt
     assert "Repaired Joke 2" in prompt
@@ -289,7 +304,7 @@ def test_few_shot_prompt_builder(tmp_path):
 
 def check_example_outputs(task: Task, count: int):
     prompt_builder = MultiShotPromptBuilder(task=task)
-    prompt = prompt_builder.build_prompt()
+    prompt = prompt_builder.build_prompt(include_json_instructions=False)
     assert "# Instruction" in prompt
     assert task.instruction in prompt
     if count == 0:
@@ -299,32 +314,89 @@ def check_example_outputs(task: Task, count: int):
     assert f"## Example {count}" in prompt
 
 
-def
-
-    assert
-
+def test_prompt_builder_from_id(task_with_examples):
+    task = task_with_examples
+    assert isinstance(
+        prompt_builder_from_id("simple_prompt_builder", task), SimplePromptBuilder
+    )
+    assert isinstance(
+        prompt_builder_from_id("few_shot_prompt_builder", task),
+        FewShotPromptBuilder,
+    )
+    assert isinstance(
+        prompt_builder_from_id("multi_shot_prompt_builder", task),
+        MultiShotPromptBuilder,
+    )
+    assert isinstance(
+        prompt_builder_from_id("repairs_prompt_builder", task),
+        RepairsPromptBuilder,
+    )
+    assert isinstance(
+        prompt_builder_from_id("simple_chain_of_thought_prompt_builder", task),
+        SimpleChainOfThoughtPromptBuilder,
+    )
+    assert isinstance(
+        prompt_builder_from_id("few_shot_chain_of_thought_prompt_builder", task),
+        FewShotChainOfThoughtPromptBuilder,
+    )
+    assert isinstance(
+        prompt_builder_from_id("multi_shot_chain_of_thought_prompt_builder", task),
+        MultiShotChainOfThoughtPromptBuilder,
+    )
+
+    with pytest.raises(ValueError, match="Unknown prompt generator: invalid_name"):
+        prompt_builder_from_id("invalid_name", task)
+
+    with pytest.raises(ValueError, match="Prompt ID not found: 123"):
+        prompt_builder_from_id("id::123", task)
 
+    with pytest.raises(
+        ValueError,
+        match="Invalid fine-tune ID format. Expected 'project_id::task_id::fine_tune_id'",
+    ):
+        prompt_builder_from_id("fine_tune_prompt::123", task)
 
-
-
-
-
-
-
-
-
+    with pytest.raises(
+        ValueError,
+        match="Fine-tune ID not found",
+    ):
+        prompt_builder_from_id("fine_tune_prompt::123::456::789", task)
+
+    prompt = Prompt(
+        name="test_prompt_name",
+        prompt="test_prompt",
+        chain_of_thought_instructions="coti",
+        parent=task,
     )
-
-
-
+    prompt.save_to_file()
+    pb = prompt_builder_from_id("id::" + prompt.id, task)
+    assert isinstance(pb, SavedPromptBuilder)
+    assert pb.prompt_id() == prompt.id
+    assert pb.build_prompt(include_json_instructions=False) == "test_prompt"
+    assert pb.chain_of_thought_prompt() == "coti"
+
+    finetune = Finetune(
+        name="test_finetune_name",
+        system_message="test_system_message",
+        thinking_instructions="test_thinking_instructions",
+        parent=task,
+        base_model_id="test_base_model_id",
+        dataset_split_id="asdf",
+        provider="test_provider",
+        data_strategy=FinetuneDataStrategy.final_and_intermediate,
    )
-
-
-
+    finetune.save_to_file()
+    nested_fine_tune_id = (
+        task_with_examples.parent.id + "::" + task_with_examples.id + "::" + finetune.id
     )
-
-
-
+    pb = prompt_builder_from_id(
+        "fine_tune_prompt::" + nested_fine_tune_id,
+        task_with_examples,
+    )
+    assert isinstance(pb, FineTunePromptBuilder)
+    assert pb.prompt_id() == nested_fine_tune_id
+    assert pb.build_base_prompt() == "test_system_message"
+    assert pb.chain_of_thought_prompt() == "test_thinking_instructions"
 
 
 def test_example_count():
@@ -335,7 +407,7 @@ def test_example_count():
 def test_repair_multi_shot_prompt_builder(task_with_examples):
     # Verify the order of examples
     prompt_builder = RepairsPromptBuilder(task=task_with_examples)
-    prompt = prompt_builder.build_prompt()
+    prompt = prompt_builder.build_prompt(include_json_instructions=False)
     assert (
         'Repaired Output Which is Sufficient: {"joke": "Why did the cow cross the road? To get to the udder side!"}'
         in prompt
@@ -403,7 +475,7 @@ def test_build_prompt_for_ui(tmp_path):
     ui_prompt = simple_builder.build_prompt_for_ui()
 
     # Should match regular prompt since no chain of thought
-    assert ui_prompt == simple_builder.build_prompt()
+    assert ui_prompt == simple_builder.build_prompt(include_json_instructions=False)
     assert "# Thinking Instructions" not in ui_prompt
 
     # Test chain of thought prompt builder
@@ -411,7 +483,7 @@ def test_build_prompt_for_ui(tmp_path):
     ui_prompt_cot = cot_builder.build_prompt_for_ui()
 
     # Should include both base prompt and thinking instructions
-    assert cot_builder.build_prompt() in ui_prompt_cot
+    assert cot_builder.build_prompt(include_json_instructions=False) in ui_prompt_cot
     assert "# Thinking Instructions" in ui_prompt_cot
     assert "Think step by step" in ui_prompt_cot
 
@@ -423,6 +495,155 @@ def test_build_prompt_for_ui(tmp_path):
     custom_cot_builder = SimpleChainOfThoughtPromptBuilder(task=task_with_custom)
     ui_prompt_custom = custom_cot_builder.build_prompt_for_ui()
 
-    assert
+    assert (
+        custom_cot_builder.build_prompt(include_json_instructions=False)
+        in ui_prompt_custom
+    )
     assert "# Thinking Instructions" in ui_prompt_custom
     assert custom_instruction in ui_prompt_custom
+
+
+def test_saved_prompt_builder(tmp_path):
+    task = build_test_task(tmp_path)
+
+    prompt = Prompt(
+        name="test_prompt_name",
+        prompt="test_prompt",
+        parent=task,
+    )
+    prompt.save_to_file()
+
+    builder = SavedPromptBuilder(task=task, prompt_id=prompt.id)
+    assert builder.build_prompt(include_json_instructions=False) == "test_prompt"
+    assert builder.chain_of_thought_prompt() is None
+    assert builder.build_prompt_for_ui() == "test_prompt"
+    assert builder.prompt_id() == prompt.id
+
+
+def test_saved_prompt_builder_with_chain_of_thought(tmp_path):
+    task = build_test_task(tmp_path)
+
+    prompt = Prompt(
+        name="test_prompt_name",
+        prompt="test_prompt",
+        chain_of_thought_instructions="Think step by step",
+        parent=task,
+    )
+    prompt.save_to_file()
+
+    builder = SavedPromptBuilder(task=task, prompt_id=prompt.id)
+    assert builder.build_prompt(include_json_instructions=False) == "test_prompt"
+    assert builder.chain_of_thought_prompt() == "Think step by step"
+    assert "Think step by step" in builder.build_prompt_for_ui()
+    assert builder.prompt_id() == prompt.id
+
+
+def test_saved_prompt_builder_not_found(tmp_path):
+    task = build_test_task(tmp_path)
+
+    with pytest.raises(ValueError, match="Prompt ID not found: 123"):
+        SavedPromptBuilder(task=task, prompt_id="123")
+
+
+def test_build_prompt_with_json_instructions(tmp_path):
+    task = build_test_task(tmp_path)
+    task = task.model_copy(
+        update={
+            "output_json_schema": json.dumps(
+                {
+                    "type": "object",
+                    "properties": {"result": {"type": "string"}},
+                    "required": ["result"],
+                }
+            )
+        }
+    )
+
+    builder = SimplePromptBuilder(task=task)
+
+    # Test without JSON instructions
+    prompt_without_json = builder.build_prompt(include_json_instructions=False)
+    assert "Format Instructions" not in prompt_without_json
+    assert (
+        "Return a JSON object conforming to the following schema:"
+        not in prompt_without_json
+    )
+    assert task.output_json_schema not in prompt_without_json
+
+    # Test with JSON instructions
+    prompt_with_json = builder.build_prompt(include_json_instructions=True)
+    assert "# Format Instructions" in prompt_with_json
+    assert (
+        "Return a JSON object conforming to the following schema:" in prompt_with_json
+    )
+    assert "```" in prompt_with_json
+    assert (
+        "{'type': 'object', 'properties': {'result': {'type': 'string'}}, 'required': ['result']}"
+        in prompt_with_json
+    )
+
+    # Verify base prompt is still included
+    assert task.instruction in prompt_with_json
+    for requirement in task.requirements:
+        assert requirement.instruction in prompt_with_json
+
+
+def test_task_run_config_prompt_builder(tmp_path):
+    task = build_test_task(tmp_path)
+
+    run_config = TaskRunConfig(
+        name="test_run_config",
+        parent=task,
+        run_config_properties=RunConfigProperties(
+            model_name="gpt-4",
+            model_provider_name="openai",
+            prompt_id="simple_prompt_builder",
+        ),
+        prompt=Prompt(
+            name="test prompt name",
+            prompt="test prompt content",
+            chain_of_thought_instructions="test step by step",
+        ),
+    )
+    run_config.save_to_file()
+
+    # Construct the eval prompt ID
+    run_config_prompt_id = (
+        f"task_run_config::{task.parent.id}::{task.id}::{run_config.id}"
+    )
+
+    # Test successful creation 2 ways: constructor and ID creation
+    builders = [
+        TaskRunConfigPromptBuilder(
+            task=task, run_config_prompt_id=run_config_prompt_id
+        ),
+        prompt_builder_from_id(run_config_prompt_id, task),
+    ]
+
+    for builder in builders:
+        assert (
+            builder.build_prompt(include_json_instructions=False)
+            == "test prompt content"
+        )
+        assert builder.chain_of_thought_prompt() == "test step by step"
+        assert builder.prompt_id() == run_config_prompt_id
+
+
+def test_task_run_config_prompt_builder_validation_errors(tmp_path):
+    task = build_test_task(tmp_path)
+
+    # Test invalid format
+    with pytest.raises(ValueError, match="Invalid task run config prompt ID"):
+        TaskRunConfigPromptBuilder(
+            task=task, run_config_prompt_id="task_run_config::wrong::format"
+        )
+
+    # Test task ID mismatch
+    wrong_task_id = f"task_run_config::{task.parent.id}::wrong_task_id::config_id"
+    with pytest.raises(ValueError, match="Task ID mismatch"):
+        TaskRunConfigPromptBuilder(task=task, run_config_prompt_id=wrong_task_id)
+
+    # Test eval not found
+    nonexistent_eval = f"task_run_config::{task.parent.id}::{task.id}::nonexistent_id"
+    with pytest.raises(ValueError, match="Task run config ID not found"):
+        TaskRunConfigPromptBuilder(task=task, run_config_prompt_id=nonexistent_eval)