kiln-ai 0.8.1__py3-none-any.whl → 0.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of kiln-ai might be problematic.

Files changed (88)
  1. kiln_ai/adapters/__init__.py +7 -7
  2. kiln_ai/adapters/adapter_registry.py +81 -10
  3. kiln_ai/adapters/data_gen/data_gen_task.py +21 -3
  4. kiln_ai/adapters/data_gen/test_data_gen_task.py +23 -3
  5. kiln_ai/adapters/eval/base_eval.py +164 -0
  6. kiln_ai/adapters/eval/eval_runner.py +267 -0
  7. kiln_ai/adapters/eval/g_eval.py +367 -0
  8. kiln_ai/adapters/eval/registry.py +16 -0
  9. kiln_ai/adapters/eval/test_base_eval.py +324 -0
  10. kiln_ai/adapters/eval/test_eval_runner.py +640 -0
  11. kiln_ai/adapters/eval/test_g_eval.py +497 -0
  12. kiln_ai/adapters/eval/test_g_eval_data.py +4 -0
  13. kiln_ai/adapters/fine_tune/base_finetune.py +5 -1
  14. kiln_ai/adapters/fine_tune/dataset_formatter.py +310 -65
  15. kiln_ai/adapters/fine_tune/fireworks_finetune.py +47 -32
  16. kiln_ai/adapters/fine_tune/openai_finetune.py +12 -11
  17. kiln_ai/adapters/fine_tune/test_base_finetune.py +19 -0
  18. kiln_ai/adapters/fine_tune/test_dataset_formatter.py +472 -129
  19. kiln_ai/adapters/fine_tune/test_fireworks_finetune.py +114 -22
  20. kiln_ai/adapters/fine_tune/test_openai_finetune.py +125 -14
  21. kiln_ai/adapters/ml_model_list.py +434 -93
  22. kiln_ai/adapters/model_adapters/__init__.py +18 -0
  23. kiln_ai/adapters/model_adapters/base_adapter.py +250 -0
  24. kiln_ai/adapters/model_adapters/langchain_adapters.py +309 -0
  25. kiln_ai/adapters/model_adapters/openai_compatible_config.py +10 -0
  26. kiln_ai/adapters/model_adapters/openai_model_adapter.py +289 -0
  27. kiln_ai/adapters/model_adapters/test_base_adapter.py +199 -0
  28. kiln_ai/adapters/{test_langchain_adapter.py → model_adapters/test_langchain_adapter.py} +105 -97
  29. kiln_ai/adapters/model_adapters/test_openai_model_adapter.py +216 -0
  30. kiln_ai/adapters/{test_saving_adapter_results.py → model_adapters/test_saving_adapter_results.py} +80 -30
  31. kiln_ai/adapters/{test_structured_output.py → model_adapters/test_structured_output.py} +125 -46
  32. kiln_ai/adapters/ollama_tools.py +0 -1
  33. kiln_ai/adapters/parsers/__init__.py +10 -0
  34. kiln_ai/adapters/parsers/base_parser.py +12 -0
  35. kiln_ai/adapters/parsers/json_parser.py +37 -0
  36. kiln_ai/adapters/parsers/parser_registry.py +19 -0
  37. kiln_ai/adapters/parsers/r1_parser.py +69 -0
  38. kiln_ai/adapters/parsers/test_json_parser.py +81 -0
  39. kiln_ai/adapters/parsers/test_parser_registry.py +32 -0
  40. kiln_ai/adapters/parsers/test_r1_parser.py +144 -0
  41. kiln_ai/adapters/prompt_builders.py +193 -49
  42. kiln_ai/adapters/provider_tools.py +91 -36
  43. kiln_ai/adapters/repair/repair_task.py +18 -19
  44. kiln_ai/adapters/repair/test_repair_task.py +7 -7
  45. kiln_ai/adapters/run_output.py +11 -0
  46. kiln_ai/adapters/test_adapter_registry.py +177 -0
  47. kiln_ai/adapters/test_generate_docs.py +69 -0
  48. kiln_ai/adapters/test_ollama_tools.py +0 -1
  49. kiln_ai/adapters/test_prompt_adaptors.py +25 -18
  50. kiln_ai/adapters/test_prompt_builders.py +265 -44
  51. kiln_ai/adapters/test_provider_tools.py +268 -46
  52. kiln_ai/datamodel/__init__.py +51 -772
  53. kiln_ai/datamodel/basemodel.py +31 -11
  54. kiln_ai/datamodel/datamodel_enums.py +58 -0
  55. kiln_ai/datamodel/dataset_filters.py +114 -0
  56. kiln_ai/datamodel/dataset_split.py +170 -0
  57. kiln_ai/datamodel/eval.py +298 -0
  58. kiln_ai/datamodel/finetune.py +105 -0
  59. kiln_ai/datamodel/json_schema.py +14 -3
  60. kiln_ai/datamodel/model_cache.py +8 -3
  61. kiln_ai/datamodel/project.py +23 -0
  62. kiln_ai/datamodel/prompt.py +37 -0
  63. kiln_ai/datamodel/prompt_id.py +83 -0
  64. kiln_ai/datamodel/strict_mode.py +24 -0
  65. kiln_ai/datamodel/task.py +181 -0
  66. kiln_ai/datamodel/task_output.py +321 -0
  67. kiln_ai/datamodel/task_run.py +164 -0
  68. kiln_ai/datamodel/test_basemodel.py +80 -2
  69. kiln_ai/datamodel/test_dataset_filters.py +71 -0
  70. kiln_ai/datamodel/test_dataset_split.py +127 -6
  71. kiln_ai/datamodel/test_datasource.py +3 -2
  72. kiln_ai/datamodel/test_eval_model.py +635 -0
  73. kiln_ai/datamodel/test_example_models.py +34 -17
  74. kiln_ai/datamodel/test_json_schema.py +23 -0
  75. kiln_ai/datamodel/test_model_cache.py +24 -0
  76. kiln_ai/datamodel/test_model_perf.py +125 -0
  77. kiln_ai/datamodel/test_models.py +131 -2
  78. kiln_ai/datamodel/test_prompt_id.py +129 -0
  79. kiln_ai/datamodel/test_task.py +159 -0
  80. kiln_ai/utils/config.py +6 -1
  81. kiln_ai/utils/exhaustive_error.py +6 -0
  82. {kiln_ai-0.8.1.dist-info → kiln_ai-0.12.0.dist-info}/METADATA +45 -7
  83. kiln_ai-0.12.0.dist-info/RECORD +100 -0
  84. kiln_ai/adapters/base_adapter.py +0 -191
  85. kiln_ai/adapters/langchain_adapters.py +0 -256
  86. kiln_ai-0.8.1.dist-info/RECORD +0 -58
  87. {kiln_ai-0.8.1.dist-info → kiln_ai-0.12.0.dist-info}/WHEEL +0 -0
  88. {kiln_ai-0.8.1.dist-info → kiln_ai-0.12.0.dist-info}/licenses/LICENSE.txt +0 -0
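
Note the restructuring visible in the file list: base_adapter.py and langchain_adapters.py move out of kiln_ai/adapters/ into a new model_adapters/ package, and new parsers/, eval/, and split datamodel/ modules appear. As a rough, hedged sketch of what that means for imports (paths taken only from the diff hunks below, not a complete migration guide):

# Import paths observed in the 0.12.0 test diffs below.
# Old (0.8.1) location, removed in this release:
#   from kiln_ai.adapters.langchain_adapters import LangchainAdapter
# New (0.12.0) locations:
from kiln_ai.adapters.model_adapters.base_adapter import BaseAdapter
from kiln_ai.adapters.model_adapters.langchain_adapters import LangchainAdapter
from kiln_ai.adapters.adapter_registry import adapter_for_task
from kiln_ai.adapters.prompt_builders import prompt_builder_from_id
from kiln_ai.datamodel import PromptId
from kiln_ai.datamodel.task import RunConfigProperties, TaskRunConfig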
@@ -6,13 +6,14 @@ from langchain_core.language_models.fake_chat_models import FakeListChatModel
 
 import kiln_ai.datamodel as datamodel
 from kiln_ai.adapters.adapter_registry import adapter_for_task
-from kiln_ai.adapters.langchain_adapters import LangchainAdapter
 from kiln_ai.adapters.ml_model_list import built_in_models
+from kiln_ai.adapters.model_adapters.langchain_adapters import LangchainAdapter
 from kiln_ai.adapters.ollama_tools import ollama_online
 from kiln_ai.adapters.prompt_builders import (
     BasePromptBuilder,
     SimpleChainOfThoughtPromptBuilder,
 )
+from kiln_ai.datamodel import PromptId
 
 
 def get_all_models_and_providers():
@@ -108,7 +109,11 @@ async def test_amazon_bedrock(tmp_path):
 async def test_mock(tmp_path):
     task = build_test_task(tmp_path)
     mockChatModel = FakeListChatModel(responses=["mock response"])
-    adapter = LangchainAdapter(task, custom_model=mockChatModel)
+    adapter = LangchainAdapter(
+        task,
+        custom_model=mockChatModel,
+        provider="ollama",
+    )
     run = await adapter.invoke("You are a mock, send me the response!")
     assert "mock response" in run.output.output
 
@@ -116,7 +121,7 @@ async def test_mock(tmp_path):
 async def test_mock_returning_run(tmp_path):
     task = build_test_task(tmp_path)
     mockChatModel = FakeListChatModel(responses=["mock response"])
-    adapter = LangchainAdapter(task, custom_model=mockChatModel)
+    adapter = LangchainAdapter(task, custom_model=mockChatModel, provider="ollama")
     run = await adapter.invoke("You are a mock, send me the response!")
     assert run.output.output == "mock response"
     assert run is not None
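
The two hunks above show LangchainAdapter now being constructed with an explicit provider alongside custom_model; the next hunk shows the run's source properties recording that provider and a prompt_id in place of prompt_builder_name. A minimal sketch along the lines of the updated test (the helper name and the fake chat model are purely for illustration):

from langchain_core.language_models.fake_chat_models import FakeListChatModel

from kiln_ai.adapters.model_adapters.langchain_adapters import LangchainAdapter


async def invoke_with_custom_model(task):
    # A provider is now passed explicitly along with the custom model.
    adapter = LangchainAdapter(
        task,
        custom_model=FakeListChatModel(responses=["mock response"]),
        provider="ollama",
    )
    run = await adapter.invoke("You are a mock, send me the response!")
    return run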
@@ -127,8 +132,8 @@ async def test_mock_returning_run(tmp_path):
     assert run.output.source.properties == {
         "adapter_name": "kiln_langchain_adapter",
         "model_name": "custom.langchain:unknown_model",
-        "model_provider": "custom.langchain:FakeListChatModel",
-        "prompt_builder_name": "simple_prompt_builder",
+        "model_provider": "ollama",
+        "prompt_id": "simple_prompt_builder",
     }
 
 
@@ -145,8 +150,9 @@ async def test_all_models_providers_plaintext(tmp_path, model_name, provider_nam
 @pytest.mark.parametrize("model_name,provider_name", get_all_models_and_providers())
 async def test_cot_prompt_builder(tmp_path, model_name, provider_name):
     task = build_test_task(tmp_path)
-    pb = SimpleChainOfThoughtPromptBuilder(task)
-    await run_simple_task(task, model_name, provider_name, pb)
+    await run_simple_task(
+        task, model_name, provider_name, "simple_chain_of_thought_prompt_builder"
+    )
 
 
 def build_test_task(tmp_path: Path):
@@ -182,20 +188,20 @@ async def run_simple_test(
     tmp_path: Path,
     model_name: str,
     provider: str | None = None,
-    prompt_builder: BasePromptBuilder | None = None,
+    prompt_id: PromptId | None = None,
 ):
     task = build_test_task(tmp_path)
-    return await run_simple_task(task, model_name, provider, prompt_builder)
+    return await run_simple_task(task, model_name, provider, prompt_id)
 
 
 async def run_simple_task(
     task: datamodel.Task,
     model_name: str,
     provider: str,
-    prompt_builder: BasePromptBuilder | None = None,
+    prompt_id: PromptId | None = None,
 ) -> datamodel.TaskRun:
     adapter = adapter_for_task(
-        task, model_name=model_name, provider=provider, prompt_builder=prompt_builder
+        task, model_name=model_name, provider=provider, prompt_id=prompt_id
     )
 
     run = await adapter.invoke(
@@ -208,13 +214,14 @@ async def run_simple_task(
     )
     assert "64" in run.output.output
     source_props = run.output.source.properties
-    assert source_props["adapter_name"] == "kiln_langchain_adapter"
+    assert source_props["adapter_name"] in [
+        "kiln_langchain_adapter",
+        "kiln_openai_compatible_adapter",
+    ]
     assert source_props["model_name"] == model_name
    assert source_props["model_provider"] == provider
-    expected_prompt_builder_name = (
-        prompt_builder.__class__.prompt_builder_name()
-        if prompt_builder
-        else "simple_prompt_builder"
-    )
-    assert source_props["prompt_builder_name"] == expected_prompt_builder_name
+    if prompt_id is None:
+        assert source_props["prompt_id"] == "simple_prompt_builder"
+    else:
+        assert source_props["prompt_id"] == prompt_id
     return run
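
This file's changes replace the prompt_builder: BasePromptBuilder | None argument with a prompt_id: PromptId | None string throughout, including the call into adapter_for_task. A hedged sketch of the new call shape, using only names from the hunks above (the wrapper function is illustrative):

from kiln_ai.adapters.adapter_registry import adapter_for_task


async def run_with_prompt_id(task, model_name: str, provider: str):
    # Prompts are now selected by an ID string (a PromptId) rather than by
    # passing a BasePromptBuilder instance.
    adapter = adapter_for_task(
        task,
        model_name=model_name,
        provider=provider,
        prompt_id="simple_chain_of_thought_prompt_builder",
    )
    return await adapter.invoke("two plus two")

Per the assertion in the last hunk, leaving prompt_id as None falls back to recording "simple_prompt_builder" on the run's source properties.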
@@ -1,37 +1,49 @@
 import json
+import logging
 
 import pytest
 
-from kiln_ai.adapters.base_adapter import AdapterInfo, BaseAdapter
+from kiln_ai.adapters.model_adapters.base_adapter import BaseAdapter
+from kiln_ai.adapters.model_adapters.test_structured_output import (
+    build_structured_output_test_task,
+)
 from kiln_ai.adapters.prompt_builders import (
     FewShotChainOfThoughtPromptBuilder,
     FewShotPromptBuilder,
+    FineTunePromptBuilder,
     MultiShotChainOfThoughtPromptBuilder,
     MultiShotPromptBuilder,
     RepairsPromptBuilder,
+    SavedPromptBuilder,
     SimpleChainOfThoughtPromptBuilder,
     SimplePromptBuilder,
+    TaskRunConfigPromptBuilder,
     chain_of_thought_prompt,
-    prompt_builder_from_ui_name,
+    prompt_builder_from_id,
 )
 from kiln_ai.adapters.test_prompt_adaptors import build_test_task
-from kiln_ai.adapters.test_structured_output import build_structured_output_test_task
 from kiln_ai.datamodel import (
     DataSource,
     DataSourceType,
+    Finetune,
+    FinetuneDataStrategy,
     Project,
+    Prompt,
     Task,
     TaskOutput,
     TaskOutputRating,
     TaskRun,
 )
+from kiln_ai.datamodel.task import RunConfigProperties, TaskRunConfig
+
+logger = logging.getLogger(__name__)
 
 
 def test_simple_prompt_builder(tmp_path):
     task = build_test_task(tmp_path)
     builder = SimplePromptBuilder(task=task)
     input = "two plus two"
-    prompt = builder.build_prompt()
+    prompt = builder.build_prompt(include_json_instructions=False)
     assert (
         "You are an assistant which performs math tasks provided in plain text."
         in prompt
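
A change repeated throughout this file: build_prompt() now takes an include_json_instructions flag. A small sketch of the difference, assuming task is a placeholder Task with an output_json_schema (it mirrors test_build_prompt_with_json_instructions further down):

from kiln_ai.adapters.prompt_builders import SimplePromptBuilder

builder = SimplePromptBuilder(task=task)  # `task` is a placeholder here

# Base prompt only, no schema section.
plain_prompt = builder.build_prompt(include_json_instructions=False)

# Appends a "# Format Instructions" section asking for a JSON object
# conforming to the task's output schema.
json_prompt = builder.build_prompt(include_json_instructions=True)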
@@ -50,19 +62,15 @@ class MockAdapter(BaseAdapter):
     def _run(self, input: str) -> str:
         return "mock response"
 
-    def adapter_info(self) -> AdapterInfo:
-        return AdapterInfo(
-            adapter_name="mock_adapter",
-            model_name="mock_model",
-            model_provider="mock_provider",
-        )
+    def adapter_name(self) -> str:
+        return "mock_adapter"
 
 
 def test_simple_prompt_builder_structured_output(tmp_path):
     task = build_structured_output_test_task(tmp_path)
     builder = SimplePromptBuilder(task=task)
     input = "Cows"
-    prompt = builder.build_prompt()
+    prompt = builder.build_prompt(include_json_instructions=False)
     assert "You are an assistant which tells a joke, given a subject." in prompt
 
     user_msg = builder.build_user_message(input)
@@ -70,6 +78,14 @@ def test_simple_prompt_builder_structured_output(tmp_path):
     assert input not in prompt
 
 
+def test_simple_prompt_builder_structured_input_non_ascii(tmp_path):
+    task = build_structured_output_test_task(tmp_path)
+    builder = SimplePromptBuilder(task=task)
+    input = {"key": "你好👋"}
+    user_msg = builder.build_user_message(input)
+    assert "你好👋" in user_msg
+
+
 @pytest.fixture
 def task_with_examples(tmp_path):
     # Create a project and task hierarchy
@@ -198,7 +214,7 @@ def task_with_examples(tmp_path):
 def test_multi_shot_prompt_builder(task_with_examples):
     # Verify the order of examples
     prompt_builder = MultiShotPromptBuilder(task=task_with_examples)
-    prompt = prompt_builder.build_prompt()
+    prompt = prompt_builder.build_prompt(include_json_instructions=False)
     assert "Why did the cow cross the road?" in prompt
     assert prompt.index("Why did the cow cross the road?") < prompt.index(
         "Why don't cats play poker in the jungle?"
@@ -239,14 +255,14 @@ def test_few_shot_prompt_builder(tmp_path):
     # Create 6 examples (2 repaired, 4 high-quality)
     for i in range(6):
         run = TaskRun(
-            input=f'{{"subject": "Subject {i+1}"}}',
+            input=f'{{"subject": "Subject {i + 1}"}}',
             input_source=DataSource(
                 type=DataSourceType.human,
                 properties={"created_by": "john_doe"},
             ),
             parent=task,
             output=TaskOutput(
-                output=f'{{"joke": "Joke Initial Output {i+1}"}}',
+                output=f'{{"joke": "Joke Initial Output {i + 1}"}}',
                 source=DataSource(
                     type=DataSourceType.human,
                     properties={"created_by": "john_doe"},
@@ -254,13 +270,12 @@
                 rating=TaskOutputRating(value=4 + (i % 2), reason="Good joke"),
             ),
        )
-        print("RATING", "Joke Initial Output ", i + 1, " - RATED:", 4 + (i % 2), "\n")
        if i < 2:
            run = run.model_copy(
                update={
                    "repair_instructions": "Fix the joke",
                    "repaired_output": TaskOutput(
-                        output=f'{{"joke": "Repaired Joke {i+1}"}}',
+                        output=f'{{"joke": "Repaired Joke {i + 1}"}}',
                        source=DataSource(
                            type=DataSourceType.human,
                            properties={"created_by": "jane_doe"},
@@ -272,10 +287,10 @@
 
     # Check that only 4 examples are included
     prompt_builder = FewShotPromptBuilder(task=task)
-    prompt = prompt_builder.build_prompt()
+    prompt = prompt_builder.build_prompt(include_json_instructions=False)
     assert prompt.count("## Example") == 4
 
-    print("PROMPT", prompt)
+    logger.info("PROMPT: %s", prompt)
     # Verify the order of examples (2 repaired, then 2 highest-rated)
     assert "Repaired Joke 1" in prompt
     assert "Repaired Joke 2" in prompt
@@ -289,7 +304,7 @@
 
 def check_example_outputs(task: Task, count: int):
     prompt_builder = MultiShotPromptBuilder(task=task)
-    prompt = prompt_builder.build_prompt()
+    prompt = prompt_builder.build_prompt(include_json_instructions=False)
     assert "# Instruction" in prompt
     assert task.instruction in prompt
     if count == 0:
@@ -299,32 +314,89 @@ def check_example_outputs(task: Task, count: int):
         assert f"## Example {count}" in prompt
 
 
-def test_prompt_builder_name():
-    assert SimplePromptBuilder.prompt_builder_name() == "simple_prompt_builder"
-    assert MultiShotPromptBuilder.prompt_builder_name() == "multi_shot_prompt_builder"
-    assert RepairsPromptBuilder.prompt_builder_name() == "repairs_prompt_builder"
+def test_prompt_builder_from_id(task_with_examples):
+    task = task_with_examples
+    assert isinstance(
+        prompt_builder_from_id("simple_prompt_builder", task), SimplePromptBuilder
+    )
+    assert isinstance(
+        prompt_builder_from_id("few_shot_prompt_builder", task),
+        FewShotPromptBuilder,
+    )
+    assert isinstance(
+        prompt_builder_from_id("multi_shot_prompt_builder", task),
+        MultiShotPromptBuilder,
+    )
+    assert isinstance(
+        prompt_builder_from_id("repairs_prompt_builder", task),
+        RepairsPromptBuilder,
+    )
+    assert isinstance(
+        prompt_builder_from_id("simple_chain_of_thought_prompt_builder", task),
+        SimpleChainOfThoughtPromptBuilder,
+    )
+    assert isinstance(
+        prompt_builder_from_id("few_shot_chain_of_thought_prompt_builder", task),
+        FewShotChainOfThoughtPromptBuilder,
+    )
+    assert isinstance(
+        prompt_builder_from_id("multi_shot_chain_of_thought_prompt_builder", task),
+        MultiShotChainOfThoughtPromptBuilder,
+    )
+
+    with pytest.raises(ValueError, match="Unknown prompt generator: invalid_name"):
+        prompt_builder_from_id("invalid_name", task)
+
+    with pytest.raises(ValueError, match="Prompt ID not found: 123"):
+        prompt_builder_from_id("id::123", task)
 
+    with pytest.raises(
+        ValueError,
+        match="Invalid fine-tune ID format. Expected 'project_id::task_id::fine_tune_id'",
+    ):
+        prompt_builder_from_id("fine_tune_prompt::123", task)
 
-def test_prompt_builder_from_ui_name():
-    assert prompt_builder_from_ui_name("basic") == SimplePromptBuilder
-    assert prompt_builder_from_ui_name("few_shot") == FewShotPromptBuilder
-    assert prompt_builder_from_ui_name("many_shot") == MultiShotPromptBuilder
-    assert prompt_builder_from_ui_name("repairs") == RepairsPromptBuilder
-    assert (
-        prompt_builder_from_ui_name("simple_chain_of_thought")
-        == SimpleChainOfThoughtPromptBuilder
+    with pytest.raises(
+        ValueError,
+        match="Fine-tune ID not found",
+    ):
+        prompt_builder_from_id("fine_tune_prompt::123::456::789", task)
+
+    prompt = Prompt(
+        name="test_prompt_name",
+        prompt="test_prompt",
+        chain_of_thought_instructions="coti",
+        parent=task,
     )
-    assert (
-        prompt_builder_from_ui_name("few_shot_chain_of_thought")
-        == FewShotChainOfThoughtPromptBuilder
+    prompt.save_to_file()
+    pb = prompt_builder_from_id("id::" + prompt.id, task)
+    assert isinstance(pb, SavedPromptBuilder)
+    assert pb.prompt_id() == prompt.id
+    assert pb.build_prompt(include_json_instructions=False) == "test_prompt"
+    assert pb.chain_of_thought_prompt() == "coti"
+
+    finetune = Finetune(
+        name="test_finetune_name",
+        system_message="test_system_message",
+        thinking_instructions="test_thinking_instructions",
+        parent=task,
+        base_model_id="test_base_model_id",
+        dataset_split_id="asdf",
+        provider="test_provider",
+        data_strategy=FinetuneDataStrategy.final_and_intermediate,
     )
-    assert (
-        prompt_builder_from_ui_name("multi_shot_chain_of_thought")
-        == MultiShotChainOfThoughtPromptBuilder
+    finetune.save_to_file()
+    nested_fine_tune_id = (
+        task_with_examples.parent.id + "::" + task_with_examples.id + "::" + finetune.id
     )
-
-    with pytest.raises(ValueError, match="Unknown prompt builder: invalid_name"):
-        prompt_builder_from_ui_name("invalid_name")
+    pb = prompt_builder_from_id(
+        "fine_tune_prompt::" + nested_fine_tune_id,
+        task_with_examples,
+    )
+    assert isinstance(pb, FineTunePromptBuilder)
+    assert pb.prompt_id() == nested_fine_tune_id
+    assert pb.build_base_prompt() == "test_system_message"
+    assert pb.chain_of_thought_prompt() == "test_thinking_instructions"
 
 
 def test_example_count():
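
The hunk above replaces prompt_builder_from_ui_name with prompt_builder_from_id, which accepts several ID formats, all exercised by the new test. A short sketch (everything other than the generator name and the ID prefixes is a placeholder):

from kiln_ai.adapters.prompt_builders import prompt_builder_from_id

# Built-in prompt generators are addressed by name:
pb = prompt_builder_from_id("few_shot_prompt_builder", task)

# Saved prompts use an "id::" prefix:
pb = prompt_builder_from_id("id::" + saved_prompt.id, task)

# Fine-tune prompts use "fine_tune_prompt::<project_id>::<task_id>::<fine_tune_id>":
pb = prompt_builder_from_id(
    "fine_tune_prompt::" + project.id + "::" + task.id + "::" + finetune.id,
    task,
)

# Unknown generator names raise ValueError("Unknown prompt generator: ...").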
@@ -335,7 +407,7 @@ def test_example_count():
 def test_repair_multi_shot_prompt_builder(task_with_examples):
     # Verify the order of examples
     prompt_builder = RepairsPromptBuilder(task=task_with_examples)
-    prompt = prompt_builder.build_prompt()
+    prompt = prompt_builder.build_prompt(include_json_instructions=False)
     assert (
         'Repaired Output Which is Sufficient: {"joke": "Why did the cow cross the road? To get to the udder side!"}'
         in prompt
@@ -403,7 +475,7 @@ def test_build_prompt_for_ui(tmp_path):
     ui_prompt = simple_builder.build_prompt_for_ui()
 
     # Should match regular prompt since no chain of thought
-    assert ui_prompt == simple_builder.build_prompt()
+    assert ui_prompt == simple_builder.build_prompt(include_json_instructions=False)
     assert "# Thinking Instructions" not in ui_prompt
 
     # Test chain of thought prompt builder
@@ -411,7 +483,7 @@
     ui_prompt_cot = cot_builder.build_prompt_for_ui()
 
     # Should include both base prompt and thinking instructions
-    assert cot_builder.build_prompt() in ui_prompt_cot
+    assert cot_builder.build_prompt(include_json_instructions=False) in ui_prompt_cot
     assert "# Thinking Instructions" in ui_prompt_cot
     assert "Think step by step" in ui_prompt_cot
 
@@ -423,6 +495,155 @@ def test_build_prompt_for_ui(tmp_path):
     custom_cot_builder = SimpleChainOfThoughtPromptBuilder(task=task_with_custom)
     ui_prompt_custom = custom_cot_builder.build_prompt_for_ui()
 
-    assert custom_cot_builder.build_prompt() in ui_prompt_custom
+    assert (
+        custom_cot_builder.build_prompt(include_json_instructions=False)
+        in ui_prompt_custom
+    )
     assert "# Thinking Instructions" in ui_prompt_custom
     assert custom_instruction in ui_prompt_custom
+
+
+def test_saved_prompt_builder(tmp_path):
+    task = build_test_task(tmp_path)
+
+    prompt = Prompt(
+        name="test_prompt_name",
+        prompt="test_prompt",
+        parent=task,
+    )
+    prompt.save_to_file()
+
+    builder = SavedPromptBuilder(task=task, prompt_id=prompt.id)
+    assert builder.build_prompt(include_json_instructions=False) == "test_prompt"
+    assert builder.chain_of_thought_prompt() is None
+    assert builder.build_prompt_for_ui() == "test_prompt"
+    assert builder.prompt_id() == prompt.id
+
+
+def test_saved_prompt_builder_with_chain_of_thought(tmp_path):
+    task = build_test_task(tmp_path)
+
+    prompt = Prompt(
+        name="test_prompt_name",
+        prompt="test_prompt",
+        chain_of_thought_instructions="Think step by step",
+        parent=task,
+    )
+    prompt.save_to_file()
+
+    builder = SavedPromptBuilder(task=task, prompt_id=prompt.id)
+    assert builder.build_prompt(include_json_instructions=False) == "test_prompt"
+    assert builder.chain_of_thought_prompt() == "Think step by step"
+    assert "Think step by step" in builder.build_prompt_for_ui()
+    assert builder.prompt_id() == prompt.id
+
+
+def test_saved_prompt_builder_not_found(tmp_path):
+    task = build_test_task(tmp_path)
+
+    with pytest.raises(ValueError, match="Prompt ID not found: 123"):
+        SavedPromptBuilder(task=task, prompt_id="123")
+
+
+def test_build_prompt_with_json_instructions(tmp_path):
+    task = build_test_task(tmp_path)
+    task = task.model_copy(
+        update={
+            "output_json_schema": json.dumps(
+                {
+                    "type": "object",
+                    "properties": {"result": {"type": "string"}},
+                    "required": ["result"],
+                }
+            )
+        }
+    )
+
+    builder = SimplePromptBuilder(task=task)
+
+    # Test without JSON instructions
+    prompt_without_json = builder.build_prompt(include_json_instructions=False)
+    assert "Format Instructions" not in prompt_without_json
+    assert (
+        "Return a JSON object conforming to the following schema:"
+        not in prompt_without_json
+    )
+    assert task.output_json_schema not in prompt_without_json
+
+    # Test with JSON instructions
+    prompt_with_json = builder.build_prompt(include_json_instructions=True)
+    assert "# Format Instructions" in prompt_with_json
+    assert (
+        "Return a JSON object conforming to the following schema:" in prompt_with_json
+    )
+    assert "```" in prompt_with_json
+    assert (
+        "{'type': 'object', 'properties': {'result': {'type': 'string'}}, 'required': ['result']}"
+        in prompt_with_json
+    )
+
+    # Verify base prompt is still included
+    assert task.instruction in prompt_with_json
+    for requirement in task.requirements:
+        assert requirement.instruction in prompt_with_json
+
+
+def test_task_run_config_prompt_builder(tmp_path):
+    task = build_test_task(tmp_path)
+
+    run_config = TaskRunConfig(
+        name="test_run_config",
+        parent=task,
+        run_config_properties=RunConfigProperties(
+            model_name="gpt-4",
+            model_provider_name="openai",
+            prompt_id="simple_prompt_builder",
+        ),
+        prompt=Prompt(
+            name="test prompt name",
+            prompt="test prompt content",
+            chain_of_thought_instructions="test step by step",
+        ),
+    )
+    run_config.save_to_file()
+
+    # Construct the eval prompt ID
+    run_config_prompt_id = (
+        f"task_run_config::{task.parent.id}::{task.id}::{run_config.id}"
+    )
+
+    # Test successful creation 2 ways: constructor and ID creation
+    builders = [
+        TaskRunConfigPromptBuilder(
+            task=task, run_config_prompt_id=run_config_prompt_id
+        ),
+        prompt_builder_from_id(run_config_prompt_id, task),
+    ]
+
+    for builder in builders:
+        assert (
+            builder.build_prompt(include_json_instructions=False)
+            == "test prompt content"
+        )
+        assert builder.chain_of_thought_prompt() == "test step by step"
+        assert builder.prompt_id() == run_config_prompt_id
+
+
+def test_task_run_config_prompt_builder_validation_errors(tmp_path):
+    task = build_test_task(tmp_path)
+
+    # Test invalid format
+    with pytest.raises(ValueError, match="Invalid task run config prompt ID"):
+        TaskRunConfigPromptBuilder(
+            task=task, run_config_prompt_id="task_run_config::wrong::format"
+        )
+
+    # Test task ID mismatch
+    wrong_task_id = f"task_run_config::{task.parent.id}::wrong_task_id::config_id"
+    with pytest.raises(ValueError, match="Task ID mismatch"):
+        TaskRunConfigPromptBuilder(task=task, run_config_prompt_id=wrong_task_id)
+
+    # Test eval not found
+    nonexistent_eval = f"task_run_config::{task.parent.id}::{task.id}::nonexistent_id"
+    with pytest.raises(ValueError, match="Task run config ID not found"):
+        TaskRunConfigPromptBuilder(task=task, run_config_prompt_id=nonexistent_eval)
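
The new TaskRunConfigPromptBuilder resolves the prompt frozen on a TaskRunConfig, addressed as task_run_config::<project_id>::<task_id>::<run_config_id>. A sketch closely following the test above (the model and provider values are just the ones the test uses; task is a placeholder saved Task):

from kiln_ai.adapters.prompt_builders import prompt_builder_from_id
from kiln_ai.datamodel import Prompt
from kiln_ai.datamodel.task import RunConfigProperties, TaskRunConfig

run_config = TaskRunConfig(
    name="test_run_config",
    parent=task,
    run_config_properties=RunConfigProperties(
        model_name="gpt-4",
        model_provider_name="openai",
        prompt_id="simple_prompt_builder",
    ),
    prompt=Prompt(name="test prompt name", prompt="test prompt content"),
)
run_config.save_to_file()

# The run config's prompt can then be resolved by its composite ID.
run_config_prompt_id = f"task_run_config::{task.parent.id}::{task.id}::{run_config.id}"
builder = prompt_builder_from_id(run_config_prompt_id, task)
assert builder.build_prompt(include_json_instructions=False) == "test prompt content"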