kiln-ai 0.14.0__py3-none-any.whl → 0.16.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
Files changed (58)
  1. kiln_ai/adapters/eval/base_eval.py +7 -2
  2. kiln_ai/adapters/eval/eval_runner.py +5 -64
  3. kiln_ai/adapters/eval/g_eval.py +3 -3
  4. kiln_ai/adapters/fine_tune/base_finetune.py +6 -3
  5. kiln_ai/adapters/fine_tune/dataset_formatter.py +128 -38
  6. kiln_ai/adapters/fine_tune/finetune_registry.py +2 -0
  7. kiln_ai/adapters/fine_tune/fireworks_finetune.py +2 -1
  8. kiln_ai/adapters/fine_tune/test_base_finetune.py +7 -0
  9. kiln_ai/adapters/fine_tune/test_dataset_formatter.py +267 -10
  10. kiln_ai/adapters/fine_tune/test_fireworks_finetune.py +1 -1
  11. kiln_ai/adapters/fine_tune/test_vertex_finetune.py +586 -0
  12. kiln_ai/adapters/fine_tune/vertex_finetune.py +217 -0
  13. kiln_ai/adapters/ml_model_list.py +817 -62
  14. kiln_ai/adapters/model_adapters/base_adapter.py +33 -10
  15. kiln_ai/adapters/model_adapters/litellm_adapter.py +51 -12
  16. kiln_ai/adapters/model_adapters/test_base_adapter.py +74 -2
  17. kiln_ai/adapters/model_adapters/test_litellm_adapter.py +65 -1
  18. kiln_ai/adapters/model_adapters/test_saving_adapter_results.py +3 -2
  19. kiln_ai/adapters/model_adapters/test_structured_output.py +4 -6
  20. kiln_ai/adapters/parsers/base_parser.py +0 -3
  21. kiln_ai/adapters/parsers/parser_registry.py +5 -3
  22. kiln_ai/adapters/parsers/r1_parser.py +17 -2
  23. kiln_ai/adapters/parsers/request_formatters.py +40 -0
  24. kiln_ai/adapters/parsers/test_parser_registry.py +2 -2
  25. kiln_ai/adapters/parsers/test_r1_parser.py +44 -1
  26. kiln_ai/adapters/parsers/test_request_formatters.py +76 -0
  27. kiln_ai/adapters/prompt_builders.py +14 -1
  28. kiln_ai/adapters/provider_tools.py +25 -1
  29. kiln_ai/adapters/repair/test_repair_task.py +3 -2
  30. kiln_ai/adapters/test_prompt_builders.py +24 -3
  31. kiln_ai/adapters/test_provider_tools.py +86 -1
  32. kiln_ai/datamodel/__init__.py +2 -0
  33. kiln_ai/datamodel/datamodel_enums.py +14 -0
  34. kiln_ai/datamodel/dataset_filters.py +69 -1
  35. kiln_ai/datamodel/dataset_split.py +4 -0
  36. kiln_ai/datamodel/eval.py +8 -0
  37. kiln_ai/datamodel/finetune.py +1 -0
  38. kiln_ai/datamodel/json_schema.py +24 -7
  39. kiln_ai/datamodel/prompt_id.py +1 -0
  40. kiln_ai/datamodel/task_output.py +10 -6
  41. kiln_ai/datamodel/task_run.py +68 -12
  42. kiln_ai/datamodel/test_basemodel.py +3 -7
  43. kiln_ai/datamodel/test_dataset_filters.py +82 -0
  44. kiln_ai/datamodel/test_dataset_split.py +2 -0
  45. kiln_ai/datamodel/test_example_models.py +158 -3
  46. kiln_ai/datamodel/test_json_schema.py +22 -3
  47. kiln_ai/datamodel/test_model_perf.py +3 -2
  48. kiln_ai/datamodel/test_models.py +50 -2
  49. kiln_ai/utils/async_job_runner.py +106 -0
  50. kiln_ai/utils/dataset_import.py +80 -18
  51. kiln_ai/utils/test_async_job_runner.py +199 -0
  52. kiln_ai/utils/test_dataset_import.py +242 -10
  53. {kiln_ai-0.14.0.dist-info → kiln_ai-0.16.0.dist-info}/METADATA +3 -2
  54. kiln_ai-0.16.0.dist-info/RECORD +108 -0
  55. kiln_ai/adapters/test_generate_docs.py +0 -69
  56. kiln_ai-0.14.0.dist-info/RECORD +0 -103
  57. {kiln_ai-0.14.0.dist-info → kiln_ai-0.16.0.dist-info}/WHEEL +0 -0
  58. {kiln_ai-0.14.0.dist-info → kiln_ai-0.16.0.dist-info}/licenses/LICENSE.txt +0 -0
kiln_ai/adapters/fine_tune/test_dataset_formatter.py
@@ -1,5 +1,6 @@
 import json
 import logging
+import re
 import tempfile
 from pathlib import Path
 from unittest.mock import Mock
@@ -15,7 +16,8 @@ from kiln_ai.adapters.fine_tune.dataset_formatter import (
     generate_chat_message_toolcall,
     generate_huggingface_chat_template,
     generate_huggingface_chat_template_toolcall,
-    generate_vertex_gemini_1_5,
+    generate_vertex_gemini,
+    serialize_r1_style_message,
 )
 from kiln_ai.adapters.model_adapters.base_adapter import COT_FINAL_ANSWER_PROMPT
 from kiln_ai.datamodel import (
@@ -42,6 +44,7 @@ def mock_task():
         "input": '{"test": "input 你好"}',
         "repaired_output": None,
         "intermediate_outputs": {},
+        "thinking_training_data": Mock(return_value=None),
         "input_source": Mock(
             spec=DataSource,
             **{
@@ -83,6 +86,7 @@ def mock_task():
 def mock_intermediate_outputs(mock_task):
     for run in mock_task.runs():
         run.intermediate_outputs = {"reasoning": "thinking output"}
+        run.thinking_training_data.return_value = "thinking output"
     mock_task.thinking_instruction = "thinking instructions"
     return mock_task
 
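The fixtures above now stub `TaskRun.thinking_training_data`, a new accessor whose datamodel side lands in kiln_ai/datamodel/task_run.py (+68 -12 above). Judging from how the tests prime the mock — reasoning output when a "reasoning" intermediate output is present, chain-of-thought output otherwise — a plausible sketch of the accessor (an inference from these tests, not necessarily the shipped code):

def thinking_training_data(self) -> str | None:
    # Hypothetical sketch: prefer "reasoning" over "chain_of_thought", mirroring the
    # test comment "It should just use the reasoning output if both ... are present".
    outputs = self.intermediate_outputs or {}
    return outputs.get("reasoning") or outputs.get("chain_of_thought")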
@@ -138,6 +142,31 @@ def test_generate_chat_message_response_thinking():
     }
 
 
+def test_generate_chat_message_response_thinking_r1_style():
+    thinking_data = ModelTrainingData(
+        input="test input",
+        system_message="system message",
+        final_output="test output",
+        thinking="thinking output",
+        thinking_instructions=None,
+        thinking_final_answer_prompt=None,
+        thinking_r1_style=True,
+    )
+
+    result = generate_chat_message_response(thinking_data)
+
+    assert result == {
+        "messages": [
+            {"role": "system", "content": "system message"},
+            {"role": "user", "content": "test input"},
+            {
+                "role": "assistant",
+                "content": "<think>\nthinking output\n</think>\n\ntest output",
+            },
+        ]
+    }
+
+
 def test_generate_chat_message_toolcall():
     training_data = ModelTrainingData(
         input="test input 你好",
@@ -206,6 +235,24 @@ def test_generate_chat_message_toolcall_thinking():
     }
 
 
+def test_generate_chat_message_toolcall_thinking_r1_style():
+    training_data = ModelTrainingData(
+        input="test input",
+        system_message="system message",
+        final_output='{"key": "value"}',
+        thinking="thinking output",
+        thinking_instructions=None,
+        thinking_final_answer_prompt=None,
+        thinking_r1_style=True,
+    )
+
+    with pytest.raises(
+        ValueError,
+        match="R1 style thinking is not supported for tool call downloads",
+    ):
+        generate_chat_message_toolcall(training_data)
+
+
 def test_generate_chat_message_toolcall_invalid_json():
     training_data = ModelTrainingData(
         input="test input",
@@ -368,6 +415,37 @@ def test_dataset_formatter_dump_with_intermediate_data(
             assert "thinking instructions" in line
 
 
+def test_dataset_formatter_dump_with_intermediate_data_r1_style(
+    mock_dataset, mock_intermediate_outputs
+):
+    formatter = DatasetFormatter(
+        mock_dataset,
+        "system message 你好",
+        thinking_instructions=None,
+    )
+
+    result_path = formatter.dump_to_file(
+        "train",
+        DatasetFormat.OPENAI_CHAT_JSONL,
+        data_strategy=FinetuneDataStrategy.final_and_intermediate_r1_compatible,
+    )
+
+    assert result_path.exists()
+    assert result_path.parent == Path(tempfile.gettempdir())
+    # Test our nice naming, with cot
+    assert (
+        result_path.name
+        == "test_dataset -- split-train -- format-openai_chat_jsonl -- cot.jsonl"
+    )
+    # Verify file contents
+    with open(result_path) as f:
+        lines = f.readlines()
+        assert len(lines) == 2
+        for line in lines:
+            assert "<think>" in line
+            assert "</think>" in line
+
+
 def test_dataset_formatter_dump_with_intermediate_data_custom_instructions(
     mock_dataset, mock_intermediate_outputs
 ):
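The new R1-style dump test also documents the updated `dump_to_file` call shape: the data strategy is now passed explicitly rather than implied by a boolean flag. A hedged usage sketch, reusing the names and imports from this test file (dataset construction elided, so treat the setup as assumed):

# Usage sketch based on the call shape asserted above.
formatter = DatasetFormatter(dataset, "system message", thinking_instructions=None)
path = formatter.dump_to_file(
    "train",
    DatasetFormat.OPENAI_CHAT_JSONL,
    data_strategy=FinetuneDataStrategy.final_and_intermediate_r1_compatible,
)
# Yields e.g. "<dataset> -- split-train -- format-openai_chat_jsonl -- cot.jsonl",
# with each assistant message wrapped in <think>...</think> tags.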
@@ -440,6 +518,31 @@ def test_generate_huggingface_chat_template_thinking():
     }
 
 
+def test_generate_huggingface_chat_template_thinking_r1_style():
+    training_data = ModelTrainingData(
+        input="test input",
+        system_message="system message",
+        final_output="test output",
+        thinking="thinking output",
+        thinking_instructions=None,
+        thinking_final_answer_prompt=None,
+        thinking_r1_style=True,
+    )
+
+    result = generate_huggingface_chat_template(training_data)
+
+    assert result == {
+        "conversations": [
+            {"role": "system", "content": "system message"},
+            {"role": "user", "content": "test input"},
+            {
+                "role": "assistant",
+                "content": "<think>\nthinking output\n</think>\n\ntest output",
+            },
+        ]
+    }
+
+
 def test_generate_vertex_template():
     training_data = ModelTrainingData(
         input="test input",
@@ -447,7 +550,7 @@ def test_generate_vertex_template():
         final_output="test output",
     )
 
-    result = generate_vertex_gemini_1_5(training_data)
+    result = generate_vertex_gemini(training_data)
 
     assert result == {
         "systemInstruction": {
@@ -475,9 +578,7 @@ def test_generate_vertex_template_thinking():
         thinking_final_answer_prompt="thinking final answer prompt",
     )
 
-    result = generate_vertex_gemini_1_5(training_data)
-
-    logger.info(result)
+    result = generate_vertex_gemini(training_data)
 
     assert result == {
         "systemInstruction": {
@@ -498,6 +599,23 @@ def test_generate_vertex_template_thinking():
     }
 
 
+def test_generate_vertex_template_thinking_r1_style():
+    training_data = ModelTrainingData(
+        input="test input",
+        system_message="system message",
+        final_output="test output",
+        thinking="thinking output",
+        thinking_instructions=None,
+        thinking_final_answer_prompt=None,
+        thinking_r1_style=True,
+    )
+
+    with pytest.raises(
+        ValueError, match="R1 style thinking is not supported for Vertex Gemini"
+    ):
+        generate_vertex_gemini(training_data)
+
+
 def test_generate_huggingface_chat_template_toolcall():
     training_data = ModelTrainingData(
         input="test input",
@@ -558,6 +676,24 @@ def test_generate_huggingface_chat_template_toolcall_thinking():
     assert tool_call["function"]["arguments"] == {"key": "value"}
 
 
+def test_generate_huggingface_chat_template_toolcall_thinking_r1_style():
+    training_data = ModelTrainingData(
+        input="test input",
+        system_message="system message",
+        final_output='{"key": "value"}',
+        thinking="thinking output",
+        thinking_instructions=None,
+        thinking_final_answer_prompt=None,
+        thinking_r1_style=True,
+    )
+
+    with pytest.raises(
+        ValueError,
+        match="R1 style thinking is not supported for tool call downloads",
+    ):
+        generate_huggingface_chat_template_toolcall(training_data)
+
+
 def test_generate_huggingface_chat_template_toolcall_invalid_json():
     training_data = ModelTrainingData(
         input="test input",
@@ -572,7 +708,11 @@ def test_generate_huggingface_chat_template_toolcall_invalid_json():
 def test_build_training_data(mock_task):
     # Non repaired should use original output
     mock_task_run = mock_task.runs()[0]
-    training_data_output = build_training_data(mock_task_run, "system message", False)
+    training_data_output = build_training_data(
+        mock_task_run,
+        "system message",
+        data_strategy=FinetuneDataStrategy.final_only,
+    )
     assert training_data_output.final_output == '{"test": "output 你好"}'
     assert training_data_output.thinking is None
     assert training_data_output.thinking_instructions is None
@@ -587,11 +727,12 @@ def test_build_training_data_with_COT(mock_task):
     mock_task_run = mock_task.runs()[0]
     assert mock_task_run.parent_task() == mock_task
     mock_task_run.intermediate_outputs = {"chain_of_thought": "cot output"}
+    mock_task_run.thinking_training_data.return_value = "cot output"
 
     training_data_output = build_training_data(
         mock_task_run,
         "system message",
-        True,
+        data_strategy=FinetuneDataStrategy.final_and_intermediate,
         thinking_instructions="thinking instructions",
     )
     assert training_data_output.final_output == '{"test": "output 你好"}'
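The pattern repeated across these hunks is the call-site migration for `build_training_data`: the positional boolean that used to toggle chain-of-thought output becomes an explicit `FinetuneDataStrategy`, which adds a third, R1-compatible option. Side by side, as the diffs show it (variable names assumed):

# 0.14.0: a positional boolean selected COT output
build_training_data(task_run, "system message", False)

# 0.16.0: an explicit strategy enum
build_training_data(task_run, "system message", data_strategy=FinetuneDataStrategy.final_only)
build_training_data(
    task_run,
    "system message",
    data_strategy=FinetuneDataStrategy.final_and_intermediate,
    thinking_instructions="thinking instructions",
)
build_training_data(
    task_run,
    "system message",
    data_strategy=FinetuneDataStrategy.final_and_intermediate_r1_compatible,
    thinking_instructions=None,  # R1-style data carries no separate instructions
)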
@@ -600,9 +741,59 @@ def test_build_training_data_with_COT(mock_task):
     assert training_data_output.thinking_final_answer_prompt == COT_FINAL_ANSWER_PROMPT
     assert training_data_output.input == '{"test": "input 你好"}'
     assert training_data_output.system_message == "system message"
+    assert training_data_output.thinking_r1_style == False
     assert training_data_output.supports_cot()
 
 
+def test_model_training_data_supports_cot(mock_task):
+    training_data = ModelTrainingData(
+        input="test input",
+        system_message="system message",
+        final_output="test output",
+        thinking="thinking output",
+        thinking_instructions="thinking instructions",
+        thinking_final_answer_prompt=COT_FINAL_ANSWER_PROMPT,
+        thinking_r1_style=False,
+    )
+    assert training_data.supports_cot() == True
+
+
+def test_model_training_data_supports_cot_r1_style(mock_task):
+    training_data = ModelTrainingData(
+        input="test input",
+        system_message="system message",
+        final_output="test output",
+        thinking="thinking output",
+        thinking_instructions="thinking instructions",
+        thinking_r1_style=True,
+    )
+
+    with pytest.raises(ValueError, match="R1 style does not support COT"):
+        training_data.supports_cot()
+
+
+def test_build_training_data_with_COT_r1_style(mock_task):
+    # Setup with needed fields for thinking
+    mock_task_run = mock_task.runs()[0]
+    assert mock_task_run.parent_task() == mock_task
+    mock_task_run.intermediate_outputs = {"chain_of_thought": "cot output"}
+    mock_task_run.thinking_training_data.return_value = "cot output"
+
+    training_data_output = build_training_data(
+        mock_task_run,
+        "system message",
+        data_strategy=FinetuneDataStrategy.final_and_intermediate_r1_compatible,
+        thinking_instructions=None,
+    )
+    assert training_data_output.final_output == '{"test": "output 你好"}'
+    assert training_data_output.thinking == "cot output"
+    assert training_data_output.thinking_instructions == None
+    assert training_data_output.thinking_final_answer_prompt == None
+    assert training_data_output.input == '{"test": "input 你好"}'
+    assert training_data_output.system_message == "system message"
+    assert training_data_output.thinking_r1_style == True
+
+
 def test_build_training_data_with_thinking(mock_task):
     # Setup with needed fields for thinking
     mock_task_run = mock_task.runs()[0]
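The two `supports_cot` tests in the hunk above pin down its contract: truthy for fully populated COT data, and it raises for R1-style data rather than returning False. A minimal sketch consistent with those tests — the exact field checks are an assumption, not confirmed by the diff:

def supports_cot(self) -> bool:
    # R1-style data refuses the COT path outright rather than reporting False.
    if self.thinking_r1_style:
        raise ValueError("R1 style does not support COT")
    # Assumed check: all three thinking fields must be populated.
    return (
        self.thinking is not None
        and self.thinking_instructions is not None
        and self.thinking_final_answer_prompt is not None
    )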
@@ -612,13 +803,14 @@ def test_build_training_data_with_thinking(mock_task):
         "reasoning": "thinking output",
         "chain_of_thought": "cot output",
     }
+    mock_task_run.thinking_training_data.return_value = "thinking output"
     mock_task.thinking_instruction = "thinking instructions"
     assert mock_task.thinking_instruction == "thinking instructions"
 
     training_data_output = build_training_data(
         mock_task_run,
         "system message",
-        True,
+        FinetuneDataStrategy.final_and_intermediate,
         thinking_instructions="thinking instructions",
     )
     assert training_data_output.final_output == '{"test": "output 你好"}'
@@ -627,7 +819,36 @@ def test_build_training_data_with_thinking(mock_task):
     assert training_data_output.thinking_final_answer_prompt == COT_FINAL_ANSWER_PROMPT
     assert training_data_output.input == '{"test": "input 你好"}'
     assert training_data_output.system_message == "system message"
-    assert training_data_output.supports_cot()
+    assert training_data_output.thinking_r1_style == False
+
+
+def test_build_training_data_with_thinking_r1_style(mock_task):
+    # Setup with needed fields for thinking
+    mock_task_run = mock_task.runs()[0]
+    assert mock_task_run.parent_task() == mock_task
+    # It should just use the reasoning output if both thinking and chain_of_thought are present
+    mock_task_run.intermediate_outputs = {
+        "reasoning": "thinking output",
+        "chain_of_thought": "cot output",
+    }
+    mock_task_run.thinking_training_data.return_value = "thinking output"
+    mock_task.thinking_instruction = "thinking instructions"
+
+    assert mock_task.thinking_instruction == "thinking instructions"
+
+    training_data_output = build_training_data(
+        mock_task_run,
+        "system message",
+        FinetuneDataStrategy.final_and_intermediate_r1_compatible,
+        thinking_instructions=None,
+    )
+    assert training_data_output.final_output == '{"test": "output 你好"}'
+    assert training_data_output.thinking == "thinking output"
+    assert training_data_output.thinking_instructions == None
+    assert training_data_output.thinking_final_answer_prompt == None
+    assert training_data_output.input == '{"test": "input 你好"}'
+    assert training_data_output.system_message == "system message"
+    assert training_data_output.thinking_r1_style == True
 
 
 def test_build_training_data_with_repaired_output(mock_task):
@@ -642,7 +863,11 @@ def test_build_training_data_with_repaired_output(mock_task):
         ),
     )
 
-    training_data_output = build_training_data(mock_task_run, "system message", False)
+    training_data_output = build_training_data(
+        mock_task_run,
+        "system message",
+        data_strategy=FinetuneDataStrategy.final_only,
+    )
     assert training_data_output.final_output == '{"test": "repaired output"}'
     assert training_data_output.thinking is None
     assert training_data_output.thinking_instructions is None
@@ -683,3 +908,35 @@ def test_dataset_formatter_dump_to_file_json_schema_format(mock_dataset, tmp_pat
     assert assistant_msg["content"] == '{"test": "output 你好"}'
     json_content = json.loads(assistant_msg["content"])
     assert json_content == {"test": "output 你好"}
+
+
+@pytest.mark.parametrize(
+    "thinking,final_output,expected_output",
+    [
+        ("thinking", "final output", "<think>\nthinking\n</think>\n\nfinal output"),
+        ("thinking", '{"name":"joe"}', '<think>\nthinking\n</think>\n\n{"name":"joe"}'),
+    ],
+)
+def test_serialize_r1_style_message(thinking, final_output, expected_output):
+    assert (
+        serialize_r1_style_message(thinking=thinking, final_output=final_output)
+        == expected_output
+    )
+
+
+@pytest.mark.parametrize(
+    "thinking,final_output",
+    [
+        (None, "final output"),
+        ("", "final output"),
+        (" ", "final output"),
+    ],
+)
+def test_serialize_r1_style_message_missing_thinking(thinking, final_output):
+    with pytest.raises(
+        ValueError,
+        match=re.escape(
+            "Thinking data is required when fine-tuning thinking models (R1, QwQ, etc). Please ensure your fine-tuning dataset contains reasoning or chain of thought output for every entry."
+        ),
+    ):
+        serialize_r1_style_message(thinking=thinking, final_output=final_output)
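The parametrized cases above fully specify the `serialize_r1_style_message` contract: wrap the thinking in `<think>` tags, separate it from the final output with a blank line, and reject missing or whitespace-only thinking. A re-implementation consistent with these tests (not necessarily the shipped code):

def serialize_r1_style_message(thinking: str | None, final_output: str) -> str:
    # Whitespace-only thinking counts as missing, per the (" ", ...) case above.
    if thinking is None or not thinking.strip():
        raise ValueError(
            "Thinking data is required when fine-tuning thinking models (R1, QwQ, etc). "
            "Please ensure your fine-tuning dataset contains reasoning or chain of "
            "thought output for every entry."
        )
    return f"<think>\n{thinking}\n</think>\n\n{final_output}"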
kiln_ai/adapters/fine_tune/test_fireworks_finetune.py
@@ -315,7 +315,7 @@ async def test_generate_and_upload_jsonl_success(
         "thinking_instructions": thinking_instructions,
     }
 
-    assert result == mock_dataset_id
+    assert result == "kiln-" + mock_dataset_id
     assert mock_client.post.call_count == 2
     assert mock_client.get.call_count == 1