kiln-ai 0.19.0__py3-none-any.whl → 0.20.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of kiln-ai might be problematic.

Files changed (70)
  1. kiln_ai/adapters/__init__.py +2 -2
  2. kiln_ai/adapters/adapter_registry.py +19 -1
  3. kiln_ai/adapters/chat/chat_formatter.py +8 -12
  4. kiln_ai/adapters/chat/test_chat_formatter.py +6 -2
  5. kiln_ai/adapters/docker_model_runner_tools.py +119 -0
  6. kiln_ai/adapters/eval/base_eval.py +2 -2
  7. kiln_ai/adapters/eval/eval_runner.py +3 -1
  8. kiln_ai/adapters/eval/g_eval.py +2 -2
  9. kiln_ai/adapters/eval/test_base_eval.py +1 -1
  10. kiln_ai/adapters/eval/test_g_eval.py +3 -4
  11. kiln_ai/adapters/fine_tune/__init__.py +1 -1
  12. kiln_ai/adapters/fine_tune/openai_finetune.py +14 -4
  13. kiln_ai/adapters/fine_tune/test_openai_finetune.py +108 -111
  14. kiln_ai/adapters/ml_model_list.py +380 -34
  15. kiln_ai/adapters/model_adapters/base_adapter.py +51 -21
  16. kiln_ai/adapters/model_adapters/litellm_adapter.py +383 -79
  17. kiln_ai/adapters/model_adapters/test_base_adapter.py +193 -17
  18. kiln_ai/adapters/model_adapters/test_litellm_adapter.py +406 -1
  19. kiln_ai/adapters/model_adapters/test_litellm_adapter_tools.py +1103 -0
  20. kiln_ai/adapters/model_adapters/test_saving_adapter_results.py +5 -5
  21. kiln_ai/adapters/model_adapters/test_structured_output.py +110 -4
  22. kiln_ai/adapters/parsers/__init__.py +1 -1
  23. kiln_ai/adapters/provider_tools.py +15 -1
  24. kiln_ai/adapters/repair/test_repair_task.py +12 -9
  25. kiln_ai/adapters/run_output.py +3 -0
  26. kiln_ai/adapters/test_adapter_registry.py +80 -1
  27. kiln_ai/adapters/test_docker_model_runner_tools.py +305 -0
  28. kiln_ai/adapters/test_ml_model_list.py +39 -1
  29. kiln_ai/adapters/test_prompt_adaptors.py +13 -6
  30. kiln_ai/adapters/test_provider_tools.py +55 -0
  31. kiln_ai/adapters/test_remote_config.py +98 -0
  32. kiln_ai/datamodel/__init__.py +23 -21
  33. kiln_ai/datamodel/datamodel_enums.py +1 -0
  34. kiln_ai/datamodel/eval.py +1 -1
  35. kiln_ai/datamodel/external_tool_server.py +298 -0
  36. kiln_ai/datamodel/json_schema.py +25 -10
  37. kiln_ai/datamodel/project.py +8 -1
  38. kiln_ai/datamodel/registry.py +0 -15
  39. kiln_ai/datamodel/run_config.py +62 -0
  40. kiln_ai/datamodel/task.py +2 -77
  41. kiln_ai/datamodel/task_output.py +6 -1
  42. kiln_ai/datamodel/task_run.py +41 -0
  43. kiln_ai/datamodel/test_basemodel.py +3 -3
  44. kiln_ai/datamodel/test_example_models.py +175 -0
  45. kiln_ai/datamodel/test_external_tool_server.py +691 -0
  46. kiln_ai/datamodel/test_registry.py +8 -3
  47. kiln_ai/datamodel/test_task.py +15 -47
  48. kiln_ai/datamodel/test_tool_id.py +239 -0
  49. kiln_ai/datamodel/tool_id.py +83 -0
  50. kiln_ai/tools/__init__.py +8 -0
  51. kiln_ai/tools/base_tool.py +82 -0
  52. kiln_ai/tools/built_in_tools/__init__.py +13 -0
  53. kiln_ai/tools/built_in_tools/math_tools.py +124 -0
  54. kiln_ai/tools/built_in_tools/test_math_tools.py +204 -0
  55. kiln_ai/tools/mcp_server_tool.py +95 -0
  56. kiln_ai/tools/mcp_session_manager.py +243 -0
  57. kiln_ai/tools/test_base_tools.py +199 -0
  58. kiln_ai/tools/test_mcp_server_tool.py +457 -0
  59. kiln_ai/tools/test_mcp_session_manager.py +1585 -0
  60. kiln_ai/tools/test_tool_registry.py +473 -0
  61. kiln_ai/tools/tool_registry.py +64 -0
  62. kiln_ai/utils/config.py +22 -0
  63. kiln_ai/utils/open_ai_types.py +94 -0
  64. kiln_ai/utils/project_utils.py +17 -0
  65. kiln_ai/utils/test_config.py +138 -1
  66. kiln_ai/utils/test_open_ai_types.py +131 -0
  67. {kiln_ai-0.19.0.dist-info → kiln_ai-0.20.1.dist-info}/METADATA +6 -5
  68. {kiln_ai-0.19.0.dist-info → kiln_ai-0.20.1.dist-info}/RECORD +70 -47
  69. {kiln_ai-0.19.0.dist-info → kiln_ai-0.20.1.dist-info}/WHEEL +0 -0
  70. {kiln_ai-0.19.0.dist-info → kiln_ai-0.20.1.dist-info}/licenses/LICENSE.txt +0 -0
@@ -3,6 +3,7 @@ from unittest.mock import Mock, patch
 
 import litellm
 import pytest
+from litellm.types.utils import ChoiceLogprobs
 
 from kiln_ai.adapters.ml_model_list import ModelProviderName, StructuredOutputMode
 from kiln_ai.adapters.model_adapters.base_adapter import AdapterConfig
@@ -10,6 +11,12 @@ from kiln_ai.adapters.model_adapters.litellm_adapter import LiteLlmAdapter
 from kiln_ai.adapters.model_adapters.litellm_config import LiteLlmConfig
 from kiln_ai.datamodel import Project, Task, Usage
 from kiln_ai.datamodel.task import RunConfigProperties
+from kiln_ai.tools.built_in_tools.math_tools import (
+    AddTool,
+    DivideTool,
+    MultiplyTool,
+    SubtractTool,
+)
 
 
 @pytest.fixture
@@ -59,7 +66,7 @@ def test_initialization(config, mock_task):
     )
 
     assert adapter.config == config
-    assert adapter.run_config.task == mock_task
+    assert adapter.task == mock_task
     assert adapter.run_config.prompt_id == "simple_prompt_builder"
     assert adapter.base_adapter_config.default_tags == ["test-tag"]
     assert adapter.run_config.model_name == config.run_config_properties.model_name
@@ -554,6 +561,404 @@ def test_usage_from_response(config, mock_task, litellm_usage, cost, expected_us
     response.get.assert_called_once_with("usage", None)
 
 
+@pytest.fixture
+def mock_math_tools():
+    """Create a list of 4 math tools for testing"""
+    return [AddTool(), SubtractTool(), MultiplyTool(), DivideTool()]
+
+
+async def test_litellm_tools_returns_openai_format_with_tools(
+    config, mock_task, mock_math_tools
+):
+    """Test litellm_tools returns OpenAI formatted tool list when available_tools has tools"""
+    adapter = LiteLlmAdapter(config=config, kiln_task=mock_task)
+
+    with patch.object(adapter, "available_tools", return_value=mock_math_tools):
+        tools = await adapter.litellm_tools()
+
+    # Should return 4 tools
+    assert len(tools) == 4
+
+    # Each tool should have the OpenAI format
+    for tool in tools:
+        assert "type" in tool
+        assert tool["type"] == "function"
+        assert "function" in tool
+        assert "name" in tool["function"]
+        assert "description" in tool["function"]
+        assert "parameters" in tool["function"]
+
+    # Verify specific tools are present
+    tool_names = [tool["function"]["name"] for tool in tools]
+    assert "add" in tool_names
+    assert "subtract" in tool_names
+    assert "multiply" in tool_names
+    assert "divide" in tool_names
+
+
+async def test_litellm_tools_returns_empty_list_without_tools(config, mock_task):
+    """Test litellm_tools returns empty list when available_tools has no tools"""
+    adapter = LiteLlmAdapter(config=config, kiln_task=mock_task)
+
+    with patch.object(adapter, "available_tools", return_value=[]):
+        tools = await adapter.litellm_tools()
+
+    assert tools == []
+
+
+@pytest.mark.asyncio
+async def test_build_completion_kwargs_includes_tools(
+    config, mock_task, mock_math_tools
+):
+    """Test build_completion_kwargs includes tools when available_tools has tools"""
+    adapter = LiteLlmAdapter(config=config, kiln_task=mock_task)
+    mock_provider = Mock()
+    messages = [{"role": "user", "content": "Hello"}]
+
+    with (
+        patch.object(adapter, "model_provider", return_value=mock_provider),
+        patch.object(adapter, "litellm_model_id", return_value="openai/test-model"),
+        patch.object(adapter, "build_extra_body", return_value={}),
+        patch.object(adapter, "response_format_options", return_value={}),
+        patch.object(adapter, "available_tools", return_value=mock_math_tools),
+    ):
+        kwargs = await adapter.build_completion_kwargs(mock_provider, messages, None)
+
+    # Should include tools
+    assert "tools" in kwargs
+    assert len(kwargs["tools"]) == 4
+    assert "tool_choice" in kwargs
+    assert kwargs["tool_choice"] == "auto"
+
+    # Verify tools are properly formatted
+    for tool in kwargs["tools"]:
+        assert "type" in tool
+        assert tool["type"] == "function"
+        assert "function" in tool
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "structured_output_mode, expected_error_message",
+    [
+        (
+            StructuredOutputMode.function_calling,
+            "Function calling/tools can't be used as the JSON response format if you're also using tools",
+        ),
+        (
+            StructuredOutputMode.function_calling_weak,
+            "Function calling/tools can't be used as the JSON response format if you're also using tools",
+        ),
+        (
+            StructuredOutputMode.json_instructions,
+            None,
+        ),
+        (
+            StructuredOutputMode.json_schema,
+            None,
+        ),
+    ],
+)
+async def test_build_completion_kwargs_raises_error_with_tools_conflict(
+    config, mock_task, mock_math_tools, structured_output_mode, expected_error_message
+):
+    """Test build_completion_kwargs raises error when structured output mode conflicts with available tools"""
+    config.run_config_properties.structured_output_mode = structured_output_mode
+    adapter = LiteLlmAdapter(config=config, kiln_task=mock_task)
+    mock_provider = Mock()
+    messages = [{"role": "user", "content": "Hello"}]
+
+    with (
+        patch.object(adapter, "model_provider", return_value=mock_provider),
+        patch.object(adapter, "litellm_model_id", return_value="openai/test-model"),
+        patch.object(adapter, "build_extra_body", return_value={}),
+        patch.object(adapter, "available_tools", return_value=mock_math_tools),
+    ):
+        if expected_error_message is not None:
+            with pytest.raises(
+                ValueError,
+                match=expected_error_message,
+            ):
+                await adapter.build_completion_kwargs(mock_provider, messages, None)
+        else:
+            # should not raise an error
+            await adapter.build_completion_kwargs(mock_provider, messages, None)
+
+
+class TestExtractAndValidateLogprobs:
+    """Test cases for the _extract_and_validate_logprobs helper method"""
+
+    @pytest.fixture
+    def adapter_with_logprobs_required(self, config, mock_task):
+        """Create an adapter with logprobs required"""
+        base_config = AdapterConfig(top_logprobs=5)
+        return LiteLlmAdapter(
+            config=config, kiln_task=mock_task, base_adapter_config=base_config
+        )
+
+    @pytest.fixture
+    def adapter_without_logprobs_required(self, config, mock_task):
+        """Create an adapter without logprobs required"""
+        base_config = AdapterConfig(top_logprobs=None)
+        return LiteLlmAdapter(
+            config=config, kiln_task=mock_task, base_adapter_config=base_config
+        )
+
+    def test_extract_logprobs_with_valid_logprobs(
+        self, adapter_without_logprobs_required
+    ):
+        """Test extracting logprobs when final_choice has valid logprobs"""
+        # Create a mock final_choice with valid logprobs
+        mock_choice = Mock()
+        mock_logprobs = Mock(spec=ChoiceLogprobs)
+        mock_choice.logprobs = mock_logprobs
+
+        result = adapter_without_logprobs_required._extract_and_validate_logprobs(
+            mock_choice
+        )
+
+        assert result == mock_logprobs
+
+    def test_extract_logprobs_with_none_choice(self, adapter_without_logprobs_required):
+        """Test extracting logprobs when final_choice is None"""
+        result = adapter_without_logprobs_required._extract_and_validate_logprobs(None)
+
+        assert result is None
+
+    def test_extract_logprobs_without_logprobs_attribute(
+        self, adapter_without_logprobs_required
+    ):
+        """Test extracting logprobs when final_choice has no logprobs attribute"""
+        mock_choice = Mock()
+        # Don't add logprobs attribute
+
+        result = adapter_without_logprobs_required._extract_and_validate_logprobs(
+            mock_choice
+        )
+
+        assert result is None
+
+    def test_extract_logprobs_with_non_choicelogprobs_type(
+        self, adapter_without_logprobs_required
+    ):
+        """Test extracting logprobs when logprobs is not a ChoiceLogprobs instance"""
+        mock_choice = Mock()
+        mock_choice.logprobs = {"not": "a ChoiceLogprobs object"}
+
+        result = adapter_without_logprobs_required._extract_and_validate_logprobs(
+            mock_choice
+        )
+
+        assert result is None
+
+    def test_extract_logprobs_with_none_logprobs(
+        self, adapter_without_logprobs_required
+    ):
+        """Test extracting logprobs when logprobs attribute is None"""
+        mock_choice = Mock()
+        mock_choice.logprobs = None
+
+        result = adapter_without_logprobs_required._extract_and_validate_logprobs(
+            mock_choice
+        )
+
+        assert result is None
+
+    def test_validate_logprobs_required_but_missing_raises_error(
+        self, adapter_with_logprobs_required
+    ):
+        """Test that missing logprobs raises error when required"""
+        mock_choice = Mock()
+        # Don't add logprobs or make it None
+        mock_choice.logprobs = None
+
+        with pytest.raises(
+            RuntimeError, match="Logprobs were required, but no logprobs were returned"
+        ):
+            adapter_with_logprobs_required._extract_and_validate_logprobs(mock_choice)
+
+    def test_validate_logprobs_required_but_none_choice_raises_error(
+        self, adapter_with_logprobs_required
+    ):
+        """Test that None choice raises error when logprobs are required"""
+        with pytest.raises(
+            RuntimeError, match="Logprobs were required, but no logprobs were returned"
+        ):
+            adapter_with_logprobs_required._extract_and_validate_logprobs(None)
+
+    def test_validate_logprobs_required_but_wrong_type_raises_error(
+        self, adapter_with_logprobs_required
+    ):
+        """Test that wrong logprobs type raises error when required"""
+        mock_choice = Mock()
+        mock_choice.logprobs = {"not": "a ChoiceLogprobs object"}
+
+        with pytest.raises(
+            RuntimeError, match="Logprobs were required, but no logprobs were returned"
+        ):
+            adapter_with_logprobs_required._extract_and_validate_logprobs(mock_choice)
+
+    def test_validate_logprobs_required_and_present_succeeds(
+        self, adapter_with_logprobs_required
+    ):
+        """Test that valid logprobs are returned when required and present"""
+        mock_choice = Mock()
+        mock_logprobs = Mock(spec=ChoiceLogprobs)
+        mock_choice.logprobs = mock_logprobs
+
+        result = adapter_with_logprobs_required._extract_and_validate_logprobs(
+            mock_choice
+        )
+
+        assert result == mock_logprobs
+
+    def test_validate_logprobs_not_required_missing_ok(
+        self, adapter_without_logprobs_required
+    ):
+        """Test that missing logprobs is OK when not required"""
+        mock_choice = Mock()
+        mock_choice.logprobs = None
+
+        result = adapter_without_logprobs_required._extract_and_validate_logprobs(
+            mock_choice
+        )
+
+        assert result is None
+
+    @pytest.mark.parametrize("top_logprobs_value", [0, 1, 5, 10])
+    def test_validate_logprobs_various_top_logprobs_values(
+        self, config, mock_task, top_logprobs_value
+    ):
+        """Test validation with various top_logprobs values"""
+        base_config = AdapterConfig(top_logprobs=top_logprobs_value)
+        adapter = LiteLlmAdapter(
+            config=config, kiln_task=mock_task, base_adapter_config=base_config
+        )
+
+        mock_choice = Mock()
+        mock_choice.logprobs = None
+
+        with pytest.raises(
+            RuntimeError, match="Logprobs were required, but no logprobs were returned"
+        ):
+            adapter._extract_and_validate_logprobs(mock_choice)
+
+
+class TestExtractReasoningToIntermediateOutputs:
+    def test_extract_reasoning_with_valid_content(self, config, mock_task):
+        """Test extracting reasoning content when present and valid"""
+        adapter = LiteLlmAdapter(config=config, kiln_task=mock_task)
+
+        # Create mock choice with reasoning content
+        mock_choice = Mock()
+        mock_message = Mock()
+        mock_message.reasoning_content = "This is my reasoning"
+        mock_choice.message = mock_message
+
+        intermediate_outputs = {}
+
+        adapter._extract_reasoning_to_intermediate_outputs(
+            mock_choice, intermediate_outputs
+        )
+
+        assert intermediate_outputs["reasoning"] == "This is my reasoning"
+
+    def test_extract_reasoning_with_whitespace_content(self, config, mock_task):
+        """Test extracting reasoning content with whitespace that gets stripped"""
+        adapter = LiteLlmAdapter(config=config, kiln_task=mock_task)
+
+        mock_choice = Mock()
+        mock_message = Mock()
+        mock_message.reasoning_content = (
+            " \n This is my reasoning with whitespace \n "
+        )
+        mock_choice.message = mock_message
+
+        intermediate_outputs = {}
+
+        adapter._extract_reasoning_to_intermediate_outputs(
+            mock_choice, intermediate_outputs
+        )
+
+        assert (
+            intermediate_outputs["reasoning"] == "This is my reasoning with whitespace"
+        )
+
+    def test_extract_reasoning_with_empty_content(self, config, mock_task):
+        """Test that empty reasoning content is not added to intermediate outputs"""
+        adapter = LiteLlmAdapter(config=config, kiln_task=mock_task)
+
+        mock_choice = Mock()
+        mock_message = Mock()
+        mock_message.reasoning_content = " "  # Only whitespace
+        mock_choice.message = mock_message
+
+        intermediate_outputs = {}
+
+        adapter._extract_reasoning_to_intermediate_outputs(
+            mock_choice, intermediate_outputs
+        )
+
+        assert "reasoning" not in intermediate_outputs
+
+    def test_extract_reasoning_with_none_content(self, config, mock_task):
+        """Test that None reasoning content is not added to intermediate outputs"""
+        adapter = LiteLlmAdapter(config=config, kiln_task=mock_task)
+
+        mock_choice = Mock()
+        mock_message = Mock()
+        mock_message.reasoning_content = None
+        mock_choice.message = mock_message
+
+        intermediate_outputs = {}
+
+        adapter._extract_reasoning_to_intermediate_outputs(
+            mock_choice, intermediate_outputs
+        )
+
+        assert "reasoning" not in intermediate_outputs
+
+    def test_extract_reasoning_with_no_reasoning_attribute(self, config, mock_task):
+        """Test that missing reasoning_content attribute is handled gracefully"""
+        adapter = LiteLlmAdapter(config=config, kiln_task=mock_task)
+
+        mock_choice = Mock()
+        mock_message = Mock(spec=[])  # Empty spec, no attributes
+        mock_choice.message = mock_message
+
+        intermediate_outputs = {}
+
+        adapter._extract_reasoning_to_intermediate_outputs(
+            mock_choice, intermediate_outputs
+        )
+
+        assert "reasoning" not in intermediate_outputs
+
+    def test_extract_reasoning_with_no_message_attribute(self, config, mock_task):
+        """Test that missing message attribute is handled gracefully"""
+        adapter = LiteLlmAdapter(config=config, kiln_task=mock_task)
+
+        mock_choice = Mock(spec=[])  # Empty spec, no attributes
+
+        intermediate_outputs = {}
+
+        adapter._extract_reasoning_to_intermediate_outputs(
+            mock_choice, intermediate_outputs
+        )
+
+        assert "reasoning" not in intermediate_outputs
+
+    def test_extract_reasoning_with_none_choice(self, config, mock_task):
+        """Test that None choice is handled gracefully"""
+        adapter = LiteLlmAdapter(config=config, kiln_task=mock_task)
+
+        intermediate_outputs = {}
+
+        adapter._extract_reasoning_to_intermediate_outputs(None, intermediate_outputs)
+
+        assert "reasoning" not in intermediate_outputs
+
+
 @pytest.mark.parametrize(
     "enable_thinking",
     [