kiln-ai 0.18.0__py3-none-any.whl → 0.19.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of kiln-ai might be problematic.
- kiln_ai/adapters/adapter_registry.py +28 -0
- kiln_ai/adapters/data_gen/data_gen_task.py +2 -2
- kiln_ai/adapters/data_gen/test_data_gen_task.py +7 -3
- kiln_ai/adapters/eval/test_eval_runner.py +6 -12
- kiln_ai/adapters/eval/test_g_eval_data.py +1 -1
- kiln_ai/adapters/fine_tune/base_finetune.py +1 -0
- kiln_ai/adapters/fine_tune/fireworks_finetune.py +32 -20
- kiln_ai/adapters/fine_tune/test_fireworks_finetune.py +30 -21
- kiln_ai/adapters/ml_model_list.py +635 -83
- kiln_ai/adapters/model_adapters/base_adapter.py +11 -7
- kiln_ai/adapters/model_adapters/litellm_adapter.py +14 -1
- kiln_ai/adapters/model_adapters/test_base_adapter.py +1 -1
- kiln_ai/adapters/model_adapters/test_litellm_adapter.py +22 -3
- kiln_ai/adapters/model_adapters/test_structured_output.py +10 -10
- kiln_ai/adapters/parsers/test_r1_parser.py +1 -1
- kiln_ai/adapters/provider_tools.py +20 -19
- kiln_ai/adapters/remote_config.py +57 -10
- kiln_ai/adapters/repair/repair_task.py +1 -1
- kiln_ai/adapters/test_adapter_registry.py +30 -2
- kiln_ai/adapters/test_ml_model_list.py +12 -0
- kiln_ai/adapters/test_provider_tools.py +18 -12
- kiln_ai/adapters/test_remote_config.py +372 -16
- kiln_ai/datamodel/basemodel.py +54 -28
- kiln_ai/datamodel/datamodel_enums.py +2 -0
- kiln_ai/datamodel/dataset_split.py +5 -3
- kiln_ai/datamodel/eval.py +3 -3
- kiln_ai/datamodel/finetune.py +2 -2
- kiln_ai/datamodel/project.py +3 -3
- kiln_ai/datamodel/prompt.py +2 -2
- kiln_ai/datamodel/prompt_id.py +4 -4
- kiln_ai/datamodel/task.py +6 -6
- kiln_ai/datamodel/task_output.py +1 -1
- kiln_ai/datamodel/test_basemodel.py +210 -18
- kiln_ai/datamodel/test_eval_model.py +6 -6
- kiln_ai/datamodel/test_model_perf.py +1 -1
- kiln_ai/datamodel/test_prompt_id.py +5 -1
- kiln_ai/datamodel/test_task.py +5 -0
- kiln_ai/utils/config.py +10 -0
- {kiln_ai-0.18.0.dist-info → kiln_ai-0.19.0.dist-info}/METADATA +32 -2
- {kiln_ai-0.18.0.dist-info → kiln_ai-0.19.0.dist-info}/RECORD +42 -42
- {kiln_ai-0.18.0.dist-info → kiln_ai-0.19.0.dist-info}/WHEEL +0 -0
- {kiln_ai-0.18.0.dist-info → kiln_ai-0.19.0.dist-info}/licenses/LICENSE.txt +0 -0
kiln_ai/datamodel/prompt.py
CHANGED
@@ -1,6 +1,6 @@
 from pydantic import BaseModel, Field

-from kiln_ai.datamodel.basemodel import
+from kiln_ai.datamodel.basemodel import FilenameString, KilnParentedModel


 class BasePrompt(BaseModel):
@@ -10,7 +10,7 @@ class BasePrompt(BaseModel):
     The "Prompt" model name is reserved for the custom prompts parented by a task.
     """

-    name:
+    name: FilenameString = Field(description="The name of the prompt.")
     description: str | None = Field(
         default=None,
         description="A more detailed description of the prompt.",
kiln_ai/datamodel/prompt_id.py
CHANGED
@@ -60,11 +60,11 @@ def _check_prompt_id(id: str) -> str:
         return id

     if id.startswith("fine_tune_prompt::"):
-        # check it had a fine_tune_id after the :: -- 'fine_tune_prompt::fine_tune_id'
-
-        if len(
+        # check it had a fine_tune_id after the :: -- 'fine_tune_prompt::[project_id]::[task_id]::fine_tune_id'
+        parts = id.split("::")
+        if len(parts) != 4 or len(parts[3]) == 0:
             raise ValueError(
-                f"Invalid fine-tune prompt ID: {id}. Expected format: 'fine_tune_prompt::[fine_tune_id]'."
+                f"Invalid fine-tune prompt ID: {id}. Expected format: 'fine_tune_prompt::[project_id]::[task_id]::[fine_tune_id]'."
             )
         return id

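Note: together with the prompt ID test changes later in this diff, fine-tune prompt IDs now carry project and task IDs. A minimal standalone sketch of the four-part check shown above (not the library's code; the example IDs are made up):

```py
def check_fine_tune_prompt_id(id: str) -> str:
    """Validate the 0.19-style ID: 'fine_tune_prompt::[project_id]::[task_id]::[fine_tune_id]'."""
    if id.startswith("fine_tune_prompt::"):
        parts = id.split("::")
        if len(parts) != 4 or len(parts[3]) == 0:
            raise ValueError(
                f"Invalid fine-tune prompt ID: {id}. "
                "Expected format: 'fine_tune_prompt::[project_id]::[task_id]::[fine_tune_id]'."
            )
    return id


# Accepted under the new format (hypothetical IDs):
check_fine_tune_prompt_id("fine_tune_prompt::project_123::task_456::ft_123456")

# The pre-0.19 two-part form now raises ValueError:
# check_fine_tune_prompt_id("fine_tune_prompt::ft_123456")
```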
kiln_ai/datamodel/task.py
CHANGED
@@ -7,8 +7,8 @@ from kiln_ai.datamodel import Finetune
 from kiln_ai.datamodel.basemodel import (
     ID_FIELD,
     ID_TYPE,
-
-
+    FilenameString,
+    FilenameStringShort,
     KilnParentedModel,
     KilnParentModel,
 )
@@ -38,7 +38,7 @@ class TaskRequirement(BaseModel):
     """

     id: ID_TYPE = ID_FIELD
-    name:
+    name: FilenameStringShort = Field(description="The name of the task requirement.")
     description: str | None = Field(default=None)
     instruction: str = Field(min_length=1)
     priority: Priority = Field(default=Priority.p2)
@@ -103,7 +103,7 @@ class TaskRunConfig(KilnParentedModel):
     A run config includes everything needed to run a task, except the input. Running the same RunConfig with the same input should make identical calls to the model (output may vary as models are non-deterministic).
     """

-    name:
+    name: FilenameString = Field(description="The name of the task run config.")
     description: str | None = Field(
         default=None, description="The description of the task run config."
     )
@@ -189,7 +189,7 @@ class Task(
     a collection of task runs.
     """

-    name:
+    name: FilenameString = Field(description="The name of the task.")
     description: str | None = Field(
         default=None,
         description="A description of the task for you and your team. Will not be used in prompts/training/validation.",
@@ -216,7 +216,7 @@ class Task(
             return None
         return schema_from_json_str(self.input_json_schema)

-    # These wrappers help for typechecking.
+    # These wrappers help for typechecking. We should fix this in KilnParentModel
     def runs(self, readonly: bool = False) -> list[TaskRun]:
         return super().runs(readonly=readonly)  # type: ignore

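Note: the name fields above move from plain strings to the new FilenameString / FilenameStringShort types. A minimal usage sketch, grounded in test_task.py and the basemodel tests later in this diff (the unicode name case is taken directly from those tests):

```py
from kiln_ai.datamodel import Task

# Names are now validated by FilenameString: length bounds are enforced and the
# value must be usable as part of a filename. Unicode names are allowed.
task = Task(name="你好", instruction="Do something")
assert task.name == "你好"
```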
kiln_ai/datamodel/task_output.py
CHANGED
@@ -307,7 +307,7 @@ class TaskOutput(KilnBaseModel):
         if task.output_json_schema is not None:
             try:
                 output_parsed = json.loads(self.output)
-            except json.JSONDecodeError
+            except json.JSONDecodeError:
                 raise ValueError("Output is not a valid JSON object")

             validate_schema_with_value_error(
@@ -1,5 +1,6 @@
 import datetime
 import json
+import uuid
 from pathlib import Path
 from typing import Optional
 from unittest.mock import MagicMock, patch
@@ -12,6 +13,7 @@ from kiln_ai.datamodel import Task, TaskRun
 from kiln_ai.datamodel.basemodel import (
     KilnBaseModel,
     KilnParentedModel,
+    name_validator,
     string_to_valid_name,
 )
 from kiln_ai.datamodel.model_cache import ModelCache
@@ -328,28 +330,81 @@ def test_delete_no_path():
         model.delete()


-
-
-
-
-
+@pytest.mark.parametrize(
+    "name,expected",
+    [
+        # Basic valid strings remain unchanged
+        ("Hello World", "Hello World"),
+        ("Test-123", "Test-123"),
+        ("my_file_name", "my_file_name"),
+        ("multiple!!!symbols", "multiple!!!symbols"),
+        # Emoji
+        ("Hello 👍", "Hello 👍"),
+        # Invalid characters are replaced
+        ("Hello@World!", "Hello@World!"),
+        ("File.name.txt", "File_name_txt"),
+        ("Special%%%Chars", "Special_Chars"),
+        ("Special#$%Chars", "Special#$_Chars"),
+        # Consecutive invalid characters are replaced
+        ("Special%%%Chars", "Special_Chars"),
+        ("path/to/file", "path_to_file"),
+        # Leading/trailing special characters are removed
+        ("__test__", "test"),
+        ("...test...", "test"),
+        # Whitespace is replaced
+        ("", ""),
+        (" ", ""),
+        ("Hello World", "Hello World"),
+        # Unicode characters are replaced
+        ("你好", "你好"),
+        ("你好_世界", "你好_世界"),
+        ("你好_世界_你好", "你好_世界_你好"),
+        # Newlines, tabs, and other control characters are replaced
+        ("Hello\nworld", "Hello_world"),
+        ("Hello\tworld", "Hello_world"),
+        ("Hello\rworld", "Hello_world"),
+        ("Hello\fworld", "Hello_world"),
+        ("Hello\bworld", "Hello_world"),
+        ("Hello\vworld", "Hello_world"),
+        ("Hello\0world", "Hello_world"),
+        ("Hello\x00world", "Hello_world"),
+    ],
+)
+def test_string_to_valid_name(tmp_path, name, expected):
+    assert string_to_valid_name(name) == expected

-    #
-
-
-    assert string_to_valid_name("Special#$%Chars") == "Special_Chars"
+    # check we can create a folder with the valid name
+    dir_path = tmp_path / str(uuid.uuid4()) / expected
+    dir_path.mkdir(parents=True)

-    # Test consecutive invalid characters
-    assert string_to_valid_name("multiple!!!symbols") == "multiple_symbols"
-    assert string_to_valid_name("path/to/file") == "path_to_file"

-
-
-
+@pytest.mark.parametrize(
+    "name,min_length,max_length,should_pass",
+    [
+        # Valid cases
+        ("ValidName", 5, 20, True),
+        ("Short", 1, 10, True),
+        ("LongerValidName", 5, 20, True),
+        # None case (line 53)
+        (None, 5, 20, False),
+        # Too short cases (lines 57-59)
+        ("Hi", 5, 20, False),
+        ("", 1, 20, False),
+        ("a", 2, 20, False),
+        # Too long cases (lines 61-63)
+        ("ThisNameIsTooLong", 5, 10, False),
+        ("VeryVeryVeryLongName", 1, 15, False),
+    ],
+)
+def test_name_validator_error_conditions(name, min_length, max_length, should_pass):
+    validator = name_validator(min_length=min_length, max_length=max_length)

-
-
-
+    if should_pass:
+        result = validator(name)
+        assert result == name
+    else:
+        with pytest.raises(ValueError):
+            validator(name)


 def test_load_from_file_with_cache(test_base_file, tmp_model_cache):
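Note: for orientation, the behavior of string_to_valid_name as asserted by the parametrized cases above (the import path is taken from the test's own imports; the commented outputs are the asserted values, not independently verified):

```py
from kiln_ai.datamodel.basemodel import string_to_valid_name

string_to_valid_name("path/to/file")   # -> "path_to_file" (path separators replaced)
string_to_valid_name("...test...")     # -> "test" (leading/trailing specials stripped)
string_to_valid_name("Hello\nworld")   # -> "Hello_world" (control characters replaced)
string_to_valid_name("你好_世界")       # -> "你好_世界" (unicode passes through unchanged)
```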
@@ -553,3 +608,140 @@ async def test_invoke_parsing_flow(adapter):
         match="Reasoning is required for this model, but no reasoning was returned.",
     ):
         await adapter.invoke("test input")
+
+
+async def test_invoke_parsing_flow_basic_no_reasoning(adapter):
+    """Test for reasoning_optional_for_structured_output
+    when reasoning is not required.
+    This is a special case where we want to return the output as is.
+    """
+    # Mock dependencies
+    mock_provider = MagicMock()
+    mock_provider.parser = "test_parser"
+    mock_provider.formatter = None
+    mock_provider.reasoning_capable = False
+    mock_provider.reasoning_optional_for_structured_output = True
+
+    mock_parser = MagicMock()
+    mock_parser.parse_output.return_value = RunOutput(
+        output="parsed test output", intermediate_outputs={"key": "value"}
+    )
+
+    with (
+        patch.object(adapter, "model_provider", return_value=mock_provider),
+        patch(
+            "kiln_ai.adapters.model_adapters.base_adapter.model_parser_from_id",
+            return_value=mock_parser,
+        ),
+        patch("kiln_ai.adapters.model_adapters.base_adapter.Config") as mock_config,
+    ):
+        # Disable autosaving for this test
+        mock_config.shared.return_value.autosave_runs = False
+        mock_config.shared.return_value.user_id = "test_user_id"
+
+        # Execute
+        result = await adapter.invoke("test input")
+
+        # Verify parsing occurred
+        mock_parser.parse_output.assert_called_once()
+        parsed_args = mock_parser.parse_output.call_args[1]
+        assert isinstance(parsed_args["original_output"], RunOutput)
+        assert parsed_args["original_output"].output == "test output"
+
+        # Verify result contains parsed output
+        assert isinstance(result, TaskRun)
+        assert result.output.output == "parsed test output"
+        assert result.intermediate_outputs == {"key": "value"}
+        assert result.input == "test input"
+
+
+async def test_invoke_parsing_flow_no_reasoning_with_structured_output(adapter):
+    """Test for reasoning_optional_for_structured_output
+    when reasoning is required but not provided, with structured output enabled.
+    This is a special case where we don't want to error, but we want to return the output as is.
+    """
+    # Mock dependencies
+    mock_provider = MagicMock()
+    mock_provider.parser = "test_parser"
+    mock_provider.formatter = None
+    mock_provider.reasoning_capable = True
+    mock_provider.reasoning_optional_for_structured_output = True
+
+    mock_parser = MagicMock()
+    mock_parser.parse_output.return_value = RunOutput(
+        output="parsed test output", intermediate_outputs={"key": "value"}
+    )
+
+    with (
+        patch.object(adapter, "model_provider", return_value=mock_provider),
+        patch(
+            "kiln_ai.adapters.model_adapters.base_adapter.model_parser_from_id",
+            return_value=mock_parser,
+        ),
+        patch("kiln_ai.adapters.model_adapters.base_adapter.Config") as mock_config,
+        patch.object(adapter, "has_structured_output", return_value=True),
+    ):
+        # Disable autosaving for this test
+        mock_config.shared.return_value.autosave_runs = False
+        mock_config.shared.return_value.user_id = "test_user_id"
+
+        # Execute
+        result = await adapter.invoke("test input")
+
+        # Verify parsing occurred
+        mock_parser.parse_output.assert_called_once()
+        parsed_args = mock_parser.parse_output.call_args[1]
+        assert isinstance(parsed_args["original_output"], RunOutput)
+        assert parsed_args["original_output"].output == "test output"
+
+        # Verify result contains parsed output
+        assert isinstance(result, TaskRun)
+        assert result.output.output == "parsed test output"
+        assert result.intermediate_outputs == {"key": "value"}
+        assert result.input == "test input"
+
+
+async def test_invoke_parsing_flow_with_reasoning_and_structured_output(adapter):
+    """Test for reasoning_optional_for_structured_output
+    when reasoning is provided with structured output enabled.
+    This is a special case where we want to return the output as is.
+    """
+    # Mock dependencies
+    mock_provider = MagicMock()
+    mock_provider.parser = "test_parser"
+    mock_provider.formatter = None
+    mock_provider.reasoning_capable = True
+    mock_provider.reasoning_optional_for_structured_output = True
+
+    mock_parser = MagicMock()
+    mock_parser.parse_output.return_value = RunOutput(
+        output="parsed test output", intermediate_outputs={"reasoning": "value"}
+    )
+
+    with (
+        patch.object(adapter, "model_provider", return_value=mock_provider),
+        patch(
+            "kiln_ai.adapters.model_adapters.base_adapter.model_parser_from_id",
+            return_value=mock_parser,
+        ),
+        patch("kiln_ai.adapters.model_adapters.base_adapter.Config") as mock_config,
+        patch.object(adapter, "has_structured_output", return_value=True),
+    ):
+        # Disable autosaving for this test
+        mock_config.shared.return_value.autosave_runs = False
+        mock_config.shared.return_value.user_id = "test_user_id"
+
+        # Execute
+        result = await adapter.invoke("test input")
+
+        # Verify parsing occurred
+        mock_parser.parse_output.assert_called_once()
+        parsed_args = mock_parser.parse_output.call_args[1]
+        assert isinstance(parsed_args["original_output"], RunOutput)
+        assert parsed_args["original_output"].output == "test output"
+
+        # Verify result contains parsed output
+        assert isinstance(result, TaskRun)
+        assert result.output.output == "parsed test output"
+        assert result.intermediate_outputs == {"reasoning": "value"}
+        assert result.input == "test input"
@@ -517,13 +517,13 @@ def test_eval_run_score_keys_must_match(valid_eval_config, valid_eval_run_data):
     valid_eval_config.parent = eval

     # Correct
-
+    EvalRun(
         parent=valid_eval_config,
         **{**valid_eval_run_data, "scores": {"accuracy": 4.5, "critical": 1.0}},
     )

     # Correct but wrong order still okay
-
+    EvalRun(
         parent=valid_eval_config,
         **{**valid_eval_run_data, "scores": {"critical": 1.0, "accuracy": 4.5}},
     )
@@ -533,7 +533,7 @@ def test_eval_run_score_keys_must_match(valid_eval_config, valid_eval_run_data):
         ValueError,
         match="The scores produced by the evaluator must match the scores expected by the eval",
     ):
-
+        EvalRun(
             parent=valid_eval_config,
             **{**valid_eval_run_data, "scores": {"accuracy": 4.5}},
         )
@@ -543,7 +543,7 @@ def test_eval_run_score_keys_must_match(valid_eval_config, valid_eval_run_data):
         ValueError,
         match="The scores produced by the evaluator must match the scores expected by the eval",
     ):
-
+        EvalRun(
             parent=valid_eval_config,
             **{
                 **valid_eval_run_data,
@@ -556,7 +556,7 @@ def test_eval_run_score_keys_must_match(valid_eval_config, valid_eval_run_data):
         ValueError,
         match="The scores produced by the evaluator must match the scores expected by the eval",
     ):
-
+        EvalRun(
             parent=valid_eval_config,
             **{**valid_eval_run_data, "scores": {"accuracy": 4.5, "wrong": 1.0}},
         )
@@ -566,7 +566,7 @@ def test_eval_run_custom_scores_not_allowed(valid_eval_config, valid_eval_run_da
     with pytest.raises(
         ValueError, match="Custom scores are not supported in evaluators"
     ):
-
+        Eval(
            name="Test Eval",
            eval_set_filter_id="tag::tag1",
            eval_configs_filter_id="tag::tag2",
@@ -121,6 +121,6 @@ def test_benchmark_load_from_file(benchmark, task_run):

     # I get 8k ops per second on my MBP. Lower value here for CI and parallel testing.
     # Prior to optimization was 290 ops per second.
-
+    # sys.stdout.write(f"Ops per second: {ops_per_second:.6f}")
     if ops_per_second < 500:
         pytest.fail(f"Ops per second: {ops_per_second:.6f}, expected more than 1k ops")
@@ -29,7 +29,7 @@ def test_valid_saved_prompt_id():

 def test_valid_fine_tune_prompt_id():
     """Test that valid fine-tune prompt IDs are accepted"""
-    valid_id = "fine_tune_prompt::ft_123456"
+    valid_id = "fine_tune_prompt::project_123::task_456::ft_123456"
     model = ModelTester(prompt_id=valid_id)
     assert model.prompt_id == valid_id

@@ -53,6 +53,10 @@ def test_invalid_saved_prompt_id_format(invalid_id):
     [
         ("fine_tune_prompt::", "Invalid fine-tune prompt ID: fine_tune_prompt::"),
         ("fine_tune_prompt", "Invalid prompt ID: fine_tune_prompt"),
+        (
+            "fine_tune_prompt::ft_123456",
+            "Invalid fine-tune prompt ID: fine_tune_prompt::ft_123456",
+        ),
     ],
 )
 def test_invalid_fine_tune_prompt_id_format(invalid_id, expected_error):
kiln_ai/datamodel/test_task.py
CHANGED
@@ -323,3 +323,8 @@ def test_run_config_upgrade_old_entries():
     assert parsed.name == "test name"
     assert parsed.created_by == "scosman"
     assert parsed.run_config_properties.structured_output_mode == "unknown"
+
+
+def test_task_name_unicode_name():
+    task = Task(name="你好", instruction="Do something")
+    assert task.name == "你好"
kiln_ai/utils/config.py
CHANGED
@@ -124,6 +124,11 @@ class Config:
                 env_var="WANDB_API_KEY",
                 sensitive=True,
             ),
+            "siliconflow_cn_api_key": ConfigProperty(
+                str,
+                env_var="SILICONFLOW_CN_API_KEY",
+                sensitive=True,
+            ),
             "wandb_base_url": ConfigProperty(
                 str,
                 env_var="WANDB_BASE_URL",
@@ -137,6 +142,11 @@ class Config:
                 default_lambda=lambda: [],
                 sensitive_keys=["api_key"],
             ),
+            "cerebras_api_key": ConfigProperty(
+                str,
+                env_var="CEREBRAS_API_KEY",
+                sensitive=True,
+            ),
         }
         self._lock = threading.Lock()
         self._settings = self.load_settings()
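Note: the two new provider keys follow the existing ConfigProperty pattern, so they can be supplied through the mapped environment variables. A hedged sketch (the attribute-style access on Config.shared() is an assumption based on how autosave_runs and user_id are accessed elsewhere in this diff; key values are placeholders):

```py
import os

from kiln_ai.utils.config import Config

# Option 1: provide the keys via the environment variables mapped above.
os.environ["CEREBRAS_API_KEY"] = "your-cerebras-key"           # placeholder
os.environ["SILICONFLOW_CN_API_KEY"] = "your-siliconflow-key"  # placeholder

# Option 2 (assumed attribute access): read them through the shared config object.
config = Config.shared()
print(config.cerebras_api_key is not None)
```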
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: kiln-ai
-Version: 0.18.0
+Version: 0.19.0
 Summary: Kiln AI
 Project-URL: Homepage, https://getkiln.ai
 Project-URL: Repository, https://github.com/Kiln-AI/kiln
@@ -65,6 +65,7 @@ The library has a [comprehensive set of docs](https://kiln-ai.github.io/Kiln/kil

 ## Table of Contents

+- [Connecting AI Providers](#connecting-ai-providers-openai-openrouter-ollama-etc)
 - [Using the Kiln Data Model](#using-the-kiln-data-model)
 - [Understanding the Kiln Data Model](#understanding-the-kiln-data-model)
 - [Datamodel Overview](#datamodel-overview)
@@ -73,6 +74,7 @@ The library has a [comprehensive set of docs](https://kiln-ai.github.io/Kiln/kil
 - [Using your Kiln Dataset in a Notebook or Project](#using-your-kiln-dataset-in-a-notebook-or-project)
 - [Using Kiln Dataset in Pandas](#using-kiln-dataset-in-pandas)
 - [Building and Running a Kiln Task from Code](#building-and-running-a-kiln-task-from-code)
+- [Tagging Task Runs Programmatically](#tagging-task-runs-programmatically)
 - [Adding Custom Model or AI Provider from Code](#adding-custom-model-or-ai-provider-from-code)
 - [Full API Reference](#full-api-reference)

@@ -82,6 +84,12 @@ The library has a [comprehensive set of docs](https://kiln-ai.github.io/Kiln/kil
 pip install kiln-ai
 ```

+## Connecting AI Providers (OpenAI, OpenRouter, Ollama, etc)
+
+The easiest way to connect AI providers is to use the Kiln app UI. Once connected in the UI, credentials will be stored to `~/.kiln_ai/settings.yml`, which will be available to the library.
+
+For configuring credentials from code or connecting custom servers/model, see [Adding Custom Model or AI Provider from Code](#adding-custom-model-or-ai-provider-from-code).
+
 ## Using the Kiln Data Model

 ### Understanding the Kiln Data Model
@@ -179,7 +187,10 @@ item = kiln_ai.datamodel.TaskRun(
             type=kiln_ai.datamodel.DataSourceType.human,
             properties={"created_by": "Jane Doe"},
         ),
-        rating=kiln_ai.datamodel.TaskOutputRating(
+        rating=kiln_ai.datamodel.TaskOutputRating(
+            value=5,
+            type=kiln_ai.datamodel.datamodel_enums.five_star,
+        ),
     ),
 )
 item.save_to_file()
@@ -270,6 +281,25 @@ for run in task.runs():

 ```

+## Tagging Task Runs Programmatically
+
+You can also tag your Kiln Task runs programmatically:
+
+```py
+# Load your Kiln Task from disk
+task_path = "/Users/youruser/Kiln Projects/test project/tasks/632780983478 - Joke Generator/task.kiln"
+task = kiln_ai.datamodel.Task.load_from_file(task_path)
+
+for run in task.runs():
+    # Parse the task output from JSON
+    output = json.loads(run.output.output)
+
+    # Add a tag if the punchline is unusually short
+    if len(output["punchline"]) < 100:
+        run.tags.append("very_short")
+        run.save_to_file()  # Persist the updated tags
+```
+
 ### Adding Custom Model or AI Provider from Code

 You can add additional AI models and providers to Kiln.
|