kiln-ai 0.17.0__py3-none-any.whl → 0.18.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kiln_ai/adapters/chat/chat_formatter.py +0 -1
- kiln_ai/adapters/data_gen/data_gen_prompts.py +121 -36
- kiln_ai/adapters/data_gen/data_gen_task.py +49 -36
- kiln_ai/adapters/data_gen/test_data_gen_task.py +311 -34
- kiln_ai/adapters/eval/base_eval.py +6 -7
- kiln_ai/adapters/eval/eval_runner.py +5 -1
- kiln_ai/adapters/eval/g_eval.py +17 -12
- kiln_ai/adapters/eval/test_base_eval.py +8 -2
- kiln_ai/adapters/eval/test_g_eval.py +115 -5
- kiln_ai/adapters/fine_tune/base_finetune.py +1 -6
- kiln_ai/adapters/fine_tune/dataset_formatter.py +1 -5
- kiln_ai/adapters/fine_tune/test_dataset_formatter.py +1 -1
- kiln_ai/adapters/fine_tune/test_vertex_finetune.py +2 -7
- kiln_ai/adapters/fine_tune/together_finetune.py +1 -1
- kiln_ai/adapters/ml_model_list.py +293 -44
- kiln_ai/adapters/model_adapters/litellm_adapter.py +9 -0
- kiln_ai/adapters/model_adapters/test_base_adapter.py +0 -1
- kiln_ai/adapters/model_adapters/test_litellm_adapter.py +48 -0
- kiln_ai/adapters/model_adapters/test_structured_output.py +3 -3
- kiln_ai/adapters/parsers/parser_registry.py +0 -2
- kiln_ai/adapters/parsers/r1_parser.py +0 -1
- kiln_ai/adapters/remote_config.py +66 -0
- kiln_ai/adapters/repair/repair_task.py +1 -6
- kiln_ai/adapters/test_ml_model_list.py +18 -0
- kiln_ai/adapters/test_prompt_adaptors.py +0 -4
- kiln_ai/adapters/test_remote_config.py +100 -0
- kiln_ai/datamodel/eval.py +32 -0
- kiln_ai/datamodel/finetune.py +0 -1
- kiln_ai/datamodel/task_output.py +0 -2
- kiln_ai/datamodel/task_run.py +0 -2
- kiln_ai/datamodel/test_eval_model.py +146 -4
- kiln_ai/utils/logging.py +4 -3
- {kiln_ai-0.17.0.dist-info → kiln_ai-0.18.0.dist-info}/METADATA +2 -2
- {kiln_ai-0.17.0.dist-info → kiln_ai-0.18.0.dist-info}/RECORD +36 -34
- {kiln_ai-0.17.0.dist-info → kiln_ai-0.18.0.dist-info}/WHEEL +0 -0
- {kiln_ai-0.17.0.dist-info → kiln_ai-0.18.0.dist-info}/licenses/LICENSE.txt +0 -0
@@ -245,7 +245,7 @@ async def run_structured_input_task_no_validation(
     try:
         run = await a.invoke({"a": 2, "b": 2, "c": 2})
         response = run.output.output
-        return response, a
+        return response, a, run
     except ValueError as e:
         if str(e) == "Failed to connect to Ollama. Ensure Ollama is running.":
             pytest.skip(
@@ -260,7 +260,7 @@ async def run_structured_input_task(
     provider: str,
     prompt_id: PromptId,
 ):
-    response, a = await run_structured_input_task_no_validation(
+    response, a, run = await run_structured_input_task_no_validation(
         task, model_name, provider, prompt_id
     )
     assert response is not None
@@ -350,7 +350,7 @@ When asked for a final result, this is the format (for an equilateral example):
     """
     task.output_json_schema = json.dumps(triangle_schema)
     task.save_to_file()
-    response, adapter = await run_structured_input_task_no_validation(
+    response, adapter, _ = await run_structured_input_task_no_validation(
         task, model_name, provider_name, "simple_chain_of_thought_prompt_builder"
     )
 
@@ -0,0 +1,66 @@
+import argparse
+import json
+import logging
+import os
+import threading
+from pathlib import Path
+from typing import List
+
+import requests
+
+from .ml_model_list import KilnModel, built_in_models
+
+logger = logging.getLogger(__name__)
+
+
+def serialize_config(models: List[KilnModel], path: str | Path) -> None:
+    data = {"model_list": [m.model_dump(mode="json") for m in models]}
+    Path(path).write_text(json.dumps(data, indent=2, sort_keys=True))
+
+
+def deserialize_config(path: str | Path) -> List[KilnModel]:
+    raw = json.loads(Path(path).read_text())
+    model_data = raw.get("model_list", raw if isinstance(raw, list) else [])
+    return [KilnModel.model_validate(item) for item in model_data]
+
+
+def load_from_url(url: str) -> List[KilnModel]:
+    response = requests.get(url, timeout=10)
+    response.raise_for_status()
+    data = response.json()
+    if isinstance(data, list):
+        model_data = data
+    else:
+        model_data = data.get("model_list", [])
+    return [KilnModel.model_validate(item) for item in model_data]
+
+
+def dump_builtin_config(path: str | Path) -> None:
+    serialize_config(built_in_models, path)
+
+
+def load_remote_models(url: str) -> None:
+    if os.environ.get("KILN_SKIP_REMOTE_MODEL_LIST") == "true":
+        return
+
+    def fetch_and_replace() -> None:
+        try:
+            models = load_from_url(url)
+            built_in_models[:] = models
+        except Exception as exc:
+            # Do not crash startup, but surface the issue
+            logger.warning("Failed to fetch remote model list from %s: %s", url, exc)
+
+    thread = threading.Thread(target=fetch_and_replace, daemon=True)
+    thread.start()
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("path", help="output path")
+    args = parser.parse_args()
+    dump_builtin_config(args.path)
+
+
+if __name__ == "__main__":
+    main()
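Taken together, the new kiln_ai/adapters/remote_config.py module lets Kiln refresh its model catalog without a package release: dump_builtin_config (also wired to the module's main() entry point) snapshots the built-in list to JSON, load_from_url fetches and validates a hosted copy, and load_remote_models swaps the fetched list into built_in_models on a daemon thread, keeping the packaged list and logging a warning on any failure. Setting KILN_SKIP_REMOTE_MODEL_LIST=true disables the fetch. A rough usage sketch based only on what this diff shows; the URL is a placeholder:

import time

from kiln_ai.adapters.ml_model_list import built_in_models
from kiln_ai.adapters.remote_config import dump_builtin_config, load_remote_models

# Snapshot the packaged model list to disk (what running the module with a path argument does).
dump_builtin_config("models.json")

# Kick off the non-blocking refresh; built_in_models is replaced in place once the
# background thread succeeds, and any failure only logs a warning.
load_remote_models("https://example.com/kiln_models.json")  # placeholder URL
time.sleep(1)  # toy example only: give the daemon thread a moment to finish
print(f"{len(built_in_models)} models available")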
@@ -1,13 +1,8 @@
 import json
-from typing import Type
 
 from pydantic import BaseModel, Field
 
-from kiln_ai.adapters.prompt_builders import (
-    BasePromptBuilder,
-    SavedPromptBuilder,
-    prompt_builder_from_id,
-)
+from kiln_ai.adapters.prompt_builders import BasePromptBuilder, prompt_builder_from_id
 from kiln_ai.datamodel import Priority, Project, Task, TaskRequirement, TaskRun
 
 
@@ -156,3 +156,21 @@ class TestDefaultStructuredOutputModeForModelProvider:
             provider=first_provider.name,
         )
         assert result == first_provider.structured_output_mode
+
+
+def test_uncensored():
+    """Test that uncensored is set correctly"""
+    model = get_model_by_name(ModelName.grok_3_mini)
+    for provider in model.providers:
+        assert provider.uncensored
+        assert not provider.suggested_for_uncensored_data_gen
+
+    model = get_model_by_name(ModelName.gpt_4_1_nano)
+    for provider in model.providers:
+        assert not provider.uncensored
+        assert not provider.suggested_for_uncensored_data_gen
+
+    model = get_model_by_name(ModelName.grok_4)
+    for provider in model.providers:
+        assert provider.uncensored
+        assert provider.suggested_for_uncensored_data_gen
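The new test exercises two provider flags added to the model list in this release, uncensored and suggested_for_uncensored_data_gen. A small illustrative query over the expanded list (a sketch only; the model name attribute is assumed from get_model_by_name and may differ in the actual KilnModel fields):

from kiln_ai.adapters.ml_model_list import built_in_models

# Models with at least one provider flagged as suggested for uncensored data generation.
suggested = [
    model.name  # assumed attribute
    for model in built_in_models
    if any(p.suggested_for_uncensored_data_gen for p in model.providers)
]
print(suggested)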
@@ -13,10 +13,6 @@ from kiln_ai.adapters.model_adapters.litellm_adapter import (
     LiteLlmConfig,
 )
 from kiln_ai.adapters.ollama_tools import ollama_online
-from kiln_ai.adapters.prompt_builders import (
-    BasePromptBuilder,
-    SimpleChainOfThoughtPromptBuilder,
-)
 from kiln_ai.datamodel import PromptId
 from kiln_ai.datamodel.task import RunConfigProperties
 
@@ -0,0 +1,100 @@
+import asyncio
+import os
+from unittest.mock import patch
+
+import pytest
+
+from kiln_ai.adapters.ml_model_list import built_in_models
+from kiln_ai.adapters.remote_config import (
+    deserialize_config,
+    dump_builtin_config,
+    load_from_url,
+    load_remote_models,
+    serialize_config,
+)
+
+
+def test_round_trip(tmp_path):
+    path = tmp_path / "models.json"
+    serialize_config(built_in_models, path)
+    loaded = deserialize_config(path)
+    assert [m.model_dump(mode="json") for m in loaded] == [
+        m.model_dump(mode="json") for m in built_in_models
+    ]
+
+
+def test_load_from_url():
+    sample = [built_in_models[0].model_dump(mode="json")]
+
+    class FakeResponse:
+        def raise_for_status(self):
+            pass
+
+        def json(self):
+            return {"model_list": sample}
+
+    with patch(
+        "kiln_ai.adapters.remote_config.requests.get", return_value=FakeResponse()
+    ):
+        models = load_from_url("http://example.com/models.json")
+    assert [m.model_dump(mode="json") for m in models] == sample
+
+
+def test_dump_builtin_config(tmp_path):
+    path = tmp_path / "out.json"
+    dump_builtin_config(path)
+    loaded = deserialize_config(path)
+    assert [m.model_dump(mode="json") for m in loaded] == [
+        m.model_dump(mode="json") for m in built_in_models
+    ]
+
+
+@pytest.mark.asyncio
+async def test_load_remote_models_success(monkeypatch):
+    del os.environ["KILN_SKIP_REMOTE_MODEL_LIST"]
+    original = built_in_models.copy()
+    sample_models = [built_in_models[0]]
+
+    def fake_fetch(url):
+        return sample_models
+
+    monkeypatch.setattr("kiln_ai.adapters.remote_config.load_from_url", fake_fetch)
+
+    load_remote_models("http://example.com/models.json")
+    await asyncio.sleep(0.01)
+    assert built_in_models == sample_models
+    built_in_models[:] = original
+
+
+@pytest.mark.asyncio
+async def test_load_remote_models_failure(monkeypatch):
+    original = built_in_models.copy()
+
+    def fake_fetch(url):
+        raise RuntimeError("fail")
+
+    monkeypatch.setattr("kiln_ai.adapters.remote_config.load_from_url", fake_fetch)
+
+    load_remote_models("http://example.com/models.json")
+    await asyncio.sleep(0.01)
+    assert built_in_models == original
+
+
+def test_deserialize_config_with_extra_keys(tmp_path):
+    # Take a valid model and add an extra key, ensure it is ignored and still loads
+    import json
+
+    from kiln_ai.adapters.ml_model_list import built_in_models
+
+    model_dict = built_in_models[0].model_dump(mode="json")
+    model_dict["extra_key"] = "should be ignored or error"
+    model_dict["providers"][0]["extra_key"] = "should be ignored or error"
+    data = {"model_list": [model_dict]}
+    path = tmp_path / "extra.json"
+    path.write_text(json.dumps(data))
+    # Should NOT raise, and extra key should be ignored
+    models = deserialize_config(path)
+    assert hasattr(models[0], "family")
+    assert not hasattr(models[0], "extra_key")
+    assert hasattr(models[0], "providers")
+    assert not hasattr(models[0].providers[0], "extra_key")
kiln_ai/datamodel/eval.py CHANGED
@@ -14,6 +14,7 @@ from kiln_ai.datamodel.basemodel import (
 from kiln_ai.datamodel.datamodel_enums import TaskOutputRatingType
 from kiln_ai.datamodel.dataset_filters import DatasetFilterId
 from kiln_ai.datamodel.json_schema import string_to_json_key
+from kiln_ai.datamodel.task_run import Usage
 from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error
 
 if TYPE_CHECKING:
@@ -28,6 +29,7 @@ class EvalTemplateId(str, Enum):
     """
 
     kiln_requirements = "kiln_requirements"
+    issue = "kiln_issue"
    toxicity = "toxicity"
    bias = "bias"
    maliciousness = "maliciousness"
@@ -110,6 +112,10 @@ class EvalRun(KilnParentedModel):
     scores: EvalScores = Field(
         description="The output scores of the evaluator (aligning to those required by the grand-parent Eval this object is a child of)."
     )
+    task_run_usage: Usage | None = Field(
+        default=None,
+        description="The usage of the task run that produced this eval run output (not the usage by the evaluation model).",
+    )
 
     def parent_eval_config(self) -> Union["EvalConfig", None]:
         if self.parent is not None and self.parent.__class__.__name__ != "EvalConfig":
@@ -280,6 +286,10 @@ class Eval(KilnParentedModel, KilnParentModel, parent_of={"configs": EvalConfig}
         default=False,
         description="Whether this eval is a favourite of the user. Rendered as a star icon in the UI.",
     )
+    template_properties: dict[str, str | int | bool | float] = Field(
+        default={},
+        description="Properties to be used to execute the eval. This is template_type specific and should serialize to a json dict.",
+    )
 
     # Workaround to return typed parent without importing Task
     def parent_task(self) -> Union["Task", None]:
@@ -304,3 +314,25 @@ class Eval(KilnParentedModel, KilnParentModel, parent_of={"configs": EvalConfig}
                 f"output_scores must have unique names (once transformed to JSON keys). Got: [{', '.join(output_score_keys)}]"
             )
         return self
+
+    @model_validator(mode="after")
+    def validate_template_properties(self) -> Self:
+        # Check for properties that are required for the issue template
+        if self.template == EvalTemplateId.issue:
+            if "issue_prompt" not in self.template_properties or not isinstance(
+                self.template_properties["issue_prompt"], str
+            ):
+                raise ValueError("issue_prompt is required for issue template")
+            if "failure_example" in self.template_properties and not isinstance(
+                self.template_properties["failure_example"], str
+            ):
+                raise ValueError(
+                    "failure_example is optional for issue template, but if provided must be a string"
+                )
+            if "pass_example" in self.template_properties and not isinstance(
+                self.template_properties["pass_example"], str
+            ):
+                raise ValueError(
+                    "pass_example is optional for issue template, but if provided must be a string"
+                )
+        return self
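These eval.py changes add a "kiln_issue" template whose template_properties are the only ones validated: issue_prompt must be a string, and failure_example / pass_example must be strings when present, while other templates accept arbitrary properties; EvalRun also gains an optional task_run_usage field recording the usage of the evaluated task run. A minimal construction sketch in the style of the package's tests (names, filter IDs and text values are placeholders):

from kiln_ai.datamodel.eval import Eval, EvalOutputScore, EvalTemplateId
from kiln_ai.datamodel.task_output import TaskOutputRatingType

issue_eval = Eval(
    name="Refund promise check",
    template=EvalTemplateId.issue,
    eval_set_filter_id="tag::tag1",
    eval_configs_filter_id="tag::tag2",
    output_scores=[
        EvalOutputScore(name="score", type=TaskOutputRatingType.pass_fail)
    ],
    template_properties={
        "issue_prompt": "The assistant must never promise a refund.",
        "failure_example": "Sure, I'll refund you right away!",  # optional
        "pass_example": "I can't issue refunds, but I can help you file a request.",  # optional
    },
)
# Omitting issue_prompt, or passing a non-string value for any of these keys,
# raises a ValueError from the validate_template_properties validator above.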
kiln_ai/datamodel/finetune.py CHANGED
kiln_ai/datamodel/task_output.py CHANGED
kiln_ai/datamodel/task_run.py CHANGED
@@ -1,7 +1,6 @@
 import pytest
 from pydantic import ValidationError
 
-from kiln_ai.datamodel import BasePrompt
 from kiln_ai.datamodel.basemodel import KilnParentModel
 from kiln_ai.datamodel.eval import (
     Eval,
@@ -9,11 +8,10 @@ from kiln_ai.datamodel.eval import (
     EvalConfigType,
     EvalOutputScore,
     EvalRun,
+    EvalTemplateId,
 )
 from kiln_ai.datamodel.task import Task
-from kiln_ai.datamodel.task_output import (
-    TaskOutputRatingType,
-)
+from kiln_ai.datamodel.task_output import TaskOutputRatingType
 
 
 @pytest.fixture
@@ -633,3 +631,147 @@ def test_eval_run_eval_config_eval_validation():
             output="test output",
             scores={"score": 1.0},
         )
+
+
+@pytest.mark.parametrize(
+    "template_properties,should_raise,expected_error",
+    [
+        # Valid cases
+        (
+            {"issue_prompt": "Test issue prompt"},
+            False,
+            None,
+        ),
+        (
+            {
+                "issue_prompt": "Test issue prompt",
+                "failure_example": "Test failure example",
+            },
+            False,
+            None,
+        ),
+        (
+            {
+                "issue_prompt": "Test issue prompt",
+                "failure_example": "Test failure example",
+                "pass_example": "Test pass example",
+            },
+            False,
+            None,
+        ),
+        (
+            {
+                "issue_prompt": "",
+                "failure_example": "",
+                "pass_example": "",
+            },
+            False,
+            None,
+        ),
+        # Invalid cases
+        (
+            {},
+            True,
+            "issue_prompt is required for issue template",
+        ),
+        (
+            {"failure_example": "Test failure example"},
+            True,
+            "issue_prompt is required for issue template",
+        ),
+        (
+            {"issue_prompt": 123},
+            True,
+            "issue_prompt is required for issue template",
+        ),
+        (
+            {
+                "issue_prompt": "Test issue prompt",
+                "failure_example": 456,
+            },
+            True,
+            "failure_example is optional for issue template, but if provided must be a string",
+        ),
+        (
+            {
+                "issue_prompt": "Test issue prompt",
+                "failure_example": "Test failure example",
+                "pass_example": 789,
+            },
+            True,
+            "pass_example is optional for issue template, but if provided must be a string",
+        ),
+    ],
+)
+def test_eval_template_properties_issue_template_validation(
+    template_properties, should_raise, expected_error
+):
+    """Test issue template validation with various property combinations"""
+    if should_raise:
+        with pytest.raises(ValueError, match=expected_error):
+            Eval(
+                name="Test Eval",
+                template=EvalTemplateId.issue,
+                eval_set_filter_id="tag::tag1",
+                eval_configs_filter_id="tag::tag2",
+                output_scores=[
+                    EvalOutputScore(
+                        name="score",
+                        type=TaskOutputRatingType.pass_fail,
+                    )
+                ],
+                template_properties=template_properties,
+            )
+    else:
+        eval = Eval(
+            name="Test Eval",
+            template=EvalTemplateId.issue,
+            eval_set_filter_id="tag::tag1",
+            eval_configs_filter_id="tag::tag2",
+            output_scores=[
+                EvalOutputScore(
+                    name="score",
+                    type=TaskOutputRatingType.pass_fail,
+                )
+            ],
+            template_properties=template_properties,
+        )
+        assert eval.template == EvalTemplateId.issue
+        for key, value in template_properties.items():
+            assert eval.template_properties[key] == value
+
+
+@pytest.mark.parametrize(
+    "template,template_properties",
+    [
+        (EvalTemplateId.kiln_requirements, {"random_property": "random_value"}),
+        (EvalTemplateId.toxicity, {}),
+        (EvalTemplateId.bias, {"some_property": 123}),
+        (EvalTemplateId.maliciousness, {"test": True}),
+        (EvalTemplateId.factual_correctness, {"score": 4.5}),
+        (EvalTemplateId.jailbreak, {"prompt": "test"}),
+        (
+            None,
+            {"issue_prompt": "This should not be validated", "failure_example": 123},
+        ),
+    ],
+)
+def test_eval_template_properties_non_issue_templates(template, template_properties):
+    """Test that non-issue templates pass validation regardless of template_properties"""
+    eval = Eval(
+        name="Test Eval",
+        template=template,
+        eval_set_filter_id="tag::tag1",
+        eval_configs_filter_id="tag::tag2",
+        output_scores=[
+            EvalOutputScore(
+                name="score",
+                type=TaskOutputRatingType.pass_fail,
+            )
+        ],
+        template_properties=template_properties,
+    )
+
+    assert eval.template == template
+    for key, value in template_properties.items():
+        assert eval.template_properties[key] == value
kiln_ai/utils/logging.py CHANGED
@@ -63,14 +63,14 @@ class CustomLiteLLMLogger(CustomLogger):
         # Print the formatted input data for the request in API format, pretty print
         try:
             self.logger.info(
-                f"Formatted Input Data (API):\n{json.dumps(data, indent=2)}"
+                f"Formatted Input Data (API):\n{json.dumps(data, indent=2, ensure_ascii=False)}"
             )
         except Exception as e:
             self.logger.info(f"Formatted Input Data (API): Could not print {e}")
 
         # Print the messages for the request in LiteLLM Message list, pretty print
         try:
-            json_messages = json.dumps(messages, indent=2)
+            json_messages = json.dumps(messages, indent=2, ensure_ascii=False)
             self.logger.info(f"Messages:\n{json_messages}")
         except Exception as e:
             self.logger.info(f"Messages: Could not print {e}")
@@ -115,7 +115,7 @@
             # JSON format logs if possible
             json_content = json.loads(content)
             self.logger.info(
-                f"Model Response Content:\n{json.dumps(json_content, indent=2)}"
+                f"Model Response Content:\n{json.dumps(json_content, indent=2, ensure_ascii=False)}"
             )
         except Exception:
             self.logger.info(f"Model Response Content:\n{content}")
@@ -149,6 +149,7 @@ def setup_litellm_logging(filename: str = "model_calls.log"):
         get_log_file_path(filename),
         maxBytes=5 * 1024 * 1024,  # 5MB
         backupCount=3,
+        encoding="utf-8",
     )
 
     # Set formatter to match the default formatting
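The logging changes share one goal: non-ASCII model inputs and outputs appear in the log verbatim rather than as \uXXXX escapes, and the rotating file handler now writes UTF-8 explicitly. A standalone illustration of the json.dumps difference:

import json

message = {"role": "user", "content": "こんにちは"}
print(json.dumps(message, indent=2))                      # "content": "\u3053\u3093\u306b\u3061\u306f"
print(json.dumps(message, indent=2, ensure_ascii=False))  # "content": "こんにちは"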
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: kiln-ai
-Version: 0.17.0
+Version: 0.18.0
 Summary: Kiln AI
 Project-URL: Homepage, https://getkiln.ai
 Project-URL: Repository, https://github.com/Kiln-AI/kiln
@@ -19,7 +19,7 @@ Requires-Dist: boto3>=1.37.10
 Requires-Dist: coverage>=7.6.4
 Requires-Dist: google-cloud-aiplatform>=1.84.0
 Requires-Dist: jsonschema>=4.23.0
-Requires-Dist: litellm>=1.
+Requires-Dist: litellm>=1.72.6
 Requires-Dist: openai>=1.53.0
 Requires-Dist: pdoc>=15.0.0
 Requires-Dist: pydantic>=2.9.2