kiln-ai 0.8.0__py3-none-any.whl → 0.11.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of kiln-ai might be problematic. Click here for more details.
- kiln_ai/adapters/__init__.py +7 -7
- kiln_ai/adapters/adapter_registry.py +77 -5
- kiln_ai/adapters/data_gen/data_gen_task.py +3 -3
- kiln_ai/adapters/data_gen/test_data_gen_task.py +23 -3
- kiln_ai/adapters/fine_tune/base_finetune.py +5 -1
- kiln_ai/adapters/fine_tune/dataset_formatter.py +310 -65
- kiln_ai/adapters/fine_tune/fireworks_finetune.py +47 -32
- kiln_ai/adapters/fine_tune/openai_finetune.py +12 -11
- kiln_ai/adapters/fine_tune/test_base_finetune.py +19 -0
- kiln_ai/adapters/fine_tune/test_dataset_formatter.py +469 -129
- kiln_ai/adapters/fine_tune/test_fireworks_tinetune.py +113 -21
- kiln_ai/adapters/fine_tune/test_openai_finetune.py +125 -14
- kiln_ai/adapters/ml_model_list.py +323 -94
- kiln_ai/adapters/model_adapters/__init__.py +18 -0
- kiln_ai/adapters/{base_adapter.py → model_adapters/base_adapter.py} +81 -37
- kiln_ai/adapters/{langchain_adapters.py → model_adapters/langchain_adapters.py} +130 -84
- kiln_ai/adapters/model_adapters/openai_compatible_config.py +11 -0
- kiln_ai/adapters/model_adapters/openai_model_adapter.py +246 -0
- kiln_ai/adapters/model_adapters/test_base_adapter.py +190 -0
- kiln_ai/adapters/{test_langchain_adapter.py → model_adapters/test_langchain_adapter.py} +103 -88
- kiln_ai/adapters/model_adapters/test_openai_model_adapter.py +225 -0
- kiln_ai/adapters/{test_saving_adapter_results.py → model_adapters/test_saving_adapter_results.py} +43 -15
- kiln_ai/adapters/{test_structured_output.py → model_adapters/test_structured_output.py} +93 -20
- kiln_ai/adapters/parsers/__init__.py +10 -0
- kiln_ai/adapters/parsers/base_parser.py +12 -0
- kiln_ai/adapters/parsers/json_parser.py +37 -0
- kiln_ai/adapters/parsers/parser_registry.py +19 -0
- kiln_ai/adapters/parsers/r1_parser.py +69 -0
- kiln_ai/adapters/parsers/test_json_parser.py +81 -0
- kiln_ai/adapters/parsers/test_parser_registry.py +32 -0
- kiln_ai/adapters/parsers/test_r1_parser.py +144 -0
- kiln_ai/adapters/prompt_builders.py +126 -20
- kiln_ai/adapters/provider_tools.py +91 -36
- kiln_ai/adapters/repair/repair_task.py +17 -6
- kiln_ai/adapters/repair/test_repair_task.py +4 -4
- kiln_ai/adapters/run_output.py +8 -0
- kiln_ai/adapters/test_adapter_registry.py +177 -0
- kiln_ai/adapters/test_generate_docs.py +69 -0
- kiln_ai/adapters/test_prompt_adaptors.py +8 -4
- kiln_ai/adapters/test_prompt_builders.py +190 -29
- kiln_ai/adapters/test_provider_tools.py +268 -46
- kiln_ai/datamodel/__init__.py +199 -12
- kiln_ai/datamodel/basemodel.py +31 -11
- kiln_ai/datamodel/json_schema.py +8 -3
- kiln_ai/datamodel/model_cache.py +8 -3
- kiln_ai/datamodel/test_basemodel.py +81 -2
- kiln_ai/datamodel/test_dataset_split.py +100 -3
- kiln_ai/datamodel/test_example_models.py +25 -4
- kiln_ai/datamodel/test_model_cache.py +24 -0
- kiln_ai/datamodel/test_model_perf.py +125 -0
- kiln_ai/datamodel/test_models.py +129 -0
- kiln_ai/utils/exhaustive_error.py +6 -0
- {kiln_ai-0.8.0.dist-info → kiln_ai-0.11.1.dist-info}/METADATA +9 -7
- kiln_ai-0.11.1.dist-info/RECORD +76 -0
- kiln_ai-0.8.0.dist-info/RECORD +0 -58
- {kiln_ai-0.8.0.dist-info → kiln_ai-0.11.1.dist-info}/WHEEL +0 -0
- {kiln_ai-0.8.0.dist-info → kiln_ai-0.11.1.dist-info}/licenses/LICENSE.txt +0 -0
|
@@ -5,6 +5,7 @@ from pydantic import ValidationError
|
|
|
5
5
|
from kiln_ai.datamodel import (
|
|
6
6
|
AllDatasetFilter,
|
|
7
7
|
AllSplitDefinition,
|
|
8
|
+
DatasetFilterType,
|
|
8
9
|
DatasetSplit,
|
|
9
10
|
DatasetSplitDefinition,
|
|
10
11
|
DataSource,
|
|
@@ -15,6 +16,8 @@ from kiln_ai.datamodel import (
|
|
|
15
16
|
TaskOutputRating,
|
|
16
17
|
TaskOutputRatingType,
|
|
17
18
|
TaskRun,
|
|
19
|
+
ThinkingModelDatasetFilter,
|
|
20
|
+
ThinkingModelHighRatedFilter,
|
|
18
21
|
Train60Test20Val20SplitDefinition,
|
|
19
22
|
Train80Test20SplitDefinition,
|
|
20
23
|
)
|
|
@@ -131,10 +134,33 @@ def test_all_dataset_filter(task_run):
|
|
|
131
134
|
|
|
132
135
|
|
|
133
136
|
def test_high_rating_dataset_filter(sample_task_runs):
|
|
137
|
+
num_high_quality = 0
|
|
138
|
+
num_low_quality = 0
|
|
134
139
|
for task_run in sample_task_runs:
|
|
135
|
-
|
|
136
|
-
|
|
140
|
+
if HighRatingDatasetFilter(task_run):
|
|
141
|
+
num_high_quality += 1
|
|
142
|
+
assert task_run.output.rating.is_high_quality() is True
|
|
143
|
+
else:
|
|
144
|
+
num_low_quality += 1
|
|
145
|
+
assert task_run.output.rating.is_high_quality() is False
|
|
146
|
+
|
|
147
|
+
# Test repaired output always considered high quality
|
|
148
|
+
task_run = task_run.model_copy(
|
|
149
|
+
update={
|
|
150
|
+
"repair_instructions": "repair instructions",
|
|
151
|
+
"repaired_output": TaskOutput(
|
|
152
|
+
output="repaired output",
|
|
153
|
+
source=DataSource(
|
|
154
|
+
type=DataSourceType.human,
|
|
155
|
+
properties={"created_by": "test-user"},
|
|
156
|
+
),
|
|
157
|
+
),
|
|
158
|
+
}
|
|
137
159
|
)
|
|
160
|
+
assert HighRatingDatasetFilter(task_run) is True
|
|
161
|
+
|
|
162
|
+
assert num_high_quality == 6
|
|
163
|
+
assert num_low_quality == 4
|
|
138
164
|
|
|
139
165
|
|
|
140
166
|
@pytest.mark.parametrize(
|
|
@@ -173,9 +199,11 @@ def test_dataset_split_with_high_rating_filter(sample_task, sample_task_runs):
|
|
|
173
199
|
"Split Name",
|
|
174
200
|
sample_task,
|
|
175
201
|
Train80Test20SplitDefinition,
|
|
176
|
-
|
|
202
|
+
filter_type=DatasetFilterType.HIGH_RATING,
|
|
177
203
|
)
|
|
178
204
|
|
|
205
|
+
assert dataset.filter == DatasetFilterType.HIGH_RATING
|
|
206
|
+
|
|
179
207
|
# Check that only high-rated task runs are included
|
|
180
208
|
all_ids = []
|
|
181
209
|
for ids in dataset.split_contents.values():
|
|
@@ -232,3 +260,72 @@ def test_smaller_sample(sample_task, sample_task_runs):
|
|
|
232
260
|
|
|
233
261
|
# Now we should have 0 missing runs. It's okay that dataset has newer data.
|
|
234
262
|
assert dataset.missing_count() == 0
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
@pytest.mark.parametrize(
|
|
266
|
+
"thinking_data,expected_result",
|
|
267
|
+
[
|
|
268
|
+
({"reasoning": "Here's my answer"}, True),
|
|
269
|
+
({"chain_of_thought": "Here's my answer"}, True),
|
|
270
|
+
({"unknown": "Here's my answer"}, False),
|
|
271
|
+
({}, False),
|
|
272
|
+
(None, False),
|
|
273
|
+
],
|
|
274
|
+
)
|
|
275
|
+
def test_thinking_model_dataset_filter(
|
|
276
|
+
sample_task_runs, thinking_data, expected_result
|
|
277
|
+
):
|
|
278
|
+
# Create a task run with thinking output
|
|
279
|
+
task_run = sample_task_runs[0].model_copy(
|
|
280
|
+
update={
|
|
281
|
+
"output": TaskOutput(
|
|
282
|
+
output="Let me think about this...\nHere's my answer",
|
|
283
|
+
source=DataSource(
|
|
284
|
+
type=DataSourceType.human,
|
|
285
|
+
properties={"created_by": "test-user"},
|
|
286
|
+
),
|
|
287
|
+
rating=TaskOutputRating(value=5, type=TaskOutputRatingType.five_star),
|
|
288
|
+
),
|
|
289
|
+
"intermediate_outputs": thinking_data,
|
|
290
|
+
}
|
|
291
|
+
)
|
|
292
|
+
|
|
293
|
+
assert ThinkingModelDatasetFilter(task_run) is expected_result
|
|
294
|
+
|
|
295
|
+
|
|
296
|
+
@pytest.mark.parametrize(
|
|
297
|
+
"thinking_data,rating,expected_result",
|
|
298
|
+
[
|
|
299
|
+
({"reasoning": "Here's my answer"}, 5, True),
|
|
300
|
+
({"chain_of_thought": "Here's my answer"}, 5, True),
|
|
301
|
+
({"unknown": "Here's my answer"}, 5, False),
|
|
302
|
+
({}, 5, False),
|
|
303
|
+
(None, 5, False),
|
|
304
|
+
({"reasoning": "Here's my answer"}, 1, False),
|
|
305
|
+
({"chain_of_thought": "Here's my answer"}, 1, False),
|
|
306
|
+
({"unknown": "Here's my answer"}, 1, False),
|
|
307
|
+
({}, 1, False),
|
|
308
|
+
(None, 1, False),
|
|
309
|
+
],
|
|
310
|
+
)
|
|
311
|
+
def test_thinking_model_dataset_filter_high_rated(
|
|
312
|
+
sample_task_runs, thinking_data, rating, expected_result
|
|
313
|
+
):
|
|
314
|
+
# Create a task run with thinking output
|
|
315
|
+
task_run = sample_task_runs[0].model_copy(
|
|
316
|
+
update={
|
|
317
|
+
"output": TaskOutput(
|
|
318
|
+
output="Let me think about this...\nHere's my answer",
|
|
319
|
+
source=DataSource(
|
|
320
|
+
type=DataSourceType.human,
|
|
321
|
+
properties={"created_by": "test-user"},
|
|
322
|
+
),
|
|
323
|
+
rating=TaskOutputRating(
|
|
324
|
+
value=rating, type=TaskOutputRatingType.five_star
|
|
325
|
+
),
|
|
326
|
+
),
|
|
327
|
+
"intermediate_outputs": thinking_data,
|
|
328
|
+
}
|
|
329
|
+
)
|
|
330
|
+
|
|
331
|
+
assert ThinkingModelHighRatedFilter(task_run) is expected_result
|
|
@@ -284,6 +284,9 @@ def test_task_output_requirement_rating_keys(tmp_path):
|
|
|
284
284
|
assert task_run.output.rating.requirement_ratings is not None
|
|
285
285
|
|
|
286
286
|
|
|
287
|
+
_schema_match = "This task requires a specific output schema. While the model produced JSON, that JSON didn't meet the schema."
|
|
288
|
+
|
|
289
|
+
|
|
287
290
|
def test_task_output_schema_validation(tmp_path):
|
|
288
291
|
# Create a project, task, and example hierarchy
|
|
289
292
|
project = Project(name="Test Project", path=(tmp_path / "test_project"))
|
|
@@ -321,12 +324,24 @@ def test_task_output_schema_validation(tmp_path):
|
|
|
321
324
|
task_output.save_to_file()
|
|
322
325
|
|
|
323
326
|
# changing to invalid output
|
|
324
|
-
with pytest.raises(
|
|
327
|
+
with pytest.raises(
|
|
328
|
+
ValueError,
|
|
329
|
+
match=_schema_match,
|
|
330
|
+
):
|
|
325
331
|
task_output.output.output = '{"name": "John Doe", "age": "thirty"}'
|
|
326
332
|
task_output.save_to_file()
|
|
327
333
|
|
|
334
|
+
# changing to invalid output from loaded model
|
|
335
|
+
loaded_task_output = TaskRun.load_from_file(task_output.path)
|
|
336
|
+
with pytest.raises(
|
|
337
|
+
ValueError,
|
|
338
|
+
match=_schema_match,
|
|
339
|
+
):
|
|
340
|
+
loaded_task_output.output.output = '{"name": "John Doe", "age": "forty"}'
|
|
341
|
+
loaded_task_output.save_to_file()
|
|
342
|
+
|
|
328
343
|
# Invalid case: output does not match task output schema
|
|
329
|
-
with pytest.raises(ValueError, match=
|
|
344
|
+
with pytest.raises(ValueError, match=_schema_match):
|
|
330
345
|
task_output = TaskRun(
|
|
331
346
|
input="Test input",
|
|
332
347
|
input_source=DataSource(
|
|
@@ -382,12 +397,18 @@ def test_task_input_schema_validation(tmp_path):
|
|
|
382
397
|
valid_task_output.save_to_file()
|
|
383
398
|
|
|
384
399
|
# Changing to invalid input
|
|
385
|
-
with pytest.raises(ValueError, match=
|
|
400
|
+
with pytest.raises(ValueError, match=_schema_match):
|
|
386
401
|
valid_task_output.input = '{"name": "John Doe", "age": "thirty"}'
|
|
387
402
|
valid_task_output.save_to_file()
|
|
388
403
|
|
|
404
|
+
# loading from file, then changing to invalid input
|
|
405
|
+
loaded_task_output = TaskRun.load_from_file(valid_task_output.path)
|
|
406
|
+
with pytest.raises(ValueError, match=_schema_match):
|
|
407
|
+
loaded_task_output.input = '{"name": "John Doe", "age": "thirty"}'
|
|
408
|
+
loaded_task_output.save_to_file()
|
|
409
|
+
|
|
389
410
|
# Invalid case: input does not match task input schema
|
|
390
|
-
with pytest.raises(ValueError, match=
|
|
411
|
+
with pytest.raises(ValueError, match=_schema_match):
|
|
391
412
|
task_output = TaskRun(
|
|
392
413
|
input='{"name": "John Doe", "age": "thirty"}',
|
|
393
414
|
input_source=DataSource(
|
|
@@ -242,3 +242,27 @@ def test_check_timestamp_granularity_linux_error():
|
|
|
242
242
|
cache = ModelCache()
|
|
243
243
|
assert cache._check_timestamp_granularity() is False
|
|
244
244
|
assert cache._enabled is False
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
def test_get_model_readonly(model_cache, test_path):
|
|
248
|
+
if not model_cache._enabled:
|
|
249
|
+
pytest.skip("Cache is disabled on this fs")
|
|
250
|
+
|
|
251
|
+
model = ModelTest(name="test", value=123)
|
|
252
|
+
mtime_ns = test_path.stat().st_mtime_ns
|
|
253
|
+
|
|
254
|
+
# Set the model in the cache
|
|
255
|
+
model_cache.set_model(test_path, model, mtime_ns)
|
|
256
|
+
|
|
257
|
+
# Get the model in readonly mode
|
|
258
|
+
readonly_model = model_cache.get_model(test_path, ModelTest, readonly=True)
|
|
259
|
+
# Get a regular (copied) model
|
|
260
|
+
copied_model = model_cache.get_model(test_path, ModelTest)
|
|
261
|
+
|
|
262
|
+
# The readonly model should be the exact same instance as the cached model
|
|
263
|
+
assert readonly_model is model_cache.model_cache[test_path][0]
|
|
264
|
+
# While the regular get should be a different instance
|
|
265
|
+
assert copied_model is not model_cache.model_cache[test_path][0]
|
|
266
|
+
|
|
267
|
+
# Both should have the same data
|
|
268
|
+
assert readonly_model == copied_model == model
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
import shutil
|
|
2
|
+
import uuid
|
|
3
|
+
|
|
4
|
+
import pytest
|
|
5
|
+
|
|
6
|
+
from kiln_ai.datamodel import (
|
|
7
|
+
DataSource,
|
|
8
|
+
DataSourceType,
|
|
9
|
+
Project,
|
|
10
|
+
Task,
|
|
11
|
+
TaskOutput,
|
|
12
|
+
TaskRun,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
test_json_schema = """{
|
|
16
|
+
"type": "object",
|
|
17
|
+
"properties": {
|
|
18
|
+
"setup": {
|
|
19
|
+
"description": "The setup of the joke",
|
|
20
|
+
"title": "Setup",
|
|
21
|
+
"type": "string"
|
|
22
|
+
},
|
|
23
|
+
"punchline": {
|
|
24
|
+
"description": "The punchline to the joke",
|
|
25
|
+
"title": "Punchline",
|
|
26
|
+
"type": "string"
|
|
27
|
+
},
|
|
28
|
+
"rating": {
|
|
29
|
+
"anyOf": [
|
|
30
|
+
{
|
|
31
|
+
"type": "integer"
|
|
32
|
+
},
|
|
33
|
+
{
|
|
34
|
+
"type": "null"
|
|
35
|
+
}
|
|
36
|
+
],
|
|
37
|
+
"default": null,
|
|
38
|
+
"description": "How funny the joke is, from 1 to 10",
|
|
39
|
+
"title": "Rating"
|
|
40
|
+
}
|
|
41
|
+
},
|
|
42
|
+
"required": [
|
|
43
|
+
"setup",
|
|
44
|
+
"punchline"
|
|
45
|
+
]
|
|
46
|
+
}
|
|
47
|
+
"""
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
@pytest.fixture
|
|
51
|
+
def task_run(tmp_path):
|
|
52
|
+
# setup a valid project/task/task_run for testing
|
|
53
|
+
output_source = DataSource(
|
|
54
|
+
type=DataSourceType.synthetic,
|
|
55
|
+
properties={
|
|
56
|
+
"model_name": "test-model",
|
|
57
|
+
"model_provider": "test-provider",
|
|
58
|
+
"adapter_name": "test-adapter",
|
|
59
|
+
},
|
|
60
|
+
)
|
|
61
|
+
|
|
62
|
+
project_path = tmp_path / "project.kiln"
|
|
63
|
+
project = Project(name="Test Project", path=project_path)
|
|
64
|
+
project.save_to_file()
|
|
65
|
+
task = Task(
|
|
66
|
+
name="Test Task",
|
|
67
|
+
instruction="Test Instruction",
|
|
68
|
+
parent=project,
|
|
69
|
+
output_json_schema=test_json_schema,
|
|
70
|
+
input_json_schema=test_json_schema,
|
|
71
|
+
)
|
|
72
|
+
|
|
73
|
+
task.save_to_file()
|
|
74
|
+
|
|
75
|
+
task_output = TaskOutput(
|
|
76
|
+
output='{"setup": "Why did the chicken cross the road?", "punchline": "To get to the other side"}',
|
|
77
|
+
source=DataSource(
|
|
78
|
+
type=DataSourceType.synthetic,
|
|
79
|
+
properties={
|
|
80
|
+
"model_name": "test-model",
|
|
81
|
+
"model_provider": "test-provider",
|
|
82
|
+
"adapter_name": "test-adapter",
|
|
83
|
+
},
|
|
84
|
+
),
|
|
85
|
+
)
|
|
86
|
+
|
|
87
|
+
# Save for later usage
|
|
88
|
+
task_run = TaskRun(
|
|
89
|
+
input='{"setup": "Why did the chicken cross the road?", "punchline": "To get to the other side"}',
|
|
90
|
+
input_source=output_source,
|
|
91
|
+
output=task_output,
|
|
92
|
+
)
|
|
93
|
+
task_run.parent = task
|
|
94
|
+
task_run.save_to_file()
|
|
95
|
+
|
|
96
|
+
return task_run
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
@pytest.mark.benchmark
|
|
100
|
+
def test_benchmark_load_from_file(benchmark, task_run):
|
|
101
|
+
task_run_path = task_run.path
|
|
102
|
+
|
|
103
|
+
iterations = 500
|
|
104
|
+
total_time = 0
|
|
105
|
+
|
|
106
|
+
for _ in range(iterations):
|
|
107
|
+
# Copy the task to a new temp path, so we don't get warm loads/cached loads
|
|
108
|
+
temp_path = task_run.path.parent / f"temp_task_run_{uuid.uuid4()}.json"
|
|
109
|
+
shutil.copy(str(task_run_path), str(temp_path))
|
|
110
|
+
|
|
111
|
+
# only time loading the model (and one accessor for delayed validation)
|
|
112
|
+
start_time = benchmark._timer()
|
|
113
|
+
loaded = TaskRun.load_from_file(temp_path)
|
|
114
|
+
assert loaded.id == task_run.id
|
|
115
|
+
end_time = benchmark._timer()
|
|
116
|
+
|
|
117
|
+
total_time += end_time - start_time
|
|
118
|
+
|
|
119
|
+
avg_time_per_iteration = total_time / iterations
|
|
120
|
+
ops_per_second = 1.0 / avg_time_per_iteration
|
|
121
|
+
|
|
122
|
+
# I get 8k ops per second on my MBP. Lower value here for CI.
|
|
123
|
+
# Prior to optimization was 290 ops per second.
|
|
124
|
+
if ops_per_second < 1000:
|
|
125
|
+
pytest.fail(f"Ops per second: {ops_per_second:.6f}, expected more than 1k ops")
|
kiln_ai/datamodel/test_models.py
CHANGED
|
@@ -9,7 +9,9 @@ from kiln_ai.datamodel import (
|
|
|
9
9
|
DataSource,
|
|
10
10
|
DataSourceType,
|
|
11
11
|
Finetune,
|
|
12
|
+
FinetuneDataStrategy,
|
|
12
13
|
Project,
|
|
14
|
+
Prompt,
|
|
13
15
|
Task,
|
|
14
16
|
TaskOutput,
|
|
15
17
|
TaskRun,
|
|
@@ -70,6 +72,20 @@ def test_save_to_file(test_project_file):
|
|
|
70
72
|
assert data["description"] == "Test Description"
|
|
71
73
|
|
|
72
74
|
|
|
75
|
+
def test_save_to_file_non_ascii(test_project_file):
|
|
76
|
+
project = Project(
|
|
77
|
+
name="Test Project", description="Chúc mừng!", path=test_project_file
|
|
78
|
+
)
|
|
79
|
+
project.save_to_file()
|
|
80
|
+
|
|
81
|
+
with open(test_project_file, "r", encoding="utf-8") as file:
|
|
82
|
+
data = json.load(file)
|
|
83
|
+
|
|
84
|
+
assert data["v"] == 1
|
|
85
|
+
assert data["name"] == "Test Project"
|
|
86
|
+
assert data["description"] == "Chúc mừng!"
|
|
87
|
+
|
|
88
|
+
|
|
73
89
|
def test_task_defaults():
|
|
74
90
|
task = Task(name="Test Task", instruction="Test Instruction")
|
|
75
91
|
assert task.description is None
|
|
@@ -488,3 +504,116 @@ def test_task_run_tags_validation():
|
|
|
488
504
|
tags=["valid_tag", "invalid tag"],
|
|
489
505
|
)
|
|
490
506
|
assert "Tags cannot contain spaces. Try underscores." in str(exc_info.value)
|
|
507
|
+
|
|
508
|
+
|
|
509
|
+
def test_prompt_validation():
|
|
510
|
+
prompt = Prompt(name="Test Prompt Name", prompt="Test Prompt")
|
|
511
|
+
assert prompt.name == "Test Prompt Name"
|
|
512
|
+
assert prompt.prompt == "Test Prompt"
|
|
513
|
+
|
|
514
|
+
with pytest.raises(ValidationError):
|
|
515
|
+
Prompt(name="Test Prompt")
|
|
516
|
+
|
|
517
|
+
with pytest.raises(ValidationError):
|
|
518
|
+
Prompt(name="Test Prompt", prompt=None)
|
|
519
|
+
|
|
520
|
+
with pytest.raises(ValidationError):
|
|
521
|
+
Prompt(name="Test Prompt", prompt="")
|
|
522
|
+
|
|
523
|
+
with pytest.raises(ValidationError):
|
|
524
|
+
Prompt(prompt="Test Prompt")
|
|
525
|
+
|
|
526
|
+
|
|
527
|
+
def test_prompt_parent_task():
|
|
528
|
+
task = Task(name="Test Task", instruction="Test Instruction")
|
|
529
|
+
prompt = Prompt(name="Test Prompt", prompt="Test Prompt", parent=task)
|
|
530
|
+
assert prompt.parent == task
|
|
531
|
+
|
|
532
|
+
|
|
533
|
+
@pytest.mark.parametrize(
|
|
534
|
+
"thinking_instructions,data_strategy,should_raise,expected_message",
|
|
535
|
+
[
|
|
536
|
+
# Test 1: Valid case - no thinking instructions with final_only
|
|
537
|
+
(
|
|
538
|
+
None,
|
|
539
|
+
FinetuneDataStrategy.final_only,
|
|
540
|
+
False,
|
|
541
|
+
None,
|
|
542
|
+
),
|
|
543
|
+
# Test 2: Valid case - thinking instructions with final_and_intermediate
|
|
544
|
+
(
|
|
545
|
+
"Think step by step",
|
|
546
|
+
FinetuneDataStrategy.final_and_intermediate,
|
|
547
|
+
False,
|
|
548
|
+
None,
|
|
549
|
+
),
|
|
550
|
+
# Test 3: Invalid case - thinking instructions with final_only
|
|
551
|
+
(
|
|
552
|
+
"Think step by step",
|
|
553
|
+
FinetuneDataStrategy.final_only,
|
|
554
|
+
True,
|
|
555
|
+
"Thinking instructions can only be used when data_strategy is final_and_intermediate",
|
|
556
|
+
),
|
|
557
|
+
# Test 4: Invalid case - no thinking instructions with final_and_intermediate
|
|
558
|
+
(
|
|
559
|
+
None,
|
|
560
|
+
FinetuneDataStrategy.final_and_intermediate,
|
|
561
|
+
True,
|
|
562
|
+
"Thinking instructions are required when data_strategy is final_and_intermediate",
|
|
563
|
+
),
|
|
564
|
+
],
|
|
565
|
+
)
|
|
566
|
+
def test_finetune_thinking_instructions_validation(
|
|
567
|
+
thinking_instructions, data_strategy, should_raise, expected_message
|
|
568
|
+
):
|
|
569
|
+
base_params = {
|
|
570
|
+
"name": "test-finetune",
|
|
571
|
+
"provider": "openai",
|
|
572
|
+
"base_model_id": "gpt-3.5-turbo",
|
|
573
|
+
"dataset_split_id": "split1",
|
|
574
|
+
"system_message": "test message",
|
|
575
|
+
"data_strategy": data_strategy,
|
|
576
|
+
}
|
|
577
|
+
|
|
578
|
+
if thinking_instructions is not None:
|
|
579
|
+
base_params["thinking_instructions"] = thinking_instructions
|
|
580
|
+
|
|
581
|
+
if should_raise:
|
|
582
|
+
with pytest.raises(ValueError) as exc_info:
|
|
583
|
+
Finetune(**base_params)
|
|
584
|
+
assert expected_message in str(exc_info.value)
|
|
585
|
+
else:
|
|
586
|
+
finetune = Finetune(**base_params)
|
|
587
|
+
assert finetune.thinking_instructions == thinking_instructions
|
|
588
|
+
assert finetune.data_strategy == data_strategy
|
|
589
|
+
|
|
590
|
+
|
|
591
|
+
@pytest.mark.parametrize(
|
|
592
|
+
"intermediate_outputs,expected",
|
|
593
|
+
[
|
|
594
|
+
# No intermediate outputs
|
|
595
|
+
(None, False),
|
|
596
|
+
# Empty intermediate outputs
|
|
597
|
+
({}, False),
|
|
598
|
+
# Only chain_of_thought
|
|
599
|
+
({"chain_of_thought": "thinking process"}, True),
|
|
600
|
+
# Only reasoning
|
|
601
|
+
({"reasoning": "reasoning process"}, True),
|
|
602
|
+
# Both chain_of_thought and reasoning
|
|
603
|
+
(
|
|
604
|
+
{"chain_of_thought": "thinking process", "reasoning": "reasoning process"},
|
|
605
|
+
True,
|
|
606
|
+
),
|
|
607
|
+
# Other intermediate outputs but no thinking data
|
|
608
|
+
({"other_output": "some data"}, False),
|
|
609
|
+
# Mixed other outputs with thinking data
|
|
610
|
+
({"chain_of_thought": "thinking process", "other_output": "some data"}, True),
|
|
611
|
+
],
|
|
612
|
+
)
|
|
613
|
+
def test_task_run_has_thinking_training_data(intermediate_outputs, expected):
|
|
614
|
+
task_run = TaskRun(
|
|
615
|
+
input="test input",
|
|
616
|
+
output=TaskOutput(output="test output"),
|
|
617
|
+
intermediate_outputs=intermediate_outputs,
|
|
618
|
+
)
|
|
619
|
+
assert task_run.has_thinking_training_data() == expected
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: kiln-ai
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.11.1
|
|
4
4
|
Summary: Kiln AI
|
|
5
5
|
Project-URL: Homepage, https://getkiln.ai
|
|
6
6
|
Project-URL: Repository, https://github.com/Kiln-AI/kiln
|
|
@@ -19,8 +19,7 @@ Requires-Dist: jsonschema>=4.23.0
|
|
|
19
19
|
Requires-Dist: langchain-aws>=0.2.4
|
|
20
20
|
Requires-Dist: langchain-fireworks>=0.2.5
|
|
21
21
|
Requires-Dist: langchain-groq>=0.2.0
|
|
22
|
-
Requires-Dist: langchain-ollama>=0.2.
|
|
23
|
-
Requires-Dist: langchain-openai>=0.2.4
|
|
22
|
+
Requires-Dist: langchain-ollama>=0.2.2
|
|
24
23
|
Requires-Dist: langchain>=0.3.5
|
|
25
24
|
Requires-Dist: openai>=1.53.0
|
|
26
25
|
Requires-Dist: pdoc>=15.0.0
|
|
@@ -72,7 +71,7 @@ The library has a [comprehensive set of docs](https://kiln-ai.github.io/Kiln/kil
|
|
|
72
71
|
- [Load an Existing Dataset into a Kiln Task Dataset](#load-an-existing-dataset-into-a-kiln-task-dataset)
|
|
73
72
|
- [Using your Kiln Dataset in a Notebook or Project](#using-your-kiln-dataset-in-a-notebook-or-project)
|
|
74
73
|
- [Using Kiln Dataset in Pandas](#using-kiln-dataset-in-pandas)
|
|
75
|
-
- [
|
|
74
|
+
- [Full API Reference](#full-api-reference)
|
|
76
75
|
|
|
77
76
|
## Installation
|
|
78
77
|
|
|
@@ -95,11 +94,14 @@ The Kiln Python library provides a set of Python classes that which help you eas
|
|
|
95
94
|
|
|
96
95
|
### Datamodel Overview
|
|
97
96
|
|
|
97
|
+
Here's a high level overview of the Kiln datamodel. A project folder will reflect this nested structure:
|
|
98
|
+
|
|
98
99
|
- Project: a Kiln Project that organizes related tasks
|
|
99
100
|
- Task: a specific task including prompt instructions, input/output schemas, and requirements
|
|
100
101
|
- TaskRun: a sample (run) of a task including input, output and human rating information
|
|
101
|
-
- DatasetSplit: a frozen collection of task runs divided into train/test/validation splits
|
|
102
102
|
- Finetune: configuration and status tracking for fine-tuning models on task data
|
|
103
|
+
- Prompt: a prompt for this task
|
|
104
|
+
- DatasetSplit: a frozen collection of task runs divided into train/test/validation splits
|
|
103
105
|
|
|
104
106
|
### Load a Project
|
|
105
107
|
|
|
@@ -230,8 +232,8 @@ final_df = pd.concat(dfs, ignore_index=True)
|
|
|
230
232
|
print(final_df)
|
|
231
233
|
```
|
|
232
234
|
|
|
233
|
-
|
|
235
|
+
## Full API Reference
|
|
234
236
|
|
|
235
237
|
The library can do a lot more than the examples we've shown here.
|
|
236
238
|
|
|
237
|
-
See the [docs](https://kiln-ai.github.io/Kiln/kiln_core_docs/index.html)
|
|
239
|
+
See the full API reference in the [docs](https://kiln-ai.github.io/Kiln/kiln_core_docs/index.html) under the `Submodules` section of the sidebar.
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
kiln_ai/__init__.py,sha256=Sc4z8LRVFMwJUoc_DPVUriSXTZ6PO9MaJ80PhRbKyB8,34
|
|
2
|
+
kiln_ai/adapters/__init__.py,sha256=4qEnFkkRSHPKDU7AvYNkqXECjZO_K7PzDCK3HbsY7o4,902
|
|
3
|
+
kiln_ai/adapters/adapter_registry.py,sha256=C4ayYVXRClj5-EEEpJEuMEMesbrJgPxz5yRBv6cyLGQ,3697
|
|
4
|
+
kiln_ai/adapters/ml_model_list.py,sha256=4V6IinAhgYcGba3Kw3Ps2pJcHv5v7NC6Ob-8JN7fh3Y,36240
|
|
5
|
+
kiln_ai/adapters/ollama_tools.py,sha256=0Of6ySbJ2d4j--9laOL6QKgRUQSrqX8dJUIrz20n59s,3561
|
|
6
|
+
kiln_ai/adapters/prompt_builders.py,sha256=gnBf4SI_uz6W9gpkRaKOK9oHQX6eTjdSbKX-Tpaj2gQ,13906
|
|
7
|
+
kiln_ai/adapters/provider_tools.py,sha256=CGNLW0xhFyj93HFwznCoDrrbyQbQAaS2mJuOKaMB6gU,14435
|
|
8
|
+
kiln_ai/adapters/run_output.py,sha256=jANUFb_O_P47aWyA_YjLSl2y0Z7QFWPEUsOGE54BGY4,159
|
|
9
|
+
kiln_ai/adapters/test_adapter_registry.py,sha256=obzcaifFLbgcokJwOCrhBe3IU-U3v4rPVBZya7p_hUM,6209
|
|
10
|
+
kiln_ai/adapters/test_generate_docs.py,sha256=a0eO4fJdHhmvhB7aM3FW4nPYw-fqy1s5B6ZVDPRZtpQ,2730
|
|
11
|
+
kiln_ai/adapters/test_ollama_tools.py,sha256=2KwYVaj3ySV3ld-z51TCGbJEMdb3MZj2eoEicIWz3Q4,2552
|
|
12
|
+
kiln_ai/adapters/test_prompt_adaptors.py,sha256=Z-eGy206sDDXhsfngUOuYEUnaerPDOrQb64hipRxfW4,7550
|
|
13
|
+
kiln_ai/adapters/test_prompt_builders.py,sha256=I6d888fIFDYzIp8DbnDWCrbRj8nlieLVrgiZzgqMV0s,20200
|
|
14
|
+
kiln_ai/adapters/test_provider_tools.py,sha256=DtnC6oFuiBvvbhD-kdCcWzEYqXZfMBM_DexuQdyAVR8,28664
|
|
15
|
+
kiln_ai/adapters/data_gen/__init__.py,sha256=QTZWaf7kq5BorhPvexJfwDEKmjRmIbhwW9ei8LW2SIs,276
|
|
16
|
+
kiln_ai/adapters/data_gen/data_gen_prompts.py,sha256=kudjHnAz7L3q0k_NLyTlaIV7M0uRFrxXNcfcnjOE2uc,5810
|
|
17
|
+
kiln_ai/adapters/data_gen/data_gen_task.py,sha256=0v7ufvuSeY5j-HerfrqXuCo30SYlS6EFIiVyiJM9xj0,5986
|
|
18
|
+
kiln_ai/adapters/data_gen/test_data_gen_task.py,sha256=cRKUKMvC0uVompbmPTKwbnQ_N3c0cQDm4J_9H4Y5U18,10129
|
|
19
|
+
kiln_ai/adapters/fine_tune/__init__.py,sha256=DxdTR60chwgck1aEoVYWyfWi6Ed2ZkdJj0lar-SEAj4,257
|
|
20
|
+
kiln_ai/adapters/fine_tune/base_finetune.py,sha256=n3mfE_3bhhzmN_MQxO5qNezN-qpl4WFamZ3ih41dx4o,6069
|
|
21
|
+
kiln_ai/adapters/fine_tune/dataset_formatter.py,sha256=qRhSSkMhTWn13OMb6LKPVwAU7uY4bB49GDiVSuhDkNg,14449
|
|
22
|
+
kiln_ai/adapters/fine_tune/finetune_registry.py,sha256=H1B-opCTlIyd9JlIFTKsY_ctxUX9ziEc49_gnmg1SZg,483
|
|
23
|
+
kiln_ai/adapters/fine_tune/fireworks_finetune.py,sha256=6IfTDn_8tg6PR0OFudRx6V7Wjvf4P7t0fm_xyRwII68,13978
|
|
24
|
+
kiln_ai/adapters/fine_tune/openai_finetune.py,sha256=Dz9E_0BWfrIkvv8ArZe-RKPwbIKPZ3v8rfbc3JELyTY,8571
|
|
25
|
+
kiln_ai/adapters/fine_tune/test_base_finetune.py,sha256=0zWxFYrDGVuoQNQmi9vVUEkBc4mstfHnsUjQmiJA-sE,10864
|
|
26
|
+
kiln_ai/adapters/fine_tune/test_dataset_formatter.py,sha256=wknJZDgkoj61O1TO5_HLTSdKZjvb78pmTyBRUBfCtYw,23968
|
|
27
|
+
kiln_ai/adapters/fine_tune/test_fireworks_tinetune.py,sha256=qQCBUa6kje0P83vH7MWioVQ7IVPti_TFdJG_QaBwqjY,18105
|
|
28
|
+
kiln_ai/adapters/fine_tune/test_openai_finetune.py,sha256=-QHKNr5LICTOoCDavGxAUnWoAWvL3eMWHywpMf69n1U,19972
|
|
29
|
+
kiln_ai/adapters/model_adapters/__init__.py,sha256=FyNq-twr1zQR55Hd_sDlTcVZ8JsZ9jrIBBURtZNjlss,272
|
|
30
|
+
kiln_ai/adapters/model_adapters/base_adapter.py,sha256=f91QR2mmn3x6bTfRFbgWe6N47aIB8ABpQjgv8Scru5Y,9183
|
|
31
|
+
kiln_ai/adapters/model_adapters/langchain_adapters.py,sha256=JCCaXSFgtiNkr67tv4DRZRdmxGvq2JOkrQdC7H__ybw,12606
|
|
32
|
+
kiln_ai/adapters/model_adapters/openai_compatible_config.py,sha256=o7Ae2uWm1bdqOZaSBbsjH1CH9bnSa1woqTHvP8ds0mY,286
|
|
33
|
+
kiln_ai/adapters/model_adapters/openai_model_adapter.py,sha256=nSr_IcubCB7Dk6zkXsjvLrTAvnTedvHpXPZekiYK_cg,9545
|
|
34
|
+
kiln_ai/adapters/model_adapters/test_base_adapter.py,sha256=rRN76HPepuXKmAEzZCJjK_cHZ6rJBLQ2poYpWOpxOaE,6408
|
|
35
|
+
kiln_ai/adapters/model_adapters/test_langchain_adapter.py,sha256=UyQ-jdTFUpXHTO40nxHELIsV_YRA_RgF1xkAJ5oO-dM,12156
|
|
36
|
+
kiln_ai/adapters/model_adapters/test_openai_model_adapter.py,sha256=3pSLHug_GPRpv5C9zE_hivOsbWgs5nLohS6DxtPz1jo,6914
|
|
37
|
+
kiln_ai/adapters/model_adapters/test_saving_adapter_results.py,sha256=iIjI4Ox3wUpv6g1dcQ3zDFpma0MUqljJ_Lqbo8MbQuw,7393
|
|
38
|
+
kiln_ai/adapters/model_adapters/test_structured_output.py,sha256=BipXAzkoKZuixwG2tPyqle9INuYd3teHDP12arncaXA,11871
|
|
39
|
+
kiln_ai/adapters/parsers/__init__.py,sha256=TGJS_8JhjUwg5Bnq4cDmwt5eIRo4vowmcL2A72L1Hzk,202
|
|
40
|
+
kiln_ai/adapters/parsers/base_parser.py,sha256=DaoZVEOOuFTMZd5ZTpl_as6-xc9NPWGP2fAmP12J58M,389
|
|
41
|
+
kiln_ai/adapters/parsers/json_parser.py,sha256=IszrBrhIFrrVr76UZsuejkBdqpZG27mU72264HVgVzE,1274
|
|
42
|
+
kiln_ai/adapters/parsers/parser_registry.py,sha256=G9bAZrnWrR0a82JAQHsSqA2o7-CjrZUBANZljY_6ZxE,623
|
|
43
|
+
kiln_ai/adapters/parsers/r1_parser.py,sha256=9nMEWDAbRSTFuu_--0HMVfVg9IYSoUNQHHw9OxETlRw,2558
|
|
44
|
+
kiln_ai/adapters/parsers/test_json_parser.py,sha256=9kdWe_vRC5wjP8A1Ym6Zu6enDIz4ARCNiRpcZr7_3ak,1971
|
|
45
|
+
kiln_ai/adapters/parsers/test_parser_registry.py,sha256=S4MdX7cnhCbmeKq8tZwMwRdGWr-019Z-fw5zey9Wm08,1043
|
|
46
|
+
kiln_ai/adapters/parsers/test_r1_parser.py,sha256=Ys1ICRNVgt54rf8IEKNav5sz9zHYvvcVAUuoSwwftg8,4517
|
|
47
|
+
kiln_ai/adapters/repair/__init__.py,sha256=dOO9MEpEhjiwzDVFg3MNfA2bKMPlax9iekDatpTkX8E,217
|
|
48
|
+
kiln_ai/adapters/repair/repair_task.py,sha256=xX7GI3QZ9OEWV1BGr6OAAAFvHu3h6dYqJgOpqwRADmI,3842
|
|
49
|
+
kiln_ai/adapters/repair/test_repair_task.py,sha256=2pxEwnmJCN_z35_kTNiaFSvnTw_njRinmHV1P9Y7UQw,7974
|
|
50
|
+
kiln_ai/datamodel/__init__.py,sha256=aWGqXz4H-NGDFGBHcnArUVj3o-WgJsRACJ3QHF--8-k,36079
|
|
51
|
+
kiln_ai/datamodel/basemodel.py,sha256=TwMBfNFJ7-5bp2QOoTQUl_YVrF0pkDAk5Rdk6EWEXxI,22143
|
|
52
|
+
kiln_ai/datamodel/json_schema.py,sha256=VjjYkzy8X-QZqOQNZH6x7KitrtdcmZNttVP49iqBJAk,2817
|
|
53
|
+
kiln_ai/datamodel/model_cache.py,sha256=9X4aAigbkFdytckgw8InCMh86uBna0ME_1HJSeMPEn0,4495
|
|
54
|
+
kiln_ai/datamodel/registry.py,sha256=XwGFXJFKZtOpR1Z9ven6SftggfADdZRm8TFxCEVtfUQ,957
|
|
55
|
+
kiln_ai/datamodel/test_basemodel.py,sha256=PqBlx5gIN3DxYjDuLzmNYIp-VwMvOQ9P5hOjPfOs-9g,17813
|
|
56
|
+
kiln_ai/datamodel/test_dataset_split.py,sha256=7DXu3WaIGZ4Tyj0ahgrK7jRChDwU1eolXEdz4urkWNc,10530
|
|
57
|
+
kiln_ai/datamodel/test_datasource.py,sha256=GAiZz31qezVVPwFqnt8wHMu15WvtlV89jw8C1Ue6YNI,3165
|
|
58
|
+
kiln_ai/datamodel/test_example_models.py,sha256=ekAxweFBtkryheBFJMQvy0TkWvjNtlqBsJv3Pcu5uNE,21048
|
|
59
|
+
kiln_ai/datamodel/test_json_schema.py,sha256=vdLnTQxxrcmuSrf6iOmkrmpfh7JnxqIw4B4dbDAAcZ4,3199
|
|
60
|
+
kiln_ai/datamodel/test_model_cache.py,sha256=Fy-ucYNzS5JEG-8SFY4nVHA8iRbXXxai20f8_oGl97o,8184
|
|
61
|
+
kiln_ai/datamodel/test_model_perf.py,sha256=NdD7L8XraGkunaEKGPsfYwdcbIgdjhFanOO3G6hU158,3235
|
|
62
|
+
kiln_ai/datamodel/test_models.py,sha256=s3zjlnN3zbSjPyLfz3PDABO8msKeQXDRd4WrpkLWrOE,19274
|
|
63
|
+
kiln_ai/datamodel/test_nested_save.py,sha256=xciCddqvPyKyoyjC5Lx_3Kh1t4LJv1xYRAPazR3SRcs,5588
|
|
64
|
+
kiln_ai/datamodel/test_output_rating.py,sha256=zvPIp2shAgCs2RQBgwYoL09fRA3krHvgAqUa91RlWR0,15125
|
|
65
|
+
kiln_ai/datamodel/test_registry.py,sha256=PhS4anLi5Bf_023obuTlO5DALhtPB8WIc_bX12Yg6Po,2705
|
|
66
|
+
kiln_ai/utils/__init__.py,sha256=PTD0MwBCKAMIOGsTAwsFaJOusTJJoRFTfOGqRvCaU-E,142
|
|
67
|
+
kiln_ai/utils/config.py,sha256=u289b2AHuQoPup_vILTSpgsO29fxJyU8zy8BwADAtvs,6859
|
|
68
|
+
kiln_ai/utils/exhaustive_error.py,sha256=TkkRixIAR3CPEKHeAJzyv0mtxp6BxUBKMvobA3vzQug,262
|
|
69
|
+
kiln_ai/utils/formatting.py,sha256=VtB9oag0lOGv17dwT7OPX_3HzBfaU9GsLH-iLete0yM,97
|
|
70
|
+
kiln_ai/utils/name_generator.py,sha256=v26TgpCwQbhQFcZvzgjZvURinjrOyyFhxpsI6NQrHKc,1914
|
|
71
|
+
kiln_ai/utils/test_config.py,sha256=Jw3nMFeIgZUsZDRJJY2HpB-2EkR2NoZ-rDe_o9oA7ws,9174
|
|
72
|
+
kiln_ai/utils/test_name_geneator.py,sha256=9-hSTBshyakqlPbFnNcggwLrL7lcPTitauBYHg9jFWI,1513
|
|
73
|
+
kiln_ai-0.11.1.dist-info/METADATA,sha256=Dvlb27BCuxsJOpZHogOzrnlCYiL6FBLFCtCxu3gmee8,9217
|
|
74
|
+
kiln_ai-0.11.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
75
|
+
kiln_ai-0.11.1.dist-info/licenses/LICENSE.txt,sha256=_NA5pnTYgRRr4qH6lE3X-TuZJ8iRcMUi5ASoGr-lEx8,1209
|
|
76
|
+
kiln_ai-0.11.1.dist-info/RECORD,,
|