completion-kit 0.20.3 → 0.20.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/assets/javascripts/completion_kit/application.js +70 -0
- data/app/assets/stylesheets/completion_kit/application.css +13 -3
- data/app/controllers/completion_kit/metrics_controller.rb +10 -1
- data/app/controllers/completion_kit/runs_controller.rb +1 -1
- data/app/models/completion_kit/run.rb +3 -3
- data/app/services/completion_kit/mcp_tools/judges.rb +1 -1
- data/app/services/completion_kit/mcp_tools/prompts.rb +2 -2
- data/app/services/completion_kit/mcp_tools/runs.rb +1 -1
- data/app/views/completion_kit/api_reference/_body.html.erb +15 -1
- data/app/views/completion_kit/metrics/_form.html.erb +16 -14
- data/app/views/completion_kit/runs/_form.html.erb +2 -2
- data/app/views/completion_kit/runs/_row.html.erb +1 -1
- data/app/views/completion_kit/runs/_status_header.html.erb +1 -1
- data/lib/completion_kit/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: e49a3e185722be44be75a0e236862942841720af9f8b6bdcfe233778968d26ab
|
|
4
|
+
data.tar.gz: 1545324b88bc8eef05f507d71a80e06804d4a9d3c65f99010b76610657ffa021
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 21f1b48c9ed2ba23b111eb1ff733ac62f180050ce3e1edd9ea31719ee25cf557025af86e00f13be27b6e04a79afb677ced23fa7e7e5a9580715249533559338c
|
|
7
|
+
data.tar.gz: a7799c294e109c42a585f9fc7aa01e94d389291dd3fc1eec6821123ea4cb066e5cb805b803dfcf935f771b494bf1324fd7f5787f781746cfd0290d6c97b93641
|
|
@@ -210,6 +210,76 @@ document.addEventListener("click", function(e) {
|
|
|
210
210
|
});
|
|
211
211
|
});
|
|
212
212
|
|
|
213
|
+
var CK_CHECK_FIELDS = {
|
|
214
|
+
contains: ["value", "case_sensitive", "trim"],
|
|
215
|
+
not_contains: ["value", "case_sensitive", "trim"],
|
|
216
|
+
equals: ["value", "case_sensitive", "trim"],
|
|
217
|
+
regex: ["pattern", "case_sensitive", "multiline"],
|
|
218
|
+
valid_json: [],
|
|
219
|
+
json_path_equals: ["json_path", "expected"],
|
|
220
|
+
length_bounds: ["min", "max"],
|
|
221
|
+
no_refusal: []
|
|
222
|
+
};
|
|
223
|
+
|
|
224
|
+
function ckApplyCheckFields(scope) {
|
|
225
|
+
if (!scope) return;
|
|
226
|
+
var kindSelect = scope.querySelector('[name="metric[check_config][check_kind]"]');
|
|
227
|
+
if (!kindSelect) return;
|
|
228
|
+
var visible = CK_CHECK_FIELDS[kindSelect.value];
|
|
229
|
+
var targetSelect = scope.querySelector('[name="metric[check_config][target]"]');
|
|
230
|
+
var targetIsJsonPath = !!(targetSelect && targetSelect.value === "json_path");
|
|
231
|
+
scope.querySelectorAll("[data-ck-check-field]").forEach(function(field) {
|
|
232
|
+
var key = field.getAttribute("data-ck-check-field");
|
|
233
|
+
var show;
|
|
234
|
+
if (key === "target_path") {
|
|
235
|
+
show = targetIsJsonPath;
|
|
236
|
+
} else if (!visible) {
|
|
237
|
+
show = true;
|
|
238
|
+
} else {
|
|
239
|
+
show = visible.indexOf(key) !== -1;
|
|
240
|
+
}
|
|
241
|
+
field.hidden = !show;
|
|
242
|
+
});
|
|
243
|
+
}
|
|
244
|
+
|
|
245
|
+
function ckApplyMetricType(group) {
|
|
246
|
+
var checked = group.querySelector('input[type="radio"]:checked');
|
|
247
|
+
if (!checked) return;
|
|
248
|
+
var value = checked.value;
|
|
249
|
+
var scope = group.closest("form") || document;
|
|
250
|
+
scope.querySelectorAll("[data-ck-metric-editor]").forEach(function(editor) {
|
|
251
|
+
var active = editor.getAttribute("data-ck-metric-editor") === value;
|
|
252
|
+
editor.hidden = !active;
|
|
253
|
+
editor.querySelectorAll("input, select, textarea").forEach(function(field) {
|
|
254
|
+
field.disabled = !active;
|
|
255
|
+
});
|
|
256
|
+
});
|
|
257
|
+
ckApplyCheckFields(scope);
|
|
258
|
+
}
|
|
259
|
+
|
|
260
|
+
document.addEventListener("turbo:load", function() {
|
|
261
|
+
document.querySelectorAll("[data-ck-metric-type]").forEach(function(group) {
|
|
262
|
+
ckApplyMetricType(group);
|
|
263
|
+
});
|
|
264
|
+
document.querySelectorAll('[data-ck-metric-editor="check"]').forEach(function(editor) {
|
|
265
|
+
ckApplyCheckFields(editor);
|
|
266
|
+
});
|
|
267
|
+
});
|
|
268
|
+
|
|
269
|
+
document.addEventListener("change", function(e) {
|
|
270
|
+
var target = e.target;
|
|
271
|
+
if (!target || !target.closest) return;
|
|
272
|
+
var group = target.closest("[data-ck-metric-type]");
|
|
273
|
+
if (group && target.type === "radio") {
|
|
274
|
+
ckApplyMetricType(group);
|
|
275
|
+
return;
|
|
276
|
+
}
|
|
277
|
+
if (target.name === "metric[check_config][check_kind]" || target.name === "metric[check_config][target]") {
|
|
278
|
+
var scope = target.closest('[data-ck-metric-editor="check"]') || target.closest("form");
|
|
279
|
+
ckApplyCheckFields(scope);
|
|
280
|
+
}
|
|
281
|
+
});
|
|
282
|
+
|
|
213
283
|
document.addEventListener("click", function(e) {
|
|
214
284
|
var btn = e.target.closest("[data-ck-apply]");
|
|
215
285
|
if (!btn) return;
|
|
@@ -1922,6 +1922,13 @@ label.ck-checkbox input {
|
|
|
1922
1922
|
cursor: pointer;
|
|
1923
1923
|
}
|
|
1924
1924
|
|
|
1925
|
+
.ck-radio-info {
|
|
1926
|
+
width: 16px;
|
|
1927
|
+
height: 16px;
|
|
1928
|
+
color: var(--ck-muted);
|
|
1929
|
+
cursor: help;
|
|
1930
|
+
}
|
|
1931
|
+
|
|
1925
1932
|
.ck-field-row {
|
|
1926
1933
|
display: flex;
|
|
1927
1934
|
gap: 1rem;
|
|
@@ -3238,7 +3245,8 @@ select.ck-input {
|
|
|
3238
3245
|
#ck-tab-metric-groups:checked ~ .ck-api-tabs__nav label[for="ck-tab-metric-groups"],
|
|
3239
3246
|
#ck-tab-agreements:checked ~ .ck-api-tabs__nav label[for="ck-tab-agreements"],
|
|
3240
3247
|
#ck-tab-tags:checked ~ .ck-api-tabs__nav label[for="ck-tab-tags"],
|
|
3241
|
-
#ck-tab-providers:checked ~ .ck-api-tabs__nav label[for="ck-tab-providers"]
|
|
3248
|
+
#ck-tab-providers:checked ~ .ck-api-tabs__nav label[for="ck-tab-providers"],
|
|
3249
|
+
#ck-tab-imports:checked ~ .ck-api-tabs__nav label[for="ck-tab-imports"] {
|
|
3242
3250
|
color: var(--ck-accent);
|
|
3243
3251
|
background: var(--ck-surface-soft);
|
|
3244
3252
|
border-left-color: var(--ck-accent);
|
|
@@ -3253,7 +3261,8 @@ select.ck-input {
|
|
|
3253
3261
|
#ck-tab-metric-groups:checked ~ .ck-api-tabs__panels .ck-api-tabs__panel:nth-child(7),
|
|
3254
3262
|
#ck-tab-agreements:checked ~ .ck-api-tabs__panels .ck-api-tabs__panel:nth-child(8),
|
|
3255
3263
|
#ck-tab-tags:checked ~ .ck-api-tabs__panels .ck-api-tabs__panel:nth-child(9),
|
|
3256
|
-
#ck-tab-providers:checked ~ .ck-api-tabs__panels .ck-api-tabs__panel:nth-child(10)
|
|
3264
|
+
#ck-tab-providers:checked ~ .ck-api-tabs__panels .ck-api-tabs__panel:nth-child(10),
|
|
3265
|
+
#ck-tab-imports:checked ~ .ck-api-tabs__panels .ck-api-tabs__panel:nth-child(11) {
|
|
3257
3266
|
display: block;
|
|
3258
3267
|
}
|
|
3259
3268
|
|
|
@@ -3295,7 +3304,8 @@ select.ck-input {
|
|
|
3295
3304
|
#ck-tab-metric-groups:checked ~ .ck-api-tabs__nav label[for="ck-tab-metric-groups"],
|
|
3296
3305
|
#ck-tab-agreements:checked ~ .ck-api-tabs__nav label[for="ck-tab-agreements"],
|
|
3297
3306
|
#ck-tab-tags:checked ~ .ck-api-tabs__nav label[for="ck-tab-tags"],
|
|
3298
|
-
#ck-tab-providers:checked ~ .ck-api-tabs__nav label[for="ck-tab-providers"]
|
|
3307
|
+
#ck-tab-providers:checked ~ .ck-api-tabs__nav label[for="ck-tab-providers"],
|
|
3308
|
+
#ck-tab-imports:checked ~ .ck-api-tabs__nav label[for="ck-tab-imports"] {
|
|
3299
3309
|
border-left-color: transparent;
|
|
3300
3310
|
border-bottom-color: var(--ck-accent);
|
|
3301
3311
|
}
|
|
@@ -68,7 +68,7 @@ module CompletionKit
|
|
|
68
68
|
end
|
|
69
69
|
|
|
70
70
|
def create
|
|
71
|
-
@metric = Metric.new(
|
|
71
|
+
@metric = Metric.new(create_metric_params)
|
|
72
72
|
|
|
73
73
|
if @metric.save
|
|
74
74
|
redirect_to metric_path(@metric), notice: "Metric was successfully created."
|
|
@@ -235,6 +235,15 @@ module CompletionKit
|
|
|
235
235
|
@metric = Metric.find(params[:id])
|
|
236
236
|
end
|
|
237
237
|
|
|
238
|
+
def create_metric_params
|
|
239
|
+
attrs = metric_params
|
|
240
|
+
if attrs[:metric_type] == "check"
|
|
241
|
+
attrs.except(:instruction, :rubric_bands)
|
|
242
|
+
else
|
|
243
|
+
attrs.except(:check_config)
|
|
244
|
+
end
|
|
245
|
+
end
|
|
246
|
+
|
|
238
247
|
def metric_params
|
|
239
248
|
permitted = params.require(:metric).permit(:name, :instruction, :metric_type,
|
|
240
249
|
rubric_bands: [:stars, :description],
|
|
@@ -122,7 +122,7 @@ module CompletionKit
|
|
|
122
122
|
|
|
123
123
|
def suggest
|
|
124
124
|
if @run.prompt.nil?
|
|
125
|
-
redirect_to run_path(@run), alert: "
|
|
125
|
+
redirect_to run_path(@run), alert: "A run that only scores existing outputs has no prompt to improve."
|
|
126
126
|
return
|
|
127
127
|
end
|
|
128
128
|
|
|
@@ -30,7 +30,7 @@ module CompletionKit
|
|
|
30
30
|
display_scoped.select(:id)
|
|
31
31
|
end
|
|
32
32
|
|
|
33
|
-
# A
|
|
33
|
+
# A scoring-only run grades a pre-existing column on the dataset instead of
|
|
34
34
|
# generating new outputs. No prompt is attached; the response text is read
|
|
35
35
|
# from row[output_column]; no LLM generation happens.
|
|
36
36
|
def judge_only?
|
|
@@ -442,7 +442,7 @@ module CompletionKit
|
|
|
442
442
|
self.name = "#{prompt.name} — v#{prompt.version_number} ##{count}"
|
|
443
443
|
elsif dataset.present?
|
|
444
444
|
count = Run.where(prompt_id: nil, dataset_id: dataset.id).count + 1
|
|
445
|
-
self.name = "#{dataset.name}
|
|
445
|
+
self.name = "#{dataset.name} scoring ##{count}"
|
|
446
446
|
end
|
|
447
447
|
end
|
|
448
448
|
|
|
@@ -461,7 +461,7 @@ module CompletionKit
|
|
|
461
461
|
return if prompt.present?
|
|
462
462
|
|
|
463
463
|
if dataset.nil?
|
|
464
|
-
errors.add(:dataset_id, "is required
|
|
464
|
+
errors.add(:dataset_id, "is required when scoring existing outputs (no prompt)")
|
|
465
465
|
return
|
|
466
466
|
end
|
|
467
467
|
|
|
@@ -5,7 +5,7 @@ module CompletionKit
|
|
|
5
5
|
|
|
6
6
|
TOOLS = {
|
|
7
7
|
"judges_replay" => {
|
|
8
|
-
description: "Run the current judge against a dataset (
|
|
8
|
+
description: "Run the current judge against a dataset (scores existing outputs). Wraps runs_create with prompt_id omitted and output_column supplied. Re-judges existing dataset outputs so you can compare against human verdicts.",
|
|
9
9
|
inputSchema: {
|
|
10
10
|
type: "object",
|
|
11
11
|
properties: {
|
|
@@ -51,7 +51,7 @@ module CompletionKit
|
|
|
51
51
|
handler: :publish
|
|
52
52
|
},
|
|
53
53
|
"prompts_suggest_improvement" => {
|
|
54
|
-
description: "Suggest an improved version of a prompt, grounded in a run's test results and judge feedback. Analyzes the run's responses, scores, and reviews, then returns reasoning plus a rewritten template (preserving {{variables}}) and persists it as a Suggestion. Requires a run that has a prompt (not a
|
|
54
|
+
description: "Suggest an improved version of a prompt, grounded in a run's test results and judge feedback. Analyzes the run's responses, scores, and reviews, then returns reasoning plus a rewritten template (preserving {{variables}}) and persists it as a Suggestion. Requires a run that has a prompt (not a scoring-only run).",
|
|
55
55
|
inputSchema: {
|
|
56
56
|
type: "object",
|
|
57
57
|
properties: {run_id: {type: "integer", description: "The run whose results ground the improvement."}},
|
|
@@ -107,7 +107,7 @@ module CompletionKit
|
|
|
107
107
|
|
|
108
108
|
def self.suggest_improvement(args)
|
|
109
109
|
run = Run.find(args["run_id"])
|
|
110
|
-
return error_result("
|
|
110
|
+
return error_result("A run that only scores existing outputs has no prompt to improve.") if run.prompt.nil?
|
|
111
111
|
|
|
112
112
|
result = PromptImprovementService.new(run).suggest
|
|
113
113
|
return error_result("The model didn't return a usable rewrite.") if result["suggested_template"].blank?
|
|
@@ -15,7 +15,7 @@ module CompletionKit
|
|
|
15
15
|
handler: :get
|
|
16
16
|
},
|
|
17
17
|
"runs_create" => {
|
|
18
|
-
description: "Create a run. Omit prompt_id and provide output_column
|
|
18
|
+
description: "Create a run. Omit prompt_id and provide output_column to score existing outputs by grading a pre-existing dataset column instead of generating new ones.",
|
|
19
19
|
inputSchema: {
|
|
20
20
|
type: "object",
|
|
21
21
|
properties: {
|
|
@@ -20,6 +20,7 @@
|
|
|
20
20
|
<input type="radio" name="ck-api-tab" id="ck-tab-agreements" class="ck-api-tabs__radio">
|
|
21
21
|
<input type="radio" name="ck-api-tab" id="ck-tab-tags" class="ck-api-tabs__radio">
|
|
22
22
|
<input type="radio" name="ck-api-tab" id="ck-tab-providers" class="ck-api-tabs__radio">
|
|
23
|
+
<input type="radio" name="ck-api-tab" id="ck-tab-imports" class="ck-api-tabs__radio">
|
|
23
24
|
|
|
24
25
|
<nav class="ck-api-tabs__nav">
|
|
25
26
|
<label for="ck-tab-mcp" class="ck-api-tabs__label">MCP <span class="ck-api-tabs__count"><%= CompletionKit::McpDispatcher.tool_definitions.size %></span></label>
|
|
@@ -32,6 +33,7 @@
|
|
|
32
33
|
<label for="ck-tab-agreements" class="ck-api-tabs__label">Agreements <span class="ck-api-tabs__count">3</span></label>
|
|
33
34
|
<label for="ck-tab-tags" class="ck-api-tabs__label">Tags <span class="ck-api-tabs__count">5</span></label>
|
|
34
35
|
<label for="ck-tab-providers" class="ck-api-tabs__label">Providers <span class="ck-api-tabs__count">5</span></label>
|
|
36
|
+
<label for="ck-tab-imports" class="ck-api-tabs__label">Imports <span class="ck-api-tabs__count">1</span></label>
|
|
35
37
|
</nav>
|
|
36
38
|
|
|
37
39
|
<div class="ck-api-tabs__panels">
|
|
@@ -124,7 +126,7 @@
|
|
|
124
126
|
<div class="ck-api-endpoint">
|
|
125
127
|
<p class="ck-api-method"><span class="ck-chip ck-chip--soft">POST</span> /api/v1/runs</p>
|
|
126
128
|
<p class="ck-meta-copy">Create a new run.</p>
|
|
127
|
-
<p class="ck-api-params"><strong>Optional:</strong> <code>name</code>, <code>prompt_id</code>, <code>dataset_id</code>, <code>metric_ids</code>, <code>judge_model</code>, <code>output_column</code> (
|
|
129
|
+
<p class="ck-api-params"><strong>Optional:</strong> <code>name</code>, <code>prompt_id</code>, <code>dataset_id</code>, <code>metric_ids</code>, <code>judge_model</code>, <code>output_column</code> (score existing outputs: omit <code>prompt_id</code> and grade a dataset column instead, default <code>actual_output</code>)</p>
|
|
128
130
|
<%= render "completion_kit/api_reference/example", base_url: base_url, token: token, real_token: real_token, cmd: "curl -X POST #{base_url}/api/v1/runs \\\n -H \"Authorization: Bearer #{token}\" \\\n -H \"Content-Type: application/json\" \\\n -d '{\"prompt_id\": 1, \"dataset_id\": 1, \"metric_ids\": [1, 2]}'" %>
|
|
129
131
|
</div>
|
|
130
132
|
<div class="ck-api-endpoint">
|
|
@@ -379,6 +381,18 @@
|
|
|
379
381
|
} %>
|
|
380
382
|
</div>
|
|
381
383
|
|
|
384
|
+
<div class="ck-api-tabs__panel">
|
|
385
|
+
<h2 class="ck-section-title">Imports</h2>
|
|
386
|
+
<p class="ck-copy">Bring an existing <a href="https://www.promptfoo.dev" class="ck-link">promptfoo</a> config into CompletionKit in one call. Prompts, the test dataset, assert-based metrics, and providers are created where they map cleanly and skipped with a reason where they don't.</p>
|
|
387
|
+
<div class="ck-api-endpoint">
|
|
388
|
+
<p class="ck-api-method"><span class="ck-chip ck-chip--soft">POST</span> /api/v1/imports/promptfoo</p>
|
|
389
|
+
<p class="ck-meta-copy">Import a promptfooconfig.yaml. Send the YAML as a <code>config</code> param, or POST the raw YAML as the request body. Returns 201 with a mapping summary, or 422 if the YAML cannot be parsed.</p>
|
|
390
|
+
<p class="ck-api-params"><strong>Request:</strong> <code>config</code> (the YAML text) or a raw YAML request body</p>
|
|
391
|
+
<p class="ck-api-params"><strong>Response 201:</strong> <code>prompts</code>, <code>dataset</code>, <code>metrics</code>, and <code>providers</code>, each listing what was <code>created</code> and what was <code>skipped</code> (with a reason)</p>
|
|
392
|
+
<%= render "completion_kit/api_reference/example", base_url: base_url, token: token, real_token: real_token, cmd: "curl -X POST #{base_url}/api/v1/imports/promptfoo \\\n -H \"Authorization: Bearer #{token}\" \\\n -H \"Content-Type: application/x-yaml\" \\\n --data-binary @promptfooconfig.yaml" %>
|
|
393
|
+
</div>
|
|
394
|
+
</div>
|
|
395
|
+
|
|
382
396
|
</div>
|
|
383
397
|
</div>
|
|
384
398
|
|
|
@@ -75,20 +75,22 @@
|
|
|
75
75
|
<% else %>
|
|
76
76
|
<div class="ck-field" data-ck-metric-type>
|
|
77
77
|
<p class="ck-section-title">Metric type</p>
|
|
78
|
-
<p class="ck-hint">
|
|
78
|
+
<p class="ck-hint">The judge gives each response 1 to 5 stars against your rubric. A check just passes or fails, with no AI.</p>
|
|
79
79
|
<label class="ck-radio">
|
|
80
80
|
<%= form.radio_button :metric_type, "llm_judge", checked: !metric.check? %>
|
|
81
81
|
<span>LLM judge (1-5)</span>
|
|
82
|
+
<%= heroicon_tag "information-circle", variant: :outline, size: 16, class: "ck-radio-info", "aria-hidden": "true", title: "An AI reads each response and rates it 1 to 5 stars against your rubric, with a written reason. Best for subjective quality: tone, helpfulness, accuracy." %>
|
|
82
83
|
</label>
|
|
83
84
|
<label class="ck-radio">
|
|
84
85
|
<%= form.radio_button :metric_type, "check", checked: metric.check? %>
|
|
85
86
|
<span>Deterministic check</span>
|
|
87
|
+
<%= heroicon_tag "information-circle", variant: :outline, size: 16, class: "ck-radio-info", "aria-hidden": "true", title: "A rule that passes or fails instantly with no AI and no cost. Best for exact things: valid JSON, contains a phrase, no refusal." %>
|
|
86
88
|
</label>
|
|
87
89
|
</div>
|
|
88
90
|
<% end %>
|
|
89
91
|
|
|
90
92
|
<% if show_judge %>
|
|
91
|
-
<div class="ck-field ck-field--spacious" data-ck-metric-editor="llm_judge"
|
|
93
|
+
<div class="ck-field ck-field--spacious" data-ck-metric-editor="llm_judge" <%= "hidden" if metric.check? %>>
|
|
92
94
|
<p class="ck-section-title">Instruction</p>
|
|
93
95
|
<p class="ck-hint">What should the judge assess? This instruction is sent to the LLM judge when scoring outputs.</p>
|
|
94
96
|
<%= form.text_area :instruction, rows: 8, class: "ck-input ck-input--area", placeholder: "Evaluate whether the output...", **ck_field_aria(form, :instruction) %>
|
|
@@ -112,7 +114,7 @@
|
|
|
112
114
|
<% end %>
|
|
113
115
|
</div>
|
|
114
116
|
|
|
115
|
-
<div class="ck-field ck-field--spacious"
|
|
117
|
+
<div class="ck-field ck-field--spacious" data-ck-metric-editor="llm_judge" <%= "hidden" if metric.check? %>>
|
|
116
118
|
<p class="ck-section-title">Rubric<%= render "completion_kit/metrics/rubric_hint" %></p>
|
|
117
119
|
<p class="ck-hint">What each star rating means for this metric.</p>
|
|
118
120
|
|
|
@@ -155,7 +157,7 @@
|
|
|
155
157
|
|
|
156
158
|
<% if show_check %>
|
|
157
159
|
<% check = metric.check_config || {} %>
|
|
158
|
-
<div class="ck-field ck-field--spacious" data-ck-metric-editor="check"
|
|
160
|
+
<div class="ck-field ck-field--spacious" data-ck-metric-editor="check" <%= "hidden" unless metric.check? %>>
|
|
159
161
|
<p class="ck-section-title">Check</p>
|
|
160
162
|
<p class="ck-hint">A deterministic pass/fail rule. Fill only the fields the chosen kind needs.</p>
|
|
161
163
|
|
|
@@ -177,56 +179,56 @@
|
|
|
177
179
|
</select>
|
|
178
180
|
</div>
|
|
179
181
|
|
|
180
|
-
<div class="ck-field">
|
|
182
|
+
<div class="ck-field" data-ck-check-field="target_path">
|
|
181
183
|
<label class="ck-label" for="metric_check_target_path">Target path</label>
|
|
182
184
|
<p class="ck-hint">Used when target is json_path, e.g. data.items.0.name.</p>
|
|
183
185
|
<input type="text" name="metric[check_config][target_path]" id="metric_check_target_path" class="ck-input" value="<%= check["target_path"] %>">
|
|
184
186
|
</div>
|
|
185
187
|
|
|
186
|
-
<div class="ck-field">
|
|
188
|
+
<div class="ck-field" data-ck-check-field="value">
|
|
187
189
|
<label class="ck-label" for="metric_check_value">Value</label>
|
|
188
190
|
<p class="ck-hint">The substring or exact string for contains, not_contains, or equals.</p>
|
|
189
191
|
<input type="text" name="metric[check_config][value]" id="metric_check_value" class="ck-input" value="<%= check["value"] %>">
|
|
190
192
|
</div>
|
|
191
193
|
|
|
192
|
-
<div class="ck-field">
|
|
194
|
+
<div class="ck-field" data-ck-check-field="pattern">
|
|
193
195
|
<label class="ck-label" for="metric_check_pattern">Pattern</label>
|
|
194
196
|
<p class="ck-hint">A regular expression for the regex kind.</p>
|
|
195
197
|
<input type="text" name="metric[check_config][pattern]" id="metric_check_pattern" class="ck-input" value="<%= check["pattern"] %>">
|
|
196
198
|
</div>
|
|
197
199
|
|
|
198
|
-
<div class="ck-field">
|
|
200
|
+
<div class="ck-field" data-ck-check-field="json_path">
|
|
199
201
|
<label class="ck-label" for="metric_check_json_path">JSON path</label>
|
|
200
202
|
<p class="ck-hint">Dotted path into parsed JSON for json_path_equals.</p>
|
|
201
203
|
<input type="text" name="metric[check_config][json_path]" id="metric_check_json_path" class="ck-input" value="<%= check["json_path"] %>">
|
|
202
204
|
</div>
|
|
203
205
|
|
|
204
|
-
<div class="ck-field">
|
|
206
|
+
<div class="ck-field" data-ck-check-field="expected">
|
|
205
207
|
<label class="ck-label" for="metric_check_expected">Expected</label>
|
|
206
208
|
<p class="ck-hint">The value the JSON path must equal.</p>
|
|
207
209
|
<input type="text" name="metric[check_config][expected]" id="metric_check_expected" class="ck-input" value="<%= check["expected"] %>">
|
|
208
210
|
</div>
|
|
209
211
|
|
|
210
212
|
<div class="ck-field-row">
|
|
211
|
-
<div class="ck-field">
|
|
213
|
+
<div class="ck-field" data-ck-check-field="min">
|
|
212
214
|
<label class="ck-label" for="metric_check_min">Min length</label>
|
|
213
215
|
<input type="number" name="metric[check_config][min]" id="metric_check_min" class="ck-input" value="<%= check["min"] %>">
|
|
214
216
|
</div>
|
|
215
|
-
<div class="ck-field">
|
|
217
|
+
<div class="ck-field" data-ck-check-field="max">
|
|
216
218
|
<label class="ck-label" for="metric_check_max">Max length</label>
|
|
217
219
|
<input type="number" name="metric[check_config][max]" id="metric_check_max" class="ck-input" value="<%= check["max"] %>">
|
|
218
220
|
</div>
|
|
219
221
|
</div>
|
|
220
222
|
|
|
221
|
-
<label class="ck-checkbox">
|
|
223
|
+
<label class="ck-checkbox" data-ck-check-field="case_sensitive">
|
|
222
224
|
<input type="checkbox" name="metric[check_config][case_sensitive]" value="true"<%= " checked" if check["case_sensitive"] %>>
|
|
223
225
|
<span>Case sensitive</span>
|
|
224
226
|
</label>
|
|
225
|
-
<label class="ck-checkbox">
|
|
227
|
+
<label class="ck-checkbox" data-ck-check-field="multiline">
|
|
226
228
|
<input type="checkbox" name="metric[check_config][multiline]" value="true"<%= " checked" if check["multiline"] %>>
|
|
227
229
|
<span>Multiline</span>
|
|
228
230
|
</label>
|
|
229
|
-
<label class="ck-checkbox">
|
|
231
|
+
<label class="ck-checkbox" data-ck-check-field="trim">
|
|
230
232
|
<input type="checkbox" name="metric[check_config][trim]" value="true"<%= " checked" if check["trim"] %>>
|
|
231
233
|
<span>Trim whitespace</span>
|
|
232
234
|
</label>
|
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
<%= check_box_tag "run[judge_only]", "1", run.persisted? && run.judge_only?, id: "run_judge_only", class: "ck-checkbox" %>
|
|
23
23
|
<span class="ck-checkbox-label__box" aria-hidden="true"></span>
|
|
24
24
|
<span class="ck-checkbox-label__body">
|
|
25
|
-
<span class="ck-checkbox-label__text">
|
|
25
|
+
<span class="ck-checkbox-label__text">Score existing outputs</span>
|
|
26
26
|
<span class="ck-checkbox-label__hint">Grade an existing column on the dataset instead of running a prompt. Roughly half the LLM calls per row.</span>
|
|
27
27
|
</span>
|
|
28
28
|
</label>
|
|
@@ -263,7 +263,7 @@ function updateRunForm() {
|
|
|
263
263
|
}
|
|
264
264
|
} else if (!dataset) {
|
|
265
265
|
if (datasetField) datasetField.className = 'ck-field ck-field--info';
|
|
266
|
-
if (datasetHint) datasetHint.textContent = '
|
|
266
|
+
if (datasetHint) datasetHint.textContent = 'Skip generation and score responses you already have from a dataset column. Works with rubric metrics or deterministic checks.';
|
|
267
267
|
}
|
|
268
268
|
} else {
|
|
269
269
|
valid = prompt !== '';
|
|
@@ -10,7 +10,7 @@
|
|
|
10
10
|
<%= link_to run.prompt.name, ck_prompt_path(run.prompt), class: "ck-runs-table__config-link", onclick: "event.stopPropagation();" %>
|
|
11
11
|
<span class="ck-runs-table__version">v<%= run.prompt.version_number %></span>
|
|
12
12
|
<% else %>
|
|
13
|
-
<span class="ck-runs-table__version">
|
|
13
|
+
<span class="ck-runs-table__version">Scoring only</span>
|
|
14
14
|
<% end %>
|
|
15
15
|
<% if run.dataset %>
|
|
16
16
|
<span class="ck-runs-table__sep">·</span>
|
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
<% if run.prompt %>
|
|
23
23
|
<p class="ck-meta-copy"><%= link_to run.prompt.display_name, prompt_path(run.prompt), class: "ck-link" %> <span class="ck-chip" style="text-transform: none;"><%= run.prompt.llm_model %></span></p>
|
|
24
24
|
<% else %>
|
|
25
|
-
<p class="ck-meta-copy">
|
|
25
|
+
<p class="ck-meta-copy">Scoring existing outputs, grading column <code><%= run.output_column.presence || "actual_output" %></code><% if run.dataset %> on <%= link_to run.dataset.name, dataset_path(run.dataset), class: "ck-link" %><% end %></p>
|
|
26
26
|
<% end %>
|
|
27
27
|
</div>
|
|
28
28
|
<%= render "completion_kit/runs/actions", run: run %>
|