openai 4.95.0 → 4.96.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +26 -0
- package/_vendor/zod-to-json-schema/parsers/object.d.ts.map +1 -1
- package/_vendor/zod-to-json-schema/parsers/object.js +6 -2
- package/_vendor/zod-to-json-schema/parsers/object.js.map +1 -1
- package/_vendor/zod-to-json-schema/parsers/object.mjs +6 -2
- package/_vendor/zod-to-json-schema/parsers/object.mjs.map +1 -1
- package/package.json +1 -1
- package/resources/beta/assistants.d.ts +2 -0
- package/resources/beta/assistants.d.ts.map +1 -1
- package/resources/beta/assistants.js +1 -0
- package/resources/beta/assistants.js.map +1 -1
- package/resources/beta/assistants.mjs +1 -0
- package/resources/beta/assistants.mjs.map +1 -1
- package/resources/beta/realtime/realtime.d.ts +83 -2
- package/resources/beta/realtime/realtime.d.ts.map +1 -1
- package/resources/beta/realtime/realtime.js.map +1 -1
- package/resources/beta/realtime/realtime.mjs.map +1 -1
- package/resources/beta/threads/threads.d.ts +2 -1
- package/resources/beta/threads/threads.d.ts.map +1 -1
- package/resources/beta/threads/threads.js.map +1 -1
- package/resources/beta/threads/threads.mjs.map +1 -1
- package/resources/evals/evals.d.ts +546 -90
- package/resources/evals/evals.d.ts.map +1 -1
- package/resources/evals/evals.js.map +1 -1
- package/resources/evals/evals.mjs.map +1 -1
- package/resources/evals/runs/runs.d.ts +1111 -147
- package/resources/evals/runs/runs.d.ts.map +1 -1
- package/resources/evals/runs/runs.js.map +1 -1
- package/resources/evals/runs/runs.mjs.map +1 -1
- package/resources/fine-tuning/checkpoints/permissions.d.ts +1 -1
- package/resources/fine-tuning/checkpoints/permissions.d.ts.map +1 -1
- package/resources/fine-tuning/checkpoints/permissions.js +2 -2
- package/resources/fine-tuning/checkpoints/permissions.js.map +1 -1
- package/resources/fine-tuning/checkpoints/permissions.mjs +2 -2
- package/resources/fine-tuning/checkpoints/permissions.mjs.map +1 -1
- package/resources/images.d.ts +141 -40
- package/resources/images.d.ts.map +1 -1
- package/resources/images.js +4 -2
- package/resources/images.js.map +1 -1
- package/resources/images.mjs +4 -2
- package/resources/images.mjs.map +1 -1
- package/resources/responses/responses.d.ts +132 -2
- package/resources/responses/responses.d.ts.map +1 -1
- package/resources/responses/responses.js.map +1 -1
- package/resources/responses/responses.mjs.map +1 -1
- package/src/_vendor/zod-to-json-schema/parsers/object.ts +10 -2
- package/src/resources/beta/assistants.ts +3 -0
- package/src/resources/beta/realtime/realtime.ts +97 -1
- package/src/resources/beta/threads/threads.ts +3 -3
- package/src/resources/evals/evals.ts +652 -97
- package/src/resources/evals/runs/runs.ts +1433 -266
- package/src/resources/fine-tuning/checkpoints/permissions.ts +5 -1
- package/src/resources/images.ts +162 -40
- package/src/resources/responses/responses.ts +162 -0
- package/src/version.ts +1 -1
- package/version.d.ts +1 -1
- package/version.js +1 -1
- package/version.mjs +1 -1
|
@@ -4,6 +4,7 @@ import { APIResource } from '../../../resource';
|
|
|
4
4
|
import { isRequestOptions } from '../../../core';
|
|
5
5
|
import * as Core from '../../../core';
|
|
6
6
|
import * as Shared from '../../shared';
|
|
7
|
+
import * as ResponsesAPI from '../../responses/responses';
|
|
7
8
|
import * as OutputItemsAPI from './output-items';
|
|
8
9
|
import {
|
|
9
10
|
OutputItemListParams,
|
|
@@ -83,15 +84,6 @@ export class RunListResponsesPage extends CursorPage<RunListResponse> {}
|
|
|
83
84
|
* A CompletionsRunDataSource object describing a model sampling configuration.
|
|
84
85
|
*/
|
|
85
86
|
export interface CreateEvalCompletionsRunDataSource {
|
|
86
|
-
input_messages:
|
|
87
|
-
| CreateEvalCompletionsRunDataSource.Template
|
|
88
|
-
| CreateEvalCompletionsRunDataSource.ItemReference;
|
|
89
|
-
|
|
90
|
-
/**
|
|
91
|
-
* The name of the model to use for generating completions (e.g. "o3-mini").
|
|
92
|
-
*/
|
|
93
|
-
model: string;
|
|
94
|
-
|
|
95
87
|
/**
|
|
96
88
|
* A StoredCompletionsRunDataSource configuration describing a set of filters
|
|
97
89
|
*/
|
|
@@ -105,105 +97,19 @@ export interface CreateEvalCompletionsRunDataSource {
|
|
|
105
97
|
*/
|
|
106
98
|
type: 'completions';
|
|
107
99
|
|
|
100
|
+
input_messages?:
|
|
101
|
+
| CreateEvalCompletionsRunDataSource.Template
|
|
102
|
+
| CreateEvalCompletionsRunDataSource.ItemReference;
|
|
103
|
+
|
|
104
|
+
/**
|
|
105
|
+
* The name of the model to use for generating completions (e.g. "o3-mini").
|
|
106
|
+
*/
|
|
107
|
+
model?: string;
|
|
108
|
+
|
|
108
109
|
sampling_params?: CreateEvalCompletionsRunDataSource.SamplingParams;
|
|
109
110
|
}
|
|
110
111
|
|
|
111
112
|
export namespace CreateEvalCompletionsRunDataSource {
|
|
112
|
-
export interface Template {
|
|
113
|
-
/**
|
|
114
|
-
* A list of chat messages forming the prompt or context. May include variable
|
|
115
|
-
* references to the "item" namespace, ie {{item.name}}.
|
|
116
|
-
*/
|
|
117
|
-
template: Array<Template.ChatMessage | Template.InputMessage | Template.OutputMessage>;
|
|
118
|
-
|
|
119
|
-
/**
|
|
120
|
-
* The type of input messages. Always `template`.
|
|
121
|
-
*/
|
|
122
|
-
type: 'template';
|
|
123
|
-
}
|
|
124
|
-
|
|
125
|
-
export namespace Template {
|
|
126
|
-
export interface ChatMessage {
|
|
127
|
-
/**
|
|
128
|
-
* The content of the message.
|
|
129
|
-
*/
|
|
130
|
-
content: string;
|
|
131
|
-
|
|
132
|
-
/**
|
|
133
|
-
* The role of the message (e.g. "system", "assistant", "user").
|
|
134
|
-
*/
|
|
135
|
-
role: string;
|
|
136
|
-
}
|
|
137
|
-
|
|
138
|
-
export interface InputMessage {
|
|
139
|
-
content: InputMessage.Content;
|
|
140
|
-
|
|
141
|
-
/**
|
|
142
|
-
* The role of the message. One of `user`, `system`, or `developer`.
|
|
143
|
-
*/
|
|
144
|
-
role: 'user' | 'system' | 'developer';
|
|
145
|
-
|
|
146
|
-
/**
|
|
147
|
-
* The type of item, which is always `message`.
|
|
148
|
-
*/
|
|
149
|
-
type: 'message';
|
|
150
|
-
}
|
|
151
|
-
|
|
152
|
-
export namespace InputMessage {
|
|
153
|
-
export interface Content {
|
|
154
|
-
/**
|
|
155
|
-
* The text content.
|
|
156
|
-
*/
|
|
157
|
-
text: string;
|
|
158
|
-
|
|
159
|
-
/**
|
|
160
|
-
* The type of content, which is always `input_text`.
|
|
161
|
-
*/
|
|
162
|
-
type: 'input_text';
|
|
163
|
-
}
|
|
164
|
-
}
|
|
165
|
-
|
|
166
|
-
export interface OutputMessage {
|
|
167
|
-
content: OutputMessage.Content;
|
|
168
|
-
|
|
169
|
-
/**
|
|
170
|
-
* The role of the message. Must be `assistant` for output.
|
|
171
|
-
*/
|
|
172
|
-
role: 'assistant';
|
|
173
|
-
|
|
174
|
-
/**
|
|
175
|
-
* The type of item, which is always `message`.
|
|
176
|
-
*/
|
|
177
|
-
type: 'message';
|
|
178
|
-
}
|
|
179
|
-
|
|
180
|
-
export namespace OutputMessage {
|
|
181
|
-
export interface Content {
|
|
182
|
-
/**
|
|
183
|
-
* The text content.
|
|
184
|
-
*/
|
|
185
|
-
text: string;
|
|
186
|
-
|
|
187
|
-
/**
|
|
188
|
-
* The type of content, which is always `output_text`.
|
|
189
|
-
*/
|
|
190
|
-
type: 'output_text';
|
|
191
|
-
}
|
|
192
|
-
}
|
|
193
|
-
}
|
|
194
|
-
|
|
195
|
-
export interface ItemReference {
|
|
196
|
-
/**
|
|
197
|
-
* A reference to a variable in the "item" namespace. Ie, "item.name"
|
|
198
|
-
*/
|
|
199
|
-
item_reference: string;
|
|
200
|
-
|
|
201
|
-
/**
|
|
202
|
-
* The type of input messages. Always `item_reference`.
|
|
203
|
-
*/
|
|
204
|
-
type: 'item_reference';
|
|
205
|
-
}
|
|
206
|
-
|
|
207
113
|
export interface FileContent {
|
|
208
114
|
/**
|
|
209
115
|
* The content of the jsonl file.
|
|
@@ -240,20 +146,25 @@ export namespace CreateEvalCompletionsRunDataSource {
|
|
|
240
146
|
* A StoredCompletionsRunDataSource configuration describing a set of filters
|
|
241
147
|
*/
|
|
242
148
|
export interface StoredCompletions {
|
|
149
|
+
/**
|
|
150
|
+
* The type of source. Always `stored_completions`.
|
|
151
|
+
*/
|
|
152
|
+
type: 'stored_completions';
|
|
153
|
+
|
|
243
154
|
/**
|
|
244
155
|
* An optional Unix timestamp to filter items created after this time.
|
|
245
156
|
*/
|
|
246
|
-
created_after
|
|
157
|
+
created_after?: number | null;
|
|
247
158
|
|
|
248
159
|
/**
|
|
249
160
|
* An optional Unix timestamp to filter items created before this time.
|
|
250
161
|
*/
|
|
251
|
-
created_before
|
|
162
|
+
created_before?: number | null;
|
|
252
163
|
|
|
253
164
|
/**
|
|
254
165
|
* An optional maximum number of items to return.
|
|
255
166
|
*/
|
|
256
|
-
limit
|
|
167
|
+
limit?: number | null;
|
|
257
168
|
|
|
258
169
|
/**
|
|
259
170
|
* Set of 16 key-value pairs that can be attached to an object. This can be useful
|
|
@@ -263,17 +174,81 @@ export namespace CreateEvalCompletionsRunDataSource {
|
|
|
263
174
|
* Keys are strings with a maximum length of 64 characters. Values are strings with
|
|
264
175
|
* a maximum length of 512 characters.
|
|
265
176
|
*/
|
|
266
|
-
metadata
|
|
177
|
+
metadata?: Shared.Metadata | null;
|
|
267
178
|
|
|
268
179
|
/**
|
|
269
180
|
* An optional model to filter by (e.g., 'gpt-4o').
|
|
270
181
|
*/
|
|
271
|
-
model
|
|
182
|
+
model?: string | null;
|
|
183
|
+
}
|
|
272
184
|
|
|
185
|
+
export interface Template {
|
|
273
186
|
/**
|
|
274
|
-
*
|
|
187
|
+
* A list of chat messages forming the prompt or context. May include variable
|
|
188
|
+
* references to the "item" namespace, ie {{item.name}}.
|
|
275
189
|
*/
|
|
276
|
-
|
|
190
|
+
template: Array<ResponsesAPI.EasyInputMessage | Template.Message>;
|
|
191
|
+
|
|
192
|
+
/**
|
|
193
|
+
* The type of input messages. Always `template`.
|
|
194
|
+
*/
|
|
195
|
+
type: 'template';
|
|
196
|
+
}
|
|
197
|
+
|
|
198
|
+
export namespace Template {
|
|
199
|
+
/**
|
|
200
|
+
* A message input to the model with a role indicating instruction following
|
|
201
|
+
* hierarchy. Instructions given with the `developer` or `system` role take
|
|
202
|
+
* precedence over instructions given with the `user` role. Messages with the
|
|
203
|
+
* `assistant` role are presumed to have been generated by the model in previous
|
|
204
|
+
* interactions.
|
|
205
|
+
*/
|
|
206
|
+
export interface Message {
|
|
207
|
+
/**
|
|
208
|
+
* Text inputs to the model - can contain template strings.
|
|
209
|
+
*/
|
|
210
|
+
content: string | ResponsesAPI.ResponseInputText | Message.OutputText;
|
|
211
|
+
|
|
212
|
+
/**
|
|
213
|
+
* The role of the message input. One of `user`, `assistant`, `system`, or
|
|
214
|
+
* `developer`.
|
|
215
|
+
*/
|
|
216
|
+
role: 'user' | 'assistant' | 'system' | 'developer';
|
|
217
|
+
|
|
218
|
+
/**
|
|
219
|
+
* The type of the message input. Always `message`.
|
|
220
|
+
*/
|
|
221
|
+
type?: 'message';
|
|
222
|
+
}
|
|
223
|
+
|
|
224
|
+
export namespace Message {
|
|
225
|
+
/**
|
|
226
|
+
* A text output from the model.
|
|
227
|
+
*/
|
|
228
|
+
export interface OutputText {
|
|
229
|
+
/**
|
|
230
|
+
* The text output from the model.
|
|
231
|
+
*/
|
|
232
|
+
text: string;
|
|
233
|
+
|
|
234
|
+
/**
|
|
235
|
+
* The type of the output text. Always `output_text`.
|
|
236
|
+
*/
|
|
237
|
+
type: 'output_text';
|
|
238
|
+
}
|
|
239
|
+
}
|
|
240
|
+
}
|
|
241
|
+
|
|
242
|
+
export interface ItemReference {
|
|
243
|
+
/**
|
|
244
|
+
* A reference to a variable in the "item" namespace. Ie, "item.name"
|
|
245
|
+
*/
|
|
246
|
+
item_reference: string;
|
|
247
|
+
|
|
248
|
+
/**
|
|
249
|
+
* The type of input messages. Always `item_reference`.
|
|
250
|
+
*/
|
|
251
|
+
type: 'item_reference';
|
|
277
252
|
}
|
|
278
253
|
|
|
279
254
|
export interface SamplingParams {
|
|
@@ -378,7 +353,10 @@ export interface RunCreateResponse {
|
|
|
378
353
|
/**
|
|
379
354
|
* Information about the run's data source.
|
|
380
355
|
*/
|
|
381
|
-
data_source:
|
|
356
|
+
data_source:
|
|
357
|
+
| CreateEvalJSONLRunDataSource
|
|
358
|
+
| CreateEvalCompletionsRunDataSource
|
|
359
|
+
| RunCreateResponse.Completions;
|
|
382
360
|
|
|
383
361
|
/**
|
|
384
362
|
* An object representing an error response from the Eval API.
|
|
@@ -442,162 +420,240 @@ export interface RunCreateResponse {
|
|
|
442
420
|
}
|
|
443
421
|
|
|
444
422
|
export namespace RunCreateResponse {
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
cached_tokens: number;
|
|
450
|
-
|
|
423
|
+
/**
|
|
424
|
+
* A ResponsesRunDataSource object describing a model sampling configuration.
|
|
425
|
+
*/
|
|
426
|
+
export interface Completions {
|
|
451
427
|
/**
|
|
452
|
-
*
|
|
428
|
+
* A EvalResponsesSource object describing a run data source configuration.
|
|
453
429
|
*/
|
|
454
|
-
|
|
430
|
+
source: Completions.FileContent | Completions.FileID | Completions.Responses;
|
|
455
431
|
|
|
456
432
|
/**
|
|
457
|
-
* The
|
|
433
|
+
* The type of run data source. Always `completions`.
|
|
458
434
|
*/
|
|
459
|
-
|
|
435
|
+
type: 'completions';
|
|
460
436
|
|
|
461
|
-
|
|
462
|
-
* The name of the model.
|
|
463
|
-
*/
|
|
464
|
-
model_name: string;
|
|
437
|
+
input_messages?: Completions.Template | Completions.ItemReference;
|
|
465
438
|
|
|
466
439
|
/**
|
|
467
|
-
* The
|
|
440
|
+
* The name of the model to use for generating completions (e.g. "o3-mini").
|
|
468
441
|
*/
|
|
469
|
-
|
|
442
|
+
model?: string;
|
|
470
443
|
|
|
471
|
-
|
|
472
|
-
* The total number of tokens used.
|
|
473
|
-
*/
|
|
474
|
-
total_tokens: number;
|
|
444
|
+
sampling_params?: Completions.SamplingParams;
|
|
475
445
|
}
|
|
476
446
|
|
|
477
|
-
export
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
447
|
+
export namespace Completions {
|
|
448
|
+
export interface FileContent {
|
|
449
|
+
/**
|
|
450
|
+
* The content of the jsonl file.
|
|
451
|
+
*/
|
|
452
|
+
content: Array<FileContent.Content>;
|
|
482
453
|
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
454
|
+
/**
|
|
455
|
+
* The type of jsonl source. Always `file_content`.
|
|
456
|
+
*/
|
|
457
|
+
type: 'file_content';
|
|
458
|
+
}
|
|
487
459
|
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
testing_criteria: string;
|
|
492
|
-
}
|
|
460
|
+
export namespace FileContent {
|
|
461
|
+
export interface Content {
|
|
462
|
+
item: Record<string, unknown>;
|
|
493
463
|
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
export interface ResultCounts {
|
|
498
|
-
/**
|
|
499
|
-
* Number of output items that resulted in an error.
|
|
500
|
-
*/
|
|
501
|
-
errored: number;
|
|
464
|
+
sample?: Record<string, unknown>;
|
|
465
|
+
}
|
|
466
|
+
}
|
|
502
467
|
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
468
|
+
export interface FileID {
|
|
469
|
+
/**
|
|
470
|
+
* The identifier of the file.
|
|
471
|
+
*/
|
|
472
|
+
id: string;
|
|
507
473
|
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
474
|
+
/**
|
|
475
|
+
* The type of jsonl source. Always `file_id`.
|
|
476
|
+
*/
|
|
477
|
+
type: 'file_id';
|
|
478
|
+
}
|
|
512
479
|
|
|
513
480
|
/**
|
|
514
|
-
*
|
|
481
|
+
* A EvalResponsesSource object describing a run data source configuration.
|
|
515
482
|
*/
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
483
|
+
export interface Responses {
|
|
484
|
+
/**
|
|
485
|
+
* The type of run data source. Always `responses`.
|
|
486
|
+
*/
|
|
487
|
+
type: 'responses';
|
|
519
488
|
|
|
520
|
-
/**
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
* Unique identifier for the evaluation run.
|
|
526
|
-
*/
|
|
527
|
-
id: string;
|
|
489
|
+
/**
|
|
490
|
+
* Whether to allow parallel tool calls. This is a query parameter used to select
|
|
491
|
+
* responses.
|
|
492
|
+
*/
|
|
493
|
+
allow_parallel_tool_calls?: boolean | null;
|
|
528
494
|
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
495
|
+
/**
|
|
496
|
+
* Only include items created after this timestamp (inclusive). This is a query
|
|
497
|
+
* parameter used to select responses.
|
|
498
|
+
*/
|
|
499
|
+
created_after?: number | null;
|
|
533
500
|
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
501
|
+
/**
|
|
502
|
+
* Only include items created before this timestamp (inclusive). This is a query
|
|
503
|
+
* parameter used to select responses.
|
|
504
|
+
*/
|
|
505
|
+
created_before?: number | null;
|
|
538
506
|
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
507
|
+
/**
|
|
508
|
+
* Whether the response has tool calls. This is a query parameter used to select
|
|
509
|
+
* responses.
|
|
510
|
+
*/
|
|
511
|
+
has_tool_calls?: boolean | null;
|
|
543
512
|
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
513
|
+
/**
|
|
514
|
+
* Optional search string for instructions. This is a query parameter used to
|
|
515
|
+
* select responses.
|
|
516
|
+
*/
|
|
517
|
+
instructions_search?: string | null;
|
|
548
518
|
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
* Keys are strings with a maximum length of 64 characters. Values are strings with
|
|
555
|
-
* a maximum length of 512 characters.
|
|
556
|
-
*/
|
|
557
|
-
metadata: Shared.Metadata | null;
|
|
519
|
+
/**
|
|
520
|
+
* Metadata filter for the responses. This is a query parameter used to select
|
|
521
|
+
* responses.
|
|
522
|
+
*/
|
|
523
|
+
metadata?: unknown | null;
|
|
558
524
|
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
|
|
562
|
-
|
|
525
|
+
/**
|
|
526
|
+
* The name of the model to find responses for. This is a query parameter used to
|
|
527
|
+
* select responses.
|
|
528
|
+
*/
|
|
529
|
+
model?: string | null;
|
|
563
530
|
|
|
564
|
-
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
|
|
531
|
+
/**
|
|
532
|
+
* Optional reasoning effort parameter. This is a query parameter used to select
|
|
533
|
+
* responses.
|
|
534
|
+
*/
|
|
535
|
+
reasoning_effort?: Shared.ReasoningEffort | null;
|
|
568
536
|
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
|
|
572
|
-
|
|
537
|
+
/**
|
|
538
|
+
* Sampling temperature. This is a query parameter used to select responses.
|
|
539
|
+
*/
|
|
540
|
+
temperature?: number | null;
|
|
573
541
|
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
|
|
577
|
-
|
|
542
|
+
/**
|
|
543
|
+
* Nucleus sampling parameter. This is a query parameter used to select responses.
|
|
544
|
+
*/
|
|
545
|
+
top_p?: number | null;
|
|
578
546
|
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
|
|
547
|
+
/**
|
|
548
|
+
* List of user identifiers. This is a query parameter used to select responses.
|
|
549
|
+
*/
|
|
550
|
+
users?: Array<string> | null;
|
|
551
|
+
}
|
|
583
552
|
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
553
|
+
export interface Template {
|
|
554
|
+
/**
|
|
555
|
+
* A list of chat messages forming the prompt or context. May include variable
|
|
556
|
+
* references to the "item" namespace, ie {{item.name}}.
|
|
557
|
+
*/
|
|
558
|
+
template: Array<Template.ChatMessage | Template.EvalItem>;
|
|
588
559
|
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
|
|
560
|
+
/**
|
|
561
|
+
* The type of input messages. Always `template`.
|
|
562
|
+
*/
|
|
563
|
+
type: 'template';
|
|
564
|
+
}
|
|
593
565
|
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
566
|
+
export namespace Template {
|
|
567
|
+
export interface ChatMessage {
|
|
568
|
+
/**
|
|
569
|
+
* The content of the message.
|
|
570
|
+
*/
|
|
571
|
+
content: string;
|
|
572
|
+
|
|
573
|
+
/**
|
|
574
|
+
* The role of the message (e.g. "system", "assistant", "user").
|
|
575
|
+
*/
|
|
576
|
+
role: string;
|
|
577
|
+
}
|
|
578
|
+
|
|
579
|
+
/**
|
|
580
|
+
* A message input to the model with a role indicating instruction following
|
|
581
|
+
* hierarchy. Instructions given with the `developer` or `system` role take
|
|
582
|
+
* precedence over instructions given with the `user` role. Messages with the
|
|
583
|
+
* `assistant` role are presumed to have been generated by the model in previous
|
|
584
|
+
* interactions.
|
|
585
|
+
*/
|
|
586
|
+
export interface EvalItem {
|
|
587
|
+
/**
|
|
588
|
+
* Text inputs to the model - can contain template strings.
|
|
589
|
+
*/
|
|
590
|
+
content: string | ResponsesAPI.ResponseInputText | EvalItem.OutputText;
|
|
591
|
+
|
|
592
|
+
/**
|
|
593
|
+
* The role of the message input. One of `user`, `assistant`, `system`, or
|
|
594
|
+
* `developer`.
|
|
595
|
+
*/
|
|
596
|
+
role: 'user' | 'assistant' | 'system' | 'developer';
|
|
597
|
+
|
|
598
|
+
/**
|
|
599
|
+
* The type of the message input. Always `message`.
|
|
600
|
+
*/
|
|
601
|
+
type?: 'message';
|
|
602
|
+
}
|
|
603
|
+
|
|
604
|
+
export namespace EvalItem {
|
|
605
|
+
/**
|
|
606
|
+
* A text output from the model.
|
|
607
|
+
*/
|
|
608
|
+
export interface OutputText {
|
|
609
|
+
/**
|
|
610
|
+
* The text output from the model.
|
|
611
|
+
*/
|
|
612
|
+
text: string;
|
|
613
|
+
|
|
614
|
+
/**
|
|
615
|
+
* The type of the output text. Always `output_text`.
|
|
616
|
+
*/
|
|
617
|
+
type: 'output_text';
|
|
618
|
+
}
|
|
619
|
+
}
|
|
620
|
+
}
|
|
621
|
+
|
|
622
|
+
export interface ItemReference {
|
|
623
|
+
/**
|
|
624
|
+
* A reference to a variable in the "item" namespace. Ie, "item.name"
|
|
625
|
+
*/
|
|
626
|
+
item_reference: string;
|
|
627
|
+
|
|
628
|
+
/**
|
|
629
|
+
* The type of input messages. Always `item_reference`.
|
|
630
|
+
*/
|
|
631
|
+
type: 'item_reference';
|
|
632
|
+
}
|
|
633
|
+
|
|
634
|
+
export interface SamplingParams {
|
|
635
|
+
/**
|
|
636
|
+
* The maximum number of tokens in the generated output.
|
|
637
|
+
*/
|
|
638
|
+
max_completion_tokens?: number;
|
|
639
|
+
|
|
640
|
+
/**
|
|
641
|
+
* A seed value to initialize the randomness, during sampling.
|
|
642
|
+
*/
|
|
643
|
+
seed?: number;
|
|
644
|
+
|
|
645
|
+
/**
|
|
646
|
+
* A higher temperature increases randomness in the outputs.
|
|
647
|
+
*/
|
|
648
|
+
temperature?: number;
|
|
649
|
+
|
|
650
|
+
/**
|
|
651
|
+
* An alternative to temperature for nucleus sampling; 1.0 includes all tokens.
|
|
652
|
+
*/
|
|
653
|
+
top_p?: number;
|
|
654
|
+
}
|
|
655
|
+
}
|
|
599
656
|
|
|
600
|
-
export namespace RunRetrieveResponse {
|
|
601
657
|
export interface PerModelUsage {
|
|
602
658
|
/**
|
|
603
659
|
* The number of tokens retrieved from cache.
|
|
@@ -676,7 +732,7 @@ export namespace RunRetrieveResponse {
|
|
|
676
732
|
/**
|
|
677
733
|
* A schema representing an evaluation run.
|
|
678
734
|
*/
|
|
679
|
-
export interface
|
|
735
|
+
export interface RunRetrieveResponse {
|
|
680
736
|
/**
|
|
681
737
|
* Unique identifier for the evaluation run.
|
|
682
738
|
*/
|
|
@@ -690,7 +746,10 @@ export interface RunListResponse {
|
|
|
690
746
|
/**
|
|
691
747
|
* Information about the run's data source.
|
|
692
748
|
*/
|
|
693
|
-
data_source:
|
|
749
|
+
data_source:
|
|
750
|
+
| CreateEvalJSONLRunDataSource
|
|
751
|
+
| CreateEvalCompletionsRunDataSource
|
|
752
|
+
| RunRetrieveResponse.Completions;
|
|
694
753
|
|
|
695
754
|
/**
|
|
696
755
|
* An object representing an error response from the Eval API.
|
|
@@ -730,12 +789,12 @@ export interface RunListResponse {
|
|
|
730
789
|
/**
|
|
731
790
|
* Usage statistics for each model during the evaluation run.
|
|
732
791
|
*/
|
|
733
|
-
per_model_usage: Array<
|
|
792
|
+
per_model_usage: Array<RunRetrieveResponse.PerModelUsage>;
|
|
734
793
|
|
|
735
794
|
/**
|
|
736
795
|
* Results per testing criteria applied during the evaluation run.
|
|
737
796
|
*/
|
|
738
|
-
per_testing_criteria_results: Array<
|
|
797
|
+
per_testing_criteria_results: Array<RunRetrieveResponse.PerTestingCriteriaResult>;
|
|
739
798
|
|
|
740
799
|
/**
|
|
741
800
|
* The URL to the rendered evaluation run report on the UI dashboard.
|
|
@@ -745,7 +804,7 @@ export interface RunListResponse {
|
|
|
745
804
|
/**
|
|
746
805
|
* Counters summarizing the outcomes of the evaluation run.
|
|
747
806
|
*/
|
|
748
|
-
result_counts:
|
|
807
|
+
result_counts: RunRetrieveResponse.ResultCounts;
|
|
749
808
|
|
|
750
809
|
/**
|
|
751
810
|
* The status of the evaluation run.
|
|
@@ -753,7 +812,241 @@ export interface RunListResponse {
|
|
|
753
812
|
status: string;
|
|
754
813
|
}
|
|
755
814
|
|
|
756
|
-
export namespace
|
|
815
|
+
export namespace RunRetrieveResponse {
|
|
816
|
+
/**
|
|
817
|
+
* A ResponsesRunDataSource object describing a model sampling configuration.
|
|
818
|
+
*/
|
|
819
|
+
export interface Completions {
|
|
820
|
+
/**
|
|
821
|
+
* A EvalResponsesSource object describing a run data source configuration.
|
|
822
|
+
*/
|
|
823
|
+
source: Completions.FileContent | Completions.FileID | Completions.Responses;
|
|
824
|
+
|
|
825
|
+
/**
|
|
826
|
+
* The type of run data source. Always `completions`.
|
|
827
|
+
*/
|
|
828
|
+
type: 'completions';
|
|
829
|
+
|
|
830
|
+
input_messages?: Completions.Template | Completions.ItemReference;
|
|
831
|
+
|
|
832
|
+
/**
|
|
833
|
+
* The name of the model to use for generating completions (e.g. "o3-mini").
|
|
834
|
+
*/
|
|
835
|
+
model?: string;
|
|
836
|
+
|
|
837
|
+
sampling_params?: Completions.SamplingParams;
|
|
838
|
+
}
|
|
839
|
+
|
|
840
|
+
export namespace Completions {
|
|
841
|
+
export interface FileContent {
|
|
842
|
+
/**
|
|
843
|
+
* The content of the jsonl file.
|
|
844
|
+
*/
|
|
845
|
+
content: Array<FileContent.Content>;
|
|
846
|
+
|
|
847
|
+
/**
|
|
848
|
+
* The type of jsonl source. Always `file_content`.
|
|
849
|
+
*/
|
|
850
|
+
type: 'file_content';
|
|
851
|
+
}
|
|
852
|
+
|
|
853
|
+
export namespace FileContent {
|
|
854
|
+
export interface Content {
|
|
855
|
+
item: Record<string, unknown>;
|
|
856
|
+
|
|
857
|
+
sample?: Record<string, unknown>;
|
|
858
|
+
}
|
|
859
|
+
}
|
|
860
|
+
|
|
861
|
+
export interface FileID {
|
|
862
|
+
/**
|
|
863
|
+
* The identifier of the file.
|
|
864
|
+
*/
|
|
865
|
+
id: string;
|
|
866
|
+
|
|
867
|
+
/**
|
|
868
|
+
* The type of jsonl source. Always `file_id`.
|
|
869
|
+
*/
|
|
870
|
+
type: 'file_id';
|
|
871
|
+
}
|
|
872
|
+
|
|
873
|
+
/**
|
|
874
|
+
* A EvalResponsesSource object describing a run data source configuration.
|
|
875
|
+
*/
|
|
876
|
+
export interface Responses {
|
|
877
|
+
/**
|
|
878
|
+
* The type of run data source. Always `responses`.
|
|
879
|
+
*/
|
|
880
|
+
type: 'responses';
|
|
881
|
+
|
|
882
|
+
/**
|
|
883
|
+
* Whether to allow parallel tool calls. This is a query parameter used to select
|
|
884
|
+
* responses.
|
|
885
|
+
*/
|
|
886
|
+
allow_parallel_tool_calls?: boolean | null;
|
|
887
|
+
|
|
888
|
+
/**
|
|
889
|
+
* Only include items created after this timestamp (inclusive). This is a query
|
|
890
|
+
* parameter used to select responses.
|
|
891
|
+
*/
|
|
892
|
+
created_after?: number | null;
|
|
893
|
+
|
|
894
|
+
/**
|
|
895
|
+
* Only include items created before this timestamp (inclusive). This is a query
|
|
896
|
+
* parameter used to select responses.
|
|
897
|
+
*/
|
|
898
|
+
created_before?: number | null;
|
|
899
|
+
|
|
900
|
+
/**
|
|
901
|
+
* Whether the response has tool calls. This is a query parameter used to select
|
|
902
|
+
* responses.
|
|
903
|
+
*/
|
|
904
|
+
has_tool_calls?: boolean | null;
|
|
905
|
+
|
|
906
|
+
/**
|
|
907
|
+
* Optional search string for instructions. This is a query parameter used to
|
|
908
|
+
* select responses.
|
|
909
|
+
*/
|
|
910
|
+
instructions_search?: string | null;
|
|
911
|
+
|
|
912
|
+
/**
|
|
913
|
+
* Metadata filter for the responses. This is a query parameter used to select
|
|
914
|
+
* responses.
|
|
915
|
+
*/
|
|
916
|
+
metadata?: unknown | null;
|
|
917
|
+
|
|
918
|
+
/**
|
|
919
|
+
* The name of the model to find responses for. This is a query parameter used to
|
|
920
|
+
* select responses.
|
|
921
|
+
*/
|
|
922
|
+
model?: string | null;
|
|
923
|
+
|
|
924
|
+
/**
|
|
925
|
+
* Optional reasoning effort parameter. This is a query parameter used to select
|
|
926
|
+
* responses.
|
|
927
|
+
*/
|
|
928
|
+
reasoning_effort?: Shared.ReasoningEffort | null;
|
|
929
|
+
|
|
930
|
+
/**
|
|
931
|
+
* Sampling temperature. This is a query parameter used to select responses.
|
|
932
|
+
*/
|
|
933
|
+
temperature?: number | null;
|
|
934
|
+
|
|
935
|
+
/**
|
|
936
|
+
* Nucleus sampling parameter. This is a query parameter used to select responses.
|
|
937
|
+
*/
|
|
938
|
+
top_p?: number | null;
|
|
939
|
+
|
|
940
|
+
/**
|
|
941
|
+
* List of user identifiers. This is a query parameter used to select responses.
|
|
942
|
+
*/
|
|
943
|
+
users?: Array<string> | null;
|
|
944
|
+
}
|
|
945
|
+
|
|
946
|
+
export interface Template {
|
|
947
|
+
/**
|
|
948
|
+
* A list of chat messages forming the prompt or context. May include variable
|
|
949
|
+
* references to the "item" namespace, ie {{item.name}}.
|
|
950
|
+
*/
|
|
951
|
+
template: Array<Template.ChatMessage | Template.EvalItem>;
|
|
952
|
+
|
|
953
|
+
/**
|
|
954
|
+
* The type of input messages. Always `template`.
|
|
955
|
+
*/
|
|
956
|
+
type: 'template';
|
|
957
|
+
}
|
|
958
|
+
|
|
959
|
+
export namespace Template {
|
|
960
|
+
export interface ChatMessage {
|
|
961
|
+
/**
|
|
962
|
+
* The content of the message.
|
|
963
|
+
*/
|
|
964
|
+
content: string;
|
|
965
|
+
|
|
966
|
+
/**
|
|
967
|
+
* The role of the message (e.g. "system", "assistant", "user").
|
|
968
|
+
*/
|
|
969
|
+
role: string;
|
|
970
|
+
}
|
|
971
|
+
|
|
972
|
+
/**
|
|
973
|
+
* A message input to the model with a role indicating instruction following
|
|
974
|
+
* hierarchy. Instructions given with the `developer` or `system` role take
|
|
975
|
+
* precedence over instructions given with the `user` role. Messages with the
|
|
976
|
+
* `assistant` role are presumed to have been generated by the model in previous
|
|
977
|
+
* interactions.
|
|
978
|
+
*/
|
|
979
|
+
export interface EvalItem {
|
|
980
|
+
/**
|
|
981
|
+
* Text inputs to the model - can contain template strings.
|
|
982
|
+
*/
|
|
983
|
+
content: string | ResponsesAPI.ResponseInputText | EvalItem.OutputText;
|
|
984
|
+
|
|
985
|
+
/**
|
|
986
|
+
* The role of the message input. One of `user`, `assistant`, `system`, or
|
|
987
|
+
* `developer`.
|
|
988
|
+
*/
|
|
989
|
+
role: 'user' | 'assistant' | 'system' | 'developer';
|
|
990
|
+
|
|
991
|
+
/**
|
|
992
|
+
* The type of the message input. Always `message`.
|
|
993
|
+
*/
|
|
994
|
+
type?: 'message';
|
|
995
|
+
}
|
|
996
|
+
|
|
997
|
+
export namespace EvalItem {
|
|
998
|
+
/**
|
|
999
|
+
* A text output from the model.
|
|
1000
|
+
*/
|
|
1001
|
+
export interface OutputText {
|
|
1002
|
+
/**
|
|
1003
|
+
* The text output from the model.
|
|
1004
|
+
*/
|
|
1005
|
+
text: string;
|
|
1006
|
+
|
|
1007
|
+
/**
|
|
1008
|
+
* The type of the output text. Always `output_text`.
|
|
1009
|
+
*/
|
|
1010
|
+
type: 'output_text';
|
|
1011
|
+
}
|
|
1012
|
+
}
|
|
1013
|
+
}
|
|
1014
|
+
|
|
1015
|
+
export interface ItemReference {
|
|
1016
|
+
/**
|
|
1017
|
+
* A reference to a variable in the "item" namespace. Ie, "item.name"
|
|
1018
|
+
*/
|
|
1019
|
+
item_reference: string;
|
|
1020
|
+
|
|
1021
|
+
/**
|
|
1022
|
+
* The type of input messages. Always `item_reference`.
|
|
1023
|
+
*/
|
|
1024
|
+
type: 'item_reference';
|
|
1025
|
+
}
|
|
1026
|
+
|
|
1027
|
+
export interface SamplingParams {
|
|
1028
|
+
/**
|
|
1029
|
+
* The maximum number of tokens in the generated output.
|
|
1030
|
+
*/
|
|
1031
|
+
max_completion_tokens?: number;
|
|
1032
|
+
|
|
1033
|
+
/**
|
|
1034
|
+
* A seed value to initialize the randomness, during sampling.
|
|
1035
|
+
*/
|
|
1036
|
+
seed?: number;
|
|
1037
|
+
|
|
1038
|
+
/**
|
|
1039
|
+
* A higher temperature increases randomness in the outputs.
|
|
1040
|
+
*/
|
|
1041
|
+
temperature?: number;
|
|
1042
|
+
|
|
1043
|
+
/**
|
|
1044
|
+
* An alternative to temperature for nucleus sampling; 1.0 includes all tokens.
|
|
1045
|
+
*/
|
|
1046
|
+
top_p?: number;
|
|
1047
|
+
}
|
|
1048
|
+
}
|
|
1049
|
+
|
|
757
1050
|
export interface PerModelUsage {
|
|
758
1051
|
/**
|
|
759
1052
|
* The number of tokens retrieved from cache.
|
|
@@ -829,18 +1122,10 @@ export namespace RunListResponse {
|
|
|
829
1122
|
}
|
|
830
1123
|
}
|
|
831
1124
|
|
|
832
|
-
export interface RunDeleteResponse {
|
|
833
|
-
deleted?: boolean;
|
|
834
|
-
|
|
835
|
-
object?: string;
|
|
836
|
-
|
|
837
|
-
run_id?: string;
|
|
838
|
-
}
|
|
839
|
-
|
|
840
1125
|
/**
|
|
841
1126
|
* A schema representing an evaluation run.
|
|
842
1127
|
*/
|
|
843
|
-
export interface
|
|
1128
|
+
export interface RunListResponse {
|
|
844
1129
|
/**
|
|
845
1130
|
* Unique identifier for the evaluation run.
|
|
846
1131
|
*/
|
|
@@ -854,7 +1139,10 @@ export interface RunCancelResponse {
|
|
|
854
1139
|
/**
|
|
855
1140
|
* Information about the run's data source.
|
|
856
1141
|
*/
|
|
857
|
-
data_source:
|
|
1142
|
+
data_source:
|
|
1143
|
+
| CreateEvalJSONLRunDataSource
|
|
1144
|
+
| CreateEvalCompletionsRunDataSource
|
|
1145
|
+
| RunListResponse.Completions;
|
|
858
1146
|
|
|
859
1147
|
/**
|
|
860
1148
|
* An object representing an error response from the Eval API.
|
|
@@ -894,12 +1182,12 @@ export interface RunCancelResponse {
|
|
|
894
1182
|
/**
|
|
895
1183
|
* Usage statistics for each model during the evaluation run.
|
|
896
1184
|
*/
|
|
897
|
-
per_model_usage: Array<
|
|
1185
|
+
per_model_usage: Array<RunListResponse.PerModelUsage>;
|
|
898
1186
|
|
|
899
1187
|
/**
|
|
900
1188
|
* Results per testing criteria applied during the evaluation run.
|
|
901
1189
|
*/
|
|
902
|
-
per_testing_criteria_results: Array<
|
|
1190
|
+
per_testing_criteria_results: Array<RunListResponse.PerTestingCriteriaResult>;
|
|
903
1191
|
|
|
904
1192
|
/**
|
|
905
1193
|
* The URL to the rendered evaluation run report on the UI dashboard.
|
|
@@ -909,7 +1197,7 @@ export interface RunCancelResponse {
|
|
|
909
1197
|
/**
|
|
910
1198
|
* Counters summarizing the outcomes of the evaluation run.
|
|
911
1199
|
*/
|
|
912
|
-
result_counts:
|
|
1200
|
+
result_counts: RunListResponse.ResultCounts;
|
|
913
1201
|
|
|
914
1202
|
/**
|
|
915
1203
|
* The status of the evaluation run.
|
|
@@ -917,25 +1205,660 @@ export interface RunCancelResponse {
|
|
|
917
1205
|
status: string;
|
|
918
1206
|
}
|
|
919
1207
|
|
|
920
|
-
export namespace
|
|
921
|
-
|
|
1208
|
+
export namespace RunListResponse {
|
|
1209
|
+
/**
|
|
1210
|
+
* A ResponsesRunDataSource object describing a model sampling configuration.
|
|
1211
|
+
*/
|
|
1212
|
+
export interface Completions {
|
|
922
1213
|
/**
|
|
923
|
-
*
|
|
1214
|
+
* A EvalResponsesSource object describing a run data source configuration.
|
|
924
1215
|
*/
|
|
925
|
-
|
|
1216
|
+
source: Completions.FileContent | Completions.FileID | Completions.Responses;
|
|
926
1217
|
|
|
927
1218
|
/**
|
|
928
|
-
* The
|
|
1219
|
+
* The type of run data source. Always `completions`.
|
|
929
1220
|
*/
|
|
930
|
-
|
|
1221
|
+
type: 'completions';
|
|
931
1222
|
|
|
932
|
-
|
|
933
|
-
* The number of invocations.
|
|
934
|
-
*/
|
|
935
|
-
invocation_count: number;
|
|
1223
|
+
input_messages?: Completions.Template | Completions.ItemReference;
|
|
936
1224
|
|
|
937
1225
|
/**
|
|
938
|
-
* The name of the model.
|
|
1226
|
+
* The name of the model to use for generating completions (e.g. "o3-mini").
|
|
1227
|
+
*/
|
|
1228
|
+
model?: string;
|
|
1229
|
+
|
|
1230
|
+
sampling_params?: Completions.SamplingParams;
|
|
1231
|
+
}
|
|
1232
|
+
|
|
1233
|
+
export namespace Completions {
|
|
1234
|
+
export interface FileContent {
|
|
1235
|
+
/**
|
|
1236
|
+
* The content of the jsonl file.
|
|
1237
|
+
*/
|
|
1238
|
+
content: Array<FileContent.Content>;
|
|
1239
|
+
|
|
1240
|
+
/**
|
|
1241
|
+
* The type of jsonl source. Always `file_content`.
|
|
1242
|
+
*/
|
|
1243
|
+
type: 'file_content';
|
|
1244
|
+
}
|
|
1245
|
+
|
|
1246
|
+
export namespace FileContent {
|
|
1247
|
+
export interface Content {
|
|
1248
|
+
item: Record<string, unknown>;
|
|
1249
|
+
|
|
1250
|
+
sample?: Record<string, unknown>;
|
|
1251
|
+
}
|
|
1252
|
+
}
|
|
1253
|
+
|
|
1254
|
+
export interface FileID {
|
|
1255
|
+
/**
|
|
1256
|
+
* The identifier of the file.
|
|
1257
|
+
*/
|
|
1258
|
+
id: string;
|
|
1259
|
+
|
|
1260
|
+
/**
|
|
1261
|
+
* The type of jsonl source. Always `file_id`.
|
|
1262
|
+
*/
|
|
1263
|
+
type: 'file_id';
|
|
1264
|
+
}
|
|
1265
|
+
|
|
1266
|
+
/**
|
|
1267
|
+
* A EvalResponsesSource object describing a run data source configuration.
|
|
1268
|
+
*/
|
|
1269
|
+
export interface Responses {
|
|
1270
|
+
/**
|
|
1271
|
+
* The type of run data source. Always `responses`.
|
|
1272
|
+
*/
|
|
1273
|
+
type: 'responses';
|
|
1274
|
+
|
|
1275
|
+
/**
|
|
1276
|
+
* Whether to allow parallel tool calls. This is a query parameter used to select
|
|
1277
|
+
* responses.
|
|
1278
|
+
*/
|
|
1279
|
+
allow_parallel_tool_calls?: boolean | null;
|
|
1280
|
+
|
|
1281
|
+
/**
|
|
1282
|
+
* Only include items created after this timestamp (inclusive). This is a query
|
|
1283
|
+
* parameter used to select responses.
|
|
1284
|
+
*/
|
|
1285
|
+
created_after?: number | null;
|
|
1286
|
+
|
|
1287
|
+
/**
|
|
1288
|
+
* Only include items created before this timestamp (inclusive). This is a query
|
|
1289
|
+
* parameter used to select responses.
|
|
1290
|
+
*/
|
|
1291
|
+
created_before?: number | null;
|
|
1292
|
+
|
|
1293
|
+
/**
|
|
1294
|
+
* Whether the response has tool calls. This is a query parameter used to select
|
|
1295
|
+
* responses.
|
|
1296
|
+
*/
|
|
1297
|
+
has_tool_calls?: boolean | null;
|
|
1298
|
+
|
|
1299
|
+
/**
|
|
1300
|
+
* Optional search string for instructions. This is a query parameter used to
|
|
1301
|
+
* select responses.
|
|
1302
|
+
*/
|
|
1303
|
+
instructions_search?: string | null;
|
|
1304
|
+
|
|
1305
|
+
/**
|
|
1306
|
+
* Metadata filter for the responses. This is a query parameter used to select
|
|
1307
|
+
* responses.
|
|
1308
|
+
*/
|
|
1309
|
+
metadata?: unknown | null;
|
|
1310
|
+
|
|
1311
|
+
/**
|
|
1312
|
+
* The name of the model to find responses for. This is a query parameter used to
|
|
1313
|
+
* select responses.
|
|
1314
|
+
*/
|
|
1315
|
+
model?: string | null;
|
|
1316
|
+
|
|
1317
|
+
/**
|
|
1318
|
+
* Optional reasoning effort parameter. This is a query parameter used to select
|
|
1319
|
+
* responses.
|
|
1320
|
+
*/
|
|
1321
|
+
reasoning_effort?: Shared.ReasoningEffort | null;
|
|
1322
|
+
|
|
1323
|
+
/**
|
|
1324
|
+
* Sampling temperature. This is a query parameter used to select responses.
|
|
1325
|
+
*/
|
|
1326
|
+
temperature?: number | null;
|
|
1327
|
+
|
|
1328
|
+
/**
|
|
1329
|
+
* Nucleus sampling parameter. This is a query parameter used to select responses.
|
|
1330
|
+
*/
|
|
1331
|
+
top_p?: number | null;
|
|
1332
|
+
|
|
1333
|
+
/**
|
|
1334
|
+
* List of user identifiers. This is a query parameter used to select responses.
|
|
1335
|
+
*/
|
|
1336
|
+
users?: Array<string> | null;
|
|
1337
|
+
}
|
|
1338
|
+
|
|
1339
|
+
export interface Template {
|
|
1340
|
+
/**
|
|
1341
|
+
* A list of chat messages forming the prompt or context. May include variable
|
|
1342
|
+
* references to the "item" namespace, ie {{item.name}}.
|
|
1343
|
+
*/
|
|
1344
|
+
template: Array<Template.ChatMessage | Template.EvalItem>;
|
|
1345
|
+
|
|
1346
|
+
/**
|
|
1347
|
+
* The type of input messages. Always `template`.
|
|
1348
|
+
*/
|
|
1349
|
+
type: 'template';
|
|
1350
|
+
}
|
|
1351
|
+
|
|
1352
|
+
export namespace Template {
|
|
1353
|
+
export interface ChatMessage {
|
|
1354
|
+
/**
|
|
1355
|
+
* The content of the message.
|
|
1356
|
+
*/
|
|
1357
|
+
content: string;
|
|
1358
|
+
|
|
1359
|
+
/**
|
|
1360
|
+
* The role of the message (e.g. "system", "assistant", "user").
|
|
1361
|
+
*/
|
|
1362
|
+
role: string;
|
|
1363
|
+
}
|
|
1364
|
+
|
|
1365
|
+
/**
|
|
1366
|
+
* A message input to the model with a role indicating instruction following
|
|
1367
|
+
* hierarchy. Instructions given with the `developer` or `system` role take
|
|
1368
|
+
* precedence over instructions given with the `user` role. Messages with the
|
|
1369
|
+
* `assistant` role are presumed to have been generated by the model in previous
|
|
1370
|
+
* interactions.
|
|
1371
|
+
*/
|
|
1372
|
+
export interface EvalItem {
|
|
1373
|
+
/**
|
|
1374
|
+
* Text inputs to the model - can contain template strings.
|
|
1375
|
+
*/
|
|
1376
|
+
content: string | ResponsesAPI.ResponseInputText | EvalItem.OutputText;
|
|
1377
|
+
|
|
1378
|
+
/**
|
|
1379
|
+
* The role of the message input. One of `user`, `assistant`, `system`, or
|
|
1380
|
+
* `developer`.
|
|
1381
|
+
*/
|
|
1382
|
+
role: 'user' | 'assistant' | 'system' | 'developer';
|
|
1383
|
+
|
|
1384
|
+
/**
|
|
1385
|
+
* The type of the message input. Always `message`.
|
|
1386
|
+
*/
|
|
1387
|
+
type?: 'message';
|
|
1388
|
+
}
|
|
1389
|
+
|
|
1390
|
+
export namespace EvalItem {
|
|
1391
|
+
/**
|
|
1392
|
+
* A text output from the model.
|
|
1393
|
+
*/
|
|
1394
|
+
export interface OutputText {
|
|
1395
|
+
/**
|
|
1396
|
+
* The text output from the model.
|
|
1397
|
+
*/
|
|
1398
|
+
text: string;
|
|
1399
|
+
|
|
1400
|
+
/**
|
|
1401
|
+
* The type of the output text. Always `output_text`.
|
|
1402
|
+
*/
|
|
1403
|
+
type: 'output_text';
|
|
1404
|
+
}
|
|
1405
|
+
}
|
|
1406
|
+
}
|
|
1407
|
+
|
|
1408
|
+
export interface ItemReference {
|
|
1409
|
+
/**
|
|
1410
|
+
* A reference to a variable in the "item" namespace. Ie, "item.name"
|
|
1411
|
+
*/
|
|
1412
|
+
item_reference: string;
|
|
1413
|
+
|
|
1414
|
+
/**
|
|
1415
|
+
* The type of input messages. Always `item_reference`.
|
|
1416
|
+
*/
|
|
1417
|
+
type: 'item_reference';
|
|
1418
|
+
}
|
|
1419
|
+
|
|
1420
|
+
export interface SamplingParams {
|
|
1421
|
+
/**
|
|
1422
|
+
* The maximum number of tokens in the generated output.
|
|
1423
|
+
*/
|
|
1424
|
+
max_completion_tokens?: number;
|
|
1425
|
+
|
|
1426
|
+
/**
|
|
1427
|
+
* A seed value to initialize the randomness, during sampling.
|
|
1428
|
+
*/
|
|
1429
|
+
seed?: number;
|
|
1430
|
+
|
|
1431
|
+
/**
|
|
1432
|
+
* A higher temperature increases randomness in the outputs.
|
|
1433
|
+
*/
|
|
1434
|
+
temperature?: number;
|
|
1435
|
+
|
|
1436
|
+
/**
|
|
1437
|
+
* An alternative to temperature for nucleus sampling; 1.0 includes all tokens.
|
|
1438
|
+
*/
|
|
1439
|
+
top_p?: number;
|
|
1440
|
+
}
|
|
1441
|
+
}
|
|
1442
|
+
|
|
1443
|
+
export interface PerModelUsage {
|
|
1444
|
+
/**
|
|
1445
|
+
* The number of tokens retrieved from cache.
|
|
1446
|
+
*/
|
|
1447
|
+
cached_tokens: number;
|
|
1448
|
+
|
|
1449
|
+
/**
|
|
1450
|
+
* The number of completion tokens generated.
|
|
1451
|
+
*/
|
|
1452
|
+
completion_tokens: number;
|
|
1453
|
+
|
|
1454
|
+
/**
|
|
1455
|
+
* The number of invocations.
|
|
1456
|
+
*/
|
|
1457
|
+
invocation_count: number;
|
|
1458
|
+
|
|
1459
|
+
/**
|
|
1460
|
+
* The name of the model.
|
|
1461
|
+
*/
|
|
1462
|
+
model_name: string;
|
|
1463
|
+
|
|
1464
|
+
/**
|
|
1465
|
+
* The number of prompt tokens used.
|
|
1466
|
+
*/
|
|
1467
|
+
prompt_tokens: number;
|
|
1468
|
+
|
|
1469
|
+
/**
|
|
1470
|
+
* The total number of tokens used.
|
|
1471
|
+
*/
|
|
1472
|
+
total_tokens: number;
|
|
1473
|
+
}
|
|
1474
|
+
|
|
1475
|
+
export interface PerTestingCriteriaResult {
|
|
1476
|
+
/**
|
|
1477
|
+
* Number of tests failed for this criteria.
|
|
1478
|
+
*/
|
|
1479
|
+
failed: number;
|
|
1480
|
+
|
|
1481
|
+
/**
|
|
1482
|
+
* Number of tests passed for this criteria.
|
|
1483
|
+
*/
|
|
1484
|
+
passed: number;
|
|
1485
|
+
|
|
1486
|
+
/**
|
|
1487
|
+
* A description of the testing criteria.
|
|
1488
|
+
*/
|
|
1489
|
+
testing_criteria: string;
|
|
1490
|
+
}
|
|
1491
|
+
|
|
1492
|
+
/**
|
|
1493
|
+
* Counters summarizing the outcomes of the evaluation run.
|
|
1494
|
+
*/
|
|
1495
|
+
export interface ResultCounts {
|
|
1496
|
+
/**
|
|
1497
|
+
* Number of output items that resulted in an error.
|
|
1498
|
+
*/
|
|
1499
|
+
errored: number;
|
|
1500
|
+
|
|
1501
|
+
/**
|
|
1502
|
+
* Number of output items that failed to pass the evaluation.
|
|
1503
|
+
*/
|
|
1504
|
+
failed: number;
|
|
1505
|
+
|
|
1506
|
+
/**
|
|
1507
|
+
* Number of output items that passed the evaluation.
|
|
1508
|
+
*/
|
|
1509
|
+
passed: number;
|
|
1510
|
+
|
|
1511
|
+
/**
|
|
1512
|
+
* Total number of executed output items.
|
|
1513
|
+
*/
|
|
1514
|
+
total: number;
|
|
1515
|
+
}
|
|
1516
|
+
}
|
|
1517
|
+
|
|
1518
|
+
export interface RunDeleteResponse {
|
|
1519
|
+
deleted?: boolean;
|
|
1520
|
+
|
|
1521
|
+
object?: string;
|
|
1522
|
+
|
|
1523
|
+
run_id?: string;
|
|
1524
|
+
}
|
|
1525
|
+
|
|
1526
|
+
/**
|
|
1527
|
+
* A schema representing an evaluation run.
|
|
1528
|
+
*/
|
|
1529
|
+
export interface RunCancelResponse {
|
|
1530
|
+
/**
|
|
1531
|
+
* Unique identifier for the evaluation run.
|
|
1532
|
+
*/
|
|
1533
|
+
id: string;
|
|
1534
|
+
|
|
1535
|
+
/**
|
|
1536
|
+
* Unix timestamp (in seconds) when the evaluation run was created.
|
|
1537
|
+
*/
|
|
1538
|
+
created_at: number;
|
|
1539
|
+
|
|
1540
|
+
/**
|
|
1541
|
+
* Information about the run's data source.
|
|
1542
|
+
*/
|
|
1543
|
+
data_source:
|
|
1544
|
+
| CreateEvalJSONLRunDataSource
|
|
1545
|
+
| CreateEvalCompletionsRunDataSource
|
|
1546
|
+
| RunCancelResponse.Completions;
|
|
1547
|
+
|
|
1548
|
+
/**
|
|
1549
|
+
* An object representing an error response from the Eval API.
|
|
1550
|
+
*/
|
|
1551
|
+
error: EvalAPIError;
|
|
1552
|
+
|
|
1553
|
+
/**
|
|
1554
|
+
* The identifier of the associated evaluation.
|
|
1555
|
+
*/
|
|
1556
|
+
eval_id: string;
|
|
1557
|
+
|
|
1558
|
+
/**
|
|
1559
|
+
* Set of 16 key-value pairs that can be attached to an object. This can be useful
|
|
1560
|
+
* for storing additional information about the object in a structured format, and
|
|
1561
|
+
* querying for objects via API or the dashboard.
|
|
1562
|
+
*
|
|
1563
|
+
* Keys are strings with a maximum length of 64 characters. Values are strings with
|
|
1564
|
+
* a maximum length of 512 characters.
|
|
1565
|
+
*/
|
|
1566
|
+
metadata: Shared.Metadata | null;
|
|
1567
|
+
|
|
1568
|
+
/**
|
|
1569
|
+
* The model that is evaluated, if applicable.
|
|
1570
|
+
*/
|
|
1571
|
+
model: string;
|
|
1572
|
+
|
|
1573
|
+
/**
|
|
1574
|
+
* The name of the evaluation run.
|
|
1575
|
+
*/
|
|
1576
|
+
name: string;
|
|
1577
|
+
|
|
1578
|
+
/**
|
|
1579
|
+
* The type of the object. Always "eval.run".
|
|
1580
|
+
*/
|
|
1581
|
+
object: 'eval.run';
|
|
1582
|
+
|
|
1583
|
+
/**
|
|
1584
|
+
* Usage statistics for each model during the evaluation run.
|
|
1585
|
+
*/
|
|
1586
|
+
per_model_usage: Array<RunCancelResponse.PerModelUsage>;
|
|
1587
|
+
|
|
1588
|
+
/**
|
|
1589
|
+
* Results per testing criteria applied during the evaluation run.
|
|
1590
|
+
*/
|
|
1591
|
+
per_testing_criteria_results: Array<RunCancelResponse.PerTestingCriteriaResult>;
|
|
1592
|
+
|
|
1593
|
+
/**
|
|
1594
|
+
* The URL to the rendered evaluation run report on the UI dashboard.
|
|
1595
|
+
*/
|
|
1596
|
+
report_url: string;
|
|
1597
|
+
|
|
1598
|
+
/**
|
|
1599
|
+
* Counters summarizing the outcomes of the evaluation run.
|
|
1600
|
+
*/
|
|
1601
|
+
result_counts: RunCancelResponse.ResultCounts;
|
|
1602
|
+
|
|
1603
|
+
/**
|
|
1604
|
+
* The status of the evaluation run.
|
|
1605
|
+
*/
|
|
1606
|
+
status: string;
|
|
1607
|
+
}
|
|
1608
|
+
|
|
1609
|
+
export namespace RunCancelResponse {
|
|
1610
|
+
/**
|
|
1611
|
+
* A ResponsesRunDataSource object describing a model sampling configuration.
|
|
1612
|
+
*/
|
|
1613
|
+
export interface Completions {
|
|
1614
|
+
/**
|
|
1615
|
+
* A EvalResponsesSource object describing a run data source configuration.
|
|
1616
|
+
*/
|
|
1617
|
+
source: Completions.FileContent | Completions.FileID | Completions.Responses;
|
|
1618
|
+
|
|
1619
|
+
/**
|
|
1620
|
+
* The type of run data source. Always `completions`.
|
|
1621
|
+
*/
|
|
1622
|
+
type: 'completions';
|
|
1623
|
+
|
|
1624
|
+
input_messages?: Completions.Template | Completions.ItemReference;
|
|
1625
|
+
|
|
1626
|
+
/**
|
|
1627
|
+
* The name of the model to use for generating completions (e.g. "o3-mini").
|
|
1628
|
+
*/
|
|
1629
|
+
model?: string;
|
|
1630
|
+
|
|
1631
|
+
sampling_params?: Completions.SamplingParams;
|
|
1632
|
+
}
|
|
1633
|
+
|
|
1634
|
+
export namespace Completions {
|
|
1635
|
+
export interface FileContent {
|
|
1636
|
+
/**
|
|
1637
|
+
* The content of the jsonl file.
|
|
1638
|
+
*/
|
|
1639
|
+
content: Array<FileContent.Content>;
|
|
1640
|
+
|
|
1641
|
+
/**
|
|
1642
|
+
* The type of jsonl source. Always `file_content`.
|
|
1643
|
+
*/
|
|
1644
|
+
type: 'file_content';
|
|
1645
|
+
}
|
|
1646
|
+
|
|
1647
|
+
export namespace FileContent {
|
|
1648
|
+
export interface Content {
|
|
1649
|
+
item: Record<string, unknown>;
|
|
1650
|
+
|
|
1651
|
+
sample?: Record<string, unknown>;
|
|
1652
|
+
}
|
|
1653
|
+
}
|
|
1654
|
+
|
|
1655
|
+
export interface FileID {
|
|
1656
|
+
/**
|
|
1657
|
+
* The identifier of the file.
|
|
1658
|
+
*/
|
|
1659
|
+
id: string;
|
|
1660
|
+
|
|
1661
|
+
/**
|
|
1662
|
+
* The type of jsonl source. Always `file_id`.
|
|
1663
|
+
*/
|
|
1664
|
+
type: 'file_id';
|
|
1665
|
+
}
|
|
1666
|
+
|
|
1667
|
+
/**
|
|
1668
|
+
* A EvalResponsesSource object describing a run data source configuration.
|
|
1669
|
+
*/
|
|
1670
|
+
export interface Responses {
|
|
1671
|
+
/**
|
|
1672
|
+
* The type of run data source. Always `responses`.
|
|
1673
|
+
*/
|
|
1674
|
+
type: 'responses';
|
|
1675
|
+
|
|
1676
|
+
/**
|
|
1677
|
+
* Whether to allow parallel tool calls. This is a query parameter used to select
|
|
1678
|
+
* responses.
|
|
1679
|
+
*/
|
|
1680
|
+
allow_parallel_tool_calls?: boolean | null;
|
|
1681
|
+
|
|
1682
|
+
/**
|
|
1683
|
+
* Only include items created after this timestamp (inclusive). This is a query
|
|
1684
|
+
* parameter used to select responses.
|
|
1685
|
+
*/
|
|
1686
|
+
created_after?: number | null;
|
|
1687
|
+
|
|
1688
|
+
/**
|
|
1689
|
+
* Only include items created before this timestamp (inclusive). This is a query
|
|
1690
|
+
* parameter used to select responses.
|
|
1691
|
+
*/
|
|
1692
|
+
created_before?: number | null;
|
|
1693
|
+
|
|
1694
|
+
/**
|
|
1695
|
+
* Whether the response has tool calls. This is a query parameter used to select
|
|
1696
|
+
* responses.
|
|
1697
|
+
*/
|
|
1698
|
+
has_tool_calls?: boolean | null;
|
|
1699
|
+
|
|
1700
|
+
/**
|
|
1701
|
+
* Optional search string for instructions. This is a query parameter used to
|
|
1702
|
+
* select responses.
|
|
1703
|
+
*/
|
|
1704
|
+
instructions_search?: string | null;
|
|
1705
|
+
|
|
1706
|
+
/**
|
|
1707
|
+
* Metadata filter for the responses. This is a query parameter used to select
|
|
1708
|
+
* responses.
|
|
1709
|
+
*/
|
|
1710
|
+
metadata?: unknown | null;
|
|
1711
|
+
|
|
1712
|
+
/**
|
|
1713
|
+
* The name of the model to find responses for. This is a query parameter used to
|
|
1714
|
+
* select responses.
|
|
1715
|
+
*/
|
|
1716
|
+
model?: string | null;
|
|
1717
|
+
|
|
1718
|
+
/**
|
|
1719
|
+
* Optional reasoning effort parameter. This is a query parameter used to select
|
|
1720
|
+
* responses.
|
|
1721
|
+
*/
|
|
1722
|
+
reasoning_effort?: Shared.ReasoningEffort | null;
|
|
1723
|
+
|
|
1724
|
+
/**
|
|
1725
|
+
* Sampling temperature. This is a query parameter used to select responses.
|
|
1726
|
+
*/
|
|
1727
|
+
temperature?: number | null;
|
|
1728
|
+
|
|
1729
|
+
/**
|
|
1730
|
+
* Nucleus sampling parameter. This is a query parameter used to select responses.
|
|
1731
|
+
*/
|
|
1732
|
+
top_p?: number | null;
|
|
1733
|
+
|
|
1734
|
+
/**
|
|
1735
|
+
* List of user identifiers. This is a query parameter used to select responses.
|
|
1736
|
+
*/
|
|
1737
|
+
users?: Array<string> | null;
|
|
1738
|
+
}
|
|
1739
|
+
|
|
1740
|
+
export interface Template {
|
|
1741
|
+
/**
|
|
1742
|
+
* A list of chat messages forming the prompt or context. May include variable
|
|
1743
|
+
* references to the "item" namespace, ie {{item.name}}.
|
|
1744
|
+
*/
|
|
1745
|
+
template: Array<Template.ChatMessage | Template.EvalItem>;
|
|
1746
|
+
|
|
1747
|
+
/**
|
|
1748
|
+
* The type of input messages. Always `template`.
|
|
1749
|
+
*/
|
|
1750
|
+
type: 'template';
|
|
1751
|
+
}
|
|
1752
|
+
|
|
1753
|
+
export namespace Template {
|
|
1754
|
+
export interface ChatMessage {
|
|
1755
|
+
/**
|
|
1756
|
+
* The content of the message.
|
|
1757
|
+
*/
|
|
1758
|
+
content: string;
|
|
1759
|
+
|
|
1760
|
+
/**
|
|
1761
|
+
* The role of the message (e.g. "system", "assistant", "user").
|
|
1762
|
+
*/
|
|
1763
|
+
role: string;
|
|
1764
|
+
}
|
|
1765
|
+
|
|
1766
|
+
/**
|
|
1767
|
+
* A message input to the model with a role indicating instruction following
|
|
1768
|
+
* hierarchy. Instructions given with the `developer` or `system` role take
|
|
1769
|
+
* precedence over instructions given with the `user` role. Messages with the
|
|
1770
|
+
* `assistant` role are presumed to have been generated by the model in previous
|
|
1771
|
+
* interactions.
|
|
1772
|
+
*/
|
|
1773
|
+
export interface EvalItem {
|
|
1774
|
+
/**
|
|
1775
|
+
* Text inputs to the model - can contain template strings.
|
|
1776
|
+
*/
|
|
1777
|
+
content: string | ResponsesAPI.ResponseInputText | EvalItem.OutputText;
|
|
1778
|
+
|
|
1779
|
+
/**
|
|
1780
|
+
* The role of the message input. One of `user`, `assistant`, `system`, or
|
|
1781
|
+
* `developer`.
|
|
1782
|
+
*/
|
|
1783
|
+
role: 'user' | 'assistant' | 'system' | 'developer';
|
|
1784
|
+
|
|
1785
|
+
/**
|
|
1786
|
+
* The type of the message input. Always `message`.
|
|
1787
|
+
*/
|
|
1788
|
+
type?: 'message';
|
|
1789
|
+
}
|
|
1790
|
+
|
|
1791
|
+
export namespace EvalItem {
|
|
1792
|
+
/**
|
|
1793
|
+
* A text output from the model.
|
|
1794
|
+
*/
|
|
1795
|
+
export interface OutputText {
|
|
1796
|
+
/**
|
|
1797
|
+
* The text output from the model.
|
|
1798
|
+
*/
|
|
1799
|
+
text: string;
|
|
1800
|
+
|
|
1801
|
+
/**
|
|
1802
|
+
* The type of the output text. Always `output_text`.
|
|
1803
|
+
*/
|
|
1804
|
+
type: 'output_text';
|
|
1805
|
+
}
|
|
1806
|
+
}
|
|
1807
|
+
}
|
|
1808
|
+
|
|
1809
|
+
export interface ItemReference {
|
|
1810
|
+
/**
|
|
1811
|
+
* A reference to a variable in the "item" namespace. Ie, "item.name"
|
|
1812
|
+
*/
|
|
1813
|
+
item_reference: string;
|
|
1814
|
+
|
|
1815
|
+
/**
|
|
1816
|
+
* The type of input messages. Always `item_reference`.
|
|
1817
|
+
*/
|
|
1818
|
+
type: 'item_reference';
|
|
1819
|
+
}
|
|
1820
|
+
|
|
1821
|
+
export interface SamplingParams {
|
|
1822
|
+
/**
|
|
1823
|
+
* The maximum number of tokens in the generated output.
|
|
1824
|
+
*/
|
|
1825
|
+
max_completion_tokens?: number;
|
|
1826
|
+
|
|
1827
|
+
/**
|
|
1828
|
+
* A seed value to initialize the randomness, during sampling.
|
|
1829
|
+
*/
|
|
1830
|
+
seed?: number;
|
|
1831
|
+
|
|
1832
|
+
/**
|
|
1833
|
+
* A higher temperature increases randomness in the outputs.
|
|
1834
|
+
*/
|
|
1835
|
+
temperature?: number;
|
|
1836
|
+
|
|
1837
|
+
/**
|
|
1838
|
+
* An alternative to temperature for nucleus sampling; 1.0 includes all tokens.
|
|
1839
|
+
*/
|
|
1840
|
+
top_p?: number;
|
|
1841
|
+
}
|
|
1842
|
+
}
|
|
1843
|
+
|
|
1844
|
+
export interface PerModelUsage {
|
|
1845
|
+
/**
|
|
1846
|
+
* The number of tokens retrieved from cache.
|
|
1847
|
+
*/
|
|
1848
|
+
cached_tokens: number;
|
|
1849
|
+
|
|
1850
|
+
/**
|
|
1851
|
+
* The number of completion tokens generated.
|
|
1852
|
+
*/
|
|
1853
|
+
completion_tokens: number;
|
|
1854
|
+
|
|
1855
|
+
/**
|
|
1856
|
+
* The number of invocations.
|
|
1857
|
+
*/
|
|
1858
|
+
invocation_count: number;
|
|
1859
|
+
|
|
1860
|
+
/**
|
|
1861
|
+
* The name of the model.
|
|
939
1862
|
*/
|
|
940
1863
|
model_name: string;
|
|
941
1864
|
|
|
@@ -997,7 +1920,10 @@ export interface RunCreateParams {
|
|
|
997
1920
|
/**
|
|
998
1921
|
* Details about the run's data source.
|
|
999
1922
|
*/
|
|
1000
|
-
data_source:
|
|
1923
|
+
data_source:
|
|
1924
|
+
| CreateEvalJSONLRunDataSource
|
|
1925
|
+
| CreateEvalCompletionsRunDataSource
|
|
1926
|
+
| RunCreateParams.CreateEvalResponsesRunDataSource;
|
|
1001
1927
|
|
|
1002
1928
|
/**
|
|
1003
1929
|
* Set of 16 key-value pairs that can be attached to an object. This can be useful
|
|
@@ -1015,6 +1941,247 @@ export interface RunCreateParams {
|
|
|
1015
1941
|
name?: string;
|
|
1016
1942
|
}
|
|
1017
1943
|
|
|
1944
|
+
export namespace RunCreateParams {
|
|
1945
|
+
/**
|
|
1946
|
+
* A ResponsesRunDataSource object describing a model sampling configuration.
|
|
1947
|
+
*/
|
|
1948
|
+
export interface CreateEvalResponsesRunDataSource {
|
|
1949
|
+
/**
|
|
1950
|
+
* A EvalResponsesSource object describing a run data source configuration.
|
|
1951
|
+
*/
|
|
1952
|
+
source:
|
|
1953
|
+
| CreateEvalResponsesRunDataSource.FileContent
|
|
1954
|
+
| CreateEvalResponsesRunDataSource.FileID
|
|
1955
|
+
| CreateEvalResponsesRunDataSource.Responses;
|
|
1956
|
+
|
|
1957
|
+
/**
|
|
1958
|
+
* The type of run data source. Always `completions`.
|
|
1959
|
+
*/
|
|
1960
|
+
type: 'completions';
|
|
1961
|
+
|
|
1962
|
+
input_messages?:
|
|
1963
|
+
| CreateEvalResponsesRunDataSource.Template
|
|
1964
|
+
| CreateEvalResponsesRunDataSource.ItemReference;
|
|
1965
|
+
|
|
1966
|
+
/**
|
|
1967
|
+
* The name of the model to use for generating completions (e.g. "o3-mini").
|
|
1968
|
+
*/
|
|
1969
|
+
model?: string;
|
|
1970
|
+
|
|
1971
|
+
sampling_params?: CreateEvalResponsesRunDataSource.SamplingParams;
|
|
1972
|
+
}
|
|
1973
|
+
|
|
1974
|
+
export namespace CreateEvalResponsesRunDataSource {
|
|
1975
|
+
export interface FileContent {
|
|
1976
|
+
/**
|
|
1977
|
+
* The content of the jsonl file.
|
|
1978
|
+
*/
|
|
1979
|
+
content: Array<FileContent.Content>;
|
|
1980
|
+
|
|
1981
|
+
/**
|
|
1982
|
+
* The type of jsonl source. Always `file_content`.
|
|
1983
|
+
*/
|
|
1984
|
+
type: 'file_content';
|
|
1985
|
+
}
|
|
1986
|
+
|
|
1987
|
+
export namespace FileContent {
|
|
1988
|
+
export interface Content {
|
|
1989
|
+
item: Record<string, unknown>;
|
|
1990
|
+
|
|
1991
|
+
sample?: Record<string, unknown>;
|
|
1992
|
+
}
|
|
1993
|
+
}
|
|
1994
|
+
|
|
1995
|
+
export interface FileID {
|
|
1996
|
+
/**
|
|
1997
|
+
* The identifier of the file.
|
|
1998
|
+
*/
|
|
1999
|
+
id: string;
|
|
2000
|
+
|
|
2001
|
+
/**
|
|
2002
|
+
* The type of jsonl source. Always `file_id`.
|
|
2003
|
+
*/
|
|
2004
|
+
type: 'file_id';
|
|
2005
|
+
}
|
|
2006
|
+
|
|
2007
|
+
/**
|
|
2008
|
+
* A EvalResponsesSource object describing a run data source configuration.
|
|
2009
|
+
*/
|
|
2010
|
+
export interface Responses {
|
|
2011
|
+
/**
|
|
2012
|
+
* The type of run data source. Always `responses`.
|
|
2013
|
+
*/
|
|
2014
|
+
type: 'responses';
|
|
2015
|
+
|
|
2016
|
+
/**
|
|
2017
|
+
* Whether to allow parallel tool calls. This is a query parameter used to select
|
|
2018
|
+
* responses.
|
|
2019
|
+
*/
|
|
2020
|
+
allow_parallel_tool_calls?: boolean | null;
|
|
2021
|
+
|
|
2022
|
+
/**
|
|
2023
|
+
* Only include items created after this timestamp (inclusive). This is a query
|
|
2024
|
+
* parameter used to select responses.
|
|
2025
|
+
*/
|
|
2026
|
+
created_after?: number | null;
|
|
2027
|
+
|
|
2028
|
+
/**
|
|
2029
|
+
* Only include items created before this timestamp (inclusive). This is a query
|
|
2030
|
+
* parameter used to select responses.
|
|
2031
|
+
*/
|
|
2032
|
+
created_before?: number | null;
|
|
2033
|
+
|
|
2034
|
+
/**
|
|
2035
|
+
* Whether the response has tool calls. This is a query parameter used to select
|
|
2036
|
+
* responses.
|
|
2037
|
+
*/
|
|
2038
|
+
has_tool_calls?: boolean | null;
|
|
2039
|
+
|
|
2040
|
+
/**
|
|
2041
|
+
* Optional search string for instructions. This is a query parameter used to
|
|
2042
|
+
* select responses.
|
|
2043
|
+
*/
|
|
2044
|
+
instructions_search?: string | null;
|
|
2045
|
+
|
|
2046
|
+
/**
|
|
2047
|
+
* Metadata filter for the responses. This is a query parameter used to select
|
|
2048
|
+
* responses.
|
|
2049
|
+
*/
|
|
2050
|
+
metadata?: unknown | null;
|
|
2051
|
+
|
|
2052
|
+
/**
|
|
2053
|
+
* The name of the model to find responses for. This is a query parameter used to
|
|
2054
|
+
* select responses.
|
|
2055
|
+
*/
|
|
2056
|
+
model?: string | null;
|
|
2057
|
+
|
|
2058
|
+
/**
|
|
2059
|
+
* Optional reasoning effort parameter. This is a query parameter used to select
|
|
2060
|
+
* responses.
|
|
2061
|
+
*/
|
|
2062
|
+
reasoning_effort?: Shared.ReasoningEffort | null;
|
|
2063
|
+
|
|
2064
|
+
/**
|
|
2065
|
+
* Sampling temperature. This is a query parameter used to select responses.
|
|
2066
|
+
*/
|
|
2067
|
+
temperature?: number | null;
|
|
2068
|
+
|
|
2069
|
+
/**
|
|
2070
|
+
* Nucleus sampling parameter. This is a query parameter used to select responses.
|
|
2071
|
+
*/
|
|
2072
|
+
top_p?: number | null;
|
|
2073
|
+
|
|
2074
|
+
/**
|
|
2075
|
+
* List of user identifiers. This is a query parameter used to select responses.
|
|
2076
|
+
*/
|
|
2077
|
+
users?: Array<string> | null;
|
|
2078
|
+
}
|
|
2079
|
+
|
|
2080
|
+
export interface Template {
|
|
2081
|
+
/**
|
|
2082
|
+
* A list of chat messages forming the prompt or context. May include variable
|
|
2083
|
+
* references to the "item" namespace, ie {{item.name}}.
|
|
2084
|
+
*/
|
|
2085
|
+
template: Array<Template.ChatMessage | Template.EvalItem>;
|
|
2086
|
+
|
|
2087
|
+
/**
|
|
2088
|
+
* The type of input messages. Always `template`.
|
|
2089
|
+
*/
|
|
2090
|
+
type: 'template';
|
|
2091
|
+
}
|
|
2092
|
+
|
|
2093
|
+
export namespace Template {
|
|
2094
|
+
export interface ChatMessage {
|
|
2095
|
+
/**
|
|
2096
|
+
* The content of the message.
|
|
2097
|
+
*/
|
|
2098
|
+
content: string;
|
|
2099
|
+
|
|
2100
|
+
/**
|
|
2101
|
+
* The role of the message (e.g. "system", "assistant", "user").
|
|
2102
|
+
*/
|
|
2103
|
+
role: string;
|
|
2104
|
+
}
|
|
2105
|
+
|
|
2106
|
+
/**
|
|
2107
|
+
* A message input to the model with a role indicating instruction following
|
|
2108
|
+
* hierarchy. Instructions given with the `developer` or `system` role take
|
|
2109
|
+
* precedence over instructions given with the `user` role. Messages with the
|
|
2110
|
+
* `assistant` role are presumed to have been generated by the model in previous
|
|
2111
|
+
* interactions.
|
|
2112
|
+
*/
|
|
2113
|
+
export interface EvalItem {
|
|
2114
|
+
/**
|
|
2115
|
+
* Text inputs to the model - can contain template strings.
|
|
2116
|
+
*/
|
|
2117
|
+
content: string | ResponsesAPI.ResponseInputText | EvalItem.OutputText;
|
|
2118
|
+
|
|
2119
|
+
/**
|
|
2120
|
+
* The role of the message input. One of `user`, `assistant`, `system`, or
|
|
2121
|
+
* `developer`.
|
|
2122
|
+
*/
|
|
2123
|
+
role: 'user' | 'assistant' | 'system' | 'developer';
|
|
2124
|
+
|
|
2125
|
+
/**
|
|
2126
|
+
* The type of the message input. Always `message`.
|
|
2127
|
+
*/
|
|
2128
|
+
type?: 'message';
|
|
2129
|
+
}
|
|
2130
|
+
|
|
2131
|
+
export namespace EvalItem {
|
|
2132
|
+
/**
|
|
2133
|
+
* A text output from the model.
|
|
2134
|
+
*/
|
|
2135
|
+
export interface OutputText {
|
|
2136
|
+
/**
|
|
2137
|
+
* The text output from the model.
|
|
2138
|
+
*/
|
|
2139
|
+
text: string;
|
|
2140
|
+
|
|
2141
|
+
/**
|
|
2142
|
+
* The type of the output text. Always `output_text`.
|
|
2143
|
+
*/
|
|
2144
|
+
type: 'output_text';
|
|
2145
|
+
}
|
|
2146
|
+
}
|
|
2147
|
+
}
|
|
2148
|
+
|
|
2149
|
+
export interface ItemReference {
|
|
2150
|
+
/**
|
|
2151
|
+
* A reference to a variable in the "item" namespace. Ie, "item.name"
|
|
2152
|
+
*/
|
|
2153
|
+
item_reference: string;
|
|
2154
|
+
|
|
2155
|
+
/**
|
|
2156
|
+
* The type of input messages. Always `item_reference`.
|
|
2157
|
+
*/
|
|
2158
|
+
type: 'item_reference';
|
|
2159
|
+
}
|
|
2160
|
+
|
|
2161
|
+
export interface SamplingParams {
|
|
2162
|
+
/**
|
|
2163
|
+
* The maximum number of tokens in the generated output.
|
|
2164
|
+
*/
|
|
2165
|
+
max_completion_tokens?: number;
|
|
2166
|
+
|
|
2167
|
+
/**
|
|
2168
|
+
* A seed value to initialize the randomness, during sampling.
|
|
2169
|
+
*/
|
|
2170
|
+
seed?: number;
|
|
2171
|
+
|
|
2172
|
+
/**
|
|
2173
|
+
* A higher temperature increases randomness in the outputs.
|
|
2174
|
+
*/
|
|
2175
|
+
temperature?: number;
|
|
2176
|
+
|
|
2177
|
+
/**
|
|
2178
|
+
* An alternative to temperature for nucleus sampling; 1.0 includes all tokens.
|
|
2179
|
+
*/
|
|
2180
|
+
top_p?: number;
|
|
2181
|
+
}
|
|
2182
|
+
}
|
|
2183
|
+
}
|
|
2184
|
+
|
|
1018
2185
|
export interface RunListParams extends CursorPageParams {
|
|
1019
2186
|
/**
|
|
1020
2187
|
* Sort order for runs by timestamp. Use `asc` for ascending order or `desc` for
|
|
@@ -1023,8 +2190,8 @@ export interface RunListParams extends CursorPageParams {
|
|
|
1023
2190
|
order?: 'asc' | 'desc';
|
|
1024
2191
|
|
|
1025
2192
|
/**
|
|
1026
|
-
* Filter runs by status.
|
|
1027
|
-
*
|
|
2193
|
+
* Filter runs by status. One of `queued` | `in_progress` | `failed` | `completed`
|
|
2194
|
+
* | `canceled`.
|
|
1028
2195
|
*/
|
|
1029
2196
|
status?: 'queued' | 'in_progress' | 'completed' | 'canceled' | 'failed';
|
|
1030
2197
|
}
|