@crewdle/mist-connector-openai 1.0.22 → 1.0.24
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -70,16 +70,17 @@ export class OpenAIGenerativeAIWorkerConnector {
|
|
|
70
70
|
const response = await this.client.audio.transcriptions.create({
|
|
71
71
|
model: options.model.id,
|
|
72
72
|
file,
|
|
73
|
-
response_format: parameters.responseFormat
|
|
73
|
+
response_format: this.transcriptionFormat(options.model.id, parameters.responseFormat),
|
|
74
74
|
language: parameters.language,
|
|
75
75
|
timestamp_granularities: parameters.timestampGranularities ? [parameters.timestampGranularities] : undefined,
|
|
76
76
|
});
|
|
77
77
|
console.log('OpenAIGenerativeAIWorkerConnector.processJob audio transcription response');
|
|
78
|
+
const transcriptionUsage = this.transcriptionUsage(response);
|
|
78
79
|
return {
|
|
79
80
|
type: "prompt" /* GenerativeAIJobType.Prompt */,
|
|
80
81
|
output: (!parameters.responseFormat || parameters.responseFormat === 'json') ? response.text : JSON.stringify(response),
|
|
81
|
-
inputTokens:
|
|
82
|
-
outputTokens:
|
|
82
|
+
inputTokens: transcriptionUsage.inputTokens,
|
|
83
|
+
outputTokens: transcriptionUsage.outputTokens,
|
|
83
84
|
};
|
|
84
85
|
}
|
|
85
86
|
if (options.model.taskType === GenerativeAITaskType.ImageGeneration) {
|
|
@@ -138,6 +139,7 @@ export class OpenAIGenerativeAIWorkerConnector {
|
|
|
138
139
|
const reasoning = this.getReasoning(parameters, options.model.id);
|
|
139
140
|
let inputTokens = 0;
|
|
140
141
|
let outputTokens = 0;
|
|
142
|
+
const inputBuckets = { base: 0, cached: 0 };
|
|
141
143
|
let output = '';
|
|
142
144
|
let resultFile;
|
|
143
145
|
let partial = '';
|
|
@@ -178,6 +180,11 @@ Only output the missing remainder. Do not restart or re-open tags already emitte
|
|
|
178
180
|
partial = '';
|
|
179
181
|
inputTokens += response.usage?.input_tokens ?? 0;
|
|
180
182
|
outputTokens += response.usage?.output_tokens ?? 0;
|
|
183
|
+
{
|
|
184
|
+
const segment = this.usageBuckets(response.usage);
|
|
185
|
+
inputBuckets.base += segment.input.base;
|
|
186
|
+
inputBuckets.cached += segment.input.cached;
|
|
187
|
+
}
|
|
181
188
|
const promises = [];
|
|
182
189
|
for (const content of response.output) {
|
|
183
190
|
if (content.type === 'message') {
|
|
@@ -185,6 +192,19 @@ Only output the missing remainder. Do not restart or re-open tags already emitte
|
|
|
185
192
|
if (message.type === 'output_text') {
|
|
186
193
|
output += message.text;
|
|
187
194
|
partial += message.text;
|
|
195
|
+
if (message.annotations && message.annotations.length > 0) {
|
|
196
|
+
for (const annotation of message.annotations) {
|
|
197
|
+
if (annotation.type === 'container_file_citation') {
|
|
198
|
+
const mimeType = this.getMimeType(annotation.filename);
|
|
199
|
+
const file = await this.client.containers.files.content.retrieve(annotation.file_id, {
|
|
200
|
+
container_id: annotation.container_id,
|
|
201
|
+
});
|
|
202
|
+
console.log('OpenAIGenerativeAIWorkerConnector.processJob file', mimeType);
|
|
203
|
+
const buffer = await file.arrayBuffer();
|
|
204
|
+
resultFile = `data:${mimeType};base64,${Buffer.from(buffer).toString('base64')}`;
|
|
205
|
+
}
|
|
206
|
+
}
|
|
207
|
+
}
|
|
188
208
|
}
|
|
189
209
|
}
|
|
190
210
|
}
|
|
@@ -219,9 +239,55 @@ Only output the missing remainder. Do not restart or re-open tags already emitte
|
|
|
219
239
|
resultFile,
|
|
220
240
|
inputTokens,
|
|
221
241
|
outputTokens,
|
|
242
|
+
tokenBuckets: { input: inputBuckets, output: { base: outputTokens } },
|
|
222
243
|
};
|
|
223
244
|
}
|
|
224
245
|
}
|
|
246
|
+
/**
|
|
247
|
+
* OpenAI reports cached prompt tokens inside the input token total; split
|
|
248
|
+
* them out so billing can rate the cached share via the model's tokenRates.
|
|
249
|
+
* Image models (gpt-image) also detail text vs image input tokens — bucket
|
|
250
|
+
* the text share so docs whose base input SKU is the image price can rate
|
|
251
|
+
* it (e.g. gpt-image-2 `text: 0.625`). Unrated buckets bill at base, so
|
|
252
|
+
* this is a no-op for plain text models. cached can overlap the modality
|
|
253
|
+
* details, hence the clamp.
|
|
254
|
+
*/
|
|
255
|
+
usageBuckets(usage) {
|
|
256
|
+
const total = usage?.input_tokens ?? 0;
|
|
257
|
+
const details = usage?.input_tokens_details ?? {};
|
|
258
|
+
const cached = details.cached_tokens ?? 0;
|
|
259
|
+
const text = details.text_tokens ?? 0;
|
|
260
|
+
const audio = details.audio_tokens ?? 0;
|
|
261
|
+
return { input: { base: Math.max(0, total - cached - text - audio), cached, text, audio }, output: { base: usage?.output_tokens ?? 0 } };
|
|
262
|
+
}
|
|
263
|
+
/**
|
|
264
|
+
* whisper-1 reports no token usage; verbose_json carries the audio duration
|
|
265
|
+
* the platform bills on, so it replaces whisper's default/json format
|
|
266
|
+
* (verbose_json still carries `text`, so json callers see the same output).
|
|
267
|
+
*/
|
|
268
|
+
transcriptionFormat(modelId, requested) {
|
|
269
|
+
if (modelId.startsWith('whisper') && (!requested || requested === 'json')) {
|
|
270
|
+
return 'verbose_json';
|
|
271
|
+
}
|
|
272
|
+
return requested || 'json';
|
|
273
|
+
}
|
|
274
|
+
/**
|
|
275
|
+
* Token-billed transcription models (gpt-4o-transcribe*) report
|
|
276
|
+
* usage.type === 'tokens'. whisper-1 reports no token usage — bill the
|
|
277
|
+
* audio duration (usage.type === 'duration', or verbose_json's `duration`)
|
|
278
|
+
* at 1,000 output tokens per minute, so whisper's catalog outputPrice is
|
|
279
|
+
* USD per 1,000 minutes of audio (mirrors the cloudlet proxy's metering).
|
|
280
|
+
*/
|
|
281
|
+
transcriptionUsage(response) {
|
|
282
|
+
const usage = response?.usage;
|
|
283
|
+
if (usage?.type === 'tokens') {
|
|
284
|
+
return { inputTokens: usage.input_tokens ?? 0, outputTokens: usage.output_tokens ?? 0 };
|
|
285
|
+
}
|
|
286
|
+
const seconds = usage?.type === 'duration'
|
|
287
|
+
? (usage.seconds ?? 0)
|
|
288
|
+
: (typeof response?.duration === 'number' ? response.duration : 0);
|
|
289
|
+
return { inputTokens: 0, outputTokens: Math.round((seconds / 60) * 1000) };
|
|
290
|
+
}
|
|
225
291
|
async *processJobStream(parameters, options) {
|
|
226
292
|
if (!this.client) {
|
|
227
293
|
throw new Error('Client not initialized');
|
|
@@ -248,7 +314,10 @@ Only output the missing remainder. Do not restart or re-open tags already emitte
|
|
|
248
314
|
throw new Error('No file found');
|
|
249
315
|
}
|
|
250
316
|
console.log('OpenAIGenerativeAIWorkerConnector.processJobStream audio transcription');
|
|
251
|
-
|
|
317
|
+
// whisper-1 does not support streaming transcription — it always takes
|
|
318
|
+
// the non-streaming path below (with verbose_json so duration is billed).
|
|
319
|
+
const isWhisper = options.model.id.startsWith('whisper');
|
|
320
|
+
if ((!parameters.responseFormat || parameters.responseFormat === 'json') && !isWhisper) {
|
|
252
321
|
const stream = await this.client.audio.transcriptions.create({
|
|
253
322
|
model: options.model.id,
|
|
254
323
|
file,
|
|
@@ -270,15 +339,16 @@ Only output the missing remainder. Do not restart or re-open tags already emitte
|
|
|
270
339
|
const response = await this.client.audio.transcriptions.create({
|
|
271
340
|
model: options.model.id,
|
|
272
341
|
file,
|
|
273
|
-
response_format:
|
|
342
|
+
response_format: this.transcriptionFormat(options.model.id, parameters.responseFormat),
|
|
274
343
|
language: parameters.language,
|
|
275
344
|
timestamp_granularities: parameters.timestampGranularities ? [parameters.timestampGranularities] : undefined,
|
|
276
345
|
});
|
|
346
|
+
const transcriptionUsage = this.transcriptionUsage(response);
|
|
277
347
|
yield {
|
|
278
348
|
type: "prompt" /* GenerativeAIJobType.Prompt */,
|
|
279
|
-
output: JSON.stringify(response),
|
|
280
|
-
inputTokens:
|
|
281
|
-
outputTokens:
|
|
349
|
+
output: (!parameters.responseFormat || parameters.responseFormat === 'json') ? response.text : JSON.stringify(response),
|
|
350
|
+
inputTokens: transcriptionUsage.inputTokens,
|
|
351
|
+
outputTokens: transcriptionUsage.outputTokens,
|
|
282
352
|
};
|
|
283
353
|
}
|
|
284
354
|
return;
|
|
@@ -392,12 +462,31 @@ Only output the missing remainder. Do not restart or re-open tags already emitte
|
|
|
392
462
|
outputTokens: 0,
|
|
393
463
|
};
|
|
394
464
|
}
|
|
465
|
+
if (chunk.type === 'response.output_text.annotation.added') {
|
|
466
|
+
const annotation = chunk.annotation;
|
|
467
|
+
if (annotation.type === 'container_file_citation') {
|
|
468
|
+
const mimeType = this.getMimeType(annotation.filename);
|
|
469
|
+
const file = await this.client.containers.files.content.retrieve(annotation.file_id, {
|
|
470
|
+
container_id: annotation.container_id,
|
|
471
|
+
});
|
|
472
|
+
console.log('OpenAIGenerativeAIWorkerConnector.processJobStream file', mimeType);
|
|
473
|
+
const buffer = await file.arrayBuffer();
|
|
474
|
+
yield {
|
|
475
|
+
type: "prompt" /* GenerativeAIJobType.Prompt */,
|
|
476
|
+
output: '',
|
|
477
|
+
resultFile: `data:${mimeType};base64,${Buffer.from(buffer).toString('base64')}`,
|
|
478
|
+
inputTokens: 0,
|
|
479
|
+
outputTokens: 0,
|
|
480
|
+
};
|
|
481
|
+
}
|
|
482
|
+
}
|
|
395
483
|
if (chunk.type === 'response.completed') {
|
|
396
484
|
yield {
|
|
397
485
|
type: "prompt" /* GenerativeAIJobType.Prompt */,
|
|
398
486
|
output: '',
|
|
399
487
|
inputTokens: chunk.response.usage?.input_tokens ?? 0,
|
|
400
488
|
outputTokens: chunk.response.usage?.output_tokens ?? 0,
|
|
489
|
+
tokenBuckets: this.usageBuckets(chunk.response.usage),
|
|
401
490
|
};
|
|
402
491
|
}
|
|
403
492
|
if (chunk.type === 'response.incomplete') {
|
|
@@ -407,6 +496,7 @@ Only output the missing remainder. Do not restart or re-open tags already emitte
|
|
|
407
496
|
output: '',
|
|
408
497
|
inputTokens: chunk.response.usage?.input_tokens ?? 0,
|
|
409
498
|
outputTokens: chunk.response.usage?.output_tokens ?? 0,
|
|
499
|
+
tokenBuckets: this.usageBuckets(chunk.response.usage),
|
|
410
500
|
};
|
|
411
501
|
if (!parameters.privacy && chunk.response.incomplete_details?.reason === 'max_output_tokens') {
|
|
412
502
|
if (continuationCount < MAX_CONTINUATIONS) {
|
|
@@ -534,7 +624,7 @@ Only output the missing remainder. Do not restart or re-open tags already emitte
|
|
|
534
624
|
if (c.type === 'file') {
|
|
535
625
|
return {
|
|
536
626
|
type: 'input_file',
|
|
537
|
-
filename: c.file.filename,
|
|
627
|
+
filename: c.file.filename ?? `${Math.random().toString(36).substring(2, 15)}.pdf`,
|
|
538
628
|
file_data: c.file.file_data,
|
|
539
629
|
};
|
|
540
630
|
}
|
|
@@ -639,4 +729,68 @@ Only output the missing remainder. Do not restart or re-open tags already emitte
|
|
|
639
729
|
}
|
|
640
730
|
return undefined;
|
|
641
731
|
}
|
|
732
|
+
getMimeType(filename) {
|
|
733
|
+
const extension = filename.split('.').pop();
|
|
734
|
+
if (extension === 'pdf') {
|
|
735
|
+
return 'application/pdf';
|
|
736
|
+
}
|
|
737
|
+
if (extension === 'jpg' || extension === 'jpeg') {
|
|
738
|
+
return 'image/jpeg';
|
|
739
|
+
}
|
|
740
|
+
if (extension === 'png') {
|
|
741
|
+
return 'image/png';
|
|
742
|
+
}
|
|
743
|
+
if (extension === 'gif') {
|
|
744
|
+
return 'image/gif';
|
|
745
|
+
}
|
|
746
|
+
if (extension === 'webp') {
|
|
747
|
+
return 'image/webp';
|
|
748
|
+
}
|
|
749
|
+
if (extension === 'svg') {
|
|
750
|
+
return 'image/svg+xml';
|
|
751
|
+
}
|
|
752
|
+
if (extension === 'txt') {
|
|
753
|
+
return 'text/plain';
|
|
754
|
+
}
|
|
755
|
+
if (extension === 'html') {
|
|
756
|
+
return 'text/html';
|
|
757
|
+
}
|
|
758
|
+
if (extension === 'css') {
|
|
759
|
+
return 'text/css';
|
|
760
|
+
}
|
|
761
|
+
if (extension === 'js') {
|
|
762
|
+
return 'application/javascript';
|
|
763
|
+
}
|
|
764
|
+
if (extension === 'json') {
|
|
765
|
+
return 'application/json';
|
|
766
|
+
}
|
|
767
|
+
if (extension === 'xml') {
|
|
768
|
+
return 'application/xml';
|
|
769
|
+
}
|
|
770
|
+
if (extension === 'csv') {
|
|
771
|
+
return 'text/csv';
|
|
772
|
+
}
|
|
773
|
+
if (extension === 'tsv') {
|
|
774
|
+
return 'text/tab-separated-values';
|
|
775
|
+
}
|
|
776
|
+
if (extension === 'docx') {
|
|
777
|
+
return 'application/vnd.openxmlformats-officedocument.wordprocessingml.document';
|
|
778
|
+
}
|
|
779
|
+
if (extension === 'doc') {
|
|
780
|
+
return 'application/msword';
|
|
781
|
+
}
|
|
782
|
+
if (extension === 'xls') {
|
|
783
|
+
return 'application/vnd.ms-excel';
|
|
784
|
+
}
|
|
785
|
+
if (extension === 'xlsx') {
|
|
786
|
+
return 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet';
|
|
787
|
+
}
|
|
788
|
+
if (extension === 'pptx') {
|
|
789
|
+
return 'application/vnd.openxmlformats-officedocument.presentationml.presentation';
|
|
790
|
+
}
|
|
791
|
+
if (extension === 'ppt') {
|
|
792
|
+
return 'application/vnd.ms-powerpoint';
|
|
793
|
+
}
|
|
794
|
+
return 'application/octet-stream';
|
|
795
|
+
}
|
|
642
796
|
}
|
|
@@ -8,6 +8,30 @@ export declare class OpenAIGenerativeAIWorkerConnector implements IGenerativeAIW
|
|
|
8
8
|
close(): Promise<void>;
|
|
9
9
|
getEngineType(): GenerativeAIEngineType;
|
|
10
10
|
processJob(parameters: GenerativeAIWorkerConnectorParameters, options?: IGenerativeAIWorkerOptions): Promise<IGenerativeAIWorkerConnectorPromptResult>;
|
|
11
|
+
/**
|
|
12
|
+
* OpenAI reports cached prompt tokens inside the input token total; split
|
|
13
|
+
* them out so billing can rate the cached share via the model's tokenRates.
|
|
14
|
+
* Image models (gpt-image) also detail text vs image input tokens — bucket
|
|
15
|
+
* the text share so docs whose base input SKU is the image price can rate
|
|
16
|
+
* it (e.g. gpt-image-2 `text: 0.625`). Unrated buckets bill at base, so
|
|
17
|
+
* this is a no-op for plain text models. cached can overlap the modality
|
|
18
|
+
* details, hence the clamp.
|
|
19
|
+
*/
|
|
20
|
+
private usageBuckets;
|
|
21
|
+
/**
|
|
22
|
+
* whisper-1 reports no token usage; verbose_json carries the audio duration
|
|
23
|
+
* the platform bills on, so it replaces whisper's default/json format
|
|
24
|
+
* (verbose_json still carries `text`, so json callers see the same output).
|
|
25
|
+
*/
|
|
26
|
+
private transcriptionFormat;
|
|
27
|
+
/**
|
|
28
|
+
* Token-billed transcription models (gpt-4o-transcribe*) report
|
|
29
|
+
* usage.type === 'tokens'. whisper-1 reports no token usage — bill the
|
|
30
|
+
* audio duration (usage.type === 'duration', or verbose_json's `duration`)
|
|
31
|
+
* at 1,000 output tokens per minute, so whisper's catalog outputPrice is
|
|
32
|
+
* USD per 1,000 minutes of audio (mirrors the cloudlet proxy's metering).
|
|
33
|
+
*/
|
|
34
|
+
private transcriptionUsage;
|
|
11
35
|
processJobStream(parameters: GenerativeAIWorkerConnectorParameters, options?: IGenerativeAIWorkerOptions): AsyncGenerator<IGenerativeAIWorkerConnectorPromptResult>;
|
|
12
36
|
private processToolCall;
|
|
13
37
|
private getMessages;
|
|
@@ -15,4 +39,5 @@ export declare class OpenAIGenerativeAIWorkerConnector implements IGenerativeAIW
|
|
|
15
39
|
private getTools;
|
|
16
40
|
private getReasoning;
|
|
17
41
|
private getResponseFormat;
|
|
42
|
+
getMimeType(filename: string): string;
|
|
18
43
|
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@crewdle/mist-connector-openai",
|
|
3
|
-
"version": "1.0.22",
|
|
3
|
+
"version": "1.0.24",
|
|
4
4
|
"description": "",
|
|
5
5
|
"main": "dist/index.js",
|
|
6
6
|
"types": "dist/types/index.d.ts",
|
|
@@ -15,7 +15,7 @@
|
|
|
15
15
|
"dist/"
|
|
16
16
|
],
|
|
17
17
|
"devDependencies": {
|
|
18
|
-
"@crewdle/web-sdk-types": "^1.0.
|
|
18
|
+
"@crewdle/web-sdk-types": "^1.0.58",
|
|
19
19
|
"@types/node": "^22.13.9",
|
|
20
20
|
"typescript": "^5.8.2"
|
|
21
21
|
},
|