@crewdle/mist-connector-openai 1.0.22 → 1.0.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -70,16 +70,17 @@ export class OpenAIGenerativeAIWorkerConnector {
70
70
  const response = await this.client.audio.transcriptions.create({
71
71
  model: options.model.id,
72
72
  file,
73
- response_format: parameters.responseFormat || 'json',
73
+ response_format: this.transcriptionFormat(options.model.id, parameters.responseFormat),
74
74
  language: parameters.language,
75
75
  timestamp_granularities: parameters.timestampGranularities ? [parameters.timestampGranularities] : undefined,
76
76
  });
77
77
  console.log('OpenAIGenerativeAIWorkerConnector.processJob audio transcription response');
78
+ const transcriptionUsage = this.transcriptionUsage(response);
78
79
  return {
79
80
  type: "prompt" /* GenerativeAIJobType.Prompt */,
80
81
  output: (!parameters.responseFormat || parameters.responseFormat === 'json') ? response.text : JSON.stringify(response),
81
- inputTokens: response.usage?.type === 'tokens' ? response.usage.input_tokens : 0,
82
- outputTokens: response.usage?.type === 'tokens' ? response.usage.output_tokens : 0,
82
+ inputTokens: transcriptionUsage.inputTokens,
83
+ outputTokens: transcriptionUsage.outputTokens,
83
84
  };
84
85
  }
85
86
  if (options.model.taskType === GenerativeAITaskType.ImageGeneration) {
@@ -138,6 +139,7 @@ export class OpenAIGenerativeAIWorkerConnector {
138
139
  const reasoning = this.getReasoning(parameters, options.model.id);
139
140
  let inputTokens = 0;
140
141
  let outputTokens = 0;
142
+ const inputBuckets = { base: 0, cached: 0 };
141
143
  let output = '';
142
144
  let resultFile;
143
145
  let partial = '';
@@ -178,6 +180,11 @@ Only output the missing remainder. Do not restart or re-open tags already emitte
178
180
  partial = '';
179
181
  inputTokens += response.usage?.input_tokens ?? 0;
180
182
  outputTokens += response.usage?.output_tokens ?? 0;
183
+ {
184
+ const segment = this.usageBuckets(response.usage);
185
+ inputBuckets.base += segment.input.base;
186
+ inputBuckets.cached += segment.input.cached;
187
+ }
181
188
  const promises = [];
182
189
  for (const content of response.output) {
183
190
  if (content.type === 'message') {
@@ -185,6 +192,19 @@ Only output the missing remainder. Do not restart or re-open tags already emitte
185
192
  if (message.type === 'output_text') {
186
193
  output += message.text;
187
194
  partial += message.text;
195
+ if (message.annotations && message.annotations.length > 0) {
196
+ for (const annotation of message.annotations) {
197
+ if (annotation.type === 'container_file_citation') {
198
+ const mimeType = this.getMimeType(annotation.filename);
199
+ const file = await this.client.containers.files.content.retrieve(annotation.file_id, {
200
+ container_id: annotation.container_id,
201
+ });
202
+ console.log('OpenAIGenerativeAIWorkerConnector.processJob file', mimeType);
203
+ const buffer = await file.arrayBuffer();
204
+ resultFile = `data:${mimeType};base64,${Buffer.from(buffer).toString('base64')}`;
205
+ }
206
+ }
207
+ }
188
208
  }
189
209
  }
190
210
  }
@@ -219,9 +239,47 @@ Only output the missing remainder. Do not restart or re-open tags already emitte
219
239
  resultFile,
220
240
  inputTokens,
221
241
  outputTokens,
242
+ tokenBuckets: { input: inputBuckets, output: { base: outputTokens } },
222
243
  };
223
244
  }
224
245
  }
246
+ /**
247
+ * OpenAI reports cached prompt tokens inside the input token total; split
248
+ * them out so billing can rate the cached share via the model's tokenRates.
249
+ */
250
+ usageBuckets(usage) {
251
+ const total = usage?.input_tokens ?? 0;
252
+ const cached = usage?.input_tokens_details?.cached_tokens ?? 0;
253
+ return { input: { base: total - cached, cached }, output: { base: usage?.output_tokens ?? 0 } };
254
+ }
255
+ /**
256
+ * whisper-1 reports no token usage; verbose_json carries the audio duration
257
+ * the platform bills on, so it replaces whisper's default/json format
258
+ * (verbose_json still carries `text`, so json callers see the same output).
259
+ */
260
+ transcriptionFormat(modelId, requested) {
261
+ if (modelId.startsWith('whisper') && (!requested || requested === 'json')) {
262
+ return 'verbose_json';
263
+ }
264
+ return requested || 'json';
265
+ }
266
+ /**
267
+ * Token-billed transcription models (gpt-4o-transcribe*) report
268
+ * usage.type === 'tokens'. whisper-1 reports no token usage — bill the
269
+ * audio duration (usage.type === 'duration', or verbose_json's `duration`)
270
+ * at 1,000 output tokens per minute, so whisper's catalog outputPrice is
271
+ * USD per 1,000 minutes of audio (mirrors the cloudlet proxy's metering).
272
+ */
273
+ transcriptionUsage(response) {
274
+ const usage = response?.usage;
275
+ if (usage?.type === 'tokens') {
276
+ return { inputTokens: usage.input_tokens ?? 0, outputTokens: usage.output_tokens ?? 0 };
277
+ }
278
+ const seconds = usage?.type === 'duration'
279
+ ? (usage.seconds ?? 0)
280
+ : (typeof response?.duration === 'number' ? response.duration : 0);
281
+ return { inputTokens: 0, outputTokens: Math.round((seconds / 60) * 1000) };
282
+ }
225
283
  async *processJobStream(parameters, options) {
226
284
  if (!this.client) {
227
285
  throw new Error('Client not initialized');
@@ -248,7 +306,10 @@ Only output the missing remainder. Do not restart or re-open tags already emitte
248
306
  throw new Error('No file found');
249
307
  }
250
308
  console.log('OpenAIGenerativeAIWorkerConnector.processJobStream audio transcription');
251
- if (!parameters.responseFormat || parameters.responseFormat === 'json') {
309
+ // whisper-1 does not support streaming transcription — it always takes
310
+ // the non-streaming path below (with verbose_json so duration is billed).
311
+ const isWhisper = options.model.id.startsWith('whisper');
312
+ if ((!parameters.responseFormat || parameters.responseFormat === 'json') && !isWhisper) {
252
313
  const stream = await this.client.audio.transcriptions.create({
253
314
  model: options.model.id,
254
315
  file,
@@ -270,15 +331,16 @@ Only output the missing remainder. Do not restart or re-open tags already emitte
270
331
  const response = await this.client.audio.transcriptions.create({
271
332
  model: options.model.id,
272
333
  file,
273
- response_format: parameters.responseFormat ? parameters.responseFormat : 'json',
334
+ response_format: this.transcriptionFormat(options.model.id, parameters.responseFormat),
274
335
  language: parameters.language,
275
336
  timestamp_granularities: parameters.timestampGranularities ? [parameters.timestampGranularities] : undefined,
276
337
  });
338
+ const transcriptionUsage = this.transcriptionUsage(response);
277
339
  yield {
278
340
  type: "prompt" /* GenerativeAIJobType.Prompt */,
279
- output: JSON.stringify(response),
280
- inputTokens: 0,
281
- outputTokens: 0,
341
+ output: (!parameters.responseFormat || parameters.responseFormat === 'json') ? response.text : JSON.stringify(response),
342
+ inputTokens: transcriptionUsage.inputTokens,
343
+ outputTokens: transcriptionUsage.outputTokens,
282
344
  };
283
345
  }
284
346
  return;
@@ -392,12 +454,31 @@ Only output the missing remainder. Do not restart or re-open tags already emitte
392
454
  outputTokens: 0,
393
455
  };
394
456
  }
457
+ if (chunk.type === 'response.output_text.annotation.added') {
458
+ const annotation = chunk.annotation;
459
+ if (annotation.type === 'container_file_citation') {
460
+ const mimeType = this.getMimeType(annotation.filename);
461
+ const file = await this.client.containers.files.content.retrieve(annotation.file_id, {
462
+ container_id: annotation.container_id,
463
+ });
464
+ console.log('OpenAIGenerativeAIWorkerConnector.processJobStream file', mimeType);
465
+ const buffer = await file.arrayBuffer();
466
+ yield {
467
+ type: "prompt" /* GenerativeAIJobType.Prompt */,
468
+ output: '',
469
+ resultFile: `data:${mimeType};base64,${Buffer.from(buffer).toString('base64')}`,
470
+ inputTokens: 0,
471
+ outputTokens: 0,
472
+ };
473
+ }
474
+ }
395
475
  if (chunk.type === 'response.completed') {
396
476
  yield {
397
477
  type: "prompt" /* GenerativeAIJobType.Prompt */,
398
478
  output: '',
399
479
  inputTokens: chunk.response.usage?.input_tokens ?? 0,
400
480
  outputTokens: chunk.response.usage?.output_tokens ?? 0,
481
+ tokenBuckets: this.usageBuckets(chunk.response.usage),
401
482
  };
402
483
  }
403
484
  if (chunk.type === 'response.incomplete') {
@@ -407,6 +488,7 @@ Only output the missing remainder. Do not restart or re-open tags already emitte
407
488
  output: '',
408
489
  inputTokens: chunk.response.usage?.input_tokens ?? 0,
409
490
  outputTokens: chunk.response.usage?.output_tokens ?? 0,
491
+ tokenBuckets: this.usageBuckets(chunk.response.usage),
410
492
  };
411
493
  if (!parameters.privacy && chunk.response.incomplete_details?.reason === 'max_output_tokens') {
412
494
  if (continuationCount < MAX_CONTINUATIONS) {
@@ -534,7 +616,7 @@ Only output the missing remainder. Do not restart or re-open tags already emitte
534
616
  if (c.type === 'file') {
535
617
  return {
536
618
  type: 'input_file',
537
- filename: c.file.filename,
619
+ filename: c.file.filename ?? `${Math.random().toString(36).substring(2, 15)}.pdf`,
538
620
  file_data: c.file.file_data,
539
621
  };
540
622
  }
@@ -639,4 +721,68 @@ Only output the missing remainder. Do not restart or re-open tags already emitte
639
721
  }
640
722
  return undefined;
641
723
  }
724
+ getMimeType(filename) {
725
+ const extension = filename.split('.').pop();
726
+ if (extension === 'pdf') {
727
+ return 'application/pdf';
728
+ }
729
+ if (extension === 'jpg' || extension === 'jpeg') {
730
+ return 'image/jpeg';
731
+ }
732
+ if (extension === 'png') {
733
+ return 'image/png';
734
+ }
735
+ if (extension === 'gif') {
736
+ return 'image/gif';
737
+ }
738
+ if (extension === 'webp') {
739
+ return 'image/webp';
740
+ }
741
+ if (extension === 'svg') {
742
+ return 'image/svg+xml';
743
+ }
744
+ if (extension === 'txt') {
745
+ return 'text/plain';
746
+ }
747
+ if (extension === 'html') {
748
+ return 'text/html';
749
+ }
750
+ if (extension === 'css') {
751
+ return 'text/css';
752
+ }
753
+ if (extension === 'js') {
754
+ return 'application/javascript';
755
+ }
756
+ if (extension === 'json') {
757
+ return 'application/json';
758
+ }
759
+ if (extension === 'xml') {
760
+ return 'application/xml';
761
+ }
762
+ if (extension === 'csv') {
763
+ return 'text/csv';
764
+ }
765
+ if (extension === 'tsv') {
766
+ return 'text/tab-separated-values';
767
+ }
768
+ if (extension === 'docx') {
769
+ return 'application/vnd.openxmlformats-officedocument.wordprocessingml.document';
770
+ }
771
+ if (extension === 'doc') {
772
+ return 'application/msword';
773
+ }
774
+ if (extension === 'xls') {
775
+ return 'application/vnd.ms-excel';
776
+ }
777
+ if (extension === 'xlsx') {
778
+ return 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet';
779
+ }
780
+ if (extension === 'pptx') {
781
+ return 'application/vnd.openxmlformats-officedocument.presentationml.presentation';
782
+ }
783
+ if (extension === 'ppt') {
784
+ return 'application/vnd.ms-powerpoint';
785
+ }
786
+ return 'application/octet-stream';
787
+ }
642
788
  }
@@ -8,6 +8,25 @@ export declare class OpenAIGenerativeAIWorkerConnector implements IGenerativeAIW
8
8
  close(): Promise<void>;
9
9
  getEngineType(): GenerativeAIEngineType;
10
10
  processJob(parameters: GenerativeAIWorkerConnectorParameters, options?: IGenerativeAIWorkerOptions): Promise<IGenerativeAIWorkerConnectorPromptResult>;
11
+ /**
12
+ * OpenAI reports cached prompt tokens inside the input token total; split
13
+ * them out so billing can rate the cached share via the model's tokenRates.
14
+ */
15
+ private usageBuckets;
16
+ /**
17
+ * whisper-1 reports no token usage; verbose_json carries the audio duration
18
+ * the platform bills on, so it replaces whisper's default/json format
19
+ * (verbose_json still carries `text`, so json callers see the same output).
20
+ */
21
+ private transcriptionFormat;
22
+ /**
23
+ * Token-billed transcription models (gpt-4o-transcribe*) report
24
+ * usage.type === 'tokens'. whisper-1 reports no token usage — bill the
25
+ * audio duration (usage.type === 'duration', or verbose_json's `duration`)
26
+ * at 1,000 output tokens per minute, so whisper's catalog outputPrice is
27
+ * USD per 1,000 minutes of audio (mirrors the cloudlet proxy's metering).
28
+ */
29
+ private transcriptionUsage;
11
30
  processJobStream(parameters: GenerativeAIWorkerConnectorParameters, options?: IGenerativeAIWorkerOptions): AsyncGenerator<IGenerativeAIWorkerConnectorPromptResult>;
12
31
  private processToolCall;
13
32
  private getMessages;
@@ -15,4 +34,5 @@ export declare class OpenAIGenerativeAIWorkerConnector implements IGenerativeAIW
15
34
  private getTools;
16
35
  private getReasoning;
17
36
  private getResponseFormat;
37
+ getMimeType(filename: string): string;
18
38
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@crewdle/mist-connector-openai",
3
- "version": "1.0.22",
3
+ "version": "1.0.23",
4
4
  "description": "",
5
5
  "main": "dist/index.js",
6
6
  "types": "dist/types/index.d.ts",
@@ -15,7 +15,7 @@
15
15
  "dist/"
16
16
  ],
17
17
  "devDependencies": {
18
- "@crewdle/web-sdk-types": "^1.0.55",
18
+ "@crewdle/web-sdk-types": "^1.0.58",
19
19
  "@types/node": "^22.13.9",
20
20
  "typescript": "^5.8.2"
21
21
  },