@crewdle/mist-connector-openai 1.0.22 → 1.0.24

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -70,16 +70,17 @@ export class OpenAIGenerativeAIWorkerConnector {
70
70
  const response = await this.client.audio.transcriptions.create({
71
71
  model: options.model.id,
72
72
  file,
73
- response_format: parameters.responseFormat || 'json',
73
+ response_format: this.transcriptionFormat(options.model.id, parameters.responseFormat),
74
74
  language: parameters.language,
75
75
  timestamp_granularities: parameters.timestampGranularities ? [parameters.timestampGranularities] : undefined,
76
76
  });
77
77
  console.log('OpenAIGenerativeAIWorkerConnector.processJob audio transcription response');
78
+ const transcriptionUsage = this.transcriptionUsage(response);
78
79
  return {
79
80
  type: "prompt" /* GenerativeAIJobType.Prompt */,
80
81
  output: (!parameters.responseFormat || parameters.responseFormat === 'json') ? response.text : JSON.stringify(response),
81
- inputTokens: response.usage?.type === 'tokens' ? response.usage.input_tokens : 0,
82
- outputTokens: response.usage?.type === 'tokens' ? response.usage.output_tokens : 0,
82
+ inputTokens: transcriptionUsage.inputTokens,
83
+ outputTokens: transcriptionUsage.outputTokens,
83
84
  };
84
85
  }
85
86
  if (options.model.taskType === GenerativeAITaskType.ImageGeneration) {
@@ -138,6 +139,7 @@ export class OpenAIGenerativeAIWorkerConnector {
138
139
  const reasoning = this.getReasoning(parameters, options.model.id);
139
140
  let inputTokens = 0;
140
141
  let outputTokens = 0;
142
+ const inputBuckets = { base: 0, cached: 0 };
141
143
  let output = '';
142
144
  let resultFile;
143
145
  let partial = '';
@@ -178,6 +180,11 @@ Only output the missing remainder. Do not restart or re-open tags already emitte
178
180
  partial = '';
179
181
  inputTokens += response.usage?.input_tokens ?? 0;
180
182
  outputTokens += response.usage?.output_tokens ?? 0;
183
+ {
184
+ const segment = this.usageBuckets(response.usage);
185
+ inputBuckets.base += segment.input.base;
186
+ inputBuckets.cached += segment.input.cached;
187
+ }
181
188
  const promises = [];
182
189
  for (const content of response.output) {
183
190
  if (content.type === 'message') {
@@ -185,6 +192,19 @@ Only output the missing remainder. Do not restart or re-open tags already emitte
185
192
  if (message.type === 'output_text') {
186
193
  output += message.text;
187
194
  partial += message.text;
195
+ if (message.annotations && message.annotations.length > 0) {
196
+ for (const annotation of message.annotations) {
197
+ if (annotation.type === 'container_file_citation') {
198
+ const mimeType = this.getMimeType(annotation.filename);
199
+ const file = await this.client.containers.files.content.retrieve(annotation.file_id, {
200
+ container_id: annotation.container_id,
201
+ });
202
+ console.log('OpenAIGenerativeAIWorkerConnector.processJob file', mimeType);
203
+ const buffer = await file.arrayBuffer();
204
+ resultFile = `data:${mimeType};base64,${Buffer.from(buffer).toString('base64')}`;
205
+ }
206
+ }
207
+ }
188
208
  }
189
209
  }
190
210
  }
@@ -219,9 +239,55 @@ Only output the missing remainder. Do not restart or re-open tags already emitte
219
239
  resultFile,
220
240
  inputTokens,
221
241
  outputTokens,
242
+ tokenBuckets: { input: inputBuckets, output: { base: outputTokens } },
222
243
  };
223
244
  }
224
245
  }
246
+ /**
247
+ * OpenAI reports cached prompt tokens inside the input token total; split
248
+ * them out so billing can rate the cached share via the model's tokenRates.
249
+ * Image models (gpt-image) also detail text vs image input tokens — bucket
250
+ * the text share so docs whose base input SKU is the image price can rate
251
+ * it (e.g. gpt-image-2 `text: 0.625`). Unrated buckets bill at base, so
252
+ * this is a no-op for plain text models. cached can overlap the modality
253
+ * details, hence the clamp.
254
+ */
255
+ usageBuckets(usage) {
256
+ const total = usage?.input_tokens ?? 0;
257
+ const details = usage?.input_tokens_details ?? {};
258
+ const cached = details.cached_tokens ?? 0;
259
+ const text = details.text_tokens ?? 0;
260
+ const audio = details.audio_tokens ?? 0;
261
+ return { input: { base: Math.max(0, total - cached - text - audio), cached, text, audio }, output: { base: usage?.output_tokens ?? 0 } };
262
+ }
263
+ /**
264
+ * whisper-1 reports no token usage; verbose_json carries the audio duration
265
+ * the platform bills on, so it replaces whisper's default/json format
266
+ * (verbose_json still carries `text`, so json callers see the same output).
267
+ */
268
+ transcriptionFormat(modelId, requested) {
269
+ if (modelId.startsWith('whisper') && (!requested || requested === 'json')) {
270
+ return 'verbose_json';
271
+ }
272
+ return requested || 'json';
273
+ }
274
+ /**
275
+ * Token-billed transcription models (gpt-4o-transcribe*) report
276
+ * usage.type === 'tokens'. whisper-1 reports no token usage — bill the
277
+ * audio duration (usage.type === 'duration', or verbose_json's `duration`)
278
+ * at 1,000 output tokens per minute, so whisper's catalog outputPrice is
279
+ * USD per 1,000 minutes of audio (mirrors the cloudlet proxy's metering).
280
+ */
281
+ transcriptionUsage(response) {
282
+ const usage = response?.usage;
283
+ if (usage?.type === 'tokens') {
284
+ return { inputTokens: usage.input_tokens ?? 0, outputTokens: usage.output_tokens ?? 0 };
285
+ }
286
+ const seconds = usage?.type === 'duration'
287
+ ? (usage.seconds ?? 0)
288
+ : (typeof response?.duration === 'number' ? response.duration : 0);
289
+ return { inputTokens: 0, outputTokens: Math.round((seconds / 60) * 1000) };
290
+ }
225
291
  async *processJobStream(parameters, options) {
226
292
  if (!this.client) {
227
293
  throw new Error('Client not initialized');
@@ -248,7 +314,10 @@ Only output the missing remainder. Do not restart or re-open tags already emitte
248
314
  throw new Error('No file found');
249
315
  }
250
316
  console.log('OpenAIGenerativeAIWorkerConnector.processJobStream audio transcription');
251
- if (!parameters.responseFormat || parameters.responseFormat === 'json') {
317
+ // whisper-1 does not support streaming transcription — it always takes
318
+ // the non-streaming path below (with verbose_json so duration is billed).
319
+ const isWhisper = options.model.id.startsWith('whisper');
320
+ if ((!parameters.responseFormat || parameters.responseFormat === 'json') && !isWhisper) {
252
321
  const stream = await this.client.audio.transcriptions.create({
253
322
  model: options.model.id,
254
323
  file,
@@ -270,15 +339,16 @@ Only output the missing remainder. Do not restart or re-open tags already emitte
270
339
  const response = await this.client.audio.transcriptions.create({
271
340
  model: options.model.id,
272
341
  file,
273
- response_format: parameters.responseFormat ? parameters.responseFormat : 'json',
342
+ response_format: this.transcriptionFormat(options.model.id, parameters.responseFormat),
274
343
  language: parameters.language,
275
344
  timestamp_granularities: parameters.timestampGranularities ? [parameters.timestampGranularities] : undefined,
276
345
  });
346
+ const transcriptionUsage = this.transcriptionUsage(response);
277
347
  yield {
278
348
  type: "prompt" /* GenerativeAIJobType.Prompt */,
279
- output: JSON.stringify(response),
280
- inputTokens: 0,
281
- outputTokens: 0,
349
+ output: (!parameters.responseFormat || parameters.responseFormat === 'json') ? response.text : JSON.stringify(response),
350
+ inputTokens: transcriptionUsage.inputTokens,
351
+ outputTokens: transcriptionUsage.outputTokens,
282
352
  };
283
353
  }
284
354
  return;
@@ -392,12 +462,31 @@ Only output the missing remainder. Do not restart or re-open tags already emitte
392
462
  outputTokens: 0,
393
463
  };
394
464
  }
465
+ if (chunk.type === 'response.output_text.annotation.added') {
466
+ const annotation = chunk.annotation;
467
+ if (annotation.type === 'container_file_citation') {
468
+ const mimeType = this.getMimeType(annotation.filename);
469
+ const file = await this.client.containers.files.content.retrieve(annotation.file_id, {
470
+ container_id: annotation.container_id,
471
+ });
472
+ console.log('OpenAIGenerativeAIWorkerConnector.processJobStream file', mimeType);
473
+ const buffer = await file.arrayBuffer();
474
+ yield {
475
+ type: "prompt" /* GenerativeAIJobType.Prompt */,
476
+ output: '',
477
+ resultFile: `data:${mimeType};base64,${Buffer.from(buffer).toString('base64')}`,
478
+ inputTokens: 0,
479
+ outputTokens: 0,
480
+ };
481
+ }
482
+ }
395
483
  if (chunk.type === 'response.completed') {
396
484
  yield {
397
485
  type: "prompt" /* GenerativeAIJobType.Prompt */,
398
486
  output: '',
399
487
  inputTokens: chunk.response.usage?.input_tokens ?? 0,
400
488
  outputTokens: chunk.response.usage?.output_tokens ?? 0,
489
+ tokenBuckets: this.usageBuckets(chunk.response.usage),
401
490
  };
402
491
  }
403
492
  if (chunk.type === 'response.incomplete') {
@@ -407,6 +496,7 @@ Only output the missing remainder. Do not restart or re-open tags already emitte
407
496
  output: '',
408
497
  inputTokens: chunk.response.usage?.input_tokens ?? 0,
409
498
  outputTokens: chunk.response.usage?.output_tokens ?? 0,
499
+ tokenBuckets: this.usageBuckets(chunk.response.usage),
410
500
  };
411
501
  if (!parameters.privacy && chunk.response.incomplete_details?.reason === 'max_output_tokens') {
412
502
  if (continuationCount < MAX_CONTINUATIONS) {
@@ -534,7 +624,7 @@ Only output the missing remainder. Do not restart or re-open tags already emitte
534
624
  if (c.type === 'file') {
535
625
  return {
536
626
  type: 'input_file',
537
- filename: c.file.filename,
627
+ filename: c.file.filename ?? `${Math.random().toString(36).substring(2, 15)}.pdf`,
538
628
  file_data: c.file.file_data,
539
629
  };
540
630
  }
@@ -639,4 +729,68 @@ Only output the missing remainder. Do not restart or re-open tags already emitte
639
729
  }
640
730
  return undefined;
641
731
  }
732
+ getMimeType(filename) {
733
+ const extension = filename.split('.').pop();
734
+ if (extension === 'pdf') {
735
+ return 'application/pdf';
736
+ }
737
+ if (extension === 'jpg' || extension === 'jpeg') {
738
+ return 'image/jpeg';
739
+ }
740
+ if (extension === 'png') {
741
+ return 'image/png';
742
+ }
743
+ if (extension === 'gif') {
744
+ return 'image/gif';
745
+ }
746
+ if (extension === 'webp') {
747
+ return 'image/webp';
748
+ }
749
+ if (extension === 'svg') {
750
+ return 'image/svg+xml';
751
+ }
752
+ if (extension === 'txt') {
753
+ return 'text/plain';
754
+ }
755
+ if (extension === 'html') {
756
+ return 'text/html';
757
+ }
758
+ if (extension === 'css') {
759
+ return 'text/css';
760
+ }
761
+ if (extension === 'js') {
762
+ return 'application/javascript';
763
+ }
764
+ if (extension === 'json') {
765
+ return 'application/json';
766
+ }
767
+ if (extension === 'xml') {
768
+ return 'application/xml';
769
+ }
770
+ if (extension === 'csv') {
771
+ return 'text/csv';
772
+ }
773
+ if (extension === 'tsv') {
774
+ return 'text/tab-separated-values';
775
+ }
776
+ if (extension === 'docx') {
777
+ return 'application/vnd.openxmlformats-officedocument.wordprocessingml.document';
778
+ }
779
+ if (extension === 'doc') {
780
+ return 'application/msword';
781
+ }
782
+ if (extension === 'xls') {
783
+ return 'application/vnd.ms-excel';
784
+ }
785
+ if (extension === 'xlsx') {
786
+ return 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet';
787
+ }
788
+ if (extension === 'pptx') {
789
+ return 'application/vnd.openxmlformats-officedocument.presentationml.presentation';
790
+ }
791
+ if (extension === 'ppt') {
792
+ return 'application/vnd.ms-powerpoint';
793
+ }
794
+ return 'application/octet-stream';
795
+ }
642
796
  }
@@ -8,6 +8,30 @@ export declare class OpenAIGenerativeAIWorkerConnector implements IGenerativeAIW
8
8
  close(): Promise<void>;
9
9
  getEngineType(): GenerativeAIEngineType;
10
10
  processJob(parameters: GenerativeAIWorkerConnectorParameters, options?: IGenerativeAIWorkerOptions): Promise<IGenerativeAIWorkerConnectorPromptResult>;
11
+ /**
12
+ * OpenAI reports cached prompt tokens inside the input token total; split
13
+ * them out so billing can rate the cached share via the model's tokenRates.
14
+ * Image models (gpt-image) also detail text vs image input tokens — bucket
15
+ * the text share so docs whose base input SKU is the image price can rate
16
+ * it (e.g. gpt-image-2 `text: 0.625`). Unrated buckets bill at base, so
17
+ * this is a no-op for plain text models. cached can overlap the modality
18
+ * details, hence the clamp.
19
+ */
20
+ private usageBuckets;
21
+ /**
22
+ * whisper-1 reports no token usage; verbose_json carries the audio duration
23
+ * the platform bills on, so it replaces whisper's default/json format
24
+ * (verbose_json still carries `text`, so json callers see the same output).
25
+ */
26
+ private transcriptionFormat;
27
+ /**
28
+ * Token-billed transcription models (gpt-4o-transcribe*) report
29
+ * usage.type === 'tokens'. whisper-1 reports no token usage — bill the
30
+ * audio duration (usage.type === 'duration', or verbose_json's `duration`)
31
+ * at 1,000 output tokens per minute, so whisper's catalog outputPrice is
32
+ * USD per 1,000 minutes of audio (mirrors the cloudlet proxy's metering).
33
+ */
34
+ private transcriptionUsage;
11
35
  processJobStream(parameters: GenerativeAIWorkerConnectorParameters, options?: IGenerativeAIWorkerOptions): AsyncGenerator<IGenerativeAIWorkerConnectorPromptResult>;
12
36
  private processToolCall;
13
37
  private getMessages;
@@ -15,4 +39,5 @@ export declare class OpenAIGenerativeAIWorkerConnector implements IGenerativeAIW
15
39
  private getTools;
16
40
  private getReasoning;
17
41
  private getResponseFormat;
42
+ getMimeType(filename: string): string;
18
43
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@crewdle/mist-connector-openai",
3
- "version": "1.0.22",
3
+ "version": "1.0.24",
4
4
  "description": "",
5
5
  "main": "dist/index.js",
6
6
  "types": "dist/types/index.d.ts",
@@ -15,7 +15,7 @@
15
15
  "dist/"
16
16
  ],
17
17
  "devDependencies": {
18
- "@crewdle/web-sdk-types": "^1.0.55",
18
+ "@crewdle/web-sdk-types": "^1.0.58",
19
19
  "@types/node": "^22.13.9",
20
20
  "typescript": "^5.8.2"
21
21
  },