@heyputer/puter.js 2.1.2 → 2.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/modules/AI.js CHANGED
@@ -1,5 +1,45 @@
1
1
  import * as utils from '../lib/utils.js';
2
2
 
3
/**
 * Maps a caller-supplied text-to-speech provider name onto a canonical provider id.
 *
 * Matching is case-insensitive and ignores surrounding whitespace.
 *
 * @param {*} value - Provider name supplied by the caller.
 * @returns {string} `'openai'` for "openai"; `'aws-polly'` for "aws", "polly",
 *   "aws-polly" and for any non-string input; otherwise `value` exactly as given.
 */
const normalizeTTSProvider = (value) => {
    if (typeof value !== 'string') {
        return 'aws-polly';
    }

    const key = value.trim().toLowerCase();
    if (key === 'openai') {
        return 'openai';
    }
    if (key === 'aws' || key === 'polly' || key === 'aws-polly') {
        return 'aws-polly';
    }

    // Unrecognised names are handed back untouched so the caller decides what to do with them.
    return value;
};
12
+
13
// Lower-case model-id prefixes; an image model whose lower-cased id starts with one of
// these is routed to the `together-image-generation` driver.
// Frozen because the table is shared module state and must not be edited at runtime.
const TOGETHER_IMAGE_MODEL_PREFIXES = Object.freeze([
    'black-forest-labs/',
    'stabilityai/',
    'togethercomputer/',
    'playgroundai/',
    'runwayml/',
    'lightricks/',
    'sg161222/',
    'wavymulder/',
    'prompthero/',
]);

// Lower-case substrings; an image model whose lower-cased id contains one of these is
// also routed to the `together-image-generation` driver.
const TOGETHER_IMAGE_MODEL_KEYWORDS = Object.freeze([
    'flux',
    'kling',
    'sd3',
    'stable-diffusion',
    'kolors',
]);

// Lower-case model-id prefixes; a video model whose lower-cased id starts with one of
// these is routed to the `together-video-generation` driver.
const TOGETHER_VIDEO_MODEL_PREFIXES = Object.freeze([
    'minimax/',
    'google/',
    'bytedance/',
    'pixverse/',
    'kwaivgi/',
    'vidu/',
    'wan-ai/',
]);
42
+
3
43
  class AI{
4
44
  /**
5
45
  * Creates a new instance with the given authentication token, API origin, and app ID,
@@ -78,48 +118,100 @@ class AI{
78
118
  }
79
119
 
80
120
  img2txt = async (...args) => {
81
- let MAX_INPUT_SIZE = 10 * 1024 * 1024;
121
+ const MAX_INPUT_SIZE = 10 * 1024 * 1024;
122
+ if (!args || args.length === 0) {
123
+ throw { message: 'Arguments are required', code: 'arguments_required' };
124
+ }
125
+
126
+ const isBlobLike = (value) => {
127
+ if (typeof Blob === 'undefined') return false;
128
+ return value instanceof Blob || (typeof File !== 'undefined' && value instanceof File);
129
+ };
130
+ const isPlainObject = (value) => value && typeof value === 'object' && !Array.isArray(value) && !isBlobLike(value);
131
+ const normalizeProvider = (value) => {
132
+ if (!value) return 'aws-textract';
133
+ const normalized = String(value).toLowerCase();
134
+ if (['aws', 'textract', 'aws-textract'].includes(normalized)) return 'aws-textract';
135
+ if (['mistral', 'mistral-ocr'].includes(normalized)) return 'mistral';
136
+ return 'aws-textract';
137
+ };
138
+
82
139
  let options = {};
140
+ if (isPlainObject(args[0])) {
141
+ options = { ...args[0] };
142
+ } else {
143
+ options.source = args[0];
144
+ }
145
+
83
146
  let testMode = false;
147
+ for (let i = 1; i < args.length; i++) {
148
+ const value = args[i];
149
+ if (typeof value === 'boolean') {
150
+ testMode = testMode || value;
151
+ } else if (isPlainObject(value)) {
152
+ options = { ...options, ...value };
153
+ }
154
+ }
84
155
 
85
- // Check that the argument is not undefined or null
86
- if(!args){
87
- throw({message: 'Arguments are required', code: 'arguments_required'});
156
+ if (typeof options.testMode === 'boolean') {
157
+ testMode = options.testMode;
88
158
  }
89
159
 
90
- // if argument is string transform it to the object that the API expects
91
- if (typeof args[0] === 'string' || args[0] instanceof Blob) {
92
- options.source = args[0];
160
+ const provider = normalizeProvider(options.provider);
161
+ delete options.provider;
162
+ delete options.testMode;
163
+
164
+ if (!options.source) {
165
+ throw { message: 'Source is required', code: 'source_required' };
93
166
  }
94
167
 
95
- // if input is a blob, transform it to a data URI
96
- if (args[0].source instanceof Blob) {
97
- options.source = await utils.blobToDataUri(args[0].source);
168
+ if (isBlobLike(options.source)) {
169
+ options.source = await utils.blobToDataUri(options.source);
170
+ } else if (options.source?.source && isBlobLike(options.source.source)) {
171
+ // Support shape { source: Blob }
172
+ options.source = await utils.blobToDataUri(options.source.source);
98
173
  }
99
174
 
100
- // check input size
101
- if (options.source.length > this.MAX_INPUT_SIZE) {
175
+ if (typeof options.source === 'string' &&
176
+ options.source.startsWith('data:') &&
177
+ options.source.length > MAX_INPUT_SIZE) {
102
178
  throw { message: 'Input size cannot be larger than ' + MAX_INPUT_SIZE, code: 'input_too_large' };
103
179
  }
104
180
 
105
- // determine if test mode is enabled
106
- if (typeof args[1] === 'boolean' && args[1] === true ||
107
- typeof args[2] === 'boolean' && args[2] === true ||
108
- typeof args[3] === 'boolean' && args[3] === true) {
109
- testMode = true;
110
- }
111
-
112
- return await utils.make_driver_method(['source'], 'puter-ocr', 'aws-textract', 'recognize', {
113
- test_mode: testMode ?? false,
114
- transform: async (result) => {
181
+ const toText = (result) => {
182
+ if (!result) return '';
183
+ if (Array.isArray(result.blocks) && result.blocks.length) {
115
184
  let str = '';
116
- for (let i = 0; i < result?.blocks?.length; i++) {
117
- if("text/textract:LINE" === result.blocks[i].type)
118
- str += result.blocks[i].text + "\n";
185
+ for (const block of result.blocks) {
186
+ if (typeof block?.text !== 'string') continue;
187
+ if (!block.type || block.type === 'text/textract:LINE' || block.type.startsWith('text/')) {
188
+ str += block.text + '\n';
189
+ }
119
190
  }
120
- return str;
191
+ if (str.trim()) return str;
121
192
  }
122
- }).call(this, options);
193
+ if (Array.isArray(result.pages) && result.pages.length) {
194
+ const markdown = result.pages
195
+ .map(page => (page?.markdown || '').trim())
196
+ .filter(Boolean)
197
+ .join('\n\n');
198
+ if (markdown.trim()) return markdown;
199
+ }
200
+ if (typeof result.document_annotation === 'string') {
201
+ return result.document_annotation;
202
+ }
203
+ if (typeof result.text === 'string') {
204
+ return result.text;
205
+ }
206
+ return '';
207
+ };
208
+
209
+ const driverCall = utils.make_driver_method(['source'], 'puter-ocr', provider, 'recognize', {
210
+ test_mode: testMode ?? false,
211
+ transform: async (result) => toText(result),
212
+ });
213
+
214
+ return await driverCall.call(this, options);
123
215
  }
124
216
 
125
217
  txt2speech = async (...args) => {
@@ -183,23 +275,43 @@ class AI{
183
275
  throw { message: 'Text parameter is required', code: 'text_required' };
184
276
  }
185
277
 
186
- // Validate engine if provided
187
- if (options.engine) {
188
- const validEngines = ['standard', 'neural', 'long-form', 'generative'];
189
- if (!validEngines.includes(options.engine)) {
278
+ const validEngines = ['standard', 'neural', 'long-form', 'generative'];
279
+ let provider = normalizeTTSProvider(options.provider);
280
+
281
+ if (options.engine && normalizeTTSProvider(options.engine) === 'openai' && !options.provider) {
282
+ provider = 'openai';
283
+ }
284
+
285
+ if (provider === 'openai') {
286
+ if (!options.model && typeof options.engine === 'string') {
287
+ options.model = options.engine;
288
+ }
289
+ if (!options.voice) {
290
+ options.voice = 'alloy';
291
+ }
292
+ if (!options.model) {
293
+ options.model = 'gpt-4o-mini-tts';
294
+ }
295
+ if (!options.response_format) {
296
+ options.response_format = 'mp3';
297
+ }
298
+ delete options.engine;
299
+ } else {
300
+ provider = 'aws-polly';
301
+
302
+ if (options.engine && !validEngines.includes(options.engine)) {
190
303
  throw { message: 'Invalid engine. Must be one of: ' + validEngines.join(', '), code: 'invalid_engine' };
191
304
  }
192
- }
193
305
 
194
- // Set default values if not provided
195
- if (!options.voice) {
196
- options.voice = 'Joanna';
197
- }
198
- if (!options.engine) {
199
- options.engine = 'standard';
200
- }
201
- if (!options.language) {
202
- options.language = 'en-US';
306
+ if (!options.voice) {
307
+ options.voice = 'Joanna';
308
+ }
309
+ if (!options.engine) {
310
+ options.engine = 'standard';
311
+ }
312
+ if (!options.language) {
313
+ options.language = 'en-US';
314
+ }
203
315
  }
204
316
 
205
317
  // check input size
@@ -214,12 +326,28 @@ class AI{
214
326
  break;
215
327
  }
216
328
  }
217
-
218
- return await utils.make_driver_method(['source'], 'puter-tts', 'aws-polly', 'synthesize', {
329
+
330
+ const driverName = provider === 'openai' ? 'openai-tts' : 'aws-polly';
331
+
332
+ return await utils.make_driver_method(['source'], 'puter-tts', driverName, 'synthesize', {
219
333
  responseType: 'blob',
220
334
  test_mode: testMode ?? false,
221
335
  transform: async (result) => {
222
- const url = await utils.blob_to_url(result);
336
+ let url;
337
+ if (typeof result === 'string') {
338
+ url = result;
339
+ } else if (result instanceof Blob) {
340
+ url = await utils.blob_to_url(result);
341
+ } else if (result instanceof ArrayBuffer) {
342
+ const blob = new Blob([result]);
343
+ url = await utils.blob_to_url(blob);
344
+ } else if (result && typeof result === 'object' && typeof result.arrayBuffer === 'function') {
345
+ const arrayBuffer = await result.arrayBuffer();
346
+ const blob = new Blob([arrayBuffer], { type: result.type || undefined });
347
+ url = await utils.blob_to_url(blob);
348
+ } else {
349
+ throw { message: 'Unexpected audio response format', code: 'invalid_audio_response' };
350
+ }
223
351
  const audio = new Audio(url);
224
352
  audio.toString = () => url;
225
353
  audio.valueOf = () => url;
@@ -228,16 +356,105 @@ class AI{
228
356
  }).call(this, options);
229
357
  }
230
358
 
359
/**
 * Transcribes or translates audio through the `puter-speech2txt` driver interface
 * using the `openai-speech2txt` driver.
 *
 * Call shapes:
 *   speech2txt(source, [options | testMode], [testMode])
 *   speech2txt(options, [options | testMode], [testMode])
 *
 * Option keys handled here (all other keys are forwarded to the driver call as-is):
 *   - `file`            the audio input; Blob values are converted to a data URI.
 *   - `audio`           alias for `file`; wins when both are present.
 *   - `translate`       truthy selects the driver's `translate` method instead of
 *                       `transcribe`; the key itself is not forwarded.
 *   - `response_format` when 'text' and the result is an object with a string `text`,
 *                       only that string is returned.
 *
 * @param {...*} args - Audio source or options object, then optional options/test-mode flag.
 * @returns {Promise<*>} Result of the driver call after the `transform` below.
 * @throws {{message: string, code: string}} With code `arguments_required`,
 *   `audio_required` or `input_too_large`.
 */
speech2txt = async (...args) => {
    const MAX_INPUT_SIZE = 25 * 1024 * 1024;
    if (!args || !args.length) {
        throw { message: 'Arguments are required', code: 'arguments_required' };
    }

    // `Blob` may not exist as a global; guard the lookup so plain string input still works.
    const isBlob = (value) => typeof Blob !== 'undefined' && value instanceof Blob;
    const isOptionsObject = (value) =>
        Boolean(value) && typeof value === 'object' && !Array.isArray(value) && !isBlob(value);
    const normalizeSource = async (value) => (isBlob(value) ? await utils.blobToDataUri(value) : value);

    // Options are always copied so the caller's object is never modified.
    let options = {};
    let testMode = false;

    const primary = args[0];
    if (isOptionsObject(primary)) {
        options = { ...primary };
    } else {
        options.file = primary;
    }

    if (isOptionsObject(args[1])) {
        options = { ...options, ...args[1] };
    } else if (typeof args[1] === 'boolean') {
        testMode = args[1];
    }

    if (typeof args[2] === 'boolean') {
        testMode = args[2];
    }

    if (options.audio) {
        options.file = options.audio;
        delete options.audio;
    }

    // Single conversion point: covers the positional source, `file` and `audio`.
    options.file = await normalizeSource(options.file);

    if (!options.file) {
        throw { message: 'Audio input is required', code: 'audio_required' };
    }

    if (typeof options.file === 'string' && options.file.startsWith('data:')) {
        // Everything after the first comma is the payload; measure it in place rather
        // than splitting a string that may be tens of megabytes long.
        const dataUri = options.file;
        const commaIndex = dataUri.indexOf(',');
        const base64Length = commaIndex === -1 ? 0 : dataUri.length - commaIndex - 1;
        let padding = 0;
        if (base64Length > 0) {
            padding = dataUri.endsWith('==') ? 2 : (dataUri.endsWith('=') ? 1 : 0);
        }
        // Base64 encodes 3 bytes per 4 characters; '=' padding marks missing bytes.
        const byteLength = Math.floor((base64Length * 3) / 4) - padding;
        if (byteLength > MAX_INPUT_SIZE) {
            throw { message: 'Input size cannot be larger than 25 MB', code: 'input_too_large' };
        }
    }

    const driverMethod = options.translate ? 'translate' : 'transcribe';
    const driverArgs = { ...options };
    delete driverArgs.translate;

    const responseFormat = driverArgs.response_format;

    return await utils.make_driver_method([], 'puter-speech2txt', 'openai-speech2txt', driverMethod, {
        test_mode: testMode,
        transform: async (result) => {
            if (responseFormat === 'text' && result && typeof result === 'object' && typeof result.text === 'string') {
                return result.text;
            }
            return result;
        },
    }).call(this, driverArgs);
}
430
+
231
431
  // Add new methods for TTS engine management
232
432
  txt2speech = Object.assign(this.txt2speech, {
233
433
  /**
234
434
  * List available TTS engines with pricing information
235
435
  * @returns {Promise<Array>} Array of available engines
236
436
  */
237
- listEngines: async () => {
238
- return await utils.make_driver_method(['source'], 'puter-tts', 'aws-polly', 'list_engines', {
437
+ listEngines: async (options = {}) => {
438
+ let provider = 'aws-polly';
439
+ let params = {};
440
+
441
+ if (typeof options === 'string') {
442
+ provider = normalizeTTSProvider(options);
443
+ } else if (options && typeof options === 'object') {
444
+ provider = normalizeTTSProvider(options.provider) || provider;
445
+ params = { ...options };
446
+ delete params.provider;
447
+ }
448
+
449
+ if (provider === 'openai') {
450
+ params.provider = 'openai';
451
+ }
452
+
453
+ const driverName = provider === 'openai' ? 'openai-tts' : 'aws-polly';
454
+
455
+ return await utils.make_driver_method(['source'], 'puter-tts', driverName, 'list_engines', {
239
456
  responseType: 'text',
240
- }).call(this, {});
457
+ }).call(this, params);
241
458
  },
242
459
 
243
460
  /**
@@ -245,13 +462,26 @@ class AI{
245
462
  * @param {string} [engine] - Optional engine filter
246
463
  * @returns {Promise<Array>} Array of available voices
247
464
  */
248
- listVoices: async (engine) => {
249
- const params = {};
250
- if (engine) {
251
- params.engine = engine;
465
+ listVoices: async (options) => {
466
+ let provider = 'aws-polly';
467
+ let params = {};
468
+
469
+ if (typeof options === 'string') {
470
+ params.engine = options;
471
+ } else if (options && typeof options === 'object') {
472
+ provider = normalizeTTSProvider(options.provider) || provider;
473
+ params = { ...options };
474
+ delete params.provider;
252
475
  }
253
476
 
254
- return utils.make_driver_method(['source'], 'puter-tts', 'aws-polly', 'list_voices', {
477
+ if (provider === 'openai') {
478
+ params.provider = 'openai';
479
+ delete params.engine;
480
+ }
481
+
482
+ const driverName = provider === 'openai' ? 'openai-tts' : 'aws-polly';
483
+
484
+ return utils.make_driver_method(['source'], 'puter-tts', driverName, 'list_voices', {
255
485
  responseType: 'text',
256
486
  }).call(this, params);
257
487
  }
@@ -565,6 +795,9 @@ class AI{
565
795
  else if ( requestParams.model.startsWith('openrouter:') ) {
566
796
  driver = 'openrouter';
567
797
  }
798
+ else if ( requestParams.model.startsWith('ollama:') ) {
799
+ driver = 'ollama';
800
+ }
568
801
 
569
802
  // stream flag from userParams
570
803
  if(userParams.stream !== undefined && typeof userParams.stream === 'boolean'){
@@ -576,7 +809,7 @@ class AI{
576
809
  }
577
810
 
578
811
  // Additional parameters to pass from userParams to requestParams
579
- const PARAMS_TO_PASS = ['tools', 'response'];
812
+ const PARAMS_TO_PASS = ['tools', 'response', 'reasoning', 'reasoning_effort', 'text', 'verbosity'];
580
813
  for ( const name of PARAMS_TO_PASS ) {
581
814
  if ( userParams[name] ) {
582
815
  requestParams[name] = userParams[name];
@@ -660,21 +893,155 @@ class AI{
660
893
  if (options.model === "nano-banana")
661
894
  options.model = "gemini-2.5-flash-image-preview";
662
895
 
663
- if (options.model === "gemini-2.5-flash-image-preview")
896
+ const driverHint = typeof options.driver === 'string' ? options.driver : undefined;
897
+ const providerRaw = typeof options.provider === 'string'
898
+ ? options.provider
899
+ : (typeof options.service === 'string' ? options.service : undefined);
900
+ const providerHint = typeof providerRaw === 'string' ? providerRaw.toLowerCase() : undefined;
901
+ const modelLower = typeof options.model === 'string' ? options.model.toLowerCase() : '';
902
+
903
+ const looksLikeTogetherModel =
904
+ typeof options.model === 'string' &&
905
+ (TOGETHER_IMAGE_MODEL_PREFIXES.some(prefix => modelLower.startsWith(prefix)) ||
906
+ TOGETHER_IMAGE_MODEL_KEYWORDS.some(keyword => modelLower.includes(keyword)));
907
+
908
+ if (driverHint) {
909
+ AIService = driverHint;
910
+ } else if (providerHint === 'gemini') {
911
+ AIService = "gemini-image-generation";
912
+ } else if (providerHint === 'together' || providerHint === 'together-ai') {
913
+ AIService = "together-image-generation";
914
+ } else if (options.model === "gemini-2.5-flash-image-preview") {
664
915
  AIService = "gemini-image-generation";
916
+ } else if (looksLikeTogetherModel) {
917
+ AIService = "together-image-generation";
918
+ }
665
919
  // Call the original chat.complete method
666
920
  return await utils.make_driver_method(['prompt'], 'puter-image-generation', AIService, 'generate', {
667
921
  responseType: 'blob',
668
922
  test_mode: testMode ?? false,
669
- transform: async blob => {
923
+ transform: async result => {
924
+ let url;
925
+ if ( typeof result === 'string' ) {
926
+ url = result;
927
+ } else if ( result instanceof Blob ) {
928
+ url = await utils.blob_to_url(result);
929
+ } else if ( result instanceof ArrayBuffer ) {
930
+ const blob = new Blob([result]);
931
+ url = await utils.blob_to_url(blob);
932
+ } else if ( result && typeof result === 'object' && typeof result.arrayBuffer === 'function' ) {
933
+ const arrayBuffer = await result.arrayBuffer();
934
+ const blob = new Blob([arrayBuffer], { type: result.type || undefined });
935
+ url = await utils.blob_to_url(blob);
936
+ } else {
937
+ throw { message: 'Unexpected image response format', code: 'invalid_image_response' };
938
+ }
670
939
  let img = new Image();
671
- img.src = await utils.blob_to_url(blob);
940
+ img.src = url;
672
941
  img.toString = () => img.src;
673
942
  img.valueOf = () => img.src;
674
943
  return img;
675
944
  }
676
945
  }).call(this, options);
677
946
  }
947
+
948
/**
 * Generates a video from a text prompt through the `puter-video-generation`
 * driver interface.
 *
 * Call shapes:
 *   txt2vid(prompt, [testMode])
 *   txt2vid(prompt, options, [testMode])
 *   txt2vid(options, [testMode])
 *
 * Option keys read here (the whole options object is forwarded to the driver call):
 *   - `prompt`   required.
 *   - `model`    defaults to 'sora-2'.
 *   - `duration` copied to `seconds` when `seconds` is not set.
 *   - `driver`   'together' | 'together-ai' | 'together-video-generation' select the
 *                together driver, 'openai' selects the openai driver, any other
 *                non-empty string is used as the driver name verbatim.
 *   - `provider` / `service`  'together' | 'together-ai' select the together driver
 *                (only consulted when no `driver` is given).
 * Without hints, a model id starting with one of TOGETHER_VIDEO_MODEL_PREFIXES selects
 * the together driver; otherwise `openai-video-generation` is used.
 *
 * @param {...*} args - Prompt string and/or options object, plus optional `true` for test mode.
 * @returns {Promise<*>} Result of the driver call; the `transform` below wraps a
 *   resolvable URL in a `<video>` element and otherwise passes the raw result through.
 * @throws {{message: string, code: string}} With code `prompt_required`.
 */
txt2vid = async (...args) => {
    // `typeof null === 'object'`, so null must be excluded explicitly; otherwise
    // reading `options.prompt` below would crash with a TypeError.
    const isOptionsObject = (value) =>
        value !== null && typeof value === 'object' && !Array.isArray(value);

    const [first, second] = args;

    // Work on a copy: `prompt`, `model` and `seconds` are written below and must not
    // leak into the object the caller passed in.
    let options = {};
    if (typeof first === 'string') {
        options = isOptionsObject(second) ? { ...second, prompt: first } : { prompt: first };
    } else if (isOptionsObject(first)) {
        options = { ...first };
    }

    // A literal `true` anywhere after the first argument enables test mode, so the
    // flag also works in the three-argument form txt2vid(prompt, options, true).
    const testMode = args.slice(1).some((value) => value === true);

    if (!options.prompt) {
        throw { message: 'Prompt parameter is required', code: 'prompt_required' };
    }

    if (!options.model) {
        options.model = 'sora-2';
    }

    if (options.duration !== undefined && options.seconds === undefined) {
        options.seconds = options.duration;
    }

    const driverHint = typeof options.driver === 'string' ? options.driver : undefined;
    const driverHintLower = driverHint ? driverHint.toLowerCase() : undefined;
    const providerRaw = typeof options.provider === 'string'
        ? options.provider
        : (typeof options.service === 'string' ? options.service : undefined);
    const providerHint = typeof providerRaw === 'string' ? providerRaw.toLowerCase() : undefined;
    const modelLower = typeof options.model === 'string' ? options.model.toLowerCase() : '';

    const looksLikeTogetherVideoModel = typeof options.model === 'string' &&
        TOGETHER_VIDEO_MODEL_PREFIXES.some((prefix) => modelLower.startsWith(prefix));

    const isTogetherAlias = (hint) => hint === 'together' || hint === 'together-ai';

    // Precedence: explicit driver hint, then provider/service hint, then model-id sniffing.
    let videoService = 'openai-video-generation';
    if (isTogetherAlias(driverHintLower) || driverHintLower === 'together-video-generation') {
        videoService = 'together-video-generation';
    } else if (driverHintLower === 'openai') {
        videoService = 'openai-video-generation';
    } else if (driverHint) {
        videoService = driverHint;
    } else if (isTogetherAlias(providerHint) || looksLikeTogetherVideoModel) {
        videoService = 'together-video-generation';
    }

    return await utils.make_driver_method(['prompt'], 'puter-video-generation', videoService, 'generate', {
        responseType: 'blob',
        test_mode: testMode,
        transform: async (result) => {
            let sourceUrl = null;
            let mimeType = null;
            if (result instanceof Blob) {
                sourceUrl = await utils.blob_to_url(result);
                mimeType = result.type || 'video/mp4';
            } else if (typeof result === 'string') {
                sourceUrl = result;
            } else if (result && typeof result === 'object') {
                sourceUrl = result.asset_url || result.url || result.href || null;
                mimeType = result.mime_type || result.content_type || null;
            }

            // Nothing playable could be derived; hand the raw result back to the caller.
            if (!sourceUrl) {
                return result;
            }

            const video = document.createElement('video');
            video.src = sourceUrl;
            video.controls = true;
            video.preload = 'metadata';
            if (mimeType) {
                video.setAttribute('data-mime-type', mimeType);
            }
            video.setAttribute('data-source', sourceUrl);
            // Let the element stand in for its URL in string contexts.
            video.toString = () => video.src;
            video.valueOf = () => video.src;
            return video;
        }
    }).call(this, options);
}
678
1045
  }
679
1046
 
680
1047
  export default AI;