@tyvm/knowhow 0.0.69 → 0.0.70

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (210) hide show
  1. package/docs/shell-commands.md +174 -0
  2. package/package.json +1 -1
  3. package/src/agents/base/base.ts +1 -3
  4. package/src/agents/developer/developer.ts +21 -13
  5. package/src/agents/tools/agentCall.ts +4 -2
  6. package/src/agents/tools/fileSearch.ts +5 -1
  7. package/src/agents/tools/startAgentTask.ts +131 -22
  8. package/src/chat/CliChatService.ts +57 -11
  9. package/src/chat/modules/AgentModule.ts +72 -12
  10. package/src/chat/modules/CustomCommandsModule.ts +79 -0
  11. package/src/chat/modules/InternalChatModule.ts +11 -1
  12. package/src/chat/modules/ShellCommandModule.ts +96 -0
  13. package/src/chat/modules/index.ts +1 -0
  14. package/src/chat/types.ts +14 -2
  15. package/src/chat.ts +16 -13
  16. package/src/cli.ts +16 -6
  17. package/src/clients/anthropic.ts +41 -90
  18. package/src/clients/gemini.ts +445 -87
  19. package/src/clients/index.ts +125 -0
  20. package/src/clients/knowhow.ts +81 -0
  21. package/src/clients/openai.ts +256 -145
  22. package/src/clients/pricing/anthropic.ts +90 -0
  23. package/src/clients/pricing/google.ts +65 -0
  24. package/src/clients/pricing/index.ts +4 -0
  25. package/src/clients/pricing/openai.ts +134 -0
  26. package/src/clients/pricing/xai.ts +62 -0
  27. package/src/clients/types.ts +170 -1
  28. package/src/clients/xai.ts +275 -46
  29. package/src/config.ts +61 -15
  30. package/src/embeddings.ts +9 -1
  31. package/src/microphone.ts +15 -16
  32. package/src/migrations.ts +151 -0
  33. package/src/plugins/AgentsMdPlugin.ts +118 -0
  34. package/src/plugins/PluginBase.ts +8 -0
  35. package/src/plugins/downloader/downloader.ts +5 -6
  36. package/src/plugins/embedding.ts +10 -8
  37. package/src/plugins/exec.ts +70 -0
  38. package/src/plugins/github.ts +120 -74
  39. package/src/plugins/language.ts +11 -13
  40. package/src/plugins/plugins.ts +25 -4
  41. package/src/plugins/tmux.ts +132 -0
  42. package/src/plugins/types.ts +1 -0
  43. package/src/plugins/vim.ts +14 -1
  44. package/src/services/AgentSyncFs.ts +417 -0
  45. package/src/services/{AgentSynchronization.ts → AgentSyncKnowhowWeb.ts} +2 -2
  46. package/src/services/EventService.ts +0 -1
  47. package/src/services/KnowhowClient.ts +106 -0
  48. package/src/services/index.ts +4 -2
  49. package/src/types.ts +57 -4
  50. package/src/worker.ts +11 -6
  51. package/tests/manual/modalities/README.md +157 -0
  52. package/tests/manual/modalities/google.modalities.test.ts +335 -0
  53. package/tests/manual/modalities/openai.modalities.test.ts +329 -0
  54. package/tests/manual/modalities/streaming.test.ts +260 -0
  55. package/tests/manual/modalities/xai.modalities.test.ts +307 -0
  56. package/tests/plugins/language/languagePlugin-content-triggers.test.ts +5 -5
  57. package/tests/plugins/language/languagePlugin-integration.test.ts +1 -1
  58. package/tests/plugins/language/languagePlugin.test.ts +17 -8
  59. package/ts_build/package.json +1 -1
  60. package/ts_build/src/agents/base/base.js +1 -1
  61. package/ts_build/src/agents/base/base.js.map +1 -1
  62. package/ts_build/src/agents/developer/developer.js +21 -12
  63. package/ts_build/src/agents/developer/developer.js.map +1 -1
  64. package/ts_build/src/agents/tools/agentCall.js +4 -2
  65. package/ts_build/src/agents/tools/agentCall.js.map +1 -1
  66. package/ts_build/src/agents/tools/executeScript/index.d.ts +1 -1
  67. package/ts_build/src/agents/tools/fileSearch.js +2 -1
  68. package/ts_build/src/agents/tools/fileSearch.js.map +1 -1
  69. package/ts_build/src/agents/tools/github/index.d.ts +1 -1
  70. package/ts_build/src/agents/tools/startAgentTask.d.ts +2 -1
  71. package/ts_build/src/agents/tools/startAgentTask.js +118 -17
  72. package/ts_build/src/agents/tools/startAgentTask.js.map +1 -1
  73. package/ts_build/src/chat/CliChatService.d.ts +4 -0
  74. package/ts_build/src/chat/CliChatService.js +39 -5
  75. package/ts_build/src/chat/CliChatService.js.map +1 -1
  76. package/ts_build/src/chat/modules/AgentModule.d.ts +4 -1
  77. package/ts_build/src/chat/modules/AgentModule.js +49 -11
  78. package/ts_build/src/chat/modules/AgentModule.js.map +1 -1
  79. package/ts_build/src/chat/modules/CustomCommandsModule.d.ts +9 -0
  80. package/ts_build/src/chat/modules/CustomCommandsModule.js +58 -0
  81. package/ts_build/src/chat/modules/CustomCommandsModule.js.map +1 -0
  82. package/ts_build/src/chat/modules/InternalChatModule.d.ts +2 -0
  83. package/ts_build/src/chat/modules/InternalChatModule.js +10 -0
  84. package/ts_build/src/chat/modules/InternalChatModule.js.map +1 -1
  85. package/ts_build/src/chat/modules/ShellCommandModule.d.ts +8 -0
  86. package/ts_build/src/chat/modules/ShellCommandModule.js +83 -0
  87. package/ts_build/src/chat/modules/ShellCommandModule.js.map +1 -0
  88. package/ts_build/src/chat/modules/index.d.ts +1 -0
  89. package/ts_build/src/chat/modules/index.js +3 -1
  90. package/ts_build/src/chat/modules/index.js.map +1 -1
  91. package/ts_build/src/chat/types.d.ts +11 -1
  92. package/ts_build/src/chat.js +16 -13
  93. package/ts_build/src/chat.js.map +1 -1
  94. package/ts_build/src/cli.js +10 -3
  95. package/ts_build/src/cli.js.map +1 -1
  96. package/ts_build/src/clients/anthropic.d.ts +5 -1
  97. package/ts_build/src/clients/anthropic.js +18 -91
  98. package/ts_build/src/clients/anthropic.js.map +1 -1
  99. package/ts_build/src/clients/gemini.d.ts +80 -2
  100. package/ts_build/src/clients/gemini.js +336 -74
  101. package/ts_build/src/clients/gemini.js.map +1 -1
  102. package/ts_build/src/clients/index.d.ts +9 -1
  103. package/ts_build/src/clients/index.js +65 -0
  104. package/ts_build/src/clients/index.js.map +1 -1
  105. package/ts_build/src/clients/knowhow.d.ts +9 -1
  106. package/ts_build/src/clients/knowhow.js +43 -0
  107. package/ts_build/src/clients/knowhow.js.map +1 -1
  108. package/ts_build/src/clients/openai.d.ts +9 -1
  109. package/ts_build/src/clients/openai.js +201 -133
  110. package/ts_build/src/clients/openai.js.map +1 -1
  111. package/ts_build/src/clients/pricing/anthropic.d.ts +17 -0
  112. package/ts_build/src/clients/pricing/anthropic.js +93 -0
  113. package/ts_build/src/clients/pricing/anthropic.js.map +1 -0
  114. package/ts_build/src/clients/pricing/google.d.ts +73 -0
  115. package/ts_build/src/clients/pricing/google.js +68 -0
  116. package/ts_build/src/clients/pricing/google.js.map +1 -0
  117. package/ts_build/src/clients/pricing/index.d.ts +4 -0
  118. package/ts_build/src/clients/pricing/index.js +14 -0
  119. package/ts_build/src/clients/pricing/index.js.map +1 -0
  120. package/ts_build/src/clients/pricing/openai.d.ts +7 -0
  121. package/ts_build/src/clients/pricing/openai.js +137 -0
  122. package/ts_build/src/clients/pricing/openai.js.map +1 -0
  123. package/ts_build/src/clients/pricing/xai.d.ts +26 -0
  124. package/ts_build/src/clients/pricing/xai.js +59 -0
  125. package/ts_build/src/clients/pricing/xai.js.map +1 -0
  126. package/ts_build/src/clients/types.d.ts +135 -0
  127. package/ts_build/src/clients/xai.d.ts +9 -1
  128. package/ts_build/src/clients/xai.js +178 -46
  129. package/ts_build/src/clients/xai.js.map +1 -1
  130. package/ts_build/src/config.d.ts +1 -0
  131. package/ts_build/src/config.js +45 -16
  132. package/ts_build/src/config.js.map +1 -1
  133. package/ts_build/src/embeddings.js +8 -1
  134. package/ts_build/src/embeddings.js.map +1 -1
  135. package/ts_build/src/microphone.js +7 -9
  136. package/ts_build/src/microphone.js.map +1 -1
  137. package/ts_build/src/migrations.d.ts +17 -0
  138. package/ts_build/src/migrations.js +86 -0
  139. package/ts_build/src/migrations.js.map +1 -0
  140. package/ts_build/src/plugins/AgentsMdPlugin.d.ts +13 -0
  141. package/ts_build/src/plugins/AgentsMdPlugin.js +118 -0
  142. package/ts_build/src/plugins/AgentsMdPlugin.js.map +1 -0
  143. package/ts_build/src/plugins/PluginBase.d.ts +1 -0
  144. package/ts_build/src/plugins/PluginBase.js +3 -0
  145. package/ts_build/src/plugins/PluginBase.js.map +1 -1
  146. package/ts_build/src/plugins/downloader/downloader.js +5 -5
  147. package/ts_build/src/plugins/downloader/downloader.js.map +1 -1
  148. package/ts_build/src/plugins/embedding.js +9 -8
  149. package/ts_build/src/plugins/embedding.js.map +1 -1
  150. package/ts_build/src/plugins/exec.d.ts +10 -0
  151. package/ts_build/src/plugins/exec.js +56 -0
  152. package/ts_build/src/plugins/exec.js.map +1 -0
  153. package/ts_build/src/plugins/github.js +93 -51
  154. package/ts_build/src/plugins/github.js.map +1 -1
  155. package/ts_build/src/plugins/language.js +14 -11
  156. package/ts_build/src/plugins/language.js.map +1 -1
  157. package/ts_build/src/plugins/plugins.d.ts +1 -0
  158. package/ts_build/src/plugins/plugins.js +19 -1
  159. package/ts_build/src/plugins/plugins.js.map +1 -1
  160. package/ts_build/src/plugins/tmux.d.ts +14 -0
  161. package/ts_build/src/plugins/tmux.js +108 -0
  162. package/ts_build/src/plugins/tmux.js.map +1 -0
  163. package/ts_build/src/plugins/types.d.ts +1 -0
  164. package/ts_build/src/plugins/vim.js +11 -1
  165. package/ts_build/src/plugins/vim.js.map +1 -1
  166. package/ts_build/src/services/AgentSyncFs.d.ts +34 -0
  167. package/ts_build/src/services/AgentSyncFs.js +325 -0
  168. package/ts_build/src/services/AgentSyncFs.js.map +1 -0
  169. package/ts_build/src/services/AgentSyncKnowhowWeb.d.ts +29 -0
  170. package/ts_build/src/services/AgentSyncKnowhowWeb.js +178 -0
  171. package/ts_build/src/services/AgentSyncKnowhowWeb.js.map +1 -0
  172. package/ts_build/src/services/AgentSynchronization.d.ts +1 -1
  173. package/ts_build/src/services/AgentSynchronization.js +3 -3
  174. package/ts_build/src/services/AgentSynchronization.js.map +1 -1
  175. package/ts_build/src/services/EventService.js.map +1 -1
  176. package/ts_build/src/services/KnowhowClient.d.ts +9 -1
  177. package/ts_build/src/services/KnowhowClient.js +58 -0
  178. package/ts_build/src/services/KnowhowClient.js.map +1 -1
  179. package/ts_build/src/services/index.d.ts +2 -1
  180. package/ts_build/src/services/index.js +2 -1
  181. package/ts_build/src/services/index.js.map +1 -1
  182. package/ts_build/src/types.d.ts +26 -1
  183. package/ts_build/src/types.js +45 -4
  184. package/ts_build/src/types.js.map +1 -1
  185. package/ts_build/src/utils/PersistentInputManager.d.ts +28 -0
  186. package/ts_build/src/utils/PersistentInputManager.js +293 -0
  187. package/ts_build/src/utils/PersistentInputManager.js.map +1 -0
  188. package/ts_build/src/worker.js +2 -2
  189. package/ts_build/src/worker.js.map +1 -1
  190. package/ts_build/tests/manual/modalities/google.modalities.test.d.ts +1 -0
  191. package/ts_build/tests/manual/modalities/google.modalities.test.js +252 -0
  192. package/ts_build/tests/manual/modalities/google.modalities.test.js.map +1 -0
  193. package/ts_build/tests/manual/modalities/openai.modalities.test.d.ts +1 -0
  194. package/ts_build/tests/manual/modalities/openai.modalities.test.js +252 -0
  195. package/ts_build/tests/manual/modalities/openai.modalities.test.js.map +1 -0
  196. package/ts_build/tests/manual/modalities/streaming.test.d.ts +1 -0
  197. package/ts_build/tests/manual/modalities/streaming.test.js +206 -0
  198. package/ts_build/tests/manual/modalities/streaming.test.js.map +1 -0
  199. package/ts_build/tests/manual/modalities/xai.modalities.test.d.ts +1 -0
  200. package/ts_build/tests/manual/modalities/xai.modalities.test.js +226 -0
  201. package/ts_build/tests/manual/modalities/xai.modalities.test.js.map +1 -0
  202. package/ts_build/tests/manual/persistent-input-test.d.ts +1 -0
  203. package/ts_build/tests/manual/persistent-input-test.js +35 -0
  204. package/ts_build/tests/manual/persistent-input-test.js.map +1 -0
  205. package/ts_build/tests/plugins/language/languagePlugin-content-triggers.test.js +5 -5
  206. package/ts_build/tests/plugins/language/languagePlugin-content-triggers.test.js.map +1 -1
  207. package/ts_build/tests/plugins/language/languagePlugin-integration.test.js +1 -1
  208. package/ts_build/tests/plugins/language/languagePlugin-integration.test.js.map +1 -1
  209. package/ts_build/tests/plugins/language/languagePlugin.test.js +17 -7
  210. package/ts_build/tests/plugins/language/languagePlugin.test.js.map +1 -1
@@ -9,8 +9,12 @@ import {
9
9
  ToolConfig,
10
10
  UsageMetadata,
11
11
  } from "@google/genai";
12
+ import * as os from "os";
13
+ import * as fsSync from "fs";
14
+ import * as pathSync from "path";
12
15
  import { wait } from "../utils";
13
16
  import { EmbeddingModels, Models } from "../types";
17
+ import { GeminiTextPricing } from "./pricing";
14
18
 
15
19
  import {
16
20
  GenericClient,
@@ -23,6 +27,20 @@ import {
23
27
  MessageContent,
24
28
  ToolCall,
25
29
  OutputMessage,
30
+ AudioTranscriptionOptions,
31
+ AudioTranscriptionResponse,
32
+ AudioGenerationOptions,
33
+ AudioGenerationResponse,
34
+ ImageGenerationOptions,
35
+ ImageGenerationResponse,
36
+ VideoGenerationOptions,
37
+ VideoGenerationResponse,
38
+ VideoStatusOptions,
39
+ VideoStatusResponse,
40
+ FileUploadOptions,
41
+ FileUploadResponse,
42
+ FileDownloadOptions,
43
+ FileDownloadResponse,
26
44
  } from "./types";
27
45
 
28
46
  function getMimeTypeFromUrl(url: string): string {
@@ -32,6 +50,51 @@ function getMimeTypeFromUrl(url: string): string {
32
50
  return "image/jpeg";
33
51
  }
34
52
 
53
+ function getVideoMimeTypeFromUrl(url: string): string {
54
+ if (url.endsWith(".mp4")) return "video/mp4";
55
+ if (url.endsWith(".webm")) return "video/webm";
56
+ if (url.endsWith(".mov")) return "video/quicktime";
57
+ if (url.endsWith(".avi")) return "video/x-msvideo";
58
+ return "video/mp4";
59
+ }
60
+
61
+ /**
62
+ * Converts raw PCM audio data to WAV format by prepending a WAV header.
63
+ * Gemini TTS returns raw 16-bit PCM (audio/L16) which needs a WAV header to be playable.
64
+ */
65
+ function pcmToWav(
66
+ pcmData: Buffer,
67
+ sampleRate: number = 24000,
68
+ numChannels: number = 1,
69
+ bitsPerSample: number = 16
70
+ ): Buffer {
71
+ const dataSize = pcmData.length;
72
+ const headerSize = 44;
73
+ const wavBuffer = Buffer.alloc(headerSize + dataSize);
74
+
75
+ // RIFF header
76
+ wavBuffer.write("RIFF", 0);
77
+ wavBuffer.writeUInt32LE(36 + dataSize, 4); // file size - 8
78
+ wavBuffer.write("WAVE", 8);
79
+
80
+ // fmt chunk
81
+ wavBuffer.write("fmt ", 12);
82
+ wavBuffer.writeUInt32LE(16, 16); // chunk size
83
+ wavBuffer.writeUInt16LE(1, 20); // PCM format
84
+ wavBuffer.writeUInt16LE(numChannels, 22);
85
+ wavBuffer.writeUInt32LE(sampleRate, 24);
86
+ wavBuffer.writeUInt32LE(sampleRate * numChannels * (bitsPerSample / 8), 28); // byte rate
87
+ wavBuffer.writeUInt16LE(numChannels * (bitsPerSample / 8), 32); // block align
88
+ wavBuffer.writeUInt16LE(bitsPerSample, 34);
89
+
90
+ // data chunk
91
+ wavBuffer.write("data", 36);
92
+ wavBuffer.writeUInt32LE(dataSize, 40);
93
+ pcmData.copy(wavBuffer, 44);
94
+
95
+ return wavBuffer;
96
+ }
97
+
35
98
  export class GenericGeminiClient implements GenericClient {
36
99
  private client: GoogleGenAI;
37
100
  private apiKey?: string;
@@ -64,16 +127,33 @@ export class GenericGeminiClient implements GenericClient {
64
127
  return { text: part.text };
65
128
  }
66
129
  if (part.type === "image_url") {
67
- // Google GenAI's fileData part type uses a URI.
68
- // The example uses createPartFromUri which takes a uri string and mimeType.
69
- // We assume the image_url.url can be used as the uri.
70
- // Note: Google's example uploads files first and uses the resulting URI.
71
- // Directly using a URL here might have limitations depending on the URL type
72
- // (e.g., data URLs vs. public http URLs).
73
- const mimeType = getMimeTypeFromUrl(part.image_url.url);
130
+ const url = part.image_url.url;
131
+ if (url.startsWith("data:")) {
132
+ const [header, base64Data] = url.split(",");
133
+ const mimeType = header.split(":")[1].split(";")[0];
134
+ return {
135
+ inlineData: {
136
+ data: base64Data,
137
+ mimeType,
138
+ },
139
+ };
140
+ }
141
+
142
+ // If it's a File API URI
143
+ if (url.startsWith("https://generativelanguage.googleapis.com")) {
144
+ return {
145
+ fileData: {
146
+ fileUri: url,
147
+ mimeType: getMimeTypeFromUrl(url),
148
+ },
149
+ };
150
+ }
151
+ }
152
+ if (part.type === "video_url") {
153
+ const mimeType = getVideoMimeTypeFromUrl(part.video_url.url);
74
154
  return {
75
155
  fileData: {
76
- uri: part.image_url.url,
156
+ fileUri: part.video_url.url,
77
157
  mimeType,
78
158
  },
79
159
  };
@@ -81,9 +161,9 @@ export class GenericGeminiClient implements GenericClient {
81
161
  // Handle other potential generic message content types if necessary
82
162
  // For now, only text and image_url are explicitly handled.
83
163
  console.warn(
84
- `Unsupported generic message content part type: ${(part as any).type}`
164
+ `Unsupported generic message content part type: ${part.type}`
85
165
  );
86
- return { text: `[Unsupported content type: ${(part as any).type}]` };
166
+ return { text: `[Unsupported content type: ${part.type}]` };
87
167
  })
88
168
  .filter((part) => !!part); // Filter out any null/undefined parts if transformation fails
89
169
  }
@@ -119,7 +199,7 @@ export class GenericGeminiClient implements GenericClient {
119
199
  (systemInstruction ? systemInstruction + "\n" : "") +
120
200
  this.transformContentParts(msg.content)
121
201
  .filter((p) => "text" in p && typeof p.text === "string")
122
- .map((p) => (p as any).text)
202
+ .map((p) => p.text)
123
203
  .join("\n");
124
204
  }
125
205
  } else if (msg.role === "user" || msg.role === "assistant") {
@@ -215,13 +295,13 @@ export class GenericGeminiClient implements GenericClient {
215
295
  * @returns A cleaned schema object compatible with Gemini API
216
296
  */
217
297
  private cleanSchemaForGemini(schema: any): any {
218
- if (!schema || typeof schema !== 'object') {
298
+ if (!schema || typeof schema !== "object") {
219
299
  return schema;
220
300
  }
221
301
 
222
302
  // Handle arrays
223
303
  if (Array.isArray(schema)) {
224
- return schema.map(item => this.cleanSchemaForGemini(item));
304
+ return schema.map((item) => this.cleanSchemaForGemini(item));
225
305
  }
226
306
 
227
307
  const cleaned: any = {};
@@ -236,24 +316,29 @@ export class GenericGeminiClient implements GenericClient {
236
316
  // - $ref: JSON Schema references not supported
237
317
  // - $defs: JSON Schema definitions not supported
238
318
  // - positional: internal knowhow property, not part of JSON Schema
239
- if (key === 'additionalProperties' || key === '$ref' || key === '$defs' || key === 'positional') {
319
+ if (
320
+ key === "additionalProperties" ||
321
+ key === "$ref" ||
322
+ key === "$defs" ||
323
+ key === "positional"
324
+ ) {
240
325
  continue;
241
326
  }
242
327
 
243
328
  const value = schema[key];
244
329
 
245
330
  // Convert type to uppercase if it's a string
246
- if (key === 'type' && typeof value === 'string') {
331
+ if (key === "type" && typeof value === "string") {
247
332
  cleaned[key] = value.toUpperCase();
248
333
  }
249
334
  // Handle type arrays (e.g., ["string", "null"])
250
- else if (key === 'type' && Array.isArray(value)) {
251
- cleaned[key] = value.map((t: string) =>
252
- typeof t === 'string' ? t.toUpperCase() : t
335
+ else if (key === "type" && Array.isArray(value)) {
336
+ cleaned[key] = value.map((t: string) =>
337
+ typeof t === "string" ? t.toUpperCase() : t
253
338
  );
254
339
  }
255
340
  // Recursively clean nested objects
256
- else if (typeof value === 'object' && value !== null) {
341
+ else if (typeof value === "object" && value !== null) {
257
342
  cleaned[key] = this.cleanSchemaForGemini(value);
258
343
  }
259
344
  // Copy primitive values as-is
@@ -277,12 +362,14 @@ export class GenericGeminiClient implements GenericClient {
277
362
 
278
363
  const functionDeclarations: FunctionDeclaration[] = tools.map((tool) => {
279
364
  // Clean the entire parameters schema to remove unsupported fields
280
- const cleanedParameters = this.cleanSchemaForGemini(tool.function.parameters);
281
-
365
+ const cleanedParameters = this.cleanSchemaForGemini(
366
+ tool.function.parameters
367
+ );
368
+
282
369
  return {
283
370
  name: tool.function.name,
284
371
  description: tool.function.description || "",
285
- parameters: cleanedParameters as any,
372
+ parameters: cleanedParameters,
286
373
  };
287
374
  });
288
375
 
@@ -391,75 +478,16 @@ export class GenericGeminiClient implements GenericClient {
391
478
  usd_cost: usdCost,
392
479
  };
393
480
  } catch (error) {
394
- console.error("Error calling Google GenAI generateContent:", error);
481
+ console.error(
482
+ "Error calling Google GenAI generateContent:",
483
+ error.message
484
+ );
395
485
  throw error;
396
486
  }
397
487
  }
398
488
 
399
- pricesPerMillion(): { [key: string]: any } {
400
- return {
401
- [Models.google.Gemini_3_Preview]: {
402
- input: 2,
403
- input_gt_200k: 4,
404
- output: 12,
405
- output_gt_200k: 18,
406
- context_caching: 0.2,
407
- context_caching_gt_200k: 0.4,
408
- },
409
- [Models.google.Gemini_25_Flash_Preview]: {
410
- input: 0.3,
411
- output: 2.5,
412
- thinking_output: 3.5,
413
- context_caching: 0.0375,
414
- },
415
- [Models.google.Gemini_25_Pro_Preview]: {
416
- input: 1.25,
417
- input_gt_200k: 2.5,
418
- output: 10.0,
419
- output_gt_200k: 15.0,
420
- context_caching: 0.125,
421
- context_caching_gt_200k: 0.25,
422
- },
423
- [Models.google.Gemini_20_Flash]: {
424
- input: 0.1,
425
- output: 0.4,
426
- context_caching: 0.025,
427
- },
428
- [Models.google.Gemini_20_Flash_Preview_Image_Generation]: {
429
- input: 0.1,
430
- output: 0.4,
431
- image_generation: 0.039,
432
- },
433
- [Models.google.Gemini_20_Flash_Lite]: {
434
- input: 0.075,
435
- output: 0.3,
436
- },
437
- [Models.google.Gemini_15_Flash]: {
438
- input: 0.075,
439
- output: 0.3,
440
- context_caching: 0.01875,
441
- },
442
- [Models.google.Gemini_15_Flash_8B]: {
443
- input: 0.0375,
444
- output: 0.15,
445
- context_caching: 0.01,
446
- },
447
- [Models.google.Gemini_15_Pro]: {
448
- input: 1.25,
449
- output: 5.0,
450
- context_caching: 0.3125,
451
- },
452
- [Models.google.Imagen_3]: {
453
- image_generation: 0.03,
454
- },
455
- [Models.google.Veo_2]: {
456
- video_generation: 0.35,
457
- },
458
- [EmbeddingModels.google.Gemini_Embedding]: {
459
- input: 0, // Free of charge
460
- output: 0, // Free of charge
461
- },
462
- };
489
+ pricesPerMillion() {
490
+ return GeminiTextPricing;
463
491
  }
464
492
 
465
493
  calculateCost(model: string, usage: UsageMetadata): number | undefined {
@@ -558,4 +586,334 @@ export class GenericGeminiClient implements GenericClient {
558
586
  throw error;
559
587
  }
560
588
  }
589
+
590
+ async createAudioTranscription(
591
+ options: AudioTranscriptionOptions
592
+ ): Promise<AudioTranscriptionResponse> {
593
+ throw new Error(
594
+ "Audio transcription is not yet supported by the Gemini client. Use OpenAI client with Whisper model instead."
595
+ );
596
+ }
597
+
598
+ async createAudioGeneration(
599
+ options: AudioGenerationOptions
600
+ ): Promise<AudioGenerationResponse> {
601
+ try {
602
+ const response = await this.client.models.generateContent({
603
+ model: options.model,
604
+ contents: [
605
+ {
606
+ role: "user",
607
+ parts: [{ text: options.input }],
608
+ },
609
+ ],
610
+ config: {
611
+ responseModalities: ["AUDIO"],
612
+ speechConfig: {
613
+ voiceConfig: {
614
+ prebuiltVoiceConfig: {
615
+ voiceName: options.voice || "Puck",
616
+ },
617
+ },
618
+ },
619
+ },
620
+ });
621
+
622
+ // Extract audio data from the response
623
+ // Gemini returns inline audio data in the response parts
624
+ const audioPart = response.candidates?.[0]?.content?.parts?.find(
625
+ (part: any) => part.inlineData?.mimeType?.startsWith("audio/")
626
+ );
627
+
628
+ if (!audioPart || !audioPart.inlineData) {
629
+ throw new Error("No audio data returned from Gemini TTS");
630
+ }
631
+
632
+ // Convert base64 to buffer
633
+ const rawBuffer = Buffer.from(audioPart.inlineData.data, "base64");
634
+ const mimeType = audioPart.inlineData.mimeType || "audio/wav";
635
+
636
+ // Gemini returns raw PCM (audio/L16) - convert to WAV format for playability
637
+ let audioBuffer = rawBuffer;
638
+ if (mimeType.includes("L16") || mimeType.includes("pcm")) {
639
+ // Parse sample rate from mime type e.g. "audio/L16;codec=pcm;rate=24000"
640
+ const rateMatch = mimeType.match(/rate=(\d+)/);
641
+ const sampleRate = rateMatch ? parseInt(rateMatch[1], 10) : 24000;
642
+ audioBuffer = pcmToWav(rawBuffer, sampleRate);
643
+ }
644
+
645
+ return {
646
+ audio: audioBuffer,
647
+ format: "audio/wav",
648
+ };
649
+ } catch (error) {
650
+ console.error("Error calling Gemini TTS:", error);
651
+ throw error;
652
+ }
653
+ }
654
+
655
+ async createImageGeneration(
656
+ options: ImageGenerationOptions
657
+ ): Promise<ImageGenerationResponse> {
658
+ try {
659
+ // Check if using Imagen 3 model or Gemini Flash inline generation
660
+ const isImagen3 = options.model?.includes("imagen");
661
+
662
+ if (isImagen3) {
663
+ // Imagen 3 uses the generateImages endpoint
664
+ const response = await this.client.models.generateImages({
665
+ model: options.model,
666
+ prompt: options.prompt,
667
+ config: {
668
+ numberOfImages: options.n || 1,
669
+ },
670
+ });
671
+
672
+ // Convert response to ImageGenerationResponse format
673
+ const generatedImages = response.generatedImages || [];
674
+ const images = generatedImages.map((img) => ({
675
+ // imageBytes is already a base64-encoded string from the API
676
+ // Don't re-encode it, just use it directly
677
+ b64_json: img.image?.imageBytes
678
+ ? img.image.imageBytes
679
+ : "",
680
+ revised_prompt: options.prompt,
681
+ }));
682
+
683
+ return {
684
+ created: Math.floor(Date.now() / 1000),
685
+ data: images,
686
+ usd_cost: 0.03 * images.length,
687
+ };
688
+ } else {
689
+ // Use Gemini Flash inline image generation (e.g., gemini-2.0-flash-preview-image-generation)
690
+ const response = await this.client.models.generateContent({
691
+ model: options.model,
692
+ contents: [
693
+ {
694
+ role: "user",
695
+ parts: [{ text: options.prompt }],
696
+ },
697
+ ],
698
+ config: {
699
+ responseModalities: ["IMAGE", "TEXT"],
700
+ },
701
+ });
702
+
703
+ // Extract image data from the response
704
+ const imageParts =
705
+ response.candidates?.[0]?.content?.parts?.filter((part: any) =>
706
+ part.inlineData?.mimeType?.startsWith("image/")
707
+ ) || [];
708
+
709
+ if (imageParts.length === 0) {
710
+ throw new Error("No image data returned from Gemini");
711
+ }
712
+
713
+ const images = imageParts.map((part: any) => ({
714
+ b64_json: part.inlineData.data,
715
+ revised_prompt: options.prompt,
716
+ }));
717
+
718
+ const usageMetadata = response.usageMetadata;
719
+ const usdCost = usageMetadata
720
+ ? this.calculateCost(options.model, usageMetadata)
721
+ : undefined;
722
+
723
+ return {
724
+ created: Math.floor(Date.now() / 1000),
725
+ data: images,
726
+ usd_cost: usdCost,
727
+ };
728
+ }
729
+ } catch (error) {
730
+ console.error("Error calling Gemini image generation:", error);
731
+ throw error;
732
+ }
733
+ }
734
+
735
+ async createVideoGeneration(
736
+ options: VideoGenerationOptions
737
+ ): Promise<VideoGenerationResponse> {
738
+ try {
739
+ // Submit the video generation job – do NOT poll here.
740
+ // Use getVideoStatus() to poll and downloadFile() to fetch the result.
741
+ const operation = await this.client.models.generateVideos({
742
+ model: options.model,
743
+ prompt: options.prompt,
744
+ config: {
745
+ numberOfVideos: options.n || 1,
746
+ ...(options.duration && {
747
+ durationSeconds: Math.max(6, options.duration),
748
+ }),
749
+ ...(options.resolution && { resolution: options.resolution }),
750
+ ...(options.aspect_ratio && { aspectRatio: options.aspect_ratio }),
751
+ },
752
+ });
753
+
754
+ // Calculate estimated cost: $0.35 per second of video
755
+ const duration = options.duration || 5; // Default 5 seconds
756
+ const usdCost = (options.n || 1) * duration * 0.35;
757
+
758
+ // Return the operation name as jobId so callers can use getVideoStatus / downloadVideo
759
+ return {
760
+ created: Math.floor(Date.now() / 1000),
761
+ data: [],
762
+ jobId: operation.name,
763
+ usd_cost: usdCost,
764
+ };
765
+ } catch (error) {
766
+ console.error("Error calling Gemini video generation:", error);
767
+ throw error;
768
+ }
769
+ }
770
+
771
+ async getVideoStatus(options: VideoStatusOptions): Promise<VideoStatusResponse> {
772
+ try {
773
+ const operation = await this.client.operations.getVideosOperation({
774
+ operation: { name: options.jobId },
775
+ });
776
+
777
+ if (operation.error) {
778
+ return {
779
+ jobId: options.jobId,
780
+ status: "failed",
781
+ error: JSON.stringify(operation.error),
782
+ };
783
+ }
784
+
785
+ if (!operation.done) {
786
+ return {
787
+ jobId: options.jobId,
788
+ status: "in_progress",
789
+ };
790
+ }
791
+
792
+ // Completed – extract file URIs
793
+ const generatedVideos = operation.response?.generatedVideos || [];
794
+ const data = generatedVideos.map((vid) => {
795
+ const videoBytes: string | undefined = vid.video?.videoBytes;
796
+ const uri: string | undefined = vid.video?.uri;
797
+ return {
798
+ b64_json: videoBytes || undefined,
799
+ url: uri || undefined,
800
+ fileUri: uri || undefined,
801
+ };
802
+ });
803
+
804
+ return {
805
+ jobId: options.jobId,
806
+ status: "completed",
807
+ data,
808
+ };
809
+ } catch (error) {
810
+ console.error("Error checking Gemini video status:", error);
811
+ throw error;
812
+ }
813
+ }
814
+
815
+ /**
816
+ * Download a video (or any file) via the Google GenAI Files API.
817
+ * Pass either `fileId` (the files/* name) or `uri` (the full URI).
818
+ */
819
+ async downloadVideo(options: FileDownloadOptions): Promise<FileDownloadResponse> {
820
+ return this.downloadFile(options);
821
+ }
822
+
823
+ /**
824
+ * Upload a file to the Google GenAI Files API.
825
+ */
826
+ async uploadFile(options: FileUploadOptions): Promise<FileUploadResponse> {
827
+ try {
828
+ const blob = new Blob([options.data], { type: options.mimeType });
829
+ const uploadedFile = await this.client.files.upload({
830
+ file: blob,
831
+ config: {
832
+ mimeType: options.mimeType,
833
+ displayName: options.displayName,
834
+ name: options.fileName,
835
+ },
836
+ });
837
+
838
+ return {
839
+ fileId: uploadedFile.name,
840
+ uri: uploadedFile.uri,
841
+ url: uploadedFile.downloadUri || uploadedFile.uri,
842
+ mimeType: uploadedFile.mimeType,
843
+ sizeBytes: uploadedFile.sizeBytes ? Number(uploadedFile.sizeBytes) : undefined,
844
+ };
845
+ } catch (error) {
846
+ console.error("Error uploading file to Google GenAI Files API:", error);
847
+ throw error;
848
+ }
849
+ }
850
+
851
+ /**
852
+ * Download a file from the Google GenAI Files API.
853
+ *
854
+ * The SDK's `files.download()` writes to disk, so we use a temp file and
855
+ * read it back as a Buffer. Pass either:
856
+ * - `fileId`: the files/* resource name (e.g. "files/abc-123") or a Video uri
857
+ * - `uri`: the full Video.uri returned in GeneratedVideo (also accepted as fileId)
858
+ *
859
+ * For generated videos the `file` param accepts the Video object directly
860
+ * (uri + optional mimeType), which the SDK resolves to a download URL.
861
+ */
862
+ async downloadFile(options: FileDownloadOptions): Promise<FileDownloadResponse> {
863
+ const mimeMap: Record<string, string> = {
864
+ ".mp4": "video/mp4",
865
+ ".webm": "video/webm",
866
+ ".mov": "video/quicktime",
867
+ ".png": "image/png",
868
+ ".jpg": "image/jpeg",
869
+ ".jpeg": "image/jpeg",
870
+ ".gif": "image/gif",
871
+ ".wav": "audio/wav",
872
+ ".mp3": "audio/mpeg",
873
+ };
874
+
875
+ try {
876
+ // The Google GenAI SDK's files.download() uses an async pipe that is NOT
877
+ // properly awaited, so we fetch the file directly via HTTP instead.
878
+ // Build the download URL from the uri/fileId.
879
+ const rawUri = options.uri || options.fileId || "";
880
+
881
+ // If it's already a full https URL, use it directly (append API key).
882
+ // Otherwise construct the Files API download URL from the resource name.
883
+ let downloadUrl: string;
884
+ if (rawUri.startsWith("https://")) {
885
+ // Append API key if not already present
886
+ const sep = rawUri.includes("?") ? "&" : "?";
887
+ downloadUrl = `${rawUri}${sep}key=${this.apiKey}`;
888
+ } else {
889
+ // Strip leading "files/" if present to get just the file ID
890
+ const fileId = rawUri.replace(/^files\//, "");
891
+ downloadUrl = `https://generativelanguage.googleapis.com/v1beta/files/${fileId}:download?alt=media&key=${this.apiKey}`;
892
+ }
893
+
894
+ const response = await fetch(downloadUrl);
895
+ if (!response.ok) {
896
+ throw new Error(`HTTP ${response.status} ${response.statusText} downloading ${downloadUrl}`);
897
+ }
898
+
899
+ const arrayBuffer = await response.arrayBuffer();
900
+ const data = Buffer.from(arrayBuffer);
901
+
902
+ // If caller supplied a filePath, write to it (creating dirs as needed)
903
+ if (options.filePath) {
904
+ fsSync.mkdirSync(pathSync.dirname(options.filePath), { recursive: true });
905
+ fsSync.writeFileSync(options.filePath, data);
906
+ }
907
+
908
+ // Infer mime type from the URI/fileId first (more reliable), then from the path
909
+ const sourceForExt = options.uri || options.fileId || options.filePath || "";
910
+ const ext = pathSync.extname(sourceForExt.split("?")[0]).toLowerCase();
911
+ const mimeType = mimeMap[ext] || "video/mp4";
912
+
913
+ return { data, mimeType };
914
+ } catch (error) {
915
+ console.error("Error downloading file from Google GenAI Files API:", error);
916
+ throw error;
917
+ }
918
+ }
561
919
  }