@aigne/gemini 0.13.4 → 0.14.0-beta

package/CHANGELOG.md CHANGED
@@ -1,5 +1,30 @@
  # Changelog
 
+ ## [0.14.0-beta](https://github.com/AIGNE-io/aigne-framework/compare/gemini-v0.13.5...gemini-v0.14.0-beta) (2025-09-22)
+
+
+ ### Features
+
+ * improve image model architecture and file handling ([#527](https://github.com/AIGNE-io/aigne-framework/issues/527)) ([4db50aa](https://github.com/AIGNE-io/aigne-framework/commit/4db50aa0387a1a0f045ca11aaa61613e36ca7597))
+
+
+ ### Dependencies
+
+ * The following workspace dependencies were updated
+   * dependencies
+     * @aigne/openai bumped to 0.16.0-beta
+     * @aigne/platform-helpers bumped to 0.6.3-beta
+   * devDependencies
+     * @aigne/core bumped to 1.61.0-beta
+     * @aigne/test-utils bumped to 0.5.53-beta
+
+ ## [0.13.5](https://github.com/AIGNE-io/aigne-framework/compare/gemini-v0.13.4...gemini-v0.13.5) (2025-09-18)
+
+
+ ### Bug Fixes
+
+ * **gemini:** should include at least one user message ([#521](https://github.com/AIGNE-io/aigne-framework/issues/521)) ([eb2752e](https://github.com/AIGNE-io/aigne-framework/commit/eb2752ed7d78f59c435ecc3ccb7227e804e3781e))
+
  ## [0.13.4](https://github.com/AIGNE-io/aigne-framework/compare/gemini-v0.13.3...gemini-v0.13.4) (2025-09-18)
 
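The 0.14.0-beta feature above ("improve image model architecture and file handling") changes what image results look like to consumers, as the compiled hunks further down show: entries move from bare base64 payloads to file-typed objects, and the Gemini path starts reporting real token usage. A minimal before/after sketch of the entry shape, inferred from this diff; the interface names here are illustrative, not exports of @aigne/core.

```ts
// Shape of entries in ImageModelOutput.images, inferred from the hunks below.
// "OldImageEntry" / "NewImageEntry" are illustrative names, not actual exports.

// 0.13.x: each generated image was returned as a bare base64 payload.
interface OldImageEntry {
  base64: string;
}

// 0.14.0-beta: images are emitted as file objects with a MIME type and,
// on the Gemini (generateContent) path, an optional filename.
interface NewImageEntry {
  type: "file";
  data: string;      // image bytes as returned by the Google GenAI SDK (base64 string)
  mimeType?: string; // e.g. "image/png"
  filename?: string; // from inlineData.displayName, Gemini path only
}
```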
 
@@ -232,9 +232,14 @@ class GeminiChatModel extends openai_1.OpenAIChatModel {
  }
  async getRunMessages(input) {
  const messages = await super.getRunMessages(input);
- const lastMessage = messages.at(-1);
- if (lastMessage?.role === "system") {
- lastMessage.role = "user"; // Ensure the last message is from the user
+ if (!messages.some((i) => i.role === "user")) {
+ for (const msg of messages) {
+ if (msg.role === "system") {
+ // Ensure the last message is from the user
+ msg.role = "user";
+ break;
+ }
+ }
  }
  return messages;
  }
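The hunk above carries the 0.13.5 fix "should include at least one user message": instead of always rewriting the trailing system message, the model now promotes a system message to the user role only when the conversation contains no user message at all (the retained comment still describes the old behavior). A standalone sketch of the equivalent logic, using a stand-in message type rather than the package's internal one:

```ts
// Stand-in type for illustration; not the package's actual export.
type ChatMessage = { role: "system" | "user" | "assistant"; content: string };

function ensureAtLeastOneUserMessage(messages: ChatMessage[]): ChatMessage[] {
  // Only rewrite roles when the conversation has no user message at all.
  if (!messages.some((m) => m.role === "user")) {
    for (const msg of messages) {
      if (msg.role === "system") {
        // Promote the first system message so Gemini accepts the request.
        msg.role = "user";
        break;
      }
    }
  }
  return messages;
}

// Example: a system-only prompt becomes a single user message.
ensureAtLeastOneUserMessage([{ role: "system", content: "You are a helpful assistant." }]);
```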
@@ -1,4 +1,4 @@
- import { ImageModel, type ImageModelInput, type ImageModelOptions, type ImageModelOutput } from "@aigne/core";
+ import { type AgentInvokeOptions, ImageModel, type ImageModelInput, type ImageModelOptions, type ImageModelOutput } from "@aigne/core";
  import { type GenerateContentConfig, type GenerateImagesConfig, GoogleGenAI } from "@google/genai";
  export interface GeminiImageModelInput extends ImageModelInput, GenerateImagesConfig, GenerateContentConfig {
  }
@@ -28,7 +28,7 @@ export declare class GeminiImageModel extends ImageModel<GeminiImageModelInput,
  * @param input The input to process
  * @returns The generated response
  */
- process(input: GeminiImageModelInput): Promise<ImageModelOutput>;
+ process(input: GeminiImageModelInput, options: AgentInvokeOptions): Promise<ImageModelOutput>;
  private generateImageByImagenModel;
  private generateImageByGeminiModel;
  }
@@ -52,7 +52,7 @@ class GeminiImageModel extends core_1.ImageModel {
  * @param input The input to process
  * @returns The generated response
  */
- async process(input) {
+ async process(input, options) {
  const model = input.model || this.credential.model;
  const responseFormat = input.responseFormat || "base64";
  if (responseFormat === "url") {
@@ -61,7 +61,7 @@ class GeminiImageModel extends core_1.ImageModel {
  if (model.includes("imagen")) {
  return this.generateImageByImagenModel(input);
  }
- return this.generateImageByGeminiModel(input);
+ return this.generateImageByGeminiModel(input, options);
  }
  async generateImageByImagenModel(input) {
  const model = input.model || this.credential.model;
@@ -89,7 +89,9 @@ class GeminiImageModel extends core_1.ImageModel {
  });
  return {
  images: response.generatedImages
- ?.map(({ image }) => (image?.imageBytes ? { base64: image.imageBytes } : undefined))
+ ?.map(({ image }) => image?.imageBytes
+ ? { type: "file", data: image.imageBytes, mimeType: image.mimeType }
+ : undefined)
  .filter(type_utils_js_1.isNonNullable) || [],
  usage: {
  inputTokens: 0,
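On the Imagen path, each entry produced above now keeps the SDK's MIME type next to the image bytes. A hedged consumer-side sketch of persisting one such entry, assuming `data` is a base64 string (as the Google GenAI SDK returns) and deriving the file extension naively from `mimeType`:

```ts
import { writeFileSync } from "node:fs";

// Illustrative helper, not part of @aigne/gemini: write one file-typed image
// entry to disk, choosing the extension from its MIME type.
function saveImage(entry: { data: string; mimeType?: string }, basename = "image"): string {
  const ext = entry.mimeType?.split("/")[1] ?? "png";
  const path = `${basename}.${ext}`;
  writeFileSync(path, Buffer.from(entry.data, "base64"));
  return path;
}
```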
@@ -98,7 +100,7 @@ class GeminiImageModel extends core_1.ImageModel {
  model,
  };
  }
- async generateImageByGeminiModel(input) {
+ async generateImageByGeminiModel(input, options) {
  const model = input.model || this.credential.model;
  const mergedInput = { ...this.modelOptions, ...input };
  const inputKeys = [
@@ -131,24 +133,35 @@ class GeminiImageModel extends core_1.ImageModel {
  "topK",
  "topP",
  ];
+ const images = await Promise.all((0, type_utils_js_1.flat)(input.image).map(async (image) => {
+ const { data, mimeType } = await this.transformFileOutput(core_1.FileOutputType.file, image, options);
+ return { inlineData: { data, mimeType } };
+ }));
  const response = await this.client.models.generateContent({
  model: model,
- contents: input.prompt,
+ contents: [{ text: input.prompt }, ...images],
  config: {
- responseModalities: [genai_1.Modality.TEXT, genai_1.Modality.IMAGE],
+ responseModalities: [genai_1.Modality.IMAGE],
  candidateCount: input.n || 1,
  ...(0, type_utils_js_1.pick)(mergedInput, inputKeys),
  },
  });
  const allImages = (response.candidates ?? [])
  .flatMap((candidate) => candidate.content?.parts ?? [])
- .map((part) => (part.inlineData?.data ? { base64: part.inlineData?.data } : null))
+ .map((part) => part.inlineData?.data
+ ? {
+ type: "file",
+ data: part.inlineData.data,
+ filename: part.inlineData.displayName,
+ mimeType: part.inlineData.mimeType,
+ }
+ : null)
  .filter(type_utils_js_1.isNonNullable);
  return {
  images: allImages,
  usage: {
- inputTokens: 0,
- outputTokens: 0,
+ inputTokens: response.usageMetadata?.promptTokenCount || 0,
+ outputTokens: response.usageMetadata?.candidatesTokenCount || 0,
  },
  model,
  };
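Taken together, the hunk above reworks the Gemini (generateContent) path: input images are attached as inlineData parts alongside the prompt text, only the IMAGE modality is requested, inline parts are mapped to file-typed outputs, and usage now comes from usageMetadata. A self-contained sketch of that request/response mapping against @google/genai, using only calls that appear in this diff; the function name, parameters, and the FileImage shape are illustrative, not package exports.

```ts
import { GoogleGenAI, Modality } from "@google/genai";

// Illustrative output shape; mirrors the file-typed entries built in the diff above.
interface FileImage {
  type: "file";
  data: string;
  mimeType?: string;
  filename?: string;
}

async function generateImages(
  apiKey: string,
  model: string,
  prompt: string,
  inputImages: { data: string; mimeType: string }[] = [],
): Promise<{ images: FileImage[]; inputTokens: number; outputTokens: number }> {
  const client = new GoogleGenAI({ apiKey });

  const response = await client.models.generateContent({
    model,
    // Prompt text first, then any reference images as inlineData parts.
    contents: [{ text: prompt }, ...inputImages.map((i) => ({ inlineData: i }))],
    config: { responseModalities: [Modality.IMAGE], candidateCount: 1 },
  });

  // Collect every inlineData part across candidates as a file-typed entry.
  const images = (response.candidates ?? [])
    .flatMap((candidate) => candidate.content?.parts ?? [])
    .flatMap((part) =>
      part.inlineData?.data
        ? [{
            type: "file" as const,
            data: part.inlineData.data,
            mimeType: part.inlineData.mimeType,
            filename: part.inlineData.displayName,
          }]
        : [],
    );

  return {
    images,
    inputTokens: response.usageMetadata?.promptTokenCount ?? 0,
    outputTokens: response.usageMetadata?.candidatesTokenCount ?? 0,
  };
}
```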
@@ -1,4 +1,4 @@
- import { ImageModel, type ImageModelInput, type ImageModelOptions, type ImageModelOutput } from "@aigne/core";
+ import { type AgentInvokeOptions, ImageModel, type ImageModelInput, type ImageModelOptions, type ImageModelOutput } from "@aigne/core";
  import { type GenerateContentConfig, type GenerateImagesConfig, GoogleGenAI } from "@google/genai";
  export interface GeminiImageModelInput extends ImageModelInput, GenerateImagesConfig, GenerateContentConfig {
  }
@@ -28,7 +28,7 @@ export declare class GeminiImageModel extends ImageModel<GeminiImageModelInput,
  * @param input The input to process
  * @returns The generated response
  */
- process(input: GeminiImageModelInput): Promise<ImageModelOutput>;
+ process(input: GeminiImageModelInput, options: AgentInvokeOptions): Promise<ImageModelOutput>;
  private generateImageByImagenModel;
  private generateImageByGeminiModel;
  }
@@ -229,9 +229,14 @@ export class GeminiChatModel extends OpenAIChatModel {
  }
  async getRunMessages(input) {
  const messages = await super.getRunMessages(input);
- const lastMessage = messages.at(-1);
- if (lastMessage?.role === "system") {
- lastMessage.role = "user"; // Ensure the last message is from the user
+ if (!messages.some((i) => i.role === "user")) {
+ for (const msg of messages) {
+ if (msg.role === "system") {
+ // Ensure the last message is from the user
+ msg.role = "user";
+ break;
+ }
+ }
  }
  return messages;
  }
@@ -1,4 +1,4 @@
- import { ImageModel, type ImageModelInput, type ImageModelOptions, type ImageModelOutput } from "@aigne/core";
+ import { type AgentInvokeOptions, ImageModel, type ImageModelInput, type ImageModelOptions, type ImageModelOutput } from "@aigne/core";
  import { type GenerateContentConfig, type GenerateImagesConfig, GoogleGenAI } from "@google/genai";
  export interface GeminiImageModelInput extends ImageModelInput, GenerateImagesConfig, GenerateContentConfig {
  }
@@ -28,7 +28,7 @@ export declare class GeminiImageModel extends ImageModel<GeminiImageModelInput,
  * @param input The input to process
  * @returns The generated response
  */
- process(input: GeminiImageModelInput): Promise<ImageModelOutput>;
+ process(input: GeminiImageModelInput, options: AgentInvokeOptions): Promise<ImageModelOutput>;
  private generateImageByImagenModel;
  private generateImageByGeminiModel;
  }
@@ -1,5 +1,5 @@
- import { ImageModel, imageModelInputSchema, } from "@aigne/core";
- import { checkArguments, isNonNullable, pick } from "@aigne/core/utils/type-utils.js";
+ import { FileOutputType, ImageModel, imageModelInputSchema, } from "@aigne/core";
+ import { checkArguments, flat, isNonNullable, pick } from "@aigne/core/utils/type-utils.js";
  import { GoogleGenAI, Modality, } from "@google/genai";
  import { z } from "zod";
  const DEFAULT_MODEL = "imagen-4.0-generate-001";
@@ -49,7 +49,7 @@ export class GeminiImageModel extends ImageModel {
  * @param input The input to process
  * @returns The generated response
  */
- async process(input) {
+ async process(input, options) {
  const model = input.model || this.credential.model;
  const responseFormat = input.responseFormat || "base64";
  if (responseFormat === "url") {
@@ -58,7 +58,7 @@ export class GeminiImageModel extends ImageModel {
  if (model.includes("imagen")) {
  return this.generateImageByImagenModel(input);
  }
- return this.generateImageByGeminiModel(input);
+ return this.generateImageByGeminiModel(input, options);
  }
  async generateImageByImagenModel(input) {
  const model = input.model || this.credential.model;
@@ -86,7 +86,9 @@ export class GeminiImageModel extends ImageModel {
  });
  return {
  images: response.generatedImages
- ?.map(({ image }) => (image?.imageBytes ? { base64: image.imageBytes } : undefined))
+ ?.map(({ image }) => image?.imageBytes
+ ? { type: "file", data: image.imageBytes, mimeType: image.mimeType }
+ : undefined)
  .filter(isNonNullable) || [],
  usage: {
  inputTokens: 0,
@@ -95,7 +97,7 @@ export class GeminiImageModel extends ImageModel {
  model,
  };
  }
- async generateImageByGeminiModel(input) {
+ async generateImageByGeminiModel(input, options) {
  const model = input.model || this.credential.model;
  const mergedInput = { ...this.modelOptions, ...input };
  const inputKeys = [
@@ -128,24 +130,35 @@ export class GeminiImageModel extends ImageModel {
  "topK",
  "topP",
  ];
+ const images = await Promise.all(flat(input.image).map(async (image) => {
+ const { data, mimeType } = await this.transformFileOutput(FileOutputType.file, image, options);
+ return { inlineData: { data, mimeType } };
+ }));
  const response = await this.client.models.generateContent({
  model: model,
- contents: input.prompt,
+ contents: [{ text: input.prompt }, ...images],
  config: {
- responseModalities: [Modality.TEXT, Modality.IMAGE],
+ responseModalities: [Modality.IMAGE],
  candidateCount: input.n || 1,
  ...pick(mergedInput, inputKeys),
  },
  });
  const allImages = (response.candidates ?? [])
  .flatMap((candidate) => candidate.content?.parts ?? [])
- .map((part) => (part.inlineData?.data ? { base64: part.inlineData?.data } : null))
+ .map((part) => part.inlineData?.data
+ ? {
+ type: "file",
+ data: part.inlineData.data,
+ filename: part.inlineData.displayName,
+ mimeType: part.inlineData.mimeType,
+ }
+ : null)
  .filter(isNonNullable);
  return {
  images: allImages,
  usage: {
- inputTokens: 0,
- outputTokens: 0,
+ inputTokens: response.usageMetadata?.promptTokenCount || 0,
+ outputTokens: response.usageMetadata?.candidatesTokenCount || 0,
  },
  model,
  };
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "@aigne/gemini",
- "version": "0.13.4",
+ "version": "0.14.0-beta",
  "description": "AIGNE Gemini SDK for integrating with Google's Gemini AI models",
  "publishConfig": {
  "access": "public"
@@ -38,8 +38,8 @@
  "@google/genai": "^1.20.0",
  "uuid": "^13.0.0",
  "zod": "^3.25.67",
- "@aigne/openai": "^0.15.4",
- "@aigne/platform-helpers": "^0.6.2"
+ "@aigne/platform-helpers": "^0.6.3-beta",
+ "@aigne/openai": "^0.16.0-beta"
  },
  "devDependencies": {
  "@types/bun": "^1.2.22",
@@ -47,8 +47,8 @@
  "npm-run-all": "^4.1.5",
  "rimraf": "^6.0.1",
  "typescript": "^5.9.2",
- "@aigne/core": "^1.60.3",
- "@aigne/test-utils": "^0.5.52"
+ "@aigne/test-utils": "^0.5.53-beta",
+ "@aigne/core": "^1.61.0-beta"
  },
  "scripts": {
  "lint": "tsc --noEmit",