ruby-gemini-api 1.1.0 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/README.md +177 -0
- data/lib/gemini/client.rb +40 -0
- data/lib/gemini/response.rb +99 -2
- data/lib/gemini/tokens.rb +77 -0
- data/lib/gemini/tts.rb +83 -0
- data/lib/gemini/version.rb +1 -1
- data/lib/gemini.rb +2 -0
- metadata +3 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: cd1be2a2b81543d21686e9d4ade4de2e6aa42ea8b26abc4bb0929e9aa77fada0
|
|
4
|
+
data.tar.gz: 9b96121c0a8a68220e4e368057424211e29e0652f28f1812b56049842d31c5c9
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: be505ec75d011d31cd3c741924b2e845024d4bccdc36b3fbdfa63e0468771a45f7dd4abbc31ec6ade82ff4e992b46bd9e3aa65a06bd874a73a799f1cdb246bed
|
|
7
|
+
data.tar.gz: edcc02ff33b17e56a004836b2c55c7a133d883bc38c9f67893f3d3a59afb239bf7329454ef8278cf618e2d393867e445dc88bf539350999258c856f02c775b69
|
data/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,21 @@
|
|
|
1
1
|
## [Unreleased]
|
|
2
2
|
|
|
3
|
+
## [1.2.0] - 2026-05-14
|
|
4
|
+
|
|
5
|
+
### Added
|
|
6
|
+
- TTS (speech generation) API support
|
|
7
|
+
- `client.tts.generate(text, voice:)` and `client.generate_speech(text, voice:)` shortcut
|
|
8
|
+
- Single-speaker mode via `voice:` and multi-speaker mode via `multi_speaker: [{ speaker:, voice: }, ...]`
|
|
9
|
+
- 30 prebuilt voices exposed as `Gemini::TTS::VOICES`
|
|
10
|
+
- Default model `gemini-2.5-flash-preview-tts` (override via `model:`)
|
|
11
|
+
- `Response` helpers: `#audio_data`, `#audio_mime_type`, `#audio_response?`, `#save_audio(path)` which auto-wraps L16 PCM in a RIFF/WAVE header
|
|
12
|
+
- Demos: `tts_demo.rb` / `tts_demo_ja.rb`
|
|
13
|
+
- `countTokens` API support
|
|
14
|
+
- `client.tokens.count(input, ...)` and `client.count_tokens(input, ...)` shortcut
|
|
15
|
+
- Accepts String / Array / Hash inputs, full `contents:` array, plus optional `system_instruction:`, `tools:`, `generation_config:`, `cached_content:` (auto-wraps payload in `generateContentRequest` when extra fields are present)
|
|
16
|
+
- `Response` helpers: `#count_tokens`, `#prompt_tokens_details`, `#cached_content_token_count`, `#count_tokens_response?`
|
|
17
|
+
- Demos: `count_tokens_demo.rb` / `count_tokens_demo_ja.rb`
|
|
18
|
+
|
|
3
19
|
## [1.1.0] - 2026-04-29
|
|
4
20
|
|
|
5
21
|
### Added
|
data/README.md
CHANGED
|
@@ -31,6 +31,8 @@ This project is inspired by and pays homage to [ruby-openai](https://github.com/
|
|
|
31
31
|
- Document processing (PDFs and other formats)
|
|
32
32
|
- Context caching for efficient processing
|
|
33
33
|
- Text embeddings (single and batch) with task type, title, and output dimensionality control
|
|
34
|
+
- Token counting (`countTokens`) for prompts, chat history, and full requests with system instruction / tools / cached content
|
|
35
|
+
- Speech generation (TTS) with 30 prebuilt voices, single-speaker and multi-speaker modes, and one-line WAV file output
|
|
34
36
|
- Live API: real-time bidirectional conversations with text/audio/video and function calling (sync and async)
|
|
35
37
|
|
|
36
38
|
### Function Calling
|
|
@@ -1263,6 +1265,181 @@ response.embedding_response? # true if the payload contains embedding data
|
|
|
1263
1265
|
|
|
1264
1266
|
A complete example is available in `demo/embeddings_demo.rb`.
|
|
1265
1267
|
|
|
1268
|
+
### Token Counting
|
|
1269
|
+
|
|
1270
|
+
Estimate how many tokens an input would consume before sending it to a generation endpoint. Useful for cost/quota planning and for staying within a model's context window.
|
|
1271
|
+
|
|
1272
|
+
#### Basic Usage
|
|
1273
|
+
|
|
1274
|
+
```ruby
|
|
1275
|
+
require 'gemini'
|
|
1276
|
+
|
|
1277
|
+
client = Gemini::Client.new(ENV['GEMINI_API_KEY'])
|
|
1278
|
+
|
|
1279
|
+
response = client.count_tokens("The quick brown fox jumps over the lazy dog.")
|
|
1280
|
+
|
|
1281
|
+
puts response.count_tokens # => 9 (totalTokens)
|
|
1282
|
+
puts response.prompt_tokens_details # => [{"modality"=>"TEXT", "tokenCount"=>9}]
|
|
1283
|
+
```
|
|
1284
|
+
|
|
1285
|
+
By default the request goes to `gemini-2.5-flash`. Override it with `model:`:
|
|
1286
|
+
|
|
1287
|
+
```ruby
|
|
1288
|
+
client.count_tokens("Hello", model: "gemini-2.5-pro")
|
|
1289
|
+
```
|
|
1290
|
+
|
|
1291
|
+
#### Multi-turn Chat History
|
|
1292
|
+
|
|
1293
|
+
Pass a fully formed `contents:` array (the same shape used by `generateContent`) to count tokens for an entire conversation:
|
|
1294
|
+
|
|
1295
|
+
```ruby
|
|
1296
|
+
response = client.count_tokens(
|
|
1297
|
+
contents: [
|
|
1298
|
+
{ role: "user", parts: [{ text: "Hi, my name is Bob." }] },
|
|
1299
|
+
{ role: "model", parts: [{ text: "Hi Bob!" }] },
|
|
1300
|
+
{ role: "user", parts: [{ text: "What's the weather like today?" }] }
|
|
1301
|
+
]
|
|
1302
|
+
)
|
|
1303
|
+
```
|
|
1304
|
+
|
|
1305
|
+
#### With System Instruction, Tools, or Cached Content
|
|
1306
|
+
|
|
1307
|
+
When you include `system_instruction:`, `tools:`, `generation_config:`, or `cached_content:`, the request is automatically wrapped as a `generateContentRequest` so the count reflects the full payload:
|
|
1308
|
+
|
|
1309
|
+
```ruby
|
|
1310
|
+
response = client.count_tokens(
|
|
1311
|
+
"What is the weather in Tokyo?",
|
|
1312
|
+
system_instruction: "You are a concise weather assistant.",
|
|
1313
|
+
tools: [
|
|
1314
|
+
{
|
|
1315
|
+
function_declarations: [
|
|
1316
|
+
{
|
|
1317
|
+
name: "get_weather",
|
|
1318
|
+
description: "Get the current weather for a city.",
|
|
1319
|
+
parameters: {
|
|
1320
|
+
type: "object",
|
|
1321
|
+
properties: { city: { type: "string" } },
|
|
1322
|
+
required: ["city"]
|
|
1323
|
+
}
|
|
1324
|
+
}
|
|
1325
|
+
]
|
|
1326
|
+
}
|
|
1327
|
+
]
|
|
1328
|
+
)
|
|
1329
|
+
|
|
1330
|
+
puts response.count_tokens
|
|
1331
|
+
```
|
|
1332
|
+
|
|
1333
|
+
#### Direct Access via `tokens`
|
|
1334
|
+
|
|
1335
|
+
```ruby
|
|
1336
|
+
client.tokens.count("Hello", model: "gemini-2.5-flash")
|
|
1337
|
+
```
|
|
1338
|
+
|
|
1339
|
+
#### Response Helpers
|
|
1340
|
+
|
|
1341
|
+
```ruby
|
|
1342
|
+
response.count_tokens # totalTokens from the API (Integer)
|
|
1343
|
+
response.prompt_tokens_details # per-modality breakdown (Array<Hash>)
|
|
1344
|
+
response.cached_content_token_count # tokens reused from cachedContent (Integer)
|
|
1345
|
+
response.count_tokens_response? # true if the payload is a countTokens response
|
|
1346
|
+
```
|
|
1347
|
+
|
|
1348
|
+
A complete example is available in `demo/count_tokens_demo.rb`.
|
|
1349
|
+
|
|
1350
|
+
### Speech Generation (TTS)
|
|
1351
|
+
|
|
1352
|
+
Generate spoken audio from text using Gemini's TTS preview models. The API returns 24 kHz, 16-bit, mono PCM (L16) audio; `Response#save_audio` wraps it in a RIFF/WAVE header so the result is directly playable.
|
|
1353
|
+
|
|
1354
|
+
#### Single-Speaker
|
|
1355
|
+
|
|
1356
|
+
```ruby
|
|
1357
|
+
require 'gemini'
|
|
1358
|
+
|
|
1359
|
+
client = Gemini::Client.new(ENV['GEMINI_API_KEY'])
|
|
1360
|
+
|
|
1361
|
+
response = client.generate_speech(
|
|
1362
|
+
"Say cheerfully: Have a wonderful day!",
|
|
1363
|
+
voice: "Kore"
|
|
1364
|
+
)
|
|
1365
|
+
|
|
1366
|
+
if response.success?
|
|
1367
|
+
response.save_audio("hello.wav")
|
|
1368
|
+
puts response.audio_mime_type # => "audio/L16;codec=pcm;rate=24000"
|
|
1369
|
+
end
|
|
1370
|
+
```
|
|
1371
|
+
|
|
1372
|
+
Phrase the prompt as an instruction to read text aloud (`Say ...:` / `Read the following:`); a bare phrase like `"Hello"` is treated as a chat message and rejected with a 400 error.
|
|
1373
|
+
|
|
1374
|
+
#### Multi-Speaker
|
|
1375
|
+
|
|
1376
|
+
Provide a `multi_speaker:` array to assign different voices to named speakers (up to 2 speakers in the current preview models). Reference the speakers by the same names in your prompt.
|
|
1377
|
+
|
|
1378
|
+
```ruby
|
|
1379
|
+
script = <<~SCRIPT
|
|
1380
|
+
TTS the following conversation between Joe and Jane:
|
|
1381
|
+
Joe: How's it going today, Jane?
|
|
1382
|
+
Jane: Not too bad, how about you?
|
|
1383
|
+
SCRIPT
|
|
1384
|
+
|
|
1385
|
+
response = client.generate_speech(
|
|
1386
|
+
script,
|
|
1387
|
+
multi_speaker: [
|
|
1388
|
+
{ speaker: "Joe", voice: "Kore" },
|
|
1389
|
+
{ speaker: "Jane", voice: "Puck" }
|
|
1390
|
+
]
|
|
1391
|
+
)
|
|
1392
|
+
|
|
1393
|
+
response.save_audio("dialogue.wav")
|
|
1394
|
+
```
|
|
1395
|
+
|
|
1396
|
+
#### Style Control
|
|
1397
|
+
|
|
1398
|
+
You can steer tone, pace, and emotion in two ways.
|
|
1399
|
+
|
|
1400
|
+
**1. Natural-language instruction** — describe the delivery as part of the prompt.
|
|
1401
|
+
|
|
1402
|
+
```ruby
|
|
1403
|
+
client.generate_speech(
|
|
1404
|
+
"Read this in a soft whisper: I have a secret... and you must never tell anyone.",
|
|
1405
|
+
voice: "Zephyr"
|
|
1406
|
+
)
|
|
1407
|
+
```
|
|
1408
|
+
|
|
1409
|
+
**2. Inline bracket tag** — put a directive like `[whispers]`, `[excited]`, `[laughs]`, `[sighs]`, `[shouting]`, etc. at the start of the text to apply that style to what follows.
|
|
1410
|
+
|
|
1411
|
+
```ruby
|
|
1412
|
+
client.generate_speech(
|
|
1413
|
+
"[whispers] I have a secret... and you must never tell anyone.",
|
|
1414
|
+
voice: "Zephyr"
|
|
1415
|
+
)
|
|
1416
|
+
```
|
|
1417
|
+
|
|
1418
|
+
Stick to **one style per call**: switching style mid-prompt (e.g. `[whispers] ... [excited] ...`) tends to leave the second segment in the first style or drop it entirely. If you need multiple styles, call `generate_speech` once per sentence and concatenate the audio yourself.
|
|
1419
|
+
|
|
1420
|
+
#### Models and Voices
|
|
1421
|
+
|
|
1422
|
+
- Default model: `gemini-2.5-flash-preview-tts` (override via `model:`)
|
|
1423
|
+
- Other models: `gemini-2.5-pro-preview-tts`, `gemini-3.1-flash-tts-preview`
|
|
1424
|
+
- 30 prebuilt voices are listed in `Gemini::TTS::VOICES` (Zephyr, Puck, Charon, Kore, Fenrir, Leda, Orus, Aoede, …). Unknown names raise `ArgumentError` at build time.
|
|
1425
|
+
|
|
1426
|
+
#### Direct Access via `tts`
|
|
1427
|
+
|
|
1428
|
+
```ruby
|
|
1429
|
+
client.tts.generate("Say hello.", voice: "Kore")
|
|
1430
|
+
```
|
|
1431
|
+
|
|
1432
|
+
#### Response Helpers
|
|
1433
|
+
|
|
1434
|
+
```ruby
|
|
1435
|
+
response.audio_data # Base64-encoded PCM payload
|
|
1436
|
+
response.audio_mime_type # e.g. "audio/L16;codec=pcm;rate=24000"
|
|
1437
|
+
response.audio_response? # true if the payload contains audio inlineData
|
|
1438
|
+
response.save_audio(path) # writes a playable .wav file and returns the path
|
|
1439
|
+
```
|
|
1440
|
+
|
|
1441
|
+
A complete example is available in `demo/tts_demo.rb`.
|
|
1442
|
+
|
|
1266
1443
|
### Structured Output with JSON Schema
|
|
1267
1444
|
|
|
1268
1445
|
You can request responses in structured JSON format by specifying a JSON schema:
|
data/lib/gemini/client.rb
CHANGED
|
@@ -80,6 +80,46 @@ module Gemini
|
|
|
80
80
|
@embeddings_api ||= Gemini::Embeddings.new(client: self)
|
|
81
81
|
end
|
|
82
82
|
|
|
83
|
+
# Token counting APIアクセサ
|
|
84
|
+
def tokens
|
|
85
|
+
@tokens ||= Gemini::Tokens.new(client: self)
|
|
86
|
+
end
|
|
87
|
+
|
|
88
|
+
# TTS (speech generation) APIアクセサ
|
|
89
|
+
def tts
|
|
90
|
+
@tts ||= Gemini::TTS.new(client: self)
|
|
91
|
+
end
|
|
92
|
+
|
|
93
|
+
# Convenience wrapper for TTS speech generation.
|
|
94
|
+
def generate_speech(text, voice: nil, multi_speaker: nil, model: Gemini::TTS::DEFAULT_MODEL,
|
|
95
|
+
speech_config: nil, **parameters)
|
|
96
|
+
tts.generate(
|
|
97
|
+
text,
|
|
98
|
+
voice: voice,
|
|
99
|
+
multi_speaker: multi_speaker,
|
|
100
|
+
model: model,
|
|
101
|
+
speech_config: speech_config,
|
|
102
|
+
**parameters
|
|
103
|
+
)
|
|
104
|
+
end
|
|
105
|
+
|
|
106
|
+
# Convenience wrapper for countTokens.
|
|
107
|
+
# input can be a String, Array of parts/strings, Hash, or omitted when contents: is given.
|
|
108
|
+
def count_tokens(input = nil, model: Gemini::Tokens::DEFAULT_MODEL, contents: nil,
|
|
109
|
+
system_instruction: nil, tools: nil, generation_config: nil,
|
|
110
|
+
cached_content: nil, **parameters)
|
|
111
|
+
tokens.count(
|
|
112
|
+
input,
|
|
113
|
+
model: model,
|
|
114
|
+
contents: contents,
|
|
115
|
+
system_instruction: system_instruction,
|
|
116
|
+
tools: tools,
|
|
117
|
+
generation_config: generation_config,
|
|
118
|
+
cached_content: cached_content,
|
|
119
|
+
**parameters
|
|
120
|
+
)
|
|
121
|
+
end
|
|
122
|
+
|
|
83
123
|
def reset_headers
|
|
84
124
|
@extra_headers = {}
|
|
85
125
|
end
|
data/lib/gemini/response.rb
CHANGED
|
@@ -41,9 +41,83 @@ module Gemini
|
|
|
41
41
|
# Get image parts (if any)
|
|
42
42
|
def image_parts
|
|
43
43
|
return [] unless valid?
|
|
44
|
-
|
|
44
|
+
|
|
45
45
|
parts.select { |part| part.key?("inline_data") && part["inline_data"]["mime_type"].start_with?("image/") }
|
|
46
46
|
end
|
|
47
|
+
|
|
48
|
+
# Get the first audio inlineData part (TTS responses use camelCase "inlineData")
|
|
49
|
+
def audio_part
|
|
50
|
+
return nil unless valid?
|
|
51
|
+
|
|
52
|
+
parts.find do |part|
|
|
53
|
+
data_key = part["inlineData"] || part["inline_data"]
|
|
54
|
+
next false unless data_key
|
|
55
|
+
mt = data_key["mimeType"] || data_key["mime_type"]
|
|
56
|
+
mt.is_a?(String) && mt.start_with?("audio/")
|
|
57
|
+
end
|
|
58
|
+
end
|
|
59
|
+
|
|
60
|
+
# Base64-encoded audio data from a TTS response
|
|
61
|
+
def audio_data
|
|
62
|
+
part = audio_part
|
|
63
|
+
return nil unless part
|
|
64
|
+
data_key = part["inlineData"] || part["inline_data"]
|
|
65
|
+
data_key["data"]
|
|
66
|
+
end
|
|
67
|
+
|
|
68
|
+
# MIME type of the audio payload (e.g. "audio/L16;codec=pcm;rate=24000")
|
|
69
|
+
def audio_mime_type
|
|
70
|
+
part = audio_part
|
|
71
|
+
return nil unless part
|
|
72
|
+
data_key = part["inlineData"] || part["inline_data"]
|
|
73
|
+
data_key["mimeType"] || data_key["mime_type"]
|
|
74
|
+
end
|
|
75
|
+
|
|
76
|
+
# True if the response contains audio inlineData
|
|
77
|
+
def audio_response?
|
|
78
|
+
!audio_part.nil?
|
|
79
|
+
end
|
|
80
|
+
|
|
81
|
+
# Save audio to a file. PCM (L16) payloads are wrapped in a WAV header so
|
|
82
|
+
# the result is directly playable; other audio MIME types are written as-is.
|
|
83
|
+
# Returns the written file path or nil if no audio is present.
|
|
84
|
+
def save_audio(filepath)
|
|
85
|
+
data_b64 = audio_data
|
|
86
|
+
return nil unless data_b64
|
|
87
|
+
|
|
88
|
+
require 'base64'
|
|
89
|
+
raw = Base64.strict_decode64(data_b64)
|
|
90
|
+
mime = audio_mime_type.to_s
|
|
91
|
+
|
|
92
|
+
if mime.include?("L16") || mime.include?("pcm")
|
|
93
|
+
rate = mime[/rate=(\d+)/, 1]&.to_i || 24000
|
|
94
|
+
channels = 1
|
|
95
|
+
bits_per_sample = 16
|
|
96
|
+
byte_rate = rate * channels * bits_per_sample / 8
|
|
97
|
+
block_align = channels * bits_per_sample / 8
|
|
98
|
+
data_size = raw.bytesize
|
|
99
|
+
|
|
100
|
+
header = +""
|
|
101
|
+
header << "RIFF"
|
|
102
|
+
header << [36 + data_size].pack("V")
|
|
103
|
+
header << "WAVE"
|
|
104
|
+
header << "fmt "
|
|
105
|
+
header << [16].pack("V")
|
|
106
|
+
header << [1].pack("v")
|
|
107
|
+
header << [channels].pack("v")
|
|
108
|
+
header << [rate].pack("V")
|
|
109
|
+
header << [byte_rate].pack("V")
|
|
110
|
+
header << [block_align].pack("v")
|
|
111
|
+
header << [bits_per_sample].pack("v")
|
|
112
|
+
header << "data"
|
|
113
|
+
header << [data_size].pack("V")
|
|
114
|
+
|
|
115
|
+
File.binwrite(filepath, header + raw)
|
|
116
|
+
else
|
|
117
|
+
File.binwrite(filepath, raw)
|
|
118
|
+
end
|
|
119
|
+
filepath
|
|
120
|
+
end
|
|
47
121
|
|
|
48
122
|
# Get all content with string representation
|
|
49
123
|
def full_content
|
|
@@ -73,7 +147,8 @@ module Gemini
|
|
|
73
147
|
!@raw_data.nil? &&
|
|
74
148
|
((@raw_data.key?("candidates") && !@raw_data["candidates"].empty?) ||
|
|
75
149
|
(@raw_data.key?("predictions") && !@raw_data["predictions"].empty?) ||
|
|
76
|
-
embedding_response?
|
|
150
|
+
embedding_response? ||
|
|
151
|
+
count_tokens_response?)
|
|
77
152
|
end
|
|
78
153
|
|
|
79
154
|
# Check if the raw response contains embedding data
|
|
@@ -231,6 +306,28 @@ module Gemini
|
|
|
231
306
|
def total_tokens
|
|
232
307
|
usage&.dig("totalTokens") || 0
|
|
233
308
|
end
|
|
309
|
+
|
|
310
|
+
# Check whether this response is a countTokens API result
|
|
311
|
+
def count_tokens_response?
|
|
312
|
+
!@raw_data.nil? && @raw_data.key?("totalTokens") &&
|
|
313
|
+
!@raw_data.key?("candidates") && !@raw_data.key?("predictions") &&
|
|
314
|
+
!embedding_response?
|
|
315
|
+
end
|
|
316
|
+
|
|
317
|
+
# Total tokens reported by the countTokens API (top-level totalTokens)
|
|
318
|
+
def count_tokens
|
|
319
|
+
@raw_data&.dig("totalTokens")
|
|
320
|
+
end
|
|
321
|
+
|
|
322
|
+
# Cached content token count reported by countTokens
|
|
323
|
+
def cached_content_token_count
|
|
324
|
+
@raw_data&.dig("cachedContentTokenCount") || 0
|
|
325
|
+
end
|
|
326
|
+
|
|
327
|
+
# Per-modality token breakdown reported by countTokens
|
|
328
|
+
def prompt_tokens_details
|
|
329
|
+
@raw_data&.dig("promptTokensDetails") || []
|
|
330
|
+
end
|
|
234
331
|
|
|
235
332
|
# Process chunks for streaming responses
|
|
236
333
|
def stream_chunks
|
data/lib/gemini/tokens.rb
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
module Gemini
|
|
2
|
+
class Tokens
|
|
3
|
+
DEFAULT_MODEL = "gemini-2.5-flash".freeze
|
|
4
|
+
|
|
5
|
+
def initialize(client:)
|
|
6
|
+
@client = client
|
|
7
|
+
end
|
|
8
|
+
|
|
9
|
+
# Count tokens for the given input.
|
|
10
|
+
#
|
|
11
|
+
# input: String, Array of parts/contents, or Hash. Optional when `contents:` is given.
|
|
12
|
+
# contents: full Array of Content objects (overrides input).
|
|
13
|
+
# system_instruction: String or Content hash.
|
|
14
|
+
# tools: Array of tool definitions (passed via generateContentRequest form).
|
|
15
|
+
# generation_config: Hash forwarded as generationConfig.
|
|
16
|
+
# cached_content: cachedContents/* resource name.
|
|
17
|
+
def count(input = nil, model: DEFAULT_MODEL, contents: nil, system_instruction: nil,
|
|
18
|
+
tools: nil, generation_config: nil, cached_content: nil, **parameters)
|
|
19
|
+
normalized_model = normalize_model(model)
|
|
20
|
+
|
|
21
|
+
payload = build_payload(
|
|
22
|
+
model: normalized_model,
|
|
23
|
+
input: input,
|
|
24
|
+
contents: contents,
|
|
25
|
+
system_instruction: system_instruction,
|
|
26
|
+
tools: tools,
|
|
27
|
+
generation_config: generation_config,
|
|
28
|
+
cached_content: cached_content
|
|
29
|
+
).merge(parameters)
|
|
30
|
+
|
|
31
|
+
response = @client.json_post(
|
|
32
|
+
path: "models/#{normalized_model}:countTokens",
|
|
33
|
+
parameters: payload
|
|
34
|
+
)
|
|
35
|
+
Gemini::Response.new(response)
|
|
36
|
+
end
|
|
37
|
+
|
|
38
|
+
private
|
|
39
|
+
|
|
40
|
+
def build_payload(model:, input:, contents:, system_instruction:, tools:, generation_config:, cached_content:)
|
|
41
|
+
resolved_contents = contents || [format_content(input)]
|
|
42
|
+
|
|
43
|
+
# Use generateContentRequest form when extra request fields are present
|
|
44
|
+
if system_instruction || tools || generation_config || cached_content
|
|
45
|
+
# model is required inside the nested GenerateContentRequest
|
|
46
|
+
gc_request = { model: "models/#{model}", contents: resolved_contents }
|
|
47
|
+
gc_request[:systemInstruction] = format_content(system_instruction) if system_instruction
|
|
48
|
+
gc_request[:tools] = tools if tools
|
|
49
|
+
gc_request[:generationConfig] = generation_config if generation_config
|
|
50
|
+
gc_request[:cachedContent] = cached_content if cached_content
|
|
51
|
+
{ generateContentRequest: gc_request }
|
|
52
|
+
else
|
|
53
|
+
{ contents: resolved_contents }
|
|
54
|
+
end
|
|
55
|
+
end
|
|
56
|
+
|
|
57
|
+
def format_content(input)
|
|
58
|
+
case input
|
|
59
|
+
when nil
|
|
60
|
+
raise ArgumentError, "input or contents parameter is required"
|
|
61
|
+
when String
|
|
62
|
+
{ parts: [{ text: input }] }
|
|
63
|
+
when Array
|
|
64
|
+
{ parts: input.map { |part| part.is_a?(String) ? { text: part } : part } }
|
|
65
|
+
when Hash
|
|
66
|
+
input.key?(:parts) || input.key?("parts") ? input : { parts: [input] }
|
|
67
|
+
else
|
|
68
|
+
{ parts: [{ text: input.to_s }] }
|
|
69
|
+
end
|
|
70
|
+
end
|
|
71
|
+
|
|
72
|
+
def normalize_model(model)
|
|
73
|
+
model_str = model.to_s
|
|
74
|
+
model_str.start_with?("models/") ? model_str.delete_prefix("models/") : model_str
|
|
75
|
+
end
|
|
76
|
+
end
|
|
77
|
+
end
|
data/lib/gemini/tts.rb
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
module Gemini
|
|
2
|
+
class TTS
|
|
3
|
+
DEFAULT_MODEL = "gemini-2.5-flash-preview-tts".freeze
|
|
4
|
+
|
|
5
|
+
# 30 prebuilt voice names available for the prebuiltVoiceConfig
|
|
6
|
+
VOICES = %w[
|
|
7
|
+
Zephyr Puck Charon Kore Fenrir Leda Orus Aoede Callirrhoe Autonoe
|
|
8
|
+
Enceladus Iapetus Umbriel Algieba Despina Erinome Algenib Rasalgethi
|
|
9
|
+
Laomedeia Achernar Alnilam Schedar Gacrux Pulcherrima Achird
|
|
10
|
+
Zubenelgenubi Vindemiatrix Sadachbia Sadaltager Sulafat
|
|
11
|
+
].freeze
|
|
12
|
+
|
|
13
|
+
def initialize(client:)
|
|
14
|
+
@client = client
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
# Generate speech audio from text.
|
|
18
|
+
#
|
|
19
|
+
# text: prompt String (use style cues / bracket tags like [excited] for control,
|
|
20
|
+
# or "Speaker 1: ... Speaker 2: ..." for multi-speaker).
|
|
21
|
+
# voice: a single voice name (prebuiltVoiceConfig). Mutually exclusive with multi_speaker.
|
|
22
|
+
# multi_speaker: Array of { speaker:, voice: } Hashes for multi-speaker output.
|
|
23
|
+
# model: TTS preview model name. Defaults to gemini-2.5-flash-preview-tts.
|
|
24
|
+
# speech_config: raw speechConfig Hash override (skips voice/multi_speaker handling).
|
|
25
|
+
def generate(text, voice: nil, multi_speaker: nil, model: DEFAULT_MODEL,
|
|
26
|
+
speech_config: nil, **parameters)
|
|
27
|
+
raise ArgumentError, "text is required" if text.nil? || text.to_s.empty?
|
|
28
|
+
if voice && multi_speaker
|
|
29
|
+
raise ArgumentError, "voice and multi_speaker are mutually exclusive"
|
|
30
|
+
end
|
|
31
|
+
|
|
32
|
+
resolved_speech_config = speech_config || build_speech_config(voice: voice, multi_speaker: multi_speaker)
|
|
33
|
+
raise ArgumentError, "voice, multi_speaker, or speech_config is required" unless resolved_speech_config
|
|
34
|
+
|
|
35
|
+
payload = {
|
|
36
|
+
contents: [{ parts: [{ text: text }] }],
|
|
37
|
+
generationConfig: {
|
|
38
|
+
responseModalities: ["AUDIO"],
|
|
39
|
+
speechConfig: resolved_speech_config
|
|
40
|
+
}
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
payload.merge!(parameters) if parameters && !parameters.empty?
|
|
44
|
+
|
|
45
|
+
response = @client.json_post(
|
|
46
|
+
path: "models/#{normalize_model(model)}:generateContent",
|
|
47
|
+
parameters: payload
|
|
48
|
+
)
|
|
49
|
+
Gemini::Response.new(response)
|
|
50
|
+
end
|
|
51
|
+
|
|
52
|
+
private
|
|
53
|
+
|
|
54
|
+
def build_speech_config(voice:, multi_speaker:)
|
|
55
|
+
if multi_speaker
|
|
56
|
+
speaker_voice_configs = multi_speaker.map do |entry|
|
|
57
|
+
speaker = entry[:speaker] || entry["speaker"]
|
|
58
|
+
v = entry[:voice] || entry["voice"]
|
|
59
|
+
raise ArgumentError, "multi_speaker entries require :speaker and :voice" unless speaker && v
|
|
60
|
+
validate_voice!(v)
|
|
61
|
+
{
|
|
62
|
+
speaker: speaker,
|
|
63
|
+
voiceConfig: { prebuiltVoiceConfig: { voiceName: v } }
|
|
64
|
+
}
|
|
65
|
+
end
|
|
66
|
+
{ multiSpeakerVoiceConfig: { speakerVoiceConfigs: speaker_voice_configs } }
|
|
67
|
+
elsif voice
|
|
68
|
+
validate_voice!(voice)
|
|
69
|
+
{ voiceConfig: { prebuiltVoiceConfig: { voiceName: voice } } }
|
|
70
|
+
end
|
|
71
|
+
end
|
|
72
|
+
|
|
73
|
+
def validate_voice!(voice)
|
|
74
|
+
return if VOICES.include?(voice.to_s)
|
|
75
|
+
raise ArgumentError, "Unknown voice '#{voice}'. Available voices: #{VOICES.join(', ')}"
|
|
76
|
+
end
|
|
77
|
+
|
|
78
|
+
def normalize_model(model)
|
|
79
|
+
model_str = model.to_s
|
|
80
|
+
model_str.start_with?("models/") ? model_str.delete_prefix("models/") : model_str
|
|
81
|
+
end
|
|
82
|
+
end
|
|
83
|
+
end
|
data/lib/gemini/version.rb
CHANGED
data/lib/gemini.rb
CHANGED
|
@@ -12,6 +12,8 @@ require_relative "gemini/threads"
|
|
|
12
12
|
require_relative "gemini/messages"
|
|
13
13
|
require_relative "gemini/runs"
|
|
14
14
|
require_relative "gemini/embeddings"
|
|
15
|
+
require_relative "gemini/tokens"
|
|
16
|
+
require_relative "gemini/tts"
|
|
15
17
|
require_relative "gemini/audio"
|
|
16
18
|
require_relative "gemini/files"
|
|
17
19
|
require_relative "gemini/images"
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: ruby-gemini-api
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 1.1.0
|
|
4
|
+
version: 1.2.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- rira100000000
|
|
@@ -180,7 +180,9 @@ files:
|
|
|
180
180
|
- lib/gemini/response.rb
|
|
181
181
|
- lib/gemini/runs.rb
|
|
182
182
|
- lib/gemini/threads.rb
|
|
183
|
+
- lib/gemini/tokens.rb
|
|
183
184
|
- lib/gemini/tool_definition.rb
|
|
185
|
+
- lib/gemini/tts.rb
|
|
184
186
|
- lib/gemini/version.rb
|
|
185
187
|
- lib/gemini/video.rb
|
|
186
188
|
- lib/ruby/gemini.rb
|