geminisst 1.0.0 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/core.d.ts +0 -5
- package/dist/core.js +12 -58
- package/dist/index.js +6 -2
- package/package.json +1 -1
package/dist/core.d.ts
CHANGED
@@ -1,10 +1,5 @@
 import { SSTOptions, TranscriptionResult } from './types.js';
 /**
  * Processes audio using the Gemini API.
- * @param audioData - Base64 encoded audio string
- * @param mimeType - MIME type of the audio
- * @param apiKey - Google Gemini API Key
- * @param options - Configuration options
- * @returns Promise resolving to the transcription result
  */
 export declare function processAudioWithGemini(audioData: string, mimeType: string, apiKey: string, options: SSTOptions): Promise<TranscriptionResult>;
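The declaration above is unchanged apart from the trimmed JSDoc, so the low-level signature remains audioData (base64 string), mimeType, apiKey, and an SSTOptions object. A minimal usage sketch in TypeScript, assuming the package entry re-exports processAudioWithGemini and that SSTOptions accepts the model and verbose fields seen in core.js (both assumptions inferred from this diff, not from package documentation):

import { readFile } from 'node:fs/promises';
// Assumed import path; the function may only be reachable internally via audioToText.
import { processAudioWithGemini } from 'geminisst';

async function transcribeWav(filePath: string): Promise<string> {
  // Callers of the low-level API supply base64 audio plus its MIME type themselves.
  const audioData = (await readFile(filePath)).toString('base64');
  const result = await processAudioWithGemini(audioData, 'audio/wav', process.env.GEMINI_API_KEY ?? '', {
    model: 'gemini-2.5-flash-lite', // default model name seen in core.js
    verbose: true,                  // logs "[geminisst] Model: ..." as of 1.0.2
  });
  return result.text;
}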
package/dist/core.js
CHANGED
@@ -5,40 +5,30 @@ import { GoogleGenAI } from '@google/genai';
 import { DEFAULT_SYSTEM_INSTRUCTION } from './constants.js';
 /**
  * Processes audio using the Gemini API.
- * @param audioData - Base64 encoded audio string
- * @param mimeType - MIME type of the audio
- * @param apiKey - Google Gemini API Key
- * @param options - Configuration options
- * @returns Promise resolving to the transcription result
  */
 export async function processAudioWithGemini(audioData, mimeType, apiKey, options) {
     if (!apiKey) {
-        throw new Error("API Key is required");
+        throw new Error("[geminisst] API Key is required");
     }
-    // Initialize the AI client
+    // Initialize the AI client
     const ai = new GoogleGenAI({ apiKey: apiKey });
     const modelName = options.model || "gemini-2.5-flash-lite";
-
+    const startTime = Date.now();
+    // Configure as per latest Gemini 2.5 specifications
     const config = {
         thinkingConfig: {
-            includeThoughts: true,
-            thinkingBudget: -1
+            includeThoughts: true,
+            thinkingBudget: -1
         },
-        // Fixed System Instruction: Users cannot override this as it is the core STT logic.
         systemInstruction: DEFAULT_SYSTEM_INSTRUCTION
     };
     if (options.verbose) {
-        console.log(`[
-        console.log(`[
-        console.log(`[SSTLibrary] System Instruction: Locked (Core)`);
+        console.log(`[geminisst] Model: ${modelName}`);
+        console.log(`[geminisst] Thinking: Enabled (Dynamic)`);
     }
     const promptText = options.prompt || "Transcribe this audio.";
-    const startTime = Date.now();
     try {
-        /**
-         * Using the syntax from the provided documentation:
-         * ai.models.generateContent({ model, contents, config })
-         */
+        // Standard call using the models.generateContent API
         const response = await ai.models.generateContent({
             model: modelName,
             contents: [
@@ -59,10 +49,9 @@ export async function processAudioWithGemini(audioData, mimeType, apiKey, option
         });
         const endTime = Date.now();
         const processingTimeSec = parseFloat(((endTime - startTime) / 1000).toFixed(2));
-        // Handle the response according to the documentation structure
         const candidate = response.candidates?.[0];
         const textParts = candidate?.content?.parts || [];
-        //
+        // Separate transcript and thoughts
         const transcriptText = textParts
             .filter((p) => !p.thought)
             .map((p) => p.text)
@@ -71,7 +60,6 @@ export async function processAudioWithGemini(audioData, mimeType, apiKey, option
             .filter((p) => p.thought)
             .map((p) => p.text)
             .join('') || "";
-        // Extract usage details
         const usage = response.usageMetadata ? {
             inputTokens: response.usageMetadata.promptTokenCount || 0,
             outputTokens: response.usageMetadata.candidatesTokenCount || 0,
@@ -86,41 +74,7 @@ export async function processAudioWithGemini(audioData, mimeType, apiKey, option
         };
     }
     catch (error) {
-
-
-        if (options.verbose)
-            console.warn("[SSTLibrary] Newer syntax failed, trying fallback...");
-        try {
-            const model = ai.getGenerativeModel({ model: modelName }, config);
-            const result = await model.generateContent({
-                contents: [{
-                        role: 'user',
-                        parts: [
-                            { text: promptText },
-                            { inlineData: { mimeType, data: audioData } }
-                        ]
-                    }]
-            });
-            const endTime = Date.now();
-            const processingTimeSec = parseFloat(((endTime - startTime) / 1000).toFixed(2));
-            const resp = result.response;
-            const candidate = resp.candidates?.[0];
-            const parts = candidate?.content?.parts || [];
-            return {
-                text: parts.filter((p) => !p.thought).map((p) => p.text).join(''),
-                thoughts: parts.filter((p) => p.thought).map((p) => p.text).join(''),
-                model: modelName,
-                usage: resp.usageMetadata ? {
-                    inputTokens: resp.usageMetadata.promptTokenCount,
-                    outputTokens: resp.usageMetadata.candidatesTokenCount,
-                    totalTokens: resp.usageMetadata.totalTokenCount,
-                    processingTimeSec: processingTimeSec
-                } : undefined
-            };
-        }
-        catch (fallbackError) {
-            console.error("[SSTLibrary] Transcription failed:", fallbackError);
-            throw fallbackError;
-        }
+        console.error("[geminisst] Error calling Gemini API:", error.message);
+        throw error;
     }
 }
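The 1.0.2 change in core.js drops the fallback path that called ai.getGenerativeModel, a method that appears to belong to the older @google/generative-ai SDK rather than to the GoogleGenAI client imported here, and instead logs the failure with a [geminisst] prefix and rethrows the original error. A hedged TypeScript sketch of how a caller might handle that behaviour (the import path and the TranscriptionResult re-export are assumptions based on this diff):

import { processAudioWithGemini } from 'geminisst';   // assumed entry re-export
import type { TranscriptionResult } from 'geminisst'; // assumed type re-export

// Hypothetical wrapper: 1.0.2 logs "[geminisst] Error calling Gemini API: <message>"
// and rethrows the original @google/genai error instead of retrying with a fallback SDK call.
async function tryTranscribe(audioBase64: string, apiKey: string): Promise<TranscriptionResult | null> {
  try {
    return await processAudioWithGemini(audioBase64, 'audio/mpeg', apiKey, { verbose: true });
  } catch (err) {
    // No silent fallback any more; the caller decides whether to retry or surface the failure.
    console.error('Transcription failed:', err);
    return null;
  }
}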
package/dist/index.js
CHANGED
@@ -10,9 +10,13 @@ import { bufferToBase64 } from './utils.js';
  * @returns The transcription result object containing text and thoughts
  */
 export async function audioToText(audioFile, apiKey, options = {}) {
-    // 1.
+    // 1. Validate Audio File Path
     if (!fs.existsSync(audioFile)) {
-        throw new Error(`Audio file not found: ${audioFile}`);
+        throw new Error(`[geminisst] Audio file not found at path: ${audioFile}`);
+    }
+    const stats = fs.statSync(audioFile);
+    if (stats.isDirectory()) {
+        throw new Error(`[geminisst] Expected a file path but found a directory: ${audioFile}`);
     }
     // Simple mime type detection based on extension
     const ext = path.extname(audioFile).toLowerCase().replace('.', '');