geminisst 1.0.1 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/core.d.ts +1 -0
- package/dist/core.js +43 -65
- package/dist/index.d.ts +5 -4
- package/dist/index.js +13 -11
- package/dist/types.d.ts +17 -6
- package/dist/types.js +1 -1
- package/package.json +1 -1
package/dist/core.d.ts
CHANGED
package/dist/core.js
CHANGED
|
@@ -1,10 +1,12 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* Core business logic using
|
|
2
|
+
* Core business logic using the modern @google/genai SDK
|
|
3
|
+
* strictly following the provided multimodal and thinking documentation.
|
|
3
4
|
*/
|
|
4
5
|
import { GoogleGenAI } from '@google/genai';
|
|
5
6
|
import { DEFAULT_SYSTEM_INSTRUCTION } from './constants.js';
|
|
6
7
|
/**
|
|
7
8
|
* Processes audio using the Gemini API.
|
|
9
|
+
*
|
|
8
10
|
* @param audioData - Base64 encoded audio string
|
|
9
11
|
* @param mimeType - MIME type of the audio
|
|
10
12
|
* @param apiKey - Google Gemini API Key
|
|
@@ -13,30 +15,32 @@ import { DEFAULT_SYSTEM_INSTRUCTION } from './constants.js';
|
|
|
13
15
|
*/
|
|
14
16
|
export async function processAudioWithGemini(audioData, mimeType, apiKey, options) {
|
|
15
17
|
if (!apiKey) {
|
|
16
|
-
throw new Error("API Key is required");
|
|
18
|
+
throw new Error("[geminisst] API Key is required.");
|
|
17
19
|
}
|
|
18
|
-
// Initialize
|
|
19
|
-
const ai = new GoogleGenAI({ apiKey
|
|
20
|
+
// Initialize client as per documentation: new GoogleGenAI({ apiKey })
|
|
21
|
+
const ai = new GoogleGenAI({ apiKey });
|
|
20
22
|
const modelName = options.model || "gemini-2.5-flash-lite";
|
|
21
|
-
|
|
23
|
+
const startTime = Date.now();
|
|
24
|
+
/**
|
|
25
|
+
* Configuration strictly following the Gemini 2.5 series docs:
|
|
26
|
+
* - thinkingBudget: -1 enables Dynamic Thinking.
|
|
27
|
+
* - includeThoughts: true allows capturing reasoning parts.
|
|
28
|
+
*/
|
|
22
29
|
const config = {
|
|
30
|
+
systemInstruction: DEFAULT_SYSTEM_INSTRUCTION,
|
|
23
31
|
thinkingConfig: {
|
|
24
|
-
includeThoughts: true,
|
|
25
|
-
thinkingBudget: -1
|
|
32
|
+
includeThoughts: true,
|
|
33
|
+
thinkingBudget: -1,
|
|
26
34
|
},
|
|
27
|
-
// Fixed System Instruction: Users cannot override this as it is the core STT logic.
|
|
28
|
-
systemInstruction: DEFAULT_SYSTEM_INSTRUCTION
|
|
29
35
|
};
|
|
30
36
|
if (options.verbose) {
|
|
31
|
-
console.log(`[
|
|
32
|
-
console.log(`[
|
|
33
|
-
console.log(`[SSTLibrary] System Instruction: Locked (Core)`);
|
|
37
|
+
console.log(`[geminisst] Initializing ${modelName}...`);
|
|
38
|
+
console.log(`[geminisst] Dynamic Thinking: Enabled`);
|
|
34
39
|
}
|
|
35
|
-
const promptText = options.prompt || "Transcribe this audio.";
|
|
36
|
-
const startTime = Date.now();
|
|
40
|
+
const promptText = options.prompt || "Transcribe this audio exactly.";
|
|
37
41
|
try {
|
|
38
42
|
/**
|
|
39
|
-
*
|
|
43
|
+
* Multimodal generation using inlineData as per docs:
|
|
40
44
|
* ai.models.generateContent({ model, contents, config })
|
|
41
45
|
*/
|
|
42
46
|
const response = await ai.models.generateContent({
|
|
@@ -59,68 +63,42 @@ export async function processAudioWithGemini(audioData, mimeType, apiKey, option
|
|
|
59
63
|
});
|
|
60
64
|
const endTime = Date.now();
|
|
61
65
|
const processingTimeSec = parseFloat(((endTime - startTime) / 1000).toFixed(2));
|
|
62
|
-
//
|
|
66
|
+
// response.candidates[0].content.parts handling
|
|
63
67
|
const candidate = response.candidates?.[0];
|
|
64
|
-
const
|
|
65
|
-
//
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
.
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
68
|
+
const parts = candidate?.content?.parts || [];
|
|
69
|
+
// Separate actual transcript from reasoning thoughts
|
|
70
|
+
let transcriptText = "";
|
|
71
|
+
let thoughtText = "";
|
|
72
|
+
for (const part of parts) {
|
|
73
|
+
if (part.thought) {
|
|
74
|
+
thoughtText += part.text || "";
|
|
75
|
+
}
|
|
76
|
+
else {
|
|
77
|
+
transcriptText += part.text || "";
|
|
78
|
+
}
|
|
79
|
+
}
|
|
80
|
+
/**
|
|
81
|
+
* Usage metadata handling as per documentation:
|
|
82
|
+
* response.usageMetadata.promptTokenCount etc.
|
|
83
|
+
*/
|
|
75
84
|
const usage = response.usageMetadata ? {
|
|
76
85
|
inputTokens: response.usageMetadata.promptTokenCount || 0,
|
|
77
86
|
outputTokens: response.usageMetadata.candidatesTokenCount || 0,
|
|
78
87
|
totalTokens: response.usageMetadata.totalTokenCount || 0,
|
|
79
88
|
processingTimeSec: processingTimeSec
|
|
80
|
-
} :
|
|
89
|
+
} : { processingTimeSec };
|
|
81
90
|
return {
|
|
82
|
-
text: transcriptText,
|
|
83
|
-
thoughts: thoughtText,
|
|
91
|
+
text: transcriptText.trim(),
|
|
92
|
+
thoughts: thoughtText.trim(),
|
|
84
93
|
model: modelName,
|
|
85
94
|
usage: usage
|
|
86
95
|
};
|
|
87
96
|
}
|
|
88
97
|
catch (error) {
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
if (options.verbose)
|
|
92
|
-
console.warn("[SSTLibrary] Newer syntax failed, trying fallback...");
|
|
93
|
-
try {
|
|
94
|
-
const model = ai.getGenerativeModel({ model: modelName }, config);
|
|
95
|
-
const result = await model.generateContent({
|
|
96
|
-
contents: [{
|
|
97
|
-
role: 'user',
|
|
98
|
-
parts: [
|
|
99
|
-
{ text: promptText },
|
|
100
|
-
{ inlineData: { mimeType, data: audioData } }
|
|
101
|
-
]
|
|
102
|
-
}]
|
|
103
|
-
});
|
|
104
|
-
const endTime = Date.now();
|
|
105
|
-
const processingTimeSec = parseFloat(((endTime - startTime) / 1000).toFixed(2));
|
|
106
|
-
const resp = result.response;
|
|
107
|
-
const candidate = resp.candidates?.[0];
|
|
108
|
-
const parts = candidate?.content?.parts || [];
|
|
109
|
-
return {
|
|
110
|
-
text: parts.filter((p) => !p.thought).map((p) => p.text).join(''),
|
|
111
|
-
thoughts: parts.filter((p) => p.thought).map((p) => p.text).join(''),
|
|
112
|
-
model: modelName,
|
|
113
|
-
usage: resp.usageMetadata ? {
|
|
114
|
-
inputTokens: resp.usageMetadata.promptTokenCount,
|
|
115
|
-
outputTokens: resp.usageMetadata.candidatesTokenCount,
|
|
116
|
-
totalTokens: resp.usageMetadata.totalTokenCount,
|
|
117
|
-
processingTimeSec: processingTimeSec
|
|
118
|
-
} : undefined
|
|
119
|
-
};
|
|
120
|
-
}
|
|
121
|
-
catch (fallbackError) {
|
|
122
|
-
console.error("[SSTLibrary] Transcription failed:", fallbackError);
|
|
123
|
-
throw fallbackError;
|
|
98
|
+
if (options.verbose) {
|
|
99
|
+
console.error("[geminisst] API Call failed:", error);
|
|
124
100
|
}
|
|
101
|
+
// Re-throw with clear message
|
|
102
|
+
throw new Error(`[geminisst] ${error.message || "Unknown API Error"}`);
|
|
125
103
|
}
|
|
126
104
|
}
|
package/dist/index.d.ts
CHANGED
|
@@ -1,9 +1,10 @@
|
|
|
1
1
|
import { SSTOptions, TranscriptionResult } from './types.js';
|
|
2
2
|
/**
|
|
3
|
-
* Node.js entry point
|
|
4
|
-
*
|
|
5
|
-
* @param
|
|
6
|
-
* @param
|
|
3
|
+
* Node.js entry point for geminisst
|
|
4
|
+
*
|
|
5
|
+
* @param audioFile - Path to the audio file
|
|
6
|
+
* @param apiKey - Google Gemini API Key
|
|
7
|
+
* @param options - Configuration options
|
|
7
8
|
* @returns The transcription result object containing text and thoughts
|
|
8
9
|
*/
|
|
9
10
|
export declare function audioToText(audioFile: string, apiKey: string, options?: SSTOptions): Promise<TranscriptionResult>;
|
package/dist/index.js
CHANGED
|
@@ -1,39 +1,41 @@
|
|
|
1
|
-
import { processAudioWithGemini } from './core.js';
|
|
2
1
|
import * as fs from 'fs';
|
|
3
2
|
import * as path from 'path';
|
|
3
|
+
import { processAudioWithGemini } from './core.js';
|
|
4
4
|
import { bufferToBase64 } from './utils.js';
|
|
5
5
|
/**
|
|
6
|
-
* Node.js entry point
|
|
7
|
-
*
|
|
8
|
-
* @param
|
|
9
|
-
* @param
|
|
6
|
+
* Node.js entry point for geminisst
|
|
7
|
+
*
|
|
8
|
+
* @param audioFile - Path to the audio file
|
|
9
|
+
* @param apiKey - Google Gemini API Key
|
|
10
|
+
* @param options - Configuration options
|
|
10
11
|
* @returns The transcription result object containing text and thoughts
|
|
11
12
|
*/
|
|
12
13
|
export async function audioToText(audioFile, apiKey, options = {}) {
|
|
13
14
|
// 1. Validate Audio File Path
|
|
14
15
|
if (!fs.existsSync(audioFile)) {
|
|
15
|
-
throw new Error(`[geminisst] Audio file not found
|
|
16
|
+
throw new Error(`[geminisst] Audio file not found: ${audioFile}`);
|
|
16
17
|
}
|
|
17
18
|
const stats = fs.statSync(audioFile);
|
|
18
19
|
if (stats.isDirectory()) {
|
|
19
20
|
throw new Error(`[geminisst] Expected a file path but found a directory: ${audioFile}`);
|
|
20
21
|
}
|
|
21
|
-
//
|
|
22
|
+
// 2. Detect Mime Type
|
|
22
23
|
const ext = path.extname(audioFile).toLowerCase().replace('.', '');
|
|
23
|
-
// Default map
|
|
24
24
|
const mimeMap = {
|
|
25
25
|
'mp3': 'audio/mp3',
|
|
26
|
+
'mpeg': 'audio/mpeg',
|
|
26
27
|
'wav': 'audio/wav',
|
|
27
28
|
'ogg': 'audio/ogg',
|
|
28
29
|
'flac': 'audio/flac',
|
|
29
30
|
'aac': 'audio/aac',
|
|
30
|
-
'm4a': 'audio/m4a',
|
|
31
|
+
'm4a': 'audio/m4a',
|
|
31
32
|
'mp4': 'audio/mp4'
|
|
32
33
|
};
|
|
33
|
-
const mimeType = mimeMap[ext] || 'audio/mp3';
|
|
34
|
+
const mimeType = mimeMap[ext] || 'audio/mp3';
|
|
35
|
+
// 3. Read File and Convert to Base64
|
|
34
36
|
const fileBuffer = fs.readFileSync(audioFile);
|
|
35
37
|
const base64Audio = bufferToBase64(fileBuffer);
|
|
36
|
-
//
|
|
38
|
+
// 4. Process with Gemini Core
|
|
37
39
|
return await processAudioWithGemini(base64Audio, mimeType, apiKey, options);
|
|
38
40
|
}
|
|
39
41
|
export * from './types.js';
|
package/dist/types.d.ts
CHANGED
|
@@ -1,26 +1,37 @@
|
|
|
1
1
|
/**
|
|
2
|
-
*
|
|
2
|
+
* geminisst Type Definitions
|
|
3
3
|
*/
|
|
4
|
-
export type AudioInput = string | File;
|
|
5
4
|
export interface SSTOptions {
|
|
6
5
|
/**
|
|
7
|
-
*
|
|
6
|
+
* Optional guidance for the model (e.g. "Transcribe in Hindi", "English transcript only").
|
|
8
7
|
*/
|
|
9
8
|
prompt?: string;
|
|
10
9
|
/**
|
|
11
|
-
*
|
|
10
|
+
* The Gemini model version. Defaults to "gemini-2.5-flash-lite".
|
|
12
11
|
*/
|
|
13
12
|
model?: string;
|
|
14
13
|
/**
|
|
15
|
-
*
|
|
14
|
+
* Enable internal logging for debugging.
|
|
16
15
|
*/
|
|
17
16
|
verbose?: boolean;
|
|
18
17
|
}
|
|
19
18
|
export interface TranscriptionResult {
|
|
19
|
+
/**
|
|
20
|
+
* The verbatim transcribed text.
|
|
21
|
+
*/
|
|
20
22
|
text: string;
|
|
23
|
+
/**
|
|
24
|
+
* The AI's internal reasoning (Thought Summary).
|
|
25
|
+
*/
|
|
21
26
|
thoughts?: string;
|
|
27
|
+
/**
|
|
28
|
+
* The model used for the request.
|
|
29
|
+
*/
|
|
22
30
|
model: string;
|
|
23
|
-
|
|
31
|
+
/**
|
|
32
|
+
* Token usage and performance metadata.
|
|
33
|
+
*/
|
|
34
|
+
usage: {
|
|
24
35
|
inputTokens: number;
|
|
25
36
|
outputTokens: number;
|
|
26
37
|
totalTokens: number;
|
package/dist/types.js
CHANGED