geminisst 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +15 -0
- package/README.md +119 -0
- package/dist/constants.d.ts +1 -0
- package/dist/constants.js +35 -0
- package/dist/core.d.ts +10 -0
- package/dist/core.js +126 -0
- package/dist/index.d.ts +10 -0
- package/dist/index.js +35 -0
- package/dist/types.d.ts +29 -0
- package/dist/types.js +4 -0
- package/dist/utils.d.ts +13 -0
- package/dist/utils.js +35 -0
- package/package.json +42 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
ISC License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026, Smart Tell Line
|
|
4
|
+
|
|
5
|
+
Permission to use, copy, modify, and/or distribute this software for any
|
|
6
|
+
purpose with or without fee is hereby granted, provided that the above
|
|
7
|
+
copyright notice and this permission notice appear in all copies.
|
|
8
|
+
|
|
9
|
+
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
|
10
|
+
WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
|
11
|
+
MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
|
|
12
|
+
ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
|
13
|
+
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
|
14
|
+
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
|
15
|
+
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
# geminisst 🎙️
|
|
2
|
+
|
|
3
|
+
**geminisst** is a revolutionary, professional-grade Node.js library for high-accuracy **Audio-to-Text** conversion. Powered by **Google's Gemini 2.5 Flash Lite**, it offers a massive **1 Million+ context window** and next-gen multimodal understanding that outperforms traditional STT engines.
|
|
4
|
+
|
|
5
|
+
Whether it's a 3-second voice clip or a multi-hour podcast, `geminisst` processes it with lightning speed, incredible cost-efficiency, and deep reasoning capabilities.
|
|
6
|
+
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
## 🚀 Key Features
|
|
10
|
+
|
|
11
|
+
* **Gemini 2.5 Flash Lite Engine:** Optimized for the latest AI architecture, ensuring ultra-fast response times.
|
|
12
|
+
* **1M+ Context Window:** Process massive audio files (up to several hours) in a single request.
|
|
13
|
+
* **Dynamic Thinking (Reasoning):** Uses `thinkingBudget: -1` to allow the AI to reason about the audio (accents, context, noise) before transcribing.
|
|
14
|
+
* **AI Thought Summaries:** Access the AI's internal reasoning process (`thoughts`) to understand *how* it arrived at the transcript.
|
|
15
|
+
* **Locked Core Logic:** Built-in, non-overridable system instructions ensure the AI acts as a pure transcription engine (no summaries, no opinions, 100% verbatim).
|
|
16
|
+
* **Automatic Language Detection:** Seamlessly handles Hindi, English, Hinglish, and many other languages without manual configuration.
|
|
17
|
+
* **Processing Metadata:** Real-time tracking of token usage and exact processing time in seconds.
|
|
18
|
+
* **TypeScript Native:** Full type safety and IntelliSense support for a superior developer experience.
|
|
19
|
+
|
|
20
|
+
---
|
|
21
|
+
|
|
22
|
+
## 📦 Installation
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
npm install geminisst
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
*Note: You need a Google Gemini API Key. Get one for free at [Google AI Studio](https://aistudio.google.com/).*
|
|
29
|
+
|
|
30
|
+
---
|
|
31
|
+
|
|
32
|
+
## 🛠️ Quick Start
|
|
33
|
+
|
|
34
|
+
### 1. Simple Transcription
|
|
35
|
+
The most basic way to use `geminisst`.
|
|
36
|
+
|
|
37
|
+
```javascript
|
|
38
|
+
import { audioToText } from 'geminisst';
|
|
39
|
+
|
|
40
|
+
const apiKey = "YOUR_GEMINI_API_KEY";
|
|
41
|
+
const result = await audioToText('./meeting.mp3', apiKey);
|
|
42
|
+
|
|
43
|
+
console.log("Transcript:", result.text);
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
### 2. Advanced Usage (With Thoughts & Metadata)
|
|
47
|
+
Get deeper insights into the transcription process.
|
|
48
|
+
|
|
49
|
+
```javascript
|
|
50
|
+
import { audioToText } from 'geminisst';
|
|
51
|
+
|
|
52
|
+
async function transcribeWithInsights() {
|
|
53
|
+
const result = await audioToText('./interview.wav', 'YOUR_API_KEY', {
|
|
54
|
+
prompt: "The audio is in a mix of Hindi and English. Please use Devanagari for Hindi parts.",
|
|
55
|
+
verbose: true
|
|
56
|
+
});
|
|
57
|
+
|
|
58
|
+
// Access the AI's "Internal Monologue"
|
|
59
|
+
if (result.thoughts) {
|
|
60
|
+
console.log("AI Reasoning:", result.thoughts);
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
// Exact spoken words
|
|
64
|
+
console.log("Transcript:", result.text);
|
|
65
|
+
|
|
66
|
+
// Performance metrics
|
|
67
|
+
console.log(`Finished in ${result.usage.processingTimeSec}s`);
|
|
68
|
+
console.log(`Total Tokens used: ${result.usage.totalTokens}`);
|
|
69
|
+
}
|
|
70
|
+
|
|
71
|
+
transcribeWithInsights();
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
---
|
|
75
|
+
|
|
76
|
+
## 📖 API Reference
|
|
77
|
+
|
|
78
|
+
### `audioToText(filePath, apiKey, options?)`
|
|
79
|
+
|
|
80
|
+
| Parameter | Type | Description |
|
|
81
|
+
| :--- | :--- | :--- |
|
|
82
|
+
| `filePath` | `string` | Absolute or relative path to the audio file. |
|
|
83
|
+
| `apiKey` | `string` | Your Google Gemini API Key. |
|
|
84
|
+
| `options` | `SSTOptions` | (Optional) Advanced configuration. |
|
|
85
|
+
|
|
86
|
+
### `SSTOptions`
|
|
87
|
+
|
|
88
|
+
| Option | Type | Default | Description |
|
|
89
|
+
| :--- | :--- | :--- | :--- |
|
|
90
|
+
| `prompt` | `string` | `undefined` | Style/Language guidance (e.g., "Transcribe in English letters"). |
|
|
91
|
+
| `model` | `string` | `"gemini-2.5-flash-lite"` | Override default model. |
|
|
92
|
+
| `verbose` | `boolean` | `false` | Log internal steps for debugging. |
|
|
93
|
+
|
|
94
|
+
### `TranscriptionResult`
|
|
95
|
+
|
|
96
|
+
| Property | Type | Description |
|
|
97
|
+
| :--- | :--- | :--- |
|
|
98
|
+
| `text` | `string` | The 100% accurate, verbatim transcript. |
|
|
99
|
+
| `thoughts` | `string` | AI's thought summary explaining the reasoning/language detection. |
|
|
100
|
+
| `model` | `string` | The specific model version used. |
|
|
101
|
+
| `usage` | `object` | Stats: `inputTokens`, `outputTokens`, `totalTokens`, `processingTimeSec`. |
|
|
102
|
+
|
|
103
|
+
---
|
|
104
|
+
|
|
105
|
+
## 🛡️ Transcription Rules (Locked Logic)
|
|
106
|
+
|
|
107
|
+
This library is hardcoded with professional transcription rules that **cannot be overridden**, ensuring reliability:
|
|
108
|
+
1. **Verbatim Content:** Captures stutters, hesitations, and repetitions exactly.
|
|
109
|
+
2. **Noise Suppression:** Automatically ignores background noise and focuses on speech.
|
|
110
|
+
3. **No Interpretation:** Forbidden from adding opinions, interpretations, or extra content.
|
|
111
|
+
4. **Style Integrity:** Maintains the natural flow and pronunciation matching of the spoken words.
|
|
112
|
+
|
|
113
|
+
---
|
|
114
|
+
|
|
115
|
+
## 📄 License
|
|
116
|
+
|
|
117
|
+
ISC - Distributed under the ISC License. See `LICENSE` for more information.
|
|
118
|
+
|
|
119
|
+
Copyright (c) 2026, **Smart Tell Line**.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export declare const DEFAULT_SYSTEM_INSTRUCTION = "You are a speech-to-text AI model. Tumhara kaam sirf ye hai ki jo bhi audio mile, usko accurately text mein convert karo. Tumhe kuch bhi summarize, interpret, ya translate nahi karna hai\u2014bas spoken words ka exact transcript generate karna hai.\nRules/Instructions:\nUser-specified language:\nAgar user language specify karta hai (Hindi, English, Hinglish, etc.), to text us language ke style mein hi likho.\nTranslation mat karo. Agar audio aur user-specified language match nahi karta, to pronunciation match karte hue text likho.\nExample: Audio Hindi, user language English \u2192 English letters me likho, pronunciation Hindi jaisa.\nDefault behavior (language not specified):\nAgar user koi language specify nahi karta, to audio ki language automatically detect karo aur us language ke style mein hi transcript generate karo.\nKisi bhi language ko detect karke usi style mein natural aur realistic transcript produce karo.\nBackground noise handling:\nLow volume background sound ya irrelevant noise ignore karo.\nSirf clearly spoken, meaningful words capture karo.\nExact transcription:\nText exactly waise hi likho jaise bola gaya hai, stutters, hesitations, ya repeated words ke saath.\nPunctuation aur line breaks readability ke liye use kar sakte ho, lekin content strictly audio based hona chahiye.\nMulti-language or code-switching:\nAgar speaker multiple languages mix karta hai, ya Hinglish bolta hai, to user-specified language follow karo, ya default detection ke case me primary spoken language detect karke transcript banao.\nTransliteration rules apply kar sakte ho only for pronunciation matching.\nProfessional speech-to-text behavior:\nAct like a real transcription engine: natural, realistic, aur 100% accurate audio-to-text conversion.\nKabhi bhi apna opinion, extra content, ya summary add mat karo.\nOutput Examples:\nUser language Hindi:\nAudio: \"Hello, kaise ho? Main theek hoon.\"\nOutput: \"Hello, kaise ho? 
Main theek hoon.\"\nUser language English, audio Hindi:\nAudio: \"Mera naam Raj hai.\"\nOutput: \"Mera naam Raj hai.\" (English letters, pronunciation Hindi jaisa)\nNo user language specified (automatic detection):\nAudio: \"Bonjour, comment \u00E7a va?\"\nOutput: \"Bonjour, comment \u00E7a va?\" (French detected, transcript French style)\nHinglish audio, user language Hinglish:\nAudio: \"Good morning, doston!\"\nOutput: \"Good morning, doston!\"\nFocus 100% on accurate audio-to-text conversion with correct language style, pronunciation, and automatic detection. Background noise ignore karo, aur transcript ko natural aur realistic banao.";
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
export const DEFAULT_SYSTEM_INSTRUCTION = `You are a speech-to-text AI model. Tumhara kaam sirf ye hai ki jo bhi audio mile, usko accurately text mein convert karo. Tumhe kuch bhi summarize, interpret, ya translate nahi karna hai—bas spoken words ka exact transcript generate karna hai.
|
|
2
|
+
Rules/Instructions:
|
|
3
|
+
User-specified language:
|
|
4
|
+
Agar user language specify karta hai (Hindi, English, Hinglish, etc.), to text us language ke style mein hi likho.
|
|
5
|
+
Translation mat karo. Agar audio aur user-specified language match nahi karta, to pronunciation match karte hue text likho.
|
|
6
|
+
Example: Audio Hindi, user language English → English letters me likho, pronunciation Hindi jaisa.
|
|
7
|
+
Default behavior (language not specified):
|
|
8
|
+
Agar user koi language specify nahi karta, to audio ki language automatically detect karo aur us language ke style mein hi transcript generate karo.
|
|
9
|
+
Kisi bhi language ko detect karke usi style mein natural aur realistic transcript produce karo.
|
|
10
|
+
Background noise handling:
|
|
11
|
+
Low volume background sound ya irrelevant noise ignore karo.
|
|
12
|
+
Sirf clearly spoken, meaningful words capture karo.
|
|
13
|
+
Exact transcription:
|
|
14
|
+
Text exactly waise hi likho jaise bola gaya hai, stutters, hesitations, ya repeated words ke saath.
|
|
15
|
+
Punctuation aur line breaks readability ke liye use kar sakte ho, lekin content strictly audio based hona chahiye.
|
|
16
|
+
Multi-language or code-switching:
|
|
17
|
+
Agar speaker multiple languages mix karta hai, ya Hinglish bolta hai, to user-specified language follow karo, ya default detection ke case me primary spoken language detect karke transcript banao.
|
|
18
|
+
Transliteration rules apply kar sakte ho only for pronunciation matching.
|
|
19
|
+
Professional speech-to-text behavior:
|
|
20
|
+
Act like a real transcription engine: natural, realistic, aur 100% accurate audio-to-text conversion.
|
|
21
|
+
Kabhi bhi apna opinion, extra content, ya summary add mat karo.
|
|
22
|
+
Output Examples:
|
|
23
|
+
User language Hindi:
|
|
24
|
+
Audio: "Hello, kaise ho? Main theek hoon."
|
|
25
|
+
Output: "Hello, kaise ho? Main theek hoon."
|
|
26
|
+
User language English, audio Hindi:
|
|
27
|
+
Audio: "Mera naam Raj hai."
|
|
28
|
+
Output: "Mera naam Raj hai." (English letters, pronunciation Hindi jaisa)
|
|
29
|
+
No user language specified (automatic detection):
|
|
30
|
+
Audio: "Bonjour, comment ça va?"
|
|
31
|
+
Output: "Bonjour, comment ça va?" (French detected, transcript French style)
|
|
32
|
+
Hinglish audio, user language Hinglish:
|
|
33
|
+
Audio: "Good morning, doston!"
|
|
34
|
+
Output: "Good morning, doston!"
|
|
35
|
+
Focus 100% on accurate audio-to-text conversion with correct language style, pronunciation, and automatic detection. Background noise ignore karo, aur transcript ko natural aur realistic banao.`;
|
package/dist/core.d.ts
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
import { SSTOptions, TranscriptionResult } from './types.js';
/**
 * Processes audio using the Gemini API.
 * @param audioData - Base64 encoded audio string
 * @param mimeType - MIME type of the audio
 * @param apiKey - Google Gemini API Key
 * @param options - Configuration options
 * @returns Promise resolving to the transcription result
 * @throws When no API key is supplied or the Gemini API request fails
 */
export declare function processAudioWithGemini(audioData: string, mimeType: string, apiKey: string, options: SSTOptions): Promise<TranscriptionResult>;
|
package/dist/core.js
ADDED
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Core business logic using Google GenAI SDK
|
|
3
|
+
*/
|
|
4
|
+
import { GoogleGenAI } from '@google/genai';
|
|
5
|
+
import { DEFAULT_SYSTEM_INSTRUCTION } from './constants.js';
|
|
6
|
+
/**
 * Processes audio using the Gemini API.
 *
 * The system instruction is intentionally fixed (see constants.js) so the
 * model always acts as a pure transcription engine and callers cannot
 * override the core STT behavior.
 *
 * @param audioData - Base64 encoded audio string
 * @param mimeType - MIME type of the audio (e.g. "audio/mp3")
 * @param apiKey - Google Gemini API Key
 * @param options - Configuration options (prompt, model, verbose)
 * @returns Promise resolving to the transcription result
 * @throws {Error} When no API key is supplied or the API request fails
 */
export async function processAudioWithGemini(audioData, mimeType, apiKey, options) {
    if (!apiKey) {
        throw new Error("API Key is required");
    }
    // Initialize the AI client according to documentation: new GoogleGenAI({ apiKey })
    const ai = new GoogleGenAI({ apiKey: apiKey });
    const modelName = options.model || "gemini-2.5-flash-lite";
    // Configure thinking mode as per Gemini 2.5 specifications:
    // thinkingBudget -1 enables dynamic thinking; includeThoughts surfaces the
    // thought-summary parts in the response so we can expose them to callers.
    const config = {
        thinkingConfig: {
            includeThoughts: true,
            thinkingBudget: -1
        },
        // Fixed System Instruction: users cannot override this as it is the core STT logic.
        systemInstruction: DEFAULT_SYSTEM_INSTRUCTION
    };
    if (options.verbose) {
        console.log(`[SSTLibrary] Model: ${modelName}`);
        console.log(`[SSTLibrary] Thinking: Dynamic (-1)`);
        console.log(`[SSTLibrary] System Instruction: Locked (Core)`);
    }
    const promptText = options.prompt || "Transcribe this audio.";
    const startTime = Date.now();
    try {
        // Primary path — @google/genai unified SDK syntax:
        // ai.models.generateContent({ model, contents, config })
        const response = await ai.models.generateContent({
            model: modelName,
            contents: [
                {
                    role: "user",
                    parts: [
                        { text: promptText },
                        {
                            inlineData: {
                                mimeType: mimeType,
                                data: audioData
                            }
                        }
                    ]
                }
            ],
            config: config
        });
        const endTime = Date.now();
        const processingTimeSec = parseFloat(((endTime - startTime) / 1000).toFixed(2));
        const candidate = response.candidates?.[0];
        const textParts = candidate?.content?.parts || [];
        // Thought-summary parts carry `thought: true`; everything else is transcript.
        const transcriptText = textParts
            .filter((p) => !p.thought)
            .map((p) => p.text)
            .join('') || "";
        const thoughtText = textParts
            .filter((p) => p.thought)
            .map((p) => p.text)
            .join('') || "";
        const usage = response.usageMetadata ? {
            inputTokens: response.usageMetadata.promptTokenCount || 0,
            outputTokens: response.usageMetadata.candidatesTokenCount || 0,
            totalTokens: response.usageMetadata.totalTokenCount || 0,
            processingTimeSec: processingTimeSec
        } : undefined;
        return {
            text: transcriptText,
            thoughts: thoughtText,
            model: modelName,
            usage: usage
        };
    }
    catch (error) {
        // BUG FIX: the previous fallback unconditionally called
        // ai.getGenerativeModel(), which does not exist on @google/genai's
        // GoogleGenAI client (it belongs to the legacy @google/generative-ai
        // SDK). That converted every real API error into a confusing
        // TypeError. Only attempt the legacy path when the method exists;
        // otherwise surface the original failure.
        if (typeof ai.getGenerativeModel !== 'function') {
            console.error("[SSTLibrary] Transcription failed:", error);
            throw error;
        }
        if (options.verbose)
            console.warn("[SSTLibrary] Newer syntax failed, trying fallback...");
        try {
            // Legacy SDK shape: model-level settings — including the system
            // instruction, which the old fallback silently dropped by passing
            // config as the (unrelated) requestOptions argument — belong in
            // the first parameter of getGenerativeModel.
            const model = ai.getGenerativeModel({
                model: modelName,
                systemInstruction: DEFAULT_SYSTEM_INSTRUCTION
            });
            const result = await model.generateContent({
                contents: [{
                        role: 'user',
                        parts: [
                            { text: promptText },
                            { inlineData: { mimeType, data: audioData } }
                        ]
                    }]
            });
            const endTime = Date.now();
            const processingTimeSec = parseFloat(((endTime - startTime) / 1000).toFixed(2));
            const resp = result.response;
            const candidate = resp.candidates?.[0];
            const parts = candidate?.content?.parts || [];
            // Mirror the primary path's `|| 0` defaults for consistency.
            return {
                text: parts.filter((p) => !p.thought).map((p) => p.text).join('') || "",
                thoughts: parts.filter((p) => p.thought).map((p) => p.text).join('') || "",
                model: modelName,
                usage: resp.usageMetadata ? {
                    inputTokens: resp.usageMetadata.promptTokenCount || 0,
                    outputTokens: resp.usageMetadata.candidatesTokenCount || 0,
                    totalTokens: resp.usageMetadata.totalTokenCount || 0,
                    processingTimeSec: processingTimeSec
                } : undefined
            };
        }
        catch (fallbackError) {
            console.error("[SSTLibrary] Transcription failed:", fallbackError);
            throw fallbackError;
        }
    }
}
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
import { SSTOptions, TranscriptionResult } from './types.js';
/**
 * Node.js entry point
 * @param audioFile Path to the audio file
 * @param apiKey Google Gemini API Key
 * @param options Configuration options
 * @returns The transcription result object containing text and thoughts
 * @throws If the audio file does not exist on disk
 */
export declare function audioToText(audioFile: string, apiKey: string, options?: SSTOptions): Promise<TranscriptionResult>;
export * from './types.js';
|
package/dist/index.js
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
import { processAudioWithGemini } from './core.js';
|
|
2
|
+
import * as fs from 'fs';
|
|
3
|
+
import * as path from 'path';
|
|
4
|
+
import { bufferToBase64 } from './utils.js';
|
|
5
|
+
/**
 * Node.js entry point: reads an audio file from disk, infers its MIME type
 * from the file extension, and forwards the base64-encoded bytes to the
 * Gemini processing core.
 *
 * @param audioFile Path to the audio file
 * @param apiKey Google Gemini API Key
 * @param options Configuration options
 * @returns The transcription result object containing text and thoughts
 * @throws {Error} If the audio file does not exist
 */
export async function audioToText(audioFile, apiKey, options = {}) {
    // 1. Read Audio File
    if (!fs.existsSync(audioFile)) {
        throw new Error(`Audio file not found: ${audioFile}`);
    }
    // Simple mime type detection based on extension
    const ext = path.extname(audioFile).toLowerCase().replace('.', '');
    // Extension -> MIME map. Extended to cover common aliases
    // (mpeg/mpga, aiff/aif, oga) alongside the original entries.
    const mimeMap = {
        'mp3': 'audio/mp3',
        'mpeg': 'audio/mp3',
        'mpga': 'audio/mp3',
        'wav': 'audio/wav',
        'ogg': 'audio/ogg',
        'oga': 'audio/ogg',
        'flac': 'audio/flac',
        'aac': 'audio/aac',
        'aiff': 'audio/aiff',
        'aif': 'audio/aiff',
        'm4a': 'audio/m4a', // often parsed as mp4/aac
        'mp4': 'audio/mp4'
    };
    const mimeType = mimeMap[ext] || 'audio/mp3'; // Default to mp3 if unknown
    const fileBuffer = fs.readFileSync(audioFile);
    const base64Audio = bufferToBase64(fileBuffer);
    // 2. Process
    return await processAudioWithGemini(base64Audio, mimeType, apiKey, options);
}
|
|
35
|
+
export * from './types.js';
|
package/dist/types.d.ts
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
/**
 * Common types for the SST Library
 */
export type AudioInput = string | File;
export interface SSTOptions {
    /**
     * The text prompt to guide the audio processing (e.g., "Transcribe in Hindi", "In English letters").
     */
    prompt?: string;
    /**
     * Model to use. Defaults to "gemini-2.5-flash-lite".
     */
    model?: string;
    /**
     * Verbose logging.
     */
    verbose?: boolean;
}
export interface TranscriptionResult {
    /** The transcript text assembled from the model's non-thought parts. */
    text: string;
    /** The model's thought-summary text, when thinking output is returned. */
    thoughts?: string;
    /** The model name used for this request. */
    model: string;
    /** Token counts and wall-clock processing time; absent if the API returned no usage metadata. */
    usage?: {
        inputTokens: number;
        outputTokens: number;
        totalTokens: number;
        processingTimeSec: number;
    };
}
|
package/dist/types.js
ADDED
package/dist/utils.d.ts
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
/**
 * Utility functions
 */
/** True when running under Node.js (process.versions.node is present). */
export declare function isNode(): boolean;
/** True when running in a browser (a window with a document is present). */
export declare function isBrowser(): boolean;
/**
 * Converts a File object (Browser) to a Base64 string.
 */
export declare function fileToBase64(file: File): Promise<string>;
/**
 * Converts a NodeJS Buffer to a Base64 string.
 */
export declare function bufferToBase64(buffer: Buffer): string;
|
package/dist/utils.js
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Utility functions
|
|
3
|
+
*/
|
|
4
|
+
/**
 * Reports whether the current runtime is Node.js by probing for a global
 * `process` object carrying `versions.node`.
 */
export function isNode() {
    const proc = typeof process !== 'undefined' ? process : undefined;
    return proc?.versions?.node != null;
}
|
|
7
|
+
/**
 * Reports whether the current runtime is a browser by checking for a global
 * `window` object that exposes a `document`.
 */
export function isBrowser() {
    if (typeof window === 'undefined') {
        return false;
    }
    return typeof window.document !== 'undefined';
}
|
|
10
|
+
/**
 * Converts a File object (Browser) to a Base64 string.
 *
 * BUG FIX: the previous implementation relied on the browser-only
 * `FileReader` global, which does not exist in Node.js — so calling this
 * helper in the package's primary (Node) runtime threw a ReferenceError.
 * `Blob#arrayBuffer()` is available both in modern browsers and in
 * Node >= 18 (matching the package's engines field), so the same code now
 * works in either environment and produces the identical base64 payload
 * (the old code merely stripped the data-URL prefix to get the raw base64).
 *
 * @param {File} file - The File (or any Blob-like with arrayBuffer()) to encode.
 * @returns {Promise<string>} Base64 encoding of the file's bytes.
 */
export async function fileToBase64(file) {
    const bytes = new Uint8Array(await file.arrayBuffer());
    // Build the binary string in chunks so large files don't blow the
    // argument-spread call-stack limit of String.fromCharCode.
    const CHUNK = 0x8000;
    let binary = '';
    for (let i = 0; i < bytes.length; i += CHUNK) {
        binary += String.fromCharCode(...bytes.subarray(i, i + CHUNK));
    }
    // btoa is a global in browsers and in Node >= 16.
    return btoa(binary);
}
|
|
30
|
+
/**
 * Converts a NodeJS Buffer to a Base64 string.
 *
 * @param {Buffer} buffer - Raw bytes to encode.
 * @returns {string} Base64 representation of the buffer contents.
 */
export function bufferToBase64(buffer) {
    const encoded = buffer.toString('base64');
    return encoded;
}
|
package/package.json
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "geminisst",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "Revolutionary high-accuracy Audio-to-Text library powered by Gemini 2.5 Flash Lite with 1M+ context window.",
|
|
5
|
+
"keywords": [
|
|
6
|
+
"sst",
|
|
7
|
+
"stt",
|
|
8
|
+
"gemini",
|
|
9
|
+
"audio-to-text",
|
|
10
|
+
"transcription",
|
|
11
|
+
"multimodal",
|
|
12
|
+
"ai",
|
|
13
|
+
"google-genai"
|
|
14
|
+
],
|
|
15
|
+
"license": "ISC",
|
|
16
|
+
"author": "Smart Tell Line",
|
|
17
|
+
"type": "module",
|
|
18
|
+
"main": "./dist/index.js",
|
|
19
|
+
"types": "./dist/index.d.ts",
|
|
20
|
+
"files": [
|
|
21
|
+
"dist",
|
|
22
|
+
"README.md",
|
|
23
|
+
"LICENSE"
|
|
24
|
+
],
|
|
25
|
+
"scripts": {
|
|
26
|
+
"build": "tsc",
|
|
27
|
+
"prepublishOnly": "npm run build",
|
|
28
|
+
"test": "node examples/test_node.js",
|
|
29
|
+
"lint": "tsc --noEmit"
|
|
30
|
+
},
|
|
31
|
+
"dependencies": {
|
|
32
|
+
"@google/genai": "*",
|
|
33
|
+
"dotenv": "^16.4.0"
|
|
34
|
+
},
|
|
35
|
+
"devDependencies": {
|
|
36
|
+
"typescript": "^5.0.0",
|
|
37
|
+
"@types/node": "^20.0.0"
|
|
38
|
+
},
|
|
39
|
+
"engines": {
|
|
40
|
+
"node": ">=18.0.0"
|
|
41
|
+
}
|
|
42
|
+
}
|