geminisst 1.0.1 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of geminisst might be problematic; consult the registry's advisory page for more details.

package/dist/core.d.ts CHANGED
@@ -1,6 +1,7 @@
1
1
  import { SSTOptions, TranscriptionResult } from './types.js';
2
2
  /**
3
3
  * Processes audio using the Gemini API.
4
+ *
4
5
  * @param audioData - Base64 encoded audio string
5
6
  * @param mimeType - MIME type of the audio
6
7
  * @param apiKey - Google Gemini API Key
package/dist/core.js CHANGED
@@ -1,10 +1,12 @@
1
1
  /**
2
- * Core business logic using Google GenAI SDK
2
+ * Core business logic using the modern @google/genai SDK
3
+ * strictly following the provided multimodal and thinking documentation.
3
4
  */
4
5
  import { GoogleGenAI } from '@google/genai';
5
6
  import { DEFAULT_SYSTEM_INSTRUCTION } from './constants.js';
6
7
  /**
7
8
  * Processes audio using the Gemini API.
9
+ *
8
10
  * @param audioData - Base64 encoded audio string
9
11
  * @param mimeType - MIME type of the audio
10
12
  * @param apiKey - Google Gemini API Key
@@ -13,30 +15,32 @@ import { DEFAULT_SYSTEM_INSTRUCTION } from './constants.js';
13
15
  */
14
16
  export async function processAudioWithGemini(audioData, mimeType, apiKey, options) {
15
17
  if (!apiKey) {
16
- throw new Error("API Key is required");
18
+ throw new Error("[geminisst] API Key is required.");
17
19
  }
18
- // Initialize the AI client according to documentation: new GoogleGenAI({ apiKey })
19
- const ai = new GoogleGenAI({ apiKey: apiKey });
20
+ // Initialize client as per documentation: new GoogleGenAI({ apiKey })
21
+ const ai = new GoogleGenAI({ apiKey });
20
22
  const modelName = options.model || "gemini-2.5-flash-lite";
21
- // Configure thinking mode as per Gemini 2.5 specifications in documentation
23
+ const startTime = Date.now();
24
+ /**
25
+ * Configuration strictly following the Gemini 2.5 series docs:
26
+ * - thinkingBudget: -1 enables Dynamic Thinking.
27
+ * - includeThoughts: true allows capturing reasoning parts.
28
+ */
22
29
  const config = {
30
+ systemInstruction: DEFAULT_SYSTEM_INSTRUCTION,
23
31
  thinkingConfig: {
24
- includeThoughts: true, // Enabled to allow monitoring thoughts if needed
25
- thinkingBudget: -1 // Dynamic thinking enabled (-1)
32
+ includeThoughts: true,
33
+ thinkingBudget: -1,
26
34
  },
27
- // Fixed System Instruction: Users cannot override this as it is the core STT logic.
28
- systemInstruction: DEFAULT_SYSTEM_INSTRUCTION
29
35
  };
30
36
  if (options.verbose) {
31
- console.log(`[SSTLibrary] Model: ${modelName}`);
32
- console.log(`[SSTLibrary] Thinking: Dynamic (-1)`);
33
- console.log(`[SSTLibrary] System Instruction: Locked (Core)`);
37
+ console.log(`[geminisst] Initializing ${modelName}...`);
38
+ console.log(`[geminisst] Dynamic Thinking: Enabled`);
34
39
  }
35
- const promptText = options.prompt || "Transcribe this audio.";
36
- const startTime = Date.now();
40
+ const promptText = options.prompt || "Transcribe this audio exactly.";
37
41
  try {
38
42
  /**
39
- * Using the syntax from the provided documentation:
43
+ * Multimodal generation using inlineData as per docs:
40
44
  * ai.models.generateContent({ model, contents, config })
41
45
  */
42
46
  const response = await ai.models.generateContent({
@@ -59,68 +63,42 @@ export async function processAudioWithGemini(audioData, mimeType, apiKey, option
59
63
  });
60
64
  const endTime = Date.now();
61
65
  const processingTimeSec = parseFloat(((endTime - startTime) / 1000).toFixed(2));
62
- // Handle the response according to the documentation structure
66
+ // response.candidates[0].content.parts handling
63
67
  const candidate = response.candidates?.[0];
64
- const textParts = candidate?.content?.parts || [];
65
- // Combine text parts and thought parts separately
66
- const transcriptText = textParts
67
- .filter((p) => !p.thought)
68
- .map((p) => p.text)
69
- .join('') || "";
70
- const thoughtText = textParts
71
- .filter((p) => p.thought)
72
- .map((p) => p.text)
73
- .join('') || "";
74
- // Extract usage details
68
+ const parts = candidate?.content?.parts || [];
69
+ // Separate actual transcript from reasoning thoughts
70
+ let transcriptText = "";
71
+ let thoughtText = "";
72
+ for (const part of parts) {
73
+ if (part.thought) {
74
+ thoughtText += part.text || "";
75
+ }
76
+ else {
77
+ transcriptText += part.text || "";
78
+ }
79
+ }
80
+ /**
81
+ * Usage metadata handling as per documentation:
82
+ * response.usageMetadata.promptTokenCount etc.
83
+ */
75
84
  const usage = response.usageMetadata ? {
76
85
  inputTokens: response.usageMetadata.promptTokenCount || 0,
77
86
  outputTokens: response.usageMetadata.candidatesTokenCount || 0,
78
87
  totalTokens: response.usageMetadata.totalTokenCount || 0,
79
88
  processingTimeSec: processingTimeSec
80
- } : undefined;
89
+ } : { processingTimeSec };
81
90
  return {
82
- text: transcriptText,
83
- thoughts: thoughtText,
91
+ text: transcriptText.trim(),
92
+ thoughts: thoughtText.trim(),
84
93
  model: modelName,
85
94
  usage: usage
86
95
  };
87
96
  }
88
97
  catch (error) {
89
- // If the newer ai.models.generateContent syntax is not available in the installed SDK version,
90
- // fallback to the widely supported getGenerativeModel method while keeping logic consistent.
91
- if (options.verbose)
92
- console.warn("[SSTLibrary] Newer syntax failed, trying fallback...");
93
- try {
94
- const model = ai.getGenerativeModel({ model: modelName }, config);
95
- const result = await model.generateContent({
96
- contents: [{
97
- role: 'user',
98
- parts: [
99
- { text: promptText },
100
- { inlineData: { mimeType, data: audioData } }
101
- ]
102
- }]
103
- });
104
- const endTime = Date.now();
105
- const processingTimeSec = parseFloat(((endTime - startTime) / 1000).toFixed(2));
106
- const resp = result.response;
107
- const candidate = resp.candidates?.[0];
108
- const parts = candidate?.content?.parts || [];
109
- return {
110
- text: parts.filter((p) => !p.thought).map((p) => p.text).join(''),
111
- thoughts: parts.filter((p) => p.thought).map((p) => p.text).join(''),
112
- model: modelName,
113
- usage: resp.usageMetadata ? {
114
- inputTokens: resp.usageMetadata.promptTokenCount,
115
- outputTokens: resp.usageMetadata.candidatesTokenCount,
116
- totalTokens: resp.usageMetadata.totalTokenCount,
117
- processingTimeSec: processingTimeSec
118
- } : undefined
119
- };
120
- }
121
- catch (fallbackError) {
122
- console.error("[SSTLibrary] Transcription failed:", fallbackError);
123
- throw fallbackError;
98
+ if (options.verbose) {
99
+ console.error("[geminisst] API Call failed:", error);
124
100
  }
101
+ // Re-throw with clear message
102
+ throw new Error(`[geminisst] ${error.message || "Unknown API Error"}`);
125
103
  }
126
104
  }
package/dist/index.d.ts CHANGED
@@ -1,9 +1,10 @@
1
1
  import { SSTOptions, TranscriptionResult } from './types.js';
2
2
  /**
3
- * Node.js entry point
4
- * @param audioFile Path to the audio file
5
- * @param apiKey Google Gemini API Key
6
- * @param options Configuration options
3
+ * Node.js entry point for geminisst
4
+ *
5
+ * @param audioFile - Path to the audio file
6
+ * @param apiKey - Google Gemini API Key
7
+ * @param options - Configuration options
7
8
  * @returns The transcription result object containing text and thoughts
8
9
  */
9
10
  export declare function audioToText(audioFile: string, apiKey: string, options?: SSTOptions): Promise<TranscriptionResult>;
package/dist/index.js CHANGED
@@ -1,39 +1,41 @@
1
- import { processAudioWithGemini } from './core.js';
2
1
  import * as fs from 'fs';
3
2
  import * as path from 'path';
3
+ import { processAudioWithGemini } from './core.js';
4
4
  import { bufferToBase64 } from './utils.js';
5
5
  /**
6
- * Node.js entry point
7
- * @param audioFile Path to the audio file
8
- * @param apiKey Google Gemini API Key
9
- * @param options Configuration options
6
+ * Node.js entry point for geminisst
7
+ *
8
+ * @param audioFile - Path to the audio file
9
+ * @param apiKey - Google Gemini API Key
10
+ * @param options - Configuration options
10
11
  * @returns The transcription result object containing text and thoughts
11
12
  */
12
13
  export async function audioToText(audioFile, apiKey, options = {}) {
13
14
  // 1. Validate Audio File Path
14
15
  if (!fs.existsSync(audioFile)) {
15
- throw new Error(`[geminisst] Audio file not found at path: ${audioFile}`);
16
+ throw new Error(`[geminisst] Audio file not found: ${audioFile}`);
16
17
  }
17
18
  const stats = fs.statSync(audioFile);
18
19
  if (stats.isDirectory()) {
19
20
  throw new Error(`[geminisst] Expected a file path but found a directory: ${audioFile}`);
20
21
  }
21
- // Simple mime type detection based on extension
22
+ // 2. Detect Mime Type
22
23
  const ext = path.extname(audioFile).toLowerCase().replace('.', '');
23
- // Default map
24
24
  const mimeMap = {
25
25
  'mp3': 'audio/mp3',
26
+ 'mpeg': 'audio/mpeg',
26
27
  'wav': 'audio/wav',
27
28
  'ogg': 'audio/ogg',
28
29
  'flac': 'audio/flac',
29
30
  'aac': 'audio/aac',
30
- 'm4a': 'audio/m4a', // often parsed as mp4/aac
31
+ 'm4a': 'audio/m4a',
31
32
  'mp4': 'audio/mp4'
32
33
  };
33
- const mimeType = mimeMap[ext] || 'audio/mp3'; // Default to mp3 if unknown
34
+ const mimeType = mimeMap[ext] || 'audio/mp3';
35
+ // 3. Read File and Convert to Base64
34
36
  const fileBuffer = fs.readFileSync(audioFile);
35
37
  const base64Audio = bufferToBase64(fileBuffer);
36
- // 2. Process
38
+ // 4. Process with Gemini Core
37
39
  return await processAudioWithGemini(base64Audio, mimeType, apiKey, options);
38
40
  }
39
41
  export * from './types.js';
package/dist/types.d.ts CHANGED
@@ -1,26 +1,37 @@
1
1
  /**
2
- * Common types for the SST Library
2
+ * geminisst Type Definitions
3
3
  */
4
- export type AudioInput = string | File;
5
4
  export interface SSTOptions {
6
5
  /**
7
- * The text prompt to guide the audio processing (e.g., "Transcribe in Hindi", "In English letters").
6
+ * Optional guidance for the model (e.g. "Transcribe in Hindi", "English transcript only").
8
7
  */
9
8
  prompt?: string;
10
9
  /**
11
- * Model to use. Defaults to "gemini-2.5-flash-lite".
10
+ * The Gemini model version. Defaults to "gemini-2.5-flash-lite".
12
11
  */
13
12
  model?: string;
14
13
  /**
15
- * Verbose logging.
14
+ * Enable internal logging for debugging.
16
15
  */
17
16
  verbose?: boolean;
18
17
  }
19
18
  export interface TranscriptionResult {
19
+ /**
20
+ * The verbatim transcribed text.
21
+ */
20
22
  text: string;
23
+ /**
24
+ * The AI's internal reasoning (Thought Summary).
25
+ */
21
26
  thoughts?: string;
27
+ /**
28
+ * The model used for the request.
29
+ */
22
30
  model: string;
23
- usage?: {
31
+ /**
32
+ * Token usage and performance metadata.
33
+ */
34
+ usage: {
24
35
  inputTokens: number;
25
36
  outputTokens: number;
26
37
  totalTokens: number;
package/dist/types.js CHANGED
@@ -1,4 +1,4 @@
1
1
  /**
2
- * Common types for the SST Library
2
+ * geminisst Type Definitions
3
3
  */
4
4
  export {};
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "geminisst",
3
- "version": "1.0.1",
3
+ "version": "1.0.3",
4
4
  "description": "Revolutionary high-accuracy Audio-to-Text library powered by Gemini 2.5 Flash Lite with 1M+ context window.",
5
5
  "keywords": [
6
6
  "sst",