npm - geminisst - Version diff - 1.0.1 → 1.0.3 - Mend

geminisst 1.0.1 → 1.0.3

This diff shows the changes between publicly available versions of this package as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (7) hide show

package/dist/core.d.ts CHANGED Viewed

@@ -1,6 +1,7 @@
 import { SSTOptions, TranscriptionResult } from './types.js';
 /**
  * Processes audio using the Gemini API.
+ *
  * @param audioData - Base64 encoded audio string
  * @param mimeType - MIME type of the audio
  * @param apiKey - Google Gemini API Key

package/dist/core.js CHANGED Viewed

@@ -1,10 +1,12 @@
 /**
- * Core business logic using Google GenAI SDK
+ * Core business logic using the modern @google/genai SDK
+ * strictly following the provided multimodal and thinking documentation.
  */
 import { GoogleGenAI } from '@google/genai';
 import { DEFAULT_SYSTEM_INSTRUCTION } from './constants.js';
 /**
  * Processes audio using the Gemini API.
+ *
  * @param audioData - Base64 encoded audio string
  * @param mimeType - MIME type of the audio
  * @param apiKey - Google Gemini API Key
@@ -13,30 +15,32 @@ import { DEFAULT_SYSTEM_INSTRUCTION } from './constants.js';
  */
 export async function processAudioWithGemini(audioData, mimeType, apiKey, options) {
     if (!apiKey) {
-        throw new Error("API Key is required");
+        throw new Error("[geminisst] API Key is required.");
     }
-    // Initialize the AI client according to documentation: new GoogleGenAI({ apiKey })
-    const ai = new GoogleGenAI({ apiKey: apiKey });
+    // Initialize client as per documentation: new GoogleGenAI({ apiKey })
+    const ai = new GoogleGenAI({ apiKey });
     const modelName = options.model || "gemini-2.5-flash-lite";
-    // Configure thinking mode as per Gemini 2.5 specifications in documentation
+    const startTime = Date.now();
+    /**
+     * Configuration strictly following the Gemini 2.5 series docs:
+     * - thinkingBudget: -1 enables Dynamic Thinking.
+     * - includeThoughts: true allows capturing reasoning parts.
+     */
     const config = {
+        systemInstruction: DEFAULT_SYSTEM_INSTRUCTION,
         thinkingConfig: {
-            includeThoughts: true, // Enabled to allow monitoring thoughts if needed
-            thinkingBudget: -1 // Dynamic thinking enabled (-1)
+            includeThoughts: true,
+            thinkingBudget: -1,
         },
-        // Fixed System Instruction: Users cannot override this as it is the core STT logic.
-        systemInstruction: DEFAULT_SYSTEM_INSTRUCTION
     };
     if (options.verbose) {
-        console.log(`[SSTLibrary] Model: ${modelName}`);
-        console.log(`[SSTLibrary] Thinking: Dynamic (-1)`);
-        console.log(`[SSTLibrary] System Instruction: Locked (Core)`);
+        console.log(`[geminisst] Initializing ${modelName}...`);
+        console.log(`[geminisst] Dynamic Thinking: Enabled`);
     }
-    const promptText = options.prompt || "Transcribe this audio.";
-    const startTime = Date.now();
+    const promptText = options.prompt || "Transcribe this audio exactly.";
     try {
         /**
-         * Using the syntax from the provided documentation:
+         * Multimodal generation using inlineData as per docs:
          * ai.models.generateContent({ model, contents, config })
          */
         const response = await ai.models.generateContent({
@@ -59,68 +63,42 @@ export async function processAudioWithGemini(audioData, mimeType, apiKey, option
         });
         const endTime = Date.now();
         const processingTimeSec = parseFloat(((endTime - startTime) / 1000).toFixed(2));
-        // Handle the response according to the documentation structure
+        // response.candidates[0].content.parts handling
         const candidate = response.candidates?.[0];
-        const textParts = candidate?.content?.parts || [];
-        // Combine text parts and thought parts separately
-        const transcriptText = textParts
-            .filter((p) => !p.thought)
-            .map((p) => p.text)
-            .join('') || "";
-        const thoughtText = textParts
-            .filter((p) => p.thought)
-            .map((p) => p.text)
-            .join('') || "";
-        // Extract usage details
+        const parts = candidate?.content?.parts || [];
+        // Separate actual transcript from reasoning thoughts
+        let transcriptText = "";
+        let thoughtText = "";
+        for (const part of parts) {
+            if (part.thought) {
+                thoughtText += part.text || "";
+            }
+            else {
+                transcriptText += part.text || "";
+            }
+        }
+        /**
+         * Usage metadata handling as per documentation:
+         * response.usageMetadata.promptTokenCount etc.
+         */
         const usage = response.usageMetadata ? {
             inputTokens: response.usageMetadata.promptTokenCount || 0,
             outputTokens: response.usageMetadata.candidatesTokenCount || 0,
             totalTokens: response.usageMetadata.totalTokenCount || 0,
             processingTimeSec: processingTimeSec
-        } : undefined;
+        } : { processingTimeSec };
         return {
-            text: transcriptText,
-            thoughts: thoughtText,
+            text: transcriptText.trim(),
+            thoughts: thoughtText.trim(),
             model: modelName,
             usage: usage
         };
     }
     catch (error) {
-        // If the newer ai.models.generateContent syntax is not available in the installed SDK version,
-        // fallback to the widely supported getGenerativeModel method while keeping logic consistent.
-        if (options.verbose)
-            console.warn("[SSTLibrary] Newer syntax failed, trying fallback...");
-        try {
-            const model = ai.getGenerativeModel({ model: modelName }, config);
-            const result = await model.generateContent({
-                contents: [{
-                        role: 'user',
-                        parts: [
-                            { text: promptText },
-                            { inlineData: { mimeType, data: audioData } }
-                        ]
-                    }]
-            });
-            const endTime = Date.now();
-            const processingTimeSec = parseFloat(((endTime - startTime) / 1000).toFixed(2));
-            const resp = result.response;
-            const candidate = resp.candidates?.[0];
-            const parts = candidate?.content?.parts || [];
-            return {
-                text: parts.filter((p) => !p.thought).map((p) => p.text).join(''),
-                thoughts: parts.filter((p) => p.thought).map((p) => p.text).join(''),
-                model: modelName,
-                usage: resp.usageMetadata ? {
-                    inputTokens: resp.usageMetadata.promptTokenCount,
-                    outputTokens: resp.usageMetadata.candidatesTokenCount,
-                    totalTokens: resp.usageMetadata.totalTokenCount,
-                    processingTimeSec: processingTimeSec
-                } : undefined
-            };
-        }
-        catch (fallbackError) {
-            console.error("[SSTLibrary] Transcription failed:", fallbackError);
-            throw fallbackError;
+        if (options.verbose) {
+            console.error("[geminisst] API Call failed:", error);
         }
+        // Re-throw with clear message
+        throw new Error(`[geminisst] ${error.message || "Unknown API Error"}`);
     }
 }

package/dist/index.d.ts CHANGED Viewed

@@ -1,9 +1,10 @@
 import { SSTOptions, TranscriptionResult } from './types.js';
 /**
- * Node.js entry point
- * @param audioFile Path to the audio file
- * @param apiKey Google Gemini API Key
- * @param options Configuration options
+ * Node.js entry point for geminisst
+ *
+ * @param audioFile - Path to the audio file
+ * @param apiKey - Google Gemini API Key
+ * @param options - Configuration options
  * @returns The transcription result object containing text and thoughts
  */
 export declare function audioToText(audioFile: string, apiKey: string, options?: SSTOptions): Promise<TranscriptionResult>;

package/dist/index.js CHANGED Viewed

@@ -1,39 +1,41 @@
-import { processAudioWithGemini } from './core.js';
 import * as fs from 'fs';
 import * as path from 'path';
+import { processAudioWithGemini } from './core.js';
 import { bufferToBase64 } from './utils.js';
 /**
- * Node.js entry point
- * @param audioFile Path to the audio file
- * @param apiKey Google Gemini API Key
- * @param options Configuration options
+ * Node.js entry point for geminisst
+ *
+ * @param audioFile - Path to the audio file
+ * @param apiKey - Google Gemini API Key
+ * @param options - Configuration options
  * @returns The transcription result object containing text and thoughts
  */
 export async function audioToText(audioFile, apiKey, options = {}) {
     // 1. Validate Audio File Path
     if (!fs.existsSync(audioFile)) {
-        throw new Error(`[geminisst] Audio file not found at path: ${audioFile}`);
+        throw new Error(`[geminisst] Audio file not found: ${audioFile}`);
     }
     const stats = fs.statSync(audioFile);
     if (stats.isDirectory()) {
         throw new Error(`[geminisst] Expected a file path but found a directory: ${audioFile}`);
     }
-    // Simple mime type detection based on extension
+    // 2. Detect Mime Type
     const ext = path.extname(audioFile).toLowerCase().replace('.', '');
-    // Default map
     const mimeMap = {
         'mp3': 'audio/mp3',
+        'mpeg': 'audio/mpeg',
         'wav': 'audio/wav',
         'ogg': 'audio/ogg',
         'flac': 'audio/flac',
         'aac': 'audio/aac',
-        'm4a': 'audio/m4a', // often parsed as mp4/aac
+        'm4a': 'audio/m4a',
         'mp4': 'audio/mp4'
     };
-    const mimeType = mimeMap[ext] || 'audio/mp3'; // Default to mp3 if unknown
+    const mimeType = mimeMap[ext] || 'audio/mp3';
+    // 3. Read File and Convert to Base64
     const fileBuffer = fs.readFileSync(audioFile);
     const base64Audio = bufferToBase64(fileBuffer);
-    // 2. Process
+    // 4. Process with Gemini Core
     return await processAudioWithGemini(base64Audio, mimeType, apiKey, options);
 }
 export * from './types.js';

package/dist/types.d.ts CHANGED Viewed

@@ -1,26 +1,37 @@
 /**
- * Common types for the SST Library
+ * geminisst Type Definitions
  */
-export type AudioInput = string | File;
 export interface SSTOptions {
     /**
-     * The text prompt to guide the audio processing (e.g., "Transcribe in Hindi", "In English letters").
+     * Optional guidance for the model (e.g. "Transcribe in Hindi", "English transcript only").
      */
     prompt?: string;
     /**
-     * Model to use. Defaults to "gemini-2.5-flash-lite".
+     * The Gemini model version. Defaults to "gemini-2.5-flash-lite".
      */
     model?: string;
     /**
-     * Verbose logging.
+     * Enable internal logging for debugging.
      */
     verbose?: boolean;
 }
 export interface TranscriptionResult {
+    /**
+     * The verbatim transcribed text.
+     */
     text: string;
+    /**
+     * The AI's internal reasoning (Thought Summary).
+     */
     thoughts?: string;
+    /**
+     * The model used for the request.
+     */
     model: string;
-    usage?: {
+    /**
+     * Token usage and performance metadata.
+     */
+    usage: {
         inputTokens: number;
         outputTokens: number;
         totalTokens: number;

package/dist/types.js CHANGED Viewed

@@ -1,4 +1,4 @@
 /**
- * Common types for the SST Library
+ * geminisst Type Definitions
  */
 export {};

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "geminisst",
-  "version": "1.0.1",
+  "version": "1.0.3",
   "description": "Revolutionary high-accuracy Audio-to-Text library powered by Gemini 2.5 Flash Lite with 1M+ context window.",
   "keywords": [
     "sst",