npm - geminisst - Versions diffs - 1.0.2 → 1.0.3 - Mend

geminisst 1.0.2 → 1.0.3

This diff shows the differences between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.

Files changed (7)

package/dist/core.d.ts CHANGED Viewed

@@ -1,5 +1,11 @@
 import { SSTOptions, TranscriptionResult } from './types.js';
 /**
  * Processes audio using the Gemini API.
+ *
+ * @param audioData - Base64 encoded audio string
+ * @param mimeType - MIME type of the audio
+ * @param apiKey - Google Gemini API Key
+ * @param options - Configuration options
+ * @returns Promise resolving to the transcription result
  */
 export declare function processAudioWithGemini(audioData: string, mimeType: string, apiKey: string, options: SSTOptions): Promise<TranscriptionResult>;

package/dist/core.js CHANGED Viewed

@@ -1,34 +1,48 @@
 /**
- * Core business logic using Google GenAI SDK
+ * Core business logic using the modern @google/genai SDK
+ * strictly following the provided multimodal and thinking documentation.
  */
 import { GoogleGenAI } from '@google/genai';
 import { DEFAULT_SYSTEM_INSTRUCTION } from './constants.js';
 /**
  * Processes audio using the Gemini API.
+ *
+ * @param audioData - Base64 encoded audio string
+ * @param mimeType - MIME type of the audio
+ * @param apiKey - Google Gemini API Key
+ * @param options - Configuration options
+ * @returns Promise resolving to the transcription result
  */
 export async function processAudioWithGemini(audioData, mimeType, apiKey, options) {
     if (!apiKey) {
-        throw new Error("[geminisst] API Key is required");
+        throw new Error("[geminisst] API Key is required.");
     }
-    // Initialize the AI client
-    const ai = new GoogleGenAI({ apiKey: apiKey });
+    // Initialize client as per documentation: new GoogleGenAI({ apiKey })
+    const ai = new GoogleGenAI({ apiKey });
     const modelName = options.model || "gemini-2.5-flash-lite";
     const startTime = Date.now();
-    // Configure as per latest Gemini 2.5 specifications
+    /**
+     * Configuration strictly following the Gemini 2.5 series docs:
+     * - thinkingBudget: -1 enables Dynamic Thinking.
+     * - includeThoughts: true allows capturing reasoning parts.
+     */
     const config = {
+        systemInstruction: DEFAULT_SYSTEM_INSTRUCTION,
         thinkingConfig: {
             includeThoughts: true,
-            thinkingBudget: -1
+            thinkingBudget: -1,
         },
-        systemInstruction: DEFAULT_SYSTEM_INSTRUCTION
     };
     if (options.verbose) {
-        console.log(`[geminisst] Model: ${modelName}`);
-        console.log(`[geminisst] Thinking: Enabled (Dynamic)`);
+        console.log(`[geminisst] Initializing ${modelName}...`);
+        console.log(`[geminisst] Dynamic Thinking: Enabled`);
     }
-    const promptText = options.prompt || "Transcribe this audio.";
+    const promptText = options.prompt || "Transcribe this audio exactly.";
     try {
-        // Standard call using the models.generateContent API
+        /**
+         * Multimodal generation using inlineData as per docs:
+         * ai.models.generateContent({ model, contents, config })
+         */
         const response = await ai.models.generateContent({
             model: modelName,
             contents: [
@@ -49,32 +63,42 @@ export async function processAudioWithGemini(audioData, mimeType, apiKey, option
         });
         const endTime = Date.now();
         const processingTimeSec = parseFloat(((endTime - startTime) / 1000).toFixed(2));
+        // response.candidates[0].content.parts handling
         const candidate = response.candidates?.[0];
-        const textParts = candidate?.content?.parts || [];
-        // Separate transcript and thoughts
-        const transcriptText = textParts
-            .filter((p) => !p.thought)
-            .map((p) => p.text)
-            .join('') || "";
-        const thoughtText = textParts
-            .filter((p) => p.thought)
-            .map((p) => p.text)
-            .join('') || "";
+        const parts = candidate?.content?.parts || [];
+        // Separate actual transcript from reasoning thoughts
+        let transcriptText = "";
+        let thoughtText = "";
+        for (const part of parts) {
+            if (part.thought) {
+                thoughtText += part.text || "";
+            }
+            else {
+                transcriptText += part.text || "";
+            }
+        }
+        /**
+         * Usage metadata handling as per documentation:
+         * response.usageMetadata.promptTokenCount etc.
+         */
         const usage = response.usageMetadata ? {
             inputTokens: response.usageMetadata.promptTokenCount || 0,
             outputTokens: response.usageMetadata.candidatesTokenCount || 0,
             totalTokens: response.usageMetadata.totalTokenCount || 0,
             processingTimeSec: processingTimeSec
-        } : undefined;
+        } : { processingTimeSec };
         return {
-            text: transcriptText,
-            thoughts: thoughtText,
+            text: transcriptText.trim(),
+            thoughts: thoughtText.trim(),
             model: modelName,
             usage: usage
         };
     }
     catch (error) {
-        console.error("[geminisst] Error calling Gemini API:", error.message);
-        throw error;
+        if (options.verbose) {
+            console.error("[geminisst] API Call failed:", error);
+        }
+        // Re-throw with clear message
+        throw new Error(`[geminisst] ${error.message || "Unknown API Error"}`);
     }
 }

package/dist/index.d.ts CHANGED Viewed

@@ -1,9 +1,10 @@
 import { SSTOptions, TranscriptionResult } from './types.js';
 /**
- * Node.js entry point
- * @param audioFile Path to the audio file
- * @param apiKey Google Gemini API Key
- * @param options Configuration options
+ * Node.js entry point for geminisst
+ *
+ * @param audioFile - Path to the audio file
+ * @param apiKey - Google Gemini API Key
+ * @param options - Configuration options
  * @returns The transcription result object containing text and thoughts
  */
 export declare function audioToText(audioFile: string, apiKey: string, options?: SSTOptions): Promise<TranscriptionResult>;

package/dist/index.js CHANGED Viewed

@@ -1,39 +1,41 @@
-import { processAudioWithGemini } from './core.js';
 import * as fs from 'fs';
 import * as path from 'path';
+import { processAudioWithGemini } from './core.js';
 import { bufferToBase64 } from './utils.js';
 /**
- * Node.js entry point
- * @param audioFile Path to the audio file
- * @param apiKey Google Gemini API Key
- * @param options Configuration options
+ * Node.js entry point for geminisst
+ *
+ * @param audioFile - Path to the audio file
+ * @param apiKey - Google Gemini API Key
+ * @param options - Configuration options
  * @returns The transcription result object containing text and thoughts
  */
 export async function audioToText(audioFile, apiKey, options = {}) {
     // 1. Validate Audio File Path
     if (!fs.existsSync(audioFile)) {
-        throw new Error(`[geminisst] Audio file not found at path: ${audioFile}`);
+        throw new Error(`[geminisst] Audio file not found: ${audioFile}`);
     }
     const stats = fs.statSync(audioFile);
     if (stats.isDirectory()) {
         throw new Error(`[geminisst] Expected a file path but found a directory: ${audioFile}`);
     }
-    // Simple mime type detection based on extension
+    // 2. Detect Mime Type
     const ext = path.extname(audioFile).toLowerCase().replace('.', '');
-    // Default map
     const mimeMap = {
         'mp3': 'audio/mp3',
+        'mpeg': 'audio/mpeg',
         'wav': 'audio/wav',
         'ogg': 'audio/ogg',
         'flac': 'audio/flac',
         'aac': 'audio/aac',
-        'm4a': 'audio/m4a', // often parsed as mp4/aac
+        'm4a': 'audio/m4a',
         'mp4': 'audio/mp4'
     };
-    const mimeType = mimeMap[ext] || 'audio/mp3'; // Default to mp3 if unknown
+    const mimeType = mimeMap[ext] || 'audio/mp3';
+    // 3. Read File and Convert to Base64
     const fileBuffer = fs.readFileSync(audioFile);
     const base64Audio = bufferToBase64(fileBuffer);
-    // 2. Process
+    // 4. Process with Gemini Core
     return await processAudioWithGemini(base64Audio, mimeType, apiKey, options);
 }
 export * from './types.js';

package/dist/types.d.ts CHANGED Viewed

@@ -1,26 +1,37 @@
 /**
- * Common types for the SST Library
+ * geminisst Type Definitions
  */
-export type AudioInput = string | File;
 export interface SSTOptions {
     /**
-     * The text prompt to guide the audio processing (e.g., "Transcribe in Hindi", "In English letters").
+     * Optional guidance for the model (e.g. "Transcribe in Hindi", "English transcript only").
      */
     prompt?: string;
     /**
-     * Model to use. Defaults to "gemini-2.5-flash-lite".
+     * The Gemini model version. Defaults to "gemini-2.5-flash-lite".
      */
     model?: string;
     /**
-     * Verbose logging.
+     * Enable internal logging for debugging.
      */
     verbose?: boolean;
 }
 export interface TranscriptionResult {
+    /**
+     * The verbatim transcribed text.
+     */
     text: string;
+    /**
+     * The AI's internal reasoning (Thought Summary).
+     */
     thoughts?: string;
+    /**
+     * The model used for the request.
+     */
     model: string;
-    usage?: {
+    /**
+     * Token usage and performance metadata.
+     */
+    usage: {
         inputTokens: number;
         outputTokens: number;
         totalTokens: number;

package/dist/types.js CHANGED Viewed

@@ -1,4 +1,4 @@
 /**
- * Common types for the SST Library
+ * geminisst Type Definitions
  */
 export {};

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "geminisst",
-  "version": "1.0.2",
+  "version": "1.0.3",
   "description": "Revolutionary high-accuracy Audio-to-Text library powered by Gemini 2.5 Flash Lite with 1M+ context window.",
   "keywords": [
     "sst",