geminisst 1.0.2 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/core.d.ts CHANGED
@@ -1,5 +1,11 @@
1
1
  import { SSTOptions, TranscriptionResult } from './types.js';
2
2
  /**
3
3
  * Processes audio using the Gemini API.
4
+ *
5
+ * @param audioData - Base64 encoded audio string
6
+ * @param mimeType - MIME type of the audio
7
+ * @param apiKey - Google Gemini API Key
8
+ * @param options - Configuration options
9
+ * @returns Promise resolving to the transcription result
4
10
  */
5
11
  export declare function processAudioWithGemini(audioData: string, mimeType: string, apiKey: string, options: SSTOptions): Promise<TranscriptionResult>;
package/dist/core.js CHANGED
@@ -1,34 +1,48 @@
1
1
  /**
2
- * Core business logic using Google GenAI SDK
2
+ * Core business logic using the modern @google/genai SDK
3
+ * strictly following the provided multimodal and thinking documentation.
3
4
  */
4
5
  import { GoogleGenAI } from '@google/genai';
5
6
  import { DEFAULT_SYSTEM_INSTRUCTION } from './constants.js';
6
7
  /**
7
8
  * Processes audio using the Gemini API.
9
+ *
10
+ * @param audioData - Base64 encoded audio string
11
+ * @param mimeType - MIME type of the audio
12
+ * @param apiKey - Google Gemini API Key
13
+ * @param options - Configuration options
14
+ * @returns Promise resolving to the transcription result
8
15
  */
9
16
  export async function processAudioWithGemini(audioData, mimeType, apiKey, options) {
10
17
  if (!apiKey) {
11
- throw new Error("[geminisst] API Key is required");
18
+ throw new Error("[geminisst] API Key is required.");
12
19
  }
13
- // Initialize the AI client
14
- const ai = new GoogleGenAI({ apiKey: apiKey });
20
+ // Initialize client as per documentation: new GoogleGenAI({ apiKey })
21
+ const ai = new GoogleGenAI({ apiKey });
15
22
  const modelName = options.model || "gemini-2.5-flash-lite";
16
23
  const startTime = Date.now();
17
- // Configure as per latest Gemini 2.5 specifications
24
+ /**
25
+ * Configuration strictly following the Gemini 2.5 series docs:
26
+ * - thinkingBudget: -1 enables Dynamic Thinking.
27
+ * - includeThoughts: true allows capturing reasoning parts.
28
+ */
18
29
  const config = {
30
+ systemInstruction: DEFAULT_SYSTEM_INSTRUCTION,
19
31
  thinkingConfig: {
20
32
  includeThoughts: true,
21
- thinkingBudget: -1
33
+ thinkingBudget: -1,
22
34
  },
23
- systemInstruction: DEFAULT_SYSTEM_INSTRUCTION
24
35
  };
25
36
  if (options.verbose) {
26
- console.log(`[geminisst] Model: ${modelName}`);
27
- console.log(`[geminisst] Thinking: Enabled (Dynamic)`);
37
+ console.log(`[geminisst] Initializing ${modelName}...`);
38
+ console.log(`[geminisst] Dynamic Thinking: Enabled`);
28
39
  }
29
- const promptText = options.prompt || "Transcribe this audio.";
40
+ const promptText = options.prompt || "Transcribe this audio exactly.";
30
41
  try {
31
- // Standard call using the models.generateContent API
42
+ /**
43
+ * Multimodal generation using inlineData as per docs:
44
+ * ai.models.generateContent({ model, contents, config })
45
+ */
32
46
  const response = await ai.models.generateContent({
33
47
  model: modelName,
34
48
  contents: [
@@ -49,32 +63,42 @@ export async function processAudioWithGemini(audioData, mimeType, apiKey, option
49
63
  });
50
64
  const endTime = Date.now();
51
65
  const processingTimeSec = parseFloat(((endTime - startTime) / 1000).toFixed(2));
66
+ // response.candidates[0].content.parts handling
52
67
  const candidate = response.candidates?.[0];
53
- const textParts = candidate?.content?.parts || [];
54
- // Separate transcript and thoughts
55
- const transcriptText = textParts
56
- .filter((p) => !p.thought)
57
- .map((p) => p.text)
58
- .join('') || "";
59
- const thoughtText = textParts
60
- .filter((p) => p.thought)
61
- .map((p) => p.text)
62
- .join('') || "";
68
+ const parts = candidate?.content?.parts || [];
69
+ // Separate actual transcript from reasoning thoughts
70
+ let transcriptText = "";
71
+ let thoughtText = "";
72
+ for (const part of parts) {
73
+ if (part.thought) {
74
+ thoughtText += part.text || "";
75
+ }
76
+ else {
77
+ transcriptText += part.text || "";
78
+ }
79
+ }
80
+ /**
81
+ * Usage metadata handling as per documentation:
82
+ * response.usageMetadata.promptTokenCount etc.
83
+ */
63
84
  const usage = response.usageMetadata ? {
64
85
  inputTokens: response.usageMetadata.promptTokenCount || 0,
65
86
  outputTokens: response.usageMetadata.candidatesTokenCount || 0,
66
87
  totalTokens: response.usageMetadata.totalTokenCount || 0,
67
88
  processingTimeSec: processingTimeSec
68
- } : undefined;
89
+ } : { processingTimeSec };
69
90
  return {
70
- text: transcriptText,
71
- thoughts: thoughtText,
91
+ text: transcriptText.trim(),
92
+ thoughts: thoughtText.trim(),
72
93
  model: modelName,
73
94
  usage: usage
74
95
  };
75
96
  }
76
97
  catch (error) {
77
- console.error("[geminisst] Error calling Gemini API:", error.message);
78
- throw error;
98
+ if (options.verbose) {
99
+ console.error("[geminisst] API Call failed:", error);
100
+ }
101
+ // Re-throw with clear message
102
+ throw new Error(`[geminisst] ${error.message || "Unknown API Error"}`);
79
103
  }
80
104
  }
package/dist/index.d.ts CHANGED
@@ -1,9 +1,10 @@
1
1
  import { SSTOptions, TranscriptionResult } from './types.js';
2
2
  /**
3
- * Node.js entry point
4
- * @param audioFile Path to the audio file
5
- * @param apiKey Google Gemini API Key
6
- * @param options Configuration options
3
+ * Node.js entry point for geminisst
4
+ *
5
+ * @param audioFile - Path to the audio file
6
+ * @param apiKey - Google Gemini API Key
7
+ * @param options - Configuration options
7
8
  * @returns The transcription result object containing text and thoughts
8
9
  */
9
10
  export declare function audioToText(audioFile: string, apiKey: string, options?: SSTOptions): Promise<TranscriptionResult>;
package/dist/index.js CHANGED
@@ -1,39 +1,41 @@
1
- import { processAudioWithGemini } from './core.js';
2
1
  import * as fs from 'fs';
3
2
  import * as path from 'path';
3
+ import { processAudioWithGemini } from './core.js';
4
4
  import { bufferToBase64 } from './utils.js';
5
5
  /**
6
- * Node.js entry point
7
- * @param audioFile Path to the audio file
8
- * @param apiKey Google Gemini API Key
9
- * @param options Configuration options
6
+ * Node.js entry point for geminisst
7
+ *
8
+ * @param audioFile - Path to the audio file
9
+ * @param apiKey - Google Gemini API Key
10
+ * @param options - Configuration options
10
11
  * @returns The transcription result object containing text and thoughts
11
12
  */
12
13
  export async function audioToText(audioFile, apiKey, options = {}) {
13
14
  // 1. Validate Audio File Path
14
15
  if (!fs.existsSync(audioFile)) {
15
- throw new Error(`[geminisst] Audio file not found at path: ${audioFile}`);
16
+ throw new Error(`[geminisst] Audio file not found: ${audioFile}`);
16
17
  }
17
18
  const stats = fs.statSync(audioFile);
18
19
  if (stats.isDirectory()) {
19
20
  throw new Error(`[geminisst] Expected a file path but found a directory: ${audioFile}`);
20
21
  }
21
- // Simple mime type detection based on extension
22
+ // 2. Detect Mime Type
22
23
  const ext = path.extname(audioFile).toLowerCase().replace('.', '');
23
- // Default map
24
24
  const mimeMap = {
25
25
  'mp3': 'audio/mp3',
26
+ 'mpeg': 'audio/mpeg',
26
27
  'wav': 'audio/wav',
27
28
  'ogg': 'audio/ogg',
28
29
  'flac': 'audio/flac',
29
30
  'aac': 'audio/aac',
30
- 'm4a': 'audio/m4a', // often parsed as mp4/aac
31
+ 'm4a': 'audio/m4a',
31
32
  'mp4': 'audio/mp4'
32
33
  };
33
- const mimeType = mimeMap[ext] || 'audio/mp3'; // Default to mp3 if unknown
34
+ const mimeType = mimeMap[ext] || 'audio/mp3';
35
+ // 3. Read File and Convert to Base64
34
36
  const fileBuffer = fs.readFileSync(audioFile);
35
37
  const base64Audio = bufferToBase64(fileBuffer);
36
- // 2. Process
38
+ // 4. Process with Gemini Core
37
39
  return await processAudioWithGemini(base64Audio, mimeType, apiKey, options);
38
40
  }
39
41
  export * from './types.js';
package/dist/types.d.ts CHANGED
@@ -1,26 +1,37 @@
1
1
  /**
2
- * Common types for the SST Library
2
+ * geminisst Type Definitions
3
3
  */
4
- export type AudioInput = string | File;
5
4
  export interface SSTOptions {
6
5
  /**
7
- * The text prompt to guide the audio processing (e.g., "Transcribe in Hindi", "In English letters").
6
+ * Optional guidance for the model (e.g. "Transcribe in Hindi", "English transcript only").
8
7
  */
9
8
  prompt?: string;
10
9
  /**
11
- * Model to use. Defaults to "gemini-2.5-flash-lite".
10
+ * The Gemini model version. Defaults to "gemini-2.5-flash-lite".
12
11
  */
13
12
  model?: string;
14
13
  /**
15
- * Verbose logging.
14
+ * Enable internal logging for debugging.
16
15
  */
17
16
  verbose?: boolean;
18
17
  }
19
18
  export interface TranscriptionResult {
19
+ /**
20
+ * The verbatim transcribed text.
21
+ */
20
22
  text: string;
23
+ /**
24
+ * The AI's internal reasoning (Thought Summary).
25
+ */
21
26
  thoughts?: string;
27
+ /**
28
+ * The model used for the request.
29
+ */
22
30
  model: string;
23
- usage?: {
31
+ /**
32
+ * Token usage and performance metadata.
33
+ */
34
+ usage: {
24
35
  inputTokens: number;
25
36
  outputTokens: number;
26
37
  totalTokens: number;
package/dist/types.js CHANGED
@@ -1,4 +1,4 @@
1
1
  /**
2
- * Common types for the SST Library
2
+ * geminisst Type Definitions
3
3
  */
4
4
  export {};
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "geminisst",
3
- "version": "1.0.2",
3
+ "version": "1.0.3",
4
4
  "description": "Revolutionary high-accuracy Audio-to-Text library powered by Gemini 2.5 Flash Lite with 1M+ context window.",
5
5
  "keywords": [
6
6
  "sst",