@pandi2352/gemini-ocr 2.0.0 → 4.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -112,6 +112,23 @@ console.log('Extracted Data:', result.entityResult);
112
112
  */
113
113
  ```
114
114
 
115
+
116
+
117
+ ### 6. Realtime Progress Feedback
118
+ Get granular updates on the processing stages.
119
+
120
+ ```typescript
121
+ await processOCR({
122
+ input: ['./large_document.pdf'],
123
+ apiKey: process.env.GEMINI_API_KEY,
124
+
125
+ onProgress: (stage, message) => {
126
+ // stage: 'init' | 'upload' | 'generate_text' | 'enrich' | 'complete'
127
+ console.log(`[${stage}]: ${message}`);
128
+ }
129
+ });
130
+ ```
131
+
115
132
  ---
116
133
 
117
134
  ## 🛠️ Configuration Options
@@ -120,11 +137,12 @@ console.log('Extracted Data:', result.entityResult);
120
137
  | :--- | :--- | :--- | :--- |
121
138
  | `input` | `Array<string \| Buffer \| Object>` | **Required** | Array of file paths, URLs, Buffers, or Base64 strings. |
122
139
  | `apiKey` | `string` | **Required** | Your Google Gemini API Key. |
123
- | `model` | `string` | `gemini-1.5-flash` | The AI model to use. |
140
+ | `model` | `string` | `gemini-2.5-flash` | The AI model (use `gemini-1.5-flash-8b` for speed). |
124
141
  | `summarize` | `boolean` | `false` | Generate `metadata` (title, desc, thumbnail). |
125
142
  | `mindmap` | `boolean` | `false` | Generate Mermaid.js syntax for visual mapping. |
126
143
  | `extractEntities`| `boolean` | `false` | Enable structured field extraction. |
127
144
  | `entitySchema` | `string[]` | `auto` | Custom fields to extract (optional). |
145
+ | `onProgress` | `(stage, message) => void` | `undefined` | Callback for realtime progress updates. |
128
146
 
129
147
  ---
130
148
 
package/dist/index.js CHANGED
@@ -52,15 +52,19 @@ async function processSingleFile(input, options) {
52
52
  const logger = new utils_1.Logger();
53
53
  const requestId = (0, utils_1.generateRequestId)();
54
54
  logger.log(`INIT: Processing file. RequestId: ${requestId}`);
55
+ if (options.onProgress)
56
+ options.onProgress('init', 'Initializing processing...');
55
57
  try {
56
58
  if (!options.apiKey)
57
59
  throw new Error('Gemini API key is required.');
58
60
  const gemini = new llm_1.GeminiClient(options.apiKey, logger);
59
- const modelName = options.model || 'gemini-1.5-flash';
61
+ const modelName = options.model || 'gemini-2.5-flash';
60
62
  // Input Processing
61
63
  const inputHandler = new input_handler_1.InputHandler(logger);
62
64
  const normalized = await inputHandler.processInput(input);
63
65
  const mimeType = normalized.mimeType;
66
+ if (options.onProgress)
67
+ options.onProgress('upload', 'Processing input file...');
64
68
  // Strategy
65
69
  let strategy = 'MEDIA';
66
70
  if (mimeType === 'text/plain' || mimeType === 'text/csv' || normalized.extension === 'txt' || normalized.extension === 'csv') {
@@ -85,6 +89,8 @@ async function processSingleFile(input, options) {
85
89
  let analysisText = '';
86
90
  let extractedTextDocx = '';
87
91
  let fileUri;
92
+ if (options.onProgress)
93
+ options.onProgress('generate_text', 'Generating analysis...');
88
94
  if (strategy === 'TEXT') {
89
95
  const content = normalized.data.toString('utf-8');
90
96
  finalPrompt += `\n\nDOCUMENT CONTENT:\n${content}`;
@@ -136,48 +142,58 @@ async function processSingleFile(input, options) {
136
142
  catch (e) { }
137
143
  }
138
144
  }
139
- // Mindmap
145
+ // Parallel Processing for Advanced Features
146
+ // Optimization Note: Future version can combine these into a single "Mega-Prompt" to reduce HTTP round-trips.
147
+ if (options.onProgress && (options.mindmap || options.extractEntities)) {
148
+ options.onProgress('enrich', 'Generating mindmap/entities...');
149
+ }
150
+ const tasks = [];
140
151
  let mindmap = null;
152
+ let entityResult = null;
141
153
  if (options.mindmap) {
142
154
  const enrichPrompt = `${prompts_1.ENRICHMENT_PROMPT}\n\nCONTEXT:\n${mainAnalysis}`;
143
- try {
144
- const enrichRes = await gemini.generateContent(modelName, enrichPrompt);
145
- const jsonPart = enrichRes.match(/\{[\s\S]*\}/);
146
- const mermaidPart = enrichRes.match(/```mermaid\n([\s\S]*?)\n```/);
147
- if (jsonPart) {
148
- const parsed = JSON.parse(jsonPart[0]);
149
- mindmap = parsed.mermaid || null;
150
- }
151
- else if (mermaidPart) {
152
- mindmap = mermaidPart[1];
155
+ tasks.push((async () => {
156
+ try {
157
+ const enrichRes = await gemini.generateContent(modelName, enrichPrompt);
158
+ const jsonPart = enrichRes.match(/\{[\s\S]*\}/);
159
+ const mermaidPart = enrichRes.match(/```mermaid\n([\s\S]*?)\n```/);
160
+ if (jsonPart) {
161
+ const parsed = JSON.parse(jsonPart[0]);
162
+ mindmap = parsed.mermaid || null;
163
+ }
164
+ else if (mermaidPart) {
165
+ mindmap = mermaidPart[1];
166
+ }
167
+ else {
168
+ mindmap = enrichRes.replace(/```/g, '');
169
+ }
153
170
  }
154
- else {
155
- mindmap = enrichRes.replace(/```/g, '');
171
+ catch (e) {
172
+ logger.log(`Mindmap failed: ${e}`);
156
173
  }
157
- }
158
- catch (e) {
159
- logger.log(`Mindmap failed: ${e}`);
160
- }
174
+ })());
161
175
  }
162
- // Entities
163
- let entityResult = null;
164
176
  if (options.extractEntities) {
165
177
  let entityPromptStr = options.entitySchema
166
178
  ? (0, prompts_1.generateEntityPrompt)(options.entitySchema)
167
179
  : prompts_1.AUTO_ENTITY_EXTRACTION_PROMPT;
168
- try {
169
- const context = strategy === 'DOCX' ? extractedTextDocx : mainAnalysis;
170
- const finalEntityPrompt = `${entityPromptStr}\n\nDATA CONTEXT:\n${context}`;
171
- const res = await gemini.generateContent(modelName, finalEntityPrompt);
172
- const json = res.match(/\{[\s\S]*\}/);
173
- if (json) {
174
- entityResult = JSON.parse(json[0]);
180
+ tasks.push((async () => {
181
+ try {
182
+ const context = strategy === 'DOCX' ? extractedTextDocx : mainAnalysis;
183
+ const finalEntityPrompt = `${entityPromptStr}\n\nDATA CONTEXT:\n${context}`;
184
+ const res = await gemini.generateContent(modelName, finalEntityPrompt);
185
+ const json = res.match(/\{[\s\S]*\}/);
186
+ if (json) {
187
+ entityResult = JSON.parse(json[0]);
188
+ }
175
189
  }
176
- }
177
- catch (e) {
178
- logger.log(`Entity extraction failed: ${e}`);
179
- }
190
+ catch (e) {
191
+ logger.log(`Entity extraction failed: ${e}`);
192
+ }
193
+ })());
180
194
  }
195
+ // Wait for all parallel tasks to complete
196
+ await Promise.all(tasks);
181
197
  // Page Count
182
198
  let pageCount = 1;
183
199
  if (mimeType === 'application/pdf') {
@@ -188,6 +204,8 @@ async function processSingleFile(input, options) {
188
204
  catch (e) { }
189
205
  }
190
206
  const endTime = new Date();
207
+ if (options.onProgress)
208
+ options.onProgress('complete', 'Processing complete.');
191
209
  // Success Result
192
210
  return {
193
211
  status: 'success',
package/dist/types.d.ts CHANGED
@@ -14,6 +14,7 @@ export interface OCROptions {
14
14
  pageLimit?: number;
15
15
  entitySchema?: string[] | any;
16
16
  classify?: boolean;
17
+ onProgress?: (stage: string, message: string) => void;
17
18
  }
18
19
  export interface OCRTimings {
19
20
  startTime: string;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@pandi2352/gemini-ocr",
3
- "version": "2.0.0",
3
+ "version": "4.0.0",
4
4
  "description": "A lightweight OCR processing wrapper using Google Gemini Vision models.",
5
5
  "publishConfig": {
6
6
  "access": "public"