@pandi2352/gemini-ocr 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Pandi2352
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,145 @@
1
+ <div align="center">
2
+
3
+ # 🔮 Gemini OCR (@pandi2352/gemini-ocr)
4
+ ### The Next-Gen Document Intelligence Wrapper
5
+
6
+ [![npm version](https://img.shields.io/npm/v/@pandi2352/gemini-ocr?style=flat-square&color=blue)](https://www.npmjs.com/package/@pandi2352/gemini-ocr)
7
+ [![TypeScript](https://img.shields.io/badge/Built%20With-TypeScript-3178C6?style=flat-square&logo=typescript)](https://www.typescriptlang.org/)
8
+ [![License](https://img.shields.io/badge/License-MIT-green?style=flat-square)](./LICENSE)
9
+ [![AI Powered](https://img.shields.io/badge/AI-Powered%20by%20Gemini-8E75B2?style=flat-square&logo=google)](https://deepmind.google/technologies/gemini/)
10
+
11
+ <p class="description">
12
+ Turn <b>any file</b> (PDF, Image, DOCX, Audio, Video) into <b>structured data</b>, <br>
13
+ <b>mindmaps</b>, and <b>summaries</b> with a single function call.
14
+ </p>
15
+
16
+ </div>
17
+
18
+ ---
19
+
20
+ ## ⚡ Why Gemini OCR?
21
+
22
+ Traditional OCR (Tesseract, AWS Textract) gives you just text. **Gemini OCR gives you understanding.**
23
+
24
+ | Feature | Description |
25
+ | :--- | :--- |
26
+ | **🧠 Deep Understanding** | Don't just extract text—understand it. Get summaries, titles, and context. |
27
+ | **🗺️ Mindmaps** | Auto-generate **Mermaid.js** mindmaps to visualize complex documents. |
28
+ | **🏎️ Batch Processing** | Process arrays of files — local paths, URLs, Buffers, or Base64 strings — in parallel. |
29
+ | **🎯 Entity Extraction** | Extract specific fields (Dates, Names, IDs) into strict JSON. |
30
+ | **🌈 Multimodal** | Works on **PDFs**, **Images**, **Word Docs**, **Audio**, and **Video**. |
31
+
32
+ ---
33
+
34
+ ## 📚 Step-by-Step Usage Guide
35
+
36
+ ### 1. Prerequisites
37
+ You need a Google Gemini API Key.
38
+ [**Get your API Key here**](https://aistudio.google.com/app/apikey)
39
+
40
+ ### 2. Installation
41
+ Install the package in your Node.js project:
42
+
43
+ ```bash
44
+ npm install @pandi2352/gemini-ocr
45
+ ```
46
+
47
+ ### 3. Basic Usage (Text Extraction)
48
+ Create a file (e.g., `index.ts`) and add the following. This works for locally stored files or URLs.
49
+
50
+ ```typescript
51
+ import { processOCR } from '@pandi2352/gemini-ocr';
52
+
53
+ async function main() {
54
+ const results = await processOCR({
55
+ // Input can be a single file string or an array
56
+ input: ['./my-document.pdf'],
57
+ apiKey: process.env.GEMINI_API_KEY
58
+ });
59
+
60
+ console.log(results[0].extractedText);
61
+ }
62
+
63
+ main();
64
+ ```
65
+
66
+ ### 4. Batch Processing (Multiple Files)
67
+ Pass an array of file paths or URLs. They are processed in parallel.
68
+
69
+ ```typescript
70
+ const results = await processOCR({
71
+ input: [
72
+ './invoice_january.pdf',
73
+ 'https://example.com/receipt.jpg',
74
+ './meeting_notes.docx'
75
+ ],
76
+ apiKey: process.env.GEMINI_API_KEY,
77
+ summarize: true // Optional: Get summaries for all
78
+ });
79
+
80
+ results.forEach((doc, index) => {
81
+ if (doc.status === 'success') {
82
+ console.log(`File ${index + 1}: ${doc.summary}`);
83
+ }
84
+ });
85
+ ```
86
+
87
+ ### 5. Advanced Intelligence (Mindmaps & Entities)
88
+ Unlock the full power of AI by enabling specific flags.
89
+
90
+ ```typescript
91
+ const [result] = await processOCR({
92
+ input: ['./complex_contract.pdf'],
93
+ apiKey: process.env.GEMINI_API_KEY,
94
+
95
+ // Enable Advanced Features
96
+ mindmap: true, // Generates Mermaid.js visualization
97
+ extractEntities: true, // Extracts JSON data
98
+ entitySchema: ['Contract Value', 'Start Date', 'Parties Involved'] // Optional custom fields
99
+ });
100
+
101
+ // 1. Get the Mindmap
102
+ console.log('Mindmap Code:', result.mindmap);
103
+
104
+ // 2. Get Structured Data
105
+ console.log('Extracted Data:', result.entityResult);
106
+ /* Output:
107
+ {
108
+ "contract_value": "$50,000",
109
+ "start_date": "2024-01-01",
110
+ "parties_involved": "Company A, Vendor B"
111
+ }
112
+ */
113
+ ```
114
+
115
+ ---
116
+
117
+ ## 🛠️ Configuration Options
118
+
119
+ | Option | Type | Default | Description |
120
+ | :--- | :--- | :--- | :--- |
121
+ | `input` | `Array<string \| Buffer \| Object>` | **Required** | Array of file paths, URLs, Buffers, or Base64 strings. |
122
+ | `apiKey` | `string` | **Required** | Your Google Gemini API Key. |
123
+ | `model` | `string` | `gemini-1.5-flash` | The AI model to use. |
124
+ | `summarize` | `boolean` | `false` | Generate `metadata` (title, desc, thumbnail). |
125
+ | `mindmap` | `boolean` | `false` | Generate Mermaid.js syntax for visual mapping. |
126
+ | `extractEntities`| `boolean` | `false` | Enable structured field extraction. |
127
+ | `entitySchema` | `string[]` | `auto` | Custom fields to extract (optional). |
128
+
129
+ ---
130
+
131
+ ## 🤝 Contributing
132
+
133
+ We love contributions! Please feel free to submit a Pull Request.
134
+
135
+ 1. Fork it
136
+ 2. Create your feature branch (`git checkout -b feature/cool-feature`)
137
+ 3. Commit your changes
138
+ 4. Push to the branch
139
+ 5. Open a Pull Request
140
+
141
+ ---
142
+
143
+ <div align="center">
144
+ <sub>Built with ❤️ for developers by developers.</sub>
145
+ </div>
@@ -0,0 +1 @@
1
/** Extracts the plain-text body of a .docx file (formatting discarded; uses mammoth). */
export declare function extractTextFromDocx(buffer: Buffer): Promise<string>;
@@ -0,0 +1,11 @@
1
+ "use strict";
2
+ var __importDefault = (this && this.__importDefault) || function (mod) {
3
+ return (mod && mod.__esModule) ? mod : { "default": mod };
4
+ };
5
+ Object.defineProperty(exports, "__esModule", { value: true });
6
+ exports.extractTextFromDocx = extractTextFromDocx;
7
+ const mammoth_1 = __importDefault(require("mammoth"));
8
+ async function extractTextFromDocx(buffer) {
9
+ const result = await mammoth_1.default.extractRawText({ buffer: buffer });
10
+ return result.value;
11
+ }
@@ -0,0 +1,5 @@
1
import { OCRSource, OCROptions, OCRResult } from './types';
export * from './types';
/**
 * Batch OCR entry point: processes every source in `input` in parallel and
 * resolves with one OCRResult per source, in input order. Per-file failures
 * are returned as `status: 'error'` results rather than rejections.
 */
export declare function processOCR(options: OCROptions & {
    input: OCRSource[];
}): Promise<OCRResult[]>;
package/dist/index.js ADDED
@@ -0,0 +1,245 @@
1
+ "use strict";
2
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
3
+ if (k2 === undefined) k2 = k;
4
+ var desc = Object.getOwnPropertyDescriptor(m, k);
5
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
6
+ desc = { enumerable: true, get: function() { return m[k]; } };
7
+ }
8
+ Object.defineProperty(o, k2, desc);
9
+ }) : (function(o, m, k, k2) {
10
+ if (k2 === undefined) k2 = k;
11
+ o[k2] = m[k];
12
+ }));
13
+ var __exportStar = (this && this.__exportStar) || function(m, exports) {
14
+ for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
15
+ };
16
+ Object.defineProperty(exports, "__esModule", { value: true });
17
+ exports.processOCR = processOCR;
18
+ const pdf_lib_1 = require("pdf-lib");
19
+ const input_handler_1 = require("./input-handler");
20
+ const llm_1 = require("./llm");
21
+ const converters_1 = require("./converters");
22
+ const prompts_1 = require("./prompts");
23
+ const utils_1 = require("./utils");
24
+ __exportStar(require("./types"), exports);
25
/**
 * Normalizes a raw OCR source into a tagged input object.
 * - Buffer            -> { type: 'buffer' }
 * - 'http(s)://…'     -> { type: 'url' }
 * - 'data:…'          -> { type: 'base64' }
 * - any other string  -> { type: 'path' }
 * - tagged object     -> passed through unchanged
 */
function normalizeSource(source) {
    if (Buffer.isBuffer(source)) {
        return { type: 'buffer', value: source };
    }
    if (typeof source === 'string') {
        const trimmed = source.trim();
        // Match a real URL scheme only. The previous `startsWith('http')` check
        // also swallowed local paths such as "http_logs/file.pdf" (and its
        // `|| startsWith('https')` branch was unreachable).
        if (/^https?:\/\//i.test(trimmed)) {
            return { type: 'url', value: trimmed };
        }
        if (trimmed.startsWith('data:')) {
            return { type: 'base64', value: trimmed };
        }
        return { type: 'path', value: trimmed };
    }
    // Assume an already-tagged OCRObjectInput.
    return source;
}
42
/**
 * Batch entry point: normalizes every source in `options.input` and runs the
 * OCR pipeline on all of them concurrently.
 *
 * Note: Promise.all means fully parallel requests; a bounded concurrency pool
 * could be friendlier to API rate limits, but parallel is the standard "batch"
 * contract here. Result order matches input order.
 */
async function processOCR(options) {
    const jobs = options.input.map((source) => processSingleFile(normalizeSource(source), options));
    return Promise.all(jobs);
}
50
/**
 * Runs the full OCR pipeline for one normalized input. Always resolves —
 * never rejects — so one bad file cannot break a batch: failures come back
 * as a structured `status: 'error'` result with the same shape as success.
 */
async function processSingleFile(input, options) {
    const startTime = new Date();
    const logger = new utils_1.Logger();
    const requestId = (0, utils_1.generateRequestId)();
    logger.log(`INIT: Processing file. RequestId: ${requestId}`);
    try {
        if (!options.apiKey)
            throw new Error('Gemini API key is required.');
        const gemini = new llm_1.GeminiClient(options.apiKey, logger);
        const modelName = options.model || 'gemini-1.5-flash';
        // Input Processing: resolve path/url/base64/buffer into bytes + MIME info.
        const inputHandler = new input_handler_1.InputHandler(logger);
        const normalized = await inputHandler.processInput(input);
        const mimeType = normalized.mimeType;
        // Strategy: TEXT and DOCX are sent inline inside the prompt; everything
        // else (PDF, image, audio, video) is uploaded via the Gemini File API.
        let strategy = 'MEDIA';
        if (mimeType === 'text/plain' || mimeType === 'text/csv' || normalized.extension === 'txt' || normalized.extension === 'csv') {
            strategy = 'TEXT';
        }
        else if (mimeType === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' || normalized.extension === 'docx') {
            strategy = 'DOCX';
        }
        // Prompt Construction: pick the media-specific base prompt.
        let basePrompt = prompts_1.PDF_EXTRACTION_PROMPT;
        if (mimeType.startsWith('image/'))
            basePrompt = prompts_1.IMAGE_CONTEXT_PROMPT;
        else if (mimeType.startsWith('audio/'))
            basePrompt = prompts_1.AUDIO_CONTEXT_PROMPT;
        else if (mimeType.startsWith('video/'))
            basePrompt = prompts_1.VIDEO_CONTEXT_PROMPT;
        let finalPrompt = basePrompt;
        if (options.summarize) {
            // Asks the model to append a ```json metadata block; parsed further below.
            finalPrompt += `\n\n${prompts_1.META_JSON_PROMPT}`;
        }
        // Execution
        let analysisText = '';
        let extractedTextDocx = '';
        let fileUri;
        if (strategy === 'TEXT') {
            const content = normalized.data.toString('utf-8');
            finalPrompt += `\n\nDOCUMENT CONTENT:\n${content}`;
            analysisText = await gemini.generateContent(modelName, finalPrompt);
        }
        else if (strategy === 'DOCX') {
            // DOCX text is extracted locally (mammoth) and kept verbatim.
            extractedTextDocx = await (0, converters_1.extractTextFromDocx)(normalized.data);
            finalPrompt += `\n\nDOCUMENT CONTENT:\n${extractedTextDocx}`;
            analysisText = await gemini.generateContent(modelName, finalPrompt);
        }
        else {
            // MEDIA: upload from disk when the original path is known, otherwise
            // via a temp file, then reference the uploaded file in the request.
            if (normalized.originalPath) {
                fileUri = await gemini.uploadFile(normalized.originalPath, mimeType);
            }
            else {
                fileUri = await gemini.uploadBuffer(normalized.data, mimeType, normalized.extension);
            }
            analysisText = await gemini.generateContent(modelName, finalPrompt, { mimeType, fileUri });
        }
        // Parse Metadata (only requested when summarize is on).
        let metaJson = {};
        let mainAnalysis = analysisText;
        if (options.summarize) {
            // Try strict ```json fenced block first.
            const jsonMatch = analysisText.match(/```json\n([\s\S]*?)\n```/);
            if (jsonMatch && jsonMatch[1]) {
                try {
                    metaJson = JSON.parse(jsonMatch[1]);
                    mainAnalysis = analysisText.replace(/```json\n[\s\S]*?\n```/, '').trim();
                }
                catch (e) {
                    logger.log('Failed to parse summary JSON block');
                }
            }
            else {
                // Fallback: look for a trailing bare JSON object if there is no fenced block.
                try {
                    const lastBrace = analysisText.lastIndexOf('}');
                    const firstBrace = analysisText.lastIndexOf('{'); // Risky if multiple/nested objects
                    if (lastBrace > firstBrace && firstBrace !== -1) {
                        const potentialJson = analysisText.substring(firstBrace, lastBrace + 1);
                        // Minimal check that it looks like our metadata shape.
                        if (potentialJson.includes('"title"')) {
                            metaJson = JSON.parse(potentialJson);
                            mainAnalysis = analysisText.substring(0, firstBrace).trim();
                        }
                    }
                }
                catch (e) { } // best-effort: metadata stays empty on parse failure
            }
        }
        // Mindmap: optional second model call; failure is logged, never fatal.
        let mindmap = null;
        if (options.mindmap) {
            const enrichPrompt = `${prompts_1.ENRICHMENT_PROMPT}\n\nCONTEXT:\n${mainAnalysis}`;
            try {
                const enrichRes = await gemini.generateContent(modelName, enrichPrompt);
                // Preferred: JSON object with a `mermaid` key (as the prompt requests);
                // fallback: a ```mermaid fenced block; last resort: the raw reply.
                const jsonPart = enrichRes.match(/\{[\s\S]*\}/);
                const mermaidPart = enrichRes.match(/```mermaid\n([\s\S]*?)\n```/);
                if (jsonPart) {
                    const parsed = JSON.parse(jsonPart[0]);
                    mindmap = parsed.mermaid || null;
                }
                else if (mermaidPart) {
                    mindmap = mermaidPart[1];
                }
                else {
                    mindmap = enrichRes.replace(/```/g, '');
                }
            }
            catch (e) {
                logger.log(`Mindmap failed: ${e}`);
            }
        }
        // Entities: optional third model call; failure is logged, never fatal.
        let entityResult = null;
        if (options.extractEntities) {
            let entityPromptStr = options.entitySchema
                ? (0, prompts_1.generateEntityPrompt)(options.entitySchema)
                : prompts_1.AUTO_ENTITY_EXTRACTION_PROMPT;
            try {
                // DOCX uses the verbatim extracted text; other strategies use the analysis.
                const context = strategy === 'DOCX' ? extractedTextDocx : mainAnalysis;
                const finalEntityPrompt = `${entityPromptStr}\n\nDATA CONTEXT:\n${context}`;
                const res = await gemini.generateContent(modelName, finalEntityPrompt);
                const json = res.match(/\{[\s\S]*\}/);
                if (json) {
                    entityResult = JSON.parse(json[0]);
                }
            }
            catch (e) {
                logger.log(`Entity extraction failed: ${e}`);
            }
        }
        // Page Count: PDFs only; everything else defaults to 1.
        let pageCount = 1;
        if (mimeType === 'application/pdf') {
            try {
                const pdfDoc = await pdf_lib_1.PDFDocument.load(normalized.data);
                pageCount = pdfDoc.getPageCount();
            }
            catch (e) { } // unreadable/encrypted PDFs keep the default of 1
        }
        const endTime = new Date();
        // Success Result
        return {
            status: 'success',
            error: null,
            extractedText: strategy === 'DOCX' ? extractedTextDocx : mainAnalysis,
            summary: metaJson.description || null,
            metadata: metaJson,
            mindmap,
            entityResult,
            pageCount,
            language: 'en', // NOTE(review): hard-coded, not detected from content
            documentType: 'unknown', // NOTE(review): classification not implemented
            confidence: null,
            timings: {
                startTime: startTime.toISOString(),
                endTime: endTime.toISOString(),
                durationMs: (0, utils_1.calculateDuration)(startTime, endTime)
            },
            logs: {
                events: logger.getEvents(),
                geminiModel: modelName,
                requestId
            },
            raw: options.includeRaw ? analysisText : null
        };
    }
    catch (error) {
        const endTime = new Date();
        // Error Result (Structured): batch callers get a uniform shape either way.
        return {
            status: 'error',
            error: error.message || 'Unknown error',
            extractedText: '',
            summary: null,
            mindmap: null,
            entityResult: null,
            metadata: {},
            language: null,
            documentType: null,
            pageCount: 0,
            confidence: null,
            timings: {
                startTime: startTime.toISOString(),
                endTime: endTime.toISOString(),
                durationMs: (0, utils_1.calculateDuration)(startTime, endTime)
            },
            logs: {
                events: logger.getEvents(),
                geminiModel: options.model || 'unknown',
                requestId
            },
            raw: null
        };
    }
}
@@ -0,0 +1,7 @@
1
import { OCRObjectInput, NormalizedFile } from './types';
import { Logger } from './utils';
/** Resolves any supported input kind (path/url/base64/buffer) into raw bytes plus MIME info. */
export declare class InputHandler {
    private logger;
    constructor(logger: Logger);
    /**
     * Loads the bytes for `input` and detects its MIME type and extension.
     * Rejects on unreadable paths, failed downloads, or an unknown input type.
     */
    processInput(input: OCRObjectInput): Promise<NormalizedFile>;
}
@@ -0,0 +1,88 @@
1
+ "use strict";
2
+ var __importDefault = (this && this.__importDefault) || function (mod) {
3
+ return (mod && mod.__esModule) ? mod : { "default": mod };
4
+ };
5
+ Object.defineProperty(exports, "__esModule", { value: true });
6
+ exports.InputHandler = void 0;
7
+ const promises_1 = __importDefault(require("fs/promises"));
8
+ const axios_1 = __importDefault(require("axios"));
9
+ const file_type_1 = require("file-type");
10
+ const mime_types_1 = __importDefault(require("mime-types"));
11
+ const path_1 = __importDefault(require("path"));
12
+ class InputHandler {
13
+ constructor(logger) {
14
+ this.logger = logger;
15
+ }
16
+ async processInput(input) {
17
+ this.logger.log(`Processing input type: ${input.type}`);
18
+ let buffer;
19
+ let originalPath;
20
+ switch (input.type) {
21
+ case 'path':
22
+ if (typeof input.value !== 'string') {
23
+ throw new Error('Input value for "path" must be a string.');
24
+ }
25
+ originalPath = path_1.default.resolve(input.value);
26
+ this.logger.log(`Reading file from path: ${originalPath}`);
27
+ buffer = await promises_1.default.readFile(originalPath);
28
+ break;
29
+ case 'url':
30
+ if (typeof input.value !== 'string') {
31
+ throw new Error('Input value for "url" must be a string.');
32
+ }
33
+ this.logger.log(`Fetching from URL: ${input.value}`);
34
+ const response = await axios_1.default.get(input.value, { responseType: 'arraybuffer' });
35
+ buffer = Buffer.from(response.data);
36
+ break;
37
+ case 'base64':
38
+ if (typeof input.value !== 'string') {
39
+ throw new Error('Input value for "base64" must be a string.');
40
+ }
41
+ this.logger.log('Decoding Base64 content');
42
+ const base64Data = input.value.replace(/^data:.*?;base64,/, '');
43
+ buffer = Buffer.from(base64Data, 'base64');
44
+ break;
45
+ case 'buffer':
46
+ if (!Buffer.isBuffer(input.value)) {
47
+ throw new Error('Input value for "buffer" must be a Buffer.');
48
+ }
49
+ this.logger.log('Using provided Buffer');
50
+ buffer = input.value;
51
+ break;
52
+ default:
53
+ throw new Error(`Unsupported input type: ${input.type}`);
54
+ }
55
+ const typeInfo = await (0, file_type_1.fromBuffer)(buffer);
56
+ let mimeType = typeInfo?.mime;
57
+ let extension = typeInfo?.ext;
58
+ // Fallback if file-type cannot detect
59
+ if (!mimeType) {
60
+ if (input.type === 'path') {
61
+ const lookup = mime_types_1.default.lookup(input.value);
62
+ if (lookup) {
63
+ mimeType = lookup;
64
+ extension = mime_types_1.default.extension(lookup) || undefined;
65
+ }
66
+ }
67
+ if (!mimeType && input.type === 'url') {
68
+ const lookup = mime_types_1.default.lookup(input.value);
69
+ if (lookup) {
70
+ mimeType = lookup;
71
+ extension = mime_types_1.default.extension(lookup) || undefined;
72
+ }
73
+ }
74
+ if (!mimeType) {
75
+ mimeType = 'application/octet-stream';
76
+ extension = 'bin';
77
+ }
78
+ }
79
+ this.logger.log(`Detected MIME type: ${mimeType}, Extension: ${extension}`);
80
+ return {
81
+ data: buffer,
82
+ mimeType: mimeType,
83
+ extension,
84
+ originalPath
85
+ };
86
+ }
87
+ }
88
+ exports.InputHandler = InputHandler;
package/dist/llm.d.ts ADDED
@@ -0,0 +1,18 @@
1
import { GenerativeModel } from '@google/generative-ai';
import { Logger } from './utils';
/** Thin wrapper around the Gemini SDK: model access, File API uploads, generation. */
export declare class GeminiClient {
    private genAI;
    private fileManager;
    private logger;
    constructor(apiKey: string, logger: Logger);
    /** Returns a GenerativeModel handle. `mimeType` is currently unused. */
    getModel(modelName: string, mimeType?: string): GenerativeModel;
    /** Uploads a file from disk via the Gemini File API; resolves to its URI. */
    uploadFile(filePath: string, mimeType: string): Promise<string>;
    /** Writes the buffer to a temp file, uploads it, then removes the temp file. */
    uploadBuffer(buffer: Buffer, mimeType: string, extension?: string): Promise<string>;
    /** Generates text for `prompt`, optionally attaching uploaded or inline file data. */
    generateContent(modelName: string, prompt: string, fileData?: {
        mimeType: string;
        fileUri: string;
    } | {
        mimeType: string;
        inlineData: string;
    }): Promise<string>;
}
package/dist/llm.js ADDED
@@ -0,0 +1,100 @@
1
+ "use strict";
2
+ var __importDefault = (this && this.__importDefault) || function (mod) {
3
+ return (mod && mod.__esModule) ? mod : { "default": mod };
4
+ };
5
+ Object.defineProperty(exports, "__esModule", { value: true });
6
+ exports.GeminiClient = void 0;
7
+ const generative_ai_1 = require("@google/generative-ai");
8
+ const server_1 = require("@google/generative-ai/server");
9
+ const fs_1 = __importDefault(require("fs"));
10
+ const path_1 = __importDefault(require("path"));
11
+ const os_1 = __importDefault(require("os"));
12
+ const uuid_1 = require("uuid");
13
+ class GeminiClient {
14
+ constructor(apiKey, logger) {
15
+ this.genAI = new generative_ai_1.GoogleGenerativeAI(apiKey);
16
+ this.fileManager = new server_1.GoogleAIFileManager(apiKey);
17
+ this.logger = logger;
18
+ }
19
+ getModel(modelName, mimeType = 'text/plain') {
20
+ return this.genAI.getGenerativeModel({
21
+ model: modelName,
22
+ // generationConfig: { responseMimeType: mimeType } // Handle strictly in generate call if needed
23
+ });
24
+ }
25
+ async uploadFile(filePath, mimeType) {
26
+ this.logger.log(`Uploading file to Gemini: ${filePath} (${mimeType})`);
27
+ try {
28
+ const uploadResult = await this.fileManager.uploadFile(filePath, {
29
+ mimeType,
30
+ displayName: path_1.default.basename(filePath),
31
+ });
32
+ this.logger.log(`Upload successful. API Name: ${uploadResult.file.name}, URI: ${uploadResult.file.uri}`);
33
+ return uploadResult.file.uri;
34
+ }
35
+ catch (error) {
36
+ this.logger.log(`Upload failed: ${error.message}`);
37
+ throw error;
38
+ }
39
+ }
40
+ async uploadBuffer(buffer, mimeType, extension = 'tmp') {
41
+ const tempDir = os_1.default.tmpdir();
42
+ const tempFilePath = path_1.default.join(tempDir, `gemini-ocr-${(0, uuid_1.v4)()}.${extension}`);
43
+ try {
44
+ this.logger.log(`Writing buffer to temp file: ${tempFilePath}`);
45
+ await fs_1.default.promises.writeFile(tempFilePath, buffer);
46
+ const uri = await this.uploadFile(tempFilePath, mimeType);
47
+ // Cleanup temp file immediately after upload?
48
+ // Documentation says file must be uploaded. After upload we can delete local file.
49
+ // But we should wait for processing? No, once uploaded to Gemini (File API), we can delete local.
50
+ // BUT if the file is processing (video), it might take time to be 'ACTIVE'.
51
+ // Upload returns immediately. State might be 'PROCESSING'.
52
+ return uri;
53
+ }
54
+ finally {
55
+ // Clean up temp file
56
+ if (fs_1.default.existsSync(tempFilePath)) {
57
+ try {
58
+ await fs_1.default.promises.unlink(tempFilePath);
59
+ this.logger.log(`Deleted temp file: ${tempFilePath}`);
60
+ }
61
+ catch (e) {
62
+ this.logger.log(`Failed to delete temp file: ${e}`);
63
+ }
64
+ }
65
+ }
66
+ }
67
+ async generateContent(modelName, prompt, fileData) {
68
+ const model = this.getModel(modelName);
69
+ const parts = [{ text: prompt }];
70
+ if (fileData) {
71
+ if ('fileUri' in fileData) {
72
+ parts.push({
73
+ fileData: {
74
+ mimeType: fileData.mimeType,
75
+ fileUri: fileData.fileUri
76
+ }
77
+ });
78
+ }
79
+ else if ('inlineData' in fileData) {
80
+ parts.push({
81
+ inlineData: {
82
+ mimeType: fileData.mimeType,
83
+ data: fileData.inlineData
84
+ }
85
+ });
86
+ }
87
+ }
88
+ this.logger.log(`Generating content with model: ${modelName}`);
89
+ try {
90
+ const result = await model.generateContent(parts);
91
+ const response = await result.response;
92
+ return response.text();
93
+ }
94
+ catch (error) {
95
+ this.logger.log(`Gemini generation error: ${error.message}`);
96
+ throw error;
97
+ }
98
+ }
99
+ }
100
+ exports.GeminiClient = GeminiClient;
@@ -0,0 +1,8 @@
1
/** Mindmap pass: asks the model for Mermaid `mindmap` code wrapped in a JSON object. */
export declare const ENRICHMENT_PROMPT = "\nYou are an expert content analyzer and information architect. Your task is to visualize the core structure and key concepts of the provided document using a **Mermaid.js Mindmap**.\n\n**Objective**: Create a \"Modern Mindmap\" that maps out the central theme, main branches (key topics), and sub-branches (details/evidence).\n\n**Instructions**:\n1. **Analyze**: Understand the central thesis and hierarchical structure of the content.\n2. **Generate Mermaid Code**: Output valid Mermaid `mindmap` syntax.\n * Start with `mindmap` keyword.\n * Use **2 spaces** indentation for hierarchy (no tabs).\n * **CRITICAL**: If a node label contains parentheses `()`, brackets `[]`, or special characters, you MUST wrap the text in quotes, e.g., `node[\"Label with (parens)\"]`.\n * Fail-safe: If unsure, wrap the label in double quotes.\n * Root node should be the Document Title or Main Topic.\n * Keep node labels concise (1-4 words preferred).\n * **Style**: Aim for a balanced, radial or hierarchical structure that is easy to read.\n3. **Strictly NO Markdown**: Return ONLY the mermaid code inside a JSON object.\n\n**Output Format**:\nReturn a JSON object strictly following this structure. Do NOT wrap it in markdown code blocks.\n\n{\n \"mermaid\": \"mindmap\\n root((...))\"\n}\n";
/** Default base prompt for documents (PDF and unknown media): verbatim extraction. */
export declare const PDF_EXTRACTION_PROMPT = "**Objective:** Perform a comprehensive extraction and analysis of the provided document.\n\n**Instructions:**\n1. **Text Extraction:** Extract all textual content/OCR from the document. Preserve logical structure.\n2. **Context:** If images/charts are present, describe them briefly in-flow.\n\n**Output:**\nReturn ONLY the extracted text and descriptions. Do not add conversational filler.\n";
/** Base prompt for image inputs. */
export declare const IMAGE_CONTEXT_PROMPT = "Analyze the uploaded image. Extract all text verbatim and describe the visual content in detail.";
/** Base prompt for audio inputs. */
export declare const AUDIO_CONTEXT_PROMPT = "Analyze the audio. Transcribe speech and describe sounds/music.";
/** Base prompt for video inputs. */
export declare const VIDEO_CONTEXT_PROMPT = "Analyze the video. Transcribe speech, describe visual scenes, actions, and text on screen.";
/** Appended when `summarize` is on: requests a trailing ```json metadata block. */
export declare const META_JSON_PROMPT = "\n*** METADATA REQUIREMENT ***\n\nAppend the following JSON object at the VERY END of your response.\n\n```json\n{\n \"title\": \"Short descriptive title\",\n \"description\": \"Concise summary (1-2 sentences)\",\n \"thumbnail\": \"Visual description for a thumbnail\"\n}\n```\n";
/** Used when `extractEntities` is on without a custom schema. */
export declare const AUTO_ENTITY_EXTRACTION_PROMPT = "\nYou are an expert entity extractor. Automatically identify significant entities (Names, Dates, IDs, amounts).\nReturn valid JSON with snake_case keys.\n";
/** Builds an extraction prompt from a field-name list or an example schema object. */
export declare const generateEntityPrompt: (fields: string[] | any) => string;
@@ -0,0 +1,73 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.generateEntityPrompt = exports.AUTO_ENTITY_EXTRACTION_PROMPT = exports.META_JSON_PROMPT = exports.VIDEO_CONTEXT_PROMPT = exports.AUDIO_CONTEXT_PROMPT = exports.IMAGE_CONTEXT_PROMPT = exports.PDF_EXTRACTION_PROMPT = exports.ENRICHMENT_PROMPT = void 0;
4
+ exports.ENRICHMENT_PROMPT = `
5
+ You are an expert content analyzer and information architect. Your task is to visualize the core structure and key concepts of the provided document using a **Mermaid.js Mindmap**.
6
+
7
+ **Objective**: Create a "Modern Mindmap" that maps out the central theme, main branches (key topics), and sub-branches (details/evidence).
8
+
9
+ **Instructions**:
10
+ 1. **Analyze**: Understand the central thesis and hierarchical structure of the content.
11
+ 2. **Generate Mermaid Code**: Output valid Mermaid \`mindmap\` syntax.
12
+ * Start with \`mindmap\` keyword.
13
+ * Use **2 spaces** indentation for hierarchy (no tabs).
14
+ * **CRITICAL**: If a node label contains parentheses \`()\`, brackets \`[]\`, or special characters, you MUST wrap the text in quotes, e.g., \`node["Label with (parens)"]\`.
15
+ * Fail-safe: If unsure, wrap the label in double quotes.
16
+ * Root node should be the Document Title or Main Topic.
17
+ * Keep node labels concise (1-4 words preferred).
18
+ * **Style**: Aim for a balanced, radial or hierarchical structure that is easy to read.
19
+ 3. **Strictly NO Markdown**: Return ONLY the mermaid code inside a JSON object.
20
+
21
+ **Output Format**:
22
+ Return a JSON object strictly following this structure. Do NOT wrap it in markdown code blocks.
23
+
24
+ {
25
+ "mermaid": "mindmap\\n root((...))"
26
+ }
27
+ `;
28
+ exports.PDF_EXTRACTION_PROMPT = `**Objective:** Perform a comprehensive extraction and analysis of the provided document.
29
+
30
+ **Instructions:**
31
+ 1. **Text Extraction:** Extract all textual content/OCR from the document. Preserve logical structure.
32
+ 2. **Context:** If images/charts are present, describe them briefly in-flow.
33
+
34
+ **Output:**
35
+ Return ONLY the extracted text and descriptions. Do not add conversational filler.
36
+ `;
37
+ exports.IMAGE_CONTEXT_PROMPT = `Analyze the uploaded image. Extract all text verbatim and describe the visual content in detail.`;
38
+ exports.AUDIO_CONTEXT_PROMPT = `Analyze the audio. Transcribe speech and describe sounds/music.`;
39
+ exports.VIDEO_CONTEXT_PROMPT = `Analyze the video. Transcribe speech, describe visual scenes, actions, and text on screen.`;
40
+ exports.META_JSON_PROMPT = `
41
+ *** METADATA REQUIREMENT ***
42
+
43
+ Append the following JSON object at the VERY END of your response.
44
+
45
+ \`\`\`json
46
+ {
47
+ "title": "Short descriptive title",
48
+ "description": "Concise summary (1-2 sentences)",
49
+ "thumbnail": "Visual description for a thumbnail"
50
+ }
51
+ \`\`\`
52
+ `;
53
+ exports.AUTO_ENTITY_EXTRACTION_PROMPT = `
54
+ You are an expert entity extractor. Automatically identify significant entities (Names, Dates, IDs, amounts).
55
+ Return valid JSON with snake_case keys.
56
+ `;
57
+ const generateEntityPrompt = (fields) => {
58
+ let fieldsList = '';
59
+ let jsonJSON = {};
60
+ if (Array.isArray(fields)) {
61
+ fieldsList = fields.join(', ');
62
+ jsonJSON = fields.reduce((acc, curr) => ({ ...acc, [curr.toLowerCase().replace(/ /g, '_')]: "..." }), {});
63
+ }
64
+ else {
65
+ jsonJSON = fields;
66
+ }
67
+ return `
68
+ Extract the following entities: ${fieldsList || 'from schema'}.
69
+ Return strictly JSON matching this structure:
70
+ ${JSON.stringify(jsonJSON, null, 2)}
71
+ `;
72
+ };
73
+ exports.generateEntityPrompt = generateEntityPrompt;
@@ -0,0 +1,55 @@
1
/** Discriminator for object-form inputs. */
export type InputType = 'url' | 'base64' | 'buffer' | 'path';
/** Explicit, tagged input form. */
export interface OCRObjectInput {
    type: InputType;
    value: string | Buffer;
}
/** Anything processOCR accepts: path/URL/base64 string, raw Buffer, or tagged object. */
export type OCRSource = string | Buffer | OCRObjectInput;
/** Options accepted by processOCR; `apiKey` is the only required field. */
export interface OCROptions {
    model?: string;
    apiKey: string;
    includeRaw?: boolean;
    summarize?: boolean;
    mindmap?: boolean;
    extractEntities?: boolean;
    pageLimit?: number;
    entitySchema?: string[] | any;
    classify?: boolean;
}
/** Wall-clock timing (ISO strings) of one file's processing. */
export interface OCRTimings {
    startTime: string;
    endTime: string;
    durationMs: number;
}
/** Log lines and request bookkeeping for one processed file. */
export interface OCRLogs {
    events: string[];
    geminiModel: string;
    requestId: string | null;
}
/** Model-generated metadata, populated when `summarize` is enabled. */
export interface DocumentMetadata {
    title?: string;
    description?: string;
    thumbnail?: string;
    [key: string]: any;
}
/** Per-file result; `status: 'error'` results carry the message in `error`. */
export interface OCRResult {
    status: 'success' | 'error';
    error: string | null;
    extractedText: string;
    summary: string | null;
    mindmap: string | null;
    entityResult: any | null;
    metadata: DocumentMetadata;
    language: string | null;
    documentType: string | null;
    pageCount: number;
    confidence: number | null;
    timings: OCRTimings;
    logs: OCRLogs;
    raw: any | null;
}
/** Raw bytes plus detected MIME info, produced by InputHandler. */
export interface NormalizedFile {
    mimeType: string;
    data: Buffer;
    extension?: string;
    originalPath?: string;
}
package/dist/types.js ADDED
@@ -0,0 +1,2 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
@@ -0,0 +1,7 @@
1
/** In-memory collector of timestamped log lines for a single OCR request. */
export declare class Logger {
    private events;
    /** Appends `message` prefixed with the current ISO timestamp. */
    log(message: string): void;
    /** Returns the accumulated event lines (the live array, not a copy). */
    getEvents(): string[];
}
/** Returns a random UUID used to correlate all logs belonging to one request. */
export declare function generateRequestId(): string;
/** Elapsed milliseconds: endTime - startTime. */
export declare function calculateDuration(startTime: Date, endTime: Date): number;
package/dist/utils.js ADDED
@@ -0,0 +1,25 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.Logger = void 0;
4
+ exports.generateRequestId = generateRequestId;
5
+ exports.calculateDuration = calculateDuration;
6
+ const uuid_1 = require("uuid");
7
+ class Logger {
8
+ constructor() {
9
+ this.events = [];
10
+ }
11
+ log(message) {
12
+ const timestamp = new Date().toISOString();
13
+ this.events.push(`${timestamp} - ${message}`);
14
+ }
15
+ getEvents() {
16
+ return this.events;
17
+ }
18
+ }
19
+ exports.Logger = Logger;
20
+ function generateRequestId() {
21
+ return (0, uuid_1.v4)();
22
+ }
23
+ function calculateDuration(startTime, endTime) {
24
+ return endTime.getTime() - startTime.getTime();
25
+ }
package/package.json ADDED
@@ -0,0 +1,53 @@
1
+ {
2
+ "name": "@pandi2352/gemini-ocr",
3
+ "version": "1.0.0",
4
+ "description": "A lightweight OCR processing wrapper using Google Gemini Vision models.",
5
+ "publishConfig": {
6
+ "access": "public"
7
+ },
8
+ "main": "dist/index.js",
9
+ "types": "dist/index.d.ts",
10
+ "author": "Pandi2352",
11
+ "license": "MIT",
12
+ "repository": {
13
+ "type": "git",
14
+ "url": "git+https://github.com/Pandi2352/npm-gemini-ocr.git"
15
+ },
16
+ "files": [
17
+ "dist",
18
+ "README.md",
19
+ "LICENSE"
20
+ ],
21
+ "scripts": {
22
+ "build": "rimraf dist && tsc",
23
+ "prepublishOnly": "npm run build",
24
+ "test": "ts-node test.ts"
25
+ },
26
+ "keywords": [
27
+ "ocr",
28
+ "gemini",
29
+ "google",
30
+ "ai",
31
+ "pdf",
32
+ "image",
33
+ "vision"
34
+ ],
35
+ "devDependencies": {
36
+ "@types/mime-types": "^3.0.1",
37
+ "@types/uuid": "^10.0.0",
38
+ "rimraf": "^6.1.2",
39
+ "ts-node": "^10.9.2"
40
+ },
41
+ "dependencies": {
42
+ "@google/generative-ai": "^0.24.1",
43
+ "@types/node": "^25.0.3",
44
+ "axios": "^1.13.2",
45
+ "dotenv": "^17.2.3",
46
+ "file-type": "^16.5.4",
47
+ "mammoth": "^1.8.0",
48
+ "mime-types": "^3.0.2",
49
+ "pdf-lib": "^1.17.1",
50
+ "typescript": "^5.9.3",
51
+ "uuid": "^13.0.0"
52
+ }
53
+ }