@pandi2352/gemini-ocr 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Pandi2352
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,145 @@
1
+ <div align="center">
2
+
3
+ # 🔮 Gemini OCR (@pandi2352/gemini-ocr)
4
+ ### The Next-Gen Document Intelligence Wrapper
5
+
6
+ [![npm version](https://img.shields.io/npm/v/@pandi2352/gemini-ocr?style=flat-square&color=blue)](https://www.npmjs.com/package/@pandi2352/gemini-ocr)
7
+ [![TypeScript](https://img.shields.io/badge/Built%20With-TypeScript-3178C6?style=flat-square&logo=typescript)](https://www.typescriptlang.org/)
8
+ [![License](https://img.shields.io/badge/License-MIT-green?style=flat-square)](./LICENSE)
9
+ [![AI Powered](https://img.shields.io/badge/AI-Powered%20by%20Gemini-8E75B2?style=flat-square&logo=google)](https://deepmind.google/technologies/gemini/)
10
+
11
+ <p class="description">
12
+ Turn <b>any file</b> (PDF, Image, DOCX, Audio, Video) into <b>structured data</b>, <br>
13
+ <b>mindmaps</b>, and <b>summaries</b> with a single function call.
14
+ </p>
15
+
16
+ </div>
17
+
18
+ ---
19
+
20
+ ## ⚡ Why Gemini OCR?
21
+
22
+ Traditional OCR (Tesseract, AWS Textract) gives you just text. **Gemini OCR gives you understanding.**
23
+
24
+ | Feature | Description |
25
+ | :--- | :--- |
26
+ | **🧠 Deep Understanding** | Don't just extract text—understand it. Get summaries, titles, and context. |
27
+ | **🗺️ Mindmaps** | Auto-generate **Mermaid.js** mindmaps to visualize complex documents. |
28
+ | **🏎️ Batch Processing** | Process arrays of files — local paths, URLs, Buffers, or Base64 strings — in parallel. |
29
+ | **🎯 Entity Extraction** | Extract specific fields (Dates, Names, IDs) into strict JSON. |
30
+ | **🌈 Multimodal** | Works on **PDFs**, **Images**, **Word Docs**, **Audio**, and **Video**. |
31
+
32
+ ---
33
+
34
+ ## 📚 Step-by-Step Usage Guide
35
+
36
+ ### 1. Prerequisites
37
+ You need a Google Gemini API Key.
38
+ [**Get your API Key here**](https://aistudio.google.com/app/apikey)
39
+
40
+ ### 2. Installation
41
+ Install the package in your Node.js project:
42
+
43
+ ```bash
44
+ npm install @pandi2352/gemini-ocr
45
+ ```
46
+
47
+ ### 3. Basic Usage (Text Extraction)
48
+ Create a file (e.g., `index.ts`) and add the following. This works for locally stored files or URLs.
49
+
50
+ ```typescript
51
+ import { processOCR } from '@pandi2352/gemini-ocr';
52
+
53
+ async function main() {
54
+ const results = await processOCR({
55
+ // Input can be a single file string or an array
56
+ input: ['./my-document.pdf'],
57
+ apiKey: process.env.GEMINI_API_KEY
58
+ });
59
+
60
+ console.log(results[0].extractedText);
61
+ }
62
+
63
+ main();
64
+ ```
65
+
66
+ ### 4. Batch Processing (Multiple Files)
67
+ Pass an array of file paths or URLs. They are processed in parallel.
68
+
69
+ ```typescript
70
+ const results = await processOCR({
71
+ input: [
72
+ './invoice_january.pdf',
73
+ 'https://example.com/receipt.jpg',
74
+ './meeting_notes.docx'
75
+ ],
76
+ apiKey: process.env.GEMINI_API_KEY,
77
+ summarize: true // Optional: Get summaries for all
78
+ });
79
+
80
+ results.forEach((doc, index) => {
81
+ if (doc.status === 'success') {
82
+ console.log(`File ${index + 1}: ${doc.summary}`);
83
+ }
84
+ });
85
+ ```
86
+
87
+ ### 5. Advanced Intelligence (Mindmaps & Entities)
88
+ Unlock the full power of AI by enabling specific flags.
89
+
90
+ ```typescript
91
+ const [result] = await processOCR({
92
+ input: ['./complex_contract.pdf'],
93
+ apiKey: process.env.GEMINI_API_KEY,
94
+
95
+ // Enable Advanced Features
96
+ mindmap: true, // Generates Mermaid.js visualization
97
+ extractEntities: true, // Extracts JSON data
98
+ entitySchema: ['Contract Value', 'Start Date', 'Parties Involved'] // Optional custom fields
99
+ });
100
+
101
+ // 1. Get the Mindmap
102
+ console.log('Mindmap Code:', result.mindmap);
103
+
104
+ // 2. Get Structured Data
105
+ console.log('Extracted Data:', result.entityResult);
106
+ /* Output:
107
+ {
108
+ "contract_value": "$50,000",
109
+ "start_date": "2024-01-01",
110
+ "parties_involved": "Company A, Vendor B"
111
+ }
112
+ */
113
+ ```
114
+
115
+ ---
116
+
117
+ ## 🛠️ Configuration Options
118
+
119
+ | Option | Type | Default | Description |
120
+ | :--- | :--- | :--- | :--- |
121
+ | `input` | `Array<string \| Buffer \| Object>` | **Required** | Array of file paths, URLs, Buffers, or Base64 strings. |
122
+ | `apiKey` | `string` | **Required** | Your Google Gemini API Key. |
123
+ | `model` | `string` | `gemini-1.5-flash` | The AI model to use. |
124
+ | `summarize` | `boolean` | `false` | Generate `metadata` (title, desc, thumbnail). |
125
+ | `mindmap` | `boolean` | `false` | Generate Mermaid.js syntax for visual mapping. |
126
+ | `extractEntities`| `boolean` | `false` | Enable structured field extraction. |
127
+ | `entitySchema` | `string[]` | `auto` | Custom fields to extract (optional). |
128
+
129
+ ---
130
+
131
+ ## 🤝 Contributing
132
+
133
+ We love contributions! Please feel free to submit a Pull Request.
134
+
135
+ 1. Fork it
136
+ 2. Create your feature branch (`git checkout -b feature/cool-feature`)
137
+ 3. Commit your changes
138
+ 4. Push to the branch
139
+ 5. Open a Pull Request
140
+
141
+ ---
142
+
143
+ <div align="center">
144
+ <sub>Built with ❤️ for developers by developers.</sub>
145
+ </div>
@@ -0,0 +1 @@
1
/** Extracts the plain-text body of a .docx file (formatting discarded; uses mammoth). */
export declare function extractTextFromDocx(buffer: Buffer): Promise<string>;
@@ -0,0 +1,11 @@
1
+ "use strict";
2
+ var __importDefault = (this && this.__importDefault) || function (mod) {
3
+ return (mod && mod.__esModule) ? mod : { "default": mod };
4
+ };
5
+ Object.defineProperty(exports, "__esModule", { value: true });
6
+ exports.extractTextFromDocx = extractTextFromDocx;
7
+ const mammoth_1 = __importDefault(require("mammoth"));
8
+ async function extractTextFromDocx(buffer) {
9
+ const result = await mammoth_1.default.extractRawText({ buffer: buffer });
10
+ return result.value;
11
+ }
@@ -0,0 +1,5 @@
1
import { OCRSource, OCROptions, OCRResult } from './types';
export * from './types';
/**
 * Batch OCR entry point: processes every source in `input` in parallel and
 * resolves with one OCRResult per source, in input order. Per-file failures
 * are returned as `status: 'error'` results rather than rejections.
 */
export declare function processOCR(options: OCROptions & {
    input: OCRSource[];
}): Promise<OCRResult[]>;
package/dist/index.js ADDED
@@ -0,0 +1,245 @@
1
+ "use strict";
2
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
3
+ if (k2 === undefined) k2 = k;
4
+ var desc = Object.getOwnPropertyDescriptor(m, k);
5
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
6
+ desc = { enumerable: true, get: function() { return m[k]; } };
7
+ }
8
+ Object.defineProperty(o, k2, desc);
9
+ }) : (function(o, m, k, k2) {
10
+ if (k2 === undefined) k2 = k;
11
+ o[k2] = m[k];
12
+ }));
13
+ var __exportStar = (this && this.__exportStar) || function(m, exports) {
14
+ for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
15
+ };
16
+ Object.defineProperty(exports, "__esModule", { value: true });
17
+ exports.processOCR = processOCR;
18
+ const pdf_lib_1 = require("pdf-lib");
19
+ const input_handler_1 = require("./input-handler");
20
+ const llm_1 = require("./llm");
21
+ const converters_1 = require("./converters");
22
+ const prompts_1 = require("./prompts");
23
+ const utils_1 = require("./utils");
24
+ __exportStar(require("./types"), exports);
25
/**
 * Normalizes a raw OCR source into a tagged input object.
 * - Buffer            -> { type: 'buffer' }
 * - 'http(s)://…'     -> { type: 'url' }
 * - 'data:…'          -> { type: 'base64' }
 * - any other string  -> { type: 'path' }
 * - tagged object     -> passed through unchanged
 */
function normalizeSource(source) {
    if (Buffer.isBuffer(source)) {
        return { type: 'buffer', value: source };
    }
    if (typeof source === 'string') {
        const trimmed = source.trim();
        // Match a real URL scheme only. The previous `startsWith('http')` check
        // also swallowed local paths such as "http_logs/file.pdf" (and its
        // `|| startsWith('https')` branch was unreachable).
        if (/^https?:\/\//i.test(trimmed)) {
            return { type: 'url', value: trimmed };
        }
        if (trimmed.startsWith('data:')) {
            return { type: 'base64', value: trimmed };
        }
        return { type: 'path', value: trimmed };
    }
    // Assume an already-tagged OCRObjectInput.
    return source;
}
42
/**
 * Batch entry point: normalizes every source in `options.input` and runs the
 * OCR pipeline on all of them concurrently.
 *
 * Note: Promise.all means fully parallel requests; a bounded concurrency pool
 * could be friendlier to API rate limits, but parallel is the standard "batch"
 * contract here. Result order matches input order.
 */
async function processOCR(options) {
    const jobs = options.input.map((source) => processSingleFile(normalizeSource(source), options));
    return Promise.all(jobs);
}
50
/**
 * Runs the full OCR pipeline for one normalized input. Always resolves —
 * never rejects — so one bad file cannot break a batch: failures come back
 * as a structured `status: 'error'` result with the same shape as success.
 */
async function processSingleFile(input, options) {
    const startTime = new Date();
    const logger = new utils_1.Logger();
    const requestId = (0, utils_1.generateRequestId)();
    logger.log(`INIT: Processing file. RequestId: ${requestId}`);
    try {
        if (!options.apiKey)
            throw new Error('Gemini API key is required.');
        const gemini = new llm_1.GeminiClient(options.apiKey, logger);
        const modelName = options.model || 'gemini-1.5-flash';
        // Input Processing: resolve path/url/base64/buffer into bytes + MIME info.
        const inputHandler = new input_handler_1.InputHandler(logger);
        const normalized = await inputHandler.processInput(input);
        const mimeType = normalized.mimeType;
        // Strategy: TEXT and DOCX are sent inline inside the prompt; everything
        // else (PDF, image, audio, video) is uploaded via the Gemini File API.
        let strategy = 'MEDIA';
        if (mimeType === 'text/plain' || mimeType === 'text/csv' || normalized.extension === 'txt' || normalized.extension === 'csv') {
            strategy = 'TEXT';
        }
        else if (mimeType === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' || normalized.extension === 'docx') {
            strategy = 'DOCX';
        }
        // Prompt Construction: pick the media-specific base prompt.
        let basePrompt = prompts_1.PDF_EXTRACTION_PROMPT;
        if (mimeType.startsWith('image/'))
            basePrompt = prompts_1.IMAGE_CONTEXT_PROMPT;
        else if (mimeType.startsWith('audio/'))
            basePrompt = prompts_1.AUDIO_CONTEXT_PROMPT;
        else if (mimeType.startsWith('video/'))
            basePrompt = prompts_1.VIDEO_CONTEXT_PROMPT;
        let finalPrompt = basePrompt;
        if (options.summarize) {
            // Asks the model to append a ```json metadata block; parsed further below.
            finalPrompt += `\n\n${prompts_1.META_JSON_PROMPT}`;
        }
        // Execution
        let analysisText = '';
        let extractedTextDocx = '';
        let fileUri;
        if (strategy === 'TEXT') {
            const content = normalized.data.toString('utf-8');
            finalPrompt += `\n\nDOCUMENT CONTENT:\n${content}`;
            analysisText = await gemini.generateContent(modelName, finalPrompt);
        }
        else if (strategy === 'DOCX') {
            // DOCX text is extracted locally (mammoth) and kept verbatim.
            extractedTextDocx = await (0, converters_1.extractTextFromDocx)(normalized.data);
            finalPrompt += `\n\nDOCUMENT CONTENT:\n${extractedTextDocx}`;
            analysisText = await gemini.generateContent(modelName, finalPrompt);
        }
        else {
            // MEDIA: upload from disk when the original path is known, otherwise
            // via a temp file, then reference the uploaded file in the request.
            if (normalized.originalPath) {
                fileUri = await gemini.uploadFile(normalized.originalPath, mimeType);
            }
            else {
                fileUri = await gemini.uploadBuffer(normalized.data, mimeType, normalized.extension);
            }
            analysisText = await gemini.generateContent(modelName, finalPrompt, { mimeType, fileUri });
        }
        // Parse Metadata (only requested when summarize is on).
        let metaJson = {};
        let mainAnalysis = analysisText;
        if (options.summarize) {
            // Try strict ```json fenced block first.
            const jsonMatch = analysisText.match(/```json\n([\s\S]*?)\n```/);
            if (jsonMatch && jsonMatch[1]) {
                try {
                    metaJson = JSON.parse(jsonMatch[1]);
                    mainAnalysis = analysisText.replace(/```json\n[\s\S]*?\n```/, '').trim();
                }
                catch (e) {
                    logger.log('Failed to parse summary JSON block');
                }
            }
            else {
                // Fallback: look for a trailing bare JSON object if there is no fenced block.
                try {
                    const lastBrace = analysisText.lastIndexOf('}');
                    const firstBrace = analysisText.lastIndexOf('{'); // Risky if multiple/nested objects
                    if (lastBrace > firstBrace && firstBrace !== -1) {
                        const potentialJson = analysisText.substring(firstBrace, lastBrace + 1);
                        // Minimal check that it looks like our metadata shape.
                        if (potentialJson.includes('"title"')) {
                            metaJson = JSON.parse(potentialJson);
                            mainAnalysis = analysisText.substring(0, firstBrace).trim();
                        }
                    }
                }
                catch (e) { } // best-effort: metadata stays empty on parse failure
            }
        }
        // Mindmap: optional second model call; failure is logged, never fatal.
        let mindmap = null;
        if (options.mindmap) {
            const enrichPrompt = `${prompts_1.ENRICHMENT_PROMPT}\n\nCONTEXT:\n${mainAnalysis}`;
            try {
                const enrichRes = await gemini.generateContent(modelName, enrichPrompt);
                // Preferred: JSON object with a `mermaid` key (as the prompt requests);
                // fallback: a ```mermaid fenced block; last resort: the raw reply.
                const jsonPart = enrichRes.match(/\{[\s\S]*\}/);
                const mermaidPart = enrichRes.match(/```mermaid\n([\s\S]*?)\n```/);
                if (jsonPart) {
                    const parsed = JSON.parse(jsonPart[0]);
                    mindmap = parsed.mermaid || null;
                }
                else if (mermaidPart) {
                    mindmap = mermaidPart[1];
                }
                else {
                    mindmap = enrichRes.replace(/```/g, '');
                }
            }
            catch (e) {
                logger.log(`Mindmap failed: ${e}`);
            }
        }
        // Entities: optional third model call; failure is logged, never fatal.
        let entityResult = null;
        if (options.extractEntities) {
            let entityPromptStr = options.entitySchema
                ? (0, prompts_1.generateEntityPrompt)(options.entitySchema)
                : prompts_1.AUTO_ENTITY_EXTRACTION_PROMPT;
            try {
                // DOCX uses the verbatim extracted text; other strategies use the analysis.
                const context = strategy === 'DOCX' ? extractedTextDocx : mainAnalysis;
                const finalEntityPrompt = `${entityPromptStr}\n\nDATA CONTEXT:\n${context}`;
                const res = await gemini.generateContent(modelName, finalEntityPrompt);
                const json = res.match(/\{[\s\S]*\}/);
                if (json) {
                    entityResult = JSON.parse(json[0]);
                }
            }
            catch (e) {
                logger.log(`Entity extraction failed: ${e}`);
            }
        }
        // Page Count: PDFs only; everything else defaults to 1.
        let pageCount = 1;
        if (mimeType === 'application/pdf') {
            try {
                const pdfDoc = await pdf_lib_1.PDFDocument.load(normalized.data);
                pageCount = pdfDoc.getPageCount();
            }
            catch (e) { } // unreadable/encrypted PDFs keep the default of 1
        }
        const endTime = new Date();
        // Success Result
        return {
            status: 'success',
            error: null,
            extractedText: strategy === 'DOCX' ? extractedTextDocx : mainAnalysis,
            summary: metaJson.description || null,
            metadata: metaJson,
            mindmap,
            entityResult,
            pageCount,
            language: 'en', // NOTE(review): hard-coded, not detected from content
            documentType: 'unknown', // NOTE(review): classification not implemented
            confidence: null,
            timings: {
                startTime: startTime.toISOString(),
                endTime: endTime.toISOString(),
                durationMs: (0, utils_1.calculateDuration)(startTime, endTime)
            },
            logs: {
                events: logger.getEvents(),
                geminiModel: modelName,
                requestId
            },
            raw: options.includeRaw ? analysisText : null
        };
    }
    catch (error) {
        const endTime = new Date();
        // Error Result (Structured): batch callers get a uniform shape either way.
        return {
            status: 'error',
            error: error.message || 'Unknown error',
            extractedText: '',
            summary: null,
            mindmap: null,
            entityResult: null,
            metadata: {},
            language: null,
            documentType: null,
            pageCount: 0,
            confidence: null,
            timings: {
                startTime: startTime.toISOString(),
                endTime: endTime.toISOString(),
                durationMs: (0, utils_1.calculateDuration)(startTime, endTime)
            },
            logs: {
                events: logger.getEvents(),
                geminiModel: options.model || 'unknown',
                requestId
            },
            raw: null
        };
    }
}
@@ -0,0 +1,7 @@
1
import { OCRObjectInput, NormalizedFile } from './types';
import { Logger } from './utils';
/** Resolves any supported input kind (path/url/base64/buffer) into raw bytes plus MIME info. */
export declare class InputHandler {
    private logger;
    constructor(logger: Logger);
    /**
     * Loads the bytes for `input` and detects its MIME type and extension.
     * Rejects on unreadable paths, failed downloads, or an unknown input type.
     */
    processInput(input: OCRObjectInput): Promise<NormalizedFile>;
}
@@ -0,0 +1,88 @@
1
+ "use strict";
2
+ var __importDefault = (this && this.__importDefault) || function (mod) {
3
+ return (mod && mod.__esModule) ? mod : { "default": mod };
4
+ };
5
+ Object.defineProperty(exports, "__esModule", { value: true });
6
+ exports.InputHandler = void 0;
7
+ const promises_1 = __importDefault(require("fs/promises"));
8
+ const axios_1 = __importDefault(require("axios"));
9
+ const file_type_1 = require("file-type");
10
+ const mime_types_1 = __importDefault(require("mime-types"));
11
+ const path_1 = __importDefault(require("path"));
12
+ class InputHandler {
13
+ constructor(logger) {
14
+ this.logger = logger;
15
+ }
16
+ async processInput(input) {
17
+ this.logger.log(`Processing input type: ${input.type}`);
18
+ let buffer;
19
+ let originalPath;
20
+ switch (input.type) {
21
+ case 'path':
22
+ if (typeof input.value !== 'string') {
23
+ throw new Error('Input value for "path" must be a string.');
24
+ }
25
+ originalPath = path_1.default.resolve(input.value);
26
+ this.logger.log(`Reading file from path: ${originalPath}`);
27
+ buffer = await promises_1.default.readFile(originalPath);
28
+ break;
29
+ case 'url':
30
+ if (typeof input.value !== 'string') {
31
+ throw new Error('Input value for "url" must be a string.');
32
+ }
33
+ this.logger.log(`Fetching from URL: ${input.value}`);
34
+ const response = await axios_1.default.get(input.value, { responseType: 'arraybuffer' });
35
+ buffer = Buffer.from(response.data);
36
+ break;
37
+ case 'base64':
38
+ if (typeof input.value !== 'string') {
39
+ throw new Error('Input value for "base64" must be a string.');
40
+ }
41
+ this.logger.log('Decoding Base64 content');
42
+ const base64Data = input.value.replace(/^data:.*?;base64,/, '');
43
+ buffer = Buffer.from(base64Data, 'base64');
44
+ break;
45
+ case 'buffer':
46
+ if (!Buffer.isBuffer(input.value)) {
47
+ throw new Error('Input value for "buffer" must be a Buffer.');
48
+ }
49
+ this.logger.log('Using provided Buffer');
50
+ buffer = input.value;
51
+ break;
52
+ default:
53
+ throw new Error(`Unsupported input type: ${input.type}`);
54
+ }
55
+ const typeInfo = await (0, file_type_1.fromBuffer)(buffer);
56
+ let mimeType = typeInfo?.mime;
57
+ let extension = typeInfo?.ext;
58
+ // Fallback if file-type cannot detect
59
+ if (!mimeType) {
60
+ if (input.type === 'path') {
61
+ const lookup = mime_types_1.default.lookup(input.value);
62
+ if (lookup) {
63
+ mimeType = lookup;
64
+ extension = mime_types_1.default.extension(lookup) || undefined;
65
+ }
66
+ }
67
+ if (!mimeType && input.type === 'url') {
68
+ const lookup = mime_types_1.default.lookup(input.value);
69
+ if (lookup) {
70
+ mimeType = lookup;
71
+ extension = mime_types_1.default.extension(lookup) || undefined;
72
+ }
73
+ }
74
+ if (!mimeType) {
75
+ mimeType = 'application/octet-stream';
76
+ extension = 'bin';
77
+ }
78
+ }
79
+ this.logger.log(`Detected MIME type: ${mimeType}, Extension: ${extension}`);
80
+ return {
81
+ data: buffer,
82
+ mimeType: mimeType,
83
+ extension,
84
+ originalPath
85
+ };
86
+ }
87
+ }
88
+ exports.InputHandler = InputHandler;
package/dist/llm.d.ts ADDED
@@ -0,0 +1,18 @@
1
import { GenerativeModel } from '@google/generative-ai';
import { Logger } from './utils';
/** Thin wrapper around the Gemini SDK: model access, File API uploads, generation. */
export declare class GeminiClient {
    private genAI;
    private fileManager;
    private logger;
    constructor(apiKey: string, logger: Logger);
    /** Returns a GenerativeModel handle. `mimeType` is currently unused. */
    getModel(modelName: string, mimeType?: string): GenerativeModel;
    /** Uploads a file from disk via the Gemini File API; resolves to its URI. */
    uploadFile(filePath: string, mimeType: string): Promise<string>;
    /** Writes the buffer to a temp file, uploads it, then removes the temp file. */
    uploadBuffer(buffer: Buffer, mimeType: string, extension?: string): Promise<string>;
    /** Generates text for `prompt`, optionally attaching uploaded or inline file data. */
    generateContent(modelName: string, prompt: string, fileData?: {
        mimeType: string;
        fileUri: string;
    } | {
        mimeType: string;
        inlineData: string;
    }): Promise<string>;
}
package/dist/llm.js ADDED
@@ -0,0 +1,100 @@
1
+ "use strict";
2
+ var __importDefault = (this && this.__importDefault) || function (mod) {
3
+ return (mod && mod.__esModule) ? mod : { "default": mod };
4
+ };
5
+ Object.defineProperty(exports, "__esModule", { value: true });
6
+ exports.GeminiClient = void 0;
7
+ const generative_ai_1 = require("@google/generative-ai");
8
+ const server_1 = require("@google/generative-ai/server");
9
+ const fs_1 = __importDefault(require("fs"));
10
+ const path_1 = __importDefault(require("path"));
11
+ const os_1 = __importDefault(require("os"));
12
+ const uuid_1 = require("uuid");
13
+ class GeminiClient {
14
+ constructor(apiKey, logger) {
15
+ this.genAI = new generative_ai_1.GoogleGenerativeAI(apiKey);
16
+ this.fileManager = new server_1.GoogleAIFileManager(apiKey);
17
+ this.logger = logger;
18
+ }
19
+ getModel(modelName, mimeType = 'text/plain') {
20
+ return this.genAI.getGenerativeModel({
21
+ model: modelName,
22
+ // generationConfig: { responseMimeType: mimeType } // Handle strictly in generate call if needed
23
+ });
24
+ }
25
+ async uploadFile(filePath, mimeType) {
26
+ this.logger.log(`Uploading file to Gemini: ${filePath} (${mimeType})`);
27
+ try {
28
+ const uploadResult = await this.fileManager.uploadFile(filePath, {
29
+ mimeType,
30
+ displayName: path_1.default.basename(filePath),
31
+ });
32
+ this.logger.log(`Upload successful. API Name: ${uploadResult.file.name}, URI: ${uploadResult.file.uri}`);
33
+ return uploadResult.file.uri;
34
+ }
35
+ catch (error) {
36
+ this.logger.log(`Upload failed: ${error.message}`);
37
+ throw error;
38
+ }
39
+ }
40
+ async uploadBuffer(buffer, mimeType, extension = 'tmp') {
41
+ const tempDir = os_1.default.tmpdir();
42
+ const tempFilePath = path_1.default.join(tempDir, `gemini-ocr-${(0, uuid_1.v4)()}.${extension}`);
43
+ try {
44
+ this.logger.log(`Writing buffer to temp file: ${tempFilePath}`);
45
+ await fs_1.default.promises.writeFile(tempFilePath, buffer);
46
+ const uri = await this.uploadFile(tempFilePath, mimeType);
47
+ // Cleanup temp file immediately after upload?
48
+ // Documentation says file must be uploaded. After upload we can delete local file.
49
+ // But we should wait for processing? No, once uploaded to Gemini (File API), we can delete local.
50
+ // BUT if the file is processing (video), it might take time to be 'ACTIVE'.
51
+ // Upload returns immediately. State might be 'PROCESSING'.
52
+ return uri;
53
+ }
54
+ finally {
55
+ // Clean up temp file
56
+ if (fs_1.default.existsSync(tempFilePath)) {
57
+ try {
58
+ await fs_1.default.promises.unlink(tempFilePath);
59
+ this.logger.log(`Deleted temp file: ${tempFilePath}`);
60
+ }
61
+ catch (e) {
62
+ this.logger.log(`Failed to delete temp file: ${e}`);
63
+ }
64
+ }
65
+ }
66
+ }
67
+ async generateContent(modelName, prompt, fileData) {
68
+ const model = this.getModel(modelName);
69
+ const parts = [{ text: prompt }];
70
+ if (fileData) {
71
+ if ('fileUri' in fileData) {
72
+ parts.push({
73
+ fileData: {
74
+ mimeType: fileData.mimeType,
75
+ fileUri: fileData.fileUri
76
+ }
77
+ });
78
+ }
79
+ else if ('inlineData' in fileData) {
80
+ parts.push({
81
+ inlineData: {
82
+ mimeType: fileData.mimeType,
83
+ data: fileData.inlineData
84
+ }
85
+ });
86
+ }
87
+ }
88
+ this.logger.log(`Generating content with model: ${modelName}`);
89
+ try {
90
+ const result = await model.generateContent(parts);
91
+ const response = await result.response;
92
+ return response.text();
93
+ }
94
+ catch (error) {
95
+ this.logger.log(`Gemini generation error: ${error.message}`);
96
+ throw error;
97
+ }
98
+ }
99
+ }
100
+ exports.GeminiClient = GeminiClient;
@@ -0,0 +1,8 @@
1
/** Mindmap pass: asks the model for Mermaid `mindmap` code wrapped in a JSON object. */
export declare const ENRICHMENT_PROMPT = "\nYou are an expert content analyzer and information architect. Your task is to visualize the core structure and key concepts of the provided document using a **Mermaid.js Mindmap**.\n\n**Objective**: Create a \"Modern Mindmap\" that maps out the central theme, main branches (key topics), and sub-branches (details/evidence).\n\n**Instructions**:\n1. **Analyze**: Understand the central thesis and hierarchical structure of the content.\n2. **Generate Mermaid Code**: Output valid Mermaid `mindmap` syntax.\n * Start with `mindmap` keyword.\n * Use **2 spaces** indentation for hierarchy (no tabs).\n * **CRITICAL**: If a node label contains parentheses `()`, brackets `[]`, or special characters, you MUST wrap the text in quotes, e.g., `node[\"Label with (parens)\"]`.\n * Fail-safe: If unsure, wrap the label in double quotes.\n * Root node should be the Document Title or Main Topic.\n * Keep node labels concise (1-4 words preferred).\n * **Style**: Aim for a balanced, radial or hierarchical structure that is easy to read.\n3. **Strictly NO Markdown**: Return ONLY the mermaid code inside a JSON object.\n\n**Output Format**:\nReturn a JSON object strictly following this structure. Do NOT wrap it in markdown code blocks.\n\n{\n \"mermaid\": \"mindmap\\n root((...))\"\n}\n";
/** Default base prompt for documents (PDF and unknown media): verbatim extraction. */
export declare const PDF_EXTRACTION_PROMPT = "**Objective:** Perform a comprehensive extraction and analysis of the provided document.\n\n**Instructions:**\n1. **Text Extraction:** Extract all textual content/OCR from the document. Preserve logical structure.\n2. **Context:** If images/charts are present, describe them briefly in-flow.\n\n**Output:**\nReturn ONLY the extracted text and descriptions. Do not add conversational filler.\n";
/** Base prompt for image inputs. */
export declare const IMAGE_CONTEXT_PROMPT = "Analyze the uploaded image. Extract all text verbatim and describe the visual content in detail.";
/** Base prompt for audio inputs. */
export declare const AUDIO_CONTEXT_PROMPT = "Analyze the audio. Transcribe speech and describe sounds/music.";
/** Base prompt for video inputs. */
export declare const VIDEO_CONTEXT_PROMPT = "Analyze the video. Transcribe speech, describe visual scenes, actions, and text on screen.";
/** Appended when `summarize` is on: requests a trailing ```json metadata block. */
export declare const META_JSON_PROMPT = "\n*** METADATA REQUIREMENT ***\n\nAppend the following JSON object at the VERY END of your response.\n\n```json\n{\n \"title\": \"Short descriptive title\",\n \"description\": \"Concise summary (1-2 sentences)\",\n \"thumbnail\": \"Visual description for a thumbnail\"\n}\n```\n";
/** Used when `extractEntities` is on without a custom schema. */
export declare const AUTO_ENTITY_EXTRACTION_PROMPT = "\nYou are an expert entity extractor. Automatically identify significant entities (Names, Dates, IDs, amounts).\nReturn valid JSON with snake_case keys.\n";
/** Builds an extraction prompt from a field-name list or an example schema object. */
export declare const generateEntityPrompt: (fields: string[] | any) => string;
@@ -0,0 +1,73 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.generateEntityPrompt = exports.AUTO_ENTITY_EXTRACTION_PROMPT = exports.META_JSON_PROMPT = exports.VIDEO_CONTEXT_PROMPT = exports.AUDIO_CONTEXT_PROMPT = exports.IMAGE_CONTEXT_PROMPT = exports.PDF_EXTRACTION_PROMPT = exports.ENRICHMENT_PROMPT = void 0;
4
+ exports.ENRICHMENT_PROMPT = `
5
+ You are an expert content analyzer and information architect. Your task is to visualize the core structure and key concepts of the provided document using a **Mermaid.js Mindmap**.
6
+
7
+ **Objective**: Create a "Modern Mindmap" that maps out the central theme, main branches (key topics), and sub-branches (details/evidence).
8
+
9
+ **Instructions**:
10
+ 1. **Analyze**: Understand the central thesis and hierarchical structure of the content.
11
+ 2. **Generate Mermaid Code**: Output valid Mermaid \`mindmap\` syntax.
12
+ * Start with \`mindmap\` keyword.
13
+ * Use **2 spaces** indentation for hierarchy (no tabs).
14
+ * **CRITICAL**: If a node label contains parentheses \`()\`, brackets \`[]\`, or special characters, you MUST wrap the text in quotes, e.g., \`node["Label with (parens)"]\`.
15
+ * Fail-safe: If unsure, wrap the label in double quotes.
16
+ * Root node should be the Document Title or Main Topic.
17
+ * Keep node labels concise (1-4 words preferred).
18
+ * **Style**: Aim for a balanced, radial or hierarchical structure that is easy to read.
19
+ 3. **Strictly NO Markdown**: Return ONLY the mermaid code inside a JSON object.
20
+
21
+ **Output Format**:
22
+ Return a JSON object strictly following this structure. Do NOT wrap it in markdown code blocks.
23
+
24
+ {
25
+ "mermaid": "mindmap\\n root((...))"
26
+ }
27
+ `;
28
+ exports.PDF_EXTRACTION_PROMPT = `**Objective:** Perform a comprehensive extraction and analysis of the provided document.
29
+
30
+ **Instructions:**
31
+ 1. **Text Extraction:** Extract all textual content/OCR from the document. Preserve logical structure.
32
+ 2. **Context:** If images/charts are present, describe them briefly in-flow.
33
+
34
+ **Output:**
35
+ Return ONLY the extracted text and descriptions. Do not add conversational filler.
36
+ `;
37
+ exports.IMAGE_CONTEXT_PROMPT = `Analyze the uploaded image. Extract all text verbatim and describe the visual content in detail.`;
38
+ exports.AUDIO_CONTEXT_PROMPT = `Analyze the audio. Transcribe speech and describe sounds/music.`;
39
+ exports.VIDEO_CONTEXT_PROMPT = `Analyze the video. Transcribe speech, describe visual scenes, actions, and text on screen.`;
40
+ exports.META_JSON_PROMPT = `
41
+ *** METADATA REQUIREMENT ***
42
+
43
+ Append the following JSON object at the VERY END of your response.
44
+
45
+ \`\`\`json
46
+ {
47
+ "title": "Short descriptive title",
48
+ "description": "Concise summary (1-2 sentences)",
49
+ "thumbnail": "Visual description for a thumbnail"
50
+ }
51
+ \`\`\`
52
+ `;
53
+ exports.AUTO_ENTITY_EXTRACTION_PROMPT = `
54
+ You are an expert entity extractor. Automatically identify significant entities (Names, Dates, IDs, amounts).
55
+ Return valid JSON with snake_case keys.
56
+ `;
57
+ const generateEntityPrompt = (fields) => {
58
+ let fieldsList = '';
59
+ let jsonJSON = {};
60
+ if (Array.isArray(fields)) {
61
+ fieldsList = fields.join(', ');
62
+ jsonJSON = fields.reduce((acc, curr) => ({ ...acc, [curr.toLowerCase().replace(/ /g, '_')]: "..." }), {});
63
+ }
64
+ else {
65
+ jsonJSON = fields;
66
+ }
67
+ return `
68
+ Extract the following entities: ${fieldsList || 'from schema'}.
69
+ Return strictly JSON matching this structure:
70
+ ${JSON.stringify(jsonJSON, null, 2)}
71
+ `;
72
+ };
73
+ exports.generateEntityPrompt = generateEntityPrompt;
@@ -0,0 +1,55 @@
1
/** Discriminator for object-form inputs. */
export type InputType = 'url' | 'base64' | 'buffer' | 'path';
/** Explicit, tagged input form. */
export interface OCRObjectInput {
    type: InputType;
    value: string | Buffer;
}
/** Anything processOCR accepts: path/URL/base64 string, raw Buffer, or tagged object. */
export type OCRSource = string | Buffer | OCRObjectInput;
/** Options accepted by processOCR; `apiKey` is the only required field. */
export interface OCROptions {
    model?: string;
    apiKey: string;
    includeRaw?: boolean;
    summarize?: boolean;
    mindmap?: boolean;
    extractEntities?: boolean;
    pageLimit?: number;
    entitySchema?: string[] | any;
    classify?: boolean;
}
/** Wall-clock timing (ISO strings) of one file's processing. */
export interface OCRTimings {
    startTime: string;
    endTime: string;
    durationMs: number;
}
/** Log lines and request bookkeeping for one processed file. */
export interface OCRLogs {
    events: string[];
    geminiModel: string;
    requestId: string | null;
}
/** Model-generated metadata, populated when `summarize` is enabled. */
export interface DocumentMetadata {
    title?: string;
    description?: string;
    thumbnail?: string;
    [key: string]: any;
}
/** Per-file result; `status: 'error'` results carry the message in `error`. */
export interface OCRResult {
    status: 'success' | 'error';
    error: string | null;
    extractedText: string;
    summary: string | null;
    mindmap: string | null;
    entityResult: any | null;
    metadata: DocumentMetadata;
    language: string | null;
    documentType: string | null;
    pageCount: number;
    confidence: number | null;
    timings: OCRTimings;
    logs: OCRLogs;
    raw: any | null;
}
/** Raw bytes plus detected MIME info, produced by InputHandler. */
export interface NormalizedFile {
    mimeType: string;
    data: Buffer;
    extension?: string;
    originalPath?: string;
}
package/dist/types.js ADDED
@@ -0,0 +1,2 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
@@ -0,0 +1,7 @@
1
/** In-memory collector of timestamped log lines for a single OCR request. */
export declare class Logger {
    private events;
    /** Appends `message` prefixed with the current ISO timestamp. */
    log(message: string): void;
    /** Returns the accumulated event lines (the live array, not a copy). */
    getEvents(): string[];
}
/** Returns a random UUID used to correlate all logs belonging to one request. */
export declare function generateRequestId(): string;
/** Elapsed milliseconds: endTime - startTime. */
export declare function calculateDuration(startTime: Date, endTime: Date): number;
package/dist/utils.js ADDED
@@ -0,0 +1,25 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.Logger = void 0;
4
+ exports.generateRequestId = generateRequestId;
5
+ exports.calculateDuration = calculateDuration;
6
+ const uuid_1 = require("uuid");
7
+ class Logger {
8
+ constructor() {
9
+ this.events = [];
10
+ }
11
+ log(message) {
12
+ const timestamp = new Date().toISOString();
13
+ this.events.push(`${timestamp} - ${message}`);
14
+ }
15
+ getEvents() {
16
+ return this.events;
17
+ }
18
+ }
19
+ exports.Logger = Logger;
20
+ function generateRequestId() {
21
+ return (0, uuid_1.v4)();
22
+ }
23
+ function calculateDuration(startTime, endTime) {
24
+ return endTime.getTime() - startTime.getTime();
25
+ }
package/package.json ADDED
@@ -0,0 +1,53 @@
1
+ {
2
+ "name": "@pandi2352/gemini-ocr",
3
+ "version": "1.0.0",
4
+ "description": "A lightweight OCR processing wrapper using Google Gemini Vision models.",
5
+ "publishConfig": {
6
+ "access": "public"
7
+ },
8
+ "main": "dist/index.js",
9
+ "types": "dist/index.d.ts",
10
+ "author": "Pandi2352",
11
+ "license": "MIT",
12
+ "repository": {
13
+ "type": "git",
14
+ "url": "git+https://github.com/Pandi2352/npm-gemini-ocr.git"
15
+ },
16
+ "files": [
17
+ "dist",
18
+ "README.md",
19
+ "LICENSE"
20
+ ],
21
+ "scripts": {
22
+ "build": "rimraf dist && tsc",
23
+ "prepublishOnly": "npm run build",
24
+ "test": "ts-node test.ts"
25
+ },
26
+ "keywords": [
27
+ "ocr",
28
+ "gemini",
29
+ "google",
30
+ "ai",
31
+ "pdf",
32
+ "image",
33
+ "vision"
34
+ ],
35
+ "devDependencies": {
36
+ "@types/mime-types": "^3.0.1",
37
+ "@types/uuid": "^10.0.0",
38
+ "rimraf": "^6.1.2",
39
+ "ts-node": "^10.9.2"
40
+ },
41
+ "dependencies": {
42
+ "@google/generative-ai": "^0.24.1",
43
+ "@types/node": "^25.0.3",
44
+ "axios": "^1.13.2",
45
+ "dotenv": "^17.2.3",
46
+ "file-type": "^16.5.4",
47
+ "mammoth": "^1.8.0",
48
+ "mime-types": "^3.0.2",
49
+ "pdf-lib": "^1.17.1",
50
+ "typescript": "^5.9.3",
51
+ "uuid": "^13.0.0"
52
+ }
53
+ }