@pandi2352/gemini-ocr 2.0.0 → 4.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -112,6 +112,23 @@ console.log('Extracted Data:', result.entityResult);
112
112
  */
113
113
  ```
114
114
 
115
+
116
+
117
+ ### 6. Realtime Progress Feedback
118
+ Get granular updates on the processing stages.
119
+
120
+ ```typescript
121
+ await processOCR({
122
+ input: ['./large_document.pdf'],
123
+ apiKey: process.env.GEMINI_API_KEY,
124
+
125
+ onProgress: (stage, message) => {
126
+ // stage: 'init' | 'upload' | 'generate_text' | 'enrich' | 'complete'
127
+ console.log(`[${stage}]: ${message}`);
128
+ }
129
+ });
130
+ ```
131
+
115
132
  ---
116
133
 
117
134
  ## 🛠️ Configuration Options
@@ -120,11 +137,12 @@ console.log('Extracted Data:', result.entityResult);
120
137
  | :--- | :--- | :--- | :--- |
121
138
  | `input` | `Array<string \| Buffer \| Object>` | **Required** | Array of file paths, URLs, Buffers, or Base64 strings. |
122
139
  | `apiKey` | `string` | **Required** | Your Google Gemini API Key. |
123
- | `model` | `string` | `gemini-1.5-flash` | The AI model to use. |
140
+ | `model` | `string` | `gemini-2.5-flash` | The AI model (use `gemini-1.5-flash-8b` for speed). |
124
141
  | `summarize` | `boolean` | `false` | Generate `metadata` (title, desc, thumbnail). |
125
142
  | `mindmap` | `boolean` | `false` | Generate Mermaid.js syntax for visual mapping. |
126
143
  | `extractEntities`| `boolean` | `false` | Enable structured field extraction. |
127
144
  | `entitySchema` | `string[]` | `auto` | Custom fields to extract (optional). |
145
+ | `onProgress` | `(stage, message) => void` | `undefined` | Callback for realtime progress updates. |
128
146
 
129
147
  ---
130
148
 
package/dist/index.js CHANGED
@@ -52,15 +52,19 @@ async function processSingleFile(input, options) {
52
52
  const logger = new utils_1.Logger();
53
53
  const requestId = (0, utils_1.generateRequestId)();
54
54
  logger.log(`INIT: Processing file. RequestId: ${requestId}`);
55
+ if (options.onProgress)
56
+ options.onProgress('init', 'Initializing processing...');
55
57
  try {
56
58
  if (!options.apiKey)
57
59
  throw new Error('Gemini API key is required.');
58
60
  const gemini = new llm_1.GeminiClient(options.apiKey, logger);
59
- const modelName = options.model || 'gemini-1.5-flash';
61
+ const modelName = options.model || 'gemini-2.5-flash';
60
62
  // Input Processing
61
63
  const inputHandler = new input_handler_1.InputHandler(logger);
62
64
  const normalized = await inputHandler.processInput(input);
63
65
  const mimeType = normalized.mimeType;
66
+ if (options.onProgress)
67
+ options.onProgress('upload', 'Processing input file...');
64
68
  // Strategy
65
69
  let strategy = 'MEDIA';
66
70
  if (mimeType === 'text/plain' || mimeType === 'text/csv' || normalized.extension === 'txt' || normalized.extension === 'csv') {
@@ -85,6 +89,8 @@ async function processSingleFile(input, options) {
85
89
  let analysisText = '';
86
90
  let extractedTextDocx = '';
87
91
  let fileUri;
92
+ if (options.onProgress)
93
+ options.onProgress('generate_text', 'Generating analysis...');
88
94
  if (strategy === 'TEXT') {
89
95
  const content = normalized.data.toString('utf-8');
90
96
  finalPrompt += `\n\nDOCUMENT CONTENT:\n${content}`;
@@ -136,48 +142,58 @@ async function processSingleFile(input, options) {
136
142
  catch (e) { }
137
143
  }
138
144
  }
139
- // Mindmap
145
+ // Parallel Processing for Advanced Features
146
+ // Optimization Note: Future version can combine these into a single "Mega-Prompt" to reduce HTTP round-trips.
147
+ if (options.onProgress && (options.mindmap || options.extractEntities)) {
148
+ options.onProgress('enrich', 'Generating mindmap/entities...');
149
+ }
150
+ const tasks = [];
140
151
  let mindmap = null;
152
+ let entityResult = null;
141
153
  if (options.mindmap) {
142
154
  const enrichPrompt = `${prompts_1.ENRICHMENT_PROMPT}\n\nCONTEXT:\n${mainAnalysis}`;
143
- try {
144
- const enrichRes = await gemini.generateContent(modelName, enrichPrompt);
145
- const jsonPart = enrichRes.match(/\{[\s\S]*\}/);
146
- const mermaidPart = enrichRes.match(/```mermaid\n([\s\S]*?)\n```/);
147
- if (jsonPart) {
148
- const parsed = JSON.parse(jsonPart[0]);
149
- mindmap = parsed.mermaid || null;
150
- }
151
- else if (mermaidPart) {
152
- mindmap = mermaidPart[1];
155
+ tasks.push((async () => {
156
+ try {
157
+ const enrichRes = await gemini.generateContent(modelName, enrichPrompt);
158
+ const jsonPart = enrichRes.match(/\{[\s\S]*\}/);
159
+ const mermaidPart = enrichRes.match(/```mermaid\n([\s\S]*?)\n```/);
160
+ if (jsonPart) {
161
+ const parsed = JSON.parse(jsonPart[0]);
162
+ mindmap = parsed.mermaid || null;
163
+ }
164
+ else if (mermaidPart) {
165
+ mindmap = mermaidPart[1];
166
+ }
167
+ else {
168
+ mindmap = enrichRes.replace(/```/g, '');
169
+ }
153
170
  }
154
- else {
155
- mindmap = enrichRes.replace(/```/g, '');
171
+ catch (e) {
172
+ logger.log(`Mindmap failed: ${e}`);
156
173
  }
157
- }
158
- catch (e) {
159
- logger.log(`Mindmap failed: ${e}`);
160
- }
174
+ })());
161
175
  }
162
- // Entities
163
- let entityResult = null;
164
176
  if (options.extractEntities) {
165
177
  let entityPromptStr = options.entitySchema
166
178
  ? (0, prompts_1.generateEntityPrompt)(options.entitySchema)
167
179
  : prompts_1.AUTO_ENTITY_EXTRACTION_PROMPT;
168
- try {
169
- const context = strategy === 'DOCX' ? extractedTextDocx : mainAnalysis;
170
- const finalEntityPrompt = `${entityPromptStr}\n\nDATA CONTEXT:\n${context}`;
171
- const res = await gemini.generateContent(modelName, finalEntityPrompt);
172
- const json = res.match(/\{[\s\S]*\}/);
173
- if (json) {
174
- entityResult = JSON.parse(json[0]);
180
+ tasks.push((async () => {
181
+ try {
182
+ const context = strategy === 'DOCX' ? extractedTextDocx : mainAnalysis;
183
+ const finalEntityPrompt = `${entityPromptStr}\n\nDATA CONTEXT:\n${context}`;
184
+ const res = await gemini.generateContent(modelName, finalEntityPrompt);
185
+ const json = res.match(/\{[\s\S]*\}/);
186
+ if (json) {
187
+ entityResult = JSON.parse(json[0]);
188
+ }
175
189
  }
176
- }
177
- catch (e) {
178
- logger.log(`Entity extraction failed: ${e}`);
179
- }
190
+ catch (e) {
191
+ logger.log(`Entity extraction failed: ${e}`);
192
+ }
193
+ })());
180
194
  }
195
+ // Wait for all parallel tasks to complete
196
+ await Promise.all(tasks);
181
197
  // Page Count
182
198
  let pageCount = 1;
183
199
  if (mimeType === 'application/pdf') {
@@ -188,6 +204,8 @@ async function processSingleFile(input, options) {
188
204
  catch (e) { }
189
205
  }
190
206
  const endTime = new Date();
207
+ if (options.onProgress)
208
+ options.onProgress('complete', 'Processing complete.');
191
209
  // Success Result
192
210
  return {
193
211
  status: 'success',
package/dist/types.d.ts CHANGED
@@ -14,6 +14,7 @@ export interface OCROptions {
14
14
  pageLimit?: number;
15
15
  entitySchema?: string[] | any;
16
16
  classify?: boolean;
17
+ onProgress?: (stage: string, message: string) => void;
17
18
  }
18
19
  export interface OCRTimings {
19
20
  startTime: string;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@pandi2352/gemini-ocr",
3
- "version": "2.0.0",
3
+ "version": "4.0.0",
4
4
  "description": "A lightweight OCR processing wrapper using Google Gemini Vision models.",
5
5
  "publishConfig": {
6
6
  "access": "public"