isa-model 0.3.4__py3-none-any.whl → 0.3.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49):
  1. isa_model/config/__init__.py +9 -0
  2. isa_model/config/config_manager.py +213 -0
  3. isa_model/core/model_manager.py +5 -0
  4. isa_model/core/model_registry.py +39 -6
  5. isa_model/core/storage/supabase_storage.py +344 -0
  6. isa_model/core/vision_models_init.py +116 -0
  7. isa_model/deployment/cloud/__init__.py +9 -0
  8. isa_model/deployment/cloud/modal/__init__.py +10 -0
  9. isa_model/deployment/cloud/modal/isa_vision_doc_service.py +612 -0
  10. isa_model/deployment/cloud/modal/isa_vision_ui_service.py +305 -0
  11. isa_model/inference/ai_factory.py +238 -14
  12. isa_model/inference/providers/modal_provider.py +109 -0
  13. isa_model/inference/providers/yyds_provider.py +108 -0
  14. isa_model/inference/services/__init__.py +2 -1
  15. isa_model/inference/services/base_service.py +0 -38
  16. isa_model/inference/services/llm/base_llm_service.py +32 -0
  17. isa_model/inference/services/llm/llm_adapter.py +40 -0
  18. isa_model/inference/services/llm/ollama_llm_service.py +104 -3
  19. isa_model/inference/services/llm/openai_llm_service.py +67 -15
  20. isa_model/inference/services/llm/yyds_llm_service.py +254 -0
  21. isa_model/inference/services/stacked/__init__.py +26 -0
  22. isa_model/inference/services/stacked/base_stacked_service.py +269 -0
  23. isa_model/inference/services/stacked/config.py +426 -0
  24. isa_model/inference/services/stacked/doc_analysis_service.py +640 -0
  25. isa_model/inference/services/stacked/flux_professional_service.py +579 -0
  26. isa_model/inference/services/stacked/ui_analysis_service.py +1319 -0
  27. isa_model/inference/services/vision/base_image_gen_service.py +0 -34
  28. isa_model/inference/services/vision/base_vision_service.py +46 -2
  29. isa_model/inference/services/vision/isA_vision_service.py +402 -0
  30. isa_model/inference/services/vision/openai_vision_service.py +151 -9
  31. isa_model/inference/services/vision/replicate_image_gen_service.py +166 -38
  32. isa_model/inference/services/vision/replicate_vision_service.py +693 -0
  33. isa_model/serving/__init__.py +19 -0
  34. isa_model/serving/api/__init__.py +10 -0
  35. isa_model/serving/api/fastapi_server.py +84 -0
  36. isa_model/serving/api/middleware/__init__.py +9 -0
  37. isa_model/serving/api/middleware/request_logger.py +88 -0
  38. isa_model/serving/api/routes/__init__.py +5 -0
  39. isa_model/serving/api/routes/health.py +82 -0
  40. isa_model/serving/api/routes/llm.py +19 -0
  41. isa_model/serving/api/routes/ui_analysis.py +223 -0
  42. isa_model/serving/api/routes/vision.py +19 -0
  43. isa_model/serving/api/schemas/__init__.py +17 -0
  44. isa_model/serving/api/schemas/common.py +33 -0
  45. isa_model/serving/api/schemas/ui_analysis.py +78 -0
  46. {isa_model-0.3.4.dist-info → isa_model-0.3.5.dist-info}/METADATA +1 -1
  47. {isa_model-0.3.4.dist-info → isa_model-0.3.5.dist-info}/RECORD +49 -17
  48. {isa_model-0.3.4.dist-info → isa_model-0.3.5.dist-info}/WHEEL +0 -0
  49. {isa_model-0.3.4.dist-info → isa_model-0.3.5.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,612 @@
1
+ """
2
+ ISA Vision Document Service
3
+
4
+ Specialized service for document analysis including:
5
+ - Table detection (Table Transformer Detection)
6
+ - Table structure recognition (Table Transformer Structure v1.1)
7
+ - OCR text extraction (PaddleOCR 3.0)
8
+ """
9
+
10
+ import modal
11
+ import torch
12
+ import base64
13
+ import io
14
+ import numpy as np
15
+ from PIL import Image
16
+ from typing import Dict, List, Optional, Any
17
+ import time
18
+ import json
19
+ import os
20
+ import logging
21
+
22
# Define Modal application.
# Everything decorated with `@app.cls` below is registered on this app,
# which is created under the name "isa-vision-doc".
app = modal.App("isa-vision-doc")
24
+
25
+ # Download document analysis models
26
def download_doc_models():
    """Download document analysis models into ``/models``.

    Intended to be run through ``Image.run_function`` while the container
    image is built. Every step is best-effort: a failed download or install
    is reported on stdout and the function carries on, so it never raises
    because of a single missing model.
    """
    from huggingface_hub import snapshot_download
    import subprocess
    import sys

    print("📦 Downloading document analysis models...")
    os.makedirs("/models", exist_ok=True)

    # Only weights and config files are needed; skip everything else in the repos.
    weight_patterns = ["**/*.pt", "**/*.pth", "**/*.bin", "**/*.json", "**/*.safetensors"]

    # (repo_id, local_dir, label used on success, label used on failure)
    model_specs = [
        (
            "microsoft/table-transformer-detection",
            "/models/table-transformer-detection",
            "Table Transformer Detection",
            "Table Transformer Detection",
        ),
        (
            "microsoft/table-transformer-structure-recognition-v1.1-all",
            "/models/table-transformer-structure",
            "Table Transformer Structure Recognition v1.1",
            "Table Transformer Structure Recognition",
        ),
    ]

    for repo_id, local_dir, ok_label, fail_label in model_specs:
        try:
            snapshot_download(
                repo_id=repo_id,
                local_dir=local_dir,
                allow_patterns=weight_patterns,
            )
            print(f"✅ {ok_label} downloaded")
        except Exception as e:
            print(f"⚠️ {fail_label} download failed: {e}")

    # Install PaddleOCR.
    # Invoke pip through the running interpreter so the package lands in the
    # same environment this function executes in; a bare `pip` on PATH may
    # belong to a different Python installation.
    try:
        subprocess.run(
            [sys.executable, "-m", "pip", "install", "paddleocr>=2.7.0", "--no-deps"],
            check=True,
        )
        print("✅ PaddleOCR installed")
    except Exception as e:
        print(f"⚠️ PaddleOCR install failed: {e}")

    print("📦 Document analysis models download completed")
64
+
65
# Define Modal container image.
# The builder calls are chained in this order: Debian slim base with
# Python 3.11, system (apt) packages, Python (pip) packages, the model
# download function, and finally environment variables.
image = (
    modal.Image.debian_slim(python_version="3.11")
    .apt_install([
        # OpenGL and graphics libraries for PaddleOCR
        "libgl1-mesa-glx",
        "libglib2.0-0",
        "libsm6",
        "libxext6",
        "libxrender-dev",
        "libgomp1",
        # Font support
        "fontconfig",
        "libfontconfig1",
        "libfreetype6",
    ])
    .pip_install([
        # Core AI libraries
        "torch>=2.0.0",
        "torchvision",
        "transformers>=4.35.0",
        "huggingface_hub",
        "accelerate",

        # Image processing
        "pillow>=10.0.1",
        "opencv-python-headless",
        "numpy>=1.24.3",

        # OCR libraries - Latest stable versions
        "paddleocr>=3.0.0",
        "paddlepaddle>=3.0.0",

        # HTTP libraries
        "httpx>=0.26.0",
        "requests",

        # Utilities
        "pydantic>=2.0.0",
        "python-dotenv",
    ])
    # Runs download_doc_models as part of building the image.
    .run_function(download_doc_models)
    # NOTE(review): these variables are declared after the download step, so
    # presumably they are not set while download_doc_models runs - confirm
    # against Modal's image layering semantics.
    .env({
        "TRANSFORMERS_CACHE": "/models",
        "FONTCONFIG_PATH": "/etc/fonts",
        "KMP_DUPLICATE_LIB_OK": "TRUE",
        "OMP_NUM_THREADS": "1",
        "CUDA_VISIBLE_DEVICES": "0"
    })
)
115
+
116
+ # Document Analysis Service
117
@app.cls(
    gpu="T4",
    image=image,
    memory=16384,          # 16GB RAM
    timeout=1800,          # 30 minutes
    scaledown_window=300,  # 5 minutes idle timeout
    min_containers=0,      # Scale to zero when not in use
)
class DocumentAnalysisService:
    """
    Document Analysis Service

    Provides document analysis capabilities including:
    - Table detection and structure recognition (not implemented yet: the
      Table Transformer models are not loaded, so these return empty results)
    - OCR text extraction with PaddleOCR
    - Combined document parsing

    Every public method takes a base64-encoded image and returns a plain
    dict with a ``success`` flag; failures are reported through the
    ``error`` key instead of being raised to the caller.
    """

    def __init__(self):
        # Model name -> loaded model object. A value of None marks a model
        # whose loading was attempted and failed.
        self.models = {}
        self.logger = logging.getLogger(__name__)

    @modal.enter()
    def load_models(self):
        """Load document analysis models on container startup.

        Errors are printed but never re-raised, so the container still
        starts and serves requests with degraded functionality.
        """
        print("🚀 Loading document analysis models...")
        start_time = time.time()

        try:
            import sys
            # Check system environment
            print("🔧 System info:")
            print(f" - Python version: {sys.version}")
            print(f" - PyTorch version: {torch.__version__}")
            print(f" - CUDA available: {torch.cuda.is_available()}")
            if torch.cuda.is_available():
                print(f" - CUDA version: {torch.version.cuda}")
                print(f" - GPU count: {torch.cuda.device_count()}")

            # Load table detection models
            self._load_table_models()

            # Load OCR models
            self._load_ocr_models()

            load_time = time.time() - start_time
            print(f"✅ Document analysis models loaded in {load_time:.2f}s")

            # Verify models are loaded
            if not self.models.get('ocr'):
                print("⚠️ OCR model failed to load - service will use fallback")

        except Exception as e:
            print(f"❌ Critical error during model loading: {e}")
            import traceback
            traceback.print_exc()
            # Don't raise - let service start with degraded functionality

    def _load_table_models(self):
        """Placeholder for loading the table detection/structure models."""
        print("📊 Loading table analysis models...")

        # TODO: Implement actual Table Transformer loading
        # For now, we don't load these models to avoid mock data
        print("⚠️ Table Transformer models not implemented yet")
        print(" - Table detection will return empty results")
        print(" - Table structure analysis will return empty results")

    def _load_ocr_models(self):
        """Create the PaddleOCR engine and store it under ``self.models['ocr']``.

        On any failure the entry is set to None so later calls can detect
        that OCR is unavailable.
        """
        print("🔤 Loading OCR models...")

        try:
            # Set environment variables to prevent conflicts and optimize performance
            os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'
            os.environ['OMP_NUM_THREADS'] = '1'
            os.environ['MKLDNN_DISABLED'] = '1'  # Disable MKLDNN to force GPU usage

            from paddleocr import PaddleOCR

            # Initialize PaddleOCR with minimal configuration
            self.models['ocr'] = PaddleOCR(
                use_angle_cls=True,  # Enable text direction classification
                lang='ch'            # Chinese language (also supports English in the same model)
            )
            print("✅ PaddleOCR loaded successfully with official defaults")
            print(f" - GPU available: {torch.cuda.is_available()}")
            if torch.cuda.is_available():
                print(f" - CUDA device: {torch.cuda.get_device_name(0)}")
                print(f" - CUDA version: {torch.version.cuda}")

        except Exception as e:
            print(f"⚠️ PaddleOCR loading failed: {e}")
            import traceback
            traceback.print_exc()
            self.models['ocr'] = None

    @staticmethod
    def _error_response(function: str, error: Exception, start_time: float,
                        time_key: str = 'processing_time') -> Dict[str, Any]:
        """Build the failure payload shared by all public methods."""
        return {
            'success': False,
            'service': 'isa-vision-doc',
            'function': function,
            'error': str(error),
            time_key: time.time() - start_time
        }

    @staticmethod
    def _crop_to_bbox(image_np: np.ndarray, bbox) -> np.ndarray:
        """Return the ``[x1, y1, x2, y2]`` region of ``image_np``.

        Coordinates are clamped to the image bounds so negative values do
        not silently wrap around through NumPy's negative indexing.

        Raises:
            ValueError: if ``bbox`` does not hold four values or the clamped
                box has no area.
        """
        x1, y1, x2, y2 = (int(v) for v in bbox)
        height, width = image_np.shape[:2]
        x1, x2 = max(0, min(x1, width)), max(0, min(x2, width))
        y1, y2 = max(0, min(y1, height)), max(0, min(y2, height))
        if x2 <= x1 or y2 <= y1:
            raise ValueError(
                f"Invalid bounding box {list(bbox)} for image of size {width}x{height}"
            )
        return image_np[y1:y2, x1:x2]

    @modal.method()
    def detect_tables(self, image_b64: str) -> Dict[str, Any]:
        """
        Detect tables in document image

        Args:
            image_b64: Base64 encoded image

        Returns:
            Table detection results
        """
        start_time = time.time()

        try:
            image_np = np.array(self._decode_image(image_b64))
            tables = self._detect_tables_impl(image_np)

            return {
                'success': True,
                'service': 'isa-vision-doc',
                'function': 'table_detection',
                'tables': tables,
                'table_count': len(tables),
                'processing_time': time.time() - start_time,
                'model_info': {
                    'detector': 'Table Transformer Detection',
                    'gpu': 'T4'
                }
            }

        except Exception as e:
            self.logger.exception("Table detection failed: %s", e)
            return self._error_response('table_detection', e, start_time)

    @modal.method()
    def analyze_table_structure(self, image_b64: str,
                                table_bbox: Optional[List[int]] = None) -> Dict[str, Any]:
        """
        Analyze table structure in image

        Args:
            image_b64: Base64 encoded image
            table_bbox: Optional bounding box of table [x1, y1, x2, y2];
                when given, only that region is analyzed

        Returns:
            Table structure analysis results
        """
        start_time = time.time()

        try:
            image_np = np.array(self._decode_image(image_b64))

            # Crop to table region if bbox provided
            if table_bbox:
                image_np = self._crop_to_bbox(image_np, table_bbox)

            structure = self._analyze_table_structure_impl(image_np)

            return {
                'success': True,
                'service': 'isa-vision-doc',
                'function': 'table_structure',
                'structure': structure,
                'processing_time': time.time() - start_time,
                'model_info': {
                    'analyzer': 'Table Transformer Structure Recognition v1.1',
                    'gpu': 'T4'
                }
            }

        except Exception as e:
            self.logger.exception("Table structure analysis failed: %s", e)
            return self._error_response('table_structure', e, start_time)

    @modal.method()
    def extract_text(self, image_b64: str,
                     regions: Optional[List[Dict]] = None) -> Dict[str, Any]:
        """
        Extract text from document image using OCR

        Args:
            image_b64: Base64 encoded image
            regions: Optional list of regions to focus OCR on (forwarded to
                the OCR implementation, which currently ignores it)

        Returns:
            OCR text extraction results
        """
        start_time = time.time()

        try:
            image_np = np.array(self._decode_image(image_b64))
            text_results = self._extract_text_impl(image_np, regions)

            return {
                'success': True,
                'service': 'isa-vision-doc',
                'function': 'ocr',
                'text_results': text_results,
                'text_count': len(text_results),
                'processing_time': time.time() - start_time,
                'model_info': {
                    'ocr_engine': 'PaddleOCR 3.0',
                    'gpu': 'T4'
                }
            }

        except Exception as e:
            self.logger.exception("OCR extraction failed: %s", e)
            return self._error_response('ocr', e, start_time)

    @modal.method()
    def analyze_document_complete(self, image_b64: str) -> Dict[str, Any]:
        """
        Complete document analysis: tables + structure + OCR

        Args:
            image_b64: Base64 encoded image

        Returns:
            Complete document analysis results
        """
        start_time = time.time()

        try:
            # Decode image once for all operations
            image_np = np.array(self._decode_image(image_b64))

            # Step 1: Detect tables. The timer must start before the call,
            # otherwise the reported detection time is always ~0.
            table_detection_start = time.time()
            tables = self._detect_tables_impl(image_np)
            table_detection_time = time.time() - table_detection_start

            # Step 2: Extract text
            ocr_start = time.time()
            text_results = self._extract_text_impl(image_np)
            ocr_time = time.time() - ocr_start

            # Step 3: Analyze table structures if tables found
            structure_results = []
            for table in tables or []:
                if 'bbox' not in table:
                    continue
                try:
                    table_image = self._crop_to_bbox(image_np, table['bbox'])
                except ValueError as e:
                    # One unusable box should not fail the whole document.
                    self.logger.warning("Skipping table with unusable bbox: %s", e)
                    continue
                structure_results.append(self._analyze_table_structure_impl(table_image))

            total_time = time.time() - start_time

            return {
                'success': True,
                'service': 'isa-vision-doc',
                'function': 'complete_analysis',
                'total_execution_time': total_time,
                'results': {
                    'tables': tables,
                    'table_structures': structure_results,
                    'text_extraction': text_results
                },
                'summary': {
                    'tables_found': len(tables),
                    'text_regions_found': len(text_results),
                    'structures_analyzed': len(structure_results)
                },
                'performance_metrics': {
                    'table_detection_time': table_detection_time,
                    'ocr_time': ocr_time,
                    'total_time': total_time,
                    'platform': 'modal'
                }
            }

        except Exception as e:
            self.logger.exception("Complete document analysis failed: %s", e)
            return self._error_response(
                'complete_analysis', e, start_time, time_key='total_execution_time'
            )

    def _detect_tables_impl(self, image_np: np.ndarray) -> List[Dict[str, Any]]:
        """Implementation of table detection (placeholder: always returns [])."""
        print("🔍 Table detection requested but not implemented")
        print("⚠️ Table Transformer models need to be properly loaded")

        # Return empty list since we don't have real table detection yet
        # TODO: Implement actual Table Transformer Detection
        return []

    def _analyze_table_structure_impl(self, image_np: np.ndarray) -> Dict[str, Any]:
        """Implementation of table structure analysis (placeholder: empty structure)."""
        print("📊 Table structure analysis requested but not implemented")
        print("⚠️ Table Transformer Structure Recognition models need to be properly loaded")

        # Return empty structure since we don't have real table structure analysis yet
        # TODO: Implement actual Table Transformer Structure Recognition
        return {
            'rows': 0,
            'columns': 0,
            'cells': [],
            'confidence': 0.0
        }

    @staticmethod
    def _result_field(result: Any, name: str) -> Any:
        """Read ``name`` from an OCR result as an attribute or a mapping key.

        NOTE(review): PaddleOCR 3.x result objects appear to be dict-like, so
        both access styles are tried - confirm against the installed version.
        Returns None when the field is absent.
        """
        value = getattr(result, name, None)
        if value is None and isinstance(result, dict):
            value = result.get(name)
        return value

    @classmethod
    def _parse_ocr_result(cls, first_result: Any) -> List[Dict[str, Any]]:
        """Convert one page of PaddleOCR output into plain, serializable dicts.

        Handles the result-object layout (``rec_texts`` / ``rec_scores`` /
        ``rec_boxes``) and the legacy list layout of
        ``[polygon_points, (text, confidence)]`` entries.
        """
        parsed: List[Dict[str, Any]] = []

        rec_texts = cls._result_field(first_result, 'rec_texts')
        if rec_texts is not None:
            rec_scores = cls._result_field(first_result, 'rec_scores')
            rec_boxes = cls._result_field(first_result, 'rec_boxes')
            for idx, (text, score, box) in enumerate(zip(rec_texts, rec_scores, rec_boxes)):
                # Each box is read as [x1, y1, x2, y2]
                x1, y1, x2, y2 = (int(v) for v in box[:4])
                parsed.append({
                    'id': f'text_{idx}',
                    'text': text,
                    'confidence': float(score),
                    'bbox': [x1, y1, x2, y2],
                    'center': [(x1 + x2) // 2, (y1 + y2) // 2]
                })
            return parsed

        # Legacy format: each line is [polygon points, (text, confidence)]
        for idx, line in enumerate(first_result):
            points, text_info = line[0], line[1]
            if not text_info or len(text_info) < 2:
                continue
            # Collapse the polygon into its axis-aligned bounding rectangle
            xs = [point[0] for point in points]
            ys = [point[1] for point in points]
            x1, y1, x2, y2 = int(min(xs)), int(min(ys)), int(max(xs)), int(max(ys))
            parsed.append({
                'id': f'text_{idx}',
                'text': text_info[0],
                # Cast so NumPy scalars do not leak into the response
                'confidence': float(text_info[1]),
                'bbox': [x1, y1, x2, y2],
                'center': [(x1 + x2) // 2, (y1 + y2) // 2]
            })
        return parsed

    def _extract_text_impl(self, image_np: np.ndarray,
                           regions: Optional[List[Dict]] = None) -> List[Dict[str, Any]]:
        """Implementation of OCR text extraction.

        Args:
            image_np: RGB image array.
            regions: Accepted for interface compatibility; not used yet.

        Returns:
            One dict per recognized text region (``id``, ``text``,
            ``confidence``, ``bbox``, ``center``). An empty list is returned
            when the OCR model is unavailable or the OCR call fails; the
            failure is logged rather than raised.
        """
        ocr = self.models.get('ocr')
        if ocr is not None:
            try:
                print("🔤 Using real PaddleOCR for text extraction...")

                # Ensure image is in correct format for PaddleOCR
                if image_np.ndim == 3 and image_np.shape[2] == 3:
                    # Convert RGB to BGR for OpenCV/PaddleOCR. The reversed
                    # view has negative strides, so materialize it.
                    image_bgr = np.ascontiguousarray(image_np[:, :, ::-1])
                else:
                    image_bgr = image_np

                self.logger.debug(
                    "OCR input shape=%s dtype=%s", image_bgr.shape, image_bgr.dtype
                )

                # Dump the OCR input only when debug logging is enabled:
                # writing every request's image to disk costs time and leaves
                # user documents behind in /tmp.
                if self.logger.isEnabledFor(logging.DEBUG):
                    try:
                        import cv2
                        cv2.imwrite('/tmp/debug_ocr_input.jpg', image_bgr)
                        self.logger.debug("Debug image saved to /tmp/debug_ocr_input.jpg")
                    except Exception as e:
                        self.logger.debug("Failed to save debug image: %s", e)

                # Run PaddleOCR
                result = ocr.ocr(image_bgr)
                self.logger.debug("PaddleOCR completed, raw result type: %s", type(result))

                text_results: List[Dict[str, Any]] = []
                if result and result[0]:
                    text_results = self._parse_ocr_result(result[0])

                print(f"✅ Real PaddleOCR extraction: {len(text_results)} text regions found")
                return text_results

            except Exception as e:
                print(f"❌ PaddleOCR failed: {e}")
                self.logger.exception("PaddleOCR failed: %s", e)

        # No fallback - return empty if PaddleOCR is not available
        print("❌ PaddleOCR not available, returning empty results")
        return []

    @modal.method()
    def health_check(self) -> Dict[str, Any]:
        """Health check endpoint.

        ``models_loaded`` lists only models that actually loaded; entries set
        to None after a failed load are left out.
        """
        return {
            'status': 'healthy',
            'service': 'isa-vision-doc',
            'models_loaded': [name for name, model in self.models.items() if model is not None],
            'capabilities': ['table_detection', 'table_structure', 'ocr'],
            'timestamp': time.time(),
            'gpu': 'T4'
        }

    def _decode_image(self, image_b64: str) -> Image.Image:
        """Decode a base64 string (optionally a ``data:image`` URI) into an RGB image."""
        if image_b64.startswith('data:image'):
            # Drop the "data:image/...;base64," header and keep the payload
            image_b64 = image_b64.split(',', 1)[1]

        image_data = base64.b64decode(image_b64)
        return Image.open(io.BytesIO(image_data)).convert('RGB')
607
+
608
+ # Warmup function removed to save costs
609
+
610
if __name__ == "__main__":
    # Running the file directly only prints deployment guidance.
    for banner_line in (
        "🚀 ISA Vision Document Service - Modal Deployment",
        "Deploy with: modal deploy isa_vision_doc_service.py",
    ):
        print(banner_line)