isa-model 0.3.91__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (123)
  1. isa_model/client.py +732 -573
  2. isa_model/core/cache/redis_cache.py +401 -0
  3. isa_model/core/config/config_manager.py +53 -10
  4. isa_model/core/config.py +1 -1
  5. isa_model/core/database/__init__.py +1 -0
  6. isa_model/core/database/migrations.py +277 -0
  7. isa_model/core/database/supabase_client.py +123 -0
  8. isa_model/core/models/__init__.py +37 -0
  9. isa_model/core/models/model_billing_tracker.py +60 -88
  10. isa_model/core/models/model_manager.py +36 -18
  11. isa_model/core/models/model_repo.py +44 -38
  12. isa_model/core/models/model_statistics_tracker.py +234 -0
  13. isa_model/core/models/model_storage.py +0 -1
  14. isa_model/core/models/model_version_manager.py +959 -0
  15. isa_model/core/pricing_manager.py +2 -249
  16. isa_model/core/resilience/circuit_breaker.py +366 -0
  17. isa_model/core/security/secrets.py +358 -0
  18. isa_model/core/services/__init__.py +2 -4
  19. isa_model/core/services/intelligent_model_selector.py +101 -370
  20. isa_model/core/storage/hf_storage.py +1 -1
  21. isa_model/core/types.py +7 -0
  22. isa_model/deployment/cloud/modal/isa_audio_chatTTS_service.py +520 -0
  23. isa_model/deployment/cloud/modal/isa_audio_fish_service.py +0 -0
  24. isa_model/deployment/cloud/modal/isa_audio_openvoice_service.py +758 -0
  25. isa_model/deployment/cloud/modal/isa_audio_service_v2.py +1044 -0
  26. isa_model/deployment/cloud/modal/isa_embed_rerank_service.py +296 -0
  27. isa_model/deployment/cloud/modal/isa_video_hunyuan_service.py +423 -0
  28. isa_model/deployment/cloud/modal/isa_vision_ocr_service.py +519 -0
  29. isa_model/deployment/cloud/modal/isa_vision_qwen25_service.py +709 -0
  30. isa_model/deployment/cloud/modal/isa_vision_table_service.py +467 -323
  31. isa_model/deployment/cloud/modal/isa_vision_ui_service.py +607 -180
  32. isa_model/deployment/cloud/modal/isa_vision_ui_service_optimized.py +660 -0
  33. isa_model/deployment/core/deployment_manager.py +6 -4
  34. isa_model/deployment/services/auto_hf_modal_deployer.py +894 -0
  35. isa_model/eval/benchmarks/__init__.py +27 -0
  36. isa_model/eval/benchmarks/multimodal_datasets.py +460 -0
  37. isa_model/eval/benchmarks.py +244 -12
  38. isa_model/eval/evaluators/__init__.py +8 -2
  39. isa_model/eval/evaluators/audio_evaluator.py +727 -0
  40. isa_model/eval/evaluators/embedding_evaluator.py +742 -0
  41. isa_model/eval/evaluators/vision_evaluator.py +564 -0
  42. isa_model/eval/example_evaluation.py +395 -0
  43. isa_model/eval/factory.py +272 -5
  44. isa_model/eval/isa_benchmarks.py +700 -0
  45. isa_model/eval/isa_integration.py +582 -0
  46. isa_model/eval/metrics.py +159 -6
  47. isa_model/eval/tests/unit/test_basic.py +396 -0
  48. isa_model/inference/ai_factory.py +44 -8
  49. isa_model/inference/services/audio/__init__.py +21 -0
  50. isa_model/inference/services/audio/base_realtime_service.py +225 -0
  51. isa_model/inference/services/audio/isa_tts_service.py +0 -0
  52. isa_model/inference/services/audio/openai_realtime_service.py +320 -124
  53. isa_model/inference/services/audio/openai_stt_service.py +32 -6
  54. isa_model/inference/services/base_service.py +17 -1
  55. isa_model/inference/services/embedding/__init__.py +13 -0
  56. isa_model/inference/services/embedding/base_embed_service.py +111 -8
  57. isa_model/inference/services/embedding/isa_embed_service.py +305 -0
  58. isa_model/inference/services/embedding/openai_embed_service.py +2 -4
  59. isa_model/inference/services/embedding/tests/test_embedding.py +222 -0
  60. isa_model/inference/services/img/__init__.py +2 -2
  61. isa_model/inference/services/img/base_image_gen_service.py +24 -7
  62. isa_model/inference/services/img/replicate_image_gen_service.py +84 -422
  63. isa_model/inference/services/img/services/replicate_face_swap.py +193 -0
  64. isa_model/inference/services/img/services/replicate_flux.py +226 -0
  65. isa_model/inference/services/img/services/replicate_flux_kontext.py +219 -0
  66. isa_model/inference/services/img/services/replicate_sticker_maker.py +249 -0
  67. isa_model/inference/services/img/tests/test_img_client.py +297 -0
  68. isa_model/inference/services/llm/base_llm_service.py +30 -6
  69. isa_model/inference/services/llm/helpers/llm_adapter.py +63 -9
  70. isa_model/inference/services/llm/ollama_llm_service.py +2 -1
  71. isa_model/inference/services/llm/openai_llm_service.py +652 -55
  72. isa_model/inference/services/llm/yyds_llm_service.py +2 -1
  73. isa_model/inference/services/vision/__init__.py +5 -5
  74. isa_model/inference/services/vision/base_vision_service.py +118 -185
  75. isa_model/inference/services/vision/helpers/image_utils.py +11 -5
  76. isa_model/inference/services/vision/isa_vision_service.py +573 -0
  77. isa_model/inference/services/vision/tests/test_ocr_client.py +284 -0
  78. isa_model/serving/api/fastapi_server.py +88 -16
  79. isa_model/serving/api/middleware/auth.py +311 -0
  80. isa_model/serving/api/middleware/security.py +278 -0
  81. isa_model/serving/api/routes/analytics.py +486 -0
  82. isa_model/serving/api/routes/deployments.py +339 -0
  83. isa_model/serving/api/routes/evaluations.py +579 -0
  84. isa_model/serving/api/routes/logs.py +430 -0
  85. isa_model/serving/api/routes/settings.py +582 -0
  86. isa_model/serving/api/routes/unified.py +324 -165
  87. isa_model/serving/api/startup.py +304 -0
  88. isa_model/serving/modal_proxy_server.py +249 -0
  89. isa_model/training/__init__.py +100 -6
  90. isa_model/training/core/__init__.py +4 -1
  91. isa_model/training/examples/intelligent_training_example.py +281 -0
  92. isa_model/training/intelligent/__init__.py +25 -0
  93. isa_model/training/intelligent/decision_engine.py +643 -0
  94. isa_model/training/intelligent/intelligent_factory.py +888 -0
  95. isa_model/training/intelligent/knowledge_base.py +751 -0
  96. isa_model/training/intelligent/resource_optimizer.py +839 -0
  97. isa_model/training/intelligent/task_classifier.py +576 -0
  98. isa_model/training/storage/__init__.py +24 -0
  99. isa_model/training/storage/core_integration.py +439 -0
  100. isa_model/training/storage/training_repository.py +552 -0
  101. isa_model/training/storage/training_storage.py +628 -0
  102. {isa_model-0.3.91.dist-info → isa_model-0.4.0.dist-info}/METADATA +13 -1
  103. isa_model-0.4.0.dist-info/RECORD +182 -0
  104. isa_model/deployment/cloud/modal/isa_vision_doc_service.py +0 -766
  105. isa_model/deployment/cloud/modal/register_models.py +0 -321
  106. isa_model/inference/adapter/unified_api.py +0 -248
  107. isa_model/inference/services/helpers/stacked_config.py +0 -148
  108. isa_model/inference/services/img/flux_professional_service.py +0 -603
  109. isa_model/inference/services/img/helpers/base_stacked_service.py +0 -274
  110. isa_model/inference/services/others/table_transformer_service.py +0 -61
  111. isa_model/inference/services/vision/doc_analysis_service.py +0 -640
  112. isa_model/inference/services/vision/helpers/base_stacked_service.py +0 -274
  113. isa_model/inference/services/vision/ui_analysis_service.py +0 -823
  114. isa_model/scripts/inference_tracker.py +0 -283
  115. isa_model/scripts/mlflow_manager.py +0 -379
  116. isa_model/scripts/model_registry.py +0 -465
  117. isa_model/scripts/register_models.py +0 -370
  118. isa_model/scripts/register_models_with_embeddings.py +0 -510
  119. isa_model/scripts/start_mlflow.py +0 -95
  120. isa_model/scripts/training_tracker.py +0 -257
  121. isa_model-0.3.91.dist-info/RECORD +0 -138
  122. {isa_model-0.3.91.dist-info → isa_model-0.4.0.dist-info}/WHEEL +0 -0
  123. {isa_model-0.3.91.dist-info → isa_model-0.4.0.dist-info}/top_level.txt +0 -0
isa_model/inference/services/vision/doc_analysis_service.py
@@ -1,640 +0,0 @@
- """
- Document Analysis Stacked Service
-
- A comprehensive document analysis service that combines multiple vision models
- to process complex documents through a five-step pipeline:
-
- 1. VLM Document Classification (GPT-4V/Claude)
- 2. Table Transformer Detection
- 3. Table Transformer Structure Recognition
- 4. PaddleOCR Text Extraction
- 5. VLM Intelligent Matching
-
- The sixth step (predefined structure mapping) is handled by specific business services
- that have their own templates and schemas for different use cases.
-
- This service orchestrates OpenAI Vision Service and ISA Vision Service
- using the BaseStackedService framework.
- """
-
- import json
- import logging
- from typing import Dict, Any, List, Union, Optional, BinaryIO
- from datetime import datetime
-
- from .helpers.base_stacked_service import (
-     BaseStackedService, LayerConfig, LayerType, LayerResult
- )
-
- logger = logging.getLogger(__name__)
-
- class DocAnalysisStackedService(BaseStackedService):
-     """Stacked Document Analysis Service using multiple vision models (5-step pipeline)"""
-
-     def __init__(self, ai_factory, service_name: str = "doc-analysis-stacked"):
-         super().__init__(ai_factory, service_name)
-
-         # Configure the 5-step pipeline layers
-         self._configure_pipeline_layers()
-
-         logger.info(f"Initialized DocAnalysisStackedService with 5-step pipeline")
-
-     def _configure_pipeline_layers(self):
-         """Configure the 5-step document analysis pipeline"""
-
-         # Step 1: VLM Document Classification
-         self.add_layer(LayerConfig(
-             name="document_classification",
-             layer_type=LayerType.CLASSIFICATION,
-             service_type="vision",
-             model_name="gpt-4.1-nano",  # Use OpenAI Vision Service
-             parameters={
-                 "task": "classification",
-                 "max_tokens": 1500
-             },
-             depends_on=[],
-             timeout=30.0,
-             retry_count=2,
-             fallback_enabled=True
-         ))
-
-         # Step 2: Table Detection
-         self.add_layer(LayerConfig(
-             name="table_detection",
-             layer_type=LayerType.DETECTION,
-             service_type="vision",
-             model_name="isa-vision-doc",  # Use ISA Vision Service
-             parameters={
-                 "task": "analyze_document",
-                 "confidence_threshold": 0.5
-             },
-             depends_on=["document_classification"],
-             timeout=45.0,
-             retry_count=1,
-             fallback_enabled=True
-         ))
-
-         # Step 3: Table Structure Recognition
-         self.add_layer(LayerConfig(
-             name="table_structure",
-             layer_type=LayerType.DETECTION,
-             service_type="vision",
-             model_name="isa-vision-doc",
-             parameters={
-                 "task": "table_structure_recognition"
-             },
-             depends_on=["table_detection"],
-             timeout=30.0,
-             retry_count=1,
-             fallback_enabled=True
-         ))
-
-         # Step 4: OCR Text Extraction
-         self.add_layer(LayerConfig(
-             name="ocr_extraction",
-             layer_type=LayerType.DETECTION,
-             service_type="vision",
-             model_name="isa-vision-doc",
-             parameters={
-                 "task": "extract_text"
-             },
-             depends_on=["table_structure"],
-             timeout=60.0,
-             retry_count=2,
-             fallback_enabled=False  # OCR is critical
-         ))
-
-         # Step 5: Intelligent Matching
-         self.add_layer(LayerConfig(
-             name="intelligent_matching",
-             layer_type=LayerType.INTELLIGENCE,
-             service_type="vision",
-             model_name="gpt-4.1-nano",
-             parameters={
-                 "task": "intelligent_matching",
-                 "max_tokens": 2000
-             },
-             depends_on=["document_classification", "ocr_extraction"],
-             timeout=45.0,
-             retry_count=2,
-             fallback_enabled=True
-         ))
-
-     async def execute_layer_logic(self, layer: LayerConfig, service: Any, context: Dict[str, Any]) -> Any:
-         """Execute the specific logic for each layer"""
-
-         input_data = context.get("input", {})
-         results = context.get("results", {})
-
-         if layer.name == "document_classification":
-             return await self._execute_document_classification(service, input_data, layer.parameters)
-
-         elif layer.name == "table_detection":
-             return await self._execute_table_detection(service, input_data, layer.parameters)
-
-         elif layer.name == "table_structure":
-             table_regions = results.get("table_detection", {}).data.get("table_regions", [])
-             return await self._execute_table_structure(service, input_data, table_regions, layer.parameters)
-
-         elif layer.name == "ocr_extraction":
-             table_regions = results.get("table_detection", {}).data.get("table_regions", [])
-             return await self._execute_ocr_extraction(service, input_data, table_regions, layer.parameters)
-
-         elif layer.name == "intelligent_matching":
-             classification_data = results.get("document_classification", {}).data
-             ocr_data = results.get("ocr_extraction", {}).data
-             return await self._execute_intelligent_matching(service, classification_data, ocr_data, layer.parameters)
-
-         else:
-             raise ValueError(f"Unknown layer: {layer.name}")
-
-     async def _execute_document_classification(self, service, input_data: Dict[str, Any], params: Dict[str, Any]) -> Dict[str, Any]:
-         """Execute Step 1: VLM Document Classification"""
-
-         images = input_data.get("images", [])
-         if not images:
-             raise ValueError("No images provided for classification")
-
-         classification_prompt = """
-         Please analyze these document pages and identify:
-         1. Document type (customs declaration, invoice, contract, packing list, etc.)
-         2. Company information (shipper, consignee)
-         3. Business type (export, import, cross-border e-commerce, etc.)
-         4. Page relationships (whether the pages belong to the same document)
-
-         Return the result in JSON format:
-         {
-             "document_classification": {
-                 "document_type": "document type",
-                 "business_type": "business type",
-                 "pages": [
-                     {
-                         "page_id": 1,
-                         "page_type": "page type",
-                         "company": "company name",
-                         "confidence": 0.95
-                     }
-                 ]
-             }
-         }
-         """
-
-         # Analyze each page
-         page_results = []
-         for i, image in enumerate(images):
-             result = await service.analyze_image(
-                 image,
-                 classification_prompt,
-                 max_tokens=params.get("max_tokens", 1500)
-             )
-
-             page_results.append({
-                 "page_id": i + 1,
-                 "analysis": result.get("text", ""),
-                 "confidence": result.get("confidence", 0.8)
-             })
-
-         # Combine results for multi-page analysis if multiple pages
-         if len(images) > 1:
-             combined_prompt = f"""
-             Based on the per-page analysis results below, provide an overall document classification:
-
-             {json.dumps(page_results, ensure_ascii=False, indent=2)}
-
-             Return the final classification result in the same format as shown above.
-             """
-
-             final_result = await service.analyze_image(
-                 images[0],
-                 combined_prompt,
-                 max_tokens=1000
-             )
-         else:
-             final_result = page_results[0] if page_results else {"analysis": ""}
-
-         # Parse the classification result
-         classification = self._parse_json_response(final_result.get("text", "{}"))
-
-         return {
-             "success": True,
-             "step": "document_classification",
-             "classification": classification,
-             "page_count": len(images),
-             "processing_time": datetime.now().isoformat(),
-             "metadata": {
-                 "model": "gpt-4.1-nano",
-                 "service": "vlm_classification"
-             }
-         }
-
-     async def _execute_table_detection(self, service, input_data: Dict[str, Any], params: Dict[str, Any]) -> Dict[str, Any]:
-         """Execute Step 2: Table Transformer Detection"""
-
-         image = input_data.get("image") or (input_data.get("images", [None])[0])
-         if not image:
-             raise ValueError("No image provided for table detection")
-
-         # Use ISA Vision Service for table detection
-         detection_result = await service.invoke(
-             image,
-             task="analyze_document"
-         )
-
-         if detection_result.get("success"):
-             tables = detection_result.get("table_regions", [])
-
-             return {
-                 "success": True,
-                 "step": "table_detection",
-                 "table_count": len(tables),
-                 "table_regions": tables,
-                 "processing_time": datetime.now().isoformat(),
-                 "metadata": {
-                     "model": "table-transformer-detection",
-                     "service": "isa_vision"
-                 }
-             }
-         else:
-             raise Exception(detection_result.get("error", "Table detection failed"))
-
-     async def _execute_table_structure(self, service, input_data: Dict[str, Any], table_regions: List[Dict[str, Any]], params: Dict[str, Any]) -> Dict[str, Any]:
-         """Execute Step 3: Table Transformer Structure Recognition"""
-
-         structure_results = []
-
-         for i, table_region in enumerate(table_regions):
-             # Process each table region for structure recognition
-             structure_result = {
-                 "table_id": i + 1,
-                 "bbox": table_region.get("bbox", []),
-                 "rows": table_region.get("rows", 0),
-                 "columns": table_region.get("columns", 0),
-                 "cells": table_region.get("cells", []),
-                 "confidence": table_region.get("confidence", 0.8)
-             }
-             structure_results.append(structure_result)
-
-         return {
-             "success": True,
-             "step": "table_structure_recognition",
-             "structures": structure_results,
-             "processing_time": datetime.now().isoformat(),
-             "metadata": {
-                 "model": "table-transformer-structure",
-                 "service": "isa_vision"
-             }
-         }
-
-     async def _execute_ocr_extraction(self, service, input_data: Dict[str, Any], table_regions: List[Dict[str, Any]], params: Dict[str, Any]) -> Dict[str, Any]:
-         """Execute Step 4: PaddleOCR Text Extraction"""
-
-         image = input_data.get("image") or (input_data.get("images", [None])[0])
-         if not image:
-             raise ValueError("No image provided for OCR extraction")
-
-         # Use ISA Vision Service for OCR
-         ocr_result = await service.extract_text(image)
-
-         if ocr_result.get("success"):
-             extracted_text = ocr_result.get("text", "")
-             bounding_boxes = ocr_result.get("bounding_boxes", [])
-
-             # If table regions are provided, filter OCR results to table areas
-             if table_regions:
-                 table_ocr_data = []
-                 for table in table_regions:
-                     table_text = self._extract_text_from_region(
-                         extracted_text,
-                         bounding_boxes,
-                         table.get("bbox", [])
-                     )
-                     table_ocr_data.append({
-                         "table_id": table.get("table_id", 0),
-                         "text": table_text,
-                         "bbox": table.get("bbox", [])
-                     })
-
-                 return {
-                     "success": True,
-                     "step": "ocr_extraction",
-                     "full_text": extracted_text,
-                     "table_ocr_data": table_ocr_data,
-                     "confidence": ocr_result.get("confidence", 0.8),
-                     "processing_time": datetime.now().isoformat(),
-                     "metadata": {
-                         "model": "PaddleOCR",
-                         "service": "isa_vision"
-                     }
-                 }
-             else:
-                 return {
-                     "success": True,
-                     "step": "ocr_extraction",
-                     "full_text": extracted_text,
-                     "confidence": ocr_result.get("confidence", 0.8),
-                     "processing_time": datetime.now().isoformat(),
-                     "metadata": {
-                         "model": "PaddleOCR",
-                         "service": "isa_vision"
-                     }
-                 }
-         else:
-             raise Exception(ocr_result.get("error", "OCR extraction failed"))
-
-     async def _execute_intelligent_matching(self, service, classification_data: Dict[str, Any], ocr_data: Dict[str, Any], params: Dict[str, Any]) -> Dict[str, Any]:
-         """Execute Step 5: VLM Intelligent Matching"""
-
-         matching_prompt = f"""
-         Based on the recognized table data and the document context, extract and organize the following information:
-
-         Document classification info:
-         {json.dumps(classification_data, ensure_ascii=False, indent=2)}
-
-         OCR-extracted data:
-         {json.dumps(ocr_data, ensure_ascii=False, indent=2)}
-
-         Perform the following tasks:
-         1. Convert the table data into a structured format
-         2. Identify the meaning of and relationships between key fields
-         3. Handle data associations across tables
-         4. Detect and correct OCR errors
-         5. Provide standardized structured data for business services to map onto their templates
-
-         Return JSON in this format:
-         {{
-             "structured_data": {{
-                 "basic_info": {{
-                     "document_type": "document type",
-                     "company_info": "company information",
-                     "date": "date",
-                     "document_number": "document number"
-                 }},
-                 "items": [
-                     {{
-                         "item_name": "item name",
-                         "quantity": "quantity",
-                         "unit_price": "unit price",
-                         "total_price": "total price"
-                     }}
-                 ],
-                 "financial_summary": {{
-                     "subtotal": "subtotal",
-                     "tax": "tax",
-                     "total": "total"
-                 }},
-                 "additional_fields": {{
-                     "field_name": "field value"
-                 }}
-             }},
-             "field_confidence": {{
-                 "basic_info": 0.95,
-                 "items": 0.90,
-                 "financial_summary": 0.85
-             }},
-             "corrections_made": [
-                 "example of a corrected OCR error"
-             ],
-             "ready_for_mapping": true
-         }}
-         """
-
-         result = await service.analyze_image(
-             None,  # Text-only analysis
-             matching_prompt,
-             max_tokens=params.get("max_tokens", 2000)
-         )
-
-         structured_data = self._parse_json_response(result.get("text", "{}"))
-
-         return {
-             "success": True,
-             "step": "intelligent_matching",
-             "structured_data": structured_data,
-             "ready_for_business_mapping": True,
-             "processing_time": datetime.now().isoformat(),
-             "metadata": {
-                 "model": "gpt-4.1-nano",
-                 "service": "vlm_matching",
-                 "note": "Ready for business service template mapping"
-             }
-         }
-
-     def generate_final_output(self, results: Dict[str, LayerResult]) -> Dict[str, Any]:
-         """Generate final output from all layer results"""
-
-         # Check if all critical steps succeeded
-         critical_steps = ["document_classification", "ocr_extraction", "intelligent_matching"]
-         success = all(
-             step in results and results[step].success
-             for step in critical_steps
-         )
-
-         if success:
-             # Get the final structured data from intelligent matching
-             final_data = results["intelligent_matching"].data.get("structured_data", {})
-
-             return {
-                 "success": True,
-                 "pipeline": "five_step_document_analysis",
-                 "final_structured_data": final_data,
-                 "ready_for_business_mapping": True,
-                 "classification": results.get("document_classification", {}).data.get("classification", {}),
-                 "table_info": {
-                     "table_count": results.get("table_detection", {}).data.get("table_count", 0),
-                     "table_regions": results.get("table_detection", {}).data.get("table_regions", [])
-                 },
-                 "ocr_info": {
-                     "full_text": results.get("ocr_extraction", {}).data.get("full_text", ""),
-                     "confidence": results.get("ocr_extraction", {}).data.get("confidence", 0)
-                 },
-                 "metadata": {
-                     "service": "doc_analysis_stacked",
-                     "pipeline_version": "5_step",
-                     "processing_complete": True
-                 }
-             }
-         else:
-             # Return error information
-             failed_steps = [
-                 step for step in critical_steps
-                 if step not in results or not results[step].success
-             ]
-
-             return {
-                 "success": False,
-                 "pipeline": "five_step_document_analysis",
-                 "error": f"Critical steps failed: {failed_steps}",
-                 "failed_steps": failed_steps,
-                 "partial_results": {
-                     name: result.data for name, result in results.items()
-                     if result.success
-                 },
-                 "metadata": {
-                     "service": "doc_analysis_stacked",
-                     "pipeline_version": "5_step",
-                     "processing_complete": False
-                 }
-             }
-
-     async def execute_fallback(self, layer: LayerConfig, context: Dict[str, Any], error: str) -> Optional[Any]:
-         """Execute fallback logic for failed layers"""
-
-         if layer.name == "table_detection":
-             # Fallback: Continue without table regions
-             logger.warning(f"Table detection failed: {error}. Continuing without table regions.")
-             return {
-                 "success": True,
-                 "step": "table_detection",
-                 "table_count": 0,
-                 "table_regions": [],
-                 "fallback": True,
-                 "processing_time": datetime.now().isoformat(),
-                 "metadata": {
-                     "model": "fallback",
-                     "service": "fallback",
-                     "error": error
-                 }
-             }
-
-         elif layer.name == "table_structure":
-             # Fallback: Return empty structure
-             logger.warning(f"Table structure recognition failed: {error}. Using empty structure.")
-             return {
-                 "success": True,
-                 "step": "table_structure_recognition",
-                 "structures": [],
-                 "fallback": True,
-                 "processing_time": datetime.now().isoformat(),
-                 "metadata": {
-                     "model": "fallback",
-                     "service": "fallback",
-                     "error": error
-                 }
-             }
-
-         elif layer.name == "document_classification":
-             # Fallback: Use generic classification
-             logger.warning(f"Document classification failed: {error}. Using generic classification.")
-             return {
-                 "success": True,
-                 "step": "document_classification",
-                 "classification": {
-                     "document_classification": {
-                         "document_type": "unknown_document",
-                         "business_type": "unknown",
-                         "confidence": 0.1,
-                         "pages": []
-                     }
-                 },
-                 "fallback": True,
-                 "processing_time": datetime.now().isoformat(),
-                 "metadata": {
-                     "model": "fallback",
-                     "service": "fallback",
-                     "error": error
-                 }
-             }
-
-         elif layer.name == "intelligent_matching":
-             # Fallback: Use basic structured format
-             logger.warning(f"Intelligent matching failed: {error}. Using basic structure.")
-             return {
-                 "success": True,
-                 "step": "intelligent_matching",
-                 "structured_data": {
-                     "structured_data": {
-                         "basic_info": {},
-                         "items": [],
-                         "financial_summary": {},
-                         "additional_fields": {}
-                     },
-                     "field_confidence": {},
-                     "corrections_made": [],
-                     "ready_for_mapping": False
-                 },
-                 "fallback": True,
-                 "processing_time": datetime.now().isoformat(),
-                 "metadata": {
-                     "model": "fallback",
-                     "service": "fallback",
-                     "error": error
-                 }
-             }
-
-         return None
-
-     # Helper methods
-
-     def _parse_json_response(self, response_text: str) -> Dict[str, Any]:
-         """Parse JSON response from VLM"""
-         try:
-             # Try to find JSON in the response
-             start_idx = response_text.find('{')
-             end_idx = response_text.rfind('}') + 1
-
-             if start_idx != -1 and end_idx > start_idx:
-                 json_str = response_text[start_idx:end_idx]
-                 return json.loads(json_str)
-             else:
-                 # Fallback to empty dict
-                 return {}
-
-         except json.JSONDecodeError:
-             logger.warning(f"Failed to parse JSON from response: {response_text[:200]}...")
-             return {}
-
-     def _extract_text_from_region(
-         self,
-         full_text: str,
-         bounding_boxes: List[Dict],
-         region_bbox: List[int]
-     ) -> str:
-         """Extract text that falls within a specific region"""
-         # This is a simplified implementation
-         # In practice, you would need to check if bounding boxes overlap with the region
-         return full_text  # For now, return full text
-
-     # Convenience methods for direct usage
-
-     async def analyze_document(self, image: Union[str, BinaryIO], images: Optional[List[Union[str, BinaryIO]]] = None) -> Dict[str, Any]:
-         """Convenience method for complete document analysis"""
-
-         if images is None:
-             images = [image] if image else []
-
-         input_data = {
-             "image": image,
-             "images": images
-         }
-
-         return await self.invoke(input_data)
-
-     async def classify_document_only(self, images: List[Union[str, BinaryIO]]) -> Dict[str, Any]:
-         """Convenience method for document classification only"""
-
-         # Temporarily configure only classification layer
-         original_layers = self.layers.copy()
-         self.layers = [layer for layer in self.layers if layer.name == "document_classification"]
-
-         try:
-             input_data = {"images": images}
-             result = await self.invoke(input_data)
-             return result
-         finally:
-             # Restore original layers
-             self.layers = original_layers
-
-     async def extract_text_only(self, image: Union[str, BinaryIO]) -> Dict[str, Any]:
-         """Convenience method for OCR extraction only"""
-
-         # Temporarily configure only OCR layer
-         original_layers = self.layers.copy()
-         self.layers = [layer for layer in self.layers if layer.name == "ocr_extraction"]
-
-         try:
-             input_data = {"image": image}
-             result = await self.invoke(input_data)
-             return result
-         finally:
-             # Restore original layers
-             self.layers = original_layers
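
For context on what this removal takes away: the deleted service was driven through its convenience methods rather than by invoking layers directly. A minimal sketch of the old 0.3.91 call pattern, assuming an AIFactory class in isa_model/inference/ai_factory.py (that class name and its no-argument constructor are an assumption for illustration; neither appears in this diff):

    import asyncio

    # Assumed factory import; only the ai_factory module path is confirmed by the file list above
    from isa_model.inference.ai_factory import AIFactory
    from isa_model.inference.services.vision.doc_analysis_service import DocAnalysisStackedService

    async def main():
        service = DocAnalysisStackedService(AIFactory())
        # Runs all five layers; the output shape comes from generate_final_output()
        result = await service.analyze_document("invoice_page1.png")
        if result["success"]:
            print(result["final_structured_data"])

    asyncio.run(main())

Judging only by the file list above, 0.4.0 drops this stacked pipeline together with its base_stacked_service helpers, while the new isa_vision_service.py and the dedicated Modal OCR/table services appear to cover similar ground.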