isa_model-0.3.4-py3-none-any.whl → isa_model-0.3.6-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- isa_model/__init__.py +30 -1
- isa_model/client.py +770 -0
- isa_model/core/config/__init__.py +16 -0
- isa_model/core/config/config_manager.py +514 -0
- isa_model/core/config.py +426 -0
- isa_model/core/models/model_billing_tracker.py +476 -0
- isa_model/core/models/model_manager.py +399 -0
- isa_model/core/models/model_repo.py +343 -0
- isa_model/core/pricing_manager.py +426 -0
- isa_model/core/services/__init__.py +19 -0
- isa_model/core/services/intelligent_model_selector.py +547 -0
- isa_model/core/types.py +291 -0
- isa_model/deployment/__init__.py +2 -0
- isa_model/deployment/cloud/__init__.py +9 -0
- isa_model/deployment/cloud/modal/__init__.py +10 -0
- isa_model/deployment/cloud/modal/isa_vision_doc_service.py +766 -0
- isa_model/deployment/cloud/modal/isa_vision_table_service.py +532 -0
- isa_model/deployment/cloud/modal/isa_vision_ui_service.py +406 -0
- isa_model/deployment/cloud/modal/register_models.py +321 -0
- isa_model/deployment/runtime/deployed_service.py +338 -0
- isa_model/deployment/services/__init__.py +9 -0
- isa_model/deployment/services/auto_deploy_vision_service.py +537 -0
- isa_model/deployment/services/model_service.py +332 -0
- isa_model/deployment/services/service_monitor.py +356 -0
- isa_model/deployment/services/service_registry.py +527 -0
- isa_model/eval/__init__.py +80 -44
- isa_model/eval/config/__init__.py +10 -0
- isa_model/eval/config/evaluation_config.py +108 -0
- isa_model/eval/evaluators/__init__.py +18 -0
- isa_model/eval/evaluators/base_evaluator.py +503 -0
- isa_model/eval/evaluators/llm_evaluator.py +472 -0
- isa_model/eval/factory.py +417 -709
- isa_model/eval/infrastructure/__init__.py +24 -0
- isa_model/eval/infrastructure/experiment_tracker.py +466 -0
- isa_model/eval/metrics.py +191 -21
- isa_model/inference/ai_factory.py +187 -387
- isa_model/inference/providers/modal_provider.py +109 -0
- isa_model/inference/providers/yyds_provider.py +108 -0
- isa_model/inference/services/__init__.py +2 -1
- isa_model/inference/services/audio/base_stt_service.py +65 -1
- isa_model/inference/services/audio/base_tts_service.py +75 -1
- isa_model/inference/services/audio/openai_stt_service.py +189 -151
- isa_model/inference/services/audio/openai_tts_service.py +12 -10
- isa_model/inference/services/audio/replicate_tts_service.py +61 -56
- isa_model/inference/services/base_service.py +55 -55
- isa_model/inference/services/embedding/base_embed_service.py +65 -1
- isa_model/inference/services/embedding/ollama_embed_service.py +103 -43
- isa_model/inference/services/embedding/openai_embed_service.py +8 -10
- isa_model/inference/services/helpers/stacked_config.py +148 -0
- isa_model/inference/services/img/__init__.py +18 -0
- isa_model/inference/services/{vision → img}/base_image_gen_service.py +80 -35
- isa_model/inference/services/img/flux_professional_service.py +603 -0
- isa_model/inference/services/img/helpers/base_stacked_service.py +274 -0
- isa_model/inference/services/{vision → img}/replicate_image_gen_service.py +210 -69
- isa_model/inference/services/llm/__init__.py +3 -3
- isa_model/inference/services/llm/base_llm_service.py +519 -35
- isa_model/inference/services/llm/{llm_adapter.py → helpers/llm_adapter.py} +40 -0
- isa_model/inference/services/llm/helpers/llm_prompts.py +258 -0
- isa_model/inference/services/llm/helpers/llm_utils.py +280 -0
- isa_model/inference/services/llm/ollama_llm_service.py +150 -15
- isa_model/inference/services/llm/openai_llm_service.py +134 -31
- isa_model/inference/services/llm/yyds_llm_service.py +255 -0
- isa_model/inference/services/vision/__init__.py +38 -4
- isa_model/inference/services/vision/base_vision_service.py +241 -96
- isa_model/inference/services/vision/disabled/isA_vision_service.py +500 -0
- isa_model/inference/services/vision/doc_analysis_service.py +640 -0
- isa_model/inference/services/vision/helpers/base_stacked_service.py +274 -0
- isa_model/inference/services/vision/helpers/image_utils.py +272 -3
- isa_model/inference/services/vision/helpers/vision_prompts.py +297 -0
- isa_model/inference/services/vision/openai_vision_service.py +109 -170
- isa_model/inference/services/vision/replicate_vision_service.py +508 -0
- isa_model/inference/services/vision/ui_analysis_service.py +823 -0
- isa_model/scripts/register_models.py +370 -0
- isa_model/scripts/register_models_with_embeddings.py +510 -0
- isa_model/serving/__init__.py +19 -0
- isa_model/serving/api/__init__.py +10 -0
- isa_model/serving/api/fastapi_server.py +89 -0
- isa_model/serving/api/middleware/__init__.py +9 -0
- isa_model/serving/api/middleware/request_logger.py +88 -0
- isa_model/serving/api/routes/__init__.py +5 -0
- isa_model/serving/api/routes/health.py +82 -0
- isa_model/serving/api/routes/llm.py +19 -0
- isa_model/serving/api/routes/ui_analysis.py +223 -0
- isa_model/serving/api/routes/unified.py +202 -0
- isa_model/serving/api/routes/vision.py +19 -0
- isa_model/serving/api/schemas/__init__.py +17 -0
- isa_model/serving/api/schemas/common.py +33 -0
- isa_model/serving/api/schemas/ui_analysis.py +78 -0
- {isa_model-0.3.4.dist-info → isa_model-0.3.6.dist-info}/METADATA +4 -1
- isa_model-0.3.6.dist-info/RECORD +147 -0
- isa_model/core/model_manager.py +0 -208
- isa_model/core/model_registry.py +0 -342
- isa_model/inference/billing_tracker.py +0 -406
- isa_model/inference/services/llm/triton_llm_service.py +0 -481
- isa_model/inference/services/vision/ollama_vision_service.py +0 -194
- isa_model-0.3.4.dist-info/RECORD +0 -91
- /isa_model/core/{model_storage.py → models/model_storage.py} +0 -0
- /isa_model/inference/services/{vision → embedding}/helpers/text_splitter.py +0 -0
- {isa_model-0.3.4.dist-info → isa_model-0.3.6.dist-info}/WHEEL +0 -0
- {isa_model-0.3.4.dist-info → isa_model-0.3.6.dist-info}/top_level.txt +0 -0
isa_model/inference/services/vision/doc_analysis_service.py (new file, 640 lines)
@@ -0,0 +1,640 @@
+"""
+Document Analysis Stacked Service
+
+A comprehensive document analysis service that combines multiple vision models
+to process complex documents through a five-step pipeline:
+
+1. VLM Document Classification (GPT-4V/Claude)
+2. Table Transformer Detection
+3. Table Transformer Structure Recognition
+4. PaddleOCR Text Extraction
+5. VLM Intelligent Matching
+
+The sixth step (predefined structure mapping) is handled by specific business services
+that have their own templates and schemas for different use cases.
+
+This service orchestrates OpenAI Vision Service and ISA Vision Service
+using the BaseStackedService framework.
+"""
+
+import json
+import logging
+from typing import Dict, Any, List, Union, Optional, BinaryIO
+from datetime import datetime
+
+from .helpers.base_stacked_service import (
+    BaseStackedService, LayerConfig, LayerType, LayerResult
+)
+
+logger = logging.getLogger(__name__)
+
+class DocAnalysisStackedService(BaseStackedService):
+    """Stacked Document Analysis Service using multiple vision models (5-step pipeline)"""
+
+    def __init__(self, ai_factory, service_name: str = "doc-analysis-stacked"):
+        super().__init__(ai_factory, service_name)
+
+        # Configure the 5-step pipeline layers
+        self._configure_pipeline_layers()
+
+        logger.info(f"Initialized DocAnalysisStackedService with 5-step pipeline")
+
+    def _configure_pipeline_layers(self):
+        """Configure the 5-step document analysis pipeline"""
+
+        # Step 1: VLM Document Classification
+        self.add_layer(LayerConfig(
+            name="document_classification",
+            layer_type=LayerType.CLASSIFICATION,
+            service_type="vision",
+            model_name="gpt-4.1-nano",  # Use OpenAI Vision Service
+            parameters={
+                "task": "classification",
+                "max_tokens": 1500
+            },
+            depends_on=[],
+            timeout=30.0,
+            retry_count=2,
+            fallback_enabled=True
+        ))
+
+        # Step 2: Table Detection
+        self.add_layer(LayerConfig(
+            name="table_detection",
+            layer_type=LayerType.DETECTION,
+            service_type="vision",
+            model_name="isa-vision-doc",  # Use ISA Vision Service
+            parameters={
+                "task": "analyze_document",
+                "confidence_threshold": 0.5
+            },
+            depends_on=["document_classification"],
+            timeout=45.0,
+            retry_count=1,
+            fallback_enabled=True
+        ))
+
+        # Step 3: Table Structure Recognition
+        self.add_layer(LayerConfig(
+            name="table_structure",
+            layer_type=LayerType.DETECTION,
+            service_type="vision",
+            model_name="isa-vision-doc",
+            parameters={
+                "task": "table_structure_recognition"
+            },
+            depends_on=["table_detection"],
+            timeout=30.0,
+            retry_count=1,
+            fallback_enabled=True
+        ))
+
+        # Step 4: OCR Text Extraction
+        self.add_layer(LayerConfig(
+            name="ocr_extraction",
+            layer_type=LayerType.DETECTION,
+            service_type="vision",
+            model_name="isa-vision-doc",
+            parameters={
+                "task": "extract_text"
+            },
+            depends_on=["table_structure"],
+            timeout=60.0,
+            retry_count=2,
+            fallback_enabled=False  # OCR is critical
+        ))
+
+        # Step 5: Intelligent Matching
+        self.add_layer(LayerConfig(
+            name="intelligent_matching",
+            layer_type=LayerType.INTELLIGENCE,
+            service_type="vision",
+            model_name="gpt-4.1-nano",
+            parameters={
+                "task": "intelligent_matching",
+                "max_tokens": 2000
+            },
+            depends_on=["document_classification", "ocr_extraction"],
+            timeout=45.0,
+            retry_count=2,
+            fallback_enabled=True
+        ))
+
+    async def execute_layer_logic(self, layer: LayerConfig, service: Any, context: Dict[str, Any]) -> Any:
+        """Execute the specific logic for each layer"""
+
+        input_data = context.get("input", {})
+        results = context.get("results", {})
+
+        if layer.name == "document_classification":
+            return await self._execute_document_classification(service, input_data, layer.parameters)
+
+        elif layer.name == "table_detection":
+            return await self._execute_table_detection(service, input_data, layer.parameters)
+
+        elif layer.name == "table_structure":
+            table_regions = results.get("table_detection", {}).data.get("table_regions", [])
+            return await self._execute_table_structure(service, input_data, table_regions, layer.parameters)
+
+        elif layer.name == "ocr_extraction":
+            table_regions = results.get("table_detection", {}).data.get("table_regions", [])
+            return await self._execute_ocr_extraction(service, input_data, table_regions, layer.parameters)
+
+        elif layer.name == "intelligent_matching":
+            classification_data = results.get("document_classification", {}).data
+            ocr_data = results.get("ocr_extraction", {}).data
+            return await self._execute_intelligent_matching(service, classification_data, ocr_data, layer.parameters)
+
+        else:
+            raise ValueError(f"Unknown layer: {layer.name}")
+
+    async def _execute_document_classification(self, service, input_data: Dict[str, Any], params: Dict[str, Any]) -> Dict[str, Any]:
+        """Execute Step 1: VLM Document Classification"""
+
+        images = input_data.get("images", [])
+        if not images:
+            raise ValueError("No images provided for classification")
+
+        classification_prompt = """
+        请分析这些文档页面,识别:
+        1. 文档类型(报关单、发票、合同、装箱单等)
+        2. 公司信息(发货人、收货人)
+        3. 业务类型(出口、进口、跨境电商等)
+        4. 页面关系(是否属于同一份文档)
+
+        请以JSON格式返回结果:
+        {
+            "document_classification": {
+                "document_type": "文档类型",
+                "business_type": "业务类型",
+                "pages": [
+                    {
+                        "page_id": 1,
+                        "page_type": "页面类型",
+                        "company": "公司名称",
+                        "confidence": 0.95
+                    }
+                ]
+            }
+        }
+        """
+
+        # Analyze each page
+        page_results = []
+        for i, image in enumerate(images):
+            result = await service.analyze_image(
+                image,
+                classification_prompt,
+                max_tokens=params.get("max_tokens", 1500)
+            )
+
+            page_results.append({
+                "page_id": i + 1,
+                "analysis": result.get("text", ""),
+                "confidence": result.get("confidence", 0.8)
+            })
+
+        # Combine results for multi-page analysis if multiple pages
+        if len(images) > 1:
+            combined_prompt = f"""
+            基于以下各页面的分析结果,提供整体文档分类:
+
+            {json.dumps(page_results, ensure_ascii=False, indent=2)}
+
+            请返回最终的分类结果,格式如前面所示。
+            """
+
+            final_result = await service.analyze_image(
+                images[0],
+                combined_prompt,
+                max_tokens=1000
+            )
+        else:
+            final_result = page_results[0] if page_results else {"analysis": ""}
+
+        # Parse the classification result
+        classification = self._parse_json_response(final_result.get("text", "{}"))
+
+        return {
+            "success": True,
+            "step": "document_classification",
+            "classification": classification,
+            "page_count": len(images),
+            "processing_time": datetime.now().isoformat(),
+            "metadata": {
+                "model": "gpt-4.1-nano",
+                "service": "vlm_classification"
+            }
+        }
+
+    async def _execute_table_detection(self, service, input_data: Dict[str, Any], params: Dict[str, Any]) -> Dict[str, Any]:
+        """Execute Step 2: Table Transformer Detection"""
+
+        image = input_data.get("image") or (input_data.get("images", [None])[0])
+        if not image:
+            raise ValueError("No image provided for table detection")
+
+        # Use ISA Vision Service for table detection
+        detection_result = await service.invoke(
+            image,
+            task="analyze_document"
+        )
+
+        if detection_result.get("success"):
+            tables = detection_result.get("table_regions", [])
+
+            return {
+                "success": True,
+                "step": "table_detection",
+                "table_count": len(tables),
+                "table_regions": tables,
+                "processing_time": datetime.now().isoformat(),
+                "metadata": {
+                    "model": "table-transformer-detection",
+                    "service": "isa_vision"
+                }
+            }
+        else:
+            raise Exception(detection_result.get("error", "Table detection failed"))
+
+    async def _execute_table_structure(self, service, input_data: Dict[str, Any], table_regions: List[Dict[str, Any]], params: Dict[str, Any]) -> Dict[str, Any]:
+        """Execute Step 3: Table Transformer Structure Recognition"""
+
+        structure_results = []
+
+        for i, table_region in enumerate(table_regions):
+            # Process each table region for structure recognition
+            structure_result = {
+                "table_id": i + 1,
+                "bbox": table_region.get("bbox", []),
+                "rows": table_region.get("rows", 0),
+                "columns": table_region.get("columns", 0),
+                "cells": table_region.get("cells", []),
+                "confidence": table_region.get("confidence", 0.8)
+            }
+            structure_results.append(structure_result)
+
+        return {
+            "success": True,
+            "step": "table_structure_recognition",
+            "structures": structure_results,
+            "processing_time": datetime.now().isoformat(),
+            "metadata": {
+                "model": "table-transformer-structure",
+                "service": "isa_vision"
+            }
+        }
+
+    async def _execute_ocr_extraction(self, service, input_data: Dict[str, Any], table_regions: List[Dict[str, Any]], params: Dict[str, Any]) -> Dict[str, Any]:
+        """Execute Step 4: PaddleOCR Text Extraction"""
+
+        image = input_data.get("image") or (input_data.get("images", [None])[0])
+        if not image:
+            raise ValueError("No image provided for OCR extraction")
+
+        # Use ISA Vision Service for OCR
+        ocr_result = await service.extract_text(image)
+
+        if ocr_result.get("success"):
+            extracted_text = ocr_result.get("text", "")
+            bounding_boxes = ocr_result.get("bounding_boxes", [])
+
+            # If table regions are provided, filter OCR results to table areas
+            if table_regions:
+                table_ocr_data = []
+                for table in table_regions:
+                    table_text = self._extract_text_from_region(
+                        extracted_text,
+                        bounding_boxes,
+                        table.get("bbox", [])
+                    )
+                    table_ocr_data.append({
+                        "table_id": table.get("table_id", 0),
+                        "text": table_text,
+                        "bbox": table.get("bbox", [])
+                    })
+
+                return {
+                    "success": True,
+                    "step": "ocr_extraction",
+                    "full_text": extracted_text,
+                    "table_ocr_data": table_ocr_data,
+                    "confidence": ocr_result.get("confidence", 0.8),
+                    "processing_time": datetime.now().isoformat(),
+                    "metadata": {
+                        "model": "PaddleOCR",
+                        "service": "isa_vision"
+                    }
+                }
+            else:
+                return {
+                    "success": True,
+                    "step": "ocr_extraction",
+                    "full_text": extracted_text,
+                    "confidence": ocr_result.get("confidence", 0.8),
+                    "processing_time": datetime.now().isoformat(),
+                    "metadata": {
+                        "model": "PaddleOCR",
+                        "service": "isa_vision"
+                    }
+                }
+        else:
+            raise Exception(ocr_result.get("error", "OCR extraction failed"))
+
+    async def _execute_intelligent_matching(self, service, classification_data: Dict[str, Any], ocr_data: Dict[str, Any], params: Dict[str, Any]) -> Dict[str, Any]:
+        """Execute Step 5: VLM Intelligent Matching"""
+
+        matching_prompt = f"""
+        基于识别到的表格数据和文档上下文,请提取并整理以下信息:
+
+        文档分类信息:
+        {json.dumps(classification_data, ensure_ascii=False, indent=2)}
+
+        OCR提取的数据:
+        {json.dumps(ocr_data, ensure_ascii=False, indent=2)}
+
+        请执行以下任务:
+        1. 将表格数据转换为结构化格式
+        2. 识别关键字段的含义和关系
+        3. 处理跨表格的数据关联
+        4. 识别并纠正OCR错误
+        5. 提供标准化的结构化数据,供业务服务进行模板映射
+
+        返回JSON格式:
+        {{
+            "structured_data": {{
+                "basic_info": {{
+                    "document_type": "文档类型",
+                    "company_info": "公司信息",
+                    "date": "日期",
+                    "document_number": "文档编号"
+                }},
+                "items": [
+                    {{
+                        "item_name": "商品名称",
+                        "quantity": "数量",
+                        "unit_price": "单价",
+                        "total_price": "总价"
+                    }}
+                ],
+                "financial_summary": {{
+                    "subtotal": "小计",
+                    "tax": "税费",
+                    "total": "总计"
+                }},
+                "additional_fields": {{
+                    "field_name": "字段值"
+                }}
+            }},
+            "field_confidence": {{
+                "basic_info": 0.95,
+                "items": 0.90,
+                "financial_summary": 0.85
+            }},
+            "corrections_made": [
+                "纠正了OCR错误示例"
+            ],
+            "ready_for_mapping": true
+        }}
+        """
+
+        result = await service.analyze_image(
+            None,  # Text-only analysis
+            matching_prompt,
+            max_tokens=params.get("max_tokens", 2000)
+        )
+
+        structured_data = self._parse_json_response(result.get("text", "{}"))
+
+        return {
+            "success": True,
+            "step": "intelligent_matching",
+            "structured_data": structured_data,
+            "ready_for_business_mapping": True,
+            "processing_time": datetime.now().isoformat(),
+            "metadata": {
+                "model": "gpt-4.1-nano",
+                "service": "vlm_matching",
+                "note": "Ready for business service template mapping"
+            }
+        }
+
+    def generate_final_output(self, results: Dict[str, LayerResult]) -> Dict[str, Any]:
+        """Generate final output from all layer results"""
+
+        # Check if all critical steps succeeded
+        critical_steps = ["document_classification", "ocr_extraction", "intelligent_matching"]
+        success = all(
+            step in results and results[step].success
+            for step in critical_steps
+        )
+
+        if success:
+            # Get the final structured data from intelligent matching
+            final_data = results["intelligent_matching"].data.get("structured_data", {})
+
+            return {
+                "success": True,
+                "pipeline": "five_step_document_analysis",
+                "final_structured_data": final_data,
+                "ready_for_business_mapping": True,
+                "classification": results.get("document_classification", {}).data.get("classification", {}),
+                "table_info": {
+                    "table_count": results.get("table_detection", {}).data.get("table_count", 0),
+                    "table_regions": results.get("table_detection", {}).data.get("table_regions", [])
+                },
+                "ocr_info": {
+                    "full_text": results.get("ocr_extraction", {}).data.get("full_text", ""),
+                    "confidence": results.get("ocr_extraction", {}).data.get("confidence", 0)
+                },
+                "metadata": {
+                    "service": "doc_analysis_stacked",
+                    "pipeline_version": "5_step",
+                    "processing_complete": True
+                }
+            }
+        else:
+            # Return error information
+            failed_steps = [
+                step for step in critical_steps
+                if step not in results or not results[step].success
+            ]
+
+            return {
+                "success": False,
+                "pipeline": "five_step_document_analysis",
+                "error": f"Critical steps failed: {failed_steps}",
+                "failed_steps": failed_steps,
+                "partial_results": {
+                    name: result.data for name, result in results.items()
+                    if result.success
+                },
+                "metadata": {
+                    "service": "doc_analysis_stacked",
+                    "pipeline_version": "5_step",
+                    "processing_complete": False
+                }
+            }
+
+    async def execute_fallback(self, layer: LayerConfig, context: Dict[str, Any], error: str) -> Optional[Any]:
+        """Execute fallback logic for failed layers"""
+
+        if layer.name == "table_detection":
+            # Fallback: Continue without table regions
+            logger.warning(f"Table detection failed: {error}. Continuing without table regions.")
+            return {
+                "success": True,
+                "step": "table_detection",
+                "table_count": 0,
+                "table_regions": [],
+                "fallback": True,
+                "processing_time": datetime.now().isoformat(),
+                "metadata": {
+                    "model": "fallback",
+                    "service": "fallback",
+                    "error": error
+                }
+            }
+
+        elif layer.name == "table_structure":
+            # Fallback: Return empty structure
+            logger.warning(f"Table structure recognition failed: {error}. Using empty structure.")
+            return {
+                "success": True,
+                "step": "table_structure_recognition",
+                "structures": [],
+                "fallback": True,
+                "processing_time": datetime.now().isoformat(),
+                "metadata": {
+                    "model": "fallback",
+                    "service": "fallback",
+                    "error": error
+                }
+            }
+
+        elif layer.name == "document_classification":
+            # Fallback: Use generic classification
+            logger.warning(f"Document classification failed: {error}. Using generic classification.")
+            return {
+                "success": True,
+                "step": "document_classification",
+                "classification": {
+                    "document_classification": {
+                        "document_type": "unknown_document",
+                        "business_type": "unknown",
+                        "confidence": 0.1,
+                        "pages": []
+                    }
+                },
+                "fallback": True,
+                "processing_time": datetime.now().isoformat(),
+                "metadata": {
+                    "model": "fallback",
+                    "service": "fallback",
+                    "error": error
+                }
+            }
+
+        elif layer.name == "intelligent_matching":
+            # Fallback: Use basic structured format
+            logger.warning(f"Intelligent matching failed: {error}. Using basic structure.")
+            return {
+                "success": True,
+                "step": "intelligent_matching",
+                "structured_data": {
+                    "structured_data": {
+                        "basic_info": {},
+                        "items": [],
+                        "financial_summary": {},
+                        "additional_fields": {}
+                    },
+                    "field_confidence": {},
+                    "corrections_made": [],
+                    "ready_for_mapping": False
+                },
+                "fallback": True,
+                "processing_time": datetime.now().isoformat(),
+                "metadata": {
+                    "model": "fallback",
+                    "service": "fallback",
+                    "error": error
+                }
+            }
+
+        return None
+
+    # Helper methods
+
+    def _parse_json_response(self, response_text: str) -> Dict[str, Any]:
+        """Parse JSON response from VLM"""
+        try:
+            # Try to find JSON in the response
+            start_idx = response_text.find('{')
+            end_idx = response_text.rfind('}') + 1
+
+            if start_idx != -1 and end_idx > start_idx:
+                json_str = response_text[start_idx:end_idx]
+                return json.loads(json_str)
+            else:
+                # Fallback to empty dict
+                return {}
+
+        except json.JSONDecodeError:
+            logger.warning(f"Failed to parse JSON from response: {response_text[:200]}...")
+            return {}
+
+    def _extract_text_from_region(
+        self,
+        full_text: str,
+        bounding_boxes: List[Dict],
+        region_bbox: List[int]
+    ) -> str:
+        """Extract text that falls within a specific region"""
+        # This is a simplified implementation
+        # In practice, you would need to check if bounding boxes overlap with region
+        return full_text  # For now, return full text
+
+    # Convenience methods for direct usage
+
+    async def analyze_document(self, image: Union[str, BinaryIO], images: Optional[List[Union[str, BinaryIO]]] = None) -> Dict[str, Any]:
+        """Convenience method for complete document analysis"""
+
+        if images is None:
+            images = [image] if image else []
+
+        input_data = {
+            "image": image,
+            "images": images
+        }
+
+        return await self.invoke(input_data)
+
+    async def classify_document_only(self, images: List[Union[str, BinaryIO]]) -> Dict[str, Any]:
+        """Convenience method for document classification only"""
+
+        # Temporarily configure only classification layer
+        original_layers = self.layers.copy()
+        self.layers = [layer for layer in self.layers if layer.name == "document_classification"]
+
+        try:
+            input_data = {"images": images}
+            result = await self.invoke(input_data)
+            return result
+        finally:
+            # Restore original layers
+            self.layers = original_layers
+
+    async def extract_text_only(self, image: Union[str, BinaryIO]) -> Dict[str, Any]:
+        """Convenience method for OCR extraction only"""
+
+        # Temporarily configure only OCR layer
+        original_layers = self.layers.copy()
+        self.layers = [layer for layer in self.layers if layer.name == "ocr_extraction"]
+
+        try:
+            input_data = {"image": image}
+            result = await self.invoke(input_data)
+            return result
+        finally:
+            # Restore original layers
+            self.layers = original_layers
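
For orientation, a minimal usage sketch of the new stacked service follows. It is not part of the diff: the AIFactory class name and its no-argument constructor are assumptions (the file list above only shows that isa_model/inference/ai_factory.py exists), and the image paths are placeholders. The method names (analyze_document, classify_document_only, extract_text_only) come from the class shown above.

    # Hypothetical usage sketch -- AIFactory name/constructor and image paths are assumptions.
    import asyncio

    from isa_model.inference.ai_factory import AIFactory  # assumed class name
    from isa_model.inference.services.vision.doc_analysis_service import DocAnalysisStackedService

    async def main():
        factory = AIFactory()  # assumed no-argument constructor
        service = DocAnalysisStackedService(factory)

        # Run the full five-step pipeline over a single page image
        result = await service.analyze_document("sample_invoice.png")
        print(result.get("final_structured_data"))

        # Or run individual stages via the convenience helpers
        ocr_only = await service.extract_text_only("sample_invoice.png")
        classification = await service.classify_document_only(["sample_invoice.png"])
        print(ocr_only, classification)

    asyncio.run(main())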