flexllm 0.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flexllm/__init__.py +224 -0
- flexllm/__main__.py +1096 -0
- flexllm/async_api/__init__.py +9 -0
- flexllm/async_api/concurrent_call.py +100 -0
- flexllm/async_api/concurrent_executor.py +1036 -0
- flexllm/async_api/core.py +373 -0
- flexllm/async_api/interface.py +12 -0
- flexllm/async_api/progress.py +277 -0
- flexllm/base_client.py +988 -0
- flexllm/batch_tools/__init__.py +16 -0
- flexllm/batch_tools/folder_processor.py +317 -0
- flexllm/batch_tools/table_processor.py +363 -0
- flexllm/cache/__init__.py +10 -0
- flexllm/cache/response_cache.py +293 -0
- flexllm/chain_of_thought_client.py +1120 -0
- flexllm/claudeclient.py +402 -0
- flexllm/client_pool.py +698 -0
- flexllm/geminiclient.py +563 -0
- flexllm/llm_client.py +523 -0
- flexllm/llm_parser.py +60 -0
- flexllm/mllm_client.py +559 -0
- flexllm/msg_processors/__init__.py +174 -0
- flexllm/msg_processors/image_processor.py +729 -0
- flexllm/msg_processors/image_processor_helper.py +485 -0
- flexllm/msg_processors/messages_processor.py +341 -0
- flexllm/msg_processors/unified_processor.py +1404 -0
- flexllm/openaiclient.py +256 -0
- flexllm/pricing/__init__.py +104 -0
- flexllm/pricing/data.json +1201 -0
- flexllm/pricing/updater.py +223 -0
- flexllm/provider_router.py +213 -0
- flexllm/token_counter.py +270 -0
- flexllm/utils/__init__.py +1 -0
- flexllm/utils/core.py +41 -0
- flexllm-0.3.3.dist-info/METADATA +573 -0
- flexllm-0.3.3.dist-info/RECORD +39 -0
- flexllm-0.3.3.dist-info/WHEEL +4 -0
- flexllm-0.3.3.dist-info/entry_points.txt +3 -0
- flexllm-0.3.3.dist-info/licenses/LICENSE +201 -0
|
@@ -0,0 +1,1404 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
统一的图像处理器
|
|
4
|
+
合并本地文件处理和URL处理功能,提供高性能的批量消息预处理
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import asyncio
|
|
8
|
+
import aiohttp
|
|
9
|
+
import time
|
|
10
|
+
import os
|
|
11
|
+
import base64
|
|
12
|
+
import hashlib
|
|
13
|
+
import json
|
|
14
|
+
import sys
|
|
15
|
+
import io
|
|
16
|
+
import threading
|
|
17
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
18
|
+
from pathlib import Path
|
|
19
|
+
from typing import Optional, List, Dict, Any, Tuple, Union, Callable
|
|
20
|
+
from dataclasses import dataclass
|
|
21
|
+
from functools import lru_cache
|
|
22
|
+
from copy import deepcopy
|
|
23
|
+
from collections import defaultdict
|
|
24
|
+
import gc
|
|
25
|
+
import contextlib
|
|
26
|
+
|
|
27
|
+
import numpy as np
|
|
28
|
+
from PIL import Image
|
|
29
|
+
from loguru import logger
|
|
30
|
+
|
|
31
|
+
try:
|
|
32
|
+
import cv2
|
|
33
|
+
HAS_CV2 = True
|
|
34
|
+
except ImportError:
|
|
35
|
+
cv2 = None
|
|
36
|
+
HAS_CV2 = False
|
|
37
|
+
|
|
38
|
+
# 导入缓存配置
|
|
39
|
+
try:
|
|
40
|
+
from .image_processor import (
|
|
41
|
+
ImageCacheConfig,
|
|
42
|
+
DEFAULT_CACHE_DIR,
|
|
43
|
+
get_target_size,
|
|
44
|
+
LANCZOS,
|
|
45
|
+
)
|
|
46
|
+
|
|
47
|
+
HAS_IMAGE_PROCESSOR = True
|
|
48
|
+
except ImportError:
|
|
49
|
+
HAS_IMAGE_PROCESSOR = False
|
|
50
|
+
DEFAULT_CACHE_DIR = "cache"
|
|
51
|
+
|
|
52
|
+
try:
|
|
53
|
+
from tqdm.asyncio import tqdm
|
|
54
|
+
|
|
55
|
+
TQDM_AVAILABLE = True
|
|
56
|
+
except ImportError:
|
|
57
|
+
TQDM_AVAILABLE = False
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
@contextlib.contextmanager
def suppress_stdout():
    """Temporarily discard everything written to ``sys.stdout``.

    While the ``with`` body runs, ``sys.stdout`` points at a throwaway
    in-memory buffer; the previous stream is put back on exit, including
    when the body raises.  The swap replaces the process-wide ``sys.stdout``
    object, so writes from other threads are swallowed too while it is active.
    """
    with contextlib.redirect_stdout(io.StringIO()):
        yield
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
@contextlib.contextmanager
def suppress_stderr():
    """Temporarily discard everything written to ``sys.stderr``.

    While the ``with`` body runs, ``sys.stderr`` points at a throwaway
    in-memory buffer; the previous stream is put back on exit, including
    when the body raises.  The swap replaces the process-wide ``sys.stderr``
    object, so writes from other threads are swallowed too while it is active.
    """
    with contextlib.redirect_stderr(io.StringIO()):
        yield
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
# Bookkeeping shared by every active suppress_all_output() context.
# "depth" counts the contexts currently open (across all threads) and
# "streams" remembers the real (stdout, stderr) pair captured by the first one.
_SUPPRESS_ALL_LOCK = threading.Lock()
_suppress_all_state = {"depth": 0, "streams": None}


@contextlib.contextmanager
def suppress_all_output():
    """Temporarily discard everything written to ``sys.stdout`` and ``sys.stderr``.

    ``sys.stdout`` / ``sys.stderr`` are process-global, and this context
    manager is entered from thread-pool workers.  A plain save/restore per
    context breaks when two contexts overlap: the second one saves the
    first one's throwaway buffer as the "previous" stream and re-installs it
    on exit, after which the real streams are lost for the rest of the
    process.  To avoid that, only the outermost context swaps the streams and
    only the last context to exit restores them; a lock protects the counter.
    """
    with _SUPPRESS_ALL_LOCK:
        if _suppress_all_state["depth"] == 0:
            # First active context: remember the real streams, install sinks.
            _suppress_all_state["streams"] = (sys.stdout, sys.stderr)
            sys.stdout, sys.stderr = io.StringIO(), io.StringIO()
        _suppress_all_state["depth"] += 1
    try:
        yield
    finally:
        with _SUPPRESS_ALL_LOCK:
            _suppress_all_state["depth"] -= 1
            if _suppress_all_state["depth"] == 0:
                # Last active context: put the real streams back.
                sys.stdout, sys.stderr = _suppress_all_state["streams"]
                _suppress_all_state["streams"] = None
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def safe_repr_source(source: str, max_length: int = 100) -> str:
    """Return a short, log-friendly description of an image source.

    Base64 payloads are replaced by a summary of their length so that log
    lines never carry the raw data; any other string is truncated to
    ``max_length`` characters followed by ``"..."``.

    Args:
        source: Path, URL, data URI or raw base64 string.
        max_length: Longest plain string returned without truncation.

    Returns:
        The placeholder for an empty source, a base64 summary, or the
        (possibly truncated) source itself.
    """
    if not source:
        return "空源"

    # data:image/<type>;base64,<payload>  ->  "[image/<type> ... <payload length>]"
    separator = ";base64,"
    if source.startswith("data:image/") and separator in source:
        header, _, payload = source.partition(separator)
        mime_type = header.replace("data:", "")
        return f"[{mime_type} base64数据 长度:{len(payload)}]"

    # A long string made only of base64 alphabet characters is treated as a
    # bare base64 payload.
    base64_alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/="
    if len(source) > 100 and set(source).issubset(base64_alphabet):
        return f"[base64数据 长度:{len(source)}]"

    # Ordinary string: show it, truncated when too long.
    if len(source) > max_length:
        return source[:max_length] + "..."
    return source
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def safe_repr_error(error_msg: str, max_length: int = 200) -> str:
    """Return an error message that is safe to log.

    Every ``data:image/...;base64,...`` URI embedded in the message is
    replaced by a short summary of its MIME type and payload length, then the
    message is truncated to ``max_length`` characters followed by ``"..."``.

    Args:
        error_msg: Raw error text; falsy values are returned unchanged.
        max_length: Longest message returned without truncation.

    Returns:
        The sanitised, possibly truncated message.
    """
    if not error_msg:
        return error_msg

    # Only pay for the regex when the message can contain a data URI.
    if "data:image/" in error_msg and ";base64," in error_msg:
        import re

        def _summarize(match):
            uri = match.group(0)
            header, marker, payload = uri.partition(";base64,")
            if not marker:
                return uri
            mime_type = header.replace("data:", "")
            return f"[{mime_type} base64数据 长度:{len(payload)}]"

        error_msg = re.sub(
            r"data:image/[^;]+;base64,[A-Za-z0-9+/]+=*", _summarize, error_msg
        )

    # Truncate overly long messages.
    if len(error_msg) > max_length:
        return error_msg[:max_length] + "..."
    return error_msg
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
@dataclass
class UnifiedProcessorConfig:
    """Settings for the unified image processor.

    Instances are normally built through one of the preset constructors
    (``default``, ``high_performance``, ``memory_optimized``, ``auto_detect``)
    or converted from a legacy ``ImageCacheConfig``.
    """

    # Threading and concurrency
    max_workers: int = 8
    max_concurrent: int = 10
    enable_multithreading: bool = True

    # Image encoding quality
    jpeg_quality: int = 95
    png_compression: int = 1
    webp_quality: int = 90

    # Caching
    memory_cache_size_mb: int = 500
    enable_disk_cache: bool = True
    disk_cache_dir: str = DEFAULT_CACHE_DIR
    force_refresh_disk_cache: bool = False
    retry_failed_disk_cache: bool = False

    # Performance
    prefetch_size: int = 50
    enable_simd: bool = True
    suppress_opencv_output: bool = True

    # Timeouts
    single_file_timeout: float = 10.0
    batch_timeout: float = 60.0
    network_timeout: float = 15.0

    @classmethod
    def default(cls) -> "UnifiedProcessorConfig":
        """Return a configuration with every field at its default value."""
        return cls()

    @classmethod
    def high_performance(cls) -> "UnifiedProcessorConfig":
        """Return a preset with more workers, more concurrency and a larger cache."""
        preset = {
            "max_workers": 16,
            "max_concurrent": 32,
            "jpeg_quality": 95,
            "png_compression": 3,
            "memory_cache_size_mb": 1000,
            "prefetch_size": 100,
        }
        return cls(**preset)

    @classmethod
    def memory_optimized(cls) -> "UnifiedProcessorConfig":
        """Return a preset with fewer workers, a smaller cache and stronger compression."""
        preset = {
            "max_workers": 4,
            "max_concurrent": 6,
            "jpeg_quality": 80,
            "png_compression": 6,
            "memory_cache_size_mb": 200,
            "prefetch_size": 20,
        }
        return cls(**preset)

    @classmethod
    def from_image_cache_config(cls, cache_config: "ImageCacheConfig") -> "UnifiedProcessorConfig":
        """Build a configuration from a legacy ``ImageCacheConfig``.

        Only the disk-cache fields are taken from ``cache_config``; every
        other field keeps its default.
        """
        disk_settings = {
            "enable_disk_cache": cache_config.enabled,
            "disk_cache_dir": cache_config.cache_dir,
            "force_refresh_disk_cache": cache_config.force_refresh,
            "retry_failed_disk_cache": cache_config.retry_failed,
        }
        return cls(**disk_settings)

    @classmethod
    def auto_detect(cls) -> "UnifiedProcessorConfig":
        """Return a configuration sized from the machine's CPU count and RAM.

        Falls back to ``default()`` when ``psutil`` is not installed or when
        probing the system fails for any other reason.
        """
        try:
            import psutil
            import os

            cores = os.cpu_count() or 4
            total_gib = psutil.virtual_memory().total / (1024**3)

            # (memory cache MB, prefetch size, JPEG quality) per RAM tier:
            # 16 GiB and up, 8-16 GiB, below 8 GiB.
            if total_gib >= 16:
                tier = (1000, 100, 95)
            elif total_gib >= 8:
                tier = (500, 50, 90)
            else:
                tier = (200, 20, 80)
            cache_mb, prefetch, quality = tier

            return cls(
                # Worker and concurrency counts scale with the core count,
                # clamped to fixed lower and upper bounds.
                max_workers=max(4, min(cores, 24)),
                max_concurrent=max(6, min(cores * 2, 40)),
                memory_cache_size_mb=cache_mb,
                prefetch_size=prefetch,
                jpeg_quality=quality,
                png_compression=3,
                enable_disk_cache=True,  # disk cache stays on in this preset
            )
        except Exception:
            # Covers a missing psutil (ImportError) as well as probing errors.
            return cls.default()
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
class UnifiedMemoryCache:
    """Size-bounded in-memory cache for processed image strings.

    Entries are keyed by an MD5 digest of the source string together with the
    processing parameters (and, for existing local files, the modification
    time).  Reads and writes of the cache state happen under a re-entrant
    lock.  When an insert would push the total stored size above
    ``max_size_bytes``, the entries with the oldest access time are evicted
    first.
    """

    def __init__(self, max_size_mb: int = 500):
        """Create an empty cache holding at most ``max_size_mb`` MiB of data."""
        self.max_size_bytes = max_size_mb * 1024 * 1024
        self.cache = {}  # cache key -> stored string
        self.access_times = {}  # cache key -> time.time() of last get/put
        self.cache_sizes = {}  # cache key -> UTF-8 size of the stored string
        self.current_size = 0  # sum of cache_sizes values
        self.lock = threading.RLock()
        self.hit_count = 0
        self.miss_count = 0

    def _remove_entry(self, cache_key: str) -> None:
        """Drop ``cache_key`` from all bookkeeping dicts; the caller holds the lock."""
        self.cache.pop(cache_key, None)
        self.access_times.pop(cache_key, None)
        self.current_size -= self.cache_sizes.pop(cache_key, 0)

    def _evict_lru(self):
        """Evict the entry with the oldest access time; the caller holds the lock."""
        if not self.cache:
            return

        # Choose among the keys actually stored so that every call removes one
        # entry and the eviction loop in put() always makes progress.
        lru_key = min(self.cache, key=lambda k: self.access_times.get(k, 0.0))
        self._remove_entry(lru_key)

    def _generate_cache_key(
        self,
        source: str,
        max_width: Optional[int] = None,
        max_height: Optional[int] = None,
        max_pixels: Optional[int] = None,
        **kwargs,
    ) -> str:
        """Build the cache key for ``source`` and its processing parameters.

        Parameters whose value is ``None`` do not contribute to the key.  If
        ``source`` names an existing local path, its modification time is
        included so that an edited file gets a new key.

        Returns:
            Hex MD5 digest of the joined key parts; if building the parts
            fails, the digest of ``source`` alone.
        """
        try:
            key_parts = [source]

            if max_width is not None:
                key_parts.append(f"w:{max_width}")
            if max_height is not None:
                key_parts.append(f"h:{max_height}")
            if max_pixels is not None:
                key_parts.append(f"p:{max_pixels}")

            if os.path.exists(source):
                try:
                    key_parts.append(f"mtime:{os.path.getmtime(source)}")
                except OSError:
                    # The file vanished or is unreadable; key without mtime.
                    pass

            # Remaining parameters, in sorted order so the key is stable.
            for name in sorted(kwargs):
                if kwargs[name] is not None:
                    key_parts.append(f"{name}:{kwargs[name]}")

            return hashlib.md5("|".join(key_parts).encode()).hexdigest()
        except Exception:
            return hashlib.md5(source.encode()).hexdigest()

    def get(self, source: str, **kwargs) -> Optional[str]:
        """Return the cached string for ``source``/``kwargs``, or ``None`` on a miss.

        A hit refreshes the entry's access time; hit and miss counters are
        updated either way.
        """
        cache_key = self._generate_cache_key(source, **kwargs)

        with self.lock:
            if cache_key not in self.cache:
                self.miss_count += 1
                return None
            self.access_times[cache_key] = time.time()
            self.hit_count += 1
            return self.cache[cache_key]

    def put(self, source: str, data: str, **kwargs):
        """Store ``data`` for ``source``/``kwargs``, evicting old entries if needed.

        Values larger than half the cache capacity, and values that cannot be
        encoded as UTF-8 text, are silently not cached.
        """
        cache_key = self._generate_cache_key(source, **kwargs)

        with self.lock:
            try:
                data_size = len(data.encode("utf-8"))
            except (AttributeError, TypeError, UnicodeError):
                # Caching is best-effort: skip anything that is not text.
                return

            # Do not let one huge value flush the whole cache.
            if data_size > self.max_size_bytes * 0.5:
                return

            # Replacing an existing entry: release its bytes first, otherwise
            # its size would be counted twice in current_size.
            if cache_key in self.cache:
                self._remove_entry(cache_key)

            # Make room for the new value.
            while self.cache and self.current_size + data_size > self.max_size_bytes:
                self._evict_lru()

            self.cache[cache_key] = data
            self.access_times[cache_key] = time.time()
            self.cache_sizes[cache_key] = data_size
            self.current_size += data_size

    def clear(self):
        """Remove every entry and run a garbage collection pass."""
        with self.lock:
            self.cache.clear()
            self.access_times.clear()
            self.cache_sizes.clear()
            self.current_size = 0
            gc.collect()

    def get_stats(self) -> Dict[str, Any]:
        """Return a snapshot of size, usage and hit-rate statistics."""
        with self.lock:
            total_requests = self.hit_count + self.miss_count
            hit_rate = (
                (self.hit_count / total_requests * 100) if total_requests > 0 else 0
            )

            if hit_rate > 80:
                efficiency = "excellent"
            elif hit_rate > 60:
                efficiency = "good"
            elif hit_rate > 40:
                efficiency = "fair"
            else:
                efficiency = "poor"

            return {
                "cached_items": len(self.cache),
                "current_size_mb": self.current_size / 1024 / 1024,
                "max_size_mb": self.max_size_bytes / 1024 / 1024,
                "usage_percent": (self.current_size / self.max_size_bytes * 100)
                if self.max_size_bytes > 0
                else 0,
                "hit_count": self.hit_count,
                "miss_count": self.miss_count,
                "hit_rate_percent": hit_rate,
                "total_requests": total_requests,
                "avg_item_size_kb": (self.current_size / 1024 / len(self.cache))
                if self.cache
                else 0,
                "cache_efficiency": efficiency,
            }
|
|
411
|
+
|
|
412
|
+
|
|
413
|
+
class UnifiedImageProcessor:
|
|
414
|
+
"""统一的图像处理器,支持本地文件和URL"""
|
|
415
|
+
|
|
416
|
+
def __init__(self, config: Optional[UnifiedProcessorConfig] = None):
    """Set up caches, locks and counters for a new processor.

    Args:
        config: Processor settings; the default configuration is used when
            omitted.
    """
    self.config = config or UnifiedProcessorConfig.default()
    self.memory_cache = UnifiedMemoryCache(self.config.memory_cache_size_mb)

    # Disk cache settings are only built when the feature is enabled and the
    # image_processor module could be imported.
    disk_cache_available = self.config.enable_disk_cache and HAS_IMAGE_PROCESSOR
    self.disk_cache_config = (
        ImageCacheConfig(
            enabled=True,
            cache_dir=self.config.disk_cache_dir,
            force_refresh=self.config.force_refresh_disk_cache,
            retry_failed=self.config.retry_failed_disk_cache,
        )
        if disk_cache_available
        else None
    )

    # Thread pool (created on first use) and locks.
    self.executor = None
    self._executor_initialized = False
    self._init_lock = threading.Lock()
    self.processing_locks: Dict[str, asyncio.Lock] = {}
    self.lock = asyncio.Lock()

    # Performance counters.
    self._start_time = time.time()
    self._total_processed = 0
    self._total_processing_time = 0.0

    # Apply OpenCV runtime settings.
    self._init_opencv_optimizations()
|
|
444
|
+
|
|
445
|
+
def _init_opencv_optimizations(self):
    """Apply OpenCV runtime settings on a best-effort basis.

    Does nothing when OpenCV is not installed.  Any error raised while
    applying the settings is ignored so that construction never fails
    because of them.
    """
    if not HAS_CV2:
        return

    try:
        if self.config.suppress_opencv_output:
            output_guard = suppress_all_output()
        else:
            output_guard = contextlib.nullcontext()

        with output_guard:
            cv2.setUseOptimized(True)
            cv2.setNumThreads(self.config.max_workers)
            cv2.setLogLevel(cv2.LOG_LEVEL_ERROR)

            simd_requested = self.config.enable_simd
            if simd_requested and hasattr(cv2, "useOptimized"):
                cv2.useOptimized()
    except Exception:
        pass
|
|
463
|
+
|
|
464
|
+
def _get_executor(self) -> ThreadPoolExecutor:
|
|
465
|
+
"""获取线程池执行器(延迟初始化)"""
|
|
466
|
+
if not self._executor_initialized:
|
|
467
|
+
with self._init_lock:
|
|
468
|
+
if not self._executor_initialized:
|
|
469
|
+
self.executor = ThreadPoolExecutor(
|
|
470
|
+
max_workers=self.config.max_workers,
|
|
471
|
+
thread_name_prefix="unified_processor",
|
|
472
|
+
)
|
|
473
|
+
self._executor_initialized = True
|
|
474
|
+
return self.executor
|
|
475
|
+
|
|
476
|
+
async def _get_processing_lock(self, cache_key: str) -> asyncio.Lock:
|
|
477
|
+
"""获取文件处理锁"""
|
|
478
|
+
async with self.lock:
|
|
479
|
+
if cache_key not in self.processing_locks:
|
|
480
|
+
self.processing_locks[cache_key] = asyncio.Lock()
|
|
481
|
+
return self.processing_locks[cache_key]
|
|
482
|
+
|
|
483
|
+
def _detect_image_format(self, file_path: str) -> str:
|
|
484
|
+
"""检测图像格式"""
|
|
485
|
+
try:
|
|
486
|
+
ext = Path(file_path).suffix.lower()
|
|
487
|
+
format_map = {
|
|
488
|
+
".jpg": "JPEG",
|
|
489
|
+
".jpeg": "JPEG",
|
|
490
|
+
".png": "PNG",
|
|
491
|
+
".webp": "WEBP",
|
|
492
|
+
".bmp": "BMP",
|
|
493
|
+
".tiff": "TIFF",
|
|
494
|
+
".tif": "TIFF",
|
|
495
|
+
}
|
|
496
|
+
return format_map.get(ext, "JPEG")
|
|
497
|
+
except Exception:
|
|
498
|
+
return "JPEG"
|
|
499
|
+
|
|
500
|
+
def _get_encode_params(self, format_type: str) -> List[int]:
    """Return the ``cv2.imencode`` parameter list for ``format_type``.

    JPEG, PNG and WEBP map to their quality/compression flag followed by the
    configured value.  Any other format, a missing OpenCV install, or any
    error while looking the values up yields an empty list.
    """
    if not HAS_CV2:
        return []

    # format name -> (cv2 flag attribute, config attribute holding its value)
    option_names = {
        "JPEG": ("IMWRITE_JPEG_QUALITY", "jpeg_quality"),
        "PNG": ("IMWRITE_PNG_COMPRESSION", "png_compression"),
        "WEBP": ("IMWRITE_WEBP_QUALITY", "webp_quality"),
    }

    try:
        names = option_names.get(format_type)
        if names is None:
            return []
        flag_name, setting_name = names
        # Resolved lazily so that only the requested format's flag is touched.
        return [getattr(cv2, flag_name), getattr(self.config, setting_name)]
    except Exception:
        return []
|
|
515
|
+
|
|
516
|
+
def _calculate_target_size(
|
|
517
|
+
self,
|
|
518
|
+
original_width: int,
|
|
519
|
+
original_height: int,
|
|
520
|
+
max_width: Optional[int],
|
|
521
|
+
max_height: Optional[int],
|
|
522
|
+
max_pixels: Optional[int],
|
|
523
|
+
) -> Tuple[int, int]:
|
|
524
|
+
"""计算目标尺寸"""
|
|
525
|
+
try:
|
|
526
|
+
width, height = original_width, original_height
|
|
527
|
+
|
|
528
|
+
# 应用最大宽度/高度限制
|
|
529
|
+
if max_width and width > max_width:
|
|
530
|
+
height = int(height * max_width / width)
|
|
531
|
+
width = max_width
|
|
532
|
+
|
|
533
|
+
if max_height and height > max_height:
|
|
534
|
+
width = int(width * max_height / height)
|
|
535
|
+
height = max_height
|
|
536
|
+
|
|
537
|
+
# 应用最大像素限制
|
|
538
|
+
if max_pixels and (width * height > max_pixels):
|
|
539
|
+
ratio = (max_pixels / (width * height)) ** 0.5
|
|
540
|
+
width = int(width * ratio)
|
|
541
|
+
height = int(height * ratio)
|
|
542
|
+
|
|
543
|
+
return max(1, width), max(1, height)
|
|
544
|
+
except Exception:
|
|
545
|
+
return original_width, original_height
|
|
546
|
+
|
|
547
|
+
def _process_local_file_sync(
    self,
    file_path: str,
    max_width: Optional[int] = None,
    max_height: Optional[int] = None,
    max_pixels: Optional[int] = None,
    return_with_mime: bool = True,
) -> str:
    """Read, optionally downscale, re-encode and base64-encode a local image.

    This is the blocking worker that ``process_single_source`` runs in the
    thread pool.

    Args:
        file_path: Path of the image file to read.
        max_width: Optional upper bound on the output width.
        max_height: Optional upper bound on the output height.
        max_pixels: Optional upper bound on width * height.
        return_with_mime: When true, return a ``data:<mime>;base64,...`` URI;
            otherwise return the bare base64 string.

    Returns:
        The encoded image as a string.

    Raises:
        ImportError: OpenCV is not installed.
        ValueError: The file could not be read, resized or encoded; the
            original exception is attached as ``__cause__``.
    """
    if not HAS_CV2:
        raise ImportError(
            "图像处理功能需要安装 opencv-python。请运行: pip install flexllm[image]"
        )
    try:
        with (
            suppress_all_output()
            if self.config.suppress_opencv_output
            else contextlib.nullcontext()
        ):
            # Read the image with OpenCV; imread signals failure with None.
            img = cv2.imread(file_path, cv2.IMREAD_COLOR)
            if img is None:
                raise ValueError(f"无法读取图像文件: {file_path}")

            original_height, original_width = img.shape[:2]

            # Work out the size that satisfies the requested limits.
            target_width, target_height = self._calculate_target_size(
                original_width, original_height, max_width, max_height, max_pixels
            )

            # Resize only when the target differs from the original.
            if target_width != original_width or target_height != original_height:
                img = cv2.resize(
                    img,
                    (target_width, target_height),
                    interpolation=cv2.INTER_LANCZOS4,
                )

            # Re-encode in the format implied by the file extension.
            format_type = self._detect_image_format(file_path)
            encode_params = self._get_encode_params(format_type)

            ext = f".{format_type.lower()}"
            if format_type == "JPEG":
                ext = ".jpg"

            success, buffer = cv2.imencode(ext, img, encode_params)
            if not success:
                raise ValueError(f"图像编码失败: {file_path}")

            base64_data = base64.b64encode(buffer.tobytes()).decode("utf-8")

            # Optionally wrap the payload in a data URI.
            if return_with_mime:
                mime_type = f"image/{format_type.lower()}"
                result = f"data:{mime_type};base64,{base64_data}"
            else:
                result = base64_data

            return result

    except Exception as e:
        # Chain explicitly so the underlying failure stays visible in tracebacks.
        raise ValueError(f"处理本地文件失败: {file_path}, 错误: {str(e)}") from e
|
|
612
|
+
|
|
613
|
+
async def _process_url_async(
    self,
    url: str,
    session: aiohttp.ClientSession,
    max_width: Optional[int] = None,
    max_height: Optional[int] = None,
    max_pixels: Optional[int] = None,
    return_with_mime: bool = True,
) -> str:
    """Fetch an image from ``url`` and return it base64-encoded.

    When the ``image_processor`` module is available the work is delegated to
    its ``encode_image_to_base64`` together with this processor's disk cache
    settings.  Otherwise the bytes are downloaded as-is; the size limits are
    not applied on that fallback path.

    Args:
        url: Address of the image.
        session: aiohttp session used for the request.
        max_width: Optional upper bound on the output width.
        max_height: Optional upper bound on the output height.
        max_pixels: Optional upper bound on width * height.
        return_with_mime: When true, return a ``data:<mime>;base64,...`` URI.

    Returns:
        The encoded image as a string.

    Raises:
        ValueError: Any failure, including a non-200 response on the fallback
            path; the original exception is attached as ``__cause__``.
    """
    try:
        # Prefer image_processor when present (it handles the disk cache).
        if HAS_IMAGE_PROCESSOR:
            from .image_processor import encode_image_to_base64

            return await encode_image_to_base64(
                url, session, max_width, max_height, max_pixels,
                return_with_mime, cache_config=self.disk_cache_config
            )
        else:
            # Minimal fallback: download and encode the raw bytes.
            timeout = aiohttp.ClientTimeout(total=self.config.network_timeout)
            async with session.get(url, timeout=timeout) as response:
                if response.status == 200:
                    content = await response.read()
                    base64_data = base64.b64encode(content).decode("utf-8")

                    if return_with_mime:
                        content_type = response.headers.get(
                            "content-type", "image/jpeg"
                        )
                        return f"data:{content_type};base64,{base64_data}"
                    return base64_data
                else:
                    raise ValueError(f"HTTP {response.status}")
    except Exception as e:
        # Chain explicitly so the underlying failure stays visible in tracebacks.
        raise ValueError(
            f"处理URL失败: {safe_repr_source(url)}, 错误: {safe_repr_error(str(e))}"
        ) from e
|
|
652
|
+
|
|
653
|
+
async def process_single_source(
    self,
    source: str,
    session: Optional[aiohttp.ClientSession] = None,
    max_width: Optional[int] = None,
    max_height: Optional[int] = None,
    max_pixels: Optional[int] = None,
    return_with_mime: bool = True,
) -> str:
    """Process one image source (local file or URL) into a base64 string.

    The memory cache is consulted first.  On a miss, a per-key lock ensures
    that concurrent requests for the same source and parameters do the work
    only once; the cache is checked again after the lock is acquired.

    Args:
        source: Local path, ``file://`` URI, or URL.
        session: aiohttp session for URL sources; a temporary one is created
            when omitted.
        max_width: Optional upper bound on the output width.
        max_height: Optional upper bound on the output height.
        max_pixels: Optional upper bound on width * height.
        return_with_mime: When true, the result is a data URI.

    Returns:
        The encoded image, or ``""`` when processing timed out or failed
        (the problem is logged rather than raised).
    """
    # The same parameters identify the entry for every cache call below.
    cache_params = {
        "max_width": max_width,
        "max_height": max_height,
        "max_pixels": max_pixels,
        "return_with_mime": return_with_mime,
    }

    # 1. Fast path: memory cache.
    cached_result = self.memory_cache.get(source, **cache_params)
    if cached_result is not None:
        return cached_result

    # 2. Serialise work per cache key.
    cache_key = self.memory_cache._generate_cache_key(source, **cache_params)
    file_lock = await self._get_processing_lock(cache_key)

    async with file_lock:
        # Another coroutine may have filled the cache while we waited.
        cached_result = self.memory_cache.get(source, **cache_params)
        if cached_result is not None:
            return cached_result

        start_time = time.time()

        async def fetch_with(active_session):
            """Download and encode the URL source within the network timeout."""
            return await asyncio.wait_for(
                self._process_url_async(
                    source,
                    active_session,
                    max_width,
                    max_height,
                    max_pixels,
                    return_with_mime,
                ),
                timeout=self.config.network_timeout,
            )

        try:
            if os.path.exists(source) or source.startswith("file://"):
                file_path = source[7:] if source.startswith("file://") else source

                # Local files are decoded in the thread pool.  We are inside a
                # coroutine, so the running loop is the one to schedule on.
                loop = asyncio.get_running_loop()
                result = await asyncio.wait_for(
                    loop.run_in_executor(
                        self._get_executor(),
                        self._process_local_file_sync,
                        file_path,
                        max_width,
                        max_height,
                        max_pixels,
                        return_with_mime,
                    ),
                    timeout=self.config.single_file_timeout,
                )
            elif session is None:
                # No session supplied: use a temporary one for this request.
                async with aiohttp.ClientSession() as temp_session:
                    result = await fetch_with(temp_session)
            else:
                result = await fetch_with(session)

            # Remember the result and update the counters.
            self.memory_cache.put(source, result, **cache_params)
            self._total_processed += 1
            self._total_processing_time += time.time() - start_time

            return result

        except asyncio.TimeoutError:
            logger.warning(f"处理超时: {safe_repr_source(source)}")
            return ""
        except Exception as e:
            logger.error(
                f"处理失败: {safe_repr_source(source)}, 错误: {safe_repr_error(str(e))}"
            )
            return ""
|
|
773
|
+
|
|
774
|
+
async def process_batch(
    self,
    sources: List[str],
    session: Optional[aiohttp.ClientSession] = None,
    max_width: Optional[int] = None,
    max_height: Optional[int] = None,
    max_pixels: Optional[int] = None,
    return_with_mime: bool = True,
) -> List[str]:
    """Process many image sources concurrently.

    Duplicate sources are processed once.  At most ``config.max_concurrent``
    sources are in flight at a time and the whole batch is bounded by
    ``config.batch_timeout``.  When the batch times out, sources that already
    finished keep their result and only the unfinished ones are cancelled.

    Args:
        sources: Local paths and/or URLs; duplicates are allowed.
        session: aiohttp session shared by all URL downloads.
        max_width: Optional upper bound on the output width.
        max_height: Optional upper bound on the output height.
        max_pixels: Optional upper bound on width * height.
        return_with_mime: When true, results are data URIs.

    Returns:
        One string per entry of ``sources``, in the same order; ``""`` marks a
        source that failed, raised, or did not finish in time.
    """
    if not sources:
        return []

    # De-duplicate while keeping first-seen order.
    unique_sources = list(dict.fromkeys(sources))

    # Bound the number of sources processed at the same time.
    semaphore = asyncio.Semaphore(self.config.max_concurrent)

    async def process_single_with_semaphore(source: str) -> str:
        async with semaphore:
            return await self.process_single_source(
                source, session, max_width, max_height, max_pixels, return_with_mime
            )

    # task -> source, so finished tasks can be mapped back to their input.
    task_sources = {
        asyncio.ensure_future(process_single_with_semaphore(source)): source
        for source in unique_sources
    }

    try:
        done, pending = await asyncio.wait(
            list(task_sources), timeout=self.config.batch_timeout
        )
    finally:
        # Runs on timeout and when this coroutine is itself cancelled:
        # never leave unfinished work running in the background.
        for task in task_sources:
            if not task.done():
                task.cancel()

    if pending:
        logger.warning("批处理超时")
        # Let the cancelled tasks unwind before returning.
        await asyncio.gather(*pending, return_exceptions=True)

    # Collect the results of everything that finished normally.
    result_mapping = {}
    for task in done:
        if task.cancelled():
            continue
        source = task_sources[task]
        error = task.exception()
        if error is not None:
            logger.error(
                f"处理失败: {safe_repr_source(source)}, 错误: {safe_repr_error(str(error))}"
            )
            continue
        result_mapping[source] = task.result()

    # Answer in the caller's original order, duplicates included.
    return [result_mapping.get(source, "") for source in sources]
|
|
832
|
+
|
|
833
|
+
def get_cache_stats(self) -> Dict[str, Any]:
    """Return cache statistics.

    The result always has a ``"memory_cache"`` entry.  A ``"disk_cache"``
    entry is added only when the disk cache is enabled in the configuration
    and its settings object exists and is enabled.
    """
    stats = {"memory_cache": self.memory_cache.get_stats()}

    if not self.config.enable_disk_cache:
        return stats

    disk_config = self.disk_cache_config
    if not disk_config or not disk_config.enabled:
        return stats

    stats["disk_cache"] = self._get_disk_cache_stats()
    return stats
|
|
843
|
+
|
|
844
|
+
def _get_disk_cache_stats(self) -> Dict[str, Any]:
|
|
845
|
+
"""获取磁盘缓存统计信息"""
|
|
846
|
+
if not self.disk_cache_config or not self.disk_cache_config.enabled:
|
|
847
|
+
return {"enabled": False}
|
|
848
|
+
|
|
849
|
+
try:
|
|
850
|
+
cache_dir = Path(self.disk_cache_config.cache_dir)
|
|
851
|
+
if not cache_dir.exists():
|
|
852
|
+
return {"enabled": True, "cached_files": 0, "total_size_mb": 0}
|
|
853
|
+
|
|
854
|
+
# 统计缓存文件
|
|
855
|
+
image_files = list(cache_dir.glob("*"))
|
|
856
|
+
image_files = [f for f in image_files if f.suffix.lower() in ['.jpg', '.jpeg', '.png', '.webp', '.gif']]
|
|
857
|
+
error_files = list(cache_dir.glob("*.error"))
|
|
858
|
+
|
|
859
|
+
# 计算总大小
|
|
860
|
+
total_size = sum(f.stat().st_size for f in image_files if f.is_file())
|
|
861
|
+
|
|
862
|
+
return {
|
|
863
|
+
"enabled": True,
|
|
864
|
+
"cache_dir": str(cache_dir),
|
|
865
|
+
"cached_images": len(image_files),
|
|
866
|
+
"error_cache_files": len(error_files),
|
|
867
|
+
"total_files": len(image_files) + len(error_files),
|
|
868
|
+
"total_size_mb": total_size / 1024 / 1024,
|
|
869
|
+
"force_refresh": self.disk_cache_config.force_refresh,
|
|
870
|
+
"retry_failed": self.disk_cache_config.retry_failed,
|
|
871
|
+
}
|
|
872
|
+
except Exception as e:
|
|
873
|
+
return {"enabled": True, "error": str(e)}
|
|
874
|
+
|
|
875
|
+
def get_performance_stats(self) -> Dict[str, Any]:
    """Return a snapshot of throughput counters, cache stats and key settings.

    Returns:
        A dict with the processed-item count, the accumulated processing
        time, the seconds elapsed since ``self._start_time``, the mean
        processing time per item, items per second of uptime, the result of
        ``get_cache_stats()`` and a small ``config`` sub-dict. Both averages
        fall back to ``0`` when their divisor is not positive.
    """
    elapsed = time.time() - self._start_time

    # Guard both ratios against a zero divisor.
    mean_time = 0
    if self._total_processed > 0:
        mean_time = self._total_processing_time / self._total_processed

    rate = 0
    if elapsed > 0:
        rate = self._total_processed / elapsed

    return {
        "total_processed": self._total_processed,
        "total_processing_time": self._total_processing_time,
        "uptime_seconds": elapsed,
        "avg_processing_time": mean_time,
        "throughput_per_second": rate,
        "cache_stats": self.get_cache_stats(),
        "config": {
            "max_workers": self.config.max_workers,
            "max_concurrent": self.config.max_concurrent,
            "memory_cache_size_mb": self.config.memory_cache_size_mb,
            "jpeg_quality": self.config.jpeg_quality,
        },
    }
|
|
899
|
+
|
|
900
|
+
def clear_cache(self, clear_disk_cache: bool = False):
    """Empty the in-memory cache and, on request, the disk cache too.

    Args:
        clear_disk_cache: When true, the disk cache is cleared as well,
            provided a disk cache configuration exists and is enabled.
    """
    # The memory cache is always cleared.
    self.memory_cache.clear()

    if not clear_disk_cache:
        return

    disk_cfg = self.disk_cache_config
    if disk_cfg and disk_cfg.enabled:
        self._clear_disk_cache()
|
|
908
|
+
|
|
909
|
+
def _clear_disk_cache(self):
    """Delete every regular file directly inside the disk cache directory.

    Sub-directories are left untouched. The operation is best-effort: it
    never raises, and problems are reported as log warnings. A file that
    cannot be removed no longer aborts the sweep; the remaining files are
    still deleted, and the success message is only logged when every
    removal succeeded.
    """
    try:
        cache_dir = Path(self.disk_cache_config.cache_dir)
        if not cache_dir.exists():
            return

        all_removed = True
        for cache_file in cache_dir.iterdir():
            try:
                if cache_file.is_file():
                    # missing_ok: the file may vanish between listing and
                    # removal (e.g. another process clearing the same cache).
                    cache_file.unlink(missing_ok=True)
            except OSError as e:
                all_removed = False
                logger.warning(f"清空磁盘缓存失败: {e}")

        if all_removed:
            logger.info(f"已清空磁盘缓存目录: {cache_dir}")
    except Exception as e:
        logger.warning(f"清空磁盘缓存失败: {e}")
|
|
922
|
+
|
|
923
|
+
def cleanup(self):
    """Release the resources held by this processor.

    Shuts down the worker executor (waiting for running tasks) and clears
    the in-memory cache. Cleanup is best-effort and never raises. The two
    steps are guarded independently so that a failing executor shutdown
    does not prevent the cache from being cleared, and failures are logged
    instead of being dropped silently.
    """
    try:
        if self.executor:
            self.executor.shutdown(wait=True)
    except Exception as e:
        logger.warning(f"关闭执行器失败: {e}")

    try:
        self.clear_cache()
    except Exception as e:
        logger.warning(f"清空缓存失败: {e}")
|
|
931
|
+
|
|
932
|
+
|
|
933
|
+
# Module-wide processor instance; stays None until
# get_global_unified_processor() builds it on first use.
_global_unified_processor = None
# Guards creation of the instance above.
_unified_processor_lock = threading.Lock()
|
|
936
|
+
|
|
937
|
+
|
|
938
|
+
def get_global_unified_processor(
    config: Optional[UnifiedProcessorConfig] = None,
) -> UnifiedImageProcessor:
    """Return the shared module-level processor, creating it on first use.

    Args:
        config: Configuration passed to ``UnifiedImageProcessor`` when the
            shared instance has to be created. It is ignored once an
            instance already exists.

    Returns:
        The shared ``UnifiedImageProcessor`` instance.
    """
    global _global_unified_processor

    # Fast path: the instance already exists, no locking needed.
    if _global_unified_processor is not None:
        return _global_unified_processor

    with _unified_processor_lock:
        # Re-check under the lock; another thread may have created it
        # while this one was waiting.
        if _global_unified_processor is None:
            _global_unified_processor = UnifiedImageProcessor(config)

    return _global_unified_processor
|
|
950
|
+
|
|
951
|
+
|
|
952
|
+
async def process_content_recursive(
    content: Any,
    session: Optional[aiohttp.ClientSession] = None,
    processor: Optional[UnifiedImageProcessor] = None,
    **kwargs,
):
    """Walk nested dicts/lists and replace string ``"url"`` values in place.

    Every dict entry whose key is ``"url"`` and whose value is a string is
    handed to ``processor.process_single_source``; a truthy result overwrites
    the original value. All other dict values and all list items are visited
    recursively. Anything that is neither a dict nor a list is ignored.

    Args:
        content: The structure to walk; dicts inside it are mutated.
        session: HTTP session forwarded to the processor.
        processor: Processor to use; defaults to the global instance.
        **kwargs: Extra options forwarded to ``process_single_source``.
    """
    if processor is None:
        processor = get_global_unified_processor()

    if isinstance(content, list):
        for element in content:
            await process_content_recursive(element, session, processor, **kwargs)
        return

    if not isinstance(content, dict):
        return

    for key, value in content.items():
        is_image_url = key == "url" and isinstance(value, str)
        if not is_image_url:
            await process_content_recursive(value, session, processor, **kwargs)
            continue

        # A failure on one URL is logged and the original value is kept, so
        # the rest of the structure still gets processed.
        try:
            encoded = await processor.process_single_source(value, session, **kwargs)
            if encoded:
                content[key] = encoded
        except Exception as e:
            logger.error(
                f"处理URL失败 {safe_repr_source(value)}: {safe_repr_error(str(e))}"
            )
|
|
981
|
+
|
|
982
|
+
|
|
983
|
+
async def unified_messages_preprocess(
    messages: List[Dict[str, Any]],
    inplace: bool = False,
    processor_config: Optional[UnifiedProcessorConfig] = None,
    **kwargs,
) -> List[Dict[str, Any]]:
    """
    Preprocess the image URLs of a single message list.

    Args:
        messages: One list of message dicts.
        inplace: When false (the default) a deep copy is processed and
            returned; when true the given list is modified directly.
        processor_config: If given, a new ``UnifiedImageProcessor`` is built
            from it for this call; otherwise the global processor is used.
        **kwargs: Extra options forwarded to the image processing calls.

    Returns:
        The processed message list. If preprocessing fails, the error is
        logged and the list is returned in whatever state it reached.
    """
    # NOTE(review): a processor built from processor_config is not cleaned up
    # in this function — confirm whether its resources need explicit release.
    processor = (
        UnifiedImageProcessor(processor_config)
        if processor_config
        else get_global_unified_processor()
    )

    try:
        if not inplace:
            messages = deepcopy(messages)

        # One HTTP session is shared by every image in this message list.
        async with aiohttp.ClientSession() as http_session:
            for message in messages:
                await process_content_recursive(
                    message, http_session, processor, **kwargs
                )
    except Exception as e:
        logger.error(f"消息预处理失败: {e}")

    return messages
|
|
1023
|
+
|
|
1024
|
+
|
|
1025
|
+
async def unified_batch_messages_preprocess(
    messages_list: Union[List[List[Dict[str, Any]]], Any],
    max_concurrent: int = 10,
    inplace: bool = False,
    processor_config: Optional[UnifiedProcessorConfig] = None,
    as_iterator: bool = False,
    progress_callback: Optional[Callable] = None,
    show_progress: bool = False,
    progress_desc: str = "统一处理消息",
    max_width: Optional[int] = None,
    max_height: Optional[int] = None,
    max_pixels: Optional[int] = None,
    **kwargs,
) -> Union[List[List[Dict[str, Any]]], Any]:
    """
    Preprocess many message lists concurrently.

    Intended to be API-compatible with messages_processor.py; handles both
    local files and URLs.

    Args:
        messages_list: The message lists to process. May be a list/tuple, a
            plain iterable, or an async iterable.
        max_concurrent: Maximum number of message lists processed at once.
        inplace: Whether to modify the given message lists directly.
        processor_config: Optional processor configuration.
        as_iterator: When true, return an async generator that yields each
            processed message list instead of returning one list.
        progress_callback: Optional callback. If its signature has exactly
            one parameter it receives a dict (current, total, percentage,
            elapsed/estimated times, rate); otherwise it is called as
            ``callback(current, total)``. Exceptions it raises are logged.
        show_progress: Show a tqdm progress bar when tqdm is available (in
            iterator mode only if the total is known and non-zero).
        progress_desc: Progress bar description.
        max_width: Maximum image width.
        max_height: Maximum image height.
        max_pixels: Maximum number of pixels.
        **kwargs: Extra options forwarded to the per-list preprocessing.

    Returns:
        With ``as_iterator=False``: a list of processed message lists in
        input order (``[]`` for empty input). With ``as_iterator=True``: an
        async generator; for list/iterable input results come in input
        order, for async-iterable input in completion order. A message list
        whose preprocessing raises is logged and returned unprocessed.
    """

    # Create or fetch the processor.
    # NOTE(review): this instance is only used for the diagnostic log below;
    # the per-list work goes through unified_messages_preprocess, which
    # resolves its own processor from processor_config.
    if processor_config:
        processor = UnifiedImageProcessor(processor_config)
    else:
        processor = get_global_unified_processor()

    # Diagnostic output belongs in the log, not on stdout.
    logger.debug(f"{processor.config=}")

    # Process one message list under the concurrency limit.
    async def process_single_batch(messages, semaphore, index=None):
        async with semaphore:
            try:
                processed_messages = await unified_messages_preprocess(
                    messages,
                    inplace=inplace,
                    processor_config=processor_config,
                    max_width=max_width,
                    max_height=max_height,
                    max_pixels=max_pixels,
                    **kwargs,
                )
            except Exception as e:
                # Fall back to the untouched input so one bad list does not
                # fail the whole batch.
                logger.error(f"批处理错误 {index}: {e}")
                processed_messages = messages
            return processed_messages, index

    # Progress reporting.
    def report_progress(current: int, total: int, start_time: float = None):
        if progress_callback:
            try:
                # Timing information.
                elapsed_time = time.time() - start_time if start_time else 0

                # Extended progress information.
                progress_info = {
                    "current": current,
                    "total": total,
                    "percentage": (current / total * 100) if total > 0 else 0,
                    "elapsed_time": elapsed_time,
                    "estimated_total_time": (elapsed_time / current * total)
                    if current > 0
                    else 0,
                    "estimated_remaining_time": (
                        elapsed_time / current * (total - current)
                    )
                    if current > 0
                    else 0,
                    "rate": current / elapsed_time if elapsed_time > 0 else 0,
                }

                # Single-parameter callbacks get the extended dict; anything
                # else keeps the legacy (current, total) call.
                import inspect

                sig = inspect.signature(progress_callback)
                if len(sig.parameters) == 1:
                    progress_callback(progress_info)
                else:
                    progress_callback(current, total)

            except Exception as e:
                logger.warning(f"进度回调函数执行失败: {e}")

    # Iterator mode.
    if as_iterator:

        async def process_iterator():
            semaphore = asyncio.Semaphore(max_concurrent)

            # Detect async iterables.
            is_async_iterator = hasattr(messages_list, "__aiter__")

            processed_count = 0
            total_count = None
            messages_to_process = messages_list

            # Determine the total up front when possible.
            if not is_async_iterator and hasattr(messages_list, "__len__"):
                total_count = len(messages_list)
            elif not is_async_iterator:
                # A plain iterator has to be materialized to learn its length.
                messages_list_converted = list(messages_list)
                total_count = len(messages_list_converted)
                messages_to_process = iter(messages_list_converted)

            # Progress bar.
            pbar = None
            start_time = time.time()
            if show_progress and TQDM_AVAILABLE and total_count:
                bar_format = "{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]"
                pbar = tqdm(
                    total=total_count,
                    desc=progress_desc,
                    unit="批次",
                    bar_format=bar_format,
                    ncols=100,
                    miniters=1,
                )

            try:
                # Async iterable input.
                if is_async_iterator:
                    # asyncio.wait() hands back sets, so the in-flight tasks
                    # are kept in a set throughout. The previous list-based
                    # code called .append() on that set after the first wait
                    # and raised AttributeError.
                    pending_tasks = set()
                    task_index = 0
                    async for messages in messages_to_process:
                        # At the concurrency limit: wait for a task to finish.
                        if len(pending_tasks) >= max_concurrent:
                            done, pending_tasks = await asyncio.wait(
                                pending_tasks, return_when=asyncio.FIRST_COMPLETED
                            )
                            for task in done:
                                result, _ = await task
                                processed_count += 1
                                if pbar is not None:
                                    pbar.update(1)
                                report_progress(
                                    processed_count,
                                    total_count or processed_count,
                                    start_time,
                                )
                                yield result

                        # Schedule the new task.
                        task = asyncio.create_task(
                            process_single_batch(messages, semaphore, task_index)
                        )
                        pending_tasks.add(task)
                        task_index += 1

                    # Drain whatever is still running.
                    if pending_tasks:
                        for task in asyncio.as_completed(pending_tasks):
                            result, _ = await task
                            processed_count += 1
                            if pbar is not None:
                                pbar.update(1)
                            report_progress(
                                processed_count,
                                total_count or processed_count,
                                start_time,
                            )
                            yield result

                # Sync iterable or list input.
                else:
                    # Materialize so the input can be sliced into chunks.
                    if not isinstance(messages_to_process, (list, tuple)):
                        messages_list_converted = list(messages_to_process)
                    else:
                        messages_list_converted = messages_to_process

                    if not total_count:
                        total_count = len(messages_list_converted)
                        if pbar is not None:
                            pbar.total = total_count

                    # Process chunk by chunk.
                    for i in range(0, len(messages_list_converted), max_concurrent):
                        batch = messages_list_converted[i : i + max_concurrent]
                        tasks = [
                            process_single_batch(messages, semaphore, i + j)
                            for j, messages in enumerate(batch)
                        ]
                        results = await asyncio.gather(*tasks)

                        for result, _ in results:
                            processed_count += 1
                            if pbar is not None:
                                pbar.update(1)
                            report_progress(processed_count, total_count, start_time)
                            yield result

            finally:
                if pbar is not None:
                    pbar.close()

        return process_iterator()

    # List mode: collect everything and return a list.
    else:
        semaphore = asyncio.Semaphore(max_concurrent)

        # Detect async iterables.
        is_async_iterator = hasattr(messages_list, "__aiter__")

        # Materialize the input.
        if is_async_iterator:
            messages_list_converted = []
            async for messages in messages_list:
                messages_list_converted.append(messages)
        elif not isinstance(messages_list, (list, tuple)):
            messages_list_converted = list(messages_list)
        else:
            messages_list_converted = messages_list

        if not messages_list_converted:
            return []

        total_count = len(messages_list_converted)
        processed_count = 0

        # Progress bar.
        pbar = None
        start_time = time.time()
        if show_progress and TQDM_AVAILABLE:
            bar_format = (
                "{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]"
            )
            pbar = tqdm(
                total=total_count,
                desc=progress_desc,
                unit=" items",
                bar_format=bar_format,
                ncols=100,
                miniters=1,
            )

        try:
            # Work in chunks so progress can be reported along the way.
            results = []
            for i in range(0, len(messages_list_converted), max_concurrent):
                batch = messages_list_converted[i : i + max_concurrent]
                tasks = [
                    process_single_batch(messages, semaphore, i + j)
                    for j, messages in enumerate(batch)
                ]
                batch_results = await asyncio.gather(*tasks)

                for result, _ in batch_results:
                    results.append(result)
                    processed_count += 1
                    if pbar is not None:
                        pbar.update(1)
                    report_progress(processed_count, total_count, start_time)

            return results

        finally:
            if pbar is not None:
                pbar.close()
|
|
1301
|
+
|
|
1302
|
+
|
|
1303
|
+
# Backward-compatible aliases, kept for full API compatibility.
messages_preprocess = unified_messages_preprocess
batch_messages_preprocess = unified_batch_messages_preprocess
batch_process_messages = unified_batch_messages_preprocess

# Dedicated aliases; all of them resolve to the unified batch implementation.
optimized_batch_messages_preprocess = unified_batch_messages_preprocess
improved_batch_messages_preprocess = unified_batch_messages_preprocess
opencv_batch_messages_preprocess = unified_batch_messages_preprocess
|
|
1312
|
+
|
|
1313
|
+
|
|
1314
|
+
# 便捷函数
|
|
1315
|
+
async def unified_encode_image_to_base64(
    image_source: Union[str, List[str]],
    session: Optional[aiohttp.ClientSession] = None,
    max_width: Optional[int] = None,
    max_height: Optional[int] = None,
    max_pixels: Optional[int] = None,
    return_with_mime: bool = True,
    processor_config: Optional[UnifiedProcessorConfig] = None,
) -> Union[str, List[str]]:
    """
    Encode one image, or a list of images, to Base64.

    Args:
        image_source: A single path/URL string, or a list of them.
        session: Optional HTTP session.
        max_width: Maximum width.
        max_height: Maximum height.
        max_pixels: Maximum number of pixels.
        return_with_mime: Whether the result carries a MIME prefix.
        processor_config: If given, a new processor is built from it;
            otherwise the global processor is used.

    Returns:
        The result of ``process_single_source`` for a string input, or of
        ``process_batch`` for a list input.

    Raises:
        ValueError: If ``image_source`` is neither a ``str`` nor a ``list``.
    """
    if processor_config:
        processor = UnifiedImageProcessor(processor_config)
    else:
        processor = get_global_unified_processor()

    # Pick the single-item or batch entry point; both take the same arguments.
    if isinstance(image_source, str):
        handler = processor.process_single_source
    elif isinstance(image_source, list):
        handler = processor.process_batch
    else:
        raise ValueError(f"不支持的图像源类型: {type(image_source)}")

    return await handler(
        image_source, session, max_width, max_height, max_pixels, return_with_mime
    )
|
|
1355
|
+
|
|
1356
|
+
|
|
1357
|
+
# Backward-compatible aliases for the unified encoder.
encode_image_to_base64 = unified_encode_image_to_base64
safe_optimized_encode_image_to_base64 = unified_encode_image_to_base64
|
|
1360
|
+
|
|
1361
|
+
|
|
1362
|
+
def cleanup_global_unified_processor():
    """Clean up and drop the global unified processor, if one exists.

    The global reference is detached under the same lock that guards its
    creation in ``get_global_unified_processor``, so a teardown cannot
    interleave with a concurrent creation. ``cleanup()`` itself runs after
    the lock is released, because it may block while the executor shuts
    down. Calling this when no global processor exists is a no-op.
    """
    global _global_unified_processor

    with _unified_processor_lock:
        processor, _global_unified_processor = _global_unified_processor, None

    if processor is not None:
        processor.cleanup()
|
|
1368
|
+
|
|
1369
|
+
|
|
1370
|
+
# 示例用法
|
|
1371
|
+
# Example usage
if __name__ == "__main__":

    async def test_unified_processor():
        """Fetch one remote image through the processor and print a summary."""
        processor = UnifiedImageProcessor(UnifiedProcessorConfig.high_performance())

        # Local files go through the same call, e.g.
        # process_single_source("test_image.jpg", max_width=800, max_height=600).

        # URL test
        async with aiohttp.ClientSession() as session:
            url_result = await processor.process_single_source(
                "https://p2.itc.cn/q_70/images03/20230402/1853ae33e80b499ebc120426a80b19d3.jpeg",
                session,
                max_width=80,
                max_height=60,
            )
            # Print safely: never dump the whole base64 payload.
            print(f"URL处理完成,长度: {len(url_result)}")
            if len(url_result) <= 100:
                print(f"完整结果: {url_result}")
            else:
                print(f"结果预览: {url_result[:100]}...")

        # Cache statistics
        stats = processor.get_cache_stats()
        print(f"缓存统计: {stats}")

        processor.cleanup()

    asyncio.run(test_unified_processor())
|