tamar-file-hub-client 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4,6 +4,7 @@ from typing import Optional, Union, BinaryIO, Tuple, Any
4
4
 
5
5
  from ...schemas import File, UploadFile
6
6
  from ...utils.file_utils import get_file_mime_type
7
+ from ...utils.mime_extension_mapper import get_extension_from_mime_type_with_fallback
7
8
  from ...errors import ValidationError, FileNotFoundError
8
9
 
9
10
 
@@ -14,11 +15,16 @@ class BaseFileService:
14
15
 
15
16
  def _extract_file_info(
16
17
  self,
17
- file: Union[str, Path, BinaryIO, bytes]
18
+ file: Union[str, Path, BinaryIO, bytes],
19
+ mime_type: Optional[str] = None
18
20
  ) -> Tuple[Optional[str], bytes, int, str, str, str]:
19
21
  """
20
22
  提取文件信息并返回统一的 bytes 内容与 SHA256 哈希
21
23
 
24
+ Args:
25
+ file: 文件路径、Path对象、文件对象或字节数据
26
+ mime_type: 可选的MIME类型,如果提供则用于推断文件扩展名
27
+
22
28
  Returns:
23
29
  (文件名, 内容(bytes), 文件大小, MIME类型, 文件扩展名, 文件hash)
24
30
  """
@@ -55,9 +61,22 @@ class BaseFileService:
55
61
  # Case 2: 原始字节流
56
62
  elif isinstance(file, bytes):
57
63
  sha256 = hashlib.sha256(file).hexdigest()
58
- # 为字节流生成默认文件名
59
- file_name = f"upload_{sha256[:8]}.dat"
60
- return file_name, file, len(file), "application/octet-stream", 'dat', sha256
64
+
65
+ # 确定MIME类型和文件扩展名
66
+ if mime_type:
67
+ # 如果显式提供了MIME类型,直接使用
68
+ final_mime_type = mime_type
69
+ else:
70
+ # 如果没有提供MIME类型,尝试从文件内容推断
71
+ final_mime_type = self._detect_mime_from_content(file)
72
+
73
+ # 根据MIME类型推断文件扩展名,如果推断失败则使用默认的'dat'
74
+ file_ext = get_extension_from_mime_type_with_fallback(final_mime_type, 'dat')
75
+
76
+ # 为字节流生成文件名,使用推断出的扩展名
77
+ file_name = f"upload_{sha256[:8]}.{file_ext}"
78
+
79
+ return file_name, file, len(file), final_mime_type, file_ext, sha256
61
80
 
62
81
  # Case 3: 可读文件对象
63
82
  elif hasattr(file, 'read'):
@@ -70,12 +89,37 @@ class BaseFileService:
70
89
 
71
90
  # 如果没有文件名,生成一个默认的
72
91
  if not file_name:
73
- file_name = f"upload_{file_hash[:8]}.dat"
74
- file_type = 'dat'
75
- mime_type = "application/octet-stream"
92
+ # 确定MIME类型
93
+ if mime_type:
94
+ # 如果显式提供了MIME类型,直接使用
95
+ final_mime_type = mime_type
96
+ else:
97
+ # 如果没有提供MIME类型,尝试从文件内容推断
98
+ final_mime_type = self._detect_mime_from_content(content)
99
+
100
+ # 根据MIME类型推断文件扩展名
101
+ file_type = get_extension_from_mime_type_with_fallback(final_mime_type, 'dat')
102
+
103
+ # 生成文件名
104
+ file_name = f"upload_{file_hash[:8]}.{file_type}"
105
+ mime_type = final_mime_type
76
106
  else:
77
- file_type = Path(file_name).suffix.lstrip('.').lower()
78
- mime_type = get_file_mime_type(Path(file_name))
107
+ # 有文件名的情况下,优先使用文件名的扩展名
108
+ file_type = Path(file_name).suffix.lstrip('.').lower() or 'dat'
109
+
110
+ # 如果提供了MIME类型则使用,否则从文件名推断
111
+ if mime_type:
112
+ # 检查MIME类型与文件扩展名是否匹配,如果不匹配则使用MIME类型推断的扩展名
113
+ inferred_ext = get_extension_from_mime_type_with_fallback(mime_type, file_type)
114
+ if inferred_ext != file_type:
115
+ # MIME类型与文件扩展名不匹配,使用MIME类型推断的扩展名
116
+ file_type = inferred_ext
117
+ # 更新文件名以反映正确的扩展名
118
+ base_name = Path(file_name).stem
119
+ file_name = f"{base_name}.{file_type}"
120
+ else:
121
+ mime_type = get_file_mime_type(Path(file_name))
122
+
79
123
  file_name = Path(file_name).name
80
124
 
81
125
  return file_name, content, file_size, mime_type, file_type, file_hash
@@ -83,6 +127,163 @@ class BaseFileService:
83
127
  else:
84
128
  raise ValidationError(f"不支持的文件类型: {type(file)}")
85
129
 
130
+ def _detect_mime_from_content(self, content: bytes) -> str:
131
+ """
132
+ 从文件内容推断MIME类型
133
+ 通过文件头(magic bytes)识别常见的文件格式
134
+
135
+ Args:
136
+ content: 文件内容的字节数据
137
+
138
+ Returns:
139
+ 推断出的MIME类型,如果无法识别则返回默认值
140
+ """
141
+ if not content:
142
+ return "application/octet-stream"
143
+
144
+ # 常见文件格式的魔术字节(文件头)
145
+ magic_bytes_patterns = [
146
+ # 图片格式
147
+ (b"\x89PNG\r\n\x1a\n", "image/png"),
148
+ (b"\xff\xd8\xff\xe0", "image/jpeg"), # JFIF
149
+ (b"\xff\xd8\xff\xe1", "image/jpeg"), # EXIF
150
+ (b"\xff\xd8\xff\xe2", "image/jpeg"), # Canon
151
+ (b"\xff\xd8\xff\xe3", "image/jpeg"), # Samsung
152
+ (b"\xff\xd8\xff\xee", "image/jpeg"), # Adobe
153
+ (b"\xff\xd8\xff\xdb", "image/jpeg"), # Samsung D500
154
+ (b"\xff\xd8\xff", "image/jpeg"), # 通用JPEG标识符(放最后作为后备)
155
+ (b"RIFF", "image/webp"), # WebP文件以RIFF开头,需要进一步检查
156
+ (b"GIF87a", "image/gif"),
157
+ (b"GIF89a", "image/gif"),
158
+ (b"BM", "image/bmp"),
159
+ (b"\x00\x00\x01\x00", "image/x-icon"), # ICO
160
+ (b"\x00\x00\x02\x00", "image/x-icon"), # CUR
161
+
162
+ # 视频格式 - 大幅增强MP4检测
163
+ (b"\x00\x00\x00\x14ftyp", "video/quicktime"), # MOV (20字节)
164
+ (b"\x00\x00\x00\x15ftyp", "video/mp4"), # MP4 (21字节)
165
+ (b"\x00\x00\x00\x16ftyp", "video/mp4"), # MP4 (22字节)
166
+ (b"\x00\x00\x00\x17ftyp", "video/mp4"), # MP4 (23字节)
167
+ (b"\x00\x00\x00\x18ftyp", "video/mp4"), # MP4 (24字节)
168
+ (b"\x00\x00\x00\x19ftyp", "video/mp4"), # MP4 (25字节)
169
+ (b"\x00\x00\x00\x1aftyp", "video/mp4"), # MP4 (26字节)
170
+ (b"\x00\x00\x00\x1bftyp", "video/mp4"), # MP4 (27字节)
171
+ (b"\x00\x00\x00\x1cftyp", "video/mp4"), # MP4 (28字节)
172
+ (b"\x00\x00\x00\x1dftyp", "video/mp4"), # MP4 (29字节)
173
+ (b"\x00\x00\x00\x1eftyp", "video/mp4"), # MP4 (30字节)
174
+ (b"\x00\x00\x00\x1fftyp", "video/mp4"), # MP4 (31字节)
175
+ (b"\x00\x00\x00\x20ftyp", "video/mp4"), # MP4 (32字节)
176
+ (b"\x00\x00\x00!ftyp", "video/mp4"), # MP4 (33字节)
177
+ (b"\x00\x00\x00\"ftyp", "video/mp4"), # MP4 (34字节)
178
+ (b"\x00\x00\x00#ftyp", "video/mp4"), # MP4 (35字节)
179
+ (b"\x00\x00\x00$ftyp", "video/mp4"), # MP4 (36字节)
180
+ (b"ftypmp4", "video/mp4"), # 直接MP4标识
181
+ (b"ftypisom", "video/mp4"), # ISO Base Media
182
+ (b"ftypM4V", "video/mp4"), # iTunes M4V
183
+ (b"ftypM4A", "video/mp4"), # iTunes M4A
184
+ (b"ftypf4v", "video/mp4"), # Flash Video MP4
185
+ (b"ftypkddi", "video/mp4"), # Kodak
186
+ (b"ftypmif1", "video/mp4"), # HEIF
187
+ (b"ftypmsf1", "video/mp4"), # HEIF sequence
188
+ (b"ftypheic", "video/mp4"), # HEIC
189
+ (b"ftypheif", "video/mp4"), # HEIF
190
+ (b"ftypmj2s", "video/mp4"), # Motion JPEG 2000
191
+ (b"ftypmjp2", "video/mp4"), # Motion JPEG 2000
192
+ (b"\x1a\x45\xdf\xa3", "video/webm"), # WebM/Matroska
193
+ (b"FLV\x01", "video/x-flv"), # Flash Video
194
+ (b"\x00\x00\x01\xba", "video/mpeg"), # MPEG Program Stream
195
+ (b"\x00\x00\x01\xb3", "video/mpeg"), # MPEG Video Stream
196
+ (b"RIFF", "video/avi"), # AVI (需要进一步检查)
197
+
198
+ # 音频格式 - AAC需要放在MP3前面,因为有重叠
199
+ (b"\xff\xf1", "audio/aac"), # AAC ADTS
200
+ (b"\xff\xf9", "audio/aac"), # AAC ADTS
201
+ (b"\xff\xfb", "audio/mpeg"), # MP3 Layer III
202
+ (b"\xff\xfa", "audio/mpeg"), # MP3 Layer III
203
+ (b"\xff\xf3", "audio/mpeg"), # MP3 Layer III
204
+ (b"\xff\xf2", "audio/mpeg"), # MP3 Layer II
205
+ (b"\xff\xf0", "audio/mpeg"), # MP3 Layer reserve
206
+ (b"ID3", "audio/mpeg"), # MP3 with ID3v2
207
+ (b"RIFF", "audio/wav"), # WAV也以RIFF开头,需要进一步检查
208
+ (b"OggS", "audio/ogg"), # OGG
209
+ (b"fLaC", "audio/flac"), # FLAC
210
+ (b"ftypM4A", "audio/mp4"), # M4A (AAC in MP4)
211
+ (b"#!AMR", "audio/amr"), # AMR
212
+ (b".snd", "audio/basic"), # AU
213
+ (b"dns.", "audio/basic"), # AU (big endian)
214
+ (b"FORM", "audio/aiff"), # AIFF
215
+
216
+ # 文档格式
217
+ (b"%PDF", "application/pdf"),
218
+ (b"PK\x03\x04", "application/zip"), # ZIP
219
+ (b"PK\x05\x06", "application/zip"), # Empty ZIP
220
+ (b"PK\x07\x08", "application/zip"), # Spanned ZIP
221
+ (b"Rar!", "application/x-rar-compressed"), # RAR
222
+ (b"\x1f\x8b\x08", "application/gzip"), # GZIP
223
+ (b"BZh", "application/x-bzip2"), # BZIP2
224
+ (b"\x37\x7a\xbc\xaf\x27\x1c", "application/x-7z-compressed"), # 7Z
225
+
226
+ # Office文档
227
+ (b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1", "application/vnd.ms-office"), # MS Office 97-2003
228
+ (b"PK\x03\x04\x14\x00\x06\x00", "application/vnd.openxmlformats-officedocument"), # Office 2007+
229
+
230
+ # 可执行文件
231
+ (b"MZ", "application/x-msdownload"), # Windows EXE
232
+ (b"\x7fELF", "application/x-executable"), # Linux ELF
233
+ (b"\xfe\xed\xfa\xce", "application/x-mach-binary"), # macOS Mach-O (32-bit)
234
+ (b"\xfe\xed\xfa\xcf", "application/x-mach-binary"), # macOS Mach-O (64-bit)
235
+ ]
236
+
237
+ # 检查文件头匹配
238
+ for pattern, mime_type in magic_bytes_patterns:
239
+ if content.startswith(pattern):
240
+ # 特殊处理RIFF格式,需要进一步区分WebP和WAV
241
+ if pattern == b"RIFF" and len(content) >= 12:
242
+ # RIFF格式的第8-11字节指示具体格式
243
+ format_type = content[8:12]
244
+ if format_type == b"WEBP":
245
+ return "image/webp"
246
+ elif format_type == b"WAVE":
247
+ return "audio/wav"
248
+ elif format_type == b"AVI ":
249
+ return "video/x-msvideo"
250
+ # 如果RIFF格式无法进一步识别,返回通用二进制类型
251
+ return "application/octet-stream"
252
+ else:
253
+ return mime_type
254
+
255
+ # 检查是否是明确的文本内容(更保守的检测)
256
+ try:
257
+ text_content = content.decode('utf-8')
258
+ # 只有在明确是结构化文本格式时才识别为文本
259
+ if text_content.strip().startswith('{') and text_content.strip().endswith('}'):
260
+ # 可能是JSON
261
+ try:
262
+ import json
263
+ json.loads(text_content)
264
+ return "application/json"
265
+ except:
266
+ pass
267
+ elif text_content.strip().startswith('<') and text_content.strip().endswith('>'):
268
+ # 可能是XML/HTML
269
+ if '<!DOCTYPE html' in text_content.lower() or '<html' in text_content.lower():
270
+ return "text/html"
271
+ else:
272
+ return "application/xml"
273
+ # 对于普通文本内容,保持保守,除非明确包含文本标识
274
+ elif any(indicator in text_content.lower() for indicator in ['content-type:', 'charset=', '<!doctype', '<?xml']):
275
+ return "text/plain"
276
+ # 对于其他看起来像文本的内容,如果内容很短且看起来是人为构造的测试数据,不要改变默认行为
277
+ elif len(content) < 100 and any(test_word in text_content.lower() for test_word in ['test', 'fake', 'data', 'content']):
278
+ # 可能是测试数据,返回默认值保持兼容性
279
+ return "application/octet-stream"
280
+ except UnicodeDecodeError:
281
+ # 不是文本内容
282
+ pass
283
+
284
+ # 如果无法识别,返回默认的二进制类型
285
+ return "application/octet-stream"
286
+
86
287
  def _convert_file_info(self, proto_file: Any) -> File:
87
288
  """转换Proto文件信息为模型"""
88
289
  from ...utils.converter import timestamp_to_datetime
@@ -219,7 +219,7 @@ class SyncBlobService(BaseFileService):
219
219
  upload_url = self.http_uploader.start_resumable_session(
220
220
  url=upload_url_resp.upload_url,
221
221
  total_file_size=file_size,
222
- mine_type=mime_type,
222
+ mime_type=mime_type,
223
223
  )
224
224
 
225
225
  # 上传文件到对象存储
@@ -402,6 +402,7 @@ class SyncBlobService(BaseFileService):
402
402
  keep_original_filename: Optional[bool] = False,
403
403
  url: Optional[str] = None,
404
404
  file_name: Optional[str] = None,
405
+ mime_type: Optional[str] = None,
405
406
  request_id: Optional[str] = None,
406
407
  **metadata
407
408
  ) -> FileUploadResponse:
@@ -417,6 +418,7 @@ class SyncBlobService(BaseFileService):
417
418
  keep_original_filename: 是否保留原始文件名(默认False)
418
419
  url: 要下载并上传的URL(可选)
419
420
  file_name: 当使用url参数时指定的文件名(可选)
421
+ mime_type: MIME类型(可选,用于推断文件扩展名,特别适用于AI生成的字节数据)
420
422
  request_id: 请求ID(可选,如果不提供则自动生成)
421
423
  **metadata: 额外的元数据
422
424
 
@@ -426,6 +428,8 @@ class SyncBlobService(BaseFileService):
426
428
  Note:
427
429
  必须提供 file 或 url 参数之一
428
430
 
431
+ 当传入bytes或BinaryIO且未提供file_name时,建议提供mime_type以确保正确的文件扩展名推断
432
+
429
433
  Cache-Control 头在 GCS 直传模式(STREAM/RESUMABLE)下自动设置为 "public, max-age=86400"
430
434
  """
431
435
  # 参数验证:必须提供 file 或 url 之一
@@ -448,26 +452,38 @@ class SyncBlobService(BaseFileService):
448
452
  # 使用下载的内容作为file参数
449
453
  file = downloaded_content
450
454
 
451
- # 提取文件信息(bytes会返回默认的MIME类型,我们稍后会基于文件名重新计算)
452
- _, content, file_size, _, _, file_hash = self._extract_file_info(file)
455
+ # 基于文件名计算MIME类型
456
+ mime_type = get_file_mime_type(Path(file_name))
457
+
458
+ # 提取文件信息,传入MIME类型用于推断扩展名
459
+ _, content, file_size, _, _, file_hash = self._extract_file_info(file, mime_type)
453
460
 
454
461
  # file_name已经在上面设置了(要么是用户指定的,要么是从URL提取的)
455
462
  extracted_file_name = file_name
456
463
 
457
- # 基于文件名计算文件类型和MIME类型
464
+ # 基于文件名计算文件类型
458
465
  file_type = Path(extracted_file_name).suffix.lstrip('.').lower() if Path(
459
466
  extracted_file_name).suffix else 'dat'
460
- mime_type = get_file_mime_type(Path(extracted_file_name))
461
467
  else:
462
468
  # 解析文件参数,提取文件信息
463
- extracted_file_name, content, file_size, extract_mime_type, extract_file_type, file_hash = self._extract_file_info(
464
- file)
469
+ # 如果用户指定了文件名,先从文件名推断MIME类型,然后传给_extract_file_info
465
470
  if file_name:
471
+ # 用户指定了文件名,优先使用用户提供的MIME类型,否则从文件名推断
472
+ if mime_type:
473
+ file_name_mime_type = mime_type
474
+ else:
475
+ file_name_mime_type = get_file_mime_type(Path(file_name))
476
+ extracted_file_name, content, file_size, extract_mime_type, extract_file_type, file_hash = self._extract_file_info(
477
+ file, file_name_mime_type)
478
+ # 使用用户指定的文件名
466
479
  extracted_file_name = file_name
467
- mime_type = get_file_mime_type(file_name)
480
+ mime_type = file_name_mime_type
468
481
  file_type = Path(extracted_file_name).suffix.lstrip('.').lower() if Path(
469
482
  extracted_file_name).suffix else 'dat'
470
483
  else:
484
+ # 没有指定文件名,传入用户提供的MIME类型(如果有)
485
+ extracted_file_name, content, file_size, extract_mime_type, extract_file_type, file_hash = self._extract_file_info(
486
+ file, mime_type)
471
487
  mime_type = extract_mime_type
472
488
  file_type = extract_file_type
473
489
 
@@ -668,6 +684,7 @@ class SyncBlobService(BaseFileService):
668
684
  download_urls.append(DownloadUrlInfo(
669
685
  file_id=url_info.file_id,
670
686
  url=url_info.url,
687
+ mime_type=url_info.mime_type,
671
688
  error=url_info.error if url_info.HasField('error') else None
672
689
  ))
673
690
 
@@ -53,6 +53,11 @@ from .ip_detector import (
53
53
  UserIPContext,
54
54
  flask_auto_user_ip,
55
55
  )
56
+ from .mime_extension_mapper import (
57
+ MimeExtensionMapper,
58
+ get_extension_from_mime_type,
59
+ get_extension_from_mime_type_with_fallback,
60
+ )
56
61
 
57
62
  __all__ = [
58
63
  # 文件工具
@@ -101,4 +106,9 @@ __all__ = [
101
106
  "set_user_ip_extractor",
102
107
  "UserIPContext",
103
108
  "flask_auto_user_ip",
109
+
110
+ # MIME扩展名映射工具
111
+ "MimeExtensionMapper",
112
+ "get_extension_from_mime_type",
113
+ "get_extension_from_mime_type_with_fallback",
104
114
  ]