tretool 0.2.1__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tretool/encoding.py CHANGED
@@ -1,92 +1,421 @@
1
1
  import chardet
2
+ import logging
3
+ import warnings
4
+ from typing import Union, BinaryIO, Optional, List, Dict, Tuple
5
+ from io import BytesIO, SEEK_SET
6
+ import re
7
+ from collections import OrderedDict
2
8
 
3
- from typing import Union, BinaryIO
9
# Logging setup: create this module's logger and configure the root logger
# at INFO level.
# NOTE(review): calling basicConfig() at import time changes logging for the
# whole host application; kept because existing users may rely on it.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
4
12
 
5
- def detect_encoding(
6
- input_data: Union[bytes, str, BinaryIO],
7
- sample_size: int = 1024,
8
- fallback_encoding: str = 'utf-8'
9
- ) -> str:
13
class EncodingDetectionError(Exception):
    """Base class for all errors raised by the encoding detector."""
16
+
17
class InvalidInputError(EncodingDetectionError):
    """Raised when the input data is empty, unreadable or of an unsupported type."""
20
+
21
class EncodingValidationError(EncodingDetectionError):
    """Raised when data cannot be decoded with the selected encoding."""
24
+
25
class SmartEncodingDetector:
    """Multi-stage text-encoding detector.

    Detection runs through up to four stages and returns the first hit:

    1. byte-order-mark (BOM) lookup (optional),
    2. statistical analysis with ``chardet``,
    3. trial decoding against a short list of common codecs (optional),
    4. round-trip validation against the preferred-encoding list.

    If every stage fails, ``'utf-8'`` is returned and a ``RuntimeWarning``
    is issued.
    """

    # BOM byte sequences, keyed by the codec name reported for them.
    # Note that the UTF-16 LE BOM is a prefix of the UTF-32 LE BOM, so lookup
    # must test the longest sequences first (see ``_detect_bom``).
    BOM_MARKERS: Dict[str, bytes] = {
        'utf-8-sig': b'\xef\xbb\xbf',
        'utf-16': b'\xff\xfe',
        'utf-16be': b'\xfe\xff',
        'utf-32': b'\xff\xfe\x00\x00',
        'utf-32be': b'\x00\x00\xfe\xff',
    }

    # Default preferred encodings, highest priority first.
    DEFAULT_PREFERRED_ENCODINGS = [
        'utf-8',         # most common Unicode encoding
        'gb18030',       # Chinese national standard
        'gbk',           # Chinese, extended
        'gb2312',        # Chinese, basic set
        'big5',          # Traditional Chinese
        'shift_jis',     # Japanese
        'euc-jp',        # Japanese
        'euc-kr',        # Korean
        'iso-8859-1',    # Western Europe
        'windows-1252',  # Western Europe, extended
        'ascii',         # plain ASCII
    ]

    # Common alias -> canonical name mapping (keys are lower-case).
    ENCODING_ALIASES: Dict[str, str] = {
        'ascii': 'utf-8',  # ASCII is a subset of UTF-8
        'latin1': 'iso-8859-1',
        'cp936': 'gbk',
        'ms936': 'gbk',
        'csgb2312': 'gb2312',
        'ms950': 'big5',
        'cp950': 'big5',
    }

    def __init__(
        self,
        preferred_encodings: Optional[List[str]] = None,
        min_confidence: float = 0.7,
        enable_bom_detection: bool = True,
        enable_heuristics: bool = True,
        enable_validation: bool = True,
        sample_size: int = 4096,
        max_retries: int = 3
    ):
        """Create a detector.

        Args:
            preferred_encodings: Custom preferred-encoding list; the class
                default is used when omitted or empty.
            min_confidence: Minimum chardet confidence to accept a result.
                Clamped to the range 0.0-1.0.
            enable_bom_detection: Whether to run the BOM stage.
            enable_heuristics: Whether to run the trial-decoding stage.
            enable_validation: Whether chardet results must also pass
                round-trip validation.
            sample_size: Number of bytes to sample (minimum 128).
            max_retries: Stored (minimum 1) for backward compatibility.
                Decoding is deterministic, so no retries are performed.
        """
        self.preferred_encodings = self._normalize_encodings(
            preferred_encodings or self.DEFAULT_PREFERRED_ENCODINGS
        )
        # Clamp at both ends; the original only enforced the upper bound.
        self.min_confidence = max(0.0, min(min_confidence, 1.0))
        self.enable_bom_detection = enable_bom_detection
        self.enable_heuristics = enable_heuristics
        self.enable_validation = enable_validation
        self.sample_size = max(sample_size, 128)
        self.max_retries = max(max_retries, 1)

        # Pre-compiled script patterns (CJK ideographs, kana, hangul
        # syllables), kept as instance attributes for compatibility.
        self._chinese_pattern = re.compile(r'[\u4e00-\u9fff]')
        self._japanese_pattern = re.compile(r'[\u3040-\u309f\u30a0-\u30ff]')
        self._korean_pattern = re.compile(r'[\uac00-\ud7a3]')

    def detect(self, input_data: Union[bytes, str, BinaryIO], sample_size: Optional[int] = None) -> str:
        """Detect the encoding of *input_data*.

        Args:
            input_data: ``bytes``, ``str`` or a binary file-like object.
            sample_size: Optional override of the instance sample size.

        Returns:
            The detected encoding name.

        Raises:
            InvalidInputError: The input is empty, unreadable or of an
                unsupported type.
            EncodingDetectionError: Any other failure during detection.
        """
        sample_size = sample_size or self.sample_size

        try:
            raw_data = self._get_sample_data(input_data, sample_size)
            if not raw_data:
                raise InvalidInputError("输入数据为空")
            return self._detect_in_stages(raw_data)
        except EncodingDetectionError as e:
            # Already one of our own errors (e.g. InvalidInputError): keep
            # its specific type instead of re-wrapping it in the base class.
            logger.error(f"编码检测失败: {str(e)}")
            raise
        except Exception as e:
            logger.error(f"编码检测失败: {str(e)}")
            raise EncodingDetectionError(f"无法检测编码: {str(e)}") from e

    def detect_and_decode(self, input_data: Union[bytes, BinaryIO], sample_size: Optional[int] = None) -> str:
        """Detect the encoding of *input_data* and return it decoded.

        For file-like objects the encoding is detected from a leading
        sample, the whole remaining content is then decoded, and the stream
        position is restored to where it was on entry.

        Args:
            input_data: ``bytes`` or a seekable binary file-like object.
            sample_size: Optional override of the instance sample size.

        Returns:
            The decoded text. Undecodable bytes are replaced rather than
            raising (see ``_safe_decode``).

        Raises:
            InvalidInputError: The input is not bytes / a binary stream.
            EncodingDetectionError: Detection failed.
        """
        if isinstance(input_data, bytes):
            encoding = self.detect(input_data, sample_size)
            return self._safe_decode(input_data, encoding)

        if hasattr(input_data, 'read'):
            original_pos = input_data.tell()

            try:
                # Detect from a leading sample only.
                sample = input_data.read(sample_size or self.sample_size)
                if not isinstance(sample, bytes):
                    raise InvalidInputError("文件对象必须返回bytes")

                encoding = self.detect(sample, sample_size)

                # Rewind and read everything from the entry position.
                input_data.seek(original_pos, SEEK_SET)
                full_data = input_data.read()

                if not isinstance(full_data, bytes):
                    raise InvalidInputError("文件对象必须返回bytes")

                return self._safe_decode(full_data, encoding)

            finally:
                # Always leave the stream where the caller had it.
                input_data.seek(original_pos, SEEK_SET)

        raise InvalidInputError("输入必须是bytes或文件对象")

    def _detect_in_stages(self, data: bytes) -> str:
        """Run the detection stages in order and return the first result."""
        # Stage 1: BOM lookup.
        if self.enable_bom_detection:
            bom_encoding = self._detect_bom(data)
            if bom_encoding:
                logger.debug(f"通过BOM检测到编码: {bom_encoding}")
                return bom_encoding

        # Stage 2: statistical analysis with chardet.
        detected_encoding = self._detect_with_chardet(data)
        if detected_encoding:
            logger.debug(f"chardet检测到编码: {detected_encoding}")
            return detected_encoding

        # Stage 3: trial decoding against common codecs.
        if self.enable_heuristics:
            heuristic_encoding = self._apply_heuristics(data)
            if heuristic_encoding:
                logger.debug(f"启发式检测到编码: {heuristic_encoding}")
                return heuristic_encoding

        # Stage 4: first preferred encoding that round-trips the data.
        for encoding in self.preferred_encodings:
            if self._validate_encoding(data, encoding):
                logger.debug(f"通过优先编码列表验证: {encoding}")
                return encoding

        # Final fallback.
        warnings.warn("无法确定精确编码,使用utf-8作为回退", RuntimeWarning)
        return 'utf-8'

    def _get_sample_data(self, input_data: Union[bytes, str, BinaryIO], sample_size: int) -> bytes:
        """Return up to *sample_size* bytes of *input_data* for analysis.

        File-like objects are read from their current position, which is
        restored afterwards even if the read fails.

        Raises:
            InvalidInputError: The string holds characters outside latin-1,
                the stream cannot be read or rewound, the stream yields
                non-bytes data, or the input type is unsupported.
        """
        if isinstance(input_data, bytes):
            return input_data[:sample_size]

        if isinstance(input_data, str):
            try:
                # latin-1 maps code points 0-255 one-to-one onto bytes, so
                # slicing before encoding yields the same sample. Characters
                # above U+00FF cannot be encoded and are reported as invalid.
                return input_data[:sample_size].encode('latin-1')
            except UnicodeError as e:
                raise InvalidInputError(f"字符串转换失败: {str(e)}") from e

        if hasattr(input_data, 'read'):
            try:
                original_pos = input_data.tell()
                try:
                    data = input_data.read(sample_size)
                finally:
                    # Rewind even when read() raised.
                    input_data.seek(original_pos, SEEK_SET)
            except Exception as e:
                raise InvalidInputError(f"文件读取失败: {str(e)}") from e

            # Checked outside the try so this error is not wrapped a second
            # time in a "read failed" message.
            if not isinstance(data, bytes):
                raise InvalidInputError("文件对象必须返回bytes")
            return data

        raise InvalidInputError("不支持的输入数据类型")

    def _detect_bom(self, data: bytes) -> Optional[str]:
        """Return the codec name for the BOM *data* starts with, else None."""
        # Longest markers first: the UTF-16 LE BOM (FF FE) is a prefix of the
        # UTF-32 LE BOM (FF FE 00 00) and would otherwise shadow it.
        markers = sorted(
            self.BOM_MARKERS.items(),
            key=lambda item: len(item[1]),
            reverse=True,
        )
        for encoding, bom in markers:
            if data.startswith(bom):
                return encoding
        return None

    def _detect_with_chardet(self, data: bytes) -> Optional[str]:
        """Return chardet's guess if it is confident (and valid), else None."""
        try:
            result = chardet.detect(data)
        except Exception as e:
            logger.warning(f"chardet检测失败: {str(e)}")
            return None

        # chardet reports encoding=None when it cannot decide.
        encoding = result.get('encoding')
        if not encoding:
            return None
        confidence = result.get('confidence') or 0.0

        encoding = encoding.lower()
        encoding = self.ENCODING_ALIASES.get(encoding, encoding)

        if confidence < self.min_confidence:
            return None
        if self.enable_validation and not self._validate_encoding(data, encoding):
            return None
        return encoding

    def _apply_heuristics(self, data: bytes) -> Optional[str]:
        """Return the first common codec that strictly decodes *data*.

        Candidates are tried in a fixed order; ``None`` is returned when
        none of them decodes the data without error.
        """
        for encoding in ('utf-8', 'gb18030', 'big5', 'shift_jis', 'euc-kr'):
            try:
                data.decode(encoding, errors='strict')
            except UnicodeError:
                continue
            # The earlier per-script checks all returned this same codec,
            # so a successful strict decode is the only real criterion.
            return encoding

        return None

    def _validate_encoding(self, data: bytes, encoding: str) -> bool:
        """Return True if *data* decodes with *encoding* and round-trips.

        Empty data never validates. UTF-8 variants only need to decode,
        because ``utf-8-sig`` adds or strips a BOM and is not byte-for-byte
        reversible.
        """
        if not data:
            return False

        try:
            decoded = data.decode(encoding, errors='strict')
            reencoded = decoded.encode(encoding, errors='strict')
        except UnicodeError as e:
            logger.debug(f"编码验证失败 {encoding}: {str(e)}")
            return False
        except (LookupError, TypeError, ValueError) as e:
            # Unknown or malformed codec name: retrying cannot help.
            logger.warning(f"编码验证异常 {encoding}: {str(e)}")
            return False

        # Accept both 'utf-8...' and the 'utf_8...' spelling produced by
        # _normalize_encodings.
        if encoding.lower().replace('_', '-').startswith('utf-8'):
            return True

        return reencoded == data

    def _safe_decode(self, data: bytes, encoding: str) -> str:
        """Decode *data* with *encoding*, replacing undecodable bytes.

        Raises:
            EncodingValidationError: Decoding failed for a reason other
                than bad byte sequences (e.g. an unknown codec name).
        """
        try:
            return data.decode(encoding, errors='strict')
        except UnicodeError as e:
            logger.warning(f"解码失败 {encoding}, 尝试替代方案: {str(e)}")
            # Fall back to a lossy decode instead of failing.
            return data.decode(encoding, errors='replace')
        except Exception as e:
            logger.error(f"解码异常: {str(e)}")
            raise EncodingValidationError(f"无法用 {encoding} 解码数据: {str(e)}") from e

    def _normalize_encodings(self, encodings: List[str]) -> List[str]:
        """Return *encodings* lower-cased, alias-resolved and de-duplicated.

        Hyphens become underscores and first-occurrence order is preserved.
        """
        seen = set()
        normalized = []

        for enc in encodings:
            lower_enc = enc.lower().replace('-', '_')
            # Alias targets contain hyphens ('utf-8'), so normalise them
            # as well; otherwise 'ascii' -> 'utf-8' duplicates 'utf_8'.
            norm_enc = self.ENCODING_ALIASES.get(lower_enc, lower_enc).replace('-', '_')

            if norm_enc not in seen:
                seen.add(norm_enc)
                normalized.append(norm_enc)

        return normalized
58
366
 
59
367
 
60
- def _get_sample_data(
368
# Module-level detector instance built with all default settings.
_default_detector = SmartEncodingDetector()
370
+
371
+ # 便捷函数
372
def detect_encoding(
    input_data: Union[bytes, str, BinaryIO],
    sample_size: Optional[int] = None,
    preferred_encodings: Optional[List[str]] = None,
    min_confidence: float = 0.7
) -> str:
    """Detect the character encoding of *input_data* (convenience wrapper).

    Builds a fresh ``SmartEncodingDetector`` for the call and delegates to
    its ``detect`` method.

    Args:
        input_data: Data to analyse.
        sample_size: Number of bytes to sample; the detector default is
            used when omitted.
        preferred_encodings: Preferred-encoding list passed to the detector.
        min_confidence: Minimum confidence passed to the detector.

    Returns:
        The detected encoding name.

    Raises:
        EncodingDetectionError: Detection failed.
    """
    options = {
        'preferred_encodings': preferred_encodings,
        'min_confidence': min_confidence,
    }
    return SmartEncodingDetector(**options).detect(input_data, sample_size)
84
398
 
85
399
 
86
- def validate_encoding(data: bytes, encoding: str) -> bool:
87
- """验证编码是否有效"""
88
- try:
89
- data.decode(encoding, errors='strict')
90
- return True
91
- except:
92
- return False
400
def detect_and_decode(
    input_data: Union[bytes, BinaryIO],
    sample_size: Optional[int] = None,
    preferred_encodings: Optional[List[str]] = None
) -> str:
    """Detect the encoding of *input_data* and decode it (convenience wrapper).

    Builds a fresh ``SmartEncodingDetector`` for the call and delegates to
    its ``detect_and_decode`` method.

    Args:
        input_data: Bytes or binary file-like object to decode.
        sample_size: Number of bytes to sample; the detector default is
            used when omitted.
        preferred_encodings: Preferred-encoding list passed to the detector.

    Returns:
        The decoded text.

    Raises:
        EncodingDetectionError: Detection failed.
        UnicodeDecodeError: Decoding failed.
    """
    one_shot = SmartEncodingDetector(preferred_encodings=preferred_encodings)
    decoded_text = one_shot.detect_and_decode(input_data, sample_size)
    return decoded_text