re-common 10.0.8__py3-none-any.whl → 10.0.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,6 +2,7 @@ class DotDict(dict):
2
2
  """
3
3
  让字典成为对象 既可以用字典方式访问 也可以用点访问key
4
4
  """
5
+
5
6
  def __init__(self, *args, **kwargs):
6
7
  super().__init__(*args, **kwargs)
7
8
  # 递归地将嵌套字典转换为 DotDict
@@ -21,4 +22,16 @@ class DotDict(dict):
21
22
def __setattr__(self, key, value):
    """
    Store an attribute assignment as a dictionary item.

    A value that is a dict is wrapped in DotDict before being stored; any
    other value is stored unchanged.

    :param key: attribute name, used as the dictionary key
    :param value: value to store under ``key``
    """
    self[key] = DotDict(value) if isinstance(value, dict) else value
26
+
27
def to_dict(self):
    """
    Build a plain ``dict`` from this DotDict instance.

    Values that are DotDict instances are converted recursively through
    their own ``to_dict``; every other value is placed in the result
    unchanged.

    :return: a new plain dictionary with the same keys
    """
    return {
        name: item.to_dict() if isinstance(item, DotDict) else item
        for name, item in self.items()
    }
@@ -1,7 +1,9 @@
1
1
  from typing import List
2
2
 
3
3
  import jieba
4
- from datasketch import MinHash
4
+ from datasketch import MinHash, minhash
5
+
6
+ from re_common.v2.baselibrary.utils.string_bool import is_single_cjk_char
5
7
 
6
8
 
7
9
  def tokenize(text: str, stopwords=None) -> List[str]:
@@ -11,8 +13,16 @@ def tokenize(text: str, stopwords=None) -> List[str]:
11
13
  if stopwords is None:
12
14
  stopwords = []
13
15
  words = jieba.lcut(text)
16
+
14
17
  # 统计单字符数据 长度,防止结巴分词分不了的单词 将数据分为单个字符
15
- one_char_size = len([i for i in words if len(i) == 1])
18
+
19
+ # 这里为什么使用函数 而不是在推导式中兼容,主要是在一些 spark中 推导式的if 条件不遵循最短路径原则会将表达式当做一个整体算子
20
+ def is_singel_en(i):
21
+ if len(i) == 1 and not is_single_cjk_char(i):
22
+ return True
23
+ return False
24
+
25
+ one_char_size = len([i for i in words if is_singel_en(i)])
16
26
  all_size = len(words)
17
27
  # 如果单字符个数超过一定比例 就直接用空格分词
18
28
  if all_size != 0 and one_char_size / all_size > 0.6:
@@ -31,3 +41,13 @@ def create_minhash(words: List[str], num_perm=128) -> MinHash:
31
41
  for word in words:
32
42
  minhash.update(word.encode("utf-8"))
33
43
  return minhash
44
+
45
+
46
def get_str_minhash(title):
    """
    Build a MinHash for a title string.

    The title is first passed through ``rel_clear``. If the cleaned text is
    empty, an empty string is returned; otherwise the cleaned text is
    tokenized and the tokens are fed to ``create_minhash``.

    :param title: raw title text
    :return: the object returned by ``create_minhash``, or ``""`` when the
        cleaned title is empty
    """
    # Function-scope import, kept where the original code placed it.
    from re_common.v2.baselibrary.utils.string_clear import rel_clear

    cleaned_title = rel_clear(title)
    if cleaned_title:
        return create_minhash(tokenize(cleaned_title))
    return ""
@@ -10,17 +10,81 @@ def contains_chinese_chars(s):
10
10
 
11
11
 
12
12
def is_empty(value):
    """
    Return True when ``value`` should be treated as empty.

    A value is empty when it is:
      - ``None``;
      - a missing-value scalar: a float NaN, or anything for which
        ``pandas.isna`` gives a scalar True when pandas is installed;
      - a string that is blank once surrounding whitespace is stripped;
      - any object exposing ``__len__`` (list, dict, tuple, ...) of length 0.

    Everything else is reported as non-empty.

    :param value: the object to inspect
    :return: True if the value is empty, otherwise False
    """
    if value is None:
        return True

    # A float NaN is "missing" regardless of whether pandas is available,
    # so the answer does not depend on the installed packages.
    if isinstance(value, float) and value != value:
        return True

    try:
        import pandas as pd
    except ImportError:
        pd = None  # pandas is optional; skip the missing-value check

    if pd is not None:
        missing = pd.isna(value)
        # For list input pd.isna answers element-wise with an array, and
        # truth-testing an array of more than one element raises ValueError.
        # A one-element answer would also mislabel [None] as empty. Only a
        # 0-dimensional (scalar) answer says whether the value itself is missing.
        if getattr(missing, "ndim", 0) == 0 and bool(missing):
            return True

    # Strings: blank after stripping whitespace counts as empty.
    if isinstance(value, str):
        return value.strip() == ""

    # Other sized containers (list, dict, tuple, ...): empty when length is 0.
    if hasattr(value, "__len__"):
        return len(value) == 0

    # Anything else that is not None and not sized is non-empty.
    return False
45
+
46
+
47
class InvalidCharLengthError(ValueError):
    """
    Raised when a single-character check receives input whose length is not 1.

    Derives from ``ValueError`` because the argument has an unacceptable
    value. It is still an ``Exception``, so existing
    ``except InvalidCharLengthError`` and ``except Exception`` handlers keep
    working.
    """
50
+
51
+
52
# Inclusive (start, end) code-point ranges treated as Chinese/Japanese/Korean.
# Built once at import time instead of on every call.
_CJK_CODE_POINT_RANGES = (
    (0x4E00, 0x9FFF),    # CJK Unified Ideographs
    (0x3400, 0x4DBF),    # CJK Unified Ideographs Extension A
    (0x20000, 0x2A6DF),  # CJK Unified Ideographs Extension B
    (0x2A700, 0x2B73F),  # CJK Unified Ideographs Extension C
    (0x2B740, 0x2B81F),  # CJK Unified Ideographs Extension D
    (0x2B820, 0x2CEAF),  # CJK Unified Ideographs Extension E
    (0x2CEB0, 0x2EBEF),  # CJK Unified Ideographs Extension F
    (0x30000, 0x3134F),  # CJK Unified Ideographs Extension G
    (0x31350, 0x323AF),  # CJK Unified Ideographs Extension H
    (0x3300, 0x33FF),    # CJK Compatibility
    (0xFE30, 0xFE4F),    # CJK Compatibility Forms
    (0xF900, 0xFAFF),    # CJK Compatibility Ideographs
    (0x2F800, 0x2FA1F),  # CJK Compatibility Ideographs Supplement
    # Bopomofo letters run through U+312F; stopping at U+3129 dropped the
    # last six letters of the block.
    (0x3105, 0x312F),    # Bopomofo
    (0x31A0, 0x31BF),    # Bopomofo Extended
    (0x3040, 0x309F),    # Hiragana
    (0x30A0, 0x30FF),    # Katakana
    (0x31F0, 0x31FF),    # Katakana Phonetic Extensions
    (0xAC00, 0xD7AF),    # Hangul Syllables
    (0x1100, 0x11FF),    # Hangul Jamo
    (0xA960, 0xA97F),    # Hangul Jamo Extended-A
    (0xD7B0, 0xD7FF),    # Hangul Jamo Extended-B
)


def is_single_cjk_char(char):
    """
    Tell whether a single character is a Chinese, Japanese or Korean character.

    :param char: a string of exactly one character
    :return: True if the character's code point lies in one of the CJK
        ranges listed in ``_CJK_CODE_POINT_RANGES``, otherwise False
    :raises InvalidCharLengthError: if ``char`` does not have length 1
    """
    # Reject anything that is not exactly one character before calling ord().
    if len(char) != 1:
        raise InvalidCharLengthError("输入的字符串长度必须为 1,请提供单个字符进行判断。")
    code_point = ord(char)
    return any(start <= code_point <= end for start, end in _CJK_CODE_POINT_RANGES)
@@ -165,10 +165,11 @@ def ref_clear(str_obj):
165
165
def clear_obj(str_obj):
    """
    Clean a string with the cleanup customized for object-style records.

    The input is passed through ``clear_au_organ``, then through a
    ``StringClear`` pipeline (``remove_diacritics`` followed by ``upper``),
    and the resulting string is stripped of surrounding whitespace.

    :param str_obj: the string to clean
    :return: the cleaned string
    """
    str_obj = clear_au_organ(str_obj)
    # No explicit "ß" -> "SS" replacement here: the original author notes
    # that the upper-case form of "ß" is already "SS".
    cleaner = StringClear(str_obj)
    cleaner = cleaner.remove_diacritics()  # remove diacritical marks
    cleaner = cleaner.upper()
    text = cleaner.get_str()  # extract the plain str
    return text.strip()  # trim surrounding whitespace
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: re_common
3
- Version: 10.0.8
3
+ Version: 10.0.10
4
4
  Summary: a library about all python projects
5
5
  Home-page: https://gitee.com/xujiangios/re-common
6
6
  Author: vic
@@ -168,10 +168,10 @@ re_common/v2/baselibrary/s3object/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeR
168
168
  re_common/v2/baselibrary/s3object/baseboto3.py,sha256=mXuIFx99pnrPGQ4LJCZwlN1HLbaU-OWLwck0cVzW6hc,11203
169
169
  re_common/v2/baselibrary/tools/WeChatRobot.py,sha256=EaQgNncROAhU5-psYRGWAshIV5aEw-p2u1kYLpvr7RA,2796
170
170
  re_common/v2/baselibrary/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
171
- re_common/v2/baselibrary/tools/dict_tools.py,sha256=HW-YZOUhv5GMzFsF-ArLfDoszui1K3_M7IiRIe4VEXA,909
171
+ re_common/v2/baselibrary/tools/dict_tools.py,sha256=BTh7oJuJ619IZgxiYlim0ltrXBclDtb7WzyFGr7wVf0,1246
172
172
  re_common/v2/baselibrary/tools/dolphinscheduler.py,sha256=1m7UGYDiuvJUCI6ik6CGM2fO8U5XteJzn55VRbwB9ts,7978
173
173
  re_common/v2/baselibrary/tools/list_tools.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
174
- re_common/v2/baselibrary/tools/search_hash_tools.py,sha256=o-PNjmcYDkfyiR75Jci_9sSn4cGi_F9jPCIrwYdnb1U,1013
174
+ re_common/v2/baselibrary/tools/search_hash_tools.py,sha256=d_h9j7VxiXpcn1GHZ7L2tpx9_LDQshcl58tlKvSxZPg,1691
175
175
  re_common/v2/baselibrary/tools/text_matcher.py,sha256=F4WtLO-b7H6V9TIvOntCD9ZXSQP_KijPuLLYcLPtrKQ,7021
176
176
  re_common/v2/baselibrary/tools/unionfind_tools.py,sha256=VYHZZPXwBYljsm7TjV1B6iCgDn3O3btzNf9hMvQySVU,2965
177
177
  re_common/v2/baselibrary/utils/BusinessStringUtil.py,sha256=tzjVr_-6iPAKTt14hR-BhRshdRgeT_MPJpUQkxcTXns,4084
@@ -181,8 +181,8 @@ re_common/v2/baselibrary/utils/basedict.py,sha256=tSV85pARe8ZQDY77_h_heS81EWwcgJ
181
181
  re_common/v2/baselibrary/utils/basehdfs.py,sha256=NVV5Q0OMPlM_zTrs9ZDoPJv29GQv5wi9-AP1us5dBrQ,4651
182
182
  re_common/v2/baselibrary/utils/json_cls.py,sha256=dHOkWafG9lbQDoub9cbDwT2fDjMKtblQnjFLeA4hECA,286
183
183
  re_common/v2/baselibrary/utils/n_ary_expression_tree.py,sha256=-05kO6G2Rth7CEK-5lfFrthFZ1Q0-0a7cni7mWZ-2gg,9172
184
- re_common/v2/baselibrary/utils/string_bool.py,sha256=4VCr1g8pX5YnzZSKctQgQfmhSQ0aw7a8ruhWdiRmBFU,641
185
- re_common/v2/baselibrary/utils/string_clear.py,sha256=g_2s2C4yY0C5AvuANjn02g7e_VM_uNY1lxoQg5HtLrk,5799
184
+ re_common/v2/baselibrary/utils/string_bool.py,sha256=EJnkSck4ofcIeJ6nLzAOVtlt6o1WBgvgVwIqJKj5Suc,2993
185
+ re_common/v2/baselibrary/utils/string_clear.py,sha256=LDIf-3Czq1sXp-54aifXdXbdGUX7hpFBKqQa5Azj_lo,5861
186
186
  re_common/v2/baselibrary/utils/stringutils.py,sha256=GLXHAm8IulC_8hWrN2aiFQjsoOpjczvcVozmTJj86-A,3864
187
187
  re_common/vip/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
188
188
  re_common/vip/base_step_process.py,sha256=VXXiNj0I5CpzXIMCgOPU86bzDJkSBkUS-9CpZIl_GOk,205
@@ -210,8 +210,8 @@ re_common/vip/title/transform/TransformRegulationTitleToZt.py,sha256=LKRdIsWKues
210
210
  re_common/vip/title/transform/TransformStandardTitleToZt.py,sha256=-fCKAbSBzXVyQDCE61CalvR9E_QzQMA08QOO_NePFNI,5563
211
211
  re_common/vip/title/transform/TransformThesisTitleToZt.py,sha256=QS-uV0cQrpUFAcKucuJQ9Ue2VRQH-inmfn_X3IplfRo,5488
212
212
  re_common/vip/title/transform/__init__.py,sha256=m83-CWyRq_VHPYHaALEQlmXrkTdrZ3e4B_kCfBYE-uc,239
213
- re_common-10.0.8.dist-info/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
214
- re_common-10.0.8.dist-info/METADATA,sha256=687IQ2myx3vDwQca9JMzKTf8KHCR6qSst65ykG1VZ9Y,581
215
- re_common-10.0.8.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
216
- re_common-10.0.8.dist-info/top_level.txt,sha256=_H9H23zoLIalm1AIY_KYTVh_H0ZnmjxQIxsvXtLv45o,10
217
- re_common-10.0.8.dist-info/RECORD,,
213
+ re_common-10.0.10.dist-info/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
214
+ re_common-10.0.10.dist-info/METADATA,sha256=mOarqqiMSzMjAcu1sV0OxUGdwfANLray_3ZpjkAPxFg,582
215
+ re_common-10.0.10.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
216
+ re_common-10.0.10.dist-info/top_level.txt,sha256=_H9H23zoLIalm1AIY_KYTVh_H0ZnmjxQIxsvXtLv45o,10
217
+ re_common-10.0.10.dist-info/RECORD,,