re-common 10.0.4__py3-none-any.whl → 10.0.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,9 +3,12 @@ import re
3
3
  import string
4
4
 
5
5
  import regex
6
- import unicodedata
6
+ from jellyfish import damerau_levenshtein_distance
7
7
  from rapidfuzz._utils import setupPandas, is_none
8
8
  from rapidfuzz.distance import Jaro
9
+ from unidecode import unidecode
10
+
11
+ from re_common.v2.baselibrary.utils.stringutils import get_diacritic_variant
9
12
 
10
13
  """
11
14
  作者比率分布 大部分在 1和 2
@@ -23,17 +26,6 @@ additional_chars = '‑–‐’·.—'
23
26
  extended_punctuation = string.punctuation + additional_chars
24
27
 
25
28
 
26
def get_diacritic_variant(char1):
    """Return ``char1`` with its combining diacritical marks removed.

    The text is decomposed with Unicode NFD normalisation and every code
    point whose general category is ``Mn`` (non-spacing mark) is dropped.
    Despite the parameter name, the whole string is processed, not just a
    single character.

    Args:
        char1 (str): Text that may contain accented characters.

    Returns:
        str: The decomposed text without non-spacing marks.
    """
    # NFD splits e.g. "ã" into "a" followed by a combining tilde.
    decomposed = unicodedata.normalize('NFD', char1)

    # Keep everything except the combining marks themselves.
    return ''.join(
        filter(lambda code_point: unicodedata.category(code_point) != 'Mn', decomposed)
    )
35
-
36
-
37
29
  def detect_other_languages(text):
38
30
  # 匹配所有非中文、非英文、非数字字符
39
31
  pattern = r'[^\u4E00-\u9FFFa-zA-Z0-9\s.,!?;:\'\"()‑\-–—‐’·˜.]'
@@ -127,6 +119,49 @@ def custom_rstrip(s):
127
119
  return s
128
120
 
129
121
 
122
# Split a run of Chinese pinyin into syllables, e.g. "Xiaohong" -> ['Xiao', 'hong']
def chinese_pinyin_split_by_rules(input_str):
    """Split concatenated pinyin into syllables with a greedy initial+final scan.

    Each syllable is an optional initial (longest match first, at most two
    letters) followed by a mandatory final (longest match first, at most
    four letters). Matching is case-insensitive; the returned pieces keep
    the casing of the input.

    The scan is greedy and never backtracks, and the finals table is only a
    partial list. For example it has no ``ue`` entry, so "Xue" comes back
    as ``['Xu', 'e']``.

    Args:
        input_str (str): Text to split, e.g. ``"Xiaohong"``.

    Returns:
        list: The syllables in order, or ``[]`` when the input is empty or
        some part of it cannot be matched.
    """
    # Initials, including the two-letter ones.
    initials = {
        'b', 'p', 'm', 'f', 'd', 't', 'n', 'l', 'g', 'k', 'h',
        'j', 'q', 'x', 'zh', 'ch', 'sh', 'r', 'z', 'c', 's', 'y', 'w'
    }
    # Finals (a partial list).
    finals = {
        'a', 'o', 'e', 'ai', 'ei', 'ao', 'ou', 'an', 'en', 'ang', 'eng', 'ong',
        'i', 'ia', 'ie', 'iao', 'iu', 'ian', 'in', 'iang', 'ing', 'iong',
        'u', 'ua', 'uo', 'uai', 'ui', 'uan', 'un', 'uang', 'ueng',
        'v', 've', 'van', 'vn'
    }

    syllables = []
    rest = input_str
    while rest:
        # Longest initial first ('zh' before 'z'); an empty initial is allowed.
        head = next(
            (rest[:size] for size in (2, 1) if rest[:size].lower() in initials),
            "",
        )
        tail = rest[len(head):]

        # Longest final first ('iong' before 'i'); a final is mandatory.
        body = next(
            (
                tail[:size]
                for size in range(min(4, len(tail)), 0, -1)
                if tail[:size].lower() in finals
            ),
            "",
        )
        if not body:
            # Nothing matches at this position: the whole split is abandoned.
            return []

        syllables.append(head + body)
        rest = tail[len(body):]
    return syllables
164
+
130
165
 
131
166
  def AuthorRatio(
132
167
  s1,
@@ -151,8 +186,8 @@ def AuthorRatio(
151
186
  if not s1 or not s2:
152
187
  return 0
153
188
  # 处理音标问题
154
- s1 = get_diacritic_variant(s1)
155
- s2 = get_diacritic_variant(s2)
189
+ s1 = get_diacritic_variant(unidecode(s1))
190
+ s2 = get_diacritic_variant(unidecode(s2))
156
191
  # 这里提出来是为了少计算 但后期需要平衡内存和算力
157
192
  # 移除指定符号 这里做了小写化处理
158
193
  s1_punc = remove_punctuation(s1)
@@ -218,6 +253,10 @@ def AuthorRatio(
218
253
  # 如果两个字符忽略大小写后相同,继续比较下一个字符
219
254
  if i1.lower() == i2.lower():
220
255
  continue
256
+ # 在作者中 有可能错误字母 当单词大于3 且只有一个字母错误或者位置交换时 可以认为这两个单词相同
257
+ # 样例 "De Gusmio, Ana Paula Henriques","De Gusmão, Ana Paula Henriques"
258
+ if len(i1) > 3 and damerau_levenshtein_distance(i1, i2) <= 1:
259
+ continue
221
260
 
222
261
  # 如果其中一个字符的长度为1(即是单个字母),检查它们的首字母是否匹配
223
262
  if len(i1) == 1 or len(i2) == 1:
@@ -0,0 +1,244 @@
1
class Node:
    """One node of an n-ary expression tree.

    ``value`` holds either an operator name or the text of a condition;
    ``children`` holds the child nodes in order (empty for a leaf).
    """

    def __init__(self, value, children=None):
        # A fresh list per instance when none is given, so nodes never
        # share one mutable default.
        if children is None:
            children = []
        self.value = value
        self.children = children

    def __repr__(self):
        return "Node(value={}, children={})".format(self.value, self.children)
8
+
9
+
10
def tokenize(expression):
    """Split a boolean expression into tokens.

    The delimiters are the keywords ``and``, ``or`` and ``not`` (matched
    case-insensitively and always emitted in lower case) and the two
    parenthesis characters. Everything between delimiters is kept as one
    condition token with surrounding whitespace stripped.

    A keyword only counts as an operator when it stands alone: each side
    must be whitespace, a parenthesis, or the start/end of the string.
    This keeps words such as ``band`` or ``notes`` intact while still
    recognising ``(not A=1)`` and ``not(A=1)``.

    Args:
        expression (str): Input such as ``'A=1 and (not B=2 or C=3)'``.

    Returns:
        list: Tokens in order, e.g.
        ``['A=1', 'and', '(', 'not', 'B=2', 'or', 'C=3', ')']``.
    """
    operators = ("and", "or", "not")
    length = len(expression)

    def is_boundary(index):
        """Return True if ``index`` is off either end, whitespace or a parenthesis."""
        if index < 0 or index >= length:
            return True
        neighbour = expression[index]
        return neighbour.isspace() or neighbour in "()"

    def match_operator(index):
        """Return the operator that starts at ``index`` as a whole word, else None."""
        for operator in operators:
            end = index + len(operator)
            if (
                expression[index:end].lower() == operator
                and is_boundary(index - 1)
                and is_boundary(end)
            ):
                return operator
        return None

    tokens = []
    pending_start = 0  # where the condition text currently being collected begins
    i = 0
    while i < length:
        char = expression[i]
        is_paren = char in "()"
        operator = None if is_paren else match_operator(i)

        if not is_paren and operator is None:
            # Ordinary character (spaces included): part of the condition text.
            i += 1
            continue

        # A delimiter ends whatever condition text was being collected.
        pending = expression[pending_start:i].strip()
        if pending:
            tokens.append(pending)

        if is_paren:
            tokens.append(char)
            i += 1
        else:
            tokens.append(operator)
            i += len(operator)
        pending_start = i

    # Trailing condition text after the last delimiter, if any.
    tail = expression[pending_start:].strip()
    if tail:
        tokens.append(tail)

    return tokens
99
+
100
+
101
+
102
+
103
def parse_expression(tokens):
    """Build a syntax tree from a token list by recursive descent.

    Precedence from lowest to highest is ``or``, ``and``, ``not``, then a
    parenthesised group or a single condition. ``and``/``or`` chains are
    built left-associatively as binary nodes, and ``not`` may be repeated
    (``not not A``).

    Args:
        tokens (list): Tokens where ``'and'``, ``'or'``, ``'not'``, ``'('``
            and ``')'`` are structural and any other string is a condition.

    Returns:
        Node: Root of the parsed tree.

    Raises:
        ValueError: If the expression is empty or incomplete, a parenthesis
            is unbalanced, an operator or ``')'`` appears where a condition
            is required, or tokens are left over after a complete
            expression.
    """
    total = len(tokens)
    # Tokens that can never serve as a condition leaf.
    reserved = ('and', 'or', 'not', ')')

    def parse_binary(operator, parse_operand, pos):
        """Parse ``operand (operator operand)*`` starting at ``pos``."""
        left, pos = parse_operand(pos)
        while pos < total and tokens[pos] == operator:
            pos += 1
            if pos >= total:
                raise ValueError(f"Incomplete expression after '{operator}'")
            right, pos = parse_operand(pos)
            left = Node(operator, [left, right])
        return left, pos

    def parse_or(pos):
        """OR level (lowest precedence)."""
        return parse_binary('or', parse_and, pos)

    def parse_and(pos):
        """AND level."""
        return parse_binary('and', parse_not, pos)

    def parse_not(pos):
        """NOT level (highest operator precedence)."""
        if pos < total and tokens[pos] == 'not':
            pos += 1
            if pos >= total:
                raise ValueError("Incomplete expression after 'not'")
            # Recurse so that chained negations are accepted.
            child, pos = parse_not(pos)
            return Node('not', [child]), pos
        return parse_primary(pos)

    def parse_primary(pos):
        """A parenthesised sub-expression or a single condition."""
        if pos >= total:
            raise ValueError("Unexpected end of expression")

        token = tokens[pos]
        if token == '(':
            subtree, pos = parse_or(pos + 1)
            if pos >= total or tokens[pos] != ')':
                raise ValueError("Missing closing parenthesis")
            return subtree, pos + 1

        if token in reserved:
            # An operator or ')' here means an operand is missing; it must
            # not be turned into a condition leaf.
            raise ValueError(f"Unexpected token {token!r} where a condition was expected")

        # Anything else is taken verbatim as a condition (e.g. "A=1").
        return Node(token), pos + 1

    tree, pos = parse_or(0)
    if pos < total:
        raise ValueError(f"Extra tokens after expression: {tokens[pos:]}")
    return tree
158
+
159
+
160
def flatten_tree(node):
    """Return a copy of the tree with nested same-operator and/or nodes merged.

    An ``and`` directly under an ``and`` (or an ``or`` under an ``or``) is
    replaced by its own children, so left-nested binary chains become a
    single n-ary node. All other nodes keep their shape. New ``Node``
    objects are built throughout; the input tree is not modified.

    A child whose value equals the parent operator but which has no
    children contributes nothing to the merged result.

    Args:
        node (Node): Root of the tree to flatten.

    Returns:
        Node: Root of the flattened copy.
    """
    # Leaves (conditions) are simply copied.
    if not node.children:
        return Node(value=node.value, children=[])

    # Flatten bottom-up so each child is already in its final form.
    rebuilt = [flatten_tree(child) for child in node.children]

    # Only 'and' / 'or' are merged; anything else (e.g. 'not') keeps its structure.
    if node.value not in ('and', 'or'):
        return Node(value=node.value, children=rebuilt)

    merged = []
    for child in rebuilt:
        if child.value == node.value:
            # Same operator as the parent: hoist the grandchildren.
            merged += child.children
        else:
            merged += [child]
    return Node(value=node.value, children=merged)
189
+
190
+
191
def pretty_print_tree(node, indent=0, prefix=""):
    """Render a syntax tree as an indented, multi-line string.

    A leaf is rendered on one line. A node with children is rendered as an
    opening line, one block per child indented four further columns, and a
    closing ``])`` line at the node's own indentation.

    Args:
        node (Node): Node to render; only ``value`` and ``children`` are read.
        indent (int): Number of leading spaces for this node's lines.
        prefix (str): Text inserted between the indentation and ``Node(``.

    Returns:
        str: The formatted tree.
    """
    pad = " " * indent
    opening = f"{pad}{prefix}Node(value='{node.value}', children=["

    # Leaf: close the brackets on the same line.
    if not node.children:
        return opening + "])"

    # NOTE(review): as seen here, every child gets the same single-space
    # prefix whether or not it is the last one - confirm that no distinct
    # last-child marker was intended.
    child_blocks = [
        pretty_print_tree(child, indent + 4, " ") for child in node.children
    ]

    return "\n".join([opening, *child_blocks, f"{pad}])"])
223
+
224
# Demo inputs: well-formed expressions followed by two malformed ones.
expressions = [
    "not A=1 and B= 2",
    "A=1 and (not B=2 or (C=3 or D=4))",
    "A=1 and not (B=2 or C=3 and D=4 or E=5)",
    "(A=1 and not (B=2 or C=3 or D=4))",
    "A=1 and",  # incomplete expression
    "and A=1",  # incomplete expression
]


if __name__ == "__main__":
    # Run the demo only when executed as a script, so that importing this
    # module from library code neither prints nor parses anything.
    for expr in expressions:
        try:
            print(f"\nExpression: {expr}")
            tokens = tokenize(expr)
            print("Tokens:", tokens)
            tree = flatten_tree(parse_expression(tokens))
            print("Tree:", pretty_print_tree(tree))
        except ValueError as e:
            # Malformed expressions are reported and the demo carries on.
            print(f"Error: {e}")
@@ -67,7 +67,6 @@ class StringClear(object):
67
67
  return self
68
68
 
69
69
  def replace_dash_with_space(self):
70
- # 横线换成空格 比 去除符号有时更有用
71
70
  self.obj_str = self.obj_str.replace("-", " ")
72
71
  return self
73
72
 
@@ -106,7 +105,6 @@ def rel_clear(str_obj):
106
105
  .to_str() # 防止其他类型传入 比如 int double
107
106
  .qj_to_bj() # 全角转半角
108
107
  .remove_html_tag() # html标签清理
109
- .replace_dash_with_space() # 横线转空格 在 英文 title 中更有用
110
108
  .remove_special_chars() # 移除特殊字符,仅保留字母、数字、空格和汉字 \w 已经包括所有 Unicode 字母 下划线 _ 会被保留
111
109
  .collapse_spaces() # 移除多余空格,连续多个空格变一个
112
110
  .lower() # 小写
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: re_common
3
- Version: 10.0.4
3
+ Version: 10.0.6
4
4
  Summary: a library about all python projects
5
5
  Home-page: https://gitee.com/xujiangios/re-common
6
6
  Author: vic
@@ -173,12 +173,13 @@ re_common/v2/baselibrary/tools/text_matcher.py,sha256=F4WtLO-b7H6V9TIvOntCD9ZXSQ
173
173
  re_common/v2/baselibrary/tools/unionfind_tools.py,sha256=VYHZZPXwBYljsm7TjV1B6iCgDn3O3btzNf9hMvQySVU,2965
174
174
  re_common/v2/baselibrary/utils/BusinessStringUtil.py,sha256=tzjVr_-6iPAKTt14hR-BhRshdRgeT_MPJpUQkxcTXns,4084
175
175
  re_common/v2/baselibrary/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
176
- re_common/v2/baselibrary/utils/author_smi.py,sha256=Mjl0GYH9e0TP48yxnxC7qgMP2bZW04pa8TQezpKo9L0,11796
176
+ re_common/v2/baselibrary/utils/author_smi.py,sha256=1ebH3AHv19jtJWdlqNdwu6t58HNVLCotuCB6ff1SWiw,13666
177
177
  re_common/v2/baselibrary/utils/basedict.py,sha256=tSV85pARe8ZQDY77_h_heS81EWwcgJW076DcA9WQyjY,1161
178
178
  re_common/v2/baselibrary/utils/basehdfs.py,sha256=NVV5Q0OMPlM_zTrs9ZDoPJv29GQv5wi9-AP1us5dBrQ,4651
179
179
  re_common/v2/baselibrary/utils/json_cls.py,sha256=dHOkWafG9lbQDoub9cbDwT2fDjMKtblQnjFLeA4hECA,286
180
+ re_common/v2/baselibrary/utils/n_ary_expression_tree.py,sha256=-05kO6G2Rth7CEK-5lfFrthFZ1Q0-0a7cni7mWZ-2gg,9172
180
181
  re_common/v2/baselibrary/utils/string_bool.py,sha256=4VCr1g8pX5YnzZSKctQgQfmhSQ0aw7a8ruhWdiRmBFU,641
181
- re_common/v2/baselibrary/utils/string_clear.py,sha256=6mkBAZUNh5-JTPmB9lj_i4eLT9C6ZW1nH4tZiGveIE4,3778
182
+ re_common/v2/baselibrary/utils/string_clear.py,sha256=sKKXEqCtItbJxsjgrBXBeubXaiAYuoc0301EOVFzXbk,3627
182
183
  re_common/v2/baselibrary/utils/stringutils.py,sha256=GLXHAm8IulC_8hWrN2aiFQjsoOpjczvcVozmTJj86-A,3864
183
184
  re_common/vip/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
184
185
  re_common/vip/base_step_process.py,sha256=VXXiNj0I5CpzXIMCgOPU86bzDJkSBkUS-9CpZIl_GOk,205
@@ -206,8 +207,8 @@ re_common/vip/title/transform/TransformRegulationTitleToZt.py,sha256=LKRdIsWKues
206
207
  re_common/vip/title/transform/TransformStandardTitleToZt.py,sha256=-fCKAbSBzXVyQDCE61CalvR9E_QzQMA08QOO_NePFNI,5563
207
208
  re_common/vip/title/transform/TransformThesisTitleToZt.py,sha256=QS-uV0cQrpUFAcKucuJQ9Ue2VRQH-inmfn_X3IplfRo,5488
208
209
  re_common/vip/title/transform/__init__.py,sha256=m83-CWyRq_VHPYHaALEQlmXrkTdrZ3e4B_kCfBYE-uc,239
209
- re_common-10.0.4.dist-info/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
210
- re_common-10.0.4.dist-info/METADATA,sha256=SNPQXc5koTrhSxu9yAPRPN42uItn6onNvmG7GHTMdcE,581
211
- re_common-10.0.4.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
212
- re_common-10.0.4.dist-info/top_level.txt,sha256=_H9H23zoLIalm1AIY_KYTVh_H0ZnmjxQIxsvXtLv45o,10
213
- re_common-10.0.4.dist-info/RECORD,,
210
+ re_common-10.0.6.dist-info/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
211
+ re_common-10.0.6.dist-info/METADATA,sha256=4gcNYlu46W2s5D1IRTzBtM_Sp3DtETT51Xxv-RkX7Ns,581
212
+ re_common-10.0.6.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
213
+ re_common-10.0.6.dist-info/top_level.txt,sha256=_H9H23zoLIalm1AIY_KYTVh_H0ZnmjxQIxsvXtLv45o,10
214
+ re_common-10.0.6.dist-info/RECORD,,