re-common 10.0.23__py3-none-any.whl → 10.0.25__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,196 @@
1
+ # 某些业务中的字符串处理 算是特定场景的工具 不算通用工具
2
+ import re
3
+
4
+ from re_common.v2.baselibrary.utils.string_bool import is_all_symbols
5
+
6
+
7
def clean_organ_postcode(organ):
    """
    Normalize an organization string.

    Removes bracketed text ("[...]" and "(...)", brackets included), deletes
    standalone 6-digit numbers (postal codes), then trims whitespace and
    punctuation from both ends of every ";"-separated part.

    Note: this function replaces the Java ``formatOrgan`` helper.

    Args:
        organ (str): Organization string; may contain brackets, semicolons
            and postal codes. Any falsy value is treated as "".

    Returns:
        str: The non-empty cleaned parts joined with ";" ("" if none remain).
    """
    # Treat None / "" uniformly so the regex calls below never fail.
    if not organ:
        organ = ""

    # Remove square- and round-bracketed content (non-greedy match).
    organ = re.sub(r"\[.*?\]", "", organ)
    organ = re.sub(r"\(.*?\)", "", organ)

    # \b is a word boundary, so only 6-digit runs that are not glued to
    # other word characters are removed.
    organ = re.sub(r"\b[0-9]{6}\b", "", organ)

    # Whitespace belongs to the edge class on purpose: trimming whitespace
    # and punctuation in one pass avoids leaving a dangling space behind,
    # e.g. "abc ," must become "abc", not "abc ".
    edge_junk = r"^[\s!,.?;:#$%^&*+-]+|[\s!,.?;:#$%^&*+-]+$"

    format_organ = []
    for temp_organ in organ.split(";"):
        cleaned_part = re.sub(edge_junk, "", temp_organ)
        if cleaned_part:  # skip parts that became empty after cleaning
            format_organ.append(cleaned_part)

    return ";".join(format_organ)
52
+
53
+
54
def get_first_organ(organ):
    """
    Return the first ";"-separated organization that is still non-blank
    after being passed through ``clean_organ_postcode``.

    Returns "" when ``organ`` is falsy or when no part survives cleaning.
    """
    if organ:
        for raw_part in organ.strip().split(";"):
            # Clean each candidate (postal codes, brackets, punctuation).
            candidate = clean_organ_postcode(raw_part)
            if candidate.strip():
                return candidate
    return ""
65
+
66
+
67
def get_first_author(author: str) -> str:
    """
    Return the first non-blank author of a ";"-separated author string.

    Bracketed annotations ("[...]" and "(...)") are removed from each
    candidate before it is checked. The returned name is stripped of
    surrounding whitespace, matching what ``get_author_list`` returns for
    the same entry.

    Args:
        author: ";"-separated author names; falsy values yield "".

    Returns:
        The first cleaned, non-empty author name, or "" if there is none.
    """
    if not author:
        return ""
    for au in author.split(";"):
        au = re.sub(r"\[.*?]", "", au)
        au = re.sub(r"\(.*?\)", "", au)
        # Strip after removing annotations: "Smith J [1]" would otherwise
        # come back as "Smith J " with a trailing space.
        au = au.strip()
        if au:
            return au
    return ""
77
+
78
+
79
def get_author_list(author: str):
    """
    Split a ";"-separated author string into a list of cleaned names.

    "[...]" and "(...)" annotations are removed from every entry, the entry
    is stripped, and entries that end up empty are dropped. A falsy input
    yields an empty list.
    """
    if not author:
        return []
    cleaned_names = (
        re.sub("\\(.*?\\)", "", re.sub("\\[.*?]", "", piece)).strip()
        for piece in author.strip().split(";")
    )
    return [name for name in cleaned_names if name]
90
+
91
+
92
def get_scopus_author_abbr(author_row: str):
    """
    Extract the abbreviated author names from a "&&"-delimited row.

    The row must split into exactly three "&&" segments; the first segment
    holds the ";"-separated abbreviations. Blank entries and the placeholder
    tokens "*" / "and" (case-insensitive) are discarded.

    Returns:
        The remaining names joined with ";", or "" for a falsy row.

    Raises:
        Exception: if the row does not contain exactly three segments.
    """
    if not author_row:
        return ""
    segments = author_row.split("&&")
    if len(segments) != 3:
        raise Exception("错误的数据个数 可能来自其他数据源")

    kept = []
    for token in segments[0].strip().split(";"):
        name = token.strip()
        if name and name.lower() not in ("*", "and"):
            kept.append(name)
    return ";".join(kept)
103
+
104
+
105
def get_wos_author_abbr(author_row: str):
    """
    Extract the abbreviated author names from a "&&"-delimited row.

    The row must split into exactly four "&&" segments. The ";"-separated
    names of the first and the third segment are concatenated (first segment
    first); blank entries and the placeholder tokens "*" / "and"
    (case-insensitive) are discarded.

    Returns:
        The remaining names joined with ";", or "" for a falsy row.

    Raises:
        Exception: if the row does not contain exactly four segments.
    """
    if not author_row:
        return ""
    fields = author_row.split("&&")
    if len(fields) != 4:
        raise Exception("错误的数据个数 可能来自其他数据源")

    candidates = fields[0].strip().split(";") + fields[2].strip().split(";")
    kept = []
    for token in candidates:
        name = token.strip()
        if name and name.lower() not in ("*", "and"):
            kept.append(name)
    return ";".join(kept)
119
+
120
+
121
def deal_rel_vol(vol_str: str):
    """
    Normalize a journal volume string (used when merging journal records).

    Steps, in order:
      1. A value for which ``is_all_symbols`` is true is replaced by "".
      2. A value made of digits and dots that equals a whole number is
         returned as that integer's string ("12.0" -> "12", "007" -> "7").
      3. If the value starts with "v ", "volume ", "vol. " or "vol "
         (case-insensitive), the value is lower-cased, that marker is
         removed and the stripped remainder is returned.
      4. Otherwise the value is returned unchanged.

    Args:
        vol_str: Raw volume text.

    Returns:
        str: The normalized volume text.
    """
    # A volume consisting only of symbols is cleared.
    if is_all_symbols(vol_str):
        vol_str = ""

    if vol_str.replace(".", "").isdigit():
        try:
            float_num = float(vol_str)
            if int(float_num) == float_num:
                return str(int(float_num))
        except (ValueError, OverflowError):
            # e.g. "1.2.3" or digit-like characters float() cannot parse:
            # not a plain number, fall through to the prefix handling.
            pass

    lowered = vol_str.lower()
    # (prefix to detect, pattern to remove). The \b anchor keeps the marker
    # from being cut out of the tail of a longer word: "v 5 nov 2" must not
    # become "5 no2".
    prefix_patterns = (
        ("v ", r"\bv "),
        ("volume ", r"\bvolume "),
        ("vol. ", r"\bvol\. "),
        ("vol ", r"\bvol "),
    )
    for prefix, pattern in prefix_patterns:
        if lowered.startswith(prefix):
            return re.sub(pattern, "", lowered).strip()
    return vol_str
151
+
152
+
153
def deal_num_strs(input_str):
    """
    Rewrite every run of digits in ``input_str`` as ``str(int(run))``.

    This removes leading zeros ("007" -> "7") while leaving all non-digit
    text untouched.

    Each digit run is replaced in place, exactly once. Replacing by value
    with ``str.replace`` would also rewrite the same digits inside other
    numbers, e.g. turning "100_00" into "10_0" instead of "100_0".

    Args:
        input_str (str): Text that may contain numbers.

    Returns:
        str: The text with every number normalized.
    """
    return re.sub(r"\d+", lambda match: str(int(match.group())), input_str)
164
+
165
+
166
def deal_num(num_str):
    """
    Normalize an issue-number string so that equivalent issues group
    together, in particular values containing a dash.

    Written for the second split step of record merging; usable elsewhere
    when the same normalization is wanted.

    Steps, in order:
      1. A value for which ``is_all_symbols`` is true is replaced by "".
      2. The value is lower-cased; a leading "n " marker is removed.
      3. "special issue" style markers are removed.
      4. "-" and "." become "_", and leading/trailing "_" are dropped.
      5. Every "_"-separated part is passed through ``deal_num_strs``.

    :param num_str: raw issue text
    :return: the normalized, lower-cased and stripped issue text
    """
    # An issue consisting only of symbols is cleared.
    if is_all_symbols(num_str):
        num_str = ""

    num_str = num_str.lower()
    if num_str.startswith("n "):
        # \b keeps the marker from being cut out of the end of a longer
        # word ("n 1 jun 2" must not become "1 ju2").
        num_str = re.sub(r"\bn ", "", num_str).strip()

    # Removal order matters: within the last group the longer markers must
    # go before "spec", otherwise "special" is reduced to "ial" and the
    # longer markers can never match.
    markers = (
        "special_issue_", "_special_issue", "issue",
        "spec.", "iss.", "spl.", "special.", "specialissue.",
        "specialissue", "special", "spec", "iss", "spl",
    )
    for marker in markers:
        num_str = num_str.replace(marker, "")

    num_str = num_str.replace("-", "_").replace(".", "_").upper()
    num_str = num_str.strip("_")

    # Normalize each part separately; this also handles values with more
    # than one separator ("1-2-3") instead of failing on tuple unpacking.
    num_str = "_".join(deal_num_strs(part) for part in num_str.split("_"))

    return num_str.lower().strip()
File without changes
@@ -0,0 +1,6 @@
1
def assign_group_id(rows: list, sub_db_order: list):
    """
    Pick the group key for ``rows`` according to a sub-database priority.

    ``sub_db_order`` is scanned in order; the first sub_db_id that maps to a
    truthy ``keyid`` among the rows wins (if several rows share a
    sub_db_id, the last one's keyid is used). When nothing matches, the
    keyid of the first row is used.

    Returns:
        tuple: (chosen keyid, number of rows, the rows list itself).
    """
    keyid_by_sub_db = {}
    for row in rows:
        keyid_by_sub_db[row.sub_db_id] = row.keyid

    for candidate in sub_db_order:
        chosen = keyid_by_sub_db.get(candidate)
        if chosen:
            return chosen, len(rows), rows

    # No prioritized sub-database supplied a usable key: fall back to row 0.
    return rows[0].keyid, len(rows), rows
@@ -6,10 +6,13 @@ import json
6
6
  import traceback
7
7
 
8
8
 
9
+ # c1d3a814-1a02-4bbd-b5c2-f756fef92cb8: b层机器人消息群-非聊天 的 pythonspark
10
+ # 013547da-3d78-4a7f-b4a7-e668b192c293: b层机器人消息群-非聊天 的 数仓B层服务端部署通知
11
+
9
12
  # 发送消息到企业微信机器人
10
13
  # vx_key: string类型,自己的企业微信机器人的key
11
14
  # s:string类型,要发送的消息
12
- def send_vx(vx_key, s):
15
+ def send_vx(vx_key, s, i=0):
13
16
  vx_url = 'https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=' + vx_key
14
17
  headers = {"Content-Type": "text/plain"}
15
18
  data = {
@@ -18,10 +21,13 @@ def send_vx(vx_key, s):
18
21
  "content": s,
19
22
  }
20
23
  }
24
+ if i > 3:
25
+ raise Exception(str(traceback.format_exc()))
21
26
  try:
22
27
  requests.post(url=vx_url, headers=headers, json=data, timeout=30)
23
28
  except:
24
- send_vx(vx_key, str(traceback.format_exc()))
29
+ i = i + 1
30
+ send_vx(vx_key, str(traceback.format_exc()), i)
25
31
 
26
32
 
27
33
  # 发送文件到企业微信机器人
@@ -71,9 +77,19 @@ def file_to_vx(vx_key, data, file_name):
71
77
  else:
72
78
  if isinstance(data, dict):
73
79
  data_str = json.dumps(data, ensure_ascii=False)
80
+ elif isinstance(data, list):
81
+ data_str = ""
82
+ for i in data:
83
+ if isinstance(i, dict):
84
+ data_str = data_str + json.dumps(i, ensure_ascii=False) + "\n"
85
+ else:
86
+ data_str = data_str + str(i) + "\n"
74
87
  else:
75
88
  data_str = str(data)
89
+ print(data_str[:100])
76
90
  with open(file_path, 'w', encoding='utf-8') as f:
77
91
  f.write(data_str)
92
+ post_file(vx_key, file_path)
78
93
  except:
79
94
  send_vx(vx_key, str(traceback.format_exc()))
95
+ os.system('rm -r' + temp_dir + '')
@@ -0,0 +1,53 @@
1
+ from abc import ABC, abstractmethod
2
+ from typing import List, Generator
3
+
4
+
5
class BaseFileReader(ABC):
    """
    Abstract base class for readers that deliver file content in batches.

    Attributes set by ``__init__``:
        batch_size: batch size handed to the reader (default 10000).
        read_model: selects what ``read_select`` delegates to;
            1 -> ``read_lines``, 2 -> ``read_all``. Starts at 1.
    """

    def __init__(self, batch_size: int = 10000):
        self.batch_size = batch_size
        self.read_model = 1

    @abstractmethod
    def list_files(self, path: str) -> List[str]:
        """List all target files under ``path``."""

    @abstractmethod
    def count_lines(self, file_path: str) -> int:
        """Count the lines of the file."""

    @abstractmethod
    def read_lines(self, file_path: str) -> Generator[List[str], None, None]:
        """Read the file content, yielding it batch by batch."""

    @abstractmethod
    def read_all(self, file_path: str) -> List[List[str]]:
        """
        Read the whole file.

        Subclasses must override this method; the implementation provided
        here collects every batch produced by ``read_lines`` into a list.
        """
        return list(self.read_lines(file_path))

    def read_select(self, file_path: str) -> Generator[List[str], None, None]:
        """
        Yield batches from ``read_lines`` (read_model 1) or ``read_all``
        (read_model 2); any other read_model raises an Exception.
        """
        if self.read_model == 1:
            source = self.read_lines
        elif self.read_model == 2:
            source = self.read_all
        else:
            raise Exception("模式选择错误")
        for batch in source(file_path):
            yield batch
40
+
41
+
42
class BaseFileWriter(ABC):
    """
    Abstract base class for file writers.

    The constructor only records its arguments as attributes of the same
    name (``file_path``, ``compress``, ``overwrite``, ``encoding``); how
    they are used is up to the concrete ``write_lines`` implementation.
    """

    def __init__(self, file_path: str, compress: bool = True, overwrite: bool = True, encoding: str = "utf-8"):
        self.file_path = file_path
        self.compress = compress
        self.overwrite = overwrite
        self.encoding = encoding

    @abstractmethod
    def write_lines(self, lines: List[str], file_path: str):
        """Write several lines of text to a file, with compression support."""