xlin-0.1.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
xlin-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2024 兮尘
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
xlin-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,24 @@
+ Metadata-Version: 2.1
+ Name: xlin
+ Version: 0.1.0
+ Summary: toolbox for LinXueyuan
+ License: MIT
+ Author: XiChen
+ Author-email: 23211526+LinXueyuanStdio@users.noreply.github.com
+ Requires-Python: >=3.10,<4.0
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Requires-Dist: loguru (>=0.7.2,<0.8.0)
+ Requires-Dist: pandas (>=2.2.3,<3.0.0)
+ Requires-Dist: pyexcel (>=0.7.1,<0.8.0)
+ Requires-Dist: pyexcel-xls (>=0.7.0,<0.8.0)
+ Requires-Dist: pyexcel-xlsx (>=0.6.0,<0.7.0)
+ Requires-Dist: xlsxwriter (==3.1.2)
+ Description-Content-Type: text/markdown
+
+ # xlin
+ Personal Python utility code
+
xlin-0.1.0/README.md ADDED
@@ -0,0 +1,2 @@
+ # xlin
+ Personal Python utility code
xlin-0.1.0/pyproject.toml ADDED
@@ -0,0 +1,21 @@
+ [tool.poetry]
+ name = "xlin"
+ version = "0.1.0"
+ description = "toolbox for LinXueyuan"
+ authors = ["XiChen <23211526+LinXueyuanStdio@users.noreply.github.com>"]
+ license = "MIT"
+ readme = "README.md"
+
+ [tool.poetry.dependencies]
+ python = "^3.10"
+
+ loguru = "^0.7.2"
+ pandas = "^2.2.3"
+ pyexcel = "^0.7.1"
+ pyexcel-xls = "^0.7.0"
+ pyexcel-xlsx = "^0.6.0"
+ xlsxwriter = "3.1.2"
+
+ [build-system]
+ requires = ["poetry-core"]
+ build-backend = "poetry.core.masonry.api"
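
For context, a minimal install sketch. This assumes the package is published on a public index (e.g. PyPI) under the name `xlin` declared above; the diff itself does not name the registry.

    # assuming "xlin" resolves on your configured package index
    pip install xlin    # pulls the pinned loguru / pandas / pyexcel / xlsxwriter dependencies
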
@@ -0,0 +1,13 @@
+ def text_is_all_chinese(text: str) -> bool:
+     # True only if every character is a CJK Unified Ideograph
+     for ch in text:
+         if '\u4e00' <= ch <= '\u9fff':
+             continue
+         return False
+     return True
+
+
+ def text_contains_chinese(text: str) -> bool:
+     # True if at least one character is a CJK Unified Ideograph
+     for ch in text:
+         if '\u4e00' <= ch <= '\u9fff':
+             return True
+     return False
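
A quick usage sketch of the two helpers above. The diff does not show this file's name inside the package, so the import path below is an assumption.

    # hypothetical import path; adjust to wherever this module lives in the package
    from xlin.text import text_is_all_chinese, text_contains_chinese

    assert text_is_all_chinese("工具代码")
    assert not text_is_all_chinese("python 工具")
    assert text_contains_chinese("python 工具")
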
@@ -0,0 +1,195 @@
+ import json
+ from typing import *
+
+ from pathlib import Path
+ from loguru import logger
+ import pandas as pd
+
+
+ def dataframe_to_json_list(df: pd.DataFrame):
+     """
+     Args:
+         df (pd.DataFrame): df
+
+     Returns:
+         List[Dict[str, str]]: json list: [{"col1": "xxx", "col2": "xxx", ...}, ...]
+     """
+     json_list = []
+     for i, line in df.iterrows():
+         json_list.append(dict(line))
+     return json_list
+
+
+ def transform_dataframe_to_json_list(df: pd.DataFrame, row_transform):
+     """
+     Args:
+         df (pd.DataFrame): df
+         row_transform: lambda row: (prompt_template.format(row['query']), "", row['label'])
+
+     Returns:
+         List[Dict[str, str]]: json list: [{"instruction": "xxx", "input": "xxx", "output": "xxx"}, ...]
+     """
+     out_list = list()
+     for _, row in df.iterrows():
+         instruction, input, output = row_transform(row)
+         out_list.append({"instruction": instruction, "input": input, "output": output})
+     return out_list
+
+
+ def jsonlist_to_dataframe(json_list: List[Dict[str, str]]):
+     """
+     Args:
+         json_list (List[Dict[str, str]]): json list: [{"col1": "xxx", "col2": "xxx", ...}, ...]
+
+     Returns:
+         pd.DataFrame: df
+     """
+     return pd.DataFrame(json_list)
+
+
+ def load_json(filename: str):
+     with open(filename, "r", encoding="utf-8") as f:
+         return json.load(f)
+
+
+ def save_json(json_list: Union[Dict[str, str], List[Dict[str, str]]], filename: str):
+     Path(filename).parent.mkdir(parents=True, exist_ok=True)
+     with open(filename, "w", encoding="utf-8") as f:
+         return json.dump(json_list, f, ensure_ascii=False, separators=(",", ":"), indent=2)
+
+
+ def load_json_list(filename: str):
+     with open(filename, "r", encoding="utf-8") as f:
+         lines = f.readlines()
+     json_list = []
+     for i in lines:
+         try:
+             obj = json.loads(i.strip())
+         except json.JSONDecodeError:
+             continue
+         json_list.append(obj)
+     return json_list
+
+
+ def save_json_list(json_list: List[Dict[str, str]], filename: str):
+     Path(filename).parent.mkdir(parents=True, exist_ok=True)
+     with open(filename, "w", encoding="utf-8") as f:
+         f.write("\n".join([json.dumps(line, ensure_ascii=False, separators=(",", ":")) for line in json_list]))
+
+
+ def merge_json_list(filenames: List[str], output_filename: str):
+     json_list = []
+     for filename in filenames:
+         json_list.extend(load_json_list(filename))
+     save_json_list(json_list, output_filename)
+
+
+ def jsonlist_dict_summary(jsonlist_dict: Dict[str, List[dict]]):
+     rows = []
+     for k, jsonlist in jsonlist_dict.items():
+         if len(jsonlist) == 0:
+             continue
+         row = {
+             "sheet_name": k,
+             "length": len(jsonlist),
+             "columns": str(list(jsonlist[0].keys())),
+         }
+         rows.append(row)
+     df = pd.DataFrame(rows)
+     return df
+
+
+ def print_in_json(text: str):
+     print(json.dumps({"text": text}, indent=2, ensure_ascii=False))
+
+
+ def apply_changes_to_jsonlist(
+     jsonlist: List[Dict[str, str]],
+     changes: Dict[str, Callable[[Dict[str, str]], Tuple[Literal["deleted", "updated", "unchanged"], Dict[str, str]]]],
+     verbose=False,
+     **kwargs,
+ ):
+     rows = jsonlist
+     total_updated = 0
+     total_deleted = 0
+     for name, change in changes.items():
+         new_rows = []
+         updated = 0
+         deleted = 0
+         for row in rows:
+             status, new_row = change(row, **kwargs)
+             if status == "deleted":
+                 deleted += 1
+                 continue
+             if status == "updated":
+                 updated += 1
+             new_rows.append(new_row)
+         rows = new_rows
+         msgs = []
+         if updated > 0:
+             total_updated += updated
+             msgs += [f"updated {updated}"]
+         if deleted > 0:
+             total_deleted += deleted
+             msgs += [f"deleted {deleted}"]
+         if verbose and (updated > 0 or deleted > 0):
+             logger.info(f"{name}: {', '.join(msgs)}, remained {len(new_rows)} rows.")
+     return rows, total_updated, total_deleted
+
+
+ def apply_changes_to_paths(
+     paths: List[Path],
+     changes: Dict[str, Callable[[Dict[str, str]], Tuple[Literal["deleted", "updated", "unchanged"], Dict[str, str]]]],
+     verbose=False,
+     save=False,
+     load_json=load_json,
+     save_json=save_json,
+     **kwargs,
+ ):
+     total_updated = 0
+     total_deleted = 0
+     for path in paths:
+         if verbose:
+             print("checking", path)
+         jsonlist = load_json(path)
+         kwargs["path"] = path
+         new_jsonlist, updated, deleted = apply_changes_to_jsonlist(jsonlist, changes, verbose, **kwargs)
+         msgs = [f"total {len(jsonlist)} -> {len(new_jsonlist)}"]
+         if updated > 0:
+             total_updated += updated
+             msgs += [f"updated {updated}"]
+         if deleted > 0:
+             msgs += [f"deleted {deleted}"]
+             total_deleted += deleted
+         if updated > 0 or deleted > 0:
+             print(f"{path}: {', '.join(msgs)}")
+         if save:
+             if len(new_jsonlist) > 0:
+                 save_json(new_jsonlist, path)
+             else:
+                 path.unlink()
+     print(f"total: updated {total_updated}, deleted {total_deleted}")
+
+
+ def backup_current_output(row: Dict[str, str], output_key="output"):
+     # keep a history: old_output, old_output1, ..., old_output9
+     if "old_output" in row:
+         for i in range(1, 10):
+             if f"old_output{i}" not in row:
+                 row[f"old_output{i}"] = row[output_key]
+                 break
+     else:
+         row["old_output"] = row[output_key]
+     return row
+
+
+ def backup_and_set_output(row: Dict[str, str], output: str):
+     backup_current_output(row)
+     row["output"] = output
+     return row
+
+
+ def generator_from_paths(paths: List[Path], load_data: Callable[[Path], List[Dict[str, Any]]] = load_json):
+     for path in paths:
+         jsonlist: List[Dict[str, Any]] = load_data(path)
+         for row in jsonlist:
+             yield path, row
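
A short sketch of how these JSON/JSONL helpers compose. The sibling imports later in this diff suggest the module is importable as `jsonl`; treat that path, and the file names, as assumptions.

    from jsonl import load_json_list, save_json_list, apply_changes_to_jsonlist

    rows = load_json_list("data.jsonl")                     # silently skips lines that fail to parse

    def drop_empty(row, **kwargs):                          # a "change" returns (status, new_row)
        return ("deleted", row) if not row.get("output") else ("unchanged", row)

    rows, updated, deleted = apply_changes_to_jsonlist(rows, {"drop_empty": drop_empty}, verbose=True)
    save_json_list(rows, "cleaned.jsonl")
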
@@ -0,0 +1,188 @@
+ import time
+ import os
+ import multiprocessing
+ from multiprocessing.pool import ThreadPool
+ from typing import *
+
+ import pandas as pd
+ from pathlib import Path
+ from tqdm import tqdm
+ from loguru import logger
+
+ from jsonl import dataframe_to_json_list, load_json_list, save_json_list, load_json, save_json
+
+
+ cpu_count = multiprocessing.cpu_count()
+ # pool = ThreadPool(cpu_count)  # the upstream LLM API is flaky; hitting it too fast truncates answers
+ thread_pool_size = int(os.getenv("THREAD_POOL_SIZE", 5))
+ pool = ThreadPool(thread_pool_size)
+ logger.debug(f"pool size: {thread_pool_size}, cpu count: {cpu_count}")
+
+
+ def multiprocessing_mapping_jsonlist(
+     jsonlist: List[Any],
+     output_path: Optional[Union[str, Path]],
+     partial_func,
+     batch_size=cpu_count * 2,
+     cache_batch_num=1,
+     thread_pool_size=int(os.getenv("THREAD_POOL_SIZE", 5)),
+ ):
+     """map each row to a new row with a thread pool, optionally caching partial results
+
+     Args:
+         jsonlist (List[Any]): rows to process
+         output_path (Path): cache path; useful when the dataset is large
+         partial_func (function): (Dict[str, str]) -> Dict[str, str]
+     """
+     need_caching = output_path is not None
+     tmp_list, output_list = list(), list()
+     start_idx = 0
+     if need_caching:
+         output_path = Path(output_path)
+         if output_path.exists():
+             output_list = load_json_list(output_path)
+             start_idx = len(output_list)
+             logger.warning(f"Cache found {output_path} has {start_idx} rows. This process will continue at row index {start_idx}.")
+             logger.warning(f"缓存 {output_path} 存在 {start_idx} 行. 本次处理将从第 {start_idx} 行开始.")
+         else:
+             output_path.parent.mkdir(parents=True, exist_ok=True)
+     pool = ThreadPool(thread_pool_size)
+     logger.debug(f"pool size: {thread_pool_size}, cpu count: {cpu_count}")
+     start_time = time.time()
+     last_save_time = start_time
+     for i, line in tqdm(list(enumerate(jsonlist))):
+         if i < start_idx:
+             continue
+         tmp_list.append(line)
+         if len(tmp_list) == batch_size:
+             results = pool.map(partial_func, tmp_list)
+             output_list.extend([x for x in results])
+             tmp_list = list()
+             if need_caching and (i // batch_size) % cache_batch_num == 0:
+                 current_time = time.time()
+                 if current_time - last_save_time < 3:
+                     # if batches finish in under 3 seconds, skip the intermediate save so I/O does not become the bottleneck
+                     last_save_time = current_time
+                     continue
+                 save_json_list(output_list, output_path)
+                 last_save_time = time.time()
+     if len(tmp_list) > 0:
+         results = pool.map(partial_func, tmp_list)
+         output_list.extend([x for x in results])
+     if need_caching:
+         save_json_list(output_list, output_path)
+     return output_list
+
+
+ def multiprocessing_mapping(
+     df: pd.DataFrame,
+     output_path: Optional[Union[str, Path]],
+     partial_func,
+     batch_size=cpu_count * 2,
+     cache_batch_num=1,
+     thread_pool_size=int(os.getenv("THREAD_POOL_SIZE", 5)),
+ ):
+     """mapping a column to another column
+
+     Args:
+         df (DataFrame): input table
+         output_path (Path): cache path; useful when the dataset is large
+         partial_func (function): (Dict[str, str]) -> Dict[str, str]
+     """
+     need_caching = output_path is not None
+     tmp_list, output_list = list(), list()
+     start_idx = 0
+     if need_caching:
+         output_path = Path(output_path)
+         if output_path.exists():
+             # existed_df = read_as_dataframe(output_path)
+             # start_idx = len(existed_df)
+             # output_list = dataframe_to_json_list(existed_df)
+             # logger.warning(f"Cache found {output_path} has {start_idx} rows. This process will continue at row index {start_idx}.")
+             # logger.warning(f"缓存 {output_path} 存在 {start_idx} 行. 本次处理将从第 {start_idx} 行开始.")
+             pass
+         else:
+             output_path.parent.mkdir(parents=True, exist_ok=True)
+     pool = ThreadPool(thread_pool_size)
+     logger.debug(f"pool size: {thread_pool_size}, cpu count: {cpu_count}")
+     start_time = time.time()
+     last_save_time = start_time
+     for i, line in tqdm(list(df.iterrows())):
+         if i < start_idx:
+             continue
+         line_info: dict = line.to_dict()
+         line_info: Dict[str, str] = {str(k): str(v) for k, v in line_info.items()}
+         tmp_list.append(line_info)
+         if len(tmp_list) == batch_size:
+             results = pool.map(partial_func, tmp_list)
+             output_list.extend([x for x in results])
+             tmp_list = list()
+             if need_caching and (i // batch_size) % cache_batch_num == 0:
+                 current_time = time.time()
+                 if current_time - last_save_time < 3:
+                     # if batches finish in under 3 seconds, skip the intermediate save so I/O does not become the bottleneck
+                     last_save_time = current_time
+                     continue
+                 output_df = pd.DataFrame(output_list)
+                 output_df.to_excel(output_path, index=False)
+                 last_save_time = time.time()
+     if len(tmp_list) > 0:
+         results = pool.map(partial_func, tmp_list)
+         output_list.extend([x for x in results])
+     output_df = pd.DataFrame(output_list)
+     if need_caching:
+         output_df.to_excel(output_path, index=False)
+     return output_df, output_list
+
+
+ def dataframe_by_row_mapping(
+     df: pd.DataFrame,
+     mapping_func: Callable[[dict], Tuple[bool, dict]],
+     use_multiprocessing=True,
+     thread_pool_size=int(os.getenv("THREAD_POOL_SIZE", 5)),
+ ):
+     rows = []
+     if use_multiprocessing:
+         pool = ThreadPool(thread_pool_size)
+         logger.debug(f"pool size: {thread_pool_size}, cpu count: {cpu_count}")
+         results = pool.map(mapping_func, dataframe_to_json_list(df))
+         for ok, row in results:
+             if ok:
+                 rows.append(row)
+     else:
+         for i, row in df.iterrows():
+             # pass a dict so both branches feed mapping_func the same row type
+             ok, row = mapping_func(dict(row))
+             if ok:
+                 rows.append(row)
+     df = pd.DataFrame(rows)
+     return df
+
+
+ def continue_run(
+     jsonfiles: List[str],
+     save_dir: str,
+     mapping_func,
+     load_func=load_json,
+     save_func=save_json,
+     batch_size=1024,
+     cache_size=8,
+ ):
+     save_dir: Path = Path(save_dir)
+     save_dir.mkdir(parents=True, exist_ok=True)
+     new_jsonfiles = []
+     for jsonfile in jsonfiles:
+         jsonfile = Path(jsonfile)
+         jsonlist = load_func(jsonfile)
+         output_filepath = save_dir / jsonfile.name
+         for row in jsonlist:
+             row["来源"] = jsonfile.name
+         new_jsonlist = multiprocessing_mapping_jsonlist(
+             jsonlist,
+             output_filepath,
+             mapping_func,
+             batch_size,
+             cache_size,
+         )
+         save_func(new_jsonlist, output_filepath)
+         new_jsonfiles.append(output_filepath)
+     return new_jsonfiles
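
A usage sketch for the cached thread-pool mapping above. The module's filename is not shown in this diff, so the import is omitted; `add_length` stands in for an expensive per-row call such as an LLM request.

    def add_length(row):                          # (dict) -> dict
        row["length"] = len(row.get("query", ""))
        return row

    results = multiprocessing_mapping_jsonlist(
        jsonlist=[{"query": "hello"}, {"query": "world"}],
        output_path="cache/mapped.jsonl",         # partial results are flushed here and resumed on rerun
        partial_func=add_length,
    )
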
@@ -0,0 +1,233 @@
+ from collections import defaultdict
+ import os
+ from typing import Callable, Dict, List, Optional, Tuple, Union
+ from pathlib import Path
+ from loguru import logger
+
+ import pandas as pd
+ import pyexcel
+
+ from util import ls
+ from jsonl import dataframe_to_json_list, load_json, load_json_list, save_json_list
+ from xls2xlsx import is_xslx
+
+
+ def valid_to_read_as_dataframe(filename: str) -> bool:
+     suffix_list = [".json", ".jsonl", ".xlsx", ".xls", ".csv"]
+     return any([filename.endswith(suffix) for suffix in suffix_list])
+
+
+ def read_as_dataframe(filepath: Union[str, Path], sheet_name: Optional[str] = None, fill_empty_str_to_na=True) -> pd.DataFrame:
+     """
+     Read a file (or every readable file in a directory) as a DataFrame.
+     """
+     filepath = Path(filepath)
+     if filepath.is_dir():
+         paths = ls(filepath, expand_all_subdir=True)
+         df_list = []
+         for path in paths:
+             try:
+                 df = read_as_dataframe(path, sheet_name, fill_empty_str_to_na)
+                 df['数据来源'] = path.name
+             except Exception:
+                 df = pd.DataFrame()
+             df_list.append(df)
+         df = pd.concat(df_list)
+         if fill_empty_str_to_na:
+             df.fillna("", inplace=True)
+         return df
+     filename = filepath.name
+     if filename.endswith(".json") or filename.endswith(".jsonl"):
+         try:
+             json_list = load_json(filepath)
+         except Exception:
+             json_list = load_json_list(filepath)
+         df = pd.DataFrame(json_list)
+     elif filename.endswith(".xlsx"):
+         df = pd.read_excel(filepath) if sheet_name is None else pd.read_excel(filepath, sheet_name)
+     elif filename.endswith(".xls"):
+         if is_xslx(filepath):
+             df = pd.read_excel(filepath) if sheet_name is None else pd.read_excel(filepath, sheet_name)
+         else:
+             # pyexcel returns raw rows rather than a DataFrame; assume the first row is the header
+             rows = pyexcel.get_array(file_name=str(filepath))
+             df = pd.DataFrame(rows[1:], columns=rows[0]) if rows else pd.DataFrame()
+     elif filename.endswith(".csv"):
+         df = pd.read_csv(filepath)
+     else:
+         raise ValueError(f"Unsupported filetype {filepath}. filetype not in [json, jsonl, xlsx, xls, csv]")
+     if fill_empty_str_to_na:
+         df.fillna("", inplace=True)
+     return df
+
+
+ def read_maybe_dir_as_dataframe(filepath: Union[str, Path], sheet_name: Optional[str] = None) -> pd.DataFrame:
+     """
+     If the input path is a directory, concatenate every readable table under it (all files must share the same header).
+     Otherwise treat it as a single file and read it as a DataFrame.
+     """
+     out_list = list()
+     if not isinstance(filepath, Path):
+         filepath = Path(filepath)
+     if not filepath.exists():
+         raise ValueError(f"Path Not Exist: {filepath}")
+     if not filepath.is_dir():
+         return read_as_dataframe(filepath, sheet_name)
+
+     files = os.listdir(filepath)
+     for file_name in files:
+         if not valid_to_read_as_dataframe(file_name):
+             continue
+         input_file = filepath / file_name
+         df = read_as_dataframe(input_file, sheet_name)
+         df.fillna("", inplace=True)
+         for _, line in df.iterrows():
+             line = line.to_dict()
+             out_list.append(line)
+     df = pd.DataFrame(out_list)
+     return df
+
+
+ def read_as_dataframe_dict(filepath: Union[str, Path], fill_empty_str_to_na=True):
+     filepath = Path(filepath)
+     if filepath.is_dir():
+         paths = ls(filepath, expand_all_subdir=True)
+         df_dict_list = []
+         for path in paths:
+             try:
+                 df_dict = read_as_dataframe_dict(path, fill_empty_str_to_na)
+             except Exception:
+                 df_dict = {}
+             df_dict_list.append(df_dict)
+         df_dict = merge_multiple_df_dict(df_dict_list)
+         return df_dict
+     df_dict: Dict[str, pd.DataFrame] = pd.read_excel(filepath, sheet_name=None)
+     if isinstance(df_dict, dict):
+         for name, df in df_dict.items():
+             if fill_empty_str_to_na:
+                 df.fillna("", inplace=True)
+             df['数据来源'] = filepath.name
+     elif isinstance(df_dict, pd.DataFrame):
+         if fill_empty_str_to_na:
+             df_dict.fillna("", inplace=True)
+         df_dict['数据来源'] = filepath.name
+     return df_dict
+
+
+ def df_dict_summary(df_dict: Dict[str, pd.DataFrame]):
+     rows = []
+     for k, df in df_dict.items():
+         row = {
+             "sheet_name": k,
+             "length": len(df),
+             "columns": str(list(df.columns)),
+         }
+         rows.append(row)
+     df = pd.DataFrame(rows)
+     return df
+
+
+ def save_df_dict(df_dict: Dict[str, pd.DataFrame], output_filepath: Union[str, Path]):
+     if not isinstance(output_filepath, Path):
+         output_filepath = Path(output_filepath)
+     output_filepath.parent.mkdir(parents=True, exist_ok=True)
+     with pd.ExcelWriter(output_filepath, engine="xlsxwriter") as writer:
+         for k, df in df_dict.items():
+             if len(k) > 31:
+                 logger.warning(f"表名太长,自动截断了:[{k}]的长度为{len(k)}")
+             df.to_excel(writer, sheet_name=k[:31], index=False)
+     return output_filepath
+
+
+ def save_df_from_jsonlist(jsonlist: List[Dict[str, str]], output_filepath: Union[str, Path]):
+     df = pd.DataFrame(jsonlist)
+     return save_df(df, output_filepath)
+
+
+ def save_df(df: pd.DataFrame, output_filepath: Union[str, Path]):
+     if not isinstance(output_filepath, Path):
+         output_filepath = Path(output_filepath)
+     output_filepath.parent.mkdir(parents=True, exist_ok=True)
+     df.to_excel(output_filepath, index=False)
+     return output_filepath
+
+
+ def lazy_build_dataframe(name: str, output_filepath: Path, func, filetype: str = "xlsx"):
+     logger.info(name)
+     output_filepath.parent.mkdir(parents=True, exist_ok=True)
+     if output_filepath.exists():
+         df = read_as_dataframe(output_filepath)
+     else:
+         df: pd.DataFrame = func()
+         filename = output_filepath.name.split(".")[0]
+         if filetype == "xlsx":
+             df.to_excel(output_filepath.parent / f"{filename}.xlsx", index=False)
+         elif filetype == "json":
+             save_json_list(dataframe_to_json_list(df), output_filepath.parent / f"{filename}.json")
+         elif filetype == "jsonl":
+             save_json_list(dataframe_to_json_list(df), output_filepath.parent / f"{filename}.jsonl")
+         else:
+             logger.warning(f"不认识的 {filetype},默认保存为 xlsx")
+             df.to_excel(output_filepath.parent / f"{filename}.xlsx", index=False)
+     logger.info(f"{name}结果保存在 {output_filepath}")
+     return df
+
+
+ def lazy_build_dataframe_dict(name: str, output_filepath: Path, df_dict: Dict[str, pd.DataFrame], func, skip_sheets: List[str] = list()):
+     logger.info(name)
+     output_filepath.parent.mkdir(parents=True, exist_ok=True)
+     if output_filepath.exists():
+         new_df_dict = read_as_dataframe_dict(output_filepath)
+     else:
+         new_df_dict = {}
+         for sheet_name, df in df_dict.items():
+             if sheet_name in skip_sheets:
+                 continue
+             df = func(sheet_name, df)
+             new_df_dict[sheet_name] = df
+         save_df_dict(new_df_dict, output_filepath)
+     logger.info(f"{name}结果保存在 {output_filepath}")
+     return new_df_dict
+
+
+ def merge_multiple_df_dict(list_of_df_dict: List[Dict[str, pd.DataFrame]], sort=True):
+     df_dict_merged = defaultdict(list)
+     for df_dict in list_of_df_dict:
+         for k, df in df_dict.items():
+             df_dict_merged[k].append(df)
+     df_dict_merged: Dict[str, pd.DataFrame] = {k: pd.concat(v) for k, v in df_dict_merged.items()}
+     if sort:
+         df_dict_merged: Dict[str, pd.DataFrame] = {k: df_dict_merged[k] for k in sorted(df_dict_merged)}
+     return df_dict_merged
+
+
+ def remove_duplicate_and_sort(df: pd.DataFrame, key_col="query", sort_by='label'):
+     query_to_rows = {}
+     for i, row in df.iterrows():
+         query_to_rows[row[key_col]] = row
+     rows = sorted(list(query_to_rows.values()), key=lambda row: row[sort_by])
+     df_filtered = pd.DataFrame(rows)
+     return df_filtered
+
+
+ def color_negative_red(x):
+     color = "red" if x < 0 else ""
+     return f"color: {color}"
+
+
+ def highlight_max(x):
+     is_max = x == x.max()
+     return [("background-color: yellow" if m else "") for m in is_max]
+
+
+ def split_dataframe(df: pd.DataFrame, output_dir: Union[str, Path], tag: str, split_count=6):
+     output_dir = Path(output_dir)
+     output_dir.mkdir(parents=True, exist_ok=True)
+     rows = dataframe_to_json_list(df)
+     split_step = len(rows) // split_count + 1
+     df_list = []
+     for i in range(0, len(rows), split_step):
+         filepath = output_dir / f"{tag}_{i // split_step}.xlsx"
+         df_i = pd.DataFrame(rows[i:i+split_step])
+         df_i.to_excel(filepath, index=False)
+         df_list.append(df_i)
+     return df_list
+
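
A sketch of the read/save round trip these helpers support. Imports are omitted because the diff does not show this module's filename; `data/` and the sheet names are placeholders.

    df = read_as_dataframe("data/input.xlsx")               # also accepts .json/.jsonl/.xls/.csv or a directory
    df_dict = {"sheet_a": df, "sheet_b": df.head(10)}
    save_df_dict(df_dict, "output/merged.xlsx")             # one sheet per key, sheet names truncated to 31 chars
    parts = split_dataframe(df, "output/parts", tag="batch", split_count=3)
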
@@ -0,0 +1,33 @@
+ from typing import List
+
+ import pandas as pd
+
+
+ def bucket_count(length: List[int], step=50, skip_zero_count=False):
+     # `length` is expected to be sorted ascending (as produced by statistic_char_length)
+     grouped_count = []
+     j = 0
+     for i in range(0, max(length) + step, step):
+         grouped_count.append(0)
+         while j < len(length) and length[j] < i:
+             grouped_count[i // step] += 1
+             j += 1
+     x, y = [], []
+     for i, j in enumerate(grouped_count):
+         if i == 0:
+             continue
+         if skip_zero_count and j == 0:
+             continue
+         print(f"[{(i-1)*step}, {i*step}) {j} {sum(grouped_count[:i+1])/len(length)*100:.2f}%")
+         x.append((i - 1) * step)
+         y.append(j)
+     return x, y
+
+
+ def statistic_char_length(df: pd.DataFrame, instruction_key="instruction"):
+     length = []
+     for i, row in df.iterrows():
+         length.append(len(row[instruction_key]))
+     length.sort()
+     return length
+
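
For example, bucketing instruction lengths into 50-character bins (imports omitted as above; `df` is any DataFrame with an `instruction` column):

    lengths = statistic_char_length(df)      # sorted character lengths
    x, y = bucket_count(lengths, step=50)    # prints one "[lo, hi) count cumulative%" line per bucket
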
@@ -0,0 +1,10 @@
+
+ blue = "\x1b[34m"
+ cyan = "\x1b[36;21m"
+ green = "\x1b[32;21m"
+ orange = "\x1b[33;21m"
+ grey = "\x1b[38;21m"
+ yellow = "\x1b[33;21m"
+ red = "\x1b[31;21m"
+ bold_red = "\x1b[31;1m"
+ reset = "\x1b[0m"
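
These are plain ANSI escape sequences; a minimal way to use them (the import path is an assumption, since the diff does not show this file's name):

    # hypothetical import path for the color constants defined above
    from xlin.colors import red, green, reset
    print(f"{red}error{reset} and {green}ok{reset}")   # always end colored spans with `reset`
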
@@ -0,0 +1,321 @@
+ from collections import Counter, defaultdict
+ from typing import *
+ from pathlib import Path
+ import pandas as pd
+ import os
+ import asyncio
+ import datetime
+ from loguru import logger
+ import shutil
+ import random
+
+
+ date_str = datetime.datetime.now().strftime("%Y%m%d")
+ datetime_str = datetime.datetime.now().strftime("%Y%m%d_%Hh%Mm%Ss")
+
+
+ def random_timestamp(start_timestamp=None, end_timestamp=None):
+     if start_timestamp is None:
+         start_timestamp = datetime.datetime(2024, 1, 1).timestamp()
+     if end_timestamp is None:
+         end_timestamp = datetime.datetime.now().timestamp()
+     return random.uniform(start_timestamp, end_timestamp)
+
+
+ def random_timestamp_str(start_timestamp=None, end_timestamp=None, format="%Y年%m月%d日%H时%M分"):
+     return datetime.datetime.fromtimestamp(random_timestamp(start_timestamp, end_timestamp)).strftime(format)
+
+
+ def auto_retry_to_get_data(retry_times, request, data_key="data", *args, **kwargs):
+     if retry_times == 0:
+         return {}
+     resp = request(*args, **kwargs)
+     if resp is not None:
+         if data_key is None:
+             return resp
+         elif data_key in resp and resp[data_key] is not None:
+             return resp[data_key]
+     logger.debug(f"[error! retrying...] {resp}")
+     return auto_retry_to_get_data(retry_times - 1, request, data_key, *args, **kwargs)
+
+
+ def append_column(df: pd.DataFrame, query_column: str, output_column: str, transform):
+     query = df[query_column].tolist()
+     loop = asyncio.get_event_loop()
+     result = loop.run_until_complete(transform(query))
+     df[output_column] = [str(r) for r in result]
+     return df
+
+
+ def request_wrapper(request_num=10):
+     def request_wrapper_body(func):
+         def wrapper(*args, **kwargs):
+             c = request_num
+             execute_num = 0
+             while c > 0:
+                 c -= 1
+                 res = func(*args, **kwargs)
+                 execute_num += 1
+                 if res != "-1":
+                     logger.debug("{} execute_num: {}".format(func.__name__, execute_num))
+                     return res
+             logger.debug("{} execute_num: {}".format(func.__name__, execute_num))
+             return ""
+
+         return wrapper
+
+     return request_wrapper_body
+
+
+ def copy_file(input_filepath, output_filepath, force_overwrite=False, verbose=False):
+     if verbose:
+         logger.info(f"正在复制 {input_filepath} 到 {output_filepath}")
+     if not isinstance(output_filepath, Path):
+         output_filepath = Path(output_filepath)
+     if output_filepath.exists() and not force_overwrite:
+         if verbose:
+             logger.warning(f"文件已存在,跳过复制:{output_filepath}")
+         return output_filepath
+     shutil.copy(input_filepath, output_filepath, follow_symlinks=True)
+     return output_filepath
+
+
+ def rm(dir_path: Union[str, Path, List[str], List[Path]], filter: Callable[[Path], bool] = lambda filepath: True, expand_all_subdir=True, debug=False):
+     if isinstance(dir_path, str) and "," in dir_path:
+         for path in dir_path.split(","):
+             rm(path, filter, expand_all_subdir)
+         return
+     if isinstance(dir_path, list):
+         for path in dir_path:
+             rm(path, filter, expand_all_subdir)
+         return
+     dir_path = Path(dir_path)
+     if not dir_path.exists():
+         if debug:
+             print(f"路径不存在 {dir_path}")
+         return
+     if not dir_path.is_dir():
+         if filter(dir_path):
+             dir_path.unlink()
+             if debug:
+                 print(f"删除文件 {dir_path}")
+         return
+     filenames = os.listdir(dir_path)
+     for filename in sorted(filenames):
+         filepath = dir_path / filename
+         if debug:
+             print("checking", filepath)
+         if filepath.is_dir():
+             paths = ls(filepath, filter, expand_all_subdir)
+             if len(paths) > 0:
+                 rm(paths, filter, expand_all_subdir)
+             child = filepath
+             while child.exists() and len(os.listdir(child)) > 0:
+                 child = child / os.listdir(child)[0]
+             while child != filepath:
+                 if child.exists() and len(os.listdir(child)) == 0:
+                     child.rmdir()
+                     if debug:
+                         print(f"删除空文件夹 {child}")
+                 else:
+                     break
+             if filepath.exists() and len(os.listdir(filepath)) == 0:
+                 filepath.rmdir()
+                 if debug:
+                     print(f"删除空文件夹 {filepath}")
+         elif filter(filepath):
+             rm(filepath, filter, expand_all_subdir)
+     if dir_path.exists() and len(os.listdir(dir_path)) == 0:
+         dir_path.rmdir()
+         if debug:
+             print(f"删除空文件夹 {dir_path}")
+
+
+ def cp(
+     input_dir_path: Union[str, Path, List[str], List[Path]],
+     output_dir_path: Union[str, Path],
+     base_input_dir: Optional[Union[Path, str]] = None,
+     force_overwrite: bool = False,
+     filter: Callable[[Path], bool] = lambda filepath: True,
+     expand_all_subdir=True,
+     verbose=False,
+ ):
+     input_paths = ls(input_dir_path, filter, expand_all_subdir)
+     if len(input_paths) == 0:
+         if verbose:
+             logger.warning(f"no files in {input_dir_path}")
+         return
+     if base_input_dir is None:
+         # compute the longest common path of all inputs
+         if len(input_paths) > 1:
+             base_input_dir = os.path.commonpath([str(p) for p in input_paths])
+         else:
+             base_input_dir = input_paths[0].parent
+     base_input_dir = Path(base_input_dir)
+     output_dir_path = Path(output_dir_path)
+     for input_path in input_paths:
+         relative_path = input_path.relative_to(base_input_dir)
+         output_path = output_dir_path / relative_path
+         output_path.parent.mkdir(parents=True, exist_ok=True)
+         copy_file(input_path, output_path, force_overwrite, verbose)
+
+
+ def ls(dir_path: Union[str, Path, List[str], List[Path]], filter: Callable[[Path], bool] = lambda filepath: True, expand_all_subdir=True):
+     """list all files, return a list of filepaths
+
+     Args:
+         dir_path (Union[str, Path]): dir
+         filter ((Path) -> bool, optional): filter. Defaults to lambda filepath: True.
+         expand_all_subdir (bool, optional): whether to recurse into subdirectories. Defaults to True.
+
+     Returns:
+         List[Path]: never None, may be an empty list []
+     """
+     filepaths: List[Path] = []
+     if isinstance(dir_path, str) and "," in dir_path:
+         for path in dir_path.split(","):
+             filepaths.extend(ls(path, filter, expand_all_subdir))
+         return filepaths
+     if isinstance(dir_path, list):
+         for path in dir_path:
+             filepaths.extend(ls(path, filter, expand_all_subdir))
+         return filepaths
+     dir_path = Path(dir_path)
+     if not dir_path.exists():
+         return filepaths
+     if not dir_path.is_dir():
+         if filter(dir_path):
+             return [dir_path]
+         else:
+             return filepaths
+     filenames = os.listdir(dir_path)
+     for filename in sorted(filenames):
+         filepath = dir_path / filename
+         if filepath.is_dir():
+             if expand_all_subdir:
+                 filepaths.extend(ls(filepath, filter, expand_all_subdir))
+         elif filter(filepath):
+             filepaths.append(filepath)
+     return filepaths
+
+
+ def clean_empty_folder(dir_path):
+     dir_path = Path(dir_path)
+     sub_names = os.listdir(dir_path)
+     if not sub_names or len(sub_names) == 0:
+         print(f"clean empty folder: {dir_path}")
+         dir_path.rmdir()
+         clean_empty_folder(dir_path.parent)
+     else:
+         for sub_name in sub_names:
+             path = dir_path / sub_name
+             if path.is_dir():
+                 clean_empty_folder(path)
+
+
+ def grouped_col_list(df: pd.DataFrame, key_col="query", value_col="output"):
+     grouped = defaultdict(list)
+     if key_col not in df.columns:
+         logger.warning(f"`{key_col}` not in columns: {list(df.columns)}")
+         return grouped
+     for i, row in df.iterrows():
+         grouped[row[key_col]].append(row[value_col])
+     return grouped
+
+
+ def grouped_col(df: pd.DataFrame, key_col="query", value_col="output"):
+     grouped = {}
+     if key_col not in df.columns:
+         logger.warning(f"`{key_col}` not in columns: {list(df.columns)}")
+         return grouped
+     for i, row in df.iterrows():
+         grouped[row[key_col]] = row[value_col]
+     return grouped
+
+
+ def grouped_row(df: pd.DataFrame, key_col="query"):
+     grouped = defaultdict(list)
+     if key_col not in df.columns:
+         logger.warning(f"`{key_col}` not in columns: {list(df.columns)}")
+         return grouped
+     for i, row in df.iterrows():
+         grouped[row[key_col]].append(row)
+     return grouped
+
+
+ def grouped_row_in_jsonlist(jsonlist: List[Dict[str, Any]], key_col="query"):
+     grouped = defaultdict(list)
+     for i, row in enumerate(jsonlist):
+         if key_col not in row:
+             logger.warning(f"`{key_col}` not in row: {row}")
+             notfound_key = f"NotFound:{key_col}"
+             grouped[notfound_key].append(row)
+             continue
+         grouped[row[key_col]].append(row)
+     return grouped
+
+
+ def submit_file(path: Union[str, Path], target_dir: Union[str, Path]):
+     p = Path(path).absolute()
+     target_dir = Path(target_dir).absolute()
+     logger.info(f"正在复制到目标文件夹 {target_dir}")
+     if p.is_dir():
+         logger.info(f"文件夹 {p}")
+         filenames = os.listdir(path)
+         for filename in filenames:
+             src_file = p / filename
+             tgt_file = target_dir / filename
+             copy_file(src_file, tgt_file)
+             logger.info(f"已复制 {filename} 到 {tgt_file}")
+     else:
+         filename = p.name
+         logger.info(f"文件 {filename}")
+         src_file = p
+         tgt_file = target_dir / filename
+         copy_file(src_file, tgt_file)
+         logger.info(f"已复制 {filename} 到 {tgt_file}")
+     filenames = os.listdir(target_dir)
+     logger.info("现在目标文件夹下的文件有:\n" + "\n".join(filenames))
+
+
+ def pretty_limited_text(text: str, limited_length: int = 300, language="zh"):
+     text = str(text).strip()
+     if len(text) > limited_length:
+         # if language == "zh":
+         #     tail = f"...(共{len(text)}字)"
+         # else:
+         #     tail = f"...({len(text)} words in total)"
+         # return text[: limited_length - len(tail)] + tail
+         return text[: limited_length // 2] + text[-limited_length // 2 :]
+     return text
+
+
+ def bucket_count(length):
+     # `length` is expected to be sorted ascending
+     grouped_count = []
+     j = 0
+     for i in range(0, max(length), 50):
+         grouped_count.append(0)
+         while j < len(length) and length[j] < i:
+             grouped_count[i // 50] += 1
+             j += 1
+     for i, j in enumerate(grouped_count):
+         if i == 0 or j == 0:
+             continue
+         print(f"[{(i-1)*50}, {i*50}) {j} {sum(grouped_count[:i+1])/len(length)*100:.2f}%")
+
+
+ def sortedCounter(obj, by="key", reverse=False, return_list=False):
+     c = Counter(obj)
+     c_list = [(k, c[k]) for k in c]
+     if by == "key":
+         c_list = sorted(c_list, key=lambda x: x[0], reverse=reverse)
+     elif by in ["value", "count"]:
+         c_list = sorted(c_list, key=lambda x: x[1], reverse=reverse)
+     else:
+         raise Exception(f"unsupported by: {by}")
+     c = Counter()
+     for k, v in c_list:
+         c[k] = v
+     if return_list:
+         return c, c_list
+     return c
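
A few of the filesystem helpers in action. This is a sketch: the paths are placeholders, and the module is assumed importable as `util`, matching the `from util import ls` seen earlier in this diff.

    from util import ls, cp, sortedCounter

    paths = ls("data/", filter=lambda p: p.suffix == ".jsonl")     # recursive file listing
    cp(paths, "backup/")                                           # mirrors the relative layout under backup/
    counts = sortedCounter([p.suffix for p in paths], by="count", reverse=True)
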
@@ -0,0 +1,13 @@
+ from typing import *
+ import uuid
+
+ import pandas as pd
+
+
+ def append_uuid_column(df: pd.DataFrame, uuid_key="uuid"):
+     rows = []
+     for i, row in df.iterrows():
+         row[uuid_key] = str(uuid.uuid4())
+         rows.append(row)
+     df = pd.DataFrame(rows)
+     return df
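
Usage is a one-liner (import omitted, since the diff does not show this module's filename):

    df = append_uuid_column(df)   # adds a "uuid" column with a fresh uuid4 per row
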
@@ -0,0 +1,33 @@
+ # pip install pyexcel pyexcel-xls pyexcel-xlsx
+ import os
+
+ import pyexcel as p
+
+
+ def convert_xls_dir_to_xlsx(data_dir):
+     filenames = os.listdir(data_dir)
+     for filename in filenames:
+         if filename.endswith(".xls"):
+             convert_xls_to_xlsx(os.path.join(data_dir, filename))
+
+
+ def convert_xls_to_xlsx(file_name):
+     converted_filename = file_name + 'x'
+     if is_xslx(file_name):
+         # already xlsx content behind a .xls extension: just copy the bytes to a .xlsx name
+         with open(file_name, 'rb') as f:
+             with open(converted_filename, 'wb') as f2:
+                 f2.write(f.read())
+         return converted_filename
+     sheet = p.get_sheet(file_name=file_name)
+     sheet.save_as(converted_filename)
+     return converted_filename
+
+
+ def is_xslx(filename):
+     # xlsx files are zip archives, so they start with the "PK" local-file-header signature
+     with open(filename, 'rb') as f:
+         first_four_bytes = f.read(4)
+     return first_four_bytes == b'PK\x03\x04'
+
+
+ if __name__ == "__main__":
+     import sys
+     convert_xls_to_xlsx(sys.argv[1])
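
A sketch of converting legacy spreadsheets; the module is importable as `xls2xlsx` per the import seen earlier, and the file names are placeholders.

    from xls2xlsx import convert_xls_to_xlsx, convert_xls_dir_to_xlsx

    new_path = convert_xls_to_xlsx("report.xls")   # returns "report.xlsx"
    convert_xls_dir_to_xlsx("legacy_sheets/")      # converts every .xls file in the directory
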
@@ -0,0 +1,8 @@
+ import yaml
+
+
+ def load_yaml(yaml_file: str):
+     with open(yaml_file, "r", encoding="utf-8") as f:
+         file_str = f.read()
+     schema = yaml.safe_load(file_str)
+     return schema
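
And the YAML loader (import omitted, module filename not shown in this diff):

    config = load_yaml("config.yaml")   # returns the parsed document (dict, list, or scalar)
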