xlin 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xlin-0.1.0/LICENSE +21 -0
- xlin-0.1.0/PKG-INFO +24 -0
- xlin-0.1.0/README.md +2 -0
- xlin-0.1.0/pyproject.toml +21 -0
- xlin-0.1.0/xlin/ischinese.py +13 -0
- xlin-0.1.0/xlin/jsonl.py +195 -0
- xlin-0.1.0/xlin/multiprocess_mapping.py +188 -0
- xlin-0.1.0/xlin/read_as_dataframe.py +233 -0
- xlin-0.1.0/xlin/statistic.py +33 -0
- xlin-0.1.0/xlin/terminal_color.py +10 -0
- xlin-0.1.0/xlin/util.py +321 -0
- xlin-0.1.0/xlin/uuid.py +13 -0
- xlin-0.1.0/xlin/xls2xlsx.py +33 -0
- xlin-0.1.0/xlin/yaml.py +8 -0
xlin-0.1.0/LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2024 兮尘

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
xlin-0.1.0/PKG-INFO
ADDED
@@ -0,0 +1,24 @@
Metadata-Version: 2.1
Name: xlin
Version: 0.1.0
Summary: toolbox for LinXueyuan
License: MIT
Author: XiChen
Author-email: 23211526+LinXueyuanStdio@users.noreply.github.com
Requires-Python: >=3.10,<4.0
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Requires-Dist: loguru (>=0.7.2,<0.8.0)
Requires-Dist: pandas (>=2.2.3,<3.0.0)
Requires-Dist: pyexcel (>=0.7.1,<0.8.0)
Requires-Dist: pyexcel-xls (>=0.7.0,<0.8.0)
Requires-Dist: pyexcel-xlsx (>=0.6.0,<0.7.0)
Requires-Dist: xlsxwriter (==3.1.2)
Description-Content-Type: text/markdown

# xlin
个人 python 工具代码
xlin-0.1.0/pyproject.toml
ADDED
@@ -0,0 +1,21 @@
[tool.poetry]
name = "xlin"
version = "0.1.0"
description = "toolbox for LinXueyuan"
authors = ["XiChen <23211526+LinXueyuanStdio@users.noreply.github.com>"]
license = "MIT"
readme = "README.md"

[tool.poetry.dependencies]
python = "^3.10"

loguru = "^0.7.2"
pandas = "^2.2.3"
pyexcel = "^0.7.1"
pyexcel-xls = "^0.7.0"
pyexcel-xlsx = "^0.6.0"
xlsxwriter = "3.1.2"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
xlin-0.1.0/xlin/jsonl.py
ADDED
@@ -0,0 +1,195 @@
import json
from typing import *

from pathlib import Path
from loguru import logger
import pandas as pd


def dataframe_to_json_list(df: pd.DataFrame):
    """
    Args:
        df (pd.DataFrame): df

    Returns:
        List[Dict[str, str]]: json list: [{"col1": "xxx", "col2": "xxx", ...}, ...]
    """
    json_list = []
    for i, line in df.iterrows():
        json_list.append(dict(line))
    return json_list


def transform_dataframe_to_json_list(df: pd.DataFrame, row_transform):
    """
    Args:
        df (pd.DataFrame): df
        row_transform : lambda row: prompt_template.format(row['query']), "", row['label']

    Returns:
        List[Dict[str, str]]: json list: [{"instruction": "xxx", "input": "xxx", "output": "xxx"}, ...]
    """
    out_list = list()
    for _, row in df.iterrows():
        instruction, input, output = row_transform(row)
        out_list.append({"instruction": instruction, "input": input, "output": output})
    return out_list


def jsonlist_to_dataframe(json_list: List[Dict[str, str]]):
    """
    Args:
        json_list (List[Dict[str, str]]): json list: [{"col1": "xxx", "col2": "xxx", ...}, ...]

    Returns:
        pd.DataFrame: df
    """
    return pd.DataFrame(json_list)


def load_json(filename: str):
    with open(filename, "r", encoding="utf-8") as f:
        return json.load(f)


def save_json(json_list: Union[Dict[str, str], List[Dict[str, str]]], filename: str):
    Path(filename).parent.mkdir(parents=True, exist_ok=True)
    with open(filename, "w", encoding="utf-8") as f:
        return json.dump(json_list, f, ensure_ascii=False, separators=(",", ":"), indent=2)


def load_json_list(filename: str):
    with open(filename, "r", encoding="utf-8") as f:
        lines = f.readlines()
    json_list = []
    for i in lines:
        try:
            obj = json.loads(i.strip())
        except:
            continue
        json_list.append(obj)
    return json_list


def save_json_list(json_list: List[Dict[str, str]], filename: str):
    Path(filename).parent.mkdir(parents=True, exist_ok=True)
    with open(filename, "w", encoding="utf-8") as f:
        f.write("\n".join([json.dumps(line, ensure_ascii=False, separators=(",", ":")) for line in json_list]))


def merge_json_list(filenames: List[str], output_filename: str):
    json_list = []
    for filename in filenames:
        json_list.extend(load_json_list(filename))
    save_json_list(json_list, output_filename)


def jsonlist_dict_summary(jsonlist_dict: Dict[str, List[dict]]):
    rows = []
    for k, jsonlist in jsonlist_dict.items():
        if len(jsonlist) == 0:
            continue
        row = {
            "sheet_name": k,
            "length": len(jsonlist),
            "columns": str(list(jsonlist[0].keys())),
        }
        rows.append(row)
    df = pd.DataFrame(rows)
    return df


def print_in_json(text: str):
    print(json.dumps({"text": text}, indent=2, ensure_ascii=False))


def apply_changes_to_jsonlist(
    jsonlist: List[Dict[str, str]],
    changes: Dict[str, Callable[[Dict[str, str]], Tuple[Literal["deleted", "updated", "unchanged"], Dict[str, str]]]],
    verbose=False,
    **kwargs,
):
    rows = jsonlist
    total_updated = 0
    total_deleted = 0
    for name, change in changes.items():
        new_rows = []
        updated = 0
        deleted = 0
        for row in rows:
            status, new_row = change(row, **kwargs)
            if status == "deleted":
                deleted += 1
                continue
            if status == "updated":
                updated += 1
            new_rows.append(new_row)
        rows = new_rows
        msgs = []
        if updated > 0:
            total_updated += updated
            msgs += [f"updated {updated}"]
        if deleted > 0:
            total_deleted += deleted
            msgs += [f"deleted {deleted}"]
        if verbose and updated > 0 or deleted > 0:
            logger.info(f"{name}: {', '.join(msgs)}, remained {len(new_rows)} rows.")
    return rows, total_updated, total_deleted


def apply_changes_to_paths(
    paths: List[Path],
    changes: Dict[str, Callable[[Dict[str, str]], Tuple[Literal["deleted", "updated", "unchanged"], Dict[str, str]]]],
    verbose=False,
    save=False,
    load_json=load_json,
    save_json=save_json,
    **kwargs,
):
    total_updated = 0
    total_deleted = 0
    for path in paths:
        if verbose:
            print("checking", path)
        jsonlist = load_json(path)
        kwargs["path"] = path
        new_jsonlist, updated, deleted = apply_changes_to_jsonlist(jsonlist, changes, verbose, **kwargs)
        msgs = [f"total {len(jsonlist)} -> {len(new_jsonlist)}"]
        if updated > 0:
            total_updated += updated
            msgs += [f"updated {updated}"]
        if deleted > 0:
            msgs += [f"deleted {deleted}"]
            total_deleted += deleted
        if updated > 0 or deleted > 0:
            print(f"{path}: {', '.join(msgs)}")
        if save:
            if len(new_jsonlist) > 0:
                save_json(new_jsonlist, path)
            else:
                path.unlink()
    print(f"total: updated {total_updated}, deleted {total_deleted}")


def backup_current_output(row: Dict[str, str], output_key="output"):
    if "old_output" in row:
        for i in range(1, 10):
            if f"old_output{i}" not in row:
                row[f"old_output{i}"] = row[output_key]
                break
    else:
        row["old_output"] = row[output_key]
    return row


def backup_and_set_output(row: Dict[str, str], output: str):
    backup_current_output(row)
    row["output"] = output
    return row


def generator_from_paths(paths: List[Path], load_data: Callable[[Path], List[Dict[str, Any]]] = load_json):
    for path in paths:
        jsonlist: List[Dict[str, Any]] = load_data(path)
        for row in jsonlist:
            yield path, row
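A hypothetical usage sketch of the jsonl helpers above, not part of the released package: it assumes the module is importable as xlin.jsonl after installing the package, that a data/input.jsonl file exists, and the drop_empty_output change function is made up for illustration.

from xlin.jsonl import load_json_list, save_json_list, apply_changes_to_jsonlist

rows = load_json_list("data/input.jsonl")  # one JSON object per line, bad lines are skipped

def drop_empty_output(row):
    # delete rows without an "output" field, leave everything else untouched
    if not row.get("output"):
        return "deleted", row
    return "unchanged", row

rows, updated, deleted = apply_changes_to_jsonlist(rows, {"drop_empty_output": drop_empty_output}, verbose=True)
save_json_list(rows, "data/cleaned.jsonl")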
xlin-0.1.0/xlin/multiprocess_mapping.py
ADDED
@@ -0,0 +1,188 @@
import time
import os
import multiprocessing
from multiprocessing.pool import ThreadPool
from typing import *

import pandas as pd
from pathlib import Path
from tqdm import tqdm
from loguru import logger

from jsonl import dataframe_to_json_list, load_json_list, save_json_list, load_json, save_json


cpu_count = multiprocessing.cpu_count()
# pool = ThreadPool(cpu_count) # 大模型接口辣鸡,太快会截断答案
thread_pool_size = int(os.getenv("THREAD_POOL_SIZE", 5))
pool = ThreadPool(thread_pool_size)
logger.debug(f"pool size: {thread_pool_size}, cpu count: {cpu_count}")


def multiprocessing_mapping_jsonlist(
    jsonlist: List[Any],
    output_path: Optional[Union[str, Path]],
    partial_func,
    batch_size=cpu_count * 2,
    cache_batch_num=1,
    thread_pool_size=int(os.getenv("THREAD_POOL_SIZE", 5)),
):
    """mapping a column to another column

    Args:
        df (DataFrame): [description]
        output_path (Path): 数据量大的时候需要缓存
        partial_func (function): (Dict[str, str]) -> Dict[str, str]
    """
    need_caching = output_path is not None
    tmp_list, output_list = list(), list()
    start_idx = 0
    if need_caching:
        output_path = Path(output_path)
        if output_path.exists():
            output_list = load_json_list(output_path)
            start_idx = len(output_list)
            logger.warning(f"Cache found {output_path} has {start_idx} rows. This process will continue at row index {start_idx}.")
            logger.warning(f"缓存 {output_path} 存在 {start_idx} 行. 本次处理将从第 {start_idx} 行开始.")
        else:
            output_path.parent.mkdir(parents=True, exist_ok=True)
    pool = ThreadPool(thread_pool_size)
    logger.debug(f"pool size: {thread_pool_size}, cpu count: {cpu_count}")
    start_time = time.time()
    last_save_time = start_time
    for i, line in tqdm(list(enumerate(jsonlist))):
        if i < start_idx:
            continue
        tmp_list.append(line)
        if len(tmp_list) == batch_size:
            results = pool.map(partial_func, tmp_list)
            output_list.extend([x for x in results])
            tmp_list = list()
            if need_caching and (i // batch_size) % cache_batch_num == 0:
                current_time = time.time()
                if current_time - last_save_time < 3:
                    # 如果多进程处理太快,为了不让 IO 成为瓶颈拉慢进度,不足 3 秒的批次都忽略,也不缓存中间结果
                    last_save_time = current_time
                    continue
                save_json_list(output_list, output_path)
                last_save_time = time.time()
    if len(tmp_list) > 0:
        results = pool.map(partial_func, tmp_list)
        output_list.extend([x for x in results])
    if need_caching:
        save_json_list(output_list, output_path)
    return output_list


def multiprocessing_mapping(
    df: pd.DataFrame,
    output_path: Optional[Union[str, Path]],
    partial_func,
    batch_size=cpu_count * 2,
    cache_batch_num=1,
    thread_pool_size=int(os.getenv("THREAD_POOL_SIZE", 5)),
):
    """mapping a column to another column

    Args:
        df (DataFrame): [description]
        output_path (Path): 数据量大的时候需要缓存
        partial_func (function): (Dict[str, str]) -> Dict[str, str]
    """
    need_caching = output_path is not None
    tmp_list, output_list = list(), list()
    start_idx = 0
    if need_caching:
        output_path = Path(output_path)
        if output_path.exists():
            # existed_df = read_as_dataframe(output_path)
            # start_idx = len(existed_df)
            # output_list = dataframe_to_json_list(existed_df)
            # logger.warning(f"Cache found {output_path} has {start_idx} rows. This process will continue at row index {start_idx}.")
            # logger.warning(f"缓存 {output_path} 存在 {start_idx} 行. 本次处理将从第 {start_idx} 行开始.")
            pass
        else:
            output_path.parent.mkdir(parents=True, exist_ok=True)
    pool = ThreadPool(thread_pool_size)
    logger.debug(f"pool size: {thread_pool_size}, cpu count: {cpu_count}")
    start_time = time.time()
    last_save_time = start_time
    for i, line in tqdm(list(df.iterrows())):
        if i < start_idx:
            continue
        line_info: dict = line.to_dict()
        line_info: Dict[str, str] = {str(k): str(v) for k, v in line_info.items()}
        tmp_list.append(line_info)
        if len(tmp_list) == batch_size:
            results = pool.map(partial_func, tmp_list)
            output_list.extend([x for x in results])
            tmp_list = list()
            if need_caching and (i // batch_size) % cache_batch_num == 0:
                current_time = time.time()
                if current_time - last_save_time < 3:
                    # 如果多进程处理太快,为了不让 IO 成为瓶颈拉慢进度,不足 3 秒的批次都忽略,也不缓存中间结果
                    last_save_time = current_time
                    continue
                output_df = pd.DataFrame(output_list)
                output_df.to_excel(output_path, index=False)
                last_save_time = time.time()
    if len(tmp_list) > 0:
        results = pool.map(partial_func, tmp_list)
        output_list.extend([x for x in results])
    output_df = pd.DataFrame(output_list)
    if need_caching:
        output_df.to_excel(output_path, index=False)
    return output_df, output_list


def dataframe_by_row_mapping(
    df: pd.DataFrame,
    mapping_func: Callable[[dict], Tuple[bool, dict]],
    use_multiprocessing=True,
    thread_pool_size=int(os.getenv("THREAD_POOL_SIZE", 5)),
):
    rows = []
    if use_multiprocessing:
        pool = ThreadPool(thread_pool_size)
        logger.debug(f"pool size: {thread_pool_size}, cpu count: {cpu_count}")
        results = pool.map(mapping_func, dataframe_to_json_list(df))
        for ok, row in results:
            if ok:
                rows.append(row)
    else:
        for i, row in df.iterrows():
            ok, row = mapping_func(row)
            if ok:
                rows.append(row)
    df = pd.DataFrame(rows)
    return df


def continue_run(
    jsonfiles: List[str],
    save_dir: str,
    mapping_func,
    load_func=load_json,
    save_func=save_json,
    batch_size=1024,
    cache_size=8,
):
    save_dir: Path = Path(save_dir)
    save_dir.mkdir(parents=True, exist_ok=True)
    new_jsonfiles = []
    for jsonfile in jsonfiles:
        jsonfile = Path(jsonfile)
        jsonlist = load_func(jsonfile)
        output_filepath = save_dir / jsonfile.name
        for row in jsonlist:
            row["来源"] = jsonfile.name
        new_jsonlist = multiprocessing_mapping_jsonlist(
            jsonlist,
            output_filepath,
            mapping_func,
            batch_size,
            cache_size,
        )
        save_func(new_jsonlist, output_filepath)
        new_jsonfiles.append(output_filepath)
    return new_jsonfiles
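A hypothetical usage sketch of multiprocessing_mapping_jsonlist, not part of the released package: it assumes the modules are importable as xlin.jsonl and xlin.multiprocess_mapping, that data/input.jsonl exists, and the add_prediction mapping function is made up for illustration. The point it illustrates is the cache-and-resume behavior driven by output_path.

from xlin.jsonl import load_json_list
from xlin.multiprocess_mapping import multiprocessing_mapping_jsonlist

def add_prediction(row):
    # placeholder mapping with the expected (Dict[str, str]) -> Dict[str, str] shape
    row["output"] = row["instruction"].upper()
    return row

rows = load_json_list("data/input.jsonl")
# intermediate results are cached to cache/output.jsonl, so an interrupted run
# resumes from the cached row index instead of starting over
results = multiprocessing_mapping_jsonlist(rows, "cache/output.jsonl", add_prediction)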
xlin-0.1.0/xlin/read_as_dataframe.py
ADDED
@@ -0,0 +1,233 @@
from collections import defaultdict
import os
from typing import Callable, Dict, List, Optional, Tuple, Union
from pathlib import Path
from loguru import logger

import pandas as pd
import pyexcel

from util import ls
from jsonl import dataframe_to_json_list, load_json, load_json_list, save_json_list
from xls2xlsx import is_xslx


def valid_to_read_as_dataframe(filename: str) -> bool:
    suffix_list = [".json", ".jsonl", ".xlsx", "xls", ".csv"]
    return any([filename.endswith(suffix) for suffix in suffix_list])


def read_as_dataframe(filepath: Union[str, Path], sheet_name: Optional[str] = None, fill_empty_str_to_na=True) -> pd.DataFrame:
    """
    读取文件为表格
    """
    filepath = Path(filepath)
    if filepath.is_dir():
        paths = ls(filepath, expand_all_subdir=True)
        df_list = []
        for path in paths:
            try:
                df = read_as_dataframe(path, sheet_name, fill_empty_str_to_na)
                df['数据来源'] = path.name
            except:
                df = pd.DataFrame()
            df_list.append(df)
        df = pd.concat(df_list)
        if fill_empty_str_to_na:
            df.fillna("", inplace=True)
        return df
    filename = filepath.name
    if filename.endswith(".json") or filename.endswith(".jsonl"):
        try:
            json_list = load_json(filepath)
        except:
            json_list = load_json_list(filepath)
        df = pd.DataFrame(json_list)
    elif filename.endswith(".xlsx"):
        df = pd.read_excel(filepath) if sheet_name is None else pd.read_excel(filepath, sheet_name)
    elif filename.endswith(".xls"):
        if is_xslx(filepath):
            df = pd.read_excel(filepath) if sheet_name is None else pd.read_excel(filepath, sheet_name)
        else:
            df = pyexcel.get_sheet(file_name=filepath)
    elif filename.endswith(".csv"):
        df = pd.read_csv(filepath)
    else:
        raise ValueError(f"Unsupported filetype {filepath}. filetype not in [json, jsonl, xlsx, xls, csv]")
    if fill_empty_str_to_na:
        df.fillna("", inplace=True)
    return df


def read_maybe_dir_as_dataframe(filepath: Union[str, Path], sheet_name: Optional[str] = None) -> pd.DataFrame:
    """
    input path 可能是文件夹,此时将文件夹下的所有表格拼接到一起返回,要求所有表头一致
    如果不是文件夹,则为文件,尝试直接读取为表格返回
    """
    out_list = list()
    if not isinstance(filepath, Path):
        filepath = Path(filepath)
    if not filepath.exists():
        raise ValueError(f"Path Not Exist: {filepath}")
    if not filepath.is_dir():
        return read_as_dataframe(filepath, sheet_name)

    files = os.listdir(filepath)
    for file_name in files:
        if not valid_to_read_as_dataframe(file_name):
            continue
        input_file = filepath / file_name
        df = read_as_dataframe(input_file, sheet_name)
        df.fillna("", inplace=True)
        for _, line in df.iterrows():
            line = line.to_dict()
            out_list.append(line)
    df = pd.DataFrame(out_list)
    return df


def read_as_dataframe_dict(filepath: Union[str, Path], fill_empty_str_to_na=True):
    filepath = Path(filepath)
    if filepath.is_dir():
        paths = ls(filepath, expand_all_subdir=True)
        df_dict_list = []
        for path in paths:
            try:
                df_dict = read_as_dataframe_dict(path, fill_empty_str_to_na)
            except:
                df_dict = {}
            df_dict_list.append(df_dict)
        df_dict = merge_multiple_df_dict(df_dict_list)
        return df_dict
    df_dict: Dict[str, pd.DataFrame] = pd.read_excel(filepath, sheet_name=None)
    if isinstance(df_dict, dict):
        for name, df in df_dict.items():
            if fill_empty_str_to_na:
                df.fillna("", inplace=True)
            df['数据来源'] = filepath.name
    elif isinstance(df_dict, pd.DataFrame):
        if fill_empty_str_to_na:
            df_dict.fillna("", inplace=True)
        df_dict['数据来源'] = filepath.name
    return df_dict


def df_dict_summary(df_dict: Dict[str, pd.DataFrame]):
    rows = []
    for k, df in df_dict.items():
        row = {
            "sheet_name": k,
            "length": len(df),
            "columns": str(list(df.columns)),
        }
        rows.append(row)
    df = pd.DataFrame(rows)
    return df


def save_df_dict(df_dict: Dict[str, pd.DataFrame], output_filepath: Union[str, Path]):
    if not isinstance(output_filepath, Path):
        output_filepath = Path(output_filepath)
    output_filepath.parent.mkdir(parents=True, exist_ok=True)
    with pd.ExcelWriter(output_filepath, engine="xlsxwriter") as writer:
        for k, df in df_dict.items():
            if len(k) > 31:
                logger.warning(f"表名太长,自动截断了:[{k}]的长度为{len(k)}")
            df.to_excel(writer, sheet_name=k[:31], index=False)
    return output_filepath


def save_df_from_jsonlist(jsonlist: List[Dict[str, str]], output_filepath: Union[str, Path]):
    df = pd.DataFrame(jsonlist)
    return save_df(df, output_filepath)


def save_df(df: pd.DataFrame, output_filepath: Union[str, Path]):
    if not isinstance(output_filepath, Path):
        output_filepath = Path(output_filepath)
    output_filepath.parent.mkdir(parents=True, exist_ok=True)
    df.to_excel(output_filepath, index=False)
    return output_filepath


def lazy_build_dataframe(name: str, output_filepath: Path, func, filetype: str = "xlsx"):
    logger.info(name)
    output_filepath.parent.mkdir(parents=True, exist_ok=True)
    if output_filepath.exists():
        df = read_as_dataframe(output_filepath)
    else:
        df: pd.DataFrame = func()
        filename = output_filepath.name.split(".")[0]
        if filetype == "xlsx":
            df.to_excel(output_filepath.parent / f"{filename}.xlsx", index=False)
        elif filetype == "json":
            save_json_list(dataframe_to_json_list(df), output_filepath.parent / f"{filename}.json")
        elif filetype == "jsonl":
            save_json_list(dataframe_to_json_list(df), output_filepath.parent / f"{filename}.jsonl")
        else:
            logger.warning(f"不认识的 {filetype},默认保存为 xlsx")
            df.to_excel(output_filepath.parent / f"{filename}.xlsx", index=False)
    logger.info(f"{name}结果保存在 {output_filepath}")
    return df


def lazy_build_dataframe_dict(name: str, output_filepath: Path, df_dict: Dict[str, pd.DataFrame], func, skip_sheets: List[str] = list()):
    logger.info(name)
    output_filepath.parent.mkdir(parents=True, exist_ok=True)
    if output_filepath.exists():
        new_df_dict = read_as_dataframe_dict(output_filepath)
    else:
        new_df_dict = {}
        for sheet_name, df in df_dict.items():
            if sheet_name in skip_sheets:
                continue
            df = func(sheet_name, df)
            new_df_dict[sheet_name] = df
        save_df_dict(new_df_dict, output_filepath)
    logger.info(f"{name}结果保存在 {output_filepath}")
    return new_df_dict


def merge_multiple_df_dict(list_of_df_dict: List[Dict[str, pd.DataFrame]], sort=True):
    df_dict_merged = defaultdict(list)
    for df_dict in list_of_df_dict:
        for k, df in df_dict.items():
            df_dict_merged[k].append(df)
    df_dict_merged: Dict[str, pd.DataFrame] = {k: pd.concat(v) for k, v in df_dict_merged.items()}
    if sort:
        df_dict_merged: Dict[str, pd.DataFrame] = {k: df_dict_merged[k] for k in sorted(df_dict_merged)}
    return df_dict_merged


def remove_duplicate_and_sort(df: pd.DataFrame, key_col="query", sort_by='label'):
    query_to_rows = {}
    for i, row in df.iterrows():
        query_to_rows[row[key_col]] = row
    rows = sorted(list(query_to_rows.values()), key=lambda row: row[sort_by])
    df_filtered = pd.DataFrame(rows)
    return df_filtered


def color_negative_red(x):
    color = "red" if x < 0 else ""
    return f"color: {color}"


def highlight_max(x):
    is_max = x == x.max()
    return [("background-color: yellow" if m else "") for m in is_max]


def split_dataframe(df: pd.DataFrame, output_dir: Union[str, Path], tag: str, split_count=6):
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    rows = dataframe_to_json_list(df)
    split_step = len(rows) // split_count + 1
    df_list = []
    for i in range(0, len(rows), split_step):
        filepath = output_dir / f"{tag}_{i // split_step}.xlsx"
        df_i = pd.DataFrame(rows[i:i+split_step])
        df_i.to_excel(filepath, index=False)
        df_list.append(df_i)
    return df_list
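A hypothetical usage sketch of the table helpers above, not part of the released package: it assumes the module is importable as xlin.read_as_dataframe and that the referenced local files exist; the paths are illustrative only.

from xlin.read_as_dataframe import read_as_dataframe, read_as_dataframe_dict, save_df_dict

df = read_as_dataframe("data/records.jsonl")            # json/jsonl/xlsx/xls/csv, or a whole directory
sheets = read_as_dataframe_dict("data/workbook.xlsx")   # {sheet_name: DataFrame}, with a 数据来源 column added
save_df_dict(sheets, "out/merged.xlsx")                 # one sheet per key, sheet names truncated to 31 chars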
xlin-0.1.0/xlin/statistic.py
ADDED
@@ -0,0 +1,33 @@
from typing import List

import pandas as pd



def bucket_count(length: List[int], step=50, skip_zero_count=False):
    grouped_count = []
    j = 0
    for i in range(0, max(length) + step, step):
        grouped_count.append(0)
        while j < len(length) and length[j] < i:
            grouped_count[i // step] += 1
            j += 1
    x, y = [], []
    for i, j in enumerate(grouped_count):
        if i == 0:
            continue
        if skip_zero_count and j == 0:
            continue
        print(f"[{(i-1)*step}, {i*step}) {j} {sum(grouped_count[:i+1])/len(length)*100:.2f}%")
        x.append((i - 1) * step)
        y.append(j)
    return x, y


def statistic_char_length(df: pd.DataFrame, instruction_key="instruction"):
    length = []
    for i, row in df.iterrows():
        length.append(len(row[instruction_key]))
    length.sort()
    return length
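A hypothetical usage sketch of statistic.py, not part of the released package: it assumes the module is importable as xlin.statistic, and the toy DataFrame is made up for illustration. bucket_count expects a sorted list, which statistic_char_length already provides.

import pandas as pd
from xlin.statistic import bucket_count, statistic_char_length

df = pd.DataFrame({"instruction": ["hi", "write a poem", "summarize this paragraph"]})
lengths = statistic_char_length(df)    # sorted character lengths of the instruction column
x, y = bucket_count(lengths, step=10)  # prints one histogram row per 10-char bucket, returns bucket starts and counts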
xlin-0.1.0/xlin/util.py
ADDED
@@ -0,0 +1,321 @@
from collections import defaultdict
from typing import *
from pathlib import Path
import pandas as pd
import os
import asyncio
import datetime
from loguru import logger
import shutil
import random


date_str = datetime.datetime.now().strftime("%Y%m%d")
datetime_str = datetime.datetime.now().strftime("%Y%m%d_%Hh%Mm%Ss")


def random_timestamp(start_timestamp=None, end_timestamp=None):
    if start_timestamp is None:
        start_timestamp = datetime.datetime(2024, 1, 1).timestamp()
    if end_timestamp is None:
        end_timestamp = datetime.datetime.now().timestamp()
    return random.uniform(start_timestamp, end_timestamp)


def random_timestamp_str(start_timestamp=None, end_timestamp=None, format="%Y年%m月%d日%H时%M分"):
    return datetime.datetime.fromtimestamp(random_timestamp(start_timestamp, end_timestamp)).strftime(format)


def auto_retry_to_get_data(retry_times, request, data_key="data", *args, **kwargs):
    if retry_times == 0:
        return {}
    resp = request(*args, **kwargs)
    if resp is not None:
        if data_key is None:
            return resp
        elif data_key in resp and resp[data_key] is not None:
            return resp[data_key]
    logger.debug("[error! retrying...]", resp)
    return auto_retry_to_get_data(retry_times - 1, request, data_key, *args, **kwargs)


def append_column(df: pd.DataFrame, query_column: str, output_column: str, transform):
    query = df[query_column].tolist()
    loop = asyncio.get_event_loop()
    result = loop.run_until_complete(transform(query))
    df[output_column] = [str(r) for r in result]
    return df


def request_wrapper(request_num=10):
    def request_wrapper_body(func):
        def wrapper(*args, **kwargs):
            c = request_num
            excute_num = 0
            while c > 0:
                c -= 1
                res = func(*args, **kwargs)
                excute_num += 1
                if res != "-1":
                    logger.debug("{} excute_num: {}".format(func.__name__, excute_num))
                    return res
            logger.debug("{} excute_num: {}".format(func.__name__, excute_num))
            return ""

        return wrapper

    return request_wrapper_body


def copy_file(input_filepath, output_filepath, force_overwrite=False, verbose=False):
    if verbose:
        logger.info(f"正在复制 {input_filepath} 到 {output_filepath}")
    if not isinstance(output_filepath, Path):
        output_filepath = Path(output_filepath)
    if output_filepath.exists() and not force_overwrite:
        if verbose:
            logger.warning(f"文件已存在,跳过复制:{output_filepath}")
        return output_filepath
    shutil.copy(input_filepath, output_filepath, follow_symlinks=True)
    return output_filepath


def rm(dir_path: Union[str, Path, List[str], List[Path]], filter: Callable[[Path], bool] = lambda filepath: True, expand_all_subdir=True, debug=False):
    if isinstance(dir_path, str) and "," in dir_path:
        for path in dir_path.split(","):
            rm(path, filter, expand_all_subdir)
        return
    if isinstance(dir_path, list):
        for path in dir_path:
            rm(path, filter, expand_all_subdir)
        return
    dir_path = Path(dir_path)
    if not dir_path.exists():
        if debug:
            print(f"路径不存在 {dir_path}")
        return
    if not dir_path.is_dir():
        if filter(dir_path):
            dir_path.unlink()
            if debug:
                print(f"删除文件 {dir_path}")
        return
    filenames = os.listdir(dir_path)
    for filename in sorted(filenames):
        filepath = dir_path / filename
        if debug:
            print("checking", filepath)
        if filepath.is_dir():
            paths = ls(filepath, filter, expand_all_subdir)
            if len(paths) > 0:
                rm(paths, filter, expand_all_subdir)
            child = filepath
            while child.exists() and len(os.listdir(child)) > 0:
                child = child / os.listdir(child)[0]
            while child != filepath:
                if child.exists() and len(os.listdir(child)) == 0:
                    child.rmdir()
                    if debug:
                        print(f"删除空文件夹 {child}")
                else:
                    break
            if filepath.exists() and len(os.listdir(filepath)) == 0:
                filepath.rmdir()
                if debug:
                    print(f"删除空文件夹 {filepath}")
        elif filter(filepath):
            rm(filepath, filter, expand_all_subdir)
    if dir_path.exists() and len(os.listdir(dir_path)) == 0:
        dir_path.rmdir()
        if debug:
            print(f"删除空文件夹 {dir_path}")


def cp(
    input_dir_path: Union[str, Path, List[str], List[Path]],
    output_dir_path: Union[str, Path],
    base_input_dir: Optional[Union[Path, str]] = None,
    force_overwrite: bool = False,
    filter: Callable[[Path], bool] = lambda filepath: True,
    expand_all_subdir=True,
    verbose=False,
):
    input_paths = ls(input_dir_path, filter, expand_all_subdir)
    if len(input_paths) == 0:
        if verbose:
            logger.warning(f"no files in {input_dir_path}")
        return
    if base_input_dir is None:
        # 计算最大公共路径
        if len(input_paths) > 1:
            base_input_dir = os.path.commonpath([str(p) for p in input_paths])
        else:
            base_input_dir = input_paths[0].parent
    base_input_dir = Path(base_input_dir)
    output_dir_path = Path(output_dir_path)
    for input_path in input_paths:
        relative_path = input_path.relative_to(base_input_dir)
        output_path = output_dir_path / relative_path
        output_path.parent.mkdir(parents=True, exist_ok=True)
        copy_file(input_path, output_path, force_overwrite, verbose)


def ls(dir_path: Union[str, Path, List[str], List[Path]], filter: Callable[[Path], bool] = lambda filepath: True, expand_all_subdir=True):
    """list all files, return a list of filepaths

    Args:
        dir_path (Union[str, Path]): dir
        filter ((Path) -> bool, optional): filter. Defaults to lambda filepath:True.
        expand_all_subdir (bool, optional): _description_. Defaults to True.

    Returns:
        List[Path]: not null, may be empty list []
    """
    filepaths: List[Path] = []
    if isinstance(dir_path, str) and "," in dir_path:
        for path in dir_path.split(","):
            filepaths.extend(ls(path, filter, expand_all_subdir))
        return filepaths
    if isinstance(dir_path, list):
        for path in dir_path:
            filepaths.extend(ls(path, filter, expand_all_subdir))
        return filepaths
    dir_path = Path(dir_path)
    if not dir_path.exists():
        return filepaths
    if not dir_path.is_dir():
        if filter(dir_path):
            return [dir_path]
        else:
            return filepaths
    filenames = os.listdir(dir_path)
    for filename in sorted(filenames):
        filepath = dir_path / filename
        if filepath.is_dir():
            if expand_all_subdir:
                filepaths.extend(ls(filepath, filter, expand_all_subdir))
        elif filter(filepath):
            filepaths.append(filepath)
    return filepaths


def clean_empty_folder(dir_path):
    dir_path = Path(dir_path)
    sub_names = os.listdir(dir_path)
    if not sub_names or len(sub_names) == 0:
        print(f"clean empty folder: {dir_path}")
        dir_path.rmdir()
        clean_empty_folder(dir_path.parent)
    else:
        for sub_name in sub_names:
            path = dir_path / sub_name
            if path.is_dir():
                clean_empty_folder(path)


def grouped_col_list(df: pd.DataFrame, key_col="query", value_col="output"):
    grouped = defaultdict(list)
    if key_col not in df.columns:
        logger.warning(f"`{key_col}` not in columns: {list(df.columns)}")
        return grouped
    for i, row in df.iterrows():
        grouped[row[key_col]].append(row[value_col])
    return grouped


def grouped_col(df: pd.DataFrame, key_col="query", value_col="output"):
    grouped = {}
    if key_col not in df.columns:
        logger.warning(f"`{key_col}` not in columns: {list(df.columns)}")
        return grouped
    for i, row in df.iterrows():
        grouped[row[key_col]] = row[value_col]
    return grouped


def grouped_row(df: pd.DataFrame, key_col="query"):
    grouped = defaultdict(list)
    if key_col not in df.columns:
        logger.warning(f"`{key_col}` not in columns: {list(df.columns)}")
        return grouped
    for i, row in df.iterrows():
        grouped[row[key_col]].append(row)
    return grouped


def grouped_row_in_jsonlist(jsonlist: List[Dict[str, Any]], key_col="query"):
    grouped = defaultdict(list)
    for i, row in enumerate(jsonlist):
        if key_col not in row:
            logger.warning(f"`{key_col}` not in row: {row}")
            notfound_key = f"NotFound:{key_col}"
            grouped[notfound_key].append(row)
            continue
        grouped[row[key_col]].append(row)
    return grouped


def submit_file(path: Union[str, Path], target_dir: Union[str, Path]):
    p = Path(path).absolute()
    target_dir = Path(target_dir).absolute()
    logger.info(f"正在复制到目标文件夹 {target_dir}")
    if p.is_dir():
        logger.info(f"文件夹 {p}")
        filenames = os.listdir(path)
        for filename in filenames:
            src_file = p / filename
            tgt_file = target_dir / filename
            copy_file(src_file, tgt_file)
            logger.info(f"已复制 {filename} 到 {tgt_file}")
    else:
        filename = p.name
        logger.info(f"文件 {filename}")
        src_file = p
        tgt_file = target_dir / filename
        copy_file(src_file, tgt_file)
        logger.info(f"已复制 {filename} 到 {tgt_file}")
    filenames = os.listdir(target_dir)
    logger.info("现在目标文件夹下的文件有:\n" + "\n".join(filenames))


def pretty_limited_text(text: str, limited_length: int = 300, language="zh"):
    text = str(text).strip()
    if len(text) > limited_length:
        # if language == "zh":
        #     tail = f"...(共{len(text)}字)"
        # else:
        #     tail = f"...({len(text)} words in total)"
        # return text[: limited_length - len(tail)] + tail
        return text[: limited_length // 2] + text[-limited_length // 2 :]
    return text


def bucket_count(length):
    grouped_count = []
    j = 0
    for i in range(0, max(length), 50):
        grouped_count.append(0)
        while length[j] < i:
            grouped_count[i // 50] += 1
            j += 1
    for i, j in enumerate(grouped_count):
        if i == 0 or j == 0:
            continue
        print(f"[{(i-1)*50}, {i*50}) {j} {sum(grouped_count[:i+1])/len(length)*100:.2f}%")


def sortedCounter(obj, by="key", reverse=False, return_list=False):
    c = Counter(obj)
    c_list = [(k, c[k]) for k in c]
    if by == "key":
        c_list = sorted(c_list, key=lambda x: x[0], reverse=reverse)
    elif by in ["value", "count"]:
        c_list = sorted(c_list, key=lambda x: x[1], reverse=reverse)
    else:
        raise Exception(f"unsupported by: {by}")
    c = Counter()
    for k, v in c_list:
        c[k] = v
    if return_list:
        return c, c_list
    return c
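A hypothetical usage sketch of the filesystem helpers above, not part of the released package: it assumes the module is importable as xlin.util and that a data/ directory with .jsonl files exists; the paths are illustrative only.

from xlin.util import ls, cp

# list every .jsonl file under data/, recursing into subdirectories
paths = ls("data", filter=lambda p: p.suffix == ".jsonl", expand_all_subdir=True)

# copy them to backup/, preserving the layout relative to their common parent directory
cp(paths, "backup")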
xlin-0.1.0/xlin/uuid.py
ADDED
@@ -0,0 +1,13 @@
from typing import *
import uuid

import pandas as pd


def append_uuid_column(df: pd.DataFrame, uuid_key="uuid"):
    rows = []
    for i, row in df.iterrows():
        row[uuid_key] = str(uuid.uuid4())
        rows.append(row)
    df = pd.DataFrame(rows)
    return df
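A hypothetical usage sketch of append_uuid_column, not part of the released package: it assumes the module is importable as xlin.uuid, and the toy DataFrame is made up for illustration.

import pandas as pd
from xlin.uuid import append_uuid_column

df = pd.DataFrame({"query": ["a", "b"]})
df = append_uuid_column(df)  # adds a "uuid" column with a fresh UUID4 per row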
xlin-0.1.0/xlin/xls2xlsx.py
ADDED
@@ -0,0 +1,33 @@
# pip install pyexcel pyexcel-xls pyexcel-xlsx
import os

import pyexcel as p


def convert_xls_dir_to_xlsx(data_dir):
    filenames = os.listdir(data_dir)
    for filename in filenames:
        if filename.endswith(".xls"):
            convert_xls_to_xlsx(os.path.join(data_dir, filename))

def convert_xls_to_xlsx(file_name):
    converted_filename = file_name + 'x'
    if is_xslx(file_name):
        # rename to .xlsx
        with open(file_name, 'rb') as f:
            with open(converted_filename, 'wb') as f2:
                f2.write(f.read())
        return converted_filename
    sheet = p.get_sheet(file_name=file_name)
    sheet.save_as(converted_filename)
    return converted_filename


def is_xslx(filename):
    with open(filename, 'rb') as f:
        first_four_bytes = f.read()[:4]
        return first_four_bytes == b'PK\x03\x04'

if __name__ == "__main__":
    import sys
    convert_xls_to_xlsx(sys.argv[1])
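A hypothetical usage sketch of xls2xlsx.py, not part of the released package: it assumes the module is importable as xlin.xls2xlsx and that legacy .xls files exist under data/; the paths are illustrative only.

from xlin.xls2xlsx import convert_xls_dir_to_xlsx, is_xslx

convert_xls_dir_to_xlsx("data")     # writes a .xlsx next to every .xls in the directory
print(is_xslx("data/report.xls"))   # True if the file is actually a zip-based xlsx despite its extension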