xlin 0.1.11__tar.gz → 0.1.13__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {xlin-0.1.11 → xlin-0.1.13}/PKG-INFO +4 -2
- {xlin-0.1.11 → xlin-0.1.13}/pyproject.toml +2 -1
- {xlin-0.1.11 → xlin-0.1.13}/xlin/read_as_dataframe.py +88 -57
- xlin-0.1.13/xlin/statistic.py +116 -0
- xlin-0.1.11/xlin/statistic.py +0 -33
- {xlin-0.1.11 → xlin-0.1.13}/LICENSE +0 -0
- {xlin-0.1.11 → xlin-0.1.13}/README.md +0 -0
- {xlin-0.1.11 → xlin-0.1.13}/xlin/__init__.py +0 -0
- {xlin-0.1.11 → xlin-0.1.13}/xlin/ischinese.py +0 -0
- {xlin-0.1.11 → xlin-0.1.13}/xlin/jsonl.py +0 -0
- {xlin-0.1.11 → xlin-0.1.13}/xlin/metric.py +0 -0
- {xlin-0.1.11 → xlin-0.1.13}/xlin/multiprocess_mapping.py +0 -0
- {xlin-0.1.11 → xlin-0.1.13}/xlin/util.py +0 -0
- {xlin-0.1.11 → xlin-0.1.13}/xlin/xls2xlsx.py +0 -0
- {xlin-0.1.11 → xlin-0.1.13}/xlin/yaml.py +0 -0
{xlin-0.1.11 → xlin-0.1.13}/PKG-INFO

@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.3
 Name: xlin
-Version: 0.1.11
+Version: 0.1.13
 Summary: toolbox for LinXueyuan
 License: MIT
 Author: XiChen
@@ -18,11 +18,13 @@ Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
 Requires-Dist: loguru
 Requires-Dist: pandas
 Requires-Dist: pyexcel
 Requires-Dist: pyexcel-xls
 Requires-Dist: pyexcel-xlsx
+Requires-Dist: pyyaml
 Requires-Dist: tqdm
 Requires-Dist: xlsxwriter
 Description-Content-Type: text/markdown
{xlin-0.1.11 → xlin-0.1.13}/pyproject.toml

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "xlin"
-version = "0.1.11"
+version = "0.1.13"
 description = "toolbox for LinXueyuan"
 authors = ["XiChen <23211526+LinXueyuanStdio@users.noreply.github.com>"]
 license = "MIT"
@@ -14,6 +14,7 @@ pyexcel-xls = "*"
 pyexcel-xlsx = "*"
 xlsxwriter = "*"
 tqdm = "*"
+pyyaml = "*"
 
 [build-system]
 requires = ["poetry-core"]
{xlin-0.1.11 → xlin-0.1.13}/xlin/read_as_dataframe.py

@@ -12,23 +12,23 @@ from xlin.jsonl import dataframe_to_json_list, load_json, load_json_list, save_j
 from xlin.xls2xlsx import is_xslx
 
 
-def
-
-
-
-
-
+def read_as_dataframe(
+    filepath: Union[str, Path],
+    sheet_name: Optional[str] = None,
+    fill_empty_str_to_na=True,
+    filter=lambda x: True,
+) -> pd.DataFrame:
     """
-
+    读取文件为表格。如果是文件夹,则读取文件夹下的所有文件为表格并拼接
     """
     filepath = Path(filepath)
     if filepath.is_dir():
-        paths = ls(filepath, expand_all_subdir=True)
+        paths = ls(filepath, filter=filter, expand_all_subdir=True)
         df_list = []
         for path in paths:
             try:
-                df = read_as_dataframe(path, sheet_name, fill_empty_str_to_na)
-                df[
+                df = read_as_dataframe(path, sheet_name, fill_empty_str_to_na, filter)
+                df["数据来源"] = path.name
             except:
                 df = pd.DataFrame()
             df_list.append(df)
@@ -44,56 +44,64 @@ def read_as_dataframe(filepath: Union[str, Path], sheet_name: Optional[str] = No
         json_list = load_json_list(filepath)
         df = pd.DataFrame(json_list)
     elif filename.endswith(".xlsx"):
-
+        if sheet_name is None:
+            df = pd.read_excel(filepath)
+        else:
+            df = pd.read_excel(filepath, sheet_name)
     elif filename.endswith(".xls"):
         if is_xslx(filepath):
-
+            if sheet_name is None:
+                df = pd.read_excel(filepath)
+            else:
+                df = pd.read_excel(filepath, sheet_name)
         else:
             df = pyexcel.get_sheet(file_name=filepath)
     elif filename.endswith(".csv"):
         df = pd.read_csv(filepath)
+    elif filename.endswith(".parquet"):
+        df = pd.read_parquet(filepath)
+    elif filename.endswith(".feather"):
+        df = pd.read_feather(filepath)
+    elif filename.endswith(".pkl"):
+        df = pd.read_pickle(filepath)
+    elif filename.endswith(".h5"):
+        df = pd.read_hdf(filepath)
+    elif filename.endswith(".txt"):
+        df = pd.read_csv(filepath, delimiter="\t")
+    elif filename.endswith(".tsv"):
+        df = pd.read_csv(filepath, delimiter="\t")
+    elif filename.endswith(".xml"):
+        df = pd.read_xml(filepath)
+    elif filename.endswith(".html"):
+        df = pd.read_html(filepath)[0]
+    elif filename.endswith(".db"):
+        df = pd.read_sql_table(sheet_name, filepath)
     else:
-        raise ValueError(
+        raise ValueError(
+            (
+                f"Unsupported filetype {filepath}. filetype not in \n"
+                "[json, jsonl, xlsx, xls, csv, "
+                "parquet, feather, pkl, h5, txt, "
+                "tsv, xml, html, db]"
+            )
+        )
     if fill_empty_str_to_na:
         df.fillna("", inplace=True)
     return df
 
 
-def
-
-
-
-
-    out_list = list()
-    if not isinstance(filepath, Path):
-        filepath = Path(filepath)
-    if not filepath.exists():
-        raise ValueError(f"Path Not Exist: {filepath}")
-    if not filepath.is_dir():
-        return read_as_dataframe(filepath, sheet_name)
-
-    files = os.listdir(filepath)
-    for file_name in files:
-        if not valid_to_read_as_dataframe(file_name):
-            continue
-        input_file = filepath / file_name
-        df = read_as_dataframe(input_file, sheet_name)
-        df.fillna("", inplace=True)
-        for _, line in df.iterrows():
-            line = line.to_dict()
-            out_list.append(line)
-    df = pd.DataFrame(out_list)
-    return df
-
-
-def read_as_dataframe_dict(filepath: Union[str, Path], fill_empty_str_to_na=True):
+def read_as_dataframe_dict(
+    filepath: Union[str, Path],
+    fill_empty_str_to_na=True,
+    filter=lambda x: True,
+):
     filepath = Path(filepath)
     if filepath.is_dir():
-        paths = ls(filepath, expand_all_subdir=True)
+        paths = ls(filepath, filter=filter, expand_all_subdir=True)
         df_dict_list = []
         for path in paths:
             try:
-                df_dict = read_as_dataframe_dict(path, fill_empty_str_to_na)
+                df_dict = read_as_dataframe_dict(path, fill_empty_str_to_na, filter)
             except:
                 df_dict = {}
             df_dict_list.append(df_dict)
@@ -104,11 +112,11 @@ def read_as_dataframe_dict(filepath: Union[str, Path], fill_empty_str_to_na=True
         for name, df in df_dict.items():
             if fill_empty_str_to_na:
                 df.fillna("", inplace=True)
-            df[
+            df["数据来源"] = filepath.name
     elif isinstance(df_dict, pd.DataFrame):
         if fill_empty_str_to_na:
             df_dict.fillna("", inplace=True)
-        df_dict[
+        df_dict["数据来源"] = filepath.name
     return df_dict
 
 
@@ -137,7 +145,10 @@ def save_df_dict(df_dict: Dict[str, pd.DataFrame], output_filepath: Union[str, P
     return output_filepath
 
 
-def save_df_from_jsonlist(
+def save_df_from_jsonlist(
+    jsonlist: List[Dict[str, str]],
+    output_filepath: Union[str, Path],
+):
     df = pd.DataFrame(jsonlist)
     return save_df(df, output_filepath)
 
@@ -150,7 +161,9 @@ def save_df(df: pd.DataFrame, output_filepath: Union[str, Path]):
     return output_filepath
 
 
-def lazy_build_dataframe(
+def lazy_build_dataframe(
+    name: str, output_filepath: Path, func, filetype: str = "xlsx"
+):
     logger.info(name)
     output_filepath.parent.mkdir(parents=True, exist_ok=True)
     if output_filepath.exists():
@@ -161,9 +174,13 @@ def lazy_build_dataframe(name: str, output_filepath: Path, func, filetype: str =
     if filetype == "xlsx":
         df.to_excel(output_filepath.parent / f"{filename}.xlsx", index=False)
     elif filetype == "json":
-        save_json_list(
+        save_json_list(
+            dataframe_to_json_list(df), output_filepath.parent / f"{filename}.json"
+        )
     elif filetype == "jsonl":
-        save_json_list(
+        save_json_list(
+            dataframe_to_json_list(df), output_filepath.parent / f"{filename}.jsonl"
+        )
     else:
         logger.warning(f"不认识的 {filetype},默认保存为 xlsx")
         df.to_excel(output_filepath.parent / f"{filename}.xlsx", index=False)
@@ -171,7 +188,13 @@ def lazy_build_dataframe(name: str, output_filepath: Path, func, filetype: str =
     return df
 
 
-def lazy_build_dataframe_dict(
+def lazy_build_dataframe_dict(
+    name: str,
+    output_filepath: Path,
+    df_dict: Dict[str, pd.DataFrame],
+    func: Callable[[str, pd.DataFrame], pd.DataFrame],
+    skip_sheets: List[str] = list(),
+):
     logger.info(name)
     output_filepath.parent.mkdir(parents=True, exist_ok=True)
     if output_filepath.exists():
@@ -193,13 +216,17 @@ def merge_multiple_df_dict(list_of_df_dict: List[Dict[str, pd.DataFrame]], sort=
     for df_dict in list_of_df_dict:
         for k, df in df_dict.items():
             df_dict_merged[k].append(df)
-    df_dict_merged: Dict[str, pd.DataFrame] = {
+    df_dict_merged: Dict[str, pd.DataFrame] = {
+        k: pd.concat(v) for k, v in df_dict_merged.items()
+    }
     if sort:
-        df_dict_merged: Dict[str, pd.DataFrame] = {
+        df_dict_merged: Dict[str, pd.DataFrame] = {
+            k: df_dict_merged[k] for k in sorted(df_dict_merged)
+        }
     return df_dict_merged
 
 
-def remove_duplicate_and_sort(df: pd.DataFrame, key_col="query", sort_by=
+def remove_duplicate_and_sort(df: pd.DataFrame, key_col="query", sort_by="label"):
     query_to_rows = {}
     for i, row in df.iterrows():
         query_to_rows[row[key_col]] = row
@@ -218,16 +245,20 @@ def highlight_max(x):
     return [("background-color: yellow" if m else "") for m in is_max]
 
 
-def split_dataframe(
+def split_dataframe(
+    df: pd.DataFrame,
+    output_dir: Union[str, Path],
+    output_filename_prefix: str,
+    split_count=6,
+):
     output_dir = Path(output_dir)
     output_dir.mkdir(parents=True, exist_ok=True)
     rows = dataframe_to_json_list(df)
     split_step = len(rows) // split_count + 1
    df_list = []
     for i in range(0, len(rows), split_step):
-        filepath = output_dir / f"{
-        df_i = pd.DataFrame(rows[i:i+split_step])
+        filepath = output_dir / f"{output_filename_prefix}_{i // split_step}.xlsx"
+        df_i = pd.DataFrame(rows[i : i + split_step])
         df_i.to_excel(filepath, index=False)
         df_list.append(df_i)
     return df_list
-
|
@@ -0,0 +1,116 @@
|
|
1
|
+
from typing import List
|
2
|
+
|
3
|
+
import pandas as pd
|
4
|
+
|
5
|
+
|
6
|
+
|
7
|
+
def bucket_count(length: List[int], step=50, skip_zero_count=False):
|
8
|
+
grouped_count = []
|
9
|
+
j = 0
|
10
|
+
for i in range(0, max(length) + step, step):
|
11
|
+
grouped_count.append(0)
|
12
|
+
while j < len(length) and length[j] < i:
|
13
|
+
grouped_count[i // step] += 1
|
14
|
+
j += 1
|
15
|
+
x, y = [], []
|
16
|
+
for i, j in enumerate(grouped_count):
|
17
|
+
if i == 0:
|
18
|
+
continue
|
19
|
+
if skip_zero_count and j == 0:
|
20
|
+
continue
|
21
|
+
print(f"[{(i-1)*step}, {i*step}) {j} {sum(grouped_count[:i+1])/len(length)*100:.2f}%")
|
22
|
+
x.append((i - 1) * step)
|
23
|
+
y.append(j)
|
24
|
+
return x, y
|
25
|
+
|
26
|
+
|
27
|
+
def statistic_char_length(df: pd.DataFrame, instruction_key="instruction"):
|
28
|
+
length = []
|
29
|
+
for i, row in df.iterrows():
|
30
|
+
length.append(len(row[instruction_key]))
|
31
|
+
length.sort()
|
32
|
+
return length
|
33
|
+
|
34
|
+
|
35
|
+
def statistic_token_length(df: pd.DataFrame, model_path: str, row_to_prompt: lambda row: row["prompt"]):
|
36
|
+
from transformers import AutoTokenizer
|
37
|
+
|
38
|
+
tokenizer = AutoTokenizer.from_pretrained(model_path)
|
39
|
+
lengths = []
|
40
|
+
for i, row in df.iterrows():
|
41
|
+
prompt = row_to_prompt(row)
|
42
|
+
inputs = tokenizer(prompt, return_tensors="pt")
|
43
|
+
length = inputs["input_ids"].shape[1]
|
44
|
+
lengths.append(length)
|
45
|
+
lengths.sort()
|
46
|
+
return lengths
|
47
|
+
|
48
|
+
|
49
|
+
def draw_histogram(data: list[int], bins=30, title="Data Distribution Analysis"):
|
50
|
+
import numpy as np
|
51
|
+
import matplotlib.pyplot as plt
|
52
|
+
from scipy.stats import gaussian_kde
|
53
|
+
|
54
|
+
data = np.array(data)
|
55
|
+
|
56
|
+
# 计算统计指标
|
57
|
+
mean = np.mean(data)
|
58
|
+
median = np.median(data)
|
59
|
+
std = np.std(data)
|
60
|
+
q25, q75 = np.percentile(data, [25, 75])
|
61
|
+
data_range = (np.min(data), np.max(data))
|
62
|
+
|
63
|
+
# 创建图形和坐标轴
|
64
|
+
plt.figure(figsize=(12, 7), dpi=100)
|
65
|
+
|
66
|
+
# 绘制直方图
|
67
|
+
plt.hist(data, bins=bins, density=True, alpha=0.5, color="skyblue", edgecolor="white", label="Distribution")
|
68
|
+
|
69
|
+
# 绘制核密度估计(KDE)
|
70
|
+
kde = gaussian_kde(data)
|
71
|
+
x_vals = np.linspace(data_range[0] - 1, data_range[1] + 1, 1000)
|
72
|
+
plt.plot(x_vals, kde(x_vals), color="navy", linewidth=2, label="KDE Curve")
|
73
|
+
|
74
|
+
# 添加统计线
|
75
|
+
plt.axvline(mean, color="red", linestyle="--", linewidth=2, label=f"Mean ({mean:.2f})")
|
76
|
+
plt.axvline(median, color="green", linestyle="-.", linewidth=2, label=f"Median ({median:.2f})")
|
77
|
+
plt.axvspan(mean - std, mean + std, color="orange", alpha=0.1, label=f"±1 Std.Dev ({std:.2f})")
|
78
|
+
|
79
|
+
# 添加四分位线
|
80
|
+
plt.axvline(q25, color="purple", linestyle=":", alpha=0.8, label=f"25th Percentile ({q25:.2f})")
|
81
|
+
plt.axvline(q75, color="purple", linestyle=":", alpha=0.8, label=f"75th Percentile ({q75:.2f})")
|
82
|
+
|
83
|
+
# 添加统计摘要
|
84
|
+
stats_text = f"""\
|
85
|
+
Data Range: [{data_range[0]:.2f}, {data_range[1]:.2f}]
|
86
|
+
Observations: {len(data):,}
|
87
|
+
Standard Deviation: {std:.2f}
|
88
|
+
IQR: {q75 - q25:.2f}
|
89
|
+
Skewness: {float((data - mean).mean()**3 / std**3):.4f}
|
90
|
+
Kurtosis: {float((data - mean).mean()**4 / std**4):.4f}\
|
91
|
+
"""
|
92
|
+
# 文字左对齐 align
|
93
|
+
plt.annotate(stats_text, xy=(0.99, 0.98), xycoords="axes fraction", ha="right", va="top", fontfamily="monospace", bbox=dict(boxstyle="round", facecolor="white", alpha=0.8),)
|
94
|
+
|
95
|
+
# 设置图形属性
|
96
|
+
plt.title(title, fontsize=14, pad=20)
|
97
|
+
plt.xlabel("Value", fontsize=12)
|
98
|
+
plt.ylabel("Density", fontsize=12)
|
99
|
+
plt.grid(True, linestyle="--", alpha=0.4)
|
100
|
+
plt.legend(loc="upper left", frameon=True, framealpha=0.9, shadow=True)
|
101
|
+
|
102
|
+
# 调整坐标轴范围
|
103
|
+
buffer = (data_range[1] - data_range[0]) * 0.1
|
104
|
+
plt.xlim(data_range[0] - buffer, data_range[1] + buffer)
|
105
|
+
|
106
|
+
# 显示图形
|
107
|
+
plt.tight_layout()
|
108
|
+
plt.show()
|
109
|
+
|
110
|
+
|
111
|
+
def draw_pie(numbers: List[int], title="Pie Chart of Numbers"):
|
112
|
+
import matplotlib.pyplot as plt
|
113
|
+
|
114
|
+
plt.pie(numbers, labels=[str(i) for i in range(len(numbers))], autopct='%1.1f%%')
|
115
|
+
plt.title(title)
|
116
|
+
plt.show()
|
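The new statistic module keeps the 0.1.11 helpers (`bucket_count`, `statistic_char_length`) and adds token-length statistics plus matplotlib plotting. A small sketch of how the pieces compose; the DataFrame is invented for illustration, and matplotlib/scipy (and transformers for `statistic_token_length`) are not declared as xlin dependencies, so they must already be installed:

```python
import pandas as pd

from xlin.statistic import bucket_count, draw_histogram, statistic_char_length

# Made-up instruction data; "instruction" is the default column name
# that statistic_char_length reads.
df = pd.DataFrame({"instruction": [
    "write a haiku about autumn",
    "summarize the attached report in three bullet points",
    "translate 'good morning' into French",
]})

# Sorted character lengths; bucket_count assumes an ascending list,
# which statistic_char_length already returns.
lengths = statistic_char_length(df)

# Prints one "[lo, hi) count cumulative%" line per bucket and returns
# the bucket starts and their counts.
x, y = bucket_count(lengths, step=10)

# Histogram plus KDE curve with mean/median/quartile markers.
draw_histogram(lengths, bins=10, title="Instruction length distribution")
```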
xlin-0.1.11/xlin/statistic.py
DELETED

@@ -1,33 +0,0 @@
-from typing import List
-
-import pandas as pd
-
-
-
-def bucket_count(length: List[int], step=50, skip_zero_count=False):
-    grouped_count = []
-    j = 0
-    for i in range(0, max(length) + step, step):
-        grouped_count.append(0)
-        while j < len(length) and length[j] < i:
-            grouped_count[i // step] += 1
-            j += 1
-    x, y = [], []
-    for i, j in enumerate(grouped_count):
-        if i == 0:
-            continue
-        if skip_zero_count and j == 0:
-            continue
-        print(f"[{(i-1)*step}, {i*step}) {j} {sum(grouped_count[:i+1])/len(length)*100:.2f}%")
-        x.append((i - 1) * step)
-        y.append(j)
-    return x, y
-
-
-def statistic_char_length(df: pd.DataFrame, instruction_key="instruction"):
-    length = []
-    for i, row in df.iterrows():
-        length.append(len(row[instruction_key]))
-    length.sort()
-    return length
-