xlin 0.1.14__tar.gz → 0.1.16__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
- Metadata-Version: 2.3
+ Metadata-Version: 2.1
  Name: xlin
- Version: 0.1.14
+ Version: 0.1.16
  Summary: toolbox for LinXueyuan
  License: MIT
  Author: XiChen
@@ -18,7 +18,6 @@ Classifier: Programming Language :: Python :: 3.9
  Classifier: Programming Language :: Python :: 3.10
  Classifier: Programming Language :: Python :: 3.11
  Classifier: Programming Language :: Python :: 3.12
- Classifier: Programming Language :: Python :: 3.13
  Requires-Dist: loguru
  Requires-Dist: pandas
  Requires-Dist: pyexcel
@@ -1,6 +1,6 @@
  [tool.poetry]
  name = "xlin"
- version = "0.1.14"
+ version = "0.1.16"
  description = "toolbox for LinXueyuan"
  authors = ["XiChen <23211526+LinXueyuanStdio@users.noreply.github.com>"]
  license = "MIT"
@@ -0,0 +1,266 @@
+ from typing import List
+ from collections import defaultdict
+
+ import pandas as pd
+
+
+
+ def bucket_count(length: List[int], step=50, skip_zero_count=False):
+     grouped_count = []
+     j = 0
+     for i in range(0, max(length) + step, step):
+         grouped_count.append(0)
+         while j < len(length) and length[j] < i:
+             grouped_count[i // step] += 1
+             j += 1
+     x, y = [], []
+     for i, j in enumerate(grouped_count):
+         if i == 0:
+             continue
+         if skip_zero_count and j == 0:
+             continue
+         print(f"[{(i-1)*step}, {i*step}) {j} {sum(grouped_count[:i+1])/len(length)*100:.2f}%")
+         x.append((i - 1) * step)
+         y.append(j)
+     return x, y
+
+
+ def statistic_char_length(df: pd.DataFrame, instruction_key="instruction"):
+     length = []
+     for i, row in df.iterrows():
+         length.append(len(row[instruction_key]))
+     length.sort()
+     return length
+
+
+ def statistic_token_length(df: pd.DataFrame, model_path: str, row_to_prompt=lambda row: row["prompt"]):
+     from transformers import AutoTokenizer
+
+     tokenizer = AutoTokenizer.from_pretrained(model_path)
+     lengths = []
+     for i, row in df.iterrows():
+         prompt = row_to_prompt(row)
+         inputs = tokenizer(prompt, return_tensors="pt")
+         length = inputs["input_ids"].shape[1]
+         lengths.append(length)
+     lengths.sort()
+     return lengths
+
+
+ def draw_histogram(data: list[int], bins=30, title="Data Distribution Analysis"):
+     import numpy as np
+     import matplotlib.pyplot as plt
+     from scipy.stats import gaussian_kde
+
+     data = np.array(data)
+
+     # compute the summary statistics
+     mean = np.mean(data)
+     median = np.median(data)
+     std = np.std(data)
+     q25, q75, q80, q90 = np.percentile(data, [25, 75, 80, 90])
+     data_range = (np.min(data), np.max(data))
+
+     # create the figure and axes
+     plt.figure(figsize=(12, 7), dpi=100)
+
+     # draw the histogram
+     plt.hist(data, bins=bins, density=True, alpha=0.5, color="skyblue", edgecolor="white", label="Distribution")
+
+     # draw the kernel density estimate (KDE)
+     kde = gaussian_kde(data)
+     x_vals = np.linspace(data_range[0] - 1, data_range[1] + 1, 1000)
+     plt.plot(x_vals, kde(x_vals), color="navy", linewidth=2, label="KDE Curve")
+
+     # add the statistic reference lines
+     plt.axvline(mean, color="red", linestyle="--", linewidth=2, label=f"Mean ({mean:.2f})")
+     plt.axvline(median, color="green", linestyle="-.", linewidth=2, label=f"Median ({median:.2f})")
+     plt.axvspan(mean - std, mean + std, color="orange", alpha=0.1, label=f"±1 Std.Dev ({std:.2f})")
+
+     # add the percentile lines
+     plt.axvline(q25, color="purple", linestyle=":", alpha=0.8, label=f"25th Percentile ({q25:.2f})")
+     plt.axvline(q75, color="purple", linestyle=":", alpha=0.8, label=f"75th Percentile ({q75:.2f})")
+     plt.axvline(q80, color="purple", linestyle=":", alpha=0.8, label=f"80th Percentile ({q80:.2f})")
+     plt.axvline(q90, color="purple", linestyle=":", alpha=0.8, label=f"90th Percentile ({q90:.2f})")
+
+     # add the statistics summary box
+     stats_text = f"""\
+ Data Range: [{data_range[0]:.2f}, {data_range[1]:.2f}]
+ Observations: {len(data):,}
+ Standard Deviation: {std:.2f}
+ IQR: {q75 - q25:.2f}
+ Skewness: {float(((data - mean)**3).mean() / std**3):.4f}
+ Kurtosis: {float(((data - mean)**4).mean() / std**4):.4f}\
+ """
+     # align the text to the left
+     plt.annotate(stats_text, xy=(0.99, 0.98), xycoords="axes fraction", ha="right", va="top", fontfamily="monospace", bbox=dict(boxstyle="round", facecolor="white", alpha=0.8),)
+
+     # set the figure properties
+     plt.title(title, fontsize=14, pad=20)
+     plt.xlabel("Value", fontsize=12)
+     plt.ylabel("Density", fontsize=12)
+     plt.grid(True, linestyle="--", alpha=0.4)
+     plt.legend(loc="upper left", frameon=True, framealpha=0.9, shadow=True)
+
+     # pad the x-axis range
+     buffer = (data_range[1] - data_range[0]) * 0.1
+     plt.xlim(data_range[0] - buffer, data_range[1] + buffer)
+
+     # show the figure
+     plt.tight_layout()
+     plt.show()
+
+
+ def draw_pie(numbers: List[int], title="Pie Chart of Numbers"):
+     import matplotlib.pyplot as plt
+
+     plt.pie(numbers, labels=[str(i) for i in range(len(numbers))], autopct='%1.1f%%')
+     plt.title(title)
+     plt.show()
+
+
+ def generate_classification_report(predictions: List[str], labels: List[str]) -> dict:
+     """
+     Build a dict of detailed evaluation results: accuracy, confusion matrix, classification report, and more.
+
+     Args:
+         predictions: list of model predictions
+         labels: list of ground-truth labels
+
+     Returns:
+         A dict with the following structure:
+         - accuracy: overall accuracy
+         - confusion_matrix: confusion matrix DataFrame
+         - class_report: classification report DataFrame
+         - error_analysis: error-sample analysis DataFrame
+         - total_samples: total number of samples
+         - time_generated: report generation time
+     """
+     # basic validation
+     assert len(predictions) == len(labels), "predictions and labels differ in length"
+
+     # initialize the report dict
+     report = {}
+
+     # collect the unique classes
+     classes = sorted(list(set(labels)))
+     error_label = "out_of_class"
+     extend_classes = classes + [error_label]
+
+     # compute the base counts
+     total = len(labels)
+     correct = sum(p == l for p, l in zip(predictions, labels))
+
+     # 1. accuracy
+     report["accuracy"] = correct / total
+
+     # 2. confusion matrix
+     confusion = defaultdict(int)
+     for true_label, pred_label in zip(labels, predictions):
+         if pred_label not in classes:
+             pred_label = error_label
+         confusion[(true_label, pred_label)] += 1
+
+     confusion_matrix = pd.DataFrame(index=extend_classes, columns=extend_classes, data=0)
+     for (true, pred), count in confusion.items():
+         confusion_matrix.loc[true, pred] = count
+
+     # 3. per-class report
+     micro_tp = 0
+     micro_fp = 0
+     micro_fn = 0
+     class_stats = []
+     for cls in extend_classes:
+         tp = confusion[(cls, cls)]
+         fp = sum(confusion[(other, cls)] for other in extend_classes if other != cls)
+         fn = sum(confusion[(cls, other)] for other in extend_classes if other != cls)
+         micro_tp += tp
+         micro_fp += fp
+         micro_fn += fn
+
+         precision = tp / (tp + fp) if (tp + fp) > 0 else 0
+         recall = tp / (tp + fn) if (tp + fn) > 0 else 0
+         f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
+
+         class_stats.append(
+             {
+                 "class": cls,
+                 "precision": precision,
+                 "recall": recall,
+                 "f1_score": f1,
+                 "support": sum(confusion[(cls, other)] for other in extend_classes),
+             },
+         )
+
+     # add the aggregate statistics
+     class_df = pd.DataFrame(class_stats)
+     report["class_report"] = class_df
+     confusion_matrix["recall"] = class_df["recall"].values.tolist()
+     p = class_df["precision"].values.tolist() + [None]
+     tail = pd.DataFrame([p], index=["precision"], columns=confusion_matrix.columns)
+     confusion_matrix = pd.concat([confusion_matrix, tail], axis=0)
+     confusion_matrix.index.name = "True \\ Label"
+     report["confusion_matrix"] = confusion_matrix
+
+     micro_precision = micro_tp / (micro_tp + micro_fp) if (micro_tp + micro_fp) > 0 else 0
+     micro_recall = micro_tp / (micro_tp + micro_fn) if (micro_tp + micro_fn) > 0 else 0
+     micro_f1 = 2 * (micro_precision * micro_recall) / (micro_precision + micro_recall) if (micro_precision + micro_recall) > 0 else 0
+     report["micro_stats"] = {
+         "precision": micro_precision,
+         "recall": micro_recall,
+         "f1_score": micro_f1,
+     }
+     report["macro_stats"] = {
+         "precision": class_df[class_df["class"] != error_label]["precision"].mean(),
+         "recall": class_df[class_df["class"] != error_label]["recall"].mean(),
+         "f1_score": class_df[class_df["class"] != error_label]["f1_score"].mean(),
+     }
+
+     # 4. metadata
+     import datetime
+     report["total_samples"] = total
+     report["time_generated"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+
+     return report
+
+
+ def print_classification_report(predictions: List[str], labels: List[str]):
+     """
+     Print the report contents.
+     """
+     report = generate_classification_report(predictions, labels)
+     print(f"Accuracy: {report['accuracy']:.2%}")
+     print(f"Total samples: {report['total_samples']}, generated at: {report['time_generated']}")
+     print()
+     # print the micro-averaged statistics
+     print("=== Micro Statistics ===")
+     micro_stats = report["micro_stats"]
+     print(f"Precision: {micro_stats['precision']:.2%}")
+     print(f"Recall: {micro_stats['recall']:.2%}")
+     print(f"F1 score: {micro_stats['f1_score']:.2%}")
+     print()
+     # print the macro-averaged statistics
+     print("=== Macro Statistics ===")
+     macro_stats = report["macro_stats"]
+     print(f"Precision: {macro_stats['precision']:.2%}")
+     print(f"Recall: {macro_stats['recall']:.2%}")
+     print(f"F1 score: {macro_stats['f1_score']:.2%}")
+     print()
+
+     # print the confusion matrix
+     print("=== Confusion Matrix ===")
+     print(report["confusion_matrix"])
+     print()
+
+     # print the per-class report
+     print("=== Classification Report ===")
+     print(report["class_report"])
+     print()
+
+
+ if __name__ == "__main__":
+     # example data
+     preds = ["cat", "dog", "cat", "dog", "extra1", "extra2"]
+     truth = ["cat", "cat", "dog", "dog", "dog", "dog"]
+
+     print_classification_report(preds, truth)
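
A property of the micro-averaged statistics computed above, worth knowing when reading the report: every sample contributes exactly one prediction and one label, so each correct sample adds one true positive, while each error adds one false positive (to the predicted class) and one false negative (to the true class). Micro precision, micro recall, and micro F1 therefore all collapse to plain accuracy. A minimal standalone check of that identity (illustrative, not part of the package):

    # Mirrors the micro-averaging in generate_classification_report:
    # micro_tp counts correct samples; micro_fp and micro_fn both count errors.
    preds = ["cat", "dog", "cat", "dog", "extra1", "extra2"]
    truth = ["cat", "cat", "dog", "dog", "dog", "dog"]

    correct = sum(p == t for p, t in zip(preds, truth))
    wrong = len(truth) - correct
    micro_precision = correct / (correct + wrong)  # TP / (TP + FP)
    micro_recall = correct / (correct + wrong)     # TP / (TP + FN)
    accuracy = correct / len(truth)
    assert micro_precision == micro_recall == accuracy  # 2/6 ≈ 33.33%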
@@ -103,32 +103,12 @@ def rm(dir_path: Union[str, Path, List[str], List[Path]], filter: Callable[[Path
      filenames = os.listdir(dir_path)
      for filename in sorted(filenames):
          filepath = dir_path / filename
-         if debug:
-             print("checking", filepath)
-         if filepath.is_dir():
-             paths = ls(filepath, filter, expand_all_subdir)
-             if len(paths) > 0:
-                 rm(paths, filter, expand_all_subdir)
-             child = filepath
-             while child.exists() and len(os.listdir(child)) > 0:
-                 child = child / os.listdir(child)[0]
-             while child != filepath:
-                 if child.exists() and len(os.listdir(child)) == 0:
-                     child.rmdir()
-                     if debug:
-                         print(f"Removed empty directory {child}")
-                 else:
-                     break
-             if filepath.exists() and len(os.listdir(filepath)) == 0:
-                 filepath.rmdir()
-                 if debug:
-                     print(f"Removed empty directory {filepath}")
-         elif filter(filepath):
-             rm(filepath, filter, expand_all_subdir)
-     if dir_path.exists() and len(os.listdir(dir_path)) == 0:
-         dir_path.rmdir()
-         if debug:
-             print(f"Removed empty directory {dir_path}")
+         rm(filepath, filter, expand_all_subdir, debug)
+     if dir_path.exists() and dir_path.is_dir() and len(os.listdir(dir_path)) == 0:
+         if filter(dir_path):
+             dir_path.rmdir()
+             if debug:
+                 print(f"Removed empty directory {dir_path}")


  def cp(
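
The rewrite above collapses the old hand-rolled, bottom-up directory pruning into a single recursion: rm is called on every child first, and the directory itself is only removed once it is empty and the filter selects it. The file-handling branch of rm sits outside this hunk, so here is a minimal standalone sketch of just the pruning contract shown (names are illustrative, not the package API):

    from pathlib import Path
    from typing import Callable

    def prune_empty_dirs(path: Path, selected: Callable[[Path], bool]) -> None:
        # Depth-first: visit all children before deciding about the parent,
        # then remove the directory only if it ended up empty and is selected.
        if path.is_dir():
            for child in sorted(path.iterdir()):
                prune_empty_dirs(child, selected)
            if not any(path.iterdir()) and selected(path):
                path.rmdir()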
@@ -1,118 +0,0 @@
- from typing import List
-
- import pandas as pd
-
-
-
- def bucket_count(length: List[int], step=50, skip_zero_count=False):
-     grouped_count = []
-     j = 0
-     for i in range(0, max(length) + step, step):
-         grouped_count.append(0)
-         while j < len(length) and length[j] < i:
-             grouped_count[i // step] += 1
-             j += 1
-     x, y = [], []
-     for i, j in enumerate(grouped_count):
-         if i == 0:
-             continue
-         if skip_zero_count and j == 0:
-             continue
-         print(f"[{(i-1)*step}, {i*step}) {j} {sum(grouped_count[:i+1])/len(length)*100:.2f}%")
-         x.append((i - 1) * step)
-         y.append(j)
-     return x, y
-
-
- def statistic_char_length(df: pd.DataFrame, instruction_key="instruction"):
-     length = []
-     for i, row in df.iterrows():
-         length.append(len(row[instruction_key]))
-     length.sort()
-     return length
-
-
- def statistic_token_length(df: pd.DataFrame, model_path: str, row_to_prompt=lambda row: row["prompt"]):
-     from transformers import AutoTokenizer
-
-     tokenizer = AutoTokenizer.from_pretrained(model_path)
-     lengths = []
-     for i, row in df.iterrows():
-         prompt = row_to_prompt(row)
-         inputs = tokenizer(prompt, return_tensors="pt")
-         length = inputs["input_ids"].shape[1]
-         lengths.append(length)
-     lengths.sort()
-     return lengths
-
-
- def draw_histogram(data: list[int], bins=30, title="Data Distribution Analysis"):
-     import numpy as np
-     import matplotlib.pyplot as plt
-     from scipy.stats import gaussian_kde
-
-     data = np.array(data)
-
-     # compute the summary statistics
-     mean = np.mean(data)
-     median = np.median(data)
-     std = np.std(data)
-     q25, q75, q80, q90 = np.percentile(data, [25, 75, 80, 90])
-     data_range = (np.min(data), np.max(data))
-
-     # create the figure and axes
-     plt.figure(figsize=(12, 7), dpi=100)
-
-     # draw the histogram
-     plt.hist(data, bins=bins, density=True, alpha=0.5, color="skyblue", edgecolor="white", label="Distribution")
-
-     # draw the kernel density estimate (KDE)
-     kde = gaussian_kde(data)
-     x_vals = np.linspace(data_range[0] - 1, data_range[1] + 1, 1000)
-     plt.plot(x_vals, kde(x_vals), color="navy", linewidth=2, label="KDE Curve")
-
-     # add the statistic reference lines
-     plt.axvline(mean, color="red", linestyle="--", linewidth=2, label=f"Mean ({mean:.2f})")
-     plt.axvline(median, color="green", linestyle="-.", linewidth=2, label=f"Median ({median:.2f})")
-     plt.axvspan(mean - std, mean + std, color="orange", alpha=0.1, label=f"±1 Std.Dev ({std:.2f})")
-
-     # add the percentile lines
-     plt.axvline(q25, color="purple", linestyle=":", alpha=0.8, label=f"25th Percentile ({q25:.2f})")
-     plt.axvline(q75, color="purple", linestyle=":", alpha=0.8, label=f"75th Percentile ({q75:.2f})")
-     plt.axvline(q80, color="purple", linestyle=":", alpha=0.8, label=f"80th Percentile ({q80:.2f})")
-     plt.axvline(q90, color="purple", linestyle=":", alpha=0.8, label=f"90th Percentile ({q90:.2f})")
-
-     # add the statistics summary box
-     stats_text = f"""\
- Data Range: [{data_range[0]:.2f}, {data_range[1]:.2f}]
- Observations: {len(data):,}
- Standard Deviation: {std:.2f}
- IQR: {q75 - q25:.2f}
- Skewness: {float(((data - mean)**3).mean() / std**3):.4f}
- Kurtosis: {float(((data - mean)**4).mean() / std**4):.4f}\
- """
-     # align the text to the left
-     plt.annotate(stats_text, xy=(0.99, 0.98), xycoords="axes fraction", ha="right", va="top", fontfamily="monospace", bbox=dict(boxstyle="round", facecolor="white", alpha=0.8),)
-
-     # set the figure properties
-     plt.title(title, fontsize=14, pad=20)
-     plt.xlabel("Value", fontsize=12)
-     plt.ylabel("Density", fontsize=12)
-     plt.grid(True, linestyle="--", alpha=0.4)
-     plt.legend(loc="upper left", frameon=True, framealpha=0.9, shadow=True)
-
-     # pad the x-axis range
-     buffer = (data_range[1] - data_range[0]) * 0.1
-     plt.xlim(data_range[0] - buffer, data_range[1] + buffer)
-
-     # show the figure
-     plt.tight_layout()
-     plt.show()
-
-
- def draw_pie(numbers: List[int], title="Pie Chart of Numbers"):
-     import matplotlib.pyplot as plt
-
-     plt.pie(numbers, labels=[str(i) for i in range(len(numbers))], autopct='%1.1f%%')
-     plt.title(title)
-     plt.show()
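
Both the deleted 0.1.14 module and its 266-line replacement above expose bucket_count, which assumes its input is already sorted ascending (as statistic_char_length and statistic_token_length return it): the cursor j only ever moves forward, so unsorted input silently lands in the wrong buckets. A minimal usage sketch (the module's import path is not shown in this diff and is assumed here):

    from xlin.statistic import bucket_count  # hypothetical import path

    lengths = sorted([12, 48, 60, 75, 120, 130, 260])  # must be sorted ascending
    x, y = bucket_count(lengths, step=50, skip_zero_count=True)
    # Prints one "[lo, hi) count cumulative%" row per non-empty bucket and
    # returns the bucket left edges with their counts:
    # x == [0, 50, 100, 250], y == [2, 2, 2, 1]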