xlin-0.1.11-py2.py3-none-any.whl → xlin-0.1.13-py2.py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
xlin/read_as_dataframe.py CHANGED
@@ -12,23 +12,23 @@ from xlin.jsonl import dataframe_to_json_list, load_json, load_json_list, save_j
 from xlin.xls2xlsx import is_xslx
 
 
-def valid_to_read_as_dataframe(filename: str) -> bool:
-    suffix_list = [".json", ".jsonl", ".xlsx", "xls", ".csv"]
-    return any([filename.endswith(suffix) for suffix in suffix_list])
-
-
-def read_as_dataframe(filepath: Union[str, Path], sheet_name: Optional[str] = None, fill_empty_str_to_na=True) -> pd.DataFrame:
+def read_as_dataframe(
+    filepath: Union[str, Path],
+    sheet_name: Optional[str] = None,
+    fill_empty_str_to_na=True,
+    filter=lambda x: True,
+) -> pd.DataFrame:
     """
-    Read the file as a DataFrame
+    Read the file as a DataFrame; if it is a directory, read every file under it as a DataFrame and concatenate the results
     """
     filepath = Path(filepath)
     if filepath.is_dir():
-        paths = ls(filepath, expand_all_subdir=True)
+        paths = ls(filepath, filter=filter, expand_all_subdir=True)
         df_list = []
         for path in paths:
             try:
-                df = read_as_dataframe(path, sheet_name, fill_empty_str_to_na)
-                df['数据来源'] = path.name
+                df = read_as_dataframe(path, sheet_name, fill_empty_str_to_na, filter)
+                df["数据来源"] = path.name
             except:
                 df = pd.DataFrame()
             df_list.append(df)
@@ -44,56 +44,64 @@ def read_as_dataframe(filepath: Union[str, Path], sheet_name: Optional[str] = No
         json_list = load_json_list(filepath)
         df = pd.DataFrame(json_list)
     elif filename.endswith(".xlsx"):
-        df = pd.read_excel(filepath) if sheet_name is None else pd.read_excel(filepath, sheet_name)
+        if sheet_name is None:
+            df = pd.read_excel(filepath)
+        else:
+            df = pd.read_excel(filepath, sheet_name)
     elif filename.endswith(".xls"):
         if is_xslx(filepath):
-            df = pd.read_excel(filepath) if sheet_name is None else pd.read_excel(filepath, sheet_name)
+            if sheet_name is None:
+                df = pd.read_excel(filepath)
+            else:
+                df = pd.read_excel(filepath, sheet_name)
         else:
            df = pyexcel.get_sheet(file_name=filepath)
     elif filename.endswith(".csv"):
         df = pd.read_csv(filepath)
+    elif filename.endswith(".parquet"):
+        df = pd.read_parquet(filepath)
+    elif filename.endswith(".feather"):
+        df = pd.read_feather(filepath)
+    elif filename.endswith(".pkl"):
+        df = pd.read_pickle(filepath)
+    elif filename.endswith(".h5"):
+        df = pd.read_hdf(filepath)
+    elif filename.endswith(".txt"):
+        df = pd.read_csv(filepath, delimiter="\t")
+    elif filename.endswith(".tsv"):
+        df = pd.read_csv(filepath, delimiter="\t")
+    elif filename.endswith(".xml"):
+        df = pd.read_xml(filepath)
+    elif filename.endswith(".html"):
+        df = pd.read_html(filepath)[0]
+    elif filename.endswith(".db"):
+        df = pd.read_sql_table(sheet_name, filepath)
     else:
-        raise ValueError(f"Unsupported filetype {filepath}. filetype not in [json, jsonl, xlsx, xls, csv]")
+        raise ValueError(
+            (
+                f"Unsupported filetype {filepath}. filetype not in \n"
+                "[json, jsonl, xlsx, xls, csv, "
+                "parquet, feather, pkl, h5, txt, "
+                "tsv, xml, html, db]"
+            )
+        )
     if fill_empty_str_to_na:
         df.fillna("", inplace=True)
     return df
 
 
-def read_maybe_dir_as_dataframe(filepath: Union[str, Path], sheet_name: Optional[str] = None) -> pd.DataFrame:
-    """
-    The input path may be a directory; in that case concatenate all tables under it (all headers must match) and return the result.
-    Otherwise it is a file; try to read it directly as a DataFrame.
-    """
-    out_list = list()
-    if not isinstance(filepath, Path):
-        filepath = Path(filepath)
-    if not filepath.exists():
-        raise ValueError(f"Path Not Exist: {filepath}")
-    if not filepath.is_dir():
-        return read_as_dataframe(filepath, sheet_name)
-
-    files = os.listdir(filepath)
-    for file_name in files:
-        if not valid_to_read_as_dataframe(file_name):
-            continue
-        input_file = filepath / file_name
-        df = read_as_dataframe(input_file, sheet_name)
-        df.fillna("", inplace=True)
-        for _, line in df.iterrows():
-            line = line.to_dict()
-            out_list.append(line)
-    df = pd.DataFrame(out_list)
-    return df
-
-
-def read_as_dataframe_dict(filepath: Union[str, Path], fill_empty_str_to_na=True):
+def read_as_dataframe_dict(
+    filepath: Union[str, Path],
+    fill_empty_str_to_na=True,
+    filter=lambda x: True,
+):
     filepath = Path(filepath)
     if filepath.is_dir():
-        paths = ls(filepath, expand_all_subdir=True)
+        paths = ls(filepath, filter=filter, expand_all_subdir=True)
         df_dict_list = []
         for path in paths:
             try:
-                df_dict = read_as_dataframe_dict(path, fill_empty_str_to_na)
+                df_dict = read_as_dataframe_dict(path, fill_empty_str_to_na, filter)
             except:
                 df_dict = {}
             df_dict_list.append(df_dict)
@@ -104,11 +112,11 @@ def read_as_dataframe_dict(filepath: Union[str, Path], fill_empty_str_to_na=True
         for name, df in df_dict.items():
             if fill_empty_str_to_na:
                 df.fillna("", inplace=True)
-            df['数据来源'] = filepath.name
+            df["数据来源"] = filepath.name
     elif isinstance(df_dict, pd.DataFrame):
         if fill_empty_str_to_na:
             df_dict.fillna("", inplace=True)
-        df_dict['数据来源'] = filepath.name
+        df_dict["数据来源"] = filepath.name
     return df_dict
 
 
@@ -137,7 +145,10 @@ def save_df_dict(df_dict: Dict[str, pd.DataFrame], output_filepath: Union[str, P
     return output_filepath
 
 
-def save_df_from_jsonlist(jsonlist: List[Dict[str, str]], output_filepath: Union[str, Path]):
+def save_df_from_jsonlist(
+    jsonlist: List[Dict[str, str]],
+    output_filepath: Union[str, Path],
+):
     df = pd.DataFrame(jsonlist)
     return save_df(df, output_filepath)
 
@@ -150,7 +161,9 @@ def save_df(df: pd.DataFrame, output_filepath: Union[str, Path]):
     return output_filepath
 
 
-def lazy_build_dataframe(name: str, output_filepath: Path, func, filetype: str = "xlsx"):
+def lazy_build_dataframe(
+    name: str, output_filepath: Path, func, filetype: str = "xlsx"
+):
     logger.info(name)
     output_filepath.parent.mkdir(parents=True, exist_ok=True)
     if output_filepath.exists():
@@ -161,9 +174,13 @@ def lazy_build_dataframe(name: str, output_filepath: Path, func, filetype: str =
     if filetype == "xlsx":
         df.to_excel(output_filepath.parent / f"{filename}.xlsx", index=False)
     elif filetype == "json":
-        save_json_list(dataframe_to_json_list(df), output_filepath.parent / f"{filename}.json")
+        save_json_list(
+            dataframe_to_json_list(df), output_filepath.parent / f"{filename}.json"
+        )
     elif filetype == "jsonl":
-        save_json_list(dataframe_to_json_list(df), output_filepath.parent / f"{filename}.jsonl")
+        save_json_list(
+            dataframe_to_json_list(df), output_filepath.parent / f"{filename}.jsonl"
+        )
     else:
         logger.warning(f"Unrecognized filetype {filetype}; saving as xlsx by default")
         df.to_excel(output_filepath.parent / f"{filename}.xlsx", index=False)
@@ -171,7 +188,13 @@ def lazy_build_dataframe(name: str, output_filepath: Path, func, filetype: str =
     return df
 
 
-def lazy_build_dataframe_dict(name: str, output_filepath: Path, df_dict: Dict[str, pd.DataFrame], func, skip_sheets: List[str] = list()):
+def lazy_build_dataframe_dict(
+    name: str,
+    output_filepath: Path,
+    df_dict: Dict[str, pd.DataFrame],
+    func: Callable[[str, pd.DataFrame], pd.DataFrame],
+    skip_sheets: List[str] = list(),
+):
     logger.info(name)
     output_filepath.parent.mkdir(parents=True, exist_ok=True)
     if output_filepath.exists():
@@ -193,13 +216,17 @@ def merge_multiple_df_dict(list_of_df_dict: List[Dict[str, pd.DataFrame]], sort=
     for df_dict in list_of_df_dict:
         for k, df in df_dict.items():
             df_dict_merged[k].append(df)
-    df_dict_merged: Dict[str, pd.DataFrame] = {k: pd.concat(v) for k, v in df_dict_merged.items()}
+    df_dict_merged: Dict[str, pd.DataFrame] = {
+        k: pd.concat(v) for k, v in df_dict_merged.items()
+    }
     if sort:
-        df_dict_merged: Dict[str, pd.DataFrame] = {k: df_dict_merged[k] for k in sorted(df_dict_merged)}
+        df_dict_merged: Dict[str, pd.DataFrame] = {
+            k: df_dict_merged[k] for k in sorted(df_dict_merged)
+        }
     return df_dict_merged
 
 
-def remove_duplicate_and_sort(df: pd.DataFrame, key_col="query", sort_by='label'):
+def remove_duplicate_and_sort(df: pd.DataFrame, key_col="query", sort_by="label"):
     query_to_rows = {}
     for i, row in df.iterrows():
         query_to_rows[row[key_col]] = row
@@ -218,16 +245,20 @@ def highlight_max(x):
     return [("background-color: yellow" if m else "") for m in is_max]
 
 
-def split_dataframe(df: pd.DataFrame, output_dir: Union[str, Path], tag: str, split_count=6):
+def split_dataframe(
+    df: pd.DataFrame,
+    output_dir: Union[str, Path],
+    output_filename_prefix: str,
+    split_count=6,
+):
     output_dir = Path(output_dir)
     output_dir.mkdir(parents=True, exist_ok=True)
     rows = dataframe_to_json_list(df)
     split_step = len(rows) // split_count + 1
     df_list = []
     for i in range(0, len(rows), split_step):
-        filepath = output_dir / f"{tag}_{i // split_step}.xlsx"
-        df_i = pd.DataFrame(rows[i:i+split_step])
+        filepath = output_dir / f"{output_filename_prefix}_{i // split_step}.xlsx"
+        df_i = pd.DataFrame(rows[i : i + split_step])
         df_i.to_excel(filepath, index=False)
         df_list.append(df_i)
     return df_list
-
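
Taken together, these changes fold `valid_to_read_as_dataframe` and `read_maybe_dir_as_dataframe` into a single recursive `read_as_dataframe` that accepts a `filter` callable (forwarded to `ls` when the path is a directory and propagated through recursive calls) and adds loaders for parquet, feather, pickle, HDF5, txt, tsv, XML, HTML, and .db files. A minimal usage sketch of the new signature follows; the file paths are hypothetical, and the assumption that `filter` is called on `Path` objects is inferred from how `ls` is used here, not from documentation:

    from pathlib import Path

    from xlin.read_as_dataframe import read_as_dataframe

    # Single file: the suffix selects the loader (.jsonl here).
    df = read_as_dataframe("data/samples.jsonl")

    # Directory: every file accepted by `filter` is read and concatenated, and
    # each row's origin file is recorded in the "数据来源" ("data source") column.
    df_all = read_as_dataframe(
        Path("data"),
        filter=lambda p: str(p).endswith((".jsonl", ".parquet")),
    )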
xlin/statistic.py CHANGED
@@ -31,3 +31,86 @@ def statistic_char_length(df: pd.DataFrame, instruction_key="instruction"):
     length.sort()
     return length
 
+
+def statistic_token_length(df: pd.DataFrame, model_path: str, row_to_prompt: lambda row: row["prompt"]):
+    from transformers import AutoTokenizer
+
+    tokenizer = AutoTokenizer.from_pretrained(model_path)
+    lengths = []
+    for i, row in df.iterrows():
+        prompt = row_to_prompt(row)
+        inputs = tokenizer(prompt, return_tensors="pt")
+        length = inputs["input_ids"].shape[1]
+        lengths.append(length)
+    lengths.sort()
+    return lengths
+
+
+def draw_histogram(data: list[int], bins=30, title="Data Distribution Analysis"):
+    import numpy as np
+    import matplotlib.pyplot as plt
+    from scipy.stats import gaussian_kde
+
+    data = np.array(data)
+
+    # Compute summary statistics
+    mean = np.mean(data)
+    median = np.median(data)
+    std = np.std(data)
+    q25, q75 = np.percentile(data, [25, 75])
+    data_range = (np.min(data), np.max(data))
+
+    # Create the figure and axes
+    plt.figure(figsize=(12, 7), dpi=100)
+
+    # Draw the histogram
+    plt.hist(data, bins=bins, density=True, alpha=0.5, color="skyblue", edgecolor="white", label="Distribution")
+
+    # Draw the kernel density estimate (KDE)
+    kde = gaussian_kde(data)
+    x_vals = np.linspace(data_range[0] - 1, data_range[1] + 1, 1000)
+    plt.plot(x_vals, kde(x_vals), color="navy", linewidth=2, label="KDE Curve")
+
+    # Add statistic lines
+    plt.axvline(mean, color="red", linestyle="--", linewidth=2, label=f"Mean ({mean:.2f})")
+    plt.axvline(median, color="green", linestyle="-.", linewidth=2, label=f"Median ({median:.2f})")
+    plt.axvspan(mean - std, mean + std, color="orange", alpha=0.1, label=f"±1 Std.Dev ({std:.2f})")
+
+    # Add quartile lines
+    plt.axvline(q25, color="purple", linestyle=":", alpha=0.8, label=f"25th Percentile ({q25:.2f})")
+    plt.axvline(q75, color="purple", linestyle=":", alpha=0.8, label=f"75th Percentile ({q75:.2f})")
+
+    # Add the statistics summary
+    stats_text = f"""\
+Data Range: [{data_range[0]:.2f}, {data_range[1]:.2f}]
+Observations: {len(data):,}
+Standard Deviation: {std:.2f}
+IQR: {q75 - q25:.2f}
+Skewness: {float((data - mean).mean()**3 / std**3):.4f}
+Kurtosis: {float((data - mean).mean()**4 / std**4):.4f}\
+"""
+    # Left-align the text
+    plt.annotate(stats_text, xy=(0.99, 0.98), xycoords="axes fraction", ha="right", va="top", fontfamily="monospace", bbox=dict(boxstyle="round", facecolor="white", alpha=0.8),)
+
+    # Set figure properties
+    plt.title(title, fontsize=14, pad=20)
+    plt.xlabel("Value", fontsize=12)
+    plt.ylabel("Density", fontsize=12)
+    plt.grid(True, linestyle="--", alpha=0.4)
+    plt.legend(loc="upper left", frameon=True, framealpha=0.9, shadow=True)
+
+    # Adjust the axis range
+    buffer = (data_range[1] - data_range[0]) * 0.1
+    plt.xlim(data_range[0] - buffer, data_range[1] + buffer)
+
+    # Show the figure
+    plt.tight_layout()
+    plt.show()
+
+
+def draw_pie(numbers: List[int], title="Pie Chart of Numbers"):
+    import matplotlib.pyplot as plt
+
+    plt.pie(numbers, labels=[str(i) for i in range(len(numbers))], autopct='%1.1f%%')
+    plt.title(title)
+    plt.show()
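
The new `statistic_token_length` tokenizes each row with a Hugging Face `AutoTokenizer` and returns the sorted token counts, which feed directly into `draw_histogram`. Note that `row_to_prompt` has no default value: the lambda in its signature is written as an annotation, not a default, so callers must pass the row-to-text mapping themselves. A short sketch; the model path and DataFrame contents below are placeholders:

    import pandas as pd

    from xlin.statistic import draw_histogram, statistic_token_length

    df = pd.DataFrame({"prompt": [
        "a short prompt",
        "a somewhat longer prompt",
        "the longest prompt of the three examples",
    ]})

    # Placeholder model path: anything AutoTokenizer.from_pretrained accepts.
    lengths = statistic_token_length(df, "path/to/tokenizer", lambda row: row["prompt"])

    # Plot the distribution with a KDE curve, mean/median markers, and quartile lines.
    draw_histogram(lengths, bins=20, title="Prompt Token Length Distribution")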
{xlin-0.1.11.dist-info → xlin-0.1.13.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.3
 Name: xlin
-Version: 0.1.11
+Version: 0.1.13
 Summary: toolbox for LinXueyuan
 License: MIT
 Author: XiChen
@@ -18,11 +18,13 @@ Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
 Requires-Dist: loguru
 Requires-Dist: pandas
 Requires-Dist: pyexcel
 Requires-Dist: pyexcel-xls
 Requires-Dist: pyexcel-xlsx
+Requires-Dist: pyyaml
 Requires-Dist: tqdm
 Requires-Dist: xlsxwriter
 Description-Content-Type: text/markdown
{xlin-0.1.11.dist-info → xlin-0.1.13.dist-info}/RECORD RENAMED
@@ -3,12 +3,12 @@ xlin/ischinese.py,sha256=Ia9IMQ6q-UHkdLwqS70L1fTnfSPbluFrv_I1UqsKquo,293
 xlin/jsonl.py,sha256=DvVM241a9VgQlp5WIMPRv-JIolT0RdSxw47IG_fc7xE,6690
 xlin/metric.py,sha256=N7wJ35y-C-IaBr1I1CJ_37lTG7gA69zmn9Xg6xSwKoI,1690
 xlin/multiprocess_mapping.py,sha256=pmzyEUYpbpIZ_ezyvWWWRpr7D7n4t3E3jW1nGXBbVck,7652
-xlin/read_as_dataframe.py,sha256=ir3HUT6dt3crqa3xnlcNn8j3wqjSIGJgiIVLP3KkBaQ,8678
-xlin/statistic.py,sha256=BLj8hszlbBT5xDIfd70_YtOb8QgZEvYXiFJDGXBwCfw,881
+xlin/read_as_dataframe.py,sha256=P8bOYW-zm8uGhehCldZI9ZQhHHLGqDPDbSMNWI2li6g,8885
+xlin/statistic.py,sha256=ldTSClpPiPJeA2Yo2rcy14dDtbX8waRL3rZxkxJqTMo,3911
 xlin/util.py,sha256=SOQUh506GQlljJYLYuI6nScSTOrgRQnMq2xfxSvKIlI,11303
 xlin/xls2xlsx.py,sha256=5zfcM0gmunFQOcOj9nYd9Dj0HMhU7-cPKnPIy6Ot9iU,930
 xlin/yaml.py,sha256=kICi7G3Td5q2MaSXXt85qNTWoHMgjzt7pvn7r3C4dME,183
-xlin-0.1.11.dist-info/LICENSE,sha256=KX0dDCYlO4DskqMZY8qeY94EZMrDRNnNqlGLkXVlKyM,1063
-xlin-0.1.11.dist-info/METADATA,sha256=uX2_qdpKUG3TEUnG5iT19lOOim7YAIk5UuagU-6Vi8Y,1067
-xlin-0.1.11.dist-info/WHEEL,sha256=IrRNNNJ-uuL1ggO5qMvT1GGhQVdQU54d6ZpYqEZfEWo,92
-xlin-0.1.11.dist-info/RECORD,,
+xlin-0.1.13.dist-info/LICENSE,sha256=KX0dDCYlO4DskqMZY8qeY94EZMrDRNnNqlGLkXVlKyM,1063
+xlin-0.1.13.dist-info/METADATA,sha256=21YKyI-upWi6HFq2zkBoruE_8GS7WtPbkt_Pl6SaixE,1140
+xlin-0.1.13.dist-info/WHEEL,sha256=aiTauIPAnqOMBuVimVqk4bevIILHLWXNdkzTocSR-tg,92
+xlin-0.1.13.dist-info/RECORD,,
{xlin-0.1.11.dist-info → xlin-0.1.13.dist-info}/WHEEL RENAMED
@@ -1,4 +1,4 @@
 Wheel-Version: 1.0
-Generator: poetry-core 1.9.0
+Generator: poetry-core 2.1.1
 Root-Is-Purelib: true
 Tag: py2.py3-none-any
{xlin-0.1.11.dist-info → xlin-0.1.13.dist-info}/LICENSE RENAMED
File without changes