xlin 0.1.20__py2.py3-none-any.whl → 0.1.22__py2.py3-none-any.whl
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- xlin/read_as_dataframe.py +7 -5
- xlin/statistic.py +11 -11
- {xlin-0.1.20.dist-info → xlin-0.1.22.dist-info}/METADATA +1 -1
- {xlin-0.1.20.dist-info → xlin-0.1.22.dist-info}/RECORD +6 -6
- {xlin-0.1.20.dist-info → xlin-0.1.22.dist-info}/LICENSE +0 -0
- {xlin-0.1.20.dist-info → xlin-0.1.22.dist-info}/WHEEL +0 -0
xlin/read_as_dataframe.py
CHANGED
@@ -13,7 +13,7 @@ from xlin.xls2xlsx import is_xslx
|
|
13
13
|
|
14
14
|
|
15
15
|
def read_as_dataframe(
|
16
|
-
filepath: Union[str, Path],
|
16
|
+
filepath: Union[str, Path, list[str], list[Path]],
|
17
17
|
sheet_name: Optional[str] = None,
|
18
18
|
fill_empty_str_to_na=True,
|
19
19
|
filter=lambda x: True,
|
@@ -21,11 +21,9 @@ def read_as_dataframe(
|
|
21
21
|
"""
|
22
22
|
读取文件为表格。如果是文件夹,则读取文件夹下的所有文件为表格并拼接
|
23
23
|
"""
|
24
|
-
|
25
|
-
if filepath.is_dir():
|
26
|
-
paths = ls(filepath, filter=filter, expand_all_subdir=True)
|
24
|
+
if isinstance(filepath, list):
|
27
25
|
df_list = []
|
28
|
-
for path in paths:
|
26
|
+
for path in filepath:
|
29
27
|
try:
|
30
28
|
df = read_as_dataframe(path, sheet_name, fill_empty_str_to_na, filter)
|
31
29
|
df["数据来源"] = path.name
|
@@ -36,6 +34,10 @@ def read_as_dataframe(
|
|
36
34
|
if fill_empty_str_to_na:
|
37
35
|
df.fillna("", inplace=True)
|
38
36
|
return df
|
37
|
+
filepath = Path(filepath)
|
38
|
+
if filepath.is_dir():
|
39
|
+
paths = ls(filepath, filter=filter, expand_all_subdir=True)
|
40
|
+
return read_as_dataframe(paths, sheet_name, fill_empty_str_to_na, filter)
|
39
41
|
filename = filepath.name
|
40
42
|
if filename.endswith(".json") or filename.endswith(".jsonl"):
|
41
43
|
try:
|
xlin/statistic.py
CHANGED
@@ -19,7 +19,7 @@ def bucket_count(length: List[int], step=50, skip_zero_count=False):
|
|
19
19
|
continue
|
20
20
|
if skip_zero_count and j == 0:
|
21
21
|
continue
|
22
|
-
print(f"[{(i-1)*step}, {i*step}) {j} {sum(grouped_count[:i+1])/len(length)*100:.
|
22
|
+
print(f"[{(i-1)*step}, {i*step}) {j} {sum(grouped_count[:i+1])/len(length)*100:.4f}%")
|
23
23
|
x.append((i - 1) * step)
|
24
24
|
y.append(j)
|
25
25
|
return x, y
|
@@ -73,22 +73,22 @@ def draw_histogram(data: list[int], bins=30, title="Data Distribution Analysis")
|
|
73
73
|
plt.plot(x_vals, kde(x_vals), color="navy", linewidth=2, label="KDE Curve")
|
74
74
|
|
75
75
|
# 添加统计线
|
76
|
-
plt.axvline(mean, color="red", linestyle="--", linewidth=2, label=f"Mean ({mean:.
|
77
|
-
plt.axvline(median, color="green", linestyle="-.", linewidth=2, label=f"Median ({median:.
|
78
|
-
plt.axvspan(mean - std, mean + std, color="orange", alpha=0.1, label=f"±1 Std.Dev ({std:.
|
76
|
+
plt.axvline(mean, color="red", linestyle="--", linewidth=2, label=f"Mean ({mean:.4f})")
|
77
|
+
plt.axvline(median, color="green", linestyle="-.", linewidth=2, label=f"Median ({median:.4f})")
|
78
|
+
plt.axvspan(mean - std, mean + std, color="orange", alpha=0.1, label=f"±1 Std.Dev ({std:.4f})")
|
79
79
|
|
80
80
|
# 添加四分位线
|
81
|
-
plt.axvline(q25, color="purple", linestyle=":", alpha=0.8, label=f"25th Percentile ({q25:.
|
82
|
-
plt.axvline(q75, color="purple", linestyle=":", alpha=0.8, label=f"75th Percentile ({q75:.
|
83
|
-
plt.axvline(q80, color="purple", linestyle=":", alpha=0.8, label=f"80th Percentile ({q80:.
|
84
|
-
plt.axvline(q90, color="purple", linestyle=":", alpha=0.8, label=f"90th Percentile ({q90:.
|
81
|
+
plt.axvline(q25, color="purple", linestyle=":", alpha=0.8, label=f"25th Percentile ({q25:.4f})")
|
82
|
+
plt.axvline(q75, color="purple", linestyle=":", alpha=0.8, label=f"75th Percentile ({q75:.4f})")
|
83
|
+
plt.axvline(q80, color="purple", linestyle=":", alpha=0.8, label=f"80th Percentile ({q80:.4f})")
|
84
|
+
plt.axvline(q90, color="purple", linestyle=":", alpha=0.8, label=f"90th Percentile ({q90:.4f})")
|
85
85
|
|
86
86
|
# 添加统计摘要
|
87
87
|
stats_text = f"""\
|
88
|
-
Data Range: [{data_range[0]:.
|
88
|
+
Data Range: [{data_range[0]:.4f}, {data_range[1]:.4f}]
|
89
89
|
Observations: {len(data):,}
|
90
|
-
Standard Deviation: {std:.
|
91
|
-
IQR: {q75 - q25:.
|
90
|
+
Standard Deviation: {std:.4f}
|
91
|
+
IQR: {q75 - q25:.4f}
|
92
92
|
Skewness: {float((data - mean).mean()**3 / std**3):.4f}
|
93
93
|
Kurtosis: {float((data - mean).mean()**4 / std**4):.4f}\
|
94
94
|
"""
|
@@ -3,13 +3,13 @@ xlin/ischinese.py,sha256=Ia9IMQ6q-UHkdLwqS70L1fTnfSPbluFrv_I1UqsKquo,293
|
|
3
3
|
xlin/jsonl.py,sha256=Ogn_9eIx1NPmI_hMvBVwuDTooJYDEJ8FTtViQ8zTVlQ,7618
|
4
4
|
xlin/metric.py,sha256=N7wJ35y-C-IaBr1I1CJ_37lTG7gA69zmn9Xg6xSwKoI,1690
|
5
5
|
xlin/multiprocess_mapping.py,sha256=dRXQoLaG1dK_qZ8B3bJblV0RKM2gqIeSW1EaOZbIdD0,14251
|
6
|
-
xlin/read_as_dataframe.py,sha256=
|
7
|
-
xlin/statistic.py,sha256=
|
6
|
+
xlin/read_as_dataframe.py,sha256=MqY57L7Wp9UoWTRlZLSBKQNaZa-dKw51-ufrKvHKf8s,9041
|
7
|
+
xlin/statistic.py,sha256=WMZkPFJ5da0rqIJHabdjCbWmgzgCOIj_H6KM5SVF7H0,9301
|
8
8
|
xlin/timing.py,sha256=XMT8dMcMolOMohDvAZOIM_BAiPMREhGQKnO1kc5s6PU,998
|
9
9
|
xlin/util.py,sha256=TTWJaqF5D_r-gAZ_fj0kyHomvCagjwHXQZ2OPSgwd54,10976
|
10
10
|
xlin/xls2xlsx.py,sha256=uSmXcDvIhi5Sq0LGidMXy0wErNBXdjaoa6EftYVjTXs,947
|
11
11
|
xlin/yaml.py,sha256=kICi7G3Td5q2MaSXXt85qNTWoHMgjzt7pvn7r3C4dME,183
|
12
|
-
xlin-0.1.20.dist-info/LICENSE,sha256=60ys6rRtc1dZOP8UjSUr9fAqhZudT3WpKe5WbMCralM,1066
|
13
|
-
xlin-0.1.
|
14
|
-
xlin-0.1.20.dist-info/WHEEL,sha256=IrRNNNJ-uuL1ggO5qMvT1GGhQVdQU54d6ZpYqEZfEWo,92
|
15
|
-
xlin-0.1.20.dist-info/RECORD,,
|
12
|
+
xlin-0.1.22.dist-info/LICENSE,sha256=60ys6rRtc1dZOP8UjSUr9fAqhZudT3WpKe5WbMCralM,1066
|
13
|
+
xlin-0.1.22.dist-info/METADATA,sha256=77itC4591plUaDbS6T01BmZqeZ3jkoXT9uHgsLTTeA8,1098
|
14
|
+
xlin-0.1.22.dist-info/WHEEL,sha256=IrRNNNJ-uuL1ggO5qMvT1GGhQVdQU54d6ZpYqEZfEWo,92
|
15
|
+
xlin-0.1.22.dist-info/RECORD,,
|
File without changes
|
File without changes
|