pyxllib-0.3.197-py3-none-any.whl → pyxllib-3.201.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyxllib/__init__.py +14 -21
- pyxllib/algo/__init__.py +8 -8
- pyxllib/algo/disjoint.py +54 -54
- pyxllib/algo/geo.py +537 -541
- pyxllib/algo/intervals.py +964 -964
- pyxllib/algo/matcher.py +389 -389
- pyxllib/algo/newbie.py +166 -166
- pyxllib/algo/pupil.py +629 -629
- pyxllib/algo/shapelylib.py +67 -67
- pyxllib/algo/specialist.py +241 -241
- pyxllib/algo/stat.py +494 -494
- pyxllib/algo/treelib.py +145 -149
- pyxllib/algo/unitlib.py +62 -66
- pyxllib/autogui/__init__.py +5 -5
- pyxllib/autogui/activewin.py +246 -246
- pyxllib/autogui/all.py +9 -9
- pyxllib/autogui/autogui.py +846 -852
- pyxllib/autogui/uiautolib.py +362 -362
- pyxllib/autogui/virtualkey.py +102 -102
- pyxllib/autogui/wechat.py +827 -827
- pyxllib/autogui/wechat_msg.py +421 -421
- pyxllib/autogui/wxautolib.py +84 -84
- pyxllib/cv/__init__.py +5 -5
- pyxllib/cv/expert.py +267 -267
- pyxllib/cv/imfile.py +159 -159
- pyxllib/cv/imhash.py +39 -39
- pyxllib/cv/pupil.py +9 -9
- pyxllib/cv/rgbfmt.py +1525 -1525
- pyxllib/cv/slidercaptcha.py +137 -137
- pyxllib/cv/trackbartools.py +251 -251
- pyxllib/cv/xlcvlib.py +1040 -1040
- pyxllib/cv/xlpillib.py +423 -423
- pyxllib/data/echarts.py +236 -240
- pyxllib/data/jsonlib.py +85 -89
- pyxllib/data/oss.py +72 -72
- pyxllib/data/pglib.py +1111 -1127
- pyxllib/data/sqlite.py +568 -568
- pyxllib/data/sqllib.py +297 -297
- pyxllib/ext/JLineViewer.py +505 -505
- pyxllib/ext/__init__.py +6 -6
- pyxllib/ext/demolib.py +251 -246
- pyxllib/ext/drissionlib.py +277 -277
- pyxllib/ext/kq5034lib.py +12 -12
- pyxllib/ext/qt.py +449 -449
- pyxllib/ext/robustprocfile.py +493 -497
- pyxllib/ext/seleniumlib.py +76 -76
- pyxllib/ext/tk.py +173 -173
- pyxllib/ext/unixlib.py +821 -827
- pyxllib/ext/utools.py +345 -351
- pyxllib/ext/webhook.py +124 -119
- pyxllib/ext/win32lib.py +40 -40
- pyxllib/ext/wjxlib.py +91 -88
- pyxllib/ext/wpsapi.py +124 -124
- pyxllib/ext/xlwork.py +9 -9
- pyxllib/ext/yuquelib.py +1110 -1105
- pyxllib/file/__init__.py +17 -17
- pyxllib/file/docxlib.py +757 -761
- pyxllib/file/gitlib.py +309 -309
- pyxllib/file/libreoffice.py +165 -165
- pyxllib/file/movielib.py +144 -148
- pyxllib/file/newbie.py +10 -10
- pyxllib/file/onenotelib.py +1469 -1469
- pyxllib/file/packlib/__init__.py +330 -330
- pyxllib/file/packlib/zipfile.py +2441 -2441
- pyxllib/file/pdflib.py +422 -426
- pyxllib/file/pupil.py +185 -185
- pyxllib/file/specialist/__init__.py +681 -685
- pyxllib/file/specialist/dirlib.py +799 -799
- pyxllib/file/specialist/download.py +193 -193
- pyxllib/file/specialist/filelib.py +2825 -2829
- pyxllib/file/xlsxlib.py +3122 -3131
- pyxllib/file/xlsyncfile.py +341 -341
- pyxllib/prog/__init__.py +5 -5
- pyxllib/prog/cachetools.py +58 -64
- pyxllib/prog/deprecatedlib.py +233 -233
- pyxllib/prog/filelock.py +42 -42
- pyxllib/prog/ipyexec.py +253 -253
- pyxllib/prog/multiprogs.py +940 -940
- pyxllib/prog/newbie.py +451 -451
- pyxllib/prog/pupil.py +1208 -1197
- pyxllib/prog/sitepackages.py +33 -33
- pyxllib/prog/specialist/__init__.py +348 -391
- pyxllib/prog/specialist/bc.py +203 -203
- pyxllib/prog/specialist/browser.py +497 -497
- pyxllib/prog/specialist/common.py +347 -347
- pyxllib/prog/specialist/datetime.py +198 -198
- pyxllib/prog/specialist/tictoc.py +240 -240
- pyxllib/prog/specialist/xllog.py +180 -180
- pyxllib/prog/xlosenv.py +110 -108
- pyxllib/stdlib/__init__.py +17 -17
- pyxllib/stdlib/tablepyxl/__init__.py +10 -10
- pyxllib/stdlib/tablepyxl/style.py +303 -303
- pyxllib/stdlib/tablepyxl/tablepyxl.py +130 -130
- pyxllib/text/__init__.py +8 -8
- pyxllib/text/ahocorasick.py +36 -39
- pyxllib/text/airscript.js +754 -744
- pyxllib/text/charclasslib.py +121 -121
- pyxllib/text/jiebalib.py +267 -267
- pyxllib/text/jinjalib.py +27 -32
- pyxllib/text/jsa_ai_prompt.md +271 -271
- pyxllib/text/jscode.py +922 -922
- pyxllib/text/latex/__init__.py +158 -158
- pyxllib/text/levenshtein.py +303 -303
- pyxllib/text/nestenv.py +1215 -1215
- pyxllib/text/newbie.py +300 -300
- pyxllib/text/pupil/__init__.py +8 -8
- pyxllib/text/pupil/common.py +1121 -1121
- pyxllib/text/pupil/xlalign.py +326 -326
- pyxllib/text/pycode.py +47 -47
- pyxllib/text/specialist/__init__.py +8 -8
- pyxllib/text/specialist/common.py +112 -112
- pyxllib/text/specialist/ptag.py +186 -186
- pyxllib/text/spellchecker.py +172 -172
- pyxllib/text/templates/echart_base.html +10 -10
- pyxllib/text/templates/highlight_code.html +16 -16
- pyxllib/text/templates/latex_editor.html +102 -102
- pyxllib/text/vbacode.py +17 -17
- pyxllib/text/xmllib.py +741 -747
- pyxllib/xl.py +42 -39
- pyxllib/xlcv.py +17 -17
- pyxllib-3.201.1.dist-info/METADATA +296 -0
- pyxllib-3.201.1.dist-info/RECORD +125 -0
- {pyxllib-0.3.197.dist-info → pyxllib-3.201.1.dist-info}/licenses/LICENSE +190 -190
- pyxllib/ext/old.py +0 -663
- pyxllib-0.3.197.dist-info/METADATA +0 -48
- pyxllib-0.3.197.dist-info/RECORD +0 -126
- {pyxllib-0.3.197.dist-info → pyxllib-3.201.1.dist-info}/WHEEL +0 -0
pyxllib/algo/stat.py
CHANGED
@@ -1,494 +1,494 @@
(The hunk replaces the entire file; the old and new sides render as the same 494 lines, so the content is shown once below.)

```python
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @Author : 陈坤泽
# @Email : 877362867@qq.com
# @Date : 2021/06/03 23:04

""" Statistics functionality

Mainly pandas and table computations
"""

import sys
from collections import defaultdict, Counter

import pandas as pd

from pyxllib.prog.pupil import dprint, typename
from pyxllib.file.specialist import XlPath

pd.options.display.unicode.east_asian_width = True  # improve alignment when printing Chinese text
try:
    pd.set_option('future.no_silent_downcasting', True)
except Exception as e:
    pass


def treetable(childreds, parents, arg3=None, nodename_colname=None):
    """ Takes a list of child node ids (childreds) and a list of parent node ids (parents)

    The two lists must have the same length
    Docs: http://note.youdao.com/noteshare?id=126200f45d301fcb4364d06a0cae8376

    Two call forms are supported
    >> treetable(childreds, parents) --> DataFrame (builds a new df)
    >> treetable(df, child_colname, parent_colname) --> DataFrame (the modified df)

    Returns a two-dimensional table containing
        the new childreds (virtual nodes may be appended at the end)
        the new parents
    The function computes tree_order, each child row's position after sorting the tree,
    as well as the depth of every node.

    >> ls1 = [6, 2, 4, 5, 3], ls2 = [7, 1, 2, 2, 1], treetable(ls1, ls2)
        child_id   parent_id   depth   tree_order    tree_struct
    5   7          root        1       1             = = 7
    0   6          7           2       2             = = = = 6
    6   1          root        1       3             = = 1
    1   2          1           2       4             = = = = 2
    2   4          2           3       5             = = = = = = 4
    3   5          2           3       6             = = = = = = 5
    4   3          1           2       7             = = = = 3
    """
    # 0 Preprocess the arguments
    if isinstance(childreds, pd.DataFrame):
        df = childreds
        child_colname = parents
        parent_colname = arg3
        if not arg3: raise TypeError
        childreds = df[child_colname].tolist()
        parents = df[parent_colname].tolist()
    else:
        df = None

    # 1 Create a root node, ensuring every node except root has a record
    lefts = set(parents) - set(childreds)  # nodes in the parents column that never appear in childreds
    cs, ps = list(childreds), list(parents)

    if len(lefts) == 0:
        # an empty lefts guarantees a cycle; a non-empty lefts still does not guarantee a proper tree
        raise ValueError('Cycle detected; not a tree structure')
    elif len(lefts) == 1:  # exactly one node never appears as a child, so it is the root
        root = list(lefts)[0]
    else:  # several parents have no record; attach them all to a single new root node
        root = 'root'
        allnode = set(parents) | set(childreds)  # the set of all nodes
        while root in allnode: root += '-'  # keep appending '-' until the name does not occur in the input
        # add the nodes
        lefts = list(lefts)
        lefts.sort(key=lambda x: parents.index(x))
        for t in lefts:
            cs.append(t)
            ps.append(root)

    n = len(cs)
    depth, tree_order, len_childs = [-1] * n, [-1] * n, [0] * n

    # 2 Build dd, the parent node -> child nodes dict
    dd = defaultdict(list)
    for i in range(n): dd[ps[i]] += [i]

    # 3 dfs
    cnt = 1

    def dfs(node, d):
        """Visit all children of node"""
        nonlocal cnt
        for i in dd.get(node, []):
            tree_order[i], depth[i], len_childs[i] = cnt, d, len(dd[cs[i]])
            cnt += 1
            dfs(cs[i], d + 1)

    dfs(root, 1)

    # 4 Output formatting
    tree_struct = list(map(lambda i: f"{'_ _ ' * depth[i]}{cs[i]}" + (f'[{len_childs[i]}]' if len_childs[i] else ''),
                           range(n)))

    if df is None:
        ls = list(zip(cs, ps, depth, tree_order, len_childs, tree_struct))
        df = pd.DataFrame.from_records(ls, columns=('child_id', 'parent_id',
                                                    'depth', 'tree_order', 'len_childs', 'tree_struct'))
    else:
        k = len(df)
        df = df.append(pd.DataFrame({child_colname: cs[k:], parent_colname: ps[k:]}), sort=False, ignore_index=True)
        if nodename_colname:
            tree_struct = list(
                map(lambda i: f"{'_ _ ' * depth[i]}{cs[i]} {df.iloc[i][nodename_colname]}"
                              + (f'[{len_childs[i]}]' if len_childs[i] else ''), range(n)))
        df['depth'], df['tree_order'], df['len_childs'], df['tree_struct'] = depth, tree_order, len_childs, tree_struct
    df.sort_values('tree_order', inplace=True)  # note: sorting may occasionally be impossible; keep the input order then
    return df


def treetable_flatten(df, *, reverse=False, childid_colname='id', parentid_colname='parent_id', format_colname=None):
    """ Flatten a knowledge tree horizontally: columns depth-3, depth-2, depth-1 mean the
    3rd-from-last, 2nd-from-last and last levels

    :param df: DataFrame data
    :param reverse:
        False: list node info normally as depth1, depth2, depth3, ...
        True: list ancestry in reverse, i.e. the last level parent1 first, then the 2nd-from-last parent2, ...
    :param childid_colname: child node column
    :param parentid_colname: parent node column
    :param format_colname: value to display
        None: default to the value of childid_colname
        str: the name of a column whose values are used (lets you pre-format the display)
    :return:
    """
    # 1 Build helper arrays
    if format_colname is None: format_colname = parentid_colname
    parentid = dict()  # parentid[k] = v, the parent node v of node k
    nodeval = dict()  # nodeval[k] = v, the value to display for node k
    if len(df[df.index.duplicated()]):
        dprint(len(set(df.index)), len(df.index))  # duplicated index
        raise ValueError

    for idx, row in df.iterrows():
        parentid[row[childid_colname]] = row[parentid_colname]
        nodeval[row[childid_colname]] = str(row[format_colname])

    # 2 Walk upwards from every node to collect all its ancestors
    parents = []
    for idx, row in df.iterrows():
        ps = [nodeval[row[childid_colname]]]  # names of all ancestors, including the node itself
        p = row[parentid_colname]
        while p in parentid:
            ps.append(nodeval[p])
            p = parentid[p]
        parents.append(ps)
    num_depth = max(map(len, parents), default=0)

    # 3 The final display format can be adjusted flexibly here
    df['parents'] = parents
    if reverse:
        for j in range(num_depth, 0, -1): df[f'depth-{j}'] = ''
        for idx, row in df.iterrows():
            for j in range(1, len(row.parents) + 1):
                df.loc[idx, f'depth-{j}'] = row.parents[j - 1]
    else:
        for j in range(num_depth): df[f'depth{j}'] = ''
        for idx, row in df.iterrows():
            for j in range(len(row.parents)):
                df.loc[idx, f'depth{j}'] = row.parents[-j - 1]
    df.drop('parents', axis=1, inplace=True)
    return df


def write_dataframes_to_excel(outfile, dataframes, order_mode='序号'):
    """ Write multiple DataFrames into a single Excel file, adding a row-number column

    :param str outfile: path of the output Excel file
    :param dict dataframes: dict of the DataFrames to save, keyed by sheet name
    :param str order_mode: numbering mode, either 'default' or '序号' (row number); defaults to '序号'

    >> write_dataframes_to_excel('test.xlsx', {'images': df1, 'annotations': df2})

    # TODO after saving, the workbook could be reopened with openpyxl etc. for fine-tuning

    The implementation applies some formatting polish to common layouts; for unusual
    structures it keeps the DataFrame's default rendering and does nothing special.
    """
    with pd.ExcelWriter(str(outfile), engine='xlsxwriter') as writer:
        head_format = writer.book.add_format({'font_size': 12, 'font_color': 'blue',
                                              'align': 'left', 'valign': 'vcenter'})
        for sheet_name, df in dataframes.items():
            if df.index.nlevels == 1 and df.columns.nlevels == 1:
                if order_mode == '序号':
                    # write the table with a row-number column
                    if '序号' not in df.columns:
                        df = df.copy()
                        df.insert(0, '序号', range(1, len(df) + 1))
                else:
                    df = df.reset_index()
                    df.columns = ['_index'] + list(df.columns[1:])
                df.to_excel(writer, sheet_name=sheet_name, freeze_panes=(1, 0), index=False)
            else:
                # write a plain table
                df.to_excel(writer, sheet_name=sheet_name, freeze_panes=(1, df.index.nlevels))

            # format the header row
            if df.columns.nlevels == 1:
                start = df.index.nlevels
                if start == 1:
                    start = 0
                for col_num, value in enumerate(df.columns, start=start):
                    writer.sheets[sheet_name].write(0, col_num, value, head_format)


def read_dataframes_from_excel(infile):
    """ Read multiple DataFrames from an Excel file

    :param str infile: Excel file path
    :return: dict of the DataFrames read, keyed by worksheet name
    :rtype: dict

    Note this function is not well suited to multi-level index/columns; in that case it is
    better to read manually: read_excel accepts header=[0,1], index=[0,1,2] etc. to locate the header.
    """
    dataframes = {}
    with pd.ExcelFile(infile) as xls:
        sheet_names = xls.sheet_names
        for sheet_name in sheet_names:
            df = pd.read_excel(xls, sheet_name=sheet_name)
            if '_index' in df.columns:
                df = df.drop('_index', axis=1)
            dataframes[sheet_name] = df
    return dataframes


def update_dataframes_to_excel(outfile, dataframes, order_mode='序号'):
    """ Update the sheets of an xlsx file """
    outfile = XlPath(outfile)
    if outfile.is_file():
        data = read_dataframes_from_excel(outfile)
    else:
        data = {}
    data.update(dataframes)
    write_dataframes_to_excel(outfile, data, order_mode)


def xlpivot(df, index=None, columns=None, values=None):
    """ Pivot-table functionality wrapped around pandas

    :param df: the data table
    :param index: how to partition rows
    :param columns: how to partition columns
    :param values: the values to display
        Callable[items, value]: a function producing the value
        list[str]: a list of column names, meaning show the raw values; if a raw value is not
            unique, the values are joined with commas. This usage is no longer a pivot table
            in the traditional sense, though.
    :return: the pivot table

    Usage example:
        def func(items):  # receives the matching rows
            x = items.iloc[0]
            return f'{x["precision"]:.0f},{x["recall"]:.0f},{x["hmean"]:.2f},{x["fps"]}'  # the value to display

    >> df2 = xlpivot(df, ['model_type'], ['dataset', 'total_frame'],
                     {'precision,recall,hmean,fps': func})

    Tip: to enforce a particular order within groups, use numbering prefixes for alignment
    """

    # 1 Normalize the grouping specs
    def reset_groups(keys):
        if isinstance(keys, (list, tuple)):
            return list(keys)
        elif keys:
            return [keys]
        else:
            return []

    index_, columns_ = reset_groups(index), reset_groups(columns)

    # 2 Normalize the values spec
    def make_col_func(col):
        def func(rows):
            if len(rows):
                return ', '.join(map(str, rows[col].values))
            return ''

        return func

    if isinstance(values, (list, tuple)):
        values = {v: make_col_func(v) for v in values}

    if callable(values):
        values_ = {'values': values}
    elif isinstance(values, dict):
        values_ = values
    else:
        raise TypeError

    # 3 Group
    assert len(df), 'df is empty'

    keys = index_ + columns_
    dfgp = df.groupby(keys)
    data = defaultdict(list)
    for ks, items in dfgp:
        # store the values of the grouping keys
        if len(keys) == 1:
            data[keys[0]].append(ks)
        else:
            for i, k in enumerate(keys):
                data[k].append(ks[i])
        # then store the computed values
        for k, func in values_.items():
            data[k].append(func(items))
    df2 = pd.DataFrame.from_dict(data)

    # 4 Render the table
    if index and columns:
        view_table = df2.pivot(index=index, columns=columns, values=list(values_.keys()))
    elif index:
        view_table = df2.set_index(index_)
    else:  # columns only, no index
        view_table = df2.set_index(columns_).T
    return view_table


def count_key_combinations(df, col_names, count_col_name='count'):
    """ Count how often each combination of the given columns occurs

    :param df:
    :param col_names: ['a', 'b', 'c']
    :param count_col_name: name of the new occurrence-count column, 'count' by default
    :return: a new df with the counts

    This is very close to SqlCodeGenerator's keys_count and one2many, and can replace both
    """
    from collections import Counter

    # 0 Handle the arguments
    if isinstance(col_names, str):
        col_names = [col_names]

    # 1 Count the occurrences of every combination
    cols = [df[name] for name in col_names]
    ct = Counter(tuple(zip(*cols)))

    # 2 Build the new count df
    ls = []
    for k, v in ct.most_common():
        ls.append([*k, v])
    df2 = pd.DataFrame.from_records(ls, columns=list(col_names) + [count_col_name])
    return df2


def pareto_accumulate(weights, accuracy=0.01, *, print_mode=False, value_unit_type='K'):
    """ Pareto accumulation

    Useful for analysing the dominant weights/frequencies:
    by the 80/20 rule, 20% of the data often covers 80% of the problem

    :param weights: a collection of weight values
    :param accuracy: accumulation precision; towards the tail there may be many tiny weights,
        so instead of recording every step, a new point is only recorded once the weight
        accumulated since the last point reaches accuracy.
        Note this is a percentage of the grand total, so the smallest update step is 1%.
    :param print_mode: whether to display the result directly
    :return: [(number of values so far, >= current threshold, accumulated weight), ...]

    >>> pareto_accumulate([1, 2, 3, 9, 8, 7, 4, 6, 5])
    [(1, 9, 9), (2, 8, 17), (3, 7, 24), (4, 6, 30), (5, 5, 35), (6, 4, 39), (7, 3, 42), (8, 2, 44), (9, 1, 45)]
    >>> pareto_accumulate([1, 2, 3, 9, 8, 7, 4, 6, 5], 0.1)
    [(1, 9, 9), (2, 8, 17), (3, 7, 24), (4, 6, 30), (5, 5, 35), (7, 3, 42), (9, 1, 45)]
    """
    # 1 Base computation
    points = []
    weights = sorted(weights, reverse=True)

    total = sum(weights)
    accuracy = total * accuracy

    acc = 0
    delta = 0
    for i, w in enumerate(weights, start=1):
        acc += w
        delta += w
        if delta >= accuracy:
            points.append((i, w, acc))
            delta = 0
    if delta:
        points.append((len(weights), weights[-1], acc))

    # 2 Presentation
    def fmt(p):
        from pyxllib.prog.newbie import human_readable_number
        ls = [f'{human_readable_number(p[0], "万")}条≥{human_readable_number(p[1])}',
              f'{human_readable_number(p[2], value_unit_type)}({p[2] / total_size:.0%})']
        return ','.join(map(str, ls))

    total_size = points[-1][2]
    labels = [fmt(p) for p in points]

    pts = [[p[0], p[2]] for p in points]

    if print_mode:
        if sys.platform == 'win32':
            from pyxllib.data.echarts import Line
            from pyxllib.prog.specialist import browser

            x = Line()
            x.add_series('帕累托累积权重', pts, labels=labels, label={'position': 'right'})
            browser(x)
        else:
            print(*labels, sep='\n')

    return pts, labels


class XlDataFrame(pd.DataFrame):
    def check_dtypes(self):
        """ Inspect the data types
        Column 1 is the column name, column 2 the type that dtypes reports,
        column 3 my extended tally of the actual data types
        """
        d = self.dtypes
        ls = [[k, d[k], Counter([typename(x) for x in v]).most_common()] for k, v in self.iteritems()]
        df = pd.DataFrame.from_records(ls, columns=['name', 'type', 'detail'])
        return df


class ModifiableRow:
    def __init__(self, df, index):
        self.df = df
        self.index = index

    def __getitem__(self, item):
        return self.df.at[self.index, item]

    def __setitem__(self, key, value):
        self.df.at[self.index, key] = value


def print_full_dataframe(df):
    """
    Temporarily adjust the display options to print a DataFrame in full

    :param pd.DataFrame df: the DataFrame to display in full
    """
    with pd.option_context('display.max_rows', None,
                           'display.max_columns', None,
                           'display.width', 1000,
                           'display.max_colwidth', None):
        print(df)

    pd.options.display.max_rows = 60


def custom_fillna(df, default_fill_value='', numeric_fill_value=None, specific_fill=None):
    """ Fill NaN values in a DataFrame with extra flexibility.

    :param pandas.DataFrame df: the DataFrame to process.
    :param str default_fill_value: default fill value for NaN in non-numeric columns.
    :param numeric_fill_value: fill value for NaN in numeric columns; defaults to None if unspecified.
    :param dict specific_fill: per-column fill values for NaN; defaults to None if unspecified.
    :return: the pandas.DataFrame with NaN filled according to the given rules.

    >>> df = pd.DataFrame({'A': [1, 2, None], 'B': [None, 'x', 'y'], 'C': [None, None, None]})
    >>> custom_fillna(df, 'filled', 0, {'C': 'special'})
    """
    for column in df.columns:
        # If the column has an entry in specific_fill, fill with that value.
        if specific_fill and column in specific_fill:
            df[column] = df[column].fillna(specific_fill[column])
        # If the column is numeric and numeric_fill_value was given, use numeric_fill_value.
        elif numeric_fill_value is not None and pd.api.types.is_numeric_dtype(df[column]):
            df[column] = df[column].fillna(numeric_fill_value)
        # Otherwise fill non-numeric columns with default_fill_value.
        elif pd.api.types.is_object_dtype(df[column]) or pd.api.types.is_string_dtype(df[column]):
            df[column] = df[column].fillna(default_fill_value)
        # Further branches could be added here for other dtypes such as datetime.
    return df


def dataframe_to_list(df):
    """Convert a DataFrame to a list structure: the first row is the header, the rest is data"""
    # take the header (column names) as the first list element
    headers = df.columns.tolist()

    # take the data rows, each row as a list, collected into one big list
    data_rows = df.values.tolist()

    # concatenate the header and the data rows into the final list
    result_list = [headers] + data_rows

    return result_list
```
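For orientation, here is a small usage sketch of the helpers above. It is illustrative only: the DataFrame contents and column names are made up, and it assumes pyxllib and pandas are installed.

```python
import pandas as pd
from pyxllib.algo.stat import treetable, xlpivot, pareto_accumulate, dataframe_to_list

# treetable: derive depth/tree_order columns from child -> parent id lists
print(treetable([6, 2, 4, 5, 3], [7, 1, 2, 2, 1]))

# xlpivot: the list-of-columns form joins raw values with commas per group
df = pd.DataFrame({'model': ['a', 'a', 'b'],
                   'dataset': ['d1', 'd2', 'd1'],
                   'hmean': [0.81, 0.78, 0.85]})
print(xlpivot(df, index=['model'], columns=['dataset'], values=['hmean']))

# pareto_accumulate: records a point whenever another 10% of the total weight accrues
pts, labels = pareto_accumulate([1, 2, 3, 9, 8, 7, 4, 6, 5], 0.1)
print(pts)  # [[1, 9], [2, 17], [3, 24], [4, 30], [5, 35], [7, 42], [9, 45]]

# dataframe_to_list: header row followed by the data rows
print(dataframe_to_list(df))
```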
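One point worth flagging when reviewing this version: the file still calls `DataFrame.append` (in `treetable`) and `iteritems` (in `XlDataFrame.check_dtypes`), both of which were removed in pandas 2.0, so those code paths only run on pandas 1.x. A minimal sketch of the modern equivalents, assuming pandas >= 2.0:

```python
import pandas as pd

df = pd.DataFrame({'id': [1, 2], 'parent_id': [0, 1]})
extra = pd.DataFrame({'id': [3], 'parent_id': [1]})

# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
df = pd.concat([df, extra], sort=False, ignore_index=True)

# DataFrame.iteritems was removed in pandas 2.0; DataFrame.items is equivalent
for name, column in df.items():
    print(name, column.tolist())
```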