pyxllib 0.3.197__py3-none-any.whl → 3.201.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127) hide show
  1. pyxllib/__init__.py +14 -21
  2. pyxllib/algo/__init__.py +8 -8
  3. pyxllib/algo/disjoint.py +54 -54
  4. pyxllib/algo/geo.py +537 -541
  5. pyxllib/algo/intervals.py +964 -964
  6. pyxllib/algo/matcher.py +389 -389
  7. pyxllib/algo/newbie.py +166 -166
  8. pyxllib/algo/pupil.py +629 -629
  9. pyxllib/algo/shapelylib.py +67 -67
  10. pyxllib/algo/specialist.py +241 -241
  11. pyxllib/algo/stat.py +494 -494
  12. pyxllib/algo/treelib.py +145 -149
  13. pyxllib/algo/unitlib.py +62 -66
  14. pyxllib/autogui/__init__.py +5 -5
  15. pyxllib/autogui/activewin.py +246 -246
  16. pyxllib/autogui/all.py +9 -9
  17. pyxllib/autogui/autogui.py +846 -852
  18. pyxllib/autogui/uiautolib.py +362 -362
  19. pyxllib/autogui/virtualkey.py +102 -102
  20. pyxllib/autogui/wechat.py +827 -827
  21. pyxllib/autogui/wechat_msg.py +421 -421
  22. pyxllib/autogui/wxautolib.py +84 -84
  23. pyxllib/cv/__init__.py +5 -5
  24. pyxllib/cv/expert.py +267 -267
  25. pyxllib/cv/imfile.py +159 -159
  26. pyxllib/cv/imhash.py +39 -39
  27. pyxllib/cv/pupil.py +9 -9
  28. pyxllib/cv/rgbfmt.py +1525 -1525
  29. pyxllib/cv/slidercaptcha.py +137 -137
  30. pyxllib/cv/trackbartools.py +251 -251
  31. pyxllib/cv/xlcvlib.py +1040 -1040
  32. pyxllib/cv/xlpillib.py +423 -423
  33. pyxllib/data/echarts.py +236 -240
  34. pyxllib/data/jsonlib.py +85 -89
  35. pyxllib/data/oss.py +72 -72
  36. pyxllib/data/pglib.py +1111 -1127
  37. pyxllib/data/sqlite.py +568 -568
  38. pyxllib/data/sqllib.py +297 -297
  39. pyxllib/ext/JLineViewer.py +505 -505
  40. pyxllib/ext/__init__.py +6 -6
  41. pyxllib/ext/demolib.py +251 -246
  42. pyxllib/ext/drissionlib.py +277 -277
  43. pyxllib/ext/kq5034lib.py +12 -12
  44. pyxllib/ext/qt.py +449 -449
  45. pyxllib/ext/robustprocfile.py +493 -497
  46. pyxllib/ext/seleniumlib.py +76 -76
  47. pyxllib/ext/tk.py +173 -173
  48. pyxllib/ext/unixlib.py +821 -827
  49. pyxllib/ext/utools.py +345 -351
  50. pyxllib/ext/webhook.py +124 -119
  51. pyxllib/ext/win32lib.py +40 -40
  52. pyxllib/ext/wjxlib.py +91 -88
  53. pyxllib/ext/wpsapi.py +124 -124
  54. pyxllib/ext/xlwork.py +9 -9
  55. pyxllib/ext/yuquelib.py +1110 -1105
  56. pyxllib/file/__init__.py +17 -17
  57. pyxllib/file/docxlib.py +757 -761
  58. pyxllib/file/gitlib.py +309 -309
  59. pyxllib/file/libreoffice.py +165 -165
  60. pyxllib/file/movielib.py +144 -148
  61. pyxllib/file/newbie.py +10 -10
  62. pyxllib/file/onenotelib.py +1469 -1469
  63. pyxllib/file/packlib/__init__.py +330 -330
  64. pyxllib/file/packlib/zipfile.py +2441 -2441
  65. pyxllib/file/pdflib.py +422 -426
  66. pyxllib/file/pupil.py +185 -185
  67. pyxllib/file/specialist/__init__.py +681 -685
  68. pyxllib/file/specialist/dirlib.py +799 -799
  69. pyxllib/file/specialist/download.py +193 -193
  70. pyxllib/file/specialist/filelib.py +2825 -2829
  71. pyxllib/file/xlsxlib.py +3122 -3131
  72. pyxllib/file/xlsyncfile.py +341 -341
  73. pyxllib/prog/__init__.py +5 -5
  74. pyxllib/prog/cachetools.py +58 -64
  75. pyxllib/prog/deprecatedlib.py +233 -233
  76. pyxllib/prog/filelock.py +42 -42
  77. pyxllib/prog/ipyexec.py +253 -253
  78. pyxllib/prog/multiprogs.py +940 -940
  79. pyxllib/prog/newbie.py +451 -451
  80. pyxllib/prog/pupil.py +1208 -1197
  81. pyxllib/prog/sitepackages.py +33 -33
  82. pyxllib/prog/specialist/__init__.py +348 -391
  83. pyxllib/prog/specialist/bc.py +203 -203
  84. pyxllib/prog/specialist/browser.py +497 -497
  85. pyxllib/prog/specialist/common.py +347 -347
  86. pyxllib/prog/specialist/datetime.py +198 -198
  87. pyxllib/prog/specialist/tictoc.py +240 -240
  88. pyxllib/prog/specialist/xllog.py +180 -180
  89. pyxllib/prog/xlosenv.py +110 -108
  90. pyxllib/stdlib/__init__.py +17 -17
  91. pyxllib/stdlib/tablepyxl/__init__.py +10 -10
  92. pyxllib/stdlib/tablepyxl/style.py +303 -303
  93. pyxllib/stdlib/tablepyxl/tablepyxl.py +130 -130
  94. pyxllib/text/__init__.py +8 -8
  95. pyxllib/text/ahocorasick.py +36 -39
  96. pyxllib/text/airscript.js +754 -744
  97. pyxllib/text/charclasslib.py +121 -121
  98. pyxllib/text/jiebalib.py +267 -267
  99. pyxllib/text/jinjalib.py +27 -32
  100. pyxllib/text/jsa_ai_prompt.md +271 -271
  101. pyxllib/text/jscode.py +922 -922
  102. pyxllib/text/latex/__init__.py +158 -158
  103. pyxllib/text/levenshtein.py +303 -303
  104. pyxllib/text/nestenv.py +1215 -1215
  105. pyxllib/text/newbie.py +300 -300
  106. pyxllib/text/pupil/__init__.py +8 -8
  107. pyxllib/text/pupil/common.py +1121 -1121
  108. pyxllib/text/pupil/xlalign.py +326 -326
  109. pyxllib/text/pycode.py +47 -47
  110. pyxllib/text/specialist/__init__.py +8 -8
  111. pyxllib/text/specialist/common.py +112 -112
  112. pyxllib/text/specialist/ptag.py +186 -186
  113. pyxllib/text/spellchecker.py +172 -172
  114. pyxllib/text/templates/echart_base.html +10 -10
  115. pyxllib/text/templates/highlight_code.html +16 -16
  116. pyxllib/text/templates/latex_editor.html +102 -102
  117. pyxllib/text/vbacode.py +17 -17
  118. pyxllib/text/xmllib.py +741 -747
  119. pyxllib/xl.py +42 -39
  120. pyxllib/xlcv.py +17 -17
  121. pyxllib-3.201.1.dist-info/METADATA +296 -0
  122. pyxllib-3.201.1.dist-info/RECORD +125 -0
  123. {pyxllib-0.3.197.dist-info → pyxllib-3.201.1.dist-info}/licenses/LICENSE +190 -190
  124. pyxllib/ext/old.py +0 -663
  125. pyxllib-0.3.197.dist-info/METADATA +0 -48
  126. pyxllib-0.3.197.dist-info/RECORD +0 -126
  127. {pyxllib-0.3.197.dist-info → pyxllib-3.201.1.dist-info}/WHEEL +0 -0
pyxllib/algo/stat.py CHANGED
@@ -1,494 +1,494 @@
1
- #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
- # @Author : 陈坤泽
4
- # @Email : 877362867@qq.com
5
- # @Date : 2021/06/03 23:04
6
-
7
- """ 统计方面的功能
8
-
9
- 主要是pandas、表格运算
10
- """
11
-
12
- import sys
13
- from collections import defaultdict, Counter
14
-
15
- import pandas as pd
16
-
17
- from pyxllib.prog.pupil import dprint, typename
18
- from pyxllib.file.specialist import XlPath
19
-
20
- pd.options.display.unicode.east_asian_width = True # 优化中文输出对齐问题
21
- try:
22
- pd.set_option('future.no_silent_downcasting', True)
23
- except Exception as e:
24
- pass
25
-
26
-
27
def treetable(childreds, parents, arg3=None, nodename_colname=None):
    """ Build a tree table from a child-id list and a parent-id list.

    The two lists must be the same length.
    Doc: http://note.youdao.com/noteshare?id=126200f45d301fcb4364d06a0cae8376

    Two call forms:
    >> treetable(childreds, parents) --> DataFrame (newly built)
    >> treetable(df, child_colname, parent_colname) --> DataFrame (df extended)

    For every row the function computes the DFS visiting order (tree_order),
    the node depth, the number of children (len_childs) and a textual
    tree_struct column. Virtual rows (a synthetic root) may be appended at
    the end when several parents have no row of their own.

    >> ls1 = [6, 2, 4, 5, 3], ls2 = [7, 1, 2, 2, 1], treetable(ls1, ls2)
          child_id parent_id  depth  tree_order tree_struct
        5        7      root      1           1 = = 7
        0        6         7      2           2 = = = = 6
        6        1      root      1           3 = = 1
        1        2         1      2           4 = = = = 2
        2        4         2      3           5 = = = = = = 4
        3        5         2      3           6 = = = = = = 5
        4        3         1      2           7 = = = = 3
    """
    # 0 normalize the two call forms
    if isinstance(childreds, pd.DataFrame):
        df = childreds
        child_colname = parents
        parent_colname = arg3
        if not arg3: raise TypeError
        childreds = df[child_colname].tolist()
        parents = df[parent_colname].tolist()
    else:
        df = None

    # 1 establish a single root so every node except the root has a row
    lefts = set(parents) - set(childreds)  # parent ids that never appear as a child
    cs, ps = list(childreds), list(parents)

    if len(lefts) == 0:
        # no candidate root => there must be a cycle
        raise ValueError('有环,不是树结构')
    elif len(lefts) == 1:  # exactly one unseen node: it is the root
        root = list(lefts)[0]
    else:  # several unrecorded parents: attach them all to a synthetic root
        root = 'root'
        allnode = set(parents) | set(childreds)  # every node id in the input
        while root in allnode: root += '-'  # append '-' until the name is unused
        # append the synthetic rows
        lefts = list(lefts)
        lefts.sort(key=lambda x: parents.index(x))
        for t in lefts:
            cs.append(t)
            ps.append(root)

    n = len(cs)
    depth, tree_order, len_childs = [-1] * n, [-1] * n, [0] * n

    # 2 parent id -> list of row indices of its children
    dd = defaultdict(list)
    for i in range(n): dd[ps[i]] += [i]

    # 3 DFS assigns visiting order and depth
    cnt = 1

    def dfs(node, d):
        """visit all children of node"""
        nonlocal cnt
        for i in dd.get(node, []):
            tree_order[i], depth[i], len_childs[i] = cnt, d, len(dd[cs[i]])
            cnt += 1
            dfs(cs[i], d + 1)

    dfs(root, 1)

    # 4 build the output
    tree_struct = list(map(lambda i: f"{'_ _ ' * depth[i]}{cs[i]}" + (f'[{len_childs[i]}]' if len_childs[i] else ''),
                           range(n)))

    if df is None:
        ls = list(zip(cs, ps, depth, tree_order, len_childs, tree_struct))
        df = pd.DataFrame.from_records(ls, columns=('child_id', 'parent_id',
                                                    'depth', 'tree_order', 'len_childs', 'tree_struct'))
    else:
        k = len(df)
        # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent
        extra = pd.DataFrame({child_colname: cs[k:], parent_colname: ps[k:]})
        df = pd.concat([df, extra], sort=False, ignore_index=True)
        if nodename_colname:
            tree_struct = list(
                map(lambda i: f"{'_ _ ' * depth[i]}{cs[i]} {df.iloc[i][nodename_colname]}"
                              + (f'[{len_childs[i]}]' if len_childs[i] else ''), range(n)))
        df['depth'], df['tree_order'], df['len_childs'], df['tree_struct'] = depth, tree_order, len_childs, tree_struct
    df.sort_values('tree_order', inplace=True)  # keep rows in DFS order
    return df
122
-
123
-
124
def treetable_flatten(df, *, reverse=False, childid_colname='id', parentid_colname='parent_id', format_colname=None):
    """ Flatten a tree table horizontally: one added column per ancestry level.

    :param df: source DataFrame; its index must be unique (ValueError otherwise)
    :param reverse:
        False: list levels top-down as depth0, depth1, depth2, ...
        True: list levels bottom-up as depth-1 (the node itself), depth-2, ...
    :param childid_colname: column holding the node id
    :param parentid_colname: column holding the parent node id
    :param format_colname: column whose value is displayed for each node
        NOTE(review): the original docstring claimed the default falls back to
        childid_colname, but the code below falls back to parentid_colname —
        confirm which is intended before relying on the default.
    :return: the same DataFrame with the depth columns added in place
    """
    # 1 build lookup tables
    if format_colname is None: format_colname = parentid_colname
    parentid = dict()  # parentid[k] = v: parent v of node k
    nodeval = dict()  # nodeval[k] = v: display value of node k
    if len(df[df.index.duplicated()]):
        dprint(len(set(df.index)), len(df.index))  # duplicated index values
        raise ValueError

    for idx, row in df.iterrows():
        parentid[row[childid_colname]] = row[parentid_colname]
        nodeval[row[childid_colname]] = str(row[format_colname])

    # 2 walk up from every node to collect the node plus all of its ancestors
    parents = []
    for idx, row in df.iterrows():
        ps = [nodeval[row[childid_colname]]]  # the node itself, then each ancestor
        p = row[parentid_colname]
        while p in parentid:
            ps.append(nodeval[p])
            p = parentid[p]
        parents.append(ps)
    num_depth = max(map(len, parents), default=0)

    # 3 materialize the level columns in the requested orientation
    df['parents'] = parents
    if reverse:
        for j in range(num_depth, 0, -1): df[f'depth-{j}'] = ''
        for idx, row in df.iterrows():
            for j in range(1, len(row.parents) + 1):
                df.loc[idx, f'depth-{j}'] = row.parents[j - 1]
    else:
        for j in range(num_depth): df[f'depth{j}'] = ''
        for idx, row in df.iterrows():
            for j in range(len(row.parents)):
                df.loc[idx, f'depth{j}'] = row.parents[-j - 1]
    df.drop('parents', axis=1, inplace=True)
    return df
175
-
176
-
177
def write_dataframes_to_excel(outfile, dataframes, order_mode='序号'):
    """ Write several DataFrames into one Excel file, one sheet per entry.

    :param str outfile: output xlsx path
    :param dict dataframes: sheet name -> DataFrame
    :param str order_mode: with the default '序号' a 1-based row-number column
        of that name is inserted; any other value keeps the original index as
        a '_index' column instead

    >> write_dataframes_to_excel('test.xlsx', {'images': df1, 'annotations': df2})

    # TODO after saving, the workbook could be polished further with openpyxl etc.

    Common single-level layouts get some cosmetic treatment (header format,
    frozen panes); multi-level index/columns are written with pandas'
    default layout untouched.
    """
    with pd.ExcelWriter(str(outfile), engine='xlsxwriter') as writer:
        head_format = writer.book.add_format({'font_size': 12, 'font_color': 'blue',
                                              'align': 'left', 'valign': 'vcenter'})
        for sheet_name, df in dataframes.items():
            if df.index.nlevels == 1 and df.columns.nlevels == 1:
                if order_mode == '序号':
                    # write the table with a generated row-number column
                    if '序号' not in df.columns:
                        df = df.copy()  # don't mutate the caller's DataFrame
                        df.insert(0, '序号', range(1, len(df) + 1))
                else:
                    df = df.reset_index()
                    df.columns = ['_index'] + list(df.columns[1:])
                df.to_excel(writer, sheet_name=sheet_name, freeze_panes=(1, 0), index=False)
            else:
                # multi-level layout: write as-is
                df.to_excel(writer, sheet_name=sheet_name, freeze_panes=(1, df.index.nlevels))

            # apply the blue header format to single-level column headers
            if df.columns.nlevels == 1:
                start = df.index.nlevels
                if start == 1:
                    start = 0  # single-level index was written without an index column
                for col_num, value in enumerate(df.columns, start=start):
                    writer.sheets[sheet_name].write(0, col_num, value, head_format)
215
-
216
-
217
def read_dataframes_from_excel(infile):
    """ Read every worksheet of an Excel file into a dict of DataFrames.

    :param str infile: path of the xlsx file to read
    :return: mapping of sheet name -> DataFrame; a '_index' helper column
        (written by write_dataframes_to_excel) is stripped when present
    :rtype: dict

    Not well suited to multi-level headers/indexes; in that case read the
    sheet manually with pd.read_excel(header=[0,1], index_col=[0,1,2]) etc.
    """
    result = {}
    with pd.ExcelFile(infile) as workbook:
        for name in workbook.sheet_names:
            sheet_df = pd.read_excel(workbook, sheet_name=name)
            if '_index' in sheet_df.columns:
                sheet_df = sheet_df.drop('_index', axis=1)
            result[name] = sheet_df
    return result
236
-
237
-
238
def update_dataframes_to_excel(outfile, dataframes, order_mode='序号'):
    """ Merge the given sheets into an xlsx file, rewriting the whole workbook.

    Existing sheets not mentioned in ``dataframes`` are preserved; mentioned
    ones are replaced.
    """
    outfile = XlPath(outfile)
    existing = read_dataframes_from_excel(outfile) if outfile.is_file() else {}
    existing.update(dataframes)
    write_dataframes_to_excel(outfile, existing, order_mode)
247
-
248
-
249
def xlpivot(df, index=None, columns=None, values=None):
    """ A convenience wrapper for building pandas pivot views.

    :param df: source table
    :param index: row grouping key(s) (str or list of str)
    :param columns: column grouping key(s) (str or list of str)
    :param values: what to display in each cell
        Callable[items, value]: a function computing the cell value from the
            matched rows
        dict[str, Callable]: output column name -> cell function
        list[str]: column names; raw values are shown, comma-joined when not
            unique (not a traditional pivot any more, but convenient)
    :return: the pivot view DataFrame

    Example:
        def func(items):  # receives all rows of one group
            x = items.iloc[0]
            return f'{x["precision"]:.0f},{x["recall"]:.0f},{x["hmean"]:.2f},{x["fps"]}'

        >> df2 = xlpivot(df, ['model_type'], ['dataset', 'total_frame'],
                         {'precision,recall,hmean,fps': func})

    Tip: to enforce a specific group ordering, prefix key values with
    sortable number prefixes.
    """

    # 1 normalize the grouping keys to lists
    def reset_groups(keys):
        if isinstance(keys, (list, tuple)):
            return list(keys)
        elif keys:
            return [keys]
        else:
            return []

    index_, columns_ = reset_groups(index), reset_groups(columns)

    # 2 normalize the value spec to {output column: function}
    def make_col_func(col):
        def func(rows):
            if len(rows):
                return ', '.join(map(str, rows[col].values))
            return ''

        return func

    if isinstance(values, (list, tuple)):
        values = {v: make_col_func(v) for v in values}

    if callable(values):
        values_ = {'values': values}
    elif isinstance(values, dict):
        values_ = values
    else:
        raise TypeError

    # 3 group and aggregate
    assert len(df), 'df是空的'

    keys = index_ + columns_
    dfgp = df.groupby(keys)
    data = defaultdict(list)
    for ks, items in dfgp:
        # pandas >= 2.0 yields 1-tuples when grouping by a one-element list of
        # keys, while older versions yield scalars; normalize to a tuple so
        # both work.
        if not isinstance(ks, tuple):
            ks = (ks,)
        # store the group-key values
        for i, k in enumerate(keys):
            data[k].append(ks[i])
        # store the computed cell values
        for k, func in values_.items():
            data[k].append(func(items))
    df2 = pd.DataFrame.from_dict(data)

    # 4 shape the final view
    if index and columns:
        view_table = df2.pivot(index=index, columns=columns, values=list(values_.keys()))
    elif index:
        view_table = df2.set_index(index_)
    else:  # columns only, no index: make the keys the index, then transpose
        # (the original passed index_ here, which is empty in this branch and
        # fails in pandas; columns_ is clearly what was intended)
        view_table = df2.set_index(columns_).T
    return view_table
327
-
328
-
329
def count_key_combinations(df, col_names, count_col_name='count'):
    """ Count how often each combination of the given columns occurs.

    :param df: source DataFrame
    :param col_names: e.g. ['a', 'b', 'c'] (a single str is also accepted)
    :param count_col_name: name of the added occurrence-count column, 'count'
        by default
    :return: a new DataFrame of the combinations, most frequent first

    Similar to (and can replace) SqlCodeGenerator's keys_count / one2many.
    """
    from collections import Counter

    # 0 accept a single column name
    if isinstance(col_names, str):
        col_names = [col_names]

    # 1 count each row-tuple of the selected columns
    combos = Counter(zip(*(df[name] for name in col_names)))

    # 2 assemble the result table, ordered by frequency
    records = [[*combo, freq] for combo, freq in combos.most_common()]
    return pd.DataFrame.from_records(records, columns=[*col_names, count_col_name])
355
-
356
-
357
def pareto_accumulate(weights, accuracy=0.01, *, print_mode=False, value_unit_type='K'):
    """ Pareto cumulative statistics over a set of weights.

    Useful to see which heavy items dominate the total: by the 80/20 rule,
    20% of the items often cover 80% of the weight.

    :param weights: the weight values to analyse
    :param accuracy: recording granularity as a fraction of the total weight;
        a new point is only recorded once the cumulative weight has grown by
        at least this much since the last point, so a long tail of tiny
        weights does not produce thousands of points (minimum step is 1%
        of the total by default)
    :param print_mode: if True, visualize the result (an echarts line chart
        on Windows, plain text labels elsewhere)
    :param value_unit_type: unit string passed to human_readable_number when
        formatting cumulative weights in the labels (e.g. 'K')
    :return: (pts, labels)
        pts: [[item_count, cumulative_weight], ...]
        labels: one human-readable description per point

    The internally recorded (count, threshold, cumulative) triples look like:
    >> pareto_accumulate([1, 2, 3, 9, 8, 7, 4, 6, 5])
    points == [(1, 9, 9), (2, 8, 17), (3, 7, 24), (4, 6, 30), (5, 5, 35),
               (6, 4, 39), (7, 3, 42), (8, 2, 44), (9, 1, 45)]
    >> pareto_accumulate([1, 2, 3, 9, 8, 7, 4, 6, 5], 0.1)
    points == [(1, 9, 9), (2, 8, 17), (3, 7, 24), (4, 6, 30), (5, 5, 35),
               (7, 3, 42), (9, 1, 45)]
    """
    # 1 collect the raw points
    weights = sorted(weights, reverse=True)
    if not weights:  # guard: empty input previously crashed on points[-1]
        return [], []

    points = []
    total = sum(weights)
    accuracy = total * accuracy

    acc = 0
    delta = 0
    for i, w in enumerate(weights, start=1):
        acc += w
        delta += w
        if delta >= accuracy:
            points.append((i, w, acc))
            delta = 0
    if delta:
        # record the final point if the tail never reached the threshold
        points.append((len(weights), weights[-1], acc))

    # 2 presentation
    def fmt(p):
        from pyxllib.prog.newbie import human_readable_number
        ls = [f'{human_readable_number(p[0], "万")}条≥{human_readable_number(p[1])}',
              f'{human_readable_number(p[2], value_unit_type)}({p[2] / total_size:.0%})']
        return ','.join(map(str, ls))

    total_size = points[-1][2]
    labels = [fmt(p) for p in points]

    pts = [[p[0], p[2]] for p in points]

    if print_mode:
        if sys.platform == 'win32':
            from pyxllib.data.echarts import Line
            from pyxllib.prog.specialist import browser

            x = Line()
            x.add_series('帕累托累积权重', pts, labels=labels, label={'position': 'right'})
            browser(x)
        else:
            print(*labels, sep='\n')

    return pts, labels
417
-
418
-
419
class XlDataFrame(pd.DataFrame):
    """ pd.DataFrame with extra inspection helpers. """

    def check_dtypes(self):
        """ Inspect the actual data types of every column.

        :return: a DataFrame with one row per column:
            name: the column name
            type: the dtype pandas reports
            detail: Counter.most_common() of the Python type names actually
                found in the column's values
        """
        declared = self.dtypes
        # DataFrame.iteritems was removed in pandas 2.0; items() is the
        # long-standing equivalent.
        records = [[col, declared[col], Counter([typename(x) for x in values]).most_common()]
                   for col, values in self.items()]
        return pd.DataFrame.from_records(records, columns=['name', 'type', 'detail'])
428
-
429
-
430
class ModifiableRow:
    """ A lightweight view of one DataFrame row that writes changes back.

    Reads and writes are forwarded directly to ``df.at[index, column]``,
    so assignments through this object mutate the underlying DataFrame.
    """

    def __init__(self, df, index):
        self.df = df
        self.index = index

    def __getitem__(self, column):
        # scalar read of one cell in the wrapped row
        return self.df.at[self.index, column]

    def __setitem__(self, column, value):
        # scalar write of one cell in the wrapped row
        self.df.at[self.index, column] = value
440
-
441
-
442
def print_full_dataframe(df):
    """ Print a DataFrame with all rows and columns, without truncation.

    Display options are only changed temporarily via option_context, which
    restores the caller's settings automatically on exit. (The original
    version additionally set pd.options.display.max_rows = 60 afterwards,
    which clobbered any user-configured value; option_context already
    restores it, so that line is removed.)

    :param pd.DataFrame df: the DataFrame to display in full
    """
    with pd.option_context('display.max_rows', None,
                           'display.max_columns', None,
                           'display.width', 1000,
                           'display.max_colwidth', None):
        print(df)
455
-
456
-
457
def custom_fillna(df, default_fill_value='', numeric_fill_value=None, specific_fill=None):
    """ Fill NaN values in a DataFrame with per-column flexibility.

    :param pandas.DataFrame df: the DataFrame to process (modified in place
        column by column, and also returned)
    :param str default_fill_value: fill value for NaN in non-numeric columns
    :param numeric_fill_value: fill value for NaN in numeric columns;
        numeric columns are skipped when this is None
    :param dict specific_fill: column name -> fill value overrides
    :return: the DataFrame with NaN filled according to the rules above

    >>> df = pd.DataFrame({'A': [1, 2, None], 'B': [None, 'x', 'y'], 'C': [None, None, None]})
    >>> custom_fillna(df, 'filled', 0, {'C': 'special'})
    """
    overrides = specific_fill or {}
    for col in df.columns:
        series = df[col]
        if col in overrides:
            # explicit per-column override wins
            df[col] = series.fillna(overrides[col])
        elif numeric_fill_value is not None and pd.api.types.is_numeric_dtype(series):
            # numeric columns get the numeric fill when one was supplied
            df[col] = series.fillna(numeric_fill_value)
        elif pd.api.types.is_object_dtype(series) or pd.api.types.is_string_dtype(series):
            # plain text/object columns get the default fill
            df[col] = series.fillna(default_fill_value)
        # other dtypes (e.g. datetime) are intentionally left untouched
    return df
481
-
482
-
483
def dataframe_to_list(df):
    """ Convert a DataFrame to a nested list: header row first, then data rows. """
    # column names form the first row; each data row follows as its own list
    return [df.columns.tolist(), *df.values.tolist()]
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ # @Author : 陈坤泽
4
+ # @Email : 877362867@qq.com
5
+ # @Date : 2021/06/03 23:04
6
+
7
+ """ 统计方面的功能
8
+
9
+ 主要是pandas、表格运算
10
+ """
11
+
12
+ import sys
13
+ from collections import defaultdict, Counter
14
+
15
+ import pandas as pd
16
+
17
+ from pyxllib.prog.pupil import dprint, typename
18
+ from pyxllib.file.specialist import XlPath
19
+
20
+ pd.options.display.unicode.east_asian_width = True # 优化中文输出对齐问题
21
+ try:
22
+ pd.set_option('future.no_silent_downcasting', True)
23
+ except Exception as e:
24
+ pass
25
+
26
+
27
+ def treetable(childreds, parents, arg3=None, nodename_colname=None):
28
+ """ 输入childres子结点id列表,和parents父结点id列表
29
+
30
+ 两个列表长度必须相等
31
+ 文档:http://note.youdao.com/noteshare?id=126200f45d301fcb4364d06a0cae8376
32
+
33
+ 有两种调用形式
34
+ >> treetable(childreds, parents) --> DataFrame (新建df)
35
+ >> treetable(df, child_colname, parent_colname) --> DataFrame (修改后的df)
36
+
37
+ 返回一个二维列表
38
+ 新的childreds (末尾可能回加虚结点)
39
+ 新的parents
40
+ 函数会计算每一行childred对应的树排序后的排序编号order
41
+ 以及每个节点深度depth
42
+
43
+ >> ls1 = [6, 2, 4, 5, 3], ls2 = [7, 1, 2, 2, 1], treetable(ls1, ls2)
44
+ child_id parent_id depth tree_order tree_struct
45
+ 5 7 root 1 1 = = 7
46
+ 0 6 7 2 2 = = = = 6
47
+ 6 1 root 1 3 = = 1
48
+ 1 2 1 2 4 = = = = 2
49
+ 2 4 2 3 5 = = = = = = 4
50
+ 3 5 2 3 6 = = = = = = 5
51
+ 4 3 1 2 7 = = = = 3
52
+ """
53
+ # 0 参数预处理
54
+ if isinstance(childreds, pd.DataFrame):
55
+ df = childreds
56
+ child_colname = parents
57
+ parent_colname = arg3
58
+ if not arg3: raise TypeError
59
+ childreds = df[child_colname].tolist()
60
+ parents = df[parent_colname].tolist()
61
+ else:
62
+ df = None
63
+
64
+ # 1 建立root根节点,确保除了root其他结点都存在记录
65
+ lefts = set(parents) - set(childreds) # parents列中没有在childreds出现的结点
66
+ cs, ps = list(childreds), list(parents)
67
+
68
+ if len(lefts) == 0:
69
+ # b_left为空一定有环,b_left不为空也不一定是正常的树
70
+ raise ValueError('有环,不是树结构')
71
+ elif len(lefts) == 1: # 只有一个未出现的结点,那么它既是根节点
72
+ root = list(lefts)[0]
73
+ else: # 多个父结点没有记录,则对这些父结点统一加一个root父结点
74
+ root = 'root'
75
+ allnode = set(parents) | set(childreds) # 所有结点集合
76
+ while root in allnode: root += '-' # 一直在末尾加'-',直到这个结点是输入里未出现的
77
+ # 添加结点
78
+ lefts = list(lefts)
79
+ lefts.sort(key=lambda x: parents.index(x))
80
+ for t in lefts:
81
+ cs.append(t)
82
+ ps.append(root)
83
+
84
+ n = len(cs)
85
+ depth, tree_order, len_childs = [-1] * n, [-1] * n, [0] * n
86
+
87
+ # 2 构造父结点-孩子结点的字典dd
88
+ dd = defaultdict(list)
89
+ for i in range(n): dd[ps[i]] += [i]
90
+
91
+ # 3 dfs
92
+ cnt = 1
93
+
94
+ def dfs(node, d):
95
+ """找node的所有子结点"""
96
+ nonlocal cnt
97
+ for i in dd.get(node, []):
98
+ tree_order[i], depth[i], len_childs[i] = cnt, d, len(dd[cs[i]])
99
+ cnt += 1
100
+ dfs(cs[i], d + 1)
101
+
102
+ dfs(root, 1)
103
+
104
+ # 4 输出格式
105
+ tree_struct = list(map(lambda i: f"{'_ _ ' * depth[i]}{cs[i]}" + (f'[{len_childs[i]}]' if len_childs[i] else ''),
106
+ range(n)))
107
+
108
+ if df is None:
109
+ ls = list(zip(cs, ps, depth, tree_order, len_childs, tree_struct))
110
+ df = pd.DataFrame.from_records(ls, columns=('child_id', 'parent_id',
111
+ 'depth', 'tree_order', 'len_childs', 'tree_struct'))
112
+ else:
113
+ k = len(df)
114
+ df = df.append(pd.DataFrame({child_colname: cs[k:], parent_colname: ps[k:]}), sort=False, ignore_index=True)
115
+ if nodename_colname:
116
+ tree_struct = list(
117
+ map(lambda i: f"{'_ _ ' * depth[i]}{cs[i]} {df.iloc[i][nodename_colname]}"
118
+ + (f'[{len_childs[i]}]' if len_childs[i] else ''), range(n)))
119
+ df['depth'], df['tree_order'], df['len_childs'], df['tree_struct'] = depth, tree_order, len_childs, tree_struct
120
+ df.sort_values('tree_order', inplace=True) # 注意有时候可能不能排序,要维持输入时候的顺序
121
+ return df
122
+
123
+
124
+ def treetable_flatten(df, *, reverse=False, childid_colname='id', parentid_colname='parent_id', format_colname=None):
125
+ """ 获得知识树横向展开表:列为depth-3, depth-2, depth-1,表示倒数第3级、倒数第2级、倒数第1级
126
+
127
+ :param df: DataFrame数据
128
+ :param reverse:
129
+ False,正常地罗列depth1、depth2、depth3...等结点信息
130
+ True,反向列举所属层级,即显示倒数第1层parent1,然后是倒数第2层parent2...
131
+ :param childid_colname: 孩子结点列
132
+ :param parentid_colname: 父结点列
133
+ :param format_colname: 显示的数值
134
+ None,默认采用 childid_colname 的值
135
+ str,某一列的名称,采用那一列的值(可以实现设置好格式)
136
+ :return:
137
+ """
138
+ # 1 构造辅助数组
139
+ if format_colname is None: format_colname = parentid_colname
140
+ parentid = dict() # parentid[k] = v, 存储结点k对应的父结点v
141
+ nodeval = dict() # nodeval[k] = v, 存储结点k需要显示的数值情况
142
+ if len(df[df.index.duplicated()]):
143
+ dprint(len(set(df.index)), len(df.index)) # 有重复index
144
+ raise ValueError
145
+
146
+ for idx, row in df.iterrows():
147
+ parentid[row[childid_colname]] = row[parentid_colname]
148
+ nodeval[row[childid_colname]] = str(row[format_colname])
149
+
150
+ # 2 每个结点往上遍历出所有父结点
151
+ parents = []
152
+ for idx, row in df.iterrows():
153
+ ps = [nodeval[row[childid_colname]]] # 包含结点自身的所有父结点名称
154
+ p = row[parentid_colname]
155
+ while p in parentid:
156
+ ps.append(nodeval[p])
157
+ p = parentid[p]
158
+ parents.append(ps)
159
+ num_depth = max(map(len, parents), default=0)
160
+
161
+ # 3 这里可以灵活调整最终要显示的格式效果
162
+ df['parents'] = parents
163
+ if reverse:
164
+ for j in range(num_depth, 0, -1): df[f'depth-{j}'] = ''
165
+ for idx, row in df.iterrows():
166
+ for j in range(1, len(row.parents) + 1):
167
+ df.loc[idx, f'depth-{j}'] = row.parents[j - 1]
168
+ else:
169
+ for j in range(num_depth): df[f'depth{j}'] = ''
170
+ for idx, row in df.iterrows():
171
+ for j in range(len(row.parents)):
172
+ df.loc[idx, f'depth{j}'] = row.parents[-j - 1]
173
+ df.drop('parents', axis=1, inplace=True)
174
+ return df
175
+
176
+
177
+ def write_dataframes_to_excel(outfile, dataframes, order_mode='序号'):
178
+ """ 将多个DataFrame表格写入一个Excel文件,并添加序号列
179
+
180
+ :param str outfile: 输出的Excel文件路径
181
+ :param dict dataframes: 包含要保存的DataFrame的字典,键为sheet名,值为DataFrame
182
+ :param str order_mode: 序号模式,可选值为 'default' 或 '序号',默认为 '序号'
183
+
184
+ >> write_dataframes_to_excel('test.xlsx', {'images': df1, 'annotations': df2})
185
+
186
+ # TODO 存成表格后,可以使用openpyxl等库再打开表格精修
187
+
188
+ 实现上,尽可能在一些常见结构上,进行一些格式美化。但对费常规结构,就保留df默认排版效果,不做特殊处理。
189
+ """
190
+ with pd.ExcelWriter(str(outfile), engine='xlsxwriter') as writer:
191
+ head_format = writer.book.add_format({'font_size': 12, 'font_color': 'blue',
192
+ 'align': 'left', 'valign': 'vcenter'})
193
+ for sheet_name, df in dataframes.items():
194
+ if df.index.nlevels == 1 and df.columns.nlevels == 1:
195
+ if order_mode == '序号':
196
+ # 写入带有序号列的数据表格
197
+ if '序号' not in df.columns:
198
+ df = df.copy()
199
+ df.insert(0, '序号', range(1, len(df) + 1))
200
+ else:
201
+ df = df.reset_index()
202
+ df.columns = ['_index'] + list(df.columns[1:])
203
+ df.to_excel(writer, sheet_name=sheet_name, freeze_panes=(1, 0), index=False)
204
+ else:
205
+ # 写入普通的数据表格
206
+ df.to_excel(writer, sheet_name=sheet_name, freeze_panes=(1, df.index.nlevels))
207
+
208
+ # 设置表头格式
209
+ if df.columns.nlevels == 1:
210
+ start = df.index.nlevels
211
+ if start == 1:
212
+ start = 0
213
+ for col_num, value in enumerate(df.columns, start=start):
214
+ writer.sheets[sheet_name].write(0, col_num, value, head_format)
215
+
216
+
217
+ def read_dataframes_from_excel(infile):
218
+ """ 从Excel文件读取多个DataFrame表格
219
+
220
+ :param str infile: Excel文件路径
221
+ :return: 包含读取的DataFrame的字典,键为工作表名,值为DataFrame
222
+ :rtype: dict
223
+
224
+ 注意这个函数不太适用于与读取多级index和多级columns的情况,建议遇到这种情况,手动读取,
225
+ read_excel可以设置header=[0,1]、index=[0,1,2]的形式来定制表头所在位置。
226
+ """
227
+ dataframes = {}
228
+ with pd.ExcelFile(infile) as xls:
229
+ sheet_names = xls.sheet_names
230
+ for sheet_name in sheet_names:
231
+ df = pd.read_excel(xls, sheet_name=sheet_name)
232
+ if '_index' in df.columns:
233
+ df = df.drop('_index', axis=1)
234
+ dataframes[sheet_name] = df
235
+ return dataframes
236
+
237
+
238
+ def update_dataframes_to_excel(outfile, dataframes, order_mode='序号'):
239
+ """ 更新xlsx文件中的sheets数据 """
240
+ outfile = XlPath(outfile)
241
+ if outfile.is_file():
242
+ data = read_dataframes_from_excel(outfile)
243
+ else:
244
+ data = {}
245
+ data.update(dataframes)
246
+ write_dataframes_to_excel(outfile, data, order_mode)
247
+
248
+
249
+ def xlpivot(df, index=None, columns=None, values=None):
250
+ """ 对pandas进行封装的数据透视表功能
251
+
252
+ :param df: 数据表
253
+ :param index: 行划分方式
254
+ :param columns: 列划分方式
255
+ :param values: 显示的值
256
+ Callable[items, value]:输出一个函数
257
+ list[str]: 支持输入属性列表,表示显示原始值的意思。如果原始值不唯一,则逗号分开拼接后显示。但这种用法就不太算是传统意义的数据透视表了
258
+ :return: 数据透视表的表格
259
+
260
+ 使用示例:
261
+ def func(items): # 输入匹配的多行数据
262
+ x = items.iloc[0]
263
+ return f'{x["precision"]:.0f},{x["recall"]:.0f},{x["hmean"]:.2f},{x["fps"]}' # 返回显示的值
264
+
265
+ >> df2 = xlpivot(df, ['model_type'], ['dataset', 'total_frame'],
266
+ {'precision,recall,hmean,fps': func})
267
+
268
+ 注意技巧:如果要在分组后约束特定顺序,可以使用特殊前缀进行编号对齐
269
+ """
270
+
271
+ # 1 将分组的格式标准化
272
+ def reset_groups(keys):
273
+ if isinstance(keys, (list, tuple)):
274
+ return list(keys)
275
+ elif keys:
276
+ return [keys]
277
+ else:
278
+ return []
279
+
280
+ index_, columns_ = reset_groups(index), reset_groups(columns)
281
+
282
+ # 2 目标值的格式标准化
283
+ def make_col_func(col):
284
+ def func(rows):
285
+ if len(rows):
286
+ return ', '.join(map(str, rows[col].values))
287
+ return ''
288
+
289
+ return func
290
+
291
+ if isinstance(values, (list, tuple)):
292
+ values = {v: make_col_func(v) for v in values}
293
+
294
+ if callable(values):
295
+ values_ = {'values': values}
296
+ elif isinstance(values, dict):
297
+ values_ = values
298
+ else:
299
+ raise TypeError
300
+
301
+ # 3 分组
302
+ assert len(df), 'df是空的'
303
+
304
+ keys = index_ + columns_
305
+ dfgp = df.groupby(keys)
306
+ data = defaultdict(list)
307
+ for ks, items in dfgp:
308
+ # 要存储分组(keys)相关的值
309
+ if len(keys) == 1:
310
+ data[keys[0]].append(ks)
311
+ else:
312
+ for i, k in enumerate(keys):
313
+ data[k].append(ks[i])
314
+ # 再存储生成的值
315
+ for k, func in values_.items():
316
+ data[k].append(func(items))
317
+ df2 = pd.DataFrame.from_dict(data)
318
+
319
+ # 4 可视化表格
320
+ if index and columns:
321
+ view_table = df2.pivot(index=index, columns=columns, values=list(values_.keys()))
322
+ elif index:
323
+ view_table = df2.set_index(index_)
324
+ else: # 只有columns,没有index
325
+ view_table = df2.set_index(index_).T
326
+ return view_table
327
+
328
+
329
def count_key_combinations(df, col_names, count_col_name='count'):
    """ Count how often each combination of values in the given columns occurs.

    :param df: source DataFrame
    :param col_names: a single column name or a list of names, e.g. ['a', 'b', 'c']
    :param count_col_name: name of the added occurrence-count column, 'count' by default
    :return: a new DataFrame, one row per combination, most frequent first

    Similar to SqlCodeGenerator's keys_count / one2many features and can replace both.
    """
    from collections import Counter

    # 0 normalize a single column name into a list
    if isinstance(col_names, str):
        col_names = [col_names]

    # 1 tally each row's tuple of key values
    counter = Counter(zip(*(df[name] for name in col_names)))

    # 2 build the statistics table, sorted by descending frequency
    records = [[*combo, freq] for combo, freq in counter.most_common()]
    return pd.DataFrame.from_records(records, columns=list(col_names) + [count_col_name])
355
+
356
+
357
def pareto_accumulate(weights, accuracy=0.01, *, print_mode=False, value_unit_type='K'):
    """ Pareto (cumulative-weight) analysis of a collection of weights.

    Useful for spotting the dominant weights/frequencies: by the 80/20 rule,
    roughly 20% of the items often account for 80% of the total.

    :param weights: an iterable of numeric weights
    :param accuracy: recording granularity, as a fraction of the grand total.
        Near the tail there may be a huge number of tiny weights; a new point
        is only recorded once the weight accumulated since the previous point
        reaches ``accuracy`` of the total (so the minimum step is 1% by default).
    :param print_mode: if True, visualize the result (echarts line chart on
        Windows, plain-text labels elsewhere)
    :param value_unit_type: unit string forwarded to human_readable_number
        when formatting the cumulative-weight part of each label
    :return: (pts, labels)
        pts: [[item count, cumulative weight], ...] for each recorded point
        labels: a human-readable label string per point
        For empty ``weights`` both lists are empty.

    Examples (the pts component):
    >> pareto_accumulate([1, 2, 3, 9, 8, 7, 4, 6, 5])
    [[1, 9], [2, 17], [3, 24], [4, 30], [5, 35], [6, 39], [7, 42], [8, 44], [9, 45]]
    >> pareto_accumulate([1, 2, 3, 9, 8, 7, 4, 6, 5], 0.1)
    [[1, 9], [2, 17], [3, 24], [4, 30], [5, 35], [7, 42], [9, 45]]
    """
    # 1 sort descending and accumulate
    points = []  # recorded checkpoints: (item count, current threshold weight, cumulative weight)
    weights = sorted(weights, reverse=True)
    if not weights:
        # guard: the old code crashed with IndexError on points[-1] for empty input
        return [], []

    total = sum(weights)
    accuracy = total * accuracy  # convert the fraction into an absolute weight step

    acc = 0  # running cumulative weight
    delta = 0  # weight accumulated since the last recorded point
    for i, w in enumerate(weights, start=1):
        acc += w
        delta += w
        if delta >= accuracy:
            points.append((i, w, acc))
            delta = 0
    if delta:  # make sure the tail of the distribution is recorded
        points.append((len(weights), weights[-1], acc))

    # 2 presentation
    total_size = points[-1][2]  # grand total, used for the percentage in labels

    def fmt(p):
        from pyxllib.prog.newbie import human_readable_number
        ls = [f'{human_readable_number(p[0], "万")}条≥{human_readable_number(p[1])}',
              f'{human_readable_number(p[2], value_unit_type)}({p[2] / total_size:.0%})']
        return ','.join(map(str, ls))

    labels = [fmt(p) for p in points]
    pts = [[p[0], p[2]] for p in points]

    if print_mode:
        if sys.platform == 'win32':
            from pyxllib.data.echarts import Line
            from pyxllib.prog.specialist import browser

            x = Line()
            x.add_series('帕累托累积权重', pts, labels=labels, label={'position': 'right'})
            browser(x)
        else:
            print(*labels, sep='\n')

    return pts, labels
417
+
418
+
419
class XlDataFrame(pd.DataFrame):
    """ pd.DataFrame subclass carrying extra inspection helpers. """

    def check_dtypes(self):
        """ Inspect the actual data types stored in each column.

        :return: a DataFrame with three columns:
            name: the column name
            type: the dtype pandas reports for that column
            detail: frequency of the actual per-cell type names, most common first
        """
        d = self.dtypes
        # DataFrame.iteritems() was removed in pandas 2.0; .items() is the
        # equivalent, available since pandas 0.x, so this works on both eras.
        ls = [[k, d[k], Counter([typename(x) for x in v]).most_common()] for k, v in self.items()]
        df = pd.DataFrame.from_records(ls, columns=['name', 'type', 'detail'])
        return df
428
+
429
+
430
class ModifiableRow:
    """ A lightweight writable view of one DataFrame row.

    Reading a key returns the cell value; assigning a key writes the value
    straight back into the underlying DataFrame via ``.at``.
    """

    def __init__(self, df, index):
        self.df = df  # the DataFrame this view wraps
        self.index = index  # row label within that DataFrame

    def __getitem__(self, key):
        # delegate the lookup to the fast scalar accessor
        return self.df.at[self.index, key]

    def __setitem__(self, key, value):
        # write-through: mutates the wrapped DataFrame in place
        self.df.at[self.index, key] = value
440
+
441
+
442
def print_full_dataframe(df):
    """ Temporarily lift pandas display limits and print a DataFrame in full.

    :param pd.DataFrame df: the DataFrame to print completely

    All display options are changed only inside the ``option_context`` block,
    which restores the previous settings on exit — the caller's global pandas
    configuration is left untouched.  (The previous implementation additionally
    reset ``display.max_rows`` to a hard-coded 60 afterwards, silently
    clobbering any user-configured value; that redundant global mutation has
    been removed.)
    """
    with pd.option_context('display.max_rows', None,
                           'display.max_columns', None,
                           'display.width', 1000,
                           'display.max_colwidth', None):
        print(df)
455
+
456
+
457
def custom_fillna(df, default_fill_value='', numeric_fill_value=None, specific_fill=None):
    """ Fill NaN values in a DataFrame with per-column flexibility.

    :param pandas.DataFrame df: the DataFrame to process (modified in place and returned)
    :param str default_fill_value: fill value for NaN in non-numeric columns
    :param numeric_fill_value: fill value for NaN in numeric columns; skipped when None
    :param dict specific_fill: optional {column name: fill value} overrides; skipped when None
    :return: the DataFrame with NaN values filled according to the rules above

    >>> df = pd.DataFrame({'A': [1, 2, None], 'B': [None, 'x', 'y'], 'C': [None, None, None]})
    >>> custom_fillna(df, 'filled', 0, {'C': 'special'})
    """
    for name in df.columns:
        col = df[name]
        # an explicit per-column override wins over every other rule
        if specific_fill and name in specific_fill:
            df[name] = col.fillna(specific_fill[name])
        # numeric columns are filled only when a numeric fill value was supplied
        elif numeric_fill_value is not None and pd.api.types.is_numeric_dtype(col):
            df[name] = col.fillna(numeric_fill_value)
        # object/string-like columns fall back to the default fill value
        elif pd.api.types.is_object_dtype(col) or pd.api.types.is_string_dtype(col):
            df[name] = col.fillna(default_fill_value)
        # other dtypes (e.g. datetime) are deliberately left untouched
    return df
481
+
482
+
483
def dataframe_to_list(df):
    """ Flatten a DataFrame into a list of rows: the header first, then the data.

    :param df: the DataFrame to convert
    :return: [[col, ...], [row0 values...], [row1 values...], ...]
    """
    header = df.columns.tolist()  # column names become element 0
    rows = df.values.tolist()  # each data row as a plain Python list
    return [header, *rows]