pyxllib 0.3.197__py3-none-any.whl → 0.3.200__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyxllib/__init__.py +21 -21
- pyxllib/algo/__init__.py +8 -8
- pyxllib/algo/disjoint.py +54 -54
- pyxllib/algo/geo.py +541 -541
- pyxllib/algo/intervals.py +964 -964
- pyxllib/algo/matcher.py +389 -389
- pyxllib/algo/newbie.py +166 -166
- pyxllib/algo/pupil.py +629 -629
- pyxllib/algo/shapelylib.py +67 -67
- pyxllib/algo/specialist.py +241 -241
- pyxllib/algo/stat.py +494 -494
- pyxllib/algo/treelib.py +149 -149
- pyxllib/algo/unitlib.py +66 -66
- pyxllib/autogui/__init__.py +5 -5
- pyxllib/autogui/activewin.py +246 -246
- pyxllib/autogui/all.py +9 -9
- pyxllib/autogui/autogui.py +852 -852
- pyxllib/autogui/uiautolib.py +362 -362
- pyxllib/autogui/virtualkey.py +102 -102
- pyxllib/autogui/wechat.py +827 -827
- pyxllib/autogui/wechat_msg.py +421 -421
- pyxllib/autogui/wxautolib.py +84 -84
- pyxllib/cv/__init__.py +5 -5
- pyxllib/cv/expert.py +267 -267
- pyxllib/cv/imfile.py +159 -159
- pyxllib/cv/imhash.py +39 -39
- pyxllib/cv/pupil.py +9 -9
- pyxllib/cv/rgbfmt.py +1525 -1525
- pyxllib/cv/slidercaptcha.py +137 -137
- pyxllib/cv/trackbartools.py +251 -251
- pyxllib/cv/xlcvlib.py +1040 -1040
- pyxllib/cv/xlpillib.py +423 -423
- pyxllib/data/echarts.py +240 -240
- pyxllib/data/jsonlib.py +89 -89
- pyxllib/data/oss.py +72 -72
- pyxllib/data/pglib.py +1127 -1127
- pyxllib/data/sqlite.py +568 -568
- pyxllib/data/sqllib.py +297 -297
- pyxllib/ext/JLineViewer.py +505 -505
- pyxllib/ext/__init__.py +6 -6
- pyxllib/ext/demolib.py +246 -246
- pyxllib/ext/drissionlib.py +277 -277
- pyxllib/ext/kq5034lib.py +12 -12
- pyxllib/ext/old.py +663 -663
- pyxllib/ext/qt.py +449 -449
- pyxllib/ext/robustprocfile.py +497 -497
- pyxllib/ext/seleniumlib.py +76 -76
- pyxllib/ext/tk.py +173 -173
- pyxllib/ext/unixlib.py +827 -827
- pyxllib/ext/utools.py +351 -351
- pyxllib/ext/webhook.py +124 -119
- pyxllib/ext/win32lib.py +40 -40
- pyxllib/ext/wjxlib.py +88 -88
- pyxllib/ext/wpsapi.py +124 -124
- pyxllib/ext/xlwork.py +9 -9
- pyxllib/ext/yuquelib.py +1105 -1105
- pyxllib/file/__init__.py +17 -17
- pyxllib/file/docxlib.py +761 -761
- pyxllib/file/gitlib.py +309 -309
- pyxllib/file/libreoffice.py +165 -165
- pyxllib/file/movielib.py +148 -148
- pyxllib/file/newbie.py +10 -10
- pyxllib/file/onenotelib.py +1469 -1469
- pyxllib/file/packlib/__init__.py +330 -330
- pyxllib/file/packlib/zipfile.py +2441 -2441
- pyxllib/file/pdflib.py +426 -426
- pyxllib/file/pupil.py +185 -185
- pyxllib/file/specialist/__init__.py +685 -685
- pyxllib/file/specialist/dirlib.py +799 -799
- pyxllib/file/specialist/download.py +193 -193
- pyxllib/file/specialist/filelib.py +2829 -2829
- pyxllib/file/xlsxlib.py +3131 -3131
- pyxllib/file/xlsyncfile.py +341 -341
- pyxllib/prog/__init__.py +5 -5
- pyxllib/prog/cachetools.py +64 -64
- pyxllib/prog/deprecatedlib.py +233 -233
- pyxllib/prog/filelock.py +42 -42
- pyxllib/prog/ipyexec.py +253 -253
- pyxllib/prog/multiprogs.py +940 -940
- pyxllib/prog/newbie.py +451 -451
- pyxllib/prog/pupil.py +1197 -1197
- pyxllib/prog/sitepackages.py +33 -33
- pyxllib/prog/specialist/__init__.py +391 -391
- pyxllib/prog/specialist/bc.py +203 -203
- pyxllib/prog/specialist/browser.py +497 -497
- pyxllib/prog/specialist/common.py +347 -347
- pyxllib/prog/specialist/datetime.py +198 -198
- pyxllib/prog/specialist/tictoc.py +240 -240
- pyxllib/prog/specialist/xllog.py +180 -180
- pyxllib/prog/xlosenv.py +108 -108
- pyxllib/stdlib/__init__.py +17 -17
- pyxllib/stdlib/tablepyxl/__init__.py +10 -10
- pyxllib/stdlib/tablepyxl/style.py +303 -303
- pyxllib/stdlib/tablepyxl/tablepyxl.py +130 -130
- pyxllib/text/__init__.py +8 -8
- pyxllib/text/ahocorasick.py +39 -39
- pyxllib/text/airscript.js +744 -744
- pyxllib/text/charclasslib.py +121 -121
- pyxllib/text/jiebalib.py +267 -267
- pyxllib/text/jinjalib.py +32 -32
- pyxllib/text/jsa_ai_prompt.md +271 -271
- pyxllib/text/jscode.py +922 -922
- pyxllib/text/latex/__init__.py +158 -158
- pyxllib/text/levenshtein.py +303 -303
- pyxllib/text/nestenv.py +1215 -1215
- pyxllib/text/newbie.py +300 -300
- pyxllib/text/pupil/__init__.py +8 -8
- pyxllib/text/pupil/common.py +1121 -1121
- pyxllib/text/pupil/xlalign.py +326 -326
- pyxllib/text/pycode.py +47 -47
- pyxllib/text/specialist/__init__.py +8 -8
- pyxllib/text/specialist/common.py +112 -112
- pyxllib/text/specialist/ptag.py +186 -186
- pyxllib/text/spellchecker.py +172 -172
- pyxllib/text/templates/echart_base.html +10 -10
- pyxllib/text/templates/highlight_code.html +16 -16
- pyxllib/text/templates/latex_editor.html +102 -102
- pyxllib/text/vbacode.py +17 -17
- pyxllib/text/xmllib.py +747 -747
- pyxllib/xl.py +42 -39
- pyxllib/xlcv.py +17 -17
- {pyxllib-0.3.197.dist-info → pyxllib-0.3.200.dist-info}/METADATA +1 -1
- pyxllib-0.3.200.dist-info/RECORD +126 -0
- {pyxllib-0.3.197.dist-info → pyxllib-0.3.200.dist-info}/licenses/LICENSE +190 -190
- pyxllib-0.3.197.dist-info/RECORD +0 -126
- {pyxllib-0.3.197.dist-info → pyxllib-0.3.200.dist-info}/WHEEL +0 -0
@@ -1,391 +1,391 @@
|
|
1
|
-
#!/usr/bin/env python3
|
2
|
-
# -*- coding: utf-8 -*-
|
3
|
-
# @Author : 陈坤泽
|
4
|
-
# @Email : 877362867@qq.com
|
5
|
-
# @Date : 2021/06/06 18:40
|
6
|
-
|
7
|
-
|
8
|
-
from pyxllib.prog.specialist.common import *
|
9
|
-
from pyxllib.prog.specialist.xllog import *
|
10
|
-
from pyxllib.prog.specialist.browser import *
|
11
|
-
from pyxllib.prog.specialist.bc import *
|
12
|
-
from pyxllib.prog.specialist.tictoc import *
|
13
|
-
from pyxllib.prog.specialist.datetime import *
|
14
|
-
|
15
|
-
import concurrent.futures
|
16
|
-
import os
|
17
|
-
import re
|
18
|
-
import subprocess
|
19
|
-
import time
|
20
|
-
from statistics import mean
|
21
|
-
from threading import Thread
|
22
|
-
|
23
|
-
from tqdm import tqdm
|
24
|
-
import requests
|
25
|
-
from humanfriendly import parse_size
|
26
|
-
|
27
|
-
from pyxllib.prog.newbie import human_readable_size
|
28
|
-
from pyxllib.prog.pupil import get_installed_packages, aligned_range, percentage_and_value
|
29
|
-
from pyxllib.prog.xlosenv import XlOsEnv
|
30
|
-
from pyxllib.file.specialist import cache_file
|
31
|
-
|
32
|
-
|
33
|
-
def mtqdm(func, iterable, *args, max_workers=1, check_per_seconds=0.01, **kwargs):
    """Apply ``func`` to every item of ``iterable`` behind a tqdm progress bar.

    The extra "m" in the name stands for "multi": the calls can optionally be
    fanned out to a thread pool or a process pool. Return values of ``func``
    are discarded. Extra ``*args`` / ``**kwargs`` are forwarded to ``tqdm``.

    :param func: callable taking one item. In process mode it is sent to the
        worker processes, so it (and the items) must be picklable, i.e. a
        module-level function rather than a lambda or closure.
    :param iterable: the items to process
    :param max_workers: 1 (default) runs serially in the calling thread;
        None lets ThreadPoolExecutor pick the number of threads;
        n > 1 uses n threads; a negative value -n uses n worker processes
    :param smoothing: tqdm's own default is 0.3; here it defaults to 0, which
        turns off the exponential moving average so the bar shows the overall
        average speed instead of an estimate of the current speed
    :param mininterval: tqdm's own default is 0.1 seconds; here it defaults
        to 1 second because refreshing more often is not needed
    :param check_per_seconds: thread mode only. While the pool still has
        queued work, submission pauses and the queue is re-checked every this
        many seconds. A larger value means fewer queue checks.
    :raises Exception: the first exception raised by ``func``. In pooled
        modes it is re-raised in the caller after the pool has been shut down.
    """
    # 0 Personal defaults; explicit caller values win
    kwargs.setdefault('smoothing', 0)
    kwargs.setdefault('mininterval', 1)

    # 1 Single worker: skip the executor machinery entirely, which is faster
    if max_workers == 1:
        for x in tqdm(iterable, *args, **kwargs):
            func(x)
        return

    # 2 Pool workers do not interrupt the submitting loop when a task fails,
    #   so remember the first failure and surface it from the caller's thread.
    #   Done-callbacks run in the parent process for both pool types, which
    #   (unlike a closure executed in the worker) also works for processes.
    first_error = None

    def remember_failure(future):
        nonlocal first_error
        if future.cancelled():
            return
        exc = future.exception()
        if exc is not None and first_error is None:
            first_error = exc

    # 3 Pick the pool type. None is compared explicitly because
    #   ``None > 1`` raises TypeError on Python 3.
    if max_workers is None or max_workers > 1:
        executor = concurrent.futures.ThreadPoolExecutor(max_workers)
        throttle = True
    else:
        # max_workers == 0 ends up here and is rejected by the executor itself
        executor = concurrent.futures.ProcessPoolExecutor(-max_workers)
        throttle = False

    try:
        for x in tqdm(iterable, *args, **kwargs):
            if throttle:
                # Back-pressure: wait until the pool's internal queue drained,
                # so the bar follows real progress rather than submissions.
                # NOTE: _work_queue is a private ThreadPoolExecutor attribute.
                while executor._work_queue.qsize():
                    if check_per_seconds:
                        time.sleep(check_per_seconds)
            executor.submit(func, x).add_done_callback(remember_failure)
            if first_error is not None:
                break
    finally:
        # Always wait for running tasks and release the workers
        executor.shutdown()

    # Also covers failures of the last tasks, which finish after the loop
    if first_error is not None:
        raise first_error
|
91
|
-
|
92
|
-
|
93
|
-
def distribute_package(root, version=None, repository=None, *,
                       upload=True,
                       version_file='setup.py',
                       delete_dist=True):
    """Build a source distribution of a project and optionally upload it with twine.

    :param root: project root directory, e.g. 'D:/slns/pyxllib'; it must
        contain setup.py. The working directory of the process is changed to
        this directory and is not restored afterwards.
    :param version: new version string. When truthy, every ``version = '...'``
        (or double-quoted) assignment found in ``version_file`` is rewritten
        to this value before building.
    :param repository: name of a repository configured for twine, e.g. 'xlpr';
        passed to twine as ``-r``
    :param upload: whether to run ``twine upload`` after building
    :param version_file: file holding the version number; it is only updated
        if it contains text matching the regular expression used below
    :param delete_dist: whether to delete the dist directory after uploading;
        turn this off to inspect what was uploaded
    """
    import sys
    from pyxllib.file.specialist import XlPath

    # 1 Switch to the project directory (relative paths below rely on it)
    os.chdir(str(root))

    # 2 Update the version number
    if version:
        f = XlPath(root) / version_file
        # A function replacement inserts the version literally, so backslashes
        # or digits in it are never read as regex escapes / group references.
        s = re.sub(r"(version\s*=\s*)(['\"])(.+?)(\2)",
                   lambda m: f'{m.group(1)}{m.group(2)}{version}{m.group(4)}',
                   f.read_text())
        f.write_text(s)

    # 3 Build. An argument list (not one command string) is required for
    #   subprocess.run without shell=True to work on POSIX, and it also copes
    #   with spaces in the interpreter path.
    subprocess.run([sys.executable, 'setup.py', 'sdist'])

    # 4 Upload
    if upload:
        # presumably twine expands the 'dist/*' pattern itself, since no
        # shell is involved here - TODO confirm against twine's documentation
        cmd = ['twine', 'upload', 'dist/*']
        if repository:
            cmd += ['-r', str(repository)]
        subprocess.run(cmd)
        # Remove the intermediate files produced by the build.
        # NOTE(review): indentation was lost in the reviewed view of this
        # file; the nesting of these two deletes was reconstructed - confirm.
        if delete_dist:
            XlPath('dist').delete()
        XlPath('build').delete()

        # Do not delete *.egg-info: importlib would no longer be able to read
        # the module's version number.
|
134
|
-
|
135
|
-
|
136
|
-
def estimate_package_size(package):
    """Estimate how much storage a package takes, from its PyPI project page.

    The page is fetched through ``cache_file`` under the key
    ``package + '.pypi'`` (per the original note this keeps a cached copy so
    the page is not downloaded repeatedly). Every size written in parentheses
    on the page is parsed with ``parse_size`` and the mean of those sizes is
    returned; 0 is returned when no size is found.

    :param package: package name as used in the PyPI project URL
    :return: mean of the parsed sizes
    """

    def fetch_listing():
        response = requests.get(f'https://pypi.org/project/{package}/#files')
        # A package that PyPI does not know counts as 0 MB
        return '(0 MB' if response.status_code == 404 else response.text

    page = cache_file(package + '.pypi', fetch_listing)
    # Original note: units here step by x1000, not x1024
    size_texts = re.findall(r'\((\d+(?:\.\d+)?\s*\wB(?:ytes)?)', page)
    sizes = [parse_size(text) for text in size_texts]
    return mean(sizes or [0])
|
152
|
-
|
153
|
-
|
154
|
-
def estimate_pip_packages(*, print_mode=False):
    """Estimate the size of every installed package, largest first.

    :param print_mode:
        0 (or False): print nothing, only return the result
        1: print the final sorted listing
        2: additionally print each package as soon as it has been estimated
    :return: [(package_name, package_size), ...] sorted by descending size,
        ties broken by name
    """
    show_steps = print_mode > 1

    sizes = []
    for name in get_installed_packages():
        size = estimate_package_size(name)
        sizes.append((name, size))
        # Formatted unconditionally, exactly as the unconditional call did
        line = f"{name}: {human_readable_size(size)}"
        if show_steps:
            print(line)

    sizes.sort(key=lambda item: (-item[1], item[0]))

    if print_mode > 0:
        if show_steps:
            # Separator between the running log and the sorted listing
            print('- ' * 20)
        for name, size in sizes:
            print(f"{name}: {human_readable_size(size)}")
    return sizes
|
181
|
-
|
182
|
-
|
183
|
-
class ProgressBar:
    """Show a progress bar for a task that may take a long time.

    The bar is drawn by a background thread; the caller only has to keep the
    ``progress`` attribute up to date.

    Example:
        with ProgressBar(100) as pb:
            for i in range(100):
                time.sleep(0.1)      # simulated slow work
                pb.progress = i + 1  # report progress
    """

    def __init__(self, total):
        self.stop_flag = False  # set to True to make the display thread finish
        self.progress = 0  # current progress, written by the caller
        self.total = total  # progress value that means "done"

    def __enter__(self):
        # Start the thread that draws the bar
        worker = Thread(target=self.display_progress)
        self.progress_thread = worker
        worker.start()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Force the bar to 100 %, then stop the display thread and wait for it
        self.progress, self.stop_flag = self.total, True
        self.progress_thread.join()

    def display_progress(self):
        """Thread body: copy ``progress`` into the bar about once per second until stopped."""
        with tqdm(total=self.total) as bar:
            while True:
                bar.n = self.progress
                bar.refresh()
                if self.stop_flag:
                    # The refresh just above was the final one
                    break
                time.sleep(1)
|
219
|
-
|
220
|
-
|
221
|
-
class BitMaskTool:
    """Helper for analysing integers that are used as bit masks.

    Terminology
        bitval: the value of one single bit position, 0 or 1
        bitsum: the sum of all set bit values, i.e. the mask as an integer
    """

    def __init__(self, bit_names=None, bitsum_counter=None):
        """Create the tool.

        :param list bit_names: label shown for each bit when it is set;
            entry i labels bit ``1 << i``. When omitted, labels are generated
            with ``aligned_range`` for as many bits as the largest bitsum in
            ``bitsum_counter`` needs.
        :param dict/list bitsum_counter: how often each bitsum occurred,
            either a mapping bitsum -> count or a list/tuple of observed
            bitsums (which is tallied with ``Counter``)
        """
        # 1 Occurrence counts; normalised first because the default labels
        #   below are derived from them
        if isinstance(bitsum_counter, (list, tuple)):
            bitsum_counter = Counter(bitsum_counter)
        self.bitsum_counter = bitsum_counter or {}

        # 2 Labels. The previous code evaluated len(bit_names) in this branch,
        #   i.e. len(None), so omitting bit_names always raised TypeError.
        if bit_names is None:
            # assumes the counter keys are integers (or convertible with int())
            n_bits = max((int(b).bit_length() for b in self.bitsum_counter), default=0)
            bit_names = list(aligned_range(n_bits)) if n_bits else []
        self.bit_names = bit_names

    def get_bitsum_names(self, bitsum):
        """Return the comma-joined labels of all bits set in ``bitsum``.

        With bit_names ['a', 'b', 'c'], get_bitsum_names(3) gives 'a,b'.
        A value that cannot be converted to int (e.g. None, NaN, arbitrary
        text) is treated as 0 and gives ''.
        """
        if not isinstance(bitsum, int):
            try:
                bitsum = int(bitsum)
            except (TypeError, ValueError):
                bitsum = 0

        return ','.join(name for i, name in enumerate(self.bit_names) if (1 << i) & bitsum)

    def count_bitsum_relations(self, target_bitsum, relation='='):
        """Count the recorded bitsums that stand in ``relation`` to ``target_bitsum``.

        A bitsum is read as the set of its set bits. With bit names
        n1, n2, n3, n4: a = {n1, n3, n4} fully contains b = {n1, n3},
        so a ⊃ b and b ⊂ a.

        :param int target_bitsum: the bitsum to compare against
        :param str relation:
            '=': recorded bitsums equal to the target
            '⊂': recorded non-zero bitsums contained in the target (equality included)
            '⊃': recorded bitsums containing the target (equality included)
            any other value yields 0
        :return int: total number of occurrences of the matching bitsums
        """
        counter = self.bitsum_counter
        if relation == '=':
            return counter.get(target_bitsum, 0)
        if relation == '⊂':
            # the empty set (0) is deliberately not counted as a subset
            return sum(num for bitsum, num in counter.items()
                       if bitsum and bitsum & target_bitsum == bitsum)
        if relation == '⊃':
            return sum(num for bitsum, num in counter.items()
                       if bitsum & target_bitsum == target_bitsum)
        return 0

    def check_bitflag(self, max_bitsum_len=None, reletion='=',
                      filter_zero=False, sort_by=None, *,
                      min_bitsum_len=0):
        """Build a DataFrame with the counts for combinations of bits.

        :param int max_bitsum_len: largest number of bits combined into one
            bitsum; defaults to the number of bit names
        :param str reletion: relation type, '=', '⊂' or '⊃'. Several
            characters may be given to compute several relations at once;
            each relation gets its own row per bitsum.
        :param bool filter_zero: drop rows whose count is 0
        :param None|str sort_by:
            None: keep generation order
            'count': sort by count, descending
            'bitsum': sort by bitsum, ascending
        :param int min_bitsum_len: smallest number of bits combined into one bitsum
        :return: DataFrame with the columns 类型 / 名称 / 百分比.次数; the first
            generated row (类型 == -1) holds the grand total
        """
        from itertools import combinations

        total = sum(self.bitsum_counter.values())
        rows, columns = [], ['类型', '名称', '百分比.次数']
        rows.append([-1, '总计', total])

        if max_bitsum_len is None:
            max_bitsum_len = len(self.bit_names)

        bitvals = [(1 << i) for i in range(len(self.bit_names))]
        for m in range(min_bitsum_len, max_bitsum_len + 1):
            for comb in combinations(bitvals, m):
                bitsum = sum(comb)
                # One row per requested relation. Passing the whole
                # multi-character string on would match no relation at all.
                for rel in reletion:
                    count = self.count_bitsum_relations(bitsum, relation=rel)
                    if filter_zero and count == 0:
                        continue
                    rows.append([f'{rel}{bitsum}',
                                 self.get_bitsum_names(bitsum),
                                 count])

        if sort_by == 'count':
            rows.sort(key=lambda x: x[2], reverse=True)
        elif sort_by == 'bitsum':
            # Labels look like '<relation char><bitsum>'; the total row keeps its -1
            rows.sort(key=lambda x: int(x[0][1:]) if isinstance(x[0], str) else x[0])

        df = pd.DataFrame.from_records(rows, columns=columns)
        df['百分比.次数'] = percentage_and_value(df['百分比.次数'], 2, total=total)
        return df

    def report(self):
        """Return an HTML report made of two tables produced by check_bitflag."""
        html_content = []

        # Table 1: for each single bit, how many recorded bitsums contain it
        html_content.append('<h1>1 包含每一位bitval特征的数量</h1>')
        df1 = self.check_bitflag(1, '⊃')
        html_content.append(df1.to_html())

        # Table 2: every exact bitsum that occurred, ordered by bitsum
        html_content.append('<h1>2 每一种具体bitsum组合的数量</h1>')
        df2 = self.check_bitflag(reletion='=', filter_zero=True, sort_by='bitsum')
        html_content.append(df2.to_html())

        return '\n'.join(html_content)
|
346
|
-
|
347
|
-
|
348
|
-
def loguru_setup_jsonl_logfile(logger, log_dir, rotation_size="10 MB"):
    """Attach a file sink to a loguru logger.

    The sink is added with ``serialize=True`` and the given rotation setting.
    If ``log_dir`` already holds files named ``YYYYMMDD_HHMMSS.jsonl``, the
    one with the greatest name (i.e. the newest timestamp) is reused;
    otherwise a new file named after the current time is chosen.

    :param logger: the logger, usually the one from ``from loguru import logger``
    :param log_dir: directory that holds the log files (a directory, not a
        file, because there can be several files); created if missing
    :param rotation_size: passed to ``logger.add`` as ``rotation``
    :return: None
    """
    from datetime import datetime

    os.makedirs(log_dir, exist_ok=True)

    # File names must start with <8 digits>_<6 digits>.jsonl
    name_re = re.compile(r"(\d{8}_\d{6})\.jsonl")
    candidates = [name for name in os.listdir(log_dir) if name_re.match(name)]

    if candidates:
        # Timestamps are fixed-width, so the greatest name is the newest file
        log_path = os.path.join(log_dir, max(candidates))
    else:
        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        log_path = os.path.join(log_dir, stamp + ".jsonl")

    logger.add(log_path, rotation=rotation_size, serialize=True)

    # Announce the file in use
    logger.info(f"日志系统已初始化,日志文件路径:{log_path}")
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
# @Author : 陈坤泽
|
4
|
+
# @Email : 877362867@qq.com
|
5
|
+
# @Date : 2021/06/06 18:40
|
6
|
+
|
7
|
+
|
8
|
+
from pyxllib.prog.specialist.common import *
|
9
|
+
from pyxllib.prog.specialist.xllog import *
|
10
|
+
from pyxllib.prog.specialist.browser import *
|
11
|
+
from pyxllib.prog.specialist.bc import *
|
12
|
+
from pyxllib.prog.specialist.tictoc import *
|
13
|
+
from pyxllib.prog.specialist.datetime import *
|
14
|
+
|
15
|
+
import concurrent.futures
|
16
|
+
import os
|
17
|
+
import re
|
18
|
+
import subprocess
|
19
|
+
import time
|
20
|
+
from statistics import mean
|
21
|
+
from threading import Thread
|
22
|
+
|
23
|
+
from tqdm import tqdm
|
24
|
+
import requests
|
25
|
+
from humanfriendly import parse_size
|
26
|
+
|
27
|
+
from pyxllib.prog.newbie import human_readable_size
|
28
|
+
from pyxllib.prog.pupil import get_installed_packages, aligned_range, percentage_and_value
|
29
|
+
from pyxllib.prog.xlosenv import XlOsEnv
|
30
|
+
from pyxllib.file.specialist import cache_file
|
31
|
+
|
32
|
+
|
33
|
+
def mtqdm(func, iterable, *args, max_workers=1, check_per_seconds=0.01, **kwargs):
    """Apply ``func`` to every item of ``iterable`` behind a tqdm progress bar.

    The extra "m" in the name stands for "multi": the calls can optionally be
    fanned out to a thread pool or a process pool. Return values of ``func``
    are discarded. Extra ``*args`` / ``**kwargs`` are forwarded to ``tqdm``.

    :param func: callable taking one item. In process mode it is sent to the
        worker processes, so it (and the items) must be picklable, i.e. a
        module-level function rather than a lambda or closure.
    :param iterable: the items to process
    :param max_workers: 1 (default) runs serially in the calling thread;
        None lets ThreadPoolExecutor pick the number of threads;
        n > 1 uses n threads; a negative value -n uses n worker processes
    :param smoothing: tqdm's own default is 0.3; here it defaults to 0, which
        turns off the exponential moving average so the bar shows the overall
        average speed instead of an estimate of the current speed
    :param mininterval: tqdm's own default is 0.1 seconds; here it defaults
        to 1 second because refreshing more often is not needed
    :param check_per_seconds: thread mode only. While the pool still has
        queued work, submission pauses and the queue is re-checked every this
        many seconds. A larger value means fewer queue checks.
    :raises Exception: the first exception raised by ``func``. In pooled
        modes it is re-raised in the caller after the pool has been shut down.
    """
    # 0 Personal defaults; explicit caller values win
    kwargs.setdefault('smoothing', 0)
    kwargs.setdefault('mininterval', 1)

    # 1 Single worker: skip the executor machinery entirely, which is faster
    if max_workers == 1:
        for x in tqdm(iterable, *args, **kwargs):
            func(x)
        return

    # 2 Pool workers do not interrupt the submitting loop when a task fails,
    #   so remember the first failure and surface it from the caller's thread.
    #   Done-callbacks run in the parent process for both pool types, which
    #   (unlike a closure executed in the worker) also works for processes.
    first_error = None

    def remember_failure(future):
        nonlocal first_error
        if future.cancelled():
            return
        exc = future.exception()
        if exc is not None and first_error is None:
            first_error = exc

    # 3 Pick the pool type. None is compared explicitly because
    #   ``None > 1`` raises TypeError on Python 3.
    if max_workers is None or max_workers > 1:
        executor = concurrent.futures.ThreadPoolExecutor(max_workers)
        throttle = True
    else:
        # max_workers == 0 ends up here and is rejected by the executor itself
        executor = concurrent.futures.ProcessPoolExecutor(-max_workers)
        throttle = False

    try:
        for x in tqdm(iterable, *args, **kwargs):
            if throttle:
                # Back-pressure: wait until the pool's internal queue drained,
                # so the bar follows real progress rather than submissions.
                # NOTE: _work_queue is a private ThreadPoolExecutor attribute.
                while executor._work_queue.qsize():
                    if check_per_seconds:
                        time.sleep(check_per_seconds)
            executor.submit(func, x).add_done_callback(remember_failure)
            if first_error is not None:
                break
    finally:
        # Always wait for running tasks and release the workers
        executor.shutdown()

    # Also covers failures of the last tasks, which finish after the loop
    if first_error is not None:
        raise first_error
|
91
|
+
|
92
|
+
|
93
|
+
def distribute_package(root, version=None, repository=None, *,
                       upload=True,
                       version_file='setup.py',
                       delete_dist=True):
    """Build a source distribution of a project and optionally upload it with twine.

    :param root: project root directory, e.g. 'D:/slns/pyxllib'; it must
        contain setup.py. The working directory of the process is changed to
        this directory and is not restored afterwards.
    :param version: new version string. When truthy, every ``version = '...'``
        (or double-quoted) assignment found in ``version_file`` is rewritten
        to this value before building.
    :param repository: name of a repository configured for twine, e.g. 'xlpr';
        passed to twine as ``-r``
    :param upload: whether to run ``twine upload`` after building
    :param version_file: file holding the version number; it is only updated
        if it contains text matching the regular expression used below
    :param delete_dist: whether to delete the dist directory after uploading;
        turn this off to inspect what was uploaded
    """
    import sys
    from pyxllib.file.specialist import XlPath

    # 1 Switch to the project directory (relative paths below rely on it)
    os.chdir(str(root))

    # 2 Update the version number
    if version:
        f = XlPath(root) / version_file
        # A function replacement inserts the version literally, so backslashes
        # or digits in it are never read as regex escapes / group references.
        s = re.sub(r"(version\s*=\s*)(['\"])(.+?)(\2)",
                   lambda m: f'{m.group(1)}{m.group(2)}{version}{m.group(4)}',
                   f.read_text())
        f.write_text(s)

    # 3 Build. An argument list (not one command string) is required for
    #   subprocess.run without shell=True to work on POSIX, and it also copes
    #   with spaces in the interpreter path.
    subprocess.run([sys.executable, 'setup.py', 'sdist'])

    # 4 Upload
    if upload:
        # presumably twine expands the 'dist/*' pattern itself, since no
        # shell is involved here - TODO confirm against twine's documentation
        cmd = ['twine', 'upload', 'dist/*']
        if repository:
            cmd += ['-r', str(repository)]
        subprocess.run(cmd)
        # Remove the intermediate files produced by the build.
        # NOTE(review): indentation was lost in the reviewed view of this
        # file; the nesting of these two deletes was reconstructed - confirm.
        if delete_dist:
            XlPath('dist').delete()
        XlPath('build').delete()

        # Do not delete *.egg-info: importlib would no longer be able to read
        # the module's version number.
|
134
|
+
|
135
|
+
|
136
|
+
def estimate_package_size(package):
    """Estimate how much storage a package takes, from its PyPI project page.

    The page is fetched through ``cache_file`` under the key
    ``package + '.pypi'`` (per the original note this keeps a cached copy so
    the page is not downloaded repeatedly). Every size written in parentheses
    on the page is parsed with ``parse_size`` and the mean of those sizes is
    returned; 0 is returned when no size is found.

    :param package: package name as used in the PyPI project URL
    :return: mean of the parsed sizes
    """

    def fetch_listing():
        response = requests.get(f'https://pypi.org/project/{package}/#files')
        # A package that PyPI does not know counts as 0 MB
        return '(0 MB' if response.status_code == 404 else response.text

    page = cache_file(package + '.pypi', fetch_listing)
    # Original note: units here step by x1000, not x1024
    size_texts = re.findall(r'\((\d+(?:\.\d+)?\s*\wB(?:ytes)?)', page)
    sizes = [parse_size(text) for text in size_texts]
    return mean(sizes or [0])
|
152
|
+
|
153
|
+
|
154
|
+
def estimate_pip_packages(*, print_mode=False):
    """Estimate the size of every installed package, largest first.

    :param print_mode:
        0 (or False): print nothing, only return the result
        1: print the final sorted listing
        2: additionally print each package as soon as it has been estimated
    :return: [(package_name, package_size), ...] sorted by descending size,
        ties broken by name
    """
    show_steps = print_mode > 1

    sizes = []
    for name in get_installed_packages():
        size = estimate_package_size(name)
        sizes.append((name, size))
        # Formatted unconditionally, exactly as the unconditional call did
        line = f"{name}: {human_readable_size(size)}"
        if show_steps:
            print(line)

    sizes.sort(key=lambda item: (-item[1], item[0]))

    if print_mode > 0:
        if show_steps:
            # Separator between the running log and the sorted listing
            print('- ' * 20)
        for name, size in sizes:
            print(f"{name}: {human_readable_size(size)}")
    return sizes
|
181
|
+
|
182
|
+
|
183
|
+
class ProgressBar:
    """Show a progress bar for a task that may take a long time.

    The bar is drawn by a background thread; the caller only has to keep the
    ``progress`` attribute up to date.

    Example:
        with ProgressBar(100) as pb:
            for i in range(100):
                time.sleep(0.1)      # simulated slow work
                pb.progress = i + 1  # report progress
    """

    def __init__(self, total):
        self.stop_flag = False  # set to True to make the display thread finish
        self.progress = 0  # current progress, written by the caller
        self.total = total  # progress value that means "done"

    def __enter__(self):
        # Start the thread that draws the bar
        worker = Thread(target=self.display_progress)
        self.progress_thread = worker
        worker.start()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Force the bar to 100 %, then stop the display thread and wait for it
        self.progress, self.stop_flag = self.total, True
        self.progress_thread.join()

    def display_progress(self):
        """Thread body: copy ``progress`` into the bar about once per second until stopped."""
        with tqdm(total=self.total) as bar:
            while True:
                bar.n = self.progress
                bar.refresh()
                if self.stop_flag:
                    # The refresh just above was the final one
                    break
                time.sleep(1)
|
219
|
+
|
220
|
+
|
221
|
+
class BitMaskTool:
    """Helper for analysing integers that are used as bit masks.

    Terminology
        bitval: the value of one single bit position, 0 or 1
        bitsum: the sum of all set bit values, i.e. the mask as an integer
    """

    def __init__(self, bit_names=None, bitsum_counter=None):
        """Create the tool.

        :param list bit_names: label shown for each bit when it is set;
            entry i labels bit ``1 << i``. When omitted, labels are generated
            with ``aligned_range`` for as many bits as the largest bitsum in
            ``bitsum_counter`` needs.
        :param dict/list bitsum_counter: how often each bitsum occurred,
            either a mapping bitsum -> count or a list/tuple of observed
            bitsums (which is tallied with ``Counter``)
        """
        # 1 Occurrence counts; normalised first because the default labels
        #   below are derived from them
        if isinstance(bitsum_counter, (list, tuple)):
            bitsum_counter = Counter(bitsum_counter)
        self.bitsum_counter = bitsum_counter or {}

        # 2 Labels. The previous code evaluated len(bit_names) in this branch,
        #   i.e. len(None), so omitting bit_names always raised TypeError.
        if bit_names is None:
            # assumes the counter keys are integers (or convertible with int())
            n_bits = max((int(b).bit_length() for b in self.bitsum_counter), default=0)
            bit_names = list(aligned_range(n_bits)) if n_bits else []
        self.bit_names = bit_names

    def get_bitsum_names(self, bitsum):
        """Return the comma-joined labels of all bits set in ``bitsum``.

        With bit_names ['a', 'b', 'c'], get_bitsum_names(3) gives 'a,b'.
        A value that cannot be converted to int (e.g. None, NaN, arbitrary
        text) is treated as 0 and gives ''.
        """
        if not isinstance(bitsum, int):
            try:
                bitsum = int(bitsum)
            except (TypeError, ValueError):
                bitsum = 0

        return ','.join(name for i, name in enumerate(self.bit_names) if (1 << i) & bitsum)

    def count_bitsum_relations(self, target_bitsum, relation='='):
        """Count the recorded bitsums that stand in ``relation`` to ``target_bitsum``.

        A bitsum is read as the set of its set bits. With bit names
        n1, n2, n3, n4: a = {n1, n3, n4} fully contains b = {n1, n3},
        so a ⊃ b and b ⊂ a.

        :param int target_bitsum: the bitsum to compare against
        :param str relation:
            '=': recorded bitsums equal to the target
            '⊂': recorded non-zero bitsums contained in the target (equality included)
            '⊃': recorded bitsums containing the target (equality included)
            any other value yields 0
        :return int: total number of occurrences of the matching bitsums
        """
        counter = self.bitsum_counter
        if relation == '=':
            return counter.get(target_bitsum, 0)
        if relation == '⊂':
            # the empty set (0) is deliberately not counted as a subset
            return sum(num for bitsum, num in counter.items()
                       if bitsum and bitsum & target_bitsum == bitsum)
        if relation == '⊃':
            return sum(num for bitsum, num in counter.items()
                       if bitsum & target_bitsum == target_bitsum)
        return 0

    def check_bitflag(self, max_bitsum_len=None, reletion='=',
                      filter_zero=False, sort_by=None, *,
                      min_bitsum_len=0):
        """Build a DataFrame with the counts for combinations of bits.

        :param int max_bitsum_len: largest number of bits combined into one
            bitsum; defaults to the number of bit names
        :param str reletion: relation type, '=', '⊂' or '⊃'. Several
            characters may be given to compute several relations at once;
            each relation gets its own row per bitsum.
        :param bool filter_zero: drop rows whose count is 0
        :param None|str sort_by:
            None: keep generation order
            'count': sort by count, descending
            'bitsum': sort by bitsum, ascending
        :param int min_bitsum_len: smallest number of bits combined into one bitsum
        :return: DataFrame with the columns 类型 / 名称 / 百分比.次数; the first
            generated row (类型 == -1) holds the grand total
        """
        from itertools import combinations

        total = sum(self.bitsum_counter.values())
        rows, columns = [], ['类型', '名称', '百分比.次数']
        rows.append([-1, '总计', total])

        if max_bitsum_len is None:
            max_bitsum_len = len(self.bit_names)

        bitvals = [(1 << i) for i in range(len(self.bit_names))]
        for m in range(min_bitsum_len, max_bitsum_len + 1):
            for comb in combinations(bitvals, m):
                bitsum = sum(comb)
                # One row per requested relation. Passing the whole
                # multi-character string on would match no relation at all.
                for rel in reletion:
                    count = self.count_bitsum_relations(bitsum, relation=rel)
                    if filter_zero and count == 0:
                        continue
                    rows.append([f'{rel}{bitsum}',
                                 self.get_bitsum_names(bitsum),
                                 count])

        if sort_by == 'count':
            rows.sort(key=lambda x: x[2], reverse=True)
        elif sort_by == 'bitsum':
            # Labels look like '<relation char><bitsum>'; the total row keeps its -1
            rows.sort(key=lambda x: int(x[0][1:]) if isinstance(x[0], str) else x[0])

        df = pd.DataFrame.from_records(rows, columns=columns)
        df['百分比.次数'] = percentage_and_value(df['百分比.次数'], 2, total=total)
        return df

    def report(self):
        """Return an HTML report made of two tables produced by check_bitflag."""
        html_content = []

        # Table 1: for each single bit, how many recorded bitsums contain it
        html_content.append('<h1>1 包含每一位bitval特征的数量</h1>')
        df1 = self.check_bitflag(1, '⊃')
        html_content.append(df1.to_html())

        # Table 2: every exact bitsum that occurred, ordered by bitsum
        html_content.append('<h1>2 每一种具体bitsum组合的数量</h1>')
        df2 = self.check_bitflag(reletion='=', filter_zero=True, sort_by='bitsum')
        html_content.append(df2.to_html())

        return '\n'.join(html_content)
|
346
|
+
|
347
|
+
|
348
|
+
def loguru_setup_jsonl_logfile(logger, log_dir, rotation_size="10 MB"):
    """Attach a file sink to a loguru logger.

    The sink is added with ``serialize=True`` and the given rotation setting.
    If ``log_dir`` already holds files named ``YYYYMMDD_HHMMSS.jsonl``, the
    one with the greatest name (i.e. the newest timestamp) is reused;
    otherwise a new file named after the current time is chosen.

    :param logger: the logger, usually the one from ``from loguru import logger``
    :param log_dir: directory that holds the log files (a directory, not a
        file, because there can be several files); created if missing
    :param rotation_size: passed to ``logger.add`` as ``rotation``
    :return: None
    """
    from datetime import datetime

    os.makedirs(log_dir, exist_ok=True)

    # File names must start with <8 digits>_<6 digits>.jsonl
    name_re = re.compile(r"(\d{8}_\d{6})\.jsonl")
    candidates = [name for name in os.listdir(log_dir) if name_re.match(name)]

    if candidates:
        # Timestamps are fixed-width, so the greatest name is the newest file
        log_path = os.path.join(log_dir, max(candidates))
    else:
        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        log_path = os.path.join(log_dir, stamp + ".jsonl")

    logger.add(log_path, rotation=rotation_size, serialize=True)

    # Announce the file in use
    logger.info(f"日志系统已初始化,日志文件路径:{log_path}")
|