pyxllib 0.0.43__py3-none-any.whl → 0.3.197__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyxllib/__init__.py +9 -2
- pyxllib/algo/__init__.py +8 -0
- pyxllib/algo/disjoint.py +54 -0
- pyxllib/algo/geo.py +541 -0
- pyxllib/{util/mathlib.py → algo/intervals.py} +172 -36
- pyxllib/algo/matcher.py +389 -0
- pyxllib/algo/newbie.py +166 -0
- pyxllib/algo/pupil.py +629 -0
- pyxllib/algo/shapelylib.py +67 -0
- pyxllib/algo/specialist.py +241 -0
- pyxllib/algo/stat.py +494 -0
- pyxllib/algo/treelib.py +149 -0
- pyxllib/algo/unitlib.py +66 -0
- pyxllib/autogui/__init__.py +5 -0
- pyxllib/autogui/activewin.py +246 -0
- pyxllib/autogui/all.py +9 -0
- pyxllib/autogui/autogui.py +852 -0
- pyxllib/autogui/uiautolib.py +362 -0
- pyxllib/autogui/virtualkey.py +102 -0
- pyxllib/autogui/wechat.py +827 -0
- pyxllib/autogui/wechat_msg.py +421 -0
- pyxllib/autogui/wxautolib.py +84 -0
- pyxllib/cv/__init__.py +1 -11
- pyxllib/cv/expert.py +267 -0
- pyxllib/cv/{imlib.py → imfile.py} +18 -83
- pyxllib/cv/imhash.py +39 -0
- pyxllib/cv/pupil.py +9 -0
- pyxllib/cv/rgbfmt.py +1525 -0
- pyxllib/cv/slidercaptcha.py +137 -0
- pyxllib/cv/trackbartools.py +163 -49
- pyxllib/cv/xlcvlib.py +1040 -0
- pyxllib/cv/xlpillib.py +423 -0
- pyxllib/data/__init__.py +0 -0
- pyxllib/data/echarts.py +240 -0
- pyxllib/data/jsonlib.py +89 -0
- pyxllib/{util/oss2_.py → data/oss.py} +11 -9
- pyxllib/data/pglib.py +1127 -0
- pyxllib/data/sqlite.py +568 -0
- pyxllib/{util → data}/sqllib.py +13 -31
- pyxllib/ext/JLineViewer.py +505 -0
- pyxllib/ext/__init__.py +6 -0
- pyxllib/{util → ext}/demolib.py +119 -35
- pyxllib/ext/drissionlib.py +277 -0
- pyxllib/ext/kq5034lib.py +12 -0
- pyxllib/{util/main.py → ext/old.py} +122 -284
- pyxllib/ext/qt.py +449 -0
- pyxllib/ext/robustprocfile.py +497 -0
- pyxllib/ext/seleniumlib.py +76 -0
- pyxllib/{util/tklib.py → ext/tk.py} +10 -11
- pyxllib/ext/unixlib.py +827 -0
- pyxllib/ext/utools.py +351 -0
- pyxllib/{util/webhooklib.py → ext/webhook.py} +45 -17
- pyxllib/ext/win32lib.py +40 -0
- pyxllib/ext/wjxlib.py +88 -0
- pyxllib/ext/wpsapi.py +124 -0
- pyxllib/ext/xlwork.py +9 -0
- pyxllib/ext/yuquelib.py +1105 -0
- pyxllib/file/__init__.py +17 -0
- pyxllib/file/docxlib.py +761 -0
- pyxllib/{util → file}/gitlib.py +40 -27
- pyxllib/file/libreoffice.py +165 -0
- pyxllib/file/movielib.py +148 -0
- pyxllib/file/newbie.py +10 -0
- pyxllib/file/onenotelib.py +1469 -0
- pyxllib/file/packlib/__init__.py +330 -0
- pyxllib/{util → file/packlib}/zipfile.py +598 -195
- pyxllib/file/pdflib.py +426 -0
- pyxllib/file/pupil.py +185 -0
- pyxllib/file/specialist/__init__.py +685 -0
- pyxllib/{basic/_5_dirlib.py → file/specialist/dirlib.py} +364 -93
- pyxllib/file/specialist/download.py +193 -0
- pyxllib/file/specialist/filelib.py +2829 -0
- pyxllib/file/xlsxlib.py +3131 -0
- pyxllib/file/xlsyncfile.py +341 -0
- pyxllib/prog/__init__.py +5 -0
- pyxllib/prog/cachetools.py +64 -0
- pyxllib/prog/deprecatedlib.py +233 -0
- pyxllib/prog/filelock.py +42 -0
- pyxllib/prog/ipyexec.py +253 -0
- pyxllib/prog/multiprogs.py +940 -0
- pyxllib/prog/newbie.py +451 -0
- pyxllib/prog/pupil.py +1197 -0
- pyxllib/{sitepackages.py → prog/sitepackages.py} +5 -3
- pyxllib/prog/specialist/__init__.py +391 -0
- pyxllib/prog/specialist/bc.py +203 -0
- pyxllib/prog/specialist/browser.py +497 -0
- pyxllib/prog/specialist/common.py +347 -0
- pyxllib/prog/specialist/datetime.py +199 -0
- pyxllib/prog/specialist/tictoc.py +240 -0
- pyxllib/prog/specialist/xllog.py +180 -0
- pyxllib/prog/xlosenv.py +108 -0
- pyxllib/stdlib/__init__.py +17 -0
- pyxllib/{util → stdlib}/tablepyxl/__init__.py +1 -3
- pyxllib/{util → stdlib}/tablepyxl/style.py +1 -1
- pyxllib/{util → stdlib}/tablepyxl/tablepyxl.py +2 -4
- pyxllib/text/__init__.py +8 -0
- pyxllib/text/ahocorasick.py +39 -0
- pyxllib/text/airscript.js +744 -0
- pyxllib/text/charclasslib.py +121 -0
- pyxllib/text/jiebalib.py +267 -0
- pyxllib/text/jinjalib.py +32 -0
- pyxllib/text/jsa_ai_prompt.md +271 -0
- pyxllib/text/jscode.py +922 -0
- pyxllib/text/latex/__init__.py +158 -0
- pyxllib/text/levenshtein.py +303 -0
- pyxllib/text/nestenv.py +1215 -0
- pyxllib/text/newbie.py +300 -0
- pyxllib/text/pupil/__init__.py +8 -0
- pyxllib/text/pupil/common.py +1121 -0
- pyxllib/text/pupil/xlalign.py +326 -0
- pyxllib/text/pycode.py +47 -0
- pyxllib/text/specialist/__init__.py +8 -0
- pyxllib/text/specialist/common.py +112 -0
- pyxllib/text/specialist/ptag.py +186 -0
- pyxllib/text/spellchecker.py +172 -0
- pyxllib/text/templates/echart_base.html +11 -0
- pyxllib/text/templates/highlight_code.html +17 -0
- pyxllib/text/templates/latex_editor.html +103 -0
- pyxllib/text/vbacode.py +17 -0
- pyxllib/text/xmllib.py +747 -0
- pyxllib/xl.py +39 -0
- pyxllib/xlcv.py +17 -0
- pyxllib-0.3.197.dist-info/METADATA +48 -0
- pyxllib-0.3.197.dist-info/RECORD +126 -0
- {pyxllib-0.0.43.dist-info → pyxllib-0.3.197.dist-info}/WHEEL +4 -5
- pyxllib/basic/_1_strlib.py +0 -945
- pyxllib/basic/_2_timelib.py +0 -488
- pyxllib/basic/_3_pathlib.py +0 -916
- pyxllib/basic/_4_loglib.py +0 -419
- pyxllib/basic/__init__.py +0 -54
- pyxllib/basic/arrow_.py +0 -250
- pyxllib/basic/chardet_.py +0 -66
- pyxllib/basic/dirlib.py +0 -529
- pyxllib/basic/dprint.py +0 -202
- pyxllib/basic/extension.py +0 -12
- pyxllib/basic/judge.py +0 -31
- pyxllib/basic/log.py +0 -204
- pyxllib/basic/pathlib_.py +0 -705
- pyxllib/basic/pytictoc.py +0 -102
- pyxllib/basic/qiniu_.py +0 -61
- pyxllib/basic/strlib.py +0 -761
- pyxllib/basic/timer.py +0 -132
- pyxllib/cv/cv.py +0 -834
- pyxllib/cv/cvlib/_1_geo.py +0 -543
- pyxllib/cv/cvlib/_2_cvprcs.py +0 -309
- pyxllib/cv/cvlib/_2_imgproc.py +0 -594
- pyxllib/cv/cvlib/_3_pilprcs.py +0 -80
- pyxllib/cv/cvlib/_4_cvimg.py +0 -211
- pyxllib/cv/cvlib/__init__.py +0 -10
- pyxllib/cv/debugtools.py +0 -82
- pyxllib/cv/fitz_.py +0 -300
- pyxllib/cv/installer.py +0 -42
- pyxllib/debug/_0_installer.py +0 -38
- pyxllib/debug/_1_typelib.py +0 -277
- pyxllib/debug/_2_chrome.py +0 -198
- pyxllib/debug/_3_showdir.py +0 -161
- pyxllib/debug/_4_bcompare.py +0 -140
- pyxllib/debug/__init__.py +0 -49
- pyxllib/debug/bcompare.py +0 -132
- pyxllib/debug/chrome.py +0 -198
- pyxllib/debug/installer.py +0 -38
- pyxllib/debug/showdir.py +0 -158
- pyxllib/debug/typelib.py +0 -278
- pyxllib/image/__init__.py +0 -12
- pyxllib/torch/__init__.py +0 -20
- pyxllib/torch/modellib.py +0 -37
- pyxllib/torch/trainlib.py +0 -344
- pyxllib/util/__init__.py +0 -20
- pyxllib/util/aip_.py +0 -141
- pyxllib/util/casiadb.py +0 -59
- pyxllib/util/excellib.py +0 -495
- pyxllib/util/filelib.py +0 -612
- pyxllib/util/jsondata.py +0 -27
- pyxllib/util/jsondata2.py +0 -92
- pyxllib/util/labelmelib.py +0 -139
- pyxllib/util/onepy/__init__.py +0 -29
- pyxllib/util/onepy/onepy.py +0 -574
- pyxllib/util/onepy/onmanager.py +0 -170
- pyxllib/util/pyautogui_.py +0 -219
- pyxllib/util/textlib.py +0 -1305
- pyxllib/util/unorder.py +0 -22
- pyxllib/util/xmllib.py +0 -639
- pyxllib-0.0.43.dist-info/METADATA +0 -39
- pyxllib-0.0.43.dist-info/RECORD +0 -80
- pyxllib-0.0.43.dist-info/top_level.txt +0 -1
- {pyxllib-0.0.43.dist-info → pyxllib-0.3.197.dist-info/licenses}/LICENSE +0 -0
pyxllib/util/unorder.py
DELETED
@@ -1,22 +0,0 @@
|
|
1
|
-
#!/usr/bin/env python3
|
2
|
-
# -*- coding: utf-8 -*-
|
3
|
-
# @Author : 陈坤泽
|
4
|
-
# @Email : 877362867@qq.com
|
5
|
-
# @Data : 2020/05/30
|
6
|
-
|
7
|
-
|
8
|
-
"""
|
9
|
-
未系统分类、零散、冷门的功能
|
10
|
-
"""
|
11
|
-
|
12
|
-
|
13
|
-
def document(func):
|
14
|
-
"""文档函数装饰器
|
15
|
-
用该装饰器器时,表明一个函数是用伪代码在表示一系列的操作逻辑,不能直接拿来执行的
|
16
|
-
很可能是一套半自动化工具
|
17
|
-
"""
|
18
|
-
|
19
|
-
def wrapper(*args):
|
20
|
-
raise RuntimeError(f'函数:{func.__name__} 是一个伪代码流程示例文档,不能直接运行')
|
21
|
-
|
22
|
-
return wrapper
|
pyxllib/util/xmllib.py
DELETED
@@ -1,639 +0,0 @@
|
|
1
|
-
#!/usr/bin/env python3
|
2
|
-
# -*- coding: utf-8 -*-
|
3
|
-
# @Author : 陈坤泽
|
4
|
-
# @Email : 877362867@qq.com
|
5
|
-
# @Data : 2020/06/02 20:16
|
6
|
-
|
7
|
-
|
8
|
-
"""
|
9
|
-
xml等网页结构方面的处理
|
10
|
-
"""
|
11
|
-
|
12
|
-
|
13
|
-
from collections import defaultdict, Counter
|
14
|
-
|
15
|
-
|
16
|
-
import bs4
|
17
|
-
from bs4 import BeautifulSoup
|
18
|
-
|
19
|
-
|
20
|
-
from pyxllib.util.textlib import *
|
21
|
-
|
22
|
-
|
23
|
-
____section_1_dfs_base = """
|
24
|
-
一个通用的递归功能
|
25
|
-
"""
|
26
|
-
|
27
|
-
|
28
|
-
def dfs_base(node, *,
|
29
|
-
child_generator=None, select_depth=None, linenum=False,
|
30
|
-
mystr=None, msghead=True, lsstr=None, show_node_type=False, prefix=' '):
|
31
|
-
"""输入一个节点node,以及该节点当前depth
|
32
|
-
:param prefix: 缩进格式,默认用4个空格
|
33
|
-
:param node: 节点
|
34
|
-
:param child_generator: 子节点生成函数
|
35
|
-
函数支持输入一个节点参数
|
36
|
-
返回一个子节点列表
|
37
|
-
:param select_depth: 要显示的深度
|
38
|
-
单个数字:获得指定层
|
39
|
-
Sequences: 两个整数,取出这个闭区间内的层级内容
|
40
|
-
:param mystr: 自定义单个节点字符串方式
|
41
|
-
标准是输入2个参数 mystr(node, depth),返回字符串化的结果,记得前缀缩进也要自己控制的!
|
42
|
-
也可以只输入一个参数 mystr(node):
|
43
|
-
这种情况会自动按照每层4个空格进行缩进
|
44
|
-
:param lsstr: 自定义整个列表的字符串化方法,在mystr的基础上调控更加灵活,但要写的代码也更多
|
45
|
-
:param linenum:节点从1开始编号
|
46
|
-
行号后面,默认会跟一个类似Excel列名的字母,表示层级深度
|
47
|
-
:param msghead: 第1行输出一些统计信息
|
48
|
-
:param show_node_type:
|
49
|
-
:return 返回一个遍历清单ls
|
50
|
-
ls的每个元素是一个列表
|
51
|
-
第1个值是depth
|
52
|
-
第2个值是节点ref
|
53
|
-
|
54
|
-
Requires
|
55
|
-
textwrap:用到shorten
|
56
|
-
align.listalign:生成列编号时对齐
|
57
|
-
"""
|
58
|
-
# 1 子节点生成器,与配置
|
59
|
-
def bs4_child_generator(node):
|
60
|
-
try:
|
61
|
-
return node.children
|
62
|
-
except AttributeError:
|
63
|
-
return []
|
64
|
-
|
65
|
-
# 配置子节点生成器
|
66
|
-
if not child_generator:
|
67
|
-
child_generator = bs4_child_generator
|
68
|
-
|
69
|
-
# 2 dfs实际实现代码,获得节点清单
|
70
|
-
def inner(node, depth=0):
|
71
|
-
"""dfs实际实现代码
|
72
|
-
TODO:把depth过滤写进inner不生成?! 不过目前还是按照生成整棵树处理,能统计到一些信息。
|
73
|
-
"""
|
74
|
-
ls = [[node, depth]]
|
75
|
-
for t in child_generator(node):
|
76
|
-
ls += inner(t, depth + 1)
|
77
|
-
return ls
|
78
|
-
ls = inner(node)
|
79
|
-
total_node = len(ls)
|
80
|
-
total_depth = max(map(lambda x: x[1], ls))
|
81
|
-
head = f'总节点数:1~{total_node},总深度:0~{total_depth}'
|
82
|
-
|
83
|
-
# 4 过滤与重新整理ls(select_depth)
|
84
|
-
logo = True
|
85
|
-
cnt = 0
|
86
|
-
tree_num = 0
|
87
|
-
if isinstance(select_depth, int):
|
88
|
-
|
89
|
-
for i in range(total_node):
|
90
|
-
if ls[i][1] == select_depth:
|
91
|
-
ls[i][1] = 0
|
92
|
-
cnt += 1
|
93
|
-
logo = True
|
94
|
-
elif ls[i][1] < select_depth and logo: # 遇到第1个父节点添加一个空行
|
95
|
-
ls[i] = ''
|
96
|
-
tree_num += 1
|
97
|
-
logo = False
|
98
|
-
else: # 删除该节点,不做任何显示
|
99
|
-
ls[i] = None
|
100
|
-
head += f';挑选出的节点数:{cnt},所选深度:{select_depth},树数量:{tree_num}'
|
101
|
-
|
102
|
-
elif hasattr(select_depth, '__getitem__'):
|
103
|
-
for i in range(total_node):
|
104
|
-
if select_depth[0] <= ls[i][1] <= select_depth[1]:
|
105
|
-
ls[i][1] -= select_depth[0]
|
106
|
-
cnt += 1
|
107
|
-
logo = True
|
108
|
-
elif ls[i][1] < select_depth[0] and logo: # 遇到第1个父节点添加一个空行
|
109
|
-
ls[i] = ''
|
110
|
-
tree_num += 1
|
111
|
-
logo = False
|
112
|
-
else: # 删除该节点,不做任何显示
|
113
|
-
ls[i] = None
|
114
|
-
head += f';挑选出的节点数:{cnt},所选深度:{select_depth[0]}~{select_depth[1]},树数量:{tree_num}'
|
115
|
-
"""注意此时ls[i]的状态,有3种类型
|
116
|
-
(node, depth):tuple类型,第0个元素是node对象,第1个元素是该元素所处层级
|
117
|
-
None:已删除元素,但为了后续编号方便,没有真正的移出,而是用None作为标记
|
118
|
-
'':已删除元素,但这里涉及父节点的删除,建议此处留一个空行
|
119
|
-
"""
|
120
|
-
|
121
|
-
# 5 格式处理
|
122
|
-
def default_mystr(node, depth):
|
123
|
-
s1 = prefix * depth
|
124
|
-
s2 = typename(node)+',' if show_node_type else ''
|
125
|
-
s3 = textwrap.shorten(str(node), 200)
|
126
|
-
return s1 + s2 + s3
|
127
|
-
|
128
|
-
def default_lsstr(ls):
|
129
|
-
nonlocal mystr
|
130
|
-
if not mystr:
|
131
|
-
mystr = default_mystr
|
132
|
-
else:
|
133
|
-
try: # 测试两个参数情况下是否可以正常运行
|
134
|
-
mystr('', 0)
|
135
|
-
except TypeError:
|
136
|
-
# 如果不能正常运行,则进行封装从而支持2个参数
|
137
|
-
func = mystr
|
138
|
-
|
139
|
-
def str_plus(node, depth): # 注意这里函数名要换一个新的func
|
140
|
-
return prefix * depth + func(node)
|
141
|
-
mystr = str_plus
|
142
|
-
|
143
|
-
line_num = listalign(range(1, total_node + 1))
|
144
|
-
res = []
|
145
|
-
for i in range(total_node):
|
146
|
-
if ls[i] is not None:
|
147
|
-
if isinstance(ls[i], str): # 已经指定该行要显示什么
|
148
|
-
res.append(ls[i])
|
149
|
-
else:
|
150
|
-
if linenum: # 增加了一个能显示层级的int2excel_col_name
|
151
|
-
res.append(line_num[i] + int2myalphaenum(ls[i][1]) + ' ' + mystr(ls[i][0], ls[i][1]))
|
152
|
-
else:
|
153
|
-
res.append(mystr(ls[i][0], ls[i][1]))
|
154
|
-
|
155
|
-
s = '\n'.join(res)
|
156
|
-
return s
|
157
|
-
|
158
|
-
if not lsstr:
|
159
|
-
lsstr = default_lsstr
|
160
|
-
|
161
|
-
s = lsstr(ls)
|
162
|
-
|
163
|
-
# 是否要添加信息头
|
164
|
-
if msghead:
|
165
|
-
s = head + '\n' + s
|
166
|
-
|
167
|
-
return s
|
168
|
-
|
169
|
-
|
170
|
-
def treetable(childreds, parents, arg3=None, nodename_colname=None):
|
171
|
-
"""输入childres子结点id列表,和parents父结点id列表
|
172
|
-
两个列表长度必须相等
|
173
|
-
文档:http://note.youdao.com/noteshare?id=126200f45d301fcb4364d06a0cae8376
|
174
|
-
|
175
|
-
有两种调用形式
|
176
|
-
>> treetable(childreds, parents) --> DataFrame (新建df)
|
177
|
-
>> treetable(df, child_colname, parent_colname) --> DataFrame (修改后的df)
|
178
|
-
|
179
|
-
返回一个二维列表
|
180
|
-
新的childreds (末尾可能回加虚结点)
|
181
|
-
新的parents
|
182
|
-
函数会计算每一行childred对应的树排序后的排序编号order
|
183
|
-
以及每个节点深度depth
|
184
|
-
|
185
|
-
>> ls1 = [6, 2, 4, 5, 3], ls2 = [7, 1, 2, 2, 1], treetable(ls1, ls2)
|
186
|
-
child_id parent_id depth tree_order tree_struct
|
187
|
-
5 7 root 1 1 = = 7
|
188
|
-
0 6 7 2 2 = = = = 6
|
189
|
-
6 1 root 1 3 = = 1
|
190
|
-
1 2 1 2 4 = = = = 2
|
191
|
-
2 4 2 3 5 = = = = = = 4
|
192
|
-
3 5 2 3 6 = = = = = = 5
|
193
|
-
4 3 1 2 7 = = = = 3
|
194
|
-
"""
|
195
|
-
# 0 参数预处理
|
196
|
-
if isinstance(childreds, pd.DataFrame):
|
197
|
-
df = childreds
|
198
|
-
child_colname = parents
|
199
|
-
parent_colname = arg3
|
200
|
-
if not arg3: raise TypeError
|
201
|
-
childreds = df[child_colname].tolist()
|
202
|
-
parents = df[parent_colname].tolist()
|
203
|
-
else:
|
204
|
-
df = None
|
205
|
-
|
206
|
-
# 1 建立root根节点,确保除了root其他结点都存在记录
|
207
|
-
lefts = set(parents) - set(childreds) # parents列中没有在childreds出现的结点
|
208
|
-
cs, ps = list(childreds), list(parents)
|
209
|
-
|
210
|
-
if len(lefts) == 0:
|
211
|
-
# b_left为空一定有环,b_left不为空也不一定是正常的树
|
212
|
-
raise ValueError('有环,不是树结构')
|
213
|
-
elif len(lefts) == 1: # 只有一个未出现的结点,那么它既是根节点
|
214
|
-
root = list(lefts)[0]
|
215
|
-
else: # 多个父结点没有记录,则对这些父结点统一加一个root父结点
|
216
|
-
root = 'root'
|
217
|
-
allnode = set(parents) | set(childreds) # 所有结点集合
|
218
|
-
while root in allnode: root += '-' # 一直在末尾加'-',直到这个结点是输入里未出现的
|
219
|
-
# 添加结点
|
220
|
-
lefts = list(lefts)
|
221
|
-
lefts.sort(key=lambda x: parents.index(x))
|
222
|
-
for t in lefts:
|
223
|
-
cs.append(t)
|
224
|
-
ps.append(root)
|
225
|
-
|
226
|
-
n = len(cs)
|
227
|
-
depth, tree_order, len_childs = [-1]*n, [-1]*n, [0]*n
|
228
|
-
|
229
|
-
# 2 构造父结点-孩子结点的字典dd
|
230
|
-
dd = defaultdict(list)
|
231
|
-
for i in range(n): dd[ps[i]] += [i]
|
232
|
-
|
233
|
-
# 3 dfs
|
234
|
-
cnt = 1
|
235
|
-
def dfs(node, d):
|
236
|
-
"""找node的所有子结点"""
|
237
|
-
nonlocal cnt
|
238
|
-
for i in dd.get(node, []):
|
239
|
-
tree_order[i], depth[i], len_childs[i] = cnt, d, len(dd[cs[i]])
|
240
|
-
cnt += 1
|
241
|
-
dfs(cs[i], d+1)
|
242
|
-
dfs(root, 1)
|
243
|
-
|
244
|
-
# 4 输出格式
|
245
|
-
tree_struct = list(map(lambda i: f"{'_ _ ' * depth[i]}{cs[i]}" + (f'[{len_childs[i]}]' if len_childs[i] else ''),
|
246
|
-
range(n)))
|
247
|
-
|
248
|
-
if df is None:
|
249
|
-
ls = list(zip(cs, ps, depth, tree_order, len_childs, tree_struct))
|
250
|
-
df = pd.DataFrame.from_records(ls, columns=('child_id', 'parent_id',
|
251
|
-
'depth', 'tree_order', 'len_childs', 'tree_struct'))
|
252
|
-
else:
|
253
|
-
k = len(df)
|
254
|
-
df = df.append(pd.DataFrame({child_colname: cs[k:], parent_colname: ps[k:]}), sort=False, ignore_index=True)
|
255
|
-
if nodename_colname:
|
256
|
-
tree_struct = list(
|
257
|
-
map(lambda i: f"{'_ _ ' * depth[i]}{cs[i]} {df.iloc[i][nodename_colname]}"
|
258
|
-
+ (f'[{len_childs[i]}]' if len_childs[i] else ''), range(n)))
|
259
|
-
df['depth'], df['tree_order'], df['len_childs'], df['tree_struct'] = depth, tree_order, len_childs, tree_struct
|
260
|
-
df.sort_values('tree_order', inplace=True) # 注意有时候可能不能排序,要维持输入时候的顺序
|
261
|
-
return df
|
262
|
-
|
263
|
-
|
264
|
-
def treetable_flatten(df, *, reverse=False, childid_colname='id', parentid_colname='parent_id', format_colname=None):
|
265
|
-
"""获得知识树横向展开表:列为depth-3, depth-2, depth-1,表示倒数第3级、倒数第2级、倒数第1级
|
266
|
-
:param df: DataFrame数据
|
267
|
-
:param reverse:
|
268
|
-
False,正常地罗列depth1、depth2、depth3...等结点信息
|
269
|
-
True,反向列举所属层级,即显示倒数第1层parent1,然后是倒数第2层parent2...
|
270
|
-
:param childid_colname: 孩子结点列
|
271
|
-
:param parentid_colname: 父结点列
|
272
|
-
:param format_colname: 显示的数值
|
273
|
-
None,默认采用 childid_colname 的值
|
274
|
-
str,某一列的名称,采用那一列的值(可以实现设置好格式)
|
275
|
-
:return:
|
276
|
-
"""
|
277
|
-
# 1 构造辅助数组
|
278
|
-
if format_colname is None: format_colname = parentid_colname
|
279
|
-
parentid = dict() # parentid[k] = v, 存储结点k对应的父结点v
|
280
|
-
nodeval = dict() # nodeval[k] = v, 存储结点k需要显示的数值情况
|
281
|
-
if len(df[df.index.duplicated()]):
|
282
|
-
dprint(len(set(df.index)), len(df.index)) # 有重复index
|
283
|
-
raise ValueError
|
284
|
-
|
285
|
-
for idx, row in df.iterrows():
|
286
|
-
parentid[row[childid_colname]] = row[parentid_colname]
|
287
|
-
nodeval[row[childid_colname]] = str(row[format_colname])
|
288
|
-
|
289
|
-
# 2 每个结点往上遍历出所有父结点
|
290
|
-
parents = []
|
291
|
-
for idx, row in df.iterrows():
|
292
|
-
ps = [nodeval[row[childid_colname]]] # 包含结点自身的所有父结点名称
|
293
|
-
p = row[parentid_colname]
|
294
|
-
while p in parentid:
|
295
|
-
ps.append(nodeval[p])
|
296
|
-
p = parentid[p]
|
297
|
-
parents.append(ps)
|
298
|
-
num_depth = max(map(len, parents), default=0)
|
299
|
-
|
300
|
-
# 3 这里可以灵活调整最终要显示的格式效果
|
301
|
-
df['parents'] = parents
|
302
|
-
if reverse:
|
303
|
-
for j in range(num_depth, 0, -1): df[f'depth-{j}'] = ''
|
304
|
-
for idx, row in df.iterrows():
|
305
|
-
for j in range(1, len(row.parents)+1):
|
306
|
-
df.loc[idx, f'depth-{j}'] = row.parents[j-1]
|
307
|
-
else:
|
308
|
-
for j in range(num_depth): df[f'depth{j}'] = ''
|
309
|
-
for idx, row in df.iterrows():
|
310
|
-
for j in range(len(row.parents)):
|
311
|
-
df.loc[idx, f'depth{j}'] = row.parents[-j-1]
|
312
|
-
df.drop('parents', axis=1, inplace=True)
|
313
|
-
return df
|
314
|
-
|
315
|
-
|
316
|
-
____section_2_xml = """
|
317
|
-
xml相关的一些功能函数
|
318
|
-
"""
|
319
|
-
|
320
|
-
|
321
|
-
def readurl(url):
|
322
|
-
"""从url读取文本"""
|
323
|
-
r = requests.get(url)
|
324
|
-
soup = BeautifulSoup(r.text, 'lxml')
|
325
|
-
s = soup.get_text()
|
326
|
-
return s
|
327
|
-
|
328
|
-
|
329
|
-
____section_3_xmlparser = """
|
330
|
-
"""
|
331
|
-
|
332
|
-
|
333
|
-
def tag_name(t):
|
334
|
-
"""输入一个bs4的Tag或NavigableString,
|
335
|
-
返回tag.name或者'NavigableString'
|
336
|
-
"""
|
337
|
-
if t.name:
|
338
|
-
return t.name
|
339
|
-
elif isinstance(t, bs4.NavigableString):
|
340
|
-
return 'NavigableString'
|
341
|
-
else:
|
342
|
-
dprint(t) # 获取结点t名称失败
|
343
|
-
return None
|
344
|
-
|
345
|
-
|
346
|
-
def subtag_names(t):
|
347
|
-
"""列出结点t的所有直接子结点(花括号后面跟的数字是连续出现次数)
|
348
|
-
例如body的: p{137},tbl,p{94},tbl,p{1640},sectPr
|
349
|
-
"""
|
350
|
-
def counter(m):
|
351
|
-
s1 = m.group(1)
|
352
|
-
n = (m.end(0) - m.start(0)) // len(s1)
|
353
|
-
s = s1[:-1] + '{' + str(n) + '}'
|
354
|
-
if m.string[m.end(0)-1] == ',':
|
355
|
-
s += ','
|
356
|
-
return s
|
357
|
-
|
358
|
-
if t.name and t.contents:
|
359
|
-
s = ','.join(map(tag_name, t.contents)) + ','
|
360
|
-
s = re.sub(r'([^,]+,)(\1)+', counter, s)
|
361
|
-
else:
|
362
|
-
s = ''
|
363
|
-
if s and s[-1] == ',':
|
364
|
-
s = s[:-1]
|
365
|
-
return s
|
366
|
-
|
367
|
-
|
368
|
-
class XmlParser:
|
369
|
-
def __init__(self, node=None):
|
370
|
-
"""两种初始化方式
|
371
|
-
提供node:用某个bs4的PageElement等对象初始化
|
372
|
-
未提供node,一般是方便给MyBs4等类继承使用
|
373
|
-
"""
|
374
|
-
if node: # TODO:可以扩展,支持不同类型的初始化
|
375
|
-
self._node = node
|
376
|
-
|
377
|
-
def node(self):
|
378
|
-
"""获得xml结点的接口函数"""
|
379
|
-
return self._node if getattr(self, '_node') else self
|
380
|
-
|
381
|
-
def treestruct_raw(self, **kwargs):
|
382
|
-
"""查看树形结构的raw版本
|
383
|
-
各参数含义详见dfs_base
|
384
|
-
"""
|
385
|
-
# 1 先用dfs获得基本结果
|
386
|
-
s = dfs_base(self.node(), **kwargs)
|
387
|
-
return s
|
388
|
-
|
389
|
-
def treestruct_brief(self, linenum=True, prefix='- ', **kwargs):
|
390
|
-
"""查看树形结构的简洁版
|
391
|
-
"""
|
392
|
-
def mystr(node):
|
393
|
-
# if isinstance(node, (bs4.ProcessingInstruction, code4101py.stdlib.bs4.ProcessingInstruction)):
|
394
|
-
if isinstance(node, bs4.ProcessingInstruction):
|
395
|
-
s = 'ProcessingInstruction,' + str(node)
|
396
|
-
# elif isinstance(node, (bs4.Tag, code4101py.stdlib.bs4.Tag)):
|
397
|
-
elif isinstance(node, bs4.Tag):
|
398
|
-
s = node.name + ',' + mydictstr(node.attrs, item_delimit=',')
|
399
|
-
# elif isinstance(node, (bs4.NavigableString, code4101py.stdlib.bs4.NavigableString)):
|
400
|
-
elif isinstance(node, bs4.NavigableString):
|
401
|
-
# s = 'NavigableString'
|
402
|
-
s = shorten(str(node), 200)
|
403
|
-
if not s.strip():
|
404
|
-
s = '<??>'
|
405
|
-
else:
|
406
|
-
s = '遇到特殊类型,' + str(node)
|
407
|
-
return s
|
408
|
-
|
409
|
-
s = dfs_base(self.node(), mystr=mystr, prefix=prefix, linenum=linenum, **kwargs)
|
410
|
-
return s
|
411
|
-
|
412
|
-
def treestruct_stat(self):
|
413
|
-
"""生成一个两个二维表的统计数据
|
414
|
-
ls1, ls2 = treestruct_stat()
|
415
|
-
ls1: 结点规律表
|
416
|
-
ls2: 属性规律表
|
417
|
-
count_tagname、check_tag的功能基本都可以被这个函数代替
|
418
|
-
"""
|
419
|
-
def text(t):
|
420
|
-
""" 考虑到结果一般都是存储到excel,所以会把无法存成gbk的字符串删掉
|
421
|
-
另外控制了每个元素的长度上限
|
422
|
-
"""
|
423
|
-
s = ensure_gbk(t)
|
424
|
-
s = s[:100]
|
425
|
-
return s
|
426
|
-
|
427
|
-
def depth(t):
|
428
|
-
"""结点t的深度"""
|
429
|
-
return len(tuple(t.parents))
|
430
|
-
|
431
|
-
t = self.contents[0]
|
432
|
-
# ls1 = [['element序号', '层级', '结构', '父结点', '当前结点', '属性值/字符串值', '直接子结点结构']]
|
433
|
-
# ls2 = [['序号', 'element序号', '当前结点', '属性名', '属性值']] #
|
434
|
-
ls1 = [] # 这个重点是分析结点规律
|
435
|
-
ls2 = [] # 这个重点是分析属性规律
|
436
|
-
i = 1
|
437
|
-
while t:
|
438
|
-
# 1 结点规律表
|
439
|
-
d = depth(t)
|
440
|
-
line = [i, d, '_'*d+str(d), tag_name(t.parent), tag_name(t),
|
441
|
-
text(mydictstr(t.attrs) if t.name else t), # 结点存属性,字符串存值
|
442
|
-
subtag_names(t)]
|
443
|
-
ls1.append(line)
|
444
|
-
# 2 属性规律表
|
445
|
-
if t.name:
|
446
|
-
k = len(ls2)
|
447
|
-
for attr, value in t.attrs.items():
|
448
|
-
ls2.append([k, i, tag_name(t), attr, value])
|
449
|
-
k += 1
|
450
|
-
# 下个结点
|
451
|
-
t = t.next_element
|
452
|
-
i += 1
|
453
|
-
df1 = pd.DataFrame.from_records(ls1, columns=['element序号', '层级', '结构', '父结点', '当前结点', '属性值/字符串值', '直接子结点结构'])
|
454
|
-
df2 = pd.DataFrame.from_records(ls2, columns=['序号', 'element序号', '当前结点', '属性名', '属性值'])
|
455
|
-
return df1, df2
|
456
|
-
|
457
|
-
def count_tagname(self):
|
458
|
-
"""统计每个标签出现的次数:
|
459
|
-
1 w:rpr 650
|
460
|
-
2 w:rfonts 650
|
461
|
-
3 w:szcs 618
|
462
|
-
4 w:r 565
|
463
|
-
5 None 532
|
464
|
-
6 w:t 531
|
465
|
-
"""
|
466
|
-
ct = collections.Counter()
|
467
|
-
|
468
|
-
def inner(node):
|
469
|
-
try:
|
470
|
-
ct[node.name] += 1
|
471
|
-
for t in node.children:
|
472
|
-
inner(t)
|
473
|
-
except AttributeError:
|
474
|
-
pass
|
475
|
-
|
476
|
-
inner(self.node())
|
477
|
-
return ct.most_common()
|
478
|
-
|
479
|
-
def check_tag(self, tagname=None):
|
480
|
-
"""统计每个标签在不同层级出现的次数:
|
481
|
-
|
482
|
-
:param tagname:
|
483
|
-
None:统计全文出现的各种标签在不同层级出现次数
|
484
|
-
't'等值: tagname参数允许只检查特殊标签情况,此时会将所有tagname设为第0级
|
485
|
-
|
486
|
-
TODO 检查一个标签内部是否有同名标签?
|
487
|
-
"""
|
488
|
-
d = defaultdict()
|
489
|
-
|
490
|
-
def add(name, depth):
|
491
|
-
if name not in d:
|
492
|
-
d[name] = defaultdict(int)
|
493
|
-
d[name][depth] += 1
|
494
|
-
|
495
|
-
def inner(node, depth):
|
496
|
-
if isinstance(node, bs4.ProcessingInstruction):
|
497
|
-
add('ProcessingInstruction', depth)
|
498
|
-
elif isinstance(node, bs4.Tag):
|
499
|
-
if node.name == tagname and depth:
|
500
|
-
dprint(node, depth) # tagname里有同名子标签
|
501
|
-
add(node.name, depth)
|
502
|
-
for t in node.children:
|
503
|
-
inner(t, depth+1)
|
504
|
-
elif isinstance(node, bs4.NavigableString):
|
505
|
-
add('NavigableString', depth)
|
506
|
-
else:
|
507
|
-
add('其他特殊结点', depth)
|
508
|
-
|
509
|
-
# 1 统计结点在每一层出现的次数
|
510
|
-
if tagname:
|
511
|
-
for t in self.node().find_all(tagname):
|
512
|
-
inner(t, 0)
|
513
|
-
else:
|
514
|
-
inner(self.node(), 0)
|
515
|
-
|
516
|
-
# 2 总出现次数和?
|
517
|
-
|
518
|
-
return d
|
519
|
-
|
520
|
-
def check_namespace(self):
|
521
|
-
"""检查名称空间问题,会同时检查标签名和属性名:
|
522
|
-
1 cNvPr pic:cNvPr(579),wps:cNvPr(52),wpg:cNvPr(15)
|
523
|
-
2 spPr pic:spPr(579),wps:spPr(52)
|
524
|
-
"""
|
525
|
-
# 1 获得所有名称
|
526
|
-
# 因为是采用node的原始xml文本,所以能保证会取得带有名称空间的文本内容
|
527
|
-
ct0 = Counter(re.findall(r'<([a-zA-Z:]+)', str(self.node())))
|
528
|
-
ct = defaultdict(str)
|
529
|
-
s = set()
|
530
|
-
for key, value in ct0.items():
|
531
|
-
k = re.sub(r'.*:', '', key)
|
532
|
-
if k in ct:
|
533
|
-
s.add(k)
|
534
|
-
ct[k] += f',{key}({value})'
|
535
|
-
else:
|
536
|
-
ct[k] = f'{key}({value})'
|
537
|
-
|
538
|
-
# 2 对有重复和无重复的元素划分存储
|
539
|
-
ls1 = [] # 有重复的存储到ls1
|
540
|
-
ls2 = [] # 没有重复的正常结果存储到ls2,可以不显示
|
541
|
-
for k, v in ct.items():
|
542
|
-
if k in s:
|
543
|
-
ls1.append([k, v])
|
544
|
-
else:
|
545
|
-
ls2.append([k, v])
|
546
|
-
|
547
|
-
# 3 显示有重复的情况
|
548
|
-
# chrome(ls1, filename='检查名称空间问题')
|
549
|
-
return ls1
|
550
|
-
|
551
|
-
|
552
|
-
class MyBs4(BeautifulSoup, XmlParser):
|
553
|
-
"""xml、html 等数据通用处理算法,常用功能有:
|
554
|
-
|
555
|
-
show_brief:显示xml结构
|
556
|
-
count_tagname: 统计各个结点名称出现次数
|
557
|
-
"""
|
558
|
-
def __init__(self, markup="", features='lxml', *args, **kwargs):
|
559
|
-
# markup = Path(markup).read()
|
560
|
-
# TODO: **kwargs我不知道怎么传进来啊,不过感觉也不删大雅没什么鸟用吧~~
|
561
|
-
super().__init__(markup, features, *args, **kwargs)
|
562
|
-
|
563
|
-
def insert_after(self, successor):
|
564
|
-
pass
|
565
|
-
|
566
|
-
def insert_before(self, successor):
|
567
|
-
pass
|
568
|
-
|
569
|
-
|
570
|
-
____section_temp = """
|
571
|
-
"""
|
572
|
-
|
573
|
-
|
574
|
-
def mathjax_html_head(s):
|
575
|
-
"""增加mathjax解析脚本"""
|
576
|
-
head = r"""<!DOCTYPE html>
|
577
|
-
<html>
|
578
|
-
<head>
|
579
|
-
<head><meta http-equiv=Content-Type content="text/html;charset=utf-8"></head>
|
580
|
-
<script src="https://a.cdn.histudy.com/lib/config/mathjax_config-klxx.js?v=1.1"></script>
|
581
|
-
<script type="text/javascript" async src="https://a.cdn.histudy.com/lib/mathjax/2.7.1/MathJax/MathJax.js?config=TeX-AMS-MML_SVG">
|
582
|
-
MathJax.Hub.Config(MATHJAX_KLXX_CONFIG);
|
583
|
-
</script>
|
584
|
-
</head>
|
585
|
-
<body>"""
|
586
|
-
tail = '</body></html>'
|
587
|
-
return head + s + tail
|
588
|
-
|
589
|
-
|
590
|
-
def 自动制作网页标题的导航栏(html_content, title='temphtml'):
|
591
|
-
"""
|
592
|
-
:param html_content: 原始网页的完整内容
|
593
|
-
:param title: 页面标题,默认会先找head/title,如果没有,则取一个随机名称(TODO 未实装,目前固定名称'test')
|
594
|
-
|
595
|
-
算法基本原理:读取原网页,找出所有h标签,并增设a锚点
|
596
|
-
另外生成一个导航html文件
|
597
|
-
然后再生成一个主文件,让用户通过主文件来浏览页面
|
598
|
-
|
599
|
-
# 读取csdn博客并展示目录 (不过因为这个存在跳级,效果不是那么好)
|
600
|
-
>> file = 自动制作网页标题的导航栏(requests.get(r'https://blog.csdn.net/code4101/article/details/83009000').content.decode('utf8'))
|
601
|
-
>> chrome(str(file))
|
602
|
-
http://i2.tiimg.com/582188/64f40d235705de69.png
|
603
|
-
"""
|
604
|
-
# 1 对原html,设置锚点,生成一个新的文件f2;生成导航目录文件f1。
|
605
|
-
cnt = 0
|
606
|
-
|
607
|
-
# TODO 目前不支持跳级的情况
|
608
|
-
# 这个refs是可以用py算法生成的,目前是存储在github上引用
|
609
|
-
refs = ['<html><head>',
|
610
|
-
'<link rel=Stylesheet type="text/css" media=all href="https://code4101.github.io/css/navigation0.css">',
|
611
|
-
'</head><body>']
|
612
|
-
|
613
|
-
f2 = Path(title + '_内容', '.html', Path.TEMP)
|
614
|
-
|
615
|
-
def func(m):
|
616
|
-
nonlocal cnt
|
617
|
-
cnt += 1
|
618
|
-
name, content = m.group('name'), m.group('inner')
|
619
|
-
content = BeautifulSoup(content, 'lxml').get_text()
|
620
|
-
refs.append(f'<a href="{f2}#生成导航栏浏览网页{cnt}" target="showframe"><{name}>{content}</{name}></a>')
|
621
|
-
return f'<a name="生成导航栏浏览网页{cnt}"/>' + m.group()
|
622
|
-
|
623
|
-
html_content = re.sub(r'<(?P<name>h\d+)(?:>|\s.*?>)(?P<body>\s*(?P<inner>.*?)\s*)</\1>',
|
624
|
-
func, html_content, flags=re.DOTALL)
|
625
|
-
|
626
|
-
refs.append('</body>\n</html>')
|
627
|
-
|
628
|
-
f1 = Path(title + '_导航', '.html', Path.TEMP).write('\n'.join(refs), if_exists='replace')
|
629
|
-
f2 = f2.write(html_content, if_exists='replace')
|
630
|
-
|
631
|
-
# 2 生成首页 f0
|
632
|
-
main_content = f"""<html>
|
633
|
-
<frameset cols="20%,80%">
|
634
|
-
<frame src="{f1}">
|
635
|
-
<frame src="{f2}" name="showframe">
|
636
|
-
</frameset></html>"""
|
637
|
-
|
638
|
-
f0 = Path(title, '.html', Path.TEMP).write(main_content, if_exists='replace')
|
639
|
-
return f0
|
@@ -1,39 +0,0 @@
|
|
1
|
-
Metadata-Version: 2.1
|
2
|
-
Name: pyxllib
|
3
|
-
Version: 0.0.43
|
4
|
-
Summary: 厦门理工模式识别团队通用python代码工具库
|
5
|
-
Home-page: https://github.com/XLPRUtils/pyxllib
|
6
|
-
Author: code4101
|
7
|
-
Author-email: 877362867@qq.com
|
8
|
-
License: Apache License 2.0
|
9
|
-
Keywords: pyxllib
|
10
|
-
Platform: UNKNOWN
|
11
|
-
Classifier: Programming Language :: Python :: 3
|
12
|
-
Classifier: Operating System :: OS Independent
|
13
|
-
Requires-Python: >=3.6
|
14
|
-
Description-Content-Type: text/markdown
|
15
|
-
Requires-Dist: arrow
|
16
|
-
Requires-Dist: chardet
|
17
|
-
Requires-Dist: requests
|
18
|
-
Requires-Dist: qiniu
|
19
|
-
Requires-Dist: pyyaml
|
20
|
-
Requires-Dist: disjoint-set (==0.6.3)
|
21
|
-
Requires-Dist: coloredlogs
|
22
|
-
|
23
|
-
# 1 install
|
24
|
-
|
25
|
-
工具包已经提交到pypi: https://pypi.org/project/pyxllib/
|
26
|
-
可以直接安装:
|
27
|
-
|
28
|
-
```
|
29
|
-
pip install pyxllib
|
30
|
-
```
|
31
|
-
|
32
|
-
更详细的安装问题见:https://www.yuque.com/xlpr/pyxllib/install
|
33
|
-
|
34
|
-
# 2 document
|
35
|
-
|
36
|
-
使用文档: https://www.yuque.com/xlpr/pyxllib ,
|
37
|
-
正在努力完善中,欢迎更多小伙伴一起助力
|
38
|
-
|
39
|
-
|