pyxllib 0.0.43__py3-none-any.whl → 0.3.197__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (186) hide show
  1. pyxllib/__init__.py +9 -2
  2. pyxllib/algo/__init__.py +8 -0
  3. pyxllib/algo/disjoint.py +54 -0
  4. pyxllib/algo/geo.py +541 -0
  5. pyxllib/{util/mathlib.py → algo/intervals.py} +172 -36
  6. pyxllib/algo/matcher.py +389 -0
  7. pyxllib/algo/newbie.py +166 -0
  8. pyxllib/algo/pupil.py +629 -0
  9. pyxllib/algo/shapelylib.py +67 -0
  10. pyxllib/algo/specialist.py +241 -0
  11. pyxllib/algo/stat.py +494 -0
  12. pyxllib/algo/treelib.py +149 -0
  13. pyxllib/algo/unitlib.py +66 -0
  14. pyxllib/autogui/__init__.py +5 -0
  15. pyxllib/autogui/activewin.py +246 -0
  16. pyxllib/autogui/all.py +9 -0
  17. pyxllib/autogui/autogui.py +852 -0
  18. pyxllib/autogui/uiautolib.py +362 -0
  19. pyxllib/autogui/virtualkey.py +102 -0
  20. pyxllib/autogui/wechat.py +827 -0
  21. pyxllib/autogui/wechat_msg.py +421 -0
  22. pyxllib/autogui/wxautolib.py +84 -0
  23. pyxllib/cv/__init__.py +1 -11
  24. pyxllib/cv/expert.py +267 -0
  25. pyxllib/cv/{imlib.py → imfile.py} +18 -83
  26. pyxllib/cv/imhash.py +39 -0
  27. pyxllib/cv/pupil.py +9 -0
  28. pyxllib/cv/rgbfmt.py +1525 -0
  29. pyxllib/cv/slidercaptcha.py +137 -0
  30. pyxllib/cv/trackbartools.py +163 -49
  31. pyxllib/cv/xlcvlib.py +1040 -0
  32. pyxllib/cv/xlpillib.py +423 -0
  33. pyxllib/data/__init__.py +0 -0
  34. pyxllib/data/echarts.py +240 -0
  35. pyxllib/data/jsonlib.py +89 -0
  36. pyxllib/{util/oss2_.py → data/oss.py} +11 -9
  37. pyxllib/data/pglib.py +1127 -0
  38. pyxllib/data/sqlite.py +568 -0
  39. pyxllib/{util → data}/sqllib.py +13 -31
  40. pyxllib/ext/JLineViewer.py +505 -0
  41. pyxllib/ext/__init__.py +6 -0
  42. pyxllib/{util → ext}/demolib.py +119 -35
  43. pyxllib/ext/drissionlib.py +277 -0
  44. pyxllib/ext/kq5034lib.py +12 -0
  45. pyxllib/{util/main.py → ext/old.py} +122 -284
  46. pyxllib/ext/qt.py +449 -0
  47. pyxllib/ext/robustprocfile.py +497 -0
  48. pyxllib/ext/seleniumlib.py +76 -0
  49. pyxllib/{util/tklib.py → ext/tk.py} +10 -11
  50. pyxllib/ext/unixlib.py +827 -0
  51. pyxllib/ext/utools.py +351 -0
  52. pyxllib/{util/webhooklib.py → ext/webhook.py} +45 -17
  53. pyxllib/ext/win32lib.py +40 -0
  54. pyxllib/ext/wjxlib.py +88 -0
  55. pyxllib/ext/wpsapi.py +124 -0
  56. pyxllib/ext/xlwork.py +9 -0
  57. pyxllib/ext/yuquelib.py +1105 -0
  58. pyxllib/file/__init__.py +17 -0
  59. pyxllib/file/docxlib.py +761 -0
  60. pyxllib/{util → file}/gitlib.py +40 -27
  61. pyxllib/file/libreoffice.py +165 -0
  62. pyxllib/file/movielib.py +148 -0
  63. pyxllib/file/newbie.py +10 -0
  64. pyxllib/file/onenotelib.py +1469 -0
  65. pyxllib/file/packlib/__init__.py +330 -0
  66. pyxllib/{util → file/packlib}/zipfile.py +598 -195
  67. pyxllib/file/pdflib.py +426 -0
  68. pyxllib/file/pupil.py +185 -0
  69. pyxllib/file/specialist/__init__.py +685 -0
  70. pyxllib/{basic/_5_dirlib.py → file/specialist/dirlib.py} +364 -93
  71. pyxllib/file/specialist/download.py +193 -0
  72. pyxllib/file/specialist/filelib.py +2829 -0
  73. pyxllib/file/xlsxlib.py +3131 -0
  74. pyxllib/file/xlsyncfile.py +341 -0
  75. pyxllib/prog/__init__.py +5 -0
  76. pyxllib/prog/cachetools.py +64 -0
  77. pyxllib/prog/deprecatedlib.py +233 -0
  78. pyxllib/prog/filelock.py +42 -0
  79. pyxllib/prog/ipyexec.py +253 -0
  80. pyxllib/prog/multiprogs.py +940 -0
  81. pyxllib/prog/newbie.py +451 -0
  82. pyxllib/prog/pupil.py +1197 -0
  83. pyxllib/{sitepackages.py → prog/sitepackages.py} +5 -3
  84. pyxllib/prog/specialist/__init__.py +391 -0
  85. pyxllib/prog/specialist/bc.py +203 -0
  86. pyxllib/prog/specialist/browser.py +497 -0
  87. pyxllib/prog/specialist/common.py +347 -0
  88. pyxllib/prog/specialist/datetime.py +199 -0
  89. pyxllib/prog/specialist/tictoc.py +240 -0
  90. pyxllib/prog/specialist/xllog.py +180 -0
  91. pyxllib/prog/xlosenv.py +108 -0
  92. pyxllib/stdlib/__init__.py +17 -0
  93. pyxllib/{util → stdlib}/tablepyxl/__init__.py +1 -3
  94. pyxllib/{util → stdlib}/tablepyxl/style.py +1 -1
  95. pyxllib/{util → stdlib}/tablepyxl/tablepyxl.py +2 -4
  96. pyxllib/text/__init__.py +8 -0
  97. pyxllib/text/ahocorasick.py +39 -0
  98. pyxllib/text/airscript.js +744 -0
  99. pyxllib/text/charclasslib.py +121 -0
  100. pyxllib/text/jiebalib.py +267 -0
  101. pyxllib/text/jinjalib.py +32 -0
  102. pyxllib/text/jsa_ai_prompt.md +271 -0
  103. pyxllib/text/jscode.py +922 -0
  104. pyxllib/text/latex/__init__.py +158 -0
  105. pyxllib/text/levenshtein.py +303 -0
  106. pyxllib/text/nestenv.py +1215 -0
  107. pyxllib/text/newbie.py +300 -0
  108. pyxllib/text/pupil/__init__.py +8 -0
  109. pyxllib/text/pupil/common.py +1121 -0
  110. pyxllib/text/pupil/xlalign.py +326 -0
  111. pyxllib/text/pycode.py +47 -0
  112. pyxllib/text/specialist/__init__.py +8 -0
  113. pyxllib/text/specialist/common.py +112 -0
  114. pyxllib/text/specialist/ptag.py +186 -0
  115. pyxllib/text/spellchecker.py +172 -0
  116. pyxllib/text/templates/echart_base.html +11 -0
  117. pyxllib/text/templates/highlight_code.html +17 -0
  118. pyxllib/text/templates/latex_editor.html +103 -0
  119. pyxllib/text/vbacode.py +17 -0
  120. pyxllib/text/xmllib.py +747 -0
  121. pyxllib/xl.py +39 -0
  122. pyxllib/xlcv.py +17 -0
  123. pyxllib-0.3.197.dist-info/METADATA +48 -0
  124. pyxllib-0.3.197.dist-info/RECORD +126 -0
  125. {pyxllib-0.0.43.dist-info → pyxllib-0.3.197.dist-info}/WHEEL +4 -5
  126. pyxllib/basic/_1_strlib.py +0 -945
  127. pyxllib/basic/_2_timelib.py +0 -488
  128. pyxllib/basic/_3_pathlib.py +0 -916
  129. pyxllib/basic/_4_loglib.py +0 -419
  130. pyxllib/basic/__init__.py +0 -54
  131. pyxllib/basic/arrow_.py +0 -250
  132. pyxllib/basic/chardet_.py +0 -66
  133. pyxllib/basic/dirlib.py +0 -529
  134. pyxllib/basic/dprint.py +0 -202
  135. pyxllib/basic/extension.py +0 -12
  136. pyxllib/basic/judge.py +0 -31
  137. pyxllib/basic/log.py +0 -204
  138. pyxllib/basic/pathlib_.py +0 -705
  139. pyxllib/basic/pytictoc.py +0 -102
  140. pyxllib/basic/qiniu_.py +0 -61
  141. pyxllib/basic/strlib.py +0 -761
  142. pyxllib/basic/timer.py +0 -132
  143. pyxllib/cv/cv.py +0 -834
  144. pyxllib/cv/cvlib/_1_geo.py +0 -543
  145. pyxllib/cv/cvlib/_2_cvprcs.py +0 -309
  146. pyxllib/cv/cvlib/_2_imgproc.py +0 -594
  147. pyxllib/cv/cvlib/_3_pilprcs.py +0 -80
  148. pyxllib/cv/cvlib/_4_cvimg.py +0 -211
  149. pyxllib/cv/cvlib/__init__.py +0 -10
  150. pyxllib/cv/debugtools.py +0 -82
  151. pyxllib/cv/fitz_.py +0 -300
  152. pyxllib/cv/installer.py +0 -42
  153. pyxllib/debug/_0_installer.py +0 -38
  154. pyxllib/debug/_1_typelib.py +0 -277
  155. pyxllib/debug/_2_chrome.py +0 -198
  156. pyxllib/debug/_3_showdir.py +0 -161
  157. pyxllib/debug/_4_bcompare.py +0 -140
  158. pyxllib/debug/__init__.py +0 -49
  159. pyxllib/debug/bcompare.py +0 -132
  160. pyxllib/debug/chrome.py +0 -198
  161. pyxllib/debug/installer.py +0 -38
  162. pyxllib/debug/showdir.py +0 -158
  163. pyxllib/debug/typelib.py +0 -278
  164. pyxllib/image/__init__.py +0 -12
  165. pyxllib/torch/__init__.py +0 -20
  166. pyxllib/torch/modellib.py +0 -37
  167. pyxllib/torch/trainlib.py +0 -344
  168. pyxllib/util/__init__.py +0 -20
  169. pyxllib/util/aip_.py +0 -141
  170. pyxllib/util/casiadb.py +0 -59
  171. pyxllib/util/excellib.py +0 -495
  172. pyxllib/util/filelib.py +0 -612
  173. pyxllib/util/jsondata.py +0 -27
  174. pyxllib/util/jsondata2.py +0 -92
  175. pyxllib/util/labelmelib.py +0 -139
  176. pyxllib/util/onepy/__init__.py +0 -29
  177. pyxllib/util/onepy/onepy.py +0 -574
  178. pyxllib/util/onepy/onmanager.py +0 -170
  179. pyxllib/util/pyautogui_.py +0 -219
  180. pyxllib/util/textlib.py +0 -1305
  181. pyxllib/util/unorder.py +0 -22
  182. pyxllib/util/xmllib.py +0 -639
  183. pyxllib-0.0.43.dist-info/METADATA +0 -39
  184. pyxllib-0.0.43.dist-info/RECORD +0 -80
  185. pyxllib-0.0.43.dist-info/top_level.txt +0 -1
  186. {pyxllib-0.0.43.dist-info → pyxllib-0.3.197.dist-info/licenses}/LICENSE +0 -0
pyxllib/util/unorder.py DELETED
@@ -1,22 +0,0 @@
1
- #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
- # @Author : 陈坤泽
4
- # @Email : 877362867@qq.com
5
- # @Data : 2020/05/30
6
-
7
-
8
- """
9
- 未系统分类、零散、冷门的功能
10
- """
11
-
12
-
13
def document(func):
    """Mark ``func`` as documentation-only pseudo-code that must not be executed.

    Use this decorator on functions whose body merely sketches a workflow
    (often a semi-automated procedure) rather than runnable logic.

    :param func: the function being marked
    :return: a replacement callable that keeps ``func``'s name and docstring
        but raises on every call, whatever arguments it is given
    :raises RuntimeError: whenever the returned callable is invoked
    """
    import functools  # local import: this module has no top-level imports

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # Accept keyword arguments too, so a keyword call reports the intended
        # RuntimeError instead of an unrelated TypeError.
        raise RuntimeError(f'函数:{func.__name__} 是一个伪代码流程示例文档,不能直接运行')

    return wrapper
pyxllib/util/xmllib.py DELETED
@@ -1,639 +0,0 @@
1
- #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
- # @Author : 陈坤泽
4
- # @Email : 877362867@qq.com
5
- # @Data : 2020/06/02 20:16
6
-
7
-
8
- """
9
- xml等网页结构方面的处理
10
- """
11
-
12
-
13
- from collections import defaultdict, Counter
14
-
15
-
16
- import bs4
17
- from bs4 import BeautifulSoup
18
-
19
-
20
- from pyxllib.util.textlib import *
21
-
22
-
23
- ____section_1_dfs_base = """
24
- 一个通用的递归功能
25
- """
26
-
27
-
28
def dfs_base(node, *,
             child_generator=None, select_depth=None, linenum=False,
             mystr=None, msghead=True, lsstr=None, show_node_type=False, prefix='    '):
    """Walk a tree depth-first (pre-order) and render it as a multi-line string.

    :param node: root node of the tree
    :param child_generator: callable taking one node and returning an iterable
        of its children. When omitted, ``node.children`` is used and objects
        without that attribute are treated as leaves.
    :param select_depth: which depths to show
        None: show every node
        int: show only that depth
        sequence: two ints, show the inclusive depth range ``[lo, hi]``
        Selected nodes are re-based so the shallowest selected level is printed
        at depth 0. The first node above the selection in each run becomes an
        empty separator line; all other unselected nodes are dropped.
    :param linenum: prefix each line with the node's 1-based visiting number,
        followed by the depth marker produced by ``int2myalphaenum``
    :param mystr: custom formatter for one node. Either ``mystr(node, depth)``,
        which must add its own indentation, or ``mystr(node)``, in which case
        ``prefix * depth`` is prepended automatically.
    :param msghead: put a statistics line (node count, depth range, selection
        summary) before the tree
    :param lsstr: custom formatter for the whole traversal list; it receives
        the list described below and returns the final text. It replaces the
        built-in rendering, so ``mystr``/``linenum`` are then unused.
    :param show_node_type: with the default formatter, put ``typename(node)``
        in front of each node
    :param prefix: indentation unit for one depth level (4 spaces by default)
    :return: the rendered text (str)

    The list handed to ``lsstr`` has one entry per visited node, each being
        [node, depth]: a node to display
        None: a dropped node, kept as a placeholder so numbering stays stable
        '': a dropped ancestor that should show up as an empty line

    Requires
        textwrap.shorten, plus typename / listalign / int2myalphaenum, which
        are expected to be in scope at module level
    """
    import inspect  # local import: only needed for the formatter arity check

    # 1 Child generator: fall back to the bs4-style ``children`` attribute.
    def bs4_child_generator(node):
        try:
            return node.children
        except AttributeError:
            return []

    if not child_generator:
        child_generator = bs4_child_generator

    # 2 Collect [node, depth] pairs in pre-order. The whole tree is always
    #   collected, even when select_depth filters it, so the header can report
    #   the real totals.
    ls = []

    def inner(cur, depth=0):
        ls.append([cur, depth])
        for child in child_generator(cur):
            inner(child, depth + 1)

    inner(node)
    total_node = len(ls)
    total_depth = max(depth for _, depth in ls)
    head = f'总节点数:1~{total_node},总深度:0~{total_depth}'

    # 3 Depth filtering: a single int is handled as the range [n, n].
    if isinstance(select_depth, int):
        lo = hi = select_depth
        depth_desc = f'{select_depth}'
    elif hasattr(select_depth, '__getitem__'):
        lo, hi = select_depth[0], select_depth[1]
        depth_desc = f'{lo}~{hi}'
    else:
        lo = hi = depth_desc = None

    if depth_desc is not None:
        logo = True  # True while the next shallower node should emit a blank line
        cnt = 0
        tree_num = 0
        for i, (_, depth) in enumerate(ls):
            if lo <= depth <= hi:
                ls[i][1] = depth - lo
                cnt += 1
                logo = True
            elif depth < lo and logo:  # first ancestor after a selected run
                ls[i] = ''
                tree_num += 1
                logo = False
            else:  # dropped: keep a placeholder so line numbers do not shift
                ls[i] = None
        head += f';挑选出的节点数:{cnt},所选深度:{depth_desc},树数量:{tree_num}'

    # 4 Formatting
    def default_mystr(node, depth):
        s1 = prefix * depth
        s2 = typename(node) + ',' if show_node_type else ''
        s3 = textwrap.shorten(str(node), 200)
        return s1 + s2 + s3

    def takes_node_and_depth(func):
        """Report whether ``func`` can be called as ``func(node, depth)``."""
        try:
            signature = inspect.signature(func)
        except (TypeError, ValueError):
            # No introspectable signature (some builtins): fall back to a
            # probing call. Only a TypeError is read as "one argument only".
            try:
                func('', 0)
            except TypeError:
                return False
            except Exception:
                return True
            return True
        try:
            signature.bind('', 0)
        except TypeError:
            return False
        return True

    def default_lsstr(ls):
        nonlocal mystr
        if not mystr:
            mystr = default_mystr
        elif not takes_node_and_depth(mystr):
            func = mystr

            def str_plus(node, depth):  # adapt mystr(node) to the 2-arg form
                return prefix * depth + func(node)

            mystr = str_plus

        # Aligned line numbers are only computed when they are printed.
        line_num = listalign(range(1, total_node + 1)) if linenum else None
        res = []
        for i, item in enumerate(ls):
            if item is None:
                continue
            if isinstance(item, str):  # the line's text was already decided
                res.append(item)
            elif linenum:
                res.append(line_num[i] + int2myalphaenum(item[1]) + ' ' + mystr(item[0], item[1]))
            else:
                res.append(mystr(item[0], item[1]))
        return '\n'.join(res)

    if not lsstr:
        lsstr = default_lsstr

    s = lsstr(ls)

    # 5 Optional statistics header
    if msghead:
        s = head + '\n' + s

    return s
168
-
169
-
170
def treetable(childreds, parents, arg3=None, nodename_colname=None):
    """Build a tree-ordered table from parallel child-id / parent-id lists.

    Two call forms are supported
    >> treetable(childreds, parents) --> DataFrame (newly built)
    >> treetable(df, child_colname, parent_colname) --> DataFrame (extended copy of df)

    Every parent id that never appears as a child is a top-level node. If
    there is exactly one, it is the root. If there are several, a virtual
    root named 'root' is created ('-' is appended until the name is unused)
    and one extra row per top-level node is appended to attach it there.

    Columns computed for every row
        depth: 1 for direct children of the root, 2 for their children, ...
        tree_order: 1-based position in a depth-first walk from the root
        len_childs: number of direct children
        tree_struct: '_ _ ' repeated ``depth`` times, then the child id
            (then the node name when ``nodename_colname`` is given), then
            '[len_childs]' when the node has children
    Rows not reachable from the root keep depth and tree_order of -1.

    >> treetable([6, 2, 4, 5, 3], [7, 1, 2, 2, 1])
    rows sorted by tree_order, child_id column reading 7, 6, 1, 2, 4, 5, 3

    :param childreds: child ids, or a DataFrame in the second call form
    :param parents: parent ids of the same length, or the child column name
    :param arg3: parent column name (second call form only)
    :param nodename_colname: optional column whose value is appended to each
        node's tree_struct text (second call form only)
    :return: DataFrame sorted by tree_order; a DataFrame passed in is not modified
    :raises TypeError: a DataFrame was given without the parent column name
    :raises ValueError: every parent also appears as a child, i.e. there is a cycle
    """
    # 0 Normalise the two call forms
    if isinstance(childreds, pd.DataFrame):
        if not arg3:
            raise TypeError('treetable(df, child_colname, parent_colname): parent_colname is required')
        df = childreds
        child_colname, parent_colname = parents, arg3
        childreds = df[child_colname].tolist()
        parents = df[parent_colname].tolist()
    else:
        df = None

    # 1 Find or create the root, so every node except the root has a row
    cs, ps = list(childreds), list(parents)
    lefts = set(ps) - set(cs)  # parents that never occur as a child

    if len(lefts) == 0:
        # No top-level parent means there must be a cycle. The reverse does
        # not hold: a non-empty set does not prove the input is a proper tree.
        raise ValueError('有环,不是树结构')
    elif len(lefts) == 1:  # a single missing node is the root itself
        root = next(iter(lefts))
    else:  # several top-level nodes: hang them under one virtual root
        root = 'root'
        allnode = set(ps) | set(cs)
        while root in allnode:
            root += '-'  # extend the name until it collides with no input id
        # Keep the top-level nodes in order of first appearance among the parents.
        first_pos = {}
        for pos, p in enumerate(ps):
            first_pos.setdefault(p, pos)
        for t in sorted(lefts, key=first_pos.__getitem__):
            cs.append(t)
            ps.append(root)

    n = len(cs)
    depth, tree_order, len_childs = [-1] * n, [-1] * n, [0] * n

    # 2 Map each parent id to the row indexes of its children
    dd = defaultdict(list)
    for i in range(n):
        dd[ps[i]].append(i)

    # 3 Depth-first numbering
    cnt = 1

    def dfs(node, d):
        """Number all rows whose parent is ``node``, then recurse into each."""
        nonlocal cnt
        for i in dd.get(node, ()):
            tree_order[i], depth[i], len_childs[i] = cnt, d, len(dd.get(cs[i], ()))
            cnt += 1
            dfs(cs[i], d + 1)

    dfs(root, 1)

    # 4 Output
    def struct_text(i, label):
        suffix = f'[{len_childs[i]}]' if len_childs[i] else ''
        return f"{'_ _ ' * depth[i]}{label}" + suffix

    if df is None:
        tree_struct = [struct_text(i, cs[i]) for i in range(n)]
        ls = list(zip(cs, ps, depth, tree_order, len_childs, tree_struct))
        df = pd.DataFrame.from_records(ls, columns=('child_id', 'parent_id',
                                                    'depth', 'tree_order', 'len_childs', 'tree_struct'))
    else:
        k = len(df)
        if n > k:
            # DataFrame.append was removed in pandas 2.0; concat is the
            # supported way to add the virtual-root rows.
            extra = pd.DataFrame({child_colname: cs[k:], parent_colname: ps[k:]})
            df = pd.concat([df, extra], sort=False, ignore_index=True)
        else:
            df = df.reset_index(drop=True)  # still a fresh frame with a 0..n-1 index
        if nodename_colname:
            names = df[nodename_colname].tolist()
            tree_struct = [struct_text(i, f'{cs[i]} {names[i]}') for i in range(n)]
        else:
            tree_struct = [struct_text(i, cs[i]) for i in range(n)]
        df['depth'], df['tree_order'], df['len_childs'], df['tree_struct'] = depth, tree_order, len_childs, tree_struct

    df.sort_values('tree_order', inplace=True)  # NOTE: callers needing input order must re-sort by index
    return df
262
-
263
-
264
def treetable_flatten(df, *, reverse=False, childid_colname='id', parentid_colname='parent_id', format_colname=None):
    """Spread each node's ancestor chain across columns of ``df``.

    :param df: DataFrame with one row per node; it is extended in place and
        also returned
    :param reverse:
        False: columns depth0, depth1, ... run from the topmost ancestor down
            to the node itself
        True: columns depth-N, ..., depth-1 count from the node upwards;
            depth-1 is the node itself, depth-2 its parent, and so on
    :param childid_colname: column holding each node's id
    :param parentid_colname: column holding each node's parent id
    :param format_colname: column whose value is displayed for a node
        None: use ``childid_colname``
        str: use that column (lets callers prepare formatted labels)
    :return: ``df`` with the added columns; cells beyond a node's own chain
        length hold ''
    :raises ValueError: the index has duplicate labels, or the parent links
        contain a cycle
    """
    # 1 Lookup tables: node id -> parent id, node id -> display text
    if format_colname is None:
        format_colname = childid_colname  # documented default; was the parent column by mistake
    if df.index.has_duplicates:
        raise ValueError(f'duplicated index labels: {df.index.nunique()} unique out of {len(df.index)}')

    child_ids = df[childid_colname].tolist()
    parent_ids = df[parentid_colname].tolist()
    labels = [str(v) for v in df[format_colname].tolist()]
    parentid = dict(zip(child_ids, parent_ids))  # parentid[k] = parent of node k
    nodeval = dict(zip(child_ids, labels))  # nodeval[k] = text shown for node k

    # 2 Follow the parent links upwards from every node
    chains = []
    for cid, pid in zip(child_ids, parent_ids):
        chain = [nodeval[cid]]  # the node itself comes first, then its ancestors
        seen = {cid}
        p = pid
        while p in parentid:
            if p in seen:  # a cycle would otherwise loop forever
                raise ValueError(f'cycle in parent links at node {p!r}')
            seen.add(p)
            chain.append(nodeval[p])
            p = parentid[p]
        chains.append(chain)
    num_depth = max(map(len, chains), default=0)

    # 3 Write the columns. Whole columns are assigned at once, in row order.
    if reverse:
        for j in range(num_depth, 0, -1):
            df[f'depth-{j}'] = [c[j - 1] if j <= len(c) else '' for c in chains]
    else:
        for j in range(num_depth):
            df[f'depth{j}'] = [c[-j - 1] if j < len(c) else '' for c in chains]
    return df
314
-
315
-
316
- ____section_2_xml = """
317
- xml相关的一些功能函数
318
- """
319
-
320
-
321
def readurl(url, timeout=None):
    """Fetch ``url`` and return the page's visible text with markup removed.

    :param url: address to download with an HTTP GET
    :param timeout: seconds to wait for the server, passed straight to
        ``requests.get``; None (the default) keeps the previous behaviour of
        waiting indefinitely
    :return: text extracted by BeautifulSoup's ``get_text`` using the lxml parser
    """
    r = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(r.text, 'lxml')
    return soup.get_text()
327
-
328
-
329
- ____section_3_xmlparser = """
330
- """
331
-
332
-
333
def tag_name(t):
    """Return a display name for a bs4 node.

    :param t: a bs4 Tag or NavigableString
    :return: ``t.name`` when it is set, 'NavigableString' for text nodes,
        otherwise None (after reporting the node through ``dprint``)
    """
    if t.name:
        return t.name
    elif isinstance(t, bs4.NavigableString):
        return 'NavigableString'
    else:
        dprint(t)  # could not determine a name for this node
        return None


def subtag_names(t):
    """Summarise the direct children of node ``t`` as a run-length encoded list.

    Consecutive children with the same name collapse into ``name{count}``,
    e.g. for a document body: p{137},tbl,p{94},tbl,p{1640},sectPr

    :param t: a bs4 node
    :return: comma-separated summary; '' when ``t`` has no name or no children
    """
    from itertools import groupby  # local import keeps this block self-contained

    if not (t.name and t.contents):
        return ''

    # Group whole names, not characters of a joined string. A regex over the
    # joined text can match from the middle of a token, e.g. 'ap,p,p' would
    # wrongly collapse to 'ap{3}'. str() keeps an unnamed child (None) from
    # crashing the join.
    parts = []
    for name, run in groupby(str(tag_name(child)) for child in t.contents):
        count = sum(1 for _ in run)
        parts.append(name if count == 1 else f'{name}{{{count}}}')
    return ','.join(parts)
366
-
367
-
368
class XmlParser:
    """Inspection helpers for a bs4 element tree.

    The class either wraps a node passed to the constructor, or is mixed into
    a bs4 class (see MyBs4), in which case the instance itself is the tree.
    """

    def __init__(self, node=None):
        """Two ways to initialise
            node given: wrap that bs4 PageElement (or similar object)
            node omitted: for subclasses such as MyBs4 that are the tree themselves
        """
        if node is not None:  # TODO: could be extended to accept other input types
            self._node = node

    def node(self):
        """Return the wrapped node, or ``self`` when none was supplied."""
        held = getattr(self, '_node', None)  # default avoids AttributeError when unset
        return self if held is None else held

    def treestruct_raw(self, **kwargs):
        """Render the tree with dfs_base's default formatting.

        All keyword arguments are forwarded to dfs_base.
        """
        return dfs_base(self.node(), **kwargs)

    def treestruct_brief(self, linenum=True, prefix='- ', **kwargs):
        """Render a compact view of the tree, one line per node.

        Tags show their name and attributes, text nodes show their
        (shortened) text, and whitespace-only text shows as '<??>'.
        Remaining keyword arguments are forwarded to dfs_base.
        """
        def mystr(node):
            if isinstance(node, bs4.ProcessingInstruction):
                s = 'ProcessingInstruction,' + str(node)
            elif isinstance(node, bs4.Tag):
                s = node.name + ',' + mydictstr(node.attrs, item_delimit=',')
            elif isinstance(node, bs4.NavigableString):
                s = shorten(str(node), 200)
                if not s.strip():
                    s = '<??>'
            else:
                s = '遇到特殊类型,' + str(node)
            return s

        return dfs_base(self.node(), mystr=mystr, prefix=prefix, linenum=linenum, **kwargs)

    def treestruct_stat(self):
        """Build two statistics tables for the tree.

        df1, df2 = treestruct_stat()
            df1: one row per element (position, depth, parent, name,
                 attributes or text, direct-children summary)
            df2: one row per attribute of every tag
        This covers most of what count_tagname and check_tag provide.

        :return: (df1, df2) as pandas DataFrames
        """
        def text(t):
            """Clip a value for spreadsheet export.

            Results usually end up in Excel, so the text goes through
            ensure_gbk first and is then cut to 100 characters.
            """
            s = ensure_gbk(t)
            return s[:100]

        def depth(t):
            """Depth of node t, i.e. the number of its ancestors."""
            return len(tuple(t.parents))

        # Start from the wrapped node (not ``self``) so this also works when
        # the parser was built around an external node.
        contents = self.node().contents
        t = contents[0] if contents else None
        ls1 = []  # element table: focuses on structure
        ls2 = []  # attribute table: focuses on attributes
        i = 1
        # ``is not None`` rather than truthiness: an empty text node is falsy
        # and would otherwise end the walk early.
        while t is not None:
            # 1 element table
            d = depth(t)
            line = [i, d, '_' * d + str(d), tag_name(t.parent), tag_name(t),
                    text(mydictstr(t.attrs) if t.name else t),  # tags: attributes, strings: text
                    subtag_names(t)]
            ls1.append(line)
            # 2 attribute table
            if t.name:
                k = len(ls2)
                for attr, value in t.attrs.items():
                    ls2.append([k, i, tag_name(t), attr, value])
                    k += 1
            # next element in document order
            t = t.next_element
            i += 1
        df1 = pd.DataFrame.from_records(ls1, columns=['element序号', '层级', '结构', '父结点', '当前结点', '属性值/字符串值', '直接子结点结构'])
        df2 = pd.DataFrame.from_records(ls2, columns=['序号', 'element序号', '当前结点', '属性名', '属性值'])
        return df1, df2

    def count_tagname(self):
        """Count how often each node name occurs, most frequent first, e.g.
            1       w:rpr   650
            2    w:rfonts   650
            3      w:szcs   618
            4         w:r   565
            5        None   532
            6         w:t   531

        :return: list of (name, count) pairs as given by Counter.most_common()
        """
        ct = Counter()

        def inner(node):
            try:
                ct[node.name] += 1
                for t in node.children:
                    inner(t)
            except AttributeError:
                # objects lacking ``name`` or ``children`` end the descent here
                pass

        inner(self.node())
        return ct.most_common()

    def check_tag(self, tagname=None):
        """Count how often each node name occurs at each depth.

        :param tagname:
            None: walk the whole tree from the wrapped node (depth 0)
            't' etc.: only walk the subtrees of tags with that name, each
                match counted as depth 0; a same-named tag nested inside a
                match is reported through dprint
        :return: mapping name -> {depth: count}

        TODO check whether a tag contains a same-named tag?
        """
        d = defaultdict()

        def add(name, depth):
            if name not in d:
                d[name] = defaultdict(int)
            d[name][depth] += 1

        def inner(node, depth):
            if isinstance(node, bs4.ProcessingInstruction):
                add('ProcessingInstruction', depth)
            elif isinstance(node, bs4.Tag):
                if node.name == tagname and depth:
                    dprint(node, depth)  # a same-named tag nested inside tagname
                add(node.name, depth)
                for t in node.children:
                    inner(t, depth + 1)
            elif isinstance(node, bs4.NavigableString):
                add('NavigableString', depth)
            else:
                add('其他特殊结点', depth)

        if tagname:
            for t in self.node().find_all(tagname):
                inner(t, 0)
        else:
            inner(self.node(), 0)

        return d

    def check_namespace(self):
        """Find local tag names that occur under more than one qualified name, e.g.
            1       cNvPr   pic:cNvPr(579),wps:cNvPr(52),wpg:cNvPr(15)
            2        spPr   pic:spPr(579),wps:spPr(52)

        Only names directly following '<' in the serialised node are examined.

        :return: list of [local_name, 'qualified(count),...'] for the ambiguous
            names, in order of first appearance
        """
        # 1 Collect every qualified name from the raw XML text, so prefixes
        #   are guaranteed to be present.
        # NOTE(review): the pattern accepts letters and ':' only, so a name
        #   such as 'h1' is cut at the digit — confirm that is intended.
        ct0 = Counter(re.findall(r'<([a-zA-Z:]+)', str(self.node())))
        ct = defaultdict(str)
        s = set()  # local names seen with more than one qualified form
        for key, value in ct0.items():
            k = re.sub(r'.*:', '', key)
            if k in ct:
                s.add(k)
                ct[k] += f',{key}({value})'
            else:
                ct[k] = f'{key}({value})'

        # 2 Keep only the ambiguous names
        return [[k, v] for k, v in ct.items() if k in s]
550
-
551
-
552
class MyBs4(BeautifulSoup, XmlParser):
    """BeautifulSoup document that also carries the XmlParser helpers.

    Typical helpers for xml / html data:
        treestruct_brief: show the document structure
        count_tagname: count how often each node name occurs
    """

    def __init__(self, markup="", features='lxml', *args, **kwargs):
        # Every argument goes straight to BeautifulSoup; the only change is
        # that 'lxml' is the default parser.
        super().__init__(markup, features, *args, **kwargs)

    def insert_after(self, successor):
        """Do nothing: insertion relative to the document is stubbed out."""
        return None

    def insert_before(self, successor):
        """Do nothing: insertion relative to the document is stubbed out."""
        return None
568
-
569
-
570
- ____section_temp = """
571
- """
572
-
573
-
574
def mathjax_html_head(s):
    """Wrap an HTML fragment in a full page that loads the MathJax scripts.

    :param s: HTML body content (may contain TeX / MathML for MathJax to render)
    :return: a complete HTML document with ``s`` as the body
    """
    # The page used to emit a second <head> nested inside the first one; the
    # charset <meta> now sits directly in the single <head>.
    head = r"""<!DOCTYPE html>
<html>
<head>
<meta http-equiv=Content-Type content="text/html;charset=utf-8">
<script src="https://a.cdn.histudy.com/lib/config/mathjax_config-klxx.js?v=1.1"></script>
<script type="text/javascript" async src="https://a.cdn.histudy.com/lib/mathjax/2.7.1/MathJax/MathJax.js?config=TeX-AMS-MML_SVG">
MathJax.Hub.Config(MATHJAX_KLXX_CONFIG);
</script>
</head>
<body>"""
    tail = '</body></html>'
    return head + s + tail
588
-
589
-
590
def 自动制作网页标题的导航栏(html_content, title='temphtml'):
    """Generate a framed viewer with a heading-based navigation bar for a page.

    :param html_content: full text of the original page
    :param title: base name of the generated files
        (TODO: taking it from head/title is not implemented yet)
    :return: the value returned by writing the main frameset page

    How it works: every h-tag in the page gets an anchor in front of it, a
    separate navigation page links to those anchors, and a main page shows
    both side by side in a frameset.

    # Show a csdn blog post with its outline (skipped heading levels are not handled well)
    >> file = 自动制作网页标题的导航栏(requests.get(r'https://blog.csdn.net/code4101/article/details/83009000').content.decode('utf8'))
    >> chrome(str(file))
    http://i2.tiimg.com/582188/64f40d235705de69.png
    """
    # 1 Add anchors to the page and collect the navigation entries.
    # TODO skipped heading levels are not supported yet
    # The stylesheet is referenced from github; it could also be generated here.
    nav_lines = [
        '<html><head>',
        '<link rel=Stylesheet type="text/css" media=all href="https://code4101.github.io/css/navigation0.css">',
        '</head><body>',
    ]
    content_path = Path(title + '_内容', '.html', Path.TEMP)
    anchor_no = 0

    def add_anchor(match):
        nonlocal anchor_no
        anchor_no += 1
        level = match.group('name')
        heading = BeautifulSoup(match.group('inner'), 'lxml').get_text()
        nav_lines.append(
            f'<a href="{content_path}#生成导航栏浏览网页{anchor_no}" target="showframe"><{level}>{heading}</{level}></a>')
        return f'<a name="生成导航栏浏览网页{anchor_no}"/>' + match.group()

    heading_pattern = r'<(?P<name>h\d+)(?:>|\s.*?>)(?P<body>\s*(?P<inner>.*?)\s*)</\1>'
    anchored_html = re.sub(heading_pattern, add_anchor, html_content, flags=re.DOTALL)
    nav_lines.append('</body>\n</html>')

    nav_file = Path(title + '_导航', '.html', Path.TEMP).write('\n'.join(nav_lines), if_exists='replace')
    content_file = content_path.write(anchored_html, if_exists='replace')

    # 2 Main page: navigation on the left, content on the right.
    main_content = f"""<html>
<frameset cols="20%,80%">
<frame src="{nav_file}">
<frame src="{content_file}" name="showframe">
</frameset></html>"""

    return Path(title, '.html', Path.TEMP).write(main_content, if_exists='replace')
@@ -1,39 +0,0 @@
1
- Metadata-Version: 2.1
2
- Name: pyxllib
3
- Version: 0.0.43
4
- Summary: 厦门理工模式识别团队通用python代码工具库
5
- Home-page: https://github.com/XLPRUtils/pyxllib
6
- Author: code4101
7
- Author-email: 877362867@qq.com
8
- License: Apache License 2.0
9
- Keywords: pyxllib
10
- Platform: UNKNOWN
11
- Classifier: Programming Language :: Python :: 3
12
- Classifier: Operating System :: OS Independent
13
- Requires-Python: >=3.6
14
- Description-Content-Type: text/markdown
15
- Requires-Dist: arrow
16
- Requires-Dist: chardet
17
- Requires-Dist: requests
18
- Requires-Dist: qiniu
19
- Requires-Dist: pyyaml
20
- Requires-Dist: disjoint-set (==0.6.3)
21
- Requires-Dist: coloredlogs
22
-
23
- # 1 install
24
-
25
- 工具包已经提交到pypi: https://pypi.org/project/pyxllib/
26
- 可以直接安装:
27
-
28
- ```
29
- pip install pyxllib
30
- ```
31
-
32
- 更详细的安装问题见:https://www.yuque.com/xlpr/pyxllib/install
33
-
34
- # 2 document
35
-
36
- 使用文档: https://www.yuque.com/xlpr/pyxllib ,
37
- 正在努力完善中,欢迎更多小伙伴一起助力
38
-
39
-