pyxllib 0.3.197__py3-none-any.whl → 0.3.200__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyxllib/__init__.py +21 -21
- pyxllib/algo/__init__.py +8 -8
- pyxllib/algo/disjoint.py +54 -54
- pyxllib/algo/geo.py +541 -541
- pyxllib/algo/intervals.py +964 -964
- pyxllib/algo/matcher.py +389 -389
- pyxllib/algo/newbie.py +166 -166
- pyxllib/algo/pupil.py +629 -629
- pyxllib/algo/shapelylib.py +67 -67
- pyxllib/algo/specialist.py +241 -241
- pyxllib/algo/stat.py +494 -494
- pyxllib/algo/treelib.py +149 -149
- pyxllib/algo/unitlib.py +66 -66
- pyxllib/autogui/__init__.py +5 -5
- pyxllib/autogui/activewin.py +246 -246
- pyxllib/autogui/all.py +9 -9
- pyxllib/autogui/autogui.py +852 -852
- pyxllib/autogui/uiautolib.py +362 -362
- pyxllib/autogui/virtualkey.py +102 -102
- pyxllib/autogui/wechat.py +827 -827
- pyxllib/autogui/wechat_msg.py +421 -421
- pyxllib/autogui/wxautolib.py +84 -84
- pyxllib/cv/__init__.py +5 -5
- pyxllib/cv/expert.py +267 -267
- pyxllib/cv/imfile.py +159 -159
- pyxllib/cv/imhash.py +39 -39
- pyxllib/cv/pupil.py +9 -9
- pyxllib/cv/rgbfmt.py +1525 -1525
- pyxllib/cv/slidercaptcha.py +137 -137
- pyxllib/cv/trackbartools.py +251 -251
- pyxllib/cv/xlcvlib.py +1040 -1040
- pyxllib/cv/xlpillib.py +423 -423
- pyxllib/data/echarts.py +240 -240
- pyxllib/data/jsonlib.py +89 -89
- pyxllib/data/oss.py +72 -72
- pyxllib/data/pglib.py +1127 -1127
- pyxllib/data/sqlite.py +568 -568
- pyxllib/data/sqllib.py +297 -297
- pyxllib/ext/JLineViewer.py +505 -505
- pyxllib/ext/__init__.py +6 -6
- pyxllib/ext/demolib.py +246 -246
- pyxllib/ext/drissionlib.py +277 -277
- pyxllib/ext/kq5034lib.py +12 -12
- pyxllib/ext/old.py +663 -663
- pyxllib/ext/qt.py +449 -449
- pyxllib/ext/robustprocfile.py +497 -497
- pyxllib/ext/seleniumlib.py +76 -76
- pyxllib/ext/tk.py +173 -173
- pyxllib/ext/unixlib.py +827 -827
- pyxllib/ext/utools.py +351 -351
- pyxllib/ext/webhook.py +124 -119
- pyxllib/ext/win32lib.py +40 -40
- pyxllib/ext/wjxlib.py +88 -88
- pyxllib/ext/wpsapi.py +124 -124
- pyxllib/ext/xlwork.py +9 -9
- pyxllib/ext/yuquelib.py +1105 -1105
- pyxllib/file/__init__.py +17 -17
- pyxllib/file/docxlib.py +761 -761
- pyxllib/file/gitlib.py +309 -309
- pyxllib/file/libreoffice.py +165 -165
- pyxllib/file/movielib.py +148 -148
- pyxllib/file/newbie.py +10 -10
- pyxllib/file/onenotelib.py +1469 -1469
- pyxllib/file/packlib/__init__.py +330 -330
- pyxllib/file/packlib/zipfile.py +2441 -2441
- pyxllib/file/pdflib.py +426 -426
- pyxllib/file/pupil.py +185 -185
- pyxllib/file/specialist/__init__.py +685 -685
- pyxllib/file/specialist/dirlib.py +799 -799
- pyxllib/file/specialist/download.py +193 -193
- pyxllib/file/specialist/filelib.py +2829 -2829
- pyxllib/file/xlsxlib.py +3131 -3131
- pyxllib/file/xlsyncfile.py +341 -341
- pyxllib/prog/__init__.py +5 -5
- pyxllib/prog/cachetools.py +64 -64
- pyxllib/prog/deprecatedlib.py +233 -233
- pyxllib/prog/filelock.py +42 -42
- pyxllib/prog/ipyexec.py +253 -253
- pyxllib/prog/multiprogs.py +940 -940
- pyxllib/prog/newbie.py +451 -451
- pyxllib/prog/pupil.py +1197 -1197
- pyxllib/prog/sitepackages.py +33 -33
- pyxllib/prog/specialist/__init__.py +391 -391
- pyxllib/prog/specialist/bc.py +203 -203
- pyxllib/prog/specialist/browser.py +497 -497
- pyxllib/prog/specialist/common.py +347 -347
- pyxllib/prog/specialist/datetime.py +198 -198
- pyxllib/prog/specialist/tictoc.py +240 -240
- pyxllib/prog/specialist/xllog.py +180 -180
- pyxllib/prog/xlosenv.py +108 -108
- pyxllib/stdlib/__init__.py +17 -17
- pyxllib/stdlib/tablepyxl/__init__.py +10 -10
- pyxllib/stdlib/tablepyxl/style.py +303 -303
- pyxllib/stdlib/tablepyxl/tablepyxl.py +130 -130
- pyxllib/text/__init__.py +8 -8
- pyxllib/text/ahocorasick.py +39 -39
- pyxllib/text/airscript.js +744 -744
- pyxllib/text/charclasslib.py +121 -121
- pyxllib/text/jiebalib.py +267 -267
- pyxllib/text/jinjalib.py +32 -32
- pyxllib/text/jsa_ai_prompt.md +271 -271
- pyxllib/text/jscode.py +922 -922
- pyxllib/text/latex/__init__.py +158 -158
- pyxllib/text/levenshtein.py +303 -303
- pyxllib/text/nestenv.py +1215 -1215
- pyxllib/text/newbie.py +300 -300
- pyxllib/text/pupil/__init__.py +8 -8
- pyxllib/text/pupil/common.py +1121 -1121
- pyxllib/text/pupil/xlalign.py +326 -326
- pyxllib/text/pycode.py +47 -47
- pyxllib/text/specialist/__init__.py +8 -8
- pyxllib/text/specialist/common.py +112 -112
- pyxllib/text/specialist/ptag.py +186 -186
- pyxllib/text/spellchecker.py +172 -172
- pyxllib/text/templates/echart_base.html +10 -10
- pyxllib/text/templates/highlight_code.html +16 -16
- pyxllib/text/templates/latex_editor.html +102 -102
- pyxllib/text/vbacode.py +17 -17
- pyxllib/text/xmllib.py +747 -747
- pyxllib/xl.py +42 -39
- pyxllib/xlcv.py +17 -17
- {pyxllib-0.3.197.dist-info → pyxllib-0.3.200.dist-info}/METADATA +1 -1
- pyxllib-0.3.200.dist-info/RECORD +126 -0
- {pyxllib-0.3.197.dist-info → pyxllib-0.3.200.dist-info}/licenses/LICENSE +190 -190
- pyxllib-0.3.197.dist-info/RECORD +0 -126
- {pyxllib-0.3.197.dist-info → pyxllib-0.3.200.dist-info}/WHEEL +0 -0
pyxllib/text/xmllib.py
CHANGED
@@ -1,747 +1,747 @@
|
|
1
|
-
#!/usr/bin/env python3
|
2
|
-
# -*- coding: utf-8 -*-
|
3
|
-
# @Author : 陈坤泽
|
4
|
-
# @Email : 877362867@qq.com
|
5
|
-
# @Date : 2020/06/02 20:16
|
6
|
-
|
7
|
-
"""
|
8
|
-
xml等网页结构方面的处理
|
9
|
-
"""
|
10
|
-
|
11
|
-
# from pyxllib.prog.pupil import check_install_package
|
12
|
-
|
13
|
-
# 一个xpath解析库
|
14
|
-
# check_install_package('xpath_parser', 'xpath-parser')
|
15
|
-
|
16
|
-
import collections
|
17
|
-
from collections import Counter, defaultdict
|
18
|
-
import re
|
19
|
-
import os
|
20
|
-
|
21
|
-
import requests
|
22
|
-
import pandas as pd
|
23
|
-
import bs4
|
24
|
-
from bs4 import BeautifulSoup
|
25
|
-
from humanfriendly import format_size
|
26
|
-
# from xpath_parser import XpathExpression
|
27
|
-
|
28
|
-
from pyxllib.prog.newbie import round_int
|
29
|
-
from pyxllib.prog.pupil import dprint, run_once, inject_members
|
30
|
-
from pyxllib.prog.specialist import browser
|
31
|
-
from pyxllib.algo.pupil import SearchBase
|
32
|
-
from pyxllib.text.newbie import xldictstr
|
33
|
-
from pyxllib.text.pupil import shorten, ensure_gbk, BookContents, strwidth, grp_chinese_char
|
34
|
-
from pyxllib.file.specialist import File, Dir, get_etag
|
35
|
-
from pyxllib.text.jinjalib import get_jinja_template
|
36
|
-
|
37
|
-
|
38
|
-
class XlBs4Tag(bs4.element.Tag):
|
39
|
-
|
40
|
-
@property
|
41
|
-
def tag_name(self):
|
42
|
-
"""输入一个bs4的Tag或NavigableString,
|
43
|
-
返回tag.name或者'NavigableString'
|
44
|
-
"""
|
45
|
-
if self.name:
|
46
|
-
return self.name
|
47
|
-
elif isinstance(self, bs4.element.NavigableString):
|
48
|
-
return 'NavigableString'
|
49
|
-
else:
|
50
|
-
dprint(self) # 获取结点t名称失败
|
51
|
-
return None
|
52
|
-
|
53
|
-
def subtag_names(self):
|
54
|
-
""" 列出结点的所有直接子结点(花括号后面跟的数字是连续出现次数)
|
55
|
-
例如body的: p{137},tbl,p{94},tbl,p{1640},sectPr
|
56
|
-
"""
|
57
|
-
|
58
|
-
def counter(m):
|
59
|
-
s1 = m.group(1)
|
60
|
-
n = (m.end(0) - m.start(0)) // len(s1)
|
61
|
-
s = s1[:-1] + '{' + str(n) + '}'
|
62
|
-
if m.string[m.end(0) - 1] == ',':
|
63
|
-
s += ','
|
64
|
-
return s
|
65
|
-
|
66
|
-
if self.name and self.contents:
|
67
|
-
s = ','.join([x.tag_name for x in self.contents]) + ','
|
68
|
-
s = re.sub(r'([^,]+,)(\1)+', counter, s)
|
69
|
-
else:
|
70
|
-
s = ''
|
71
|
-
if s and s[-1] == ',':
|
72
|
-
s = s[:-1]
|
73
|
-
return s
|
74
|
-
|
75
|
-
def treestruct_raw(self, **kwargs):
|
76
|
-
""" 查看树形结构的raw版本
|
77
|
-
各参数含义详见dfs_base
|
78
|
-
"""
|
79
|
-
# 1 先用dfs获得基本结果
|
80
|
-
sb = SearchBase(self)
|
81
|
-
s = sb.fmt_nodes(**kwargs)
|
82
|
-
return s
|
83
|
-
|
84
|
-
def treestruct_brief(self, linenum=True, prefix='- ', **kwargs):
|
85
|
-
""" 查看树形结构的简洁版
|
86
|
-
"""
|
87
|
-
|
88
|
-
class Search(SearchBase):
|
89
|
-
def fmt_node(self, node, depth, *, prefix=prefix, show_node_type=False):
|
90
|
-
if isinstance(node, bs4.element.ProcessingInstruction):
|
91
|
-
s = 'ProcessingInstruction,' + str(node)
|
92
|
-
elif isinstance(node, bs4.element.Tag):
|
93
|
-
s = node.name + ',' + xldictstr(node.attrs, item_delimit=',')
|
94
|
-
elif isinstance(node, bs4.element.NavigableString):
|
95
|
-
s = shorten(str(node), 200)
|
96
|
-
if not s.strip():
|
97
|
-
s = '<??>'
|
98
|
-
else:
|
99
|
-
s = '遇到特殊类型,' + str(node)
|
100
|
-
return (prefix * depth) + s
|
101
|
-
|
102
|
-
search = Search(self)
|
103
|
-
res = search.fmt_nodes(linenum=linenum, **kwargs)
|
104
|
-
return res
|
105
|
-
|
106
|
-
def treestruct_stat(self):
|
107
|
-
"""生成一个两个二维表的统计数据
|
108
|
-
ls1, ls2 = treestruct_stat()
|
109
|
-
ls1: 结点规律表
|
110
|
-
ls2: 属性规律表
|
111
|
-
count_tagname、check_tag的功能基本都可以被这个函数代替
|
112
|
-
"""
|
113
|
-
|
114
|
-
def text(t):
|
115
|
-
""" 考虑到结果一般都是存储到excel,所以会把无法存成gbk的字符串删掉
|
116
|
-
另外控制了每个元素的长度上限
|
117
|
-
"""
|
118
|
-
s = ensure_gbk(t)
|
119
|
-
s = s[:100]
|
120
|
-
return s
|
121
|
-
|
122
|
-
def depth(t):
|
123
|
-
"""结点t的深度"""
|
124
|
-
return len(tuple(t.parents))
|
125
|
-
|
126
|
-
t = self.contents[0]
|
127
|
-
# ls1 = [['element序号', '层级', '结构', '父结点', '当前结点', '属性值/字符串值', '直接子结点结构']]
|
128
|
-
# ls2 = [['序号', 'element序号', '当前结点', '属性名', '属性值']] #
|
129
|
-
ls1 = [] # 这个重点是分析结点规律
|
130
|
-
ls2 = [] # 这个重点是分析属性规律
|
131
|
-
i = 1
|
132
|
-
while t:
|
133
|
-
# 1 结点规律表
|
134
|
-
d = depth(t)
|
135
|
-
line = [i, d, '_' * d + str(d), t.parent.tag_name, t.tag_name,
|
136
|
-
text(xldictstr(t.attrs) if t.name else t), # 结点存属性,字符串存值
|
137
|
-
t.subtag_names()]
|
138
|
-
ls1.append(line)
|
139
|
-
# 2 属性规律表
|
140
|
-
if t.name:
|
141
|
-
k = len(ls2)
|
142
|
-
for attr, value in t.attrs.items():
|
143
|
-
ls2.append([k, i, t.tag_name, attr, value])
|
144
|
-
k += 1
|
145
|
-
# 下个结点
|
146
|
-
t = t.next_element
|
147
|
-
i += 1
|
148
|
-
df1 = pd.DataFrame.from_records(ls1,
|
149
|
-
columns=['element序号', '层级', '结构', '父结点', '当前结点', '属性值/字符串值',
|
150
|
-
'直接子结点结构'])
|
151
|
-
df2 = pd.DataFrame.from_records(ls2, columns=['序号', 'element序号', '当前结点', '属性名', '属性值'])
|
152
|
-
return df1, df2
|
153
|
-
|
154
|
-
def count_tagname(self):
|
155
|
-
"""统计每个标签出现的次数:
|
156
|
-
1 w:rpr 650
|
157
|
-
2 w:rfonts 650
|
158
|
-
3 w:szcs 618
|
159
|
-
4 w:r 565
|
160
|
-
5 None 532
|
161
|
-
6 w:t 531
|
162
|
-
"""
|
163
|
-
ct = collections.Counter()
|
164
|
-
|
165
|
-
def inner(node):
|
166
|
-
try:
|
167
|
-
ct[node.name] += 1
|
168
|
-
for t in node.children:
|
169
|
-
inner(t)
|
170
|
-
except AttributeError:
|
171
|
-
pass
|
172
|
-
|
173
|
-
inner(self)
|
174
|
-
return ct.most_common()
|
175
|
-
|
176
|
-
def check_tag(self, tagname=None):
|
177
|
-
""" 统计每个标签在不同层级出现的次数:
|
178
|
-
|
179
|
-
:param tagname:
|
180
|
-
None:统计全文出现的各种标签在不同层级出现次数
|
181
|
-
't'等值: tagname参数允许只检查特殊标签情况,此时会将所有tagname设为第0级
|
182
|
-
|
183
|
-
TODO 检查一个标签内部是否有同名标签?
|
184
|
-
"""
|
185
|
-
d = defaultdict()
|
186
|
-
|
187
|
-
def add(name, depth):
|
188
|
-
if name not in d:
|
189
|
-
d[name] = defaultdict(int)
|
190
|
-
d[name][depth] += 1
|
191
|
-
|
192
|
-
def inner(node, depth):
|
193
|
-
if isinstance(node, bs4.element.ProcessingInstruction):
|
194
|
-
add('ProcessingInstruction', depth)
|
195
|
-
elif isinstance(node, bs4.element.Tag):
|
196
|
-
if node.name == tagname and depth:
|
197
|
-
dprint(node, depth) # tagname里有同名子标签
|
198
|
-
add(node.name, depth)
|
199
|
-
for t in node.children:
|
200
|
-
inner(t, depth + 1)
|
201
|
-
elif isinstance(node, bs4.element.NavigableString):
|
202
|
-
add('NavigableString', depth)
|
203
|
-
else:
|
204
|
-
add('其他特殊结点', depth)
|
205
|
-
|
206
|
-
# 1 统计结点在每一层出现的次数
|
207
|
-
if tagname:
|
208
|
-
for t in self.find_all(tagname):
|
209
|
-
inner(t, 0)
|
210
|
-
else:
|
211
|
-
inner(self, 0)
|
212
|
-
|
213
|
-
# 2 总出现次数和?
|
214
|
-
|
215
|
-
return d
|
216
|
-
|
217
|
-
def check_namespace(self):
|
218
|
-
"""检查名称空间问题,会同时检查标签名和属性名:
|
219
|
-
1 cNvPr pic:cNvPr(579),wps:cNvPr(52),wpg:cNvPr(15)
|
220
|
-
2 spPr pic:spPr(579),wps:spPr(52)
|
221
|
-
"""
|
222
|
-
# 1 获得所有名称
|
223
|
-
# 因为是采用node的原始xml文本,所以能保证会取得带有名称空间的文本内容
|
224
|
-
ct0 = Counter(re.findall(r'<([a-zA-Z:]+)', str(self)))
|
225
|
-
ct = defaultdict(str)
|
226
|
-
s = set()
|
227
|
-
for key, value in ct0.items():
|
228
|
-
k = re.sub(r'.*:', '', key)
|
229
|
-
if k in ct:
|
230
|
-
s.add(k)
|
231
|
-
ct[k] += f',{key}({value})'
|
232
|
-
else:
|
233
|
-
ct[k] = f'{key}({value})'
|
234
|
-
|
235
|
-
# 2 对有重复和无重复的元素划分存储
|
236
|
-
ls1 = [] # 有重复的存储到ls1
|
237
|
-
ls2 = [] # 没有重复的正常结果存储到ls2,可以不显示
|
238
|
-
for k, v in ct.items():
|
239
|
-
if k in s:
|
240
|
-
ls1.append([k, v])
|
241
|
-
else:
|
242
|
-
ls2.append([k, v])
|
243
|
-
|
244
|
-
# 3 显示有重复的情况
|
245
|
-
# browser(ls1, filename='检查名称空间问题')
|
246
|
-
return ls1
|
247
|
-
|
248
|
-
def get_catalogue(self, *args, size=False, start_level=-1, **kwargs):
|
249
|
-
""" 找到所有的h生成文本版的目录
|
250
|
-
|
251
|
-
:param bool|int size: 布尔或者乘因子,表示是否展示文本,以及乘以倍率,比如双语阅读时,size可以缩放一半
|
252
|
-
|
253
|
-
*args, **kwargs 参考 BookContents.format_str
|
254
|
-
|
255
|
-
注意这里算法跟css样式不太一样,避免这里能写代码,能做更细腻的操作
|
256
|
-
"""
|
257
|
-
bc = BookContents()
|
258
|
-
for h in self.find_all(re.compile(r'h\d')):
|
259
|
-
if size:
|
260
|
-
part_size = h.section_text_size(size, fmt=True)
|
261
|
-
bc.add(int(h.name[1]), h.get_text().replace('\n', ' ').strip(), part_size)
|
262
|
-
else:
|
263
|
-
bc.add(int(h.name[1]), h.get_text().replace('\n', ' ').strip())
|
264
|
-
|
265
|
-
if 'page' not in kwargs:
|
266
|
-
kwargs['page'] = size
|
267
|
-
|
268
|
-
if bc.contents:
|
269
|
-
return bc.format_str(*args, start_level=start_level, **kwargs)
|
270
|
-
else:
|
271
|
-
return ''
|
272
|
-
|
273
|
-
def section_text_size(self, factor=1, fmt=False):
|
274
|
-
""" 计算某节标题下的正文内容长度 """
|
275
|
-
if not re.match(r'h\d+$', self.name):
|
276
|
-
raise TypeError
|
277
|
-
|
278
|
-
# 这应该是相对比较简便的计算每一节内容多长的算法~~
|
279
|
-
part_size = 0
|
280
|
-
for x in self.next_siblings:
|
281
|
-
if x.name == self.name:
|
282
|
-
break
|
283
|
-
else:
|
284
|
-
text = str(x) if isinstance(x, bs4.element.NavigableString) else x.get_text()
|
285
|
-
part_size += strwidth(text)
|
286
|
-
part_size = round_int(part_size * factor)
|
287
|
-
|
288
|
-
if fmt:
|
289
|
-
return format_size(part_size).replace(' ', '').replace('bytes', 'B')
|
290
|
-
else:
|
291
|
-
return part_size
|
292
|
-
|
293
|
-
def head_add_size(self, factor=1):
|
294
|
-
""" 标题增加每节内容大小标记
|
295
|
-
|
296
|
-
:param factor: 乘因子,默认是1。但双语阅读等情况,内容会多拷贝一份,此时可以乘以0.5,显示正常原文的大小。
|
297
|
-
"""
|
298
|
-
for h in self.find_all(re.compile(r'h\d')):
|
299
|
-
part_size = h.section_text_size(factor, fmt=True)
|
300
|
-
navi_str = list(h.strings)[-1].rstrip()
|
301
|
-
navi_str.replace_with(str(navi_str) + ',' + part_size)
|
302
|
-
|
303
|
-
def head_add_number(self, start_level=-1, jump=True):
|
304
|
-
""" 标题增加每节编号
|
305
|
-
"""
|
306
|
-
bc = BookContents()
|
307
|
-
heads = list(self.find_all(re.compile(r'h\d')))
|
308
|
-
for h in heads:
|
309
|
-
bc.add(int(h.name[1]), h.get_text().replace('\n', ' '))
|
310
|
-
|
311
|
-
if not bc.contents:
|
312
|
-
return
|
313
|
-
|
314
|
-
nums = bc.format_numbers(start_level=start_level, jump=jump)
|
315
|
-
for i, h in enumerate(heads):
|
316
|
-
navi_strs = list(h.strings)
|
317
|
-
if navi_strs:
|
318
|
-
navi_str = navi_strs[0]
|
319
|
-
if nums[i]:
|
320
|
-
navi_str.replace_with(nums[i] + ' ' + str(navi_str))
|
321
|
-
else:
|
322
|
-
h.string = nums[i]
|
323
|
-
|
324
|
-
def xltext(self):
|
325
|
-
""" 自己特用的文本化方法
|
326
|
-
|
327
|
-
有些空格会丢掉,要用这句转回来
|
328
|
-
|
329
|
-
210924周五20:23,但后续实验又遭到了质疑,目前这功能虽然留着,但不建议使用
|
330
|
-
"""
|
331
|
-
# return self.prettify(formatter=lambda s: s.replace(u'\xa0', ' '))
|
332
|
-
# \xa0好像是些特殊字符,删掉就行。。。 不对,也不是特殊字符~~
|
333
|
-
# return self.prettify(formatter=lambda s: s.replace(u'\xa0', ''))
|
334
|
-
# return self.prettify()
|
335
|
-
return str(self)
|
336
|
-
|
337
|
-
def browser(self):
|
338
|
-
browser.html(self)
|
339
|
-
|
340
|
-
@run_once('id,str')
|
341
|
-
def get_nonempty_childrens(self, *args):
|
342
|
-
""" 获得所有Tag类型的直接子结点 (偏定制,不是那么通用的接口)
|
343
|
-
|
344
|
-
会同时检查NavigableString类型,且必须是空白字符串,比如空格、\n之类
|
345
|
-
"""
|
346
|
-
|
347
|
-
def check(x):
|
348
|
-
if isinstance(x, bs4.element.Tag):
|
349
|
-
return True
|
350
|
-
elif isinstance(x, bs4.element.Comment):
|
351
|
-
return False
|
352
|
-
elif isinstance(x, bs4.element.NavigableString):
|
353
|
-
assert not x.strip(), f'非空字符串值:{x}'
|
354
|
-
return False
|
355
|
-
else:
|
356
|
-
raise ValueError(f'未见类型 {x}')
|
357
|
-
|
358
|
-
ls = list(filter(check, self.children))
|
359
|
-
|
360
|
-
if len(args):
|
361
|
-
return ls[args[0]].get_nonempty_childrens(*args[1:])
|
362
|
-
else:
|
363
|
-
return ls
|
364
|
-
|
365
|
-
def get_nonempty_children(self, *args):
|
366
|
-
""" 输入args下标,指定获得某一个非空子结点 """
|
367
|
-
if len(args):
|
368
|
-
ls = self.get_nonempty_childrens(*args[:-1])
|
369
|
-
return ls[args[-1]]
|
370
|
-
else:
|
371
|
-
return self
|
372
|
-
|
373
|
-
def next_preorder_node(self, iter_child=True):
|
374
|
-
""" 自己写的先序遍历
|
375
|
-
|
376
|
-
主要应用在xml、bs4相关遍历检索时,有时候遇到特殊结点
|
377
|
-
可能子结点不需要解析
|
378
|
-
或者整个cur_node和子结点已经被解析完了,不需要再按照通常的先序遍历继续进入子结点
|
379
|
-
此时可以 iter_child=False,进入下一个兄弟结点
|
380
|
-
"""
|
381
|
-
# 传入的不一定是一个Tag结点~~
|
382
|
-
if not isinstance(self, bs4.element.Tag):
|
383
|
-
return None
|
384
|
-
|
385
|
-
if iter_child and self.contents:
|
386
|
-
return self.contents[0]
|
387
|
-
else:
|
388
|
-
cur_node = self
|
389
|
-
while True:
|
390
|
-
parent = cur_node.parent
|
391
|
-
if parent is None:
|
392
|
-
return None
|
393
|
-
sibing = cur_node.find_next_sibling()
|
394
|
-
if sibing:
|
395
|
-
return sibing
|
396
|
-
cur_node = parent
|
397
|
-
|
398
|
-
def find_by_xpath(self, xpath):
|
399
|
-
""" 使用xpath定位元素
|
400
|
-
|
401
|
-
bs4官方没有自带,网上找到的很多也不中意。就自己根据需求简单定制一下。非完整版实现,但希望能支持常用的几个操作。
|
402
|
-
好在还是有现成的xpath解析库的,自己扩展实现也不会太难。
|
403
|
-
"""
|
404
|
-
from xpath_parser import XpathExpression
|
405
|
-
|
406
|
-
xp = XpathExpression(xpath)
|
407
|
-
|
408
|
-
cur_tag = self
|
409
|
-
for node in xp.nodes:
|
410
|
-
if node.name == '*':
|
411
|
-
name = None
|
412
|
-
else:
|
413
|
-
name = node.name
|
414
|
-
|
415
|
-
# TODO 其他前缀功能: .. 父结点, / 根节点
|
416
|
-
recursive = node.ignore_position
|
417
|
-
|
418
|
-
attrs = {}
|
419
|
-
limit = 1
|
420
|
-
for a in node.attrs:
|
421
|
-
if a[0] == '@':
|
422
|
-
k, v = a.split('=')
|
423
|
-
attrs[k[1:]] = v[1:-1]
|
424
|
-
elif re.match(r'\d+$', a): # 索引下标
|
425
|
-
limit = int(a)
|
426
|
-
else:
|
427
|
-
raise NotImplementedError
|
428
|
-
|
429
|
-
# node.type没用上,应该有些需要用途的
|
430
|
-
|
431
|
-
sub_tags = cur_tag.find_all(name, attrs, recursive, limit=limit)
|
432
|
-
if sub_tags:
|
433
|
-
cur_tag = sub_tags[-1]
|
434
|
-
else: # 没找到
|
435
|
-
return None
|
436
|
-
|
437
|
-
return cur_tag
|
438
|
-
|
439
|
-
def __修改功能(self):
|
440
|
-
pass
|
441
|
-
|
442
|
-
@classmethod
|
443
|
-
def _to_node(cls, html):
|
444
|
-
""" 输入可以是字符串、文档、结点 """
|
445
|
-
if isinstance(html, str):
|
446
|
-
new_node = next(BeautifulSoup(html, 'lxml').body.children)
|
447
|
-
elif html.find('body'):
|
448
|
-
new_node = next(html.body.children)
|
449
|
-
else:
|
450
|
-
new_node = html
|
451
|
-
return new_node
|
452
|
-
|
453
|
-
@classmethod
|
454
|
-
def _to_nodes(cls, html):
|
455
|
-
""" 输入可以是字符串、文档、结点 """
|
456
|
-
if isinstance(html, str):
|
457
|
-
new_nodes = list(BeautifulSoup(html, 'lxml').body.children)
|
458
|
-
elif html.find('body'):
|
459
|
-
new_nodes = list(html.body.children)
|
460
|
-
else:
|
461
|
-
new_nodes = [html]
|
462
|
-
return new_nodes
|
463
|
-
|
464
|
-
def replace_html_with(self, html):
|
465
|
-
nodes = self._to_nodes(html) # 支持替换成多个节点
|
466
|
-
if not nodes:
|
467
|
-
return
|
468
|
-
self.replace_with(nodes[0])
|
469
|
-
|
470
|
-
cur = nodes[0]
|
471
|
-
for node in nodes[1:]:
|
472
|
-
cur.insert_after(node)
|
473
|
-
cur = node
|
474
|
-
|
475
|
-
def insert_html_before(self, html):
|
476
|
-
nodes = self._to_nodes(html)
|
477
|
-
if not nodes:
|
478
|
-
return
|
479
|
-
self.insert_before(nodes[0])
|
480
|
-
|
481
|
-
cur = nodes[0]
|
482
|
-
for node in nodes[1:]:
|
483
|
-
cur.insert_after(node)
|
484
|
-
cur = node
|
485
|
-
|
486
|
-
def insert_html_after(self, html):
|
487
|
-
nodes = self._to_nodes(html)
|
488
|
-
if not nodes:
|
489
|
-
return
|
490
|
-
|
491
|
-
cur = self
|
492
|
-
for node in nodes:
|
493
|
-
cur.insert_after(node)
|
494
|
-
cur = node
|
495
|
-
|
496
|
-
def append_html(self, html):
|
497
|
-
""" 原append的扩展 """
|
498
|
-
nodes = self._to_nodes(html)
|
499
|
-
for node in nodes:
|
500
|
-
self.append(node)
|
501
|
-
|
502
|
-
|
503
|
-
inject_members(XlBs4Tag, bs4.element.Tag)
|
504
|
-
# 这样虽然不优雅,但主要是让特殊的String类型也支持兼容tag_name属性
|
505
|
-
inject_members(XlBs4Tag, bs4.element.NavigableString)
|
506
|
-
|
507
|
-
|
508
|
-
def mathjax_html_head(s):
|
509
|
-
"""增加mathjax解析脚本"""
|
510
|
-
head = r"""<!DOCTYPE html>
|
511
|
-
<html>
|
512
|
-
<head>
|
513
|
-
<head><meta http-equiv=Content-Type content="text/html;charset=utf-8"></head>
|
514
|
-
<script src="https://a.cdn.histudy.com/lib/config/mathjax_config-klxx.js?v=1.1"></script>
|
515
|
-
<script type="text/javascript" async src="https://a.cdn.histudy.com/lib/mathjax/2.7.1/MathJax/MathJax.js?config=TeX-AMS-MML_SVG">
|
516
|
-
MathJax.Hub.Config(MATHJAX_KLXX_CONFIG);
|
517
|
-
</script>
|
518
|
-
</head>
|
519
|
-
<body>"""
|
520
|
-
tail = '</body></html>'
|
521
|
-
return head + s + tail
|
522
|
-
|
523
|
-
|
524
|
-
def html_bitran_template(htmlcontent):
|
525
|
-
""" 双语翻译的html模板,html bilingual translation template
|
526
|
-
|
527
|
-
一般是将word导出的html文件,转成方便谷歌翻译操作,进行双语对照的格式
|
528
|
-
|
529
|
-
基本原理,是利用chrome识别class="notranslate"标记会跳过不翻译的特性
|
530
|
-
对正文标签p拷贝两份,一份原文,一份带notranslate标记的内容
|
531
|
-
这样在执行谷歌翻译后,就能出现双语对照的效果
|
532
|
-
|
533
|
-
其实最好的办法,是能调用翻译API,直接给出双语成果的html
|
534
|
-
但谷歌的googletrans连不上外网无法使用
|
535
|
-
其他公司的翻译接口应该没问题,但我嫌其可能没有google好,以及不是重点,就先暂缓开发
|
536
|
-
---
|
537
|
-
习惯来说,一般上面是英文,下面是中文,但是我又想使用中文标题~~
|
538
|
-
"""
|
539
|
-
from pyxllib.text.nestenv import NestEnv
|
540
|
-
|
541
|
-
# 0 将所有负margin-left变为0
|
542
|
-
htmlcontent = re.sub(r'margin-left:-\d+(\.\d+)', 'margin-left:0', htmlcontent)
|
543
|
-
|
544
|
-
# 1 区间定位分组
|
545
|
-
ne = NestEnv(htmlcontent)
|
546
|
-
ne2 = ne.xmltag('p')
|
547
|
-
for name in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'pre', 'ol', 'li'):
|
548
|
-
ne2 += ne.xmltag(name, symmetry=True)
|
549
|
-
|
550
|
-
# 以下是针对python document复制到word的情况,不一定具有广泛泛用性
|
551
|
-
# 目的是让代码块按块复制,而不是按行复制
|
552
|
-
ne2 += ne.find2(re.compile("<div style=['\"]mso-element:para-border-div;.+?#AACC99"), '</div>')
|
553
|
-
|
554
|
-
# 2 每个区间的处理规则
|
555
|
-
def func(s):
|
556
|
-
""" 找出p、h后,具体对每个tag要执行的操作
|
557
|
-
|
558
|
-
分前后两波文本s1(原文),s2(翻译文)
|
559
|
-
"""
|
560
|
-
|
561
|
-
# 1 s1 只要加 notranslate
|
562
|
-
s1 = s
|
563
|
-
bs = BeautifulSoup(s1, 'lxml')
|
564
|
-
x = next(bs.body.children)
|
565
|
-
cls_ = x.get('class', None)
|
566
|
-
x['class'] = (cls_ + ['notranslate']) if cls_ else 'notranslate'
|
567
|
-
s1 = x.prettify()
|
568
|
-
|
569
|
-
# 2 s2 可能要做些骚操作
|
570
|
-
s2 = s
|
571
|
-
bs = BeautifulSoup(s2, 'lxml')
|
572
|
-
x = next(bs.body.children)
|
573
|
-
|
574
|
-
# 比如自定义翻译,这个无伤大雅的,如果搞不定,可以先注释掉,后面再说
|
575
|
-
# if re.match(r'h\d+$', x.name):
|
576
|
-
# for y in x.descendants:
|
577
|
-
# if isinstance(y, NavigableString):
|
578
|
-
# y.replace_with(re.sub(r'Conclusion', '总结', str(y)))
|
579
|
-
# else:
|
580
|
-
# for z in y.strings:
|
581
|
-
# z.replace_with(re.sub(r'Conclusion', '总结', str(z)))
|
582
|
-
# y.replace_with(re.sub(r'^Abstract$', '摘要', str(y)))
|
583
|
-
# s2 = str(x)
|
584
|
-
|
585
|
-
if re.match(r'h\d+$', x.name):
|
586
|
-
x.name = 'p' # 去掉标题格式,统一为段落格式
|
587
|
-
s2 = x.prettify()
|
588
|
-
elif x.name in ('div', 'pre'):
|
589
|
-
# 实际使用体验,想了下,代码块还是不如保留原样最方便,不用拷贝翻译
|
590
|
-
# s2 = x.prettify()
|
591
|
-
s2 = '' # 复制方式很有技巧
|
592
|
-
# 如果p没有文本字符串,也不拷贝
|
593
|
-
if not x.get_text().strip():
|
594
|
-
s2 = ''
|
595
|
-
# if x.name == 'p' and x.get('style', None) and 'margin-left' in x['style']:
|
596
|
-
# x['style'] = re.sub(r'(margin-left:)\d+(\.\d+)?', r'\g<1>0', x['style'])
|
597
|
-
|
598
|
-
return s1 + '\n' + s2
|
599
|
-
|
600
|
-
res = ne2.replace(func)
|
601
|
-
|
602
|
-
return res
|
603
|
-
|
604
|
-
|
605
|
-
class MakeHtmlNavigation:
|
606
|
-
""" 给网页添加一个带有超链接跳转的导航栏 """
|
607
|
-
|
608
|
-
@classmethod
|
609
|
-
def from_url(cls, url, **kwargs):
|
610
|
-
""" 自动下载url的内容,缓存到本地后,加上导航栏打开 """
|
611
|
-
content = requests.get(url).content.decode('utf8')
|
612
|
-
etag = get_etag(url) # 直接算url的etag,不用很严谨
|
613
|
-
return cls.from_content(content, etag, **kwargs)
|
614
|
-
|
615
|
-
@classmethod
|
616
|
-
def from_file(cls, file, **kwargs):
|
617
|
-
""" 输入本地一个html文件的路径,加上导航栏打开 """
|
618
|
-
file = File(file)
|
619
|
-
content = file.read()
|
620
|
-
# 输入文件的情况,生成的_content等html要在同目录
|
621
|
-
return cls.from_content(content, os.path.splitext(str(file))[0], **kwargs)
|
622
|
-
|
623
|
-
@classmethod
|
624
|
-
def from_content(cls, html_content, title='temphtml', *,
|
625
|
-
encoding=None, number=True, text_catalogue=True):
|
626
|
-
"""
|
627
|
-
:param html_content: 原始网页的完整内容
|
628
|
-
:param title: 页面标题,默认会先找head/title,如果没有,则取一个随机名称(TODO 未实装,目前固定名称)
|
629
|
-
:param encoding: 保存的几个文件编码,默认是utf8,但windows平台有些特殊场合也可能要存储gbk
|
630
|
-
:param number: 是否对每节启用自动编号的css
|
631
|
-
|
632
|
-
算法基本原理:读取原网页,找出所有h标签,并增设a锚点
|
633
|
-
另外生成一个导航html文件
|
634
|
-
然后再生成一个主文件,让用户通过主文件来浏览页面
|
635
|
-
|
636
|
-
# 读取csdn博客并展示目录 (不过因为这个存在跳级,效果不是那么好)
|
637
|
-
>> file = 自动制作网页标题的导航栏(requests.get(r'https://blog.csdn.net/code4101/article/details/83009000').content.decode('utf8'))
|
638
|
-
>> browser(str(file))
|
639
|
-
http://i2.tiimg.com/582188/64f40d235705de69.png
|
640
|
-
"""
|
641
|
-
from humanfriendly import format_size
|
642
|
-
|
643
|
-
# 1 对原html,设置锚点,生成一个新的文件f2
|
644
|
-
cnt = 0
|
645
|
-
|
646
|
-
# 这个refs是可以用py算法生成的,目前是存储在github上引用
|
647
|
-
refs = ['<html><head>',
|
648
|
-
'<link rel=Stylesheet type="text/css" media=all '
|
649
|
-
f'href="https://code4101.github.io/css/navigation{int(number)}.css">',
|
650
|
-
'</head><body>']
|
651
|
-
|
652
|
-
f2 = File(title + '_content', Dir.TEMP, suffix='.html')
|
653
|
-
|
654
|
-
def func(m):
|
655
|
-
nonlocal cnt
|
656
|
-
cnt += 1
|
657
|
-
name, content = m.group('name'), m.group('inner')
|
658
|
-
content = BeautifulSoup(content, 'lxml').get_text()
|
659
|
-
# 要写<h><a></a></h>,不能写<a><h></h></a>,否则css中设置的计数器重置不会起作用
|
660
|
-
refs.append(f'<{name}><a href="{f2}#navigation{cnt}" target="showframe">{content}</a></{name}>')
|
661
|
-
return f'<a name="navigation{cnt}"/>' + m.group()
|
662
|
-
|
663
|
-
html_content = re.sub(r'<(?P<name>h\d+)(?:>|\s.*?>)(?P<body>\s*(?P<inner>.*?)\s*)</\1>',
|
664
|
-
func, html_content, flags=re.DOTALL)
|
665
|
-
f2 = f2.write(html_content, encoding=encoding, if_exists='replace')
|
666
|
-
|
667
|
-
# 2 f1除了导航栏,可以多附带一些有用的参考信息
|
668
|
-
# 2.1 前文的refs已经存储了超链接的导航
|
669
|
-
|
670
|
-
# 2.2 文本版的目录
|
671
|
-
bs = BeautifulSoup(html_content, 'lxml')
|
672
|
-
text = bs.get_text()
|
673
|
-
if text_catalogue:
|
674
|
-
# 目录
|
675
|
-
refs.append(f'<br/>【文本版的目录】')
|
676
|
-
catalogue = bs.get_catalogue(indent='\t', start_level=-1, jump=True, size=True)
|
677
|
-
refs.append(f'<pre>{catalogue}</pre>')
|
678
|
-
# 全文长度
|
679
|
-
n = strwidth(text)
|
680
|
-
refs.append('<br/>【Total Bytes】' + format_size(n))
|
681
|
-
|
682
|
-
# 2.3 文中使用的高频词
|
683
|
-
# 英文可以直接按空格切开统计,区分大小写
|
684
|
-
text2 = re.sub(grp_chinese_char(), '', text) # 删除中文,先不做中文的功能~~
|
685
|
-
text2 = re.sub(r'[,\.,。\(\)();;??"]', ' ', text2) # 标点符号按空格处理
|
686
|
-
words = Counter(text2.split())
|
687
|
-
msg = '\n'.join([(x[0] if x[1] == 1 else f'{x[0]},{x[1]}') for x in words.most_common()])
|
688
|
-
msg += f'<br/>共{len(words)}个词汇,用词数{sum(words.values())}。'
|
689
|
-
refs.append(f'<br/>【词汇表】<pre>{msg}</pre>')
|
690
|
-
|
691
|
-
# 2.5 收尾,写入f1
|
692
|
-
refs.append('</body>\n</html>')
|
693
|
-
f1 = File(title + '_catalogue', Dir.TEMP, suffix='.html').write('\n'.join(refs), encoding=encoding,
|
694
|
-
if_exists='replace')
|
695
|
-
|
696
|
-
# 3 生成主页 f0
|
697
|
-
main_content = f"""<html>
|
698
|
-
<frameset cols="20%,80%">
|
699
|
-
<frame src="{f1}">
|
700
|
-
<frame src="{f2}" name="showframe">
|
701
|
-
</frameset></html>"""
|
702
|
-
|
703
|
-
f0 = File(title + '_index', Dir.TEMP, suffix='.html').write(main_content, encoding=encoding,
|
704
|
-
if_exists='replace')
|
705
|
-
return f0
|
706
|
-
|
707
|
-
|
708
|
-
class HtmlParser:
|
709
|
-
""" 对树形结构、位置比较固定的html文档的一个解析框架 """
|
710
|
-
|
711
|
-
def __init__(self, root):
|
712
|
-
""" 输入根节点root """
|
713
|
-
self.root = root
|
714
|
-
|
715
|
-
@classmethod
|
716
|
-
@run_once
|
717
|
-
def get_parse_funcs(cls):
|
718
|
-
res = []
|
719
|
-
|
720
|
-
# 获取所有的方法名
|
721
|
-
members = dir(cls)
|
722
|
-
methods = filter(lambda m: callable(getattr(cls, m)), members)
|
723
|
-
|
724
|
-
# 以parse、parse_0、parse_0_2等格式命名的函数,是解析树结构特定位置,这里自动执行解析
|
725
|
-
for method in methods:
|
726
|
-
if re.match(r'parse(_\d+)*$', method):
|
727
|
-
# 智能获取对应下标的结构变量
|
728
|
-
res.append(method)
|
729
|
-
|
730
|
-
return res
|
731
|
-
|
732
|
-
def run(self):
|
733
|
-
for method in self.get_parse_funcs():
|
734
|
-
# 智能获取对应下标的结构变量
|
735
|
-
idxs = [int(v) for v in method[5:].split('_') if v]
|
736
|
-
x = self.root.get_nonempty_children(*idxs)
|
737
|
-
# 自动执行函数
|
738
|
-
getattr(self, method)(x)
|
739
|
-
|
740
|
-
|
741
|
-
def concat_htmlbody(ls):
|
742
|
-
""" 对多份网页内容中的body进行拼接
|
743
|
-
"""
|
744
|
-
texts = [re.search(r'<body>(.*?)</body>', x, flags=re.DOTALL).group(1) for x in ls]
|
745
|
-
# 用第一份作为主模板
|
746
|
-
text = re.sub(r'<body>(.*?)</body>', lambda m: '<body>' + '\n'.join(texts) + '</body>', ls[0], flags=re.DOTALL)
|
747
|
-
return text
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
# @Author : 陈坤泽
|
4
|
+
# @Email : 877362867@qq.com
|
5
|
+
# @Date : 2020/06/02 20:16
|
6
|
+
|
7
|
+
"""
|
8
|
+
xml等网页结构方面的处理
|
9
|
+
"""
|
10
|
+
|
11
|
+
# from pyxllib.prog.pupil import check_install_package
|
12
|
+
|
13
|
+
# 一个xpath解析库
|
14
|
+
# check_install_package('xpath_parser', 'xpath-parser')
|
15
|
+
|
16
|
+
import collections
|
17
|
+
from collections import Counter, defaultdict
|
18
|
+
import re
|
19
|
+
import os
|
20
|
+
|
21
|
+
import requests
|
22
|
+
import pandas as pd
|
23
|
+
import bs4
|
24
|
+
from bs4 import BeautifulSoup
|
25
|
+
from humanfriendly import format_size
|
26
|
+
# from xpath_parser import XpathExpression
|
27
|
+
|
28
|
+
from pyxllib.prog.newbie import round_int
|
29
|
+
from pyxllib.prog.pupil import dprint, run_once, inject_members
|
30
|
+
from pyxllib.prog.specialist import browser
|
31
|
+
from pyxllib.algo.pupil import SearchBase
|
32
|
+
from pyxllib.text.newbie import xldictstr
|
33
|
+
from pyxllib.text.pupil import shorten, ensure_gbk, BookContents, strwidth, grp_chinese_char
|
34
|
+
from pyxllib.file.specialist import File, Dir, get_etag
|
35
|
+
from pyxllib.text.jinjalib import get_jinja_template
|
36
|
+
|
37
|
+
|
38
|
+
class XlBs4Tag(bs4.element.Tag):
|
39
|
+
|
40
|
+
@property
|
41
|
+
def tag_name(self):
|
42
|
+
"""输入一个bs4的Tag或NavigableString,
|
43
|
+
返回tag.name或者'NavigableString'
|
44
|
+
"""
|
45
|
+
if self.name:
|
46
|
+
return self.name
|
47
|
+
elif isinstance(self, bs4.element.NavigableString):
|
48
|
+
return 'NavigableString'
|
49
|
+
else:
|
50
|
+
dprint(self) # 获取结点t名称失败
|
51
|
+
return None
|
52
|
+
|
53
|
+
    def subtag_names(self):
        """ List the names of all direct children; a number in braces is a
        consecutive-repeat count.

        e.g. for a body node: p{137},tbl,p{94},tbl,p{1640},sectPr
        """

        def counter(m):
            # collapse a run of identical "name," groups into "name{n},"
            s1 = m.group(1)
            n = (m.end(0) - m.start(0)) // len(s1)
            s = s1[:-1] + '{' + str(n) + '}'
            # the matched run always ends with ',' (every unit carries one);
            # keep it so the joined string stays well-formed
            if m.string[m.end(0) - 1] == ',':
                s += ','
            return s

        if self.name and self.contents:
            # trailing ',' acts as a sentinel so the repeat-regex can match the last unit
            s = ','.join([x.tag_name for x in self.contents]) + ','
            s = re.sub(r'([^,]+,)(\1)+', counter, s)
        else:
            s = ''
        # drop the sentinel comma
        if s and s[-1] == ',':
            s = s[:-1]
        return s
|
74
|
+
|
75
|
+
def treestruct_raw(self, **kwargs):
|
76
|
+
""" 查看树形结构的raw版本
|
77
|
+
各参数含义详见dfs_base
|
78
|
+
"""
|
79
|
+
# 1 先用dfs获得基本结果
|
80
|
+
sb = SearchBase(self)
|
81
|
+
s = sb.fmt_nodes(**kwargs)
|
82
|
+
return s
|
83
|
+
|
84
|
+
    def treestruct_brief(self, linenum=True, prefix='- ', **kwargs):
        """ Render a concise view of the tree structure.

        :param linenum: show line numbers
        :param prefix: indentation marker repeated once per depth level
        """

        class Search(SearchBase):
            def fmt_node(self, node, depth, *, prefix=prefix, show_node_type=False):
                # one short line per node, by node type
                if isinstance(node, bs4.element.ProcessingInstruction):
                    s = 'ProcessingInstruction,' + str(node)
                elif isinstance(node, bs4.element.Tag):
                    s = node.name + ',' + xldictstr(node.attrs, item_delimit=',')
                elif isinstance(node, bs4.element.NavigableString):
                    s = shorten(str(node), 200)
                    if not s.strip():
                        s = '<??>'  # whitespace-only string
                else:
                    s = '遇到特殊类型,' + str(node)
                return (prefix * depth) + s

        search = Search(self)
        res = search.fmt_nodes(linenum=linenum, **kwargs)
        return res
|
105
|
+
|
106
|
+
    def treestruct_stat(self):
        """ Produce two statistics tables describing the whole subtree.

        Usage: ``df1, df2 = t.treestruct_stat()``
            df1: per-node table, one row per element in document order
            df2: per-attribute table, one row per tag attribute
        Largely supersedes count_tagname and check_tag.
        """

        def text(t):
            """ Results usually end up in excel, so characters that cannot be
            encoded as gbk are dropped and each cell is capped at 100 chars.
            """
            s = ensure_gbk(t)
            s = s[:100]
            return s

        def depth(t):
            """ Depth of node t = number of ancestors. """
            return len(tuple(t.parents))

        t = self.contents[0]
        # ls1 = [['element序号', '层级', '结构', '父结点', '当前结点', '属性值/字符串值', '直接子结点结构']]
        # ls2 = [['序号', 'element序号', '当前结点', '属性名', '属性值']] #
        ls1 = []  # focuses on node-level patterns
        ls2 = []  # focuses on attribute-level patterns
        i = 1
        while t:
            # 1 node table row
            d = depth(t)
            line = [i, d, '_' * d + str(d), t.parent.tag_name, t.tag_name,
                    text(xldictstr(t.attrs) if t.name else t),  # tags store attrs, strings store their value
                    t.subtag_names()]
            ls1.append(line)
            # 2 attribute table rows
            if t.name:
                k = len(ls2)
                for attr, value in t.attrs.items():
                    ls2.append([k, i, t.tag_name, attr, value])
                    k += 1
            # advance to next node in bs4 document order
            t = t.next_element
            i += 1
        df1 = pd.DataFrame.from_records(ls1,
                                        columns=['element序号', '层级', '结构', '父结点', '当前结点', '属性值/字符串值',
                                                 '直接子结点结构'])
        df2 = pd.DataFrame.from_records(ls2, columns=['序号', 'element序号', '当前结点', '属性名', '属性值'])
        return df1, df2
|
153
|
+
|
154
|
+
def count_tagname(self):
|
155
|
+
"""统计每个标签出现的次数:
|
156
|
+
1 w:rpr 650
|
157
|
+
2 w:rfonts 650
|
158
|
+
3 w:szcs 618
|
159
|
+
4 w:r 565
|
160
|
+
5 None 532
|
161
|
+
6 w:t 531
|
162
|
+
"""
|
163
|
+
ct = collections.Counter()
|
164
|
+
|
165
|
+
def inner(node):
|
166
|
+
try:
|
167
|
+
ct[node.name] += 1
|
168
|
+
for t in node.children:
|
169
|
+
inner(t)
|
170
|
+
except AttributeError:
|
171
|
+
pass
|
172
|
+
|
173
|
+
inner(self)
|
174
|
+
return ct.most_common()
|
175
|
+
|
176
|
+
    def check_tag(self, tagname=None):
        """ Count how often each tag occurs at each depth level.

        :param tagname:
            None: tally every tag in the document by depth
            't' etc.: only inspect that tag; each match is treated as depth 0

        TODO check whether a tag contains a same-named descendant?
        """
        # NOTE: defaultdict() without a factory behaves like a plain dict here;
        # missing keys are created explicitly in add() below.
        d = defaultdict()

        def add(name, depth):
            if name not in d:
                d[name] = defaultdict(int)
            d[name][depth] += 1

        def inner(node, depth):
            # recursive preorder walk, dispatching on node type
            if isinstance(node, bs4.element.ProcessingInstruction):
                add('ProcessingInstruction', depth)
            elif isinstance(node, bs4.element.Tag):
                if node.name == tagname and depth:
                    dprint(node, depth)  # tagname contains a same-named subtag
                add(node.name, depth)
                for t in node.children:
                    inner(t, depth + 1)
            elif isinstance(node, bs4.element.NavigableString):
                add('NavigableString', depth)
            else:
                add('其他特殊结点', depth)

        # 1 count nodes at every level
        if tagname:
            for t in self.find_all(tagname):
                inner(t, 0)
        else:
            inner(self, 0)

        # 2 grand totals? (not implemented)

        return d
|
216
|
+
|
217
|
+
def check_namespace(self):
|
218
|
+
"""检查名称空间问题,会同时检查标签名和属性名:
|
219
|
+
1 cNvPr pic:cNvPr(579),wps:cNvPr(52),wpg:cNvPr(15)
|
220
|
+
2 spPr pic:spPr(579),wps:spPr(52)
|
221
|
+
"""
|
222
|
+
# 1 获得所有名称
|
223
|
+
# 因为是采用node的原始xml文本,所以能保证会取得带有名称空间的文本内容
|
224
|
+
ct0 = Counter(re.findall(r'<([a-zA-Z:]+)', str(self)))
|
225
|
+
ct = defaultdict(str)
|
226
|
+
s = set()
|
227
|
+
for key, value in ct0.items():
|
228
|
+
k = re.sub(r'.*:', '', key)
|
229
|
+
if k in ct:
|
230
|
+
s.add(k)
|
231
|
+
ct[k] += f',{key}({value})'
|
232
|
+
else:
|
233
|
+
ct[k] = f'{key}({value})'
|
234
|
+
|
235
|
+
# 2 对有重复和无重复的元素划分存储
|
236
|
+
ls1 = [] # 有重复的存储到ls1
|
237
|
+
ls2 = [] # 没有重复的正常结果存储到ls2,可以不显示
|
238
|
+
for k, v in ct.items():
|
239
|
+
if k in s:
|
240
|
+
ls1.append([k, v])
|
241
|
+
else:
|
242
|
+
ls2.append([k, v])
|
243
|
+
|
244
|
+
# 3 显示有重复的情况
|
245
|
+
# browser(ls1, filename='检查名称空间问题')
|
246
|
+
return ls1
|
247
|
+
|
248
|
+
    def get_catalogue(self, *args, size=False, start_level=-1, **kwargs):
        """ Build a text catalogue from all h-level headings.

        :param bool|int size: whether to show each section's text size; a numeric
            value also acts as a scale factor (e.g. 0.5 for bilingual pages
            whose content is duplicated)

        *args, **kwargs are forwarded to BookContents.format_str

        Note: this deliberately differs from css-driven numbering; doing it in
        code allows finer-grained control.
        """
        bc = BookContents()
        for h in self.find_all(re.compile(r'h\d')):
            if size:
                part_size = h.section_text_size(size, fmt=True)
                bc.add(int(h.name[1]), h.get_text().replace('\n', ' ').strip(), part_size)
            else:
                bc.add(int(h.name[1]), h.get_text().replace('\n', ' ').strip())

        if 'page' not in kwargs:
            kwargs['page'] = size

        if bc.contents:
            return bc.format_str(*args, start_level=start_level, **kwargs)
        else:
            return ''
|
272
|
+
|
273
|
+
    def section_text_size(self, factor=1, fmt=False):
        """ Compute the length of the body text under a heading node.

        :param factor: scale factor applied to the raw width
        :param fmt: if True, return a human-friendly size string
        :raises TypeError: if this node is not an h1..h9 heading
        """
        if not re.match(r'h\d+$', self.name):
            raise TypeError

        # walk following siblings until the next heading of the same level;
        # this is about the simplest way to measure each section's length
        part_size = 0
        for x in self.next_siblings:
            if x.name == self.name:
                break
            else:
                text = str(x) if isinstance(x, bs4.element.NavigableString) else x.get_text()
                part_size += strwidth(text)
        part_size = round_int(part_size * factor)

        if fmt:
            return format_size(part_size).replace(' ', '').replace('bytes', 'B')
        else:
            return part_size
|
292
|
+
|
293
|
+
def head_add_size(self, factor=1):
|
294
|
+
""" 标题增加每节内容大小标记
|
295
|
+
|
296
|
+
:param factor: 乘因子,默认是1。但双语阅读等情况,内容会多拷贝一份,此时可以乘以0.5,显示正常原文的大小。
|
297
|
+
"""
|
298
|
+
for h in self.find_all(re.compile(r'h\d')):
|
299
|
+
part_size = h.section_text_size(factor, fmt=True)
|
300
|
+
navi_str = list(h.strings)[-1].rstrip()
|
301
|
+
navi_str.replace_with(str(navi_str) + ',' + part_size)
|
302
|
+
|
303
|
+
    def head_add_number(self, start_level=-1, jump=True):
        """ Prefix every heading with its section number.
        """
        bc = BookContents()
        heads = list(self.find_all(re.compile(r'h\d')))
        for h in heads:
            bc.add(int(h.name[1]), h.get_text().replace('\n', ' '))

        if not bc.contents:
            return

        nums = bc.format_numbers(start_level=start_level, jump=jump)
        for i, h in enumerate(heads):
            navi_strs = list(h.strings)
            if navi_strs:
                navi_str = navi_strs[0]
                if nums[i]:
                    # prepend the number to the heading's first text node
                    navi_str.replace_with(nums[i] + ' ' + str(navi_str))
            else:
                # heading had no text at all: the number becomes its content
                h.string = nums[i]
|
323
|
+
|
324
|
+
    def xltext(self):
        """ Custom text serialization.

        Some spaces get lost otherwise; this was meant to bring them back.

        210924 Fri 20:23: later experiments cast doubt on this; the method is
        kept but its use is discouraged.
        """
        # return self.prettify(formatter=lambda s: s.replace(u'\xa0', ' '))
        # \xa0 looked like a special char that could simply be dropped... actually it is not one
        # return self.prettify(formatter=lambda s: s.replace(u'\xa0', ''))
        # return self.prettify()
        return str(self)
|
336
|
+
|
337
|
+
    def browser(self):
        """ Open this node's html in the system browser (module-level helper). """
        browser.html(self)
|
339
|
+
|
340
|
+
    @run_once('id,str')
    def get_nonempty_childrens(self, *args):
        """ Return all direct children of type Tag (somewhat project-specific).

        NavigableString children are also checked: they must be pure whitespace
        (spaces, newlines, ...), otherwise the assertion fires.

        :param args: optional index path; with args, recurse into ls[args[0]]
            (result is cached per (id, str) by run_once)
        """

        def check(x):
            if isinstance(x, bs4.element.Tag):
                return True
            elif isinstance(x, bs4.element.Comment):
                return False
            elif isinstance(x, bs4.element.NavigableString):
                assert not x.strip(), f'非空字符串值:{x}'
                return False
            else:
                raise ValueError(f'未见类型 {x}')

        ls = list(filter(check, self.children))

        if len(args):
            return ls[args[0]].get_nonempty_childrens(*args[1:])
        else:
            return ls
|
364
|
+
|
365
|
+
def get_nonempty_children(self, *args):
|
366
|
+
""" 输入args下标,指定获得某一个非空子结点 """
|
367
|
+
if len(args):
|
368
|
+
ls = self.get_nonempty_childrens(*args[:-1])
|
369
|
+
return ls[args[-1]]
|
370
|
+
else:
|
371
|
+
return self
|
372
|
+
|
373
|
+
    def next_preorder_node(self, iter_child=True):
        """ Hand-written preorder traversal step.

        Useful when walking xml/bs4 trees where some nodes should not be
        descended into: when the current node (and its children) is already
        fully handled, pass iter_child=False to skip the children and jump to
        the next sibling (or an ancestor's sibling).
        """
        # the receiver is not necessarily a Tag node
        if not isinstance(self, bs4.element.Tag):
            return None

        if iter_child and self.contents:
            # descend into the first child
            return self.contents[0]
        else:
            # climb up until some ancestor (or self) has a next sibling
            cur_node = self
            while True:
                parent = cur_node.parent
                if parent is None:
                    return None
                sibing = cur_node.find_next_sibling()
                if sibing:
                    return sibing
                cur_node = parent
|
397
|
+
|
398
|
+
    def find_by_xpath(self, xpath):
        """ Locate an element by xpath.

        bs4 has no built-in xpath support and the third-party options did not
        fit, so this is a small home-grown subset covering the common cases,
        built on the xpath_parser library.
        """
        from xpath_parser import XpathExpression

        xp = XpathExpression(xpath)

        cur_tag = self
        for node in xp.nodes:
            if node.name == '*':
                name = None  # match any tag
            else:
                name = node.name

            # TODO other prefixes: .. parent node, / root node
            recursive = node.ignore_position

            attrs = {}
            limit = 1
            for a in node.attrs:
                if a[0] == '@':
                    # attribute predicate @k='v'
                    k, v = a.split('=')
                    attrs[k[1:]] = v[1:-1]
                elif re.match(r'\d+$', a):  # positional index
                    limit = int(a)
                else:
                    raise NotImplementedError

            # node.type is unused; it probably has its purposes

            sub_tags = cur_tag.find_all(name, attrs, recursive, limit=limit)
            if sub_tags:
                cur_tag = sub_tags[-1]
            else:  # not found
                return None

        return cur_tag
|
438
|
+
|
439
|
+
    def __修改功能(self):
        # section marker only ("modification features"): the methods below mutate the tree
        pass
|
441
|
+
|
442
|
+
@classmethod
|
443
|
+
def _to_node(cls, html):
|
444
|
+
""" 输入可以是字符串、文档、结点 """
|
445
|
+
if isinstance(html, str):
|
446
|
+
new_node = next(BeautifulSoup(html, 'lxml').body.children)
|
447
|
+
elif html.find('body'):
|
448
|
+
new_node = next(html.body.children)
|
449
|
+
else:
|
450
|
+
new_node = html
|
451
|
+
return new_node
|
452
|
+
|
453
|
+
@classmethod
|
454
|
+
def _to_nodes(cls, html):
|
455
|
+
""" 输入可以是字符串、文档、结点 """
|
456
|
+
if isinstance(html, str):
|
457
|
+
new_nodes = list(BeautifulSoup(html, 'lxml').body.children)
|
458
|
+
elif html.find('body'):
|
459
|
+
new_nodes = list(html.body.children)
|
460
|
+
else:
|
461
|
+
new_nodes = [html]
|
462
|
+
return new_nodes
|
463
|
+
|
464
|
+
def replace_html_with(self, html):
|
465
|
+
nodes = self._to_nodes(html) # 支持替换成多个节点
|
466
|
+
if not nodes:
|
467
|
+
return
|
468
|
+
self.replace_with(nodes[0])
|
469
|
+
|
470
|
+
cur = nodes[0]
|
471
|
+
for node in nodes[1:]:
|
472
|
+
cur.insert_after(node)
|
473
|
+
cur = node
|
474
|
+
|
475
|
+
def insert_html_before(self, html):
|
476
|
+
nodes = self._to_nodes(html)
|
477
|
+
if not nodes:
|
478
|
+
return
|
479
|
+
self.insert_before(nodes[0])
|
480
|
+
|
481
|
+
cur = nodes[0]
|
482
|
+
for node in nodes[1:]:
|
483
|
+
cur.insert_after(node)
|
484
|
+
cur = node
|
485
|
+
|
486
|
+
def insert_html_after(self, html):
|
487
|
+
nodes = self._to_nodes(html)
|
488
|
+
if not nodes:
|
489
|
+
return
|
490
|
+
|
491
|
+
cur = self
|
492
|
+
for node in nodes:
|
493
|
+
cur.insert_after(node)
|
494
|
+
cur = node
|
495
|
+
|
496
|
+
def append_html(self, html):
|
497
|
+
""" 原append的扩展 """
|
498
|
+
nodes = self._to_nodes(html)
|
499
|
+
for node in nodes:
|
500
|
+
self.append(node)
|
501
|
+
|
502
|
+
|
503
|
+
inject_members(XlBs4Tag, bs4.element.Tag)
|
504
|
+
# 这样虽然不优雅,但主要是让特殊的String类型也支持兼容tag_name属性
|
505
|
+
inject_members(XlBs4Tag, bs4.element.NavigableString)
|
506
|
+
|
507
|
+
|
508
|
+
def mathjax_html_head(s):
    """ Wrap an html body fragment with a head that loads the MathJax parsing scripts. """
    head = r"""<!DOCTYPE html>
<html>
<head>
<head><meta http-equiv=Content-Type content="text/html;charset=utf-8"></head>
<script src="https://a.cdn.histudy.com/lib/config/mathjax_config-klxx.js?v=1.1"></script>
<script type="text/javascript" async src="https://a.cdn.histudy.com/lib/mathjax/2.7.1/MathJax/MathJax.js?config=TeX-AMS-MML_SVG">
MathJax.Hub.Config(MATHJAX_KLXX_CONFIG);
</script>
</head>
<body>"""
    tail = '</body></html>'
    return f'{head}{s}{tail}'
|
522
|
+
|
523
|
+
|
524
|
+
def html_bitran_template(htmlcontent):
    """ Html bilingual translation template.

    Typically converts an html exported from word into a format convenient for
    Google-translating into a side-by-side bilingual page.

    Principle: chrome skips elements marked class="notranslate", so every body
    tag (p, h, ...) is duplicated — one copy tagged notranslate (kept as the
    original) and one copy left for translation. After running Google
    translate the page shows both languages.

    A translation API producing the bilingual html directly would be better,
    but googletrans is unreachable without external network access, and other
    vendors' APIs were deferred as a lower priority.
    ---
    Convention here: English on top, Chinese below.
    """
    from pyxllib.text.nestenv import NestEnv

    # 0 clamp all negative margin-left values to 0
    htmlcontent = re.sub(r'margin-left:-\d+(\.\d+)', 'margin-left:0', htmlcontent)

    # 1 locate and group the intervals to process
    ne = NestEnv(htmlcontent)
    ne2 = ne.xmltag('p')
    for name in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'pre', 'ol', 'li'):
        ne2 += ne.xmltag(name, symmetry=True)

    # The following targets python documentation pasted into word; it may not
    # generalize. The goal is copying code blocks as a whole, not line by line.
    ne2 += ne.find2(re.compile("<div style=['\"]mso-element:para-border-div;.+?#AACC99"), '</div>')

    # 2 transformation applied to each located interval
    def func(s):
        """ For every matched p/h tag build two texts:
        s1 (original, marked notranslate) and s2 (copy to be translated).
        """

        # 1 s1 only needs the notranslate class added
        s1 = s
        bs = BeautifulSoup(s1, 'lxml')
        x = next(bs.body.children)
        cls_ = x.get('class', None)
        x['class'] = (cls_ + ['notranslate']) if cls_ else 'notranslate'
        s1 = x.prettify()

        # 2 s2 may need some extra tweaking
        s2 = s
        bs = BeautifulSoup(s2, 'lxml')
        x = next(bs.body.children)

        # e.g. custom glossary translation; harmless extra, kept commented out
        # if re.match(r'h\d+$', x.name):
        #     for y in x.descendants:
        #         if isinstance(y, NavigableString):
        #             y.replace_with(re.sub(r'Conclusion', '总结', str(y)))
        #         else:
        #             for z in y.strings:
        #                 z.replace_with(re.sub(r'Conclusion', '总结', str(z)))
        #     y.replace_with(re.sub(r'^Abstract$', '摘要', str(y)))
        #     s2 = str(x)

        if re.match(r'h\d+$', x.name):
            x.name = 'p'  # drop heading style, unify as paragraph
            s2 = x.prettify()
        elif x.name in ('div', 'pre'):
            # in practice code blocks are best kept as-is, with no translated copy
            # s2 = x.prettify()
            s2 = ''  # the copy trick
        # also skip the copy when the p has no text content
        if not x.get_text().strip():
            s2 = ''
        # if x.name == 'p' and x.get('style', None) and 'margin-left' in x['style']:
        #     x['style'] = re.sub(r'(margin-left:)\d+(\.\d+)?', r'\g<1>0', x['style'])

        return s1 + '\n' + s2

    res = ne2.replace(func)

    return res
|
603
|
+
|
604
|
+
|
605
|
+
class MakeHtmlNavigation:
|
606
|
+
""" 给网页添加一个带有超链接跳转的导航栏 """
|
607
|
+
|
608
|
+
@classmethod
|
609
|
+
def from_url(cls, url, **kwargs):
|
610
|
+
""" 自动下载url的内容,缓存到本地后,加上导航栏打开 """
|
611
|
+
content = requests.get(url).content.decode('utf8')
|
612
|
+
etag = get_etag(url) # 直接算url的etag,不用很严谨
|
613
|
+
return cls.from_content(content, etag, **kwargs)
|
614
|
+
|
615
|
+
@classmethod
|
616
|
+
def from_file(cls, file, **kwargs):
|
617
|
+
""" 输入本地一个html文件的路径,加上导航栏打开 """
|
618
|
+
file = File(file)
|
619
|
+
content = file.read()
|
620
|
+
# 输入文件的情况,生成的_content等html要在同目录
|
621
|
+
return cls.from_content(content, os.path.splitext(str(file))[0], **kwargs)
|
622
|
+
|
623
|
+
    @classmethod
    def from_content(cls, html_content, title='temphtml', *,
                     encoding=None, number=True, text_catalogue=True):
        """
        :param html_content: full content of the original page
        :param title: page title; intended to default to head/title with a random
            fallback (TODO not implemented yet — fixed name for now)
        :param encoding: encoding for the generated files, utf8 by default;
            some windows scenarios may need gbk
        :param number: enable the css that auto-numbers each section

        Algorithm: read the page, find all h tags and add <a> anchors to them;
        generate a separate navigation html file; then generate a main frameset
        file through which the user browses both.

        # read a csdn blog and show its catalogue (heading levels jump, so the
        # result is not ideal there)
        >> file = 自动制作网页标题的导航栏(requests.get(r'https://blog.csdn.net/code4101/article/details/83009000').content.decode('utf8'))
        >> browser(str(file))
        http://i2.tiimg.com/582188/64f40d235705de69.png
        """
        from humanfriendly import format_size

        # 1 add anchors to the original html and save it as a new file f2
        cnt = 0

        # refs could be generated programmatically; for now the css is hosted on github
        refs = ['<html><head>',
                '<link rel=Stylesheet type="text/css" media=all '
                f'href="https://code4101.github.io/css/navigation{int(number)}.css">',
                '</head><body>']

        f2 = File(title + '_content', Dir.TEMP, suffix='.html')

        def func(m):
            nonlocal cnt
            cnt += 1
            name, content = m.group('name'), m.group('inner')
            content = BeautifulSoup(content, 'lxml').get_text()
            # must be <h><a></a></h>, not <a><h></h></a>, or the css counter reset stops working
            refs.append(f'<{name}><a href="{f2}#navigation{cnt}" target="showframe">{content}</a></{name}>')
            return f'<a name="navigation{cnt}"/>' + m.group()

        html_content = re.sub(r'<(?P<name>h\d+)(?:>|\s.*?>)(?P<body>\s*(?P<inner>.*?)\s*)</\1>',
                              func, html_content, flags=re.DOTALL)
        f2 = f2.write(html_content, encoding=encoding, if_exists='replace')

        # 2 besides the navigation, f1 can carry some extra reference info
        # 2.1 refs already holds the hyperlink navigation entries

        # 2.2 text version of the catalogue
        bs = BeautifulSoup(html_content, 'lxml')
        text = bs.get_text()
        if text_catalogue:
            # catalogue
            refs.append(f'<br/>【文本版的目录】')
            catalogue = bs.get_catalogue(indent='\t', start_level=-1, jump=True, size=True)
            refs.append(f'<pre>{catalogue}</pre>')
            # total document length
            n = strwidth(text)
            refs.append('<br/>【Total Bytes】' + format_size(n))

        # 2.3 high-frequency words used in the text
        # English splits on whitespace, case-sensitive
        text2 = re.sub(grp_chinese_char(), '', text)  # drop Chinese; no Chinese support yet
        text2 = re.sub(r'[,\.,。\(\)();;??"]', ' ', text2)  # punctuation acts as whitespace
        words = Counter(text2.split())
        msg = '\n'.join([(x[0] if x[1] == 1 else f'{x[0]},{x[1]}') for x in words.most_common()])
        msg += f'<br/>共{len(words)}个词汇,用词数{sum(words.values())}。'
        refs.append(f'<br/>【词汇表】<pre>{msg}</pre>')

        # 2.5 finish up and write f1
        refs.append('</body>\n</html>')
        f1 = File(title + '_catalogue', Dir.TEMP, suffix='.html').write('\n'.join(refs), encoding=encoding,
                                                                       if_exists='replace')

        # 3 generate the main page f0
        main_content = f"""<html>
<frameset cols="20%,80%">
<frame src="{f1}">
<frame src="{f2}" name="showframe">
</frameset></html>"""

        f0 = File(title + '_index', Dir.TEMP, suffix='.html').write(main_content, encoding=encoding,
                                                                    if_exists='replace')
        return f0
|
706
|
+
|
707
|
+
|
708
|
+
class HtmlParser:
|
709
|
+
""" 对树形结构、位置比较固定的html文档的一个解析框架 """
|
710
|
+
|
711
|
+
    def __init__(self, root):
        """ :param root: root node of the document to parse """
        self.root = root
|
714
|
+
|
715
|
+
@classmethod
|
716
|
+
@run_once
|
717
|
+
def get_parse_funcs(cls):
|
718
|
+
res = []
|
719
|
+
|
720
|
+
# 获取所有的方法名
|
721
|
+
members = dir(cls)
|
722
|
+
methods = filter(lambda m: callable(getattr(cls, m)), members)
|
723
|
+
|
724
|
+
# 以parse、parse_0、parse_0_2等格式命名的函数,是解析树结构特定位置,这里自动执行解析
|
725
|
+
for method in methods:
|
726
|
+
if re.match(r'parse(_\d+)*$', method):
|
727
|
+
# 智能获取对应下标的结构变量
|
728
|
+
res.append(method)
|
729
|
+
|
730
|
+
return res
|
731
|
+
|
732
|
+
def run(self):
|
733
|
+
for method in self.get_parse_funcs():
|
734
|
+
# 智能获取对应下标的结构变量
|
735
|
+
idxs = [int(v) for v in method[5:].split('_') if v]
|
736
|
+
x = self.root.get_nonempty_children(*idxs)
|
737
|
+
# 自动执行函数
|
738
|
+
getattr(self, method)(x)
|
739
|
+
|
740
|
+
|
741
|
+
def concat_htmlbody(ls):
    """ Concatenate the body contents of several html documents.

    :param ls: list of full html texts; the first one serves as the template
    :return: the first document with its body replaced by all the bodies,
        joined with newlines
    """
    # generalized: tolerate attributes on the body tag, e.g. <body class="...">
    pat = re.compile(r'<body[^>]*>(.*?)</body>', flags=re.DOTALL)
    texts = [pat.search(x).group(1) for x in ls]
    # use the first document as the master template; only its first body is replaced
    text = pat.sub(lambda m: '<body>' + '\n'.join(texts) + '</body>', ls[0], count=1)
    return text
|