re-common 10.0.36__py3-none-any.whl → 10.0.38__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- re_common/v2/baselibrary/utils/stringutils.py +17 -5
- {re_common-10.0.36.dist-info → re_common-10.0.38.dist-info}/METADATA +1 -1
- {re_common-10.0.36.dist-info → re_common-10.0.38.dist-info}/RECORD +6 -6
- {re_common-10.0.36.dist-info → re_common-10.0.38.dist-info}/LICENSE +0 -0
- {re_common-10.0.36.dist-info → re_common-10.0.38.dist-info}/WHEEL +0 -0
- {re_common-10.0.36.dist-info → re_common-10.0.38.dist-info}/top_level.txt +0 -0
|
@@ -179,12 +179,24 @@ class HTMLTextExtractor(HTMLParser):
|
|
|
179
179
|
# parser.close()
|
|
180
180
|
# return parser.get_text()
|
|
181
181
|
|
|
182
|
+
# def clean_html(html):
|
|
183
|
+
# """使用 Parsel 提取 HTML 中的纯文本"""
|
|
184
|
+
# sel = Selector(text=html, type='html')
|
|
185
|
+
# # 提取所有文本(包括子元素的文本)
|
|
186
|
+
# text = sel.xpath("string()").getall()
|
|
187
|
+
# return "".join(text).strip()
|
|
188
|
+
|
|
189
|
+
|
|
182
190
|
def clean_html(html):
    """Return the plain text of ``html`` with any markup stripped.

    Input that contains no ``<`` character is returned unchanged without
    importing or invoking an HTML parser. Otherwise the markup is parsed with
    BeautifulSoup's ``lxml`` backend; if that attempt raises, it is retried
    with the ``html5lib`` backend.

    Args:
        html: The text to clean. Falsy values (``None``, ``""``) are returned
            as-is.

    Returns:
        The result of ``BeautifulSoup.get_text()`` when ``html`` contains
        ``<``; otherwise ``html`` itself.

    Raises:
        Exception: Whatever the ``html5lib`` fallback raises if it fails too.
    """
    # Nothing to strip: skip the bs4 import and the parse entirely.
    if not html or "<" not in html:
        return html

    # Imported inside the function so the no-markup path never touches bs4.
    from bs4 import BeautifulSoup

    try:
        return BeautifulSoup(html, "lxml").get_text()
    except Exception:
        # Catch Exception rather than using a bare ``except`` so that
        # KeyboardInterrupt / SystemExit are not swallowed by the fallback.
        return BeautifulSoup(html, "html5lib").get_text()
|
|
188
200
|
|
|
189
201
|
|
|
190
202
|
def remove_spaces_between_chinese_characters(text):
|
|
@@ -214,7 +214,7 @@ re_common/v2/baselibrary/utils/n_ary_expression_tree.py,sha256=-05kO6G2Rth7CEK-5
|
|
|
214
214
|
re_common/v2/baselibrary/utils/string_bool.py,sha256=vxnjSFOfuHWGxkqaIbUNn21opx5tfV1uCXSahFfp1mU,6197
|
|
215
215
|
re_common/v2/baselibrary/utils/string_clear.py,sha256=Ympa0Cs2y_72QeeyMS8de8y_QgtEFJJQ0AgHnylbMUc,7861
|
|
216
216
|
re_common/v2/baselibrary/utils/string_smi.py,sha256=cU0WAWHRGnGoVQx3eCEKeM_q_olFNzRTJe7rSe586SY,741
|
|
217
|
-
re_common/v2/baselibrary/utils/stringutils.py,sha256=
|
|
217
|
+
re_common/v2/baselibrary/utils/stringutils.py,sha256=uUjcyMEmYIzPUxQ_enpRj_qrvQnYtoL2gEug7WIrIbU,8463
|
|
218
218
|
re_common/vip/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
219
219
|
re_common/vip/base_step_process.py,sha256=VXXiNj0I5CpzXIMCgOPU86bzDJkSBkUS-9CpZIl_GOk,205
|
|
220
220
|
re_common/vip/baseencodeid.py,sha256=nERoe89ueFM52bG7xwJdflcZHk6T2RQQKbc5uUZc3RM,3272
|
|
@@ -241,8 +241,8 @@ re_common/vip/title/transform/TransformRegulationTitleToZt.py,sha256=LKRdIsWKues
|
|
|
241
241
|
re_common/vip/title/transform/TransformStandardTitleToZt.py,sha256=-fCKAbSBzXVyQDCE61CalvR9E_QzQMA08QOO_NePFNI,5563
|
|
242
242
|
re_common/vip/title/transform/TransformThesisTitleToZt.py,sha256=QS-uV0cQrpUFAcKucuJQ9Ue2VRQH-inmfn_X3IplfRo,5488
|
|
243
243
|
re_common/vip/title/transform/__init__.py,sha256=m83-CWyRq_VHPYHaALEQlmXrkTdrZ3e4B_kCfBYE-uc,239
|
|
244
|
-
re_common-10.0.
|
|
245
|
-
re_common-10.0.
|
|
246
|
-
re_common-10.0.
|
|
247
|
-
re_common-10.0.
|
|
248
|
-
re_common-10.0.
|
|
244
|
+
re_common-10.0.38.dist-info/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
|
|
245
|
+
re_common-10.0.38.dist-info/METADATA,sha256=abixIXN2b8yXkvit_fXpJkwJtQGlJXwWuRII7kNYaTM,582
|
|
246
|
+
re_common-10.0.38.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
|
247
|
+
re_common-10.0.38.dist-info/top_level.txt,sha256=_H9H23zoLIalm1AIY_KYTVh_H0ZnmjxQIxsvXtLv45o,10
|
|
248
|
+
re_common-10.0.38.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|