re-common 10.0.36__py3-none-any.whl → 10.0.38__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -179,12 +179,24 @@ class HTMLTextExtractor(HTMLParser):
179
179
  # parser.close()
180
180
  # return parser.get_text()
181
181
 
182
+ # def clean_html(html):
183
+ # """使用 Parsel 提取 HTML 中的纯文本"""
184
+ # sel = Selector(text=html, type='html')
185
+ # # 提取所有文本(包括子元素的文本)
186
+ # text = sel.xpath("string()").getall()
187
+ # return "".join(text).strip()
188
+
189
+
182
190
  def clean_html(html):
183
- """使用 Parsel 提取 HTML 中的纯文本"""
184
- sel = Selector(text=html, type='html')
185
- # 提取所有文本(包括子元素的文本)
186
- text = sel.xpath("string()").getall()
187
- return "".join(text).strip()
191
+ if "<" in html:
192
+ from bs4 import BeautifulSoup
193
+ try:
194
+ soup = BeautifulSoup(html, "lxml")
195
+ return soup.get_text()
196
+ except:
197
+ soup = BeautifulSoup(html, "html5lib")
198
+ return soup.get_text()
199
+ return html
188
200
 
189
201
 
190
202
  def remove_spaces_between_chinese_characters(text):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: re_common
3
- Version: 10.0.36
3
+ Version: 10.0.38
4
4
  Summary: a library about all python projects
5
5
  Home-page: https://gitee.com/xujiangios/re-common
6
6
  Author: vic
@@ -214,7 +214,7 @@ re_common/v2/baselibrary/utils/n_ary_expression_tree.py,sha256=-05kO6G2Rth7CEK-5
214
214
  re_common/v2/baselibrary/utils/string_bool.py,sha256=vxnjSFOfuHWGxkqaIbUNn21opx5tfV1uCXSahFfp1mU,6197
215
215
  re_common/v2/baselibrary/utils/string_clear.py,sha256=Ympa0Cs2y_72QeeyMS8de8y_QgtEFJJQ0AgHnylbMUc,7861
216
216
  re_common/v2/baselibrary/utils/string_smi.py,sha256=cU0WAWHRGnGoVQx3eCEKeM_q_olFNzRTJe7rSe586SY,741
217
- re_common/v2/baselibrary/utils/stringutils.py,sha256=F1JZ9vfSWM0TEffiNUGVE40yrzXz0fuzrYyys-PgDqw,8144
217
+ re_common/v2/baselibrary/utils/stringutils.py,sha256=uUjcyMEmYIzPUxQ_enpRj_qrvQnYtoL2gEug7WIrIbU,8463
218
218
  re_common/vip/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
219
219
  re_common/vip/base_step_process.py,sha256=VXXiNj0I5CpzXIMCgOPU86bzDJkSBkUS-9CpZIl_GOk,205
220
220
  re_common/vip/baseencodeid.py,sha256=nERoe89ueFM52bG7xwJdflcZHk6T2RQQKbc5uUZc3RM,3272
@@ -241,8 +241,8 @@ re_common/vip/title/transform/TransformRegulationTitleToZt.py,sha256=LKRdIsWKues
241
241
  re_common/vip/title/transform/TransformStandardTitleToZt.py,sha256=-fCKAbSBzXVyQDCE61CalvR9E_QzQMA08QOO_NePFNI,5563
242
242
  re_common/vip/title/transform/TransformThesisTitleToZt.py,sha256=QS-uV0cQrpUFAcKucuJQ9Ue2VRQH-inmfn_X3IplfRo,5488
243
243
  re_common/vip/title/transform/__init__.py,sha256=m83-CWyRq_VHPYHaALEQlmXrkTdrZ3e4B_kCfBYE-uc,239
244
- re_common-10.0.36.dist-info/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
245
- re_common-10.0.36.dist-info/METADATA,sha256=2BGTDBrd17-eWFaKfrQbE9xZdyOe3i-NwME64lIRjvs,582
246
- re_common-10.0.36.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
247
- re_common-10.0.36.dist-info/top_level.txt,sha256=_H9H23zoLIalm1AIY_KYTVh_H0ZnmjxQIxsvXtLv45o,10
248
- re_common-10.0.36.dist-info/RECORD,,
244
+ re_common-10.0.38.dist-info/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
245
+ re_common-10.0.38.dist-info/METADATA,sha256=abixIXN2b8yXkvit_fXpJkwJtQGlJXwWuRII7kNYaTM,582
246
+ re_common-10.0.38.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
247
+ re_common-10.0.38.dist-info/top_level.txt,sha256=_H9H23zoLIalm1AIY_KYTVh_H0ZnmjxQIxsvXtLv45o,10
248
+ re_common-10.0.38.dist-info/RECORD,,