re-common 10.0.3__py3-none-any.whl → 10.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,7 @@
1
1
  import re
2
2
  import regex
3
3
 
4
- from re_common.v2.baselibrary.utils.stringutils import qj2bj, bj2qj, get_diacritic_variant
4
+ from re_common.v2.baselibrary.utils.stringutils import qj2bj, bj2qj, get_diacritic_variant, clean_html
5
5
 
6
6
 
7
7
  class StringClear(object):
@@ -88,20 +88,9 @@ class StringClear(object):
88
88
 
89
89
  def remove_html_tag(self):
90
90
  import html
91
- from parsel import Selector
92
91
 
93
92
  self.obj_str = html.unescape(self.obj_str)
94
93
 
95
- def clean_html(html):
96
- sel = Selector(text=html, type='html')
97
- # 移除脚本和样式
98
- sel.xpath('//script').remove()
99
- sel.xpath('//style').remove()
100
-
101
- # 提取文本
102
- text_nodes = sel.xpath('//text()').getall()
103
- return ''.join(t.strip() for t in text_nodes if t.strip())
104
-
105
94
  self.obj_str = clean_html(self.obj_str)
106
95
 
107
96
  return self
@@ -1,7 +1,9 @@
1
1
  import re
2
+ import threading
2
3
 
3
4
  import regex
4
5
  import unicodedata
6
+ from html.parser import HTMLParser
5
7
 
6
8
 
7
9
  def bj2qj(src):
@@ -25,6 +27,7 @@ def bj2qj(src):
25
27
 
26
28
  return ''.join(buf)
27
29
 
30
+
28
31
  def qj2bj(src):
29
32
  """
30
33
  全角转半角
@@ -62,6 +65,7 @@ def get_diacritic_variant(char1):
62
65
  # 判断基本字符是否相同
63
66
  return base_char1
64
67
 
68
+
65
69
  def get_alphabetic_ratio(text: str) -> float:
66
70
  # 返回字母型字符所占比例
67
71
  if not text:
@@ -93,4 +97,50 @@ def get_alphabetic_ratio(text: str) -> float:
93
97
  alphabetic_chars = re.findall(alphabetic_pattern, clean_text)
94
98
 
95
99
  # 返回字母型字符所占比例
96
- return len(alphabetic_chars) / len(clean_text)
100
+ return len(alphabetic_chars) / len(clean_text)
101
+
102
+
103
class HTMLTextExtractor(HTMLParser):
    """Collect the text content of an HTML document.

    Text found between ``<script>``/``<style>`` start and end tags is
    dropped, and data chunks that consist only of whitespace are ignored.
    The remaining chunks are concatenated without a separator by
    :meth:`get_text`.

    Instances are intended to be reused: call :meth:`reset_state` before
    feeding a new document. :meth:`get_parser` hands out one cached
    instance per thread and per class.
    """

    # Thread-local storage holding the cached parser instances.
    _thread_local = threading.local()

    def __init__(self):
        super().__init__()
        self.reset_state()

    def handle_starttag(self, tag, attrs):
        """Begin skipping text when a ``script`` or ``style`` tag opens."""
        if tag in ('script', 'style'):
            self.skip = True

    def handle_endtag(self, tag):
        """Resume collecting text when a ``script`` or ``style`` tag closes."""
        if tag in ('script', 'style'):
            self.skip = False

    def handle_data(self, data):
        """Store a text chunk unless it is skipped or whitespace-only.

        The chunk is stored as-is (not stripped), so inner spacing such as
        the blanks between words is preserved.
        """
        if not self.skip and data.strip():
            self.text.append(data)

    def reset_state(self):
        """Reset the underlying parser and discard any collected text."""
        self.reset()
        self.text = []
        self.skip = False

    def get_text(self):
        """Return the collected text joined together, with outer whitespace stripped."""
        return ''.join(self.text).strip()

    @classmethod
    def get_parser(cls):
        """Return the calling thread's cached parser instance for ``cls``.

        Instances are cached per thread *and* per class. ``_thread_local``
        is shared by every subclass, so caching a single instance would
        hand a subclass an object of whichever class asked first.
        """
        local = cls._thread_local
        try:
            parsers = local.parsers
        except AttributeError:
            # First use in this thread: create the per-class cache.
            parsers = local.parsers = {}

        parser = parsers.get(cls)
        if parser is None:
            parser = parsers[cls] = cls()
        return parser
136
+
137
+
138
def clean_html(html):
    """Extract the text of an HTML string, leaving out script and style content.

    :param html: HTML markup to process.
    :return: the text collected by ``HTMLTextExtractor``, with leading and
        trailing whitespace removed.
    """
    extractor = HTMLTextExtractor.get_parser()
    # The instance comes from a per-thread cache, so clear whatever a
    # previous call left behind before parsing new input.
    extractor.reset_state()
    extractor.feed(html)
    # Signal end of input so any text still buffered gets processed.
    extractor.close()
    return extractor.get_text()
144
+
145
+
146
+
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: re_common
3
- Version: 10.0.3
3
+ Version: 10.0.4
4
4
  Summary: a library about all python projects
5
5
  Home-page: https://gitee.com/xujiangios/re-common
6
6
  Author: vic
@@ -178,8 +178,8 @@ re_common/v2/baselibrary/utils/basedict.py,sha256=tSV85pARe8ZQDY77_h_heS81EWwcgJ
178
178
  re_common/v2/baselibrary/utils/basehdfs.py,sha256=NVV5Q0OMPlM_zTrs9ZDoPJv29GQv5wi9-AP1us5dBrQ,4651
179
179
  re_common/v2/baselibrary/utils/json_cls.py,sha256=dHOkWafG9lbQDoub9cbDwT2fDjMKtblQnjFLeA4hECA,286
180
180
  re_common/v2/baselibrary/utils/string_bool.py,sha256=4VCr1g8pX5YnzZSKctQgQfmhSQ0aw7a8ruhWdiRmBFU,641
181
- re_common/v2/baselibrary/utils/string_clear.py,sha256=okoBZCL1FytdI9FO7MBLTe1fvgGWt0-z87oUkg7jWkg,4171
182
- re_common/v2/baselibrary/utils/stringutils.py,sha256=quAgCdW_ayQwY4AqnZZkZ4NlcSEcy6f1arOVSeP2vEo,2699
181
+ re_common/v2/baselibrary/utils/string_clear.py,sha256=6mkBAZUNh5-JTPmB9lj_i4eLT9C6ZW1nH4tZiGveIE4,3778
182
+ re_common/v2/baselibrary/utils/stringutils.py,sha256=GLXHAm8IulC_8hWrN2aiFQjsoOpjczvcVozmTJj86-A,3864
183
183
  re_common/vip/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
184
184
  re_common/vip/base_step_process.py,sha256=VXXiNj0I5CpzXIMCgOPU86bzDJkSBkUS-9CpZIl_GOk,205
185
185
  re_common/vip/baseencodeid.py,sha256=nERoe89ueFM52bG7xwJdflcZHk6T2RQQKbc5uUZc3RM,3272
@@ -206,8 +206,8 @@ re_common/vip/title/transform/TransformRegulationTitleToZt.py,sha256=LKRdIsWKues
206
206
  re_common/vip/title/transform/TransformStandardTitleToZt.py,sha256=-fCKAbSBzXVyQDCE61CalvR9E_QzQMA08QOO_NePFNI,5563
207
207
  re_common/vip/title/transform/TransformThesisTitleToZt.py,sha256=QS-uV0cQrpUFAcKucuJQ9Ue2VRQH-inmfn_X3IplfRo,5488
208
208
  re_common/vip/title/transform/__init__.py,sha256=m83-CWyRq_VHPYHaALEQlmXrkTdrZ3e4B_kCfBYE-uc,239
209
- re_common-10.0.3.dist-info/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
210
- re_common-10.0.3.dist-info/METADATA,sha256=L9pU8jkMSaBnkTrRwU5xP9O5l5TGPQms5DZM-Ux4MDw,581
211
- re_common-10.0.3.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
212
- re_common-10.0.3.dist-info/top_level.txt,sha256=_H9H23zoLIalm1AIY_KYTVh_H0ZnmjxQIxsvXtLv45o,10
213
- re_common-10.0.3.dist-info/RECORD,,
209
+ re_common-10.0.4.dist-info/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
210
+ re_common-10.0.4.dist-info/METADATA,sha256=SNPQXc5koTrhSxu9yAPRPN42uItn6onNvmG7GHTMdcE,581
211
+ re_common-10.0.4.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
212
+ re_common-10.0.4.dist-info/top_level.txt,sha256=_H9H23zoLIalm1AIY_KYTVh_H0ZnmjxQIxsvXtLv45o,10
213
+ re_common-10.0.4.dist-info/RECORD,,