hjxdl 0.1.7__py3-none-any.whl → 0.1.8__py3-none-any.whl
This diff shows the changes between publicly available versions of the package as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- hdl/_version.py +2 -2
- hdl/utils/llm/embs.py +5 -3
- {hjxdl-0.1.7.dist-info → hjxdl-0.1.8.dist-info}/METADATA +1 -1
- {hjxdl-0.1.7.dist-info → hjxdl-0.1.8.dist-info}/RECORD +6 -6
- {hjxdl-0.1.7.dist-info → hjxdl-0.1.8.dist-info}/WHEEL +0 -0
- {hjxdl-0.1.7.dist-info → hjxdl-0.1.8.dist-info}/top_level.txt +0 -0
hdl/_version.py
CHANGED
hdl/utils/llm/embs.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
|
1
|
+
import re
|
2
2
|
|
3
3
|
|
4
4
|
class BEEmbedder():
|
@@ -118,6 +118,8 @@ class HFEmbedder():
|
|
118
118
|
Returns:
|
119
119
|
None
|
120
120
|
"""
|
121
|
+
|
122
|
+
from sentence_transformers import SentenceTransformer
|
121
123
|
|
122
124
|
self.device = device
|
123
125
|
self.emb_dir = emb_dir
|
@@ -190,7 +192,7 @@ class HFEmbedder():
|
|
190
192
|
|
191
193
|
def get_n_tokens(
|
192
194
|
paragraph,
|
193
|
-
model: str =
|
195
|
+
model: str = ""
|
194
196
|
):
|
195
197
|
"""Get the number of tokens in a paragraph using a specified model.
|
196
198
|
|
@@ -201,7 +203,7 @@ def get_n_tokens(
|
|
201
203
|
Returns:
|
202
204
|
int: The number of tokens in the paragraph based on the specified model or default CJK tokenization.
|
203
205
|
"""
|
204
|
-
if model
|
206
|
+
if model == "":
|
205
207
|
cjk_regex = re.compile(u'[\u1100-\uFFFDh]+?')
|
206
208
|
trimed_cjk = cjk_regex.sub( ' a ', paragraph, 0)
|
207
209
|
return len(trimed_cjk.split())
|
@@ -1,5 +1,5 @@
|
|
1
1
|
hdl/__init__.py,sha256=5sZZNySv08wwfzJcSDssGTqUn9wlmDsR6R4XB8J8mFM,70
|
2
|
-
hdl/_version.py,sha256=
|
2
|
+
hdl/_version.py,sha256=PdJ7dZoz_SyEgX0MdrMfQYBFlGcwpemv6ibF8NKALBY,411
|
3
3
|
hdl/args/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
4
4
|
hdl/args/loss_args.py,sha256=s7YzSdd7IjD24rZvvOrxLLFqMZQb9YylxKeyelSdrTk,70
|
5
5
|
hdl/controllers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -85,11 +85,11 @@ hdl/utils/general/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
|
|
85
85
|
hdl/utils/general/glob.py,sha256=8-RCnt6L297wMIfn34ZAMCsGCZUjHG3MGglGZI1cX0g,491
|
86
86
|
hdl/utils/llm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
87
87
|
hdl/utils/llm/chat.py,sha256=gsbqWh8fTcJUENU6ZuMClZAuSOLFnD5VP8kXOxGh3Zw,13776
|
88
|
-
hdl/utils/llm/embs.py,sha256=
|
88
|
+
hdl/utils/llm/embs.py,sha256=Tf0FOYrOFZp7qQpEPiSCXzlgyHH0X9HVTUtsup74a9E,7174
|
89
89
|
hdl/utils/llm/extract.py,sha256=2sK_WJzmYIc8iuWaM9DA6Nw3_6q1O4lJ5pKpcZo-bBA,6512
|
90
90
|
hdl/utils/schedulers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
91
91
|
hdl/utils/schedulers/norm_lr.py,sha256=bDwCmdEK-WkgxQMFBiMuchv8Mm7C0-GZJ6usm-PQk14,4461
|
92
|
-
hjxdl-0.1.
|
93
|
-
hjxdl-0.1.
|
94
|
-
hjxdl-0.1.
|
95
|
-
hjxdl-0.1.
|
92
|
+
hjxdl-0.1.8.dist-info/METADATA,sha256=a9BaE0EGy5G9EM3Tbsi4LMmIrCMFJUuDjFnmmu_nBW4,542
|
93
|
+
hjxdl-0.1.8.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
|
94
|
+
hjxdl-0.1.8.dist-info/top_level.txt,sha256=-kxwTM5JPhylp06z3zAVO3w6_h7wtBfBo2zgM6YZoTk,4
|
95
|
+
hjxdl-0.1.8.dist-info/RECORD,,
|
File without changes
|
File without changes
|