hjxdl 0.0.17__py3-none-any.whl → 0.0.19__py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their public registries, and is provided for informational purposes only.
- hdl/_version.py +2 -2
- hdl/utils/llm/embs.py +119 -4
- {hjxdl-0.0.17.dist-info → hjxdl-0.0.19.dist-info}/METADATA +1 -1
- {hjxdl-0.0.17.dist-info → hjxdl-0.0.19.dist-info}/RECORD +6 -6
- {hjxdl-0.0.17.dist-info → hjxdl-0.0.19.dist-info}/WHEEL +0 -0
- {hjxdl-0.0.17.dist-info → hjxdl-0.0.19.dist-info}/top_level.txt +0 -0
hdl/_version.py
CHANGED
hdl/utils/llm/embs.py
CHANGED
@@ -1,8 +1,14 @@
-class HFEmbedder():
+from sentence_transformers import SentenceTransformer
+
+
+class BEEmbedder():
     def __init__(
         self,
         emb_name: str = "bge",
-        emb_dir: str = None
+        emb_dir: str = None,
+        device: str = 'cuda',
+        batch_size: int = 16,
+        max_length: int = 1024,
     ) -> None:
         """Initializes the object with the specified embedding name and directory.
 
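This first hunk renames the FlagEmbedding-backed wrapper (formerly HFEmbedder) to BEEmbedder and adds device, batch_size, and max_length keyword arguments. A minimal construction sketch follows; the checkpoint path is an assumption for illustration, not part of the diff:

from hdl.utils.llm.embs import BEEmbedder

# Hypothetical local BGE-M3 checkpoint directory; substitute your own.
emb = BEEmbedder(
    emb_name="bge",
    emb_dir="/models/bge-m3",
    device="cuda",
    batch_size=16,
    max_length=1024,
)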
@@ -15,6 +21,14 @@ class HFEmbedder():
         """
         self.emb_name = emb_name
         self.emb_dir = emb_dir
+        self.batch_size = batch_size
+
+        self.model_kwargs = {'device': device}
+        self.encode_kwargs = {
+            'batch_size': self.batch_size,
+            'normalize_embeddings': True,
+            'show_progress_bar': False
+        }
 
         if "bge" in emb_name.lower():
             from FlagEmbedding import BGEM3FlagModel
@@ -31,7 +45,7 @@ class HFEmbedder():
 
     def encode(
         self,
-        sentences
+        sentences,
     ):
         """Encode the input sentences using the model.
 
@@ -47,7 +61,9 @@ class HFEmbedder():
             sentences,
             return_dense=True,
             return_sparse=True,
-            return_colbert_vecs=False
+            return_colbert_vecs=False,
+            batch_size=self.batch_size,
+            max_length=self.max_length
         )
         if "bge" in self.emb_name.lower():
             return output["dense_vecs"]
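This hunk threads the new per-instance batch_size and max_length values into FlagEmbedding's encoder. For reference, a standalone sketch of the underlying call; the model path and use_fp16 flag are assumptions:

from FlagEmbedding import BGEM3FlagModel

# Load a BGE-M3 model (the path here is illustrative).
model = BGEM3FlagModel("BAAI/bge-m3", use_fp16=True)
output = model.encode(
    ["a sample sentence"],
    return_dense=True,
    return_sparse=True,
    return_colbert_vecs=False,
    batch_size=16,    # previously hard-wired, now configurable per instance
    max_length=1024,  # token cap applied when encoding each sentence
)
dense = output["dense_vecs"]  # dense embeddings, matching the return path above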
@@ -71,3 +87,102 @@ class HFEmbedder():
         output_2 = self.encode(sentences_2)
         similarity = output_1 @ output_2.T
         return similarity
+
+
+class HFEmbedder():
+    def __init__(
+        self,
+        emb_dir: str = None,
+        device: str = 'cuda',
+        trust_remote_code: bool = True,
+        *args, **kwargs
+    ) -> None:
+        """Initialize the class with the specified parameters.
+
+        Args:
+            emb_dir (str): Directory path to the embeddings.
+            device (str): Device to be used for computation (default is 'cuda').
+            trust_remote_code (bool): Whether to trust remote code (default is True).
+            *args: Variable length argument list.
+            **kwargs: Arbitrary keyword arguments.
+                - modules: Optional[Iterable[torch.nn.modules.module.Module]] = None,
+                - device: Optional[str] = None,
+                - prompts: Optional[Dict[str, str]] = None,
+                - default_prompt_name: Optional[str] = None,
+                - cache_folder: Optional[str] = None,
+                - revision: Optional[str] = None,
+                - token: Union[str, bool, NoneType] = None,
+                - use_auth_token: Union[str, bool, NoneType] = None,
+                - truncate_dim: Optional[int] = None,
+
+        Returns:
+            None
+        """
+
+        self.device = device
+        self.emb_dir = emb_dir
+
+        self.model = SentenceTransformer(
+            emb_dir,
+            device=device,
+            trust_remote_code=trust_remote_code,
+            *args, **kwargs
+        ).half()
+        # self.model = model.half()
+
+    def encode(
+        self,
+        sentences: list[str],
+        *args, **kwargs
+    ):
+        """Encode the input sentences using the model.
+
+        Args:
+            sentences (list[str]): List of input sentences to encode.
+            *args: Variable length argument list.
+            **kwargs: Arbitrary keyword arguments.
+                - prompt_name: Optional[str] = None,
+                - prompt: Optional[str] = None,
+                - batch_size: int = 32,
+                - show_progress_bar: bool = None,
+                - output_value: Optional[Literal['sentence_embedding', 'token_embeddings']] = 'sentence_embedding',
+                - precision: Literal['float32', 'int8', 'uint8', 'binary', 'ubinary'] = 'float32',
+                - convert_to_numpy: bool = True,
+                - convert_to_tensor: bool = False,
+                - device: str = None,
+                - normalize_embeddings: bool = False,
+
+        Returns:
+            output: Encoded representation of the input sentences.
+        """
+        if isinstance(sentences, str):
+            sentences = [sentences]
+        if kwargs.get("convert_to_tensor", False) is True:
+            kwargs["device"] = self.device
+        output = self.model.encode(
+            sentences,
+            *args, **kwargs
+        )
+        return output
+
+    def sim(
+        self,
+        sentences_1,
+        sentences_2,
+        *args, **kwargs
+    ):
+        """Calculate the similarity between two sets of sentences.
+
+        Args:
+            sentences_1 (list): List of sentences for the first set.
+            sentences_2 (list): List of sentences for the second set.
+            *args: Additional positional arguments to be passed to the encode function.
+            **kwargs: Additional keyword arguments to be passed to the encode function.
+
+        Returns:
+            numpy.ndarray: Similarity matrix between the two sets of sentences.
+        """
+        output_1 = self.encode(sentences_1, *args, **kwargs)
+        output_2 = self.encode(sentences_2, *args, **kwargs)
+        similarity = output_1 @ output_2.T
+        return similarity
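The bulk of the release is this new SentenceTransformer-backed HFEmbedder, added alongside the renamed BGE wrapper. A hedged usage sketch; the checkpoint directory is an assumption, and any sentence-transformers-compatible model should work:

from hdl.utils.llm.embs import HFEmbedder

# Hypothetical model directory; the constructor loads the model in fp16
# via .half(), so a CUDA device is the practical target.
emb = HFEmbedder(emb_dir="/models/my-st-model", device="cuda")
vecs = emb.encode(["hello world"], normalize_embeddings=True)  # numpy array by default
sim = emb.sim(["query"], ["doc one", "doc two"])  # similarity via output_1 @ output_2.T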
{hjxdl-0.0.17.dist-info → hjxdl-0.0.19.dist-info}/RECORD
CHANGED
@@ -1,5 +1,5 @@
 hdl/__init__.py,sha256=5sZZNySv08wwfzJcSDssGTqUn9wlmDsR6R4XB8J8mFM,70
-hdl/_version.py,sha256=
+hdl/_version.py,sha256=CYabGzkNwriz1Zjt5kNvBOZD6wtqQ_twYh4s5xzmT-I,413
 hdl/args/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 hdl/args/loss_args.py,sha256=s7YzSdd7IjD24rZvvOrxLLFqMZQb9YylxKeyelSdrTk,70
 hdl/controllers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -85,10 +85,10 @@ hdl/utils/general/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
 hdl/utils/general/glob.py,sha256=8-RCnt6L297wMIfn34ZAMCsGCZUjHG3MGglGZI1cX0g,491
 hdl/utils/llm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 hdl/utils/llm/chat.py,sha256=H2c8assJlSdZQKIfPkYrVZHqv66TsdsxtaLXv0kNe1w,11565
-hdl/utils/llm/embs.py,sha256=
+hdl/utils/llm/embs.py,sha256=sC8tga7HgDwPI2m7TDWKp9kkxEIMxEyMtgmEhfRi4vI,6362
 hdl/utils/schedulers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 hdl/utils/schedulers/norm_lr.py,sha256=bDwCmdEK-WkgxQMFBiMuchv8Mm7C0-GZJ6usm-PQk14,4461
-hjxdl-0.0.
-hjxdl-0.0.
-hjxdl-0.0.
-hjxdl-0.0.
+hjxdl-0.0.19.dist-info/METADATA,sha256=W1TN19HaXAt3kP6BvCpHWNam7w9lhe3LlVql4QCh5jw,543
+hjxdl-0.0.19.dist-info/WHEEL,sha256=Z4pYXqR_rTB7OWNDYFOm1qRk0RX6GFP2o8LgvP453Hk,91
+hjxdl-0.0.19.dist-info/top_level.txt,sha256=-kxwTM5JPhylp06z3zAVO3w6_h7wtBfBo2zgM6YZoTk,4
+hjxdl-0.0.19.dist-info/RECORD,,
{hjxdl-0.0.17.dist-info → hjxdl-0.0.19.dist-info}/WHEEL
File without changes
{hjxdl-0.0.17.dist-info → hjxdl-0.0.19.dist-info}/top_level.txt
File without changes