nltkor-1.2.14-cp311-cp311-macosx_13_0_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127)
  1. nltkor/Kor_char.py +193 -0
  2. nltkor/__init__.py +16 -0
  3. nltkor/alignment/__init__.py +1315 -0
  4. nltkor/cider/__init__.py +2 -0
  5. nltkor/cider/cider.py +55 -0
  6. nltkor/cider/cider_scorer.py +207 -0
  7. nltkor/distance/__init__.py +441 -0
  8. nltkor/distance/wasserstein.py +126 -0
  9. nltkor/etc.py +22 -0
  10. nltkor/lazyimport.py +144 -0
  11. nltkor/make_requirement.py +11 -0
  12. nltkor/metrics/__init__.py +63 -0
  13. nltkor/metrics/bartscore.py +301 -0
  14. nltkor/metrics/bertscore.py +331 -0
  15. nltkor/metrics/bleu_tensor.py +20 -0
  16. nltkor/metrics/classical.py +847 -0
  17. nltkor/metrics/entment.py +24 -0
  18. nltkor/metrics/eval.py +517 -0
  19. nltkor/metrics/mauve.py +273 -0
  20. nltkor/metrics/mauve_utils.py +131 -0
  21. nltkor/misc/__init__.py +11 -0
  22. nltkor/misc/string2string_basic_functions.py +59 -0
  23. nltkor/misc/string2string_default_tokenizer.py +83 -0
  24. nltkor/misc/string2string_hash_functions.py +159 -0
  25. nltkor/misc/string2string_word_embeddings.py +503 -0
  26. nltkor/search/__init__.py +10 -0
  27. nltkor/search/classical.py +569 -0
  28. nltkor/search/faiss_search.py +787 -0
  29. nltkor/search/kobert_tokenizer.py +181 -0
  30. nltkor/sejong/__init__.py +3 -0
  31. nltkor/sejong/__pycache__/__init__.cpython-38.pyc +0 -0
  32. nltkor/sejong/__pycache__/__init__.cpython-39.pyc +0 -0
  33. nltkor/sejong/__pycache__/sejong_download.cpython-38.pyc +0 -0
  34. nltkor/sejong/__pycache__/sejong_download.cpython-39.pyc +0 -0
  35. nltkor/sejong/__pycache__/ssem.cpython-38.pyc +0 -0
  36. nltkor/sejong/__pycache__/ssem.cpython-39.pyc +0 -0
  37. nltkor/sejong/ch.py +12 -0
  38. nltkor/sejong/dict_semClassNum.txt +491 -0
  39. nltkor/sejong/layer.txt +630 -0
  40. nltkor/sejong/sejong_download.py +87 -0
  41. nltkor/sejong/ssem.py +684 -0
  42. nltkor/similarity/__init__.py +3 -0
  43. nltkor/similarity/bartscore____.py +337 -0
  44. nltkor/similarity/bertscore____.py +339 -0
  45. nltkor/similarity/classical.py +245 -0
  46. nltkor/similarity/cosine_similarity.py +175 -0
  47. nltkor/tag/__init__.py +71 -0
  48. nltkor/tag/__pycache__/__init__.cpython-38.pyc +0 -0
  49. nltkor/tag/__pycache__/__init__.cpython-39.pyc +0 -0
  50. nltkor/tag/__pycache__/espresso_tag.cpython-38.pyc +0 -0
  51. nltkor/tag/__pycache__/espresso_tag.cpython-39.pyc +0 -0
  52. nltkor/tag/espresso_tag.py +220 -0
  53. nltkor/tag/libs/__init__.py +10 -0
  54. nltkor/tag/libs/__pycache__/__init__.cpython-38.pyc +0 -0
  55. nltkor/tag/libs/__pycache__/__init__.cpython-39.pyc +0 -0
  56. nltkor/tag/libs/__pycache__/attributes.cpython-38.pyc +0 -0
  57. nltkor/tag/libs/__pycache__/attributes.cpython-39.pyc +0 -0
  58. nltkor/tag/libs/__pycache__/config.cpython-38.pyc +0 -0
  59. nltkor/tag/libs/__pycache__/config.cpython-39.pyc +0 -0
  60. nltkor/tag/libs/__pycache__/metadata.cpython-38.pyc +0 -0
  61. nltkor/tag/libs/__pycache__/metadata.cpython-39.pyc +0 -0
  62. nltkor/tag/libs/__pycache__/reader.cpython-38.pyc +0 -0
  63. nltkor/tag/libs/__pycache__/reader.cpython-39.pyc +0 -0
  64. nltkor/tag/libs/__pycache__/taggers.cpython-38.pyc +0 -0
  65. nltkor/tag/libs/__pycache__/taggers.cpython-39.pyc +0 -0
  66. nltkor/tag/libs/__pycache__/utils.cpython-38.pyc +0 -0
  67. nltkor/tag/libs/__pycache__/utils.cpython-39.pyc +0 -0
  68. nltkor/tag/libs/__pycache__/word_dictionary.cpython-38.pyc +0 -0
  69. nltkor/tag/libs/__pycache__/word_dictionary.cpython-39.pyc +0 -0
  70. nltkor/tag/libs/arguments.py +280 -0
  71. nltkor/tag/libs/attributes.py +231 -0
  72. nltkor/tag/libs/config.py +159 -0
  73. nltkor/tag/libs/metadata.py +129 -0
  74. nltkor/tag/libs/ner/__init__.py +2 -0
  75. nltkor/tag/libs/ner/__pycache__/__init__.cpython-38.pyc +0 -0
  76. nltkor/tag/libs/ner/__pycache__/__init__.cpython-39.pyc +0 -0
  77. nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-38.pyc +0 -0
  78. nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-39.pyc +0 -0
  79. nltkor/tag/libs/ner/macmorphoreader.py +7 -0
  80. nltkor/tag/libs/ner/ner_reader.py +92 -0
  81. nltkor/tag/libs/network.c +72325 -0
  82. nltkor/tag/libs/network.cpython-311-darwin.so +0 -0
  83. nltkor/tag/libs/network.pyx +878 -0
  84. nltkor/tag/libs/networkconv.pyx +1028 -0
  85. nltkor/tag/libs/networkdependencyconv.pyx +451 -0
  86. nltkor/tag/libs/parse/__init__.py +1 -0
  87. nltkor/tag/libs/parse/__pycache__/__init__.cpython-38.pyc +0 -0
  88. nltkor/tag/libs/parse/__pycache__/__init__.cpython-39.pyc +0 -0
  89. nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-38.pyc +0 -0
  90. nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-39.pyc +0 -0
  91. nltkor/tag/libs/parse/parse_reader.py +283 -0
  92. nltkor/tag/libs/pos/__init__.py +2 -0
  93. nltkor/tag/libs/pos/__pycache__/__init__.cpython-38.pyc +0 -0
  94. nltkor/tag/libs/pos/__pycache__/__init__.cpython-39.pyc +0 -0
  95. nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-38.pyc +0 -0
  96. nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-39.pyc +0 -0
  97. nltkor/tag/libs/pos/macmorphoreader.py +7 -0
  98. nltkor/tag/libs/pos/pos_reader.py +97 -0
  99. nltkor/tag/libs/reader.py +485 -0
  100. nltkor/tag/libs/srl/__init__.py +3 -0
  101. nltkor/tag/libs/srl/__pycache__/__init__.cpython-38.pyc +0 -0
  102. nltkor/tag/libs/srl/__pycache__/__init__.cpython-39.pyc +0 -0
  103. nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-38.pyc +0 -0
  104. nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-39.pyc +0 -0
  105. nltkor/tag/libs/srl/__pycache__/train_srl.cpython-38.pyc +0 -0
  106. nltkor/tag/libs/srl/__pycache__/train_srl.cpython-39.pyc +0 -0
  107. nltkor/tag/libs/srl/__srl_reader_.py +535 -0
  108. nltkor/tag/libs/srl/srl_reader.py +436 -0
  109. nltkor/tag/libs/srl/train_srl.py +87 -0
  110. nltkor/tag/libs/taggers.py +926 -0
  111. nltkor/tag/libs/utils.py +384 -0
  112. nltkor/tag/libs/word_dictionary.py +239 -0
  113. nltkor/tag/libs/wsd/__init__.py +2 -0
  114. nltkor/tag/libs/wsd/__pycache__/__init__.cpython-38.pyc +0 -0
  115. nltkor/tag/libs/wsd/__pycache__/__init__.cpython-39.pyc +0 -0
  116. nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-38.pyc +0 -0
  117. nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-39.pyc +0 -0
  118. nltkor/tag/libs/wsd/macmorphoreader.py +7 -0
  119. nltkor/tag/libs/wsd/wsd_reader.py +93 -0
  120. nltkor/tokenize/__init__.py +62 -0
  121. nltkor/tokenize/ko_tokenize.py +115 -0
  122. nltkor/trans.py +121 -0
  123. nltkor-1.2.14.dist-info/LICENSE.txt +1093 -0
  124. nltkor-1.2.14.dist-info/METADATA +41 -0
  125. nltkor-1.2.14.dist-info/RECORD +127 -0
  126. nltkor-1.2.14.dist-info/WHEEL +5 -0
  127. nltkor-1.2.14.dist-info/top_level.txt +1 -0
nltkor/misc/string2string_hash_functions.py
@@ -0,0 +1,159 @@
+ """
+ string2string code
+ src = https://github.com/stanfordnlp/string2string
+
+
+ MIT License
+
+ Copyright (c) 2023 Mirac Suzgun
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+
+
+ """
+
+ """
+ This module contains the hash functions used in search algorithms.
+
+ A hash function takes a string (or other object) and returns a number.
+ The number is called the hash value, hash code, or simply the hash. The hash value is used to determine the location of the string in the hash table.
+ - The hash function must be deterministic, meaning that the same string always produces the same hash value.
+ - If two strings produce the same hash value, we say that the hash values collide.
+ - The hash function must also be fast, so it is important to keep the number of operations to a minimum.
+ """
+
+ from typing import List, Union, Tuple, Optional
+ import numpy as np
+
+
+ # A parent class for all hash functions
+ class HashFunction:
+     """
+     This class contains the parent class for all hash functions.
+     """
+     def __init__(self):
+         pass
+
+     def compute(self,
+         str1: str,
+     ) -> int:
+         """
+         Returns the hash value of a string.
+
+         Arguments:
+             str1 (str): The string.
+
+         Returns:
+             int: The hash value of the string.
+         """
+         pass
+
+
+ # Polynomial rolling hash function class
+ class PolynomialRollingHash(HashFunction):
+     """
+     This class contains the polynomial rolling hash function.
+     """
+
+     def __init__(self,
+         base: int = 10, # 256,
+         modulus: int = 101, # 65537,
+     ) -> None:
+         """
+         Initializes the polynomial rolling hash function.
+
+         Arguments:
+             base (int): The base to use. Default is 10 (the commented-out 256 is a common alternative).
+             modulus (int): The modulus to use. Default is 101 (the commented-out 65537 is a common alternative).
+
+         Returns:
+             None
+
+         .. note::
+             * Why 65537? Because it is a Fermat prime.
+         """
+         super().__init__()
+
+         # Check the inputs
+         assert base > 0, 'The base must be positive.'
+         assert modulus > 0, 'The modulus must be positive.'
+
+         # Set the attributes
+         self.base = base
+         self.modulus = modulus
+
+         # Initialize the current hash value
+         self.current_hash = 0
+
+
+     def compute(self,
+         str1: str,
+     ) -> int:
+         """
+         Returns the hash value of a string.
+
+         Arguments:
+             str1 (str): The string.
+
+         Returns:
+             int: The hash value of the string.
+         """
+         # Compute the hash value of the string
+         for char in str1:
+             self.current_hash = (self.current_hash * self.base + ord(char)) % self.modulus
+
+         # Return the hash value
+         return self.current_hash
+
+
+     def update(self,
+         old_char: str,
+         new_char: str,
+         window_size: int,
+     ) -> int:
+         """
+         Updates the hash value by sliding the window one character: the old character
+         leaves the window and the new character enters it.
+
+         Arguments:
+             old_char (str): The character leaving the window.
+             new_char (str): The character entering the window.
+             window_size (int): The size of the rolling window.
+
+         Returns:
+             int: The updated hash value.
+         """
+         # Update the hash value of the string
+         self.current_hash = (self.current_hash - ord(old_char) * (self.base ** (window_size - 1))) % self.modulus
+         self.current_hash = (self.current_hash * self.base + ord(new_char)) % self.modulus
+
+         # Return the hash value
+         return self.current_hash
+
+
+     def reset(self) -> None:
+         """
+         Resets the hash value.
+
+         Arguments:
+             None
+
+         Returns:
+             None
+         """
+         # Reset the current hash value
+         self.current_hash = 0
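
The compute/update pair above is what a Rabin-Karp-style search relies on: sliding the window by one character with update() gives the same value as rehashing the new window from scratch. A minimal usage sketch, assuming the module is importable under the path shown in the file list (nltkor/misc/string2string_hash_functions.py):

    # Minimal sketch; the import path is assumed from the file list above.
    from nltkor.misc.string2string_hash_functions import PolynomialRollingHash

    h = PolynomialRollingHash(base=10, modulus=101)
    hash_ab = h.compute("ab")                                      # hash of the first window "ab"
    hash_bc = h.update(old_char="a", new_char="c", window_size=2)  # slide the window "ab" -> "bc"

    h.reset()                          # clear the running hash before hashing from scratch
    assert hash_bc == h.compute("bc")  # the rolling update matches a fresh hash of the new window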
nltkor/misc/string2string_word_embeddings.py
@@ -0,0 +1,503 @@
+ """
+ string2string code
+ src = https://github.com/stanfordnlp/string2string
+
+
+ MIT License
+
+ Copyright (c) 2023 Mirac Suzgun
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+
+
+ """
+
+ """
+ This module implements the word embeddings class.
+ """
+ # from tqdm import tqdm
+ import numpy as np
+ from typing import List, Union
+ #import torch
+ import os
+ from nltkor.make_requirement import make_requirement
+ try:
+     import torch
+     from torch import Tensor
+     from torch.nn import functional as F
+     import fasttext
+     import fasttext.util
+ except ImportError:
+     requirement = ['torch', 'fasttext']
+     file_path = make_requirement(requirement)
+     raise Exception(f"""
+     Need to install Libraries, please pip install below libraries
+     \t pip install torch
+     \t pip install fasttext-wheel
+     Or, use pip install requirement.txt
+     \t pip install -r {file_path}
+     """)
+ # for dev purposes
+ import sys
+ # sys.path.append("/Users/dowon/nltk_ko/nltk/misc")
+ from nltkor.misc.string2string_default_tokenizer import Tokenizer
+ # from string2string_default_tokenizer import Tokenizer
+
+
+ class NeuralEmbeddings:
+     """
+     This class is an abstract class for neural word embeddings.
+     """
+
+     def __init__(self,
+         tokenizer: Tokenizer = None,
+     ) -> None:
+         """
+         Constructor.
+
+         Arguments:
+             tokenizer (Tokenizer): The tokenizer to use.
+         """
+         # Set the tokenizer (fall back to a whitespace tokenizer when none is given)
+         if tokenizer is None:
+             self.tokenizer = Tokenizer(word_delimiter=" ")
+         else:
+             self.tokenizer = tokenizer
+
+
+
+     def __call__(self,
+         tokens: Union[List[str], str],
+     ) -> Tensor:
+         """
+         This function returns the embeddings of the given tokens.
+
+         Arguments:
+             tokens (Union[List[str], str]): The tokens to embed.
+
+         Returns:
+             Tensor: The embeddings of the given tokens.
+         """
+         # Check the tokens
+         if isinstance(tokens, str):
+             tokens = self.tokenizer.tokenize(tokens)
+
+         # Embed the tokens
+         return self.embedding_layer(torch.tensor([self.vocabulary_dict[token] for token in tokens]))
+
+
+     def get_embedding(self,
+         tokens: Union[List[str], str]
+     ) -> Tensor:
+         """
+         This function returns the embeddings of the given tokens.
+
+         Arguments:
+             tokens (Union[List[str], str]): The tokens to embed.
+
+         Returns:
+             Tensor: The embeddings of the given tokens.
+         """
+         return self.__call__(tokens)
+
+
+ # GloVe embeddings class
+ class GloVeEmbeddings(NeuralEmbeddings):
+     """
+     This class implements the GloVe word embeddings.
+     """
+     # Pre-trained GloVe embeddings
+     # Source: https://github.com/stanfordnlp/GloVe#download-pre-trained-word-vectors
+     MODEL_OPTIONS = {
+         'glove.6B.200d': {
+             'Description': 'Wikipedia 2014 + Gigaword 5 (6B tokens, 400K vocab, uncased, 50d/100d/200d/300d vectors, 822 MB download)',
+             'URL': 'https://huggingface.co/stanfordnlp/glove/resolve/main/glove.6B.zip',
+         },
+         'glove.twitter.27B': {
+             'Description': 'Twitter (27B tokens, 1.2M vocab, uncased, 200d vectors, 1.42 GB download)',
+             'URL': 'https://huggingface.co/stanfordnlp/glove/resolve/main/glove.twitter.27B.zip',
+         },
+         'glove.42B.300d': {
+             'Description': 'Common Crawl (42B tokens, 1.9M vocab, uncased, 300d vectors, 1.75 GB download)',
+             'URL': 'https://huggingface.co/stanfordnlp/glove/resolve/main/glove.42B.300d.zip',
+         },
+         'glove.840B.300d': {
+             'Description': 'Common Crawl (840B tokens, 2.2M vocab, cased, 300d vectors, 2.03 GB download)',
+             'URL': 'https://huggingface.co/stanfordnlp/glove/resolve/main/glove.840B.300d.zip',
+         },
+     }
+
+     def __init__(self,
+         model: str = 'glove.6B.200d',
+         dim: int = 50,
+         force_download: bool = False,
+         dir = None,
+         tokenizer: Tokenizer = None,
+     ) -> None:
+         r"""
+         This function initializes the GloVe embeddings class.
+
+         Arguments:
+             model (str): The model to use. Default is 'glove.6B.200d'. (Options are: 'glove.6B.200d', 'glove.twitter.27B', 'glove.42B.300d', 'glove.840B.300d'.)
+             dim (int): The dimension of the embeddings to load (e.g., 50, 100, 200, or 300 for the 6B set). Default is 50.
+             force_download (bool): Whether to force download the model. Default is False.
+             dir (str): The directory to save or load the model. Default is None.
+             tokenizer (Tokenizer): The tokenizer to use. Default is None.
+
+         Returns:
+             None
+
+         Raises:
+             ValueError: If the model is not in MODEL_OPTIONS ['glove.6B.200d', 'glove.twitter.27B', 'glove.42B.300d', 'glove.840B.300d'].
+
+
+         .. attention::
+
+             If you use this class, please make sure to cite the following paper:
+
+             .. code-block:: latex
+
+                 @inproceedings{pennington2014glove,
+                     title={Glove: Global vectors for word representation},
+                     author={Pennington, Jeffrey and Socher, Richard and Manning, Christopher D},
+                     booktitle={Proceedings of the 2014 conference on empirical methods in natural language processing (EMNLP)},
+                     pages={1532--1543},
+                     year={2014}
+                 }
+
+
+         .. note::
+             * If directory is None, the model will be saved in the torch hub directory.
+             * If the model is not downloaded, it will be downloaded automatically.
+         """
+         # Check model
+         if model not in self.MODEL_OPTIONS:
+             raise ValueError(f'Invalid model: {model}.')
+
+         # Set the attributes
+         self.model = model
+         self.force_download = force_download
+         self.dir = dir
+         self.token_size = self.model.split('.')[1]
+         self.dim = dim
+
+         # Set the path
+         if self.dir is None:
+             self.dir = f'{torch.hub.get_dir()}/{self.model}'
+
+         # Remove the trailing slash
+         if self.dir[-1] == '/':
+             self.dir = self.dir[:-1]
+
+         # Download the embeddings if they do not exist or if force_download is True
+         if not os.path.exists(self.dir) or self.force_download:
+
+             # Create the directory if it does not exist
+             if not (os.path.exists(self.dir)):
+                 os.system(f'mkdir {self.dir}')
+
+             # Download the glove .zip file
+             print(f'Downloading the {self.model} zip file...')
+             torch.hub.download_url_to_file(
+                 url=self.MODEL_OPTIONS[self.model]['URL'],
+                 dst=f'{self.dir}/glove.zip',
+             )
+
+             # Unzip the glove .txt files
+             print(f'Unzipping the {self.model} zip file...')
+             os.system(f'unzip {self.dir}/glove.zip -d {self.dir}')
+
+             # Delete the zip file
+             os.system(f'rm {self.dir}/glove.zip')
+
+             # Process each glove .txt file and save it as a .pt file
+             for file in os.listdir(self.dir):
+                 # Extract the words and the embeddings from the glove .txt file and save them as a .pt file
+
+                 # Example of a glove .txt file:
+                 # the 0.418 0.24968 -0.41242 0.1217 ...
+                 # ...
+                 # and 0.26818 0.14346 -0.27877 0.016257 ...
+                 # ...
+
+                 print(f'Processing {file}...')
+
+                 # Load the file
+                 with open(f'{self.dir}/{file}', 'r') as f:
+                     lines = f.readlines()
+
+                 # Extract the dimension of the embeddings from the file name (e.g. glove.6B.200d.txt -> 200)
+                 file_embed_dim = file.split('.')[2][:-1]
+
+                 # Extract the words and the embeddings
+                 words = []
+                 embeddings = np.zeros((len(lines), int(file_embed_dim)))
+                 for i, line in enumerate(lines):
+                     line = line.split(' ')
+                     words.append(line[0])
+                     embeddings[i] = np.array([float(x) for x in line[1:]])
+
+                 # Convert the embeddings to a tensor
+                 embeddings = torch.from_numpy(embeddings)
+
+                 # Save the words and the embeddings as a .pt file
+                 torch.save(words, f'{self.dir}/{file[:-4]}.words.pt')
+                 torch.save(embeddings, f'{self.dir}/{file[:-4]}.embeddings.pt')
+
+             # Delete the glove .txt files
+             os.system(f'rm -r {self.dir}/*.txt')
+
+             # Load the weights and the vocabulary
+             weights = torch.load(f'{self.dir}/glove.{self.token_size}.{self.dim}d.embeddings.pt')
+             vocabulary = torch.load(f'{self.dir}/glove.{self.token_size}.{self.dim}d.words.pt')
+
+         # If the embeddings already exist
+         else:
+             # Load the weights and the vocabulary
+             weights = torch.load(f'{self.dir}/glove.{self.token_size}.{self.dim}d.embeddings.pt')
+             vocabulary = torch.load(f'{self.dir}/glove.{self.token_size}.{self.dim}d.words.pt')
+
+         # Create the vocabulary dictionary to be fed to the embedding layer
+         self.vocabulary_dict = {word: i for i, word in enumerate(vocabulary)}
+
+         # Create the embedding layer
+         self.embedding_layer = torch.nn.Embedding.from_pretrained(
+             embeddings=weights,
+             freeze=True,
+         )
+
+         # Set the tokenizer
+         if tokenizer is None:
+             self.tokenizer = Tokenizer()
+         else:
+             self.tokenizer = tokenizer
+
+
+     def __call__(self,
+         tokens: Union[List[str], str],
+     ) -> Tensor:
+         """
+         This function returns the embeddings of the given tokens.
+
+         Arguments:
+             tokens (Union[List[str], str]): The tokens to embed.
+
+         Returns:
+             Tensor: The embeddings of the given tokens.
+         """
+         return super().__call__(tokens)
+
+
+     def get_embedding(self,
+         tokens: Union[List[str], str]
+     ) -> Tensor:
+         r"""
+         This function returns the embeddings of the given tokens.
+
+         Arguments:
+             tokens (Union[List[str], str]): The tokens to embed.
+
+         Returns:
+             Tensor: The embeddings of the given tokens.
+         """
+         return self.__call__(tokens)
+
+
+ # FastTextEmbeddings class
+ class FastTextEmbeddings(NeuralEmbeddings):
+     """
+     This class implements the FastText embeddings.
+     """
+     def __init__(self,
+         model: str = 'cc.en.300.bin',
+         force_download: bool = True,
+         dir: str = None,
+     ) -> None:
+         r"""
+         This function initializes the FastTextEmbeddings class.
+
+         Arguments:
+             model (str): The model to use. Some of the available models are:
+
+                 - 'cc.en.300.bin': The English model trained on Common Crawl (300 dimensions)
+                 - 'cc.hi.300.bin': The Hindi model trained on Common Crawl (300 dimensions)
+                 - 'cc.fr.300.bin': The French model trained on Common Crawl (300 dimensions)
+                 - 'cc.yi.300.bin': The Yiddish model trained on Common Crawl (300 dimensions)
+                 - ...
+                 - 'wiki.en': The English model trained on Wikipedia (300 dimensions)
+                 - 'wiki.simple': The Simple English model trained on Wikipedia (300 dimensions)
+                 - 'wiki.ar': The Arabic model trained on Wikipedia (300 dimensions)
+                 - 'wiki.bg': The Bulgarian model trained on Wikipedia (300 dimensions)
+                 - 'wiki.ca': The Catalan model trained on Wikipedia (300 dimensions)
+                 - 'wiki.zh': The Chinese model trained on Wikipedia (300 dimensions)
+                 - 'wiki.sw': The Swahili model trained on Wikipedia (300 dimensions)
+                 - 'wiki.fr': The French model trained on Wikipedia (300 dimensions)
+                 - 'wiki.de': The German model trained on Wikipedia (300 dimensions)
+                 - 'wiki.es': The Spanish model trained on Wikipedia (300 dimensions)
+                 - 'wiki.it': The Italian model trained on Wikipedia (300 dimensions)
+                 - 'wiki.pt': The Portuguese model trained on Wikipedia (300 dimensions)
+                 - 'wiki.ru': The Russian model trained on Wikipedia (300 dimensions)
+                 - 'wiki.tr': The Turkish model trained on Wikipedia (300 dimensions)
+                 - 'wiki.uk': The Ukrainian model trained on Wikipedia (300 dimensions)
+                 - 'wiki.vi': The Vietnamese model trained on Wikipedia (300 dimensions)
+                 - 'wiki.id': The Indonesian model trained on Wikipedia (300 dimensions)
+                 - 'wiki.ja': The Japanese model trained on Wikipedia (300 dimensions)
+                 - ...
+
+             force_download (bool): Whether to force the download of the model. Default: True.
+             dir (str): The directory to save and load the model.
+
+         Returns:
+             None
+
+         Raises:
+             ValueError: If the given model is not available.
+
+         .. attention::
+
+             If you make use of this code, please cite the following papers (depending on the model you use):
+
+             .. code-block:: latex
+
+                 @inproceedings{mikolov2018advances,
+                     title={Advances in Pre-Training Distributed Word Representations},
+                     author={Mikolov, Tomas and Grave, Edouard and Bojanowski, Piotr and Puhrsch, Christian and Joulin, Armand},
+                     booktitle={Proceedings of the International Conference on Language Resources and Evaluation (LREC 2018)},
+                     year={2018}
+                 }
+
+             .. code-block:: latex
+
+                 @article{bojanowski2017enriching,
+                     title={Enriching Word Vectors with Subword Information},
+                     author={Bojanowski, Piotr and Grave, Edouard and Joulin, Armand and Mikolov, Tomas},
+                     journal={Transactions of the Association for Computational Linguistics},
+                     volume={5},
+                     year={2017},
+                     issn={2307-387X},
+                     pages={135--146}
+                 }
+
+             .. code-block:: latex
+
+                 @article{joulin2016fasttext,
+                     title={FastText.zip: Compressing text classification models},
+                     author={Joulin, Armand and Grave, Edouard and Bojanowski, Piotr and Douze, Matthijs and J{\'e}gou, H{\'e}rve and Mikolov, Tomas},
+                     journal={arXiv preprint arXiv:1612.03651},
+                     year={2016}
+                 }
+
+         .. note::
+
+             * The models are downloaded from https://fasttext.cc/docs/en/english-vectors.html.
+             * The models are saved in the torch hub directory, if no directory is specified.
+             *
+         """
+
+         # Set the attributes
+         self.model = model
+         self.dir = dir
+         self.force_download = force_download
+
+         # Set the path
+         if self.dir is None:
+             # For convenience, we save the model in the torch hub directory
+             self.dir = f'{torch.hub.get_dir()}/{self.model}'
+
+         # Remove the trailing slash
+         if self.dir[-1] == '/':
+             self.dir = self.dir[:-1]
+
+         # Download the embeddings if they do not exist or if force_download is True
+         if not os.path.exists(self.dir) or self.force_download:
+             # Create the directory if it does not exist
+             if not os.path.exists(self.dir):
+                 os.system(f'mkdir {self.dir}')
+
+             # Download using wget
+             if 'wiki' in model:
+                 # https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.en.zip
+                 os.system(f'wget https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/{model}.zip -P {self.dir}')
+                 os.system(f'unzip {self.dir}/{model}.zip -d {self.dir}')
+                 os.system(f'rm {self.dir}/{model}.zip')
+             else:
+                 # https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz
+                 os.system(f'wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/{model}.gz -P {self.dir}')
+                 os.system(f'gunzip {self.dir}/{model}.gz')
+                 os.system(f'rm -f {self.dir}/{model}.gz')
+
+             # Load the model
+             ft = fasttext.load_model(f'{self.dir}/{model}')
+
+             # Get the vocabulary
+             words = ft.get_words()
+
+             # Convert the embeddings to a tensor
+             embeddings = torch.tensor(ft.get_input_matrix())
+
+             # Save the words and the embeddings as a .pt file
+             torch.save(words, f'{self.dir}/{model}.words.pt')
+             torch.save(embeddings, f'{self.dir}/{model}.embeddings.pt')
+
+             # Delete the model
+             del ft
+
+         else:
+             try:
+                 # Load the words and the embeddings
+                 words = torch.load(f'{self.dir}/{model}.words.pt')
+                 embeddings = torch.load(f'{self.dir}/{model}.embeddings.pt')
+             except:
+                 raise Exception(f'Please install the {model} model first by setting force_download to True.')
+
+         # Create the vocabulary dictionary to be fed to the embedding layer
+         self.vocabulary_dict = {word: i for i, word in enumerate(words)}
+
+         # Create the embedding layer
+         self.embedding_layer = torch.nn.Embedding.from_pretrained(
+             embeddings=embeddings,
+             freeze=True,
+         )
+
+     def __call__(self,
+         tokens: Union[List[str], str],
+     ) -> Tensor:
+         """
+         This function returns the embeddings of the given tokens.
+
+         Arguments:
+             tokens (Union[List[str], str]): The tokens to embed.
+
+         Returns:
+             Tensor: The embeddings of the given tokens.
+         """
+         return super().__call__(tokens)
+
+
+     def get_embedding(self,
+         tokens: Union[List[str], str]
+     ) -> Tensor:
+         """
+         This function returns the embeddings of the given tokens.
+
+         Arguments:
+             tokens (Union[List[str], str]): The tokens to embed.
+
+         Returns:
+             Tensor: The embeddings of the given tokens.
+         """
+         return self.__call__(tokens)
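
As a rough usage sketch of the GloVe class above (module path assumed from the file list; on the first call the class downloads the ~822 MB glove.6B archive into the torch hub directory and converts it to .pt files, so it is slow once and cached afterwards):

    # Usage sketch; the import path is assumed from the file list above.
    from nltkor.misc.string2string_word_embeddings import GloVeEmbeddings

    glove = GloVeEmbeddings(model='glove.6B.200d', dim=200)
    vectors = glove.get_embedding("the quick brown fox")  # whitespace-tokenized lookup
    print(vectors.shape)  # expected: torch.Size([4, 200])

Note that tokens missing from the GloVe vocabulary raise a KeyError, since the lookup goes through vocabulary_dict without a fallback.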
nltkor/search/__init__.py
@@ -0,0 +1,10 @@
+ # The following trick allows us to import the classes directly from the search module:
+ from .classical import (
+     SearchAlgorithm,
+     NaiveSearch,
+     RabinKarpSearch,
+     KMPSearch,
+     BoyerMooreSearch,
+ )
+ from .faiss_search import FaissSearch
+ from .kobert_tokenizer import KoBERTTokenizer
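
The effect of this __init__ is that the search and tokenizer classes can be imported flat from nltkor.search rather than from their defining modules; their constructor arguments live in classical.py, faiss_search.py, and kobert_tokenizer.py and are not shown in this diff:

    # These flat imports are exactly what the re-exports above enable.
    from nltkor.search import (
        SearchAlgorithm,
        NaiveSearch,
        RabinKarpSearch,
        KMPSearch,
        BoyerMooreSearch,
        FaissSearch,
        KoBERTTokenizer,
    )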