nltkor-1.2.14-cp311-cp311-macosx_13_0_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nltkor/Kor_char.py +193 -0
- nltkor/__init__.py +16 -0
- nltkor/alignment/__init__.py +1315 -0
- nltkor/cider/__init__.py +2 -0
- nltkor/cider/cider.py +55 -0
- nltkor/cider/cider_scorer.py +207 -0
- nltkor/distance/__init__.py +441 -0
- nltkor/distance/wasserstein.py +126 -0
- nltkor/etc.py +22 -0
- nltkor/lazyimport.py +144 -0
- nltkor/make_requirement.py +11 -0
- nltkor/metrics/__init__.py +63 -0
- nltkor/metrics/bartscore.py +301 -0
- nltkor/metrics/bertscore.py +331 -0
- nltkor/metrics/bleu_tensor.py +20 -0
- nltkor/metrics/classical.py +847 -0
- nltkor/metrics/entment.py +24 -0
- nltkor/metrics/eval.py +517 -0
- nltkor/metrics/mauve.py +273 -0
- nltkor/metrics/mauve_utils.py +131 -0
- nltkor/misc/__init__.py +11 -0
- nltkor/misc/string2string_basic_functions.py +59 -0
- nltkor/misc/string2string_default_tokenizer.py +83 -0
- nltkor/misc/string2string_hash_functions.py +159 -0
- nltkor/misc/string2string_word_embeddings.py +503 -0
- nltkor/search/__init__.py +10 -0
- nltkor/search/classical.py +569 -0
- nltkor/search/faiss_search.py +787 -0
- nltkor/search/kobert_tokenizer.py +181 -0
- nltkor/sejong/__init__.py +3 -0
- nltkor/sejong/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/sejong/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/sejong/__pycache__/sejong_download.cpython-38.pyc +0 -0
- nltkor/sejong/__pycache__/sejong_download.cpython-39.pyc +0 -0
- nltkor/sejong/__pycache__/ssem.cpython-38.pyc +0 -0
- nltkor/sejong/__pycache__/ssem.cpython-39.pyc +0 -0
- nltkor/sejong/ch.py +12 -0
- nltkor/sejong/dict_semClassNum.txt +491 -0
- nltkor/sejong/layer.txt +630 -0
- nltkor/sejong/sejong_download.py +87 -0
- nltkor/sejong/ssem.py +684 -0
- nltkor/similarity/__init__.py +3 -0
- nltkor/similarity/bartscore____.py +337 -0
- nltkor/similarity/bertscore____.py +339 -0
- nltkor/similarity/classical.py +245 -0
- nltkor/similarity/cosine_similarity.py +175 -0
- nltkor/tag/__init__.py +71 -0
- nltkor/tag/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/__pycache__/espresso_tag.cpython-38.pyc +0 -0
- nltkor/tag/__pycache__/espresso_tag.cpython-39.pyc +0 -0
- nltkor/tag/espresso_tag.py +220 -0
- nltkor/tag/libs/__init__.py +10 -0
- nltkor/tag/libs/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/attributes.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/attributes.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/config.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/config.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/metadata.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/metadata.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/taggers.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/taggers.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/utils.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/utils.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/word_dictionary.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/word_dictionary.cpython-39.pyc +0 -0
- nltkor/tag/libs/arguments.py +280 -0
- nltkor/tag/libs/attributes.py +231 -0
- nltkor/tag/libs/config.py +159 -0
- nltkor/tag/libs/metadata.py +129 -0
- nltkor/tag/libs/ner/__init__.py +2 -0
- nltkor/tag/libs/ner/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/ner/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/ner/macmorphoreader.py +7 -0
- nltkor/tag/libs/ner/ner_reader.py +92 -0
- nltkor/tag/libs/network.c +72325 -0
- nltkor/tag/libs/network.cpython-311-darwin.so +0 -0
- nltkor/tag/libs/network.pyx +878 -0
- nltkor/tag/libs/networkconv.pyx +1028 -0
- nltkor/tag/libs/networkdependencyconv.pyx +451 -0
- nltkor/tag/libs/parse/__init__.py +1 -0
- nltkor/tag/libs/parse/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/parse/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/parse/parse_reader.py +283 -0
- nltkor/tag/libs/pos/__init__.py +2 -0
- nltkor/tag/libs/pos/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/pos/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/pos/macmorphoreader.py +7 -0
- nltkor/tag/libs/pos/pos_reader.py +97 -0
- nltkor/tag/libs/reader.py +485 -0
- nltkor/tag/libs/srl/__init__.py +3 -0
- nltkor/tag/libs/srl/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/train_srl.cpython-38.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/train_srl.cpython-39.pyc +0 -0
- nltkor/tag/libs/srl/__srl_reader_.py +535 -0
- nltkor/tag/libs/srl/srl_reader.py +436 -0
- nltkor/tag/libs/srl/train_srl.py +87 -0
- nltkor/tag/libs/taggers.py +926 -0
- nltkor/tag/libs/utils.py +384 -0
- nltkor/tag/libs/word_dictionary.py +239 -0
- nltkor/tag/libs/wsd/__init__.py +2 -0
- nltkor/tag/libs/wsd/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/wsd/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/wsd/macmorphoreader.py +7 -0
- nltkor/tag/libs/wsd/wsd_reader.py +93 -0
- nltkor/tokenize/__init__.py +62 -0
- nltkor/tokenize/ko_tokenize.py +115 -0
- nltkor/trans.py +121 -0
- nltkor-1.2.14.dist-info/LICENSE.txt +1093 -0
- nltkor-1.2.14.dist-info/METADATA +41 -0
- nltkor-1.2.14.dist-info/RECORD +127 -0
- nltkor-1.2.14.dist-info/WHEEL +5 -0
- nltkor-1.2.14.dist-info/top_level.txt +1 -0
nltkor/misc/string2string_hash_functions.py
@@ -0,0 +1,159 @@

```python
"""
string2string code
src = https://github.com/stanfordnlp/string2string


MIT License

Copyright (c) 2023 Mirac Suzgun

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.


"""

"""
This module contains the hash functions used in search algorithms.

A hash function takes a string (or other object) and returns a number, called
the hash value, hash code, or simply the hash. The hash value is used to
determine the location of the string in a hash table.
- The hash function must be deterministic: the same string always produces the same hash value.
- If two strings produce the same hash value, we say that the hash values collide.
- The hash function must also be fast, so it is important to keep the number of operations to a minimum.
"""

from typing import List, Union, Tuple, Optional
import numpy as np


# A parent class for all hash functions
class HashFunction:
    """
    The parent class for all hash functions.
    """
    def __init__(self):
        pass

    def compute(self,
        str1: str,
    ) -> int:
        """
        Returns the hash value of a string.

        Arguments:
            str1 (str): The string.

        Returns:
            int: The hash value of the string.
        """
        pass


# Polynomial rolling hash function class
class PolynomialRollingHash(HashFunction):
    """
    The polynomial rolling hash function.
    """

    def __init__(self,
        base: int = 10,  # 256,
        modulus: int = 101,  # 65537,
    ) -> None:
        """
        Initializes the polynomial rolling hash function.

        Arguments:
            base (int): The base to use. Default is 10.
            modulus (int): The modulus to use. Default is 101.

        Returns:
            None

        .. note::
            * Why 65537? Because it is a Fermat prime.
        """
        super().__init__()

        # Check the inputs
        assert base > 0, 'The base must be positive.'
        assert modulus > 0, 'The modulus must be positive.'

        # Set the attributes
        self.base = base
        self.modulus = modulus

        # Initialize the current hash value
        self.current_hash = 0


    def compute(self,
        str1: str,
    ) -> int:
        """
        Returns the hash value of a string.

        Arguments:
            str1 (str): The string.

        Returns:
            int: The hash value of the string.
        """
        # Compute the hash value of the string
        for char in str1:
            self.current_hash = (self.current_hash * self.base + ord(char)) % self.modulus

        # Return the hash value
        return self.current_hash


    def update(self,
        old_char: str,
        new_char: str,
        window_size: int,
    ) -> int:
        """
        Updates the hash value when the window slides by one character.

        Arguments:
            old_char (str): The character leaving the window.
            new_char (str): The character entering the window.
            window_size (int): The size of the rolling window.

        Returns:
            int: The updated hash value.
        """
        # Remove the contribution of the old character, then append the new one
        self.current_hash = (self.current_hash - ord(old_char) * (self.base ** (window_size - 1))) % self.modulus
        self.current_hash = (self.current_hash * self.base + ord(new_char)) % self.modulus

        # Return the hash value
        return self.current_hash


    def reset(self) -> None:
        """
        Resets the hash value.

        Arguments:
            None

        Returns:
            None
        """
        # Reset the current hash value
        self.current_hash = 0
```
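For context, a small usage sketch of the rolling hash above (not part of the package): it hashes a three-character window and then slides the window by one character with `update()`, which agrees with hashing the new window from scratch. Note that `compute()` accumulates into `self.current_hash`, so a `reset()` or a fresh instance is needed between independent strings.

```python
from nltkor.misc.string2string_hash_functions import PolynomialRollingHash

hasher = PolynomialRollingHash(base=10, modulus=101)

# Hash the first window "abc" of the text "abcd".
h_abc = hasher.compute("abc")

# Slide the window one character to the right: drop 'a', take in 'd'.
h_bcd_rolled = hasher.update(old_char="a", new_char="d", window_size=3)

# Hashing "bcd" from scratch with a fresh instance gives the same value.
fresh = PolynomialRollingHash(base=10, modulus=101)
assert h_bcd_rolled == fresh.compute("bcd")
```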
nltkor/misc/string2string_word_embeddings.py
@@ -0,0 +1,503 @@

```python
"""
string2string code
src = https://github.com/stanfordnlp/string2string


MIT License

Copyright (c) 2023 Mirac Suzgun

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.


"""

"""
This module implements the word embeddings classes.
"""
# from tqdm import tqdm
import numpy as np
from typing import List, Union
import os
from nltkor.make_requirement import make_requirement
try:
    import torch
    from torch import Tensor
    from torch.nn import functional as F
    import fasttext
    import fasttext.util
except ImportError:
    requirement = ['torch', 'fasttext']
    file_path = make_requirement(requirement)
    raise Exception(f"""
    Missing libraries; please install them with pip:
    \t pip install torch
    \t pip install fasttext-wheel
    Or install from the generated requirements file:
    \t pip install -r {file_path}
    """)

# for dev purposes
import sys
# sys.path.append("/Users/dowon/nltk_ko/nltk/misc")
from nltkor.misc.string2string_default_tokenizer import Tokenizer
# from string2string_default_tokenizer import Tokenizer


class NeuralEmbeddings:
    """
    An abstract base class for neural word embeddings.
    """

    def __init__(self,
        tokenizer: Tokenizer = None,
    ) -> None:
        """
        Constructor.

        Arguments:
            tokenizer (Tokenizer): The tokenizer to use.
        """
        # Set the tokenizer
        if tokenizer is None:
            self.tokenizer = Tokenizer(word_delimiter=" ")
        else:
            self.tokenizer = tokenizer


    def __call__(self,
        tokens: Union[List[str], str],
    ) -> Tensor:
        """
        Returns the embeddings of the given tokens.

        Arguments:
            tokens (Union[List[str], str]): The tokens to embed.

        Returns:
            Tensor: The embeddings of the given tokens.
        """
        # If a raw string is given, tokenize it first
        if isinstance(tokens, str):
            tokens = self.tokenizer.tokenize(tokens)

        # Embed the tokens
        return self.embedding_layer(torch.tensor([self.vocabulary_dict[token] for token in tokens]))


    def get_embedding(self,
        tokens: Union[List[str], str]
    ) -> Tensor:
        """
        Returns the embeddings of the given tokens.

        Arguments:
            tokens (Union[List[str], str]): The tokens to embed.

        Returns:
            Tensor: The embeddings of the given tokens.
        """
        return self.__call__(tokens)


# GloVe embeddings class
class GloVeEmbeddings(NeuralEmbeddings):
    """
    This class implements the GloVe word embeddings.
    """
    # Pre-trained GloVe embeddings
    # Source: https://github.com/stanfordnlp/GloVe#download-pre-trained-word-vectors
    MODEL_OPTIONS = {
        'glove.6B.200d': {
            'Description': 'Wikipedia 2014 + Gigaword 5 (6B tokens, 400K vocab, uncased, 300d vectors, 822 MB download)',
            'URL': 'https://huggingface.co/stanfordnlp/glove/resolve/main/glove.6B.zip',
        },
        'glove.twitter.27B': {
            'Description': 'Twitter (27B tokens, 1.2M vocab, uncased, 200d vectors, 1.42 GB download)',
            'URL': 'https://huggingface.co/stanfordnlp/glove/resolve/main/glove.twitter.27B.zip',
        },
        'glove.42B.300d': {
            'Description': 'Common Crawl (42B tokens, 1.9M vocab, uncased, 300d vectors, 1.75 GB download)',
            'URL': 'https://huggingface.co/stanfordnlp/glove/resolve/main/glove.42B.300d.zip',
        },
        'glove.840B.300d': {
            'Description': 'Common Crawl (840B tokens, 2.2M vocab, cased, 300d vectors, 2.03 GB download)',
            'URL': 'https://huggingface.co/stanfordnlp/glove/resolve/main/glove.840B.300d.zip',
        },
    }

    def __init__(self,
        model: str = 'glove.6B.200d',
        dim: int = 50,
        force_download: bool = False,
        dir = None,
        tokenizer: Tokenizer = None,
    ) -> None:
        r"""
        Initializes the GloVe embeddings class.

        Arguments:
            model (str): The model to use. Default is 'glove.6B.200d'. (Options are: 'glove.6B.200d', 'glove.twitter.27B', 'glove.42B.300d', 'glove.840B.300d'.)
            dim (int): The dimension of the embeddings. Default is 50.
            force_download (bool): Whether to force download the model. Default is False.
            dir (str): The directory to save or load the model. Default is None.
            tokenizer (Tokenizer): The tokenizer to use. Default is None.

        Returns:
            None

        Raises:
            ValueError: If the model is not in MODEL_OPTIONS ['glove.6B.200d', 'glove.twitter.27B', 'glove.42B.300d', 'glove.840B.300d'].


        .. attention::

            If you use this class, please make sure to cite the following paper:

            .. code-block:: latex

                @inproceedings{pennington2014glove,
                    title={Glove: Global vectors for word representation},
                    author={Pennington, Jeffrey and Socher, Richard and Manning, Christopher D},
                    booktitle={Proceedings of the 2014 conference on empirical methods in natural language processing (EMNLP)},
                    pages={1532--1543},
                    year={2014}
                }


        .. note::
            * If dir is None, the model is saved in the torch hub directory.
            * If the model has not been downloaded yet, it is downloaded automatically.
        """
        # Check the model
        if model not in self.MODEL_OPTIONS:
            raise ValueError(f'Invalid model: {model}.')

        # Set the attributes
        self.model = model
        self.force_download = force_download
        self.dir = dir
        self.token_size = self.model.split('.')[1]
        self.dim = dim

        # Set the path
        if self.dir is None:
            self.dir = f'{torch.hub.get_dir()}/{self.model}'

        # Remove the trailing slash
        if self.dir[-1] == '/':
            self.dir = self.dir[:-1]

        # Download the embeddings if they do not exist or if force_download is True
        if not os.path.exists(self.dir) or self.force_download:

            # Create the directory if it does not exist
            if not os.path.exists(self.dir):
                os.system(f'mkdir {self.dir}')

            # Download the GloVe .zip file
            print(f'Downloading the {self.model} zip file...')
            torch.hub.download_url_to_file(
                url=self.MODEL_OPTIONS[self.model]['URL'],
                dst=f'{self.dir}/glove.zip',
            )

            # Unzip the GloVe .txt files
            print(f'Unzipping the {self.model} zip file...')
            os.system(f'unzip {self.dir}/glove.zip -d {self.dir}')

            # Delete the zip file
            os.system(f'rm {self.dir}/glove.zip')

            # Process each GloVe .txt file and save it as a .pt file
            for file in os.listdir(self.dir):
                # Extract the words and the embeddings from the GloVe .txt file
                # and save them as .pt files.
                #
                # Example of a GloVe .txt file:
                # the 0.418 0.24968 -0.41242 0.1217 ...
                # and 0.26818 0.14346 -0.27877 0.016257 ...

                print(f'Processing {file}...')

                # Load the file
                with open(f'{self.dir}/{file}', 'r') as f:
                    lines = f.readlines()

                # Extract the dimension of the embeddings from the file name (e.g. glove.6B.200d.txt -> 200)
                file_embed_dim = file.split('.')[2][:-1]

                # Extract the words and the embeddings
                words = []
                embeddings = np.zeros((len(lines), int(file_embed_dim)))
                for i, line in enumerate(lines):
                    line = line.split(' ')
                    words.append(line[0])
                    embeddings[i] = np.array([float(x) for x in line[1:]])

                # Convert the embeddings to a tensor
                embeddings = torch.from_numpy(embeddings)

                # Save the words and the embeddings as .pt files
                torch.save(words, f'{self.dir}/{file[:-4]}.words.pt')
                torch.save(embeddings, f'{self.dir}/{file[:-4]}.embeddings.pt')

            # Delete the GloVe .txt files
            os.system(f'rm -r {self.dir}/*.txt')

            # Load the weights and the vocabulary
            weights = torch.load(f'{self.dir}/glove.{self.token_size}.{self.dim}d.embeddings.pt')
            vocabulary = torch.load(f'{self.dir}/glove.{self.token_size}.{self.dim}d.words.pt')

        # If the embeddings already exist
        else:
            # Load the weights and the vocabulary
            weights = torch.load(f'{self.dir}/glove.{self.token_size}.{self.dim}d.embeddings.pt')
            vocabulary = torch.load(f'{self.dir}/glove.{self.token_size}.{self.dim}d.words.pt')

        # Create the vocabulary dictionary to be fed to the embedding layer
        self.vocabulary_dict = {word: i for i, word in enumerate(vocabulary)}

        # Create the embedding layer
        self.embedding_layer = torch.nn.Embedding.from_pretrained(
            embeddings=weights,
            freeze=True,
        )

        # Set the tokenizer
        if tokenizer is None:
            self.tokenizer = Tokenizer()
        else:
            self.tokenizer = tokenizer


    def __call__(self,
        tokens: Union[List[str], str],
    ) -> Tensor:
        """
        Returns the embeddings of the given tokens.

        Arguments:
            tokens (Union[List[str], str]): The tokens to embed.

        Returns:
            Tensor: The embeddings of the given tokens.
        """
        return super().__call__(tokens)


    def get_embedding(self,
        tokens: Union[List[str], str]
    ) -> Tensor:
        r"""
        Returns the embeddings of the given tokens.

        Arguments:
            tokens (Union[List[str], str]): The tokens to embed.

        Returns:
            Tensor: The embeddings of the given tokens.
        """
        return self.__call__(tokens)


# FastTextEmbeddings class
class FastTextEmbeddings(NeuralEmbeddings):
    """
    This class implements the FastText embeddings.
    """
    def __init__(self,
        model: str = 'cc.en.300.bin',
        force_download: bool = True,
        dir: str = None,
    ) -> None:
        r"""
        Initializes the FastTextEmbeddings class.

        Arguments:
            model (str): The model to use. Some of the available models are:

                - 'cc.en.300.bin': The English model trained on Common Crawl (300 dimensions)
                - 'cc.hi.300.bin': The Hindi model trained on Common Crawl (300 dimensions)
                - 'cc.fr.300.bin': The French model trained on Common Crawl (300 dimensions)
                - 'cc.yi.300.bin': The Yiddish model trained on Common Crawl (300 dimensions)
                - ...
                - 'wiki.en': The English model trained on Wikipedia (300 dimensions)
                - 'wiki.simple': The Simple English model trained on Wikipedia (300 dimensions)
                - 'wiki.ar': The Arabic model trained on Wikipedia (300 dimensions)
                - 'wiki.bg': The Bulgarian model trained on Wikipedia (300 dimensions)
                - 'wiki.ca': The Catalan model trained on Wikipedia (300 dimensions)
                - 'wiki.zh': The Chinese model trained on Wikipedia (300 dimensions)
                - 'wiki.sw': The Swahili model trained on Wikipedia (300 dimensions)
                - 'wiki.fr': The French model trained on Wikipedia (300 dimensions)
                - 'wiki.de': The German model trained on Wikipedia (300 dimensions)
                - 'wiki.es': The Spanish model trained on Wikipedia (300 dimensions)
                - 'wiki.it': The Italian model trained on Wikipedia (300 dimensions)
                - 'wiki.pt': The Portuguese model trained on Wikipedia (300 dimensions)
                - 'wiki.ru': The Russian model trained on Wikipedia (300 dimensions)
                - 'wiki.tr': The Turkish model trained on Wikipedia (300 dimensions)
                - 'wiki.uk': The Ukrainian model trained on Wikipedia (300 dimensions)
                - 'wiki.vi': The Vietnamese model trained on Wikipedia (300 dimensions)
                - 'wiki.id': The Indonesian model trained on Wikipedia (300 dimensions)
                - 'wiki.ja': The Japanese model trained on Wikipedia (300 dimensions)
                - ...

            force_download (bool): Whether to force the download of the model. Default: True.
            dir (str): The directory to save and load the model.

        Returns:
            None

        Raises:
            ValueError: If the given model is not available.

        .. attention::

            If you make use of this code, please cite the following papers (depending on the model you use):

            .. code-block:: latex

                @inproceedings{mikolov2018advances,
                    title={Advances in Pre-Training Distributed Word Representations},
                    author={Mikolov, Tomas and Grave, Edouard and Bojanowski, Piotr and Puhrsch, Christian and Joulin, Armand},
                    booktitle={Proceedings of the International Conference on Language Resources and Evaluation (LREC 2018)},
                    year={2018}
                }

            .. code-block:: latex

                @article{bojanowski2017enriching,
                    title={Enriching Word Vectors with Subword Information},
                    author={Bojanowski, Piotr and Grave, Edouard and Joulin, Armand and Mikolov, Tomas},
                    journal={Transactions of the Association for Computational Linguistics},
                    volume={5},
                    year={2017},
                    issn={2307-387X},
                    pages={135--146}
                }

            .. code-block:: latex

                @article{joulin2016fasttext,
                    title={FastText.zip: Compressing text classification models},
                    author={Joulin, Armand and Grave, Edouard and Bojanowski, Piotr and Douze, Matthijs and J{\'e}gou, H{\'e}rve and Mikolov, Tomas},
                    journal={arXiv preprint arXiv:1612.03651},
                    year={2016}
                }

        .. note::

            * The models are downloaded from https://fasttext.cc/docs/en/english-vectors.html.
            * The models are saved in the torch hub directory, if no directory is specified.
        """

        # Set the attributes
        self.model = model
        self.dir = dir
        self.force_download = force_download

        # Set the path
        if self.dir is None:
            # For convenience, we save the model in the torch hub directory
            self.dir = f'{torch.hub.get_dir()}/{self.model}'

        # Remove the trailing slash
        if self.dir[-1] == '/':
            self.dir = self.dir[:-1]

        # Download the embeddings if they do not exist or if force_download is True
        if not os.path.exists(self.dir) or self.force_download:
            # Create the directory if it does not exist
            if not os.path.exists(self.dir):
                os.system(f'mkdir {self.dir}')

            # Download using wget
            if 'wiki' in model:
                # e.g. https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.en.zip
                os.system(f'wget https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/{model}.zip -P {self.dir}')
                os.system(f'unzip {self.dir}/{model}.zip -d {self.dir}')
                os.system(f'rm {self.dir}/{model}.zip')
            else:
                # e.g. https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz
                os.system(f'wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/{model}.gz -P {self.dir}')
                os.system(f'gunzip {self.dir}/{model}.gz')
                os.system(f'rm -f {self.dir}/{model}.gz')

            # Load the model
            ft = fasttext.load_model(f'{self.dir}/{model}')

            # Get the vocabulary
            words = ft.get_words()

            # Convert the embeddings to a tensor
            embeddings = torch.tensor(ft.get_input_matrix())

            # Save the words and the embeddings as .pt files
            torch.save(words, f'{self.dir}/{model}.words.pt')
            torch.save(embeddings, f'{self.dir}/{model}.embeddings.pt')

            # Delete the model
            del ft

        else:
            try:
                # Load the words and the embeddings
                words = torch.load(f'{self.dir}/{model}.words.pt')
                embeddings = torch.load(f'{self.dir}/{model}.embeddings.pt')
            except Exception:
                raise Exception(f'Please install the {model} model first by setting force_download to True.')

        # Create the vocabulary dictionary to be fed to the embedding layer
        self.vocabulary_dict = {word: i for i, word in enumerate(words)}

        # Create the embedding layer
        self.embedding_layer = torch.nn.Embedding.from_pretrained(
            embeddings=embeddings,
            freeze=True,
        )

    def __call__(self,
        tokens: Union[List[str], str],
    ) -> Tensor:
        """
        Returns the embeddings of the given tokens.

        Arguments:
            tokens (Union[List[str], str]): The tokens to embed.

        Returns:
            Tensor: The embeddings of the given tokens.
        """
        return super().__call__(tokens)


    def get_embedding(self,
        tokens: Union[List[str], str]
    ) -> Tensor:
        """
        Returns the embeddings of the given tokens.

        Arguments:
            tokens (Union[List[str], str]): The tokens to embed.

        Returns:
            Tensor: The embeddings of the given tokens.
        """
        return self.__call__(tokens)
```
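As a usage sketch for the embeddings classes above (illustrative only: the first call downloads the full GloVe 6B archive, roughly 800 MB, converts it to `.pt` tensors, and caches them under the torch hub directory; it also assumes the default `Tokenizer` splits on whitespace):

```python
from nltkor.misc.string2string_word_embeddings import GloVeEmbeddings

# 200-dimensional vectors from the glove.6B package; downloaded and converted on first use.
glove = GloVeEmbeddings(model='glove.6B.200d', dim=200)

# A plain string is split by the default tokenizer; a list of tokens is used as-is.
vectors = glove.get_embedding('the quick brown fox')
print(vectors.shape)  # torch.Size([4, 200])
```

Tokens missing from the GloVe vocabulary raise a `KeyError` in the lookup, so callers are expected to handle out-of-vocabulary words themselves; `FastTextEmbeddings` exposes the same `__call__`/`get_embedding` interface.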
nltkor/search/__init__.py
@@ -0,0 +1,10 @@

```python
# The following trick allows us to import the classes directly from the search module:
from .classical import (
    SearchAlgorithm,
    NaiveSearch,
    RabinKarpSearch,
    KMPSearch,
    BoyerMooreSearch,
)
from .faiss_search import FaissSearch
from .kobert_tokenizer import KoBERTTokenizer
```