nltkor-1.2.14-cp311-cp311-macosx_13_0_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nltkor/Kor_char.py +193 -0
- nltkor/__init__.py +16 -0
- nltkor/alignment/__init__.py +1315 -0
- nltkor/cider/__init__.py +2 -0
- nltkor/cider/cider.py +55 -0
- nltkor/cider/cider_scorer.py +207 -0
- nltkor/distance/__init__.py +441 -0
- nltkor/distance/wasserstein.py +126 -0
- nltkor/etc.py +22 -0
- nltkor/lazyimport.py +144 -0
- nltkor/make_requirement.py +11 -0
- nltkor/metrics/__init__.py +63 -0
- nltkor/metrics/bartscore.py +301 -0
- nltkor/metrics/bertscore.py +331 -0
- nltkor/metrics/bleu_tensor.py +20 -0
- nltkor/metrics/classical.py +847 -0
- nltkor/metrics/entment.py +24 -0
- nltkor/metrics/eval.py +517 -0
- nltkor/metrics/mauve.py +273 -0
- nltkor/metrics/mauve_utils.py +131 -0
- nltkor/misc/__init__.py +11 -0
- nltkor/misc/string2string_basic_functions.py +59 -0
- nltkor/misc/string2string_default_tokenizer.py +83 -0
- nltkor/misc/string2string_hash_functions.py +159 -0
- nltkor/misc/string2string_word_embeddings.py +503 -0
- nltkor/search/__init__.py +10 -0
- nltkor/search/classical.py +569 -0
- nltkor/search/faiss_search.py +787 -0
- nltkor/search/kobert_tokenizer.py +181 -0
- nltkor/sejong/__init__.py +3 -0
- nltkor/sejong/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/sejong/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/sejong/__pycache__/sejong_download.cpython-38.pyc +0 -0
- nltkor/sejong/__pycache__/sejong_download.cpython-39.pyc +0 -0
- nltkor/sejong/__pycache__/ssem.cpython-38.pyc +0 -0
- nltkor/sejong/__pycache__/ssem.cpython-39.pyc +0 -0
- nltkor/sejong/ch.py +12 -0
- nltkor/sejong/dict_semClassNum.txt +491 -0
- nltkor/sejong/layer.txt +630 -0
- nltkor/sejong/sejong_download.py +87 -0
- nltkor/sejong/ssem.py +684 -0
- nltkor/similarity/__init__.py +3 -0
- nltkor/similarity/bartscore____.py +337 -0
- nltkor/similarity/bertscore____.py +339 -0
- nltkor/similarity/classical.py +245 -0
- nltkor/similarity/cosine_similarity.py +175 -0
- nltkor/tag/__init__.py +71 -0
- nltkor/tag/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/__pycache__/espresso_tag.cpython-38.pyc +0 -0
- nltkor/tag/__pycache__/espresso_tag.cpython-39.pyc +0 -0
- nltkor/tag/espresso_tag.py +220 -0
- nltkor/tag/libs/__init__.py +10 -0
- nltkor/tag/libs/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/attributes.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/attributes.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/config.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/config.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/metadata.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/metadata.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/taggers.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/taggers.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/utils.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/utils.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/word_dictionary.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/word_dictionary.cpython-39.pyc +0 -0
- nltkor/tag/libs/arguments.py +280 -0
- nltkor/tag/libs/attributes.py +231 -0
- nltkor/tag/libs/config.py +159 -0
- nltkor/tag/libs/metadata.py +129 -0
- nltkor/tag/libs/ner/__init__.py +2 -0
- nltkor/tag/libs/ner/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/ner/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/ner/macmorphoreader.py +7 -0
- nltkor/tag/libs/ner/ner_reader.py +92 -0
- nltkor/tag/libs/network.c +72325 -0
- nltkor/tag/libs/network.cpython-311-darwin.so +0 -0
- nltkor/tag/libs/network.pyx +878 -0
- nltkor/tag/libs/networkconv.pyx +1028 -0
- nltkor/tag/libs/networkdependencyconv.pyx +451 -0
- nltkor/tag/libs/parse/__init__.py +1 -0
- nltkor/tag/libs/parse/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/parse/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/parse/parse_reader.py +283 -0
- nltkor/tag/libs/pos/__init__.py +2 -0
- nltkor/tag/libs/pos/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/pos/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/pos/macmorphoreader.py +7 -0
- nltkor/tag/libs/pos/pos_reader.py +97 -0
- nltkor/tag/libs/reader.py +485 -0
- nltkor/tag/libs/srl/__init__.py +3 -0
- nltkor/tag/libs/srl/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/train_srl.cpython-38.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/train_srl.cpython-39.pyc +0 -0
- nltkor/tag/libs/srl/__srl_reader_.py +535 -0
- nltkor/tag/libs/srl/srl_reader.py +436 -0
- nltkor/tag/libs/srl/train_srl.py +87 -0
- nltkor/tag/libs/taggers.py +926 -0
- nltkor/tag/libs/utils.py +384 -0
- nltkor/tag/libs/word_dictionary.py +239 -0
- nltkor/tag/libs/wsd/__init__.py +2 -0
- nltkor/tag/libs/wsd/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/wsd/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/wsd/macmorphoreader.py +7 -0
- nltkor/tag/libs/wsd/wsd_reader.py +93 -0
- nltkor/tokenize/__init__.py +62 -0
- nltkor/tokenize/ko_tokenize.py +115 -0
- nltkor/trans.py +121 -0
- nltkor-1.2.14.dist-info/LICENSE.txt +1093 -0
- nltkor-1.2.14.dist-info/METADATA +41 -0
- nltkor-1.2.14.dist-info/RECORD +127 -0
- nltkor-1.2.14.dist-info/WHEEL +5 -0
- nltkor-1.2.14.dist-info/top_level.txt +1 -0
nltkor/similarity/classical.py
ADDED
@@ -0,0 +1,245 @@
"""
string2string similarity
src = https://github.com/stanfordnlp/string2string


MIT License

Copyright (c) 2023 Mirac Suzgun

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""

"""
This module contains the classes for the similarity metrics and functions.
"""


from typing import List, Union, Tuple, Optional
import numpy as np

# Import the LongestCommonSubsequence class

# for dev purposes
import sys
# sys.path.append("/Users/dowon/nltk_ko/nltk/metrics")
from nltkor.alignment import LongestCommonSubsequence, LongestCommonSubstring
# from alignment import LongestCommonSubsequence, LongestCommonSubstring


# Longest Common Subsequence based similarity class
class LCSubsequenceSimilarity(LongestCommonSubsequence):
    """
    This class contains the Longest Common Subsequence similarity metric.

    This class inherits from the LongestCommonSubsequence class.
    """

    def __init__(self):
        super().__init__()

    def compute(self,
        str1: Union[str, List[str]],
        str2: Union[str, List[str]],
        denominator: str = 'max',
    ) -> float:
        """
        Returns the LCS-similarity between two strings.

        Arguments:
            str1 (Union[str, List[str]]): The first string or list of strings.
            str2 (Union[str, List[str]]): The second string or list of strings.
            denominator (str): The denominator to use. Options are 'max' and 'sum'. Default is 'max'.

        Returns:
            float: The similarity between the two strings.

        Raises:
            ValueError: If the denominator is invalid.
        """
        # Get the numerator
        numerator, _ = super().compute(str1, str2)

        if denominator == 'max':
            return numerator / max(len(str1), len(str2))
        elif denominator == 'sum':
            return 2. * numerator / (len(str1) + len(str2))
        else:
            raise ValueError('Invalid denominator.')


# Longest Common Substring based similarity class
class LCSubstringSimilarity(LongestCommonSubstring):
    """
    This class contains the Longest Common Substring similarity metric.

    This class inherits from the LongestCommonSubstring class.
    """

    def __init__(self):
        super().__init__()

    def compute(self,
        str1: Union[str, List[str]],
        str2: Union[str, List[str]],
        denominator: str = 'max',
    ) -> float:
        """
        Returns the LCS-similarity between two strings.

        Arguments:
            str1 (Union[str, List[str]]): The first string or list of strings.
            str2 (Union[str, List[str]]): The second string or list of strings.
            denominator (str): The denominator to use. Options are 'max' and 'sum'. Default is 'max'.

        Returns:
            float: The similarity between the two strings.

        Raises:
            ValueError: If the denominator is invalid.
        """
        # Get the numerator
        numerator, _ = super().compute(str1, str2)

        if denominator == 'max':
            return numerator / max(len(str1), len(str2))
        elif denominator == 'sum':
            return 2. * numerator / (len(str1) + len(str2))
        else:
            raise ValueError('Invalid denominator.')


# Jaro similarity class
class JaroSimilarity:
    """
    This class contains the Jaro similarity metric.
    """

    def __init__(self):
        pass

    def compute(self,
        str1: Union[str, List[str]],
        str2: Union[str, List[str]],
    ) -> float:
        """
        This function returns the Jaro similarity between two strings.

        Arguments:
            str1 (Union[str, List[str]]): The first string or list of strings.
            str2 (Union[str, List[str]]): The second string or list of strings.

        Returns:
            float: The Jaro similarity between the two strings.
        """
        # Get the length of the strings
        len1 = len(str1)
        len2 = len(str2)

        # Get the maximum distance, which we denote by k
        k = max(len1, len2) // 2 - 1

        # Initialize the number of matching characters and the number of transpositions
        num_matches = 0
        num_transpositions = 0

        # Initialize the list of matching flags for the strings
        matches1 = [False] * len1
        matches2 = [False] * len2

        # Loop through the characters in the first string and find the matching characters
        for i in range(len1):
            # Get the lower and upper bounds for the search
            lower_bound = max(0, i - k)
            upper_bound = min(len2, i + k + 1)

            # Loop through the characters in the second string
            for j in range(lower_bound, upper_bound):
                # Check if the characters match
                if not matches2[j] and str1[i] == str2[j]:
                    # Increment the number of matches
                    num_matches += 1

                    # Set the matching flags
                    matches1[i] = True
                    matches2[j] = True

                    # Break out of the loop
                    break

        # Check if there are no matches
        if num_matches == 0:
            return 0.

        # Loop through again, but this time find the number of transpositions:
        # that is, the number of times where there are two matching characters
        # but there is another "matched" character in between them
        moving_index = 0
        for i in range(len1):
            # Check if the character is a match
            if matches1[i]:
                # Find the next match
                for j in range(moving_index, len2):
                    # Check if the character is a match
                    if matches2[j]:
                        # Set the moving index
                        moving_index = j + 1

                        # Check if the characters are not in the right order
                        if str1[i] != str2[j]:
                            # Increment the number of transpositions
                            num_transpositions += 1

                        # Break out of the loop
                        break

        num_transpositions = num_transpositions // 2

        # Return the Jaro similarity
        return (num_matches / len1 + num_matches / len2 + (num_matches - num_transpositions) / num_matches) / 3.0


def demo():
    """
    This function demonstrates the similarity metrics.
    """
    # Initialize the similarity metrics
    lcs_sim = LCSubsequenceSimilarity()
    lcs_sub_sim = LCSubstringSimilarity()
    jaro_sim = JaroSimilarity()

    # Initialize the strings
    str1 = '제가 나와 있는 곳은 경남 거제시 옥포동 덕포 해수욕장에 나와 있습니다.'
    str2 = '강한 바람에 간판이나 지붕이 떨어지는 등 피해가 잇따르기도 했습니다.'

    # Get the similarity metrics
    lcs_sim_score = lcs_sim.compute(str1, str2)
    lcs_sub_sim_score = lcs_sub_sim.compute(str1, str2)
    jaro_sim_score = jaro_sim.compute(str1, str2)

    # Print the results
    print('Longest Common Subsequence Similarity: {}'.format(lcs_sim_score))
    print('Longest Common Substring Similarity: {}'.format(lcs_sub_sim_score))
    print('Jaro Similarity: {}'.format(jaro_sim_score))


if __name__ == '__main__':
    demo()
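For reference, the Jaro `compute` method above implements the standard Jaro similarity. In the code's notation (m = num_matches, t = num_transpositions after the final halving, k the match window), the returned score is

.. math::
    :nowrap:

    \begin{align}
        k & = \left\lfloor \frac{\max(|s_1|, |s_2|)}{2} \right\rfloor - 1 \\
        \mathrm{Jaro}(s_1, s_2) & =
        \begin{cases}
            0 & \text{if } m = 0 \\
            \frac{1}{3}\left(\frac{m}{|s_1|} + \frac{m}{|s_2|} + \frac{m - t}{m}\right) & \text{otherwise}
        \end{cases}
    \end{align}

i.e., the average of the match ratio in each string and the fraction of matches that are not transposed, exactly as in the final return statement.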
nltkor/similarity/cosine_similarity.py
ADDED
@@ -0,0 +1,175 @@
"""
string2string similarity
src = https://github.com/stanfordnlp/string2string


MIT License

Copyright (c) 2023 Mirac Suzgun

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""

from typing import List, Union, Tuple
import torch
from torch import Tensor
from torch.nn import functional as F
import numpy as np

# for dev purposes
import sys
# sys.path.append("/Users/dowon/nltk_ko/nltk/misc")
from nltkor.misc.string2string_word_embeddings import GloVeEmbeddings
# from string2string_word_embeddings import GloVeEmbeddings


# Cosine similarity class
class CosineSimilarity:
    def __init__(self) -> None:
        r"""
        This function initializes the CosineSimilarity class.
        """
        pass

    # Compute (tensor)
    def _compute_tensor(self,
        x1: Tensor,
        x2: Tensor,
        dim: int = 1,
        eps: float = 1e-8
    ) -> Tensor:
        r"""
        Computes the cosine similarity between two tensors along a given dimension.

        Arguments:
            x1 (Tensor): First tensor.
            x2 (Tensor): Second tensor.
            dim (int): Dimension to compute cosine similarity.
            eps (float): Epsilon value.

        Returns:
            Tensor: Cosine similarity between two tensors along a given dimension.
        """
        # Make sure that x1 and x2 are float tensors
        if x1.dtype != torch.float:
            x1 = x1.float()
        if x2.dtype != torch.float:
            x2 = x2.float()
        # Compute cosine similarity between two tensors
        return F.cosine_similarity(x1, x2, dim, eps)

    # Compute (numpy)
    def _compute_numpy(self,
        x1: np.ndarray,
        x2: np.ndarray,
        dim: int = 1,
        eps: float = 1e-8
    ) -> np.ndarray:
        r"""
        Computes the cosine similarity between two numpy arrays along a given dimension.

        Arguments:
            x1 (np.ndarray): First numpy array.
            x2 (np.ndarray): Second numpy array.
            dim (int): Dimension (or axis in the numpy realm) to compute cosine similarity.
            eps (float): Epsilon value (to prevent division by zero).

        Returns:
            np.ndarray: Cosine similarity between two numpy arrays along a given dimension.
        """
        # Compute cosine similarity between two numpy arrays along a given dimension "dim"
        return np.sum(x1 * x2, axis=dim) / np.maximum(np.linalg.norm(x1, axis=dim) * np.linalg.norm(x2, axis=dim), eps)

    # Compute
    def compute(self,
        x1: Union[Tensor, np.ndarray],
        x2: Union[Tensor, np.ndarray],
        dim: int = 0,
        eps: float = 1e-8
    ) -> Union[Tensor, np.ndarray]:
        r"""
        Computes the cosine similarity between two tensors (or numpy arrays) along a given dimension.

        * For two (non-zero) vectors, :math:`x_1` and :math:`x_2`, the cosine similarity is defined as follows:

        .. math::
            :nowrap:

            \begin{align}
                \texttt{cosine-similarity}(x_1, x_2) & = \cos(\theta) \\
                & = \frac{x_1 \cdot x_2}{||x_1|| \ ||x_2||} \\
                & = \frac{\sum_{i=1}^n x_{1i} x_{2i}}{\sqrt{\sum_{i=1}^n x_{1i}^2} \sqrt{\sum_{i=1}^n x_{2i}^2}}
            \end{align}

        where :math:`\theta` denotes the angle between the vectors, :math:`\cdot` the dot product, and :math:`||\cdot||` the norm operator.

        * In practice, the cosine similarity is computed as follows:

        .. math::
            :nowrap:

            \begin{align}
                \texttt{cosine-similarity}(x_1, x_2) & = \frac{x_1 \cdot x_2}{\max(||x_1|| \ ||x_2||, \epsilon)}
            \end{align}

        where :math:`\epsilon` is a small value to avoid division by zero.

        Arguments:
            x1 (Union[Tensor, np.ndarray]): First tensor (or numpy array).
            x2 (Union[Tensor, np.ndarray]): Second tensor (or numpy array).
            dim (int): Dimension to compute cosine similarity (default: 0).
            eps (float): Epsilon value (to avoid division by zero).

        Returns:
            Union[Tensor, np.ndarray]: Cosine similarity between two tensors (or numpy arrays) along a given dimension.

        Raises:
            TypeError: If x1 and x2 are not of the same type (either tensor or numpy array).
            TypeError: If x1 and x2 are not tensors or numpy arrays.
        """
        # Check if x1 and x2 are of the same type (either tensor or numpy array)
        if type(x1) != type(x2):
            raise TypeError("x1 and x2 must be of the same type (either tensor or numpy array).")

        # If x1 and x2 are tensors
        if type(x1) == Tensor:
            # Compute cosine similarity
            return self._compute_tensor(x1, x2, dim, eps)
        # If x1 and x2 are numpy arrays
        elif type(x1) == np.ndarray:
            # Compute cosine similarity
            return self._compute_numpy(x1, x2, dim, eps)
        # If x1 and x2 are not tensors or numpy arrays
        else:
            raise TypeError("x1 and x2 must be either tensors or numpy arrays.")


def demo():
    array1 = np.array([20, 65, 1])
    array2 = np.array([98, 67, 548])

    print("demo : ", CosineSimilarity().compute(array1, array2))


if __name__ == "__main__":
    demo()
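As a quick sanity check of the numpy path, the demo vectors can be verified by hand. The following is a minimal standalone sketch (variable names are illustrative, not part of the package) that mirrors what _compute_numpy does for these 1-D inputs:

import numpy as np

x1 = np.array([20, 65, 1], dtype=float)
x2 = np.array([98, 67, 548], dtype=float)

# Dot product: 20*98 + 65*67 + 1*548 = 6863
dot = np.sum(x1 * x2)
# Product of Euclidean norms, floored at eps to guard against division by zero
norms = max(np.linalg.norm(x1) * np.linalg.norm(x2), 1e-8)
print(dot / norms)  # ~0.180, the same value demo() prints

The eps guard only matters when one of the vectors is all zeros; for nonzero inputs it is a no-op.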
nltkor/tag/__init__.py
ADDED
@@ -0,0 +1,71 @@
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Taggers
#
# Copyright (C) 2001-2020 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
#         Steven Bird <stevenbird1@gmail.com> (minor additions)
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
NLTK Taggers

This package contains classes and interfaces for part-of-speech
tagging, or simply "tagging".

A "tag" is a case-sensitive string that specifies some property of a token,
such as its part of speech. Tagged tokens are encoded as tuples
``(token, tag)``. For example, the following tagged token combines
the word ``'fly'`` with a noun part of speech tag (``'NN'``):

    >>> tagged_tok = ('fly', 'NN')

An off-the-shelf tagger is available for English. It uses the Penn Treebank tagset:

    >>> from nltk import pos_tag, word_tokenize
    >>> pos_tag(word_tokenize("John's big idea isn't all that bad."))
    [('John', 'NNP'), ("'s", 'POS'), ('big', 'JJ'), ('idea', 'NN'), ('is', 'VBZ'),
    ("n't", 'RB'), ('all', 'PDT'), ('that', 'DT'), ('bad', 'JJ'), ('.', '.')]

A Russian tagger is also available if you specify lang="rus". It uses
the Russian National Corpus tagset:

    >>> pos_tag(word_tokenize("Илья оторопел и дважды перечитал бумажку."), lang='rus')    # doctest: +SKIP
    [('Илья', 'S'), ('оторопел', 'V'), ('и', 'CONJ'), ('дважды', 'ADV'), ('перечитал', 'V'),
    ('бумажку', 'S'), ('.', 'NONLEX')]

This package defines several taggers, which take a list of tokens,
assign a tag to each one, and return the resulting list of tagged tokens.
Most of the taggers are built automatically based on a training corpus.
For example, the unigram tagger tags each word *w* by checking what
the most frequent tag for *w* was in a training corpus:

    >>> from nltk.corpus import brown
    >>> from nltk.tag import UnigramTagger
    >>> tagger = UnigramTagger(brown.tagged_sents(categories='news')[:500])
    >>> sent = ['Mitchell', 'decried', 'the', 'high', 'rate', 'of', 'unemployment']
    >>> for word, tag in tagger.tag(sent):
    ...     print(word, '->', tag)
    Mitchell -> NP
    decried -> None
    the -> AT
    high -> JJ
    rate -> NN
    of -> IN
    unemployment -> None

Note that words that the tagger has not seen during training receive a tag
of ``None``.

We evaluate a tagger on data that was not seen during training:

    >>> tagger.evaluate(brown.tagged_sents(categories='news')[500:600])
    0.73...

For more information, please consult chapter 5 of the NLTK Book.
"""


from nltkor.tag.espresso_tag import EspressoTagger
#import nltkor.tag
from nltkor.tag.libs import taggers
from .libs import PickleConverter
(The remaining four entries are binary files, presumably the compiled .pyc files under nltkor/tag/__pycache__ listed above; no textual diff is shown for them.)