nltkor 1.2.14__cp311-cp311-macosx_13_0_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127) hide show
  1. nltkor/Kor_char.py +193 -0
  2. nltkor/__init__.py +16 -0
  3. nltkor/alignment/__init__.py +1315 -0
  4. nltkor/cider/__init__.py +2 -0
  5. nltkor/cider/cider.py +55 -0
  6. nltkor/cider/cider_scorer.py +207 -0
  7. nltkor/distance/__init__.py +441 -0
  8. nltkor/distance/wasserstein.py +126 -0
  9. nltkor/etc.py +22 -0
  10. nltkor/lazyimport.py +144 -0
  11. nltkor/make_requirement.py +11 -0
  12. nltkor/metrics/__init__.py +63 -0
  13. nltkor/metrics/bartscore.py +301 -0
  14. nltkor/metrics/bertscore.py +331 -0
  15. nltkor/metrics/bleu_tensor.py +20 -0
  16. nltkor/metrics/classical.py +847 -0
  17. nltkor/metrics/entment.py +24 -0
  18. nltkor/metrics/eval.py +517 -0
  19. nltkor/metrics/mauve.py +273 -0
  20. nltkor/metrics/mauve_utils.py +131 -0
  21. nltkor/misc/__init__.py +11 -0
  22. nltkor/misc/string2string_basic_functions.py +59 -0
  23. nltkor/misc/string2string_default_tokenizer.py +83 -0
  24. nltkor/misc/string2string_hash_functions.py +159 -0
  25. nltkor/misc/string2string_word_embeddings.py +503 -0
  26. nltkor/search/__init__.py +10 -0
  27. nltkor/search/classical.py +569 -0
  28. nltkor/search/faiss_search.py +787 -0
  29. nltkor/search/kobert_tokenizer.py +181 -0
  30. nltkor/sejong/__init__.py +3 -0
  31. nltkor/sejong/__pycache__/__init__.cpython-38.pyc +0 -0
  32. nltkor/sejong/__pycache__/__init__.cpython-39.pyc +0 -0
  33. nltkor/sejong/__pycache__/sejong_download.cpython-38.pyc +0 -0
  34. nltkor/sejong/__pycache__/sejong_download.cpython-39.pyc +0 -0
  35. nltkor/sejong/__pycache__/ssem.cpython-38.pyc +0 -0
  36. nltkor/sejong/__pycache__/ssem.cpython-39.pyc +0 -0
  37. nltkor/sejong/ch.py +12 -0
  38. nltkor/sejong/dict_semClassNum.txt +491 -0
  39. nltkor/sejong/layer.txt +630 -0
  40. nltkor/sejong/sejong_download.py +87 -0
  41. nltkor/sejong/ssem.py +684 -0
  42. nltkor/similarity/__init__.py +3 -0
  43. nltkor/similarity/bartscore____.py +337 -0
  44. nltkor/similarity/bertscore____.py +339 -0
  45. nltkor/similarity/classical.py +245 -0
  46. nltkor/similarity/cosine_similarity.py +175 -0
  47. nltkor/tag/__init__.py +71 -0
  48. nltkor/tag/__pycache__/__init__.cpython-38.pyc +0 -0
  49. nltkor/tag/__pycache__/__init__.cpython-39.pyc +0 -0
  50. nltkor/tag/__pycache__/espresso_tag.cpython-38.pyc +0 -0
  51. nltkor/tag/__pycache__/espresso_tag.cpython-39.pyc +0 -0
  52. nltkor/tag/espresso_tag.py +220 -0
  53. nltkor/tag/libs/__init__.py +10 -0
  54. nltkor/tag/libs/__pycache__/__init__.cpython-38.pyc +0 -0
  55. nltkor/tag/libs/__pycache__/__init__.cpython-39.pyc +0 -0
  56. nltkor/tag/libs/__pycache__/attributes.cpython-38.pyc +0 -0
  57. nltkor/tag/libs/__pycache__/attributes.cpython-39.pyc +0 -0
  58. nltkor/tag/libs/__pycache__/config.cpython-38.pyc +0 -0
  59. nltkor/tag/libs/__pycache__/config.cpython-39.pyc +0 -0
  60. nltkor/tag/libs/__pycache__/metadata.cpython-38.pyc +0 -0
  61. nltkor/tag/libs/__pycache__/metadata.cpython-39.pyc +0 -0
  62. nltkor/tag/libs/__pycache__/reader.cpython-38.pyc +0 -0
  63. nltkor/tag/libs/__pycache__/reader.cpython-39.pyc +0 -0
  64. nltkor/tag/libs/__pycache__/taggers.cpython-38.pyc +0 -0
  65. nltkor/tag/libs/__pycache__/taggers.cpython-39.pyc +0 -0
  66. nltkor/tag/libs/__pycache__/utils.cpython-38.pyc +0 -0
  67. nltkor/tag/libs/__pycache__/utils.cpython-39.pyc +0 -0
  68. nltkor/tag/libs/__pycache__/word_dictionary.cpython-38.pyc +0 -0
  69. nltkor/tag/libs/__pycache__/word_dictionary.cpython-39.pyc +0 -0
  70. nltkor/tag/libs/arguments.py +280 -0
  71. nltkor/tag/libs/attributes.py +231 -0
  72. nltkor/tag/libs/config.py +159 -0
  73. nltkor/tag/libs/metadata.py +129 -0
  74. nltkor/tag/libs/ner/__init__.py +2 -0
  75. nltkor/tag/libs/ner/__pycache__/__init__.cpython-38.pyc +0 -0
  76. nltkor/tag/libs/ner/__pycache__/__init__.cpython-39.pyc +0 -0
  77. nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-38.pyc +0 -0
  78. nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-39.pyc +0 -0
  79. nltkor/tag/libs/ner/macmorphoreader.py +7 -0
  80. nltkor/tag/libs/ner/ner_reader.py +92 -0
  81. nltkor/tag/libs/network.c +72325 -0
  82. nltkor/tag/libs/network.cpython-311-darwin.so +0 -0
  83. nltkor/tag/libs/network.pyx +878 -0
  84. nltkor/tag/libs/networkconv.pyx +1028 -0
  85. nltkor/tag/libs/networkdependencyconv.pyx +451 -0
  86. nltkor/tag/libs/parse/__init__.py +1 -0
  87. nltkor/tag/libs/parse/__pycache__/__init__.cpython-38.pyc +0 -0
  88. nltkor/tag/libs/parse/__pycache__/__init__.cpython-39.pyc +0 -0
  89. nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-38.pyc +0 -0
  90. nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-39.pyc +0 -0
  91. nltkor/tag/libs/parse/parse_reader.py +283 -0
  92. nltkor/tag/libs/pos/__init__.py +2 -0
  93. nltkor/tag/libs/pos/__pycache__/__init__.cpython-38.pyc +0 -0
  94. nltkor/tag/libs/pos/__pycache__/__init__.cpython-39.pyc +0 -0
  95. nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-38.pyc +0 -0
  96. nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-39.pyc +0 -0
  97. nltkor/tag/libs/pos/macmorphoreader.py +7 -0
  98. nltkor/tag/libs/pos/pos_reader.py +97 -0
  99. nltkor/tag/libs/reader.py +485 -0
  100. nltkor/tag/libs/srl/__init__.py +3 -0
  101. nltkor/tag/libs/srl/__pycache__/__init__.cpython-38.pyc +0 -0
  102. nltkor/tag/libs/srl/__pycache__/__init__.cpython-39.pyc +0 -0
  103. nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-38.pyc +0 -0
  104. nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-39.pyc +0 -0
  105. nltkor/tag/libs/srl/__pycache__/train_srl.cpython-38.pyc +0 -0
  106. nltkor/tag/libs/srl/__pycache__/train_srl.cpython-39.pyc +0 -0
  107. nltkor/tag/libs/srl/__srl_reader_.py +535 -0
  108. nltkor/tag/libs/srl/srl_reader.py +436 -0
  109. nltkor/tag/libs/srl/train_srl.py +87 -0
  110. nltkor/tag/libs/taggers.py +926 -0
  111. nltkor/tag/libs/utils.py +384 -0
  112. nltkor/tag/libs/word_dictionary.py +239 -0
  113. nltkor/tag/libs/wsd/__init__.py +2 -0
  114. nltkor/tag/libs/wsd/__pycache__/__init__.cpython-38.pyc +0 -0
  115. nltkor/tag/libs/wsd/__pycache__/__init__.cpython-39.pyc +0 -0
  116. nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-38.pyc +0 -0
  117. nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-39.pyc +0 -0
  118. nltkor/tag/libs/wsd/macmorphoreader.py +7 -0
  119. nltkor/tag/libs/wsd/wsd_reader.py +93 -0
  120. nltkor/tokenize/__init__.py +62 -0
  121. nltkor/tokenize/ko_tokenize.py +115 -0
  122. nltkor/trans.py +121 -0
  123. nltkor-1.2.14.dist-info/LICENSE.txt +1093 -0
  124. nltkor-1.2.14.dist-info/METADATA +41 -0
  125. nltkor-1.2.14.dist-info/RECORD +127 -0
  126. nltkor-1.2.14.dist-info/WHEEL +5 -0
  127. nltkor-1.2.14.dist-info/top_level.txt +1 -0
@@ -0,0 +1,245 @@
1
+ """
2
+ string2string similarity
3
+ src = https://github.com/stanfordnlp/string2string
4
+
5
+
6
+ MIT License
7
+
8
+ Copyright (c) 2023 Mirac Suzgun
9
+
10
+ Permission is hereby granted, free of charge, to any person obtaining a copy
11
+ of this software and associated documentation files (the "Software"), to deal
12
+ in the Software without restriction, including without limitation the rights
13
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14
+ copies of the Software, and to permit persons to whom the Software is
15
+ furnished to do so, subject to the following conditions:
16
+
17
+ The above copyright notice and this permission notice shall be included in all
18
+ copies or substantial portions of the Software.
19
+
20
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26
+ SOFTWARE.
27
+
28
+
29
+ """
30
+
31
+ """
32
+ This module contains the classes for the similarity metrics and functions.
33
+ """
34
+
35
+
36
+ from typing import List, Union, Tuple, Optional
37
+ import numpy as np
38
+
39
+ # # Import the LongestCommonSubsequence class
40
+
41
+ # for dev purposes
42
+ import sys
43
+ # sys.path.append("/Users/dowon/nltk_ko/nltk/metrics")
44
+ from nltkor.alignment import LongestCommonSubsequence, LongestCommonSubstring
45
+ # from alignment import LongestCommonSubsequence, LongestCommonSubstring
46
+
47
+ # Longest Common Subsequence based similarity class
48
class LCSubsequenceSimilarity(LongestCommonSubsequence):
    """
    Similarity metric built on top of the Longest Common Subsequence.

    Extends ``LongestCommonSubsequence`` and normalizes the value its
    ``compute`` method reports by the lengths of the two inputs.
    """

    def __init__(self):
        super().__init__()

    def compute(self,
        str1: Union[str, List[str]],
        str2: Union[str, List[str]],
        denominator: str = 'max',
    ) -> float:
        """
        Return the normalized LCS similarity of two sequences.

        Arguments:
            str1 (Union[str, List[str]]): First string or list of strings.
            str2 (Union[str, List[str]]): Second string or list of strings.
            denominator (str): Normalization mode. 'max' divides by the longer
                input's length; 'sum' divides twice the score by the sum of
                both lengths. Default is 'max'.

        Returns:
            float: The similarity between the two inputs.

        Raises:
            ValueError: If ``denominator`` is neither 'max' nor 'sum'.
        """
        # The parent returns a pair; only the first element is needed
        # (presumably the LCS length -- confirm against nltkor.alignment).
        common_length, _ = super().compute(str1, str2)

        if denominator == 'sum':
            return 2. * common_length / (len(str1) + len(str2))
        if denominator == 'max':
            return common_length / max(len(str1), len(str2))
        raise ValueError('Invalid denominator.')
88
+
89
+
90
+
91
+ # Longest Common Substring based similarity class
92
class LCSubstringSimilarity(LongestCommonSubstring):
    """
    Similarity metric built on top of the Longest Common Substring.

    Extends ``LongestCommonSubstring`` and normalizes the value its
    ``compute`` method reports by the lengths of the two inputs.
    """

    def __init__(self):
        super().__init__()

    def compute(self,
        str1: Union[str, List[str]],
        str2: Union[str, List[str]],
        denominator: str = 'max',
    ) -> float:
        """
        Return the normalized longest-common-substring similarity of two sequences.

        Arguments:
            str1 (Union[str, List[str]]): First string or list of strings.
            str2 (Union[str, List[str]]): Second string or list of strings.
            denominator (str): Normalization mode. 'max' divides by the longer
                input's length; 'sum' divides twice the score by the sum of
                both lengths. Default is 'max'.

        Returns:
            float: The similarity between the two inputs.

        Raises:
            ValueError: If ``denominator`` is neither 'max' nor 'sum'.
        """
        # The parent returns a pair; only the first element is needed
        # (presumably the common substring length -- confirm against nltkor.alignment).
        common_length, _ = super().compute(str1, str2)

        if denominator == 'sum':
            return 2. * common_length / (len(str1) + len(str2))
        if denominator == 'max':
            return common_length / max(len(str1), len(str2))
        raise ValueError('Invalid denominator.')
130
+
131
+
132
+ # Jaro similarity class
133
class JaroSimilarity:
    """
    This class contains the Jaro similarity metric.
    """

    def __init__(self):
        pass

    def compute(self,
        str1: Union[str, List[str]],
        str2: Union[str, List[str]],
    ) -> float:
        """
        Return the Jaro similarity between two strings (or lists of strings).

        Two elements count as a match when they are equal and their positions
        differ by at most ``k = max(0, max(len1, len2) // 2 - 1)``. The score is
        the mean of ``m / len1``, ``m / len2`` and ``(m - t) / m``, where ``m``
        is the number of matches and ``t`` is half the number of matched
        elements that appear in a different order in the two inputs.

        Arguments:
            str1 (Union[str, List[str]]): The first string or list of strings.
            str2 (Union[str, List[str]]): The second string or list of strings.

        Returns:
            float: The Jaro similarity, or 0.0 when no element matches
            (including when either input is empty).
        """
        len1 = len(str1)
        len2 = len(str2)

        # Maximum distance between matching positions. Clamp at zero: without
        # the clamp, two inputs of length 1 give k = -1, the search window is
        # empty, and identical single-element inputs score 0.0 instead of 1.0.
        k = max(0, max(len1, len2) // 2 - 1)

        num_matches = 0

        # Flags recording which positions of each input have been matched
        matches1 = [False] * len1
        matches2 = [False] * len2

        # Greedily pair each element of str1 with the first unmatched equal
        # element of str2 inside the window [i - k, i + k].
        for i in range(len1):
            lower_bound = max(0, i - k)
            upper_bound = min(len2, i + k + 1)

            for j in range(lower_bound, upper_bound):
                if not matches2[j] and str1[i] == str2[j]:
                    num_matches += 1
                    matches1[i] = True
                    matches2[j] = True
                    break

        # No matches: the similarity is zero (also avoids dividing by zero below)
        if num_matches == 0:
            return 0.

        # Walk the matched elements of both inputs in order; every position
        # where they disagree is half of a transposition.
        matched1 = [item for item, flag in zip(str1, matches1) if flag]
        matched2 = [item for item, flag in zip(str2, matches2) if flag]
        out_of_order = sum(1 for a, b in zip(matched1, matched2) if a != b)
        num_transpositions = out_of_order // 2

        # Return the Jaro similarity
        return (num_matches / len1 + num_matches / len2 + (num_matches - num_transpositions) / num_matches) / 3.0
220
+
221
def demo():
    """
    Run every similarity metric of this module on two sample sentences and print the scores.
    """
    # Metrics are instantiated, evaluated and reported in this order
    metrics = (
        LCSubsequenceSimilarity(),
        LCSubstringSimilarity(),
        JaroSimilarity(),
    )
    report_templates = (
        'Longest Common Subsequence Similarity: {}',
        'Longest Common Substring Similarity: {}',
        'Jaro Similarity: {}',
    )

    # Sample inputs
    str1 = '제가 나와 있는 곳은 경남 거제시 옥포동 덕포 해수욕장에 나와 있습니다.'
    str2 = '강한 바람에 간판이나 지붕이 떨어지는 등 피해가 잇따르기도 했습니다.'

    # Compute all scores before printing anything
    scores = [metric.compute(str1, str2) for metric in metrics]

    for template, score in zip(report_templates, scores):
        print(template.format(score))


if __name__ == '__main__':
    demo()
@@ -0,0 +1,175 @@
1
+ """
2
+ string2string similarity
3
+ src = https://github.com/stanfordnlp/string2string
4
+
5
+
6
+ MIT License
7
+
8
+ Copyright (c) 2023 Mirac Suzgun
9
+
10
+ Permission is hereby granted, free of charge, to any person obtaining a copy
11
+ of this software and associated documentation files (the "Software"), to deal
12
+ in the Software without restriction, including without limitation the rights
13
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14
+ copies of the Software, and to permit persons to whom the Software is
15
+ furnished to do so, subject to the following conditions:
16
+
17
+ The above copyright notice and this permission notice shall be included in all
18
+ copies or substantial portions of the Software.
19
+
20
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26
+ SOFTWARE.
27
+
28
+
29
+ """
30
+
31
+ from typing import List, Union, Tuple
32
+ import torch
33
+ from torch import Tensor
34
+ from torch.nn import functional as F
35
+ import numpy as np
36
+
37
+ # for dev purposes
38
+ import sys
39
+ # sys.path.append("/Users/dowon/nltk_ko/nltk/misc")
40
+ from nltkor.misc.string2string_word_embeddings import GloVeEmbeddings
41
+ # from string2string_word_embeddings import GloVeEmbeddings
42
+
43
+
44
+ # Cosine similarity class
45
class CosineSimilarity:
    r"""
    Cosine similarity between two torch tensors or two numpy arrays.
    """

    def __init__(self) -> None:
        r"""
        This function initializes the CosineSimilarity class.
        """
        pass

    # Compute (tensor)
    def _compute_tensor(self,
        x1: Tensor,
        x2: Tensor,
        dim: int = 1,
        eps: float = 1e-8
    ) -> Tensor:
        r"""
        Computes the cosine similarity between two tensors along a given dimension.

        Arguments:
            x1 (Tensor): First tensor.
            x2 (Tensor): Second tensor.
            dim (int): Dimension to compute cosine similarity.
            eps (float): Epsilon value.

        Returns:
            Tensor: Cosine similarity between two tensors along a given dimension.
        """
        # Inputs that are not already torch.float are converted before the computation
        if x1.dtype != torch.float:
            x1 = x1.float()
        if x2.dtype != torch.float:
            x2 = x2.float()
        return F.cosine_similarity(x1, x2, dim, eps)

    # Compute (numpy)
    def _compute_numpy(self,
        x1: np.ndarray,
        x2: np.ndarray,
        dim: int = 1,
        eps: float = 1e-8
    ) -> np.ndarray:
        r"""
        Computes the cosine similarity between two numpy arrays along a given dimension.

        Arguments:
            x1 (np.ndarray): First numpy array.
            x2 (np.ndarray): Second numpy array.
            dim (int): Dimension (or axis in the numpy realm) to compute cosine similarity.
            eps (float): Epsilon value (to prevent division by zero).

        Returns:
            np.ndarray: Cosine similarity between two numpy arrays along a given dimension.
        """
        # Dot product along "dim" divided by the product of the norms, floored at eps
        dot_product = np.sum(x1 * x2, axis=dim)
        norm_product = np.linalg.norm(x1, axis=dim) * np.linalg.norm(x2, axis=dim)
        return dot_product / np.maximum(norm_product, eps)

    # Compute
    def compute(self,
        x1: Union[Tensor, np.ndarray],
        x2: Union[Tensor, np.ndarray],
        dim: int = 0,
        eps: float = 1e-8
    ) -> Union[Tensor, np.ndarray]:
        r"""
        Computes the cosine similarity between two tensors (or numpy arrays) along a given dimension.

        * For two (non-zero) vectors, :math:`x_1` and :math:`x_2`, the cosine similarity is defined as follows:

            .. math::
                :nowrap:

                \begin{align}
                    \texttt{cosine-similarity}(x_1, x_2) & = \cos(\theta) \\
                                                         & = \frac{x_1 \cdot x_2}{||x_1|| \ ||x_2||} \\
                                                         & = \frac{\sum_{i=1}^n x_{1i} x_{2i}}{\sqrt{\sum_{i=1}^n x_{1i}^2} \sqrt{\sum_{i=1}^n x_{2i}^2}}
                \end{align}

            where :math:`\theta` denotes the angle between the vectors, :math:`\cdot` the dot product, and :math:`||\cdot||` the norm operator.

        * In practice, the cosine similarity is computed as follows:

            .. math::
                :nowrap:

                \begin{align}
                    \texttt{cosine-similarity}(x_1, x_2) & = \frac{x_1 \cdot x_2}{\max(||x_1|| \ ||x_2||, \epsilon)}
                \end{align}

            where :math:`\epsilon` is a small value to avoid division by zero.

        Subclasses of ``Tensor`` (e.g. ``torch.nn.Parameter``) and of ``np.ndarray`` are accepted;
        numpy subclasses are viewed as plain arrays before the computation.

        Arguments:
            x1 (Union[Tensor, np.ndarray]): First tensor (or numpy array).
            x2 (Union[Tensor, np.ndarray]): Second tensor (or numpy array).
            dim (int): Dimension to compute cosine similarity (default: 0).
            eps (float): Epsilon value (to avoid division by zero).

        Returns:
            Union[Tensor, np.ndarray]: Cosine similarity between two tensors (or numpy arrays) along a given dimension.

        Raises:
            TypeError: If x1 and x2 are not of the same type (either tensor or numpy array).
            TypeError: If x1 and x2 are not tensors or numpy arrays.
        """
        # isinstance (rather than exact type equality) so that subclasses such
        # as torch.nn.Parameter are treated as tensors instead of being rejected.
        if isinstance(x1, Tensor) and isinstance(x2, Tensor):
            return self._compute_tensor(x1, x2, dim, eps)

        if isinstance(x1, np.ndarray) and isinstance(x2, np.ndarray):
            # np.asarray returns plain ndarrays unchanged and strips subclasses
            # (e.g. np.matrix, whose "*" is not element-wise).
            return self._compute_numpy(np.asarray(x1), np.asarray(x2), dim, eps)

        # Neither supported pairing: report a type mismatch or an unsupported type
        if type(x1) is not type(x2):
            raise TypeError("x1 and x2 must be of the same type (either tensor or numpy array).")
        raise TypeError("x1 and x2 must be either tensors or numpy arrays.")
167
+
168
def demo():
    """Print the cosine similarity of two small sample vectors."""
    first_vector = np.array([20, 65, 1])
    second_vector = np.array([98, 67, 548])

    similarity = CosineSimilarity().compute(first_vector, second_vector)
    print("demo : ", similarity)


if __name__ == "__main__":
    demo()
nltkor/tag/__init__.py ADDED
@@ -0,0 +1,71 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Natural Language Toolkit: Taggers
3
+ #
4
+ # Copyright (C) 2001-2020 NLTK Project
5
+ # Author: Edward Loper <edloper@gmail.com>
6
+ # Steven Bird <stevenbird1@gmail.com> (minor additions)
7
+ # URL: <http://nltk.org/>
8
+ # For license information, see LICENSE.TXT
9
+ """
10
+ NLTK Taggers
11
+
12
+ This package contains classes and interfaces for part-of-speech
13
+ tagging, or simply "tagging".
14
+
15
+ A "tag" is a case-sensitive string that specifies some property of a token,
16
+ such as its part of speech. Tagged tokens are encoded as tuples
17
+ ``(token, tag)``. For example, the following tagged token combines
18
+ the word ``'fly'`` with a noun part of speech tag (``'NN'``):
19
+
20
+ >>> tagged_tok = ('fly', 'NN')
21
+
22
+ An off-the-shelf tagger is available for English. It uses the Penn Treebank tagset:
23
+
24
+ >>> from nltk import pos_tag, word_tokenize
25
+ >>> pos_tag(word_tokenize("John's big idea isn't all that bad."))
26
+ [('John', 'NNP'), ("'s", 'POS'), ('big', 'JJ'), ('idea', 'NN'), ('is', 'VBZ'),
27
+ ("n't", 'RB'), ('all', 'PDT'), ('that', 'DT'), ('bad', 'JJ'), ('.', '.')]
28
+
29
+ A Russian tagger is also available if you specify lang="rus". It uses
30
+ the Russian National Corpus tagset:
31
+
32
+ >>> pos_tag(word_tokenize("Илья оторопел и дважды перечитал бумажку."), lang='rus') # doctest: +SKIP
33
+ [('Илья', 'S'), ('оторопел', 'V'), ('и', 'CONJ'), ('дважды', 'ADV'), ('перечитал', 'V'),
34
+ ('бумажку', 'S'), ('.', 'NONLEX')]
35
+
36
+ This package defines several taggers, which take a list of tokens,
37
+ assign a tag to each one, and return the resulting list of tagged tokens.
38
+ Most of the taggers are built automatically based on a training corpus.
39
+ For example, the unigram tagger tags each word *w* by checking what
40
+ the most frequent tag for *w* was in a training corpus:
41
+
42
+ >>> from nltk.corpus import brown
43
+ >>> from nltk.tag import UnigramTagger
44
+ >>> tagger = UnigramTagger(brown.tagged_sents(categories='news')[:500])
45
+ >>> sent = ['Mitchell', 'decried', 'the', 'high', 'rate', 'of', 'unemployment']
46
+ >>> for word, tag in tagger.tag(sent):
47
+ ... print(word, '->', tag)
48
+ Mitchell -> NP
49
+ decried -> None
50
+ the -> AT
51
+ high -> JJ
52
+ rate -> NN
53
+ of -> IN
54
+ unemployment -> None
55
+
56
+ Note that words that the tagger has not seen during training receive a tag
57
+ of ``None``.
58
+
59
+ We evaluate a tagger on data that was not seen during training:
60
+
61
+ >>> tagger.evaluate(brown.tagged_sents(categories='news')[500:600])
62
+ 0.73...
63
+
64
+ For more information, please consult chapter 5 of the NLTK Book.
65
+ """
66
+
67
+
68
+ from nltkor.tag.espresso_tag import EspressoTagger
69
+ #import nltkor.tag
70
+ from nltkor.tag.libs import taggers
71
+ from .libs import PickleConverter