nltkor-1.2.14-cp311-cp311-macosx_13_0_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127)
  1. nltkor/Kor_char.py +193 -0
  2. nltkor/__init__.py +16 -0
  3. nltkor/alignment/__init__.py +1315 -0
  4. nltkor/cider/__init__.py +2 -0
  5. nltkor/cider/cider.py +55 -0
  6. nltkor/cider/cider_scorer.py +207 -0
  7. nltkor/distance/__init__.py +441 -0
  8. nltkor/distance/wasserstein.py +126 -0
  9. nltkor/etc.py +22 -0
  10. nltkor/lazyimport.py +144 -0
  11. nltkor/make_requirement.py +11 -0
  12. nltkor/metrics/__init__.py +63 -0
  13. nltkor/metrics/bartscore.py +301 -0
  14. nltkor/metrics/bertscore.py +331 -0
  15. nltkor/metrics/bleu_tensor.py +20 -0
  16. nltkor/metrics/classical.py +847 -0
  17. nltkor/metrics/entment.py +24 -0
  18. nltkor/metrics/eval.py +517 -0
  19. nltkor/metrics/mauve.py +273 -0
  20. nltkor/metrics/mauve_utils.py +131 -0
  21. nltkor/misc/__init__.py +11 -0
  22. nltkor/misc/string2string_basic_functions.py +59 -0
  23. nltkor/misc/string2string_default_tokenizer.py +83 -0
  24. nltkor/misc/string2string_hash_functions.py +159 -0
  25. nltkor/misc/string2string_word_embeddings.py +503 -0
  26. nltkor/search/__init__.py +10 -0
  27. nltkor/search/classical.py +569 -0
  28. nltkor/search/faiss_search.py +787 -0
  29. nltkor/search/kobert_tokenizer.py +181 -0
  30. nltkor/sejong/__init__.py +3 -0
  31. nltkor/sejong/__pycache__/__init__.cpython-38.pyc +0 -0
  32. nltkor/sejong/__pycache__/__init__.cpython-39.pyc +0 -0
  33. nltkor/sejong/__pycache__/sejong_download.cpython-38.pyc +0 -0
  34. nltkor/sejong/__pycache__/sejong_download.cpython-39.pyc +0 -0
  35. nltkor/sejong/__pycache__/ssem.cpython-38.pyc +0 -0
  36. nltkor/sejong/__pycache__/ssem.cpython-39.pyc +0 -0
  37. nltkor/sejong/ch.py +12 -0
  38. nltkor/sejong/dict_semClassNum.txt +491 -0
  39. nltkor/sejong/layer.txt +630 -0
  40. nltkor/sejong/sejong_download.py +87 -0
  41. nltkor/sejong/ssem.py +684 -0
  42. nltkor/similarity/__init__.py +3 -0
  43. nltkor/similarity/bartscore____.py +337 -0
  44. nltkor/similarity/bertscore____.py +339 -0
  45. nltkor/similarity/classical.py +245 -0
  46. nltkor/similarity/cosine_similarity.py +175 -0
  47. nltkor/tag/__init__.py +71 -0
  48. nltkor/tag/__pycache__/__init__.cpython-38.pyc +0 -0
  49. nltkor/tag/__pycache__/__init__.cpython-39.pyc +0 -0
  50. nltkor/tag/__pycache__/espresso_tag.cpython-38.pyc +0 -0
  51. nltkor/tag/__pycache__/espresso_tag.cpython-39.pyc +0 -0
  52. nltkor/tag/espresso_tag.py +220 -0
  53. nltkor/tag/libs/__init__.py +10 -0
  54. nltkor/tag/libs/__pycache__/__init__.cpython-38.pyc +0 -0
  55. nltkor/tag/libs/__pycache__/__init__.cpython-39.pyc +0 -0
  56. nltkor/tag/libs/__pycache__/attributes.cpython-38.pyc +0 -0
  57. nltkor/tag/libs/__pycache__/attributes.cpython-39.pyc +0 -0
  58. nltkor/tag/libs/__pycache__/config.cpython-38.pyc +0 -0
  59. nltkor/tag/libs/__pycache__/config.cpython-39.pyc +0 -0
  60. nltkor/tag/libs/__pycache__/metadata.cpython-38.pyc +0 -0
  61. nltkor/tag/libs/__pycache__/metadata.cpython-39.pyc +0 -0
  62. nltkor/tag/libs/__pycache__/reader.cpython-38.pyc +0 -0
  63. nltkor/tag/libs/__pycache__/reader.cpython-39.pyc +0 -0
  64. nltkor/tag/libs/__pycache__/taggers.cpython-38.pyc +0 -0
  65. nltkor/tag/libs/__pycache__/taggers.cpython-39.pyc +0 -0
  66. nltkor/tag/libs/__pycache__/utils.cpython-38.pyc +0 -0
  67. nltkor/tag/libs/__pycache__/utils.cpython-39.pyc +0 -0
  68. nltkor/tag/libs/__pycache__/word_dictionary.cpython-38.pyc +0 -0
  69. nltkor/tag/libs/__pycache__/word_dictionary.cpython-39.pyc +0 -0
  70. nltkor/tag/libs/arguments.py +280 -0
  71. nltkor/tag/libs/attributes.py +231 -0
  72. nltkor/tag/libs/config.py +159 -0
  73. nltkor/tag/libs/metadata.py +129 -0
  74. nltkor/tag/libs/ner/__init__.py +2 -0
  75. nltkor/tag/libs/ner/__pycache__/__init__.cpython-38.pyc +0 -0
  76. nltkor/tag/libs/ner/__pycache__/__init__.cpython-39.pyc +0 -0
  77. nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-38.pyc +0 -0
  78. nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-39.pyc +0 -0
  79. nltkor/tag/libs/ner/macmorphoreader.py +7 -0
  80. nltkor/tag/libs/ner/ner_reader.py +92 -0
  81. nltkor/tag/libs/network.c +72325 -0
  82. nltkor/tag/libs/network.cpython-311-darwin.so +0 -0
  83. nltkor/tag/libs/network.pyx +878 -0
  84. nltkor/tag/libs/networkconv.pyx +1028 -0
  85. nltkor/tag/libs/networkdependencyconv.pyx +451 -0
  86. nltkor/tag/libs/parse/__init__.py +1 -0
  87. nltkor/tag/libs/parse/__pycache__/__init__.cpython-38.pyc +0 -0
  88. nltkor/tag/libs/parse/__pycache__/__init__.cpython-39.pyc +0 -0
  89. nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-38.pyc +0 -0
  90. nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-39.pyc +0 -0
  91. nltkor/tag/libs/parse/parse_reader.py +283 -0
  92. nltkor/tag/libs/pos/__init__.py +2 -0
  93. nltkor/tag/libs/pos/__pycache__/__init__.cpython-38.pyc +0 -0
  94. nltkor/tag/libs/pos/__pycache__/__init__.cpython-39.pyc +0 -0
  95. nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-38.pyc +0 -0
  96. nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-39.pyc +0 -0
  97. nltkor/tag/libs/pos/macmorphoreader.py +7 -0
  98. nltkor/tag/libs/pos/pos_reader.py +97 -0
  99. nltkor/tag/libs/reader.py +485 -0
  100. nltkor/tag/libs/srl/__init__.py +3 -0
  101. nltkor/tag/libs/srl/__pycache__/__init__.cpython-38.pyc +0 -0
  102. nltkor/tag/libs/srl/__pycache__/__init__.cpython-39.pyc +0 -0
  103. nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-38.pyc +0 -0
  104. nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-39.pyc +0 -0
  105. nltkor/tag/libs/srl/__pycache__/train_srl.cpython-38.pyc +0 -0
  106. nltkor/tag/libs/srl/__pycache__/train_srl.cpython-39.pyc +0 -0
  107. nltkor/tag/libs/srl/__srl_reader_.py +535 -0
  108. nltkor/tag/libs/srl/srl_reader.py +436 -0
  109. nltkor/tag/libs/srl/train_srl.py +87 -0
  110. nltkor/tag/libs/taggers.py +926 -0
  111. nltkor/tag/libs/utils.py +384 -0
  112. nltkor/tag/libs/word_dictionary.py +239 -0
  113. nltkor/tag/libs/wsd/__init__.py +2 -0
  114. nltkor/tag/libs/wsd/__pycache__/__init__.cpython-38.pyc +0 -0
  115. nltkor/tag/libs/wsd/__pycache__/__init__.cpython-39.pyc +0 -0
  116. nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-38.pyc +0 -0
  117. nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-39.pyc +0 -0
  118. nltkor/tag/libs/wsd/macmorphoreader.py +7 -0
  119. nltkor/tag/libs/wsd/wsd_reader.py +93 -0
  120. nltkor/tokenize/__init__.py +62 -0
  121. nltkor/tokenize/ko_tokenize.py +115 -0
  122. nltkor/trans.py +121 -0
  123. nltkor-1.2.14.dist-info/LICENSE.txt +1093 -0
  124. nltkor-1.2.14.dist-info/METADATA +41 -0
  125. nltkor-1.2.14.dist-info/RECORD +127 -0
  126. nltkor-1.2.14.dist-info/WHEEL +5 -0
  127. nltkor-1.2.14.dist-info/top_level.txt +1 -0
nltkor/search/kobert_tokenizer.py ADDED
@@ -0,0 +1,181 @@
+ # coding=utf-8
+ # Copyright 2021 SKT AI Authors.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from typing import Any, Dict, List, Optional
+ from nltkor.make_requirement import make_requirement
+ try:
+     from transformers.tokenization_utils import AddedToken
+     from transformers import XLNetTokenizer
+     from transformers import SPIECE_UNDERLINE
+     import sentencepiece
+ except ImportError:
+     requirement = ['transformers>=4.8.2', 'sentencepiece']
+     file_path = make_requirement(requirement)
+     raise Exception(f"""
+ The following libraries are required; install them with pip:
+ \t pip install 'transformers>=4.8.2'
+ \t pip install sentencepiece
+ or install everything from the generated requirements file:
+ \t pip install -r {file_path}
+ """)
+
+ class KoBERTTokenizer(XLNetTokenizer):
+     padding_side = "right"
+
+     def __init__(
+         self,
+         vocab_file,
+         do_lower_case=False,
+         remove_space=True,
+         keep_accents=False,
+         bos_token="[CLS]",
+         eos_token="[SEP]",
+         unk_token="[UNK]",
+         sep_token="[SEP]",
+         pad_token="[PAD]",
+         cls_token="[CLS]",
+         mask_token="[MASK]",
+         additional_special_tokens=None,
+         sp_model_kwargs: Optional[Dict[str, Any]] = None,
+         **kwargs
+     ) -> None:
+         # The mask token behaves like a normal word, i.e. it includes the space before it.
+         mask_token = (
+             AddedToken(mask_token, lstrip=True, rstrip=False)
+             if isinstance(mask_token, str)
+             else mask_token
+         )
+
+         self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+
+         super().__init__(
+             vocab_file,
+             do_lower_case=do_lower_case,
+             remove_space=remove_space,
+             keep_accents=keep_accents,
+             bos_token=bos_token,
+             eos_token=eos_token,
+             unk_token=unk_token,
+             sep_token=sep_token,
+             pad_token=pad_token,
+             cls_token=cls_token,
+             mask_token=mask_token,
+             additional_special_tokens=additional_special_tokens,
+             sp_model_kwargs=self.sp_model_kwargs,
+             **kwargs,
+         )
+         self._pad_token_type_id = 0
+
+     def build_inputs_with_special_tokens(
+         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+     ) -> List[int]:
+         """
+         Build model inputs from a sequence or a pair of sequences for sequence classification tasks by
+         concatenating and adding special tokens. An XLNet sequence has the following format:
+
+         - single sequence: ``<cls> X <sep>``
+         - pair of sequences: ``<cls> A <sep> B <sep>``
+
+         Args:
+             token_ids_0 (:obj:`List[int]`):
+                 List of IDs to which the special tokens will be added.
+             token_ids_1 (:obj:`List[int]`, `optional`):
+                 Optional second list of IDs for sequence pairs.
+
+         Returns:
+             :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+         """
+         sep = [self.sep_token_id]
+         cls = [self.cls_token_id]
+         if token_ids_1 is None:
+             return cls + token_ids_0 + sep
+         return cls + token_ids_0 + sep + token_ids_1 + sep
+
+     def _tokenize(self, text: str) -> List[str]:
+         """Tokenize a string with SentencePiece, re-splitting pieces that end in a digit followed by a comma."""
+         text = self.preprocess_text(text)
+         pieces = self.sp_model.encode(text, out_type=str, **self.sp_model_kwargs)
+         new_pieces = []
+         for piece in pieces:
+             if len(piece) > 1 and piece[-1] == "," and piece[-2].isdigit():
+                 cur_pieces = self.sp_model.EncodeAsPieces(
+                     piece[:-1].replace(SPIECE_UNDERLINE, "")
+                 )
+                 if (
+                     piece[0] != SPIECE_UNDERLINE
+                     and cur_pieces[0][0] == SPIECE_UNDERLINE
+                 ):
+                     if len(cur_pieces[0]) == 1:
+                         cur_pieces = cur_pieces[1:]
+                     else:
+                         cur_pieces[0] = cur_pieces[0][1:]
+                 cur_pieces.append(piece[-1])
+                 new_pieces.extend(cur_pieces)
+             else:
+                 new_pieces.append(piece)
+
+         return new_pieces
+
+     def create_token_type_ids_from_sequences(
+         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+     ) -> List[int]:
+         """
+         Create a mask from the two sequences passed to be used in a sequence-pair classification task. An XLNet
+         sequence pair mask has the following format:
+
+         ::
+
+             0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+             | first sequence      | second sequence |
+
+         If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s).
+
+         Args:
+             token_ids_0 (:obj:`List[int]`):
+                 List of IDs.
+             token_ids_1 (:obj:`List[int]`, `optional`):
+                 Optional second list of IDs for sequence pairs.
+
+         Returns:
+             :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
+             sequence(s).
+         """
+         sep = [self.sep_token_id]
+         cls = [self.cls_token_id]
+         if token_ids_1 is None:
+             return len(cls + token_ids_0 + sep) * [0]
+         return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
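
For context, here is a minimal usage sketch of the tokenizer defined above. It assumes the public SKT checkpoint name 'skt/kobert-base-v1' on the Hugging Face hub, which is not shipped with this package; any SentencePiece vocab file accepted by XLNetTokenizer should work in its place.

from nltkor.search.kobert_tokenizer import KoBERTTokenizer

# 'skt/kobert-base-v1' is an assumed external checkpoint, not part of nltkor.
tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')

tokens = tokenizer.tokenize("한국어 형태소 분석")
ids = tokenizer.build_inputs_with_special_tokens(
    tokenizer.convert_tokens_to_ids(tokens))
type_ids = tokenizer.create_token_type_ids_from_sequences(
    tokenizer.convert_tokens_to_ids(tokens))
# ids is wrapped as [CLS] ... [SEP]; type_ids is all zeros for a single
# sequence, matching the docstrings above.
print(tokens, ids, type_ids)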
nltkor/sejong/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from nltkor.sejong.sejong_download import SejongDir
+
+ __all__ = ['ssem']
nltkor/sejong/ch.py ADDED
@@ -0,0 +1,12 @@
+ import os
+ import unicodedata
+
+
+ # Rename each file in the directory to the NFD-normalized form of its name.
+ base_dir = '/01. 체언_상세/'
+ for filename in os.listdir(base_dir):
+     new_filename = unicodedata.normalize('NFD', filename)
+     os.rename(os.path.join(base_dir, filename),
+               os.path.join(base_dir, new_filename))
+
+
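
The NFD rename above matters because composed (NFC) and decomposed (NFD) Hangul render identically but are different code-point sequences, so a lookup keyed on one form misses files stored in the other. A small self-contained illustration (standard library only, not part of the package):

import unicodedata

s = '체언'
nfc = unicodedata.normalize('NFC', s)  # composed syllables
nfd = unicodedata.normalize('NFD', s)  # decomposed jamo
print(nfc == nfd)           # False: the strings compare unequal
print(len(nfc), len(nfd))   # 2 5 -- syllables vs. individual jamo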