nltkor-1.2.0-cp39-cp39-macosx_10_9_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76)
  1. nltkor/Kor_char.py +193 -0
  2. nltkor/__init__.py +15 -0
  3. nltkor/alignment/__init__.py +1315 -0
  4. nltkor/cider/__init__.py +2 -0
  5. nltkor/cider/cider.py +55 -0
  6. nltkor/cider/cider_scorer.py +207 -0
  7. nltkor/distance/__init__.py +441 -0
  8. nltkor/distance/wasserstein.py +126 -0
  9. nltkor/etc.py +22 -0
  10. nltkor/lazyimport.py +144 -0
  11. nltkor/make_requirement.py +11 -0
  12. nltkor/metrics/__init__.py +63 -0
  13. nltkor/metrics/bartscore.py +301 -0
  14. nltkor/metrics/bertscore.py +331 -0
  15. nltkor/metrics/bleu_tensor.py +20 -0
  16. nltkor/metrics/classical.py +814 -0
  17. nltkor/metrics/entment.py +24 -0
  18. nltkor/metrics/eval.py +517 -0
  19. nltkor/metrics/mauve.py +273 -0
  20. nltkor/metrics/mauve_utils.py +131 -0
  21. nltkor/misc/__init__.py +11 -0
  22. nltkor/misc/string2string_basic_functions.py +59 -0
  23. nltkor/misc/string2string_default_tokenizer.py +83 -0
  24. nltkor/misc/string2string_hash_functions.py +159 -0
  25. nltkor/misc/string2string_word_embeddings.py +503 -0
  26. nltkor/search/__init__.py +10 -0
  27. nltkor/search/classical.py +569 -0
  28. nltkor/search/faiss_search.py +467 -0
  29. nltkor/search/kobert_tokenizer.py +181 -0
  30. nltkor/sejong/__init__.py +3 -0
  31. nltkor/sejong/ch.py +12 -0
  32. nltkor/sejong/dict_semClassNum.txt +491 -0
  33. nltkor/sejong/layer.txt +630 -0
  34. nltkor/sejong/sejong_download.py +87 -0
  35. nltkor/sejong/ssem.py +685 -0
  36. nltkor/similarity/__init__.py +3 -0
  37. nltkor/similarity/bartscore____.py +337 -0
  38. nltkor/similarity/bertscore____.py +339 -0
  39. nltkor/similarity/classical.py +245 -0
  40. nltkor/similarity/cosine_similarity.py +175 -0
  41. nltkor/tag/__init__.py +70 -0
  42. nltkor/tag/espresso_tag.py +220 -0
  43. nltkor/tag/libs/__init__.py +9 -0
  44. nltkor/tag/libs/arguments.py +280 -0
  45. nltkor/tag/libs/attributes.py +231 -0
  46. nltkor/tag/libs/config.py +158 -0
  47. nltkor/tag/libs/metadata.py +129 -0
  48. nltkor/tag/libs/ner/__init__.py +2 -0
  49. nltkor/tag/libs/ner/macmorphoreader.py +7 -0
  50. nltkor/tag/libs/ner/ner_reader.py +92 -0
  51. nltkor/tag/libs/network.c +59267 -0
  52. nltkor/tag/libs/network.cpython-39-darwin.so +0 -0
  53. nltkor/tag/libs/parse/__init__.py +1 -0
  54. nltkor/tag/libs/parse/parse_reader.py +283 -0
  55. nltkor/tag/libs/pos/__init__.py +2 -0
  56. nltkor/tag/libs/pos/macmorphoreader.py +7 -0
  57. nltkor/tag/libs/pos/pos_reader.py +89 -0
  58. nltkor/tag/libs/reader.py +510 -0
  59. nltkor/tag/libs/srl/__init__.py +3 -0
  60. nltkor/tag/libs/srl/__srl_reader_.py +535 -0
  61. nltkor/tag/libs/srl/srl_reader.py +436 -0
  62. nltkor/tag/libs/srl/train_srl.py +87 -0
  63. nltkor/tag/libs/taggers.py +926 -0
  64. nltkor/tag/libs/utils.py +344 -0
  65. nltkor/tag/libs/word_dictionary.py +239 -0
  66. nltkor/tag/libs/wsd/__init__.py +2 -0
  67. nltkor/tag/libs/wsd/macmorphoreader.py +7 -0
  68. nltkor/tag/libs/wsd/wsd_reader.py +93 -0
  69. nltkor/tokenize/__init__.py +62 -0
  70. nltkor/tokenize/ko_tokenize.py +115 -0
  71. nltkor/trans.py +121 -0
  72. nltkor-1.2.0.dist-info/LICENSE.txt +1093 -0
  73. nltkor-1.2.0.dist-info/METADATA +33 -0
  74. nltkor-1.2.0.dist-info/RECORD +76 -0
  75. nltkor-1.2.0.dist-info/WHEEL +5 -0
  76. nltkor-1.2.0.dist-info/top_level.txt +1 -0
nltkor/search/faiss_search.py ADDED
@@ -0,0 +1,467 @@
+ """
+ string2string search
+ src = https://github.com/stanfordnlp/string2string
+
+
+ MIT License
+
+ Copyright (c) 2023 Mirac Suzgun
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+
+
+ """
+
+
+ """
+ This module contains a wrapper for the Faiss library by Facebook AI Research.
+ """
+
+ from typing import List, Union, Optional, Dict, Any
+ import os
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+ from nltkor.make_requirement import make_requirement
+ try:
+     import torch
+     from transformers import AutoTokenizer, AutoModel, XLNetTokenizer
+     import pandas as pd
+     from datasets import Dataset
+     # import protobuf
+ except ImportError:
+     requirment = ['torch', 'transformers>=4.8.2', 'pandas', 'datasets', "protobuf", 'sentencepiece']
+     file_path = make_requirement(requirment)
+     raise Exception(f"""
+     Need to install Libraries, please pip install below libraries
+     \t pip install transformers>=4.8.2
+     \t pip install torch
+     \t pip install pandas
+     \t pip install datasets
+     \t pip install protobuf
+     \t pip install sentencepiece
+     Or, use pip install requirement.txt
+     \t pip install -r {file_path}
+     """)
+
+ # from nltk.search.kobert_tokenizer import KoBERTTokenizer
+
+
+
+ # FAISS library wrapper class
+ class FaissSearch:
+     def __init__(self,
+                  model_name_or_path: str = 'klue/bert-base',
+                  tokenizer_name_or_path: str = 'klue/bert-base',
+                  device: str = 'cpu'
+                  ) -> None:
+         r"""
+         This function initializes the wrapper for the FAISS library, which is used to perform semantic search.
+
+
+         .. attention::
+
+             * If you use this class, please make sure to cite the following paper:
+
+             .. code-block:: latex
+
+                 @article{johnson2019billion,
+                     title={Billion-scale similarity search with {GPUs}},
+                     author={Johnson, Jeff and Douze, Matthijs and J{\'e}gou, Herv{\'e}},
+                     journal={IEEE Transactions on Big Data},
+                     volume={7},
+                     number={3},
+                     pages={535--547},
+                     year={2019},
+                     publisher={IEEE}
+                 }
+
+             * The code is based on the following GitHub repository:
+                 https://github.com/facebookresearch/faiss
+
+         Arguments:
+             model_name_or_path (str, optional): The name or path of the model to use. Defaults to 'facebook/bart-large'.
+             tokenizer_name_or_path (str, optional): The name or path of the tokenizer to use. Defaults to 'facebook/bart-large'.
+             device (str, optional): The device to use. Defaults to 'cpu'.
+
+         Returns:
+             None
+         """
+
+         # Set the device
+         self.device = device
+
+         # If the tokenizer is not specified, use the model name or path
+         if tokenizer_name_or_path is None:
+             tokenizer_name_or_path = model_name_or_path
+
+         # Load the tokenizer
+         if tokenizer_name_or_path == 'skt/kobert-base-v1':
+             # self.tokenizer = KoBERTTokenizer.from_pretrained(tokenizer_name_or_path)
+             self.tokenizer = XLNetTokenizer.from_pretrained(tokenizer_name_or_path)
+         else:
+             self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)
+
+         # Load the model
+         self.model = AutoModel.from_pretrained(model_name_or_path).to(self.device)
+
+         # Set the model to evaluation mode (since we do not need the gradients)
+         self.model.eval()
+
+         # Initialize the dataset
+         self.dataset = None
+
+
+
+     # Auxiliary function to get the last hidden state
+     def get_last_hidden_state(self,
+                               embeddings: torch.Tensor,
+                               ) -> torch.Tensor:
+         """
+         This function returns the last hidden state (e.g., [CLS] token's) of the input embeddings.
+
+         Arguments:
+             embeddings (torch.Tensor): The input embeddings.
+
+         Returns:
+             torch.Tensor: The last hidden state.
+         """
+
+         # Get the last hidden state
+         last_hidden_state = embeddings.last_hidden_state
+
+         # Return the last hidden state
+         return last_hidden_state[:, 0, :]
+
+
+
+     # Auxiliary function to get the mean pooling
+     def get_mean_pooling(self,
+                          embeddings: torch.Tensor,
+                          ) -> torch.Tensor:
+         """
+         This function returns the mean pooling of the input embeddings.
+
+         Arguments:
+             embeddings (torch.Tensor): The input embeddings.
+
+         Returns:
+             torch.Tensor: The mean pooling.
+         """
+
+         # Get the mean pooling
+         mean_pooling = embeddings.last_hidden_state.mean(dim=1)
+
+         # Return the mean pooling
+         return mean_pooling
+
+
+
+
+     # Get the embeddings
+     def get_embeddings(self,
+                        text: Union[str, List[str]],
+                        embedding_type: str = 'last_hidden_state',
+                        batch_size: int = 8,
+                        num_workers: int = 4,
+                        ) -> torch.Tensor:
+         """
+         This function returns the embeddings of the input text.
+
+         Arguments:
+             text (Union[str, List[str]]): The input text.
+             embedding_type (str, optional): The type of embedding to use. Defaults to 'last_hidden_state'.
+             batch_size (int, optional): The batch size to use. Defaults to 8.
+             num_workers (int, optional): The number of workers to use. Defaults to 4.
+
+         Returns:
+             torch.Tensor: The embeddings.
+
+         Raises:
+             ValueError: If the embedding type is invalid.
+         """
+
+         # Check if the embedding type is valid
+         if embedding_type not in ['last_hidden_state', 'mean_pooling']:
+             raise ValueError(f'Invalid embedding type: {embedding_type}. Only "last_hidden_state" and "mean_pooling" are supported.')
+
+         # Tokenize the input text
+         encoded_text = self.tokenizer(
+             text,
+             padding=True,
+             truncation=True,
+             return_tensors='pt',
+         )
+
+         # Move the input text to the device
+         encoded_text = encoded_text.to(self.device)
+
+         # encoded_inputs = {k: v.to(self.device) for k, v in encoded_inputs.items()}
+
+         # Get the embeddings
+         with torch.no_grad():
+             embeddings = self.model(**encoded_text)
+
+         # Get the proper embedding type
+         if embedding_type == 'last_hidden_state':
+             # Get the last hidden state
+             embeddings = self.get_last_hidden_state(embeddings)
+         elif embedding_type == 'mean_pooling':
+             # Get the mean pooling
+             embeddings = self.get_mean_pooling(embeddings)
+
+         # Return the embeddings
+         return embeddings
+
+
+
+     # Add FAISS index
+     def add_faiss_index(self,
+                         column_name: str = 'embeddings',
+                         metric_type: Optional[int] = None,
+                         batch_size: int = 8,
+                         **kwargs,
+                         ) -> None:
+         """
+         This function adds a FAISS index to the dataset.
+
+         Arguments:
+             column_name (str, optional): The name of the column containing the embeddings. Defaults to 'embeddings'.
+             index_type (str, optional): The index type to use. Defaults to 'Flat'.
+             metric_type (str, optional): The metric type to use. Defaults to 'L2'.
+
+         Returns:
+             None
+
+         Raises:
+             ValueError: If the dataset is not initialized.
+         """
+
+         # Check if the dataset is initialized
+         if self.dataset is None:
+             raise ValueError('The dataset is not initialized. Please initialize the dataset first.')
+
+         print('Adding FAISS index...')
+         self.dataset.add_faiss_index(
+             column_name,
+             # metric_type=metric_type,
+             # device=self.device,
+             # batch_size=batch_size,
+             faiss_verbose=True,
+             # **kwargs,
+         )
+
+
+     def save_faiss_index(self,
+                          index_name: str,
+                          file_path: str,
+                          ) -> None:
+         """
+         This function saves the FAISS index to the specified file path.
+         * This is a wrapper function for the `save_faiss_index` function in the `Dataset` class.
+
+         Arguments:
+             index_name (str): The name of the FAISS index (e.g., "embeddings")
+             file_path (str): The file path to save the FAISS index.
+
+         Returns:
+             None
+
+         Raises:
+             ValueError: If the dataset is not initialized.
+         """
+
+         # Check if the dataset is initialized
+         if self.dataset is None:
+             raise ValueError('The dataset is not initialized. Please initialize the dataset first.')
+
+         print('Saving FAISS index...')
+         self.dataset.save_faiss_index(index_name=index_name, file=file_path)
+
+
+
+     def load_faiss_index(self,
+                          index_name: str,
+                          file_path: str,
+                          device: str = 'cpu',
+                          ) -> None:
+         """
+         This function loads the FAISS index from the specified file path.
+         * This is a wrapper function for the `load_faiss_index` function in the `Dataset` class.
+
+         Arguments:
+             index_name (str): The name of the FAISS index (e.g., "embeddings")
+             file_path (str): The file path to load the FAISS index from.
+             device (str, optional): The device to use ("cpu" or "cuda") (default: "cpu").
+
+         Returns:
+             None
+
+         Raises:
+             ValueError: If the dataset is not initialized.
+         """
+
+         # Check if the dataset is initialized
+         if self.dataset is None:
+             raise ValueError('The dataset is not initialized. Please initialize the dataset first.')
+
+         print('Loading FAISS index...')
+         self.dataset.load_faiss_index(index_name=index_name, file=file_path, device=device)
+
+
+
+     # Initialize the corpus using a dictionary or pandas DataFrame or HuggingFace Datasets object
+     def initialize_corpus(self,
+                           corpus: Union[Dict[str, List[str]], pd.DataFrame, Dataset],
+                           section: str = 'text',
+                           index_column_name: str = 'embeddings',
+                           embedding_type: str = 'last_hidden_state',
+                           batch_size: Optional[int] = None,
+                           num_workers: Optional[int] = None,
+                           save_path: Optional[str] = None,
+                           ) -> Dataset:
+         """
+         This function initializes a dataset using a dictionary or pandas DataFrame or HuggingFace Datasets object.
+
+         Arguments:
+             dataset_dict (Dict[str, List[str]]): The dataset dictionary.
+             section (str): The section of the dataset to use whose embeddings will be used for semantic search (e.g., 'text', 'title', etc.) (default: 'text').
+             index_column_name (str): The name of the column containing the embeddings (default: 'embeddings')
+             embedding_type (str): The type of embedding to use (default: 'last_hidden_state').
+             batch_size (int, optional): The batch size to use (default: 8).
+             max_length (int, optional): The maximum length of the input sequences.
+             num_workers (int, optional): The number of workers to use.
+             save_path (Optional[str], optional): The path to save the dataset (default: None).
+
+         Returns:
+             Dataset: The dataset object (HuggingFace Datasets).
+
+         Raises:
+             ValueError: If the dataset is not a dictionary or pandas DataFrame or HuggingFace Datasets object.
+         """
+
+         # Create the dataset
+         if isinstance(corpus, dict):
+             self.dataset = Dataset.from_dict(corpus)
+         elif isinstance(corpus, pd.DataFrame):
+             self.dataset = Dataset.from_pandas(corpus)
+         elif isinstance(corpus, Dataset):
+             self.dataset = corpus
+         else:
+             raise ValueError('The dataset must be a dictionary or pandas DataFrame.')
+
+         # Set the embedding_type
+         self.embedding_type = embedding_type
+
+
+         # Tokenize the dataset
+         # self.dataset = self.dataset.map(
+         #     lambda x: x[section],
+         #     batched=True,
+         #     batch_size=batch_size,
+         #     num_proc=num_workers,
+         # )
+
+         # Map the section of the dataset to the embeddings
+         self.dataset = self.dataset.map(
+             lambda x: {
+                 index_column_name: self.get_embeddings(x[section], embedding_type=self.embedding_type).detach().cpu().numpy()[0]
+             },
+             # batched=True,
+             batch_size=batch_size,
+             num_proc=num_workers,
+         )
+
+         # Save the dataset
+         if save_path is not None:
+             self.dataset.to_json(save_path)
+
+         # Add FAISS index
+         self.add_faiss_index(
+             column_name=index_column_name,
+         )
+
+         # Return the dataset
+         return self.dataset
+
+
+
+     # Initialize the dataset using a JSON file
+     def load_dataset_from_json(self,
+                                json_path: str,
+                                ) -> Dataset:
+         """
+         This function loads a dataset from a JSON file.
+
+         Arguments:
+             json_path (str): The path to the JSON file.
+
+         Returns:
+             Dataset: The dataset.
+         """
+
+         # Load the dataset
+         self.dataset = Dataset.from_json(json_path)
+
+         # Return the dataset
+         return self.dataset
+
+
+
+     # Search for the most similar elements in the dataset, given a query
+     def search(self,
+                query: str,
+                k: int = 1,
+                index_column_name: str = 'embeddings',
+                ) -> pd.DataFrame:
+         """
+         This function searches for the most similar elements in the dataset, given a query.
+
+         Arguments:
+             query (str): The query.
+             k (int, optional): The number of elements to return (default: 1).
+             index_column_name (str, optional): The name of the column containing the embeddings (default: 'embeddings')
+
+         Returns:
+             pd.DataFrame: The most similar elements in the dataset (text, score, etc.), sorted by score.
+
+         Remarks:
+             The returned elements are dictionaries containing the text and the score.
+         """
+
+         # Get the embeddings of the query
+         query_embeddings = self.get_embeddings([query], embedding_type=self.embedding_type).detach().cpu().numpy()
+
+         # Search for the most similar elements in the dataset
+         scores, similar_elts = self.dataset.get_nearest_examples(
+             index_name=index_column_name,
+             query=query_embeddings,
+             k=k,
+         )
+
+         # Convert the results to a pandas DataFrame
+         results_df = pd.DataFrame.from_dict(similar_elts)
+
+         # Add the scores
+         results_df['score'] = scores
+
+         # Sort the results by score
+         results_df.sort_values("score", ascending=True, inplace=True)
+
+         # Return the most similar elements
+         return results_df
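
Read together, the docstrings above describe the intended flow of this wrapper: initialize the corpus (a dict, pandas DataFrame, or HuggingFace Dataset), which embeds the chosen `section` column and adds a FAISS index over it, then call `search()` with a query string. A minimal usage sketch of that flow follows; the toy corpus, the query sentence, and the choice of `mean_pooling` are illustrative assumptions, not part of the package.

from nltkor.search.faiss_search import FaissSearch

# Illustrative corpus; any mapping with the chosen `section` key works per initialize_corpus().
corpus = {
    'text': [
        '서울은 한국의 수도이다.',                    # "Seoul is the capital of Korea."
        '파리는 프랑스의 수도이다.',                  # "Paris is the capital of France."
        'FAISS는 벡터 유사도 검색 라이브러리이다.',   # "FAISS is a vector similarity search library."
    ]
}

searcher = FaissSearch(model_name_or_path='klue/bert-base', device='cpu')

# Embeds each 'text' entry, stores the vectors in an 'embeddings' column,
# and adds a FAISS index over that column.
searcher.initialize_corpus(corpus, section='text', embedding_type='mean_pooling')

# Returns a pandas DataFrame of the k nearest rows plus a 'score' column,
# sorted ascending (closest first for the default flat L2 index).
results = searcher.search(query='한국의 수도는 어디인가?', k=2)
print(results[['text', 'score']])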
nltkor/search/kobert_tokenizer.py ADDED
@@ -0,0 +1,181 @@
+ # coding=utf-8
+ # Copyright 2021 SKT AI Authors.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from typing import Any, Dict, List, Optional
+ from nltkor.make_requirement import make_requirement
+ try:
+     from transformers.tokenization_utils import AddedToken
+     from transformers import XLNetTokenizer
+     from transformers import SPIECE_UNDERLINE
+     import sentencepiece
+ except ImportError:
+     requirement = ['transformers>=4.8.2', 'sentencepiece']
+     file_path = make_requirement(requirement)
+     raise Exception(f"""
+     Need to install Libraries, please pip install below libraries
+     \t pip install transformers>=4.8.2
+     \t pip install sentencepiece
+     Or, use pip install requirement.txt
+     \t pip install -r {file_path}
+     """)
+
+ class KoBERTTokenizer(XLNetTokenizer):
+     padding_side = "right"
+
+     def __init__(
+         self,
+         vocab_file,
+         do_lower_case=False,
+         remove_space=True,
+         keep_accents=False,
+         bos_token="[CLS]",
+         eos_token="[SEP]",
+         unk_token="[UNK]",
+         sep_token="[SEP]",
+         pad_token="[PAD]",
+         cls_token="[CLS]",
+         mask_token="[MASK]",
+         additional_special_tokens=None,
+         sp_model_kwargs: Optional[Dict[str, Any]] = None,
+         **kwargs
+     ) -> None:
+         # Mask token behave like a normal word, i.e. include the space before it
+         mask_token = (
+             AddedToken(mask_token, lstrip=True, rstrip=False)
+             if isinstance(mask_token, str)
+             else mask_token
+         )
+
+         self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+
+         super().__init__(
+             vocab_file,
+             do_lower_case=do_lower_case,
+             remove_space=remove_space,
+             keep_accents=keep_accents,
+             bos_token=bos_token,
+             eos_token=eos_token,
+             unk_token=unk_token,
+             sep_token=sep_token,
+             pad_token=pad_token,
+             cls_token=cls_token,
+             mask_token=mask_token,
+             additional_special_tokens=additional_special_tokens,
+             sp_model_kwargs=self.sp_model_kwargs,
+             **kwargs,
+         )
+         self._pad_token_type_id = 0
+
+     def build_inputs_with_special_tokens(
+         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+     ) -> List[int]:
+         """
+         Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
+         adding special tokens. An XLNet sequence has the following format:
+         - single sequence: ``<cls> X <sep>``
+         - pair of sequences: ``<cls> A <sep> B <sep>``
+         Args:
+             token_ids_0 (:obj:`List[int]`):
+                 List of IDs to which the special tokens will be added.
+             token_ids_1 (:obj:`List[int]`, `optional`):
+                 Optional second list of IDs for sequence pairs.
+         Returns:
+             :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+         """
+         sep = [self.sep_token_id]
+         cls = [self.cls_token_id]
+         if token_ids_1 is None:
+             return cls + token_ids_0 + sep
+         return cls + token_ids_0 + sep + token_ids_1 + sep
+
+     def _tokenize(self, text: str) -> List[str]:
+         """Tokenize a string."""
+         text = self.preprocess_text(text)
+         pieces = self.sp_model.encode(text, out_type=str, **self.sp_model_kwargs)
+         new_pieces = []
+         for piece in pieces:
+             if len(piece) > 1 and piece[-1] == str(",") and piece[-2].isdigit():
+                 cur_pieces = self.sp_model.EncodeAsPieces(
+                     piece[:-1].replace(SPIECE_UNDERLINE, "")
+                 )
+                 if (
+                     piece[0] != SPIECE_UNDERLINE
+                     and cur_pieces[0][0] == SPIECE_UNDERLINE
+                 ):
+                     if len(cur_pieces[0]) == 1:
+                         cur_pieces = cur_pieces[1:]
+                     else:
+                         cur_pieces[0] = cur_pieces[0][1:]
+                 cur_pieces.append(piece[-1])
+                 new_pieces.extend(cur_pieces)
+             else:
+                 new_pieces.append(piece)
+
+         return new_pieces
+
+     def build_inputs_with_special_tokens(
+         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+     ) -> List[int]:
+         """
+         Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
+         adding special tokens. An XLNet sequence has the following format:
+
+         - single sequence: ``<cls> X <sep> ``
+         - pair of sequences: ``<cls> A <sep> B <sep>``
+
+         Args:
+             token_ids_0 (:obj:`List[int]`):
+                 List of IDs to which the special tokens will be added.
+             token_ids_1 (:obj:`List[int]`, `optional`):
+                 Optional second list of IDs for sequence pairs.
+
+         Returns:
+             :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+         """
+         sep = [self.sep_token_id]
+         cls = [self.cls_token_id]
+         if token_ids_1 is None:
+             return cls + token_ids_0 + sep
+         return cls + token_ids_0 + sep + token_ids_1 + sep
+
+     def create_token_type_ids_from_sequences(
+         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+     ) -> List[int]:
+         """
+         Create a mask from the two sequences passed to be used in a sequence-pair classification task. An XLNet
+         sequence pair mask has the following format:
+
+         ::
+
+             0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+             | first sequence    | second sequence |
+
+         If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s).
+
+         Args:
+             token_ids_0 (:obj:`List[int]`):
+                 List of IDs.
+             token_ids_1 (:obj:`List[int]`, `optional`):
+                 Optional second list of IDs for sequence pairs.
+
+         Returns:
+             :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
+             sequence(s).
+         """
+         sep = [self.sep_token_id]
+         cls = [self.cls_token_id]
+         if token_ids_1 is None:
+             return len(cls + token_ids_0 + sep) * [0]
+         return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
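
As the docstrings above spell out, KoBERTTokenizer wraps a single sequence as ``<cls> X <sep>`` and a pair as ``<cls> A <sep> B <sep>``, with token type ids of 0 over the first segment (CLS, A, and its SEP) and 1 over the second (B and its SEP). A short sketch of those two helpers, assuming the 'skt/kobert-base-v1' checkpoint referenced in faiss_search.py above can be fetched via from_pretrained(); the sample sentences are arbitrary.

from nltkor.search.kobert_tokenizer import KoBERTTokenizer

tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')

# Plain token ids for two segments, without any special tokens yet.
ids_a = tokenizer.convert_tokens_to_ids(tokenizer.tokenize('한국어 모델을 공유합니다.'))
ids_b = tokenizer.convert_tokens_to_ids(tokenizer.tokenize('두 번째 문장입니다.'))

# [CLS] A [SEP] B [SEP]
input_ids = tokenizer.build_inputs_with_special_tokens(ids_a, ids_b)

# 0s over [CLS] + A + [SEP], then 1s over B + [SEP]
token_type_ids = tokenizer.create_token_type_ids_from_sequences(ids_a, ids_b)

assert len(input_ids) == len(token_type_ids)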
nltkor/sejong/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from nltkor.sejong.sejong_download import SejongDir
+
+ __all__=['ssem']
nltkor/sejong/ch.py ADDED
@@ -0,0 +1,12 @@
+ import os
+ import unicodedata
+ import sys
+
+
+ # Rename every entry in the Sejong noun directory to its NFD-normalized filename.
+ base_dir = '/01. 체언_상세//'
+ for filename in os.listdir(base_dir):
+     new_filename = unicodedata.normalize('NFD', filename)
+     os.rename(os.path.join(base_dir, filename), os.path.join(base_dir, new_filename))
+
+
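
For context on what ch.py is doing: macOS, the platform this cp39 wheel targets, has historically stored filenames in decomposed (NFD) form, so precomposed Hangul syllables are split into individual Jamo and no longer compare equal to their NFC spellings. A standalone illustration of the difference the script normalizes away; the sample string is simply the directory name used above.

import unicodedata

name = '체언_상세'                          # precomposed Hangul (NFC), 5 code points
nfd = unicodedata.normalize('NFD', name)    # decomposed Jamo, 11 code points

print(len(name), len(nfd))                         # 5 11
print(name == nfd)                                 # False: visually identical, different code points
print(unicodedata.normalize('NFC', nfd) == name)   # True once re-composed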