nltkor-1.2.0-cp39-cp39-macosx_10_9_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76)
  1. nltkor/Kor_char.py +193 -0
  2. nltkor/__init__.py +15 -0
  3. nltkor/alignment/__init__.py +1315 -0
  4. nltkor/cider/__init__.py +2 -0
  5. nltkor/cider/cider.py +55 -0
  6. nltkor/cider/cider_scorer.py +207 -0
  7. nltkor/distance/__init__.py +441 -0
  8. nltkor/distance/wasserstein.py +126 -0
  9. nltkor/etc.py +22 -0
  10. nltkor/lazyimport.py +144 -0
  11. nltkor/make_requirement.py +11 -0
  12. nltkor/metrics/__init__.py +63 -0
  13. nltkor/metrics/bartscore.py +301 -0
  14. nltkor/metrics/bertscore.py +331 -0
  15. nltkor/metrics/bleu_tensor.py +20 -0
  16. nltkor/metrics/classical.py +814 -0
  17. nltkor/metrics/entment.py +24 -0
  18. nltkor/metrics/eval.py +517 -0
  19. nltkor/metrics/mauve.py +273 -0
  20. nltkor/metrics/mauve_utils.py +131 -0
  21. nltkor/misc/__init__.py +11 -0
  22. nltkor/misc/string2string_basic_functions.py +59 -0
  23. nltkor/misc/string2string_default_tokenizer.py +83 -0
  24. nltkor/misc/string2string_hash_functions.py +159 -0
  25. nltkor/misc/string2string_word_embeddings.py +503 -0
  26. nltkor/search/__init__.py +10 -0
  27. nltkor/search/classical.py +569 -0
  28. nltkor/search/faiss_search.py +467 -0
  29. nltkor/search/kobert_tokenizer.py +181 -0
  30. nltkor/sejong/__init__.py +3 -0
  31. nltkor/sejong/ch.py +12 -0
  32. nltkor/sejong/dict_semClassNum.txt +491 -0
  33. nltkor/sejong/layer.txt +630 -0
  34. nltkor/sejong/sejong_download.py +87 -0
  35. nltkor/sejong/ssem.py +685 -0
  36. nltkor/similarity/__init__.py +3 -0
  37. nltkor/similarity/bartscore____.py +337 -0
  38. nltkor/similarity/bertscore____.py +339 -0
  39. nltkor/similarity/classical.py +245 -0
  40. nltkor/similarity/cosine_similarity.py +175 -0
  41. nltkor/tag/__init__.py +70 -0
  42. nltkor/tag/espresso_tag.py +220 -0
  43. nltkor/tag/libs/__init__.py +9 -0
  44. nltkor/tag/libs/arguments.py +280 -0
  45. nltkor/tag/libs/attributes.py +231 -0
  46. nltkor/tag/libs/config.py +158 -0
  47. nltkor/tag/libs/metadata.py +129 -0
  48. nltkor/tag/libs/ner/__init__.py +2 -0
  49. nltkor/tag/libs/ner/macmorphoreader.py +7 -0
  50. nltkor/tag/libs/ner/ner_reader.py +92 -0
  51. nltkor/tag/libs/network.c +59267 -0
  52. nltkor/tag/libs/network.cpython-39-darwin.so +0 -0
  53. nltkor/tag/libs/parse/__init__.py +1 -0
  54. nltkor/tag/libs/parse/parse_reader.py +283 -0
  55. nltkor/tag/libs/pos/__init__.py +2 -0
  56. nltkor/tag/libs/pos/macmorphoreader.py +7 -0
  57. nltkor/tag/libs/pos/pos_reader.py +89 -0
  58. nltkor/tag/libs/reader.py +510 -0
  59. nltkor/tag/libs/srl/__init__.py +3 -0
  60. nltkor/tag/libs/srl/__srl_reader_.py +535 -0
  61. nltkor/tag/libs/srl/srl_reader.py +436 -0
  62. nltkor/tag/libs/srl/train_srl.py +87 -0
  63. nltkor/tag/libs/taggers.py +926 -0
  64. nltkor/tag/libs/utils.py +344 -0
  65. nltkor/tag/libs/word_dictionary.py +239 -0
  66. nltkor/tag/libs/wsd/__init__.py +2 -0
  67. nltkor/tag/libs/wsd/macmorphoreader.py +7 -0
  68. nltkor/tag/libs/wsd/wsd_reader.py +93 -0
  69. nltkor/tokenize/__init__.py +62 -0
  70. nltkor/tokenize/ko_tokenize.py +115 -0
  71. nltkor/trans.py +121 -0
  72. nltkor-1.2.0.dist-info/LICENSE.txt +1093 -0
  73. nltkor-1.2.0.dist-info/METADATA +33 -0
  74. nltkor-1.2.0.dist-info/RECORD +76 -0
  75. nltkor-1.2.0.dist-info/WHEEL +5 -0
  76. nltkor-1.2.0.dist-info/top_level.txt +1 -0
nltkor/search/faiss_search.py ADDED
@@ -0,0 +1,467 @@
+ """
+ string2string search
+ src = https://github.com/stanfordnlp/string2string
+
+
+ MIT License
+
+ Copyright (c) 2023 Mirac Suzgun
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+
+
+ """
+
+
+ """
+ This module contains a wrapper for the Faiss library by Facebook AI Research.
+ """
+
+ from typing import List, Union, Optional, Dict, Any
+ import os
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+ from nltkor.make_requirement import make_requirement
+ try:
+     import torch
+     from transformers import AutoTokenizer, AutoModel, XLNetTokenizer
+     import pandas as pd
+     from datasets import Dataset
+     # import protobuf
+ except ImportError:
+     requirment = ['torch', 'transformers>=4.8.2', 'pandas', 'datasets', "protobuf", 'sentencepiece']
+     file_path = make_requirement(requirment)
+     raise Exception(f"""
+     Need to install Libraries, please pip install below libraries
+     \t pip install transformers>=4.8.2
+     \t pip install torch
+     \t pip install pandas
+     \t pip install datasets
+     \t pip install protobuf
+     \t pip install sentencepiece
+     Or, use pip install requirement.txt
+     \t pip install -r {file_path}
+     """)
+
+ # from nltk.search.kobert_tokenizer import KoBERTTokenizer
+
+
+
+ # FAISS library wrapper class
+ class FaissSearch:
+     def __init__(self,
+                  model_name_or_path: str = 'klue/bert-base',
+                  tokenizer_name_or_path: str = 'klue/bert-base',
+                  device: str = 'cpu'
+                  ) -> None:
+         r"""
+         This function initializes the wrapper for the FAISS library, which is used to perform semantic search.
+
+
+         .. attention::
+
+             * If you use this class, please make sure to cite the following paper:
+
+             .. code-block:: latex
+
+                 @article{johnson2019billion,
+                     title={Billion-scale similarity search with {GPUs}},
+                     author={Johnson, Jeff and Douze, Matthijs and J{\'e}gou, Herv{\'e}},
+                     journal={IEEE Transactions on Big Data},
+                     volume={7},
+                     number={3},
+                     pages={535--547},
+                     year={2019},
+                     publisher={IEEE}
+                 }
+
+             * The code is based on the following GitHub repository:
+                 https://github.com/facebookresearch/faiss
+
+         Arguments:
+             model_name_or_path (str, optional): The name or path of the model to use. Defaults to 'facebook/bart-large'.
+             tokenizer_name_or_path (str, optional): The name or path of the tokenizer to use. Defaults to 'facebook/bart-large'.
+             device (str, optional): The device to use. Defaults to 'cpu'.
+
+         Returns:
+             None
+         """
+
+         # Set the device
+         self.device = device
+
+         # If the tokenizer is not specified, use the model name or path
+         if tokenizer_name_or_path is None:
+             tokenizer_name_or_path = model_name_or_path
+
+         # Load the tokenizer
+         if tokenizer_name_or_path == 'skt/kobert-base-v1':
+             # self.tokenizer = KoBERTTokenizer.from_pretrained(tokenizer_name_or_path)
+             self.tokenizer = XLNetTokenizer.from_pretrained(tokenizer_name_or_path)
+         else:
+             self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)
+
+         # Load the model
+         self.model = AutoModel.from_pretrained(model_name_or_path).to(self.device)
+
+         # Set the model to evaluation mode (since we do not need the gradients)
+         self.model.eval()
+
+         # Initialize the dataset
+         self.dataset = None
+
+
+
+     # Auxiliary function to get the last hidden state
+     def get_last_hidden_state(self,
+                               embeddings: torch.Tensor,
+                               ) -> torch.Tensor:
+         """
+         This function returns the last hidden state (e.g., [CLS] token's) of the input embeddings.
+
+         Arguments:
+             embeddings (torch.Tensor): The input embeddings.
+
+         Returns:
+             torch.Tensor: The last hidden state.
+         """
+
+         # Get the last hidden state
+         last_hidden_state = embeddings.last_hidden_state
+
+         # Return the last hidden state
+         return last_hidden_state[:, 0, :]
+
+
+
+     # Auxiliary function to get the mean pooling
+     def get_mean_pooling(self,
+                          embeddings: torch.Tensor,
+                          ) -> torch.Tensor:
+         """
+         This function returns the mean pooling of the input embeddings.
+
+         Arguments:
+             embeddings (torch.Tensor): The input embeddings.
+
+         Returns:
+             torch.Tensor: The mean pooling.
+         """
+
+         # Get the mean pooling
+         mean_pooling = embeddings.last_hidden_state.mean(dim=1)
+
+         # Return the mean pooling
+         return mean_pooling
+
+
+
+
+     # Get the embeddings
+     def get_embeddings(self,
+                        text: Union[str, List[str]],
+                        embedding_type: str = 'last_hidden_state',
+                        batch_size: int = 8,
+                        num_workers: int = 4,
+                        ) -> torch.Tensor:
+         """
+         This function returns the embeddings of the input text.
+
+         Arguments:
+             text (Union[str, List[str]]): The input text.
+             embedding_type (str, optional): The type of embedding to use. Defaults to 'last_hidden_state'.
+             batch_size (int, optional): The batch size to use. Defaults to 8.
+             num_workers (int, optional): The number of workers to use. Defaults to 4.
+
+         Returns:
+             torch.Tensor: The embeddings.
+
+         Raises:
+             ValueError: If the embedding type is invalid.
+         """
+
+         # Check if the embedding type is valid
+         if embedding_type not in ['last_hidden_state', 'mean_pooling']:
+             raise ValueError(f'Invalid embedding type: {embedding_type}. Only "last_hidden_state" and "mean_pooling" are supported.')
+
+         # Tokenize the input text
+         encoded_text = self.tokenizer(
+             text,
+             padding=True,
+             truncation=True,
+             return_tensors='pt',
+         )
+
+         # Move the input text to the device
+         encoded_text = encoded_text.to(self.device)
+
+         # encoded_inputs = {k: v.to(self.device) for k, v in encoded_inputs.items()}
+
+         # Get the embeddings
+         with torch.no_grad():
+             embeddings = self.model(**encoded_text)
+
+         # Get the proper embedding type
+         if embedding_type == 'last_hidden_state':
+             # Get the last hidden state
+             embeddings = self.get_last_hidden_state(embeddings)
+         elif embedding_type == 'mean_pooling':
+             # Get the mean pooling
+             embeddings = self.get_mean_pooling(embeddings)
+
+         # Return the embeddings
+         return embeddings
+
+
+
+     # Add FAISS index
+     def add_faiss_index(self,
+                         column_name: str = 'embeddings',
+                         metric_type: Optional[int] = None,
+                         batch_size: int = 8,
+                         **kwargs,
+                         ) -> None:
+         """
+         This function adds a FAISS index to the dataset.
+
+         Arguments:
+             column_name (str, optional): The name of the column containing the embeddings. Defaults to 'embeddings'.
+             index_type (str, optional): The index type to use. Defaults to 'Flat'.
+             metric_type (str, optional): The metric type to use. Defaults to 'L2'.
+
+         Returns:
+             None
+
+         Raises:
+             ValueError: If the dataset is not initialized.
+         """
+
+         # Check if the dataset is initialized
+         if self.dataset is None:
+             raise ValueError('The dataset is not initialized. Please initialize the dataset first.')
+
+         print('Adding FAISS index...')
+         self.dataset.add_faiss_index(
+             column_name,
+             # metric_type=metric_type,
+             # device=self.device,
+             # batch_size=batch_size,
+             faiss_verbose=True,
+             # **kwargs,
+         )
+
+
+     def save_faiss_index(self,
+                          index_name: str,
+                          file_path: str,
+                          ) -> None:
+         """
+         This function saves the FAISS index to the specified file path.
+         * This is a wrapper function for the `save_faiss_index` function in the `Dataset` class.
+
+         Arguments:
+             index_name (str): The name of the FAISS index (e.g., "embeddings")
+             file_path (str): The file path to save the FAISS index.
+
+         Returns:
+             None
+
+         Raises:
+             ValueError: If the dataset is not initialized.
+         """
+
+         # Check if the dataset is initialized
+         if self.dataset is None:
+             raise ValueError('The dataset is not initialized. Please initialize the dataset first.')
+
+         print('Saving FAISS index...')
+         self.dataset.save_faiss_index(index_name=index_name, file=file_path)
+
+
+
+     def load_faiss_index(self,
+                          index_name: str,
+                          file_path: str,
+                          device: str = 'cpu',
+                          ) -> None:
+         """
+         This function loads the FAISS index from the specified file path.
+         * This is a wrapper function for the `load_faiss_index` function in the `Dataset` class.
+
+         Arguments:
+             index_name (str): The name of the FAISS index (e.g., "embeddings")
+             file_path (str): The file path to load the FAISS index from.
+             device (str, optional): The device to use ("cpu" or "cuda") (default: "cpu").
+
+         Returns:
+             None
+
+         Raises:
+             ValueError: If the dataset is not initialized.
+         """
+
+         # Check if the dataset is initialized
+         if self.dataset is None:
+             raise ValueError('The dataset is not initialized. Please initialize the dataset first.')
+
+         print('Loading FAISS index...')
+         self.dataset.load_faiss_index(index_name=index_name, file=file_path, device=device)
+
+
+
+     # Initialize the corpus using a dictionary or pandas DataFrame or HuggingFace Datasets object
+     def initialize_corpus(self,
+                           corpus: Union[Dict[str, List[str]], pd.DataFrame, Dataset],
+                           section: str = 'text',
+                           index_column_name: str = 'embeddings',
+                           embedding_type: str = 'last_hidden_state',
+                           batch_size: Optional[int] = None,
+                           num_workers: Optional[int] = None,
+                           save_path: Optional[str] = None,
+                           ) -> Dataset:
+         """
+         This function initializes a dataset using a dictionary or pandas DataFrame or HuggingFace Datasets object.
+
+         Arguments:
+             dataset_dict (Dict[str, List[str]]): The dataset dictionary.
+             section (str): The section of the dataset to use whose embeddings will be used for semantic search (e.g., 'text', 'title', etc.) (default: 'text').
+             index_column_name (str): The name of the column containing the embeddings (default: 'embeddings')
+             embedding_type (str): The type of embedding to use (default: 'last_hidden_state').
+             batch_size (int, optional): The batch size to use (default: 8).
+             max_length (int, optional): The maximum length of the input sequences.
+             num_workers (int, optional): The number of workers to use.
+             save_path (Optional[str], optional): The path to save the dataset (default: None).
+
+         Returns:
+             Dataset: The dataset object (HuggingFace Datasets).
+
+         Raises:
+             ValueError: If the dataset is not a dictionary or pandas DataFrame or HuggingFace Datasets object.
+         """
+
+         # Create the dataset
+         if isinstance(corpus, dict):
+             self.dataset = Dataset.from_dict(corpus)
+         elif isinstance(corpus, pd.DataFrame):
+             self.dataset = Dataset.from_pandas(corpus)
+         elif isinstance(corpus, Dataset):
+             self.dataset = corpus
+         else:
+             raise ValueError('The dataset must be a dictionary or pandas DataFrame.')
+
+         # Set the embedding_type
+         self.embedding_type = embedding_type
+
+
+         # Tokenize the dataset
+         # self.dataset = self.dataset.map(
+         #     lambda x: x[section],
+         #     batched=True,
+         #     batch_size=batch_size,
+         #     num_proc=num_workers,
+         # )
+
+         # Map the section of the dataset to the embeddings
+         self.dataset = self.dataset.map(
+             lambda x: {
+                 index_column_name: self.get_embeddings(x[section], embedding_type=self.embedding_type).detach().cpu().numpy()[0]
+             },
+             # batched=True,
+             batch_size=batch_size,
+             num_proc=num_workers,
+         )
+
+         # Save the dataset
+         if save_path is not None:
+             self.dataset.to_json(save_path)
+
+         # Add FAISS index
+         self.add_faiss_index(
+             column_name=index_column_name,
+         )
+
+         # Return the dataset
+         return self.dataset
+
+
+
+     # Initialize the dataset using a JSON file
+     def load_dataset_from_json(self,
+                                json_path: str,
+                                ) -> Dataset:
+         """
+         This function loads a dataset from a JSON file.
+
+         Arguments:
+             json_path (str): The path to the JSON file.
+
+         Returns:
+             Dataset: The dataset.
+         """
+
+         # Load the dataset
+         self.dataset = Dataset.from_json(json_path)
+
+         # Return the dataset
+         return self.dataset
+
+
+
+     # Search for the most similar elements in the dataset, given a query
+     def search(self,
+                query: str,
+                k: int = 1,
+                index_column_name: str = 'embeddings',
+                ) -> pd.DataFrame:
+         """
+         This function searches for the most similar elements in the dataset, given a query.
+
+         Arguments:
+             query (str): The query.
+             k (int, optional): The number of elements to return (default: 1).
+             index_column_name (str, optional): The name of the column containing the embeddings (default: 'embeddings')
+
+         Returns:
+             pd.DataFrame: The most similar elements in the dataset (text, score, etc.), sorted by score.
+
+         Remarks:
+             The returned elements are dictionaries containing the text and the score.
+         """
+
+         # Get the embeddings of the query
+         query_embeddings = self.get_embeddings([query], embedding_type=self.embedding_type).detach().cpu().numpy()
+
+         # Search for the most similar elements in the dataset
+         scores, similar_elts = self.dataset.get_nearest_examples(
+             index_name=index_column_name,
+             query=query_embeddings,
+             k=k,
+         )
+
+         # Convert the results to a pandas DataFrame
+         results_df = pd.DataFrame.from_dict(similar_elts)
+
+         # Add the scores
+         results_df['score'] = scores
+
+         # Sort the results by score
+         results_df.sort_values("score", ascending=True, inplace=True)
+
+         # Return the most similar elements
+         return results_df
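
Read together, the docstrings above describe the intended flow of this wrapper: initialize the corpus (a dict, pandas DataFrame, or HuggingFace Dataset), which embeds the chosen `section` column and adds a FAISS index over it, then call `search()` with a query string. A minimal usage sketch of that flow follows; the toy corpus, the query sentence, and the choice of `mean_pooling` are illustrative assumptions, not part of the package.

from nltkor.search.faiss_search import FaissSearch

# Illustrative corpus; any mapping with the chosen `section` key works per initialize_corpus().
corpus = {
    'text': [
        '서울은 한국의 수도이다.',                    # "Seoul is the capital of Korea."
        '파리는 프랑스의 수도이다.',                  # "Paris is the capital of France."
        'FAISS는 벡터 유사도 검색 라이브러리이다.',   # "FAISS is a vector similarity search library."
    ]
}

searcher = FaissSearch(model_name_or_path='klue/bert-base', device='cpu')

# Embeds each 'text' entry, stores the vectors in an 'embeddings' column,
# and adds a FAISS index over that column.
searcher.initialize_corpus(corpus, section='text', embedding_type='mean_pooling')

# Returns a pandas DataFrame of the k nearest rows plus a 'score' column,
# sorted ascending (closest first for the default flat L2 index).
results = searcher.search(query='한국의 수도는 어디인가?', k=2)
print(results[['text', 'score']])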
nltkor/search/kobert_tokenizer.py ADDED
@@ -0,0 +1,181 @@
+ # coding=utf-8
+ # Copyright 2021 SKT AI Authors.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from typing import Any, Dict, List, Optional
+ from nltkor.make_requirement import make_requirement
+ try:
+     from transformers.tokenization_utils import AddedToken
+     from transformers import XLNetTokenizer
+     from transformers import SPIECE_UNDERLINE
+     import sentencepiece
+ except ImportError:
+     requirement = ['transformers>=4.8.2', 'sentencepiece']
+     file_path = make_requirement(requirement)
+     raise Exception(f"""
+     Need to install Libraries, please pip install below libraries
+     \t pip install transformers>=4.8.2
+     \t pip install sentencepiece
+     Or, use pip install requirement.txt
+     \t pip install -r {file_path}
+     """)
+
+ class KoBERTTokenizer(XLNetTokenizer):
+     padding_side = "right"
+
+     def __init__(
+         self,
+         vocab_file,
+         do_lower_case=False,
+         remove_space=True,
+         keep_accents=False,
+         bos_token="[CLS]",
+         eos_token="[SEP]",
+         unk_token="[UNK]",
+         sep_token="[SEP]",
+         pad_token="[PAD]",
+         cls_token="[CLS]",
+         mask_token="[MASK]",
+         additional_special_tokens=None,
+         sp_model_kwargs: Optional[Dict[str, Any]] = None,
+         **kwargs
+     ) -> None:
+         # Mask token behave like a normal word, i.e. include the space before it
+         mask_token = (
+             AddedToken(mask_token, lstrip=True, rstrip=False)
+             if isinstance(mask_token, str)
+             else mask_token
+         )
+
+         self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+
+         super().__init__(
+             vocab_file,
+             do_lower_case=do_lower_case,
+             remove_space=remove_space,
+             keep_accents=keep_accents,
+             bos_token=bos_token,
+             eos_token=eos_token,
+             unk_token=unk_token,
+             sep_token=sep_token,
+             pad_token=pad_token,
+             cls_token=cls_token,
+             mask_token=mask_token,
+             additional_special_tokens=additional_special_tokens,
+             sp_model_kwargs=self.sp_model_kwargs,
+             **kwargs,
+         )
+         self._pad_token_type_id = 0
+
+     def build_inputs_with_special_tokens(
+         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+     ) -> List[int]:
+         """
+         Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
+         adding special tokens. An XLNet sequence has the following format:
+         - single sequence: ``<cls> X <sep>``
+         - pair of sequences: ``<cls> A <sep> B <sep>``
+         Args:
+             token_ids_0 (:obj:`List[int]`):
+                 List of IDs to which the special tokens will be added.
+             token_ids_1 (:obj:`List[int]`, `optional`):
+                 Optional second list of IDs for sequence pairs.
+         Returns:
+             :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+         """
+         sep = [self.sep_token_id]
+         cls = [self.cls_token_id]
+         if token_ids_1 is None:
+             return cls + token_ids_0 + sep
+         return cls + token_ids_0 + sep + token_ids_1 + sep
+
+     def _tokenize(self, text: str) -> List[str]:
+         """Tokenize a string."""
+         text = self.preprocess_text(text)
+         pieces = self.sp_model.encode(text, out_type=str, **self.sp_model_kwargs)
+         new_pieces = []
+         for piece in pieces:
+             if len(piece) > 1 and piece[-1] == str(",") and piece[-2].isdigit():
+                 cur_pieces = self.sp_model.EncodeAsPieces(
+                     piece[:-1].replace(SPIECE_UNDERLINE, "")
+                 )
+                 if (
+                     piece[0] != SPIECE_UNDERLINE
+                     and cur_pieces[0][0] == SPIECE_UNDERLINE
+                 ):
+                     if len(cur_pieces[0]) == 1:
+                         cur_pieces = cur_pieces[1:]
+                     else:
+                         cur_pieces[0] = cur_pieces[0][1:]
+                 cur_pieces.append(piece[-1])
+                 new_pieces.extend(cur_pieces)
+             else:
+                 new_pieces.append(piece)
+
+         return new_pieces
+
+     def build_inputs_with_special_tokens(
+         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+     ) -> List[int]:
+         """
+         Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
+         adding special tokens. An XLNet sequence has the following format:
+
+         - single sequence: ``<cls> X <sep> ``
+         - pair of sequences: ``<cls> A <sep> B <sep>``
+
+         Args:
+             token_ids_0 (:obj:`List[int]`):
+                 List of IDs to which the special tokens will be added.
+             token_ids_1 (:obj:`List[int]`, `optional`):
+                 Optional second list of IDs for sequence pairs.
+
+         Returns:
+             :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+         """
+         sep = [self.sep_token_id]
+         cls = [self.cls_token_id]
+         if token_ids_1 is None:
+             return cls + token_ids_0 + sep
+         return cls + token_ids_0 + sep + token_ids_1 + sep
+
+     def create_token_type_ids_from_sequences(
+         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+     ) -> List[int]:
+         """
+         Create a mask from the two sequences passed to be used in a sequence-pair classification task. An XLNet
+         sequence pair mask has the following format:
+
+         ::
+
+             0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+             | first sequence    | second sequence |
+
+         If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s).
+
+         Args:
+             token_ids_0 (:obj:`List[int]`):
+                 List of IDs.
+             token_ids_1 (:obj:`List[int]`, `optional`):
+                 Optional second list of IDs for sequence pairs.
+
+         Returns:
+             :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
+             sequence(s).
+         """
+         sep = [self.sep_token_id]
+         cls = [self.cls_token_id]
+         if token_ids_1 is None:
+             return len(cls + token_ids_0 + sep) * [0]
+         return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
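
As the docstrings above spell out, KoBERTTokenizer wraps a single sequence as ``<cls> X <sep>`` and a pair as ``<cls> A <sep> B <sep>``, with token type ids of 0 over the first segment (CLS, A, and its SEP) and 1 over the second (B and its SEP). A short sketch of those two helpers, assuming the 'skt/kobert-base-v1' checkpoint referenced in faiss_search.py above can be fetched via from_pretrained(); the sample sentences are arbitrary.

from nltkor.search.kobert_tokenizer import KoBERTTokenizer

tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')

# Plain token ids for two segments, without any special tokens yet.
ids_a = tokenizer.convert_tokens_to_ids(tokenizer.tokenize('한국어 모델을 공유합니다.'))
ids_b = tokenizer.convert_tokens_to_ids(tokenizer.tokenize('두 번째 문장입니다.'))

# [CLS] A [SEP] B [SEP]
input_ids = tokenizer.build_inputs_with_special_tokens(ids_a, ids_b)

# 0s over [CLS] + A + [SEP], then 1s over B + [SEP]
token_type_ids = tokenizer.create_token_type_ids_from_sequences(ids_a, ids_b)

assert len(input_ids) == len(token_type_ids)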
nltkor/sejong/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from nltkor.sejong.sejong_download import SejongDir
+
+ __all__=['ssem']
nltkor/sejong/ch.py ADDED
@@ -0,0 +1,12 @@
+ import os
+ import unicodedata
+ import sys
+
+
+ # Rename every entry in the Sejong noun directory to its NFD-normalized filename.
+ base_dir = '/01. 체언_상세//'
+ for filename in os.listdir(base_dir):
+     new_filename = unicodedata.normalize('NFD', filename)
+     os.rename(os.path.join(base_dir, filename), os.path.join(base_dir, new_filename))
+
+
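
For context on what ch.py is doing: macOS, the platform this cp39 wheel targets, has historically stored filenames in decomposed (NFD) form, so precomposed Hangul syllables are split into individual Jamo and no longer compare equal to their NFC spellings. A standalone illustration of the difference the script normalizes away; the sample string is simply the directory name used above.

import unicodedata

name = '체언_상세'                          # precomposed Hangul (NFC), 5 code points
nfd = unicodedata.normalize('NFD', name)    # decomposed Jamo, 11 code points

print(len(name), len(nfd))                         # 5 11
print(name == nfd)                                 # False: visually identical, different code points
print(unicodedata.normalize('NFC', nfd) == name)   # True once re-composed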