nltkor-1.2.18-cp39-cp39-macosx_10_9_universal2.whl → nltkor-1.2.20-cp39-cp39-macosx_10_9_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
nltkor/__init__.py CHANGED
@@ -13,4 +13,4 @@ from nltkor import trans
  from nltkor import Kor_char
  from nltkor import etc
 
- __version__ = '1.2.18'
+ __version__ = '1.2.20'
nltkor/search/__init__.py CHANGED
@@ -8,4 +8,4 @@ from .classical import (
  )
  from .faiss_search import FaissSearch
  from .kobert_tokenizer import KoBERTTokenizer
- from .search_dict import SearchDic
+ from .trie_search import TRIESearch
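Aside (illustrative, not part of the diff): after this rename, downstream code imports TRIESearch instead of SearchDic. A minimal sketch, assuming nltkor.search re-exports both names as the hunk above shows:

    # Illustrative imports only; SearchDic is no longer exported as of 1.2.20.
    from nltkor.search import FaissSearch, TRIESearch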
@@ -68,24 +68,22 @@ except ImportError:
 
  class FaissSearch:
  def __new__(cls,
- mode = None,
+ mode = 'dense',
  model_name_or_path: str = 'klue/bert-base',
  tokenizer_name_or_path: str = 'klue/bert-base',
  embedding_type: str = 'last_hidden_state',
  device: str = 'cpu'
  ) -> None:
- if mode == 'sentence':
- return FaissSearch_SenEmbed(model_name_or_path=model_name_or_path, embedding_type=embedding_type)
- elif mode == 'word':
- return FaissSearch_WordEmbed(model_name_or_path=model_name_or_path, embedding_type=embedding_type)
+ if mode == 'dense':
+ return FaissSearch_Dense(model_name_or_path=model_name_or_path, embedding_type=embedding_type)
  elif mode == 'sparse':
  return FaissSearch_Sparse(model_name_or_path=model_name_or_path, embedding_type=embedding_type)
  else:
- raise ValueError("choice 'sentence' or 'word' or 'sparse'")
+ raise ValueError("choice 'dense' or 'sparse'.")
 
 
 
- class FaissSearch_SenEmbed:
+ class FaissSearch_Dense:
  def __init__(self,
  model_name_or_path: str = 'klue/bert-base',
  tokenizer_name_or_path: str = 'klue/bert-base',
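Usage sketch (illustrative, not part of the diff): in 1.2.20 the FaissSearch factory dispatches on mode='dense' (now the default) or mode='sparse'; the former 'sentence' and 'word' modes are gone, and any other value raises ValueError. Per the hunk, only model_name_or_path and embedding_type are forwarded to the returned class. The arguments below are the defaults visible above:

    from nltkor.search import FaissSearch

    # Returns a FaissSearch_Dense instance; omitting mode has the same effect.
    dense_search = FaissSearch(mode='dense',
                               model_name_or_path='klue/bert-base',
                               embedding_type='last_hidden_state')

    # Returns a FaissSearch_Sparse instance.
    sparse_search = FaissSearch(mode='sparse')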
@@ -474,7 +472,7 @@ class FaissSearch_SenEmbed:
 
 
 
- class FaissSearch_Sparse(FaissSearch_SenEmbed):
+ class FaissSearch_Sparse(FaissSearch_Dense):
  def __init__(self,
  model_name_or_path: str = 'klue/bert-base',
  tokenizer_name_or_path: str = 'klue/bert-base',
@@ -586,312 +584,7 @@ class FaissSearch_Sparse(FaissSearch_SenEmbed):
  embeddings = embeddings['logits']
 
  embeddings = torch.sum(torch.log(1+torch.relu(embeddings)) * encoded_text['attention_mask'].unsqueeze(-1), dim=1)
- e_norm = torch.nn.functional.normalize(embeddings, p=2, dim=1, eps=1e-8)
 
  # Return the embeddings
- return e_norm
-
-
-
- # FAISS word embedding library wrapper class
- class FaissSearch_WordEmbed(FaissSearch_SenEmbed):
- def __init__(self,
- model_name_or_path: str = 'klue/bert-base',
- tokenizer_name_or_path: str = 'klue/bert-base',
- embedding_type: str = 'last_hidden_state',
- device: str = 'cpu',
- ) -> None:
- r"""
- This function initializes the wrapper for the FAISS library, which is used to perform semantic search.
-
-
- .. attention::
-
- * If you use this class, please make sure to cite the following paper:
-
- .. code-block:: latex
-
- @article{johnson2019billion,
- title={Billion-scale similarity search with {GPUs}},
- author={Johnson, Jeff and Douze, Matthijs and J{\'e}gou, Herv{\'e}},
- journal={IEEE Transactions on Big Data},
- volume={7},
- number={3},
- pages={535--547},
- year={2019},
- publisher={IEEE}
- }
-
- * The code is based on the following GitHub repository:
- https://github.com/facebookresearch/faiss
-
- Arguments:
- model_name_or_path (str, optional): The name or path of the model to use. Defaults to 'facebook/bart-large'.
- tokenizer_name_or_path (str, optional): The name or path of the tokenizer to use. Defaults to 'facebook/bart-large'.
- device (str, optional): The device to use. Defaults to 'cpu'.
-
- Returns:
- None
- """
-
- # Set the device
- self.device = device
-
- # If the tokenizer is not specified, use the model name or path
- if tokenizer_name_or_path is None:
- tokenizer_name_or_path = model_name_or_path
-
- # Load the tokenizer
- if tokenizer_name_or_path == 'skt/kobert-base-v1':
- # self.tokenizer = KoBERTTokenizer.from_pretrained(tokenizer_name_or_path)
- self.tokenizer = XLNetTokenizer.from_pretrained(tokenizer_name_or_path)
- else:
- self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)
-
- # Load the model
- self.model = AutoModel.from_pretrained(model_name_or_path).to(self.device)
-
-
- # Set the model to evaluation mode (since we do not need the gradients)
- self.model.eval()
-
- # Initialize the dataset
- self.dataset = None
-
-
- # Get the embeddings (new code)
- def get_doc_embeddings(self,
- #text: Union[str, List[str]],
- text=None,
- embedding_type: str = 'last_hidden_state',
- batch_size: int = 8,
- num_workers: int = 4,
- ) -> torch.Tensor:
- """
- This function returns the embeddings of the input text.
-
- Arguments:
- text (Union[str, List[str]]): The input text.
- embedding_type (str, optional): The type of embedding to use. Defaults to 'last_hidden_state'.
- batch_size (int, optional): The batch size to use. Defaults to 8.
- num_workers (int, optional): The number of workers to use. Defaults to 4.
-
- Returns:
- torch.Tensor: The embeddings.
-
- Raises:
- ValueError: If the embedding type is invalid.
- """
-
- # Check if the embedding type is valid
- if embedding_type not in ['last_hidden_state', 'mean_pooling']:
- raise ValueError(f'Invalid embedding type: {embedding_type}. Only "last_hidden_state" and "mean_pooling" are supported.')
-
- ids_dict = {}
- # Tokenize the input text
- for sentence in text['text']:
- encoded_text = self.tokenizer(
- sentence,
- padding=False,
- truncation=True,
- return_tensors='pt',
- add_special_tokens=False
- )
- # Move the input text to the device
- encoded_text = encoded_text.to(self.device)
- token_ids_list = encoded_text['input_ids'].tolist()
- token_ids_list = token_ids_list[0]
- for ids in token_ids_list:
- if ids not in ids_dict.keys():
- ids_dict[ids] = [sentence]
- else:
- if text not in ids_dict[ids]:
- ids_dict[ids].append(sentence)
- # Get the embeddings
- embedding_dict = {}
- self.model.eval()
- for key, value in ids_dict.items():
- embed = self.model(torch.tensor([[key]]), output_hidden_states=True).hidden_states[-1][:,0,:].detach()
- embedding_dict[embed] = value
-
- # Return the embeddings
- return embedding_dict
-
-
- # Get the embeddings (new code)
- def get_query_embeddings(self,
- text: Union[str, List[str]],
- embedding_type: str = 'last_hidden_state',
- batch_size: int = 8,
- num_workers: int = 4,
- ) -> torch.Tensor:
- """
- This function returns the embeddings of the input text.
-
- Arguments:
- text (Union[str, List[str]]): The input text.
- embedding_type (str, optional): The type of embedding to use. Defaults to 'last_hidden_state'.
- batch_size (int, optional): The batch size to use. Defaults to 8.
- num_workers (int, optional): The number of workers to use. Defaults to 4.
-
- Returns:
- torch.Tensor: The embeddings.
-
- Raises:
- ValueError: If the embedding type is invalid.
- """
-
- # Check if the embedding type is valid
- if embedding_type not in ['last_hidden_state', 'mean_pooling']:
- raise ValueError(f'Invalid embedding type: {embedding_type}. Only "last_hidden_state" and "mean_pooling" are supported.')
-
- # Tokenize the input text
- encoded_text = self.tokenizer(
- text,
- padding=False,
- truncation=True,
- return_tensors='pt',
- add_special_tokens=False,
- )
-
- # Move the input text to the device
- encoded_text = encoded_text.to(self.device)
-
- token_ids_list = encoded_text['input_ids'].tolist()
- token_ids_list = token_ids_list[0]
- tensor_list = [torch.tensor([[value]]) for value in token_ids_list]
-
- # Get the embeddings
- embeds = []
- self.model.eval()
- for index, tensor in enumerate(tensor_list):
- embed = self.model(tensor, output_hidden_states=True).hidden_states[-1][:,0,:].detach().cpu().numpy()
- embeds.append(embed)
-
- # Return the embeddings
- return embeds
-
-
- # Initialize the corpus using a dictionary or pandas DataFrame or HuggingFace Datasets object
- def initialize_corpus(self,
- corpus: Union[Dict[str, List[str]], pd.DataFrame, Dataset],
- section: str = 'text',
- index_column_name: str = 'embeddings',
- embedding_type: str = 'last_hidden_state',
- batch_size: Optional[int] = None,
- num_workers: Optional[int] = None,
- save_path: Optional[str] = None,
- ) -> Dataset:
- """
- This function initializes a dataset using a dictionary or pandas DataFrame or HuggingFace Datasets object.
-
- Arguments:
- dataset_dict (Dict[str, List[str]]): The dataset dictionary.
- section (str): The section of the dataset to use whose embeddings will be used for semantic search (e.g., 'text', 'title', etc.) (default: 'text').
- index_column_name (str): The name of the column containing the embeddings (default: 'embeddings')
- embedding_type (str): The type of embedding to use (default: 'last_hidden_state').
- batch_size (int, optional): The batch size to use (default: 8).
- max_length (int, optional): The maximum length of the input sequences.
- num_workers (int, optional): The number of workers to use.
- save_path (Optional[str], optional): The path to save the dataset (default: None).
-
- Returns:
- Dataset: The dataset object (HuggingFace Datasets).
-
- Raises:
- ValueError: If the dataset is not a dictionary or pandas DataFrame or HuggingFace Datasets object.
- """
-
- # corpus = { 'text': [...] } -> form_dict
-
- # Set the embedding_type
- self.embedding_type = embedding_type
-
- # get embedding dict
- embedding_dict = self.get_doc_embeddings(text=corpus, embedding_type=self.embedding_type)
-
- data = {
- 'text' : embedding_dict.values(),
- 'embeddings': []
- }
-
- for embed in embedding_dict.keys():
- embed_list = embed.tolist()
- data['embeddings'].append(embed_list[0])
-
-
- if isinstance(data, dict):
- self.dataset = Dataset.from_dict(data)
- elif isinstance(data, pd.DataFrame):
- self.dataset = Dataset.from_pandas(data)
- elif isinstance(data, Dataset):
- self.dataset = corpus
- else:
- raise ValueError('The dataset must be a dictionary or pandas DataFrame.')
-
- # Save the dataset
- if save_path is not None:
- self.dataset.to_json(save_path)
-
- # Add FAISS index
- self.add_faiss_index(
- column_name=index_column_name,
- )
-
- # Return the dataset
- return self.dataset
-
-
- # Search for the most similar elements in the dataset, given a query
- def search(self,
- query: str,
- k: int = 1,
- index_column_name: str = 'embeddings',
- ) -> pd.DataFrame:
- """
- This function searches for the most similar elements in the dataset, given a query.
-
- Arguments:
- query (str): The query.
- k (int, optional): The number of elements to return (default: 1).
- index_column_name (str, optional): The name of the column containing the embeddings (default: 'embeddings')
-
- Returns:
- pd.DataFrame: The most similar elements in the dataset (text, score, etc.), sorted by score.
-
- Remarks:
- The returned elements are dictionaries containing the text and the score.
- """
-
- # Get the embeddings of the query
- query_embeddings = self.get_query_embeddings([query], embedding_type=self.embedding_type)
-
- # query_embedding이랑 self.dataset['embeddings'] 값 비교
- scores = []
- similar_elts = []
- for query in query_embeddings:
- # Search for the most similar elements in the dataset
- score, similar_elt = self.dataset.get_nearest_examples(
- index_name=index_column_name,
- query=query,
- k=k,
- )
- scores.append(score)
- similar_elts.append(similar_elt)
-
+ return embeddings
 
- text_list = []
- for item in similar_elts:
- for text in item['text']:
- text_list.append(text)
-
- flat_list = [sentence for sublist in text_list for sentence in sublist]
- count = Counter(flat_list)
- count = dict(count.most_common(5))
-
- sorted_dict = dict(sorted(count.items(), key=lambda x: x[1], reverse=True))
- # Convert the results to a pandas DataFrame
- results_df = pd.DataFrame({'text': sorted_dict.keys() , 'freq': sorted_dict.values()})
-
- # Return the most similar elements
- return results_df
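Aside (illustrative, not part of the diff): the sparse path kept above pools MLM logits with a log-saturated ReLU weighted by the attention mask (the torch.sum(...) context line in this hunk), and the L2 normalization that followed it has been dropped, so the method now returns the raw pooled vector (`+ return embeddings`). A minimal standalone sketch of that pooling, with tensor shapes noted only for illustration:

    import torch

    def sparse_pool(logits: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        # logits: (batch, seq_len, vocab_size) MLM logits; attention_mask: (batch, seq_len).
        weights = torch.log(1 + torch.relu(logits))          # log-saturated activations
        weights = weights * attention_mask.unsqueeze(-1)     # zero out padded positions
        return torch.sum(weights, dim=1)                     # (batch, vocab_size); no L2 norm as of 1.2.20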
nltkor/search/test.py ADDED
@@ -0,0 +1,25 @@
+ from trie_search import TRIESearch
+
+ root = {}
+ dict_file = '/Users/chanhyeok/Downloads/lexicon.txt'
+ sc = TRIESearch(root)
+ with open(dict_file, 'r') as f:
+ for line in f:
+ if ';;' in line[:2]: continue
+ k, v = line.strip().split('\t')
+ sc.build_trie_search(k, v)
+ # print(root)
+ word = '고용 노동부'
+ values, value_data = sc.trie_search(word, True)
+ print(values, value_data)
+
+ word = '2시뉴스외전'
+ values, value_data = sc.trie_search( word, True)
+ print(values, value_data)
+ word = '2시 뉴스외전'
+ values, value_data = sc.trie_search( word, True)
+ print(values, value_data)
+
+ word = 'gbc'
+ values, value_data = sc.trie_search( word, True)
+ print(values, value_data)
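Aside (illustrative, not part of the diff): the script above expects lexicon.txt to contain tab-separated key/value lines, with lines starting with ';;' skipped as comments; both conventions are visible in the parsing loop. A self-contained sketch with hypothetical entries, purely to show the expected file shape:

    # Hypothetical lexicon entries; the real lexicon.txt contents are not part of this diff.
    sample_lines = [
        ";; comment lines begin with ';;' and are skipped",
        "고용 노동부\tNNP",    # surface form <tab> value (the value 'NNP' is a made-up example)
        "2시뉴스외전\tNNP",
    ]
    for line in sample_lines:
        if ';;' in line[:2]:
            continue
        k, v = line.strip().split('\t')
        print(k, v)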
nltkor/search/search_dict.py → nltkor/search/trie_search.py RENAMED
@@ -4,11 +4,11 @@ import numpy as np
  import json
  import argparse
 
- class SearchDic :
+ class TRIESearch :
  def __init__ (self,root) :
  self.root = root
 
- def build_search_dict(self, word, data) -> dict:
+ def build_trie_search(self, word, data) -> dict:
  current_dict = self.root
  _end_word_ = '$$'
  for letter in word:
@@ -19,7 +19,7 @@ class SearchDic :
 
 
 
- def search_dict(self, word, space_flag=False):
+ def trie_search(self, word, space_flag=False):
  '''
  TRIE 탐색
  space_flag: if True then including space, otherwise do not including space
@@ -69,27 +69,27 @@ class SearchDic :
  return pickle.load(f)
  if __name__ == "__main__":
  root = {}
- dict_file = '텍스트파일경로'
- sc = SearchDic(root)
+ dict_file = '텍스트파일 경로'
+ sc = TRIESearch(root)
  with open(dict_file, 'r') as f:
  for line in f:
  if ';;' in line[:2]: continue
  k, v = line.strip().split('\t')
- sc.build_search_dict(k, v)
+ sc.build_trie_search(k, v)
  # print(root)
  word = '고용 노동부'
- values, value_data = sc.search_dict(word, True)
+ values, value_data = sc.trie_search(word, True)
  print(values, value_data)
 
  word = '2시뉴스외전'
- values, value_data = sc.search_dict( word, True)
+ values, value_data = sc.trie_search( word, True)
  print(values, value_data)
  word = '2시 뉴스외전'
- values, value_data = sc.search_dict( word, True)
+ values, value_data = sc.trie_search( word, True)
  print(values, value_data)
 
  word = 'gbc'
- values, value_data = sc.search_dict( word, True)
+ values, value_data = sc.trie_search( word, True)
  print(values, value_data)
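Aside (illustrative, not part of the diff): TRIESearch stores its dictionary as a plain nested dict keyed character by character, with the '$$' sentinel declared in the hunk above marking the end of a stored word. The loop body past `for letter in word:` falls outside the hunk, so the exact placement of the stored value is an assumption; a rough sketch of the intended shape:

    # Assumed trie shape after sc.build_trie_search('고용', 'NNG') on an empty root.
    # Both the nesting and the value under '$$' are inferred; the value 'NNG' is a made-up example.
    root = {'고': {'용': {'$$': 'NNG'}}}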