nltkor-1.2.19-cp39-cp39-macosx_10_9_universal2.whl → nltkor-1.2.20-cp39-cp39-macosx_10_9_universal2.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nltkor/__init__.py +1 -1
- nltkor/search/faiss_search.py +7 -314
- {nltkor-1.2.19.dist-info → nltkor-1.2.20.dist-info}/METADATA +1 -1
- {nltkor-1.2.19.dist-info → nltkor-1.2.20.dist-info}/RECORD +7 -7
- {nltkor-1.2.19.dist-info → nltkor-1.2.20.dist-info}/WHEEL +0 -0
- {nltkor-1.2.19.dist-info → nltkor-1.2.20.dist-info}/licenses/LICENSE.txt +0 -0
- {nltkor-1.2.19.dist-info → nltkor-1.2.20.dist-info}/top_level.txt +0 -0
nltkor/__init__.py
CHANGED
nltkor/search/faiss_search.py
CHANGED
@@ -68,24 +68,22 @@ except ImportError:
 
 class FaissSearch:
     def __new__(cls,
-                mode =
+                mode = 'dense',
                 model_name_or_path: str = 'klue/bert-base',
                 tokenizer_name_or_path: str = 'klue/bert-base',
                 embedding_type: str = 'last_hidden_state',
                 device: str = 'cpu'
                 ) -> None:
-        if mode == '
-            return
-        elif mode == 'word':
-            return FaissSearch_WordEmbed(model_name_or_path=model_name_or_path, embedding_type=embedding_type)
+        if mode == 'dense':
+            return FaissSearch_Dense(model_name_or_path=model_name_or_path, embedding_type=embedding_type)
         elif mode == 'sparse':
             return FaissSearch_Sparse(model_name_or_path=model_name_or_path, embedding_type=embedding_type)
         else:
-            raise ValueError("choice '
+            raise ValueError("choice 'dense' or 'sparse'.")
 
 
 
-class FaissSearch_SenEmbed:
+class FaissSearch_Dense:
     def __init__(self,
                 model_name_or_path: str = 'klue/bert-base',
                 tokenizer_name_or_path: str = 'klue/bert-base',
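For orientation, a minimal usage sketch of the factory after this change. The constructor keywords are the ones shown in the hunk above; mode='dense' maps to FaissSearch_Dense (renamed from FaissSearch_SenEmbed in this hunk). The initialize_corpus() and search() calls mirror the method signatures that appear later in this diff and are assumed to be available unchanged on FaissSearch_Dense, so treat this as an illustration rather than the package's documented API.

    from nltkor.search.faiss_search import FaissSearch

    # As of 1.2.20, 'dense' and 'sparse' are the only accepted modes;
    # anything else raises ValueError("choice 'dense' or 'sparse'.").
    searcher = FaissSearch(mode='dense',
                           model_name_or_path='klue/bert-base',
                           tokenizer_name_or_path='klue/bert-base',
                           embedding_type='last_hidden_state',
                           device='cpu')

    # Hypothetical toy corpus; the dict-of-lists shape follows the corpus
    # format used elsewhere in this file.
    corpus = {'text': ['위키백과는 누구나 편집할 수 있다.',
                       'FAISS는 벡터 유사도 검색 라이브러리이다.']}
    searcher.initialize_corpus(corpus=corpus, section='text',
                               embedding_type='last_hidden_state')
    print(searcher.search(query='벡터 검색', k=1))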
@@ -474,7 +472,7 @@ class FaissSearch_SenEmbed:
 
 
 
-class FaissSearch_Sparse(FaissSearch_SenEmbed):
+class FaissSearch_Sparse(FaissSearch_Dense):
     def __init__(self,
                 model_name_or_path: str = 'klue/bert-base',
                 tokenizer_name_or_path: str = 'klue/bert-base',
@@ -586,312 +584,7 @@ class FaissSearch_Sparse(FaissSearch_SenEmbed):
         embeddings = embeddings['logits']
 
         embeddings = torch.sum(torch.log(1+torch.relu(embeddings)) * encoded_text['attention_mask'].unsqueeze(-1), dim=1)
-        e_norm = torch.nn.functional.normalize(embeddings, p=2, dim=1, eps=1e-8)
 
         # Return the embeddings
-        return
-
-
-
-# FAISS word embedding library wrapper class
-class FaissSearch_WordEmbed(FaissSearch_SenEmbed):
-    def __init__(self,
-                model_name_or_path: str = 'klue/bert-base',
-                tokenizer_name_or_path: str = 'klue/bert-base',
-                embedding_type: str = 'last_hidden_state',
-                device: str = 'cpu',
-                ) -> None:
-        r"""
-        This function initializes the wrapper for the FAISS library, which is used to perform semantic search.
-
-
-        .. attention::
-
-            * If you use this class, please make sure to cite the following paper:
-
-                .. code-block:: latex
-
-                    @article{johnson2019billion,
-                        title={Billion-scale similarity search with {GPUs}},
-                        author={Johnson, Jeff and Douze, Matthijs and J{\'e}gou, Herv{\'e}},
-                        journal={IEEE Transactions on Big Data},
-                        volume={7},
-                        number={3},
-                        pages={535--547},
-                        year={2019},
-                        publisher={IEEE}
-                    }
-
-            * The code is based on the following GitHub repository:
-                https://github.com/facebookresearch/faiss
-
-        Arguments:
-            model_name_or_path (str, optional): The name or path of the model to use. Defaults to 'facebook/bart-large'.
-            tokenizer_name_or_path (str, optional): The name or path of the tokenizer to use. Defaults to 'facebook/bart-large'.
-            device (str, optional): The device to use. Defaults to 'cpu'.
-
-        Returns:
-            None
-        """
-
-        # Set the device
-        self.device = device
-
-        # If the tokenizer is not specified, use the model name or path
-        if tokenizer_name_or_path is None:
-            tokenizer_name_or_path = model_name_or_path
-
-        # Load the tokenizer
-        if tokenizer_name_or_path == 'skt/kobert-base-v1':
-            # self.tokenizer = KoBERTTokenizer.from_pretrained(tokenizer_name_or_path)
-            self.tokenizer = XLNetTokenizer.from_pretrained(tokenizer_name_or_path)
-        else:
-            self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)
-
-        # Load the model
-        self.model = AutoModel.from_pretrained(model_name_or_path).to(self.device)
-
-
-        # Set the model to evaluation mode (since we do not need the gradients)
-        self.model.eval()
-
-        # Initialize the dataset
-        self.dataset = None
-
-
-    # Get the embeddings (new code)
-    def get_doc_embeddings(self,
-                #text: Union[str, List[str]],
-                text=None,
-                embedding_type: str = 'last_hidden_state',
-                batch_size: int = 8,
-                num_workers: int = 4,
-                ) -> torch.Tensor:
-        """
-        This function returns the embeddings of the input text.
-
-        Arguments:
-            text (Union[str, List[str]]): The input text.
-            embedding_type (str, optional): The type of embedding to use. Defaults to 'last_hidden_state'.
-            batch_size (int, optional): The batch size to use. Defaults to 8.
-            num_workers (int, optional): The number of workers to use. Defaults to 4.
-
-        Returns:
-            torch.Tensor: The embeddings.
-
-        Raises:
-            ValueError: If the embedding type is invalid.
-        """
-
-        # Check if the embedding type is valid
-        if embedding_type not in ['last_hidden_state', 'mean_pooling']:
-            raise ValueError(f'Invalid embedding type: {embedding_type}. Only "last_hidden_state" and "mean_pooling" are supported.')
-
-        ids_dict = {}
-        # Tokenize the input text
-        for sentence in text['text']:
-            encoded_text = self.tokenizer(
-                sentence,
-                padding=False,
-                truncation=True,
-                return_tensors='pt',
-                add_special_tokens=False
-            )
-            # Move the input text to the device
-            encoded_text = encoded_text.to(self.device)
-            token_ids_list = encoded_text['input_ids'].tolist()
-            token_ids_list = token_ids_list[0]
-            for ids in token_ids_list:
-                if ids not in ids_dict.keys():
-                    ids_dict[ids] = [sentence]
-                else:
-                    if text not in ids_dict[ids]:
-                        ids_dict[ids].append(sentence)
-        # Get the embeddings
-        embedding_dict = {}
-        self.model.eval()
-        for key, value in ids_dict.items():
-            embed = self.model(torch.tensor([[key]]), output_hidden_states=True).hidden_states[-1][:,0,:].detach()
-            embedding_dict[embed] = value
-
-        # Return the embeddings
-        return embedding_dict
-
-
-    # Get the embeddings (new code)
-    def get_query_embeddings(self,
-                text: Union[str, List[str]],
-                embedding_type: str = 'last_hidden_state',
-                batch_size: int = 8,
-                num_workers: int = 4,
-                ) -> torch.Tensor:
-        """
-        This function returns the embeddings of the input text.
-
-        Arguments:
-            text (Union[str, List[str]]): The input text.
-            embedding_type (str, optional): The type of embedding to use. Defaults to 'last_hidden_state'.
-            batch_size (int, optional): The batch size to use. Defaults to 8.
-            num_workers (int, optional): The number of workers to use. Defaults to 4.
-
-        Returns:
-            torch.Tensor: The embeddings.
-
-        Raises:
-            ValueError: If the embedding type is invalid.
-        """
-
-        # Check if the embedding type is valid
-        if embedding_type not in ['last_hidden_state', 'mean_pooling']:
-            raise ValueError(f'Invalid embedding type: {embedding_type}. Only "last_hidden_state" and "mean_pooling" are supported.')
-
-        # Tokenize the input text
-        encoded_text = self.tokenizer(
-            text,
-            padding=False,
-            truncation=True,
-            return_tensors='pt',
-            add_special_tokens=False,
-        )
-
-        # Move the input text to the device
-        encoded_text = encoded_text.to(self.device)
-
-        token_ids_list = encoded_text['input_ids'].tolist()
-        token_ids_list = token_ids_list[0]
-        tensor_list = [torch.tensor([[value]]) for value in token_ids_list]
-
-        # Get the embeddings
-        embeds = []
-        self.model.eval()
-        for index, tensor in enumerate(tensor_list):
-            embed = self.model(tensor, output_hidden_states=True).hidden_states[-1][:,0,:].detach().cpu().numpy()
-            embeds.append(embed)
-
-        # Return the embeddings
-        return embeds
-
-
-    # Initialize the corpus using a dictionary or pandas DataFrame or HuggingFace Datasets object
-    def initialize_corpus(self,
-                corpus: Union[Dict[str, List[str]], pd.DataFrame, Dataset],
-                section: str = 'text',
-                index_column_name: str = 'embeddings',
-                embedding_type: str = 'last_hidden_state',
-                batch_size: Optional[int] = None,
-                num_workers: Optional[int] = None,
-                save_path: Optional[str] = None,
-                ) -> Dataset:
-        """
-        This function initializes a dataset using a dictionary or pandas DataFrame or HuggingFace Datasets object.
-
-        Arguments:
-            dataset_dict (Dict[str, List[str]]): The dataset dictionary.
-            section (str): The section of the dataset to use whose embeddings will be used for semantic search (e.g., 'text', 'title', etc.) (default: 'text').
-            index_column_name (str): The name of the column containing the embeddings (default: 'embeddings')
-            embedding_type (str): The type of embedding to use (default: 'last_hidden_state').
-            batch_size (int, optional): The batch size to use (default: 8).
-            max_length (int, optional): The maximum length of the input sequences.
-            num_workers (int, optional): The number of workers to use.
-            save_path (Optional[str], optional): The path to save the dataset (default: None).
-
-        Returns:
-            Dataset: The dataset object (HuggingFace Datasets).
-
-        Raises:
-            ValueError: If the dataset is not a dictionary or pandas DataFrame or HuggingFace Datasets object.
-        """
-
-        # corpus = { 'text': [...] } -> form_dict
-
-        # Set the embedding_type
-        self.embedding_type = embedding_type
-
-        # get embedding dict
-        embedding_dict = self.get_doc_embeddings(text=corpus, embedding_type=self.embedding_type)
-
-        data = {
-            'text' : embedding_dict.values(),
-            'embeddings': []
-        }
-
-        for embed in embedding_dict.keys():
-            embed_list = embed.tolist()
-            data['embeddings'].append(embed_list[0])
-
-
-        if isinstance(data, dict):
-            self.dataset = Dataset.from_dict(data)
-        elif isinstance(data, pd.DataFrame):
-            self.dataset = Dataset.from_pandas(data)
-        elif isinstance(data, Dataset):
-            self.dataset = corpus
-        else:
-            raise ValueError('The dataset must be a dictionary or pandas DataFrame.')
-
-        # Save the dataset
-        if save_path is not None:
-            self.dataset.to_json(save_path)
-
-        # Add FAISS index
-        self.add_faiss_index(
-            column_name=index_column_name,
-        )
-
-        # Return the dataset
-        return self.dataset
-
-
-    # Search for the most similar elements in the dataset, given a query
-    def search(self,
-                query: str,
-                k: int = 1,
-                index_column_name: str = 'embeddings',
-                ) -> pd.DataFrame:
-        """
-        This function searches for the most similar elements in the dataset, given a query.
-
-        Arguments:
-            query (str): The query.
-            k (int, optional): The number of elements to return (default: 1).
-            index_column_name (str, optional): The name of the column containing the embeddings (default: 'embeddings')
-
-        Returns:
-            pd.DataFrame: The most similar elements in the dataset (text, score, etc.), sorted by score.
-
-        Remarks:
-            The returned elements are dictionaries containing the text and the score.
-        """
-
-        # Get the embeddings of the query
-        query_embeddings = self.get_query_embeddings([query], embedding_type=self.embedding_type)
-
-        # Compare the query embedding with the values in self.dataset['embeddings']
-        scores = []
-        similar_elts = []
-        for query in query_embeddings:
-            # Search for the most similar elements in the dataset
-            score, similar_elt = self.dataset.get_nearest_examples(
-                index_name=index_column_name,
-                query=query,
-                k=k,
-            )
-            scores.append(score)
-            similar_elts.append(similar_elt)
-
+        return embeddings
 
-        text_list = []
-        for item in similar_elts:
-            for text in item['text']:
-                text_list.append(text)
-
-        flat_list = [sentence for sublist in text_list for sentence in sublist]
-        count = Counter(flat_list)
-        count = dict(count.most_common(5))
-
-        sorted_dict = dict(sorted(count.items(), key=lambda x: x[1], reverse=True))
-        # Convert the results to a pandas DataFrame
-        results_df = pd.DataFrame({'text': sorted_dict.keys() , 'freq': sorted_dict.values()})
-
-        # Return the most similar elements
-        return results_df
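The visible tail of FaissSearch_Sparse's embedding routine keeps the log(1 + ReLU(logits)) pooling, drops the removed L2-normalization line (e_norm), and now returns the raw pooled vector. Below is a self-contained sketch of that pooling step; the tensor shapes and random inputs are purely illustrative stand-ins for the model's logits and the tokenizer's attention mask, not code from the package.

    import torch

    # Dummy stand-ins for embeddings['logits'] and encoded_text['attention_mask']
    # referenced in the hunk above.
    batch, seq_len, vocab = 2, 6, 32000
    logits = torch.randn(batch, seq_len, vocab)
    attention_mask = torch.ones(batch, seq_len)

    # log(1 + relu(logits)), zeroed at padded positions, summed over the sequence axis.
    embeddings = torch.sum(
        torch.log(1 + torch.relu(logits)) * attention_mask.unsqueeze(-1),
        dim=1,
    )

    # Before this release an L2-normalized copy was also computed via
    # torch.nn.functional.normalize(embeddings, p=2, dim=1); as of 1.2.20 the
    # un-normalized pooled vector is what gets returned.
    print(embeddings.shape)  # torch.Size([2, 32000])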
{nltkor-1.2.19.dist-info → nltkor-1.2.20.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: nltkor
-Version: 1.2.19
+Version: 1.2.20
 Home-page: https://modi.changwon.ac.kr/air_cwnu/nlp_tool/nltk_ko.git
 Keywords: string matching,pattern matching,edit distance,string to string correction,string to string matching,Levenshtein edit distance,Hamming distance,Damerau-Levenshtein distance,Jaro-Winkler distance,longest common subsequence,longest common substring,dynamic programming,approximate string matching,semantic similarity,natural language processing,NLP,information retrieval,rouge,sacrebleu,bertscore,bartscore,fasttext,glove,cosine similarity,Smith-Waterman,Needleman-Wunsch,Hirschberg,Karp-Rabin,Knuth-Morris-Pratt,Boyer-Moore
 Classifier: Programming Language :: Python :: 3.7
{nltkor-1.2.19.dist-info → nltkor-1.2.20.dist-info}/RECORD
CHANGED
@@ -1,5 +1,5 @@
 nltkor/Kor_char.py,sha256=KtixIsoKCtKItbwnZ7ehk47jjXhdvW_luHJBkIUNYUo,4735
-nltkor/__init__.py,sha256=
+nltkor/__init__.py,sha256=_uvNJGh9igxlXMtbwdVt0NNGYQYBgpBeZtsjprllfvs,367
 nltkor/etc.py,sha256=gbu4BZKe_x8g_OcuhqkKu6Z6_J-wQ0xE6pumnl4z3JE,387
 nltkor/lazyimport.py,sha256=GFL1znsYUhV7mzW3U1IUQwvBcRH4d4YMFJNcGQ8eRLc,4561
 nltkor/make_requirement.py,sha256=vvftdbp81DcaKzalNeZ-6AOmzSMveftZuAIar1NSWtE,283
@@ -25,7 +25,7 @@ nltkor/misc/string2string_hash_functions.py,sha256=OrxrqQOGOJy4tjNCiUSwvD1G51AJ2
 nltkor/misc/string2string_word_embeddings.py,sha256=T_GtJMyJsYSY0FRrmg-LzSkfVCTuQzFEqSqV8_P1GNQ,19021
 nltkor/search/__init__.py,sha256=uSR8pxjUQ2gX4dYhr5hN43YiMgtCQdNSDJ-Tgu_aY1w,330
 nltkor/search/classical.py,sha256=su1yyfiWDI-0w5QOnV1n06GWqhZceeiKZqWK_2_ANAs,19547
-nltkor/search/faiss_search.py,sha256=
+nltkor/search/faiss_search.py,sha256=3kBC-QZoyNfx47iCMeE_2GTqJU8NKbwXqF2pbGZ_Vwk,20453
 nltkor/search/kobert_tokenizer.py,sha256=vUrOsrbwZKV7TBRqut6P2P8j4XGcZZfHtgus3xdAMGE,6929
 nltkor/search/test.py,sha256=12kcEeNhKmJrtFnao2OrNywMVZZmBt1SvAQkRz4W09s,696
 nltkor/search/trie_search.py,sha256=Q94_Iiig4CbAhQCGNmD0tN-575TpYMMJh1Jkakl4rO4,3031
@@ -121,8 +121,8 @@ nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-38.pyc,sha256=0_BAk6rs2fEkzcb
 nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-39.pyc,sha256=z3eOoU5yC3ON5Vkrld_SL07KEWxmcUkj1lejSkCy_ZI,2474
 nltkor/tokenize/__init__.py,sha256=ZdWHiwNiIPUFdxZV1Yk1nVSjnC2xUZ6lfCMAtswimFQ,2613
 nltkor/tokenize/ko_tokenize.py,sha256=T6IlXQXOEwa15TopybcRc9wLZfypLT2aAU_5CXhWuh4,3853
-nltkor-1.2.
-nltkor-1.2.
-nltkor-1.2.
-nltkor-1.2.
-nltkor-1.2.
+nltkor-1.2.20.dist-info/licenses/LICENSE.txt,sha256=c7URrdgMRPTfDHJt9SGSSOhqXOLzVQ_VdTh8Du2kMAY,58937
+nltkor-1.2.20.dist-info/METADATA,sha256=MvtK-iBwQbr-aAqp4de_-ySnMZ-S3jW3aTz0wzJFJ_M,1733
+nltkor-1.2.20.dist-info/WHEEL,sha256=FZpotpoE2pJWvPd4MXRrq-vJvPwnPHGw8XNlxl6WcTI,112
+nltkor-1.2.20.dist-info/top_level.txt,sha256=XbFtt4S9DLUdj3lThO7ro_RyJnAobZaMFpAQpD3yEmQ,7
+nltkor-1.2.20.dist-info/RECORD,,
{nltkor-1.2.19.dist-info → nltkor-1.2.20.dist-info}/WHEEL
File without changes
{nltkor-1.2.19.dist-info → nltkor-1.2.20.dist-info}/licenses/LICENSE.txt
File without changes
{nltkor-1.2.19.dist-info → nltkor-1.2.20.dist-info}/top_level.txt
File without changes