nltkor 1.2.5.tar.gz → 1.2.9.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {nltkor-1.2.5 → nltkor-1.2.9}/PKG-INFO +10 -6
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/__init__.py +3 -2
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/metrics/classical.py +34 -1
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/misc/string2string_word_embeddings.py +1 -1
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/search/faiss_search.py +333 -13
- nltkor-1.2.9/nltkor/sejong/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor-1.2.9/nltkor/sejong/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor-1.2.9/nltkor/sejong/__pycache__/sejong_download.cpython-38.pyc +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/sejong/__pycache__/sejong_download.cpython-39.pyc +0 -0
- nltkor-1.2.9/nltkor/sejong/__pycache__/ssem.cpython-38.pyc +0 -0
- nltkor-1.2.9/nltkor/sejong/__pycache__/ssem.cpython-39.pyc +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/__init__.py +1 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/__init__.py +1 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/config.py +4 -3
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/network.c +43296 -30238
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/pos/pos_reader.py +10 -2
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/reader.py +38 -68
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/taggers.py +6 -6
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/utils.py +41 -1
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor.egg-info/PKG-INFO +10 -6
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor.egg-info/SOURCES.txt +4 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor.egg-info/requires.txt +9 -5
- {nltkor-1.2.5 → nltkor-1.2.9}/setup.py +50 -26
- nltkor-1.2.5/nltkor/sejong/__pycache__/__init__.cpython-39.pyc +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/LICENSE.txt +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/README.md +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/Kor_char.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/alignment/__init__.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/cider/__init__.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/cider/cider.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/cider/cider_scorer.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/distance/__init__.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/distance/wasserstein.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/etc.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/lazyimport.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/make_requirement.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/metrics/__init__.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/metrics/bartscore.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/metrics/bertscore.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/metrics/bleu_tensor.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/metrics/entment.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/metrics/eval.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/metrics/mauve.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/metrics/mauve_utils.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/misc/__init__.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/misc/string2string_basic_functions.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/misc/string2string_default_tokenizer.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/misc/string2string_hash_functions.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/search/__init__.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/search/classical.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/search/kobert_tokenizer.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/sejong/__init__.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/sejong/ch.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/sejong/dict_semClassNum.txt +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/sejong/layer.txt +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/sejong/sejong_download.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/sejong/ssem.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/similarity/__init__.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/similarity/bartscore____.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/similarity/bertscore____.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/similarity/classical.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/similarity/cosine_similarity.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/espresso_tag.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/arguments.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/attributes.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/metadata.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/ner/__init__.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/ner/macmorphoreader.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/ner/ner_reader.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/parse/__init__.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/parse/parse_reader.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/pos/__init__.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/pos/macmorphoreader.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/srl/__init__.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/srl/__srl_reader_.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/srl/srl_reader.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/srl/train_srl.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/word_dictionary.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/wsd/__init__.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/wsd/macmorphoreader.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/wsd/wsd_reader.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tokenize/__init__.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tokenize/ko_tokenize.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/trans.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor.egg-info/dependency_links.txt +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/nltkor.egg-info/top_level.txt +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/setup.cfg +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/test/test.py +0 -0
- {nltkor-1.2.5 → nltkor-1.2.9}/test/testespresso.py +0 -0

{nltkor-1.2.5 → nltkor-1.2.9}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: nltkor
-Version: 1.2.5
+Version: 1.2.9
 Home-page: https://modi.changwon.ac.kr/air_cwnu/nlp_tool/nltk_ko.git
 Keywords: string matching,pattern matching,edit distance,string to string correction,string to string matching,Levenshtein edit distance,Hamming distance,Damerau-Levenshtein distance,Jaro-Winkler distance,longest common subsequence,longest common substring,dynamic programming,approximate string matching,semantic similarity,natural language processing,NLP,information retrieval,rouge,sacrebleu,bertscore,bartscore,fasttext,glove,cosine similarity,Smith-Waterman,Needleman-Wunsch,Hirschberg,Karp-Rabin,Knuth-Morris-Pratt,Boyer-Moore
 Classifier: Programming Language :: Python :: 3.7
@@ -12,21 +12,25 @@ Classifier: Operating System :: OS Independent
 Classifier: Typing :: Typed
 Requires-Python: >=3.7
 License-File: LICENSE.txt
+Requires-Dist: Cython
+Requires-Dist: numpy<=1.26.4,>=1.23.5
 Requires-Dist: regex
 Requires-Dist: tqdm>=4.40.0
 Requires-Dist: joblib
-Requires-Dist: numpy==1.23.0
 Requires-Dist: requests
 Requires-Dist: nltk>3.0
-Requires-Dist: pyarrow
+Requires-Dist: pyarrow
 Requires-Dist: beautifulSoup4
-Requires-Dist: faiss-cpu
+Requires-Dist: faiss-cpu==1.7.3
 Requires-Dist: datasets
 Requires-Dist: torch
+Requires-Dist: dill<0.3.9
 Requires-Dist: scikit-learn>=0.22.1
-Requires-Dist: transformers
+Requires-Dist: transformers==4.42.2
 Requires-Dist: protobuf
 Requires-Dist: sentencepiece
 Requires-Dist: pandas
 Requires-Dist: bert_score
-Requires-Dist:
+Requires-Dist: chardet
+Requires-Dist: GPUtil
+Requires-Dist: fasttext

{nltkor-1.2.5 → nltkor-1.2.9}/nltkor/__init__.py
@@ -1,6 +1,7 @@
 from nltkor import alignment
 from nltkor import cider
 from nltkor import distance
+
 from nltkor import sejong
 from nltkor import metrics
 from nltkor import misc
@@ -8,8 +9,8 @@ from nltkor import search
 from nltkor import similarity
 from nltkor import tag
 from nltkor import tokenize
-
-
 from nltkor import trans
 from nltkor import Kor_char
 from nltkor import etc
+
+__version__ = '1.2.9'
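
The package now records its version at import time. A minimal check (not part of the diff; assumes nltkor 1.2.9 is installed):

import nltkor
print(nltkor.__version__)  # expected to print '1.2.9'
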

{nltkor-1.2.5 → nltkor-1.2.9}/nltkor/metrics/classical.py
@@ -5,6 +5,8 @@ import numpy as np
 from typing import Callable, Iterable, List, Tuple, Union
 from copy import deepcopy
 import itertools
+import torch
+import time
 from nltk.translate.bleu_score import *
 from nltk.metrics import confusionmatrix
 from collections import defaultdict
@@ -54,6 +56,37 @@ class DefaultMetric:
 
         return float(tp/total)
 
+    def accuracy_norm(model, tokenizer, input_text: str, candidates: list, label: int):
+        reserved_memory = []
+        inference_time = []
+        tokenized_prompt = tokenizer(input_text, return_tensors='pt').input_ids
+        total_candidate = []
+
+        for ending in candidates:
+            len_ending = len(ending)
+            tokenized_ending = tokenizer(ending, return_tensors='pt').input_ids
+            tokenized_ending = tokenized_ending[:, 1:]
+            input_ids = torch.cat([tokenized_prompt, tokenized_ending], dim=-1).cuda()
+            labels = input_ids.clone()
+            labels[0, :tokenized_prompt.shape[1]] = -100
+            start = time.time()
+            with torch.no_grad():
+                outputs = model(input_ids, labels=labels)
+            inference_time.append(time.time() - start)
+            reserved_memory.append(torch.cuda.memory_reserved() / (1024**2))
+            total_logprobs = -outputs.loss.item() * tokenized_ending.shape[1]
+            total_candidate.append(total_logprobs/len_ending)
+        answer_idx = total_candidate.index(max(total_candidate))
+        if int(label) == answer_idx:
+            cor = 1
+        else:
+            cor = 0
+        metric_dict = {
+            "reserved_memory": reserved_memory,
+            "inference_time": inference_time
+        }
+        return cor, metric_dict
+
     def recall_score(self, true, pred, avg='micro'):
 
         mat=confusionmatrix.ConfusionMatrix(true,pred)
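
The new accuracy_norm scores each candidate by its length-normalized log-likelihood under a causal language model, picks the best one, and compares it against the gold label while recording GPU memory and inference time. Below is a hypothetical usage sketch, not part of the diff: the model name, prompt, and candidates are placeholders, a CUDA device is required because the method moves input_ids to .cuda(), and the call goes through the class because the method is declared without self.

from transformers import AutoModelForCausalLM, AutoTokenizer
from nltkor.metrics.classical import DefaultMetric  # import path inferred from the file layout above

# Placeholder Korean causal LM; any model that returns a loss for (input_ids, labels) should work.
model = AutoModelForCausalLM.from_pretrained("skt/kogpt2-base-v2").cuda().eval()
tokenizer = AutoTokenizer.from_pretrained("skt/kogpt2-base-v2")

prompt = "질문: 세종 계획의 목적은?"         # placeholder prompt
candidates = ["후보 답변 1", "후보 답변 2"]   # placeholder answer candidates
label = 0                                     # index of the gold candidate

# accuracy_norm is declared without `self`, so it is called through the class here.
cor, metric_dict = DefaultMetric.accuracy_norm(model, tokenizer, prompt, candidates, label)
print(cor, metric_dict["inference_time"], metric_dict["reserved_memory"])
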
@@ -197,7 +230,7 @@ class DefaultMetric:
 
         return (((precision*recall)/(precision+recall))*2)
 
-
+
 
 
     def pos_eval(self, fin):

{nltkor-1.2.5 → nltkor-1.2.9}/nltkor/misc/string2string_word_embeddings.py
@@ -49,7 +49,7 @@ except ImportError:
     raise Exception(f"""
     Need to install Libraries, please pip install below libraries
     \t pip install torch
-    \t pip install fasttext
+    \t pip install fasttext-wheel
    Or, use pip install requirement.txt
    \t pip install -r {file_path}
    """)

{nltkor-1.2.5 → nltkor-1.2.9}/nltkor/search/faiss_search.py
@@ -33,8 +33,11 @@ SOFTWARE.
 This module contains a wrapper for the Faiss library by Facebook AI Research.
 """
 
+from collections import Counter
 from typing import List, Union, Optional, Dict, Any
 import os
+import copy
+import logging
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
 from nltkor.make_requirement import make_requirement
@@ -62,13 +65,27 @@ except ImportError:
 # from nltk.search.kobert_tokenizer import KoBERTTokenizer
 
 
-
-# FAISS library wrapper class
 class FaissSearch:
+    def __new__(cls,
+                mode = None,
+                model_name_or_path: str = 'klue/bert-base',
+                tokenizer_name_or_path: str = 'klue/bert-base',
+                device: str = 'cpu'
+                ) -> None:
+        if mode == 'sentence':
+            return FaissSearch_SenEmbed(model_name_or_path)
+        elif mode == 'word':
+            return FaissSearch_WordEmbed(model_name_or_path)
+        else:
+            raise ValueError("choice 'sentence' or 'word'")
+
+
+# FAISS original library wrapper class
+class FaissSearch_SenEmbed:
     def __init__(self,
                  model_name_or_path: str = 'klue/bert-base',
                  tokenizer_name_or_path: str = 'klue/bert-base',
-                 device: str = 'cpu'
+                 device: str = 'cpu',
                  ) -> None:
         r"""
         This function initializes the wrapper for the FAISS library, which is used to perform semantic search.
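
FaissSearch is now a thin factory: __new__ dispatches on mode and returns either the original sentence-embedding wrapper (renamed FaissSearch_SenEmbed) or the new word-embedding wrapper. A hypothetical instantiation sketch, not part of the diff; the import path is inferred from the module path above, and note that __new__ forwards only model_name_or_path to the selected class:

from nltkor.search.faiss_search import FaissSearch

sentence_searcher = FaissSearch(mode='sentence', model_name_or_path='klue/bert-base')
word_searcher = FaissSearch(mode='word', model_name_or_path='klue/bert-base')
# Any other mode raises ValueError("choice 'sentence' or 'word'").
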
@@ -171,8 +188,6 @@ class FaissSearch:
         return mean_pooling
 
 
-
-
     # Get the embeddings
     def get_embeddings(self,
                        text: Union[str, List[str]],
@@ -369,14 +384,6 @@ class FaissSearch:
         self.embedding_type = embedding_type
 
 
-        # Tokenize the dataset
-        # self.dataset = self.dataset.map(
-        #     lambda x: x[section],
-        #     batched=True,
-        #     batch_size=batch_size,
-        #     num_proc=num_workers,
-        # )
-
         # Map the section of the dataset to the embeddings
         self.dataset = self.dataset.map(
             lambda x: {
@@ -465,3 +472,316 @@ class FaissSearch:
 
         # Return the most similar elements
         return results_df
+
+
+
+
+# FAISS word embedding library wrapper class
+class FaissSearch_WordEmbed(FaissSearch_SenEmbed):
+    def __init__(self,
+                 model_name_or_path: str = 'klue/bert-base',
+                 tokenizer_name_or_path: str = 'klue/bert-base',
+                 device: str = 'cpu',
+                 ) -> None:
+        r"""
+        This function initializes the wrapper for the FAISS library, which is used to perform semantic search.
+
+
+        .. attention::
+
+            * If you use this class, please make sure to cite the following paper:
+
+                .. code-block:: latex
+
+                    @article{johnson2019billion,
+                        title={Billion-scale similarity search with {GPUs}},
+                        author={Johnson, Jeff and Douze, Matthijs and J{\'e}gou, Herv{\'e}},
+                        journal={IEEE Transactions on Big Data},
+                        volume={7},
+                        number={3},
+                        pages={535--547},
+                        year={2019},
+                        publisher={IEEE}
+                    }
+
+            * The code is based on the following GitHub repository:
+                https://github.com/facebookresearch/faiss
+
+        Arguments:
+            model_name_or_path (str, optional): The name or path of the model to use. Defaults to 'facebook/bart-large'.
+            tokenizer_name_or_path (str, optional): The name or path of the tokenizer to use. Defaults to 'facebook/bart-large'.
+            device (str, optional): The device to use. Defaults to 'cpu'.
+
+        Returns:
+            None
+        """
+
+        # Set the device
+        self.device = device
+
+        # If the tokenizer is not specified, use the model name or path
+        if tokenizer_name_or_path is None:
+            tokenizer_name_or_path = model_name_or_path
+
+        # Load the tokenizer
+        if tokenizer_name_or_path == 'skt/kobert-base-v1':
+            # self.tokenizer = KoBERTTokenizer.from_pretrained(tokenizer_name_or_path)
+            self.tokenizer = XLNetTokenizer.from_pretrained(tokenizer_name_or_path)
+        else:
+            self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)
+
+        # Load the model
+        self.model = AutoModel.from_pretrained(model_name_or_path).to(self.device)
+
+        # Set the model to evaluation mode (since we do not need the gradients)
+        self.model.eval()
+
+        # Initialize the dataset
+        self.dataset = None
+
+
+
+    # Get the embeddings (new code)
+    def get_doc_embeddings(self,
+                           #text: Union[str, List[str]],
+                           text=None,
+                           embedding_type: str = 'last_hidden_state',
+                           batch_size: int = 8,
+                           num_workers: int = 4,
+                           ) -> torch.Tensor:
+        """
+        This function returns the embeddings of the input text.
+
+        Arguments:
+            text (Union[str, List[str]]): The input text.
+            embedding_type (str, optional): The type of embedding to use. Defaults to 'last_hidden_state'.
+            batch_size (int, optional): The batch size to use. Defaults to 8.
+            num_workers (int, optional): The number of workers to use. Defaults to 4.
+
+        Returns:
+            torch.Tensor: The embeddings.
+
+        Raises:
+            ValueError: If the embedding type is invalid.
+        """
+
+        # Check if the embedding type is valid
+        if embedding_type not in ['last_hidden_state', 'mean_pooling']:
+            raise ValueError(f'Invalid embedding type: {embedding_type}. Only "last_hidden_state" and "mean_pooling" are supported.')
+
+        ids_dict = {}
+        # Tokenize the input text
+        for sentence in text['text']:
+            encoded_text = self.tokenizer(
+                sentence,
+                padding=False,
+                truncation=True,
+                return_tensors='pt',
+                add_special_tokens=False,
+            )
+
+            # Move the input text to the device
+            encoded_text = encoded_text.to(self.device)
+
+            token_ids_list = encoded_text['input_ids'].tolist()
+            token_ids_list = token_ids_list[0]
+            for ids in token_ids_list:
+                if ids not in ids_dict.keys():
+                    ids_dict[ids] = [sentence]
+                else:
+                    if text not in ids_dict[ids]:
+                        ids_dict[ids].append(sentence)
+
+        # Get the embeddings
+        embedding_dict = {}
+        self.model.eval()
+        for key, value in ids_dict.items():
+            embed = self.model(torch.tensor([[key]]), output_hidden_states=True).hidden_states[-1][:,0,:].detach()
+            embedding_dict[embed] = value
+
+        # Return the embeddings
+        return embedding_dict
+
+
+
+    # Get the embeddings (new code)
+    def get_query_embeddings(self,
+                             text: Union[str, List[str]],
+                             embedding_type: str = 'last_hidden_state',
+                             batch_size: int = 8,
+                             num_workers: int = 4,
+                             ) -> torch.Tensor:
+        """
+        This function returns the embeddings of the input text.
+
+        Arguments:
+            text (Union[str, List[str]]): The input text.
+            embedding_type (str, optional): The type of embedding to use. Defaults to 'last_hidden_state'.
+            batch_size (int, optional): The batch size to use. Defaults to 8.
+            num_workers (int, optional): The number of workers to use. Defaults to 4.
+
+        Returns:
+            torch.Tensor: The embeddings.
+
+        Raises:
+            ValueError: If the embedding type is invalid.
+        """
+
+        # Check if the embedding type is valid
+        if embedding_type not in ['last_hidden_state', 'mean_pooling']:
+            raise ValueError(f'Invalid embedding type: {embedding_type}. Only "last_hidden_state" and "mean_pooling" are supported.')
+
+        # Tokenize the input text
+        encoded_text = self.tokenizer(
+            text,
+            padding=False,
+            truncation=True,
+            return_tensors='pt',
+            add_special_tokens=False,
+        )
+
+        # Move the input text to the device
+        encoded_text = encoded_text.to(self.device)
+
+        token_ids_list = encoded_text['input_ids'].tolist()
+        token_ids_list = token_ids_list[0]
+        tensor_list = [torch.tensor([[value]]) for value in token_ids_list]
+
+        # Get the embeddings
+        embeds = []
+        self.model.eval()
+        for index, tensor in enumerate(tensor_list):
+            embed = self.model(tensor, output_hidden_states=True).hidden_states[-1][:,0,:].detach().cpu().numpy()
+            embeds.append(embed)
+
+        # Return the embeddings
+        return embeds
+
+
+
+    # Initialize the corpus using a dictionary or pandas DataFrame or HuggingFace Datasets object
+    def initialize_corpus(self,
+                          corpus: Union[Dict[str, List[str]], pd.DataFrame, Dataset],
+                          section: str = 'text',
+                          index_column_name: str = 'embeddings',
+                          embedding_type: str = 'last_hidden_state',
+                          batch_size: Optional[int] = None,
+                          num_workers: Optional[int] = None,
+                          save_path: Optional[str] = None,
+                          ) -> Dataset:
+        """
+        This function initializes a dataset using a dictionary or pandas DataFrame or HuggingFace Datasets object.
+
+        Arguments:
+            dataset_dict (Dict[str, List[str]]): The dataset dictionary.
+            section (str): The section of the dataset to use whose embeddings will be used for semantic search (e.g., 'text', 'title', etc.) (default: 'text').
+            index_column_name (str): The name of the column containing the embeddings (default: 'embeddings')
+            embedding_type (str): The type of embedding to use (default: 'last_hidden_state').
+            batch_size (int, optional): The batch size to use (default: 8).
+            max_length (int, optional): The maximum length of the input sequences.
+            num_workers (int, optional): The number of workers to use.
+            save_path (Optional[str], optional): The path to save the dataset (default: None).
+
+        Returns:
+            Dataset: The dataset object (HuggingFace Datasets).
+
+        Raises:
+            ValueError: If the dataset is not a dictionary or pandas DataFrame or HuggingFace Datasets object.
+        """
+
+        # corpus = { 'text': [...] } -> form_dict
+
+        # Set the embedding_type
+        self.embedding_type = embedding_type
+
+        # get embedding dict
+        embedding_dict = self.get_doc_embeddings(text=corpus, embedding_type=self.embedding_type)
+
+        data = {
+            'text' : embedding_dict.values(),
+            'embeddings': []
+        }
+
+        for embed in embedding_dict.keys():
+            embed_list = embed.tolist()
+            data['embeddings'].append(embed_list[0])
+
+
+        if isinstance(data, dict):
+            self.dataset = Dataset.from_dict(data)
+        elif isinstance(data, pd.DataFrame):
+            self.dataset = Dataset.from_pandas(data)
+        elif isinstance(data, Dataset):
+            self.dataset = corpus
+        else:
+            raise ValueError('The dataset must be a dictionary or pandas DataFrame.')
+
+        # Save the dataset
+        if save_path is not None:
+            self.dataset.to_json(save_path)
+
+        # Add FAISS index
+        self.add_faiss_index(
+            column_name=index_column_name,
+        )
+
+        # Return the dataset
+        return self.dataset
+
+
+
+    # Search for the most similar elements in the dataset, given a query
+    def search(self,
+               query: str,
+               k: int = 1,
+               index_column_name: str = 'embeddings',
+               ) -> pd.DataFrame:
+        """
+        This function searches for the most similar elements in the dataset, given a query.
+
+        Arguments:
+            query (str): The query.
+            k (int, optional): The number of elements to return (default: 1).
+            index_column_name (str, optional): The name of the column containing the embeddings (default: 'embeddings')
+
+        Returns:
+            pd.DataFrame: The most similar elements in the dataset (text, score, etc.), sorted by score.
+
+        Remarks:
+            The returned elements are dictionaries containing the text and the score.
+        """
+
+
+        # Get the embeddings of the query
+        query_embeddings = self.get_query_embeddings([query], embedding_type=self.embedding_type)
+
+        # query_embedding이랑 self.dataset['embeddings'] 값 비교
+        scores = []
+        similar_elts = []
+        for query in query_embeddings:
+            # Search for the most similar elements in the dataset
+            score, similar_elt = self.dataset.get_nearest_examples(
+                index_name=index_column_name,
+                query=query,
+                k=k,
+            )
+            scores.append(score)
+            similar_elts.append(similar_elt)
+
+        text_list = []
+        for item in similar_elts:
+            for text in item['text']:
+                text_list.append(text)
+
+        flat_list = [sentence for sublist in text_list for sentence in sublist]
+        count = Counter(flat_list)
+        count = dict(count.most_common(5))
+
+        sorted_dict = dict(sorted(count.items(), key=lambda x: x[1], reverse=True))
+
+        # Convert the results to a pandas DataFrame
+        results_df = pd.DataFrame({'text': sorted_dict.keys() , 'freq': sorted_dict.values()})
+
+
+        # Return the most similar elements
+        return results_df
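
The new FaissSearch_WordEmbed indexes one embedding per unique token id and maps each token back to the sentences containing it; search() then retrieves neighbors for every query token and returns the most frequently hit sentences. A hypothetical end-to-end sketch, not part of the diff: the corpus and query strings are placeholders, and initialize_corpus expects a mapping with a 'text' key, as the code above iterates over text['text'].

from nltkor.search.faiss_search import FaissSearch

# mode='word' returns a FaissSearch_WordEmbed instance via the factory shown earlier.
searcher = FaissSearch(mode='word', model_name_or_path='klue/bert-base')

corpus = {'text': ["자연어 처리는 재미있다.", "FAISS는 빠른 유사도 검색을 지원한다."]}  # placeholder sentences
searcher.initialize_corpus(corpus=corpus, embedding_type='last_hidden_state')

results = searcher.search(query="유사도 검색", k=3)
print(results)  # pandas DataFrame with 'text' and 'freq' columns
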

Binary files (the six nltkor/sejong/__pycache__/*.pyc entries listed above) differ; contents not shown.

{nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/config.py
@@ -27,9 +27,10 @@ def get_config_paths(directory):
         ('network_text_pos' , 'pos-network.txt'),
         ('pos_tags' , 'pos-tags.txt'),
         ('pos_tag_dict' , 'pos-tags.txt'),
-        ('pos_co_lexicon' , 'pos-co-lexicon.
-        ('pos_morph_lexicon' , 'pos-morph-lexicon.
-        ('pos_prob_dict' , 'pos-prob-dict.
+        ('pos_co_lexicon' , 'pos-co-lexicon.pickle'),
+        ('pos_morph_lexicon' , 'pos-morph-lexicon.pickle'),
+        ('pos_prob_dict' , 'pos-prob-dict.pickle'),
+        ('pos_morph_lexicon_txt' , 'pos-morph-lexicon.txt'),
         ('suffix' , 'suffixes.txt'),
         ('suffixes' , 'suffixes.txt'),
         ('prefix' , 'prefixes.txt'),