nltkor 1.2.14.tar.gz → 1.2.16.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {nltkor-1.2.14 → nltkor-1.2.16}/PKG-INFO +3 -2
- {nltkor-1.2.14 → nltkor-1.2.16}/README.md +17 -10
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/__init__.py +1 -1
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/metrics/__init__.py +1 -1
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/metrics/classical.py +12 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/metrics/entment.py +1 -1
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/search/faiss_search.py +139 -29
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor.egg-info/PKG-INFO +3 -2
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor.egg-info/SOURCES.txt +1 -3
- {nltkor-1.2.14 → nltkor-1.2.16}/setup.py +1 -1
- nltkor-1.2.14/test/test.py +0 -282
- nltkor-1.2.14/test/testespresso.py +0 -19
- {nltkor-1.2.14 → nltkor-1.2.16}/LICENSE.txt +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/Kor_char.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/alignment/__init__.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/cider/__init__.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/cider/cider.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/cider/cider_scorer.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/distance/__init__.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/distance/wasserstein.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/etc.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/lazyimport.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/make_requirement.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/metrics/bartscore.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/metrics/bertscore.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/metrics/bleu_tensor.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/metrics/eval.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/metrics/mauve.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/metrics/mauve_utils.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/misc/__init__.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/misc/string2string_basic_functions.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/misc/string2string_default_tokenizer.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/misc/string2string_hash_functions.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/misc/string2string_word_embeddings.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/search/__init__.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/search/classical.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/search/kobert_tokenizer.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/sejong/__init__.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/sejong/__pycache__/__init__.cpython-38.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/sejong/__pycache__/__init__.cpython-39.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/sejong/__pycache__/sejong_download.cpython-38.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/sejong/__pycache__/sejong_download.cpython-39.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/sejong/__pycache__/ssem.cpython-38.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/sejong/__pycache__/ssem.cpython-39.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/sejong/ch.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/sejong/dict_semClassNum.txt +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/sejong/layer.txt +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/sejong/sejong_download.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/sejong/ssem.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/similarity/__init__.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/similarity/bartscore____.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/similarity/bertscore____.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/similarity/classical.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/similarity/cosine_similarity.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/__init__.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/__pycache__/__init__.cpython-38.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/__pycache__/__init__.cpython-39.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/__pycache__/espresso_tag.cpython-38.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/__pycache__/espresso_tag.cpython-39.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/espresso_tag.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/__init__.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/__pycache__/__init__.cpython-38.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/__pycache__/__init__.cpython-39.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/__pycache__/attributes.cpython-38.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/__pycache__/attributes.cpython-39.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/__pycache__/config.cpython-38.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/__pycache__/config.cpython-39.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/__pycache__/metadata.cpython-38.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/__pycache__/metadata.cpython-39.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/__pycache__/reader.cpython-38.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/__pycache__/reader.cpython-39.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/__pycache__/taggers.cpython-38.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/__pycache__/taggers.cpython-39.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/__pycache__/utils.cpython-38.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/__pycache__/utils.cpython-39.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/__pycache__/word_dictionary.cpython-38.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/__pycache__/word_dictionary.cpython-39.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/arguments.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/attributes.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/config.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/metadata.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/ner/__init__.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/ner/__pycache__/__init__.cpython-38.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/ner/__pycache__/__init__.cpython-39.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-38.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-39.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/ner/macmorphoreader.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/ner/ner_reader.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/network.c +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/network.pyx +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/networkconv.pyx +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/networkdependencyconv.pyx +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/parse/__init__.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/parse/__pycache__/__init__.cpython-38.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/parse/__pycache__/__init__.cpython-39.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-38.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-39.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/parse/parse_reader.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/pos/__init__.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/pos/__pycache__/__init__.cpython-38.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/pos/__pycache__/__init__.cpython-39.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-38.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-39.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/pos/macmorphoreader.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/pos/pos_reader.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/reader.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/srl/__init__.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/srl/__pycache__/__init__.cpython-38.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/srl/__pycache__/__init__.cpython-39.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-38.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-39.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/srl/__pycache__/train_srl.cpython-38.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/srl/__pycache__/train_srl.cpython-39.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/srl/__srl_reader_.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/srl/srl_reader.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/srl/train_srl.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/taggers.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/utils.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/word_dictionary.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/wsd/__init__.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/wsd/__pycache__/__init__.cpython-38.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/wsd/__pycache__/__init__.cpython-39.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-38.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-39.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/wsd/macmorphoreader.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/wsd/wsd_reader.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tokenize/__init__.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tokenize/ko_tokenize.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/trans.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor.egg-info/dependency_links.txt +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor.egg-info/requires.txt +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor.egg-info/top_level.txt +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/setup.cfg +0 -0
PKG-INFO

@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: nltkor
-Version: 1.2.14
+Version: 1.2.16
 Home-page: https://modi.changwon.ac.kr/air_cwnu/nlp_tool/nltk_ko.git
 Keywords: string matching,pattern matching,edit distance,string to string correction,string to string matching,Levenshtein edit distance,Hamming distance,Damerau-Levenshtein distance,Jaro-Winkler distance,longest common subsequence,longest common substring,dynamic programming,approximate string matching,semantic similarity,natural language processing,NLP,information retrieval,rouge,sacrebleu,bertscore,bartscore,fasttext,glove,cosine similarity,Smith-Waterman,Needleman-Wunsch,Hirschberg,Karp-Rabin,Knuth-Morris-Pratt,Boyer-Moore
 Classifier: Programming Language :: Python :: 3.7
@@ -37,5 +37,6 @@ Requires-Dist: fasttext
 Dynamic: classifier
 Dynamic: home-page
 Dynamic: keywords
+Dynamic: license-file
 Dynamic: requires-dist
 Dynamic: requires-python
README.md

@@ -777,19 +777,19 @@ Time: 0.05374705195426941, memory: 1409.9
 #### 5.10.1 BLEU for tensor
 - 각 score의 값이 tensor 로 반환한다.
 ```python
->>> from
->>>
+>>> from nltkor.metrics import DefaultMetric
+>>> import torch
 >>> can=torch.tensor([[1,2,3,4,5],[3,4,5,6,4]])
 >>> ref=torch.tensor([[1,2,3,4,5],[3,5,6,7,10]])
->>> bleu_tensor(ref,can,1)
+>>> DefaultMetric().bleu_tensor(ref,can,1)
 tensor(0.8000)
->>> bleu_tensor(ref,can,2)
+>>> DefaultMetric().bleu_tensor(ref,can,2)
 tensor(0.6250)
->>> bleu_tensor(ref,can,3)
+>>> DefaultMetric().bleu_tensor(ref,can,3)
 tensor(0.5000)
->>> bleu_tensor(ref,can,4)
+>>> DefaultMetric().bleu_tensor(ref,can,4)
 tensor(0.5000)
->>> bleu_tensor(ref,can)
+>>> DefaultMetric().bleu_tensor(ref,can)
 tensor(0.5946)
 
 ```
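For context on the numbers in the README example above: 8 of the 10 candidate unigrams appear in the references (hence 0.8000) and 5 of the 8 candidate bigrams do (hence 0.6250). The same values can be reproduced with NLTK's corpus_bleu on plain token lists; the snippet below is an illustrative sketch, not part of the package diff.

```python
# Minimal sketch (not part of nltkor): the tensor example above, replayed with
# NLTK's corpus_bleu on plain token lists.
from nltk.translate.bleu_score import corpus_bleu

refs = [[[1, 2, 3, 4, 5]], [[3, 5, 6, 7, 10]]]  # one reference per segment
cands = [[1, 2, 3, 4, 5], [3, 4, 5, 6, 4]]      # candidate segments

print(corpus_bleu(refs, cands, weights=(1, 0, 0, 0)))  # 0.8   (8 of 10 unigrams match)
print(corpus_bleu(refs, cands, weights=(0, 1, 0, 0)))  # 0.625 (5 of 8 bigrams match)
```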
@@ -910,11 +910,18 @@ TF-IDF를 n-gram에 대한 가중치로 계산하고 참조 캡션과 생성 캡
 0.6303797468354431
 ```
 
-#### 5.14
+#### 5.14 EMR(Entity Mention Recall)
+
 
-
+요약된 텍스트가 참조 문서에 등장하는 중요 개체를 얼마나 잘 유지하고 있는지에 대한 평가 지표이다.
 
-
+```python
+>>> # -*- coding: utf-8 -*-
+>>> from nltkor.metrics import EntMent
+>>> EntMent().entity("국립창원대학교(총장 박민원)가 사천우주항공캠퍼스 개교와 함께 2025학년도 사천우주항공공학부 입학식을 7일 오전 11시 사천우주항공캠퍼스에서 열었다.이날 행사에는 박민원 총장을 비롯해 국민의힘 서천호 국회의원(사천·남해·하동), 윤영빈 우주항공청장, 박동식 사천시장, 김규헌 사천시의회 의장, 지역 유관기관 관계자들과 신입생 및 가족들이 참석했다. 글로컬대학30사업 선정에 따라 국립창원대와 통합을 추진 중인 경남도립거창대학, 경남도립남해대학 관계자도 함께 자리했다.행사는 1부 현판 제막식과 2부 입학식으로 진행됐으며, 박동식 사천시장은 신입생들에게 축하 선물로 금배지를 전달했고, 박민원 총장은 캠퍼스 설립에 기여한 유공자들에게 표창장을 수여했다.","국립창원대학교는 4월 7일 사천우주항공캠퍼스에서 2025학년도 사천우주항공공학부 입학식을 개최했다. 이날 행사에는 박민원 총장, 서천호 국회의원, 윤영빈 우주항공청장, 박동식 사천시장 등 주요 인사와 신입생 및 가족들이 참석했으며, 글로컬대학30사업과 관련된 거창대학·남해대학 관계자들도 함께했다. 행사는 현판 제막식과 입학식으로 나뉘어 진행되었고, 신입생들에게는 금배지가, 캠퍼스 설립 유공자들에게는 표창장이 수여되었다.")
+Downloading Espresso5 model...
+0.8888888888888888
+```
 
 
 ### 6 확장 평가 함수
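The new README section documents EMR, which scores how well a summary retains the important entities found in the reference document; the example value of 0.8888... corresponds to 8 of 9 reference entities being kept, and the "Downloading Espresso5 model..." line shows that EntMent pulls nltkor's Espresso model to extract those entities. The snippet below is a hypothetical, hand-fed illustration of an entity-recall computation, not EntMent's actual implementation.

```python
# Hypothetical sketch of an entity-mention-recall style score: the fraction of
# reference-document entities that also occur in the summary. nltkor's EntMent
# extracts the entity mentions itself (via its Espresso model); here they are
# supplied by hand as placeholder strings.
def entity_mention_recall(reference_entities, summary_entities):
    ref = set(reference_entities)
    return len(ref & set(summary_entities)) / len(ref) if ref else 0.0

ref_ents = [f"entity{i}" for i in range(1, 10)]   # 9 entities in the reference document
sum_ents = [f"entity{i}" for i in range(1, 9)]    # 8 of them survive in the summary
print(entity_mention_recall(ref_ents, sum_ents))  # 0.8888888888888888
```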
nltkor/metrics/__init__.py

@@ -52,7 +52,7 @@ from nltk.metrics.aline import align
 from nltkor.metrics.eval import StringMetric
 """
 from nltkor.metrics.classical import DefaultMetric
-from nltkor.metrics.entment import
+from nltkor.metrics.entment import EMR
 from nltkor.metrics.bleu_tensor import *
 #DefaultMetric = lazy_import.lazy_callable("nltkor.metrics.classical.DefaultMetric")
 #Mauve = lazy_import.lazy_callable("nltkor.metrics.mauve.Mauve")
nltkor/metrics/classical.py

@@ -7,6 +7,7 @@ from copy import deepcopy
 import itertools
 import torch
 import time
+import math
 from nltk.translate.bleu_score import *
 from nltk.metrics import confusionmatrix
 from collections import defaultdict
@@ -415,6 +416,17 @@ class DefaultMetric:
         elif n==4:
             return self.bleu(reference,candiate,(0,0,0,1), smoothing_function=smoothing_function)
 
+    def bleu_tensor(self,reference,candidate,n=0, smoothing_function=None):
+
+        if n: weights = tuple(1 if i == n-1 else 0 for i in range(4))
+        else: weights = (0.25, 0.25, 0.25, 0.25)
+
+        reference=reference.unsqueeze(1)
+        reference=reference.numpy()
+        candidate=candidate.numpy()
+        return torch.tensor(corpus_bleu(reference,candidate,weights,smoothing_function=smoothing_function))
+
+
 
 
 
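The bleu_tensor method added above maps n = 1..4 to a one-hot weight tuple for that n-gram order and falls back to the uniform (0.25, 0.25, 0.25, 0.25) weighting when n is 0, then hands the converted tensors to NLTK's corpus_bleu (available through the wildcard import at the top of the file); since .numpy() is used, the tensors must live on the CPU. A small sketch of the weight mapping, written outside the class for illustration:

```python
# Illustrative sketch of the weight selection used by DefaultMetric.bleu_tensor.
def bleu_weights(n: int = 0):
    if n:
        return tuple(1 if i == n - 1 else 0 for i in range(4))
    return (0.25, 0.25, 0.25, 0.25)

print(bleu_weights(1))  # (1, 0, 0, 0) -> unigram-only BLEU
print(bleu_weights(3))  # (0, 0, 1, 0) -> trigram-only BLEU
print(bleu_weights())   # (0.25, 0.25, 0.25, 0.25) -> standard BLEU-4 weighting
```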
nltkor/search/faiss_search.py

@@ -33,11 +33,12 @@ SOFTWARE.
 This module contains a wrapper for the Faiss library by Facebook AI Research.
 """
 
-from collections import Counter
+from collections import Counter
 from typing import List, Union, Optional, Dict, Any
 import os
 import copy
 import logging
+import transformers
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
 from nltkor.make_requirement import make_requirement
@@ -70,24 +71,28 @@ class FaissSearch:
         mode = None,
         model_name_or_path: str = 'klue/bert-base',
         tokenizer_name_or_path: str = 'klue/bert-base',
+        embedding_type: str = 'last_hidden_state',
         device: str = 'cpu'
         ) -> None:
         if mode == 'sentence':
-            return FaissSearch_SenEmbed(model_name_or_path)
+            return FaissSearch_SenEmbed(model_name_or_path=model_name_or_path, embedding_type=embedding_type)
         elif mode == 'word':
-            return FaissSearch_WordEmbed(model_name_or_path)
+            return FaissSearch_WordEmbed(model_name_or_path=model_name_or_path, embedding_type=embedding_type)
+        elif mode == 'splade':
+            return FaissSearch_Splade(model_name_or_path=model_name_or_path, embedding_type=embedding_type)
         else:
-            raise ValueError("choice 'sentence' or 'word'")
+            raise ValueError("choice 'sentence' or 'word' or 'splade'")
+
 
 
-# FAISS original library wrapper class
 class FaissSearch_SenEmbed:
     def __init__(self,
         model_name_or_path: str = 'klue/bert-base',
         tokenizer_name_or_path: str = 'klue/bert-base',
+        embedding_type: str = 'last_hidden_state',
         device: str = 'cpu',
         ) -> None:
-
+        """
         This function initializes the wrapper for the FAISS library, which is used to perform semantic search.
 
 
@@ -143,8 +148,7 @@ class FaissSearch_SenEmbed:
         # Initialize the dataset
         self.dataset = None
 
-
-
+
     # Auxiliary function to get the last hidden state
     def get_last_hidden_state(self,
         embeddings: torch.Tensor,
@@ -166,7 +170,6 @@ class FaissSearch_SenEmbed:
         return last_hidden_state[:, 0, :]
 
 
-
     # Auxiliary function to get the mean pooling
     def get_mean_pooling(self,
         embeddings: torch.Tensor,
@@ -244,7 +247,6 @@ class FaissSearch_SenEmbed:
         return embeddings
 
 
-
     # Add FAISS index
     def add_faiss_index(self,
         column_name: str = 'embeddings',
@@ -309,7 +311,6 @@ class FaissSearch_SenEmbed:
         self.dataset.save_faiss_index(index_name=index_name, file=file_path)
 
 
-
     def load_faiss_index(self,
         index_name: str,
         file_path: str,
@@ -339,7 +340,6 @@ class FaissSearch_SenEmbed:
         self.dataset.load_faiss_index(index_name=index_name, file=file_path, device=device)
 
 
-
     # Initialize the corpus using a dictionary or pandas DataFrame or HuggingFace Datasets object
     def initialize_corpus(self,
         corpus: Union[Dict[str, List[str]], pd.DataFrame, Dataset],
@@ -407,7 +407,6 @@ class FaissSearch_SenEmbed:
         return self.dataset
 
 
-
     # Initialize the dataset using a JSON file
     def load_dataset_from_json(self,
         json_path: str,
@@ -429,7 +428,6 @@ class FaissSearch_SenEmbed:
         return self.dataset
 
 
-
     # Search for the most similar elements in the dataset, given a query
     def search(self,
         query: str,
@@ -475,12 +473,132 @@ class FaissSearch_SenEmbed:
 
 
 
+# FAISS Splade + ICT library wrapper class
+class FaissSearch_Splade(FaissSearch_SenEmbed):
+    def __init__(self,
+        model_name_or_path: str = 'klue/bert-base',
+        tokenizer_name_or_path: str = 'klue/bert-base',
+        embedding_type: str = 'last_hidden_state',
+        device: str = 'cpu',
+        ) -> None:
+        r"""
+        This function initializes the wrapper for the FAISS library, which is used to perform semantic search.
+
+
+        .. attention::
+
+            * If you use this class, please make sure to cite the following paper:
+
+                .. code-block:: latex
+
+                    @article{johnson2019billion,
+                        title={Billion-scale similarity search with {GPUs}},
+                        author={Johnson, Jeff and Douze, Matthijs and J{\'e}gou, Herv{\'e}},
+                        journal={IEEE Transactions on Big Data},
+                        volume={7},
+                        number={3},
+                        pages={535--547},
+                        year={2019},
+                        publisher={IEEE}
+                    }
+
+            * The code is based on the following GitHub repository:
+                https://github.com/facebookresearch/faiss
+
+        Arguments:
+            model_name_or_path (str, optional): The name or path of the model to use. Defaults to 'facebook/bart-large'.
+            tokenizer_name_or_path (str, optional): The name or path of the tokenizer to use. Defaults to 'facebook/bart-large'.
+            device (str, optional): The device to use. Defaults to 'cpu'.
+
+        Returns:
+            None
+        """
+
+        # Set the device
+        self.device = device
+
+        # If the tokenizer is not specified, use the model name or path
+        if tokenizer_name_or_path is None:
+            tokenizer_name_or_path = model_name_or_path
+
+        # Load the tokenizer
+        if tokenizer_name_or_path == 'skt/kobert-base-v1':
+            # self.tokenizer = KoBERTTokenizer.from_pretrained(tokenizer_name_or_path)
+            self.tokenizer = XLNetTokenizer.from_pretrained(tokenizer_name_or_path)
+        else:
+            self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)
+
+        # Load the model
+        self.model = transformers.BertForMaskedLM.from_pretrained(model_name_or_path).to(self.device)
+
+        # Set the model to evaluation mode (since we do not need the gradients)
+        self.model.eval()
+
+        # Initialize the dataset
+        self.dataset = None
+
+
+    # Get the embeddings
+    def get_embeddings(self,
+        text: Union[str, List[str]],
+        embedding_type: str = 'last_hidden_state',
+        batch_size: int = 8,
+        num_workers: int = 4,
+        ) -> torch.Tensor:
+        """
+        This function returns the embeddings of the input text.
+
+        Arguments:
+            text (Union[str, List[str]]): The input text.
+            embedding_type (str, optional): The type of embedding to use. Defaults to 'last_hidden_state'.
+            batch_size (int, optional): The batch size to use. Defaults to 8.
+            num_workers (int, optional): The number of workers to use. Defaults to 4.
+
+        Returns:
+            torch.Tensor: The embeddings.
+
+        Raises:
+            ValueError: If the embedding type is invalid.
+        """
+
+        # Check if the embedding type is valid
+        if embedding_type not in ['last_hidden_state', 'mean_pooling']:
+            raise ValueError(f'Invalid embedding type: {embedding_type}. Only "last_hidden_state" and "mean_pooling" are supported.')
+
+        # Tokenize the input text
+        encoded_text = self.tokenizer(
+            text,
+            padding=True,
+            truncation=True,
+            return_tensors='pt',
+        )
+
+        # Move the input text to the device
+        encoded_text = encoded_text.to(self.device)
+
+        # encoded_inputs = {k: v.to(self.device) for k, v in encoded_inputs.items()}
+
+        # Get the embeddings
+        with torch.no_grad():
+            embeddings = self.model(**encoded_text)
+
+        # Get the last hidden state
+        embeddings = embeddings['logits']
+
+        embeddings = torch.sum(torch.log(1+torch.relu(embeddings)) * encoded_text['attention_mask'].unsqueeze(-1), dim=1)
+        e_norm = torch.nn.functional.normalize(embeddings, p=2, dim=1, eps=1e-8)
+
+        # Return the embeddings
+        return e_norm
+
+
 
 # FAISS word embedding library wrapper class
 class FaissSearch_WordEmbed(FaissSearch_SenEmbed):
     def __init__(self,
         model_name_or_path: str = 'klue/bert-base',
         tokenizer_name_or_path: str = 'klue/bert-base',
+        embedding_type: str = 'last_hidden_state',
         device: str = 'cpu',
         ) -> None:
         r"""
@@ -533,6 +651,7 @@ class FaissSearch_WordEmbed(FaissSearch_SenEmbed):
         # Load the model
         self.model = AutoModel.from_pretrained(model_name_or_path).to(self.device)
 
+
         # Set the model to evaluation mode (since we do not need the gradients)
         self.model.eval()
 
@@ -540,7 +659,6 @@ class FaissSearch_WordEmbed(FaissSearch_SenEmbed):
         self.dataset = None
 
 
-
     # Get the embeddings (new code)
     def get_doc_embeddings(self,
         #text: Union[str, List[str]],
@@ -564,7 +682,7 @@ class FaissSearch_WordEmbed(FaissSearch_SenEmbed):
         Raises:
             ValueError: If the embedding type is invalid.
         """
-
+
         # Check if the embedding type is valid
         if embedding_type not in ['last_hidden_state', 'mean_pooling']:
             raise ValueError(f'Invalid embedding type: {embedding_type}. Only "last_hidden_state" and "mean_pooling" are supported.')
@@ -577,12 +695,10 @@ class FaissSearch_WordEmbed(FaissSearch_SenEmbed):
             padding=False,
             truncation=True,
             return_tensors='pt',
-            add_special_tokens=False
+            add_special_tokens=False
         )
-
         # Move the input text to the device
         encoded_text = encoded_text.to(self.device)
-
         token_ids_list = encoded_text['input_ids'].tolist()
         token_ids_list = token_ids_list[0]
         for ids in token_ids_list:
@@ -591,19 +707,17 @@ class FaissSearch_WordEmbed(FaissSearch_SenEmbed):
             else:
                 if text not in ids_dict[ids]:
                     ids_dict[ids].append(sentence)
-
         # Get the embeddings
         embedding_dict = {}
         self.model.eval()
         for key, value in ids_dict.items():
             embed = self.model(torch.tensor([[key]]), output_hidden_states=True).hidden_states[-1][:,0,:].detach()
             embedding_dict[embed] = value
-
+
         # Return the embeddings
         return embedding_dict
 
 
-
     # Get the embeddings (new code)
     def get_query_embeddings(self,
         text: Union[str, List[str]],
@@ -657,7 +771,6 @@ class FaissSearch_WordEmbed(FaissSearch_SenEmbed):
         # Return the embeddings
         return embeds
 
-
 
     # Initialize the corpus using a dictionary or pandas DataFrame or HuggingFace Datasets object
     def initialize_corpus(self,
@@ -693,7 +806,7 @@ class FaissSearch_WordEmbed(FaissSearch_SenEmbed):
 
         # Set the embedding_type
         self.embedding_type = embedding_type
-
+
         # get embedding dict
         embedding_dict = self.get_doc_embeddings(text=corpus, embedding_type=self.embedding_type)
 
@@ -729,7 +842,6 @@ class FaissSearch_WordEmbed(FaissSearch_SenEmbed):
         return self.dataset
 
 
-
     # Search for the most similar elements in the dataset, given a query
     def search(self,
         query: str,
@@ -751,7 +863,6 @@ class FaissSearch_WordEmbed(FaissSearch_SenEmbed):
         The returned elements are dictionaries containing the text and the score.
         """
 
-
         # Get the embeddings of the query
         query_embeddings = self.get_query_embeddings([query], embedding_type=self.embedding_type)
 
@@ -768,6 +879,7 @@ class FaissSearch_WordEmbed(FaissSearch_SenEmbed):
             scores.append(score)
             similar_elts.append(similar_elt)
 
+
         text_list = []
         for item in similar_elts:
             for text in item['text']:
@@ -776,12 +888,10 @@ class FaissSearch_WordEmbed(FaissSearch_SenEmbed):
         flat_list = [sentence for sublist in text_list for sentence in sublist]
         count = Counter(flat_list)
         count = dict(count.most_common(5))
-
+
         sorted_dict = dict(sorted(count.items(), key=lambda x: x[1], reverse=True))
-
         # Convert the results to a pandas DataFrame
         results_df = pd.DataFrame({'text': sorted_dict.keys() , 'freq': sorted_dict.values()})
 
-
         # Return the most similar elements
         return results_df
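The largest change in faiss_search.py is the new FaissSearch_Splade wrapper, selected with mode='splade' in the FaissSearch dispatcher above. Its get_embeddings runs a BERT masked-LM head over the input and turns the vocabulary logits into a SPLADE-style sparse lexical vector: log(1 + ReLU(logits)) is summed over token positions under the attention mask and then L2-normalized. Below is a standalone sketch of that pooling using Hugging Face transformers directly (not the nltkor class itself); klue/bert-base is the default checkpoint named in the diff.

```python
# Standalone sketch of the SPLADE-style pooling introduced by FaissSearch_Splade.
import torch
from transformers import AutoTokenizer, BertForMaskedLM

tokenizer = AutoTokenizer.from_pretrained('klue/bert-base')
model = BertForMaskedLM.from_pretrained('klue/bert-base').eval()

texts = ["첫 번째 예시 문장입니다.", "두 번째 예시 문장입니다."]
enc = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')

with torch.no_grad():
    logits = model(**enc).logits            # (batch, seq_len, vocab_size)

# log(1 + relu) over the vocabulary logits, summed over tokens under the
# attention mask, then L2-normalized, mirroring get_embeddings above.
sparse = torch.sum(torch.log(1 + torch.relu(logits)) * enc['attention_mask'].unsqueeze(-1), dim=1)
sparse = torch.nn.functional.normalize(sparse, p=2, dim=1, eps=1e-8)
print(sparse.shape)                         # (2, vocab_size): one sparse vector per sentence
```

Indexing and search then go through the methods FaissSearch_Splade inherits from FaissSearch_SenEmbed (initialize_corpus, add_faiss_index, search), which this diff does not override for the splade class.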
nltkor.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: nltkor
-Version: 1.2.14
+Version: 1.2.16
 Home-page: https://modi.changwon.ac.kr/air_cwnu/nlp_tool/nltk_ko.git
 Keywords: string matching,pattern matching,edit distance,string to string correction,string to string matching,Levenshtein edit distance,Hamming distance,Damerau-Levenshtein distance,Jaro-Winkler distance,longest common subsequence,longest common substring,dynamic programming,approximate string matching,semantic similarity,natural language processing,NLP,information retrieval,rouge,sacrebleu,bertscore,bartscore,fasttext,glove,cosine similarity,Smith-Waterman,Needleman-Wunsch,Hirschberg,Karp-Rabin,Knuth-Morris-Pratt,Boyer-Moore
 Classifier: Programming Language :: Python :: 3.7
@@ -37,5 +37,6 @@ Requires-Dist: fasttext
 Dynamic: classifier
 Dynamic: home-page
 Dynamic: keywords
+Dynamic: license-file
 Dynamic: requires-dist
 Dynamic: requires-python
nltkor.egg-info/SOURCES.txt

@@ -126,6 +126,4 @@ nltkor/tag/libs/wsd/__pycache__/__init__.cpython-39.pyc
 nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-38.pyc
 nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-39.pyc
 nltkor/tokenize/__init__.py
-nltkor/tokenize/ko_tokenize.py
-test/test.py
-test/testespresso.py
+nltkor/tokenize/ko_tokenize.py