nltkor 1.2.14.tar.gz → 1.2.16.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {nltkor-1.2.14 → nltkor-1.2.16}/PKG-INFO +3 -2
- {nltkor-1.2.14 → nltkor-1.2.16}/README.md +17 -10
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/__init__.py +1 -1
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/metrics/__init__.py +1 -1
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/metrics/classical.py +12 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/metrics/entment.py +1 -1
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/search/faiss_search.py +139 -29
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor.egg-info/PKG-INFO +3 -2
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor.egg-info/SOURCES.txt +1 -3
- {nltkor-1.2.14 → nltkor-1.2.16}/setup.py +1 -1
- nltkor-1.2.14/test/test.py +0 -282
- nltkor-1.2.14/test/testespresso.py +0 -19
- {nltkor-1.2.14 → nltkor-1.2.16}/LICENSE.txt +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/Kor_char.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/alignment/__init__.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/cider/__init__.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/cider/cider.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/cider/cider_scorer.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/distance/__init__.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/distance/wasserstein.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/etc.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/lazyimport.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/make_requirement.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/metrics/bartscore.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/metrics/bertscore.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/metrics/bleu_tensor.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/metrics/eval.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/metrics/mauve.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/metrics/mauve_utils.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/misc/__init__.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/misc/string2string_basic_functions.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/misc/string2string_default_tokenizer.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/misc/string2string_hash_functions.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/misc/string2string_word_embeddings.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/search/__init__.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/search/classical.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/search/kobert_tokenizer.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/sejong/__init__.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/sejong/__pycache__/__init__.cpython-38.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/sejong/__pycache__/__init__.cpython-39.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/sejong/__pycache__/sejong_download.cpython-38.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/sejong/__pycache__/sejong_download.cpython-39.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/sejong/__pycache__/ssem.cpython-38.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/sejong/__pycache__/ssem.cpython-39.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/sejong/ch.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/sejong/dict_semClassNum.txt +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/sejong/layer.txt +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/sejong/sejong_download.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/sejong/ssem.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/similarity/__init__.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/similarity/bartscore____.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/similarity/bertscore____.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/similarity/classical.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/similarity/cosine_similarity.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/__init__.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/__pycache__/__init__.cpython-38.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/__pycache__/__init__.cpython-39.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/__pycache__/espresso_tag.cpython-38.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/__pycache__/espresso_tag.cpython-39.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/espresso_tag.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/__init__.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/__pycache__/__init__.cpython-38.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/__pycache__/__init__.cpython-39.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/__pycache__/attributes.cpython-38.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/__pycache__/attributes.cpython-39.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/__pycache__/config.cpython-38.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/__pycache__/config.cpython-39.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/__pycache__/metadata.cpython-38.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/__pycache__/metadata.cpython-39.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/__pycache__/reader.cpython-38.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/__pycache__/reader.cpython-39.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/__pycache__/taggers.cpython-38.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/__pycache__/taggers.cpython-39.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/__pycache__/utils.cpython-38.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/__pycache__/utils.cpython-39.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/__pycache__/word_dictionary.cpython-38.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/__pycache__/word_dictionary.cpython-39.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/arguments.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/attributes.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/config.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/metadata.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/ner/__init__.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/ner/__pycache__/__init__.cpython-38.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/ner/__pycache__/__init__.cpython-39.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-38.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-39.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/ner/macmorphoreader.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/ner/ner_reader.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/network.c +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/network.pyx +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/networkconv.pyx +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/networkdependencyconv.pyx +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/parse/__init__.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/parse/__pycache__/__init__.cpython-38.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/parse/__pycache__/__init__.cpython-39.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-38.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-39.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/parse/parse_reader.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/pos/__init__.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/pos/__pycache__/__init__.cpython-38.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/pos/__pycache__/__init__.cpython-39.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-38.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-39.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/pos/macmorphoreader.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/pos/pos_reader.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/reader.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/srl/__init__.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/srl/__pycache__/__init__.cpython-38.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/srl/__pycache__/__init__.cpython-39.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-38.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-39.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/srl/__pycache__/train_srl.cpython-38.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/srl/__pycache__/train_srl.cpython-39.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/srl/__srl_reader_.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/srl/srl_reader.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/srl/train_srl.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/taggers.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/utils.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/word_dictionary.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/wsd/__init__.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/wsd/__pycache__/__init__.cpython-38.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/wsd/__pycache__/__init__.cpython-39.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-38.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-39.pyc +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/wsd/macmorphoreader.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/wsd/wsd_reader.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tokenize/__init__.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tokenize/ko_tokenize.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/trans.py +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor.egg-info/dependency_links.txt +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor.egg-info/requires.txt +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/nltkor.egg-info/top_level.txt +0 -0
- {nltkor-1.2.14 → nltkor-1.2.16}/setup.cfg +0 -0
PKG-INFO

@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: nltkor
-Version: 1.2.14
+Version: 1.2.16
 Home-page: https://modi.changwon.ac.kr/air_cwnu/nlp_tool/nltk_ko.git
 Keywords: string matching,pattern matching,edit distance,string to string correction,string to string matching,Levenshtein edit distance,Hamming distance,Damerau-Levenshtein distance,Jaro-Winkler distance,longest common subsequence,longest common substring,dynamic programming,approximate string matching,semantic similarity,natural language processing,NLP,information retrieval,rouge,sacrebleu,bertscore,bartscore,fasttext,glove,cosine similarity,Smith-Waterman,Needleman-Wunsch,Hirschberg,Karp-Rabin,Knuth-Morris-Pratt,Boyer-Moore
 Classifier: Programming Language :: Python :: 3.7
@@ -37,5 +37,6 @@ Requires-Dist: fasttext
 Dynamic: classifier
 Dynamic: home-page
 Dynamic: keywords
+Dynamic: license-file
 Dynamic: requires-dist
 Dynamic: requires-python
README.md

@@ -777,19 +777,19 @@ Time: 0.05374705195426941, memory: 1409.9
 #### 5.10.1 BLEU for tensor
 - 각 score의 값이 tensor 로 반환한다.
 ```python
->>> from
->>>
+>>> from nltkor.metrics import DefaultMetric
+>>> import torch
 >>> can=torch.tensor([[1,2,3,4,5],[3,4,5,6,4]])
 >>> ref=torch.tensor([[1,2,3,4,5],[3,5,6,7,10]])
->>> bleu_tensor(ref,can,1)
+>>> DefaultMetric().bleu_tensor(ref,can,1)
 tensor(0.8000)
->>> bleu_tensor(ref,can,2)
+>>> DefaultMetric().bleu_tensor(ref,can,2)
 tensor(0.6250)
->>> bleu_tensor(ref,can,3)
+>>> DefaultMetric().bleu_tensor(ref,can,3)
 tensor(0.5000)
->>> bleu_tensor(ref,can,4)
+>>> DefaultMetric().bleu_tensor(ref,can,4)
 tensor(0.5000)
->>> bleu_tensor(ref,can)
+>>> DefaultMetric().bleu_tensor(ref,can)
 tensor(0.5946)
 
 ```
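For context on the numbers in the README example above: 8 of the 10 candidate unigrams appear in the references (hence 0.8000) and 5 of the 8 candidate bigrams do (hence 0.6250). The same values can be reproduced with NLTK's corpus_bleu on plain token lists; the snippet below is an illustrative sketch, not part of the package diff.

```python
# Minimal sketch (not part of nltkor): the tensor example above, replayed with
# NLTK's corpus_bleu on plain token lists.
from nltk.translate.bleu_score import corpus_bleu

refs = [[[1, 2, 3, 4, 5]], [[3, 5, 6, 7, 10]]]  # one reference per segment
cands = [[1, 2, 3, 4, 5], [3, 4, 5, 6, 4]]      # candidate segments

print(corpus_bleu(refs, cands, weights=(1, 0, 0, 0)))  # 0.8   (8 of 10 unigrams match)
print(corpus_bleu(refs, cands, weights=(0, 1, 0, 0)))  # 0.625 (5 of 8 bigrams match)
```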
@@ -910,11 +910,18 @@ TF-IDF를 n-gram에 대한 가중치로 계산하고 참조 캡션과 생성 캡
 0.6303797468354431
 ```
 
-#### 5.14
+#### 5.14 EMR(Entity Mention Recall)
+
 
-
+요약된 텍스트가 참조 문서에 등장하는 중요 개체를 얼마나 잘 유지하고 있는지에 대한 평가 지표이다.
 
-
+```python
+>>> # -*- coding: utf-8 -*-
+>>> from nltkor.metrics import EntMent
+>>> EntMent().entity("국립창원대학교(총장 박민원)가 사천우주항공캠퍼스 개교와 함께 2025학년도 사천우주항공공학부 입학식을 7일 오전 11시 사천우주항공캠퍼스에서 열었다.이날 행사에는 박민원 총장을 비롯해 국민의힘 서천호 국회의원(사천·남해·하동), 윤영빈 우주항공청장, 박동식 사천시장, 김규헌 사천시의회 의장, 지역 유관기관 관계자들과 신입생 및 가족들이 참석했다. 글로컬대학30사업 선정에 따라 국립창원대와 통합을 추진 중인 경남도립거창대학, 경남도립남해대학 관계자도 함께 자리했다.행사는 1부 현판 제막식과 2부 입학식으로 진행됐으며, 박동식 사천시장은 신입생들에게 축하 선물로 금배지를 전달했고, 박민원 총장은 캠퍼스 설립에 기여한 유공자들에게 표창장을 수여했다.","국립창원대학교는 4월 7일 사천우주항공캠퍼스에서 2025학년도 사천우주항공공학부 입학식을 개최했다. 이날 행사에는 박민원 총장, 서천호 국회의원, 윤영빈 우주항공청장, 박동식 사천시장 등 주요 인사와 신입생 및 가족들이 참석했으며, 글로컬대학30사업과 관련된 거창대학·남해대학 관계자들도 함께했다. 행사는 현판 제막식과 입학식으로 나뉘어 진행되었고, 신입생들에게는 금배지가, 캠퍼스 설립 유공자들에게는 표창장이 수여되었다.")
+Downloading Espresso5 model...
+0.8888888888888888
+```
 
 
 ### 6 확장 평가 함수
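The new README section documents EMR, which scores how well a summary retains the important entities found in the reference document; the example value of 0.8888... corresponds to 8 of 9 reference entities being kept, and the "Downloading Espresso5 model..." line shows that EntMent pulls nltkor's Espresso model to extract those entities. The snippet below is a hypothetical, hand-fed illustration of an entity-recall computation, not EntMent's actual implementation.

```python
# Hypothetical sketch of an entity-mention-recall style score: the fraction of
# reference-document entities that also occur in the summary. nltkor's EntMent
# extracts the entity mentions itself (via its Espresso model); here they are
# supplied by hand as placeholder strings.
def entity_mention_recall(reference_entities, summary_entities):
    ref = set(reference_entities)
    return len(ref & set(summary_entities)) / len(ref) if ref else 0.0

ref_ents = [f"entity{i}" for i in range(1, 10)]   # 9 entities in the reference document
sum_ents = [f"entity{i}" for i in range(1, 9)]    # 8 of them survive in the summary
print(entity_mention_recall(ref_ents, sum_ents))  # 0.8888888888888888
```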
nltkor/metrics/__init__.py

@@ -52,7 +52,7 @@ from nltk.metrics.aline import align
 from nltkor.metrics.eval import StringMetric
 """
 from nltkor.metrics.classical import DefaultMetric
-from nltkor.metrics.entment import
+from nltkor.metrics.entment import EMR
 from nltkor.metrics.bleu_tensor import *
 #DefaultMetric = lazy_import.lazy_callable("nltkor.metrics.classical.DefaultMetric")
 #Mauve = lazy_import.lazy_callable("nltkor.metrics.mauve.Mauve")
nltkor/metrics/classical.py

@@ -7,6 +7,7 @@ from copy import deepcopy
 import itertools
 import torch
 import time
+import math
 from nltk.translate.bleu_score import *
 from nltk.metrics import confusionmatrix
 from collections import defaultdict
@@ -415,6 +416,17 @@ class DefaultMetric:
         elif n==4:
             return self.bleu(reference,candiate,(0,0,0,1), smoothing_function=smoothing_function)
 
+    def bleu_tensor(self,reference,candidate,n=0, smoothing_function=None):
+
+        if n: weights = tuple(1 if i == n-1 else 0 for i in range(4))
+        else: weights = (0.25, 0.25, 0.25, 0.25)
+
+        reference=reference.unsqueeze(1)
+        reference=reference.numpy()
+        candidate=candidate.numpy()
+        return torch.tensor(corpus_bleu(reference,candidate,weights,smoothing_function=smoothing_function))
+
+
 
 
 
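The bleu_tensor method added above maps n = 1..4 to a one-hot weight tuple for that n-gram order and falls back to the uniform (0.25, 0.25, 0.25, 0.25) weighting when n is 0, then hands the converted tensors to NLTK's corpus_bleu (available through the wildcard import at the top of the file); since .numpy() is used, the tensors must live on the CPU. A small sketch of the weight mapping, written outside the class for illustration:

```python
# Illustrative sketch of the weight selection used by DefaultMetric.bleu_tensor.
def bleu_weights(n: int = 0):
    if n:
        return tuple(1 if i == n - 1 else 0 for i in range(4))
    return (0.25, 0.25, 0.25, 0.25)

print(bleu_weights(1))  # (1, 0, 0, 0) -> unigram-only BLEU
print(bleu_weights(3))  # (0, 0, 1, 0) -> trigram-only BLEU
print(bleu_weights())   # (0.25, 0.25, 0.25, 0.25) -> standard BLEU-4 weighting
```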
nltkor/search/faiss_search.py

@@ -33,11 +33,12 @@ SOFTWARE.
 This module contains a wrapper for the Faiss library by Facebook AI Research.
 """
 
-from collections import Counter
+from collections import Counter
 from typing import List, Union, Optional, Dict, Any
 import os
 import copy
 import logging
+import transformers
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
 from nltkor.make_requirement import make_requirement
@@ -70,24 +71,28 @@ class FaissSearch:
         mode = None,
         model_name_or_path: str = 'klue/bert-base',
         tokenizer_name_or_path: str = 'klue/bert-base',
+        embedding_type: str = 'last_hidden_state',
         device: str = 'cpu'
         ) -> None:
         if mode == 'sentence':
-            return FaissSearch_SenEmbed(model_name_or_path)
+            return FaissSearch_SenEmbed(model_name_or_path=model_name_or_path, embedding_type=embedding_type)
         elif mode == 'word':
-            return FaissSearch_WordEmbed(model_name_or_path)
+            return FaissSearch_WordEmbed(model_name_or_path=model_name_or_path, embedding_type=embedding_type)
+        elif mode == 'splade':
+            return FaissSearch_Splade(model_name_or_path=model_name_or_path, embedding_type=embedding_type)
         else:
-            raise ValueError("choice 'sentence' or 'word'")
+            raise ValueError("choice 'sentence' or 'word' or 'splade'")
+
 
 
-# FAISS original library wrapper class
 class FaissSearch_SenEmbed:
     def __init__(self,
         model_name_or_path: str = 'klue/bert-base',
         tokenizer_name_or_path: str = 'klue/bert-base',
+        embedding_type: str = 'last_hidden_state',
         device: str = 'cpu',
         ) -> None:
-
+        """
         This function initializes the wrapper for the FAISS library, which is used to perform semantic search.
 
 
@@ -143,8 +148,7 @@ class FaissSearch_SenEmbed:
         # Initialize the dataset
         self.dataset = None
 
-
-
+
     # Auxiliary function to get the last hidden state
     def get_last_hidden_state(self,
         embeddings: torch.Tensor,
@@ -166,7 +170,6 @@ class FaissSearch_SenEmbed:
         return last_hidden_state[:, 0, :]
 
 
-
     # Auxiliary function to get the mean pooling
     def get_mean_pooling(self,
         embeddings: torch.Tensor,
@@ -244,7 +247,6 @@ class FaissSearch_SenEmbed:
         return embeddings
 
 
-
     # Add FAISS index
     def add_faiss_index(self,
         column_name: str = 'embeddings',
@@ -309,7 +311,6 @@ class FaissSearch_SenEmbed:
         self.dataset.save_faiss_index(index_name=index_name, file=file_path)
 
 
-
     def load_faiss_index(self,
         index_name: str,
         file_path: str,
@@ -339,7 +340,6 @@ class FaissSearch_SenEmbed:
         self.dataset.load_faiss_index(index_name=index_name, file=file_path, device=device)
 
 
-
     # Initialize the corpus using a dictionary or pandas DataFrame or HuggingFace Datasets object
     def initialize_corpus(self,
         corpus: Union[Dict[str, List[str]], pd.DataFrame, Dataset],
@@ -407,7 +407,6 @@ class FaissSearch_SenEmbed:
         return self.dataset
 
 
-
     # Initialize the dataset using a JSON file
     def load_dataset_from_json(self,
         json_path: str,
@@ -429,7 +428,6 @@ class FaissSearch_SenEmbed:
         return self.dataset
 
 
-
     # Search for the most similar elements in the dataset, given a query
     def search(self,
         query: str,
@@ -475,12 +473,132 @@ class FaissSearch_SenEmbed:
 
 
 
+# FAISS Splade + ICT library wrapper class
+class FaissSearch_Splade(FaissSearch_SenEmbed):
+    def __init__(self,
+        model_name_or_path: str = 'klue/bert-base',
+        tokenizer_name_or_path: str = 'klue/bert-base',
+        embedding_type: str = 'last_hidden_state',
+        device: str = 'cpu',
+        ) -> None:
+        r"""
+        This function initializes the wrapper for the FAISS library, which is used to perform semantic search.
+
+
+        .. attention::
+
+            * If you use this class, please make sure to cite the following paper:
+
+                .. code-block:: latex
+
+                    @article{johnson2019billion,
+                        title={Billion-scale similarity search with {GPUs}},
+                        author={Johnson, Jeff and Douze, Matthijs and J{\'e}gou, Herv{\'e}},
+                        journal={IEEE Transactions on Big Data},
+                        volume={7},
+                        number={3},
+                        pages={535--547},
+                        year={2019},
+                        publisher={IEEE}
+                    }
+
+            * The code is based on the following GitHub repository:
+                https://github.com/facebookresearch/faiss
+
+        Arguments:
+            model_name_or_path (str, optional): The name or path of the model to use. Defaults to 'facebook/bart-large'.
+            tokenizer_name_or_path (str, optional): The name or path of the tokenizer to use. Defaults to 'facebook/bart-large'.
+            device (str, optional): The device to use. Defaults to 'cpu'.
+
+        Returns:
+            None
+        """
+
+        # Set the device
+        self.device = device
+
+        # If the tokenizer is not specified, use the model name or path
+        if tokenizer_name_or_path is None:
+            tokenizer_name_or_path = model_name_or_path
+
+        # Load the tokenizer
+        if tokenizer_name_or_path == 'skt/kobert-base-v1':
+            # self.tokenizer = KoBERTTokenizer.from_pretrained(tokenizer_name_or_path)
+            self.tokenizer = XLNetTokenizer.from_pretrained(tokenizer_name_or_path)
+        else:
+            self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)
+
+        # Load the model
+        self.model = transformers.BertForMaskedLM.from_pretrained(model_name_or_path).to(self.device)
+
+        # Set the model to evaluation mode (since we do not need the gradients)
+        self.model.eval()
+
+        # Initialize the dataset
+        self.dataset = None
+
+
+    # Get the embeddings
+    def get_embeddings(self,
+        text: Union[str, List[str]],
+        embedding_type: str = 'last_hidden_state',
+        batch_size: int = 8,
+        num_workers: int = 4,
+        ) -> torch.Tensor:
+        """
+        This function returns the embeddings of the input text.
+
+        Arguments:
+            text (Union[str, List[str]]): The input text.
+            embedding_type (str, optional): The type of embedding to use. Defaults to 'last_hidden_state'.
+            batch_size (int, optional): The batch size to use. Defaults to 8.
+            num_workers (int, optional): The number of workers to use. Defaults to 4.
+
+        Returns:
+            torch.Tensor: The embeddings.
+
+        Raises:
+            ValueError: If the embedding type is invalid.
+        """
+
+        # Check if the embedding type is valid
+        if embedding_type not in ['last_hidden_state', 'mean_pooling']:
+            raise ValueError(f'Invalid embedding type: {embedding_type}. Only "last_hidden_state" and "mean_pooling" are supported.')
+
+        # Tokenize the input text
+        encoded_text = self.tokenizer(
+            text,
+            padding=True,
+            truncation=True,
+            return_tensors='pt',
+        )
+
+        # Move the input text to the device
+        encoded_text = encoded_text.to(self.device)
+
+        # encoded_inputs = {k: v.to(self.device) for k, v in encoded_inputs.items()}
+
+        # Get the embeddings
+        with torch.no_grad():
+            embeddings = self.model(**encoded_text)
+
+        # Get the last hidden state
+        embeddings = embeddings['logits']
+
+        embeddings = torch.sum(torch.log(1+torch.relu(embeddings)) * encoded_text['attention_mask'].unsqueeze(-1), dim=1)
+        e_norm = torch.nn.functional.normalize(embeddings, p=2, dim=1, eps=1e-8)
+
+        # Return the embeddings
+        return e_norm
+
+
 
 # FAISS word embedding library wrapper class
 class FaissSearch_WordEmbed(FaissSearch_SenEmbed):
     def __init__(self,
         model_name_or_path: str = 'klue/bert-base',
         tokenizer_name_or_path: str = 'klue/bert-base',
+        embedding_type: str = 'last_hidden_state',
         device: str = 'cpu',
         ) -> None:
         r"""
@@ -533,6 +651,7 @@ class FaissSearch_WordEmbed(FaissSearch_SenEmbed):
         # Load the model
         self.model = AutoModel.from_pretrained(model_name_or_path).to(self.device)
 
+
         # Set the model to evaluation mode (since we do not need the gradients)
         self.model.eval()
 
@@ -540,7 +659,6 @@ class FaissSearch_WordEmbed(FaissSearch_SenEmbed):
         self.dataset = None
 
 
-
     # Get the embeddings (new code)
     def get_doc_embeddings(self,
         #text: Union[str, List[str]],
@@ -564,7 +682,7 @@ class FaissSearch_WordEmbed(FaissSearch_SenEmbed):
         Raises:
             ValueError: If the embedding type is invalid.
         """
-
+
         # Check if the embedding type is valid
         if embedding_type not in ['last_hidden_state', 'mean_pooling']:
             raise ValueError(f'Invalid embedding type: {embedding_type}. Only "last_hidden_state" and "mean_pooling" are supported.')
@@ -577,12 +695,10 @@ class FaissSearch_WordEmbed(FaissSearch_SenEmbed):
             padding=False,
             truncation=True,
             return_tensors='pt',
-            add_special_tokens=False
+            add_special_tokens=False
         )
-
         # Move the input text to the device
         encoded_text = encoded_text.to(self.device)
-
         token_ids_list = encoded_text['input_ids'].tolist()
         token_ids_list = token_ids_list[0]
         for ids in token_ids_list:
@@ -591,19 +707,17 @@ class FaissSearch_WordEmbed(FaissSearch_SenEmbed):
             else:
                 if text not in ids_dict[ids]:
                     ids_dict[ids].append(sentence)
-
         # Get the embeddings
         embedding_dict = {}
         self.model.eval()
         for key, value in ids_dict.items():
             embed = self.model(torch.tensor([[key]]), output_hidden_states=True).hidden_states[-1][:,0,:].detach()
             embedding_dict[embed] = value
-
+
         # Return the embeddings
         return embedding_dict
 
 
-
     # Get the embeddings (new code)
     def get_query_embeddings(self,
         text: Union[str, List[str]],
@@ -657,7 +771,6 @@ class FaissSearch_WordEmbed(FaissSearch_SenEmbed):
         # Return the embeddings
         return embeds
 
-
 
     # Initialize the corpus using a dictionary or pandas DataFrame or HuggingFace Datasets object
     def initialize_corpus(self,
@@ -693,7 +806,7 @@ class FaissSearch_WordEmbed(FaissSearch_SenEmbed):
 
         # Set the embedding_type
         self.embedding_type = embedding_type
-
+
         # get embedding dict
         embedding_dict = self.get_doc_embeddings(text=corpus, embedding_type=self.embedding_type)
 
@@ -729,7 +842,6 @@ class FaissSearch_WordEmbed(FaissSearch_SenEmbed):
         return self.dataset
 
 
-
     # Search for the most similar elements in the dataset, given a query
     def search(self,
         query: str,
@@ -751,7 +863,6 @@ class FaissSearch_WordEmbed(FaissSearch_SenEmbed):
         The returned elements are dictionaries containing the text and the score.
         """
 
-
         # Get the embeddings of the query
         query_embeddings = self.get_query_embeddings([query], embedding_type=self.embedding_type)
 
@@ -768,6 +879,7 @@ class FaissSearch_WordEmbed(FaissSearch_SenEmbed):
             scores.append(score)
             similar_elts.append(similar_elt)
 
+
         text_list = []
         for item in similar_elts:
             for text in item['text']:
@@ -776,12 +888,10 @@ class FaissSearch_WordEmbed(FaissSearch_SenEmbed):
         flat_list = [sentence for sublist in text_list for sentence in sublist]
         count = Counter(flat_list)
         count = dict(count.most_common(5))
-
+
         sorted_dict = dict(sorted(count.items(), key=lambda x: x[1], reverse=True))
-
         # Convert the results to a pandas DataFrame
         results_df = pd.DataFrame({'text': sorted_dict.keys() , 'freq': sorted_dict.values()})
 
-
         # Return the most similar elements
         return results_df
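The largest change in faiss_search.py is the new FaissSearch_Splade wrapper, selected with mode='splade' in the FaissSearch dispatcher above. Its get_embeddings runs a BERT masked-LM head over the input and turns the vocabulary logits into a SPLADE-style sparse lexical vector: log(1 + ReLU(logits)) is summed over token positions under the attention mask and then L2-normalized. Below is a standalone sketch of that pooling using Hugging Face transformers directly (not the nltkor class itself); klue/bert-base is the default checkpoint named in the diff.

```python
# Standalone sketch of the SPLADE-style pooling introduced by FaissSearch_Splade.
import torch
from transformers import AutoTokenizer, BertForMaskedLM

tokenizer = AutoTokenizer.from_pretrained('klue/bert-base')
model = BertForMaskedLM.from_pretrained('klue/bert-base').eval()

texts = ["첫 번째 예시 문장입니다.", "두 번째 예시 문장입니다."]
enc = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')

with torch.no_grad():
    logits = model(**enc).logits            # (batch, seq_len, vocab_size)

# log(1 + relu) over the vocabulary logits, summed over tokens under the
# attention mask, then L2-normalized, mirroring get_embeddings above.
sparse = torch.sum(torch.log(1 + torch.relu(logits)) * enc['attention_mask'].unsqueeze(-1), dim=1)
sparse = torch.nn.functional.normalize(sparse, p=2, dim=1, eps=1e-8)
print(sparse.shape)                         # (2, vocab_size): one sparse vector per sentence
```

Indexing and search then go through the methods FaissSearch_Splade inherits from FaissSearch_SenEmbed (initialize_corpus, add_faiss_index, search), which this diff does not override for the splade class.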
nltkor.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: nltkor
-Version: 1.2.14
+Version: 1.2.16
 Home-page: https://modi.changwon.ac.kr/air_cwnu/nlp_tool/nltk_ko.git
 Keywords: string matching,pattern matching,edit distance,string to string correction,string to string matching,Levenshtein edit distance,Hamming distance,Damerau-Levenshtein distance,Jaro-Winkler distance,longest common subsequence,longest common substring,dynamic programming,approximate string matching,semantic similarity,natural language processing,NLP,information retrieval,rouge,sacrebleu,bertscore,bartscore,fasttext,glove,cosine similarity,Smith-Waterman,Needleman-Wunsch,Hirschberg,Karp-Rabin,Knuth-Morris-Pratt,Boyer-Moore
 Classifier: Programming Language :: Python :: 3.7
@@ -37,5 +37,6 @@ Requires-Dist: fasttext
 Dynamic: classifier
 Dynamic: home-page
 Dynamic: keywords
+Dynamic: license-file
 Dynamic: requires-dist
 Dynamic: requires-python
nltkor.egg-info/SOURCES.txt

@@ -126,6 +126,4 @@ nltkor/tag/libs/wsd/__pycache__/__init__.cpython-39.pyc
 nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-38.pyc
 nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-39.pyc
 nltkor/tokenize/__init__.py
-nltkor/tokenize/ko_tokenize.py
-test/test.py
-test/testespresso.py
+nltkor/tokenize/ko_tokenize.py