nltkor 1.2.14.tar.gz → 1.2.16.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (133)
  1. {nltkor-1.2.14 → nltkor-1.2.16}/PKG-INFO +3 -2
  2. {nltkor-1.2.14 → nltkor-1.2.16}/README.md +17 -10
  3. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/__init__.py +1 -1
  4. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/metrics/__init__.py +1 -1
  5. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/metrics/classical.py +12 -0
  6. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/metrics/entment.py +1 -1
  7. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/search/faiss_search.py +139 -29
  8. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor.egg-info/PKG-INFO +3 -2
  9. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor.egg-info/SOURCES.txt +1 -3
  10. {nltkor-1.2.14 → nltkor-1.2.16}/setup.py +1 -1
  11. nltkor-1.2.14/test/test.py +0 -282
  12. nltkor-1.2.14/test/testespresso.py +0 -19
  13. {nltkor-1.2.14 → nltkor-1.2.16}/LICENSE.txt +0 -0
  14. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/Kor_char.py +0 -0
  15. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/alignment/__init__.py +0 -0
  16. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/cider/__init__.py +0 -0
  17. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/cider/cider.py +0 -0
  18. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/cider/cider_scorer.py +0 -0
  19. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/distance/__init__.py +0 -0
  20. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/distance/wasserstein.py +0 -0
  21. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/etc.py +0 -0
  22. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/lazyimport.py +0 -0
  23. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/make_requirement.py +0 -0
  24. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/metrics/bartscore.py +0 -0
  25. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/metrics/bertscore.py +0 -0
  26. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/metrics/bleu_tensor.py +0 -0
  27. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/metrics/eval.py +0 -0
  28. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/metrics/mauve.py +0 -0
  29. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/metrics/mauve_utils.py +0 -0
  30. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/misc/__init__.py +0 -0
  31. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/misc/string2string_basic_functions.py +0 -0
  32. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/misc/string2string_default_tokenizer.py +0 -0
  33. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/misc/string2string_hash_functions.py +0 -0
  34. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/misc/string2string_word_embeddings.py +0 -0
  35. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/search/__init__.py +0 -0
  36. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/search/classical.py +0 -0
  37. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/search/kobert_tokenizer.py +0 -0
  38. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/sejong/__init__.py +0 -0
  39. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/sejong/__pycache__/__init__.cpython-38.pyc +0 -0
  40. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/sejong/__pycache__/__init__.cpython-39.pyc +0 -0
  41. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/sejong/__pycache__/sejong_download.cpython-38.pyc +0 -0
  42. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/sejong/__pycache__/sejong_download.cpython-39.pyc +0 -0
  43. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/sejong/__pycache__/ssem.cpython-38.pyc +0 -0
  44. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/sejong/__pycache__/ssem.cpython-39.pyc +0 -0
  45. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/sejong/ch.py +0 -0
  46. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/sejong/dict_semClassNum.txt +0 -0
  47. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/sejong/layer.txt +0 -0
  48. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/sejong/sejong_download.py +0 -0
  49. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/sejong/ssem.py +0 -0
  50. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/similarity/__init__.py +0 -0
  51. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/similarity/bartscore____.py +0 -0
  52. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/similarity/bertscore____.py +0 -0
  53. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/similarity/classical.py +0 -0
  54. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/similarity/cosine_similarity.py +0 -0
  55. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/__init__.py +0 -0
  56. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/__pycache__/__init__.cpython-38.pyc +0 -0
  57. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/__pycache__/__init__.cpython-39.pyc +0 -0
  58. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/__pycache__/espresso_tag.cpython-38.pyc +0 -0
  59. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/__pycache__/espresso_tag.cpython-39.pyc +0 -0
  60. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/espresso_tag.py +0 -0
  61. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/__init__.py +0 -0
  62. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/__pycache__/__init__.cpython-38.pyc +0 -0
  63. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/__pycache__/__init__.cpython-39.pyc +0 -0
  64. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/__pycache__/attributes.cpython-38.pyc +0 -0
  65. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/__pycache__/attributes.cpython-39.pyc +0 -0
  66. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/__pycache__/config.cpython-38.pyc +0 -0
  67. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/__pycache__/config.cpython-39.pyc +0 -0
  68. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/__pycache__/metadata.cpython-38.pyc +0 -0
  69. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/__pycache__/metadata.cpython-39.pyc +0 -0
  70. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/__pycache__/reader.cpython-38.pyc +0 -0
  71. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/__pycache__/reader.cpython-39.pyc +0 -0
  72. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/__pycache__/taggers.cpython-38.pyc +0 -0
  73. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/__pycache__/taggers.cpython-39.pyc +0 -0
  74. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/__pycache__/utils.cpython-38.pyc +0 -0
  75. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/__pycache__/utils.cpython-39.pyc +0 -0
  76. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/__pycache__/word_dictionary.cpython-38.pyc +0 -0
  77. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/__pycache__/word_dictionary.cpython-39.pyc +0 -0
  78. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/arguments.py +0 -0
  79. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/attributes.py +0 -0
  80. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/config.py +0 -0
  81. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/metadata.py +0 -0
  82. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/ner/__init__.py +0 -0
  83. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/ner/__pycache__/__init__.cpython-38.pyc +0 -0
  84. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/ner/__pycache__/__init__.cpython-39.pyc +0 -0
  85. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-38.pyc +0 -0
  86. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-39.pyc +0 -0
  87. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/ner/macmorphoreader.py +0 -0
  88. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/ner/ner_reader.py +0 -0
  89. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/network.c +0 -0
  90. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/network.pyx +0 -0
  91. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/networkconv.pyx +0 -0
  92. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/networkdependencyconv.pyx +0 -0
  93. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/parse/__init__.py +0 -0
  94. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/parse/__pycache__/__init__.cpython-38.pyc +0 -0
  95. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/parse/__pycache__/__init__.cpython-39.pyc +0 -0
  96. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-38.pyc +0 -0
  97. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-39.pyc +0 -0
  98. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/parse/parse_reader.py +0 -0
  99. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/pos/__init__.py +0 -0
  100. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/pos/__pycache__/__init__.cpython-38.pyc +0 -0
  101. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/pos/__pycache__/__init__.cpython-39.pyc +0 -0
  102. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-38.pyc +0 -0
  103. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-39.pyc +0 -0
  104. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/pos/macmorphoreader.py +0 -0
  105. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/pos/pos_reader.py +0 -0
  106. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/reader.py +0 -0
  107. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/srl/__init__.py +0 -0
  108. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/srl/__pycache__/__init__.cpython-38.pyc +0 -0
  109. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/srl/__pycache__/__init__.cpython-39.pyc +0 -0
  110. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-38.pyc +0 -0
  111. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-39.pyc +0 -0
  112. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/srl/__pycache__/train_srl.cpython-38.pyc +0 -0
  113. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/srl/__pycache__/train_srl.cpython-39.pyc +0 -0
  114. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/srl/__srl_reader_.py +0 -0
  115. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/srl/srl_reader.py +0 -0
  116. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/srl/train_srl.py +0 -0
  117. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/taggers.py +0 -0
  118. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/utils.py +0 -0
  119. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/word_dictionary.py +0 -0
  120. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/wsd/__init__.py +0 -0
  121. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/wsd/__pycache__/__init__.cpython-38.pyc +0 -0
  122. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/wsd/__pycache__/__init__.cpython-39.pyc +0 -0
  123. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-38.pyc +0 -0
  124. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-39.pyc +0 -0
  125. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/wsd/macmorphoreader.py +0 -0
  126. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tag/libs/wsd/wsd_reader.py +0 -0
  127. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tokenize/__init__.py +0 -0
  128. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/tokenize/ko_tokenize.py +0 -0
  129. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor/trans.py +0 -0
  130. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor.egg-info/dependency_links.txt +0 -0
  131. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor.egg-info/requires.txt +0 -0
  132. {nltkor-1.2.14 → nltkor-1.2.16}/nltkor.egg-info/top_level.txt +0 -0
  133. {nltkor-1.2.14 → nltkor-1.2.16}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
- Metadata-Version: 2.2
+ Metadata-Version: 2.4
  Name: nltkor
- Version: 1.2.14
+ Version: 1.2.16
  Home-page: https://modi.changwon.ac.kr/air_cwnu/nlp_tool/nltk_ko.git
  Keywords: string matching,pattern matching,edit distance,string to string correction,string to string matching,Levenshtein edit distance,Hamming distance,Damerau-Levenshtein distance,Jaro-Winkler distance,longest common subsequence,longest common substring,dynamic programming,approximate string matching,semantic similarity,natural language processing,NLP,information retrieval,rouge,sacrebleu,bertscore,bartscore,fasttext,glove,cosine similarity,Smith-Waterman,Needleman-Wunsch,Hirschberg,Karp-Rabin,Knuth-Morris-Pratt,Boyer-Moore
  Classifier: Programming Language :: Python :: 3.7
@@ -37,5 +37,6 @@ Requires-Dist: fasttext
  Dynamic: classifier
  Dynamic: home-page
  Dynamic: keywords
+ Dynamic: license-file
  Dynamic: requires-dist
  Dynamic: requires-python
@@ -777,19 +777,19 @@ Time: 0.05374705195426941, memory: 1409.9
  #### 5.10.1 BLEU for tensor
  - Each score value is returned as a tensor.
  ```python
- >>> from nltk.translate.bleu_score import *
- >>> from nltko.tokenize import Ko_tokenize
+ >>> from nltkor.metrics import DefaultMetric
+ >>> import torch
  >>> can=torch.tensor([[1,2,3,4,5],[3,4,5,6,4]])
  >>> ref=torch.tensor([[1,2,3,4,5],[3,5,6,7,10]])
- >>> bleu_tensor(ref,can,1)
+ >>> DefaultMetric().bleu_tensor(ref,can,1)
  tensor(0.8000)
- >>> bleu_tensor(ref,can,2)
+ >>> DefaultMetric().bleu_tensor(ref,can,2)
  tensor(0.6250)
- >>> bleu_tensor(ref,can,3)
+ >>> DefaultMetric().bleu_tensor(ref,can,3)
  tensor(0.5000)
- >>> bleu_tensor(ref,can,4)
+ >>> DefaultMetric().bleu_tensor(ref,can,4)
  tensor(0.5000)
- >>> bleu_tensor(ref,can)
+ >>> DefaultMetric().bleu_tensor(ref,can)
  tensor(0.5946)

  ```
@@ -910,11 +910,18 @@ TF-IDF를 n-gram에 대한 가중치로 계산하고 참조 캡션과 생성 캡
  0.6303797468354431
  ```

- #### 5.14 EntMent
+ #### 5.14 EMR(Entity Mention Recall)
+

- - EntMent (Entity Mention Recall)
+ A metric that evaluates how well the summarized text retains the important entities that appear in the reference document.

- : The proportion of unique entities in the summarized text that appear in the reference
+ ```python
+ >>> # -*- coding: utf-8 -*-
+ >>> from nltkor.metrics import EntMent
+ >>> EntMent().entity("국립창원대학교(총장 박민원)가 사천우주항공캠퍼스 개교와 함께 2025학년도 사천우주항공공학부 입학식을 7일 오전 11시 사천우주항공캠퍼스에서 열었다.이날 행사에는 박민원 총장을 비롯해 국민의힘 서천호 국회의원(사천·남해·하동), 윤영빈 우주항공청장, 박동식 사천시장, 김규헌 사천시의회 의장, 지역 유관기관 관계자들과 신입생 및 가족들이 참석했다. 글로컬대학30사업 선정에 따라 국립창원대와 통합을 추진 중인 경남도립거창대학, 경남도립남해대학 관계자도 함께 자리했다.행사는 1부 현판 제막식과 2부 입학식으로 진행됐으며, 박동식 사천시장은 신입생들에게 축하 선물로 금배지를 전달했고, 박민원 총장은 캠퍼스 설립에 기여한 유공자들에게 표창장을 수여했다.","국립창원대학교는 4월 7일 사천우주항공캠퍼스에서 2025학년도 사천우주항공공학부 입학식을 개최했다. 이날 행사에는 박민원 총장, 서천호 국회의원, 윤영빈 우주항공청장, 박동식 사천시장 등 주요 인사와 신입생 및 가족들이 참석했으며, 글로컬대학30사업과 관련된 거창대학·남해대학 관계자들도 함께했다. 행사는 현판 제막식과 입학식으로 나뉘어 진행되었고, 신입생들에게는 금배지가, 캠퍼스 설립 유공자들에게는 표창장이 수여되었다.")
+ Downloading Espresso5 model...
+ 0.8888888888888888
+ ```


  ### 6 Extended evaluation functions
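Backing up to the EMR example just above: the 0.8888888888888888 it prints is a plain recall over unique entities. A minimal sketch with hypothetical entity sets (the real class extracts them from both texts with the Espresso tagger, so these sets stand in for its output):

```python
# Minimal sketch of entity mention recall; the entity sets are hypothetical
# stand-ins for what EspressoTagger would extract from the two texts.
doc_entities = {"A", "B", "C", "D", "E", "F", "G", "H", "I"}   # 9 unique entities in the document
summary_entities = {"A", "B", "C", "D", "E", "F", "G", "H"}    # 8 of them survive in the summary
print(len(doc_entities & summary_entities) / len(doc_entities))  # 0.8888888888888888 (8/9)
```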
@@ -13,4 +13,4 @@ from nltkor import trans
  from nltkor import Kor_char
  from nltkor import etc

- __version__ = '1.2.14'
+ __version__ = '1.2.16'
@@ -52,7 +52,7 @@ from nltk.metrics.aline import align
  from nltkor.metrics.eval import StringMetric
  """
  from nltkor.metrics.classical import DefaultMetric
- from nltkor.metrics.entment import EntMent
+ from nltkor.metrics.entment import EMR
  from nltkor.metrics.bleu_tensor import *
  #DefaultMetric = lazy_import.lazy_callable("nltkor.metrics.classical.DefaultMetric")
  #Mauve = lazy_import.lazy_callable("nltkor.metrics.mauve.Mauve")
@@ -7,6 +7,7 @@ from copy import deepcopy
  import itertools
  import torch
  import time
+ import math
  from nltk.translate.bleu_score import *
  from nltk.metrics import confusionmatrix
  from collections import defaultdict
@@ -415,6 +416,17 @@ class DefaultMetric:
          elif n==4:
              return self.bleu(reference,candiate,(0,0,0,1), smoothing_function=smoothing_function)

+     def bleu_tensor(self,reference,candidate,n=0, smoothing_function=None):
+
+         if n: weights = tuple(1 if i == n-1 else 0 for i in range(4))
+         else: weights = (0.25, 0.25, 0.25, 0.25)
+
+         reference=reference.unsqueeze(1)
+         reference=reference.numpy()
+         candidate=candidate.numpy()
+         return torch.tensor(corpus_bleu(reference,candidate,weights,smoothing_function=smoothing_function))
+
+



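The weights tuple is what selects the n-gram order, and the `unsqueeze(1)` wraps each reference row so `corpus_bleu` sees exactly one reference list per candidate. A standalone sketch of the weight mapping (illustrative only, no nltkor required):

```python
# Sketch of the n -> weights mapping used by the new bleu_tensor method.
# n = 1..4 picks out a single n-gram precision; n = 0 falls back to the
# uniform 4-gram weights that corpus_bleu uses by default.
for n in range(5):
    weights = tuple(1 if i == n - 1 else 0 for i in range(4)) if n else (0.25, 0.25, 0.25, 0.25)
    print(n, weights)
# 0 (0.25, 0.25, 0.25, 0.25)
# 1 (1, 0, 0, 0)
# 2 (0, 1, 0, 0)
# 3 (0, 0, 1, 0)
# 4 (0, 0, 0, 1)
```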
@@ -1,6 +1,6 @@
  from nltkor.tag import EspressoTagger

- class EntMent :
+ class EMR :

      def __init__(self):
          self.entity_list = []
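Since only the class name changes here, callers upgrade by switching the import. A hedged sketch, assuming the `entity()` method from the README example and placeholder texts:

```python
# Usage sketch after the EntMent -> EMR rename; document_text and summary_text
# are placeholders, and entity() is the method shown in the README example.
from nltkor.metrics import EMR

document_text = "..."   # source document (placeholder)
summary_text = "..."    # system summary (placeholder)
score = EMR().entity(document_text, summary_text)  # fraction of document entities kept
```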
@@ -33,11 +33,12 @@ SOFTWARE.
  This module contains a wrapper for the Faiss library by Facebook AI Research.
  """

- from collections import Counter
+ from collections import Counter
  from typing import List, Union, Optional, Dict, Any
  import os
  import copy
  import logging
+ import transformers
  os.environ["TOKENIZERS_PARALLELISM"] = "false"

  from nltkor.make_requirement import make_requirement
@@ -70,24 +71,28 @@ class FaissSearch:
                 mode = None,
                 model_name_or_path: str = 'klue/bert-base',
                 tokenizer_name_or_path: str = 'klue/bert-base',
+                embedding_type: str = 'last_hidden_state',
                 device: str = 'cpu'
                 ) -> None:
          if mode == 'sentence':
-             return FaissSearch_SenEmbed(model_name_or_path)
+             return FaissSearch_SenEmbed(model_name_or_path=model_name_or_path, embedding_type=embedding_type)
          elif mode == 'word':
-             return FaissSearch_WordEmbed(model_name_or_path)
+             return FaissSearch_WordEmbed(model_name_or_path=model_name_or_path, embedding_type=embedding_type)
+         elif mode == 'splade':
+             return FaissSearch_Splade(model_name_or_path=model_name_or_path, embedding_type=embedding_type)
          else:
-             raise ValueError("choice 'sentence' or 'word'")
+             raise ValueError("choice 'sentence' or 'word' or 'splade'")
+


- # FAISS original library wrapper class
  class FaissSearch_SenEmbed:
      def __init__(self,
                   model_name_or_path: str = 'klue/bert-base',
                   tokenizer_name_or_path: str = 'klue/bert-base',
+                  embedding_type: str = 'last_hidden_state',
                   device: str = 'cpu',
                   ) -> None:
-         r"""
+         """
          This function initializes the wrapper for the FAISS library, which is used to perform semantic search.


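The factory now dispatches on three mode strings. A hedged construction sketch (the import path assumes the module layout shown in the file list; 'klue/bert-base' is just the default from the signature):

```python
# Sketch of the three dispatch modes; FaissSearch hands back one of the
# wrapper classes depending on `mode`.
from nltkor.search.faiss_search import FaissSearch

dense = FaissSearch(mode='sentence', model_name_or_path='klue/bert-base')
wordwise = FaissSearch(mode='word', model_name_or_path='klue/bert-base')
sparse = FaissSearch(mode='splade', model_name_or_path='klue/bert-base')  # new in 1.2.16
# Any other mode raises ValueError("choice 'sentence' or 'word' or 'splade'")
```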
@@ -143,8 +148,7 @@ class FaissSearch_SenEmbed:
          # Initialize the dataset
          self.dataset = None

-
-
+
      # Auxiliary function to get the last hidden state
      def get_last_hidden_state(self,
                                embeddings: torch.Tensor,
@@ -166,7 +170,6 @@
          return last_hidden_state[:, 0, :]


-
      # Auxiliary function to get the mean pooling
      def get_mean_pooling(self,
                           embeddings: torch.Tensor,
@@ -244,7 +247,6 @@
          return embeddings


-
      # Add FAISS index
      def add_faiss_index(self,
                          column_name: str = 'embeddings',
@@ -309,7 +311,6 @@
          self.dataset.save_faiss_index(index_name=index_name, file=file_path)


-
      def load_faiss_index(self,
                           index_name: str,
                           file_path: str,
@@ -339,7 +340,6 @@
          self.dataset.load_faiss_index(index_name=index_name, file=file_path, device=device)


-
      # Initialize the corpus using a dictionary or pandas DataFrame or HuggingFace Datasets object
      def initialize_corpus(self,
                            corpus: Union[Dict[str, List[str]], pd.DataFrame, Dataset],
@@ -407,7 +407,6 @@
          return self.dataset


-
      # Initialize the dataset using a JSON file
      def load_dataset_from_json(self,
                                 json_path: str,
@@ -429,7 +428,6 @@
          return self.dataset


-
      # Search for the most similar elements in the dataset, given a query
      def search(self,
                 query: str,
@@ -475,12 +473,132 @@



+ # FAISS Splade + ICT library wrapper class
+ class FaissSearch_Splade(FaissSearch_SenEmbed):
+     def __init__(self,
+                  model_name_or_path: str = 'klue/bert-base',
+                  tokenizer_name_or_path: str = 'klue/bert-base',
+                  embedding_type: str = 'last_hidden_state',
+                  device: str = 'cpu',
+                  ) -> None:
+         r"""
+         This function initializes the wrapper for the FAISS library, which is used to perform semantic search.
+
+
+         .. attention::
+
+             * If you use this class, please make sure to cite the following paper:
+
+                 .. code-block:: latex
+
+                     @article{johnson2019billion,
+                         title={Billion-scale similarity search with {GPUs}},
+                         author={Johnson, Jeff and Douze, Matthijs and J{\'e}gou, Herv{\'e}},
+                         journal={IEEE Transactions on Big Data},
+                         volume={7},
+                         number={3},
+                         pages={535--547},
+                         year={2019},
+                         publisher={IEEE}
+                     }
+
+             * The code is based on the following GitHub repository:
+                 https://github.com/facebookresearch/faiss
+
+         Arguments:
+             model_name_or_path (str, optional): The name or path of the model to use. Defaults to 'facebook/bart-large'.
+             tokenizer_name_or_path (str, optional): The name or path of the tokenizer to use. Defaults to 'facebook/bart-large'.
+             device (str, optional): The device to use. Defaults to 'cpu'.
+
+         Returns:
+             None
+         """
+
+         # Set the device
+         self.device = device
+
+         # If the tokenizer is not specified, use the model name or path
+         if tokenizer_name_or_path is None:
+             tokenizer_name_or_path = model_name_or_path
+
+         # Load the tokenizer
+         if tokenizer_name_or_path == 'skt/kobert-base-v1':
+             # self.tokenizer = KoBERTTokenizer.from_pretrained(tokenizer_name_or_path)
+             self.tokenizer = XLNetTokenizer.from_pretrained(tokenizer_name_or_path)
+         else:
+             self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)
+
+         # Load the model
+         self.model = transformers.BertForMaskedLM.from_pretrained(model_name_or_path).to(self.device)
+
+         # Set the model to evaluation mode (since we do not need the gradients)
+         self.model.eval()
+
+         # Initialize the dataset
+         self.dataset = None
+
+
+     # Get the embeddings
+     def get_embeddings(self,
+                        text: Union[str, List[str]],
+                        embedding_type: str = 'last_hidden_state',
+                        batch_size: int = 8,
+                        num_workers: int = 4,
+                        ) -> torch.Tensor:
+         """
+         This function returns the embeddings of the input text.
+
+         Arguments:
+             text (Union[str, List[str]]): The input text.
+             embedding_type (str, optional): The type of embedding to use. Defaults to 'last_hidden_state'.
+             batch_size (int, optional): The batch size to use. Defaults to 8.
+             num_workers (int, optional): The number of workers to use. Defaults to 4.
+
+         Returns:
+             torch.Tensor: The embeddings.
+
+         Raises:
+             ValueError: If the embedding type is invalid.
+         """
+
+         # Check if the embedding type is valid
+         if embedding_type not in ['last_hidden_state', 'mean_pooling']:
+             raise ValueError(f'Invalid embedding type: {embedding_type}. Only "last_hidden_state" and "mean_pooling" are supported.')
+
+         # Tokenize the input text
+         encoded_text = self.tokenizer(
+             text,
+             padding=True,
+             truncation=True,
+             return_tensors='pt',
+         )
+
+         # Move the input text to the device
+         encoded_text = encoded_text.to(self.device)
+
+         # encoded_inputs = {k: v.to(self.device) for k, v in encoded_inputs.items()}
+
+         # Get the embeddings
+         with torch.no_grad():
+             embeddings = self.model(**encoded_text)
+
+         # Get the last hidden state
+         embeddings = embeddings['logits']
+
+         embeddings = torch.sum(torch.log(1+torch.relu(embeddings)) * encoded_text['attention_mask'].unsqueeze(-1), dim=1)
+         e_norm = torch.nn.functional.normalize(embeddings, p=2, dim=1, eps=1e-8)
+
+         # Return the embeddings
+         return e_norm
+
+

  # FAISS word embedding library wrapper class
  class FaissSearch_WordEmbed(FaissSearch_SenEmbed):
      def __init__(self,
                   model_name_or_path: str = 'klue/bert-base',
                   tokenizer_name_or_path: str = 'klue/bert-base',
+                  embedding_type: str = 'last_hidden_state',
                   device: str = 'cpu',
                   ) -> None:
          r"""
@@ -533,6 +651,7 @@ class FaissSearch_WordEmbed(FaissSearch_SenEmbed):
          # Load the model
          self.model = AutoModel.from_pretrained(model_name_or_path).to(self.device)

+
          # Set the model to evaluation mode (since we do not need the gradients)
          self.model.eval()

@@ -540,7 +659,6 @@
          self.dataset = None


-
      # Get the embeddings (new code)
      def get_doc_embeddings(self,
                             #text: Union[str, List[str]],
@@ -564,7 +682,7 @@
          Raises:
              ValueError: If the embedding type is invalid.
          """
-
+
          # Check if the embedding type is valid
          if embedding_type not in ['last_hidden_state', 'mean_pooling']:
              raise ValueError(f'Invalid embedding type: {embedding_type}. Only "last_hidden_state" and "mean_pooling" are supported.')
@@ -577,12 +695,10 @@
              padding=False,
              truncation=True,
              return_tensors='pt',
-             add_special_tokens=False,
+             add_special_tokens=False
          )
-
          # Move the input text to the device
          encoded_text = encoded_text.to(self.device)
-
          token_ids_list = encoded_text['input_ids'].tolist()
          token_ids_list = token_ids_list[0]
          for ids in token_ids_list:
@@ -591,19 +707,17 @@
              else:
                  if text not in ids_dict[ids]:
                      ids_dict[ids].append(sentence)
-
          # Get the embeddings
          embedding_dict = {}
          self.model.eval()
          for key, value in ids_dict.items():
              embed = self.model(torch.tensor([[key]]), output_hidden_states=True).hidden_states[-1][:,0,:].detach()
              embedding_dict[embed] = value
-
+
          # Return the embeddings
          return embedding_dict


-
      # Get the embeddings (new code)
      def get_query_embeddings(self,
                               text: Union[str, List[str]],
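The loop in `get_doc_embeddings` is building an inverted map from token id to the sentences containing it before embedding each id once. A toy reproduction of that bookkeeping (the sentences are made up and 'klue/bert-base' is just the class default; requires transformers and a model download):

```python
# Toy reproduction of the ids_dict bookkeeping in get_doc_embeddings above.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('klue/bert-base')
ids_dict = {}
for sentence in ["자연어 처리 예문", "자연어 검색 예문"]:
    for token_id in tokenizer(sentence, add_special_tokens=False)['input_ids']:
        ids_dict.setdefault(token_id, [])
        if sentence not in ids_dict[token_id]:
            ids_dict[token_id].append(sentence)
# Each token id now maps to every sentence it occurs in; the wrapper then runs
# the encoder once per id and keys the resulting embedding by that id.
print(len(ids_dict))
```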
@@ -657,7 +771,6 @@
          # Return the embeddings
          return embeds

-

      # Initialize the corpus using a dictionary or pandas DataFrame or HuggingFace Datasets object
      def initialize_corpus(self,
@@ -693,7 +806,7 @@

          # Set the embedding_type
          self.embedding_type = embedding_type
-
+
          # get embedding dict
          embedding_dict = self.get_doc_embeddings(text=corpus, embedding_type=self.embedding_type)

@@ -729,7 +842,6 @@
          return self.dataset


-
      # Search for the most similar elements in the dataset, given a query
      def search(self,
                 query: str,
@@ -751,7 +863,6 @@
          The returned elements are dictionaries containing the text and the score.
          """

-
          # Get the embeddings of the query
          query_embeddings = self.get_query_embeddings([query], embedding_type=self.embedding_type)

@@ -768,6 +879,7 @@
              scores.append(score)
              similar_elts.append(similar_elt)

+
          text_list = []
          for item in similar_elts:
              for text in item['text']:
@@ -776,12 +888,10 @@
          flat_list = [sentence for sublist in text_list for sentence in sublist]
          count = Counter(flat_list)
          count = dict(count.most_common(5))
-
+
          sorted_dict = dict(sorted(count.items(), key=lambda x: x[1], reverse=True))
-
          # Convert the results to a pandas DataFrame
          results_df = pd.DataFrame({'text': sorted_dict.keys() , 'freq': sorted_dict.values()})

-
          # Return the most similar elements
          return results_df
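The tail of word-mode `search()` ranks sentences purely by how many of the query-token buckets they fall into. The Counter logic in isolation (the nested `text_list` below is a made-up stand-in for the FAISS hits):

```python
# The frequency aggregation from the word-mode search(), in isolation.
from collections import Counter

text_list = [["sent A", "sent B"], ["sent A", "sent C"], ["sent A"]]
flat_list = [sentence for sublist in text_list for sentence in sublist]
count = dict(Counter(flat_list).most_common(5))   # keep the top-5 sentences
sorted_dict = dict(sorted(count.items(), key=lambda x: x[1], reverse=True))
print(sorted_dict)  # {'sent A': 3, 'sent B': 1, 'sent C': 1}
```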
@@ -1,6 +1,6 @@
- Metadata-Version: 2.2
+ Metadata-Version: 2.4
  Name: nltkor
- Version: 1.2.14
+ Version: 1.2.16
  Home-page: https://modi.changwon.ac.kr/air_cwnu/nlp_tool/nltk_ko.git
  Keywords: string matching,pattern matching,edit distance,string to string correction,string to string matching,Levenshtein edit distance,Hamming distance,Damerau-Levenshtein distance,Jaro-Winkler distance,longest common subsequence,longest common substring,dynamic programming,approximate string matching,semantic similarity,natural language processing,NLP,information retrieval,rouge,sacrebleu,bertscore,bartscore,fasttext,glove,cosine similarity,Smith-Waterman,Needleman-Wunsch,Hirschberg,Karp-Rabin,Knuth-Morris-Pratt,Boyer-Moore
  Classifier: Programming Language :: Python :: 3.7
@@ -37,5 +37,6 @@ Requires-Dist: fasttext
  Dynamic: classifier
  Dynamic: home-page
  Dynamic: keywords
+ Dynamic: license-file
  Dynamic: requires-dist
  Dynamic: requires-python
@@ -126,6 +126,4 @@ nltkor/tag/libs/wsd/__pycache__/__init__.cpython-39.pyc
  nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-38.pyc
  nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-39.pyc
  nltkor/tokenize/__init__.py
- nltkor/tokenize/ko_tokenize.py
- test/test.py
- test/testespresso.py
+ nltkor/tokenize/ko_tokenize.py
@@ -66,7 +66,7 @@ module1 = cythonize([

  setup(
      name='nltkor',
-     version='1.2.14',
+     version='1.2.16',
      url='https://modi.changwon.ac.kr/air_cwnu/nlp_tool/nltk_ko.git',
      packages=find_packages(exclude=[]),
      python_requires='>=3.7',