nltkor 1.2.15__tar.gz → 1.2.16__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (133)
  1. {nltkor-1.2.15 → nltkor-1.2.16}/PKG-INFO +1 -1
  2. {nltkor-1.2.15 → nltkor-1.2.16}/README.md +10 -3
  3. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/__init__.py +1 -1
  4. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/metrics/__init__.py +1 -1
  5. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/metrics/entment.py +1 -1
  6. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/search/faiss_search.py +139 -29
  7. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor.egg-info/PKG-INFO +1 -1
  8. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor.egg-info/SOURCES.txt +1 -3
  9. {nltkor-1.2.15 → nltkor-1.2.16}/setup.py +1 -1
  10. nltkor-1.2.15/test/test.py +0 -282
  11. nltkor-1.2.15/test/testespresso.py +0 -19
  12. {nltkor-1.2.15 → nltkor-1.2.16}/LICENSE.txt +0 -0
  13. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/Kor_char.py +0 -0
  14. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/alignment/__init__.py +0 -0
  15. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/cider/__init__.py +0 -0
  16. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/cider/cider.py +0 -0
  17. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/cider/cider_scorer.py +0 -0
  18. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/distance/__init__.py +0 -0
  19. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/distance/wasserstein.py +0 -0
  20. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/etc.py +0 -0
  21. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/lazyimport.py +0 -0
  22. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/make_requirement.py +0 -0
  23. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/metrics/bartscore.py +0 -0
  24. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/metrics/bertscore.py +0 -0
  25. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/metrics/bleu_tensor.py +0 -0
  26. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/metrics/classical.py +0 -0
  27. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/metrics/eval.py +0 -0
  28. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/metrics/mauve.py +0 -0
  29. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/metrics/mauve_utils.py +0 -0
  30. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/misc/__init__.py +0 -0
  31. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/misc/string2string_basic_functions.py +0 -0
  32. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/misc/string2string_default_tokenizer.py +0 -0
  33. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/misc/string2string_hash_functions.py +0 -0
  34. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/misc/string2string_word_embeddings.py +0 -0
  35. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/search/__init__.py +0 -0
  36. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/search/classical.py +0 -0
  37. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/search/kobert_tokenizer.py +0 -0
  38. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/sejong/__init__.py +0 -0
  39. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/sejong/__pycache__/__init__.cpython-38.pyc +0 -0
  40. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/sejong/__pycache__/__init__.cpython-39.pyc +0 -0
  41. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/sejong/__pycache__/sejong_download.cpython-38.pyc +0 -0
  42. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/sejong/__pycache__/sejong_download.cpython-39.pyc +0 -0
  43. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/sejong/__pycache__/ssem.cpython-38.pyc +0 -0
  44. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/sejong/__pycache__/ssem.cpython-39.pyc +0 -0
  45. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/sejong/ch.py +0 -0
  46. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/sejong/dict_semClassNum.txt +0 -0
  47. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/sejong/layer.txt +0 -0
  48. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/sejong/sejong_download.py +0 -0
  49. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/sejong/ssem.py +0 -0
  50. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/similarity/__init__.py +0 -0
  51. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/similarity/bartscore____.py +0 -0
  52. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/similarity/bertscore____.py +0 -0
  53. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/similarity/classical.py +0 -0
  54. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/similarity/cosine_similarity.py +0 -0
  55. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/tag/__init__.py +0 -0
  56. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/tag/__pycache__/__init__.cpython-38.pyc +0 -0
  57. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/tag/__pycache__/__init__.cpython-39.pyc +0 -0
  58. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/tag/__pycache__/espresso_tag.cpython-38.pyc +0 -0
  59. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/tag/__pycache__/espresso_tag.cpython-39.pyc +0 -0
  60. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/tag/espresso_tag.py +0 -0
  61. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/tag/libs/__init__.py +0 -0
  62. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/tag/libs/__pycache__/__init__.cpython-38.pyc +0 -0
  63. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/tag/libs/__pycache__/__init__.cpython-39.pyc +0 -0
  64. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/tag/libs/__pycache__/attributes.cpython-38.pyc +0 -0
  65. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/tag/libs/__pycache__/attributes.cpython-39.pyc +0 -0
  66. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/tag/libs/__pycache__/config.cpython-38.pyc +0 -0
  67. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/tag/libs/__pycache__/config.cpython-39.pyc +0 -0
  68. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/tag/libs/__pycache__/metadata.cpython-38.pyc +0 -0
  69. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/tag/libs/__pycache__/metadata.cpython-39.pyc +0 -0
  70. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/tag/libs/__pycache__/reader.cpython-38.pyc +0 -0
  71. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/tag/libs/__pycache__/reader.cpython-39.pyc +0 -0
  72. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/tag/libs/__pycache__/taggers.cpython-38.pyc +0 -0
  73. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/tag/libs/__pycache__/taggers.cpython-39.pyc +0 -0
  74. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/tag/libs/__pycache__/utils.cpython-38.pyc +0 -0
  75. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/tag/libs/__pycache__/utils.cpython-39.pyc +0 -0
  76. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/tag/libs/__pycache__/word_dictionary.cpython-38.pyc +0 -0
  77. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/tag/libs/__pycache__/word_dictionary.cpython-39.pyc +0 -0
  78. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/tag/libs/arguments.py +0 -0
  79. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/tag/libs/attributes.py +0 -0
  80. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/tag/libs/config.py +0 -0
  81. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/tag/libs/metadata.py +0 -0
  82. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/tag/libs/ner/__init__.py +0 -0
  83. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/tag/libs/ner/__pycache__/__init__.cpython-38.pyc +0 -0
  84. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/tag/libs/ner/__pycache__/__init__.cpython-39.pyc +0 -0
  85. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-38.pyc +0 -0
  86. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-39.pyc +0 -0
  87. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/tag/libs/ner/macmorphoreader.py +0 -0
  88. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/tag/libs/ner/ner_reader.py +0 -0
  89. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/tag/libs/network.c +0 -0
  90. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/tag/libs/network.pyx +0 -0
  91. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/tag/libs/networkconv.pyx +0 -0
  92. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/tag/libs/networkdependencyconv.pyx +0 -0
  93. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/tag/libs/parse/__init__.py +0 -0
  94. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/tag/libs/parse/__pycache__/__init__.cpython-38.pyc +0 -0
  95. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/tag/libs/parse/__pycache__/__init__.cpython-39.pyc +0 -0
  96. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-38.pyc +0 -0
  97. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-39.pyc +0 -0
  98. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/tag/libs/parse/parse_reader.py +0 -0
  99. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/tag/libs/pos/__init__.py +0 -0
  100. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/tag/libs/pos/__pycache__/__init__.cpython-38.pyc +0 -0
  101. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/tag/libs/pos/__pycache__/__init__.cpython-39.pyc +0 -0
  102. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-38.pyc +0 -0
  103. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-39.pyc +0 -0
  104. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/tag/libs/pos/macmorphoreader.py +0 -0
  105. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/tag/libs/pos/pos_reader.py +0 -0
  106. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/tag/libs/reader.py +0 -0
  107. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/tag/libs/srl/__init__.py +0 -0
  108. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/tag/libs/srl/__pycache__/__init__.cpython-38.pyc +0 -0
  109. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/tag/libs/srl/__pycache__/__init__.cpython-39.pyc +0 -0
  110. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-38.pyc +0 -0
  111. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-39.pyc +0 -0
  112. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/tag/libs/srl/__pycache__/train_srl.cpython-38.pyc +0 -0
  113. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/tag/libs/srl/__pycache__/train_srl.cpython-39.pyc +0 -0
  114. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/tag/libs/srl/__srl_reader_.py +0 -0
  115. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/tag/libs/srl/srl_reader.py +0 -0
  116. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/tag/libs/srl/train_srl.py +0 -0
  117. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/tag/libs/taggers.py +0 -0
  118. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/tag/libs/utils.py +0 -0
  119. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/tag/libs/word_dictionary.py +0 -0
  120. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/tag/libs/wsd/__init__.py +0 -0
  121. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/tag/libs/wsd/__pycache__/__init__.cpython-38.pyc +0 -0
  122. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/tag/libs/wsd/__pycache__/__init__.cpython-39.pyc +0 -0
  123. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-38.pyc +0 -0
  124. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-39.pyc +0 -0
  125. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/tag/libs/wsd/macmorphoreader.py +0 -0
  126. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/tag/libs/wsd/wsd_reader.py +0 -0
  127. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/tokenize/__init__.py +0 -0
  128. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/tokenize/ko_tokenize.py +0 -0
  129. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor/trans.py +0 -0
  130. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor.egg-info/dependency_links.txt +0 -0
  131. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor.egg-info/requires.txt +0 -0
  132. {nltkor-1.2.15 → nltkor-1.2.16}/nltkor.egg-info/top_level.txt +0 -0
  133. {nltkor-1.2.15 → nltkor-1.2.16}/setup.cfg +0 -0
{nltkor-1.2.15 → nltkor-1.2.16}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: nltkor
-Version: 1.2.15
+Version: 1.2.16
 Home-page: https://modi.changwon.ac.kr/air_cwnu/nlp_tool/nltk_ko.git
 Keywords: string matching,pattern matching,edit distance,string to string correction,string to string matching,Levenshtein edit distance,Hamming distance,Damerau-Levenshtein distance,Jaro-Winkler distance,longest common subsequence,longest common substring,dynamic programming,approximate string matching,semantic similarity,natural language processing,NLP,information retrieval,rouge,sacrebleu,bertscore,bartscore,fasttext,glove,cosine similarity,Smith-Waterman,Needleman-Wunsch,Hirschberg,Karp-Rabin,Knuth-Morris-Pratt,Boyer-Moore
 Classifier: Programming Language :: Python :: 3.7
{nltkor-1.2.15 → nltkor-1.2.16}/README.md
@@ -910,11 +910,18 @@ TF-IDF를 n-gram에 대한 가중치로 계산하고 참조 캡션과 생성 캡
 0.6303797468354431
 ```
 
-#### 5.14 EntMent
+#### 5.14 EMR(Entity Mention Recall)
 
-- EntMent (Entity Mention Recall)
 
-: 요약된 텍스트에 포함된 고유 엔터티의 참조 비율
+요약된 텍스트가 참조 문서에 등장하는 중요 개체를 얼마나 잘 유지하고 있는지에 대한 평가 지표이다.
+
+```python
+>>> # -*- coding: utf-8 -*-
+>>> from nltkor.metrics import EntMent
+>>> EntMent().entity("국립창원대학교(총장 박민원)가 사천우주항공캠퍼스 개교와 함께 2025학년도 사천우주항공공학부 입학식을 7일 오전 11시 사천우주항공캠퍼스에서 열었다.이날 행사에는 박민원 총장을 비롯해 국민의힘 서천호 국회의원(사천·남해·하동), 윤영빈 우주항공청장, 박동식 사천시장, 김규헌 사천시의회 의장, 지역 유관기관 관계자들과 신입생 및 가족들이 참석했다. 글로컬대학30사업 선정에 따라 국립창원대와 통합을 추진 중인 경남도립거창대학, 경남도립남해대학 관계자도 함께 자리했다.행사는 1부 현판 제막식과 2부 입학식으로 진행됐으며, 박동식 사천시장은 신입생들에게 축하 선물로 금배지를 전달했고, 박민원 총장은 캠퍼스 설립에 기여한 유공자들에게 표창장을 수여했다.","국립창원대학교는 4월 7일 사천우주항공캠퍼스에서 2025학년도 사천우주항공공학부 입학식을 개최했다. 이날 행사에는 박민원 총장, 서천호 국회의원, 윤영빈 우주항공청장, 박동식 사천시장 등 주요 인사와 신입생 및 가족들이 참석했으며, 글로컬대학30사업과 관련된 거창대학·남해대학 관계자들도 함께했다. 행사는 현판 제막식과 입학식으로 나뉘어 진행되었고, 신입생들에게는 금배지가, 캠퍼스 설립 유공자들에게는 표창장이 수여되었다.")
+Downloading Espresso5 model...
+0.8888888888888888
+```
 
 
 ### 6 확장 평가 함수
{nltkor-1.2.15 → nltkor-1.2.16}/nltkor/__init__.py
@@ -13,4 +13,4 @@ from nltkor import trans
 from nltkor import Kor_char
 from nltkor import etc
 
-__version__ = '1.2.15'
+__version__ = '1.2.16'
{nltkor-1.2.15 → nltkor-1.2.16}/nltkor/metrics/__init__.py
@@ -52,7 +52,7 @@ from nltk.metrics.aline import align
 from nltkor.metrics.eval import StringMetric
 """
 from nltkor.metrics.classical import DefaultMetric
-from nltkor.metrics.entment import EntMent
+from nltkor.metrics.entment import EMR
 from nltkor.metrics.bleu_tensor import *
 #DefaultMetric = lazy_import.lazy_callable("nltkor.metrics.classical.DefaultMetric")
 #Mauve = lazy_import.lazy_callable("nltkor.metrics.mauve.Mauve")
{nltkor-1.2.15 → nltkor-1.2.16}/nltkor/metrics/entment.py
@@ -1,6 +1,6 @@
 from nltkor.tag import EspressoTagger
 
-class EntMent :
+class EMR :
 
     def __init__(self):
         self.entity_list = []
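For orientation, here is a minimal sketch of how the renamed metric might be called against 1.2.16. It is an illustration under assumptions, not package documentation: the `entity()` method and its two-string argument order (source text first, summary second) are taken from the README example above, the placeholder strings are ours, and note that the README snippet itself still imports the old `EntMent` name even though the class is now `EMR`.

```python
# Hypothetical usage sketch for nltkor 1.2.16 (not from the package docs).
# Argument order mirrors the README example: source text first, summary second.
from nltkor.metrics import EMR   # exported as EntMent in 1.2.15

source_text = "..."   # placeholder: the reference document containing the entities
summary_text = "..."  # placeholder: the summary whose entity coverage is measured

recall = EMR().entity(source_text, summary_text)
print(recall)  # proportion of reference entities retained in the summary
```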
{nltkor-1.2.15 → nltkor-1.2.16}/nltkor/search/faiss_search.py
@@ -33,11 +33,12 @@ SOFTWARE.
 This module contains a wrapper for the Faiss library by Facebook AI Research.
 """
 
-from collections import Counter
+from collections import Counter
 from typing import List, Union, Optional, Dict, Any
 import os
 import copy
 import logging
+import transformers
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
 from nltkor.make_requirement import make_requirement
@@ -70,24 +71,28 @@ class FaissSearch:
                 mode = None,
                 model_name_or_path: str = 'klue/bert-base',
                 tokenizer_name_or_path: str = 'klue/bert-base',
+                embedding_type: str = 'last_hidden_state',
                 device: str = 'cpu'
                 ) -> None:
         if mode == 'sentence':
-            return FaissSearch_SenEmbed(model_name_or_path)
+            return FaissSearch_SenEmbed(model_name_or_path=model_name_or_path, embedding_type=embedding_type)
         elif mode == 'word':
-            return FaissSearch_WordEmbed(model_name_or_path)
+            return FaissSearch_WordEmbed(model_name_or_path=model_name_or_path, embedding_type=embedding_type)
+        elif mode == 'splade':
+            return FaissSearch_Splade(model_name_or_path=model_name_or_path, embedding_type=embedding_type)
         else:
-            raise ValueError("choice 'sentence' or 'word'")
+            raise ValueError("choice 'sentence' or 'word' or 'splade'")
+
 
 
-# FAISS original library wrapper class
 class FaissSearch_SenEmbed:
     def __init__(self,
                  model_name_or_path: str = 'klue/bert-base',
                  tokenizer_name_or_path: str = 'klue/bert-base',
+                 embedding_type: str = 'last_hidden_state',
                  device: str = 'cpu',
                  ) -> None:
-        r"""
+        """
         This function initializes the wrapper for the FAISS library, which is used to perform semantic search.
 
 
@@ -143,8 +148,7 @@ class FaissSearch_SenEmbed:
         # Initialize the dataset
         self.dataset = None
 
-
-
+
     # Auxiliary function to get the last hidden state
     def get_last_hidden_state(self,
                               embeddings: torch.Tensor,
@@ -166,7 +170,6 @@ class FaissSearch_SenEmbed:
         return last_hidden_state[:, 0, :]
 
 
-
     # Auxiliary function to get the mean pooling
     def get_mean_pooling(self,
                          embeddings: torch.Tensor,
@@ -244,7 +247,6 @@ class FaissSearch_SenEmbed:
         return embeddings
 
 
-
     # Add FAISS index
    def add_faiss_index(self,
                        column_name: str = 'embeddings',
@@ -309,7 +311,6 @@ class FaissSearch_SenEmbed:
        self.dataset.save_faiss_index(index_name=index_name, file=file_path)
 
 
-
    def load_faiss_index(self,
                         index_name: str,
                         file_path: str,
@@ -339,7 +340,6 @@ class FaissSearch_SenEmbed:
        self.dataset.load_faiss_index(index_name=index_name, file=file_path, device=device)
 
 
-
    # Initialize the corpus using a dictionary or pandas DataFrame or HuggingFace Datasets object
    def initialize_corpus(self,
                          corpus: Union[Dict[str, List[str]], pd.DataFrame, Dataset],
@@ -407,7 +407,6 @@ class FaissSearch_SenEmbed:
        return self.dataset
 
 
-
    # Initialize the dataset using a JSON file
    def load_dataset_from_json(self,
                               json_path: str,
@@ -429,7 +428,6 @@ class FaissSearch_SenEmbed:
        return self.dataset
 
 
-
    # Search for the most similar elements in the dataset, given a query
    def search(self,
               query: str,
@@ -475,12 +473,132 @@ class FaissSearch_SenEmbed:
 
 
 
+# FAISS Splade + ICT library wrapper class
+class FaissSearch_Splade(FaissSearch_SenEmbed):
+    def __init__(self,
+                 model_name_or_path: str = 'klue/bert-base',
+                 tokenizer_name_or_path: str = 'klue/bert-base',
+                 embedding_type: str = 'last_hidden_state',
+                 device: str = 'cpu',
+                 ) -> None:
+        r"""
+        This function initializes the wrapper for the FAISS library, which is used to perform semantic search.
+
+
+        .. attention::
+
+            * If you use this class, please make sure to cite the following paper:
+
+                .. code-block:: latex
+
+                    @article{johnson2019billion,
+                        title={Billion-scale similarity search with {GPUs}},
+                        author={Johnson, Jeff and Douze, Matthijs and J{\'e}gou, Herv{\'e}},
+                        journal={IEEE Transactions on Big Data},
+                        volume={7},
+                        number={3},
+                        pages={535--547},
+                        year={2019},
+                        publisher={IEEE}
+                    }
+
+            * The code is based on the following GitHub repository:
+                https://github.com/facebookresearch/faiss
+
+        Arguments:
+            model_name_or_path (str, optional): The name or path of the model to use. Defaults to 'facebook/bart-large'.
+            tokenizer_name_or_path (str, optional): The name or path of the tokenizer to use. Defaults to 'facebook/bart-large'.
+            device (str, optional): The device to use. Defaults to 'cpu'.
+
+        Returns:
+            None
+        """
+
+        # Set the device
+        self.device = device
+
+        # If the tokenizer is not specified, use the model name or path
+        if tokenizer_name_or_path is None:
+            tokenizer_name_or_path = model_name_or_path
+
+        # Load the tokenizer
+        if tokenizer_name_or_path == 'skt/kobert-base-v1':
+            # self.tokenizer = KoBERTTokenizer.from_pretrained(tokenizer_name_or_path)
+            self.tokenizer = XLNetTokenizer.from_pretrained(tokenizer_name_or_path)
+        else:
+            self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)
+
+        # Load the model
+        self.model = transformers.BertForMaskedLM.from_pretrained(model_name_or_path).to(self.device)
+
+        # Set the model to evaluation mode (since we do not need the gradients)
+        self.model.eval()
+
+        # Initialize the dataset
+        self.dataset = None
+
+
+    # Get the embeddings
+    def get_embeddings(self,
+                       text: Union[str, List[str]],
+                       embedding_type: str = 'last_hidden_state',
+                       batch_size: int = 8,
+                       num_workers: int = 4,
+                       ) -> torch.Tensor:
+        """
+        This function returns the embeddings of the input text.
+
+        Arguments:
+            text (Union[str, List[str]]): The input text.
+            embedding_type (str, optional): The type of embedding to use. Defaults to 'last_hidden_state'.
+            batch_size (int, optional): The batch size to use. Defaults to 8.
+            num_workers (int, optional): The number of workers to use. Defaults to 4.
+
+        Returns:
+            torch.Tensor: The embeddings.
+
+        Raises:
+            ValueError: If the embedding type is invalid.
+        """
+
+        # Check if the embedding type is valid
+        if embedding_type not in ['last_hidden_state', 'mean_pooling']:
+            raise ValueError(f'Invalid embedding type: {embedding_type}. Only "last_hidden_state" and "mean_pooling" are supported.')
+
+        # Tokenize the input text
+        encoded_text = self.tokenizer(
+            text,
+            padding=True,
+            truncation=True,
+            return_tensors='pt',
+        )
+
+        # Move the input text to the device
+        encoded_text = encoded_text.to(self.device)
+
+        # encoded_inputs = {k: v.to(self.device) for k, v in encoded_inputs.items()}
+
+        # Get the embeddings
+        with torch.no_grad():
+            embeddings = self.model(**encoded_text)
+
+        # Get the last hidden state
+        embeddings = embeddings['logits']
+
+        embeddings = torch.sum(torch.log(1+torch.relu(embeddings)) * encoded_text['attention_mask'].unsqueeze(-1), dim=1)
+        e_norm = torch.nn.functional.normalize(embeddings, p=2, dim=1, eps=1e-8)
+
+        # Return the embeddings
+        return e_norm
+
+
 
 # FAISS word embedding library wrapper class
 class FaissSearch_WordEmbed(FaissSearch_SenEmbed):
     def __init__(self,
                  model_name_or_path: str = 'klue/bert-base',
                  tokenizer_name_or_path: str = 'klue/bert-base',
+                 embedding_type: str = 'last_hidden_state',
                  device: str = 'cpu',
                  ) -> None:
         r"""
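The heart of the new class is the pooling at the end of `get_embeddings`: the masked-LM logits are passed through ReLU, log-saturated, summed over the non-padding tokens, and L2-normalised, the usual SPLADE-style way of collapsing per-token vocabulary scores into one vector per text. A self-contained sketch of just that step on dummy tensors (shapes and values are illustrative, not taken from the package):

```python
import torch

# Stand-ins for model(**encoded_text)['logits'] and encoded_text['attention_mask']:
# a batch of 2 texts, 5 tokens each, over a vocabulary of 10.
logits = torch.randn(2, 5, 10)
attention_mask = torch.ones(2, 5)

# Log-saturated activations, zeroed on padding positions, summed over tokens.
weights = torch.log(1 + torch.relu(logits)) * attention_mask.unsqueeze(-1)
embeddings = torch.sum(weights, dim=1)    # shape (2, 10): one vector per text

# Unit-length vectors, so inner-product search scores behave like cosine similarity.
e_norm = torch.nn.functional.normalize(embeddings, p=2, dim=1, eps=1e-8)
print(e_norm.shape)                       # torch.Size([2, 10])
```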
@@ -533,6 +651,7 @@ class FaissSearch_WordEmbed(FaissSearch_SenEmbed):
         # Load the model
         self.model = AutoModel.from_pretrained(model_name_or_path).to(self.device)
 
+
         # Set the model to evaluation mode (since we do not need the gradients)
         self.model.eval()
 
@@ -540,7 +659,6 @@ class FaissSearch_WordEmbed(FaissSearch_SenEmbed):
         self.dataset = None
 
 
-
     # Get the embeddings (new code)
     def get_doc_embeddings(self,
                            #text: Union[str, List[str]],
@@ -564,7 +682,7 @@ class FaissSearch_WordEmbed(FaissSearch_SenEmbed):
        Raises:
            ValueError: If the embedding type is invalid.
        """
-
+
        # Check if the embedding type is valid
        if embedding_type not in ['last_hidden_state', 'mean_pooling']:
            raise ValueError(f'Invalid embedding type: {embedding_type}. Only "last_hidden_state" and "mean_pooling" are supported.')
@@ -577,12 +695,10 @@ class FaissSearch_WordEmbed(FaissSearch_SenEmbed):
            padding=False,
            truncation=True,
            return_tensors='pt',
-            add_special_tokens=False,
+            add_special_tokens=False
        )
-
        # Move the input text to the device
        encoded_text = encoded_text.to(self.device)
-
        token_ids_list = encoded_text['input_ids'].tolist()
        token_ids_list = token_ids_list[0]
        for ids in token_ids_list:
@@ -591,19 +707,17 @@ class FaissSearch_WordEmbed(FaissSearch_SenEmbed):
            else:
                if text not in ids_dict[ids]:
                    ids_dict[ids].append(sentence)
-
        # Get the embeddings
        embedding_dict = {}
        self.model.eval()
        for key, value in ids_dict.items():
            embed = self.model(torch.tensor([[key]]), output_hidden_states=True).hidden_states[-1][:,0,:].detach()
            embedding_dict[embed] = value
-
+
        # Return the embeddings
        return embedding_dict
 
 
-
    # Get the embeddings (new code)
    def get_query_embeddings(self,
                             text: Union[str, List[str]],
@@ -657,7 +771,6 @@ class FaissSearch_WordEmbed(FaissSearch_SenEmbed):
        # Return the embeddings
        return embeds
 
-
 
    # Initialize the corpus using a dictionary or pandas DataFrame or HuggingFace Datasets object
    def initialize_corpus(self,
@@ -693,7 +806,7 @@ class FaissSearch_WordEmbed(FaissSearch_SenEmbed):
 
        # Set the embedding_type
        self.embedding_type = embedding_type
-
+
        # get embedding dict
        embedding_dict = self.get_doc_embeddings(text=corpus, embedding_type=self.embedding_type)
 
@@ -729,7 +842,6 @@ class FaissSearch_WordEmbed(FaissSearch_SenEmbed):
        return self.dataset
 
 
-
    # Search for the most similar elements in the dataset, given a query
    def search(self,
               query: str,
@@ -751,7 +863,6 @@ class FaissSearch_WordEmbed(FaissSearch_SenEmbed):
        The returned elements are dictionaries containing the text and the score.
        """
 
-
        # Get the embeddings of the query
        query_embeddings = self.get_query_embeddings([query], embedding_type=self.embedding_type)
 
@@ -768,6 +879,7 @@ class FaissSearch_WordEmbed(FaissSearch_SenEmbed):
            scores.append(score)
            similar_elts.append(similar_elt)
 
+
        text_list = []
        for item in similar_elts:
            for text in item['text']:
@@ -776,12 +888,10 @@ class FaissSearch_WordEmbed(FaissSearch_SenEmbed):
        flat_list = [sentence for sublist in text_list for sentence in sublist]
        count = Counter(flat_list)
        count = dict(count.most_common(5))
-
+
        sorted_dict = dict(sorted(count.items(), key=lambda x: x[1], reverse=True))
-
        # Convert the results to a pandas DataFrame
        results_df = pd.DataFrame({'text': sorted_dict.keys() , 'freq': sorted_dict.values()})
 
-
        # Return the most similar elements
        return results_df
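Taken together, the faiss_search.py changes add a third retrieval mode alongside 'sentence' and 'word'. A rough usage sketch follows; it rests on assumptions rather than the package docs: the import path `nltkor.search` is assumed to export `FaissSearch`, and the `initialize_corpus()`/`search()` calls are assumed to behave as in the existing sentence mode (their argument names are copied from the removed test/test.py shown further down).

```python
# Hypothetical sketch of the mode added in 1.2.16; import path and downstream
# behaviour are assumptions, and 'klue/bert-base' is just the signature default.
from nltkor.search import FaissSearch

searcher = FaissSearch(mode='splade',   # new; anything else raises
                       # ValueError("choice 'sentence' or 'word' or 'splade'")
                       model_name_or_path='klue/bert-base',
                       embedding_type='last_hidden_state')

corpus = {'text': ["오늘은 날씨가 매우 덥습니다.",
                   "저는 음악을 듣는 것을 좋아합니다."]}
searcher.initialize_corpus(corpus=corpus, section='text')
print(searcher.search("오늘 날씨", 2))   # top-2 most similar corpus entries
```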
{nltkor-1.2.15 → nltkor-1.2.16}/nltkor.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: nltkor
-Version: 1.2.15
+Version: 1.2.16
 Home-page: https://modi.changwon.ac.kr/air_cwnu/nlp_tool/nltk_ko.git
 Keywords: string matching,pattern matching,edit distance,string to string correction,string to string matching,Levenshtein edit distance,Hamming distance,Damerau-Levenshtein distance,Jaro-Winkler distance,longest common subsequence,longest common substring,dynamic programming,approximate string matching,semantic similarity,natural language processing,NLP,information retrieval,rouge,sacrebleu,bertscore,bartscore,fasttext,glove,cosine similarity,Smith-Waterman,Needleman-Wunsch,Hirschberg,Karp-Rabin,Knuth-Morris-Pratt,Boyer-Moore
 Classifier: Programming Language :: Python :: 3.7
{nltkor-1.2.15 → nltkor-1.2.16}/nltkor.egg-info/SOURCES.txt
@@ -126,6 +126,4 @@ nltkor/tag/libs/wsd/__pycache__/__init__.cpython-39.pyc
 nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-38.pyc
 nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-39.pyc
 nltkor/tokenize/__init__.py
-nltkor/tokenize/ko_tokenize.py
-test/test.py
-test/testespresso.py
+nltkor/tokenize/ko_tokenize.py
{nltkor-1.2.15 → nltkor-1.2.16}/setup.py
@@ -66,7 +66,7 @@ module1 = cythonize([
 
 setup(
     name='nltkor',
-    version='1.2.15',
+    version='1.2.16',
     url='https://modi.changwon.ac.kr/air_cwnu/nlp_tool/nltk_ko.git',
     packages=find_packages(exclude=[]),
     python_requires='>=3.7',
nltkor-1.2.15/test/test.py
@@ -1,282 +0,0 @@
-from nltk.alignment import NeedlemanWunsch, SmithWaterman, Hirschberg, LongestCommonSubsequence, LongestCommonSubstring, DTW
-from nltk.distance import LevenshteinEditDistance, HammingDistance, DamerauLevenshteinDistance, WassersteinDistance
-from nltk.similarity import CosineSimilarity, LCSubstringSimilarity, LCSubsequenceSimilarity, JaroSimilarity
-from nltk.tokenize import sent_tokenize, word_tokenize, syllable_tokenize
-from nltk.search import NaiveSearch, RabinKarpSearch, KMPSearch, BoyerMooreSearch, FaissSearch
-from nltk.metrics import BERTScore, BARTScore, DefaultMetric
-from nltk import pos_tag, nouns, word_segmentor, pos_tag_with_verb_form
-import numpy as np
-from typing import List
-import torch
-
-def demo():
-    str1 = '기존에 제품이 장기간 사용으로 손상'
-    str2 = '장기간 사용으로 제품이 손상'
-
-    # result1, result2 = NeedlemanWunsch().get_alignment(str1, str2)
-    # print(result1, '\n', result2)
-
-    result1, result2 = SmithWaterman().get_alignment(str1, str2)
-    print(f"{result1}\n{result2}")
-
-    # result1, result2 = Hirschberg().get_alignment(str1, str2)
-    # print(f"{result1}\n{result2}")
-
-    # result = DTW().get_alignment_path(str1, str2)
-    # print(result)
-
-    # result = LongestCommonSubsequence().compute(str1, str2)
-    # print(result)
-
-    # result = LongestCommonSubstring().compute(str1, str2)
-    # print("-------LongestCommonSubstring-------")
-    # print(result)
-    # print("------------------------------------")
-    # print()
-
-def demo2():
-    str1 = '나는 학생이다.'
-    str2 = '그는 선생님이다.'
-
-    result = BARTScore().compute([str1], [str2])
-    print("-------BARTScore-------")
-    print(result)
-    print("-----------------------")
-    print()
-
-def demo3():
-    str1 = '나는 학생이다.'
-    str2 = '그는 선생님이다.'
-    model_name = 'bert-base-uncased'
-    result = BERTScore(model_name_or_path=model_name, lang='kor', num_layers=12).compute([str1], [str2])
-
-    print("model name: ", model_name)
-    print("-------BERTScore-------")
-    print(result)
-    print("-----------------------")
-    print()
-
-def demo4():
-    demo_setences = ['제가 나와 있는 곳은 경남 거제시 옥포동 덕포 해수욕장에 나와 있습니다.']
-    for sen in demo_setences:
-        print(word_tokenize(sen, "korean"))
-        print(pos_tag(sen, lang='kor'))
-
-def demo5():
-    str1 = '나는 학생이다.'
-    str2 = '그는 선생님이다.'
-
-    # result = LevenshteinEditDistance().compute(str1, str2)
-
-    # result = HammingDistance().compute(str1, str2)
-
-
-    result = DamerauLevenshteinDistance().compute(str1, str2)
-
-    print("-------DamerauLevenshteinDistance-------")
-    print(result)
-    print("----------------------------------------")
-    print()
-
-def demo6():
-    x1 = np.array([1, 2, 3, 4, 5])
-    x2 = np.array([3, 7, 8, 3, 1])
-
-    result = CosineSimilarity().compute(x1, x2)
-
-    print("-------CosineSimilarity-------")
-    print(result)
-    print("------------------------------")
-    print()
-
-def demo7():
-    str1 = '나는 학생이다.'
-    str2 = '그는 선생님이다.'
-
-    result = LCSubstringSimilarity().compute(str1, str2)
-
-    print("-------LCSubstringSimilarity-------")
-    print(result)
-    print("-----------------------------------")
-    print()
-
-    result = LCSubsequenceSimilarity().compute(str1, str2)
-
-    print("-------LCSubsequenceSimilarity-------")
-    print(result)
-    print("--------------------------------------")
-    print()
-
-    result = JaroSimilarity().compute(str1, str2)
-
-    print("-------JaroSimilarity-------")
-    print(result)
-    print("----------------------------")
-    print()
-
-
-def demo8():
-    pattern = "학생"
-    str1 = '나는 학생이다.'
-
-    result = NaiveSearch().search(pattern, str1)
-    print(result)
-
-    result = RabinKarpSearch().search(pattern, str1)
-    print(result)
-
-    result = KMPSearch().search(pattern, str1)
-    print(result)
-
-    result = BoyerMooreSearch().search(pattern, str1)
-    print(result)
-
-def demo9():
-    faiss = FaissSearch(model_name_or_path = 'skt/kobert-base-v1', tokenizer_name_or_path = 'skt/kobert-base-v1')
-    corpus = {
-        'text': [
-            "오늘은 날씨가 매우 덥습니다.",
-            "저는 음악을 듣는 것을 좋아합니다.",
-            "한국 음식 중에서 떡볶이가 제일 맛있습니다.",
-            "도서관에서 책을 읽는 건 좋은 취미입니다.",
-            "내일은 친구와 영화를 보러 갈 거예요.",
-            "여름 휴가 때 해변에 가서 수영하고 싶어요.",
-            "한국의 문화는 다양하고 흥미로워요.",
-            "피아노 연주는 나를 편안하게 해줍니다.",
-            "공원에서 산책하면 스트레스가 풀립니다.",
-            "요즘 드라마를 많이 시청하고 있어요.",
-            "커피가 일상에서 필수입니다.",
-            "새로운 언어를 배우는 것은 어려운 일이에요.",
-            "가을에 단풍 구경을 가고 싶어요.",
-            "요리를 만들면 집안이 좋아보입니다.",
-            "휴대폰 없이 하루를 보내는 것이 쉽지 않아요.",
-            "스포츠를 하면 건강에 좋습니다.",
-            "고양이와 개 중에 어떤 동물을 좋아하세요?"
-            "천천히 걸어가면서 풍경을 감상하는 것이 좋아요.",
-            "일주일에 한 번은 가족과 모임을 가요.",
-            "공부할 때 집중력을 높이는 방법이 있을까요?",
-            "봄에 꽃들이 피어날 때가 기대되요.",
-            "여행 가방을 챙기고 싶어서 설레여요.",
-            "사진 찍는 걸 좋아하는데, 카메라가 필요해요.",
-            "다음 주에 시험이 있어서 공부해야 해요.",
-            "운동을 하면 몸이 가벼워집니다.",
-            "좋은 책을 읽으면 마음이 풍요로워져요.",
-            "새로운 음악을 발견하면 기분이 좋아져요.",
-            "미술 전시회에 가면 예술을 감상할 수 있어요.",
-            "친구들과 함께 시간을 보내는 건 즐거워요.",
-            "자전거 타면 바람을 맞으면서 즐거워집니다."
-        ],
-    }
-    print(faiss.initialize_corpus(corpus=corpus, section='text', embedding_type='mean_pooling', save_path='/Users/dowon/Test/test.json'))
-    query = "오늘은 날씨가 매우 춥다."
-    top_k = 5
-    result = faiss.search(query, top_k)
-    print(result)
-
-def faiss_test():
-    faiss = FaissSearch(model_name_or_path = 'klue/bert-base')
-    result = TextReader("/Users/dowon/Test/sentence1.txt").read()
-    id = 0
-
-    for i in result:
-        print(i)
-        i = i.replace('\n', '')
-        print(i)
-        i = "i am test"
-        print(faiss.get_embeddings(text=i, num_workers=10).detach().cpu().numpy())
-        id += 1
-        if id ==3:
-            break
-
-def faiss_save_test():
-    faiss = FaissSearch(model_name_or_path = '/Users/dowon/test_model/trained_model/', tokenizer_name_or_path = '/Users/dowon/test_model/trained_model/')
-    faiss.load_dataset_from_json('/Users/dowon/Test/test.json')
-    faiss.embedding_type = 'mean_pooling'
-    # faiss.load_faiss_index(index_name='embeddings',file_path='/Users/dowon/Test/test_index.json')
-    faiss.add_faiss_index(column_name='embeddings')
-    query = "오늘은 날시가 매우 춥다."
-    top_k = 5
-    result = faiss.search(query, top_k)
-    print(result)
-
-
-def demo10():
-    metric = DefaultMetric()
-    y_true = [1, 3, 3, 5, 5,1]
-    y_pred = [1, 2, 3, 4, 5,2]
-    str1 = "i am teacher"
-    str2 = "he is student"
-    print(metric.precision_score(y_true, y_pred, "macro"))
-
-def demo11():
-    print("\nBegin Wasserstein distance demo ")
-
-    P = np.array([0.6, 0.1, 0.1, 0.1, 0.1])
-    Q1 = np.array([0.1, 0.1, 0.6, 0.1, 0.1])
-    Q2 = np.array([0.1, 0.1, 0.1, 0.1, 0.6])
-
-    P = torch.from_numpy(P)
-    Q1 = torch.from_numpy(Q1)
-    Q2 = torch.from_numpy(Q2)
-    kl_p_q1 = WassersteinDistance().compute_kullback(P, Q1)
-    kl_p_q2 = WassersteinDistance().compute_kullback(P, Q2)
-
-    wass_p_q1 = WassersteinDistance().compute_wasserstein(P, Q1)
-    wass_p_q2 = WassersteinDistance().compute_wasserstein(P, Q2)
-
-    jesson_p_q1 = WassersteinDistance().compute_jesson_shannon(P, Q1)
-    jesson_p_q2 = WassersteinDistance().compute_jesson_shannon(P, Q2)
-
-
-    print("\nKullback-Leibler distances: ")
-    print("P to Q1 : %0.4f " % kl_p_q1)
-    print("P to Q2 : %0.4f " % kl_p_q2)
-
-    print("\nWasserstein distances: ")
-    print("P to Q1 : %0.4f " % wass_p_q1)
-    print("P to Q2 : %0.4f " % wass_p_q2)
-
-    print("\nJesson-Shannon distances: ")
-    print("P to Q1 : %0.4f " % jesson_p_q1)
-    print("P to Q2 : %0.4f " % jesson_p_q2)
-
-    print("\nEnd demo ")
-
-def demo12():
-    y_pred = [5, 2, 4, 1, 3, 2, 5, 6, 7]
-    y_true = [1, 3, 6, 7, 1, 5]
-
-    user = [[5, 3, 2], [9, 1, 2], [3, 5, 6], [7, 2, 1]]
-    h_pred = [[15, 6, 21, 3], [15, 77, 23, 14], [51, 23, 21, 2], [53, 2, 1, 5]]
-
-    metric = DefaultMetric()
-    print(metric.precision_at_k(y_true, y_pred, 3))
-    print(metric.recall_at_k(y_true,y_pred, 3))
-    print(metric.hit_rate_at_k(user, h_pred, 1))
-
-
-
-class TextReader:
-    def __init__(self, path: str):
-        self.path = path
-
-    def read(self) -> List[str]:
-        with open(self.path, 'r') as f:
-            return f.readlines()
-
-
-if __name__=="__main__":
-    # demo()
-    # demo2()
-    # demo3()
-    #demo4()
-    # demo5()
-    # demo6()
-    # demo7()
-    # demo8()
-    # demo9()
-    # faiss_test()
-    # faiss_save_test()
-    # demo10()
-    demo11()
-    #demo12()
nltkor-1.2.15/test/testespresso.py
@@ -1,19 +0,0 @@
-from nltk.tag import EspressoTagger
-
-if __name__ == '__main__':
-    sent = "나는 배가 고프다. 나는 아름다운 강산에 살고있다."
-    tagger = EspressoTagger()
-    print()
-    print(tagger.tag('pos', sent))
-    print("dependency :")
-    print(tagger.tag('dependency', sent))
-    print('ner :')
-    ner = tagger.tag('ner', sent)
-    print(ner)
-    print()
-    print()
-    print('wsd :')
-    print(tagger.tag('wsd', sent))
-    print()
-    #print('srl :')
-    #print(tagger.tag('srl', sent))