nltkor 1.2.16__tar.gz → 1.2.18__tar.gz
This diff compares the contents of two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their public registries.
- {nltkor-1.2.16 → nltkor-1.2.18}/PKG-INFO +8 -30
- {nltkor-1.2.16 → nltkor-1.2.18}/README.md +43 -2
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/__init__.py +1 -1
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/metrics/__init__.py +0 -1
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/search/__init__.py +2 -1
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/search/faiss_search.py +9 -9
- nltkor-1.2.18/nltkor/search/search_dict.py +95 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/tag/libs/network.c +24404 -27780
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor.egg-info/PKG-INFO +8 -30
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor.egg-info/SOURCES.txt +1 -1
- {nltkor-1.2.16 → nltkor-1.2.18}/setup.py +1 -1
- nltkor-1.2.16/nltkor/metrics/bleu_tensor.py +0 -20
- {nltkor-1.2.16 → nltkor-1.2.18}/LICENSE.txt +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/Kor_char.py +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/alignment/__init__.py +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/cider/__init__.py +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/cider/cider.py +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/cider/cider_scorer.py +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/distance/__init__.py +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/distance/wasserstein.py +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/etc.py +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/lazyimport.py +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/make_requirement.py +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/metrics/bartscore.py +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/metrics/bertscore.py +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/metrics/classical.py +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/metrics/entment.py +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/metrics/eval.py +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/metrics/mauve.py +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/metrics/mauve_utils.py +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/misc/__init__.py +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/misc/string2string_basic_functions.py +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/misc/string2string_default_tokenizer.py +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/misc/string2string_hash_functions.py +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/misc/string2string_word_embeddings.py +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/search/classical.py +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/search/kobert_tokenizer.py +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/sejong/__init__.py +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/sejong/__pycache__/__init__.cpython-38.pyc +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/sejong/__pycache__/__init__.cpython-39.pyc +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/sejong/__pycache__/sejong_download.cpython-38.pyc +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/sejong/__pycache__/sejong_download.cpython-39.pyc +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/sejong/__pycache__/ssem.cpython-38.pyc +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/sejong/__pycache__/ssem.cpython-39.pyc +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/sejong/ch.py +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/sejong/dict_semClassNum.txt +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/sejong/layer.txt +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/sejong/sejong_download.py +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/sejong/ssem.py +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/similarity/__init__.py +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/similarity/bartscore____.py +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/similarity/bertscore____.py +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/similarity/classical.py +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/similarity/cosine_similarity.py +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/tag/__init__.py +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/tag/__pycache__/__init__.cpython-38.pyc +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/tag/__pycache__/__init__.cpython-39.pyc +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/tag/__pycache__/espresso_tag.cpython-38.pyc +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/tag/__pycache__/espresso_tag.cpython-39.pyc +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/tag/espresso_tag.py +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/tag/libs/__init__.py +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/tag/libs/__pycache__/__init__.cpython-38.pyc +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/tag/libs/__pycache__/__init__.cpython-39.pyc +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/tag/libs/__pycache__/attributes.cpython-38.pyc +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/tag/libs/__pycache__/attributes.cpython-39.pyc +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/tag/libs/__pycache__/config.cpython-38.pyc +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/tag/libs/__pycache__/config.cpython-39.pyc +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/tag/libs/__pycache__/metadata.cpython-38.pyc +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/tag/libs/__pycache__/metadata.cpython-39.pyc +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/tag/libs/__pycache__/reader.cpython-38.pyc +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/tag/libs/__pycache__/reader.cpython-39.pyc +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/tag/libs/__pycache__/taggers.cpython-38.pyc +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/tag/libs/__pycache__/taggers.cpython-39.pyc +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/tag/libs/__pycache__/utils.cpython-38.pyc +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/tag/libs/__pycache__/utils.cpython-39.pyc +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/tag/libs/__pycache__/word_dictionary.cpython-38.pyc +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/tag/libs/__pycache__/word_dictionary.cpython-39.pyc +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/tag/libs/arguments.py +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/tag/libs/attributes.py +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/tag/libs/config.py +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/tag/libs/metadata.py +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/tag/libs/ner/__init__.py +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/tag/libs/ner/__pycache__/__init__.cpython-38.pyc +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/tag/libs/ner/__pycache__/__init__.cpython-39.pyc +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-38.pyc +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-39.pyc +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/tag/libs/ner/macmorphoreader.py +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/tag/libs/ner/ner_reader.py +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/tag/libs/network.pyx +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/tag/libs/networkconv.pyx +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/tag/libs/networkdependencyconv.pyx +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/tag/libs/parse/__init__.py +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/tag/libs/parse/__pycache__/__init__.cpython-38.pyc +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/tag/libs/parse/__pycache__/__init__.cpython-39.pyc +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-38.pyc +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-39.pyc +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/tag/libs/parse/parse_reader.py +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/tag/libs/pos/__init__.py +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/tag/libs/pos/__pycache__/__init__.cpython-38.pyc +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/tag/libs/pos/__pycache__/__init__.cpython-39.pyc +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-38.pyc +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-39.pyc +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/tag/libs/pos/macmorphoreader.py +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/tag/libs/pos/pos_reader.py +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/tag/libs/reader.py +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/tag/libs/srl/__init__.py +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/tag/libs/srl/__pycache__/__init__.cpython-38.pyc +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/tag/libs/srl/__pycache__/__init__.cpython-39.pyc +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-38.pyc +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-39.pyc +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/tag/libs/srl/__pycache__/train_srl.cpython-38.pyc +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/tag/libs/srl/__pycache__/train_srl.cpython-39.pyc +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/tag/libs/srl/__srl_reader_.py +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/tag/libs/srl/srl_reader.py +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/tag/libs/srl/train_srl.py +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/tag/libs/taggers.py +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/tag/libs/utils.py +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/tag/libs/word_dictionary.py +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/tag/libs/wsd/__init__.py +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/tag/libs/wsd/__pycache__/__init__.cpython-38.pyc +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/tag/libs/wsd/__pycache__/__init__.cpython-39.pyc +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-38.pyc +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-39.pyc +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/tag/libs/wsd/macmorphoreader.py +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/tag/libs/wsd/wsd_reader.py +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/tokenize/__init__.py +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/tokenize/ko_tokenize.py +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor/trans.py +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor.egg-info/dependency_links.txt +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor.egg-info/requires.txt +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/nltkor.egg-info/top_level.txt +0 -0
- {nltkor-1.2.16 → nltkor-1.2.18}/setup.cfg +0 -0

**{nltkor-1.2.16 → nltkor-1.2.18}/PKG-INFO**

````diff
@@ -1,8 +1,11 @@
-Metadata-Version: 2.
+Metadata-Version: 2.1
 Name: nltkor
-Version: 1.2.16
+Version: 1.2.18
+Summary: UNKNOWN
 Home-page: https://modi.changwon.ac.kr/air_cwnu/nlp_tool/nltk_ko.git
+License: UNKNOWN
 Keywords: string matching,pattern matching,edit distance,string to string correction,string to string matching,Levenshtein edit distance,Hamming distance,Damerau-Levenshtein distance,Jaro-Winkler distance,longest common subsequence,longest common substring,dynamic programming,approximate string matching,semantic similarity,natural language processing,NLP,information retrieval,rouge,sacrebleu,bertscore,bartscore,fasttext,glove,cosine similarity,Smith-Waterman,Needleman-Wunsch,Hirschberg,Karp-Rabin,Knuth-Morris-Pratt,Boyer-Moore
+Platform: UNKNOWN
 Classifier: Programming Language :: Python :: 3.7
 Classifier: Programming Language :: Python :: 3.8
 Classifier: Programming Language :: Python :: 3.9
@@ -12,31 +15,6 @@ Classifier: Operating System :: OS Independent
 Classifier: Typing :: Typed
 Requires-Python: >=3.7
 License-File: LICENSE.txt
-
-
-
-Requires-Dist: tqdm>=4.40.0
-Requires-Dist: joblib
-Requires-Dist: requests
-Requires-Dist: nltk>3.0
-Requires-Dist: pyarrow
-Requires-Dist: beautifulSoup4
-Requires-Dist: faiss-cpu==1.7.3
-Requires-Dist: datasets
-Requires-Dist: torch
-Requires-Dist: dill<0.3.9
-Requires-Dist: scikit-learn>=0.22.1
-Requires-Dist: transformers==4.42.2
-Requires-Dist: protobuf
-Requires-Dist: sentencepiece
-Requires-Dist: pandas
-Requires-Dist: bert_score
-Requires-Dist: chardet
-Requires-Dist: GPUtil
-Requires-Dist: fasttext
-Dynamic: classifier
-Dynamic: home-page
-Dynamic: keywords
-Dynamic: license-file
-Dynamic: requires-dist
-Dynamic: requires-python
+
+UNKNOWN
+
````

**{nltkor-1.2.16 → nltkor-1.2.18}/README.md**

````diff
@@ -8,6 +8,8 @@
 | 2 | 2024.5.22 | 차정원 | NLTKo 1.1.0 released |
 | 3 | 2025.2.5 | 이예나 | NLTKor 1.2.0 released<br> added bleu tensor, entment, and accuracy norm |
 | 4 | 2025.4.3 | 이예나 | NLTKor 1.2.10 update<br> fixed espresso bugs |
+| 5 | 2025.5.21 | 정찬혁 | NLTKor 1.2.18 update<br> added TRIE search |
+
 
 
 
````

````diff
@@ -51,7 +53,7 @@
 - [5.11 ROUGE](#511-rouge)
 - [5.12 CIDER](#512-cider)
 - [5.13 METEOR](#513-meteor)
-- [5.14
+- [5.14 EMR(Entity Mention Recall)](#514-emrentity-mention-recall)
 - [6 Extended evaluation functions](#6-확장-평가-함수)
 - [6.1 MAUVE](#61-mauve)
 - [6.2 BERT Score](#62-bert-score)
````

````diff
@@ -89,6 +91,7 @@
 - [12.3 KMP search algorithm](#123-kmp-검색)
 - [12.4 Boyer-Moore search algorithm](#124-boyer-moore-검색)
 - [12.5 Faiss-Semantic search](#125-faiss-semantic-검색)
+- [12.6 TRIE search](#126-trie-검색)
 - [13. Sejong Electronic Dictionary (ssem)](#13-세종전자사전-ssem)
 - [13.1 How to inspect objects](#131-객체-확인-방법)
 - [13.2 entry access](#132-entry-접근법)
````

````diff
@@ -609,6 +612,7 @@ accuracy = correct / len(examples)
 print(f"Accuracy: {accuracy * 100:.2f}%")
 print(f"Time: {sum(inference_times)/len(inference_times)}, memory: {sum(memory_usages)/len(memory_usages)}")
 ```
+**Result**
 ```
 Accuracy: 20.00
 Time: 0.05374705195426941, memory: 1409.9
````

````diff
@@ -915,10 +919,11 @@ TF-IDF is computed as a weight over n-grams, and the reference captions and generated cap
 
 An evaluation metric for how well the summarized text retains the important entities that appear in the reference document.
 
+EMR().entity(source text, summarized text)
 ```python
 >>> # -*- coding: utf-8 -*-
 >>> from nltkor.metrics import EntMent
->>>
+>>> EMR().entity("국립창원대학교(총장 박민원)가 사천우주항공캠퍼스 개교와 함께 2025학년도 사천우주항공공학부 입학식을 7일 오전 11시 사천우주항공캠퍼스에서 열었다.이날 행사에는 박민원 총장을 비롯해 국민의힘 서천호 국회의원(사천·남해·하동), 윤영빈 우주항공청장, 박동식 사천시장, 김규헌 사천시의회 의장, 지역 유관기관 관계자들과 신입생 및 가족들이 참석했다. 글로컬대학30사업 선정에 따라 국립창원대와 통합을 추진 중인 경남도립거창대학, 경남도립남해대학 관계자도 함께 자리했다.행사는 1부 현판 제막식과 2부 입학식으로 진행됐으며, 박동식 사천시장은 신입생들에게 축하 선물로 금배지를 전달했고, 박민원 총장은 캠퍼스 설립에 기여한 유공자들에게 표창장을 수여했다.","국립창원대학교는 4월 7일 사천우주항공캠퍼스에서 2025학년도 사천우주항공공학부 입학식을 개최했다. 이날 행사에는 박민원 총장, 서천호 국회의원, 윤영빈 우주항공청장, 박동식 사천시장 등 주요 인사와 신입생 및 가족들이 참석했으며, 글로컬대학30사업과 관련된 거창대학·남해대학 관계자들도 함께했다. 행사는 현판 제막식과 입학식으로 나뉘어 진행되었고, 신입생들에게는 금배지가, 캠퍼스 설립 유공자들에게는 표창장이 수여되었다.")
 Downloading Espresso5 model...
 0.8888888888888888
 ```
````

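For intuition, EMR is essentially recall over named entities: the share of entities mentioned in the source that also appear in the summary. A minimal sketch of that computation (the `entity_mention_recall` helper and the entity lists below are illustrative only; nltkor's own implementation extracts the entities with its Espresso tagger):

```python
def entity_mention_recall(source_entities, summary_entities):
    # EMR = |source entities kept in the summary| / |source entities|
    source = set(source_entities)
    if not source:
        return 0.0
    return len(source & set(summary_entities)) / len(source)

# Toy example: 8 of 9 source entities survive summarization -> 0.888...
src = ['국립창원대학교', '박민원', '사천우주항공캠퍼스', '서천호', '윤영빈',
       '박동식', '김규헌', '경남도립거창대학', '경남도립남해대학']
print(entity_mention_recall(src, src[:8]))  # 0.8888888888888888
```
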
**{nltkor-1.2.16 → nltkor-1.2.18}/README.md (continued)**

````diff
@@ -1664,6 +1669,42 @@ Adding FAISS index...
 4	피아노 연주는 나를 편안하게 해줍니다.	[-0.242319867, 0.6492734551, -1.4172941446, 0....	34.069862
 ```
 
+#### 12.6 TRIE search
+- Determines whether a word is contained in the dictionary text file (the nested-dict structure this builds is sketched right after this diff).
+
+```python
+from nltkor.search.search_dict import SearchDic  # import added here so the example runs
+
+root = {}
+dict_file = 'path to the dictionary text file'
+sc = SearchDic(root)
+with open(dict_file, 'r') as f:
+    for line in f:
+        if ';;' in line[:2]: continue
+        k, v = line.strip().split('\t')
+        sc.build_search_dict(k, v)
+# print(root)
+word = '고용 노동부'
+values, value_data = sc.search_dict(word, True)
+print(values, value_data)
+
+word = '2시뉴스외전'
+values, value_data = sc.search_dict(word, True)
+print(values, value_data)
+word = '2시 뉴스외전'
+values, value_data = sc.search_dict(word, True)
+print(values, value_data)
+
+word = 'gbc'
+values, value_data = sc.search_dict(word, True)
+print(values, value_data)
+```
+**Result**
+```
+['고용 노동부'] ['NN']
+['2시뉴스외전'] ['NN']
+['2시 뉴스외전'] ['NN']
+['bc'] ['ND']
+```
+
 ### 13. Sejong Electronic Dictionary (ssem)
 
 Before using this feature, the argument format needs explaining. The arguments are used by the **entrys, entry, sense** functions. To illustrate the format, the examples use the noun '눈' (eye) and the verb '감다' (to close one's eyes).
````

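As mentioned in the 12.6 bullet above, the trie that `build_search_dict` constructs is a plain nested dict, one level per character, with the value stored under an end-of-word marker (`'$$'` in search_dict.py, shown in full further below). A minimal sketch of the resulting structure, using a hypothetical two-entry dictionary:

```python
root = {}

def build(word, data, root):
    # same insertion scheme as SearchDic.build_search_dict
    d = root
    for ch in word:
        d = d.setdefault(ch, {})
    d['$$'] = data

build('ab', 'NN', root)
build('ac', 'ND', root)
print(root)  # {'a': {'b': {'$$': 'NN'}, 'c': {'$$': 'ND'}}}
```
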
**{nltkor-1.2.16 → nltkor-1.2.18}/nltkor/metrics/__init__.py**

````diff
@@ -53,7 +53,6 @@ from nltkor.metrics.eval import StringMetric
 """
 from nltkor.metrics.classical import DefaultMetric
 from nltkor.metrics.entment import EMR
-from nltkor.metrics.bleu_tensor import *
 #DefaultMetric = lazy_import.lazy_callable("nltkor.metrics.classical.DefaultMetric")
 #Mauve = lazy_import.lazy_callable("nltkor.metrics.mauve.Mauve")
 from nltkor.metrics.mauve import Mauve
````

**{nltkor-1.2.16 → nltkor-1.2.18}/nltkor/search/faiss_search.py**

````diff
@@ -78,10 +78,10 @@ class FaissSearch:
            return FaissSearch_SenEmbed(model_name_or_path=model_name_or_path, embedding_type=embedding_type)
        elif mode == 'word':
            return FaissSearch_WordEmbed(model_name_or_path=model_name_or_path, embedding_type=embedding_type)
-       elif mode == 'splade':
-           return FaissSearch_Splade(model_name_or_path=model_name_or_path, embedding_type=embedding_type)
+       elif mode == 'sparse':
+           return FaissSearch_Sparse(model_name_or_path=model_name_or_path, embedding_type=embedding_type)
        else:
-           raise ValueError("choice 'sentence' or 'word' or 'splade'")
+           raise ValueError("choice 'sentence' or 'word' or 'sparse'")
 
 
 
````

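A hedged usage sketch of the dispatch above. The keyword arguments are the ones visible in the hunks; the import path and the assumption that the class dispatches from its constructor are inferred from the README, not confirmed here:

```python
from nltkor.search.faiss_search import FaissSearch  # assumed import path

# mode selects the embedding backend: 'sentence', 'word', or (renamed in 1.2.18) 'sparse'
searcher = FaissSearch(mode='sparse',
                       model_name_or_path='klue/bert-base',
                       embedding_type='last_hidden_state')
```
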
**{nltkor-1.2.16 → nltkor-1.2.18}/nltkor/search/faiss_search.py (continued)**

````diff
@@ -233,7 +233,7 @@ class FaissSearch_SenEmbed:
 
         # Get the embeddings
         with torch.no_grad():
-            embeddings = self.model(
+            embeddings = self.model(encoded_text['input_ids'])
 
         # Get the proper embedding type
         if embedding_type == 'last_hidden_state':
````

````diff
@@ -426,7 +426,7 @@ class FaissSearch_SenEmbed:
 
         # Return the dataset
         return self.dataset
-
+
 
     # Search for the most similar elements in the dataset, given a query
     def search(self,
````

````diff
@@ -465,6 +465,7 @@ class FaissSearch_SenEmbed:
         # Add the scores
         results_df['score'] = scores
 
+
         # Sort the results by score
         results_df.sort_values("score", ascending=True, inplace=True)
 
````

````diff
@@ -473,8 +474,7 @@
 
 
 
-
-class FaissSearch_Splade(FaissSearch_SenEmbed):
+class FaissSearch_Sparse(FaissSearch_SenEmbed):
     def __init__(self,
                  model_name_or_path: str = 'klue/bert-base',
                  tokenizer_name_or_path: str = 'klue/bert-base',
````

````diff
@@ -580,14 +580,14 @@ class FaissSearch_Splade(FaissSearch_SenEmbed):
 
         # Get the embeddings
         with torch.no_grad():
-            embeddings = self.model(
+            embeddings = self.model(encoded_text['input_ids'])
 
         # Get the last hidden state
         embeddings = embeddings['logits']
 
         embeddings = torch.sum(torch.log(1+torch.relu(embeddings)) * encoded_text['attention_mask'].unsqueeze(-1), dim=1)
         e_norm = torch.nn.functional.normalize(embeddings, p=2, dim=1, eps=1e-8)
-
+
         # Return the embeddings
         return e_norm
 
````

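The `FaissSearch_Sparse` hunk above is SPLADE-style pooling: the per-token vocabulary logits are passed through log(1 + ReLU(·)), multiplied by the attention mask to drop padding, summed over the sequence, and L2-normalized, yielding one sparse lexical vector per input. A self-contained sketch of just that computation on dummy tensors:

```python
import torch

batch, seq_len, vocab = 2, 8, 32000          # dummy shapes
logits = torch.randn(batch, seq_len, vocab)   # stand-in for embeddings['logits']
attention_mask = torch.ones(batch, seq_len)   # stand-in for encoded_text['attention_mask']

# log(1 + relu) keeps term weights non-negative and dampens large logits;
# the mask zeroes padding tokens before the sum over the sequence dimension.
weights = torch.sum(torch.log(1 + torch.relu(logits)) * attention_mask.unsqueeze(-1), dim=1)
e_norm = torch.nn.functional.normalize(weights, p=2, dim=1, eps=1e-8)
print(e_norm.shape)  # torch.Size([2, 32000]): one vocabulary-sized vector per input
```
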
**nltkor-1.2.18/nltkor/search/search_dict.py (new file)**

````diff
@@ -0,0 +1,95 @@
+import re, os, sys
+import pandas as pd
+import numpy as np
+import json
+import argparse
+import pickle  # added here: save_dict/load_dict below need it
+
+class SearchDic:
+    def __init__(self, root):
+        self.root = root
+
+    def build_search_dict(self, word, data):
+        # Insert `word` into the nested-dict trie; `data` is stored under
+        # the end-of-word marker '$$'.
+        current_dict = self.root
+        _end_word_ = '$$'
+        for letter in word:
+            current_dict = current_dict.setdefault(letter, {})
+        current_dict = current_dict.setdefault(_end_word_, data)
+
+    def search_dict(self, word, space_flag=False):
+        '''
+        TRIE search.
+        space_flag: if True, a match may span a space present in either the
+        query or the dictionary entry; otherwise spaces must match exactly.
+        '''
+        values = list()
+        value_data = list()
+        if not word: return self.root.keys()
+
+        current_dict = self.root
+        _end_word_ = '$$'
+        SPACE = ' '
+        s = 0  # start index of the current candidate match
+        for i, letter in enumerate(word):
+            #print(i, s, '>', letter, values, value_data, current_dict)
+            if letter in current_dict:
+                current_dict = current_dict[letter]
+                if _end_word_ in current_dict:
+                    values.append(word[s:i+1])
+                    value_data.append(current_dict[_end_word_])
+            elif space_flag and letter != SPACE and SPACE in current_dict:
+                # look ahead past a space stored in the dictionary entry
+                look_ahead_dict = current_dict[SPACE]
+                if letter in look_ahead_dict:
+                    current_dict = look_ahead_dict[letter]
+            elif space_flag and letter == SPACE:
+                # skip a space in the query
+                continue
+            else:
+                # mismatch: restart matching from the next character
+                s = i+1
+                current_dict = self.root
+        else:
+            # for-else: runs once the whole word has been scanned
+            if values: return values, value_data
+            else: return list(word), value_data
+
+    def save_dict(self, file_path):
+        # Save the root dictionary to a pickle file.
+        with open(file_path, 'wb') as f:
+            pickle.dump(self.root, f)
+
+    def load_dict(self, file_path):
+        # Read a pickled root dictionary back in.
+        with open(file_path, 'rb') as f:
+            return pickle.load(f)
+
+if __name__ == "__main__":
+    root = {}
+    dict_file = 'path to the dictionary text file'
+    sc = SearchDic(root)
+    with open(dict_file, 'r') as f:
+        for line in f:
+            if ';;' in line[:2]: continue
+            k, v = line.strip().split('\t')
+            sc.build_search_dict(k, v)
+    # print(root)
+    word = '고용 노동부'
+    values, value_data = sc.search_dict(word, True)
+    print(values, value_data)
+
+    word = '2시뉴스외전'
+    values, value_data = sc.search_dict(word, True)
+    print(values, value_data)
+    word = '2시 뉴스외전'
+    values, value_data = sc.search_dict(word, True)
+    print(values, value_data)
+
+    word = 'gbc'
+    values, value_data = sc.search_dict(word, True)
+    print(values, value_data)
````

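Finally, a possible round trip through `save_dict`/`load_dict` (a sketch: the module path follows the file list above, and `trie.pkl` is an arbitrary file name):

```python
from nltkor.search.search_dict import SearchDic

sc = SearchDic({})
sc.build_search_dict('고용 노동부', 'NN')
sc.save_dict('trie.pkl')                         # pickle the nested-dict root

restored = SearchDic(sc.load_dict('trie.pkl'))   # rebuild a searcher from the pickle
print(restored.search_dict('고용 노동부', True))  # (['고용 노동부'], ['NN'])
```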