nltkor 1.2.17__tar.gz → 1.2.18__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {nltkor-1.2.17 → nltkor-1.2.18}/PKG-INFO +7 -23
- {nltkor-1.2.17 → nltkor-1.2.18}/README.md +40 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/__init__.py +1 -1
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/metrics/__init__.py +0 -1
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/search/__init__.py +2 -1
- nltkor-1.2.18/nltkor/search/search_dict.py +95 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/tag/libs/network.c +24404 -27780
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor.egg-info/PKG-INFO +7 -23
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor.egg-info/SOURCES.txt +1 -1
- {nltkor-1.2.17 → nltkor-1.2.18}/setup.py +1 -1
- nltkor-1.2.17/nltkor/metrics/bleu_tensor.py +0 -20
- {nltkor-1.2.17 → nltkor-1.2.18}/LICENSE.txt +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/Kor_char.py +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/alignment/__init__.py +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/cider/__init__.py +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/cider/cider.py +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/cider/cider_scorer.py +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/distance/__init__.py +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/distance/wasserstein.py +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/etc.py +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/lazyimport.py +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/make_requirement.py +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/metrics/bartscore.py +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/metrics/bertscore.py +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/metrics/classical.py +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/metrics/entment.py +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/metrics/eval.py +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/metrics/mauve.py +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/metrics/mauve_utils.py +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/misc/__init__.py +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/misc/string2string_basic_functions.py +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/misc/string2string_default_tokenizer.py +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/misc/string2string_hash_functions.py +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/misc/string2string_word_embeddings.py +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/search/classical.py +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/search/faiss_search.py +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/search/kobert_tokenizer.py +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/sejong/__init__.py +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/sejong/__pycache__/__init__.cpython-38.pyc +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/sejong/__pycache__/__init__.cpython-39.pyc +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/sejong/__pycache__/sejong_download.cpython-38.pyc +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/sejong/__pycache__/sejong_download.cpython-39.pyc +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/sejong/__pycache__/ssem.cpython-38.pyc +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/sejong/__pycache__/ssem.cpython-39.pyc +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/sejong/ch.py +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/sejong/dict_semClassNum.txt +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/sejong/layer.txt +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/sejong/sejong_download.py +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/sejong/ssem.py +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/similarity/__init__.py +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/similarity/bartscore____.py +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/similarity/bertscore____.py +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/similarity/classical.py +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/similarity/cosine_similarity.py +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/tag/__init__.py +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/tag/__pycache__/__init__.cpython-38.pyc +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/tag/__pycache__/__init__.cpython-39.pyc +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/tag/__pycache__/espresso_tag.cpython-38.pyc +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/tag/__pycache__/espresso_tag.cpython-39.pyc +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/tag/espresso_tag.py +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/tag/libs/__init__.py +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/tag/libs/__pycache__/__init__.cpython-38.pyc +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/tag/libs/__pycache__/__init__.cpython-39.pyc +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/tag/libs/__pycache__/attributes.cpython-38.pyc +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/tag/libs/__pycache__/attributes.cpython-39.pyc +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/tag/libs/__pycache__/config.cpython-38.pyc +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/tag/libs/__pycache__/config.cpython-39.pyc +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/tag/libs/__pycache__/metadata.cpython-38.pyc +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/tag/libs/__pycache__/metadata.cpython-39.pyc +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/tag/libs/__pycache__/reader.cpython-38.pyc +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/tag/libs/__pycache__/reader.cpython-39.pyc +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/tag/libs/__pycache__/taggers.cpython-38.pyc +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/tag/libs/__pycache__/taggers.cpython-39.pyc +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/tag/libs/__pycache__/utils.cpython-38.pyc +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/tag/libs/__pycache__/utils.cpython-39.pyc +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/tag/libs/__pycache__/word_dictionary.cpython-38.pyc +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/tag/libs/__pycache__/word_dictionary.cpython-39.pyc +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/tag/libs/arguments.py +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/tag/libs/attributes.py +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/tag/libs/config.py +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/tag/libs/metadata.py +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/tag/libs/ner/__init__.py +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/tag/libs/ner/__pycache__/__init__.cpython-38.pyc +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/tag/libs/ner/__pycache__/__init__.cpython-39.pyc +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-38.pyc +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-39.pyc +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/tag/libs/ner/macmorphoreader.py +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/tag/libs/ner/ner_reader.py +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/tag/libs/network.pyx +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/tag/libs/networkconv.pyx +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/tag/libs/networkdependencyconv.pyx +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/tag/libs/parse/__init__.py +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/tag/libs/parse/__pycache__/__init__.cpython-38.pyc +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/tag/libs/parse/__pycache__/__init__.cpython-39.pyc +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-38.pyc +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-39.pyc +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/tag/libs/parse/parse_reader.py +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/tag/libs/pos/__init__.py +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/tag/libs/pos/__pycache__/__init__.cpython-38.pyc +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/tag/libs/pos/__pycache__/__init__.cpython-39.pyc +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-38.pyc +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-39.pyc +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/tag/libs/pos/macmorphoreader.py +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/tag/libs/pos/pos_reader.py +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/tag/libs/reader.py +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/tag/libs/srl/__init__.py +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/tag/libs/srl/__pycache__/__init__.cpython-38.pyc +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/tag/libs/srl/__pycache__/__init__.cpython-39.pyc +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-38.pyc +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-39.pyc +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/tag/libs/srl/__pycache__/train_srl.cpython-38.pyc +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/tag/libs/srl/__pycache__/train_srl.cpython-39.pyc +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/tag/libs/srl/__srl_reader_.py +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/tag/libs/srl/srl_reader.py +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/tag/libs/srl/train_srl.py +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/tag/libs/taggers.py +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/tag/libs/utils.py +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/tag/libs/word_dictionary.py +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/tag/libs/wsd/__init__.py +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/tag/libs/wsd/__pycache__/__init__.cpython-38.pyc +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/tag/libs/wsd/__pycache__/__init__.cpython-39.pyc +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-38.pyc +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-39.pyc +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/tag/libs/wsd/macmorphoreader.py +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/tag/libs/wsd/wsd_reader.py +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/tokenize/__init__.py +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/tokenize/ko_tokenize.py +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor/trans.py +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor.egg-info/dependency_links.txt +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor.egg-info/requires.txt +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/nltkor.egg-info/top_level.txt +0 -0
- {nltkor-1.2.17 → nltkor-1.2.18}/setup.cfg +0 -0
@@ -1,8 +1,11 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: nltkor
|
3
|
-
Version: 1.2.
|
3
|
+
Version: 1.2.18
|
4
|
+
Summary: UNKNOWN
|
4
5
|
Home-page: https://modi.changwon.ac.kr/air_cwnu/nlp_tool/nltk_ko.git
|
6
|
+
License: UNKNOWN
|
5
7
|
Keywords: string matching,pattern matching,edit distance,string to string correction,string to string matching,Levenshtein edit distance,Hamming distance,Damerau-Levenshtein distance,Jaro-Winkler distance,longest common subsequence,longest common substring,dynamic programming,approximate string matching,semantic similarity,natural language processing,NLP,information retrieval,rouge,sacrebleu,bertscore,bartscore,fasttext,glove,cosine similarity,Smith-Waterman,Needleman-Wunsch,Hirschberg,Karp-Rabin,Knuth-Morris-Pratt,Boyer-Moore
|
8
|
+
Platform: UNKNOWN
|
6
9
|
Classifier: Programming Language :: Python :: 3.7
|
7
10
|
Classifier: Programming Language :: Python :: 3.8
|
8
11
|
Classifier: Programming Language :: Python :: 3.9
|
@@ -12,25 +15,6 @@ Classifier: Operating System :: OS Independent
|
|
12
15
|
Classifier: Typing :: Typed
|
13
16
|
Requires-Python: >=3.7
|
14
17
|
License-File: LICENSE.txt
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
Requires-Dist: tqdm>=4.40.0
|
19
|
-
Requires-Dist: joblib
|
20
|
-
Requires-Dist: requests
|
21
|
-
Requires-Dist: nltk>3.0
|
22
|
-
Requires-Dist: pyarrow
|
23
|
-
Requires-Dist: beautifulSoup4
|
24
|
-
Requires-Dist: faiss-cpu==1.7.3
|
25
|
-
Requires-Dist: datasets
|
26
|
-
Requires-Dist: torch
|
27
|
-
Requires-Dist: dill<0.3.9
|
28
|
-
Requires-Dist: scikit-learn>=0.22.1
|
29
|
-
Requires-Dist: transformers==4.42.2
|
30
|
-
Requires-Dist: protobuf
|
31
|
-
Requires-Dist: sentencepiece
|
32
|
-
Requires-Dist: pandas
|
33
|
-
Requires-Dist: bert_score
|
34
|
-
Requires-Dist: chardet
|
35
|
-
Requires-Dist: GPUtil
|
36
|
-
Requires-Dist: fasttext
|
18
|
+
|
19
|
+
UNKNOWN
|
20
|
+
|
@@ -8,6 +8,8 @@
|
|
8
8
|
| 2 | 2024.5.22 | 차정원 | NLTKo 1.1.0 공개 |
|
9
9
|
| 3 | 2025.2.5 | 이예나 | NLTKor 1.2.0 공개<br> bleu tensor 추가, entment 추가, accuracy norm 추가 |
|
10
10
|
| 4 | 2025.4.3 | 이예나 | NLTKor 1.2.10 업데이트<br> espresso 오류 수정 |
|
11
|
+
| 5 | 2025.5.21 | 정찬혁 | NLTKor 1.2.18 업데이트<br> TRIE 검색 추가|
|
12
|
+
|
11
13
|
|
12
14
|
|
13
15
|
|
@@ -89,6 +91,7 @@
|
|
89
91
|
- [12.3 KMP 검색 알고리즘](#123-kmp-검색)
|
90
92
|
- [12.4 Boyer-Moore 검색 알고리즘](#124-boyer-moore-검색)
|
91
93
|
- [12.5 Faiss-Semantic 검색](#125-faiss-semantic-검색)
|
94
|
+
- [12.6 TRIE 검색](#126-trie-검색)
|
92
95
|
- [13. 세종전자사전 (ssem)](#13-세종전자사전-ssem)
|
93
96
|
- [13.1 객체 확인 방법](#131-객체-확인-방법)
|
94
97
|
- [13.2 entry 접근법](#132-entry-접근법)
|
@@ -609,6 +612,7 @@ accuracy = correct / len(examples)
|
|
609
612
|
print(f"Accuracy: {accuracy * 100:.2f}%")
|
610
613
|
print(f"Time: {sum(inference_times)/len(inference_times)}, memory: {sum(memory_usages)/len(memory_usages)}")
|
611
614
|
```
|
615
|
+
**결과**
|
612
616
|
```
|
613
617
|
Accuracy: 20.00
|
614
618
|
Time: 0.05374705195426941, memory: 1409.9
|
@@ -1665,6 +1669,42 @@ Adding FAISS index...
|
|
1665
1669
|
4 피아노 연주는 나를 편안하게 해줍니다. [-0.242319867, 0.6492734551, -1.4172941446, 0.... 34.069862
|
1666
1670
|
```
|
1667
1671
|
|
1672
|
+
#### 12.6 TRIE 검색
|
1673
|
+
- 텍스트 파일에 word가 포함되어 있는지 판단한다.
|
1674
|
+
|
1675
|
+
```python
|
1676
|
+
root = {}
|
1677
|
+
dict_file = '텍스트 파일 경로'
|
1678
|
+
sc = SearchDic(root)
|
1679
|
+
with open(dict_file, 'r') as f:
|
1680
|
+
for line in f:
|
1681
|
+
if ';;' in line[:2]: continue
|
1682
|
+
k, v = line.strip().split('\t')
|
1683
|
+
sc.build_search_dict(k, v)
|
1684
|
+
# print(root)
|
1685
|
+
word = '고용 노동부'
|
1686
|
+
values, value_data = sc.search_dict(word, True)
|
1687
|
+
print(values, value_data)
|
1688
|
+
|
1689
|
+
word = '2시뉴스외전'
|
1690
|
+
values, value_data = sc.search_dict(word, True)
|
1691
|
+
print(values, value_data)
|
1692
|
+
word = '2시 뉴스외전'
|
1693
|
+
values, value_data = sc.search_dict(word, True)
|
1694
|
+
print(values, value_data)
|
1695
|
+
|
1696
|
+
word = 'gbc'
|
1697
|
+
values, value_data = sc.search_dict(word, True)
|
1698
|
+
print(values, value_data)
|
1699
|
+
```
|
1700
|
+
**결과**
|
1701
|
+
```
|
1702
|
+
['고용 노동부'] ['NN']
|
1703
|
+
['2시뉴스외전'] ['NN']
|
1704
|
+
['2시 뉴스외전'] ['NN']
|
1705
|
+
['bc'] ['ND']
|
1706
|
+
```
|
1707
|
+
|
1668
1708
|
### 13. 세종전자사전 (ssem)
|
1669
1709
|
|
1670
1710
|
우선 해당 기능을 사용하기 전에 인자 포맷에 대해 설명한다. 인자는 **entrys, entry, sense** 함수에서 사용한다. 인자 포맷을 설명하기 위해 예제는 체언의 '눈'과 용언의 '감다'를 이용하였다.
|
@@ -53,7 +53,6 @@ from nltkor.metrics.eval import StringMetric
|
|
53
53
|
"""
|
54
54
|
from nltkor.metrics.classical import DefaultMetric
|
55
55
|
from nltkor.metrics.entment import EMR
|
56
|
-
from nltkor.metrics.bleu_tensor import *
|
57
56
|
#DefaultMetric = lazy_import.lazy_callable("nltkor.metrics.classical.DefaultMetric")
|
58
57
|
#Mauve = lazy_import.lazy_callable("nltkor.metrics.mauve.Mauve")
|
59
58
|
from nltkor.metrics.mauve import Mauve
|
@@ -0,0 +1,95 @@
|
|
1
|
+
import re, os, sys
import pandas as pd
import numpy as np
import json
import argparse
import pickle  # was missing: save_dict/load_dict raised NameError without it


class SearchDic:
    """TRIE-based dictionary for exact and space-tolerant longest-match lookup.

    The trie is a plain nested dict.  The sentinel key ``'$$'`` marks the end
    of a stored entry and holds the data value associated with that entry.
    The root dict is supplied by (and shared with) the caller, so it can be
    inspected or persisted externally.
    """

    # Sentinel key marking a terminal node in the trie.
    _END_WORD = '$$'

    def __init__(self, root):
        """Wrap an existing (possibly empty) trie root dict.

        root: dict used as the trie root; mutated in place by
              build_search_dict().
        """
        self.root = root

    def build_search_dict(self, word, data) -> dict:
        """Insert `word` into the trie, storing `data` at its terminal node.

        If `word` is already present, the existing data is kept
        (setdefault semantics).
        """
        current_dict = self.root
        _end_word_ = self._END_WORD
        for letter in word:
            current_dict = current_dict.setdefault(letter, {})
        # Terminal marker goes on the node reached after the LAST letter,
        # not inside the loop — otherwise every prefix would look terminal.
        current_dict = current_dict.setdefault(_end_word_, data)

    def search_dict(self, word, space_flag=False):
        """TRIE search: scan `word` and collect every dictionary entry found.

        space_flag: if True, a space in either the trie entry or the query
                    is tolerated (i.e. matching may skip over a single space
                    on either side); otherwise spaces must match exactly.

        Returns (matched_substrings, matched_data).  If nothing matched,
        returns (list of the individual characters of `word`, []).
        NOTE(review): an empty `word` returns self.root.keys() — a different
        type from the normal 2-tuple; kept for backward compatibility.
        """
        values = list()
        value_data = list()
        if not word: return self.root.keys()

        current_dict = self.root
        _end_word_ = self._END_WORD
        SPACE = ' '
        s = 0  # start index of the candidate match currently being scanned
        for i, letter in enumerate(word):
            if letter in current_dict:
                current_dict = current_dict[letter]
                if _end_word_ in current_dict:
                    # Reached a terminal node: record the matched span.
                    values.append(word[s:i+1])
                    value_data.append(current_dict[_end_word_])
            elif space_flag and letter != SPACE and SPACE in current_dict:
                # Query omitted a space that the trie entry contains:
                # look one level past the stored space.
                look_ahead_dict = current_dict[SPACE]
                if letter in look_ahead_dict:
                    current_dict = look_ahead_dict[letter]
            elif space_flag and letter == SPACE:
                # Query contains a space the trie entry may lack: skip it.
                continue
            else:
                # Mismatch: restart matching from the NEXT character.
                s = i+1
                current_dict = self.root
        else:
            # for-else: always runs (no break in the loop above).
            if values: return values, value_data
            else: return list(word), value_data

    def save_dict(self, file_path):
        """Persist the trie root dict to `file_path` as a pickle file."""
        with open(file_path, 'wb') as f:
            pickle.dump(self.root, f)

    def load_dict(self, file_path) -> dict:
        """Read a pickled trie back from `file_path` and return it.

        SECURITY NOTE: pickle.load executes arbitrary code from the file;
        only load files you created yourself.
        """
        with open(file_path, 'rb') as f:
            return pickle.load(f)


if __name__ == "__main__":
    root = {}
    dict_file = '텍스트파일경로'
    sc = SearchDic(root)
    with open(dict_file, 'r') as f:
        for line in f:
            # Lines starting with ';;' are comments in the dictionary file.
            if ';;' in line[:2]: continue
            k, v = line.strip().split('\t')
            sc.build_search_dict(k, v)
    # print(root)
    word = '고용 노동부'
    values, value_data = sc.search_dict(word, True)
    print(values, value_data)

    word = '2시뉴스외전'
    values, value_data = sc.search_dict(word, True)
    print(values, value_data)
    word = '2시 뉴스외전'
    values, value_data = sc.search_dict(word, True)
    print(values, value_data)

    word = 'gbc'
    values, value_data = sc.search_dict(word, True)
    print(values, value_data)