nltkor 1.2.18.tar.gz → 1.2.20.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {nltkor-1.2.18 → nltkor-1.2.20}/PKG-INFO +30 -8
- {nltkor-1.2.18 → nltkor-1.2.20}/README.md +110 -39
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/__init__.py +1 -1
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/search/__init__.py +1 -1
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/search/faiss_search.py +7 -314
- nltkor-1.2.18/nltkor/search/search_dict.py → nltkor-1.2.20/nltkor/search/trie_search.py +10 -10
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/tag/libs/network.c +125 -125
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor.egg-info/PKG-INFO +30 -8
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor.egg-info/SOURCES.txt +1 -1
- {nltkor-1.2.18 → nltkor-1.2.20}/setup.py +1 -1
- {nltkor-1.2.18 → nltkor-1.2.20}/LICENSE.txt +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/Kor_char.py +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/alignment/__init__.py +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/cider/__init__.py +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/cider/cider.py +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/cider/cider_scorer.py +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/distance/__init__.py +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/distance/wasserstein.py +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/etc.py +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/lazyimport.py +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/make_requirement.py +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/metrics/__init__.py +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/metrics/bartscore.py +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/metrics/bertscore.py +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/metrics/classical.py +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/metrics/entment.py +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/metrics/eval.py +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/metrics/mauve.py +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/metrics/mauve_utils.py +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/misc/__init__.py +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/misc/string2string_basic_functions.py +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/misc/string2string_default_tokenizer.py +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/misc/string2string_hash_functions.py +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/misc/string2string_word_embeddings.py +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/search/classical.py +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/search/kobert_tokenizer.py +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/sejong/__init__.py +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/sejong/__pycache__/__init__.cpython-38.pyc +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/sejong/__pycache__/__init__.cpython-39.pyc +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/sejong/__pycache__/sejong_download.cpython-38.pyc +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/sejong/__pycache__/sejong_download.cpython-39.pyc +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/sejong/__pycache__/ssem.cpython-38.pyc +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/sejong/__pycache__/ssem.cpython-39.pyc +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/sejong/ch.py +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/sejong/dict_semClassNum.txt +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/sejong/layer.txt +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/sejong/sejong_download.py +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/sejong/ssem.py +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/similarity/__init__.py +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/similarity/bartscore____.py +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/similarity/bertscore____.py +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/similarity/classical.py +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/similarity/cosine_similarity.py +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/tag/__init__.py +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/tag/__pycache__/__init__.cpython-38.pyc +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/tag/__pycache__/__init__.cpython-39.pyc +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/tag/__pycache__/espresso_tag.cpython-38.pyc +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/tag/__pycache__/espresso_tag.cpython-39.pyc +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/tag/espresso_tag.py +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/tag/libs/__init__.py +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/tag/libs/__pycache__/__init__.cpython-38.pyc +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/tag/libs/__pycache__/__init__.cpython-39.pyc +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/tag/libs/__pycache__/attributes.cpython-38.pyc +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/tag/libs/__pycache__/attributes.cpython-39.pyc +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/tag/libs/__pycache__/config.cpython-38.pyc +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/tag/libs/__pycache__/config.cpython-39.pyc +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/tag/libs/__pycache__/metadata.cpython-38.pyc +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/tag/libs/__pycache__/metadata.cpython-39.pyc +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/tag/libs/__pycache__/reader.cpython-38.pyc +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/tag/libs/__pycache__/reader.cpython-39.pyc +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/tag/libs/__pycache__/taggers.cpython-38.pyc +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/tag/libs/__pycache__/taggers.cpython-39.pyc +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/tag/libs/__pycache__/utils.cpython-38.pyc +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/tag/libs/__pycache__/utils.cpython-39.pyc +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/tag/libs/__pycache__/word_dictionary.cpython-38.pyc +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/tag/libs/__pycache__/word_dictionary.cpython-39.pyc +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/tag/libs/arguments.py +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/tag/libs/attributes.py +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/tag/libs/config.py +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/tag/libs/metadata.py +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/tag/libs/ner/__init__.py +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/tag/libs/ner/__pycache__/__init__.cpython-38.pyc +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/tag/libs/ner/__pycache__/__init__.cpython-39.pyc +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-38.pyc +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-39.pyc +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/tag/libs/ner/macmorphoreader.py +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/tag/libs/ner/ner_reader.py +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/tag/libs/network.pyx +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/tag/libs/networkconv.pyx +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/tag/libs/networkdependencyconv.pyx +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/tag/libs/parse/__init__.py +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/tag/libs/parse/__pycache__/__init__.cpython-38.pyc +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/tag/libs/parse/__pycache__/__init__.cpython-39.pyc +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-38.pyc +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-39.pyc +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/tag/libs/parse/parse_reader.py +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/tag/libs/pos/__init__.py +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/tag/libs/pos/__pycache__/__init__.cpython-38.pyc +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/tag/libs/pos/__pycache__/__init__.cpython-39.pyc +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-38.pyc +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-39.pyc +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/tag/libs/pos/macmorphoreader.py +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/tag/libs/pos/pos_reader.py +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/tag/libs/reader.py +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/tag/libs/srl/__init__.py +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/tag/libs/srl/__pycache__/__init__.cpython-38.pyc +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/tag/libs/srl/__pycache__/__init__.cpython-39.pyc +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-38.pyc +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-39.pyc +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/tag/libs/srl/__pycache__/train_srl.cpython-38.pyc +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/tag/libs/srl/__pycache__/train_srl.cpython-39.pyc +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/tag/libs/srl/__srl_reader_.py +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/tag/libs/srl/srl_reader.py +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/tag/libs/srl/train_srl.py +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/tag/libs/taggers.py +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/tag/libs/utils.py +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/tag/libs/word_dictionary.py +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/tag/libs/wsd/__init__.py +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/tag/libs/wsd/__pycache__/__init__.cpython-38.pyc +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/tag/libs/wsd/__pycache__/__init__.cpython-39.pyc +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-38.pyc +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-39.pyc +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/tag/libs/wsd/macmorphoreader.py +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/tag/libs/wsd/wsd_reader.py +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/tokenize/__init__.py +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/tokenize/ko_tokenize.py +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor/trans.py +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor.egg-info/dependency_links.txt +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor.egg-info/requires.txt +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/nltkor.egg-info/top_level.txt +0 -0
- {nltkor-1.2.18 → nltkor-1.2.20}/setup.cfg +0 -0
{nltkor-1.2.18 → nltkor-1.2.20}/PKG-INFO

````diff
@@ -1,11 +1,8 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: nltkor
-Version: 1.2.18
-Summary: UNKNOWN
+Version: 1.2.20
 Home-page: https://modi.changwon.ac.kr/air_cwnu/nlp_tool/nltk_ko.git
-License: UNKNOWN
 Keywords: string matching,pattern matching,edit distance,string to string correction,string to string matching,Levenshtein edit distance,Hamming distance,Damerau-Levenshtein distance,Jaro-Winkler distance,longest common subsequence,longest common substring,dynamic programming,approximate string matching,semantic similarity,natural language processing,NLP,information retrieval,rouge,sacrebleu,bertscore,bartscore,fasttext,glove,cosine similarity,Smith-Waterman,Needleman-Wunsch,Hirschberg,Karp-Rabin,Knuth-Morris-Pratt,Boyer-Moore
-Platform: UNKNOWN
 Classifier: Programming Language :: Python :: 3.7
 Classifier: Programming Language :: Python :: 3.8
 Classifier: Programming Language :: Python :: 3.9
@@ -15,6 +12,31 @@ Classifier: Operating System :: OS Independent
 Classifier: Typing :: Typed
 Requires-Python: >=3.7
 License-File: LICENSE.txt
-
-
-
+Requires-Dist: Cython
+Requires-Dist: numpy<=1.26.4,>=1.23.5
+Requires-Dist: regex
+Requires-Dist: tqdm>=4.40.0
+Requires-Dist: joblib
+Requires-Dist: requests
+Requires-Dist: nltk>3.0
+Requires-Dist: pyarrow
+Requires-Dist: beautifulSoup4
+Requires-Dist: faiss-cpu==1.7.3
+Requires-Dist: datasets
+Requires-Dist: torch
+Requires-Dist: dill<0.3.9
+Requires-Dist: scikit-learn>=0.22.1
+Requires-Dist: transformers==4.42.2
+Requires-Dist: protobuf
+Requires-Dist: sentencepiece
+Requires-Dist: pandas
+Requires-Dist: bert_score
+Requires-Dist: chardet
+Requires-Dist: GPUtil
+Requires-Dist: fasttext
+Dynamic: classifier
+Dynamic: home-page
+Dynamic: keywords
+Dynamic: license-file
+Dynamic: requires-dist
+Dynamic: requires-python
````
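Compared with 1.2.18, the 1.2.20 metadata drops the `UNKNOWN` placeholder fields and declares its runtime dependencies, several of them pinned (for example `faiss-cpu==1.7.3` and `transformers==4.42.2`). The snippet below is a quick, illustrative way to see whether an existing environment already satisfies some of these pins; it is not part of the package, and the names checked are simply taken from the `Requires-Dist` lines above.

```python
# Illustrative only: report installed versions of a few dependencies that
# nltkor 1.2.20 now declares in its metadata (not part of the package itself).
from importlib.metadata import version, PackageNotFoundError

for name in ["faiss-cpu", "transformers", "numpy", "torch", "dill"]:
    try:
        print(f"{name}=={version(name)}")
    except PackageNotFoundError:
        print(f"{name} is not installed")
```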
{nltkor-1.2.18 → nltkor-1.2.20}/README.md

````diff
@@ -110,18 +110,13 @@ NLTKor는 한국어를 위한 NLTK이며 기존의 영어에서 사용하는 Wor
 
 ## 2. Environment
 
-- OS: ubuntu 18.04, ubuntu 22.04, MacOS
+- OS: ubuntu 18.04, ubuntu 22.04, MacOS, Windows
 - Language: `python3.8`, `python3.9`, `python3.10`, `python3.11`
 - Libraries: nltk>=1.1.3, numpy==1.23, faiss-cpu=1.7.3 **※ NLTKor requires the English NLTK to be installed separately.**
 
 **Notes**
 
-
-
-| OS     | python                                    | Architecture  |
-| ------ | ----------------------------------------- | ------------- |
-| Mac    | python3.8                                 | arm64         |
-| ubuntu | python3.8 python3.9 python3.10 python3.11 | arm64, x86_64 |
+- On Windows with python 3.9~3.11, the fasttext library is not supported; use the fasttext-wheel library instead (pip install fasttext-wheel).
 
 ### 2.1 Installing the library
 
````
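The note added above applies to Windows installs on Python 3.9–3.11, where the `fasttext` distribution cannot be built and `fasttext-wheel` should be installed instead; both distributions provide the same importable `fasttext` module. A hypothetical guard (not nltkor code) that surfaces the right install hint at import time might look like this:

```python
# Hypothetical guard, not part of nltkor: 'fasttext' and 'fasttext-wheel'
# both install the same importable 'fasttext' module.
import sys

try:
    import fasttext  # noqa: F401
except ImportError as err:
    hint = "fasttext-wheel" if sys.platform == "win32" else "fasttext"
    raise ImportError(f"fasttext module not found; try: pip install {hint}") from err
```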
{nltkor-1.2.18 → nltkor-1.2.20}/README.md

````diff
@@ -1533,21 +1528,33 @@ P to Q2 : 0.1981
 ```
 
 #### 12.5 Faiss-Semantic Search
-
-- **
--
-
--
--
--
--
--
--
--
+- class FaissSearch
+  - **__new__**(mode = None, model_name_or_path: str = 'klue/bert-base', tokenizer_name_or_path: str = 'klue/bert-base', device: str = 'cpu') -> None : dispatches to the class selected by mode.
+    - mode = dense(dense | sparse)
+
+- class FaissSearch_Dense : search using the original Faiss
+  - **init**(model_name_or_path: str = 'klue/bert-base', tokenizer_name_or_path: str = 'klue/bert-base', device: str = 'cpu')→ None : initializes FaissSearch_Dense.
+  - add_faiss_index(column_name: str = 'embeddings', metric_type: int | None = None, batch_size: int = 8, \*\*kwargs)→ None : adds a FAISS index to the dataset.
+  - get_embeddings(text: str | List[str], embedding_type: str = 'last_hidden_state', batch_size: int = 8, num_workers: int = 4)→ Tensor : embeds the text.
+  - get_last_hidden_state(embeddings: Tensor)→ Tensor : returns the last hidden state of the embedded text.
+  - get_mean_pooling(embeddings: Tensor)→ Tensor : returns the mean pooling of the input embeddings.
+  - initialize_corpus(corpus: Dict[str, List[str]] | DataFrame | Dataset, section: str = 'text', index_column_name: str = 'embeddings', embedding_type: str = 'last_hidden_state', batch_size: int | None = None, num_workers: int | None = None, save_path: str | None = None)→ Dataset : initializes the dataset.
+  - load_dataset_from_json(json_path: str)→ Dataset : loads a dataset from a JSON file.
+  - load_faiss_index(index_name: str, file_path: str, device: str = 'cpu')→ None : loads a FAISS index.
+  - save_faiss_index(index_name: str, file_path: str)→ None : saves the FAISS index to the given file path.
+  - search(query: str, k: int = 1, index_column_name: str = 'embeddings')→ DataFrame : searches the dataset for the query.
+
+- class FaissSearch_Sparse(FaissSearch_Dense) : faiss for models that use a trainable sparse representation
+  - **init**(model_name_or_path: str = 'klue/bert-base', tokenizer_name_or_path: str = 'klue/bert-base', device: str = 'cpu') -> None : initializes FaissSearch_Sparse.
+  - get_embeddings(text: str | List[str], embedding_type: str = 'last_hidden_state', batch_size: int = 8, num_workers: int = 4) -> Tensor : embeds the text.
+
+<br>
+
+- mode = 'dense' : runs the original faiss.
 
 ```python
->>> from nltkor.search import FaissSearch
->>> faiss = FaissSearch(model_name_or_path = '
+>>> from nltkor.search.faiss_search import FaissSearch
+>>> faiss = FaissSearch(model_name_or_path = 'klue/bert-base', mode='dense')
 >>> corpus = {
     'text': [
         "오늘은 날씨가 매우 덥습니다.",
````
````diff
@@ -1583,29 +1590,93 @@ P to Q2 : 0.1981
     ],
 }
 >>> faiss.initialize_corpus(corpus=corpus, section='text', embedding_type='mean_pooling')
->>> query = "오늘은
+>>> query = "오늘은 날씨가 매우 춥다."
 >>> top_k = 5
 >>> result = faiss.search(query, top_k)
 >>> print(result)
-
-
-
-
-
-
+
+
+                            text                                         embeddings       score
+0           오늘은 날씨가 매우 덥습니다.  [-0.06737425178289413, -0.6356450319290161, -0...   52.453941
+1  휴대폰 없이 하루를 보내는 것이 쉽지 않아요.  [0.09126424789428711, -0.011225797235965729, -...  168.310577
+2      내일은 친구와 영화를 보러 갈 거예요.  [-0.21793286502361298, -0.2237573117017746, 0....  181.051544
+3        요리를 만들면 집안이 좋아보입니다.  [0.7215852737426758, -0.426792711019516, -0.07...  203.423340
+4          스포츠를 하면 건강에 좋습니다.  [0.1290944665670395, -0.6169838905334473, -0.2...  205.527954
+
+```
+
+<br>
+
+- mode = 'sparse' : runs the faiss code for models that use a trainable sparse representation.
+
+```python
+>>> from nltkor.search.faiss_search import FaissSearch
+>>> model_name_or_path = 'klue/bert-base'
+>>> faiss = FaissSearch(model_name_or_path=model_name_or_path, mode='sparse')
+>>> corpus = {
+    'text': [
+        "오늘은 날씨가 매우 덥습니다.",
+        "저는 음악을 듣는 것을 좋아합니다.",
+        "한국 음식 중에서 떡볶이가 제일 맛있습니다.",
+        "도서관에서 책을 읽는 건 좋은 취미입니다.",
+        "내일은 친구와 영화를 보러 갈 거예요.",
+        "여름 휴가 때 해변에 가서 수영하고 싶어요.",
+        "한국의 문화는 다양하고 흥미로워요.",
+        "피아노 연주는 나를 편안하게 해줍니다.",
+        "공원에서 산책하면 스트레스가 풀립니다.",
+        "요즘 드라마를 많이 시청하고 있어요.",
+        "커피가 일상에서 필수입니다.",
+        "새로운 언어를 배우는 것은 어려운 일이에요.",
+        "가을에 단풍 구경을 가고 싶어요.",
+        "요리를 만들면 집안이 좋아보입니다.",
+        "휴대폰 없이 하루를 보내는 것이 쉽지 않아요.",
+        "스포츠를 하면 건강에 좋습니다.",
+        "고양이와 개 중에 어떤 동물을 좋아하세요?"
+        "천천히 걸어가면서 풍경을 감상하는 것이 좋아요.",
+        "일주일에 한 번은 가족과 모임을 가요.",
+        "공부할 때 집중력을 높이는 방법이 있을까요?",
+        "봄에 꽃들이 피어날 때가 기대되요.",
+        "여행 가방을 챙기고 싶어서 설레여요.",
+        "사진 찍는 걸 좋아하는데, 카메라가 필요해요.",
+        "다음 주에 시험이 있어서 공부해야 해요.",
+        "운동을 하면 몸이 가벼워집니다.",
+        "좋은 책을 읽으면 마음이 풍요로워져요.",
+        "새로운 음악을 발견하면 기분이 좋아져요.",
+        "미술 전시회에 가면 예술을 감상할 수 있어요.",
+        "친구들과 함께 시간을 보내는 건 즐거워요.",
+        "자전거 타면 바람을 맞으면서 즐거워집니다."
+    ],
+}
+>>> faiss.initialize_corpus(corpus=corpus, section='text', embedding_type='last_hidden_state')
+>>> query = "오늘은 날씨가 매우 춥다."
+>>> top_k = 5
+>>> result = faiss.search(query=query, k=top_k)
+>>> print(result)
+
+                            text                                         embeddings     score
+0           오늘은 날씨가 매우 덥습니다.  [0.0, 0.055695388466119766, 0.0, 0.0, 0.0, 0.0...  0.130759
+1  휴대폰 없이 하루를 보내는 것이 쉽지 않아요.  [0.0, 0.06064636632800102, 0.0, 0.0, 0.0, 0.03...  0.418491
+2      내일은 친구와 영화를 보러 갈 거예요.  [0.0, 0.0474698506295681, 0.0, 0.0, 0.0, 0.039...  0.435895
+3        가을에 단풍 구경을 가고 싶어요.  [0.0, 0.05392831563949585, 0.0, 0.0, 0.0, 0.05...  0.488796
+4          스포츠를 하면 건강에 좋습니다.  [0.0, 0.05404529720544815, 0.0, 0.0, 0.0, 0.04...  0.496646
+
 ```
 
+<br>
+
+
 - Instead of initializing faiss on every run, you can initialize it once in advance and then perform searches against it.
 
+
 **Usage & Results**
 
 ```python
->>> from nltkor.search import FaissSearch
+>>> from nltkor.search.faiss_search import FaissSearch
 
 # if you use model and tokenizer in local
 # faiss = FaissSearch(model_name_or_path = '~/test_model/trained_model/', tokenizer_name_or_path = '~/test_model/trained_model/')
 
->>> faiss = FaissSearch(model_name_or_path = '
+>>> faiss = FaissSearch(model_name_or_path = 'klue/bert-base', mode='dense')
 >>> corpus = {
     'text': [
         "오늘은 날씨가 매우 덥습니다.",
````
````diff
@@ -1646,12 +1717,12 @@ P to Q2 : 0.1981
 - If `save_path` is given when `initialize_corpus()` is called, the embedded Dataset is saved to that path in JSON format.
 
 ```python
->>> from nltkor.search import FaissSearch
+>>> from nltkor.search.faiss_search import FaissSearch
 
->>> faiss = FaissSearch(model_name_or_path = '
+>>> faiss = FaissSearch(model_name_or_path = 'klue/bert-base', mode='dense')
 >>> faiss.load_dataset_from_json('./test.json')
 >>> faiss.embedding_type = 'mean_pooling' # must match the embedding_type used when initialize_corpus() was run
->>> faiss
+>>> faiss.add_faiss_index(column_name = 'embeddings')
 >>> query = '오늘은 날씨가 매우 춥다.'
 >>> top_k = 5
 >>> result = faiss.search(query=query, top_k=top_k)
````
````diff
@@ -1674,27 +1745,27 @@ Adding FAISS index...
 
 ```python
 root = {}
-dict_file = '
-sc =
+dict_file = '텍스트파일 경로'
+sc = TRIESearch(root)
 with open(dict_file, 'r') as f:
     for line in f:
         if ';;' in line[:2]: continue
         k, v = line.strip().split('\t')
-        sc.
+        sc.build_trie_search(k, v)
 # print(root)
 word = '고용 노동부'
-values, value_data = sc.
+values, value_data = sc.trie_search(word, True)
 print(values, value_data)
 
 word = '2시뉴스외전'
-values, value_data = sc.
+values, value_data = sc.trie_search(word, True)
 print(values, value_data)
 word = '2시 뉴스외전'
-values, value_data = sc.
+values, value_data = sc.trie_search(word, True)
 print(values, value_data)
 
 word = 'gbc'
-values, value_data = sc.
+values, value_data = sc.trie_search(word, True)
 print(values, value_data)
 ```
 **Result**
````
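The example above drives TRIESearch (now shipped as nltkor/search/trie_search.py, renamed from search_dict.py per the file list) through `build_trie_search` and `trie_search`, but the README does not show the underlying data structure. The standalone sketch below is not nltkor's implementation; it only illustrates the general idea behind such a dictionary trie with longest-prefix lookup, using hypothetical names.

```python
# Independent illustration of a dictionary trie with longest-match lookup.
# This is NOT nltkor's TRIESearch, just the general technique it relies on.
class ToyTrie:
    def __init__(self):
        self.root = {}

    def insert(self, key: str, value: str) -> None:
        node = self.root
        for ch in key:
            node = node.setdefault(ch, {})
        node["$"] = value  # '$' marks the end of a stored dictionary entry

    def longest_match(self, text: str):
        """Return (matched_prefix, value) for the longest stored key that prefixes text."""
        node, best = self.root, (None, None)
        for i, ch in enumerate(text):
            if ch not in node:
                break
            node = node[ch]
            if "$" in node:
                best = (text[: i + 1], node["$"])
        return best

trie = ToyTrie()
trie.insert("고용", "NOUN")
trie.insert("고용 노동부", "ORG")
print(trie.longest_match("고용 노동부 발표"))  # ('고용 노동부', 'ORG')
```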
{nltkor-1.2.18 → nltkor-1.2.20}/nltkor/search/faiss_search.py

````diff
@@ -68,24 +68,22 @@ except ImportError:
 
 class FaissSearch:
     def __new__(cls,
-                mode =
+                mode = 'dense',
                 model_name_or_path: str = 'klue/bert-base',
                 tokenizer_name_or_path: str = 'klue/bert-base',
                 embedding_type: str = 'last_hidden_state',
                 device: str = 'cpu'
                 ) -> None:
-        if mode == '
-            return
-        elif mode == 'word':
-            return FaissSearch_WordEmbed(model_name_or_path=model_name_or_path, embedding_type=embedding_type)
+        if mode == 'dense':
+            return FaissSearch_Dense(model_name_or_path=model_name_or_path, embedding_type=embedding_type)
         elif mode == 'sparse':
             return FaissSearch_Sparse(model_name_or_path=model_name_or_path, embedding_type=embedding_type)
         else:
-            raise ValueError("choice '
+            raise ValueError("choice 'dense' or 'sparse'.")
 
 
 
-class FaissSearch_SenEmbed:
+class FaissSearch_Dense:
     def __init__(self,
                  model_name_or_path: str = 'klue/bert-base',
                  tokenizer_name_or_path: str = 'klue/bert-base',
````
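The hunk above replaces the old multi-branch dispatch in `FaissSearch.__new__` (including the removed `FaissSearch_WordEmbed` path) with a two-way choice between `FaissSearch_Dense` and `FaissSearch_Sparse`. The toy sketch below uses hypothetical class names, not the package's code; it only illustrates the factory-via-`__new__` pattern this relies on: because `__new__` returns a fully constructed instance of a different class, Python never runs the factory's own `__init__`.

```python
# Toy illustration of the dispatch pattern used by FaissSearch.__new__:
# the factory returns an already-constructed backend object, so no
# instance of the factory class itself is ever created.
class DenseBackend:
    def __init__(self, model: str):
        self.model = model

class SparseBackend:
    def __init__(self, model: str):
        self.model = model

class SearchFactory:
    def __new__(cls, mode: str = "dense", model: str = "klue/bert-base"):
        if mode == "dense":
            return DenseBackend(model)
        elif mode == "sparse":
            return SparseBackend(model)
        raise ValueError("choice 'dense' or 'sparse'.")

print(type(SearchFactory(mode="sparse")).__name__)  # SparseBackend
```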
{nltkor-1.2.18 → nltkor-1.2.20}/nltkor/search/faiss_search.py

````diff
@@ -474,7 +472,7 @@ class FaissSearch_SenEmbed:
 
 
 
-class FaissSearch_Sparse(FaissSearch_SenEmbed):
+class FaissSearch_Sparse(FaissSearch_Dense):
     def __init__(self,
                  model_name_or_path: str = 'klue/bert-base',
                  tokenizer_name_or_path: str = 'klue/bert-base',
````
````diff
@@ -586,312 +584,7 @@ class FaissSearch_Sparse(FaissSearch_SenEmbed):
         embeddings = embeddings['logits']
 
         embeddings = torch.sum(torch.log(1+torch.relu(embeddings)) * encoded_text['attention_mask'].unsqueeze(-1), dim=1)
-        e_norm = torch.nn.functional.normalize(embeddings, p=2, dim=1, eps=1e-8)
 
         # Return the embeddings
-        return e_norm
-
-
-
-# FAISS word embedding library wrapper class
-class FaissSearch_WordEmbed(FaissSearch_SenEmbed):
-    def __init__(self,
-                 model_name_or_path: str = 'klue/bert-base',
-                 tokenizer_name_or_path: str = 'klue/bert-base',
-                 embedding_type: str = 'last_hidden_state',
-                 device: str = 'cpu',
-                ) -> None:
-        r"""
-        This function initializes the wrapper for the FAISS library, which is used to perform semantic search.
-
-
-        .. attention::
-
-            * If you use this class, please make sure to cite the following paper:
-
-            .. code-block:: latex
-
-                @article{johnson2019billion,
-                    title={Billion-scale similarity search with {GPUs}},
-                    author={Johnson, Jeff and Douze, Matthijs and J{\'e}gou, Herv{\'e}},
-                    journal={IEEE Transactions on Big Data},
-                    volume={7},
-                    number={3},
-                    pages={535--547},
-                    year={2019},
-                    publisher={IEEE}
-                }
-
-            * The code is based on the following GitHub repository:
-                https://github.com/facebookresearch/faiss
-
-        Arguments:
-            model_name_or_path (str, optional): The name or path of the model to use. Defaults to 'facebook/bart-large'.
-            tokenizer_name_or_path (str, optional): The name or path of the tokenizer to use. Defaults to 'facebook/bart-large'.
-            device (str, optional): The device to use. Defaults to 'cpu'.
-
-        Returns:
-            None
-        """
-
-        # Set the device
-        self.device = device
-
-        # If the tokenizer is not specified, use the model name or path
-        if tokenizer_name_or_path is None:
-            tokenizer_name_or_path = model_name_or_path
-
-        # Load the tokenizer
-        if tokenizer_name_or_path == 'skt/kobert-base-v1':
-            # self.tokenizer = KoBERTTokenizer.from_pretrained(tokenizer_name_or_path)
-            self.tokenizer = XLNetTokenizer.from_pretrained(tokenizer_name_or_path)
-        else:
-            self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)
-
-        # Load the model
-        self.model = AutoModel.from_pretrained(model_name_or_path).to(self.device)
-
-
-        # Set the model to evaluation mode (since we do not need the gradients)
-        self.model.eval()
-
-        # Initialize the dataset
-        self.dataset = None
-
-
-    # Get the embeddings (new code)
-    def get_doc_embeddings(self,
-                           #text: Union[str, List[str]],
-                           text=None,
-                           embedding_type: str = 'last_hidden_state',
-                           batch_size: int = 8,
-                           num_workers: int = 4,
-                           ) -> torch.Tensor:
-        """
-        This function returns the embeddings of the input text.
-
-        Arguments:
-            text (Union[str, List[str]]): The input text.
-            embedding_type (str, optional): The type of embedding to use. Defaults to 'last_hidden_state'.
-            batch_size (int, optional): The batch size to use. Defaults to 8.
-            num_workers (int, optional): The number of workers to use. Defaults to 4.
-
-        Returns:
-            torch.Tensor: The embeddings.
-
-        Raises:
-            ValueError: If the embedding type is invalid.
-        """
-
-        # Check if the embedding type is valid
-        if embedding_type not in ['last_hidden_state', 'mean_pooling']:
-            raise ValueError(f'Invalid embedding type: {embedding_type}. Only "last_hidden_state" and "mean_pooling" are supported.')
-
-        ids_dict = {}
-        # Tokenize the input text
-        for sentence in text['text']:
-            encoded_text = self.tokenizer(
-                sentence,
-                padding=False,
-                truncation=True,
-                return_tensors='pt',
-                add_special_tokens=False
-            )
-            # Move the input text to the device
-            encoded_text = encoded_text.to(self.device)
-            token_ids_list = encoded_text['input_ids'].tolist()
-            token_ids_list = token_ids_list[0]
-            for ids in token_ids_list:
-                if ids not in ids_dict.keys():
-                    ids_dict[ids] = [sentence]
-                else:
-                    if text not in ids_dict[ids]:
-                        ids_dict[ids].append(sentence)
-        # Get the embeddings
-        embedding_dict = {}
-        self.model.eval()
-        for key, value in ids_dict.items():
-            embed = self.model(torch.tensor([[key]]), output_hidden_states=True).hidden_states[-1][:,0,:].detach()
-            embedding_dict[embed] = value
-
-        # Return the embeddings
-        return embedding_dict
-
-
-    # Get the embeddings (new code)
-    def get_query_embeddings(self,
-                             text: Union[str, List[str]],
-                             embedding_type: str = 'last_hidden_state',
-                             batch_size: int = 8,
-                             num_workers: int = 4,
-                             ) -> torch.Tensor:
-        """
-        This function returns the embeddings of the input text.
-
-        Arguments:
-            text (Union[str, List[str]]): The input text.
-            embedding_type (str, optional): The type of embedding to use. Defaults to 'last_hidden_state'.
-            batch_size (int, optional): The batch size to use. Defaults to 8.
-            num_workers (int, optional): The number of workers to use. Defaults to 4.
-
-        Returns:
-            torch.Tensor: The embeddings.
-
-        Raises:
-            ValueError: If the embedding type is invalid.
-        """
-
-        # Check if the embedding type is valid
-        if embedding_type not in ['last_hidden_state', 'mean_pooling']:
-            raise ValueError(f'Invalid embedding type: {embedding_type}. Only "last_hidden_state" and "mean_pooling" are supported.')
-
-        # Tokenize the input text
-        encoded_text = self.tokenizer(
-            text,
-            padding=False,
-            truncation=True,
-            return_tensors='pt',
-            add_special_tokens=False,
-        )
-
-        # Move the input text to the device
-        encoded_text = encoded_text.to(self.device)
-
-        token_ids_list = encoded_text['input_ids'].tolist()
-        token_ids_list = token_ids_list[0]
-        tensor_list = [torch.tensor([[value]]) for value in token_ids_list]
-
-        # Get the embeddings
-        embeds = []
-        self.model.eval()
-        for index, tensor in enumerate(tensor_list):
-            embed = self.model(tensor, output_hidden_states=True).hidden_states[-1][:,0,:].detach().cpu().numpy()
-            embeds.append(embed)
-
-        # Return the embeddings
-        return embeds
-
-
-    # Initialize the corpus using a dictionary or pandas DataFrame or HuggingFace Datasets object
-    def initialize_corpus(self,
-                          corpus: Union[Dict[str, List[str]], pd.DataFrame, Dataset],
-                          section: str = 'text',
-                          index_column_name: str = 'embeddings',
-                          embedding_type: str = 'last_hidden_state',
-                          batch_size: Optional[int] = None,
-                          num_workers: Optional[int] = None,
-                          save_path: Optional[str] = None,
-                          ) -> Dataset:
-        """
-        This function initializes a dataset using a dictionary or pandas DataFrame or HuggingFace Datasets object.
-
-        Arguments:
-            dataset_dict (Dict[str, List[str]]): The dataset dictionary.
-            section (str): The section of the dataset to use whose embeddings will be used for semantic search (e.g., 'text', 'title', etc.) (default: 'text').
-            index_column_name (str): The name of the column containing the embeddings (default: 'embeddings')
-            embedding_type (str): The type of embedding to use (default: 'last_hidden_state').
-            batch_size (int, optional): The batch size to use (default: 8).
-            max_length (int, optional): The maximum length of the input sequences.
-            num_workers (int, optional): The number of workers to use.
-            save_path (Optional[str], optional): The path to save the dataset (default: None).
-
-        Returns:
-            Dataset: The dataset object (HuggingFace Datasets).
-
-        Raises:
-            ValueError: If the dataset is not a dictionary or pandas DataFrame or HuggingFace Datasets object.
-        """
-
-        # corpus = { 'text': [...] } -> form_dict
-
-        # Set the embedding_type
-        self.embedding_type = embedding_type
-
-        # get embedding dict
-        embedding_dict = self.get_doc_embeddings(text=corpus, embedding_type=self.embedding_type)
-
-        data = {
-            'text' : embedding_dict.values(),
-            'embeddings': []
-        }
-
-        for embed in embedding_dict.keys():
-            embed_list = embed.tolist()
-            data['embeddings'].append(embed_list[0])
-
-
-        if isinstance(data, dict):
-            self.dataset = Dataset.from_dict(data)
-        elif isinstance(data, pd.DataFrame):
-            self.dataset = Dataset.from_pandas(data)
-        elif isinstance(data, Dataset):
-            self.dataset = corpus
-        else:
-            raise ValueError('The dataset must be a dictionary or pandas DataFrame.')
-
-        # Save the dataset
-        if save_path is not None:
-            self.dataset.to_json(save_path)
-
-        # Add FAISS index
-        self.add_faiss_index(
-            column_name=index_column_name,
-        )
-
-        # Return the dataset
-        return self.dataset
-
-
-    # Search for the most similar elements in the dataset, given a query
-    def search(self,
-               query: str,
-               k: int = 1,
-               index_column_name: str = 'embeddings',
-               ) -> pd.DataFrame:
-        """
-        This function searches for the most similar elements in the dataset, given a query.
-
-        Arguments:
-            query (str): The query.
-            k (int, optional): The number of elements to return (default: 1).
-            index_column_name (str, optional): The name of the column containing the embeddings (default: 'embeddings')
-
-        Returns:
-            pd.DataFrame: The most similar elements in the dataset (text, score, etc.), sorted by score.
-
-        Remarks:
-            The returned elements are dictionaries containing the text and the score.
-        """
-
-        # Get the embeddings of the query
-        query_embeddings = self.get_query_embeddings([query], embedding_type=self.embedding_type)
-
-        # query_embedding이랑 self.dataset['embeddings'] 값 비교
-        scores = []
-        similar_elts = []
-        for query in query_embeddings:
-            # Search for the most similar elements in the dataset
-            score, similar_elt = self.dataset.get_nearest_examples(
-                index_name=index_column_name,
-                query=query,
-                k=k,
-            )
-            scores.append(score)
-            similar_elts.append(similar_elt)
-
+        return embeddings
 
-        text_list = []
-        for item in similar_elts:
-            for text in item['text']:
-                text_list.append(text)
-
-        flat_list = [sentence for sublist in text_list for sentence in sublist]
-        count = Counter(flat_list)
-        count = dict(count.most_common(5))
-
-        sorted_dict = dict(sorted(count.items(), key=lambda x: x[1], reverse=True))
-        # Convert the results to a pandas DataFrame
-        results_df = pd.DataFrame({'text': sorted_dict.keys() , 'freq': sorted_dict.values()})
-
-        # Return the most similar elements
-        return results_df
````