nltkor-1.2.5.tar.gz → nltkor-1.2.9.tar.gz

This diff shows the content of the publicly released package versions and reflects the changes between them as they appear in the public registry. It is provided for informational purposes only.
Files changed (89)
  1. {nltkor-1.2.5 → nltkor-1.2.9}/PKG-INFO +10 -6
  2. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/__init__.py +3 -2
  3. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/metrics/classical.py +34 -1
  4. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/misc/string2string_word_embeddings.py +1 -1
  5. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/search/faiss_search.py +333 -13
  6. nltkor-1.2.9/nltkor/sejong/__pycache__/__init__.cpython-38.pyc +0 -0
  7. nltkor-1.2.9/nltkor/sejong/__pycache__/__init__.cpython-39.pyc +0 -0
  8. nltkor-1.2.9/nltkor/sejong/__pycache__/sejong_download.cpython-38.pyc +0 -0
  9. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/sejong/__pycache__/sejong_download.cpython-39.pyc +0 -0
  10. nltkor-1.2.9/nltkor/sejong/__pycache__/ssem.cpython-38.pyc +0 -0
  11. nltkor-1.2.9/nltkor/sejong/__pycache__/ssem.cpython-39.pyc +0 -0
  12. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/__init__.py +1 -0
  13. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/__init__.py +1 -0
  14. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/config.py +4 -3
  15. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/network.c +43296 -30238
  16. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/pos/pos_reader.py +10 -2
  17. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/reader.py +38 -68
  18. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/taggers.py +6 -6
  19. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/utils.py +41 -1
  20. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor.egg-info/PKG-INFO +10 -6
  21. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor.egg-info/SOURCES.txt +4 -0
  22. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor.egg-info/requires.txt +9 -5
  23. {nltkor-1.2.5 → nltkor-1.2.9}/setup.py +50 -26
  24. nltkor-1.2.5/nltkor/sejong/__pycache__/__init__.cpython-39.pyc +0 -0
  25. {nltkor-1.2.5 → nltkor-1.2.9}/LICENSE.txt +0 -0
  26. {nltkor-1.2.5 → nltkor-1.2.9}/README.md +0 -0
  27. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/Kor_char.py +0 -0
  28. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/alignment/__init__.py +0 -0
  29. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/cider/__init__.py +0 -0
  30. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/cider/cider.py +0 -0
  31. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/cider/cider_scorer.py +0 -0
  32. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/distance/__init__.py +0 -0
  33. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/distance/wasserstein.py +0 -0
  34. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/etc.py +0 -0
  35. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/lazyimport.py +0 -0
  36. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/make_requirement.py +0 -0
  37. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/metrics/__init__.py +0 -0
  38. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/metrics/bartscore.py +0 -0
  39. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/metrics/bertscore.py +0 -0
  40. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/metrics/bleu_tensor.py +0 -0
  41. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/metrics/entment.py +0 -0
  42. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/metrics/eval.py +0 -0
  43. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/metrics/mauve.py +0 -0
  44. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/metrics/mauve_utils.py +0 -0
  45. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/misc/__init__.py +0 -0
  46. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/misc/string2string_basic_functions.py +0 -0
  47. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/misc/string2string_default_tokenizer.py +0 -0
  48. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/misc/string2string_hash_functions.py +0 -0
  49. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/search/__init__.py +0 -0
  50. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/search/classical.py +0 -0
  51. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/search/kobert_tokenizer.py +0 -0
  52. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/sejong/__init__.py +0 -0
  53. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/sejong/ch.py +0 -0
  54. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/sejong/dict_semClassNum.txt +0 -0
  55. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/sejong/layer.txt +0 -0
  56. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/sejong/sejong_download.py +0 -0
  57. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/sejong/ssem.py +0 -0
  58. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/similarity/__init__.py +0 -0
  59. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/similarity/bartscore____.py +0 -0
  60. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/similarity/bertscore____.py +0 -0
  61. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/similarity/classical.py +0 -0
  62. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/similarity/cosine_similarity.py +0 -0
  63. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/espresso_tag.py +0 -0
  64. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/arguments.py +0 -0
  65. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/attributes.py +0 -0
  66. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/metadata.py +0 -0
  67. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/ner/__init__.py +0 -0
  68. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/ner/macmorphoreader.py +0 -0
  69. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/ner/ner_reader.py +0 -0
  70. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/parse/__init__.py +0 -0
  71. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/parse/parse_reader.py +0 -0
  72. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/pos/__init__.py +0 -0
  73. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/pos/macmorphoreader.py +0 -0
  74. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/srl/__init__.py +0 -0
  75. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/srl/__srl_reader_.py +0 -0
  76. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/srl/srl_reader.py +0 -0
  77. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/srl/train_srl.py +0 -0
  78. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/word_dictionary.py +0 -0
  79. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/wsd/__init__.py +0 -0
  80. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/wsd/macmorphoreader.py +0 -0
  81. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/wsd/wsd_reader.py +0 -0
  82. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tokenize/__init__.py +0 -0
  83. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tokenize/ko_tokenize.py +0 -0
  84. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor/trans.py +0 -0
  85. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor.egg-info/dependency_links.txt +0 -0
  86. {nltkor-1.2.5 → nltkor-1.2.9}/nltkor.egg-info/top_level.txt +0 -0
  87. {nltkor-1.2.5 → nltkor-1.2.9}/setup.cfg +0 -0
  88. {nltkor-1.2.5 → nltkor-1.2.9}/test/test.py +0 -0
  89. {nltkor-1.2.5 → nltkor-1.2.9}/test/testespresso.py +0 -0

{nltkor-1.2.5 → nltkor-1.2.9}/PKG-INFO +10 -6

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: nltkor
-Version: 1.2.5
+Version: 1.2.9
 Home-page: https://modi.changwon.ac.kr/air_cwnu/nlp_tool/nltk_ko.git
 Keywords: string matching,pattern matching,edit distance,string to string correction,string to string matching,Levenshtein edit distance,Hamming distance,Damerau-Levenshtein distance,Jaro-Winkler distance,longest common subsequence,longest common substring,dynamic programming,approximate string matching,semantic similarity,natural language processing,NLP,information retrieval,rouge,sacrebleu,bertscore,bartscore,fasttext,glove,cosine similarity,Smith-Waterman,Needleman-Wunsch,Hirschberg,Karp-Rabin,Knuth-Morris-Pratt,Boyer-Moore
 Classifier: Programming Language :: Python :: 3.7
@@ -12,21 +12,25 @@ Classifier: Operating System :: OS Independent
 Classifier: Typing :: Typed
 Requires-Python: >=3.7
 License-File: LICENSE.txt
+Requires-Dist: Cython
+Requires-Dist: numpy<=1.26.4,>=1.23.5
 Requires-Dist: regex
 Requires-Dist: tqdm>=4.40.0
 Requires-Dist: joblib
-Requires-Dist: numpy==1.23.0
 Requires-Dist: requests
 Requires-Dist: nltk>3.0
-Requires-Dist: pyarrow==14.0.0
+Requires-Dist: pyarrow
 Requires-Dist: beautifulSoup4
-Requires-Dist: faiss-cpu>=1.7.3
+Requires-Dist: faiss-cpu==1.7.3
 Requires-Dist: datasets
 Requires-Dist: torch
+Requires-Dist: dill<0.3.9
 Requires-Dist: scikit-learn>=0.22.1
-Requires-Dist: transformers>=4.8.2
+Requires-Dist: transformers==4.42.2
 Requires-Dist: protobuf
 Requires-Dist: sentencepiece
 Requires-Dist: pandas
 Requires-Dist: bert_score
-Requires-Dist: fasttext==0.9.2
+Requires-Dist: chardet
+Requires-Dist: GPUtil
+Requires-Dist: fasttext
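
The 1.2.9 metadata tightens several pins (faiss-cpu==1.7.3, transformers==4.42.2, dill<0.3.9, a numpy window of >=1.23.5,<=1.26.4) and relaxes others (pyarrow, fasttext). A minimal sketch for checking an existing environment against the new pins, assuming the third-party packaging module is available (it is not an nltkor dependency):

    # Sketch: verify installed versions against the 1.2.9 pins (helper script, not part of nltkor).
    from importlib.metadata import version, PackageNotFoundError
    from packaging.specifiers import SpecifierSet

    PINS = {
        "faiss-cpu": "==1.7.3",
        "transformers": "==4.42.2",
        "numpy": ">=1.23.5,<=1.26.4",
        "dill": "<0.3.9",
    }

    for name, spec in PINS.items():
        try:
            installed = version(name)
        except PackageNotFoundError:
            print(f"{name}: not installed (required {spec})")
            continue
        ok = installed in SpecifierSet(spec)
        print(f"{name} {installed}: {'ok' if ok else f'violates {spec}'}")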

{nltkor-1.2.5 → nltkor-1.2.9}/nltkor/__init__.py +3 -2

@@ -1,6 +1,7 @@
 from nltkor import alignment
 from nltkor import cider
 from nltkor import distance
+
 from nltkor import sejong
 from nltkor import metrics
 from nltkor import misc
@@ -8,8 +9,8 @@ from nltkor import search
 from nltkor import similarity
 from nltkor import tag
 from nltkor import tokenize
-
-
 from nltkor import trans
 from nltkor import Kor_char
 from nltkor import etc
+
+__version__ = '1.2.9'
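
The package root now exposes __version__, so callers can check the installed release at runtime. A minimal sketch:

    import nltkor

    # __version__ was added in this release; older installs lack the attribute,
    # so probe it defensively.
    print(getattr(nltkor, "__version__", "unknown (release without __version__)"))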

{nltkor-1.2.5 → nltkor-1.2.9}/nltkor/metrics/classical.py +34 -1

@@ -5,6 +5,8 @@ import numpy as np
 from typing import Callable, Iterable, List, Tuple, Union
 from copy import deepcopy
 import itertools
+import torch
+import time
 from nltk.translate.bleu_score import *
 from nltk.metrics import confusionmatrix
 from collections import defaultdict
@@ -54,6 +56,37 @@ class DefaultMetric:

         return float(tp/total)

+    def accuracy_norm(model, tokenizer, input_text: str, candidates: list, label: int):
+        reserved_memory = []
+        inference_time = []
+        tokenized_prompt = tokenizer(input_text, return_tensors='pt').input_ids
+        total_candidate = []
+
+        for ending in candidates:
+            len_ending = len(ending)
+            tokenized_ending = tokenizer(ending, return_tensors='pt').input_ids
+            tokenized_ending = tokenized_ending[:, 1:]
+            input_ids = torch.cat([tokenized_prompt, tokenized_ending], dim=-1).cuda()
+            labels = input_ids.clone()
+            labels[0, :tokenized_prompt.shape[1]] = -100
+            start = time.time()
+            with torch.no_grad():
+                outputs = model(input_ids, labels=labels)
+            inference_time.append(time.time() - start)
+            reserved_memory.append(torch.cuda.memory_reserved() / (1024**2))
+            total_logprobs = -outputs.loss.item() * tokenized_ending.shape[1]
+            total_candidate.append(total_logprobs/len_ending)
+        answer_idx = total_candidate.index(max(total_candidate))
+        if int(label) == answer_idx:
+            cor = 1
+        else:
+            cor = 0
+        metric_dict = {
+            "reserved_memory": reserved_memory,
+            "inference_time": inference_time
+        }
+        return cor, metric_dict
+
     def recall_score(self, true, pred, avg='micro'):

         mat=confusionmatrix.ConfusionMatrix(true,pred)
@@ -197,7 +230,7 @@ class DefaultMetric:

         return (((precision*recall)/(precision+recall))*2)

-
+


     def pos_eval(self, fin):
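
The new accuracy_norm scores a multiple-choice example by length-normalized log-likelihood: each candidate ending is appended to the prompt, the prompt tokens are masked with -100, and the candidate with the highest per-character score is compared against the gold label, alongside per-candidate GPU-memory and latency measurements. A minimal usage sketch, assuming a CUDA device (the method calls .cuda() and torch.cuda.memory_reserved()) and an illustrative Korean causal LM; note the method is declared without self, so it is called on the class:

    from transformers import AutoModelForCausalLM, AutoTokenizer
    from nltkor.metrics.classical import DefaultMetric

    # Illustrative model choice; any causal LM that accepts labels= works the same way.
    tokenizer = AutoTokenizer.from_pretrained("skt/kogpt2-base-v2")
    model = AutoModelForCausalLM.from_pretrained("skt/kogpt2-base-v2").cuda()

    prompt = "대한민국의 수도는"
    candidates = [" 부산이다.", " 서울이다.", " 제주이다."]
    label = 1  # index of the gold candidate

    cor, stats = DefaultMetric.accuracy_norm(model, tokenizer, prompt, candidates, label)
    print(cor)                      # 1 if the argmax candidate matches label, else 0
    print(stats["inference_time"])  # one forward-pass time (seconds) per candidate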

{nltkor-1.2.5 → nltkor-1.2.9}/nltkor/misc/string2string_word_embeddings.py +1 -1

@@ -49,7 +49,7 @@ except ImportError:
     raise Exception(f"""
     Need to install Libraries, please pip install below libraries
     \t pip install torch
-    \t pip install fasttext
+    \t pip install fasttext-wheel
     Or, use pip install requirement.txt
     \t pip install -r {file_path}
     """)

{nltkor-1.2.5 → nltkor-1.2.9}/nltkor/search/faiss_search.py +333 -13

@@ -33,8 +33,11 @@ SOFTWARE.
 This module contains a wrapper for the Faiss library by Facebook AI Research.
 """

+from collections import Counter
 from typing import List, Union, Optional, Dict, Any
 import os
+import copy
+import logging
 os.environ["TOKENIZERS_PARALLELISM"] = "false"

 from nltkor.make_requirement import make_requirement
@@ -62,13 +65,27 @@ except ImportError:
 # from nltk.search.kobert_tokenizer import KoBERTTokenizer


-
-# FAISS library wrapper class
 class FaissSearch:
+    def __new__(cls,
+                mode = None,
+                model_name_or_path: str = 'klue/bert-base',
+                tokenizer_name_or_path: str = 'klue/bert-base',
+                device: str = 'cpu'
+                ) -> None:
+        if mode == 'sentence':
+            return FaissSearch_SenEmbed(model_name_or_path)
+        elif mode == 'word':
+            return FaissSearch_WordEmbed(model_name_or_path)
+        else:
+            raise ValueError("choice 'sentence' or 'word'")
+
+
+# FAISS original library wrapper class
+class FaissSearch_SenEmbed:
     def __init__(self,
                  model_name_or_path: str = 'klue/bert-base',
                  tokenizer_name_or_path: str = 'klue/bert-base',
-                 device: str = 'cpu'
+                 device: str = 'cpu',
                  ) -> None:
         r"""
         This function initializes the wrapper for the FAISS library, which is used to perform semantic search.
@@ -171,8 +188,6 @@ class FaissSearch:
         return mean_pooling


-
-
     # Get the embeddings
     def get_embeddings(self,
                        text: Union[str, List[str]],
@@ -369,14 +384,6 @@ class FaissSearch:
         self.embedding_type = embedding_type


-        # Tokenize the dataset
-        # self.dataset = self.dataset.map(
-        #     lambda x: x[section],
-        #     batched=True,
-        #     batch_size=batch_size,
-        #     num_proc=num_workers,
-        # )
-
         # Map the section of the dataset to the embeddings
         self.dataset = self.dataset.map(
             lambda x: {
@@ -465,3 +472,316 @@ class FaissSearch:

         # Return the most similar elements
         return results_df
+
+
+
+
+# FAISS word embedding library wrapper class
+class FaissSearch_WordEmbed(FaissSearch_SenEmbed):
+    def __init__(self,
+                 model_name_or_path: str = 'klue/bert-base',
+                 tokenizer_name_or_path: str = 'klue/bert-base',
+                 device: str = 'cpu',
+                 ) -> None:
+        r"""
+        This function initializes the wrapper for the FAISS library, which is used to perform semantic search.
+
+
+        .. attention::
+
+            * If you use this class, please make sure to cite the following paper:
+
+                .. code-block:: latex
+
+                    @article{johnson2019billion,
+                        title={Billion-scale similarity search with {GPUs}},
+                        author={Johnson, Jeff and Douze, Matthijs and J{\'e}gou, Herv{\'e}},
+                        journal={IEEE Transactions on Big Data},
+                        volume={7},
+                        number={3},
+                        pages={535--547},
+                        year={2019},
+                        publisher={IEEE}
+                    }
+
+            * The code is based on the following GitHub repository:
+                https://github.com/facebookresearch/faiss
+
+        Arguments:
+            model_name_or_path (str, optional): The name or path of the model to use. Defaults to 'facebook/bart-large'.
+            tokenizer_name_or_path (str, optional): The name or path of the tokenizer to use. Defaults to 'facebook/bart-large'.
+            device (str, optional): The device to use. Defaults to 'cpu'.
+
+        Returns:
+            None
+        """
+
+        # Set the device
+        self.device = device
+
+        # If the tokenizer is not specified, use the model name or path
+        if tokenizer_name_or_path is None:
+            tokenizer_name_or_path = model_name_or_path
+
+        # Load the tokenizer
+        if tokenizer_name_or_path == 'skt/kobert-base-v1':
+            # self.tokenizer = KoBERTTokenizer.from_pretrained(tokenizer_name_or_path)
+            self.tokenizer = XLNetTokenizer.from_pretrained(tokenizer_name_or_path)
+        else:
+            self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)
+
+        # Load the model
+        self.model = AutoModel.from_pretrained(model_name_or_path).to(self.device)
+
+        # Set the model to evaluation mode (since we do not need the gradients)
+        self.model.eval()
+
+        # Initialize the dataset
+        self.dataset = None
+
+
+
+    # Get the embeddings (new code)
+    def get_doc_embeddings(self,
+                           #text: Union[str, List[str]],
+                           text=None,
+                           embedding_type: str = 'last_hidden_state',
+                           batch_size: int = 8,
+                           num_workers: int = 4,
+                           ) -> torch.Tensor:
+        """
+        This function returns the embeddings of the input text.
+
+        Arguments:
+            text (Union[str, List[str]]): The input text.
+            embedding_type (str, optional): The type of embedding to use. Defaults to 'last_hidden_state'.
+            batch_size (int, optional): The batch size to use. Defaults to 8.
+            num_workers (int, optional): The number of workers to use. Defaults to 4.
+
+        Returns:
+            torch.Tensor: The embeddings.
+
+        Raises:
+            ValueError: If the embedding type is invalid.
+        """
+
+        # Check if the embedding type is valid
+        if embedding_type not in ['last_hidden_state', 'mean_pooling']:
+            raise ValueError(f'Invalid embedding type: {embedding_type}. Only "last_hidden_state" and "mean_pooling" are supported.')
+
+        ids_dict = {}
+        # Tokenize the input text
+        for sentence in text['text']:
+            encoded_text = self.tokenizer(
+                sentence,
+                padding=False,
+                truncation=True,
+                return_tensors='pt',
+                add_special_tokens=False,
+            )
+
+            # Move the input text to the device
+            encoded_text = encoded_text.to(self.device)
+
+            token_ids_list = encoded_text['input_ids'].tolist()
+            token_ids_list = token_ids_list[0]
+            for ids in token_ids_list:
+                if ids not in ids_dict.keys():
+                    ids_dict[ids] = [sentence]
+                else:
+                    if text not in ids_dict[ids]:
+                        ids_dict[ids].append(sentence)
+
+        # Get the embeddings
+        embedding_dict = {}
+        self.model.eval()
+        for key, value in ids_dict.items():
+            embed = self.model(torch.tensor([[key]]), output_hidden_states=True).hidden_states[-1][:,0,:].detach()
+            embedding_dict[embed] = value
+
+        # Return the embeddings
+        return embedding_dict
+
+
+
+    # Get the embeddings (new code)
+    def get_query_embeddings(self,
+                             text: Union[str, List[str]],
+                             embedding_type: str = 'last_hidden_state',
+                             batch_size: int = 8,
+                             num_workers: int = 4,
+                             ) -> torch.Tensor:
+        """
+        This function returns the embeddings of the input text.
+
+        Arguments:
+            text (Union[str, List[str]]): The input text.
+            embedding_type (str, optional): The type of embedding to use. Defaults to 'last_hidden_state'.
+            batch_size (int, optional): The batch size to use. Defaults to 8.
+            num_workers (int, optional): The number of workers to use. Defaults to 4.
+
+        Returns:
+            torch.Tensor: The embeddings.
+
+        Raises:
+            ValueError: If the embedding type is invalid.
+        """
+
+        # Check if the embedding type is valid
+        if embedding_type not in ['last_hidden_state', 'mean_pooling']:
+            raise ValueError(f'Invalid embedding type: {embedding_type}. Only "last_hidden_state" and "mean_pooling" are supported.')
+
+        # Tokenize the input text
+        encoded_text = self.tokenizer(
+            text,
+            padding=False,
+            truncation=True,
+            return_tensors='pt',
+            add_special_tokens=False,
+        )
+
+        # Move the input text to the device
+        encoded_text = encoded_text.to(self.device)
+
+        token_ids_list = encoded_text['input_ids'].tolist()
+        token_ids_list = token_ids_list[0]
+        tensor_list = [torch.tensor([[value]]) for value in token_ids_list]
+
+        # Get the embeddings
+        embeds = []
+        self.model.eval()
+        for index, tensor in enumerate(tensor_list):
+            embed = self.model(tensor, output_hidden_states=True).hidden_states[-1][:,0,:].detach().cpu().numpy()
+            embeds.append(embed)
+
+        # Return the embeddings
+        return embeds
+
+
+
+    # Initialize the corpus using a dictionary or pandas DataFrame or HuggingFace Datasets object
+    def initialize_corpus(self,
+                          corpus: Union[Dict[str, List[str]], pd.DataFrame, Dataset],
+                          section: str = 'text',
+                          index_column_name: str = 'embeddings',
+                          embedding_type: str = 'last_hidden_state',
+                          batch_size: Optional[int] = None,
+                          num_workers: Optional[int] = None,
+                          save_path: Optional[str] = None,
+                          ) -> Dataset:
+        """
+        This function initializes a dataset using a dictionary or pandas DataFrame or HuggingFace Datasets object.
+
+        Arguments:
+            dataset_dict (Dict[str, List[str]]): The dataset dictionary.
+            section (str): The section of the dataset to use whose embeddings will be used for semantic search (e.g., 'text', 'title', etc.) (default: 'text').
+            index_column_name (str): The name of the column containing the embeddings (default: 'embeddings')
+            embedding_type (str): The type of embedding to use (default: 'last_hidden_state').
+            batch_size (int, optional): The batch size to use (default: 8).
+            max_length (int, optional): The maximum length of the input sequences.
+            num_workers (int, optional): The number of workers to use.
+            save_path (Optional[str], optional): The path to save the dataset (default: None).
+
+        Returns:
+            Dataset: The dataset object (HuggingFace Datasets).
+
+        Raises:
+            ValueError: If the dataset is not a dictionary or pandas DataFrame or HuggingFace Datasets object.
+        """
+
+        # corpus = { 'text': [...] } -> form_dict
+
+        # Set the embedding_type
+        self.embedding_type = embedding_type
+
+        # get embedding dict
+        embedding_dict = self.get_doc_embeddings(text=corpus, embedding_type=self.embedding_type)
+
+        data = {
+            'text' : embedding_dict.values(),
+            'embeddings': []
+        }
+
+        for embed in embedding_dict.keys():
+            embed_list = embed.tolist()
+            data['embeddings'].append(embed_list[0])
+
+
+        if isinstance(data, dict):
+            self.dataset = Dataset.from_dict(data)
+        elif isinstance(data, pd.DataFrame):
+            self.dataset = Dataset.from_pandas(data)
+        elif isinstance(data, Dataset):
+            self.dataset = corpus
+        else:
+            raise ValueError('The dataset must be a dictionary or pandas DataFrame.')
+
+        # Save the dataset
+        if save_path is not None:
+            self.dataset.to_json(save_path)
+
+        # Add FAISS index
+        self.add_faiss_index(
+            column_name=index_column_name,
+        )
+
+        # Return the dataset
+        return self.dataset
+
+
+
+    # Search for the most similar elements in the dataset, given a query
+    def search(self,
+               query: str,
+               k: int = 1,
+               index_column_name: str = 'embeddings',
+               ) -> pd.DataFrame:
+        """
+        This function searches for the most similar elements in the dataset, given a query.
+
+        Arguments:
+            query (str): The query.
+            k (int, optional): The number of elements to return (default: 1).
+            index_column_name (str, optional): The name of the column containing the embeddings (default: 'embeddings')
+
+        Returns:
+            pd.DataFrame: The most similar elements in the dataset (text, score, etc.), sorted by score.
+
+        Remarks:
+            The returned elements are dictionaries containing the text and the score.
+        """
+
+
+        # Get the embeddings of the query
+        query_embeddings = self.get_query_embeddings([query], embedding_type=self.embedding_type)
+
+        # Compare query_embedding against the values in self.dataset['embeddings']
+        scores = []
+        similar_elts = []
+        for query in query_embeddings:
+            # Search for the most similar elements in the dataset
+            score, similar_elt = self.dataset.get_nearest_examples(
+                index_name=index_column_name,
+                query=query,
+                k=k,
+            )
+            scores.append(score)
+            similar_elts.append(similar_elt)
+
+        text_list = []
+        for item in similar_elts:
+            for text in item['text']:
+                text_list.append(text)
+
+        flat_list = [sentence for sublist in text_list for sentence in sublist]
+        count = Counter(flat_list)
+        count = dict(count.most_common(5))
+
+        sorted_dict = dict(sorted(count.items(), key=lambda x: x[1], reverse=True))
+
+        # Convert the results to a pandas DataFrame
+        results_df = pd.DataFrame({'text': sorted_dict.keys() , 'freq': sorted_dict.values()})
+
+
+        # Return the most similar elements
+        return results_df
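
FaissSearch is now a thin factory: FaissSearch(mode='sentence', ...) returns the original sentence-embedding wrapper (FaissSearch_SenEmbed), FaissSearch(mode='word', ...) returns the new token-level wrapper (FaissSearch_WordEmbed), and any other mode raises ValueError. A minimal end-to-end sketch of the word-level path added above, using the klue/bert-base default from the diff; the corpus and query strings are illustrative:

    from nltkor.search.faiss_search import FaissSearch

    corpus = {
        "text": [
            "자연어 처리는 재미있다.",
            "FAISS는 빠른 유사도 검색 라이브러리이다.",
            "서울은 대한민국의 수도이다.",
        ]
    }

    # mode='word' dispatches to FaissSearch_WordEmbed via FaissSearch.__new__.
    searcher = FaissSearch(mode="word", model_name_or_path="klue/bert-base")

    # Token-level embeddings are built per document and indexed with FAISS.
    searcher.initialize_corpus(corpus=corpus, section="text", embedding_type="last_hidden_state")

    # search() embeds each query token, queries the index per token, and returns a
    # DataFrame with 'text' and 'freq' columns (frequency of each retrieved sentence).
    print(searcher.search("유사도 검색", k=3))

    # mode='sentence' keeps the pre-1.2.9 behaviour under the FaissSearch_SenEmbed name.
    sent_searcher = FaissSearch(mode="sentence", model_name_or_path="klue/bert-base")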

{nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/__init__.py +1 -0

@@ -68,3 +68,4 @@ For more information, please consult chapter 5 of the NLTK Book.
 from nltkor.tag.espresso_tag import EspressoTagger
 #import nltkor.tag
 from nltkor.tag.libs import taggers
+from .libs import PickleConverter

{nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/__init__.py +1 -0

@@ -5,5 +5,6 @@ from . import utils

 from .taggers import POSTagger, NERTagger, WSDTagger, SRLTagger, DependencyParser
 from .utils import tokenize
+from .utils import PickleConverter

 __version__ = '1.2.0'

{nltkor-1.2.5 → nltkor-1.2.9}/nltkor/tag/libs/config.py +4 -3

@@ -27,9 +27,10 @@ def get_config_paths(directory):
     ('network_text_pos' , 'pos-network.txt'),
     ('pos_tags' , 'pos-tags.txt'),
     ('pos_tag_dict' , 'pos-tags.txt'),
-    ('pos_co_lexicon' , 'pos-co-lexicon.txt'),
-    ('pos_morph_lexicon' , 'pos-morph-lexicon.txt'),
-    ('pos_prob_dict' , 'pos-prob-dict.txt'),
+    ('pos_co_lexicon' , 'pos-co-lexicon.pickle'),
+    ('pos_morph_lexicon' , 'pos-morph-lexicon.pickle'),
+    ('pos_prob_dict' , 'pos-prob-dict.pickle'),
+    ('pos_morph_lexicon_txt' , 'pos-morph-lexicon.txt'),
     ('suffix' , 'suffixes.txt'),
     ('suffixes' , 'suffixes.txt'),
     ('prefix' , 'prefixes.txt'),
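
The POS lexicon and probability-dictionary entries now resolve to .pickle files, with the original text morph lexicon kept under the separate pos_morph_lexicon_txt key (presumably produced via the new PickleConverter exported from nltkor.tag.libs). A hedged sketch of how these (key, filename) pairs resolve to paths, assuming get_config_paths simply joins each filename onto the model directory; only the pairs themselves are visible in this diff:

    import os

    # (key, filename) pairs taken from the diff above; the join-to-directory step is an assumption.
    POS_ENTRIES = [
        ('pos_co_lexicon', 'pos-co-lexicon.pickle'),
        ('pos_morph_lexicon', 'pos-morph-lexicon.pickle'),
        ('pos_prob_dict', 'pos-prob-dict.pickle'),
        ('pos_morph_lexicon_txt', 'pos-morph-lexicon.txt'),
    ]

    def resolve_pos_paths(directory: str) -> dict:
        """Illustrative stand-in for the POS-related slice of get_config_paths(directory)."""
        return {key: os.path.join(directory, filename) for key, filename in POS_ENTRIES}

    paths = resolve_pos_paths('/path/to/espresso-model-dir')
    print(paths['pos_prob_dict'])  # -> /path/to/espresso-model-dir/pos-prob-dict.pickle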