nltkor-1.2.14-cp311-cp311-macosx_13_0_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nltkor/Kor_char.py +193 -0
- nltkor/__init__.py +16 -0
- nltkor/alignment/__init__.py +1315 -0
- nltkor/cider/__init__.py +2 -0
- nltkor/cider/cider.py +55 -0
- nltkor/cider/cider_scorer.py +207 -0
- nltkor/distance/__init__.py +441 -0
- nltkor/distance/wasserstein.py +126 -0
- nltkor/etc.py +22 -0
- nltkor/lazyimport.py +144 -0
- nltkor/make_requirement.py +11 -0
- nltkor/metrics/__init__.py +63 -0
- nltkor/metrics/bartscore.py +301 -0
- nltkor/metrics/bertscore.py +331 -0
- nltkor/metrics/bleu_tensor.py +20 -0
- nltkor/metrics/classical.py +847 -0
- nltkor/metrics/entment.py +24 -0
- nltkor/metrics/eval.py +517 -0
- nltkor/metrics/mauve.py +273 -0
- nltkor/metrics/mauve_utils.py +131 -0
- nltkor/misc/__init__.py +11 -0
- nltkor/misc/string2string_basic_functions.py +59 -0
- nltkor/misc/string2string_default_tokenizer.py +83 -0
- nltkor/misc/string2string_hash_functions.py +159 -0
- nltkor/misc/string2string_word_embeddings.py +503 -0
- nltkor/search/__init__.py +10 -0
- nltkor/search/classical.py +569 -0
- nltkor/search/faiss_search.py +787 -0
- nltkor/search/kobert_tokenizer.py +181 -0
- nltkor/sejong/__init__.py +3 -0
- nltkor/sejong/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/sejong/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/sejong/__pycache__/sejong_download.cpython-38.pyc +0 -0
- nltkor/sejong/__pycache__/sejong_download.cpython-39.pyc +0 -0
- nltkor/sejong/__pycache__/ssem.cpython-38.pyc +0 -0
- nltkor/sejong/__pycache__/ssem.cpython-39.pyc +0 -0
- nltkor/sejong/ch.py +12 -0
- nltkor/sejong/dict_semClassNum.txt +491 -0
- nltkor/sejong/layer.txt +630 -0
- nltkor/sejong/sejong_download.py +87 -0
- nltkor/sejong/ssem.py +684 -0
- nltkor/similarity/__init__.py +3 -0
- nltkor/similarity/bartscore____.py +337 -0
- nltkor/similarity/bertscore____.py +339 -0
- nltkor/similarity/classical.py +245 -0
- nltkor/similarity/cosine_similarity.py +175 -0
- nltkor/tag/__init__.py +71 -0
- nltkor/tag/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/__pycache__/espresso_tag.cpython-38.pyc +0 -0
- nltkor/tag/__pycache__/espresso_tag.cpython-39.pyc +0 -0
- nltkor/tag/espresso_tag.py +220 -0
- nltkor/tag/libs/__init__.py +10 -0
- nltkor/tag/libs/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/attributes.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/attributes.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/config.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/config.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/metadata.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/metadata.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/taggers.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/taggers.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/utils.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/utils.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/word_dictionary.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/word_dictionary.cpython-39.pyc +0 -0
- nltkor/tag/libs/arguments.py +280 -0
- nltkor/tag/libs/attributes.py +231 -0
- nltkor/tag/libs/config.py +159 -0
- nltkor/tag/libs/metadata.py +129 -0
- nltkor/tag/libs/ner/__init__.py +2 -0
- nltkor/tag/libs/ner/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/ner/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/ner/macmorphoreader.py +7 -0
- nltkor/tag/libs/ner/ner_reader.py +92 -0
- nltkor/tag/libs/network.c +72325 -0
- nltkor/tag/libs/network.cpython-311-darwin.so +0 -0
- nltkor/tag/libs/network.pyx +878 -0
- nltkor/tag/libs/networkconv.pyx +1028 -0
- nltkor/tag/libs/networkdependencyconv.pyx +451 -0
- nltkor/tag/libs/parse/__init__.py +1 -0
- nltkor/tag/libs/parse/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/parse/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/parse/parse_reader.py +283 -0
- nltkor/tag/libs/pos/__init__.py +2 -0
- nltkor/tag/libs/pos/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/pos/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/pos/macmorphoreader.py +7 -0
- nltkor/tag/libs/pos/pos_reader.py +97 -0
- nltkor/tag/libs/reader.py +485 -0
- nltkor/tag/libs/srl/__init__.py +3 -0
- nltkor/tag/libs/srl/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/train_srl.cpython-38.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/train_srl.cpython-39.pyc +0 -0
- nltkor/tag/libs/srl/__srl_reader_.py +535 -0
- nltkor/tag/libs/srl/srl_reader.py +436 -0
- nltkor/tag/libs/srl/train_srl.py +87 -0
- nltkor/tag/libs/taggers.py +926 -0
- nltkor/tag/libs/utils.py +384 -0
- nltkor/tag/libs/word_dictionary.py +239 -0
- nltkor/tag/libs/wsd/__init__.py +2 -0
- nltkor/tag/libs/wsd/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/wsd/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/wsd/macmorphoreader.py +7 -0
- nltkor/tag/libs/wsd/wsd_reader.py +93 -0
- nltkor/tokenize/__init__.py +62 -0
- nltkor/tokenize/ko_tokenize.py +115 -0
- nltkor/trans.py +121 -0
- nltkor-1.2.14.dist-info/LICENSE.txt +1093 -0
- nltkor-1.2.14.dist-info/METADATA +41 -0
- nltkor-1.2.14.dist-info/RECORD +127 -0
- nltkor-1.2.14.dist-info/WHEEL +5 -0
- nltkor-1.2.14.dist-info/top_level.txt +1 -0
nltkor/metrics/mauve.py
ADDED
@@ -0,0 +1,273 @@
# Author: Krishna Pillutla
# License: GPLv3

import math
import numpy as np
import time
from types import SimpleNamespace
from nltkor.make_requirement import make_requirement

import faiss
from sklearn.preprocessing import normalize
from sklearn.decomposition import PCA
from sklearn.metrics import auc as compute_area_under_curve
import torch
from transformers import AutoModel, AutoTokenizer

try:
    import torch
    FOUND_TORCH = True
except (ImportError, ModuleNotFoundError):
    FOUND_TORCH = False

try:
    import transformers
    FOUND_TRANSFORMERS = True
except (ImportError, ModuleNotFoundError):
    FOUND_TRANSFORMERS = False

if FOUND_TORCH and FOUND_TRANSFORMERS:
    # only needed for tokenizing
    from .mauve_utils import get_tokenizer, get_model, featurize_tokens_from_model, get_device_from_arg


MODEL, TOKENIZER, MODEL_NAME = None, None, None

class Mauve:
    def __init__(self, model_name_or_path='skt/kobert-base-v1'):
        self.featurize_model_name = model_name_or_path

    def compute(self,
                p_features=None, q_features=None,
                p_tokens=None, q_tokens=None,
                p_text=None, q_text=None,
                num_buckets='auto', pca_max_data=-1, kmeans_explained_var=0.9,
                kmeans_num_redo=5, kmeans_max_iter=500,
                device_id=-1, max_text_length=1024,
                divergence_curve_discretization_size=25, mauve_scaling_factor=5,
                verbose=False, seed=25, batch_size=1, use_float64=False,
                ):
        """
        Compute the MAUVE score between two text generations P and Q.

        P is specified as either ``p_features``, ``p_tokens``, or ``p_text``. Same with Q.

        :param ``p_features``: ``numpy.ndarray`` of shape (n, d), where n is the number of generations.
        :param ``q_features``: ``numpy.ndarray`` of shape (n, d), where n is the number of generations.
        :param ``p_tokens``: list of length n, each entry is a torch.LongTensor of shape (1, length).
        :param ``q_tokens``: list of length n, each entry is a torch.LongTensor of shape (1, length).
        :param ``p_text``: list of length n, each entry is a string.
        :param ``q_text``: list of length n, each entry is a string.
        :param ``num_buckets``: the size of the histogram to quantize P and Q. Options: ``'auto'`` (default, which is n/10) or an integer.
        :param ``pca_max_data``: the number of data points to use for PCA. If `-1`, use all the data. Default -1.
        :param ``kmeans_explained_var``: amount of variance of the data to keep in dimensionality reduction by PCA. Default 0.9.
        :param ``kmeans_num_redo``: number of times to redo k-means clustering (the best objective is kept). Default 5.
            Try reducing this to 1 in order to reduce running time.
        :param ``kmeans_max_iter``: maximum number of k-means iterations. Default 500.
            Try reducing this to 100 in order to reduce running time.
        :param ``featurize_model_name``: name of the model from which features are obtained. Default 'gpt2-large'.
            We support all models which can be loaded from ``transformers.AutoModel.from_pretrained(featurize_model_name)``.
        :param ``device_id``: Device for featurization. Supply gpu_id (e.g. 0 or 3) to use GPU or -1 to use CPU.
        :param ``max_text_length``: maximum number of tokens to consider. Default 1024.
        :param ``divergence_curve_discretization_size``: Number of points to consider on the divergence curve. Default 25.
            Larger values do not offer much of a difference.
        :param ``mauve_scaling_factor``: The constant ``c`` from the paper. Default 5.
            See `Best Practices <index.html#best-practices-for-mauve>`_ for details.
        :param ``verbose``: If True, print running time updates.
        :param ``seed``: random seed to initialize k-means cluster assignments.
        :param ``batch_size``: Batch size for feature extraction.
            A larger batch size speeds up computation.
            You might have to experiment to find the largest batch size that fits in your GPU memory.
            See `here <https://github.com/krishnap25/mauve/issues/8#issuecomment-1082075240>`_ for details.

        :return: an object with fields p_hist, q_hist, divergence_curve and mauve.

        * ``out.mauve`` is a number between 0 and 1, the MAUVE score. Higher values mean P is closer to Q.
        * ``out.frontier_integral``, a number between 0 and 1. Lower values mean that P is closer to Q.
        * ``out.p_hist`` is the obtained histogram for P. Same for ``out.q_hist``.
        * ``out.divergence_curve`` contains the points in the divergence curve. It is of shape (m, 2), where m is ``divergence_curve_discretization_size``.
        """

        if p_features is None and p_tokens is None and p_text is None:
            raise ValueError('Supply at least one of p_features, p_tokens, p_text')
        if q_features is None and q_tokens is None and q_text is None:
            raise ValueError('Supply at least one of q_features, q_tokens, q_text')
        p_features = self.get_features_from_input(
            p_features, p_tokens, p_text, self.featurize_model_name, max_text_length,
            device_id, name="p", verbose=verbose, batch_size=batch_size, use_float64=use_float64,
        )
        q_features = self.get_features_from_input(
            q_features, q_tokens, q_text, self.featurize_model_name, max_text_length,
            device_id, name="q", verbose=verbose, batch_size=batch_size, use_float64=use_float64,
        )
        if num_buckets == 'auto':
            # heuristic: use num_clusters = num_generations / 10
            num_buckets = max(2, int(round(min(p_features.shape[0], q_features.shape[0]) / 10)))
        elif not isinstance(num_buckets, int):
            raise ValueError('num_buckets is expected to be an integer or "auto"')

        # Actual binning
        t1 = time.time()
        p, q = self.cluster_feats(p_features, q_features,
                                  num_clusters=num_buckets,
                                  norm='l2', whiten=False,
                                  pca_max_data=pca_max_data,
                                  explained_variance=kmeans_explained_var,
                                  num_redo=kmeans_num_redo,
                                  max_iter=kmeans_max_iter,
                                  seed=seed, verbose=verbose)
        t2 = time.time()
        if verbose:
            print('total discretization time:', round(t2 - t1, 2), 'seconds')

        # Divergence curve and mauve
        mixture_weights = np.linspace(1e-6, 1 - 1e-6, divergence_curve_discretization_size)
        divergence_curve = self.get_divergence_curve_for_multinomials(p, q, mixture_weights, mauve_scaling_factor)
        x, y = divergence_curve.T
        idxs1 = np.argsort(x)
        idxs2 = np.argsort(y)
        mauve_score = 0.5 * (
            compute_area_under_curve(x[idxs1], y[idxs1]) +
            compute_area_under_curve(y[idxs2], x[idxs2])
        )
        fi_score = self.get_fronter_integral(p, q)
        to_return = SimpleNamespace(
            p_hist=p, q_hist=q, divergence_curve=divergence_curve,
            mauve=mauve_score,
            frontier_integral=fi_score,
            num_buckets=num_buckets,
        )
        return to_return

    def get_features_from_input(self, features, tokenized_texts, texts,
                                featurize_model_name, max_len, device_id, name, batch_size,
                                verbose=False, use_float64=False):
        global MODEL, TOKENIZER, MODEL_NAME
        if features is None:
            # Featurizing is necessary. Make sure the required packages are available
            if not FOUND_TORCH:
                raise ModuleNotFoundError(
                    """PyTorch not found. Please install PyTorch if you would like to use the featurization.
                    For details, see `https://github.com/krishnap25/mauve`
                    and `https://pytorch.org/get-started/locally/`.
                    """)
            if not FOUND_TRANSFORMERS:
                raise ModuleNotFoundError(
                    """Transformers not found. Please install Transformers if you would like to use the featurization.
                    For details, see `https://github.com/krishnap25/mauve`
                    and `https://huggingface.co/transformers/installation.html`.
                    """)

            if tokenized_texts is None:
                # tokenize texts
                if TOKENIZER is None or MODEL_NAME != featurize_model_name:
                    if verbose: print('Loading tokenizer')
                    TOKENIZER = get_tokenizer(featurize_model_name)
                if verbose: print('Tokenizing text...')
                tokenized_texts = [
                    TOKENIZER.encode(sen, return_tensors='pt', truncation=True, max_length=max_len)
                    for sen in texts
                ]
            # use tokenized_texts to featurize
            if TOKENIZER is None or MODEL_NAME != featurize_model_name:
                if verbose: print('Loading tokenizer')
                TOKENIZER = get_tokenizer(featurize_model_name)
            if MODEL is None or MODEL_NAME != featurize_model_name:
                if verbose: print('Loading model')
                MODEL = get_model(featurize_model_name, TOKENIZER, device_id)
                MODEL_NAME = featurize_model_name
            else:
                MODEL = MODEL.to(get_device_from_arg(device_id))
            if use_float64:
                MODEL = MODEL.double()
            if verbose: print('Featurizing tokens')
            features = featurize_tokens_from_model(MODEL, tokenized_texts, batch_size, name).detach().cpu().numpy()
        else:
            features = np.asarray(features)
        return features

    def cluster_feats(self, p, q, num_clusters,
                      norm='none', whiten=True,
                      pca_max_data=-1,
                      explained_variance=0.9,
                      num_redo=5, max_iter=500,
                      seed=0, verbose=False):
        assert 0 < explained_variance < 1
        if verbose:
            print(f'seed = {seed}')
        assert norm in ['none', 'l2', 'l1', None]
        data1 = np.vstack([q, p])
        if norm in ['l2', 'l1']:
            data1 = normalize(data1, norm=norm, axis=1)
        pca = PCA(n_components=None, whiten=whiten, random_state=seed + 1)
        if pca_max_data < 0 or pca_max_data >= data1.shape[0]:
            pca.fit(data1)
        elif 0 < pca_max_data < data1.shape[0]:
            rng = np.random.RandomState(seed + 5)
            idxs = rng.choice(data1.shape[0], size=pca_max_data, replace=False)
            pca.fit(data1[idxs])
        else:
            raise ValueError(f'Invalid argument pca_max_data={pca_max_data} with {data1.shape[0]} datapoints')
        s = np.cumsum(pca.explained_variance_ratio_)
        idx = np.argmax(s >= explained_variance)  # last index to consider
        if verbose:
            print(f'performing clustering in lower dimension = {idx}')
        data1 = pca.transform(data1)[:, :idx + 1]
        # Cluster
        data1 = data1.astype(np.float32)
        t1 = time.time()
        kmeans = faiss.Kmeans(data1.shape[1], num_clusters, niter=max_iter,
                              verbose=verbose, nredo=num_redo, update_index=True,
                              seed=seed + 2)
        kmeans.train(data1)
        _, labels = kmeans.index.search(data1, 1)
        labels = labels.reshape(-1)
        t2 = time.time()
        if verbose:
            print('kmeans time:', round(t2 - t1, 2), 's')

        q_labels = labels[:len(q)]
        p_labels = labels[len(q):]

        q_bins = np.histogram(q_labels, bins=num_clusters,
                              range=[0, num_clusters], density=True)[0]
        p_bins = np.histogram(p_labels, bins=num_clusters,
                              range=[0, num_clusters], density=True)[0]
        return p_bins / p_bins.sum(), q_bins / q_bins.sum()


    def kl_multinomial(self, p, q):
        assert p.shape == q.shape
        if np.logical_and(p != 0, q == 0).any():
            return np.inf
        else:
            idxs = np.logical_and(p != 0, q != 0)
            return np.sum(p[idxs] * np.log(p[idxs] / q[idxs]))


    def get_divergence_curve_for_multinomials(self, p, q, mixture_weights, scaling_factor):
        # TODO: check if extreme points are needed
        divergence_curve = [[0, np.inf]]  # extreme point
        for w in np.sort(mixture_weights):
            r = w * p + (1 - w) * q
            divergence_curve.append([self.kl_multinomial(q, r), self.kl_multinomial(p, r)])
        divergence_curve.append([np.inf, 0])  # other extreme point
        return np.exp(-scaling_factor * np.asarray(divergence_curve))

    def get_fronter_integral(self, p, q, scaling_factor=2):
        total = 0.0
        for p1, q1 in zip(p, q):
            if p1 == 0 and q1 == 0:
                pass
            elif p1 == 0:
                total += q1 / 4
            elif q1 == 0:
                total += p1 / 4
            elif abs(p1 - q1) > 1e-8:
                t1 = p1 + q1
                t2 = p1 * q1 * (math.log(p1) - math.log(q1)) / (p1 - q1)
                total += 0.25 * t1 - 0.5 * t2
            # else: contribution is 0
        return total * scaling_factor
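
For orientation only (this example is not part of the package diff): a minimal usage sketch of the Mauve class above, assuming the optional torch, transformers, faiss, and scikit-learn dependencies are installed and the default 'skt/kobert-base-v1' featurizer can be downloaded. The sentences and parameter choices are placeholders.

# Hypothetical usage sketch, not included in the wheel.
from nltkor.metrics.mauve import Mauve

p_text = ["오늘 날씨가 좋다.", "점심으로 김밥을 먹었다.", "도서관에서 책을 읽었다."]            # placeholder generations P
q_text = ["날씨가 맑아서 산책을 했다.", "김밥과 라면을 먹었다.", "책을 빌리러 도서관에 갔다."]   # placeholder references Q

mauve = Mauve()                          # defaults to 'skt/kobert-base-v1'
out = mauve.compute(p_text=p_text, q_text=q_text,
                    num_buckets=2,       # tiny example; 'auto' assumes larger sample sizes
                    device_id=-1,        # CPU featurization
                    verbose=True)
print(out.mauve)                         # MAUVE score in [0, 1]; higher means P is closer to Q
print(out.frontier_integral)             # lower means P is closer to Q

With realistic sample sizes (hundreds of generations or more), the defaults documented in the docstring above (num_buckets='auto', batch_size=1) are the intended starting point.
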
nltkor/metrics/mauve_utils.py
ADDED
@@ -0,0 +1,131 @@
# Author: Krishna Pillutla
# License: GPLv3
import json
import os
import time
from tqdm.auto import tqdm as tqdm_original

import torch
from transformers import AutoModel, AutoTokenizer, XLNetTokenizer
# from nltkor.search.kobert_tokenizer import KoBERTTokenizer

CPU_DEVICE = torch.device('cpu')
tqdm = lambda *args, **kwargs: tqdm_original(
    *args, **kwargs, disable=os.environ.get("DISABLE_TQDM", False))


def get_device_from_arg(device_id):
    if (device_id is not None and
            torch.cuda.is_available() and
            0 <= device_id < torch.cuda.device_count()):
        return torch.device(f'cuda:{device_id}')
    else:
        return CPU_DEVICE

def get_model(model_name, tokenizer, device_id):
    device = get_device_from_arg(device_id)
    if 'gpt2' in model_name or "bert" in model_name:
        model = AutoModel.from_pretrained(model_name, pad_token_id=tokenizer.eos_token_id).to(device)
        model = model.eval()
    else:
        raise ValueError(f'Unknown model: {model_name}')
    return model

def get_tokenizer(model_name='skt/kobert-base-v1'):
    if 'gpt2' in model_name or "bert" in model_name:
        if model_name == 'skt/kobert-base-v1':
            # tokenizer = KoBERTTokenizer.from_pretrained(model_name)
            tokenizer = XLNetTokenizer.from_pretrained(model_name)
        else:
            tokenizer = AutoTokenizer.from_pretrained(model_name)
    else:
        raise ValueError(f'Unknown model: {model_name}')
    return tokenizer

def load_json_dataset(data_path, max_num_data):
    texts = []
    for i, line in enumerate(open(data_path)):
        if i >= max_num_data:
            break
        texts.append(json.loads(line)['text'])
    return texts

def load_and_tokenize_json_data(tokenizer, data_path, max_len=1024, max_num_data=float('inf')):
    """ Load and tokenize the data in a jsonl format

    :param tokenizer: HF tokenizer object
    :param data_path: jsonl file to read. Read the "text" field of each line
    :param max_len: maximum length of tokenized data
    :param max_num_data: maximum number of lines to load
    :return: list of `torch.LongTensor`s of shape (1, num_tokens), one for each input line
    """
    assert max_len <= 1024 and max_num_data >= 2000, f"max_len={max_len}, max_num_data={max_num_data} are insufficient"
    t1 = time.time()
    texts = load_json_dataset(data_path, max_num_data=max_num_data)
    t2 = time.time()
    print(f'dataset load time: {round(t2-t1, 2)} sec')
    t1 = time.time()
    tokenized_texts = [tokenizer.encode(sen, return_tensors='pt', truncation=True, max_length=max_len)
                       for sen in texts]
    t2 = time.time()
    print(f'tokenizing time: {round(t2-t1, 2)} sec')
    return tokenized_texts

def decode_samples_from_lst(tokenizer, tokenized_texts):
    """ Decode from tokens to string

    :param tokenizer: HF tokenizer
    :param tokenized_texts: list of lists of tokens
    :return: decoded output as a list of strings of the same length as tokenized_texts
    """
    t1 = time.time()
    output = []
    for l in tokenized_texts:
        o = tokenizer.decode(torch.LongTensor(l), skip_special_tokens=True)
        output.append(o)
    t2 = time.time()
    print(f'de-tokenizing time: {round(t2-t1, 2)}')
    return output

@torch.no_grad()
def featurize_tokens_from_model(model, tokenized_texts, batch_size, name="", verbose=False):
    """Featurize tokenized texts using the model, with batching support.

    :param model: HF Transformers model
    :param batch_size: Batch size used during the forward pass
    :param tokenized_texts: list of torch.LongTensor of shape (1, length)
    :param verbose: If True, print status and time
    :return: torch.Tensor of shape (num_texts, hidden_dim), one feature vector per input
    """
    device = next(model.parameters()).device
    t1 = time.time()
    feats, chunks, chunk_sent_lengths = [], [], []
    chunk_idx = 0

    while chunk_idx * batch_size < len(tokenized_texts):
        _chunk = [_t.view(-1) for _t in tokenized_texts[chunk_idx * batch_size: (chunk_idx + 1) * batch_size]]
        chunks.append(_chunk)
        chunk_sent_lengths.append([len(_c) for _c in _chunk])
        chunk_idx += 1

    for chunk, chunk_sent_length in tqdm(list(zip(chunks, chunk_sent_lengths)), desc=f"Featurizing {name}"):
        padded_chunk = torch.nn.utils.rnn.pad_sequence(chunk,
                                                       batch_first=True,
                                                       padding_value=0).to(device)
        attention_mask = torch.nn.utils.rnn.pad_sequence(
            [torch.ones(sent_length).long() for sent_length in chunk_sent_length],
            batch_first=True,
            padding_value=0).to(device)
        outs = model(input_ids=padded_chunk,
                     attention_mask=attention_mask,
                     past_key_values=None,
                     output_hidden_states=True,
                     return_dict=True)
        h = []
        for hidden_state, sent_length in zip(outs.hidden_states[-1], chunk_sent_length):
            # the hidden state of the last non-padded token is used as the feature vector
            h.append(hidden_state[sent_length - 1])
        h = torch.stack(h, dim=0)
        feats.append(h.cpu())
    t2 = time.time()
    if verbose:
        print(f'Featurize time: {round(t2-t1, 2)}')
    return torch.cat(feats)
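
As a hedged illustration (not part of the diff), the helpers above chain into a small featurization pipeline. The sketch below assumes the 'skt/kobert-base-v1' checkpoint loads through the XLNetTokenizer fallback used in get_tokenizer, and the two sentences are placeholders.

# Hypothetical sketch, not included in the wheel.
from nltkor.metrics.mauve_utils import get_tokenizer, get_model, featurize_tokens_from_model

texts = ["첫 번째 문장입니다.", "두 번째 문장입니다."]                # placeholder sentences
tokenizer = get_tokenizer('skt/kobert-base-v1')
model = get_model('skt/kobert-base-v1', tokenizer, device_id=-1)     # -1 selects CPU

tokenized = [tokenizer.encode(t, return_tensors='pt', truncation=True, max_length=1024)
             for t in texts]
feats = featurize_tokens_from_model(model, tokenized, batch_size=2, name="demo")
print(feats.shape)   # (2, hidden_dim): one vector per sentence, taken at its last token
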
nltkor/misc/__init__.py
ADDED
@@ -0,0 +1,11 @@
# Natural Language Toolkit: Miscellaneous modules
#
# Copyright (C) 2001-2020 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

from nltk.misc.chomsky import generate_chomsky
from nltk.misc.wordfinder import word_finder
from nltk.misc.minimalset import MinimalSet
from nltk.misc.babelfish import babelize_shell
nltkor/misc/string2string_basic_functions.py
ADDED
@@ -0,0 +1,59 @@
"""
string2string code
src = https://github.com/stanfordnlp/string2string


MIT License

Copyright (c) 2023 Mirac Suzgun

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.


"""

from typing import List, Union

# Take the Cartesian product of two lists of strings (or lists of lists of strings)
def cartesian_product(
    lst1: Union[List[str], List[List[str]]],
    lst2: Union[List[str], List[List[str]]],
    boolList: bool = False,
    list_of_list_separator: str = " ## ",
) -> Union[List[str], List[List[str]]]:
    """
    This function returns the Cartesian product of two lists of strings (or lists of lists of strings).

    Arguments:
        lst1: The first list of strings (or lists of lists of strings).
        lst2: The second list of strings (or lists of lists of strings).
        boolList: A boolean flag indicating whether the paired elements are joined with ``list_of_list_separator`` instead of plain concatenation (default: False).

    Returns:
        The Cartesian product of the two lists of strings (or lists of lists of strings).
    """
    if lst1 == []:
        return lst2
    elif lst2 == []:
        return lst1
    return [
        s1 + ("" if not (boolList) else list_of_list_separator) + s2
        for s1 in lst1
        for s2 in lst2
    ]
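
A brief illustration (not part of the diff) of the two modes of cartesian_product above; the import path simply mirrors the file location in this wheel.

# Hypothetical sketch, not included in the wheel.
from nltkor.misc.string2string_basic_functions import cartesian_product

print(cartesian_product(["a", "b"], ["c", "d"]))
# ['ac', 'ad', 'bc', 'bd']                     -- plain string concatenation
print(cartesian_product(["a", "b"], ["c", "d"], boolList=True))
# ['a ## c', 'a ## d', 'b ## c', 'b ## d']     -- joined with list_of_list_separator
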
nltkor/misc/string2string_default_tokenizer.py
ADDED
@@ -0,0 +1,83 @@
"""
string2string code
src = https://github.com/stanfordnlp/string2string


MIT License

Copyright (c) 2023 Mirac Suzgun

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.


"""

"""
This file contains the default tokenizer.
"""

from typing import List

# Tokenizer class
class Tokenizer:
    """
    This class contains the tokenizer.
    """

    def __init__(self,
        word_delimiter: str = " ",
    ):
        """
        Initializes the Tokenizer class.

        Arguments:
            word_delimiter (str): The word delimiter. Default is " ".
        """
        # Set the word delimiter
        self.word_delimiter = word_delimiter

    # Tokenize
    def tokenize(self,
        text: str,
    ) -> List[str]:
        """
        Returns the tokens from a string.

        Arguments:
            text (str): The text to tokenize.

        Returns:
            List[str]: The tokens.
        """
        return text.split(self.word_delimiter)

    # Detokenize
    def detokenize(self,
        tokens: List[str],
    ) -> str:
        """
        Returns the string from a list of tokens.

        Arguments:
            tokens (List[str]): The tokens.

        Returns:
            str: The string.
        """
        return self.word_delimiter.join(tokens)