nltkor-1.2.14-cp311-cp311-macosx_13_0_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127)
  1. nltkor/Kor_char.py +193 -0
  2. nltkor/__init__.py +16 -0
  3. nltkor/alignment/__init__.py +1315 -0
  4. nltkor/cider/__init__.py +2 -0
  5. nltkor/cider/cider.py +55 -0
  6. nltkor/cider/cider_scorer.py +207 -0
  7. nltkor/distance/__init__.py +441 -0
  8. nltkor/distance/wasserstein.py +126 -0
  9. nltkor/etc.py +22 -0
  10. nltkor/lazyimport.py +144 -0
  11. nltkor/make_requirement.py +11 -0
  12. nltkor/metrics/__init__.py +63 -0
  13. nltkor/metrics/bartscore.py +301 -0
  14. nltkor/metrics/bertscore.py +331 -0
  15. nltkor/metrics/bleu_tensor.py +20 -0
  16. nltkor/metrics/classical.py +847 -0
  17. nltkor/metrics/entment.py +24 -0
  18. nltkor/metrics/eval.py +517 -0
  19. nltkor/metrics/mauve.py +273 -0
  20. nltkor/metrics/mauve_utils.py +131 -0
  21. nltkor/misc/__init__.py +11 -0
  22. nltkor/misc/string2string_basic_functions.py +59 -0
  23. nltkor/misc/string2string_default_tokenizer.py +83 -0
  24. nltkor/misc/string2string_hash_functions.py +159 -0
  25. nltkor/misc/string2string_word_embeddings.py +503 -0
  26. nltkor/search/__init__.py +10 -0
  27. nltkor/search/classical.py +569 -0
  28. nltkor/search/faiss_search.py +787 -0
  29. nltkor/search/kobert_tokenizer.py +181 -0
  30. nltkor/sejong/__init__.py +3 -0
  31. nltkor/sejong/__pycache__/__init__.cpython-38.pyc +0 -0
  32. nltkor/sejong/__pycache__/__init__.cpython-39.pyc +0 -0
  33. nltkor/sejong/__pycache__/sejong_download.cpython-38.pyc +0 -0
  34. nltkor/sejong/__pycache__/sejong_download.cpython-39.pyc +0 -0
  35. nltkor/sejong/__pycache__/ssem.cpython-38.pyc +0 -0
  36. nltkor/sejong/__pycache__/ssem.cpython-39.pyc +0 -0
  37. nltkor/sejong/ch.py +12 -0
  38. nltkor/sejong/dict_semClassNum.txt +491 -0
  39. nltkor/sejong/layer.txt +630 -0
  40. nltkor/sejong/sejong_download.py +87 -0
  41. nltkor/sejong/ssem.py +684 -0
  42. nltkor/similarity/__init__.py +3 -0
  43. nltkor/similarity/bartscore____.py +337 -0
  44. nltkor/similarity/bertscore____.py +339 -0
  45. nltkor/similarity/classical.py +245 -0
  46. nltkor/similarity/cosine_similarity.py +175 -0
  47. nltkor/tag/__init__.py +71 -0
  48. nltkor/tag/__pycache__/__init__.cpython-38.pyc +0 -0
  49. nltkor/tag/__pycache__/__init__.cpython-39.pyc +0 -0
  50. nltkor/tag/__pycache__/espresso_tag.cpython-38.pyc +0 -0
  51. nltkor/tag/__pycache__/espresso_tag.cpython-39.pyc +0 -0
  52. nltkor/tag/espresso_tag.py +220 -0
  53. nltkor/tag/libs/__init__.py +10 -0
  54. nltkor/tag/libs/__pycache__/__init__.cpython-38.pyc +0 -0
  55. nltkor/tag/libs/__pycache__/__init__.cpython-39.pyc +0 -0
  56. nltkor/tag/libs/__pycache__/attributes.cpython-38.pyc +0 -0
  57. nltkor/tag/libs/__pycache__/attributes.cpython-39.pyc +0 -0
  58. nltkor/tag/libs/__pycache__/config.cpython-38.pyc +0 -0
  59. nltkor/tag/libs/__pycache__/config.cpython-39.pyc +0 -0
  60. nltkor/tag/libs/__pycache__/metadata.cpython-38.pyc +0 -0
  61. nltkor/tag/libs/__pycache__/metadata.cpython-39.pyc +0 -0
  62. nltkor/tag/libs/__pycache__/reader.cpython-38.pyc +0 -0
  63. nltkor/tag/libs/__pycache__/reader.cpython-39.pyc +0 -0
  64. nltkor/tag/libs/__pycache__/taggers.cpython-38.pyc +0 -0
  65. nltkor/tag/libs/__pycache__/taggers.cpython-39.pyc +0 -0
  66. nltkor/tag/libs/__pycache__/utils.cpython-38.pyc +0 -0
  67. nltkor/tag/libs/__pycache__/utils.cpython-39.pyc +0 -0
  68. nltkor/tag/libs/__pycache__/word_dictionary.cpython-38.pyc +0 -0
  69. nltkor/tag/libs/__pycache__/word_dictionary.cpython-39.pyc +0 -0
  70. nltkor/tag/libs/arguments.py +280 -0
  71. nltkor/tag/libs/attributes.py +231 -0
  72. nltkor/tag/libs/config.py +159 -0
  73. nltkor/tag/libs/metadata.py +129 -0
  74. nltkor/tag/libs/ner/__init__.py +2 -0
  75. nltkor/tag/libs/ner/__pycache__/__init__.cpython-38.pyc +0 -0
  76. nltkor/tag/libs/ner/__pycache__/__init__.cpython-39.pyc +0 -0
  77. nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-38.pyc +0 -0
  78. nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-39.pyc +0 -0
  79. nltkor/tag/libs/ner/macmorphoreader.py +7 -0
  80. nltkor/tag/libs/ner/ner_reader.py +92 -0
  81. nltkor/tag/libs/network.c +72325 -0
  82. nltkor/tag/libs/network.cpython-311-darwin.so +0 -0
  83. nltkor/tag/libs/network.pyx +878 -0
  84. nltkor/tag/libs/networkconv.pyx +1028 -0
  85. nltkor/tag/libs/networkdependencyconv.pyx +451 -0
  86. nltkor/tag/libs/parse/__init__.py +1 -0
  87. nltkor/tag/libs/parse/__pycache__/__init__.cpython-38.pyc +0 -0
  88. nltkor/tag/libs/parse/__pycache__/__init__.cpython-39.pyc +0 -0
  89. nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-38.pyc +0 -0
  90. nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-39.pyc +0 -0
  91. nltkor/tag/libs/parse/parse_reader.py +283 -0
  92. nltkor/tag/libs/pos/__init__.py +2 -0
  93. nltkor/tag/libs/pos/__pycache__/__init__.cpython-38.pyc +0 -0
  94. nltkor/tag/libs/pos/__pycache__/__init__.cpython-39.pyc +0 -0
  95. nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-38.pyc +0 -0
  96. nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-39.pyc +0 -0
  97. nltkor/tag/libs/pos/macmorphoreader.py +7 -0
  98. nltkor/tag/libs/pos/pos_reader.py +97 -0
  99. nltkor/tag/libs/reader.py +485 -0
  100. nltkor/tag/libs/srl/__init__.py +3 -0
  101. nltkor/tag/libs/srl/__pycache__/__init__.cpython-38.pyc +0 -0
  102. nltkor/tag/libs/srl/__pycache__/__init__.cpython-39.pyc +0 -0
  103. nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-38.pyc +0 -0
  104. nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-39.pyc +0 -0
  105. nltkor/tag/libs/srl/__pycache__/train_srl.cpython-38.pyc +0 -0
  106. nltkor/tag/libs/srl/__pycache__/train_srl.cpython-39.pyc +0 -0
  107. nltkor/tag/libs/srl/__srl_reader_.py +535 -0
  108. nltkor/tag/libs/srl/srl_reader.py +436 -0
  109. nltkor/tag/libs/srl/train_srl.py +87 -0
  110. nltkor/tag/libs/taggers.py +926 -0
  111. nltkor/tag/libs/utils.py +384 -0
  112. nltkor/tag/libs/word_dictionary.py +239 -0
  113. nltkor/tag/libs/wsd/__init__.py +2 -0
  114. nltkor/tag/libs/wsd/__pycache__/__init__.cpython-38.pyc +0 -0
  115. nltkor/tag/libs/wsd/__pycache__/__init__.cpython-39.pyc +0 -0
  116. nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-38.pyc +0 -0
  117. nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-39.pyc +0 -0
  118. nltkor/tag/libs/wsd/macmorphoreader.py +7 -0
  119. nltkor/tag/libs/wsd/wsd_reader.py +93 -0
  120. nltkor/tokenize/__init__.py +62 -0
  121. nltkor/tokenize/ko_tokenize.py +115 -0
  122. nltkor/trans.py +121 -0
  123. nltkor-1.2.14.dist-info/LICENSE.txt +1093 -0
  124. nltkor-1.2.14.dist-info/METADATA +41 -0
  125. nltkor-1.2.14.dist-info/RECORD +127 -0
  126. nltkor-1.2.14.dist-info/WHEEL +5 -0
  127. nltkor-1.2.14.dist-info/top_level.txt +1 -0
@@ -0,0 +1,273 @@
+ # Author: Krishna Pillutla
+ # License: GPLv3
+
+ import math
+ import numpy as np
+ import time
+ from types import SimpleNamespace
+ from nltkor.make_requirement import make_requirement
+
+ import faiss
+ from sklearn.preprocessing import normalize
+ from sklearn.decomposition import PCA
+ from sklearn.metrics import auc as compute_area_under_curve
+ import torch
+ from transformers import AutoModel, AutoTokenizer
+
+ try:
+     import torch
+     FOUND_TORCH = True
+ except (ImportError, ModuleNotFoundError):
+     FOUND_TORCH = False
+
+ try:
+     import transformers
+     FOUND_TRANSFORMERS = True
+ except (ImportError, ModuleNotFoundError):
+     FOUND_TRANSFORMERS = False
+
+ if FOUND_TORCH and FOUND_TRANSFORMERS:
+     # only needed for tokenizing
+     from .mauve_utils import get_tokenizer, get_model, featurize_tokens_from_model, get_device_from_arg
+
+
+ MODEL, TOKENIZER, MODEL_NAME = None, None, None
+
+ class Mauve:
+     def __init__(self, model_name_or_path='skt/kobert-base-v1'):
+         self.featurize_model_name = model_name_or_path
+
+     def compute(self,
+                 p_features=None, q_features=None,
+                 p_tokens=None, q_tokens=None,
+                 p_text=None, q_text=None,
+                 num_buckets='auto', pca_max_data=-1, kmeans_explained_var=0.9,
+                 kmeans_num_redo=5, kmeans_max_iter=500,
+                 device_id=-1, max_text_length=1024,
+                 divergence_curve_discretization_size=25, mauve_scaling_factor=5,
+                 verbose=False, seed=25, batch_size=1, use_float64=False,
+                 ):
+         """
+         Compute the MAUVE score between two text generations P and Q.
+
+         P is specified as one of ``p_features``, ``p_tokens``, or ``p_text``; likewise for Q.
+
+         :param ``p_features``: ``numpy.ndarray`` of shape (n, d), where n is the number of generations.
+         :param ``q_features``: ``numpy.ndarray`` of shape (n, d), where n is the number of generations.
+         :param ``p_tokens``: list of length n, each entry is a torch.LongTensor of shape (1, length).
+         :param ``q_tokens``: list of length n, each entry is a torch.LongTensor of shape (1, length).
+         :param ``p_text``: list of length n, each entry is a string.
+         :param ``q_text``: list of length n, each entry is a string.
+         :param ``num_buckets``: the size of the histogram to quantize P and Q. Options: ``'auto'`` (default, which is n/10) or an integer.
+         :param ``pca_max_data``: the number of data points to use for PCA. If ``-1``, use all the data. Default -1.
+         :param ``kmeans_explained_var``: amount of variance of the data to keep in dimensionality reduction by PCA. Default 0.9.
+         :param ``kmeans_num_redo``: number of times to redo k-means clustering (the best objective is kept). Default 5.
+             Try reducing this to 1 in order to reduce running time.
+         :param ``kmeans_max_iter``: maximum number of k-means iterations. Default 500.
+             Try reducing this to 100 in order to reduce running time.
+         :param ``featurize_model_name``: name of the model from which features are obtained. Default 'gpt2-large'.
+             We support all models which can be loaded from ``transformers.AutoModel.from_pretrained(featurize_model_name)``.
+         :param ``device_id``: Device for featurization. Supply gpu_id (e.g. 0 or 3) to use GPU or -1 to use CPU.
+         :param ``max_text_length``: maximum number of tokens to consider. Default 1024.
+         :param ``divergence_curve_discretization_size``: Number of points to consider on the divergence curve. Default 25.
+             Larger values do not offer much of a difference.
+         :param ``mauve_scaling_factor``: The constant ``c`` from the paper. Default 5.
+             See `Best Practices <index.html#best-practices-for-mauve>`_ for details.
+         :param ``verbose``: If True, print running time updates.
+         :param ``seed``: random seed to initialize k-means cluster assignments.
+         :param ``batch_size``: Batch size for feature extraction.
+             A larger batch size speeds up computation.
+             You might have to experiment to find the largest batch size that fits in your GPU memory.
+             See `here <https://github.com/krishnap25/mauve/issues/8#issuecomment-1082075240>`_ for details.
+
+         :return: an object with fields p_hist, q_hist, divergence_curve and mauve.
+
+         * ``out.mauve`` is a number between 0 and 1, the MAUVE score. Higher values mean that P is closer to Q.
+         * ``out.frontier_integral`` is a number between 0 and 1. Lower values mean that P is closer to Q.
+         * ``out.p_hist`` is the obtained histogram for P. Same for ``out.q_hist``.
+         * ``out.divergence_curve`` contains the points in the divergence curve. It is of shape (m, 2), where m is ``divergence_curve_discretization_size``.
+         """
+         if p_features is None and p_tokens is None and p_text is None:
+             raise ValueError('Supply at least one of p_features, p_tokens, p_text')
+         if q_features is None and q_tokens is None and q_text is None:
+             raise ValueError('Supply at least one of q_features, q_tokens, q_text')
+         p_features = self.get_features_from_input(
+             p_features, p_tokens, p_text, self.featurize_model_name, max_text_length,
+             device_id, name="p", verbose=verbose, batch_size=batch_size, use_float64=use_float64,
+         )
+         q_features = self.get_features_from_input(
+             q_features, q_tokens, q_text, self.featurize_model_name, max_text_length,
+             device_id, name="q", verbose=verbose, batch_size=batch_size, use_float64=use_float64,
+         )
+         if num_buckets == 'auto':
+             # heuristic: use num_clusters = num_generations / 10
+             num_buckets = max(2, int(round(min(p_features.shape[0], q_features.shape[0]) / 10)))
+         elif not isinstance(num_buckets, int):
+             raise ValueError('num_buckets is expected to be an integer or "auto"')
+
+         # Actual binning
+         t1 = time.time()
+         p, q = self.cluster_feats(p_features, q_features,
+                                   num_clusters=num_buckets,
+                                   norm='l2', whiten=False,
+                                   pca_max_data=pca_max_data,
+                                   explained_variance=kmeans_explained_var,
+                                   num_redo=kmeans_num_redo,
+                                   max_iter=kmeans_max_iter,
+                                   seed=seed, verbose=verbose)
+         t2 = time.time()
+         if verbose:
+             print('total discretization time:', round(t2-t1, 2), 'seconds')
+
+         # Divergence curve and mauve
+         mixture_weights = np.linspace(1e-6, 1-1e-6, divergence_curve_discretization_size)
+         divergence_curve = self.get_divergence_curve_for_multinomials(p, q, mixture_weights, mauve_scaling_factor)
+         x, y = divergence_curve.T
+         idxs1 = np.argsort(x)
+         idxs2 = np.argsort(y)
+         mauve_score = 0.5 * (
+             compute_area_under_curve(x[idxs1], y[idxs1]) +
+             compute_area_under_curve(y[idxs2], x[idxs2])
+         )
+         fi_score = self.get_fronter_integral(p, q)
+         to_return = SimpleNamespace(
+             p_hist=p, q_hist=q, divergence_curve=divergence_curve,
+             mauve=mauve_score,
+             frontier_integral=fi_score,
+             num_buckets=num_buckets,
+         )
+         return to_return
+
+     def get_features_from_input(self, features, tokenized_texts, texts,
+                                 featurize_model_name, max_len, device_id, name, batch_size,
+                                 verbose=False, use_float64=False):
+         global MODEL, TOKENIZER, MODEL_NAME
+         if features is None:
+             # Featurizing is necessary. Make sure the required packages are available
+             if not FOUND_TORCH:
+                 raise ModuleNotFoundError(
+                     """PyTorch not found. Please install PyTorch if you would like to use the featurization.
+                     For details, see `https://github.com/krishnap25/mauve`
+                     and `https://pytorch.org/get-started/locally/`.
+                     """)
+             if not FOUND_TRANSFORMERS:
+                 raise ModuleNotFoundError(
+                     """Transformers not found. Please install Transformers if you would like to use the featurization.
+                     For details, see `https://github.com/krishnap25/mauve`
+                     and `https://huggingface.co/transformers/installation.html`.
+                     """)
+
+             if tokenized_texts is None:
+                 # tokenize texts
+                 if TOKENIZER is None or MODEL_NAME != featurize_model_name:
+                     if verbose: print('Loading tokenizer')
+                     TOKENIZER = get_tokenizer(featurize_model_name)
+                 if verbose: print('Tokenizing text...')
+                 tokenized_texts = [
+                     TOKENIZER.encode(sen, return_tensors='pt', truncation=True, max_length=max_len)
+                     for sen in texts
+                 ]
+             # use tokenized_texts to featurize
+             if TOKENIZER is None or MODEL_NAME != featurize_model_name:
+                 if verbose: print('Loading tokenizer')
+                 TOKENIZER = get_tokenizer(featurize_model_name)
+             if MODEL is None or MODEL_NAME != featurize_model_name:
+                 if verbose: print('Loading model')
+                 MODEL = get_model(featurize_model_name, TOKENIZER, device_id)
+                 MODEL_NAME = featurize_model_name
+             else:
+                 MODEL = MODEL.to(get_device_from_arg(device_id))
+             if use_float64:
+                 MODEL = MODEL.double()
+             if verbose: print('Featurizing tokens')
+             features = featurize_tokens_from_model(MODEL, tokenized_texts, batch_size, name).detach().cpu().numpy()
+         else:
+             features = np.asarray(features)
+         return features
+
+     def cluster_feats(self, p, q, num_clusters,
+                       norm='none', whiten=True,
+                       pca_max_data=-1,
+                       explained_variance=0.9,
+                       num_redo=5, max_iter=500,
+                       seed=0, verbose=False):
+         assert 0 < explained_variance < 1
+         if verbose:
+             print(f'seed = {seed}')
+         assert norm in ['none', 'l2', 'l1', None]
+         data1 = np.vstack([q, p])
+         if norm in ['l2', 'l1']:
+             data1 = normalize(data1, norm=norm, axis=1)
+         pca = PCA(n_components=None, whiten=whiten, random_state=seed+1)
+         if pca_max_data < 0 or pca_max_data >= data1.shape[0]:
+             pca.fit(data1)
+         elif 0 < pca_max_data < data1.shape[0]:
+             rng = np.random.RandomState(seed+5)
+             idxs = rng.choice(data1.shape[0], size=pca_max_data, replace=False)
+             pca.fit(data1[idxs])
+         else:
+             raise ValueError(f'Invalid argument pca_max_data={pca_max_data} with {data1.shape[0]} datapoints')
+         s = np.cumsum(pca.explained_variance_ratio_)
+         idx = np.argmax(s >= explained_variance)  # last index to consider
+         if verbose:
+             print(f'performing clustering in lower dimension = {idx}')
+         data1 = pca.transform(data1)[:, :idx+1]
+         # Cluster
+         data1 = data1.astype(np.float32)
+         t1 = time.time()
+         kmeans = faiss.Kmeans(data1.shape[1], num_clusters, niter=max_iter,
+                               verbose=verbose, nredo=num_redo, update_index=True,
+                               seed=seed+2)
+         kmeans.train(data1)
+         _, labels = kmeans.index.search(data1, 1)
+         labels = labels.reshape(-1)
+         t2 = time.time()
+         if verbose:
+             print('kmeans time:', round(t2-t1, 2), 's')
+
+         q_labels = labels[:len(q)]
+         p_labels = labels[len(q):]
+
+         q_bins = np.histogram(q_labels, bins=num_clusters,
+                               range=[0, num_clusters], density=True)[0]
+         p_bins = np.histogram(p_labels, bins=num_clusters,
+                               range=[0, num_clusters], density=True)[0]
+         return p_bins / p_bins.sum(), q_bins / q_bins.sum()
+
+
+     def kl_multinomial(self, p, q):
+         assert p.shape == q.shape
+         if np.logical_and(p != 0, q == 0).any():
+             return np.inf
+         else:
+             idxs = np.logical_and(p != 0, q != 0)
+             return np.sum(p[idxs] * np.log(p[idxs] / q[idxs]))
+
+
+     def get_divergence_curve_for_multinomials(self, p, q, mixture_weights, scaling_factor):
+         # TODO: check if extreme points are needed
+         divergence_curve = [[0, np.inf]]  # extreme point
+         for w in np.sort(mixture_weights):
+             r = w * p + (1 - w) * q
+             divergence_curve.append([self.kl_multinomial(q, r), self.kl_multinomial(p, r)])
+         divergence_curve.append([np.inf, 0])  # other extreme point
+         return np.exp(-scaling_factor * np.asarray(divergence_curve))
+
+     def get_fronter_integral(self, p, q, scaling_factor=2):
+         total = 0.0
+         for p1, q1 in zip(p, q):
+             if p1 == 0 and q1 == 0:
+                 pass
+             elif p1 == 0:
+                 total += q1 / 4
+             elif q1 == 0:
+                 total += p1 / 4
+             elif abs(p1 - q1) > 1e-8:
+                 t1 = p1 + q1
+                 t2 = p1 * q1 * (math.log(p1) - math.log(q1)) / (p1 - q1)
+                 total += 0.25 * t1 - 0.5 * t2
+             # else: contribution is 0
+         return total * scaling_factor
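A minimal usage sketch of the Mauve class above, for orientation only. The import path nltkor.metrics.mauve is inferred from the file path in this diff, the sample sentence lists are illustrative placeholders, and the listed dependencies (torch, transformers, faiss, scikit-learn) are assumed to be installed; in practice you would pass many more generations per side than shown here.

    # Sketch under assumptions: import path and sample data are inferred from this diff,
    # not taken from the package documentation.
    from nltkor.metrics.mauve import Mauve

    p_text = ["a generated sentence", "another generated sentence"]   # model output (illustrative)
    q_text = ["a reference sentence", "another reference sentence"]   # human text (illustrative)

    mauve = Mauve(model_name_or_path='skt/kobert-base-v1')            # default featurizer from __init__
    out = mauve.compute(p_text=p_text, q_text=q_text,
                        num_buckets='auto', device_id=-1,             # -1 featurizes on CPU
                        max_text_length=1024, verbose=True)
    print(out.mauve, out.frontier_integral, out.num_buckets)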
@@ -0,0 +1,131 @@
+ # Author: Krishna Pillutla
+ # License: GPLv3
+ import json
+ import os
+ import time
+ from tqdm.auto import tqdm as tqdm_original
+
+ import torch
+ from transformers import AutoModel, AutoTokenizer, XLNetTokenizer
+ # from nltkor.search.kobert_tokenizer import KoBERTTokenizer
+
+ CPU_DEVICE = torch.device('cpu')
+ tqdm = lambda *args, **kwargs: tqdm_original(
+     *args, **kwargs, disable=os.environ.get("DISABLE_TQDM", False))
+
+
+ def get_device_from_arg(device_id):
+     if (device_id is not None and
+             torch.cuda.is_available() and
+             0 <= device_id < torch.cuda.device_count()):
+         return torch.device(f'cuda:{device_id}')
+     else:
+         return CPU_DEVICE
+
+ def get_model(model_name, tokenizer, device_id):
+     device = get_device_from_arg(device_id)
+     if 'gpt2' in model_name or "bert" in model_name:
+         model = AutoModel.from_pretrained(model_name, pad_token_id=tokenizer.eos_token_id).to(device)
+         model = model.eval()
+     else:
+         raise ValueError(f'Unknown model: {model_name}')
+     return model
+
+ def get_tokenizer(model_name='skt/kobert-base-v1'):
+     if 'gpt2' in model_name or "bert" in model_name:
+         if model_name == 'skt/kobert-base-v1':
+             # tokenizer = KoBERTTokenizer.from_pretrained(model_name)
+             tokenizer = XLNetTokenizer.from_pretrained(model_name)
+         else:
+             tokenizer = AutoTokenizer.from_pretrained(model_name)
+     else:
+         raise ValueError(f'Unknown model: {model_name}')
+     return tokenizer
+
+ def load_json_dataset(data_path, max_num_data):
+     texts = []
+     for i, line in enumerate(open(data_path)):
+         if i >= max_num_data:
+             break
+         texts.append(json.loads(line)['text'])
+     return texts
+
+ def load_and_tokenize_json_data(tokenizer, data_path, max_len=1024, max_num_data=float('inf')):
+     """ Load and tokenize the data in a jsonl format
+
+     :param tokenizer: HF tokenizer object
+     :param data_path: jsonl file to read. Read the "text" field of each line
+     :param max_len: maximum length of tokenized data
+     :param max_num_data: maximum number of lines to load
+     :return: list of `torch.LongTensor`s of shape (1, num_tokens), one for each input line
+     """
+     assert max_len <= 1024 and max_num_data >= 2000, f"max_len={max_len}, max_num_data={max_num_data} are insufficient"
+     t1 = time.time()
+     texts = load_json_dataset(data_path, max_num_data=max_num_data)
+     t2 = time.time()
+     print(f'dataset load time: {round(t2-t1, 2)} sec')
+     t1 = time.time()
+     tokenized_texts = [tokenizer.encode(sen, return_tensors='pt', truncation=True, max_length=max_len)
+                        for sen in texts]
+     t2 = time.time()
+     print(f'tokenizing time: {round(t2-t1, 2)} sec')
+     return tokenized_texts
+
+ def decode_samples_from_lst(tokenizer, tokenized_texts):
+     """ Decode from tokens to string
+
+     :param tokenizer: HF tokenizer
+     :param tokenized_texts: list of list of tokens
+     :return: decoded output as a list of strings of the same length as tokenized_text_list
+     """
+     t1 = time.time()
+     output = []
+     for l in tokenized_texts:
+         o = tokenizer.decode(torch.LongTensor(l), skip_special_tokens=True)
+         output.append(o)
+     t2 = time.time()
+     print(f'de-tokenizing time: {round(t2-t1, 2)}')
+     return output
+
+ @torch.no_grad()
+ def featurize_tokens_from_model(model, tokenized_texts, batch_size, name="", verbose=False):
+     """ Featurize tokenized texts with the given model, with support for batching
+
+     :param model: HF Transformers model
+     :param batch_size: Batch size used during forward pass
+     :param tokenized_texts: list of torch.LongTensor of shape (1, length)
+     :param verbose: If True, print status and time
+     :return: torch.Tensor of features (one row per input text)
+     """
+     device = next(model.parameters()).device
+     t1 = time.time()
+     feats, chunks, chunk_sent_lengths = [], [], []
+     chunk_idx = 0
+
+     while chunk_idx * batch_size < len(tokenized_texts):
+         _chunk = [_t.view(-1) for _t in tokenized_texts[chunk_idx * batch_size: (chunk_idx + 1) * batch_size]]
+         chunks.append(_chunk)
+         chunk_sent_lengths.append([len(_c) for _c in _chunk])
+         chunk_idx += 1
+
+     for chunk, chunk_sent_length in tqdm(list(zip(chunks, chunk_sent_lengths)), desc=f"Featurizing {name}"):
+         padded_chunk = torch.nn.utils.rnn.pad_sequence(chunk,
+                                                        batch_first=True,
+                                                        padding_value=0).to(device)
+         attention_mask = torch.nn.utils.rnn.pad_sequence(
+             [torch.ones(sent_length).long() for sent_length in chunk_sent_length],
+             batch_first=True,
+             padding_value=0).to(device)
+         outs = model(input_ids=padded_chunk,
+                      attention_mask=attention_mask,
+                      past_key_values=None,
+                      output_hidden_states=True,
+                      return_dict=True)
+         h = []
+         for hidden_state, sent_length in zip(outs.hidden_states[-1], chunk_sent_length):
+             h.append(hidden_state[sent_length - 1])
+         h = torch.stack(h, dim=0)
+         feats.append(h.cpu())
+     t2 = time.time()
+     if verbose:
+         print(f'Featurize time: {round(t2-t1, 2)}')
+     return torch.cat(feats)
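For reference, a hedged sketch of driving these helpers directly, mirroring what Mauve.get_features_from_input does above. The module path is inferred from the file location in this diff, and 'data.jsonl' is a hypothetical file with one {"text": ...} object per line.

    # Sketch under assumptions: import path inferred from this diff; 'data.jsonl' is hypothetical.
    from nltkor.metrics.mauve_utils import (
        get_tokenizer, get_model, load_json_dataset, featurize_tokens_from_model)

    tokenizer = get_tokenizer('skt/kobert-base-v1')
    model = get_model('skt/kobert-base-v1', tokenizer, device_id=-1)   # -1 selects the CPU device

    texts = load_json_dataset('data.jsonl', max_num_data=5000)
    tokens = [tokenizer.encode(t, return_tensors='pt', truncation=True, max_length=1024)
              for t in texts]

    feats = featurize_tokens_from_model(model, tokens, batch_size=8, name="p")
    print(feats.shape)   # (num_texts, hidden_dim): last-token hidden state per input text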
@@ -0,0 +1,11 @@
+ # Natural Language Toolkit: Miscellaneous modules
+ #
+ # Copyright (C) 2001-2020 NLTK Project
+ # Author: Steven Bird <stevenbird1@gmail.com>
+ # URL: <http://nltk.org/>
+ # For license information, see LICENSE.TXT
+
+ from nltk.misc.chomsky import generate_chomsky
+ from nltk.misc.wordfinder import word_finder
+ from nltk.misc.minimalset import MinimalSet
+ from nltk.misc.babelfish import babelize_shell
@@ -0,0 +1,59 @@
+ """
+ string2string code
+ src = https://github.com/stanfordnlp/string2string
+
+
+ MIT License
+
+ Copyright (c) 2023 Mirac Suzgun
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+
+
+ """
+
+ from typing import List, Union
+
+ # Take the Cartesian product of two lists of strings (or lists of lists of strings)
+ def cartesian_product(
+     lst1: Union[List[str], List[List[str]]],
+     lst2: Union[List[str], List[List[str]]],
+     boolList: bool = False,
+     list_of_list_separator: str = " ## ",
+ ) -> Union[List[str], List[List[str]]]:
+     """
+     This function returns the Cartesian product of two lists of strings (or lists of lists of strings).
+
+     Arguments:
+         lst1: The first list of strings (or lists of lists of strings).
+         lst2: The second list of strings (or lists of lists of strings).
+         boolList: A boolean flag indicating whether the inputs are lists of lists of strings, in which case ``list_of_list_separator`` is inserted between the combined elements (default: False).
+
+     Returns:
+         The Cartesian product of the two lists of strings (or lists of lists of strings).
+     """
+     if lst1 == []:
+         return lst2
+     elif lst2 == []:
+         return lst1
+     return [
+         s1 + ("" if not (boolList) else list_of_list_separator) + s2
+         for s1 in lst1
+         for s2 in lst2
+     ]
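The behavior follows directly from the definition above; a short usage sketch (the import path is inferred from the file location in this diff):

    from nltkor.misc.string2string_basic_functions import cartesian_product

    cartesian_product(["a", "b"], ["1", "2"])                 # ['a1', 'a2', 'b1', 'b2']
    cartesian_product(["a", "b"], ["1", "2"], boolList=True)  # ['a ## 1', 'a ## 2', 'b ## 1', 'b ## 2']
    cartesian_product([], ["1", "2"])                         # ['1', '2'] (an empty operand returns the other list)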
@@ -0,0 +1,83 @@
+ """
+ string2string code
+ src = https://github.com/stanfordnlp/string2string
+
+
+ MIT License
+
+ Copyright (c) 2023 Mirac Suzgun
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+
+
+ """
+
+ """
+ This file contains the default tokenizer.
+ """
+
+ from typing import List
+
+ # Tokenizer class
+ class Tokenizer:
+     """
+     This class contains the tokenizer.
+     """
+
+     def __init__(self,
+         word_delimiter: str = " ",
+     ):
+         """
+         Initializes the Tokenizer class.
+
+         Arguments:
+             word_delimiter (str): The word delimiter. Default is " ".
+         """
+         # Set the word delimiter
+         self.word_delimiter = word_delimiter
+
+     # Tokenize
+     def tokenize(self,
+         text: str,
+     ) -> List[str]:
+         """
+         Returns the tokens from a string.
+
+         Arguments:
+             text (str): The text to tokenize.
+
+         Returns:
+             List[str]: The tokens.
+         """
+         return text.split(self.word_delimiter)
+
+     # Detokenize
+     def detokenize(self,
+         tokens: List[str],
+     ) -> str:
+         """
+         Returns the string from a list of tokens.
+
+         Arguments:
+             tokens (List[str]): The tokens.
+
+         Returns:
+             str: The string.
+         """
+         return self.word_delimiter.join(tokens)
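A minimal round trip with the whitespace-delimited Tokenizer defined above (the import path is inferred from the file location in this diff):

    from nltkor.misc.string2string_default_tokenizer import Tokenizer

    tok = Tokenizer(word_delimiter=" ")
    tokens = tok.tokenize("the quick brown fox")   # ['the', 'quick', 'brown', 'fox']
    text = tok.detokenize(tokens)                  # 'the quick brown fox'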