tmnt 0.7.52b20240601__py3-none-any.whl → 0.7.52b20240603__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tmnt/estimator.py +21 -182
- tmnt/eval_npmi.py +56 -37
- tmnt/inference.py +10 -52
- tmnt/modeling.py +79 -175
- {tmnt-0.7.52b20240601.dist-info → tmnt-0.7.52b20240603.dist-info}/METADATA +2 -2
- {tmnt-0.7.52b20240601.dist-info → tmnt-0.7.52b20240603.dist-info}/RECORD +10 -10
- {tmnt-0.7.52b20240601.dist-info → tmnt-0.7.52b20240603.dist-info}/LICENSE +0 -0
- {tmnt-0.7.52b20240601.dist-info → tmnt-0.7.52b20240603.dist-info}/NOTICE +0 -0
- {tmnt-0.7.52b20240601.dist-info → tmnt-0.7.52b20240603.dist-info}/WHEEL +0 -0
- {tmnt-0.7.52b20240601.dist-info → tmnt-0.7.52b20240603.dist-info}/top_level.txt +0 -0
tmnt/estimator.py
CHANGED
@@ -17,7 +17,7 @@ import json

 from sklearn.metrics import average_precision_score, top_k_accuracy_score, roc_auc_score, ndcg_score, precision_recall_fscore_support
 from tmnt.data_loading import PairedDataLoader, SingletonWrapperLoader, SparseDataLoader, get_llm_model
-from tmnt.modeling import BowVAEModel,
+from tmnt.modeling import BowVAEModel, SeqBowVED
 from tmnt.modeling import CrossBatchCosineSimilarityLoss, GeneralizedSDMLLoss, MultiNegativeCrossEntropyLoss, MetricSeqBowVED, MetricBowVAEModel
 from tmnt.eval_npmi import EvaluateNPMI
 from tmnt.distribution import LogisticGaussianDistribution, BaseDistribution, GaussianDistribution, VonMisesDistribution
@@ -80,6 +80,7 @@ class BaseEstimator(object):
                  coherence_via_encoder: bool = False,
                  pretrained_param_file: Optional[str] = None,
                  warm_start: bool = False,
+                 npmi_matrix: Optional[torch.Tensor] = None,
                  test_batch_size: int = 0):
         self.vocabulary = vocabulary
         self.log_method = log_method
@@ -100,6 +101,7 @@ class BaseEstimator(object):
         self.warm_start = warm_start
         self.num_val_words = -1 ## will be set later for computing Perplexity on validation dataset
         self.latent_distribution.device = self.device
+        self.npmi_matrix : Optional[torch.Tensor] = npmi_matrix ## used with NPMI loss


     def _np_one_hot(self, vec, n_outputs):
@@ -150,8 +152,7 @@ class BaseEstimator(object):
                 unique_term_ids.add(topic_ids[j])
         redundancy = (1.0 - (float(len(unique_term_ids)) / num_topics / unique_limit)) ** 2
         return npmi, redundancy
-
-
+
     def _get_objective_from_validation_result(self, val_result):
         """
         Get the final objective value from the various validation metrics.
@@ -417,7 +418,7 @@ class BaseBowEstimator(BaseEstimator):
         with torch.no_grad():
             for i, ((data,labels),) in enumerate(dataloader):
                 data = data.to(self.device)
-                _, kl_loss, rec_loss, _
+                _, kl_loss, rec_loss, _ = self._forward(self.model, data)
                 total_rec_loss += float(rec_loss.sum())
                 total_kl_loss += float(kl_loss.sum())
         if ((total_rec_loss + total_kl_loss) / total_words) < 709.0:
@@ -517,7 +518,7 @@ class BaseBowEstimator(BaseEstimator):
             labels = torch.zeros(data.shape[0]).unsqueeze(dim=1)
             labels = labels.to(self.device)

-        elbo_ls, kl_ls, rec_ls,
+        elbo_ls, kl_ls, rec_ls, predicted_labels = \
             self._forward(self.model, data)
         if self.has_classifier:
             labels = labels.float() if self.multilabel else labels
@@ -529,13 +530,13 @@ class BaseBowEstimator(BaseEstimator):
         else:
             total_ls = elbo_ls.mean()
             label_ls = torch.zeros(total_ls.shape)
-        return elbo_ls, kl_ls, rec_ls,
+        return elbo_ls, kl_ls, rec_ls, label_ls, total_ls

     def _get_unlabeled_losses(self, model, data):
-        elbo_ls, kl_ls, rec_ls,
-            self._forward(
+        elbo_ls, kl_ls, rec_ls, predicted_labels = \
+            self._forward(model, data)
         total_ls = elbo_ls.mean() / self.gamma
-        return elbo_ls, kl_ls, rec_ls,
+        return elbo_ls, kl_ls, rec_ls, total_ls

     def fit_with_validation_loaders(self, train_dataloader, validation_dataloader, aux_dataloader,
                                     train_X_size, val_X_size, aux_X_size, total_val_words, val_X=None, val_y=None):
@@ -550,14 +551,14 @@ class BaseBowEstimator(BaseEstimator):
         lab_losses = []
         self.model.train()
         for i, (data_batch, aux_batch) in enumerate(joint_loader):
-            elbo_ls, kl_loss, _,
+            elbo_ls, kl_loss, _, lab_loss, total_ls = self._get_losses(self.model, data_batch)
             elbo_mean = elbo_ls.mean()
             if aux_batch is not None:
                 total_ls.backward(retain_graph=True)
                 aux_data, = aux_batch
                 aux_data, _ = aux_data # ignore (null) label
                 aux_data = aux_data.to(self.device)
-                elbo_ls_a, kl_loss_a, _,
+                elbo_ls_a, kl_loss_a, _, total_ls_a = self._get_unlabeled_losses(self.model, aux_data)
                 total_ls_a.backward()
             else:
                 total_ls.backward()
@@ -601,11 +602,6 @@ class BaseBowEstimator(BaseEstimator):
             else:
                 self._output_status("Epoch [{}]. Objective = {} ==> PPL = {}. NPMI ={}. Redundancy = {}."
                                     .format(epoch+1, sc_obj, v_res['ppl'], v_res['npmi'], v_res['redundancy']))
-            #session.report({"objective": sc_obj, "coherence": v_res['npmi'], "perplexity": v_res['ppl'],
-            #                "redundancy": v_res['redundancy']})
-            #if self.reporter:
-            #    self.reporter(epoch=epoch+1, objective=sc_obj, time_step=time.time(),
-            #                  coherence=v_res['npmi'], perplexity=v_res['ppl'], redundancy=v_res['redundancy'])
         return sc_obj, v_res


@@ -615,6 +611,8 @@ class BaseBowEstimator(BaseEstimator):
         if self.model is None or not self.warm_start:
             self.model = self._get_model()
             self.model.initialize_bias_terms(wd_freqs.squeeze()) ## initialize bias weights to log frequencies
+            if self.npmi_matrix is not None:
+                self.model.initialize_npmi_loss(self.npmi_matrix)
         return x_size


@@ -644,6 +642,7 @@ class BaseBowEstimator(BaseEstimator):
         X_data = train_dataloader.dataset.data
         train_dataloader = SingletonWrapperLoader(train_dataloader)
         train_X_size = X_data.shape
+        print("**** Setting up model with biases")
         _ = self.setup_model_with_biases(X_data)

         if aux_X is not None:
@@ -718,7 +717,7 @@ class BowEstimator(BaseBowEstimator):

        Returns:
            Tuple of:
-                elbo, kl_loss, rec_loss,
+                elbo, kl_loss, rec_loss, reconstruction
        """
        return model(data)

@@ -822,6 +821,8 @@ class BowMetricEstimator(BowEstimator):
         model = self._get_model()
         tr_bow_matrix = self._get_bow_matrix(train_data)
         model.initialize_bias_terms(tr_bow_matrix.sum(axis=0))
+        if self.npmi_matrix is not None:
+            self.model.initialize_npmi_loss(self.npmi_matrix)
         return model

     def _forward(self, model, data):
@@ -936,171 +937,6 @@ class BowMetricEstimator(BowEstimator):



-class CovariateBowEstimator(BaseBowEstimator):
-
-    def __init__(self, *args, n_covars=0, **kwargs):
-
-        super().__init__(*args, **kwargs)
-
-        self.covar_net_layers = 1 ### XXX - temp hardcoded
-        self.n_covars = n_covars
-
-
-    @classmethod
-    def from_config(cls, n_covars, *args, **kwargs):
-        est = super().from_config(*args, **kwargs)
-        est.n_covars = n_covars
-        return est
-
-    def _get_model(self):
-        """
-        Returns
-        MXNet model initialized using provided hyperparameters
-        """
-        if self.embedding_source != 'random':
-            #e_type, e_name = tuple(self.embedding_source.split(':'))
-            pt_embedding = pretrained_aliases('glove.6B.100d')
-            pretrained = pt_embedding.get_vecs_by_tokens(self.vocabulary)
-            emb_size = 100
-            #pt_embedding = nlp.embedding.create(e_type, source=e_name)
-            #self.vocabulary.set_embedding(pt_embedding)
-            #emb_size = len(self.vocabulary.embedding.idx_to_vec[0])
-            #for word in self.vocabulary.embedding._idx_to_token:
-            #    if (self.vocabulary.embedding[word] == mx.nd.zeros(emb_size)).sum() == emb_size:
-            #        self.vocabulary.embedding[word] = mx.nd.random.normal(0, 0.1, emb_size)
-        else:
-            emb_size = self.embedding_size
-        model = \
-            CovariateBowVAEModel(n_covars=self.n_covars,
-                                 enc_dim=self.enc_hidden_dim, embedding_size=emb_size,
-                                 fixed_embedding=self.fixed_embedding, latent_distribution=self.latent_distribution,
-                                 coherence_reg_penalty=self.coherence_reg_penalty, redundancy_reg_penalty=self.redundancy_reg_penalty,
-                                 ctx=self.ctx)
-        return model
-
-
-    def _get_losses(self, model, batch_data):
-        # batch_data has form: ((data, covars),)
-        (data,covars), = batch_data
-        data = data.to(self.device)
-        covars = covars.to(self.device)
-        elbo_ls, kl_ls, rec_ls, coherence_loss, red_ls, predicted_labels = \
-            self._forward(self.model, data, covars)
-        total_ls = elbo_ls.mean()
-        label_ls = mx.nd.zeros(total_ls.shape)
-        return elbo_ls, kl_ls, rec_ls, red_ls, label_ls, total_ls
-
-
-    def _get_config(self):
-        config = super()._get_config()
-        config['n_covars'] = self.n_covars
-        return config
-
-
-    def _forward(self,
-                 model: BowVAEModel,
-                 data: torch.Tensor,
-                 covars: torch.Tensor) -> Tuple[torch.Tensor,
-                                                torch.Tensor,
-                                                torch.Tensor,
-                                                torch.Tensor,
-                                                torch.Tensor,
-                                                torch.Tensor,
-                                                torch.Tensor] :
-        """
-        Forward pass of BowVAE model given the supplied data
-
-        Parameters:
-            model: Model that returns elbo, kl_loss, rec_loss, l1_pen, coherence_loss, redundancy_loss, reconstruction
-            data: Document word matrix of shape (n_train_samples, vocab_size)
-            covars: Covariate matrix. shape [n_samples, n_covars]
-
-        Returns:
-            (tuple): Tuple of:
-                elbo, kl_loss, rec_loss, l1_pen, coherence_loss, redundancy_loss, reconstruction
-        """
-        self.train_data = data
-        self.train_labels = covars
-        return model(data, covars)
-
-
-    def _npmi_per_covariate(self, X, y, k=10):
-        """
-        Calculate NPMI(Normalized Pointwise Mutual Information) for each covariate for data X
-
-        Parameters:
-            X (array-like or sparse matrix): Document word matrix. shape [n_samples, vocab_size]
-            y (array-like or sparse matrix): Covariate matrix. shape [n_samples, n_covars]
-            k (int): Threshold at which to compute npmi. optional (default=10)
-
-        Returns:
-            (dict): Dictionary of npmi scores for each covariate.
-        """
-        X_train = X.toarray()
-        y_train = y
-        covars = np.unique(y_train, axis=0)
-        covar_npmi = {}
-        npmi_total = 0
-        for covar in covars:
-            mask = (y_train == covar).all(axis=1)
-            X_covar, y_covar = torch.tensor(X_train[mask], dtype='float'), torch.tensor(y_train[mask], dtype='float')
-            sorted_ids = self.model.get_ordered_terms_with_covar_at_data(X_covar,k, y_covar)
-            top_k_words_per_topic = [[int(i) for i in list(sorted_ids[:k, t].asnumpy())] for t in range(self.n_latent)]
-            npmi_eval = EvaluateNPMI(top_k_words_per_topic)
-            npmi = npmi_eval.evaluate_csr_mat(X_covar)
-
-            #if(self.label_map):
-            #    covar_key = covar[0]
-            #else:
-            #    covar_key = np.where(covar)[0][0]
-            covar_keky = covar[0]
-            covar_npmi[covar_key] = npmi
-            npmi_total += npmi
-        return npmi_total / len(covars)
-
-    def _npmi(self, X, k=10):
-        return super()._npmi(X, k=k)
-        #return self._npmi_per_covariate(X, y, k)
-
-    def _get_objective_from_validation_result(self, v_res):
-        return v_res['npmi']
-
-    def validate(self, X, y):
-        npmi, redundancy = self._npmi(X)
-        return {'npmi': npmi, 'redundancy': redundancy, 'ppl': 0.0}
-
-    def get_topic_vectors(self) -> torch.Tensor:
-        """
-        Get topic vectors of the fitted model.
-
-        Returns:
-            topic_vectors: Topic word distribution. topic_distribution[i, j] represents word j in topic i.
-            shape=(n_latent, vocab_size)
-        """
-
-        return self.model.get_topic_vectors(self.train_data, self.train_labels)
-
-    def initialize_with_pretrained(self):
-        assert(self.pretrained_param_file is not None)
-        self.model = self._get_model()
-        self.model.load_parameters(self.pretrained_param_file, allow_missing=False)
-
-
-    def transform(self, X: sp.csr.csr_matrix, y: np.ndarray):
-        """
-        Transform data X and y according to the fitted model.
-
-        Parameters:
-            X: Document word matrix of shape {n_samples, n_features)
-            y: Covariate matrix of shape (n_train_samples, n_covars)
-
-        Returns:
-            Document topic distribution for X and y of shape=(n_samples, n_latent)
-        """
-        x_mxnet, y_mxnet = mx.nd.array(X, dtype=np.float32), mx.nd.array(y, dtype=np.float32)
-        return self.model.encode_data_with_covariates(x_mxnet, y_mxnet).asnumpy()
-
-
 class SeqBowEstimator(BaseEstimator):

     def __init__(self, *args,
@@ -1213,6 +1049,9 @@ class SeqBowEstimator(BaseEstimator):
         model = self._get_model()
         tr_bow_counts = self._get_bow_wd_counts(train_data)
         model.initialize_bias_terms(tr_bow_counts)
+        if self.npmi_matrix is not None:
+            print("****** INITIALIZING NPMI LOSS FUNCTION *******")
+            model.initialize_npmi_loss(self.npmi_matrix)
         return model

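The estimator changes above add an optional `npmi_matrix` constructor argument and, when it is supplied, hand it to the model through `initialize_npmi_loss` once the bias terms have been set up. Below is a minimal sketch of that hand-off only; `DummyEstimator` and `DummyModel` are illustrative stand-ins written for this note, not classes from tmnt, and the real estimators take many more arguments.

```python
# Illustrative sketch of the new npmi_matrix hand-off (not tmnt API).
from typing import Optional
import torch

class DummyModel:
    def initialize_npmi_loss(self, npmi_matrix: torch.Tensor) -> None:
        # the real BowVAEModel wraps the matrix in an NPMILossWithDiversity module
        print("NPMI loss initialized with matrix of shape", tuple(npmi_matrix.shape))

class DummyEstimator:
    def __init__(self, npmi_matrix: Optional[torch.Tensor] = None):
        self.npmi_matrix = npmi_matrix          # mirrors the new __init__ argument
        self.model = DummyModel()

    def setup_model(self) -> None:
        # mirrors setup_model_with_biases(): only wire the loss when a matrix was given
        if self.npmi_matrix is not None:
            self.model.initialize_npmi_loss(self.npmi_matrix)

DummyEstimator(npmi_matrix=torch.zeros(50, 50)).setup_model()
```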
tmnt/eval_npmi.py
CHANGED
@@ -10,9 +10,13 @@ from collections import Counter
 import numpy as np
 import scipy
 import scipy.sparse
+from tqdm import tqdm

 from tmnt.utils.ngram_helpers import BigramReader
 from itertools import combinations
+from gensim.models.coherencemodel import CoherenceModel
+from tmnt.preprocess.vectorizer import TMNTVectorizer
+from gensim.corpora.dictionary import Dictionary

 __all__ = ['NPMI', 'EvaluateNPMI']

@@ -22,7 +26,6 @@ class NPMI(object):
         self.unigram_cnts = unigram_cnts
         self.bigram_cnts = bigram_cnts
         self.n_docs = n_docs
-

     def wd_id_pair_npmi(self, w1: int, w2: int):
         cw1 = self.unigram_cnts.get(w1, 0.0)
@@ -89,41 +92,57 @@ class EvaluateNPMI(object):
             total_npmi += total_topic_npmi
         return total_npmi / len(self.top_k_words_per_topic)

-    def
+    def get_full_vocab_npmi_matrix(self, mat):
+        vocab_size = mat.shape[1]
+        npmi_matrix = np.zeros((vocab_size, vocab_size))
+        n_docs = mat.shape[0]
+        if isinstance(mat, scipy.sparse.csr.csr_matrix):
+            is_sparse = True
+        for (w1, w2) in tqdm(combinations(np.arange(vocab_size), 2)):
+            o_1 = mat[:, w1] > 0
+            o_2 = mat[:, w2] > 0
+            if is_sparse:
+                o_1 = o_1.toarray().squeeze()
+                o_2 = o_2.toarray().squeeze()
+            occur_1 = np.array(o_1, dtype='int')
+            occur_2 = np.array(o_2, dtype='int')
+            unigram_1 = occur_1.sum()
+            unigram_2 = occur_2.sum()
+            bigram_cnt = np.sum(occur_1 * occur_2)
+            if bigram_cnt < 1:
                 npmi = 0.0
+            else:
+                npmi = (log10(n_docs) + log10(bigram_cnt) - log10(unigram_1) - log10(unigram_2)) / (log10(n_docs) - log10(bigram_cnt) + 1e-4)
+            npmi_matrix[w1, w2] = npmi
+        return npmi_matrix
+
+
+class FullNPMI(object):
+
+    def get_full_vocab_npmi_matrix(self, mat: scipy.sparse.csr_matrix, tf: TMNTVectorizer):
+        corpus = []
+        npmi_matrix = np.zeros((tf.vocab_size, tf.vocab_size))
+        for ri in range(mat.shape[0]):
+            row = mat.getrow(ri)
+            corpus.append(list(zip(row.indices, row.data)))
+        topics = [ list(range(mat.shape[1])) ]
+        dictionary = Dictionary()
+        dictionary.id2token = tf.get_vocab().get_itos()
+        dictionary.token2id = tf.get_vocab().get_stoi()
+        cm = CoherenceModel(topics=topics, corpus=corpus, dictionary=dictionary, coherence='u_mass', topn=len(topics[0]))
+        segmented_topics = cm.measure.seg(cm.topics)
+        accumulator = cm.estimate_probabilities(segmented_topics)
+        num_docs = accumulator.num_docs
+        eps = 1e-12
+        for w1, w2 in tqdm(segmented_topics[0]):
+            w1_count = accumulator[w1]
+            w2_count = accumulator[w2]
+            co_occur_count = accumulator[w1, w2]
+            p_w1_w2 = co_occur_count / num_docs
+            p_w1 = w1_count / num_docs
+            p_w2 = w2_count / num_docs
+            npmi_matrix[w1, w2] = np.log((p_w1_w2 + eps) / (p_w1 * p_w2)) / -np.log(p_w1_w2 + eps)
+        return npmi_matrix

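The pairwise score the new `get_full_vocab_npmi_matrix` fills in is standard NPMI over document frequencies, log(P(w1,w2) / (P(w1)P(w2))) / -log(P(w1,w2)), written with base-10 logs and a small constant guarding the denominator. A worked example with made-up counts, using the same expression as the added code:

```python
# Worked NPMI example; the counts are invented for illustration.
from math import log10

n_docs = 1000        # documents in the corpus
unigram_1 = 120      # documents containing w1
unigram_2 = 80       # documents containing w2
bigram_cnt = 40      # documents containing both w1 and w2

npmi = (log10(n_docs) + log10(bigram_cnt) - log10(unigram_1) - log10(unigram_2)) \
       / (log10(n_docs) - log10(bigram_cnt) + 1e-4)
print(npmi)  # ~0.44; 1.0 would mean the pair always co-occurs, values near 0 indicate independence
```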
tmnt/inference.py
CHANGED
@@ -10,8 +10,8 @@ import io
 import os
 import torch
 import pickle
-from tmnt.modeling import BowVAEModel,
-from tmnt.estimator import BowEstimator,
+from tmnt.modeling import BowVAEModel, SeqBowVED, MetricSeqBowVED
+from tmnt.estimator import BowEstimator, SeqBowEstimator, SeqBowMetricEstimator
 from tmnt.data_loading import SparseDataLoader
 from tmnt.preprocess.vectorizer import TMNTVectorizer
 from tmnt.utils.recalibrate import recalibrate_scores
@@ -54,9 +54,6 @@ class BaseInferencer(object):
     def get_top_k_words_per_topic(self, k):
         raise NotImplementedError

-    def get_top_k_words_per_topic_per_covariate(self, k):
-        raise NotImplementedError
-
     def get_pyldavis_details(self, sp_vec_file_or_X, y=None):
         w_pr, dt_matrix, doc_lengths, term_cnts = self.get_model_details(sp_vec_file_or_X, y=y)
         d1 = w_pr.cpu().detach().numpy().tolist()
@@ -80,12 +77,7 @@ class BowVAEInferencer(BaseInferencer):
         self.vocab = estimator.vocabulary
         self.n_latent = estimator.n_latent
         self.model = estimator.model
-            self.covar_model = True
-            self.n_covars = estimator.model.n_covars
-            self.covar_net_layers = estimator.model.covar_net_layers
-        else:
-            self.covar_model = False
+        self.covar_model = False

     @classmethod
     def from_saved(cls, model_dir=None, device='cpu'):
@@ -96,12 +88,7 @@ class BowVAEInferencer(BaseInferencer):
         serialized_vectorizer_file = os.path.join(model_dir,'vectorizer.pkl')
         with io.open(config_file, 'r') as f:
             config_dict = json.load(f)
-            estimator = CovariateBowEstimator.from_config(config_dict['n_covars'],
-                                                          config_file, vocab_file,
-                                                          pretrained_param_file=param_file)
-        else:
-            estimator = BowEstimator.from_saved(model_dir)
+        estimator = BowEstimator.from_saved(model_dir)
         estimator.initialize_with_pretrained()
         if os.path.exists(serialized_vectorizer_file):
             with open(serialized_vectorizer_file, 'rb') as fp:
@@ -174,16 +161,12 @@ class BowVAEInferencer(BaseInferencer):
         for _, (data,labels) in enumerate(infer_iter):
             with torch.no_grad():
                 data = data.to(self.device)
-                if self.model.multilabel:
-                    preds = list(self.model.predict(data).sigmoid().detach().numpy())
-                else:
-                    preds = list(self.model.predict(data).softmax(dim=1).detach().numpy())
+                encs = self.model.encode_data(data, include_bn=include_bn)
+                if include_predictions:
+                    if self.model.multilabel:
+                        preds = list(self.model.predict(data).sigmoid().detach().numpy())
+                    else:
+                        preds = list(self.model.predict(data).softmax(dim=1).detach().numpy())
                 if use_probs:
                     #e1 = (encs - encs.min(dim=1).unsqueeze(1)).astype('float64')
                     e1 = (encs - encs.min(dim=1)[0].unsqueeze(1))
@@ -233,31 +216,6 @@ class BowVAEInferencer(BaseInferencer):
         return topic_terms


-    def get_top_k_words_per_topic_per_covariate(self, k):
-        n_topics = self.n_latent
-        w = self.model.cov_decoder.cov_inter_decoder.collect_params().get('weight').data()
-        n_covars = int(w.shape[1] / n_topics)
-        topic_terms = []
-        for i in range(n_covars):
-            cv_i_slice = w[:, (i * n_topics):((i+1) * n_topics)]
-            sorted_ids = cv_i_slice.argsort(dim=0, is_ascend=False)
-            cv_i_terms = []
-            for t in range(n_topics):
-                top_k = [ self.vocab.lookup_token(int(i)) for i in list(sorted_ids[:k, t].asnumpy()) ]
-                cv_i_terms.append(top_k)
-            topic_terms.append(cv_i_terms)
-        return topic_terms
-
-    def get_covariate_model_details(self):
-        ## 1) C x K x W tensor with |C| P(term|topic) probability matricies where |C| is number of co-variates
-        w = self.model.cov_decoder.cov_inter_decoder.collect_params().get('weight').data().transpose()
-        w_rsh = w.reshape(-1,self.n_latent, w.shape[1])
-        return w_rsh.softmax(dim=2)
-
-
-    def get_top_k_words_per_topic_over_scalar_covariate(self, k, min_v=0.0, max_v=1.0, step=0.1):
-        raise NotImplemented
-
     def predict_text(self, txt: List[str], pred_threshold: float = 0.5) -> Tuple[List[str], List[np.ndarray], np.ndarray]:
         """Take a list of input documents/passages as strings and return document encodings (topics) and classification outputs

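Inside the new `include_predictions` branch above, classifier outputs are post-processed with a sigmoid for multilabel models and a row-wise softmax otherwise, exactly as in the lines shown. A small self-contained illustration of that difference; the logits are made up:

```python
# Sigmoid vs. softmax post-processing of classifier logits (illustrative values).
import torch

logits = torch.tensor([[2.0, -1.0, 0.5]])     # one document, three labels/classes

multilabel_probs = logits.sigmoid()           # independent per-label probabilities
singlelabel_probs = logits.softmax(dim=1)     # class probabilities that sum to 1 per row

print(multilabel_probs.sum().item())          # generally != 1
print(singlelabel_probs.sum().item())         # ~1.0
```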
tmnt/modeling.py
CHANGED
@@ -36,7 +36,8 @@ class BaseVAE(nn.Module):

         self.latent_distribution = latent_distribution
         self.decoder = nn.Linear(self.n_latent, self.vocab_size).to(device)
-
+        self.npmi_with_diversity_loss : Optional[NPMILossWithDiversity] = None
+        self.npmi_alpha = 0.7

     def initialize_bias_terms(self, wd_freqs: Optional[np.ndarray]):
         if wd_freqs is not None:
@@ -46,6 +47,10 @@ class BaseVAE(nn.Module):
             self.decoder.bias = nn.Parameter(torch.tensor(log_freq, dtype=torch.float32, device=self.device))
             self.decoder.bias.requires_grad_(False)

+    def initialize_npmi_loss(self, npmi_mat):
+        t_npmi_mat = torch.Tensor(npmi_mat, device=self.device)
+        self.npmi_with_diversity_loss = NPMILossWithDiversity(t_npmi_mat, device=self.device, alpha=self.npmi_alpha)
+
     def get_ordered_terms(self):
         """
         Returns the top K terms for each topic based on sensitivity analysis. Terms whose
@@ -62,26 +67,29 @@ class BaseVAE(nn.Module):
         """
         Returns unnormalized topic vectors
         """
-        z = torch.ones((
+        z = torch.ones((self.n_latent,), device=self.device)
         jacobian = torch.autograd.functional.jacobian(self.decoder, z)
-        return jacobian.cpu().
+        return jacobian.cpu().numpy()


-    def
-    if self.
+    def add_npmi_and_diversity_loss(self, cur_loss):
+        if self.npmi_with_diversity_loss:
+            z = torch.ones((self.n_latent,), device=self.device)
+            jacobian = torch.autograd.functional.jacobian(self.decoder, z)
+            npmi_loss = self.npmi_with_diversity_loss(jacobian)
+            npmi_loss = npmi_loss.sum()
+            print("npmi loss = {}".format(npmi_loss))
+            return (cur_loss + npmi_loss)
         else:
-            return
+            return cur_loss

+
+    def get_loss_terms(self, data, y, KL):
         rr = data * torch.log(y+1e-12)
         recon_loss = -(rr.sum(dim=1))
         i_loss = KL + recon_loss
-        ii_loss
-        return ii_loss, recon_loss
+        ii_loss = self.add_npmi_and_diversity_loss(i_loss)
+        return ii_loss, recon_loss


 class BowVAEModel(BaseVAE):
@@ -224,14 +232,14 @@ class BowVAEModel(BaseVAE):
         z, KL = self.latent_distribution(enc_out, batch_size)
         xhat = self.decoder(z)
         y = torch.nn.functional.softmax(xhat, dim=1)
-        ii_loss, recon_loss
-            self.get_loss_terms(data, y, KL
+        ii_loss, recon_loss = \
+            self.get_loss_terms(data, y, KL)
         if self.has_classifier:
             mu_out = self.latent_distribution.get_mu_encoding(enc_out)
             classifier_outputs = self.classifier(self.lab_dr(mu_out))
         else:
             classifier_outputs = None
-        return ii_loss, KL, recon_loss,
+        return ii_loss, KL, recon_loss, classifier_outputs


 class MetricBowVAEModel(BowVAEModel):
@@ -241,13 +249,6 @@ class MetricBowVAEModel(BowVAEModel):
         super(MetricBowVAEModel, self).__init__(*args, **kwargs)


-    def get_redundancy_penalty(self):
-        w = self.decoder.weight.data
-        emb = self.embedding.weight.data if self.embedding is not None else w.transpose()
-        _, redundancy_loss = self.coherence_regularization(w, emb)
-        return redundancy_loss
-
-
     def _get_elbo(self, bow, enc):
         batch_size = bow.shape[0]
         z, KL = self.latent_distribution(enc, batch_size)
@@ -277,159 +278,61 @@ class MetricBowVAEModel(BowVAEModel):
         return (elbo1 + elbo2), (rec_loss1 + rec_loss2), (KL_loss1 + KL_loss2), redundancy_loss, mu1, mu2


-class
-    """Bag-of-words topic model with labels used as co-variates
-    """
-    def __init__(self, covar_net_layers=1, *args, **kwargs):
-        super(CovariateBowVAEModel, self).__init__(*args, **kwargs)
-        self.covar_net_layers = covar_net_layers
-        with self.name_scope():
-            if self.n_covars < 1:
-                self.cov_decoder = ContinuousCovariateModel(self.n_latent, self.vocab_size,
-                                                            total_layers=self.covar_net_layers, device=self.device)
-            else:
-                self.cov_decoder = CovariateModel(self.n_latent, self.n_covars, self.vocab_size,
-                                                  interactions=True, device=self.device)
-
-
-    def encode_data_with_covariates(self, data, covars, include_bn=False):
-        """
-        Encode data to the mean of the latent distribution defined by the input `data`
-        """
-        emb_out = self.embedding(data)
-        enc_out = self.encoder(mx.nd.concat(emb_out, covars))
-        return self.latent_distribution.get_mu_encoding(enc_out, include_bn=include_bn)
-
-
-    def get_ordered_terms_with_covar_at_data(self, data, k, covar):
-        """
-        Uses test/training data-point as the input points around which term sensitivity is computed
-        """
-        data = data.to(self.device)
-        covar = covar.to(self.device)
-        jacobian = torch.zeros((self.vocab_size, self.n_latent), device=self.device)
-
-        batch_size = data.shape[0]
-        emb_out = self.embedding(data)
-
-        co_emb = torch.cat(emb_out, covar)
-        z = self.latent_distribution.get_mu_encoding(self.encoder(co_emb))
-        z.attach_grad()
-        outputs = []
-        with mx.autograd.record():
-            dec_out = self.decoder(z)
-            cov_dec_out = self.cov_decoder(z, covar)
-            y = mx.nd.softmax(cov_dec_out + dec_out, axis=1)
-            for i in range(self.vocab_size):
-                outputs.append(y[:,i])
-        for i, output in enumerate(outputs):
-            output.backward(retain_graph=True)
-            jacobian[i] += z.grad.sum(axis=0)
-        sorted_j = jacobian.argsort(axis=0, is_ascend=False)
-        return sorted_j
+class NPMILossWithDiversity(nn.Module):

-    def
-        covar = covar.as_in_context(self.model_ctx)
-        jacobian = mx.nd.zeros(shape=(self.vocab_size, self.n_latent), ctx=self.model_ctx)
-
-        batch_size = data.shape[0]
-        emb_out = self.embedding(data)
-
-        co_emb = mx.nd.concat(emb_out, covar)
-        z = self.latent_distribution.get_mu_encoding(self.encoder(co_emb))
-        z.attach_grad()
-        outputs = []
-        with mx.autograd.record():
-            dec_out = self.decoder(z)
-            cov_dec_out = self.cov_decoder(z, covar)
-            y = mx.nd.softmax(cov_dec_out + dec_out, axis=1)
-            for i in range(self.vocab_size):
-                outputs.append(y[:,i])
-        for i, output in enumerate(outputs):
-            output.backward(retain_graph=True)
-            jacobian[i] += z.grad.sum(axis=0)
-        return jacobian
-
-
-    def forward(self, F, data, covars):
-        batch_size = data.shape[0]
-        emb_out = self.embedding(data)
-        if self.n_covars > 0:
-            covars = F.one_hot(covars, self.n_covars)
-        co_emb = F.concat(emb_out, covars)
-        z, KL = self.run_encode(F, co_emb, batch_size)
-        dec_out = self.decoder(z)
-        cov_dec_out = self.cov_decoder(z, covars)
-        y = F.softmax(dec_out + cov_dec_out, axis=1)
-        ii_loss, recon_loss, coherence_loss, redundancy_loss = \
-            self.get_loss_terms(F, data, y, KL, batch_size)
-        return ii_loss, KL, recon_loss, coherence_loss, redundancy_loss, None
-
-
-class CovariateModel(nn.Module):
-
-    def __init__(self, n_topics, n_covars, vocab_size, interactions=False, device='cpu'):
-        self.n_topics = n_topics
-        self.n_covars = n_covars
-        self.vocab_size = vocab_size
-        self.interactions = interactions
+    def __init__(self, npmi_matrix: torch.Tensor, device: torch.device, k=20, alpha=0.7, use_diversity_loss=True):
+        super(NPMILossWithDiversity, self).__init__()
+        self.alpha = alpha
+        self.npmi_matrix = npmi_matrix
+        self.use_diversity_loss = use_diversity_loss
         self.device = device
-
-        with self.name_scope():
-            self.cov_decoder = torch.nn.Linear(n_covars, self.vocab_size, bias=False)
-            if self.interactions:
-                self.cov_inter_decoder = torch.nn.Linear(self.n_covars * self.n_topics, self.vocab_size, bias=False)
-            self.apply(self._init_weights)
-
-    def _init_weights(self, module):
-        if isinstance(module, torch.nn.Linear):
-            torch.nn.init.xavier_uniform_(module.weight.data)
-
-
-    def forward(self, topic_distrib, covars):
-        score_C = self.cov_decoder(covars)
-        if self.interactions:
-            td_rsh = topic_distrib.unsqueeze(1)
-            cov_rsh = covars.unsqueeze(2)
-            cov_interactions = cov_rsh * td_rsh ## shape (N, Topics, Covariates) -- outer product
-            batch_size = cov_interactions.shape[0]
-            cov_interactions_rsh = torch.reshape(cov_interactions, (batch_size, self.n_topics * self.n_covars))
-            score_CI = self.cov_inter_decoder(cov_interactions_rsh)
-            return score_CI + score_C
-        else:
-            return score_C
-
-
-class ContinuousCovariateModel(nn.Module):
-
-    def __init__(self, n_topics, vocab_size, total_layers = 1, device='device'):
-        self.n_topics = n_topics
-        self.n_scalars = 1 # number of continuous variables
-        self.model_ctx = ctx
-        self.time_topic_dim = 300
-        super(ContinuousCovariateModel, self).__init__()
-
-        with self.name_scope():
-            self.cov_decoder = nn.Sequential()
-            for i in range(total_layers):
-                if i < 1:
-                    in_units = self.n_scalars + self.n_topics
-                else:
-                    in_units = self.time_topic_dim
-                self.cov_decoder.add_module("linear_"+str(i), nn.Linear(in_units, self.time_topic_dim,
-                                                                        bias=(i < 1)))
-                self.cov_decoder.add_module("relu_"+str(i), nn.Relu())
-            self.cov_decoder.add_module("linear_out_", nn.Linear(self.time_topic_dim, vocab_size, bias=False))
-
-    def forward(self, topic_distrib, scalars):
-        inputs = torch.cat((topic_distrib, scalars), 0)
-        sc_transform = self.cov_decoder(inputs)
-        return sc_transform
+        self.k = k

+    def _row_wise_normalize_inplace(self, x, mask=None):
+        for row_idx, row in enumerate(x):
+            if mask != None:
+                row_mask = mask[row_idx]
+                row = row[row_mask]
+                x[row_idx][row_mask] = (row - row.min()) / (row.max() - row.min())
+            else:
+                row_min = row.min().item()
+                row_max = row.max().item()
+                x[row_idx] = (row - row_min)/(row_max - row_min)
+        return x
+
+    def _get_npmi_loss(self, jacobian):
+        #z = torch.ones((self.n_latent,), device=self.device)
+        #jacobian = torch.autograd.functional.jacobian(self.decoder, z)
+        #beta = self.model.get_topic_vectors().t() # |T| x |V|
+        beta = jacobian.t()
+        n_topics = beta.shape[0]
+        self.npmi_matrix.fill_diagonal_(1)
+        topk_idx = torch.topk(beta, self.k, dim=1)[1]
+        topk_mask = torch.zeros_like(beta)
+        for row_idx, indices in enumerate(topk_idx):
+            topk_mask[row_idx, indices] = 1
+        beta_mask = (1 - topk_mask) * -99999
+        topk_mask = topk_mask.bool()
+        topk_softmax_beta = torch.softmax(beta + beta_mask, dim=1)
+        softmax_beta = torch.softmax(beta, dim=1)
+        weighted_npmi = 1 - self._row_wise_normalize_inplace(torch.matmul(topk_softmax_beta.detach(), self.npmi_matrix))
+        #print("Weighted_npmi sum = {}".format(weighted_npmi.sum()))
+        npmi_loss = 100 * (softmax_beta ** 2) * weighted_npmi
+        if self.use_diversity_loss:
+            diversity_mask = torch.zeros_like(beta).bool()
+            for topic_idx in range(n_topics):
+                other_rows_mask = torch.ones(n_topics).bool().to(self.device)
+                other_rows_mask[topic_idx] = False
+                diversity_mask[topic_idx] = topk_mask[other_rows_mask].sum(0) > 0
+            #print("Diversity mask sum = {}".format(diversity_mask.sum()))
+            npmi_loss = ( self.alpha * torch.masked_select(npmi_loss, diversity_mask)).sum() + \
+                        ((1 - self.alpha) * torch.masked_select(npmi_loss, ~diversity_mask)).sum()
+        npmi_loss *= 2
+        return npmi_loss
+
+    def forward(self, beta):
+        return self._get_npmi_loss(beta)
+


 class CoherenceRegularizer(nn.Module):
@@ -570,9 +473,10 @@ class SeqBowVED(BaseSeqBowVED):
             classifier_outputs = self.classifier(z_mu)
         else:
             classifier_outputs = None
+        redundancy_loss = entropy_loss
+        ii_loss = self.add_npmi_and_diversity_loss(elbo)
         redundancy_loss = entropy_loss #self.get_redundancy_penalty()
-
-        return elbo, rec_loss, KL_loss, redundancy_loss, classifier_outputs
+        return ii_loss, rec_loss, KL_loss, redundancy_loss, classifier_outputs


 class MetricSeqBowVED(BaseSeqBowVED):
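The `NPMILossWithDiversity` module added above consumes the decoder Jacobian (a vocab-by-topics sensitivity matrix) together with a full-vocabulary NPMI matrix and returns a scalar penalty, which `add_npmi_and_diversity_loss` adds to the ELBO-style loss. A minimal sketch of driving it in isolation, assuming tmnt 0.7.52b20240603 is installed; the random tensors are stand-ins for a real NPMI matrix and a real decoder Jacobian:

```python
# Standalone sketch of the new NPMI-with-diversity penalty (stand-in inputs).
import torch
from tmnt.modeling import NPMILossWithDiversity

vocab_size, n_topics = 200, 20
npmi_mat = torch.rand(vocab_size, vocab_size)                      # stand-in NPMI matrix (V x V)
jacobian = torch.randn(vocab_size, n_topics, requires_grad=True)   # stand-in decoder Jacobian (V x T)

loss_fn = NPMILossWithDiversity(npmi_mat, device=torch.device('cpu'), k=20, alpha=0.7)
penalty = loss_fn(jacobian).sum()   # forward() transposes to (T x V) and scores the top-k terms per topic
penalty.backward()                  # gradients flow back through softmax(beta), as during training
print(float(penalty))
```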
{tmnt-0.7.52b20240601.dist-info → tmnt-0.7.52b20240603.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: tmnt
-Version: 0.7.
+Version: 0.7.52b20240603
 Summary: Topic modeling neural toolkit
 Home-page: https://github.com/mitre/tmnt.git
 Author: The MITRE Corporation
@@ -29,7 +29,7 @@ Requires-Dist: sacremoses >=0.0.38
 Requires-Dist: sentence-splitter ==1.4
 Requires-Dist: umap-learn[plot] >=0.5.5
 Requires-Dist: numba
-Requires-Dist: scipy
+Requires-Dist: scipy ==1.12.0
 Requires-Dist: tabulate >=0.8.7
 Requires-Dist: torch >=2.1.2
 Requires-Dist: torchtext >=0.13.0
{tmnt-0.7.52b20240601.dist-info → tmnt-0.7.52b20240603.dist-info}/RECORD
CHANGED
@@ -2,10 +2,10 @@ tmnt/__init__.py,sha256=EPNq1H7UMyMewWT_zTGBaC7ZouvCywX_gMX4G1dtmvw,250
 tmnt/configuration.py,sha256=P8PEhzVPKO5xG0FrdTLRQ60OYWigbzPY-OSx_hzQlrY,10054
 tmnt/data_loading.py,sha256=A0tsM6x61BGhYBV6rAYdryz2NwbR__8EAYj_Q4Z-DCs,18736
 tmnt/distribution.py,sha256=Pmyc5gwDd_-jP7vLVb0vdNQaSSvF1EuiTZEWg3KfmI8,10866
-tmnt/estimator.py,sha256=
-tmnt/eval_npmi.py,sha256=
-tmnt/inference.py,sha256=
-tmnt/modeling.py,sha256=
+tmnt/estimator.py,sha256=qqb3zYCUGY53bcXjUK_B7_yvLkjiMYAeYaPk5XoFxnY,70622
+tmnt/eval_npmi.py,sha256=DTW9dNHVe6H57gndQIZ4gX9EghuBstwznA3YBqILJk0,5820
+tmnt/inference.py,sha256=da8qAnjTDTuWQfPEOQewOfgikqE00XT1xGMiO2mckI4,15679
+tmnt/modeling.py,sha256=RhabXB8f9ZliOOgQVJiwwnEnvdK-oil7fGe4prDiPjc,30508
 tmnt/preprocess/__init__.py,sha256=gwMejkQrnqKS05i0JVsUru2hDUR5jE1hKC10dL934GU,170
 tmnt/preprocess/tokenizer.py,sha256=-ZgowfbHrM040vbNTktZM_hdl6HDTqxSJ4mDAxq3dUs,14050
 tmnt/preprocess/vectorizer.py,sha256=RkdivqP76qAJDianV09lONad9NbfBVWLZgIbU_P1-zo,15796
@@ -17,9 +17,9 @@ tmnt/utils/ngram_helpers.py,sha256=VrIzou2oQHCLBLSWODDeikN3PYat1NqqvEeYQj_GhbA,1
 tmnt/utils/pubmed_utils.py,sha256=3sHwoun7vxb0GV-arhpXLMUbAZne0huAh9xQNy6H40E,1274
 tmnt/utils/random.py,sha256=qY75WG3peWoMh9pUyCPBEo6q8IvkF6VRjeb5CqJOBF8,327
 tmnt/utils/recalibrate.py,sha256=TmpB8An8bslICZ13UTJfIvr8VoqiSedtpHxec4n8CHk,1439
-tmnt-0.7.
-tmnt-0.7.
-tmnt-0.7.
-tmnt-0.7.
-tmnt-0.7.
-tmnt-0.7.
+tmnt-0.7.52b20240603.dist-info/LICENSE,sha256=qFZJrfJ7Zi4IXDiyiGVrHWic_l1h2tc36tI8Z7rK9bs,11356
+tmnt-0.7.52b20240603.dist-info/METADATA,sha256=9ep8zbq62Jahe1jps7Qjt54XDcIZTxIqKsR7i65jyDY,1461
+tmnt-0.7.52b20240603.dist-info/NOTICE,sha256=p0kYIVAkReTFaGb4C-qPa7h5ztze6hGzOpjCMMbOipU,425
+tmnt-0.7.52b20240603.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+tmnt-0.7.52b20240603.dist-info/top_level.txt,sha256=RpYgUl187sXnqmiwKjZZdcDlHz2AALs6bGdUcukyd_E,5
+tmnt-0.7.52b20240603.dist-info/RECORD,,
{tmnt-0.7.52b20240601.dist-info → tmnt-0.7.52b20240603.dist-info}/LICENSE: File without changes
{tmnt-0.7.52b20240601.dist-info → tmnt-0.7.52b20240603.dist-info}/NOTICE: File without changes
{tmnt-0.7.52b20240601.dist-info → tmnt-0.7.52b20240603.dist-info}/WHEEL: File without changes
{tmnt-0.7.52b20240601.dist-info → tmnt-0.7.52b20240603.dist-info}/top_level.txt: File without changes