tmnt 0.7.52b20240601__py3-none-any.whl → 0.7.52b20240603__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tmnt/estimator.py +21 -182
- tmnt/eval_npmi.py +56 -37
- tmnt/inference.py +10 -52
- tmnt/modeling.py +79 -175
- {tmnt-0.7.52b20240601.dist-info → tmnt-0.7.52b20240603.dist-info}/METADATA +2 -2
- {tmnt-0.7.52b20240601.dist-info → tmnt-0.7.52b20240603.dist-info}/RECORD +10 -10
- {tmnt-0.7.52b20240601.dist-info → tmnt-0.7.52b20240603.dist-info}/LICENSE +0 -0
- {tmnt-0.7.52b20240601.dist-info → tmnt-0.7.52b20240603.dist-info}/NOTICE +0 -0
- {tmnt-0.7.52b20240601.dist-info → tmnt-0.7.52b20240603.dist-info}/WHEEL +0 -0
- {tmnt-0.7.52b20240601.dist-info → tmnt-0.7.52b20240603.dist-info}/top_level.txt +0 -0
tmnt/estimator.py
CHANGED
@@ -17,7 +17,7 @@ import json

 from sklearn.metrics import average_precision_score, top_k_accuracy_score, roc_auc_score, ndcg_score, precision_recall_fscore_support
 from tmnt.data_loading import PairedDataLoader, SingletonWrapperLoader, SparseDataLoader, get_llm_model
-from tmnt.modeling import BowVAEModel,
+from tmnt.modeling import BowVAEModel, SeqBowVED
 from tmnt.modeling import CrossBatchCosineSimilarityLoss, GeneralizedSDMLLoss, MultiNegativeCrossEntropyLoss, MetricSeqBowVED, MetricBowVAEModel
 from tmnt.eval_npmi import EvaluateNPMI
 from tmnt.distribution import LogisticGaussianDistribution, BaseDistribution, GaussianDistribution, VonMisesDistribution
@@ -80,6 +80,7 @@ class BaseEstimator(object):
                  coherence_via_encoder: bool = False,
                  pretrained_param_file: Optional[str] = None,
                  warm_start: bool = False,
+                 npmi_matrix: Optional[torch.Tensor] = None,
                  test_batch_size: int = 0):
         self.vocabulary = vocabulary
         self.log_method = log_method
@@ -100,6 +101,7 @@ class BaseEstimator(object):
         self.warm_start = warm_start
         self.num_val_words = -1 ## will be set later for computing Perplexity on validation dataset
         self.latent_distribution.device = self.device
+        self.npmi_matrix : Optional[torch.Tensor] = npmi_matrix ## used with NPMI loss


     def _np_one_hot(self, vec, n_outputs):
@@ -150,8 +152,7 @@ class BaseEstimator(object):
                 unique_term_ids.add(topic_ids[j])
         redundancy = (1.0 - (float(len(unique_term_ids)) / num_topics / unique_limit)) ** 2
         return npmi, redundancy
-
-
+
     def _get_objective_from_validation_result(self, val_result):
         """
         Get the final objective value from the various validation metrics.
@@ -417,7 +418,7 @@ class BaseBowEstimator(BaseEstimator):
         with torch.no_grad():
             for i, ((data,labels),) in enumerate(dataloader):
                 data = data.to(self.device)
-                _, kl_loss, rec_loss, _
+                _, kl_loss, rec_loss, _ = self._forward(self.model, data)
                 total_rec_loss += float(rec_loss.sum())
                 total_kl_loss += float(kl_loss.sum())
         if ((total_rec_loss + total_kl_loss) / total_words) < 709.0:
@@ -517,7 +518,7 @@ class BaseBowEstimator(BaseEstimator):
             labels = torch.zeros(data.shape[0]).unsqueeze(dim=1)
             labels = labels.to(self.device)

-        elbo_ls, kl_ls, rec_ls,
+        elbo_ls, kl_ls, rec_ls, predicted_labels = \
             self._forward(self.model, data)
         if self.has_classifier:
             labels = labels.float() if self.multilabel else labels
@@ -529,13 +530,13 @@ class BaseBowEstimator(BaseEstimator):
         else:
             total_ls = elbo_ls.mean()
             label_ls = torch.zeros(total_ls.shape)
-        return elbo_ls, kl_ls, rec_ls,
+        return elbo_ls, kl_ls, rec_ls, label_ls, total_ls

     def _get_unlabeled_losses(self, model, data):
-        elbo_ls, kl_ls, rec_ls,
-            self._forward(
+        elbo_ls, kl_ls, rec_ls, predicted_labels = \
+            self._forward(model, data)
         total_ls = elbo_ls.mean() / self.gamma
-        return elbo_ls, kl_ls, rec_ls,
+        return elbo_ls, kl_ls, rec_ls, total_ls

     def fit_with_validation_loaders(self, train_dataloader, validation_dataloader, aux_dataloader,
                                     train_X_size, val_X_size, aux_X_size, total_val_words, val_X=None, val_y=None):
@@ -550,14 +551,14 @@ class BaseBowEstimator(BaseEstimator):
         lab_losses = []
         self.model.train()
         for i, (data_batch, aux_batch) in enumerate(joint_loader):
-            elbo_ls, kl_loss, _,
+            elbo_ls, kl_loss, _, lab_loss, total_ls = self._get_losses(self.model, data_batch)
             elbo_mean = elbo_ls.mean()
             if aux_batch is not None:
                 total_ls.backward(retain_graph=True)
                 aux_data, = aux_batch
                 aux_data, _ = aux_data # ignore (null) label
                 aux_data = aux_data.to(self.device)
-                elbo_ls_a, kl_loss_a, _,
+                elbo_ls_a, kl_loss_a, _, total_ls_a = self._get_unlabeled_losses(self.model, aux_data)
                 total_ls_a.backward()
             else:
                 total_ls.backward()
@@ -601,11 +602,6 @@ class BaseBowEstimator(BaseEstimator):
             else:
                 self._output_status("Epoch [{}]. Objective = {} ==> PPL = {}. NPMI ={}. Redundancy = {}."
                                     .format(epoch+1, sc_obj, v_res['ppl'], v_res['npmi'], v_res['redundancy']))
-            #session.report({"objective": sc_obj, "coherence": v_res['npmi'], "perplexity": v_res['ppl'],
-            #                "redundancy": v_res['redundancy']})
-            #if self.reporter:
-            #    self.reporter(epoch=epoch+1, objective=sc_obj, time_step=time.time(),
-            #                  coherence=v_res['npmi'], perplexity=v_res['ppl'], redundancy=v_res['redundancy'])
         return sc_obj, v_res


@@ -615,6 +611,8 @@ class BaseBowEstimator(BaseEstimator):
         if self.model is None or not self.warm_start:
             self.model = self._get_model()
             self.model.initialize_bias_terms(wd_freqs.squeeze()) ## initialize bias weights to log frequencies
+            if self.npmi_matrix is not None:
+                self.model.initialize_npmi_loss(self.npmi_matrix)
         return x_size


@@ -644,6 +642,7 @@ class BaseBowEstimator(BaseEstimator):
         X_data = train_dataloader.dataset.data
         train_dataloader = SingletonWrapperLoader(train_dataloader)
         train_X_size = X_data.shape
+        print("**** Setting up model with biases")
         _ = self.setup_model_with_biases(X_data)

         if aux_X is not None:
@@ -718,7 +717,7 @@ class BowEstimator(BaseBowEstimator):

        Returns:
            Tuple of:
-                elbo, kl_loss, rec_loss,
+                elbo, kl_loss, rec_loss, reconstruction
        """
        return model(data)

@@ -822,6 +821,8 @@ class BowMetricEstimator(BowEstimator):
         model = self._get_model()
         tr_bow_matrix = self._get_bow_matrix(train_data)
         model.initialize_bias_terms(tr_bow_matrix.sum(axis=0))
+        if self.npmi_matrix is not None:
+            self.model.initialize_npmi_loss(self.npmi_matrix)
         return model

     def _forward(self, model, data):
@@ -936,171 +937,6 @@ class BowMetricEstimator(BowEstimator):



-class CovariateBowEstimator(BaseBowEstimator):
-
-    def __init__(self, *args, n_covars=0, **kwargs):
-
-        super().__init__(*args, **kwargs)
-
-        self.covar_net_layers = 1 ### XXX - temp hardcoded
-        self.n_covars = n_covars
-
-
-    @classmethod
-    def from_config(cls, n_covars, *args, **kwargs):
-        est = super().from_config(*args, **kwargs)
-        est.n_covars = n_covars
-        return est
-
-    def _get_model(self):
-        """
-        Returns
-        MXNet model initialized using provided hyperparameters
-        """
-        if self.embedding_source != 'random':
-            #e_type, e_name = tuple(self.embedding_source.split(':'))
-            pt_embedding = pretrained_aliases('glove.6B.100d')
-            pretrained = pt_embedding.get_vecs_by_tokens(self.vocabulary)
-            emb_size = 100
-            #pt_embedding = nlp.embedding.create(e_type, source=e_name)
-            #self.vocabulary.set_embedding(pt_embedding)
-            #emb_size = len(self.vocabulary.embedding.idx_to_vec[0])
-            #for word in self.vocabulary.embedding._idx_to_token:
-            #    if (self.vocabulary.embedding[word] == mx.nd.zeros(emb_size)).sum() == emb_size:
-            #        self.vocabulary.embedding[word] = mx.nd.random.normal(0, 0.1, emb_size)
-        else:
-            emb_size = self.embedding_size
-        model = \
-            CovariateBowVAEModel(n_covars=self.n_covars,
-                                 enc_dim=self.enc_hidden_dim, embedding_size=emb_size,
-                                 fixed_embedding=self.fixed_embedding, latent_distribution=self.latent_distribution,
-                                 coherence_reg_penalty=self.coherence_reg_penalty, redundancy_reg_penalty=self.redundancy_reg_penalty,
-                                 ctx=self.ctx)
-        return model
-
-
-    def _get_losses(self, model, batch_data):
-        # batch_data has form: ((data, covars),)
-        (data,covars), = batch_data
-        data = data.to(self.device)
-        covars = covars.to(self.device)
-        elbo_ls, kl_ls, rec_ls, coherence_loss, red_ls, predicted_labels = \
-            self._forward(self.model, data, covars)
-        total_ls = elbo_ls.mean()
-        label_ls = mx.nd.zeros(total_ls.shape)
-        return elbo_ls, kl_ls, rec_ls, red_ls, label_ls, total_ls
-
-
-    def _get_config(self):
-        config = super()._get_config()
-        config['n_covars'] = self.n_covars
-        return config
-
-
-    def _forward(self,
-                 model: BowVAEModel,
-                 data: torch.Tensor,
-                 covars: torch.Tensor) -> Tuple[torch.Tensor,
-                                                torch.Tensor,
-                                                torch.Tensor,
-                                                torch.Tensor,
-                                                torch.Tensor,
-                                                torch.Tensor,
-                                                torch.Tensor] :
-        """
-        Forward pass of BowVAE model given the supplied data
-
-        Parameters:
-            model: Model that returns elbo, kl_loss, rec_loss, l1_pen, coherence_loss, redundancy_loss, reconstruction
-            data: Document word matrix of shape (n_train_samples, vocab_size)
-            covars: Covariate matrix. shape [n_samples, n_covars]
-
-        Returns:
-            (tuple): Tuple of:
-                elbo, kl_loss, rec_loss, l1_pen, coherence_loss, redundancy_loss, reconstruction
-        """
-        self.train_data = data
-        self.train_labels = covars
-        return model(data, covars)
-
-
-    def _npmi_per_covariate(self, X, y, k=10):
-        """
-        Calculate NPMI(Normalized Pointwise Mutual Information) for each covariate for data X
-
-        Parameters:
-            X (array-like or sparse matrix): Document word matrix. shape [n_samples, vocab_size]
-            y (array-like or sparse matrix): Covariate matrix. shape [n_samples, n_covars]
-            k (int): Threshold at which to compute npmi. optional (default=10)
-
-        Returns:
-            (dict): Dictionary of npmi scores for each covariate.
-        """
-        X_train = X.toarray()
-        y_train = y
-        covars = np.unique(y_train, axis=0)
-        covar_npmi = {}
-        npmi_total = 0
-        for covar in covars:
-            mask = (y_train == covar).all(axis=1)
-            X_covar, y_covar = torch.tensor(X_train[mask], dtype='float'), torch.tensor(y_train[mask], dtype='float')
-            sorted_ids = self.model.get_ordered_terms_with_covar_at_data(X_covar,k, y_covar)
-            top_k_words_per_topic = [[int(i) for i in list(sorted_ids[:k, t].asnumpy())] for t in range(self.n_latent)]
-            npmi_eval = EvaluateNPMI(top_k_words_per_topic)
-            npmi = npmi_eval.evaluate_csr_mat(X_covar)
-
-            #if(self.label_map):
-            #    covar_key = covar[0]
-            #else:
-            #    covar_key = np.where(covar)[0][0]
-            covar_keky = covar[0]
-            covar_npmi[covar_key] = npmi
-            npmi_total += npmi
-        return npmi_total / len(covars)
-
-    def _npmi(self, X, k=10):
-        return super()._npmi(X, k=k)
-        #return self._npmi_per_covariate(X, y, k)
-
-    def _get_objective_from_validation_result(self, v_res):
-        return v_res['npmi']
-
-    def validate(self, X, y):
-        npmi, redundancy = self._npmi(X)
-        return {'npmi': npmi, 'redundancy': redundancy, 'ppl': 0.0}
-
-    def get_topic_vectors(self) -> torch.Tensor:
-        """
-        Get topic vectors of the fitted model.
-
-        Returns:
-            topic_vectors: Topic word distribution. topic_distribution[i, j] represents word j in topic i.
-            shape=(n_latent, vocab_size)
-        """
-
-        return self.model.get_topic_vectors(self.train_data, self.train_labels)
-
-    def initialize_with_pretrained(self):
-        assert(self.pretrained_param_file is not None)
-        self.model = self._get_model()
-        self.model.load_parameters(self.pretrained_param_file, allow_missing=False)
-
-
-    def transform(self, X: sp.csr.csr_matrix, y: np.ndarray):
-        """
-        Transform data X and y according to the fitted model.
-
-        Parameters:
-            X: Document word matrix of shape {n_samples, n_features)
-            y: Covariate matrix of shape (n_train_samples, n_covars)
-
-        Returns:
-            Document topic distribution for X and y of shape=(n_samples, n_latent)
-        """
-        x_mxnet, y_mxnet = mx.nd.array(X, dtype=np.float32), mx.nd.array(y, dtype=np.float32)
-        return self.model.encode_data_with_covariates(x_mxnet, y_mxnet).asnumpy()
-
-
 class SeqBowEstimator(BaseEstimator):

     def __init__(self, *args,
@@ -1213,6 +1049,9 @@ class SeqBowEstimator(BaseEstimator):
         model = self._get_model()
         tr_bow_counts = self._get_bow_wd_counts(train_data)
         model.initialize_bias_terms(tr_bow_counts)
+        if self.npmi_matrix is not None:
+            print("****** INITIALIZING NPMI LOSS FUNCTION *******")
+            model.initialize_npmi_loss(self.npmi_matrix)
         return model

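The estimator changes above add an optional `npmi_matrix` constructor argument and, when it is supplied, hand it to the model through `initialize_npmi_loss` once the bias terms have been set up. Below is a minimal sketch of that hand-off only; `DummyEstimator` and `DummyModel` are illustrative stand-ins written for this note, not classes from tmnt, and the real estimators take many more arguments.

```python
# Illustrative sketch of the new npmi_matrix hand-off (not tmnt API).
from typing import Optional
import torch

class DummyModel:
    def initialize_npmi_loss(self, npmi_matrix: torch.Tensor) -> None:
        # the real BowVAEModel wraps the matrix in an NPMILossWithDiversity module
        print("NPMI loss initialized with matrix of shape", tuple(npmi_matrix.shape))

class DummyEstimator:
    def __init__(self, npmi_matrix: Optional[torch.Tensor] = None):
        self.npmi_matrix = npmi_matrix          # mirrors the new __init__ argument
        self.model = DummyModel()

    def setup_model(self) -> None:
        # mirrors setup_model_with_biases(): only wire the loss when a matrix was given
        if self.npmi_matrix is not None:
            self.model.initialize_npmi_loss(self.npmi_matrix)

DummyEstimator(npmi_matrix=torch.zeros(50, 50)).setup_model()
```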
tmnt/eval_npmi.py
CHANGED
@@ -10,9 +10,13 @@ from collections import Counter
 import numpy as np
 import scipy
 import scipy.sparse
+from tqdm import tqdm

 from tmnt.utils.ngram_helpers import BigramReader
 from itertools import combinations
+from gensim.models.coherencemodel import CoherenceModel
+from tmnt.preprocess.vectorizer import TMNTVectorizer
+from gensim.corpora.dictionary import Dictionary

 __all__ = ['NPMI', 'EvaluateNPMI']

@@ -22,7 +26,6 @@ class NPMI(object):
         self.unigram_cnts = unigram_cnts
         self.bigram_cnts = bigram_cnts
         self.n_docs = n_docs
-

     def wd_id_pair_npmi(self, w1: int, w2: int):
         cw1 = self.unigram_cnts.get(w1, 0.0)
@@ -89,41 +92,57 @@ class EvaluateNPMI(object):
             total_npmi += total_topic_npmi
         return total_npmi / len(self.top_k_words_per_topic)

-    def
+    def get_full_vocab_npmi_matrix(self, mat):
+        vocab_size = mat.shape[1]
+        npmi_matrix = np.zeros((vocab_size, vocab_size))
+        n_docs = mat.shape[0]
+        if isinstance(mat, scipy.sparse.csr.csr_matrix):
+            is_sparse = True
+        for (w1, w2) in tqdm(combinations(np.arange(vocab_size), 2)):
+            o_1 = mat[:, w1] > 0
+            o_2 = mat[:, w2] > 0
+            if is_sparse:
+                o_1 = o_1.toarray().squeeze()
+                o_2 = o_2.toarray().squeeze()
+            occur_1 = np.array(o_1, dtype='int')
+            occur_2 = np.array(o_2, dtype='int')
+            unigram_1 = occur_1.sum()
+            unigram_2 = occur_2.sum()
+            bigram_cnt = np.sum(occur_1 * occur_2)
+            if bigram_cnt < 1:
                 npmi = 0.0
+            else:
+                npmi = (log10(n_docs) + log10(bigram_cnt) - log10(unigram_1) - log10(unigram_2)) / (log10(n_docs) - log10(bigram_cnt) + 1e-4)
+            npmi_matrix[w1, w2] = npmi
+        return npmi_matrix
+
+
+class FullNPMI(object):
+
+    def get_full_vocab_npmi_matrix(self, mat: scipy.sparse.csr_matrix, tf: TMNTVectorizer):
+        corpus = []
+        npmi_matrix = np.zeros((tf.vocab_size, tf.vocab_size))
+        for ri in range(mat.shape[0]):
+            row = mat.getrow(ri)
+            corpus.append(list(zip(row.indices, row.data)))
+        topics = [ list(range(mat.shape[1])) ]
+        dictionary = Dictionary()
+        dictionary.id2token = tf.get_vocab().get_itos()
+        dictionary.token2id = tf.get_vocab().get_stoi()
+        cm = CoherenceModel(topics=topics, corpus=corpus, dictionary=dictionary, coherence='u_mass', topn=len(topics[0]))
+        segmented_topics = cm.measure.seg(cm.topics)
+        accumulator = cm.estimate_probabilities(segmented_topics)
+        num_docs = accumulator.num_docs
+        eps = 1e-12
+        for w1, w2 in tqdm(segmented_topics[0]):
+            w1_count = accumulator[w1]
+            w2_count = accumulator[w2]
+            co_occur_count = accumulator[w1, w2]
+            p_w1_w2 = co_occur_count / num_docs
+            p_w1 = w1_count / num_docs
+            p_w2 = w2_count / num_docs
+            npmi_matrix[w1, w2] = np.log((p_w1_w2 + eps) / (p_w1 * p_w2)) / -np.log(p_w1_w2 + eps)
+        return npmi_matrix

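The pairwise score the new `get_full_vocab_npmi_matrix` fills in is standard NPMI over document frequencies, log(P(w1,w2) / (P(w1)P(w2))) / -log(P(w1,w2)), written with base-10 logs and a small constant guarding the denominator. A worked example with made-up counts, using the same expression as the added code:

```python
# Worked NPMI example; the counts are invented for illustration.
from math import log10

n_docs = 1000        # documents in the corpus
unigram_1 = 120      # documents containing w1
unigram_2 = 80       # documents containing w2
bigram_cnt = 40      # documents containing both w1 and w2

npmi = (log10(n_docs) + log10(bigram_cnt) - log10(unigram_1) - log10(unigram_2)) \
       / (log10(n_docs) - log10(bigram_cnt) + 1e-4)
print(npmi)  # ~0.44; 1.0 would mean the pair always co-occurs, values near 0 indicate independence
```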
tmnt/inference.py
CHANGED
@@ -10,8 +10,8 @@ import io
 import os
 import torch
 import pickle
-from tmnt.modeling import BowVAEModel,
-from tmnt.estimator import BowEstimator,
+from tmnt.modeling import BowVAEModel, SeqBowVED, MetricSeqBowVED
+from tmnt.estimator import BowEstimator, SeqBowEstimator, SeqBowMetricEstimator
 from tmnt.data_loading import SparseDataLoader
 from tmnt.preprocess.vectorizer import TMNTVectorizer
 from tmnt.utils.recalibrate import recalibrate_scores
@@ -54,9 +54,6 @@ class BaseInferencer(object):
     def get_top_k_words_per_topic(self, k):
         raise NotImplementedError

-    def get_top_k_words_per_topic_per_covariate(self, k):
-        raise NotImplementedError
-
     def get_pyldavis_details(self, sp_vec_file_or_X, y=None):
         w_pr, dt_matrix, doc_lengths, term_cnts = self.get_model_details(sp_vec_file_or_X, y=y)
         d1 = w_pr.cpu().detach().numpy().tolist()
@@ -80,12 +77,7 @@ class BowVAEInferencer(BaseInferencer):
         self.vocab = estimator.vocabulary
         self.n_latent = estimator.n_latent
         self.model = estimator.model
-            self.covar_model = True
-            self.n_covars = estimator.model.n_covars
-            self.covar_net_layers = estimator.model.covar_net_layers
-        else:
-            self.covar_model = False
+        self.covar_model = False

     @classmethod
     def from_saved(cls, model_dir=None, device='cpu'):
@@ -96,12 +88,7 @@ class BowVAEInferencer(BaseInferencer):
         serialized_vectorizer_file = os.path.join(model_dir,'vectorizer.pkl')
         with io.open(config_file, 'r') as f:
             config_dict = json.load(f)
-            estimator = CovariateBowEstimator.from_config(config_dict['n_covars'],
-                                                          config_file, vocab_file,
-                                                          pretrained_param_file=param_file)
-        else:
-            estimator = BowEstimator.from_saved(model_dir)
+        estimator = BowEstimator.from_saved(model_dir)
         estimator.initialize_with_pretrained()
         if os.path.exists(serialized_vectorizer_file):
             with open(serialized_vectorizer_file, 'rb') as fp:
@@ -174,16 +161,12 @@ class BowVAEInferencer(BaseInferencer):
         for _, (data,labels) in enumerate(infer_iter):
             with torch.no_grad():
                 data = data.to(self.device)
-                if self.model.multilabel:
-                    preds = list(self.model.predict(data).sigmoid().detach().numpy())
-                else:
-                    preds = list(self.model.predict(data).softmax(dim=1).detach().numpy())
+                encs = self.model.encode_data(data, include_bn=include_bn)
+                if include_predictions:
+                    if self.model.multilabel:
+                        preds = list(self.model.predict(data).sigmoid().detach().numpy())
+                    else:
+                        preds = list(self.model.predict(data).softmax(dim=1).detach().numpy())
                 if use_probs:
                     #e1 = (encs - encs.min(dim=1).unsqueeze(1)).astype('float64')
                     e1 = (encs - encs.min(dim=1)[0].unsqueeze(1))
@@ -233,31 +216,6 @@ class BowVAEInferencer(BaseInferencer):
         return topic_terms


-    def get_top_k_words_per_topic_per_covariate(self, k):
-        n_topics = self.n_latent
-        w = self.model.cov_decoder.cov_inter_decoder.collect_params().get('weight').data()
-        n_covars = int(w.shape[1] / n_topics)
-        topic_terms = []
-        for i in range(n_covars):
-            cv_i_slice = w[:, (i * n_topics):((i+1) * n_topics)]
-            sorted_ids = cv_i_slice.argsort(dim=0, is_ascend=False)
-            cv_i_terms = []
-            for t in range(n_topics):
-                top_k = [ self.vocab.lookup_token(int(i)) for i in list(sorted_ids[:k, t].asnumpy()) ]
-                cv_i_terms.append(top_k)
-            topic_terms.append(cv_i_terms)
-        return topic_terms
-
-    def get_covariate_model_details(self):
-        ## 1) C x K x W tensor with |C| P(term|topic) probability matricies where |C| is number of co-variates
-        w = self.model.cov_decoder.cov_inter_decoder.collect_params().get('weight').data().transpose()
-        w_rsh = w.reshape(-1,self.n_latent, w.shape[1])
-        return w_rsh.softmax(dim=2)
-
-
-    def get_top_k_words_per_topic_over_scalar_covariate(self, k, min_v=0.0, max_v=1.0, step=0.1):
-        raise NotImplemented
-
     def predict_text(self, txt: List[str], pred_threshold: float = 0.5) -> Tuple[List[str], List[np.ndarray], np.ndarray]:
         """Take a list of input documents/passages as strings and return document encodings (topics) and classification outputs

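Inside the new `include_predictions` branch above, classifier outputs are post-processed with a sigmoid for multilabel models and a row-wise softmax otherwise, exactly as in the lines shown. A small self-contained illustration of that difference; the logits are made up:

```python
# Sigmoid vs. softmax post-processing of classifier logits (illustrative values).
import torch

logits = torch.tensor([[2.0, -1.0, 0.5]])     # one document, three labels/classes

multilabel_probs = logits.sigmoid()           # independent per-label probabilities
singlelabel_probs = logits.softmax(dim=1)     # class probabilities that sum to 1 per row

print(multilabel_probs.sum().item())          # generally != 1
print(singlelabel_probs.sum().item())         # ~1.0
```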
tmnt/modeling.py
CHANGED
@@ -36,7 +36,8 @@ class BaseVAE(nn.Module):

         self.latent_distribution = latent_distribution
         self.decoder = nn.Linear(self.n_latent, self.vocab_size).to(device)
-
+        self.npmi_with_diversity_loss : Optional[NPMILossWithDiversity] = None
+        self.npmi_alpha = 0.7

     def initialize_bias_terms(self, wd_freqs: Optional[np.ndarray]):
         if wd_freqs is not None:
@@ -46,6 +47,10 @@ class BaseVAE(nn.Module):
             self.decoder.bias = nn.Parameter(torch.tensor(log_freq, dtype=torch.float32, device=self.device))
             self.decoder.bias.requires_grad_(False)

+    def initialize_npmi_loss(self, npmi_mat):
+        t_npmi_mat = torch.Tensor(npmi_mat, device=self.device)
+        self.npmi_with_diversity_loss = NPMILossWithDiversity(t_npmi_mat, device=self.device, alpha=self.npmi_alpha)
+
     def get_ordered_terms(self):
         """
         Returns the top K terms for each topic based on sensitivity analysis. Terms whose
@@ -62,26 +67,29 @@ class BaseVAE(nn.Module):
         """
         Returns unnormalized topic vectors
         """
-        z = torch.ones((
+        z = torch.ones((self.n_latent,), device=self.device)
         jacobian = torch.autograd.functional.jacobian(self.decoder, z)
-        return jacobian.cpu().
+        return jacobian.cpu().numpy()


-    def
-    if self.
+    def add_npmi_and_diversity_loss(self, cur_loss):
+        if self.npmi_with_diversity_loss:
+            z = torch.ones((self.n_latent,), device=self.device)
+            jacobian = torch.autograd.functional.jacobian(self.decoder, z)
+            npmi_loss = self.npmi_with_diversity_loss(jacobian)
+            npmi_loss = npmi_loss.sum()
+            print("npmi loss = {}".format(npmi_loss))
+            return (cur_loss + npmi_loss)
         else:
-            return
+            return cur_loss

+
+    def get_loss_terms(self, data, y, KL):
         rr = data * torch.log(y+1e-12)
         recon_loss = -(rr.sum(dim=1))
         i_loss = KL + recon_loss
-        ii_loss
-        return ii_loss, recon_loss
+        ii_loss = self.add_npmi_and_diversity_loss(i_loss)
+        return ii_loss, recon_loss


 class BowVAEModel(BaseVAE):
@@ -224,14 +232,14 @@ class BowVAEModel(BaseVAE):
         z, KL = self.latent_distribution(enc_out, batch_size)
         xhat = self.decoder(z)
         y = torch.nn.functional.softmax(xhat, dim=1)
-        ii_loss, recon_loss
-            self.get_loss_terms(data, y, KL
+        ii_loss, recon_loss = \
+            self.get_loss_terms(data, y, KL)
         if self.has_classifier:
             mu_out = self.latent_distribution.get_mu_encoding(enc_out)
             classifier_outputs = self.classifier(self.lab_dr(mu_out))
         else:
             classifier_outputs = None
-        return ii_loss, KL, recon_loss,
+        return ii_loss, KL, recon_loss, classifier_outputs


 class MetricBowVAEModel(BowVAEModel):
@@ -241,13 +249,6 @@ class MetricBowVAEModel(BowVAEModel):
         super(MetricBowVAEModel, self).__init__(*args, **kwargs)


-    def get_redundancy_penalty(self):
-        w = self.decoder.weight.data
-        emb = self.embedding.weight.data if self.embedding is not None else w.transpose()
-        _, redundancy_loss = self.coherence_regularization(w, emb)
-        return redundancy_loss
-
-
     def _get_elbo(self, bow, enc):
         batch_size = bow.shape[0]
         z, KL = self.latent_distribution(enc, batch_size)
@@ -277,159 +278,61 @@ class MetricBowVAEModel(BowVAEModel):
         return (elbo1 + elbo2), (rec_loss1 + rec_loss2), (KL_loss1 + KL_loss2), redundancy_loss, mu1, mu2


-class
-    """Bag-of-words topic model with labels used as co-variates
-    """
-    def __init__(self, covar_net_layers=1, *args, **kwargs):
-        super(CovariateBowVAEModel, self).__init__(*args, **kwargs)
-        self.covar_net_layers = covar_net_layers
-        with self.name_scope():
-            if self.n_covars < 1:
-                self.cov_decoder = ContinuousCovariateModel(self.n_latent, self.vocab_size,
-                                                            total_layers=self.covar_net_layers, device=self.device)
-            else:
-                self.cov_decoder = CovariateModel(self.n_latent, self.n_covars, self.vocab_size,
-                                                  interactions=True, device=self.device)
-
-
-    def encode_data_with_covariates(self, data, covars, include_bn=False):
-        """
-        Encode data to the mean of the latent distribution defined by the input `data`
-        """
-        emb_out = self.embedding(data)
-        enc_out = self.encoder(mx.nd.concat(emb_out, covars))
-        return self.latent_distribution.get_mu_encoding(enc_out, include_bn=include_bn)
-
-
-    def get_ordered_terms_with_covar_at_data(self, data, k, covar):
-        """
-        Uses test/training data-point as the input points around which term sensitivity is computed
-        """
-        data = data.to(self.device)
-        covar = covar.to(self.device)
-        jacobian = torch.zeros((self.vocab_size, self.n_latent), device=self.device)
-
-        batch_size = data.shape[0]
-        emb_out = self.embedding(data)
-
-        co_emb = torch.cat(emb_out, covar)
-        z = self.latent_distribution.get_mu_encoding(self.encoder(co_emb))
-        z.attach_grad()
-        outputs = []
-        with mx.autograd.record():
-            dec_out = self.decoder(z)
-            cov_dec_out = self.cov_decoder(z, covar)
-            y = mx.nd.softmax(cov_dec_out + dec_out, axis=1)
-            for i in range(self.vocab_size):
-                outputs.append(y[:,i])
-        for i, output in enumerate(outputs):
-            output.backward(retain_graph=True)
-            jacobian[i] += z.grad.sum(axis=0)
-        sorted_j = jacobian.argsort(axis=0, is_ascend=False)
-        return sorted_j
+class NPMILossWithDiversity(nn.Module):

-    def
-        covar = covar.as_in_context(self.model_ctx)
-        jacobian = mx.nd.zeros(shape=(self.vocab_size, self.n_latent), ctx=self.model_ctx)
-
-        batch_size = data.shape[0]
-        emb_out = self.embedding(data)
-
-        co_emb = mx.nd.concat(emb_out, covar)
-        z = self.latent_distribution.get_mu_encoding(self.encoder(co_emb))
-        z.attach_grad()
-        outputs = []
-        with mx.autograd.record():
-            dec_out = self.decoder(z)
-            cov_dec_out = self.cov_decoder(z, covar)
-            y = mx.nd.softmax(cov_dec_out + dec_out, axis=1)
-            for i in range(self.vocab_size):
-                outputs.append(y[:,i])
-        for i, output in enumerate(outputs):
-            output.backward(retain_graph=True)
-            jacobian[i] += z.grad.sum(axis=0)
-        return jacobian
-
-
-    def forward(self, F, data, covars):
-        batch_size = data.shape[0]
-        emb_out = self.embedding(data)
-        if self.n_covars > 0:
-            covars = F.one_hot(covars, self.n_covars)
-        co_emb = F.concat(emb_out, covars)
-        z, KL = self.run_encode(F, co_emb, batch_size)
-        dec_out = self.decoder(z)
-        cov_dec_out = self.cov_decoder(z, covars)
-        y = F.softmax(dec_out + cov_dec_out, axis=1)
-        ii_loss, recon_loss, coherence_loss, redundancy_loss = \
-            self.get_loss_terms(F, data, y, KL, batch_size)
-        return ii_loss, KL, recon_loss, coherence_loss, redundancy_loss, None
-
-
-class CovariateModel(nn.Module):
-
-    def __init__(self, n_topics, n_covars, vocab_size, interactions=False, device='cpu'):
-        self.n_topics = n_topics
-        self.n_covars = n_covars
-        self.vocab_size = vocab_size
-        self.interactions = interactions
+    def __init__(self, npmi_matrix: torch.Tensor, device: torch.device, k=20, alpha=0.7, use_diversity_loss=True):
+        super(NPMILossWithDiversity, self).__init__()
+        self.alpha = alpha
+        self.npmi_matrix = npmi_matrix
+        self.use_diversity_loss = use_diversity_loss
         self.device = device
-
-        with self.name_scope():
-            self.cov_decoder = torch.nn.Linear(n_covars, self.vocab_size, bias=False)
-            if self.interactions:
-                self.cov_inter_decoder = torch.nn.Linear(self.n_covars * self.n_topics, self.vocab_size, bias=False)
-            self.apply(self._init_weights)
-
-    def _init_weights(self, module):
-        if isinstance(module, torch.nn.Linear):
-            torch.nn.init.xavier_uniform_(module.weight.data)
-
-
-    def forward(self, topic_distrib, covars):
-        score_C = self.cov_decoder(covars)
-        if self.interactions:
-            td_rsh = topic_distrib.unsqueeze(1)
-            cov_rsh = covars.unsqueeze(2)
-            cov_interactions = cov_rsh * td_rsh ## shape (N, Topics, Covariates) -- outer product
-            batch_size = cov_interactions.shape[0]
-            cov_interactions_rsh = torch.reshape(cov_interactions, (batch_size, self.n_topics * self.n_covars))
-            score_CI = self.cov_inter_decoder(cov_interactions_rsh)
-            return score_CI + score_C
-        else:
-            return score_C
-
-
-class ContinuousCovariateModel(nn.Module):
-
-    def __init__(self, n_topics, vocab_size, total_layers = 1, device='device'):
-        self.n_topics = n_topics
-        self.n_scalars = 1 # number of continuous variables
-        self.model_ctx = ctx
-        self.time_topic_dim = 300
-        super(ContinuousCovariateModel, self).__init__()
-
-        with self.name_scope():
-            self.cov_decoder = nn.Sequential()
-            for i in range(total_layers):
-                if i < 1:
-                    in_units = self.n_scalars + self.n_topics
-                else:
-                    in_units = self.time_topic_dim
-                self.cov_decoder.add_module("linear_"+str(i), nn.Linear(in_units, self.time_topic_dim,
-                                                                        bias=(i < 1)))
-                self.cov_decoder.add_module("relu_"+str(i), nn.Relu())
-            self.cov_decoder.add_module("linear_out_", nn.Linear(self.time_topic_dim, vocab_size, bias=False))
-
-    def forward(self, topic_distrib, scalars):
-        inputs = torch.cat((topic_distrib, scalars), 0)
-        sc_transform = self.cov_decoder(inputs)
-        return sc_transform
+        self.k = k

+    def _row_wise_normalize_inplace(self, x, mask=None):
+        for row_idx, row in enumerate(x):
+            if mask != None:
+                row_mask = mask[row_idx]
+                row = row[row_mask]
+                x[row_idx][row_mask] = (row - row.min()) / (row.max() - row.min())
+            else:
+                row_min = row.min().item()
+                row_max = row.max().item()
+                x[row_idx] = (row - row_min)/(row_max - row_min)
+        return x
+
+    def _get_npmi_loss(self, jacobian):
+        #z = torch.ones((self.n_latent,), device=self.device)
+        #jacobian = torch.autograd.functional.jacobian(self.decoder, z)
+        #beta = self.model.get_topic_vectors().t() # |T| x |V|
+        beta = jacobian.t()
+        n_topics = beta.shape[0]
+        self.npmi_matrix.fill_diagonal_(1)
+        topk_idx = torch.topk(beta, self.k, dim=1)[1]
+        topk_mask = torch.zeros_like(beta)
+        for row_idx, indices in enumerate(topk_idx):
+            topk_mask[row_idx, indices] = 1
+        beta_mask = (1 - topk_mask) * -99999
+        topk_mask = topk_mask.bool()
+        topk_softmax_beta = torch.softmax(beta + beta_mask, dim=1)
+        softmax_beta = torch.softmax(beta, dim=1)
+        weighted_npmi = 1 - self._row_wise_normalize_inplace(torch.matmul(topk_softmax_beta.detach(), self.npmi_matrix))
+        #print("Weighted_npmi sum = {}".format(weighted_npmi.sum()))
+        npmi_loss = 100 * (softmax_beta ** 2) * weighted_npmi
+        if self.use_diversity_loss:
+            diversity_mask = torch.zeros_like(beta).bool()
+            for topic_idx in range(n_topics):
+                other_rows_mask = torch.ones(n_topics).bool().to(self.device)
+                other_rows_mask[topic_idx] = False
+                diversity_mask[topic_idx] = topk_mask[other_rows_mask].sum(0) > 0
+            #print("Diversity mask sum = {}".format(diversity_mask.sum()))
+            npmi_loss = ( self.alpha * torch.masked_select(npmi_loss, diversity_mask)).sum() + \
+                        ((1 - self.alpha) * torch.masked_select(npmi_loss, ~diversity_mask)).sum()
+        npmi_loss *= 2
+        return npmi_loss
+
+    def forward(self, beta):
+        return self._get_npmi_loss(beta)
+


 class CoherenceRegularizer(nn.Module):
@@ -570,9 +473,10 @@ class SeqBowVED(BaseSeqBowVED):
             classifier_outputs = self.classifier(z_mu)
         else:
             classifier_outputs = None
+        redundancy_loss = entropy_loss
+        ii_loss = self.add_npmi_and_diversity_loss(elbo)
         redundancy_loss = entropy_loss #self.get_redundancy_penalty()
-
-        return elbo, rec_loss, KL_loss, redundancy_loss, classifier_outputs
+        return ii_loss, rec_loss, KL_loss, redundancy_loss, classifier_outputs


 class MetricSeqBowVED(BaseSeqBowVED):
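The `NPMILossWithDiversity` module added above consumes the decoder Jacobian (a vocab-by-topics sensitivity matrix) together with a full-vocabulary NPMI matrix and returns a scalar penalty, which `add_npmi_and_diversity_loss` adds to the ELBO-style loss. A minimal sketch of driving it in isolation, assuming tmnt 0.7.52b20240603 is installed; the random tensors are stand-ins for a real NPMI matrix and a real decoder Jacobian:

```python
# Standalone sketch of the new NPMI-with-diversity penalty (stand-in inputs).
import torch
from tmnt.modeling import NPMILossWithDiversity

vocab_size, n_topics = 200, 20
npmi_mat = torch.rand(vocab_size, vocab_size)                      # stand-in NPMI matrix (V x V)
jacobian = torch.randn(vocab_size, n_topics, requires_grad=True)   # stand-in decoder Jacobian (V x T)

loss_fn = NPMILossWithDiversity(npmi_mat, device=torch.device('cpu'), k=20, alpha=0.7)
penalty = loss_fn(jacobian).sum()   # forward() transposes to (T x V) and scores the top-k terms per topic
penalty.backward()                  # gradients flow back through softmax(beta), as during training
print(float(penalty))
```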
{tmnt-0.7.52b20240601.dist-info → tmnt-0.7.52b20240603.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: tmnt
-Version: 0.7.
+Version: 0.7.52b20240603
 Summary: Topic modeling neural toolkit
 Home-page: https://github.com/mitre/tmnt.git
 Author: The MITRE Corporation
@@ -29,7 +29,7 @@ Requires-Dist: sacremoses >=0.0.38
 Requires-Dist: sentence-splitter ==1.4
 Requires-Dist: umap-learn[plot] >=0.5.5
 Requires-Dist: numba
-Requires-Dist: scipy
+Requires-Dist: scipy ==1.12.0
 Requires-Dist: tabulate >=0.8.7
 Requires-Dist: torch >=2.1.2
 Requires-Dist: torchtext >=0.13.0
{tmnt-0.7.52b20240601.dist-info → tmnt-0.7.52b20240603.dist-info}/RECORD
CHANGED
@@ -2,10 +2,10 @@ tmnt/__init__.py,sha256=EPNq1H7UMyMewWT_zTGBaC7ZouvCywX_gMX4G1dtmvw,250
 tmnt/configuration.py,sha256=P8PEhzVPKO5xG0FrdTLRQ60OYWigbzPY-OSx_hzQlrY,10054
 tmnt/data_loading.py,sha256=A0tsM6x61BGhYBV6rAYdryz2NwbR__8EAYj_Q4Z-DCs,18736
 tmnt/distribution.py,sha256=Pmyc5gwDd_-jP7vLVb0vdNQaSSvF1EuiTZEWg3KfmI8,10866
-tmnt/estimator.py,sha256=
-tmnt/eval_npmi.py,sha256=
-tmnt/inference.py,sha256=
-tmnt/modeling.py,sha256=
+tmnt/estimator.py,sha256=qqb3zYCUGY53bcXjUK_B7_yvLkjiMYAeYaPk5XoFxnY,70622
+tmnt/eval_npmi.py,sha256=DTW9dNHVe6H57gndQIZ4gX9EghuBstwznA3YBqILJk0,5820
+tmnt/inference.py,sha256=da8qAnjTDTuWQfPEOQewOfgikqE00XT1xGMiO2mckI4,15679
+tmnt/modeling.py,sha256=RhabXB8f9ZliOOgQVJiwwnEnvdK-oil7fGe4prDiPjc,30508
 tmnt/preprocess/__init__.py,sha256=gwMejkQrnqKS05i0JVsUru2hDUR5jE1hKC10dL934GU,170
 tmnt/preprocess/tokenizer.py,sha256=-ZgowfbHrM040vbNTktZM_hdl6HDTqxSJ4mDAxq3dUs,14050
 tmnt/preprocess/vectorizer.py,sha256=RkdivqP76qAJDianV09lONad9NbfBVWLZgIbU_P1-zo,15796
@@ -17,9 +17,9 @@ tmnt/utils/ngram_helpers.py,sha256=VrIzou2oQHCLBLSWODDeikN3PYat1NqqvEeYQj_GhbA,1
 tmnt/utils/pubmed_utils.py,sha256=3sHwoun7vxb0GV-arhpXLMUbAZne0huAh9xQNy6H40E,1274
 tmnt/utils/random.py,sha256=qY75WG3peWoMh9pUyCPBEo6q8IvkF6VRjeb5CqJOBF8,327
 tmnt/utils/recalibrate.py,sha256=TmpB8An8bslICZ13UTJfIvr8VoqiSedtpHxec4n8CHk,1439
-tmnt-0.7.
-tmnt-0.7.
-tmnt-0.7.
-tmnt-0.7.
-tmnt-0.7.
-tmnt-0.7.
+tmnt-0.7.52b20240603.dist-info/LICENSE,sha256=qFZJrfJ7Zi4IXDiyiGVrHWic_l1h2tc36tI8Z7rK9bs,11356
+tmnt-0.7.52b20240603.dist-info/METADATA,sha256=9ep8zbq62Jahe1jps7Qjt54XDcIZTxIqKsR7i65jyDY,1461
+tmnt-0.7.52b20240603.dist-info/NOTICE,sha256=p0kYIVAkReTFaGb4C-qPa7h5ztze6hGzOpjCMMbOipU,425
+tmnt-0.7.52b20240603.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+tmnt-0.7.52b20240603.dist-info/top_level.txt,sha256=RpYgUl187sXnqmiwKjZZdcDlHz2AALs6bGdUcukyd_E,5
+tmnt-0.7.52b20240603.dist-info/RECORD,,
{tmnt-0.7.52b20240601.dist-info → tmnt-0.7.52b20240603.dist-info}/LICENSE: File without changes
{tmnt-0.7.52b20240601.dist-info → tmnt-0.7.52b20240603.dist-info}/NOTICE: File without changes
{tmnt-0.7.52b20240601.dist-info → tmnt-0.7.52b20240603.dist-info}/WHEEL: File without changes
{tmnt-0.7.52b20240601.dist-info → tmnt-0.7.52b20240603.dist-info}/top_level.txt: File without changes