tmnt-0.7.52b20240616-py3-none-any.whl → tmnt-0.7.53-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tmnt/estimator.py CHANGED
@@ -17,7 +17,7 @@ import json
 
 from sklearn.metrics import average_precision_score, top_k_accuracy_score, roc_auc_score, ndcg_score, precision_recall_fscore_support
 from tmnt.data_loading import PairedDataLoader, SingletonWrapperLoader, SparseDataLoader, get_llm_model
-from tmnt.modeling import BowVAEModel, SeqBowVED
+from tmnt.modeling import BowVAEModel, SeqBowVED, BaseVAE
 from tmnt.modeling import CrossBatchCosineSimilarityLoss, GeneralizedSDMLLoss, MultiNegativeCrossEntropyLoss, MetricSeqBowVED, MetricBowVAEModel
 from tmnt.eval_npmi import EvaluateNPMI
 from tmnt.distribution import LogisticGaussianDistribution, BaseDistribution, GaussianDistribution, VonMisesDistribution
@@ -57,8 +57,6 @@ class BaseEstimator(object):
 latent_distribution: Latent distribution of the variational autoencoder - defaults to LogisticGaussian with 20 dimensions
 optimizer: optimizer (default = "adam")
 lr: Learning rate of training. (default=0.005)
-coherence_reg_penalty: Regularization penalty for topic coherence. optional (default=0.0)
-redundancy_reg_penalty: Regularization penalty for topic redundancy. optional (default=0.0)
 batch_size: Batch training size. optional (default=128)
 epochs : Number of training epochs. optional(default=40)
 coherence_via_encoder: Flag to use encoder to derive coherence scores (via gradient attribution)
@@ -73,26 +71,24 @@ class BaseEstimator(object):
 device: Optional[str] = 'cpu',
 latent_distribution: BaseDistribution = None,
 lr: float = 0.005,
-coherence_reg_penalty: float = 0.0,
-redundancy_reg_penalty: float = 0.0,
 batch_size: int = 128,
 epochs: int = 40,
 coherence_via_encoder: bool = False,
 pretrained_param_file: Optional[str] = None,
 warm_start: bool = False,
 npmi_matrix: Optional[torch.Tensor] = None,
+npmi_lambda: float = 0.7,
+npmi_scale: float = 100.0,
 test_batch_size: int = 0):
 self.vocabulary = vocabulary
 self.log_method = log_method
 self.quiet = quiet
-self.model = None
+self.model : Optional[BaseVAE] = None
 self.coherence_coefficient = coherence_coefficient
 self.device = device
 self.latent_distribution = latent_distribution
 self.lr = lr
 self.n_latent = self.latent_distribution.n_latent
-self.coherence_reg_penalty = coherence_reg_penalty
-self.redundancy_reg_penalty = redundancy_reg_penalty
 self.batch_size = batch_size
 self.test_batch_size = test_batch_size or batch_size
 self.epochs = epochs
@@ -102,6 +98,8 @@ class BaseEstimator(object):
 self.num_val_words = -1 ## will be set later for computing Perplexity on validation dataset
 self.latent_distribution.device = self.device
 self.npmi_matrix : Optional[torch.Tensor] = npmi_matrix ## used with NPMI loss
+self.npmi_lambda = npmi_lambda
+self.npmi_scale = npmi_scale
 
 
 def _np_one_hot(self, vec, n_outputs):
@@ -303,12 +301,9 @@ class BaseBowEstimator(BaseEstimator):
 latent_distrib = config['latent_distribution']
 n_latent = int(config['n_latent'])
 enc_hidden_dim = int(config['enc_hidden_dim'])
-coherence_reg_penalty = float(config['coherence_loss_wt'])
-redundancy_reg_penalty = float(config['redundancy_loss_wt'])
 batch_size = int(config['batch_size'])
 embedding_source = config['embedding']['source']
 fixed_embedding = config['embedding'].get('fixed') == True
-covar_net_layers = config['covar_net_layers']
 n_encoding_layers = config['num_enc_layers']
 enc_dr = config['enc_dr']
 epochs = int(config['epochs'])
@@ -334,8 +329,7 @@ class BaseBowEstimator(BaseEstimator):
 coherence_coefficient=coherence_coefficient,
 device=device, lr=lr, latent_distribution=latent_distribution,
 enc_hidden_dim=enc_hidden_dim,
-coherence_reg_penalty=coherence_reg_penalty,
-redundancy_reg_penalty=redundancy_reg_penalty, batch_size=batch_size,
+batch_size=batch_size,
 embedding_source=embedding_source, embedding_size=emb_size, fixed_embedding=fixed_embedding,
 num_enc_layers=n_encoding_layers, enc_dr=enc_dr,
 epochs=epochs, log_method='log', coherence_via_encoder=coherence_via_encoder,
@@ -353,11 +347,8 @@ class BaseBowEstimator(BaseEstimator):
 config['batch_size'] = self.batch_size
 config['num_enc_layers'] = self.n_encoding_layers
 config['enc_dr'] = self.enc_dr
-config['coherence_loss_wt'] = self.coherence_reg_penalty
-config['redundancy_loss_wt'] = self.redundancy_reg_penalty
 config['n_labels'] = self.n_labels
 config['covar_net_layers'] = 1
-config['n_covars'] = 0
 if isinstance(self.latent_distribution, VonMisesDistribution):
 config['latent_distribution'] = {'dist_type':'vmf', 'kappa': self.latent_distribution.kappa}
 elif isinstance(self.latent_distribution, LogisticGaussianDistribution):
@@ -379,7 +370,6 @@ class BaseBowEstimator(BaseEstimator):
 sp_file = os.path.join(model_dir, 'model.config')
 vocab_file = os.path.join(model_dir, 'vocab.json')
 logging.info("Model parameters, configuration and vocabulary written to {}".format(model_dir))
-#self.model.save_parameters(pfile)
 torch.save(self.model, pfile)
 config = self._get_config()
 specs = json.dumps(config, sort_keys=True, indent=4)
@@ -481,7 +471,6 @@ class BaseBowEstimator(BaseEstimator):
 return v_res
 
 def validate(self, val_X, val_y):
-#val_dataloader = SparseDataLoader(val_X, val_y, batch_size=self.test_batch_size)
 val_dataloader = SingletonWrapperLoader(SparseDataLoader(val_X, val_y, batch_size=self.test_batch_size))
 total_val_words = val_X.sum()
 if self.num_val_words < 0:
@@ -612,7 +601,7 @@ class BaseBowEstimator(BaseEstimator):
 self.model = self._get_model()
 self.model.initialize_bias_terms(wd_freqs.squeeze()) ## initialize bias weights to log frequencies
 if self.npmi_matrix is not None:
-self.model.initialize_npmi_loss(self.npmi_matrix)
+self.model.initialize_npmi_loss(self.npmi_matrix, npmi_lambda=self.npmi_lambda, npmi_scale=self.npmi_scale)
 return x_size
 
 
@@ -642,7 +631,6 @@ class BaseBowEstimator(BaseEstimator):
 X_data = train_dataloader.dataset.data
 train_dataloader = SingletonWrapperLoader(train_dataloader)
 train_X_size = X_data.shape
-print("**** Setting up model with biases")
 _ = self.setup_model_with_biases(X_data)
 
 if aux_X is not None:
@@ -746,8 +734,7 @@ class BowEstimator(BaseBowEstimator):
 gamma = self.gamma,
 multilabel = self.multilabel,
 latent_distribution=self.latent_distribution,
-coherence_reg_penalty=self.coherence_reg_penalty, redundancy_reg_penalty=self.redundancy_reg_penalty,
-n_covars=0, device=self.device)
+device=self.device)
 if self.pretrained_param_file is not None:
 model = torch.load(self.pretrained_param_file)
 model.to(self.device)
@@ -810,8 +797,7 @@ class BowMetricEstimator(BowEstimator):
 gamma = self.gamma,
 multilabel = self.multilabel,
 latent_distribution=self.latent_distribution,
-coherence_reg_penalty=self.coherence_reg_penalty, redundancy_reg_penalty=self.redundancy_reg_penalty,
-n_covars=0, device=self.device)
+device=self.device)
 if self.pretrained_param_file is not None:
 model.load_parameters(self.pretrained_param_file, allow_missing=False)
 return model
@@ -929,14 +915,9 @@ class BowMetricEstimator(BowEstimator):
 .format(epoch, v_res['avg_prec'], v_res['avg_prec'], v_res['au_roc'], v_res['ndcg'],
 v_res['top_1'], v_res['top_2'], v_res['top_3'], v_res['top_4']))
 self._output_status(" AP Scores: {}".format(v_res['ap_scores']))
-#session.report({"objective": v_res['avg_prec'], "perplexity": v_res['ppl']})
-#if self.reporter:
-#    self.reporter(epoch=epoch+1, objective=v_res['avg_prec'], time_step=time.time(), coherence=0.0,
-#    perplexity=0.0, redundancy=0.0)
 return v_res['avg_prec'], v_res
 
 
-
 class SeqBowEstimator(BaseEstimator):
 
 def __init__(self, *args,
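
Note: a minimal sketch (not part of the diff) of the knob that changed above. The estimator now stores npmi_lambda and npmi_scale and hands them to the model's initialize_npmi_loss() rather than relying on a hard-coded alpha on the model side. It is shown against BaseVAE, whose updated signature appears in the modeling.py diff below; the random NPMI matrix is a stand-in for real corpus statistics.

    import torch
    from tmnt.modeling import BaseVAE
    from tmnt.distribution import LogisticGaussianDistribution

    # Stand-in NPMI matrix over a 2000-word vocabulary; a real one would come
    # from corpus co-occurrence counts.
    npmi_mat = torch.rand(2000, 2000)

    model = BaseVAE(vocab_size=2000,
                    latent_distribution=LogisticGaussianDistribution(100, 20),
                    device='cpu')
    # 0.7.52 read model.npmi_alpha and a hard-coded scale of 100 here;
    # 0.7.53 takes both weights as explicit arguments supplied by the estimator.
    model.initialize_npmi_loss(npmi_mat, npmi_lambda=0.7, npmi_scale=100.0)
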
tmnt/modeling.py CHANGED
@@ -22,22 +22,16 @@ from typing import List, Tuple, Dict, Optional, Union, NoReturn
 class BaseVAE(nn.Module):
 
 def __init__(self, vocab_size=2000, latent_distribution=LogisticGaussianDistribution(100, 20),
-coherence_reg_penalty=0.0, redundancy_reg_penalty=0.0,
-n_covars=0, device='cpu', **kwargs):
+device='cpu', **kwargs):
 super(BaseVAE, self).__init__(**kwargs)
 self.vocab_size = vocab_size
 self.n_latent = latent_distribution.n_latent
 self.enc_size = latent_distribution.enc_size
-self.coherence_reg_penalty = coherence_reg_penalty
-self.redundancy_reg_penalty = redundancy_reg_penalty
-self.n_covars = n_covars
 self.device = device
 self.embedding = None
-
 self.latent_distribution = latent_distribution
 self.decoder = nn.Linear(self.n_latent, self.vocab_size).to(device)
 self.npmi_with_diversity_loss : Optional[NPMILossWithDiversity] = None
-self.npmi_alpha = 0.7
 
 def initialize_bias_terms(self, wd_freqs: Optional[np.ndarray]):
 if wd_freqs is not None:
@@ -47,9 +41,9 @@ class BaseVAE(nn.Module):
 self.decoder.bias = nn.Parameter(torch.tensor(log_freq, dtype=torch.float32, device=self.device))
 self.decoder.bias.requires_grad_(False)
 
-def initialize_npmi_loss(self, npmi_mat):
-t_npmi_mat = torch.Tensor(npmi_mat, device=self.device)
-self.npmi_with_diversity_loss = NPMILossWithDiversity(t_npmi_mat, device=self.device, alpha=self.npmi_alpha)
+def initialize_npmi_loss(self, npmi_mat, npmi_lambda=0.7, npmi_scale=100.0):
+t_npmi_mat = torch.Tensor(npmi_mat).to(self.device)
+self.npmi_with_diversity_loss = NPMILossWithDiversity(t_npmi_mat, device=self.device, npmi_lambda=npmi_lambda, npmi_scale=npmi_scale)
 
 def get_ordered_terms(self):
 """
@@ -78,7 +72,6 @@ class BaseVAE(nn.Module):
 jacobian = torch.autograd.functional.jacobian(self.decoder, z)
 npmi_loss = self.npmi_with_diversity_loss(jacobian)
 npmi_loss = npmi_loss.sum()
-print("npmi loss = {}".format(npmi_loss))
 return (cur_loss + npmi_loss)
 else:
 return cur_loss
@@ -101,7 +94,6 @@ class BowVAEModel(BaseVAE):
 embedding_size (int): Number of dimensions for embedding layer
 n_encoding_layers (int): Number of layers used for the encoder. (default = 1)
 enc_dr (float): Dropout after each encoder layer. (default = 0.1)
-n_covars (int): Number of values for categorical co-variate (0 for non-CovariateData BOW model)
 device (str): context device
 """
 def __init__(self,
@@ -121,7 +113,7 @@ class BowVAEModel(BaseVAE):
 self.gamma = gamma
 self.classifier_dropout=classifier_dropout
 self.has_classifier = self.n_labels > 1
-self.encoding_dims = [self.embedding_size + self.n_covars] + [enc_dim for _ in range(n_encoding_layers)]
+self.encoding_dims = [self.embedding_size] + [enc_dim for _ in range(n_encoding_layers)]
 self.embedding = torch.nn.Sequential()
 self.embedding.add_module("linear", torch.nn.Linear(self.vocab_size, self.embedding_size))
 self.embedding.add_module("tanh", torch.nn.Tanh())
@@ -280,10 +272,11 @@ class MetricBowVAEModel(BowVAEModel):
 
 class NPMILossWithDiversity(nn.Module):
 
-def __init__(self, npmi_matrix: torch.Tensor, device: torch.device, k=20, alpha=0.7, use_diversity_loss=True):
+def __init__(self, npmi_matrix: torch.Tensor, device: torch.device, k=20, npmi_lambda=0.7, npmi_scale=100.0, use_diversity_loss=True):
 super(NPMILossWithDiversity, self).__init__()
-self.alpha = alpha
+self.npmi_lambda = npmi_lambda
 self.npmi_matrix = npmi_matrix
+self.npmi_scale = npmi_scale
 self.use_diversity_loss = use_diversity_loss
 self.device = device
 self.k = k
@@ -301,9 +294,6 @@ class NPMILossWithDiversity(nn.Module):
 return x
 
 def _get_npmi_loss(self, jacobian):
-#z = torch.ones((self.n_latent,), device=self.device)
-#jacobian = torch.autograd.functional.jacobian(self.decoder, z)
-#beta = self.model.get_topic_vectors().t() # |T| x |V|
 beta = jacobian.t()
 n_topics = beta.shape[0]
 self.npmi_matrix.fill_diagonal_(1)
@@ -317,21 +307,20 @@ class NPMILossWithDiversity(nn.Module):
 softmax_beta = torch.softmax(beta, dim=1)
 weighted_npmi = 1 - self._row_wise_normalize_inplace(torch.matmul(topk_softmax_beta.detach(), self.npmi_matrix))
 #print("Weighted_npmi sum = {}".format(weighted_npmi.sum()))
-npmi_loss = 100 * (softmax_beta ** 2) * weighted_npmi
+npmi_loss = self.npmi_scale * (softmax_beta ** 2) * weighted_npmi
 if self.use_diversity_loss:
 diversity_mask = torch.zeros_like(beta).bool()
 for topic_idx in range(n_topics):
 other_rows_mask = torch.ones(n_topics).bool().to(self.device)
 other_rows_mask[topic_idx] = False
 diversity_mask[topic_idx] = topk_mask[other_rows_mask].sum(0) > 0
-#print("Diversity mask sum = {}".format(diversity_mask.sum()))
-npmi_loss = ( self.alpha * torch.masked_select(npmi_loss, diversity_mask)).sum() + \
-((1 - self.alpha) * torch.masked_select(npmi_loss, ~diversity_mask)).sum()
+npmi_loss = ( self.npmi_lambda * torch.masked_select(npmi_loss, diversity_mask)).sum() + \
+((1 - self.npmi_lambda) * torch.masked_select(npmi_loss, ~diversity_mask)).sum()
 npmi_loss *= 2
 return npmi_loss
 
-def forward(self, beta):
-return self._get_npmi_loss(beta)
+def forward(self, jacobian):
+return self._get_npmi_loss(jacobian)
 
 
 class CoherenceRegularizer(nn.Module):
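
Note: a small self-contained sketch (placeholders only, not part of the package) of how the renamed loss might be exercised directly. forward() now documents that it takes the decoder Jacobian, transposes it into a |T| x |V| topic-term matrix, scales the squared softmax rows by npmi_scale, and splits the penalty between terms shared across topics (weight npmi_lambda) and topic-specific terms (weight 1 - npmi_lambda).

    import torch
    from tmnt.modeling import NPMILossWithDiversity

    vocab_size, n_topics = 2000, 20
    npmi_matrix = torch.rand(vocab_size, vocab_size)   # placeholder pairwise NPMI scores
    loss_fn = NPMILossWithDiversity(npmi_matrix, device=torch.device('cpu'),
                                    k=20, npmi_lambda=0.7, npmi_scale=100.0)

    # BaseVAE feeds the Jacobian of its linear decoder at a ones vector, which is
    # |V| x |T|; forward() transposes it internally into the topic-term matrix beta.
    decoder = torch.nn.Linear(n_topics, vocab_size)
    jacobian = torch.autograd.functional.jacobian(decoder, torch.ones(n_topics))
    npmi_loss = loss_fn(jacobian)
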
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: tmnt
-Version: 0.7.52b20240616
+Version: 0.7.53
 Summary: Topic modeling neural toolkit
 Home-page: https://github.com/mitre/tmnt.git
 Author: The MITRE Corporation
@@ -22,7 +22,7 @@ Requires-Dist: MarkupSafe >=2.0
 Requires-Dist: joblib >=0.8.4
 Requires-Dist: future >=0.18.2
 Requires-Dist: funcy >=1.16
-Requires-Dist: pandas ==1.5.3
+Requires-Dist: pandas >=2.0.0
 Requires-Dist: pyOpenSSL ==18.0.0
 Requires-Dist: PySocks ==1.6.8
 Requires-Dist: sacremoses >=0.0.38
@@ -37,7 +37,7 @@ Requires-Dist: torchtext >=0.13.0
 The Topic Modeling Neural Toolkit (TMNT) is a software library that enables training
 topic models as neural network-based variational auto-encoders.
 
-Current stable version is: 0.7.46
+Current stable version is: 0.7.53
 
 Documentation can be found here: https://tmnt.readthedocs.io/en/stable/
 
@@ -2,10 +2,10 @@ tmnt/__init__.py,sha256=EPNq1H7UMyMewWT_zTGBaC7ZouvCywX_gMX4G1dtmvw,250
 tmnt/configuration.py,sha256=P8PEhzVPKO5xG0FrdTLRQ60OYWigbzPY-OSx_hzQlrY,10054
 tmnt/data_loading.py,sha256=A0tsM6x61BGhYBV6rAYdryz2NwbR__8EAYj_Q4Z-DCs,18736
 tmnt/distribution.py,sha256=Pmyc5gwDd_-jP7vLVb0vdNQaSSvF1EuiTZEWg3KfmI8,10866
-tmnt/estimator.py,sha256=qqb3zYCUGY53bcXjUK_B7_yvLkjiMYAeYaPk5XoFxnY,70622
+tmnt/estimator.py,sha256=2RUfX9BRnDgrFAR6sr1uzDs0OYbdg9xdfPj2bckvKgQ,69220
 tmnt/eval_npmi.py,sha256=DTW9dNHVe6H57gndQIZ4gX9EghuBstwznA3YBqILJk0,5820
 tmnt/inference.py,sha256=da8qAnjTDTuWQfPEOQewOfgikqE00XT1xGMiO2mckI4,15679
-tmnt/modeling.py,sha256=RhabXB8f9ZliOOgQVJiwwnEnvdK-oil7fGe4prDiPjc,30508
+tmnt/modeling.py,sha256=O1V7ppU7J6pvESTvdEoV9BXbEF4Z-J1OHnRtszuagaA,29956
 tmnt/preprocess/__init__.py,sha256=gwMejkQrnqKS05i0JVsUru2hDUR5jE1hKC10dL934GU,170
 tmnt/preprocess/tokenizer.py,sha256=-ZgowfbHrM040vbNTktZM_hdl6HDTqxSJ4mDAxq3dUs,14050
 tmnt/preprocess/vectorizer.py,sha256=RkdivqP76qAJDianV09lONad9NbfBVWLZgIbU_P1-zo,15796
@@ -17,9 +17,9 @@ tmnt/utils/ngram_helpers.py,sha256=VrIzou2oQHCLBLSWODDeikN3PYat1NqqvEeYQj_GhbA,1
 tmnt/utils/pubmed_utils.py,sha256=3sHwoun7vxb0GV-arhpXLMUbAZne0huAh9xQNy6H40E,1274
 tmnt/utils/random.py,sha256=qY75WG3peWoMh9pUyCPBEo6q8IvkF6VRjeb5CqJOBF8,327
 tmnt/utils/recalibrate.py,sha256=TmpB8An8bslICZ13UTJfIvr8VoqiSedtpHxec4n8CHk,1439
-tmnt-0.7.52b20240616.dist-info/LICENSE,sha256=qFZJrfJ7Zi4IXDiyiGVrHWic_l1h2tc36tI8Z7rK9bs,11356
-tmnt-0.7.52b20240616.dist-info/METADATA,sha256=Z-X2I14scGyqqxawlYGzbOs3BeobPaYlHd1MSWK41D8,1461
-tmnt-0.7.52b20240616.dist-info/NOTICE,sha256=p0kYIVAkReTFaGb4C-qPa7h5ztze6hGzOpjCMMbOipU,425
-tmnt-0.7.52b20240616.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
-tmnt-0.7.52b20240616.dist-info/top_level.txt,sha256=RpYgUl187sXnqmiwKjZZdcDlHz2AALs6bGdUcukyd_E,5
-tmnt-0.7.52b20240616.dist-info/RECORD,,
+tmnt-0.7.53.dist-info/LICENSE,sha256=qFZJrfJ7Zi4IXDiyiGVrHWic_l1h2tc36tI8Z7rK9bs,11356
+tmnt-0.7.53.dist-info/METADATA,sha256=ncWhLX2-Vzg5pLzr4bQrEmVQKAlah8LvThs2L9jw1RI,1452
+tmnt-0.7.53.dist-info/NOTICE,sha256=p0kYIVAkReTFaGb4C-qPa7h5ztze6hGzOpjCMMbOipU,425
+tmnt-0.7.53.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+tmnt-0.7.53.dist-info/top_level.txt,sha256=RpYgUl187sXnqmiwKjZZdcDlHz2AALs6bGdUcukyd_E,5
+tmnt-0.7.53.dist-info/RECORD,,