tmnt-0.7.52b20240616-py3-none-any.whl → tmnt-0.7.53-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tmnt/estimator.py CHANGED
@@ -17,7 +17,7 @@ import json
 
 from sklearn.metrics import average_precision_score, top_k_accuracy_score, roc_auc_score, ndcg_score, precision_recall_fscore_support
 from tmnt.data_loading import PairedDataLoader, SingletonWrapperLoader, SparseDataLoader, get_llm_model
-from tmnt.modeling import BowVAEModel, SeqBowVED
+from tmnt.modeling import BowVAEModel, SeqBowVED, BaseVAE
 from tmnt.modeling import CrossBatchCosineSimilarityLoss, GeneralizedSDMLLoss, MultiNegativeCrossEntropyLoss, MetricSeqBowVED, MetricBowVAEModel
 from tmnt.eval_npmi import EvaluateNPMI
 from tmnt.distribution import LogisticGaussianDistribution, BaseDistribution, GaussianDistribution, VonMisesDistribution
@@ -57,8 +57,6 @@ class BaseEstimator(object):
 latent_distribution: Latent distribution of the variational autoencoder - defaults to LogisticGaussian with 20 dimensions
 optimizer: optimizer (default = "adam")
 lr: Learning rate of training. (default=0.005)
-coherence_reg_penalty: Regularization penalty for topic coherence. optional (default=0.0)
-redundancy_reg_penalty: Regularization penalty for topic redundancy. optional (default=0.0)
 batch_size: Batch training size. optional (default=128)
 epochs : Number of training epochs. optional(default=40)
 coherence_via_encoder: Flag to use encoder to derive coherence scores (via gradient attribution)
@@ -73,26 +71,24 @@ class BaseEstimator(object):
 device: Optional[str] = 'cpu',
 latent_distribution: BaseDistribution = None,
 lr: float = 0.005,
-coherence_reg_penalty: float = 0.0,
-redundancy_reg_penalty: float = 0.0,
 batch_size: int = 128,
 epochs: int = 40,
 coherence_via_encoder: bool = False,
 pretrained_param_file: Optional[str] = None,
 warm_start: bool = False,
 npmi_matrix: Optional[torch.Tensor] = None,
+npmi_lambda: float = 0.7,
+npmi_scale: float = 100.0,
 test_batch_size: int = 0):
 self.vocabulary = vocabulary
 self.log_method = log_method
 self.quiet = quiet
-self.model = None
+self.model : Optional[BaseVAE] = None
 self.coherence_coefficient = coherence_coefficient
 self.device = device
 self.latent_distribution = latent_distribution
 self.lr = lr
 self.n_latent = self.latent_distribution.n_latent
-self.coherence_reg_penalty = coherence_reg_penalty
-self.redundancy_reg_penalty = redundancy_reg_penalty
 self.batch_size = batch_size
 self.test_batch_size = test_batch_size or batch_size
 self.epochs = epochs
@@ -102,6 +98,8 @@ class BaseEstimator(object):
 self.num_val_words = -1 ## will be set later for computing Perplexity on validation dataset
 self.latent_distribution.device = self.device
 self.npmi_matrix : Optional[torch.Tensor] = npmi_matrix ## used with NPMI loss
+self.npmi_lambda = npmi_lambda
+self.npmi_scale = npmi_scale
 
 
 def _np_one_hot(self, vec, n_outputs):
@@ -303,12 +301,9 @@ class BaseBowEstimator(BaseEstimator):
 latent_distrib = config['latent_distribution']
 n_latent = int(config['n_latent'])
 enc_hidden_dim = int(config['enc_hidden_dim'])
-coherence_reg_penalty = float(config['coherence_loss_wt'])
-redundancy_reg_penalty = float(config['redundancy_loss_wt'])
 batch_size = int(config['batch_size'])
 embedding_source = config['embedding']['source']
 fixed_embedding = config['embedding'].get('fixed') == True
-covar_net_layers = config['covar_net_layers']
 n_encoding_layers = config['num_enc_layers']
 enc_dr = config['enc_dr']
 epochs = int(config['epochs'])
@@ -334,8 +329,7 @@ class BaseBowEstimator(BaseEstimator):
 coherence_coefficient=coherence_coefficient,
 device=device, lr=lr, latent_distribution=latent_distribution,
 enc_hidden_dim=enc_hidden_dim,
-coherence_reg_penalty=coherence_reg_penalty,
-redundancy_reg_penalty=redundancy_reg_penalty, batch_size=batch_size,
+batch_size=batch_size,
 embedding_source=embedding_source, embedding_size=emb_size, fixed_embedding=fixed_embedding,
 num_enc_layers=n_encoding_layers, enc_dr=enc_dr,
 epochs=epochs, log_method='log', coherence_via_encoder=coherence_via_encoder,
@@ -353,11 +347,8 @@ class BaseBowEstimator(BaseEstimator):
 config['batch_size'] = self.batch_size
 config['num_enc_layers'] = self.n_encoding_layers
 config['enc_dr'] = self.enc_dr
-config['coherence_loss_wt'] = self.coherence_reg_penalty
-config['redundancy_loss_wt'] = self.redundancy_reg_penalty
 config['n_labels'] = self.n_labels
 config['covar_net_layers'] = 1
-config['n_covars'] = 0
 if isinstance(self.latent_distribution, VonMisesDistribution):
 config['latent_distribution'] = {'dist_type':'vmf', 'kappa': self.latent_distribution.kappa}
 elif isinstance(self.latent_distribution, LogisticGaussianDistribution):
@@ -379,7 +370,6 @@ class BaseBowEstimator(BaseEstimator):
 sp_file = os.path.join(model_dir, 'model.config')
 vocab_file = os.path.join(model_dir, 'vocab.json')
 logging.info("Model parameters, configuration and vocabulary written to {}".format(model_dir))
-#self.model.save_parameters(pfile)
 torch.save(self.model, pfile)
 config = self._get_config()
 specs = json.dumps(config, sort_keys=True, indent=4)
@@ -481,7 +471,6 @@ class BaseBowEstimator(BaseEstimator):
 return v_res
 
 def validate(self, val_X, val_y):
-#val_dataloader = SparseDataLoader(val_X, val_y, batch_size=self.test_batch_size)
 val_dataloader = SingletonWrapperLoader(SparseDataLoader(val_X, val_y, batch_size=self.test_batch_size))
 total_val_words = val_X.sum()
 if self.num_val_words < 0:
@@ -612,7 +601,7 @@ class BaseBowEstimator(BaseEstimator):
 self.model = self._get_model()
 self.model.initialize_bias_terms(wd_freqs.squeeze()) ## initialize bias weights to log frequencies
 if self.npmi_matrix is not None:
-self.model.initialize_npmi_loss(self.npmi_matrix)
+self.model.initialize_npmi_loss(self.npmi_matrix, npmi_lambda=self.npmi_lambda, npmi_scale=self.npmi_scale)
 return x_size
 
 
@@ -642,7 +631,6 @@ class BaseBowEstimator(BaseEstimator):
 X_data = train_dataloader.dataset.data
 train_dataloader = SingletonWrapperLoader(train_dataloader)
 train_X_size = X_data.shape
-print("**** Setting up model with biases")
 _ = self.setup_model_with_biases(X_data)
 
 if aux_X is not None:
@@ -746,8 +734,7 @@ class BowEstimator(BaseBowEstimator):
 gamma = self.gamma,
 multilabel = self.multilabel,
 latent_distribution=self.latent_distribution,
-coherence_reg_penalty=self.coherence_reg_penalty, redundancy_reg_penalty=self.redundancy_reg_penalty,
-n_covars=0, device=self.device)
+device=self.device)
 if self.pretrained_param_file is not None:
 model = torch.load(self.pretrained_param_file)
 model.to(self.device)
@@ -810,8 +797,7 @@ class BowMetricEstimator(BowEstimator):
 gamma = self.gamma,
 multilabel = self.multilabel,
 latent_distribution=self.latent_distribution,
-coherence_reg_penalty=self.coherence_reg_penalty, redundancy_reg_penalty=self.redundancy_reg_penalty,
-n_covars=0, device=self.device)
+device=self.device)
 if self.pretrained_param_file is not None:
 model.load_parameters(self.pretrained_param_file, allow_missing=False)
 return model
@@ -929,14 +915,9 @@ class BowMetricEstimator(BowEstimator):
 .format(epoch, v_res['avg_prec'], v_res['avg_prec'], v_res['au_roc'], v_res['ndcg'],
 v_res['top_1'], v_res['top_2'], v_res['top_3'], v_res['top_4']))
 self._output_status(" AP Scores: {}".format(v_res['ap_scores']))
-#session.report({"objective": v_res['avg_prec'], "perplexity": v_res['ppl']})
-#if self.reporter:
-#    self.reporter(epoch=epoch+1, objective=v_res['avg_prec'], time_step=time.time(), coherence=0.0,
-#    perplexity=0.0, redundancy=0.0)
 return v_res['avg_prec'], v_res
 
 
-
 class SeqBowEstimator(BaseEstimator):
 
 def __init__(self, *args,
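
Note: a minimal sketch (not part of the diff) of the knob that changed above. The estimator now stores npmi_lambda and npmi_scale and hands them to the model's initialize_npmi_loss() rather than relying on a hard-coded alpha on the model side. It is shown against BaseVAE, whose updated signature appears in the modeling.py diff below; the random NPMI matrix is a stand-in for real corpus statistics.

    import torch
    from tmnt.modeling import BaseVAE
    from tmnt.distribution import LogisticGaussianDistribution

    # Stand-in NPMI matrix over a 2000-word vocabulary; a real one would come
    # from corpus co-occurrence counts.
    npmi_mat = torch.rand(2000, 2000)

    model = BaseVAE(vocab_size=2000,
                    latent_distribution=LogisticGaussianDistribution(100, 20),
                    device='cpu')
    # 0.7.52 read model.npmi_alpha and a hard-coded scale of 100 here;
    # 0.7.53 takes both weights as explicit arguments supplied by the estimator.
    model.initialize_npmi_loss(npmi_mat, npmi_lambda=0.7, npmi_scale=100.0)
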
tmnt/modeling.py CHANGED
@@ -22,22 +22,16 @@ from typing import List, Tuple, Dict, Optional, Union, NoReturn
 class BaseVAE(nn.Module):
 
 def __init__(self, vocab_size=2000, latent_distribution=LogisticGaussianDistribution(100, 20),
-coherence_reg_penalty=0.0, redundancy_reg_penalty=0.0,
-n_covars=0, device='cpu', **kwargs):
+device='cpu', **kwargs):
 super(BaseVAE, self).__init__(**kwargs)
 self.vocab_size = vocab_size
 self.n_latent = latent_distribution.n_latent
 self.enc_size = latent_distribution.enc_size
-self.coherence_reg_penalty = coherence_reg_penalty
-self.redundancy_reg_penalty = redundancy_reg_penalty
-self.n_covars = n_covars
 self.device = device
 self.embedding = None
-
 self.latent_distribution = latent_distribution
 self.decoder = nn.Linear(self.n_latent, self.vocab_size).to(device)
 self.npmi_with_diversity_loss : Optional[NPMILossWithDiversity] = None
-self.npmi_alpha = 0.7
 
 def initialize_bias_terms(self, wd_freqs: Optional[np.ndarray]):
 if wd_freqs is not None:
@@ -47,9 +41,9 @@ class BaseVAE(nn.Module):
 self.decoder.bias = nn.Parameter(torch.tensor(log_freq, dtype=torch.float32, device=self.device))
 self.decoder.bias.requires_grad_(False)
 
-def initialize_npmi_loss(self, npmi_mat):
-t_npmi_mat = torch.Tensor(npmi_mat, device=self.device)
-self.npmi_with_diversity_loss = NPMILossWithDiversity(t_npmi_mat, device=self.device, alpha=self.npmi_alpha)
+def initialize_npmi_loss(self, npmi_mat, npmi_lambda=0.7, npmi_scale=100.0):
+t_npmi_mat = torch.Tensor(npmi_mat).to(self.device)
+self.npmi_with_diversity_loss = NPMILossWithDiversity(t_npmi_mat, device=self.device, npmi_lambda=npmi_lambda, npmi_scale=npmi_scale)
 
 def get_ordered_terms(self):
 """
@@ -78,7 +72,6 @@ class BaseVAE(nn.Module):
 jacobian = torch.autograd.functional.jacobian(self.decoder, z)
 npmi_loss = self.npmi_with_diversity_loss(jacobian)
 npmi_loss = npmi_loss.sum()
-print("npmi loss = {}".format(npmi_loss))
 return (cur_loss + npmi_loss)
 else:
 return cur_loss
@@ -101,7 +94,6 @@ class BowVAEModel(BaseVAE):
 embedding_size (int): Number of dimensions for embedding layer
 n_encoding_layers (int): Number of layers used for the encoder. (default = 1)
 enc_dr (float): Dropout after each encoder layer. (default = 0.1)
-n_covars (int): Number of values for categorical co-variate (0 for non-CovariateData BOW model)
 device (str): context device
 """
 def __init__(self,
@@ -121,7 +113,7 @@ class BowVAEModel(BaseVAE):
 self.gamma = gamma
 self.classifier_dropout=classifier_dropout
 self.has_classifier = self.n_labels > 1
-self.encoding_dims = [self.embedding_size + self.n_covars] + [enc_dim for _ in range(n_encoding_layers)]
+self.encoding_dims = [self.embedding_size] + [enc_dim for _ in range(n_encoding_layers)]
 self.embedding = torch.nn.Sequential()
 self.embedding.add_module("linear", torch.nn.Linear(self.vocab_size, self.embedding_size))
 self.embedding.add_module("tanh", torch.nn.Tanh())
@@ -280,10 +272,11 @@ class MetricBowVAEModel(BowVAEModel):
 
 class NPMILossWithDiversity(nn.Module):
 
-def __init__(self, npmi_matrix: torch.Tensor, device: torch.device, k=20, alpha=0.7, use_diversity_loss=True):
+def __init__(self, npmi_matrix: torch.Tensor, device: torch.device, k=20, npmi_lambda=0.7, npmi_scale=100.0, use_diversity_loss=True):
 super(NPMILossWithDiversity, self).__init__()
-self.alpha = alpha
+self.npmi_lambda = npmi_lambda
 self.npmi_matrix = npmi_matrix
+self.npmi_scale = npmi_scale
 self.use_diversity_loss = use_diversity_loss
 self.device = device
 self.k = k
@@ -301,9 +294,6 @@ class NPMILossWithDiversity(nn.Module):
 return x
 
 def _get_npmi_loss(self, jacobian):
-#z = torch.ones((self.n_latent,), device=self.device)
-#jacobian = torch.autograd.functional.jacobian(self.decoder, z)
-#beta = self.model.get_topic_vectors().t() # |T| x |V|
 beta = jacobian.t()
 n_topics = beta.shape[0]
 self.npmi_matrix.fill_diagonal_(1)
@@ -317,21 +307,20 @@ class NPMILossWithDiversity(nn.Module):
 softmax_beta = torch.softmax(beta, dim=1)
 weighted_npmi = 1 - self._row_wise_normalize_inplace(torch.matmul(topk_softmax_beta.detach(), self.npmi_matrix))
 #print("Weighted_npmi sum = {}".format(weighted_npmi.sum()))
-npmi_loss = 100 * (softmax_beta ** 2) * weighted_npmi
+npmi_loss = self.npmi_scale * (softmax_beta ** 2) * weighted_npmi
 if self.use_diversity_loss:
 diversity_mask = torch.zeros_like(beta).bool()
 for topic_idx in range(n_topics):
 other_rows_mask = torch.ones(n_topics).bool().to(self.device)
 other_rows_mask[topic_idx] = False
 diversity_mask[topic_idx] = topk_mask[other_rows_mask].sum(0) > 0
-#print("Diversity mask sum = {}".format(diversity_mask.sum()))
-npmi_loss = ( self.alpha * torch.masked_select(npmi_loss, diversity_mask)).sum() + \
-((1 - self.alpha) * torch.masked_select(npmi_loss, ~diversity_mask)).sum()
+npmi_loss = ( self.npmi_lambda * torch.masked_select(npmi_loss, diversity_mask)).sum() + \
+((1 - self.npmi_lambda) * torch.masked_select(npmi_loss, ~diversity_mask)).sum()
 npmi_loss *= 2
 return npmi_loss
 
-def forward(self, beta):
-return self._get_npmi_loss(beta)
+def forward(self, jacobian):
+return self._get_npmi_loss(jacobian)
 
 
 class CoherenceRegularizer(nn.Module):
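
Note: a small self-contained sketch (placeholders only, not part of the package) of how the renamed loss might be exercised directly. forward() now documents that it takes the decoder Jacobian, transposes it into a |T| x |V| topic-term matrix, scales the squared softmax rows by npmi_scale, and splits the penalty between terms shared across topics (weight npmi_lambda) and topic-specific terms (weight 1 - npmi_lambda).

    import torch
    from tmnt.modeling import NPMILossWithDiversity

    vocab_size, n_topics = 2000, 20
    npmi_matrix = torch.rand(vocab_size, vocab_size)   # placeholder pairwise NPMI scores
    loss_fn = NPMILossWithDiversity(npmi_matrix, device=torch.device('cpu'),
                                    k=20, npmi_lambda=0.7, npmi_scale=100.0)

    # BaseVAE feeds the Jacobian of its linear decoder at a ones vector, which is
    # |V| x |T|; forward() transposes it internally into the topic-term matrix beta.
    decoder = torch.nn.Linear(n_topics, vocab_size)
    jacobian = torch.autograd.functional.jacobian(decoder, torch.ones(n_topics))
    npmi_loss = loss_fn(jacobian)
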
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: tmnt
-Version: 0.7.52b20240616
+Version: 0.7.53
 Summary: Topic modeling neural toolkit
 Home-page: https://github.com/mitre/tmnt.git
 Author: The MITRE Corporation
@@ -22,7 +22,7 @@ Requires-Dist: MarkupSafe >=2.0
 Requires-Dist: joblib >=0.8.4
 Requires-Dist: future >=0.18.2
 Requires-Dist: funcy >=1.16
-Requires-Dist: pandas ==1.5.3
+Requires-Dist: pandas >=2.0.0
 Requires-Dist: pyOpenSSL ==18.0.0
 Requires-Dist: PySocks ==1.6.8
 Requires-Dist: sacremoses >=0.0.38
@@ -37,7 +37,7 @@ Requires-Dist: torchtext >=0.13.0
 The Topic Modeling Neural Toolkit (TMNT) is a software library that enables training
 topic models as neural network-based variational auto-encoders.
 
-Current stable version is: 0.7.46
+Current stable version is: 0.7.53
 
 Documentation can be found here: https://tmnt.readthedocs.io/en/stable/
 
@@ -2,10 +2,10 @@ tmnt/__init__.py,sha256=EPNq1H7UMyMewWT_zTGBaC7ZouvCywX_gMX4G1dtmvw,250
 tmnt/configuration.py,sha256=P8PEhzVPKO5xG0FrdTLRQ60OYWigbzPY-OSx_hzQlrY,10054
 tmnt/data_loading.py,sha256=A0tsM6x61BGhYBV6rAYdryz2NwbR__8EAYj_Q4Z-DCs,18736
 tmnt/distribution.py,sha256=Pmyc5gwDd_-jP7vLVb0vdNQaSSvF1EuiTZEWg3KfmI8,10866
-tmnt/estimator.py,sha256=qqb3zYCUGY53bcXjUK_B7_yvLkjiMYAeYaPk5XoFxnY,70622
+tmnt/estimator.py,sha256=2RUfX9BRnDgrFAR6sr1uzDs0OYbdg9xdfPj2bckvKgQ,69220
 tmnt/eval_npmi.py,sha256=DTW9dNHVe6H57gndQIZ4gX9EghuBstwznA3YBqILJk0,5820
 tmnt/inference.py,sha256=da8qAnjTDTuWQfPEOQewOfgikqE00XT1xGMiO2mckI4,15679
-tmnt/modeling.py,sha256=RhabXB8f9ZliOOgQVJiwwnEnvdK-oil7fGe4prDiPjc,30508
+tmnt/modeling.py,sha256=O1V7ppU7J6pvESTvdEoV9BXbEF4Z-J1OHnRtszuagaA,29956
 tmnt/preprocess/__init__.py,sha256=gwMejkQrnqKS05i0JVsUru2hDUR5jE1hKC10dL934GU,170
 tmnt/preprocess/tokenizer.py,sha256=-ZgowfbHrM040vbNTktZM_hdl6HDTqxSJ4mDAxq3dUs,14050
 tmnt/preprocess/vectorizer.py,sha256=RkdivqP76qAJDianV09lONad9NbfBVWLZgIbU_P1-zo,15796
@@ -17,9 +17,9 @@ tmnt/utils/ngram_helpers.py,sha256=VrIzou2oQHCLBLSWODDeikN3PYat1NqqvEeYQj_GhbA,1
 tmnt/utils/pubmed_utils.py,sha256=3sHwoun7vxb0GV-arhpXLMUbAZne0huAh9xQNy6H40E,1274
 tmnt/utils/random.py,sha256=qY75WG3peWoMh9pUyCPBEo6q8IvkF6VRjeb5CqJOBF8,327
 tmnt/utils/recalibrate.py,sha256=TmpB8An8bslICZ13UTJfIvr8VoqiSedtpHxec4n8CHk,1439
-tmnt-0.7.52b20240616.dist-info/LICENSE,sha256=qFZJrfJ7Zi4IXDiyiGVrHWic_l1h2tc36tI8Z7rK9bs,11356
-tmnt-0.7.52b20240616.dist-info/METADATA,sha256=Z-X2I14scGyqqxawlYGzbOs3BeobPaYlHd1MSWK41D8,1461
-tmnt-0.7.52b20240616.dist-info/NOTICE,sha256=p0kYIVAkReTFaGb4C-qPa7h5ztze6hGzOpjCMMbOipU,425
-tmnt-0.7.52b20240616.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
-tmnt-0.7.52b20240616.dist-info/top_level.txt,sha256=RpYgUl187sXnqmiwKjZZdcDlHz2AALs6bGdUcukyd_E,5
-tmnt-0.7.52b20240616.dist-info/RECORD,,
+tmnt-0.7.53.dist-info/LICENSE,sha256=qFZJrfJ7Zi4IXDiyiGVrHWic_l1h2tc36tI8Z7rK9bs,11356
+tmnt-0.7.53.dist-info/METADATA,sha256=ncWhLX2-Vzg5pLzr4bQrEmVQKAlah8LvThs2L9jw1RI,1452
+tmnt-0.7.53.dist-info/NOTICE,sha256=p0kYIVAkReTFaGb4C-qPa7h5ztze6hGzOpjCMMbOipU,425
+tmnt-0.7.53.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+tmnt-0.7.53.dist-info/top_level.txt,sha256=RpYgUl187sXnqmiwKjZZdcDlHz2AALs6bGdUcukyd_E,5
+tmnt-0.7.53.dist-info/RECORD,,