tmnt 0.7.54b20240817__py3-none-any.whl → 0.7.57__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tmnt/data_loading.py +19 -13
- tmnt/distribution.py +88 -3
- tmnt/estimator.py +11 -34
- tmnt/eval_npmi.py +19 -0
- tmnt/modeling.py +3 -4
- tmnt/preprocess/vectorizer.py +129 -11
- tmnt/utils/vocab.py +126 -0
- {tmnt-0.7.54b20240817.dist-info → tmnt-0.7.57.dist-info}/METADATA +13 -3
- {tmnt-0.7.54b20240817.dist-info → tmnt-0.7.57.dist-info}/RECORD +13 -12
- {tmnt-0.7.54b20240817.dist-info → tmnt-0.7.57.dist-info}/WHEEL +1 -1
- {tmnt-0.7.54b20240817.dist-info → tmnt-0.7.57.dist-info}/LICENSE +0 -0
- {tmnt-0.7.54b20240817.dist-info → tmnt-0.7.57.dist-info}/NOTICE +0 -0
- {tmnt-0.7.54b20240817.dist-info → tmnt-0.7.57.dist-info}/top_level.txt +0 -0
tmnt/data_loading.py
CHANGED
@@ -25,9 +25,7 @@ from typing import List, Tuple, Dict, Optional, Union, NoReturn

 import torch
 from torch.utils.data import DataLoader, Sampler, WeightedRandomSampler, RandomSampler
-from
-from torchtext.data.utils import get_tokenizer
-from torchtext.vocab import build_vocab_from_iterator
+from tmnt.utils.vocab import build_vocab
 from transformers import DistilBertTokenizer, DistilBertModel, AutoTokenizer, AutoModel, DistilBertTokenizer, BertModel, DistilBertModel, OpenAIGPTModel
 from sklearn.model_selection import StratifiedKFold

@@ -41,13 +39,14 @@ llm_catalog = {
     'allenai/scibert_scivocab_uncased': (AutoTokenizer.from_pretrained, AutoModel.from_pretrained),
     'johngiorgi/declutr-sci-base': (AutoTokenizer.from_pretrained, AutoModel.from_pretrained),
     'BAAI/bge-base-en-v1.5': (AutoTokenizer.from_pretrained, AutoModel.from_pretrained),
-    'pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb': (AutoTokenizer.from_pretrained, AutoModel.from_pretrained)
-
+    'pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb': (AutoTokenizer.from_pretrained, AutoModel.from_pretrained),
+    'Alibaba-NLP/gte-base-en-v1.5': (AutoTokenizer.from_pretrained, AutoModel.from_pretrained)
+    ## add more model options here ...
     }

 def get_llm(model_name):
     tok_fn, model_fn = llm_catalog.get(model_name, ((AutoTokenizer.from_pretrained, AutoModel.from_pretrained)))
-    return tok_fn(model_name), model_fn(model_name)
+    return tok_fn(model_name), model_fn(model_name, trust_remote_code=True)

 def get_llm_tokenizer(model_name):
     tok_fn, model_fn = llm_catalog.get(model_name, ((AutoTokenizer.from_pretrained, AutoModel.from_pretrained)))
@@ -55,19 +54,20 @@ def get_llm_tokenizer(model_name):

 def get_llm_model(model_name):
     tok_fn, model_fn = llm_catalog.get(model_name, ((AutoTokenizer.from_pretrained, AutoModel.from_pretrained)))
-    return model_fn(model_name)
+    return model_fn(model_name, trust_remote_code=True)

-def get_unwrapped_llm_dataloader(data, bow_vectorizer, llm_name, label_map, batch_size, max_len,
+def get_unwrapped_llm_dataloader(data, bow_vectorizer, llm_name, label_map, batch_size, max_len, bow_target_texts=None,
+                                 shuffle=False, device='cpu'):
     label_pipeline = lambda x: label_map.get(x, 0)
     text_pipeline = get_llm_tokenizer(llm_name)

     def collate_batch(batch):
         label_list, text_list, mask_list, bow_list = [], [], [], []
-        for (_label, _text) in batch:
+        for (_label, _text, _target_text) in batch:
             label_list.append(label_pipeline(_label))
             tokenized_result = text_pipeline(_text, return_tensors='pt', padding='max_length',
                                              max_length=max_len, truncation=True)
-            bag_of_words,_ = bow_vectorizer.transform([
+            bag_of_words,_ = bow_vectorizer.transform([_target_text])
             processed_text = tokenized_result['input_ids']
             mask = tokenized_result['attention_mask']
             mask_list.append(mask)
@@ -78,10 +78,16 @@ def get_unwrapped_llm_dataloader(data, bow_vectorizer, llm_name, label_map, batc
         mask_list = torch.vstack(mask_list)
         bow_list = torch.vstack([ sparse_coo_to_tensor(bow_vec.tocoo()) for bow_vec in bow_list ])
         return label_list.to(device), text_list.to(device), mask_list.to(device), bow_list.to(device)
-
+    if bow_target_texts is not None:
+        assert len(bow_target_texts) == len(data)
+        full_data = [ (label, txt, alt_text) for ((label, txt), alt_text) in zip(data, bow_target_texts)]
+    else:
+        full_data = [ (label, txt, txt) for (label, txt) in data]
+    return DataLoader(full_data, batch_size=batch_size, shuffle=shuffle, collate_fn=collate_batch)

-def get_llm_dataloader(data, bow_vectorizer, llm_name, label_map, batch_size, max_len, shuffle=False, device='cpu'):
-    return SingletonWrapperLoader(get_unwrapped_llm_dataloader(data, bow_vectorizer, llm_name, label_map, batch_size, max_len,
+def get_llm_dataloader(data, bow_vectorizer, llm_name, label_map, batch_size, max_len, bow_target_texts=None, shuffle=False, device='cpu'):
+    return SingletonWrapperLoader(get_unwrapped_llm_dataloader(data, bow_vectorizer, llm_name, label_map, batch_size, max_len,
+                                                               bow_target_texts=bow_target_texts, shuffle=shuffle, device=device))


 def get_llm_paired_dataloader(data_a, data_b, bow_vectorizer, llm_name, label_map, batch_size, max_len_a, max_len_b,
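Note: the reworked loaders let the bag-of-words reconstruction target differ from the text passed to the LLM tokenizer via the new `bow_target_texts` argument. A minimal usage sketch under stated assumptions (the toy documents, the label map, and calling `TMNTVectorizer.fit_transform` on raw strings are illustrative, not taken from this diff):

```python
# Hedged sketch of calling the updated get_llm_dataloader; values marked as
# assumptions are illustrative only.
from tmnt.preprocess.vectorizer import TMNTVectorizer
from tmnt.data_loading import get_llm_dataloader

docs    = [('pos', 'the treatment reduced symptoms'), ('neg', 'no measurable effect')]
targets = ['treatment symptom reduction', 'no effect']   # alternate BOW targets (assumption)

vectorizer = TMNTVectorizer(vocab_size=2000)              # assumption: default setup suffices
vectorizer.fit_transform([t for _, t in docs] + targets)

loader = get_llm_dataloader(docs, vectorizer, 'distilbert-base-uncased',
                            {'neg': 0, 'pos': 1}, batch_size=2, max_len=64,
                            bow_target_texts=targets)     # BOW built from `targets`, not `docs`
```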
tmnt/distribution.py
CHANGED
@@ -14,6 +14,7 @@ from torch.nn import Sequential
 import torch
 from scipy import special as sp
 import torch
+from typing import Callable, Literal, Optional, Tuple, TypeVar, Union


 __all__ = ['BaseDistribution', 'GaussianDistribution', 'GaussianUnitVarDistribution', 'LogisticGaussianDistribution',
@@ -28,12 +29,10 @@ class BaseDistribution(nn.Module):
         self.enc_size = enc_size
         self.device = device
         self.mu_encoder = nn.Linear(enc_size, n_latent).to(device)
-        #self.mu_encoder = Sequential(self.mu_proj, nn.Softplus().to(device))
         self.mu_bn = nn.BatchNorm1d(n_latent, momentum = 0.8, eps=0.0001).to(device)
         self.softmax = nn.Softmax(dim=1).to(device)
         self.softplus = nn.Softplus().to(device)
         self.on_simplex = on_simplex
-        #self.mu_bn.collect_params().setattr('grad_req', 'null')

     ## this is required by most priors
     def _get_gaussian_sample(self, mu, lv, batch_size):
@@ -266,5 +265,91 @@ class Projection(BaseDistribution):



+class TopK(nn.Module):
+    def __init__(
+        self, k: int, postact_fn: Callable[[torch.Tensor], torch.Tensor] = nn.ReLU()
+    ):
+        super().__init__()
+        self.k = k
+        self.postact_fn = postact_fn
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        topk = torch.topk(x, k=self.k, dim=-1)
+        values = self.postact_fn(topk.values)
+        result = torch.zeros_like(x)
+        result.scatter_(-1, topk.indices, values)
+        return result
+
+class ConceptLogisticGaussianDistribution(nn.Module):
+    """Sparse concept encoding with Logistic normal/Gaussian latent distribution with specified prior

-
+    Parameters:
+        n_latent (int): Dimentionality of the latent distribution
+        device (device): Torch computational context (cpu or gpu[id])
+        dr (float): Dropout value for dropout applied post sample. optional (default = 0.2)
+        alpha (float): Value the determines prior variance as 1/alpha - (2/n_latent) + 1/(n_latent^2)
+    """
+    def __init__(self, enc_size, n_latent, n_concepts=16000, k_sparsity=32, device='cpu', dr=0.1, alpha=1.0):
+        super(ConceptLogisticGaussianDistribution, self).__init__()
+        self.n_latent = n_latent
+        self.enc_size = enc_size
+        self.device = device
+        self.activation = TopK(k=k_sparsity)
+        self.core_sparse = Sequential(nn.Linear(enc_size, n_concepts), self.activation).to(device)
+        self.mu_encoder = Sequential(self.core_sparse, nn.Linear(n_concepts, n_latent)).to(device)
+        self.mu_bn = nn.BatchNorm1d(n_latent, momentum = 0.8, eps=0.0001).to(device)
+        self.softmax = nn.Softmax(dim=1).to(device)
+        self.on_simplex = True
+        self.alpha = alpha
+        self.n_concepts = n_concepts
+
+        prior_var = 1 / self.alpha - (2.0 / n_latent) + 1 / (self.n_latent * self.n_latent)
+        self.prior_var = torch.tensor([prior_var], device=device)
+        self.prior_logvar = torch.tensor([math.log(prior_var)], device=device)
+
+        ## NOTE: the weights to model the log-variance are separate but the sparse encoder is shared
+        ## between the lv_encoder and mu_encoder (above)
+        self.lv_encoder = Sequential(self.core_sparse, nn.Linear(n_concepts, n_latent)).to(device)
+        self.lv_bn = nn.BatchNorm1d(n_latent, momentum = 0.8, eps=0.001).to(device)
+        self.post_sample_dr_o = nn.Dropout(dr)
+
+
+    ## this is required by most priors
+    def _get_gaussian_sample(self, mu, lv, batch_size):
+        eps = Normal(torch.zeros(batch_size, self.n_latent),
+                     torch.ones(batch_size, self.n_latent)).sample().to(self.device)
+        return (mu + torch.exp(0.5*lv).to(self.device) * eps)
+
+    def _get_kl_term(self, mu, lv):
+        posterior_var = torch.exp(lv)
+        delta = mu
+        dt = torch.div(delta * delta, self.prior_var)
+        v_div = torch.div(posterior_var, self.prior_var)
+        lv_div = self.prior_logvar - lv
+        return (0.5 * (torch.sum((v_div + dt + lv_div), 1) - self.n_latent)).to(self.device)
+
+    def forward(self, data, batch_size):
+        """Generate a sample according to the logistic Gaussian latent distribution given the encoder outputs
+        """
+        mu = self.mu_encoder(data)
+        mu_bn = self.mu_bn(mu)
+        lv = self.lv_encoder(data)
+        lv_bn = self.lv_bn(lv)
+        z_p = self._get_gaussian_sample(mu_bn, lv_bn, batch_size)
+        KL = self._get_kl_term(mu, lv)
+        z = self.post_sample_dr_o(z_p)
+        return self.softmax(z), KL
+
+    def get_mu_encoding(self, data, include_bn=True, normalize=False):
+        """Provide the distribution mean as the natural result of running the full encoder
+
+        Parameters:
+            data (:class:`mxnet.ndarray.NDArray`): Output of pre-latent encoding layers
+        Returns:
+            encoding (:class:`mxnet.ndarray.NDArray`): Encoding vector representing unnormalized topic proportions
+        """
+        enc = self.mu_encoder(data)
+        if include_bn:
+            enc = self.mu_bn(enc)
+        mu = self.softmax(enc) if normalize else enc
+        return mu
tmnt/estimator.py
CHANGED
@@ -21,6 +21,7 @@ from tmnt.modeling import BowVAEModel, SeqBowVED, BaseVAE
 from tmnt.modeling import CrossBatchCosineSimilarityLoss, GeneralizedSDMLLoss, MultiNegativeCrossEntropyLoss, MetricSeqBowVED, MetricBowVAEModel
 from tmnt.eval_npmi import EvaluateNPMI
 from tmnt.distribution import LogisticGaussianDistribution, BaseDistribution, GaussianDistribution, VonMisesDistribution
+from tmnt.utils.vocab import Vocab

 ## evaluation routines
 from torcheval.metrics import MultilabelAUPRC, MulticlassAUPRC
@@ -38,7 +39,6 @@ import pickle
 from typing import List, Tuple, Dict, Optional, Union, NoReturn

 import torch
-import torchtext
 from torch.utils.data import Dataset, DataLoader
 from tqdm import tqdm

@@ -249,7 +249,7 @@ class BaseBowEstimator(BaseEstimator):
                    device = device)

     @classmethod
-    def from_config(cls, config: Union[str, dict], vocabulary: Union[str,
+    def from_config(cls, config: Union[str, dict], vocabulary: Union[str, Vocab],
                     n_labels: int = 0,
                     coherence_coefficient: float = 8.0,
                     coherence_via_encoder: bool = False,
@@ -285,16 +285,11 @@ class BaseBowEstimator(BaseEstimator):
                 logging.error("File {} does not appear to be a valid vocabulary file".format(vocabulary))
                 raise Exception("Invalid Json Configuration File")
             vocabulary = torchtext.vocab.vocab(voc_js)
-
-        if
-
-
-
-            emb_size = config['embedding'].get('size')
-            if not emb_size:
-                emb_size = config['derived_info'].get('embedding_size')
-            if not emb_size:
-                raise Exception("Embedding size must be provided as the 'size' attribute of 'embedding' or as 'derived_info.embedding_size'")
+        emb_size = config['embedding'].get('size')
+        if not emb_size:
+            emb_size = config['derived_info'].get('embedding_size')
+        if not emb_size:
+            raise Exception("Embedding size must be provided as the 'size' attribute of 'embedding' or as 'derived_info.embedding_size'")
         gamma = config.get('gamma', 1.0)
         multilabel = config.get('multilabel', False)
         lr = config['lr']
@@ -781,12 +776,6 @@ class BowMetricEstimator(BowEstimator):
     def _get_model(self, bow_size=-1):
         if self.embedding_source != 'random':
             e_type, e_name = tuple(self.embedding_source.split(':'))
-            #pt_embedding = nlp.embedding.create(e_type, source=e_name)
-            #self.vocabulary.set_embedding(pt_embedding)
-            #emb_size = len(self.vocabulary.embedding.idx_to_vec[0])
-            #for word in self.vocabulary.embedding._idx_to_token:
-            #    if (self.vocabulary.embedding[word] == mx.nd.zeros(emb_size)).sum() == emb_size:
-            #        self.vocabulary.embedding[word] = mx.nd.random.normal(0, 0.1, emb_size)
         else:
             emb_size = self.embedding_size
         model = \
@@ -959,7 +948,7 @@ class SeqBowEstimator(BaseEstimator):
     @classmethod
     def from_config(cls,
                     config: Union[str, dict],
-                    vocabulary:
+                    vocabulary: Vocab,
                     log_interval: int = 1,
                     pretrained_param_file: Optional[str] = None,
                     n_labels: Optional[int] = None,
@@ -985,7 +974,7 @@ class SeqBowEstimator(BaseEstimator):
             raise Exception("Invalid Json Configuration File")
         ldist_def = config['latent_distribution']
         llm_model_name = config['llm_model_name']
-        model = torch.load(pretrained_param_file, map_location=device)
+        model = torch.load(pretrained_param_file, map_location=device, weights_only=False)

         latent_distribution = model.latent_distribution
         estimator = cls(llm_model_name = llm_model_name,
@@ -1017,7 +1006,7 @@ class SeqBowEstimator(BaseEstimator):
         config_file = os.path.join(model_dir, 'model.config')
         with open(config_file) as f:
             config = json.loads(f.read())
-        vocab = torch.load(vocab_file)
+        vocab = torch.load(vocab_file, weights_only=False)
         return cls.from_config(config,
                                vocabulary = vocab,
                                log_interval = log_interval,
@@ -1030,7 +1019,6 @@ class SeqBowEstimator(BaseEstimator):
         tr_bow_counts = self._get_bow_wd_counts(train_data)
         model.initialize_bias_terms(tr_bow_counts)
         if self.npmi_matrix is not None:
-            print("****** INITIALIZING NPMI LOSS FUNCTION *******")
             model.initialize_npmi_loss(self.npmi_matrix)
         return model

@@ -1057,7 +1045,6 @@ class SeqBowEstimator(BaseEstimator):
         else:
             config['latent_distribution'] = {'dist_type':'gaussian'}
         config['epochs'] = self.epochs
-        #config['embedding_source'] = self.embedding_source
         config['gamma'] = self.gamma
         config['warmup_ratio'] = self.warmup_ratio
         config['llm_model_name'] = self.llm_model_name
@@ -1091,9 +1078,6 @@ class SeqBowEstimator(BaseEstimator):
                           log_interval, epoch_id, learning_rate):
         """Generate and print out the log message for training. """
         if self.has_classifier:
-            #metric_nm, metric_val = self.metric.compute()
-            #if not isinstance(metric_nm, list):
-            #    metric_nm, metric_val = [metric_nm], [metric_val]
             metric_nm = "AUPRC"
             try:
                 metric_val = self.metric.compute()
@@ -1126,7 +1110,6 @@ class SeqBowEstimator(BaseEstimator):
         rows = 0
         for i, data in enumerate(dataloader):
             seqs, = data
-            #bow_batch = list(seqs[3].squeeze(axis=1))
             bow_batch = list(seqs[3])
             rows += len(bow_batch)
             if i >= max_rows:
@@ -1170,10 +1153,7 @@ class SeqBowEstimator(BaseEstimator):
                 label_ls = label_ls.mean()
                 total_ls = (self.gamma * label_ls) + elbo_ls.mean()
                 if not self.multilabel:
-                    #label_ind = label.argmax(dim=0)
-                    #self.metric.update([out], [label_ind])
                     self.metric.update(torch.tensor(out), torch.tensor(label))
-                    #self.metric.update(torch.Tensor([out]), torch.Tensor([label_ind]))
                 else:
                     self.metric.update([out], [label])
             else:
@@ -1214,7 +1194,6 @@ class SeqBowEstimator(BaseEstimator):
         joint_loader = PairedDataLoader(train_data, aux_data)
         num_train_steps = len(joint_loader) * self.epochs

-        ## The following from HuggingFace trainer.py lines 1047 to 1063
         decay_parameters = get_parameter_names(model.llm, ALL_LAYERNORM_LAYERS)
         decay_parameters = [name for name in decay_parameters if "bias" not in name]
         non_llm_parameters = [name for name,_ in model.named_parameters() if not name.startswith("llm")]
@@ -1288,10 +1267,8 @@ class SeqBowEstimator(BaseEstimator):
                 if aux_batch is not None:
                     update_loss_details(total_ls_2, elbo_ls_2, red_ls_2, None)

-                #debug
-
                 if not accumulate or (batch_id + 1) % accumulate == 0:
-
+                    torch.nn.utils.clip_grad.clip_grad_value_(model.llm.parameters(), 1.0)
                     optimizer.step()
                     dec_optimizer.step()
                     lr_scheduler.step()
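The `weights_only=False` arguments added to both `torch.load` calls matter because recent PyTorch releases (2.6 and later) changed the default of `weights_only` to `True`, which refuses to unpickle full objects such as a saved model or vocabulary. A hedged sketch of the loading pattern (file names are placeholders):

```python
import torch

# With newer PyTorch, torch.load defaults to weights_only=True, which only accepts
# plain tensors/state dicts. Fully pickled objects (a SeqBowVED model, a Vocab)
# need weights_only=False -- only do this for checkpoints you trust.
model = torch.load("model.params", map_location="cpu", weights_only=False)  # placeholder path
vocab = torch.load("vocab.bin", map_location="cpu", weights_only=False)     # placeholder path
```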
tmnt/eval_npmi.py
CHANGED
@@ -115,6 +115,25 @@ class EvaluateNPMI(object):
                 npmi = (log10(n_docs) + log10(bigram_cnt) - log10(unigram_1) - log10(unigram_2)) / (log10(n_docs) - log10(bigram_cnt) + 1e-4)
                 npmi_matrix[w1, w2] = npmi
         return npmi_matrix
+
+class EvaluateNPMIUmass(object):
+
+    def __init__(self, npmi_matrix: np.array, vectorizer: TMNTVectorizer):
+        self.vectorizer = vectorizer
+        self.npmi_matrix = npmi_matrix # by convention this will be lower-triangular
+        dim = npmi_matrix.shape[0]
+        for mc in range(self.npmi_matrix.shape[0]):
+            for i in range(mc+1,dim):
+                self.npmi_matrix[mc,i] = self.npmi_matrix[i,mc]
+
+    def evaluate_topics(self, topic_ids):
+        npmi_score = 0.0
+        total_size = len(topic_ids) * len(topic_ids[0])
+        for topic in topic_ids:
+            for (w1, w2) in combinations(topic):
+                npmi_score += self.npmi_matrix[w1, w2]
+        return npmi_score / total_size
+


 class FullNPMI(object):
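`EvaluateNPMIUmass` symmetrizes a lower-triangular NPMI matrix and then averages pairwise scores over each topic's top-word ids. A minimal sketch of that averaging, written here with explicit `itertools.combinations(topic, 2)` pairs and normalized by the number of pairs scored (which may differ from the normalization convention used by the class above):

```python
import numpy as np
from itertools import combinations

def avg_pairwise_npmi(npmi_matrix: np.ndarray, topic_ids):
    """Average NPMI over all word-id pairs in each topic's top-word list.
    Assumes npmi_matrix is symmetric."""
    score, n_pairs = 0.0, 0
    for topic in topic_ids:
        for w1, w2 in combinations(topic, 2):
            score += npmi_matrix[w1, w2]
            n_pairs += 1
    return score / max(n_pairs, 1)

# toy example: two topics over a 4-word vocabulary
m = np.array([[0.0, 0.5, 0.1, 0.0],
              [0.5, 0.0, 0.2, 0.0],
              [0.1, 0.2, 0.0, 0.3],
              [0.0, 0.0, 0.3, 0.0]])
print(avg_pairwise_npmi(m, [[0, 1, 2], [2, 3]]))
```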
tmnt/modeling.py
CHANGED
@@ -56,7 +56,6 @@ class BaseVAE(nn.Module):
         sorted_j = jacobian.argsort(dim=0, descending=True)
         return sorted_j.cpu().numpy()

-
     def get_topic_vectors(self):
         """
         Returns unnormalized topic vectors
@@ -126,7 +125,8 @@ class BowVAEModel(BaseVAE):

     def _init_weights(self, module):
         if isinstance(module, torch.nn.Linear):
-            torch.nn.init.
+            torch.nn.init.kaiming_uniform_(module.weight.data)
+            #torch.nn.init.xavier_uniform_(module.weight.data)


     def _get_encoder(self, dims, dr=0.1):
@@ -462,9 +462,8 @@ class SeqBowVED(BaseSeqBowVED):
             classifier_outputs = self.classifier(z_mu)
         else:
             classifier_outputs = None
-        redundancy_loss = entropy_loss
         ii_loss = self.add_npmi_and_diversity_loss(elbo)
-        redundancy_loss =
+        redundancy_loss = ii_loss #self.get_redundancy_penalty()
         return ii_loss, rec_loss, KL_loss, redundancy_loss, classifier_outputs


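The completed `_init_weights` call applies Kaiming (He) uniform initialization to linear layers. The same pattern as a standalone sketch, applied with `Module.apply` (the layer sizes are arbitrary):

```python
import torch.nn as nn

def init_weights(module):
    # Kaiming/He uniform init for Linear layers, matching the completed call in the diff
    if isinstance(module, nn.Linear):
        nn.init.kaiming_uniform_(module.weight.data)

encoder = nn.Sequential(nn.Linear(2000, 300), nn.Softplus(), nn.Linear(300, 20))
encoder.apply(init_weights)   # recursively applies to every submodule
```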
tmnt/preprocess/vectorizer.py
CHANGED
@@ -6,17 +6,12 @@ Copyright (c) 2019-2021 The MITRE Corporation.
 import io
 import os
 import json
-import
-from torchtext.vocab import vocab as build_vocab
+from tmnt.utils.vocab import Vocab, build_vocab
 import glob
 from multiprocessing import Pool, cpu_count
 from mantichora import mantichora
 from atpbar import atpbar
 import collections
-import threading
-import logging
-import threading
-import scipy
 import scipy.sparse as sp
 import numpy as np
 from queue import Queue
@@ -25,9 +20,14 @@ from sklearn.datasets import dump_svmlight_file
 from tmnt.preprocess import BasicTokenizer
 from typing import List, Dict, Optional, Any, Tuple
 from collections import OrderedDict
+from sklearn.utils import check_array
+from sklearn.preprocessing import normalize
+from sklearn.feature_extraction.text import TfidfTransformer
+from sklearn.utils.validation import FLOAT_DTYPES, check_is_fitted
+from sklearn.feature_extraction._stop_words import ENGLISH_STOP_WORDS

-__all__ = ['TMNTVectorizer']

+__all__ = ['TMNTVectorizer', 'CTFIDFVectorizer']

 class TMNTVectorizer(object):

@@ -57,10 +57,12 @@ class TMNTVectorizer(object):
     def __init__(self, text_key: str = 'body', label_key: Optional[str] = None, min_doc_size: int = 1,
                  label_remap: Optional[Dict[str,str]] = None,
                  json_out_dir: Optional[str] = None, vocab_size: int = 2000, file_pat: str = '*.json',
-                 encoding: str = 'utf-8', initial_vocabulary: Optional[
+                 encoding: str = 'utf-8', initial_vocabulary: Optional[Vocab] = None,
                  additional_feature_keys: List[str] = None, stop_word_file: str = None,
                  split_char: str = ',',
                  max_ws_tokens: int = -1,
+                 source_key: Optional[str] = None,
+                 source_json: Optional[str] = None,
                  count_vectorizer_kwargs: Dict[str, Any] = {'max_df':0.95, 'min_df':0.0, 'stop_words':'english'}):
         self.encoding = encoding
         self.max_ws_tokens = max_ws_tokens
@@ -78,12 +80,53 @@ class TMNTVectorizer(object):
         self.cv_kwargs = self._update_count_vectorizer_args(count_vectorizer_kwargs, stop_word_file)
         if not 'token_pattern' in self.cv_kwargs:
             self.cv_kwargs['token_pattern'] = r'\b[A-Za-z][A-Za-z]+\b'
+        if source_key and source_json:
+            source_terms = self._get_source_specific_terms(source_json, 10, text_key, source_key,
+                                                           {'token_pattern': self.cv_kwargs['token_pattern'],
+                                                            'stop_words': self.cv_kwargs['stop_words'],
+                                                            'max_df': 1.0, 'min_df':0.0})
+            stop_words = set(source_terms)
+            stop_words.update(set(ENGLISH_STOP_WORDS))
+            self.cv_kwargs['stop_words'] = frozenset(stop_words)
         self.vectorizer = CountVectorizer(max_features=self.vocab_size,
                                           vocabulary=(initial_vocabulary.get_itos() if initial_vocabulary else None),
                                           **self.cv_kwargs)
         self.label_map = {}


+    def _get_source_specific_terms(self, json_file, k: int, text_key: str, source_key: str, cv_kwargs):
+        by_source = {}
+        with io.open(json_file) as fp:
+            for l in fp:
+                js = json.loads(l)
+                txt = js[text_key]
+                src = js[source_key]
+                if src not in by_source:
+                    by_source[src] = []
+                by_source[src].append(txt)
+        docs_by_source = [''.join(txts) for txts in by_source.values()]
+        print(cv_kwargs)
+        count_vectorizer = CountVectorizer(**cv_kwargs)
+        count = count_vectorizer.fit_transform(docs_by_source)
+        ctfidf = CTFIDFVectorizer().fit_transform(count)
+        tok_to_idx = list(count_vectorizer.vocabulary_.items())
+        tok_to_idx.sort(key = lambda x: x[1])
+        ordered_vocab = OrderedDict([ (k,1) for (k,_) in tok_to_idx ])
+        ovocab = build_vocab(ordered_vocab)
+        per_source_tokens = []
+        for i in range(ctfidf.shape[0]):
+            ts = ctfidf[i].toarray().squeeze()
+            per_source_tokens.append(ovocab.lookup_tokens((-ts).argsort()[:k]))
+        final_tokens_intersect = set(per_source_tokens[0])
+        final_tokens_union = set(per_source_tokens[0])
+        for src_tokens in per_source_tokens:
+            final_tokens_intersect.intersection_update(src_tokens)
+            final_tokens_union.update(src_tokens)
+        res = final_tokens_union - final_tokens_intersect
+        print("Removed terms = {}".format(res))
+        return final_tokens_union - final_tokens_intersect
+
+

     def _update_count_vectorizer_args(self, cv_kwargs: Dict[str, Any], stop_word_file: str) -> Dict[str, Any]:
         if stop_word_file:
@@ -113,11 +156,11 @@ class TMNTVectorizer(object):
         return list(set(wds))


-    def get_vocab(self) ->
-        """Returns the
+    def get_vocab(self) -> Vocab:
+        """Returns the vocabulary associated with the vectorizer

         Returns:
-
+            vocabulary
         """
         if self.vocab is not None:
             return self.vocab
@@ -375,3 +418,78 @@ class TMNTVectorizer(object):
         y = self._get_ys_dir(json_dir)
         return X, y

+
+
+class CTFIDFVectorizer(TfidfTransformer):
+    def __init__(self, *args, **kwargs):
+        super(CTFIDFVectorizer, self).__init__(*args, **kwargs)
+        self._idf_diag = None
+
+    def fit(self, X: sp.csr_matrix):
+        """Learn the idf vector (global term weights)
+
+        Parameters
+        ----------
+        X : sparse matrix of shape n_samples, n_features)
+            A matrix of term/token counts.
+
+        """
+
+        # Prepare input
+        X = check_array(X, accept_sparse=('csr', 'csc'))
+        if not sp.issparse(X):
+            X = sp.csr_matrix(X)
+        dtype = X.dtype if X.dtype in FLOAT_DTYPES else np.float64
+
+        # Calculate IDF scores
+        _, n_features = X.shape
+        df = np.squeeze(np.asarray(X.sum(axis=0)))
+        avg_nr_samples = int(X.sum(axis=1).mean())
+        idf = np.log(avg_nr_samples / df)
+        self._idf_diag = sp.diags(idf, offsets=0,
+                                  shape=(n_features, n_features),
+                                  format='csr',
+                                  dtype=dtype)
+        setattr(self, 'idf_', True)
+        return self
+
+    def transform(self, X: sp.csr_matrix, copy=True) -> sp.csr_matrix:
+        """Transform a count-based matrix to c-TF-IDF
+
+        Parameters
+        ----------
+        X : sparse matrix of (n_samples, n_features)
+            a matrix of term/token counts
+
+        Returns
+        -------
+        vectors : sparse matrix of shape (n_samples, n_features)
+
+        """
+
+        # Prepare input
+        X = check_array(X, accept_sparse='csr', dtype=FLOAT_DTYPES, copy=copy)
+        if not sp.issparse(X):
+            X = sp.csr_matrix(X, dtype=np.float64)
+
+        _, n_features = X.shape
+
+        # idf_ being a property, the automatic attributes detection
+        # does not work as usual and we need to specify the attribute
+        # name:
+        check_is_fitted(self, attributes=["idf_"],
+                        msg='idf vector is not fitted')
+
+        # Check if expected nr features is found
+        expected_n_features = self._idf_diag.shape[0]
+        if n_features != expected_n_features:
+            raise ValueError("Input has n_features=%d while the model"
+                             " has been trained with n_features=%d" % (
+                                 n_features, expected_n_features))
+
+        X = X * self._idf_diag
+
+        if self.norm:
+            X = normalize(X, axis=1, norm='l1', copy=False)
+
+        return X
tmnt/utils/vocab.py
ADDED
@@ -0,0 +1,126 @@
+import torch
+import torch.nn as nn
+from typing import Dict, List, Optional, Iterable, OrderedDict
+from collections import Counter
+
+class Vocab(nn.Module):
+    r"""Creates a vocab object which maps tokens to indices.
+
+    Args:
+        vocab (torch.classes.torchtext.Vocab or torchtext._torchtext.Vocab): a cpp vocab object.
+    """
+
+    def __init__(self, stoi: Dict):
+        super(Vocab, self).__init__()
+        self.stoi = stoi
+        self.itos = list(stoi.keys())
+
+    def forward(self, tokens: List[str]) -> List[int]:
+        r"""Calls the `lookup_indices` method
+
+        Args:
+            tokens: a list of tokens used to lookup their corresponding `indices`.
+
+        Returns:
+            The indices associated with a list of `tokens`.
+        """
+        return [self.stoi[t] for t in tokens]
+
+    def __len__(self) -> int:
+        r"""
+        Returns:
+            The length of the vocab.
+        """
+        return len(self.stoi)
+
+    def __contains__(self, token: str) -> bool:
+        r"""
+        Args:
+            token: The token for which to check the membership.
+
+        Returns:
+            Whether the token is member of vocab or not.
+        """
+        return self.stoi.__contains__(token)
+
+    def __getitem__(self, token: str) -> int:
+        r"""
+        Args:
+            token: The token used to lookup the corresponding index.
+
+        Returns:
+            The index corresponding to the associated token.
+        """
+        return self.stoi[token]
+
+    def insert_token(self, token: str, index: int) -> None:
+        r"""
+        Args:
+            token: The token used to lookup the corresponding index.
+            index: The index corresponding to the associated token.
+        Raises:
+            RuntimeError: If `index` is not in range [0, Vocab.size()] or if `token` already exists in the vocab.
+        """
+        if not token in self.stoi:
+            self.stoi[token] = index
+            self.itos[index] = token
+
+    def lookup_token(self, index: int) -> str:
+        r"""
+        Args:
+            index: The index corresponding to the associated token.
+
+        Returns:
+            token: The token used to lookup the corresponding index.
+
+        Raises:
+            RuntimeError: If `index` not in range [0, itos.size()).
+        """
+        return self.itos[index]
+
+    def lookup_tokens(self, indices: List[int]) -> List[str]:
+        r"""
+        Args:
+            indices: The `indices` used to lookup their corresponding`tokens`.
+
+        Returns:
+            The `tokens` associated with `indices`.
+
+        Raises:
+            RuntimeError: If an index within `indices` is not int range [0, itos.size()).
+        """
+        return [ self.itos[i] for i in indices]
+
+    def lookup_indices(self, tokens: List[str]) -> List[int]:
+        r"""
+        Args:
+            tokens: the tokens used to lookup their corresponding `indices`.
+
+        Returns:
+            The 'indices` associated with `tokens`.
+        """
+        return [ self.stoi[t] for t in tokens ]
+
+    def get_stoi(self) -> Dict[str, int]:
+        r"""
+        Returns:
+            Dictionary mapping tokens to indices.
+        """
+        return self.stoi
+
+    def get_itos(self) -> List[str]:
+        r"""
+        Returns:
+            List mapping indices to tokens.
+        """
+        return self.itos
+
+
+
+def build_vocab(
+    odict: OrderedDict
+) -> Vocab:
+    """
+    """
+    dict_by_position = dict(zip(odict.keys(), range(0,len(odict))))
+    return Vocab(dict_by_position)
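A quick sketch of the round trip the new `Vocab`/`build_vocab` pair supports (replacing the torchtext vocabulary used previously); expected outputs are noted in comments:

```python
from collections import OrderedDict
from tmnt.utils.vocab import build_vocab

# token -> anything; only insertion order matters, indices are assigned positionally
vocab = build_vocab(OrderedDict([('topic', 1), ('model', 1), ('neural', 1)]))

print(len(vocab))                       # 3
print(vocab['model'])                   # 1
print(vocab.lookup_tokens([2, 0]))      # ['neural', 'topic']
print(vocab.lookup_indices(['topic']))  # [0]
print('neural' in vocab)                # True
```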
{tmnt-0.7.54b20240817.dist-info → tmnt-0.7.57.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.2
 Name: tmnt
-Version: 0.7.54b20240817
+Version: 0.7.57
 Summary: Topic modeling neural toolkit
 Home-page: https://github.com/mitre/tmnt.git
 Author: The MITRE Corporation
@@ -14,6 +14,7 @@ Description-Content-Type: text/markdown
 License-File: LICENSE
 License-File: NOTICE
 Requires-Dist: optuna
+Requires-Dist: datasets
 Requires-Dist: mantichora>=0.9.5
 Requires-Dist: transformers[torch]
 Requires-Dist: torcheval
@@ -32,7 +33,16 @@ Requires-Dist: numba
 Requires-Dist: scipy==1.12.0
 Requires-Dist: tabulate>=0.8.7
 Requires-Dist: torch>=2.1.2
-
+Dynamic: author
+Dynamic: author-email
+Dynamic: classifier
+Dynamic: description
+Dynamic: description-content-type
+Dynamic: home-page
+Dynamic: license
+Dynamic: requires-dist
+Dynamic: requires-python
+Dynamic: summary

 The Topic Modeling Neural Toolkit (TMNT) is a software library that enables training
 topic models as neural network-based variational auto-encoders.
{tmnt-0.7.54b20240817.dist-info → tmnt-0.7.57.dist-info}/RECORD
CHANGED
@@ -1,14 +1,14 @@
 tmnt/__init__.py,sha256=EPNq1H7UMyMewWT_zTGBaC7ZouvCywX_gMX4G1dtmvw,250
 tmnt/configuration.py,sha256=P8PEhzVPKO5xG0FrdTLRQ60OYWigbzPY-OSx_hzQlrY,10054
-tmnt/data_loading.py,sha256=
-tmnt/distribution.py,sha256=
-tmnt/estimator.py,sha256=
-tmnt/eval_npmi.py,sha256=
+tmnt/data_loading.py,sha256=zB3wIBXgl_UKjjRLQgPwCZOVTcjHK4YahxCbsLd70RY,19238
+tmnt/distribution.py,sha256=2YBfaGIiUJc-OjKaotnKmicSEdL4OAGBx3icacbePQ8,14868
+tmnt/estimator.py,sha256=qh-pCbmhhtGpRKKQv10ANyQakuoMYaVH87NM5UIxtyM,67777
+tmnt/eval_npmi.py,sha256=8S-IE-bEhtQofF6oKeXs7oaUeu-7yDlaEqjMj52gmNQ,6549
 tmnt/inference.py,sha256=da8qAnjTDTuWQfPEOQewOfgikqE00XT1xGMiO2mckI4,15679
-tmnt/modeling.py,sha256=
+tmnt/modeling.py,sha256=QRnHbNFp85LKp5ILYsJqTeQ3BV0jLPCwKX1Eh-Ed3Dc,29975
 tmnt/preprocess/__init__.py,sha256=gwMejkQrnqKS05i0JVsUru2hDUR5jE1hKC10dL934GU,170
 tmnt/preprocess/tokenizer.py,sha256=-ZgowfbHrM040vbNTktZM_hdl6HDTqxSJ4mDAxq3dUs,14050
-tmnt/preprocess/vectorizer.py,sha256=
+tmnt/preprocess/vectorizer.py,sha256=RaianZ_DG3Nc-RI96FtmI4PCZPi5Nipx9a5xndLZ52M,20689
 tmnt/utils/__init__.py,sha256=1PZsxRPsHI_DnOpxD0iAhLxhxHnx6Svzg3W-79YfWWs,237
 tmnt/utils/csv2json.py,sha256=A1TXy-uxA4dc9tw0tjiHzL7fv4C6b0Uc_bwI1keTmKU,795
 tmnt/utils/log_utils.py,sha256=ZtR4nF_Iee23ev935YQcTtXv-cCC7lgXkXLl_yokfS4,2075
@@ -17,9 +17,10 @@ tmnt/utils/ngram_helpers.py,sha256=VrIzou2oQHCLBLSWODDeikN3PYat1NqqvEeYQj_GhbA,1
 tmnt/utils/pubmed_utils.py,sha256=3sHwoun7vxb0GV-arhpXLMUbAZne0huAh9xQNy6H40E,1274
 tmnt/utils/random.py,sha256=qY75WG3peWoMh9pUyCPBEo6q8IvkF6VRjeb5CqJOBF8,327
 tmnt/utils/recalibrate.py,sha256=TmpB8An8bslICZ13UTJfIvr8VoqiSedtpHxec4n8CHk,1439
-tmnt
-tmnt-0.7.
-tmnt-0.7.
-tmnt-0.7.
-tmnt-0.7.
-tmnt-0.7.
+tmnt/utils/vocab.py,sha256=J6GFGLyvDgdmtVQjYlyzWjuykRD3kllCKPG1z0lI0P8,3504
+tmnt-0.7.57.dist-info/LICENSE,sha256=qFZJrfJ7Zi4IXDiyiGVrHWic_l1h2tc36tI8Z7rK9bs,11356
+tmnt-0.7.57.dist-info/METADATA,sha256=EDNrl4p3d9j2UXPwENrMAp0EgaRQuJCBGFvXdYoJTmI,1641
+tmnt-0.7.57.dist-info/NOTICE,sha256=p0kYIVAkReTFaGb4C-qPa7h5ztze6hGzOpjCMMbOipU,425
+tmnt-0.7.57.dist-info/WHEEL,sha256=52BFRY2Up02UkjOa29eZOS2VxUrpPORXg1pkohGGUS8,91
+tmnt-0.7.57.dist-info/top_level.txt,sha256=RpYgUl187sXnqmiwKjZZdcDlHz2AALs6bGdUcukyd_E,5
+tmnt-0.7.57.dist-info/RECORD,,
{tmnt-0.7.54b20240817.dist-info → tmnt-0.7.57.dist-info}/LICENSE
File without changes
{tmnt-0.7.54b20240817.dist-info → tmnt-0.7.57.dist-info}/NOTICE
File without changes
{tmnt-0.7.54b20240817.dist-info → tmnt-0.7.57.dist-info}/top_level.txt
File without changes