tmnt 0.7.56__py3-none-any.whl → 0.7.58__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tmnt/data_loading.py CHANGED
@@ -25,9 +25,7 @@ from typing import List, Tuple, Dict, Optional, Union, NoReturn
 
 import torch
 from torch.utils.data import DataLoader, Sampler, WeightedRandomSampler, RandomSampler
-from torchtext.vocab import vocab as build_vocab
-from torchtext.data.utils import get_tokenizer
-from torchtext.vocab import build_vocab_from_iterator
+from tmnt.utils.vocab import build_vocab
 from transformers import DistilBertTokenizer, DistilBertModel, AutoTokenizer, AutoModel, DistilBertTokenizer, BertModel, DistilBertModel, OpenAIGPTModel
 from sklearn.model_selection import StratifiedKFold
 
@@ -42,7 +40,8 @@ llm_catalog = {
     'johngiorgi/declutr-sci-base': (AutoTokenizer.from_pretrained, AutoModel.from_pretrained),
     'BAAI/bge-base-en-v1.5': (AutoTokenizer.from_pretrained, AutoModel.from_pretrained),
     'pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb': (AutoTokenizer.from_pretrained, AutoModel.from_pretrained),
-    'Alibaba-NLP/gte-base-en-v1.5': (AutoTokenizer.from_pretrained, AutoModel.from_pretrained)
+    'Alibaba-NLP/gte-base-en-v1.5': (AutoTokenizer.from_pretrained, AutoModel.from_pretrained),
+    'intfloat/multilingual-e5-base': (AutoTokenizer.from_pretrained, AutoModel.from_pretrained)
     ## add more model options here ...
     }
 
@@ -58,17 +57,18 @@ def get_llm_model(model_name):
     tok_fn, model_fn = llm_catalog.get(model_name, ((AutoTokenizer.from_pretrained, AutoModel.from_pretrained)))
     return model_fn(model_name, trust_remote_code=True)
 
-def get_unwrapped_llm_dataloader(data, bow_vectorizer, llm_name, label_map, batch_size, max_len, shuffle=False, device='cpu'):
+def get_unwrapped_llm_dataloader(data, bow_vectorizer, llm_name, label_map, batch_size, max_len, bow_target_texts=None,
+                                 shuffle=False, device='cpu'):
    label_pipeline = lambda x: label_map.get(x, 0)
    text_pipeline = get_llm_tokenizer(llm_name)
 
    def collate_batch(batch):
        label_list, text_list, mask_list, bow_list = [], [], [], []
-       for (_label, _text) in batch:
+       for (_label, _text, _target_text) in batch:
            label_list.append(label_pipeline(_label))
            tokenized_result = text_pipeline(_text, return_tensors='pt', padding='max_length',
                                             max_length=max_len, truncation=True)
-           bag_of_words,_ = bow_vectorizer.transform([_text])
+           bag_of_words,_ = bow_vectorizer.transform([_target_text])
            processed_text = tokenized_result['input_ids']
            mask = tokenized_result['attention_mask']
            mask_list.append(mask)
@@ -79,10 +79,16 @@ def get_unwrapped_llm_dataloader(data, bow_vectorizer, llm_name, label_map, batc
        mask_list = torch.vstack(mask_list)
        bow_list = torch.vstack([ sparse_coo_to_tensor(bow_vec.tocoo()) for bow_vec in bow_list ])
        return label_list.to(device), text_list.to(device), mask_list.to(device), bow_list.to(device)
-   return DataLoader(data, batch_size=batch_size, shuffle=shuffle, collate_fn=collate_batch)
+   if bow_target_texts is not None:
+       assert len(bow_target_texts) == len(data)
+       full_data = [ (label, txt, alt_text) for ((label, txt), alt_text) in zip(data, bow_target_texts)]
+   else:
+       full_data = [ (label, txt, txt) for (label, txt) in data]
+   return DataLoader(full_data, batch_size=batch_size, shuffle=shuffle, collate_fn=collate_batch)
 
-def get_llm_dataloader(data, bow_vectorizer, llm_name, label_map, batch_size, max_len, shuffle=False, device='cpu'):
-    return SingletonWrapperLoader(get_unwrapped_llm_dataloader(data, bow_vectorizer, llm_name, label_map, batch_size, max_len, shuffle=shuffle, device=device))
+def get_llm_dataloader(data, bow_vectorizer, llm_name, label_map, batch_size, max_len, bow_target_texts=None, shuffle=False, device='cpu'):
+    return SingletonWrapperLoader(get_unwrapped_llm_dataloader(data, bow_vectorizer, llm_name, label_map, batch_size, max_len,
+                                                               bow_target_texts=bow_target_texts, shuffle=shuffle, device=device))
 
 
 def get_llm_paired_dataloader(data_a, data_b, bow_vectorizer, llm_name, label_map, batch_size, max_len_a, max_len_b,
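The new `bow_target_texts` argument lets the bag-of-words reconstruction target differ from the text that is fed to the LLM tokenizer. A minimal usage sketch, with hypothetical data and an assumed fit call on `TMNTVectorizer`; the model name only needs to resolve through `llm_catalog` or its `AutoTokenizer`/`AutoModel` fallback:

```python
# Hypothetical example; the documents, labels and vectorizer fit call are illustrative.
from tmnt.preprocess.vectorizer import TMNTVectorizer
from tmnt.data_loading import get_llm_dataloader

docs = [("sports", "The home side won a tense final match."),
        ("science", "The enzyme catalyzes the reaction at low pH.")]
summaries = ["home side wins final", "enzyme catalyzes reaction"]   # alternative BOW targets
label_map = {"sports": 0, "science": 1}

vectorizer = TMNTVectorizer(vocab_size=2000)
vectorizer.fit_transform([t for _, t in docs] + summaries)          # assumed fit API

loader = get_llm_dataloader(docs, vectorizer, "distilbert-base-uncased", label_map,
                            batch_size=2, max_len=64,
                            bow_target_texts=summaries,   # new in 0.7.58
                            shuffle=True)
```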
tmnt/distribution.py CHANGED
@@ -14,26 +14,20 @@ from torch.nn import Sequential
 import torch
 from scipy import special as sp
 import torch
+from typing import Callable, Literal, Optional, Tuple, TypeVar, Union
+from tmnt.sparse.modeling import TopKEncoder
 
 
 __all__ = ['BaseDistribution', 'GaussianDistribution', 'GaussianUnitVarDistribution', 'LogisticGaussianDistribution',
            'VonMisesDistribution']
 
-
 class BaseDistribution(nn.Module):
 
-    def __init__(self, enc_size, n_latent, device, on_simplex=False):
+    def __init__(self, enc_size, n_latent, device, on_simplex=True):
         super(BaseDistribution, self).__init__()
         self.n_latent = n_latent
         self.enc_size = enc_size
         self.device = device
-        self.mu_encoder = nn.Linear(enc_size, n_latent).to(device)
-        #self.mu_encoder = Sequential(self.mu_proj, nn.Softplus().to(device))
-        self.mu_bn = nn.BatchNorm1d(n_latent, momentum = 0.8, eps=0.0001).to(device)
-        self.softmax = nn.Softmax(dim=1).to(device)
-        self.softplus = nn.Softplus().to(device)
-        self.on_simplex = on_simplex
-        #self.mu_bn.collect_params().setattr('grad_req', 'null')
 
     ## this is required by most priors
     def _get_gaussian_sample(self, mu, lv, batch_size):
@@ -48,11 +42,25 @@ class BaseDistribution(nn.Module):
 
     def get_mu_encoding(self, data, include_bn):
         raise NotImplemented
+
+    def freeze_pre_encoder(self) -> None:
+        raise NotImplemented
 
+    def unfreeze_pre_encoder(self) -> None:
+        raise NotImplemented
 
 
+class SimpleDistribution(BaseDistribution):
+    def __init__(self, enc_size, n_latent, device, on_simplex=False):
+        super(SimpleDistribution, self).__init__(enc_size, n_latent, device, on_simplex=on_simplex)
+        self.mu_encoder = nn.Linear(enc_size, n_latent).to(device)
+        self.mu_bn = nn.BatchNorm1d(n_latent, momentum = 0.8, eps=0.0001).to(device)
+        self.softmax = nn.Softmax(dim=1).to(device)
+        self.softplus = nn.Softplus().to(device)
+        self.on_simplex = on_simplex
+
 
-class GaussianDistribution(BaseDistribution):
+class GaussianDistribution(SimpleDistribution):
     """Gaussian latent distribution with diagnol co-variance.
 
     Parameters:
@@ -99,7 +107,7 @@ class GaussianDistribution(BaseDistribution):
 
 
 
-class GaussianUnitVarDistribution(BaseDistribution):
+class GaussianUnitVarDistribution(SimpleDistribution):
     """Gaussian latent distribution with fixed unit variance.
 
     Parameters:
@@ -142,7 +150,7 @@ class GaussianUnitVarDistribution(BaseDistribution):
         return mu
 
 
-class LogisticGaussianDistribution(BaseDistribution):
+class LogisticGaussianDistribution(SimpleDistribution):
     """Logistic normal/Gaussian latent distribution with specified prior
 
     Parameters:
@@ -199,7 +207,7 @@ class LogisticGaussianDistribution(BaseDistribution):
         return mu
 
 
-class VonMisesDistribution(BaseDistribution):
+class VonMisesDistribution(SimpleDistribution):
 
     def __init__(self, enc_size, n_latent, kappa=100.0, dr=0.1, device='cpu'):
         super(VonMisesDistribution, self).__init__(enc_size, n_latent, device, on_simplex=False)
@@ -239,7 +247,7 @@ class VonMisesDistribution(BaseDistribution):
 
 
 
-class Projection(BaseDistribution):
+class Projection(SimpleDistribution):
 
     def __init__(self, enc_size, n_latent, device='cpu'):
         super(Projection, self).__init__(enc_size, n_latent, device)
@@ -265,6 +273,85 @@ class Projection(BaseDistribution):
         return enc
 
 
-
+class ConceptLogisticGaussianDistribution(BaseDistribution):
+    """Sparse concept encoding with Logistic normal/Gaussian latent distribution with specified prior
+
+    Parameters:
+        n_latent (int): Dimentionality of the latent distribution
+        device (device): Torch computational context (cpu or gpu[id])
+        dr (float): Dropout value for dropout applied post sample. optional (default = 0.2)
+        alpha (float): Value the determines prior variance as 1/alpha - (2/n_latent) + 1/(n_latent^2)
+    """
+    def __init__(self, enc_size, n_latent, sparse_encoder: TopKEncoder, device='cpu', dr=0.1, alpha=1.0):
+        super(ConceptLogisticGaussianDistribution, self).__init__(enc_size, n_latent, device, on_simplex=True)
+        self.n_latent = n_latent
+        self.enc_size = enc_size
+        self.device = device
+        self.sparse_encoder = sparse_encoder.to(device)
+        self.n_concepts = sparse_encoder.get_dict_size()
+        self.sparse_to_mu = nn.Linear(self.n_concepts, n_latent).to(device)
+        self.sparse_bn = nn.BatchNorm1d(self.n_concepts, momentum=0.8, eps=0.0001).to(device)
+        self.mu_bn = nn.BatchNorm1d(n_latent, momentum = 0.8, eps=0.0001).to(device)
+        self.softmax = nn.Softmax(dim=1).to(device)
+        self.on_simplex = True
+        self.alpha = alpha
+
+        prior_var = 1 / self.alpha - (2.0 / n_latent) + 1 / (self.n_latent * self.n_latent)
+        self.prior_var = torch.tensor([prior_var], device=device)
+        self.prior_logvar = torch.tensor([math.log(prior_var)], device=device)
+
+        ## NOTE: the weights to model the log-variance are separate but the sparse encoder is shared
+        ## between the lv_encoder and mu_encoder (above)
+        self.sparse_to_lv = nn.Linear(self.n_concepts, n_latent).to(device)
+        self.lv_bn = nn.BatchNorm1d(n_latent, momentum = 0.8, eps=0.001).to(device)
+        self.post_sample_dr_o = nn.Dropout(dr)
+
+
+    def freeze_pre_encoder(self):
+        self.sparse_encoder.W_enc.requires_grad = False
+        self.sparse_encoder.b_enc.requires_grad = False
+
+    def unfreeze_pre_encoder(self):
+        self.sparse_encoder.W_enc.requires_grad = True
+        self.sparse_encoder.b_enc.requires_grad = True
 
+    def _get_kl_term(self, mu, lv):
+        posterior_var = torch.exp(lv)
+        delta = mu
+        dt = torch.div(delta * delta, self.prior_var)
+        v_div = torch.div(posterior_var, self.prior_var)
+        lv_div = self.prior_logvar - lv
+        return (0.5 * (torch.sum((v_div + dt + lv_div), 1) - self.n_latent)).to(self.device)
+
+    def forward(self, data, batch_size):
+        """Generate a sample according to the logistic Gaussian latent distribution given the encoder outputs
+        """
+        _, sparse, _, _, _ = self.sparse_encoder(data)
+        #sparse_bn = self.sparse_bn(sparse)
+        mu = self.sparse_to_mu(sparse)
+        mu_bn = self.mu_bn(mu)
+        lv = self.sparse_to_lv(sparse)
+        lv_bn = self.lv_bn(lv)
+        z_p = self._get_gaussian_sample(mu_bn, lv_bn, batch_size)
+        KL = self._get_kl_term(mu, lv)
+        z = self.post_sample_dr_o(z_p)
+        return self.softmax(z), KL
 
+    def get_sparse_encoding(self, data):
+        _, sparse, _, _, _ = self.sparse_encoder(data)
+        return sparse
+
+    def get_mu_encoding(self, data, include_bn=True, normalize=False):
+        """Provide the distribution mean as the natural result of running the full encoder
+
+        Parameters:
+            data (:class:`mxnet.ndarray.NDArray`): Output of pre-latent encoding layers
+        Returns:
+            encoding (:class:`mxnet.ndarray.NDArray`): Encoding vector representing unnormalized topic proportions
+        """
+        _, sparse, _, _, _ = self.sparse_encoder(data)
+        enc = self.sparse_to_mu(sparse)
+        if include_bn:
+            enc = self.mu_bn(enc)
+        mu = self.softmax(enc) if normalize else enc
+        return mu
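The prior variance and KL term used by the new `ConceptLogisticGaussianDistribution` follow the formula stated in its docstring (`1/alpha - 2/n_latent + 1/n_latent^2`). A standalone sketch of the same arithmetic in plain torch, with illustrative shapes and `alpha`:

```python
import torch

def kl_term(mu, lv, alpha=1.0):
    """Recompute the per-example KL term from the diff above for (mu, log-variance) batches."""
    n_latent = mu.shape[1]
    prior_var = 1.0 / alpha - 2.0 / n_latent + 1.0 / (n_latent * n_latent)
    prior_logvar = torch.log(torch.tensor(prior_var))
    posterior_var = torch.exp(lv)
    dt = (mu * mu) / prior_var            # squared mean scaled by prior variance
    v_div = posterior_var / prior_var     # variance ratio
    lv_div = prior_logvar - lv            # log-variance difference
    return 0.5 * (torch.sum(v_div + dt + lv_div, dim=1) - n_latent)

mu = torch.zeros(4, 20)   # batch of 4, n_latent = 20
lv = torch.zeros(4, 20)   # log-variance 0 => posterior variance 1
print(kl_term(mu, lv))    # KL against the alpha=1.0 prior
```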
tmnt/estimator.py CHANGED
@@ -21,6 +21,7 @@ from tmnt.modeling import BowVAEModel, SeqBowVED, BaseVAE
 from tmnt.modeling import CrossBatchCosineSimilarityLoss, GeneralizedSDMLLoss, MultiNegativeCrossEntropyLoss, MetricSeqBowVED, MetricBowVAEModel
 from tmnt.eval_npmi import EvaluateNPMI
 from tmnt.distribution import LogisticGaussianDistribution, BaseDistribution, GaussianDistribution, VonMisesDistribution
+from tmnt.utils.vocab import Vocab
 
 ## evaluation routines
 from torcheval.metrics import MultilabelAUPRC, MulticlassAUPRC
@@ -38,7 +39,6 @@ import pickle
 from typing import List, Tuple, Dict, Optional, Union, NoReturn
 
 import torch
-import torchtext
 from torch.utils.data import Dataset, DataLoader
 from tqdm import tqdm
 
@@ -249,7 +249,7 @@ class BaseBowEstimator(BaseEstimator):
                    device = device)
 
     @classmethod
-    def from_config(cls, config: Union[str, dict], vocabulary: Union[str, torchtext.vocab.Vocab],
+    def from_config(cls, config: Union[str, dict], vocabulary: Union[str, Vocab],
                     n_labels: int = 0,
                     coherence_coefficient: float = 8.0,
                     coherence_via_encoder: bool = False,
@@ -943,12 +943,13 @@ class SeqBowEstimator(BaseEstimator):
         self._bow_matrix = None
         self.entropy_loss_coef = entropy_loss_coef
         self.pool_encoder = pool_encoder
+        self.freeze_pre_encoder_weights = False
 
 
     @classmethod
     def from_config(cls,
                     config: Union[str, dict],
-                    vocabulary: torchtext.vocab.Vocab,
+                    vocabulary: Vocab,
                     log_interval: int = 1,
                     pretrained_param_file: Optional[str] = None,
                     n_labels: Optional[int] = None,
@@ -974,7 +975,7 @@ class SeqBowEstimator(BaseEstimator):
             raise Exception("Invalid Json Configuration File")
         ldist_def = config['latent_distribution']
         llm_model_name = config['llm_model_name']
-        model = torch.load(pretrained_param_file, map_location=device)
+        model = torch.load(pretrained_param_file, map_location=device, weights_only=False)
 
         latent_distribution = model.latent_distribution
         estimator = cls(llm_model_name = llm_model_name,
@@ -1006,13 +1007,16 @@ class SeqBowEstimator(BaseEstimator):
         config_file = os.path.join(model_dir, 'model.config')
         with open(config_file) as f:
             config = json.loads(f.read())
-        vocab = torch.load(vocab_file)
+        vocab = torch.load(vocab_file, weights_only=False)
         return cls.from_config(config,
                                vocabulary = vocab,
                                log_interval = log_interval,
                                pretrained_param_file = param_file,
                                device = device)
 
+    def freeze_pre_encoder(self):
+        self.freeze_pre_encoder_weights = True
+
 
     def _get_model_bias_initialize(self, train_data):
         model = self._get_model()
@@ -1030,6 +1034,7 @@ class SeqBowEstimator(BaseEstimator):
                          entropy_loss_coef=self.entropy_loss_coef,
                          dropout=self.classifier_dropout)
         return model
+
 
     def _get_config(self):
         config = {}
@@ -1185,8 +1190,10 @@ class SeqBowEstimator(BaseEstimator):
         if self.model is None or not self.warm_start:
             self.model = self._get_model_bias_initialize(train_data)
 
-        model = self.model
+        if self.freeze_pre_encoder_weights:
+            self.model.freeze_pre_encoder()
 
+        model = self.model
         accumulate = False
         v_res = None
 
@@ -1268,7 +1275,8 @@ class SeqBowEstimator(BaseEstimator):
                     update_loss_details(total_ls_2, elbo_ls_2, red_ls_2, None)
 
                 if not accumulate or (batch_id + 1) % accumulate == 0:
-                    torch.nn.utils.clip_grad.clip_grad_value_(model.llm.parameters(), 1.0)
+                    if not self.freeze_pre_encoder_weights:
+                        torch.nn.utils.clip_grad.clip_grad_value_(model.llm.parameters(), 1.0)
                     optimizer.step()
                     dec_optimizer.step()
                     lr_scheduler.step()
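`SeqBowEstimator.freeze_pre_encoder` is a deferred switch: it only records a flag, and training applies it to the model once the model exists, also skipping gradient-value clipping of the (now frozen) LLM parameters. A hypothetical call sequence, with the estimator assumed to be already configured:

```python
# Hypothetical sketch; `estimator` stands for an already-configured SeqBowEstimator.
estimator.freeze_pre_encoder()      # records freeze_pre_encoder_weights = True

# During training (per the hunks above), the flag has two effects:
#   1. self.model.freeze_pre_encoder() is invoked once the model is built, which turns off
#      requires_grad for every LLM parameter (see tmnt/modeling.py below);
#   2. torch.nn.utils.clip_grad.clip_grad_value_ is skipped for model.llm.parameters(),
#      since those parameters no longer receive gradients.
```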
tmnt/inference.py CHANGED
@@ -18,8 +18,9 @@ from tmnt.utils.recalibrate import recalibrate_scores
 from sklearn.datasets import load_svmlight_file
 from functools import partial
 from tmnt.data_loading import get_llm_tokenizer
-
 from typing import List, Tuple, Dict, Optional, Union, NoReturn
+from scipy.sparse import csr_matrix
+from tmnt.distribution import ConceptLogisticGaussianDistribution
 
 
 MAX_DESIGN_MATRIX = 250000000
@@ -347,6 +348,9 @@ class MetricSeqVEDInferencer(SeqVEDInferencer):
 
 
 
+
+
+
 
 
 
tmnt/modeling.py CHANGED
@@ -45,6 +45,9 @@ class BaseVAE(nn.Module):
         t_npmi_mat = torch.Tensor(npmi_mat).to(self.device)
         self.npmi_with_diversity_loss = NPMILossWithDiversity(t_npmi_mat, device=self.device, npmi_lambda=npmi_lambda, npmi_scale=npmi_scale)
 
+    def freeze_pre_encoder(self):
+        pass
+
     def get_ordered_terms(self):
         """
         Returns the top K terms for each topic based on sensitivity analysis. Terms whose
@@ -56,7 +59,6 @@ class BaseVAE(nn.Module):
         sorted_j = jacobian.argsort(dim=0, descending=True)
         return sorted_j.cpu().numpy()
 
-
     def get_topic_vectors(self):
         """
         Returns unnormalized topic vectors
@@ -126,7 +128,8 @@ class BowVAEModel(BaseVAE):
 
     def _init_weights(self, module):
         if isinstance(module, torch.nn.Linear):
-            torch.nn.init.xavier_uniform_(module.weight.data)
+            torch.nn.init.kaiming_uniform_(module.weight.data)
+            #torch.nn.init.xavier_uniform_(module.weight.data)
 
 
     def _get_encoder(self, dims, dr=0.1):
@@ -360,7 +363,7 @@ class CoherenceRegularizer(nn.Module):
 class BaseSeqBowVED(BaseVAE):
     def __init__(self,
                  llm,
-                 latent_dist,
+                 latent_dist: BaseDistribution,
                  num_classes=0,
                  dropout=0.0,
                  vocab_size=2000,
@@ -401,6 +404,11 @@ class BaseSeqBowVED(BaseVAE):
             return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
         else:
             return model_output.last_hidden_state[:,0,:]
+
+    def freeze_pre_encoder(self):
+        for p in self.llm.parameters():
+            p.requires_grad = False
+        self.latent_distribution.freeze_pre_encoder()
 
     def get_ordered_terms(self):
         """
@@ -447,6 +455,7 @@ class SeqBowVED(BaseSeqBowVED):
             self.classifier = torch.nn.Sequential()
             self.classifier.add_module("dr", nn.Dropout(self.dropout).to(self.device))
             self.classifier.add_module("l_out", nn.Linear(self.n_latent, self.num_classes).to(self.device))
+
 
     def forward(self, input_ids, attention_mask, bow=None): # pylint: disable=arguments-differ
         llm_output = self.llm(input_ids, attention_mask)
@@ -462,9 +471,8 @@ class SeqBowVED(BaseSeqBowVED):
             classifier_outputs = self.classifier(z_mu)
         else:
             classifier_outputs = None
-        redundancy_loss = entropy_loss
         ii_loss = self.add_npmi_and_diversity_loss(elbo)
-        redundancy_loss = entropy_loss #self.get_redundancy_penalty()
+        redundancy_loss = ii_loss #self.get_redundancy_penalty()
        return ii_loss, rec_loss, KL_loss, redundancy_loss, classifier_outputs
 
 
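`BaseSeqBowVED.freeze_pre_encoder` turns off `requires_grad` for the wrapped LLM and delegates to the latent distribution's own `freeze_pre_encoder`. A quick check of that effect; `model` here is assumed to be an already-built `SeqBowVED` whose latent distribution overrides `freeze_pre_encoder` (e.g. `ConceptLogisticGaussianDistribution` — the `SimpleDistribution` subclasses inherit the `raise NotImplemented` stubs):

```python
# Assumes `model` is an already-constructed SeqBowVED instance (see caveat above).
before = sum(p.numel() for p in model.parameters() if p.requires_grad)
model.freeze_pre_encoder()
after = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"trainable parameters: {before} -> {after}")   # LLM weights drop out of the count
assert all(not p.requires_grad for p in model.llm.parameters())
```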
tmnt/preprocess/vectorizer.py CHANGED
@@ -6,17 +6,12 @@ Copyright (c) 2019-2021 The MITRE Corporation.
 import io
 import os
 import json
-import torchtext
-from torchtext.vocab import vocab as build_vocab
+from tmnt.utils.vocab import Vocab, build_vocab
 import glob
 from multiprocessing import Pool, cpu_count
 from mantichora import mantichora
 from atpbar import atpbar
 import collections
-import threading
-import logging
-import threading
-import scipy
 import scipy.sparse as sp
 import numpy as np
 from queue import Queue
@@ -25,9 +20,14 @@ from sklearn.datasets import dump_svmlight_file
 from tmnt.preprocess import BasicTokenizer
 from typing import List, Dict, Optional, Any, Tuple
 from collections import OrderedDict
+from sklearn.utils import check_array
+from sklearn.preprocessing import normalize
+from sklearn.feature_extraction.text import TfidfTransformer
+from sklearn.utils.validation import FLOAT_DTYPES, check_is_fitted
+from sklearn.feature_extraction._stop_words import ENGLISH_STOP_WORDS
 
-__all__ = ['TMNTVectorizer']
 
+__all__ = ['TMNTVectorizer', 'CTFIDFVectorizer']
 
 class TMNTVectorizer(object):
 
@@ -57,10 +57,12 @@ class TMNTVectorizer(object):
     def __init__(self, text_key: str = 'body', label_key: Optional[str] = None, min_doc_size: int = 1,
                  label_remap: Optional[Dict[str,str]] = None,
                  json_out_dir: Optional[str] = None, vocab_size: int = 2000, file_pat: str = '*.json',
-                 encoding: str = 'utf-8', initial_vocabulary: Optional[torchtext.vocab.Vocab] = None,
+                 encoding: str = 'utf-8', initial_vocabulary: Optional[Vocab] = None,
                  additional_feature_keys: List[str] = None, stop_word_file: str = None,
                  split_char: str = ',',
                  max_ws_tokens: int = -1,
+                 source_key: Optional[str] = None,
+                 source_json: Optional[str] = None,
                  count_vectorizer_kwargs: Dict[str, Any] = {'max_df':0.95, 'min_df':0.0, 'stop_words':'english'}):
         self.encoding = encoding
         self.max_ws_tokens = max_ws_tokens
@@ -78,12 +80,53 @@ class TMNTVectorizer(object):
         self.cv_kwargs = self._update_count_vectorizer_args(count_vectorizer_kwargs, stop_word_file)
         if not 'token_pattern' in self.cv_kwargs:
             self.cv_kwargs['token_pattern'] = r'\b[A-Za-z][A-Za-z]+\b'
+        if source_key and source_json:
+            source_terms = self._get_source_specific_terms(source_json, 10, text_key, source_key,
+                                                           {'token_pattern': self.cv_kwargs['token_pattern'],
+                                                            'stop_words': self.cv_kwargs['stop_words'],
+                                                            'max_df': 1.0, 'min_df':0.0})
+            stop_words = set(source_terms)
+            stop_words.update(set(ENGLISH_STOP_WORDS))
+            self.cv_kwargs['stop_words'] = frozenset(stop_words)
         self.vectorizer = CountVectorizer(max_features=self.vocab_size,
                                           vocabulary=(initial_vocabulary.get_itos() if initial_vocabulary else None),
                                           **self.cv_kwargs)
         self.label_map = {}
 
 
+    def _get_source_specific_terms(self, json_file, k: int, text_key: str, source_key: str, cv_kwargs):
+        by_source = {}
+        with io.open(json_file) as fp:
+            for l in fp:
+                js = json.loads(l)
+                txt = js[text_key]
+                src = js[source_key]
+                if src not in by_source:
+                    by_source[src] = []
+                by_source[src].append(txt)
+        docs_by_source = [''.join(txts) for txts in by_source.values()]
+        print(cv_kwargs)
+        count_vectorizer = CountVectorizer(**cv_kwargs)
+        count = count_vectorizer.fit_transform(docs_by_source)
+        ctfidf = CTFIDFVectorizer().fit_transform(count)
+        tok_to_idx = list(count_vectorizer.vocabulary_.items())
+        tok_to_idx.sort(key = lambda x: x[1])
+        ordered_vocab = OrderedDict([ (k,1) for (k,_) in tok_to_idx ])
+        ovocab = build_vocab(ordered_vocab)
+        per_source_tokens = []
+        for i in range(ctfidf.shape[0]):
+            ts = ctfidf[i].toarray().squeeze()
+            per_source_tokens.append(ovocab.lookup_tokens((-ts).argsort()[:k]))
+        final_tokens_intersect = set(per_source_tokens[0])
+        final_tokens_union = set(per_source_tokens[0])
+        for src_tokens in per_source_tokens:
+            final_tokens_intersect.intersection_update(src_tokens)
+            final_tokens_union.update(src_tokens)
+        res = final_tokens_union - final_tokens_intersect
+        print("Removed terms = {}".format(res))
+        return final_tokens_union - final_tokens_intersect
+
+
 
     def _update_count_vectorizer_args(self, cv_kwargs: Dict[str, Any], stop_word_file: str) -> Dict[str, Any]:
         if stop_word_file:
@@ -113,11 +156,11 @@ class TMNTVectorizer(object):
         return list(set(wds))
 
 
-    def get_vocab(self) -> torchtext.vocab.Vocab:
-        """Returns the Torchtext vocabulary associated with the vectorizer
+    def get_vocab(self) -> Vocab:
+        """Returns the vocabulary associated with the vectorizer
 
         Returns:
-            Torchtext vocabulary
+            vocabulary
         """
         if self.vocab is not None:
             return self.vocab
@@ -375,3 +418,78 @@ class TMNTVectorizer(object):
         y = self._get_ys_dir(json_dir)
         return X, y
 
+
+
+class CTFIDFVectorizer(TfidfTransformer):
+    def __init__(self, *args, **kwargs):
+        super(CTFIDFVectorizer, self).__init__(*args, **kwargs)
+        self._idf_diag = None
+
+    def fit(self, X: sp.csr_matrix):
+        """Learn the idf vector (global term weights)
+
+        Parameters
+        ----------
+        X : sparse matrix of shape n_samples, n_features)
+            A matrix of term/token counts.
+
+        """
+
+        # Prepare input
+        X = check_array(X, accept_sparse=('csr', 'csc'))
+        if not sp.issparse(X):
+            X = sp.csr_matrix(X)
+        dtype = X.dtype if X.dtype in FLOAT_DTYPES else np.float64
+
+        # Calculate IDF scores
+        _, n_features = X.shape
+        df = np.squeeze(np.asarray(X.sum(axis=0)))
+        avg_nr_samples = int(X.sum(axis=1).mean())
+        idf = np.log(avg_nr_samples / df)
+        self._idf_diag = sp.diags(idf, offsets=0,
+                                  shape=(n_features, n_features),
+                                  format='csr',
+                                  dtype=dtype)
+        setattr(self, 'idf_', True)
+        return self
+
+    def transform(self, X: sp.csr_matrix, copy=True) -> sp.csr_matrix:
+        """Transform a count-based matrix to c-TF-IDF
+
+        Parameters
+        ----------
+        X : sparse matrix of (n_samples, n_features)
+            a matrix of term/token counts
+
+        Returns
+        -------
+        vectors : sparse matrix of shape (n_samples, n_features)
+
+        """
+
+        # Prepare input
+        X = check_array(X, accept_sparse='csr', dtype=FLOAT_DTYPES, copy=copy)
+        if not sp.issparse(X):
+            X = sp.csr_matrix(X, dtype=np.float64)
+
+        _, n_features = X.shape
+
+        # idf_ being a property, the automatic attributes detection
+        # does not work as usual and we need to specify the attribute
+        # name:
+        check_is_fitted(self, attributes=["idf_"],
+                        msg='idf vector is not fitted')
+
+        # Check if expected nr features is found
+        expected_n_features = self._idf_diag.shape[0]
+        if n_features != expected_n_features:
+            raise ValueError("Input has n_features=%d while the model"
+                             " has been trained with n_features=%d" % (
+                                 n_features, expected_n_features))
+
+        X = X * self._idf_diag
+
+        if self.norm:
+            X = normalize(X, axis=1, norm='l1', copy=False)
+
+        return X
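`CTFIDFVectorizer` implements the class-based TF-IDF (c-TF-IDF) recipe: concatenate each class's or source's documents into one pseudo-document, count terms, then weight each term by `log(average tokens per pseudo-document / total term frequency)`. A small end-to-end sketch mirroring how `_get_source_specific_terms` uses it, with toy data:

```python
from sklearn.feature_extraction.text import CountVectorizer
from tmnt.preprocess.vectorizer import CTFIDFVectorizer   # added in 0.7.58

# One concatenated pseudo-document per source/class (toy example).
docs_by_source = [
    "goal match striker goal referee match",
    "enzyme protein binding enzyme substrate",
]
cv = CountVectorizer()
counts = cv.fit_transform(docs_by_source)
ctfidf = CTFIDFVectorizer().fit_transform(counts)          # shape: (n_sources, n_terms)

# Top-k most source-specific terms per row, as in _get_source_specific_terms
vocab = cv.get_feature_names_out()
for row in range(ctfidf.shape[0]):
    scores = ctfidf[row].toarray().squeeze()
    print([vocab[i] for i in (-scores).argsort()[:3]])
```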
tmnt/utils/vocab.py ADDED
@@ -0,0 +1,126 @@
+import torch
+import torch.nn as nn
+from typing import Dict, List, Optional, Iterable, OrderedDict
+from collections import Counter
+
+class Vocab(nn.Module):
+    r"""Creates a vocab object which maps tokens to indices.
+
+    Args:
+        vocab (torch.classes.torchtext.Vocab or torchtext._torchtext.Vocab): a cpp vocab object.
+    """
+
+    def __init__(self, stoi: Dict):
+        super(Vocab, self).__init__()
+        self.stoi = stoi
+        self.itos = list(stoi.keys())
+
+    def forward(self, tokens: List[str]) -> List[int]:
+        r"""Calls the `lookup_indices` method
+
+        Args:
+            tokens: a list of tokens used to lookup their corresponding `indices`.
+
+        Returns:
+            The indices associated with a list of `tokens`.
+        """
+        return [self.stoi[t] for t in tokens]
+
+    def __len__(self) -> int:
+        r"""
+        Returns:
+            The length of the vocab.
+        """
+        return len(self.stoi)
+
+    def __contains__(self, token: str) -> bool:
+        r"""
+        Args:
+            token: The token for which to check the membership.
+
+        Returns:
+            Whether the token is member of vocab or not.
+        """
+        return self.stoi.__contains__(token)
+
+    def __getitem__(self, token: str) -> int:
+        r"""
+        Args:
+            token: The token used to lookup the corresponding index.
+
+        Returns:
+            The index corresponding to the associated token.
+        """
+        return self.stoi[token]
+
+    def insert_token(self, token: str, index: int) -> None:
+        r"""
+        Args:
+            token: The token used to lookup the corresponding index.
+            index: The index corresponding to the associated token.
+        Raises:
+            RuntimeError: If `index` is not in range [0, Vocab.size()] or if `token` already exists in the vocab.
+        """
+        if not token in self.stoi:
+            self.stoi[token] = index
+            self.itos[index] = token
+
+    def lookup_token(self, index: int) -> str:
+        r"""
+        Args:
+            index: The index corresponding to the associated token.
+
+        Returns:
+            token: The token used to lookup the corresponding index.
+
+        Raises:
+            RuntimeError: If `index` not in range [0, itos.size()).
+        """
+        return self.itos[index]
+
+    def lookup_tokens(self, indices: List[int]) -> List[str]:
+        r"""
+        Args:
+            indices: The `indices` used to lookup their corresponding`tokens`.
+
+        Returns:
+            The `tokens` associated with `indices`.
+
+        Raises:
+            RuntimeError: If an index within `indices` is not int range [0, itos.size()).
+        """
+        return [ self.itos[i] for i in indices]
+
+    def lookup_indices(self, tokens: List[str]) -> List[int]:
+        r"""
+        Args:
+            tokens: the tokens used to lookup their corresponding `indices`.
+
+        Returns:
+            The 'indices` associated with `tokens`.
+        """
+        return [ self.stoi[t] for t in tokens ]
+
+    def get_stoi(self) -> Dict[str, int]:
+        r"""
+        Returns:
+            Dictionary mapping tokens to indices.
+        """
+        return self.stoi
+
+    def get_itos(self) -> List[str]:
+        r"""
+        Returns:
+            List mapping indices to tokens.
+        """
+        return self.itos
+
+
+
+def build_vocab(
+        odict: OrderedDict
+) -> Vocab:
+    """
+    """
+    dict_by_position = dict(zip(odict.keys(), range(0,len(odict))))
+    return Vocab(dict_by_position)
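The new `tmnt.utils.vocab.Vocab` is a minimal stand-in for the parts of `torchtext.vocab.Vocab` that TMNT actually used (indexing, membership, `lookup_tokens`, `get_itos`). A behaviour sketch based directly on the code above:

```python
from collections import OrderedDict
from tmnt.utils.vocab import build_vocab

# build_vocab ignores the counts; insertion order of the OrderedDict gives the indices.
vocab = build_vocab(OrderedDict([("the", 1), ("cat", 1), ("sat", 1)]))
assert len(vocab) == 3
assert vocab["cat"] == 1 and "dog" not in vocab
assert vocab(["cat", "sat"]) == [1, 2]               # nn.Module __call__ -> lookup_indices
assert vocab.lookup_tokens([0, 2]) == ["the", "sat"]
assert vocab.get_itos() == ["the", "cat", "sat"]
```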
@@ -1,6 +1,6 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.4
 Name: tmnt
-Version: 0.7.56
+Version: 0.7.58
 Summary: Topic modeling neural toolkit
 Home-page: https://github.com/mitre/tmnt.git
 Author: The MITRE Corporation
@@ -14,6 +14,7 @@ Description-Content-Type: text/markdown
 License-File: LICENSE
 License-File: NOTICE
 Requires-Dist: optuna
+Requires-Dist: datasets
 Requires-Dist: mantichora>=0.9.5
 Requires-Dist: transformers[torch]
 Requires-Dist: torcheval
@@ -32,7 +33,17 @@ Requires-Dist: numba
 Requires-Dist: scipy==1.12.0
 Requires-Dist: tabulate>=0.8.7
 Requires-Dist: torch>=2.1.2
-Requires-Dist: torchtext>=0.13.0
+Dynamic: author
+Dynamic: author-email
+Dynamic: classifier
+Dynamic: description
+Dynamic: description-content-type
+Dynamic: home-page
+Dynamic: license
+Dynamic: license-file
+Dynamic: requires-dist
+Dynamic: requires-python
+Dynamic: summary
 
 The Topic Modeling Neural Toolkit (TMNT) is a software library that enables training
 topic models as neural network-based variational auto-encoders.
@@ -1,14 +1,14 @@
 tmnt/__init__.py,sha256=EPNq1H7UMyMewWT_zTGBaC7ZouvCywX_gMX4G1dtmvw,250
 tmnt/configuration.py,sha256=P8PEhzVPKO5xG0FrdTLRQ60OYWigbzPY-OSx_hzQlrY,10054
-tmnt/data_loading.py,sha256=vsAMyHGi3fuOFDmqo_zenNKOtVQiuqMHA-iPYWYpGKE,18873
-tmnt/distribution.py,sha256=Pmyc5gwDd_-jP7vLVb0vdNQaSSvF1EuiTZEWg3KfmI8,10866
-tmnt/estimator.py,sha256=htQ_JeUedEYWLPIBDbDhEL5deWtHiVNRKQN1528SybY,67751
+tmnt/data_loading.py,sha256=LcVcXX00UsuAillRPILcvmqj3AsCIgzB6V_S6lfsbIY,19335
+tmnt/distribution.py,sha256=4gn1wnszVAErzICCvZXSYki0G78WC3_jyBr27N-Aj3E,15108
+tmnt/estimator.py,sha256=KnnvSNXm6cRL0GwDrGdgqqPX5ZubpCQ0WqcSXJDkUU4,68072
 tmnt/eval_npmi.py,sha256=8S-IE-bEhtQofF6oKeXs7oaUeu-7yDlaEqjMj52gmNQ,6549
-tmnt/inference.py,sha256=da8qAnjTDTuWQfPEOQewOfgikqE00XT1xGMiO2mckI4,15679
-tmnt/modeling.py,sha256=O1V7ppU7J6pvESTvdEoV9BXbEF4Z-J1OHnRtszuagaA,29956
+tmnt/inference.py,sha256=Iwc2_w7QrS1epiVEm_Ewx5sYFNNMDfvhMJETOgJqm0E,15783
+tmnt/modeling.py,sha256=rGHQsW7ldycFUd1f9NzcnNuSRElr600vLwmYPl6YY0M,30215
 tmnt/preprocess/__init__.py,sha256=gwMejkQrnqKS05i0JVsUru2hDUR5jE1hKC10dL934GU,170
 tmnt/preprocess/tokenizer.py,sha256=-ZgowfbHrM040vbNTktZM_hdl6HDTqxSJ4mDAxq3dUs,14050
-tmnt/preprocess/vectorizer.py,sha256=RkdivqP76qAJDianV09lONad9NbfBVWLZgIbU_P1-zo,15796
+tmnt/preprocess/vectorizer.py,sha256=RaianZ_DG3Nc-RI96FtmI4PCZPi5Nipx9a5xndLZ52M,20689
 tmnt/utils/__init__.py,sha256=1PZsxRPsHI_DnOpxD0iAhLxhxHnx6Svzg3W-79YfWWs,237
 tmnt/utils/csv2json.py,sha256=A1TXy-uxA4dc9tw0tjiHzL7fv4C6b0Uc_bwI1keTmKU,795
 tmnt/utils/log_utils.py,sha256=ZtR4nF_Iee23ev935YQcTtXv-cCC7lgXkXLl_yokfS4,2075
@@ -17,9 +17,10 @@ tmnt/utils/ngram_helpers.py,sha256=VrIzou2oQHCLBLSWODDeikN3PYat1NqqvEeYQj_GhbA,1
 tmnt/utils/pubmed_utils.py,sha256=3sHwoun7vxb0GV-arhpXLMUbAZne0huAh9xQNy6H40E,1274
 tmnt/utils/random.py,sha256=qY75WG3peWoMh9pUyCPBEo6q8IvkF6VRjeb5CqJOBF8,327
 tmnt/utils/recalibrate.py,sha256=TmpB8An8bslICZ13UTJfIvr8VoqiSedtpHxec4n8CHk,1439
-tmnt-0.7.56.dist-info/LICENSE,sha256=qFZJrfJ7Zi4IXDiyiGVrHWic_l1h2tc36tI8Z7rK9bs,11356
-tmnt-0.7.56.dist-info/METADATA,sha256=jk7-JlrqxLTACr0LsMoLGXT0nq0VVQIkWFoFNqYlEPE,1436
-tmnt-0.7.56.dist-info/NOTICE,sha256=p0kYIVAkReTFaGb4C-qPa7h5ztze6hGzOpjCMMbOipU,425
-tmnt-0.7.56.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
-tmnt-0.7.56.dist-info/top_level.txt,sha256=RpYgUl187sXnqmiwKjZZdcDlHz2AALs6bGdUcukyd_E,5
-tmnt-0.7.56.dist-info/RECORD,,
+tmnt/utils/vocab.py,sha256=J6GFGLyvDgdmtVQjYlyzWjuykRD3kllCKPG1z0lI0P8,3504
+tmnt-0.7.58.dist-info/licenses/LICENSE,sha256=qFZJrfJ7Zi4IXDiyiGVrHWic_l1h2tc36tI8Z7rK9bs,11356
+tmnt-0.7.58.dist-info/licenses/NOTICE,sha256=p0kYIVAkReTFaGb4C-qPa7h5ztze6hGzOpjCMMbOipU,425
+tmnt-0.7.58.dist-info/METADATA,sha256=drdqhfVdpDs5LD_FMAMZjPRWw_TnNqFlGsh0QGtm8QE,1663
+tmnt-0.7.58.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
+tmnt-0.7.58.dist-info/top_level.txt,sha256=RpYgUl187sXnqmiwKjZZdcDlHz2AALs6bGdUcukyd_E,5
+tmnt-0.7.58.dist-info/RECORD,,
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (75.6.0)
+Generator: setuptools (78.1.0)
 Root-Is-Purelib: true
 Tag: py3-none-any
 