tmnt 0.7.54b20240816__py3-none-any.whl → 0.7.56__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tmnt/data_loading.py CHANGED
@@ -41,13 +41,14 @@ llm_catalog = {
  'allenai/scibert_scivocab_uncased': (AutoTokenizer.from_pretrained, AutoModel.from_pretrained),
  'johngiorgi/declutr-sci-base': (AutoTokenizer.from_pretrained, AutoModel.from_pretrained),
  'BAAI/bge-base-en-v1.5': (AutoTokenizer.from_pretrained, AutoModel.from_pretrained),
- 'pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb': (AutoTokenizer.from_pretrained, AutoModel.from_pretrained)
- ## add more model options here if desired
+ 'pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb': (AutoTokenizer.from_pretrained, AutoModel.from_pretrained),
+ 'Alibaba-NLP/gte-base-en-v1.5': (AutoTokenizer.from_pretrained, AutoModel.from_pretrained)
+ ## add more model options here ...
  }

  def get_llm(model_name):
  tok_fn, model_fn = llm_catalog.get(model_name, ((AutoTokenizer.from_pretrained, AutoModel.from_pretrained)))
- return tok_fn(model_name), model_fn(model_name)
+ return tok_fn(model_name), model_fn(model_name, trust_remote_code=True)

  def get_llm_tokenizer(model_name):
  tok_fn, model_fn = llm_catalog.get(model_name, ((AutoTokenizer.from_pretrained, AutoModel.from_pretrained)))
@@ -55,7 +56,7 @@ def get_llm_tokenizer(model_name):

  def get_llm_model(model_name):
  tok_fn, model_fn = llm_catalog.get(model_name, ((AutoTokenizer.from_pretrained, AutoModel.from_pretrained)))
- return model_fn(model_name)
+ return model_fn(model_name, trust_remote_code=True)

  def get_unwrapped_llm_dataloader(data, bow_vectorizer, llm_name, label_map, batch_size, max_len, shuffle=False, device='cpu'):
  label_pipeline = lambda x: label_map.get(x, 0)
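
The data_loading.py changes add the Alibaba-NLP/gte-base-en-v1.5 encoder to llm_catalog and pass trust_remote_code=True when instantiating models; that flag is needed for models such as gte-base-en-v1.5 that ship custom modeling code on the Hugging Face Hub. A minimal sketch of the loading path after this change (the embedding step is illustrative, not part of TMNT):

    from transformers import AutoModel, AutoTokenizer
    import torch

    # Equivalent to what get_llm does after this change: trust_remote_code=True
    # lets AutoModel import the custom modeling code the model repository ships.
    model_name = 'Alibaba-NLP/gte-base-en-v1.5'
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name, trust_remote_code=True)

    # Illustrative usage only: embed a short document with the [CLS] vector.
    inputs = tokenizer("neural topic models", return_tensors='pt', truncation=True)
    with torch.no_grad():
        embedding = model(**inputs).last_hidden_state[:, 0]
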
tmnt/estimator.py CHANGED
@@ -285,16 +285,11 @@ class BaseBowEstimator(BaseEstimator):
  logging.error("File {} does not appear to be a valid vocabulary file".format(vocabulary))
  raise Exception("Invalid Json Configuration File")
  vocabulary = torchtext.vocab.vocab(voc_js)
- #if vocabulary['embedding'] is not None:
- if False:
- raise Exception("Pre-trained embeddings not yet (re-)supported")
- #emb_size = vocabulary['embedding'].idx_to_vec[0].size
- else:
- emb_size = config['embedding'].get('size')
- if not emb_size:
- emb_size = config['derived_info'].get('embedding_size')
- if not emb_size:
- raise Exception("Embedding size must be provided as the 'size' attribute of 'embedding' or as 'derived_info.embedding_size'")
+ emb_size = config['embedding'].get('size')
+ if not emb_size:
+ emb_size = config['derived_info'].get('embedding_size')
+ if not emb_size:
+ raise Exception("Embedding size must be provided as the 'size' attribute of 'embedding' or as 'derived_info.embedding_size'")
  gamma = config.get('gamma', 1.0)
  multilabel = config.get('multilabel', False)
  lr = config['lr']
@@ -781,12 +776,6 @@ class BowMetricEstimator(BowEstimator):
  def _get_model(self, bow_size=-1):
  if self.embedding_source != 'random':
  e_type, e_name = tuple(self.embedding_source.split(':'))
- #pt_embedding = nlp.embedding.create(e_type, source=e_name)
- #self.vocabulary.set_embedding(pt_embedding)
- #emb_size = len(self.vocabulary.embedding.idx_to_vec[0])
- #for word in self.vocabulary.embedding._idx_to_token:
- # if (self.vocabulary.embedding[word] == mx.nd.zeros(emb_size)).sum() == emb_size:
- # self.vocabulary.embedding[word] = mx.nd.random.normal(0, 0.1, emb_size)
  else:
  emb_size = self.embedding_size
  model = \
@@ -1030,7 +1019,6 @@ class SeqBowEstimator(BaseEstimator):
  tr_bow_counts = self._get_bow_wd_counts(train_data)
  model.initialize_bias_terms(tr_bow_counts)
  if self.npmi_matrix is not None:
- print("****** INITIALIZING NPMI LOSS FUNCTION *******")
  model.initialize_npmi_loss(self.npmi_matrix)
  return model

@@ -1057,7 +1045,6 @@ class SeqBowEstimator(BaseEstimator):
  else:
  config['latent_distribution'] = {'dist_type':'gaussian'}
  config['epochs'] = self.epochs
- #config['embedding_source'] = self.embedding_source
  config['gamma'] = self.gamma
  config['warmup_ratio'] = self.warmup_ratio
  config['llm_model_name'] = self.llm_model_name
@@ -1091,9 +1078,6 @@ class SeqBowEstimator(BaseEstimator):
  log_interval, epoch_id, learning_rate):
  """Generate and print out the log message for training. """
  if self.has_classifier:
- #metric_nm, metric_val = self.metric.compute()
- #if not isinstance(metric_nm, list):
- # metric_nm, metric_val = [metric_nm], [metric_val]
  metric_nm = "AUPRC"
  try:
  metric_val = self.metric.compute()
@@ -1126,7 +1110,6 @@ class SeqBowEstimator(BaseEstimator):
  rows = 0
  for i, data in enumerate(dataloader):
  seqs, = data
- #bow_batch = list(seqs[3].squeeze(axis=1))
  bow_batch = list(seqs[3])
  rows += len(bow_batch)
  if i >= max_rows:
@@ -1170,10 +1153,7 @@ class SeqBowEstimator(BaseEstimator):
  label_ls = label_ls.mean()
  total_ls = (self.gamma * label_ls) + elbo_ls.mean()
  if not self.multilabel:
- #label_ind = label.argmax(dim=0)
- #self.metric.update([out], [label_ind])
  self.metric.update(torch.tensor(out), torch.tensor(label))
- #self.metric.update(torch.Tensor([out]), torch.Tensor([label_ind]))
  else:
  self.metric.update([out], [label])
  else:
@@ -1214,7 +1194,6 @@ class SeqBowEstimator(BaseEstimator):
  joint_loader = PairedDataLoader(train_data, aux_data)
  num_train_steps = len(joint_loader) * self.epochs

- ## The following from HuggingFace trainer.py lines 1047 to 1063
  decay_parameters = get_parameter_names(model.llm, ALL_LAYERNORM_LAYERS)
  decay_parameters = [name for name in decay_parameters if "bias" not in name]
  non_llm_parameters = [name for name,_ in model.named_parameters() if not name.startswith("llm")]
@@ -1288,10 +1267,8 @@ class SeqBowEstimator(BaseEstimator):
  if aux_batch is not None:
  update_loss_details(total_ls_2, elbo_ls_2, red_ls_2, None)

- #debug
-
  if not accumulate or (batch_id + 1) % accumulate == 0:
- #torch.nn.utils.clip_grad.clip_grad_value_(model.llm.parameters(), 1.0)
+ torch.nn.utils.clip_grad.clip_grad_value_(model.llm.parameters(), 1.0)
  optimizer.step()
  dec_optimizer.step()
  lr_scheduler.step()
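
The final estimator.py hunk enables value clipping of the LLM encoder gradients once per (possibly accumulated) batch, immediately before the optimizer steps. A schematic PyTorch training step showing where such a call sits (the loss computation and optimizer names are placeholders, not TMNT's actual loop):

    # Schematic only: value-clip encoder gradients before stepping.
    loss = compute_loss(batch)                     # placeholder loss computation
    loss.backward()
    if not accumulate or (batch_id + 1) % accumulate == 0:
        # Clamp each gradient element of the LLM encoder to [-1.0, 1.0].
        torch.nn.utils.clip_grad_value_(model.llm.parameters(), 1.0)
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
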
tmnt/eval_npmi.py CHANGED
@@ -115,6 +115,25 @@ class EvaluateNPMI(object):
  npmi = (log10(n_docs) + log10(bigram_cnt) - log10(unigram_1) - log10(unigram_2)) / (log10(n_docs) - log10(bigram_cnt) + 1e-4)
  npmi_matrix[w1, w2] = npmi
  return npmi_matrix
+
+ class EvaluateNPMIUmass(object):
+
+ def __init__(self, npmi_matrix: np.array, vectorizer: TMNTVectorizer):
+ self.vectorizer = vectorizer
+ self.npmi_matrix = npmi_matrix # by convention this will be lower-triangular
+ dim = npmi_matrix.shape[0]
+ for mc in range(self.npmi_matrix.shape[0]):
+ for i in range(mc+1,dim):
+ self.npmi_matrix[mc,i] = self.npmi_matrix[i,mc]
+
+ def evaluate_topics(self, topic_ids):
+ npmi_score = 0.0
+ total_size = len(topic_ids) * len(topic_ids[0])
+ for topic in topic_ids:
+ for (w1, w2) in combinations(topic):
+ npmi_score += self.npmi_matrix[w1, w2]
+ return npmi_score / total_size
+


  class FullNPMI(object):
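
The new EvaluateNPMIUmass class stores a lower-triangular NPMI matrix, mirrors it into a full symmetric matrix, and scores a set of topics by averaging NPMI over within-topic word pairs. A standalone sketch of that scoring logic, assuming pairs are enumerated with itertools.combinations(topic, 2) and that topic word ids index the same vocabulary the matrix was built from (all values and names below are illustrative):

    from itertools import combinations
    import numpy as np

    # Illustrative stand-ins: a 4-word vocabulary and two 2-word topics.
    npmi_matrix = np.zeros((4, 4))
    npmi_matrix[1, 0] = 0.3          # lower-triangular by convention
    npmi_matrix[3, 2] = 0.5
    topic_ids = [[0, 1], [2, 3]]

    # Symmetrize, then average NPMI over within-topic word pairs,
    # normalizing by topics x words-per-topic as evaluate_topics does.
    sym = npmi_matrix + npmi_matrix.T
    total_size = len(topic_ids) * len(topic_ids[0])
    score = sum(sym[w1, w2] for t in topic_ids for w1, w2 in combinations(t, 2)) / total_size
    print(score)                     # (0.3 + 0.5) / 4 = 0.2
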
tmnt-0.7.54b20240816.dist-info/METADATA → tmnt-0.7.56.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: tmnt
- Version: 0.7.54b20240816
+ Version: 0.7.56
  Summary: Topic modeling neural toolkit
  Home-page: https://github.com/mitre/tmnt.git
  Author: The MITRE Corporation
tmnt-0.7.54b20240816.dist-info/RECORD → tmnt-0.7.56.dist-info/RECORD CHANGED
@@ -1,9 +1,9 @@
  tmnt/__init__.py,sha256=EPNq1H7UMyMewWT_zTGBaC7ZouvCywX_gMX4G1dtmvw,250
  tmnt/configuration.py,sha256=P8PEhzVPKO5xG0FrdTLRQ60OYWigbzPY-OSx_hzQlrY,10054
- tmnt/data_loading.py,sha256=A0tsM6x61BGhYBV6rAYdryz2NwbR__8EAYj_Q4Z-DCs,18736
+ tmnt/data_loading.py,sha256=vsAMyHGi3fuOFDmqo_zenNKOtVQiuqMHA-iPYWYpGKE,18873
  tmnt/distribution.py,sha256=Pmyc5gwDd_-jP7vLVb0vdNQaSSvF1EuiTZEWg3KfmI8,10866
- tmnt/estimator.py,sha256=bPyLx4rmVe4mC9ciEq7uluONTD2y1enUluAkmw-TPBI,69095
- tmnt/eval_npmi.py,sha256=DTW9dNHVe6H57gndQIZ4gX9EghuBstwznA3YBqILJk0,5820
+ tmnt/estimator.py,sha256=htQ_JeUedEYWLPIBDbDhEL5deWtHiVNRKQN1528SybY,67751
+ tmnt/eval_npmi.py,sha256=8S-IE-bEhtQofF6oKeXs7oaUeu-7yDlaEqjMj52gmNQ,6549
  tmnt/inference.py,sha256=da8qAnjTDTuWQfPEOQewOfgikqE00XT1xGMiO2mckI4,15679
  tmnt/modeling.py,sha256=O1V7ppU7J6pvESTvdEoV9BXbEF4Z-J1OHnRtszuagaA,29956
  tmnt/preprocess/__init__.py,sha256=gwMejkQrnqKS05i0JVsUru2hDUR5jE1hKC10dL934GU,170
@@ -17,9 +17,9 @@ tmnt/utils/ngram_helpers.py,sha256=VrIzou2oQHCLBLSWODDeikN3PYat1NqqvEeYQj_GhbA,1
  tmnt/utils/pubmed_utils.py,sha256=3sHwoun7vxb0GV-arhpXLMUbAZne0huAh9xQNy6H40E,1274
  tmnt/utils/random.py,sha256=qY75WG3peWoMh9pUyCPBEo6q8IvkF6VRjeb5CqJOBF8,327
  tmnt/utils/recalibrate.py,sha256=TmpB8An8bslICZ13UTJfIvr8VoqiSedtpHxec4n8CHk,1439
- tmnt-0.7.54b20240816.dist-info/LICENSE,sha256=qFZJrfJ7Zi4IXDiyiGVrHWic_l1h2tc36tI8Z7rK9bs,11356
- tmnt-0.7.54b20240816.dist-info/METADATA,sha256=1ZRQz3VNnyvQ_TQ5shUIXlhpYWu2-U-garbxibQclig,1445
- tmnt-0.7.54b20240816.dist-info/NOTICE,sha256=p0kYIVAkReTFaGb4C-qPa7h5ztze6hGzOpjCMMbOipU,425
- tmnt-0.7.54b20240816.dist-info/WHEEL,sha256=HiCZjzuy6Dw0hdX5R3LCFPDmFS4BWl8H-8W39XfmgX4,91
- tmnt-0.7.54b20240816.dist-info/top_level.txt,sha256=RpYgUl187sXnqmiwKjZZdcDlHz2AALs6bGdUcukyd_E,5
- tmnt-0.7.54b20240816.dist-info/RECORD,,
+ tmnt-0.7.56.dist-info/LICENSE,sha256=qFZJrfJ7Zi4IXDiyiGVrHWic_l1h2tc36tI8Z7rK9bs,11356
+ tmnt-0.7.56.dist-info/METADATA,sha256=jk7-JlrqxLTACr0LsMoLGXT0nq0VVQIkWFoFNqYlEPE,1436
+ tmnt-0.7.56.dist-info/NOTICE,sha256=p0kYIVAkReTFaGb4C-qPa7h5ztze6hGzOpjCMMbOipU,425
+ tmnt-0.7.56.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+ tmnt-0.7.56.dist-info/top_level.txt,sha256=RpYgUl187sXnqmiwKjZZdcDlHz2AALs6bGdUcukyd_E,5
+ tmnt-0.7.56.dist-info/RECORD,,
tmnt-0.7.54b20240816.dist-info/WHEEL → tmnt-0.7.56.dist-info/WHEEL CHANGED
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: setuptools (72.2.0)
+ Generator: setuptools (75.6.0)
  Root-Is-Purelib: true
  Tag: py3-none-any