glam4cm 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. glam4cm/__init__.py +9 -0
  2. glam4cm/data_loading/__init__.py +0 -0
  3. glam4cm/data_loading/data.py +631 -0
  4. glam4cm/data_loading/encoding.py +76 -0
  5. glam4cm/data_loading/graph_dataset.py +940 -0
  6. glam4cm/data_loading/metadata.py +84 -0
  7. glam4cm/data_loading/models_dataset.py +361 -0
  8. glam4cm/data_loading/utils.py +20 -0
  9. glam4cm/downstream_tasks/__init__.py +0 -0
  10. glam4cm/downstream_tasks/bert_edge_classification.py +144 -0
  11. glam4cm/downstream_tasks/bert_graph_classification.py +137 -0
  12. glam4cm/downstream_tasks/bert_graph_classification_comp.py +156 -0
  13. glam4cm/downstream_tasks/bert_link_prediction.py +145 -0
  14. glam4cm/downstream_tasks/bert_node_classification.py +164 -0
  15. glam4cm/downstream_tasks/cm_gpt_edge_classification.py +73 -0
  16. glam4cm/downstream_tasks/cm_gpt_node_classification.py +76 -0
  17. glam4cm/downstream_tasks/cm_gpt_pretraining.py +64 -0
  18. glam4cm/downstream_tasks/common_args.py +160 -0
  19. glam4cm/downstream_tasks/create_dataset.py +51 -0
  20. glam4cm/downstream_tasks/gnn_edge_classification.py +106 -0
  21. glam4cm/downstream_tasks/gnn_graph_cls.py +101 -0
  22. glam4cm/downstream_tasks/gnn_link_prediction.py +109 -0
  23. glam4cm/downstream_tasks/gnn_node_classification.py +103 -0
  24. glam4cm/downstream_tasks/tf_idf_text_classification.py +22 -0
  25. glam4cm/downstream_tasks/utils.py +35 -0
  26. glam4cm/downstream_tasks/word2vec_text_classification.py +108 -0
  27. glam4cm/embeddings/__init__.py +0 -0
  28. glam4cm/embeddings/bert.py +72 -0
  29. glam4cm/embeddings/common.py +43 -0
  30. glam4cm/embeddings/fasttext.py +0 -0
  31. glam4cm/embeddings/tfidf.py +25 -0
  32. glam4cm/embeddings/w2v.py +41 -0
  33. glam4cm/encoding/__init__.py +0 -0
  34. glam4cm/encoding/common.py +0 -0
  35. glam4cm/encoding/encoders.py +100 -0
  36. glam4cm/graph2str/__init__.py +0 -0
  37. glam4cm/graph2str/common.py +34 -0
  38. glam4cm/graph2str/constants.py +15 -0
  39. glam4cm/graph2str/ontouml.py +141 -0
  40. glam4cm/graph2str/uml.py +0 -0
  41. glam4cm/lang2graph/__init__.py +0 -0
  42. glam4cm/lang2graph/archimate.py +31 -0
  43. glam4cm/lang2graph/bpmn.py +0 -0
  44. glam4cm/lang2graph/common.py +416 -0
  45. glam4cm/lang2graph/ecore.py +221 -0
  46. glam4cm/lang2graph/ontouml.py +169 -0
  47. glam4cm/lang2graph/utils.py +80 -0
  48. glam4cm/models/cmgpt.py +352 -0
  49. glam4cm/models/gnn_layers.py +273 -0
  50. glam4cm/models/hf.py +10 -0
  51. glam4cm/run.py +99 -0
  52. glam4cm/run_configs.py +126 -0
  53. glam4cm/settings.py +54 -0
  54. glam4cm/tokenization/__init__.py +0 -0
  55. glam4cm/tokenization/special_tokens.py +4 -0
  56. glam4cm/tokenization/utils.py +37 -0
  57. glam4cm/trainers/__init__.py +0 -0
  58. glam4cm/trainers/bert_classifier.py +105 -0
  59. glam4cm/trainers/cm_gpt_trainer.py +153 -0
  60. glam4cm/trainers/gnn_edge_classifier.py +126 -0
  61. glam4cm/trainers/gnn_graph_classifier.py +123 -0
  62. glam4cm/trainers/gnn_link_predictor.py +144 -0
  63. glam4cm/trainers/gnn_node_classifier.py +135 -0
  64. glam4cm/trainers/gnn_trainer.py +129 -0
  65. glam4cm/trainers/metrics.py +55 -0
  66. glam4cm/utils.py +194 -0
  67. glam4cm-0.1.0.dist-info/LICENSE +21 -0
  68. glam4cm-0.1.0.dist-info/METADATA +86 -0
  69. glam4cm-0.1.0.dist-info/RECORD +72 -0
  70. glam4cm-0.1.0.dist-info/WHEEL +5 -0
  71. glam4cm-0.1.0.dist-info/entry_points.txt +2 -0
  72. glam4cm-0.1.0.dist-info/top_level.txt +1 -0
glam4cm/run.py ADDED
@@ -0,0 +1,99 @@
+ import argparse
+ from glam4cm.downstream_tasks import (
+     bert_graph_classification_comp,
+     bert_graph_classification,
+     bert_node_classification,
+     bert_edge_classification,
+     bert_link_prediction,
+ 
+     gnn_graph_cls,
+     gnn_node_classification,
+     gnn_edge_classification,
+     gnn_link_prediction,
+     create_dataset,
+ )
+ 
+ from glam4cm.downstream_tasks import cm_gpt_pretraining
+ from glam4cm.downstream_tasks import cm_gpt_node_classification
+ from glam4cm.downstream_tasks import cm_gpt_edge_classification
+ from glam4cm.downstream_tasks.bert_graph_classification_comp import get_parser as bert_comp_parse_args
+ from glam4cm.downstream_tasks.bert_graph_classification import get_parser as bert_parse_args
+ from glam4cm.downstream_tasks.gnn_graph_cls import get_parser as gnn_parse_args
+ from glam4cm.downstream_tasks.create_dataset import get_parser as create_dataset_parse_args
+ from glam4cm.downstream_tasks.bert_link_prediction import get_parser as bert_lp_parse_args
+ from glam4cm.downstream_tasks.gnn_edge_classification import get_parser as gnn_ec_parse_args
+ from glam4cm.downstream_tasks.gnn_link_prediction import get_parser as gnn_lp_parse_args
+ from glam4cm.downstream_tasks.bert_edge_classification import get_parser as bert_ec_parse_args
+ from glam4cm.downstream_tasks.gnn_node_classification import get_parser as gnn_nc_parse_args
+ from glam4cm.downstream_tasks.bert_node_classification import get_parser as bert_nc_parse_args
+ from glam4cm.downstream_tasks.cm_gpt_pretraining import get_parser as cm_gpt_parse_args
+ from glam4cm.downstream_tasks.cm_gpt_node_classification import get_parser as cm_gpt_nc_parse_args
+ from glam4cm.downstream_tasks.cm_gpt_edge_classification import get_parser as cm_gpt_ec_parse_args
+ 
+ 
+ tasks = {
+     0: 'Create Dataset',
+ 
+     1: 'BERT Graph Classification Comparison',
+     2: 'BERT Graph Classification',
+     3: 'BERT Node Classification',
+     4: 'BERT Link Prediction',
+     5: 'BERT Edge Classification',
+ 
+     6: 'GNN Graph Classification',
+     7: 'GNN Node Classification',
+     8: 'GNN Edge Classification',
+     9: 'GNN Link Prediction',
+     10: 'CM-GPT Causal Modeling',
+     11: 'CM-GPT Node Classification',
+     12: 'CM-GPT Edge Classification'
+ }
+ 
+ 
+ tasks_handler_map = {
+     0: (create_dataset.run, create_dataset_parse_args),
+     1: (bert_graph_classification_comp.run, bert_comp_parse_args),
+     2: (bert_graph_classification.run, bert_parse_args),
+     3: (bert_node_classification.run, bert_nc_parse_args),
+     4: (bert_link_prediction.run, bert_lp_parse_args),
+     5: (bert_edge_classification.run, bert_ec_parse_args),
+     6: (gnn_graph_cls.run, gnn_parse_args),
+     7: (gnn_node_classification.run, gnn_nc_parse_args),
+     8: (gnn_edge_classification.run, gnn_ec_parse_args),
+     9: (gnn_link_prediction.run, gnn_lp_parse_args),
+     10: (cm_gpt_pretraining.run, cm_gpt_parse_args),
+     11: (cm_gpt_node_classification.run, cm_gpt_nc_parse_args),
+     12: (cm_gpt_edge_classification.run, cm_gpt_ec_parse_args)
+ }
+ 
+ 
+ if __name__ == '__main__':
+ 
+     main_parser = argparse.ArgumentParser(description="Train ML models on conceptual models")
+     task_options = "\n".join(f"{k}: {v}" for k, v in tasks.items())
+     main_parser.add_argument('--task_id', type=int, required=True, help=f'ID of the task to run. Options are: {task_options}', choices=list(tasks.keys()), default=0)
+     main_parser.add_argument('--th', '--task_help', action="store_true", help="Help for the task specified by --task_id")
+ 
+     args, remaining_args = main_parser.parse_known_args()
+ 
+     if not any(vars(args).values()):
+         print("No arguments provided. Please provide arguments to run the task.")
+         main_parser.print_help()
+         exit(1)
+ 
+     # If the task-specific help flag was passed, print the help of that task's parser
+     if args.th or any(x in remaining_args for x in ['-th', '--task_help']):
+         task_id = args.task_id
+         handler, task_parser = tasks_handler_map[task_id]
+         print("Help for task:", tasks[task_id])
+         task_parser().print_help()
+         exit(0)
+ 
+     print("Running GLAM4CM with:", vars(args))
+ 
+     task_id = args.task_id
+     handler, task_parser = tasks_handler_map[task_id]
+     task_args = task_parser().parse_args(remaining_args)
+     handler(task_args)
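As a usage sketch (not part of the package itself), the dispatcher above can be driven from the command line, e.g. python -m glam4cm.run --task_id 3 followed by the task-specific flags, or programmatically as below. The flags shown are the ones used by run_configs.py further down and assume the ecore_555 dataset has already been prepared under datasets/.

# Hypothetical programmatic equivalent of dispatching task 3 (BERT Node Classification).
from glam4cm.downstream_tasks import bert_node_classification

parser = bert_node_classification.get_parser()
args_str = ('--dataset=ecore_555 --num_epochs=5 --cls_label=abstract '
            '--min_edges=10 --train_batch_size=32')
task_args = parser.parse_args(args_str.split())
bert_node_classification.run(task_args)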
glam4cm/run_configs.py ADDED
@@ -0,0 +1,126 @@
+ import subprocess
+ 
+ from tqdm.auto import tqdm
+ 
+ 
+ tasks = {
+     0: 'Create Dataset',
+ 
+     1: 'BERT Graph Classification Comparison',
+     2: 'BERT Graph Classification',
+     3: 'BERT Node Classification',
+     4: 'BERT Link Prediction',
+     5: 'BERT Edge Classification',
+ 
+     6: 'GNN Graph Classification',
+     7: 'GNN Node Classification',
+     8: 'GNN Edge Classification',
+     9: 'GNN Link Prediction',
+ }
+ 
+ all_tasks = {
+     1: [
+         '--dataset=ecore_555 --num_epochs=5 --train_batch_size=2',
+         '--dataset=modelset --num_epochs=10 --train_batch_size=2',
+     ],
+ 
+     2: [
+         '--dataset=ecore_555 --num_epochs=5 --min_edges=10 --train_batch_size=2',
+         '--dataset=ecore_555 --num_epochs=5 --use_attributes --min_edges=10 --train_batch_size=2',
+         '--dataset=ecore_555 --num_epochs=5 --use_edge_types --min_edges=10 --train_batch_size=2',
+         '--dataset=ecore_555 --num_epochs=5 --use_attributes --use_edge_types --min_edges=10 --train_batch_size=2',
+         '--dataset=modelset --num_epochs=10 --min_edges=10 --train_batch_size=2',
+         '--dataset=modelset --num_epochs=10 --use_attributes --min_edges=10 --train_batch_size=2',
+         '--dataset=modelset --num_epochs=10 --use_edge_types --min_edges=10 --train_batch_size=2',
+         '--dataset=modelset --num_epochs=10 --use_attributes --use_edge_types --min_edges=10 --train_batch_size=2',
+     ],
+ 
+     3: [
+         '--dataset=ecore_555 --num_epochs=5 --cls_label=abstract --min_edges=10 --train_batch_size=32',
+         '--dataset=ecore_555 --num_epochs=5 --use_attributes --cls_label=abstract --train_batch_size=32 --min_edges=10',
+         '--dataset=ecore_555 --num_epochs=5 --use_edge_types --cls_label=abstract --train_batch_size=32 --min_edges=10',
+         '--dataset=ecore_555 --num_epochs=5 --use_attributes --use_edge_types --cls_label=abstract --train_batch_size=32 --min_edges=10',
+         '--dataset=modelset --num_epochs=10 --cls_label=abstract --train_batch_size=32 --min_edges=10',
+         '--dataset=modelset --num_epochs=10 --use_attributes --cls_label=abstract --train_batch_size=32 --min_edges=10',
+         '--dataset=modelset --num_epochs=10 --use_edge_types --cls_label=abstract --train_batch_size=32 --min_edges=10',
+         '--dataset=modelset --num_epochs=10 --use_attributes --use_edge_types --cls_label=abstract --train_batch_size=32 --min_edges=10',
+ 
+         '--dataset=mar-ecore-github --num_epochs=10 --use_attributes --use_edge_types --cls_label=abstract --train_batch_size=32 --min_edges=10',
+ 
+         '--dataset=eamodelset --num_epochs=15 --cls_label=type --train_batch_size=32 --min_edges=10',
+         '--dataset=eamodelset --num_epochs=15 --use_edge_types --cls_label=type --train_batch_size=32 --min_edges=10',
+         '--dataset=eamodelset --num_epochs=15 --cls_label=layer --train_batch_size=32 --min_edges=10',
+         '--dataset=eamodelset --num_epochs=15 --use_edge_types --cls_label=layer --train_batch_size=32 --min_edges=10',
+     ],
+ 
+     4: [
+         '--dataset=ecore_555 --num_epochs=3 --train_batch_size=32 --min_edges=10',
+         '--dataset=ecore_555 --num_epochs=3 --use_attributes --train_batch_size=32 --min_edges=10',
+         '--dataset=modelset --num_epochs=5 --train_batch_size=32 --min_edges=10 --reload',
+         '--dataset=modelset --num_epochs=5 --use_attributes --train_batch_size=32 --min_edges=10 --reload',
+ 
+         '--dataset=mar-ecore-github --num_epochs=5 --use_attributes --train_batch_size=32 --min_edges=10 --reload',
+         '--dataset=eamodelset --num_epochs=5 --train_batch_size=32 --min_edges=10 --reload',
+     ],
+ 
+     5: [
+         '--dataset=ecore_555 --num_epochs=5 --train_batch_size=32 --min_edges=10 --reload',
+         '--dataset=ecore_555 --num_epochs=5 --use_attributes --train_batch_size=32 --min_edges=10 --reload',
+         '--dataset=modelset --num_epochs=10 --train_batch_size=32 --min_edges=10 --reload',
+         '--dataset=modelset --num_epochs=10 --use_attributes --train_batch_size=32 --min_edges=10 --reload',
+         '--dataset=mar-ecore-github --num_epochs=10 --use_attributes --train_batch_size=32 --min_edges=10 --reload',
+         '--dataset=eamodelset --num_epochs=15 --train_batch_size=32 --min_edges=10 --reload',
+     ],
+     6: [
+         '--dataset=ecore_555 --num_epochs=200 --batch_size=32 --min_edges=10 --reload',
+         '--dataset=ecore_555 --num_epochs=200 --batch_size=32 --min_edges=10 --use_embeddings --ckpt=results/ecore_555/graph_cls_/10_att_0_nt_0/checkpoint-225 --reload',
+         '--dataset=ecore_555 --num_epochs=200 --batch_size=32 --min_edges=10 --use_embeddings --use_attributes --ckpt=results/ecore_555/graph_cls_/10_att_1_nt_0/checkpoint-225 --reload',
+         '--dataset=ecore_555 --num_epochs=200 --batch_size=32 --min_edges=10 --use_embeddings --use_edge_types --ckpt=results/ecore_555/graph_cls_/10_att_0_nt_1/checkpoint-225 --reload',
+         '--dataset=ecore_555 --num_epochs=200 --batch_size=32 --min_edges=10 --use_embeddings --use_attributes --use_edge_types --ckpt=results/ecore_555/graph_cls_/10_att_1_nt_1/checkpoint-225 --reload',
+ 
+         '--dataset=modelset --num_epochs=200 --batch_size=32 --min_edges=10 --reload',
+         '--dataset=modelset --num_epochs=200 --batch_size=32 --min_edges=10 --use_embeddings --ckpt=results/modelset/graph_cls_/10_att_0_nt_0/checkpoint-2540 --reload',
+         '--dataset=modelset --num_epochs=200 --batch_size=32 --min_edges=10 --use_embeddings --use_attributes --ckpt=results/modelset/graph_cls_/10_att_1_nt_0/checkpoint-2540 --reload',
+         '--dataset=modelset --num_epochs=200 --batch_size=32 --min_edges=10 --use_embeddings --use_edge_types --ckpt=results/modelset/graph_cls_/10_att_0_nt_1/checkpoint-2540 --reload',
+         '--dataset=modelset --num_epochs=200 --batch_size=32 --min_edges=10 --use_embeddings --use_attributes --use_edge_types --ckpt=results/modelset/graph_cls_/10_att_1_nt_1/checkpoint-2540 --reload',
+     ],
+     7: [
+         '--dataset=ecore_555 --num_epochs=200 --batch_size=32 --min_edges=10 --cls_label=abstract --reload',
+         '--dataset=ecore_555 --num_epochs=200 --batch_size=32 --min_edges=10 --cls_label=abstract --use_embeddings --ckpt=results/ecore_555/node_cls/abstract/abstract_10_att_0_nt_0/checkpoint-540 --reload',
+         '--dataset=ecore_555 --num_epochs=200 --batch_size=32 --min_edges=10 --cls_label=abstract --use_embeddings --use_attributes --ckpt=results/ecore_555/node_cls/abstract/abstract_10_att_1_nt_0/checkpoint-540 --reload',
+         '--dataset=ecore_555 --num_epochs=200 --batch_size=32 --min_edges=10 --cls_label=abstract --use_embeddings --use_edge_types --ckpt=results/ecore_555/node_cls/abstract/abstract_10_att_0_nt_1/checkpoint-540 --reload',
+         '--dataset=ecore_555 --num_epochs=200 --batch_size=32 --min_edges=10 --cls_label=abstract --use_embeddings --use_attributes --use_edge_types --ckpt=results/ecore_555/node_cls/abstract/abstract_10_att_1_nt_1/checkpoint-540 --reload',
+ 
+         '--dataset=modelset --num_epochs=200 --batch_size=32 --min_edges=10 --cls_label=abstract --reload',
+         '--dataset=modelset --num_epochs=200 --batch_size=32 --min_edges=10 --cls_label=abstract --use_embeddings --ckpt=results/modelset/node_cls/abstract/abstract_10_att_0_nt_0/checkpoint-6870 --reload',
+         '--dataset=modelset --num_epochs=200 --batch_size=32 --min_edges=10 --cls_label=abstract --use_embeddings --use_attributes --ckpt=results/modelset/node_cls/abstract/abstract_10_att_1_nt_0/checkpoint-6870 --reload',
+         '--dataset=modelset --num_epochs=200 --batch_size=32 --min_edges=10 --cls_label=abstract --use_embeddings --use_edge_types --ckpt=results/modelset/node_cls/abstract/abstract_10_att_0_nt_1/checkpoint-6870 --reload',
+         '--dataset=modelset --num_epochs=200 --batch_size=32 --min_edges=10 --cls_label=abstract --use_embeddings --use_attributes --use_edge_types --ckpt=results/modelset/node_cls/abstract/abstract_10_att_1_nt_1/checkpoint-6870 --reload',
+ 
+         '--dataset=mar-ecore-github --num_epochs=200 --batch_size=32 --min_edges=10 --cls_label=abstract --reload',
+         '--dataset=mar-ecore-github --num_epochs=200 --batch_size=32 --min_edges=10 --cls_label=abstract --use_embeddings --ckpt=results/mar-ecore-github/node_cls/abstract/abstract_10_att_0_nt_0/checkpoint-19400 --reload',
+         '--dataset=mar-ecore-github --num_epochs=200 --batch_size=32 --min_edges=10 --cls_label=abstract --use_embeddings --use_attributes --ckpt=results/mar-ecore-github/node_cls/abstract/abstract_10_att_1_nt_0/checkpoint-19400 --reload',
+         '--dataset=mar-ecore-github --num_epochs=200 --batch_size=32 --min_edges=10 --cls_label=abstract --use_embeddings --use_edge_types --ckpt=results/mar-ecore-github/node_cls/abstract/abstract_10_att_0_nt_1/checkpoint-19400 --reload',
+         '--dataset=mar-ecore-github --num_epochs=200 --batch_size=32 --min_edges=10 --cls_label=abstract --use_embeddings --use_attributes --use_edge_types --ckpt=results/mar-ecore-github/node_cls/abstract/abstract_10_att_1_nt_1/checkpoint-19400 --reload',
+ 
+         '--dataset=eamodelset --num_epochs=200 --batch_size=32 --min_edges=10 --cls_label=type --reload',
+         '--dataset=eamodelset --num_epochs=200 --batch_size=32 --min_edges=10 --cls_label=type --use_embeddings --ckpt=results/eamodelset/node_cls/layer/layer_10_att_0_nt_0/checkpoint-9570 --reload',
+         '--dataset=eamodelset --num_epochs=200 --batch_size=32 --min_edges=10 --cls_label=type --use_embeddings --use_edge_types --ckpt=results/eamodelset/node_cls/layer/layer_10_att_0_nt_1/checkpoint-9570 --reload',
+ 
+         '--dataset=eamodelset --num_epochs=200 --batch_size=32 --min_edges=10 --cls_label=type --reload',
+         '--dataset=eamodelset --num_epochs=200 --batch_size=32 --min_edges=10 --cls_label=type --use_embeddings --ckpt=results/eamodelset/node_cls/type/type_10_att_0_nt_0/checkpoint-9570 --reload',
+         '--dataset=eamodelset --num_epochs=200 --batch_size=32 --min_edges=10 --cls_label=type --use_embeddings --use_edge_types --ckpt=results/eamodelset/node_cls/type/type_10_att_0_nt_1/checkpoint-9570 --reload',
+     ]
+ }
+ 
+ allowed_tasks = [7]
+ 
+ for script_id in tqdm(allowed_tasks, desc='Running tasks'):
+     task = tasks[script_id]
+     for script in tqdm(all_tasks[script_id], desc=f'Running scripts for {task}'):
+         script += f' --task_id={script_id} '
+         print(f'Running {script}')
+ 
+         subprocess.run(f'python run.py {script}', shell=True)
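For clarity, one iteration of the loop above amounts to a single run.py invocation; a rough equivalent for the first task-7 configuration would be:

# Equivalent single invocation for the first GNN Node Classification config above.
import subprocess

subprocess.run(
    'python run.py --dataset=ecore_555 --num_epochs=200 --batch_size=32 '
    '--min_edges=10 --cls_label=abstract --reload --task_id=7',
    shell=True
)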
glam4cm/settings.py ADDED
@@ -0,0 +1,54 @@
+ import os
+ import torch
+ 
+ import logging
+ 
+ logger = logging.getLogger(__name__)
+ logger.setLevel(logging.DEBUG)
+ 
+ 
+ BERT_MODEL = 'bert-base-uncased'
+ WORD2VEC_MODEL = 'word2vec'
+ TFIDF_MODEL = 'tfidf'
+ FAST_TEXT_MODEL = 'uml-fasttext.bin'
+ 
+ W2V_CONFIG = dict(
+     epoch=100,
+     dim=128,
+     ws=5,
+     minCount=1,
+     thread=4,
+     model='skipgram'
+ )
+ 
+ device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+ torch.set_float32_matmul_precision('high')
+ 
+ 
+ seed = 42
+ datasets_dir = 'datasets'
+ ecore_json_path = os.path.join(datasets_dir, 'ecore_555/ecore_555.jsonl')
+ mar_json_path = os.path.join(datasets_dir, 'mar-ecore-github/ecore-github.jsonl')
+ modelsets_uml_json_path = os.path.join(datasets_dir, 'modelset/uml.jsonl')
+ modelsets_ecore_json_path = os.path.join(datasets_dir, 'modelset/ecore.jsonl')
+ 
+ 
+ graph_data_dir = 'datasets/graph_data'
+ 
+ # Path: settings.py
+ 
+ 
+ LP_TASK_EDGE_CLS = 'edge_cls'
+ LP_TASK_LINK_PRED = 'lp'
+ 
+ 
+ EPOCH = 'epoch'
+ LOSS = 'loss'
+ TRAIN_LOSS = 'train_loss'
+ TEST_LOSS = 'test_loss'
+ TEST_ACC = 'test_acc'
+ 
+ TRAINING_PHASE = 'train'
+ VALIDATION_PHASE = 'val'
+ TESTING_PHASE = 'test'
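A small, hypothetical sketch of how these settings are consumed elsewhere in the package; the call sites are illustrative and not taken from this diff:

# Illustrative only: pick up the model name, device and a dataset path from settings.
from glam4cm.settings import BERT_MODEL, device, ecore_json_path
from glam4cm.tokenization.utils import get_tokenizer

tokenizer = get_tokenizer(BERT_MODEL, use_special_tokens=True)
print(f"Dataset: {ecore_json_path}, device: {device}, vocab size: {len(tokenizer)}")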
glam4cm/tokenization/special_tokens.py ADDED
@@ -0,0 +1,4 @@
+ EDGE_START = '<edge_begin>'
+ EDGE_END = '<edge_end>'
+ NODE_BEGIN = '<node_begin>'
+ NODE_END = '<node_end>'
glam4cm/tokenization/utils.py ADDED
@@ -0,0 +1,37 @@
+ from re import finditer
+ from glam4cm.tokenization.special_tokens import (
+     EDGE_START, EDGE_END, NODE_BEGIN, NODE_END
+ )
+ from transformers import AutoTokenizer
+ 
+ 
+ def get_special_tokens():
+     return {
+         'additional_special_tokens': [EDGE_START, EDGE_END, NODE_BEGIN, NODE_END]
+     }
+ 
+ 
+ def get_tokenizer(model_name, use_special_tokens=False, max_length=512) -> AutoTokenizer:
+     print(f"Loading tokenizer for {model_name}")
+     tokenizer = AutoTokenizer.from_pretrained(model_name)
+     if tokenizer.pad_token is None:
+         tokenizer.pad_token = tokenizer.eos_token
+ 
+     if use_special_tokens:
+         tokenizer.add_special_tokens(get_special_tokens())
+ 
+     tokenizer.model_max_length = max_length
+     return tokenizer
+ 
+ 
+ def camel_case_split(identifier) -> list:
+     # Split a CamelCase identifier into its constituent words
+     matches = finditer('.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)', identifier)
+     return [m.group(0) for m in matches]
+ 
+ 
+ def doc_tokenizer(doc, lower=False) -> str:
+     words = doc.split()
+     # split on underscores
+     words = [w2 for w1 in words for w2 in w1.split('_') if w2 != '']
+     # split camel case
+     words = [w2.lower() if lower else w2 for w1 in words for w2 in camel_case_split(w1) if w2 != '']
+     return " ".join(words)
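To make the splitting behaviour concrete, a short example of what doc_tokenizer returns, with the expected output shown in comments:

# doc_tokenizer splits on whitespace, then underscores, then camel case.
from glam4cm.tokenization.utils import doc_tokenizer

print(doc_tokenizer('EClassifier_name'))                        # E Classifier name
print(doc_tokenizer('orderDate shippingAddress', lower=True))   # order date shipping address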
glam4cm/trainers/bert_classifier.py ADDED
@@ -0,0 +1,105 @@
+ from collections import Counter
+ import numpy as np
+ from sklearn.model_selection import StratifiedKFold, train_test_split
+ from transformers import (
+     Trainer,
+     TrainingArguments
+ )
+ from transformers import (
+     AutoModelForSequenceClassification,
+     AutoTokenizer
+ )
+ from glam4cm.data_loading.encoding import EncodingDataset
+ from glam4cm.settings import device
+ from sklearn.preprocessing import LabelEncoder
+ from glam4cm.trainers.metrics import compute_metrics
+ 
+ 
+ class BertTrainer:
+     def __init__(
+         self,
+         model_name,
+         ckpt=None,
+         max_length=512
+     ):
+         self.model_name = model_name
+         self.ckpt = ckpt
+         self.max_length = max_length
+ 
+ 
+     def train(
+         self,
+         texts,
+         labels,
+         test_ratio=0.2,
+         kfold=False,
+         num_train_epochs=15,
+         train_batch_size=2,
+         eval_batch_size=128,
+         weight_decay=0.01,
+         logging_steps=50,
+         eval_steps=50,
+         save_steps=50,
+         learning_rate=5e-5,
+         warmup_steps=500,
+         output_dir='./results',
+         logs_dir='./logs',
+         seed=42
+     ):
+         def train_fold():
+             print(f'Train: {len(X_train)}, Test: {len(X_test)}')
+             print("Class distribution in train: ", Counter(y_train))
+             print("Class distribution in test: ", Counter(y_test))
+ 
+             tokenizer = AutoTokenizer.from_pretrained(self.model_name if not self.ckpt else self.ckpt)
+             model = AutoModelForSequenceClassification.from_pretrained(self.model_name, num_labels=num_classes)
+             model.to(device)
+ 
+             train_ds = EncodingDataset(tokenizer, X_train, y_train, max_length=self.max_length)
+             test_ds = EncodingDataset(tokenizer, X_test, y_test, max_length=self.max_length)
+ 
+             training_args = TrainingArguments(
+                 output_dir=output_dir,
+                 num_train_epochs=num_train_epochs,
+                 eval_strategy="steps",
+                 per_device_train_batch_size=train_batch_size,
+                 per_device_eval_batch_size=eval_batch_size,
+                 warmup_steps=warmup_steps,
+                 weight_decay=weight_decay,
+                 learning_rate=learning_rate,
+                 logging_dir=logs_dir,
+                 logging_steps=logging_steps,
+                 eval_steps=eval_steps,
+                 save_steps=save_steps,
+                 save_total_limit=2,
+                 load_best_model_at_end=True,
+                 fp16=True
+             )
+ 
+             trainer = Trainer(
+                 model=model,
+                 args=training_args,
+                 train_dataset=train_ds,
+                 eval_dataset=test_ds,
+                 compute_metrics=compute_metrics
+             )
+ 
+             trainer.train()
+             results = trainer.evaluate()
+             print(results)
+ 
+ 
+         y = LabelEncoder().fit_transform(labels)
+         num_classes = len(set(y))
+         if kfold > 0:
+             k = int(1 / test_ratio)
+             kfold = StratifiedKFold(n_splits=k, shuffle=True, random_state=seed)
+             n = len(texts)
+             for fold, (train_idx, test_idx) in enumerate(kfold.split(np.zeros(n), y)):
+                 X_train, y_train = [texts[i] for i in train_idx], [y[i] for i in train_idx]
+                 X_test, y_test = [texts[i] for i in test_idx], [y[i] for i in test_idx]
+                 print("Fold number: ", fold + 1)
+                 train_fold()
+         else:
+             X_train, X_test, y_train, y_test = train_test_split(texts, y, test_size=test_ratio, random_state=seed)
+             train_fold()
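A minimal usage sketch for BertTrainer, assuming texts and labels have already been extracted from one of the model datasets; the tiny in-line lists are placeholders for that data:

# Hypothetical example: in the package, texts/labels come from the data_loading modules.
from glam4cm.trainers.bert_classifier import BertTrainer
from glam4cm.settings import BERT_MODEL

texts = ['class Order date total', 'class Customer name address']   # placeholder documents
labels = ['abstract', 'concrete']                                    # placeholder labels

trainer = BertTrainer(BERT_MODEL, max_length=512)
trainer.train(texts, labels, test_ratio=0.2, num_train_epochs=1, train_batch_size=2)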
glam4cm/trainers/cm_gpt_trainer.py ADDED
@@ -0,0 +1,153 @@
+ import os
+ import time
+ from typing import Union
+ from torch.utils.data import Dataset
+ from tensorboardX import SummaryWriter
+ from torch.utils.data import DataLoader
+ from tqdm.auto import tqdm
+ import torch.nn as nn
+ 
+ from glam4cm.models.cmgpt import CMGPT, CMGPTClassifier
+ import torch
+ from glam4cm.settings import device
+ 
+ 
+ from glam4cm.trainers.metrics import compute_classification_metrics
+ 
+ 
+ class CMGPTTrainer:
+     def __init__(
+         self,
+         model: Union[CMGPT, CMGPTClassifier],
+         train_dataset: Dataset,
+         test_dataset: Dataset,
+         batch_size: int = 32,
+ 
+         lr: float = 1e-5,
+         num_epochs: int = 10,
+         log_dir: str = 'logs',
+         results_dir: str = 'results/cmgpt',
+         compute_metrics: callable = None
+     ):
+         self.model = model
+         self.model.to(device)
+ 
+         # self.model = torch.compile(self.model)
+ 
+         self.optimizer = torch.optim.AdamW(self.model.parameters(), lr=lr)
+         self.scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(self.optimizer, T_max=num_epochs)
+         self.results_dir = results_dir
+         os.makedirs(results_dir, exist_ok=True)
+         self.writer = SummaryWriter(log_dir=log_dir)
+ 
+         self.dataloaders = {
+             'train': DataLoader(train_dataset, batch_size=batch_size, shuffle=True),
+             'test': DataLoader(test_dataset, batch_size=batch_size, shuffle=False),
+         }
+ 
+         self.num_epochs = num_epochs
+         if not compute_metrics and isinstance(self.model, CMGPTClassifier):
+             self.compute_metrics = compute_classification_metrics
+         else:
+             self.compute_metrics = compute_metrics
+ 
+         print(f"Number of parameters: {sum(p.numel() for p in self.model.parameters() if p.requires_grad) / 1000000:.3f}M")
+ 
+ 
+     def step(self, batch, idx=None):
+         # B, T = batch['input_ids'].shape
+         # t0 = time.time()
+         self.optimizer.zero_grad()
+         logits, loss = self.model(
+             batch['input_ids'].to(device),
+             batch['attention_mask'].to(device),
+             batch['labels'].to(device)
+         )
+ 
+         loss.backward()
+         self.optimizer.step()
+         # torch.cuda.synchronize()
+         # t1 = time.time()
+         # dt = (t1 - t0) * 1000
+         # tokens_per_sec = B * T / (t1 - t0)
+         # if idx is not None:
+         #     print(f"Batch: {idx}, Loss: {loss.item()}, Time: {dt} ms, Tokens/s: {tokens_per_sec}")
+         # else:
+         #     print(f"Loss: {loss.item()}, Time: {dt} ms, Tokens/s: {tokens_per_sec}")
+         # if idx > 100:
+         #     print("Breaking")
+         #     exit()
+         return logits, loss
+ 
+ 
+     def train(self):
+         for epoch in tqdm(range(self.num_epochs), desc='Training Epoch'):
+             self.model.train()
+             train_loss = 0
+             all_preds, all_labels = list(), list()
+             for i, batch in tqdm(enumerate(self.dataloaders['train']), desc='Training Batches', total=len(self.dataloaders['train'])):
+                 logits, loss = self.step(batch, i)
+                 train_loss += loss.item()
+ 
+                 self.writer.add_scalar('loss/train', loss.item(), epoch * len(self.dataloaders['train']) + i)
+ 
+                 if self.compute_metrics is not None:
+                     all_preds.append(logits.detach().cpu())
+                     all_labels.append(batch['labels'].cpu())
+                 # break
+             print("Train loss: ", train_loss / len(self.dataloaders['train']))
+ 
+             # if self.compute_metrics is not None:
+             #     all_preds = torch.cat(all_preds, dim=0)
+             #     all_labels = torch.cat(all_labels, dim=0)
+             #     metrics = self.compute_metrics(all_preds, all_labels)
+             #     for key, value in metrics.items():
+             #         self.writer.add_scalar(key, value, epoch)
+             #     # print("Train Metrics: ", metrics)
+ 
+             self.test(epoch)
+             self.scheduler.step()
+ 
+ 
+     def test(self, epoch=None):
+         self.model.eval()
+         test_loss = 0
+         all_preds, all_labels = list(), list()
+         for i, batch in tqdm(enumerate(self.dataloaders['test']), desc='Testing Batches', total=len(self.dataloaders['test'])):
+             logits, loss = self.model(
+                 batch['input_ids'].to(device),
+                 batch['attention_mask'].to(device),
+                 batch['labels'].to(device)
+             )
+             test_loss += loss.item()
+ 
+             if self.compute_metrics is not None:
+                 all_preds.append(logits.detach().cpu())
+                 all_labels.append(batch['labels'].cpu())
+ 
+             if epoch is not None:
+                 self.writer.add_scalar('loss/test', test_loss / len(self.dataloaders['test']), epoch)
+             # break
+ 
+         print("Test loss: ", test_loss / len(self.dataloaders['test']))
+ 
+         if self.compute_metrics is not None:
+             all_preds = torch.cat(all_preds, dim=0)
+             all_labels = torch.cat(all_labels, dim=0)
+             metrics = self.compute_metrics(all_preds, all_labels)
+             for key, value in metrics.items():
+                 self.writer.add_scalar(key, value, epoch)
+ 
+             print("Test Metrics: ", metrics)
+ 
+ 
+     def save_model(self):
+         if isinstance(self.model, CMGPT):
+             path = f'{self.results_dir}/cmgpt.pth'
+         elif isinstance(self.model, CMGPTClassifier):
+             path = f'{self.results_dir}/cmgpt-classifier.pth'
+         torch.save(self.model.state_dict(), path)
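Finally, a self-contained sketch of how CMGPTTrainer is wired up. The real models are CMGPT / CMGPTClassifier from glam4cm.models.cmgpt and the datasets come from glam4cm.data_loading.encoding; the toy module and dataset below are stand-ins that only mimic the interface the trainer relies on: forward(input_ids, attention_mask, labels) returning (logits, loss), and batches carrying 'input_ids', 'attention_mask' and 'labels'.

# Stand-in model and dataset (assumptions for illustration, not part of glam4cm).
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from glam4cm.trainers.cm_gpt_trainer import CMGPTTrainer

class ToyModel(nn.Module):
    def __init__(self, vocab_size=128, num_labels=2):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, 32)
        self.head = nn.Linear(32, num_labels)
        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask, labels):
        # Mean-pool token embeddings, classify, and return (logits, loss) as the trainer expects.
        logits = self.head(self.embed(input_ids).mean(dim=1))
        return logits, self.loss_fn(logits, labels)

class ToyDataset(Dataset):
    def __len__(self):
        return 64

    def __getitem__(self, idx):
        return {
            'input_ids': torch.randint(0, 128, (16,)),
            'attention_mask': torch.ones(16, dtype=torch.long),
            'labels': torch.tensor(idx % 2),
        }

trainer = CMGPTTrainer(ToyModel(), ToyDataset(), ToyDataset(), batch_size=8, num_epochs=1)
trainer.train()   # runs test() at the end of each epoch and logs losses to TensorBoard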