glam4cm 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. glam4cm/__init__.py +9 -0
  2. glam4cm/data_loading/__init__.py +0 -0
  3. glam4cm/data_loading/data.py +631 -0
  4. glam4cm/data_loading/encoding.py +76 -0
  5. glam4cm/data_loading/graph_dataset.py +940 -0
  6. glam4cm/data_loading/metadata.py +84 -0
  7. glam4cm/data_loading/models_dataset.py +361 -0
  8. glam4cm/data_loading/utils.py +20 -0
  9. glam4cm/downstream_tasks/__init__.py +0 -0
  10. glam4cm/downstream_tasks/bert_edge_classification.py +144 -0
  11. glam4cm/downstream_tasks/bert_graph_classification.py +137 -0
  12. glam4cm/downstream_tasks/bert_graph_classification_comp.py +156 -0
  13. glam4cm/downstream_tasks/bert_link_prediction.py +145 -0
  14. glam4cm/downstream_tasks/bert_node_classification.py +164 -0
  15. glam4cm/downstream_tasks/cm_gpt_edge_classification.py +73 -0
  16. glam4cm/downstream_tasks/cm_gpt_node_classification.py +76 -0
  17. glam4cm/downstream_tasks/cm_gpt_pretraining.py +64 -0
  18. glam4cm/downstream_tasks/common_args.py +160 -0
  19. glam4cm/downstream_tasks/create_dataset.py +51 -0
  20. glam4cm/downstream_tasks/gnn_edge_classification.py +106 -0
  21. glam4cm/downstream_tasks/gnn_graph_cls.py +101 -0
  22. glam4cm/downstream_tasks/gnn_link_prediction.py +109 -0
  23. glam4cm/downstream_tasks/gnn_node_classification.py +103 -0
  24. glam4cm/downstream_tasks/tf_idf_text_classification.py +22 -0
  25. glam4cm/downstream_tasks/utils.py +35 -0
  26. glam4cm/downstream_tasks/word2vec_text_classification.py +108 -0
  27. glam4cm/embeddings/__init__.py +0 -0
  28. glam4cm/embeddings/bert.py +72 -0
  29. glam4cm/embeddings/common.py +43 -0
  30. glam4cm/embeddings/fasttext.py +0 -0
  31. glam4cm/embeddings/tfidf.py +25 -0
  32. glam4cm/embeddings/w2v.py +41 -0
  33. glam4cm/encoding/__init__.py +0 -0
  34. glam4cm/encoding/common.py +0 -0
  35. glam4cm/encoding/encoders.py +100 -0
  36. glam4cm/graph2str/__init__.py +0 -0
  37. glam4cm/graph2str/common.py +34 -0
  38. glam4cm/graph2str/constants.py +15 -0
  39. glam4cm/graph2str/ontouml.py +141 -0
  40. glam4cm/graph2str/uml.py +0 -0
  41. glam4cm/lang2graph/__init__.py +0 -0
  42. glam4cm/lang2graph/archimate.py +31 -0
  43. glam4cm/lang2graph/bpmn.py +0 -0
  44. glam4cm/lang2graph/common.py +416 -0
  45. glam4cm/lang2graph/ecore.py +221 -0
  46. glam4cm/lang2graph/ontouml.py +169 -0
  47. glam4cm/lang2graph/utils.py +80 -0
  48. glam4cm/models/cmgpt.py +352 -0
  49. glam4cm/models/gnn_layers.py +273 -0
  50. glam4cm/models/hf.py +10 -0
  51. glam4cm/run.py +99 -0
  52. glam4cm/run_configs.py +126 -0
  53. glam4cm/settings.py +54 -0
  54. glam4cm/tokenization/__init__.py +0 -0
  55. glam4cm/tokenization/special_tokens.py +4 -0
  56. glam4cm/tokenization/utils.py +37 -0
  57. glam4cm/trainers/__init__.py +0 -0
  58. glam4cm/trainers/bert_classifier.py +105 -0
  59. glam4cm/trainers/cm_gpt_trainer.py +153 -0
  60. glam4cm/trainers/gnn_edge_classifier.py +126 -0
  61. glam4cm/trainers/gnn_graph_classifier.py +123 -0
  62. glam4cm/trainers/gnn_link_predictor.py +144 -0
  63. glam4cm/trainers/gnn_node_classifier.py +135 -0
  64. glam4cm/trainers/gnn_trainer.py +129 -0
  65. glam4cm/trainers/metrics.py +55 -0
  66. glam4cm/utils.py +194 -0
  67. glam4cm-0.1.0.dist-info/LICENSE +21 -0
  68. glam4cm-0.1.0.dist-info/METADATA +86 -0
  69. glam4cm-0.1.0.dist-info/RECORD +72 -0
  70. glam4cm-0.1.0.dist-info/WHEEL +5 -0
  71. glam4cm-0.1.0.dist-info/entry_points.txt +2 -0
  72. glam4cm-0.1.0.dist-info/top_level.txt +1 -0
glam4cm/run.py ADDED
@@ -0,0 +1,99 @@
+ import argparse
+ from glam4cm.downstream_tasks import (
+     bert_graph_classification_comp,
+     bert_graph_classification,
+     bert_node_classification,
+     bert_edge_classification,
+     bert_link_prediction,
+ 
+     gnn_graph_cls,
+     gnn_node_classification,
+     gnn_edge_classification,
+     gnn_link_prediction,
+     create_dataset,
+ )
+ 
+ from glam4cm.downstream_tasks import cm_gpt_pretraining
+ from glam4cm.downstream_tasks import cm_gpt_node_classification
+ from glam4cm.downstream_tasks import cm_gpt_edge_classification
+ from glam4cm.downstream_tasks.bert_graph_classification_comp import get_parser as bert_comp_parse_args
+ from glam4cm.downstream_tasks.bert_graph_classification import get_parser as bert_parse_args
+ from glam4cm.downstream_tasks.gnn_graph_cls import get_parser as gnn_parse_args
+ from glam4cm.downstream_tasks.create_dataset import get_parser as create_dataset_parse_args
+ from glam4cm.downstream_tasks.bert_link_prediction import get_parser as bert_lp_parse_args
+ from glam4cm.downstream_tasks.gnn_edge_classification import get_parser as gnn_ec_parse_args
+ from glam4cm.downstream_tasks.gnn_link_prediction import get_parser as gnn_lp_parse_args
+ from glam4cm.downstream_tasks.bert_edge_classification import get_parser as bert_ec_parse_args
+ from glam4cm.downstream_tasks.gnn_node_classification import get_parser as gnn_nc_parse_args
+ from glam4cm.downstream_tasks.bert_node_classification import get_parser as bert_nc_parse_args
+ from glam4cm.downstream_tasks.cm_gpt_pretraining import get_parser as cm_gpt_parse_args
+ from glam4cm.downstream_tasks.cm_gpt_node_classification import get_parser as cm_gpt_nc_parse_args
+ from glam4cm.downstream_tasks.cm_gpt_edge_classification import get_parser as cm_gpt_ec_parse_args
+ 
+ 
+ tasks = {
+     0: 'Create Dataset',
+ 
+     1: 'BERT Graph Classification Comparison',
+     2: 'BERT Graph Classification',
+     3: 'BERT Node Classification',
+     4: 'BERT Link Prediction',
+     5: 'BERT Edge Classification',
+ 
+     6: 'GNN Graph Classification',
+     7: 'GNN Node Classification',
+     8: 'GNN Edge Classification',
+     9: 'GNN Link Prediction',
+     10: 'CM-GPT Causal Modeling',
+     11: 'CM-GPT Node Classification',
+     12: 'CM-GPT Edge Classification'
+ }
+ 
+ 
+ tasks_handler_map = {
+     0: (create_dataset.run, create_dataset_parse_args),
+     1: (bert_graph_classification_comp.run, bert_comp_parse_args),
+     2: (bert_graph_classification.run, bert_parse_args),
+     3: (bert_node_classification.run, bert_nc_parse_args),
+     4: (bert_link_prediction.run, bert_lp_parse_args),
+     5: (bert_edge_classification.run, bert_ec_parse_args),
+     6: (gnn_graph_cls.run, gnn_parse_args),
+     7: (gnn_node_classification.run, gnn_nc_parse_args),
+     8: (gnn_edge_classification.run, gnn_ec_parse_args),
+     9: (gnn_link_prediction.run, gnn_lp_parse_args),
+     10: (cm_gpt_pretraining.run, cm_gpt_parse_args),
+     11: (cm_gpt_node_classification.run, cm_gpt_nc_parse_args),
+     12: (cm_gpt_edge_classification.run, cm_gpt_ec_parse_args)
+ }
+ 
+ 
+ if __name__ == '__main__':
+ 
+     main_parser = argparse.ArgumentParser(description="Train ML models on conceptual models")
+     task_options = "\n".join(f"{k}: {v}" for k, v in tasks.items())
+     main_parser.add_argument('--task_id', type=int, required=True, help=f'ID of the task to run. Options are: {task_options}', choices=list(tasks.keys()), default=0)
+     main_parser.add_argument('--th', '--task_help', action="store_true", help="Help for the task specified by --task_id")
+ 
+     args, remaining_args = main_parser.parse_known_args()
+ 
+     if not any(vars(args).values()):
+         print("No arguments provided. Please provide arguments to run the task.")
+         main_parser.print_help()
+         exit(1)
+ 
+     # If the task-specific help flag was passed, print the help of that task's parser
+     if args.th or any(x in remaining_args for x in ['-th', '--task_help']):
+         task_id = args.task_id
+         handler, task_parser = tasks_handler_map[task_id]
+         print("Help for task:", tasks[task_id])
+         task_parser().print_help()
+         exit(0)
+ 
+     print("Running GLAM4CM with:", vars(args))
+ 
+     task_id = args.task_id
+     handler, task_parser = tasks_handler_map[task_id]
+     task_args = task_parser().parse_args(remaining_args)
+     handler(task_args)
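As a usage sketch (not part of the package itself), the dispatcher above can be driven from the command line, e.g. python -m glam4cm.run --task_id 3 followed by the task-specific flags, or programmatically as below. The flags shown are the ones used by run_configs.py further down and assume the ecore_555 dataset has already been prepared under datasets/.

# Hypothetical programmatic equivalent of dispatching task 3 (BERT Node Classification).
from glam4cm.downstream_tasks import bert_node_classification

parser = bert_node_classification.get_parser()
args_str = ('--dataset=ecore_555 --num_epochs=5 --cls_label=abstract '
            '--min_edges=10 --train_batch_size=32')
task_args = parser.parse_args(args_str.split())
bert_node_classification.run(task_args)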
glam4cm/run_configs.py ADDED
@@ -0,0 +1,126 @@
+ import subprocess
+ 
+ from tqdm.auto import tqdm
+ 
+ 
+ tasks = {
+     0: 'Create Dataset',
+ 
+     1: 'BERT Graph Classification Comparison',
+     2: 'BERT Graph Classification',
+     3: 'BERT Node Classification',
+     4: 'BERT Link Prediction',
+     5: 'BERT Edge Classification',
+ 
+     6: 'GNN Graph Classification',
+     7: 'GNN Node Classification',
+     8: 'GNN Edge Classification',
+     9: 'GNN Link Prediction',
+ }
+ 
+ all_tasks = {
+     1: [
+         '--dataset=ecore_555 --num_epochs=5 --train_batch_size=2',
+         '--dataset=modelset --num_epochs=10 --train_batch_size=2',
+     ],
+ 
+     2: [
+         '--dataset=ecore_555 --num_epochs=5 --min_edges=10 --train_batch_size=2',
+         '--dataset=ecore_555 --num_epochs=5 --use_attributes --min_edges=10 --train_batch_size=2',
+         '--dataset=ecore_555 --num_epochs=5 --use_edge_types --min_edges=10 --train_batch_size=2',
+         '--dataset=ecore_555 --num_epochs=5 --use_attributes --use_edge_types --min_edges=10 --train_batch_size=2',
+         '--dataset=modelset --num_epochs=10 --min_edges=10 --train_batch_size=2',
+         '--dataset=modelset --num_epochs=10 --use_attributes --min_edges=10 --train_batch_size=2',
+         '--dataset=modelset --num_epochs=10 --use_edge_types --min_edges=10 --train_batch_size=2',
+         '--dataset=modelset --num_epochs=10 --use_attributes --use_edge_types --min_edges=10 --train_batch_size=2',
+     ],
+ 
+     3: [
+         '--dataset=ecore_555 --num_epochs=5 --cls_label=abstract --min_edges=10 --train_batch_size=32',
+         '--dataset=ecore_555 --num_epochs=5 --use_attributes --cls_label=abstract --train_batch_size=32 --min_edges=10',
+         '--dataset=ecore_555 --num_epochs=5 --use_edge_types --cls_label=abstract --train_batch_size=32 --min_edges=10',
+         '--dataset=ecore_555 --num_epochs=5 --use_attributes --use_edge_types --cls_label=abstract --train_batch_size=32 --min_edges=10',
+         '--dataset=modelset --num_epochs=10 --cls_label=abstract --train_batch_size=32 --min_edges=10',
+         '--dataset=modelset --num_epochs=10 --use_attributes --cls_label=abstract --train_batch_size=32 --min_edges=10',
+         '--dataset=modelset --num_epochs=10 --use_edge_types --cls_label=abstract --train_batch_size=32 --min_edges=10',
+         '--dataset=modelset --num_epochs=10 --use_attributes --use_edge_types --cls_label=abstract --train_batch_size=32 --min_edges=10',
+ 
+         '--dataset=mar-ecore-github --num_epochs=10 --use_attributes --use_edge_types --cls_label=abstract --train_batch_size=32 --min_edges=10',
+ 
+         '--dataset=eamodelset --num_epochs=15 --cls_label=type --train_batch_size=32 --min_edges=10',
+         '--dataset=eamodelset --num_epochs=15 --use_edge_types --cls_label=type --train_batch_size=32 --min_edges=10',
+         '--dataset=eamodelset --num_epochs=15 --cls_label=layer --train_batch_size=32 --min_edges=10',
+         '--dataset=eamodelset --num_epochs=15 --use_edge_types --cls_label=layer --train_batch_size=32 --min_edges=10',
+     ],
+ 
+     4: [
+         '--dataset=ecore_555 --num_epochs=3 --train_batch_size=32 --min_edges=10',
+         '--dataset=ecore_555 --num_epochs=3 --use_attributes --train_batch_size=32 --min_edges=10',
+         '--dataset=modelset --num_epochs=5 --train_batch_size=32 --min_edges=10 --reload',
+         '--dataset=modelset --num_epochs=5 --use_attributes --train_batch_size=32 --min_edges=10 --reload',
+ 
+         '--dataset=mar-ecore-github --num_epochs=5 --use_attributes --train_batch_size=32 --min_edges=10 --reload',
+         '--dataset=eamodelset --num_epochs=5 --train_batch_size=32 --min_edges=10 --reload',
+     ],
+ 
+     5: [
+         '--dataset=ecore_555 --num_epochs=5 --train_batch_size=32 --min_edges=10 --reload',
+         '--dataset=ecore_555 --num_epochs=5 --use_attributes --train_batch_size=32 --min_edges=10 --reload',
+         '--dataset=modelset --num_epochs=10 --train_batch_size=32 --min_edges=10 --reload',
+         '--dataset=modelset --num_epochs=10 --use_attributes --train_batch_size=32 --min_edges=10 --reload',
+         '--dataset=mar-ecore-github --num_epochs=10 --use_attributes --train_batch_size=32 --min_edges=10 --reload',
+         '--dataset=eamodelset --num_epochs=15 --train_batch_size=32 --min_edges=10 --reload',
+     ],
+     6: [
+         '--dataset=ecore_555 --num_epochs=200 --batch_size=32 --min_edges=10 --reload',
+         '--dataset=ecore_555 --num_epochs=200 --batch_size=32 --min_edges=10 --use_embeddings --ckpt=results/ecore_555/graph_cls_/10_att_0_nt_0/checkpoint-225 --reload',
+         '--dataset=ecore_555 --num_epochs=200 --batch_size=32 --min_edges=10 --use_embeddings --use_attributes --ckpt=results/ecore_555/graph_cls_/10_att_1_nt_0/checkpoint-225 --reload',
+         '--dataset=ecore_555 --num_epochs=200 --batch_size=32 --min_edges=10 --use_embeddings --use_edge_types --ckpt=results/ecore_555/graph_cls_/10_att_0_nt_1/checkpoint-225 --reload',
+         '--dataset=ecore_555 --num_epochs=200 --batch_size=32 --min_edges=10 --use_embeddings --use_attributes --use_edge_types --ckpt=results/ecore_555/graph_cls_/10_att_1_nt_1/checkpoint-225 --reload',
+ 
+         '--dataset=modelset --num_epochs=200 --batch_size=32 --min_edges=10 --reload',
+         '--dataset=modelset --num_epochs=200 --batch_size=32 --min_edges=10 --use_embeddings --ckpt=results/modelset/graph_cls_/10_att_0_nt_0/checkpoint-2540 --reload',
+         '--dataset=modelset --num_epochs=200 --batch_size=32 --min_edges=10 --use_embeddings --use_attributes --ckpt=results/modelset/graph_cls_/10_att_1_nt_0/checkpoint-2540 --reload',
+         '--dataset=modelset --num_epochs=200 --batch_size=32 --min_edges=10 --use_embeddings --use_edge_types --ckpt=results/modelset/graph_cls_/10_att_0_nt_1/checkpoint-2540 --reload',
+         '--dataset=modelset --num_epochs=200 --batch_size=32 --min_edges=10 --use_embeddings --use_attributes --use_edge_types --ckpt=results/modelset/graph_cls_/10_att_1_nt_1/checkpoint-2540 --reload',
+     ],
+     7: [
+         '--dataset=ecore_555 --num_epochs=200 --batch_size=32 --min_edges=10 --cls_label=abstract --reload',
+         '--dataset=ecore_555 --num_epochs=200 --batch_size=32 --min_edges=10 --cls_label=abstract --use_embeddings --ckpt=results/ecore_555/node_cls/abstract/abstract_10_att_0_nt_0/checkpoint-540 --reload',
+         '--dataset=ecore_555 --num_epochs=200 --batch_size=32 --min_edges=10 --cls_label=abstract --use_embeddings --use_attributes --ckpt=results/ecore_555/node_cls/abstract/abstract_10_att_1_nt_0/checkpoint-540 --reload',
+         '--dataset=ecore_555 --num_epochs=200 --batch_size=32 --min_edges=10 --cls_label=abstract --use_embeddings --use_edge_types --ckpt=results/ecore_555/node_cls/abstract/abstract_10_att_0_nt_1/checkpoint-540 --reload',
+         '--dataset=ecore_555 --num_epochs=200 --batch_size=32 --min_edges=10 --cls_label=abstract --use_embeddings --use_attributes --use_edge_types --ckpt=results/ecore_555/node_cls/abstract/abstract_10_att_1_nt_1/checkpoint-540 --reload',
+ 
+         '--dataset=modelset --num_epochs=200 --batch_size=32 --min_edges=10 --cls_label=abstract --reload',
+         '--dataset=modelset --num_epochs=200 --batch_size=32 --min_edges=10 --cls_label=abstract --use_embeddings --ckpt=results/modelset/node_cls/abstract/abstract_10_att_0_nt_0/checkpoint-6870 --reload',
+         '--dataset=modelset --num_epochs=200 --batch_size=32 --min_edges=10 --cls_label=abstract --use_embeddings --use_attributes --ckpt=results/modelset/node_cls/abstract/abstract_10_att_1_nt_0/checkpoint-6870 --reload',
+         '--dataset=modelset --num_epochs=200 --batch_size=32 --min_edges=10 --cls_label=abstract --use_embeddings --use_edge_types --ckpt=results/modelset/node_cls/abstract/abstract_10_att_0_nt_1/checkpoint-6870 --reload',
+         '--dataset=modelset --num_epochs=200 --batch_size=32 --min_edges=10 --cls_label=abstract --use_embeddings --use_attributes --use_edge_types --ckpt=results/modelset/node_cls/abstract/abstract_10_att_1_nt_1/checkpoint-6870 --reload',
+ 
+         '--dataset=mar-ecore-github --num_epochs=200 --batch_size=32 --min_edges=10 --cls_label=abstract --reload',
+         '--dataset=mar-ecore-github --num_epochs=200 --batch_size=32 --min_edges=10 --cls_label=abstract --use_embeddings --ckpt=results/mar-ecore-github/node_cls/abstract/abstract_10_att_0_nt_0/checkpoint-19400 --reload',
+         '--dataset=mar-ecore-github --num_epochs=200 --batch_size=32 --min_edges=10 --cls_label=abstract --use_embeddings --use_attributes --ckpt=results/mar-ecore-github/node_cls/abstract/abstract_10_att_1_nt_0/checkpoint-19400 --reload',
+         '--dataset=mar-ecore-github --num_epochs=200 --batch_size=32 --min_edges=10 --cls_label=abstract --use_embeddings --use_edge_types --ckpt=results/mar-ecore-github/node_cls/abstract/abstract_10_att_0_nt_1/checkpoint-19400 --reload',
+         '--dataset=mar-ecore-github --num_epochs=200 --batch_size=32 --min_edges=10 --cls_label=abstract --use_embeddings --use_attributes --use_edge_types --ckpt=results/mar-ecore-github/node_cls/abstract/abstract_10_att_1_nt_1/checkpoint-19400 --reload',
+ 
+         '--dataset=eamodelset --num_epochs=200 --batch_size=32 --min_edges=10 --cls_label=type --reload',
+         '--dataset=eamodelset --num_epochs=200 --batch_size=32 --min_edges=10 --cls_label=type --use_embeddings --ckpt=results/eamodelset/node_cls/layer/layer_10_att_0_nt_0/checkpoint-9570 --reload',
+         '--dataset=eamodelset --num_epochs=200 --batch_size=32 --min_edges=10 --cls_label=type --use_embeddings --use_edge_types --ckpt=results/eamodelset/node_cls/layer/layer_10_att_0_nt_1/checkpoint-9570 --reload',
+ 
+         '--dataset=eamodelset --num_epochs=200 --batch_size=32 --min_edges=10 --cls_label=type --reload',
+         '--dataset=eamodelset --num_epochs=200 --batch_size=32 --min_edges=10 --cls_label=type --use_embeddings --ckpt=results/eamodelset/node_cls/type/type_10_att_0_nt_0/checkpoint-9570 --reload',
+         '--dataset=eamodelset --num_epochs=200 --batch_size=32 --min_edges=10 --cls_label=type --use_embeddings --use_edge_types --ckpt=results/eamodelset/node_cls/type/type_10_att_0_nt_1/checkpoint-9570 --reload',
+     ]
+ }
+ 
+ allowed_tasks = [7]
+ 
+ for script_id in tqdm(allowed_tasks, desc='Running tasks'):
+     task = tasks[script_id]
+     for script in tqdm(all_tasks[script_id], desc=f'Running scripts for {task}'):
+         script += f' --task_id={script_id} '
+         print(f'Running {script}')
+ 
+         subprocess.run(f'python run.py {script}', shell=True)
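For clarity, one iteration of the loop above amounts to a single run.py invocation; a rough equivalent for the first task-7 configuration would be:

# Equivalent single invocation for the first GNN Node Classification config above.
import subprocess

subprocess.run(
    'python run.py --dataset=ecore_555 --num_epochs=200 --batch_size=32 '
    '--min_edges=10 --cls_label=abstract --reload --task_id=7',
    shell=True
)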
glam4cm/settings.py ADDED
@@ -0,0 +1,54 @@
+ import os
+ import torch
+ 
+ import logging
+ 
+ logger = logging.getLogger(__name__)
+ logger.setLevel(logging.DEBUG)
+ 
+ 
+ BERT_MODEL = 'bert-base-uncased'
+ WORD2VEC_MODEL = 'word2vec'
+ TFIDF_MODEL = 'tfidf'
+ FAST_TEXT_MODEL = 'uml-fasttext.bin'
+ 
+ W2V_CONFIG = dict(
+     epoch=100,
+     dim=128,
+     ws=5,
+     minCount=1,
+     thread=4,
+     model='skipgram'
+ )
+ 
+ device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+ torch.set_float32_matmul_precision('high')
+ 
+ 
+ seed = 42
+ datasets_dir = 'datasets'
+ ecore_json_path = os.path.join(datasets_dir, 'ecore_555/ecore_555.jsonl')
+ mar_json_path = os.path.join(datasets_dir, 'mar-ecore-github/ecore-github.jsonl')
+ modelsets_uml_json_path = os.path.join(datasets_dir, 'modelset/uml.jsonl')
+ modelsets_ecore_json_path = os.path.join(datasets_dir, 'modelset/ecore.jsonl')
+ 
+ 
+ graph_data_dir = 'datasets/graph_data'
+ 
+ # Path: settings.py
+ 
+ 
+ LP_TASK_EDGE_CLS = 'edge_cls'
+ LP_TASK_LINK_PRED = 'lp'
+ 
+ 
+ EPOCH = 'epoch'
+ LOSS = 'loss'
+ TRAIN_LOSS = 'train_loss'
+ TEST_LOSS = 'test_loss'
+ TEST_ACC = 'test_acc'
+ 
+ TRAINING_PHASE = 'train'
+ VALIDATION_PHASE = 'val'
+ TESTING_PHASE = 'test'
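A small, hypothetical sketch of how these settings are consumed elsewhere in the package; the call sites are illustrative and not taken from this diff:

# Illustrative only: pick up the model name, device and a dataset path from settings.
from glam4cm.settings import BERT_MODEL, device, ecore_json_path
from glam4cm.tokenization.utils import get_tokenizer

tokenizer = get_tokenizer(BERT_MODEL, use_special_tokens=True)
print(f"Dataset: {ecore_json_path}, device: {device}, vocab size: {len(tokenizer)}")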
glam4cm/tokenization/special_tokens.py ADDED
@@ -0,0 +1,4 @@
+ EDGE_START = '<edge_begin>'
+ EDGE_END = '<edge_end>'
+ NODE_BEGIN = '<node_begin>'
+ NODE_END = '<node_end>'
glam4cm/tokenization/utils.py ADDED
@@ -0,0 +1,37 @@
+ from re import finditer
+ from glam4cm.tokenization.special_tokens import (
+     EDGE_START, EDGE_END, NODE_BEGIN, NODE_END
+ )
+ from transformers import AutoTokenizer
+ 
+ 
+ def get_special_tokens():
+     return {
+         'additional_special_tokens': [EDGE_START, EDGE_END, NODE_BEGIN, NODE_END]
+     }
+ 
+ 
+ def get_tokenizer(model_name, use_special_tokens=False, max_length=512) -> AutoTokenizer:
+     print(f"Loading tokenizer for {model_name}")
+     tokenizer = AutoTokenizer.from_pretrained(model_name)
+     if tokenizer.pad_token is None:
+         tokenizer.pad_token = tokenizer.eos_token
+ 
+     if use_special_tokens:
+         tokenizer.add_special_tokens(get_special_tokens())
+ 
+     tokenizer.model_max_length = max_length
+     return tokenizer
+ 
+ 
+ def camel_case_split(identifier) -> list:
+     # Split a CamelCase identifier into its constituent words
+     matches = finditer('.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)', identifier)
+     return [m.group(0) for m in matches]
+ 
+ 
+ def doc_tokenizer(doc, lower=False) -> str:
+     words = doc.split()
+     # split on underscores
+     words = [w2 for w1 in words for w2 in w1.split('_') if w2 != '']
+     # split camel case
+     words = [w2.lower() if lower else w2 for w1 in words for w2 in camel_case_split(w1) if w2 != '']
+     return " ".join(words)
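To make the splitting behaviour concrete, a short example of what doc_tokenizer returns, with the expected output shown in comments:

# doc_tokenizer splits on whitespace, then underscores, then camel case.
from glam4cm.tokenization.utils import doc_tokenizer

print(doc_tokenizer('EClassifier_name'))                        # E Classifier name
print(doc_tokenizer('orderDate shippingAddress', lower=True))   # order date shipping address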
glam4cm/trainers/bert_classifier.py ADDED
@@ -0,0 +1,105 @@
+ from collections import Counter
+ import numpy as np
+ from sklearn.model_selection import StratifiedKFold, train_test_split
+ from transformers import (
+     Trainer,
+     TrainingArguments
+ )
+ from transformers import (
+     AutoModelForSequenceClassification,
+     AutoTokenizer
+ )
+ from glam4cm.data_loading.encoding import EncodingDataset
+ from glam4cm.settings import device
+ from sklearn.preprocessing import LabelEncoder
+ from glam4cm.trainers.metrics import compute_metrics
+ 
+ 
+ class BertTrainer:
+     def __init__(
+         self,
+         model_name,
+         ckpt=None,
+         max_length=512
+     ):
+         self.model_name = model_name
+         self.ckpt = ckpt
+         self.max_length = max_length
+ 
+ 
+     def train(
+         self,
+         texts,
+         labels,
+         test_ratio=0.2,
+         kfold=False,
+         num_train_epochs=15,
+         train_batch_size=2,
+         eval_batch_size=128,
+         weight_decay=0.01,
+         logging_steps=50,
+         eval_steps=50,
+         save_steps=50,
+         learning_rate=5e-5,
+         warmup_steps=500,
+         output_dir='./results',
+         logs_dir='./logs',
+         seed=42
+     ):
+         def train_fold():
+             print(f'Train: {len(X_train)}, Test: {len(X_test)}')
+             print("Class distribution in train: ", Counter(y_train))
+             print("Class distribution in test: ", Counter(y_test))
+ 
+             tokenizer = AutoTokenizer.from_pretrained(self.model_name if not self.ckpt else self.ckpt)
+             model = AutoModelForSequenceClassification.from_pretrained(self.model_name, num_labels=num_classes)
+             model.to(device)
+ 
+             train_ds = EncodingDataset(tokenizer, X_train, y_train, max_length=self.max_length)
+             test_ds = EncodingDataset(tokenizer, X_test, y_test, max_length=self.max_length)
+ 
+             training_args = TrainingArguments(
+                 output_dir=output_dir,
+                 num_train_epochs=num_train_epochs,
+                 eval_strategy="steps",
+                 per_device_train_batch_size=train_batch_size,
+                 per_device_eval_batch_size=eval_batch_size,
+                 warmup_steps=warmup_steps,
+                 weight_decay=weight_decay,
+                 learning_rate=learning_rate,
+                 logging_dir=logs_dir,
+                 logging_steps=logging_steps,
+                 eval_steps=eval_steps,
+                 save_steps=save_steps,
+                 save_total_limit=2,
+                 load_best_model_at_end=True,
+                 fp16=True
+             )
+ 
+             trainer = Trainer(
+                 model=model,
+                 args=training_args,
+                 train_dataset=train_ds,
+                 eval_dataset=test_ds,
+                 compute_metrics=compute_metrics
+             )
+ 
+             trainer.train()
+             results = trainer.evaluate()
+             print(results)
+ 
+ 
+         y = LabelEncoder().fit_transform(labels)
+         num_classes = len(set(y))
+         if kfold > 0:
+             k = int(1 / test_ratio)
+             kfold = StratifiedKFold(n_splits=k, shuffle=True, random_state=seed)
+             n = len(texts)
+             for fold, (train_idx, test_idx) in enumerate(kfold.split(np.zeros(n), y)):
+                 X_train, y_train = [texts[i] for i in train_idx], [y[i] for i in train_idx]
+                 X_test, y_test = [texts[i] for i in test_idx], [y[i] for i in test_idx]
+                 print("Fold number: ", fold + 1)
+                 train_fold()
+         else:
+             X_train, X_test, y_train, y_test = train_test_split(texts, y, test_size=test_ratio, random_state=seed)
+             train_fold()
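A minimal usage sketch for BertTrainer, assuming texts and labels have already been extracted from one of the model datasets; the tiny in-line lists are placeholders for that data:

# Hypothetical example: in the package, texts/labels come from the data_loading modules.
from glam4cm.trainers.bert_classifier import BertTrainer
from glam4cm.settings import BERT_MODEL

texts = ['class Order date total', 'class Customer name address']   # placeholder documents
labels = ['abstract', 'concrete']                                    # placeholder labels

trainer = BertTrainer(BERT_MODEL, max_length=512)
trainer.train(texts, labels, test_ratio=0.2, num_train_epochs=1, train_batch_size=2)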
glam4cm/trainers/cm_gpt_trainer.py ADDED
@@ -0,0 +1,153 @@
+ import os
+ import time
+ from typing import Union
+ from torch.utils.data import Dataset
+ from tensorboardX import SummaryWriter
+ from torch.utils.data import DataLoader
+ from tqdm.auto import tqdm
+ import torch.nn as nn
+ 
+ from glam4cm.models.cmgpt import CMGPT, CMGPTClassifier
+ import torch
+ from glam4cm.settings import device
+ 
+ 
+ from glam4cm.trainers.metrics import compute_classification_metrics
+ 
+ 
+ class CMGPTTrainer:
+     def __init__(
+         self,
+         model: Union[CMGPT, CMGPTClassifier],
+         train_dataset: Dataset,
+         test_dataset: Dataset,
+         batch_size: int = 32,
+ 
+         lr: float = 1e-5,
+         num_epochs: int = 10,
+         log_dir: str = 'logs',
+         results_dir: str = 'results/cmgpt',
+         compute_metrics: callable = None
+     ):
+         self.model = model
+         self.model.to(device)
+ 
+         # self.model = torch.compile(self.model)
+ 
+         self.optimizer = torch.optim.AdamW(self.model.parameters(), lr=lr)
+         self.scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(self.optimizer, T_max=num_epochs)
+         self.results_dir = results_dir
+         os.makedirs(results_dir, exist_ok=True)
+         self.writer = SummaryWriter(log_dir=log_dir)
+ 
+         self.dataloaders = {
+             'train': DataLoader(train_dataset, batch_size=batch_size, shuffle=True),
+             'test': DataLoader(test_dataset, batch_size=batch_size, shuffle=False),
+         }
+ 
+         self.num_epochs = num_epochs
+         if not compute_metrics and isinstance(self.model, CMGPTClassifier):
+             self.compute_metrics = compute_classification_metrics
+         else:
+             self.compute_metrics = compute_metrics
+ 
+         print(f"Number of parameters: {sum(p.numel() for p in self.model.parameters() if p.requires_grad) / 1000000:.3f}M")
+ 
+ 
+     def step(self, batch, idx=None):
+         # B, T = batch['input_ids'].shape
+         # t0 = time.time()
+         self.optimizer.zero_grad()
+         logits, loss = self.model(
+             batch['input_ids'].to(device),
+             batch['attention_mask'].to(device),
+             batch['labels'].to(device)
+         )
+ 
+         loss.backward()
+         self.optimizer.step()
+         # torch.cuda.synchronize()
+         # t1 = time.time()
+         # dt = (t1 - t0) * 1000
+         # tokens_per_sec = B * T / (t1 - t0)
+         # if idx is not None:
+         #     print(f"Batch: {idx}, Loss: {loss.item()}, Time: {dt} ms, Tokens/s: {tokens_per_sec}")
+         # else:
+         #     print(f"Loss: {loss.item()}, Time: {dt} ms, Tokens/s: {tokens_per_sec}")
+         # if idx > 100:
+         #     print("Breaking")
+         #     exit()
+         return logits, loss
+ 
+ 
+     def train(self):
+         for epoch in tqdm(range(self.num_epochs), desc='Training Epoch'):
+             self.model.train()
+             train_loss = 0
+             all_preds, all_labels = list(), list()
+             for i, batch in tqdm(enumerate(self.dataloaders['train']), desc='Training Batches', total=len(self.dataloaders['train'])):
+                 logits, loss = self.step(batch, i)
+                 train_loss += loss.item()
+ 
+                 self.writer.add_scalar('loss/train', loss.item(), epoch * len(self.dataloaders['train']) + i)
+ 
+                 if self.compute_metrics is not None:
+                     all_preds.append(logits.detach().cpu())
+                     all_labels.append(batch['labels'].cpu())
+                 # break
+             print("Train loss: ", train_loss / len(self.dataloaders['train']))
+ 
+             # if self.compute_metrics is not None:
+             #     all_preds = torch.cat(all_preds, dim=0)
+             #     all_labels = torch.cat(all_labels, dim=0)
+             #     metrics = self.compute_metrics(all_preds, all_labels)
+             #     for key, value in metrics.items():
+             #         self.writer.add_scalar(key, value, epoch)
+             #     # print("Train Metrics: ", metrics)
+ 
+             self.test(epoch)
+             self.scheduler.step()
+ 
+ 
+     def test(self, epoch=None):
+         self.model.eval()
+         test_loss = 0
+         all_preds, all_labels = list(), list()
+         for i, batch in tqdm(enumerate(self.dataloaders['test']), desc='Testing Batches', total=len(self.dataloaders['test'])):
+             logits, loss = self.model(
+                 batch['input_ids'].to(device),
+                 batch['attention_mask'].to(device),
+                 batch['labels'].to(device)
+             )
+             test_loss += loss.item()
+ 
+             if self.compute_metrics is not None:
+                 all_preds.append(logits.detach().cpu())
+                 all_labels.append(batch['labels'].cpu())
+ 
+             if epoch is not None:
+                 self.writer.add_scalar('loss/test', test_loss / len(self.dataloaders['test']), epoch)
+             # break
+ 
+         print("Test loss: ", test_loss / len(self.dataloaders['test']))
+ 
+         if self.compute_metrics is not None:
+             all_preds = torch.cat(all_preds, dim=0)
+             all_labels = torch.cat(all_labels, dim=0)
+             metrics = self.compute_metrics(all_preds, all_labels)
+             for key, value in metrics.items():
+                 self.writer.add_scalar(key, value, epoch)
+ 
+             print("Test Metrics: ", metrics)
+ 
+ 
+     def save_model(self):
+         if isinstance(self.model, CMGPT):
+             path = f'{self.results_dir}/cmgpt.pth'
+         elif isinstance(self.model, CMGPTClassifier):
+             path = f'{self.results_dir}/cmgpt-classifier.pth'
+         torch.save(self.model.state_dict(), path)
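Finally, a self-contained sketch of how CMGPTTrainer is wired up. The real models are CMGPT / CMGPTClassifier from glam4cm.models.cmgpt and the datasets come from glam4cm.data_loading.encoding; the toy module and dataset below are stand-ins that only mimic the interface the trainer relies on: forward(input_ids, attention_mask, labels) returning (logits, loss), and batches carrying 'input_ids', 'attention_mask' and 'labels'.

# Stand-in model and dataset (assumptions for illustration, not part of glam4cm).
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from glam4cm.trainers.cm_gpt_trainer import CMGPTTrainer

class ToyModel(nn.Module):
    def __init__(self, vocab_size=128, num_labels=2):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, 32)
        self.head = nn.Linear(32, num_labels)
        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask, labels):
        # Mean-pool token embeddings, classify, and return (logits, loss) as the trainer expects.
        logits = self.head(self.embed(input_ids).mean(dim=1))
        return logits, self.loss_fn(logits, labels)

class ToyDataset(Dataset):
    def __len__(self):
        return 64

    def __getitem__(self, idx):
        return {
            'input_ids': torch.randint(0, 128, (16,)),
            'attention_mask': torch.ones(16, dtype=torch.long),
            'labels': torch.tensor(idx % 2),
        }

trainer = CMGPTTrainer(ToyModel(), ToyDataset(), ToyDataset(), batch_size=8, num_epochs=1)
trainer.train()   # runs test() at the end of each epoch and logs losses to TensorBoard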