glam4cm 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. glam4cm/__init__.py +9 -0
  2. glam4cm/data_loading/__init__.py +0 -0
  3. glam4cm/data_loading/data.py +631 -0
  4. glam4cm/data_loading/encoding.py +76 -0
  5. glam4cm/data_loading/graph_dataset.py +940 -0
  6. glam4cm/data_loading/metadata.py +84 -0
  7. glam4cm/data_loading/models_dataset.py +361 -0
  8. glam4cm/data_loading/utils.py +20 -0
  9. glam4cm/downstream_tasks/__init__.py +0 -0
  10. glam4cm/downstream_tasks/bert_edge_classification.py +144 -0
  11. glam4cm/downstream_tasks/bert_graph_classification.py +137 -0
  12. glam4cm/downstream_tasks/bert_graph_classification_comp.py +156 -0
  13. glam4cm/downstream_tasks/bert_link_prediction.py +145 -0
  14. glam4cm/downstream_tasks/bert_node_classification.py +164 -0
  15. glam4cm/downstream_tasks/cm_gpt_edge_classification.py +73 -0
  16. glam4cm/downstream_tasks/cm_gpt_node_classification.py +76 -0
  17. glam4cm/downstream_tasks/cm_gpt_pretraining.py +64 -0
  18. glam4cm/downstream_tasks/common_args.py +160 -0
  19. glam4cm/downstream_tasks/create_dataset.py +51 -0
  20. glam4cm/downstream_tasks/gnn_edge_classification.py +106 -0
  21. glam4cm/downstream_tasks/gnn_graph_cls.py +101 -0
  22. glam4cm/downstream_tasks/gnn_link_prediction.py +109 -0
  23. glam4cm/downstream_tasks/gnn_node_classification.py +103 -0
  24. glam4cm/downstream_tasks/tf_idf_text_classification.py +22 -0
  25. glam4cm/downstream_tasks/utils.py +35 -0
  26. glam4cm/downstream_tasks/word2vec_text_classification.py +108 -0
  27. glam4cm/embeddings/__init__.py +0 -0
  28. glam4cm/embeddings/bert.py +72 -0
  29. glam4cm/embeddings/common.py +43 -0
  30. glam4cm/embeddings/fasttext.py +0 -0
  31. glam4cm/embeddings/tfidf.py +25 -0
  32. glam4cm/embeddings/w2v.py +41 -0
  33. glam4cm/encoding/__init__.py +0 -0
  34. glam4cm/encoding/common.py +0 -0
  35. glam4cm/encoding/encoders.py +100 -0
  36. glam4cm/graph2str/__init__.py +0 -0
  37. glam4cm/graph2str/common.py +34 -0
  38. glam4cm/graph2str/constants.py +15 -0
  39. glam4cm/graph2str/ontouml.py +141 -0
  40. glam4cm/graph2str/uml.py +0 -0
  41. glam4cm/lang2graph/__init__.py +0 -0
  42. glam4cm/lang2graph/archimate.py +31 -0
  43. glam4cm/lang2graph/bpmn.py +0 -0
  44. glam4cm/lang2graph/common.py +416 -0
  45. glam4cm/lang2graph/ecore.py +221 -0
  46. glam4cm/lang2graph/ontouml.py +169 -0
  47. glam4cm/lang2graph/utils.py +80 -0
  48. glam4cm/models/cmgpt.py +352 -0
  49. glam4cm/models/gnn_layers.py +273 -0
  50. glam4cm/models/hf.py +10 -0
  51. glam4cm/run.py +99 -0
  52. glam4cm/run_configs.py +126 -0
  53. glam4cm/settings.py +54 -0
  54. glam4cm/tokenization/__init__.py +0 -0
  55. glam4cm/tokenization/special_tokens.py +4 -0
  56. glam4cm/tokenization/utils.py +37 -0
  57. glam4cm/trainers/__init__.py +0 -0
  58. glam4cm/trainers/bert_classifier.py +105 -0
  59. glam4cm/trainers/cm_gpt_trainer.py +153 -0
  60. glam4cm/trainers/gnn_edge_classifier.py +126 -0
  61. glam4cm/trainers/gnn_graph_classifier.py +123 -0
  62. glam4cm/trainers/gnn_link_predictor.py +144 -0
  63. glam4cm/trainers/gnn_node_classifier.py +135 -0
  64. glam4cm/trainers/gnn_trainer.py +129 -0
  65. glam4cm/trainers/metrics.py +55 -0
  66. glam4cm/utils.py +194 -0
  67. glam4cm-0.1.0.dist-info/LICENSE +21 -0
  68. glam4cm-0.1.0.dist-info/METADATA +86 -0
  69. glam4cm-0.1.0.dist-info/RECORD +72 -0
  70. glam4cm-0.1.0.dist-info/WHEEL +5 -0
  71. glam4cm-0.1.0.dist-info/entry_points.txt +2 -0
  72. glam4cm-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,76 @@
1
+ from torch.utils.data import Dataset
2
+ import torch
3
+
4
+ # Datasets that tokenize all texts up front for encoder-style and GPT-style models
5
class EncodingDataset(Dataset):
    """Dataset that tokenizes all texts once and serves per-sample tensors.

    The tokenizer is called a single time in ``__init__`` on the full list of
    texts; ``__getitem__`` then only indexes into the stored result.

    Args:
        tokenizer: Callable invoked as ``tokenizer(texts, return_tensors='pt',
            truncation=True, padding='max_length', max_length=max_length)``.
            Its result must support ``['input_ids']``, item assignment and
            ``.items()`` (presumably a Hugging Face tokenizer -- confirm
            against the callers).
        texts: Iterable of texts to encode.
        labels: Optional labels aligned index-by-index with ``texts``. Any
            indexable container works (list, NumPy array, ...). When given,
            they are stored as a ``torch.long`` tensor under ``'labels'``.
        max_length: Value forwarded to the tokenizer as ``max_length``.
        remove_duplicates: When true, repeated texts are collapsed to a single
            sample before tokenization.
    """

    def __init__(
        self,
        tokenizer,
        texts,
        labels=None,
        max_length=512,
        remove_duplicates=False
    ):
        if remove_duplicates:
            # Dict keys keep first-seen order, while each value ends up being
            # the index of the *last* occurrence of that text. A repeated text
            # therefore keeps the label of its final appearance.
            texts_to_id = {text: i for i, text in enumerate(texts)}
            texts = list(texts_to_id)
            # Test for None/emptiness explicitly: plain truthiness raises for
            # multi-element NumPy arrays and tensors. Empty labels are still
            # treated as "no labels", as before.
            if labels is not None and len(labels) > 0:
                labels = [labels[i] for i in texts_to_id.values()]
            else:
                labels = None

        self.inputs = tokenizer(
            texts,
            return_tensors='pt',
            truncation=True,
            padding='max_length',
            max_length=max_length
        )

        if labels is not None:
            self.inputs['labels'] = torch.tensor(labels, dtype=torch.long)

        print(f"Encoding Dataset created with {len(self.inputs['input_ids'])} samples")

    def __len__(self):
        """Return the number of encoded samples."""
        return len(self.inputs['input_ids'])

    def __getitem__(self, index):
        """Return a dict with the ``index``-th entry of every stored field."""
        return {k: v[index] for k, v in self.inputs.items()}
45
+
46
+
47
class GPTTextDataset(Dataset):
    """Language-modeling dataset whose labels are a copy of the input ids.

    All texts are tokenized in one call during construction. Because that
    call uses ``padding=True`` on the full list, every sample is padded to the
    longest text in the whole dataset (capped by ``max_length`` through
    truncation), not to a per-batch length.

    NOTE(review): labels are an unmodified copy of ``input_ids``, so padded
    positions are not masked out here -- confirm the trainer or loss handles
    that.

    Args:
        texts: Texts to encode; also kept on the instance as ``self.texts``.
        tokenizer: Callable invoked as ``tokenizer(texts, truncation=True,
            padding=True, max_length=max_length, return_tensors="pt")``; its
            result must expose ``"input_ids"`` and ``"attention_mask"``.
        max_length: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, tokenizer, max_length=512):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

        # Encode everything once so __getitem__ is a plain tensor lookup.
        tokenizer_options = {
            "truncation": True,
            "padding": True,
            "max_length": max_length,
            "return_tensors": "pt",
        }
        self.encodings = self.tokenizer(texts, **tokenizer_options)

    def __len__(self):
        """Return the number of texts handed to the constructor."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for ``idx``."""
        sample = {
            field: self.encodings[field][idx]
            for field in ("input_ids", "attention_mask")
        }
        # The targets are an independent copy of the input ids.
        sample["labels"] = sample["input_ids"].clone()
        return sample