glam4cm-0.1.0-py3-none-any.whl
This diff shows the contents of a publicly released package version as published to a supported registry, and is provided for informational purposes only. Every file below is newly added (+N -0), consistent with 0.1.0 being the package's initial release.
- glam4cm/__init__.py +9 -0
- glam4cm/data_loading/__init__.py +0 -0
- glam4cm/data_loading/data.py +631 -0
- glam4cm/data_loading/encoding.py +76 -0
- glam4cm/data_loading/graph_dataset.py +940 -0
- glam4cm/data_loading/metadata.py +84 -0
- glam4cm/data_loading/models_dataset.py +361 -0
- glam4cm/data_loading/utils.py +20 -0
- glam4cm/downstream_tasks/__init__.py +0 -0
- glam4cm/downstream_tasks/bert_edge_classification.py +144 -0
- glam4cm/downstream_tasks/bert_graph_classification.py +137 -0
- glam4cm/downstream_tasks/bert_graph_classification_comp.py +156 -0
- glam4cm/downstream_tasks/bert_link_prediction.py +145 -0
- glam4cm/downstream_tasks/bert_node_classification.py +164 -0
- glam4cm/downstream_tasks/cm_gpt_edge_classification.py +73 -0
- glam4cm/downstream_tasks/cm_gpt_node_classification.py +76 -0
- glam4cm/downstream_tasks/cm_gpt_pretraining.py +64 -0
- glam4cm/downstream_tasks/common_args.py +160 -0
- glam4cm/downstream_tasks/create_dataset.py +51 -0
- glam4cm/downstream_tasks/gnn_edge_classification.py +106 -0
- glam4cm/downstream_tasks/gnn_graph_cls.py +101 -0
- glam4cm/downstream_tasks/gnn_link_prediction.py +109 -0
- glam4cm/downstream_tasks/gnn_node_classification.py +103 -0
- glam4cm/downstream_tasks/tf_idf_text_classification.py +22 -0
- glam4cm/downstream_tasks/utils.py +35 -0
- glam4cm/downstream_tasks/word2vec_text_classification.py +108 -0
- glam4cm/embeddings/__init__.py +0 -0
- glam4cm/embeddings/bert.py +72 -0
- glam4cm/embeddings/common.py +43 -0
- glam4cm/embeddings/fasttext.py +0 -0
- glam4cm/embeddings/tfidf.py +25 -0
- glam4cm/embeddings/w2v.py +41 -0
- glam4cm/encoding/__init__.py +0 -0
- glam4cm/encoding/common.py +0 -0
- glam4cm/encoding/encoders.py +100 -0
- glam4cm/graph2str/__init__.py +0 -0
- glam4cm/graph2str/common.py +34 -0
- glam4cm/graph2str/constants.py +15 -0
- glam4cm/graph2str/ontouml.py +141 -0
- glam4cm/graph2str/uml.py +0 -0
- glam4cm/lang2graph/__init__.py +0 -0
- glam4cm/lang2graph/archimate.py +31 -0
- glam4cm/lang2graph/bpmn.py +0 -0
- glam4cm/lang2graph/common.py +416 -0
- glam4cm/lang2graph/ecore.py +221 -0
- glam4cm/lang2graph/ontouml.py +169 -0
- glam4cm/lang2graph/utils.py +80 -0
- glam4cm/models/cmgpt.py +352 -0
- glam4cm/models/gnn_layers.py +273 -0
- glam4cm/models/hf.py +10 -0
- glam4cm/run.py +99 -0
- glam4cm/run_configs.py +126 -0
- glam4cm/settings.py +54 -0
- glam4cm/tokenization/__init__.py +0 -0
- glam4cm/tokenization/special_tokens.py +4 -0
- glam4cm/tokenization/utils.py +37 -0
- glam4cm/trainers/__init__.py +0 -0
- glam4cm/trainers/bert_classifier.py +105 -0
- glam4cm/trainers/cm_gpt_trainer.py +153 -0
- glam4cm/trainers/gnn_edge_classifier.py +126 -0
- glam4cm/trainers/gnn_graph_classifier.py +123 -0
- glam4cm/trainers/gnn_link_predictor.py +144 -0
- glam4cm/trainers/gnn_node_classifier.py +135 -0
- glam4cm/trainers/gnn_trainer.py +129 -0
- glam4cm/trainers/metrics.py +55 -0
- glam4cm/utils.py +194 -0
- glam4cm-0.1.0.dist-info/LICENSE +21 -0
- glam4cm-0.1.0.dist-info/METADATA +86 -0
- glam4cm-0.1.0.dist-info/RECORD +72 -0
- glam4cm-0.1.0.dist-info/WHEEL +5 -0
- glam4cm-0.1.0.dist-info/entry_points.txt +2 -0
- glam4cm-0.1.0.dist-info/top_level.txt +1 -0
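The wheel is pure Python (`py3-none-any`) and `top_level.txt` is a single line, so everything above lives under the one `glam4cm` package and file paths map one-to-one onto import paths. As a hypothetical smoke test (assuming the wheel is installed), the two dataset classes in the only file diffed below would be imported as:

from glam4cm.data_loading.encoding import EncodingDataset, GPTTextDataset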
glam4cm/data_loading/encoding.py
@@ -0,0 +1,76 @@
+from torch.utils.data import Dataset
+import torch
+
+# Create your dataset
+class EncodingDataset(Dataset):
+    def __init__(
+        self,
+        tokenizer,
+        texts,
+        labels=None,
+        max_length=512,
+        remove_duplicates=False
+    ):
+
+        if remove_duplicates:
+            # print(f'Dataset with {len(texts)} samples before removing duplicates')
+            texts_to_id = {text: i for i, text in enumerate(texts)}
+            texts = list(texts_to_id.keys())
+            labels = [labels[i] for i in texts_to_id.values()] if labels else None
+
+        # print(f'Created dataset with {len(texts)} samples')
+
+        self.inputs = tokenizer(
+            texts,
+            return_tensors='pt',
+            truncation=True,
+            padding='max_length',
+            max_length=max_length
+        )
+
+        if labels is not None:
+            self.inputs['labels'] = torch.tensor(labels, dtype=torch.long) if labels is not None else None
+
+        print("Encoding Dataset created with {} samples".format(len(self.inputs['input_ids'])))
+        # print("\n".join([f"Label: {l}, Text: {i}" for i, l in zip(texts, labels)]))
+        # import code; code.interact(local=locals())
+
+
+    def __len__(self):
+        return len(self.inputs['input_ids'])
+
+
+    def __getitem__(self, index):
+        return {k: v[index] for k, v in self.inputs.items()}
+
+
+class GPTTextDataset(Dataset):
+    def __init__(self, texts, tokenizer, max_length=512):
+        self.texts = texts
+        self.tokenizer = tokenizer
+        self.max_length = max_length
+
+        # Tokenize all the texts upon initialization
+        self.encodings = self.tokenizer(
+            texts,
+            truncation=True,
+            padding=True,  # Pads to the longest sequence in the batch
+            max_length=max_length,
+            return_tensors="pt"  # Return PyTorch tensors
+        )
+
+    def __len__(self):
+        return len(self.texts)
+
+    def __getitem__(self, idx):
+        input_ids = self.encodings["input_ids"][idx]
+        attention_mask = self.encodings["attention_mask"][idx]
+
+        # Labels for language modeling are the same as input_ids
+        labels = input_ids.clone()
+
+        return {
+            "input_ids": input_ids,
+            "attention_mask": attention_mask,
+            "labels": labels
+        }
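To make the two classes concrete, here is a minimal usage sketch. It is not part of the package: the `bert-base-uncased` and `gpt2` checkpoints are stand-ins for whatever tokenizer the caller supplies, and the sample texts and labels are illustrative.

import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from glam4cm.data_loading.encoding import EncodingDataset, GPTTextDataset

texts = ["a class diagram", "a state machine", "a class diagram"]
labels = [0, 1, 0]

# Classification-style dataset: every sample is padded to max_length,
# and the duplicate third text is dropped (labels realigned to match).
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
dataset = EncodingDataset(tokenizer, texts, labels, max_length=32, remove_duplicates=True)
batch = next(iter(DataLoader(dataset, batch_size=2)))
print(batch["input_ids"].shape)  # torch.Size([2, 32])
print(batch["labels"])           # tensor([0, 1])

# Causal-LM dataset: GPT-2 ships without a pad token, so one must be set
# before the whole corpus is tokenized in a single call in __init__.
gpt_tokenizer = AutoTokenizer.from_pretrained("gpt2")
gpt_tokenizer.pad_token = gpt_tokenizer.eos_token
lm_dataset = GPTTextDataset(texts, gpt_tokenizer, max_length=32)
sample = lm_dataset[0]
assert torch.equal(sample["labels"], sample["input_ids"])  # labels mirror input_ids

Two behaviors are worth noting when reading the diff. First, in `EncodingDataset` the trailing `if labels is not None else None` on the `self.inputs['labels']` assignment is redundant, since the assignment is already guarded by `if labels is not None:`. Second, because `GPTTextDataset` tokenizes all texts at once with `padding=True`, "longest sequence in the batch" effectively means longest in the whole corpus, and its labels are a verbatim copy of `input_ids`, so padding positions are not masked to -100 as the usual HuggingFace causal-LM loss convention expects.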