glam4cm 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. glam4cm/__init__.py +9 -0
  2. glam4cm/data_loading/__init__.py +0 -0
  3. glam4cm/data_loading/data.py +631 -0
  4. glam4cm/data_loading/encoding.py +76 -0
  5. glam4cm/data_loading/graph_dataset.py +940 -0
  6. glam4cm/data_loading/metadata.py +84 -0
  7. glam4cm/data_loading/models_dataset.py +361 -0
  8. glam4cm/data_loading/utils.py +20 -0
  9. glam4cm/downstream_tasks/__init__.py +0 -0
  10. glam4cm/downstream_tasks/bert_edge_classification.py +144 -0
  11. glam4cm/downstream_tasks/bert_graph_classification.py +137 -0
  12. glam4cm/downstream_tasks/bert_graph_classification_comp.py +156 -0
  13. glam4cm/downstream_tasks/bert_link_prediction.py +145 -0
  14. glam4cm/downstream_tasks/bert_node_classification.py +164 -0
  15. glam4cm/downstream_tasks/cm_gpt_edge_classification.py +73 -0
  16. glam4cm/downstream_tasks/cm_gpt_node_classification.py +76 -0
  17. glam4cm/downstream_tasks/cm_gpt_pretraining.py +64 -0
  18. glam4cm/downstream_tasks/common_args.py +160 -0
  19. glam4cm/downstream_tasks/create_dataset.py +51 -0
  20. glam4cm/downstream_tasks/gnn_edge_classification.py +106 -0
  21. glam4cm/downstream_tasks/gnn_graph_cls.py +101 -0
  22. glam4cm/downstream_tasks/gnn_link_prediction.py +109 -0
  23. glam4cm/downstream_tasks/gnn_node_classification.py +103 -0
  24. glam4cm/downstream_tasks/tf_idf_text_classification.py +22 -0
  25. glam4cm/downstream_tasks/utils.py +35 -0
  26. glam4cm/downstream_tasks/word2vec_text_classification.py +108 -0
  27. glam4cm/embeddings/__init__.py +0 -0
  28. glam4cm/embeddings/bert.py +72 -0
  29. glam4cm/embeddings/common.py +43 -0
  30. glam4cm/embeddings/fasttext.py +0 -0
  31. glam4cm/embeddings/tfidf.py +25 -0
  32. glam4cm/embeddings/w2v.py +41 -0
  33. glam4cm/encoding/__init__.py +0 -0
  34. glam4cm/encoding/common.py +0 -0
  35. glam4cm/encoding/encoders.py +100 -0
  36. glam4cm/graph2str/__init__.py +0 -0
  37. glam4cm/graph2str/common.py +34 -0
  38. glam4cm/graph2str/constants.py +15 -0
  39. glam4cm/graph2str/ontouml.py +141 -0
  40. glam4cm/graph2str/uml.py +0 -0
  41. glam4cm/lang2graph/__init__.py +0 -0
  42. glam4cm/lang2graph/archimate.py +31 -0
  43. glam4cm/lang2graph/bpmn.py +0 -0
  44. glam4cm/lang2graph/common.py +416 -0
  45. glam4cm/lang2graph/ecore.py +221 -0
  46. glam4cm/lang2graph/ontouml.py +169 -0
  47. glam4cm/lang2graph/utils.py +80 -0
  48. glam4cm/models/cmgpt.py +352 -0
  49. glam4cm/models/gnn_layers.py +273 -0
  50. glam4cm/models/hf.py +10 -0
  51. glam4cm/run.py +99 -0
  52. glam4cm/run_configs.py +126 -0
  53. glam4cm/settings.py +54 -0
  54. glam4cm/tokenization/__init__.py +0 -0
  55. glam4cm/tokenization/special_tokens.py +4 -0
  56. glam4cm/tokenization/utils.py +37 -0
  57. glam4cm/trainers/__init__.py +0 -0
  58. glam4cm/trainers/bert_classifier.py +105 -0
  59. glam4cm/trainers/cm_gpt_trainer.py +153 -0
  60. glam4cm/trainers/gnn_edge_classifier.py +126 -0
  61. glam4cm/trainers/gnn_graph_classifier.py +123 -0
  62. glam4cm/trainers/gnn_link_predictor.py +144 -0
  63. glam4cm/trainers/gnn_node_classifier.py +135 -0
  64. glam4cm/trainers/gnn_trainer.py +129 -0
  65. glam4cm/trainers/metrics.py +55 -0
  66. glam4cm/utils.py +194 -0
  67. glam4cm-0.1.0.dist-info/LICENSE +21 -0
  68. glam4cm-0.1.0.dist-info/METADATA +86 -0
  69. glam4cm-0.1.0.dist-info/RECORD +72 -0
  70. glam4cm-0.1.0.dist-info/WHEEL +5 -0
  71. glam4cm-0.1.0.dist-info/entry_points.txt +2 -0
  72. glam4cm-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,103 @@
+ import os
+ from glam4cm.data_loading.graph_dataset import GraphNodeDataset
+ from glam4cm.models.gnn_layers import GNNConv, NodeClassifier
+ from glam4cm.downstream_tasks.utils import get_models_dataset
+ from glam4cm.tokenization.special_tokens import *
+ from glam4cm.trainers.gnn_node_classifier import GNNNodeClassificationTrainer as Trainer
+ from glam4cm.utils import merge_argument_parsers, set_seed
+ from glam4cm.downstream_tasks.common_args import get_common_args_parser, get_config_params, get_gnn_args_parser
+
+
+ def get_parser():
+     common_parser = get_common_args_parser()
+     gnn_parser = get_gnn_args_parser()
+     parser = merge_argument_parsers(common_parser, gnn_parser)
+     return parser
+
+
+ def run(args):
+
+     set_seed(args.seed)
+
+     config_params = dict(
+         min_enr=args.min_enr,
+         min_edges=args.min_edges,
+         remove_duplicates=args.remove_duplicates,
+         reload=args.reload,
+         language=args.language
+     )
+     dataset_name = args.dataset
+
+     dataset = get_models_dataset(dataset_name, **config_params)
+     graph_data_params = get_config_params(args)
+
+     print("Loading graph dataset")
+     graph_dataset = GraphNodeDataset(dataset, **graph_data_params)
+     print("Loaded graph dataset")
+
+     graph_torch_data = graph_dataset.get_torch_dataset()
+
+     num_nodes_label = f"num_nodes_{args.node_cls_label}"
+     assert hasattr(graph_dataset, num_nodes_label), f"Graph dataset does not have attribute {num_nodes_label}"
+     num_classes = getattr(graph_dataset, f"num_nodes_{args.node_cls_label}")
+
+     input_dim = graph_torch_data[0].x.shape[1]
+
+     model_name = args.gnn_conv_model
+
+     hidden_dim = args.hidden_dim
+     output_dim = args.output_dim
+     num_conv_layers = args.num_conv_layers
+     num_mlp_layers = args.num_mlp_layers
+     num_heads = args.num_heads
+     residual = True
+     l_norm = args.l_norm
+     dropout = args.dropout
+     aggregation = args.aggregation
+
+     edge_dim = graph_dataset[0].data.edge_attr.shape[1] if args.num_heads else None
+     gnn_conv_model = GNNConv(
+         model_name=model_name,
+         input_dim=input_dim,
+         hidden_dim=hidden_dim,
+         out_dim=output_dim,
+         num_layers=num_conv_layers,
+         num_heads=num_heads,
+         residual=residual,
+         l_norm=l_norm,
+         dropout=dropout,
+         aggregation=aggregation,
+         edge_dim=edge_dim,
+     )
+
+     clf_input_dim = output_dim * num_heads if args.num_heads else output_dim
+     mlp_predictor = NodeClassifier(
+         input_dim=clf_input_dim,
+         hidden_dim=hidden_dim,
+         num_layers=num_mlp_layers,
+         num_classes=num_classes,
+         bias=True,
+     )
+
+     logs_dir = os.path.join(
+         "logs",
+         dataset_name,
+         "gnn_node_cls",
+         f"{graph_dataset.config_hash}",
+     )
+
+     trainer = Trainer(
+         gnn_conv_model,
+         mlp_predictor,
+         graph_torch_data,
+         cls_label=args.node_cls_label,
+         exclude_labels=getattr(graph_dataset, f"node_exclude_{args.node_cls_label}"),
+         lr=args.lr,
+         num_epochs=args.num_epochs,
+         use_edge_attrs=args.use_edge_attrs,
+         logs_dir=logs_dir
+     )
+
+     print("Training GNN Node Classification model")
+     trainer.run()
@@ -0,0 +1,22 @@
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ from sklearn.svm import SVC
+ from sklearn.pipeline import make_pipeline
+ from sklearn.metrics import classification_report
+
+
+ def run(data):
+     X_train, X_test = data['train_nodes'], data['test_nodes']
+     y_train, y_test = data['train_node_classes'], data['test_node_classes']
+
+     pipeline = make_pipeline(TfidfVectorizer(), SVC(kernel='linear'), verbose=True)
+
+     print("Fitting SVM classifier")
+     # Train the model
+     pipeline.fit(X_train, y_train)
+
+     print("Predicting")
+     # Predict on the test set
+     y_pred = pipeline.predict(X_test)
+
+     # Print classification report
+     print(classification_report(y_test, y_pred))
@@ -0,0 +1,35 @@
+ from glam4cm.data_loading.models_dataset import (
+     ArchiMateDataset,
+     EcoreDataset
+ )
+
+
+ dataset_to_metamodel = {
+     'modelset': 'ecore',
+     'ecore_555': 'ecore',
+     'mar-ecore-github': 'ecore',
+     'eamodelset': 'ea'
+ }
+
+
+ def get_metamodel_dataset_type(dataset):
+     return dataset_to_metamodel[dataset]
+
+
+ def get_model_dataset_class(dataset_name):
+     dataset_type = get_metamodel_dataset_type(dataset_name)
+     if dataset_type == 'ea':
+         dataset_class = ArchiMateDataset
+     elif dataset_type == 'ecore':
+         dataset_class = EcoreDataset
+     else:
+         raise ValueError(f"Unknown dataset type: {dataset_type}")
+     return dataset_class
+
+
+ def get_models_dataset(dataset_name, **config_params):
+     dataset_type = get_metamodel_dataset_type(dataset_name)
+     if dataset_type != 'ea' and 'language' in config_params:
+         del config_params['language']
+     dataset_class = get_model_dataset_class(dataset_name)
+     return dataset_class(dataset_name, **config_params)
@@ -0,0 +1,108 @@
+ import os
+ import fasttext
+ from sklearn.svm import SVC
+ from sklearn.metrics import classification_report
+ from sklearn.preprocessing import LabelEncoder
+ import numpy as np
+ import torch
+ from torch.utils.data import DataLoader, TensorDataset
+
+ import torch.nn as nn
+ import torch.optim as optim
+
+ # Step 4: Train a neural network classifier using the embeddings
+ class SimpleNN(nn.Module):
+     def __init__(self, input_dim, output_dim):
+         super(SimpleNN, self).__init__()
+         self.fc1 = nn.Linear(input_dim, 128)
+         self.fc2 = nn.Linear(128, output_dim)
+
+     def forward(self, x):
+         x = torch.relu(self.fc1(x))
+         x = self.fc2(x)
+         return x
+
+ # Step 2: Use the embeddings to transform the dataset
+ def get_embeddings(texts, model):
+     embeddings = []
+     for text in texts:
+         words = text.split()
+         word_vectors = [model.get_word_vector(word) for word in words]
+         if word_vectors:
+             embeddings.append(np.mean(word_vectors, axis=0))
+         else:
+             embeddings.append(np.zeros(model.get_dimension()))
+     return np.array(embeddings)
+
+
+ def run(data):
+     sentences = [text.split() for text in data['train_nodes'] + data['test_nodes']]
+     with open('data.txt', 'w') as f:
+         f.write("\n".join(" ".join(words) for words in sentences))
+
+     word2vec_model = fasttext.train_unsupervised(
+         'data.txt',
+         dim=128,
+         ws=5,
+         minCount=1,
+         epoch=100,
+         lr=0.05,
+         thread=4
+     )
+     os.remove('data.txt')
+
+     X_train_embeddings = get_embeddings(data['train_nodes'], word2vec_model)
+     X_test_embeddings = get_embeddings(data['test_nodes'], word2vec_model)
+
+     # Encode labels
+     label_encoder = LabelEncoder()
+     y_train = label_encoder.fit_transform(data['train_node_classes'])
+     y_test = label_encoder.transform(data['test_node_classes'])
+
+     # Step 3: Train an SVM classifier using the embeddings
+     svm_classifier = SVC(kernel='linear')
+     svm_classifier.fit(X_train_embeddings, y_train)
+     y_pred_svm = svm_classifier.predict(X_test_embeddings)
+     print("SVM Classification Report:")
+     print(classification_report(y_test, y_pred_svm))
+
+
+     input_dim = X_train_embeddings.shape[1]
+     output_dim = len(np.unique(y_train))
+
+     model = SimpleNN(input_dim, output_dim)
+     criterion = nn.CrossEntropyLoss()
+     optimizer = optim.Adam(model.parameters(), lr=0.001)
+
+     # Convert data to PyTorch tensors
+     X_train_tensor = torch.tensor(X_train_embeddings, dtype=torch.float32)
+     y_train_tensor = torch.tensor(y_train, dtype=torch.long)
+     X_test_tensor = torch.tensor(X_test_embeddings, dtype=torch.float32)
+
+     train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
+     train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
+
+     # Train the neural network
+     num_epochs = 10
+     for epoch in range(num_epochs):
+         model.train()
+         for X_batch, y_batch in train_loader:
+             optimizer.zero_grad()
+             outputs = model(X_batch)
+             loss = criterion(outputs, y_batch)
+             loss.backward()
+             optimizer.step()
+         print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}")
+
+     # Evaluate the neural network
+     model.eval()
+     with torch.no_grad():
+         outputs = model(X_test_tensor)
+         _, y_pred_nn = torch.max(outputs, 1)
+         y_pred_nn = y_pred_nn.numpy()
+     print("Neural Network Classification Report:")
+     report = classification_report(y_test, y_pred_nn)
+
+     print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}")
+     print(f"Classification Report: {report}")
+
File without changes
@@ -0,0 +1,72 @@
+ from transformers import AutoModel, AutoTokenizer
+ import torch
+ from typing import List, Union
+ from glam4cm.embeddings.common import Embedder
+ from glam4cm.data_loading.encoding import EncodingDataset
+ from torch.utils.data import DataLoader
+ from glam4cm.settings import device
+ import numpy as np
+
+ class BertEmbedder(Embedder):
+     def __init__(self, model_name, ckpt=None):
+         super().__init__(name='BERT')
+         self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+         self.model = AutoModel.from_pretrained(ckpt if ckpt else model_name)
+         self.model.to(device)
+         self.finetuned = bool(ckpt)
+
+     @property
+     def embedding_dim(self) -> int:
+         """
+         Returns the embedding dimension of the model
+         by running the model on a dummy input and extracting the output shape
+         """
+         with torch.no_grad():
+             dummy_input = self.tokenizer('Hello World!', return_tensors='pt')
+             outputs = self.model(**{k: v.to(device) for k, v in dummy_input.items()})
+             embedding_dim = outputs.last_hidden_state.shape[-1]
+             return embedding_dim
+
+     def embed(self, text: Union[str, List[str]], aggregate='cls'):
+         torch.cuda.empty_cache()
+         if isinstance(text, str):
+             text = [text]
+
+         print("Number of Texts: ", len(text))
+
+         dataset = EncodingDataset(self.tokenizer, texts=text, remove_duplicates=False)
+         loader = DataLoader(dataset, batch_size=256)
+
+         embeddings = list()
+         with torch.no_grad():
+             for batch in loader:
+                 # Move inputs to device and process
+                 input_ids = batch['input_ids'].to(device)
+                 attention_mask = batch['attention_mask'].to(device)
+
+                 outputs = self.model(input_ids, attention_mask)
+
+                 # Collect embeddings
+                 embeddings.append(outputs.last_hidden_state.cpu().numpy())
+
+                 # Free memory of the batch inputs
+                 del input_ids, attention_mask, outputs
+                 torch.cuda.empty_cache()  # Clear GPU memory
+
+         # Combine all the embeddings
+         embeddings = np.concatenate(embeddings, axis=0)
+
+         if aggregate == 'mean':
+             embeddings = np.mean(embeddings, axis=-1)
+         elif aggregate == 'max':
+             embeddings = np.max(embeddings, axis=-1)
+         elif aggregate == 'cls':
+             embeddings = embeddings[:, 0, :]
+         else:
+             raise ValueError(f'Unknown aggregation method: {aggregate}')
+
+         # print("Embeddings Shape: ", embeddings.shape)
+         assert embeddings.shape[0] == len(text)
+         return embeddings
@@ -0,0 +1,43 @@
+ from abc import abstractmethod
+ import json
+ import os
+ from typing import List, Union
+ import torch
+ from glam4cm.settings import (
+     WORD2VEC_MODEL,
+     TFIDF_MODEL
+ )
+
+
+ class Embedder:
+     def __init__(self, name: str):
+         self.name = name
+         self.finetuned = False
+
+     @abstractmethod
+     def embed(self, text: Union[str, List[str]], aggregate='mean') -> torch.Tensor:
+         pass
+
+     @property
+     def embedding_dim(self) -> int:
+         pass
+
+
+ def get_embedding_model(
+     model_name: str,
+     ckpt: str = None
+ ) -> Embedder:
+     if ckpt:
+         model_name = json.load(open(os.path.join(ckpt, 'config.json')))['_name_or_path']
+
+     if 'bert' in model_name:
+         from glam4cm.embeddings.bert import BertEmbedder
+         return BertEmbedder(model_name, ckpt)
+     elif WORD2VEC_MODEL in model_name:
+         from glam4cm.embeddings.w2v import Word2VecEmbedder
+         return Word2VecEmbedder()
+     elif TFIDF_MODEL in model_name:
+         from glam4cm.embeddings.tfidf import TfidfEmbedder
+         return TfidfEmbedder()
+     else:
+         raise ValueError(f'Unknown model name: {model_name}')
File without changes
@@ -0,0 +1,25 @@
+ import numpy as np
+ from typing import List, Union
+ from glam4cm.embeddings.common import Embedder
+ from sklearn.feature_extraction.text import TfidfVectorizer
+
+
+ class TfidfEmbedder(Embedder):
+     def __init__(self):
+         super().__init__(name='TFIDF')
+
+     def train(self, texts: List[str]):
+         print("TFIDFEmbedder: Training TF-IDF model")
+         self.model = TfidfVectorizer()
+         self.model.fit(texts)
+         print("TFIDFEmbedder: Model trained")
+
+     @property
+     def embedding_dim(self) -> int:
+         return len(self.model.get_feature_names_out())
+
+     def embed(self, text: Union[str, List[str]]):
+         if isinstance(text, str):
+             text = [text]
+         return self.model.transform(text)
@@ -0,0 +1,41 @@
+ import os
+ from typing import List, Union
+ from glam4cm.embeddings.common import Embedder
+ from glam4cm.settings import W2V_CONFIG
+ import numpy as np
+ import fasttext
+
+
+ class Word2VecEmbedder(Embedder):
+     def __init__(self):
+         super().__init__(name='Word2Vec')
+
+     @property
+     def embedding_dim(self) -> int:
+         return self.model.get_dimension()
+
+     def train(self, texts: List[str]):
+         print("Word2VecEmbedder: Training Word2Vec model")
+         texts = [text.split() for text in texts]
+         with open('data.txt', 'w') as f:
+             f.write("\n".join(" ".join(words) for words in texts))
+         self.model = fasttext.train_unsupervised('data.txt', **W2V_CONFIG)
+         os.remove('data.txt')
+         print("Total words in the model:", len(self.model.words))
+         print("Word2VecEmbedder: Word2Vec model trained")
+
+     def embed(self, text: Union[str, List[str]]):
+
+         def get_text_embedding(text: str):
+             words = text.split()
+             word_vectors = [self.model.get_word_vector(word) for word in words]
+             if word_vectors:
+                 return np.mean(word_vectors, axis=0)
+             else:
+                 return np.zeros(self.embedding_dim)
+
+         if isinstance(text, str):
+             text = [text]
+         word_vectors = [get_text_embedding(t) for t in text]
+         return np.array(word_vectors)
+
File without changes
File without changes
@@ -0,0 +1,100 @@
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ from transformers import AutoTokenizer
+ from sklearn.preprocessing import LabelEncoder
+ import fasttext
+ from scipy.sparse import csr_matrix
+ import numpy as np
+ from encoding.common import (
+     doc_tokenizer,
+     SEP
+ )
+
+
+ class TFIDFEncoder:
+     def __init__(self, X=None):
+         self.encoder = TfidfVectorizer(
+             lowercase=False, tokenizer=doc_tokenizer, min_df=3
+         )
+
+         if X:
+             self.encode(X)
+
+     def encode(self, X):
+         # print('Fitting TFIDF')
+         X_t = self.encoder.fit_transform(X)
+         X_sp = csr_matrix(np.vstack([x.toarray() for x in X_t]))
+         # print('TFIDF Encoded')
+         return X_sp
+
+
+ class BertTokenizerEncoder:
+     def __init__(self, name, X=None):
+         self.tokenizer = AutoTokenizer.from_pretrained(name)
+
+         if X:
+             self.encode(X)
+
+     def encode(self, X, batch_encode=False, percentile=100):
+         # print('Tokenizing Bert')
+         tokens = self.tokenizer(X)
+
+         if batch_encode:
+             lengths = [len(i) for i in tokens['input_ids']]
+             size = int(np.percentile(lengths, percentile)) if percentile < 100 else max(lengths)
+             if size > 512:
+                 print(f'WARNING: Max size is {size}. Truncating to 512')
+                 size = min(size, 512)
+
+             tokenized_data = self.tokenizer(
+                 X,
+                 padding=True,
+                 truncation=True,
+                 max_length=size
+             )
+         else:
+             tokenized_data = self.tokenizer(X)
+         # print('Bert Tokenized')
+
+         return tokenized_data
+
+
+ class BertTFIDF:
+     def __init__(self, name, X=None):
+         self.bert = BertTokenizerEncoder(name)
+         self.tfidf = TFIDFEncoder()
+
+         if X:
+             self.encode(X)
+
+     def encode(self, X):
+         X_b = [f"{SEP}".join([str(j) for j in i]) for i in self.bert.encode(X)['input_ids']]
+         X_t = self.tfidf.encode(X_b)
+         return X_t
+
+
+ class FasttextEncoder:
+     def __init__(self, model_name, X=None):
+         self.model = fasttext.load_model(model_name)
+         if X:
+             self.encode(X)
+
+     def encode(self, X):
+         def get_sentence_embedding(sentence):
+             return self.model.get_sentence_vector(sentence)
+
+         # print('Encoding Fasttext')
+         X_t = [" ".join(doc_tokenizer(i)) for i in X]
+         X_t = np.array([get_sentence_embedding(i) for i in X_t])
+         # print('Fasttext Encoded')
+         return X_t
+
+
+ class ClassLabelEncoder(LabelEncoder):
+     def __init__(self, y=None) -> None:
+         super().__init__()
+         if y:
+             self.fit(y)
+
+     def encode(self, y):
+         return self.fit_transform(y)
File without changes
@@ -0,0 +1,34 @@
+ from collections import deque
+ import re
+
+
+ remove_extra_spaces = lambda txt: re.sub(r'\s+', ' ', txt.strip())
+
+ def find_nodes_within_distance(graph, start_node, distance):
+     q, visited = deque(), dict()
+     q.append((start_node, 0))
+
+     while q:
+         n, d = q.popleft()
+         if d <= distance:
+             visited[n] = d
+             neighbours = [neighbor for neighbor in graph.neighbors(n) if neighbor != n and neighbor not in visited]
+             for neighbour in neighbours:
+                 if neighbour not in visited:
+                     q.append((neighbour, d + 1))
+
+     sorted_list = sorted(visited.items(), key=lambda x: x[1])
+     return sorted_list
+
+
+ def get_node_neighbours(graph, start_node, distance):
+     neighbours = find_nodes_within_distance(graph, start_node, distance)
+     max_distance = max(distance for _, distance in neighbours)
+     distance = min(distance, max_distance)
+     return [node for node, d in neighbours if d == distance]
+
+
+ def has_neighbours_incl_incoming(graph, node):
+     edges = list(graph.edges(node))
+     edges += list(graph.in_edges(node))
+     return len(edges) != 0
@@ -0,0 +1,15 @@
+ SEP = "[SEP]"
+ STEREOTYPE = "[STEREOTYPE]"
+
+ e_s = {'rel': 'relates', 'gen': 'generalizes'}
+
+ frequent_stereotypes = [
+     'kind',
+     'subkind',
+     'phase',
+     'role',
+     'category',
+     'mixin',
+     'rolemixin',
+     'phasemixin'
+ ]