glam4cm-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- glam4cm/__init__.py +9 -0
- glam4cm/data_loading/__init__.py +0 -0
- glam4cm/data_loading/data.py +631 -0
- glam4cm/data_loading/encoding.py +76 -0
- glam4cm/data_loading/graph_dataset.py +940 -0
- glam4cm/data_loading/metadata.py +84 -0
- glam4cm/data_loading/models_dataset.py +361 -0
- glam4cm/data_loading/utils.py +20 -0
- glam4cm/downstream_tasks/__init__.py +0 -0
- glam4cm/downstream_tasks/bert_edge_classification.py +144 -0
- glam4cm/downstream_tasks/bert_graph_classification.py +137 -0
- glam4cm/downstream_tasks/bert_graph_classification_comp.py +156 -0
- glam4cm/downstream_tasks/bert_link_prediction.py +145 -0
- glam4cm/downstream_tasks/bert_node_classification.py +164 -0
- glam4cm/downstream_tasks/cm_gpt_edge_classification.py +73 -0
- glam4cm/downstream_tasks/cm_gpt_node_classification.py +76 -0
- glam4cm/downstream_tasks/cm_gpt_pretraining.py +64 -0
- glam4cm/downstream_tasks/common_args.py +160 -0
- glam4cm/downstream_tasks/create_dataset.py +51 -0
- glam4cm/downstream_tasks/gnn_edge_classification.py +106 -0
- glam4cm/downstream_tasks/gnn_graph_cls.py +101 -0
- glam4cm/downstream_tasks/gnn_link_prediction.py +109 -0
- glam4cm/downstream_tasks/gnn_node_classification.py +103 -0
- glam4cm/downstream_tasks/tf_idf_text_classification.py +22 -0
- glam4cm/downstream_tasks/utils.py +35 -0
- glam4cm/downstream_tasks/word2vec_text_classification.py +108 -0
- glam4cm/embeddings/__init__.py +0 -0
- glam4cm/embeddings/bert.py +72 -0
- glam4cm/embeddings/common.py +43 -0
- glam4cm/embeddings/fasttext.py +0 -0
- glam4cm/embeddings/tfidf.py +25 -0
- glam4cm/embeddings/w2v.py +41 -0
- glam4cm/encoding/__init__.py +0 -0
- glam4cm/encoding/common.py +0 -0
- glam4cm/encoding/encoders.py +100 -0
- glam4cm/graph2str/__init__.py +0 -0
- glam4cm/graph2str/common.py +34 -0
- glam4cm/graph2str/constants.py +15 -0
- glam4cm/graph2str/ontouml.py +141 -0
- glam4cm/graph2str/uml.py +0 -0
- glam4cm/lang2graph/__init__.py +0 -0
- glam4cm/lang2graph/archimate.py +31 -0
- glam4cm/lang2graph/bpmn.py +0 -0
- glam4cm/lang2graph/common.py +416 -0
- glam4cm/lang2graph/ecore.py +221 -0
- glam4cm/lang2graph/ontouml.py +169 -0
- glam4cm/lang2graph/utils.py +80 -0
- glam4cm/models/cmgpt.py +352 -0
- glam4cm/models/gnn_layers.py +273 -0
- glam4cm/models/hf.py +10 -0
- glam4cm/run.py +99 -0
- glam4cm/run_configs.py +126 -0
- glam4cm/settings.py +54 -0
- glam4cm/tokenization/__init__.py +0 -0
- glam4cm/tokenization/special_tokens.py +4 -0
- glam4cm/tokenization/utils.py +37 -0
- glam4cm/trainers/__init__.py +0 -0
- glam4cm/trainers/bert_classifier.py +105 -0
- glam4cm/trainers/cm_gpt_trainer.py +153 -0
- glam4cm/trainers/gnn_edge_classifier.py +126 -0
- glam4cm/trainers/gnn_graph_classifier.py +123 -0
- glam4cm/trainers/gnn_link_predictor.py +144 -0
- glam4cm/trainers/gnn_node_classifier.py +135 -0
- glam4cm/trainers/gnn_trainer.py +129 -0
- glam4cm/trainers/metrics.py +55 -0
- glam4cm/utils.py +194 -0
- glam4cm-0.1.0.dist-info/LICENSE +21 -0
- glam4cm-0.1.0.dist-info/METADATA +86 -0
- glam4cm-0.1.0.dist-info/RECORD +72 -0
- glam4cm-0.1.0.dist-info/WHEEL +5 -0
- glam4cm-0.1.0.dist-info/entry_points.txt +2 -0
- glam4cm-0.1.0.dist-info/top_level.txt +1 -0
glam4cm/downstream_tasks/gnn_node_classification.py
@@ -0,0 +1,103 @@
import os
from glam4cm.data_loading.graph_dataset import GraphNodeDataset
from glam4cm.models.gnn_layers import GNNConv, NodeClassifier
from glam4cm.downstream_tasks.utils import get_models_dataset
from glam4cm.tokenization.special_tokens import *
from glam4cm.trainers.gnn_node_classifier import GNNNodeClassificationTrainer as Trainer
from glam4cm.utils import merge_argument_parsers, set_seed
from glam4cm.downstream_tasks.common_args import get_common_args_parser, get_config_params, get_gnn_args_parser


def get_parser():
    common_parser = get_common_args_parser()
    gnn_parser = get_gnn_args_parser()
    parser = merge_argument_parsers(common_parser, gnn_parser)
    return parser


def run(args):

    set_seed(args.seed)

    config_params = dict(
        min_enr = args.min_enr,
        min_edges = args.min_edges,
        remove_duplicates = args.remove_duplicates,
        reload = args.reload,
        language = args.language
    )
    dataset_name = args.dataset

    dataset = get_models_dataset(dataset_name, **config_params)
    graph_data_params = get_config_params(args)

    print("Loading graph dataset")
    graph_dataset = GraphNodeDataset(dataset, **graph_data_params)
    print("Loaded graph dataset")

    graph_torch_data = graph_dataset.get_torch_dataset()

    num_nodes_label = f"num_nodes_{args.node_cls_label}"
    assert hasattr(graph_dataset, num_nodes_label), f"Graph dataset does not have attribute {num_nodes_label}"
    num_classes = getattr(graph_dataset, f"num_nodes_{args.node_cls_label}")

    input_dim = graph_torch_data[0].x.shape[1]

    model_name = args.gnn_conv_model

    hidden_dim = args.hidden_dim
    output_dim = args.output_dim
    num_conv_layers = args.num_conv_layers
    num_mlp_layers = args.num_mlp_layers
    num_heads = args.num_heads
    residual = True
    l_norm = args.l_norm
    dropout = args.dropout
    aggregation = args.aggregation

    edge_dim = graph_dataset[0].data.edge_attr.shape[1] if args.num_heads else None
    gnn_conv_model = GNNConv(
        model_name=model_name,
        input_dim=input_dim,
        hidden_dim=hidden_dim,
        out_dim=output_dim,
        num_layers=num_conv_layers,
        num_heads=num_heads,
        residual=residual,
        l_norm=l_norm,
        dropout=dropout,
        aggregation=aggregation,
        edge_dim=edge_dim,
    )

    clf_input_dim = output_dim*num_heads if args.num_heads else output_dim
    mlp_predictor = NodeClassifier(
        input_dim=clf_input_dim,
        hidden_dim=hidden_dim,
        num_layers=num_mlp_layers,
        num_classes=num_classes,
        bias=True,
    )

    logs_dir = os.path.join(
        "logs",
        dataset_name,
        "gnn_node_cls",
        f"{graph_dataset.config_hash}",
    )

    trainer = Trainer(
        gnn_conv_model,
        mlp_predictor,
        graph_torch_data,
        cls_label=args.node_cls_label,
        exclude_labels=getattr(graph_dataset, f"node_exclude_{args.node_cls_label}"),
        lr=args.lr,
        num_epochs=args.num_epochs,
        use_edge_attrs=args.use_edge_attrs,
        logs_dir=logs_dir
    )

    print("Training GNN Node Classification model")
    trainer.run()
glam4cm/downstream_tasks/tf_idf_text_classification.py
@@ -0,0 +1,22 @@
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report


def run(data):
    X_train, X_test = data['train_nodes'], data['test_nodes']
    y_train, y_test = data['train_node_classes'], data['test_node_classes']

    pipeline = make_pipeline(TfidfVectorizer(), SVC(kernel='linear'), verbose=True)

    print("Fitting SVM classifier")
    # Train the model
    pipeline.fit(X_train, y_train)

    print("Predicting")
    # Predict on the test set
    y_pred = pipeline.predict(X_test)

    # Print classification report
    print(classification_report(y_test, y_pred))
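Both text-classification baselines (the TF-IDF script above and the word2vec script further below) expect a plain dict of node texts and labels. A minimal sketch of that contract, using made-up toy values rather than anything produced by the package's loaders:

# Hypothetical toy input; in practice the dict comes from glam4cm's dataset loaders.
data = {
    'train_nodes': ["class Order has attribute id", "reference Order to Customer"],
    'test_nodes': ["class Invoice has attribute total"],
    'train_node_classes': ["class", "reference"],
    'test_node_classes': ["class"],
}

from glam4cm.downstream_tasks import tf_idf_text_classification
tf_idf_text_classification.run(data)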
glam4cm/downstream_tasks/utils.py
@@ -0,0 +1,35 @@
from glam4cm.data_loading.models_dataset import (
    ArchiMateDataset,
    EcoreDataset
)


dataset_to_metamodel = {
    'modelset': 'ecore',
    'ecore_555': 'ecore',
    'mar-ecore-github': 'ecore',
    'eamodelset': 'ea'
}


def get_metamodel_dataset_type(dataset):
    return dataset_to_metamodel[dataset]


def get_model_dataset_class(dataset_name):
    dataset_type = get_metamodel_dataset_type(dataset_name)
    if dataset_type == 'ea':
        dataset_class = ArchiMateDataset
    elif dataset_type == 'ecore':
        dataset_class = EcoreDataset
    else:
        raise ValueError(f"Unknown dataset type: {dataset_type}")
    return dataset_class


def get_models_dataset(dataset_name, **config_params):
    dataset_type = get_metamodel_dataset_type(dataset_name)
    if dataset_type != 'ea' and 'language' in config_params:
        del config_params['language']
    dataset_class = get_model_dataset_class(dataset_name)
    return dataset_class(dataset_name, **config_params)
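For orientation, a minimal call sketch for get_models_dataset; the keyword values are illustrative, not package defaults, and 'language' is silently dropped for non-ArchiMate datasets as the code above shows:

from glam4cm.downstream_tasks.utils import get_models_dataset

dataset = get_models_dataset(
    'ecore_555',               # maps to EcoreDataset via dataset_to_metamodel
    min_enr=1.0,               # illustrative value
    min_edges=10,              # illustrative value
    remove_duplicates=True,
    reload=False,
    language='en',             # ignored here, because 'ecore_555' is not an 'ea' dataset
)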
glam4cm/downstream_tasks/word2vec_text_classification.py
@@ -0,0 +1,108 @@
import os
import fasttext
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset

import torch.nn as nn
import torch.optim as optim

# Step 4: Train a neural network classifier using the embeddings
class SimpleNN(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, output_dim)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# Step 2: Use the embeddings to transform the dataset
def get_embeddings(texts, model):
    embeddings = []
    for text in texts:
        words = text.split()
        word_vectors = [model.get_word_vector(word) for word in words]
        if word_vectors:
            embeddings.append(np.mean(word_vectors, axis=0))
        else:
            embeddings.append(np.zeros(model.get_dimension()))
    return np.array(embeddings)


def run(data):
    # Step 1: Train fastText embeddings on all node texts, one document per line
    sentences = data['train_nodes'] + data['test_nodes']
    with open('data.txt', 'w') as f:
        f.write("\n".join(sentences))

    word2vec_model = fasttext.train_unsupervised(
        'data.txt',
        dim=128,
        ws=5,
        minCount=1,
        epoch=100,
        lr=0.05,
        thread=4
    )
    os.remove('data.txt')

    X_train_embeddings = get_embeddings(data['train_nodes'], word2vec_model)
    X_test_embeddings = get_embeddings(data['test_nodes'], word2vec_model)

    # Encode labels
    label_encoder = LabelEncoder()
    y_train = label_encoder.fit_transform(data['train_node_classes'])
    y_test = label_encoder.transform(data['test_node_classes'])

    # Step 3: Train an SVM classifier using the embeddings
    svm_classifier = SVC(kernel='linear')
    svm_classifier.fit(X_train_embeddings, y_train)
    y_pred_svm = svm_classifier.predict(X_test_embeddings)
    print("SVM Classification Report:")
    print(classification_report(y_test, y_pred_svm))


    input_dim = X_train_embeddings.shape[1]
    output_dim = len(np.unique(y_train))

    model = SimpleNN(input_dim, output_dim)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Convert data to PyTorch tensors
    X_train_tensor = torch.tensor(X_train_embeddings, dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train, dtype=torch.long)
    X_test_tensor = torch.tensor(X_test_embeddings, dtype=torch.float32)

    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

    # Train the neural network
    num_epochs = 10
    for epoch in range(num_epochs):
        model.train()
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}")

    # Evaluate the neural network
    model.eval()
    with torch.no_grad():
        outputs = model(X_test_tensor)
        _, y_pred_nn = torch.max(outputs, 1)
        y_pred_nn = y_pred_nn.numpy()
    print("Neural Network Classification Report:")
    report = classification_report(y_test, y_pred_nn)

    print(f"Classification Report: {report}")
glam4cm/embeddings/__init__.py: File without changes
glam4cm/embeddings/bert.py
@@ -0,0 +1,72 @@
from transformers import AutoModel, AutoTokenizer
import torch
from typing import List, Union
from glam4cm.embeddings.common import Embedder
from glam4cm.data_loading.encoding import EncodingDataset
from torch.utils.data import DataLoader
from glam4cm.settings import device
import numpy as np

class BertEmbedder(Embedder):
    def __init__(self, model_name, ckpt=None):
        super().__init__(name='BERT')
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(ckpt if ckpt else model_name)
        self.model.to(device)
        self.finetuned = bool(ckpt)

    @property
    def embedding_dim(self) -> int:
        """
        Returns the embedding dimension of the model
        by running the model on a dummy input and reading the output shape
        """
        with torch.no_grad():
            dummy_input = self.tokenizer('Hello World!', return_tensors='pt')
            outputs = self.model(**{k: v.to(device) for k, v in dummy_input.items()})
            embedding_dim = outputs.last_hidden_state.shape[-1]
            return embedding_dim

    def embed(self, text: Union[str, List[str]], aggregate='cls'):
        torch.cuda.empty_cache()
        if isinstance(text, str):
            text = [text]

        print("Number of Texts: ", len(text))

        dataset = EncodingDataset(self.tokenizer, texts=text, remove_duplicates=False)
        loader = DataLoader(dataset, batch_size=256)

        embeddings = list()
        with torch.no_grad():
            for batch in loader:
                # Move inputs to device and process
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)

                outputs = self.model(input_ids, attention_mask)

                # Collect embeddings
                embeddings.append(outputs.last_hidden_state.cpu().numpy())

                # Free memory of the batch inputs
                del input_ids, attention_mask, outputs
                torch.cuda.empty_cache()  # Clear GPU memory

        # Combine all the embeddings
        embeddings = np.concatenate(embeddings, axis=0)

        if aggregate == 'mean':
            embeddings = np.mean(embeddings, axis=1)   # mean-pool over the token dimension
        elif aggregate == 'max':
            embeddings = np.max(embeddings, axis=1)    # max-pool over the token dimension
        elif aggregate == 'cls':
            embeddings = embeddings[:, 0, :]
        else:
            raise ValueError(f'Unknown aggregation method: {aggregate}')

        # print("Embeddings Shape: ", embeddings.shape)
        assert embeddings.shape[0] == len(text)
        return embeddings
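A short usage sketch for BertEmbedder; 'bert-base-uncased' is an assumed checkpoint, not something the package pins:

from glam4cm.embeddings.bert import BertEmbedder

embedder = BertEmbedder('bert-base-uncased')
vectors = embedder.embed(["A UML class named Order", "An ArchiMate element"], aggregate='cls')
print(vectors.shape)   # (2, embedder.embedding_dim), one [CLS] vector per input text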
glam4cm/embeddings/common.py
@@ -0,0 +1,43 @@
from abc import abstractmethod
import json
import os
from typing import List, Union
import torch
from glam4cm.settings import (
    WORD2VEC_MODEL,
    TFIDF_MODEL
)


class Embedder:
    def __init__(self, name: str):
        self.name = name
        self.finetuned = False

    @abstractmethod
    def embed(self, text: Union[str, List[str]], aggregate='mean') -> torch.Tensor:
        pass

    @property
    def embedding_dim(self) -> int:
        pass


def get_embedding_model(
    model_name: str,
    ckpt: str = None
) -> Embedder:
    if ckpt:
        model_name = json.load(open(os.path.join(ckpt, 'config.json')))['_name_or_path']

    if 'bert' in model_name:
        from glam4cm.embeddings.bert import BertEmbedder
        return BertEmbedder(model_name, ckpt)
    elif WORD2VEC_MODEL in model_name:
        from glam4cm.embeddings.w2v import Word2VecEmbedder
        return Word2VecEmbedder()
    elif TFIDF_MODEL in model_name:
        from glam4cm.embeddings.tfidf import TfidfEmbedder
        return TfidfEmbedder()
    else:
        raise ValueError(f'Unknown model name: {model_name}')
glam4cm/embeddings/fasttext.py: File without changes
glam4cm/embeddings/tfidf.py
@@ -0,0 +1,25 @@
import numpy as np
from typing import List, Union
from glam4cm.embeddings.common import Embedder
from sklearn.feature_extraction.text import TfidfVectorizer


class TfidfEmbedder(Embedder):
    def __init__(self):
        super().__init__(name='TFIDF')

    def train(self, texts: List[str]):
        print("TFIDFEmbedder: Training TF-IDF model")
        self.model = TfidfVectorizer()
        self.model.fit(texts)
        print("TFIDFEmbedder: Model trained")

    @property
    def embedding_dim(self) -> int:
        return len(self.model.get_feature_names_out())

    def embed(self, text: Union[str, List[str]]):
        if isinstance(text, str):
            text = [text]
        return self.model.transform(text)
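Unlike BertEmbedder, TfidfEmbedder (and Word2VecEmbedder below) only gets its self.model once train() has been called, so training must precede embedding. A sketch with a toy corpus:

from glam4cm.embeddings.tfidf import TfidfEmbedder

embedder = TfidfEmbedder()
embedder.train(["class Order", "class Customer"])   # must be called before embed()
X = embedder.embed("class Invoice")                 # sparse matrix of shape (1, embedder.embedding_dim)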
glam4cm/embeddings/w2v.py
@@ -0,0 +1,41 @@
import os
from typing import List, Union
from glam4cm.embeddings.common import Embedder
from glam4cm.settings import W2V_CONFIG
import numpy as np
import fasttext


class Word2VecEmbedder(Embedder):
    def __init__(self):
        super().__init__(name='Word2Vec')

    @property
    def embedding_dim(self) -> int:
        return self.model.get_dimension()

    def train(self, texts: List[str]):
        print("Word2VecEmbedder: Training Word2Vec model")
        texts = [text.split() for text in texts]
        with open('data.txt', 'w') as f:
            f.write("\n".join(" ".join(words) for words in texts))
        self.model = fasttext.train_unsupervised('data.txt', **W2V_CONFIG)
        os.remove('data.txt')
        print("Total words in the model:", len(self.model.get_words()))
        print("Word2VecEmbedder: Word2Vec model trained")

    def embed(self, text: Union[str, List[str]]):

        def get_text_embedding(text: str):
            words = text.split()
            # fastText builds vectors from subwords, so it can embed any token
            word_vectors = [self.model.get_word_vector(word) for word in words]
            if word_vectors:
                return np.mean(word_vectors, axis=0)
            else:
                return np.zeros(self.embedding_dim)

        if isinstance(text, str):
            text = [text]
        word_vectors = [get_text_embedding(t) for t in text]
        return np.array(word_vectors)
glam4cm/encoding/__init__.py: File without changes
glam4cm/encoding/common.py: File without changes
glam4cm/encoding/encoders.py
@@ -0,0 +1,100 @@
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import AutoTokenizer
from sklearn.preprocessing import LabelEncoder
import fasttext
from scipy.sparse import csr_matrix
import numpy as np
from encoding.common import (
    doc_tokenizer,
    SEP
)


class TFIDFEncoder:
    def __init__(self, X=None):
        self.encoder = TfidfVectorizer(
            lowercase=False, tokenizer=doc_tokenizer, min_df=3
        )

        if X:
            self.encode(X)

    def encode(self, X):
        # print('Fitting TFIDF')
        X_t = self.encoder.fit_transform(X)
        X_sp = csr_matrix(np.vstack([x.toarray() for x in X_t]))
        # print('TFIDF Encoded')
        return X_sp


class BertTokenizerEncoder:
    def __init__(self, name, X=None):
        self.tokenizer = AutoTokenizer.from_pretrained(name)

        if X:
            self.encode(X)

    def encode(self, X, batch_encode=False, percentile=100):
        # print('Tokenizing Bert')
        tokens = self.tokenizer(X)

        if batch_encode:
            lengths = [len(i) for i in tokens['input_ids']]
            size = int(np.percentile(lengths, percentile)) if percentile < 100 else max(lengths)
            if size > 512:
                print(f'WARNING: Max size is {size}. Truncating to 512')
                size = min(size, 512)

            tokenized_data = self.tokenizer(
                X,
                padding=True,
                truncation=True,
                max_length=size
            )
        else:
            tokenized_data = self.tokenizer(X)
        # print('Bert Tokenized')

        return tokenized_data


class BertTFIDF:
    def __init__(self, name, X=None):
        self.bert = BertTokenizerEncoder(name)
        self.tfidf = TFIDFEncoder()

        if X:
            self.encode(X)

    def encode(self, X):
        X_b = [f"{SEP}".join([str(j) for j in i]) for i in self.bert.encode(X)['input_ids']]
        X_t = self.tfidf.encode(X_b)
        return X_t


class FasttextEncoder:
    def __init__(self, model_name, X=None):
        self.model = fasttext.load_model(model_name)
        if X:
            self.encode(X)

    def encode(self, X):
        def get_sentence_embedding(sentence):
            return self.model.get_sentence_vector(sentence)

        # print('Encoding Fasttext')
        X_t = [" ".join(doc_tokenizer(i)) for i in X]
        X_t = np.array([get_sentence_embedding(i) for i in X_t])
        # print('Fasttext Encoded')
        return X_t


class ClassLabelEncoder(LabelEncoder):
    def __init__(self, y=None) -> None:
        super().__init__()
        if y:
            self.fit(y)

    def encode(self, y):
        return self.fit_transform(y)
glam4cm/graph2str/__init__.py: File without changes
glam4cm/graph2str/common.py
@@ -0,0 +1,34 @@
from collections import deque
import re


remove_extra_spaces = lambda txt: re.sub(r'\s+', ' ', txt.strip())

def find_nodes_within_distance(graph, start_node, distance):
    q, visited = deque(), dict()
    q.append((start_node, 0))

    while q:
        n, d = q.popleft()
        if d <= distance:
            visited[n] = d
            neighbours = [neighbor for neighbor in graph.neighbors(n) if neighbor != n and neighbor not in visited]
            for neighbour in neighbours:
                if neighbour not in visited:
                    q.append((neighbour, d + 1))

    sorted_list = sorted(visited.items(), key=lambda x: x[1])
    return sorted_list


def get_node_neighbours(graph, start_node, distance):
    neighbours = find_nodes_within_distance(graph, start_node, distance)
    max_distance = max(distance for _, distance in neighbours)
    distance = min(distance, max_distance)
    return [node for node, d in neighbours if d == distance]


def has_neighbours_incl_incoming(graph, node):
    edges = list(graph.edges(node))
    edges += list(graph.in_edges(node))
    return len(edges) != 0