glam4cm 0.1.1__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff compares the contents of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between versions as they appear in the public registry.
- glam4cm/__init__.py +2 -1
- glam4cm/data_loading/data.py +90 -146
- glam4cm/data_loading/encoding.py +17 -6
- glam4cm/data_loading/graph_dataset.py +192 -57
- glam4cm/data_loading/metadata.py +1 -1
- glam4cm/data_loading/models_dataset.py +42 -18
- glam4cm/downstream_tasks/bert_edge_classification.py +49 -22
- glam4cm/downstream_tasks/bert_graph_classification.py +44 -14
- glam4cm/downstream_tasks/bert_graph_classification_comp.py +47 -24
- glam4cm/downstream_tasks/bert_link_prediction.py +46 -26
- glam4cm/downstream_tasks/bert_node_classification.py +127 -89
- glam4cm/downstream_tasks/cm_gpt_node_classification.py +61 -15
- glam4cm/downstream_tasks/common_args.py +32 -4
- glam4cm/downstream_tasks/gnn_edge_classification.py +24 -7
- glam4cm/downstream_tasks/gnn_graph_cls.py +19 -6
- glam4cm/downstream_tasks/gnn_link_prediction.py +25 -13
- glam4cm/downstream_tasks/gnn_node_classification.py +19 -7
- glam4cm/downstream_tasks/utils.py +16 -2
- glam4cm/embeddings/bert.py +1 -1
- glam4cm/embeddings/common.py +7 -4
- glam4cm/encoding/encoders.py +1 -1
- glam4cm/lang2graph/archimate.py +0 -5
- glam4cm/lang2graph/common.py +99 -41
- glam4cm/lang2graph/ecore.py +1 -2
- glam4cm/lang2graph/ontouml.py +8 -7
- glam4cm/models/gnn_layers.py +20 -6
- glam4cm/models/hf.py +2 -2
- glam4cm/run.py +12 -7
- glam4cm/run_conf_v2.py +405 -0
- glam4cm/run_configs.py +70 -106
- glam4cm/run_confs.py +41 -0
- glam4cm/settings.py +15 -2
- glam4cm/tokenization/special_tokens.py +23 -1
- glam4cm/tokenization/utils.py +23 -4
- glam4cm/trainers/cm_gpt_trainer.py +1 -1
- glam4cm/trainers/gnn_edge_classifier.py +12 -1
- glam4cm/trainers/gnn_graph_classifier.py +12 -5
- glam4cm/trainers/gnn_link_predictor.py +18 -3
- glam4cm/trainers/gnn_link_predictor_v2.py +146 -0
- glam4cm/trainers/gnn_trainer.py +8 -0
- glam4cm/trainers/metrics.py +1 -1
- glam4cm/utils.py +265 -2
- {glam4cm-0.1.1.dist-info → glam4cm-1.0.0.dist-info}/METADATA +3 -2
- glam4cm-1.0.0.dist-info/RECORD +75 -0
- {glam4cm-0.1.1.dist-info → glam4cm-1.0.0.dist-info}/WHEEL +1 -1
- glam4cm-0.1.1.dist-info/RECORD +0 -72
- {glam4cm-0.1.1.dist-info → glam4cm-1.0.0.dist-info}/entry_points.txt +0 -0
- {glam4cm-0.1.1.dist-info → glam4cm-1.0.0.dist-info/licenses}/LICENSE +0 -0
- {glam4cm-0.1.1.dist-info → glam4cm-1.0.0.dist-info}/top_level.txt +0 -0
glam4cm/data_loading/graph_dataset.py
CHANGED
@@ -11,18 +11,29 @@ import numpy as np
 from scipy.sparse import csr_matrix
 from transformers import AutoTokenizer
 from glam4cm.data_loading.data import TorchEdgeGraph, TorchGraph, TorchNodeGraph
-from glam4cm.data_loading.models_dataset import
+from glam4cm.data_loading.models_dataset import (
+    ArchiMateDataset,
+    EcoreDataset,
+    OntoUMLDataset
+)
 from glam4cm.data_loading.encoding import EncodingDataset, GPTTextDataset
 from tqdm.auto import tqdm
 from glam4cm.embeddings.w2v import Word2VecEmbedder
 from glam4cm.embeddings.tfidf import TfidfEmbedder
 from glam4cm.embeddings.common import get_embedding_model
 from glam4cm.lang2graph.common import LangGraph, get_node_data, get_edge_data
-from glam4cm.data_loading.metadata import
+from glam4cm.data_loading.metadata import (
+    ArchimateMetaData,
+    EcoreMetaData,
+    OntoUMLMetaData
+)
 from glam4cm.settings import seed
 from glam4cm.settings import (
-
-
+    EDGE_CLS_TASK,
+    LINK_PRED_TASK,
+    NODE_CLS_TASK,
+    GRAPH_CLS_TASK,
+    DUMMY_GRAPH_CLS_TASK
 )
 import glam4cm.utils as utils
 
@@ -94,7 +105,7 @@ class GraphDataset(torch.utils.data.Dataset):
     def __init__(
         self,
         models_dataset: Union[EcoreDataset, ArchiMateDataset],
-
+        task_type: str,
         distance=1,
         add_negative_train_samples=False,
         neg_sampling_ratio=1,
@@ -103,6 +114,8 @@ class GraphDataset(torch.utils.data.Dataset):
         use_node_types=False,
         use_edge_label=False,
         no_labels=False,
+
+        node_topk=-1,
 
         node_cls_label=None,
         edge_cls_label=None,
@@ -120,8 +133,11 @@ class GraphDataset(torch.utils.data.Dataset):
         randomize_ee=False,
         random_embed_dim=128,
 
-        exclude_labels:
+        exclude_labels: List = None,
+        save_dir='datasets/graph_data',
     ):
+        self.task_type = task_type
+
         if isinstance(models_dataset, EcoreDataset):
             self.metadata = EcoreMetaData()
         elif isinstance(models_dataset, ArchiMateDataset):
|
@@ -147,12 +163,34 @@ class GraphDataset(torch.utils.data.Dataset):
|
|
147
163
|
|
148
164
|
self.test_ratio = test_ratio
|
149
165
|
|
150
|
-
self.no_shuffle = no_shuffle
|
151
|
-
self.exclude_labels = exclude_labels
|
152
|
-
|
153
166
|
self.use_special_tokens = use_special_tokens
|
154
167
|
self.node_cls_label = node_cls_label
|
155
168
|
self.edge_cls_label = edge_cls_label
|
169
|
+
|
170
|
+
self.node_topk = node_topk
|
171
|
+
|
172
|
+
|
173
|
+
self.no_shuffle = no_shuffle
|
174
|
+
|
175
|
+
all_labels = [
|
176
|
+
i[0] for i in sorted(
|
177
|
+
dict(Counter(
|
178
|
+
sum(
|
179
|
+
[
|
180
|
+
[
|
181
|
+
v for v in list(dict(model.numbered_graph.nodes(data=node_cls_label)).values()) if v
|
182
|
+
]
|
183
|
+
for model in models_dataset],
|
184
|
+
[]
|
185
|
+
)
|
186
|
+
)).items(),
|
187
|
+
key=lambda x: x[1],
|
188
|
+
reverse=True
|
189
|
+
)
|
190
|
+
]
|
191
|
+
|
192
|
+
self.node_topk = all_labels[:node_topk] if node_topk > 0 else all_labels
|
193
|
+
self.exclude_labels = (all_labels[node_topk+1:] if node_topk > 0 else []) + [None, '']
|
156
194
|
|
157
195
|
self.randomize_ne = randomize_ne
|
158
196
|
self.randomize_ee = randomize_ee
|
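The constructor block above replaces the old pass-through `exclude_labels` argument: it counts every `node_cls_label` value across all models, ranks labels by frequency, keeps the `node_topk` most frequent ones, and routes the rest (plus `None` and empty strings) into `exclude_labels`. A standalone sketch of the same ranking idea, using plain lists in place of the package's `numbered_graph` objects (toy data, illustrative names):

```python
from collections import Counter

# Toy stand-in for per-model node labels pulled off numbered_graph.nodes(data=...).
models = [
    ['Class', 'Class', 'Package', None],
    ['Class', 'Attribute', '', 'Package'],
]

# Flatten, drop falsy labels, rank by frequency (most common first).
all_labels = [
    label for label, _ in sorted(
        Counter(v for model in models for v in model if v).items(),
        key=lambda x: x[1],
        reverse=True,
    )
]

node_topk = 2
kept = all_labels[:node_topk]                        # ['Class', 'Package']
excluded = all_labels[node_topk + 1:] + [None, '']   # mirrors the node_topk+1 slice above
print(kept, excluded)                                # ['Class', 'Package'] [None, '']
```

Note that the `node_topk + 1` slice leaves the label at index `node_topk` in neither set; paired with the `:node_topk` keep-slice, a `node_topk:` exclude-slice would partition the list exactly.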
@@ -161,6 +199,8 @@ class GraphDataset(torch.utils.data.Dataset):
         self.graphs: List[Union[TorchNodeGraph, TorchEdgeGraph]] = []
         self.config = dict(
             name=models_dataset.name,
+            task_type=task_type,
+            node_topk=node_topk,
             distance=distance,
             add_negative_train_samples=add_negative_train_samples,
             neg_sampling_ratio=neg_sampling_ratio,
@@ -181,7 +221,6 @@ class GraphDataset(torch.utils.data.Dataset):
             randomize_ee=randomize_ee,
             random_embed_dim=random_embed_dim
         )
-
         self.save_dir = os.path.join(save_dir, models_dataset.name)
         os.makedirs(self.save_dir, exist_ok=True)
 
@@ -210,22 +249,21 @@ class GraphDataset(torch.utils.data.Dataset):
         self.config_hash = self.get_config_hash()
         os.makedirs(os.path.join(self.save_dir, self.config_hash), exist_ok=True)
         self.file_paths = {
-            graph.hash: os.path.join(
+            graph.hash: os.path.join(
+                self.save_dir,
+                self.config_hash,
+                f'{graph.hash}_{self.get_string_gen_params_hash()}',
+                'data.pkl'
+            )
             for graph in models_dataset
         }
 
         print("Number of duplicate graphs: ", len(models_dataset) - len(self.file_paths))
 
-
-    def set_torch_graphs(
-        self,
-        type: str,
-        models_dataset: Union[EcoreDataset, ArchiMateDataset],
-        limit: int =-1
-    ):
-
+    def get_common_params(self):
         common_params = dict(
             metadata=self.metadata,
+            task_type=self.task_type,
             distance=self.distance,
             test_ratio=self.test_ratio,
             use_attributes=self.use_attributes,
@@ -235,8 +273,34 @@ class GraphDataset(torch.utils.data.Dataset):
             use_special_tokens=self.use_special_tokens,
             no_labels=self.no_labels,
             node_cls_label=self.node_cls_label,
-            edge_cls_label=self.edge_cls_label
+            edge_cls_label=self.edge_cls_label,
+            node_topk=self.node_topk,
         )
+        return common_params
+
+    def get_string_gen_params_hash(self):
+        string_gen_params = f"""
+            distance={self.distance},
+            use_attributes={self.use_attributes},
+            use_node_types={self.use_node_types},
+            use_edge_types={self.use_edge_types},
+            use_edge_label={self.use_edge_label},
+            use_special_tokens={self.use_special_tokens},
+            no_labels={self.no_labels},
+            node_cls_label={self.node_cls_label},
+            edge_cls_label={self.edge_cls_label},
+            node_topk={self.node_topk},
+            ckpt={self.ckpt},
+        """
+        return utils.md5_hash(string_gen_params)
+
+    def set_torch_graphs(
+        self,
+        models_dataset: Union[EcoreDataset, ArchiMateDataset],
+        limit: int =-1
+    ):
+
+        common_params = self.get_common_params()
         def create_node_graph(graph: LangGraph, fp: str) -> TorchNodeGraph:
             node_params = {
                 **common_params,
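`get_string_gen_params_hash` folds every text-generation setting into an MD5 digest, and the `file_paths` mapping above appends that digest to each graph's content hash, so the same graph rendered under different settings caches to a different `data.pkl`. A sketch of the pattern, assuming `utils.md5_hash` is a thin `hashlib` wrapper (the helper below is a hypothetical stand-in):

```python
import hashlib
import os

def md5_hash(text: str) -> str:
    # Hypothetical stand-in for glam4cm.utils.md5_hash.
    return hashlib.md5(text.encode('utf-8')).hexdigest()

# Settings that change the generated node/edge texts (subset, illustrative values).
string_gen_params = "distance=1, use_attributes=True, node_cls_label=abstract"

graph_hash = 'a3f0'                       # illustrative per-graph content hash
save_dir, config_hash = 'datasets/graph_data/modelset', 'cfg0'

# One directory per (graph, text-generation settings) pair, as in file_paths above.
fp = os.path.join(save_dir, config_hash,
                  f'{graph_hash}_{md5_hash(string_gen_params)}', 'data.pkl')
print(fp)
```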
@@ -256,24 +320,29 @@ class GraphDataset(torch.utils.data.Dataset):
             return torch_graph
 
 
-
         models_size = len(models_dataset) \
             if (limit == -1 or limit > len(models_dataset)) else limit
 
         self.set_file_hashes(models_dataset[:models_size])
 
-        for graph in tqdm(models_dataset[:models_size], desc=f'Creating {
+        for graph in tqdm(models_dataset[:models_size], desc=f'Creating {self.task_type} graphs'):
             fp = self.file_paths[graph.hash]
             if not os.path.exists(fp) or self.reload:
-                if
+                if self.task_type in [NODE_CLS_TASK, GRAPH_CLS_TASK, DUMMY_GRAPH_CLS_TASK]:
                     torch_graph: TorchNodeGraph = create_node_graph(graph, fp)
-                elif
+                elif self.task_type in [EDGE_CLS_TASK, LINK_PRED_TASK]:
                     torch_graph: TorchEdgeGraph = create_edge_graph(graph, fp)
-
+                else:
+                    raise ValueError(f"Invalid task type: {self.task_type}")
                 torch_graph.save()
 
-
-
+
+    def embed(self, models_dataset, limit):
+        models_size = len(models_dataset) \
+            if (limit == -1 or limit > len(models_dataset)) else limit
+        print("Limit: ", limit)
+        for graph in tqdm(models_dataset[:models_size], desc=f'Creating {self.task_type} graphs'):
+            fp = self.file_paths[graph.hash]
             torch_graph = TorchGraph.load(fp)
             torch_graph.embed(
                 self.embedder,
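`set_torch_graphs` no longer takes a positional `type` string; it dispatches on the dataset-wide `task_type`, building node-level graphs for node and graph classification and edge-level graphs for edge classification and link prediction, while `embed` becomes a separate pass over the cached files. The dispatch in isolation, with the task constants stubbed (in the package they come from `glam4cm.settings`; their concrete values are not shown in this diff):

```python
# Stubbed task constants; the real ones live in glam4cm.settings.
NODE_CLS_TASK, GRAPH_CLS_TASK, DUMMY_GRAPH_CLS_TASK = 'node_cls', 'graph_cls', 'dummy_graph_cls'
EDGE_CLS_TASK, LINK_PRED_TASK = 'edge_cls', 'link_pred'

def graph_kind(task_type: str) -> str:
    # Node-level views serve node and whole-graph classification...
    if task_type in [NODE_CLS_TASK, GRAPH_CLS_TASK, DUMMY_GRAPH_CLS_TASK]:
        return 'node_graph'
    # ...edge-level views serve edge classification and link prediction.
    if task_type in [EDGE_CLS_TASK, LINK_PRED_TASK]:
        return 'edge_graph'
    raise ValueError(f"Invalid task type: {task_type}")

print(graph_kind('link_pred'))  # edge_graph
```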
@@ -306,7 +375,8 @@ class GraphDataset(torch.utils.data.Dataset):
             return np.concatenate([a, b], axis=1)
 
         prefix_cls = getattr(self, f"{prefix}_cls_label")
-        num_classes = getattr(self, f"
+        num_classes = getattr(self, f"{prefix}_label_map_{prefix_cls}").classes_.shape[0]
+
         # print(f"Number of {prefix} types: {num_classes}")
         for g in self.graphs:
             types = np.eye(num_classes)[getattr(g.data, f"{prefix}_{prefix_cls}")]
@@ -325,10 +395,10 @@ class GraphDataset(torch.utils.data.Dataset):
             assert all(g.data.edge_attr.shape[1] == edge_dim for g in self.graphs), "Edge types not added correctly"
 
 
-        if self.use_node_types and self.node_cls_label:
+        if self.use_node_types and self.node_cls_label and self.task_type not in [NODE_CLS_TASK]:
             set_types('node')
 
-        if self.use_edge_types and self.edge_cls_label:
+        if self.use_edge_types and self.edge_cls_label and self.task_type not in [EDGE_CLS_TASK, LINK_PRED_TASK]:
             set_types('edge')
 
     def __len__(self):
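`set_types` appends one-hot class columns to the feature matrices by indexing an identity matrix; the new `task_type` guards skip that step when the classes being encoded are the prediction target, which would otherwise leak labels into the features. The indexing trick in isolation:

```python
import numpy as np

num_classes = 4
classes = np.array([0, 2, 3, 2])        # encoded node (or edge) classes

one_hot = np.eye(num_classes)[classes]  # one identity row per element, shape (4, 4)
features = np.random.rand(4, 8)         # existing embedding features

# Concatenate the one-hot type columns onto the features, as set_types does.
augmented = np.concatenate([features, one_hot], axis=1)
print(augmented.shape)                  # (4, 12)
```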
@@ -375,8 +445,9 @@ class GraphDataset(torch.utils.data.Dataset):
             assert torch_graph.data.overall_edge_index.shape[1] == torch_graph.graph.number_of_edges(), \
                 f"Number of edges mismatch, {torch_graph.data.edge_index.shape[1]} != {torch_graph.graph.number_of_edges()}"
         else:
-
-
+            if len(torch_graph.data.edge_index.shape) > 1:
+                assert torch_graph.data.edge_index.shape[1] == torch_graph.graph.number_of_edges(), \
+                    f"Number of edges mismatch, {torch_graph.data.edge_index.shape[1]} != {torch_graph.graph.number_of_edges()}"
 
 
 
@@ -407,7 +478,6 @@ class GraphDataset(torch.utils.data.Dataset):
         node_label_map = LabelEncoder()
         node_label_map.fit_transform([j for i in label_values for j in i])
         label_values = [node_label_map.transform(i) for i in label_values]
-        print(node_label_map.classes_)
 
         for torch_graph, node_classes in zip(self.graphs, label_values):
             setattr(torch_graph.data, f"node_{cls_label}", np.array(node_classes))
@@ -532,7 +602,7 @@ class GraphDataset(torch.utils.data.Dataset):
         y = [getattr(self.graphs[i].data, f'graph_{graph_label_name}')[0].item() for i in indices]
 
         dataset = EncodingDataset(tokenizer, X, y, remove_duplicates=remove_duplicates)
-        print("\n".join([f"Label: {self.graph_label_map_label.inverse_transform([l])[0]}, Text: {i}" for i, l in zip(X, y)]))
+        # print("\n".join([f"Label: {self.graph_label_map_label.inverse_transform([l])[0]}, Text: {i}" for i, l in zip(X, y)]))
 
         return dataset
 
@@ -566,7 +636,7 @@ class GraphEdgeDataset(GraphDataset):
     def __init__(
         self,
         models_dataset: Union[EcoreDataset, ArchiMateDataset],
-
+        task_type: str,
         distance=0,
         reload=False,
         test_ratio=0.2,
@@ -580,6 +650,8 @@ class GraphEdgeDataset(GraphDataset):
         use_node_types=False,
         no_labels=False,
 
+        node_topk = -1,
+
         use_embeddings=False,
         embed_model_name='bert-base-uncased',
         ckpt=None,
@@ -595,13 +667,13 @@ class GraphEdgeDataset(GraphDataset):
 
         node_cls_label: str = None,
         edge_cls_label: str = None,
-
-        task_type=LP_TASK_EDGE_CLS
+        save_dir='datasets/graph_data'
     ):
-
+        assert task_type in [EDGE_CLS_TASK, GRAPH_CLS_TASK], f"Invalid task type: Must be one of {[EDGE_CLS_TASK, GRAPH_CLS_TASK]}."
         super().__init__(
             models_dataset=models_dataset,
-
+            task_type=task_type,
+
             distance=distance,
             test_ratio=test_ratio,
 
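Callers of `GraphEdgeDataset` must now pass the task explicitly instead of relying on the removed `task_type=LP_TASK_EDGE_CLS` default, and the constructor asserts it up front. A hedged usage sketch (the dataset name and label are illustrative, not defaults):

```python
from glam4cm.data_loading.models_dataset import EcoreDataset
from glam4cm.data_loading.graph_dataset import GraphEdgeDataset
from glam4cm.settings import EDGE_CLS_TASK

models = EcoreDataset('modelset')        # illustrative dataset name
edges = GraphEdgeDataset(
    models,
    task_type=EDGE_CLS_TASK,             # now required; checked by the new assert
    distance=1,
    edge_cls_label='type',               # dataset-specific label choice
    save_dir='datasets/graph_data',      # new configurable cache root
)
```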
@@ -609,8 +681,9 @@ class GraphEdgeDataset(GraphDataset):
             use_edge_types=use_edge_types,
             use_edge_label=use_edge_label,
             use_attributes=use_attributes,
-            no_labels=no_labels,
-
+            no_labels=no_labels,
+            node_topk = node_topk,
+
             add_negative_train_samples=add_negative_train_samples,
             neg_sampling_ratio=neg_sampling_ratio,
 
@@ -631,11 +704,10 @@ class GraphEdgeDataset(GraphDataset):
             randomize_ne=randomize_ne,
             randomize_ee=randomize_ee,
             random_embed_dim=random_embed_dim,
+            save_dir=save_dir
         )
 
-        self.
-
-        self.set_torch_graphs('edge', models_dataset, limit)
+        self.set_torch_graphs(models_dataset, limit)
 
         if self.use_embeddings and (isinstance(self.embedder, Word2VecEmbedder) or isinstance(self.embedder, TfidfEmbedder)):
             texts = self.get_link_prediction_texts(only_texts=True)
@@ -644,7 +716,7 @@ class GraphEdgeDataset(GraphDataset):
             self.embedder.train(texts)
             print(f"Trained {self.embedder.name} Embedder")
 
-        self.embed()
+        self.embed(models_dataset, limit)
 
         train_count, test_count = dict(), dict()
         for g in self.graphs:
@@ -675,12 +747,13 @@ class GraphEdgeDataset(GraphDataset):
         assert label is not None, "No edge label found in data. Please define edge label in metadata"
 
         data = defaultdict(list)
-        for torch_graph in tqdm(self.graphs, desc='Getting
+        for torch_graph in tqdm(self.graphs, desc=f'Getting {self.task_type} Texts'):
             # torch_graph: TorchEdgeGraph = TorchGraph.load(fp)
             graph_data = torch_graph.get_link_prediction_texts(label, self.task_type, only_texts)
             for k, v in graph_data.items():
                 data[k] += v
 
+
         print("Train Texts: ", data[f'train_pos_edges'][:20])
         print("Test Texts: ", data[f'test_pos_edges'][:20])
 
@@ -701,7 +774,7 @@ class GraphEdgeDataset(GraphDataset):
 
 
         print("Tokenizing data")
-        if self.task_type ==
+        if self.task_type == EDGE_CLS_TASK:
             datasets = {
                 'train': EncodingDataset(
                     tokenizer,
@@ -714,7 +787,7 @@ class GraphEdgeDataset(GraphDataset):
                     data['test_edge_classes']
                 )
             }
-        elif self.task_type ==
+        elif self.task_type == LINK_PRED_TASK:
             datasets = {
                 'train': EncodingDataset(
                     tokenizer,
@@ -738,8 +811,9 @@ class GraphEdgeDataset(GraphDataset):
 class GraphNodeDataset(GraphDataset):
     def __init__(
         self,
-        models_dataset: Union[EcoreDataset, ArchiMateDataset],
-
+        models_dataset: Union[EcoreDataset, ArchiMateDataset, OntoUMLDataset],
+        task_type: str,
+
         distance=0,
         test_ratio=0.2,
         reload=False,
@@ -749,6 +823,7 @@ class GraphNodeDataset(GraphDataset):
         use_node_types=False,
         use_edge_label=False,
         use_special_tokens=False,
+        node_topk=-1,
 
         use_embeddings=False,
         embed_model_name='bert-base-uncased',
@@ -762,11 +837,69 @@ class GraphNodeDataset(GraphDataset):
         limit: int = -1,
         no_labels=False,
         node_cls_label: str = None,
-        edge_cls_label: str = None
+        edge_cls_label: str = None,
+
+        save_dir='datasets/graph_data',
     ):
+        """
+        Parameters
+        ----------
+        models_dataset: Union[EcoreDataset, ArchiMateDataset, OntoUMLDataset]
+            The dataset of models to convert to a graph dataset.
+        task_type: str
+            The type of task to perform on the graph dataset. Must be one of 'node', 'edge', or 'graph'.
+        distance: int
+            The distance to consider when creating the graph. If 0, only the node itself is considered.
+        test_ratio: float
+            The proportion of the dataset to split into the test set.
+        reload: bool
+            Whether to reload the dataset from disk if it already exists.
+        use_attributes: bool
+            Whether to include attributes of the nodes and edges in the graph.
+        use_edge_types: bool
+            Whether to include the types of the edges in the graph.
+        use_node_types: bool
+            Whether to include the types of the nodes in the graph.
+        use_edge_label: bool
+            Whether to include the labels of the edges in the graph.
+        use_special_tokens: bool
+            Whether to include special tokens for the start and end of a node or edge sequence.
+        node_topk: int
+            The number of top nodes to include in the graph. If -1, all nodes are included.
+        use_embeddings: bool
+            Whether to use embeddings for the node and edge attributes.
+        embed_model_name: str
+            The name of the embedding model to use.
+        ckpt: str
+            The path to the checkpoint file of the embedding model.
+        no_shuffle: bool
+            Whether to shuffle the dataset before splitting it into train and test sets.
+        randomize_ne: bool
+            Whether to randomize the node embeddings.
+        randomize_ee: bool
+            Whether to randomize the edge embeddings.
+        random_embed_dim: int
+            The dimension of the random embeddings.
+        limit: int
+            The maximum number of models to include in the dataset.
+        no_labels: bool
+            Whether to include labels for the nodes and edges in the graph.
+        node_cls_label: str
+            The label to use for the node classification task.
+        edge_cls_label: str
+            The label to use for the edge classification task.
+        save_dir: str
+            The directory in which to save the dataset.
+
+        Returns
+        -------
+        A GraphNodeDataset object.
+        """
+        assert task_type in [NODE_CLS_TASK, GRAPH_CLS_TASK], f"Invalid task type: Must be one of {[NODE_CLS_TASK, GRAPH_CLS_TASK]}."
         super().__init__(
             models_dataset=models_dataset,
-
+            task_type=task_type,
+
             distance=distance,
             test_ratio=test_ratio,
 
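Per the new docstring, `GraphNodeDataset` now takes the task up front and restricts it to node or graph classification. A minimal hedged construction example (names and values are illustrative):

```python
from glam4cm.data_loading.models_dataset import EcoreDataset
from glam4cm.data_loading.graph_dataset import GraphNodeDataset
from glam4cm.settings import NODE_CLS_TASK

models = EcoreDataset('modelset')      # illustrative dataset name
nodes = GraphNodeDataset(
    models,
    task_type=NODE_CLS_TASK,           # must be NODE_CLS_TASK or GRAPH_CLS_TASK
    distance=1,
    node_cls_label='abstract',         # dataset-specific label choice
    node_topk=10,                      # keep only the 10 most frequent node labels
)
```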
@@ -778,6 +911,8 @@ class GraphNodeDataset(GraphDataset):
 
             node_cls_label=node_cls_label,
             edge_cls_label=edge_cls_label,
+
+            node_topk=node_topk,
 
             use_embeddings=use_embeddings,
             embed_model_name=embed_model_name,
@@ -791,9 +926,10 @@ class GraphNodeDataset(GraphDataset):
             randomize_ne=randomize_ne,
             randomize_ee=randomize_ee,
             random_embed_dim=random_embed_dim,
+            save_dir=save_dir,
         )
 
-        self.set_torch_graphs(
+        self.set_torch_graphs(models_dataset, limit)
 
         if self.use_embeddings and (isinstance(self.embedder, Word2VecEmbedder) or isinstance(self.embedder, TfidfEmbedder)):
             texts = self.get_node_classification_texts()
@@ -802,7 +938,7 @@ class GraphNodeDataset(GraphDataset):
             self.embedder.train(texts)
             print(f"Trained {self.embedder.name} Embedder")
 
-        self.embed()
+        self.embed(models_dataset, limit)
 
         node_labels = self.metadata.node_cls
         if isinstance(node_labels, str):
@@ -860,9 +996,8 @@ class GraphNodeDataset(GraphDataset):
             data['test_node_classes'] += test_node_classes
 
 
-        print("
-        print(data['
-        print(data['test_nodes'][:10])
+        # print("\n".join(data['train_nodes']))
+        # print("\n".join(data['test_nodes']))
         if hasattr(self, "node_label_map_type"):
             node_label_map.inverse_transform([i.item() for i in train_node_classes]) == train_node_strs
             node_label_map.inverse_transform([i.item() for i in test_node_classes]) == test_node_strs
glam4cm/data_loading/models_dataset.py
CHANGED
@@ -30,7 +30,8 @@ class ModelDataset:
         min_edges: int = -1,
         min_enr: float = -1,
         timeout=-1,
-        preprocess_graph_text: callable = None
+        preprocess_graph_text: callable = None,
+        include_dummies=False
     ):
         self.name = dataset_name
         self.dataset_dir = dataset_dir
@@ -41,6 +42,7 @@ class ModelDataset:
         self.min_enr = min_enr
         self.timeout = timeout
         self.preprocess_graph_text = preprocess_graph_text
+        self.include_dummies = include_dummies
 
         self.graphs: List[LangGraph] = []
 
@@ -114,12 +116,14 @@ class ModelDataset:
 
     def save(self):
         print(f'Saving {self.name} to pickle')
-
+        pkl_file = f'{self.name}{"_with_dummies" if self.include_dummies else ''}.pkl'
+        with open(os.path.join(self.save_dir, pkl_file), 'wb') as f:
             pickle.dump(self.graphs, f)
         print(f'Saved {self.name} to pickle')
 
 
     def filter_graphs(self):
+        # print("Filtering graphs with min edges and min enr: ", self.min_edges, self.min_enr)
         graphs = list()
         for graph in self.graphs:
             addable = True
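`save` and `load` now derive the pickle name from the `include_dummies` flag, so dummy-augmented and plain corpora cache side by side instead of overwriting each other. Note the added f-string nests single quotes inside single quotes (`else ''`), which is only valid syntax on Python 3.12+; a version-agnostic equivalent of the naming scheme:

```python
def pkl_name(name: str, include_dummies: bool) -> str:
    # Same naming scheme as ModelDataset.save/load, without nested f-string quotes.
    suffix = "_with_dummies" if include_dummies else ""
    return f"{name}{suffix}.pkl"

print(pkl_name('modelset', False))  # modelset.pkl
print(pkl_name('modelset', True))   # modelset_with_dummies.pkl
```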
@@ -129,6 +133,7 @@ class ModelDataset:
                 addable = False
 
             if addable:
+                # print("Addable because min edges and min enr: ", graph.number_of_edges())
                 graphs.append(graph)
 
         self.graphs = graphs
@@ -137,7 +142,8 @@ class ModelDataset:
 
     def load(self):
         print(f'Loading {self.name} from pickle')
-
+        pkl_file = f'{self.name}{"_with_dummies" if self.include_dummies else ''}.pkl'
+        with open(os.path.join(self.save_dir, pkl_file), 'rb') as f:
             self.graphs = pickle.load(f)
 
         self.filter_graphs()
@@ -172,7 +178,8 @@ class EcoreDataset(ModelDataset):
         remove_duplicates=False,
         min_edges: int = -1,
         min_enr: float = -1,
-        preprocess_graph_text: callable = None
+        preprocess_graph_text: callable = None,
+        include_dummies=False
     ):
         super().__init__(
             dataset_name,
@@ -180,22 +187,34 @@ class EcoreDataset(ModelDataset):
             save_dir=save_dir,
             min_edges=min_edges,
             min_enr=min_enr,
-            preprocess_graph_text=preprocess_graph_text
+            preprocess_graph_text=preprocess_graph_text,
+            include_dummies=include_dummies
         )
         os.makedirs(save_dir, exist_ok=True)
 
         dataset_exists = os.path.exists(os.path.join(save_dir, f'{dataset_name}.pkl'))
         if reload or not dataset_exists:
+
+
             self.graphs: List[EcoreNxG] = []
             data_path = os.path.join(dataset_dir, dataset_name)
-
-
-
-
-
-
-
-
+            file_name = os.path.join(data_path, 'ecore.jsonl') if not include_dummies\
+                else os.path.join(data_path, 'ecore-with-dummy.jsonl')
+
+            # for file in os.listdir(data_path):
+            #     if file.endswith('.jsonl') and file.startswith("ecore"):
+            json_objects = json.load(open(file_name))
+            for g in tqdm(json_objects, desc=f'Loading {dataset_name.title()}'):
+
+                if remove_duplicates and g['is_duplicated']:
+                    continue
+
+                if not include_dummies and g['labels'] == 'dummy':
+                    print(f"Skipping dummy graph {g['ids']}")
+                    continue
+
+                nxg = EcoreNxG(g)
+                self.graphs.append(nxg)
 
         print(f'Loaded Total {self.name} with {len(self.graphs)} graphs')
         print("Filtering...")
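`EcoreDataset` now resolves a fixed JSONL file from the flag rather than scanning the directory (the old scan survives only as a comment), skipping records marked `is_duplicated` or labeled `dummy` as it loads. Assuming the on-disk layout the diff implies, usage reduces to the flag (dataset name illustrative):

```python
from glam4cm.data_loading.models_dataset import EcoreDataset

# Reads <dataset_dir>/modelset/ecore.jsonl.
plain = EcoreDataset('modelset', include_dummies=False)

# Reads <dataset_dir>/modelset/ecore-with-dummy.jsonl and keeps 'dummy'-labeled graphs.
with_dummies = EcoreDataset('modelset', include_dummies=True)
```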
@@ -233,7 +252,8 @@ class ArchiMateDataset(ModelDataset):
         min_enr: float = -1,
         timeout=-1,
         language=None,
-        preprocess_graph_text: callable = None
+        preprocess_graph_text: callable = None,
+        include_dummies=False
     ):
         super().__init__(
             dataset_name,
@@ -242,7 +262,8 @@ class ArchiMateDataset(ModelDataset):
             min_edges=min_edges,
             min_enr=min_enr,
             timeout=timeout,
-            preprocess_graph_text=preprocess_graph_text
+            preprocess_graph_text=preprocess_graph_text,
+            include_dummies=include_dummies
         )
         os.makedirs(save_dir, exist_ok=True)
 
@@ -274,7 +295,7 @@ class ArchiMateDataset(ModelDataset):
 
             except Exception as e:
                 raise e
-
+            print("Total graphs:", len(self.graphs))
             self.filter_graphs()
             self.save()
         else:
@@ -283,6 +304,7 @@ class ArchiMateDataset(ModelDataset):
             if remove_duplicates:
                 self.dedup()
 
+            assert all([g.number_of_edges() >= min_edges for g in self.graphs]), f"Filtered out graphs with less than {min_edges} edges"
             print(f'Loaded {self.name} with {len(self.graphs)} graphs')
             print(f'Graphs: {len(self.graphs)}')
 
@@ -305,7 +327,8 @@ class OntoUMLDataset(ModelDataset):
         min_edges: int = -1,
         min_enr: float = -1,
         timeout=-1,
-        preprocess_graph_text: callable = None
+        preprocess_graph_text: callable = None,
+        include_dummies=False
     ):
         super().__init__(
             dataset_name,
@@ -314,7 +337,8 @@ class OntoUMLDataset(ModelDataset):
             min_edges=min_edges,
             min_enr=min_enr,
             timeout=timeout,
-            preprocess_graph_text=preprocess_graph_text
+            preprocess_graph_text=preprocess_graph_text,
+            include_dummies=include_dummies
         )
         os.makedirs(save_dir, exist_ok=True)
 