glam4cm 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. glam4cm/__init__.py +9 -0
  2. glam4cm/data_loading/__init__.py +0 -0
  3. glam4cm/data_loading/data.py +631 -0
  4. glam4cm/data_loading/encoding.py +76 -0
  5. glam4cm/data_loading/graph_dataset.py +940 -0
  6. glam4cm/data_loading/metadata.py +84 -0
  7. glam4cm/data_loading/models_dataset.py +361 -0
  8. glam4cm/data_loading/utils.py +20 -0
  9. glam4cm/downstream_tasks/__init__.py +0 -0
  10. glam4cm/downstream_tasks/bert_edge_classification.py +144 -0
  11. glam4cm/downstream_tasks/bert_graph_classification.py +137 -0
  12. glam4cm/downstream_tasks/bert_graph_classification_comp.py +156 -0
  13. glam4cm/downstream_tasks/bert_link_prediction.py +145 -0
  14. glam4cm/downstream_tasks/bert_node_classification.py +164 -0
  15. glam4cm/downstream_tasks/cm_gpt_edge_classification.py +73 -0
  16. glam4cm/downstream_tasks/cm_gpt_node_classification.py +76 -0
  17. glam4cm/downstream_tasks/cm_gpt_pretraining.py +64 -0
  18. glam4cm/downstream_tasks/common_args.py +160 -0
  19. glam4cm/downstream_tasks/create_dataset.py +51 -0
  20. glam4cm/downstream_tasks/gnn_edge_classification.py +106 -0
  21. glam4cm/downstream_tasks/gnn_graph_cls.py +101 -0
  22. glam4cm/downstream_tasks/gnn_link_prediction.py +109 -0
  23. glam4cm/downstream_tasks/gnn_node_classification.py +103 -0
  24. glam4cm/downstream_tasks/tf_idf_text_classification.py +22 -0
  25. glam4cm/downstream_tasks/utils.py +35 -0
  26. glam4cm/downstream_tasks/word2vec_text_classification.py +108 -0
  27. glam4cm/embeddings/__init__.py +0 -0
  28. glam4cm/embeddings/bert.py +72 -0
  29. glam4cm/embeddings/common.py +43 -0
  30. glam4cm/embeddings/fasttext.py +0 -0
  31. glam4cm/embeddings/tfidf.py +25 -0
  32. glam4cm/embeddings/w2v.py +41 -0
  33. glam4cm/encoding/__init__.py +0 -0
  34. glam4cm/encoding/common.py +0 -0
  35. glam4cm/encoding/encoders.py +100 -0
  36. glam4cm/graph2str/__init__.py +0 -0
  37. glam4cm/graph2str/common.py +34 -0
  38. glam4cm/graph2str/constants.py +15 -0
  39. glam4cm/graph2str/ontouml.py +141 -0
  40. glam4cm/graph2str/uml.py +0 -0
  41. glam4cm/lang2graph/__init__.py +0 -0
  42. glam4cm/lang2graph/archimate.py +31 -0
  43. glam4cm/lang2graph/bpmn.py +0 -0
  44. glam4cm/lang2graph/common.py +416 -0
  45. glam4cm/lang2graph/ecore.py +221 -0
  46. glam4cm/lang2graph/ontouml.py +169 -0
  47. glam4cm/lang2graph/utils.py +80 -0
  48. glam4cm/models/cmgpt.py +352 -0
  49. glam4cm/models/gnn_layers.py +273 -0
  50. glam4cm/models/hf.py +10 -0
  51. glam4cm/run.py +99 -0
  52. glam4cm/run_configs.py +126 -0
  53. glam4cm/settings.py +54 -0
  54. glam4cm/tokenization/__init__.py +0 -0
  55. glam4cm/tokenization/special_tokens.py +4 -0
  56. glam4cm/tokenization/utils.py +37 -0
  57. glam4cm/trainers/__init__.py +0 -0
  58. glam4cm/trainers/bert_classifier.py +105 -0
  59. glam4cm/trainers/cm_gpt_trainer.py +153 -0
  60. glam4cm/trainers/gnn_edge_classifier.py +126 -0
  61. glam4cm/trainers/gnn_graph_classifier.py +123 -0
  62. glam4cm/trainers/gnn_link_predictor.py +144 -0
  63. glam4cm/trainers/gnn_node_classifier.py +135 -0
  64. glam4cm/trainers/gnn_trainer.py +129 -0
  65. glam4cm/trainers/metrics.py +55 -0
  66. glam4cm/utils.py +194 -0
  67. glam4cm-0.1.0.dist-info/LICENSE +21 -0
  68. glam4cm-0.1.0.dist-info/METADATA +86 -0
  69. glam4cm-0.1.0.dist-info/RECORD +72 -0
  70. glam4cm-0.1.0.dist-info/WHEEL +5 -0
  71. glam4cm-0.1.0.dist-info/entry_points.txt +2 -0
  72. glam4cm-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,221 @@
1
+ import xmltodict
2
+ from glam4cm.lang2graph.common import LangGraph
3
+ import json
4
+ from glam4cm.tokenization.utils import doc_tokenizer
5
+ from glam4cm.settings import logger
6
+
7
+
8
+
9
+ REFERENCE = 'reference'
10
+ SUPERTYPE = 'supertype'
11
+ CONTAINMENT = 'containment'
12
+
13
+ EGenericType = 'EGenericType'
14
+ EPackage = 'EPackage'
15
+ EClass = 'EClass'
16
+ EAttribute = 'EAttribute'
17
+ EReference = 'EReference'
18
+ EEnum = 'EEnum'
19
+ EEnumLiteral = 'EEnumLiteral'
20
+ EOperation = 'EOperation'
21
+ EParameter = 'EParameter'
22
+ EDataType = 'EDataType'
23
+ GenericNodes = [EGenericType, EPackage]
24
+
25
+
26
+
27
+ class EcoreNxG(LangGraph):
28
+ def __init__(self, json_obj: dict):
29
+ super().__init__()
30
+ self.xmi = json_obj.get('xmi')
31
+ self.graph_id = json_obj.get('ids')
32
+ self.json_obj = json_obj
33
+ self.graph_type = json_obj.get('model_type')
34
+ self.label = json_obj.get('labels')
35
+ self.is_duplicated = json_obj.get('is_duplicated')
36
+ self.directed = json.loads(json_obj.get('graph')).get('directed')
37
+ # self.text = doc_tokenizer(json_obj.get('txt'))
38
+
39
+ self.__create_graph()
40
+ self.set_numbered_labels()
41
+
42
+
43
+ def __create_graph(self):
44
+ model = xmltodict.parse(self.xmi)
45
+ eclassifiers, _ = get_eclassifiers(model)
46
+ classifier_nodes = dict()
47
+ for eclassifier in eclassifiers:
48
+ eclassifier_info = get_eclassifier_info(eclassifier)
49
+ classifier_nodes[eclassifier_info['name']] = eclassifier_info
50
+
51
+ references = get_connections(classifier_nodes)
52
+
53
+ for classifier_name, classifier_info in classifier_nodes.items():
54
+ # if classifier_info['type'] != 'class':
55
+ # continue
56
+ structural_features = classifier_info.get('structural_features', [])
57
+ attributes = list()
58
+ for f in structural_features:
59
+ if f['type'] == 'ecore:EAttribute':
60
+ name = f['name']
61
+ attr_type = f['ref'] if f['ref'] else ''
62
+ attributes.append((name, attr_type))
63
+
64
+ self.add_node(
65
+ classifier_name,
66
+ name=classifier_name,
67
+ attributes=attributes,
68
+ abstract=classifier_info['abstract']
69
+ )
70
+
71
+ for edge in references:
72
+ src, dest = edge['source'], edge['target']
73
+ name = edge['name'] if 'name' in edge else ''
74
+ self.add_edge(src, dest, name=name, type=edge['type'])
75
+
76
+ for node in self.nodes:
77
+ self.nodes[node]['abstract'] = self.nodes[node]['abstract'] if 'abstract' in self.nodes[node] and self.nodes[node]['abstract'] is not None else False
78
+
79
+ logger.info(f'Graph {self.graph_id} created with {self.number_of_nodes()} nodes and {self.number_of_edges()} edges')
80
+
81
+ def __str__(self):
82
+ return self.__repr__()
83
+
84
+
85
+
86
+ def __repr__(self):
87
+ reference_edges = [edge for edge in self.edges if self.edges[edge]['type'] == REFERENCE]
88
+ containment_edges = [edge for edge in self.edges if self.edges[edge]['type'] == CONTAINMENT]
89
+ supertype_edges = [edge for edge in self.edges if self.edges[edge]['type'] == SUPERTYPE]
90
+ return f'EcoreNxG({self.graph_id}, nodes={self.number_of_nodes()}, edges={self.number_of_edges()}, references={len(reference_edges)}, containment={len(containment_edges)}, supertypes={len(supertype_edges)})'
91
+
92
+
93
+
94
+ def get_eclassifiers(json_obj):
95
+ def get_eclassifiers_util(json_obj, classifiers: list):
96
+ for key, value in json_obj.items():
97
+ if key == 'eClassifiers':
98
+ if isinstance(value, dict):
99
+ value = [value]
100
+ classifiers.extend(value)
101
+ elif isinstance(value, dict):
102
+ get_eclassifiers_util(value, classifiers)
103
+ elif isinstance(value, list):
104
+ for item in value:
105
+ if isinstance(item, dict):
106
+ get_eclassifiers_util(item, classifiers)
107
+ classifiers = list()
108
+ get_eclassifiers_util(json_obj, classifiers)
109
+ names = [c['@name'] for c in classifiers]
110
+ return classifiers, len(names) - len(set(names))
111
+
112
+
113
+ def get_connections(nodes):
114
+ links = list()
115
+ for source_class, classifier_info in nodes.items():
116
+ if classifier_info['type'] != 'class':
117
+ continue
118
+ super_types = classifier_info['super_types']
119
+ for super_type in super_types:
120
+ if super_type in nodes:
121
+ links.append({
122
+ 'source': source_class,
123
+ 'target': super_type,
124
+ 'type': SUPERTYPE,
125
+ })
126
+ nodes[super_type]['abstract'] = True
127
+
128
+ for feature in classifier_info['structural_features']:
129
+ ref = feature['ref']
130
+ if ref and ref in nodes:
131
+ links.append({
132
+ 'name': feature['name'],
133
+ 'source': source_class,
134
+ 'target': ref,
135
+ 'type': REFERENCE if not feature['containment'] else CONTAINMENT
136
+ })
137
+
138
+ for node in nodes:
139
+ abstract = nodes[node].get('abstract', '')
140
+ if abstract:
141
+ nodes[node]['abstract'] = True
142
+ else:
143
+ nodes[node]['abstract'] = False
144
+
145
+ return links
146
+
147
+
148
+ def get_estructural_feature(structural_feat):
149
+ feat_type = '@xsi:type' if '@xsi:type' in structural_feat else '@xmi:type'
150
+ structural_feat_type = structural_feat[feat_type]
151
+ name = structural_feat['@name']
152
+ eType = structural_feat['@eType'] if '@eType' in structural_feat else False
153
+
154
+ return {
155
+ 'name': name,
156
+ 'ref': eType.split('/')[-1] if eType else None,
157
+ 'type': structural_feat_type,
158
+ 'containment': structural_feat['@containment'] if '@containment' in structural_feat else None,
159
+ }
160
+
161
+
162
+ def get_eclassifier_info_eclass(eclass):
163
+ name = eclass['@name']
164
+ super_types = eclass['@eSuperTypes'] if '@eSuperTypes' in eclass else ""
165
+ super_types = [s.split('/')[-1] for s in super_types.split(' ')] if super_types else []
166
+ structural_features = eclass['eStructuralFeatures'] if 'eStructuralFeatures' in eclass else []
167
+ if not isinstance(structural_features, list):
168
+ structural_features = [structural_features]
169
+
170
+ structural_features_info = list()
171
+ for feature in structural_features:
172
+ structural_features_info.append(get_estructural_feature(feature))
173
+
174
+ return {
175
+ 'name': name,
176
+ 'type': 'class',
177
+ 'super_types': super_types,
178
+ 'structural_features': structural_features_info,
179
+ 'abstract': '@abstract' in eclass and eclass['@abstract']
180
+ }
181
+
182
+ def get_eclassifier_info_eenum(eenum):
183
+ name = eenum['@name']
184
+ literals = eenum['eLiterals'] if 'eLiterals' in eenum else []
185
+ if not isinstance(literals, list):
186
+ literals = [literals]
187
+
188
+ literals_info = list()
189
+ for literal in literals:
190
+ literal_label = '@literal' if '@literal' in literal else '@value'
191
+ name = literal['@name']
192
+ value = literal[literal_label] if literal_label in literal else ""
193
+ literals_info.append((name, value))
194
+
195
+ return {
196
+ 'name': name,
197
+ 'type': 'enum',
198
+ 'literals': literals_info
199
+ }
200
+
201
+ def get_eclassifier_info_edatatype(edatatype):
202
+ name = edatatype['@name']
203
+ return {
204
+ 'type': 'datatype',
205
+ 'name': name,
206
+ }
207
+
208
+
209
+ def get_eclassifier_info(eclassifier):
210
+ classifier_type = '@xsi:type' if '@xsi:type' in eclassifier else '@xmi:type'
211
+ if classifier_type not in eclassifier:
212
+ raise ValueError(f"Classifier has no type: {eclassifier}")
213
+ if eclassifier[classifier_type] in ['ecore:EClass', 'EClass']:
214
+ return get_eclassifier_info_eclass(eclassifier)
215
+ elif eclassifier[classifier_type] in ['ecore:EEnum', 'EEnum']:
216
+ return get_eclassifier_info_eenum(eclassifier)
217
+ elif eclassifier[classifier_type] in ['ecore:EDataType', 'EDataType']:
218
+ return get_eclassifier_info_edatatype(eclassifier)
219
+ else:
220
+ logger.log(eclassifier)
221
+ raise ValueError(f"Unknown classifier type: {eclassifier[classifier_type]}")
@@ -0,0 +1,169 @@
1
+ import json
2
+ from tqdm.auto import tqdm
3
+ from glam4cm.lang2graph.common import LangGraph
4
+ from glam4cm.utils import find_files_with_extension
5
+ from glam4cm.settings import logger
6
+
7
+
8
+ ONTOUML_ELEMENT_ID = 'id'
9
+ ONTOUML_ELEMENT_TYPE = 'type'
10
+ ONTOUML_ELEMENT_NAME = 'name'
11
+ ONTOUML_ELEMENT_DESCRIPTION = 'description'
12
+
13
+ ONTOUML_GENERALIZATION = "Generalization"
14
+ ONTOUML_GENERALIZATION_GENERAL = "general"
15
+ ONTOUML_GENERALIZATION_SPECIFIC = "specific"
16
+ ONTOUML_GENERALIZATION_SET = "GeneralizationSet"
17
+ ONTOUML_GENERALIZATION_SET_GENERALIZATIONS = "generalizations"
18
+ ONTOUML_GENERALIZATION_SET_IS_DISJOINT = "isDisjoint"
19
+ ONTOUML_GENERALIZATION_SET_IS_COMPLETE = "isComplete"
20
+
21
+ ONTOUML_PROJECT = "Project"
22
+ ONTOUML_PROJECT_MODEL = "model"
23
+ ONTOUML_PROJECT_MODEL_CONTENTS = "contents"
24
+ ONTOUML_RELATION = "Relation"
25
+ ONTOUML_PROPERTIES = "properties"
26
+ ONTOUML_RELATION_PROPERTY_TYPE = "propertyType"
27
+ ONTOUML_STEREOTYPE = "stereotype"
28
+ ONTOUML_CLASS = "Class"
29
+ ONTOUML_ENUMERATION = "enumeration"
30
+ ONTOUML_CLASS_LITERALS = 'literals'
31
+ ONTOUML_PACKAGE = "Package"
32
+ ONTOUML_LITERAL = "Literal"
33
+
34
+
35
+ extra_properties = [
36
+ "isAbstract",
37
+ "isDerived",
38
+ "isDisjoint",
39
+ "type",
40
+ "isComplete",
41
+ "isPowertype",
42
+ "isExtensional",
43
+ "isOrdered",
44
+ "aggregationKind",
45
+ ]
46
+
47
+
48
+ class OntoUMLNxG(LangGraph):
49
+ def __init__(self, json_obj: dict, rel_as_node=True):
50
+ super().__init__()
51
+ self.json_obj = json_obj
52
+ self.rel_as_node = rel_as_node
53
+ self.__create_graph()
54
+ self.set_numbered_labels()
55
+
56
+ self.text = " ".join([
57
+ self.nodes[node]['name'] if 'name' in self.nodes[node] else ''
58
+ for node in self.nodes
59
+ ])
60
+
61
+ def __create_graph(self):
62
+
63
+ def ontouml_id2obj(obj):
64
+ assert isinstance(obj, dict)
65
+ for key in obj:
66
+ if key == ONTOUML_ELEMENT_ID and ONTOUML_ELEMENT_TYPE in obj and obj[ONTOUML_ELEMENT_TYPE]\
67
+ in [ONTOUML_CLASS, ONTOUML_RELATION, ONTOUML_GENERALIZATION_SET, ONTOUML_GENERALIZATION]\
68
+ and ONTOUML_ELEMENT_DESCRIPTION in obj:
69
+ id2obj_map[obj[ONTOUML_ELEMENT_ID]] = obj
70
+ elif isinstance(obj[key], dict):
71
+ ontouml_id2obj(obj[key])
72
+ elif isinstance(obj[key], list):
73
+ for item in obj[key]:
74
+ assert not isinstance(item, list)
75
+ if isinstance(item, dict):
76
+ ontouml_id2obj(item)
77
+
78
+ def create_nxg():
79
+ for k, v in id2obj_map.items():
80
+ node_name = v.get('name', '')
81
+
82
+ if v[ONTOUML_ELEMENT_TYPE] in [ONTOUML_CLASS, ONTOUML_RELATION]:
83
+ self.add_node(k, name=node_name, type=v[ONTOUML_ELEMENT_TYPE], description='')
84
+ for prop in extra_properties:
85
+ self.nodes[k][prop] = v[prop] if prop in v else False
86
+
87
+ logger.info(f"Node: {node_name} type: {v[ONTOUML_ELEMENT_TYPE]}")
88
+
89
+
90
+ logger.info(f"Node: {node_name} type: {v[ONTOUML_ELEMENT_TYPE]}")
91
+ if ONTOUML_STEREOTYPE in v and v[ONTOUML_STEREOTYPE] is not None:
92
+ self.nodes[k][ONTOUML_STEREOTYPE] = v[ONTOUML_STEREOTYPE].lower()
93
+ logger.info(f"Stereotype: {v[ONTOUML_STEREOTYPE].lower()}")
94
+
95
+
96
+ if ONTOUML_ELEMENT_DESCRIPTION in v and v[ONTOUML_ELEMENT_DESCRIPTION] is not None:
97
+ self.nodes[k][ONTOUML_ELEMENT_DESCRIPTION] = v[ONTOUML_ELEMENT_DESCRIPTION]
98
+ logger.info(f"Description: {v[ONTOUML_ELEMENT_DESCRIPTION]}")
99
+
100
+
101
+ if v[ONTOUML_ELEMENT_TYPE] == ONTOUML_CLASS:
102
+ if ONTOUML_CLASS_LITERALS in v and v[ONTOUML_CLASS_LITERALS] is not None:
103
+ literals = v[ONTOUML_CLASS_LITERALS] if isinstance(v[ONTOUML_CLASS_LITERALS], list) else [v[ONTOUML_CLASS_LITERALS]]
104
+ literals_str = ", ".join([literal[ONTOUML_ELEMENT_NAME] for literal in literals])
105
+ self.nodes[k][ONTOUML_PROPERTIES] = literals_str
106
+
107
+ logger.info(f"Literals: {literals_str}")
108
+
109
+ elif ONTOUML_PROPERTIES in v and v[ONTOUML_PROPERTIES] is not None:
110
+ properties = v[ONTOUML_PROPERTIES] if isinstance(v[ONTOUML_PROPERTIES], list) else [v[ONTOUML_PROPERTIES]]
111
+ properties_str = ", ".join([property[ONTOUML_ELEMENT_NAME] for property in properties])
112
+ self.nodes[k][ONTOUML_PROPERTIES] = properties_str
113
+ logger.info(f"Properties: {properties_str}")
114
+
115
+
116
+ elif v[ONTOUML_ELEMENT_TYPE] == ONTOUML_RELATION:
117
+ properties = v[ONTOUML_PROPERTIES] if isinstance(v[ONTOUML_PROPERTIES], list) else [v[ONTOUML_PROPERTIES]]
118
+ assert len(properties) == 2
119
+ try:
120
+ x_id = properties[0][ONTOUML_RELATION_PROPERTY_TYPE][ONTOUML_ELEMENT_ID]
121
+ y_id = properties[1][ONTOUML_RELATION_PROPERTY_TYPE][ONTOUML_ELEMENT_ID]
122
+ x_name = id2obj_map[x_id][ONTOUML_ELEMENT_NAME] if ONTOUML_ELEMENT_NAME is not None else ''
123
+ y_name = id2obj_map[y_id][ONTOUML_ELEMENT_NAME] if ONTOUML_ELEMENT_NAME is not None else ''
124
+
125
+ self.add_edge(x_id, v[ONTOUML_ELEMENT_ID], type='rel')
126
+ self.add_edge(v[ONTOUML_ELEMENT_ID], y_id, type='rel')
127
+
128
+ logger.info(f"\tRelationship:, {x_name} --> {y_name}\n")
129
+ except TypeError as e:
130
+ # print(f"Error in {v[ONTOUML_ELEMENT_TYPE]}, {v[ONTOUML_ELEMENT_NAME]}")
131
+ pass
132
+
133
+
134
+ elif v[ONTOUML_ELEMENT_TYPE] == ONTOUML_GENERALIZATION:
135
+ general = v[ONTOUML_GENERALIZATION_GENERAL][ONTOUML_ELEMENT_ID]
136
+ specific = v[ONTOUML_GENERALIZATION_SPECIFIC][ONTOUML_ELEMENT_ID]
137
+ general_name = id2obj_map[general][ONTOUML_ELEMENT_NAME]\
138
+ if ONTOUML_ELEMENT_NAME in id2obj_map[general] else ''
139
+ specific_name = id2obj_map[specific][ONTOUML_ELEMENT_NAME] \
140
+ if ONTOUML_ELEMENT_NAME in id2obj_map[specific] else ''
141
+
142
+ logger.info(f"\tGeneralization:, {specific_name} -->> {general_name}\n")
143
+ self.add_edge(specific, general, type='gen')
144
+
145
+ def create_nxg_rel_as_edge():
146
+ # TODO: To be implemented
147
+ pass
148
+
149
+
150
+ id2obj_map = dict()
151
+ ontouml_id2obj(self.json_obj)
152
+ if self.rel_as_node:
153
+ create_nxg()
154
+ else:
155
+ create_nxg_rel_as_edge()
156
+
157
+
158
+ def get_ontouml_to_nx(data_dir, min_stereotypes=10):
159
+ ontouml_graphs = list()
160
+ models = find_files_with_extension(data_dir, "json")
161
+ for mfp in tqdm(models, desc=f"Reading {len(models)} OntoUML models"):
162
+ if mfp.endswith(".ecore") or mfp.endswith(".json"):
163
+ json_obj = json.loads(open(mfp, 'r', encoding='iso-8859-1').read())
164
+ g = OntoUMLNxG(json_obj)
165
+ stereotype_nodes = [node for node, stereotype in g.nodes(data=ONTOUML_STEREOTYPE) if stereotype is not None]
166
+ if len(stereotype_nodes) >= min_stereotypes:
167
+ ontouml_graphs.append((g, mfp))
168
+
169
+ return ontouml_graphs
@@ -0,0 +1,80 @@
1
+ import signal
2
+ import networkx as nx
3
+
4
+
5
+ class TimeoutException(Exception):
6
+ pass
7
+
8
+ def timeout_handler(signum, frame):
9
+ raise TimeoutException("Took too long")
10
+
11
+ def run_with_timeout(func, args=(), kwargs={}, timeout_duration=5):
12
+ # Set the signal handler and a timeout alarm
13
+ signal.signal(signal.SIGALRM, timeout_handler)
14
+ signal.alarm(timeout_duration)
15
+ try:
16
+ result = func(*args, **kwargs)
17
+ except TimeoutException:
18
+ result = None
19
+ finally:
20
+ # Disable the alarm
21
+ signal.alarm(0)
22
+ return result
23
+
24
+
25
+ def get_triple_text(node, edge_data, neighbour):
26
+ src = f'Class {node}'
27
+ dest = f'Class {neighbour}'
28
+
29
+ if edge_data is None:
30
+ return f'{src} -> {dest}'
31
+
32
+ if edge_data['type'] == 'reference' and 'name' in edge_data:
33
+ return f'{src} -> ({edge_data["name"]}) -> {dest}'
34
+
35
+ return f'{src} -> {dest}'
36
+
37
+
38
+ def find_node_str_upto_distance(node, distance=1):
39
+ nodes_with_distance = find_nodes_within_distance(
40
+ node,
41
+ distance=distance
42
+ )
43
+ if distance == 0:
44
+ return f'Class {node}'
45
+
46
+ d2n = {dd[0]: set() for _, dd in nodes_with_distance}
47
+ for neighbour, dis_data in nodes_with_distance:
48
+ d, edge_data = dis_data
49
+ if d == 0:
50
+ continue
51
+
52
+ node_text = get_triple_text(
53
+ node, edge_data, neighbour
54
+ )
55
+ if node_text:
56
+ d2n[d].add(node_text)
57
+
58
+
59
+ d2n = sorted(d2n.items(), key=lambda x: x[0])
60
+ node_buckets = [f" ".join(nbs) for _, nbs in d2n]
61
+ path_str = " | ".join(node_buckets)
62
+
63
+ return path_str
64
+
65
+
66
+ def find_nodes_within_distance(g: nx.DiGraph, n, distance=1):
67
+ visited = {n: (0, None)}
68
+ queue = [(n, 0)]
69
+
70
+ while queue:
71
+ node, d = queue.pop(0)
72
+ if d == distance:
73
+ continue
74
+ for neighbor in g.neighbors(node):
75
+ if neighbor not in visited:
76
+ visited[neighbor] = (d+1, g.edges[node, neighbor])
77
+ queue.append((neighbor, d+1))
78
+
79
+ visited = sorted(visited.items(), key=lambda x: x[1][0])
80
+ return visited