PyPI - chebai - Versions diffs - 0.0.2.dev0__tar.gz → 1.0.1__tar.gz - Mend

chebai 0.0.2.dev0__tar.gz → 1.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (99) hide show

{chebai-0.0.2.dev0 → chebai-1.0.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: chebai
-Version: 0.0.2.dev0
+Version: 1.0.1
 Home-page:
 Author: MGlauer
 Author-email: martin.glauer@ovgu.de
@@ -10,7 +10,7 @@ Requires-Dist: certifi
 Requires-Dist: idna
 Requires-Dist: joblib
 Requires-Dist: networkx
-Requires-Dist: numpy<2
+Requires-Dist: numpy
 Requires-Dist: pandas
 Requires-Dist: python-dateutil
 Requires-Dist: pytz

{chebai-0.0.2.dev0 → chebai-1.0.1}/chebai/models/base.py RENAMED Viewed

@@ -1,9 +1,9 @@
 import logging
-from typing import Any, Dict, Optional, Union, Iterable
+from abc import ABC, abstractmethod
+from typing import Any, Dict, Iterable, Optional, Union
 import torch
 from lightning.pytorch.core.module import LightningModule
-from torchmetrics import Metric
 from chebai.preprocessing.structures import XYData
@@ -12,7 +12,7 @@ logging.getLogger("pysmiles").setLevel(logging.CRITICAL)
 _MODEL_REGISTRY = dict()
-class ChebaiBaseNet(LightningModule):
+class ChebaiBaseNet(LightningModule, ABC):
     """
     Base class for Chebai neural network models inheriting from PyTorch Lightning's LightningModule.
@@ -353,6 +353,7 @@ class ChebaiBaseNet(LightningModule):
                     logger=True,
                 )
+    @abstractmethod
     def forward(self, x: Dict[str, Any]) -> torch.Tensor:
         """
         Defines the forward pass.
@@ -363,7 +364,7 @@ class ChebaiBaseNet(LightningModule):
         Returns:
             torch.Tensor: The model output.
         """
-        raise NotImplementedError
+        pass
     def configure_optimizers(self, **kwargs) -> torch.optim.Optimizer:
         """

{chebai-0.0.2.dev0 → chebai-1.0.1}/chebai/preprocessing/datasets/base.py RENAMED Viewed

@@ -29,7 +29,8 @@ class XYBaseDataModule(LightningDataModule):
     Args:
         batch_size (int): The batch size for data loading. Default is 1.
-        train_split (float): The ratio of training data to total data and of test data to (validation + test) data. Default is 0.85.
+        test_split (float): The ratio of test data to total data. Default is 0.1.
+        validation_split (float): The ratio of validation data to total data. Default is 0.05.
         reader_kwargs (dict): Additional keyword arguments to be passed to the data reader. Default is None.
         prediction_kind (str): The kind of prediction to be performed (only relevant for the predict_dataloader). Default is "test".
         data_limit (Optional[int]): The maximum number of data samples to load. If set to None, the complete dataset will be used. Default is None.
@@ -45,7 +46,8 @@ class XYBaseDataModule(LightningDataModule):
     Attributes:
         READER (DataReader): The data reader class to use.
         reader (DataReader): An instance of the data reader class.
-        train_split (float): The ratio of training data to total data.
+        test_split (float): The ratio of test data to total data.
+        validation_split (float): The ratio of validation data to total data.
         batch_size (int): The batch size for data loading.
         prediction_kind (str): The kind of prediction to be performed.
         data_limit (Optional[int]): The maximum number of data samples to load.
@@ -68,7 +70,8 @@ class XYBaseDataModule(LightningDataModule):
     def __init__(
         self,
         batch_size: int = 1,
-        train_split: float = 0.85,
+        test_split: Optional[float] = 0.1,
+        validation_split: Optional[float] = 0.05,
         reader_kwargs: Optional[dict] = None,
         prediction_kind: str = "test",
         data_limit: Optional[int] = None,
@@ -86,7 +89,9 @@ class XYBaseDataModule(LightningDataModule):
         if reader_kwargs is None:
             reader_kwargs = dict()
         self.reader = self.READER(**reader_kwargs)
-        self.train_split = train_split
+        self.test_split = test_split
+        self.validation_split = validation_split
         self.batch_size = batch_size
         self.prediction_kind = prediction_kind
         self.data_limit = data_limit
@@ -335,8 +340,9 @@ class XYBaseDataModule(LightningDataModule):
             val
             for val in data
             if val["features"] is not None
-            and self.n_token_limit is None
-            or len(val["features"]) <= self.n_token_limit
+            and (
+                self.n_token_limit is None or len(val["features"]) <= self.n_token_limit
+            )
         ]
         return data
@@ -439,13 +445,25 @@ class XYBaseDataModule(LightningDataModule):
         ):
             self.setup_processed()
-        if not ("keep_reader" in kwargs and kwargs["keep_reader"]):
-            self.reader.on_finish()
+        self._after_setup(**kwargs)
+    def _after_setup(self, **kwargs):
+        """
+        Finalize the setup process after ensuring the processed data is available.
+        This method performs post-setup tasks like finalizing the reader and setting internal properties.
+        """
+        self.reader.on_finish()
         self._set_processed_data_props()
     def _set_processed_data_props(self):
+        """
+        Load processed data and extract metadata.
+        Sets:
+            - self._num_of_labels: Number of target labels in the dataset.
+            - self._feature_vector_size: Maximum feature vector length across all data points.
+        """
         data_pt = torch.load(
             os.path.join(self.processed_dir, self.processed_file_names_dict["data"]),
             weights_only=False,
@@ -1009,15 +1027,13 @@ class _DynamicDataset(XYBaseDataModule, ABC):
         labels_list = df["labels"].tolist()
-        test_size = 1 - self.train_split - (1 - self.train_split) ** 2
         if len(labels_list[0]) > 1:
             splitter = MultilabelStratifiedShuffleSplit(
-                n_splits=1, test_size=test_size, random_state=seed
+                n_splits=1, test_size=self.test_split, random_state=seed
             )
         else:
             splitter = StratifiedShuffleSplit(
-                n_splits=1, test_size=test_size, random_state=seed
+                n_splits=1, test_size=self.test_split, random_state=seed
             )
         train_indices, test_indices = next(splitter.split(labels_list, labels_list))
@@ -1070,16 +1086,17 @@ class _DynamicDataset(XYBaseDataModule, ABC):
             return folds
-        # scale val set size by 1/self.train_split to compensate for (hypothetical) test set size (1-self.train_split)
-        test_size = ((1 - self.train_split) ** 2) / self.train_split
         if len(labels_list_trainval[0]) > 1:
             splitter = MultilabelStratifiedShuffleSplit(
-                n_splits=1, test_size=test_size, random_state=seed
+                n_splits=1,
+                test_size=self.validation_split / (1 - self.test_split),
+                random_state=seed,
             )
         else:
             splitter = StratifiedShuffleSplit(
-                n_splits=1, test_size=test_size, random_state=seed
+                n_splits=1,
+                test_size=self.validation_split / (1 - self.test_split),
+                random_state=seed,
             )
         train_indices, validation_indices = next(
@@ -1102,7 +1119,9 @@ class _DynamicDataset(XYBaseDataModule, ABC):
         splits_df = pd.read_csv(self.splits_file_path)
         filename = self.processed_file_names_dict["data"]
-        data = self.load_processed_data(filename=filename)
+        data = self.load_processed_data_from_file(
+            os.path.join(self.processed_dir, filename)
+        )
         df_data = pd.DataFrame(data)
         train_ids = splits_df[splits_df["split"] == "train"]["id"]
@@ -1113,6 +1132,7 @@ class _DynamicDataset(XYBaseDataModule, ABC):
         self._dynamic_df_val = df_data[df_data["ident"].isin(validation_ids)]
         self._dynamic_df_test = df_data[df_data["ident"].isin(test_ids)]
+    # ------------------------------ Phase: DataLoaders -----------------------------------
     def load_processed_data(
         self, kind: Optional[str] = None, filename: Optional[str] = None
     ) -> List[Dict[str, Any]]:
@@ -1148,24 +1168,19 @@ class _DynamicDataset(XYBaseDataModule, ABC):
         # If both kind and filename are given, use filename
         if kind is not None and filename is None:
-            try:
-                if self.use_inner_cross_validation and kind != "test":
-                    filename = self.processed_file_names_dict[
-                        f"fold_{self.fold_index}_{kind}"
-                    ]
-                else:
-                    data_df = self.dynamic_split_dfs[kind]
-                    return data_df.to_dict(orient="records")
-            except KeyError:
-                kind = f"{kind}"
+            if self.use_inner_cross_validation and kind != "test":
+                filename = self.processed_file_names_dict[
+                    f"fold_{self.fold_index}_{kind}"
+                ]
+            else:
+                data_df = self.dynamic_split_dfs[kind]
+                return data_df.to_dict(orient="records")
         # If filename is provided
-        try:
-            return torch.load(
-                os.path.join(self.processed_dir, filename), weights_only=False
-            )
-        except FileNotFoundError:
-            raise FileNotFoundError(f"File {filename} doesn't exist")
+        return self.load_processed_data_from_file(filename)
+    def load_processed_data_from_file(self, filename):
+        return torch.load(os.path.join(filename), weights_only=False)
     # ------------------------------ Phase: Raw Properties -----------------------------------
     @property

{chebai-0.0.2.dev0 → chebai-1.0.1}/chebai/preprocessing/datasets/chebi.py RENAMED Viewed

@@ -401,8 +401,8 @@ class _ChEBIDataExtractor(_DynamicDataset, ABC):
         """
         try:
             filename = self.processed_file_names_dict["data"]
-            data_chebi_version = torch.load(
-                os.path.join(self.processed_dir, filename), weights_only=False
+            data_chebi_version = self.load_processed_data_from_file(
+                os.path.join(self.processed_dir, filename)
             )
         except FileNotFoundError:
             raise FileNotFoundError(

{chebai-0.0.2.dev0 → chebai-1.0.1}/chebai/preprocessing/reader.py RENAMED Viewed

@@ -1,5 +1,9 @@
+import inspect
 import os
-from typing import Any, Dict, List, Optional, Tuple
+import sys
+from abc import ABC
+from itertools import islice
+from typing import Any, Dict, List, Optional
 import deepsmiles
 import selfies as sf
@@ -36,7 +40,7 @@ class DataReader:
         if collator_kwargs is None:
             collator_kwargs = dict()
         self.collator = self.COLLATOR(**collator_kwargs)
-        self.dirname = os.path.dirname(__file__)
+        self.dirname = os.path.dirname(inspect.getfile(self.__class__))
         self._token_path = token_path
     def _get_raw_data(self, row: Dict[str, Any]) -> Any:
@@ -117,33 +121,65 @@ class DataReader:
         return
-class ChemDataReader(DataReader):
+class TokenIndexerReader(DataReader, ABC):
     """
-    Data reader for chemical data using SMILES tokens.
+    Abstract base class for reading tokenized data and mapping tokens to unique indices.
-    Args:
-        collator_kwargs: Optional dictionary of keyword arguments for the collator.
-        token_path: Optional path for the token file.
-        kwargs: Additional keyword arguments.
+    This class maintains a cache of token-to-index mappings that can be extended during runtime,
+    and saves new tokens to a persistent file at the end of processing.
     """
-    COLLATOR = RaggedCollator
-    @classmethod
-    def name(cls) -> str:
-        """Returns the name of the data reader."""
-        return "smiles_token"
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         with open(self.token_path, "r") as pk:
-            self.cache = [x.strip() for x in pk]
+            self.cache: Dict[str, int] = {
+                token.strip(): idx for idx, token in enumerate(pk)
+            }
+        self._loaded_tokens_count = len(self.cache)
     def _get_token_index(self, token: str) -> int:
         """Returns a unique number for each token, automatically adds new tokens."""
         if not str(token) in self.cache:
-            self.cache.append(str(token))
-        return self.cache.index(str(token)) + EMBEDDING_OFFSET
+            self.cache[(str(token))] = len(self.cache)
+        return self.cache[str(token)] + EMBEDDING_OFFSET
+    def on_finish(self) -> None:
+        """
+        Saves the current cache of tokens to the token file.This method is called after all data processing is complete.
+        """
+        print(f"first 10 tokens: {list(islice(self.cache, 10))}")
+        total_tokens = len(self.cache)
+        if total_tokens > self._loaded_tokens_count:
+            print("New tokens added to the cache, Saving them to token file.....")
+            assert sys.version_info >= (
+                3,
+                7,
+            ), "This code requires Python 3.7 or higher."
+            # For python 3.7+, the standard dict type preserves insertion order, and is iterated over in same order
+            # https://docs.python.org/3/whatsnew/3.7.html#summary-release-highlights
+            # https://mail.python.org/pipermail/python-dev/2017-December/151283.html
+            new_tokens = list(
+                islice(self.cache, self._loaded_tokens_count, total_tokens)
+            )
+            with open(self.token_path, "a") as pk:
+                print(f"saving new {len(new_tokens)} tokens to {self.token_path}...")
+                pk.writelines([f"{c}\n" for c in new_tokens])
+class ChemDataReader(TokenIndexerReader):
+    """
+    Data reader for chemical data using SMILES tokens.
+    """
+    COLLATOR = RaggedCollator
+    @classmethod
+    def name(cls) -> str:
+        """Returns the name of the data reader."""
+        return "smiles_token"
     def _read_data(self, raw_data: str) -> List[int]:
         """
@@ -157,15 +193,6 @@ class ChemDataReader(DataReader):
         """
         return [self._get_token_index(v[1]) for v in _tokenize(raw_data)]
-    def on_finish(self) -> None:
-        """
-        Saves the current cache of tokens to the token file. This method is called after all data processing is complete.
-        """
-        with open(self.token_path, "w") as pk:
-            print(f"saving {len(self.cache)} tokens to {self.token_path}...")
-            print(f"first 10 tokens: {self.cache[:10]}")
-            pk.writelines([f"{c}\n" for c in self.cache])
 class DeepChemDataReader(ChemDataReader):
     """

{chebai-0.0.2.dev0 → chebai-1.0.1}/chebai/result/analyse_sem.py RENAMED Viewed

@@ -1,20 +1,21 @@
 import gc
-import sys
 import traceback
 from datetime import datetime
 from typing import List, LiteralString
+import pandas as pd
 from torchmetrics.functional.classification import (
     multilabel_auroc,
     multilabel_average_precision,
     multilabel_f1_score,
 )
-from utils import *
 from chebai.loss.semantic import DisjointLoss
+from chebai.models import Electra
 from chebai.preprocessing.datasets.base import _DynamicDataset
 from chebai.preprocessing.datasets.chebi import ChEBIOver100
 from chebai.preprocessing.datasets.pubchem import PubChemKMeans
+from chebai.result.utils import *
 DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
@@ -122,7 +123,7 @@ def load_preds_labels(
 def get_label_names(data_module):
     if os.path.exists(os.path.join(data_module.processed_dir_main, "classes.txt")):
         with open(os.path.join(data_module.processed_dir_main, "classes.txt")) as fin:
-            return [int(line.strip()) for line in fin]
+            return [line.strip() for line in fin]
     print(
         f"Failed to retrieve label names, {os.path.join(data_module.processed_dir_main, 'classes.txt')} not found"
     )
@@ -131,70 +132,97 @@ def get_label_names(data_module):
 def get_chebi_graph(data_module, label_names):
     if os.path.exists(os.path.join(data_module.raw_dir, "chebi.obo")):
-        chebi_graph = data_module.extract_class_hierarchy(
+        chebi_graph = data_module._extract_class_hierarchy(
             os.path.join(data_module.raw_dir, "chebi.obo")
         )
-        return chebi_graph.subgraph(label_names)
+        return chebi_graph.subgraph([int(n) for n in label_names])
     print(
         f"Failed to retrieve ChEBI graph, {os.path.join(data_module.raw_dir, 'chebi.obo')} not found"
     )
     return None
-def get_disjoint_groups():
-    disjoints_owl_file = os.path.join("data", "chebi-disjoints.owl")
-    with open(disjoints_owl_file, "r") as f:
-        plaintext = f.read()
-        segments = plaintext.split("<")
-        disjoint_pairs = []
-        left = None
-        for seg in segments:
-            if seg.startswith("rdf:Description ") or seg.startswith("owl:Class"):
-                left = int(seg.split('rdf:about="&obo;CHEBI_')[1].split('"')[0])
-            elif seg.startswith("owl:disjointWith"):
-                right = int(seg.split('rdf:resource="&obo;CHEBI_')[1].split('"')[0])
-                disjoint_pairs.append([left, right])
-        disjoint_groups = []
-        for seg in plaintext.split("<rdf:Description>"):
-            if "owl;AllDisjointClasses" in seg:
-                classes = seg.split('rdf:about="&obo;CHEBI_')[1:]
-                classes = [int(c.split('"')[0]) for c in classes]
-                disjoint_groups.append(classes)
+def get_disjoint_groups(disjoint_files):
+    if disjoint_files is None:
+        disjoint_files = os.path.join("data", "chebi-disjoints.owl")
+    disjoint_pairs, disjoint_groups = [], []
+    for file in disjoint_files:
+        if file.split(".")[-1] == "csv":
+            disjoint_pairs += pd.read_csv(file, header=None).values.tolist()
+        elif file.split(".")[-1] == "owl":
+            with open(file, "r") as f:
+                plaintext = f.read()
+                segments = plaintext.split("<")
+                disjoint_pairs = []
+                left = None
+                for seg in segments:
+                    if seg.startswith("rdf:Description ") or seg.startswith(
+                        "owl:Class"
+                    ):
+                        left = int(seg.split('rdf:about="&obo;CHEBI_')[1].split('"')[0])
+                    elif seg.startswith("owl:disjointWith"):
+                        right = int(
+                            seg.split('rdf:resource="&obo;CHEBI_')[1].split('"')[0]
+                        )
+                        disjoint_pairs.append([left, right])
+                disjoint_groups = []
+                for seg in plaintext.split("<rdf:Description>"):
+                    if "owl;AllDisjointClasses" in seg:
+                        classes = seg.split('rdf:about="&obo;CHEBI_')[1:]
+                        classes = [int(c.split('"')[0]) for c in classes]
+                        disjoint_groups.append(classes)
+        else:
+            raise NotImplementedError(
+                "Unsupported disjoint file format: " + file.split(".")[-1]
+            )
     disjoint_all = disjoint_pairs + disjoint_groups
     # one disjointness is commented out in the owl-file
     # (the correct way would be to parse the owl file and notice the comment symbols, but for this case, it should work)
-    disjoint_all.remove([22729, 51880])
-    print(f"Found {len(disjoint_all)} disjoint groups")
+    if [22729, 51880] in disjoint_all:
+        disjoint_all.remove([22729, 51880])
+    # print(f"Found {len(disjoint_all)} disjoint groups")
     return disjoint_all
 class PredictionSmoother:
     """Removes implication and disjointness violations from predictions"""
-    def __init__(self, dataset):
-        self.label_names = get_label_names(dataset)
+    def __init__(self, dataset, label_names=None, disjoint_files=None):
+        if label_names:
+            self.label_names = label_names
+        else:
+            self.label_names = get_label_names(dataset)
         self.chebi_graph = get_chebi_graph(dataset, self.label_names)
-        self.disjoint_groups = get_disjoint_groups()
+        self.disjoint_groups = get_disjoint_groups(disjoint_files)
     def __call__(self, preds):
         preds_sum_orig = torch.sum(preds)
-        print(f"Preds sum: {preds_sum_orig}")
-        # eliminate implication violations by setting each prediction to maximum of its successors
         for i, label in enumerate(self.label_names):
             succs = [
-                self.label_names.index(p) for p in self.chebi_graph.successors(label)
+                self.label_names.index(str(p))
+                for p in self.chebi_graph.successors(int(label))
             ] + [i]
             if len(succs) > 0:
+                if torch.max(preds[:, succs], dim=1).values > 0.5 and preds[:, i] < 0.5:
+                    print(
+                        f"Correcting prediction for {label} to max of subclasses {list(self.chebi_graph.successors(int(label)))}"
+                    )
+                    print(
+                        f"Original pred: {preds[:, i]}, successors: {preds[:, succs]}"
+                    )
                 preds[:, i] = torch.max(preds[:, succs], dim=1).values
-        print(f"Preds change (step 1): {torch.sum(preds) - preds_sum_orig}")
+        if torch.sum(preds) != preds_sum_orig:
+            print(f"Preds change (step 1): {torch.sum(preds) - preds_sum_orig}")
         preds_sum_orig = torch.sum(preds)
         # step 2: eliminate disjointness violations: for group of disjoint classes, set all except max to 0.49 (if it is not already lower)
         preds_bounded = torch.min(preds, torch.ones_like(preds) * 0.49)
         for disj_group in self.disjoint_groups:
             disj_group = [
-                self.label_names.index(g) for g in disj_group if g in self.label_names
+                self.label_names.index(str(g))
+                for g in disj_group
+                if g in self.label_names
             ]
             if len(disj_group) > 1:
                 old_preds = preds[:, disj_group]
@@ -211,14 +239,12 @@ class PredictionSmoother:
                     print(
                         f"disjointness group {[self.label_names[d] for d in disj_group]} changed {samples_changed} samples"
                     )
-        print(
-            f"Preds change after disjointness (step 2): {torch.sum(preds) - preds_sum_orig}"
-        )
         preds_sum_orig = torch.sum(preds)
         # step 3: disjointness violation removal may have caused new implication inconsistencies -> set each prediction to min of predecessors
         for i, label in enumerate(self.label_names):
             predecessors = [i] + [
-                self.label_names.index(p) for p in self.chebi_graph.predecessors(label)
+                self.label_names.index(str(p))
+                for p in self.chebi_graph.predecessors(int(label))
             ]
             lowest_predecessors = torch.min(preds[:, predecessors], dim=1)
             preds[:, i] = lowest_predecessors.values

chebai 0.0.2.dev0__tar.gz → 1.0.1__tar.gz

chebai 0.0.2.dev0__tar.gz → 1.0.1__tar.gz