pyg-nightly 2.7.0.dev20241124__py3-none-any.whl → 2.7.0.dev20241127__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
@@ -0,0 +1,263 @@
+ import sys
+ from typing import Any, Callable, Dict, List, Optional
+
+ import numpy as np
+ import torch
+ from tqdm import tqdm
+
+ from torch_geometric.data import (
+     Data,
+     InMemoryDataset,
+     download_google_url,
+     extract_zip,
+ )
+ from torch_geometric.io import fs
+
+
+ def safe_index(lst: List[Any], e: int) -> int:
+     return lst.index(e) if e in lst else len(lst) - 1
+
+
+ class GitMolDataset(InMemoryDataset):
+     r"""The dataset from the `"GIT-Mol: A Multi-modal Large Language Model
+     for Molecular Science with Graph, Image, and Text"
+     <https://arxiv.org/pdf/2308.06911>`_ paper.
+
+     Args:
+         root (str): Root directory where the dataset should be saved.
+         transform (callable, optional): A function/transform that takes in an
+             :obj:`torch_geometric.data.Data` object and returns a transformed
+             version. The data object will be transformed before every access.
+             (default: :obj:`None`)
+         pre_transform (callable, optional): A function/transform that takes in
+             an :obj:`torch_geometric.data.Data` object and returns a
+             transformed version. The data object will be transformed before
+             being saved to disk. (default: :obj:`None`)
+         pre_filter (callable, optional): A function that takes in an
+             :obj:`torch_geometric.data.Data` object and returns a boolean
+             value, indicating whether the data object should be included in the
+             final dataset. (default: :obj:`None`)
+         force_reload (bool, optional): Whether to re-process the dataset.
+             (default: :obj:`False`)
+         split (int, optional): The dataset split to load (:obj:`0`: train,
+             :obj:`1`: valid, :obj:`2`: test). (default: :obj:`0`)
+     """
+
+     raw_url_id = '1loBXabD6ncAFY-vanRsVtRUSFkEtBweg'
+
+     def __init__(
+         self,
+         root: str,
+         transform: Optional[Callable] = None,
+         pre_transform: Optional[Callable] = None,
+         pre_filter: Optional[Callable] = None,
+         force_reload: bool = False,
+         split: int = 0,
+     ):
+         from torchvision import transforms
+
+         self.split = split
+
+         if self.split == 0:
+             self.img_transform = transforms.Compose([
+                 transforms.Resize((224, 224)),
+                 transforms.RandomRotation(15),
+                 transforms.ColorJitter(brightness=0.5, contrast=0.5, hue=0.5),
+                 transforms.ToTensor(),
+                 transforms.Normalize(mean=[0.485, 0.456, 0.406],
+                                      std=[0.229, 0.224, 0.225])
+             ])
+         else:
+             self.img_transform = transforms.Compose([
+                 transforms.Resize((224, 224)),
+                 transforms.ToTensor(),
+                 transforms.Normalize(mean=[0.485, 0.456, 0.406],
+                                      std=[0.229, 0.224, 0.225])
+             ])
+
+         super().__init__(root, transform, pre_transform, pre_filter,
+                          force_reload=force_reload)
+
+         self.load(self.processed_paths[0])
+
+     @property
+     def raw_file_names(self) -> List[str]:
+         return ['train_3500.pkl', 'valid_450.pkl', 'test_450.pkl']
+
+     @property
+     def processed_file_names(self) -> str:
+         return ['train.pt', 'valid.pt', 'test.pt'][self.split]
+
+     def download(self) -> None:
+         file_path = download_google_url(
+             self.raw_url_id,
+             self.raw_dir,
+             'gitmol.zip',
+         )
+         extract_zip(file_path, self.raw_dir)
+
+     def process(self) -> None:
+         import pandas as pd
+         from PIL import Image
+
+         try:
+             from rdkit import Chem, RDLogger
+             RDLogger.DisableLog('rdApp.*')  # type: ignore
+             WITH_RDKIT = True
+
+         except ImportError:
+             WITH_RDKIT = False
+
+         if not WITH_RDKIT:
+             print(("Using a pre-processed version of the dataset. Please "
+                    "install 'rdkit' to alternatively process the raw data."),
+                   file=sys.stderr)
+
+             data_list = fs.torch_load(self.raw_paths[0])
+             data_list = [Data(**data_dict) for data_dict in data_list]
+
+             if self.pre_filter is not None:
+                 data_list = [d for d in data_list if self.pre_filter(d)]
+
+             if self.pre_transform is not None:
+                 data_list = [self.pre_transform(d) for d in data_list]
+
+             self.save(data_list, self.processed_paths[0])
+             return
+
+         allowable_features: Dict[str, List[Any]] = {
+             'possible_atomic_num_list':
+             list(range(1, 119)) + ['misc'],
+             'possible_formal_charge_list':
+             [-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 'misc'],
+             'possible_chirality_list': [
+                 Chem.rdchem.ChiralType.CHI_UNSPECIFIED,
+                 Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CW,
+                 Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CCW,
+                 Chem.rdchem.ChiralType.CHI_OTHER
+             ],
+             'possible_hybridization_list': [
+                 Chem.rdchem.HybridizationType.SP,
+                 Chem.rdchem.HybridizationType.SP2,
+                 Chem.rdchem.HybridizationType.SP3,
+                 Chem.rdchem.HybridizationType.SP3D,
+                 Chem.rdchem.HybridizationType.SP3D2,
+                 Chem.rdchem.HybridizationType.UNSPECIFIED, 'misc'
+             ],
+             'possible_numH_list': [0, 1, 2, 3, 4, 5, 6, 7, 8, 'misc'],
+             'possible_implicit_valence_list': [0, 1, 2, 3, 4, 5, 6],
+             'possible_degree_list': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 'misc'],
+             'possible_number_radical_e_list': [0, 1, 2, 3, 4, 'misc'],
+             'possible_is_aromatic_list': [False, True],
+             'possible_is_in_ring_list': [False, True],
+             'possible_bond_type_list': [
+                 Chem.rdchem.BondType.SINGLE, Chem.rdchem.BondType.DOUBLE,
+                 Chem.rdchem.BondType.TRIPLE, Chem.rdchem.BondType.AROMATIC,
+                 Chem.rdchem.BondType.ZERO
+             ],
+             'possible_bond_dirs': [  # only for double bond stereo information
+                 Chem.rdchem.BondDir.NONE, Chem.rdchem.BondDir.ENDUPRIGHT,
+                 Chem.rdchem.BondDir.ENDDOWNRIGHT
+             ],
+             'possible_bond_stereo_list': [
+                 Chem.rdchem.BondStereo.STEREONONE,
+                 Chem.rdchem.BondStereo.STEREOZ,
+                 Chem.rdchem.BondStereo.STEREOE,
+                 Chem.rdchem.BondStereo.STEREOCIS,
+                 Chem.rdchem.BondStereo.STEREOTRANS,
+                 Chem.rdchem.BondStereo.STEREOANY,
+             ],
+             'possible_is_conjugated_list': [False, True]
+         }
+
+         data = pd.read_pickle(
+             f'{self.raw_dir}/igcdata_toy/{self.raw_file_names[self.split]}')
+
+         data_list = []
+         for _, r in tqdm(data.iterrows(), total=data.shape[0]):
+             smiles = r['isosmiles']
+             mol = Chem.MolFromSmiles(smiles.strip('\n'))
+             if mol is not None:
+                 # text
+                 summary = r['summary']
+                 # image
+                 cid = r['cid']
+                 img_file = f'{self.raw_dir}/igcdata_toy/imgs/CID_{cid}.png'
+                 img = Image.open(img_file).convert('RGB')
+                 img = self.img_transform(img).unsqueeze(0)
+                 # graph
+                 atom_features_list = []
+                 for atom in mol.GetAtoms():  # type: ignore
+                     atom_feature = [
+                         safe_index(
+                             allowable_features['possible_atomic_num_list'],
+                             atom.GetAtomicNum()),
+                         allowable_features['possible_chirality_list'].index(
+                             atom.GetChiralTag()),
+                         safe_index(allowable_features['possible_degree_list'],
+                                    atom.GetTotalDegree()),
+                         safe_index(
+                             allowable_features['possible_formal_charge_list'],
+                             atom.GetFormalCharge()),
+                         safe_index(allowable_features['possible_numH_list'],
+                                    atom.GetTotalNumHs()),
+                         safe_index(
+                             allowable_features[
+                                 'possible_number_radical_e_list'],
+                             atom.GetNumRadicalElectrons()),
+                         safe_index(
+                             allowable_features['possible_hybridization_list'],
+                             atom.GetHybridization()),
+                         allowable_features['possible_is_aromatic_list'].index(
+                             atom.GetIsAromatic()),
+                         allowable_features['possible_is_in_ring_list'].index(
+                             atom.IsInRing()),
+                     ]
+                     atom_features_list.append(atom_feature)
+                 x = torch.tensor(np.array(atom_features_list),
+                                  dtype=torch.long)
+
+                 edges_list = []
+                 edge_features_list = []
+                 for bond in mol.GetBonds():  # type: ignore
+                     i, j = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
+                     edge_feature = [
+                         safe_index(
+                             allowable_features['possible_bond_type_list'],
+                             bond.GetBondType()),
+                         allowable_features['possible_bond_stereo_list'].index(
+                             bond.GetStereo()),
+                         allowable_features['possible_is_conjugated_list'].
+                         index(bond.GetIsConjugated()),
+                     ]
+                     edges_list.append((i, j))
+                     edge_features_list.append(edge_feature)
+                     edges_list.append((j, i))
+                     edge_features_list.append(edge_feature)
+
+                 edge_index = torch.tensor(
+                     np.array(edges_list).T,
+                     dtype=torch.long,
+                 )
+                 edge_attr = torch.tensor(
+                     np.array(edge_features_list),
+                     dtype=torch.long,
+                 )
+
+                 data = Data(
+                     x=x,
+                     edge_index=edge_index,
+                     smiles=smiles,
+                     edge_attr=edge_attr,
+                     image=img,
+                     caption=summary,
+                 )
+
+                 if self.pre_filter is not None and not self.pre_filter(data):
+                     continue
+                 if self.pre_transform is not None:
+                     data = self.pre_transform(data)
+
+                 data_list.append(data)
+
+         self.save(data_list, self.processed_paths[0])
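For orientation, a minimal usage sketch of the new dataset (not part of the diff; it assumes the class is re-exported as torch_geometric.datasets.GitMolDataset, and the root path is a placeholder):

    from torch_geometric.datasets import GitMolDataset

    # Downloads 'gitmol.zip' on first use and processes the selected split;
    # without 'rdkit' installed, a pre-processed version of the graphs is loaded.
    train_dataset = GitMolDataset(root='data/GITMol', split=0)  # 0: train, 1: valid, 2: test

    data = train_dataset[0]
    # data.x:         [num_atoms, 9] categorical atom features
    # data.edge_attr: [2 * num_bonds, 3] categorical bond features
    # data.image:     [1, 3, 224, 224] normalized molecule image
    # data.smiles / data.caption: SMILES string and text summary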
@@ -22,6 +22,7 @@ from .dynamic_batch_sampler import DynamicBatchSampler
  from .prefetch import PrefetchLoader
  from .cache import CachedLoader
  from .mixin import AffinityMixin
+ from .rag_loader import RAGQueryLoader

  __all__ = classes = [
      'DataLoader',
@@ -50,6 +51,7 @@ __all__ = classes = [
      'PrefetchLoader',
      'CachedLoader',
      'AffinityMixin',
+     'RAGQueryLoader',
  ]

  RandomNodeSampler = deprecated(
@@ -0,0 +1,106 @@
+ from abc import abstractmethod
+ from typing import Any, Callable, Dict, Optional, Protocol, Tuple, Union
+
+ from torch_geometric.data import Data, FeatureStore, HeteroData
+ from torch_geometric.sampler import HeteroSamplerOutput, SamplerOutput
+ from torch_geometric.typing import InputEdges, InputNodes
+
+
+ class RAGFeatureStore(Protocol):
+     """Feature store for remote GNN RAG backend."""
+     @abstractmethod
+     def retrieve_seed_nodes(self, query: Any, **kwargs) -> InputNodes:
+         """Compares the query against all nodes to find the closest ones.
+         Returns the indices of the nodes that are to be used as seeds for the
+         RAG Sampler.
+         """
+         ...
+
+     @abstractmethod
+     def retrieve_seed_edges(self, query: Any, **kwargs) -> InputEdges:
+         """Compares the query against all edges to find the closest ones.
+         Returns the edge indices that are to be used as seeds for the RAG
+         Sampler.
+         """
+         ...
+
+     @abstractmethod
+     def load_subgraph(
+         self, sample: Union[SamplerOutput, HeteroSamplerOutput]
+     ) -> Union[Data, HeteroData]:
+         """Combines sampled subgraph output with features in a Data object."""
+         ...
+
+
+ class RAGGraphStore(Protocol):
+     """Graph store for remote GNN RAG backend."""
+     @abstractmethod
+     def sample_subgraph(self, seed_nodes: InputNodes, seed_edges: InputEdges,
+                         **kwargs) -> Union[SamplerOutput, HeteroSamplerOutput]:
+         """Sample a subgraph using the seeded nodes and edges."""
+         ...
+
+     @abstractmethod
+     def register_feature_store(self, feature_store: FeatureStore):
+         """Register a feature store to be used with the sampler. Samplers need
+         info from the feature store in order to work properly on HeteroGraphs.
+         """
+         ...
+
+
+ # TODO: Make compatible with Heterographs
+
+
+ class RAGQueryLoader:
+     def __init__(self, data: Tuple[RAGFeatureStore, RAGGraphStore],
+                  local_filter: Optional[Callable[[Data, Any], Data]] = None,
+                  seed_nodes_kwargs: Optional[Dict[str, Any]] = None,
+                  seed_edges_kwargs: Optional[Dict[str, Any]] = None,
+                  sampler_kwargs: Optional[Dict[str, Any]] = None,
+                  loader_kwargs: Optional[Dict[str, Any]] = None):
+         """Loader meant for making queries from a remote backend.
+
+         Args:
+             data (Tuple[RAGFeatureStore, RAGGraphStore]): Remote FeatureStore
+                 and GraphStore to load from. Assumed to conform to the
+                 protocols listed above.
+             local_filter (Optional[Callable[[Data, Any], Data]], optional):
+                 Optional local transform to apply to data after retrieval.
+                 Defaults to None.
+             seed_nodes_kwargs (Optional[Dict[str, Any]], optional): Parameters
+                 to pass into process for fetching seed nodes. Defaults to None.
+             seed_edges_kwargs (Optional[Dict[str, Any]], optional): Parameters
+                 to pass into process for fetching seed edges. Defaults to None.
+             sampler_kwargs (Optional[Dict[str, Any]], optional): Parameters to
+                 pass into process for sampling graph. Defaults to None.
+             loader_kwargs (Optional[Dict[str, Any]], optional): Parameters to
+                 pass into process for loading graph features. Defaults to None.
+         """
+         fstore, gstore = data
+         self.feature_store = fstore
+         self.graph_store = gstore
+         self.graph_store.register_feature_store(self.feature_store)
+         self.local_filter = local_filter
+         self.seed_nodes_kwargs = seed_nodes_kwargs or {}
+         self.seed_edges_kwargs = seed_edges_kwargs or {}
+         self.sampler_kwargs = sampler_kwargs or {}
+         self.loader_kwargs = loader_kwargs or {}
+
+     def query(self, query: Any) -> Data:
+         """Retrieve a subgraph associated with the query with all its feature
+         attributes.
+         """
+         seed_nodes = self.feature_store.retrieve_seed_nodes(
+             query, **self.seed_nodes_kwargs)
+         seed_edges = self.feature_store.retrieve_seed_edges(
+             query, **self.seed_edges_kwargs)
+
+         subgraph_sample = self.graph_store.sample_subgraph(
+             seed_nodes, seed_edges, **self.sampler_kwargs)
+
+         data = self.feature_store.load_subgraph(sample=subgraph_sample,
+                                                 **self.loader_kwargs)
+
+         if self.local_filter:
+             data = self.local_filter(data, query)
+         return data
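To show how the pieces above fit together, here is a self-contained toy sketch (not part of the diff; ToyFeatureStore and ToyGraphStore are hypothetical stand-ins for real backends that satisfy the RAGFeatureStore and RAGGraphStore protocols, and the sample returned by sample_subgraph is a plain tuple rather than a SamplerOutput):

    import torch
    from torch_geometric.data import Data
    from torch_geometric.loader import RAGQueryLoader

    class ToyFeatureStore:
        def __init__(self, x: torch.Tensor):
            self.x = x

        def retrieve_seed_nodes(self, query, k: int = 2):
            return torch.arange(k)  # pretend the first k nodes match the query

        def retrieve_seed_edges(self, query, k: int = 2):
            return torch.arange(k)  # pretend the first k edges match the query

        def load_subgraph(self, sample):
            node_id, edge_index = sample  # whatever the graph store returned
            return Data(x=self.x[node_id], edge_index=edge_index, n_id=node_id)

    class ToyGraphStore:
        def __init__(self, edge_index: torch.Tensor):
            self.edge_index = edge_index

        def register_feature_store(self, feature_store):
            self.feature_store = feature_store

        def sample_subgraph(self, seed_nodes, seed_edges, **kwargs):
            # Keep only edges whose endpoints are both seed nodes:
            mask = (torch.isin(self.edge_index[0], seed_nodes)
                    & torch.isin(self.edge_index[1], seed_nodes))
            return seed_nodes, self.edge_index[:, mask]

    x = torch.randn(5, 16)
    edge_index = torch.tensor([[0, 1, 2, 3], [1, 2, 3, 4]])
    loader = RAGQueryLoader(data=(ToyFeatureStore(x), ToyGraphStore(edge_index)),
                            seed_nodes_kwargs={'k': 2})
    print(loader.query('toy query'))  # Data(x=[2, 16], edge_index=[2, 1], n_id=[2])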
@@ -29,6 +29,7 @@ from .pmlp import PMLP
  from .neural_fingerprint import NeuralFingerprint
  from .visnet import ViSNet
  from .g_retriever import GRetriever
+ from .git_mol import GITMol
  from .molecule_gpt import MoleculeGPT
  from .glem import GLEM
  # Deprecated:
@@ -78,6 +79,7 @@ __all__ = classes = [
      'NeuralFingerprint',
      'ViSNet',
      'GRetriever',
+     'GITMol',
      'MoleculeGPT',
      'GLEM',
  ]
@@ -21,6 +21,8 @@ class GRetriever(torch.nn.Module):
              (default: :obj:`False`)
          mlp_out_channels (int, optional): The size of each graph embedding
              after projection. (default: :obj:`4096`)
+         mlp_out_tokens (int, optional): Number of LLM prefix tokens to
+             reserve for GNN output. (default: :obj:`1`)

      .. warning::
          This module has been tested with the following HuggingFace models
@@ -43,6 +45,7 @@ class GRetriever(torch.nn.Module):
          gnn: torch.nn.Module,
          use_lora: bool = False,
          mlp_out_channels: int = 4096,
+         mlp_out_tokens: int = 1,
      ) -> None:
          super().__init__()

@@ -77,7 +80,9 @@ class GRetriever(torch.nn.Module):
          self.projector = torch.nn.Sequential(
              torch.nn.Linear(mlp_hidden_channels, mlp_hidden_channels),
              torch.nn.Sigmoid(),
-             torch.nn.Linear(mlp_hidden_channels, mlp_out_channels),
+             torch.nn.Linear(mlp_hidden_channels,
+                             mlp_out_channels * mlp_out_tokens),
+             torch.nn.Unflatten(-1, (mlp_out_tokens, mlp_out_channels)),
          ).to(self.llm.device)

      def encode(
@@ -126,6 +131,9 @@ class GRetriever(torch.nn.Module):
          x = self.projector(x)
          xs = x.split(1, dim=0)

+         # Handle the case where there is more than one embedding per sample:
+         xs = [x.squeeze(0) for x in xs]
+
          # Handle questions without node features:
          batch_unique = batch.unique()
          batch_size = len(question)
@@ -182,6 +190,9 @@ class GRetriever(torch.nn.Module):
          x = self.projector(x)
          xs = x.split(1, dim=0)

+         # Handle the case where there is more than one embedding per sample:
+         xs = [x.squeeze(0) for x in xs]
+
          # Handle questions without node features:
          batch_unique = batch.unique()
          batch_size = len(question)
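To illustrate what the new mlp_out_tokens option changes, here is a small standalone shape check of the rewritten projector head (not part of the diff; the channel sizes and token count are illustrative values):

    import torch

    mlp_hidden_channels, mlp_out_channels, mlp_out_tokens = 1024, 4096, 2

    projector = torch.nn.Sequential(
        torch.nn.Linear(mlp_hidden_channels, mlp_hidden_channels),
        torch.nn.Sigmoid(),
        torch.nn.Linear(mlp_hidden_channels, mlp_out_channels * mlp_out_tokens),
        torch.nn.Unflatten(-1, (mlp_out_tokens, mlp_out_channels)),
    )

    x = torch.randn(8, mlp_hidden_channels)   # 8 pooled graph embeddings
    out = projector(x)                        # [8, mlp_out_tokens, mlp_out_channels]
    xs = [t.squeeze(0) for t in out.split(1, dim=0)]
    print(out.shape, xs[0].shape)             # [8, 2, 4096] and [2, 4096]

    # Each graph now contributes mlp_out_tokens embeddings to the LLM prefix
    # instead of a single one.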