PyPI - pyg-nightly - Versions diffs - 2.6.0.dev20240318__py3-none-any.whl → 2.7.0.dev20250115__py3-none-any.whl - Mend

pyg-nightly 2.6.0.dev20240318__py3-none-any.whl → 2.7.0.dev20250115__py3-none-any.whl

Files changed (226) hide show

{pyg_nightly-2.6.0.dev20240318.dist-info → pyg_nightly-2.7.0.dev20250115.dist-info}/METADATA +31 -47
{pyg_nightly-2.6.0.dev20240318.dist-info → pyg_nightly-2.7.0.dev20250115.dist-info}/RECORD +226 -199
{pyg_nightly-2.6.0.dev20240318.dist-info → pyg_nightly-2.7.0.dev20250115.dist-info}/WHEEL +1 -1
torch_geometric/__init__.py +28 -1
torch_geometric/_compile.py +8 -1
torch_geometric/_onnx.py +14 -0
torch_geometric/config_mixin.py +113 -0
torch_geometric/config_store.py +28 -19
torch_geometric/data/__init__.py +24 -1
torch_geometric/data/batch.py +2 -2
torch_geometric/data/collate.py +8 -2
torch_geometric/data/data.py +16 -8
torch_geometric/data/database.py +61 -15
torch_geometric/data/dataset.py +14 -6
torch_geometric/data/feature_store.py +25 -42
torch_geometric/data/graph_store.py +1 -5
torch_geometric/data/hetero_data.py +18 -9
torch_geometric/data/in_memory_dataset.py +2 -4
torch_geometric/data/large_graph_indexer.py +677 -0
torch_geometric/data/lightning/datamodule.py +4 -4
torch_geometric/data/separate.py +6 -1
torch_geometric/data/storage.py +17 -7
torch_geometric/data/summary.py +14 -4
torch_geometric/data/temporal.py +1 -2
torch_geometric/datasets/__init__.py +17 -2
torch_geometric/datasets/actor.py +9 -11
torch_geometric/datasets/airfrans.py +15 -18
torch_geometric/datasets/airports.py +10 -12
torch_geometric/datasets/amazon.py +8 -11
torch_geometric/datasets/amazon_book.py +9 -10
torch_geometric/datasets/amazon_products.py +9 -10
torch_geometric/datasets/aminer.py +8 -9
torch_geometric/datasets/aqsol.py +10 -13
torch_geometric/datasets/attributed_graph_dataset.py +10 -12
torch_geometric/datasets/ba_multi_shapes.py +10 -12
torch_geometric/datasets/ba_shapes.py +5 -6
torch_geometric/datasets/bitcoin_otc.py +1 -1
torch_geometric/datasets/brca_tgca.py +1 -1
torch_geometric/datasets/cornell.py +145 -0
torch_geometric/datasets/dblp.py +2 -1
torch_geometric/datasets/dbp15k.py +2 -2
torch_geometric/datasets/fake.py +1 -3
torch_geometric/datasets/flickr.py +2 -1
torch_geometric/datasets/freebase.py +1 -1
torch_geometric/datasets/gdelt_lite.py +3 -2
torch_geometric/datasets/ged_dataset.py +3 -2
torch_geometric/datasets/git_mol_dataset.py +263 -0
torch_geometric/datasets/gnn_benchmark_dataset.py +11 -10
torch_geometric/datasets/hgb_dataset.py +8 -8
torch_geometric/datasets/imdb.py +2 -1
torch_geometric/datasets/karate.py +3 -2
torch_geometric/datasets/last_fm.py +2 -1
torch_geometric/datasets/linkx_dataset.py +4 -3
torch_geometric/datasets/lrgb.py +3 -5
torch_geometric/datasets/malnet_tiny.py +4 -3
torch_geometric/datasets/mnist_superpixels.py +2 -3
torch_geometric/datasets/molecule_gpt_dataset.py +485 -0
torch_geometric/datasets/molecule_net.py +15 -3
torch_geometric/datasets/motif_generator/base.py +0 -1
torch_geometric/datasets/neurograph.py +1 -3
torch_geometric/datasets/ogb_mag.py +1 -1
torch_geometric/datasets/opf.py +239 -0
torch_geometric/datasets/ose_gvcs.py +1 -1
torch_geometric/datasets/pascal.py +11 -9
torch_geometric/datasets/pascal_pf.py +1 -1
torch_geometric/datasets/pcpnet_dataset.py +1 -1
torch_geometric/datasets/pcqm4m.py +10 -3
torch_geometric/datasets/ppi.py +1 -1
torch_geometric/datasets/qm9.py +8 -7
torch_geometric/datasets/rcdd.py +4 -4
torch_geometric/datasets/reddit.py +2 -1
torch_geometric/datasets/reddit2.py +2 -1
torch_geometric/datasets/rel_link_pred_dataset.py +3 -3
torch_geometric/datasets/s3dis.py +5 -3
torch_geometric/datasets/shapenet.py +3 -3
torch_geometric/datasets/shrec2016.py +2 -2
torch_geometric/datasets/snap_dataset.py +7 -1
torch_geometric/datasets/tag_dataset.py +350 -0
torch_geometric/datasets/upfd.py +2 -1
torch_geometric/datasets/web_qsp_dataset.py +246 -0
torch_geometric/datasets/webkb.py +2 -2
torch_geometric/datasets/wikics.py +1 -1
torch_geometric/datasets/wikidata.py +3 -2
torch_geometric/datasets/wikipedia_network.py +2 -2
torch_geometric/datasets/willow_object_class.py +1 -1
torch_geometric/datasets/word_net.py +2 -2
torch_geometric/datasets/yelp.py +2 -1
torch_geometric/datasets/zinc.py +1 -1
torch_geometric/device.py +42 -0
torch_geometric/distributed/local_feature_store.py +3 -2
torch_geometric/distributed/local_graph_store.py +2 -1
torch_geometric/distributed/partition.py +9 -8
torch_geometric/edge_index.py +616 -438
torch_geometric/explain/algorithm/base.py +0 -1
torch_geometric/explain/algorithm/graphmask_explainer.py +1 -2
torch_geometric/explain/algorithm/pg_explainer.py +1 -1
torch_geometric/explain/explanation.py +2 -2
torch_geometric/graphgym/checkpoint.py +2 -1
torch_geometric/graphgym/logger.py +4 -4
torch_geometric/graphgym/loss.py +1 -1
torch_geometric/graphgym/utils/agg_runs.py +6 -6
torch_geometric/index.py +826 -0
torch_geometric/inspector.py +13 -7
torch_geometric/io/fs.py +28 -2
torch_geometric/io/npz.py +2 -1
torch_geometric/io/off.py +2 -2
torch_geometric/io/sdf.py +2 -2
torch_geometric/io/tu.py +4 -5
torch_geometric/loader/__init__.py +4 -0
torch_geometric/loader/cluster.py +10 -4
torch_geometric/loader/graph_saint.py +2 -1
torch_geometric/loader/ibmb_loader.py +12 -4
torch_geometric/loader/mixin.py +1 -1
torch_geometric/loader/neighbor_loader.py +1 -1
torch_geometric/loader/neighbor_sampler.py +2 -2
torch_geometric/loader/prefetch.py +1 -1
torch_geometric/loader/rag_loader.py +107 -0
torch_geometric/loader/utils.py +8 -7
torch_geometric/loader/zip_loader.py +10 -0
torch_geometric/metrics/__init__.py +11 -2
torch_geometric/metrics/link_pred.py +317 -65
torch_geometric/nn/aggr/__init__.py +4 -0
torch_geometric/nn/aggr/attention.py +0 -2
torch_geometric/nn/aggr/base.py +3 -5
torch_geometric/nn/aggr/patch_transformer.py +143 -0
torch_geometric/nn/aggr/set_transformer.py +1 -1
torch_geometric/nn/aggr/variance_preserving.py +33 -0
torch_geometric/nn/attention/__init__.py +5 -1
torch_geometric/nn/attention/qformer.py +71 -0
torch_geometric/nn/conv/collect.jinja +7 -4
torch_geometric/nn/conv/cugraph/base.py +8 -12
torch_geometric/nn/conv/edge_conv.py +3 -2
torch_geometric/nn/conv/fused_gat_conv.py +1 -1
torch_geometric/nn/conv/gat_conv.py +35 -7
torch_geometric/nn/conv/gatv2_conv.py +36 -6
torch_geometric/nn/conv/general_conv.py +1 -1
torch_geometric/nn/conv/graph_conv.py +21 -3
torch_geometric/nn/conv/gravnet_conv.py +3 -2
torch_geometric/nn/conv/hetero_conv.py +3 -3
torch_geometric/nn/conv/hgt_conv.py +1 -1
torch_geometric/nn/conv/message_passing.py +138 -87
torch_geometric/nn/conv/mixhop_conv.py +1 -1
torch_geometric/nn/conv/propagate.jinja +9 -1
torch_geometric/nn/conv/rgcn_conv.py +5 -5
torch_geometric/nn/conv/spline_conv.py +4 -4
torch_geometric/nn/conv/x_conv.py +3 -2
torch_geometric/nn/dense/linear.py +11 -6
torch_geometric/nn/fx.py +3 -3
torch_geometric/nn/model_hub.py +3 -1
torch_geometric/nn/models/__init__.py +10 -2
torch_geometric/nn/models/deep_graph_infomax.py +1 -2
torch_geometric/nn/models/dimenet_utils.py +5 -7
torch_geometric/nn/models/g_retriever.py +230 -0
torch_geometric/nn/models/git_mol.py +336 -0
torch_geometric/nn/models/glem.py +385 -0
torch_geometric/nn/models/gnnff.py +0 -1
torch_geometric/nn/models/graph_unet.py +12 -3
torch_geometric/nn/models/jumping_knowledge.py +63 -4
torch_geometric/nn/models/lightgcn.py +1 -1
torch_geometric/nn/models/metapath2vec.py +5 -5
torch_geometric/nn/models/molecule_gpt.py +222 -0
torch_geometric/nn/models/node2vec.py +2 -3
torch_geometric/nn/models/schnet.py +2 -1
torch_geometric/nn/models/signed_gcn.py +3 -3
torch_geometric/nn/module_dict.py +2 -2
torch_geometric/nn/nlp/__init__.py +9 -0
torch_geometric/nn/nlp/llm.py +329 -0
torch_geometric/nn/nlp/sentence_transformer.py +134 -0
torch_geometric/nn/nlp/vision_transformer.py +33 -0
torch_geometric/nn/norm/batch_norm.py +1 -1
torch_geometric/nn/parameter_dict.py +2 -2
torch_geometric/nn/pool/__init__.py +21 -5
torch_geometric/nn/pool/cluster_pool.py +145 -0
torch_geometric/nn/pool/connect/base.py +0 -1
torch_geometric/nn/pool/edge_pool.py +1 -1
torch_geometric/nn/pool/graclus.py +4 -2
torch_geometric/nn/pool/pool.py +8 -2
torch_geometric/nn/pool/select/base.py +0 -1
torch_geometric/nn/pool/voxel_grid.py +3 -2
torch_geometric/nn/resolver.py +1 -1
torch_geometric/nn/sequential.jinja +10 -23
torch_geometric/nn/sequential.py +204 -78
torch_geometric/nn/summary.py +1 -1
torch_geometric/nn/to_hetero_with_bases_transformer.py +19 -19
torch_geometric/profile/__init__.py +2 -0
torch_geometric/profile/nvtx.py +66 -0
torch_geometric/profile/profiler.py +30 -19
torch_geometric/resolver.py +1 -1
torch_geometric/sampler/base.py +34 -13
torch_geometric/sampler/neighbor_sampler.py +11 -10
torch_geometric/sampler/utils.py +1 -1
torch_geometric/template.py +1 -0
torch_geometric/testing/__init__.py +6 -2
torch_geometric/testing/decorators.py +56 -22
torch_geometric/testing/feature_store.py +1 -1
torch_geometric/transforms/__init__.py +2 -0
torch_geometric/transforms/add_metapaths.py +5 -5
torch_geometric/transforms/add_positional_encoding.py +1 -1
torch_geometric/transforms/delaunay.py +65 -14
torch_geometric/transforms/face_to_edge.py +32 -3
torch_geometric/transforms/gdc.py +7 -6
torch_geometric/transforms/laplacian_lambda_max.py +3 -3
torch_geometric/transforms/mask.py +5 -1
torch_geometric/transforms/node_property_split.py +1 -2
torch_geometric/transforms/pad.py +7 -6
torch_geometric/transforms/random_link_split.py +1 -1
torch_geometric/transforms/remove_self_loops.py +36 -0
torch_geometric/transforms/svd_feature_reduction.py +1 -1
torch_geometric/transforms/to_sparse_tensor.py +1 -1
torch_geometric/transforms/two_hop.py +1 -1
torch_geometric/transforms/virtual_node.py +2 -1
torch_geometric/typing.py +43 -6
torch_geometric/utils/__init__.py +5 -1
torch_geometric/utils/_negative_sampling.py +1 -1
torch_geometric/utils/_normalize_edge_index.py +46 -0
torch_geometric/utils/_scatter.py +38 -12
torch_geometric/utils/_subgraph.py +4 -0
torch_geometric/utils/_tree_decomposition.py +2 -2
torch_geometric/utils/augmentation.py +1 -1
torch_geometric/utils/convert.py +12 -8
torch_geometric/utils/geodesic.py +24 -22
torch_geometric/utils/hetero.py +1 -1
torch_geometric/utils/map.py +8 -2
torch_geometric/utils/smiles.py +65 -27
torch_geometric/utils/sparse.py +39 -25
torch_geometric/visualization/graph.py +3 -4

torch_geometric/datasets/molecule_gpt_dataset.py ADDED Viewed

@@ -0,0 +1,485 @@
+import gzip
+import json
+import multiprocessing
+import os
+import sys
+from collections import defaultdict
+from multiprocessing import Pool
+from typing import Callable, List, Optional, Tuple
+import numpy as np
+import requests
+import torch
+from tqdm import tqdm
+from torch_geometric.data import Data, InMemoryDataset, download_url
+from torch_geometric.io import fs
+from torch_geometric.nn.nlp import LLM
+from torch_geometric.utils import one_hot
+def clean_up_description(description: str) -> str:
+    r"""Normalizes a PubChem record description and returns its first
+    sentence.
+
+    A number of known irregular descriptions are rewritten first, mostly so
+    that the compound name is followed by a linking verb such as
+    :obj:`" is "`. The first sentence is then taken to end at the first
+    :obj:`"."` that is followed by a space and an ASCII upper-case letter.
+    A few known prefixes that themselves contain :obj:`". "` are skipped
+    before searching for that boundary.
+
+    Args:
+        description (str): The raw description text.
+
+    Returns:
+        str: The first sentence of the normalized description. If no
+        sentence boundary is found, the whole normalized description is
+        returned.
+    """
+    # Pad with one space so that a trailing "." can be inspected like any
+    # other; the padding is never part of the returned sentence.
+    description = description + " "
+
+    # Drop the leading adjective "Pure ". Only the prefix is removed; later
+    # occurrences of "Pure " inside the text are left untouched.
+    if description.startswith("Pure "):
+        description = description[len("Pure "):]
+
+    # Fix a known typo at the very start of the description.
+    if description.startswith("Mercurycombines"):
+        description = description.replace("Mercurycombines",
+                                          "Mercury combines", 1)
+
+    # Hand-curated special cases, applied in order as `(old, new)` pairs.
+    special_cases = (
+        ("17-Hydroxy-6-methylpregna-3,6-diene-3,20-dione. ",
+         "17-Hydroxy-6-methylpregna-3,6-diene-3,20-dione is "),
+        ("5-Thymidylic acid. ", "5-Thymidylic acid. is "),
+        ("5'-S-(3-Amino-3-carboxypropyl)-5'-thioadenosine. ",
+         "5'-S-(3-Amino-3-carboxypropyl)-5'-thioadenosine. is "),
+        (("Guanosine 5'-(trihydrogen diphosphate), monoanhydride"
+          " with phosphorothioic acid. "),
+         ("Guanosine 5'-(trihydrogen diphosphate), monoanhydride"
+          " with phosphorothioic acid is ")),
+        ("5'-Uridylic acid. ", "5'-Uridylic acid is "),
+        ("5'-Adenylic acid, ", "5'-Adenylic acid is "),
+        ("Uridine 5'-(tetrahydrogen triphosphate). ",
+         "Uridine 5'-(tetrahydrogen triphosphate). is "),
+        ("Inosine 5'-Monophosphate. ", "Inosine 5'-Monophosphate. is "),
+        ("Pivaloyloxymethyl butyrate (AN-9), ",
+         "Pivaloyloxymethyl butyrate (AN-9) is "),
+        (("4-Amino-5-cyano-7-(D-ribofuranosyl)-7H- "
+          "pyrrolo(2,3-d)pyrimidine. "),
+         ("4-Amino-5-cyano-7-(D-ribofuranosyl)-7H- "
+          "pyrrolo(2,3-d)pyrimidine is ")),
+        ("Cardamonin (also known as Dihydroxymethoxychalcone), ",
+         "Cardamonin (also known as Dihydroxymethoxychalcone) is "),
+        ("Lithium has been used to treat ", "Lithium is "),
+        ("4,4'-Methylenebis ", "4,4'-Methylenebis is "),
+        ("2,3,7,8-Tetrachlorodibenzo-p-dioxin",
+         "2,3,7,8-Tetrachlorodibenzo-p-dioxin is "),
+        ("Exposure to 2,4,5-trichlorophenol ",
+         "2,4,5-Trichlorophenol exposure "),
+    )
+    for old, new in special_cases:
+        description = description.replace(old, new)
+
+    # Prefixes containing ". " that must not be mistaken for a sentence end.
+    skipped_prefixes = (
+        'C.I. ',
+        'Nectriapyrone. D ',
+        'Salmonella enterica sv. Minnesota LPS core oligosaccharide',
+    )
+    start_index = 0
+    for prefix in skipped_prefixes:
+        if description.startswith(prefix):
+            start_index = len(prefix)
+            break
+
+    L = len(description)
+    # Default: no boundary found, return everything except the padding.
+    # This also covers the case where the skipped prefix spans the whole
+    # description and there is nothing left to scan.
+    end = max(L - 1, 0)
+    for index in range(start_index, L - 2):
+        if (description[index] == '.' and description[index + 1] == ' '
+                and 'A' <= description[index + 2] <= 'Z'):
+            end = index + 1  # Keep the terminating period.
+            break
+    return description[:end]
+def extract_name(
+    name_raw: Optional[str],
+    description: str,
+) -> Tuple[Optional[str], str, str]:
+    r"""Extracts the compound name from a description and replaces it with
+    a generic placeholder.
+
+    The name is taken to be the part of the first sentence (as returned by
+    :func:`clean_up_description`) that precedes the first linking phrase
+    such as :obj:`" is "` or :obj:`" are "`. If the first sentence contains
+    no linking phrase, :obj:`name_raw` is used, but only if the first
+    sentence starts with it.
+
+    Args:
+        name_raw (str, optional): The name reported by the record, or
+            :obj:`None` if the record has no name.
+        description (str): The raw description text.
+
+    Returns:
+        A tuple :obj:`(extracted_name, extracted_description,
+        first_sentence)`. :obj:`extracted_name` is :obj:`None` if no name
+        could be determined, in which case :obj:`extracted_description` is
+        the unmodified :obj:`description`. Otherwise, every occurrence of
+        the name in :obj:`description` is replaced by
+        :obj:`"This molecule"` or :obj:`"These molecules"`.
+        :obj:`first_sentence` is returned with its linking phrases
+        replaced by the internal splitter.
+    """
+    first_sentence = clean_up_description(description)
+    splitter = '  --  --  '
+
+    # Decide between singular and plural before the verbs are replaced.
+    if ' are ' in first_sentence or ' were ' in first_sentence:
+        replaced_words = 'These molecules'
+    else:
+        replaced_words = 'This molecule'
+
+    linking_phrases = (
+        ' is ',
+        ' are ',
+        ' was ',
+        ' were ',
+        ' appears ',
+        ' occurs ',
+        ' stands for ',
+        ' belongs to ',
+        ' exists ',  # only for CID=11443
+        ' has been used in trials ',
+        ' has been investigated ',
+        ' has many uses ',
+    )
+    for phrase in linking_phrases:
+        first_sentence = first_sentence.replace(phrase, splitter)
+
+    extracted_name: Optional[str] = None
+    if splitter in first_sentence:
+        extracted_name = first_sentence.split(splitter, 1)[0]
+    elif name_raw is None:
+        # Records without a name give us nothing to fall back on.
+        pass
+    elif first_sentence.startswith(name_raw):
+        extracted_name = name_raw
+    elif name_raw in first_sentence:
+        # The name only appears mid-sentence: it is not used as the
+        # extracted name, but reported for manual inspection.
+        print("=====", name_raw)
+        print("first sentence: ", first_sentence)
+
+    # An empty name must never be used: `str.replace("", ...)` would insert
+    # the placeholder between every character of the description.
+    if not extracted_name:
+        return None, description, first_sentence
+
+    extracted_description = description.replace(extracted_name,
+                                                replaced_words)
+    return extracted_name, extracted_description, first_sentence
+class MoleculeGPTDataset(InMemoryDataset):
+    r"""The dataset from the `"MoleculeGPT: Instruction Following Large
+    Language Models for Molecular Property Prediction"
+    <https://ai4d3.github.io/papers/34.pdf>`_ paper.
+
+    Record descriptions are fetched page by page from the PubChem PUG-View
+    REST API, and the compounds are taken from the PubChem SDF dumps.
+    If :obj:`rdkit` is installed, every matching compound is converted into
+    a :class:`~torch_geometric.data.Data` object holding one-hot atom
+    features :obj:`x`, bidirectional :obj:`edge_index`, one-hot bond
+    features :obj:`edge_attr`, the canonical :obj:`smiles` string, an
+    LLM-generated :obj:`instruction`, and the first stored description of
+    the compound as :obj:`y`.
+
+    Args:
+        root (str): Root directory where the dataset should be saved.
+        transform (callable, optional): A function/transform that takes in an
+            :obj:`torch_geometric.data.Data` object and returns a transformed
+            version. The data object will be transformed before every access.
+            (default: :obj:`None`)
+        pre_transform (callable, optional): A function/transform that takes in
+            an :obj:`torch_geometric.data.Data` object and returns a
+            transformed version. The data object will be transformed before
+            being saved to disk. (default: :obj:`None`)
+        pre_filter (callable, optional): A function that takes in an
+            :obj:`torch_geometric.data.Data` object and returns a boolean
+            value, indicating whether the data object should be included in the
+            final dataset. (default: :obj:`None`)
+        force_reload (bool, optional): Whether to re-process the dataset.
+            (default: :obj:`False`)
+        total_page_num (int, optional): The number of pages from PubChem.
+            (default: :obj:`10`)
+        total_block_num (int, optional): The blocks of SDF files from PubChem.
+            (default: :obj:`1`)
+    """
+    description_url = (
+        'https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/annotations/'
+        'heading/json?heading_type=Compound&heading=Record+Description&page={}'
+    )
+    compound_url = ('https://ftp.ncbi.nlm.nih.gov/pubchem/Compound/'
+                    'CURRENT-Full/SDF')
+
+    def __init__(
+        self,
+        root: str,
+        transform: Optional[Callable] = None,
+        pre_transform: Optional[Callable] = None,
+        pre_filter: Optional[Callable] = None,
+        force_reload: bool = False,
+        total_page_num: int = 10,
+        total_block_num: int = 1,
+    ):
+        # Both values are read by `download()` and `process()`, so they are
+        # set before the base class constructor is called.
+        self.total_page_num = total_page_num
+        self.total_block_num = total_block_num
+        super().__init__(root, transform, pre_transform, pre_filter,
+                         force_reload=force_reload)
+        self.load(self.processed_paths[0])
+
+    @property
+    def raw_file_names(self) -> List[str]:
+        # NOTE(review): `download()` below never writes 'pubchem.csv', and
+        # `process()` reads it via `fs.torch_load` only when `rdkit` is
+        # missing -- confirm how this file is meant to be provided.
+        return ['pubchem.csv']
+
+    @property
+    def processed_file_names(self) -> List[str]:
+        return ['data.pt']
+
+    def download(self) -> None:
+        r"""Fetches the record descriptions and the raw SDF blocks.
+
+        Each step is skipped if its output folder already exists.
+        """
+        # Step 01. Extract description
+        # NOTE(review): the folder is created before any page is fetched,
+        # so a run that fails midway is not retried on the next call.
+        step1_folder = f"{self.raw_dir}/step_01_PubChemSTM_description"
+        if not os.path.exists(step1_folder):
+            os.makedirs(step1_folder)
+            valid_CID_set = set()
+            CID2name_raw = defaultdict(list)
+            CID2name_extracted = defaultdict(list)
+            CID2text_raw = defaultdict(list)
+            CID2text_extracted = defaultdict(list)
+
+            for page_num in tqdm(range(1, self.total_page_num + 1)):
+                description_data = requests.get(
+                    self.description_url.format(page_num)).json()
+                description_data = description_data["Annotations"]
+                assert description_data["Page"] == page_num
+                record_list = description_data["Annotation"]
+
+                out_path = (f"{step1_folder}/"
+                            f"Compound_description_{page_num}.txt")
+                # The context manager guarantees the per-page file is
+                # flushed and closed, even if a later page fails.
+                with open(out_path, "w") as f_out:
+                    for record in record_list:
+                        # Best effort: malformed records are skipped.
+                        try:
+                            CID = record["LinkedRecords"]["CID"][0]
+                            if "Name" in record:
+                                name_raw = record["Name"]
+                                CID2name_raw[CID].append(name_raw)
+                            else:
+                                name_raw = None
+
+                            for data in record["Data"]:
+                                markup = data["Value"]["StringWithMarkup"]
+                                description = markup[0]["String"].strip()
+                                names = extract_name(name_raw, description)
+                                extracted_name, extracted_description, _ = (
+                                    names)
+                                if extracted_name is not None:
+                                    CID2name_extracted[CID].append(
+                                        extracted_name)
+                                CID2text_raw[CID].append(description)
+                                CID2text_extracted[CID].append(
+                                    extracted_description)
+                                valid_CID_set.add(CID)
+                                f_out.write(f"{CID}\n")
+                                f_out.write(f"{extracted_description}\n\n")
+                        except Exception:
+                            continue
+
+            print(f"Total CID (with raw name) {len(CID2name_raw)}")
+            print(f"Total CID (with extracted name) {len(CID2name_extracted)}")
+            print(f"Total CID {len(valid_CID_set)}")
+
+            json_outputs = (
+                ("CID2name_raw.json", CID2name_raw),
+                ("CID2name.json", CID2name_extracted),
+                ("CID2text_raw.json", CID2text_raw),
+                ("CID2text.json", CID2text_extracted),
+            )
+            for file_name, mapping in json_outputs:
+                with open(f"{self.raw_dir}/{file_name}", "w") as f:
+                    json.dump(mapping, f)
+
+        # Step 02. Download SDF Files
+        step2_folder = f"{self.raw_dir}/step_02_PubChemSTM_SDF"
+        if not os.path.exists(step2_folder):
+            block_size = 500000
+            for block_id in tqdm(range(self.total_block_num)):
+                l_id = block_id * block_size + 1
+                r_id = (block_id + 1) * block_size
+                compound_file_name = f"Compound_{l_id:09d}_{r_id:09d}.sdf.gz"
+                download_url(f"{self.compound_url}/{compound_file_name}",
+                             step2_folder)
+
+    def process(self, use_mp: bool = False) -> None:
+        r"""Converts the downloaded raw files into the processed dataset.
+
+        Args:
+            use_mp (bool, optional): If set to :obj:`True`, will filter the
+                SDF blocks with a process pool instead of sequentially.
+                (default: :obj:`False`)
+        """
+        try:
+            from rdkit import Chem
+            from rdkit.Chem.rdchem import BondType as BT
+            WITH_RDKIT = True
+        except ImportError:
+            WITH_RDKIT = False
+
+        if not WITH_RDKIT:
+            print(("Using a pre-processed version of the dataset. Please "
+                   "install 'rdkit' to alternatively process the raw data."),
+                  file=sys.stderr)
+            data_list = fs.torch_load(self.raw_paths[0])
+            data_list = [Data(**data_dict) for data_dict in data_list]
+            if self.pre_filter is not None:
+                data_list = [d for d in data_list if self.pre_filter(d)]
+            if self.pre_transform is not None:
+                data_list = [self.pre_transform(d) for d in data_list]
+            self.save(data_list, self.processed_paths[0])
+            return
+
+        # The CID -> descriptions mapping is needed by every step below, so
+        # it is read from disk only once.
+        with open(f"{self.raw_dir}/CID2text.json") as f:
+            CID2text_data = json.load(f)
+        target_CID_list = set(CID2text_data.keys())
+
+        # Step 03. Filter out SDF
+        step2_folder = f"{self.raw_dir}/step_02_PubChemSTM_SDF"
+        step3_folder = f"{self.raw_dir}/step_03_PubChemSTM_filtered"
+        if not os.path.exists(step3_folder):
+            os.makedirs(step3_folder)
+            block_size = 500000
+
+            def extract_one_SDF_file(block_id: int) -> None:
+                # Copies all molecules of one raw block whose CID has a
+                # description into `filtered_{block_id}.sdf`.
+                valid_mol_count = 0
+                l_id = block_id * block_size + 1
+                r_id = (block_id + 1) * block_size
+                compound_file_name = f"Compound_{l_id:09d}_{r_id:09d}.sdf.gz"
+                writer = Chem.SDWriter(
+                    f'{step3_folder}/filtered_{block_id}.sdf')
+                try:
+                    sdf_path = f"{step2_folder}/{compound_file_name}"
+                    with gzip.open(sdf_path) as gzip_loader:
+                        suppl = Chem.ForwardSDMolSupplier(gzip_loader)
+                        for mol in tqdm(suppl):
+                            if mol is None:
+                                continue
+                            cid = mol.GetProp("PUBCHEM_COMPOUND_CID")
+                            if cid not in target_CID_list:
+                                continue
+                            writer.write(mol)
+                            valid_mol_count += 1
+                finally:
+                    writer.close()
+                print(f"block id: {block_id}\nfound {valid_mol_count}\n\n")
+                sys.stdout.flush()
+
+            if use_mp:
+                # NOTE(review): `Pool.map` pickles the callable, and a
+                # function defined inside another function cannot be
+                # pickled, so this branch is expected to fail as written.
+                num_process = multiprocessing.cpu_count()
+                print(f"{num_process} CPUs")
+                num_process = 8
+                p = Pool(num_process)
+                block_id_list = np.arange(self.total_block_num)
+                with p:
+                    p.map(extract_one_SDF_file, block_id_list)
+            else:
+                for block_id in range(self.total_block_num):
+                    extract_one_SDF_file(block_id)
+
+        # Step 04. Merge SDF
+        print(f'The length of target_CID_list: {len(target_CID_list)}')
+        writer = Chem.SDWriter(f'{self.raw_dir}/molecules.sdf')
+        found_CID_set = set()
+        # Step 03 writes exactly `filtered_0 ... filtered_{n-1}`.
+        for block_id in range(self.total_block_num):
+            compound_file_path = f"{step3_folder}/filtered_{block_id}.sdf"
+            try:
+                suppl = Chem.SDMolSupplier(compound_file_path)
+                for mol in tqdm(suppl):
+                    # Skip unparsable entries instead of aborting the
+                    # remainder of the block.
+                    if mol is None:
+                        continue
+                    writer.write(mol)
+                    cid = mol.GetProp("PUBCHEM_COMPOUND_CID")
+                    found_CID_set.add(cid)
+            except Exception:
+                print(f"block id: {block_id} with 0 valid SDF file")
+                continue
+        writer.close()
+        print(f"In total: {len(found_CID_set)} molecules")
+
+        # Step 05. Convert to PyG data format
+        types = {'H': 0, 'C': 1, 'N': 2, 'O': 3, 'F': 4, 'Unknow': 5}
+        bonds = {BT.SINGLE: 0, BT.DOUBLE: 1, BT.TRIPLE: 2, BT.AROMATIC: 3}
+        data_list = []
+
+        suppl = Chem.SDMolSupplier(f'{self.raw_dir}/molecules.sdf')
+        llm = LLM(
+            # model_name='lmsys/vicuna-7b-v1.5',
+            model_name='TinyLlama/TinyLlama-1.1B-Chat-v0.1',
+            num_params=1,
+            dtype=torch.bfloat16,
+        )
+        prompt = ("Propose a question regarding the molecule '∼' "
+                  "whose answer is: {}:")
+
+        for mol in tqdm(suppl):
+            if mol is None or not mol.HasProp('PUBCHEM_COMPOUND_CID'):
+                continue
+            CID = mol.GetProp("PUBCHEM_COMPOUND_CID")
+            CAN_SMILES = mol.GetProp("PUBCHEM_OPENEYE_CAN_SMILES")
+            m: Chem.Mol = Chem.MolFromSmiles(CAN_SMILES)
+            if m is None:
+                continue
+            RDKit_CAN_SMILES = Chem.MolToSmiles(m)
+            ground_truth = CID2text_data[CID][0]
+            instruction = llm.inference([prompt.format(ground_truth)])[0]
+
+            # Atoms outside of the known symbols share the 'Unknow' class.
+            x: torch.Tensor = torch.tensor([
+                types.get(atom.GetSymbol(), types['Unknow'])
+                for atom in m.GetAtoms()
+            ])
+            x = one_hot(x, num_classes=len(types), dtype=torch.float)
+
+            # Every bond is stored in both directions.
+            rows, cols, edge_types = [], [], []
+            for bond in m.GetBonds():
+                i, j = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
+                edge_types += [bonds[bond.GetBondType()]] * 2
+                rows += [i, j]
+                cols += [j, i]
+            edge_index = torch.tensor([rows, cols], dtype=torch.long)
+            edge_type = torch.tensor(edge_types, dtype=torch.long)
+            edge_attr = one_hot(edge_type, num_classes=len(bonds))
+
+            data = Data(
+                x=x,
+                edge_index=edge_index,
+                edge_attr=edge_attr,
+                smiles=RDKit_CAN_SMILES,
+                instruction=instruction,
+                y=ground_truth,
+            )
+            if self.pre_filter is not None and not self.pre_filter(data):
+                continue
+            if self.pre_transform is not None:
+                data = self.pre_transform(data)
+            data_list.append(data)
+
+        self.save(data_list, self.processed_paths[0])

torch_geometric/datasets/molecule_net.py CHANGED Viewed

@@ -1,12 +1,13 @@
 import os
 import os.path as osp
 import re
+import warnings
 from typing import Callable, Dict, Optional, Tuple, Union
 import torch
 from torch_geometric.data import InMemoryDataset, download_url, extract_gz
-from torch_geometric.utils import from_smiles
+from torch_geometric.utils import from_smiles as _from_smiles
 class MoleculeNet(InMemoryDataset):
@@ -38,6 +39,10 @@ class MoleculeNet(InMemoryDataset):
             final dataset. (default: :obj:`None`)
         force_reload (bool, optional): Whether to re-process the dataset.
             (default: :obj:`False`)
+        from_smiles (callable, optional): A custom function that takes a SMILES
+            string and outputs a :obj:`~torch_geometric.data.Data` object.
+            If not set, defaults to :meth:`~torch_geometric.utils.from_smiles`.
+            (default: :obj:`None`)
     **STATS:**
@@ -152,9 +157,11 @@ class MoleculeNet(InMemoryDataset):
         pre_transform: Optional[Callable] = None,
         pre_filter: Optional[Callable] = None,
         force_reload: bool = False,
+        from_smiles: Optional[Callable] = None,
     ) -> None:
         self.name = name.lower()
         assert self.name in self.names.keys()
+        self.from_smiles = from_smiles or _from_smiles
         super().__init__(root, transform, pre_transform, pre_filter,
                          force_reload=force_reload)
         self.load(self.processed_paths[0])
@@ -183,7 +190,7 @@ class MoleculeNet(InMemoryDataset):
             os.unlink(path)
     def process(self) -> None:
-        with open(self.raw_paths[0], 'r') as f:
+        with open(self.raw_paths[0]) as f:
             dataset = f.read().split('\n')[1:-1]
             dataset = [x for x in dataset if len(x) > 0]  # Filter empty lines.
@@ -199,9 +206,14 @@ class MoleculeNet(InMemoryDataset):
             ys = [float(y) if len(y) > 0 else float('NaN') for y in labels]
             y = torch.tensor(ys, dtype=torch.float).view(1, -1)
-            data = from_smiles(smiles)
+            data = self.from_smiles(smiles)
             data.y = y
+            if data.num_nodes == 0:
+                warnings.warn(f"Skipping molecule '{smiles}' since it "
+                              f"resulted in zero atoms")
+                continue
             if self.pre_filter is not None and not self.pre_filter(data):
                 continue

torch_geometric/datasets/motif_generator/base.py CHANGED Viewed

@@ -10,7 +10,6 @@ class MotifGenerator(ABC):
     @abstractmethod
     def __call__(self) -> Data:
         r"""To be implemented by :class:`Motif` subclasses."""
-        pass
     @staticmethod
     def resolve(query: Any, *args: Any, **kwargs: Any) -> 'MotifGenerator':

torch_geometric/datasets/neurograph.py CHANGED Viewed

@@ -2,8 +2,6 @@ import os
 import os.path as osp
 from typing import Callable, List, Optional
-import torch
 from torch_geometric.data import (
     Data,
     InMemoryDataset,
@@ -110,7 +108,7 @@ class NeuroGraphDataset(InMemoryDataset):
         fs.rm(osp.join(self.raw_dir, self.name))
     def process(self) -> None:
-        data, slices = torch.load(self.raw_paths[0])
+        data, slices = fs.torch_load(self.raw_paths[0])
         num_samples = slices['x'].size(0) - 1
         data_list: List[Data] = []

torch_geometric/datasets/ogb_mag.py CHANGED Viewed

@@ -147,7 +147,7 @@ class OGB_MAG(InMemoryDataset):
             for node_type in ['author', 'institution', 'field_of_study']:
                 data[node_type].num_nodes = num_nodes_df[node_type].tolist()[0]
         else:
-            emb_dict = torch.load(self.raw_paths[-1])
+            emb_dict = fs.torch_load(self.raw_paths[-1])
             for key, value in emb_dict.items():
                 if key != 'paper':
                     data[key].x = value

pyg-nightly 2.6.0.dev20240318__py3-none-any.whl → 2.7.0.dev20250115__py3-none-any.whl

pyg-nightly 2.6.0.dev20240318__py3-none-any.whl → 2.7.0.dev20250115__py3-none-any.whl