pyg-nightly 2.7.0.dev20241119__py3-none-any.whl → 2.7.0.dev20241121__py3-none-any.whl

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: pyg-nightly
- Version: 2.7.0.dev20241119
+ Version: 2.7.0.dev20241121
  Summary: Graph Neural Network Library for PyTorch
  Keywords: deep-learning,pytorch,geometric-deep-learning,graph-neural-networks,graph-convolutional-networks
  Author-email: Matthias Fey <matthias@pyg.org>
@@ -1,4 +1,4 @@
- torch_geometric/__init__.py,sha256=spxW7Bk1ADYtDbAY5o7hc4aHzY-HMhp_JzJaHacQX30,1904
+ torch_geometric/__init__.py,sha256=KxeHpFIYYrXJ-wesw35LT5EYOhXnC86_S8hRPAfpOy4,1904
  torch_geometric/_compile.py,sha256=f-WQeH4VLi5Hn9lrgztFUCSrN_FImjhQa6BxFzcYC38,1338
  torch_geometric/_onnx.py,sha256=V9ffrIKSqhDw6xUZ12lkuSfNs48cQp2EeJ6Z19GfnVw,349
  torch_geometric/backend.py,sha256=lVaf7aLoVaB3M-UcByUJ1G4T4FOK6LXAg0CF4W3E8jo,1575
@@ -53,7 +53,7 @@ torch_geometric/data/temporal.py,sha256=WOJ6gFrTLikaLhUvotyUF5ql14FkE5Ox3hNkdSp6
  torch_geometric/data/view.py,sha256=XjkVSc-UWZFCT4DlXLShZtO8duhFQkS9gq88zZXANsk,1089
  torch_geometric/data/lightning/__init__.py,sha256=w3En1tJfy3kSqe1MycpOyZpHFO3fxBCgNCUOznPA3YU,178
  torch_geometric/data/lightning/datamodule.py,sha256=Bn9iaIfE4NWDDWWMqCvBeZ4bIW1Silx_Ol5CPJCliaQ,29242
- torch_geometric/datasets/__init__.py,sha256=fey-955PyCQXGBeUTNPWwU5uK3PJOEvaY1_fDt1SxXc,5880
+ torch_geometric/datasets/__init__.py,sha256=f9YqoX9WTSVMzjuLfFD_eCsC4iQ5kbFNQiZru3n6qw0,6013
  torch_geometric/datasets/actor.py,sha256=oUxgJIX8bi5hJr1etWNYIFyVQNDDXi1nyVpHGGMEAGQ,4304
  torch_geometric/datasets/airfrans.py,sha256=212gYsk7PvF-qcmvM2YXaOBhFrS79evAGg_sPHXih4w,5439
  torch_geometric/datasets/airports.py,sha256=b3gkv3gY2JkUpmGiz36Z-g7EcnSfU8lBG1YsCOWdJ6k,3758
@@ -113,6 +113,7 @@ torch_geometric/datasets/md17.py,sha256=BD6LU2xm6_ycXVk6r4O0poNt5Sr_PJ2P1QjNqIOL
  torch_geometric/datasets/mixhop_synthetic_dataset.py,sha256=4NNvTHUvvV6pcqQCyVDS5XhppXUeF2H9GTfFoc49eyU,3951
  torch_geometric/datasets/mnist_superpixels.py,sha256=o2ArbZ0_OE0u8VCaHmWwvngESlOFr9oM9dSEP_tjAS4,3340
  torch_geometric/datasets/modelnet.py,sha256=-qmLjlQiKVWmtHefAIIE97dQxEcaBfetMJnvgYZuwkg,5347
+ torch_geometric/datasets/molecule_gpt_dataset.py,sha256=XE14wgPVBm2kVLYL6NgXUDhv4QGHxVISG-VWEwO7hfA,18754
  torch_geometric/datasets/molecule_net.py,sha256=VNWLEDulFID8mLsxgN8q1T-O3M2i0n0Si5ISwEZezMU,7379
  torch_geometric/datasets/movie_lens.py,sha256=M4Bu0Xus8IkW8GYzjxPxSdPXNbcCCx9cu6cncxBvLx8,4033
  torch_geometric/datasets/movie_lens_100k.py,sha256=eTpBAteM3jqTEtiwLxmhVj4r8JvftvPx8Hvs-3ZIHlU,6057
@@ -144,6 +145,7 @@ torch_geometric/datasets/shapenet.py,sha256=tn3HiQQAr6lxHrqxfOVaAtl40guwFYTXWCbS
  torch_geometric/datasets/shrec2016.py,sha256=cTLhctbqE0EUEvKddJFhPzDb1oLKXOth4O_WzsWtyMk,6323
  torch_geometric/datasets/snap_dataset.py,sha256=r3sC-dHDouyaYoHGdoBY0uO0qOOvD6_Hb96d2ceGMZk,9433
  torch_geometric/datasets/suite_sparse.py,sha256=eqjH4vAUq872qdk3YdLkZSwlu6r7HHpTgK0vEVGmY1s,3278
+ torch_geometric/datasets/tag_dataset.py,sha256=0fzOsakR9L9CK6ppGN-USD4-Vq-ssbQ2Xovw2nqqtWo,14759
  torch_geometric/datasets/taobao.py,sha256=CUcZpbWsNTasevflO8zqP0YvENy89P7wpKS4MHaDJ6Q,4170
  torch_geometric/datasets/tosca.py,sha256=nUSF8NQT1GlkwWQLshjWmr8xORsvRHzzIqhUyDCvABc,4632
  torch_geometric/datasets/tu_dataset.py,sha256=14OSaXBgVwT1dX2h1wZ3xVIwoo0GQBEfR3yWh6Q0VF0,7847
@@ -324,8 +326,9 @@ torch_geometric/nn/aggr/set_transformer.py,sha256=FG7_JizpFX14M6VSCwLSjYXYdJ1ZiQ
  torch_geometric/nn/aggr/sort.py,sha256=bvOOWnFkNOBOZih4rqVZQsjfeDX3vmXo1bpPSFD846w,2507
  torch_geometric/nn/aggr/utils.py,sha256=CLJ-ZrVWYIOBpdhQBLAz94dj3cMKKKc3qwGr4DFbiCU,8338
  torch_geometric/nn/aggr/variance_preserving.py,sha256=fu-U_aGYpVLpgSFvVg0ONMe6nqoyv8tZ6Y35qMYTf9w,1126
- torch_geometric/nn/attention/__init__.py,sha256=Ip6n4xbUbhJhrmPO9LjvHq0nNQe-yxiC4WHyOYOrHJc,76
+ torch_geometric/nn/attention/__init__.py,sha256=1lCB7zh7uM6FkpW81S9U4CvxTwpCkz59KatPTIE9UmA,127
  torch_geometric/nn/attention/performer.py,sha256=2PCDn4_-oNTao2-DkXIaoi18anP01OxRELF2pvp-jk8,7357
+ torch_geometric/nn/attention/qformer.py,sha256=7J-pWm_vpumK38IC-iCBz4oqL-BEIofEIxJ0wfjWq9A,2338
  torch_geometric/nn/conv/__init__.py,sha256=37zTdt0gfSAUPMtwXjZg5mWx_itojJVFNODYR1h1ch0,3515
  torch_geometric/nn/conv/agnn_conv.py,sha256=5nEPLx_BBHcDaO6HWzLuHfXc0Yd_reKynAOH0Iq09lU,3077
  torch_geometric/nn/conv/antisymmetric_conv.py,sha256=dhA6sCETy1jlXReYJZBSyToOcL_mZ1wL10fMIb8Ppuw,4387
@@ -417,7 +420,7 @@ torch_geometric/nn/kge/distmult.py,sha256=dGQ0bVzjreZgFN1lXE23_IIidsiOq7ehPrMb-N
  torch_geometric/nn/kge/loader.py,sha256=5Uc1j3OUMQnBYSHDqL7pLCty1siFLzoPkztigYO2zP8,771
  torch_geometric/nn/kge/rotate.py,sha256=XLuO1AbyTt5cJxr97ZzoyAyIEsHKesgW5TvDmnGJAao,3208
  torch_geometric/nn/kge/transe.py,sha256=jlejq5BLMm-sb1wWcLDp7pZqCdelWBgjDIC8ctbjSdU,3088
- torch_geometric/nn/models/__init__.py,sha256=RpYFFqaYWq1BVMF3Fs-EQo-QZDdLQjIHPdkl3d2MOW4,2017
+ torch_geometric/nn/models/__init__.py,sha256=dr2-YsRzUdVBM6Ut78FB9Wbjn-kzV0gPwOlWGPdQLY4,2108
  torch_geometric/nn/models/attentive_fp.py,sha256=tkgvw28wg9-JqHIfBllfCwTHrZIUiv85yZJcDqjz3z0,6634
  torch_geometric/nn/models/autoencoder.py,sha256=nGje-zty78Y3hxOJ9o0_6QziJjOvBlknk6z0_fDQwQU,10770
  torch_geometric/nn/models/basic_gnn.py,sha256=PGa0RUMyvrNy_5yRI2jX_zwPsmZXwOQWfsWvxOiHsSk,31225
@@ -428,6 +431,7 @@ torch_geometric/nn/models/deepgcn.py,sha256=tIgT03cj8MghYlxEozpoGvGG_CwpJrGDxv1Z
  torch_geometric/nn/models/dimenet.py,sha256=Kc5p-rB5q-0e8lY22l-OdQTscTxJh2lTEpeRFMdL4RY,36186
  torch_geometric/nn/models/dimenet_utils.py,sha256=Eyn_EiJqwKvuYj6BtRpSxrzMG3v4Gk98X9MxZ7uvwm4,5069
  torch_geometric/nn/models/g_retriever.py,sha256=VueRImNJlh1WvRWcsSXliSw8RlxlzWlu2WSFs_VQaJc,7749
+ torch_geometric/nn/models/glem.py,sha256=gqQF4jlU7U_u5-zGeJZuHiEqhSXa-wLU5TghN4u5fYY,16389
  torch_geometric/nn/models/gnnff.py,sha256=15dkiLgy0LmH1hnUrpeoHioIp4BPTfjpVATpnGRt9E0,7860
  torch_geometric/nn/models/graph_mixer.py,sha256=mthMeCOikR8gseEsu4oJ3Cd9C35zHSv1p32ROwnG-6s,9246
  torch_geometric/nn/models/graph_unet.py,sha256=N8TSmJo8AlbZjjcame0xW_jZvMOirL5ahw6qv5Yjpbs,5586
@@ -439,6 +443,7 @@ torch_geometric/nn/models/mask_label.py,sha256=B2HcL6ZkaUEo3a8nebZoUqEIfDEfcIGOV
  torch_geometric/nn/models/meta.py,sha256=lQWovjdQgTGT_rDAm6L186ObINeQCD9tLBz8xenmrF0,6540
  torch_geometric/nn/models/metapath2vec.py,sha256=nxttGe4QVWr4teYEoNz8uHRu-yVsLSZPOeF_tz0bj2o,10788
  torch_geometric/nn/models/mlp.py,sha256=rdwUFxxxqLjXK-iy1L1sXiwSNwAfqTlvHLaqVZ-jwCs,10315
+ torch_geometric/nn/models/molecule_gpt.py,sha256=k-XULH6jaurj-R2EE4sIWTkqlNqa3CzWxfQgfFa-G8s,7637
  torch_geometric/nn/models/neural_fingerprint.py,sha256=pTLJgU9Uh2Lnf9bggLj4cKI8YdEFcMF-9MALuubqbuQ,2378
  torch_geometric/nn/models/node2vec.py,sha256=U-VhJlvt5lT-JShFrF5tN84wCPqoVuftLVNyOVXs0OU,7664
  torch_geometric/nn/models/pmlp.py,sha256=dcAASVSyQMMhItSfEJWPeAFh0R3tNCwAHwdrShwQ8o4,3538
@@ -450,8 +455,8 @@ torch_geometric/nn/models/signed_gcn.py,sha256=J40CnedFIqtKI1LhW1ITSEFRbA_XiJZL6
  torch_geometric/nn/models/tgn.py,sha256=kEGdfLJybkbMT4UMoAh2nCzfX3_nDjfm1cicuPHEwAM,11878
  torch_geometric/nn/models/visnet.py,sha256=97OFMCsPDEI5BCSi7RhoRcU2CNRp7zck2tEzrltFZj4,43192
  torch_geometric/nn/nlp/__init__.py,sha256=JJESTA7w_K8v60XbCd25IqmrKKHLz5OiNexMHYGV2mE,138
- torch_geometric/nn/nlp/llm.py,sha256=_penl2qkDMeVtlwGPrl7UuyxBh6ILtdiLHmrUNQHkYc,11731
- torch_geometric/nn/nlp/sentence_transformer.py,sha256=JrTN3W1srdkNX7qYDGB08mY5615i5nfEJSTHAdd5EuA,3260
+ torch_geometric/nn/nlp/llm.py,sha256=M15Qn0yHyA6HL2rHCH2p4H6hKjUvLfnzlxdfEFvRxSA,11732
+ torch_geometric/nn/nlp/sentence_transformer.py,sha256=VzMtNUYk6FvOVc3PdVets9_2Sb2FdQbzu9H3m6teRlI,3417
  torch_geometric/nn/norm/__init__.py,sha256=u2qIDrkbeuObGVXSAIftAlvSd6ouGTtxznCfD-59UiA,669
  torch_geometric/nn/norm/batch_norm.py,sha256=sJKrinHGwA-noIgteg1RD2W06rd0zskD-rXuY-36glY,8283
  torch_geometric/nn/norm/diff_group_norm.py,sha256=b57XvNekrUYGDjNJlGeqvaMGNJmHwopSF0_yyBWlLuA,4722
@@ -618,6 +623,6 @@ torch_geometric/utils/undirected.py,sha256=H_nfpI0_WluOG6VfjPyldvcjL4w5USAKWu2x5
  torch_geometric/visualization/__init__.py,sha256=PyR_4K5SafsJrBr6qWrkjKr6GBL1b7FtZybyXCDEVwY,154
  torch_geometric/visualization/graph.py,sha256=ZuLPL92yGRi7lxlqsUPwL_EVVXF7P2kMcveTtW79vpA,4784
  torch_geometric/visualization/influence.py,sha256=CWMvuNA_Nf1sfbJmQgn58yS4OFpeKXeZPe7kEuvkUBw,477
- pyg_nightly-2.7.0.dev20241119.dist-info/WHEEL,sha256=CpUCUxeHQbRN5UGRQHYRJorO5Af-Qy_fHMctcQ8DSGI,82
- pyg_nightly-2.7.0.dev20241119.dist-info/METADATA,sha256=3Y-GTdZXsDzzOxIrxa35EwttZ_dPAwF2jkLotBJ9ubg,62979
- pyg_nightly-2.7.0.dev20241119.dist-info/RECORD,,
+ pyg_nightly-2.7.0.dev20241121.dist-info/WHEEL,sha256=CpUCUxeHQbRN5UGRQHYRJorO5Af-Qy_fHMctcQ8DSGI,82
+ pyg_nightly-2.7.0.dev20241121.dist-info/METADATA,sha256=GZC_8xTwtszZfiAKIcN4yjzbZCrGmYbnpl5lr98v8eg,62979
+ pyg_nightly-2.7.0.dev20241121.dist-info/RECORD,,
@@ -30,7 +30,7 @@ from .lazy_loader import LazyLoader
  contrib = LazyLoader('contrib', globals(), 'torch_geometric.contrib')
  graphgym = LazyLoader('graphgym', globals(), 'torch_geometric.graphgym')

- __version__ = '2.7.0.dev20241119'
+ __version__ = '2.7.0.dev20241121'

  __all__ = [
      'Index',
@@ -77,6 +77,8 @@ from .myket import MyketDataset
  from .brca_tgca import BrcaTcga
  from .neurograph import NeuroGraphDataset
  from .web_qsp_dataset import WebQSPDataset
+ from .molecule_gpt_dataset import MoleculeGPTDataset
+ from .tag_dataset import TAGDataset

  from .dbp15k import DBP15K
  from .aminer import AMiner
@@ -190,6 +192,8 @@ homo_datasets = [
      'BrcaTcga',
      'NeuroGraphDataset',
      'WebQSPDataset',
+     'MoleculeGPTDataset',
+     'TAGDataset',
  ]

  hetero_datasets = [
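
The two dataset classes registered above become part of the public torch_geometric.datasets namespace in this build. A minimal import sketch (only the class names are taken from the diff; nothing further is implied about their constructors):

from torch_geometric.datasets import MoleculeGPTDataset, TAGDataset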
@@ -0,0 +1,480 @@
+ import gzip
+ import json
+ import multiprocessing
+ import os
+ import sys
+ from collections import defaultdict
+ from multiprocessing import Pool
+ from typing import Callable, List, Optional, Tuple
+
+ import numpy as np
+ import requests
+ import torch
+ from tqdm import tqdm
+
+ from torch_geometric.data import Data, InMemoryDataset, download_url
+ from torch_geometric.io import fs
+ from torch_geometric.nn.nlp import LLM
+ from torch_geometric.utils import one_hot
+
+
+ def clean_up_description(description: str) -> str:
+     description = description + " "
+
+     # extra adj Pure
+     if description.startswith("Pure "):
+         description = description.replace("Pure ", "")
+     # fix typo
+     if description.startswith("Mercurycombines"):
+         description = description.replace("Mercurycombines",
+                                           "Mercury combines")
+
+     # a special case
+     description = description.replace(
+         "17-Hydroxy-6-methylpregna-3,6-diene-3,20-dione. ",
+         "17-Hydroxy-6-methylpregna-3,6-diene-3,20-dione is ")
+
+     # a special case
+     description = description.replace("5-Thymidylic acid. ",
+                                        "5-Thymidylic acid. is ")
+
+     # a special case
+     description = description.replace(
+         "5'-S-(3-Amino-3-carboxypropyl)-5'-thioadenosine. ",
+         "5'-S-(3-Amino-3-carboxypropyl)-5'-thioadenosine. is ")
+
+     # a special case
+     description = description.replace(
+         ("Guanosine 5'-(trihydrogen diphosphate), monoanhydride"
+          " with phosphorothioic acid. "),
+         ("Guanosine 5'-(trihydrogen diphosphate), monoanhydride"
+          " with phosphorothioic acid is "))
+
+     # a special case
+     description = description.replace("5'-Uridylic acid. ",
+                                        "5'-Uridylic acid is ")
+
+     # a special case
+     description = description.replace("5'-Adenylic acid, ",
+                                        "5'-Adenylic acid is ")
+
+     # a special case
+     description = description.replace(
+         "Uridine 5'-(tetrahydrogen triphosphate). ",
+         "Uridine 5'-(tetrahydrogen triphosphate). is ")
+
+     # a special case
+     description = description.replace("Inosine 5'-Monophosphate. ",
+                                        "Inosine 5'-Monophosphate. is ")
+
+     # a special case
+     description = description.replace("Pivaloyloxymethyl butyrate (AN-9), ",
+                                        "Pivaloyloxymethyl butyrate (AN-9) is ")
+
+     # a special case
+     description = description.replace(
+         "4-Amino-5-cyano-7-(D-ribofuranosyl)-7H- pyrrolo(2,3-d)pyrimidine. ",
+         "4-Amino-5-cyano-7-(D-ribofuranosyl)-7H- pyrrolo(2,3-d)pyrimidine is ")
+
+     # a special case
+     description = description.replace(
+         "Cardamonin (also known as Dihydroxymethoxychalcone), ",
+         "Cardamonin (also known as Dihydroxymethoxychalcone) is ")
+
+     # a special case
+     description = description.replace("Lithium has been used to treat ",
+                                        "Lithium is ")
+
+     # a special case
+     description = description.replace("4,4'-Methylenebis ",
+                                        "4,4'-Methylenebis is ")
+
+     # a special case
+     description = description.replace(
+         "2,3,7,8-Tetrachlorodibenzo-p-dioxin",
+         "2,3,7,8-Tetrachlorodibenzo-p-dioxin is ")
+
+     # a special case
+     description = description.replace("Exposure to 2,4,5-trichlorophenol ",
+                                        "2,4,5-Trichlorophenol exposure ")
+
+     index = 0
+     L = len(description)
+     if description.startswith('C.I. '):
+         start_index = len('C.I. ')
+     elif description.startswith('Nectriapyrone. D '):
+         start_index = len('Nectriapyrone. D ')
+     elif description.startswith(
+             'Salmonella enterica sv. Minnesota LPS core oligosaccharide'):
+         start_index = len(
+             'Salmonella enterica sv. Minnesota LPS core oligosaccharide')
+     else:
+         start_index = 0
+     for index in range(start_index, L - 1):
+         if index < L - 2:
+             if description[index] == '.' and description[
+                     index + 1] == ' ' and 'A' <= description[index + 2] <= 'Z':
+                 break
+         elif index == L - 2:
+             break
+
+     first_sentence = description[:index + 1]
+     return first_sentence
+
+
+ def extract_name(name_raw: str, description: str) -> Tuple[str, str, str]:
+     first_sentence = clean_up_description(description)
+
+     splitter = ' -- -- '
+     if ' are ' in first_sentence or ' were ' in first_sentence:
+         replaced_words = 'These molecules'
+     else:
+         replaced_words = 'This molecule'
+
+     first_sentence = first_sentence.replace(' is ', splitter)
+     first_sentence = first_sentence.replace(' are ', splitter)
+     first_sentence = first_sentence.replace(' was ', splitter)
+     first_sentence = first_sentence.replace(' were ', splitter)
+     first_sentence = first_sentence.replace(' appears ', splitter)
+     first_sentence = first_sentence.replace(' occurs ', splitter)
+     first_sentence = first_sentence.replace(' stands for ', splitter)
+     first_sentence = first_sentence.replace(' belongs to ', splitter)
+     first_sentence = first_sentence.replace(' exists ',
+                                             splitter)  # only for CID=11443
+     first_sentence = first_sentence.replace(' has been used in trials ',
+                                             splitter)
+     first_sentence = first_sentence.replace(' has been investigated ',
+                                             splitter)
+     first_sentence = first_sentence.replace(' has many uses ', splitter)
+
+     if splitter in first_sentence:
+         extracted_name = first_sentence.split(splitter, 1)[0]
+     elif first_sentence.startswith(name_raw):
+         extracted_name = name_raw
+     elif name_raw in first_sentence:
+         extracted_name = name_raw
+         extracted_name = None
+         print("=====", name_raw)
+         print("first sentence: ", first_sentence)
+     else:
+         extracted_name = None
+
+     if extracted_name is not None:
+         extracted_description = description.replace(extracted_name,
+                                                      replaced_words)
+     else:
+         extracted_description = description
+
+     return extracted_name, extracted_description, first_sentence
+
+
+ class MoleculeGPTDataset(InMemoryDataset):
+     r"""The dataset from the `"MoleculeGPT: Instruction Following Large
+     Language Models for Molecular Property Prediction"
+     <https://ai4d3.github.io/papers/34.pdf>`_ paper.
+
+     Args:
+         root (str): Root directory where the dataset should be saved.
+         transform (callable, optional): A function/transform that takes in an
+             :obj:`torch_geometric.data.Data` object and returns a transformed
+             version. The data object will be transformed before every access.
+             (default: :obj:`None`)
+         pre_transform (callable, optional): A function/transform that takes in
+             an :obj:`torch_geometric.data.Data` object and returns a
+             transformed version. The data object will be transformed before
+             being saved to disk. (default: :obj:`None`)
+         pre_filter (callable, optional): A function that takes in an
+             :obj:`torch_geometric.data.Data` object and returns a boolean
+             value, indicating whether the data object should be included in the
+             final dataset. (default: :obj:`None`)
+         force_reload (bool, optional): Whether to re-process the dataset.
+             (default: :obj:`False`)
+         total_page_num (int, optional): The number of pages from PubChem.
+             (default: :obj:`10`)
+         total_block_num (int, optional): The blocks of SDF files from PubChem.
+             (default: :obj:`1`)
+     """
+     description_url = (
+         'https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/annotations/'
+         'heading/json?heading_type=Compound&heading=Record+Description&page={}'
+     )
+     compound_url = ('https://ftp.ncbi.nlm.nih.gov/pubchem/Compound/'
+                     'CURRENT-Full/SDF')
+
+     def __init__(
+         self,
+         root: str,
+         transform: Optional[Callable] = None,
+         pre_transform: Optional[Callable] = None,
+         pre_filter: Optional[Callable] = None,
+         force_reload: bool = False,
+         total_page_num: int = 10,
+         total_block_num: int = 1,
+     ):
+         self.total_page_num = total_page_num
+         self.total_block_num = total_block_num
+
+         super().__init__(root, transform, pre_transform, pre_filter,
+                          force_reload=force_reload)
+         self.load(self.processed_paths[0])
+
+     @property
+     def raw_file_names(self) -> List[str]:
+         return ['pubchem.csv']
+
+     @property
+     def processed_file_names(self) -> List[str]:
+         return ['data.pt']
+
+     def download(self) -> None:
+         # Step 01. Extract description
+         step1_folder = f"{self.raw_dir}/step_01_PubChemSTM_description"
+         if not os.path.exists(step1_folder):
+             os.makedirs(step1_folder)
+             valid_CID_set = set()
+             CID2name_raw, CID2name_extracted = defaultdict(list), defaultdict(
+                 list)
+             CID2text_raw, CID2text_extracted = defaultdict(list), defaultdict(
+                 list)
+
+             for page_index in tqdm(range(self.total_page_num)):
+                 page_num = page_index + 1
+                 f_out = open(
+                     f"{step1_folder}/Compound_description_{page_num}.txt", "w")
+
+                 description_data = requests.get(
+                     self.description_url.format(page_num)).json()
+
+                 description_data = description_data["Annotations"]
+                 assert description_data["Page"] == page_num
+
+                 record_list = description_data["Annotation"]
+
+                 for record in record_list:
+                     try:
+                         CID = record["LinkedRecords"]["CID"][0]
+                         if "Name" in record:
+                             name_raw = record["Name"]
+                             CID2name_raw[CID].append(name_raw)
+                         else:
+                             name_raw = None
+
+                         data_list = record["Data"]
+                         for data in data_list:
+                             description = data["Value"]["StringWithMarkup"][0][
+                                 "String"].strip()
+
+                             extracted_name, extracted_description, _ = extract_name(  # noqa: E501
+                                 name_raw, description)
+                             if extracted_name is not None:
+                                 CID2name_extracted[CID].append(extracted_name)
+
+                             CID2text_raw[CID].append(description)
+                             CID2text_extracted[CID].append(
+                                 extracted_description)
+
+                             valid_CID_set.add(CID)
+                             f_out.write(f"{CID}\n")
+                             f_out.write(f"{extracted_description}\n\n")
+                     except Exception:
+                         continue
+
+             valid_CID_list = sorted(list(valid_CID_set))
+             print(f"Total CID (with raw name) {len(CID2name_raw)}")
+             print(f"Total CID (with extracted name) {len(CID2name_extracted)}")
+             print(f"Total CID {len(valid_CID_list)}")
+
+             with open(f"{self.raw_dir}/CID2name_raw.json", "w") as f:
+                 json.dump(CID2name_raw, f)
+
+             with open(f"{self.raw_dir}/CID2name.json", "w") as f:
+                 json.dump(CID2name_extracted, f)
+
+             with open(f"{self.raw_dir}/CID2text_raw.json", "w") as f:
+                 json.dump(CID2text_raw, f)
+
+             with open(f"{self.raw_dir}/CID2text.json", "w") as f:
+                 json.dump(CID2text_extracted, f)
+
+         # Step 02. Download SDF Files
+         step2_folder = f"{self.raw_dir}/step_02_PubChemSTM_SDF"
+         if not os.path.exists(step2_folder):
+             for block_id in tqdm(range(self.total_block_num)):
+                 block_size = 500000
+                 l_id = block_id * block_size + 1
+                 r_id = (block_id + 1) * block_size
+
+                 compound_file_name = f"Compound_{l_id:09d}_{r_id:09d}.sdf.gz"
+                 download_url(f"{self.compound_url}/{compound_file_name}",
+                              step2_folder)
+
+     def process(self, use_mp: bool = False) -> None:
+         try:
+             from rdkit import Chem
+             from rdkit.Chem.rdchem import BondType as BT
+             WITH_RDKIT = True
+
+         except ImportError:
+             WITH_RDKIT = False
+
+         if not WITH_RDKIT:
+             print(("Using a pre-processed version of the dataset. Please "
+                    "install 'rdkit' to alternatively process the raw data."),
+                   file=sys.stderr)
+
+             data_list = fs.torch_load(self.raw_paths[0])
+             data_list = [Data(**data_dict) for data_dict in data_list]
+
+             if self.pre_filter is not None:
+                 data_list = [d for d in data_list if self.pre_filter(d)]
+
+             if self.pre_transform is not None:
+                 data_list = [self.pre_transform(d) for d in data_list]
+
+             self.save(data_list, self.processed_paths[0])
+             return
+
+         # Step 03. Filter out SDF
+         step2_folder = f"{self.raw_dir}/step_02_PubChemSTM_SDF"
+         step3_folder = f"{self.raw_dir}/step_03_PubChemSTM_filtered"
+         if not os.path.exists(step3_folder):
+             os.makedirs(step3_folder)
+             with open(f"{self.raw_dir}/CID2text.json") as f:
+                 CID2text = json.load(f)
+             target_CID_list = set(CID2text.keys())
+
+             block_size = 500000
+
+             def extract_one_SDF_file(block_id: int) -> None:
+                 valid_mol_count = 0
+
+                 writer = Chem.SDWriter(
+                     f'{step3_folder}/filtered_{block_id}.sdf')
+                 l_id = block_id * block_size + 1
+                 r_id = (block_id + 1) * block_size
+
+                 compound_file_name = f"Compound_{l_id:09d}_{r_id:09d}.sdf.gz"
+                 gzip_loader = gzip.open(f"{step2_folder}/{compound_file_name}")
+                 suppl = Chem.ForwardSDMolSupplier(gzip_loader)
+
+                 for mol in tqdm(suppl):
+                     if mol is None:
+                         continue
+                     cid = mol.GetProp("PUBCHEM_COMPOUND_CID")
+
+                     if cid not in target_CID_list:
+                         continue
+
+                     writer.write(mol)
+                     valid_mol_count += 1
+
+                 print(f"block id: {block_id}\nfound {valid_mol_count}\n\n")
+                 sys.stdout.flush()
+                 return
+
+             if use_mp:
+                 num_process = multiprocessing.cpu_count()
+                 print(f"{num_process} CPUs")
+                 num_process = 8
+                 p = Pool(num_process)
+
+                 block_id_list = np.arange(self.total_block_num)
+                 with p:
+                     p.map(extract_one_SDF_file, block_id_list)
+             else:
+                 for block_id in range(self.total_block_num):
+                     extract_one_SDF_file(block_id)
+
+         # Step 04. Merge SDF
+         with open(f"{self.raw_dir}/CID2text.json") as f:
+             CID2text = json.load(f)
+         target_CID_list = set(CID2text.keys())
+         print(f'The length of target_CID_list: {len(target_CID_list)}')
+
+         writer = Chem.SDWriter(f'{self.raw_dir}/molecules.sdf')
+
+         found_CID_set = set()
+         for block_id in range(self.total_block_num + 1):
+             compound_file_path = f"{step3_folder}/filtered_{block_id}.sdf"
+             try:
+                 suppl = Chem.SDMolSupplier(compound_file_path)
+
+                 for mol in tqdm(suppl):
+                     writer.write(mol)
+                     cid = mol.GetProp("PUBCHEM_COMPOUND_CID")
+                     found_CID_set.add(cid)
+             except Exception:
+                 print(f"block id: {block_id} with 0 valid SDF file")
+                 continue
+
+         print(f"In total: {len(found_CID_set)} molecules")
+
+         # Step 05. Convert to PyG data format
+         types = {'H': 0, 'C': 1, 'N': 2, 'O': 3, 'F': 4, 'Unknow': 5}
+         bonds = {BT.SINGLE: 0, BT.DOUBLE: 1, BT.TRIPLE: 2, BT.AROMATIC: 3}
+
+         data_list = []
+         # Real data
+         CID2text_file = f'{self.raw_dir}/CID2text.json'
+
+         with open(CID2text_file) as f:
+             CID2text_data = json.load(f)
+
+         suppl = Chem.SDMolSupplier(f'{self.raw_dir}/molecules.sdf')
+
+         llm = LLM(
+             # model_name='lmsys/vicuna-7b-v1.5',
+             model_name='TinyLlama/TinyLlama-1.1B-Chat-v0.1',
+             num_params=1,
+             dtype=torch.bfloat16,
+         )
+         prompt = ("Propose a question regarding the molecule '∼' "
+                   "whose answer is: {}:")
+         for mol in tqdm(suppl):
+             if mol.HasProp('PUBCHEM_COMPOUND_CID'):
+                 CID = mol.GetProp("PUBCHEM_COMPOUND_CID")
+                 CAN_SMILES = mol.GetProp("PUBCHEM_OPENEYE_CAN_SMILES")
+
+                 m: Chem.Mol = Chem.MolFromSmiles(CAN_SMILES)
+                 if m is None:
+                     continue
+                 RDKit_CAN_SMILES = Chem.MolToSmiles(m)
+
+                 ground_truth = CID2text_data[CID][0]
+
+                 instruction = llm.inference([prompt.format(ground_truth)])[0]
+
+                 x: torch.Tensor = torch.tensor([
+                     types[atom.GetSymbol()] if atom.GetSymbol() in types else 5
+                     for atom in m.GetAtoms()  # type: ignore
+                 ])
+                 x = one_hot(x, num_classes=len(types), dtype=torch.float)
+
+                 rows, cols, edge_types = [], [], []
+                 for bond in m.GetBonds():  # type: ignore
+                     i, j = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
+                     edge_types += [bonds[bond.GetBondType()]] * 2
+                     rows += [i, j]
+                     cols += [j, i]
+
+                 edge_index = torch.tensor([rows, cols], dtype=torch.long)
+                 edge_type = torch.tensor(edge_types, dtype=torch.long)
+                 edge_attr = one_hot(edge_type, num_classes=len(bonds))
+
+                 data = Data(
+                     x=x,
+                     edge_index=edge_index,
+                     edge_attr=edge_attr,
+                     smiles=RDKit_CAN_SMILES,
+                     instruction=instruction,
+                     y=ground_truth,
+                 )
+
+                 if self.pre_filter is not None and not self.pre_filter(data):
+                     continue
+                 if self.pre_transform is not None:
+                     data = self.pre_transform(data)
+
+                 data_list.append(data)
+
+         self.save(data_list, self.processed_paths[0])
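
For orientation, a minimal usage sketch of the newly added MoleculeGPTDataset, based only on the constructor arguments visible in the diff above. The root path and the reduced page/block counts are illustrative assumptions; building the dataset from scratch also requires network access to PubChem, the optional rdkit dependency, and the torch_geometric.nn.nlp.LLM helper used in process().

from torch_geometric.datasets import MoleculeGPTDataset

# Placeholder values: fetch one page of PubChem descriptions and one SDF
# block (a block spans 500,000 compound IDs, per block_size in the code).
dataset = MoleculeGPTDataset(
    root='data/molecule_gpt',
    total_page_num=1,
    total_block_num=1,
)

data = dataset[0]
# Each sample is a Data object with x, edge_index and edge_attr, plus the
# string attributes smiles, instruction and y (the extracted description).
print(data.smiles, data.y)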