pyg-nightly 2.7.0.dev20250919__py3-none-any.whl → 2.7.0.dev20250920__py3-none-any.whl

This diff compares two publicly released versions of the package as they appear in their respective public registries; it is provided for informational purposes only.

Note: this version of pyg-nightly has been flagged as a potentially problematic release.

--- pyg_nightly-2.7.0.dev20250919.dist-info/METADATA
+++ pyg_nightly-2.7.0.dev20250920.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pyg-nightly
-Version: 2.7.0.dev20250919
+Version: 2.7.0.dev20250920
 Summary: Graph Neural Network Library for PyTorch
 Keywords: deep-learning,pytorch,geometric-deep-learning,graph-neural-networks,graph-convolutional-networks
 Author-email: Matthias Fey <matthias@pyg.org>
--- pyg_nightly-2.7.0.dev20250919.dist-info/RECORD
+++ pyg_nightly-2.7.0.dev20250920.dist-info/RECORD
@@ -1,4 +1,4 @@
-torch_geometric/__init__.py,sha256=OLGPhTHC1wmAq6rg69s1sbJUFuptQaVPvG0ggAYNOlM,2292
+torch_geometric/__init__.py,sha256=0ij8VxVSK4T5A19Dr05CrBi0vbfTv2d0vTpB73hQsws,2292
 torch_geometric/_compile.py,sha256=9yqMTBKatZPr40WavJz9FjNi7pQj8YZAZOyZmmRGXgc,1351
 torch_geometric/_onnx.py,sha256=ODB_8cwFUiwBUjngXn6-K5HHb7IDul7DDXuuGX7vj_0,8178
 torch_geometric/backend.py,sha256=lVaf7aLoVaB3M-UcByUJ1G4T4FOK6LXAg0CF4W3E8jo,1575
@@ -151,7 +151,7 @@ torch_geometric/datasets/shapenet.py,sha256=tn3HiQQAr6lxHrqxfOVaAtl40guwFYTXWCbS
 torch_geometric/datasets/shrec2016.py,sha256=cTLhctbqE0EUEvKddJFhPzDb1oLKXOth4O_WzsWtyMk,6323
 torch_geometric/datasets/snap_dataset.py,sha256=deJvB6cpIQ3bu_pcWoqgEo1-Kl_NcFi7ZSUci645X0U,9481
 torch_geometric/datasets/suite_sparse.py,sha256=eqjH4vAUq872qdk3YdLkZSwlu6r7HHpTgK0vEVGmY1s,3278
-torch_geometric/datasets/tag_dataset.py,sha256=qTnwr2N1tbWYeLGbItfv70UxQ3n1rKesjeVU3kcOCP8,14757
+torch_geometric/datasets/tag_dataset.py,sha256=jslijGCh37ip2YkrQLyvbk-1QRJ3yqFpmzuQSxckXrE,19402
 torch_geometric/datasets/taobao.py,sha256=CUcZpbWsNTasevflO8zqP0YvENy89P7wpKS4MHaDJ6Q,4170
 torch_geometric/datasets/teeth3ds.py,sha256=hZvhcq9lsQENNFr5hk50w2T3CgxE_tlnQfrCgN6uIDQ,9919
 torch_geometric/datasets/tosca.py,sha256=nUSF8NQT1GlkwWQLshjWmr8xORsvRHzzIqhUyDCvABc,4632
@@ -308,10 +308,10 @@ torch_geometric/loader/prefetch.py,sha256=z30TIcu3_6ZubllUOwNLunlq4RyQdFj36vPE5Q
 torch_geometric/loader/random_node_loader.py,sha256=rCmRXYv70SPxBo-Oh049eFEWEZDV7FmlRPzmjcoirXQ,2196
 torch_geometric/loader/shadow.py,sha256=_hCspYf9SlJYX0lqEjxFec9e9t1iMScNThOoWR1wQGM,4173
 torch_geometric/loader/temporal_dataloader.py,sha256=Z7L_rYdl6SYBQXAgtr18FVcmfMH9kP1fBWrc2W63g2c,2250
-torch_geometric/loader/utils.py,sha256=3hzKzIgB52QIZu7Jdn4JeXZaegIJinIQfIUP9DrUWUQ,14903
+torch_geometric/loader/utils.py,sha256=DgGHK6kNu7ZZIZuaT0Ya_4rUctBMMKyBBSdHhuU389w,14903
 torch_geometric/loader/zip_loader.py,sha256=3lt10fD15Rxm1WhWzypswGzCEwUz4h8OLCD1nE15yNg,3843
 torch_geometric/metrics/__init__.py,sha256=3krvDobW6vV5yHTjq2S2pmOXxNfysNG26muq7z48e94,699
-torch_geometric/metrics/link_pred.py,sha256=1_hE3KiRqAdZLI6QuUbjgyFC__mTyFu_RimM3bD8wRw,31678
+torch_geometric/metrics/link_pred.py,sha256=bacmFGn7rm0iF2wOJdAW-iTZ04bOuiS-7ur2K-MZKlA,31684
 torch_geometric/nn/__init__.py,sha256=tTEKDy4vpjPNKyG1Vg9GIx7dVFJuQtBoh2M19ascGpo,880
 torch_geometric/nn/data_parallel.py,sha256=YiybTWoSFyfSzlXAamZ_-y1f7B6tvDEFHOuy_AyJz9Q,4761
 torch_geometric/nn/encoding.py,sha256=82fpwyOx0-STFSAJ5AzG0p2WFC9u1M4KgmKIql8hSLc,3634
@@ -654,7 +654,7 @@ torch_geometric/utils/undirected.py,sha256=H_nfpI0_WluOG6VfjPyldvcjL4w5USAKWu2x5
 torch_geometric/visualization/__init__.py,sha256=b-HnVesXjyJ_L1N-DnjiRiRVf7lhwKaBQF_2i5YMVSU,208
 torch_geometric/visualization/graph.py,sha256=mfZHXYfiU-CWMtfawYc80IxVwVmtK9hbIkSKhM_j7oI,14311
 torch_geometric/visualization/influence.py,sha256=CWMvuNA_Nf1sfbJmQgn58yS4OFpeKXeZPe7kEuvkUBw,477
-pyg_nightly-2.7.0.dev20250919.dist-info/licenses/LICENSE,sha256=ic-27cMJc1kWoMEYncz3Ya3Ur2Bi3bNLWib2DT763-o,1067
-pyg_nightly-2.7.0.dev20250919.dist-info/WHEEL,sha256=G2gURzTEtmeR8nrdXUJfNiB3VYVxigPQ-bEQujpNiNs,82
-pyg_nightly-2.7.0.dev20250919.dist-info/METADATA,sha256=IfaNYkgI-HE5ar5wS1k5XG9esWAh643uI1uvCOX7ChY,64145
-pyg_nightly-2.7.0.dev20250919.dist-info/RECORD,,
+pyg_nightly-2.7.0.dev20250920.dist-info/licenses/LICENSE,sha256=ic-27cMJc1kWoMEYncz3Ya3Ur2Bi3bNLWib2DT763-o,1067
+pyg_nightly-2.7.0.dev20250920.dist-info/WHEEL,sha256=G2gURzTEtmeR8nrdXUJfNiB3VYVxigPQ-bEQujpNiNs,82
+pyg_nightly-2.7.0.dev20250920.dist-info/METADATA,sha256=PAeahjszlJpaI4WHs-eZPOYELiodtDDAPudxTK4MfTA,64145
+pyg_nightly-2.7.0.dev20250920.dist-info/RECORD,,
--- torch_geometric/__init__.py
+++ torch_geometric/__init__.py
@@ -31,7 +31,7 @@ from .lazy_loader import LazyLoader
 contrib = LazyLoader('contrib', globals(), 'torch_geometric.contrib')
 graphgym = LazyLoader('graphgym', globals(), 'torch_geometric.graphgym')
 
-__version__ = '2.7.0.dev20250919'
+__version__ = '2.7.0.dev20250920'
 
 __all__ = [
     'Index',
--- torch_geometric/datasets/tag_dataset.py
+++ torch_geometric/datasets/tag_dataset.py
@@ -1,3 +1,4 @@
+import csv
 import os
 import os.path as osp
 from collections.abc import Sequence
@@ -10,6 +11,7 @@ from tqdm import tqdm
 
 from torch_geometric.data import InMemoryDataset, download_google_url
 from torch_geometric.data.data import BaseData
+from torch_geometric.io import fs
 
 try:
     from pandas import DataFrame, read_csv
@@ -22,14 +24,16 @@ IndexType = Union[slice, Tensor, np.ndarray, Sequence]
 
 class TAGDataset(InMemoryDataset):
     r"""The Text Attributed Graph datasets from the
-    `"Learning on Large-scale Text-attributed Graphs via Variational Inference
-    " <https://arxiv.org/abs/2210.14709>`_ paper.
+    `"Learning on Large-scale Text-attributed Graphs via Variational Inference"
+    <https://arxiv.org/abs/2210.14709>`_ paper and `"Harnessing Explanations:
+    LLM-to-LM Interpreter for Enhanced Text-Attributed Graph Representation
+    Learning" <https://arxiv.org/abs/2305.19523>`_ paper.
     This dataset is aiming on transform `ogbn products`, `ogbn arxiv`
     into Text Attributed Graph that each node in graph is associate with a
-    raw text, that dataset can be adapt to DataLoader (for LM training) and
-    NeighborLoader(for GNN training). In addition, this class can be use as a
-    wrapper class by convert a InMemoryDataset with Tokenizer and text into
-    Text Attributed Graph.
+    raw text, LLM prediction and explanation, that dataset can be adapt to
+    DataLoader (for LM training) and NeighborLoader(for GNN training).
+    In addition, this class can be use as a wrapper class by convert a
+    InMemoryDataset with Tokenizer and text into Text Attributed Graph.
 
     Args:
         root (str): Root directory where the dataset should be saved.
@@ -51,22 +55,35 @@ class TAGDataset(InMemoryDataset):
             or not, default: False
         force_reload (bool): default: False
     .. note::
-        See `example/llm_plus_gnn/glem.py` for example usage
+        See `example/llm/glem.py` for example usage
     """
     raw_text_id = {
         'ogbn-arxiv': '1g3OOVhRyiyKv13LY6gbp8GLITocOUr_3',
         'ogbn-products': '1I-S176-W4Bm1iPDjQv3hYwQBtxE0v8mt'
     }
 
-    def __init__(self, root: str, dataset: InMemoryDataset,
-                 tokenizer_name: str, text: Optional[List[str]] = None,
-                 split_idx: Optional[Dict[str, Tensor]] = None,
-                 tokenize_batch_size: int = 256, token_on_disk: bool = False,
-                 text_on_disk: bool = False,
-                 force_reload: bool = False) -> None:
+    llm_prediction_url = 'https://github.com/XiaoxinHe/TAPE/raw/main/gpt_preds'
+
+    llm_explanation_id = {
+        'ogbn-arxiv': '1o8n2xRen-N_elF9NQpIca0iCHJgEJbRQ',
+    }
+
+    def __init__(
+        self,
+        root: str,
+        dataset: InMemoryDataset,
+        tokenizer_name: str,
+        text: Optional[List[str]] = None,
+        split_idx: Optional[Dict[str, Tensor]] = None,
+        tokenize_batch_size: int = 256,
+        token_on_disk: bool = False,
+        text_on_disk: bool = False,
+        force_reload: bool = False,
+    ) -> None:
         # list the vars you want to pass in before run download & process
         self.name = dataset.name
         self.text = text
+        self.llm_prediction_topk = 5
         self.tokenizer_name = tokenizer_name
         from transformers import AutoTokenizer
         self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
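As an aside, a minimal sketch of how the extended constructor is driven, modeled on the referenced example/llm/glem.py; the root path and Hugging Face tokenizer name below are illustrative assumptions, not values taken from this diff:

    from ogb.nodeproppred import PygNodePropPredDataset
    from torch_geometric.datasets import TAGDataset

    # Any AutoTokenizer-compatible checkpoint works, since __init__ calls
    # AutoTokenizer.from_pretrained(tokenizer_name) internally.
    dataset = PygNodePropPredDataset('ogbn-arxiv', root='./data')
    tag_dataset = TAGDataset('./data', dataset,
                             tokenizer_name='prajjwal1/bert-tiny',
                             split_idx=dataset.get_idx_split(),
                             token_on_disk=True)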
@@ -93,8 +110,9 @@ class TAGDataset(InMemoryDataset):
                 "is_gold mask, please pass splited index "
                 "in format of dictionaty with 'train', 'valid' "
                 "'test' index tensor to 'split_idx'")
-        if text is not None and text_on_disk:
-            self.save_node_text(text)
+        if text_on_disk:
+            if text is not None:
+                self.save_node_text(text)
         self.text_on_disk = text_on_disk
         # init will call download and process
         super().__init__(self.root, transform=None, pre_transform=None,
@@ -119,6 +137,10 @@ class TAGDataset(InMemoryDataset):
         self.token_on_disk = token_on_disk
         self.tokenize_batch_size = tokenize_batch_size
         self._token = self.tokenize_graph(self.tokenize_batch_size)
+        self._llm_explanation_token = self.tokenize_graph(
+            self.tokenize_batch_size, text_type='llm_explanation')
+        self._all_token = self.tokenize_graph(self.tokenize_batch_size,
+                                              text_type='all')
         self.__num_classes__ = dataset.num_classes
 
     @property
@@ -146,6 +168,19 @@ class TAGDataset(InMemoryDataset):
             self._token = self.tokenize_graph()
         return self._token
 
+    @property
+    def llm_explanation_token(self) -> Dict[str, Tensor]:
+        if self._llm_explanation_token is None:  # lazy load
+            self._llm_explanation_token = self.tokenize_graph(
+                text_type='llm_explanation')
+        return self._llm_explanation_token
+
+    @property
+    def all_token(self) -> Dict[str, Tensor]:
+        if self._all_token is None:  # lazy load
+            self._all_token = self.tokenize_graph(text_type='all')
+        return self._all_token
+
     # load is_gold after init
     @property
     def is_gold(self) -> Tensor:
@@ -194,10 +229,17 @@ class TAGDataset(InMemoryDataset):
                                             folder=f'{self.root}/raw',
                                             filename='node-text.csv.gz',
                                             log=True)
-        text_df = read_csv(raw_text_path)
-        self.text = list(text_df['text'])
+        self.text = list(read_csv(raw_text_path)['text'])
+        print('downloading llm explanations')
+        llm_explanation_path = download_google_url(
+            id=self.llm_explanation_id[self.name], folder=f'{self.root}/raw',
+            filename='node-gpt-response.csv.gz', log=True)
+        self.llm_explanation = list(read_csv(llm_explanation_path)['text'])
+        print('downloading llm predictions')
+        fs.cp(f'{self.llm_prediction_url}/{self.name}.csv', self.raw_dir)
 
     def process(self) -> None:
+        # process Title and Abstraction
         if osp.exists(osp.join(self.root, 'raw', 'node-text.csv.gz')):
             text_df = read_csv(osp.join(self.root, 'raw', 'node-text.csv.gz'))
             self.text = list(text_df['text'])
@@ -212,6 +254,42 @@ class TAGDataset(InMemoryDataset):
                 "The raw text of each node is not specified"
                 "Please pass in 'text' when convert your dataset "
                 "to Text Attribute Graph Dataset")
+        # process LLM explanation and prediction
+        llm_explanation_path = f'{self.raw_dir}/node-gpt-response.csv.gz'
+        llm_prediction_path = f'{self.raw_dir}/{self.name}.csv'
+        if osp.exists(llm_explanation_path) and osp.exists(
+                llm_prediction_path):
+            # load LLM explanation
+            self.llm_explanation = list(read_csv(llm_explanation_path)['text'])
+            # load LLM prediction
+            preds = []
+            with open(llm_prediction_path) as file:
+                reader = csv.reader(file)
+                for row in reader:
+                    inner_list = []
+                    for value in row:
+                        inner_list.append(int(value))
+                    preds.append(inner_list)
+
+            pl = torch.zeros(len(preds), self.llm_prediction_topk,
+                             dtype=torch.long)
+            for i, pred in enumerate(preds):
+                pl[i][:len(pred)] = torch.tensor(
+                    pred[:self.llm_prediction_topk], dtype=torch.long) + 1
+        elif self.name in self.llm_explanation_id:
+            self.download()
+        else:
+            print(
+                'The dataset is not ogbn-arxiv,'
+                'please pass in your llm explanation list to `llm_explanation`'
+                'and llm prediction list to `llm_prediction`')
+        if self.llm_explanation is None or pl is None:
+            raise ValueError(
+                "The TAGDataset only have ogbn-arxiv LLM explanations"
+                "and predictions in default. The llm explanation and"
+                "prediction of each node is not specified."
+                "Please pass in 'llm_explanation' and 'llm_prediction' when"
+                "convert your dataset to Text Attribute Graph Dataset")
 
     def save_node_text(self, text: List[str]) -> None:
         node_text_path = osp.join(self.root, 'raw', 'node-text.csv.gz')
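The new prediction-loading branch above pads each node's variable-length top-k label list into a fixed (num_nodes, llm_prediction_topk) tensor, shifting every label by +1 so that 0 stays reserved as the padding value. A toy illustration of the same padding scheme (made-up label lists, not dataset content):

    import torch

    preds = [[3, 7], [1], [4, 2, 9]]  # toy per-node top-k LLM label lists
    topk = 5
    pl = torch.zeros(len(preds), topk, dtype=torch.long)
    for i, pred in enumerate(preds):
        # +1 shift keeps 0 free to act as padding
        pl[i][:len(pred)] = torch.tensor(pred[:topk], dtype=torch.long) + 1
    # pl == tensor([[ 4,  8,  0,  0,  0],
    #               [ 2,  0,  0,  0,  0],
    #               [ 5,  3, 10,  0,  0]])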
@@ -224,22 +302,39 @@ class TAGDataset(InMemoryDataset):
         text_df.to_csv(osp.join(node_text_path), compression='gzip',
                        index=False)
 
-    def tokenize_graph(self, batch_size: int = 256) -> Dict[str, Tensor]:
+    def tokenize_graph(self, batch_size: int = 256,
+                       text_type: str = 'raw_text') -> Dict[str, Tensor]:
         r"""Tokenizing the text associate with each node, running in cpu.
 
         Args:
             batch_size (Optional[int]): batch size of list of text for
                 generating emebdding
+            text_type (Optional[str]): type of text
         Returns:
             Dict[str, torch.Tensor]: tokenized graph
         """
+        assert text_type in ['raw_text', 'llm_explanation', 'all']
+        if text_type == 'raw_text':
+            _text = self.text
+        elif text_type == 'llm_explanation':
+            _text = self.llm_explanation
+        elif text_type == 'all':
+            if self.text is None or self.llm_explanation is None:
+                raise ValueError("The TAGDataset need text and llm explanation"
+                                 "for tokenizing all text")
+            _text = [
+                f'{raw_txt} Explanation: {exp_txt}'
+                for raw_txt, exp_txt in zip(self.text, self.llm_explanation)
+            ]
+
         data_len = 0
-        if self.text is not None:
-            data_len = len(self.text)
+        if _text is not None:
+            data_len = len(_text)
         else:
             raise ValueError("The TAGDataset need text for tokenization")
         token_keys = ['input_ids', 'token_type_ids', 'attention_mask']
-        path = os.path.join(self.processed_dir, 'token', self.tokenizer_name)
+        path = os.path.join(self.processed_dir, 'token', text_type,
+                            self.tokenizer_name)
         # Check if the .pt files already exist
         token_files_exist = any(
             os.path.exists(os.path.join(path, f'{k}.pt')) for k in token_keys)
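For text_type='all', the hunk above joins raw text and LLM explanation into one string per node before tokenization, and caches tokens under a per-text_type subdirectory so the three variants do not overwrite each other. A toy illustration of the join (placeholder strings, not dataset content):

    text = ['Title: Example paper. Abstract: ...']
    llm_explanation = ['Likely cs.LG, because ...']
    _text = [
        f'{raw_txt} Explanation: {exp_txt}'
        for raw_txt, exp_txt in zip(text, llm_explanation)
    ]
    # -> ['Title: Example paper. Abstract: ... Explanation: Likely cs.LG, because ...']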
@@ -256,12 +351,12 @@ class TAGDataset(InMemoryDataset):
         all_encoded_token = {k: [] for k in token_keys}
         pbar = tqdm(total=data_len)
 
-        pbar.set_description('Tokenizing Text Attributed Graph')
+        pbar.set_description(f'Tokenizing Text Attributed Graph {text_type}')
         for i in range(0, data_len, batch_size):
             end_index = min(data_len, i + batch_size)
-            token = self.tokenizer(self.text[i:min(i + batch_size, data_len)],
-                                   padding='max_length', truncation=True,
-                                   max_length=512, return_tensors="pt")
+            token = self.tokenizer(_text[i:end_index], padding='max_length',
+                                   truncation=True, max_length=512,
+                                   return_tensors="pt")
             for k in token.keys():
                 all_encoded_token[k].append(token[k])
             pbar.update(end_index - i)
@@ -289,10 +384,18 @@ class TAGDataset(InMemoryDataset):
 
         Args:
             tag_dataset (TAGDataset): the parent dataset
+            text_type (str): type of text
         """
-        def __init__(self, tag_dataset: 'TAGDataset') -> None:
+        def __init__(self, tag_dataset: 'TAGDataset',
+                     text_type: str = 'raw_text') -> None:
+            assert text_type in ['raw_text', 'llm_explanation', 'all']
             self.tag_dataset = tag_dataset
-            self.token = tag_dataset.token
+            if text_type == 'raw_text':
+                self.token = tag_dataset.token
+            elif text_type == 'llm_explanation':
+                self.token = tag_dataset.llm_explanation_token
+            elif text_type == 'all':
+                self.token = tag_dataset.all_token
             assert tag_dataset._data is not None
             self._data = tag_dataset._data
 
@@ -312,7 +415,8 @@ class TAGDataset(InMemoryDataset):
 
         # for LM training
         def __getitem__(
-            self, node_id: IndexType
+            self,
+            node_id: IndexType,
         ) -> Dict[str, Union[Tensor, Dict[str, Tensor]]]:
             r"""This function will override the function in
             torch.utils.data.Dataset, and will be called when you
@@ -343,8 +447,8 @@ class TAGDataset(InMemoryDataset):
         def __repr__(self) -> str:
             return f'{self.__class__.__name__}()'
 
-    def to_text_dataset(self) -> TextDataset:
+    def to_text_dataset(self, text_type: str = 'raw_text') -> TextDataset:
         r"""Factory Build text dataset from Text Attributed Graph Dataset
         each data point is node's associated text token.
         """
-        return TAGDataset.TextDataset(self)
+        return TAGDataset.TextDataset(self, text_type)
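Continuing the constructor sketch above, each text variant can then be materialized as its own LM-ready dataset (hedged usage; names as introduced in this diff):

    raw_ds = tag_dataset.to_text_dataset()  # raw text tokens
    exp_ds = tag_dataset.to_text_dataset(text_type='llm_explanation')
    all_ds = tag_dataset.to_text_dataset(text_type='all')  # 'raw Explanation: exp'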
--- torch_geometric/loader/utils.py
+++ torch_geometric/loader/utils.py
@@ -256,14 +256,6 @@ def filter_custom_hetero_store(
     # Construct a new `HeteroData` object:
     data = custom_cls() if custom_cls is not None else HeteroData()
 
-    # Filter edge storage:
-    # TODO support edge attributes
-    for attr in graph_store.get_all_edge_attrs():
-        key = attr.edge_type
-        if key in row_dict and key in col_dict:
-            edge_index = torch.stack([row_dict[key], col_dict[key]], dim=0)
-            data[attr.edge_type].edge_index = edge_index
-
     # Filter node storage:
     required_attrs = []
     for attr in feature_store.get_all_tensor_attrs():
@@ -280,6 +272,14 @@ def filter_custom_hetero_store(
     for i, attr in enumerate(required_attrs):
         data[attr.group_name][attr.attr_name] = tensors[i]
 
+    # Filter edge storage:
+    # TODO support edge attributes
+    for attr in graph_store.get_all_edge_attrs():
+        key = attr.edge_type
+        if key in row_dict and key in col_dict:
+            edge_index = torch.stack([row_dict[key], col_dict[key]], dim=0)
+            data[attr.edge_type].edge_index = edge_index
+
     return data
 
 
--- torch_geometric/metrics/link_pred.py
+++ torch_geometric/metrics/link_pred.py
@@ -53,7 +53,7 @@ class LinkPredMetricData:
 
         # Flatten both prediction and ground-truth indices, and determine
         # overlaps afterwards via `torch.searchsorted`.
-        max_index = max(  # type: ignore
+        max_index = max(
             self.pred_index_mat.max()
             if self.pred_index_mat.numel() > 0 else 0,
             self.edge_label_index[1].max()
@@ -820,9 +820,10 @@ class LinkPredPersonalization(_LinkPredMetric):
         right = pred[col.cpu()].to(device)
 
         # Use offset to work around applying `isin` along a specific dim:
-        i = max(left.max(), right.max()) + 1  # type: ignore
-        i = torch.arange(0, i * row.size(0), i, device=device).view(-1, 1)
-        isin = torch.isin(left + i, right + i)
+        i = max(int(left.max()), int(right.max())) + 1
+        idx = torch.arange(0, i * row.size(0), i, device=device)
+        idx = idx.view(-1, 1)
+        isin = torch.isin(left + idx, right + idx)
 
         # Compute personalization via average inverse cosine similarity:
         cos = isin.sum(dim=-1) / pred.size(1)
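The offset trick in this last hunk turns a per-row membership test into a single flat torch.isin call: adding a distinct offset to every row shifts each row's values into a disjoint range, so values from different rows can never collide. A standalone sketch with small non-negative integer IDs (toy tensors, not the metric's real inputs):

    import torch

    left = torch.tensor([[0, 2], [1, 3]])
    right = torch.tensor([[2, 5], [0, 3]])

    # The offset must exceed every value so each row occupies its own range:
    i = max(int(left.max()), int(right.max())) + 1
    idx = torch.arange(0, i * left.size(0), i).view(-1, 1)
    isin = torch.isin(left + idx, right + idx)
    # Row 0: which of {0, 2} are in {2, 5}?  -> [False, True]
    # Row 1: which of {1, 3} are in {0, 3}?  -> [False, True]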