pyg-nightly 2.7.0.dev20250905__py3-none-any.whl → 2.7.0.dev20250906__py3-none-any.whl
This diff compares the contents of two publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registry.
- {pyg_nightly-2.7.0.dev20250905.dist-info → pyg_nightly-2.7.0.dev20250906.dist-info}/METADATA +2 -1
- {pyg_nightly-2.7.0.dev20250905.dist-info → pyg_nightly-2.7.0.dev20250906.dist-info}/RECORD +32 -25
- torch_geometric/__init__.py +1 -1
- torch_geometric/data/__init__.py +0 -5
- torch_geometric/data/lightning/datamodule.py +2 -2
- torch_geometric/datasets/molecule_gpt_dataset.py +1 -1
- torch_geometric/datasets/web_qsp_dataset.py +262 -210
- torch_geometric/graphgym/imports.py +2 -2
- torch_geometric/llm/__init__.py +9 -0
- torch_geometric/{data → llm}/large_graph_indexer.py +124 -61
- torch_geometric/llm/models/__init__.py +23 -0
- torch_geometric/{nn → llm}/models/g_retriever.py +68 -49
- torch_geometric/{nn → llm}/models/git_mol.py +1 -1
- torch_geometric/{nn/nlp → llm/models}/llm.py +167 -33
- torch_geometric/llm/models/llm_judge.py +158 -0
- torch_geometric/{nn → llm}/models/molecule_gpt.py +1 -1
- torch_geometric/{nn/nlp → llm/models}/sentence_transformer.py +42 -8
- torch_geometric/llm/models/txt2kg.py +353 -0
- torch_geometric/llm/rag_loader.py +154 -0
- torch_geometric/llm/utils/backend_utils.py +442 -0
- torch_geometric/llm/utils/feature_store.py +169 -0
- torch_geometric/llm/utils/graph_store.py +199 -0
- torch_geometric/llm/utils/vectorrag.py +124 -0
- torch_geometric/loader/__init__.py +0 -4
- torch_geometric/nn/__init__.py +0 -1
- torch_geometric/nn/models/__init__.py +0 -10
- torch_geometric/nn/models/sgformer.py +2 -0
- torch_geometric/loader/rag_loader.py +0 -107
- torch_geometric/nn/nlp/__init__.py +0 -9
- {pyg_nightly-2.7.0.dev20250905.dist-info → pyg_nightly-2.7.0.dev20250906.dist-info}/WHEEL +0 -0
- {pyg_nightly-2.7.0.dev20250905.dist-info → pyg_nightly-2.7.0.dev20250906.dist-info}/licenses/LICENSE +0 -0
- /torch_geometric/{nn → llm}/models/glem.py +0 -0
- /torch_geometric/{nn → llm}/models/protein_mpnn.py +0 -0
- /torch_geometric/{nn/nlp → llm/models}/vision_transformer.py +0 -0
torch_geometric/{data → llm}/large_graph_indexer.py:

```diff
@@ -2,7 +2,7 @@ import os
 import pickle as pkl
 import shutil
 from dataclasses import dataclass
-from itertools import chain
+from itertools import chain, islice, tee
 from typing import (
     Any,
     Callable,
@@ -37,15 +37,15 @@ def ordered_set(values: Iterable[str]) -> List[str]:
 
 # TODO: Refactor Node and Edge funcs and attrs to be accessible via an Enum?
 
-NODE_PID = "pid"
+NODE_PID = "pid"  # Encodes node id
 
 NODE_KEYS = {NODE_PID}
 
-EDGE_PID = "e_pid"
-EDGE_HEAD = "h"
-EDGE_RELATION = "r"
-EDGE_TAIL = "t"
-EDGE_INDEX = "edge_idx"
+EDGE_PID = "e_pid"  # Encodes source node, relation, destination node
+EDGE_HEAD = "h"  # Encodes source node
+EDGE_RELATION = "r"  # Encodes relation
+EDGE_TAIL = "t"  # Encodes destination node
+EDGE_INDEX = "edge_idx"  # Encodes source node, destination node
 
 EDGE_KEYS = {EDGE_PID, EDGE_HEAD, EDGE_RELATION, EDGE_TAIL, EDGE_INDEX}
 
@@ -88,6 +88,7 @@ class LargeGraphIndexer:
     Args:
         nodes (Iterable[str]): Node ids in the graph.
        edges (KnowledgeGraphLike): Edge ids in the graph.
+            Example: [("cats", "eat", "dogs")]
         node_attr (Optional[Dict[str, List[Any]]], optional): Mapping node
             attribute name and list of their values in order of unique node
             ids. Defaults to None.
@@ -148,7 +149,6 @@ class LargeGraphIndexer:
             self.edge_attr[EDGE_TAIL].append(t)
             self.edge_attr[EDGE_INDEX].append(
                 (self._nodes[h], self._nodes[t]))
-
         for i, tup in enumerate(edges):
             self._edges[tup] = i
 
@@ -164,7 +164,8 @@ class LargeGraphIndexer:
 
         Args:
             triplets (KnowledgeGraphLike): Series of triplets representing
-                knowledge graph relations.
+                knowledge graph relations. Example: [("cats", "eat", "dogs")].
+                Note: Please ensure triplets are unique.
             pre_transform (Optional[Callable[[TripletLike], TripletLike]]):
                 Optional preprocessing function to apply to triplets.
                 Defaults to None.
@@ -173,8 +174,8 @@ class LargeGraphIndexer:
             LargeGraphIndexer: Index of unique nodes and edges.
         """
         # NOTE: Right now assumes that all trips can be loaded into memory
-        nodes =
-        edges =
+        nodes = []
+        edges = []
 
         if pre_transform is not None:
 
@@ -183,16 +184,17 @@ class LargeGraphIndexer:
                 for trip in trips:
                     yield pre_transform(trip)
 
-            triplets = apply_transform(triplets)
+            triplets = list(apply_transform(triplets))
 
        for h, r, t in triplets:
 
            for node in (h, t):
-                nodes.
+                nodes.append(node)
 
            edge_idx = (h, r, t)
-            edges.
-
+            edges.append(edge_idx)
+        nodes = ordered_set(nodes)
+        edges = ordered_set(edges)
         return cls(list(nodes), list(edges))
 
     @classmethod
```
torch_geometric/{data → llm}/large_graph_indexer.py (continued):

```diff
@@ -291,13 +293,12 @@ class LargeGraphIndexer:
             values = self.node_attr[feature_name].values
         else:
             values = self.node_attr[feature_name]
-
         # TODO: torch_geometric.utils.select
         if isinstance(values, torch.Tensor):
             idxs = list(
                 self.get_node_features_iter(feature_name, pids,
                                             index_only=True))
-            return values[torch.tensor(idxs)]
+            return values[torch.tensor(idxs).long()]
         return list(self.get_node_features_iter(feature_name, pids))
 
     def get_node_features_iter(
@@ -421,7 +422,7 @@ class LargeGraphIndexer:
             idxs = list(
                 self.get_edge_features_iter(feature_name, pids,
                                             index_only=True))
-            return values[torch.tensor(idxs)]
+            return values[torch.tensor(idxs).long()]
         return list(self.get_edge_features_iter(feature_name, pids))
 
     def get_edge_features_iter(
@@ -532,7 +533,6 @@ class LargeGraphIndexer:
         """
         x = torch.Tensor(self.get_node_features(node_feature_name))
         node_id = torch.LongTensor(range(len(x)))
-
         edge_index = torch.t(
             torch.LongTensor(self.get_edge_features(EDGE_INDEX)))
 
@@ -572,8 +572,10 @@ def get_features_for_triplets_groups(
     triplet_groups: Iterable[KnowledgeGraphLike],
     node_feature_name: str = "x",
     edge_feature_name: str = "edge_attr",
-    pre_transform:
+    pre_transform: Callable[[TripletLike], TripletLike] = lambda trip: trip,
     verbose: bool = False,
+    max_batch_size: int = 250,
+    num_workers: Optional[int] = None,
 ) -> Iterator[Data]:
     """Given an indexer and a series of triplet groups (like a dataset),
     retrieve the specified node and edge features for each triplet from the
@@ -587,62 +589,123 @@ def get_features_for_triplets_groups(
             Defaults to "x".
         edge_feature_name (str, optional): edge feature to fetch.
             Defaults to "edge_attr".
-        pre_transform (
+        pre_transform (Callable[[TripletLike], TripletLike]):
             Optional preprocessing to perform on triplets.
             Defaults to None.
-        verbose (bool, optional): Whether to print progress.
+        verbose (bool, optional): Whether to print progress.
+            Defaults to False.
+        max_batch_size (int, optional):
+            Maximum batch size for fetching features.
+            Defaults to 250.
+        num_workers (int, optional):
+            Number of workers to use for fetching features.
+            Defaults to None (all available).
 
     Yields:
         Iterator[Data]: For each triplet group, yield a data object containing
         the unique graph and features from the index.
     """
-
+    def apply_transform(trips: Iterable[TripletLike]) -> Iterator[TripletLike]:
+        for trip in trips:
+            yield pre_transform(tuple(trip))
 
-
-
-
-
-
-        triplet_groups = (list(apply_transform(triplets))
-                          for triplets in triplet_groups)
+    # Carefully trying to avoid loading all triplets into memory at once
+    # While also still tracking the number of elements for tqdm
+    triplet_groups: List[Iterator[TripletLike]] = [
+        apply_transform(triplets) for triplets in triplet_groups
+    ]
 
     node_keys = []
     edge_keys = []
     edge_index = []
+    """
+    For each KG, we gather the node_indices, edge_keys,
+    and edge_indices needed to construct each Data object
+    """
 
-    for
+    for kg_triplets in tqdm(triplet_groups, disable=not verbose):
+        kg_triplets_nodes, kg_triplets_edge_keys, kg_triplets_edge_index = tee(
+            kg_triplets, 3)
+        """
+        Don't apply pre_transform here,
+        because it has already been applied on the triplet groups/
+        """
         small_graph_indexer = LargeGraphIndexer.from_triplets(
-
+            kg_triplets_nodes)
 
         node_keys.append(small_graph_indexer.get_node_features())
-        edge_keys.append(
+        edge_keys.append(
+            small_graph_indexer.get_edge_features(pids=kg_triplets_edge_keys))
         edge_index.append(
-            small_graph_indexer.get_edge_features(
-
-
-
-
-
-
-
-
-
-
-
-
-        edge_attr = torch.Tensor(edge_feats[last_edge_idx:last_edge_idx +
-                                            elen])
-        last_edge_idx += len(ekeys)
-
-        edge_idx = torch.LongTensor(eidx).T
-
-        data_obj = Data(x=x, edge_attr=edge_attr, edge_index=edge_idx)
-        data_obj[NODE_PID] = node_keys
-        data_obj[EDGE_PID] = edge_keys
-        data_obj["node_idx"] = [indexer._nodes[k] for k in nkeys]
-        data_obj["edge_idx"] = [indexer._edges[e] for e in ekeys]
+            small_graph_indexer.get_edge_features(
+                EDGE_INDEX,
+                kg_triplets_edge_index,
+            ))
+    """
+    We get the embeddings for each node and edge key in the KG,
+    but we need to do so in batches.
+    Batches that are too small waste compute time,
+    as each call to get features has an upfront cost.
+    Batches that are too large waste memory,
+    as we need to store all the result embeddings in memory.
+    """
 
-
+    def _fetch_feature_batch(batches):
+        node_key_batch, edge_key_batch, edge_index_batch = batches
+        node_feats = indexer.get_node_features(
+            feature_name=node_feature_name,
+            pids=chain.from_iterable(node_key_batch))
+        edge_feats = indexer.get_edge_features(
+            feature_name=edge_feature_name,
+            pids=chain.from_iterable(edge_key_batch))
+
+        last_node_idx, last_edge_idx = 0, 0
+        for (nkeys, ekeys, eidx) in zip(node_key_batch, edge_key_batch,
+                                        edge_index_batch):
+            nlen, elen = len(nkeys), len(ekeys)
+            x = torch.Tensor(node_feats[last_node_idx:last_node_idx + nlen])
+            last_node_idx += len(nkeys)
+
+            edge_attr = torch.Tensor(edge_feats[last_edge_idx:last_edge_idx +
+                                                elen])
+            last_edge_idx += len(ekeys)
+
+            edge_idx = torch.LongTensor(eidx).T
+
+            data_obj = Data(x=x, edge_attr=edge_attr, edge_index=edge_idx)
+            data_obj[NODE_PID] = node_keys
+            data_obj[EDGE_PID] = edge_keys
+            data_obj["node_idx"] = [indexer._nodes[k] for k in nkeys]
+            data_obj["edge_idx"] = [indexer._edges[e] for e in ekeys]
+
+            yield data_obj
+
+    # NOTE: Backport of itertools.batched from Python 3.12
+    def batched(iterable, n, *, strict=False):
+        # batched('ABCDEFG', 3) → ABC DEF G
+        if n < 1:
+            raise ValueError('n must be at least one')
+        iterator = iter(iterable)
+        while batch := tuple(islice(iterator, n)):
+            if strict and len(batch) != n:
+                raise ValueError('batched(): incomplete batch')
+            yield batch
+
+    import multiprocessing as mp
+    import multiprocessing.pool as mpp
+    num_workers = num_workers if num_workers is not None else mp.cpu_count()
+    ideal_batch_size = min(max_batch_size,
+                           max(1,
+                               len(triplet_groups) // num_workers))
+
+    node_key_batches = batched(node_keys, ideal_batch_size)
+    edge_key_batches = batched(edge_keys, ideal_batch_size)
+    edge_index_batches = batched(edge_index, ideal_batch_size)
+    batches = zip(node_key_batches, edge_key_batches, edge_index_batches)
+
+    with mpp.ThreadPool() as pool:
+        result = pool.map(_fetch_feature_batch, batches)
+        yield from chain.from_iterable(result)
 
 
 def get_features_for_triplets(
```
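The rewritten `get_features_for_triplets_groups` now chunks the per-group feature lookups with the `itertools.batched` backport shown above and fans the chunks out over a thread pool. As a standalone illustration of the chunking behaviour, here is a copy of that backported helper run on the example from its own comment:

```python
# Standalone copy of the backported helper above, for illustration only.
from itertools import islice


def batched(iterable, n, *, strict=False):
    # batched('ABCDEFG', 3) → ABC DEF G
    if n < 1:
        raise ValueError('n must be at least one')
    iterator = iter(iterable)
    while batch := tuple(islice(iterator, n)):
        if strict and len(batch) != n:
            raise ValueError('batched(): incomplete batch')
        yield batch


print(list(batched('ABCDEFG', 3)))
# [('A', 'B', 'C'), ('D', 'E', 'F'), ('G',)]
```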
torch_geometric/{data → llm}/large_graph_indexer.py (continued):

```diff
@@ -650,7 +713,7 @@ def get_features_for_triplets(
     triplets: KnowledgeGraphLike,
     node_feature_name: str = "x",
     edge_feature_name: str = "edge_attr",
-    pre_transform:
+    pre_transform: Callable[[TripletLike], TripletLike] = lambda trip: trip,
     verbose: bool = False,
 ) -> Data:
     """For a given set of triplets retrieve a Data object containing the
@@ -663,7 +726,7 @@ def get_features_for_triplets(
             Defaults to "x".
         edge_feature_name (str, optional): Feature to use for edge features.
             Defaults to "edge_attr".
-        pre_transform (
+        pre_transform (Callable[[TripletLike], TripletLike]):
             Optional preprocessing function for triplets. Defaults to None.
         verbose (bool, optional): Whether to print progress. Defaults to False.
 
@@ -674,5 +737,5 @@ def get_features_for_triplets(
     gen = get_features_for_triplets_groups(indexer, [triplets],
                                            node_feature_name,
                                            edge_feature_name, pre_transform,
-                                           verbose)
+                                           verbose, max_batch_size=1)
     return next(gen)
```
torch_geometric/llm/models/__init__.py (new file):

```diff
@@ -0,0 +1,23 @@
+from .sentence_transformer import SentenceTransformer
+from .vision_transformer import VisionTransformer
+from .llm import LLM
+from .txt2kg import TXT2KG
+from .llm_judge import LLMJudge
+from .g_retriever import GRetriever
+from .molecule_gpt import MoleculeGPT
+from .glem import GLEM
+from .protein_mpnn import ProteinMPNN
+from .git_mol import GITMol
+
+__all__ = [
+    'SentenceTransformer',
+    'VisionTransformer',
+    'LLM',
+    'LLMJudge',
+    'TXT2KG',
+    'GRetriever',
+    'MoleculeGPT',
+    'GLEM',
+    'ProteinMPNN',
+    'GITMol',
+]
```
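The new sub-package gathers the GNN+LLM models that previously lived under `torch_geometric.nn.models` and `torch_geometric.nn.nlp`. A hedged import sketch; only the import surface is shown here, since constructor arguments are not part of this diff:

```python
# Assumes pyg-nightly 2.7.0.dev20250906 is installed.
from torch_geometric.llm.models import LLM, GRetriever, SentenceTransformer, TXT2KG
```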
torch_geometric/{nn → llm}/models/g_retriever.py:

```diff
@@ -3,7 +3,7 @@ from typing import List, Optional
 import torch
 from torch import Tensor
 
-from torch_geometric.
+from torch_geometric.llm.models.llm import LLM, MAX_NEW_TOKENS
 from torch_geometric.utils import scatter
 
 
@@ -19,8 +19,6 @@ class GRetriever(torch.nn.Module):
             :obj:`peft` for training the LLM, see
             `here <https://huggingface.co/docs/peft/en/index>`_ for details.
             (default: :obj:`False`)
-        mlp_out_channels (int, optional): The size of each graph embedding
-            after projection. (default: :obj:`4096`)
         mlp_out_tokens (int, optional): Number of LLM prefix tokens to
             reserve for GNN output. (default: :obj:`1`)
 
@@ -42,15 +40,14 @@ class GRetriever(torch.nn.Module):
     def __init__(
         self,
         llm: LLM,
-        gnn: torch.nn.Module,
+        gnn: torch.nn.Module = None,
         use_lora: bool = False,
-        mlp_out_channels: int = 4096,
         mlp_out_tokens: int = 1,
     ) -> None:
         super().__init__()
 
         self.llm = llm
-        self.gnn = gnn.to(self.llm.device)
+        self.gnn = gnn.to(self.llm.device) if gnn is not None else None
 
         self.word_embedding = self.llm.word_embedding
         self.llm_generator = self.llm.llm
@@ -76,14 +73,18 @@ class GRetriever(torch.nn.Module):
             )
             self.llm_generator = get_peft_model(self.llm_generator, config)
 
-
-
-
-            torch.nn.
-
-
-
-
+        if self.gnn is not None:
+            mlp_out_channels = llm.word_embedding.embedding_dim
+            mlp_hidden_channels = self.gnn.out_channels
+            self.projector = torch.nn.Sequential(
+                torch.nn.Linear(mlp_hidden_channels, mlp_hidden_channels),
+                torch.nn.Sigmoid(),
+                torch.nn.Linear(mlp_hidden_channels,
+                                mlp_out_channels * mlp_out_tokens),
+                torch.nn.Unflatten(-1, (mlp_out_tokens, mlp_out_channels)),
+            ).to(self.llm.device)
+
+        self.seq_length_stats = []
 
     def encode(
         self,
```
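The projector is now built only when a GNN is supplied, and its output width is taken from the LLM itself (`word_embedding.embedding_dim`) instead of the removed `mlp_out_channels` argument. A self-contained sketch of the shape logic, with made-up dimensions standing in for `gnn.out_channels` and the LLM embedding size:

```python
# Dimensions are illustrative; the real values come from gnn.out_channels and
# llm.word_embedding.embedding_dim as in the diff above.
import torch

hidden, embed_dim, out_tokens = 1024, 4096, 2
projector = torch.nn.Sequential(
    torch.nn.Linear(hidden, hidden),
    torch.nn.Sigmoid(),
    torch.nn.Linear(hidden, embed_dim * out_tokens),
    torch.nn.Unflatten(-1, (out_tokens, embed_dim)),
)
graph_emb = torch.randn(8, hidden)  # one pooled GNN embedding per graph
print(projector(graph_emb).shape)   # torch.Size([8, 2, 4096])
```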
torch_geometric/{nn → llm}/models/g_retriever.py (continued):

```diff
@@ -98,7 +99,16 @@ class GRetriever(torch.nn.Module):
         edge_attr = edge_attr.to(self.llm.device)
         batch = batch.to(self.llm.device)
 
-
+        model_specific_kwargs = {}
+
+        # duck typing for SGFormer to get around circular import
+        if (hasattr(self.gnn, 'trans_conv')
+                and hasattr(self.gnn, 'graph_conv')):
+            model_specific_kwargs['batch'] = batch
+        else:
+            model_specific_kwargs['edge_attr'] = edge_attr
+
+        out = self.gnn(x, edge_index, **model_specific_kwargs)
         return scatter(out, batch, dim=0, reduce='mean')
 
     def forward(
```
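`encode` now routes keyword arguments by duck typing: SGFormer-style GNNs (which expose `trans_conv` and `graph_conv`) receive `batch`, while every other GNN keeps receiving `edge_attr`. A toy illustration of the dispatch, using a hypothetical stand-in module that is not part of the diff:

```python
# `PlainGNN` is a hypothetical stand-in, not part of the diff.
import torch


class PlainGNN(torch.nn.Module):
    def forward(self, x, edge_index, edge_attr=None):
        return x


gnn = PlainGNN()
kwargs = {}
if hasattr(gnn, 'trans_conv') and hasattr(gnn, 'graph_conv'):
    kwargs['batch'] = torch.zeros(4, dtype=torch.long)  # SGFormer-style path
else:
    kwargs['edge_attr'] = None  # generic GNN path (taken here)
out = gnn(torch.randn(4, 16), torch.empty(2, 0, dtype=torch.long), **kwargs)
print(out.shape)  # torch.Size([4, 16])
```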
torch_geometric/{nn → llm}/models/g_retriever.py (continued):

```diff
@@ -127,27 +137,32 @@ class GRetriever(torch.nn.Module):
             to give to the LLM, such as textified knowledge graphs.
             (default: :obj:`None`)
         """
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        xs = None
+        if self.gnn is not None:
+            x = self.encode(x, edge_index, batch, edge_attr)
+            x = self.projector(x)
+            xs = x.split(1, dim=0)
+
+            # Handle case where theres more than one embedding for each sample
+            xs = [x.squeeze(0) for x in xs]
+
+            # Handle questions without node features:
+            batch_unique = batch.unique()
+            batch_size = len(question)
+            if len(batch_unique) < batch_size:
+                xs = [
+                    xs[i] if i in batch_unique else None
+                    for i in range(batch_size)
+                ]
         (
             inputs_embeds,
             attention_mask,
             label_input_ids,
         ) = self.llm._get_embeds(question, additional_text_context, xs, label)
 
+        max_seq_len = inputs_embeds.size(1)
+        self.seq_length_stats.append(max_seq_len)
+
         with self.llm.autocast_context:
             outputs = self.llm_generator(
                 inputs_embeds=inputs_embeds,
@@ -186,35 +201,39 @@ class GRetriever(torch.nn.Module):
         max_out_tokens (int, optional): How many tokens for the LLM to
             generate. (default: :obj:`32`)
         """
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        xs = None
+        if self.gnn is not None:
+            x = self.encode(x, edge_index, batch, edge_attr)
+            x = self.projector(x)
+            xs = x.split(1, dim=0)
+
+            # Handle case where theres more than one embedding for each sample
+            xs = [x.squeeze(0) for x in xs]
+
+            # Handle questions without node features:
+            batch_unique = batch.unique()
+            batch_size = len(question)
+            if len(batch_unique) < batch_size:
+                xs = [
+                    xs[i] if i in batch_unique else None
+                    for i in range(batch_size)
+                ]
 
         inputs_embeds, attention_mask, _ = self.llm._get_embeds(
             question, additional_text_context, xs)
 
-        bos_token = self.llm.tokenizer(
-
-
-        ).input_ids[0]
+        # bos_token = self.llm.tokenizer(
+        #     self.llm.tokenizer.bos_token_id,
+        #     add_special_tokens=False,
+        # ).input_ids[0]
 
         with self.llm.autocast_context:
             outputs = self.llm_generator.generate(
                 inputs_embeds=inputs_embeds,
                 max_new_tokens=max_out_tokens,
                 attention_mask=attention_mask,
-                bos_token_id=
+                bos_token_id=self.llm.tokenizer.bos_token_id,
+                pad_token_id=self.llm.tokenizer.eos_token_id,
                 use_cache=True  # Important to set!
             )
 
```
torch_geometric/{nn → llm}/models/git_mol.py:

```diff
@@ -5,8 +5,8 @@ import torch.nn.functional as F
 from torch import Tensor
 from torch.nn import BatchNorm1d, LayerNorm, Linear, ReLU, Sequential
 
+from torch_geometric.llm.models import SentenceTransformer, VisionTransformer
 from torch_geometric.nn import GINEConv
-from torch_geometric.nn.nlp import SentenceTransformer, VisionTransformer
 from torch_geometric.utils import add_self_loops, to_dense_batch
 
 
```