PyPI - biocypher - Versions diffs - 0.9.7__tar.gz → 0.10.0__tar.gz - Mend

biocypher 0.9.7tar.gz → 0.10.0tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of biocypher might be problematic. Click here for more details.

Files changed (47) hide show

{biocypher-0.9.7 → biocypher-0.10.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: biocypher
-Version: 0.9.7
+Version: 0.10.0
 Summary: A unifying framework for biomedical research knowledge graphs
 Home-page: https://github.com/biocypher/biocypher
 License: MIT
@@ -19,6 +19,7 @@ Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
+Provides-Extra: scirpy
 Requires-Dist: PyYAML (>=5.0)
 Requires-Dist: appdirs
 Requires-Dist: more_itertools
@@ -27,6 +28,7 @@ Requires-Dist: networkx (>=3.0,<4.0)
 Requires-Dist: pandas (>=2.0.1,<3.0.0)
 Requires-Dist: pooch (>=1.7.0,<2.0.0)
 Requires-Dist: rdflib (>=6.2.0,<7.0.0)
+Requires-Dist: scirpy (>=0.22.0,<0.23.0) ; extra == "scirpy"
 Requires-Dist: tqdm (>=4.65.0,<5.0.0)
 Requires-Dist: treelib (==1.6.4)
 Project-URL: Bug Tracker, https://github.com/biocypher/biocypher/issues

{biocypher-0.9.7 → biocypher-0.10.0}/biocypher/_metadata.py RENAMED Viewed

@@ -10,7 +10,7 @@ import pathlib
 import toml
-_VERSION = "0.9.7"
+_VERSION = "0.10.0"
 def get_metadata():

biocypher-0.10.0/biocypher/output/in_memory/_airr.py ADDED Viewed

@@ -0,0 +1,491 @@
+from typing import Any, Optional
+from biocypher._create import BioCypherEdge, BioCypherNode
+from biocypher._deduplicate import Deduplicator
+from biocypher._logger import logger
+from biocypher.output.in_memory._in_memory_kg import _InMemoryKG
+try:
+    from scirpy.io import AirrCell
+    HAS_SCIRPY = True
+except ImportError:
+    HAS_SCIRPY = False
+class AirrKG(_InMemoryKG):
+    """Knowledge graph for AIRR (Adaptive Immune Receptor Repertoire) data.
+    This class implements the AIRR data model for representing immune receptor sequences
+    (antibodies and T cell receptors) and their annotations. To ensure proper conversion
+    to AIRR format, your schema file should define immune receptor entities with property
+    names that match the AIRR standards.
+    Key property names in your schema for immune receptor entities:
+    - locus: The gene locus (e.g., "TRA", "TRB", "IGH", "IGK", "IGL")
+    - junction_aa: The amino acid sequence of the junction region (CDR3)
+    - v_call: The V gene assignment
+    - j_call: The J gene assignment
+    - productive: Whether the sequence is productive
+    Pairing Strategies specified in get_kg method:
+    - Indirect pairings allowed:
+        Epitope is only matched with ONE of the paired receptors -> the "paired" AIRR cell will be created
+    - Indirect pairings not allowed:
+        Epitope is only matched with ONE of the paired receptors -> no "paired" AIRR cell will be created
+    For a complete list of available fields and their descriptions, see:
+    https://docs.airr-community.org/en/stable/datarep/rearrangements.html#fields
+    All properties from the biocypher schema defined by user will be preserved in the AIRR format.
+    """
+    # Constants for internal property filtering
+    _INTERNAL_PROPERTIES = frozenset(["node_id", "node_label", "id", "preferred_id"])
+    def __init__(
+        self,
+        deduplicator: Optional["Deduplicator"] = None,
+        metadata_entity_type: str = "epitope",
+    ) -> None:
+        """Initialize AirrKG with configurable metadata node type.
+        Args:
+        ----
+            deduplicator: Deduplicator instance
+            metadata_entity_type: String specifying the metadata node type (default: "epitope")
+        """
+        super().__init__()
+        self.deduplicator = deduplicator or Deduplicator()
+        self.metadata_entity_type = metadata_entity_type
+        # Initialize storage for processed cells
+        self.adjacency_list = {}
+        self.airr_cells = []
+        # These will be populated when nodes and edges are added
+        self.sequence_entity_types = {}
+        self.chain_relationship_types = []
+        self.chain_to_epitope_relationship_types = []
+    def _check_dependencies(self) -> None:
+        """Verify that scirpy is available."""
+        if not HAS_SCIRPY:
+            msg = (
+                "AirrCell module from scirpy not detected. "
+                "Install it with 'poetry add biocypher[scirpy]' or 'poetry add scirpy'."
+            )
+            raise ImportError(msg)
+    def get_kg(self, indirect_pairings: bool = True) -> list[AirrCell]:
+        """Convert directly to AIRR format using AirCell from scirpy.
+        Args:
+        ----
+            indirect_pairings: Boolean controlling pairing strategy (default: True)
+                - True:
+                    Epitope is only matched with ONE of the paired receptors -> the "paired" AIRR cell will be created
+                - False:
+                    Epitope is only matched with ONE of the paired receptors -> no "paired" AIRR cell will be created
+        Returns:
+        -------
+            list: List of generated AIRR cells
+        """
+        self._check_dependencies()
+        if not self.airr_cells:
+            self.airr_cells = self._to_airr_cells(self.adjacency_list, indirect_pairings)
+        return self.airr_cells
+    def add_nodes(self, nodes: list[BioCypherNode]) -> None:
+        """Add BioCypher nodes, organizing them by type."""
+        self._add_to_entities_by_type(nodes)
+    def add_edges(self, edges: list[BioCypherEdge]) -> None:
+        """Add BioCypher edges, organizing them by type."""
+        self._add_to_entities_by_type(edges)
+    def _add_to_entities_by_type(self, entities: dict[str, list[Any]]) -> None:
+        """Add all entities (both nodes and edges) to a common adj. list."""
+        lists = self._separate_entity_types(entities)
+        for _type, _entities in lists.items():
+            if _type not in self.adjacency_list:
+                self.adjacency_list[_type] = []
+            self.adjacency_list[_type].extend(_entities)
+    def _process_entities(self, entities: dict[str, list[Any]]) -> tuple[dict, dict, dict]:
+        """Process entities and organize them into sequence nodes, metadata nodes, and receptor-epitope mappings.
+        Args:
+        ----
+            entities: Dictionary mapping entity types to lists of BioCypherNode/BioCypherEdge objects
+        Returns:
+        -------
+            tuple: (sequence_nodes, metadata_nodes, receptor_epitope_mapping)
+        """
+        sequence_nodes = {}
+        metadata_nodes = {}
+        receptor_epitope_mapping = {}
+        # Determine entity types while processing
+        all_node_types = set()
+        all_edge_types = set()
+        for entity_type, entities_list in entities.items():
+            if not entities_list:  # Skip empty lists
+                continue
+            # Determine if this is a node or edge type
+            if isinstance(entities_list[0], BioCypherNode):
+                all_node_types.add(entity_type)
+                if entity_type == self.metadata_entity_type:
+                    metadata_nodes.update({node.get_id(): node for node in entities_list})
+                else:
+                    sequence_nodes.update({node.get_id(): node for node in entities_list})
+                    self.sequence_entity_types[entity_type] = entity_type.replace(" sequence", "").upper()
+            elif isinstance(entities_list[0], BioCypherEdge):
+                all_edge_types.add(entity_type)
+        # Update relationship types
+        self.chain_relationship_types = [
+            edge_type for edge_type in all_edge_types if self.metadata_entity_type not in edge_type.lower()
+        ]
+        self.chain_to_epitope_relationship_types = [
+            edge_type for edge_type in all_edge_types if self.metadata_entity_type in edge_type.lower()
+        ]
+        # Process chain-to-epitope relationships
+        for entity_type in self.chain_to_epitope_relationship_types:
+            self._update_receptor_epitope_mapping(entities[entity_type], receptor_epitope_mapping)
+        return sequence_nodes, metadata_nodes, receptor_epitope_mapping
+    def _update_receptor_epitope_mapping(self, edges: list[BioCypherEdge], mapping: dict[str, set]) -> None:
+        """Update receptor-epitope mapping with new edges.
+        Args:
+        ----
+            edges: List of edges to process
+            mapping: Dictionary to update with receptor-epitope mappings
+        """
+        for edge in edges:
+            source_id = edge.get_source_id()
+            if source_id not in mapping:
+                mapping[source_id] = set()
+            mapping[source_id].add(edge.get_target_id())
+    def _process_paired_chains(
+        self,
+        entities: dict[str, list[Any]],
+        sequence_nodes: dict[str, BioCypherNode],
+        metadata_nodes: dict[str, BioCypherNode],
+        receptor_epitope_mapping: dict[str, set],
+        indirect_pairings: bool = True,
+    ) -> tuple[list[AirrCell], set[str], int]:
+        """Process paired chains and generate AIRR cells.
+        Args:
+        ----
+            entities: Dictionary of all entities
+            sequence_nodes: Dictionary of sequence nodes
+            metadata_nodes: Dictionary of metadata nodes
+            receptor_epitope_mapping: Dictionary of receptor-epitope mappings
+            indirect_pairings: Boolean controlling pairing strategy
+        Returns:
+        -------
+            tuple: (list of generated cells, set of processed chain IDs, count of cells with multiple epitopes)
+        """
+        airr_cells = []
+        processed_chains = set()
+        n_metacells = 0
+        for entity_type, edges in entities.items():
+            if entity_type in self.chain_relationship_types:
+                for edge in edges:
+                    source_id, target_id = edge.get_source_id(), edge.get_target_id()
+                    processed_chains.update([source_id, target_id])
+                    # Use conditional logic for pairing strategy
+                    source_metadata = receptor_epitope_mapping.get(source_id, set())
+                    target_metadata = receptor_epitope_mapping.get(target_id, set())
+                    if indirect_pairings:
+                        # Union: create paired cell if either chain binds epitopes
+                        metadata_ids = source_metadata | target_metadata
+                        metadata_nodes_cell = self._get_metadata_nodes(metadata_ids, metadata_nodes)
+                        if metadata_nodes_cell:
+                            cell_s = self._generate_airr_cell(
+                                cell_id=edge.get_id(),
+                                source_node=sequence_nodes.get(source_id),
+                                target_node=sequence_nodes.get(target_id),
+                                metadata_nodes=metadata_nodes_cell,
+                                paired=True,
+                                receptor_epitope_mapping=receptor_epitope_mapping,
+                            )
+                            airr_cells.extend(cell_s)
+                            if len(cell_s) > 1:
+                                n_metacells += 1
+                    else:
+                        # Intersection: create paired cell only if both chains bind same epitopes
+                        shared_metadata_ids = source_metadata & target_metadata
+                        # Create paired cell if there are shared epitopes
+                        if shared_metadata_ids:
+                            shared_metadata_nodes = self._get_metadata_nodes(shared_metadata_ids, metadata_nodes)
+                            if shared_metadata_nodes:
+                                cell_s = self._generate_airr_cell(
+                                    cell_id=edge.get_id(),
+                                    source_node=sequence_nodes.get(source_id),
+                                    target_node=sequence_nodes.get(target_id),
+                                    metadata_nodes=shared_metadata_nodes,
+                                    paired=True,
+                                    receptor_epitope_mapping=receptor_epitope_mapping,
+                                )
+                                airr_cells.extend(cell_s)
+                                if len(cell_s) > 1:
+                                    n_metacells += 1
+                        # Create unpaired cells for chains with non-overlapping epitopes
+                        source_only_metadata = source_metadata - target_metadata
+                        target_only_metadata = target_metadata - source_metadata
+                        # Create unpaired cell for source chain if it has unique epitopes
+                        if source_only_metadata:
+                            source_only_nodes = self._get_metadata_nodes(source_only_metadata, metadata_nodes)
+                            if source_only_nodes:
+                                source_cells = self._generate_airr_cell(
+                                    cell_id=f"unpaired_{source_id}",
+                                    source_node=sequence_nodes.get(source_id),
+                                    target_node=None,
+                                    metadata_nodes=source_only_nodes,
+                                    paired=False,
+                                    receptor_epitope_mapping=receptor_epitope_mapping,
+                                )
+                                airr_cells.extend(source_cells)
+                                if len(source_cells) > 1:
+                                    n_metacells += 1
+                        # Create unpaired cell for target chain if it has unique epitopes
+                        if target_only_metadata:
+                            target_only_nodes = self._get_metadata_nodes(target_only_metadata, metadata_nodes)
+                            if target_only_nodes:
+                                target_cells = self._generate_airr_cell(
+                                    cell_id=f"unpaired_{target_id}",
+                                    source_node=sequence_nodes.get(target_id),
+                                    target_node=None,
+                                    metadata_nodes=target_only_nodes,
+                                    paired=False,
+                                    receptor_epitope_mapping=receptor_epitope_mapping,
+                                )
+                                airr_cells.extend(target_cells)
+                                if len(target_cells) > 1:
+                                    n_metacells += 1
+        return airr_cells, processed_chains, n_metacells
+    def _process_unpaired_chains(
+        self,
+        receptor_epitope_mapping: dict[str, set],
+        sequence_nodes: dict[str, BioCypherNode],
+        metadata_nodes: dict[str, BioCypherNode],
+        processed_chains: set[str],
+    ) -> tuple[list[AirrCell], int]:
+        """Process unpaired chains and generate AIRR cells.
+        Args:
+        ----
+            receptor_epitope_mapping: Dictionary of receptor-epitope mappings
+            sequence_nodes: Dictionary of sequence nodes
+            metadata_nodes: Dictionary of metadata nodes
+            processed_chains: Set of already processed chain IDs
+        Returns:
+        -------
+            tuple: (List of generated cells, count of cells with multiple epitopes)
+        """
+        airr_cells = []
+        n_metacells = 0
+        for chain_id in receptor_epitope_mapping:
+            if chain_id not in processed_chains:
+                # Get all metadata nodes for this unpaired chain
+                metadata_nodes_cell = self._get_metadata_nodes(receptor_epitope_mapping[chain_id], metadata_nodes)
+                if metadata_nodes_cell:
+                    cell_s = self._generate_airr_cell(
+                        cell_id=f"unpaired_{chain_id}",
+                        source_node=sequence_nodes.get(chain_id),
+                        target_node=None,
+                        metadata_nodes=metadata_nodes_cell,
+                        paired=False,
+                        receptor_epitope_mapping=receptor_epitope_mapping,
+                    )
+                    airr_cells.extend(cell_s)
+                    # Check if multiple cells were generated (indicating multiple epitopes)
+                    if len(cell_s) > 1:
+                        n_metacells += 1
+        return airr_cells, n_metacells
+    def _to_airr_cells(self, entities: dict[str, list[Any]], indirect_pairings: bool = True) -> list[AirrCell]:
+        """Convert BioCypher entities to AIRR cells using configurable mappings.
+        Args:
+        ----
+            entities: Dictionary mapping entity types to lists of BioCypherNode/BioCypherEdge objects
+            indirect_pairings: Boolean controlling pairing strategy (default: True)
+                - True:
+                    Epitope is only matched with ONE of the paired receptors -> the "paired" AIRR cell will be created
+                - False:
+                    Epitope is only matched with ONE of the paired receptors -> no "paired" AIRR cell will be created
+        Returns:
+        -------
+            list: List of generated AIRR cells
+        """
+        if not entities:
+            msg = "No entities provided for conversion."
+            raise ValueError(msg)
+        logger.info("Starting conversion to AIRR cells")
+        # Process all entities
+        sequence_nodes, metadata_nodes, receptor_epitope_mapping = self._process_entities(entities)
+        # Process paired chains
+        airr_cells, processed_chains, paired_metacells = self._process_paired_chains(
+            entities,
+            sequence_nodes,
+            metadata_nodes,
+            receptor_epitope_mapping,
+            indirect_pairings,
+        )
+        # Process unpaired chains
+        unpaired_cells, unpaired_metacells = self._process_unpaired_chains(
+            receptor_epitope_mapping,
+            sequence_nodes,
+            metadata_nodes,
+            processed_chains,
+        )
+        airr_cells.extend(unpaired_cells)
+        # Calculate total cells with multiple epitopes
+        total_metacells = paired_metacells + unpaired_metacells
+        # Log information about cells
+        logger.info(f"Generated total of {len(airr_cells)} AIRR cells")
+        if total_metacells > 0:
+            logger.info(f"{total_metacells} cells with more than 1 epitope were detected")
+        return airr_cells
+    def _get_metadata_nodes(
+        self,
+        metadata_ids: set[str],
+        metadata_nodes: dict[str, BioCypherNode],
+    ) -> list[BioCypherNode]:
+        """Get metadata nodes for a set of metadata IDs.
+        Args:
+        ----
+            metadata_ids: Set of metadata IDs
+            metadata_nodes: Dictionary of metadata nodes
+        Returns:
+        -------
+            list: List of metadata nodes
+        """
+        return [metadata_nodes[ep_id] for ep_id in metadata_ids if ep_id in metadata_nodes]
+    def _generate_airr_cell(
+        self,
+        cell_id: str,
+        source_node: BioCypherNode | None,
+        target_node: BioCypherNode | None,
+        metadata_nodes: list[BioCypherNode],
+        paired: bool,
+        receptor_epitope_mapping: dict[str, set] | None = None,
+    ) -> list[AirrCell]:
+        cell = AirrCell(cell_id=cell_id)
+        # Process both chains
+        for node in [source_node, target_node]:
+            if not node:  # Skip if node is None
+                continue
+            props = node.get_properties()
+            chain = AirrCell.empty_chain_dict()
+            # Add all properties except internal ones
+            for key, value in props.items():
+                if key not in self._INTERNAL_PROPERTIES:
+                    chain[key] = value
+            # Add locus based on node type
+            chain["locus"] = self.sequence_entity_types.get(node.get_label(), node.get_label())
+            chain["consensus_count"] = 0
+            chain["productive"] = True
+            # Add binds_epitope field based on receptor_epitope_mapping
+            if receptor_epitope_mapping and node.get_id() in receptor_epitope_mapping:
+                chain["validated_epitope"] = bool(receptor_epitope_mapping[node.get_id()])
+            else:
+                chain["validated_epitope"] = False
+            cell.add_chain(chain)
+        # Add metadata
+        return self.add_metadata(metadata_nodes, cell, paired)
+    def add_metadata(self, metadata_nodes: list[BioCypherNode], cell: AirrCell, paired: bool) -> list[AirrCell]:
+        """Add metadata from nodes to cell(s) and return a list of cells.
+        Args:
+        ----
+            metadata_nodes: List of metadata nodes to add
+            cell: Base cell to add metadata to
+            paired: Whether the cell is paired
+        Returns:
+        -------
+            List of cells with metadata added
+        """
+        cells = []
+        if not metadata_nodes:
+            cell["data_source"] = "BioCypher"
+            cell["is_paired"] = paired
+            cells.append(cell)
+        else:
+            for i, node in enumerate(metadata_nodes):
+                # Create a new AirrCell for each metadata node
+                if i > 0:
+                    cell_id_new = f"{cell.cell_id}_meta{i+1}"
+                    meta_cell = AirrCell(cell_id=cell_id_new)
+                    for chain in cell.chains:
+                        meta_cell.add_chain(chain)
+                else:
+                    meta_cell = cell
+                props = node.get_properties()
+                for key, value in props.items():
+                    if key not in self._INTERNAL_PROPERTIES:
+                        meta_cell[key] = value
+                meta_cell["data_source"] = "BioCypher"
+                meta_cell["is_paired"] = paired
+                cells.append(meta_cell)
+        return cells

{biocypher-0.9.7 → biocypher-0.10.0}/biocypher/output/in_memory/_get_in_memory_kg.py RENAMED Viewed

@@ -8,6 +8,7 @@ from __future__ import annotations
 from typing import TYPE_CHECKING
 from biocypher._logger import logger
+from biocypher.output.in_memory._airr import AirrKG
 from biocypher.output.in_memory._networkx import NetworkxKG
 from biocypher.output.in_memory._pandas import PandasKG
@@ -19,7 +20,7 @@ logger.debug(f"Loading module {__name__}.")
 __all__ = ["get_in_memory_kg"]
-IN_MEMORY_DBMS = ["csv", "pandas", "tabular", "networkx"]
+IN_MEMORY_DBMS = ["csv", "pandas", "tabular", "networkx", "airr"]
 def get_in_memory_kg(
@@ -35,10 +36,11 @@ def get_in_memory_kg(
     """
     if dbms in ["csv", "pandas", "tabular"]:
         return PandasKG(deduplicator)
     if dbms == "networkx":
         return NetworkxKG(deduplicator)
-    msg = f"Getting the in memory BioCypher KG is not supported for the DBMS {dbms}. Supported: {IN_MEMORY_DBMS}."
-    logger.error(msg)
-    raise NotImplementedError(msg)
+    elif dbms == "airr":
+        return AirrKG(deduplicator)
+    else:
+        msg = f"Getting the in memory BioCypher KG is not supported for the DBMS {dbms}. Supported: {IN_MEMORY_DBMS}."
+        logger.error(msg)
+        raise NotImplementedError(msg)

biocypher-0.9.7/biocypher/output/in_memory/_pandas.py → biocypher-0.10.0/biocypher/output/in_memory/_in_memory_kg.py RENAMED Viewed

@@ -1,24 +1,46 @@
-import pandas as pd
+from abc import ABC, abstractmethod
 from biocypher._create import BioCypherEdge, BioCypherNode, BioCypherRelAsNode
-from biocypher.output.in_memory._in_memory_kg import _InMemoryKG
-class PandasKG(_InMemoryKG):
-    def __init__(self, deduplicator):
-        super().__init__()  # keeping in spite of ABC not having __init__
-        self.deduplicator = deduplicator
+class _InMemoryKG(ABC):
+    """Abstract class for handling the in-memory Knowledge Graph instance.
+    Specifics of the different in-memory implementations (e.g. csv, networkx)
+    are implemented in the child classes. Any concrete in-memory implementation
+    needs to implement at least:
+    - add_nodes
+    - add_edges
+    - get_kg
+    - _separate_entity_types
-        self.dfs = {}
-    def get_kg(self):
-        return self.dfs
+    Raises:
+        NotImplementedError: InMemoryKG implementation must override 'add_nodes'
+        NotImplementedError: InMemoryKG implementation must override 'add_edges'
+        NotImplementedError: InMemoryKG implementation must override 'get_kg'
+    """
+    @abstractmethod
     def add_nodes(self, nodes):
-        self.add_tables(nodes)
+        """Add nodes to the in-memory knowledge graph.
+        Args:
+            nodes (Iterable[BioCypherNode]): Iterable of BioCypherNode objects.
+        """
+        raise NotImplementedError("InMemoryKG implementation must override 'add_nodes'")
+    @abstractmethod
     def add_edges(self, edges):
-        self.add_tables(edges)
+        """Add edges to the in-memory knowledge graph.
+        Args:
+            edges (Iterable[BioCypherEdge]): Iterable of BioCypherEdge objects.
+        """
+        raise NotImplementedError("InMemoryKG implementation must override 'add_edges'")
+    @abstractmethod
+    def get_kg(self):
+        """Return the in-memory knowledge graph."""
+        raise NotImplementedError("InMemoryKG implementation must override 'get_kg'")
     def _separate_entity_types(self, entities):
         """
@@ -73,23 +95,3 @@ class PandasKG(_InMemoryKG):
             lists[_type].append(entity)
         return lists
-    def add_tables(self, entities):
-        """
-        Add Pandas dataframes for each node and edge type in the input.
-        """
-        lists = self._separate_entity_types(entities)
-        for _type, _entities in lists.items():
-            self._add_entity_df(_type, _entities)
-    def _add_entity_df(self, _type, _entities):
-        df = pd.DataFrame(pd.json_normalize([node.get_dict() for node in _entities]))
-        # replace "properties." with "" in column names
-        df.columns = [col.replace("properties.", "") for col in df.columns]
-        if _type not in self.dfs:
-            self.dfs[_type] = df
-        else:
-            self.dfs[_type] = pd.concat([self.dfs[_type], df], ignore_index=True)
-        return self.dfs[_type]

biocypher-0.10.0/biocypher/output/in_memory/_pandas.py ADDED Viewed

@@ -0,0 +1,37 @@
+import pandas as pd
+from biocypher.output.in_memory._in_memory_kg import _InMemoryKG
+class PandasKG(_InMemoryKG):
+    def __init__(self, deduplicator):
+        super().__init__()  # keeping in spite of ABC not having __init__
+        self.deduplicator = deduplicator
+        self.dfs = {}
+    def get_kg(self):
+        return self.dfs
+    def add_nodes(self, nodes):
+        self.add_tables(nodes)
+    def add_edges(self, edges):
+        self.add_tables(edges)
+    def add_tables(self, entities):
+        """Add Pandas dataframes for each node and edge type in the input."""
+        lists = self._separate_entity_types(entities)
+        for _type, _entities in lists.items():
+            self._add_entity_df(_type, _entities)
+    def _add_entity_df(self, _type, _entities):
+        df = pd.DataFrame(pd.json_normalize([node.get_dict() for node in _entities]))
+        # replace "properties." with "" in column names
+        df.columns = [col.replace("properties.", "") for col in df.columns]
+        if _type not in self.dfs:
+            self.dfs[_type] = df
+        else:
+            self.dfs[_type] = pd.concat([self.dfs[_type], df], ignore_index=True)
+        return self.dfs[_type]

{biocypher-0.9.7 → biocypher-0.10.0}/biocypher/output/write/_get_writer.py RENAMED Viewed

@@ -9,6 +9,7 @@ from typing import TYPE_CHECKING
 from biocypher._config import config as _config
 from biocypher._logger import logger
 from biocypher.output.write._batch_writer import _BatchWriter
+from biocypher.output.write.graph._airr import _AirrWriter
 from biocypher.output.write.graph._arangodb import _ArangoDBBatchWriter
 from biocypher.output.write.graph._neo4j import _Neo4jBatchWriter
 from biocypher.output.write.graph._networkx import _NetworkXWriter
@@ -50,6 +51,7 @@ DBMS_TO_CLASS = {
     "Tabular": _PandasCSVWriter,
     "networkx": _NetworkXWriter,
     "NetworkX": _NetworkXWriter,
+    "airr": _AirrWriter,
 }

biocypher-0.10.0/biocypher/output/write/graph/_airr.py ADDED Viewed

@@ -0,0 +1,32 @@
+"""Module to provide the AnnData writer class for BioCypher."""
+from biocypher._logger import logger
+from biocypher.output.write._writer import _Writer
+class _AirrWriter(_Writer):
+    """A minimal placeholder writer class that implements the required methods
+    but performs no actual writing operations, since there is an existing anndata native writer functionality
+    """
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        logger.info("Placeholder writer initialized")
+    def _write_node_data(self, nodes) -> bool:
+        """Required implementation that does nothing with nodes."""
+        logger.info("Placeholder: Node data received but not processed")
+        return True
+    def _write_edge_data(self, edges) -> bool:
+        """Required implementation that does nothing with edges."""
+        logger.info("Placeholder: Edge data received but not processed")
+        return True
+    def _construct_import_call(self) -> str:
+        """Return a placeholder import script."""
+        return "# This is a placeholder import script\nprint('No actual import functionality implemented')"
+    def _get_import_script_name(self) -> str:
+        """Return a placeholder script name."""
+        return "placeholder_import.py"

{biocypher-0.9.7 → biocypher-0.10.0}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "biocypher"
-version = "0.9.7"
+version = "0.10.0"
 description = "A unifying framework for biomedical research knowledge graphs"
 authors = [
     "Sebastian Lobentanzer <sebastian.lobentanzer@gmail.com>",
@@ -44,6 +44,10 @@ neo4j-utils = "0.0.7"
 pandas = "^2.0.1"
 pooch = "^1.7.0"
 tqdm = "^4.65.0"
+scirpy = { version = "^0.22.0", optional = true }
+[tool.poetry.extras]
+scirpy = ["scirpy"]
 [tool.poetry.group.dev.dependencies]
 mkdocs-material = "^9.5"
@@ -59,7 +63,7 @@ hypothesis = "^6.50.1"
 ipython = "^8.7.0"
 ipykernel = "^6.23.1"
 coverage-badge = "^1.1.0"
-nbsphinx = "^0.9.7"
+nbsphinx = "^0.9.6"
 ruff = "^0.2.0"
 mike = "^2.1.3"

biocypher-0.9.7/biocypher/output/in_memory/_in_memory_kg.py DELETED Viewed

@@ -1,40 +0,0 @@
-from abc import ABC, abstractmethod
-class _InMemoryKG(ABC):
-    """Abstract class for handling the in-memory Knowledge Graph instance.
-    Specifics of the different in-memory implementations (e.g. csv, networkx)
-    are implemented in the child classes. Any concrete in-memory implementation
-    needs to implement at least:
-    - add_nodes
-    - add_edges
-    - get_kg
-    Raises:
-        NotImplementedError: InMemoryKG implementation must override 'add_nodes'
-        NotImplementedError: InMemoryKG implementation must override 'add_edges'
-        NotImplementedError: InMemoryKG implementation must override 'get_kg'
-    """
-    @abstractmethod
-    def add_nodes(self, nodes):
-        """Add nodes to the in-memory knowledge graph.
-        Args:
-            nodes (Iterable[BioCypherNode]): Iterable of BioCypherNode objects.
-        """
-        raise NotImplementedError("InMemoryKG implementation must override 'add_nodes'")
-    @abstractmethod
-    def add_edges(self, edges):
-        """Add edges to the in-memory knowledge graph.
-        Args:
-            edges (Iterable[BioCypherEdge]): Iterable of BioCypherEdge objects.
-        """
-        raise NotImplementedError("InMemoryKG implementation must override 'add_edges'")
-    @abstractmethod
-    def get_kg(self):
-        """Return the in-memory knowledge graph."""
-        raise NotImplementedError("InMemoryKG implementation must override 'get_kg'")