biocypher 0.5.20__py3-none-any.whl → 0.5.21__py3-none-any.whl

This diff shows the content of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only and reflects the changes between those versions.

This version of biocypher has been flagged as a potentially problematic release.

biocypher/_core.py CHANGED
@@ -24,6 +24,7 @@ from ._logger import logger
 
 logger.debug(f"Loading module {__name__}.")
 
+from ._get import Downloader
 from ._write import get_writer
 from ._config import config as _config
 from ._config import update_from_file as _file_update
@@ -307,12 +308,20 @@ class BioCypher:
 
         return self._pd.dfs
 
-    def add(self, entities):
+    def add(self, entities) -> None:
         """
         Function to add entities to the in-memory database. Accepts an iterable
         of tuples (if given, translates to ``BioCypherNode`` or
         ``BioCypherEdge`` objects) or an iterable of ``BioCypherNode`` or
         ``BioCypherEdge`` objects.
+
+        Args:
+            entities (iterable): An iterable of entities to add to the database.
+                Can be 3-tuples (nodes) or 5-tuples (edges); also accepts
+                4-tuples for edges (deprecated).
+
+        Returns:
+            None
         """
         if not self._pd:
             self._pd = Pandas(
@@ -335,10 +344,28 @@ class BioCypher:
 
         self._pd.add_tables(tentities)
 
-    def add_nodes(self, nodes):
+    def add_nodes(self, nodes) -> None:
+        """
+        Wrapper for ``add()`` to add nodes to the in-memory database.
+
+        Args:
+            nodes (iterable): An iterable of node tuples to add to the database.
+
+        Returns:
+            None
+        """
         self.add(nodes)
 
-    def add_edges(self, edges):
+    def add_edges(self, edges) -> None:
+        """
+        Wrapper for ``add()`` to add edges to the in-memory database.
+
+        Args:
+            edges (iterable): An iterable of edge tuples to add to the database.
+
+        Returns:
+            None
+        """
         self.add(edges)
 
     def merge_nodes(self, nodes) -> bool:
@@ -389,6 +416,24 @@ class BioCypher:
         # write edge files
         return self._driver.add_biocypher_edges(tedges)
 
+    # DOWNLOAD AND CACHE MANAGEMENT METHODS ###
+
+    def _get_downloader(self):
+        """
+        Create downloader if not exists.
+        """
+
+        if not self._downloader:
+            self._downloader = Downloader()
+
+    def download(self, force: bool = False) -> None:
+        """
+        Use the :class:`Downloader` class to download or load from cache the
+        resources given by the adapter.
+        """
+
+        self._get_downloader()
+
     # OVERVIEW AND CONVENIENCE METHODS ###
 
     def log_missing_input_labels(self) -> Optional[dict[str, list[str]]]:
biocypher/_get.py ADDED
@@ -0,0 +1,299 @@
+#!/usr/bin/env python
+
+#
+# Copyright 2021, Heidelberg University Clinic
+#
+# File author(s): Sebastian Lobentanzer
+# ...
+#
+# Distributed under MIT licence, see the file `LICENSE`.
+#
+"""
+BioCypher get module. Used to download and cache data from external sources.
+"""
+
+from __future__ import annotations
+
+from ._logger import logger
+
+logger.debug(f"Loading module {__name__}.")
+
+from datetime import datetime, timedelta
+from tempfile import TemporaryDirectory
+import os
+import json
+import ftplib
+
+import pooch
+
+from ._misc import to_list
+
+
+class Resource:
+    def __init__(
+        self,
+        name: str,
+        url_s: str | list[str],
+        lifetime: int = 0,
+        is_dir: bool = False,
+    ):
+        """
+        A resource is a file that can be downloaded from a URL and cached
+        locally. This class implements checks of the minimum requirements for
+        a resource, to be implemented by a biocypher adapter.
+
+        Args:
+            name (str): The name of the resource.
+
+            url_s (str | list[str]): The URL or URLs of the resource.
+
+            lifetime (int): The lifetime of the resource in days. If 0, the
+                resource is considered to be permanent.
+        """
+        self.name = name
+        self.url_s = url_s
+        self.lifetime = lifetime
+        self.is_dir = is_dir
+
+
+class Downloader:
+    def __init__(self, cache_dir: str):
+        """
+        A downloader is a collection of resources that can be downloaded
+        and cached locally. It manages the lifetime of downloaded resources by
+        keeping a JSON record of the download date of each resource.
+
+        Args:
+            cache_dir (str): The directory where the resources are cached. If
+                not given, a temporary directory is created.
+        """
+        self.cache_dir = cache_dir or TemporaryDirectory().name
+        self.cache_file = os.path.join(self.cache_dir, "cache.json")
+        self.cache_dict = self._load_cache_dict()
+
+    # download function that accepts a resource or a list of resources
+    def download(self, *resources: Resource):
+        """
+        Download one or multiple resources.
+
+        Args:
+            resources (Resource): The resource or resources to download.
+
+        Returns:
+            str or list: The path or paths to the downloaded resource(s).
+        """
+        paths = []
+        for resource in resources:
+            paths.append(self._download_or_cache(resource))
+
+        # flatten list if it is nested
+        if is_nested(paths):
+            paths = [path for sublist in paths for path in sublist]
+
+        return paths
+
+    def _download_or_cache(self, resource: Resource, cache: bool = True):
+        """
+        Download a resource if it is not cached or exceeded its lifetime.
+
+        Args:
+            resource (Resource): The resource to download.
+
+        Returns:
+            str or list: The path or paths to the downloaded resource(s).
+        """
+        # check if resource is cached
+        cache_record = self._get_cache_record(resource)
+
+        if cache_record:
+            # check if resource is expired (formatted in days)
+            dl = cache_record.get("date_downloaded")
+            lt = timedelta(days=resource.lifetime)
+            expired = dl + lt < datetime.now()
+        else:
+            expired = True
+
+        # download resource
+        if expired or not cache:
+            logger.info(f"Downloading resource {resource.name}.")
+
+            if resource.is_dir:
+                files = self._get_files(resource)
+                resource.url_s = [resource.url_s + "/" + file for file in files]
+                resource.is_dir = False
+                paths = self._download_or_cache(resource, cache)
+            elif isinstance(resource.url_s, list):
+                paths = []
+                for url in resource.url_s:
+                    fname = url[url.rfind("/") + 1 :]
+                    paths.append(
+                        self._retrieve(
+                            url=url,
+                            fname=fname,
+                            path=os.path.join(self.cache_dir, resource.name),
+                        )
+                    )
+            else:
+                fname = resource.url_s[resource.url_s.rfind("/") + 1 :]
+                paths = self._retrieve(
+                    url=resource.url_s,
+                    fname=fname,
+                    path=os.path.join(self.cache_dir, resource.name),
+                )
+
+            # sometimes a compressed file contains multiple files
+            # TODO ask for a list of files in the archive to be used from the
+            # adapter
+
+        # update cache record
+        self._update_cache_record(resource)
+
+        return paths
+
+    def _retrieve(
+        self,
+        url: str,
+        fname: str,
+        path: str,
+        known_hash: str = None,
+    ):
+        """
+        Retrieve a file from a URL using Pooch. Infer type of file from
+        extension and use appropriate processor.
+
+        Args:
+            url (str): The URL to retrieve the file from.
+
+            fname (str): The name of the file.
+
+            path (str): The path to the file.
+        """
+        if fname.endswith(".zip"):
+            return pooch.retrieve(
+                url=url,
+                known_hash=known_hash,
+                fname=fname,
+                path=path,
+                processor=pooch.Unzip(),
+                progressbar=True,
+            )
+
+        elif fname.endswith(".tar.gz"):
+            return pooch.retrieve(
+                url=url,
+                known_hash=known_hash,
+                fname=fname,
+                path=path,
+                processor=pooch.Untar(),
+                progressbar=True,
+            )
+
+        elif fname.endswith(".gz"):
+            return pooch.retrieve(
+                url=url,
+                known_hash=known_hash,
+                fname=fname,
+                path=path,
+                processor=pooch.Decompress(),
+                progressbar=True,
+            )
+
+        else:
+            return pooch.retrieve(
+                url=url,
+                known_hash=known_hash,
+                fname=fname,
+                path=path,
+                progressbar=True,
+            )
+
+    def _get_files(self, resource: Resource):
+        """
+        Get the files contained in a directory resource.
+
+        Args:
+            resource (Resource): The directory resource.
+
+        Returns:
+            list: The files contained in the directory.
+        """
+        if resource.url_s.startswith("ftp://"):
+            # remove protocol
+            url = resource.url_s[6:]
+            # get base url
+            url = url[: url.find("/")]
+            # get directory (remove initial slash as well)
+            dir = resource.url_s[7 + len(url) :]
+            # get files
+            ftp = ftplib.FTP(url)
+            ftp.login()
+            ftp.cwd(dir)
+            files = ftp.nlst()
+            ftp.quit()
+        else:
+            raise NotImplementedError(
+                "Only FTP directories are supported at the moment."
+            )
+
+        return files
+
+    def _load_cache_dict(self):
+        """
+        Load the cache dictionary from the cache file. Create an empty cache
+        file if it does not exist.
+        """
+        if not os.path.exists(self.cache_dir):
+            logger.info(f"Creating cache directory {self.cache_dir}.")
+            os.makedirs(self.cache_dir)
+
+        if not os.path.exists(self.cache_file):
+            logger.info(f"Creating cache file {self.cache_file}.")
+            with open(self.cache_file, "w") as f:
+                json.dump({}, f)
+
+        with open(self.cache_file, "r") as f:
+            logger.info(f"Loading cache file {self.cache_file}.")
+            return json.load(f)
+
+    def _get_cache_record(self, resource: Resource):
+        """
+        Get the cache record of a resource.
+
+        Args:
+            resource (Resource): The resource to get the cache record of.
+
+        Returns:
+            The cache record of the resource.
+        """
+        return self.cache_dict.get(resource.name, {})
+
+    def _update_cache_record(self, resource: Resource):
+        """
+        Update the cache record of a resource.
+
+        Args:
+            resource (Resource): The resource to update the cache record of.
+        """
+        cache_record = {}
+        cache_record["url"] = to_list(resource.url_s)
+        cache_record["date_downloaded"] = datetime.now()
+        cache_record["lifetime"] = resource.lifetime
+        self.cache_dict[resource.name] = cache_record
+        with open(self.cache_file, "w") as f:
+            json.dump(self.cache_dict, f, default=str)
+
+
+def is_nested(lst):
+    """
+    Check if a list is nested.
+
+    Args:
+        lst (list): The list to check.
+
+    Returns:
+        bool: True if the list is nested, False otherwise.
+    """
+    for item in lst:
+        if isinstance(item, list):
+            return True
+    return False
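A short usage sketch of the new Resource/Downloader pair added above; the resource name, URL, and cache directory are placeholders rather than real BioCypher resources:

from biocypher._get import Downloader, Resource

# The cache directory is a placeholder; passing None would fall back to a
# temporary directory (see Downloader.__init__ above).
downloader = Downloader(cache_dir="./.cache/biocypher")

resource = Resource(
    name="example_resource",                          # placeholder name
    url_s="https://example.org/data/example.tsv.gz",  # placeholder URL
    lifetime=7,  # re-download after 7 days; 0 means the cache never expires
)

# Returns the local path(s); the .gz extension makes _retrieve() use
# pooch.Decompress(), and the download date is recorded in cache.json.
paths = downloader.download(resource)
print(paths)

The intent, per the docstrings, is that adapters declare their inputs as Resource objects so that BioCypher can manage downloads and cache lifetimes centrally (see the download() stub added to _core.py above).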
biocypher/_metadata.py CHANGED
@@ -19,7 +19,7 @@ import importlib.metadata
 
 import toml
 
-_VERSION = "0.5.20"
+_VERSION = "0.5.21"
 
 
 def get_metadata():
biocypher-0.5.20.dist-info/METADATA → biocypher-0.5.21.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: biocypher
-Version: 0.5.20
+Version: 0.5.21
 Summary: A unifying framework for biomedical research knowledge graphs
 Home-page: https://github.com/biocypher/biocypher
 License: MIT
@@ -25,8 +25,10 @@ Requires-Dist: more_itertools
 Requires-Dist: neo4j-utils (==0.0.7)
 Requires-Dist: networkx (>=3.0,<4.0)
 Requires-Dist: pandas (>=2.0.1,<3.0.0)
+Requires-Dist: pooch (>=1.7.0,<2.0.0)
 Requires-Dist: rdflib (>=6.2.0,<7.0.0)
 Requires-Dist: stringcase (>=1.2.0,<2.0.0)
+Requires-Dist: tqdm (>=4.65.0,<5.0.0)
 Requires-Dist: treelib (>=1.6.1,<2.0.0)
 Project-URL: Bug Tracker, https://github.com/biocypher/biocypher/issues
 Project-URL: Repository, https://github.com/biocypher/biocypher
biocypher-0.5.20.dist-info/RECORD → biocypher-0.5.21.dist-info/RECORD RENAMED
@@ -6,18 +6,19 @@ biocypher/_config/test_schema_config.yaml,sha256=D1600WgEj3iTXrumVU9LIivJHJO36ia
 biocypher/_config/test_schema_config_disconnected.yaml,sha256=Qm8FLxEn2spHcyj_5F859KjcDvKSxNhxDvi4b4LLkvQ,68
 biocypher/_config/test_schema_config_extended.yaml,sha256=wn3A76142hhjnImhMF6RODbCFESTJ2TtPvcFdIFsAT0,3309
 biocypher/_connect.py,sha256=0oSyO6CEIlKL8rHo-HHE7y0FzGfSi4vnEXSDy1TnIUE,12456
-biocypher/_core.py,sha256=fA0tRorzy3R1mgzzT77mFk-l6oQ01ZAfjg8l6KbPQYM,19882
+biocypher/_core.py,sha256=cc8iOOAhaByobN6zOwdUm1hZFAJ5CpGpKmQnBIIQrbY,21090
 biocypher/_create.py,sha256=vpUchUdEpWupZi1LgFLxAWMtqoBwnWbP7PwEDUCBS4A,10202
 biocypher/_deduplicate.py,sha256=BBvfpXzu6L5YDY5FdtXxnf8YlsbJpbCE8RdUoKsm0n0,4949
+biocypher/_get.py,sha256=MHjHEqvPr4Z7Ud05qBcUJkR--iZ1SgUvUoft8MfwUic,8996
 biocypher/_logger.py,sha256=soYtz1DiduLFw3XrMnphWWUxeuJqvSof4AYhlafxl08,2933
 biocypher/_mapping.py,sha256=XJZjmXTPnXVkyub1ZU0h3EKXQ2YROaGaJOaGyPMqgy4,9338
-biocypher/_metadata.py,sha256=Hmz4g_CSuqikUJ6EtLEq2GS7Z0BawtAsL0Wk-7AiE8c,1658
+biocypher/_metadata.py,sha256=CHGBWJ8qYrb7QNQO-Fk0ROkRDXHvtFECSoex9GytJ4A,1658
 biocypher/_misc.py,sha256=wsjGVOqBDVM5hxbE_TEaZ69u1kJc8HXwRAtQHUgE8XQ,4545
 biocypher/_ontology.py,sha256=pHc4hO8iZx-yg9gzqfBR9khoIni-lKAxWgnRFyNP91E,21530
 biocypher/_pandas.py,sha256=GVCFM68J7yBjh40MpkNVgD8qT1RFMrrIjMOtD3iKsf4,3040
 biocypher/_translate.py,sha256=nj4Y60F0U3JBH36N2dh5pFcC8Ot86rskJ2ChJwje9dI,16494
 biocypher/_write.py,sha256=2ynF-VkvTr8WT2qPt2wji3iupP3WON94TlT6NpfDvCs,67738
-biocypher-0.5.20.dist-info/LICENSE,sha256=SjUaQkq671iQUZOxEUpC4jvJxXOlfSiHTTueyz9kXJM,1065
-biocypher-0.5.20.dist-info/WHEEL,sha256=vxFmldFsRN_Hx10GDvsdv1wroKq8r5Lzvjp6GZ4OO8c,88
-biocypher-0.5.20.dist-info/METADATA,sha256=B3VOakjkLgCjusCElMML-neoPoc869g4jNI45Bchibo,9429
-biocypher-0.5.20.dist-info/RECORD,,
+biocypher-0.5.21.dist-info/LICENSE,sha256=SjUaQkq671iQUZOxEUpC4jvJxXOlfSiHTTueyz9kXJM,1065
+biocypher-0.5.21.dist-info/WHEEL,sha256=vxFmldFsRN_Hx10GDvsdv1wroKq8r5Lzvjp6GZ4OO8c,88
+biocypher-0.5.21.dist-info/METADATA,sha256=wJ1Hnuq_erwEJRMCKA3e7VeUF7cLibnZdcnSCryynx0,9505
+biocypher-0.5.21.dist-info/RECORD,,
+ biocypher-0.5.21.dist-info/RECORD,,