PyPI - zombie-squirrel - Versions diffs - 0.7.4__tar.gz → 0.8.1__tar.gz - Mend

zombie-squirrel 0.7.4__tar.gz → 0.8.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (31) hide show

{zombie_squirrel-0.7.4/src/zombie_squirrel.egg-info → zombie_squirrel-0.8.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: zombie-squirrel
-Version: 0.7.4
+Version: 0.8.1
 Summary: Generated from aind-library-template
 Author: Allen Institute for Neural Dynamics
 License: MIT
@@ -21,7 +21,7 @@ Dynamic: license-file
 ![Code Style](https://img.shields.io/badge/code%20style-black-black)
 [![semantic-release: angular](https://img.shields.io/badge/semantic--release-angular-e10079?logo=semantic-release)](https://github.com/semantic-release/semantic-release)
 ![Interrogate](https://img.shields.io/badge/interrogate-100.0%25-brightgreen)
-![Coverage](https://img.shields.io/badge/coverage-99%25-brightgreen)
+![Coverage](https://img.shields.io/badge/coverage-100%25-brightgreen)
 ![Python](https://img.shields.io/badge/python->=3.10-blue?logo=python)
 <img src="zombie-squirrel_logo.png" width="400" alt="Logo (image from ChatGPT)">
@@ -37,10 +37,10 @@ pip install zombie-squirrel
 ### Set backend
 ```bash
-export TREE_SPECIES='s3'
+export FOREST_TYPE='S3'
 ```
-Options are 's3', 'MEMORY'.
+Options are 'S3', 'MEMORY'.
 ### Scurry (fetch) data

{zombie_squirrel-0.7.4 → zombie_squirrel-0.8.1}/README.md RENAMED Viewed

@@ -4,7 +4,7 @@
 ![Code Style](https://img.shields.io/badge/code%20style-black-black)
 [![semantic-release: angular](https://img.shields.io/badge/semantic--release-angular-e10079?logo=semantic-release)](https://github.com/semantic-release/semantic-release)
 ![Interrogate](https://img.shields.io/badge/interrogate-100.0%25-brightgreen)
-![Coverage](https://img.shields.io/badge/coverage-99%25-brightgreen)
+![Coverage](https://img.shields.io/badge/coverage-100%25-brightgreen)
 ![Python](https://img.shields.io/badge/python->=3.10-blue?logo=python)
 <img src="zombie-squirrel_logo.png" width="400" alt="Logo (image from ChatGPT)">
@@ -20,10 +20,10 @@ pip install zombie-squirrel
 ### Set backend
 ```bash
-export TREE_SPECIES='s3'
+export FOREST_TYPE='S3'
 ```
-Options are 's3', 'MEMORY'.
+Options are 'S3', 'MEMORY'.
 ### Scurry (fetch) data

zombie_squirrel-0.8.1/src/zombie_squirrel/__init__.py ADDED Viewed

@@ -0,0 +1,12 @@
+"""Zombie-squirrel: caching and synchronization for AIND metadata.
+Provides functions to fetch and cache project names, subject IDs, and asset
+metadata from the AIND metadata database with support for multiple backends."""
+__version__ = "0.8.1"
+from zombie_squirrel.acorn_contents.asset_basics import asset_basics  # noqa: F401
+from zombie_squirrel.acorn_contents.raw_to_derived import raw_to_derived  # noqa: F401
+from zombie_squirrel.acorn_contents.source_data import source_data  # noqa: F401
+from zombie_squirrel.acorn_contents.unique_project_names import unique_project_names  # noqa: F401
+from zombie_squirrel.acorn_contents.unique_subject_ids import unique_subject_ids  # noqa: F401

zombie_squirrel-0.8.1/src/zombie_squirrel/acorn_contents/__init__.py ADDED Viewed

@@ -0,0 +1,10 @@
+"""Acorns module: individual data fetching functions."""
+# Import the acorn modules to trigger decorator registration
+from zombie_squirrel.acorn_contents import (  # noqa: F401
+    asset_basics,
+    raw_to_derived,
+    source_data,
+    unique_project_names,
+    unique_subject_ids,
+)

zombie_squirrel-0.8.1/src/zombie_squirrel/acorn_contents/asset_basics.py ADDED Viewed

@@ -0,0 +1,135 @@
+"""Asset basics acorn."""
+import logging
+import pandas as pd
+from aind_data_access_api.document_db import MetadataDbClient
+import zombie_squirrel.acorns as acorns
+@acorns.register_acorn(acorns.NAMES["basics"])
+def asset_basics(force_update: bool = False) -> pd.DataFrame:
+    """Fetch basic asset metadata including modalities, projects, and subject info.
+    Returns a DataFrame with columns: _id, _last_modified, modalities,
+    project_name, data_level, subject_id, acquisition_start_time,
+    acquisition_end_time, code_ocean, process_date, genotype, and location.
+    The cached table is returned untouched unless it is empty or force_update
+    is True; in that case the table is rebuilt from the database, fetching the
+    full field set in batches of 100 record IDs.
+    Args:
+        force_update: If True, bypass cache and fetch fresh data from database.
+    Returns:
+        DataFrame with basic asset metadata."""
+    df = acorns.TREE.scurry(acorns.NAMES["basics"])
+    FIELDS = [
+        "data_description.modalities",
+        "data_description.project_name",
+        "data_description.data_level",
+        "subject.subject_id",
+        "acquisition.acquisition_start_time",
+        "acquisition.acquisition_end_time",
+        "processing.data_processes.start_date_time",
+        "subject.subject_details.genotype",
+        "other_identifiers",
+        "location",
+    ]
+    if df.empty or force_update:
+        logging.info("Updating cache for asset basics")
+        # NOTE(review): the cache is replaced by an empty frame here, so the
+        # _last_modified comparison below never finds a cached row and every
+        # record is re-fetched. Confirm whether incremental updates were intended.
+        df = pd.DataFrame(
+            columns=[
+                "_id",
+                "_last_modified",
+                "modalities",
+                "project_name",
+                "data_level",
+                "subject_id",
+                "acquisition_start_time",
+                "acquisition_end_time",
+                "code_ocean",
+                "process_date",
+                "genotype",
+                "location",
+            ]
+        )
+        client = MetadataDbClient(
+            host=acorns.API_GATEWAY_HOST,
+            version="v2",
+        )
+        # It's a bit complex to get multiple fields that aren't indexed in a database
+        # as large as DocDB, so first fetch only the _id and _last_modified of every
+        # record and then request the remaining fields just for the IDs we keep
+        record_ids = client.retrieve_docdb_records(
+            filter_query={},
+            projection={"_id": 1, "_last_modified": 1},
+            limit=0,
+        )
+        # Keep the _ids that are not cached or whose _last_modified differs from the
+        # cache. A dict lookup avoids filtering the whole frame once per record.
+        cached_modified = dict(zip(df["_id"], df["_last_modified"]))
+        keep_ids = [
+            record["_id"]
+            for record in record_ids
+            if record["_id"] not in cached_modified
+            or cached_modified[record["_id"]] != record["_last_modified"]
+        ]
+        # Now batch by 100 IDs at a time to avoid overloading server, and fetch all the fields
+        BATCH_SIZE = 100
+        asset_records = []
+        for i in range(0, len(keep_ids), BATCH_SIZE):
+            logging.info(f"Fetching asset basics batch {i // BATCH_SIZE + 1}...")
+            batch_ids = keep_ids[i: i + BATCH_SIZE]
+            batch_records = client.retrieve_docdb_records(
+                filter_query={"_id": {"$in": batch_ids}},
+                projection={field: 1 for field in FIELDS + ["_id", "_last_modified"]},
+                limit=0,
+            )
+            asset_records.extend(batch_records)
+        # Unwrap nested fields
+        records = []
+        for record in asset_records:
+            # Treat a missing or null section as empty so one sparse record
+            # cannot abort the whole refresh
+            data_description = record.get("data_description") or {}
+            subject = record.get("subject") or {}
+            acquisition = record.get("acquisition") or {}
+            processing = record.get("processing") or {}
+            modalities = data_description.get("modalities") or []
+            modality_abbreviations_str = ", ".join(
+                modality["abbreviation"] for modality in modalities if "abbreviation" in modality
+            )
+            # Process date: the part before "T" of the last data process start
+            # time; None when there is no process or it has no start time
+            data_processes = processing.get("data_processes") or []
+            process_datetime = data_processes[-1].get("start_date_time") if data_processes else None
+            process_date = process_datetime.split("T")[0] if process_datetime else None
+            # Get the CO asset ID
+            other_identifiers = record.get("other_identifiers") or {}
+            code_ocean = other_identifiers.get("Code Ocean")
+            flat_record = {
+                "_id": record["_id"],
+                "_last_modified": record.get("_last_modified"),
+                "modalities": modality_abbreviations_str,
+                "project_name": data_description.get("project_name"),
+                "data_level": data_description.get("data_level"),
+                "subject_id": subject.get("subject_id"),
+                "acquisition_start_time": acquisition.get("acquisition_start_time"),
+                "acquisition_end_time": acquisition.get("acquisition_end_time"),
+                "code_ocean": code_ocean,
+                "process_date": process_date,
+                "genotype": (subject.get("subject_details") or {}).get("genotype"),
+                "location": record.get("location"),
+            }
+            records.append(flat_record)
+        # Combine new records with the old df and store in cache
+        new_df = pd.DataFrame(records)
+        df = pd.concat([df[~df["_id"].isin(keep_ids)], new_df], ignore_index=True)
+        acorns.TREE.hide(acorns.NAMES["basics"], df)
+    return df

zombie_squirrel-0.8.1/src/zombie_squirrel/acorn_contents/raw_to_derived.py ADDED Viewed

@@ -0,0 +1,71 @@
+"""Raw to derived mapping acorn."""
+import logging
+import pandas as pd
+from aind_data_access_api.document_db import MetadataDbClient
+import zombie_squirrel.acorns as acorns
+@acorns.register_acorn(acorns.NAMES["r2d"])
+def raw_to_derived(force_update: bool = False) -> pd.DataFrame:
+    """Fetch mapping of raw records to their derived records.
+    Returns a DataFrame with one row per raw record. The derived_records
+    column holds the comma-separated _ids of the derived records that list
+    that raw record in data_description.source_data, or an empty string when
+    there are none.
+    Args:
+        force_update: If True, bypass cache and fetch fresh data from database.
+    Returns:
+        DataFrame with _id and derived_records columns."""
+    df = acorns.TREE.scurry(acorns.NAMES["r2d"])
+    if df.empty or force_update:
+        logging.info("Updating cache for raw to derived mapping")
+        client = MetadataDbClient(
+            host=acorns.API_GATEWAY_HOST,
+            version="v2",
+        )
+        # Get all raw record IDs
+        raw_records = client.retrieve_docdb_records(
+            filter_query={"data_description.data_level": "raw"},
+            projection={"_id": 1},
+            limit=0,
+        )
+        # Get all derived records with their _id and source_data
+        derived_records = client.retrieve_docdb_records(
+            filter_query={"data_description.data_level": "derived"},
+            projection={"_id": 1, "data_description.source_data": 1},
+            limit=0,
+        )
+        # Build mapping: raw_id -> list of derived _ids. Keying a dict straight from
+        # the records keeps the rows in the order the database returned them.
+        raw_to_derived_map = {record["_id"]: [] for record in raw_records}
+        for derived_record in derived_records:
+            derived_id = derived_record["_id"]
+            # source_data may be missing or null; treat both as "no sources",
+            # matching how the source_data acorn handles the same field
+            data_description = derived_record.get("data_description") or {}
+            source_data_list = data_description.get("source_data") or []
+            # Add this derived record to each raw record it depends on
+            for source_id in source_data_list:
+                if source_id in raw_to_derived_map:
+                    raw_to_derived_map[source_id].append(derived_id)
+        # Convert to DataFrame
+        df = pd.DataFrame(
+            [
+                {"_id": raw_id, "derived_records": ", ".join(derived_ids)}
+                for raw_id, derived_ids in raw_to_derived_map.items()
+            ]
+        )
+        acorns.TREE.hide(acorns.NAMES["r2d"], df)
+    return df

zombie_squirrel-0.8.1/src/zombie_squirrel/acorn_contents/source_data.py ADDED Viewed

@@ -0,0 +1,50 @@
+"""Source data acorn."""
+import logging
+import pandas as pd
+from aind_data_access_api.document_db import MetadataDbClient
+import zombie_squirrel.acorns as acorns
+@acorns.register_acorn(acorns.NAMES["d2r"])
+def source_data(force_update: bool = False) -> pd.DataFrame:
+    """Return the upstream source data references of every record.
+    Each row pairs a record _id with its data_description.source_data entries
+    joined by ", " (an empty string when the record has none). The cached
+    table is used when it is non-empty and force_update is False.
+    Args:
+        force_update: If True, skip the cache and query the database.
+    Returns:
+        DataFrame with _id and source_data columns."""
+    cached = acorns.TREE.scurry(acorns.NAMES["d2r"])
+    if not cached.empty and not force_update:
+        return cached
+    # Cache is empty or a refresh was requested: rebuild it from the database
+    logging.info("Updating cache for source data")
+    client = MetadataDbClient(host=acorns.API_GATEWAY_HOST, version="v2")
+    docs = client.retrieve_docdb_records(
+        filter_query={},
+        projection={"_id": 1, "data_description.source_data": 1},
+        limit=0,
+    )
+    rows = []
+    for doc in docs:
+        upstream = doc.get("data_description", {}).get("source_data", [])
+        joined = ""
+        if upstream:
+            joined = ", ".join(upstream)
+        rows.append({"_id": doc["_id"], "source_data": joined})
+    refreshed = pd.DataFrame(rows)
+    acorns.TREE.hide(acorns.NAMES["d2r"], refreshed)
+    return refreshed

zombie_squirrel-0.8.1/src/zombie_squirrel/acorn_contents/unique_project_names.py ADDED Viewed

@@ -0,0 +1,41 @@
+"""Unique project names acorn."""
+import logging
+import pandas as pd
+from aind_data_access_api.document_db import MetadataDbClient
+import zombie_squirrel.acorns as acorns
+@acorns.register_acorn(acorns.NAMES["upn"])
+def unique_project_names(force_update: bool = False) -> list[str]:
+    """Return the distinct project names found in the metadata database.
+    The cached table is used when it is non-empty and force_update is False;
+    otherwise the names are re-aggregated from the database and the cache is
+    overwritten with the result.
+    Args:
+        force_update: If True, skip the cache and query the database.
+    Returns:
+        List of unique project names."""
+    names_df = acorns.TREE.scurry(acorns.NAMES["upn"])
+    if not names_df.empty and not force_update:
+        return names_df["project_name"].tolist()
+    # Cache is empty or a refresh was requested: rebuild it from the database
+    logging.info("Updating cache for unique project names")
+    client = MetadataDbClient(host=acorns.API_GATEWAY_HOST, version="v2")
+    group_by_project = {"$group": {"_id": "$data_description.project_name"}}
+    rename_id = {"$project": {"project_name": "$_id", "_id": 0}}
+    aggregated = client.aggregate_docdb_records(pipeline=[group_by_project, rename_id])
+    names_df = pd.DataFrame(aggregated)
+    acorns.TREE.hide(acorns.NAMES["upn"], names_df)
+    return names_df["project_name"].tolist()

zombie_squirrel-0.8.1/src/zombie_squirrel/acorn_contents/unique_subject_ids.py ADDED Viewed

@@ -0,0 +1,41 @@
+"""Unique subject IDs acorn."""
+import logging
+import pandas as pd
+from aind_data_access_api.document_db import MetadataDbClient
+import zombie_squirrel.acorns as acorns
+@acorns.register_acorn(acorns.NAMES["usi"])
+def unique_subject_ids(force_update: bool = False) -> list[str]:
+    """Return the distinct subject IDs found in the metadata database.
+    The cached table is used when it is non-empty and force_update is False;
+    otherwise the IDs are re-aggregated from the database and the cache is
+    overwritten with the result.
+    Args:
+        force_update: If True, skip the cache and query the database.
+    Returns:
+        List of unique subject IDs."""
+    ids_df = acorns.TREE.scurry(acorns.NAMES["usi"])
+    if not ids_df.empty and not force_update:
+        return ids_df["subject_id"].tolist()
+    # Cache is empty or a refresh was requested: rebuild it from the database
+    logging.info("Updating cache for unique subject IDs")
+    client = MetadataDbClient(host=acorns.API_GATEWAY_HOST, version="v2")
+    group_by_subject = {"$group": {"_id": "$subject.subject_id"}}
+    rename_id = {"$project": {"subject_id": "$_id", "_id": 0}}
+    aggregated = client.aggregate_docdb_records(pipeline=[group_by_subject, rename_id])
+    ids_df = pd.DataFrame(aggregated)
+    acorns.TREE.hide(acorns.NAMES["usi"], ids_df)
+    return ids_df["subject_id"].tolist()

zombie_squirrel-0.8.1/src/zombie_squirrel/acorns.py ADDED Viewed

@@ -0,0 +1,47 @@
+"""Acorns: functions to fetch and cache data from MongoDB."""
+import logging
+import os
+from collections.abc import Callable
+from typing import Any
+from zombie_squirrel.forest import (
+    MemoryTree,
+    S3Tree,
+)
+# --- Backend setup ---------------------------------------------------
+API_GATEWAY_HOST = "api.allenneuraldynamics.org"
+# FOREST_TYPE is lowercased before matching, so 'S3' and 's3' both select the
+# S3 backend; any other value (including the default) uses the in-memory tree.
+forest_type = os.getenv("FOREST_TYPE", "memory").lower()
+# Compare against the lowercase spelling: a lowercased value can never equal "S3"
+if forest_type == "s3":  # pragma: no cover
+    logging.info("Using S3 forest for caching")
+    TREE = S3Tree()
+else:
+    logging.info("Using in-memory forest for caching")
+    TREE = MemoryTree()
+# --- Acorn registry and names -----------------------------------------------------
+# Short alias -> name under which each acorn is registered and its table is cached
+NAMES = {
+    "upn": "unique_project_names",
+    "usi": "unique_subject_ids",
+    "basics": "asset_basics",
+    "d2r": "source_data",
+    "r2d": "raw_to_derived",
+}
+# Maps each acorn's registered name to its fetch function. Acorns are invoked
+# with force_update=True by the sync step, so the callables are not zero-argument.
+ACORN_REGISTRY: dict[str, Callable[..., Any]] = {}
+def register_acorn(name: str) -> Callable[[Callable[..., Any]], Callable[..., Any]]:
+    """Build a decorator that records a function in ACORN_REGISTRY under name.
+    Registering a second function under the same name replaces the first.
+    Args:
+        name: Key under which the decorated function is stored.
+    Returns:
+        A decorator that stores the function and returns it unchanged."""
+    def decorator(func: Callable[..., Any]) -> Callable[..., Any]:
+        """Register function in acorn registry."""
+        ACORN_REGISTRY[name] = func
+        return func
+    return decorator

zombie_squirrel-0.7.4/src/zombie_squirrel/acorns.py → zombie_squirrel-0.8.1/src/zombie_squirrel/forest.py RENAMED Viewed

@@ -11,11 +11,11 @@ import pandas as pd
 from zombie_squirrel.utils import get_s3_cache_path, prefix_table_name
-class Acorn(ABC):
+class Tree(ABC):
     """Base class for a storage backend (the cache)."""
     def __init__(self) -> None:
-        """Initialize the Acorn."""
+        """Initialize the Tree."""
         super().__init__()
     @abstractmethod
@@ -29,7 +29,7 @@ class Acorn(ABC):
         pass  # pragma: no cover
-class S3Acorn(Acorn):
+class S3Tree(Tree):
     """Stores and retrieves caches using AWS S3 with parquet files."""
     def __init__(self) -> None:
@@ -79,7 +79,7 @@ class S3Acorn(Acorn):
             return pd.DataFrame()
-class MemoryAcorn(Acorn):
+class MemoryTree(Tree):
     """A simple in-memory backend for testing or local development."""
     def __init__(self) -> None:

zombie_squirrel-0.8.1/src/zombie_squirrel/sync.py ADDED Viewed

@@ -0,0 +1,18 @@
+"""Synchronization utilities for updating all cached data."""
+import logging
+from .acorns import ACORN_REGISTRY
+def hide_acorns():
+    """Force-refresh every registered acorn.
+    Configures logging at INFO level, then calls each function stored in
+    ACORN_REGISTRY with force_update=True so that it rebuilds its cached data."""
+    log_format = "%(asctime)s %(levelname)s %(message)s"
+    logging.basicConfig(level=logging.INFO, format=log_format)
+    for refresh in ACORN_REGISTRY.values():
+        refresh(force_update=True)

{zombie_squirrel-0.7.4 → zombie_squirrel-0.8.1/src/zombie_squirrel.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: zombie-squirrel
-Version: 0.7.4
+Version: 0.8.1
 Summary: Generated from aind-library-template
 Author: Allen Institute for Neural Dynamics
 License: MIT
@@ -21,7 +21,7 @@ Dynamic: license-file
 ![Code Style](https://img.shields.io/badge/code%20style-black-black)
 [![semantic-release: angular](https://img.shields.io/badge/semantic--release-angular-e10079?logo=semantic-release)](https://github.com/semantic-release/semantic-release)
 ![Interrogate](https://img.shields.io/badge/interrogate-100.0%25-brightgreen)
-![Coverage](https://img.shields.io/badge/coverage-99%25-brightgreen)
+![Coverage](https://img.shields.io/badge/coverage-100%25-brightgreen)
 ![Python](https://img.shields.io/badge/python->=3.10-blue?logo=python)
 <img src="zombie-squirrel_logo.png" width="400" alt="Logo (image from ChatGPT)">
@@ -37,10 +37,10 @@ pip install zombie-squirrel
 ### Set backend
 ```bash
-export TREE_SPECIES='s3'
+export FOREST_TYPE='S3'
 ```
-Options are 's3', 'MEMORY'.
+Options are 'S3', 'MEMORY'.
 ### Scurry (fetch) data

{zombie_squirrel-0.7.4 → zombie_squirrel-0.8.1}/src/zombie_squirrel.egg-info/SOURCES.txt RENAMED Viewed

@@ -4,7 +4,7 @@ pyproject.toml
 setup.py
 src/zombie_squirrel/__init__.py
 src/zombie_squirrel/acorns.py
-src/zombie_squirrel/squirrels.py
+src/zombie_squirrel/forest.py
 src/zombie_squirrel/sync.py
 src/zombie_squirrel/utils.py
 src/zombie_squirrel.egg-info/PKG-INFO
@@ -12,7 +12,13 @@ src/zombie_squirrel.egg-info/SOURCES.txt
 src/zombie_squirrel.egg-info/dependency_links.txt
 src/zombie_squirrel.egg-info/requires.txt
 src/zombie_squirrel.egg-info/top_level.txt
+src/zombie_squirrel/acorn_contents/__init__.py
+src/zombie_squirrel/acorn_contents/asset_basics.py
+src/zombie_squirrel/acorn_contents/raw_to_derived.py
+src/zombie_squirrel/acorn_contents/source_data.py
+src/zombie_squirrel/acorn_contents/unique_project_names.py
+src/zombie_squirrel/acorn_contents/unique_subject_ids.py
 tests/test_acorns.py
-tests/test_squirrels.py
 tests/test_sync.py
+tests/test_trees.py
 tests/test_utils.py

zombie_squirrel-0.8.1/tests/test_acorns.py ADDED Viewed

@@ -0,0 +1,37 @@
+"""Unit tests for acorn registry mechanism.
+Tests for acorn registration and NAMES dictionary."""
+import unittest
+from zombie_squirrel.acorns import (
+    NAMES,
+    ACORN_REGISTRY,
+)
+class TestAcornRegistration(unittest.TestCase):
+    """Checks on the acorn registry and the NAMES lookup table."""
+    def test_acorn_registry_contains_all_functions(self):
+        """Every acorn name must have an entry in the registry."""
+        for alias in ("upn", "usi", "basics", "d2r", "r2d"):
+            self.assertIn(NAMES[alias], ACORN_REGISTRY)
+    def test_registry_values_are_callable(self):
+        """Every registry entry must be callable."""
+        for name in ACORN_REGISTRY:
+            func = ACORN_REGISTRY[name]
+            self.assertTrue(callable(func), f"{name} is not callable")
+    def test_names_dict_completeness(self):
+        """NAMES must define each of the expected aliases."""
+        for key in ("upn", "usi", "basics", "d2r", "r2d"):
+            self.assertIn(key, NAMES)
+if __name__ == "__main__":
+    unittest.main()

zombie-squirrel 0.7.4__tar.gz → 0.8.1__tar.gz

zombie-squirrel 0.7.4__tar.gz → 0.8.1__tar.gz