zombie-squirrel 0.4.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,14 @@
1
+ """Zombie-squirrel: caching and synchronization for AIND metadata.
2
+
3
+ Provides functions to fetch and cache project names, subject IDs, and asset
4
+ metadata from the AIND metadata database with support for multiple backends."""
5
+
6
+ __version__ = "0.4.4"
7
+
8
+ from zombie_squirrel.squirrels import ( # noqa: F401
9
+ asset_basics,
10
+ raw_to_derived,
11
+ source_data,
12
+ unique_project_names,
13
+ unique_subject_ids,
14
+ )
@@ -0,0 +1,81 @@
1
+ """Storage backend interfaces for caching data."""
2
+
3
+ import logging
4
+ import os
5
+ from abc import ABC, abstractmethod
6
+
7
+ import pandas as pd
8
+ from aind_data_access_api.rds_tables import Client, RDSCredentials
9
+
10
+ from zombie_squirrel.utils import prefix_table_name
11
+
12
+
13
class Acorn(ABC):
    """Abstract cache backend that squirrels write to and read from.

    Concrete backends implement ``hide`` to persist a table and ``scurry``
    to read it back."""

    def __init__(self) -> None:
        """Set up the backend by delegating to the next initializer."""
        super().__init__()

    @abstractmethod
    def hide(self, table_name: str, data: pd.DataFrame) -> None:
        """Persist ``data`` in the cache under ``table_name``."""
        ...  # pragma: no cover

    @abstractmethod
    def scurry(self, table_name: str) -> pd.DataFrame:
        """Return the table cached under ``table_name``."""
        ...  # pragma: no cover
29
+
30
+
31
class RedshiftAcorn(Acorn):
    """Cache backend that keeps each table in Redshift.

    All reads and writes go through the aind-data-access-api RDS ``Client``,
    and every table name is passed through ``prefix_table_name`` first."""

    def __init__(self) -> None:
        """Create the Redshift client.

        The AWS secret name is taken from the ``REDSHIFT_SECRETS`` environment
        variable, falling back to ``/aind/prod/redshift/credentials/readwrite``."""
        # Initialize the base class, as the in-memory backend does.
        super().__init__()
        secrets_name = os.getenv("REDSHIFT_SECRETS", "/aind/prod/redshift/credentials/readwrite")
        self.rds_client = Client(
            credentials=RDSCredentials(aws_secrets_name=secrets_name),
        )

    def hide(self, table_name: str, data: pd.DataFrame) -> None:
        """Overwrite the prefixed Redshift table with ``data``.

        Args:
            table_name: Unprefixed cache table name.
            data: DataFrame whose contents replace the table."""
        self.rds_client.overwrite_table_with_df(
            df=data,
            table_name=prefix_table_name(table_name),
        )

    def scurry(self, table_name: str) -> pd.DataFrame:
        """Read the prefixed Redshift table.

        Args:
            table_name: Unprefixed cache table name.

        Returns:
            Whatever the client's ``read_table`` returns for that table."""
        return self.rds_client.read_table(table_name=prefix_table_name(table_name))
52
+
53
+
54
class MemoryAcorn(Acorn):
    """Dictionary-backed cache intended for tests and local development."""

    def __init__(self) -> None:
        """Start with no cached tables."""
        super().__init__()
        self._store: dict[str, pd.DataFrame] = {}

    def hide(self, table_name: str, data: pd.DataFrame) -> None:
        """Keep ``data`` under ``table_name``, replacing any earlier entry."""
        self._store[table_name] = data

    def scurry(self, table_name: str) -> pd.DataFrame:
        """Return the frame stored under ``table_name``, or an empty DataFrame."""
        if table_name in self._store:
            return self._store[table_name]
        return pd.DataFrame()
69
+
70
+
71
def rds_get_handle_empty(acorn: Acorn, table_name: str) -> pd.DataFrame:
    """Read a table from the cache, treating any failure as an empty cache.

    Args:
        acorn: Backend to read from.
        table_name: Name of the cached table.

    Returns:
        The cached DataFrame, or an empty DataFrame if the read raised."""
    try:
        logging.info(f"Fetching from cache: {table_name}")
        return acorn.scurry(table_name)
    except Exception as err:
        # Best effort: a failed read is logged and handled like a cache miss.
        logging.warning(f"Error fetching from cache: {err}")
        return pd.DataFrame()
@@ -0,0 +1,323 @@
1
+ """Squirrels: functions to fetch and cache data from MongoDB."""
2
+
3
+ import logging
4
+ import os
5
+ from collections.abc import Callable
6
+ from typing import Any
7
+
8
+ import pandas as pd
9
+ from aind_data_access_api.document_db import MetadataDbClient
10
+
11
+ from zombie_squirrel.acorns import (
12
+ MemoryAcorn,
13
+ RedshiftAcorn,
14
+ rds_get_handle_empty,
15
+ )
16
+
17
+ # --- Backend setup ---------------------------------------------------
18
+
19
# Host passed to every MetadataDbClient created in this module.
API_GATEWAY_HOST = "api.allenneuraldynamics.org"

# Backend selector; compared case-insensitively, defaults to the in-memory cache.
tree_type = os.getenv("TREE_SPECIES", "memory").lower()

if tree_type != "redshift":
    logging.info("Using in-memory acorn for caching")
    ACORN = MemoryAcorn()
else:  # pragma: no cover
    logging.info("Using Redshift acorn for caching")
    ACORN = RedshiftAcorn()
29
+
30
+ # --- Squirrel registry -----------------------------------------------------
31
+
32
# Registered squirrel functions, keyed by the name they were registered under.
SQUIRREL_REGISTRY: dict[str, Callable[[], Any]] = {}


def register_squirrel(name: str):
    """Build a decorator that files a function in ``SQUIRREL_REGISTRY``.

    Args:
        name: Key under which the decorated function is stored.

    Returns:
        A decorator that records the function and hands it back unchanged."""

    def _register(squirrel):
        """Store ``squirrel`` under ``name`` and return it as-is."""
        SQUIRREL_REGISTRY[name] = squirrel
        return squirrel

    return _register
44
+
45
+
46
+ # --- Squirrels -----------------------------------------------------
47
+
48
# Short key -> name used both as the cache table name and as the registry key.
NAMES = dict(
    upn="unique_project_names",
    usi="unique_subject_ids",
    basics="asset_basics",
    d2r="source_data",
    r2d="raw_to_derived",
)
55
+
56
+
57
@register_squirrel(NAMES["upn"])
def unique_project_names(force_update: bool = False) -> list[str]:
    """Fetch unique project names from the metadata database.

    Returns the cached values when the cache holds any rows; otherwise (or
    when ``force_update`` is True) runs the aggregation against the database
    and overwrites the cache with the result.

    Args:
        force_update: If True, skip the cache and fetch fresh data.

    Returns:
        List of unique project names; empty if the database returned nothing."""
    column = "project_name"
    # A forced refresh replaces the cache wholesale, so reading it first is wasted work.
    df = pd.DataFrame() if force_update else rds_get_handle_empty(ACORN, NAMES["upn"])

    if df.empty:
        logging.info("Updating cache for unique project names")
        client = MetadataDbClient(
            host=API_GATEWAY_HOST,
            version="v2",
        )
        records = client.aggregate_docdb_records(
            pipeline=[
                {"$group": {"_id": "$data_description.project_name"}},
                {"$project": {"project_name": "$_id", "_id": 0}},
            ]
        )
        # Naming the column keeps the lookup below valid when no records come back.
        df = pd.DataFrame(records, columns=[column])
        ACORN.hide(NAMES["upn"], df)

    return df[column].tolist()
88
+
89
+
90
@register_squirrel(NAMES["usi"])
def unique_subject_ids(force_update: bool = False) -> list[str]:
    """Fetch unique subject IDs from the metadata database.

    Returns the cached values when the cache holds any rows; otherwise (or
    when ``force_update`` is True) runs the aggregation against the database
    and overwrites the cache with the result.

    Args:
        force_update: If True, skip the cache and fetch fresh data.

    Returns:
        List of unique subject IDs; empty if the database returned nothing."""
    column = "subject_id"
    # A forced refresh replaces the cache wholesale, so reading it first is wasted work.
    df = pd.DataFrame() if force_update else rds_get_handle_empty(ACORN, NAMES["usi"])

    if df.empty:
        logging.info("Updating cache for unique subject IDs")
        client = MetadataDbClient(
            host=API_GATEWAY_HOST,
            version="v2",
        )
        records = client.aggregate_docdb_records(
            pipeline=[
                {"$group": {"_id": "$subject.subject_id"}},
                {"$project": {"subject_id": "$_id", "_id": 0}},
            ]
        )
        # Naming the column keeps the lookup below valid when no records come back.
        df = pd.DataFrame(records, columns=[column])
        ACORN.hide(NAMES["usi"], df)

    return df[column].tolist()
121
+
122
+
123
def _flatten_asset_record(record: dict[str, Any]) -> dict[str, Any]:
    """Flatten one DocDB asset record into a single-level row."""
    # ``or {}`` also covers sections that are present but null, which would
    # otherwise raise AttributeError on the chained ``.get``.
    data_description = record.get("data_description") or {}
    subject = record.get("subject") or {}
    acquisition = record.get("acquisition") or {}
    modalities = data_description.get("modalities") or []
    abbreviations = [
        modality["abbreviation"]
        for modality in modalities
        if isinstance(modality, dict) and "abbreviation" in modality
    ]
    return {
        "_id": record["_id"],
        "_last_modified": record.get("_last_modified"),
        "modalities": ", ".join(abbreviations),
        "project_name": data_description.get("project_name"),
        "data_level": data_description.get("data_level"),
        "subject_id": subject.get("subject_id"),
        "acquisition_start_time": acquisition.get("acquisition_start_time"),
        "acquisition_end_time": acquisition.get("acquisition_end_time"),
    }


@register_squirrel(NAMES["basics"])
def asset_basics(force_update: bool = False) -> pd.DataFrame:
    """Fetch basic asset metadata including modalities, projects, and subject info.

    Returns a DataFrame with columns: _id, _last_modified, modalities,
    project_name, data_level, subject_id, acquisition_start_time, and
    acquisition_end_time. ``modalities`` is a comma-separated string of
    modality abbreviations.

    When the cache is empty every record is fetched. When ``force_update`` is
    True and the cache already holds rows, only records that are new or whose
    ``_last_modified`` differs from the cached value are re-fetched, and cached
    rows whose ``_id`` no longer exists in the database are dropped.

    Args:
        force_update: If True, refresh the cache from the database even when
            it already holds data.

    Returns:
        DataFrame with basic asset metadata."""
    df = rds_get_handle_empty(ACORN, NAMES["basics"])

    if not (df.empty or force_update):
        return df

    logging.info("Updating cache for asset basics")
    FIELDS = [
        "data_description.modalities",
        "data_description.project_name",
        "data_description.data_level",
        "subject.subject_id",
        "acquisition.acquisition_start_time",
        "acquisition.acquisition_end_time",
    ]
    columns = [
        "_id",
        "_last_modified",
        "modalities",
        "project_name",
        "data_level",
        "subject_id",
        "acquisition_start_time",
        "acquisition_end_time",
    ]
    # Start from an empty, correctly-shaped frame only when there is nothing
    # usable in the cache; otherwise keep the cached rows so the comparison
    # below can actually skip unchanged records.
    if df.empty or not {"_id", "_last_modified"}.issubset(df.columns):
        df = pd.DataFrame(columns=columns)

    client = MetadataDbClient(
        host=API_GATEWAY_HOST,
        version="v2",
    )
    # Fetching several unindexed fields for every record is expensive, so first
    # pull only the identifiers and modification stamps.
    record_ids = client.retrieve_docdb_records(
        filter_query={},
        projection={"_id": 1, "_last_modified": 1},
        limit=0,
    )

    # One dictionary lookup per record instead of scanning the frame each time.
    cached_modified = dict(zip(df["_id"], df["_last_modified"]))
    missing = object()
    live_ids = set()
    keep_ids = []
    # NOTE(review): assumes _last_modified changes whenever a record's content
    # changes - confirm against the database's behavior.
    for record in record_ids:
        record_id = record["_id"]
        live_ids.add(record_id)
        if cached_modified.get(record_id, missing) != record.get("_last_modified"):
            keep_ids.append(record_id)

    # Fetch the full field set in batches of 100 IDs to avoid overloading the server.
    BATCH_SIZE = 100
    projection = {field: 1 for field in FIELDS + ["_id", "_last_modified"]}
    asset_records = []
    for start in range(0, len(keep_ids), BATCH_SIZE):
        logging.info(f"Fetching asset basics batch {start // BATCH_SIZE + 1}...")
        batch_records = client.retrieve_docdb_records(
            filter_query={"_id": {"$in": keep_ids[start : start + BATCH_SIZE]}},
            projection=projection,
            limit=0,
        )
        asset_records.extend(batch_records)

    new_df = pd.DataFrame(
        [_flatten_asset_record(record) for record in asset_records],
        columns=columns,
    )
    # Cached rows survive only if the record still exists and was not re-fetched.
    unchanged = df[df["_id"].isin(live_ids - set(keep_ids))]
    # Skip empty frames so concat does not have to guess dtypes from them.
    frames = [frame for frame in (unchanged, new_df) if not frame.empty]
    df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame(columns=columns)

    ACORN.hide(NAMES["basics"], df)

    return df
219
+
220
+
221
@register_squirrel(NAMES["d2r"])
def source_data(force_update: bool = False) -> pd.DataFrame:
    """Fetch source data references for all records.

    Returns a DataFrame with one row per record: its ``_id`` and the entries
    of ``data_description.source_data`` joined into a comma-separated string
    (empty string when the record lists none).

    Args:
        force_update: If True, skip the cache and fetch fresh data.

    Returns:
        DataFrame with _id and source_data columns."""
    # A forced refresh replaces the cache wholesale, so reading it first is wasted work.
    df = pd.DataFrame() if force_update else rds_get_handle_empty(ACORN, NAMES["d2r"])

    if df.empty:
        logging.info("Updating cache for source data")
        client = MetadataDbClient(
            host=API_GATEWAY_HOST,
            version="v2",
        )
        records = client.retrieve_docdb_records(
            filter_query={},
            projection={"_id": 1, "data_description.source_data": 1},
            limit=0,
        )
        rows = []
        for record in records:
            # ``or`` also covers a section or field that is present but null.
            sources = (record.get("data_description") or {}).get("source_data") or []
            rows.append(
                {
                    "_id": record["_id"],
                    "source_data": ", ".join(sources),
                }
            )

        # Explicit columns keep the frame's shape stable when no records come back.
        df = pd.DataFrame(rows, columns=["_id", "source_data"])
        ACORN.hide(NAMES["d2r"], df)

    return df
261
+
262
+
263
@register_squirrel(NAMES["r2d"])
def raw_to_derived(force_update: bool = False) -> pd.DataFrame:
    """Fetch the mapping of raw records to the derived records built on them.

    Returns a DataFrame with one row per raw record: its ``_id`` and a
    comma-separated string of the ``_id`` values of derived records that list
    it in ``data_description.source_data`` (empty string when there are none).
    Rows follow the order in which the raw records were retrieved.

    Args:
        force_update: If True, skip the cache and fetch fresh data.

    Returns:
        DataFrame with _id and derived_records columns."""
    # A forced refresh replaces the cache wholesale, so reading it first is wasted work.
    df = pd.DataFrame() if force_update else rds_get_handle_empty(ACORN, NAMES["r2d"])

    if df.empty:
        logging.info("Updating cache for raw to derived mapping")
        client = MetadataDbClient(
            host=API_GATEWAY_HOST,
            version="v2",
        )

        # Get all raw record IDs
        raw_records = client.retrieve_docdb_records(
            filter_query={"data_description.data_level": "raw"},
            projection={"_id": 1},
            limit=0,
        )

        # Get all derived records with their _id and source_data
        derived_records = client.retrieve_docdb_records(
            filter_query={"data_description.data_level": "derived"},
            projection={"_id": 1, "data_description.source_data": 1},
            limit=0,
        )

        # A dict keyed in retrieval order gives a reproducible row order,
        # unlike iterating a set of IDs.
        derived_by_raw: dict[str, list[str]] = {record["_id"]: [] for record in raw_records}
        for derived_record in derived_records:
            derived_id = derived_record["_id"]
            # ``or`` also covers a section or field that is present but null.
            sources = (derived_record.get("data_description") or {}).get("source_data") or []
            # NOTE(review): assumes source_data entries are record _ids, since they
            # are matched against raw _id values - confirm against the schema.
            for source_id in sources:
                if source_id in derived_by_raw:
                    derived_by_raw[source_id].append(derived_id)

        rows = [
            {"_id": raw_id, "derived_records": ", ".join(derived_ids)}
            for raw_id, derived_ids in derived_by_raw.items()
        ]

        # Explicit columns keep the frame's shape stable when no raw records exist.
        df = pd.DataFrame(rows, columns=["_id", "derived_records"])
        ACORN.hide(NAMES["r2d"], df)

    return df
@@ -0,0 +1,18 @@
1
+ """Synchronization utilities for updating all cached data."""
2
+
3
+ import logging
4
+
5
+ from .squirrels import SQUIRREL_REGISTRY
6
+
7
+
8
def hide_acorns():
    """Refresh the cache of every registered squirrel.

    Configures root logging at INFO level, then calls each function in
    ``SQUIRREL_REGISTRY`` with ``force_update=True``."""
    log_format = "%(asctime)s %(levelname)s %(message)s"
    logging.basicConfig(level=logging.INFO, format=log_format)
    for refresh in SQUIRREL_REGISTRY.values():
        refresh(force_update=True)
@@ -0,0 +1,12 @@
1
+ """Utility functions for zombie-squirrel package."""
2
+
3
+
4
def prefix_table_name(table_name: str) -> str:
    """Return ``table_name`` with the zombie-squirrel ``zs_`` prefix prepended.

    Args:
        table_name: The base table name.

    Returns:
        The prefixed table name, e.g. ``zs_asset_basics``."""
    return "".join(("zs_", table_name))
@@ -0,0 +1,58 @@
1
+ Metadata-Version: 2.4
2
+ Name: zombie-squirrel
3
+ Version: 0.4.4
4
+ Summary: Generated from aind-library-template
5
+ Author: Allen Institute for Neural Dynamics
6
+ License: MIT
7
+ Classifier: Programming Language :: Python :: 3
8
+ Requires-Python: >=3.10
9
+ Description-Content-Type: text/markdown
10
+ License-File: LICENSE
11
+ Requires-Dist: aind-data-access-api[docdb,rds]
12
+ Dynamic: license-file
13
+
14
+ # zombie-squirrel
15
+
16
+ [![License](https://img.shields.io/badge/license-MIT-brightgreen)](LICENSE)
17
+ ![Code Style](https://img.shields.io/badge/code%20style-black-black)
18
+ [![semantic-release: angular](https://img.shields.io/badge/semantic--release-angular-e10079?logo=semantic-release)](https://github.com/semantic-release/semantic-release)
19
+ ![Interrogate](https://img.shields.io/badge/interrogate-100.0%25-brightgreen)
20
+ ![Coverage](https://img.shields.io/badge/coverage-100%25-brightgreen)
21
+ ![Python](https://img.shields.io/badge/python->=3.10-blue?logo=python)
22
+
23
+ <img src="zombie-squirrel_logo.png" width="400" alt="Logo (image from ChatGPT)">
24
+
25
+ ## Installation
26
+
27
+ ```bash
28
+ pip install zombie-squirrel
29
+ ```
30
+ ```bash
31
+ uv sync
32
+ ```
33
+
34
+ ## Usage
35
+
36
+ ### Set backend
37
+
38
+ ```bash
39
+ export REDSHIFT_SECRETS='/aind/prod/redshift/credentials/readwrite'
40
+ export TREE_SPECIES='REDSHIFT'
41
+ ```
42
+
43
+ Options are 'REDSHIFT' and 'MEMORY' (case-insensitive). If `TREE_SPECIES` is not set, the in-memory backend is used.
44
+
45
+ ### Scurry (fetch) data
46
+
47
+ ```python
48
+ import zombie_squirrel as zs
49
+
50
+ project_names = zs.unique_project_names()
51
+ ```
52
+
53
+ ### Hide the acorns
54
+
55
+ ```python
56
+ from zombie_squirrel.sync import hide_acorns
57
+ hide_acorns()
58
+ ```
@@ -0,0 +1,10 @@
1
+ zombie_squirrel/__init__.py,sha256=DNpLCiGI7ruzugujNWAtYUCsCZOaTmlzf3Mjd5jQUek,409
2
+ zombie_squirrel/acorns.py,sha256=4uBzYtYgW2oD5sOohNQUw4qfjmNjmAIK2RlL1Ge1Udo,2597
3
+ zombie_squirrel/squirrels.py,sha256=b1kQ2itTBo4o0e0r8Fg56YcJsiJAIqxzs86CSv0ExXE,11181
4
+ zombie_squirrel/sync.py,sha256=84Ta5beHiPuGBVzp9SCo7G1b4McTUohcUIf_TJbNIV8,518
5
+ zombie_squirrel/utils.py,sha256=woPxU4vYMUv-T0XOjV5ieViksU_q7It_n_5Ll4zpocA,289
6
+ zombie_squirrel-0.4.4.dist-info/licenses/LICENSE,sha256=U0Y7B3gZJHXpjJVLgTQjM8e_c8w4JJpLgGhIdsoFR1Y,1092
7
+ zombie_squirrel-0.4.4.dist-info/METADATA,sha256=DJ7Ai_ZPSRlnDzoUw2QZdSzmza3RieZXwe1I1mgnJts,1464
8
+ zombie_squirrel-0.4.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
9
+ zombie_squirrel-0.4.4.dist-info/top_level.txt,sha256=FmM0coe4AangURZLjM4JwwRv2B8H6oINYCoZLKLDCKA,16
10
+ zombie_squirrel-0.4.4.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (80.9.0)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2022 Allen Institute for Neural Dynamics
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1 @@
1
+ zombie_squirrel