zombie-squirrel 0.5.0__py3-none-any.whl → 0.6.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- zombie_squirrel/__init__.py +1 -1
- zombie_squirrel/acorns.py +45 -30
- zombie_squirrel/squirrels.py +23 -12
- zombie_squirrel/utils.py +14 -3
- {zombie_squirrel-0.5.0.dist-info → zombie_squirrel-0.6.1.dist-info}/METADATA +8 -8
- zombie_squirrel-0.6.1.dist-info/RECORD +10 -0
- zombie_squirrel-0.5.0.dist-info/RECORD +0 -10
- {zombie_squirrel-0.5.0.dist-info → zombie_squirrel-0.6.1.dist-info}/WHEEL +0 -0
- {zombie_squirrel-0.5.0.dist-info → zombie_squirrel-0.6.1.dist-info}/licenses/LICENSE +0 -0
- {zombie_squirrel-0.5.0.dist-info → zombie_squirrel-0.6.1.dist-info}/top_level.txt +0 -0
zombie_squirrel/__init__.py
CHANGED
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
Provides functions to fetch and cache project names, subject IDs, and asset
|
|
4
4
|
metadata from the AIND metadata database with support for multiple backends."""
|
|
5
5
|
|
|
6
|
-
__version__ = "0.5.0"
|
|
6
|
+
__version__ = "0.6.1"
|
|
7
7
|
|
|
8
8
|
from zombie_squirrel.squirrels import ( # noqa: F401
|
|
9
9
|
asset_basics,
|
zombie_squirrel/acorns.py
CHANGED
|
@@ -1,13 +1,14 @@
|
|
|
1
1
|
"""Storage backend interfaces for caching data."""
|
|
2
2
|
|
|
3
|
+
import io
|
|
3
4
|
import logging
|
|
4
|
-
import os
|
|
5
5
|
from abc import ABC, abstractmethod
|
|
6
6
|
|
|
7
|
+
import boto3
|
|
8
|
+
import duckdb
|
|
7
9
|
import pandas as pd
|
|
8
|
-
from aind_data_access_api.rds_tables import Client, RDSCredentials
|
|
9
10
|
|
|
10
|
-
from zombie_squirrel.utils import prefix_table_name
|
|
11
|
+
from zombie_squirrel.utils import get_s3_cache_path, prefix_table_name
|
|
11
12
|
|
|
12
13
|
|
|
13
14
|
class Acorn(ABC):
|
|
@@ -28,27 +29,54 @@ class Acorn(ABC):
|
|
|
28
29
|
pass # pragma: no cover
|
|
29
30
|
|
|
30
31
|
|
|
31
|
-
class
|
|
32
|
-
"""Stores and retrieves caches using
|
|
33
|
-
Redshift Client"""
|
|
32
|
+
class S3Acorn(Acorn):
|
|
33
|
+
"""Stores and retrieves caches using AWS S3 with parquet files."""
|
|
34
34
|
|
|
35
35
|
def __init__(self) -> None:
|
|
36
|
-
"""Initialize
|
|
37
|
-
|
|
38
|
-
self.
|
|
39
|
-
credentials=RDSCredentials(aws_secrets_name=REDSHIFT_SECRETS),
|
|
40
|
-
)
|
|
36
|
+
"""Initialize S3Acorn with S3 client."""
|
|
37
|
+
self.bucket = "aind-scratch-data"
|
|
38
|
+
self.s3_client = boto3.client("s3")
|
|
41
39
|
|
|
42
40
|
def hide(self, table_name: str, data: pd.DataFrame) -> None:
|
|
43
|
-
"""Store DataFrame in
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
41
|
+
"""Store DataFrame as parquet file in S3."""
|
|
42
|
+
filename = prefix_table_name(table_name)
|
|
43
|
+
s3_key = get_s3_cache_path(filename)
|
|
44
|
+
|
|
45
|
+
# Convert DataFrame to parquet bytes
|
|
46
|
+
parquet_buffer = io.BytesIO()
|
|
47
|
+
data.to_parquet(parquet_buffer, index=False)
|
|
48
|
+
parquet_buffer.seek(0)
|
|
49
|
+
|
|
50
|
+
# Upload to S3
|
|
51
|
+
self.s3_client.put_object(
|
|
52
|
+
Bucket=self.bucket,
|
|
53
|
+
Key=s3_key,
|
|
54
|
+
Body=parquet_buffer.getvalue(),
|
|
47
55
|
)
|
|
56
|
+
logging.info(f"Stored cache to S3: s3://{self.bucket}/{s3_key}")
|
|
48
57
|
|
|
49
58
|
def scurry(self, table_name: str) -> pd.DataFrame:
|
|
50
|
-
"""Fetch DataFrame from
|
|
51
|
-
|
|
59
|
+
"""Fetch DataFrame from S3 parquet file."""
|
|
60
|
+
filename = prefix_table_name(table_name)
|
|
61
|
+
s3_key = get_s3_cache_path(filename)
|
|
62
|
+
|
|
63
|
+
try:
|
|
64
|
+
# Read directly from S3 using DuckDB
|
|
65
|
+
query = f"""
|
|
66
|
+
SELECT * FROM read_parquet(
|
|
67
|
+
's3://{self.bucket}/{s3_key}'
|
|
68
|
+
)
|
|
69
|
+
"""
|
|
70
|
+
result = duckdb.query(query).to_df()
|
|
71
|
+
logging.info(
|
|
72
|
+
f"Retrieved cache from S3: s3://{self.bucket}/{s3_key}"
|
|
73
|
+
)
|
|
74
|
+
return result
|
|
75
|
+
except Exception as e:
|
|
76
|
+
logging.warning(
|
|
77
|
+
f"Error fetching from cache {s3_key}: {e}"
|
|
78
|
+
)
|
|
79
|
+
return pd.DataFrame()
|
|
52
80
|
|
|
53
81
|
|
|
54
82
|
class MemoryAcorn(Acorn):
|
|
@@ -66,16 +94,3 @@ class MemoryAcorn(Acorn):
|
|
|
66
94
|
def scurry(self, table_name: str) -> pd.DataFrame:
|
|
67
95
|
"""Fetch DataFrame from memory."""
|
|
68
96
|
return self._store.get(table_name, pd.DataFrame())
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
def rds_get_handle_empty(acorn: Acorn, table_name: str) -> pd.DataFrame:
|
|
72
|
-
"""Helper for handling errors when loading from redshift, because
|
|
73
|
-
there's no helper function"""
|
|
74
|
-
try:
|
|
75
|
-
logging.info(f"Fetching from cache: {table_name}")
|
|
76
|
-
df = acorn.scurry(table_name)
|
|
77
|
-
except Exception as e:
|
|
78
|
-
logging.warning(f"Error fetching from cache: {e}")
|
|
79
|
-
df = pd.DataFrame()
|
|
80
|
-
|
|
81
|
-
return df
|
zombie_squirrel/squirrels.py
CHANGED
|
@@ -10,8 +10,7 @@ from aind_data_access_api.document_db import MetadataDbClient
|
|
|
10
10
|
|
|
11
11
|
from zombie_squirrel.acorns import (
|
|
12
12
|
MemoryAcorn,
|
|
13
|
-
|
|
14
|
-
rds_get_handle_empty,
|
|
13
|
+
S3Acorn,
|
|
15
14
|
)
|
|
16
15
|
|
|
17
16
|
# --- Backend setup ---------------------------------------------------
|
|
@@ -20,9 +19,9 @@ API_GATEWAY_HOST = "api.allenneuraldynamics.org"
|
|
|
20
19
|
|
|
21
20
|
tree_type = os.getenv("TREE_SPECIES", "memory").lower()
|
|
22
21
|
|
|
23
|
-
if tree_type == "
|
|
24
|
-
logging.info("Using
|
|
25
|
-
ACORN =
|
|
22
|
+
if tree_type == "s3": # pragma: no cover
|
|
23
|
+
logging.info("Using S3 acorn for caching")
|
|
24
|
+
ACORN = S3Acorn()
|
|
26
25
|
else:
|
|
27
26
|
logging.info("Using in-memory acorn for caching")
|
|
28
27
|
ACORN = MemoryAcorn()
|
|
@@ -66,7 +65,7 @@ def unique_project_names(force_update: bool = False) -> list[str]:
|
|
|
66
65
|
|
|
67
66
|
Returns:
|
|
68
67
|
List of unique project names."""
|
|
69
|
-
df =
|
|
68
|
+
df = ACORN.scurry(NAMES["upn"])
|
|
70
69
|
|
|
71
70
|
if df.empty or force_update:
|
|
72
71
|
# If cache is missing, fetch data
|
|
@@ -99,7 +98,7 @@ def unique_subject_ids(force_update: bool = False) -> list[str]:
|
|
|
99
98
|
|
|
100
99
|
Returns:
|
|
101
100
|
List of unique subject IDs."""
|
|
102
|
-
df =
|
|
101
|
+
df = ACORN.scurry(NAMES["usi"])
|
|
103
102
|
|
|
104
103
|
if df.empty or force_update:
|
|
105
104
|
# If cache is missing, fetch data
|
|
@@ -134,7 +133,7 @@ def asset_basics(force_update: bool = False) -> pd.DataFrame:
|
|
|
134
133
|
|
|
135
134
|
Returns:
|
|
136
135
|
DataFrame with basic asset metadata."""
|
|
137
|
-
df =
|
|
136
|
+
df = ACORN.scurry(NAMES["basics"])
|
|
138
137
|
|
|
139
138
|
FIELDS = [
|
|
140
139
|
"data_description.modalities",
|
|
@@ -145,6 +144,7 @@ def asset_basics(force_update: bool = False) -> pd.DataFrame:
|
|
|
145
144
|
"acquisition.acquisition_end_time",
|
|
146
145
|
"processing.data_processes.start_date_time",
|
|
147
146
|
"subject.subject_details.genotype",
|
|
147
|
+
"other_identifiers",
|
|
148
148
|
"location",
|
|
149
149
|
]
|
|
150
150
|
|
|
@@ -160,6 +160,7 @@ def asset_basics(force_update: bool = False) -> pd.DataFrame:
|
|
|
160
160
|
"subject_id",
|
|
161
161
|
"acquisition_start_time",
|
|
162
162
|
"acquisition_end_time",
|
|
163
|
+
"code_ocean",
|
|
163
164
|
"process_date",
|
|
164
165
|
"genotype",
|
|
165
166
|
"location",
|
|
@@ -205,12 +206,21 @@ def asset_basics(force_update: bool = False) -> pd.DataFrame:
|
|
|
205
206
|
modality_abbreviations_str = ", ".join(modality_abbreviations)
|
|
206
207
|
|
|
207
208
|
# Get the process date, convert to YYYY-MM-DD if present
|
|
208
|
-
|
|
209
|
-
if
|
|
209
|
+
data_processes = record.get("processing", {}).get("data_processes", [])
|
|
210
|
+
if data_processes:
|
|
211
|
+
latest_process = data_processes[-1]
|
|
212
|
+
process_datetime = latest_process.get("start_date_time", None)
|
|
210
213
|
process_date = process_datetime.split("T")[0]
|
|
211
214
|
else:
|
|
212
215
|
process_date = None
|
|
213
216
|
|
|
217
|
+
# Get the CO asset ID
|
|
218
|
+
other_identifiers = record.get("other_identifiers", {})
|
|
219
|
+
if other_identifiers:
|
|
220
|
+
code_ocean = other_identifiers.get("Code Ocean", None)
|
|
221
|
+
else:
|
|
222
|
+
code_ocean = None
|
|
223
|
+
|
|
214
224
|
flat_record = {
|
|
215
225
|
"_id": record["_id"],
|
|
216
226
|
"_last_modified": record.get("_last_modified", None),
|
|
@@ -220,6 +230,7 @@ def asset_basics(force_update: bool = False) -> pd.DataFrame:
|
|
|
220
230
|
"subject_id": record.get("subject", {}).get("subject_id", None),
|
|
221
231
|
"acquisition_start_time": record.get("acquisition", {}).get("acquisition_start_time", None),
|
|
222
232
|
"acquisition_end_time": record.get("acquisition", {}).get("acquisition_end_time", None),
|
|
233
|
+
"code_ocean": code_ocean,
|
|
223
234
|
"process_date": process_date,
|
|
224
235
|
"genotype": record.get("subject", {}).get("subject_details", {}).get("genotype", None),
|
|
225
236
|
"location": record.get("location", None),
|
|
@@ -247,7 +258,7 @@ def source_data(force_update: bool = False) -> pd.DataFrame:
|
|
|
247
258
|
|
|
248
259
|
Returns:
|
|
249
260
|
DataFrame with _id and source_data columns."""
|
|
250
|
-
df =
|
|
261
|
+
df = ACORN.scurry(NAMES["d2r"])
|
|
251
262
|
|
|
252
263
|
if df.empty or force_update:
|
|
253
264
|
logging.info("Updating cache for source data")
|
|
@@ -289,7 +300,7 @@ def raw_to_derived(force_update: bool = False) -> pd.DataFrame:
|
|
|
289
300
|
|
|
290
301
|
Returns:
|
|
291
302
|
DataFrame with _id and derived_records columns."""
|
|
292
|
-
df =
|
|
303
|
+
df = ACORN.scurry(NAMES["r2d"])
|
|
293
304
|
|
|
294
305
|
if df.empty or force_update:
|
|
295
306
|
logging.info("Updating cache for raw to derived mapping")
|
zombie_squirrel/utils.py
CHANGED
|
@@ -2,11 +2,22 @@
|
|
|
2
2
|
|
|
3
3
|
|
|
4
4
|
def prefix_table_name(table_name: str) -> str:
|
|
5
|
-
"""Add zombie-squirrel prefix to
|
|
5
|
+
"""Add zombie-squirrel prefix and parquet extension to filenames.
|
|
6
6
|
|
|
7
7
|
Args:
|
|
8
8
|
table_name: The base table name.
|
|
9
9
|
|
|
10
10
|
Returns:
|
|
11
|
-
|
|
12
|
-
return "zs_" + table_name
|
|
11
|
+
Filename with 'zs_' prefix and '.pqt' extension."""
|
|
12
|
+
return "zs_" + table_name + ".pqt"
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def get_s3_cache_path(filename: str) -> str:
|
|
16
|
+
"""Get the full S3 path for a cache file.
|
|
17
|
+
|
|
18
|
+
Args:
|
|
19
|
+
filename: The cache filename (e.g., "zs_unique_project_names.pqt").
|
|
20
|
+
|
|
21
|
+
Returns:
|
|
22
|
+
Full S3 path: application-caches/filename"""
|
|
23
|
+
return f"application-caches/{filename}"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: zombie-squirrel
|
|
3
|
-
Version: 0.5.0
|
|
3
|
+
Version: 0.6.1
|
|
4
4
|
Summary: Generated from aind-library-template
|
|
5
5
|
Author: Allen Institute for Neural Dynamics
|
|
6
6
|
License: MIT
|
|
@@ -8,7 +8,11 @@ Classifier: Programming Language :: Python :: 3
|
|
|
8
8
|
Requires-Python: >=3.10
|
|
9
9
|
Description-Content-Type: text/markdown
|
|
10
10
|
License-File: LICENSE
|
|
11
|
-
Requires-Dist:
|
|
11
|
+
Requires-Dist: duckdb
|
|
12
|
+
Requires-Dist: fastparquet
|
|
13
|
+
Requires-Dist: boto3
|
|
14
|
+
Requires-Dist: pandas
|
|
15
|
+
Requires-Dist: aind-data-access-api[docdb]
|
|
12
16
|
Dynamic: license-file
|
|
13
17
|
|
|
14
18
|
# zombie-squirrel
|
|
@@ -26,9 +30,6 @@ Dynamic: license-file
|
|
|
26
30
|
|
|
27
31
|
```bash
|
|
28
32
|
pip install zombie-squirrel
|
|
29
|
-
|
|
30
|
-
```bash
|
|
31
|
-
uv sync
|
|
32
33
|
```
|
|
33
34
|
|
|
34
35
|
## Usage
|
|
@@ -36,11 +37,10 @@ uv sync
|
|
|
36
37
|
### Set backend
|
|
37
38
|
|
|
38
39
|
```bash
|
|
39
|
-
export
|
|
40
|
-
export TREE_SPECIES='REDSHIFT'
|
|
40
|
+
export TREE_SPECIES='s3'
|
|
41
41
|
```
|
|
42
42
|
|
|
43
|
-
Options are '
|
|
43
|
+
Options are 's3', 'MEMORY'.
|
|
44
44
|
|
|
45
45
|
### Scurry (fetch) data
|
|
46
46
|
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
zombie_squirrel/__init__.py,sha256=v9cqU6rYDVeeVPFLDyhe6N28mEP2j7vXb8R98nwc--0,409
|
|
2
|
+
zombie_squirrel/acorns.py,sha256=mpinFacaN9BM6CvRy0M76JMb6n3oVPZLJxn8O4J9Wlw,2945
|
|
3
|
+
zombie_squirrel/squirrels.py,sha256=JWO0ihe7J3P9HbrSA4DHG3-o7EhM93WiwtxC2yVD0L0,12262
|
|
4
|
+
zombie_squirrel/sync.py,sha256=84Ta5beHiPuGBVzp9SCo7G1b4McTUohcUIf_TJbNIV8,518
|
|
5
|
+
zombie_squirrel/utils.py,sha256=kojQpHUKlRJD7WEZDfcpQIZTj9iUrtX5_6F-gWWzJW0,628
|
|
6
|
+
zombie_squirrel-0.6.1.dist-info/licenses/LICENSE,sha256=U0Y7B3gZJHXpjJVLgTQjM8e_c8w4JJpLgGhIdsoFR1Y,1092
|
|
7
|
+
zombie_squirrel-0.6.1.dist-info/METADATA,sha256=vywgegMbrLaSk7VdS_L3cT0PT9YD53nRA3o1mKUEVqc,1893
|
|
8
|
+
zombie_squirrel-0.6.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
9
|
+
zombie_squirrel-0.6.1.dist-info/top_level.txt,sha256=FmM0coe4AangURZLjM4JwwRv2B8H6oINYCoZLKLDCKA,16
|
|
10
|
+
zombie_squirrel-0.6.1.dist-info/RECORD,,
|
|
@@ -1,10 +0,0 @@
|
|
|
1
|
-
zombie_squirrel/__init__.py,sha256=4zjJN8AP-RKQir_Wgvy0KqVUcV2REOE9kTHwrMyRV5U,409
|
|
2
|
-
zombie_squirrel/acorns.py,sha256=4uBzYtYgW2oD5sOohNQUw4qfjmNjmAIK2RlL1Ge1Udo,2597
|
|
3
|
-
zombie_squirrel/squirrels.py,sha256=BjRF7MhSJMK4BsBhHU5PzAdG6Rb7OloTMXUvrA7pZT8,11934
|
|
4
|
-
zombie_squirrel/sync.py,sha256=84Ta5beHiPuGBVzp9SCo7G1b4McTUohcUIf_TJbNIV8,518
|
|
5
|
-
zombie_squirrel/utils.py,sha256=woPxU4vYMUv-T0XOjV5ieViksU_q7It_n_5Ll4zpocA,289
|
|
6
|
-
zombie_squirrel-0.5.0.dist-info/licenses/LICENSE,sha256=U0Y7B3gZJHXpjJVLgTQjM8e_c8w4JJpLgGhIdsoFR1Y,1092
|
|
7
|
-
zombie_squirrel-0.5.0.dist-info/METADATA,sha256=WDnQXat1d6eC1-RkESW5yZ1lmu8vaHPYEM87SvBuB1g,1902
|
|
8
|
-
zombie_squirrel-0.5.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
9
|
-
zombie_squirrel-0.5.0.dist-info/top_level.txt,sha256=FmM0coe4AangURZLjM4JwwRv2B8H6oINYCoZLKLDCKA,16
|
|
10
|
-
zombie_squirrel-0.5.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|