zombie-squirrel 0.5.0-py3-none-any.whl → 0.5.2-py3-none-any.whl
This diff compares the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in the public registry.
- zombie_squirrel/__init__.py +1 -1
- zombie_squirrel/acorns.py +45 -30
- zombie_squirrel/squirrels.py +13 -12
- zombie_squirrel/utils.py +14 -3
- {zombie_squirrel-0.5.0.dist-info → zombie_squirrel-0.5.2.dist-info}/METADATA +7 -3
- zombie_squirrel-0.5.2.dist-info/RECORD +10 -0
- zombie_squirrel-0.5.0.dist-info/RECORD +0 -10
- {zombie_squirrel-0.5.0.dist-info → zombie_squirrel-0.5.2.dist-info}/WHEEL +0 -0
- {zombie_squirrel-0.5.0.dist-info → zombie_squirrel-0.5.2.dist-info}/licenses/LICENSE +0 -0
- {zombie_squirrel-0.5.0.dist-info → zombie_squirrel-0.5.2.dist-info}/top_level.txt +0 -0
zombie_squirrel/__init__.py CHANGED
@@ -3,7 +3,7 @@
 Provides functions to fetch and cache project names, subject IDs, and asset
 metadata from the AIND metadata database with support for multiple backends."""

-__version__ = "0.5.0"
+__version__ = "0.5.2"

 from zombie_squirrel.squirrels import (  # noqa: F401
     asset_basics,
zombie_squirrel/acorns.py CHANGED
@@ -1,13 +1,14 @@
 """Storage backend interfaces for caching data."""

+import io
 import logging
-import os
 from abc import ABC, abstractmethod

+import boto3
+import duckdb
 import pandas as pd
-from aind_data_access_api.rds_tables import Client, RDSCredentials

-from zombie_squirrel.utils import prefix_table_name
+from zombie_squirrel.utils import get_s3_cache_path, prefix_table_name


 class Acorn(ABC):
@@ -28,27 +29,54 @@ class Acorn(ABC):
         pass  # pragma: no cover


-class
-    """Stores and retrieves caches using
-    Redshift Client"""
+class S3Acorn(Acorn):
+    """Stores and retrieves caches using AWS S3 with parquet files."""

     def __init__(self) -> None:
-        """Initialize
-
-        self.
-            credentials=RDSCredentials(aws_secrets_name=REDSHIFT_SECRETS),
-        )
+        """Initialize S3Acorn with S3 client."""
+        self.bucket = "aind-scratch-data"
+        self.s3_client = boto3.client("s3")

     def hide(self, table_name: str, data: pd.DataFrame) -> None:
-        """Store DataFrame in
-
-
-
+        """Store DataFrame as parquet file in S3."""
+        filename = prefix_table_name(table_name)
+        s3_key = get_s3_cache_path(filename)
+
+        # Convert DataFrame to parquet bytes
+        parquet_buffer = io.BytesIO()
+        data.to_parquet(parquet_buffer, index=False)
+        parquet_buffer.seek(0)
+
+        # Upload to S3
+        self.s3_client.put_object(
+            Bucket=self.bucket,
+            Key=s3_key,
+            Body=parquet_buffer.getvalue(),
         )
+        logging.info(f"Stored cache to S3: s3://{self.bucket}/{s3_key}")

     def scurry(self, table_name: str) -> pd.DataFrame:
-        """Fetch DataFrame from
-
+        """Fetch DataFrame from S3 parquet file."""
+        filename = prefix_table_name(table_name)
+        s3_key = get_s3_cache_path(filename)
+
+        try:
+            # Read directly from S3 using DuckDB
+            query = f"""
+                SELECT * FROM read_parquet(
+                    's3://{self.bucket}/{s3_key}'
+                )
+            """
+            result = duckdb.query(query).to_df()
+            logging.info(
+                f"Retrieved cache from S3: s3://{self.bucket}/{s3_key}"
+            )
+            return result
+        except Exception as e:
+            logging.warning(
+                f"Error fetching from cache {s3_key}: {e}"
+            )
+            return pd.DataFrame()


 class MemoryAcorn(Acorn):
@@ -66,16 +94,3 @@ class MemoryAcorn(Acorn):
     def scurry(self, table_name: str) -> pd.DataFrame:
         """Fetch DataFrame from memory."""
         return self._store.get(table_name, pd.DataFrame())
-
-
-def rds_get_handle_empty(acorn: Acorn, table_name: str) -> pd.DataFrame:
-    """Helper for handling errors when loading from redshift, because
-    there's no helper function"""
-    try:
-        logging.info(f"Fetching from cache: {table_name}")
-        df = acorn.scurry(table_name)
-    except Exception as e:
-        logging.warning(f"Error fetching from cache: {e}")
-        df = pd.DataFrame()
-
-    return df
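For orientation, the new S3-backed cache above can be exercised end to end roughly as follows. This is a minimal sketch, not part of the package: the table name is hypothetical, and it assumes AWS credentials with access to the hard-coded aind-scratch-data bucket are available in the environment (DuckDB additionally needs its httpfs support to read s3:// URLs).

import pandas as pd

from zombie_squirrel.acorns import S3Acorn

# Hypothetical table name, used only for illustration.
acorn = S3Acorn()
df = pd.DataFrame({"project_name": ["Project A", "Project B"]})

# hide() writes the frame as application-caches/zs_example_table.pqt
# in the aind-scratch-data bucket via put_object().
acorn.hide("example_table", df)

# scurry() reads the same key back through DuckDB's read_parquet();
# on any failure it logs a warning and returns an empty DataFrame.
cached = acorn.scurry("example_table")
print(cached)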
zombie_squirrel/squirrels.py CHANGED
@@ -10,8 +10,7 @@ from aind_data_access_api.document_db import MetadataDbClient

 from zombie_squirrel.acorns import (
     MemoryAcorn,
-
-    rds_get_handle_empty,
+    S3Acorn,
 )

 # --- Backend setup ---------------------------------------------------
@@ -20,9 +19,9 @@ API_GATEWAY_HOST = "api.allenneuraldynamics.org"

 tree_type = os.getenv("TREE_SPECIES", "memory").lower()

-if tree_type == "
-    logging.info("Using
-    ACORN =
+if tree_type == "s3":  # pragma: no cover
+    logging.info("Using S3 acorn for caching")
+    ACORN = S3Acorn()
 else:
     logging.info("Using in-memory acorn for caching")
     ACORN = MemoryAcorn()
@@ -66,7 +65,7 @@ def unique_project_names(force_update: bool = False) -> list[str]:

     Returns:
         List of unique project names."""
-    df =
+    df = ACORN.scurry(NAMES["upn"])

     if df.empty or force_update:
         # If cache is missing, fetch data
@@ -99,7 +98,7 @@ def unique_subject_ids(force_update: bool = False) -> list[str]:

     Returns:
         List of unique subject IDs."""
-    df =
+    df = ACORN.scurry(NAMES["usi"])

     if df.empty or force_update:
         # If cache is missing, fetch data
@@ -134,7 +133,7 @@ def asset_basics(force_update: bool = False) -> pd.DataFrame:

     Returns:
         DataFrame with basic asset metadata."""
-    df =
+    df = ACORN.scurry(NAMES["basics"])

     FIELDS = [
         "data_description.modalities",
@@ -205,8 +204,10 @@ def asset_basics(force_update: bool = False) -> pd.DataFrame:
         modality_abbreviations_str = ", ".join(modality_abbreviations)

         # Get the process date, convert to YYYY-MM-DD if present
-
-        if
+        data_processes = record.get("processing", {}).get("data_processes", [])
+        if data_processes:
+            latest_process = data_processes[-1]
+            process_datetime = latest_process.get("start_date_time", None)
             process_date = process_datetime.split("T")[0]
         else:
             process_date = None
@@ -247,7 +248,7 @@ def source_data(force_update: bool = False) -> pd.DataFrame:

     Returns:
         DataFrame with _id and source_data columns."""
-    df =
+    df = ACORN.scurry(NAMES["d2r"])

     if df.empty or force_update:
         logging.info("Updating cache for source data")
@@ -289,7 +290,7 @@ def raw_to_derived(force_update: bool = False) -> pd.DataFrame:

     Returns:
         DataFrame with _id and derived_records columns."""
-    df =
+    df = ACORN.scurry(NAMES["r2d"])

     if df.empty or force_update:
         logging.info("Updating cache for raw to derived mapping")
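The backend is chosen once, at import time, from the TREE_SPECIES environment variable, so callers opt into the S3 cache before importing squirrels. A minimal sketch of that flow, assuming S3 access to the hard-coded bucket; any other value of TREE_SPECIES falls back to the in-memory cache:

import os

# Must be set before importing zombie_squirrel.squirrels, because ACORN
# is constructed at module import time; values other than "s3" use MemoryAcorn.
os.environ["TREE_SPECIES"] = "s3"

from zombie_squirrel.squirrels import unique_project_names, unique_subject_ids

projects = unique_project_names()                 # served from the cache if present
subjects = unique_subject_ids(force_update=True)  # refetch and rewrite the cache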
zombie_squirrel/utils.py CHANGED
@@ -2,11 +2,22 @@


 def prefix_table_name(table_name: str) -> str:
-    """Add zombie-squirrel prefix to
+    """Add zombie-squirrel prefix and parquet extension to filenames.

     Args:
         table_name: The base table name.

     Returns:
-
-    return "zs_" + table_name
+        Filename with 'zs_' prefix and '.pqt' extension."""
+    return "zs_" + table_name + ".pqt"
+
+
+def get_s3_cache_path(filename: str) -> str:
+    """Get the full S3 path for a cache file.
+
+    Args:
+        filename: The cache filename (e.g., "zs_unique_project_names.pqt").
+
+    Returns:
+        Full S3 path: application-caches/filename"""
+    return f"application-caches/{filename}"
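Put together, the two helpers above turn a table name into the object key that S3Acorn reads and writes; a quick illustration of the expected values:

from zombie_squirrel.utils import get_s3_cache_path, prefix_table_name

filename = prefix_table_name("unique_project_names")
# "zs_unique_project_names.pqt"

key = get_s3_cache_path(filename)
# "application-caches/zs_unique_project_names.pqt"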
{zombie_squirrel-0.5.0.dist-info → zombie_squirrel-0.5.2.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: zombie-squirrel
-Version: 0.5.0
+Version: 0.5.2
 Summary: Generated from aind-library-template
 Author: Allen Institute for Neural Dynamics
 License: MIT
@@ -8,7 +8,11 @@ Classifier: Programming Language :: Python :: 3
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist:
+Requires-Dist: duckdb
+Requires-Dist: fastparquet
+Requires-Dist: boto3
+Requires-Dist: pandas
+Requires-Dist: aind-data-access-api[docdb]
 Dynamic: license-file

 # zombie-squirrel
@@ -17,7 +21,7 @@ Dynamic: license-file
 
 [![semantic-release: angular](https://img.shields.io/badge/semantic--release-angular-e10079?logo=semantic-release)](https://github.com/semantic-release/semantic-release)
 
-![Python](https://img.shields.io/badge/python->=3.10-blue?logo=python
 

 <img src="zombie-squirrel_logo.png" width="400" alt="Logo (image from ChatGPT)">
zombie_squirrel-0.5.2.dist-info/RECORD ADDED
@@ -0,0 +1,10 @@
+zombie_squirrel/__init__.py,sha256=KZsMdWYM4SLPDVJh0hBmNbNouqd7-KFxEsMG-VB1-IU,409
+zombie_squirrel/acorns.py,sha256=mpinFacaN9BM6CvRy0M76JMb6n3oVPZLJxn8O4J9Wlw,2945
+zombie_squirrel/squirrels.py,sha256=PwLhJdpNZZCWIdqb3-MA3VRzCMf1tWwKcCxpTMbIjn0,11901
+zombie_squirrel/sync.py,sha256=84Ta5beHiPuGBVzp9SCo7G1b4McTUohcUIf_TJbNIV8,518
+zombie_squirrel/utils.py,sha256=kojQpHUKlRJD7WEZDfcpQIZTj9iUrtX5_6F-gWWzJW0,628
+zombie_squirrel-0.5.2.dist-info/licenses/LICENSE,sha256=U0Y7B3gZJHXpjJVLgTQjM8e_c8w4JJpLgGhIdsoFR1Y,1092
+zombie_squirrel-0.5.2.dist-info/METADATA,sha256=TPPXZYGlj_QbYlCu8d-ewMj0o6vETxmU3UdlKf8VvWE,1991
+zombie_squirrel-0.5.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+zombie_squirrel-0.5.2.dist-info/top_level.txt,sha256=FmM0coe4AangURZLjM4JwwRv2B8H6oINYCoZLKLDCKA,16
+zombie_squirrel-0.5.2.dist-info/RECORD,,
zombie_squirrel-0.5.0.dist-info/RECORD DELETED
@@ -1,10 +0,0 @@
-zombie_squirrel/__init__.py,sha256=4zjJN8AP-RKQir_Wgvy0KqVUcV2REOE9kTHwrMyRV5U,409
-zombie_squirrel/acorns.py,sha256=4uBzYtYgW2oD5sOohNQUw4qfjmNjmAIK2RlL1Ge1Udo,2597
-zombie_squirrel/squirrels.py,sha256=BjRF7MhSJMK4BsBhHU5PzAdG6Rb7OloTMXUvrA7pZT8,11934
-zombie_squirrel/sync.py,sha256=84Ta5beHiPuGBVzp9SCo7G1b4McTUohcUIf_TJbNIV8,518
-zombie_squirrel/utils.py,sha256=woPxU4vYMUv-T0XOjV5ieViksU_q7It_n_5Ll4zpocA,289
-zombie_squirrel-0.5.0.dist-info/licenses/LICENSE,sha256=U0Y7B3gZJHXpjJVLgTQjM8e_c8w4JJpLgGhIdsoFR1Y,1092
-zombie_squirrel-0.5.0.dist-info/METADATA,sha256=WDnQXat1d6eC1-RkESW5yZ1lmu8vaHPYEM87SvBuB1g,1902
-zombie_squirrel-0.5.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-zombie_squirrel-0.5.0.dist-info/top_level.txt,sha256=FmM0coe4AangURZLjM4JwwRv2B8H6oINYCoZLKLDCKA,16
-zombie_squirrel-0.5.0.dist-info/RECORD,,
{zombie_squirrel-0.5.0.dist-info → zombie_squirrel-0.5.2.dist-info}/WHEEL: file without changes
{zombie_squirrel-0.5.0.dist-info → zombie_squirrel-0.5.2.dist-info}/licenses/LICENSE: file without changes
{zombie_squirrel-0.5.0.dist-info → zombie_squirrel-0.5.2.dist-info}/top_level.txt: file without changes