zombie-squirrel 0.5.1__py3-none-any.whl → 0.5.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- zombie_squirrel/__init__.py +1 -1
- zombie_squirrel/acorns.py +45 -30
- zombie_squirrel/squirrels.py +9 -12
- zombie_squirrel/utils.py +14 -3
- {zombie_squirrel-0.5.1.dist-info → zombie_squirrel-0.5.3.dist-info}/METADATA +9 -6
- zombie_squirrel-0.5.3.dist-info/RECORD +10 -0
- zombie_squirrel-0.5.1.dist-info/RECORD +0 -10
- {zombie_squirrel-0.5.1.dist-info → zombie_squirrel-0.5.3.dist-info}/WHEEL +0 -0
- {zombie_squirrel-0.5.1.dist-info → zombie_squirrel-0.5.3.dist-info}/licenses/LICENSE +0 -0
- {zombie_squirrel-0.5.1.dist-info → zombie_squirrel-0.5.3.dist-info}/top_level.txt +0 -0
zombie_squirrel/__init__.py
CHANGED
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
Provides functions to fetch and cache project names, subject IDs, and asset
|
|
4
4
|
metadata from the AIND metadata database with support for multiple backends."""
|
|
5
5
|
|
|
6
|
-
__version__ = "0.5.1"
|
|
6
|
+
__version__ = "0.5.3"
|
|
7
7
|
|
|
8
8
|
from zombie_squirrel.squirrels import ( # noqa: F401
|
|
9
9
|
asset_basics,
|
zombie_squirrel/acorns.py
CHANGED
|
@@ -1,13 +1,14 @@
|
|
|
1
1
|
"""Storage backend interfaces for caching data."""
|
|
2
2
|
|
|
3
|
+
import io
|
|
3
4
|
import logging
|
|
4
|
-
import os
|
|
5
5
|
from abc import ABC, abstractmethod
|
|
6
6
|
|
|
7
|
+
import boto3
|
|
8
|
+
import duckdb
|
|
7
9
|
import pandas as pd
|
|
8
|
-
from aind_data_access_api.rds_tables import Client, RDSCredentials
|
|
9
10
|
|
|
10
|
-
from zombie_squirrel.utils import prefix_table_name
|
|
11
|
+
from zombie_squirrel.utils import get_s3_cache_path, prefix_table_name
|
|
11
12
|
|
|
12
13
|
|
|
13
14
|
class Acorn(ABC):
|
|
@@ -28,27 +29,54 @@ class Acorn(ABC):
|
|
|
28
29
|
pass # pragma: no cover
|
|
29
30
|
|
|
30
31
|
|
|
31
|
-
class
|
|
32
|
-
"""Stores and retrieves caches using
|
|
33
|
-
Redshift Client"""
|
|
32
|
+
class S3Acorn(Acorn):
|
|
33
|
+
"""Stores and retrieves caches using AWS S3 with parquet files."""
|
|
34
34
|
|
|
35
35
|
def __init__(self) -> None:
|
|
36
|
-
"""Initialize
|
|
37
|
-
|
|
38
|
-
self.
|
|
39
|
-
credentials=RDSCredentials(aws_secrets_name=REDSHIFT_SECRETS),
|
|
40
|
-
)
|
|
36
|
+
"""Initialize S3Acorn with S3 client."""
|
|
37
|
+
self.bucket = "aind-scratch-data"
|
|
38
|
+
self.s3_client = boto3.client("s3")
|
|
41
39
|
|
|
42
40
|
def hide(self, table_name: str, data: pd.DataFrame) -> None:
|
|
43
|
-
"""Store DataFrame in
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
41
|
+
"""Store DataFrame as parquet file in S3."""
|
|
42
|
+
filename = prefix_table_name(table_name)
|
|
43
|
+
s3_key = get_s3_cache_path(filename)
|
|
44
|
+
|
|
45
|
+
# Convert DataFrame to parquet bytes
|
|
46
|
+
parquet_buffer = io.BytesIO()
|
|
47
|
+
data.to_parquet(parquet_buffer, index=False)
|
|
48
|
+
parquet_buffer.seek(0)
|
|
49
|
+
|
|
50
|
+
# Upload to S3
|
|
51
|
+
self.s3_client.put_object(
|
|
52
|
+
Bucket=self.bucket,
|
|
53
|
+
Key=s3_key,
|
|
54
|
+
Body=parquet_buffer.getvalue(),
|
|
47
55
|
)
|
|
56
|
+
logging.info(f"Stored cache to S3: s3://{self.bucket}/{s3_key}")
|
|
48
57
|
|
|
49
58
|
def scurry(self, table_name: str) -> pd.DataFrame:
|
|
50
|
-
"""Fetch DataFrame from
|
|
51
|
-
|
|
59
|
+
"""Fetch DataFrame from S3 parquet file."""
|
|
60
|
+
filename = prefix_table_name(table_name)
|
|
61
|
+
s3_key = get_s3_cache_path(filename)
|
|
62
|
+
|
|
63
|
+
try:
|
|
64
|
+
# Read directly from S3 using DuckDB
|
|
65
|
+
query = f"""
|
|
66
|
+
SELECT * FROM read_parquet(
|
|
67
|
+
's3://{self.bucket}/{s3_key}'
|
|
68
|
+
)
|
|
69
|
+
"""
|
|
70
|
+
result = duckdb.query(query).to_df()
|
|
71
|
+
logging.info(
|
|
72
|
+
f"Retrieved cache from S3: s3://{self.bucket}/{s3_key}"
|
|
73
|
+
)
|
|
74
|
+
return result
|
|
75
|
+
except Exception as e:
|
|
76
|
+
logging.warning(
|
|
77
|
+
f"Error fetching from cache {s3_key}: {e}"
|
|
78
|
+
)
|
|
79
|
+
return pd.DataFrame()
|
|
52
80
|
|
|
53
81
|
|
|
54
82
|
class MemoryAcorn(Acorn):
|
|
@@ -66,16 +94,3 @@ class MemoryAcorn(Acorn):
|
|
|
66
94
|
def scurry(self, table_name: str) -> pd.DataFrame:
|
|
67
95
|
"""Fetch DataFrame from memory."""
|
|
68
96
|
return self._store.get(table_name, pd.DataFrame())
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
def rds_get_handle_empty(acorn: Acorn, table_name: str) -> pd.DataFrame:
|
|
72
|
-
"""Helper for handling errors when loading from redshift, because
|
|
73
|
-
there's no helper function"""
|
|
74
|
-
try:
|
|
75
|
-
logging.info(f"Fetching from cache: {table_name}")
|
|
76
|
-
df = acorn.scurry(table_name)
|
|
77
|
-
except Exception as e:
|
|
78
|
-
logging.warning(f"Error fetching from cache: {e}")
|
|
79
|
-
df = pd.DataFrame()
|
|
80
|
-
|
|
81
|
-
return df
|
zombie_squirrel/squirrels.py
CHANGED
|
@@ -10,8 +10,7 @@ from aind_data_access_api.document_db import MetadataDbClient
|
|
|
10
10
|
|
|
11
11
|
from zombie_squirrel.acorns import (
|
|
12
12
|
MemoryAcorn,
|
|
13
|
-
|
|
14
|
-
rds_get_handle_empty,
|
|
13
|
+
S3Acorn,
|
|
15
14
|
)
|
|
16
15
|
|
|
17
16
|
# --- Backend setup ---------------------------------------------------
|
|
@@ -20,9 +19,9 @@ API_GATEWAY_HOST = "api.allenneuraldynamics.org"
|
|
|
20
19
|
|
|
21
20
|
tree_type = os.getenv("TREE_SPECIES", "memory").lower()
|
|
22
21
|
|
|
23
|
-
if tree_type == "
|
|
24
|
-
logging.info("Using
|
|
25
|
-
ACORN =
|
|
22
|
+
if tree_type == "s3": # pragma: no cover
|
|
23
|
+
logging.info("Using S3 acorn for caching")
|
|
24
|
+
ACORN = S3Acorn()
|
|
26
25
|
else:
|
|
27
26
|
logging.info("Using in-memory acorn for caching")
|
|
28
27
|
ACORN = MemoryAcorn()
|
|
@@ -66,7 +65,7 @@ def unique_project_names(force_update: bool = False) -> list[str]:
|
|
|
66
65
|
|
|
67
66
|
Returns:
|
|
68
67
|
List of unique project names."""
|
|
69
|
-
df =
|
|
68
|
+
df = ACORN.scurry(NAMES["upn"])
|
|
70
69
|
|
|
71
70
|
if df.empty or force_update:
|
|
72
71
|
# If cache is missing, fetch data
|
|
@@ -99,7 +98,7 @@ def unique_subject_ids(force_update: bool = False) -> list[str]:
|
|
|
99
98
|
|
|
100
99
|
Returns:
|
|
101
100
|
List of unique subject IDs."""
|
|
102
|
-
df =
|
|
101
|
+
df = ACORN.scurry(NAMES["usi"])
|
|
103
102
|
|
|
104
103
|
if df.empty or force_update:
|
|
105
104
|
# If cache is missing, fetch data
|
|
@@ -134,7 +133,7 @@ def asset_basics(force_update: bool = False) -> pd.DataFrame:
|
|
|
134
133
|
|
|
135
134
|
Returns:
|
|
136
135
|
DataFrame with basic asset metadata."""
|
|
137
|
-
df =
|
|
136
|
+
df = ACORN.scurry(NAMES["basics"])
|
|
138
137
|
|
|
139
138
|
FIELDS = [
|
|
140
139
|
"data_description.modalities",
|
|
@@ -249,7 +248,7 @@ def source_data(force_update: bool = False) -> pd.DataFrame:
|
|
|
249
248
|
|
|
250
249
|
Returns:
|
|
251
250
|
DataFrame with _id and source_data columns."""
|
|
252
|
-
df =
|
|
251
|
+
df = ACORN.scurry(NAMES["d2r"])
|
|
253
252
|
|
|
254
253
|
if df.empty or force_update:
|
|
255
254
|
logging.info("Updating cache for source data")
|
|
@@ -291,7 +290,7 @@ def raw_to_derived(force_update: bool = False) -> pd.DataFrame:
|
|
|
291
290
|
|
|
292
291
|
Returns:
|
|
293
292
|
DataFrame with _id and derived_records columns."""
|
|
294
|
-
df =
|
|
293
|
+
df = ACORN.scurry(NAMES["r2d"])
|
|
295
294
|
|
|
296
295
|
if df.empty or force_update:
|
|
297
296
|
logging.info("Updating cache for raw to derived mapping")
|
|
@@ -319,8 +318,6 @@ def raw_to_derived(force_update: bool = False) -> pd.DataFrame:
|
|
|
319
318
|
raw_to_derived_map = {raw_id: [] for raw_id in raw_ids}
|
|
320
319
|
for derived_record in derived_records:
|
|
321
320
|
source_data_list = derived_record.get("data_description", {}).get("source_data", [])
|
|
322
|
-
if not source_data_list:
|
|
323
|
-
continue
|
|
324
321
|
derived_id = derived_record["_id"]
|
|
325
322
|
# Add this derived record to each raw record it depends on
|
|
326
323
|
for source_id in source_data_list:
|
zombie_squirrel/utils.py
CHANGED
|
@@ -2,11 +2,22 @@
|
|
|
2
2
|
|
|
3
3
|
|
|
4
4
|
def prefix_table_name(table_name: str) -> str:
|
|
5
|
-
"""Add zombie-squirrel prefix to
|
|
5
|
+
"""Add zombie-squirrel prefix and parquet extension to filenames.
|
|
6
6
|
|
|
7
7
|
Args:
|
|
8
8
|
table_name: The base table name.
|
|
9
9
|
|
|
10
10
|
Returns:
|
|
11
|
-
|
|
12
|
-
return "zs_" + table_name
|
|
11
|
+
Filename with 'zs_' prefix and '.pqt' extension."""
|
|
12
|
+
return "zs_" + table_name + ".pqt"
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def get_s3_cache_path(filename: str) -> str:
|
|
16
|
+
"""Get the full S3 path for a cache file.
|
|
17
|
+
|
|
18
|
+
Args:
|
|
19
|
+
filename: The cache filename (e.g., "zs_unique_project_names.pqt").
|
|
20
|
+
|
|
21
|
+
Returns:
|
|
22
|
+
Full S3 path: application-caches/filename"""
|
|
23
|
+
return f"application-caches/{filename}"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: zombie-squirrel
|
|
3
|
-
Version: 0.5.1
|
|
3
|
+
Version: 0.5.3
|
|
4
4
|
Summary: Generated from aind-library-template
|
|
5
5
|
Author: Allen Institute for Neural Dynamics
|
|
6
6
|
License: MIT
|
|
@@ -8,7 +8,11 @@ Classifier: Programming Language :: Python :: 3
|
|
|
8
8
|
Requires-Python: >=3.10
|
|
9
9
|
Description-Content-Type: text/markdown
|
|
10
10
|
License-File: LICENSE
|
|
11
|
-
Requires-Dist:
|
|
11
|
+
Requires-Dist: duckdb
|
|
12
|
+
Requires-Dist: fastparquet
|
|
13
|
+
Requires-Dist: boto3
|
|
14
|
+
Requires-Dist: pandas
|
|
15
|
+
Requires-Dist: aind-data-access-api[docdb]
|
|
12
16
|
Dynamic: license-file
|
|
13
17
|
|
|
14
18
|
# zombie-squirrel
|
|
@@ -17,7 +21,7 @@ Dynamic: license-file
|
|
|
17
21
|

|
|
18
22
|
[](https://github.com/semantic-release/semantic-release)
|
|
19
23
|

|
|
20
|
-

|
|
21
25
|

|
|
22
26
|
|
|
23
27
|
<img src="zombie-squirrel_logo.png" width="400" alt="Logo (image from ChatGPT)">
|
|
@@ -36,11 +40,10 @@ uv sync
|
|
|
36
40
|
### Set backend
|
|
37
41
|
|
|
38
42
|
```bash
|
|
39
|
-
export
|
|
40
|
-
export TREE_SPECIES='REDSHIFT'
|
|
43
|
+
export TREE_SPECIES='s3'
|
|
41
44
|
```
|
|
42
45
|
|
|
43
|
-
Options are '
|
|
46
|
+
Options are 's3', 'MEMORY'.
|
|
44
47
|
|
|
45
48
|
### Scurry (fetch) data
|
|
46
49
|
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
zombie_squirrel/__init__.py,sha256=Kc5hE79CUDqA0D2_ISxTgPYz9QyK1PaMwEvUbYh2wEE,409
|
|
2
|
+
zombie_squirrel/acorns.py,sha256=mpinFacaN9BM6CvRy0M76JMb6n3oVPZLJxn8O4J9Wlw,2945
|
|
3
|
+
zombie_squirrel/squirrels.py,sha256=PwLhJdpNZZCWIdqb3-MA3VRzCMf1tWwKcCxpTMbIjn0,11901
|
|
4
|
+
zombie_squirrel/sync.py,sha256=84Ta5beHiPuGBVzp9SCo7G1b4McTUohcUIf_TJbNIV8,518
|
|
5
|
+
zombie_squirrel/utils.py,sha256=kojQpHUKlRJD7WEZDfcpQIZTj9iUrtX5_6F-gWWzJW0,628
|
|
6
|
+
zombie_squirrel-0.5.3.dist-info/licenses/LICENSE,sha256=U0Y7B3gZJHXpjJVLgTQjM8e_c8w4JJpLgGhIdsoFR1Y,1092
|
|
7
|
+
zombie_squirrel-0.5.3.dist-info/METADATA,sha256=T3oVLz6gY4adCdyUdgJT0JA17ziOCU17AuCsrKHSVNw,1911
|
|
8
|
+
zombie_squirrel-0.5.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
9
|
+
zombie_squirrel-0.5.3.dist-info/top_level.txt,sha256=FmM0coe4AangURZLjM4JwwRv2B8H6oINYCoZLKLDCKA,16
|
|
10
|
+
zombie_squirrel-0.5.3.dist-info/RECORD,,
|
|
@@ -1,10 +0,0 @@
|
|
|
1
|
-
zombie_squirrel/__init__.py,sha256=ihoDdnYVX9vzrIOTRGU1EFj_vGu_6OgJHeOC_6bVyLQ,409
|
|
2
|
-
zombie_squirrel/acorns.py,sha256=4uBzYtYgW2oD5sOohNQUw4qfjmNjmAIK2RlL1Ge1Udo,2597
|
|
3
|
-
zombie_squirrel/squirrels.py,sha256=xFGckcm4hcmFVxQneqeloSZjoY6Qc2wvHT9Wws9L-Rk,12088
|
|
4
|
-
zombie_squirrel/sync.py,sha256=84Ta5beHiPuGBVzp9SCo7G1b4McTUohcUIf_TJbNIV8,518
|
|
5
|
-
zombie_squirrel/utils.py,sha256=woPxU4vYMUv-T0XOjV5ieViksU_q7It_n_5Ll4zpocA,289
|
|
6
|
-
zombie_squirrel-0.5.1.dist-info/licenses/LICENSE,sha256=U0Y7B3gZJHXpjJVLgTQjM8e_c8w4JJpLgGhIdsoFR1Y,1092
|
|
7
|
-
zombie_squirrel-0.5.1.dist-info/METADATA,sha256=CgLCs4WxISN0rMMXSCw0H-E90W8V87bXErZbRbZ26XU,1902
|
|
8
|
-
zombie_squirrel-0.5.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
9
|
-
zombie_squirrel-0.5.1.dist-info/top_level.txt,sha256=FmM0coe4AangURZLjM4JwwRv2B8H6oINYCoZLKLDCKA,16
|
|
10
|
-
zombie_squirrel-0.5.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|