zombie-squirrel 0.5.0__py3-none-any.whl → 0.6.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,7 +3,7 @@
3
3
  Provides functions to fetch and cache project names, subject IDs, and asset
4
4
  metadata from the AIND metadata database with support for multiple backends."""
5
5
 
6
- __version__ = "0.5.0"
6
+ __version__ = "0.6.1"
7
7
 
8
8
  from zombie_squirrel.squirrels import ( # noqa: F401
9
9
  asset_basics,
zombie_squirrel/acorns.py CHANGED
@@ -1,13 +1,14 @@
1
1
  """Storage backend interfaces for caching data."""
2
2
 
3
+ import io
3
4
  import logging
4
- import os
5
5
  from abc import ABC, abstractmethod
6
6
 
7
+ import boto3
8
+ import duckdb
7
9
  import pandas as pd
8
- from aind_data_access_api.rds_tables import Client, RDSCredentials
9
10
 
10
- from zombie_squirrel.utils import prefix_table_name
11
+ from zombie_squirrel.utils import get_s3_cache_path, prefix_table_name
11
12
 
12
13
 
13
14
  class Acorn(ABC):
@@ -28,27 +29,54 @@ class Acorn(ABC):
28
29
  pass # pragma: no cover
29
30
 
30
31
 
31
- class RedshiftAcorn(Acorn):
32
- """Stores and retrieves caches using aind-data-access-api
33
- Redshift Client"""
32
+ class S3Acorn(Acorn):
33
+ """Stores and retrieves caches using AWS S3 with parquet files."""
34
34
 
35
35
  def __init__(self) -> None:
36
- """Initialize RedshiftAcorn with Redshift credentials."""
37
- REDSHIFT_SECRETS = os.getenv("REDSHIFT_SECRETS", "/aind/prod/redshift/credentials/readwrite")
38
- self.rds_client = Client(
39
- credentials=RDSCredentials(aws_secrets_name=REDSHIFT_SECRETS),
40
- )
36
+ """Initialize S3Acorn with S3 client."""
37
+ self.bucket = "aind-scratch-data"
38
+ self.s3_client = boto3.client("s3")
41
39
 
42
40
  def hide(self, table_name: str, data: pd.DataFrame) -> None:
43
- """Store DataFrame in Redshift table."""
44
- self.rds_client.overwrite_table_with_df(
45
- df=data,
46
- table_name=prefix_table_name(table_name),
41
+ """Store DataFrame as parquet file in S3."""
42
+ filename = prefix_table_name(table_name)
43
+ s3_key = get_s3_cache_path(filename)
44
+
45
+ # Convert DataFrame to parquet bytes
46
+ parquet_buffer = io.BytesIO()
47
+ data.to_parquet(parquet_buffer, index=False)
48
+ parquet_buffer.seek(0)
49
+
50
+ # Upload to S3
51
+ self.s3_client.put_object(
52
+ Bucket=self.bucket,
53
+ Key=s3_key,
54
+ Body=parquet_buffer.getvalue(),
47
55
  )
56
+ logging.info(f"Stored cache to S3: s3://{self.bucket}/{s3_key}")
48
57
 
49
58
  def scurry(self, table_name: str) -> pd.DataFrame:
50
- """Fetch DataFrame from Redshift table."""
51
- return self.rds_client.read_table(table_name=prefix_table_name(table_name))
59
+ """Fetch DataFrame from S3 parquet file."""
60
+ filename = prefix_table_name(table_name)
61
+ s3_key = get_s3_cache_path(filename)
62
+
63
+ try:
64
+ # Read directly from S3 using DuckDB
65
+ query = f"""
66
+ SELECT * FROM read_parquet(
67
+ 's3://{self.bucket}/{s3_key}'
68
+ )
69
+ """
70
+ result = duckdb.query(query).to_df()
71
+ logging.info(
72
+ f"Retrieved cache from S3: s3://{self.bucket}/{s3_key}"
73
+ )
74
+ return result
75
+ except Exception as e:
76
+ logging.warning(
77
+ f"Error fetching from cache {s3_key}: {e}"
78
+ )
79
+ return pd.DataFrame()
52
80
 
53
81
 
54
82
  class MemoryAcorn(Acorn):
@@ -66,16 +94,3 @@ class MemoryAcorn(Acorn):
66
94
  def scurry(self, table_name: str) -> pd.DataFrame:
67
95
  """Fetch DataFrame from memory."""
68
96
  return self._store.get(table_name, pd.DataFrame())
69
-
70
-
71
- def rds_get_handle_empty(acorn: Acorn, table_name: str) -> pd.DataFrame:
72
- """Helper for handling errors when loading from redshift, because
73
- there's no helper function"""
74
- try:
75
- logging.info(f"Fetching from cache: {table_name}")
76
- df = acorn.scurry(table_name)
77
- except Exception as e:
78
- logging.warning(f"Error fetching from cache: {e}")
79
- df = pd.DataFrame()
80
-
81
- return df
@@ -10,8 +10,7 @@ from aind_data_access_api.document_db import MetadataDbClient
10
10
 
11
11
  from zombie_squirrel.acorns import (
12
12
  MemoryAcorn,
13
- RedshiftAcorn,
14
- rds_get_handle_empty,
13
+ S3Acorn,
15
14
  )
16
15
 
17
16
  # --- Backend setup ---------------------------------------------------
@@ -20,9 +19,9 @@ API_GATEWAY_HOST = "api.allenneuraldynamics.org"
20
19
 
21
20
  tree_type = os.getenv("TREE_SPECIES", "memory").lower()
22
21
 
23
- if tree_type == "redshift": # pragma: no cover
24
- logging.info("Using Redshift acorn for caching")
25
- ACORN = RedshiftAcorn()
22
+ if tree_type == "s3": # pragma: no cover
23
+ logging.info("Using S3 acorn for caching")
24
+ ACORN = S3Acorn()
26
25
  else:
27
26
  logging.info("Using in-memory acorn for caching")
28
27
  ACORN = MemoryAcorn()
@@ -66,7 +65,7 @@ def unique_project_names(force_update: bool = False) -> list[str]:
66
65
 
67
66
  Returns:
68
67
  List of unique project names."""
69
- df = rds_get_handle_empty(ACORN, NAMES["upn"])
68
+ df = ACORN.scurry(NAMES["upn"])
70
69
 
71
70
  if df.empty or force_update:
72
71
  # If cache is missing, fetch data
@@ -99,7 +98,7 @@ def unique_subject_ids(force_update: bool = False) -> list[str]:
99
98
 
100
99
  Returns:
101
100
  List of unique subject IDs."""
102
- df = rds_get_handle_empty(ACORN, NAMES["usi"])
101
+ df = ACORN.scurry(NAMES["usi"])
103
102
 
104
103
  if df.empty or force_update:
105
104
  # If cache is missing, fetch data
@@ -134,7 +133,7 @@ def asset_basics(force_update: bool = False) -> pd.DataFrame:
134
133
 
135
134
  Returns:
136
135
  DataFrame with basic asset metadata."""
137
- df = rds_get_handle_empty(ACORN, NAMES["basics"])
136
+ df = ACORN.scurry(NAMES["basics"])
138
137
 
139
138
  FIELDS = [
140
139
  "data_description.modalities",
@@ -145,6 +144,7 @@ def asset_basics(force_update: bool = False) -> pd.DataFrame:
145
144
  "acquisition.acquisition_end_time",
146
145
  "processing.data_processes.start_date_time",
147
146
  "subject.subject_details.genotype",
147
+ "other_identifiers",
148
148
  "location",
149
149
  ]
150
150
 
@@ -160,6 +160,7 @@ def asset_basics(force_update: bool = False) -> pd.DataFrame:
160
160
  "subject_id",
161
161
  "acquisition_start_time",
162
162
  "acquisition_end_time",
163
+ "code_ocean",
163
164
  "process_date",
164
165
  "genotype",
165
166
  "location",
@@ -205,12 +206,21 @@ def asset_basics(force_update: bool = False) -> pd.DataFrame:
205
206
  modality_abbreviations_str = ", ".join(modality_abbreviations)
206
207
 
207
208
  # Get the process date, convert to YYYY-MM-DD if present
208
- process_datetime = record.get("processing", {}).get("data_processes", [{}])[-1].get("start_date_time", None)
209
- if process_datetime:
209
+ data_processes = record.get("processing", {}).get("data_processes", [])
210
+ if data_processes:
211
+ latest_process = data_processes[-1]
212
+ process_datetime = latest_process.get("start_date_time", None)
210
213
  process_date = process_datetime.split("T")[0]
211
214
  else:
212
215
  process_date = None
213
216
 
217
+ # Get the CO asset ID
218
+ other_identifiers = record.get("other_identifiers", {})
219
+ if other_identifiers:
220
+ code_ocean = other_identifiers.get("Code Ocean", None)
221
+ else:
222
+ code_ocean = None
223
+
214
224
  flat_record = {
215
225
  "_id": record["_id"],
216
226
  "_last_modified": record.get("_last_modified", None),
@@ -220,6 +230,7 @@ def asset_basics(force_update: bool = False) -> pd.DataFrame:
220
230
  "subject_id": record.get("subject", {}).get("subject_id", None),
221
231
  "acquisition_start_time": record.get("acquisition", {}).get("acquisition_start_time", None),
222
232
  "acquisition_end_time": record.get("acquisition", {}).get("acquisition_end_time", None),
233
+ "code_ocean": code_ocean,
223
234
  "process_date": process_date,
224
235
  "genotype": record.get("subject", {}).get("subject_details", {}).get("genotype", None),
225
236
  "location": record.get("location", None),
@@ -247,7 +258,7 @@ def source_data(force_update: bool = False) -> pd.DataFrame:
247
258
 
248
259
  Returns:
249
260
  DataFrame with _id and source_data columns."""
250
- df = rds_get_handle_empty(ACORN, NAMES["d2r"])
261
+ df = ACORN.scurry(NAMES["d2r"])
251
262
 
252
263
  if df.empty or force_update:
253
264
  logging.info("Updating cache for source data")
@@ -289,7 +300,7 @@ def raw_to_derived(force_update: bool = False) -> pd.DataFrame:
289
300
 
290
301
  Returns:
291
302
  DataFrame with _id and derived_records columns."""
292
- df = rds_get_handle_empty(ACORN, NAMES["r2d"])
303
+ df = ACORN.scurry(NAMES["r2d"])
293
304
 
294
305
  if df.empty or force_update:
295
306
  logging.info("Updating cache for raw to derived mapping")
zombie_squirrel/utils.py CHANGED
@@ -2,11 +2,22 @@
2
2
 
3
3
 
4
4
  def prefix_table_name(table_name: str) -> str:
5
- """Add zombie-squirrel prefix to table names.
5
+ """Add zombie-squirrel prefix and parquet extension to filenames.
6
6
 
7
7
  Args:
8
8
  table_name: The base table name.
9
9
 
10
10
  Returns:
11
- Table name with 'zs_' prefix."""
12
- return "zs_" + table_name
11
+ Filename with 'zs_' prefix and '.pqt' extension."""
12
+ return "zs_" + table_name + ".pqt"
13
+
14
+
15
+ def get_s3_cache_path(filename: str) -> str:
16
+ """Get the full S3 path for a cache file.
17
+
18
+ Args:
19
+ filename: The cache filename (e.g., "zs_unique_project_names.pqt").
20
+
21
+ Returns:
22
+ Full S3 path: application-caches/filename"""
23
+ return f"application-caches/{filename}"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: zombie-squirrel
3
- Version: 0.5.0
3
+ Version: 0.6.1
4
4
  Summary: Generated from aind-library-template
5
5
  Author: Allen Institute for Neural Dynamics
6
6
  License: MIT
@@ -8,7 +8,11 @@ Classifier: Programming Language :: Python :: 3
8
8
  Requires-Python: >=3.10
9
9
  Description-Content-Type: text/markdown
10
10
  License-File: LICENSE
11
- Requires-Dist: aind-data-access-api[docdb,rds]
11
+ Requires-Dist: duckdb
12
+ Requires-Dist: fastparquet
13
+ Requires-Dist: boto3
14
+ Requires-Dist: pandas
15
+ Requires-Dist: aind-data-access-api[docdb]
12
16
  Dynamic: license-file
13
17
 
14
18
  # zombie-squirrel
@@ -26,9 +30,6 @@ Dynamic: license-file
26
30
 
27
31
  ```bash
28
32
  pip install zombie-squirrel
29
-
30
- ```bash
31
- uv sync
32
33
  ```
33
34
 
34
35
  ## Usage
@@ -36,11 +37,10 @@ uv sync
36
37
  ### Set backend
37
38
 
38
39
  ```bash
39
- export REDSHIFT_SECRETS='/aind/prod/redshift/credentials/readwrite'
40
- export TREE_SPECIES='REDSHIFT'
40
+ export TREE_SPECIES='s3'
41
41
  ```
42
42
 
43
- Options are 'REDSHIFT', 'MEMORY'.
43
+ Options are 's3' and 'memory' (the value is case-insensitive; any other value falls back to the in-memory backend).
44
44
 
45
45
  ### Scurry (fetch) data
46
46
 
@@ -0,0 +1,10 @@
1
+ zombie_squirrel/__init__.py,sha256=v9cqU6rYDVeeVPFLDyhe6N28mEP2j7vXb8R98nwc--0,409
2
+ zombie_squirrel/acorns.py,sha256=mpinFacaN9BM6CvRy0M76JMb6n3oVPZLJxn8O4J9Wlw,2945
3
+ zombie_squirrel/squirrels.py,sha256=JWO0ihe7J3P9HbrSA4DHG3-o7EhM93WiwtxC2yVD0L0,12262
4
+ zombie_squirrel/sync.py,sha256=84Ta5beHiPuGBVzp9SCo7G1b4McTUohcUIf_TJbNIV8,518
5
+ zombie_squirrel/utils.py,sha256=kojQpHUKlRJD7WEZDfcpQIZTj9iUrtX5_6F-gWWzJW0,628
6
+ zombie_squirrel-0.6.1.dist-info/licenses/LICENSE,sha256=U0Y7B3gZJHXpjJVLgTQjM8e_c8w4JJpLgGhIdsoFR1Y,1092
7
+ zombie_squirrel-0.6.1.dist-info/METADATA,sha256=vywgegMbrLaSk7VdS_L3cT0PT9YD53nRA3o1mKUEVqc,1893
8
+ zombie_squirrel-0.6.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
9
+ zombie_squirrel-0.6.1.dist-info/top_level.txt,sha256=FmM0coe4AangURZLjM4JwwRv2B8H6oINYCoZLKLDCKA,16
10
+ zombie_squirrel-0.6.1.dist-info/RECORD,,
@@ -1,10 +0,0 @@
1
- zombie_squirrel/__init__.py,sha256=4zjJN8AP-RKQir_Wgvy0KqVUcV2REOE9kTHwrMyRV5U,409
2
- zombie_squirrel/acorns.py,sha256=4uBzYtYgW2oD5sOohNQUw4qfjmNjmAIK2RlL1Ge1Udo,2597
3
- zombie_squirrel/squirrels.py,sha256=BjRF7MhSJMK4BsBhHU5PzAdG6Rb7OloTMXUvrA7pZT8,11934
4
- zombie_squirrel/sync.py,sha256=84Ta5beHiPuGBVzp9SCo7G1b4McTUohcUIf_TJbNIV8,518
5
- zombie_squirrel/utils.py,sha256=woPxU4vYMUv-T0XOjV5ieViksU_q7It_n_5Ll4zpocA,289
6
- zombie_squirrel-0.5.0.dist-info/licenses/LICENSE,sha256=U0Y7B3gZJHXpjJVLgTQjM8e_c8w4JJpLgGhIdsoFR1Y,1092
7
- zombie_squirrel-0.5.0.dist-info/METADATA,sha256=WDnQXat1d6eC1-RkESW5yZ1lmu8vaHPYEM87SvBuB1g,1902
8
- zombie_squirrel-0.5.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
9
- zombie_squirrel-0.5.0.dist-info/top_level.txt,sha256=FmM0coe4AangURZLjM4JwwRv2B8H6oINYCoZLKLDCKA,16
10
- zombie_squirrel-0.5.0.dist-info/RECORD,,