zombie-squirrel 0.5.0__py3-none-any.whl → 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,7 +3,7 @@
3
3
  Provides functions to fetch and cache project names, subject IDs, and asset
4
4
  metadata from the AIND metadata database with support for multiple backends."""
5
5
 
6
- __version__ = "0.5.0"
6
+ __version__ = "0.5.2"
7
7
 
8
8
  from zombie_squirrel.squirrels import ( # noqa: F401
9
9
  asset_basics,
zombie_squirrel/acorns.py CHANGED
@@ -1,13 +1,14 @@
1
1
  """Storage backend interfaces for caching data."""
2
2
 
3
+ import io
3
4
  import logging
4
- import os
5
5
  from abc import ABC, abstractmethod
6
6
 
7
+ import boto3
8
+ import duckdb
7
9
  import pandas as pd
8
- from aind_data_access_api.rds_tables import Client, RDSCredentials
9
10
 
10
- from zombie_squirrel.utils import prefix_table_name
11
+ from zombie_squirrel.utils import get_s3_cache_path, prefix_table_name
11
12
 
12
13
 
13
14
  class Acorn(ABC):
@@ -28,27 +29,54 @@ class Acorn(ABC):
28
29
  pass # pragma: no cover
29
30
 
30
31
 
31
- class RedshiftAcorn(Acorn):
32
- """Stores and retrieves caches using aind-data-access-api
33
- Redshift Client"""
32
+ class S3Acorn(Acorn):
33
+ """Stores and retrieves caches using AWS S3 with parquet files."""
34
34
 
35
35
  def __init__(self) -> None:
36
- """Initialize RedshiftAcorn with Redshift credentials."""
37
- REDSHIFT_SECRETS = os.getenv("REDSHIFT_SECRETS", "/aind/prod/redshift/credentials/readwrite")
38
- self.rds_client = Client(
39
- credentials=RDSCredentials(aws_secrets_name=REDSHIFT_SECRETS),
40
- )
36
+ """Initialize S3Acorn with S3 client."""
37
+ self.bucket = "aind-scratch-data"
38
+ self.s3_client = boto3.client("s3")
41
39
 
42
40
  def hide(self, table_name: str, data: pd.DataFrame) -> None:
43
- """Store DataFrame in Redshift table."""
44
- self.rds_client.overwrite_table_with_df(
45
- df=data,
46
- table_name=prefix_table_name(table_name),
41
+ """Store DataFrame as parquet file in S3."""
42
+ filename = prefix_table_name(table_name)
43
+ s3_key = get_s3_cache_path(filename)
44
+
45
+ # Convert DataFrame to parquet bytes
46
+ parquet_buffer = io.BytesIO()
47
+ data.to_parquet(parquet_buffer, index=False)
48
+ parquet_buffer.seek(0)
49
+
50
+ # Upload to S3
51
+ self.s3_client.put_object(
52
+ Bucket=self.bucket,
53
+ Key=s3_key,
54
+ Body=parquet_buffer.getvalue(),
47
55
  )
56
+ logging.info(f"Stored cache to S3: s3://{self.bucket}/{s3_key}")
48
57
 
49
58
  def scurry(self, table_name: str) -> pd.DataFrame:
50
- """Fetch DataFrame from Redshift table."""
51
- return self.rds_client.read_table(table_name=prefix_table_name(table_name))
59
+ """Fetch DataFrame from S3 parquet file."""
60
+ filename = prefix_table_name(table_name)
61
+ s3_key = get_s3_cache_path(filename)
62
+
63
+ try:
64
+ # Read directly from S3 using DuckDB
65
+ query = f"""
66
+ SELECT * FROM read_parquet(
67
+ 's3://{self.bucket}/{s3_key}'
68
+ )
69
+ """
70
+ result = duckdb.query(query).to_df()
71
+ logging.info(
72
+ f"Retrieved cache from S3: s3://{self.bucket}/{s3_key}"
73
+ )
74
+ return result
75
+ except Exception as e:
76
+ logging.warning(
77
+ f"Error fetching from cache {s3_key}: {e}"
78
+ )
79
+ return pd.DataFrame()
52
80
 
53
81
 
54
82
  class MemoryAcorn(Acorn):
@@ -66,16 +94,3 @@ class MemoryAcorn(Acorn):
66
94
  def scurry(self, table_name: str) -> pd.DataFrame:
67
95
  """Fetch DataFrame from memory."""
68
96
  return self._store.get(table_name, pd.DataFrame())
69
-
70
-
71
- def rds_get_handle_empty(acorn: Acorn, table_name: str) -> pd.DataFrame:
72
- """Helper for handling errors when loading from redshift, because
73
- there's no helper function"""
74
- try:
75
- logging.info(f"Fetching from cache: {table_name}")
76
- df = acorn.scurry(table_name)
77
- except Exception as e:
78
- logging.warning(f"Error fetching from cache: {e}")
79
- df = pd.DataFrame()
80
-
81
- return df
@@ -10,8 +10,7 @@ from aind_data_access_api.document_db import MetadataDbClient
10
10
 
11
11
  from zombie_squirrel.acorns import (
12
12
  MemoryAcorn,
13
- RedshiftAcorn,
14
- rds_get_handle_empty,
13
+ S3Acorn,
15
14
  )
16
15
 
17
16
  # --- Backend setup ---------------------------------------------------
@@ -20,9 +19,9 @@ API_GATEWAY_HOST = "api.allenneuraldynamics.org"
20
19
 
21
20
  tree_type = os.getenv("TREE_SPECIES", "memory").lower()
22
21
 
23
- if tree_type == "redshift": # pragma: no cover
24
- logging.info("Using Redshift acorn for caching")
25
- ACORN = RedshiftAcorn()
22
+ if tree_type == "s3": # pragma: no cover
23
+ logging.info("Using S3 acorn for caching")
24
+ ACORN = S3Acorn()
26
25
  else:
27
26
  logging.info("Using in-memory acorn for caching")
28
27
  ACORN = MemoryAcorn()
@@ -66,7 +65,7 @@ def unique_project_names(force_update: bool = False) -> list[str]:
66
65
 
67
66
  Returns:
68
67
  List of unique project names."""
69
- df = rds_get_handle_empty(ACORN, NAMES["upn"])
68
+ df = ACORN.scurry(NAMES["upn"])
70
69
 
71
70
  if df.empty or force_update:
72
71
  # If cache is missing, fetch data
@@ -99,7 +98,7 @@ def unique_subject_ids(force_update: bool = False) -> list[str]:
99
98
 
100
99
  Returns:
101
100
  List of unique subject IDs."""
102
- df = rds_get_handle_empty(ACORN, NAMES["usi"])
101
+ df = ACORN.scurry(NAMES["usi"])
103
102
 
104
103
  if df.empty or force_update:
105
104
  # If cache is missing, fetch data
@@ -134,7 +133,7 @@ def asset_basics(force_update: bool = False) -> pd.DataFrame:
134
133
 
135
134
  Returns:
136
135
  DataFrame with basic asset metadata."""
137
- df = rds_get_handle_empty(ACORN, NAMES["basics"])
136
+ df = ACORN.scurry(NAMES["basics"])
138
137
 
139
138
  FIELDS = [
140
139
  "data_description.modalities",
@@ -205,8 +204,10 @@ def asset_basics(force_update: bool = False) -> pd.DataFrame:
205
204
  modality_abbreviations_str = ", ".join(modality_abbreviations)
206
205
 
207
206
  # Get the process date, convert to YYYY-MM-DD if present
208
- process_datetime = record.get("processing", {}).get("data_processes", [{}])[-1].get("start_date_time", None)
209
- if process_datetime:
207
+ data_processes = record.get("processing", {}).get("data_processes", [])
208
+ if data_processes:
209
+ latest_process = data_processes[-1]
210
+ process_datetime = latest_process.get("start_date_time", None)
210
211
  process_date = process_datetime.split("T")[0]
211
212
  else:
212
213
  process_date = None
@@ -247,7 +248,7 @@ def source_data(force_update: bool = False) -> pd.DataFrame:
247
248
 
248
249
  Returns:
249
250
  DataFrame with _id and source_data columns."""
250
- df = rds_get_handle_empty(ACORN, NAMES["d2r"])
251
+ df = ACORN.scurry(NAMES["d2r"])
251
252
 
252
253
  if df.empty or force_update:
253
254
  logging.info("Updating cache for source data")
@@ -289,7 +290,7 @@ def raw_to_derived(force_update: bool = False) -> pd.DataFrame:
289
290
 
290
291
  Returns:
291
292
  DataFrame with _id and derived_records columns."""
292
- df = rds_get_handle_empty(ACORN, NAMES["r2d"])
293
+ df = ACORN.scurry(NAMES["r2d"])
293
294
 
294
295
  if df.empty or force_update:
295
296
  logging.info("Updating cache for raw to derived mapping")
zombie_squirrel/utils.py CHANGED
@@ -2,11 +2,22 @@
2
2
 
3
3
 
4
4
  def prefix_table_name(table_name: str) -> str:
5
- """Add zombie-squirrel prefix to table names.
5
+ """Add zombie-squirrel prefix and parquet extension to filenames.
6
6
 
7
7
  Args:
8
8
  table_name: The base table name.
9
9
 
10
10
  Returns:
11
- Table name with 'zs_' prefix."""
12
- return "zs_" + table_name
11
+ Filename with 'zs_' prefix and '.pqt' extension."""
12
+ return "zs_" + table_name + ".pqt"
13
+
14
+
15
+ def get_s3_cache_path(filename: str) -> str:
16
+ """Get the full S3 path for a cache file.
17
+
18
+ Args:
19
+ filename: The cache filename (e.g., "zs_unique_project_names.pqt").
20
+
21
+ Returns:
22
+ Full S3 path: application-caches/filename"""
23
+ return f"application-caches/{filename}"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: zombie-squirrel
3
- Version: 0.5.0
3
+ Version: 0.5.2
4
4
  Summary: Generated from aind-library-template
5
5
  Author: Allen Institute for Neural Dynamics
6
6
  License: MIT
@@ -8,7 +8,11 @@ Classifier: Programming Language :: Python :: 3
8
8
  Requires-Python: >=3.10
9
9
  Description-Content-Type: text/markdown
10
10
  License-File: LICENSE
11
- Requires-Dist: aind-data-access-api[docdb,rds]
11
+ Requires-Dist: duckdb
12
+ Requires-Dist: fastparquet
13
+ Requires-Dist: boto3
14
+ Requires-Dist: pandas
15
+ Requires-Dist: aind-data-access-api[docdb]
12
16
  Dynamic: license-file
13
17
 
14
18
  # zombie-squirrel
@@ -17,7 +21,7 @@ Dynamic: license-file
17
21
  ![Code Style](https://img.shields.io/badge/code%20style-black-black)
18
22
  [![semantic-release: angular](https://img.shields.io/badge/semantic--release-angular-e10079?logo=semantic-release)](https://github.com/semantic-release/semantic-release)
19
23
  ![Interrogate](https://img.shields.io/badge/interrogate-100.0%25-brightgreen)
20
- ![Coverage](https://img.shields.io/badge/coverage-99%25-brightgreen)
24
+ ![Coverage](https://img.shields.io/badge/coverage-100%25-brightgreen)
21
25
  ![Python](https://img.shields.io/badge/python->=3.10-blue?logo=python)
22
26
 
23
27
  <img src="zombie-squirrel_logo.png" width="400" alt="Logo (image from ChatGPT)">
@@ -0,0 +1,10 @@
1
+ zombie_squirrel/__init__.py,sha256=KZsMdWYM4SLPDVJh0hBmNbNouqd7-KFxEsMG-VB1-IU,409
2
+ zombie_squirrel/acorns.py,sha256=mpinFacaN9BM6CvRy0M76JMb6n3oVPZLJxn8O4J9Wlw,2945
3
+ zombie_squirrel/squirrels.py,sha256=PwLhJdpNZZCWIdqb3-MA3VRzCMf1tWwKcCxpTMbIjn0,11901
4
+ zombie_squirrel/sync.py,sha256=84Ta5beHiPuGBVzp9SCo7G1b4McTUohcUIf_TJbNIV8,518
5
+ zombie_squirrel/utils.py,sha256=kojQpHUKlRJD7WEZDfcpQIZTj9iUrtX5_6F-gWWzJW0,628
6
+ zombie_squirrel-0.5.2.dist-info/licenses/LICENSE,sha256=U0Y7B3gZJHXpjJVLgTQjM8e_c8w4JJpLgGhIdsoFR1Y,1092
7
+ zombie_squirrel-0.5.2.dist-info/METADATA,sha256=TPPXZYGlj_QbYlCu8d-ewMj0o6vETxmU3UdlKf8VvWE,1991
8
+ zombie_squirrel-0.5.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
9
+ zombie_squirrel-0.5.2.dist-info/top_level.txt,sha256=FmM0coe4AangURZLjM4JwwRv2B8H6oINYCoZLKLDCKA,16
10
+ zombie_squirrel-0.5.2.dist-info/RECORD,,
@@ -1,10 +0,0 @@
1
- zombie_squirrel/__init__.py,sha256=4zjJN8AP-RKQir_Wgvy0KqVUcV2REOE9kTHwrMyRV5U,409
2
- zombie_squirrel/acorns.py,sha256=4uBzYtYgW2oD5sOohNQUw4qfjmNjmAIK2RlL1Ge1Udo,2597
3
- zombie_squirrel/squirrels.py,sha256=BjRF7MhSJMK4BsBhHU5PzAdG6Rb7OloTMXUvrA7pZT8,11934
4
- zombie_squirrel/sync.py,sha256=84Ta5beHiPuGBVzp9SCo7G1b4McTUohcUIf_TJbNIV8,518
5
- zombie_squirrel/utils.py,sha256=woPxU4vYMUv-T0XOjV5ieViksU_q7It_n_5Ll4zpocA,289
6
- zombie_squirrel-0.5.0.dist-info/licenses/LICENSE,sha256=U0Y7B3gZJHXpjJVLgTQjM8e_c8w4JJpLgGhIdsoFR1Y,1092
7
- zombie_squirrel-0.5.0.dist-info/METADATA,sha256=WDnQXat1d6eC1-RkESW5yZ1lmu8vaHPYEM87SvBuB1g,1902
8
- zombie_squirrel-0.5.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
9
- zombie_squirrel-0.5.0.dist-info/top_level.txt,sha256=FmM0coe4AangURZLjM4JwwRv2B8H6oINYCoZLKLDCKA,16
10
- zombie_squirrel-0.5.0.dist-info/RECORD,,