zombie-squirrel 0.5.1__tar.gz → 0.5.3__tar.gz

This diff shows the differences between the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
Files changed (25)
  1. {zombie_squirrel-0.5.1/src/zombie_squirrel.egg-info → zombie_squirrel-0.5.3}/PKG-INFO +9 -6
  2. {zombie_squirrel-0.5.1 → zombie_squirrel-0.5.3}/README.md +3 -4
  3. {zombie_squirrel-0.5.1 → zombie_squirrel-0.5.3}/pyproject.toml +5 -1
  4. {zombie_squirrel-0.5.1 → zombie_squirrel-0.5.3}/src/zombie_squirrel/__init__.py +1 -1
  5. zombie_squirrel-0.5.3/src/zombie_squirrel/acorns.py +96 -0
  6. {zombie_squirrel-0.5.1 → zombie_squirrel-0.5.3}/src/zombie_squirrel/squirrels.py +9 -12
  7. zombie_squirrel-0.5.3/src/zombie_squirrel/utils.py +23 -0
  8. {zombie_squirrel-0.5.1 → zombie_squirrel-0.5.3/src/zombie_squirrel.egg-info}/PKG-INFO +9 -6
  9. zombie_squirrel-0.5.3/src/zombie_squirrel.egg-info/requires.txt +5 -0
  10. zombie_squirrel-0.5.3/tests/test_acorns.py +184 -0
  11. {zombie_squirrel-0.5.1 → zombie_squirrel-0.5.3}/tests/test_squirrels.py +41 -1
  12. zombie_squirrel-0.5.3/tests/test_utils.py +54 -0
  13. zombie_squirrel-0.5.1/src/zombie_squirrel/acorns.py +0 -81
  14. zombie_squirrel-0.5.1/src/zombie_squirrel/utils.py +0 -12
  15. zombie_squirrel-0.5.1/src/zombie_squirrel.egg-info/requires.txt +0 -1
  16. zombie_squirrel-0.5.1/tests/test_acorns.py +0 -217
  17. zombie_squirrel-0.5.1/tests/test_utils.py +0 -40
  18. {zombie_squirrel-0.5.1 → zombie_squirrel-0.5.3}/LICENSE +0 -0
  19. {zombie_squirrel-0.5.1 → zombie_squirrel-0.5.3}/setup.cfg +0 -0
  20. {zombie_squirrel-0.5.1 → zombie_squirrel-0.5.3}/setup.py +0 -0
  21. {zombie_squirrel-0.5.1 → zombie_squirrel-0.5.3}/src/zombie_squirrel/sync.py +0 -0
  22. {zombie_squirrel-0.5.1 → zombie_squirrel-0.5.3}/src/zombie_squirrel.egg-info/SOURCES.txt +0 -0
  23. {zombie_squirrel-0.5.1 → zombie_squirrel-0.5.3}/src/zombie_squirrel.egg-info/dependency_links.txt +0 -0
  24. {zombie_squirrel-0.5.1 → zombie_squirrel-0.5.3}/src/zombie_squirrel.egg-info/top_level.txt +0 -0
  25. {zombie_squirrel-0.5.1 → zombie_squirrel-0.5.3}/tests/test_sync.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: zombie-squirrel
3
- Version: 0.5.1
3
+ Version: 0.5.3
4
4
  Summary: Generated from aind-library-template
5
5
  Author: Allen Institute for Neural Dynamics
6
6
  License: MIT
@@ -8,7 +8,11 @@ Classifier: Programming Language :: Python :: 3
8
8
  Requires-Python: >=3.10
9
9
  Description-Content-Type: text/markdown
10
10
  License-File: LICENSE
11
- Requires-Dist: aind-data-access-api[docdb,rds]
11
+ Requires-Dist: duckdb
12
+ Requires-Dist: fastparquet
13
+ Requires-Dist: boto3
14
+ Requires-Dist: pandas
15
+ Requires-Dist: aind-data-access-api[docdb]
12
16
  Dynamic: license-file
13
17
 
14
18
  # zombie-squirrel
@@ -17,7 +21,7 @@ Dynamic: license-file
17
21
  ![Code Style](https://img.shields.io/badge/code%20style-black-black)
18
22
  [![semantic-release: angular](https://img.shields.io/badge/semantic--release-angular-e10079?logo=semantic-release)](https://github.com/semantic-release/semantic-release)
19
23
  ![Interrogate](https://img.shields.io/badge/interrogate-100.0%25-brightgreen)
20
- ![Coverage](https://img.shields.io/badge/coverage-99%25-brightgreen)
24
+ ![Coverage](https://img.shields.io/badge/coverage-100%25-brightgreen)
21
25
  ![Python](https://img.shields.io/badge/python->=3.10-blue?logo=python)
22
26
 
23
27
  <img src="zombie-squirrel_logo.png" width="400" alt="Logo (image from ChatGPT)">
@@ -36,11 +40,10 @@ uv sync
36
40
  ### Set backend
37
41
 
38
42
  ```bash
39
- export REDSHIFT_SECRETS='/aind/prod/redshift/credentials/readwrite'
40
- export TREE_SPECIES='REDSHIFT'
43
+ export TREE_SPECIES='s3'
41
44
  ```
42
45
 
43
- Options are 'REDSHIFT', 'MEMORY'.
46
+ Options are 's3', 'MEMORY'.
44
47
 
45
48
  ### Scurry (fetch) data
46
49
 
@@ -4,7 +4,7 @@
4
4
  ![Code Style](https://img.shields.io/badge/code%20style-black-black)
5
5
  [![semantic-release: angular](https://img.shields.io/badge/semantic--release-angular-e10079?logo=semantic-release)](https://github.com/semantic-release/semantic-release)
6
6
  ![Interrogate](https://img.shields.io/badge/interrogate-100.0%25-brightgreen)
7
- ![Coverage](https://img.shields.io/badge/coverage-99%25-brightgreen)
7
+ ![Coverage](https://img.shields.io/badge/coverage-100%25-brightgreen)
8
8
  ![Python](https://img.shields.io/badge/python->=3.10-blue?logo=python)
9
9
 
10
10
  <img src="zombie-squirrel_logo.png" width="400" alt="Logo (image from ChatGPT)">
@@ -23,11 +23,10 @@ uv sync
23
23
  ### Set backend
24
24
 
25
25
  ```bash
26
- export REDSHIFT_SECRETS='/aind/prod/redshift/credentials/readwrite'
27
- export TREE_SPECIES='REDSHIFT'
26
+ export TREE_SPECIES='s3'
28
27
  ```
29
28
 
30
- Options are 'REDSHIFT', 'MEMORY'.
29
+ Options are 's3', 'MEMORY'.
31
30
 
32
31
  ### Scurry (fetch) data
33
32
 
@@ -17,7 +17,11 @@ readme = "README.md"
17
17
  dynamic = ["version"]
18
18
 
19
19
  dependencies = [
20
- 'aind-data-access-api[docdb,rds]',
20
+ 'duckdb',
21
+ 'fastparquet',
22
+ 'boto3',
23
+ 'pandas',
24
+ 'aind-data-access-api[docdb]',
21
25
  ]
22
26
 
23
27
  [dependency-groups]
@@ -3,7 +3,7 @@
3
3
  Provides functions to fetch and cache project names, subject IDs, and asset
4
4
  metadata from the AIND metadata database with support for multiple backends."""
5
5
 
6
- __version__ = "0.5.1"
6
+ __version__ = "0.5.3"
7
7
 
8
8
  from zombie_squirrel.squirrels import ( # noqa: F401
9
9
  asset_basics,
@@ -0,0 +1,96 @@
1
+ """Storage backend interfaces for caching data."""
2
+
3
+ import io
4
+ import logging
5
+ from abc import ABC, abstractmethod
6
+
7
+ import boto3
8
+ import duckdb
9
+ import pandas as pd
10
+
11
+ from zombie_squirrel.utils import get_s3_cache_path, prefix_table_name
12
+
13
+
14
+ class Acorn(ABC):
15
+ """Base class for a storage backend (the cache)."""
16
+
17
+ def __init__(self) -> None:
18
+ """Initialize the Acorn."""
19
+ super().__init__()
20
+
21
+ @abstractmethod
22
+ def hide(self, table_name: str, data: pd.DataFrame) -> None:
23
+ """Store records in the cache."""
24
+ pass # pragma: no cover
25
+
26
+ @abstractmethod
27
+ def scurry(self, table_name: str) -> pd.DataFrame:
28
+ """Fetch records from the cache."""
29
+ pass # pragma: no cover
30
+
31
+
32
+ class S3Acorn(Acorn):
33
+ """Stores and retrieves caches using AWS S3 with parquet files."""
34
+
35
+ def __init__(self) -> None:
36
+ """Initialize S3Acorn with S3 client."""
37
+ self.bucket = "aind-scratch-data"
38
+ self.s3_client = boto3.client("s3")
39
+
40
+ def hide(self, table_name: str, data: pd.DataFrame) -> None:
41
+ """Store DataFrame as parquet file in S3."""
42
+ filename = prefix_table_name(table_name)
43
+ s3_key = get_s3_cache_path(filename)
44
+
45
+ # Convert DataFrame to parquet bytes
46
+ parquet_buffer = io.BytesIO()
47
+ data.to_parquet(parquet_buffer, index=False)
48
+ parquet_buffer.seek(0)
49
+
50
+ # Upload to S3
51
+ self.s3_client.put_object(
52
+ Bucket=self.bucket,
53
+ Key=s3_key,
54
+ Body=parquet_buffer.getvalue(),
55
+ )
56
+ logging.info(f"Stored cache to S3: s3://{self.bucket}/{s3_key}")
57
+
58
+ def scurry(self, table_name: str) -> pd.DataFrame:
59
+ """Fetch DataFrame from S3 parquet file."""
60
+ filename = prefix_table_name(table_name)
61
+ s3_key = get_s3_cache_path(filename)
62
+
63
+ try:
64
+ # Read directly from S3 using DuckDB
65
+ query = f"""
66
+ SELECT * FROM read_parquet(
67
+ 's3://{self.bucket}/{s3_key}'
68
+ )
69
+ """
70
+ result = duckdb.query(query).to_df()
71
+ logging.info(
72
+ f"Retrieved cache from S3: s3://{self.bucket}/{s3_key}"
73
+ )
74
+ return result
75
+ except Exception as e:
76
+ logging.warning(
77
+ f"Error fetching from cache {s3_key}: {e}"
78
+ )
79
+ return pd.DataFrame()
80
+
81
+
82
+ class MemoryAcorn(Acorn):
83
+ """A simple in-memory backend for testing or local development."""
84
+
85
+ def __init__(self) -> None:
86
+ """Initialize MemoryAcorn with empty store."""
87
+ super().__init__()
88
+ self._store: dict[str, pd.DataFrame] = {}
89
+
90
+ def hide(self, table_name: str, data: pd.DataFrame) -> None:
91
+ """Store DataFrame in memory."""
92
+ self._store[table_name] = data
93
+
94
+ def scurry(self, table_name: str) -> pd.DataFrame:
95
+ """Fetch DataFrame from memory."""
96
+ return self._store.get(table_name, pd.DataFrame())
@@ -10,8 +10,7 @@ from aind_data_access_api.document_db import MetadataDbClient
10
10
 
11
11
  from zombie_squirrel.acorns import (
12
12
  MemoryAcorn,
13
- RedshiftAcorn,
14
- rds_get_handle_empty,
13
+ S3Acorn,
15
14
  )
16
15
 
17
16
  # --- Backend setup ---------------------------------------------------
@@ -20,9 +19,9 @@ API_GATEWAY_HOST = "api.allenneuraldynamics.org"
20
19
 
21
20
  tree_type = os.getenv("TREE_SPECIES", "memory").lower()
22
21
 
23
- if tree_type == "redshift": # pragma: no cover
24
- logging.info("Using Redshift acorn for caching")
25
- ACORN = RedshiftAcorn()
22
+ if tree_type == "s3": # pragma: no cover
23
+ logging.info("Using S3 acorn for caching")
24
+ ACORN = S3Acorn()
26
25
  else:
27
26
  logging.info("Using in-memory acorn for caching")
28
27
  ACORN = MemoryAcorn()
@@ -66,7 +65,7 @@ def unique_project_names(force_update: bool = False) -> list[str]:
66
65
 
67
66
  Returns:
68
67
  List of unique project names."""
69
- df = rds_get_handle_empty(ACORN, NAMES["upn"])
68
+ df = ACORN.scurry(NAMES["upn"])
70
69
 
71
70
  if df.empty or force_update:
72
71
  # If cache is missing, fetch data
@@ -99,7 +98,7 @@ def unique_subject_ids(force_update: bool = False) -> list[str]:
99
98
 
100
99
  Returns:
101
100
  List of unique subject IDs."""
102
- df = rds_get_handle_empty(ACORN, NAMES["usi"])
101
+ df = ACORN.scurry(NAMES["usi"])
103
102
 
104
103
  if df.empty or force_update:
105
104
  # If cache is missing, fetch data
@@ -134,7 +133,7 @@ def asset_basics(force_update: bool = False) -> pd.DataFrame:
134
133
 
135
134
  Returns:
136
135
  DataFrame with basic asset metadata."""
137
- df = rds_get_handle_empty(ACORN, NAMES["basics"])
136
+ df = ACORN.scurry(NAMES["basics"])
138
137
 
139
138
  FIELDS = [
140
139
  "data_description.modalities",
@@ -249,7 +248,7 @@ def source_data(force_update: bool = False) -> pd.DataFrame:
249
248
 
250
249
  Returns:
251
250
  DataFrame with _id and source_data columns."""
252
- df = rds_get_handle_empty(ACORN, NAMES["d2r"])
251
+ df = ACORN.scurry(NAMES["d2r"])
253
252
 
254
253
  if df.empty or force_update:
255
254
  logging.info("Updating cache for source data")
@@ -291,7 +290,7 @@ def raw_to_derived(force_update: bool = False) -> pd.DataFrame:
291
290
 
292
291
  Returns:
293
292
  DataFrame with _id and derived_records columns."""
294
- df = rds_get_handle_empty(ACORN, NAMES["r2d"])
293
+ df = ACORN.scurry(NAMES["r2d"])
295
294
 
296
295
  if df.empty or force_update:
297
296
  logging.info("Updating cache for raw to derived mapping")
@@ -319,8 +318,6 @@ def raw_to_derived(force_update: bool = False) -> pd.DataFrame:
319
318
  raw_to_derived_map = {raw_id: [] for raw_id in raw_ids}
320
319
  for derived_record in derived_records:
321
320
  source_data_list = derived_record.get("data_description", {}).get("source_data", [])
322
- if not source_data_list:
323
- continue
324
321
  derived_id = derived_record["_id"]
325
322
  # Add this derived record to each raw record it depends on
326
323
  for source_id in source_data_list:
@@ -0,0 +1,23 @@
1
+ """Utility functions for zombie-squirrel package."""
2
+
3
+
4
+ def prefix_table_name(table_name: str) -> str:
5
+ """Add zombie-squirrel prefix and parquet extension to filenames.
6
+
7
+ Args:
8
+ table_name: The base table name.
9
+
10
+ Returns:
11
+ Filename with 'zs_' prefix and '.pqt' extension."""
12
+ return "zs_" + table_name + ".pqt"
13
+
14
+
15
+ def get_s3_cache_path(filename: str) -> str:
16
+ """Get the full S3 path for a cache file.
17
+
18
+ Args:
19
+ filename: The cache filename (e.g., "zs_unique_project_names.pqt").
20
+
21
+ Returns:
22
+ Full S3 path: application-caches/filename"""
23
+ return f"application-caches/{filename}"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: zombie-squirrel
3
- Version: 0.5.1
3
+ Version: 0.5.3
4
4
  Summary: Generated from aind-library-template
5
5
  Author: Allen Institute for Neural Dynamics
6
6
  License: MIT
@@ -8,7 +8,11 @@ Classifier: Programming Language :: Python :: 3
8
8
  Requires-Python: >=3.10
9
9
  Description-Content-Type: text/markdown
10
10
  License-File: LICENSE
11
- Requires-Dist: aind-data-access-api[docdb,rds]
11
+ Requires-Dist: duckdb
12
+ Requires-Dist: fastparquet
13
+ Requires-Dist: boto3
14
+ Requires-Dist: pandas
15
+ Requires-Dist: aind-data-access-api[docdb]
12
16
  Dynamic: license-file
13
17
 
14
18
  # zombie-squirrel
@@ -17,7 +21,7 @@ Dynamic: license-file
17
21
  ![Code Style](https://img.shields.io/badge/code%20style-black-black)
18
22
  [![semantic-release: angular](https://img.shields.io/badge/semantic--release-angular-e10079?logo=semantic-release)](https://github.com/semantic-release/semantic-release)
19
23
  ![Interrogate](https://img.shields.io/badge/interrogate-100.0%25-brightgreen)
20
- ![Coverage](https://img.shields.io/badge/coverage-99%25-brightgreen)
24
+ ![Coverage](https://img.shields.io/badge/coverage-100%25-brightgreen)
21
25
  ![Python](https://img.shields.io/badge/python->=3.10-blue?logo=python)
22
26
 
23
27
  <img src="zombie-squirrel_logo.png" width="400" alt="Logo (image from ChatGPT)">
@@ -36,11 +40,10 @@ uv sync
36
40
  ### Set backend
37
41
 
38
42
  ```bash
39
- export REDSHIFT_SECRETS='/aind/prod/redshift/credentials/readwrite'
40
- export TREE_SPECIES='REDSHIFT'
43
+ export TREE_SPECIES='s3'
41
44
  ```
42
45
 
43
- Options are 'REDSHIFT', 'MEMORY'.
46
+ Options are 's3', 'MEMORY'.
44
47
 
45
48
  ### Scurry (fetch) data
46
49
 
@@ -0,0 +1,5 @@
1
+ duckdb
2
+ fastparquet
3
+ boto3
4
+ pandas
5
+ aind-data-access-api[docdb]
@@ -0,0 +1,184 @@
1
+ """Unit tests for zombie_squirrel.acorns module.
2
+
3
+ Tests for abstract base class, memory backend, and S3 backend
4
+ for caching functionality."""
5
+
6
+ import unittest
7
+ from unittest.mock import MagicMock, Mock, patch
8
+
9
+ import pandas as pd
10
+
11
+ from zombie_squirrel.acorns import (
12
+ Acorn,
13
+ MemoryAcorn,
14
+ S3Acorn,
15
+ )
16
+
17
+
18
+ class TestAcornAbstractClass(unittest.TestCase):
19
+ """Tests for Acorn abstract base class."""
20
+
21
+ def test_acorn_cannot_be_instantiated(self):
22
+ """Test that Acorn abstract class cannot be instantiated."""
23
+ with self.assertRaises(TypeError):
24
+ Acorn()
25
+
26
+ def test_acorn_subclass_must_implement_hide(self):
27
+ """Test that subclasses must implement hide method."""
28
+
29
+ class IncompleteAcorn(Acorn):
30
+ """Incomplete Acorn subclass missing hide method."""
31
+
32
+ def scurry(self, table_name: str) -> pd.DataFrame: # pragma: no cover
33
+ """Fetch records from the cache."""
34
+ return pd.DataFrame()
35
+
36
+ with self.assertRaises(TypeError):
37
+ IncompleteAcorn()
38
+
39
+ def test_acorn_subclass_must_implement_scurry(self):
40
+ """Test that subclasses must implement scurry method."""
41
+
42
+ class IncompleteAcorn(Acorn):
43
+ """Incomplete Acorn subclass missing scurry method."""
44
+
45
+ def hide(self, table_name: str, data: pd.DataFrame) -> None: # pragma: no cover
46
+ """Store records in the cache."""
47
+ pass
48
+
49
+ with self.assertRaises(TypeError):
50
+ IncompleteAcorn()
51
+
52
+
53
+ class TestMemoryAcorn(unittest.TestCase):
54
+ """Tests for MemoryAcorn implementation."""
55
+
56
+ def setUp(self):
57
+ """Initialize a fresh MemoryAcorn for each test."""
58
+ self.acorn = MemoryAcorn()
59
+
60
+ def test_hide_and_scurry_basic(self):
61
+ """Test basic hide and scurry operations."""
62
+ df = pd.DataFrame({"col1": [1, 2, 3], "col2": ["a", "b", "c"]})
63
+ self.acorn.hide("test_table", df)
64
+
65
+ retrieved = self.acorn.scurry("test_table")
66
+ pd.testing.assert_frame_equal(df, retrieved)
67
+
68
+ def test_scurry_empty_table(self):
69
+ """Test scurrying a table that doesn't exist returns empty DataFrame."""
70
+ result = self.acorn.scurry("nonexistent_table")
71
+ self.assertTrue(result.empty)
72
+ self.assertIsInstance(result, pd.DataFrame)
73
+
74
+ def test_hide_overwrites_existing(self):
75
+ """Test that hiding data overwrites existing data."""
76
+ df1 = pd.DataFrame({"col1": [1, 2, 3]})
77
+ df2 = pd.DataFrame({"col1": [4, 5, 6]})
78
+
79
+ self.acorn.hide("table", df1)
80
+ self.acorn.hide("table", df2)
81
+
82
+ retrieved = self.acorn.scurry("table")
83
+ pd.testing.assert_frame_equal(df2, retrieved)
84
+
85
+ def test_multiple_tables(self):
86
+ """Test managing multiple tables."""
87
+ df1 = pd.DataFrame({"col1": [1, 2]})
88
+ df2 = pd.DataFrame({"col2": ["a", "b"]})
89
+
90
+ self.acorn.hide("table1", df1)
91
+ self.acorn.hide("table2", df2)
92
+
93
+ retrieved1 = self.acorn.scurry("table1")
94
+ retrieved2 = self.acorn.scurry("table2")
95
+
96
+ pd.testing.assert_frame_equal(df1, retrieved1)
97
+ pd.testing.assert_frame_equal(df2, retrieved2)
98
+
99
+ def test_hide_empty_dataframe(self):
100
+ """Test hiding an empty DataFrame."""
101
+ df = pd.DataFrame()
102
+ self.acorn.hide("empty_table", df)
103
+
104
+ retrieved = self.acorn.scurry("empty_table")
105
+ pd.testing.assert_frame_equal(df, retrieved)
106
+
107
+
108
+ class TestS3Acorn(unittest.TestCase):
109
+ """Tests for S3Acorn implementation with mocking."""
110
+
111
+ @patch("zombie_squirrel.acorns.boto3.client")
112
+ def test_s3_acorn_initialization(self, mock_boto3_client):
113
+ """Test S3Acorn initialization."""
114
+ mock_s3_client = MagicMock()
115
+ mock_boto3_client.return_value = mock_s3_client
116
+
117
+ acorn = S3Acorn()
118
+
119
+ self.assertEqual(acorn.bucket, "aind-scratch-data")
120
+ self.assertEqual(acorn.s3_client, mock_s3_client)
121
+ mock_boto3_client.assert_called_once_with("s3")
122
+
123
+ @patch("zombie_squirrel.acorns.boto3.client")
124
+ def test_s3_hide(self, mock_boto3_client):
125
+ """Test S3Acorn.hide method writes to S3."""
126
+ mock_s3_client = MagicMock()
127
+ mock_boto3_client.return_value = mock_s3_client
128
+
129
+ acorn = S3Acorn()
130
+ df = pd.DataFrame({"col1": [1, 2, 3]})
131
+
132
+ acorn.hide("test_table", df)
133
+
134
+ mock_s3_client.put_object.assert_called_once()
135
+ call_kwargs = mock_s3_client.put_object.call_args[1]
136
+ self.assertEqual(call_kwargs["Bucket"], "aind-scratch-data")
137
+ self.assertEqual(
138
+ call_kwargs["Key"], "application-caches/zs_test_table.pqt"
139
+ )
140
+ self.assertIsInstance(call_kwargs["Body"], bytes)
141
+
142
+ @patch("zombie_squirrel.acorns.duckdb.query")
143
+ @patch("zombie_squirrel.acorns.boto3.client")
144
+ def test_s3_scurry(self, mock_boto3_client, mock_duckdb_query):
145
+ """Test S3Acorn.scurry method reads from S3 using DuckDB."""
146
+ mock_s3_client = MagicMock()
147
+ mock_boto3_client.return_value = mock_s3_client
148
+
149
+ expected_df = pd.DataFrame({"col1": [1, 2, 3]})
150
+ mock_result = MagicMock()
151
+ mock_result.to_df.return_value = expected_df
152
+ mock_duckdb_query.return_value = mock_result
153
+
154
+ acorn = S3Acorn()
155
+ result = acorn.scurry("test_table")
156
+
157
+ # Verify DuckDB was called with correct S3 path
158
+ mock_duckdb_query.assert_called_once()
159
+ query_call = mock_duckdb_query.call_args[0][0]
160
+ self.assertIn(
161
+ "s3://aind-scratch-data/application-caches/zs_test_table.pqt",
162
+ query_call,
163
+ )
164
+ pd.testing.assert_frame_equal(result, expected_df)
165
+
166
+ @patch("zombie_squirrel.acorns.duckdb.query")
167
+ @patch("zombie_squirrel.acorns.boto3.client")
168
+ def test_s3_scurry_handles_error(
169
+ self, mock_boto3_client, mock_duckdb_query
170
+ ):
171
+ """Test S3Acorn.scurry returns empty DataFrame on error."""
172
+ mock_s3_client = MagicMock()
173
+ mock_boto3_client.return_value = mock_s3_client
174
+ mock_duckdb_query.side_effect = Exception("S3 access error")
175
+
176
+ acorn = S3Acorn()
177
+ result = acorn.scurry("nonexistent_table")
178
+
179
+ self.assertTrue(result.empty)
180
+ self.assertIsInstance(result, pd.DataFrame)
181
+
182
+
183
+ if __name__ == "__main__":
184
+ unittest.main()
@@ -204,7 +204,47 @@ class TestAssetBasics(unittest.TestCase):
204
204
 
205
205
  @patch("zombie_squirrel.squirrels.ACORN", new_callable=MemoryAcorn)
206
206
  @patch("zombie_squirrel.squirrels.MetadataDbClient")
207
- def test_asset_basics_incremental_update(self, mock_client_class, mock_acorn):
207
+ def test_asset_basics_with_data_processes(
208
+ self, mock_client_class, mock_acorn
209
+ ):
210
+ """Test asset_basics includes process_date from data_processes."""
211
+ mock_client_instance = MagicMock()
212
+ mock_client_class.return_value = mock_client_instance
213
+
214
+ mock_client_instance.retrieve_docdb_records.return_value = [
215
+ {
216
+ "_id": "id1",
217
+ "_last_modified": "2023-01-01",
218
+ "data_description": {
219
+ "modalities": [{"abbreviation": "img"}],
220
+ "project_name": "proj1",
221
+ "data_level": "raw",
222
+ },
223
+ "subject": {"subject_id": "sub001"},
224
+ "acquisition": {
225
+ "acquisition_start_time": "2023-01-01T10:00:00",
226
+ "acquisition_end_time": "2023-01-01T11:00:00",
227
+ },
228
+ "processing": {
229
+ "data_processes": [
230
+ {"start_date_time": "2023-01-15T14:30:00"},
231
+ {"start_date_time": "2023-01-20T09:15:00"},
232
+ ]
233
+ },
234
+ }
235
+ ]
236
+
237
+ result = asset_basics()
238
+
239
+ self.assertEqual(len(result), 1)
240
+ self.assertEqual(result.iloc[0]["_id"], "id1")
241
+ self.assertEqual(result.iloc[0]["process_date"], "2023-01-20")
242
+
243
+ @patch("zombie_squirrel.squirrels.ACORN", new_callable=MemoryAcorn)
244
+ @patch("zombie_squirrel.squirrels.MetadataDbClient")
245
+ def test_asset_basics_incremental_update(
246
+ self, mock_client_class, mock_acorn
247
+ ):
208
248
  """Test incremental cache update with partial data refresh."""
209
249
  mock_client_instance = MagicMock()
210
250
  mock_client_class.return_value = mock_client_instance
@@ -0,0 +1,54 @@
1
+ """Unit tests for zombie_squirrel.utils module.
2
+
3
+ Tests for utility functions."""
4
+
5
+ import unittest
6
+
7
+ from zombie_squirrel.utils import get_s3_cache_path, prefix_table_name
8
+
9
+
10
+ class TestPrefixTableName(unittest.TestCase):
11
+ """Tests for the prefix_table_name function."""
12
+
13
+ def test_prefix_table_name_basic(self):
14
+ """Test that prefix_table_name adds 'zs_' prefix and '.pqt' ext."""
15
+ result = prefix_table_name("my_table")
16
+ self.assertEqual(result, "zs_my_table.pqt")
17
+
18
+ def test_prefix_table_name_empty_string(self):
19
+ """Test with empty string."""
20
+ result = prefix_table_name("")
21
+ self.assertEqual(result, "zs_.pqt")
22
+
23
+ def test_prefix_table_name_single_char(self):
24
+ """Test with single character."""
25
+ result = prefix_table_name("a")
26
+ self.assertEqual(result, "zs_a.pqt")
27
+
28
+ def test_prefix_table_name_with_underscores(self):
29
+ """Test with table name containing underscores."""
30
+ result = prefix_table_name("my_long_table_name")
31
+ self.assertEqual(result, "zs_my_long_table_name.pqt")
32
+
33
+ def test_prefix_table_name_with_numbers(self):
34
+ """Test with table name containing numbers."""
35
+ result = prefix_table_name("table123")
36
+ self.assertEqual(result, "zs_table123.pqt")
37
+
38
+
39
+ class TestGetS3CachePath(unittest.TestCase):
40
+ """Tests for the get_s3_cache_path function."""
41
+
42
+ def test_get_s3_cache_path_basic(self):
43
+ """Test that get_s3_cache_path constructs correct S3 path."""
44
+ result = get_s3_cache_path("zs_test.pqt")
45
+ self.assertEqual(result, "application-caches/zs_test.pqt")
46
+
47
+ def test_get_s3_cache_path_various_names(self):
48
+ """Test with various filenames."""
49
+ result = get_s3_cache_path("zs_my_data.pqt")
50
+ self.assertEqual(result, "application-caches/zs_my_data.pqt")
51
+
52
+
53
+ if __name__ == "__main__":
54
+ unittest.main()
@@ -1,81 +0,0 @@
1
- """Storage backend interfaces for caching data."""
2
-
3
- import logging
4
- import os
5
- from abc import ABC, abstractmethod
6
-
7
- import pandas as pd
8
- from aind_data_access_api.rds_tables import Client, RDSCredentials
9
-
10
- from zombie_squirrel.utils import prefix_table_name
11
-
12
-
13
- class Acorn(ABC):
14
- """Base class for a storage backend (the cache)."""
15
-
16
- def __init__(self) -> None:
17
- """Initialize the Acorn."""
18
- super().__init__()
19
-
20
- @abstractmethod
21
- def hide(self, table_name: str, data: pd.DataFrame) -> None:
22
- """Store records in the cache."""
23
- pass # pragma: no cover
24
-
25
- @abstractmethod
26
- def scurry(self, table_name: str) -> pd.DataFrame:
27
- """Fetch records from the cache."""
28
- pass # pragma: no cover
29
-
30
-
31
- class RedshiftAcorn(Acorn):
32
- """Stores and retrieves caches using aind-data-access-api
33
- Redshift Client"""
34
-
35
- def __init__(self) -> None:
36
- """Initialize RedshiftAcorn with Redshift credentials."""
37
- REDSHIFT_SECRETS = os.getenv("REDSHIFT_SECRETS", "/aind/prod/redshift/credentials/readwrite")
38
- self.rds_client = Client(
39
- credentials=RDSCredentials(aws_secrets_name=REDSHIFT_SECRETS),
40
- )
41
-
42
- def hide(self, table_name: str, data: pd.DataFrame) -> None:
43
- """Store DataFrame in Redshift table."""
44
- self.rds_client.overwrite_table_with_df(
45
- df=data,
46
- table_name=prefix_table_name(table_name),
47
- )
48
-
49
- def scurry(self, table_name: str) -> pd.DataFrame:
50
- """Fetch DataFrame from Redshift table."""
51
- return self.rds_client.read_table(table_name=prefix_table_name(table_name))
52
-
53
-
54
- class MemoryAcorn(Acorn):
55
- """A simple in-memory backend for testing or local development."""
56
-
57
- def __init__(self) -> None:
58
- """Initialize MemoryAcorn with empty store."""
59
- super().__init__()
60
- self._store: dict[str, pd.DataFrame] = {}
61
-
62
- def hide(self, table_name: str, data: pd.DataFrame) -> None:
63
- """Store DataFrame in memory."""
64
- self._store[table_name] = data
65
-
66
- def scurry(self, table_name: str) -> pd.DataFrame:
67
- """Fetch DataFrame from memory."""
68
- return self._store.get(table_name, pd.DataFrame())
69
-
70
-
71
- def rds_get_handle_empty(acorn: Acorn, table_name: str) -> pd.DataFrame:
72
- """Helper for handling errors when loading from redshift, because
73
- there's no helper function"""
74
- try:
75
- logging.info(f"Fetching from cache: {table_name}")
76
- df = acorn.scurry(table_name)
77
- except Exception as e:
78
- logging.warning(f"Error fetching from cache: {e}")
79
- df = pd.DataFrame()
80
-
81
- return df
@@ -1,12 +0,0 @@
1
- """Utility functions for zombie-squirrel package."""
2
-
3
-
4
- def prefix_table_name(table_name: str) -> str:
5
- """Add zombie-squirrel prefix to table names.
6
-
7
- Args:
8
- table_name: The base table name.
9
-
10
- Returns:
11
- Table name with 'zs_' prefix."""
12
- return "zs_" + table_name
@@ -1 +0,0 @@
1
- aind-data-access-api[docdb,rds]
@@ -1,217 +0,0 @@
1
- """Unit tests for zombie_squirrel.acorns module.
2
-
3
- Tests for abstract base class, memory backend, and Redshift backend
4
- for caching functionality."""
5
-
6
- import os
7
- import unittest
8
- from unittest.mock import MagicMock, Mock, patch
9
-
10
- import pandas as pd
11
-
12
- from zombie_squirrel.acorns import (
13
- Acorn,
14
- MemoryAcorn,
15
- RedshiftAcorn,
16
- rds_get_handle_empty,
17
- )
18
-
19
-
20
- class TestAcornAbstractClass(unittest.TestCase):
21
- """Tests for Acorn abstract base class."""
22
-
23
- def test_acorn_cannot_be_instantiated(self):
24
- """Test that Acorn abstract class cannot be instantiated."""
25
- with self.assertRaises(TypeError):
26
- Acorn()
27
-
28
- def test_acorn_subclass_must_implement_hide(self):
29
- """Test that subclasses must implement hide method."""
30
-
31
- class IncompleteAcorn(Acorn):
32
- """Incomplete Acorn subclass missing hide method."""
33
-
34
- def scurry(self, table_name: str) -> pd.DataFrame: # pragma: no cover
35
- """Fetch records from the cache."""
36
- return pd.DataFrame()
37
-
38
- with self.assertRaises(TypeError):
39
- IncompleteAcorn()
40
-
41
- def test_acorn_subclass_must_implement_scurry(self):
42
- """Test that subclasses must implement scurry method."""
43
-
44
- class IncompleteAcorn(Acorn):
45
- """Incomplete Acorn subclass missing scurry method."""
46
-
47
- def hide(self, table_name: str, data: pd.DataFrame) -> None: # pragma: no cover
48
- """Store records in the cache."""
49
- pass
50
-
51
- with self.assertRaises(TypeError):
52
- IncompleteAcorn()
53
-
54
-
55
- class TestMemoryAcorn(unittest.TestCase):
56
- """Tests for MemoryAcorn implementation."""
57
-
58
- def setUp(self):
59
- """Initialize a fresh MemoryAcorn for each test."""
60
- self.acorn = MemoryAcorn()
61
-
62
- def test_hide_and_scurry_basic(self):
63
- """Test basic hide and scurry operations."""
64
- df = pd.DataFrame({"col1": [1, 2, 3], "col2": ["a", "b", "c"]})
65
- self.acorn.hide("test_table", df)
66
-
67
- retrieved = self.acorn.scurry("test_table")
68
- pd.testing.assert_frame_equal(df, retrieved)
69
-
70
- def test_scurry_empty_table(self):
71
- """Test scurrying a table that doesn't exist returns empty DataFrame."""
72
- result = self.acorn.scurry("nonexistent_table")
73
- self.assertTrue(result.empty)
74
- self.assertIsInstance(result, pd.DataFrame)
75
-
76
- def test_hide_overwrites_existing(self):
77
- """Test that hiding data overwrites existing data."""
78
- df1 = pd.DataFrame({"col1": [1, 2, 3]})
79
- df2 = pd.DataFrame({"col1": [4, 5, 6]})
80
-
81
- self.acorn.hide("table", df1)
82
- self.acorn.hide("table", df2)
83
-
84
- retrieved = self.acorn.scurry("table")
85
- pd.testing.assert_frame_equal(df2, retrieved)
86
-
87
- def test_multiple_tables(self):
88
- """Test managing multiple tables."""
89
- df1 = pd.DataFrame({"col1": [1, 2]})
90
- df2 = pd.DataFrame({"col2": ["a", "b"]})
91
-
92
- self.acorn.hide("table1", df1)
93
- self.acorn.hide("table2", df2)
94
-
95
- retrieved1 = self.acorn.scurry("table1")
96
- retrieved2 = self.acorn.scurry("table2")
97
-
98
- pd.testing.assert_frame_equal(df1, retrieved1)
99
- pd.testing.assert_frame_equal(df2, retrieved2)
100
-
101
- def test_hide_empty_dataframe(self):
102
- """Test hiding an empty DataFrame."""
103
- df = pd.DataFrame()
104
- self.acorn.hide("empty_table", df)
105
-
106
- retrieved = self.acorn.scurry("empty_table")
107
- pd.testing.assert_frame_equal(df, retrieved)
108
-
109
-
110
- class TestRedshiftAcorn(unittest.TestCase):
111
- """Tests for RedshiftAcorn implementation with mocking."""
112
-
113
- @patch("zombie_squirrel.acorns.RDSCredentials")
114
- @patch("zombie_squirrel.acorns.Client")
115
- def test_redshift_acorn_initialization(self, mock_client_class, mock_credentials_class):
116
- """Test RedshiftAcorn initialization."""
117
- mock_client_instance = MagicMock()
118
- mock_client_class.return_value = mock_client_instance
119
- mock_credentials_instance = MagicMock()
120
- mock_credentials_class.return_value = mock_credentials_instance
121
-
122
- acorn = RedshiftAcorn()
123
-
124
- self.assertEqual(acorn.rds_client, mock_client_instance)
125
- mock_client_class.assert_called_once()
126
-
127
- @patch("zombie_squirrel.acorns.RDSCredentials")
128
- @patch("zombie_squirrel.acorns.Client")
129
- def test_redshift_hide(self, mock_client_class, mock_credentials_class):
130
- """Test RedshiftAcorn.hide method."""
131
- mock_client_instance = MagicMock()
132
- mock_client_class.return_value = mock_client_instance
133
- mock_credentials_instance = MagicMock()
134
- mock_credentials_class.return_value = mock_credentials_instance
135
-
136
- acorn = RedshiftAcorn()
137
- df = pd.DataFrame({"col1": [1, 2, 3]})
138
-
139
- acorn.hide("test_table", df)
140
-
141
- mock_client_instance.overwrite_table_with_df.assert_called_once()
142
- call_args = mock_client_instance.overwrite_table_with_df.call_args
143
- pd.testing.assert_frame_equal(call_args[1]["df"], df)
144
- self.assertEqual(call_args[1]["table_name"], "zs_test_table")
145
-
146
- @patch("zombie_squirrel.acorns.RDSCredentials")
147
- @patch("zombie_squirrel.acorns.Client")
148
- def test_redshift_scurry(self, mock_client_class, mock_credentials_class):
149
- """Test RedshiftAcorn.scurry method."""
150
- mock_client_instance = MagicMock()
151
- expected_df = pd.DataFrame({"col1": [1, 2, 3]})
152
- mock_client_instance.read_table.return_value = expected_df
153
- mock_client_class.return_value = mock_client_instance
154
- mock_credentials_instance = MagicMock()
155
- mock_credentials_class.return_value = mock_credentials_instance
156
-
157
- acorn = RedshiftAcorn()
158
- result = acorn.scurry("test_table")
159
-
160
- mock_client_instance.read_table.assert_called_once_with(table_name="zs_test_table")
161
- pd.testing.assert_frame_equal(result, expected_df)
162
-
163
- @patch.dict("os.environ", {}, clear=False)
164
- @patch("zombie_squirrel.acorns.RDSCredentials")
165
- @patch("zombie_squirrel.acorns.Client")
166
- def test_redshift_default_secrets_path(self, mock_client_class, mock_credentials_class):
167
- """Test RedshiftAcorn uses default secrets path."""
168
- if "REDSHIFT_SECRETS" in os.environ: # pragma: no cover
169
- del os.environ["REDSHIFT_SECRETS"] # pragma: no cover
170
-
171
- mock_client_instance = MagicMock()
172
- mock_client_class.return_value = mock_client_instance
173
- mock_credentials_instance = MagicMock()
174
- mock_credentials_class.return_value = mock_credentials_instance
175
-
176
- RedshiftAcorn()
177
-
178
- mock_client_class.assert_called_once()
179
- call_args = mock_client_class.call_args
180
- self.assertIsNotNone(call_args)
181
-
182
-
183
- class TestRdsGetHandleEmpty(unittest.TestCase):
184
- """Tests for rds_get_handle_empty helper function."""
185
-
186
- def test_rds_get_handle_empty_success(self):
187
- """Test successful retrieval from acorn."""
188
- acorn = MemoryAcorn()
189
- df = pd.DataFrame({"col1": [1, 2, 3]})
190
- acorn.hide("test_table", df)
191
-
192
- result = rds_get_handle_empty(acorn, "test_table")
193
-
194
- pd.testing.assert_frame_equal(result, df)
195
-
196
- def test_rds_get_handle_empty_missing_table(self):
197
- """Test returns empty DataFrame when table is missing."""
198
- acorn = MemoryAcorn()
199
-
200
- result = rds_get_handle_empty(acorn, "nonexistent_table")
201
-
202
- self.assertTrue(result.empty)
203
- self.assertIsInstance(result, pd.DataFrame)
204
-
205
- def test_rds_get_handle_empty_exception(self):
206
- """Test returns empty DataFrame when acorn raises exception."""
207
- acorn = Mock(spec=["scurry"])
208
- acorn.scurry.side_effect = Exception("Connection error")
209
-
210
- result = rds_get_handle_empty(acorn, "test_table")
211
-
212
- self.assertTrue(result.empty)
213
- self.assertIsInstance(result, pd.DataFrame)
214
-
215
-
216
- if __name__ == "__main__":
217
- unittest.main()
@@ -1,40 +0,0 @@
1
- """Unit tests for zombie_squirrel.utils module.
2
-
3
- Tests for utility functions."""
4
-
5
- import unittest
6
-
7
- from zombie_squirrel.utils import prefix_table_name
8
-
9
-
10
- class TestPrefixTableName(unittest.TestCase):
11
- """Tests for the prefix_table_name function."""
12
-
13
- def test_prefix_table_name_basic(self):
14
- """Test that prefix_table_name correctly adds 'zs_' prefix."""
15
- result = prefix_table_name("my_table")
16
- self.assertEqual(result, "zs_my_table")
17
-
18
- def test_prefix_table_name_empty_string(self):
19
- """Test with empty string."""
20
- result = prefix_table_name("")
21
- self.assertEqual(result, "zs_")
22
-
23
- def test_prefix_table_name_single_char(self):
24
- """Test with single character."""
25
- result = prefix_table_name("a")
26
- self.assertEqual(result, "zs_a")
27
-
28
- def test_prefix_table_name_with_underscores(self):
29
- """Test with table name containing underscores."""
30
- result = prefix_table_name("my_long_table_name")
31
- self.assertEqual(result, "zs_my_long_table_name")
32
-
33
- def test_prefix_table_name_with_numbers(self):
34
- """Test with table name containing numbers."""
35
- result = prefix_table_name("table123")
36
- self.assertEqual(result, "zs_table123")
37
-
38
-
39
- if __name__ == "__main__":
40
- unittest.main()
File without changes