zombie-squirrel 0.2.3__tar.gz → 0.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {zombie_squirrel-0.2.3/src/zombie_squirrel.egg-info → zombie_squirrel-0.4.0}/PKG-INFO +1 -1
- {zombie_squirrel-0.2.3 → zombie_squirrel-0.4.0}/src/zombie_squirrel/__init__.py +2 -1
- {zombie_squirrel-0.2.3 → zombie_squirrel-0.4.0}/src/zombie_squirrel/acorns.py +2 -1
- zombie_squirrel-0.4.0/src/zombie_squirrel/squirrels.py +168 -0
- {zombie_squirrel-0.2.3 → zombie_squirrel-0.4.0/src/zombie_squirrel.egg-info}/PKG-INFO +1 -1
- zombie_squirrel-0.2.3/src/zombie_squirrel/squirrels.py +0 -86
- {zombie_squirrel-0.2.3 → zombie_squirrel-0.4.0}/LICENSE +0 -0
- {zombie_squirrel-0.2.3 → zombie_squirrel-0.4.0}/README.md +0 -0
- {zombie_squirrel-0.2.3 → zombie_squirrel-0.4.0}/pyproject.toml +0 -0
- {zombie_squirrel-0.2.3 → zombie_squirrel-0.4.0}/setup.cfg +0 -0
- {zombie_squirrel-0.2.3 → zombie_squirrel-0.4.0}/setup.py +0 -0
- {zombie_squirrel-0.2.3 → zombie_squirrel-0.4.0}/src/zombie_squirrel/sync.py +0 -0
- {zombie_squirrel-0.2.3 → zombie_squirrel-0.4.0}/src/zombie_squirrel/utils.py +0 -0
- {zombie_squirrel-0.2.3 → zombie_squirrel-0.4.0}/src/zombie_squirrel.egg-info/SOURCES.txt +0 -0
- {zombie_squirrel-0.2.3 → zombie_squirrel-0.4.0}/src/zombie_squirrel.egg-info/dependency_links.txt +0 -0
- {zombie_squirrel-0.2.3 → zombie_squirrel-0.4.0}/src/zombie_squirrel.egg-info/requires.txt +0 -0
- {zombie_squirrel-0.2.3 → zombie_squirrel-0.4.0}/src/zombie_squirrel.egg-info/top_level.txt +0 -0
|
@@ -59,7 +59,8 @@ class MemoryAcorn(Acorn):
|
|
|
59
59
|
|
|
60
60
|
|
|
61
61
|
def rds_get_handle_empty(acorn: Acorn, table_name: str) -> pd.DataFrame:
|
|
62
|
-
"""
|
|
62
|
+
"""Helper for handling errors when loading from redshift, because
|
|
63
|
+
there's no helper function """
|
|
63
64
|
try:
|
|
64
65
|
logging.info(f"Fetching from cache: {table_name}")
|
|
65
66
|
df = acorn.scurry(table_name)
|
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
"""Squirrels: functions to fetch and cache data from MongoDB."""
|
|
2
|
+
import pandas as pd
|
|
3
|
+
from typing import Any, Callable
|
|
4
|
+
from zombie_squirrel.acorns import RedshiftAcorn, MemoryAcorn, rds_get_handle_empty
|
|
5
|
+
from aind_data_access_api.document_db import MetadataDbClient
|
|
6
|
+
import os
|
|
7
|
+
import logging
|
|
8
|
+
|
|
9
|
+
# --- Backend setup ---------------------------------------------------
|
|
10
|
+
|
|
11
|
+
API_GATEWAY_HOST = "api.allenneuraldynamics.org"
|
|
12
|
+
|
|
13
|
+
tree_type = os.getenv("TREE_SPECIES", "memory").lower()
|
|
14
|
+
|
|
15
|
+
if tree_type == "redshift":
|
|
16
|
+
logging.info("Using Redshift acorn for caching")
|
|
17
|
+
ACORN = RedshiftAcorn()
|
|
18
|
+
else:
|
|
19
|
+
logging.info("Using in-memory acorn for caching")
|
|
20
|
+
ACORN = MemoryAcorn()
|
|
21
|
+
|
|
22
|
+
# --- Squirrel registry -----------------------------------------------------
|
|
23
|
+
|
|
24
|
+
SQUIRREL_REGISTRY: dict[str, Callable[[], Any]] = {}
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def register_squirrel(name: str):
|
|
28
|
+
"""Decorator for registering new squirrels."""
|
|
29
|
+
def decorator(func):
|
|
30
|
+
SQUIRREL_REGISTRY[name] = func
|
|
31
|
+
return func
|
|
32
|
+
return decorator
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
# --- Squirrels -----------------------------------------------------
|
|
36
|
+
|
|
37
|
+
NAMES = {
|
|
38
|
+
"upn": "unique_project_names",
|
|
39
|
+
"usi": "unique_subject_ids",
|
|
40
|
+
"basics": "asset_basics",
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
@register_squirrel(NAMES["upn"])
|
|
45
|
+
def unique_project_names(force_update: bool = False) -> list[str]:
|
|
46
|
+
df = rds_get_handle_empty(ACORN, NAMES["upn"])
|
|
47
|
+
|
|
48
|
+
if df.empty or force_update:
|
|
49
|
+
# If cache is missing, fetch data
|
|
50
|
+
logging.info("Updating cache for unique project names")
|
|
51
|
+
client = MetadataDbClient(
|
|
52
|
+
host=API_GATEWAY_HOST,
|
|
53
|
+
version="v2",
|
|
54
|
+
)
|
|
55
|
+
unique_project_names = client.aggregate_docdb_records(
|
|
56
|
+
pipeline=[
|
|
57
|
+
{"$group": {"_id": "$data_description.project_name"}},
|
|
58
|
+
{"$project": {"project_name": "$_id", "_id": 0}},
|
|
59
|
+
]
|
|
60
|
+
)
|
|
61
|
+
df = pd.DataFrame(unique_project_names)
|
|
62
|
+
ACORN.hide(NAMES["upn"], df)
|
|
63
|
+
|
|
64
|
+
return df["project_name"].tolist()
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
@register_squirrel(NAMES["usi"])
|
|
68
|
+
def unique_subject_ids(force_update: bool = False) -> list[str]:
|
|
69
|
+
df = rds_get_handle_empty(ACORN, NAMES["usi"])
|
|
70
|
+
|
|
71
|
+
if df.empty or force_update:
|
|
72
|
+
# If cache is missing, fetch data
|
|
73
|
+
logging.info("Updating cache for unique subject IDs")
|
|
74
|
+
client = MetadataDbClient(
|
|
75
|
+
host=API_GATEWAY_HOST,
|
|
76
|
+
version="v2",
|
|
77
|
+
)
|
|
78
|
+
unique_subject_ids = client.aggregate_docdb_records(
|
|
79
|
+
pipeline=[
|
|
80
|
+
{"$group": {"_id": "$subject.subject_id"}},
|
|
81
|
+
{"$project": {"subject_id": "$_id", "_id": 0}},
|
|
82
|
+
]
|
|
83
|
+
)
|
|
84
|
+
df = pd.DataFrame(unique_subject_ids)
|
|
85
|
+
ACORN.hide(NAMES["usi"], df)
|
|
86
|
+
|
|
87
|
+
return df["subject_id"].tolist()
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
@register_squirrel(NAMES["basics"])
|
|
91
|
+
def asset_basics(force_update: bool = False) -> pd.DataFrame:
|
|
92
|
+
"""Basic asset metadata.
|
|
93
|
+
|
|
94
|
+
_id, _last_modified,
|
|
95
|
+
modalities, project names, data_level, subject_id, acquisition_start and _end
|
|
96
|
+
"""
|
|
97
|
+
df = rds_get_handle_empty(ACORN, NAMES["basics"])
|
|
98
|
+
|
|
99
|
+
FIELDS = [
|
|
100
|
+
"data_description.modalities",
|
|
101
|
+
"data_description.project_name",
|
|
102
|
+
"data_description.data_level",
|
|
103
|
+
"subject.subject_id",
|
|
104
|
+
"acquisition.acquisition_start_time",
|
|
105
|
+
"acquisition.acquisition_end_time",
|
|
106
|
+
]
|
|
107
|
+
|
|
108
|
+
if df.empty or force_update:
|
|
109
|
+
logging.info("Updating cache for asset basics")
|
|
110
|
+
df = pd.DataFrame(columns=["_id", "_last_modified", "modalities", "project_name",
|
|
111
|
+
"data_level", "subject_id",
|
|
112
|
+
"acquisition_start_time", "acquisition_end_time"])
|
|
113
|
+
client = MetadataDbClient(
|
|
114
|
+
host=API_GATEWAY_HOST,
|
|
115
|
+
version="v2",
|
|
116
|
+
)
|
|
117
|
+
# It's a bit complex to get multiple fields that aren't indexed in a database
|
|
118
|
+
# as large as DocDB. We'll also try to limit ourselves to only updating fields
|
|
119
|
+
# that are necessary
|
|
120
|
+
record_ids = client.retrieve_docdb_records(
|
|
121
|
+
filter_query={}, projection={"_id": 1, "_last_modified": 1}, limit=0,
|
|
122
|
+
)
|
|
123
|
+
keep_ids = []
|
|
124
|
+
# Drop all _ids where _last_modified matches cache
|
|
125
|
+
for record in record_ids:
|
|
126
|
+
cached_row = df[df["_id"] == record["_id"]]
|
|
127
|
+
if cached_row.empty or cached_row["_last_modified"].values[0] != record["_last_modified"]:
|
|
128
|
+
keep_ids.append(record["_id"])
|
|
129
|
+
|
|
130
|
+
# Now batch by 100 IDs at a time to avoid overloading server, and fetch all the fields
|
|
131
|
+
BATCH_SIZE = 100
|
|
132
|
+
asset_records = []
|
|
133
|
+
for i in range(0, len(keep_ids), BATCH_SIZE):
|
|
134
|
+
logging.info(f"Fetching asset basics batch {i // BATCH_SIZE + 1}...")
|
|
135
|
+
batch_ids = keep_ids[i:i + BATCH_SIZE]
|
|
136
|
+
batch_records = client.retrieve_docdb_records(
|
|
137
|
+
filter_query={"_id": {"$in": batch_ids}},
|
|
138
|
+
projection={field: 1 for field in FIELDS + ["_id", "_last_modified"]},
|
|
139
|
+
limit=0,
|
|
140
|
+
)
|
|
141
|
+
asset_records.extend(batch_records)
|
|
142
|
+
|
|
143
|
+
# Unwrap nested fields
|
|
144
|
+
records = []
|
|
145
|
+
for record in asset_records:
|
|
146
|
+
|
|
147
|
+
modalities = record.get("data_description", {}).get("modalities", [])
|
|
148
|
+
modality_abbreviations = [modality["abbreviation"] for modality in modalities if "abbreviation" in modality]
|
|
149
|
+
modality_abbreviations_str = ", ".join(modality_abbreviations)
|
|
150
|
+
flat_record = {
|
|
151
|
+
"_id": record["_id"],
|
|
152
|
+
"_last_modified": record.get("_last_modified", None),
|
|
153
|
+
"modalities": modality_abbreviations_str,
|
|
154
|
+
"project_name": record.get("data_description", {}).get("project_name", None),
|
|
155
|
+
"data_level": record.get("data_description", {}).get("data_level", None),
|
|
156
|
+
"subject_id": record.get("subject", {}).get("subject_id", None),
|
|
157
|
+
"acquisition_start_time": record.get("acquisition", {}).get("acquisition_start_time", None),
|
|
158
|
+
"acquisition_end_time": record.get("acquisition", {}).get("acquisition_end_time", None),
|
|
159
|
+
}
|
|
160
|
+
records.append(flat_record)
|
|
161
|
+
|
|
162
|
+
# Combine new records with the old df and store in cache
|
|
163
|
+
new_df = pd.DataFrame(records)
|
|
164
|
+
df = pd.concat([df[df["_id"].isin(keep_ids) == False], new_df], ignore_index=True)
|
|
165
|
+
|
|
166
|
+
ACORN.hide(NAMES["basics"], df)
|
|
167
|
+
|
|
168
|
+
return df
|
|
@@ -1,86 +0,0 @@
|
|
|
1
|
-
"""Squirrels: functions to fetch and cache data from MongoDB."""
|
|
2
|
-
import pandas as pd
|
|
3
|
-
from typing import Any, Callable
|
|
4
|
-
from zombie_squirrel.acorns import RedshiftAcorn, MemoryAcorn, rds_get_handle_empty
|
|
5
|
-
from aind_data_access_api.document_db import MetadataDbClient
|
|
6
|
-
import os
|
|
7
|
-
import logging
|
|
8
|
-
|
|
9
|
-
# --- Backend setup ---------------------------------------------------
|
|
10
|
-
|
|
11
|
-
API_GATEWAY_HOST = "api.allenneuraldynamics.org"
|
|
12
|
-
|
|
13
|
-
tree_type = os.getenv("TREE_SPECIES", "memory").lower()
|
|
14
|
-
|
|
15
|
-
if tree_type == "redshift":
|
|
16
|
-
logging.info("Using Redshift acorn for caching")
|
|
17
|
-
ACORN = RedshiftAcorn()
|
|
18
|
-
else:
|
|
19
|
-
logging.info("Using in-memory acorn for caching")
|
|
20
|
-
ACORN = MemoryAcorn()
|
|
21
|
-
|
|
22
|
-
# --- Squirrel registry -----------------------------------------------------
|
|
23
|
-
|
|
24
|
-
SQUIRREL_REGISTRY: dict[str, Callable[[], Any]] = {}
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
def register_squirrel(name: str):
|
|
28
|
-
"""Decorator for registering new squirrels."""
|
|
29
|
-
def decorator(func):
|
|
30
|
-
SQUIRREL_REGISTRY[name] = func
|
|
31
|
-
return func
|
|
32
|
-
return decorator
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
# --- Squirrels -----------------------------------------------------
|
|
36
|
-
|
|
37
|
-
NAMES = {
|
|
38
|
-
"upn": "unique_project_names",
|
|
39
|
-
"usi": "unique_subject_ids",
|
|
40
|
-
}
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
@register_squirrel(NAMES["upn"])
|
|
44
|
-
def unique_project_names(force_update: bool = False) -> list[str]:
|
|
45
|
-
df = rds_get_handle_empty(ACORN, NAMES["upn"])
|
|
46
|
-
|
|
47
|
-
if df.empty or force_update:
|
|
48
|
-
# If cache is missing, fetch data
|
|
49
|
-
logging.info("Updating cache for unique project names")
|
|
50
|
-
client = MetadataDbClient(
|
|
51
|
-
host=API_GATEWAY_HOST,
|
|
52
|
-
version="v2",
|
|
53
|
-
)
|
|
54
|
-
unique_project_names = client.aggregate_docdb_records(
|
|
55
|
-
pipeline=[
|
|
56
|
-
{"$group": {"_id": "$data_description.project_name"}},
|
|
57
|
-
{"$project": {"project_name": "$_id", "_id": 0}},
|
|
58
|
-
]
|
|
59
|
-
)
|
|
60
|
-
df = pd.DataFrame(unique_project_names)
|
|
61
|
-
ACORN.hide(NAMES["upn"], df)
|
|
62
|
-
|
|
63
|
-
return df["project_name"].tolist()
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
@register_squirrel(NAMES["usi"])
|
|
67
|
-
def unique_subject_ids(force_update: bool = False) -> list[str]:
|
|
68
|
-
df = rds_get_handle_empty(ACORN, NAMES["usi"])
|
|
69
|
-
|
|
70
|
-
if df.empty or force_update:
|
|
71
|
-
# If cache is missing, fetch data
|
|
72
|
-
logging.info("Updating cache for unique subject IDs")
|
|
73
|
-
client = MetadataDbClient(
|
|
74
|
-
host=API_GATEWAY_HOST,
|
|
75
|
-
version="v2",
|
|
76
|
-
)
|
|
77
|
-
unique_subject_ids = client.aggregate_docdb_records(
|
|
78
|
-
pipeline=[
|
|
79
|
-
{"$group": {"_id": "$subject.subject_id"}},
|
|
80
|
-
{"$project": {"subject_id": "$_id", "_id": 0}},
|
|
81
|
-
]
|
|
82
|
-
)
|
|
83
|
-
df = pd.DataFrame(unique_subject_ids)
|
|
84
|
-
ACORN.hide(NAMES["usi"], df)
|
|
85
|
-
|
|
86
|
-
return df["subject_id"].tolist()
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{zombie_squirrel-0.2.3 → zombie_squirrel-0.4.0}/src/zombie_squirrel.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|