zombie-squirrel 0.2.3__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- zombie_squirrel/__init__.py +2 -1
- zombie_squirrel/acorns.py +2 -1
- zombie_squirrel/squirrels.py +82 -0
- {zombie_squirrel-0.2.3.dist-info → zombie_squirrel-0.4.0.dist-info}/METADATA +1 -1
- zombie_squirrel-0.4.0.dist-info/RECORD +10 -0
- zombie_squirrel-0.2.3.dist-info/RECORD +0 -10
- {zombie_squirrel-0.2.3.dist-info → zombie_squirrel-0.4.0.dist-info}/WHEEL +0 -0
- {zombie_squirrel-0.2.3.dist-info → zombie_squirrel-0.4.0.dist-info}/licenses/LICENSE +0 -0
- {zombie_squirrel-0.2.3.dist-info → zombie_squirrel-0.4.0.dist-info}/top_level.txt +0 -0
zombie_squirrel/__init__.py
CHANGED
zombie_squirrel/acorns.py
CHANGED
|
@@ -59,7 +59,8 @@ class MemoryAcorn(Acorn):
|
|
|
59
59
|
|
|
60
60
|
|
|
61
61
|
def rds_get_handle_empty(acorn: Acorn, table_name: str) -> pd.DataFrame:
|
|
62
|
-
"""
|
|
62
|
+
"""Helper for handling errors when loading from redshift, because
|
|
63
|
+
there's no helper function """
|
|
63
64
|
try:
|
|
64
65
|
logging.info(f"Fetching from cache: {table_name}")
|
|
65
66
|
df = acorn.scurry(table_name)
|
zombie_squirrel/squirrels.py
CHANGED
|
@@ -37,6 +37,7 @@ def register_squirrel(name: str):
|
|
|
37
37
|
NAMES = {
|
|
38
38
|
"upn": "unique_project_names",
|
|
39
39
|
"usi": "unique_subject_ids",
|
|
40
|
+
"basics": "asset_basics",
|
|
40
41
|
}
|
|
41
42
|
|
|
42
43
|
|
|
@@ -84,3 +85,84 @@ def unique_subject_ids(force_update: bool = False) -> list[str]:
|
|
|
84
85
|
ACORN.hide(NAMES["usi"], df)
|
|
85
86
|
|
|
86
87
|
return df["subject_id"].tolist()
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
# Column order of the cached asset_basics table.
_ASSET_BASICS_COLUMNS = [
    "_id",
    "_last_modified",
    "modalities",
    "project_name",
    "data_level",
    "subject_id",
    "acquisition_start_time",
    "acquisition_end_time",
]

# Nested DocDB fields requested for every asset that has to be (re)fetched.
_ASSET_BASICS_FIELDS = [
    "data_description.modalities",
    "data_description.project_name",
    "data_description.data_level",
    "subject.subject_id",
    "acquisition.acquisition_start_time",
    "acquisition.acquisition_end_time",
]

# Number of _ids sent per request, to avoid overloading the server.
_ASSET_BASICS_BATCH_SIZE = 100


def _flatten_asset_record(record: dict) -> dict:
    """Flatten one nested DocDB record into a row of the asset_basics table.

    Sections that are missing *or* explicitly null in the record are treated
    as empty, so the corresponding columns come out as ``None`` (or an empty
    string for ``modalities``) instead of raising ``AttributeError``.
    """
    data_description = record.get("data_description") or {}
    subject = record.get("subject") or {}
    acquisition = record.get("acquisition") or {}

    modalities = data_description.get("modalities") or []
    abbreviations = [
        modality["abbreviation"]
        for modality in modalities
        if "abbreviation" in modality
    ]

    return {
        "_id": record["_id"],
        "_last_modified": record.get("_last_modified", None),
        "modalities": ", ".join(abbreviations),
        "project_name": data_description.get("project_name", None),
        "data_level": data_description.get("data_level", None),
        "subject_id": subject.get("subject_id", None),
        "acquisition_start_time": acquisition.get("acquisition_start_time", None),
        "acquisition_end_time": acquisition.get("acquisition_end_time", None),
    }


@register_squirrel(NAMES["basics"])
def asset_basics(force_update: bool = False) -> pd.DataFrame:
    """Basic asset metadata.

    Columns: _id, _last_modified, modalities (comma-separated abbreviations),
    project_name, data_level, subject_id, acquisition_start_time and
    acquisition_end_time.

    Args:
        force_update: When True, re-query the metadata database even if a
            cached table exists. Only records whose ``_last_modified`` differs
            from the cached value (or that are not cached yet) are re-fetched;
            cached rows whose ``_id`` no longer exists upstream are dropped, so
            the result matches a full refresh.

    Returns:
        The asset basics table. The cached copy is returned untouched when it
        is non-empty and ``force_update`` is False.
    """
    df = rds_get_handle_empty(ACORN, NAMES["basics"])

    if not (df.empty or force_update):
        return df

    logging.info("Updating cache for asset basics")

    # Start from an empty, correctly-shaped frame only when there is nothing
    # usable in the cache. Resetting unconditionally would make the
    # _last_modified comparison below pointless (everything would be fetched).
    if df.empty or not {"_id", "_last_modified"}.issubset(df.columns):
        df = pd.DataFrame(columns=_ASSET_BASICS_COLUMNS)

    client = MetadataDbClient(
        host=API_GATEWAY_HOST,
        version="v2",
    )

    # It's a bit complex to get multiple fields that aren't indexed in a
    # database as large as DocDB, so first fetch only the cheap identifying
    # fields and work out which records actually need refreshing.
    # NOTE(review): limit=0 presumably means "no limit" -- confirm against
    # the MetadataDbClient documentation.
    upstream_records = client.retrieve_docdb_records(
        filter_query={}, projection={"_id": 1, "_last_modified": 1}, limit=0,
    )

    # One pass over the cache instead of a DataFrame scan per upstream record.
    # setdefault keeps the first occurrence, matching the previous
    # ``.values[0]`` behaviour if the cache ever holds duplicate _ids.
    cached_modified = {}
    for cached_id, modified in zip(df["_id"].values, df["_last_modified"].values):
        cached_modified.setdefault(cached_id, modified)

    not_cached = object()
    upstream_ids = set()
    keep_ids = []
    for record in upstream_records:
        record_id = record["_id"]
        upstream_ids.add(record_id)
        cached = cached_modified.get(record_id, not_cached)
        if cached is not_cached or cached != record["_last_modified"]:
            keep_ids.append(record_id)

    # Now batch the IDs to avoid overloading the server, and fetch all fields.
    projection = {
        field: 1 for field in _ASSET_BASICS_FIELDS + ["_id", "_last_modified"]
    }
    asset_records = []
    for start in range(0, len(keep_ids), _ASSET_BASICS_BATCH_SIZE):
        logging.info(
            "Fetching asset basics batch %d...",
            start // _ASSET_BASICS_BATCH_SIZE + 1,
        )
        batch_ids = keep_ids[start:start + _ASSET_BASICS_BATCH_SIZE]
        asset_records.extend(
            client.retrieve_docdb_records(
                filter_query={"_id": {"$in": batch_ids}},
                projection=projection,
                limit=0,
            )
        )

    # Unwrap nested fields.
    new_df = pd.DataFrame(
        [_flatten_asset_record(record) for record in asset_records],
        columns=_ASSET_BASICS_COLUMNS,
    )

    # Keep cached rows that still exist upstream and were not re-fetched.
    retained = df[df["_id"].isin(upstream_ids) & ~df["_id"].isin(keep_ids)]

    # Concatenating empty frames is deprecated in recent pandas, so only pass
    # frames that actually hold rows.
    frames = [frame for frame in (retained, new_df) if not frame.empty]
    if frames:
        df = pd.concat(frames, ignore_index=True)
    else:
        df = pd.DataFrame(columns=_ASSET_BASICS_COLUMNS)

    ACORN.hide(NAMES["basics"], df)

    return df
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
zombie_squirrel/__init__.py,sha256=8er6wgFVb0XMkMDsmLRvR_YeO1E_sL3KaOJN9VXXwOw,152
|
|
2
|
+
zombie_squirrel/acorns.py,sha256=1mCnWCDFRnbHLddCCgiUG3RumuKUjMKVbyTVoYI0FB8,2188
|
|
3
|
+
zombie_squirrel/squirrels.py,sha256=Ln8tsa51rK6d2rpOIktSAeHYX3sYMXr3o4njZzAujAo,6340
|
|
4
|
+
zombie_squirrel/sync.py,sha256=jslTVIend5Z-sLJuNXKkhn-nqmKK_P0FAiRuFFYRnto,168
|
|
5
|
+
zombie_squirrel/utils.py,sha256=74DSFK1Qbp8yQeUXpnli4kqx_QcAc8v4_6FZut0xZ8g,103
|
|
6
|
+
zombie_squirrel-0.4.0.dist-info/licenses/LICENSE,sha256=U0Y7B3gZJHXpjJVLgTQjM8e_c8w4JJpLgGhIdsoFR1Y,1092
|
|
7
|
+
zombie_squirrel-0.4.0.dist-info/METADATA,sha256=0Rv7O3SRGDe06_F4-Kefj9JxC2xMQG1m1l3BYrZyfUE,1382
|
|
8
|
+
zombie_squirrel-0.4.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
9
|
+
zombie_squirrel-0.4.0.dist-info/top_level.txt,sha256=FmM0coe4AangURZLjM4JwwRv2B8H6oINYCoZLKLDCKA,16
|
|
10
|
+
zombie_squirrel-0.4.0.dist-info/RECORD,,
|
|
@@ -1,10 +0,0 @@
|
|
|
1
|
-
zombie_squirrel/__init__.py,sha256=i7J51pQP-ID_TrBZtNQmxjgzlqJ8WGQIObfHAzospx8,134
|
|
2
|
-
zombie_squirrel/acorns.py,sha256=MZaScwDpnuuGnrx8a1vRmfv5-fr6h4Idw1_rQ2FWdB0,2132
|
|
3
|
-
zombie_squirrel/squirrels.py,sha256=3ybJQpuNsoM8gBkHSWOKBZ_zOfsnzq35TKh0Aig2voc,2662
|
|
4
|
-
zombie_squirrel/sync.py,sha256=jslTVIend5Z-sLJuNXKkhn-nqmKK_P0FAiRuFFYRnto,168
|
|
5
|
-
zombie_squirrel/utils.py,sha256=74DSFK1Qbp8yQeUXpnli4kqx_QcAc8v4_6FZut0xZ8g,103
|
|
6
|
-
zombie_squirrel-0.2.3.dist-info/licenses/LICENSE,sha256=U0Y7B3gZJHXpjJVLgTQjM8e_c8w4JJpLgGhIdsoFR1Y,1092
|
|
7
|
-
zombie_squirrel-0.2.3.dist-info/METADATA,sha256=o82tnKDGfBAcDwt3fiSpF14octxGxKym5BmluyTyTg0,1382
|
|
8
|
-
zombie_squirrel-0.2.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
9
|
-
zombie_squirrel-0.2.3.dist-info/top_level.txt,sha256=FmM0coe4AangURZLjM4JwwRv2B8H6oINYCoZLKLDCKA,16
|
|
10
|
-
zombie_squirrel-0.2.3.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|