biodata-cache 0.32.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biodata_cache/__init__.py +32 -0
- biodata_cache/backend.py +310 -0
- biodata_cache/cache_table_helpers/__init__.py +20 -0
- biodata_cache/cache_table_helpers/asset_basics.py +271 -0
- biodata_cache/cache_table_helpers/behavior_curriculum.py +170 -0
- biodata_cache/cache_table_helpers/custom.py +31 -0
- biodata_cache/cache_table_helpers/foraging_sessions.py +110 -0
- biodata_cache/cache_table_helpers/metadata_core.py +120 -0
- biodata_cache/cache_table_helpers/metadata_upgrade.py +150 -0
- biodata_cache/cache_table_helpers/platform_exaspim.py +171 -0
- biodata_cache/cache_table_helpers/platform_fib.py +197 -0
- biodata_cache/cache_table_helpers/platform_qc.py +196 -0
- biodata_cache/cache_table_helpers/platform_smartspim.py +274 -0
- biodata_cache/cache_table_helpers/qc.py +259 -0
- biodata_cache/cache_table_helpers/raw_to_derived.py +37 -0
- biodata_cache/cache_table_helpers/source_data.py +97 -0
- biodata_cache/cache_table_helpers/unique_genotypes.py +63 -0
- biodata_cache/cache_table_helpers/unique_project_names.py +62 -0
- biodata_cache/cache_table_helpers/unique_subject_ids.py +62 -0
- biodata_cache/models.py +40 -0
- biodata_cache/registry.py +66 -0
- biodata_cache/sync.py +186 -0
- biodata_cache/utils.py +124 -0
- biodata_cache-0.32.0.dist-info/METADATA +113 -0
- biodata_cache-0.32.0.dist-info/RECORD +29 -0
- biodata_cache-0.32.0.dist-info/WHEEL +5 -0
- biodata_cache-0.32.0.dist-info/licenses/LICENSE +21 -0
- biodata_cache-0.32.0.dist-info/top_level.txt +2 -0
- zombie_squirrel/__init__.py +32 -0
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
"""Zombie-squirrel: caching and synchronization for AIND metadata.
|
|
2
|
+
|
|
3
|
+
Provides functions to fetch and cache project names, subject IDs, and asset
|
|
4
|
+
metadata from the AIND metadata database with support for multiple backends.
|
|
5
|
+
Also exposes get_cache_registry to retrieve the cache_registry.json registry of all
|
|
6
|
+
available cache tables and their metadata.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
__version__ = "0.32.0"
|
|
10
|
+
|
|
11
|
+
from biodata_cache.cache_table_helpers.asset_basics import asset_basics # noqa: F401
|
|
12
|
+
from biodata_cache.cache_table_helpers.behavior_curriculum import behavior_curriculum # noqa: F401
|
|
13
|
+
from biodata_cache.cache_table_helpers.custom import custom # noqa: F401
|
|
14
|
+
from biodata_cache.cache_table_helpers.foraging_sessions import foraging_sessions # noqa: F401
|
|
15
|
+
from biodata_cache.cache_table_helpers.metadata_upgrade import metadata_upgrade # noqa: F401
|
|
16
|
+
from biodata_cache.cache_table_helpers.platform_exaspim import platform_exaspim # noqa: F401
|
|
17
|
+
from biodata_cache.cache_table_helpers.platform_fib import platform_fib # noqa: F401
|
|
18
|
+
from biodata_cache.cache_table_helpers.platform_qc import platform_qc # noqa: F401
|
|
19
|
+
from biodata_cache.cache_table_helpers.platform_smartspim import assets_smartspim # noqa: F401
|
|
20
|
+
from biodata_cache.cache_table_helpers.qc import qc, qc_columns # noqa: F401
|
|
21
|
+
from biodata_cache.cache_table_helpers.raw_to_derived import raw_to_derived # noqa: F401
|
|
22
|
+
from biodata_cache.cache_table_helpers.source_data import source_data # noqa: F401
|
|
23
|
+
from biodata_cache.cache_table_helpers.unique_genotypes import ( # noqa: F401
|
|
24
|
+
unique_genotypes,
|
|
25
|
+
)
|
|
26
|
+
from biodata_cache.cache_table_helpers.unique_project_names import ( # noqa: F401
|
|
27
|
+
unique_project_names,
|
|
28
|
+
)
|
|
29
|
+
from biodata_cache.cache_table_helpers.unique_subject_ids import ( # noqa: F401
|
|
30
|
+
unique_subject_ids,
|
|
31
|
+
)
|
|
32
|
+
from biodata_cache.utils import get_cache_registry # noqa: F401
|
biodata_cache/backend.py
ADDED
|
@@ -0,0 +1,310 @@
|
|
|
1
|
+
"""Storage backend interfaces for caching data."""
|
|
2
|
+
|
|
3
|
+
import io
|
|
4
|
+
import json
|
|
5
|
+
import logging
|
|
6
|
+
from abc import ABC, abstractmethod
|
|
7
|
+
|
|
8
|
+
import boto3
|
|
9
|
+
import duckdb
|
|
10
|
+
import pandas as pd
|
|
11
|
+
|
|
12
|
+
from biodata_cache.utils import BDC_VERSION, CacheLogMessage
|
|
13
|
+
|
|
14
|
+
# Top-level prefix under which every cache object is stored.
_CACHE_ROOT = "data-asset-cache"
# Per-release sub-folder, so caches written by different package versions
# do not overwrite each other.
_VERSION_FOLDER = f"bdc-v{BDC_VERSION}"
|
|
16
|
+
|
|
17
|
+
# Base table name -> column used as the hive partition key. A table name of
# the form "<base>/<value>" is stored at "<base>/<partition_key>=<value>/data.pqt".
HIVE_PARTITION_KEYS: dict[str, str] = {
    "qc": "subject_id",
    "qc_tag_status": "subject_id",
    "platform_qc": "platform",
}
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class Backend(ABC):
|
|
25
|
+
"""Base class for a cache storage backend."""
|
|
26
|
+
|
|
27
|
+
def __init__(self) -> None:
|
|
28
|
+
"""Initialize the Backend."""
|
|
29
|
+
super().__init__()
|
|
30
|
+
|
|
31
|
+
@abstractmethod
|
|
32
|
+
def write(self, table_name: str, data: pd.DataFrame) -> None:
|
|
33
|
+
"""Write records to the cache."""
|
|
34
|
+
pass # pragma: no cover
|
|
35
|
+
|
|
36
|
+
@abstractmethod
|
|
37
|
+
def read(self, table_name: str | list[str]) -> pd.DataFrame:
|
|
38
|
+
"""Read records from the cache.
|
|
39
|
+
|
|
40
|
+
Args:
|
|
41
|
+
table_name: Single table name or list of table names.
|
|
42
|
+
When a list is provided, merges all tables and adds
|
|
43
|
+
an 'asset_name' column to differentiate sources.
|
|
44
|
+
|
|
45
|
+
"""
|
|
46
|
+
pass # pragma: no cover
|
|
47
|
+
|
|
48
|
+
@abstractmethod
|
|
49
|
+
def get_location(self, table_name: str, partitioned: bool = False) -> str:
|
|
50
|
+
"""Return the storage location string for a given table."""
|
|
51
|
+
pass # pragma: no cover
|
|
52
|
+
|
|
53
|
+
@abstractmethod
|
|
54
|
+
def put_json(self, key: str, data: str) -> None:
|
|
55
|
+
"""Write a JSON string to the storage root under the given key."""
|
|
56
|
+
pass # pragma: no cover
|
|
57
|
+
|
|
58
|
+
@abstractmethod
|
|
59
|
+
def get_json(self, key: str) -> str:
|
|
60
|
+
"""Read a JSON string from the storage root under the given key."""
|
|
61
|
+
pass # pragma: no cover
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
class S3Backend(Backend):
    """Cache backend that keeps each table as a parquet object in AWS S3.

    Writes go through the boto3 S3 client; reads are issued as DuckDB
    ``read_parquet`` queries against ``s3://`` URIs. All objects live under
    ``<_CACHE_ROOT>/<_VERSION_FOLDER>/`` in ``self.bucket``.
    """

    def __init__(self) -> None:
        """Initialize S3Backend with the target bucket and an S3 client."""
        super().__init__()
        self.bucket = "allen-data-views"
        self.s3_client = boto3.client("s3")

    @staticmethod
    def _parquet_key(table_name: str) -> str:
        """Return the S3 object key of the parquet file for ``table_name``.

        A name of the form ``"<base>/<value>"`` maps to the hive-style path
        ``<base>/<partition_key>=<value>/data.pqt``; any other name maps to
        ``<table_name>.pqt``.

        Raises:
            KeyError: If ``<base>`` has no entry in ``HIVE_PARTITION_KEYS``.

        """
        if "/" in table_name:
            base, value = table_name.split("/", 1)
            partition_key = HIVE_PARTITION_KEYS[base]
            return f"{_CACHE_ROOT}/{_VERSION_FOLDER}/{base}/{partition_key}={value}/data.pqt"
        return f"{_CACHE_ROOT}/{_VERSION_FOLDER}/{table_name}.pqt"

    @staticmethod
    def _sql_literal(value: str) -> str:
        """Return ``value`` as a single-quoted SQL string literal.

        Embedded single quotes are doubled so a table name cannot terminate
        the literal early.
        """
        return "'" + value.replace("'", "''") + "'"

    def write(self, table_name: str, data: pd.DataFrame) -> None:
        """Store DataFrame as parquet file in S3, plus a JSON column listing.

        The column listing is written to ``<base>.json``, where ``<base>`` is
        the part of ``table_name`` before the first ``/`` (the whole name for
        non-partitioned tables), so all partitions of a table share one file.

        Raises:
            KeyError: If ``table_name`` is partitioned and its base is unknown.

        """
        s3_key = self._parquet_key(table_name)
        base = table_name.split("/", 1)[0]
        json_key = f"{_CACHE_ROOT}/{_VERSION_FOLDER}/{base}.json"

        parquet_buffer = io.BytesIO()
        data.to_parquet(parquet_buffer, index=False)

        self.s3_client.put_object(
            Bucket=self.bucket,
            Key=s3_key,
            Body=parquet_buffer.getvalue(),
        )
        logging.info(
            CacheLogMessage(
                backend="S3Backend", table=table_name, message=f"Stored cache to s3://{self.bucket}/{s3_key}"
            ).to_json()
        )

        metadata = {"columns": data.columns.tolist()}
        self.s3_client.put_object(
            Bucket=self.bucket,
            Key=json_key,
            Body=json.dumps(metadata),
        )

    def read(self, table_name: str | list[str]) -> pd.DataFrame:
        """Fetch DataFrame from S3 parquet file(s).

        When given a list of table names, merges them using DuckDB
        and adds an 'asset_name' column.
        """
        if isinstance(table_name, list):
            return self._read_multiple(table_name)
        return self._read_single(table_name)

    def _read_single(self, table_name: str) -> pd.DataFrame:
        """Fetch a single table from S3.

        Any failure of the query is logged as a warning and yields an empty
        DataFrame. An unknown partition base still raises ``KeyError``,
        because the key is built before the guarded query.
        """
        s3_key = self._parquet_key(table_name)
        uri = f"s3://{self.bucket}/{s3_key}"

        try:
            result = duckdb.query(f"SELECT * FROM read_parquet({self._sql_literal(uri)})").to_df()
        except Exception as e:
            # Best-effort read: a missing or unreadable object means "no cache".
            logging.warning(
                CacheLogMessage(
                    backend="S3Backend", table=table_name, message=f"Error fetching from cache {s3_key}: {e}"
                ).to_json()
            )
            return pd.DataFrame()

        logging.info(
            CacheLogMessage(
                backend="S3Backend", table=table_name, message=f"Retrieved cache from s3://{self.bucket}/{s3_key}"
            ).to_json()
        )
        return result

    def get_location(self, table_name: str, partitioned: bool = False) -> str:
        """Return the S3 URI for a given table.

        With ``partitioned=True`` the URI of the table's folder is returned;
        otherwise the URI of the parquet object itself.
        """
        if partitioned:
            return f"s3://{self.bucket}/{_CACHE_ROOT}/{_VERSION_FOLDER}/{table_name}/"
        return f"s3://{self.bucket}/{self._parquet_key(table_name)}"

    def put_json(self, key: str, data: str) -> None:  # pragma: no cover
        """Write a JSON string to the versioned folder in S3 and update the index.

        The index (``<_CACHE_ROOT>/cache_versions.json``) is a JSON list of
        version folders. It is only rewritten when the current version folder
        is not listed yet.
        """
        s3_key = f"{_CACHE_ROOT}/{_VERSION_FOLDER}/{key}"
        self.s3_client.put_object(
            Bucket=self.bucket,
            Key=s3_key,
            Body=data.encode(),
            ContentType="application/json",
        )
        logging.info(
            CacheLogMessage(
                backend="S3Backend", table=key, message=f"Published metadata to s3://{self.bucket}/{s3_key}"
            ).to_json()
        )

        # NOTE(review): the index update is a plain read-modify-write with no
        # locking, so concurrent publishers could presumably race -- confirm.
        index_key = f"{_CACHE_ROOT}/cache_versions.json"
        try:
            response = self.s3_client.get_object(Bucket=self.bucket, Key=index_key)
            existing = json.loads(response["Body"].read().decode())
        except (self.s3_client.exceptions.NoSuchKey, ValueError):
            # No index yet, or its content is not valid JSON: start a new one.
            existing = []
        except Exception as e:
            # Any other failure (network, throttling, permissions) says nothing
            # about the index content. Rewriting it from an empty list would
            # drop every previously recorded version, so leave it untouched.
            logging.warning(
                CacheLogMessage(
                    backend="S3Backend",
                    table=key,
                    message=f"Could not read version index {index_key}; leaving it unchanged: {e}",
                ).to_json()
            )
            return

        if not isinstance(existing, list):
            existing = []
        if _VERSION_FOLDER not in existing:
            existing.append(_VERSION_FOLDER)
            self.s3_client.put_object(
                Bucket=self.bucket,
                Key=index_key,
                Body=json.dumps(existing).encode(),
                ContentType="application/json",
            )

    def get_json(self, key: str) -> str:  # pragma: no cover
        """Read a JSON string from the versioned folder in S3."""
        s3_key = f"{_CACHE_ROOT}/{_VERSION_FOLDER}/{key}"
        response = self.s3_client.get_object(Bucket=self.bucket, Key=s3_key)
        return response["Body"].read().decode()

    def _read_multiple(self, table_names: list[str]) -> pd.DataFrame:
        """Fetch and merge multiple tables from S3.

        Builds one ``UNION ALL`` query with a ``SELECT`` per table, each
        tagged with its table name in an 'asset_name' column. Partitioned
        names (``"<base>/<value>"``) resolve to the same object keys as in
        ``_read_single``. Any failure is logged as a warning and yields an
        empty DataFrame.
        """
        try:
            # Key construction stays inside the guard so that an unknown
            # partition base degrades to an empty result instead of raising.
            selects = [
                "SELECT *, {asset} as asset_name FROM read_parquet({path})".format(
                    asset=self._sql_literal(tbl_name),
                    path=self._sql_literal(f"s3://{self.bucket}/{self._parquet_key(tbl_name)}"),
                )
                for tbl_name in table_names
            ]
            result = duckdb.query(" UNION ALL ".join(selects)).to_df()
        except Exception as e:
            logging.warning(
                CacheLogMessage(backend="S3Backend", table="merged", message=f"Error merging tables: {e}").to_json()
            )
            return pd.DataFrame()

        logging.info(
            CacheLogMessage(
                backend="S3Backend", table="merged", message=f"Merged {len(table_names)} tables from S3"
            ).to_json()
        )
        return result
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
class MemoryBackend(Backend):
    """Dictionary-backed cache backend for tests and local development."""

    def __init__(self) -> None:
        """Initialize MemoryBackend with empty table and JSON stores."""
        super().__init__()
        self._store: dict[str, pd.DataFrame] = {}
        self._json_store: dict[str, str] = {}

    def write(self, table_name: str, data: pd.DataFrame) -> None:
        """Keep ``data`` in memory under ``table_name`` (stored by reference)."""
        log_entry = CacheLogMessage(
            backend="MemoryBackend", table=table_name, message=f"Storing cache in memory for {table_name}"
        )
        logging.info(log_entry.to_json())
        self._store[table_name] = data

    def read(self, table_name: str | list[str]) -> pd.DataFrame:
        """Return one table, or several tables merged with an 'asset_name' column.

        A list argument is merged via ``_read_multiple``; a string is looked
        up directly via ``_read_single``.
        """
        reader = self._read_multiple if isinstance(table_name, list) else self._read_single
        return reader(table_name)

    def _read_single(self, table_name: str) -> pd.DataFrame:
        """Return the stored table, or an empty DataFrame if it was never written."""
        log_entry = CacheLogMessage(
            backend="MemoryBackend", table=table_name, message=f"Fetching cache from memory for {table_name}"
        )
        logging.info(log_entry.to_json())
        return self._store.get(table_name, pd.DataFrame())

    def get_location(self, table_name: str, partitioned: bool = False) -> str:
        """Return the versioned path-like identifier for ``table_name``.

        Raises:
            KeyError: If ``table_name`` is ``"<base>/<value>"`` and ``<base>``
                has no entry in ``HIVE_PARTITION_KEYS``.

        """
        if partitioned:
            return f"{_VERSION_FOLDER}/{table_name}/"
        if "/" not in table_name:
            return f"{_VERSION_FOLDER}/{table_name}.pqt"
        base, value = table_name.split("/", 1)
        partition_key = HIVE_PARTITION_KEYS[base]
        return f"{_VERSION_FOLDER}/{base}/{partition_key}={value}/data.pqt"

    def put_json(self, key: str, data: str) -> None:
        """Store a JSON string under the versioned key and record the version.

        The in-memory "cache_versions.json" entry holds a JSON list of
        version folders; the current one is appended if absent.
        """
        log_entry = CacheLogMessage(
            backend="MemoryBackend", table=key, message=f"Storing metadata in memory for {key}"
        )
        logging.info(log_entry.to_json())
        self._json_store[f"{_VERSION_FOLDER}/{key}"] = data

        versions = json.loads(self._json_store.get("cache_versions.json", "[]"))
        if _VERSION_FOLDER in versions:
            return
        versions.append(_VERSION_FOLDER)
        self._json_store["cache_versions.json"] = json.dumps(versions)

    def get_json(self, key: str) -> str:
        """Return the JSON string stored under the versioned key, or "{}" if absent."""
        return self._json_store.get(f"{_VERSION_FOLDER}/{key}", "{}")

    def _read_multiple(self, table_names: list[str]) -> pd.DataFrame:
        """Concatenate the non-empty stored tables, tagging rows with 'asset_name'.

        Tables that are missing or empty are skipped. If nothing remains, a
        warning is logged and an empty DataFrame is returned.
        """
        tagged_frames = []
        for name in table_names:
            frame = self._store.get(name, pd.DataFrame())
            if frame.empty:
                continue
            # Copy first so the stored table does not gain the extra column.
            tagged = frame.copy()
            tagged["asset_name"] = name
            tagged_frames.append(tagged)

        if tagged_frames:
            merged = pd.concat(tagged_frames, ignore_index=True)
            logging.info(
                CacheLogMessage(
                    backend="MemoryBackend", table="merged", message=f"Merged {len(tagged_frames)} tables from memory"
                ).to_json()
            )
            return merged

        logging.warning(
            CacheLogMessage(
                backend="MemoryBackend", table="merged", message=f"No valid tables found among {table_names}"
            ).to_json()
        )
        return pd.DataFrame()
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
"""Acorns module: individual data fetching functions."""
|
|
2
|
+
|
|
3
|
+
from biodata_cache.cache_table_helpers import ( # noqa: F401
|
|
4
|
+
asset_basics,
|
|
5
|
+
behavior_curriculum,
|
|
6
|
+
custom,
|
|
7
|
+
foraging_sessions,
|
|
8
|
+
metadata_core,
|
|
9
|
+
metadata_upgrade,
|
|
10
|
+
platform_exaspim,
|
|
11
|
+
platform_fib,
|
|
12
|
+
platform_qc,
|
|
13
|
+
platform_smartspim,
|
|
14
|
+
qc,
|
|
15
|
+
raw_to_derived,
|
|
16
|
+
source_data,
|
|
17
|
+
unique_genotypes,
|
|
18
|
+
unique_project_names,
|
|
19
|
+
unique_subject_ids,
|
|
20
|
+
)
|
|
@@ -0,0 +1,271 @@
|
|
|
1
|
+
"""Asset basics cache table."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
|
|
5
|
+
import pandas as pd
|
|
6
|
+
from aind_data_access_api.document_db import MetadataDbClient
|
|
7
|
+
|
|
8
|
+
import biodata_cache.registry as registry
|
|
9
|
+
from biodata_cache.models import Column
|
|
10
|
+
from biodata_cache.utils import (
|
|
11
|
+
CacheLogMessage,
|
|
12
|
+
apply_first_name_map,
|
|
13
|
+
build_first_name_map,
|
|
14
|
+
normalize_experimenters,
|
|
15
|
+
normalize_instrument_id,
|
|
16
|
+
setup_logging,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _record_section(record: dict, key: str) -> dict:
    """Return ``record[key]``, or an empty dict when it is missing or null."""
    return record.get(key) or {}


def _person_names(people: list | None) -> list[str]:
    """Return names from entries that are plain strings or dicts with a ``name`` key."""
    return [person if isinstance(person, str) else person.get("name", "") for person in (people or [])]


def _latest_process_date(record: dict) -> str | None:
    """Return the date part (text before ``T``) of the last data process start time.

    Returns None when there are no data processes or the last one has no
    string ``start_date_time``.
    """
    data_processes = _record_section(record, "processing").get("data_processes") or []
    if not data_processes:
        return None
    start = data_processes[-1].get("start_date_time")
    if not isinstance(start, str):
        return None
    return start.split("T")[0]


def _age_in_days(acquisition: dict) -> int | None:
    """Return days between birth and acquisition start, or None if not computable.

    Uses ``subject_details.date_of_birth`` when present, otherwise January 1st
    of ``subject_details.year_of_birth``.
    """
    acquisition_start = acquisition.get("acquisition_start_time")
    subject_details = acquisition.get("subject_details") or {}
    date_of_birth = subject_details.get("date_of_birth")
    year_of_birth = subject_details.get("year_of_birth")
    if not acquisition_start or not (date_of_birth or year_of_birth):
        return None
    try:
        acq_date = pd.to_datetime(acquisition_start)
        dob = pd.to_datetime(date_of_birth) if date_of_birth else pd.Timestamp(int(year_of_birth), 1, 1)
        return (acq_date - dob).days
    except Exception:
        # Deliberately best-effort: unparsable or incompatible dates give no age.
        return None


def _iter_names(cell) -> list:
    """Return a cell's names as a list; non-iterables and bare strings give []."""
    if hasattr(cell, "__iter__") and not isinstance(cell, str):
        return list(cell)
    return []


def _flatten_asset_record(record: dict) -> dict:
    """Flatten one nested DocDB record into a single cache-table row."""
    data_description = _record_section(record, "data_description")
    subject = _record_section(record, "subject")
    acquisition = _record_section(record, "acquisition")

    modalities = data_description.get("modalities") or []
    experimenters = _person_names(acquisition.get("experimenters"))
    investigators = _person_names(data_description.get("investigators"))
    instrument_id = acquisition.get("instrument_id")

    return {
        "_id": record["_id"],
        "_last_modified": record.get("_last_modified"),
        "modalities": [modality["abbreviation"] for modality in modalities if "abbreviation" in modality],
        "project_name": data_description.get("project_name"),
        "data_level": data_description.get("data_level"),
        "subject_id": subject.get("subject_id"),
        "acquisition_start_time": acquisition.get("acquisition_start_time"),
        "acquisition_end_time": acquisition.get("acquisition_end_time"),
        "code_ocean": _record_section(record, "other_identifiers").get("Code Ocean"),
        "process_date": _latest_process_date(record),
        "genotype": (subject.get("subject_details") or {}).get("genotype"),
        "age": _age_in_days(acquisition),
        "acquisition_type": acquisition.get("acquisition_type"),
        "location": record.get("location"),
        "name": record.get("name"),
        "experimenters": experimenters,
        # Pass copies so the raw lists stay independent of the normalized ones.
        "experimenters_normalized": normalize_experimenters(list(experimenters)),
        "instrument_id": instrument_id,
        "instrument_id_normalized": normalize_instrument_id(instrument_id),
        "investigators": investigators,
        "investigators_normalized": normalize_experimenters(list(investigators)),
    }


@registry.register_table(registry.NAMES["basics"])
def asset_basics(force_update: bool = False) -> pd.DataFrame:
    """Fetch basic asset metadata including modalities, projects, and subject info.

    Returns the cached table when it is non-empty and ``force_update`` is
    False. Otherwise the table is rebuilt from the metadata database: record
    IDs are listed first, the needed fields are then fetched in batches of
    100 IDs, each record is flattened to one row, the normalized experimenter
    and investigator names are passed through a first-name map built over the
    whole table, and the result is written back to the cache.

    Columns: _id, _last_modified, modalities, project_name, data_level,
    subject_id, acquisition_start_time, acquisition_end_time, code_ocean,
    process_date, genotype, age, acquisition_type, location, name,
    experimenters, experimenters_normalized, instrument_id,
    instrument_id_normalized, investigators, investigators_normalized.

    Args:
        force_update: If True, bypass cache and fetch fresh data from database.

    Returns:
        DataFrame with basic asset metadata.

    """
    table_name = registry.NAMES["basics"]
    backend_name = registry.BACKEND.__class__.__name__

    df = registry.BACKEND.read(table_name)
    if not (df.empty or force_update):
        return df

    FIELDS = [
        "data_description.modalities",
        "data_description.project_name",
        "data_description.data_level",
        "subject.subject_id",
        "acquisition.acquisition_start_time",
        "acquisition.acquisition_end_time",
        "acquisition.acquisition_type",
        "acquisition.subject_details.date_of_birth",
        "acquisition.subject_details.year_of_birth",
        "processing.data_processes.start_date_time",
        "subject.subject_details.genotype",
        "other_identifiers",
        "location",
        "name",
        "acquisition.experimenters",
        "acquisition.instrument_id",
        "data_description.investigators",
    ]

    setup_logging()
    logging.info(CacheLogMessage(backend=backend_name, table=table_name, message="Updating cache").to_json())

    # NOTE(review): the cached frame is discarded here, so the _last_modified
    # comparison below never finds a cached row and every record is
    # re-fetched. Confirm whether incremental updates are intended.
    df = pd.DataFrame(
        columns=[
            "_id",
            "_last_modified",
            "modalities",
            "project_name",
            "data_level",
            "subject_id",
            "acquisition_start_time",
            "acquisition_end_time",
            "code_ocean",
            "process_date",
            "genotype",
            "age",
            "acquisition_type",
            "location",
            "name",
            "experimenters",
            "experimenters_normalized",
            "instrument_id",
            "instrument_id_normalized",
            "investigators",
            "investigators_normalized",
        ]
    )
    client = MetadataDbClient(
        host=registry.API_GATEWAY_HOST,
        version="v2",
    )
    # It's a bit complex to get multiple fields that aren't indexed in a database
    # as large as DocDB. We'll also try to limit ourselves to only updating fields
    # that are necessary
    record_ids = client.retrieve_docdb_records(
        filter_query={},
        projection={"_id": 1, "_last_modified": 1},
        limit=0,
    )

    # One dict lookup per record instead of scanning the frame for each one.
    # drop_duplicates keeps the first row per _id, matching a `.values[0]` read.
    cached = df.drop_duplicates(subset="_id")
    cached_modified = dict(zip(cached["_id"], cached["_last_modified"]))

    # Keep every _id that is new or whose _last_modified differs from the cache.
    keep_ids = [
        record["_id"]
        for record in record_ids
        if record["_id"] not in cached_modified or cached_modified[record["_id"]] != record["_last_modified"]
    ]

    # Now batch by 100 IDs at a time to avoid overloading server, and fetch all the fields
    BATCH_SIZE = 100
    projection = {field: 1 for field in FIELDS + ["_id", "_last_modified"]}
    asset_records = []
    for i in range(0, len(keep_ids), BATCH_SIZE):
        logging.info(
            CacheLogMessage(
                backend=backend_name,
                table=table_name,
                message=f"Fetching batch {i // BATCH_SIZE + 1}",
            ).to_json()
        )
        batch_records = client.retrieve_docdb_records(
            filter_query={"_id": {"$in": keep_ids[i : i + BATCH_SIZE]}},
            projection=projection,
            limit=0,
        )
        asset_records.extend(batch_records)

    # Unwrap nested fields, then combine new records with the old df
    new_df = pd.DataFrame([_flatten_asset_record(record) for record in asset_records])
    df = pd.concat([df[~df["_id"].isin(keep_ids)], new_df], ignore_index=True)

    # Build the first-name map over all normalized names, de-duplicated in
    # first-seen order, then apply it to both normalized columns.
    all_names = [
        name
        for col in ("experimenters_normalized", "investigators_normalized")
        for cell in df[col]
        for name in _iter_names(cell)
    ]
    first_name_map = build_first_name_map(list(dict.fromkeys(all_names)))
    if first_name_map:
        for col in ("experimenters_normalized", "investigators_normalized"):
            df[col] = df[col].apply(lambda x: apply_first_name_map(_iter_names(x), first_name_map))

    registry.BACKEND.write(table_name, df)

    return df
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
def asset_basics_columns() -> list[Column]:
    """Describe every column of the asset basics cache table, in table order."""
    column_specs = (
        ("_id", "DocDB record ID for the asset"),
        ("_last_modified", "DocDB last modified timestamp for the asset record"),
        ("modalities", "Modalities present in the asset as a list of abbreviations"),
        ("project_name", "Project name associated with the asset"),
        ("data_level", "Data level of the asset (e.g. raw, derived)"),
        ("subject_id", "Subject ID"),
        ("acquisition_start_time", "Acquisition start time in ISO format"),
        ("acquisition_end_time", "Acquisition end time in ISO format"),
        ("code_ocean", "Code Ocean asset ID if available"),
        ("process_date", "Date of latest processing in YYYY-MM-DD format"),
        ("genotype", "Genotype information for the subject if available"),
        (
            "age",
            "Age of the subject in days at time of acquisition, derived from date_of_birth or year_of_birth",
        ),
        ("acquisition_type", "Acquisition type (e.g. multiplane-2photon)"),
        ("location", "Location of the asset in S3"),
        ("name", "Asset name"),
        ("experimenters", "Acquisition experimenters as a list of raw names"),
        (
            "experimenters_normalized",
            "Normalized, deduplicated list of experimenter display names in original order",
        ),
        ("instrument_id", "Instrument ID used for the acquisition"),
        ("instrument_id_normalized", "Normalized short instrument name derived from instrument_id"),
        ("investigators", "Investigators from data_description as a list of names"),
        (
            "investigators_normalized",
            "Normalized, deduplicated list of investigator display names in original order",
        ),
    )
    return [Column(name=col_name, description=col_description) for col_name, col_description in column_specs]
|