biodata-cache 0.32.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29) hide show
  1. biodata_cache/__init__.py +32 -0
  2. biodata_cache/backend.py +310 -0
  3. biodata_cache/cache_table_helpers/__init__.py +20 -0
  4. biodata_cache/cache_table_helpers/asset_basics.py +271 -0
  5. biodata_cache/cache_table_helpers/behavior_curriculum.py +170 -0
  6. biodata_cache/cache_table_helpers/custom.py +31 -0
  7. biodata_cache/cache_table_helpers/foraging_sessions.py +110 -0
  8. biodata_cache/cache_table_helpers/metadata_core.py +120 -0
  9. biodata_cache/cache_table_helpers/metadata_upgrade.py +150 -0
  10. biodata_cache/cache_table_helpers/platform_exaspim.py +171 -0
  11. biodata_cache/cache_table_helpers/platform_fib.py +197 -0
  12. biodata_cache/cache_table_helpers/platform_qc.py +196 -0
  13. biodata_cache/cache_table_helpers/platform_smartspim.py +274 -0
  14. biodata_cache/cache_table_helpers/qc.py +259 -0
  15. biodata_cache/cache_table_helpers/raw_to_derived.py +37 -0
  16. biodata_cache/cache_table_helpers/source_data.py +97 -0
  17. biodata_cache/cache_table_helpers/unique_genotypes.py +63 -0
  18. biodata_cache/cache_table_helpers/unique_project_names.py +62 -0
  19. biodata_cache/cache_table_helpers/unique_subject_ids.py +62 -0
  20. biodata_cache/models.py +40 -0
  21. biodata_cache/registry.py +66 -0
  22. biodata_cache/sync.py +186 -0
  23. biodata_cache/utils.py +124 -0
  24. biodata_cache-0.32.0.dist-info/METADATA +113 -0
  25. biodata_cache-0.32.0.dist-info/RECORD +29 -0
  26. biodata_cache-0.32.0.dist-info/WHEEL +5 -0
  27. biodata_cache-0.32.0.dist-info/licenses/LICENSE +21 -0
  28. biodata_cache-0.32.0.dist-info/top_level.txt +2 -0
  29. zombie_squirrel/__init__.py +32 -0
@@ -0,0 +1,32 @@
1
+ """Zombie-squirrel: caching and synchronization for AIND metadata.
2
+
3
+ Provides functions to fetch and cache project names, subject IDs, and asset
4
+ metadata from the AIND metadata database with support for multiple backends.
5
+ Also exposes get_cache_registry to retrieve the cache_registry.json registry of all
6
+ available cache tables and their metadata.
7
+ """
8
+
9
+ __version__ = "0.32.0"
10
+
11
+ from biodata_cache.cache_table_helpers.asset_basics import asset_basics # noqa: F401
12
+ from biodata_cache.cache_table_helpers.behavior_curriculum import behavior_curriculum # noqa: F401
13
+ from biodata_cache.cache_table_helpers.custom import custom # noqa: F401
14
+ from biodata_cache.cache_table_helpers.foraging_sessions import foraging_sessions # noqa: F401
15
+ from biodata_cache.cache_table_helpers.metadata_upgrade import metadata_upgrade # noqa: F401
16
+ from biodata_cache.cache_table_helpers.platform_exaspim import platform_exaspim # noqa: F401
17
+ from biodata_cache.cache_table_helpers.platform_fib import platform_fib # noqa: F401
18
+ from biodata_cache.cache_table_helpers.platform_qc import platform_qc # noqa: F401
19
+ from biodata_cache.cache_table_helpers.platform_smartspim import assets_smartspim # noqa: F401
20
+ from biodata_cache.cache_table_helpers.qc import qc, qc_columns # noqa: F401
21
+ from biodata_cache.cache_table_helpers.raw_to_derived import raw_to_derived # noqa: F401
22
+ from biodata_cache.cache_table_helpers.source_data import source_data # noqa: F401
23
+ from biodata_cache.cache_table_helpers.unique_genotypes import ( # noqa: F401
24
+ unique_genotypes,
25
+ )
26
+ from biodata_cache.cache_table_helpers.unique_project_names import ( # noqa: F401
27
+ unique_project_names,
28
+ )
29
+ from biodata_cache.cache_table_helpers.unique_subject_ids import ( # noqa: F401
30
+ unique_subject_ids,
31
+ )
32
+ from biodata_cache.utils import get_cache_registry # noqa: F401
@@ -0,0 +1,310 @@
1
+ """Storage backend interfaces for caching data."""
2
+
3
+ import io
4
+ import json
5
+ import logging
6
+ from abc import ABC, abstractmethod
7
+
8
+ import boto3
9
+ import duckdb
10
+ import pandas as pd
11
+
12
+ from biodata_cache.utils import BDC_VERSION, CacheLogMessage
13
+
14
# Top-level prefix under which every cache object is stored.
_CACHE_ROOT = "data-asset-cache"

# Per-release sub-folder, derived from the BDC_VERSION constant in biodata_cache.utils.
_VERSION_FOLDER = f"bdc-v{BDC_VERSION}"

# Maps a partitioned table's base name to the hive partition column used in its
# storage path (``<base>/<partition_key>=<value>/data.pqt``).
HIVE_PARTITION_KEYS: dict[str, str] = {
    "qc": "subject_id",
    "qc_tag_status": "subject_id",
    "platform_qc": "platform",
}
22
+
23
+
24
+ class Backend(ABC):
25
+ """Base class for a cache storage backend."""
26
+
27
+ def __init__(self) -> None:
28
+ """Initialize the Backend."""
29
+ super().__init__()
30
+
31
+ @abstractmethod
32
+ def write(self, table_name: str, data: pd.DataFrame) -> None:
33
+ """Write records to the cache."""
34
+ pass # pragma: no cover
35
+
36
+ @abstractmethod
37
+ def read(self, table_name: str | list[str]) -> pd.DataFrame:
38
+ """Read records from the cache.
39
+
40
+ Args:
41
+ table_name: Single table name or list of table names.
42
+ When a list is provided, merges all tables and adds
43
+ an 'asset_name' column to differentiate sources.
44
+
45
+ """
46
+ pass # pragma: no cover
47
+
48
+ @abstractmethod
49
+ def get_location(self, table_name: str, partitioned: bool = False) -> str:
50
+ """Return the storage location string for a given table."""
51
+ pass # pragma: no cover
52
+
53
+ @abstractmethod
54
+ def put_json(self, key: str, data: str) -> None:
55
+ """Write a JSON string to the storage root under the given key."""
56
+ pass # pragma: no cover
57
+
58
+ @abstractmethod
59
+ def get_json(self, key: str) -> str:
60
+ """Read a JSON string from the storage root under the given key."""
61
+ pass # pragma: no cover
62
+
63
+
64
class S3Backend(Backend):
    """Stores and retrieves caches using AWS S3 with parquet files.

    An unpartitioned table is stored at ``<root>/<version>/<table>.pqt``. A table
    name of the form ``<base>/<value>`` is stored hive-style at
    ``<root>/<version>/<base>/<partition_key>=<value>/data.pqt``, where the
    partition key is looked up in ``HIVE_PARTITION_KEYS``.
    """

    def __init__(self) -> None:
        """Initialize S3Backend with the target bucket name and an S3 client."""
        super().__init__()
        self.bucket = "allen-data-views"
        self.s3_client = boto3.client("s3")

    @staticmethod
    def _parquet_key(table_name: str) -> str:
        """Return the bucket-relative parquet key for ``table_name``.

        Raises:
            KeyError: If ``table_name`` is ``<base>/<value>`` and ``<base>`` has no
                entry in ``HIVE_PARTITION_KEYS``.

        """
        if "/" in table_name:
            base, value = table_name.split("/", 1)
            partition_key = HIVE_PARTITION_KEYS[base]
            return f"{_CACHE_ROOT}/{_VERSION_FOLDER}/{base}/{partition_key}={value}/data.pqt"
        return f"{_CACHE_ROOT}/{_VERSION_FOLDER}/{table_name}.pqt"

    @staticmethod
    def _sql_quote(text: str) -> str:
        """Return ``text`` as a single-quoted SQL string literal with quotes doubled."""
        return "'" + text.replace("'", "''") + "'"

    def write(self, table_name: str, data: pd.DataFrame) -> None:
        """Store DataFrame as a parquet object in S3 plus a JSON column listing.

        Args:
            table_name: Table name, optionally ``<base>/<value>`` for a partition.
            data: Records to store; the index is not written.

        Raises:
            KeyError: If a partitioned ``table_name`` uses an unknown base.

        """
        s3_key = self._parquet_key(table_name)
        # The column listing is stored once per base table, so every partition of
        # the same base writes to the same ``<base>.json`` object.
        base = table_name.split("/", 1)[0]
        json_key = f"{_CACHE_ROOT}/{_VERSION_FOLDER}/{base}.json"

        parquet_buffer = io.BytesIO()
        data.to_parquet(parquet_buffer, index=False)

        self.s3_client.put_object(
            Bucket=self.bucket,
            Key=s3_key,
            Body=parquet_buffer.getvalue(),
        )
        logging.info(
            CacheLogMessage(
                backend="S3Backend", table=table_name, message=f"Stored cache to s3://{self.bucket}/{s3_key}"
            ).to_json()
        )

        metadata = {"columns": data.columns.tolist()}
        self.s3_client.put_object(
            Bucket=self.bucket,
            Key=json_key,
            Body=json.dumps(metadata),
        )

    def read(self, table_name: str | list[str]) -> pd.DataFrame:
        """Fetch DataFrame from S3 parquet file(s).

        When given a list of table names, merges them using DuckDB
        and adds an 'asset_name' column.
        """
        if isinstance(table_name, list):
            return self._read_multiple(table_name)
        return self._read_single(table_name)

    def _read_single(self, table_name: str) -> pd.DataFrame:
        """Fetch a single table from S3.

        Returns an empty DataFrame (and logs a warning) when the query fails.

        Raises:
            KeyError: If a partitioned ``table_name`` uses an unknown base; the key
                is resolved before the guarded query.

        """
        s3_key = self._parquet_key(table_name)
        uri = f"s3://{self.bucket}/{s3_key}"

        try:
            result = duckdb.query(f"SELECT * FROM read_parquet({self._sql_quote(uri)})").to_df()
            logging.info(
                CacheLogMessage(
                    backend="S3Backend", table=table_name, message=f"Retrieved cache from s3://{self.bucket}/{s3_key}"
                ).to_json()
            )
            return result
        except Exception as e:
            # Best-effort read: any failure is reported as "no cached data".
            logging.warning(
                CacheLogMessage(
                    backend="S3Backend", table=table_name, message=f"Error fetching from cache {s3_key}: {e}"
                ).to_json()
            )
            return pd.DataFrame()

    def get_location(self, table_name: str, partitioned: bool = False) -> str:
        """Return the S3 URI for a given table.

        With ``partitioned=True`` the URI of the table's folder is returned instead
        of the URI of a single parquet object.
        """
        if partitioned:
            return f"s3://{self.bucket}/{_CACHE_ROOT}/{_VERSION_FOLDER}/{table_name}/"
        return f"s3://{self.bucket}/{self._parquet_key(table_name)}"

    def put_json(self, key: str, data: str) -> None:  # pragma: no cover
        """Write a JSON string to the versioned folder in S3 and update the index.

        The index object ``<root>/cache_versions.json`` holds a JSON list of version
        folders; the current version folder is appended when it is not listed yet.
        """
        s3_key = f"{_CACHE_ROOT}/{_VERSION_FOLDER}/{key}"
        self.s3_client.put_object(
            Bucket=self.bucket,
            Key=s3_key,
            Body=data.encode(),
            ContentType="application/json",
        )
        logging.info(
            CacheLogMessage(
                backend="S3Backend", table=key, message=f"Published metadata to s3://{self.bucket}/{s3_key}"
            ).to_json()
        )
        index_key = f"{_CACHE_ROOT}/cache_versions.json"
        try:
            response = self.s3_client.get_object(Bucket=self.bucket, Key=index_key)
            existing = json.loads(response["Body"].read().decode())
        except Exception as e:
            # The index may simply not exist yet; start a new one, but leave a trace
            # in the log so that an unexpected read failure is not silent.
            logging.warning(
                CacheLogMessage(
                    backend="S3Backend", table=key, message=f"Could not read {index_key}, starting a new index: {e}"
                ).to_json()
            )
            existing = []
        if not isinstance(existing, list):
            # A malformed index would otherwise fail on ``append`` below.
            existing = []
        if _VERSION_FOLDER not in existing:
            existing.append(_VERSION_FOLDER)
            self.s3_client.put_object(
                Bucket=self.bucket,
                Key=index_key,
                Body=json.dumps(existing).encode(),
                ContentType="application/json",
            )

    def get_json(self, key: str) -> str:  # pragma: no cover
        """Read a JSON string from the versioned folder in S3."""
        s3_key = f"{_CACHE_ROOT}/{_VERSION_FOLDER}/{key}"
        response = self.s3_client.get_object(Bucket=self.bucket, Key=s3_key)
        return response["Body"].read().decode()

    def _read_multiple(self, table_names: list[str]) -> pd.DataFrame:
        """Fetch and merge multiple tables from S3.

        Each table is selected with an extra ``asset_name`` column holding its table
        name and the selects are combined with ``UNION ALL``. Returns an empty
        DataFrame (and logs a warning) when anything fails, including an unknown
        partition base.
        """
        try:
            selects = []
            for tbl_name in table_names:
                # Resolve keys the same way as single reads so that partitioned
                # names point at the objects that write() actually creates.
                uri = f"s3://{self.bucket}/{self._parquet_key(tbl_name)}"
                selects.append(
                    f"SELECT *, {self._sql_quote(tbl_name)} as asset_name "
                    f"FROM read_parquet({self._sql_quote(uri)})"
                )
            result = duckdb.query(" UNION ALL ".join(selects)).to_df()
            logging.info(
                CacheLogMessage(
                    backend="S3Backend", table="merged", message=f"Merged {len(table_names)} tables from S3"
                ).to_json()
            )
            return result
        except Exception as e:
            logging.warning(
                CacheLogMessage(backend="S3Backend", table="merged", message=f"Error merging tables: {e}").to_json()
            )
            return pd.DataFrame()
220
+
221
+
222
class MemoryBackend(Backend):
    """In-memory backend, intended for tests and local development.

    Tables are held in a dict keyed by table name; JSON documents are held in a
    second dict keyed by ``<version folder>/<key>``.
    """

    def __init__(self) -> None:
        """Initialize MemoryBackend with empty table and JSON stores."""
        super().__init__()
        self._store: dict[str, pd.DataFrame] = {}
        self._json_store: dict[str, str] = {}

    def write(self, table_name: str, data: pd.DataFrame) -> None:
        """Keep ``data`` in memory under ``table_name`` (the frame is not copied)."""
        note = CacheLogMessage(
            backend="MemoryBackend", table=table_name, message=f"Storing cache in memory for {table_name}"
        )
        logging.info(note.to_json())
        self._store[table_name] = data

    def read(self, table_name: str | list[str]) -> pd.DataFrame:
        """Fetch DataFrame from memory.

        A list of table names is merged into one frame carrying an extra
        'asset_name' column; a single name returns that table.
        """
        reader = self._read_multiple if isinstance(table_name, list) else self._read_single
        return reader(table_name)

    def _read_single(self, table_name: str) -> pd.DataFrame:
        """Return the stored table, or an empty DataFrame when it is unknown."""
        note = CacheLogMessage(
            backend="MemoryBackend", table=table_name, message=f"Fetching cache from memory for {table_name}"
        )
        logging.info(note.to_json())
        if table_name in self._store:
            return self._store[table_name]
        return pd.DataFrame()

    def get_location(self, table_name: str, partitioned: bool = False) -> str:
        """Return the in-memory identifier for a given table.

        Raises:
            KeyError: If ``table_name`` is ``<base>/<value>`` with a base that is not
                in ``HIVE_PARTITION_KEYS`` (only when ``partitioned`` is false).

        """
        if partitioned:
            return f"{_VERSION_FOLDER}/{table_name}/"
        if "/" not in table_name:
            return f"{_VERSION_FOLDER}/{table_name}.pqt"
        base, value = table_name.split("/", 1)
        partition_key = HIVE_PARTITION_KEYS[base]
        return f"{_VERSION_FOLDER}/{base}/{partition_key}={value}/data.pqt"

    def put_json(self, key: str, data: str) -> None:
        """Store a JSON string under the version folder and record the version in the index."""
        note = CacheLogMessage(backend="MemoryBackend", table=key, message=f"Storing metadata in memory for {key}")
        logging.info(note.to_json())
        self._json_store[f"{_VERSION_FOLDER}/{key}"] = data

        # The index is a JSON list of version folders, stored outside the version folder.
        versions = json.loads(self._json_store.get("cache_versions.json", "[]"))
        if _VERSION_FOLDER in versions:
            return
        versions.append(_VERSION_FOLDER)
        self._json_store["cache_versions.json"] = json.dumps(versions)

    def get_json(self, key: str) -> str:
        """Return the JSON string stored for ``key``, or ``"{}"`` when there is none."""
        return self._json_store.get(f"{_VERSION_FOLDER}/{key}", "{}")

    def _read_multiple(self, table_names: list[str]) -> pd.DataFrame:
        """Concatenate the non-empty stored tables, tagging rows with their table name."""
        tagged = [
            # assign() returns a new frame, so the stored table is left untouched.
            self._store[name].assign(asset_name=name)
            for name in table_names
            if name in self._store and not self._store[name].empty
        ]

        if not tagged:
            logging.warning(
                CacheLogMessage(
                    backend="MemoryBackend", table="merged", message=f"No valid tables found among {table_names}"
                ).to_json()
            )
            return pd.DataFrame()

        merged = pd.concat(tagged, ignore_index=True)
        logging.info(
            CacheLogMessage(
                backend="MemoryBackend", table="merged", message=f"Merged {len(tagged)} tables from memory"
            ).to_json()
        )
        return merged
@@ -0,0 +1,20 @@
1
+ """Acorns module: individual data fetching functions."""
2
+
3
+ from biodata_cache.cache_table_helpers import ( # noqa: F401
4
+ asset_basics,
5
+ behavior_curriculum,
6
+ custom,
7
+ foraging_sessions,
8
+ metadata_core,
9
+ metadata_upgrade,
10
+ platform_exaspim,
11
+ platform_fib,
12
+ platform_qc,
13
+ platform_smartspim,
14
+ qc,
15
+ raw_to_derived,
16
+ source_data,
17
+ unique_genotypes,
18
+ unique_project_names,
19
+ unique_subject_ids,
20
+ )
@@ -0,0 +1,271 @@
1
+ """Asset basics cache table."""
2
+
3
+ import logging
4
+
5
+ import pandas as pd
6
+ from aind_data_access_api.document_db import MetadataDbClient
7
+
8
+ import biodata_cache.registry as registry
9
+ from biodata_cache.models import Column
10
+ from biodata_cache.utils import (
11
+ CacheLogMessage,
12
+ apply_first_name_map,
13
+ build_first_name_map,
14
+ normalize_experimenters,
15
+ normalize_instrument_id,
16
+ setup_logging,
17
+ )
18
+
19
+
20
def _asset_basics_lookup(record, *path):
    """Follow ``path`` through nested dicts; return None if any level is missing or not a dict."""
    node = record
    for key in path:
        if not isinstance(node, dict):
            return None
        node = node.get(key)
    return node


def _asset_basics_names(people) -> list:
    """Return the names in ``people``, whose entries are plain strings or dicts with a 'name' key."""
    return [person if isinstance(person, str) else person.get("name", "") for person in (people or [])]


def _asset_basics_iter_names(cell) -> list:
    """Return ``cell`` as a list when it is a non-string iterable, otherwise an empty list."""
    if hasattr(cell, "__iter__") and not isinstance(cell, str):
        return list(cell)
    return []


def _asset_basics_age_days(acquisition_start, date_of_birth, year_of_birth):
    """Return the age in days at acquisition, or None when it cannot be computed.

    ``date_of_birth`` takes precedence; otherwise January 1st of ``year_of_birth`` is used.
    """
    if not (acquisition_start and (date_of_birth or year_of_birth)):
        return None
    try:
        acq_date = pd.to_datetime(acquisition_start)
        if date_of_birth:
            dob = pd.to_datetime(date_of_birth)
        else:
            dob = pd.Timestamp(int(year_of_birth), 1, 1)
        return (acq_date - dob).days
    except Exception:
        # Best effort: unparseable or incompatible dates leave the age unset.
        return None


def _asset_basics_flatten(record: dict) -> dict:
    """Flatten one DocDB asset record into a row of the asset basics table."""
    modalities = _asset_basics_lookup(record, "data_description", "modalities") or []
    modality_abbreviations = [modality["abbreviation"] for modality in modalities if "abbreviation" in modality]

    # Process date: date part (YYYY-MM-DD) of the last data process' start time, if any.
    process_date = None
    data_processes = _asset_basics_lookup(record, "processing", "data_processes") or []
    if data_processes:
        process_datetime = data_processes[-1].get("start_date_time", None)
        if isinstance(process_datetime, str):
            process_date = process_datetime.split("T")[0]

    acquisition_start = _asset_basics_lookup(record, "acquisition", "acquisition_start_time")
    experimenters = _asset_basics_names(_asset_basics_lookup(record, "acquisition", "experimenters"))
    investigators = _asset_basics_names(_asset_basics_lookup(record, "data_description", "investigators"))
    instrument_id = _asset_basics_lookup(record, "acquisition", "instrument_id")

    return {
        "_id": record["_id"],
        "_last_modified": record.get("_last_modified", None),
        "modalities": modality_abbreviations,
        "project_name": _asset_basics_lookup(record, "data_description", "project_name"),
        "data_level": _asset_basics_lookup(record, "data_description", "data_level"),
        "subject_id": _asset_basics_lookup(record, "subject", "subject_id"),
        "acquisition_start_time": acquisition_start,
        "acquisition_end_time": _asset_basics_lookup(record, "acquisition", "acquisition_end_time"),
        "code_ocean": _asset_basics_lookup(record, "other_identifiers", "Code Ocean"),
        "process_date": process_date,
        "genotype": _asset_basics_lookup(record, "subject", "subject_details", "genotype"),
        "age": _asset_basics_age_days(
            acquisition_start,
            _asset_basics_lookup(record, "acquisition", "subject_details", "date_of_birth"),
            _asset_basics_lookup(record, "acquisition", "subject_details", "year_of_birth"),
        ),
        "acquisition_type": _asset_basics_lookup(record, "acquisition", "acquisition_type"),
        "location": record.get("location", None),
        "name": record.get("name", None),
        "experimenters": experimenters,
        # Separate list objects are passed to the normalizers so the raw columns
        # cannot be affected by whatever the normalizers do with their argument.
        "experimenters_normalized": normalize_experimenters(list(experimenters)),
        "instrument_id": instrument_id,
        "instrument_id_normalized": normalize_instrument_id(instrument_id),
        "investigators": investigators,
        "investigators_normalized": normalize_experimenters(list(investigators)),
    }


@registry.register_table(registry.NAMES["basics"])
def asset_basics(force_update: bool = False) -> pd.DataFrame:
    """Fetch basic asset metadata including modalities, projects, and subject info.

    Returns a DataFrame with columns: _id, _last_modified, modalities,
    project_name, data_level, subject_id, acquisition_start_time,
    acquisition_end_time, code_ocean, process_date, genotype, age,
    acquisition_type, location, name, experimenters, experimenters_normalized,
    instrument_id, instrument_id_normalized, investigators and
    investigators_normalized.

    The cached table is returned as-is when it is non-empty and ``force_update``
    is False. Otherwise the table is rebuilt from the database and written back
    to the backend.

    Args:
        force_update: If True, bypass cache and fetch fresh data from database.

    Returns:
        DataFrame with basic asset metadata.

    """
    table = registry.NAMES["basics"]
    df = registry.BACKEND.read(table)
    if not (df.empty or force_update):
        return df

    fields = [
        "data_description.modalities",
        "data_description.project_name",
        "data_description.data_level",
        "subject.subject_id",
        "acquisition.acquisition_start_time",
        "acquisition.acquisition_end_time",
        "acquisition.acquisition_type",
        "acquisition.subject_details.date_of_birth",
        "acquisition.subject_details.year_of_birth",
        "processing.data_processes.start_date_time",
        "subject.subject_details.genotype",
        "other_identifiers",
        "location",
        "name",
        "acquisition.experimenters",
        "acquisition.instrument_id",
        "data_description.investigators",
    ]

    setup_logging()
    logging.info(
        CacheLogMessage(
            backend=registry.BACKEND.__class__.__name__, table=table, message="Updating cache"
        ).to_json()
    )
    # NOTE(review): the frame is reset to empty here, so the _last_modified
    # comparison below never finds a cached row and every record is refetched.
    df = pd.DataFrame(
        columns=[
            "_id",
            "_last_modified",
            "modalities",
            "project_name",
            "data_level",
            "subject_id",
            "acquisition_start_time",
            "acquisition_end_time",
            "code_ocean",
            "process_date",
            "genotype",
            "age",
            "acquisition_type",
            "location",
            "name",
            "experimenters",
            "experimenters_normalized",
            "instrument_id",
            "instrument_id_normalized",
            "investigators",
            "investigators_normalized",
        ]
    )
    client = MetadataDbClient(
        host=registry.API_GATEWAY_HOST,
        version="v2",
    )
    # First fetch only the ids and modification stamps, then fetch the full set of
    # fields in batches for the records that need (re)loading.
    record_ids = client.retrieve_docdb_records(
        filter_query={},
        projection={"_id": 1, "_last_modified": 1},
        limit=0,
    )
    # Keep every _id that is not cached or whose _last_modified differs from the cache.
    cached_modified = dict(zip(df["_id"], df["_last_modified"]))
    keep_ids = [
        record["_id"]
        for record in record_ids
        if record["_id"] not in cached_modified or cached_modified[record["_id"]] != record["_last_modified"]
    ]

    # Batch by 100 IDs at a time to avoid overloading the server.
    batch_size = 100
    projection = {field: 1 for field in fields + ["_id", "_last_modified"]}
    asset_records = []
    for start in range(0, len(keep_ids), batch_size):
        logging.info(
            CacheLogMessage(
                backend=registry.BACKEND.__class__.__name__,
                table=table,
                message=f"Fetching batch {start // batch_size + 1}",
            ).to_json()
        )
        asset_records.extend(
            client.retrieve_docdb_records(
                filter_query={"_id": {"$in": keep_ids[start : start + batch_size]}},
                projection=projection,
                limit=0,
            )
        )

    # Combine new records with the old df and store in cache
    new_df = pd.DataFrame([_asset_basics_flatten(record) for record in asset_records])
    df = pd.concat([df[~df["_id"].isin(keep_ids)], new_df], ignore_index=True)

    # Build one first-name map over all normalized names (order-preserving,
    # de-duplicated) and apply it to both normalized columns.
    all_names = [
        name
        for col in ("experimenters_normalized", "investigators_normalized")
        for cell in df[col]
        for name in _asset_basics_iter_names(cell)
    ]
    first_name_map = build_first_name_map(list(dict.fromkeys(all_names)))
    if first_name_map:
        for col in ("experimenters_normalized", "investigators_normalized"):
            df[col] = df[col].apply(lambda x: apply_first_name_map(_asset_basics_iter_names(x), first_name_map))

    registry.BACKEND.write(table, df)

    return df
234
+
235
+
236
def asset_basics_columns() -> list[Column]:
    """Describe the columns of the asset basics cache table.

    Returns:
        One ``Column`` per table column, each with its name and a description.

    """
    column_specs = (
        ("_id", "DocDB record ID for the asset"),
        ("_last_modified", "DocDB last modified timestamp for the asset record"),
        ("modalities", "Modalities present in the asset as a list of abbreviations"),
        ("project_name", "Project name associated with the asset"),
        ("data_level", "Data level of the asset (e.g. raw, derived)"),
        ("subject_id", "Subject ID"),
        ("acquisition_start_time", "Acquisition start time in ISO format"),
        ("acquisition_end_time", "Acquisition end time in ISO format"),
        ("code_ocean", "Code Ocean asset ID if available"),
        ("process_date", "Date of latest processing in YYYY-MM-DD format"),
        ("genotype", "Genotype information for the subject if available"),
        (
            "age",
            "Age of the subject in days at time of acquisition, derived from date_of_birth or year_of_birth",
        ),
        ("acquisition_type", "Acquisition type (e.g. multiplane-2photon)"),
        ("location", "Location of the asset in S3"),
        ("name", "Asset name"),
        ("experimenters", "Acquisition experimenters as a list of raw names"),
        (
            "experimenters_normalized",
            "Normalized, deduplicated list of experimenter display names in original order",
        ),
        ("instrument_id", "Instrument ID used for the acquisition"),
        ("instrument_id_normalized", "Normalized short instrument name derived from instrument_id"),
        ("investigators", "Investigators from data_description as a list of names"),
        (
            "investigators_normalized",
            "Normalized, deduplicated list of investigator display names in original order",
        ),
    )
    return [Column(name=column_name, description=text) for column_name, text in column_specs]