zombie-squirrel 0.8.0__py3-none-any.whl → 0.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,12 +3,10 @@
3
3
  Provides functions to fetch and cache project names, subject IDs, and asset
4
4
  metadata from the AIND metadata database with support for multiple backends."""
5
5
 
6
- __version__ = "0.8.0"
6
+ __version__ = "0.8.1"
7
7
 
8
- from zombie_squirrel.acorns import ( # noqa: F401
9
- asset_basics,
10
- raw_to_derived,
11
- source_data,
12
- unique_project_names,
13
- unique_subject_ids,
14
- )
8
+ from zombie_squirrel.acorn_contents.asset_basics import asset_basics # noqa: F401
9
+ from zombie_squirrel.acorn_contents.raw_to_derived import raw_to_derived # noqa: F401
10
+ from zombie_squirrel.acorn_contents.source_data import source_data # noqa: F401
11
+ from zombie_squirrel.acorn_contents.unique_project_names import unique_project_names # noqa: F401
12
+ from zombie_squirrel.acorn_contents.unique_subject_ids import unique_subject_ids # noqa: F401
@@ -0,0 +1,10 @@
1
+ """Acorns module: individual data fetching functions."""
2
+
3
+ # Import the acorn modules to trigger decorator registration
4
+ from zombie_squirrel.acorn_contents import ( # noqa: F401
5
+ asset_basics,
6
+ raw_to_derived,
7
+ source_data,
8
+ unique_project_names,
9
+ unique_subject_ids,
10
+ )
@@ -0,0 +1,135 @@
1
+ """Asset basics acorn."""
2
+
3
+ import logging
4
+
5
+ import pandas as pd
6
+ from aind_data_access_api.document_db import MetadataDbClient
7
+
8
+ import zombie_squirrel.acorns as acorns
9
+
10
+
11
+ @acorns.register_acorn(acorns.NAMES["basics"])
12
+ def asset_basics(force_update: bool = False) -> pd.DataFrame:
13
+ """Fetch basic asset metadata including modalities, projects, and subject info.
14
+
15
+ Returns a DataFrame with columns: _id, _last_modified, modalities,
16
+ project_name, data_level, subject_id, acquisition_start_time, and
17
+ acquisition_end_time. Uses incremental updates based on _last_modified
18
+ timestamps to avoid re-fetching unchanged records.
19
+
20
+ Args:
21
+ force_update: If True, bypass cache and fetch fresh data from database.
22
+
23
+ Returns:
24
+ DataFrame with basic asset metadata."""
25
+ df = acorns.TREE.scurry(acorns.NAMES["basics"])
26
+
27
+ FIELDS = [
28
+ "data_description.modalities",
29
+ "data_description.project_name",
30
+ "data_description.data_level",
31
+ "subject.subject_id",
32
+ "acquisition.acquisition_start_time",
33
+ "acquisition.acquisition_end_time",
34
+ "processing.data_processes.start_date_time",
35
+ "subject.subject_details.genotype",
36
+ "other_identifiers",
37
+ "location",
38
+ ]
39
+
40
+ if df.empty or force_update:
41
+ logging.info("Updating cache for asset basics")
42
+ df = pd.DataFrame(
43
+ columns=[
44
+ "_id",
45
+ "_last_modified",
46
+ "modalities",
47
+ "project_name",
48
+ "data_level",
49
+ "subject_id",
50
+ "acquisition_start_time",
51
+ "acquisition_end_time",
52
+ "code_ocean",
53
+ "process_date",
54
+ "genotype",
55
+ "location",
56
+ ]
57
+ )
58
+ client = MetadataDbClient(
59
+ host=acorns.API_GATEWAY_HOST,
60
+ version="v2",
61
+ )
62
+ # It's a bit complex to get multiple fields that aren't indexed in a database
63
+ # as large as DocDB. We'll also try to limit ourselves to only updating fields
64
+ # that are necessary
65
+ record_ids = client.retrieve_docdb_records(
66
+ filter_query={},
67
+ projection={"_id": 1, "_last_modified": 1},
68
+ limit=0,
69
+ )
70
+ keep_ids = []
71
+ # Drop all _ids where _last_modified matches cache
72
+ for record in record_ids:
73
+ cached_row = df[df["_id"] == record["_id"]]
74
+ if cached_row.empty or cached_row["_last_modified"].values[0] != record["_last_modified"]:
75
+ keep_ids.append(record["_id"])
76
+
77
+ # Now batch by 100 IDs at a time to avoid overloading server, and fetch all the fields
78
+ BATCH_SIZE = 100
79
+ asset_records = []
80
+ for i in range(0, len(keep_ids), BATCH_SIZE):
81
+ logging.info(f"Fetching asset basics batch {i // BATCH_SIZE + 1}...")
82
+ batch_ids = keep_ids[i: i + BATCH_SIZE]
83
+ batch_records = client.retrieve_docdb_records(
84
+ filter_query={"_id": {"$in": batch_ids}},
85
+ projection={field: 1 for field in FIELDS + ["_id", "_last_modified"]},
86
+ limit=0,
87
+ )
88
+ asset_records.extend(batch_records)
89
+
90
+ # Unwrap nested fields
91
+ records = []
92
+ for record in asset_records:
93
+ modalities = record.get("data_description", {}).get("modalities", [])
94
+ modality_abbreviations = [modality["abbreviation"] for modality in modalities if "abbreviation" in modality]
95
+ modality_abbreviations_str = ", ".join(modality_abbreviations)
96
+
97
+ # Get the process date, convert to YYYY-MM-DD if present
98
+ data_processes = record.get("processing", {}).get("data_processes", [])
99
+ if data_processes:
100
+ latest_process = data_processes[-1]
101
+ process_datetime = latest_process.get("start_date_time", None)
102
+ process_date = process_datetime.split("T")[0]
103
+ else:
104
+ process_date = None
105
+
106
+ # Get the CO asset ID
107
+ other_identifiers = record.get("other_identifiers", {})
108
+ if other_identifiers:
109
+ code_ocean = other_identifiers.get("Code Ocean", None)
110
+ else:
111
+ code_ocean = None
112
+
113
+ flat_record = {
114
+ "_id": record["_id"],
115
+ "_last_modified": record.get("_last_modified", None),
116
+ "modalities": modality_abbreviations_str,
117
+ "project_name": record.get("data_description", {}).get("project_name", None),
118
+ "data_level": record.get("data_description", {}).get("data_level", None),
119
+ "subject_id": record.get("subject", {}).get("subject_id", None),
120
+ "acquisition_start_time": record.get("acquisition", {}).get("acquisition_start_time", None),
121
+ "acquisition_end_time": record.get("acquisition", {}).get("acquisition_end_time", None),
122
+ "code_ocean": code_ocean,
123
+ "process_date": process_date,
124
+ "genotype": record.get("subject", {}).get("subject_details", {}).get("genotype", None),
125
+ "location": record.get("location", None),
126
+ }
127
+ records.append(flat_record)
128
+
129
+ # Combine new records with the old df and store in cache
130
+ new_df = pd.DataFrame(records)
131
+ df = pd.concat([df[~df["_id"].isin(keep_ids)], new_df], ignore_index=True)
132
+
133
+ acorns.TREE.hide(acorns.NAMES["basics"], df)
134
+
135
+ return df
@@ -0,0 +1,71 @@
1
+ """Raw to derived mapping acorn."""
2
+
3
+ import logging
4
+
5
+ import pandas as pd
6
+ from aind_data_access_api.document_db import MetadataDbClient
7
+
8
+ import zombie_squirrel.acorns as acorns
9
+
10
+
11
+ @acorns.register_acorn(acorns.NAMES["r2d"])
12
+ def raw_to_derived(force_update: bool = False) -> pd.DataFrame:
13
+ """Fetch mapping of raw records to their derived records.
14
+
15
+ Returns a DataFrame mapping raw record IDs to lists of derived record IDs
16
+ that depend on them as source data.
17
+
18
+ Args:
19
+ force_update: If True, bypass cache and fetch fresh data from database.
20
+
21
+ Returns:
22
+ DataFrame with _id and derived_records columns."""
23
+ df = acorns.TREE.scurry(acorns.NAMES["r2d"])
24
+
25
+ if df.empty or force_update:
26
+ logging.info("Updating cache for raw to derived mapping")
27
+ client = MetadataDbClient(
28
+ host=acorns.API_GATEWAY_HOST,
29
+ version="v2",
30
+ )
31
+
32
+ # Get all raw record IDs
33
+ raw_records = client.retrieve_docdb_records(
34
+ filter_query={"data_description.data_level": "raw"},
35
+ projection={"_id": 1},
36
+ limit=0,
37
+ )
38
+ raw_ids = {record["_id"] for record in raw_records}
39
+
40
+ # Get all derived records with their _id and source_data
41
+ derived_records = client.retrieve_docdb_records(
42
+ filter_query={"data_description.data_level": "derived"},
43
+ projection={"_id": 1, "data_description.source_data": 1},
44
+ limit=0,
45
+ )
46
+
47
+ # Build mapping: raw_id -> list of derived _ids
48
+ raw_to_derived_map = {raw_id: [] for raw_id in raw_ids}
49
+ for derived_record in derived_records:
50
+ source_data_list = derived_record.get("data_description", {}).get("source_data", [])
51
+ derived_id = derived_record["_id"]
52
+ # Add this derived record to each raw record it depends on
53
+ for source_id in source_data_list:
54
+ if source_id in raw_to_derived_map:
55
+ raw_to_derived_map[source_id].append(derived_id)
56
+
57
+ # Convert to DataFrame
58
+ data = []
59
+ for raw_id, derived_ids in raw_to_derived_map.items():
60
+ derived_ids_str = ", ".join(derived_ids)
61
+ data.append(
62
+ {
63
+ "_id": raw_id,
64
+ "derived_records": derived_ids_str,
65
+ }
66
+ )
67
+
68
+ df = pd.DataFrame(data)
69
+ acorns.TREE.hide(acorns.NAMES["r2d"], df)
70
+
71
+ return df
@@ -0,0 +1,50 @@
1
+ """Source data acorn."""
2
+
3
+ import logging
4
+
5
+ import pandas as pd
6
+ from aind_data_access_api.document_db import MetadataDbClient
7
+
8
+ import zombie_squirrel.acorns as acorns
9
+
10
+
11
+ @acorns.register_acorn(acorns.NAMES["d2r"])
12
+ def source_data(force_update: bool = False) -> pd.DataFrame:
13
+ """Fetch source data references for derived records.
14
+
15
+ Returns a DataFrame mapping record IDs to their upstream source data
16
+ dependencies as comma-separated lists.
17
+
18
+ Args:
19
+ force_update: If True, bypass cache and fetch fresh data from database.
20
+
21
+ Returns:
22
+ DataFrame with _id and source_data columns."""
23
+ df = acorns.TREE.scurry(acorns.NAMES["d2r"])
24
+
25
+ if df.empty or force_update:
26
+ logging.info("Updating cache for source data")
27
+ client = MetadataDbClient(
28
+ host=acorns.API_GATEWAY_HOST,
29
+ version="v2",
30
+ )
31
+ records = client.retrieve_docdb_records(
32
+ filter_query={},
33
+ projection={"_id": 1, "data_description.source_data": 1},
34
+ limit=0,
35
+ )
36
+ data = []
37
+ for record in records:
38
+ source_data_list = record.get("data_description", {}).get("source_data", [])
39
+ source_data_str = ", ".join(source_data_list) if source_data_list else ""
40
+ data.append(
41
+ {
42
+ "_id": record["_id"],
43
+ "source_data": source_data_str,
44
+ }
45
+ )
46
+
47
+ df = pd.DataFrame(data)
48
+ acorns.TREE.hide(acorns.NAMES["d2r"], df)
49
+
50
+ return df
@@ -0,0 +1,41 @@
1
+ """Unique project names acorn."""
2
+
3
+ import logging
4
+
5
+ import pandas as pd
6
+ from aind_data_access_api.document_db import MetadataDbClient
7
+
8
+ import zombie_squirrel.acorns as acorns
9
+
10
+
11
+ @acorns.register_acorn(acorns.NAMES["upn"])
12
+ def unique_project_names(force_update: bool = False) -> list[str]:
13
+ """Fetch unique project names from metadata database.
14
+
15
+ Returns cached results if available, fetches from database if cache is empty
16
+ or force_update is True.
17
+
18
+ Args:
19
+ force_update: If True, bypass cache and fetch fresh data from database.
20
+
21
+ Returns:
22
+ List of unique project names."""
23
+ df = acorns.TREE.scurry(acorns.NAMES["upn"])
24
+
25
+ if df.empty or force_update:
26
+ # If cache is missing, fetch data
27
+ logging.info("Updating cache for unique project names")
28
+ client = MetadataDbClient(
29
+ host=acorns.API_GATEWAY_HOST,
30
+ version="v2",
31
+ )
32
+ unique_project_names = client.aggregate_docdb_records(
33
+ pipeline=[
34
+ {"$group": {"_id": "$data_description.project_name"}},
35
+ {"$project": {"project_name": "$_id", "_id": 0}},
36
+ ]
37
+ )
38
+ df = pd.DataFrame(unique_project_names)
39
+ acorns.TREE.hide(acorns.NAMES["upn"], df)
40
+
41
+ return df["project_name"].tolist()
@@ -0,0 +1,41 @@
1
+ """Unique subject IDs acorn."""
2
+
3
+ import logging
4
+
5
+ import pandas as pd
6
+ from aind_data_access_api.document_db import MetadataDbClient
7
+
8
+ import zombie_squirrel.acorns as acorns
9
+
10
+
11
+ @acorns.register_acorn(acorns.NAMES["usi"])
12
+ def unique_subject_ids(force_update: bool = False) -> list[str]:
13
+ """Fetch unique subject IDs from metadata database.
14
+
15
+ Returns cached results if available, fetches from database if cache is empty
16
+ or force_update is True.
17
+
18
+ Args:
19
+ force_update: If True, bypass cache and fetch fresh data from database.
20
+
21
+ Returns:
22
+ List of unique subject IDs."""
23
+ df = acorns.TREE.scurry(acorns.NAMES["usi"])
24
+
25
+ if df.empty or force_update:
26
+ # If cache is missing, fetch data
27
+ logging.info("Updating cache for unique subject IDs")
28
+ client = MetadataDbClient(
29
+ host=acorns.API_GATEWAY_HOST,
30
+ version="v2",
31
+ )
32
+ unique_subject_ids = client.aggregate_docdb_records(
33
+ pipeline=[
34
+ {"$group": {"_id": "$subject.subject_id"}},
35
+ {"$project": {"subject_id": "$_id", "_id": 0}},
36
+ ]
37
+ )
38
+ df = pd.DataFrame(unique_subject_ids)
39
+ acorns.TREE.hide(acorns.NAMES["usi"], df)
40
+
41
+ return df["subject_id"].tolist()
zombie_squirrel/acorns.py CHANGED
@@ -5,9 +5,6 @@ import os
5
5
  from collections.abc import Callable
6
6
  from typing import Any
7
7
 
8
- import pandas as pd
9
- from aind_data_access_api.document_db import MetadataDbClient
10
-
11
8
  from zombie_squirrel.forest import (
12
9
  MemoryTree,
13
10
  S3Tree,
@@ -26,23 +23,7 @@ else:
26
23
  logging.info("Using in-memory forest for caching")
27
24
  TREE = MemoryTree()
28
25
 
29
- # --- Acorn registry -----------------------------------------------------
30
-
31
- ACORN_REGISTRY: dict[str, Callable[[], Any]] = {}
32
-
33
-
34
- def register_acorn(name: str):
35
- """Decorator for registering new acorns."""
36
-
37
- def decorator(func):
38
- """Register function in acorn registry."""
39
- ACORN_REGISTRY[name] = func
40
- return func
41
-
42
- return decorator
43
-
44
-
45
- # --- Acorns -----------------------------------------------------
26
+ # --- Acorn registry and names -----------------------------------------------------
46
27
 
47
28
  NAMES = {
48
29
  "upn": "unique_project_names",
@@ -52,300 +33,15 @@ NAMES = {
52
33
  "r2d": "raw_to_derived",
53
34
  }
54
35
 
36
+ ACORN_REGISTRY: dict[str, Callable[[], Any]] = {}
55
37
 
56
- @register_acorn(NAMES["upn"])
57
- def unique_project_names(force_update: bool = False) -> list[str]:
58
- """Fetch unique project names from metadata database.
59
-
60
- Returns cached results if available, fetches from database if cache is empty
61
- or force_update is True.
62
-
63
- Args:
64
- force_update: If True, bypass cache and fetch fresh data from database.
65
-
66
- Returns:
67
- List of unique project names."""
68
- df = TREE.scurry(NAMES["upn"])
69
-
70
- if df.empty or force_update:
71
- # If cache is missing, fetch data
72
- logging.info("Updating cache for unique project names")
73
- client = MetadataDbClient(
74
- host=API_GATEWAY_HOST,
75
- version="v2",
76
- )
77
- unique_project_names = client.aggregate_docdb_records(
78
- pipeline=[
79
- {"$group": {"_id": "$data_description.project_name"}},
80
- {"$project": {"project_name": "$_id", "_id": 0}},
81
- ]
82
- )
83
- df = pd.DataFrame(unique_project_names)
84
- TREE.hide(NAMES["upn"], df)
85
-
86
- return df["project_name"].tolist()
87
-
88
-
89
- @register_acorn(NAMES["usi"])
90
- def unique_subject_ids(force_update: bool = False) -> list[str]:
91
- """Fetch unique subject IDs from metadata database.
92
-
93
- Returns cached results if available, fetches from database if cache is empty
94
- or force_update is True.
95
-
96
- Args:
97
- force_update: If True, bypass cache and fetch fresh data from database.
98
-
99
- Returns:
100
- List of unique subject IDs."""
101
- df = TREE.scurry(NAMES["usi"])
102
-
103
- if df.empty or force_update:
104
- # If cache is missing, fetch data
105
- logging.info("Updating cache for unique subject IDs")
106
- client = MetadataDbClient(
107
- host=API_GATEWAY_HOST,
108
- version="v2",
109
- )
110
- unique_subject_ids = client.aggregate_docdb_records(
111
- pipeline=[
112
- {"$group": {"_id": "$subject.subject_id"}},
113
- {"$project": {"subject_id": "$_id", "_id": 0}},
114
- ]
115
- )
116
- df = pd.DataFrame(unique_subject_ids)
117
- TREE.hide(NAMES["usi"], df)
118
-
119
- return df["subject_id"].tolist()
120
-
121
-
122
- @register_acorn(NAMES["basics"])
123
- def asset_basics(force_update: bool = False) -> pd.DataFrame:
124
- """Fetch basic asset metadata including modalities, projects, and subject info.
125
-
126
- Returns a DataFrame with columns: _id, _last_modified, modalities,
127
- project_name, data_level, subject_id, acquisition_start_time, and
128
- acquisition_end_time. Uses incremental updates based on _last_modified
129
- timestamps to avoid re-fetching unchanged records.
130
-
131
- Args:
132
- force_update: If True, bypass cache and fetch fresh data from database.
133
-
134
- Returns:
135
- DataFrame with basic asset metadata."""
136
- df = TREE.scurry(NAMES["basics"])
137
-
138
- FIELDS = [
139
- "data_description.modalities",
140
- "data_description.project_name",
141
- "data_description.data_level",
142
- "subject.subject_id",
143
- "acquisition.acquisition_start_time",
144
- "acquisition.acquisition_end_time",
145
- "processing.data_processes.start_date_time",
146
- "subject.subject_details.genotype",
147
- "other_identifiers",
148
- "location",
149
- ]
150
-
151
- if df.empty or force_update:
152
- logging.info("Updating cache for asset basics")
153
- df = pd.DataFrame(
154
- columns=[
155
- "_id",
156
- "_last_modified",
157
- "modalities",
158
- "project_name",
159
- "data_level",
160
- "subject_id",
161
- "acquisition_start_time",
162
- "acquisition_end_time",
163
- "code_ocean",
164
- "process_date",
165
- "genotype",
166
- "location",
167
- ]
168
- )
169
- client = MetadataDbClient(
170
- host=API_GATEWAY_HOST,
171
- version="v2",
172
- )
173
- # It's a bit complex to get multiple fields that aren't indexed in a database
174
- # as large as DocDB. We'll also try to limit ourselves to only updating fields
175
- # that are necessary
176
- record_ids = client.retrieve_docdb_records(
177
- filter_query={},
178
- projection={"_id": 1, "_last_modified": 1},
179
- limit=0,
180
- )
181
- keep_ids = []
182
- # Drop all _ids where _last_modified matches cache
183
- for record in record_ids:
184
- cached_row = df[df["_id"] == record["_id"]]
185
- if cached_row.empty or cached_row["_last_modified"].values[0] != record["_last_modified"]:
186
- keep_ids.append(record["_id"])
187
-
188
- # Now batch by 100 IDs at a time to avoid overloading server, and fetch all the fields
189
- BATCH_SIZE = 100
190
- asset_records = []
191
- for i in range(0, len(keep_ids), BATCH_SIZE):
192
- logging.info(f"Fetching asset basics batch {i // BATCH_SIZE + 1}...")
193
- batch_ids = keep_ids[i: i + BATCH_SIZE]
194
- batch_records = client.retrieve_docdb_records(
195
- filter_query={"_id": {"$in": batch_ids}},
196
- projection={field: 1 for field in FIELDS + ["_id", "_last_modified"]},
197
- limit=0,
198
- )
199
- asset_records.extend(batch_records)
200
-
201
- # Unwrap nested fields
202
- records = []
203
- for record in asset_records:
204
- modalities = record.get("data_description", {}).get("modalities", [])
205
- modality_abbreviations = [modality["abbreviation"] for modality in modalities if "abbreviation" in modality]
206
- modality_abbreviations_str = ", ".join(modality_abbreviations)
207
-
208
- # Get the process date, convert to YYYY-MM-DD if present
209
- data_processes = record.get("processing", {}).get("data_processes", [])
210
- if data_processes:
211
- latest_process = data_processes[-1]
212
- process_datetime = latest_process.get("start_date_time", None)
213
- process_date = process_datetime.split("T")[0]
214
- else:
215
- process_date = None
216
-
217
- # Get the CO asset ID
218
- other_identifiers = record.get("other_identifiers", {})
219
- if other_identifiers:
220
- code_ocean = other_identifiers.get("Code Ocean", None)
221
- else:
222
- code_ocean = None
223
-
224
- flat_record = {
225
- "_id": record["_id"],
226
- "_last_modified": record.get("_last_modified", None),
227
- "modalities": modality_abbreviations_str,
228
- "project_name": record.get("data_description", {}).get("project_name", None),
229
- "data_level": record.get("data_description", {}).get("data_level", None),
230
- "subject_id": record.get("subject", {}).get("subject_id", None),
231
- "acquisition_start_time": record.get("acquisition", {}).get("acquisition_start_time", None),
232
- "acquisition_end_time": record.get("acquisition", {}).get("acquisition_end_time", None),
233
- "code_ocean": code_ocean,
234
- "process_date": process_date,
235
- "genotype": record.get("subject", {}).get("subject_details", {}).get("genotype", None),
236
- "location": record.get("location", None),
237
- }
238
- records.append(flat_record)
239
-
240
- # Combine new records with the old df and store in cache
241
- new_df = pd.DataFrame(records)
242
- df = pd.concat([df[~df["_id"].isin(keep_ids)], new_df], ignore_index=True)
243
-
244
- TREE.hide(NAMES["basics"], df)
245
-
246
- return df
247
-
248
-
249
- @register_acorn(NAMES["d2r"])
250
- def source_data(force_update: bool = False) -> pd.DataFrame:
251
- """Fetch source data references for derived records.
252
-
253
- Returns a DataFrame mapping record IDs to their upstream source data
254
- dependencies as comma-separated lists.
255
-
256
- Args:
257
- force_update: If True, bypass cache and fetch fresh data from database.
258
-
259
- Returns:
260
- DataFrame with _id and source_data columns."""
261
- df = TREE.scurry(NAMES["d2r"])
262
-
263
- if df.empty or force_update:
264
- logging.info("Updating cache for source data")
265
- client = MetadataDbClient(
266
- host=API_GATEWAY_HOST,
267
- version="v2",
268
- )
269
- records = client.retrieve_docdb_records(
270
- filter_query={},
271
- projection={"_id": 1, "data_description.source_data": 1},
272
- limit=0,
273
- )
274
- data = []
275
- for record in records:
276
- source_data_list = record.get("data_description", {}).get("source_data", [])
277
- source_data_str = ", ".join(source_data_list) if source_data_list else ""
278
- data.append(
279
- {
280
- "_id": record["_id"],
281
- "source_data": source_data_str,
282
- }
283
- )
284
-
285
- df = pd.DataFrame(data)
286
- TREE.hide(NAMES["d2r"], df)
287
-
288
- return df
289
-
290
-
291
- @register_acorn(NAMES["r2d"])
292
- def raw_to_derived(force_update: bool = False) -> pd.DataFrame:
293
- """Fetch mapping of raw records to their derived records.
294
-
295
- Returns a DataFrame mapping raw record IDs to lists of derived record IDs
296
- that depend on them as source data.
297
-
298
- Args:
299
- force_update: If True, bypass cache and fetch fresh data from database.
300
-
301
- Returns:
302
- DataFrame with _id and derived_records columns."""
303
- df = TREE.scurry(NAMES["r2d"])
304
-
305
- if df.empty or force_update:
306
- logging.info("Updating cache for raw to derived mapping")
307
- client = MetadataDbClient(
308
- host=API_GATEWAY_HOST,
309
- version="v2",
310
- )
311
-
312
- # Get all raw record IDs
313
- raw_records = client.retrieve_docdb_records(
314
- filter_query={"data_description.data_level": "raw"},
315
- projection={"_id": 1},
316
- limit=0,
317
- )
318
- raw_ids = {record["_id"] for record in raw_records}
319
-
320
- # Get all derived records with their _id and source_data
321
- derived_records = client.retrieve_docdb_records(
322
- filter_query={"data_description.data_level": "derived"},
323
- projection={"_id": 1, "data_description.source_data": 1},
324
- limit=0,
325
- )
326
-
327
- # Build mapping: raw_id -> list of derived _ids
328
- raw_to_derived_map = {raw_id: [] for raw_id in raw_ids}
329
- for derived_record in derived_records:
330
- source_data_list = derived_record.get("data_description", {}).get("source_data", [])
331
- derived_id = derived_record["_id"]
332
- # Add this derived record to each raw record it depends on
333
- for source_id in source_data_list:
334
- if source_id in raw_to_derived_map:
335
- raw_to_derived_map[source_id].append(derived_id)
336
38
 
337
- # Convert to DataFrame
338
- data = []
339
- for raw_id, derived_ids in raw_to_derived_map.items():
340
- derived_ids_str = ", ".join(derived_ids)
341
- data.append(
342
- {
343
- "_id": raw_id,
344
- "derived_records": derived_ids_str,
345
- }
346
- )
39
+ def register_acorn(name: str):
40
+ """Decorator for registering new acorns."""
347
41
 
348
- df = pd.DataFrame(data)
349
- TREE.hide(NAMES["r2d"], df)
42
+ def decorator(func):
43
+ """Register function in acorn registry."""
44
+ ACORN_REGISTRY[name] = func
45
+ return func
350
46
 
351
- return df
47
+ return decorator
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: zombie-squirrel
3
- Version: 0.8.0
3
+ Version: 0.8.1
4
4
  Summary: Generated from aind-library-template
5
5
  Author: Allen Institute for Neural Dynamics
6
6
  License: MIT
@@ -0,0 +1,16 @@
1
+ zombie_squirrel/__init__.py,sha256=sPD8B2awlREi8c33mhKoGw0khdE5u1XQmU_3hWiVH9A,693
2
+ zombie_squirrel/acorns.py,sha256=BLhySoLfb7D6IfbNMOhxbLJba-8Wp-L9e1w_2KozjjM,1134
3
+ zombie_squirrel/forest.py,sha256=v0K1u0EA0OptzxocFC-fPEi6xYcnJ9SoWJ6aiPF4jLg,2939
4
+ zombie_squirrel/sync.py,sha256=9cpfSzTj0cQz4-d3glMAOejCZgekMirLc-dwEFFQhlg,496
5
+ zombie_squirrel/utils.py,sha256=kojQpHUKlRJD7WEZDfcpQIZTj9iUrtX5_6F-gWWzJW0,628
6
+ zombie_squirrel/acorn_contents/__init__.py,sha256=LsNy5xjlJS981SVxRLD4yPhUg__fuY8-HbPDapOhH-o,285
7
+ zombie_squirrel/acorn_contents/asset_basics.py,sha256=cTg016sZbrJH5_As7CqnirumR1hSMn4GiYX0IReoou0,5513
8
+ zombie_squirrel/acorn_contents/raw_to_derived.py,sha256=EFjLP9szzk3Q6jNhyDwxx8rqhG1bFTVmxeCpCAsv6yg,2445
9
+ zombie_squirrel/acorn_contents/source_data.py,sha256=zbw3DnTwXo6xDTZ1uCoIbcYrVBH2r2xZN3trKtxT8cg,1527
10
+ zombie_squirrel/acorn_contents/unique_project_names.py,sha256=Fu0P2DyI91W7xhx1uRk2YVMyFG8WPihDCj7n_A7zL2E,1289
11
+ zombie_squirrel/acorn_contents/unique_subject_ids.py,sha256=dOVn1ObDF86p8S8US__hlyjTGxT0vz01oeGzjoWGFIc,1260
12
+ zombie_squirrel-0.8.1.dist-info/licenses/LICENSE,sha256=U0Y7B3gZJHXpjJVLgTQjM8e_c8w4JJpLgGhIdsoFR1Y,1092
13
+ zombie_squirrel-0.8.1.dist-info/METADATA,sha256=xEQ04ahuoz0XhbxLpAUCYFLG1kj6VSV2RFMfRtAq1PY,1898
14
+ zombie_squirrel-0.8.1.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
15
+ zombie_squirrel-0.8.1.dist-info/top_level.txt,sha256=FmM0coe4AangURZLjM4JwwRv2B8H6oINYCoZLKLDCKA,16
16
+ zombie_squirrel-0.8.1.dist-info/RECORD,,
@@ -1,10 +0,0 @@
1
- zombie_squirrel/__init__.py,sha256=rTMJ-AnaIVT0HYJAlXTbCbmYrjtCdyYJmunF-gY_4-k,406
2
- zombie_squirrel/acorns.py,sha256=k43lDNxGt4EcON-d41Gm3rwWUvbmFYSveayVlCo1Rm4,12212
3
- zombie_squirrel/forest.py,sha256=v0K1u0EA0OptzxocFC-fPEi6xYcnJ9SoWJ6aiPF4jLg,2939
4
- zombie_squirrel/sync.py,sha256=9cpfSzTj0cQz4-d3glMAOejCZgekMirLc-dwEFFQhlg,496
5
- zombie_squirrel/utils.py,sha256=kojQpHUKlRJD7WEZDfcpQIZTj9iUrtX5_6F-gWWzJW0,628
6
- zombie_squirrel-0.8.0.dist-info/licenses/LICENSE,sha256=U0Y7B3gZJHXpjJVLgTQjM8e_c8w4JJpLgGhIdsoFR1Y,1092
7
- zombie_squirrel-0.8.0.dist-info/METADATA,sha256=AZPiAwF4DAA9iUdKjmJ6pvaVbvPxqySkX0cTTelM0cg,1898
8
- zombie_squirrel-0.8.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
9
- zombie_squirrel-0.8.0.dist-info/top_level.txt,sha256=FmM0coe4AangURZLjM4JwwRv2B8H6oINYCoZLKLDCKA,16
10
- zombie_squirrel-0.8.0.dist-info/RECORD,,