zombie-squirrel 0.2.3__tar.gz → 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: zombie-squirrel
3
- Version: 0.2.3
3
+ Version: 0.4.0
4
4
  Summary: Generated from aind-library-template
5
5
  Author: Allen Institute for Neural Dynamics
6
6
  License: MIT
@@ -1,7 +1,8 @@
1
1
  """Init package"""
2
- __version__ = "0.2.3"
2
+ __version__ = "0.4.0"
3
3
 
4
4
  from zombie_squirrel.squirrels import (
5
5
  unique_project_names,
6
6
  unique_subject_ids,
7
+ asset_basics,
7
8
  )
@@ -59,7 +59,8 @@ class MemoryAcorn(Acorn):
59
59
 
60
60
 
61
61
  def rds_get_handle_empty(acorn: Acorn, table_name: str) -> pd.DataFrame:
62
- """Utility function for testing purposes."""
62
+ """Helper for handling errors when loading from Redshift, since no
63
+ built-in helper exists for this."""
63
64
  try:
64
65
  logging.info(f"Fetching from cache: {table_name}")
65
66
  df = acorn.scurry(table_name)
@@ -0,0 +1,168 @@
1
+ """Squirrels: functions to fetch and cache data from MongoDB."""
2
+ import pandas as pd
3
+ from typing import Any, Callable
4
+ from zombie_squirrel.acorns import RedshiftAcorn, MemoryAcorn, rds_get_handle_empty
5
+ from aind_data_access_api.document_db import MetadataDbClient
6
+ import os
7
+ import logging
8
+
9
# --- Backend setup ---------------------------------------------------

# Hostname of the AIND metadata API gateway (DocDB access).
API_GATEWAY_HOST = "api.allenneuraldynamics.org"

# Cache backend is chosen once, at import time, via the TREE_SPECIES env
# var; any value other than "redshift" falls back to the in-memory backend.
tree_type = os.getenv("TREE_SPECIES", "memory").lower()

if tree_type == "redshift":
    logging.info("Using Redshift acorn for caching")
    ACORN = RedshiftAcorn()
else:
    logging.info("Using in-memory acorn for caching")
    # NOTE(review): presumably process-local and lost on restart — confirm.
    ACORN = MemoryAcorn()
21
+
22
# --- Squirrel registry -----------------------------------------------------

# Maps a squirrel name to the callable that fetches/caches that dataset.
SQUIRREL_REGISTRY: dict[str, Callable[[], Any]] = {}


def register_squirrel(name: str):
    """Return a decorator that files a function under ``name`` in the registry.

    The decorated function is stored and returned unwrapped, so calling it
    directly is unaffected by registration.
    """
    def _file_away(func):
        SQUIRREL_REGISTRY[name] = func
        return func

    return _file_away
33
+
34
+
35
# --- Squirrels -----------------------------------------------------

# Canonical dataset names, keyed by short alias.  Each value doubles as the
# registry key and as the backing table name in the acorn cache.
NAMES = {
    "upn": "unique_project_names",
    "usi": "unique_subject_ids",
    "basics": "asset_basics",
}
42
+
43
+
44
@register_squirrel(NAMES["upn"])
def unique_project_names(force_update: bool = False) -> list[str]:
    """Return the unique project names across all DocDB records.

    Results are served from the acorn cache when present; pass
    ``force_update=True`` to re-query DocDB and refresh the cache.

    Returns:
        Sorted-as-returned list of project names (may include ``None`` if
        records lack ``data_description.project_name``).
    """
    df = rds_get_handle_empty(ACORN, NAMES["upn"])

    if df.empty or force_update:
        # Cache miss (or forced refresh): aggregate distinct project names.
        logging.info("Updating cache for unique project names")
        client = MetadataDbClient(
            host=API_GATEWAY_HOST,
            version="v2",
        )
        # Renamed from `unique_project_names`, which shadowed this
        # function's own name.
        rows = client.aggregate_docdb_records(
            pipeline=[
                {"$group": {"_id": "$data_description.project_name"}},
                {"$project": {"project_name": "$_id", "_id": 0}},
            ]
        )
        df = pd.DataFrame(rows)
        ACORN.hide(NAMES["upn"], df)

    if df.empty:
        # An empty frame has no "project_name" column; indexing it would
        # raise KeyError (e.g., fresh deployment against an empty DocDB).
        return []
    return df["project_name"].tolist()
65
+
66
+
67
@register_squirrel(NAMES["usi"])
def unique_subject_ids(force_update: bool = False) -> list[str]:
    """Return the unique subject IDs across all DocDB records.

    Results are served from the acorn cache when present; pass
    ``force_update=True`` to re-query DocDB and refresh the cache.

    Returns:
        List of subject IDs (may include ``None`` if records lack
        ``subject.subject_id``).
    """
    df = rds_get_handle_empty(ACORN, NAMES["usi"])

    if df.empty or force_update:
        # Cache miss (or forced refresh): aggregate distinct subject IDs.
        logging.info("Updating cache for unique subject IDs")
        client = MetadataDbClient(
            host=API_GATEWAY_HOST,
            version="v2",
        )
        # Renamed from `unique_subject_ids`, which shadowed this
        # function's own name.
        rows = client.aggregate_docdb_records(
            pipeline=[
                {"$group": {"_id": "$subject.subject_id"}},
                {"$project": {"subject_id": "$_id", "_id": 0}},
            ]
        )
        df = pd.DataFrame(rows)
        ACORN.hide(NAMES["usi"], df)

    if df.empty:
        # An empty frame has no "subject_id" column; indexing it would
        # raise KeyError.
        return []
    return df["subject_id"].tolist()
88
+
89
+
90
@register_squirrel(NAMES["basics"])
def asset_basics(force_update: bool = False) -> pd.DataFrame:
    """Return basic asset metadata, one row per DocDB record.

    Columns: ``_id``, ``_last_modified``, ``modalities``, ``project_name``,
    ``data_level``, ``subject_id``, ``acquisition_start_time``,
    ``acquisition_end_time``.

    Served from the acorn cache; pass ``force_update=True`` to refresh.
    Refreshes are incremental: only records whose ``_last_modified`` differs
    from (or is missing in) the cached copy are re-fetched from DocDB.

    NOTE(review): records deleted upstream remain in the cache — confirm
    whether pruning is desired.
    """
    COLUMNS = [
        "_id", "_last_modified", "modalities", "project_name",
        "data_level", "subject_id",
        "acquisition_start_time", "acquisition_end_time",
    ]
    FIELDS = [
        "data_description.modalities",
        "data_description.project_name",
        "data_description.data_level",
        "subject.subject_id",
        "acquisition.acquisition_start_time",
        "acquisition.acquisition_end_time",
    ]

    df = rds_get_handle_empty(ACORN, NAMES["basics"])

    if df.empty or force_update:
        logging.info("Updating cache for asset basics")
        if df.empty:
            # BUG FIX: the cached frame was previously overwritten with an
            # empty frame *unconditionally*, which defeated the incremental
            # _last_modified comparison below (every record was always
            # re-fetched).  Only start from an empty frame when there is no
            # cache; the empty frame still needs the columns the comparison
            # reads.
            df = pd.DataFrame(columns=COLUMNS)
        client = MetadataDbClient(
            host=API_GATEWAY_HOST,
            version="v2",
        )
        # Fetching many un-indexed fields from a database as large as DocDB
        # is expensive, so first pull only (_id, _last_modified) and re-fetch
        # full records only where the cache is stale or missing.
        record_ids = client.retrieve_docdb_records(
            filter_query={}, projection={"_id": 1, "_last_modified": 1}, limit=0,
        )
        # O(1) cache lookups instead of a per-record DataFrame scan.
        _MISSING = object()  # sentinel: distinguishes "not cached" from None
        cached_mtime = dict(zip(df["_id"], df["_last_modified"]))
        keep_ids = [
            record["_id"]
            for record in record_ids
            if cached_mtime.get(record["_id"], _MISSING) != record["_last_modified"]
        ]

        # Batch the IDs to keep each request to the server small.
        BATCH_SIZE = 100
        asset_records = []
        for i in range(0, len(keep_ids), BATCH_SIZE):
            logging.info(f"Fetching asset basics batch {i // BATCH_SIZE + 1}...")
            batch_ids = keep_ids[i:i + BATCH_SIZE]
            batch_records = client.retrieve_docdb_records(
                filter_query={"_id": {"$in": batch_ids}},
                projection={field: 1 for field in FIELDS + ["_id", "_last_modified"]},
                limit=0,
            )
            asset_records.extend(batch_records)

        # Flatten nested DocDB documents into one row per asset.
        records = []
        for record in asset_records:
            data_description = record.get("data_description", {})
            acquisition = record.get("acquisition", {})
            modalities = data_description.get("modalities", [])
            modality_str = ", ".join(
                m["abbreviation"] for m in modalities if "abbreviation" in m
            )
            records.append({
                "_id": record["_id"],
                "_last_modified": record.get("_last_modified", None),
                "modalities": modality_str,
                "project_name": data_description.get("project_name", None),
                "data_level": data_description.get("data_level", None),
                "subject_id": record.get("subject", {}).get("subject_id", None),
                "acquisition_start_time": acquisition.get("acquisition_start_time", None),
                "acquisition_end_time": acquisition.get("acquisition_end_time", None),
            })

        # Replace stale cached rows with their refreshed versions.
        # (`~isin` replaces the `isin(...) == False` anti-idiom; `columns=`
        # keeps the schema stable even when no records were fetched.)
        new_df = pd.DataFrame(records, columns=COLUMNS)
        df = pd.concat([df[~df["_id"].isin(keep_ids)], new_df], ignore_index=True)

        ACORN.hide(NAMES["basics"], df)

    return df
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: zombie-squirrel
3
- Version: 0.2.3
3
+ Version: 0.4.0
4
4
  Summary: Generated from aind-library-template
5
5
  Author: Allen Institute for Neural Dynamics
6
6
  License: MIT
@@ -1,86 +0,0 @@
1
- """Squirrels: functions to fetch and cache data from MongoDB."""
2
- import pandas as pd
3
- from typing import Any, Callable
4
- from zombie_squirrel.acorns import RedshiftAcorn, MemoryAcorn, rds_get_handle_empty
5
- from aind_data_access_api.document_db import MetadataDbClient
6
- import os
7
- import logging
8
-
9
# --- Backend setup ---------------------------------------------------

# Hostname of the AIND metadata API gateway (DocDB access).
API_GATEWAY_HOST = "api.allenneuraldynamics.org"

# Cache backend is chosen once, at import time, via the TREE_SPECIES env var.
tree_type = os.getenv("TREE_SPECIES", "memory").lower()

if tree_type == "redshift":
    logging.info("Using Redshift acorn for caching")
    ACORN = RedshiftAcorn()
else:
    logging.info("Using in-memory acorn for caching")
    ACORN = MemoryAcorn()
21
-
22
# --- Squirrel registry -----------------------------------------------------

# Maps a squirrel name to the callable that fetches/caches that dataset.
SQUIRREL_REGISTRY: dict[str, Callable[[], Any]] = {}


def register_squirrel(name: str):
    """Decorator for registering new squirrels.

    The decorated function is stored and returned unwrapped, so calling it
    directly is unaffected by registration.
    """
    def decorator(func):
        SQUIRREL_REGISTRY[name] = func
        return func
    return decorator
33
-
34
-
35
# --- Squirrels -----------------------------------------------------

# Canonical dataset names, keyed by short alias.  Each value doubles as the
# registry key and as the backing table name in the acorn cache.
NAMES = {
    "upn": "unique_project_names",
    "usi": "unique_subject_ids",
}
41
-
42
-
43
@register_squirrel(NAMES["upn"])
def unique_project_names(force_update: bool = False) -> list[str]:
    """Return the unique project names across all DocDB records.

    Served from the acorn cache; ``force_update=True`` re-queries DocDB.
    """
    df = rds_get_handle_empty(ACORN, NAMES["upn"])

    if df.empty or force_update:
        # If cache is missing, fetch data
        logging.info("Updating cache for unique project names")
        client = MetadataDbClient(
            host=API_GATEWAY_HOST,
            version="v2",
        )
        # NOTE(review): this local shadows the function's own name.
        unique_project_names = client.aggregate_docdb_records(
            pipeline=[
                {"$group": {"_id": "$data_description.project_name"}},
                {"$project": {"project_name": "$_id", "_id": 0}},
            ]
        )
        df = pd.DataFrame(unique_project_names)
        ACORN.hide(NAMES["upn"], df)

    return df["project_name"].tolist()
64
-
65
-
66
@register_squirrel(NAMES["usi"])
def unique_subject_ids(force_update: bool = False) -> list[str]:
    """Return the unique subject IDs across all DocDB records.

    Served from the acorn cache; ``force_update=True`` re-queries DocDB.
    """
    df = rds_get_handle_empty(ACORN, NAMES["usi"])

    if df.empty or force_update:
        # If cache is missing, fetch data
        logging.info("Updating cache for unique subject IDs")
        client = MetadataDbClient(
            host=API_GATEWAY_HOST,
            version="v2",
        )
        # NOTE(review): this local shadows the function's own name.
        unique_subject_ids = client.aggregate_docdb_records(
            pipeline=[
                {"$group": {"_id": "$subject.subject_id"}},
                {"$project": {"subject_id": "$_id", "_id": 0}},
            ]
        )
        df = pd.DataFrame(unique_subject_ids)
        ACORN.hide(NAMES["usi"], df)

    return df["subject_id"].tolist()
File without changes