aind-dynamic-foraging-database 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aind_dynamic_foraging_database-0.0.1/LICENSE +21 -0
- aind_dynamic_foraging_database-0.0.1/PKG-INFO +572 -0
- aind_dynamic_foraging_database-0.0.1/README.md +541 -0
- aind_dynamic_foraging_database-0.0.1/pyproject.toml +91 -0
- aind_dynamic_foraging_database-0.0.1/setup.cfg +4 -0
- aind_dynamic_foraging_database-0.0.1/src/aind_dynamic_foraging_database/__init__.py +31 -0
- aind_dynamic_foraging_database-0.0.1/src/aind_dynamic_foraging_database/build_cache.py +274 -0
- aind_dynamic_foraging_database-0.0.1/src/aind_dynamic_foraging_database/query.py +284 -0
- aind_dynamic_foraging_database-0.0.1/src/aind_dynamic_foraging_database/query_examples.py +112 -0
- aind_dynamic_foraging_database-0.0.1/src/aind_dynamic_foraging_database/util/__init__.py +1 -0
- aind_dynamic_foraging_database-0.0.1/src/aind_dynamic_foraging_database/util/nwb_reader_aind.py +77 -0
- aind_dynamic_foraging_database-0.0.1/src/aind_dynamic_foraging_database/util/nwb_reader_legacy.py +398 -0
- aind_dynamic_foraging_database-0.0.1/src/aind_dynamic_foraging_database/util/parquet_builder.py +1203 -0
- aind_dynamic_foraging_database-0.0.1/src/aind_dynamic_foraging_database/util/postprocess.py +208 -0
- aind_dynamic_foraging_database-0.0.1/src/aind_dynamic_foraging_database/validate/__init__.py +1 -0
- aind_dynamic_foraging_database-0.0.1/src/aind_dynamic_foraging_database/validate/plot_validation.py +50 -0
- aind_dynamic_foraging_database-0.0.1/src/aind_dynamic_foraging_database/validate/validate_step1.py +210 -0
- aind_dynamic_foraging_database-0.0.1/src/aind_dynamic_foraging_database/validate/validate_step2.py +111 -0
- aind_dynamic_foraging_database-0.0.1/src/aind_dynamic_foraging_database.egg-info/PKG-INFO +572 -0
- aind_dynamic_foraging_database-0.0.1/src/aind_dynamic_foraging_database.egg-info/SOURCES.txt +23 -0
- aind_dynamic_foraging_database-0.0.1/src/aind_dynamic_foraging_database.egg-info/dependency_links.txt +1 -0
- aind_dynamic_foraging_database-0.0.1/src/aind_dynamic_foraging_database.egg-info/requires.txt +21 -0
- aind_dynamic_foraging_database-0.0.1/src/aind_dynamic_foraging_database.egg-info/top_level.txt +1 -0
- aind_dynamic_foraging_database-0.0.1/tests/test_parquet_cache.py +411 -0
- aind_dynamic_foraging_database-0.0.1/tests/test_query.py +126 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2022 Allen Institute for Neural Dynamics
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,572 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: aind-dynamic-foraging-database
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: Query (and build) the AIND dynamic-foraging behavioral parquet database.
|
|
5
|
+
Author: Allen Institute for Neural Dynamics
|
|
6
|
+
License: MIT
|
|
7
|
+
Classifier: Programming Language :: Python :: 3
|
|
8
|
+
Requires-Python: >=3.9
|
|
9
|
+
Description-Content-Type: text/markdown
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Requires-Dist: duckdb
|
|
12
|
+
Requires-Dist: pandas
|
|
13
|
+
Provides-Extra: build
|
|
14
|
+
Requires-Dist: pynwb; extra == "build"
|
|
15
|
+
Requires-Dist: hdmf_zarr; extra == "build"
|
|
16
|
+
Requires-Dist: h5py; extra == "build"
|
|
17
|
+
Requires-Dist: s3fs>=2025.10.0; extra == "build"
|
|
18
|
+
Requires-Dist: pyarrow; extra == "build"
|
|
19
|
+
Requires-Dist: aind-data-access-api; extra == "build"
|
|
20
|
+
Requires-Dist: codeocean; extra == "build"
|
|
21
|
+
Requires-Dist: aind-dynamic-foraging-data-utils; extra == "build"
|
|
22
|
+
Provides-Extra: dev
|
|
23
|
+
Requires-Dist: black; extra == "dev"
|
|
24
|
+
Requires-Dist: coverage; extra == "dev"
|
|
25
|
+
Requires-Dist: flake8; extra == "dev"
|
|
26
|
+
Requires-Dist: interrogate; extra == "dev"
|
|
27
|
+
Requires-Dist: isort; extra == "dev"
|
|
28
|
+
Requires-Dist: Sphinx; extra == "dev"
|
|
29
|
+
Requires-Dist: furo; extra == "dev"
|
|
30
|
+
Dynamic: license-file
|
|
31
|
+
|
|
32
|
+
# AIND Dynamic Foraging Database
|
|
33
|
+
|
|
34
|
+
[License: MIT](LICENSE)
|
|
35
|
+

|
|
36
|
+
[semantic-release](https://github.com/semantic-release/semantic-release)
|
|
37
|
+

|
|
38
|
+

|
|
39
|
+

|
|
40
|
+
|
|
41
|
+
The **single, queryable source of truth for _all_ AIND dynamic-foraging behavior** — every
|
|
42
|
+
session / trial / event, unified into one **parquet database** on a public S3 bucket. Query any
|
|
43
|
+
mice / sessions, or the whole dataset, in **seconds** with a few Python calls (DuckDB + pandas) —
|
|
44
|
+
instead of opening thousands of NWBs.
|
|
45
|
+
|
|
46
|
+
> **~24k sessions · 12.5M trials · 117M events** — the *complete* dataset, [**~10,000× faster**](#vs-the-legacy-nwb_utils-route) to
|
|
47
|
+
> query. (Per Po-Chen's test, reading data directly from NWBs via `aind-dynamic-foraging-data-utils`
|
|
48
|
+
> took **~6 days and reached only ~12k sessions** — about half.)
|
|
49
|
+
|
|
50
|
+
## Installation
|
|
51
|
+
|
|
52
|
+
Querying is lightweight — just `duckdb` + `pandas`:
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
uv add aind-dynamic-foraging-database # uv (recommended)
|
|
56
|
+
pip install aind-dynamic-foraging-database
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
Or install the latest straight from GitHub (prefix with `!` in a notebook):
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
pip install "git+https://github.com/AllenNeuralDynamics/aind-dynamic-foraging-database.git"
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
To **build or extend** the database from NWBs, install the `build` extra (adds the NWB readers
|
|
66
|
+
and `aind-dynamic-foraging-data-utils`) — see [`README_build.md`](README_build.md):
|
|
67
|
+
|
|
68
|
+
```bash
|
|
69
|
+
uv add "aind-dynamic-foraging-database[build]"
|
|
70
|
+
pip install "aind-dynamic-foraging-database[build]"
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
---
|
|
74
|
+
|
|
75
|
+
> 🚀 **Start with the query helpers** — importable from `aind_dynamic_foraging_database`,
|
|
76
|
+
> they wrap DuckDB and hand back a pandas DataFrame:
|
|
77
|
+
> - **`select_sessions(where=…, subjects=…, columns=…)`** — filter the (small) session table on any
|
|
78
|
+
> metric / metadata (or a subject list); returns a DataFrame of the selected sessions.
|
|
79
|
+
> - **`fetch_trials(sel, …)` / `fetch_events(sel, …)`** — pull those sessions' trials / events with the
|
|
80
|
+
> session metadata joined onto every row, reading only the selected subjects' partitions (fast).
|
|
81
|
+
> - **`read_trials(subjects)` / `read_events(subjects)`** — escape hatch: a fast, partition-scoped
|
|
82
|
+
> `read_parquet(...)` clause to drop into any DuckDB SQL you write (aggregations, windows, joins).
|
|
83
|
+
>
|
|
84
|
+
> See [**Quick start**](#quick-start--the-query-helpers) for runnable examples; drop to native SQL
|
|
85
|
+
> only when the helpers don't cover what you need.
|
|
86
|
+
|
|
87
|
+
> 💡 **Need custom DuckDB SQL? Let an LLM write it.** This README is self-contained: paste the
|
|
88
|
+
> whole file into the LLM of your choice (Claude / ChatGPT / Cursor / …) as context, then ask
|
|
89
|
+
> in plain English (e.g. *"trials for subjects 754372 and 758435 with foraging_eff > 0.8"*).
|
|
90
|
+
> It will return runnable DuckDB that follows the conventions below — including the key
|
|
91
|
+
> columns. See [**Use an LLM**](#use-an-llm-to-write-queries) for a copy-paste preamble — or, with
|
|
92
|
+
> a coding agent (Claude Code / Codex / OpenCode), load the `aind-dynamic-foraging-data-access`
|
|
93
|
+
> skill in `.claude/skills/` instead.
|
|
94
|
+
|
|
95
|
+
> 📊 **Prefer to browse the session metadata visually?** The interactive
|
|
96
|
+
> [**foraging behavior browser**](https://foraging-behavior-browser.allenneuraldynamics.org/)
|
|
97
|
+
> (Streamlit) renders this same session table with rich plots and point-and-click filters —
|
|
98
|
+
> a great way to find sessions/subjects before pulling their trials/events here.
|
|
99
|
+
> *Caveat:* the app is built from **Han's pipeline** only, so the **~381 CO-only sessions** this
|
|
100
|
+
> cache adds from the Code Ocean universe (all `nwb_data_source = 'co_asset'`, with NULL Han
|
|
101
|
+
> metadata — find them via `WHERE foraging_eff IS NULL`) **do not appear in the app**, even though
|
|
102
|
+
> their trials/events are fully in the cache.
|
|
103
|
+
|
|
104
|
+
> 🔧 **Building or extending the database?** See **[`README_build.md`](README_build.md)**.
|
|
105
|
+
|
|
106
|
+
---
|
|
107
|
+
|
|
108
|
+
## The database
|
|
109
|
+
|
|
110
|
+
Three tables on a **public** S3 bucket (`s3://aind-scratch-data/aind-dynamic-foraging-cache/`):
|
|
111
|
+
|
|
112
|
+
| Table | Path | Grain | Size |
|
|
113
|
+
|---|---|---|---|
|
|
114
|
+
| **session** | `session_table.parquet` | one row per session | ~24k rows × 160 cols (~MB) |
|
|
115
|
+
| **trial** | `trial_table/subject_id=<id>/…parquet` | one row per trial | ~12.5M rows × 103 cols (~21 GB) |
|
|
116
|
+
| **event** | `event_table/subject_id=<id>/…parquet` | one row per behavioral event | ~117M rows × 10 cols (~9 events / trial) |
|
|
117
|
+
|
|
118
|
+
The trial/event tables are **Hive-partitioned by `subject_id`** and coalesced to one file per
|
|
119
|
+
subject. The bucket is **public — DuckDB reads `s3://` natively with no AWS credentials or
|
|
120
|
+
setup** (httpfs auto-loads). Point at a local directory instead to query a local build.
|
|
121
|
+
|
|
122
|
+
The paths are importable:
|
|
123
|
+
|
|
124
|
+
```python
|
|
125
|
+
from aind_dynamic_foraging_database import SESSION_DB, TRIAL_DB, EVENT_DB
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
---
|
|
129
|
+
|
|
130
|
+
## Quick start — the query helpers
|
|
131
|
+
|
|
132
|
+
Reach for the helpers first. They do the fiddly, easy-to-get-wrong part (reading the right
|
|
133
|
+
partition files, fast *and* correct) and hand back a pandas DataFrame. Drop to
|
|
134
|
+
[native SQL](#native-sql-what-the-helpers-are-built-on) only when you need more.
|
|
135
|
+
|
|
136
|
+
```python
|
|
137
|
+
from aind_dynamic_foraging_database import (
|
|
138
|
+
select_sessions, fetch_trials, fetch_events,
|
|
139
|
+
)
|
|
140
|
+
|
|
141
|
+
# 1) Filter the (small) session table on any metric / metadata.
|
|
142
|
+
sel = select_sessions("task LIKE '%Uncoupled%' AND finished_trials > 500 AND finished_rate > 0.9")
|
|
143
|
+
|
|
144
|
+
# 2) Pull those sessions' trials — session metadata is joined onto every row.
|
|
145
|
+
trials = fetch_trials(sel, columns=["animal_response", "earned_reward",
|
|
146
|
+
"reward_probabilityL", "reward_probabilityR"])
|
|
147
|
+
|
|
148
|
+
# ... or their events (optionally restricted to certain event types).
|
|
149
|
+
licks = fetch_events(sel, events=["left_lick_time", "right_lick_time"])
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
**Two workflows, same two calls** — the only difference is how you filter the session table:
|
|
153
|
+
|
|
154
|
+
- **Filter on session metrics/metadata, then fetch** — pass a `where=` predicate (any column
|
|
155
|
+
of the session table; see [filter columns](#common-filter-columns-session-table)).
|
|
156
|
+
- **Subject first, then session, then fetch** — pass `subjects=[...]` (optionally with `where=`):
|
|
157
|
+
|
|
158
|
+
```python
|
|
159
|
+
sel = select_sessions("finished_trials > 200", subjects=["754372", "758435"])
|
|
160
|
+
trials = fetch_trials(sel, columns=["animal_response", "earned_reward"])
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
`fetch_trials` / `fetch_events` read **only the selected subjects' partition files** (≈1 s, not
|
|
164
|
+
every subject's) and inner-join to your selection, so you get exactly those sessions' rows with
|
|
165
|
+
their metadata attached — one row per trial/event, leading `subject_id, session_date,
|
|
166
|
+
session_id`. Add `columns=` to project specific columns (default is a small choice/reward set;
|
|
167
|
+
`columns="*"` returns all — large).
|
|
168
|
+
|
|
169
|
+
### Need more than the helpers cover? Drop to SQL on a fast source
|
|
170
|
+
|
|
171
|
+
The helpers don't try to express *every* query (aggregations, window functions, trial↔event
|
|
172
|
+
joins). For those, `read_trials(subjects)` / `read_events(subjects)` return a **fast,
|
|
173
|
+
partition-scoped `read_parquet(...)` clause** you drop into any SQL — so you keep the full
|
|
174
|
+
power of SQL without the slow full-table glob:
|
|
175
|
+
|
|
176
|
+
```python
|
|
177
|
+
import duckdb
|
|
178
|
+
from aind_dynamic_foraging_database import read_trials
|
|
179
|
+
|
|
180
|
+
src = read_trials(["754372", "758435"]) # scoped -> reads only these subjects' files
|
|
181
|
+
duckdb.sql(f"""
|
|
182
|
+
SELECT subject_id, COUNT(*) AS n_trials, AVG(earned_reward::DOUBLE) AS reward_rate
|
|
183
|
+
FROM {src} GROUP BY subject_id ORDER BY subject_id
|
|
184
|
+
""").df()
|
|
185
|
+
```
|
|
186
|
+
|
|
187
|
+
`read_trials()` / `read_events()` with no arguments return the full-table glob (correct for any
|
|
188
|
+
query, but reads every subject's footer — slow; scope to subjects whenever you can).
|
|
189
|
+
|
|
190
|
+
> All helpers query the public S3 cache by default. Pass `base=` (a local dir or another S3
|
|
191
|
+
> prefix) to any of them to query a different build.
|
|
192
|
+
|
|
193
|
+
---
|
|
194
|
+
|
|
195
|
+
## Native SQL (what the helpers are built on)
|
|
196
|
+
|
|
197
|
+
Everything below is the raw DuckDB layer. Use it directly when you want full control — or to
|
|
198
|
+
understand what the helpers do under the hood. (You can still read the session table directly,
|
|
199
|
+
e.g. `duckdb.sql(f"SELECT COUNT(*) FROM read_parquet('{SESSION_DB}') WHERE subject_id = '754372'")`.)
|
|
200
|
+
|
|
201
|
+
## The three read options (always use these on the partitioned tables)
|
|
202
|
+
|
|
203
|
+
```python
|
|
204
|
+
READ_TRIALS = f"read_parquet('{TRIAL_DB}/**/*.parquet', hive_partitioning=true, union_by_name=true)"
|
|
205
|
+
READ_EVENTS = f"read_parquet('{EVENT_DB}/**/*.parquet', hive_partitioning=true, union_by_name=true)"
|
|
206
|
+
```
|
|
207
|
+
|
|
208
|
+
- **`hive_partitioning=true`** — exposes `subject_id` from the directory name and prunes
|
|
209
|
+
partitions, so filtering by `subject_id` reads only that mouse's file(s).
|
|
210
|
+
- **`union_by_name=true`** — merges columns across the three NWB readers (a column missing in
|
|
211
|
+
some files fills with `NULL` instead of erroring).
|
|
212
|
+
- **`CAST(subject_id AS VARCHAR)`** — in the trial/event tables `subject_id` comes from the
|
|
213
|
+
partition directory and DuckDB infers it as **BIGINT**; the session table stores it as a
|
|
214
|
+
**string**. Always cast when filtering/joining `subject_id` on the trial/event tables.
|
|
215
|
+
|
|
216
|
+
The session table is a single plain parquet — read it with `read_parquet('{SESSION_DB}')`
|
|
217
|
+
(no options needed).
|
|
218
|
+
|
|
219
|
+
---
|
|
220
|
+
|
|
221
|
+
## Keys & joins
|
|
222
|
+
|
|
223
|
+
- Session key column is **`_session_id`** in the session table; the trial/event tables call it
|
|
224
|
+
**`session_id`** (same value: `"{subject_id}_{session_date}_{nwb_suffix}"`).
|
|
225
|
+
- Canonical pattern — **filter sessions, then JOIN to trials** so every trial row carries its
|
|
226
|
+
session metadata:
|
|
227
|
+
|
|
228
|
+
```python
|
|
229
|
+
df = duckdb.sql(f"""
|
|
230
|
+
WITH sel AS (
|
|
231
|
+
SELECT _session_id, subject_id, session_date, task, foraging_eff
|
|
232
|
+
FROM read_parquet('{SESSION_DB}')
|
|
233
|
+
WHERE task LIKE '%Uncoupled%' AND foraging_eff > 0.8
|
|
234
|
+
)
|
|
235
|
+
SELECT s.subject_id, s.session_date, t.session_id,
|
|
236
|
+
t.animal_response, t.earned_reward,
|
|
237
|
+
t.reward_probabilityL, t.reward_probabilityR
|
|
238
|
+
FROM {READ_TRIALS} t
|
|
239
|
+
JOIN sel s ON t.session_id = s._session_id
|
|
240
|
+
WHERE CAST(t.subject_id AS VARCHAR) IN (SELECT subject_id FROM sel)
|
|
241
|
+
ORDER BY s.subject_id, s.session_date
|
|
242
|
+
""").df()
|
|
243
|
+
```
|
|
244
|
+
|
|
245
|
+
(The extra `WHERE CAST(subject_id …) IN (…)` lets DuckDB prune partitions before the join.)
|
|
246
|
+
|
|
247
|
+
---
|
|
248
|
+
|
|
249
|
+
## Common filter columns (session table)
|
|
250
|
+
|
|
251
|
+
Almost all analyses start by selecting sessions on a few columns of the **session table**, then
|
|
252
|
+
joining to trials/events. The columns you'll filter on most:
|
|
253
|
+
|
|
254
|
+
| Filter on | Column(s) | Example values / predicate |
|
|
255
|
+
|---|---|---|
|
|
256
|
+
| **Identity** | `subject_id`, `session_date` | `subject_id IN ('754372','758435')`; `session_date >= '2024-01-01'` |
|
|
257
|
+
| **Institute / hardware / rig** | `institute`, `hardware`, `rig_type`, `room` | `institute`: `AIND` \| `Janelia`; `hardware`: `bonsai` \| `bpod`; `rig_type`: `training` \| `ephys`; `room`: `447`, `446`, … → e.g. `institute = 'Janelia'`, `hardware = 'bpod'`, `rig_type = 'ephys'` |
|
|
258
|
+
| **Behavior task** | `task` | `Uncoupled Baiting`, `Coupled Baiting`, `Uncoupled Without Baiting`, `Coupled Without Baiting` → `task LIKE '%Uncoupled%'` |
|
|
259
|
+
| **Curriculum** | `curriculum_name`, `curriculum_version` | e.g. `Uncoupled Baiting` / `'2.3'`; **`'None'` = off-curriculum** → `curriculum_name <> 'None'` for on-curriculum only |
|
|
260
|
+
| **Curriculum stage** | `current_stage_actual` | `STAGE_1_WARMUP` → `STAGE_1` → `STAGE_2/3/4` → `STAGE_FINAL` → `GRADUATED` (`'None'` = off-curriculum). For fully-trained sessions use the **"Final stages"**: `current_stage_actual IN ('STAGE_FINAL', 'GRADUATED')` (see note below) |
|
|
261
|
+
| **Performance metrics** | `finished_trials`, `finished_rate`, `foraging_eff`, `total_trials`, `reward_trials`, `bias_naive`, … | combine freely: `foraging_eff > 0.8 AND finished_trials > 200 AND finished_rate > 0.7` |
|
|
262
|
+
|
|
263
|
+
> 💡 **Use `institute` / `hardware` / `rig_type` for high-level grouping** (clean values:
|
|
264
|
+
> `AIND`/`Janelia`, `bonsai`/`bpod`, `training`/`ephys`). `data_source` is their fine-grained
|
|
265
|
+
> concatenation (e.g. `AIND_training_447_bonsai`) — usually too granular to filter on directly.
|
|
266
|
+
> And **`data_source` ≠ `nwb_data_source`**: `nwb_data_source` (`co_asset`/`bonsai_s3`/`bpod_s3`)
|
|
267
|
+
> is just *which NWB the cache built the row from*, not a science filter.
|
|
268
|
+
>
|
|
269
|
+
> **Curriculum "off" vs "missing":** off-curriculum sessions have the **string** `curriculum_name
|
|
270
|
+
> = 'None'` (and `curriculum_version = 'None'`); the ~381 CO-only sessions absent from Han have
|
|
271
|
+
> SQL `NULL`. `curriculum_name NOT IN ('None')` keeps on-curriculum sessions (it also drops the
|
|
272
|
+
> NULLs).
|
|
273
|
+
>
|
|
274
|
+
> **"Final stages" — `STAGE_FINAL` vs `GRADUATED`:** a curriculum's terminal *training
|
|
275
|
+
> parameters* are reached at `current_stage_actual = 'STAGE_FINAL'`; once a mouse meets the
|
|
276
|
+
> graduation criteria the stage is relabeled `'GRADUATED'` — but **both run the identical task
|
|
277
|
+
> parameters**. So for "fully-trained" sessions, treat them as one:
|
|
278
|
+
> `current_stage_actual IN ('STAGE_FINAL', 'GRADUATED')`.
|
|
279
|
+
>
|
|
280
|
+
> **Curriculum vs. `task` — related but not the same.** `curriculum_name` is the auto-training
|
|
281
|
+
> *program* a mouse is enrolled in (named after its **target task**, and constant as the mouse
|
|
282
|
+
> progresses); `task` is the paradigm **actually run that session**, which changes by stage
|
|
283
|
+
> because the curriculum ramps difficulty. E.g. the *Uncoupled Baiting* curriculum runs the
|
|
284
|
+
> easier **Coupled Baiting** task in `STAGE_1_WARMUP`→`STAGE_2`, then switches to **Uncoupled
|
|
285
|
+
> Baiting** from `STAGE_3`→`STAGE_FINAL`/`GRADUATED`. So filter **`curriculum_name`** to pick mice
|
|
286
|
+
> *enrolled in* a program, and **`task`** to pick sessions that *actually ran* a paradigm — they
|
|
287
|
+
> match for most sessions but differ for ~3.2k on-curriculum sessions (the early stages).
|
|
288
|
+
|
|
289
|
+
---
|
|
290
|
+
|
|
291
|
+
## Schema catalog
|
|
292
|
+
|
|
293
|
+
Column types come straight from the files. To list **every** column of a table:
|
|
294
|
+
|
|
295
|
+
```python
|
|
296
|
+
duckdb.sql(f"DESCRIBE SELECT * FROM read_parquet('{SESSION_DB}')").df() # 160 cols
|
|
297
|
+
duckdb.sql(f"DESCRIBE SELECT * FROM {READ_TRIALS}").df() # 103 cols
|
|
298
|
+
duckdb.sql(f"DESCRIBE SELECT * FROM {READ_EVENTS}").df() # 10 cols
|
|
299
|
+
```
|
|
300
|
+
|
|
301
|
+
### session table (key columns; 160 total)
|
|
302
|
+
| column | type | meaning |
|
|
303
|
+
|---|---|---|
|
|
304
|
+
| `_session_id` | VARCHAR | session key → join to trial/event `session_id` |
|
|
305
|
+
| `subject_id` | VARCHAR | mouse ID (string) |
|
|
306
|
+
| `session_date` | VARCHAR | `YYYY-MM-DD` |
|
|
307
|
+
| `nwb_suffix` | BIGINT | session start `HHMMSS` as int (disambiguates same-day sessions) |
|
|
308
|
+
| `task` | VARCHAR | e.g. `Uncoupled Baiting`, `Coupled Baiting`, `Uncoupled Without Baiting` |
|
|
309
|
+
| `total_trials` | DOUBLE | foraging trials, **autowater excluded** |
|
|
310
|
+
| `total_trials_with_autowater` | DOUBLE | all trials (= trial-table `COUNT(*)`) |
|
|
311
|
+
| `finished_trials` | DOUBLE | non-ignored foraging trials |
|
|
312
|
+
| `ignored_trials` | DOUBLE | foraging trials with no response |
|
|
313
|
+
| `finished_rate`, `ignore_rate` | DOUBLE | finished / ignored fraction |
|
|
314
|
+
| `reward_trials` | DOUBLE | earned (non-autowater) rewards |
|
|
315
|
+
| `reward_rate` | DOUBLE | reward / finished |
|
|
316
|
+
| `foraging_eff` | DOUBLE | foraging efficiency vs ideal |
|
|
317
|
+
| `foraging_performance` | DOUBLE | `foraging_eff × finished_rate` |
|
|
318
|
+
| `bias_naive` | DOUBLE | side bias, −1 (left) … +1 (right) |
|
|
319
|
+
| `autowater_collected`, `autowater_ignored` | DOUBLE | autowater trial counts |
|
|
320
|
+
| `reaction_time_median`, `early_lick_rate` | DOUBLE | timing / lick metrics |
|
|
321
|
+
| `institute`, `hardware`, `rig_type` | VARCHAR | **high-level grouping** — `AIND`/`Janelia`, `bonsai`/`bpod`, `training`/`ephys` |
|
|
322
|
+
| `room` | VARCHAR | rig room (`447`, `446`, `347`, …) |
|
|
323
|
+
| `data_source` | VARCHAR | fine-grained composite ≈ `{institute}_{rig_type}_{room}_{hardware}` (e.g. `AIND_training_447_bonsai`) — **≠ `nwb_data_source`** |
|
|
324
|
+
| `curriculum_name`, `curriculum_version` | VARCHAR | curriculum + version; **`'None'` = off-curriculum**, `NULL` = not in Han |
|
|
325
|
+
| `current_stage_actual` | VARCHAR | curriculum stage reached: `STAGE_1_WARMUP`…`STAGE_FINAL`/`GRADUATED` — the two **"Final stages"** (`STAGE_FINAL`, `GRADUATED`) share training parameters; `'None'` = off-curriculum |
|
|
326
|
+
| `rig`, `trainer`, `PI` | VARCHAR | session metadata |
|
|
327
|
+
| `weight_after`, `water_in_session_total` | DOUBLE | weight / water |
|
|
328
|
+
| `logistic_*`, `abs(*_bias)` | DOUBLE | fitted logistic-regression model coefficients |
|
|
329
|
+
| `nwb_data_source` | VARCHAR | `co_asset` \| `bonsai_s3` \| `bpod_s3` — which NWB the cache built the row from (not a science filter) |
|
|
330
|
+
| `co_asset_id`, `co_s3_nwb_uri` | VARCHAR | Code Ocean asset id / NWB URI (NULL if none) |
|
|
331
|
+
|
|
332
|
+
### trial table (key columns; 103 total)
|
|
333
|
+
| column | type | meaning |
|
|
334
|
+
|---|---|---|
|
|
335
|
+
| `session_id` | VARCHAR | join key → session `_session_id` |
|
|
336
|
+
| `subject_id` | BIGINT* | mouse ID — *cast to VARCHAR when filtering* (partition column) |
|
|
337
|
+
| `session_date` | VARCHAR | `YYYY-MM-DD` |
|
|
338
|
+
| `nwb_suffix` | BIGINT | session suffix |
|
|
339
|
+
| `trial` | DOUBLE | trial index within the session |
|
|
340
|
+
| `animal_response` | DOUBLE | **0 = lick left, 1 = lick right, 2 = ignore (no response)** |
|
|
341
|
+
| `earned_reward` | BOOLEAN | earned a (non-autowater) reward (= `rewarded_historyL OR rewarded_historyR`) |
|
|
342
|
+
| `rewarded_historyL` / `rewarded_historyR` | BOOLEAN | reward delivered on left / right |
|
|
343
|
+
| `reward_probabilityL` / `reward_probabilityR` | DOUBLE | scheduled reward prob per side |
|
|
344
|
+
| `auto_waterL` / `auto_waterR` | BIGINT | autowater given on left / right (**non-autowater trial = both 0**) |
|
|
345
|
+
| `reward_random_number_left` / `_right` | DOUBLE | the draw used for baiting |
|
|
346
|
+
| `goCue_start_time_in_session` | DOUBLE | go-cue time (s from session start) |
|
|
347
|
+
| `choice_time_in_session` | DOUBLE | choice (lick) time (s) |
|
|
348
|
+
| `reward_time_in_session` | DOUBLE | reward time (s) |
|
|
349
|
+
| `reaction_time` | DOUBLE | choice − go-cue (s) |
|
|
350
|
+
| `laser_*` | mixed | optogenetics parameters (NULL on non-opto trials) |
|
|
351
|
+
| `nwb_data_source` | VARCHAR | reader source |
|
|
352
|
+
|
|
353
|
+
### event table (all 10 columns)
|
|
354
|
+
| column | type | meaning |
|
|
355
|
+
|---|---|---|
|
|
356
|
+
| `session_id` | VARCHAR | join key → session `_session_id` |
|
|
357
|
+
| `subject_id` | BIGINT* | mouse ID — *cast to VARCHAR when filtering* |
|
|
358
|
+
| `session_date` | VARCHAR | `YYYY-MM-DD` |
|
|
359
|
+
| `nwb_suffix` | BIGINT | session suffix |
|
|
360
|
+
| `trial` | DOUBLE | trial index this event falls in (−1 before first go-cue) |
|
|
361
|
+
| `timestamps` | DOUBLE | event time, s from session start |
|
|
362
|
+
| `raw_timestamps` | DOUBLE | original NWB timestamp (un-aligned) |
|
|
363
|
+
| `event` | VARCHAR | one of: `goCue_start_time`, `left_lick_time`, `right_lick_time`, `left_reward_delivery_time`, `right_reward_delivery_time`, `optogenetics_time` |
|
|
364
|
+
| `data` | VARCHAR | event payload (string-normalized) |
|
|
365
|
+
| `nwb_data_source` | VARCHAR | reader source |
|
|
366
|
+
|
|
367
|
+
---
|
|
368
|
+
|
|
369
|
+
## Conventions & gotchas
|
|
370
|
+
|
|
371
|
+
- **Cast `subject_id`** on the trial/event tables — when filtering, joining, **and grouping**:
|
|
372
|
+
`WHERE CAST(subject_id AS VARCHAR) = '754372'`, `GROUP BY CAST(subject_id AS VARCHAR)`
|
|
373
|
+
(partition column is BIGINT; session-table column is string; grouping on the raw BIGINT
|
|
374
|
+
partition column can also trip a DuckDB stats error).
|
|
375
|
+
- **`subject_id` and `session_date` are strings** — quote them (`'754372'`, `'2024-05-01'`).
|
|
376
|
+
- **Session key naming:** session table `_session_id` ↔ trial/event `session_id`.
|
|
377
|
+
- **Autowater:** `total_trials` **excludes** autowater; `total_trials_with_autowater` is all
|
|
378
|
+
trials (and equals the trial table's `COUNT(*)`). A trial is non-autowater iff
|
|
379
|
+
`auto_waterL = 0 AND auto_waterR = 0`. `reward_trials` counts earned (non-autowater) rewards.
|
|
380
|
+
- **`animal_response`:** 0 = left, 1 = right, 2 = ignore.
|
|
381
|
+
- **Performance:** *filter by `subject_id`* (prunes partitions) and *project only the columns
|
|
382
|
+
you need*. `SELECT *` over trials is ~21 GB; the choice/reward 5-column slice is ~2 GB / ~6 s.
|
|
383
|
+
- **NULLs:** `union_by_name` fills reader-specific columns with `NULL`; numeric comparisons
|
|
384
|
+
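(`> 0.8`) drop NULL rows. (NaN behaves differently: DuckDB orders NaN above every number, so a stored NaN passes `> 0.8` — add `AND NOT isnan(col)` if a column can hold NaN.)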
|
|
385
|
+
- **⚠️ The ~381 CO-only sessions have NO Han metadata.** Sessions added from the Code Ocean
|
|
386
|
+
universe but absent from Han's pipeline (`nwb_data_source = 'co_asset'`) have **only the
|
|
387
|
+
identity + CO columns populated** (`subject_id, session_date, nwb_suffix, _session_id,
|
|
388
|
+
co_asset_id, co_s3_nwb_uri, nwb_data_source`); **all Han columns are NULL** (`task`,
|
|
389
|
+
`institute`, `hardware`, `curriculum_*`, `foraging_eff`, `finished_trials`, every metric). So
|
|
390
|
+
**any filter on a Han column silently excludes them** (NULL fails every comparison — they
|
|
391
|
+
"never return"). Their **trials/events are fully in the cache**, so reach them by
|
|
392
|
+
`subject_id`/`session_id` (or isolate them with `WHERE foraging_eff IS NULL`). **We plan to
|
|
393
|
+
rebuild the session metric table directly from the cache** — recomputing these per-session
|
|
394
|
+
stats from the trial data (the single source of truth) — which will fill in the CO-only
|
|
395
|
+
sessions and eventually supersede Han's pipeline as the source of session metadata.
|
|
396
|
+
|
|
397
|
+
---
|
|
398
|
+
|
|
399
|
+
## Output-formatting rules
|
|
400
|
+
|
|
401
|
+
So results stay identifiable and joinable, **every query that returns trial/event/session rows
|
|
402
|
+
should**:
|
|
403
|
+
|
|
404
|
+
1. `SELECT subject_id, session_date, session_id` as the **leading** columns (the session table names the key `_session_id`, so use `_session_id AS session_id` there);
|
|
405
|
+
2. return **one row per trial / event / session**;
|
|
406
|
+
3. `ORDER BY subject_id, session_date` (then `trial`/`timestamps` where relevant).
|
|
407
|
+
|
|
408
|
+
---
|
|
409
|
+
|
|
410
|
+
## Worked examples
|
|
411
|
+
|
|
412
|
+
```python
|
|
413
|
+
import duckdb
|
|
414
|
+
from aind_dynamic_foraging_database import SESSION_DB, TRIAL_DB, EVENT_DB
|
|
415
|
+
READ_TRIALS = f"read_parquet('{TRIAL_DB}/**/*.parquet', hive_partitioning=true, union_by_name=true)"
|
|
416
|
+
READ_EVENTS = f"read_parquet('{EVENT_DB}/**/*.parquet', hive_partitioning=true, union_by_name=true)"
|
|
417
|
+
```
|
|
418
|
+
|
|
419
|
+
**1. Count records (with a filter)**
|
|
420
|
+
```python
|
|
421
|
+
duckdb.sql(f"SELECT COUNT(*) FROM read_parquet('{SESSION_DB}') WHERE foraging_eff > 0.8").df()
|
|
422
|
+
duckdb.sql(f"SELECT COUNT(*) FROM {READ_TRIALS} WHERE CAST(subject_id AS VARCHAR) = '754372'").df()
|
|
423
|
+
```
|
|
424
|
+
|
|
425
|
+
**2. Fetch selected columns for a list of subjects**
|
|
426
|
+
```python
|
|
427
|
+
duckdb.sql(f"""
|
|
428
|
+
SELECT subject_id, session_date, session_id, animal_response, earned_reward
|
|
429
|
+
FROM {READ_TRIALS}
|
|
430
|
+
WHERE CAST(subject_id AS VARCHAR) IN ('754372', '758435')
|
|
431
|
+
ORDER BY subject_id, session_date
|
|
432
|
+
""").df()
|
|
433
|
+
```
|
|
434
|
+
|
|
435
|
+
**3. Filter by `(subject_id, session_date)` combinations**
|
|
436
|
+
```python
|
|
437
|
+
duckdb.sql(f"""
|
|
438
|
+
SELECT subject_id, session_date, session_id, trial, animal_response, earned_reward
|
|
439
|
+
FROM {READ_TRIALS}
|
|
440
|
+
WHERE (CAST(subject_id AS VARCHAR), session_date) IN (
|
|
441
|
+
('754372', '2024-05-01'), ('758435', '2024-06-12')
|
|
442
|
+
)
|
|
443
|
+
ORDER BY subject_id, session_date, trial
|
|
444
|
+
""").df()
|
|
445
|
+
```
|
|
446
|
+
|
|
447
|
+
**4. Filter on source + task + curriculum + performance metrics → join to trials**
|
|
448
|
+
```python
|
|
449
|
+
duckdb.sql(f"""
|
|
450
|
+
WITH sel AS (
|
|
451
|
+
SELECT _session_id, subject_id, session_date
|
|
452
|
+
FROM read_parquet('{SESSION_DB}')
|
|
453
|
+
WHERE institute = 'AIND' AND hardware = 'bonsai' -- institute / hardware
|
|
454
|
+
AND task LIKE '%Uncoupled%' -- behavior task
|
|
455
|
+
AND curriculum_name NOT IN ('None') -- on-curriculum only
|
|
456
|
+
AND foraging_eff > 0.8 AND finished_trials > 200 -- performance metrics
|
|
457
|
+
)
|
|
458
|
+
SELECT s.subject_id, s.session_date, t.session_id, t.trial,
|
|
459
|
+
t.animal_response, t.earned_reward, t.reward_probabilityL, t.reward_probabilityR
|
|
460
|
+
FROM {READ_TRIALS} t
|
|
461
|
+
JOIN sel s ON t.session_id = s._session_id
|
|
462
|
+
WHERE CAST(t.subject_id AS VARCHAR) IN (SELECT subject_id FROM sel)
|
|
463
|
+
ORDER BY s.subject_id, s.session_date, t.trial
|
|
464
|
+
""").df()
|
|
465
|
+
```
|
|
466
|
+
|
|
467
|
+
**5. Events (e.g. licks) for selected sessions**
|
|
468
|
+
```python
|
|
469
|
+
duckdb.sql(f"""
|
|
470
|
+
SELECT subject_id, session_date, session_id, trial, timestamps, event
|
|
471
|
+
FROM {READ_EVENTS}
|
|
472
|
+
WHERE CAST(subject_id AS VARCHAR) = '754372'
|
|
473
|
+
AND event IN ('left_lick_time', 'right_lick_time')
|
|
474
|
+
ORDER BY subject_id, session_date, timestamps
|
|
475
|
+
""").df()
|
|
476
|
+
```
|
|
477
|
+
|
|
478
|
+
**6. Per-subject aggregate (all in SQL)**
|
|
479
|
+
```python
|
|
480
|
+
duckdb.sql(f"""
|
|
481
|
+
SELECT CAST(subject_id AS VARCHAR) AS subject_id,
|
|
482
|
+
COUNT(DISTINCT session_id) AS n_sessions,
|
|
483
|
+
COUNT(*) AS n_trials,
|
|
484
|
+
AVG(earned_reward::DOUBLE) AS reward_rate
|
|
485
|
+
FROM {READ_TRIALS}
|
|
486
|
+
WHERE CAST(subject_id AS VARCHAR) IN ('754372', '758435')
|
|
487
|
+
GROUP BY CAST(subject_id AS VARCHAR)
|
|
488
|
+
ORDER BY subject_id
|
|
489
|
+
""").df()
|
|
490
|
+
```
|
|
491
|
+
(Cast `subject_id` in the `GROUP BY` too — grouping on the raw BIGINT partition column can hit
|
|
492
|
+
a DuckDB stats error.)
|
|
493
|
+
|
|
494
|
+
Runnable versions of these (and an at-a-glance DB overview + a DuckDB primer) are in
|
|
495
|
+
[`query_examples.ipynb`](query_examples.ipynb) / `query_examples.py`.
|
|
496
|
+
|
|
497
|
+
---
|
|
498
|
+
|
|
499
|
+
## Use an LLM to write queries
|
|
500
|
+
|
|
501
|
+
Paste this README into your LLM as context, prefixed with something like:
|
|
502
|
+
|
|
503
|
+
> *You write DuckDB SQL against the AIND dynamic-foraging parquet cache described below. Rules:
|
|
504
|
+
> read the partitioned trial/event tables with `read_parquet('…/**/*.parquet',
|
|
505
|
+
> hive_partitioning=true, union_by_name=true)`; `CAST(subject_id AS VARCHAR)` whenever you
|
|
506
|
+
> filter or join `subject_id` on those tables; quote `subject_id`/`session_date` values (they are strings);
|
|
507
|
+
> the session key is `_session_id` (session table) ↔ `session_id` (trial/event); always SELECT
|
|
508
|
+
> `subject_id, session_date, session_id` as the leading columns and `ORDER BY subject_id,
|
|
509
|
+
> session_date`; project only the columns asked for; filter by `subject_id` when possible.
|
|
510
|
+
> Return a single runnable `duckdb.sql(...).df()` snippet. Schema and conventions follow:*
|
|
511
|
+
|
|
512
|
+
Then ask your question in plain English.
|
|
513
|
+
|
|
514
|
+
**Using a coding agent** (Claude Code, Codex, OpenCode, …)? This repo ships an
|
|
515
|
+
**`aind-dynamic-foraging-data-access`** skill (in `.claude/skills/`) with exactly this context.
|
|
516
|
+
With Claude Code it loads automatically when you work in the repo; for other agents, point them
|
|
517
|
+
at `.claude/skills/aind-dynamic-foraging-data-access/SKILL.md`. Then just ask for the data you
|
|
518
|
+
want — no need to paste this README.
|
|
519
|
+
|
|
520
|
+
---
|
|
521
|
+
|
|
522
|
+
## Read performance (full database — ~24k sessions, 12.5M trials, over S3)
|
|
523
|
+
|
|
524
|
+
**Scope the read to the subjects you need** — this is what the helpers do, and it dominates
|
|
525
|
+
selective-query latency:
|
|
526
|
+
|
|
527
|
+
- **Selective query via the helpers / a scoped read** (a handful of subjects, choice/reward
|
|
528
|
+
columns) → **~1 s**. `fetch_trials(sel, …)` / `read_trials(subjects)` read only those
|
|
529
|
+
subjects' partition files.
|
|
530
|
+
- **Full-table glob** (`/**/*.parquet` + `union_by_name`) → **~25 s cold** *before* any data is read:
|
|
531
|
+
it reads every subject file's footer to build the column union, even for one subject. Reuse a
|
|
532
|
+
single DuckDB connection and repeat queries drop to ~7 s; scoping the read avoids the footer scan entirely.
|
|
533
|
+
|
|
534
|
+
Whole-dataset reads (where you genuinely touch every subject, so the footer scan isn't extra):
|
|
535
|
+
|
|
536
|
+
- **5-column projection** (choice/reward/prob, + keys) → **~6 s, ~2 GB** — the normal analysis pattern.
|
|
537
|
+
- full 103-column trial table → **~53 s, ~21 GB**.
|
|
538
|
+
- `COUNT(*)` over the trial table → **~1 s**.
|
|
539
|
+
- **Return-loop join** (filter sessions → pull all their trials + events) → **~44 s** over S3.
|
|
540
|
+
|
|
541
|
+
### vs. the legacy `nwb_utils` route
|
|
542
|
+
|
|
543
|
+
The way to get this data *without* the new database is to open each session's NWB yourself —
|
|
544
|
+
`code_ocean_utils.get_subject_assets()` (docDB query) → `add_s3_location()` (S3 glob) →
|
|
545
|
+
`nwb_utils.create_df_trials()` / `create_df_events()` — **one session at a time**. That costs
|
|
546
|
+
**~23 s per session** (dominated by the **~17 s docDB query**; +~4 s to open/parse the NWB,
|
|
547
|
+
+~3 s for events), and it does **not** scale: there's no projection (you read the whole NWB to
|
|
548
|
+
get 5 columns) and every session pays the docDB round-trip again. The new database replaces the whole
|
|
549
|
+
chain with a single parquet scan:
|
|
550
|
+
|
|
551
|
+
| Fetch | **The new database** (~24k sessions) | **Legacy `nwb_utils`** (~12k sessions) |
|
|
552
|
+
|---|---|---|
|
|
553
|
+
| 1 session, trials | ~1 s | ~23 s |
|
|
554
|
+
| 100 sessions, trials | ~3 s | **~40 min** |
|
|
555
|
+
| Full DB, 5-col | **~6 s** | **~6 days** |
|
|
556
|
+
| Full DB, full 103-col | **~53 s** | ~6 days |
|
|
557
|
+
|
|
558
|
+
> **Not just faster — more complete.** Po-Chen's prior `nwb_utils` effort reached only **~12k
|
|
559
|
+
> sessions in ~6 days** — roughly **half** of what's here (~24k). The per-session route
|
|
560
|
+
> doesn't realistically scale to the full dataset, so in practice it also yielded **about half as much data**.
|
|
561
|
+
> The new database is the *complete* set, rebuilt end-to-end in **under 2 h**.
|
|
562
|
+
|
|
563
|
+
→ **~10,000× faster** at full-dataset scale (~6 days vs ~53 s for the full-width table), verified equivalent to a direct
|
|
564
|
+
`nwb_utils` read (33/33 sessions exact-match — see `README_build.md`). Solid = the new database (measured),
|
|
565
|
+
dashed = legacy `nwb_utils` (per-session cost, extrapolated):
|
|
566
|
+
|
|
567
|
+

|
|
568
|
+
|
|
569
|
+
Memory scales with the columns you select (a few columns ≈ 10× less RAM than the full width);
|
|
570
|
+
per-subject coalescing
|
|
571
|
+
keeps file-open overhead small even for full-width loads. See [`README_build.md`](README_build.md)
|
|
572
|
+
for build performance and the full validation results (data-equivalence + apples-to-apples vs Han).
|