aind-dynamic-foraging-database 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (25) hide show
  1. aind_dynamic_foraging_database-0.0.1/LICENSE +21 -0
  2. aind_dynamic_foraging_database-0.0.1/PKG-INFO +572 -0
  3. aind_dynamic_foraging_database-0.0.1/README.md +541 -0
  4. aind_dynamic_foraging_database-0.0.1/pyproject.toml +91 -0
  5. aind_dynamic_foraging_database-0.0.1/setup.cfg +4 -0
  6. aind_dynamic_foraging_database-0.0.1/src/aind_dynamic_foraging_database/__init__.py +31 -0
  7. aind_dynamic_foraging_database-0.0.1/src/aind_dynamic_foraging_database/build_cache.py +274 -0
  8. aind_dynamic_foraging_database-0.0.1/src/aind_dynamic_foraging_database/query.py +284 -0
  9. aind_dynamic_foraging_database-0.0.1/src/aind_dynamic_foraging_database/query_examples.py +112 -0
  10. aind_dynamic_foraging_database-0.0.1/src/aind_dynamic_foraging_database/util/__init__.py +1 -0
  11. aind_dynamic_foraging_database-0.0.1/src/aind_dynamic_foraging_database/util/nwb_reader_aind.py +77 -0
  12. aind_dynamic_foraging_database-0.0.1/src/aind_dynamic_foraging_database/util/nwb_reader_legacy.py +398 -0
  13. aind_dynamic_foraging_database-0.0.1/src/aind_dynamic_foraging_database/util/parquet_builder.py +1203 -0
  14. aind_dynamic_foraging_database-0.0.1/src/aind_dynamic_foraging_database/util/postprocess.py +208 -0
  15. aind_dynamic_foraging_database-0.0.1/src/aind_dynamic_foraging_database/validate/__init__.py +1 -0
  16. aind_dynamic_foraging_database-0.0.1/src/aind_dynamic_foraging_database/validate/plot_validation.py +50 -0
  17. aind_dynamic_foraging_database-0.0.1/src/aind_dynamic_foraging_database/validate/validate_step1.py +210 -0
  18. aind_dynamic_foraging_database-0.0.1/src/aind_dynamic_foraging_database/validate/validate_step2.py +111 -0
  19. aind_dynamic_foraging_database-0.0.1/src/aind_dynamic_foraging_database.egg-info/PKG-INFO +572 -0
  20. aind_dynamic_foraging_database-0.0.1/src/aind_dynamic_foraging_database.egg-info/SOURCES.txt +23 -0
  21. aind_dynamic_foraging_database-0.0.1/src/aind_dynamic_foraging_database.egg-info/dependency_links.txt +1 -0
  22. aind_dynamic_foraging_database-0.0.1/src/aind_dynamic_foraging_database.egg-info/requires.txt +21 -0
  23. aind_dynamic_foraging_database-0.0.1/src/aind_dynamic_foraging_database.egg-info/top_level.txt +1 -0
  24. aind_dynamic_foraging_database-0.0.1/tests/test_parquet_cache.py +411 -0
  25. aind_dynamic_foraging_database-0.0.1/tests/test_query.py +126 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2022 Allen Institute for Neural Dynamics
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,572 @@
1
+ Metadata-Version: 2.4
2
+ Name: aind-dynamic-foraging-database
3
+ Version: 0.0.1
4
+ Summary: Query (and build) the AIND dynamic-foraging behavioral parquet database.
5
+ Author: Allen Institute for Neural Dynamics
6
+ License: MIT
7
+ Classifier: Programming Language :: Python :: 3
8
+ Requires-Python: >=3.9
9
+ Description-Content-Type: text/markdown
10
+ License-File: LICENSE
11
+ Requires-Dist: duckdb
12
+ Requires-Dist: pandas
13
+ Provides-Extra: build
14
+ Requires-Dist: pynwb; extra == "build"
15
+ Requires-Dist: hdmf_zarr; extra == "build"
16
+ Requires-Dist: h5py; extra == "build"
17
+ Requires-Dist: s3fs>=2025.10.0; extra == "build"
18
+ Requires-Dist: pyarrow; extra == "build"
19
+ Requires-Dist: aind-data-access-api; extra == "build"
20
+ Requires-Dist: codeocean; extra == "build"
21
+ Requires-Dist: aind-dynamic-foraging-data-utils; extra == "build"
22
+ Provides-Extra: dev
23
+ Requires-Dist: black; extra == "dev"
24
+ Requires-Dist: coverage; extra == "dev"
25
+ Requires-Dist: flake8; extra == "dev"
26
+ Requires-Dist: interrogate; extra == "dev"
27
+ Requires-Dist: isort; extra == "dev"
28
+ Requires-Dist: Sphinx; extra == "dev"
29
+ Requires-Dist: furo; extra == "dev"
30
+ Dynamic: license-file
31
+
32
+ # AIND Dynamic Foraging Database
33
+
34
+ [![License](https://img.shields.io/badge/license-MIT-brightgreen)](LICENSE)
35
+ ![Code Style](https://img.shields.io/badge/code%20style-black-black)
36
+ [![semantic-release: angular](https://img.shields.io/badge/semantic--release-angular-e10079?logo=semantic-release)](https://github.com/semantic-release/semantic-release)
37
+ ![Interrogate](https://img.shields.io/badge/interrogate-100.0%25-brightgreen)
38
+ ![Coverage](https://img.shields.io/badge/coverage-83%25-yellow?logo=codecov)
39
+ ![Python](https://img.shields.io/badge/python->=3.9-blue?logo=python)
40
+
41
+ The **single, queryable source of truth for _all_ AIND dynamic-foraging behavior** — every
42
+ session / trial / event, unified into one **parquet database** on a public S3 bucket. Query any
43
+ mice / sessions, or the whole dataset, in **seconds** with a few Python calls (DuckDB + pandas) —
44
+ instead of opening thousands of NWBs.
45
+
46
+ > **~24k sessions · 12.5M trials · 117M events** — the *complete* dataset, [**~10,000× faster**](#vs-the-legacy-nwb_utils-route) to
47
+ > query. (Per Po-Chen's test, reading data directly from NWBs via `aind-dynamic-foraging-data-utils`
48
+ > took **~6 days and reached only ~12k sessions** — about half.)
49
+
50
+ ## Installation
51
+
52
+ Querying is lightweight — just `duckdb` + `pandas`:
53
+
54
+ ```bash
55
+ uv add aind-dynamic-foraging-database # uv (recommended)
56
+ pip install aind-dynamic-foraging-database
57
+ ```
58
+
59
+ Or install the latest straight from GitHub (prefix with `!` in a notebook):
60
+
61
+ ```bash
62
+ pip install "git+https://github.com/AllenNeuralDynamics/aind-dynamic-foraging-database.git"
63
+ ```
64
+
65
+ To **build or extend** the database from NWBs, install the `build` extra (adds the NWB readers
66
+ and `aind-dynamic-foraging-data-utils`) — see [`README_build.md`](README_build.md):
67
+
68
+ ```bash
69
+ uv add "aind-dynamic-foraging-database[build]"
70
+ pip install "aind-dynamic-foraging-database[build]"
71
+ ```
72
+
73
+ ---
74
+
75
+ > 🚀 **Start with the query helpers** — importable from `aind_dynamic_foraging_database`,
76
+ > they wrap DuckDB and hand back a pandas DataFrame:
77
+ > - **`select_sessions(where=…, subjects=…, columns=…)`** — filter the (small) session table on any
78
+ > metric / metadata (or a subject list); returns a DataFrame of the selected sessions.
79
+ > - **`fetch_trials(sel, …)` / `fetch_events(sel, …)`** — pull those sessions' trials / events with the
80
+ > session metadata joined onto every row, reading only the selected subjects' partitions (fast).
81
+ > - **`read_trials(subjects)` / `read_events(subjects)`** — escape hatch: a fast, partition-scoped
82
+ > `read_parquet(...)` clause to drop into any DuckDB SQL you write (aggregations, windows, joins).
83
+ >
84
+ > See [**Quick start**](#quick-start--the-query-helpers) for runnable examples; drop to native SQL
85
+ > only when the helpers don't cover what you need.
86
+
87
+ > 💡 **Need custom DuckDB SQL? Let an LLM write it.** This README is self-contained: paste the
88
+ > whole file into the LLM of your choice (Claude / ChatGPT / Cursor / …) as context, then ask
89
+ > in plain English (e.g. *"trials for subjects 754372 and 758435 with foraging_eff > 0.8"*).
90
+ > It will return runnable DuckDB that follows the conventions below — including the key
91
+ > columns. See [**Use an LLM**](#use-an-llm-to-write-queries) for a copy-paste preamble — or, with
92
+ > a coding agent (Claude Code / Codex / OpenCode), load the `aind-dynamic-foraging-data-access`
93
+ > skill in `.claude/skills/` instead.
94
+
95
+ > 📊 **Prefer to browse the session metadata visually?** The interactive
96
+ > [**foraging behavior browser**](https://foraging-behavior-browser.allenneuraldynamics.org/)
97
+ > (Streamlit) renders this same session table with rich plots and point-and-click filters —
98
+ > a great way to find sessions/subjects before pulling their trials/events here.
99
+ > *Caveat:* the app is built from **Han's pipeline** only, so the **~381 CO-only sessions** this
100
+ > cache adds from the Code Ocean universe (all `nwb_data_source = 'co_asset'`, with NULL Han
101
+ > metadata — find them via `WHERE foraging_eff IS NULL`) **do not appear in the app**, even though
102
+ > their trials/events are fully in the cache.
103
+
104
+ > 🔧 **Building or extending the database?** See **[`README_build.md`](README_build.md)**.
105
+
106
+ ---
107
+
108
+ ## The database
109
+
110
+ Three tables on a **public** S3 bucket (`s3://aind-scratch-data/aind-dynamic-foraging-cache/`):
111
+
112
+ | Table | Path | Grain | Size |
113
+ |---|---|---|---|
114
+ | **session** | `session_table.parquet` | one row per session | ~24k rows × 160 cols (~MB) |
115
+ | **trial** | `trial_table/subject_id=<id>/…parquet` | one row per trial | ~12.5M rows × 103 cols (~21 GB) |
116
+ | **event** | `event_table/subject_id=<id>/…parquet` | one row per behavioral event | ~117M rows × 10 cols (~9 events / trial) |
117
+
118
+ The trial/event tables are **Hive-partitioned by `subject_id`** and coalesced to one file per
119
+ subject. The bucket is **public — DuckDB reads `s3://` natively with no AWS credentials or
120
+ setup** (httpfs auto-loads). Point at a local directory instead to query a local build.
121
+
122
+ The paths are importable:
123
+
124
+ ```python
125
+ from aind_dynamic_foraging_database import SESSION_DB, TRIAL_DB, EVENT_DB
126
+ ```
127
+
128
+ ---
129
+
130
+ ## Quick start — the query helpers
131
+
132
+ Reach for the helpers first. They do the fiddly, easy-to-get-wrong part (reading the right
133
+ partition files, fast *and* correct) and hand back a pandas DataFrame. Drop to
134
+ [native SQL](#native-sql-what-the-helpers-are-built-on) only when you need more.
135
+
136
+ ```python
137
+ from aind_dynamic_foraging_database import (
138
+ select_sessions, fetch_trials, fetch_events,
139
+ )
140
+
141
+ # 1) Filter the (small) session table on any metric / metadata.
142
+ sel = select_sessions("task LIKE '%Uncoupled%' AND finished_trials > 500 AND finished_rate > 0.9")
143
+
144
+ # 2) Pull those sessions' trials — session metadata is joined onto every row.
145
+ trials = fetch_trials(sel, columns=["animal_response", "earned_reward",
146
+ "reward_probabilityL", "reward_probabilityR"])
147
+
148
+ # ... or their events (optionally restricted to certain event types).
149
+ licks = fetch_events(sel, events=["left_lick_time", "right_lick_time"])
150
+ ```
151
+
152
+ **Two workflows, same two calls** — the only difference is how you filter the session table:
153
+
154
+ - **Filter on session metrics/metadata, then fetch** — pass a `where=` predicate (any column
155
+ of the session table; see [filter columns](#common-filter-columns-session-table)).
156
+ - **Subject first, then session, then fetch** — pass `subjects=[...]` (optionally with `where=`):
157
+
158
+ ```python
159
+ sel = select_sessions("finished_trials > 200", subjects=["754372", "758435"])
160
+ trials = fetch_trials(sel, columns=["animal_response", "earned_reward"])
161
+ ```
162
+
163
+ `fetch_trials` / `fetch_events` read **only the selected subjects' partition files** (≈1 s) rather
164
+ than every subject's, and inner-join to your selection, so you get exactly those sessions' rows with
165
+ their metadata attached — one row per trial/event, leading `subject_id, session_date,
166
+ session_id`. Add `columns=` to project specific columns (default is a small choice/reward set;
167
+ `columns="*"` returns all — large).
168
+
169
+ ### Need more than the helpers cover? Drop to SQL on a fast source
170
+
171
+ The helpers don't try to express *every* query (aggregations, window functions, trial↔event
172
+ joins). For those, `read_trials(subjects)` / `read_events(subjects)` return a **fast,
173
+ partition-scoped `read_parquet(...)` clause** you drop into any SQL — so you keep the full
174
+ power of SQL without the slow full-table glob:
175
+
176
+ ```python
177
+ import duckdb
178
+ from aind_dynamic_foraging_database import read_trials
179
+
180
+ src = read_trials(["754372", "758435"]) # scoped -> reads only these subjects' files
181
+ duckdb.sql(f"""
182
+ SELECT subject_id, COUNT(*) AS n_trials, AVG(earned_reward::DOUBLE) AS reward_rate
183
+ FROM {src} GROUP BY subject_id ORDER BY subject_id
184
+ """).df()
185
+ ```
186
+
187
+ `read_trials()` / `read_events()` with no arguments return the full-table glob (correct for any
188
+ query, but reads every subject file's footer — slow; scope to subjects whenever you can).
189
+
190
+ > All helpers query the public S3 cache by default. Pass `base=` (a local dir or another S3
191
+ > prefix) to any of them to query a different build.
192
+
193
+ ---
194
+
195
+ ## Native SQL (what the helpers are built on)
196
+
197
+ Everything below is the raw DuckDB layer. Use it directly when you want full control — or to
198
+ understand what the helpers do under the hood. (You can still read the session table directly,
199
+ e.g. `duckdb.sql(f"SELECT COUNT(*) FROM read_parquet('{SESSION_DB}') WHERE subject_id = '754372'")`.)
200
+
201
+ ## The three read options (always use these on the partitioned tables)
202
+
203
+ ```python
204
+ READ_TRIALS = f"read_parquet('{TRIAL_DB}/**/*.parquet', hive_partitioning=true, union_by_name=true)"
205
+ READ_EVENTS = f"read_parquet('{EVENT_DB}/**/*.parquet', hive_partitioning=true, union_by_name=true)"
206
+ ```
207
+
208
+ - **`hive_partitioning=true`** — exposes `subject_id` from the directory name and prunes
209
+ partitions, so filtering by `subject_id` reads only that mouse's file(s).
210
+ - **`union_by_name=true`** — merges columns across the three NWB readers (a column missing in
211
+ some files fills with `NULL` instead of erroring).
212
+ - **`CAST(subject_id AS VARCHAR)`** — in the trial/event tables `subject_id` comes from the
213
+ partition directory and DuckDB infers it as **BIGINT**; the session table stores it as a
214
+ **string**. Always cast when filtering, joining, or grouping on `subject_id` in the trial/event tables.
215
+
216
+ The session table is a single plain parquet — read it with `read_parquet('{SESSION_DB}')`
217
+ (no options needed).
218
+
219
+ ---
220
+
221
+ ## Keys & joins
222
+
223
+ - Session key column is **`_session_id`** in the session table; the trial/event tables call it
224
+ **`session_id`** (same value: `"{subject_id}_{session_date}_{nwb_suffix}"`).
225
+ - Canonical pattern — **filter sessions, then JOIN to trials** so every trial row carries its
226
+ session metadata:
227
+
228
+ ```python
229
+ df = duckdb.sql(f"""
230
+ WITH sel AS (
231
+ SELECT _session_id, subject_id, session_date, task, foraging_eff
232
+ FROM read_parquet('{SESSION_DB}')
233
+ WHERE task LIKE '%Uncoupled%' AND foraging_eff > 0.8
234
+ )
235
+ SELECT s.subject_id, s.session_date, t.session_id,
236
+ t.animal_response, t.earned_reward,
237
+ t.reward_probabilityL, t.reward_probabilityR
238
+ FROM {READ_TRIALS} t
239
+ JOIN sel s ON t.session_id = s._session_id
240
+ WHERE CAST(t.subject_id AS VARCHAR) IN (SELECT subject_id FROM sel)
241
+ ORDER BY s.subject_id, s.session_date
242
+ """).df()
243
+ ```
244
+
245
+ (The extra `WHERE CAST(subject_id …) IN (…)` lets DuckDB prune partitions before the join.)
246
+
247
+ ---
248
+
249
+ ## Common filter columns (session table)
250
+
251
+ Almost all analyses start by selecting sessions on a few columns of the **session table**, then
252
+ joining to trials/events. The columns you'll filter on most:
253
+
254
+ | Filter on | Column(s) | Example values / predicate |
255
+ |---|---|---|
256
+ | **Identity** | `subject_id`, `session_date` | `subject_id IN ('754372','758435')`; `session_date >= '2024-01-01'` |
257
+ | **Institute / hardware / rig** | `institute`, `hardware`, `rig_type`, `room` | `institute`: `AIND` \| `Janelia`; `hardware`: `bonsai` \| `bpod`; `rig_type`: `training` \| `ephys`; `room`: `447`, `446`, … → e.g. `institute = 'Janelia'`, `hardware = 'bpod'`, `rig_type = 'ephys'` |
258
+ | **Behavior task** | `task` | `Uncoupled Baiting`, `Coupled Baiting`, `Uncoupled Without Baiting`, `Coupled Without Baiting` → `task LIKE '%Uncoupled%'` |
259
+ | **Curriculum** | `curriculum_name`, `curriculum_version` | e.g. `Uncoupled Baiting` / `'2.3'`; **`'None'` = off-curriculum** → `curriculum_name <> 'None'` for on-curriculum only |
260
+ | **Curriculum stage** | `current_stage_actual` | `STAGE_1_WARMUP` → `STAGE_1` → `STAGE_2/3/4` → `STAGE_FINAL` → `GRADUATED` (`'None'` = off-curriculum). For fully-trained sessions use the **"Final stages"**: `current_stage_actual IN ('STAGE_FINAL', 'GRADUATED')` (see note below) |
261
+ | **Performance metrics** | `finished_trials`, `finished_rate`, `foraging_eff`, `total_trials`, `reward_trials`, `bias_naive`, … | combine freely: `foraging_eff > 0.8 AND finished_trials > 200 AND finished_rate > 0.7` |
262
+
263
+ > 💡 **Use `institute` / `hardware` / `rig_type` for high-level grouping** (clean values:
264
+ > `AIND`/`Janelia`, `bonsai`/`bpod`, `training`/`ephys`). `data_source` is their fine-grained
265
+ > concatenation (e.g. `AIND_training_447_bonsai`) — usually too granular to filter on directly.
266
+ > And **`data_source` ≠ `nwb_data_source`**: `nwb_data_source` (`co_asset`/`bonsai_s3`/`bpod_s3`)
267
+ > is just *which NWB the cache built the row from*, not a science filter.
268
+ >
269
+ > **Curriculum "off" vs "missing":** off-curriculum sessions have the **string** `curriculum_name
270
+ > = 'None'` (and `curriculum_version = 'None'`); the ~381 CO-only sessions absent from Han's pipeline have
271
+ > SQL `NULL`. `curriculum_name NOT IN ('None')` keeps on-curriculum sessions (it also drops the
272
+ > NULLs).
273
+ >
274
+ > **"Final stages" — `STAGE_FINAL` vs `GRADUATED`:** a curriculum's terminal *training
275
+ > parameters* are reached at `current_stage_actual = 'STAGE_FINAL'`; once a mouse meets the
276
+ > graduation criteria the stage is relabeled `'GRADUATED'` — but **both run the identical task
277
+ > parameters**. So for "fully-trained" sessions, treat them as one:
278
+ > `current_stage_actual IN ('STAGE_FINAL', 'GRADUATED')`.
279
+ >
280
+ > **Curriculum vs. `task` — related but not the same.** `curriculum_name` is the auto-training
281
+ > *program* a mouse is enrolled in (named after its **target task**, and constant as the mouse
282
+ > progresses); `task` is the paradigm **actually run that session**, which changes by stage
283
+ > because the curriculum ramps difficulty. E.g. the *Uncoupled Baiting* curriculum runs the
284
+ > easier **Coupled Baiting** task in `STAGE_1_WARMUP`→`STAGE_2`, then switches to **Uncoupled
285
+ > Baiting** from `STAGE_3`→`STAGE_FINAL`/`GRADUATED`. So filter **`curriculum_name`** to pick mice
286
+ > *enrolled in* a program, and **`task`** to pick sessions that *actually ran* a paradigm — they
287
+ > match for most sessions but differ for ~3.2k on-curriculum sessions (the early stages).
288
+
289
+ ---
290
+
291
+ ## Schema catalog
292
+
293
+ Column types come straight from the files. To list **every** column of a table:
294
+
295
+ ```python
296
+ duckdb.sql(f"DESCRIBE SELECT * FROM read_parquet('{SESSION_DB}')").df() # 160 cols
297
+ duckdb.sql(f"DESCRIBE SELECT * FROM {READ_TRIALS}").df() # 103 cols
298
+ duckdb.sql(f"DESCRIBE SELECT * FROM {READ_EVENTS}").df() # 10 cols
299
+ ```
300
+
301
+ ### session table (key columns; 160 total)
302
+ | column | type | meaning |
303
+ |---|---|---|
304
+ | `_session_id` | VARCHAR | session key → join to trial/event `session_id` |
305
+ | `subject_id` | VARCHAR | mouse ID (string) |
306
+ | `session_date` | VARCHAR | `YYYY-MM-DD` |
307
+ | `nwb_suffix` | BIGINT | session start `HHMMSS` as int (disambiguates same-day sessions) |
308
+ | `task` | VARCHAR | e.g. `Uncoupled Baiting`, `Coupled Baiting`, `Uncoupled Without Baiting` |
309
+ | `total_trials` | DOUBLE | foraging trials, **autowater excluded** |
310
+ | `total_trials_with_autowater` | DOUBLE | all trials (= trial-table `COUNT(*)`) |
311
+ | `finished_trials` | DOUBLE | non-ignored foraging trials |
312
+ | `ignored_trials` | DOUBLE | foraging trials with no response |
313
+ | `finished_rate`, `ignore_rate` | DOUBLE | finished / ignored fraction |
314
+ | `reward_trials` | DOUBLE | earned (non-autowater) rewards |
315
+ | `reward_rate` | DOUBLE | reward / finished |
316
+ | `foraging_eff` | DOUBLE | foraging efficiency vs ideal |
317
+ | `foraging_performance` | DOUBLE | `foraging_eff × finished_rate` |
318
+ | `bias_naive` | DOUBLE | side bias, −1 (left) … +1 (right) |
319
+ | `autowater_collected`, `autowater_ignored` | DOUBLE | autowater trial counts |
320
+ | `reaction_time_median`, `early_lick_rate` | DOUBLE | timing / lick metrics |
321
+ | `institute`, `hardware`, `rig_type` | VARCHAR | **high-level grouping** — `AIND`/`Janelia`, `bonsai`/`bpod`, `training`/`ephys` |
322
+ | `room` | VARCHAR | rig room (`447`, `446`, `347`, …) |
323
+ | `data_source` | VARCHAR | fine-grained composite ≈ `{institute}_{rig_type}_{room}_{hardware}` (e.g. `AIND_training_447_bonsai`) — **≠ `nwb_data_source`** |
324
+ | `curriculum_name`, `curriculum_version` | VARCHAR | curriculum + version; **`'None'` = off-curriculum**, `NULL` = not in Han's pipeline |
325
+ | `current_stage_actual` | VARCHAR | curriculum stage reached: `STAGE_1_WARMUP`…`STAGE_FINAL`/`GRADUATED` — the two **"Final stages"** (`STAGE_FINAL`, `GRADUATED`) share training parameters; `'None'` = off-curriculum |
326
+ | `rig`, `trainer`, `PI` | VARCHAR | session metadata |
327
+ | `weight_after`, `water_in_session_total` | DOUBLE | weight / water |
328
+ | `logistic_*`, `abs(*_bias)` | DOUBLE | fitted logistic-regression model coefficients |
329
+ | `nwb_data_source` | VARCHAR | `co_asset` \| `bonsai_s3` \| `bpod_s3` — which NWB the cache built the row from (not a science filter) |
330
+ | `co_asset_id`, `co_s3_nwb_uri` | VARCHAR | Code Ocean asset id / NWB URI (NULL if none) |
331
+
332
+ ### trial table (key columns; 103 total)
333
+ | column | type | meaning |
334
+ |---|---|---|
335
+ | `session_id` | VARCHAR | join key → session `_session_id` |
336
+ | `subject_id` | BIGINT* | mouse ID — *cast to VARCHAR when filtering* (partition column) |
337
+ | `session_date` | VARCHAR | `YYYY-MM-DD` |
338
+ | `nwb_suffix` | BIGINT | session suffix |
339
+ | `trial` | DOUBLE | trial index within the session |
340
+ | `animal_response` | DOUBLE | **0 = lick left, 1 = lick right, 2 = ignore (no response)** |
341
+ | `earned_reward` | BOOLEAN | earned a (non-autowater) reward (= `rewarded_historyL OR rewarded_historyR`) |
342
+ | `rewarded_historyL` / `rewarded_historyR` | BOOLEAN | reward delivered on left / right |
343
+ | `reward_probabilityL` / `reward_probabilityR` | DOUBLE | scheduled reward prob per side |
344
+ | `auto_waterL` / `auto_waterR` | BIGINT | autowater given on left / right (**non-autowater trial = both 0**) |
345
+ | `reward_random_number_left` / `_right` | DOUBLE | the draw used for baiting |
346
+ | `goCue_start_time_in_session` | DOUBLE | go-cue time (s from session start) |
347
+ | `choice_time_in_session` | DOUBLE | choice (lick) time (s) |
348
+ | `reward_time_in_session` | DOUBLE | reward time (s) |
349
+ | `reaction_time` | DOUBLE | choice − go-cue (s) |
350
+ | `laser_*` | mixed | optogenetics parameters (NULL on non-opto trials) |
351
+ | `nwb_data_source` | VARCHAR | reader source |
352
+
353
+ ### event table (all 10 columns)
354
+ | column | type | meaning |
355
+ |---|---|---|
356
+ | `session_id` | VARCHAR | join key → session `_session_id` |
357
+ | `subject_id` | BIGINT* | mouse ID — *cast to VARCHAR when filtering* |
358
+ | `session_date` | VARCHAR | `YYYY-MM-DD` |
359
+ | `nwb_suffix` | BIGINT | session suffix |
360
+ | `trial` | DOUBLE | trial index this event falls in (−1 before first go-cue) |
361
+ | `timestamps` | DOUBLE | event time, s from session start |
362
+ | `raw_timestamps` | DOUBLE | original NWB timestamp (un-aligned) |
363
+ | `event` | VARCHAR | one of: `goCue_start_time`, `left_lick_time`, `right_lick_time`, `left_reward_delivery_time`, `right_reward_delivery_time`, `optogenetics_time` |
364
+ | `data` | VARCHAR | event payload (string-normalized) |
365
+ | `nwb_data_source` | VARCHAR | reader source |
366
+
367
+ ---
368
+
369
+ ## Conventions & gotchas
370
+
371
+ - **Cast `subject_id`** on the trial/event tables — when filtering, joining, **and grouping**:
372
+ `WHERE CAST(subject_id AS VARCHAR) = '754372'`, `GROUP BY CAST(subject_id AS VARCHAR)`
373
+ (partition column is BIGINT; session-table column is string; grouping on the raw BIGINT
374
+ partition column can also trip a DuckDB stats error).
375
+ - **`subject_id` and `session_date` are strings** — quote them (`'754372'`, `'2024-05-01'`).
376
+ - **Session key naming:** session table `_session_id` ↔ trial/event `session_id`.
377
+ - **Autowater:** `total_trials` **excludes** autowater; `total_trials_with_autowater` is all
378
+ trials (and equals the trial table's `COUNT(*)`). A trial is non-autowater iff
379
+ `auto_waterL = 0 AND auto_waterR = 0`. `reward_trials` counts earned (non-autowater) rewards.
380
+ - **`animal_response`:** 0 = left, 1 = right, 2 = ignore.
381
+ - **Performance:** *filter by `subject_id`* (prunes partitions) and *project only the columns
382
+ you need*. `SELECT *` over trials is ~21 GB; the choice/reward 5-column slice is ~2 GB / ~6 s.
383
+ - **NULLs:** `union_by_name` fills reader-specific columns with `NULL`; numeric comparisons
384
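+ (`> 0.8`) drop NULL rows — but not a true float `NaN`, which DuckDB orders above every number (add `NOT isnan(col)` if a column can hold one).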
385
+ - **⚠️ The ~381 CO-only sessions have NO Han metadata.** Sessions added from the Code Ocean
386
+ universe but absent from Han's pipeline (`nwb_data_source = 'co_asset'`) have **only the
387
+ identity + CO columns populated** (`subject_id, session_date, nwb_suffix, _session_id,
388
+ co_asset_id, co_s3_nwb_uri, nwb_data_source`); **all Han columns are NULL** (`task`,
389
+ `institute`, `hardware`, `curriculum_*`, `foraging_eff`, `finished_trials`, every metric). So
390
+ **any filter on a Han column silently excludes them** (NULL fails every comparison — they
391
+ "never return"). Their **trials/events are fully in the cache**, so reach them by
392
+ `subject_id`/`session_id` (or isolate them with `WHERE foraging_eff IS NULL`). **We plan to
393
+ rebuild the session metric table directly from the cache** — recomputing these per-session
394
+ stats from the trial data (the single source of truth) — which will fill in the CO-only
395
+ sessions and eventually supersede Han's pipeline as the source of session metadata.
396
+
397
+ ---
398
+
399
+ ## Output-formatting rules
400
+
401
+ So results stay identifiable and joinable, **every query that returns trial/event/session rows
402
+ should**:
403
+
404
+ 1. `SELECT subject_id, session_date, session_id` as the **leading** columns (the key is named `_session_id` when selecting from the session table);
405
+ 2. return **one row per trial / event / session**;
406
+ 3. `ORDER BY subject_id, session_date` (then `trial`/`timestamps` where relevant).
407
+
408
+ ---
409
+
410
+ ## Worked examples
411
+
412
+ ```python
413
+ import duckdb
414
+ from aind_dynamic_foraging_database import SESSION_DB, TRIAL_DB, EVENT_DB
415
+ READ_TRIALS = f"read_parquet('{TRIAL_DB}/**/*.parquet', hive_partitioning=true, union_by_name=true)"
416
+ READ_EVENTS = f"read_parquet('{EVENT_DB}/**/*.parquet', hive_partitioning=true, union_by_name=true)"
417
+ ```
418
+
419
+ **1. Count records (with a filter)**
420
+ ```python
421
+ duckdb.sql(f"SELECT COUNT(*) FROM read_parquet('{SESSION_DB}') WHERE foraging_eff > 0.8").df()
422
+ duckdb.sql(f"SELECT COUNT(*) FROM {READ_TRIALS} WHERE CAST(subject_id AS VARCHAR) = '754372'").df()
423
+ ```
424
+
425
+ **2. Fetch selected columns for a list of subjects**
426
+ ```python
427
+ duckdb.sql(f"""
428
+ SELECT subject_id, session_date, session_id, animal_response, earned_reward
429
+ FROM {READ_TRIALS}
430
+ WHERE CAST(subject_id AS VARCHAR) IN ('754372', '758435')
431
+ ORDER BY subject_id, session_date
432
+ """).df()
433
+ ```
434
+
435
+ **3. Filter by `(subject_id, session_date)` combinations**
436
+ ```python
437
+ duckdb.sql(f"""
438
+ SELECT subject_id, session_date, session_id, trial, animal_response, earned_reward
439
+ FROM {READ_TRIALS}
440
+ WHERE (CAST(subject_id AS VARCHAR), session_date) IN (
441
+ ('754372', '2024-05-01'), ('758435', '2024-06-12')
442
+ )
443
+ ORDER BY subject_id, session_date, trial
444
+ """).df()
445
+ ```
446
+
447
+ **4. Filter on source + task + curriculum + performance metrics → join to trials**
448
+ ```python
449
+ duckdb.sql(f"""
450
+ WITH sel AS (
451
+ SELECT _session_id, subject_id, session_date
452
+ FROM read_parquet('{SESSION_DB}')
453
+ WHERE institute = 'AIND' AND hardware = 'bonsai' -- institute / hardware
454
+ AND task LIKE '%Uncoupled%' -- behavior task
455
+ AND curriculum_name NOT IN ('None') -- on-curriculum only
456
+ AND foraging_eff > 0.8 AND finished_trials > 200 -- performance metrics
457
+ )
458
+ SELECT s.subject_id, s.session_date, t.session_id, t.trial,
459
+ t.animal_response, t.earned_reward, t.reward_probabilityL, t.reward_probabilityR
460
+ FROM {READ_TRIALS} t
461
+ JOIN sel s ON t.session_id = s._session_id
462
+ WHERE CAST(t.subject_id AS VARCHAR) IN (SELECT subject_id FROM sel)
463
+ ORDER BY s.subject_id, s.session_date, t.trial
464
+ """).df()
465
+ ```
466
+
467
+ **5. Events (e.g. licks) for selected sessions**
468
+ ```python
469
+ duckdb.sql(f"""
470
+ SELECT subject_id, session_date, session_id, trial, timestamps, event
471
+ FROM {READ_EVENTS}
472
+ WHERE CAST(subject_id AS VARCHAR) = '754372'
473
+ AND event IN ('left_lick_time', 'right_lick_time')
474
+ ORDER BY subject_id, session_date, timestamps
475
+ """).df()
476
+ ```
477
+
478
+ **6. Per-subject aggregate (all in SQL)**
479
+ ```python
480
+ duckdb.sql(f"""
481
+ SELECT CAST(subject_id AS VARCHAR) AS subject_id,
482
+ COUNT(DISTINCT session_id) AS n_sessions,
483
+ COUNT(*) AS n_trials,
484
+ AVG(earned_reward::DOUBLE) AS reward_rate
485
+ FROM {READ_TRIALS}
486
+ WHERE CAST(subject_id AS VARCHAR) IN ('754372', '758435')
487
+ GROUP BY CAST(subject_id AS VARCHAR)
488
+ ORDER BY subject_id
489
+ """).df()
490
+ ```
491
+ (Cast `subject_id` in the `GROUP BY` too — grouping on the raw BIGINT partition column can hit
492
+ a DuckDB stats error.)
493
+
494
+ Runnable versions of these (and an at-a-glance DB overview + a DuckDB primer) are in
495
+ [`query_examples.ipynb`](query_examples.ipynb) / `query_examples.py`.
496
+
497
+ ---
498
+
499
+ ## Use an LLM to write queries
500
+
501
+ Paste this README into your LLM as context, prefixed with something like:
502
+
503
+ > *You write DuckDB SQL against the AIND dynamic-foraging parquet cache described below. Rules:
504
+ > read the partitioned trial/event tables with `read_parquet('…/**/*.parquet',
505
+ > hive_partitioning=true, union_by_name=true)`; `CAST(subject_id AS VARCHAR)` whenever you
506
+ > filter, join, or group by `subject_id` on those tables; quote `subject_id`/`session_date` values (they are strings);
507
+ > the session key is `_session_id` (session table) ↔ `session_id` (trial/event); always SELECT
508
+ > `subject_id, session_date, session_id` as the leading columns and `ORDER BY subject_id,
509
+ > session_date`; project only the columns asked for; filter by `subject_id` when possible.
510
+ > Return a single runnable `duckdb.sql(...).df()` snippet. Schema and conventions follow:*
511
+
512
+ Then ask your question in plain English.
513
+
514
+ **Using a coding agent** (Claude Code, Codex, OpenCode, …)? This repo ships an
515
+ **`aind-dynamic-foraging-data-access`** skill (in `.claude/skills/`) with exactly this context.
516
+ With Claude Code it loads automatically when you work in the repo; for other agents, point them
517
+ at `.claude/skills/aind-dynamic-foraging-data-access/SKILL.md`. Then just ask for the data you
518
+ want — no need to paste this README.
519
+
520
+ ---
521
+
522
+ ## Read performance (full database — ~24k sessions, 12.5M trials, over S3)
523
+
524
+ **Scope the read to the subjects you need** — this is what the helpers do, and it dominates
525
+ selective-query latency:
526
+
527
+ - **Selective query via the helpers / a scoped read** (a handful of subjects, choice/reward
528
+ columns) → **~1 s**. `fetch_trials(sel, …)` / `read_trials(subjects)` read only those
529
+ subjects' partition files.
530
+ - **Full-table glob** (`/**/*.parquet` + `union_by_name`) → **~25 s cold** *before* any data is read:
531
+ it reads every subject file's footer to build the column union, even for one subject. Reuse a
532
+ single DuckDB connection and repeats drop to ~7 s; scoping avoids it entirely.
533
+
534
+ Whole-dataset reads (where you genuinely touch every subject, so the footer scan isn't wasted overhead):
535
+
536
+ - **5-column projection** (choice/reward/prob, + keys) → **~6 s, ~2 GB** — the normal analysis pattern.
537
+ - **Full 103-column trial table** → **~53 s, ~21 GB**.
538
+ - `COUNT(*)` over the trial table → **~1 s**.
539
+ - **Return-loop join** (filter sessions → pull all their trials + events) → **~44 s** over S3.
540
+
541
+ ### vs. the legacy `nwb_utils` route
542
+
543
+ The way to get this data *without* the new database is to open each session's NWB yourself —
544
+ `code_ocean_utils.get_subject_assets()` (docDB query) → `add_s3_location()` (S3 glob) →
545
+ `nwb_utils.create_df_trials()` / `create_df_events()` — **one session at a time**. That costs
546
+ **~23 s per session** (dominated by the **~17 s docDB query**; +~4 s to open/parse the NWB,
547
+ +~3 s for events), and it does **not** scale: there's no projection (you read the whole NWB to
548
+ get 5 columns) and every session pays the docDB round-trip again. The new database replaces the whole
549
+ chain with a single parquet scan:
550
+
551
+ | Fetch | **The new database** (~24k sessions) | **Legacy `nwb_utils`** (~12k sessions) |
552
+ |---|---|---|
553
+ | 1 session, trials | ~1 s | ~23 s |
554
+ | 100 sessions, trials | ~3 s | **~40 min** |
555
+ | Full DB, 5-col | **~6 s** | **~6 days** |
556
+ | Full DB, full 103-col | **~53 s** | ~6 days |
557
+
558
+ > **Not just faster — more complete.** Po-Chen's prior `nwb_utils` effort reached only **~12k
559
+ > sessions in ~6 days** — roughly **half** of what's here (~24k). The per-session route
560
+ > doesn't realistically scale to the full dataset, so in practice it also yielded only **about half as much data**.
561
+ > The new database is the *complete* set, rebuilt end-to-end in **under 2 h**.
562
+
563
+ → **~10,000× faster** at full-dataset scale, verified equivalent to a direct
564
+ `nwb_utils` read (33/33 sessions exact-match — see `README_build.md`). In the plot below, solid = the new database (measured),
565
+ dashed = legacy `nwb_utils` (per-session cost, extrapolated):
566
+
567
+ ![Cache vs legacy nwb_utils fetch time](src/aind_dynamic_foraging_database/validate/cache_vs_legacy.png)
568
+
569
+ Memory scales with the columns you select (a few columns ≈ 10× less RAM than the full width);
570
+ per-subject coalescing
571
+ keeps file-open overhead small even for full-width loads. See [`README_build.md`](README_build.md)
572
+ for build performance and the full validation results (data-equivalence + apples-to-apples vs Han).