hyperstudy 0.2.1__tar.gz → 0.2.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {hyperstudy-0.2.1 → hyperstudy-0.2.2}/PKG-INFO +1 -1
- hyperstudy-0.2.2/docs/superpowers/specs/2026-04-10-recording-downloads-design.md +128 -0
- {hyperstudy-0.2.1 → hyperstudy-0.2.2}/pyproject.toml +1 -1
- {hyperstudy-0.2.1 → hyperstudy-0.2.2}/src/hyperstudy/__init__.py +1 -1
- {hyperstudy-0.2.1 → hyperstudy-0.2.2}/src/hyperstudy/_dataframe.py +47 -0
- hyperstudy-0.2.2/src/hyperstudy/_downloads.py +50 -0
- {hyperstudy-0.2.1 → hyperstudy-0.2.2}/src/hyperstudy/client.py +124 -0
- {hyperstudy-0.2.1 → hyperstudy-0.2.2}/tests/conftest.py +10 -0
- hyperstudy-0.2.2/tests/fixtures/recordings_response.json +71 -0
- hyperstudy-0.2.2/tests/fixtures/sparse_ratings_response.json +108 -0
- {hyperstudy-0.2.1 → hyperstudy-0.2.2}/tests/test_client.py +140 -0
- hyperstudy-0.2.2/tests/test_dataframe.py +182 -0
- hyperstudy-0.2.2/tests/test_downloads.py +105 -0
- hyperstudy-0.2.1/tests/test_dataframe.py +0 -78
- {hyperstudy-0.2.1 → hyperstudy-0.2.2}/.github/workflows/publish.yml +0 -0
- {hyperstudy-0.2.1 → hyperstudy-0.2.2}/.github/workflows/sync-release-notes.yml +0 -0
- {hyperstudy-0.2.1 → hyperstudy-0.2.2}/.github/workflows/test.yml +0 -0
- {hyperstudy-0.2.1 → hyperstudy-0.2.2}/.gitignore +0 -0
- {hyperstudy-0.2.1 → hyperstudy-0.2.2}/CHANGELOG.md +0 -0
- {hyperstudy-0.2.1 → hyperstudy-0.2.2}/LICENSE +0 -0
- {hyperstudy-0.2.1 → hyperstudy-0.2.2}/README.md +0 -0
- {hyperstudy-0.2.1 → hyperstudy-0.2.2}/src/hyperstudy/_display.py +0 -0
- {hyperstudy-0.2.1 → hyperstudy-0.2.2}/src/hyperstudy/_http.py +0 -0
- {hyperstudy-0.2.1 → hyperstudy-0.2.2}/src/hyperstudy/_pagination.py +0 -0
- {hyperstudy-0.2.1 → hyperstudy-0.2.2}/src/hyperstudy/_types.py +0 -0
- {hyperstudy-0.2.1 → hyperstudy-0.2.2}/src/hyperstudy/exceptions.py +0 -0
- {hyperstudy-0.2.1 → hyperstudy-0.2.2}/src/hyperstudy/experiments.py +0 -0
- {hyperstudy-0.2.1 → hyperstudy-0.2.2}/tests/__init__.py +0 -0
- {hyperstudy-0.2.1 → hyperstudy-0.2.2}/tests/fixtures/deployment_sessions_response.json +0 -0
- {hyperstudy-0.2.1 → hyperstudy-0.2.2}/tests/fixtures/deployment_single_response.json +0 -0
- {hyperstudy-0.2.1 → hyperstudy-0.2.2}/tests/fixtures/deployments_list_response.json +0 -0
- {hyperstudy-0.2.1 → hyperstudy-0.2.2}/tests/fixtures/error_401.json +0 -0
- {hyperstudy-0.2.1 → hyperstudy-0.2.2}/tests/fixtures/error_403.json +0 -0
- {hyperstudy-0.2.1 → hyperstudy-0.2.2}/tests/fixtures/events_response.json +0 -0
- {hyperstudy-0.2.1 → hyperstudy-0.2.2}/tests/fixtures/experiment_single_response.json +0 -0
- {hyperstudy-0.2.1 → hyperstudy-0.2.2}/tests/fixtures/experiments_list_response.json +0 -0
- {hyperstudy-0.2.1 → hyperstudy-0.2.2}/tests/fixtures/paginated_page1.json +0 -0
- {hyperstudy-0.2.1 → hyperstudy-0.2.2}/tests/fixtures/paginated_page2.json +0 -0
- {hyperstudy-0.2.1 → hyperstudy-0.2.2}/tests/fixtures/pre_experiment_response.json +0 -0
- {hyperstudy-0.2.1 → hyperstudy-0.2.2}/tests/fixtures/warnings_response.json +0 -0
- {hyperstudy-0.2.1 → hyperstudy-0.2.2}/tests/test_experiments.py +0 -0
- {hyperstudy-0.2.1 → hyperstudy-0.2.2}/tests/test_pagination.py +0 -0
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
# Recording Downloads via Python SDK
|
|
2
|
+
|
|
3
|
+
## Problem
|
|
4
|
+
|
|
5
|
+
The Python SDK's `get_recordings()` returns metadata only. Users need the actual audio/video files for offline analysis (ML models, manual review, archival). Currently they must manually extract `downloadUrl` from each record and fetch files themselves.
|
|
6
|
+
|
|
7
|
+
## Decision: SDK-only, no backend changes
|
|
8
|
+
|
|
9
|
+
The V3 API already returns signed GCS download URLs (7-day expiry) in the recording metadata. The SDK will fetch metadata and download files in the same call, so URL expiry is not a practical concern. This matches how the frontend downloads recordings.
|
|
10
|
+
|
|
11
|
+
## API Surface
|
|
12
|
+
|
|
13
|
+
### `download_recordings()` — Bulk download
|
|
14
|
+
|
|
15
|
+
```python
|
|
16
|
+
df = hs.download_recordings(
|
|
17
|
+
"exp_abc123",
|
|
18
|
+
output_dir="./data/recordings",
|
|
19
|
+
scope="experiment", # "experiment" | "room" | "participant"
|
|
20
|
+
deployment_id=None, # optional filter
|
|
21
|
+
room_id=None, # optional filter
|
|
22
|
+
recording_type=None, # "audio" | "video" | None (both)
|
|
23
|
+
progress=True, # tqdm progress bar
|
|
24
|
+
skip_existing=True, # skip files already on disk with matching size
|
|
25
|
+
)
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
**Returns**: `pandas.DataFrame` with all recording metadata columns plus:
|
|
29
|
+
- `local_path` — absolute path to the downloaded file on disk
|
|
30
|
+
- `download_status` — `"downloaded"`, `"skipped"`, or `"failed"`
|
|
31
|
+
|
|
32
|
+
**Side effects**:
|
|
33
|
+
- Writes media files to `output_dir`
|
|
34
|
+
- Writes `recordings_metadata.csv` to `output_dir`
|
|
35
|
+
|
|
36
|
+
### `download_recording()` — Single recording
|
|
37
|
+
|
|
38
|
+
```python
|
|
39
|
+
path = hs.download_recording(
|
|
40
|
+
recording, # dict from get_recordings(output="dict")
|
|
41
|
+
output_dir="./data/recordings",
|
|
42
|
+
)
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
**Returns**: `pathlib.Path` to downloaded file.
|
|
46
|
+
|
|
47
|
+
## Directory Structure
|
|
48
|
+
|
|
49
|
+
```
|
|
50
|
+
output_dir/
|
|
51
|
+
recordings_metadata.csv
|
|
52
|
+
user1_video_EG_abc123.mp4
|
|
53
|
+
user1_audio_EG_def456.webm
|
|
54
|
+
user2_video_EG_ghi789.mp4
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
**Filename pattern**: `{participantName}_{recordingType}_{recordingId}.{ext}`
|
|
58
|
+
|
|
59
|
+
- `participantName`: from recording metadata, sanitized for filesystem safety
|
|
60
|
+
- `recordingType`: `"video"` or `"audio"` from `metadata.type`
|
|
61
|
+
- `recordingId`: the record's `recordingId`, falling back to `egressId`, then to the literal `unknown`
|
|
62
|
+
- `ext`: from `format` field, falling back to `mp4` (video) or `webm` (audio)
|
|
63
|
+
|
|
64
|
+
## Internal Design
|
|
65
|
+
|
|
66
|
+
### Download flow (`download_recordings`)
|
|
67
|
+
|
|
68
|
+
1. Call `self.get_recordings(scope_id, scope=scope, output="dict")` to get metadata
|
|
69
|
+
2. Filter by `recording_type` if specified (via `metadata.type`)
|
|
70
|
+
3. Create `output_dir` (including any missing parent directories) via `Path(output_dir).mkdir(parents=True, exist_ok=True)`
|
|
71
|
+
4. For each recording:
|
|
72
|
+
- Build filename using pattern above
|
|
73
|
+
- If `skip_existing=True` and file exists with size matching `fileSize` metadata, mark as `"skipped"`
|
|
74
|
+
- Otherwise, fetch from `downloadUrl` (fallback: `url`) using streaming HTTP GET
|
|
75
|
+
- Write to disk in 64 KB chunks
|
|
76
|
+
- Mark as `"downloaded"` or `"failed"` (with warning logged)
|
|
77
|
+
5. Build DataFrame from metadata, add `local_path` and `download_status` columns
|
|
78
|
+
6. Write `recordings_metadata.csv` to `output_dir`
|
|
79
|
+
7. Return DataFrame
|
|
80
|
+
|
|
81
|
+
### Streaming downloads
|
|
82
|
+
|
|
83
|
+
Use `requests.get(url, stream=True)` with chunked iteration to avoid loading large video files into memory. The SDK's existing `HttpTransport` handles JSON responses only, so file downloads use a standalone `requests.get()` — the signed GCS URLs don't need API key auth.
|
|
84
|
+
|
|
85
|
+
### Error handling
|
|
86
|
+
|
|
87
|
+
- Per-file failure tolerance: if one recording fails (404, timeout, network error), log a warning, set `download_status="failed"`, continue with remaining files
|
|
88
|
+
- If the metadata API call itself fails, raise normally (same as `get_recordings()`)
|
|
89
|
+
- Invalid/missing `downloadUrl`: set `download_status="failed"`, log warning
|
|
90
|
+
|
|
91
|
+
### Skip-existing logic
|
|
92
|
+
|
|
93
|
+
Compare `os.path.getsize(local_path)` against `fileSize` from metadata. If `fileSize` is `None` (metadata missing), fall back to checking file existence only (any existing file is considered complete).
|
|
94
|
+
|
|
95
|
+
## File Layout
|
|
96
|
+
|
|
97
|
+
| File | Change |
|
|
98
|
+
|------|--------|
|
|
99
|
+
| `src/hyperstudy/_downloads.py` | **New.** `get_download_url()`, `build_filename()`, `download_file()` streaming helper |
|
|
100
|
+
| `src/hyperstudy/client.py` | Add `download_recordings()` and `download_recording()` methods |
|
|
101
|
+
| `tests/test_downloads.py` | **New.** Unit tests for filename building, skip logic, status tracking |
|
|
102
|
+
| `tests/test_client.py` | Integration test: mock API + GCS, verify files + DataFrame |
|
|
103
|
+
| `tests/fixtures/sparse_ratings_response.json` | Already exists (from prior work) |
|
|
104
|
+
|
|
105
|
+
## Testing
|
|
106
|
+
|
|
107
|
+
### Unit tests (`tests/test_downloads.py`)
|
|
108
|
+
- `test_build_filename` — video, audio, missing fields, filesystem-unsafe characters
|
|
109
|
+
- `test_build_filename_dedup` — duplicate names get numeric suffix
|
|
110
|
+
- `test_skip_existing_matching_size` — file with correct size is skipped
|
|
111
|
+
- `test_skip_existing_wrong_size` — file with wrong size is re-downloaded
|
|
112
|
+
|
|
113
|
+
### Integration tests (`tests/test_client.py`)
|
|
114
|
+
- `test_download_recordings` — mock API + GCS fetch, verify files on disk, CSV sidecar, DataFrame with `local_path` + `download_status`
|
|
115
|
+
- `test_download_recordings_filter_type` — `recording_type="audio"` only downloads audio
|
|
116
|
+
- `test_download_recording_single` — single recording download
|
|
117
|
+
|
|
118
|
+
### Mocking strategy
|
|
119
|
+
- V3 API: `responses` library (existing pattern)
|
|
120
|
+
- GCS signed URL: also `responses` (it's just an HTTP GET to a URL)
|
|
121
|
+
- File I/O: real writes to `pytest` `tmp_path`
|
|
122
|
+
|
|
123
|
+
## No Backend Changes Required
|
|
124
|
+
|
|
125
|
+
The existing V3 API endpoints return all necessary data:
|
|
126
|
+
- `GET /api/v3/data/recordings/{scope}/{scopeId}` returns metadata with `downloadUrl`
|
|
127
|
+
- Signed GCS URLs are valid for 7 days
|
|
128
|
+
- SDK downloads immediately after fetching metadata, so expiry is not an issue
|
|
@@ -6,6 +6,51 @@ from typing import Any
|
|
|
6
6
|
|
|
7
7
|
import pandas as pd
|
|
8
8
|
|
|
9
|
+
# Nested dict fields to flatten into top-level columns.
|
|
10
|
+
# Mapping of {field_name: prefix} — sub-keys become ``{prefix}_{sub_key}``.
|
|
11
|
+
FLATTEN_FIELDS: dict[str, str] = {
|
|
12
|
+
"sparseRatingData": "sparseRatingData",
|
|
13
|
+
"metadata": "metadata",
|
|
14
|
+
}
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _flatten_nested_dicts(
|
|
18
|
+
data: list[dict[str, Any]],
|
|
19
|
+
fields: dict[str, str] | None = None,
|
|
20
|
+
) -> list[dict[str, Any]]:
|
|
21
|
+
"""Promote sub-keys of nested dict fields to top-level keys.
|
|
22
|
+
|
|
23
|
+
For each *field* present in a record whose value is a ``dict``, every
|
|
24
|
+
sub-key is copied to ``{prefix}_{sub_key}``. The original nested dict
|
|
25
|
+
is preserved for backward compatibility.
|
|
26
|
+
|
|
27
|
+
Records where the target field is ``None`` or missing are left
|
|
28
|
+
untouched — downstream DataFrame construction fills those columns
|
|
29
|
+
with ``NaN`` / ``null``.
|
|
30
|
+
"""
|
|
31
|
+
if not data:
|
|
32
|
+
return data
|
|
33
|
+
|
|
34
|
+
fields = fields if fields is not None else FLATTEN_FIELDS
|
|
35
|
+
|
|
36
|
+
# Quick check on first record — skip work when no target fields exist.
|
|
37
|
+
sample = data[0]
|
|
38
|
+
targets = [f for f in fields if f in sample and isinstance(sample[f], dict)]
|
|
39
|
+
if not targets:
|
|
40
|
+
return data
|
|
41
|
+
|
|
42
|
+
out: list[dict[str, Any]] = []
|
|
43
|
+
for record in data:
|
|
44
|
+
record = dict(record) # shallow copy to avoid mutating caller's data
|
|
45
|
+
for field in targets:
|
|
46
|
+
nested = record.get(field)
|
|
47
|
+
if isinstance(nested, dict):
|
|
48
|
+
prefix = fields[field]
|
|
49
|
+
for sub_key, sub_val in nested.items():
|
|
50
|
+
record[f"{prefix}_{sub_key}"] = sub_val
|
|
51
|
+
out.append(record)
|
|
52
|
+
return out
|
|
53
|
+
|
|
9
54
|
|
|
10
55
|
def _post_process(df: pd.DataFrame) -> pd.DataFrame:
|
|
11
56
|
"""Shared post-processing for pandas DataFrames.
|
|
@@ -32,6 +77,7 @@ def to_pandas(data: list[dict[str, Any]]) -> pd.DataFrame:
|
|
|
32
77
|
"""Convert API response data to a pandas DataFrame with post-processing."""
|
|
33
78
|
if not data:
|
|
34
79
|
return pd.DataFrame()
|
|
80
|
+
data = _flatten_nested_dicts(data)
|
|
35
81
|
df = pd.DataFrame(data)
|
|
36
82
|
return _post_process(df)
|
|
37
83
|
|
|
@@ -51,6 +97,7 @@ def to_polars(data: list[dict[str, Any]]):
|
|
|
51
97
|
if not data:
|
|
52
98
|
return pl.DataFrame()
|
|
53
99
|
|
|
100
|
+
data = _flatten_nested_dicts(data)
|
|
54
101
|
df = pl.DataFrame(data)
|
|
55
102
|
|
|
56
103
|
# Parse timestamps
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
"""Helpers for downloading recording files from signed URLs."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
import requests
|
|
10
|
+
|
|
11
|
+
_CHUNK_SIZE = 65536 # 64 KB — good balance for large video files
|
|
12
|
+
_UNSAFE_RE = re.compile(r"[^\w\-]")
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def get_download_url(recording: dict[str, Any]) -> str | None:
    """Pick the URL to download *recording* from.

    ``downloadUrl`` is preferred, with ``url`` as the fallback.  Empty or
    missing values are ignored; ``None`` is returned when neither is usable.
    """
    for key in ("downloadUrl", "url"):
        candidate = recording.get(key)
        if candidate:
            return candidate
    return None
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def build_filename(recording: dict[str, Any]) -> str:
    """Build a filesystem-safe filename from recording metadata.

    Pattern: ``{participantName}_{type}_{recordingId}.{ext}``

    Component sources and fallbacks:

    * name: ``participantName``, then ``participantId``, then ``"unknown"``
    * type: ``metadata["type"]``, then ``"recording"``
    * id: ``recordingId``, then ``egressId``, then ``"unknown"``
    * ext: ``format`` (leading dots dropped), then ``"webm"`` for audio
      and ``"mp4"`` for everything else

    Every component is coerced to ``str`` and has characters outside
    ``[A-Za-z0-9_-]`` (per ``_UNSAFE_RE``) replaced by ``_``.  Values come
    from an API response, so none of them may introduce path separators or
    ``..`` segments into the resulting name.

    Args:
        recording: A recording dict as returned by the recordings endpoint.

    Returns:
        A bare filename (no directory part).
    """

    def _safe(value: Any) -> str:
        # str() first: ids may arrive as non-string JSON scalars.
        return _UNSAFE_RE.sub("_", str(value))

    name = _safe(
        recording.get("participantName")
        or recording.get("participantId")
        or "unknown"
    )

    meta = recording.get("metadata")
    if not isinstance(meta, dict):
        meta = {}
    rec_type = _safe(meta.get("type") or "recording")

    rec_id = _safe(
        recording.get("recordingId") or recording.get("egressId") or "unknown"
    )

    # Tolerate a format given as ".mp4" so we never emit "name..mp4".
    fmt = str(recording.get("format") or "").lstrip(".")
    if not fmt:
        fmt = "webm" if rec_type == "audio" else "mp4"
    fmt = _safe(fmt)

    return f"{name}_{rec_type}_{rec_id}.{fmt}"
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def download_file(url: str, dest: Path, timeout: int = 300) -> int:
    """Stream-download *url* to *dest* and return bytes written.

    The payload is streamed into a sibling ``<dest>.part`` file and moved
    onto *dest* only after the whole body has been received.  A failed or
    interrupted transfer therefore never leaves a truncated file at *dest*
    (which a later "skip existing" check could mistake for a complete
    download), and an existing good file is not clobbered by a failed retry.

    Args:
        url: URL to fetch with a plain ``requests.get`` (no API auth).
        dest: Final path of the downloaded file.
        timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        Number of bytes written.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-2xx status (via ``raise_for_status``).
        OSError: If the file cannot be written or moved into place.
    """
    dest = Path(dest)
    partial = dest.with_name(dest.name + ".part")

    resp = requests.get(url, stream=True, timeout=timeout)
    try:
        resp.raise_for_status()

        written = 0
        with open(partial, "wb") as fh:
            for chunk in resp.iter_content(chunk_size=_CHUNK_SIZE):
                fh.write(chunk)
                written += len(chunk)

        # Path.replace overwrites an existing destination on all platforms.
        partial.replace(dest)
    except BaseException:
        # Remove the incomplete temp file, then let the caller see the error.
        partial.unlink(missing_ok=True)
        raise
    finally:
        # With stream=True the connection is held until the response is
        # closed; release it back to the pool even on error paths.
        resp.close()
    return written
|
|
@@ -2,9 +2,14 @@
|
|
|
2
2
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
|
+
import warnings
|
|
6
|
+
from pathlib import Path
|
|
5
7
|
from typing import Any
|
|
6
8
|
|
|
9
|
+
from tqdm.auto import tqdm
|
|
10
|
+
|
|
7
11
|
from ._dataframe import to_pandas, to_polars
|
|
12
|
+
from ._downloads import build_filename, download_file, get_download_url
|
|
8
13
|
from ._http import HttpTransport
|
|
9
14
|
from ._pagination import fetch_all_pages
|
|
10
15
|
from ._types import Scope
|
|
@@ -466,6 +471,125 @@ class HyperStudy(ExperimentMixin):
|
|
|
466
471
|
"consent": self.get_consent(participant_id, **common),
|
|
467
472
|
}
|
|
468
473
|
|
|
474
|
+
# ------------------------------------------------------------------
|
|
475
|
+
# Recording downloads
|
|
476
|
+
# ------------------------------------------------------------------
|
|
477
|
+
|
|
478
|
+
def download_recording(
    self,
    recording: dict[str, Any],
    output_dir: str = ".",
) -> Path:
    """Fetch one recording's media file and store it under *output_dir*.

    Args:
        recording: A recording dict (from ``get_recordings(output="dict")``).
        output_dir: Target directory; created (with parents) if missing.

    Returns:
        Path of the file that was written.

    Raises:
        ValueError: If the recording carries neither ``downloadUrl``
            nor ``url``.
    """
    source_url = get_download_url(recording)
    if not source_url:
        raise ValueError("Recording has no downloadUrl or url field")

    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    target = target_dir / build_filename(recording)
    download_file(source_url, target)
    return target
|
|
503
|
+
|
|
504
|
+
def download_recordings(
    self,
    scope_id: str,
    *,
    output_dir: str,
    scope: str = "experiment",
    deployment_id: str | None = None,
    room_id: str | None = None,
    recording_type: str | None = None,
    progress: bool = True,
    skip_existing: bool = True,
):
    """Bulk-download recording files for a scope.

    Recording metadata is fetched first, each file is then pulled from
    its signed URL into *output_dir*, and the outcome of every download
    is recorded alongside the metadata.  A single failing file does not
    abort the run: it is reported through ``warnings.warn`` and marked
    ``"failed"``.

    Args:
        scope_id: Experiment, room, or participant ID.
        output_dir: Directory to save files (created if missing).
        scope: ``"experiment"``, ``"room"``, or ``"participant"``.
        deployment_id: Filter by deployment (experiment scope only).
        room_id: Filter by room.
        recording_type: ``"audio"``, ``"video"``, or ``None`` (both);
            matched against ``metadata["type"]`` of each recording.
        progress: Show a tqdm progress bar.
        skip_existing: Do not re-download a file that already exists
            with the size given by ``fileSize`` (or with any size when
            ``fileSize`` is absent).

    Returns:
        pandas DataFrame with recording metadata plus ``local_path``
        and ``download_status`` (``"downloaded"``, ``"skipped"`` or
        ``"failed"``) columns.  When there are no recordings the frame
        is empty, the two columns are not added, and no
        ``recordings_metadata.csv`` is written.
    """
    fetched = self.get_recordings(
        scope_id,
        scope=scope,
        deployment_id=deployment_id,
        room_id=room_id,
        output="dict",
    )

    if recording_type:
        fetched = [
            item
            for item in fetched
            if (item.get("metadata") or {}).get("type") == recording_type
        ]

    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    # One (local_path, status) pair per recording, in iteration order.
    outcomes: list[tuple[str | None, str]] = []

    for item in tqdm(fetched, desc="Downloading recordings", disable=not progress):
        target = target_dir / build_filename(item)
        source_url = get_download_url(item)

        if not source_url:
            outcomes.append((None, "failed"))
            warnings.warn(f"Recording {item.get('recordingId')} has no download URL")
            continue

        if skip_existing and target.exists():
            wanted_size = item.get("fileSize")
            if wanted_size is None or target.stat().st_size == wanted_size:
                outcomes.append((str(target.resolve()), "skipped"))
                continue

        try:
            download_file(source_url, target)
            outcomes.append((str(target.resolve()), "downloaded"))
        except Exception as exc:
            # Per-file tolerance: note the failure and move on.
            outcomes.append((None, "failed"))
            warnings.warn(
                f"Failed to download recording {item.get('recordingId')}: {exc}"
            )

    frame = to_pandas(fetched)
    if not frame.empty:
        frame["local_path"] = [path for path, _ in outcomes]
        frame["download_status"] = [status for _, status in outcomes]
        frame.to_csv(target_dir / "recordings_metadata.csv", index=False)

    return frame
|
|
592
|
+
|
|
469
593
|
# ------------------------------------------------------------------
|
|
470
594
|
# Internal helpers
|
|
471
595
|
# ------------------------------------------------------------------
|
|
@@ -71,6 +71,16 @@ def deployment_sessions_response():
|
|
|
71
71
|
return load_fixture("deployment_sessions_response.json")
|
|
72
72
|
|
|
73
73
|
|
|
74
|
+
@pytest.fixture
def sparse_ratings_response():
    """Fixture data loaded from ``sparse_ratings_response.json``."""
    payload = load_fixture("sparse_ratings_response.json")
    return payload
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
@pytest.fixture
def recordings_response():
    """Fixture data loaded from ``recordings_response.json``."""
    payload = load_fixture("recordings_response.json")
    return payload
|
|
82
|
+
|
|
83
|
+
|
|
74
84
|
@pytest.fixture
|
|
75
85
|
def warnings_response():
|
|
76
86
|
return load_fixture("warnings_response.json")
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
{
|
|
2
|
+
"status": "success",
|
|
3
|
+
"metadata": {
|
|
4
|
+
"dataType": "recordings",
|
|
5
|
+
"scope": "experiment",
|
|
6
|
+
"scopeId": "exp_abc123",
|
|
7
|
+
"timestamp": "2024-06-15T10:00:00.000Z",
|
|
8
|
+
"query": {
|
|
9
|
+
"limit": 1000,
|
|
10
|
+
"offset": 0,
|
|
11
|
+
"sort": "startTime",
|
|
12
|
+
"order": "asc"
|
|
13
|
+
},
|
|
14
|
+
"pagination": {
|
|
15
|
+
"total": 2,
|
|
16
|
+
"returned": 2,
|
|
17
|
+
"hasMore": false,
|
|
18
|
+
"limit": 1000,
|
|
19
|
+
"offset": 0
|
|
20
|
+
},
|
|
21
|
+
"processing": {
|
|
22
|
+
"processingTimeMs": 35,
|
|
23
|
+
"enriched": true,
|
|
24
|
+
"version": "3.0.0"
|
|
25
|
+
}
|
|
26
|
+
},
|
|
27
|
+
"data": [
|
|
28
|
+
{
|
|
29
|
+
"recordingId": "EG_video_001",
|
|
30
|
+
"egressId": "EG_video_001",
|
|
31
|
+
"participantId": "user_1",
|
|
32
|
+
"participantName": "Alice",
|
|
33
|
+
"startTime": "2024-06-15T10:00:05.000Z",
|
|
34
|
+
"endTime": "2024-06-15T10:05:05.000Z",
|
|
35
|
+
"duration": 300000,
|
|
36
|
+
"videoOffset": 500,
|
|
37
|
+
"url": "https://storage.googleapis.com/bucket/recordings/video1.mp4",
|
|
38
|
+
"downloadUrl": "https://storage.googleapis.com/bucket/recordings/video1.mp4?X-Goog-Signature=abc",
|
|
39
|
+
"fileSize": 1024,
|
|
40
|
+
"format": "mp4",
|
|
41
|
+
"status": "complete",
|
|
42
|
+
"metadata": {
|
|
43
|
+
"type": "video",
|
|
44
|
+
"recordingType": "individual",
|
|
45
|
+
"roomName": "room_1",
|
|
46
|
+
"experimentId": "exp_abc123"
|
|
47
|
+
}
|
|
48
|
+
},
|
|
49
|
+
{
|
|
50
|
+
"recordingId": "EG_audio_002",
|
|
51
|
+
"egressId": "EG_audio_002",
|
|
52
|
+
"participantId": "user_1",
|
|
53
|
+
"participantName": "Alice",
|
|
54
|
+
"startTime": "2024-06-15T10:00:05.000Z",
|
|
55
|
+
"endTime": "2024-06-15T10:05:05.000Z",
|
|
56
|
+
"duration": 300000,
|
|
57
|
+
"videoOffset": 500,
|
|
58
|
+
"url": "https://storage.googleapis.com/bucket/recordings/audio1.webm",
|
|
59
|
+
"downloadUrl": "https://storage.googleapis.com/bucket/recordings/audio1.webm?X-Goog-Signature=def",
|
|
60
|
+
"fileSize": 512,
|
|
61
|
+
"format": "webm",
|
|
62
|
+
"status": "complete",
|
|
63
|
+
"metadata": {
|
|
64
|
+
"type": "audio",
|
|
65
|
+
"recordingType": "audio",
|
|
66
|
+
"roomName": "room_1",
|
|
67
|
+
"experimentId": "exp_abc123"
|
|
68
|
+
}
|
|
69
|
+
}
|
|
70
|
+
]
|
|
71
|
+
}
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
{
|
|
2
|
+
"status": "success",
|
|
3
|
+
"metadata": {
|
|
4
|
+
"dataType": "ratings",
|
|
5
|
+
"ratingType": "sparse",
|
|
6
|
+
"scope": "experiment",
|
|
7
|
+
"scopeId": "exp_abc123",
|
|
8
|
+
"timestamp": "2024-06-15T10:00:00.000Z",
|
|
9
|
+
"query": {
|
|
10
|
+
"startTime": null,
|
|
11
|
+
"endTime": null,
|
|
12
|
+
"limit": 1000,
|
|
13
|
+
"offset": 0,
|
|
14
|
+
"sort": "timestamp",
|
|
15
|
+
"order": "asc"
|
|
16
|
+
},
|
|
17
|
+
"pagination": {
|
|
18
|
+
"total": 2,
|
|
19
|
+
"returned": 2,
|
|
20
|
+
"hasMore": false,
|
|
21
|
+
"limit": 1000,
|
|
22
|
+
"offset": 0
|
|
23
|
+
},
|
|
24
|
+
"processing": {
|
|
25
|
+
"processingTimeMs": 58,
|
|
26
|
+
"enriched": true,
|
|
27
|
+
"version": "3.0.0"
|
|
28
|
+
}
|
|
29
|
+
},
|
|
30
|
+
"data": [
|
|
31
|
+
{
|
|
32
|
+
"ratingId": "rat_001",
|
|
33
|
+
"participantId": "user_1",
|
|
34
|
+
"timestamp": "2024-06-15T10:01:30.000Z",
|
|
35
|
+
"onset": 8500,
|
|
36
|
+
"rawOnset": 8520,
|
|
37
|
+
"clockOffsetApplied": 20,
|
|
38
|
+
"value": 72.5,
|
|
39
|
+
"rawValue": 58,
|
|
40
|
+
"scale": { "min": 0, "max": 80 },
|
|
41
|
+
"type": "sparse",
|
|
42
|
+
"stateId": "state_video_1",
|
|
43
|
+
"stimulusId": "video_abc",
|
|
44
|
+
"stimulusTime": 5000,
|
|
45
|
+
"responseTime": 2100,
|
|
46
|
+
"confidence": null,
|
|
47
|
+
"metadata": {
|
|
48
|
+
"question": "How engaging is this video?",
|
|
49
|
+
"dimension": "engagement",
|
|
50
|
+
"componentType": "vasrating",
|
|
51
|
+
"sampleIndex": 0
|
|
52
|
+
},
|
|
53
|
+
"ratingEndOnset": 10600,
|
|
54
|
+
"sparseRatingData": {
|
|
55
|
+
"videoId": "video_abc",
|
|
56
|
+
"pauseIndex": 0,
|
|
57
|
+
"videoRelativeTime": 5000,
|
|
58
|
+
"pauseTimestamp": 1718445690000,
|
|
59
|
+
"componentType": "vasrating",
|
|
60
|
+
"componentData": { "value": 58 },
|
|
61
|
+
"previousRatings": null,
|
|
62
|
+
"mediaPauseOnset": 8200,
|
|
63
|
+
"mediaResumeOnset": 10800,
|
|
64
|
+
"actualPauseDuration": 2600
|
|
65
|
+
},
|
|
66
|
+
"stateStartTime": "2024-06-15T10:00:00.000Z",
|
|
67
|
+
"stateDuration": 60000
|
|
68
|
+
},
|
|
69
|
+
{
|
|
70
|
+
"ratingId": "rat_002",
|
|
71
|
+
"participantId": "user_1",
|
|
72
|
+
"timestamp": "2024-06-15T10:02:45.000Z",
|
|
73
|
+
"onset": 25300,
|
|
74
|
+
"rawOnset": 25320,
|
|
75
|
+
"clockOffsetApplied": 20,
|
|
76
|
+
"value": 45.0,
|
|
77
|
+
"rawValue": 36,
|
|
78
|
+
"scale": { "min": 0, "max": 80 },
|
|
79
|
+
"type": "sparse",
|
|
80
|
+
"stateId": "state_video_1",
|
|
81
|
+
"stimulusId": "video_abc",
|
|
82
|
+
"stimulusTime": 20000,
|
|
83
|
+
"responseTime": 1800,
|
|
84
|
+
"confidence": null,
|
|
85
|
+
"metadata": {
|
|
86
|
+
"question": "How engaging is this video?",
|
|
87
|
+
"dimension": "engagement",
|
|
88
|
+
"componentType": "vasrating",
|
|
89
|
+
"sampleIndex": 1
|
|
90
|
+
},
|
|
91
|
+
"ratingEndOnset": 27100,
|
|
92
|
+
"sparseRatingData": {
|
|
93
|
+
"videoId": "video_abc",
|
|
94
|
+
"pauseIndex": 1,
|
|
95
|
+
"videoRelativeTime": 20000,
|
|
96
|
+
"pauseTimestamp": 1718445765000,
|
|
97
|
+
"componentType": "vasrating",
|
|
98
|
+
"componentData": { "value": 36 },
|
|
99
|
+
"previousRatings": { "video_abc": 58 },
|
|
100
|
+
"mediaPauseOnset": 25000,
|
|
101
|
+
"mediaResumeOnset": 27300,
|
|
102
|
+
"actualPauseDuration": 2300
|
|
103
|
+
},
|
|
104
|
+
"stateStartTime": "2024-06-15T10:00:00.000Z",
|
|
105
|
+
"stateDuration": 60000
|
|
106
|
+
}
|
|
107
|
+
]
|
|
108
|
+
}
|
|
@@ -224,6 +224,24 @@ def test_get_ratings_sparse(api_key, events_response):
|
|
|
224
224
|
assert isinstance(df, pd.DataFrame)
|
|
225
225
|
|
|
226
226
|
|
|
227
|
+
@responses.activate
def test_get_ratings_sparse_flattens_data(api_key, sparse_ratings_response):
    """Nested ``sparseRatingData`` / ``metadata`` keys become prefixed columns."""
    endpoint = f"{BASE_URL}/data/ratings/sparse/experiment/exp_abc123"
    responses.get(endpoint, json=sparse_ratings_response, status=200)

    hs = HyperStudy(api_key=api_key, base_url=BASE_URL)
    ratings = hs.get_ratings("exp_abc123", kind="sparse", limit=1000)

    assert isinstance(ratings, pd.DataFrame)
    flattened_columns = (
        "sparseRatingData_mediaPauseOnset",
        "sparseRatingData_mediaResumeOnset",
        "sparseRatingData_actualPauseDuration",
        "metadata_question",
    )
    for column in flattened_columns:
        assert column in ratings.columns
    assert ratings["sparseRatingData_mediaPauseOnset"].iloc[0] == 8200
|
|
243
|
+
|
|
244
|
+
|
|
227
245
|
@responses.activate
|
|
228
246
|
def test_get_sync_with_aggregation(api_key, events_response):
|
|
229
247
|
"""get_sync passes aggregationWindow param."""
|
|
@@ -237,6 +255,128 @@ def test_get_sync_with_aggregation(api_key, events_response):
|
|
|
237
255
|
assert "aggregationWindow=5000" in responses.calls[0].request.url
|
|
238
256
|
|
|
239
257
|
|
|
258
|
+
# ------------------------------------------------------------------
|
|
259
|
+
# download_recordings
|
|
260
|
+
# ------------------------------------------------------------------
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
@responses.activate
def test_download_recordings(api_key, recordings_response, tmp_path):
    """download_recordings writes files, CSV sidecar, and returns DataFrame.

    Beyond file existence this also pins the audio payload, the
    ``local_path`` values, and the contents of the sidecar CSV.
    """
    video_rec, audio_rec = recordings_response["data"]

    # Mock the metadata API
    responses.get(
        f"{BASE_URL}/data/recordings/experiment/exp_abc123",
        json=recordings_response,
        status=200,
    )
    # Mock the GCS signed URL downloads
    responses.get(video_rec["downloadUrl"], body=b"fake video bytes", status=200)
    responses.get(audio_rec["downloadUrl"], body=b"fake audio bytes", status=200)

    client = HyperStudy(api_key=api_key, base_url=BASE_URL)
    df = client.download_recordings(
        "exp_abc123", output_dir=str(tmp_path), progress=False
    )

    assert isinstance(df, pd.DataFrame)
    assert len(df) == 2
    assert "local_path" in df.columns
    assert "download_status" in df.columns
    assert list(df["download_status"]) == ["downloaded", "downloaded"]

    # Files exist on disk with the mocked payloads
    video_file = tmp_path / "Alice_video_EG_video_001.mp4"
    audio_file = tmp_path / "Alice_audio_EG_audio_002.webm"
    assert video_file.exists()
    assert audio_file.exists()
    assert video_file.read_bytes() == b"fake video bytes"
    assert audio_file.read_bytes() == b"fake audio bytes"

    # local_path holds the absolute paths of the written files
    assert set(df["local_path"]) == {
        str(video_file.resolve()),
        str(audio_file.resolve()),
    }

    # CSV sidecar written and carries the download bookkeeping columns
    sidecar = tmp_path / "recordings_metadata.csv"
    assert sidecar.exists()
    sidecar_df = pd.read_csv(sidecar)
    assert len(sidecar_df) == 2
    assert "local_path" in sidecar_df.columns
    assert list(sidecar_df["download_status"]) == ["downloaded", "downloaded"]
|
|
302
|
+
|
|
303
|
+
|
|
304
|
+
@responses.activate
def test_download_recordings_filter_type(api_key, recordings_response, tmp_path):
    """Only recordings whose type matches ``recording_type`` are fetched."""
    metadata_endpoint = f"{BASE_URL}/data/recordings/experiment/exp_abc123"
    audio_url = recordings_response["data"][1]["downloadUrl"]

    responses.get(metadata_endpoint, json=recordings_response, status=200)
    responses.get(audio_url, body=b"audio bytes", status=200)

    hs = HyperStudy(api_key=api_key, base_url=BASE_URL)
    result = hs.download_recordings(
        "exp_abc123",
        output_dir=str(tmp_path),
        recording_type="audio",
        progress=False,
    )

    audio_file = tmp_path / "Alice_audio_EG_audio_002.webm"
    video_file = tmp_path / "Alice_video_EG_video_001.mp4"

    assert len(result) == 1
    assert audio_file.exists()
    assert not video_file.exists()
|
|
329
|
+
|
|
330
|
+
|
|
331
|
+
@responses.activate
def test_download_recordings_skip_existing(api_key, recordings_response, tmp_path):
    """A file already on disk with the expected size is not re-downloaded."""
    metadata_endpoint = f"{BASE_URL}/data/recordings/experiment/exp_abc123"
    responses.get(metadata_endpoint, json=recordings_response, status=200)

    # The video is already present with the size from the metadata (1024 bytes).
    existing_video = tmp_path / "Alice_video_EG_video_001.mp4"
    existing_video.write_bytes(b"\x00" * 1024)

    # Only the audio download is mocked; the video URL must never be hit.
    audio_url = recordings_response["data"][1]["downloadUrl"]
    responses.get(audio_url, body=b"\x00" * 512, status=200)

    hs = HyperStudy(api_key=api_key, base_url=BASE_URL)
    result = hs.download_recordings(
        "exp_abc123", output_dir=str(tmp_path), progress=False
    )

    statuses = result["download_status"]
    assert statuses.iloc[0] == "skipped"
    assert statuses.iloc[1] == "downloaded"
|
|
357
|
+
|
|
358
|
+
|
|
359
|
+
@responses.activate
|
|
360
|
+
def test_download_recording_single(api_key, tmp_path):
|
|
361
|
+
"""download_recording downloads a single file."""
|
|
362
|
+
url = "https://storage.example.com/rec.mp4"
|
|
363
|
+
responses.get(url, body=b"video data", status=200)
|
|
364
|
+
|
|
365
|
+
client = HyperStudy(api_key=api_key, base_url=BASE_URL)
|
|
366
|
+
rec = {
|
|
367
|
+
"recordingId": "EG_001",
|
|
368
|
+
"participantName": "Bob",
|
|
369
|
+
"downloadUrl": url,
|
|
370
|
+
"format": "mp4",
|
|
371
|
+
"metadata": {"type": "video"},
|
|
372
|
+
}
|
|
373
|
+
path = client.download_recording(rec, output_dir=str(tmp_path))
|
|
374
|
+
|
|
375
|
+
assert path.exists()
|
|
376
|
+
assert path.name == "Bob_video_EG_001.mp4"
|
|
377
|
+
assert path.read_bytes() == b"video data"
|
|
378
|
+
|
|
379
|
+
|
|
240
380
|
# ------------------------------------------------------------------
|
|
241
381
|
# get_all_data
|
|
242
382
|
# ------------------------------------------------------------------
|
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
"""Tests for DataFrame conversion (pandas and polars)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import pandas as pd
|
|
6
|
+
import pytest
|
|
7
|
+
|
|
8
|
+
from hyperstudy._dataframe import _flatten_nested_dicts, to_pandas, to_polars
|
|
9
|
+
|
|
10
|
+
SAMPLE_DATA = [
|
|
11
|
+
{
|
|
12
|
+
"id": "evt_001",
|
|
13
|
+
"onset": 1500,
|
|
14
|
+
"timestamp": "2024-06-15T10:00:01.500Z",
|
|
15
|
+
"category": "component",
|
|
16
|
+
},
|
|
17
|
+
{
|
|
18
|
+
"id": "evt_002",
|
|
19
|
+
"onset": 3200,
|
|
20
|
+
"timestamp": "2024-06-15T10:00:03.200Z",
|
|
21
|
+
"category": "component",
|
|
22
|
+
},
|
|
23
|
+
]
|
|
24
|
+
|
|
25
|
+
SPARSE_RATING_DATA = [
|
|
26
|
+
{
|
|
27
|
+
"ratingId": "rat_001",
|
|
28
|
+
"onset": 8500,
|
|
29
|
+
"timestamp": "2024-06-15T10:01:30.000Z",
|
|
30
|
+
"value": 72.5,
|
|
31
|
+
"type": "sparse",
|
|
32
|
+
"metadata": {
|
|
33
|
+
"question": "How engaging?",
|
|
34
|
+
"dimension": "engagement",
|
|
35
|
+
"componentType": "vasrating",
|
|
36
|
+
},
|
|
37
|
+
"sparseRatingData": {
|
|
38
|
+
"videoId": "video_abc",
|
|
39
|
+
"pauseIndex": 0,
|
|
40
|
+
"mediaPauseOnset": 8200,
|
|
41
|
+
"mediaResumeOnset": 10800,
|
|
42
|
+
"actualPauseDuration": 2600,
|
|
43
|
+
"componentData": {"value": 58},
|
|
44
|
+
},
|
|
45
|
+
},
|
|
46
|
+
{
|
|
47
|
+
"ratingId": "rat_002",
|
|
48
|
+
"onset": 25300,
|
|
49
|
+
"timestamp": "2024-06-15T10:02:45.000Z",
|
|
50
|
+
"value": 45.0,
|
|
51
|
+
"type": "sparse",
|
|
52
|
+
"metadata": {
|
|
53
|
+
"question": "How engaging?",
|
|
54
|
+
"dimension": "engagement",
|
|
55
|
+
"componentType": "vasrating",
|
|
56
|
+
},
|
|
57
|
+
"sparseRatingData": {
|
|
58
|
+
"videoId": "video_abc",
|
|
59
|
+
"pauseIndex": 1,
|
|
60
|
+
"mediaPauseOnset": 25000,
|
|
61
|
+
"mediaResumeOnset": 27300,
|
|
62
|
+
"actualPauseDuration": 2300,
|
|
63
|
+
"componentData": {"value": 36},
|
|
64
|
+
},
|
|
65
|
+
},
|
|
66
|
+
]
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
# ------------------------------------------------------------------
|
|
70
|
+
# Pandas
|
|
71
|
+
# ------------------------------------------------------------------
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def test_to_pandas_creates_dataframe():
|
|
75
|
+
df = to_pandas(SAMPLE_DATA)
|
|
76
|
+
assert isinstance(df, pd.DataFrame)
|
|
77
|
+
assert len(df) == 2
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def test_to_pandas_onset_sec():
|
|
81
|
+
df = to_pandas(SAMPLE_DATA)
|
|
82
|
+
assert "onset_sec" in df.columns
|
|
83
|
+
assert df["onset_sec"].iloc[0] == pytest.approx(1.5)
|
|
84
|
+
assert df["onset_sec"].iloc[1] == pytest.approx(3.2)
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def test_to_pandas_timestamp_parsed():
|
|
88
|
+
df = to_pandas(SAMPLE_DATA)
|
|
89
|
+
assert pd.api.types.is_datetime64_any_dtype(df["timestamp"])
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def test_to_pandas_empty():
|
|
93
|
+
df = to_pandas([])
|
|
94
|
+
assert isinstance(df, pd.DataFrame)
|
|
95
|
+
assert df.empty
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
# ------------------------------------------------------------------
|
|
99
|
+
# Polars
|
|
100
|
+
# ------------------------------------------------------------------
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def test_to_polars_creates_dataframe():
|
|
104
|
+
polars = pytest.importorskip("polars")
|
|
105
|
+
df = to_polars(SAMPLE_DATA)
|
|
106
|
+
assert isinstance(df, polars.DataFrame)
|
|
107
|
+
assert len(df) == 2
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def test_to_polars_onset_sec():
|
|
111
|
+
pytest.importorskip("polars")
|
|
112
|
+
df = to_polars(SAMPLE_DATA)
|
|
113
|
+
assert "onset_sec" in df.columns
|
|
114
|
+
assert df["onset_sec"][0] == pytest.approx(1.5)
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def test_to_polars_empty():
|
|
118
|
+
polars = pytest.importorskip("polars")
|
|
119
|
+
df = to_polars([])
|
|
120
|
+
assert isinstance(df, polars.DataFrame)
|
|
121
|
+
assert len(df) == 0
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
# ------------------------------------------------------------------
|
|
125
|
+
# Nested dict flattening
|
|
126
|
+
# ------------------------------------------------------------------
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def test_flatten_sparse_rating_data():
|
|
130
|
+
df = to_pandas(SPARSE_RATING_DATA)
|
|
131
|
+
assert "sparseRatingData_mediaPauseOnset" in df.columns
|
|
132
|
+
assert "sparseRatingData_mediaResumeOnset" in df.columns
|
|
133
|
+
assert "sparseRatingData_actualPauseDuration" in df.columns
|
|
134
|
+
assert "sparseRatingData_videoId" in df.columns
|
|
135
|
+
assert "sparseRatingData_pauseIndex" in df.columns
|
|
136
|
+
assert df["sparseRatingData_mediaPauseOnset"].iloc[0] == 8200
|
|
137
|
+
assert df["sparseRatingData_mediaPauseOnset"].iloc[1] == 25000
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def test_flatten_metadata():
|
|
141
|
+
df = to_pandas(SPARSE_RATING_DATA)
|
|
142
|
+
assert "metadata_question" in df.columns
|
|
143
|
+
assert "metadata_dimension" in df.columns
|
|
144
|
+
assert "metadata_componentType" in df.columns
|
|
145
|
+
assert df["metadata_question"].iloc[0] == "How engaging?"
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def test_flatten_preserves_original():
|
|
149
|
+
df = to_pandas(SPARSE_RATING_DATA)
|
|
150
|
+
assert "sparseRatingData" in df.columns
|
|
151
|
+
assert isinstance(df["sparseRatingData"].iloc[0], dict)
|
|
152
|
+
assert "metadata" in df.columns
|
|
153
|
+
assert isinstance(df["metadata"].iloc[0], dict)
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def test_flatten_handles_none():
|
|
157
|
+
data = [
|
|
158
|
+
{"ratingId": "r1", "onset": 100, "sparseRatingData": None, "metadata": None},
|
|
159
|
+
]
|
|
160
|
+
df = to_pandas(data)
|
|
161
|
+
assert "sparseRatingData" in df.columns
|
|
162
|
+
# No flattened columns since the nested value is None, not a dict
|
|
163
|
+
assert "sparseRatingData_mediaPauseOnset" not in df.columns
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def test_flatten_no_target_fields():
|
|
167
|
+
"""Data without any flatten-target fields passes through unchanged."""
|
|
168
|
+
result = _flatten_nested_dicts(SAMPLE_DATA)
|
|
169
|
+
assert result is SAMPLE_DATA # same object — no copy needed
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def test_flatten_empty():
|
|
173
|
+
result = _flatten_nested_dicts([])
|
|
174
|
+
assert result == []
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
def test_flatten_polars():
|
|
178
|
+
pytest.importorskip("polars")
|
|
179
|
+
df = to_polars(SPARSE_RATING_DATA)
|
|
180
|
+
assert "sparseRatingData_mediaPauseOnset" in df.columns
|
|
181
|
+
assert "metadata_question" in df.columns
|
|
182
|
+
assert df["sparseRatingData_mediaPauseOnset"][0] == 8200
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
"""Tests for recording download helpers."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import responses
|
|
6
|
+
import pytest
|
|
7
|
+
|
|
8
|
+
from hyperstudy._downloads import build_filename, download_file
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
# ------------------------------------------------------------------
|
|
12
|
+
# build_filename
|
|
13
|
+
# ------------------------------------------------------------------
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
VIDEO_RECORDING = {
|
|
17
|
+
"recordingId": "EG_video_001",
|
|
18
|
+
"participantName": "Alice",
|
|
19
|
+
"format": "mp4",
|
|
20
|
+
"metadata": {"type": "video"},
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
AUDIO_RECORDING = {
|
|
24
|
+
"recordingId": "EG_audio_002",
|
|
25
|
+
"participantName": "Alice",
|
|
26
|
+
"format": "webm",
|
|
27
|
+
"metadata": {"type": "audio"},
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def test_build_filename_video():
|
|
32
|
+
assert build_filename(VIDEO_RECORDING) == "Alice_video_EG_video_001.mp4"
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def test_build_filename_audio():
|
|
36
|
+
assert build_filename(AUDIO_RECORDING) == "Alice_audio_EG_audio_002.webm"
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def test_build_filename_missing_fields():
|
|
40
|
+
rec = {"egressId": "EG_123"}
|
|
41
|
+
name = build_filename(rec)
|
|
42
|
+
assert name == "unknown_recording_EG_123.mp4"
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def test_build_filename_sanitizes_name():
|
|
46
|
+
rec = {
|
|
47
|
+
"recordingId": "EG_001",
|
|
48
|
+
"participantName": "Alice O'Brien (test)",
|
|
49
|
+
"format": "mp4",
|
|
50
|
+
"metadata": {"type": "video"},
|
|
51
|
+
}
|
|
52
|
+
name = build_filename(rec)
|
|
53
|
+
assert name == "Alice_O_Brien__test__video_EG_001.mp4"
|
|
54
|
+
# No special characters remain
|
|
55
|
+
assert "'" not in name
|
|
56
|
+
assert "(" not in name
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def test_build_filename_uses_participant_id_fallback():
|
|
60
|
+
rec = {
|
|
61
|
+
"recordingId": "EG_001",
|
|
62
|
+
"participantId": "user_42",
|
|
63
|
+
"format": "mp4",
|
|
64
|
+
"metadata": {"type": "video"},
|
|
65
|
+
}
|
|
66
|
+
assert build_filename(rec) == "user_42_video_EG_001.mp4"
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def test_build_filename_audio_default_format():
|
|
70
|
+
"""Audio recording with no format field defaults to webm."""
|
|
71
|
+
rec = {
|
|
72
|
+
"recordingId": "EG_001",
|
|
73
|
+
"participantName": "Bob",
|
|
74
|
+
"metadata": {"type": "audio"},
|
|
75
|
+
}
|
|
76
|
+
assert build_filename(rec).endswith(".webm")
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
# ------------------------------------------------------------------
|
|
80
|
+
# download_file
|
|
81
|
+
# ------------------------------------------------------------------
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
@responses.activate
|
|
85
|
+
def test_download_file(tmp_path):
|
|
86
|
+
url = "https://storage.example.com/file.mp4"
|
|
87
|
+
content = b"fake video content " * 100
|
|
88
|
+
responses.get(url, body=content, status=200)
|
|
89
|
+
|
|
90
|
+
dest = tmp_path / "output.mp4"
|
|
91
|
+
written = download_file(url, dest)
|
|
92
|
+
|
|
93
|
+
assert dest.exists()
|
|
94
|
+
assert dest.read_bytes() == content
|
|
95
|
+
assert written == len(content)
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
@responses.activate
|
|
99
|
+
def test_download_file_raises_on_error(tmp_path):
|
|
100
|
+
url = "https://storage.example.com/missing.mp4"
|
|
101
|
+
responses.get(url, status=404)
|
|
102
|
+
|
|
103
|
+
dest = tmp_path / "output.mp4"
|
|
104
|
+
with pytest.raises(Exception):
|
|
105
|
+
download_file(url, dest)
|
|
@@ -1,78 +0,0 @@
|
|
|
1
|
-
"""Tests for DataFrame conversion (pandas and polars)."""
|
|
2
|
-
|
|
3
|
-
from __future__ import annotations
|
|
4
|
-
|
|
5
|
-
import pandas as pd
|
|
6
|
-
import pytest
|
|
7
|
-
|
|
8
|
-
from hyperstudy._dataframe import to_pandas, to_polars
|
|
9
|
-
|
|
10
|
-
SAMPLE_DATA = [
|
|
11
|
-
{
|
|
12
|
-
"id": "evt_001",
|
|
13
|
-
"onset": 1500,
|
|
14
|
-
"timestamp": "2024-06-15T10:00:01.500Z",
|
|
15
|
-
"category": "component",
|
|
16
|
-
},
|
|
17
|
-
{
|
|
18
|
-
"id": "evt_002",
|
|
19
|
-
"onset": 3200,
|
|
20
|
-
"timestamp": "2024-06-15T10:00:03.200Z",
|
|
21
|
-
"category": "component",
|
|
22
|
-
},
|
|
23
|
-
]
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
# ------------------------------------------------------------------
|
|
27
|
-
# Pandas
|
|
28
|
-
# ------------------------------------------------------------------
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
def test_to_pandas_creates_dataframe():
|
|
32
|
-
df = to_pandas(SAMPLE_DATA)
|
|
33
|
-
assert isinstance(df, pd.DataFrame)
|
|
34
|
-
assert len(df) == 2
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
def test_to_pandas_onset_sec():
|
|
38
|
-
df = to_pandas(SAMPLE_DATA)
|
|
39
|
-
assert "onset_sec" in df.columns
|
|
40
|
-
assert df["onset_sec"].iloc[0] == pytest.approx(1.5)
|
|
41
|
-
assert df["onset_sec"].iloc[1] == pytest.approx(3.2)
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
def test_to_pandas_timestamp_parsed():
|
|
45
|
-
df = to_pandas(SAMPLE_DATA)
|
|
46
|
-
assert pd.api.types.is_datetime64_any_dtype(df["timestamp"])
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
def test_to_pandas_empty():
|
|
50
|
-
df = to_pandas([])
|
|
51
|
-
assert isinstance(df, pd.DataFrame)
|
|
52
|
-
assert df.empty
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
# ------------------------------------------------------------------
|
|
56
|
-
# Polars
|
|
57
|
-
# ------------------------------------------------------------------
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
def test_to_polars_creates_dataframe():
|
|
61
|
-
polars = pytest.importorskip("polars")
|
|
62
|
-
df = to_polars(SAMPLE_DATA)
|
|
63
|
-
assert isinstance(df, polars.DataFrame)
|
|
64
|
-
assert len(df) == 2
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
def test_to_polars_onset_sec():
|
|
68
|
-
pytest.importorskip("polars")
|
|
69
|
-
df = to_polars(SAMPLE_DATA)
|
|
70
|
-
assert "onset_sec" in df.columns
|
|
71
|
-
assert df["onset_sec"][0] == pytest.approx(1.5)
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
def test_to_polars_empty():
|
|
75
|
-
polars = pytest.importorskip("polars")
|
|
76
|
-
df = to_polars([])
|
|
77
|
-
assert isinstance(df, polars.DataFrame)
|
|
78
|
-
assert len(df) == 0
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|