hyperstudy 0.2.1__tar.gz → 0.2.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. {hyperstudy-0.2.1 → hyperstudy-0.2.2}/PKG-INFO +1 -1
  2. hyperstudy-0.2.2/docs/superpowers/specs/2026-04-10-recording-downloads-design.md +128 -0
  3. {hyperstudy-0.2.1 → hyperstudy-0.2.2}/pyproject.toml +1 -1
  4. {hyperstudy-0.2.1 → hyperstudy-0.2.2}/src/hyperstudy/__init__.py +1 -1
  5. {hyperstudy-0.2.1 → hyperstudy-0.2.2}/src/hyperstudy/_dataframe.py +47 -0
  6. hyperstudy-0.2.2/src/hyperstudy/_downloads.py +50 -0
  7. {hyperstudy-0.2.1 → hyperstudy-0.2.2}/src/hyperstudy/client.py +124 -0
  8. {hyperstudy-0.2.1 → hyperstudy-0.2.2}/tests/conftest.py +10 -0
  9. hyperstudy-0.2.2/tests/fixtures/recordings_response.json +71 -0
  10. hyperstudy-0.2.2/tests/fixtures/sparse_ratings_response.json +108 -0
  11. {hyperstudy-0.2.1 → hyperstudy-0.2.2}/tests/test_client.py +140 -0
  12. hyperstudy-0.2.2/tests/test_dataframe.py +182 -0
  13. hyperstudy-0.2.2/tests/test_downloads.py +105 -0
  14. hyperstudy-0.2.1/tests/test_dataframe.py +0 -78
  15. {hyperstudy-0.2.1 → hyperstudy-0.2.2}/.github/workflows/publish.yml +0 -0
  16. {hyperstudy-0.2.1 → hyperstudy-0.2.2}/.github/workflows/sync-release-notes.yml +0 -0
  17. {hyperstudy-0.2.1 → hyperstudy-0.2.2}/.github/workflows/test.yml +0 -0
  18. {hyperstudy-0.2.1 → hyperstudy-0.2.2}/.gitignore +0 -0
  19. {hyperstudy-0.2.1 → hyperstudy-0.2.2}/CHANGELOG.md +0 -0
  20. {hyperstudy-0.2.1 → hyperstudy-0.2.2}/LICENSE +0 -0
  21. {hyperstudy-0.2.1 → hyperstudy-0.2.2}/README.md +0 -0
  22. {hyperstudy-0.2.1 → hyperstudy-0.2.2}/src/hyperstudy/_display.py +0 -0
  23. {hyperstudy-0.2.1 → hyperstudy-0.2.2}/src/hyperstudy/_http.py +0 -0
  24. {hyperstudy-0.2.1 → hyperstudy-0.2.2}/src/hyperstudy/_pagination.py +0 -0
  25. {hyperstudy-0.2.1 → hyperstudy-0.2.2}/src/hyperstudy/_types.py +0 -0
  26. {hyperstudy-0.2.1 → hyperstudy-0.2.2}/src/hyperstudy/exceptions.py +0 -0
  27. {hyperstudy-0.2.1 → hyperstudy-0.2.2}/src/hyperstudy/experiments.py +0 -0
  28. {hyperstudy-0.2.1 → hyperstudy-0.2.2}/tests/__init__.py +0 -0
  29. {hyperstudy-0.2.1 → hyperstudy-0.2.2}/tests/fixtures/deployment_sessions_response.json +0 -0
  30. {hyperstudy-0.2.1 → hyperstudy-0.2.2}/tests/fixtures/deployment_single_response.json +0 -0
  31. {hyperstudy-0.2.1 → hyperstudy-0.2.2}/tests/fixtures/deployments_list_response.json +0 -0
  32. {hyperstudy-0.2.1 → hyperstudy-0.2.2}/tests/fixtures/error_401.json +0 -0
  33. {hyperstudy-0.2.1 → hyperstudy-0.2.2}/tests/fixtures/error_403.json +0 -0
  34. {hyperstudy-0.2.1 → hyperstudy-0.2.2}/tests/fixtures/events_response.json +0 -0
  35. {hyperstudy-0.2.1 → hyperstudy-0.2.2}/tests/fixtures/experiment_single_response.json +0 -0
  36. {hyperstudy-0.2.1 → hyperstudy-0.2.2}/tests/fixtures/experiments_list_response.json +0 -0
  37. {hyperstudy-0.2.1 → hyperstudy-0.2.2}/tests/fixtures/paginated_page1.json +0 -0
  38. {hyperstudy-0.2.1 → hyperstudy-0.2.2}/tests/fixtures/paginated_page2.json +0 -0
  39. {hyperstudy-0.2.1 → hyperstudy-0.2.2}/tests/fixtures/pre_experiment_response.json +0 -0
  40. {hyperstudy-0.2.1 → hyperstudy-0.2.2}/tests/fixtures/warnings_response.json +0 -0
  41. {hyperstudy-0.2.1 → hyperstudy-0.2.2}/tests/test_experiments.py +0 -0
  42. {hyperstudy-0.2.1 → hyperstudy-0.2.2}/tests/test_pagination.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hyperstudy
3
- Version: 0.2.1
3
+ Version: 0.2.2
4
4
  Summary: Python SDK for the HyperStudy experiment platform API
5
5
  Project-URL: Homepage, https://hyperstudy.io
6
6
  Project-URL: Documentation, https://docs.hyperstudy.io/developers/python-sdk
@@ -0,0 +1,128 @@
1
+ # Recording Downloads via Python SDK
2
+
3
+ ## Problem
4
+
5
+ The Python SDK's `get_recordings()` returns metadata only. Users need the actual audio/video files for offline analysis (ML models, manual review, archival). Currently they must manually extract `downloadUrl` from each record and fetch files themselves.
6
+
7
+ ## Decision: SDK-only, no backend changes
8
+
9
+ The V3 API already returns signed GCS download URLs (7-day expiry) in the recording metadata. The SDK will fetch metadata and download files in the same call, so URL expiry is not a practical concern. This matches how the frontend downloads recordings.
10
+
11
+ ## API Surface
12
+
13
+ ### `download_recordings()` — Bulk download
14
+
15
+ ```python
16
+ df = hs.download_recordings(
17
+ "exp_abc123",
18
+ output_dir="./data/recordings",
19
+ scope="experiment", # "experiment" | "room" | "participant"
20
+ deployment_id=None, # optional filter
21
+ room_id=None, # optional filter
22
+ recording_type=None, # "audio" | "video" | None (both)
23
+ progress=True, # tqdm progress bar
24
+ skip_existing=True, # skip files already on disk with matching size
25
+ )
26
+ ```
27
+
28
+ **Returns**: `pandas.DataFrame` with all recording metadata columns plus:
29
+ - `local_path` — absolute path to the downloaded file on disk
30
+ - `download_status` — `"downloaded"`, `"skipped"`, or `"failed"`
31
+
32
+ **Side effects**:
33
+ - Writes media files to `output_dir`
34
+ - Writes `recordings_metadata.csv` to `output_dir`
35
+
36
+ ### `download_recording()` — Single recording
37
+
38
+ ```python
39
+ path = hs.download_recording(
40
+ recording, # dict from get_recordings(output="dict")
41
+ output_dir="./data/recordings",
42
+ )
43
+ ```
44
+
45
+ **Returns**: `pathlib.Path` to downloaded file.
46
+
47
+ ## Directory Structure
48
+
49
+ ```
50
+ output_dir/
51
+ recordings_metadata.csv
52
+ user1_video_EG_abc123.mp4
53
+ user1_audio_EG_def456.webm
54
+ user2_video_EG_ghi789.mp4
55
+ ```
56
+
57
+ **Filename pattern**: `{participantName}_{recordingType}_{recordingId}.{ext}`
58
+
59
+ - `participantName`: from recording metadata, sanitized for filesystem safety
60
+ - `recordingType`: `"video"` or `"audio"` from `metadata.type`
61
+ - `recordingId`: the `recordingId` field, falling back to `egressId`
62
+ - `ext`: from `format` field, falling back to `mp4` (video) or `webm` (audio)
63
+
64
+ ## Internal Design
65
+
66
+ ### Download flow (`download_recordings`)
67
+
68
+ 1. Call `self.get_recordings(scope_id, scope=scope, output="dict")` to get metadata
69
+ 2. Filter by `recording_type` if specified (via `metadata.type`)
70
+ 3. Create `output_dir` via `os.makedirs(exist_ok=True)`
71
+ 4. For each recording:
72
+ - Build filename using pattern above
73
+ - If `skip_existing=True` and file exists with size matching `fileSize` metadata, mark as `"skipped"`
74
+ - Otherwise, fetch from `downloadUrl` (fallback: `url`) using streaming HTTP GET
75
+ - Write to disk in 64 KB chunks
76
+ - Mark as `"downloaded"` or `"failed"` (with warning logged)
77
+ 5. Build DataFrame from metadata, add `local_path` and `download_status` columns
78
+ 6. Write `recordings_metadata.csv` to `output_dir`
79
+ 7. Return DataFrame
80
+
81
+ ### Streaming downloads
82
+
83
+ Use `requests.get(url, stream=True)` with chunked iteration to avoid loading large video files into memory. The SDK's existing `HttpTransport` handles JSON responses only, so file downloads use a standalone `requests.get()` — the signed GCS URLs don't need API key auth.
84
+
85
+ ### Error handling
86
+
87
+ - Per-file failure tolerance: if one recording fails (404, timeout, network error), log a warning, set `download_status="failed"`, continue with remaining files
88
+ - If the metadata API call itself fails, raise normally (same as `get_recordings()`)
89
+ - Invalid/missing `downloadUrl`: set `download_status="failed"`, log warning
90
+
91
+ ### Skip-existing logic
92
+
93
+ Compare `os.path.getsize(local_path)` against `fileSize` from metadata. If `fileSize` is `None` (metadata missing), fall back to checking file existence only (any existing file is considered complete).
94
+
95
+ ## File Layout
96
+
97
+ | File | Change |
98
+ |------|--------|
99
+ | `src/hyperstudy/_downloads.py` | **New.** `build_filename()`, `download_file()` streaming helper |
100
+ | `src/hyperstudy/client.py` | Add `download_recordings()` and `download_recording()` methods |
101
+ | `tests/test_downloads.py` | **New.** Unit tests for filename building, skip logic, status tracking |
102
+ | `tests/test_client.py` | Integration test: mock API + GCS, verify files + DataFrame |
103
+ | `tests/fixtures/sparse_ratings_response.json` | Already exists (from prior work) |
104
+
105
+ ## Testing
106
+
107
+ ### Unit tests (`tests/test_downloads.py`)
108
+ - `test_build_filename` — video, audio, missing fields, filesystem-unsafe characters
109
+ - `test_build_filename_dedup` — duplicate names get numeric suffix
110
+ - `test_skip_existing_matching_size` — file with correct size is skipped
111
+ - `test_skip_existing_wrong_size` — file with wrong size is re-downloaded
112
+
113
+ ### Integration tests (`tests/test_client.py`)
114
+ - `test_download_recordings` — mock API + GCS fetch, verify files on disk, CSV sidecar, DataFrame with `local_path` + `download_status`
115
+ - `test_download_recordings_filter_type` — `recording_type="audio"` only downloads audio
116
+ - `test_download_recording_single` — single recording download
117
+
118
+ ### Mocking strategy
119
+ - V3 API: `responses` library (existing pattern)
120
+ - GCS signed URL: also `responses` (it's just an HTTP GET to a URL)
121
+ - File I/O: real writes to `pytest` `tmp_path`
122
+
123
+ ## No Backend Changes Required
124
+
125
+ The existing V3 API endpoints return all necessary data:
126
+ - `GET /api/v3/data/recordings/{scope}/{scopeId}` returns metadata with `downloadUrl`
127
+ - Signed GCS URLs are valid for 7 days
128
+ - SDK downloads immediately after fetching metadata, so expiry is not an issue
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "hyperstudy"
7
- version = "0.2.1"
7
+ version = "0.2.2"
8
8
  description = "Python SDK for the HyperStudy experiment platform API"
9
9
  readme = "README.md"
10
10
  license = "MIT"
@@ -19,7 +19,7 @@ from .exceptions import (
19
19
  ValidationError,
20
20
  )
21
21
 
22
- __version__ = "0.2.0"
22
+ __version__ = "0.2.2"
23
23
 
24
24
  __all__ = [
25
25
  "HyperStudy",
@@ -6,6 +6,51 @@ from typing import Any
6
6
 
7
7
  import pandas as pd
8
8
 
9
+ # Nested dict fields to flatten into top-level columns.
10
+ # Mapping of {field_name: prefix} — sub-keys become ``{prefix}_{sub_key}``.
11
+ FLATTEN_FIELDS: dict[str, str] = {
12
+ "sparseRatingData": "sparseRatingData",
13
+ "metadata": "metadata",
14
+ }
15
+
16
+
17
+ def _flatten_nested_dicts(
18
+ data: list[dict[str, Any]],
19
+ fields: dict[str, str] | None = None,
20
+ ) -> list[dict[str, Any]]:
21
+ """Promote sub-keys of nested dict fields to top-level keys.
22
+
23
+ For each *field* present in a record whose value is a ``dict``, every
24
+ sub-key is copied to ``{prefix}_{sub_key}``. The original nested dict
25
+ is preserved for backward compatibility.
26
+
27
+ Records where the target field is ``None`` or missing are left
28
+ untouched — downstream DataFrame construction fills those columns
29
+ with ``NaN`` / ``null``.
30
+ """
31
+ if not data:
32
+ return data
33
+
34
+ fields = fields if fields is not None else FLATTEN_FIELDS
35
+
36
+ # Quick check on first record — skip work when no target fields exist.
37
+ sample = data[0]
38
+ targets = [f for f in fields if f in sample and isinstance(sample[f], dict)]
39
+ if not targets:
40
+ return data
41
+
42
+ out: list[dict[str, Any]] = []
43
+ for record in data:
44
+ record = dict(record) # shallow copy to avoid mutating caller's data
45
+ for field in targets:
46
+ nested = record.get(field)
47
+ if isinstance(nested, dict):
48
+ prefix = fields[field]
49
+ for sub_key, sub_val in nested.items():
50
+ record[f"{prefix}_{sub_key}"] = sub_val
51
+ out.append(record)
52
+ return out
53
+
9
54
 
10
55
  def _post_process(df: pd.DataFrame) -> pd.DataFrame:
11
56
  """Shared post-processing for pandas DataFrames.
@@ -32,6 +77,7 @@ def to_pandas(data: list[dict[str, Any]]) -> pd.DataFrame:
32
77
  """Convert API response data to a pandas DataFrame with post-processing."""
33
78
  if not data:
34
79
  return pd.DataFrame()
80
+ data = _flatten_nested_dicts(data)
35
81
  df = pd.DataFrame(data)
36
82
  return _post_process(df)
37
83
 
@@ -51,6 +97,7 @@ def to_polars(data: list[dict[str, Any]]):
51
97
  if not data:
52
98
  return pl.DataFrame()
53
99
 
100
+ data = _flatten_nested_dicts(data)
54
101
  df = pl.DataFrame(data)
55
102
 
56
103
  # Parse timestamps
@@ -0,0 +1,50 @@
1
+ """Helpers for downloading recording files from signed URLs."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+ from pathlib import Path
7
+ from typing import Any
8
+
9
+ import requests
10
+
11
_CHUNK_SIZE = 65536  # 64 KB — good balance for large video files
_UNSAFE_RE = re.compile(r"[^\w\-]")


def get_download_url(recording: dict[str, Any]) -> str | None:
    """Return the best download URL from a recording dict, or ``None``.

    Prefers the signed ``downloadUrl`` over the plain ``url``; empty
    strings count as missing.
    """
    for key in ("downloadUrl", "url"):
        candidate = recording.get(key)
        if candidate:
            return candidate
    return None
18
+
19
+
20
def build_filename(recording: dict[str, Any]) -> str:
    """Build a filesystem-safe filename from recording metadata.

    Pattern: ``{participantName}_{type}_{recordingId}.{ext}``

    Every component is sanitized (non-word characters replaced by ``_``),
    not just the participant name — the ID and format also originate from
    API metadata and could otherwise smuggle in path separators.
    """
    # Participant label: prefer display name, fall back to the ID.
    name = recording.get("participantName") or recording.get("participantId") or "unknown"

    # "video" / "audio" lives under the nested metadata dict.
    meta = recording.get("metadata") or {}
    rec_type = meta.get("type") or "recording"

    rec_id = recording.get("recordingId") or recording.get("egressId") or "unknown"

    # Extension: explicit format field, else a sensible default per type.
    fmt = recording.get("format")
    if not fmt:
        fmt = "webm" if rec_type == "audio" else "mp4"

    safe = [re.sub(r"[^\w\-]", "_", str(part)) for part in (name, rec_type, rec_id, fmt)]
    return f"{safe[0]}_{safe[1]}_{safe[2]}.{safe[3]}"
38
+
39
+
40
def download_file(url: str, dest: Path, timeout: int = 300) -> int:
    """Stream-download *url* to *dest* and return bytes written.

    Streams in ``_CHUNK_SIZE`` chunks to avoid loading large media files
    into memory. The data is written to a temporary ``.part`` file and
    renamed into place on success, so a failed or interrupted download
    never leaves a truncated file at *dest* (which skip-existing logic
    could otherwise mistake for a complete download when no ``fileSize``
    metadata is available).

    Raises:
        requests.HTTPError: On a non-2xx response.
    """
    tmp = dest.with_name(dest.name + ".part")
    resp = requests.get(url, stream=True, timeout=timeout)
    try:
        resp.raise_for_status()
        written = 0
        try:
            with open(tmp, "wb") as fh:
                for chunk in resp.iter_content(chunk_size=_CHUNK_SIZE):
                    if chunk:  # filter out keep-alive empty chunks
                        fh.write(chunk)
                        written += len(chunk)
            tmp.replace(dest)  # atomic rename on POSIX
        except BaseException:
            # Clean up the partial file before propagating the error.
            tmp.unlink(missing_ok=True)
            raise
    finally:
        resp.close()  # release the connection back to the pool
    return written
@@ -2,9 +2,14 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
+ import warnings
6
+ from pathlib import Path
5
7
  from typing import Any
6
8
 
9
+ from tqdm.auto import tqdm
10
+
7
11
  from ._dataframe import to_pandas, to_polars
12
+ from ._downloads import build_filename, download_file, get_download_url
8
13
  from ._http import HttpTransport
9
14
  from ._pagination import fetch_all_pages
10
15
  from ._types import Scope
@@ -466,6 +471,125 @@ class HyperStudy(ExperimentMixin):
466
471
  "consent": self.get_consent(participant_id, **common),
467
472
  }
468
473
 
474
+ # ------------------------------------------------------------------
475
+ # Recording downloads
476
+ # ------------------------------------------------------------------
477
+
478
+ def download_recording(
479
+ self,
480
+ recording: dict[str, Any],
481
+ output_dir: str = ".",
482
+ ) -> Path:
483
+ """Download a single recording file to disk.
484
+
485
+ Args:
486
+ recording: A recording dict (from ``get_recordings(output="dict")``).
487
+ output_dir: Directory to save the file.
488
+
489
+ Returns:
490
+ Path to the downloaded file.
491
+ """
492
+ url = get_download_url(recording)
493
+ if not url:
494
+ raise ValueError("Recording has no downloadUrl or url field")
495
+
496
+ dest_dir = Path(output_dir)
497
+ dest_dir.mkdir(parents=True, exist_ok=True)
498
+
499
+ filename = build_filename(recording)
500
+ dest = dest_dir / filename
501
+ download_file(url, dest)
502
+ return dest
503
+
504
    def download_recordings(
        self,
        scope_id: str,
        *,
        output_dir: str,
        scope: str = "experiment",
        deployment_id: str | None = None,
        room_id: str | None = None,
        recording_type: str | None = None,
        progress: bool = True,
        skip_existing: bool = True,
    ):
        """Download recording files to disk.

        Fetches recording metadata, downloads each file from its signed
        URL, writes a ``recordings_metadata.csv`` sidecar, and returns a
        DataFrame with a ``local_path`` column.

        Args:
            scope_id: Experiment, room, or participant ID.
            output_dir: Directory to save files.
            scope: ``"experiment"``, ``"room"``, or ``"participant"``.
            deployment_id: Filter by deployment (experiment scope only).
            room_id: Filter by room.
            recording_type: ``"audio"``, ``"video"``, or ``None`` (both).
            progress: Show progress bar.
            skip_existing: Skip files already on disk with matching size.

        Returns:
            pandas DataFrame with recording metadata plus ``local_path``
            and ``download_status`` columns; ``download_status`` is one
            of ``"downloaded"``, ``"skipped"``, or ``"failed"``.
        """
        # Metadata fetch failures propagate normally; only per-file
        # download errors are tolerated (marked "failed" below).
        recordings = self.get_recordings(
            scope_id,
            scope=scope,
            deployment_id=deployment_id,
            room_id=room_id,
            output="dict",
        )

        # Optional type filter, keyed on the nested metadata.type field.
        if recording_type:
            recordings = [
                r for r in recordings
                if (r.get("metadata") or {}).get("type") == recording_type
            ]

        dest_dir = Path(output_dir)
        dest_dir.mkdir(parents=True, exist_ok=True)

        # Parallel lists with exactly one entry per recording — every
        # path through the loop appends to both so they stay aligned
        # with `recordings` for the DataFrame column assignment below.
        local_paths: list[str | None] = []
        statuses: list[str] = []

        for rec in tqdm(recordings, desc="Downloading recordings", disable=not progress):
            filename = build_filename(rec)
            dest = dest_dir / filename

            url = get_download_url(rec)
            if not url:
                local_paths.append(None)
                statuses.append("failed")
                warnings.warn(f"Recording {rec.get('recordingId')} has no download URL")
                continue

            # Skip-existing: when fileSize metadata is absent, mere
            # existence counts as complete; otherwise sizes must match
            # or the file falls through and is re-downloaded.
            if skip_existing and dest.exists():
                expected_size = rec.get("fileSize")
                if expected_size is None or dest.stat().st_size == expected_size:
                    local_paths.append(str(dest.resolve()))
                    statuses.append("skipped")
                    continue

            # Per-file failure tolerance: warn and continue so one bad
            # URL doesn't abort the whole batch.
            try:
                download_file(url, dest)
                local_paths.append(str(dest.resolve()))
                statuses.append("downloaded")
            except Exception as exc:
                local_paths.append(None)
                statuses.append("failed")
                warnings.warn(
                    f"Failed to download recording {rec.get('recordingId')}: {exc}"
                )

        df = to_pandas(recordings)
        if not df.empty:
            df["local_path"] = local_paths
            df["download_status"] = statuses
            # CSV sidecar so the downloaded files stay self-describing.
            df.to_csv(dest_dir / "recordings_metadata.csv", index=False)

        return df
592
+
469
593
  # ------------------------------------------------------------------
470
594
  # Internal helpers
471
595
  # ------------------------------------------------------------------
@@ -71,6 +71,16 @@ def deployment_sessions_response():
71
71
  return load_fixture("deployment_sessions_response.json")
72
72
 
73
73
 
74
@pytest.fixture
def sparse_ratings_response():
    """Sparse-ratings API payload loaded from fixtures/sparse_ratings_response.json."""
    return load_fixture("sparse_ratings_response.json")


@pytest.fixture
def recordings_response():
    """Recordings API payload loaded from fixtures/recordings_response.json."""
    return load_fixture("recordings_response.json")
82
+
83
+
74
84
  @pytest.fixture
75
85
  def warnings_response():
76
86
  return load_fixture("warnings_response.json")
@@ -0,0 +1,71 @@
1
+ {
2
+ "status": "success",
3
+ "metadata": {
4
+ "dataType": "recordings",
5
+ "scope": "experiment",
6
+ "scopeId": "exp_abc123",
7
+ "timestamp": "2024-06-15T10:00:00.000Z",
8
+ "query": {
9
+ "limit": 1000,
10
+ "offset": 0,
11
+ "sort": "startTime",
12
+ "order": "asc"
13
+ },
14
+ "pagination": {
15
+ "total": 2,
16
+ "returned": 2,
17
+ "hasMore": false,
18
+ "limit": 1000,
19
+ "offset": 0
20
+ },
21
+ "processing": {
22
+ "processingTimeMs": 35,
23
+ "enriched": true,
24
+ "version": "3.0.0"
25
+ }
26
+ },
27
+ "data": [
28
+ {
29
+ "recordingId": "EG_video_001",
30
+ "egressId": "EG_video_001",
31
+ "participantId": "user_1",
32
+ "participantName": "Alice",
33
+ "startTime": "2024-06-15T10:00:05.000Z",
34
+ "endTime": "2024-06-15T10:05:05.000Z",
35
+ "duration": 300000,
36
+ "videoOffset": 500,
37
+ "url": "https://storage.googleapis.com/bucket/recordings/video1.mp4",
38
+ "downloadUrl": "https://storage.googleapis.com/bucket/recordings/video1.mp4?X-Goog-Signature=abc",
39
+ "fileSize": 1024,
40
+ "format": "mp4",
41
+ "status": "complete",
42
+ "metadata": {
43
+ "type": "video",
44
+ "recordingType": "individual",
45
+ "roomName": "room_1",
46
+ "experimentId": "exp_abc123"
47
+ }
48
+ },
49
+ {
50
+ "recordingId": "EG_audio_002",
51
+ "egressId": "EG_audio_002",
52
+ "participantId": "user_1",
53
+ "participantName": "Alice",
54
+ "startTime": "2024-06-15T10:00:05.000Z",
55
+ "endTime": "2024-06-15T10:05:05.000Z",
56
+ "duration": 300000,
57
+ "videoOffset": 500,
58
+ "url": "https://storage.googleapis.com/bucket/recordings/audio1.webm",
59
+ "downloadUrl": "https://storage.googleapis.com/bucket/recordings/audio1.webm?X-Goog-Signature=def",
60
+ "fileSize": 512,
61
+ "format": "webm",
62
+ "status": "complete",
63
+ "metadata": {
64
+ "type": "audio",
65
+ "recordingType": "audio",
66
+ "roomName": "room_1",
67
+ "experimentId": "exp_abc123"
68
+ }
69
+ }
70
+ ]
71
+ }
@@ -0,0 +1,108 @@
1
+ {
2
+ "status": "success",
3
+ "metadata": {
4
+ "dataType": "ratings",
5
+ "ratingType": "sparse",
6
+ "scope": "experiment",
7
+ "scopeId": "exp_abc123",
8
+ "timestamp": "2024-06-15T10:00:00.000Z",
9
+ "query": {
10
+ "startTime": null,
11
+ "endTime": null,
12
+ "limit": 1000,
13
+ "offset": 0,
14
+ "sort": "timestamp",
15
+ "order": "asc"
16
+ },
17
+ "pagination": {
18
+ "total": 2,
19
+ "returned": 2,
20
+ "hasMore": false,
21
+ "limit": 1000,
22
+ "offset": 0
23
+ },
24
+ "processing": {
25
+ "processingTimeMs": 58,
26
+ "enriched": true,
27
+ "version": "3.0.0"
28
+ }
29
+ },
30
+ "data": [
31
+ {
32
+ "ratingId": "rat_001",
33
+ "participantId": "user_1",
34
+ "timestamp": "2024-06-15T10:01:30.000Z",
35
+ "onset": 8500,
36
+ "rawOnset": 8520,
37
+ "clockOffsetApplied": 20,
38
+ "value": 72.5,
39
+ "rawValue": 58,
40
+ "scale": { "min": 0, "max": 80 },
41
+ "type": "sparse",
42
+ "stateId": "state_video_1",
43
+ "stimulusId": "video_abc",
44
+ "stimulusTime": 5000,
45
+ "responseTime": 2100,
46
+ "confidence": null,
47
+ "metadata": {
48
+ "question": "How engaging is this video?",
49
+ "dimension": "engagement",
50
+ "componentType": "vasrating",
51
+ "sampleIndex": 0
52
+ },
53
+ "ratingEndOnset": 10600,
54
+ "sparseRatingData": {
55
+ "videoId": "video_abc",
56
+ "pauseIndex": 0,
57
+ "videoRelativeTime": 5000,
58
+ "pauseTimestamp": 1718445690000,
59
+ "componentType": "vasrating",
60
+ "componentData": { "value": 58 },
61
+ "previousRatings": null,
62
+ "mediaPauseOnset": 8200,
63
+ "mediaResumeOnset": 10800,
64
+ "actualPauseDuration": 2600
65
+ },
66
+ "stateStartTime": "2024-06-15T10:00:00.000Z",
67
+ "stateDuration": 60000
68
+ },
69
+ {
70
+ "ratingId": "rat_002",
71
+ "participantId": "user_1",
72
+ "timestamp": "2024-06-15T10:02:45.000Z",
73
+ "onset": 25300,
74
+ "rawOnset": 25320,
75
+ "clockOffsetApplied": 20,
76
+ "value": 45.0,
77
+ "rawValue": 36,
78
+ "scale": { "min": 0, "max": 80 },
79
+ "type": "sparse",
80
+ "stateId": "state_video_1",
81
+ "stimulusId": "video_abc",
82
+ "stimulusTime": 20000,
83
+ "responseTime": 1800,
84
+ "confidence": null,
85
+ "metadata": {
86
+ "question": "How engaging is this video?",
87
+ "dimension": "engagement",
88
+ "componentType": "vasrating",
89
+ "sampleIndex": 1
90
+ },
91
+ "ratingEndOnset": 27100,
92
+ "sparseRatingData": {
93
+ "videoId": "video_abc",
94
+ "pauseIndex": 1,
95
+ "videoRelativeTime": 20000,
96
+ "pauseTimestamp": 1718445765000,
97
+ "componentType": "vasrating",
98
+ "componentData": { "value": 36 },
99
+ "previousRatings": { "video_abc": 58 },
100
+ "mediaPauseOnset": 25000,
101
+ "mediaResumeOnset": 27300,
102
+ "actualPauseDuration": 2300
103
+ },
104
+ "stateStartTime": "2024-06-15T10:00:00.000Z",
105
+ "stateDuration": 60000
106
+ }
107
+ ]
108
+ }
@@ -224,6 +224,24 @@ def test_get_ratings_sparse(api_key, events_response):
224
224
  assert isinstance(df, pd.DataFrame)
225
225
 
226
226
 
227
@responses.activate
def test_get_ratings_sparse_flattens_data(api_key, sparse_ratings_response):
    """Sparse ratings DataFrame contains flattened sparseRatingData columns."""
    # Single-page sparse-ratings payload for the experiment scope.
    responses.get(
        f"{BASE_URL}/data/ratings/sparse/experiment/exp_abc123",
        json=sparse_ratings_response,
        status=200,
    )
    client = HyperStudy(api_key=api_key, base_url=BASE_URL)
    df = client.get_ratings("exp_abc123", kind="sparse", limit=1000)
    assert isinstance(df, pd.DataFrame)
    # Nested sparseRatingData / metadata sub-keys are promoted to
    # prefixed top-level columns during DataFrame conversion.
    assert "sparseRatingData_mediaPauseOnset" in df.columns
    assert "sparseRatingData_mediaResumeOnset" in df.columns
    assert "sparseRatingData_actualPauseDuration" in df.columns
    assert "metadata_question" in df.columns
    # Value comes straight from the first record of the fixture.
    assert df["sparseRatingData_mediaPauseOnset"].iloc[0] == 8200
243
+
244
+
227
245
  @responses.activate
228
246
  def test_get_sync_with_aggregation(api_key, events_response):
229
247
  """get_sync passes aggregationWindow param."""
@@ -237,6 +255,128 @@ def test_get_sync_with_aggregation(api_key, events_response):
237
255
  assert "aggregationWindow=5000" in responses.calls[0].request.url
238
256
 
239
257
 
258
+ # ------------------------------------------------------------------
259
+ # download_recordings
260
+ # ------------------------------------------------------------------
261
+
262
+
263
@responses.activate
def test_download_recordings(api_key, recordings_response, tmp_path):
    """download_recordings writes files, CSV sidecar, and returns DataFrame."""
    # Mock the metadata API
    responses.get(
        f"{BASE_URL}/data/recordings/experiment/exp_abc123",
        json=recordings_response,
        status=200,
    )
    # Mock the GCS signed URL downloads
    responses.get(
        recordings_response["data"][0]["downloadUrl"],
        body=b"fake video bytes",
        status=200,
    )
    responses.get(
        recordings_response["data"][1]["downloadUrl"],
        body=b"fake audio bytes",
        status=200,
    )

    client = HyperStudy(api_key=api_key, base_url=BASE_URL)
    df = client.download_recordings(
        "exp_abc123", output_dir=str(tmp_path), progress=False
    )

    # DataFrame mirrors the fixture's two recordings and carries the
    # two extra bookkeeping columns.
    assert isinstance(df, pd.DataFrame)
    assert len(df) == 2
    assert "local_path" in df.columns
    assert "download_status" in df.columns
    assert list(df["download_status"]) == ["downloaded", "downloaded"]

    # Files exist on disk, named {participantName}_{type}_{recordingId}.{ext}
    assert (tmp_path / "Alice_video_EG_video_001.mp4").exists()
    assert (tmp_path / "Alice_audio_EG_audio_002.webm").exists()
    assert (tmp_path / "Alice_video_EG_video_001.mp4").read_bytes() == b"fake video bytes"

    # CSV sidecar written
    assert (tmp_path / "recordings_metadata.csv").exists()
302
+
303
+
304
@responses.activate
def test_download_recordings_filter_type(api_key, recordings_response, tmp_path):
    """recording_type filter limits downloads to matching type."""
    responses.get(
        f"{BASE_URL}/data/recordings/experiment/exp_abc123",
        json=recordings_response,
        status=200,
    )
    # Only the audio recording (fixture index 1) should be fetched, so
    # only its signed URL gets a mock.
    responses.get(
        recordings_response["data"][1]["downloadUrl"],
        body=b"audio bytes",
        status=200,
    )

    client = HyperStudy(api_key=api_key, base_url=BASE_URL)
    df = client.download_recordings(
        "exp_abc123",
        output_dir=str(tmp_path),
        recording_type="audio",
        progress=False,
    )

    # One row (the audio recording); the video file was never written.
    assert len(df) == 1
    assert (tmp_path / "Alice_audio_EG_audio_002.webm").exists()
    assert not (tmp_path / "Alice_video_EG_video_001.mp4").exists()
329
+
330
+
331
@responses.activate
def test_download_recordings_skip_existing(api_key, recordings_response, tmp_path):
    """Files with matching size are skipped."""
    responses.get(
        f"{BASE_URL}/data/recordings/experiment/exp_abc123",
        json=recordings_response,
        status=200,
    )
    # Pre-create the video file with the expected size (1024 bytes)
    video_path = tmp_path / "Alice_video_EG_video_001.mp4"
    video_path.write_bytes(b"\x00" * 1024)

    # Only the audio file needs a mock download URL
    responses.get(
        recordings_response["data"][1]["downloadUrl"],
        body=b"\x00" * 512,
        status=200,
    )

    client = HyperStudy(api_key=api_key, base_url=BASE_URL)
    df = client.download_recordings(
        "exp_abc123", output_dir=str(tmp_path), progress=False
    )

    # Row order follows the fixture: [0] video (pre-existing, skipped),
    # [1] audio (freshly downloaded).
    assert df["download_status"].iloc[0] == "skipped"
    assert df["download_status"].iloc[1] == "downloaded"
357
+
358
+
359
@responses.activate
def test_download_recording_single(api_key, tmp_path):
    """download_recording downloads a single file."""
    url = "https://storage.example.com/rec.mp4"
    responses.get(url, body=b"video data", status=200)

    client = HyperStudy(api_key=api_key, base_url=BASE_URL)
    recording = {
        "recordingId": "EG_001",
        "participantName": "Bob",
        "downloadUrl": url,
        "format": "mp4",
        "metadata": {"type": "video"},
    }
    saved = client.download_recording(recording, output_dir=str(tmp_path))

    # File landed on disk under the standard naming pattern.
    assert saved.exists()
    assert saved.name == "Bob_video_EG_001.mp4"
    assert saved.read_bytes() == b"video data"
378
+
379
+
240
380
  # ------------------------------------------------------------------
241
381
  # get_all_data
242
382
  # ------------------------------------------------------------------
@@ -0,0 +1,182 @@
1
+ """Tests for DataFrame conversion (pandas and polars)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import pandas as pd
6
+ import pytest
7
+
8
+ from hyperstudy._dataframe import _flatten_nested_dicts, to_pandas, to_polars
9
+
10
+ SAMPLE_DATA = [
11
+ {
12
+ "id": "evt_001",
13
+ "onset": 1500,
14
+ "timestamp": "2024-06-15T10:00:01.500Z",
15
+ "category": "component",
16
+ },
17
+ {
18
+ "id": "evt_002",
19
+ "onset": 3200,
20
+ "timestamp": "2024-06-15T10:00:03.200Z",
21
+ "category": "component",
22
+ },
23
+ ]
24
+
25
+ SPARSE_RATING_DATA = [
26
+ {
27
+ "ratingId": "rat_001",
28
+ "onset": 8500,
29
+ "timestamp": "2024-06-15T10:01:30.000Z",
30
+ "value": 72.5,
31
+ "type": "sparse",
32
+ "metadata": {
33
+ "question": "How engaging?",
34
+ "dimension": "engagement",
35
+ "componentType": "vasrating",
36
+ },
37
+ "sparseRatingData": {
38
+ "videoId": "video_abc",
39
+ "pauseIndex": 0,
40
+ "mediaPauseOnset": 8200,
41
+ "mediaResumeOnset": 10800,
42
+ "actualPauseDuration": 2600,
43
+ "componentData": {"value": 58},
44
+ },
45
+ },
46
+ {
47
+ "ratingId": "rat_002",
48
+ "onset": 25300,
49
+ "timestamp": "2024-06-15T10:02:45.000Z",
50
+ "value": 45.0,
51
+ "type": "sparse",
52
+ "metadata": {
53
+ "question": "How engaging?",
54
+ "dimension": "engagement",
55
+ "componentType": "vasrating",
56
+ },
57
+ "sparseRatingData": {
58
+ "videoId": "video_abc",
59
+ "pauseIndex": 1,
60
+ "mediaPauseOnset": 25000,
61
+ "mediaResumeOnset": 27300,
62
+ "actualPauseDuration": 2300,
63
+ "componentData": {"value": 36},
64
+ },
65
+ },
66
+ ]
67
+
68
+
69
+ # ------------------------------------------------------------------
70
+ # Pandas
71
+ # ------------------------------------------------------------------
72
+
73
+
74
+ def test_to_pandas_creates_dataframe():
75
+ df = to_pandas(SAMPLE_DATA)
76
+ assert isinstance(df, pd.DataFrame)
77
+ assert len(df) == 2
78
+
79
+
80
+ def test_to_pandas_onset_sec():
81
+ df = to_pandas(SAMPLE_DATA)
82
+ assert "onset_sec" in df.columns
83
+ assert df["onset_sec"].iloc[0] == pytest.approx(1.5)
84
+ assert df["onset_sec"].iloc[1] == pytest.approx(3.2)
85
+
86
+
87
+ def test_to_pandas_timestamp_parsed():
88
+ df = to_pandas(SAMPLE_DATA)
89
+ assert pd.api.types.is_datetime64_any_dtype(df["timestamp"])
90
+
91
+
92
+ def test_to_pandas_empty():
93
+ df = to_pandas([])
94
+ assert isinstance(df, pd.DataFrame)
95
+ assert df.empty
96
+
97
+
98
+ # ------------------------------------------------------------------
99
+ # Polars
100
+ # ------------------------------------------------------------------
101
+
102
+
103
+ def test_to_polars_creates_dataframe():
104
+ polars = pytest.importorskip("polars")
105
+ df = to_polars(SAMPLE_DATA)
106
+ assert isinstance(df, polars.DataFrame)
107
+ assert len(df) == 2
108
+
109
+
110
+ def test_to_polars_onset_sec():
111
+ pytest.importorskip("polars")
112
+ df = to_polars(SAMPLE_DATA)
113
+ assert "onset_sec" in df.columns
114
+ assert df["onset_sec"][0] == pytest.approx(1.5)
115
+
116
+
117
+ def test_to_polars_empty():
118
+ polars = pytest.importorskip("polars")
119
+ df = to_polars([])
120
+ assert isinstance(df, polars.DataFrame)
121
+ assert len(df) == 0
122
+
123
+
124
+ # ------------------------------------------------------------------
125
+ # Nested dict flattening
126
+ # ------------------------------------------------------------------
127
+
128
+
129
+ def test_flatten_sparse_rating_data():
130
+ df = to_pandas(SPARSE_RATING_DATA)
131
+ assert "sparseRatingData_mediaPauseOnset" in df.columns
132
+ assert "sparseRatingData_mediaResumeOnset" in df.columns
133
+ assert "sparseRatingData_actualPauseDuration" in df.columns
134
+ assert "sparseRatingData_videoId" in df.columns
135
+ assert "sparseRatingData_pauseIndex" in df.columns
136
+ assert df["sparseRatingData_mediaPauseOnset"].iloc[0] == 8200
137
+ assert df["sparseRatingData_mediaPauseOnset"].iloc[1] == 25000
138
+
139
+
140
+ def test_flatten_metadata():
141
+ df = to_pandas(SPARSE_RATING_DATA)
142
+ assert "metadata_question" in df.columns
143
+ assert "metadata_dimension" in df.columns
144
+ assert "metadata_componentType" in df.columns
145
+ assert df["metadata_question"].iloc[0] == "How engaging?"
146
+
147
+
148
+ def test_flatten_preserves_original():
149
+ df = to_pandas(SPARSE_RATING_DATA)
150
+ assert "sparseRatingData" in df.columns
151
+ assert isinstance(df["sparseRatingData"].iloc[0], dict)
152
+ assert "metadata" in df.columns
153
+ assert isinstance(df["metadata"].iloc[0], dict)
154
+
155
+
156
+ def test_flatten_handles_none():
157
+ data = [
158
+ {"ratingId": "r1", "onset": 100, "sparseRatingData": None, "metadata": None},
159
+ ]
160
+ df = to_pandas(data)
161
+ assert "sparseRatingData" in df.columns
162
+ # No flattened columns since the nested value is None, not a dict
163
+ assert "sparseRatingData_mediaPauseOnset" not in df.columns
164
+
165
+
166
+ def test_flatten_no_target_fields():
167
+ """Data without any flatten-target fields passes through unchanged."""
168
+ result = _flatten_nested_dicts(SAMPLE_DATA)
169
+ assert result is SAMPLE_DATA # same object — no copy needed
170
+
171
+
172
+ def test_flatten_empty():
173
+ result = _flatten_nested_dicts([])
174
+ assert result == []
175
+
176
+
177
+ def test_flatten_polars():
178
+ pytest.importorskip("polars")
179
+ df = to_polars(SPARSE_RATING_DATA)
180
+ assert "sparseRatingData_mediaPauseOnset" in df.columns
181
+ assert "metadata_question" in df.columns
182
+ assert df["sparseRatingData_mediaPauseOnset"][0] == 8200
@@ -0,0 +1,105 @@
1
+ """Tests for recording download helpers."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import responses
6
+ import pytest
7
+
8
+ from hyperstudy._downloads import build_filename, download_file
9
+
10
+
11
+ # ------------------------------------------------------------------
12
+ # build_filename
13
+ # ------------------------------------------------------------------
14
+
15
+
16
+ VIDEO_RECORDING = {
17
+ "recordingId": "EG_video_001",
18
+ "participantName": "Alice",
19
+ "format": "mp4",
20
+ "metadata": {"type": "video"},
21
+ }
22
+
23
+ AUDIO_RECORDING = {
24
+ "recordingId": "EG_audio_002",
25
+ "participantName": "Alice",
26
+ "format": "webm",
27
+ "metadata": {"type": "audio"},
28
+ }
29
+
30
+
31
+ def test_build_filename_video():
32
+ assert build_filename(VIDEO_RECORDING) == "Alice_video_EG_video_001.mp4"
33
+
34
+
35
+ def test_build_filename_audio():
36
+ assert build_filename(AUDIO_RECORDING) == "Alice_audio_EG_audio_002.webm"
37
+
38
+
39
+ def test_build_filename_missing_fields():
40
+ rec = {"egressId": "EG_123"}
41
+ name = build_filename(rec)
42
+ assert name == "unknown_recording_EG_123.mp4"
43
+
44
+
45
+ def test_build_filename_sanitizes_name():
46
+ rec = {
47
+ "recordingId": "EG_001",
48
+ "participantName": "Alice O'Brien (test)",
49
+ "format": "mp4",
50
+ "metadata": {"type": "video"},
51
+ }
52
+ name = build_filename(rec)
53
+ assert name == "Alice_O_Brien__test__video_EG_001.mp4"
54
+ # No special characters remain
55
+ assert "'" not in name
56
+ assert "(" not in name
57
+
58
+
59
+ def test_build_filename_uses_participant_id_fallback():
60
+ rec = {
61
+ "recordingId": "EG_001",
62
+ "participantId": "user_42",
63
+ "format": "mp4",
64
+ "metadata": {"type": "video"},
65
+ }
66
+ assert build_filename(rec) == "user_42_video_EG_001.mp4"
67
+
68
+
69
+ def test_build_filename_audio_default_format():
70
+ """Audio recording with no format field defaults to webm."""
71
+ rec = {
72
+ "recordingId": "EG_001",
73
+ "participantName": "Bob",
74
+ "metadata": {"type": "audio"},
75
+ }
76
+ assert build_filename(rec).endswith(".webm")
77
+
78
+
79
+ # ------------------------------------------------------------------
80
+ # download_file
81
+ # ------------------------------------------------------------------
82
+
83
+
84
+ @responses.activate
85
+ def test_download_file(tmp_path):
86
+ url = "https://storage.example.com/file.mp4"
87
+ content = b"fake video content " * 100
88
+ responses.get(url, body=content, status=200)
89
+
90
+ dest = tmp_path / "output.mp4"
91
+ written = download_file(url, dest)
92
+
93
+ assert dest.exists()
94
+ assert dest.read_bytes() == content
95
+ assert written == len(content)
96
+
97
+
98
+ @responses.activate
99
+ def test_download_file_raises_on_error(tmp_path):
100
+ url = "https://storage.example.com/missing.mp4"
101
+ responses.get(url, status=404)
102
+
103
+ dest = tmp_path / "output.mp4"
104
+ with pytest.raises(Exception):
105
+ download_file(url, dest)
@@ -1,78 +0,0 @@
1
- """Tests for DataFrame conversion (pandas and polars)."""
2
-
3
- from __future__ import annotations
4
-
5
- import pandas as pd
6
- import pytest
7
-
8
- from hyperstudy._dataframe import to_pandas, to_polars
9
-
10
- SAMPLE_DATA = [
11
- {
12
- "id": "evt_001",
13
- "onset": 1500,
14
- "timestamp": "2024-06-15T10:00:01.500Z",
15
- "category": "component",
16
- },
17
- {
18
- "id": "evt_002",
19
- "onset": 3200,
20
- "timestamp": "2024-06-15T10:00:03.200Z",
21
- "category": "component",
22
- },
23
- ]
24
-
25
-
26
- # ------------------------------------------------------------------
27
- # Pandas
28
- # ------------------------------------------------------------------
29
-
30
-
31
- def test_to_pandas_creates_dataframe():
32
- df = to_pandas(SAMPLE_DATA)
33
- assert isinstance(df, pd.DataFrame)
34
- assert len(df) == 2
35
-
36
-
37
- def test_to_pandas_onset_sec():
38
- df = to_pandas(SAMPLE_DATA)
39
- assert "onset_sec" in df.columns
40
- assert df["onset_sec"].iloc[0] == pytest.approx(1.5)
41
- assert df["onset_sec"].iloc[1] == pytest.approx(3.2)
42
-
43
-
44
- def test_to_pandas_timestamp_parsed():
45
- df = to_pandas(SAMPLE_DATA)
46
- assert pd.api.types.is_datetime64_any_dtype(df["timestamp"])
47
-
48
-
49
- def test_to_pandas_empty():
50
- df = to_pandas([])
51
- assert isinstance(df, pd.DataFrame)
52
- assert df.empty
53
-
54
-
55
- # ------------------------------------------------------------------
56
- # Polars
57
- # ------------------------------------------------------------------
58
-
59
-
60
- def test_to_polars_creates_dataframe():
61
- polars = pytest.importorskip("polars")
62
- df = to_polars(SAMPLE_DATA)
63
- assert isinstance(df, polars.DataFrame)
64
- assert len(df) == 2
65
-
66
-
67
- def test_to_polars_onset_sec():
68
- pytest.importorskip("polars")
69
- df = to_polars(SAMPLE_DATA)
70
- assert "onset_sec" in df.columns
71
- assert df["onset_sec"][0] == pytest.approx(1.5)
72
-
73
-
74
- def test_to_polars_empty():
75
- polars = pytest.importorskip("polars")
76
- df = to_polars([])
77
- assert isinstance(df, polars.DataFrame)
78
- assert len(df) == 0
File without changes
File without changes
File without changes
File without changes
File without changes