zeno-cli 0.3.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- zeno_adapters/__init__.py +17 -0
- zeno_adapters/_common.py +38 -0
- zeno_adapters/anthropic.py +68 -0
- zeno_adapters/claude_code.py +101 -0
- zeno_adapters/crewai.py +92 -0
- zeno_adapters/langgraph.py +49 -0
- zeno_adapters/openai.py +108 -0
- zeno_cli/__init__.py +1 -0
- zeno_cli/_hooks/cc_bridge.py +1016 -0
- zeno_cli/doctor.py +535 -0
- zeno_cli/hook_install.py +269 -0
- zeno_cli/hud/__init__.py +1 -0
- zeno_cli/hud/hud_install.py +652 -0
- zeno_cli/hud/zeno_attention.py +288 -0
- zeno_cli/hud/zeno_cognition.py +457 -0
- zeno_cli/hud/zeno_hud.py +496 -0
- zeno_cli/interview_invites.py +342 -0
- zeno_cli/login.py +241 -0
- zeno_cli/main.py +2534 -0
- zeno_cli/onboard.py +206 -0
- zeno_cli/outreach.py +456 -0
- zeno_cli/version.py +67 -0
- zeno_cli-0.3.4.dist-info/METADATA +161 -0
- zeno_cli-0.3.4.dist-info/RECORD +69 -0
- zeno_cli-0.3.4.dist-info/WHEEL +4 -0
- zeno_cli-0.3.4.dist-info/entry_points.txt +4 -0
- zeno_core/__init__.py +67 -0
- zeno_core/analytics.py +193 -0
- zeno_core/rtlx_s.py +460 -0
- zeno_core/streak.py +178 -0
- zeno_core/tlx_s.py +192 -0
- zeno_sdk/__init__.py +6 -0
- zeno_sdk/_generated/__init__.py +6 -0
- zeno_sdk/_generated/client.py +819 -0
- zeno_sdk/_migrations/alembic/env.py +33 -0
- zeno_sdk/_migrations/alembic/script.py.mako +18 -0
- zeno_sdk/_migrations/alembic/versions/0001_initial.py +79 -0
- zeno_sdk/_migrations/alembic/versions/0002_cognition_samples.py +53 -0
- zeno_sdk/_migrations/alembic/versions/0003_cognition_drivers.py +41 -0
- zeno_sdk/_migrations/alembic/versions/0004_transcript_intelligence.py +248 -0
- zeno_sdk/_migrations/alembic.ini +35 -0
- zeno_sdk/_runtime.py +12 -0
- zeno_sdk/adapters/__init__.py +15 -0
- zeno_sdk/adapters/anthropic.py +5 -0
- zeno_sdk/adapters/claude_code.py +5 -0
- zeno_sdk/adapters/crewai.py +5 -0
- zeno_sdk/adapters/langgraph.py +5 -0
- zeno_sdk/adapters/openai.py +5 -0
- zeno_sdk/auth.py +25 -0
- zeno_sdk/client.py +87 -0
- zeno_sdk/config.py +61 -0
- zeno_sdk/daemon.py +72 -0
- zeno_sdk/privacy.py +46 -0
- zeno_sdk/session.py +179 -0
- zeno_sdk/storage.py +487 -0
- zeno_sdk/types/__init__.py +121 -0
- zeno_session_intel/__init__.py +19 -0
- zeno_session_intel/analytics.py +588 -0
- zeno_session_intel/compression.py +123 -0
- zeno_session_intel/ingest.py +376 -0
- zeno_session_intel/model.py +129 -0
- zeno_session_intel/parsers/__init__.py +31 -0
- zeno_session_intel/parsers/claude_code.py +169 -0
- zeno_session_intel/parsers/codex.py +265 -0
- zeno_session_intel/parsers/cursor.py +198 -0
- zeno_session_intel/prices.py +281 -0
- zeno_session_intel/schema.py +277 -0
- zeno_session_intel/signals.py +319 -0
- zeno_session_intel/taxonomy.py +71 -0
zeno_core/rtlx_s.py
ADDED
|
@@ -0,0 +1,460 @@
|
|
|
1
|
+
"""RTLX-S: Raw NASA-TLX, Supervision-augmented (research 1, 2026-06-07 PM3).
|
|
2
|
+
|
|
3
|
+
Five-item cognitive-load probe. Three items are validated raw NASA-TLX
|
|
4
|
+
subscales (mental demand, effort, frustration); two are novel additions
|
|
5
|
+
specific to AI-supervised work (supervision load, execution load).
|
|
6
|
+
|
|
7
|
+
Design contract from research 1:
|
|
8
|
+
- 5 items, 0-10 each (NOT 0-100 like classic TLX-S)
|
|
9
|
+
- Single keypress per item, Enter accepts the previous answer
|
|
10
|
+
- End-anchored labels only ("0 = none / very low", "10 = very high")
|
|
11
|
+
- NO pairwise weighting step (dropped entirely)
|
|
12
|
+
- NEVER collapse to a single index at capture time - store all 5 raw values
|
|
13
|
+
- Default each item to the user's last-entered value, persisted in the
|
|
14
|
+
local zeno data dir at ~/.zeno/rtlxs_last.json
|
|
15
|
+
- Total prompt completes in well under 5 seconds for an experienced user
|
|
16
|
+
|
|
17
|
+
The five items are PINNED. Wording is locked here so the Stage 2 CFA / alpha
|
|
18
|
+
validation pass has a stable instrument across cohorts. Do NOT edit wording
|
|
19
|
+
without recording the change as a new schema_version constant - mixing
|
|
20
|
+
re-worded items into the same response stream invalidates the validation.
|
|
21
|
+
|
|
22
|
+
The novel two items (supervision_load, execution_load) are NOT yet
|
|
23
|
+
empirically validated. Research 1 specifies that they pass validation only
|
|
24
|
+
if Cronbach alpha >= 0.7 and CFA model fit is acceptable at N>=150-300
|
|
25
|
+
sessions across two cohorts. Until then they are treated as candidate
|
|
26
|
+
correlates, not load-bearing for any policy gate.
|
|
27
|
+
"""
|
|
28
|
+
|
|
29
|
+
from __future__ import annotations
|
|
30
|
+
|
|
31
|
+
import json
|
|
32
|
+
import os
|
|
33
|
+
import random
|
|
34
|
+
import sys
|
|
35
|
+
import termios
|
|
36
|
+
import tty
|
|
37
|
+
from dataclasses import asdict, dataclass
|
|
38
|
+
from datetime import UTC, datetime
|
|
39
|
+
from pathlib import Path
|
|
40
|
+
|
|
41
|
+
# Schema version for the persisted defaults + API payloads. Bump when adding
|
|
42
|
+
# or rewording items so CFA / alpha analyses don't silently mix instruments.
|
|
43
|
+
#
|
|
44
|
+
# v1 (2026-06-07): initial 5-item probe.
|
|
45
|
+
# v2 (2026-06-10): supervision_load + execution_load rewritten from
|
|
46
|
+
# proportion-of-effort phrasing ("how much of your effort went into...")
|
|
47
|
+
# to absolute cost. Research 3 (2026-06-10) flagged the proportion framing
|
|
48
|
+
# as an internal-contamination threat: by definition the two share total
|
|
49
|
+
# effort and will correlate near-perfectly negatively, undermining factor
|
|
50
|
+
# independence. Absolute-cost framing anchors each item on its own
|
|
51
|
+
# construct (Sheridan monitor+intervene for supervision; raw mental
|
|
52
|
+
# demand of own work for execution) and is the wording the prior-art scan
|
|
53
|
+
# explicitly recommended pre-survey to protect Stage 2 discriminant
|
|
54
|
+
# validity vs RTLX Mental Demand + Frustration (the vigilance "workload
|
|
55
|
+
# signature" - Warm, Parasuraman & Matthews 2008).
|
|
56
|
+
RTLXS_SCHEMA_VERSION: int = 2
|
|
57
|
+
|
|
58
|
+
# Randomized single-case-experiment (SCED) condition tags (2026-06-15).
|
|
59
|
+
# A per-session experimental condition lets occasions be randomly assigned
|
|
60
|
+
# to autonomy blocks (ABAB / alternating-treatments) so the within-person
|
|
61
|
+
# double dissociation can be tested: autonomy should move supervision_load
|
|
62
|
+
# but not execution_load. None = untagged. See
|
|
63
|
+
# docs/STAGE_2A_WITHIN_PERSON_DESIGN.md.
|
|
64
|
+
RTLXS_AUTONOMY_CONDITIONS: tuple[str, ...] = (
|
|
65
|
+
"high_autonomy",
|
|
66
|
+
"low_autonomy",
|
|
67
|
+
"control",
|
|
68
|
+
)
|
|
69
|
+
|
|
70
|
+
# Rating-time concealment control (SCED v2, 2026-06-19; pre-reg Section 12 A.2).
|
|
71
|
+
# In a BLINDED SCED session the survey never shows the condition; after the load
|
|
72
|
+
# items it asks the operator to GUESS the concealed condition + rate confidence.
|
|
73
|
+
# The Bang/James leakage index is computed from these at N=90 (NOT trial blinding
|
|
74
|
+
# - the operator enacts the condition; this measures whether the RATINGS leaked
|
|
75
|
+
# it). The guess is placed strictly AFTER the load items so it cannot prime them.
|
|
76
|
+
RTLXS_BLIND_GUESS_OPTIONS: tuple[str, ...] = ("high", "low", "unsure")
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
# Ordered tuple of (key, label) pairs - presentation order is the response
|
|
80
|
+
# field order in storage too, so downstream stats can index by position.
|
|
81
|
+
RTLXS_ITEMS: tuple[tuple[str, str], ...] = (
|
|
82
|
+
(
|
|
83
|
+
"mental_demand",
|
|
84
|
+
"Mental demand: How mentally demanding was this session?",
|
|
85
|
+
),
|
|
86
|
+
(
|
|
87
|
+
"effort",
|
|
88
|
+
"Effort: How hard did you have to work to get the result you wanted?",
|
|
89
|
+
),
|
|
90
|
+
(
|
|
91
|
+
"frustration",
|
|
92
|
+
"Frustration: How frustrated, stressed, or annoyed did you feel?",
|
|
93
|
+
),
|
|
94
|
+
(
|
|
95
|
+
"supervision_load",
|
|
96
|
+
"Supervision load: How much effort did watching, reviewing, or "
|
|
97
|
+
"correcting the agent's work cost you?",
|
|
98
|
+
),
|
|
99
|
+
(
|
|
100
|
+
"execution_load",
|
|
101
|
+
"Execution load: How mentally demanding was the work you did " "yourself?",
|
|
102
|
+
),
|
|
103
|
+
)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
# Trust anchor (research 3, 2026-06-10). One Jian/Bisantz/Drury-derived
|
|
107
|
+
# item, fixed across all sessions (not rotated): a single item with full
|
|
108
|
+
# N gives stronger convergent-validity evidence than rotation across n=300.
|
|
109
|
+
# Wording is adapted from Jian et al. (2000) "The system is dependable"
|
|
110
|
+
# to zeno's per-session context. Captured alongside the 5 RTLX-S items;
|
|
111
|
+
# scored 0-10. Persisted in a separate column so it is never collapsed
|
|
112
|
+
# into the RTLX-S latent structure. Establishes discriminant validity
|
|
113
|
+
# vs trust: we expect a moderate negative correlation with supervision_load
|
|
114
|
+
# (higher trust -> lower oversight burden) but NOT a near-perfect one.
|
|
115
|
+
RTLXS_TRUST_ITEM_KEY: str = "trust_dependable"
|
|
116
|
+
RTLXS_TRUST_ITEM_LABEL: str = "Trust: The AI agent(s) I used this session were dependable."
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
# Anchor labels (end-anchored only - research 1 contract)
|
|
120
|
+
LOW_ANCHOR: str = "0 = none / very low"
|
|
121
|
+
HIGH_ANCHOR: str = "10 = very high"
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
@dataclass(slots=True)
class RTLXResponse:
    """A single completed RTLX-S response, stored as raw per-item integers.

    No composite index is ever computed here; every item keeps its own value.

    Attributes:
        mental_demand, effort, frustration, supervision_load, execution_load:
            The five probe items, each scored 0-10.
        captured_at: Moment the response was captured.
        session_id: CLI session aggregate this response belongs to. It is
            not part of the payload body (the endpoint carries it in the
            URL path).
        active_agents, parallel_agent_count: Operational context that lets
            the Stage 2 CFA / alpha analyses stratify by agent mix.
        schema_version: Item-wording version for this row, so a pooled
            analysis can exclude or stratify by wording cohort.
        trust_dependable: Single Jian-derived trust anchor item (0-10),
            captured alongside the five items as discriminant-validity
            evidence against trust constructs.
        agent_interrupts_count, agent_turns_count, verification_seconds:
            Behavioral anchors taken from CLI session telemetry; they stay
            ``None`` when telemetry is unavailable.
        autonomy_condition: SCED condition tag, or ``None`` when untagged
            or when the session is blinded.
        blind_guess, blind_confidence, sced_session_index: SCED v2
            rating-time concealment fields, described next to their
            declarations below.
    """

    # --- required fields -------------------------------------------------
    mental_demand: int
    effort: int
    frustration: int
    supervision_load: int
    execution_load: int
    captured_at: datetime
    session_id: str
    active_agents: list[str]
    parallel_agent_count: int

    # --- defaulted / optional fields --------------------------------------
    schema_version: int = RTLXS_SCHEMA_VERSION
    trust_dependable: int | None = None
    agent_interrupts_count: int | None = None
    agent_turns_count: int | None = None
    verification_seconds: int | None = None
    autonomy_condition: str | None = None
    # SCED v2 rating-time concealment (2026-06-19, pre-reg Section 12 A.1/A.2).
    # ``blind_guess`` is the operator's forced post-rating guess of the
    # concealed condition (one of RTLXS_BLIND_GUESS_OPTIONS) and
    # ``blind_confidence`` is 0..10. ``sced_session_index`` is the
    # locked-schedule slot assigned to this session; in a blinded session
    # ``autonomy_condition`` stays None because the true condition lives only
    # in the sealed log and is joined back by this index at N=90.
    blind_guess: str | None = None
    blind_confidence: int | None = None
    sced_session_index: int | None = None

    def to_payload(self) -> dict[str, object]:
        """Build the wire format for POST /v1/sessions/{session_id}/rtlxs.

        The five items, the capture timestamp (ISO 8601), the agent context
        and the schema version are always present. Every optional field is
        included only when it is not ``None``.
        """
        body: dict[str, object] = {
            "mental_demand": self.mental_demand,
            "effort": self.effort,
            "frustration": self.frustration,
            "supervision_load": self.supervision_load,
            "execution_load": self.execution_load,
            "captured_at": self.captured_at.isoformat(),
            # Copy so later mutation of the response cannot alter the payload.
            "active_agents": list(self.active_agents),
            "parallel_agent_count": self.parallel_agent_count,
            "schema_version": self.schema_version,
        }
        # Tuple order is the insertion order of the optional payload keys.
        optional_fields = (
            "trust_dependable",
            "agent_interrupts_count",
            "agent_turns_count",
            "verification_seconds",
            "autonomy_condition",
            "blind_guess",
            "blind_confidence",
            "sced_session_index",
        )
        for field_name in optional_fields:
            field_value = getattr(self, field_name)
            if field_value is not None:
                body[field_name] = field_value
        return body
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
def _data_dir() -> Path:
|
|
207
|
+
"""Local zeno data directory. Mirrors the SDK's ZENO_HOME convention."""
|
|
208
|
+
home = Path.home()
|
|
209
|
+
base_dir = Path(os.environ.get("ZENO_HOME", home / ".zeno")).expanduser()
|
|
210
|
+
base_dir.mkdir(parents=True, exist_ok=True)
|
|
211
|
+
return base_dir
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
def _last_response_path() -> Path:
    """Return the path of the JSON file that stores the last-entered values.

    The file is ``rtlxs_last.json`` inside the zeno data directory; resolving
    the path goes through ``_data_dir()``, which creates that directory.
    """
    return _data_dir().joinpath("rtlxs_last.json")
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
def load_last_defaults() -> dict[str, int]:
    """Read the most recently saved RTLX-S item values.

    Returns:
        A dict mapping every item key (the five RTLX-S items plus the trust
        item) to its previously saved value. Any key that is missing, is not
        an integer, or falls outside 0-10 is replaced by 5. On first run, or
        when the file cannot be read or parsed, every key maps to 5. The
        midpoint is used so an out-of-the-box submission does not lean
        toward either extreme.

    This function never raises for storage problems: the survey TUI must not
    show the user a stacktrace.
    """
    keys = [key for key, _ in RTLXS_ITEMS] + [RTLXS_TRUST_ITEM_KEY]
    fallback: dict[str, int] = dict.fromkeys(keys, 5)
    try:
        # Path resolution is inside the try because it creates the data
        # directory and can itself fail.
        raw = json.loads(_last_response_path().read_text(encoding="utf-8"))
    except (OSError, ValueError, RuntimeError):
        # OSError: missing/unreadable file or uncreatable data directory.
        # ValueError: malformed JSON (JSONDecodeError) or invalid UTF-8
        # (UnicodeDecodeError). RuntimeError: unresolvable home directory or
        # pathologically nested JSON (RecursionError).
        return fallback
    if not isinstance(raw, dict):
        return fallback

    defaults: dict[str, int] = {}
    for key in keys:
        value = raw.get(key)
        # bool is a subclass of int; JSON true/false is not a valid rating.
        usable = (
            isinstance(value, int)
            and not isinstance(value, bool)
            and 0 <= value <= 10
        )
        defaults[key] = value if usable else 5
    return defaults
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
def save_last_defaults(values: dict[str, int]) -> None:
    """Persist the entered item values so the next run can default to them.

    Exactly the item keys are written: the five RTLX-S items plus the trust
    item. A key missing from ``values`` is stored as 5. No session metadata
    goes into the file, which keeps it a plain per-user preference.

    Write failures (``OSError``) are swallowed on purpose so the survey path
    is never disrupted by a storage problem.
    """
    item_keys = [name for name, _label in RTLXS_ITEMS]
    item_keys.append(RTLXS_TRUST_ITEM_KEY)

    snapshot: dict[str, int] = {}
    for name in item_keys:
        snapshot[name] = int(values.get(name, 5))
    serialized = json.dumps(snapshot, sort_keys=True)

    try:
        _last_response_path().write_text(serialized, encoding="utf-8")
    except OSError:
        # Best effort only: losing the defaults is preferable to an error.
        pass
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
def _read_single_keypress() -> str:
    """Read exactly one character from stdin with the terminal in raw mode.

    POSIX-only, since it relies on ``termios`` / ``tty``. The terminal
    settings in effect before the call are restored afterwards, including
    when the read itself fails.

    Returns:
        The character read, unmodified. Control characters (Ctrl-C, Ctrl-D,
        Enter) are returned as-is for the caller to interpret.
    """
    stdin_fd = sys.stdin.fileno()
    saved_attrs = termios.tcgetattr(stdin_fd)
    try:
        tty.setraw(stdin_fd)
        return sys.stdin.read(1)
    finally:
        # TCSADRAIN: apply once pending output has been transmitted.
        termios.tcsetattr(stdin_fd, termios.TCSADRAIN, saved_attrs)
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
def _prompt_one_item(*, label: str, previous: int) -> int | None:
    """Show one item and read its 0-10 rating from a single keypress.

    Key handling:
      - Ctrl-C / Ctrl-D: cancel; returns ``None``.
      - Enter: keep ``previous``.
      - ``t`` / ``T`` ("ten"): commit 10, so the top of the scale is still
        one keypress.
      - a decimal digit: commit that value immediately.
      - any other key, or a failure to read the terminal: keep ``previous``.
        Refusing to advance would leave the user stuck at the prompt, which
        is a worse failure mode than accepting the previous value.

    Args:
        label: Item wording shown to the user.
        previous: Value offered as the default and echoed as ``last = ...``.

    Returns:
        The chosen rating, or ``None`` when the user cancelled.
    """
    sys.stdout.write(f" {label}\n")
    sys.stdout.write(
        f" [{LOW_ANCHOR}, {HIGH_ANCHOR}] (last = {previous}, Enter keeps, T = 10): "
    )
    sys.stdout.flush()
    try:
        ch = _read_single_keypress()
    except (OSError, termios.error):
        sys.stdout.write(f"{previous}\n")
        return previous
    if ch in ("\x03", "\x04"):
        sys.stdout.write("\n")
        return None
    if ch in ("t", "T"):
        sys.stdout.write("10\n")
        return 10
    # isdecimal(), not isdigit(): isdigit() is also true for characters such
    # as superscript digits, which int() rejects with ValueError.
    if ch.isdecimal():
        sys.stdout.write(f"{ch}\n")
        return int(ch)
    # Enter, an unknown key, or an empty read (EOF): keep the previous value.
    sys.stdout.write(f"{previous}\n")
    return previous
|
|
325
|
+
|
|
326
|
+
|
|
327
|
+
def _prompt_blind_guess() -> str | None:
    """Ask for a forced guess of the concealed SCED condition.

    One keypress: ``h``/``H`` means "high", ``l``/``L`` means "low", and
    every other key means "unsure". A failure to read the terminal is also
    recorded as "unsure", so the prompt can never strand the operator.

    Returns:
        ``"high"``, ``"low"`` or ``"unsure"``, or ``None`` when the operator
        cancels with Ctrl-C / Ctrl-D.
    """
    sys.stdout.write(" Which condition do you think this session was?\n")
    sys.stdout.write(" [h = high autonomy, l = low autonomy, u = unsure]: ")
    sys.stdout.flush()

    guess_by_key = {"h": "high", "H": "high", "l": "low", "L": "low"}
    try:
        pressed = _read_single_keypress()
    except (OSError, termios.error):
        guess = "unsure"
    else:
        if pressed in ("\x03", "\x04"):
            sys.stdout.write("\n")
            return None
        guess = guess_by_key.get(pressed, "unsure")

    # Echo the interpreted answer so the operator sees what was recorded.
    sys.stdout.write(guess + "\n")
    return guess
|
|
354
|
+
|
|
355
|
+
|
|
356
|
+
def run_rtlxs_survey_tui(
    *,
    session_id: str,
    active_agents: list[str] | None = None,
    parallel_agent_count: int = 0,
    agent_interrupts_count: int | None = None,
    agent_turns_count: int | None = None,
    verification_seconds: int | None = None,
    autonomy_condition: str | None = None,
    sced_blinded: bool = False,
    sced_session_index: int | None = None,
) -> RTLXResponse | None:
    """Run the interactive RTLX-S prompt: five load items plus one trust item.

    Flow:
      1. If stdin is not a tty, return ``None`` immediately, so a daemon or
         piped invocation skips the probe instead of blocking on a read.
      2. Prompt the five load items, one keypress each, defaulting to the
         last-entered values. In a blinded SCED session the presentation
         order is shuffled; answers are keyed by item name, so the order
         does not affect what is stored.
      3. Prompt the trust item, then persist all six answers as the defaults
         for the next run. This happens before any blinded prompts, so
         cancelling later still leaves the new defaults saved.
      4. Blinded sessions only: ask for a guess of the concealed condition
         and a 0-10 confidence, strictly after the load items.

    Args:
        session_id: CLI session the response is attached to.
        active_agents: Agent names for context; ``None`` is stored as ``[]``.
        parallel_agent_count: Number of agents running in parallel.
        agent_interrupts_count, agent_turns_count, verification_seconds:
            Behavioral anchors supplied by the caller from session telemetry
            and never keyed in by the user; pass ``None`` when unavailable.
        autonomy_condition: SCED condition tag; ignored (stored as ``None``)
            when ``sced_blinded`` is true.
        sced_blinded: Enables shuffled item order and the guess/confidence
            prompts, and suppresses ``autonomy_condition``.
        sced_session_index: Locked-schedule slot for this session.

    Returns:
        The captured response, or ``None`` when the probe was skipped or the
        user cancelled at any prompt.
    """
    if not sys.stdin.isatty():
        return None

    def _cancelled() -> None:
        # Shared exit for every prompt the user aborts.
        sys.stdout.write("Cancelled.\n")
        return None

    previous_values = load_last_defaults()
    sys.stdout.write("RTLX-S probe (5 items + 1 trust, single keypress each):\n")
    sys.stdout.flush()

    # Non-blinded surveys keep the pinned order; blinded ones randomize it
    # to disrupt response sets.
    presentation = list(RTLXS_ITEMS)
    if sced_blinded:
        random.shuffle(presentation)

    ratings: dict[str, int] = {}
    for item_key, item_label in presentation:
        rating = _prompt_one_item(label=item_label, previous=previous_values[item_key])
        if rating is None:
            return _cancelled()
        ratings[item_key] = rating

    trust_rating = _prompt_one_item(
        label=RTLXS_TRUST_ITEM_LABEL,
        previous=previous_values.get(RTLXS_TRUST_ITEM_KEY, 5),
    )
    if trust_rating is None:
        return _cancelled()
    ratings[RTLXS_TRUST_ITEM_KEY] = trust_rating
    save_last_defaults(ratings)

    # The guess comes only after every load item so it cannot prime them.
    guess: str | None = None
    confidence: int | None = None
    if sced_blinded:
        guess = _prompt_blind_guess()
        if guess is None:
            return _cancelled()
        confidence = _prompt_one_item(
            label="Confidence in that guess (0 = pure guess, 10 = certain)",
            previous=5,
        )
        if confidence is None:
            return _cancelled()

    # A blinded row never carries the true condition.
    recorded_condition = None if sced_blinded else autonomy_condition

    return RTLXResponse(
        mental_demand=ratings["mental_demand"],
        effort=ratings["effort"],
        frustration=ratings["frustration"],
        supervision_load=ratings["supervision_load"],
        execution_load=ratings["execution_load"],
        captured_at=datetime.now(tz=UTC),
        session_id=session_id,
        active_agents=list(active_agents or []),
        parallel_agent_count=parallel_agent_count,
        trust_dependable=trust_rating,
        agent_interrupts_count=agent_interrupts_count,
        agent_turns_count=agent_turns_count,
        verification_seconds=verification_seconds,
        autonomy_condition=recorded_condition,
        blind_guess=guess,
        blind_confidence=confidence,
        sced_session_index=sced_session_index,
    )
|
|
449
|
+
|
|
450
|
+
|
|
451
|
+
def validate_item_keys() -> tuple[str, ...]:
    """Return the RTLX-S item keys in their pinned presentation order.

    Intended for tests and API-schema checks; the order of the tuple follows
    ``RTLXS_ITEMS`` and is significant.
    """
    return tuple(pair[0] for pair in RTLXS_ITEMS)
|
|
454
|
+
|
|
455
|
+
|
|
456
|
+
def response_to_dict(response: RTLXResponse) -> dict[str, object]:
    """Return a plain-dict view of ``response`` for ad-hoc logging.

    All dataclass fields are included, ``None`` values too, which differs
    from ``to_payload``. ``captured_at`` is rendered as an ISO 8601 string
    in place of the ``datetime`` object.
    """
    return {
        **asdict(response),
        "captured_at": response.captured_at.isoformat(),
    }
|
zeno_core/streak.py
ADDED
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
"""Survey streak tracker (research 1 PM3, 2026-06-07).
|
|
2
|
+
|
|
3
|
+
Intrinsic-motivation lever for the RTLX-S probe. The streak counts the
|
|
4
|
+
number of consecutive local-calendar days on which the user has logged at
|
|
5
|
+
least one RTLX-S response. Habit-formation literature (BJ Fogg, Tiny Habits;
|
|
6
|
+
Wood 2019, Good Habits Bad Habits) shows that visible streaks plus a
|
|
7
|
+
"don't break the chain" frame drive higher self-report compliance than
|
|
8
|
+
abstract progress meters, especially for low-effort daily probes like this
|
|
9
|
+
one. We display BOTH current and longest-ever streak so a broken chain
|
|
10
|
+
still leaves the user with a personal-best target to beat.
|
|
11
|
+
|
|
12
|
+
Day boundary policy: a "day" runs from 03:00 LOCAL time (NOT UTC midnight).
|
|
13
|
+
Late-night sessions count toward the previous calendar day so a 2am wrap-up
|
|
14
|
+
does not artificially extend or break a streak. This matches the convention
|
|
15
|
+
used by `morning-pipeline-timezone` (MEMORY.md) and the dashboard digest.
|
|
16
|
+
|
|
17
|
+
Storage neutrality: the helper takes any object exposing a
|
|
18
|
+
``timestamps_after(cutoff_iso)`` async method that returns ISO timestamps
|
|
19
|
+
of probe responses for a user. The CLI passes its local ZenoStorage (reads
|
|
20
|
+
``load_probes`` where ``skipped=0``); the API uses ``RTLXSResponse`` rows
|
|
21
|
+
via ``streak_for_user``. Same algorithm, two storage drivers, one return
|
|
22
|
+
shape.
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
from __future__ import annotations
|
|
26
|
+
|
|
27
|
+
from collections.abc import Sequence
|
|
28
|
+
from dataclasses import dataclass
|
|
29
|
+
from datetime import date, datetime, time, timedelta
|
|
30
|
+
|
|
31
|
+
# The local-day cutoff (hour of day). Sessions before this are bucketed into
|
|
32
|
+
# the previous calendar day. Mirrors `morning-pipeline-timezone` policy.
|
|
33
|
+
DAY_CUTOFF_HOUR: int = 3
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@dataclass(slots=True)
|
|
37
|
+
class StreakResult:
|
|
38
|
+
"""One streak snapshot.
|
|
39
|
+
|
|
40
|
+
``current`` : streak length ending on (or extending to) today's bucket
|
|
41
|
+
after applying the 03:00 cutoff. 0 if the user has not
|
|
42
|
+
logged in the current bucket AND the previous bucket.
|
|
43
|
+
``longest`` : longest streak ever recorded.
|
|
44
|
+
``last_response_at`` : ISO timestamp of the most recent response, or None
|
|
45
|
+
if the user has never submitted one.
|
|
46
|
+
"""
|
|
47
|
+
|
|
48
|
+
current: int
|
|
49
|
+
longest: int
|
|
50
|
+
last_response_at: str | None
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _bucket_day(ts: datetime, *, cutoff_hour: int = DAY_CUTOFF_HOUR) -> date:
    """Map a timestamp to the streak day it counts toward.

    A timestamp whose wall-clock time is earlier than ``cutoff_hour``:00
    belongs to the previous calendar day; anything at or after the cutoff
    belongs to its own calendar day.

    The comparison uses the timestamp's own wall-clock fields
    (``ts.time()``), and no timezone conversion happens here. Callers that
    want the boundary in a particular timezone must pass timestamps already
    expressed in it; a naive timestamp is therefore taken at face value.
    """
    before_cutoff = ts.time() < time(cutoff_hour)
    shift = timedelta(days=1) if before_cutoff else timedelta(0)
    return (ts - shift).date()
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def compute_streak(
    timestamps: Sequence[datetime],
    *,
    today: date,
    cutoff_hour: int = DAY_CUTOFF_HOUR,
) -> StreakResult:
    """Compute the streak snapshot from raw response timestamps (pure).

    Steps:
      1. Bucket every timestamp into a streak day with ``_bucket_day`` and
         keep the distinct days in ascending order.
      2. ``longest`` is the longest run of consecutive days in the history.
      3. ``current`` is the run ending at the newest day, provided that day
         is ``today`` or the day before it. The one-day grace means a user
         who logged yesterday but not yet today keeps the streak; the "not
         logged today" nudge is a separate check. If the newest day is
         anything else, ``current`` is 0.
      4. ``last_response_at`` is the ISO form of the maximum raw timestamp.

    Empty input yields ``current=0``, ``longest=0``,
    ``last_response_at=None``.
    """
    if not timestamps:
        return StreakResult(current=0, longest=0, last_response_at=None)

    days = sorted({_bucket_day(ts, cutoff_hour=cutoff_hour) for ts in timestamps})
    newest_stamp = max(timestamps)
    one_day = timedelta(days=1)

    # Longest run: one ascending pass, restarting at every gap.
    longest = run_length = 1
    for earlier, later in zip(days, days[1:], strict=False):
        run_length = run_length + 1 if later - earlier == one_day else 1
        longest = max(longest, run_length)

    # Current run: only alive when the newest day is today or yesterday.
    newest_day = days[-1]
    if newest_day != today and newest_day != today - one_day:
        current = 0
    else:
        current = 1
        idx = len(days) - 1
        # Walk backwards while each day directly follows its predecessor.
        while idx > 0 and days[idx] - days[idx - 1] == one_day:
            current += 1
            idx -= 1

    return StreakResult(
        current=current,
        longest=longest,
        last_response_at=newest_stamp.isoformat(),
    )
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def has_logged_today(
    timestamps: Sequence[datetime],
    *,
    today: date,
    cutoff_hour: int = DAY_CUTOFF_HOUR,
) -> bool:
    """Report whether any response falls into today's streak bucket.

    This is not the same as a non-zero current streak: the one-day grace in
    the streak computation keeps a streak alive before today's response has
    been logged, so a separate check is needed to decide whether to show a
    "you have not logged today" nudge.

    Returns:
        ``True`` when at least one timestamp buckets to ``today``; ``False``
        otherwise, including for empty input.
    """
    return bool(timestamps) and any(
        _bucket_day(stamp, cutoff_hour=cutoff_hour) == today for stamp in timestamps
    )
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
async def read_streak_from_local_storage(
    storage,
    project_id: str,
    *,
    today: date | None = None,
    cutoff_hour: int = DAY_CUTOFF_HOUR,
) -> StreakResult:
    """Read the survey streak for a project from a local storage instance.

    Timestamps come from ``load_probes`` joined to ``sessions`` (project
    scope), restricted to non-skipped probes. For each row ``responded_at``
    is preferred and ``prompted_at`` is the fallback. Rows with no
    timestamp, or one that cannot be parsed or converted, are dropped.

    Day bucketing is done in the machine's local timezone:
      - timezone-aware values are converted to local time first, so the
        cutoff is applied to the local wall clock rather than to UTC;
      - naive values are interpreted as local wall-clock time;
      - when ``today`` is not supplied it defaults to the *bucket* of the
        current local time, so between midnight and the cutoff "today" is
        still the previous calendar day.

    Args:
        storage: Object exposing an awaitable ``_fetchall(sql, params)``
            that returns rows indexable by position.
        project_id: Project whose probes are counted.
        today: Streak day to evaluate against; see above for the default.
        cutoff_hour: Hour of day at which a new streak day begins.

    Returns:
        The streak snapshot. ``last_response_at`` keeps the formatting of
        the stored timestamp (it is not rewritten to the local offset).

    Errors raised by ``storage._fetchall`` itself are not caught here.
    """
    rows = await storage._fetchall(
        """
        SELECT lp.responded_at, lp.prompted_at
        FROM load_probes lp
        JOIN sessions s ON s.id = lp.session_id
        WHERE s.project_id = ? AND lp.skipped = 0
        """,
        (project_id,),
    )

    local_stamps: list[datetime] = []
    newest_local: datetime | None = None
    newest_stored: datetime | None = None
    for row in rows:
        raw = row[0] or row[1]
        if not raw:
            continue
        try:
            stored = datetime.fromisoformat(str(raw))
            # astimezone() without arguments targets the system local zone
            # and assumes a naive value is already local time. Normalizing
            # every row also keeps naive and aware values comparable.
            local = stored.astimezone()
        except (ValueError, OverflowError, OSError):
            # Unparseable text, or a value the platform cannot convert.
            continue
        local_stamps.append(local)
        if newest_local is None or local > newest_local:
            newest_local, newest_stored = local, stored

    if today is None:
        # Apply the same cutoff to "now" that is applied to the responses.
        today = _bucket_day(datetime.now(), cutoff_hour=cutoff_hour)

    result = compute_streak(local_stamps, today=today, cutoff_hour=cutoff_hour)
    if newest_stored is not None:
        # Report the newest response exactly as it was stored.
        result.last_response_at = newest_stored.isoformat()
    return result
|