zeno-cli 0.3.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. zeno_adapters/__init__.py +17 -0
  2. zeno_adapters/_common.py +38 -0
  3. zeno_adapters/anthropic.py +68 -0
  4. zeno_adapters/claude_code.py +101 -0
  5. zeno_adapters/crewai.py +92 -0
  6. zeno_adapters/langgraph.py +49 -0
  7. zeno_adapters/openai.py +108 -0
  8. zeno_cli/__init__.py +1 -0
  9. zeno_cli/_hooks/cc_bridge.py +1016 -0
  10. zeno_cli/doctor.py +535 -0
  11. zeno_cli/hook_install.py +269 -0
  12. zeno_cli/hud/__init__.py +1 -0
  13. zeno_cli/hud/hud_install.py +652 -0
  14. zeno_cli/hud/zeno_attention.py +288 -0
  15. zeno_cli/hud/zeno_cognition.py +457 -0
  16. zeno_cli/hud/zeno_hud.py +496 -0
  17. zeno_cli/interview_invites.py +342 -0
  18. zeno_cli/login.py +241 -0
  19. zeno_cli/main.py +2534 -0
  20. zeno_cli/onboard.py +206 -0
  21. zeno_cli/outreach.py +456 -0
  22. zeno_cli/version.py +67 -0
  23. zeno_cli-0.3.4.dist-info/METADATA +161 -0
  24. zeno_cli-0.3.4.dist-info/RECORD +69 -0
  25. zeno_cli-0.3.4.dist-info/WHEEL +4 -0
  26. zeno_cli-0.3.4.dist-info/entry_points.txt +4 -0
  27. zeno_core/__init__.py +67 -0
  28. zeno_core/analytics.py +193 -0
  29. zeno_core/rtlx_s.py +460 -0
  30. zeno_core/streak.py +178 -0
  31. zeno_core/tlx_s.py +192 -0
  32. zeno_sdk/__init__.py +6 -0
  33. zeno_sdk/_generated/__init__.py +6 -0
  34. zeno_sdk/_generated/client.py +819 -0
  35. zeno_sdk/_migrations/alembic/env.py +33 -0
  36. zeno_sdk/_migrations/alembic/script.py.mako +18 -0
  37. zeno_sdk/_migrations/alembic/versions/0001_initial.py +79 -0
  38. zeno_sdk/_migrations/alembic/versions/0002_cognition_samples.py +53 -0
  39. zeno_sdk/_migrations/alembic/versions/0003_cognition_drivers.py +41 -0
  40. zeno_sdk/_migrations/alembic/versions/0004_transcript_intelligence.py +248 -0
  41. zeno_sdk/_migrations/alembic.ini +35 -0
  42. zeno_sdk/_runtime.py +12 -0
  43. zeno_sdk/adapters/__init__.py +15 -0
  44. zeno_sdk/adapters/anthropic.py +5 -0
  45. zeno_sdk/adapters/claude_code.py +5 -0
  46. zeno_sdk/adapters/crewai.py +5 -0
  47. zeno_sdk/adapters/langgraph.py +5 -0
  48. zeno_sdk/adapters/openai.py +5 -0
  49. zeno_sdk/auth.py +25 -0
  50. zeno_sdk/client.py +87 -0
  51. zeno_sdk/config.py +61 -0
  52. zeno_sdk/daemon.py +72 -0
  53. zeno_sdk/privacy.py +46 -0
  54. zeno_sdk/session.py +179 -0
  55. zeno_sdk/storage.py +487 -0
  56. zeno_sdk/types/__init__.py +121 -0
  57. zeno_session_intel/__init__.py +19 -0
  58. zeno_session_intel/analytics.py +588 -0
  59. zeno_session_intel/compression.py +123 -0
  60. zeno_session_intel/ingest.py +376 -0
  61. zeno_session_intel/model.py +129 -0
  62. zeno_session_intel/parsers/__init__.py +31 -0
  63. zeno_session_intel/parsers/claude_code.py +169 -0
  64. zeno_session_intel/parsers/codex.py +265 -0
  65. zeno_session_intel/parsers/cursor.py +198 -0
  66. zeno_session_intel/prices.py +281 -0
  67. zeno_session_intel/schema.py +277 -0
  68. zeno_session_intel/signals.py +319 -0
  69. zeno_session_intel/taxonomy.py +71 -0
zeno_core/rtlx_s.py ADDED
@@ -0,0 +1,460 @@
1
+ """RTLX-S: Raw NASA-TLX, Supervision-augmented (research 1, 2026-06-07 PM3).
2
+
3
+ Five-item cognitive-load probe. Three items are validated raw NASA-TLX
4
+ subscales (mental demand, effort, frustration); two are novel additions
5
+ specific to AI-supervised work (supervision load, execution load).
6
+
7
+ Design contract from research 1:
8
+ - 5 items, 0-10 each (NOT 0-100 like classic TLX-S)
9
+ - Single keypress per item, Enter accepts the previous answer
10
+ - End-anchored labels only ("0 = none / very low", "10 = very high")
11
+ - NO pairwise weighting step (dropped entirely)
12
+ - NEVER collapse to a single index at capture time - store all 5 raw values
13
+ - Default each item to the user's last-entered value, persisted in the
14
+ local zeno data dir at ~/.zeno/rtlxs_last.json
15
+ - Total prompt completes in well under 5 seconds for an experienced user
16
+
17
+ The five items are PINNED. Wording is locked here so the Stage 2 CFA / alpha
18
+ validation pass has a stable instrument across cohorts. Do NOT edit wording
19
+ without recording the change as a new schema_version constant - mixing
20
+ re-worded items into the same response stream invalidates the validation.
21
+
22
+ The novel two items (supervision_load, execution_load) are NOT yet
23
+ empirically validated. Research 1 specifies that they pass validation only
24
+ if Cronbach alpha >= 0.7 and CFA model fit is acceptable at N>=150-300
25
+ sessions across two cohorts. Until then they are treated as candidate
26
+ correlates, not load-bearing for any policy gate.
27
+ """
28
+
29
+ from __future__ import annotations
30
+
31
+ import json
32
+ import os
33
+ import random
34
+ import sys
35
+ import termios
36
+ import tty
37
+ from dataclasses import asdict, dataclass
38
+ from datetime import UTC, datetime
39
+ from pathlib import Path
40
+
41
# Schema version for the persisted defaults + API payloads. Bump when adding
# or rewording items so CFA / alpha analyses don't silently mix instruments.
# Every RTLXResponse row carries this value in its ``schema_version`` field.
#
# v1 (2026-06-07): initial 5-item probe.
# v2 (2026-06-10): supervision_load + execution_load rewritten from
#   proportion-of-effort phrasing ("how much of your effort went into...")
#   to absolute cost. Research 3 (2026-06-10) flagged the proportion framing
#   as an internal-contamination threat: by definition the two share total
#   effort and will correlate near-perfectly negatively, undermining factor
#   independence. Absolute-cost framing anchors each item on its own
#   construct (Sheridan monitor+intervene for supervision; raw mental
#   demand of own work for execution) and is the wording the prior-art scan
#   explicitly recommended pre-survey to protect Stage 2 discriminant
#   validity vs RTLX Mental Demand + Frustration (the vigilance "workload
#   signature" - Warm, Parasuraman & Matthews 2008).
RTLXS_SCHEMA_VERSION: int = 2

# Randomized single-case-experiment (SCED) condition tags (2026-06-15).
# A per-session experimental condition lets occasions be randomly assigned
# to autonomy blocks (ABAB / alternating-treatments) so the within-person
# double dissociation can be tested: autonomy should move supervision_load
# but not execution_load. None = untagged. See
# docs/STAGE_2A_WITHIN_PERSON_DESIGN.md.
# NOTE(review): nothing in this module checks ``autonomy_condition`` against
# this tuple - run_rtlxs_survey_tui() passes the caller's string through
# unchanged, so validation (if any) has to happen in the caller or the API.
RTLXS_AUTONOMY_CONDITIONS: tuple[str, ...] = (
    "high_autonomy",
    "low_autonomy",
    "control",
)

# Rating-time concealment control (SCED v2, 2026-06-19; pre-reg Section 12 A.2).
# In a BLINDED SCED session the survey never shows the condition; after the load
# items it asks the operator to GUESS the concealed condition + rate confidence.
# The Bang/James leakage index is computed from these at N=90 (NOT trial blinding
# - the operator enacts the condition; this measures whether the RATINGS leaked
# it). The guess is placed strictly AFTER the load items so it cannot prime them.
# These are exactly the three strings _prompt_blind_guess() can return.
RTLXS_BLIND_GUESS_OPTIONS: tuple[str, ...] = ("high", "low", "unsure")
77
+
78
+
79
# Ordered tuple of (key, label) pairs. This is the pinned presentation order
# for non-blinded surveys and the order validate_item_keys() reports, so
# downstream stats can index by position. Blinded SCED sessions shuffle a
# copy of this tuple for presentation only; answers are always keyed by the
# item key, never by position on screen.
RTLXS_ITEMS: tuple[tuple[str, str], ...] = (
    (
        "mental_demand",
        "Mental demand: How mentally demanding was this session?",
    ),
    (
        "effort",
        "Effort: How hard did you have to work to get the result you wanted?",
    ),
    (
        "frustration",
        "Frustration: How frustrated, stressed, or annoyed did you feel?",
    ),
    (
        "supervision_load",
        "Supervision load: How much effort did watching, reviewing, or "
        "correcting the agent's work cost you?",
    ),
    (
        "execution_load",
        "Execution load: How mentally demanding was the work you did " "yourself?",
    ),
)


# Trust anchor (research 3, 2026-06-10). One Jian/Bisantz/Drury-derived
# item, fixed across all sessions (not rotated): a single item with full
# N gives stronger convergent-validity evidence than rotation across n=300.
# Wording is adapted from Jian et al. (2000) "The system is dependable"
# to zeno's per-session context. Captured alongside the 5 RTLX-S items;
# scored 0-10. Persisted in a separate column so it is never collapsed
# into the RTLX-S latent structure. Establishes discriminant validity
# vs trust: we expect a moderate negative correlation with supervision_load
# (higher trust -> lower oversight burden) but NOT a near-perfect one.
# The key doubles as the entry name in the persisted last-answer defaults.
RTLXS_TRUST_ITEM_KEY: str = "trust_dependable"
RTLXS_TRUST_ITEM_LABEL: str = "Trust: The AI agent(s) I used this session were dependable."


# Anchor labels (end-anchored only - research 1 contract). Both strings are
# interpolated into the per-item prompt line by _prompt_one_item().
LOW_ANCHOR: str = "0 = none / very low"
HIGH_ANCHOR: str = "10 = very high"
122
+
123
+
124
@dataclass(slots=True)
class RTLXResponse:
    """A single completed RTLX-S response, kept as raw per-item integers.

    The five load items are stored individually (0-10 each) and are never
    folded into a composite score here.

    Context fields:
      - ``session_id`` links the response to a CLI session aggregate. It is
        not part of ``to_payload()``; the wire format targets a URL that
        already names the session.
      - ``active_agents`` / ``parallel_agent_count`` describe the agent mix
        so Stage 2 CFA / alpha analyses can stratify by it.
      - ``schema_version`` records which item wording produced the row so a
        pooled analysis can exclude or stratify by wording cohort.

    Optional fields (``None`` means "not captured"):
      - ``trust_dependable``: single Jian-derived trust anchor (research 3,
        2026-06-10), captured for discriminant-validity evidence.
      - ``agent_interrupts_count``, ``agent_turns_count``,
        ``verification_seconds``: behavioral anchors filled from CLI session
        telemetry when available. Supervision load is expected to correlate
        positively with them; mental demand is expected to correlate less.
      - ``autonomy_condition``: SCED condition tag for the session.
      - ``blind_guess`` / ``blind_confidence`` / ``sced_session_index``:
        SCED v2 rating-time concealment data (see the field comments).
    """

    # Required load items, 0-10 each.
    mental_demand: int
    effort: int
    frustration: int
    supervision_load: int
    execution_load: int
    # Capture metadata.
    captured_at: datetime
    session_id: str
    active_agents: list[str]
    parallel_agent_count: int
    schema_version: int = RTLXS_SCHEMA_VERSION
    # Optional anchors.
    trust_dependable: int | None = None
    agent_interrupts_count: int | None = None
    agent_turns_count: int | None = None
    verification_seconds: int | None = None
    autonomy_condition: str | None = None
    # SCED v2 rating-time concealment (2026-06-19, pre-reg Section 12 A.1/A.2).
    # ``blind_guess`` is the operator's forced post-rating guess of the concealed
    # condition (one of RTLXS_BLIND_GUESS_OPTIONS); ``blind_confidence`` is 0..10.
    # ``sced_session_index`` is the locked-schedule slot this session was assigned;
    # in a blinded session ``autonomy_condition`` stays None (the true condition
    # lives only in the sealed log, joined back by this index at N=90).
    blind_guess: str | None = None
    blind_confidence: int | None = None
    sced_session_index: int | None = None

    def to_payload(self) -> dict[str, object]:
        """Build the wire format for POST /v1/sessions/{session_id}/rtlxs.

        The five load items and the capture metadata are always present.
        Optional fields are emitted only when they are not ``None``.
        ``active_agents`` is copied so the payload does not alias the
        instance's list.
        """
        load_fields = (
            "mental_demand",
            "effort",
            "frustration",
            "supervision_load",
            "execution_load",
        )
        optional_fields = (
            "trust_dependable",
            "agent_interrupts_count",
            "agent_turns_count",
            "verification_seconds",
            "autonomy_condition",
            "blind_guess",
            "blind_confidence",
            "sced_session_index",
        )
        payload: dict[str, object] = {name: getattr(self, name) for name in load_fields}
        payload["captured_at"] = self.captured_at.isoformat()
        payload["active_agents"] = list(self.active_agents)
        payload["parallel_agent_count"] = self.parallel_agent_count
        payload["schema_version"] = self.schema_version
        for name in optional_fields:
            field_value = getattr(self, name)
            if field_value is not None:
                payload[name] = field_value
        return payload
204
+
205
+
206
def _data_dir() -> Path:
    """Return the local zeno data directory, creating it if necessary.

    The directory is ``$ZENO_HOME`` (with ``~`` expanded) when that variable
    is set to a non-empty value, otherwise ``~/.zeno``.

    An empty ``ZENO_HOME`` is treated as unset: ``Path("")`` is the current
    working directory, so honouring it would scatter data files into
    whatever directory the CLI happened to be started from. The home
    directory is only looked up when it is actually needed.

    Raises:
        OSError: if the directory cannot be created.
        RuntimeError: if the fallback is needed and the home directory
            cannot be determined.
    """
    override = os.environ.get("ZENO_HOME")
    if override:
        base_dir = Path(override).expanduser()
    else:
        base_dir = Path.home() / ".zeno"
    base_dir.mkdir(parents=True, exist_ok=True)
    return base_dir
212
+
213
+
214
def _last_response_path() -> Path:
    """Return the path of the JSON file holding the last submitted answers.

    The file lives directly inside the zeno data directory; resolving that
    directory creates it as a side effect.
    """
    data_dir = _data_dir()
    return data_dir / "rtlxs_last.json"
216
+
217
+
218
def load_last_defaults() -> dict[str, int]:
    """Read the most recently submitted RTLX-S answers to use as defaults.

    Returns:
        A dict with one entry per RTLX-S item key plus the trust item key.
        Each value is the previously stored answer when it is an integer in
        0..10, otherwise 5. 5 is the midpoint, so an out-of-the-box
        submission does not bias the score toward either extreme.

    The survey TUI must never show a stacktrace, so every failure to obtain
    usable data degrades to the all-5 fallback: a missing file, an
    unreadable file or data directory, bytes that are not valid UTF-8,
    malformed JSON, or a top-level JSON value that is not an object.
    """
    keys = [key for key, _ in RTLXS_ITEMS] + [RTLXS_TRUST_ITEM_KEY]
    fallback: dict[str, int] = dict.fromkeys(keys, 5)
    try:
        # Resolving the path may create the data directory, so it belongs
        # inside the guarded region as well. Reading directly (instead of
        # checking exists() first) avoids a check-then-use race.
        raw = json.loads(_last_response_path().read_text(encoding="utf-8"))
    except (OSError, ValueError, RuntimeError):
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError;
        # RuntimeError covers an undeterminable home directory.
        return fallback
    if not isinstance(raw, dict):
        return fallback
    out: dict[str, int] = {}
    for key in keys:
        value = raw.get(key)
        # bool is a subclass of int; JSON true/false must not be accepted
        # as the ratings 1/0.
        is_rating = isinstance(value, int) and not isinstance(value, bool)
        out[key] = value if is_rating and 0 <= value <= 10 else 5
    return out
246
+
247
+
248
def save_last_defaults(values: dict[str, int]) -> None:
    """Store the given answers so the next survey run can default to them.

    Exactly six entries are written: the five RTLX-S item keys and the trust
    item key. A key missing from ``values`` is stored as 5. No session
    metadata goes into the file, so it can be treated as a per-user
    preference and safely git-ignored.

    An ``OSError`` while resolving the path or writing is swallowed on
    purpose - persisting defaults must never disrupt the CLI survey path.
    """
    item_keys = [name for name, _label in RTLXS_ITEMS]
    item_keys.append(RTLXS_TRUST_ITEM_KEY)
    snapshot: dict[str, int] = {}
    for name in item_keys:
        snapshot[name] = int(values.get(name, 5))
    try:
        target = _last_response_path()
        target.write_text(json.dumps(snapshot, sort_keys=True), encoding="utf-8")
    except OSError:
        return
264
+
265
+
266
def _read_single_keypress() -> str:
    """Read exactly one character from stdin with the terminal in raw mode.

    POSIX-only (relies on ``termios`` / ``tty``). run_rtlxs_survey_tui()
    checks ``isatty()`` before any prompt reaches this function.

    The character is returned untouched, control characters included, so
    the caller decides what Ctrl-C, Ctrl-D and Enter mean. The terminal
    settings captured on entry are restored on every exit path.
    """
    stdin_fd = sys.stdin.fileno()
    saved_attrs = termios.tcgetattr(stdin_fd)
    try:
        tty.setraw(stdin_fd)
        return sys.stdin.read(1)
    finally:
        # Put the terminal back even if switching to raw mode or the read failed.
        termios.tcsetattr(stdin_fd, termios.TCSADRAIN, saved_attrs)
282
+
283
+
284
def _prompt_one_item(*, label: str, previous: int) -> int | None:
    """Show one survey item and read its 0-10 rating from a single keypress.

    Key handling:
      - ``0``-``9`` commit that value immediately.
      - ``T`` / ``t`` ("ten") commits 10, which keeps every item a single
        keypress and the whole probe within the research 1 time budget.
      - Enter keeps ``previous``.
      - Ctrl-C / Ctrl-D cancel.
      - Anything else (including an empty read) keeps ``previous``.
        Refusing to advance would leave the user in a stuck prompt, which
        is a worse failure mode than accepting the previous value.

    The chosen value is echoed after the prompt.

    Args:
        label: Item wording shown to the user.
        previous: Value used when the user presses Enter or an unknown key,
            and when the keypress cannot be read.

    Returns:
        The chosen rating, or ``None`` when the user cancelled.
    """
    sys.stdout.write(f" {label}\n")
    sys.stdout.write(
        f" [{LOW_ANCHOR}, {HIGH_ANCHOR}] (last = {previous}, Enter keeps, T = 10): "
    )
    sys.stdout.flush()
    try:
        ch = _read_single_keypress()
    except (OSError, termios.error):
        sys.stdout.write(f"{previous}\n")
        return previous
    if ch in ("\x03", "\x04"):
        sys.stdout.write("\n")
        return None
    if ch in ("t", "T"):
        sys.stdout.write("10\n")
        return 10
    # Only ASCII digits are ratings. str.isdigit() alone is also true for
    # characters such as superscript digits, which int() rejects with
    # ValueError - that would crash the survey on a stray keypress.
    if ch.isascii() and ch.isdigit():
        sys.stdout.write(f"{ch}\n")
        return int(ch)
    # Enter and every unrecognised key keep the previous answer.
    sys.stdout.write(f"{previous}\n")
    return previous
325
+
326
+
327
def _prompt_blind_guess() -> str | None:
    """Ask for a forced guess of the concealed SCED condition.

    One keypress decides the answer: ``h`` / ``H`` gives ``"high"``,
    ``l`` / ``L`` gives ``"low"``, Ctrl-C / Ctrl-D cancels, and every other
    key - as well as a failed read - gives ``"unsure"``. The prompt
    therefore never blocks on an unknown key, matching the failure-mode
    contract of _prompt_one_item. The resolved answer is echoed.

    Returns:
        One of RTLXS_BLIND_GUESS_OPTIONS, or ``None`` on cancel.
    """
    sys.stdout.write(" Which condition do you think this session was?\n")
    sys.stdout.write(" [h = high autonomy, l = low autonomy, u = unsure]: ")
    sys.stdout.flush()
    try:
        pressed = _read_single_keypress()
    except (OSError, termios.error):
        sys.stdout.write("unsure\n")
        return "unsure"
    if pressed == "\x03" or pressed == "\x04":
        sys.stdout.write("\n")
        return None
    guess_by_key = {"h": "high", "H": "high", "l": "low", "L": "low"}
    guess = guess_by_key.get(pressed, "unsure")
    sys.stdout.write(f"{guess}\n")
    return guess
354
+
355
+
356
def run_rtlxs_survey_tui(
    *,
    session_id: str,
    active_agents: list[str] | None = None,
    parallel_agent_count: int = 0,
    agent_interrupts_count: int | None = None,
    agent_turns_count: int | None = None,
    verification_seconds: int | None = None,
    autonomy_condition: str | None = None,
    sced_blinded: bool = False,
    sced_session_index: int | None = None,
) -> RTLXResponse | None:
    """Run the five-item RTLX-S prompt plus one trust anchor item.

    Each item is answered with a single keypress and Enter keeps the
    previous answer. When stdin is not a tty (background daemon / piped
    invocation) the probe is skipped rather than blocking on a read.

    Args:
        session_id: CLI session the response belongs to.
        active_agents: Agent names active in the session; copied into the
            response (``None`` becomes an empty list).
        parallel_agent_count: Stored on the response unchanged.
        agent_interrupts_count, agent_turns_count, verification_seconds:
            Behavioral anchors taken from CLI session telemetry by the
            caller. They are never keyed in by the user; pass ``None`` when
            telemetry is unavailable.
        autonomy_condition: SCED condition tag. Stored as given for a
            non-blinded session and forced to ``None`` for a blinded one.
        sced_blinded: When true, the five load items are shown in random
            order and a condition guess plus confidence are collected after
            all rating items.
        sced_session_index: Locked-schedule slot, stored unchanged.

    Returns:
        The completed response, or ``None`` when stdin is not a tty or the
        user cancelled at any prompt.

    The stored last-answer defaults are updated only when the survey
    completes. A survey cancelled at any prompt - including the blinded
    guess / confidence prompts - leaves them untouched, because nothing was
    submitted.
    """
    if not sys.stdin.isatty():
        return None
    defaults = load_last_defaults()
    sys.stdout.write("RTLX-S probe (5 items + 1 trust, single keypress each):\n")
    sys.stdout.flush()
    answers: dict[str, int] = {}
    # Item-order randomization (SCED v2 A.4): in a blinded session the 5 load
    # items are presented in a random order to disrupt response sets. Answers
    # are keyed by item name, so the order is presentation-only. Non-blinded
    # surveys keep the pinned order.
    items = list(RTLXS_ITEMS)
    if sced_blinded:
        random.shuffle(items)
    for key, label in items:
        value = _prompt_one_item(label=label, previous=defaults[key])
        if value is None:
            sys.stdout.write("Cancelled.\n")
            return None
        answers[key] = value
    trust_previous = defaults.get(RTLXS_TRUST_ITEM_KEY, 5)
    trust_value = _prompt_one_item(label=RTLXS_TRUST_ITEM_LABEL, previous=trust_previous)
    if trust_value is None:
        sys.stdout.write("Cancelled.\n")
        return None
    answers[RTLXS_TRUST_ITEM_KEY] = trust_value
    # Rating-time concealment capture (SCED v2 A.2): AFTER all load items, a
    # blinded session collects a forced condition guess + confidence.
    blind_guess: str | None = None
    blind_confidence: int | None = None
    if sced_blinded:
        blind_guess = _prompt_blind_guess()
        if blind_guess is None:
            sys.stdout.write("Cancelled.\n")
            return None
        conf = _prompt_one_item(
            label="Confidence in that guess (0 = pure guess, 10 = certain)",
            previous=5,
        )
        if conf is None:
            sys.stdout.write("Cancelled.\n")
            return None
        blind_confidence = conf
    # Persist defaults only now that every prompt has been answered, so a
    # late cancel cannot overwrite them with a response that was discarded.
    save_last_defaults(answers)
    return RTLXResponse(
        mental_demand=answers["mental_demand"],
        effort=answers["effort"],
        frustration=answers["frustration"],
        supervision_load=answers["supervision_load"],
        execution_load=answers["execution_load"],
        captured_at=datetime.now(tz=UTC),
        session_id=session_id,
        active_agents=list(active_agents or []),
        parallel_agent_count=parallel_agent_count,
        trust_dependable=trust_value,
        agent_interrupts_count=agent_interrupts_count,
        agent_turns_count=agent_turns_count,
        verification_seconds=verification_seconds,
        # The true condition of a blinded session is never stored on the
        # row; it is joined back later via sced_session_index.
        autonomy_condition=None if sced_blinded else autonomy_condition,
        blind_guess=blind_guess,
        blind_confidence=blind_confidence,
        sced_session_index=sced_session_index,
    )
449
+
450
+
451
def validate_item_keys() -> tuple[str, ...]:
    """Return the RTLX-S item keys in their pinned order.

    Sanity helper for tests / API schema; the order follows RTLXS_ITEMS.
    """
    ordered_keys: list[str] = []
    for item_key, _label in RTLXS_ITEMS:
        ordered_keys.append(item_key)
    return tuple(ordered_keys)
454
+
455
+
456
+ def response_to_dict(response: RTLXResponse) -> dict[str, object]:
457
+ """Plain dict view of a response for ad-hoc logging."""
458
+ data = asdict(response)
459
+ data["captured_at"] = response.captured_at.isoformat()
460
+ return data
zeno_core/streak.py ADDED
@@ -0,0 +1,178 @@
1
+ """Survey streak tracker (research 1 PM3, 2026-06-07).
2
+
3
+ Intrinsic-motivation lever for the RTLX-S probe. The streak counts the
4
+ number of consecutive local-calendar days on which the user has logged at
5
+ least one RTLX-S response. Habit-formation literature (BJ Fogg, Tiny Habits;
6
+ Wood 2019, Good Habits Bad Habits) shows that visible streaks plus a
7
+ "don't break the chain" frame drive higher self-report compliance than
8
+ abstract progress meters, especially for low-effort daily probes like this
9
+ one. We display BOTH current and longest-ever streak so a broken chain
10
+ still leaves the user with a personal-best target to beat.
11
+
12
+ Day boundary policy: a "day" runs from 03:00 LOCAL time (NOT UTC midnight).
13
+ Late-night sessions count toward the previous calendar day so a 2am wrap-up
14
+ does not artificially extend or break a streak. This matches the convention
15
+ used by `morning-pipeline-timezone` (MEMORY.md) and the dashboard digest.
16
+
17
+ Storage neutrality: the helper takes any object exposing a
18
+ ``timestamps_after(cutoff_iso)`` async method that returns ISO timestamps
19
+ of probe responses for a user. The CLI passes its local ZenoStorage (reads
20
+ ``load_probes`` where ``skipped=0``); the API uses ``RTLXSResponse`` rows
21
+ via ``streak_for_user``. Same algorithm, two storage drivers, one return
22
+ shape.
23
+ """
24
+
25
+ from __future__ import annotations
26
+
27
+ from collections.abc import Sequence
28
+ from dataclasses import dataclass
29
+ from datetime import date, datetime, time, timedelta
30
+
31
+ # The local-day cutoff (hour of day). Sessions before this are bucketed into
32
+ # the previous calendar day. Mirrors `morning-pipeline-timezone` policy.
33
+ DAY_CUTOFF_HOUR: int = 3
34
+
35
+
36
+ @dataclass(slots=True)
37
+ class StreakResult:
38
+ """One streak snapshot.
39
+
40
+ ``current`` : streak length ending on (or extending to) today's bucket
41
+ after applying the 03:00 cutoff. 0 if the user has not
42
+ logged in the current bucket AND the previous bucket.
43
+ ``longest`` : longest streak ever recorded.
44
+ ``last_response_at`` : ISO timestamp of the most recent response, or None
45
+ if the user has never submitted one.
46
+ """
47
+
48
+ current: int
49
+ longest: int
50
+ last_response_at: str | None
51
+
52
+
53
+ def _bucket_day(ts: datetime, *, cutoff_hour: int = DAY_CUTOFF_HOUR) -> date:
54
+ """Map a timestamp to its streak-day bucket.
55
+
56
+ Sessions before ``cutoff_hour`` count toward the previous calendar day.
57
+ Naive timestamps are treated as local-tz wall-clock - good enough at MVP
58
+ because both writers (CLI local SQLite and the API RTLXS table) record
59
+ UTC-aware values today, and the tz the caller chose to convert to is
60
+ where the user lives. Tests pass tz-aware UTC datetimes to keep the
61
+ bucket math deterministic across CI machines.
62
+ """
63
+ if ts.time() < time(cutoff_hour):
64
+ return (ts - timedelta(days=1)).date()
65
+ return ts.date()
66
+
67
+
68
+ def compute_streak(
69
+ timestamps: Sequence[datetime],
70
+ *,
71
+ today: date,
72
+ cutoff_hour: int = DAY_CUTOFF_HOUR,
73
+ ) -> StreakResult:
74
+ """Pure function: given response timestamps and today's date, return the streak.
75
+
76
+ Algorithm:
77
+ 1. Bucket each timestamp into a streak-day via ``_bucket_day``.
78
+ 2. Sort unique buckets descending.
79
+ 3. Current streak = number of consecutive days starting at today and
80
+ walking back. Allow a one-day grace: if the user has not yet logged
81
+ today but they DID log yesterday, the streak is preserved (counting
82
+ from yesterday). This avoids penalizing them mid-day before they
83
+ have had a chance to wrap up. The CLI surfaces a "you have not
84
+ logged today" warning separately so they still see the nudge.
85
+ 4. Longest streak = max run of consecutive-day buckets across history.
86
+ 5. last_response_at = max raw timestamp.
87
+
88
+ Empty input returns all zeros + None.
89
+ """
90
+ if not timestamps:
91
+ return StreakResult(current=0, longest=0, last_response_at=None)
92
+
93
+ buckets = sorted({_bucket_day(ts, cutoff_hour=cutoff_hour) for ts in timestamps})
94
+ last_ts = max(timestamps)
95
+
96
+ # Longest run: single pass over sorted ascending buckets
97
+ longest = 1
98
+ run = 1
99
+ for prev, curr in zip(buckets, buckets[1:], strict=False):
100
+ if curr - prev == timedelta(days=1):
101
+ run += 1
102
+ longest = max(longest, run)
103
+ else:
104
+ run = 1
105
+
106
+ # Current streak: walk backwards from the most recent bucket, allowing
107
+ # the user to be at most 1 day stale (today vs yesterday). If the latest
108
+ # bucket is older than yesterday, the streak is broken (0).
109
+ latest_bucket = buckets[-1]
110
+ if latest_bucket == today or latest_bucket == today - timedelta(days=1):
111
+ current = 1
112
+ i = len(buckets) - 2
113
+ while i >= 0 and buckets[i + 1] - buckets[i] == timedelta(days=1):
114
+ current += 1
115
+ i -= 1
116
+ else:
117
+ current = 0
118
+
119
+ return StreakResult(
120
+ current=current,
121
+ longest=longest,
122
+ last_response_at=last_ts.isoformat(),
123
+ )
124
+
125
+
126
def has_logged_today(
    timestamps: Sequence[datetime],
    *,
    today: date,
    cutoff_hour: int = DAY_CUTOFF_HOUR,
) -> bool:
    """Report whether any response falls into today's streak-day bucket.

    The CLI uses this to decide whether to print the "you have not logged
    today" warning. It is not the same as ``current > 0``: the one-day grace
    in compute_streak lets the current streak be non-zero before anything
    has been logged today.
    """
    for stamp in timestamps:
        if _bucket_day(stamp, cutoff_hour=cutoff_hour) == today:
            return True
    return False
141
+
142
+
143
async def read_streak_from_local_storage(
    storage,
    project_id: str,
    *,
    today: date | None = None,
    cutoff_hour: int = DAY_CUTOFF_HOUR,
) -> StreakResult:
    """Read the streak for ``project_id`` from a local SDK ZenoStorage instance.

    Timestamps come from ``load_probes`` joined to ``sessions`` (project
    scope), keeping only non-skipped probes. For each row ``responded_at``
    is used, falling back to ``prompted_at`` when it is empty.

    Every timestamp is converted to the machine's local timezone before
    bucketing. The day boundary is defined in LOCAL time, so bucketing a
    UTC-aware value by its UTC wall clock would put evening responses west
    of UTC on the wrong day. Naive values are assumed to already be local.
    Rows that cannot be parsed or converted are dropped.

    Args:
        storage: Object exposing an awaitable ``_fetchall(sql, params)``.
            NOTE(review): this reaches into a private method of the storage
            class; errors raised by the query itself are not caught here.
        project_id: Project whose probes are counted.
        today: Streak-day to measure against. Defaults to the bucket of the
            current local time, i.e. before ``cutoff_hour`` it is still
            "yesterday", so a streak is not broken between midnight and the
            cutoff.
        cutoff_hour: Hour before which a timestamp belongs to the previous day.

    Returns:
        The streak snapshot from ``compute_streak``.
    """
    rows = await storage._fetchall(
        """
        SELECT lp.responded_at, lp.prompted_at
        FROM load_probes lp
        JOIN sessions s ON s.id = lp.session_id
        WHERE s.project_id = ? AND lp.skipped = 0
        """,
        (project_id,),
    )
    timestamps: list[datetime] = []
    for row in rows:
        raw = row[0] or row[1]
        if not raw:
            continue
        try:
            # astimezone() with no argument converts aware values to local
            # time and tags naive values as local, so the list is uniformly
            # aware and safe to compare.
            timestamps.append(datetime.fromisoformat(str(raw)).astimezone())
        except (ValueError, OverflowError, OSError):
            continue
    if today is None:
        today = _bucket_day(datetime.now(), cutoff_hour=cutoff_hour)
    return compute_streak(timestamps, today=today, cutoff_hour=cutoff_hour)