@hegemonart/get-design-done 1.18.0 → 1.19.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/marketplace.json +11 -5
- package/.claude-plugin/plugin.json +10 -4
- package/CHANGELOG.md +51 -0
- package/README.md +7 -0
- package/SKILL.md +10 -4
- package/agents/README.md +53 -0
- package/agents/a11y-mapper.md +10 -0
- package/agents/component-benchmark-harvester.md +11 -0
- package/agents/component-benchmark-synthesizer.md +11 -0
- package/agents/component-taxonomy-mapper.md +10 -0
- package/agents/design-advisor.md +10 -0
- package/agents/design-assumptions-analyzer.md +10 -0
- package/agents/design-auditor.md +15 -0
- package/agents/design-authority-watcher.md +10 -0
- package/agents/design-component-generator.md +10 -0
- package/agents/design-context-builder.md +6 -1
- package/agents/design-context-checker-gate.md +10 -0
- package/agents/design-context-checker.md +10 -0
- package/agents/design-discussant.md +10 -0
- package/agents/design-doc-writer.md +12 -0
- package/agents/design-executor.md +11 -1
- package/agents/design-figma-writer.md +10 -0
- package/agents/design-fixer.md +10 -0
- package/agents/design-integration-checker-gate.md +10 -0
- package/agents/design-integration-checker.md +10 -0
- package/agents/design-paper-writer.md +10 -0
- package/agents/design-pattern-mapper.md +11 -0
- package/agents/design-pencil-writer.md +10 -0
- package/agents/design-phase-researcher.md +11 -1
- package/agents/design-plan-checker.md +10 -0
- package/agents/design-planner.md +10 -0
- package/agents/design-reflector.md +10 -0
- package/agents/design-research-synthesizer.md +10 -0
- package/agents/design-start-writer.md +10 -0
- package/agents/design-update-checker.md +10 -0
- package/agents/design-verifier-gate.md +10 -0
- package/agents/design-verifier.md +11 -0
- package/agents/gdd-graphify-sync.md +10 -0
- package/agents/gdd-intel-updater.md +10 -0
- package/agents/gdd-learnings-extractor.md +10 -0
- package/agents/motion-mapper.md +10 -0
- package/agents/token-mapper.md +10 -0
- package/agents/visual-hierarchy-mapper.md +10 -0
- package/hooks/gdd-decision-injector.js +30 -8
- package/package.json +16 -3
- package/reference/data-visualization.md +333 -0
- package/reference/form-patterns.md +245 -0
- package/reference/information-architecture.md +255 -0
- package/reference/onboarding-progressive-disclosure.md +250 -0
- package/reference/platforms.md +346 -0
- package/reference/registry.json +409 -360
- package/reference/rtl-cjk-cultural.md +353 -0
- package/reference/schemas/insight-line.schema.json +37 -0
- package/reference/user-research.md +360 -0
- package/scripts/lib/design-search.cjs +206 -0
- package/scripts/lib/probe-optional.cjs +29 -0
- package/scripts/lib/relevance-counter.cjs +121 -0
- package/skills/complete-cycle/SKILL.md +40 -2
- package/skills/continue/SKILL.md +23 -0
- package/skills/pause/SKILL.md +40 -14
- package/skills/recall/SKILL.md +74 -0
- package/skills/resume/SKILL.md +34 -16
- package/skills/timeline/SKILL.md +65 -0
|
@@ -0,0 +1,360 @@
|
|
|
1
|
+
# User Research Methods & Practices
|
|
2
|
+
|
|
3
|
+
Use this during Discover (choosing the right method), Plan (sizing samples correctly), and Verify (interpreting results without over-claiming).
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
## Research Method Matrix
|
|
8
|
+
|
|
9
|
+
Every method sits in one of four quadrants defined by two axes: whether the goal is to **generate** new understanding or **evaluate** an existing solution, and whether the data is **qualitative** (words, observations, themes) or **quantitative** (numbers, rates, statistical comparisons).
|
|
10
|
+
|
|
11
|
+
| | **Qualitative** | **Quantitative** |
|
|
12
|
+
|---|---|---|
|
|
13
|
+
| **Generative** | Contextual interviews, diary studies, field shadowing, card sorting (open), JTBD discovery | Surveys with open-ended analysis, usage log mining, keyword frequency |
|
|
14
|
+
| **Evaluative** | Moderated usability testing, cognitive walkthrough, expert heuristic review | Tree testing, A/B testing, first-click test, 5-second test, unmoderated task completion rates |
|
|
15
|
+
|
|
16
|
+
**Generative research** answers "What problems exist and why?" It is appropriate early in a project cycle before solutions are defined. Outputs are themes, job stories, opportunity areas, and mental models.
|
|
17
|
+
|
|
18
|
+
**Evaluative research** answers "Does this solution work, and how well?" It is appropriate once a concept, prototype, or live product exists. Outputs are pass/fail rates, benchmarks, and ranked preference scores.
|
|
19
|
+
|
|
20
|
+
Mixing quadrants in the same study is usually a mistake — participants primed by evaluation tasks answer generative questions defensively. Plan separate sessions or separate phases.
|
|
21
|
+
|
|
22
|
+
---
|
|
23
|
+
|
|
24
|
+
## Method Catalog
|
|
25
|
+
|
|
26
|
+
### Contextual Interviews / User Interviews
|
|
27
|
+
|
|
28
|
+
A contextual interview is a structured or semi-structured conversation conducted in the environment where the work actually happens — at a desk, in a kitchen, on a shop floor. A plain user interview happens in a neutral setting (a conference room, a video call) and relies on self-report rather than observation. Contextual interviews surface discrepancies between what people say they do and what they actually do; plain interviews are faster to schedule and sufficient when behavior is verbal or cognitive.
|
|
29
|
+
|
|
30
|
+
**When to use:** Early discovery, when you need to understand goals, mental models, workarounds, and emotional context before any design artifact exists.
|
|
31
|
+
|
|
32
|
+
**Sample size:** 5–8 participants per distinct user segment for qualitative saturation. Add a segment (new 5–8) when the population differs meaningfully in goals, context, or technical ability.
|
|
33
|
+
|
|
34
|
+
**Key pitfalls:**
|
|
35
|
+
- Asking leading questions ("Do you find the checkout confusing?") primes the answer
|
|
36
|
+
- Asking hypothetical questions ("Would you use a feature that…?") produces aspirational fiction, not real behavior
|
|
37
|
+
- Failing to probe the last three minutes of a story — "tell me more about that" almost always yields the real insight
|
|
38
|
+
- Recording without consent, or failing to inform participants how data will be used
|
|
39
|
+
|
|
40
|
+
---
|
|
41
|
+
|
|
42
|
+
### Field Observation (Diary Studies, Shadowing)
|
|
43
|
+
|
|
44
|
+
Field observation captures behavior as it naturally unfolds, without a researcher present (diary study) or with a researcher silently present (shadowing). In a **diary study**, participants self-report via a structured log — a form, a voice note, or a photo — triggered by an event or a fixed schedule over days to weeks. In **shadowing**, the researcher follows a participant through a real work session and takes notes or records video.
|
|
45
|
+
|
|
46
|
+
**When to use:** When the behavior of interest is distributed across time (diary) or deeply embedded in a physical environment (shadowing); when recall in an interview would be too compressed or distorted.
|
|
47
|
+
|
|
48
|
+
**Sample size:** 8–15 participants for diary studies (attrition is high; plan for 30–40% dropout). Shadowing rarely exceeds 5–8 sessions before themes repeat.
|
|
49
|
+
|
|
50
|
+
**Key pitfalls:**
|
|
51
|
+
- Diary prompts that are too long or too frequent cause abandonment
|
|
52
|
+
- Shadowing creates an observer effect — participants perform "correct" behavior rather than habitual behavior; use this method alongside analytics to cross-check
|
|
53
|
+
- Diary data requires significant cleaning before synthesis; budget time accordingly
|
|
54
|
+
|
|
55
|
+
---
|
|
56
|
+
|
|
57
|
+
### Surveys
|
|
58
|
+
|
|
59
|
+
A survey collects self-reported data at scale through a fixed set of questions delivered asynchronously. Surveys are the only method that can reach hundreds or thousands of respondents simultaneously, making them suited for measuring prevalence ("What percentage of users have encountered this problem?") and tracking attitudes over time.
|
|
60
|
+
|
|
61
|
+
**When to use:** When you need to generalize a finding across a population, measure satisfaction (NPS, CSAT, CES), or validate that a qualitative theme observed in interviews appears at meaningful frequency.
|
|
62
|
+
|
|
63
|
+
**Sample size:** For a ±5% margin of error at 95% confidence in a large population, n = 385. For ±3%, n = 1,067. Smaller populations need fewer respondents in absolute terms, but a larger share of the population (apply the finite population correction).
|
|
64
|
+
|
|
65
|
+
**Key pitfalls:**
|
|
66
|
+
- Question order effects — put sensitive or demographic questions last
|
|
67
|
+
- Acquiescence bias — respondents agree with statements regardless of content; use balanced scales (Likert with both positive and negative anchors)
|
|
68
|
+
- Non-response (self-selection) bias — survey respondents are not the same as non-respondents; a 10% response rate produces a self-selected sample
|
|
69
|
+
- Treating ordinal Likert data as interval data for arithmetic mean comparisons
|
|
70
|
+
|
|
71
|
+
---
|
|
72
|
+
|
|
73
|
+
### Card Sorting
|
|
74
|
+
|
|
75
|
+
Card sorting asks participants to organize a set of labeled cards (representing content, features, or navigation items) into groups that make sense to them. **Open card sorting** lets participants create their own group names — this reveals the mental model. **Closed card sorting** provides predefined categories — this tests whether an existing structure matches expectations. **Hybrid card sorting** asks participants to sort into predefined categories but allows them to create new ones when nothing fits.
|
|
76
|
+
|
|
77
|
+
**Output:** Group labels (open) surface vocabulary and categories that match user language. A **dendrogram** — a tree diagram produced by cluster analysis — shows which cards are grouped together most consistently across participants, revealing the underlying mental model numerically.
|
|
78
|
+
|
|
79
|
+
**When to use:** Before designing information architecture; after tree testing reveals a structure that is failing.
|
|
80
|
+
|
|
81
|
+
**Sample size:** 15–20 participants for open sorting; 30+ for closed sorting where you want statistical confidence in the clusters.
|
|
82
|
+
|
|
83
|
+
**Key pitfalls:**
|
|
84
|
+
- Cards that are ambiguous produce noise, not insight — pilot test card labels first
|
|
85
|
+
- Too many cards (>60) causes fatigue; split into focused subsets
|
|
86
|
+
- Dendrograms look authoritative but are sensitive to the distance algorithm chosen; report the algorithm and test alternative cuts
|
|
87
|
+
|
|
88
|
+
---
|
|
89
|
+
|
|
90
|
+
### Tree Testing
|
|
91
|
+
|
|
92
|
+
Tree testing evaluates an information architecture by asking participants to find content using only the text labels of a navigation tree — no visual design, no search, no breadcrumbs. Participants are given a task ("Find the return policy for a purchased item") and navigate the tree until they select a destination. The test isolates whether the structure and labeling work, independent of visual presentation.
|
|
93
|
+
|
|
94
|
+
**Output:**
|
|
95
|
+
- **Success rate** — percentage of participants who reached the correct destination. >75% = good; >60% = acceptable; below 60% = the path needs redesign.
|
|
96
|
+
- **Directness** — percentage of successful participants who reached the answer without backtracking. High directness means the path is intuitive; high success with low directness means participants found it eventually but the labels are misleading.
|
|
97
|
+
- **Time on task** — useful for comparing two versions of a tree; not meaningful in isolation.
|
|
98
|
+
|
|
99
|
+
**When to use:** Before building navigation; after card sorting to validate that an IA derived from participant groups actually works for task completion.
|
|
100
|
+
|
|
101
|
+
**Sample size:** 50 participants per tree variant for reliable rates; 30 is the practical minimum.
|
|
102
|
+
|
|
103
|
+
**Key pitfalls:**
|
|
104
|
+
- Tasks that hint at the answer ("Find the Privacy section in Settings") teach rather than test
|
|
105
|
+
- A tree that passes at 76% success for one task may still fail for a different task — test all high-traffic paths
|
|
106
|
+
- Do not test more than 10 tasks per session without randomization to prevent fatigue effects
|
|
107
|
+
|
|
108
|
+
---
|
|
109
|
+
|
|
110
|
+
### Preference Testing
|
|
111
|
+
|
|
112
|
+
Preference testing presents two or more design options and asks participants to choose the one they prefer, often with a follow-up asking why. It is a fast, low-cost way to break internal disagreements and gather directional signal, but it measures stated preference, not performance or usability.
|
|
113
|
+
|
|
114
|
+
**When to use:** When stakeholders are deadlocked between two visual directions; when you need to validate that a redesign is at least as well-received as the current design.
|
|
115
|
+
|
|
116
|
+
**Sample size:** 50–100 participants for a binary preference test to achieve statistically meaningful proportions.
|
|
117
|
+
|
|
118
|
+
**Key pitfalls:**
|
|
119
|
+
- Preference does not equal usability — participants often prefer the interface they can use, but they can also prefer a beautiful interface they cannot use over an ugly one they can
|
|
120
|
+
- Options must be shown in randomized order to control for primacy and recency bias
|
|
121
|
+
- "Why did you choose this?" follow-ups suffer from post-hoc rationalization; treat qualitative reasons as directional, not causal
|
|
122
|
+
|
|
123
|
+
---
|
|
124
|
+
|
|
125
|
+
### First-Click Test
|
|
126
|
+
|
|
127
|
+
A first-click test measures where participants click first when presented with a task on a static image or live prototype. Research shows that users who make a correct first click complete tasks successfully about 87% of the time; users who make an incorrect first click complete tasks successfully only 46% of the time. The first click is therefore a strong predictor of task success.
|
|
128
|
+
|
|
129
|
+
**When to use:** To validate that call-to-action placement, labeling, and visual hierarchy direct users to the correct starting point before investing in a fully functional prototype.
|
|
130
|
+
|
|
131
|
+
**Sample size:** 20–40 participants provides sufficient data for heatmap patterns and task success rates.
|
|
132
|
+
|
|
133
|
+
**Key pitfalls:**
|
|
134
|
+
- A click in the right location for the wrong reason (the participant was guessing) is not a pass — follow up with "why did you click there?"
|
|
135
|
+
- First-click tests do not account for context built up from prior pages; test each page in context when the flow matters
|
|
136
|
+
- Heatmaps with small n look authoritative but individual outlier clicks distort the visualization
|
|
137
|
+
|
|
138
|
+
---
|
|
139
|
+
|
|
140
|
+
### 5-Second Test
|
|
141
|
+
|
|
142
|
+
A 5-second test exposes participants to a design for exactly five seconds, then hides it and asks what they remember, what the purpose of the page is, or what stood out. It measures first impressions and whether the primary message is communicated before any deliberate reading begins.
|
|
143
|
+
|
|
144
|
+
**When to use:** To test whether a landing page, hero section, or dashboard communicates its core value immediately; useful in early evaluation of competing concepts.
|
|
145
|
+
|
|
146
|
+
**Sample size:** 20–50 participants; the test is fast enough that larger samples are easy to recruit.
|
|
147
|
+
|
|
148
|
+
**Key pitfalls:**
|
|
149
|
+
- Questions asked immediately after exposure must be closed or very specific — open-ended questions produce memory artifacts, not design feedback
|
|
150
|
+
- The 5-second window captures attention and memory, not comprehension or usability; do not use this to predict task performance
|
|
151
|
+
- Results are highly sensitive to the specific question asked — "What is this page for?" and "What do you remember?" produce different data from the same exposure
|
|
152
|
+
|
|
153
|
+
---
|
|
154
|
+
|
|
155
|
+
## Sample-Size Heuristics
|
|
156
|
+
|
|
157
|
+
Getting sample size wrong is one of the most common research errors. Under-powering a quantitative study produces unreliable results; over-sizing a qualitative study wastes time without yielding more themes.
|
|
158
|
+
|
|
159
|
+
### Nielsen's "5 Users" Rule
|
|
160
|
+
|
|
161
|
+
Jakob Nielsen's finding that five users uncover approximately 85% of usability problems applies specifically to **moderated qualitative usability testing** with a homogeneous user population. It describes the point of diminishing returns for finding new themes, not for measuring rates or reaching statistical confidence.
|
|
162
|
+
|
|
163
|
+
The rule does **not** apply to:
|
|
164
|
+
- Quantitative usability studies where you need accurate task-completion rates (requires 20+ per condition)
|
|
165
|
+
- Surveys where you need generalizable proportions (requires hundreds)
|
|
166
|
+
- A/B tests where you need to detect a treatment effect (requires thousands)
|
|
167
|
+
- Tree tests where success-rate benchmarks must be reliable (requires 50+)
|
|
168
|
+
- Studies with heterogeneous populations — run 5 users per distinct segment
|
|
169
|
+
|
|
170
|
+
Misapplying the "5 users" rule to quantitative contexts is a common and costly error. A sample of five will tell you which problems exist, not how often they occur.
|
|
171
|
+
|
|
172
|
+
### A/B Testing Sample Size
|
|
173
|
+
|
|
174
|
+
The minimum sample size per variant is determined by three inputs: the **baseline conversion rate**, the **minimum detectable effect (MDE)** expressed as a relative lift, and the desired **statistical power**.
|
|
175
|
+
|
|
176
|
+
```
|
|
177
|
+
n ≈ (16 × σ²) / δ²
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
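Where σ² is the variance of the metric and δ is the absolute MDE; the constant 16 assumes a two-sided α = 0.05 and 80% power. For a binary conversion metric with baseline rate p, σ² = p(1 − p), so at a 10% baseline σ² = 0.09: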
|
|
181
|
+
|
|
182
|
+
- Baseline = 10%, MDE = 5% relative lift (0.5pp absolute), 80% power → n ≈ 16 × 0.09 / 0.005² ≈ 57,600 per variant
|
|
183
|
+
- Baseline = 10%, MDE = 20% relative lift (2pp absolute), 80% power → n ≈ 16 × 0.09 / 0.02² ≈ 3,600 per variant
|
|
184
|
+
|
|
185
|
+
As a practical floor: **n ≥ 1,000 per variant**. At a 10% baseline, 1,000 per variant only detects relative lifts of roughly 40% or more at 80% power; detecting a 5% relative MDE takes tens of thousands per variant, as shown above. Below 1,000, the confidence interval is wide enough that most real effects will be declared non-significant.
|
|
186
|
+
|
|
187
|
+
### Survey Margin of Error
|
|
188
|
+
|
|
189
|
+
For a proportion in a large population (>10,000), the sample size needed for a given margin of error at 95% confidence (assuming the worst case, p = 0.5) is:
|
|
190
|
+
|
|
191
|
+
- ±5%: n = 385
|
|
192
|
+
- ±3%: n = 1,067
|
|
193
|
+
- ±1%: n = 9,604
|
|
194
|
+
|
|
195
|
+
For smaller populations, apply the finite population correction. These figures assume simple random sampling; convenience samples (self-selected respondents) do not produce these margins regardless of n.
|
|
196
|
+
|
|
197
|
+
---
|
|
198
|
+
|
|
199
|
+
## Synthesis Techniques
|
|
200
|
+
|
|
201
|
+
Raw research data — interview transcripts, survey responses, diary entries — does not become insight until it is actively interpreted and structured. Three frameworks cover most synthesis needs.
|
|
202
|
+
|
|
203
|
+
### Affinity Diagrams (KJ Method)
|
|
204
|
+
|
|
205
|
+
An affinity diagram (developed by Jiro Kawakita, hence KJ method) organizes qualitative observations into emergent themes through a bottom-up clustering process.
|
|
206
|
+
|
|
207
|
+
**Process:**
|
|
208
|
+
1. Write each distinct observation, quote, or behavior on a separate note (physical or digital)
|
|
209
|
+
2. Silently sort notes into groups based on natural similarity — team members work in parallel, moving notes without discussion
|
|
210
|
+
3. Once sorting slows, name each cluster with a header that captures the insight, not just the topic ("Users abandon checkout when shipping cost appears late" rather than "Shipping")
|
|
211
|
+
4. Merge or split clusters until the structure is stable
|
|
212
|
+
|
|
213
|
+
**When done:** Clustering is complete when new data no longer creates new groups — typically after three to four passes with a complete data set. If a new interview consistently produces notes that fit existing clusters without creating new ones, theoretical saturation has been reached.
|
|
214
|
+
|
|
215
|
+
### Jobs-to-be-Done
|
|
216
|
+
|
|
217
|
+
Jobs-to-be-Done (JTBD) frames user behavior around the progress a person is trying to make, not the product they are using. The canonical job story format is:
|
|
218
|
+
|
|
219
|
+
> **"When I [situation], I want to [motivation/goal], so I can [expected outcome]."**
|
|
220
|
+
|
|
221
|
+
JTBD distinguishes three job layers:
|
|
222
|
+
- **Functional job** — the practical task being accomplished ("When I'm running low on groceries, I want to reorder my usual items quickly, so I can avoid running out")
|
|
223
|
+
- **Social job** — how the person wants to be perceived while doing it ("so my family sees me as organized and dependable")
|
|
224
|
+
- **Emotional job** — how the person wants to feel ("so I don't feel the anxiety of an empty fridge")
|
|
225
|
+
|
|
226
|
+
Designs that serve only the functional job often feel transactional. Designs that also address emotional and social jobs create habitual use and preference. Surface all three layers during synthesis by asking "and why does that matter to you?" until the answer is about identity or feeling, not functionality.
|
|
227
|
+
|
|
228
|
+
### User Journey Mapping
|
|
229
|
+
|
|
230
|
+
A user journey map visualizes the sequence of steps a user takes to accomplish a goal, annotated with emotional valence, touchpoints, and observations at each step.
|
|
231
|
+
|
|
232
|
+
**Key components:**
|
|
233
|
+
- **Stages** — the high-level phases of the experience (Awareness, Onboarding, First Use, Routine Use, Problem Recovery)
|
|
234
|
+
- **Touchpoints** — every channel or interface the user contacts (email, app, support chat, physical receipt)
|
|
235
|
+
- **Emotional valence** — a curve showing frustration, confusion, delight, or confidence at each step, derived from research data, not assumed
|
|
236
|
+
- **Moments of truth** — the two or three steps where user trust is won or lost; these are the highest-leverage design targets
|
|
237
|
+
- **Pain points and gains** — specific friction (long wait, confusing label, missing feedback) and specific delights (unexpected shortcut, empathetic error message)
|
|
238
|
+
|
|
239
|
+
A journey map built from assumed experience is a risk document, not a research artifact. Only map journeys from observed or reported user behavior; mark any step built on assumption explicitly so it can be validated.
|
|
240
|
+
|
|
241
|
+
---
|
|
242
|
+
|
|
243
|
+
## A/B Testing
|
|
244
|
+
|
|
245
|
+
A/B testing (controlled experiment) measures the causal effect of a design change by randomly assigning users to a control variant (A) and a treatment variant (B) and comparing a primary metric after sufficient observations have accumulated.
|
|
246
|
+
|
|
247
|
+
### Sample Size and Power
|
|
248
|
+
|
|
249
|
+
Pre-register the following before running any experiment:
|
|
250
|
+
- **Primary metric** — the single metric the test is designed to move (conversion rate, session length, activation)
|
|
251
|
+
- **MDE** — the smallest relative improvement worth shipping; smaller MDEs require larger samples
|
|
252
|
+
- **Baseline rate** — the current value of the primary metric, measured over a representative period
|
|
253
|
+
- **Power** — 80% (β = 0.20) is the standard minimum; 90% is appropriate when the cost of a missed effect is high
|
|
254
|
+
- **Significance threshold** — α = 0.05 (5% Type I error rate) is standard; use α = 0.01 for high-stakes decisions or multiple simultaneous tests
|
|
255
|
+
|
|
256
|
+
### Sequential vs Fixed-Horizon Testing
|
|
257
|
+
|
|
258
|
+
**Fixed-horizon testing** requires committing to a sample size before the experiment begins and not checking results until that sample is collected. Peeking at results and stopping early when significance is reached inflates the Type I error rate dramatically — 20 independent checks at α = 0.05 would push the false-positive rate to ~64% (1 − 0.95²⁰); repeated looks at accumulating data are correlated, so the real inflation is smaller, but still several times the nominal 5%.
|
|
259
|
+
|
|
260
|
+
**Sequential testing** (e.g., using always-valid confidence sequences or mSPRT methods) allows early stopping while controlling error rates. It is safer when experiments run on volatile traffic or when shipping quickly matters. Sequential tests typically require 20–30% more total sample on average but eliminate the false-positive inflation from peeking. Use sequential testing as the default when your tooling supports it.
|
|
261
|
+
|
|
262
|
+
### Primary Metric vs Guardrail Metrics
|
|
263
|
+
|
|
264
|
+
Before launching any test, define the **guardrail metrics** — the metrics that must not decline as a condition of shipping, regardless of the primary metric result.
|
|
265
|
+
|
|
266
|
+
- **Primary metric:** checkout completion rate
|
|
267
|
+
- **Guardrail metrics:** page load time (must not increase >50ms), return rate (must not increase >1pp), customer support contact rate (must not increase)
|
|
268
|
+
|
|
269
|
+
A treatment that improves checkout completion by 8% while increasing support contacts by 30% is not a win. Define guardrails before running; changing them after results are known is p-hacking by another name.
|
|
270
|
+
|
|
271
|
+
### Novelty Effect
|
|
272
|
+
|
|
273
|
+
New UI patterns, colors, or flows often see inflated positive effects in the first week because users engage with anything that is different. To avoid shipping a change that performs well only because it is new, run experiments for **at least two full business cycles** (typically two weeks for B2C products) before reading results. Monitor the treatment effect over time — a novelty lift decays; a genuine improvement holds flat or grows.
|
|
274
|
+
|
|
275
|
+
---
|
|
276
|
+
|
|
277
|
+
## Analytics-Informed Design
|
|
278
|
+
|
|
279
|
+
Quantitative analytics tells you what users are doing at scale, but it cannot tell you why. Use analytics to identify where to focus research attention, not to replace it.
|
|
280
|
+
|
|
281
|
+
### Funnel Analysis
|
|
282
|
+
|
|
283
|
+
A funnel analysis counts users at each step of a defined sequence (landing → sign-up → onboarding → first key action → activation) and reports the drop-off rate between steps. A large drop between two steps identifies a problem worth investigating; it does not explain the cause.
|
|
284
|
+
|
|
285
|
+
**Correct interpretation:** "60% of users who start sign-up do not complete it — this step warrants usability testing and exit surveys."
|
|
286
|
+
|
|
287
|
+
**Incorrect interpretation:** "The sign-up form is confusing — we should simplify it." Funnels explain what, not why. The drop may be caused by form friction, price discovery too late, technical errors on specific devices, or users deliberately deferring. Research the step before redesigning it.
|
|
288
|
+
|
|
289
|
+
### Cohort Retention Curves
|
|
290
|
+
|
|
291
|
+
A cohort retention curve shows what percentage of users acquired in a given period return to the product on each subsequent day (or week). The shape of the curve identifies which phase of the product relationship is failing:
|
|
292
|
+
|
|
293
|
+
- **Early drop (Day 1–3):** Users are not completing onboarding or are not reaching the first moment of value; the onboarding flow is the design target
|
|
294
|
+
- **Mid drop (Day 7–14):** Users completed onboarding but are not forming a usage habit; engagement features (reminders, progress, social hooks) and the core loop are the design targets
|
|
295
|
+
- **Late drop (Day 30+):** Users had a habit but lost it; this indicates habit failure, not acquisition or onboarding issues — investigate trigger frequency, competing alternatives, and changing life context
|
|
296
|
+
|
|
297
|
+
A flat retention curve (asymptoting above zero) is the signal that product-market fit exists for at least a subset of users.
|
|
298
|
+
|
|
299
|
+
### Heatmap Interpretation
|
|
300
|
+
|
|
301
|
+
Heatmaps aggregate mouse movement, scroll depth, and click positions across many sessions into a visual overlay. They communicate density of attention, with important caveats:
|
|
302
|
+
|
|
303
|
+
- **Scroll depth ≠ engagement.** A user who scrolled to the bottom of a page and immediately left read nothing. Cross-reference scroll depth with time on page.
|
|
304
|
+
- **Click heatmaps lie on dynamic content.** Carousels, tabs, and modal triggers change what is rendered at a coordinate; aggregated clicks on a static screenshot misrepresent where those clicks actually landed.
|
|
305
|
+
- **Move maps are proxies.** Eye-tracking research shows mouse position correlates with gaze only roughly (r ≈ 0.64); do not treat move heatmaps as attention maps.
|
|
306
|
+
|
|
307
|
+
Use heatmaps to generate hypotheses ("Almost nobody clicks the secondary CTA — maybe it is not visible enough at typical scroll depths") and confirm those hypotheses with usability testing or A/B tests.
|
|
308
|
+
|
|
309
|
+
### Session Recordings
|
|
310
|
+
|
|
311
|
+
Session recordings capture individual user interactions as video replays. They are the analytics tool closest to observational research and the most time-intensive to analyze at scale.
|
|
312
|
+
|
|
313
|
+
**Patterns that indicate design problems:**
|
|
314
|
+
- **Rage clicks** — repeated rapid clicks on an element, indicating the user believes something should be clickable and it is not (or it is broken)
|
|
315
|
+
- **Dead clicks** — clicks on non-interactive elements, indicating visual affordance is misleading
|
|
316
|
+
- **Scroll-and-return patterns** — the user scrolls past a point, returns, and re-reads it, indicating the content is ambiguous or the action they need is not where they expected it
|
|
317
|
+
- **U-turns on multi-step flows** — navigating forward then immediately back, indicating confusion or missing information at the forward step
|
|
318
|
+
|
|
319
|
+
Sampling strategy matters: watch recordings filtered by the segment of users you are trying to understand (new users on mobile, users who abandoned checkout) rather than random sessions. Random sessions produce anecdote, not insight.
|
|
320
|
+
|
|
321
|
+
---
|
|
322
|
+
|
|
323
|
+
## Research Ethics
|
|
324
|
+
|
|
325
|
+
Ethical research practice is not only a moral obligation — it also produces better data. Participants who feel respected and safe give more candid and accurate responses.
|
|
326
|
+
|
|
327
|
+
### Informed Consent
|
|
328
|
+
|
|
329
|
+
Every participant must be told, before the session begins: what they will be asked to do, how long it will take, whether the session will be recorded (audio, video, or screen), who will have access to recordings and data, and how data will be stored and eventually deleted. Consent must be explicit and voluntary — a participant who feels they cannot say no has not given meaningful consent.
|
|
330
|
+
|
|
331
|
+
For unmoderated remote studies (tree tests, surveys, unmoderated usability tests), informed consent is captured in a pre-task screen that participants must actively acknowledge before proceeding.
|
|
332
|
+
|
|
333
|
+
### Observer Effect
|
|
334
|
+
|
|
335
|
+
The presence of a researcher — or even the knowledge that behavior is being observed — changes that behavior. Participants tend to perform more carefully, ask more clarifying questions, and avoid the shortcuts and workarounds that characterize natural use. This is the Hawthorne effect applied to usability research.
|
|
336
|
+
|
|
337
|
+
**Mitigations:**
|
|
338
|
+
- Use diary studies and screen recordings to capture naturalistic behavior without researcher presence
|
|
339
|
+
- In moderated sessions, use indirect tasks ("Show me how you would accomplish X") rather than direct observation ("Try to do X while I watch")
|
|
340
|
+
- Cross-reference moderated findings with analytics — discrepancies between observed behavior and logged behavior indicate observer effect
|
|
341
|
+
|
|
342
|
+
### Data Minimization
|
|
343
|
+
|
|
344
|
+
Collect only the data you will actually analyze. If the research question does not require knowing a participant's age, do not ask for it. If screen recordings will not be reviewed, do not record them. At rest, research data should be:
|
|
345
|
+
- Stored in access-controlled systems, not personal drives or shared folders
|
|
346
|
+
- Anonymized or pseudonymized — names replaced with participant IDs, identifying details removed from quotes before sharing
|
|
347
|
+
- Deleted on a defined schedule once the synthesis is complete
|
|
348
|
+
|
|
349
|
+
Retaining raw recordings indefinitely is a liability, not a resource.
|
|
350
|
+
|
|
351
|
+
### Vulnerable Populations
|
|
352
|
+
|
|
353
|
+
Research involving **children under 13**, **users with cognitive or developmental disabilities**, or **users in crisis** (mental health platforms, bereavement services) requires heightened ethical review equivalent to IRB (Institutional Review Board) standards, even in commercial settings. Specific requirements:
|
|
354
|
+
|
|
355
|
+
- Parental or guardian consent for minors, in addition to the participant's own assent
|
|
356
|
+
- Simplified consent language and extended time for participants with cognitive disabilities
|
|
357
|
+
- Safeguarding protocols — a defined plan for what happens if a participant discloses distress during a session
|
|
358
|
+
- Avoiding incentive structures that exploit vulnerability (large cash payments to economically precarious participants create coercive pressure)
|
|
359
|
+
|
|
360
|
+
When in doubt about whether a population qualifies for heightened review, treat them as though they do. The cost of over-protecting participants is low; the cost of under-protecting them is not.
|
|
@@ -0,0 +1,206 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
/**
|
|
3
|
+
* design-search.cjs — cross-cycle recall search backend.
|
|
4
|
+
*
|
|
5
|
+
* Priority chain:
|
|
6
|
+
* 1. FTS5 via better-sqlite3 (fast, ranked) — when module is available
|
|
7
|
+
* 2. ripgrep — when rg is on PATH
|
|
8
|
+
* 3. Node fs line scan — universal fallback
|
|
9
|
+
*
|
|
10
|
+
* Public API:
|
|
11
|
+
* search(query, projectRoot, opts?) → [{file, line, text}]
|
|
12
|
+
* reindex(projectRoot) → void (rebuilds FTS5 DB; no-op on grep path)
|
|
13
|
+
* backendName() → 'fts5' | 'ripgrep' | 'node-grep'
|
|
14
|
+
*/
|
|
15
|
+
|
|
16
|
+
const fs = require('fs');
|
|
17
|
+
const path = require('path');
|
|
18
|
+
const { spawnSync } = require('child_process');
|
|
19
|
+
const { probeOptional } = require('./probe-optional.cjs');
|
|
20
|
+
|
|
21
|
+
// ---------------------------------------------------------------------------
|
|
22
|
+
// Backend selection (evaluated once at module load)
|
|
23
|
+
// ---------------------------------------------------------------------------
|
|
24
|
+
|
|
25
|
+
// Optional native dependency; falsy when better-sqlite3 cannot be loaded.
const Database = probeOptional('better-sqlite3');

// True only when this SQLite build can create the same kind of table that
// _openDb() creates: an FTS5 virtual table using the trigram tokenizer.
// Probing plain `fts5(t)` is not sufficient — a build that has FTS5 but lacks
// the trigram tokenizer would pass and then throw inside _openDb() instead of
// letting search() fall back to ripgrep or the Node line scan.
let _fts5Supported = false;
if (Database) {
  let probe = null;
  try {
    probe = new Database(':memory:');
    probe.exec("CREATE VIRTUAL TABLE _p USING fts5(t, tokenize='trigram')");
    _fts5Supported = true;
  } catch {
    /* fts5 extension or trigram tokenizer not available in this build */
  } finally {
    // Release the in-memory handle even when the CREATE statement throws.
    try {
      if (probe) probe.close();
    } catch {
      /* closing a failed probe is best-effort */
    }
  }
}
|
|
36
|
+
|
|
37
|
+
/**
 * Reports whether an `rg` executable can be launched successfully.
 *
 * Runs `rg --version` synchronously and treats exit status 0 as success.
 *
 * @returns {boolean} true when the process exited with status 0, false otherwise
 */
function _rgAvailable() {
  let outcome;
  try {
    outcome = spawnSync('rg', ['--version'], { encoding: 'utf8', windowsHide: true });
  } catch {
    return false;
  }
  return outcome.status === 0;
}

// Evaluated once at module load; the rest of the module reads this cached answer.
const _hasRg = _rgAvailable();
|
|
45
|
+
|
|
46
|
+
/**
 * Names the active backend, checked in priority order:
 * FTS5 first, then ripgrep, then the Node line scan.
 *
 * @returns {'fts5' | 'ripgrep' | 'node-grep'}
 */
function backendName() {
  if (_fts5Supported) {
    return 'fts5';
  }
  return _hasRg ? 'ripgrep' : 'node-grep';
}
|
|
51
|
+
|
|
52
|
+
// ---------------------------------------------------------------------------
|
|
53
|
+
// Index paths
|
|
54
|
+
// ---------------------------------------------------------------------------
|
|
55
|
+
|
|
56
|
+
// Descriptive list of the project files that feed the search index.
// _collectFiles() performs the actual discovery (including STATE.md), so keep
// the two in sync when adding or removing a source.
const INDEXED_GLOBS = [
  '.design/archive/**/*.md',
  '.design/learnings/LEARNINGS.md',
  '.design/CYCLES.md',
  '.design/STATE.md',
];
|
|
61
|
+
|
|
62
|
+
/**
 * Builds the location of the search database file for a project.
 *
 * @param {string} projectRoot project directory that holds `.design/`
 * @returns {string} `projectRoot` joined with `.design/search.db`
 */
function _dbPath(projectRoot) {
  const segments = ['.design', 'search.db'];
  return path.join(projectRoot, ...segments);
}
|
|
65
|
+
|
|
66
|
+
/**
 * Gathers every file that the search backends read.
 *
 * The result lists, in order: all `.md` files found recursively under
 * `.design/archive`, then `.design/learnings/LEARNINGS.md`,
 * `.design/CYCLES.md` and `.design/STATE.md` — each of the last three only
 * when it exists. Directories that cannot be read are skipped silently.
 *
 * @param {string} projectRoot project directory that holds `.design/`
 * @returns {string[]} paths built by joining onto `projectRoot`
 */
function _collectFiles(projectRoot) {
  const found = [];

  // Depth-first walk: a subdirectory is descended into at the position where
  // it appears in its parent's listing.
  const visit = (dir) => {
    let dirents;
    try {
      dirents = fs.readdirSync(dir, { withFileTypes: true });
    } catch {
      return; // missing or unreadable directory
    }
    dirents.forEach((dirent) => {
      const target = path.join(dir, dirent.name);
      if (dirent.isDirectory()) {
        visit(target);
      } else if (dirent.name.endsWith('.md')) {
        found.push(target);
      }
    });
  };
  visit(path.join(projectRoot, '.design', 'archive'));

  // Fixed single files; STATE.md is included for its decision blocks.
  const singles = [
    path.join('.design', 'learnings', 'LEARNINGS.md'),
    path.join('.design', 'CYCLES.md'),
    path.join('.design', 'STATE.md'),
  ];
  for (const rel of singles) {
    const candidate = path.join(projectRoot, rel);
    if (fs.existsSync(candidate)) {
      found.push(candidate);
    }
  }
  return found;
}
|
|
90
|
+
|
|
91
|
+
// ---------------------------------------------------------------------------
|
|
92
|
+
// FTS5 backend
|
|
93
|
+
// ---------------------------------------------------------------------------
|
|
94
|
+
|
|
95
|
+
/**
 * Opens (creating if necessary) the project's search database and makes sure
 * the `docs` FTS5 table exists.
 *
 * @param {string} projectRoot project directory that holds `.design/`
 * @returns {object} an open better-sqlite3 database handle; the caller closes it
 * @throws rethrows any error raised while opening the file or creating the table
 */
function _openDb(projectRoot) {
  const dbPath = _dbPath(projectRoot);
  fs.mkdirSync(path.dirname(dbPath), { recursive: true });
  const db = new Database(dbPath);
  try {
    // `file` and `line` are stored but UNINDEXED; only `text` is searchable.
    db.exec(`
      CREATE VIRTUAL TABLE IF NOT EXISTS docs USING fts5(
        file UNINDEXED, line UNINDEXED, text, tokenize='trigram'
      );
    `);
  } catch (err) {
    // Do not leave the handle open when schema creation fails.
    db.close();
    throw err;
  }
  return db;
}
|
|
106
|
+
|
|
107
|
+
/**
 * Rebuilds the FTS5 index from scratch: one row per non-blank (trimmed) line
 * of every file returned by _collectFiles(). Does nothing when FTS5 is not
 * supported.
 *
 * The old rows are deleted inside the same transaction that inserts the new
 * ones, so an error part-way through rolls the whole rebuild back rather than
 * leaving an empty or half-filled index. The handle is always closed.
 *
 * @param {string} projectRoot project directory that holds `.design/`
 * @returns {void}
 */
function reindex(projectRoot) {
  if (!_fts5Supported) return;
  const db = _openDb(projectRoot);
  try {
    const insert = db.prepare('INSERT INTO docs(file, line, text) VALUES (?,?,?)');
    const rebuild = db.transaction((files) => {
      db.exec('DELETE FROM docs');
      for (const file of files) {
        let content;
        try {
          content = fs.readFileSync(file, 'utf8');
        } catch {
          continue; // unreadable file: skip it, keep indexing the rest
        }
        const lines = content.split(/\r?\n/);
        for (let i = 0; i < lines.length; i++) {
          const t = lines[i].trim();
          if (t) insert.run(file, i + 1, t); // line numbers are 1-based
        }
      }
    });
    rebuild(_collectFiles(projectRoot));
  } finally {
    db.close();
  }
}
|
|
126
|
+
|
|
127
|
+
/**
 * Runs a ranked full-text query against the project's FTS5 index, building
 * the index first when the database file does not exist yet.
 *
 * The query is split on whitespace and each term is handed to FTS5 as a
 * double-quoted string (embedded quotes doubled). All terms must match, and
 * characters that are FTS5 query syntax (quotes, `-`, `*`, `:`, parentheses)
 * are matched literally instead of raising a syntax error. A query with no
 * terms returns an empty list without touching the database.
 *
 * NOTE(review): the table is created with tokenize='trigram'; per the SQLite
 * FTS5 documentation, terms shorter than three characters do not match any
 * row in a full-text query — confirm whether short terms need a fallback.
 *
 * @param {string} query whitespace-separated search terms
 * @param {string} projectRoot project directory that holds `.design/`
 * @param {number} limit maximum number of rows to return
 * @returns {{ file: string, line: number, text: string }[]}
 */
function _searchFts5(query, projectRoot, limit) {
  const terms = query.split(/\s+/).filter(Boolean);
  if (!terms.length) return [];
  const matchExpr = terms.map((t) => `"${t.replace(/"/g, '""')}"`).join(' ');

  const dbPath = _dbPath(projectRoot);
  if (!fs.existsSync(dbPath)) reindex(projectRoot);
  const db = new Database(dbPath, { readonly: true });
  try {
    const rows = db.prepare(
      `SELECT file, line, text FROM docs WHERE docs MATCH ? ORDER BY rank LIMIT ?`
    ).all(matchExpr, limit);
    return rows.map((r) => ({ file: r.file, line: r.line, text: r.text }));
  } finally {
    db.close();
  }
}
|
|
140
|
+
|
|
141
|
+
// ---------------------------------------------------------------------------
|
|
142
|
+
// Ripgrep backend
|
|
143
|
+
// ---------------------------------------------------------------------------
|
|
144
|
+
|
|
145
|
+
/**
 * Escapes regular-expression metacharacters so the input matches literally.
 *
 * @param {string} s raw text
 * @returns {string} `s` with each of . * + ? ^ $ { } ( ) | [ ] \ preceded by a backslash
 */
function _escapeRe(s) {
  const metaChars = /[.*+?^${}()|[\]\\]/g;
  const escaped = s.replace(metaChars, '\\$&');
  return escaped;
}
|
|
148
|
+
|
|
149
|
+
/**
 * Searches the collected files with ripgrep.
 *
 * Matching is case-insensitive and a line is reported only when it contains
 * every whitespace-separated term, the same rule the Node line scan applies.
 * ripgrep is asked for lines containing any term; the all-terms rule is then
 * applied to its output.
 *
 * @param {string} query whitespace-separated search terms
 * @param {string} projectRoot project directory that holds `.design/`
 * @param {number} limit maximum number of results to return
 * @returns {{ file: string, line: number, text: string }[]} empty when the
 *   query has no terms, no files exist, or ripgrep produced no output
 */
function _searchRg(query, projectRoot, limit) {
  const terms = query.split(/\s+/).filter(Boolean);
  if (!terms.length) return [];
  const targets = _collectFiles(projectRoot);
  if (!targets.length) return [];

  const pattern = terms.map(_escapeRe).join('|');
  const needles = terms.map((t) => t.toLowerCase());
  const args = [
    '-n',
    '--no-heading',
    // With a single target rg omits the path unless asked, which would break
    // the `file:line:text` parsing below.
    '--with-filename',
    // -S (smart case) is deliberately not passed: given after -i it takes
    // precedence and makes queries containing uppercase letters case-sensitive.
    '-i',
    // -e and -- keep a term or path that starts with "-" from being read as a flag.
    '-e', pattern,
    '--',
    ...targets,
  ];
  const r = spawnSync('rg', args, {
    encoding: 'utf8',
    windowsHide: true,
    maxBuffer: 32 * 1024 * 1024, // any-term output can exceed the 1 MiB default
  });

  const results = [];
  for (const line of (r.stdout || '').split(/\r?\n/)) {
    if (results.length >= limit) break; // checked first so limit 0 yields nothing
    const m = line.match(/^(.+?):(\d+):(.*)$/);
    if (!m) continue;
    const haystack = m[3].toLowerCase();
    if (!needles.every((n) => haystack.includes(n))) continue;
    results.push({ file: m[1], line: Number(m[2]), text: m[3].trim() });
  }
  return results;
}
|
|
165
|
+
|
|
166
|
+
// ---------------------------------------------------------------------------
|
|
167
|
+
// Node fs fallback backend
|
|
168
|
+
// ---------------------------------------------------------------------------
|
|
169
|
+
|
|
170
|
+
/**
 * Universal fallback: reads each collected file and scans it line by line.
 *
 * A line matches when, compared case-insensitively, it contains every
 * whitespace-separated term of the query. Files that cannot be read are
 * skipped. Scanning stops as soon as `limit` results have been gathered.
 *
 * @param {string} query whitespace-separated search terms
 * @param {string} projectRoot project directory that holds `.design/`
 * @param {number} limit maximum number of results to return
 * @returns {{ file: string, line: number, text: string }[]} 1-based line
 *   numbers and trimmed line text; empty for a query without terms or a
 *   limit below 1
 */
function _searchNode(query, projectRoot, limit) {
  const terms = query.toLowerCase().split(/\s+/).filter(Boolean);
  // [].every() is vacuously true, so without this guard an empty query would
  // "match" every line; a limit below 1 must not return a first hit either.
  if (!terms.length || !(limit >= 1)) return [];

  const files = _collectFiles(projectRoot);
  const results = [];
  for (const file of files) {
    let content;
    try {
      content = fs.readFileSync(file, 'utf8');
    } catch {
      continue; // unreadable file: skip it, keep scanning the rest
    }
    const lines = content.split(/\r?\n/);
    for (let i = 0; i < lines.length; i++) {
      const lower = lines[i].toLowerCase();
      if (terms.every((t) => lower.includes(t))) {
        results.push({ file, line: i + 1, text: lines[i].trim() });
        if (results.length >= limit) return results;
      }
    }
  }
  return results;
}
|
|
188
|
+
|
|
189
|
+
// ---------------------------------------------------------------------------
|
|
190
|
+
// Public search
|
|
191
|
+
// ---------------------------------------------------------------------------
|
|
192
|
+
|
|
193
|
+
/**
 * Searches the project's design history with the best available backend:
 * FTS5 when supported, otherwise ripgrep, otherwise a Node line scan.
 *
 * A query that is empty or only whitespace, or a limit below 1, returns an
 * empty list without invoking any backend, so the result does not depend on
 * which backend happens to be active.
 *
 * @param {string} query
 * @param {string} projectRoot absolute path to the project (contains .design/)
 * @param {{ limit?: number }} [opts] `limit` defaults to 20
 * @returns {{ file: string, line: number, text: string }[]}
 * @throws {TypeError} when `query` is not a string
 */
function search(query, projectRoot, opts = {}) {
  if (typeof query !== 'string') {
    throw new TypeError('search: query must be a string');
  }
  // `opts?.` tolerates an explicit null, which the `= {}` default does not cover.
  const limit = opts?.limit ?? 20;
  if (!query.trim() || !(limit >= 1)) return [];
  if (_fts5Supported) return _searchFts5(query, projectRoot, limit);
  if (_hasRg) return _searchRg(query, projectRoot, limit);
  return _searchNode(query, projectRoot, limit);
}
|
|
205
|
+
|
|
206
|
+
module.exports = { search, reindex, backendName };
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
/**
|
|
3
|
+
* probe-optional.cjs — safely require optional native dependencies.
|
|
4
|
+
*
|
|
5
|
+
* Usage:
|
|
6
|
+
* const { probeOptional } = require('./probe-optional.cjs');
|
|
7
|
+
* const Database = probeOptional('better-sqlite3');
|
|
8
|
+
* if (Database) { ... } else { // fallback }
|
|
9
|
+
*
|
|
10
|
+
* Returns the module if available and natively compatible, null otherwise.
|
|
11
|
+
* Swallows MODULE_NOT_FOUND and native binding errors silently — callers
|
|
12
|
+
* must implement their own fallback path.
|
|
13
|
+
*/
|
|
14
|
+
/**
 * Requires an optional dependency, returning null when it is unavailable.
 *
 * "Unavailable" covers a module that is not installed and a native addon
 * that cannot be loaded: built for a different Node.js ABI, rejected by
 * process.dlopen(), or missing its compiled bindings file. Any other error
 * thrown while loading the module is rethrown unchanged.
 *
 * @param {string} name module specifier passed to require()
 * @returns {*} the module's exports, or null when it is unavailable
 */
function probeOptional(name) {
  try {
    return require(name);
  } catch (e) {
    const message = typeof e?.message === 'string' ? e.message : '';
    const unavailable =
      e?.code === 'MODULE_NOT_FOUND' ||
      // Code Node.js sets when process.dlopen() fails on a native addon.
      e?.code === 'ERR_DLOPEN_FAILED' ||
      message.includes('was compiled against a different Node.js version') ||
      message.includes('NODE_MODULE_VERSION') ||
      // Message raised by the `bindings` loader when no compiled .node file exists.
      message.includes('Could not locate the bindings file');
    if (unavailable) {
      return null;
    }
    throw e;
  }
}
|
|
28
|
+
|
|
29
|
+
module.exports = { probeOptional };
|