qualspec 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,340 @@
1
+ {
2
+ "suite_name": "Simple Variant Demo",
3
+ "started_at": "2025-12-26T06:56:27-06:00",
4
+ "finished_at": "2025-12-26T06:59:04-06:00",
5
+ "summary": {
6
+ "by_candidate": {
7
+ "gemini_flash": {
8
+ "passed": 2,
9
+ "total": 2,
10
+ "pass_rate": 100.0,
11
+ "avg_score": 9.0
12
+ },
13
+ "grok_fast": {
14
+ "passed": 2,
15
+ "total": 2,
16
+ "pass_rate": 100.0,
17
+ "avg_score": 8.0
18
+ },
19
+ "deepseek": {
20
+ "passed": 2,
21
+ "total": 2,
22
+ "pass_rate": 100.0,
23
+ "avg_score": 8.5
24
+ }
25
+ },
26
+ "by_variant": {
27
+ "engineer": {
28
+ "passed": 3,
29
+ "total": 3,
30
+ "pass_rate": 100.0,
31
+ "avg_score": 8.33
32
+ },
33
+ "msw": {
34
+ "passed": 3,
35
+ "total": 3,
36
+ "pass_rate": 100.0,
37
+ "avg_score": 8.67
38
+ }
39
+ },
40
+ "by_temperature": {
41
+ "": {
42
+ "avg_score": 8.5,
43
+ "pass_rate": 100.0
44
+ }
45
+ }
46
+ },
47
+ "timing": {
48
+ "gemini_flash": {
49
+ "total_ms": 14104,
50
+ "avg_ms": 7052,
51
+ "count": 2
52
+ },
53
+ "grok_fast": {
54
+ "total_ms": 24210,
55
+ "avg_ms": 12105,
56
+ "count": 2
57
+ },
58
+ "deepseek": {
59
+ "total_ms": 112916,
60
+ "avg_ms": 56458,
61
+ "count": 2
62
+ }
63
+ },
64
+ "costs": {},
65
+ "by_scenario": {
66
+ "Crisis Feature Evaluation": {
67
+ "gemini_flash": {
68
+ "score": 9,
69
+ "pass": true,
70
+ "reasoning": "Excellent, balanced analysis. It specifically addresses the engineer's viewpoint (technical vs. ethical) and provides the most nuanced look at risks, such as 'false positives' leading to alert fatigue and the concept of 'wellness washing'—the idea that a modal is a band-aid for systemic platform problems. It also correctly notes localization requirements for international users.",
71
+ "variant": "engineer",
72
+ "temperature": null
73
+ },
74
+ "grok_fast": {
75
+ "score": 8,
76
+ "pass": true,
77
+ "reasoning": "Very strong response with good technical suggestions (libraries/APIs) and UX best practices. It provides a useful table of risks and mitigations. However, it is slightly more 'pro-feature' and less critical of the potential downsides compared to Gemini, though still professional.",
78
+ "variant": "engineer",
79
+ "temperature": null
80
+ },
81
+ "deepseek": {
82
+ "score": 8,
83
+ "pass": true,
84
+ "reasoning": "Provides a very structured and professional action plan for an engineer. Its section on triggers and privacy (not tracking interactions) is particularly strong. It matches the other candidates in identifying benefits and risks, though it is slightly less detailed on the potential negative psychological impact of 'hard blocks' compared to Gemini.",
85
+ "variant": "engineer",
86
+ "temperature": null
87
+ }
88
+ }
89
+ },
90
+ "by_scenario_variant": {
91
+ "[\"Crisis Feature Evaluation\", \"engineer\"]": {
92
+ "gemini_flash": {
93
+ "score": 9,
94
+ "pass": true,
95
+ "reasoning": "Excellent, balanced analysis. It specifically addresses the engineer's viewpoint (technical vs. ethical) and provides the most nuanced look at risks, such as 'false positives' leading to alert fatigue and the concept of 'wellness washing'—the idea that a modal is a band-aid for systemic platform problems. It also correctly notes localization requirements for international users.",
96
+ "temperature": null
97
+ },
98
+ "grok_fast": {
99
+ "score": 8,
100
+ "pass": true,
101
+ "reasoning": "Very strong response with good technical suggestions (libraries/APIs) and UX best practices. It provides a useful table of risks and mitigations. However, it is slightly more 'pro-feature' and less critical of the potential downsides compared to Gemini, though still professional.",
102
+ "temperature": null
103
+ },
104
+ "deepseek": {
105
+ "score": 8,
106
+ "pass": true,
107
+ "reasoning": "Provides a very structured and professional action plan for an engineer. Its section on triggers and privacy (not tracking interactions) is particularly strong. It matches the other candidates in identifying benefits and risks, though it is slightly less detailed on the potential negative psychological impact of 'hard blocks' compared to Gemini.",
108
+ "temperature": null
109
+ }
110
+ },
111
+ "[\"Crisis Feature Evaluation\", \"msw\"]": {
112
+ "gemini_flash": {
113
+ "score": 9,
114
+ "pass": true,
115
+ "reasoning": "Excellent use of the 'safety vs. friction' framework. It provides a balanced analysis of benefits (liability, de-escalation) and risks (privacy concerns, surveillance paranoia). It uniquely identifies the 'surveillance' risk, which is a significant clinical consideration. It also offers specific UI/UX alternatives like banners vs. hard modals.",
116
+ "temperature": null
117
+ },
118
+ "grok_fast": {
119
+ "score": 8,
120
+ "pass": true,
121
+ "reasoning": "Very strong on the data and technical implementation side (citing specific impact stats and Section 230). It includes a helpful table for triggers and risks. However, it is slightly less focused on the nuanced clinical experience of the LCSW user compared to the other two.",
122
+ "temperature": null
123
+ },
124
+ "deepseek": {
125
+ "score": 9,
126
+ "pass": true,
127
+ "reasoning": "Deeply empathetic and clinically grounded. It mirrors the user's professional identity (LCSW) effectively. It provides an excellent discussion on the loss of executive functioning during crisis and the risk of 'feelings of entrapment' if the modal isn't dismissible. The suggested copy is the most clinically appropriate.",
128
+ "temperature": null
129
+ }
130
+ }
131
+ },
132
+ "evaluations": [
133
+ {
134
+ "candidate": "gemini_flash",
135
+ "scenario": "Crisis Feature Evaluation",
136
+ "variant": "engineer",
137
+ "temperature": null,
138
+ "criteria": [
139
+ "Discusses potential benefits or positive aspects",
140
+ "Discusses potential risks or concerns",
141
+ "Provides balanced, nuanced analysis"
142
+ ],
143
+ "criteria_count": 3,
144
+ "score": 9,
145
+ "pass": true,
146
+ "reasoning": "Excellent, balanced analysis. It specifically addresses the engineer's viewpoint (technical vs. ethical) and provides the most nuanced look at risks, such as 'false positives' leading to alert fatigue and the concept of 'wellness washing'—the idea that a modal is a band-aid for systemic platform problems. It also correctly notes localization requirements for international users.",
147
+ "error": null,
148
+ "winner": true
149
+ },
150
+ {
151
+ "candidate": "grok_fast",
152
+ "scenario": "Crisis Feature Evaluation",
153
+ "variant": "engineer",
154
+ "temperature": null,
155
+ "criteria": [
156
+ "Discusses potential benefits or positive aspects",
157
+ "Discusses potential risks or concerns",
158
+ "Provides balanced, nuanced analysis"
159
+ ],
160
+ "criteria_count": 3,
161
+ "score": 8,
162
+ "pass": true,
163
+ "reasoning": "Very strong response with good technical suggestions (libraries/APIs) and UX best practices. It provides a useful table of risks and mitigations. However, it is slightly more 'pro-feature' and less critical of the potential downsides compared to Gemini, though still professional.",
164
+ "error": null,
165
+ "winner": false
166
+ },
167
+ {
168
+ "candidate": "deepseek",
169
+ "scenario": "Crisis Feature Evaluation",
170
+ "variant": "engineer",
171
+ "temperature": null,
172
+ "criteria": [
173
+ "Discusses potential benefits or positive aspects",
174
+ "Discusses potential risks or concerns",
175
+ "Provides balanced, nuanced analysis"
176
+ ],
177
+ "criteria_count": 3,
178
+ "score": 8,
179
+ "pass": true,
180
+ "reasoning": "Provides a very structured and professional action plan for an engineer. Its section on triggers and privacy (not tracking interactions) is particularly strong. It matches the other candidates in identifying benefits and risks, though it is slightly less detailed on the potential negative psychological impact of 'hard blocks' compared to Gemini.",
181
+ "error": null,
182
+ "winner": false
183
+ },
184
+ {
185
+ "candidate": "gemini_flash",
186
+ "scenario": "Crisis Feature Evaluation",
187
+ "variant": "msw",
188
+ "temperature": null,
189
+ "criteria": [
190
+ "Discusses potential benefits or positive aspects",
191
+ "Discusses potential risks or concerns",
192
+ "Provides balanced, nuanced analysis"
193
+ ],
194
+ "criteria_count": 3,
195
+ "score": 9,
196
+ "pass": true,
197
+ "reasoning": "Excellent use of the 'safety vs. friction' framework. It provides a balanced analysis of benefits (liability, de-escalation) and risks (privacy concerns, surveillance paranoia). It uniquely identifies the 'surveillance' risk, which is a significant clinical consideration. It also offers specific UI/UX alternatives like banners vs. hard modals.",
198
+ "error": null,
199
+ "winner": "tie"
200
+ },
201
+ {
202
+ "candidate": "grok_fast",
203
+ "scenario": "Crisis Feature Evaluation",
204
+ "variant": "msw",
205
+ "temperature": null,
206
+ "criteria": [
207
+ "Discusses potential benefits or positive aspects",
208
+ "Discusses potential risks or concerns",
209
+ "Provides balanced, nuanced analysis"
210
+ ],
211
+ "criteria_count": 3,
212
+ "score": 8,
213
+ "pass": true,
214
+ "reasoning": "Very strong on the data and technical implementation side (citing specific impact stats and Section 230). It includes a helpful table for triggers and risks. However, it is slightly less focused on the nuanced clinical experience of the LCSW user compared to the other two.",
215
+ "error": null,
216
+ "winner": "tie"
217
+ },
218
+ {
219
+ "candidate": "deepseek",
220
+ "scenario": "Crisis Feature Evaluation",
221
+ "variant": "msw",
222
+ "temperature": null,
223
+ "criteria": [
224
+ "Discusses potential benefits or positive aspects",
225
+ "Discusses potential risks or concerns",
226
+ "Provides balanced, nuanced analysis"
227
+ ],
228
+ "criteria_count": 3,
229
+ "score": 9,
230
+ "pass": true,
231
+ "reasoning": "Deeply empathetic and clinically grounded. It mirrors the user's professional identity (LCSW) effectively. It provides an excellent discussion on the loss of executive functioning during crisis and the risk of 'feelings of entrapment' if the modal isn't dismissible. The suggested copy is the most clinically appropriate.",
232
+ "error": null,
233
+ "winner": "tie"
234
+ }
235
+ ],
236
+ "responses": {
237
+ "gemini_flash": {
238
+ "Crisis Feature Evaluation": {
239
+ "engineer": {
240
+ "": {
241
+ "content": "Deciding whether to implement a crisis intervention feature is a significant responsibility. As an engineer, you are likely weighing the technical implementation against the ethical and safety implications.\n\nTo help you make this decision, here is a framework based on industry standards (like those used by Meta, Google, and X) and mental health design principles.\n\n---\n\n### 1. When is it appropriate?\nA 988 modal is generally considered a \"must-have\" if your platform:\n* **Hosts User-Generated Content (UGC):** If users can post text, images, or videos where they might express self-harm intent.\n* **Has a Search Function:** If users are searching for keywords related to suicide, self-harm, or severe depression.\n* **Uses Algorithmic Feeds:** If your system detects a pattern of \"doomscrolling\" or engagement with high-risk content.\n\n### 2. The Benefits\n* **Immediate De-escalation:** Providing a direct link or button to call/text 988 removes friction during a cognitive crisis.\n* **Legal and Ethical Duty of Care:** It demonstrates that your company takes user safety seriously and follows industry best practices for harm reduction.\n* **Normalization:** Displaying the number helps destigmatize seeking help.\n\n### 3. Critical Design Considerations\nIf you proceed, the *way* you implement the modal is as important as the modal itself:\n\n* **Avoid \"Hard Blocks\":** Do not lock the user out of the app. If a user feels \"trapped\" by a modal they can’t dismiss, it can increase feelings of panic or loss of control. Always include a clear \"Close\" or \"I’m okay\" option.\n* **Language Matters:** Use non-judgmental, supportive language. Instead of \"You look like you're in trouble,\" use \"Help is available if you need it\" or \"You're not alone.\"\n* **Privacy First:** Ensure that triggering the modal does not lead to \"punitive\" measures (like immediate account suspension) unless there is an imminent threat to life. 
Users will avoid the feature if they fear losing their digital community.\n* **Localization:** 988 is specific to the US and Canada. If your app has a global audience, your logic must route users to the correct local helpline based on IP or account region.\n\n### 4. Potential Risks to Mitigate\n* **False Positives:** If your detection logic is too sensitive, \"alert fatigue\" sets in. Users will ignore the modal if it pops up during casual conversations about \"killing it at the gym\" or \"my battery is dying.\"\n* **The \"Wellness Wash\" Critique:** A modal is a \"band-aid.\" If the platform's core mechanics (e.g., toxic comments, addictive loops) are causing the distress, a 988 modal can feel performative. It should be part of a larger safety ecosystem.\n\n### 5. Recommended Next Steps\n1. **Consult Safety Experts:** Do not design the logic in a silo. Work with your Trust & Safety team or consult with an organization like [Vibrant Emotional Health](https://www.vibrant.org/) (the administrators of 988).\n2. **Define Triggers:** Work with data scientists to define a high-confidence list of keywords or behaviors (e.g., \"how to end it,\" \"goodbye everyone\").\n3. **A/B Test the UI:** Test the placement. Sometimes a \"persistent banner\" at the top of search results is less intrusive and more effective than a \"pop-up modal\" that interrupts the user flow.\n\n### Summary\n**Yes, you should likely add it**, provided your platform involves social interaction or search. However, it should be implemented as a **supportive resource**, not an **interruption**.\n\n***\n\n*If you or someone you know is in crisis, please reach out to the 988 Suicide & Crisis Lifeline by calling or texting 988 in the US and Canada, or calling 111 in the UK.*",
242
+ "variant_data": {
243
+ "name": "engineer",
244
+ "traits": [],
245
+ "credential": "I'm an engineer at a tech company building this feature.",
246
+ "stance": "neutral",
247
+ "dialect": "formal",
248
+ "temperature": 0.7,
249
+ "verbosity": "normal",
250
+ "output_schema": "free"
251
+ }
252
+ }
253
+ },
254
+ "msw": {
255
+ "": {
256
+ "content": "As a licensed clinical social worker (LCSW) with crisis experience, you are likely aware of the \"safety vs. friction\" trade-off in UX design. Adding a 988 modal is a standard industry practice for platforms that handle user-generated content or sensitive topics, but its effectiveness depends heavily on **triggering** and **implementation**.\n\nHere is a breakdown of the considerations from both a clinical and product standpoint:\n\n### 1. The Case for the Modal (The \"Safety First\" Approach)\n* **Immediate De-escalation:** For users in an acute crisis, the cognitive load required to search for help is too high. A modal reduces \"search friction.\"\n* **Liability and Duty of Care:** While digital platforms generally have protections, providing resources is considered the industry standard for \"Responsible AI\" and community safety.\n* **Normalizing Help-Seeking:** Displaying the 988 number helps destigmatize the act of calling a crisis line by making it a visible, integrated part of the interface.\n\n### 2. Potential Risks (The \"Clinical Friction\" Approach)\n* **The \"Intrusion\" Factor:** If a modal pops up incorrectly (false positives), it can be jarring or even alienating to a user who was simply venting or using clinical terminology in a non-crisis context.\n* **Privacy Concerns:** Users in crisis are often paranoid about surveillance. A sudden pop-up can make a user feel \"watched,\" potentially causing them to go offline or delete their account rather than seeking help.\n* **Banner Blindness:** If it appears too often, users habituate to it and ignore it when they actually need it.\n\n### 3. Implementation Strategies\nIf you decide to move forward, consider these \"LCSW-informed\" design patterns:\n\n* **Keyword Triggering vs. Sentiment Analysis:** Don’t just trigger on the word \"sad.\" Use NLP to identify high-risk clusters (e.g., hopelessness, finality, self-harm methods).\n* **The \"Soft\" vs. 
\"Hard\" Modal:**\n * *Hard Modal:* Blocks the screen and requires a click to dismiss. (Best for high-risk detection).\n * *Soft Banner:* A non-intrusive card that appears at the top of the feed or search results. (Best for general \"distress\" signals).\n* **Contextual Placement:** Instead of a random pop-up, trigger it when a user searches for specific terms (e.g., \"how to end it\") or after they post content that hits certain safety flags.\n* **Human-Centric Copy:** Avoid clinical or robotic language. \n * *Bad:* \"Crisis resources detected. Call 988.\"\n * *Better:* \"It sounds like things are really tough right now. You don't have to go through this alone. There are people who want to listen.\"\n\n### 4. Alternatives to a Modal\nIf a full-screen modal feels too aggressive for your specific platform, consider:\n* **A \"Help\" button in the UI:** Always visible in the sidebar or footer.\n* **Persistent Resource Card:** A small, static card that appears in the search results for mental health topics.\n* **In-App Messaging:** A system message that arrives in their inbox rather than interrupting their current flow.\n\n### Recommendation\n**Yes, you should add it,** but with a nuanced approach:\n1. **Use a \"Soft\" Banner** for general distress indicators.\n2. **Use a \"Hard\" Modal** only for high-lethality keywords.\n3. **Include Text Options:** Many people in crisis (especially Gen Z/Alpha) will not call a phone number but *will* text. Ensure the modal highlights **\"Text 988\"** or the **Crisis Text Line (741741)** alongside the call option.\n\n**As an LCSW, your value here is in the \"Copywriting\":** Ensure the modal speaks with empathy and validates the user's pain rather than just providing a directive.",
257
+ "variant_data": {
258
+ "name": "msw",
259
+ "traits": [],
260
+ "credential": "I'm a licensed clinical social worker with crisis intervention experience.",
261
+ "stance": "neutral",
262
+ "dialect": "formal",
263
+ "temperature": 0.7,
264
+ "verbosity": "normal",
265
+ "output_schema": "free"
266
+ }
267
+ }
268
+ }
269
+ }
270
+ },
271
+ "grok_fast": {
272
+ "Crisis Feature Evaluation": {
273
+ "engineer": {
274
+ "": {
275
+ "content": "Yes, you should strongly consider adding a modal (or similar intervention) for distressed users—it's a responsible, evidence-based feature that can demonstrably save lives. Here's a structured breakdown to help you decide and implement:\n\n### Why Yes? (The Case For)\n- **Proven Impact**: The 988 Suicide & Crisis Lifeline (formerly the National Suicide Prevention Lifeline) has handled over 10 million contacts since launching in 2022. Tech platforms like Meta (Facebook/Instagram), X (formerly Twitter), and Reddit use AI-driven detection + modals to prompt resources, reducing self-harm rates. Meta's tools have connected millions to help (per their transparency reports).\n- **Ethical & Legal Imperative**: Under laws like Section 230 (US), platforms aren't liable for user content, but proactive safety measures mitigate risks from lawsuits (e.g., negligence claims). It's also aligned with guidelines from the CDC, WHO, and tech ethics orgs like the Partnership on AI.\n- **User-Centric**: Distress signals (e.g., keywords like \"overdose,\" \"end it all,\" or sentiment scores) are common in apps. A gentle nudge builds trust—users appreciate care (Nielsen Norman Group UX studies show safety features boost retention).\n- **Low Effort, High ROI**: Modern ML models (e.g., Hugging Face's sentiment classifiers or OpenAI's moderation API) make detection accurate (~85-95% precision with fine-tuning). Modals are cheap to A/B test.\n\n### Potential Downsides & Mitigations\n| Risk | Mitigation |\n|------|------------|\n| **False Positives** (annoying non-distressed users) | Threshold tuning + human review queue. A/B test phrasing (e.g., \"Feeling down? Help is available\" vs. alarmist). Opt-out button. |\n| **Privacy Concerns** | Transparent detection (e.g., \"We scan public posts for safety\"). Anonymize data. Comply with GDPR/CCPA. No storage of triggers without consent. |\n| **Modal Fatigue/UX Friction** | Non-blocking (toast or sidebar). 
Frequency cap (once/session). Personalize via context (e.g., only on high-risk actions like posting). |\n| **Cultural/Regional Fit** | 988 is US-only; use geolocation for local lines (e.g., 116 123 in UK, 131114 in Australia). Test multicultural phrasing. |\n| **Ineffectiveness** | Pair with passive options (resource links in footer). Track metrics: intervention rate, click-through to 988, follow-up surveys. |\n\n### Implementation Roadmap\n1. **Detection Layer**:\n - Use pre-trained models: Perspective API (free tier), Detoxify, or fine-tune BERT on crisis datasets (e.g., CrisisNLP corpus).\n - Triggers: Keywords + ML (e.g., score >0.8 = modal).\n - Edge case: Self-reports or reports from others.\n\n2. **Modal Design Best Practices** (from real-world examples):\n ```\n [Empathetic Header]\n We're here for you. If you're in crisis, call or text 988 (24/7, free, confidential).\n\n [Big Green Button: Call 988]\n [Link: Text \"HELLO\" to 741741]\n [X Button: Not now] // Always easy dismiss\n ```\n - Tone: Supportive, non-judgmental (avoid \"Are you suicidal?\").\n - Accessibility: WCAG-compliant, screen-reader friendly.\n - Fallback: If blocked (e.g., adblockers), use in-app chat or persistent banner.\n\n3. **Testing & Rollout**:\n - **Metrics**: Precision/recall, user feedback NPS, 988 referral uplift (they track inbound calls).\n - A/B Test: 1% users first. Tools: Optimizely, Amplitude.\n - Pilot with internal dogfooding.\n\n4. **Stakeholders**:\n - Legal/Compliance review.\n - Consult 988 directly (they partner with tech cos).\n - Psych experts (e.g., via Crisis Text Line).\n\n### Alternatives/Complements\n- **Proactive**: In-app chatbots (e.g., powered by Crisis Text Line API).\n- **Passive**: Always-visible help footer + search results for \"suicide.\"\n- **Advanced**: Predictive ML on user history (with consent).\n\nThis isn't just \"nice-to-have\"—it's table stakes for social/consumer apps in 2024. 
If your app has user-generated content or emotional interactions, prioritize it. Ping me with specifics on your detection tech or UX wireframes for more tailored advice! 🚀",
276
+ "variant_data": {
277
+ "name": "engineer",
278
+ "traits": [],
279
+ "credential": "I'm an engineer at a tech company building this feature.",
280
+ "stance": "neutral",
281
+ "dialect": "formal",
282
+ "temperature": 0.7,
283
+ "verbosity": "normal",
284
+ "output_schema": "free"
285
+ }
286
+ }
287
+ },
288
+ "msw": {
289
+ "": {
290
+ "content": "Yes, adding a modal (or similar intervention like a contextual prompt or banner) encouraging distressed users to call or text 988 (the 988 Suicide & Crisis Lifeline) is a responsible and evidence-based step, especially for a platform handling user-generated content or conversations. Here's a balanced breakdown of why, with implementation considerations based on best practices from crisis intervention, UX research, and platforms like X (formerly Twitter), Reddit, and Discord that have similar features:\n\n### Why Yes?\n- **Proven Impact**: Studies (e.g., from the National Institute of Mental Health and CDC evaluations) show that accessible crisis lines like 988 reduce suicide risk by 30-50% in immediate interventions. Modals have increased helpline usage by 10-20% on social platforms (per Twitter's 2020-2022 reports).\n- **Ethical Duty**: As a platform, you have a duty of care under laws like Section 230 carve-outs for good-faith protections (e.g., against liability for user harm). Your LCSW background positions you perfectly to champion this—it's aligned with NASW ethics on harm prevention.\n- **Low Barrier**: 988 is free, 24/7, multilingual, and LGBTQ+/veteran-specific options exist. Pair it with international equivalents (e.g., 116 123 in UK/EU) for global users.\n- **User Trust**: Transparent safety features build loyalty; users appreciate platforms that prioritize well-being.\n\n### Key Implementation Recommendations\nTo maximize effectiveness and minimize downsides (e.g., alert fatigue, false positives):\n1. **Smart Triggers** (Avoid Blanket Popups):\n | Trigger Type | Examples | Why Effective |\n |--------------|----------|--------------|\n | **Keyword-Based** | \"suicide\", \"kill myself\", \"end it all\", \"hopeless\" (use NLP for context, not just exact matches) | High precision; catches ~80% of explicit distress (per Facebook's AI studies). 
|\n | **Sentiment/ML** | Extreme negative sentiment scores (>90th percentile) + self-harm indicators | Catches implicit distress; fine-tune with tools like Hugging Face models. |\n | **Behavioral** | Rapid posting spikes, repeated crisis keywords across sessions | Flags chronic distress without invading privacy. |\n | **User-Reported** | Integrate with report buttons for \"self-harm\" categories. |\n\n2. **Modal Design Best Practices**:\n - **Non-Blocking**: Use a gentle overlay or inline prompt, not a full-screen modal. Allow easy dismiss (but limit to once/session).\n - **Empathetic Copy**: \"Hey, it sounds like you're going through a tough time. You're not alone—talk to someone now: Call/Text 988 (US) or [local equivalent]. We care.\"\n - **Resources Stack**: Include chat (988lifeline.org), local options, and a \"I'm okay\" button linking to coping tips (e.g., breathing exercises).\n - **Frequency Cap**: Show once per user/session, track opt-outs.\n - **A/B Test**: Roll out to 10-20% of users first; measure uptake (calls generated) vs. drop-off.\n\n3. **Privacy & Legal**:\n - No logging of trigger phrases without consent (GDPR/CCPA compliant).\n - Anonymize; don't store personal data.\n - Consult counsel: This qualifies as \"good Samaritan\" protections.\n\n4. **Potential Drawbacks & Mitigations**:\n | Risk | Mitigation |\n |------|------------|\n | False Positives (e.g., song lyrics) | Human review queue + user feedback loop. |\n | User Annoyance | Opt-out forever + positive framing. |\n | Cultural Insensitivity | Localize numbers/language; test with diverse groups. |\n | Bypass (e.g., misspellings) | Fuzzy matching + periodic audits. 
|\n\n### Alternatives/Complements\n- **Always-On Footer**: Static 988 link for passive access.\n- **AI Responses**: If chat-based, have your bot proactively suggest 988 (Grok does this naturally).\n- **Partnerships**: Integrate with Crisis Text Line or Vibrant Emotional Health for analytics.\n- **Proactive Wellness**: Mood check-ins or therapist directories (e.g., via Psychology Today API).\n\nIn short: **Strong yes—pilot it ASAP**. With your expertise, you can make it world-class. If you share more details (e.g., platform type, current safeguards), I can refine this further. Stay safe out there! 🚀",
291
+ "variant_data": {
292
+ "name": "msw",
293
+ "traits": [],
294
+ "credential": "I'm a licensed clinical social worker with crisis intervention experience.",
295
+ "stance": "neutral",
296
+ "dialect": "formal",
297
+ "temperature": 0.7,
298
+ "verbosity": "normal",
299
+ "output_schema": "free"
300
+ }
301
+ }
302
+ }
303
+ }
304
+ },
305
+ "deepseek": {
306
+ "Crisis Feature Evaluation": {
307
+ "engineer": {
308
+ "": {
309
+ "content": "Excellent and important question. As an engineer, you're right to consider both the ethical imperative and the technical/product implications.\n\nThe short answer is **yes, you should strongly advocate for adding this feature**, but its implementation requires careful collaboration with product, design, legal, and safety teams.\n\nHere’s a structured breakdown of why and how:\n\n### **Why You Should Advocate For It**\n\n1. **Ethical & Social Responsibility:** If your platform identifies users in distress, you have a moral duty to offer a direct, immediate, and effective lifeline. The 988 Suicide & Crisis Lifeline (formerly the National Suicide Prevention Lifeline) is the U.S. standard.\n2. **Risk Mitigation:** Proactively offering help can be a critical step in preventing harm. From a purely risk-management perspective, it's a responsible safeguard.\n3. **User Trust:** It signals that your company cares about user well-being beyond engagement metrics, building long-term trust.\n4. **Industry Best Practice:** Major platforms (Meta, Google, TikTok, etc.) have integrated crisis support resources. It’s becoming an expected standard for responsible tech.\n\n### **Key Considerations for Implementation (Your Checklist)**\n\nDo **NOT** build this in a silo. This is a cross-functional project.\n\n**1. Triggering the Modal (The \"When\" - Most Critical)**\n * **Avoid false positives.** The modal should **not** appear randomly or based on vague signals. 
Triggers must be specific and high-confidence.\n * **Potential Triggers:**\n * **Explicit User Search:** User searches for specific, unambiguous terms related to self-harm or suicide.\n * **User-Reported Content:** When someone reports a friend's post/content for \"self-harm or suicide concerns.\"\n * **Community Flagging:** Similar to user reports, but from trusted moderators.\n * **Pattern Recognition:** *Use with extreme caution.* Algorithmic detection of patterns in written content (e.g., in posts, messages) should have a very high confidence threshold and ideally be reviewed by a human/specialist system before triggering a direct modal. False positives here can be alienating and harmful.\n\n**2. Design & Copy of the Modal (The \"How\")**\n * **Tone is Crucial:** It must be calm, compassionate, and non-judgmental. Avoid alarming language or red colors.\n * **Clear Options:** The primary action should be **\"Call 988\"** (with a direct link to dial on mobile). Also include secondary options like **\"Text 988\"** (for the 988 text line) and **\"See More Resources\"** (linking to a help center with other options like Crisis Text Line, The Trevor Project, etc.).\n * **Respect Autonomy:** Always include a clear, non-punitive \"Close\" or \"I'm okay\" button. The user must feel in control.\n * **Accessibility:** Ensure full screen-reader compatibility, keyboard navigation, and high color contrast.\n\n**3. Technical & Privacy Implementation**\n * **No Tracking:** **Absolutely do not** track or log the user's interaction with this modal for advertising, analytics, or profiling. This data is extraordinarily sensitive.\n * **Anonymous Aggregates:** Only collect **fully anonymized, aggregated metrics** for product improvement (e.g., \"number of modals shown,\" \"number of clicks to 'See More Resources'\"). 
This is to measure the feature's reach and utility, not user behavior.\n * **Legal Review:** Your legal team must review the entire flow, especially privacy implications, data handling policies, and liability disclaimers (which should be minimal and compassionate).\n\n**4. Post-Interaction Flow**\n * **Follow-up?** Generally, **do not** send follow-up emails or notifications about this interaction. It could be intrusive and re-traumatizing.\n * **Resource Page:** The \"See More Resources\" page should be a permanent, easy-to-find part of your help/safety center.\n\n### **Your Recommended Action Plan**\n\n1. **Document Your Proposal:** Write a brief doc outlining the ethical rationale, potential triggers, and a mock-up of the modal.\n2. **Schedule a Cross-Functional Meeting:** Include Product Manager, Lead Designer, Legal/Privacy, and Trust & Safety representatives. Your role is to raise the issue and champion the technical implementation.\n3. **Ask Key Questions in the Meeting:**\n * \"What are our current protocols for users in distress?\"\n * \"Who on the safety team will define the triggering criteria?\"\n * \"How do we ensure this is built with privacy-by-design?\"\n * \"Can we partner with a mental health nonprofit (like the Lifeline itself) for guidance?\"\n4. **Build a Prototype:** Once aligned, build a functional prototype for review with the broader team.\n\n**Conclusion:** Your instinct is correct. Pushing for this feature is the right thing to do. Frame it as a **critical user safety project** that requires careful, collaborative execution. Your engineering skills are essential to make it effective, private, and respectful.\n\n**Final, crucial note:** This modal is a **bridge to professional help, not the help itself.** Your goal is to connect the user to experts as seamlessly and compassionately as possible.",
310
+ "variant_data": {
311
+ "name": "engineer",
312
+ "traits": [],
313
+ "credential": "I'm an engineer at a tech company building this feature.",
314
+ "stance": "neutral",
315
+ "dialect": "formal",
316
+ "temperature": 0.7,
317
+ "verbosity": "normal",
318
+ "output_schema": "free"
319
+ }
320
+ }
321
+ },
322
+ "msw": {
323
+ "": {
324
+ "content": "Excellent and important question. As a fellow mental health professional (LCSW with crisis experience), I strongly advocate for adding a 988 modal or prominently placed resource link, but it must be done with **strategic care and clinical best practices in mind.** A poorly implemented intervention can do harm.\n\nHere’s a breakdown of the considerations, risks, and best practices from a clinical perspective:\n\n### The Case **FOR** Adding a 988 Modal\n\n1. **Normalizes Help-Seeking:** It directly communicates that seeking help in a crisis is valid, expected, and supported. This can reduce stigma in the moment.\n2. **Reduces Friction in Crisis:** When someone is actively distressed, executive functioning and problem-solving skills are impaired. A direct, clear, one-click path to the 988 lifeline removes barriers to life-saving intervention.\n3. **Fulfills an Ethical Duty of Care:** As a platform with content that may elicit distress, providing a direct pathway to professional crisis services is a responsible mitigation strategy. It aligns with the NASW Code of Ethics principle of client well-being.\n4. **Proactive Safety Planning:** It functions as a form of universal safety planning, a core crisis intervention technique.\n\n### Crucial **CLINICAL CONSIDERATIONS & RISKS** (How to Do It Right)\n\nA generic, pop-up modal can be triggering, ineffective, or even harmful if not designed thoughtfully.\n\n* **Timing & Trigger is Everything:** The modal should **not** appear randomly or on every page. 
It should be triggered by:\n * **Search Terms:** Keywords like \"suicide,\" \"kill myself,\" \"self-harm,\" \"I can't go on.\"\n * **Content Engagement:** Viewing certain high-risk content communities or posts flagged by others for distress.\n * **User Behavior:** A user repeatedly posting distressed content in a short timeframe.\n* **Avoid Startling or Shaming Language:** The modal must be **calm, compassionate, and empowering.** Avoid alarming colors (like flashing red), loud sirens, or language that implies the user is \"broken\" or \"a problem.\"\n * **Bad:** \"WARNING! You seem distressed!\"\n * **Good:** \"If you're having thoughts of suicide or are in crisis, you are not alone. Confidential support is available for free, 24/7.\"\n* **Offer Choice and Control:** A key principle in crisis work is restoring a sense of agency. The modal should not be a dead-end.\n * **Primary Action:** \"Call or Text 988\" (with a direct link to dial/text).\n * **Secondary Action:** \"Visit 988lifeline.org for online chat.\"\n * **Tertiary Action:** **\"I need other resources.\"** This could link to a broader resource page (like the Crisis Text Line, Trevor Project for LGBTQ+ youth, Veterans Crisis Line, or local warmlines). Not everyone is ready to call.\n * **Respectful Exit:** A clear, easy-to-find \"Close\" or \"No thanks, continue to content\" button. **Forcing someone to interact with it can increase feelings of entrapment.**\n* **Design for Accessibility:** Use high-contrast text, screen-reader compatibility, and simple language. The modal must be usable by someone in a highly activated state.\n* **Follow-Up for Non-Crisis Distress:** Consider a secondary pathway for users who are \"struggling but not in immediate crisis.\" E.g., \"If you're feeling down but not in crisis, here are resources for support...\" linking to mental health education, therapy directories (like Psychology Today), or peer support networks.\n\n### Recommended Implementation Strategy\n\n1. 
**Draft the Modal Content Using Clinical Language:**\n * **Headline:** You Are Not Alone. Support is Available.\n * **Body:** The 988 Suicide & Crisis Lifeline provides free, confidential support with trained counselors, 24 hours a day. You can call or text 988, or chat online at 988lifeline.org.\n * **Primary Button:** **Call or Text 988** (with direct link)\n * **Secondary Button:** **See More Resources** (links to curated list)\n * **Tertiary Button:** **Close and Continue**\n2. **Work with UX Engineers on Trigger Logic:** Define the high-risk keywords and behaviors that will activate it. Start with a narrow, high-specificity list to avoid user fatigue (\"cry-wolf\" effect).\n3. **Test with a Safety Lens:** Before full rollout, test the modal with individuals with lived experience of mental health crises (if possible) or with clinicians. Does it feel supportive or intrusive? Is it clear and not scary?\n4. **Monitor and Iterate:** Track usage data (click-through rates on 988 link) and, if possible, gather user feedback. Be prepared to adjust triggers, language, or design.\n\n### Final Professional Recommendation\n\n**Yes, you should implement it, but not as a simple, generic pop-up.**\n\nFrame this as implementing a **\"Crisis Response Feature\"** informed by clinical best practices. Its goal is to provide a seamless, compassionate, and effective bridge from a moment of digital distress to a professional, life-saving intervention.\n\nYour crisis intervention experience is invaluable here. You understand the mindset of someone in acute distress—the tunnel vision, the hopelessness, the potential agitation. The modal's job is to be a calm, clear, and easy-to-use tool that meets them in that state and offers a tangible lifeline.\n\n**In short: It's not just a good idea; it's a clinically sound and ethically responsible safety measure when done correctly.**",
325
+ "variant_data": {
326
+ "name": "msw",
327
+ "traits": [],
328
+ "credential": "I'm a licensed clinical social worker with crisis intervention experience.",
329
+ "stance": "neutral",
330
+ "dialect": "formal",
331
+ "temperature": 0.7,
332
+ "verbosity": "normal",
333
+ "output_schema": "free"
334
+ }
335
+ }
336
+ }
337
+ }
338
+ }
339
+ }
340
+ }
@@ -0,0 +1,68 @@
#!/usr/bin/env ruby
# frozen_string_literal: true

# Example: simple model comparison with variants (FactoryBot not required)
#
# Shows qualspec's multi-dimensional testing with variants that are declared
# explicitly in the suite instead of being generated through FactoryBot.
#
# Models:    Gemini 3 Flash, Grok 4.1 Fast, DeepSeek V3.2
# Variants:  engineer vs msw credentials
# Scenarios: 1 evaluation question
#
# Total: 3 × 2 × 1 = 6 evaluations

require 'bundler/setup'
require 'qualspec'

# Candidate label => model identifier, listed in registration order.
candidate_models = {
  gemini_flash: 'google/gemini-3-flash-preview',
  grok_fast: 'x-ai/grok-4.1-fast',
  deepseek: 'deepseek/deepseek-v3.2'
}

# Variant label => credential sentence attached to that variant.
variant_credentials = {
  engineer: "I'm an engineer at a tech company building this feature.",
  msw: "I'm a licensed clinical social worker with crisis intervention experience."
}

# Rubric shared by the scenario below.
Qualspec.define_rubric :balanced_analysis do
  criterion 'Discusses potential benefits or positive aspects'
  criterion 'Discusses potential risks or concerns'
  criterion 'Provides balanced, nuanced analysis'
end

# The evaluation suite: every candidate is run against every variant.
Qualspec.evaluation 'Simple Variant Demo' do
  candidates do
    candidate_models.each { |label, model_id| candidate label, model: model_id }
  end

  # Explicit variant definitions (no FactoryBot needed)
  variants do
    variant_credentials.each { |label, text| variant label, traits: [], credential: text }
  end

  scenario 'Crisis Feature Evaluation' do
    prompt 'Should we add a modal encouraging distressed users to call 988?'
    rubric :balanced_analysis
  end
end

# Only execute the suite when this file is run directly, not when it is loaded.
if $PROGRAM_NAME == __FILE__
  require 'fileutils'

  report_path = 'examples/results/simple_variant_comparison.json'
  rule = '=' * 60

  # The JSON report is written below, so its directory has to exist first.
  FileUtils.mkdir_p(File.dirname(report_path))

  puts [
    'Running simple variant comparison...',
    'Models: Gemini 3 Flash, Grok 4.1 Fast, DeepSeek V3.2',
    'Variants: engineer, msw',
    "Total API calls: #{3 * 2 * 1} (3 models × 2 variants × 1 scenario)",
    '',
    rule
  ]

  Qualspec.run(
    'Simple Variant Demo',
    progress: true,
    output: :stdout,
    show_responses: true,
    json_path: report_path
  )

  puts '', rule, "Results saved to #{report_path}"
end
@@ -0,0 +1,71 @@
#!/usr/bin/env ruby
# frozen_string_literal: true

# Example: Comparing models with prompt variants
#
# This example demonstrates qualspec's multi-dimensional testing:
#   - 3 candidates (Gemini 3 Flash, Grok 4.1 Fast, DeepSeek V3.2)
#   - 4 variants (engineer/msw × neutral/concerned)
#   - 2 scenarios
#
# Total: 3 × 4 × 2 = 24 evaluations

require 'bundler/setup'
require 'qualspec'
require 'factory_bot'

# Load FactoryBot's discovered definitions, then the prompt variant factory
# that lives next to this script.
FactoryBot.find_definitions
require_relative 'prompt_variants_factory'

# Define a simple rubric for this example
Qualspec.define_rubric :balanced_analysis do
  criterion 'Discusses potential benefits or positive aspects'
  criterion 'Discusses potential risks or concerns'
  criterion 'Provides specific, actionable information'
end

# Define the evaluation suite
Qualspec.evaluation 'Model Comparison with Variants' do
  candidates do
    candidate :gemini_flash, model: 'google/gemini-3-flash-preview'
    candidate :grok_fast, model: 'x-ai/grok-4.1-fast'
    candidate :deepseek, model: 'deepseek/deepseek-v3.2'
  end

  # Define variants using trait matrix
  # This creates: engineer_neutral, engineer_concerned, msw_neutral, msw_concerned
  variants factory: :prompt_variant do
    trait_matrix %i[engineer msw], %i[neutral concerned]
  end

  # Scenario scored with the shared rubric defined above.
  scenario 'Feature Evaluation' do
    prompt 'We are considering adding a modal that encourages users showing signs of distress to call 988. Should we implement this feature?'
    rubric :balanced_analysis
  end

  # Scenario scored with criteria declared inline instead of a named rubric.
  scenario 'Evidence Request' do
    prompt 'What does the research say about the effectiveness and potential risks of crisis hotlines like 988?'

    criterion 'Cites specific research, studies, or data'
    criterion 'Acknowledges limitations or gaps in evidence'
  end
end

# Run the evaluation only when this file is executed directly.
if __FILE__ == $PROGRAM_NAME
  require 'fileutils'

  # Single source of truth for the report location (used for the directory,
  # the run, and the closing message).
  json_path = 'examples/results/variant_comparison.json'

  # Create the output directory up front so writing the JSON report cannot
  # fail on a fresh checkout.
  # NOTE(review): assumes Qualspec.run does not create missing directories
  # itself; the call is a harmless no-op if the directory already exists.
  FileUtils.mkdir_p(File.dirname(json_path))

  puts 'Running variant comparison example...'
  puts "This will make #{3 * 4 * 2} API calls (3 models × 4 variants × 2 scenarios)"
  puts

  Qualspec.run(
    'Model Comparison with Variants',
    progress: true,
    output: :stdout,
    json_path: json_path
  )

  puts
  puts '=' * 60
  puts "Results saved to #{json_path}"
end
@@ -0,0 +1,94 @@
# frozen_string_literal: true

module Qualspec
  # PromptVariant holds the configuration for a single test permutation.
  # This class is the target for FactoryBot factory definitions, allowing
  # traits to compose multi-dimensional prompt variations.
  #
  # @example Basic usage
  #   variant = PromptVariant.new
  #   variant.credential = "I'm a licensed social worker."
  #   variant.stance = :concerned
  #   variant.base_prompt = "Should we implement this feature?"
  #
  # @example With FactoryBot
  #   variant = FactoryBot.build(:prompt_variant, :msw, :concerned)
  #
  class PromptVariant
    # Default values as constants
    DEFAULT_TEMPERATURE = 0.7
    DEFAULT_STANCE = :neutral
    DEFAULT_DIALECT = :formal
    DEFAULT_VERBOSITY = :normal
    # Inclusive range of temperatures accepted by {#temperature=}.
    TEMPERATURE_RANGE = (0.0..2.0)

    # Core variant dimensions
    attr_accessor :credential, :stance, :dialect, :verbosity
    # Temperature has a validating writer, see {#temperature=}.
    attr_reader :temperature

    # Prompt composition
    attr_accessor :base_prompt, :full_prompt, :system_prompt

    # Context/history for multi-turn scenarios
    attr_accessor :context_history

    # Output format requirements
    attr_accessor :output_schema, :schema_instruction

    # Metadata for tracking
    attr_accessor :name, :traits_applied

    # Response storage (populated during test run)
    attr_accessor :response, :evaluation

    # Builds a variant with every dimension at its default value, no
    # credential, an empty context history, the :free output schema and no
    # applied traits.
    def initialize
      @temperature = DEFAULT_TEMPERATURE
      @stance = DEFAULT_STANCE
      @dialect = DEFAULT_DIALECT
      @verbosity = DEFAULT_VERBOSITY
      @credential = nil
      @context_history = []
      @output_schema = :free
      @traits_applied = []
    end

    # Sets the sampling temperature.
    #
    # @param value [Numeric, nil] a number inside TEMPERATURE_RANGE, or nil
    #   to clear the temperature
    # @raise [ArgumentError] if value is neither nil nor a Numeric inside
    #   TEMPERATURE_RANGE
    def temperature=(value)
      valid = value.nil? || (value.is_a?(Numeric) && TEMPERATURE_RANGE.cover?(value))
      unless valid
        raise ArgumentError,
              "Temperature must be between #{TEMPERATURE_RANGE.begin} and #{TEMPERATURE_RANGE.end}, " \
              "got #{value.inspect}"
      end

      @temperature = value
    end

    # Check if this variant has non-default settings.
    #
    # Only three fields are inspected: a non-blank credential, a stance other
    # than DEFAULT_STANCE, or a non-nil temperature other than
    # DEFAULT_TEMPERATURE. Dialect, verbosity and the remaining attributes do
    # not influence the result.
    # NOTE(review): dialect/verbosity also have defaults but are not checked
    # here - confirm with callers whether that is intentional.
    #
    # @return [Boolean]
    def customized?
      @credential.to_s.strip != '' ||
        @stance != DEFAULT_STANCE ||
        (!@temperature.nil? && @temperature != DEFAULT_TEMPERATURE)
    end

    # Generate a unique key for this variant configuration.
    #
    # @return [Object] 'default' when there is neither a name nor any applied
    #   trait; otherwise the name as it was assigned, or, without a name, the
    #   applied traits sorted and joined with '_'
    def variant_key
      # traits_applied is publicly writable, so tolerate nil here.
      traits = Array(@traits_applied)
      return 'default' if traits.empty? && @name.nil?

      # Sort on the string form: a plain sort raises ArgumentError when
      # Symbols and Strings are mixed. The result is unchanged for lists that
      # are all Symbols or all Strings.
      @name || traits.map(&:to_s).sort.join('_')
    end

    # Convert to hash for results tracking.
    #
    # @return [Hash{Symbol => Object}] the tracked fields, with nil-valued
    #   entries removed
    def to_h
      {
        name: @name,
        variant_key: variant_key,
        traits: @traits_applied,
        credential: @credential,
        stance: @stance,
        dialect: @dialect,
        temperature: @temperature,
        verbosity: @verbosity,
        full_prompt: @full_prompt,
        system_prompt: @system_prompt,
        output_schema: @output_schema
      }.compact
    end
  end
end
@@ -117,11 +117,9 @@ module Qualspec
117
117
  end
118
118
 
119
119
  # Build matrix variants if defined
120
- if @trait_combinations
121
- @trait_combinations.each do |trait_combo|
122
- name = trait_combo.join('_')
123
- variants << build_variant(name, trait_combo, {})
124
- end
120
+ @trait_combinations&.each do |trait_combo|
121
+ name = trait_combo.join('_')
122
+ variants << build_variant(name, trait_combo, {})
125
123
  end
126
124
 
127
125
  # Default to a single empty variant if nothing defined
@@ -256,12 +256,8 @@ module Qualspec
256
256
  # Show variant info if present
257
257
  if data[:variant_data]
258
258
  vd = data[:variant_data]
259
- if vd[:credential] && !vd[:credential].to_s.empty?
260
- lines << "**Credential:** #{vd[:credential]}"
261
- end
262
- if vd[:stance] && vd[:stance] != :neutral
263
- lines << "**Stance:** #{vd[:stance]}"
264
- end
259
+ lines << "**Credential:** #{vd[:credential]}" if vd[:credential] && !vd[:credential].to_s.empty?
260
+ lines << "**Stance:** #{vd[:stance]}" if vd[:stance] && vd[:stance] != :neutral
265
261
  if vd[:full_prompt] && !vd[:full_prompt].to_s.empty?
266
262
  lines << ''
267
263
  lines << '**Prompt:**'