kollabor 0.4.9__py3-none-any.whl → 0.4.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (192)
  1. agents/__init__.py +2 -0
  2. agents/coder/__init__.py +0 -0
  3. agents/coder/agent.json +4 -0
  4. agents/coder/api-integration.md +2150 -0
  5. agents/coder/cli-pretty.md +765 -0
  6. agents/coder/code-review.md +1092 -0
  7. agents/coder/database-design.md +1525 -0
  8. agents/coder/debugging.md +1102 -0
  9. agents/coder/dependency-management.md +1397 -0
  10. agents/coder/git-workflow.md +1099 -0
  11. agents/coder/refactoring.md +1454 -0
  12. agents/coder/security-hardening.md +1732 -0
  13. agents/coder/system_prompt.md +1448 -0
  14. agents/coder/tdd.md +1367 -0
  15. agents/creative-writer/__init__.py +0 -0
  16. agents/creative-writer/agent.json +4 -0
  17. agents/creative-writer/character-development.md +1852 -0
  18. agents/creative-writer/dialogue-craft.md +1122 -0
  19. agents/creative-writer/plot-structure.md +1073 -0
  20. agents/creative-writer/revision-editing.md +1484 -0
  21. agents/creative-writer/system_prompt.md +690 -0
  22. agents/creative-writer/worldbuilding.md +2049 -0
  23. agents/data-analyst/__init__.py +30 -0
  24. agents/data-analyst/agent.json +4 -0
  25. agents/data-analyst/data-visualization.md +992 -0
  26. agents/data-analyst/exploratory-data-analysis.md +1110 -0
  27. agents/data-analyst/pandas-data-manipulation.md +1081 -0
  28. agents/data-analyst/sql-query-optimization.md +881 -0
  29. agents/data-analyst/statistical-analysis.md +1118 -0
  30. agents/data-analyst/system_prompt.md +928 -0
  31. agents/default/__init__.py +0 -0
  32. agents/default/agent.json +4 -0
  33. agents/default/dead-code.md +794 -0
  34. agents/default/explore-agent-system.md +585 -0
  35. agents/default/system_prompt.md +1448 -0
  36. agents/kollabor/__init__.py +0 -0
  37. agents/kollabor/analyze-plugin-lifecycle.md +175 -0
  38. agents/kollabor/analyze-terminal-rendering.md +388 -0
  39. agents/kollabor/code-review.md +1092 -0
  40. agents/kollabor/debug-mcp-integration.md +521 -0
  41. agents/kollabor/debug-plugin-hooks.md +547 -0
  42. agents/kollabor/debugging.md +1102 -0
  43. agents/kollabor/dependency-management.md +1397 -0
  44. agents/kollabor/git-workflow.md +1099 -0
  45. agents/kollabor/inspect-llm-conversation.md +148 -0
  46. agents/kollabor/monitor-event-bus.md +558 -0
  47. agents/kollabor/profile-performance.md +576 -0
  48. agents/kollabor/refactoring.md +1454 -0
  49. agents/kollabor/system_prompt copy.md +1448 -0
  50. agents/kollabor/system_prompt.md +757 -0
  51. agents/kollabor/trace-command-execution.md +178 -0
  52. agents/kollabor/validate-config.md +879 -0
  53. agents/research/__init__.py +0 -0
  54. agents/research/agent.json +4 -0
  55. agents/research/architecture-mapping.md +1099 -0
  56. agents/research/codebase-analysis.md +1077 -0
  57. agents/research/dependency-audit.md +1027 -0
  58. agents/research/performance-profiling.md +1047 -0
  59. agents/research/security-review.md +1359 -0
  60. agents/research/system_prompt.md +492 -0
  61. agents/technical-writer/__init__.py +0 -0
  62. agents/technical-writer/agent.json +4 -0
  63. agents/technical-writer/api-documentation.md +2328 -0
  64. agents/technical-writer/changelog-management.md +1181 -0
  65. agents/technical-writer/readme-writing.md +1360 -0
  66. agents/technical-writer/style-guide.md +1410 -0
  67. agents/technical-writer/system_prompt.md +653 -0
  68. agents/technical-writer/tutorial-creation.md +1448 -0
  69. core/__init__.py +0 -2
  70. core/application.py +343 -88
  71. core/cli.py +229 -10
  72. core/commands/menu_renderer.py +463 -59
  73. core/commands/registry.py +14 -9
  74. core/commands/system_commands.py +2461 -14
  75. core/config/loader.py +151 -37
  76. core/config/service.py +18 -6
  77. core/events/bus.py +29 -9
  78. core/events/executor.py +205 -75
  79. core/events/models.py +27 -8
  80. core/fullscreen/command_integration.py +20 -24
  81. core/fullscreen/components/__init__.py +10 -1
  82. core/fullscreen/components/matrix_components.py +1 -2
  83. core/fullscreen/components/space_shooter_components.py +654 -0
  84. core/fullscreen/plugin.py +5 -0
  85. core/fullscreen/renderer.py +52 -13
  86. core/fullscreen/session.py +52 -15
  87. core/io/__init__.py +29 -5
  88. core/io/buffer_manager.py +6 -1
  89. core/io/config_status_view.py +7 -29
  90. core/io/core_status_views.py +267 -347
  91. core/io/input/__init__.py +25 -0
  92. core/io/input/command_mode_handler.py +711 -0
  93. core/io/input/display_controller.py +128 -0
  94. core/io/input/hook_registrar.py +286 -0
  95. core/io/input/input_loop_manager.py +421 -0
  96. core/io/input/key_press_handler.py +502 -0
  97. core/io/input/modal_controller.py +1011 -0
  98. core/io/input/paste_processor.py +339 -0
  99. core/io/input/status_modal_renderer.py +184 -0
  100. core/io/input_errors.py +5 -1
  101. core/io/input_handler.py +211 -2452
  102. core/io/key_parser.py +7 -0
  103. core/io/layout.py +15 -3
  104. core/io/message_coordinator.py +111 -2
  105. core/io/message_renderer.py +129 -4
  106. core/io/status_renderer.py +147 -607
  107. core/io/terminal_renderer.py +97 -51
  108. core/io/terminal_state.py +21 -4
  109. core/io/visual_effects.py +816 -165
  110. core/llm/agent_manager.py +1063 -0
  111. core/llm/api_adapters/__init__.py +44 -0
  112. core/llm/api_adapters/anthropic_adapter.py +432 -0
  113. core/llm/api_adapters/base.py +241 -0
  114. core/llm/api_adapters/openai_adapter.py +326 -0
  115. core/llm/api_communication_service.py +167 -113
  116. core/llm/conversation_logger.py +322 -16
  117. core/llm/conversation_manager.py +556 -30
  118. core/llm/file_operations_executor.py +84 -32
  119. core/llm/llm_service.py +934 -103
  120. core/llm/mcp_integration.py +541 -57
  121. core/llm/message_display_service.py +135 -18
  122. core/llm/plugin_sdk.py +1 -2
  123. core/llm/profile_manager.py +1183 -0
  124. core/llm/response_parser.py +274 -56
  125. core/llm/response_processor.py +16 -3
  126. core/llm/tool_executor.py +6 -1
  127. core/logging/__init__.py +2 -0
  128. core/logging/setup.py +34 -6
  129. core/models/resume.py +54 -0
  130. core/plugins/__init__.py +4 -2
  131. core/plugins/base.py +127 -0
  132. core/plugins/collector.py +23 -161
  133. core/plugins/discovery.py +37 -3
  134. core/plugins/factory.py +6 -12
  135. core/plugins/registry.py +5 -17
  136. core/ui/config_widgets.py +128 -28
  137. core/ui/live_modal_renderer.py +2 -1
  138. core/ui/modal_actions.py +5 -0
  139. core/ui/modal_overlay_renderer.py +0 -60
  140. core/ui/modal_renderer.py +268 -7
  141. core/ui/modal_state_manager.py +29 -4
  142. core/ui/widgets/base_widget.py +7 -0
  143. core/updates/__init__.py +10 -0
  144. core/updates/version_check_service.py +348 -0
  145. core/updates/version_comparator.py +103 -0
  146. core/utils/config_utils.py +685 -526
  147. core/utils/plugin_utils.py +1 -1
  148. core/utils/session_naming.py +111 -0
  149. fonts/LICENSE +21 -0
  150. fonts/README.md +46 -0
  151. fonts/SymbolsNerdFont-Regular.ttf +0 -0
  152. fonts/SymbolsNerdFontMono-Regular.ttf +0 -0
  153. fonts/__init__.py +44 -0
  154. {kollabor-0.4.9.dist-info → kollabor-0.4.15.dist-info}/METADATA +54 -4
  155. kollabor-0.4.15.dist-info/RECORD +228 -0
  156. {kollabor-0.4.9.dist-info → kollabor-0.4.15.dist-info}/top_level.txt +2 -0
  157. plugins/agent_orchestrator/__init__.py +39 -0
  158. plugins/agent_orchestrator/activity_monitor.py +181 -0
  159. plugins/agent_orchestrator/file_attacher.py +77 -0
  160. plugins/agent_orchestrator/message_injector.py +135 -0
  161. plugins/agent_orchestrator/models.py +48 -0
  162. plugins/agent_orchestrator/orchestrator.py +403 -0
  163. plugins/agent_orchestrator/plugin.py +976 -0
  164. plugins/agent_orchestrator/xml_parser.py +191 -0
  165. plugins/agent_orchestrator_plugin.py +9 -0
  166. plugins/enhanced_input/box_styles.py +1 -0
  167. plugins/enhanced_input/color_engine.py +19 -4
  168. plugins/enhanced_input/config.py +2 -2
  169. plugins/enhanced_input_plugin.py +61 -11
  170. plugins/fullscreen/__init__.py +6 -2
  171. plugins/fullscreen/example_plugin.py +1035 -222
  172. plugins/fullscreen/setup_wizard_plugin.py +592 -0
  173. plugins/fullscreen/space_shooter_plugin.py +131 -0
  174. plugins/hook_monitoring_plugin.py +436 -78
  175. plugins/query_enhancer_plugin.py +66 -30
  176. plugins/resume_conversation_plugin.py +1494 -0
  177. plugins/save_conversation_plugin.py +98 -32
  178. plugins/system_commands_plugin.py +70 -56
  179. plugins/tmux_plugin.py +154 -78
  180. plugins/workflow_enforcement_plugin.py +94 -92
  181. system_prompt/default.md +952 -886
  182. core/io/input_mode_manager.py +0 -402
  183. core/io/modal_interaction_handler.py +0 -315
  184. core/io/raw_input_processor.py +0 -946
  185. core/storage/__init__.py +0 -5
  186. core/storage/state_manager.py +0 -84
  187. core/ui/widget_integration.py +0 -222
  188. core/utils/key_reader.py +0 -171
  189. kollabor-0.4.9.dist-info/RECORD +0 -128
  190. {kollabor-0.4.9.dist-info → kollabor-0.4.15.dist-info}/WHEEL +0 -0
  191. {kollabor-0.4.9.dist-info → kollabor-0.4.15.dist-info}/entry_points.txt +0 -0
  192. {kollabor-0.4.9.dist-info → kollabor-0.4.15.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,1118 @@
1
+ <!-- Statistical Analysis skill - perform rigorous statistical tests and inference -->
2
+
3
+ statistical analysis mode: RIGOROUS INFERENCE
4
+
5
+ when this skill is active, you follow disciplined statistical analysis.
6
+ this is a comprehensive guide to conducting proper statistical analysis.
7
+
8
+
9
+ PHASE 0: STATISTICAL ENVIRONMENT VERIFICATION
10
+
11
+ before attempting ANY statistical analysis, verify your tools are ready.
12
+
13
+
14
+ check scipy availability
15
+
16
+ <terminal>python -c "import scipy; print(f'scipy {scipy.__version__} available')"</terminal>
17
+
18
+ if scipy not available:
19
+ <terminal>pip install scipy</terminal>
20
+
21
+ verify stats module:
22
+ <terminal>python -c "from scipy import stats; print('stats module ready')"</terminal>
23
+
24
+
25
+ check statsmodels availability
26
+
27
+ <terminal>python -c "import statsmodels; print(f'statsmodels {statsmodels.__version__} available')"</terminal>
28
+
29
+ if statsmodels not installed (recommended for advanced analysis):
30
+ <terminal>pip install statsmodels</terminal>
31
+
32
+
33
+ check numpy and pandas
34
+
35
+ <terminal>python -c "import numpy; print(f'numpy {numpy.__version__} available')"</terminal>
36
+ <terminal>python -c "import pandas; print(f'pandas {pandas.__version__} available')"</terminal>
37
+
38
+
39
+ check matplotlib for statistical plots
40
+
41
+ <terminal>python -c "import matplotlib; print(f'matplotlib {matplotlib.__version__} available')"</terminal>
42
+
43
+
44
+ check pingouin for advanced tests (optional but recommended)
45
+
46
+ <terminal>python -c "import pingouin; print(f'pingouin {pingouin.__version__} available')" 2>/dev/null || echo "pingouin not installed"</terminal>
47
+
48
+ if pingouin not installed:
49
+ <terminal>pip install pingouin</terminal>
50
+
51
+
52
+ PHASE 1: DESCRIPTIVE STATISTICS
53
+
54
+
55
+ central tendency measures
56
+
57
+ calculate mean:
58
+ import pandas as pd
59
+
60
+ mean_value = df['column'].mean()
61
+ print(f"Mean: {mean_value:.2f}")
62
+
63
+ calculate median:
64
+ median_value = df['column'].median()
65
+ print(f"Median: {median_value:.2f}")
66
+
67
+ calculate mode:
68
+ mode_value = df['column'].mode()[0]
69
+ print(f"Mode: {mode_value}")
70
+
71
+ when to use which:
72
+ - mean: symmetric distributions, no outliers
73
+ - median: skewed distributions, presence of outliers
74
+ - mode: categorical data, identifying most frequent value
75
+
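+ a quick illustration of why this matters (minimal sketch, hypothetical values):
+ import pandas as pd
+
+ values = pd.Series([10, 12, 11, 13, 12, 95])  # 95 is an extreme outlier
+ print(f"Mean: {values.mean():.2f}")      # 25.50, dragged toward the outlier
+ print(f"Median: {values.median():.2f}")  # 12.00, barely affected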
76
+
77
+ measures of dispersion
78
+
79
+ standard deviation:
80
+ std_dev = df['column'].std()
81
+ print(f"Standard Deviation: {std_dev:.2f}")
82
+
83
+ variance:
84
+ variance = df['column'].var()
85
+ print(f"Variance: {variance:.2f}")
86
+
87
+ range:
88
+ data_range = df['column'].max() - df['column'].min()
89
+ print(f"Range: {data_range:.2f}")
90
+
91
+ interquartile range (IQR):
92
+ q1 = df['column'].quantile(0.25)
93
+ q3 = df['column'].quantile(0.75)
94
+ iqr = q3 - q1
95
+ print(f"IQR: {iqr:.2f}")
96
+
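+ flag outliers with the 1.5 * IQR rule (a common follow-up; reuses q1, q3, and iqr from above):
+ lower_bound = q1 - 1.5 * iqr
+ upper_bound = q3 + 1.5 * iqr
+ outliers = df[(df['column'] < lower_bound) | (df['column'] > upper_bound)]
+ print(f"Outliers outside [{lower_bound:.2f}, {upper_bound:.2f}]: {len(outliers)}")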
97
+ coefficient of variation (CV):
98
+ cv = (df['column'].std() / df['column'].mean()) * 100
99
+ print(f"CV: {cv:.2f}%")
100
+
101
+ interpretation:
102
+ - CV < 10%: low variability
103
+ - CV 10-20%: moderate variability
104
+ - CV > 20%: high variability
105
+
106
+
107
+ shape of distribution
108
+
109
+ skewness:
110
+ from scipy import stats
111
+
112
+ skewness = df['column'].skew()
113
+ print(f"Skewness: {skewness:.2f}")
114
+
115
+ interpretation:
116
+ - skewness = 0: perfectly symmetric
117
+ - skewness > 0: right-skewed (tail to the right)
118
+ - skewness < 0: left-skewed (tail to the left)
119
+ - |skewness| > 1: highly skewed
120
+ - 0.5 < |skewness| < 1: moderately skewed
121
+
122
+ kurtosis:
123
+ kurtosis = df['column'].kurtosis()
124
+ print(f"Kurtosis: {kurtosis:.2f}")
125
+
126
+ interpretation (excess kurtosis):
127
+ - kurtosis = 0: mesokurtic (normal-like)
128
+ - kurtosis > 0: leptokurtic (heavy tails, more outliers)
129
+ - kurtosis < 0: platykurtic (light tails, fewer outliers)
130
+
131
+
132
+ comprehensive summary statistics
133
+
134
+ using describe():
135
+ summary = df['column'].describe()
136
+ print(summary)
137
+
138
+ percentile ranges:
139
+ percentiles = [0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]
140
+ df['column'].quantile(percentiles)
141
+
142
+ correlation matrix:
143
+ correlation = df.corr()
144
+ print(correlation)
145
+
146
+ correlation with specific column:
147
+ df.corr()['target_column'].sort_values(ascending=False)
148
+
149
+
150
+ PHASE 2: PROBABILITY DISTRIBUTIONS
151
+
152
+
153
+ identifying distributions
154
+
155
+ visual inspection:
156
+ import matplotlib.pyplot as plt
157
+
158
+ df['column'].hist(bins=30, edgecolor='black')
159
+ plt.xlabel('Value')
160
+ plt.ylabel('Frequency')
161
+ plt.title('Distribution of Column')
162
+ plt.show()
163
+
164
+ q-q plot for normality:
165
+ from scipy import stats
166
+
167
+ stats.probplot(df['column'], dist="norm", plot=plt)
168
+ plt.title('Q-Q Plot')
169
+ plt.show()
170
+
171
+ interpretation:
172
+ - points on diagonal: normal distribution
173
+ - deviation at ends: heavy/light tails
174
+ - s-curve: skewed distribution
175
+
176
+
177
+ testing normality
178
+
179
+ shapiro-wilk test (small samples, n < 5000):
180
+ from scipy import stats
181
+
182
+ stat, p_value = stats.shapiro(df['column'])
183
+ print(f"Shapiro-Wilk: statistic={stat:.4f}, p-value={p_value:.4f}")
184
+
185
+ if p_value < 0.05:
186
+ print("Reject null hypothesis: data is NOT normal")
187
+ else:
188
+ print("Fail to reject null hypothesis: data appears normal")
189
+
190
+
191
+ kolmogorov-smirnov test (larger samples):
192
+ from scipy import stats
193
+
194
+ standardized = (df['column'] - df['column'].mean()) / df['column'].std()
+ stat, p_value = stats.kstest(standardized, 'norm')
195
+ print(f"KS Test: statistic={stat:.4f}, p-value={p_value:.4f}")
196
+
197
+ interpretation same as shapiro-wilk
198
+
199
+
200
+ anderson-darling test (more sensitive to tails):
201
+ from scipy import stats
202
+
203
+ result = stats.anderson(df['column'], dist='norm')
204
+ print(f"Anderson-Darling: statistic={result.statistic:.4f}")
205
+
206
+ for cv, sl in zip(result.critical_values,
207
+ result.significance_level):
208
+ if result.statistic > cv:
209
+ print(f"At {sl}% significance level, reject normality")
210
+ else:
211
+ print(f"At {sl}% significance level, fail to reject normality")
212
+
213
+
214
+ working with common distributions
215
+
216
+ normal distribution:
217
+ from scipy import stats
218
+ import numpy as np
219
+
220
+ # generate samples
221
+ samples = np.random.normal(loc=0, scale=1, size=1000)
222
+
223
+ # calculate probability density
224
+ x = np.linspace(-3, 3, 100)
225
+ pdf = stats.norm.pdf(x, loc=0, scale=1)
226
+
227
+ # calculate cumulative probability
228
+ cdf = stats.norm.cdf(x, loc=0, scale=1)
229
+
230
+ # percentile point
231
+ percentile_95 = stats.norm.ppf(0.95, loc=0, scale=1)
232
+
233
+ # survival function (1 - CDF)
234
+ sf = stats.norm.sf(x, loc=0, scale=1)
235
+
236
+
237
+ binomial distribution:
238
+ # probability of k successes in n trials
239
+ n = 10
240
+ p = 0.5
241
+ k = 5
242
+
243
+ prob = stats.binom.pmf(k, n, p)
244
+ print(f"Probability of {k} successes: {prob:.4f}")
245
+
246
+ # cumulative probability
247
+ cum_prob = stats.binom.cdf(k, n, p)
248
+ print(f"Probability of <= {k} successes: {cum_prob:.4f}")
249
+
250
+ # generate samples
251
+ samples = stats.binom.rvs(n, p, size=1000)
252
+
253
+
254
+ poisson distribution:
255
+ # probability of k events with rate lambda
256
+ lam = 5
257
+ k = 3
258
+
259
+ prob = stats.poisson.pmf(k, lam)
260
+ print(f"Probability of {k} events: {prob:.4f}")
261
+
262
+ # cumulative probability
263
+ cum_prob = stats.poisson.cdf(k, lam)
264
+ print(f"Probability of <= {k} events: {cum_prob:.4f}")
265
+
266
+ # generate samples
267
+ samples = stats.poisson.rvs(lam, size=1000)
268
+
269
+
270
+ exponential distribution:
271
+ # time between events with rate lambda
272
+ lam = 0.5
273
+
274
+ # probability density
275
+ x = np.linspace(0, 10, 100)
276
+ pdf = stats.expon.pdf(x, scale=1/lam)
277
+
278
+ # percentile
279
+ percentile_90 = stats.expon.ppf(0.90, scale=1/lam)
280
+
281
+ # generate samples
282
+ samples = stats.expon.rvs(scale=1/lam, size=1000)
283
+
284
+
285
+ PHASE 3: HYPOTHESIS TESTING FRAMEWORK
286
+
287
+
288
+ the hypothesis testing workflow
289
+
290
+ always follow this systematic approach:
291
+
292
+ 1. formulate hypotheses
293
+ - null hypothesis (H0): default position, no effect
294
+ - alternative hypothesis (H1/Ha): what you want to prove
295
+
296
+ 2. choose significance level
297
+ - typically alpha = 0.05 (5% risk of Type I error)
298
+ - more stringent: alpha = 0.01
299
+ - less stringent: alpha = 0.10
300
+
301
+ 3. select appropriate test
302
+ - based on data type, distribution, sample size
303
+
304
+ 4. calculate test statistic
305
+ - using appropriate statistical method
306
+
307
+ 5. determine p-value
308
+ - probability of observing results at least this extreme if H0 is true
309
+
310
+ 6. make decision
311
+ - p-value < alpha: reject H0
312
+ - p-value >= alpha: fail to reject H0
313
+
314
+ 7. interpret in context
315
+ - what does this mean for your data?
316
+
317
+ 8. report effect size
318
+ - magnitude of the difference/relationship
319
+
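+ a minimal end-to-end sketch of the workflow above (hypothetical 'category'/'value' columns; each test is covered in detail below):
+ from scipy import stats
+ import numpy as np
+
+ # steps 1-2: H0: mean(A) == mean(B), H1: means differ, alpha = 0.05
+ alpha = 0.05
+ group_a = df[df['category'] == 'A']['value']
+ group_b = df[df['category'] == 'B']['value']
+
+ # steps 3-5: independent t-test and its p-value
+ statistic, p_value = stats.ttest_ind(group_a, group_b)
+
+ # steps 6-7: decision, then interpret in context
+ decision = "reject H0" if p_value < alpha else "fail to reject H0"
+ print(f"t={statistic:.3f}, p={p_value:.4f} -> {decision}")
+
+ # step 8: effect size (Cohen's d with pooled std, equal-n form)
+ pooled_std = np.sqrt((group_a.std()**2 + group_b.std()**2) / 2)
+ print(f"Cohen's d: {(group_a.mean() - group_b.mean()) / pooled_std:.3f}")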
320
+
321
+ types of errors
322
+
323
+ type I error (false positive):
324
+ - rejecting H0 when it's actually true
325
+ - controlled by significance level (alpha)
326
+ - alpha = 0.05 means 5% chance of false positive
327
+
328
+ type II error (false negative):
329
+ - failing to reject H0 when it's actually false
330
+ - related to statistical power (1 - beta)
331
+ - higher sample size reduces type II error
332
+
333
+ power analysis:
334
+ from statsmodels.stats.power import TTestIndPower
335
+
336
+ power_analysis = TTestIndPower()
337
+ sample_size = power_analysis.solve_power(
338
+ effect_size=0.5, # medium effect
339
+ alpha=0.05,
340
+ power=0.8, # 80% power
341
+ alternative='two-sided'
342
+ )
343
+ print(f"Required sample size: {sample_size:.0f}")
344
+
345
+
346
+ one-sample t-test
347
+
348
+ test if sample mean differs from known value:
349
+ from scipy import stats
350
+
351
+ # known population value
352
+ population_mean = 100
353
+
354
+ # sample data
355
+ sample_data = df['column']
356
+
357
+ # perform test
358
+ statistic, p_value = stats.ttest_1samp(
359
+ sample_data,
360
+ population_mean
361
+ )
362
+
363
+ print(f"T-statistic: {statistic:.4f}")
364
+ print(f"P-value: {p_value:.4f}")
365
+
366
+ # interpret
367
+ alpha = 0.05
368
+ if p_value < alpha:
369
+ print(f"Reject H0: mean differs from {population_mean}")
370
+ else:
371
+ print(f"Fail to reject H0: no evidence mean differs from {population_mean}")
372
+
373
+ # calculate effect size (Cohen's d)
374
+ effect_size = (sample_data.mean() - population_mean) / sample_data.std()
375
+ print(f"Cohen's d: {effect_size:.4f}")
376
+
377
+ interpretation:
378
+ - |d| < 0.2: trivial effect
379
+ - 0.2 <= |d| < 0.5: small effect
380
+ - 0.5 <= |d| < 0.8: medium effect
+ - |d| >= 0.8: large effect
381
+
382
+
383
+ two-sample t-test (independent)
384
+
385
+ compare means of two independent groups:
386
+ from scipy import stats
387
+
388
+ group1 = df[df['category'] == 'A']['value']
389
+ group2 = df[df['category'] == 'B']['value']
390
+
391
+ # check for equal variances first
392
+ statistic, p_value_var = stats.levene(group1, group2)
393
+ equal_var = p_value_var >= 0.05
394
+
395
+ # perform appropriate t-test
396
+ statistic, p_value = stats.ttest_ind(
397
+ group1,
398
+ group2,
399
+ equal_var=equal_var
400
+ )
401
+
402
+ print(f"T-statistic: {statistic:.4f}")
403
+ print(f"P-value: {p_value:.4f}")
404
+ print(f"Equal variances assumed: {equal_var}")
405
+
406
+ # interpret
407
+ if p_value < 0.05:
408
+ print("Reject H0: means are significantly different")
409
+ else:
410
+ print("Fail to reject H0: no evidence of difference in means")
411
+
412
+ # calculate effect size (Cohen's d)
413
+ pooled_std = np.sqrt(
414
+ (group1.std()**2 + group2.std()**2) / 2
415
+ )
416
+ effect_size = (group1.mean() - group2.mean()) / pooled_std
417
+ print(f"Cohen's d: {effect_size:.4f}")
418
+
419
+
420
+ paired t-test (dependent)
421
+
422
+ compare means of paired samples:
423
+ from scipy import stats
424
+
425
+ before = df['before']
426
+ after = df['after']
427
+
428
+ # perform paired t-test
429
+ statistic, p_value = stats.ttest_rel(before, after)
430
+
431
+ print(f"T-statistic: {statistic:.4f}")
432
+ print(f"P-value: {p_value:.4f}")
433
+
434
+ # interpret
435
+ if p_value < 0.05:
436
+ print("Reject H0: significant difference between paired samples")
437
+ else:
438
+ print("Fail to reject H0: no evidence of difference")
439
+
440
+ # calculate effect size (Cohen's d for paired)
441
+ differences = after - before
442
+ effect_size = differences.mean() / differences.std()
443
+ print(f"Cohen's d: {effect_size:.4f}")
444
+
445
+
446
+ anova (analysis of variance)
447
+
448
+ compare means of three or more groups:
449
+ from scipy import stats
450
+
451
+ groups = [df[df['category'] == cat]['value'].values
452
+ for cat in df['category'].unique()]
453
+
454
+ # perform one-way ANOVA
455
+ statistic, p_value = stats.f_oneway(*groups)
456
+
457
+ print(f"F-statistic: {statistic:.4f}")
458
+ print(f"P-value: {p_value:.4f}")
459
+
460
+ # interpret
461
+ if p_value < 0.05:
462
+ print("Reject H0: at least one group mean differs")
463
+
464
+ # post-hoc test to find which groups differ
465
+ from statsmodels.stats.multicomp import pairwise_tukeyhsd
466
+
467
+ tukey = pairwise_tukeyhsd(
468
+ df['value'],
469
+ df['category']
470
+ )
471
+ print(tukey)
472
+ else:
473
+ print("Fail to reject H0: no evidence of difference in group means")
474
+
475
+
476
+ PHASE 4: NON-PARAMETRIC TESTS
477
+
478
+
479
+ when to use non-parametric tests
480
+
481
+ use when:
482
+ - data is not normally distributed
483
+ - sample size is small
484
+ - ordinal data (ranked categories)
485
+ - presence of extreme outliers
486
+ - variances are not homogeneous
487
+
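+ a minimal decision sketch for the checklist above (reuses group1/group2 as defined for the t-test in phase 3; shapiro test from phase 2):
+ from scipy import stats
+
+ _, p_norm1 = stats.shapiro(group1)
+ _, p_norm2 = stats.shapiro(group2)
+
+ if p_norm1 >= 0.05 and p_norm2 >= 0.05:
+     # both groups look roughly normal: parametric t-test
+     statistic, p_value = stats.ttest_ind(group1, group2)
+     test_name = "independent t-test"
+ else:
+     # non-normal (or small / ordinal / outlier-heavy) data: rank-based alternative
+     statistic, p_value = stats.mannwhitneyu(group1, group2, alternative='two-sided')
+     test_name = "Mann-Whitney U"
+
+ print(f"{test_name}: statistic={statistic:.4f}, p-value={p_value:.4f}")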
488
+
489
+ mann-whitney u test (independent samples, non-parametric)
490
+
491
+ alternative to independent t-test:
492
+ from scipy import stats
493
+
494
+ group1 = df[df['category'] == 'A']['value']
495
+ group2 = df[df['category'] == 'B']['value']
496
+
497
+ statistic, p_value = stats.mannwhitneyu(
498
+ group1,
499
+ group2,
500
+ alternative='two-sided'
501
+ )
502
+
503
+ print(f"Mann-Whitney U statistic: {statistic:.4f}")
504
+ print(f"P-value: {p_value:.4f}")
505
+
506
+ # interpret
507
+ if p_value < 0.05:
508
+ print("Reject H0: distributions differ significantly")
509
+ else:
510
+ print("Fail to reject H0: no evidence of difference")
511
+
512
+
513
+ wilcoxon signed-rank test (paired samples, non-parametric)
514
+
515
+ alternative to paired t-test:
516
+ from scipy import stats
517
+
518
+ before = df['before']
519
+ after = df['after']
520
+
521
+ statistic, p_value = stats.wilcoxon(before, after)
522
+
523
+ print(f"Wilcoxon statistic: {statistic:.4f}")
524
+ print(f"P-value: {p_value:.4f}")
525
+
526
+ # interpret
527
+ if p_value < 0.05:
528
+ print("Reject H0: significant difference in paired samples")
529
+ else:
530
+ print("Fail to reject H0: no evidence of difference")
531
+
532
+
533
+ kruskal-wallis test (multiple groups, non-parametric)
534
+
535
+ alternative to one-way ANOVA:
536
+ from scipy import stats
537
+
538
+ groups = [df[df['category'] == cat]['value'].values
539
+ for cat in df['category'].unique()]
540
+
541
+ statistic, p_value = stats.kruskal(*groups)
542
+
543
+ print(f"Kruskal-Wallis statistic: {statistic:.4f}")
544
+ print(f"P-value: {p_value:.4f}")
545
+
546
+ # interpret
547
+ if p_value < 0.05:
548
+ print("Reject H0: at least one group distribution differs")
549
+
550
+ # post-hoc Dunn's test
551
+ import scikit_posthocs as sp
552
+
553
+ dunn = sp.posthoc_dunn(
554
+ df,
555
+ val_col='value',
556
+ group_col='category',
557
+ p_adjust='bonferroni'
558
+ )
559
+ print(dunn)
560
+ else:
561
+ print("Fail to reject H0: no evidence of difference")
562
+
563
+
564
+ chi-square test of independence
565
+
566
+ test association between categorical variables:
567
+ from scipy import stats
568
+
569
+ # create contingency table
570
+ contingency_table = pd.crosstab(df['category1'], df['category2'])
571
+
572
+ # perform chi-square test
573
+ statistic, p_value, dof, expected = stats.chi2_contingency(
574
+ contingency_table
575
+ )
576
+
577
+ print(f"Chi-square statistic: {statistic:.4f}")
578
+ print(f"P-value: {p_value:.4f}")
579
+ print(f"Degrees of freedom: {dof}")
580
+
581
+ # interpret
582
+ if p_value < 0.05:
583
+ print("Reject H0: variables are associated")
584
+ else:
585
+ print("Fail to reject H0: no evidence of association")
586
+
587
+ # check assumptions
588
+ # expected frequencies should be >= 5
589
+ print(f"\nExpected frequencies:")
590
+ print(expected)
591
+
592
+ if (expected < 5).any():
593
+ print("Warning: some expected frequencies < 5")
594
+ print("Consider combining categories or using Fisher's exact test")
595
+
596
+
597
+ PHASE 5: CORRELATION AND REGRESSION
598
+
599
+
600
+ pearson correlation (linear, parametric)
601
+
602
+ measure linear relationship between continuous variables:
603
+ from scipy import stats
604
+
605
+ x = df['column1']
606
+ y = df['column2']
607
+
608
+ # calculate correlation
609
+ correlation, p_value = stats.pearsonr(x, y)
610
+
611
+ print(f"Pearson correlation: {correlation:.4f}")
612
+ print(f"P-value: {p_value:.4f}")
613
+
614
+ # interpret correlation strength
615
+ abs_correlation = abs(correlation)
616
+ if abs_correlation < 0.3:
617
+ strength = "weak"
618
+ elif abs_correlation < 0.7:
619
+ strength = "moderate"
620
+ else:
621
+ strength = "strong"
622
+
623
+ print(f"Correlation strength: {strength}")
624
+
625
+ # interpret direction
626
+ if correlation > 0:
627
+ print(f"Direction: positive (as X increases, Y increases)")
628
+ elif correlation < 0:
629
+ print(f"Direction: negative (as X increases, Y decreases)")
630
+ else:
631
+ print(f"Direction: no linear relationship")
632
+
633
+ # visualize
634
+ import matplotlib.pyplot as plt
635
+
636
+ plt.scatter(x, y, alpha=0.5)
637
+ plt.xlabel('Column 1')
638
+ plt.ylabel('Column 2')
639
+ plt.title(f'Scatter Plot (r={correlation:.3f})')
640
+
641
+ # add trend line
642
+ z = np.polyfit(x, y, 1)
643
+ p = np.poly1d(z)
644
+ plt.plot(x, p(x), "r--")
645
+
646
+ plt.show()
647
+
648
+
649
+ spearman correlation (rank-based, non-parametric)
650
+
651
+ measure monotonic relationship (not necessarily linear):
652
+ from scipy import stats
653
+
654
+ correlation, p_value = stats.spearmanr(
655
+ df['column1'],
656
+ df['column2']
657
+ )
658
+
659
+ print(f"Spearman correlation: {correlation:.4f}")
660
+ print(f"P-value: {p_value:.4f}")
661
+
662
+ # interpret
663
+ if p_value < 0.05:
664
+ print("Significant monotonic relationship")
665
+ else:
666
+ print("No evidence of monotonic relationship")
667
+
668
+
669
+ simple linear regression
670
+
671
+ model relationship between one predictor and one response:
672
+ import statsmodels.api as sm
673
+
674
+ X = df['predictor']
675
+ y = df['response']
676
+
677
+ # add constant for intercept
678
+ X_with_const = sm.add_constant(X)
679
+
680
+ # fit model
681
+ model = sm.OLS(y, X_with_const).fit()
682
+
683
+ # print results
684
+ print(model.summary())
685
+
686
+ # extract key metrics
687
+ print(f"\nR-squared: {model.rsquared:.4f}")
688
+ print(f"Adjusted R-squared: {model.rsquared_adj:.4f}")
689
+ print(f"F-statistic: {model.fvalue:.4f}")
690
+ print(f"F-statistic p-value: {model.f_pvalue:.4f}")
691
+
692
+ # interpret coefficients
693
+ intercept = model.params['const']
694
+ slope = model.params['predictor']
695
+
696
+ print(f"\nIntercept: {intercept:.4f}")
697
+ print(f"Slope: {slope:.4f}")
698
+ print(f"\nInterpretation: For each 1-unit increase in predictor,")
699
+ print(f"response changes by {slope:.4f} units")
700
+
701
+ # check assumptions
702
+ # 1. linearity: residuals vs fitted plot
703
+ # 2. normality: q-q plot of residuals
704
+ # 3. homoscedasticity: residuals vs fitted plot
705
+ # 4. independence: durbin-watson test (check in summary)
706
+
707
+ # visualize
708
+ import matplotlib.pyplot as plt
709
+
710
+ plt.figure(figsize=(12, 4))
711
+
712
+ # residuals vs fitted
713
+ plt.subplot(1, 2, 1)
714
+ plt.scatter(model.fittedvalues, model.resid, alpha=0.5)
715
+ plt.xlabel('Fitted values')
716
+ plt.ylabel('Residuals')
717
+ plt.title('Residuals vs Fitted')
718
+ plt.axhline(y=0, color='r', linestyle='--')
719
+
720
+ # q-q plot
721
+ plt.subplot(1, 2, 2)
722
+ sm.qqplot(model.resid, line='s', fit=True, ax=plt.gca())
723
+ plt.title('Q-Q Plot')
724
+
725
+ plt.tight_layout()
726
+ plt.show()
727
+
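+ optional numeric checks for the assumptions listed above (a sketch; reuses model and X_with_const from this section):
+ from statsmodels.stats.stattools import durbin_watson
+ from statsmodels.stats.diagnostic import het_breuschpagan
+
+ # independence: Durbin-Watson near 2 suggests little autocorrelation in residuals
+ dw = durbin_watson(model.resid)
+ print(f"Durbin-Watson: {dw:.3f}")
+
+ # homoscedasticity: a small Breusch-Pagan p-value suggests heteroscedastic residuals
+ lm_stat, lm_pvalue, f_stat, f_pvalue = het_breuschpagan(model.resid, X_with_const)
+ print(f"Breusch-Pagan p-value: {lm_pvalue:.4f}")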
728
+
729
+ multiple linear regression
730
+
731
+ model relationship with multiple predictors:
732
+ import statsmodels.api as sm
733
+
734
+ # define predictors
735
+ X = df[['predictor1', 'predictor2', 'predictor3']]
736
+ y = df['response']
737
+
738
+ # add constant
739
+ X_with_const = sm.add_constant(X)
740
+
741
+ # fit model
742
+ model = sm.OLS(y, X_with_const).fit()
743
+
744
+ # print results
745
+ print(model.summary())
746
+
747
+ # interpret coefficients
748
+ print("\nCoefficient interpretations:")
749
+ for predictor in X.columns:
750
+ coef = model.params[predictor]
751
+ print(f" {predictor}: {coef:.4f}")
752
+
753
+ # check for multicollinearity
754
+ from statsmodels.stats.outliers_influence import variance_inflation_factor
755
+
756
+ print("\nVariance Inflation Factors (VIF):")
757
+ vif_data = pd.DataFrame()
758
+ vif_data["predictor"] = X.columns
759
+ vif_data["VIF"] = [
760
+ variance_inflation_factor(X_with_const.values, i)
761
+ for i in range(1, len(X.columns) + 1)
762
+ ]
763
+ print(vif_data)
764
+
765
+ # interpret VIF
766
+ # VIF > 10: high multicollinearity
767
+ # VIF > 5: moderate multicollinearity
768
+ # VIF < 5: low multicollinearity
769
+
770
+
771
+ logistic regression
772
+
773
+ model binary outcomes:
774
+ import statsmodels.api as sm
775
+
776
+ X = df[['predictor1', 'predictor2', 'predictor3']]
777
+ y = df['binary_outcome']
778
+
779
+ # add constant
780
+ X_with_const = sm.add_constant(X)
781
+
782
+ # fit logistic regression
783
+ model = sm.Logit(y, X_with_const).fit()
784
+
785
+ # print results
786
+ print(model.summary())
787
+
788
+ # convert coefficients to odds ratios
789
+ print("\nOdds Ratios:")
790
+ odds_ratios = np.exp(model.params)
791
+ print(odds_ratios)
792
+
793
+ # interpret
794
+ for predictor in X.columns:
795
+ or_value = odds_ratios[predictor]
796
+ print(f"\n {predictor}: OR = {or_value:.4f}")
797
+ if or_value > 1:
798
+ print(f" Each unit increase multiplies odds by {or_value:.2f}x")
799
+ else:
800
+ print(f" Each unit increase divides odds by {1/or_value:.2f}x")
801
+
802
+
803
+ PHASE 6: ADVANCED STATISTICAL CONCEPTS
804
+
805
+
806
+ effect size calculation
807
+
808
+ cohen's d (two groups):
809
+ from scipy import stats
810
+
811
+ def cohen_d(group1, group2):
812
+ pooled_std = np.sqrt(
813
+ (group1.std()**2 + group2.std()**2) / 2
814
+ )
815
+ return (group1.mean() - group2.mean()) / pooled_std
816
+
817
+ d = cohen_d(group1, group2)
818
+ print(f"Cohen's d: {d:.4f}")
819
+
820
+ interpretation:
821
+ - |d| < 0.2: trivial effect
822
+ - 0.2 <= |d| < 0.5: small effect
823
+ - 0.5 <= |d| < 0.8: medium effect
824
+ - |d| >= 0.8: large effect
825
+
826
+
827
+ eta-squared (ANOVA):
828
+ # proportion of variance explained
829
+ ss_between = sum(len(g) * (np.mean(g) - np.mean(df['value']))**2
830
+ for g in groups)
831
+ ss_total = sum((x - np.mean(df['value']))**2 for x in df['value'])
832
+
833
+ eta_squared = ss_between / ss_total
834
+ print(f"Eta-squared: {eta_squared:.4f}")
835
+
836
+ interpretation:
837
+ - 0.01: small effect
838
+ - 0.06: medium effect
839
+ - 0.14: large effect
840
+
841
+
842
+ phi coefficient (2x2 contingency table):
843
+ # measure association in 2x2 table
844
+ from scipy import stats
845
+
846
+ # calculate chi-square
847
+ chi2, p, dof, expected = stats.chi2_contingency(contingency_table)
848
+
849
+ # calculate phi
850
+ n = contingency_table.values.sum()
851
+ phi = np.sqrt(chi2 / n)
852
+
853
+ print(f"Phi coefficient: {phi:.4f}")
854
+
855
+ interpretation: magnitude is read like a correlation; computed from chi-square as above, phi ranges from 0 to 1
856
+
857
+
858
+ confidence intervals
859
+
860
+ confidence interval for mean:
861
+ from scipy import stats
862
+
863
+ confidence_level = 0.95
864
+ degrees_of_freedom = len(df['column']) - 1
865
+
866
+ sample_mean = df['column'].mean()
867
+ sample_std = df['column'].std()
868
+ standard_error = sample_std / np.sqrt(len(df['column']))
869
+
870
+ # calculate margin of error
871
+ t_critical = stats.t.ppf(
872
+ (1 + confidence_level) / 2,
873
+ degrees_of_freedom
874
+ )
875
+ margin_of_error = t_critical * standard_error
876
+
877
+ # calculate CI
878
+ ci_lower = sample_mean - margin_of_error
879
+ ci_upper = sample_mean + margin_of_error
880
+
881
+ print(f"{confidence_level*100}% CI for mean: "
882
+ f"({ci_lower:.4f}, {ci_upper:.4f})")
883
+
884
+
885
+ confidence interval for proportion:
886
+ from statsmodels.stats.proportion import proportion_confint
887
+
888
+ successes = df[df['outcome'] == 'success'].shape[0]
889
+ total = df.shape[0]
890
+
891
+ ci_lower, ci_upper = proportion_confint(
892
+ successes,
893
+ total,
894
+ alpha=0.05,
895
+ method='wilson'
896
+ )
897
+
898
+ print(f"95% CI for proportion: ({ci_lower:.4f}, {ci_upper:.4f})")
899
+
900
+
901
+ bootstrap confidence intervals
902
+
903
+ non-parametric confidence intervals:
904
+ import numpy as np
905
+
906
+ def bootstrap_mean(data, n_bootstrap=10000, ci=95):
907
+ bootstrap_means = []
908
+ for _ in range(n_bootstrap):
909
+ sample = np.random.choice(data, size=len(data), replace=True)
910
+ bootstrap_means.append(np.mean(sample))
911
+
912
+ lower = np.percentile(bootstrap_means, (100 - ci) / 2)
913
+ upper = np.percentile(bootstrap_means, 100 - (100 - ci) / 2)
914
+ return lower, upper
915
+
916
+ ci_lower, ci_upper = bootstrap_mean(df['column'])
917
+ print(f"95% bootstrap CI: ({ci_lower:.4f}, {ci_upper:.4f})")
918
+
919
+
920
+ PHASE 7: TIME SERIES ANALYSIS
921
+
922
+
923
+ time series decomposition
924
+
925
+ decompose into trend, seasonal, and residual components:
926
+ from statsmodels.tsa.seasonal import seasonal_decompose
927
+
928
+ # ensure datetime index
929
+ df['date'] = pd.to_datetime(df['date'])
930
+ df.set_index('date', inplace=True)
931
+
932
+ # decompose
933
+ decomposition = seasonal_decompose(
934
+ df['value'],
935
+ model='additive',
936
+ period=12 # seasonal period
937
+ )
938
+
939
+ # plot
940
+ import matplotlib.pyplot as plt
941
+
942
+ fig = decomposition.plot()
943
+ plt.tight_layout()
944
+ plt.show()
945
+
946
+
947
+ checking for stationarity
948
+
949
+ augmented dickey-fuller test:
950
+ from statsmodels.tsa.stattools import adfuller
951
+
952
+ result = adfuller(df['value'])
953
+
954
+ print(f"ADF Statistic: {result[0]:.4f}")
955
+ print(f"P-value: {result[1]:.4f}")
956
+ print("Critical Values:")
957
+ for key, value in result[4].items():
958
+ print(f" {key}: {value:.4f}")
959
+
960
+ # interpret
961
+ if result[1] < 0.05:
962
+ print("Reject H0: time series is stationary")
963
+ else:
964
+ print("Fail to reject H0: time series is non-stationary")
965
+
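+ if the series is non-stationary, a common next step is first differencing and re-testing (minimal sketch):
+ differenced = df['value'].diff().dropna()
+
+ result_diff = adfuller(differenced)
+ print(f"ADF after differencing: statistic={result_diff[0]:.4f}, p-value={result_diff[1]:.4f}")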
966
+
967
+ autocorrelation and partial autocorrelation
968
+
969
+ autocorrelation function (ACF):
970
+ from statsmodels.tsa.stattools import acf
971
+ from statsmodels.graphics.tsaplots import plot_acf
972
+
973
+ # calculate ACF
974
+ acf_values = acf(df['value'], nlags=20)
975
+
976
+ # plot
977
+ plot_acf(df['value'], lags=20)
978
+ plt.show()
979
+
980
+ # interpret
981
+ # significant spikes at lag k: correlation at lag k
982
+
983
+
984
+ partial autocorrelation function (PACF):
985
+ from statsmodels.tsa.stattools import pacf
986
+ from statsmodels.graphics.tsaplots import plot_pacf
987
+
988
+ # calculate PACF
989
+ pacf_values = pacf(df['value'], nlags=20)
990
+
991
+ # plot
992
+ plot_pacf(df['value'], lags=20)
993
+ plt.show()
994
+
995
+
996
+ PHASE 8: STATISTICAL POWER AND SAMPLE SIZE
997
+
998
+
999
+ power analysis for t-tests
1000
+
1001
+ from statsmodels.stats.power import TTestIndPower
1002
+
1003
+ power_analysis = TTestIndPower()
1004
+
1005
+ # calculate required sample size
1006
+ effect_size = 0.5 # medium effect
1007
+ alpha = 0.05
1008
+ power = 0.8 # 80% power
1009
+ ratio = 1 # equal group sizes
1010
+
1011
+ sample_size = power_analysis.solve_power(
1012
+ effect_size=effect_size,
1013
+ alpha=alpha,
1014
+ power=power,
1015
+ ratio=ratio,
1016
+ alternative='two-sided'
1017
+ )
1018
+
1019
+ print(f"Required sample size per group: {sample_size:.0f}")
1020
+
1021
+
1022
+ power curve
1023
+ effect_sizes = np.array([0.2, 0.5, 0.8])
1024
+ sample_sizes = np.array(range(10, 500, 10))
1025
+
1026
+ power_analysis.plot_power(
1027
+ dep_var='nobs',
1028
+ nobs=sample_sizes,
1029
+ effect_size=effect_sizes,
1030
+ alpha=0.05
1031
+ )
1032
+
1033
+ plt.show()
1034
+
1035
+
1036
+ PHASE 9: STATISTICAL RULES (MANDATORY)
1037
+
1038
+
1039
+ while this skill is active, these rules are MANDATORY:
1040
+
1041
+ [1] CHECK ASSUMPTIONS before applying tests
1042
+ - normality for parametric tests
1043
+ - homogeneity of variances
1044
+ - independence of observations
1045
+ if assumptions violated, use non-parametric alternatives
1046
+
1047
+ [2] ALWAYS CALCULATE EFFECT SIZES
1048
+ p-values alone are insufficient
1049
+ report both statistical significance and practical significance
1050
+
1051
+ [3] VISUALIZE DATA before testing
1052
+ understanding the data distribution
1053
+ is crucial for selecting appropriate tests
1054
+
1055
+ [4] PRE-SPECIFY HYPOTHESES
1056
+ avoid p-hacking by testing only what you planned
1057
+ control for multiple comparisons if testing many hypotheses (see the sketch after this list)
1058
+
1059
+ [5] USE APPROPRIATE SAMPLE SIZE
1060
+ too small: low power, inconclusive results
1061
+ too large: even trivial, practically unimportant differences become statistically significant
1062
+ perform power analysis when possible
1063
+
1064
+ [6] REPORT CONFIDENCE INTERVALS
1065
+ they provide more information than p-values
1066
+ show precision of estimates
1067
+
1068
+ [7] INTERPRET IN CONTEXT
1069
+ statistical significance != practical importance
1070
+ consider domain knowledge when interpreting results
1071
+
1072
+ [8] CHECK FOR OUTLIERS
1073
+ outliers can dramatically affect results
1074
+ investigate their cause before removing
1075
+
1076
+ [9] DOCUMENT ANALYSIS DECISIONS
1077
+ why you chose specific tests
1078
+ what assumptions you checked
1079
+ any transformations applied
1080
+
1081
+ [10] REPRODUCIBLE ANALYSIS
1082
+ set random seeds
1083
+ document code clearly
1084
+ use version control
1085
+
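+ a minimal sketch of the multiple-comparison control mentioned in rule [4] (hypothetical list of p-values; multipletests is in statsmodels):
+ from statsmodels.stats.multitest import multipletests
+
+ p_values = [0.01, 0.04, 0.03, 0.20]  # hypothetical results from several planned tests
+ reject, p_adjusted, _, _ = multipletests(p_values, alpha=0.05, method='fdr_bh')
+
+ for p_raw, p_adj, rej in zip(p_values, p_adjusted, reject):
+     print(f"raw p={p_raw:.3f}  adjusted p={p_adj:.3f}  reject H0: {rej}")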
1086
+
1087
+ FINAL REMINDERS
1088
+
1089
+
1090
+ statistics is about uncertainty
1091
+
1092
+ not about proving things beyond doubt.
1093
+ about quantifying uncertainty.
1094
+ about making informed decisions.
1095
+
1096
+
1097
+ p-values are not probabilities of hypotheses
1098
+
1099
+ p-value = P(observing data at least this extreme | H0 is true)
1100
+ NOT: P(H0 is true | data)
1101
+ a common misinterpretation to avoid.
1102
+
1103
+
1104
+ correlation is not causation
1105
+
1106
+ just because two variables are related
1107
+ doesn't mean one causes the other
1108
+ consider confounding variables
1109
+
1110
+
1111
+ the goal
1112
+
1113
+ not to find "significant" results.
1114
+ to understand your data.
1115
+ to quantify evidence.
1116
+ to make informed decisions.
1117
+
1118
+ now go analyze that data.