dsba-python1-alpha 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1 @@
1
+ # workshops sub-package
@@ -0,0 +1,286 @@
1
+ """
2
+ dsba_checkers.workshops.w14_scipy
3
+ ──────────────────────────────────
4
+ Checker for Workshop 14 — Using scipy.stats
5
+ Dataset bundled inside the package (insurance.csv).
6
+
7
+ Usage:
8
+ from dsba_checkers.workshops.w14_scipy import q1, q2, q3, q4, q5, q6, q7
9
+ """
10
+
11
+ import importlib.resources
12
+ import numpy as np
13
+ import pandas as pd
14
+ from scipy import stats
15
+
16
+ from ..base_task import Task
17
+
18
+ # ── Load bundled data once at import time ────────────────────────────────────
19
+
20
def _load_df():
    """Read the bundled ``insurance.csv`` into a pandas DataFrame.

    The file is looked up as a resource of the ``dsba_checkers.data``
    package, so no network or working-directory access is needed.

    Returns:
        pandas.DataFrame: the parsed contents of ``insurance.csv``.
    """
    # Detect the modern resources API explicitly instead of catching
    # AttributeError around the whole read: an AttributeError raised while
    # parsing the CSV must not be mistaken for "running on Python 3.8".
    files = getattr(importlib.resources, "files", None)
    if files is None:
        # Python 3.8 fallback
        with importlib.resources.open_text("dsba_checkers.data", "insurance.csv") as f:
            return pd.read_csv(f)

    # Python 3.9+
    ref = files("dsba_checkers.data").joinpath("insurance.csv")
    with importlib.resources.as_file(ref) as path:
        return pd.read_csv(path)


# Loaded once at import time and shared by every checker in this module.
_df = _load_df()
32
+
33
+
34
+ # ── Task 1 — Descriptive statistics for `charges` ───────────────────────────
35
+
36
class _Q1_DescriptiveStats(Task):
    """Checker for Task 1: mean, median, skewness and IQR outliers of ``charges``."""

    _hint = (
        "Use df['charges'].mean(), .median(), stats.skew(df['charges']). "
        "Outliers: q1 = quantile(0.25), q3 = quantile(0.75), "
        "iqr = q3 - q1, then keep rows outside [q1 - 1.5*iqr, q3 + 1.5*iqr]."
    )
    _solution = """\
mean_charges = df['charges'].mean() # 13 270.42
median_charges = df['charges'].median() # 9 382.03
skewness_charges = stats.skew(df['charges']) # 1.515

q1_val = df['charges'].quantile(0.25)
q3_val = df['charges'].quantile(0.75)
iqr = q3_val - q1_val
outliers = df[(df['charges'] < q1_val - 1.5*iqr) | (df['charges'] > q3_val + 1.5*iqr)]
n_outliers = len(outliers) # 139"""

    def _check(self, mean_charges, median_charges, skewness_charges, n_outliers):
        """Compare the submitted statistics with values recomputed from the bundled data.

        Raises AssertionError with an explanatory message on the first
        mismatch; returns True when every value agrees.
        """
        charges = _df['charges']

        # Reference values, recomputed from the bundled dataset.
        expected_mean = charges.mean()
        expected_median = charges.median()
        expected_skew = float(stats.skew(charges))

        # Tukey fences: anything further than 1.5 * IQR from the quartiles.
        lower_quartile = charges.quantile(0.25)
        upper_quartile = charges.quantile(0.75)
        spread = upper_quartile - lower_quartile
        is_outlier = (
            (charges < lower_quartile - 1.5*spread)
            | (charges > upper_quartile + 1.5*spread)
        )
        expected_count = int(is_outlier.sum())

        assert self._approx_equal(mean_charges, expected_mean), (
            f"mean_charges ≈ {expected_mean:,.2f}, got {mean_charges:,.2f}.")
        assert self._approx_equal(median_charges, expected_median), (
            f"median_charges ≈ {expected_median:,.2f}, got {median_charges:,.2f}.")
        assert self._approx_equal(skewness_charges, expected_skew, tol=0.02), (
            f"skewness ≈ {expected_skew:.3f}, got {skewness_charges:.3f}. "
            "Use stats.skew(df['charges']).")
        assert int(n_outliers) == expected_count, (
            f"Expected {expected_count} IQR outliers, got {n_outliers}.")
        return True
72
+
73
+
74
+ # ── Task 2 — Pearson correlation: age vs charges ─────────────────────────────
75
+
76
class _Q2_PearsonAge(Task):
    """Checker for Task 2: Pearson correlation between ``age`` and ``charges``."""

    _hint = (
        "stats.pearsonr(df['age'], df['charges']) returns (r, p_value). "
        "Check: is p < 0.05?"
    )
    _solution = """\
r_age_charges, p_age_charges = stats.pearsonr(df['age'], df['charges'])
# r ≈ 0.2990, p ≈ 0.0"""

    def _check(self, r, p):
        """Validate the submitted correlation coefficient and its p-value.

        ``r`` must match the reference coefficient; ``p`` only has to be
        below the 0.05 significance threshold. Returns True on success.
        """
        expected = stats.pearsonr(_df['age'], _df['charges'])
        expected_r = expected[0]
        expected_p = expected[1]

        assert self._approx_equal(r, expected_r, tol=0.005), (
            f"r ≈ {expected_r:.4f}, got {r:.4f}. "
            "Pass df['age'] and df['charges'] to stats.pearsonr().")
        assert p < 0.05, (
            f"p-value should be < 0.05 (actual ≈ {expected_p:.2e}). "
            "Double-check which columns you passed.")
        return True
94
+
95
+
96
+ # ── Task 3 — Pearson vs Spearman, split by smoker ────────────────────────────
97
+
98
class _Q3_PearsonSpearman(Task):
    """Checker for Task 3: Pearson vs Spearman for ``bmi``/``charges``, overall and by smoker status."""

    _hint = (
        "stats.pearsonr(x, y) and stats.spearmanr(x, y) both return (stat, p). "
        "Split first: non_smokers = df[df['smoker'] == 'no']."
    )
    _solution = """\
pearson_all = stats.pearsonr( df['bmi'], df['charges'])
spearman_all = stats.spearmanr(df['bmi'], df['charges'])

ns = df[df['smoker'] == 'no']
pearson_ns = stats.pearsonr( ns['bmi'], ns['charges'])
spearman_ns = stats.spearmanr(ns['bmi'], ns['charges'])

s = df[df['smoker'] == 'yes']
pearson_s = stats.pearsonr( s['bmi'], s['charges'])
spearman_s = stats.spearmanr(s['bmi'], s['charges'])"""

    def _check(self, pearson_all, spearman_all, pearson_ns, spearman_ns, pearson_s, spearman_s):
        """Compare the statistic (element 0) of each submitted result with its reference.

        Each argument is indexed with ``[0]``, i.e. it is expected to be
        the (statistic, p-value) result returned by scipy. Returns True
        when all six coefficients match.
        """
        non_smokers = _df[_df['smoker'] == 'no']
        smokers = _df[_df['smoker'] == 'yes']

        # (label, submitted coefficient, reference coefficient)
        cases = (
            ("pearson_all", pearson_all[0],
             stats.pearsonr(_df['bmi'], _df['charges'])[0]),
            ("spearman_all", spearman_all[0],
             stats.spearmanr(_df['bmi'], _df['charges'])[0]),
            ("pearson_ns", pearson_ns[0],
             stats.pearsonr(non_smokers['bmi'], non_smokers['charges'])[0]),
            ("spearman_ns", spearman_ns[0],
             stats.spearmanr(non_smokers['bmi'], non_smokers['charges'])[0]),
            ("pearson_s", pearson_s[0],
             stats.pearsonr(smokers['bmi'], smokers['charges'])[0]),
            ("spearman_s", spearman_s[0],
             stats.spearmanr(smokers['bmi'], smokers['charges'])[0]),
        )

        for label, submitted, expected in cases:
            assert self._approx_equal(submitted, expected, tol=0.005), (
                f"{label}: expected ≈ {expected:.4f}, got {submitted:.4f}.")
        return True
130
+
131
+
132
+ # ── Task 4 — Normality tests for BMI ─────────────────────────────────────────
133
+
134
class _Q4_NormalityBMI(Task):
    """Checker for Task 4: Shapiro-Wilk and D'Agostino normality tests on ``bmi``."""

    _hint = (
        "stats.shapiro(df['bmi']) → (W, p). "
        "stats.normaltest(df['bmi']) → (stat, p)."
    )
    _solution = """\
stat_sw, p_sw = stats.shapiro(df['bmi'])
stat_da, p_da = stats.normaltest(df['bmi'])
# Shapiro-Wilk: W ≈ 0.9977, p ≈ 0.0099
# D'Agostino K²: stat ≈ 1.01, p ≈ 0.603"""

    def _check(self, stat_sw, p_sw, stat_da, p_da):
        """Validate both test statistics and both p-values against recomputed references.

        Returns True when all four values match within their tolerances.
        """
        bmi = _df['bmi']

        shapiro_result = stats.shapiro(bmi)
        expected_w, expected_w_p = shapiro_result[0], shapiro_result[1]

        dagostino_result = stats.normaltest(bmi)
        expected_k2, expected_k2_p = dagostino_result[0], dagostino_result[1]

        assert self._approx_equal(stat_sw, expected_w, tol=0.001), (
            f"Shapiro W ≈ {expected_w:.4f}, got {stat_sw:.4f}.")
        assert self._approx_equal(p_sw, expected_w_p, tol=0.05), (
            f"Shapiro p ≈ {expected_w_p:.4f}, got {p_sw:.4f}.")
        assert self._approx_equal(stat_da, expected_k2, tol=0.05), (
            f"D'Agostino stat ≈ {expected_k2:.4f}, got {stat_da:.4f}. Use stats.normaltest().")
        assert self._approx_equal(p_da, expected_k2_p, tol=0.05), (
            f"D'Agostino p ≈ {expected_k2_p:.4f}, got {p_da:.4f}.")
        return True
157
+
158
+
159
+ # ── Task 5 — Two-sample t-test: smokers vs non-smokers ───────────────────────
160
+
161
class _Q5_TTest(Task):
    """Checker for Task 5: Welch's t-test and Cohen's d, smokers vs non-smokers."""

    _hint = (
        "Levene's test: stats.levene(smoker_charges, nonsmoker_charges). "
        "Welch's t-test: stats.ttest_ind(..., equal_var=False). "
        "Cohen's d = (mean1 - mean2) / pooled_std, "
        "where pooled_std = sqrt(((n1-1)*s1² + (n2-1)*s2²) / (n1+n2-2))."
    )
    _solution = """\
sc = df[df['smoker'] == 'yes']['charges']
nsc = df[df['smoker'] == 'no']['charges']

stat_levene, p_levene = stats.levene(sc, nsc)
stat_t, p_t = stats.ttest_ind(sc, nsc, equal_var=False)

n1, n2 = len(sc), len(nsc)
s1, s2 = sc.std(ddof=1), nsc.std(ddof=1)
pooled_std = np.sqrt(((n1-1)*s1**2 + (n2-1)*s2**2) / (n1+n2-2))
cohens_d = (sc.mean() - nsc.mean()) / pooled_std # ≈ 2.09"""

    def _check(self, stat_t, p_t, cohens_d, stat_levene=None, p_levene=None):
        """Validate the t statistic, its p-value, Cohen's d and (optionally) Levene's statistic.

        The t statistic is compared by absolute value, so either group
        order is accepted for it; Cohen's d is compared with its sign
        (smokers minus non-smokers). ``stat_levene`` is checked only when
        supplied. ``p_levene`` is accepted but not validated here.
        Returns True on success.
        """
        smoker_charges = _df[_df['smoker'] == 'yes']['charges']
        other_charges = _df[_df['smoker'] == 'no']['charges']

        welch = stats.ttest_ind(smoker_charges, other_charges, equal_var=False)
        expected_t = welch[0]

        # Pooled standard deviation from the two sample variances (ddof=1).
        size_a, size_b = len(smoker_charges), len(other_charges)
        sd_a, sd_b = smoker_charges.std(ddof=1), other_charges.std(ddof=1)
        weighted_var_sum = (size_a-1)*sd_a**2 + (size_b-1)*sd_b**2
        pooled_sd = np.sqrt(weighted_var_sum / (size_a+size_b-2))
        expected_d = float((smoker_charges.mean() - other_charges.mean()) / pooled_sd)

        assert self._approx_equal(abs(stat_t), abs(expected_t), tol=0.005), (
            f"t ≈ {expected_t:.4f}, got {stat_t:.4f}. "
            "Use stats.ttest_ind(sc, nsc, equal_var=False).")
        assert p_t < 1e-10, f"p should be ≈ 0, got {p_t}."
        assert self._approx_equal(cohens_d, expected_d, tol=0.02), (
            f"Cohen's d ≈ {expected_d:.4f}, got {cohens_d:.4f}.")

        if stat_levene is None:
            return True

        expected_levene = stats.levene(smoker_charges, other_charges)[0]
        assert self._approx_equal(stat_levene, expected_levene, tol=0.05), (
            f"Levene stat ≈ {expected_levene:.4f}, got {stat_levene:.4f}.")
        return True
200
+
201
+
202
+ # ── Task 6 — Paired t-test: wellness BMI scenario ────────────────────────────
203
+
204
class _Q6_PairedTTest(Task):
    """Checker for Task 6: paired t-test on simulated before/after BMI values."""

    _hint = (
        "diff = before_bmi - after_bmi. "
        "mean_diff = np.mean(diff), std_diff = np.std(diff, ddof=1). "
        "stats.ttest_rel(before_bmi, after_bmi) → (stat, p)."
    )
    _solution = """\
diff = before_bmi - after_bmi
mean_diff = np.mean(diff) # ≈ 2.45
std_diff = np.std(diff, ddof=1)
stat_paired, p_paired = stats.ttest_rel(before_bmi, after_bmi)
# stat ≈ 4.32, p ≈ 0.00017"""

    def _check(self, mean_diff, std_diff, stat_paired, p_paired):
        """Validate the paired-difference summary and the paired t-test result.

        The reference sample is regenerated here from seed 123. The t
        statistic is compared by absolute value, and the p-value only has
        to be below 0.05. Returns True on success.
        """
        # Use a private legacy generator rather than np.random.seed(123):
        # seeding the global generator would silently reset the random
        # state of the caller's session on every check. RandomState(123)
        # yields the same draws as the seeded module-level functions.
        rng = np.random.RandomState(123)
        before = rng.normal(32, 5, 30)
        after = np.maximum(before - rng.normal(2.5, 3, 30), 18)
        diff = before - after
        ref_mean = float(np.mean(diff))
        ref_std = float(np.std(diff, ddof=1))
        ref_stat, ref_p = stats.ttest_rel(before, after)

        assert self._approx_equal(mean_diff, ref_mean, tol=0.02), (
            f"mean_diff ≈ {ref_mean:.4f}, got {mean_diff:.4f}.")
        assert self._approx_equal(std_diff, ref_std, tol=0.02), (
            f"std_diff ≈ {ref_std:.4f}, got {std_diff:.4f}. Use ddof=1.")
        assert self._approx_equal(abs(stat_paired), abs(ref_stat), tol=0.02), (
            f"t ≈ {ref_stat:.4f}, got {stat_paired:.4f}.")
        assert p_paired < 0.05, f"p should be < 0.05, got {p_paired:.4f}."
        return True
234
+
235
+
236
+ # ── Task 7 — Bonferroni correction across regions ────────────────────────────
237
+
238
class _Q7_Bonferroni(Task):
    """Checker for Task 7: Bonferroni-corrected pairwise t-tests of ``charges`` across regions."""

    _hint = (
        "4 regions → C(4,2) = 6 pairs. bonferroni_alpha = 0.05 / 6. "
        "Loop with combinations(regions, 2), call stats.ttest_ind(d1, d2), "
        "count pairs where p_val < bonferroni_alpha."
    )
    _solution = """\
from itertools import combinations
regions = df['region'].unique()
n_comparisons = len(list(combinations(regions, 2))) # 6
bonferroni_alpha = 0.05 / n_comparisons # ≈ 0.008333

n_significant_pairs = 0
for r1, r2 in combinations(regions, 2):
    _, p = stats.ttest_ind(df[df['region']==r1]['charges'],
                           df[df['region']==r2]['charges'])
    if p < bonferroni_alpha:
        n_significant_pairs += 1 # result: 1"""

    def _check(self, n_comparisons, bonferroni_alpha, n_significant_pairs):
        """Validate the comparison count, the corrected alpha and the number of significant pairs.

        All three references are recomputed from the bundled data.
        Returns True on success.
        """
        from itertools import combinations

        region_names = _df['region'].unique()
        region_pairs = list(combinations(region_names, 2))

        expected_comparisons = len(region_pairs)
        expected_alpha = 0.05 / expected_comparisons

        # Count the region pairs whose t-test p-value beats the corrected alpha.
        expected_significant = 0
        for first, second in region_pairs:
            first_charges = _df[_df['region']==first]['charges']
            second_charges = _df[_df['region']==second]['charges']
            if stats.ttest_ind(first_charges, second_charges)[1] < expected_alpha:
                expected_significant += 1

        assert int(n_comparisons) == expected_comparisons, (
            f"Expected {expected_comparisons} comparisons (C(4,2)), got {n_comparisons}.")
        assert self._approx_equal(bonferroni_alpha, expected_alpha, tol=0.001), (
            f"bonferroni_alpha = 0.05/{expected_comparisons} ≈ {expected_alpha:.6f}, got {bonferroni_alpha:.6f}.")
        assert int(n_significant_pairs) == expected_significant, (
            f"Expected {expected_significant} significant pair(s), got {n_significant_pairs}.")
        return True
274
+
275
+
276
+ # ── Public objects ────────────────────────────────────────────────────────────
277
+
278
# One ready-made checker instance per workshop task; these are the names
# that notebooks import from this module.
q1 = _Q1_DescriptiveStats()
q2 = _Q2_PearsonAge()
q3 = _Q3_PearsonSpearman()
q4 = _Q4_NormalityBMI()
q5 = _Q5_TTest()
q6 = _Q6_PairedTTest()
q7 = _Q7_Bonferroni()

# Only the checker instances are part of the public API.
__all__ = [
    "q1",
    "q2",
    "q3",
    "q4",
    "q5",
    "q6",
    "q7",
]
@@ -0,0 +1,47 @@
1
+ Metadata-Version: 2.4
2
+ Name: dsba-python1-alpha
3
+ Version: 0.1.0
4
+ Summary: Auto-checkers for the DSBA Python for Data Science course
5
+ Project-URL: Homepage, https://github.com/your-org/dsba-checkers
6
+ License: MIT
7
+ Requires-Python: >=3.8
8
+ Requires-Dist: numpy>=1.21
9
+ Requires-Dist: pandas>=1.3
10
+ Requires-Dist: scipy>=1.7
11
+ Description-Content-Type: text/markdown
12
+
13
+ # dsba-python1-alpha
14
+
15
+ Auto-checkers for the **DSBA Python for Data Science** course.
16
+ Inspired by Kaggle's `learntools` — same `.check()` / `.hint()` / `.solution()` API,
17
+ all datasets bundled inside the package (no internet required).
18
+
19
+ ## Install
20
+
21
+ ```bash
22
+ pip install dsba-python1-alpha
23
+ ```
24
+
25
+ ## Workshop 14 — scipy.stats
26
+
27
+ ```python
28
+ from dsba_checkers.workshops.w14_scipy import q1, q2, q3, q4, q5, q6, q7
29
+
30
+ # After writing your code:
31
+ q1.check(mean_charges, median_charges, skewness_charges, n_outliers)
32
+ # ✓ Correct!
33
+
34
+ # Stuck? Ask for a hint:
35
+ q1.hint()
36
+
37
+ # Give up? Show the solution (also shown automatically after 3 failed attempts):
38
+ q1.solution()
39
+ ```
40
+
41
+ ## Available workshops
42
+
43
+ | Module | Topic |
44
+ |--------|-------|
45
+ | `dsba_checkers.workshops.w14_scipy` | scipy.stats: descriptive stats, correlation, normality tests, t-tests, Bonferroni |
46
+
47
+ More workshops coming in future versions.
@@ -0,0 +1,9 @@
1
+ dsba_checkers/__init__.py,sha256=-Nql8Eus0KntYBV-SFhWnNYurjoD7EsNRL8fI5w1TWk,395
2
+ dsba_checkers/base_task.py,sha256=nywQPKSQ3R4YXLd3fdoYo4GTzBUUVwejLGYlpFVNtRo,1808
3
+ dsba_checkers/data/__init__.py,sha256=V1sknNbutTDg3wiZ0UuYMwWQiLKMMMK9FL-YZfsoJEE,53
4
+ dsba_checkers/workshops/__init__.py,sha256=puMXpi3fz5ZBMyKz-gYuefbv9UF7ySbc8hO2zBRs7G8,24
5
+ dsba_checkers/workshops/w14_scipy.py,sha256=Dsj05RN9hfkurXhZw8cHAyueuVCypchzSFehgGbY8V4,12541
6
+ dsba_checkers/data/insurance.csv,sha256=UFwcvC5j0DY7rFlQFWPfJTCq30zbnP7iJvTvMvVGgoE,54288
7
+ dsba_python1_alpha-0.1.0.dist-info/METADATA,sha256=Waryy72vlpxyTz4q3-HZU0fbegQBTguwoGwmw_CSyp8,1225
8
+ dsba_python1_alpha-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
9
+ dsba_python1_alpha-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.30.1
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any