themis-eval 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (132) hide show
  1. themis/cli/__init__.py +5 -0
  2. themis/cli/__main__.py +6 -0
  3. themis/cli/commands/__init__.py +19 -0
  4. themis/cli/commands/benchmarks.py +221 -0
  5. themis/cli/commands/comparison.py +394 -0
  6. themis/cli/commands/config_commands.py +244 -0
  7. themis/cli/commands/cost.py +214 -0
  8. themis/cli/commands/demo.py +68 -0
  9. themis/cli/commands/info.py +90 -0
  10. themis/cli/commands/leaderboard.py +362 -0
  11. themis/cli/commands/math_benchmarks.py +318 -0
  12. themis/cli/commands/mcq_benchmarks.py +207 -0
  13. themis/cli/commands/sample_run.py +244 -0
  14. themis/cli/commands/visualize.py +299 -0
  15. themis/cli/main.py +93 -0
  16. themis/cli/new_project.py +33 -0
  17. themis/cli/utils.py +51 -0
  18. themis/config/__init__.py +19 -0
  19. themis/config/loader.py +27 -0
  20. themis/config/registry.py +34 -0
  21. themis/config/runtime.py +214 -0
  22. themis/config/schema.py +112 -0
  23. themis/core/__init__.py +5 -0
  24. themis/core/conversation.py +354 -0
  25. themis/core/entities.py +164 -0
  26. themis/core/serialization.py +231 -0
  27. themis/core/tools.py +393 -0
  28. themis/core/types.py +141 -0
  29. themis/datasets/__init__.py +273 -0
  30. themis/datasets/base.py +264 -0
  31. themis/datasets/commonsense_qa.py +174 -0
  32. themis/datasets/competition_math.py +265 -0
  33. themis/datasets/coqa.py +133 -0
  34. themis/datasets/gpqa.py +190 -0
  35. themis/datasets/gsm8k.py +123 -0
  36. themis/datasets/gsm_symbolic.py +124 -0
  37. themis/datasets/math500.py +122 -0
  38. themis/datasets/med_qa.py +179 -0
  39. themis/datasets/medmcqa.py +169 -0
  40. themis/datasets/mmlu_pro.py +262 -0
  41. themis/datasets/piqa.py +146 -0
  42. themis/datasets/registry.py +201 -0
  43. themis/datasets/schema.py +245 -0
  44. themis/datasets/sciq.py +150 -0
  45. themis/datasets/social_i_qa.py +151 -0
  46. themis/datasets/super_gpqa.py +263 -0
  47. themis/evaluation/__init__.py +1 -0
  48. themis/evaluation/conditional.py +410 -0
  49. themis/evaluation/extractors/__init__.py +19 -0
  50. themis/evaluation/extractors/error_taxonomy_extractor.py +80 -0
  51. themis/evaluation/extractors/exceptions.py +7 -0
  52. themis/evaluation/extractors/identity_extractor.py +29 -0
  53. themis/evaluation/extractors/json_field_extractor.py +45 -0
  54. themis/evaluation/extractors/math_verify_extractor.py +37 -0
  55. themis/evaluation/extractors/regex_extractor.py +43 -0
  56. themis/evaluation/math_verify_utils.py +87 -0
  57. themis/evaluation/metrics/__init__.py +21 -0
  58. themis/evaluation/metrics/composite_metric.py +47 -0
  59. themis/evaluation/metrics/consistency_metric.py +80 -0
  60. themis/evaluation/metrics/exact_match.py +51 -0
  61. themis/evaluation/metrics/length_difference_tolerance.py +33 -0
  62. themis/evaluation/metrics/math_verify_accuracy.py +40 -0
  63. themis/evaluation/metrics/pairwise_judge_metric.py +141 -0
  64. themis/evaluation/metrics/response_length.py +33 -0
  65. themis/evaluation/metrics/rubric_judge_metric.py +134 -0
  66. themis/evaluation/pipeline.py +49 -0
  67. themis/evaluation/pipelines/__init__.py +15 -0
  68. themis/evaluation/pipelines/composable_pipeline.py +357 -0
  69. themis/evaluation/pipelines/standard_pipeline.py +288 -0
  70. themis/evaluation/reports.py +293 -0
  71. themis/evaluation/statistics/__init__.py +53 -0
  72. themis/evaluation/statistics/bootstrap.py +79 -0
  73. themis/evaluation/statistics/confidence_intervals.py +121 -0
  74. themis/evaluation/statistics/distributions.py +207 -0
  75. themis/evaluation/statistics/effect_sizes.py +124 -0
  76. themis/evaluation/statistics/hypothesis_tests.py +305 -0
  77. themis/evaluation/statistics/types.py +139 -0
  78. themis/evaluation/strategies/__init__.py +13 -0
  79. themis/evaluation/strategies/attempt_aware_evaluation_strategy.py +51 -0
  80. themis/evaluation/strategies/default_evaluation_strategy.py +25 -0
  81. themis/evaluation/strategies/evaluation_strategy.py +24 -0
  82. themis/evaluation/strategies/judge_evaluation_strategy.py +64 -0
  83. themis/experiment/__init__.py +5 -0
  84. themis/experiment/builder.py +151 -0
  85. themis/experiment/cache_manager.py +129 -0
  86. themis/experiment/comparison.py +631 -0
  87. themis/experiment/cost.py +310 -0
  88. themis/experiment/definitions.py +62 -0
  89. themis/experiment/export.py +690 -0
  90. themis/experiment/export_csv.py +159 -0
  91. themis/experiment/integration_manager.py +104 -0
  92. themis/experiment/math.py +192 -0
  93. themis/experiment/mcq.py +169 -0
  94. themis/experiment/orchestrator.py +373 -0
  95. themis/experiment/pricing.py +317 -0
  96. themis/experiment/storage.py +255 -0
  97. themis/experiment/visualization.py +588 -0
  98. themis/generation/__init__.py +1 -0
  99. themis/generation/agentic_runner.py +420 -0
  100. themis/generation/batching.py +254 -0
  101. themis/generation/clients.py +143 -0
  102. themis/generation/conversation_runner.py +236 -0
  103. themis/generation/plan.py +456 -0
  104. themis/generation/providers/litellm_provider.py +221 -0
  105. themis/generation/providers/vllm_provider.py +135 -0
  106. themis/generation/router.py +34 -0
  107. themis/generation/runner.py +207 -0
  108. themis/generation/strategies.py +98 -0
  109. themis/generation/templates.py +71 -0
  110. themis/generation/turn_strategies.py +393 -0
  111. themis/generation/types.py +9 -0
  112. themis/integrations/__init__.py +0 -0
  113. themis/integrations/huggingface.py +61 -0
  114. themis/integrations/wandb.py +65 -0
  115. themis/interfaces/__init__.py +83 -0
  116. themis/project/__init__.py +20 -0
  117. themis/project/definitions.py +98 -0
  118. themis/project/patterns.py +230 -0
  119. themis/providers/__init__.py +5 -0
  120. themis/providers/registry.py +39 -0
  121. themis/utils/api_generator.py +379 -0
  122. themis/utils/cost_tracking.py +376 -0
  123. themis/utils/dashboard.py +452 -0
  124. themis/utils/logging_utils.py +41 -0
  125. themis/utils/progress.py +58 -0
  126. themis/utils/tracing.py +320 -0
  127. {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/METADATA +1 -1
  128. themis_eval-0.1.1.dist-info/RECORD +134 -0
  129. themis_eval-0.1.0.dist-info/RECORD +0 -8
  130. {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/WHEEL +0 -0
  131. {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/licenses/LICENSE +0 -0
  132. {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,79 @@
1
+ """Bootstrap resampling for confidence intervals."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import random
6
+ from statistics import mean
7
+ from typing import Callable, Sequence
8
+
9
+ from .types import BootstrapResult
10
+
11
+
12
def bootstrap_ci(
    values: Sequence[float],
    statistic: Callable[[Sequence[float]], float] = mean,
    n_bootstrap: int = 10000,
    confidence_level: float = 0.95,
    seed: int | None = None,
) -> BootstrapResult:
    """Estimate a percentile bootstrap confidence interval for a statistic.

    Resampling with replacement yields a non-parametric interval that does
    not rely on any normality assumption about the data.

    Args:
        values: Sample values
        statistic: Function to compute on each bootstrap sample (default: mean)
        n_bootstrap: Number of bootstrap iterations (default: 10000)
        confidence_level: Confidence level (default: 0.95)
        seed: Random seed for reproducibility

    Returns:
        BootstrapResult with CI bounds and point estimate

    Raises:
        ValueError: If values is empty

    Example:
        >>> values = [1.2, 2.3, 3.1, 2.8, 3.5]
        >>> result = bootstrap_ci(values, statistic=mean, n_bootstrap=10000)
        >>> print(f"Mean: {result.statistic:.2f}, 95% CI: [{result.ci_lower:.2f}, {result.ci_upper:.2f}]")
    """
    if not values:
        raise ValueError("Cannot compute bootstrap CI for empty sequence")

    rng = random.Random(seed)
    sample = list(values)
    size = len(sample)

    # Point estimate on the original (un-resampled) data.
    point_estimate = statistic(sample)

    # Draw bootstrap replicates and sort their statistics for the
    # percentile method.
    replicates = sorted(
        statistic(rng.choices(sample, k=size)) for _ in range(n_bootstrap)
    )

    # Percentile interval indices.
    alpha = 1 - confidence_level
    lower_idx = int(n_bootstrap * alpha / 2)
    upper_idx = int(n_bootstrap * (1 - alpha / 2))

    # Clamp both indices into the valid range of the replicate list.
    lower_idx = max(0, min(lower_idx, n_bootstrap - 1))
    upper_idx = max(0, min(upper_idx, n_bootstrap - 1))

    return BootstrapResult(
        statistic=point_estimate,
        ci_lower=replicates[lower_idx],
        ci_upper=replicates[upper_idx],
        confidence_level=confidence_level,
        n_bootstrap=n_bootstrap,
    )
@@ -0,0 +1,121 @@
1
+ """Confidence interval computation."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import math
6
+ from statistics import mean, stdev
7
+ from typing import List, Sequence
8
+
9
+ from themis.core import entities as core_entities
10
+
11
+ from .distributions import inverse_normal_cdf, t_critical_value
12
+ from .types import ConfidenceInterval, StatisticalSummary
13
+
14
+
15
def compute_confidence_interval(
    values: Sequence[float],
    confidence_level: float = 0.95,
) -> ConfidenceInterval:
    """Compute a confidence interval for a sample mean.

    Small samples (n < 30) use a t-distribution critical value; large
    samples use a normal approximation with hard-coded z-scores for the
    common confidence levels.

    Args:
        values: Sequence of numeric values
        confidence_level: Confidence level (default: 0.95)

    Returns:
        ConfidenceInterval with bounds and statistics

    Raises:
        ValueError: If values is empty or has insufficient data
    """
    n = len(values)
    if n == 0:
        raise ValueError("Cannot compute confidence interval for empty sequence")
    if n == 1:
        # One observation: the interval degenerates to a single point.
        point = float(values[0])
        return ConfidenceInterval(
            mean=point,
            lower=point,
            upper=point,
            confidence_level=confidence_level,
            sample_size=1,
        )

    sample_mean = mean(values)
    sample_std = stdev(values)

    if n < 30:
        # Small sample: t-distribution critical value (approximation).
        critical_value = t_critical_value(n - 1, confidence_level)
    else:
        # Large sample: normal approximation. Use well-known z-scores for
        # the common levels, the probit function otherwise.
        critical_value = None
        for level, z_score in ((0.95, 1.96), (0.99, 2.576), (0.90, 1.645)):
            if abs(confidence_level - level) < 0.01:
                critical_value = z_score
                break
        if critical_value is None:
            critical_value = inverse_normal_cdf((1 + confidence_level) / 2)

    standard_error = sample_std / math.sqrt(n)
    margin_of_error = critical_value * standard_error

    return ConfidenceInterval(
        mean=sample_mean,
        lower=sample_mean - margin_of_error,
        upper=sample_mean + margin_of_error,
        confidence_level=confidence_level,
        sample_size=n,
    )
76
+
77
+
78
def compute_statistical_summary(
    scores: List[core_entities.MetricScore],
) -> StatisticalSummary:
    """Summarize a list of metric scores with descriptive statistics.

    Args:
        scores: List of MetricScore objects

    Returns:
        StatisticalSummary with descriptive statistics

    Raises:
        ValueError: If scores is empty
    """
    if not scores:
        raise ValueError("Cannot compute statistical summary for empty scores list")

    values = [score.value for score in scores]
    n = len(values)

    # Median: average the two middle values for even n, take the middle
    # element for odd n.
    ordered = sorted(values)
    mid = n // 2
    if n % 2:
        median_value = ordered[mid]
    else:
        median_value = (ordered[mid - 1] + ordered[mid]) / 2.0

    # A confidence interval (and stdev) needs at least two observations.
    ci_95 = (
        compute_confidence_interval(values, confidence_level=0.95)
        if n >= 2
        else None
    )

    return StatisticalSummary(
        metric_name=scores[0].metric_name,
        count=n,
        mean=mean(values),
        std=stdev(values) if n >= 2 else 0.0,
        min_value=min(values),
        max_value=max(values),
        median=median_value,
        confidence_interval_95=ci_95,
    )
@@ -0,0 +1,207 @@
1
+ """Helper functions for statistical distributions."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import math
6
+
7
+
8
def inverse_normal_cdf(p: float) -> float:
    """Approximate inverse normal CDF (probit function) for standard normal.

    Uses the Beasley-Springer-Moro approximation: a rational polynomial in
    the central region (|p - 0.5| < 0.42) and a log-log polynomial in the
    tails.

    Args:
        p: Probability value strictly between 0 and 1

    Returns:
        z-score z such that P(Z <= z) is approximately p for Z ~ N(0, 1)

    Raises:
        ValueError: If p is not between 0 and 1
    """
    if p <= 0 or p >= 1:
        raise ValueError("Probability must be between 0 and 1")

    # Beasley-Springer-Moro coefficients: a/b for the central rational
    # approximation, c for the tail polynomial.
    a = [2.50662823884, -18.61500062529, 41.39119773534, -25.44106049637]
    b = [-8.47351093090, 23.08336743743, -21.06224101826, 3.13082909833]
    c = [
        0.3374754822726147,
        0.9761690190917186,
        0.1607979714918209,
        0.0276438810333863,
        0.0038405729373609,
        0.0003951896511919,
        0.0000321767881768,
        0.0000002888167364,
        0.0000003960315187,
    ]

    y = p - 0.5
    if abs(y) < 0.42:
        # Central region: x = y * P(r) / Q(r) with r = y^2, where Q is a
        # degree-4 polynomial with constant term 1.
        # Bug fix: the previous denominator omitted the final "* r",
        # collapsing Q to degree 3 with constant term b[0] + 1, which
        # produced wrong (even wrong-signed) results near the median.
        r = y * y
        return (
            y
            * (((a[3] * r + a[2]) * r + a[1]) * r + a[0])
            / ((((b[3] * r + b[2]) * r + b[1]) * r + b[0]) * r + 1.0)
        )

    # Tail region: evaluate the polynomial in log(-log(q)) where q is the
    # probability mass in the *near* tail.
    # Bug fix: the branch was inverted (it used p for the upper tail and
    # 1 - p for the lower tail), yielding wildly wrong tail quantiles.
    r = 1 - p if y > 0 else p
    r = math.log(-math.log(r))
    x = c[0] + r * (
        c[1]
        + r
        * (
            c[2]
            + r
            * (c[3] + r * (c[4] + r * (c[5] + r * (c[6] + r * (c[7] + r * c[8])))))
        )
    )
    # The tail polynomial gives the magnitude; restore the sign for the
    # lower tail.
    if y < 0:
        x = -x
    return x
67
+
68
+
69
def t_critical_value(df: int, confidence_level: float) -> float:
    """Approximate two-tailed critical value of Student's t-distribution.

    Delegates to scipy when it is importable. Otherwise falls back to a
    normal approximation for df >= 30, a small lookup table (with linear
    interpolation) for the common 95%/99% levels, or the probit function
    for other confidence levels.

    Args:
        df: Degrees of freedom
        confidence_level: Confidence level (e.g., 0.95)

    Returns:
        Critical value for two-tailed test
    """
    try:
        from scipy import stats as scipy_stats
    except Exception:  # pragma: no cover - optional dependency
        scipy_stats = None

    if scipy_stats is not None:
        tail = (1 - confidence_level) / 2
        return float(scipy_stats.t.ppf(1 - tail, df))

    # Large df: the t-distribution is well approximated by the normal.
    if df >= 30:
        tail = (1 - confidence_level) / 2
        return inverse_normal_cdf(1 - tail)

    # Two-tailed critical values for small df at the common levels.
    table_95 = {
        1: 12.706, 2: 4.303, 3: 3.182, 4: 2.776, 5: 2.571,
        6: 2.447, 7: 2.365, 8: 2.306, 9: 2.262, 10: 2.228,
        15: 2.131, 20: 2.086, 25: 2.060, 29: 2.045,
    }
    table_99 = {
        1: 63.657, 2: 9.925, 3: 5.841, 4: 4.604, 5: 4.032,
        6: 3.707, 7: 3.499, 8: 3.355, 9: 3.250, 10: 3.169,
        15: 2.947, 20: 2.845, 25: 2.787, 29: 2.756,
    }

    if abs(confidence_level - 0.95) < 0.01:
        table = table_95
    elif abs(confidence_level - 0.99) < 0.01:
        table = table_99
    else:
        # Uncommon level with no table: normal approximation.
        tail = (1 - confidence_level) / 2
        return inverse_normal_cdf(1 - tail)

    if df in table:
        return table[df]

    # df below the table clamps to the first entry; df between entries is
    # linearly interpolated; df past the table clamps to the last entry.
    keys = sorted(table)
    if df < keys[0]:
        return table[keys[0]]
    for low, high in zip(keys, keys[1:]):
        if low < df < high:
            weight = (df - low) / (high - low)
            return table[low] * (1 - weight) + table[high] * weight
    return table[keys[-1]]
157
+
158
+
159
def t_to_p_value(t_stat: float, df: int) -> float:
    """Approximate the two-tailed p-value for a t-statistic.

    Uses scipy when available. Otherwise applies a normal approximation
    for df >= 30 and a rough exponential-decay heuristic for small df.

    Args:
        t_stat: t-statistic value
        df: Degrees of freedom

    Returns:
        Two-tailed p-value
    """
    try:
        from scipy import stats as scipy_stats
    except Exception:  # pragma: no cover - optional dependency
        scipy_stats = None

    abs_t = abs(t_stat)

    if scipy_stats is not None:
        # Exact tail mass from the t CDF, doubled for the two-tailed test.
        return float(2 * scipy_stats.t.cdf(-abs_t, df))

    if df >= 30:
        # Large df: the t-distribution is close to standard normal.
        return 2 * normal_cdf(-abs_t)

    # Small df: very rough approximation only.
    if abs_t < 0.5:
        return 1.0
    if abs_t > 10:
        return 0.0001
    # Exponential decay scaled by a df-dependent correction factor.
    base_p = math.exp(-abs_t * 0.5) * (df / (df + t_stat**2))
    return min(1.0, 2 * base_p)
196
+
197
+
198
def normal_cdf(x: float) -> float:
    """Standard normal cumulative distribution function.

    Computed via the identity Phi(x) = (1 + erf(x / sqrt(2))) / 2.

    Args:
        x: Value to evaluate CDF at

    Returns:
        Cumulative probability
    """
    scaled = x / math.sqrt(2)
    return (1 + math.erf(scaled)) / 2
@@ -0,0 +1,124 @@
1
+ """Effect size measures for statistical comparisons."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import math
6
+ from typing import Sequence
7
+
8
+ from .types import EffectSize
9
+
10
+
11
def cohens_h(p1: float, p2: float) -> EffectSize:
    """Compute Cohen's h effect size for comparing two proportions.

    Cohen's h measures the distance between two proportions using
    the arcsine transformation. This is useful for comparing success
    rates, accuracy proportions, etc.

    Args:
        p1: Proportion for group 1 (e.g., baseline accuracy), in [0, 1]
        p2: Proportion for group 2 (e.g., treatment accuracy), in [0, 1]

    Returns:
        EffectSize with value and interpretation

    Raises:
        ValueError: If either proportion is outside [0, 1]

    Interpretation:
        - |h| < 0.2: negligible
        - 0.2 <= |h| < 0.5: small
        - 0.5 <= |h| < 0.8: medium
        - |h| >= 0.8: large

    Example:
        >>> # Baseline: 65% accuracy, Treatment: 75% accuracy
        >>> effect = cohens_h(0.65, 0.75)
        >>> print(f"Effect: {effect.value:.3f} ({effect.interpretation})")
    """
    # Validate up front so callers get a clear message instead of a bare
    # "math domain error" from sqrt/asin on out-of-range input.
    if not (0.0 <= p1 <= 1.0) or not (0.0 <= p2 <= 1.0):
        raise ValueError("Proportions must be between 0 and 1")

    # Arcsine (variance-stabilizing) transformation of each proportion.
    phi1 = 2 * math.asin(math.sqrt(p1))
    phi2 = 2 * math.asin(math.sqrt(p2))

    h = phi2 - phi1

    # Map |h| onto Cohen's conventional effect-size labels.
    abs_h = abs(h)
    if abs_h < 0.2:
        interpretation = "negligible"
    elif abs_h < 0.5:
        interpretation = "small"
    elif abs_h < 0.8:
        interpretation = "medium"
    else:
        interpretation = "large"

    return EffectSize(
        name="cohen_h",
        value=h,
        interpretation=interpretation,
    )
58
+
59
+
60
def cohens_d(group1: Sequence[float], group2: Sequence[float]) -> EffectSize:
    """Compute Cohen's d effect size for comparing two means.

    Cohen's d measures the standardized difference between two group means
    (mean2 - mean1, divided by the pooled standard deviation). This is the
    most common effect size for t-tests.

    Args:
        group1: Values from first group (e.g., baseline)
        group2: Values from second group (e.g., treatment)

    Returns:
        EffectSize with value and interpretation

    Raises:
        ValueError: If either group has fewer than 2 values

    Interpretation:
        - |d| < 0.2: negligible
        - 0.2 <= |d| < 0.5: small
        - 0.5 <= |d| < 0.8: medium
        - |d| >= 0.8: large

    Example:
        >>> baseline = [1.2, 1.5, 1.3, 1.4]
        >>> treatment = [1.8, 2.0, 1.9, 2.1]
        >>> effect = cohens_d(baseline, treatment)
    """
    from statistics import mean, stdev

    n1 = len(group1)
    n2 = len(group2)

    # stdev needs at least two observations per group.
    if n1 < 2 or n2 < 2:
        raise ValueError("Each group must have at least 2 values")

    mean1 = mean(group1)
    mean2 = mean(group2)
    std1 = stdev(group1)
    std2 = stdev(group2)

    # Pooled standard deviation across both groups.
    pooled_std = math.sqrt(((n1 - 1) * std1**2 + (n2 - 1) * std2**2) / (n1 + n2 - 2))

    if pooled_std == 0:
        # Degenerate case: zero variance in both groups.
        if mean1 == mean2:
            d = 0.0
        else:
            # Bug fix: previously this always returned +inf, discarding the
            # direction of the effect; preserve the sign of (mean2 - mean1).
            d = math.inf if mean2 > mean1 else -math.inf
    else:
        d = (mean2 - mean1) / pooled_std

    # Map |d| onto Cohen's conventional effect-size labels.
    abs_d = abs(d)
    if abs_d < 0.2:
        interpretation = "negligible"
    elif abs_d < 0.5:
        interpretation = "small"
    elif abs_d < 0.8:
        interpretation = "medium"
    else:
        interpretation = "large"

    return EffectSize(
        name="cohen_d",
        value=d,
        interpretation=interpretation,
    )