@wentorai/research-plugins 1.2.3 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142) hide show
  1. package/README.md +16 -8
  2. package/openclaw.plugin.json +10 -3
  3. package/package.json +2 -5
  4. package/skills/analysis/dataviz/SKILL.md +25 -0
  5. package/skills/analysis/dataviz/chart-image-generator/SKILL.md +1 -1
  6. package/skills/analysis/econometrics/SKILL.md +23 -0
  7. package/skills/analysis/econometrics/robustness-checks/SKILL.md +1 -1
  8. package/skills/analysis/statistics/SKILL.md +21 -0
  9. package/skills/analysis/statistics/data-anomaly-detection/SKILL.md +1 -1
  10. package/skills/analysis/statistics/ml-experiment-tracker/SKILL.md +1 -1
  11. package/skills/analysis/statistics/{senior-data-scientist-guide → modeling-strategy-guide}/SKILL.md +5 -5
  12. package/skills/analysis/wrangling/SKILL.md +21 -0
  13. package/skills/analysis/wrangling/csv-data-analyzer/SKILL.md +1 -1
  14. package/skills/analysis/wrangling/data-cog-guide/SKILL.md +1 -1
  15. package/skills/domains/ai-ml/SKILL.md +37 -0
  16. package/skills/domains/biomedical/SKILL.md +28 -0
  17. package/skills/domains/biomedical/genomas-guide/SKILL.md +1 -1
  18. package/skills/domains/biomedical/med-researcher-guide/SKILL.md +1 -1
  19. package/skills/domains/biomedical/medgeclaw-guide/SKILL.md +1 -1
  20. package/skills/domains/business/SKILL.md +17 -0
  21. package/skills/domains/business/architecture-design-guide/SKILL.md +1 -1
  22. package/skills/domains/chemistry/SKILL.md +19 -0
  23. package/skills/domains/chemistry/computational-chemistry-guide/SKILL.md +1 -1
  24. package/skills/domains/cs/SKILL.md +21 -0
  25. package/skills/domains/ecology/SKILL.md +16 -0
  26. package/skills/domains/economics/SKILL.md +20 -0
  27. package/skills/domains/economics/post-labor-economics/SKILL.md +1 -1
  28. package/skills/domains/economics/pricing-psychology-guide/SKILL.md +1 -1
  29. package/skills/domains/education/SKILL.md +19 -0
  30. package/skills/domains/education/academic-study-methods/SKILL.md +1 -1
  31. package/skills/domains/education/edumcp-guide/SKILL.md +1 -1
  32. package/skills/domains/finance/SKILL.md +19 -0
  33. package/skills/domains/finance/akshare-finance-data/SKILL.md +1 -1
  34. package/skills/domains/finance/options-analytics-agent-guide/SKILL.md +1 -1
  35. package/skills/domains/finance/stata-accounting-research/SKILL.md +1 -1
  36. package/skills/domains/geoscience/SKILL.md +17 -0
  37. package/skills/domains/humanities/SKILL.md +16 -0
  38. package/skills/domains/humanities/history-research-guide/SKILL.md +1 -1
  39. package/skills/domains/humanities/political-history-guide/SKILL.md +1 -1
  40. package/skills/domains/law/SKILL.md +19 -0
  41. package/skills/domains/math/SKILL.md +17 -0
  42. package/skills/domains/pharma/SKILL.md +17 -0
  43. package/skills/domains/physics/SKILL.md +16 -0
  44. package/skills/domains/social-science/SKILL.md +17 -0
  45. package/skills/domains/social-science/sociology-research-methods/SKILL.md +1 -1
  46. package/skills/literature/discovery/SKILL.md +20 -0
  47. package/skills/literature/discovery/paper-recommendation-guide/SKILL.md +1 -1
  48. package/skills/literature/discovery/semantic-paper-radar/SKILL.md +1 -1
  49. package/skills/literature/fulltext/SKILL.md +26 -0
  50. package/skills/literature/metadata/SKILL.md +35 -0
  51. package/skills/literature/metadata/doi-content-negotiation/SKILL.md +4 -0
  52. package/skills/literature/metadata/doi-resolution-guide/SKILL.md +4 -0
  53. package/skills/literature/metadata/orcid-api/SKILL.md +4 -0
  54. package/skills/literature/metadata/orcid-integration-guide/SKILL.md +4 -0
  55. package/skills/literature/search/SKILL.md +43 -0
  56. package/skills/literature/search/paper-search-mcp-guide/SKILL.md +1 -1
  57. package/skills/research/automation/SKILL.md +21 -0
  58. package/skills/research/deep-research/SKILL.md +24 -0
  59. package/skills/research/deep-research/auto-deep-research-guide/SKILL.md +1 -1
  60. package/skills/research/deep-research/in-depth-research-guide/SKILL.md +1 -1
  61. package/skills/research/funding/SKILL.md +20 -0
  62. package/skills/research/methodology/SKILL.md +24 -0
  63. package/skills/research/paper-review/SKILL.md +19 -0
  64. package/skills/research/paper-review/paper-critique-framework/SKILL.md +1 -1
  65. package/skills/tools/code-exec/SKILL.md +18 -0
  66. package/skills/tools/diagram/SKILL.md +20 -0
  67. package/skills/tools/document/SKILL.md +21 -0
  68. package/skills/tools/knowledge-graph/SKILL.md +21 -0
  69. package/skills/tools/ocr-translate/SKILL.md +18 -0
  70. package/skills/tools/ocr-translate/handwriting-recognition-guide/SKILL.md +2 -0
  71. package/skills/tools/ocr-translate/latex-ocr-guide/SKILL.md +2 -0
  72. package/skills/tools/scraping/SKILL.md +17 -0
  73. package/skills/writing/citation/SKILL.md +33 -0
  74. package/skills/writing/citation/zotfile-attachment-guide/SKILL.md +2 -0
  75. package/skills/writing/composition/SKILL.md +22 -0
  76. package/skills/writing/composition/research-paper-writer/SKILL.md +1 -1
  77. package/skills/writing/composition/scientific-writing-wrapper/SKILL.md +1 -1
  78. package/skills/writing/latex/SKILL.md +22 -0
  79. package/skills/writing/latex/academic-writing-latex/SKILL.md +1 -1
  80. package/skills/writing/latex/latex-drawing-guide/SKILL.md +1 -1
  81. package/skills/writing/polish/SKILL.md +20 -0
  82. package/skills/writing/polish/chinese-text-humanizer/SKILL.md +1 -1
  83. package/skills/writing/templates/SKILL.md +22 -0
  84. package/skills/writing/templates/beamer-presentation-guide/SKILL.md +1 -1
  85. package/skills/writing/templates/scientific-article-pdf/SKILL.md +1 -1
  86. package/skills/analysis/dataviz/citation-map-guide/SKILL.md +0 -184
  87. package/skills/analysis/dataviz/data-visualization-principles/SKILL.md +0 -171
  88. package/skills/analysis/econometrics/empirical-paper-analysis/SKILL.md +0 -192
  89. package/skills/analysis/econometrics/panel-data-regression-workflow/SKILL.md +0 -267
  90. package/skills/analysis/econometrics/stata-regression/SKILL.md +0 -117
  91. package/skills/analysis/statistics/general-statistics-guide/SKILL.md +0 -226
  92. package/skills/analysis/statistics/infiagent-benchmark-guide/SKILL.md +0 -106
  93. package/skills/analysis/statistics/pywayne-statistics-guide/SKILL.md +0 -192
  94. package/skills/analysis/statistics/quantitative-methods-guide/SKILL.md +0 -193
  95. package/skills/analysis/wrangling/claude-data-analysis-guide/SKILL.md +0 -100
  96. package/skills/analysis/wrangling/open-data-scientist-guide/SKILL.md +0 -197
  97. package/skills/domains/ai-ml/annotated-dl-papers-guide/SKILL.md +0 -159
  98. package/skills/domains/humanities/digital-humanities-methods/SKILL.md +0 -232
  99. package/skills/domains/law/legal-research-methods/SKILL.md +0 -190
  100. package/skills/domains/social-science/sociology-research-guide/SKILL.md +0 -238
  101. package/skills/literature/discovery/arxiv-paper-monitoring/SKILL.md +0 -233
  102. package/skills/literature/discovery/paper-tracking-guide/SKILL.md +0 -211
  103. package/skills/literature/fulltext/zotero-scihub-guide/SKILL.md +0 -168
  104. package/skills/literature/search/arxiv-osiris/SKILL.md +0 -199
  105. package/skills/literature/search/deepgit-search-guide/SKILL.md +0 -147
  106. package/skills/literature/search/multi-database-literature-search/SKILL.md +0 -198
  107. package/skills/literature/search/papers-chat-guide/SKILL.md +0 -194
  108. package/skills/literature/search/pasa-paper-search-guide/SKILL.md +0 -138
  109. package/skills/literature/search/scientify-literature-survey/SKILL.md +0 -203
  110. package/skills/research/automation/ai-scientist-guide/SKILL.md +0 -228
  111. package/skills/research/automation/coexist-ai-guide/SKILL.md +0 -149
  112. package/skills/research/automation/foam-agent-guide/SKILL.md +0 -203
  113. package/skills/research/automation/research-paper-orchestrator/SKILL.md +0 -254
  114. package/skills/research/deep-research/academic-deep-research/SKILL.md +0 -190
  115. package/skills/research/deep-research/cognitive-kernel-guide/SKILL.md +0 -200
  116. package/skills/research/deep-research/corvus-research-guide/SKILL.md +0 -132
  117. package/skills/research/deep-research/deep-research-pro/SKILL.md +0 -213
  118. package/skills/research/deep-research/deep-research-work/SKILL.md +0 -204
  119. package/skills/research/deep-research/research-cog/SKILL.md +0 -153
  120. package/skills/research/methodology/academic-mentor-guide/SKILL.md +0 -169
  121. package/skills/research/methodology/deep-innovator-guide/SKILL.md +0 -242
  122. package/skills/research/methodology/research-pipeline-units-guide/SKILL.md +0 -169
  123. package/skills/research/paper-review/paper-compare-guide/SKILL.md +0 -238
  124. package/skills/research/paper-review/paper-digest-guide/SKILL.md +0 -240
  125. package/skills/research/paper-review/paper-research-assistant/SKILL.md +0 -231
  126. package/skills/research/paper-review/research-quality-filter/SKILL.md +0 -261
  127. package/skills/tools/code-exec/contextplus-mcp-guide/SKILL.md +0 -110
  128. package/skills/tools/diagram/clawphd-guide/SKILL.md +0 -149
  129. package/skills/tools/diagram/scientific-graphical-abstract/SKILL.md +0 -201
  130. package/skills/tools/document/md2pdf-xelatex/SKILL.md +0 -212
  131. package/skills/tools/document/openpaper-guide/SKILL.md +0 -232
  132. package/skills/tools/document/qq-connect/SKILL.md +0 -227
  133. package/skills/tools/document/weknora-guide/SKILL.md +0 -216
  134. package/skills/tools/knowledge-graph/mimir-memory-guide/SKILL.md +0 -135
  135. package/skills/tools/knowledge-graph/open-webui-tools-guide/SKILL.md +0 -156
  136. package/skills/tools/ocr-translate/formula-recognition-guide/SKILL.md +0 -367
  137. package/skills/tools/ocr-translate/math-equation-renderer/SKILL.md +0 -198
  138. package/skills/tools/scraping/api-data-collection-guide/SKILL.md +0 -301
  139. package/skills/writing/citation/academic-citation-manager-guide/SKILL.md +0 -182
  140. package/skills/writing/composition/opendraft-thesis-guide/SKILL.md +0 -200
  141. package/skills/writing/composition/paper-debugger-guide/SKILL.md +0 -143
  142. package/skills/writing/composition/paperforge-guide/SKILL.md +0 -205
@@ -1,226 +0,0 @@
1
- ---
2
- name: general-statistics-guide
3
- description: "Conceptual foundations of statistical inference for empirical research"
4
- metadata:
5
- openclaw:
6
- emoji: "📈"
7
- category: "analysis"
8
- subcategory: "statistics"
9
- keywords: ["statistical inference", "hypothesis testing", "probability", "regression", "confidence intervals", "statistical thinking"]
10
- source: "https://clawhub.com/ivangdavila/statistics"
11
- ---
12
-
13
- # Statistical Foundations for Empirical Research
14
-
15
- ## Overview
16
-
17
- This guide builds statistical intuition from probability fundamentals through inferential methods to practical application in research. It is language-agnostic (not tied to R, Python, or Stata) and focuses on the concepts, assumptions, and interpretation of statistical methods commonly used in empirical papers. Use it as a reference when designing studies, choosing tests, or interpreting results.
18
-
19
- ## Probability Foundations
20
-
21
- ### Key Distributions
22
-
23
- | Distribution | When to Use | Parameters | Example |
24
- |-------------|-------------|-----------|---------|
25
- | **Normal** | Continuous, symmetric data; CLT applications | μ (mean), σ (std) | Height, test scores |
26
- | **Binomial** | Count of successes in n trials | n (trials), p (probability) | Survey yes/no responses |
27
- | **Poisson** | Count of rare events in fixed interval | λ (rate) | Paper citations per year |
28
- | **t-distribution** | Sample means when the population σ is unknown (matters most for n < 30) | df (degrees of freedom) | Pilot study comparisons |
29
- | **Chi-squared** | Goodness of fit, contingency tables | df | Category frequency tests |
30
- | **F-distribution** | Ratio of variances, ANOVA | df₁, df₂ | Comparing model fits |
31
-
32
- ### Central Limit Theorem
33
-
34
- The distribution of the sample mean $\bar{X}$ of n independent, identically distributed observations approaches a normal distribution as n increases, regardless of the shape of the population distribution (provided its variance is finite):
35
-
36
- ```
37
- If X₁, X₂, ..., Xₙ are i.i.d. with mean μ and variance σ²:
38
- √n(X̄ - μ) / σ → N(0, 1) as n → ∞
39
-
40
- Practical rule: n ≥ 30 is usually sufficient
41
- Exception: heavily skewed distributions may need n ≥ 100
42
- ```
43
-
44
- This is why most inferential procedures (confidence intervals, t-tests, regression) remain approximately valid in large samples even when the underlying data are not normally distributed.
45
-
46
- ## Descriptive Statistics
47
-
48
- ### Measures of Central Tendency
49
-
50
- | Measure | Formula | When to Use | Sensitive to Outliers? |
51
- |---------|---------|-------------|----------------------|
52
- | Mean | Σxᵢ / n | Symmetric distributions | Yes |
53
- | Median | Middle value when sorted | Skewed distributions, ordinal data | No |
54
- | Mode | Most frequent value | Categorical data, multimodal distributions | No |
55
-
56
- ### Measures of Spread
57
-
58
- | Measure | Interpretation | When to Report |
59
- |---------|---------------|----------------|
60
- | Standard deviation (σ) | Typical (root-mean-square) distance from the mean | With the mean |
61
- | IQR (Q3 - Q1) | Spread of middle 50% | With the median |
62
- | Range (max - min) | Total spread | Rarely (sensitive to outliers) |
63
- | Coefficient of variation (σ/μ) | Relative spread | Comparing variability across scales |
64
-
65
- ## Hypothesis Testing
66
-
67
- ### The Testing Framework
68
-
69
- ```
70
- 1. State hypotheses:
71
- H₀: null hypothesis (no effect, no difference)
72
- H₁: alternative hypothesis (there is an effect)
73
-
74
- 2. Choose significance level: α = 0.05 (conventional)
75
-
76
- 3. Compute test statistic from data
77
-
78
- 4. Compare to critical value or compute p-value
79
-
80
- 5. Decision:
81
- p < α → Reject H₀ (statistically significant)
82
- p ≥ α → Fail to reject H₀ (not significant)
83
- ```
84
-
85
- ### Common Errors
86
-
87
- | | H₀ is True | H₀ is False |
88
- |---|---|---|
89
- | **Reject H₀** | Type I Error (α) | Correct (Power = 1 - β) |
90
- | **Fail to Reject H₀** | Correct | Type II Error (β) |
91
-
92
- **Practical interpretation**:
93
- - Type I (false positive): Claiming a drug works when it doesn't
94
- - Type II (false negative): Missing a real drug effect
95
- - Power: Probability of detecting a real effect (target ≥ 0.80)
96
-
97
- ### Choosing the Right Test
98
-
99
- | Question | Data Type | Test | Assumptions |
100
- |----------|-----------|------|-------------|
101
- | Compare 2 means | Continuous, normal | Independent t-test | Equal variance (or Welch's) |
102
- | Compare 2 means (paired) | Continuous, normal | Paired t-test | Paired observations |
103
- | Compare 2 means (non-normal) | Continuous/ordinal | Mann-Whitney U | Independent samples |
104
- | Compare >2 means | Continuous, normal | One-way ANOVA | Equal variance, normality |
105
- | Compare >2 means (non-normal) | Ordinal | Kruskal-Wallis | Independent samples |
106
- | Association (categorical) | Categorical × Categorical | Chi-squared test | Expected count ≥ 5 |
107
- | Correlation | Continuous × Continuous | Pearson r | Linear relationship, bivariate normal |
108
- | Correlation (non-normal) | Ordinal or non-normal | Spearman ρ | Monotonic relationship |
109
-
110
- ## Regression Analysis
111
-
112
- ### Linear Regression
113
-
114
- ```
115
- Y = β₀ + β₁X₁ + β₂X₂ + ... + βₖXₖ + ε
116
-
117
- Interpretation:
118
- β₁ = change in Y for a 1-unit increase in X₁, holding other X's constant
119
- R² = proportion of variance in Y explained by the model
120
- Adjusted R² = R² penalized for number of predictors
121
- ```
122
-
123
- **Key assumptions** (check before trusting results):
124
- 1. **Linearity**: Y is a linear function of X's
125
- 2. **Independence**: Observations are independent
126
- 3. **Homoscedasticity**: Constant variance of residuals
127
- 4. **Normality**: Residuals are approximately normal (for inference)
128
- 5. **No multicollinearity**: X's are not highly correlated with each other
129
-
130
- **Diagnostic checks**:
131
-
132
- ```
133
- Linearity: Plot residuals vs. fitted values (no pattern)
134
- Homoscedasticity: Breusch-Pagan test or residual plot (no funnel shape)
135
- Normality: Q-Q plot of residuals, Shapiro-Wilk test
136
- Multicollinearity: VIF (Variance Inflation Factor) — VIF > 10 is concerning
137
- Influential obs: Cook's distance — D > 4/n warrants investigation
138
- ```
139
-
140
- ### Logistic Regression
141
-
142
- For binary outcomes (0/1):
143
-
144
- ```
145
- log(p / (1-p)) = β₀ + β₁X₁ + β₂X₂ + ...
146
-
147
- Where p = P(Y = 1 | X)
148
-
149
- Interpretation:
150
- exp(β₁) = odds ratio
151
- exp(β₁) = 1.5 means "a 1-unit increase in X₁ multiplies the odds by 1.5"
152
- Report: odds ratios with 95% CI
153
- ```
154
-
155
- ## Confidence Intervals
156
-
157
- ```
158
- Point estimate ± (critical value × standard error)
159
-
160
- For a mean: X̄ ± z*(σ/√n) or X̄ ± t*(s/√n)
161
-
162
- Interpretation (frequentist):
163
- "If we repeated this study many times, 95% of the resulting intervals
164
- would contain the true population parameter."
165
-
166
- NOT: "There is a 95% probability that the true value is in this interval."
167
- ```
168
-
169
- ## Effect Sizes
170
-
171
- p-values tell you how surprising the data would be IF there were no effect; effect sizes tell you HOW BIG the estimated effect is.
172
-
173
- | Measure | Context | Small | Medium | Large |
174
- |---------|---------|-------|--------|-------|
175
- | Cohen's d | Mean difference | 0.2 | 0.5 | 0.8 |
176
- | Pearson r | Correlation | 0.1 | 0.3 | 0.5 |
177
- | η² (eta-squared) | ANOVA | 0.01 | 0.06 | 0.14 |
178
- | Odds ratio | Logistic regression | 1.5 | 2.5 | 4.3 |
179
- | R² | Regression | 0.02 | 0.13 | 0.26 |
180
-
181
- **Always report effect sizes alongside p-values** — a "significant" result with d = 0.05 is trivial in practice.
182
-
183
- ## Multiple Testing
184
-
185
- When testing multiple hypotheses simultaneously, the chance of at least one false positive increases:
186
-
187
- ```
188
- With α = 0.05 and 20 independent tests:
189
- P(at least one false positive) = 1 - (1 - 0.05)^20 = 0.64
190
-
191
- Corrections:
192
- Bonferroni: α_adj = α / m (conservative)
193
- Benjamini-Hochberg: Controls false discovery rate (FDR) (less conservative)
194
- Holm-Bonferroni: Step-down procedure (more powerful than Bonferroni)
195
- ```
196
-
197
- ## Sample Size and Power
198
-
199
- Before collecting data, determine the required sample size:
200
-
201
- ```
202
- Inputs needed:
203
- 1. Desired power (typically 0.80)
204
- 2. Significance level (α = 0.05)
205
- 3. Expected effect size (from pilot study or literature)
206
- 4. Type of test (t-test, ANOVA, regression, etc.)
207
-
208
- Rule of thumb for two-sample t-test:
209
- n per group ≈ 16 / d² (for 80% power, α = 0.05)
210
- d = 0.5 → n ≈ 64 per group
211
- d = 0.2 → n ≈ 400 per group
212
- ```
213
-
214
- ## Common Pitfalls
215
-
216
- 1. **p-hacking**: Trying many analyses until p < 0.05. Fix: pre-register analyses.
217
- 2. **Absence of evidence ≠ evidence of absence**: p > 0.05 does not prove H₀. Consider equivalence tests.
218
- 3. **Correlation ≠ causation**: Regression coefficients are causal only with proper identification strategy.
219
- 4. **Simpson's paradox**: A trend in subgroups can reverse when combined. Always check stratified analyses.
220
- 5. **Overfitting**: Too many predictors relative to sample size. Rule of thumb: at least 10–20 observations per predictor.
221
-
222
- ## References
223
-
224
- - Agresti, A. (2018). *Statistical Methods for the Social Sciences* (5th ed.). Pearson.
225
- - Wasserstein, R. L., & Lazar, N. A. (2016). "The ASA Statement on p-Values." *The American Statistician*, 70(2), 129-133.
226
- - Cohen, J. (1988). *Statistical Power Analysis for the Behavioral Sciences* (2nd ed.). Routledge.
@@ -1,106 +0,0 @@
1
- ---
2
- name: infiagent-benchmark-guide
3
- description: "Agent benchmark for data analysis evaluation (ICML 2024)"
4
- metadata:
5
- openclaw:
6
- emoji: "🏆"
7
- category: "analysis"
8
- subcategory: "statistics"
9
- keywords: ["InfiAgent", "benchmark", "data analysis", "agent evaluation", "ICML", "DABench"]
10
- source: "https://github.com/InfiAgent/InfiAgent"
11
- ---
12
-
13
- # InfiAgent Data Analysis Benchmark Guide
14
-
15
- ## Overview
16
-
17
- InfiAgent (ICML 2024) is a benchmark for evaluating AI agents on data analysis tasks. It provides DABench — a standardized set of data analysis problems ranging from basic EDA to complex statistical modeling, each with ground-truth solutions and automated evaluation metrics. It measures agent capabilities in code generation, statistical reasoning, and visualization.
18
-
19
- ## Benchmark Structure
20
-
21
- ```
22
- DABench (Data Analysis Benchmark)
23
- ├── Task Categories
24
- │ ├── Data Understanding (profiling, cleaning)
25
- │ ├── Exploratory Analysis (distributions, correlations)
26
- │ ├── Statistical Testing (hypothesis tests)
27
- │ ├── Visualization (appropriate chart selection)
28
- │ ├── Modeling (regression, classification)
29
- │ └── Interpretation (insights, conclusions)
30
- ├── Difficulty Levels
31
- │ ├── Easy (single-step operations)
32
- │ ├── Medium (multi-step analysis)
33
- │ └── Hard (complex reasoning + code)
34
- └── Evaluation Metrics
35
- ├── Code executability
36
- ├── Answer correctness
37
- ├── Visualization quality
38
- └── Statistical validity
39
- ```
40
-
41
- ## Usage
42
-
43
- ```python
44
- from infiagent import DABench
45
-
46
- bench = DABench()
47
-
48
- # List tasks
49
- for task in bench.tasks[:5]:
50
- print(f"[{task.difficulty}] {task.id}: {task.description}")
51
- print(f" Dataset: {task.dataset}")
52
- print(f" Category: {task.category}")
53
-
54
- # Evaluate an agent
55
- from infiagent import evaluate
56
-
57
- results = evaluate(
58
- agent_fn=my_data_agent,
59
- tasks="all",
60
- timeout=120,
61
- )
62
-
63
- print(f"Executability: {results.exec_rate:.1%}")
64
- print(f"Correctness: {results.correct_rate:.1%}")
65
- print(f"Statistical validity: {results.stats_valid:.1%}")
66
- ```
67
-
68
- ## Task Examples
69
-
70
- ```python
71
- # Easy: "What is the mean and standard deviation of column X?"
72
- # Medium: "Is there a significant correlation between A and B?
73
- # Control for confounders C and D."
74
- # Hard: "Build a predictive model for Y using all available
75
- # features. Report cross-validated performance and
76
- # identify the 3 most important features."
77
- ```
78
-
79
- ## Leaderboard Results
80
-
81
- ```python
82
- # Selected results from DABench
83
- scores = {
84
- "GPT-4 + Code": {"exec": 95, "correct": 67},
85
- "Claude 3.5 Sonnet": {"exec": 93, "correct": 64},
86
- "GPT-3.5 + Code": {"exec": 88, "correct": 45},
87
- "CodeLlama-34B": {"exec": 72, "correct": 31},
88
- }
89
-
90
- print(f"{'Agent':<22} {'Exec%':>6} {'Correct%':>9}")
91
- for agent, s in scores.items():
92
- print(f"{agent:<22} {s['exec']:>5}% {s['correct']:>8}%")
93
- ```
94
-
95
- ## Use Cases
96
-
97
- 1. **Agent evaluation**: Standard benchmark for data analysis agents
98
- 2. **Model comparison**: Compare LLMs on analytical tasks
99
- 3. **Capability testing**: Assess statistical reasoning abilities
100
- 4. **Research**: Study agent strengths and failure modes
101
- 5. **Development**: Target specific weak areas for improvement
102
-
103
- ## References
104
-
105
- - [InfiAgent GitHub](https://github.com/InfiAgent/InfiAgent)
106
- - [DABench Paper (ICML 2024)](https://arxiv.org/abs/2401.05507)
@@ -1,192 +0,0 @@
1
- ---
2
- name: pywayne-statistics-guide
3
- description: "37+ statistical testing methods for rigorous hypothesis testing"
4
- metadata:
5
- openclaw:
6
- emoji: "📐"
7
- category: "analysis"
8
- subcategory: "statistics"
9
- keywords: ["hypothesis testing", "statistical tests", "p-value", "parametric tests", "nonparametric tests", "effect size", "multiple comparisons"]
10
- source: "https://github.com/AcademicSkills/pywayne-statistics-guide"
11
- ---
12
-
13
- # PyWayne Statistics Guide
14
-
15
- A comprehensive reference for 37+ statistical testing methods covering parametric, nonparametric, and resampling-based hypothesis tests. Provides decision trees for test selection, implementation in Python (scipy, statsmodels, pingouin), effect size calculation, and proper reporting standards for academic publications.
16
-
17
- ## Overview
18
-
19
- Hypothesis testing remains the backbone of quantitative research across the sciences, social sciences, and engineering. However, selecting the appropriate test for a given research question, data structure, and assumption profile is a persistent challenge, especially for researchers outside statistics. This skill provides a structured decision framework that maps research questions to the correct statistical test, verifies assumptions, computes test statistics and effect sizes, and formats results for publication.
20
-
21
- All 37+ tests are organized by the type of comparison (one-sample, two-sample, k-sample, association, agreement) and whether parametric assumptions are met. Each test entry includes when to use it, assumptions to verify, the Python implementation, and the correct APA-style reporting format.
22
-
23
- ## Test Selection Decision Tree
24
-
25
- ### Step 1: Identify the Research Question Type
26
-
27
- | Question Type | Examples |
28
- |--------------|----------|
29
- | **One-sample** | Is this sample mean different from a known value? |
30
- | **Two-sample (independent)** | Do treatment and control groups differ? |
31
- | **Two-sample (paired)** | Do pre-test and post-test scores differ? |
32
- | **K-sample (independent)** | Do 3+ groups differ on an outcome? |
33
- | **K-sample (repeated)** | Do measurements differ across 3+ time points? |
34
- | **Association** | Is variable X related to variable Y? |
35
- | **Agreement** | Do two raters/methods agree? |
36
-
37
- ### Step 2: Check Data Type and Assumptions
38
-
39
- ```
40
- Is the outcome variable continuous?
41
- ├── Yes → Are the data normally distributed?
42
- │ ├── Yes → Are variances equal (for group comparisons)?
43
- │ │ ├── Yes → Use PARAMETRIC test
44
- │ │ └── No → Use Welch's correction or nonparametric
45
- │ └── No → Use NONPARAMETRIC test
46
- └── No → Is it ordinal or nominal?
47
- ├── Ordinal → Use rank-based NONPARAMETRIC test
48
- └── Nominal → Use CHI-SQUARE or exact test
49
- ```
50
-
51
- ## Parametric Tests
52
-
53
- ### Two-Sample Tests
54
-
55
- ```python
56
- from scipy import stats
57
- import pingouin as pg
58
- import numpy as np
59
-
60
- def two_sample_comparison(group_a, group_b, paired=False):
61
- """
62
- Perform the appropriate two-sample test with assumption checks.
63
- """
64
- results = {}
65
-
66
- # Assumption: Normality
67
- _, p_norm_a = stats.shapiro(group_a)
68
- _, p_norm_b = stats.shapiro(group_b)
69
- normal = p_norm_a > 0.05 and p_norm_b > 0.05
70
-
71
- if paired:
72
- if normal:
73
- # Paired t-test
74
- t, p = stats.ttest_rel(group_a, group_b)
75
- d = pg.compute_effsize(group_a, group_b, paired=True, eftype='cohen')
76
- results = {'test': 'paired t-test', 't': t, 'p': p, 'cohens_d': d}
77
- else:
78
- # Wilcoxon signed-rank
79
- w, p = stats.wilcoxon(group_a, group_b)
80
- r = w / (len(group_a) * (len(group_a) + 1) / 2)
81
- results = {'test': 'Wilcoxon signed-rank', 'W': w, 'p': p, 'rank_biserial': r}
82
- else:
83
- if normal:
84
- # Check equal variances
85
- _, p_levene = stats.levene(group_a, group_b)
86
- if p_levene > 0.05:
87
- t, p = stats.ttest_ind(group_a, group_b)
88
- results = {'test': 'independent t-test', 't': t, 'p': p}
89
- else:
90
- t, p = stats.ttest_ind(group_a, group_b, equal_var=False)
91
- results = {'test': "Welch's t-test", 't': t, 'p': p}
92
- d = pg.compute_effsize(group_a, group_b, eftype='cohen')
93
- results['cohens_d'] = d
94
- else:
95
- # Mann-Whitney U
96
- u, p = stats.mannwhitneyu(group_a, group_b, alternative='two-sided')
97
- results = {'test': 'Mann-Whitney U', 'U': u, 'p': p}
98
-
99
- return results
100
- ```
101
-
102
- ### K-Sample Tests (ANOVA Family)
103
-
104
- | Test | Use Case | Assumptions |
105
- |------|----------|-------------|
106
- | One-way ANOVA | 3+ independent groups, continuous outcome | Normality, homoscedasticity |
107
- | Welch's ANOVA | 3+ groups, unequal variances | Normality |
108
- | Repeated measures ANOVA | 3+ related measurements | Normality, sphericity |
109
- | Two-way ANOVA | Two factors, continuous outcome | Normality, homoscedasticity |
110
- | ANCOVA | Group comparison controlling for covariate | Normality, homogeneity of slopes |
111
- | MANOVA | Multiple dependent variables | Multivariate normality |
112
-
113
- ```python
114
- def k_sample_test(groups: list, method: str = 'auto'):
115
- """Run the appropriate k-sample comparison."""
116
- # Check normality for all groups
117
- all_normal = all(stats.shapiro(g)[1] > 0.05 for g in groups)
118
-
119
- if all_normal:
120
- # Check homogeneity of variance
121
- _, p_levene = stats.levene(*groups)
122
- if p_levene > 0.05:
123
- f, p = stats.f_oneway(*groups)
124
- return {'test': 'one-way ANOVA', 'F': f, 'p': p}
125
- else:
126
- # Welch's ANOVA via pingouin
127
- return {'test': "Welch's ANOVA", 'note': 'Use pg.welch_anova()'}
128
- else:
129
- h, p = stats.kruskal(*groups)
130
- return {'test': 'Kruskal-Wallis H', 'H': h, 'p': p}
131
- ```
132
-
133
- ## Nonparametric Tests Reference
134
-
135
- | Parametric Test | Nonparametric Alternative | When to Use |
136
- |----------------|--------------------------|-------------|
137
- | One-sample t-test | Wilcoxon signed-rank | Non-normal single sample |
138
- | Independent t-test | Mann-Whitney U | Non-normal, 2 independent groups |
139
- | Paired t-test | Wilcoxon signed-rank | Non-normal, paired data |
140
- | One-way ANOVA | Kruskal-Wallis H | Non-normal, 3+ groups |
141
- | Repeated measures ANOVA | Friedman test | Non-normal, 3+ related measures |
142
- | Pearson correlation | Spearman rho / Kendall tau | Monotonic but non-linear, or ordinal, association |
143
-
144
- ## Multiple Comparisons Correction
145
-
146
- When performing multiple hypothesis tests, control either the family-wise error rate (FWER) or the false discovery rate (FDR):
147
-
148
- ```python
149
- from statsmodels.stats.multitest import multipletests
150
-
151
- def correct_multiple_tests(p_values: list, method: str = 'fdr_bh') -> dict:
152
- """
153
- Apply multiple comparisons correction.
154
-
155
- Methods:
156
- 'bonferroni': Conservative, controls FWER
157
- 'holm': Less conservative than Bonferroni, controls FWER
158
- 'fdr_bh': Benjamini-Hochberg, controls FDR (recommended default)
159
- 'fdr_by': Benjamini-Yekutieli, conservative FDR control
160
- """
161
- reject, corrected_p, _, _ = multipletests(p_values, method=method)
162
- return {
163
- 'method': method,
164
- 'original_p': p_values,
165
- 'corrected_p': corrected_p.tolist(),
166
- 'reject': reject.tolist(),
167
- 'n_significant': int(reject.sum())
168
- }
169
- ```
170
-
171
- ## Effect Size Reference
172
-
173
- | Test | Effect Size | Small | Medium | Large |
174
- |------|------------|-------|--------|-------|
175
- | t-test | Cohen's d | 0.2 | 0.5 | 0.8 |
176
- | ANOVA | Eta-squared | 0.01 | 0.06 | 0.14 |
177
- | Correlation | r | 0.1 | 0.3 | 0.5 |
178
- | Chi-square | Cramér's V | 0.1 | 0.3 | 0.5 |
179
- | Mann-Whitney | Rank-biserial r | 0.1 | 0.3 | 0.5 |
180
-
181
- ## APA Reporting Examples
182
-
183
- - **t-test**: "An independent samples t-test revealed a significant difference, t(58) = 2.45, p = .017, d = 0.63."
184
- - **ANOVA**: "A one-way ANOVA showed a significant main effect of condition, F(2, 87) = 4.12, p = .020, eta-squared = 0.09."
185
- - **Mann-Whitney**: "A Mann-Whitney U test indicated that scores were significantly higher in the treatment group, U = 245, p = .003, r = 0.42."
186
- - **Chi-square**: "A chi-square test of independence revealed a significant association, χ²(2, N = 150) = 8.34, p = .015, V = 0.24."
187
-
188
- ## References
189
-
190
- - Cohen, J. (1988). *Statistical Power Analysis for the Behavioral Sciences* (2nd ed.). Routledge.
191
- - Vallat, R. (2018). Pingouin: Statistics in Python. *JOSS*, 3(31), 1026.
192
- - Lakens, D. (2013). Calculating and Reporting Effect Sizes. *Frontiers in Psychology*, 4, 863.